#include #include #include #define INIT_NM_SIZE 128 #define INIT_SQ_SIZE 4096 struct SeqBuf { mem·Allocator mem; void *heap; int cap, off; byte *it, b[]; }; static void reset(struct SeqBuf *sb) { sb->off = 0; sb->it = sb->b; } static error grow(struct SeqBuf **sb, int min) { void* heap; mem·Allocator mem; vlong newcap; struct SeqBuf *old, *new; old = *sb; mem = old->mem; heap = old->heap; assert((*sb)->cap <= (SIZE_MAX - 1) / 2); newcap = MAX(16, MAX(1 + 2*(*sb)->cap, (*sb)->cap+min)); assert(newcap >= (*sb)->cap+min); if(new = mem.alloc(heap, 1, sizeof(*new)+newcap), !new) { errorf("memory: could not allocate new buffer\n"); return 1; } memcpy(new, old, sizeof(*new) + (*sb)->cap); new->cap = newcap; new->it = new->b + (old->it - old->b); mem.free(heap, old); *sb = new; return 0; } static error put(struct SeqBuf **sb, byte c) { int err; struct SeqBuf *sq; sq = *sb; if(sq->it < (sq->b + sq->cap)) { *sq->it++ = c; return 0; } if(err = grow(sb, 1), err) { errorf("memory fail: could not allocate more buffer\n"); sq->mem.free(sq->heap, sq); return 1; } *((*sb)->it++) = c; return 0; } static error push(struct SeqBuf **sb, int n, void *buf) { int d, err; struct SeqBuf *seq; seq = *sb; if(d = (seq->cap - (seq->it - seq->b)), d < n) { assert(d >= 0); if (err = grow(sb, n-d), err) { errorf("memory fail: could not allocate more buffer\n"); seq->mem.free(seq->heap, seq); return 1; } } seq = *sb; memcpy(seq->it, buf, n); seq->it += n; return 0; } // ----------------------------------------------------------------------- // sequence data struct bio·SeqReader { byte eof; io·Reader rdr; void *io; struct SeqBuf *seq; /* read buffer */ byte *b, *bend, buf[4*4098]; }; static error fill(bio·SeqReader *rdr) { int n; // NOTE: This could lead to an infinite loop. if(rdr->eof) return 0; n = rdr->rdr.read(rdr->io, 1, arrlen(rdr->buf), rdr->buf); if(n < 0) { errorf("read: no data obtained from reader\n"); return 1; } rdr->b = rdr->buf; rdr->bend = rdr->b + n; if(rdr->eof = (n < arrlen(rdr->buf)), rdr->eof) *rdr->bend++ = '\0'; return 0; } bio·SeqReader* bio·openseq(io·Reader rdr, void *io, mem·Allocator mem, void *heap) { error err; bio·SeqReader *r; r = mem.alloc(heap, 1, sizeof(bio·SeqReader)); r->rdr = rdr; r->io = io; r->eof = 0; r->seq = mem.alloc(heap, 1, sizeof(*r->seq) + INIT_NM_SIZE + INIT_SQ_SIZE); r->seq->mem = mem; r->seq->heap = heap; r->seq->it = r->seq->b; r->seq->cap = INIT_NM_SIZE + INIT_SQ_SIZE; if (err=fill(r), err) { errorf("fill: could not populate buffer\n"); goto ERROR; } return r; ERROR: mem.free(heap, r->seq); mem.free(heap, r); return nil; } error bio·closeseq(bio·SeqReader *rdr) { mem·Allocator mem; void *heap; mem = rdr->seq->mem; heap = rdr->seq->heap; mem.free(heap, rdr->seq); mem.free(heap, rdr); return 0; } static error readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop) { error err; byte *beg; if(rdr->eof && rdr->b == rdr->bend-1) return EOF; reset(rdr->seq); // NOTE: Can this case happen? assert(rdr->b != rdr->bend); if(*rdr->b++ != hdr) { errorf("fasta/q format: expected '%c', found '%c'\n", hdr, *rdr->b--); return 1; } NAME: beg = rdr->b; while(rdr->b != rdr->bend) { if(*rdr->b++ == '\n') { push(&rdr->seq, (rdr->b - 1) - beg, beg); goto SEQ; } } push(&rdr->seq, rdr->b - beg, beg); if(err=fill(rdr), err) { errorf("read: could not populate buffer\n"); return 1; } goto NAME; SEQ: put(&rdr->seq, '\0'); rdr->seq->off = rdr->seq->it - rdr->seq->b; SEQLOOP: beg = rdr->b; while(rdr->b != rdr->bend) { if(*rdr->b == '\n') { push(&rdr->seq, rdr->b - beg, beg); beg = rdr->b + 1; } if(*rdr->b == stop || *rdr->b == '\0') goto SUCCESS; rdr->b++; } push(&rdr->seq, rdr->b - beg, beg); if(err=fill(rdr), err) { errorf("read: could not populate buffer\n"); return 1; } goto SEQLOOP; SUCCESS: push(&rdr->seq, rdr->b - beg, beg); put(&rdr->seq, '\0'); return 0; } /* * fasta files */ error bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq) { error err; err = readfasta(rdr, seq, '>', '>'); if(err && err != EOF) { errorf("parse fail: could not read sequence of record\n"); return err; } seq->name = rdr->seq->b; seq->s = rdr->seq->b + rdr->seq->off; seq->len = rdr->seq->it - seq->s - 1; // shift by 1 as we pushed a '0' to end seq->q = nil; return err; } /* * fastq files */ error bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq) { int n; byte *beg; error err; err = readfasta(rdr, seq, '@', '+'); if(err) { errorf("parse fail: could not read sequence of record\n"); return err; } seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off); if(*rdr->b++ != '+') { errorf("format error: no '+' character seperator found\n"); return -1; } EATLN: while(rdr->b != rdr->bend) { if (*rdr->b++ == '\n') { n = 0; goto QUAL; } } if(err = fill((bio·SeqReader*)rdr), err) { errorf("read: could not populate buffer\n"); return 1; } goto EATLN; QUAL: beg = rdr->b; while(rdr->b != rdr->bend) { if(*rdr->b == '\n') { push(&rdr->seq, rdr->b - beg, beg); beg = rdr->b + 1; } if(n++ == seq->len || *rdr->b == '\0') { err = *rdr->b == '\0' ? EOF : 0; goto SUCCESS; } rdr->b++; } push(&rdr->seq, rdr->b - beg, beg); if(err = fill((bio·SeqReader*)rdr), err) { errorf("read: could not populate buffer\n"); return 1; } goto QUAL; SUCCESS: push(&rdr->seq, rdr->b - beg, beg); put(&rdr->seq, '\0'); seq->name = rdr->seq->b; seq->s = rdr->seq->b + rdr->seq->off - 1; seq->q = seq->s + seq->len + 1; return err; } // ----------------------------------------------------------------------- // sequence writing error bio·writefasta(io·Writer io, void *wtr, bio·Seq seq) { int i, j, d; char buf[2048], *b = buf, *e = arrend(buf); *b++ = '>'; while(*seq.name) { *b++ = *seq.name++; if(b == e) { io.write(wtr, 1, arrlen(buf), buf); b = buf; } } for(i=0; i