#include #include #include #define INIT_NM_SIZE 128 #define INIT_SQ_SIZE 4096 struct Seqbuf { mem·Allocator heap; void *h; int len, off; byte *it, b[]; }; static void reset(struct Seqbuf *sb) { sb->off = 0; sb->it = sb->b; } static error grow(struct Seqbuf **sb, int min) { struct Seqbuf *old, *new; vlong newlen; old = *sb; assert((*sb)->len <= (SIZE_MAX - 1) / 2); newlen = MAX(16, MAX(1 + 2*(*sb)->len, (*sb)->len+min)); assert(newlen >= (*sb)->len+min); if (new = old->heap.alloc(old->h, 1, sizeof(*new)+newlen), !new) { errorf("memory: could not allocate new buffer"); return 1; } memcpy(new, old, sizeof(*sb) + (*sb)->len); new->len = newlen; new->it = new->b + (old->len); old->heap.free(old->h, old); *sb = new; return 0; } static error put(struct Seqbuf **sb, byte c) { int err; struct Seqbuf *sq; sq = *sb; if (sq->it < (sq->b + sq->len)) { *sq->it++ = c; return 0; } if (err = grow(sb, 1), err) { errorf("memory fail: could not allocate more buffer"); sq->heap.free(sq->h, sq); return 1; } *((*sb)->it++) = c; return 0; } static error push(struct Seqbuf **sb, int n, void *buf) { int d, err; struct Seqbuf *seq; seq = *sb; if (d = seq->len - (seq->it - seq->b), d < n) { assert(d > 0); if (err = grow(sb, n-d), err) { errorf("memory fail: could not allocate more buffer"); seq->heap.free(seq->h, seq); return 1; } } memcpy((*sb)->it, buf, n); (*sb)->it += n; return 0; } // ----------------------------------------------------------------------- // sequence data struct bio·SeqReader { byte eof; io·Reader file; void *f; struct Seqbuf *seq; /* read buffer */ byte *b, *bend; byte buf[4*4098]; }; static error fill(bio·SeqReader *rdr) { int n; // NOTE: This could lead to an infinite loop. if (rdr->eof) { return 0; } n = rdr->file.read(rdr->f, 1, arrlen(rdr->buf), rdr->buf); if (n < 0) { errorf("read: no data obtained from reader"); return 1; } rdr->b = rdr->buf; rdr->bend = rdr->b + n; if (rdr->eof = n < arrlen(rdr->buf), rdr->eof) { *rdr->bend++ = '\0'; } return 0; } bio·SeqReader* bio·openseq(io·Reader file, void *f, mem·Allocator heap, void *h) { error err; bio·SeqReader *rdr; rdr = heap.alloc(h, 1, sizeof(bio·SeqReader)); rdr->file = file; rdr->f = f; rdr->eof = 0; rdr->seq = heap.alloc(h, 1, sizeof(*rdr->seq) + INIT_NM_SIZE + INIT_SQ_SIZE); rdr->seq->heap = heap; rdr->seq->h = h; rdr->seq->it = rdr->seq->b; rdr->seq->len = INIT_NM_SIZE + INIT_SQ_SIZE; if (err = fill(rdr), err) { errorf("fill: could not populate buffer"); goto ERROR; } return rdr; ERROR: heap.free(h, rdr->seq); heap.free(h, rdr); return nil; } error bio·closeseq(bio·SeqReader *rdr) { mem·Allocator heap; void *h; heap = rdr->seq->heap; h = rdr->seq->h; heap.free(h, rdr->seq); heap.free(h, rdr); return 0; } static error readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop) { error err; byte *beg; if (rdr->eof && rdr->b == rdr->bend-1) return EOF; reset(rdr->seq); // NOTE: Can this case happen? assert(rdr->b != rdr->bend); if(*rdr->b++ != hdr) { errorf("fasta/q format: expected '%c', found '%c'", hdr, *rdr->b--); return 1; } NAME: beg = rdr->b; while(rdr->b != rdr->bend) { if(*rdr->b++ == '\n') { push(&rdr->seq, (rdr->b - 1) - beg, beg); goto SEQ; } } push(&rdr->seq, rdr->b - beg, beg); if(err = fill(rdr), err) { errorf("read: could not populate buffer"); return 1; } goto NAME; SEQ: put(&rdr->seq, '\0'); rdr->seq->off = rdr->seq->it - rdr->seq->b; SEQL: beg = rdr->b; while(rdr->b != rdr->bend) { if(*rdr->b == '\n') { push(&rdr->seq, rdr->b - beg, beg); beg = rdr->b + 1; } if(*rdr->b == stop || *rdr->b == '\0') goto SUCCESS; rdr->b++; } push(&rdr->seq, rdr->b - beg, beg); if(err = fill(rdr), err) { errorf("read: could not populate buffer"); return 1; } goto SEQL; SUCCESS: push(&rdr->seq, rdr->b - beg, beg); put(&rdr->seq, '\0'); return 0; } /* * fasta files */ error bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq) { error err; err = readfasta(rdr, seq, '>', '>'); if(err && err != EOF) { errorf("parse fail: could not read sequence of record"); return err; } seq->name = rdr->seq->b; seq->s = rdr->seq->b + rdr->seq->off; seq->len = rdr->seq->it - seq->s; seq->q = nil; return err; } /* * fastq files */ error bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq) { int n; byte *beg; error err; err = readfasta(rdr, seq, '@', '+'); if(err) { errorf("parse fail: could not read sequence of record"); return err; } seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off); if(*rdr->b++ != '+') { errorf("format error: no '+' character seperator found"); return -1; } EATLN: while(rdr->b != rdr->bend) { if (*rdr->b++ == '\n') { n = 0; goto QUAL; } } if(err = fill((bio·SeqReader*)rdr), err) { errorf("read: could not populate buffer"); return 1; } goto EATLN; QUAL: beg = rdr->b; while(rdr->b != rdr->bend) { if(*rdr->b == '\n') { push(&rdr->seq, rdr->b - beg, beg); beg = rdr->b + 1; } if(n++ == seq->len || *rdr->b == '\0') { err = *rdr->b == '\0' ? EOF : 0; goto SUCCESS; } rdr->b++; } push(&rdr->seq, rdr->b - beg, beg); if(err = fill((bio·SeqReader*)rdr), err) { errorf("read: could not populate buffer"); return 1; } goto QUAL; SUCCESS: push(&rdr->seq, rdr->b - beg, beg); put(&rdr->seq, '\0'); seq->name = rdr->seq->b; seq->s = rdr->seq->b + rdr->seq->off; seq->q = seq->s + seq->len + 1; return err; }