From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- src/libbio/fasta.c | 393 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) create mode 100644 src/libbio/fasta.c (limited to 'src/libbio/fasta.c') diff --git a/src/libbio/fasta.c b/src/libbio/fasta.c new file mode 100644 index 0000000..3788544 --- /dev/null +++ b/src/libbio/fasta.c @@ -0,0 +1,393 @@ +#include +#include +#include + +#define INIT_NM_SIZE 128 +#define INIT_SQ_SIZE 4096 + +struct SeqBuf +{ + mem·Allocator mem; + void *heap; + + int cap, off; + byte *it, b[]; +}; + +static +void +reset(struct SeqBuf *sb) +{ + sb->off = 0; + sb->it = sb->b; +} + +static +error +grow(struct SeqBuf **sb, int min) +{ + void* heap; + mem·Allocator mem; + + vlong newcap; + struct SeqBuf *old, *new; + + old = *sb; + mem = old->mem; + heap = old->heap; + + assert((*sb)->cap <= (SIZE_MAX - 1) / 2); + newcap = MAX(16, MAX(1 + 2*(*sb)->cap, (*sb)->cap+min)); + assert(newcap >= (*sb)->cap+min); + + if(new = mem.alloc(heap, 1, sizeof(*new)+newcap), !new) { + errorf("memory: could not allocate new buffer\n"); + return 1; + } + + memcpy(new, old, sizeof(*new) + (*sb)->cap); + + new->cap = newcap; + new->it = new->b + (old->it - old->b); + mem.free(heap, old); + + *sb = new; + return 0; +} + +static +error +put(struct SeqBuf **sb, byte c) +{ + int err; + struct SeqBuf *sq; + + sq = *sb; + if(sq->it < (sq->b + sq->cap)) { + *sq->it++ = c; + return 0; + } + + if(err = grow(sb, 1), err) { + errorf("memory fail: could not allocate more buffer\n"); + sq->mem.free(sq->heap, sq); + return 1; + } + + *((*sb)->it++) = c; + return 0; +} + +static +error +push(struct SeqBuf **sb, int n, void *buf) +{ + int d, err; + struct SeqBuf *seq; + + seq = *sb; + if(d = (seq->cap - (seq->it - seq->b)), d < n) { + assert(d >= 0); + if (err = grow(sb, n-d), err) { + errorf("memory fail: could not allocate more buffer\n"); + seq->mem.free(seq->heap, seq); + return 1; + } + } + seq = *sb; + + memcpy(seq->it, buf, n); + seq->it += n; + + return 0; +} + +// ----------------------------------------------------------------------- +// sequence data + +struct bio·SeqReader { + byte eof; + io·Reader rdr; + void *io; + + struct SeqBuf *seq; + + /* read buffer */ + byte *b, *bend, buf[4*4098]; +}; + +static +error +fill(bio·SeqReader *rdr) +{ + int n; + // NOTE: This could lead to an infinite loop. + if(rdr->eof) + return 0; + + n = rdr->rdr.read(rdr->io, 1, arrlen(rdr->buf), rdr->buf); + if(n < 0) { + errorf("read: no data obtained from reader\n"); + return 1; + } + rdr->b = rdr->buf; + rdr->bend = rdr->b + n; + if(rdr->eof = (n < arrlen(rdr->buf)), rdr->eof) + *rdr->bend++ = '\0'; + + return 0; +} + +bio·SeqReader* +bio·openseq(io·Reader rdr, void *io, mem·Allocator mem, void *heap) +{ + error err; + bio·SeqReader *r; + + r = mem.alloc(heap, 1, sizeof(bio·SeqReader)); + r->rdr = rdr; + r->io = io; + r->eof = 0; + + r->seq = mem.alloc(heap, 1, sizeof(*r->seq) + INIT_NM_SIZE + INIT_SQ_SIZE); + r->seq->mem = mem; + r->seq->heap = heap; + r->seq->it = r->seq->b; + r->seq->cap = INIT_NM_SIZE + INIT_SQ_SIZE; + + if (err=fill(r), err) { + errorf("fill: could not populate buffer\n"); + goto ERROR; + } + + return r; + +ERROR: + mem.free(heap, r->seq); + mem.free(heap, r); + return nil; +} + +error +bio·closeseq(bio·SeqReader *rdr) +{ + mem·Allocator mem; + void *heap; + + mem = rdr->seq->mem; + heap = rdr->seq->heap; + + mem.free(heap, rdr->seq); + mem.free(heap, rdr); + + return 0; +} + + +static +error +readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop) +{ + error err; + byte *beg; + + if(rdr->eof && rdr->b == rdr->bend-1) + return EOF; + + reset(rdr->seq); + + // NOTE: Can this case happen? + assert(rdr->b != rdr->bend); + if(*rdr->b++ != hdr) { + errorf("fasta/q format: expected '%c', found '%c'\n", hdr, *rdr->b--); + return 1; + } + +NAME: + beg = rdr->b; + while(rdr->b != rdr->bend) { + if(*rdr->b++ == '\n') { + push(&rdr->seq, (rdr->b - 1) - beg, beg); + goto SEQ; + } + } + push(&rdr->seq, rdr->b - beg, beg); + + if(err=fill(rdr), err) { + errorf("read: could not populate buffer\n"); + return 1; + } + goto NAME; + +SEQ: + put(&rdr->seq, '\0'); + rdr->seq->off = rdr->seq->it - rdr->seq->b; + +SEQLOOP: + beg = rdr->b; + while(rdr->b != rdr->bend) { + if(*rdr->b == '\n') { + push(&rdr->seq, rdr->b - beg, beg); + beg = rdr->b + 1; + } + + if(*rdr->b == stop || *rdr->b == '\0') + goto SUCCESS; + + rdr->b++; + } + + push(&rdr->seq, rdr->b - beg, beg); + + if(err=fill(rdr), err) { + errorf("read: could not populate buffer\n"); + return 1; + } + goto SEQLOOP; + +SUCCESS: + push(&rdr->seq, rdr->b - beg, beg); + put(&rdr->seq, '\0'); + + return 0; +} + +/* + * fasta files + */ + +error +bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq) +{ + error err; + + err = readfasta(rdr, seq, '>', '>'); + if(err && err != EOF) { + errorf("parse fail: could not read sequence of record\n"); + return err; + } + + seq->name = rdr->seq->b; + seq->s = rdr->seq->b + rdr->seq->off; + seq->len = rdr->seq->it - seq->s - 1; // shift by 1 as we pushed a '0' to end + seq->q = nil; + + return err; +} + +/* + * fastq files + */ + +error +bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq) +{ + int n; + byte *beg; + error err; + + err = readfasta(rdr, seq, '@', '+'); + if(err) { + errorf("parse fail: could not read sequence of record\n"); + return err; + } + + seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off); + + if(*rdr->b++ != '+') { + errorf("format error: no '+' character seperator found\n"); + return -1; + } + +EATLN: + while(rdr->b != rdr->bend) { + if (*rdr->b++ == '\n') { + n = 0; + goto QUAL; + } + } + + if(err = fill((bio·SeqReader*)rdr), err) { + errorf("read: could not populate buffer\n"); + return 1; + } + goto EATLN; + +QUAL: + beg = rdr->b; + while(rdr->b != rdr->bend) { + if(*rdr->b == '\n') { + push(&rdr->seq, rdr->b - beg, beg); + beg = rdr->b + 1; + } + + if(n++ == seq->len || *rdr->b == '\0') { + err = *rdr->b == '\0' ? EOF : 0; + goto SUCCESS; + } + + rdr->b++; + } + + push(&rdr->seq, rdr->b - beg, beg); + + if(err = fill((bio·SeqReader*)rdr), err) { + errorf("read: could not populate buffer\n"); + return 1; + } + goto QUAL; + + +SUCCESS: + push(&rdr->seq, rdr->b - beg, beg); + put(&rdr->seq, '\0'); + + seq->name = rdr->seq->b; + seq->s = rdr->seq->b + rdr->seq->off - 1; + seq->q = seq->s + seq->len + 1; + + return err; +} + +// ----------------------------------------------------------------------- +// sequence writing + +error +bio·writefasta(io·Writer io, void *wtr, bio·Seq seq) +{ + int i, j, d; + char buf[2048], *b = buf, *e = arrend(buf); + + *b++ = '>'; + while(*seq.name) { + *b++ = *seq.name++; + if(b == e) { + io.write(wtr, 1, arrlen(buf), buf); + b = buf; + } + } + + for(i=0; i