aboutsummaryrefslogtreecommitdiff
path: root/src/libbio/fasta.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libbio/fasta.c')
-rw-r--r--src/libbio/fasta.c393
1 files changed, 393 insertions, 0 deletions
diff --git a/src/libbio/fasta.c b/src/libbio/fasta.c
new file mode 100644
index 0000000..3788544
--- /dev/null
+++ b/src/libbio/fasta.c
@@ -0,0 +1,393 @@
+#include <u.h>
+#include <base.h>
+#include <libbio.h>
+
+#define INIT_NM_SIZE 128
+#define INIT_SQ_SIZE 4096
+
+struct SeqBuf
+{
+ mem·Allocator mem;
+ void *heap;
+
+ int cap, off;
+ byte *it, b[];
+};
+
+static
+void
+reset(struct SeqBuf *sb)
+{
+ sb->off = 0;
+ sb->it = sb->b;
+}
+
+static
+error
+grow(struct SeqBuf **sb, int min)
+{
+ void* heap;
+ mem·Allocator mem;
+
+ vlong newcap;
+ struct SeqBuf *old, *new;
+
+ old = *sb;
+ mem = old->mem;
+ heap = old->heap;
+
+ assert((*sb)->cap <= (SIZE_MAX - 1) / 2);
+ newcap = MAX(16, MAX(1 + 2*(*sb)->cap, (*sb)->cap+min));
+ assert(newcap >= (*sb)->cap+min);
+
+ if(new = mem.alloc(heap, 1, sizeof(*new)+newcap), !new) {
+ errorf("memory: could not allocate new buffer\n");
+ return 1;
+ }
+
+ memcpy(new, old, sizeof(*new) + (*sb)->cap);
+
+ new->cap = newcap;
+ new->it = new->b + (old->it - old->b);
+ mem.free(heap, old);
+
+ *sb = new;
+ return 0;
+}
+
+static
+error
+put(struct SeqBuf **sb, byte c)
+{
+ int err;
+ struct SeqBuf *sq;
+
+ sq = *sb;
+ if(sq->it < (sq->b + sq->cap)) {
+ *sq->it++ = c;
+ return 0;
+ }
+
+ if(err = grow(sb, 1), err) {
+ errorf("memory fail: could not allocate more buffer\n");
+ sq->mem.free(sq->heap, sq);
+ return 1;
+ }
+
+ *((*sb)->it++) = c;
+ return 0;
+}
+
+static
+error
+push(struct SeqBuf **sb, int n, void *buf)
+{
+ int d, err;
+ struct SeqBuf *seq;
+
+ seq = *sb;
+ if(d = (seq->cap - (seq->it - seq->b)), d < n) {
+ assert(d >= 0);
+ if (err = grow(sb, n-d), err) {
+ errorf("memory fail: could not allocate more buffer\n");
+ seq->mem.free(seq->heap, seq);
+ return 1;
+ }
+ }
+ seq = *sb;
+
+ memcpy(seq->it, buf, n);
+ seq->it += n;
+
+ return 0;
+}
+
+// -----------------------------------------------------------------------
+// sequence data
+
+struct bio·SeqReader {
+ byte eof;
+ io·Reader rdr;
+ void *io;
+
+ struct SeqBuf *seq;
+
+ /* read buffer */
+ byte *b, *bend, buf[4*4098];
+};
+
+static
+error
+fill(bio·SeqReader *rdr)
+{
+ int n;
+ // NOTE: This could lead to an infinite loop.
+ if(rdr->eof)
+ return 0;
+
+ n = rdr->rdr.read(rdr->io, 1, arrlen(rdr->buf), rdr->buf);
+ if(n < 0) {
+ errorf("read: no data obtained from reader\n");
+ return 1;
+ }
+ rdr->b = rdr->buf;
+ rdr->bend = rdr->b + n;
+ if(rdr->eof = (n < arrlen(rdr->buf)), rdr->eof)
+ *rdr->bend++ = '\0';
+
+ return 0;
+}
+
+bio·SeqReader*
+bio·openseq(io·Reader rdr, void *io, mem·Allocator mem, void *heap)
+{
+ error err;
+ bio·SeqReader *r;
+
+ r = mem.alloc(heap, 1, sizeof(bio·SeqReader));
+ r->rdr = rdr;
+ r->io = io;
+ r->eof = 0;
+
+ r->seq = mem.alloc(heap, 1, sizeof(*r->seq) + INIT_NM_SIZE + INIT_SQ_SIZE);
+ r->seq->mem = mem;
+ r->seq->heap = heap;
+ r->seq->it = r->seq->b;
+ r->seq->cap = INIT_NM_SIZE + INIT_SQ_SIZE;
+
+ if (err=fill(r), err) {
+ errorf("fill: could not populate buffer\n");
+ goto ERROR;
+ }
+
+ return r;
+
+ERROR:
+ mem.free(heap, r->seq);
+ mem.free(heap, r);
+ return nil;
+}
+
+error
+bio·closeseq(bio·SeqReader *rdr)
+{
+ mem·Allocator mem;
+ void *heap;
+
+ mem = rdr->seq->mem;
+ heap = rdr->seq->heap;
+
+ mem.free(heap, rdr->seq);
+ mem.free(heap, rdr);
+
+ return 0;
+}
+
+
+static
+error
+readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop)
+{
+ error err;
+ byte *beg;
+
+ if(rdr->eof && rdr->b == rdr->bend-1)
+ return EOF;
+
+ reset(rdr->seq);
+
+ // NOTE: Can this case happen?
+ assert(rdr->b != rdr->bend);
+ if(*rdr->b++ != hdr) {
+ errorf("fasta/q format: expected '%c', found '%c'\n", hdr, *rdr->b--);
+ return 1;
+ }
+
+NAME:
+ beg = rdr->b;
+ while(rdr->b != rdr->bend) {
+ if(*rdr->b++ == '\n') {
+ push(&rdr->seq, (rdr->b - 1) - beg, beg);
+ goto SEQ;
+ }
+ }
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if(err=fill(rdr), err) {
+ errorf("read: could not populate buffer\n");
+ return 1;
+ }
+ goto NAME;
+
+SEQ:
+ put(&rdr->seq, '\0');
+ rdr->seq->off = rdr->seq->it - rdr->seq->b;
+
+SEQLOOP:
+ beg = rdr->b;
+ while(rdr->b != rdr->bend) {
+ if(*rdr->b == '\n') {
+ push(&rdr->seq, rdr->b - beg, beg);
+ beg = rdr->b + 1;
+ }
+
+ if(*rdr->b == stop || *rdr->b == '\0')
+ goto SUCCESS;
+
+ rdr->b++;
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if(err=fill(rdr), err) {
+ errorf("read: could not populate buffer\n");
+ return 1;
+ }
+ goto SEQLOOP;
+
+SUCCESS:
+ push(&rdr->seq, rdr->b - beg, beg);
+ put(&rdr->seq, '\0');
+
+ return 0;
+}
+
+/*
+ * fasta files
+ */
+
+error
+bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq)
+{
+ error err;
+
+ err = readfasta(rdr, seq, '>', '>');
+ if(err && err != EOF) {
+ errorf("parse fail: could not read sequence of record\n");
+ return err;
+ }
+
+ seq->name = rdr->seq->b;
+ seq->s = rdr->seq->b + rdr->seq->off;
+ seq->len = rdr->seq->it - seq->s - 1; // shift by 1 as we pushed a '0' to end
+ seq->q = nil;
+
+ return err;
+}
+
+/*
+ * fastq files
+ */
+
+error
+bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq)
+{
+ int n;
+ byte *beg;
+ error err;
+
+ err = readfasta(rdr, seq, '@', '+');
+ if(err) {
+ errorf("parse fail: could not read sequence of record\n");
+ return err;
+ }
+
+ seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off);
+
+ if(*rdr->b++ != '+') {
+ errorf("format error: no '+' character seperator found\n");
+ return -1;
+ }
+
+EATLN:
+ while(rdr->b != rdr->bend) {
+ if (*rdr->b++ == '\n') {
+ n = 0;
+ goto QUAL;
+ }
+ }
+
+ if(err = fill((bio·SeqReader*)rdr), err) {
+ errorf("read: could not populate buffer\n");
+ return 1;
+ }
+ goto EATLN;
+
+QUAL:
+ beg = rdr->b;
+ while(rdr->b != rdr->bend) {
+ if(*rdr->b == '\n') {
+ push(&rdr->seq, rdr->b - beg, beg);
+ beg = rdr->b + 1;
+ }
+
+ if(n++ == seq->len || *rdr->b == '\0') {
+ err = *rdr->b == '\0' ? EOF : 0;
+ goto SUCCESS;
+ }
+
+ rdr->b++;
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if(err = fill((bio·SeqReader*)rdr), err) {
+ errorf("read: could not populate buffer\n");
+ return 1;
+ }
+ goto QUAL;
+
+
+SUCCESS:
+ push(&rdr->seq, rdr->b - beg, beg);
+ put(&rdr->seq, '\0');
+
+ seq->name = rdr->seq->b;
+ seq->s = rdr->seq->b + rdr->seq->off - 1;
+ seq->q = seq->s + seq->len + 1;
+
+ return err;
+}
+
+// -----------------------------------------------------------------------
+// sequence writing
+
+error
+bio·writefasta(io·Writer io, void *wtr, bio·Seq seq)
+{
+ int i, j, d;
+ char buf[2048], *b = buf, *e = arrend(buf);
+
+ *b++ = '>';
+ while(*seq.name) {
+ *b++ = *seq.name++;
+ if(b == e) {
+ io.write(wtr, 1, arrlen(buf), buf);
+ b = buf;
+ }
+ }
+
+ for(i=0; i<seq.len; i = j) {
+ j = MIN(i+70, seq.len);
+ d = j - i;
+ if((e-b) <= d+1) {
+ io.write(wtr, 1, b-buf, buf);
+ b = buf;
+ }
+ *b++ = '\n';
+ memcpy(b, seq.s+i, d);
+ b += d;
+ }
+
+ *b++ = '\n';
+ io.write(wtr, 1, b-buf, buf);
+
+ return 0;
+}
+
+error
+bio·writefastq(io·Writer io, void *wtr, bio·Seq seq)
+{
+ panicf("need to implement");
+ return 1;
+}