aboutsummaryrefslogtreecommitdiff
path: root/sys/libbio/fasta.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/libbio/fasta.c')
-rw-r--r--sys/libbio/fasta.c393
1 files changed, 0 insertions, 393 deletions
diff --git a/sys/libbio/fasta.c b/sys/libbio/fasta.c
deleted file mode 100644
index 3788544..0000000
--- a/sys/libbio/fasta.c
+++ /dev/null
@@ -1,393 +0,0 @@
-#include <u.h>
-#include <base.h>
-#include <libbio.h>
-
-#define INIT_NM_SIZE 128
-#define INIT_SQ_SIZE 4096
-
-struct SeqBuf
-{
- mem·Allocator mem;
- void *heap;
-
- int cap, off;
- byte *it, b[];
-};
-
-static
-void
-reset(struct SeqBuf *sb)
-{
- sb->off = 0;
- sb->it = sb->b;
-}
-
-static
-error
-grow(struct SeqBuf **sb, int min)
-{
- void* heap;
- mem·Allocator mem;
-
- vlong newcap;
- struct SeqBuf *old, *new;
-
- old = *sb;
- mem = old->mem;
- heap = old->heap;
-
- assert((*sb)->cap <= (SIZE_MAX - 1) / 2);
- newcap = MAX(16, MAX(1 + 2*(*sb)->cap, (*sb)->cap+min));
- assert(newcap >= (*sb)->cap+min);
-
- if(new = mem.alloc(heap, 1, sizeof(*new)+newcap), !new) {
- errorf("memory: could not allocate new buffer\n");
- return 1;
- }
-
- memcpy(new, old, sizeof(*new) + (*sb)->cap);
-
- new->cap = newcap;
- new->it = new->b + (old->it - old->b);
- mem.free(heap, old);
-
- *sb = new;
- return 0;
-}
-
-static
-error
-put(struct SeqBuf **sb, byte c)
-{
- int err;
- struct SeqBuf *sq;
-
- sq = *sb;
- if(sq->it < (sq->b + sq->cap)) {
- *sq->it++ = c;
- return 0;
- }
-
- if(err = grow(sb, 1), err) {
- errorf("memory fail: could not allocate more buffer\n");
- sq->mem.free(sq->heap, sq);
- return 1;
- }
-
- *((*sb)->it++) = c;
- return 0;
-}
-
-static
-error
-push(struct SeqBuf **sb, int n, void *buf)
-{
- int d, err;
- struct SeqBuf *seq;
-
- seq = *sb;
- if(d = (seq->cap - (seq->it - seq->b)), d < n) {
- assert(d >= 0);
- if (err = grow(sb, n-d), err) {
- errorf("memory fail: could not allocate more buffer\n");
- seq->mem.free(seq->heap, seq);
- return 1;
- }
- }
- seq = *sb;
-
- memcpy(seq->it, buf, n);
- seq->it += n;
-
- return 0;
-}
-
-// -----------------------------------------------------------------------
-// sequence data
-
-struct bio·SeqReader {
- byte eof;
- io·Reader rdr;
- void *io;
-
- struct SeqBuf *seq;
-
- /* read buffer */
- byte *b, *bend, buf[4*4098];
-};
-
-static
-error
-fill(bio·SeqReader *rdr)
-{
- int n;
- // NOTE: This could lead to an infinite loop.
- if(rdr->eof)
- return 0;
-
- n = rdr->rdr.read(rdr->io, 1, arrlen(rdr->buf), rdr->buf);
- if(n < 0) {
- errorf("read: no data obtained from reader\n");
- return 1;
- }
- rdr->b = rdr->buf;
- rdr->bend = rdr->b + n;
- if(rdr->eof = (n < arrlen(rdr->buf)), rdr->eof)
- *rdr->bend++ = '\0';
-
- return 0;
-}
-
-bio·SeqReader*
-bio·openseq(io·Reader rdr, void *io, mem·Allocator mem, void *heap)
-{
- error err;
- bio·SeqReader *r;
-
- r = mem.alloc(heap, 1, sizeof(bio·SeqReader));
- r->rdr = rdr;
- r->io = io;
- r->eof = 0;
-
- r->seq = mem.alloc(heap, 1, sizeof(*r->seq) + INIT_NM_SIZE + INIT_SQ_SIZE);
- r->seq->mem = mem;
- r->seq->heap = heap;
- r->seq->it = r->seq->b;
- r->seq->cap = INIT_NM_SIZE + INIT_SQ_SIZE;
-
- if (err=fill(r), err) {
- errorf("fill: could not populate buffer\n");
- goto ERROR;
- }
-
- return r;
-
-ERROR:
- mem.free(heap, r->seq);
- mem.free(heap, r);
- return nil;
-}
-
-error
-bio·closeseq(bio·SeqReader *rdr)
-{
- mem·Allocator mem;
- void *heap;
-
- mem = rdr->seq->mem;
- heap = rdr->seq->heap;
-
- mem.free(heap, rdr->seq);
- mem.free(heap, rdr);
-
- return 0;
-}
-
-
-static
-error
-readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop)
-{
- error err;
- byte *beg;
-
- if(rdr->eof && rdr->b == rdr->bend-1)
- return EOF;
-
- reset(rdr->seq);
-
- // NOTE: Can this case happen?
- assert(rdr->b != rdr->bend);
- if(*rdr->b++ != hdr) {
- errorf("fasta/q format: expected '%c', found '%c'\n", hdr, *rdr->b--);
- return 1;
- }
-
-NAME:
- beg = rdr->b;
- while(rdr->b != rdr->bend) {
- if(*rdr->b++ == '\n') {
- push(&rdr->seq, (rdr->b - 1) - beg, beg);
- goto SEQ;
- }
- }
- push(&rdr->seq, rdr->b - beg, beg);
-
- if(err=fill(rdr), err) {
- errorf("read: could not populate buffer\n");
- return 1;
- }
- goto NAME;
-
-SEQ:
- put(&rdr->seq, '\0');
- rdr->seq->off = rdr->seq->it - rdr->seq->b;
-
-SEQLOOP:
- beg = rdr->b;
- while(rdr->b != rdr->bend) {
- if(*rdr->b == '\n') {
- push(&rdr->seq, rdr->b - beg, beg);
- beg = rdr->b + 1;
- }
-
- if(*rdr->b == stop || *rdr->b == '\0')
- goto SUCCESS;
-
- rdr->b++;
- }
-
- push(&rdr->seq, rdr->b - beg, beg);
-
- if(err=fill(rdr), err) {
- errorf("read: could not populate buffer\n");
- return 1;
- }
- goto SEQLOOP;
-
-SUCCESS:
- push(&rdr->seq, rdr->b - beg, beg);
- put(&rdr->seq, '\0');
-
- return 0;
-}
-
-/*
- * fasta files
- */
-
-error
-bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq)
-{
- error err;
-
- err = readfasta(rdr, seq, '>', '>');
- if(err && err != EOF) {
- errorf("parse fail: could not read sequence of record\n");
- return err;
- }
-
- seq->name = rdr->seq->b;
- seq->s = rdr->seq->b + rdr->seq->off;
- seq->len = rdr->seq->it - seq->s - 1; // shift by 1 as we pushed a '0' to end
- seq->q = nil;
-
- return err;
-}
-
-/*
- * fastq files
- */
-
-error
-bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq)
-{
- int n;
- byte *beg;
- error err;
-
- err = readfasta(rdr, seq, '@', '+');
- if(err) {
- errorf("parse fail: could not read sequence of record\n");
- return err;
- }
-
- seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off);
-
- if(*rdr->b++ != '+') {
- errorf("format error: no '+' character seperator found\n");
- return -1;
- }
-
-EATLN:
- while(rdr->b != rdr->bend) {
- if (*rdr->b++ == '\n') {
- n = 0;
- goto QUAL;
- }
- }
-
- if(err = fill((bio·SeqReader*)rdr), err) {
- errorf("read: could not populate buffer\n");
- return 1;
- }
- goto EATLN;
-
-QUAL:
- beg = rdr->b;
- while(rdr->b != rdr->bend) {
- if(*rdr->b == '\n') {
- push(&rdr->seq, rdr->b - beg, beg);
- beg = rdr->b + 1;
- }
-
- if(n++ == seq->len || *rdr->b == '\0') {
- err = *rdr->b == '\0' ? EOF : 0;
- goto SUCCESS;
- }
-
- rdr->b++;
- }
-
- push(&rdr->seq, rdr->b - beg, beg);
-
- if(err = fill((bio·SeqReader*)rdr), err) {
- errorf("read: could not populate buffer\n");
- return 1;
- }
- goto QUAL;
-
-
-SUCCESS:
- push(&rdr->seq, rdr->b - beg, beg);
- put(&rdr->seq, '\0');
-
- seq->name = rdr->seq->b;
- seq->s = rdr->seq->b + rdr->seq->off - 1;
- seq->q = seq->s + seq->len + 1;
-
- return err;
-}
-
-// -----------------------------------------------------------------------
-// sequence writing
-
-error
-bio·writefasta(io·Writer io, void *wtr, bio·Seq seq)
-{
- int i, j, d;
- char buf[2048], *b = buf, *e = arrend(buf);
-
- *b++ = '>';
- while(*seq.name) {
- *b++ = *seq.name++;
- if(b == e) {
- io.write(wtr, 1, arrlen(buf), buf);
- b = buf;
- }
- }
-
- for(i=0; i<seq.len; i = j) {
- j = MIN(i+70, seq.len);
- d = j - i;
- if((e-b) <= d+1) {
- io.write(wtr, 1, b-buf, buf);
- b = buf;
- }
- *b++ = '\n';
- memcpy(b, seq.s+i, d);
- b += d;
- }
-
- *b++ = '\n';
- io.write(wtr, 1, b-buf, buf);
-
- return 0;
-}
-
-error
-bio·writefastq(io·Writer io, void *wtr, bio·Seq seq)
-{
- panicf("need to implement");
- return 1;
-}