From 788ddbd8e113cd4f9694aee779c5b5dcca26e30b Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Sat, 25 Apr 2020 11:38:29 -0700 Subject: feat: updated fasta code to allow for iteration --- sys/libbio/io/fasta.c | 234 +++++++++++++++++++++++++++++++++++++++++++++++++ sys/libbio/io/newick.c | 7 +- sys/libbio/test.c | 42 ++++++++- sys/libn/io.c | 12 +++ 4 files changed, 289 insertions(+), 6 deletions(-) (limited to 'sys') diff --git a/sys/libbio/io/fasta.c b/sys/libbio/io/fasta.c index c1358b6..891e222 100644 --- a/sys/libbio/io/fasta.c +++ b/sys/libbio/io/fasta.c @@ -2,3 +2,237 @@ #include #include +#define INIT_NM_SIZE 128 +#define INIT_SQ_SIZE 4096 + +struct Seqbuf +{ + mem·Allocator heap; + void *h; + + int len; + int off; + byte *it; + byte b[]; +}; + +static +void +reset(struct Seqbuf *sb) +{ + sb->off = 0; + sb->it = sb->b; +} + +static +error +grow(struct Seqbuf **sb, int min) +{ + struct Seqbuf *old, *new; + vlong newlen; + + old = *sb; + Assert((*sb)->len <= (SIZE_MAX - 1) / 2); + newlen = MAX(16, MAX(1 + 2*(*sb)->len, (*sb)->len+min)); + Assert(newlen >= (*sb)->len+min); + + if (new = old->heap.alloc(old->h, 1, sizeof(*new)+newlen), !new) { + errorf("memory: could not allocate new buffer"); + return 1; + } + + memcpy(new, old, sizeof(*sb) + (*sb)->len); + new->len = newlen; + new->it = new->b + (old->len); + old->heap.free(old->h, old); + + *sb = new; + return 0; +} + +static +error +put(struct Seqbuf **sb, byte c) +{ + int err; + struct Seqbuf *sq; + + sq = *sb; + if (sq->it < (sq->b + sq->len)) { + *sq->it++ = c; + return 0; + } + + if (err = grow(sb, 1), err) { + errorf("memory fail: could not allocate more buffer"); + sq->heap.free(sq->h, sq); + return 1; + } + + *((*sb)->it++) = c; + return 0; +} + +static +error +push(struct Seqbuf **sb, int n, void *buf) +{ + int d, err; + struct Seqbuf *seq; + + seq = *sb; + if (d = seq->len - (seq->it - seq->b), d < n) { + assert(d > 0); + if (err = grow(sb, n-d), err) { + errorf("memory fail: could not allocate more buffer"); + seq->heap.free(seq->h, seq); + return 1; + } + } + + memcpy((*sb)->it, buf, n); + (*sb)->it += n; + + return 0; +} + +struct bio·FastaReader { + byte eof; + io·Reader file; + void *f; + + struct Seqbuf *seq; + + /* read buffer */ + byte *b, *bend; + byte buf[4*4098]; +}; + +static +error +fill(bio·FastaReader *rdr) +{ + int n; + // NOTE: This could lead to an infinite loop. + if (rdr->eof) { + return 0; + } + + n = rdr->file.read(rdr->f, 1, arrlen(rdr->buf), rdr->buf); + if (n < 0) { + errorf("read: no data obtained from reader"); + return 1; + } + rdr->b = rdr->buf; + rdr->bend = rdr->b + n; + if (rdr->eof = n < arrlen(rdr->buf), rdr->eof) { + *rdr->bend++ = '\0'; + } + + return 0; +} + +bio·FastaReader* +bio·newfastareader(io·Reader stream, void *s, mem·Allocator heap, void *h) +{ + error err; + bio·FastaReader *rdr; + + rdr = heap.alloc(h, 1, sizeof(bio·FastaReader)); + rdr->file = stream; + rdr->f = s; + rdr->eof = 0; + + rdr->seq = heap.alloc(h, 1, sizeof(*rdr->seq) + INIT_NM_SIZE + INIT_SQ_SIZE); + rdr->seq->heap = heap; + rdr->seq->h = h; + rdr->seq->it = rdr->seq->b; + rdr->seq->len = INIT_NM_SIZE + INIT_SQ_SIZE; + + if (err = fill(rdr), err) { + errorf("fill: could not populate buffer"); + goto ERROR; + } + + return rdr; + +ERROR: + heap.free(h, rdr->seq); + heap.free(h, rdr); + return nil; +} + +error +bio·readfasta(bio·FastaReader *rdr, bio·Seq *seq) +{ + error err; + byte *beg; + + if (rdr->eof && rdr->b == rdr->bend-1) { + return EOF; + } + + reset(rdr->seq); + + // NOTE: Can this case happen? + Assert(rdr->b != rdr->bend); + if (*rdr->b++ != '>') { + errorf("fasta format: expected '>', found '%c'", *rdr->b--); + return 1; + } + +NAME: + beg = rdr->b; + while (rdr->b != rdr->bend) { + if (*rdr->b++ == '\n') { + push(&rdr->seq, (rdr->b - 1) - beg, beg); + goto SEQ; + } + } + + push(&rdr->seq, rdr->b - beg, beg); + + if (err = fill(rdr), err) { + errorf("read: could not populate buffer"); + return 1; + } + goto NAME; + +SEQ: + put(&rdr->seq, '\0'); + rdr->seq->off = rdr->seq->it - rdr->seq->b; + +SEQL: + beg = rdr->b; + while (rdr->b != rdr->bend) { + if (*rdr->b == '\n') { + push(&rdr->seq, rdr->b - beg, beg); + beg = rdr->b + 1; + } + + if (*rdr->b == '>' || *rdr->b == '\0') { + goto SUCCESS; + } + + rdr->b++; + } + + push(&rdr->seq, rdr->b - beg, beg); + + if (err = fill(rdr), err) { + errorf("read: could not populate buffer"); + return 1; + } + goto SEQL; + +SUCCESS: + push(&rdr->seq, rdr->b - beg, beg); + put(&rdr->seq, '\0'); + + seq->name = rdr->seq->b; + seq->s = rdr->seq->b + rdr->seq->off; + seq->len = rdr->seq->it - seq->s; + seq->q = nil; + + return 0; +} diff --git a/sys/libbio/io/newick.c b/sys/libbio/io/newick.c index 8b02446..0c9921b 100644 --- a/sys/libbio/io/newick.c +++ b/sys/libbio/io/newick.c @@ -309,7 +309,7 @@ ERROR: } bio·Tree -bio·readnewick(io·Peeker stream, void *si, mem·Allocator heap, void *hi) +bio·readnewick(io·Peeker stream, void *s, mem·Allocator heap, void *h) { error err; struct Parser p; @@ -319,9 +319,9 @@ bio·readnewick(io·Peeker stream, void *si, mem·Allocator heap, void *hi) .lev = 0, .root = nil, .tok = (struct Token){ 0 }, - .fimpl = si, + .fimpl = s, .file = stream, - .himpl = hi, + .himpl = h, .heap = heap, }; err = parse(&p); @@ -337,6 +337,7 @@ bio·readnewick(io·Peeker stream, void *si, mem·Allocator heap, void *hi) // ----------------------------------------------------------------------- // Write +static error dump(bio·Node *node, void *impl, io·Putter out) { diff --git a/sys/libbio/test.c b/sys/libbio/test.c index 115ee46..14f2b06 100644 --- a/sys/libbio/test.c +++ b/sys/libbio/test.c @@ -5,9 +5,8 @@ // ----------------------------------------------------------------------- // Point of entry for testing - -int -main() +error +test·newick() { error err; bio·Tree t; @@ -36,3 +35,40 @@ main() return 0; } +error +test·fasta() +{ + error err; + Stream *fd; + + bio·Seq seq; + bio·FastaReader *rdr; + + fd = io·open("/home/nolln/root/data/test/zika.fa", "r"); + rdr = bio·newfastareader((io·Reader){.read = &io·read}, fd, mem·sys, nil); + + err = 0; + while (!err) { + err = bio·readfasta(rdr, &seq); + if (!err) { + printf(">%s\n", seq.name); + } + } + + io·close(fd); + return err <= 0 ? 0 : 1; +} + +error +main() +{ + error err; + if (err = test·newick(), err) { + errorf("test fail: newick"); + } + + if (err = test·fasta(), err) { + errorf("test fail: fasta"); + } +} + diff --git a/sys/libn/io.c b/sys/libn/io.c index ff64ff0..3827a82 100644 --- a/sys/libn/io.c +++ b/sys/libn/io.c @@ -10,6 +10,18 @@ io·open(byte *name, byte *mode) return fopen(name, mode); } +int +io·fd(Stream *s) +{ + return fileno(s); +} + +error +io·stat(Stream *s, io·Stat *buf) +{ + return fstat(fileno(s), buf); +} + error io·close(Stream *s) { -- cgit v1.2.1