aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-04-25 11:38:29 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-04-25 11:38:29 -0700
commit788ddbd8e113cd4f9694aee779c5b5dcca26e30b (patch)
tree32e2db53cf5cd3fb907e6b17ab9ef6bdab8d9f4c /sys
parent1f166f0a4ed9ed32ade1ed21a571c9b2d0171ebc (diff)
feat: updated fasta code to allow for iteration
Diffstat (limited to 'sys')
-rw-r--r--sys/libbio/io/fasta.c234
-rw-r--r--sys/libbio/io/newick.c7
-rw-r--r--sys/libbio/test.c42
-rw-r--r--sys/libn/io.c12
4 files changed, 289 insertions, 6 deletions
diff --git a/sys/libbio/io/fasta.c b/sys/libbio/io/fasta.c
index c1358b6..891e222 100644
--- a/sys/libbio/io/fasta.c
+++ b/sys/libbio/io/fasta.c
@@ -2,3 +2,237 @@
#include <libn.h>
#include <libbio.h>
+#define INIT_NM_SIZE 128
+#define INIT_SQ_SIZE 4096
+
+struct Seqbuf
+{
+ mem·Allocator heap;
+ void *h;
+
+ int len;
+ int off;
+ byte *it;
+ byte b[];
+};
+
+static
+void
+reset(struct Seqbuf *sb)
+{
+ sb->off = 0;
+ sb->it = sb->b;
+}
+
+static
+error
+grow(struct Seqbuf **sb, int min)
+{
+ struct Seqbuf *old, *new;
+ vlong newlen;
+
+ old = *sb;
+ Assert((*sb)->len <= (SIZE_MAX - 1) / 2);
+ newlen = MAX(16, MAX(1 + 2*(*sb)->len, (*sb)->len+min));
+ Assert(newlen >= (*sb)->len+min);
+
+ if (new = old->heap.alloc(old->h, 1, sizeof(*new)+newlen), !new) {
+ errorf("memory: could not allocate new buffer");
+ return 1;
+ }
+
+ memcpy(new, old, sizeof(*sb) + (*sb)->len);
+ new->len = newlen;
+ new->it = new->b + (old->len);
+ old->heap.free(old->h, old);
+
+ *sb = new;
+ return 0;
+}
+
+static
+error
+put(struct Seqbuf **sb, byte c)
+{
+ int err;
+ struct Seqbuf *sq;
+
+ sq = *sb;
+ if (sq->it < (sq->b + sq->len)) {
+ *sq->it++ = c;
+ return 0;
+ }
+
+ if (err = grow(sb, 1), err) {
+ errorf("memory fail: could not allocate more buffer");
+ sq->heap.free(sq->h, sq);
+ return 1;
+ }
+
+ *((*sb)->it++) = c;
+ return 0;
+}
+
+static
+error
+push(struct Seqbuf **sb, int n, void *buf)
+{
+ int d, err;
+ struct Seqbuf *seq;
+
+ seq = *sb;
+ if (d = seq->len - (seq->it - seq->b), d < n) {
+ assert(d > 0);
+ if (err = grow(sb, n-d), err) {
+ errorf("memory fail: could not allocate more buffer");
+ seq->heap.free(seq->h, seq);
+ return 1;
+ }
+ }
+
+ memcpy((*sb)->it, buf, n);
+ (*sb)->it += n;
+
+ return 0;
+}
+
+struct bio·FastaReader {
+ byte eof;
+ io·Reader file;
+ void *f;
+
+ struct Seqbuf *seq;
+
+ /* read buffer */
+ byte *b, *bend;
+ byte buf[4*4098];
+};
+
+static
+error
+fill(bio·FastaReader *rdr)
+{
+ int n;
+ // NOTE: This could lead to an infinite loop.
+ if (rdr->eof) {
+ return 0;
+ }
+
+ n = rdr->file.read(rdr->f, 1, arrlen(rdr->buf), rdr->buf);
+ if (n < 0) {
+ errorf("read: no data obtained from reader");
+ return 1;
+ }
+ rdr->b = rdr->buf;
+ rdr->bend = rdr->b + n;
+ if (rdr->eof = n < arrlen(rdr->buf), rdr->eof) {
+ *rdr->bend++ = '\0';
+ }
+
+ return 0;
+}
+
+bio·FastaReader*
+bio·newfastareader(io·Reader stream, void *s, mem·Allocator heap, void *h)
+{
+ error err;
+ bio·FastaReader *rdr;
+
+ rdr = heap.alloc(h, 1, sizeof(bio·FastaReader));
+ rdr->file = stream;
+ rdr->f = s;
+ rdr->eof = 0;
+
+ rdr->seq = heap.alloc(h, 1, sizeof(*rdr->seq) + INIT_NM_SIZE + INIT_SQ_SIZE);
+ rdr->seq->heap = heap;
+ rdr->seq->h = h;
+ rdr->seq->it = rdr->seq->b;
+ rdr->seq->len = INIT_NM_SIZE + INIT_SQ_SIZE;
+
+ if (err = fill(rdr), err) {
+ errorf("fill: could not populate buffer");
+ goto ERROR;
+ }
+
+ return rdr;
+
+ERROR:
+ heap.free(h, rdr->seq);
+ heap.free(h, rdr);
+ return nil;
+}
+
+error
+bio·readfasta(bio·FastaReader *rdr, bio·Seq *seq)
+{
+ error err;
+ byte *beg;
+
+ if (rdr->eof && rdr->b == rdr->bend-1) {
+ return EOF;
+ }
+
+ reset(rdr->seq);
+
+ // NOTE: Can this case happen?
+ Assert(rdr->b != rdr->bend);
+ if (*rdr->b++ != '>') {
+ errorf("fasta format: expected '>', found '%c'", *rdr->b--);
+ return 1;
+ }
+
+NAME:
+ beg = rdr->b;
+ while (rdr->b != rdr->bend) {
+ if (*rdr->b++ == '\n') {
+ push(&rdr->seq, (rdr->b - 1) - beg, beg);
+ goto SEQ;
+ }
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if (err = fill(rdr), err) {
+ errorf("read: could not populate buffer");
+ return 1;
+ }
+ goto NAME;
+
+SEQ:
+ put(&rdr->seq, '\0');
+ rdr->seq->off = rdr->seq->it - rdr->seq->b;
+
+SEQL:
+ beg = rdr->b;
+ while (rdr->b != rdr->bend) {
+ if (*rdr->b == '\n') {
+ push(&rdr->seq, rdr->b - beg, beg);
+ beg = rdr->b + 1;
+ }
+
+ if (*rdr->b == '>' || *rdr->b == '\0') {
+ goto SUCCESS;
+ }
+
+ rdr->b++;
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if (err = fill(rdr), err) {
+ errorf("read: could not populate buffer");
+ return 1;
+ }
+ goto SEQL;
+
+SUCCESS:
+ push(&rdr->seq, rdr->b - beg, beg);
+ put(&rdr->seq, '\0');
+
+ seq->name = rdr->seq->b;
+ seq->s = rdr->seq->b + rdr->seq->off;
+ seq->len = rdr->seq->it - seq->s;
+ seq->q = nil;
+
+ return 0;
+}
diff --git a/sys/libbio/io/newick.c b/sys/libbio/io/newick.c
index 8b02446..0c9921b 100644
--- a/sys/libbio/io/newick.c
+++ b/sys/libbio/io/newick.c
@@ -309,7 +309,7 @@ ERROR:
}
bio·Tree
-bio·readnewick(io·Peeker stream, void *si, mem·Allocator heap, void *hi)
+bio·readnewick(io·Peeker stream, void *s, mem·Allocator heap, void *h)
{
error err;
struct Parser p;
@@ -319,9 +319,9 @@ bio·readnewick(io·Peeker stream, void *si, mem·Allocator heap, void *hi)
.lev = 0,
.root = nil,
.tok = (struct Token){ 0 },
- .fimpl = si,
+ .fimpl = s,
.file = stream,
- .himpl = hi,
+ .himpl = h,
.heap = heap,
};
err = parse(&p);
@@ -337,6 +337,7 @@ bio·readnewick(io·Peeker stream, void *si, mem·Allocator heap, void *hi)
// -----------------------------------------------------------------------
// Write
+static
error
dump(bio·Node *node, void *impl, io·Putter out)
{
diff --git a/sys/libbio/test.c b/sys/libbio/test.c
index 115ee46..14f2b06 100644
--- a/sys/libbio/test.c
+++ b/sys/libbio/test.c
@@ -5,9 +5,8 @@
// -----------------------------------------------------------------------
// Point of entry for testing
-
-int
-main()
+error
+test·newick()
{
error err;
bio·Tree t;
@@ -36,3 +35,40 @@ main()
return 0;
}
+error
+test·fasta()
+{
+ error err;
+ Stream *fd;
+
+ bio·Seq seq;
+ bio·FastaReader *rdr;
+
+ fd = io·open("/home/nolln/root/data/test/zika.fa", "r");
+ rdr = bio·newfastareader((io·Reader){.read = &io·read}, fd, mem·sys, nil);
+
+ err = 0;
+ while (!err) {
+ err = bio·readfasta(rdr, &seq);
+ if (!err) {
+ printf(">%s\n", seq.name);
+ }
+ }
+
+ io·close(fd);
+ return err <= 0 ? 0 : 1;
+}
+
+error
+main()
+{
+ error err;
+ if (err = test·newick(), err) {
+ errorf("test fail: newick");
+ }
+
+ if (err = test·fasta(), err) {
+ errorf("test fail: fasta");
+ }
+}
+
diff --git a/sys/libn/io.c b/sys/libn/io.c
index ff64ff0..3827a82 100644
--- a/sys/libn/io.c
+++ b/sys/libn/io.c
@@ -10,6 +10,18 @@ io·open(byte *name, byte *mode)
return fopen(name, mode);
}
+int
+io·fd(Stream *s)
+{
+ return fileno(s);
+}
+
+error
+io·stat(Stream *s, io·Stat *buf)
+{
+ return fstat(fileno(s), buf);
+}
+
error
io·close(Stream *s)
{