aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2021-04-22 09:00:11 -0700
committerNicholas Noll <nbnoll@eml.cc>2021-04-22 09:00:11 -0700
commit1f312a24606b09cf1a41aa79946b1963a242f648 (patch)
treeac2a2784979453a0bab6ca930b4aeb169330990a
parent1256c8655b794bb449fb94e7d797436b1c325689 (diff)
feat(libbio): fasta and newick parsing/output
-rw-r--r--sys/libbio/fasta.c346
-rw-r--r--sys/libbio/newick.c418
2 files changed, 764 insertions, 0 deletions
diff --git a/sys/libbio/fasta.c b/sys/libbio/fasta.c
new file mode 100644
index 0000000..484ebb2
--- /dev/null
+++ b/sys/libbio/fasta.c
@@ -0,0 +1,346 @@
+#include <u.h>
+#include <libn.h>
+#include <libbio.h>
+
+#define INIT_NM_SIZE 128
+#define INIT_SQ_SIZE 4096
+
+struct Seqbuf
+{
+ mem·Allocator heap;
+ void *h;
+
+ int len, off;
+ byte *it, b[];
+};
+
+static
+void
+reset(struct Seqbuf *sb)
+{
+ sb->off = 0;
+ sb->it = sb->b;
+}
+
+static
+error
+grow(struct Seqbuf **sb, int min)
+{
+ struct Seqbuf *old, *new;
+ vlong newlen;
+
+ old = *sb;
+ assert((*sb)->len <= (SIZE_MAX - 1) / 2);
+ newlen = MAX(16, MAX(1 + 2*(*sb)->len, (*sb)->len+min));
+ assert(newlen >= (*sb)->len+min);
+
+ if (new = old->heap.alloc(old->h, 1, sizeof(*new)+newlen), !new) {
+ errorf("memory: could not allocate new buffer");
+ return 1;
+ }
+
+ memcpy(new, old, sizeof(*sb) + (*sb)->len);
+ new->len = newlen;
+ new->it = new->b + (old->len);
+ old->heap.free(old->h, old);
+
+ *sb = new;
+ return 0;
+}
+
+static
+error
+put(struct Seqbuf **sb, byte c)
+{
+ int err;
+ struct Seqbuf *sq;
+
+ sq = *sb;
+ if (sq->it < (sq->b + sq->len)) {
+ *sq->it++ = c;
+ return 0;
+ }
+
+ if (err = grow(sb, 1), err) {
+ errorf("memory fail: could not allocate more buffer");
+ sq->heap.free(sq->h, sq);
+ return 1;
+ }
+
+ *((*sb)->it++) = c;
+ return 0;
+}
+
+static
+error
+push(struct Seqbuf **sb, int n, void *buf)
+{
+ int d, err;
+ struct Seqbuf *seq;
+
+ seq = *sb;
+ if (d = seq->len - (seq->it - seq->b), d < n) {
+ assert(d > 0);
+ if (err = grow(sb, n-d), err) {
+ errorf("memory fail: could not allocate more buffer");
+ seq->heap.free(seq->h, seq);
+ return 1;
+ }
+ }
+
+ memcpy((*sb)->it, buf, n);
+ (*sb)->it += n;
+
+ return 0;
+}
+
+// -----------------------------------------------------------------------
+// sequence data
+
+struct bio·SeqReader {
+ byte eof;
+ io·Reader file;
+ void *f;
+
+ struct Seqbuf *seq;
+
+ /* read buffer */
+ byte *b, *bend;
+ byte buf[4*4098];
+};
+
+static
+error
+fill(bio·SeqReader *rdr)
+{
+ int n;
+ // NOTE: This could lead to an infinite loop.
+ if (rdr->eof) {
+ return 0;
+ }
+
+ n = rdr->file.read(rdr->f, 1, arrlen(rdr->buf), rdr->buf);
+ if (n < 0) {
+ errorf("read: no data obtained from reader");
+ return 1;
+ }
+ rdr->b = rdr->buf;
+ rdr->bend = rdr->b + n;
+ if (rdr->eof = n < arrlen(rdr->buf), rdr->eof) {
+ *rdr->bend++ = '\0';
+ }
+
+ return 0;
+}
+
+bio·SeqReader*
+bio·openseq(io·Reader file, void *f, mem·Allocator heap, void *h)
+{
+ error err;
+ bio·SeqReader *rdr;
+
+ rdr = heap.alloc(h, 1, sizeof(bio·SeqReader));
+ rdr->file = file;
+ rdr->f = f;
+ rdr->eof = 0;
+
+ rdr->seq = heap.alloc(h, 1, sizeof(*rdr->seq) + INIT_NM_SIZE + INIT_SQ_SIZE);
+ rdr->seq->heap = heap;
+ rdr->seq->h = h;
+ rdr->seq->it = rdr->seq->b;
+ rdr->seq->len = INIT_NM_SIZE + INIT_SQ_SIZE;
+
+ if (err = fill(rdr), err) {
+ errorf("fill: could not populate buffer");
+ goto ERROR;
+ }
+
+ return rdr;
+
+ERROR:
+ heap.free(h, rdr->seq);
+ heap.free(h, rdr);
+ return nil;
+}
+
+error
+bio·closeseq(bio·SeqReader *rdr)
+{
+ mem·Allocator heap;
+ void *h;
+
+ heap = rdr->seq->heap;
+ h = rdr->seq->h;
+
+ heap.free(h, rdr->seq);
+ heap.free(h, rdr);
+
+ return 0;
+}
+
+
+static
+error
+readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop)
+{
+ error err;
+ byte *beg;
+
+ if (rdr->eof && rdr->b == rdr->bend-1)
+ return EOF;
+
+ reset(rdr->seq);
+
+ // NOTE: Can this case happen?
+ assert(rdr->b != rdr->bend);
+ if(*rdr->b++ != hdr) {
+ errorf("fasta/q format: expected '%c', found '%c'", hdr, *rdr->b--);
+ return 1;
+ }
+
+NAME:
+ beg = rdr->b;
+ while(rdr->b != rdr->bend) {
+ if(*rdr->b++ == '\n') {
+ push(&rdr->seq, (rdr->b - 1) - beg, beg);
+ goto SEQ;
+ }
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if(err = fill(rdr), err) {
+ errorf("read: could not populate buffer");
+ return 1;
+ }
+ goto NAME;
+
+SEQ:
+ put(&rdr->seq, '\0');
+ rdr->seq->off = rdr->seq->it - rdr->seq->b;
+
+SEQL:
+ beg = rdr->b;
+ while(rdr->b != rdr->bend) {
+ if(*rdr->b == '\n') {
+ push(&rdr->seq, rdr->b - beg, beg);
+ beg = rdr->b + 1;
+ }
+
+ if(*rdr->b == stop || *rdr->b == '\0')
+ goto SUCCESS;
+
+ rdr->b++;
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if(err = fill(rdr), err) {
+ errorf("read: could not populate buffer");
+ return 1;
+ }
+ goto SEQL;
+
+SUCCESS:
+ push(&rdr->seq, rdr->b - beg, beg);
+ put(&rdr->seq, '\0');
+
+ return 0;
+}
+
+/*
+ * fasta files
+ */
+
+error
+bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq)
+{
+ error err;
+
+ err = readfasta(rdr, seq, '>', '>');
+ if(err && err != EOF) {
+ errorf("parse fail: could not read sequence of record");
+ return err;
+ }
+
+ seq->name = rdr->seq->b;
+ seq->s = rdr->seq->b + rdr->seq->off;
+ seq->len = rdr->seq->it - seq->s;
+ seq->q = nil;
+
+ return err;
+}
+
+/*
+ * fastq files
+ */
+
+error
+bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq)
+{
+ int n;
+ byte *beg;
+ error err;
+
+ err = readfasta(rdr, seq, '@', '+');
+ if(err) {
+ errorf("parse fail: could not read sequence of record");
+ return err;
+ }
+
+ seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off);
+
+ if(*rdr->b++ != '+') {
+ errorf("format error: no '+' character seperator found");
+ return -1;
+ }
+
+EATLN:
+ while(rdr->b != rdr->bend) {
+ if (*rdr->b++ == '\n') {
+ n = 0;
+ goto QUAL;
+ }
+ }
+
+ if(err = fill((bio·SeqReader*)rdr), err) {
+ errorf("read: could not populate buffer");
+ return 1;
+ }
+ goto EATLN;
+
+QUAL:
+ beg = rdr->b;
+ while(rdr->b != rdr->bend) {
+ if(*rdr->b == '\n') {
+ push(&rdr->seq, rdr->b - beg, beg);
+ beg = rdr->b + 1;
+ }
+
+ if(n++ == seq->len || *rdr->b == '\0') {
+ err = *rdr->b == '\0' ? EOF : 0;
+ goto SUCCESS;
+ }
+
+ rdr->b++;
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if(err = fill((bio·SeqReader*)rdr), err) {
+ errorf("read: could not populate buffer");
+ return 1;
+ }
+ goto QUAL;
+
+
+SUCCESS:
+ push(&rdr->seq, rdr->b - beg, beg);
+ put(&rdr->seq, '\0');
+
+ seq->name = rdr->seq->b;
+ seq->s = rdr->seq->b + rdr->seq->off;
+ seq->q = seq->s + seq->len + 1;
+
+ return err;
+}
diff --git a/sys/libbio/newick.c b/sys/libbio/newick.c
new file mode 100644
index 0000000..164516f
--- /dev/null
+++ b/sys/libbio/newick.c
@@ -0,0 +1,418 @@
+#include <u.h>
+#include <libn.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// Tokens
+
+enum TokenKind
+{
+ tok·nil,
+ tok·eof,
+ tok·space,
+ tok·ident,
+ tok·number,
+ tok·lparen,
+ tok·rparen,
+ tok·lbrak,
+ tok·rbrak,
+ tok·comma,
+ tok·semi,
+ tok·colon,
+
+ NUM_TOKENS,
+};
+
+
+struct Token {
+ enum TokenKind kind;
+ union
+ {
+ byte *s;
+ double x;
+ } lit;
+};
+
+static
+byte*
+tokstr(struct Token tok)
+{
+ static byte b[50];
+ switch (tok.kind) {
+ case tok·nil: return "";
+ case tok·eof: return nil;
+ case tok·space: return " ";
+ case tok·ident: return tok.lit.s;
+ case tok·lparen: return "(";
+ case tok·rparen: return ")";
+ case tok·lbrak: return "[";
+ case tok·rbrak: return "]";
+ case tok·comma: return ",";
+ case tok·semi: return ";";
+ case tok·colon: return ":";
+ case tok·number:
+ snprintf(b, arrlen(b), "%f", tok.lit.x);
+ return b;
+ default:
+ return nil;
+ }
+}
+
+
+// -----------------------------------------------------------------------
+// Read
+
+// TODO: Bounds checking on buffer
+static
+struct Token
+lex(io·Peeker s, void* fp)
+{
+#define isvalidchar(C) ((C) == '!') || \
+ ('\"' < (C) && (C) < '\'') || \
+ (')' < (C) && (C) < '+') || \
+ (',' < (C) && (C) < ':') || \
+ (':' < (C) && (C) < '[') || \
+ ((C) == '\\') || \
+ (']' < (C) && (C) <= '~')
+ byte *c;
+ struct Token tok;
+ static byte b[1024];
+ c = b;
+ *c = s.get(fp);
+
+ if (isspace(*c)) {
+ while (isspace(*c)) {
+ *(++c) = s.get(fp);
+ }
+
+ s.unget(fp, *c);
+ assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·space;
+ tok.lit.s = b;
+ return tok;
+ }
+
+ switch (*c) {
+ case EOF: tok.kind = tok·eof; return tok;
+ case '(': tok.kind = tok·lparen; return tok;
+ case ')': tok.kind = tok·rparen; return tok;
+ case '[': tok.kind = tok·lbrak; return tok;
+ case ']': tok.kind = tok·rbrak; return tok;
+ case ',': tok.kind = tok·comma; return tok;
+ case ';': tok.kind = tok·semi; return tok;
+ case ':': tok.kind = tok·colon; return tok;
+
+ case '.':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ while (isdigit(*c)) {
+ NUM: *(++c) = s.get(fp);
+ }
+ if (*c == '.') goto NUM;
+ if (isvalidchar(*c)) goto IDENT;
+
+ s.unget(fp, *c);
+ assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·number;
+ tok.lit.x = atof(b);
+ return tok;
+
+ case '\"':
+ while ((*c) != '\"') {
+ *(++c) = s.get(fp);
+ }
+ assert(c - b < 1024);
+
+ *c = '\0';
+ tok.kind = tok·ident;
+ tok.lit.s = b + 1;
+ return tok;
+
+ default:
+ IDENT:
+ while (isvalidchar(*c)) {
+ *(++c) = s.get(fp);
+ }
+ s.unget(fp, *c);
+ assert(c - b < 1024);
+
+ *c = '\0';
+ tok.kind = tok·ident;
+ tok.lit.s = b;
+ return tok;
+ }
+#undef isvalidchar
+}
+
+static
+struct Token
+lex_nospace(io·Peeker s, void *impl)
+{
+ struct Token tok;
+ tok = lex(s, impl);
+ if (tok.kind == tok·space) {
+ tok = lex_nospace(s, impl);
+ }
+
+ return tok;
+}
+
+struct Parser
+{
+ int lev;
+ bio·Node *root;
+ struct Token tok;
+
+ void *fimpl;
+ io·Peeker file;
+ void *himpl;
+ mem·Allocator heap;
+};
+
+static
+error
+parse(struct Parser *p)
+{
+ error err;
+ bio·Node *node;
+ bio·Node *root;
+ struct Token tok;
+
+ node = p->root;
+ for (;;) {
+ tok = lex_nospace(p->file, p->fimpl);
+
+ switch (tok.kind) {
+ case tok·lparen:
+ if (!p->root && p->lev > 0) {
+ errorf("parse format: attempted to make root at non-zero level");
+ goto ERROR;
+ }
+
+ node = p->heap.alloc(p->himpl, 1, sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ if (p->root) {
+ phylo·addchild(p->root, node);
+ root = p->root;
+ } else {
+ root = node;
+ }
+
+ p->lev++;
+ p->root = node;
+ p->tok = tok;
+ err = parse(p);
+ if (err) {
+ goto ERROR;
+ }
+ if (p->tok.kind != tok·rparen) {
+ errorf("incorrect format: closing parentheses expected to proceed opening");
+ goto ERROR;
+ }
+ p->root = root;
+ // NOTE(nnoll): We don't want to override the state of p->tok here!
+ // Jump straight to grabbing next token.
+ continue;
+
+ case tok·rparen:
+ p->lev--;
+ goto DONE;
+
+ /* Comments */
+ case tok·lbrak:
+ if (!node) {
+ errorf("incorrect format: comment found in disallowed region");
+ goto ERROR;
+ }
+ node->comment = str·make("");
+ while (tok.kind != tok·rbrak) {
+ tok = lex_nospace(p->file, p->fimpl);
+ if (tok.kind == tok·eof || tok.kind == tok·nil) {
+ errorf("incorrect format: unmatched comment bracket '['");
+ goto ERROR;
+ }
+ str·append(&node->comment, tokstr(tok));
+ }
+ break;
+
+ case tok·rbrak:
+ errorf("incorrect format: end comment token found in disallowed region");
+ goto ERROR;
+ break;
+
+ case tok·colon:
+ tok = lex_nospace(p->file, p->fimpl);
+ if (tok.kind != tok·number) {
+ errorf("incorrect format: expected number after colon");
+ goto ERROR;
+ }
+ if (node == nil) {
+ errorf("parse error: attempting to set distance of nil node");
+ goto ERROR;
+ }
+ node->dist = tok.lit.x;
+ break;
+
+ case tok·comma:
+ node = nil;
+ break;
+
+ case tok·ident:
+ if (p->tok.kind == tok·rparen) {
+ if (!node) {
+ errorf("parse error: attempting to set name of nil node");
+ goto ERROR;
+ }
+ node->name = str·make(tok.lit.s);
+ } else {
+ if (p->tok.kind != tok·lparen && p->tok.kind != tok·comma) {
+ errorf("format error: misplaced identifier for leaf found");
+ goto ERROR;
+ }
+
+ if (!p->root) {
+ errorf("parse error: attempting to create child for no parent");
+ goto ERROR;
+ }
+
+ node = p->heap.alloc(p->himpl, 1, sizeof(*node));
+ memset(node, 0, sizeof(*node));
+ node->name = str·make(tok.lit.s);
+
+ phylo·addchild(p->root, node);
+ }
+ break;
+
+ case tok·number:
+ if (p->tok.kind == tok·rparen) {
+ if (p->lev == 0) {
+ errorf("format error: support value on root not supported");
+ goto ERROR;
+ }
+ node->support = tok.lit.x;
+ } else {
+ errorf("format error: found number in unexpected location");
+ goto ERROR;
+ }
+ break;
+
+ case tok·semi:
+ p->file.unget(p->fimpl, ';');
+ if (p->lev) {
+ errorf("format error: uneven number of parentheses found at ';'");
+ goto ERROR;
+ }
+ goto DONE;
+
+ case tok·eof:
+ goto DONE;
+
+ default:
+ errorf("parse error: unrecognized token");
+ goto ERROR;
+ }
+
+ p->tok = tok;
+ }
+
+DONE:
+ p->tok = tok;
+ return 0;
+ERROR:
+ // TODO(nnoll): cleanup
+ return 1;
+}
+
+error
+bio·readnewick(io·Peeker stream, void *s, bio·Tree *tree)
+{
+ error err;
+ struct Parser p;
+ enum
+ {
+ error·nil,
+ error·notree,
+ error·parse,
+ };
+
+ if (!tree) {
+ return error·notree;
+ }
+
+ p = (struct Parser){
+ .lev = 0,
+ .root = nil,
+ .tok = (struct Token){ 0 },
+ .fimpl = s,
+ .file = stream,
+ .himpl = tree->h,
+ .heap = tree->heap,
+ };
+ err = parse(&p);
+ if (err) {
+ errorf("parsing failed\n");
+ return error·parse;
+ }
+
+ tree->root = p.root;
+ tree->nleaf = 0;
+ tree->root->nnode = 0;
+
+ phylo·countleafs(tree->root, &tree->nleaf);
+ phylo·countnodes(tree->root, &tree->root->nnode);
+
+ return error·nil;
+}
+
+// -----------------------------------------------------------------------
+// Write
+
+static
+error
+dump(bio·Node *node, void *impl, io·Putter out)
+{
+ byte b[24];
+
+ if (!node) {
+ return 1;
+ }
+
+ bio·Node *child;
+ if (node->nchild) {
+ out.put(impl, '(');
+
+ dump(node->child, impl, out);
+ for (child = node->child->sibling; child != nil; child = child->sibling) {
+ out.put(impl, ',');
+ dump(child, impl, out);
+ }
+
+ out.put(impl, ')');
+ }
+ if (node->name) {
+ out.putstr(impl, node->name);
+ }
+
+ if (node->parent) {
+ out.put(impl, ':');
+ snprintf(b, arrlen(b), "%f", node->dist);
+ out.putstr(impl, b);
+ }
+
+ return 0;
+}
+
+error
+bio·writenewick(bio·Tree tree, io·Putter out, void* impl)
+{
+ dump(tree.root, impl, out);
+ out.put(impl, ';');
+ out.put(impl, '\n');
+
+ return 0;
+}