aboutsummaryrefslogtreecommitdiff
path: root/src/libbio
diff options
context:
space:
mode:
Diffstat (limited to 'src/libbio')
-rw-r--r--src/libbio/align.c178
-rw-r--r--src/libbio/fasta.c393
-rw-r--r--src/libbio/newick.c414
-rw-r--r--src/libbio/phylo.c427
-rw-r--r--src/libbio/rules.mk24
-rw-r--r--src/libbio/simulate.c120
-rw-r--r--src/libbio/test.c283
7 files changed, 1839 insertions, 0 deletions
diff --git a/src/libbio/align.c b/src/libbio/align.c
new file mode 100644
index 0000000..20a57df
--- /dev/null
+++ b/src/libbio/align.c
@@ -0,0 +1,178 @@
+#include <u.h>
+#include <libn.h>
+#include <libn/macro/qsort.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// globals
+
+uint64 aln·shft = (2ULL * (aln·K - 1ULL));
+uint64 aln·mask = (1ULL << (2*aln·K)) - 1ULL;
+
+// -----------------------------------------------------------------------
+// static data
+
+static uint64 nuctab[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+};
+
+// -----------------------------------------------------------------------
+// hash functions
+
+enum
+{
+ MURM641 = 0xff51afd7ed558ccd,
+ MURM642 = 0xc4ceb9fe1a85ec53
+};
+
+static
+uint64
+minihash(uint64 x, uint64 mask)
+{
+ x = (~x + (x << 21)) & mask;
+ x = x ^ (x >> 24);
+ x = (x + (x << 3) + (x << 8)) & mask;
+ x = x ^ (x >> 14);
+ x = (x + (x << 2) + (x << 4)) & mask;
+ x = x ^ (x >> 28);
+ x = (x + (x << 31));
+
+ return x;
+}
+
+static
+uint64
+murmurhash(uint64 x, uint64 mask)
+{
+ x = x ^ (x >> 33);
+ x = (x * MURM641);
+ x = x ^ (x >> 33);
+ x = (x * MURM642);
+ x = x ^ (x >> 33);
+
+ return x;
+}
+
+// -----------------------------------------------------------------------
+// locality sensitive hashing (with spatial extent)
+
+static
+void
+sortpos(uintptr len, uint64 vals[], int locs[])
+{
+ int tmpi;
+ uint64 tmpu64;
+
+#define LESS(i, j) (locs[i] < locs[j])
+#define SWAP(i, j) (tmpu64 = vals[i], tmpi = locs[i], \
+ vals[i] = vals[j], locs[i] = locs[j], \
+ vals[j] = tmpu64 , locs[j] = tmpi)
+ QSORT(len, LESS, SWAP);
+#undef LESS
+#undef SWAP
+}
+
+/*
+ * sketch
+ * @param seq: '0' terminated string
+ * @param len: number of sequential sketches to keep
+ * @param vals: buffer to store hashes of sketch.
+ * @param locs: buffer to store location of sketch hashes
+ */
+error
+aln·sketch(byte *seq, int len, uint64 *vals[aln·N], int *locs[aln·N])
+{
+ int i, n, l, *loc;
+ uint64 kmer, h[3], *val;
+ int tmpi[2];
+ uint64 tmpu[2];
+
+ for(n = 0; n < aln·N; n++) {
+ for(l = 0; l < len; l++) {
+ vals[n][l] = UINT64_MAX;
+ }
+ }
+
+ kmer = UINT64_MAX;
+ for(l = 0; *seq != '\0'; seq++, l++) {
+ kmer = ((kmer << 2) | nuctab[*seq]) & aln·mask;
+
+ h[0] = minihash(kmer, aln·mask);
+ h[1] = murmurhash(kmer, aln·mask);
+
+ for(n = 0; n < aln·N; n++) {
+ val = vals[n];
+ loc = locs[n];
+
+ h[2] = (h[0] + n * h[1]) & aln·mask;
+ for (i = 0; i < len && h[2] < val[i]; i++) {
+ ;
+ }
+
+ tmpu[1] = h[2];
+ tmpi[1] = l;
+ for(i -= 1; i >= 0; i--) {
+ tmpu[0] = tmpu[1], tmpu[1] = val[i], val[i] = tmpu[0];
+ tmpi[0] = tmpi[1], tmpi[1] = loc[i], loc[i] = tmpi[0];
+ }
+ }
+ }
+
+ for(n = 0; n < aln·N; n++) {
+ sortpos(len, vals[n], locs[n]);
+ }
+
+ return 0;
+}
+
+static
+int
+lessarrs(int len, uint64 a[], uint64 b[])
+{
+ int l;
+
+ for(l = 0; l < len; l++) {
+ if (a[l] < b[l]) return 1;
+ if (a[l] > b[l]) return 0;
+ }
+
+ return 0;
+}
+
+static
+void
+swaparrs(int len, uint64 a[], uint64 b[])
+{
+ int l;
+ uint64 tmp;
+
+ for(l = 0; l < len; l++) {
+ tmp = a[l], a[l] = b[l], b[l] = tmp;
+ }
+}
+
+error
+aln·sort(uintptr n, int len, uint64 *vals)
+{
+#define LESS(i, j) (lessarrs(len, vals+((i)*len), vals+((j)*len)))
+#define SWAP(i, j) (swaparrs(len, vals+((i)*len), vals+((j)*len)))
+ QSORT(n, LESS, SWAP);
+#undef LESS
+#undef SWAP
+ return 0;
+}
diff --git a/src/libbio/fasta.c b/src/libbio/fasta.c
new file mode 100644
index 0000000..3788544
--- /dev/null
+++ b/src/libbio/fasta.c
@@ -0,0 +1,393 @@
+#include <u.h>
+#include <base.h>
+#include <libbio.h>
+
+#define INIT_NM_SIZE 128
+#define INIT_SQ_SIZE 4096
+
+struct SeqBuf
+{
+ mem·Allocator mem;
+ void *heap;
+
+ int cap, off;
+ byte *it, b[];
+};
+
+static
+void
+reset(struct SeqBuf *sb)
+{
+ sb->off = 0;
+ sb->it = sb->b;
+}
+
+static
+error
+grow(struct SeqBuf **sb, int min)
+{
+ void* heap;
+ mem·Allocator mem;
+
+ vlong newcap;
+ struct SeqBuf *old, *new;
+
+ old = *sb;
+ mem = old->mem;
+ heap = old->heap;
+
+ assert((*sb)->cap <= (SIZE_MAX - 1) / 2);
+ newcap = MAX(16, MAX(1 + 2*(*sb)->cap, (*sb)->cap+min));
+ assert(newcap >= (*sb)->cap+min);
+
+ if(new = mem.alloc(heap, 1, sizeof(*new)+newcap), !new) {
+ errorf("memory: could not allocate new buffer\n");
+ return 1;
+ }
+
+ memcpy(new, old, sizeof(*new) + (*sb)->cap);
+
+ new->cap = newcap;
+ new->it = new->b + (old->it - old->b);
+ mem.free(heap, old);
+
+ *sb = new;
+ return 0;
+}
+
+static
+error
+put(struct SeqBuf **sb, byte c)
+{
+ int err;
+ struct SeqBuf *sq;
+
+ sq = *sb;
+ if(sq->it < (sq->b + sq->cap)) {
+ *sq->it++ = c;
+ return 0;
+ }
+
+ if(err = grow(sb, 1), err) {
+ errorf("memory fail: could not allocate more buffer\n");
+ sq->mem.free(sq->heap, sq);
+ return 1;
+ }
+
+ *((*sb)->it++) = c;
+ return 0;
+}
+
+static
+error
+push(struct SeqBuf **sb, int n, void *buf)
+{
+ int d, err;
+ struct SeqBuf *seq;
+
+ seq = *sb;
+ if(d = (seq->cap - (seq->it - seq->b)), d < n) {
+ assert(d >= 0);
+ if (err = grow(sb, n-d), err) {
+ errorf("memory fail: could not allocate more buffer\n");
+ seq->mem.free(seq->heap, seq);
+ return 1;
+ }
+ }
+ seq = *sb;
+
+ memcpy(seq->it, buf, n);
+ seq->it += n;
+
+ return 0;
+}
+
+// -----------------------------------------------------------------------
+// sequence data
+
+struct bio·SeqReader {
+ byte eof;
+ io·Reader rdr;
+ void *io;
+
+ struct SeqBuf *seq;
+
+ /* read buffer */
+ byte *b, *bend, buf[4*4098];
+};
+
+static
+error
+fill(bio·SeqReader *rdr)
+{
+ int n;
+ // NOTE: This could lead to an infinite loop.
+ if(rdr->eof)
+ return 0;
+
+ n = rdr->rdr.read(rdr->io, 1, arrlen(rdr->buf), rdr->buf);
+ if(n < 0) {
+ errorf("read: no data obtained from reader\n");
+ return 1;
+ }
+ rdr->b = rdr->buf;
+ rdr->bend = rdr->b + n;
+ if(rdr->eof = (n < arrlen(rdr->buf)), rdr->eof)
+ *rdr->bend++ = '\0';
+
+ return 0;
+}
+
+bio·SeqReader*
+bio·openseq(io·Reader rdr, void *io, mem·Allocator mem, void *heap)
+{
+ error err;
+ bio·SeqReader *r;
+
+ r = mem.alloc(heap, 1, sizeof(bio·SeqReader));
+ r->rdr = rdr;
+ r->io = io;
+ r->eof = 0;
+
+ r->seq = mem.alloc(heap, 1, sizeof(*r->seq) + INIT_NM_SIZE + INIT_SQ_SIZE);
+ r->seq->mem = mem;
+ r->seq->heap = heap;
+ r->seq->it = r->seq->b;
+ r->seq->cap = INIT_NM_SIZE + INIT_SQ_SIZE;
+
+ if (err=fill(r), err) {
+ errorf("fill: could not populate buffer\n");
+ goto ERROR;
+ }
+
+ return r;
+
+ERROR:
+ mem.free(heap, r->seq);
+ mem.free(heap, r);
+ return nil;
+}
+
+error
+bio·closeseq(bio·SeqReader *rdr)
+{
+ mem·Allocator mem;
+ void *heap;
+
+ mem = rdr->seq->mem;
+ heap = rdr->seq->heap;
+
+ mem.free(heap, rdr->seq);
+ mem.free(heap, rdr);
+
+ return 0;
+}
+
+
+static
+error
+readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop)
+{
+ error err;
+ byte *beg;
+
+ if(rdr->eof && rdr->b == rdr->bend-1)
+ return EOF;
+
+ reset(rdr->seq);
+
+ // NOTE: Can this case happen?
+ assert(rdr->b != rdr->bend);
+ if(*rdr->b++ != hdr) {
+ errorf("fasta/q format: expected '%c', found '%c'\n", hdr, *rdr->b--);
+ return 1;
+ }
+
+NAME:
+ beg = rdr->b;
+ while(rdr->b != rdr->bend) {
+ if(*rdr->b++ == '\n') {
+ push(&rdr->seq, (rdr->b - 1) - beg, beg);
+ goto SEQ;
+ }
+ }
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if(err=fill(rdr), err) {
+ errorf("read: could not populate buffer\n");
+ return 1;
+ }
+ goto NAME;
+
+SEQ:
+ put(&rdr->seq, '\0');
+ rdr->seq->off = rdr->seq->it - rdr->seq->b;
+
+SEQLOOP:
+ beg = rdr->b;
+ while(rdr->b != rdr->bend) {
+ if(*rdr->b == '\n') {
+ push(&rdr->seq, rdr->b - beg, beg);
+ beg = rdr->b + 1;
+ }
+
+ if(*rdr->b == stop || *rdr->b == '\0')
+ goto SUCCESS;
+
+ rdr->b++;
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if(err=fill(rdr), err) {
+ errorf("read: could not populate buffer\n");
+ return 1;
+ }
+ goto SEQLOOP;
+
+SUCCESS:
+ push(&rdr->seq, rdr->b - beg, beg);
+ put(&rdr->seq, '\0');
+
+ return 0;
+}
+
+/*
+ * fasta files
+ */
+
+error
+bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq)
+{
+ error err;
+
+ err = readfasta(rdr, seq, '>', '>');
+ if(err && err != EOF) {
+ errorf("parse fail: could not read sequence of record\n");
+ return err;
+ }
+
+ seq->name = rdr->seq->b;
+ seq->s = rdr->seq->b + rdr->seq->off;
+ seq->len = rdr->seq->it - seq->s - 1; // shift by 1 as we pushed a '0' to end
+ seq->q = nil;
+
+ return err;
+}
+
+/*
+ * fastq files
+ */
+
+error
+bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq)
+{
+ int n;
+ byte *beg;
+ error err;
+
+ err = readfasta(rdr, seq, '@', '+');
+ if(err) {
+ errorf("parse fail: could not read sequence of record\n");
+ return err;
+ }
+
+ seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off);
+
+ if(*rdr->b++ != '+') {
+ errorf("format error: no '+' character seperator found\n");
+ return -1;
+ }
+
+EATLN:
+ while(rdr->b != rdr->bend) {
+ if (*rdr->b++ == '\n') {
+ n = 0;
+ goto QUAL;
+ }
+ }
+
+ if(err = fill((bio·SeqReader*)rdr), err) {
+ errorf("read: could not populate buffer\n");
+ return 1;
+ }
+ goto EATLN;
+
+QUAL:
+ beg = rdr->b;
+ while(rdr->b != rdr->bend) {
+ if(*rdr->b == '\n') {
+ push(&rdr->seq, rdr->b - beg, beg);
+ beg = rdr->b + 1;
+ }
+
+ if(n++ == seq->len || *rdr->b == '\0') {
+ err = *rdr->b == '\0' ? EOF : 0;
+ goto SUCCESS;
+ }
+
+ rdr->b++;
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if(err = fill((bio·SeqReader*)rdr), err) {
+ errorf("read: could not populate buffer\n");
+ return 1;
+ }
+ goto QUAL;
+
+
+SUCCESS:
+ push(&rdr->seq, rdr->b - beg, beg);
+ put(&rdr->seq, '\0');
+
+ seq->name = rdr->seq->b;
+ seq->s = rdr->seq->b + rdr->seq->off - 1;
+ seq->q = seq->s + seq->len + 1;
+
+ return err;
+}
+
+// -----------------------------------------------------------------------
+// sequence writing
+
+error
+bio·writefasta(io·Writer io, void *wtr, bio·Seq seq)
+{
+ int i, j, d;
+ char buf[2048], *b = buf, *e = arrend(buf);
+
+ *b++ = '>';
+ while(*seq.name) {
+ *b++ = *seq.name++;
+ if(b == e) {
+ io.write(wtr, 1, arrlen(buf), buf);
+ b = buf;
+ }
+ }
+
+ for(i=0; i<seq.len; i = j) {
+ j = MIN(i+70, seq.len);
+ d = j - i;
+ if((e-b) <= d+1) {
+ io.write(wtr, 1, b-buf, buf);
+ b = buf;
+ }
+ *b++ = '\n';
+ memcpy(b, seq.s+i, d);
+ b += d;
+ }
+
+ *b++ = '\n';
+ io.write(wtr, 1, b-buf, buf);
+
+ return 0;
+}
+
+error
+bio·writefastq(io·Writer io, void *wtr, bio·Seq seq)
+{
+ panicf("need to implement");
+ return 1;
+}
diff --git a/src/libbio/newick.c b/src/libbio/newick.c
new file mode 100644
index 0000000..5e6d30a
--- /dev/null
+++ b/src/libbio/newick.c
@@ -0,0 +1,414 @@
+#include <u.h>
+#include <base.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// Tokens
+
+enum TokenKind
+{
+ tok·nil,
+ tok·eof,
+ tok·space,
+ tok·ident,
+ tok·number,
+ tok·lparen,
+ tok·rparen,
+ tok·lbrak,
+ tok·rbrak,
+ tok·comma,
+ tok·semi,
+ tok·colon,
+
+ NUM_TOKENS,
+};
+
+
+struct Token {
+ enum TokenKind kind;
+ union
+ {
+ byte *s;
+ double x;
+ } lit;
+};
+
+static
+byte*
+tokstr(struct Token tok)
+{
+ static byte b[50];
+ switch (tok.kind) {
+ case tok·nil: return "";
+ case tok·eof: return nil;
+ case tok·space: return " ";
+ case tok·ident: return tok.lit.s;
+ case tok·lparen: return "(";
+ case tok·rparen: return ")";
+ case tok·lbrak: return "[";
+ case tok·rbrak: return "]";
+ case tok·comma: return ",";
+ case tok·semi: return ";";
+ case tok·colon: return ":";
+ case tok·number:
+ snprintf(b, arrlen(b), "%f", tok.lit.x);
+ return b;
+ default:
+ return nil;
+ }
+}
+
+
+// -----------------------------------------------------------------------
+// Read
+
+// TODO: Bounds checking on buffer
+static
+struct Token
+lex(io·Peeker s, void* fp)
+{
+#define isvalidchar(C) ((C) == '!') || \
+ ('\"' < (C) && (C) < '\'') || \
+ (')' < (C) && (C) < '+') || \
+ (',' < (C) && (C) < ':') || \
+ (':' < (C) && (C) < '[') || \
+ ((C) == '\\') || \
+ (']' < (C) && (C) <= '~')
+ byte *c;
+ struct Token tok;
+ static byte b[1024];
+ c = b;
+ *c = s.get(fp);
+
+ if (isspace(*c)) {
+ while (isspace(*c)) {
+ *(++c) = s.get(fp);
+ }
+
+ s.unget(fp, *c);
+ assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·space;
+ tok.lit.s = b;
+ return tok;
+ }
+
+ switch (*c) {
+ case EOF: tok.kind = tok·eof; return tok;
+ case '(': tok.kind = tok·lparen; return tok;
+ case ')': tok.kind = tok·rparen; return tok;
+ case '[': tok.kind = tok·lbrak; return tok;
+ case ']': tok.kind = tok·rbrak; return tok;
+ case ',': tok.kind = tok·comma; return tok;
+ case ';': tok.kind = tok·semi; return tok;
+ case ':': tok.kind = tok·colon; return tok;
+
+ case '.':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ while (isdigit(*c)) {
+ NUM: *(++c) = s.get(fp);
+ }
+ if (*c == '.') goto NUM;
+ if (isvalidchar(*c)) goto IDENT;
+
+ s.unget(fp, *c);
+ assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·number;
+ tok.lit.x = atof(b);
+ return tok;
+
+ case '\"':
+ while ((*c) != '\"') {
+ *(++c) = s.get(fp);
+ }
+ assert(c - b < 1024);
+
+ *c = '\0';
+ tok.kind = tok·ident;
+ tok.lit.s = b + 1;
+ return tok;
+
+ default:
+ IDENT:
+ while (isvalidchar(*c)) {
+ *(++c) = s.get(fp);
+ }
+ s.unget(fp, *c);
+ assert(c - b < 1024);
+
+ *c = '\0';
+ tok.kind = tok·ident;
+ tok.lit.s = b;
+ return tok;
+ }
+#undef isvalidchar
+}
+
+static
+struct Token
+lex_nospace(io·Peeker s, void *impl)
+{
+ struct Token tok;
+ tok = lex(s, impl);
+ if (tok.kind == tok·space) {
+ tok = lex_nospace(s, impl);
+ }
+
+ return tok;
+}
+
+struct Parser
+{
+ int lev;
+ bio·Node *root;
+ struct Token tok;
+
+ void *io;
+ io·Peeker file;
+ void *heap;
+ mem·Allocator mem;
+};
+
+static
+error
+parse(struct Parser *p)
+{
+ error err;
+ bio·Node *node;
+ bio·Node *root;
+ struct Token tok;
+
+ node = p->root;
+ for (;;) {
+ tok = lex_nospace(p->file, p->io);
+
+ switch (tok.kind) {
+ case tok·lparen:
+ if (!p->root && p->lev > 0) {
+ errorf("parse format: attempted to make root at non-zero level");
+ goto ERROR;
+ }
+
+ node = p->mem.alloc(p->heap, 1, sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ if (p->root) {
+ phylo·addchild(p->root, node);
+ root = p->root;
+ } else {
+ root = node;
+ }
+
+ p->lev++;
+ p->root = node;
+ p->tok = tok;
+ err = parse(p);
+ if (err) {
+ goto ERROR;
+ }
+ if (p->tok.kind != tok·rparen) {
+ errorf("incorrect format: closing parentheses expected to proceed opening");
+ goto ERROR;
+ }
+ p->root = root;
+ // NOTE(nnoll): We don't want to override the state of p->tok here!
+ // Jump straight to grabbing next token.
+ continue;
+
+ case tok·rparen:
+ p->lev--;
+ goto DONE;
+
+ /* Comments */
+ case tok·lbrak:
+ if (!node) {
+ errorf("incorrect format: comment found in disallowed region");
+ goto ERROR;
+ }
+ node->comment = str·make("");
+ while (tok.kind != tok·rbrak) {
+ tok = lex_nospace(p->file, p->io);
+ if (tok.kind == tok·eof || tok.kind == tok·nil) {
+ errorf("incorrect format: unmatched comment bracket '['");
+ goto ERROR;
+ }
+ str·append(&node->comment, tokstr(tok));
+ }
+ break;
+
+ case tok·rbrak:
+ errorf("incorrect format: end comment token found in disallowed region");
+ goto ERROR;
+ break;
+
+ case tok·colon:
+ tok = lex_nospace(p->file, p->io);
+ if (tok.kind != tok·number) {
+ errorf("incorrect format: expected number after colon");
+ goto ERROR;
+ }
+ if (node == nil) {
+ errorf("parse error: attempting to set distance of nil node");
+ goto ERROR;
+ }
+ node->dist = tok.lit.x;
+ break;
+
+ case tok·comma:
+ node = nil;
+ break;
+
+ case tok·ident:
+ if (p->tok.kind == tok·rparen) {
+ if (!node) {
+ errorf("parse error: attempting to set name of nil node");
+ goto ERROR;
+ }
+ node->name = str·make(tok.lit.s);
+ } else {
+ if (p->tok.kind != tok·lparen && p->tok.kind != tok·comma) {
+ errorf("format error: misplaced identifier for leaf found");
+ goto ERROR;
+ }
+
+ if (!p->root) {
+ errorf("parse error: attempting to create child for no parent");
+ goto ERROR;
+ }
+
+ node = p->mem.alloc(p->heap, 1, sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->name = str·make(tok.lit.s);
+
+ phylo·addchild(p->root, node);
+ }
+ break;
+
+ case tok·number:
+ if (p->tok.kind == tok·rparen) {
+ if (p->lev == 0) {
+ errorf("format error: support value on root not supported");
+ goto ERROR;
+ }
+ node->support = tok.lit.x;
+ } else {
+ errorf("format error: found number in unexpected location");
+ goto ERROR;
+ }
+ break;
+
+ case tok·semi:
+ p->file.unget(p->io, ';');
+ if (p->lev) {
+ errorf("format error: uneven number of parentheses found at ';'");
+ goto ERROR;
+ }
+ goto DONE;
+
+ case tok·eof:
+ goto DONE;
+
+ default:
+ errorf("parse error: unrecognized token");
+ goto ERROR;
+ }
+
+ p->tok = tok;
+ }
+
+DONE:
+ p->tok = tok;
+ return 0;
+ERROR:
+ // TODO(nnoll): cleanup
+ return 1;
+}
+
+int
+bio·readnewick(io·Peeker stream, void *s, bio·Tree *tree)
+{
+ error err;
+ struct Parser p;
+
+ if (!tree) {
+ errorf("tree pointer nil");
+ return 0;
+ }
+
+ p = (struct Parser){
+ .lev = 0,
+ .root = nil,
+ .tok = (struct Token){ 0 },
+ .io = s,
+ .file = stream,
+ .mem = tree->mem,
+ .heap = tree->heap,
+ };
+ err = parse(&p);
+ if (err) {
+ errorf("parsing failed\n");
+ return 0;
+ }
+
+ tree->root = p.root;
+ tree->nleaf = 0;
+ tree->root->nnode = 0;
+
+ phylo·countleafs(tree->root, &tree->nleaf);
+ phylo·countnodes(tree->root, &tree->root->nnode);
+
+ return 1;
+}
+
+// -----------------------------------------------------------------------
+// Write
+
+static
+error
+dump(bio·Node *node, void *impl, io·Putter out)
+{
+ byte b[24];
+
+ if (!node) {
+ return 1;
+ }
+
+ bio·Node *child;
+ if (node->nchild) {
+ out.put(impl, '(');
+
+ dump(node->child, impl, out);
+ for (child = node->child->sibling; child != nil; child = child->sibling) {
+ out.put(impl, ',');
+ dump(child, impl, out);
+ }
+
+ out.put(impl, ')');
+ }
+ if (node->name) {
+ out.puts(impl, node->name);
+ }
+
+ if (node->parent) {
+ out.put(impl, ':');
+ snprintf(b, arrlen(b), "%f", node->dist);
+ out.puts(impl, b);
+ }
+
+ return 0;
+}
+
+error
+bio·writenewick(bio·Tree tree, io·Putter out, void* impl)
+{
+ dump(tree.root, impl, out);
+ out.put(impl, ';');
+ out.put(impl, '\n');
+
+ return 0;
+}
diff --git a/src/libbio/phylo.c b/src/libbio/phylo.c
new file mode 100644
index 0000000..d50934f
--- /dev/null
+++ b/src/libbio/phylo.c
@@ -0,0 +1,427 @@
+#include <u.h>
+#include <base.h>
+#include <base/macro/qsort.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// subtree manipulation methods
+// NOTE: As of now these don't update nnode & nleaf stats.
+// It is the caller's responsibility to refresh counts.
+
+error
+phylo·addchild(bio·Node* parent, bio·Node* child)
+{
+ bio·Node *it, *sibling;
+ if (!parent->nchild) {
+ parent->child = child;
+ goto SUCCESS;
+ }
+
+ for (it = parent->child, sibling = it; it != nil; it = it->sibling) {
+ sibling = it;
+ }
+ sibling->sibling = child;
+
+SUCCESS:
+ child->parent = parent;
+ parent->nchild++;
+ return 0;
+}
+
+error
+phylo·rmchild(bio·Node *parent, bio·Node *child)
+{
+ bio·Node *it, *prev;
+ enum {
+ error·nil,
+ error·notfound,
+ error·nochildren,
+ };
+
+ prev = nil;
+ for (it = parent->child; it != nil; it = it->sibling) {
+ if (it == child) goto FOUND;
+ prev = it;
+ }
+ return error·notfound;
+
+FOUND:
+ if (prev == nil) {
+ parent->child = child->sibling;
+ } else {
+ prev->sibling = child->sibling;
+ }
+
+ parent->nchild--;
+ return error·nil;
+}
+
+// -----------------------------------------------------------------------
+// subtree statistics
+
+error
+phylo·countnodes(bio·Node *node, int *n)
+{
+ int m;
+ error err;
+ bio·Node *child;
+
+ m = *n;
+ for (child = node->child; child != nil; child = child->sibling) {
+ if (err = phylo·countnodes(child, n), err) {
+ errorf("node count: failure at '%s'", child->name);
+ return 1;
+ }
+ }
+ node->nnode = *n - m;
+ *n += 1;
+
+ return 0;
+}
+
+error
+phylo·countleafs(bio·Node *node, int *n)
+{
+ error err;
+ bio·Node *child;
+
+ if (!node->nchild) {
+ *n += 1;
+ }
+
+ for (child = node->child; child != nil; child = child->sibling) {
+ if (err = phylo·countleafs(child, n), err) {
+ errorf("leaf count: failure at '%s'", child->name);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+// -----------------------------------------------------------------------
+// generic operations on tree
+
+void*
+phylo·postorder(bio·Node *clade, void *(*op)(bio·Node*, void*), void *ctx)
+{
+ bio·Node *it;
+
+ for(it = clade->child; it != nil; it = it->sibling) {
+ ctx = phylo·postorder(it, op, ctx);
+ }
+
+ return op(clade, ctx);
+}
+
+void*
+phylo·preorder(bio·Node *clade, void *(*op)(bio·Node*, void*), void *ctx)
+{
+ bio·Node *it;
+
+ ctx = op(clade, ctx);
+ for(it = clade->child; it != nil; it = it->sibling) {
+ ctx = phylo·preorder(it, op, ctx);
+ }
+
+ return ctx;
+}
+
+int
+phylo·collectpostorder(bio·Node *clade, bio·Node **list)
+{
+ bio·Node *it;
+ int n;
+
+ for(n = 0, it = clade->child; it != nil; it = it->sibling) {
+ n += phylo·collectpostorder(it, list+n);
+ }
+
+ return n;
+}
+
+static
+inline
+void*
+appendleaf(bio·Node *node, void* list)
+{
+ bio·Node **leafs;
+
+ leafs = list;
+ if (!node->nchild) {
+ *leafs++ = node;
+ }
+
+ return leafs;
+}
+
+void
+phylo·getleafs(bio·Tree tree, bio·Node **leafs)
+{
+ phylo·postorder(tree.root, &appendleaf, leafs);
+}
+
+// -----------------------------------------------------------------------
+// tree editing
+
+static
+void
+sortnodelist(bio·Node **head, bio·Node *next)
+{
+ bio·Node tmp, *it;
+
+ it = &tmp;
+ tmp.sibling = *head;
+
+ while (it->sibling != nil && it->sibling->nnode < next->nnode) {
+ it = it->sibling;
+ }
+
+ next->sibling = it->sibling;
+ it->sibling = next;
+ *head = tmp.sibling;
+}
+
+error
+phylo·ladderize(bio·Node *root)
+{
+ int i;
+ error err;
+ bio·Node *child, *sorted, *sibling;
+
+ if (!root->nchild) return 0;
+
+ // ladderize below
+ for (child = root->child; child != nil; child = child->sibling) {
+ if (err = phylo·ladderize(child), err) {
+ errorf("ladderize: failure at '%s'", child->name);
+ return 1;
+ }
+ }
+
+ // ladderize yourself
+ sorted = nil;
+ child = root->child;
+ while (child != nil) {
+ sibling = child->sibling;
+ sortnodelist(&sorted, child);
+ child = sibling;
+ }
+ root->child = sorted;
+
+ return 0;
+}
+
+/*
+ * compute all distances from a given node
+ * must provide a working buffer
+ */
+
+struct Tuple
+{
+ double *d;
+ bio·Node **n;
+};
+
+static
+struct Tuple
+getdistsfrom(bio·Node *node, bio·Node *prev, double curr, double *dist, bio·Node **list)
+{
+ bio·Node *it;
+ struct Tuple ret;
+
+ *dist++ = curr;
+ *list++ = node;
+
+ ret.d = dist;
+ ret.n = list;
+
+ if (node->parent && node->parent != prev) {
+ ret = getdistsfrom(node->parent, node, curr + node->dist, dist, list);
+
+ dist = ret.d;
+ list = ret.n;
+ }
+
+ for (it = node->child; it != nil; it = it->sibling) {
+ if (it != prev) {
+ ret = getdistsfrom(it, node, curr + it->dist, dist, list);
+
+ dist = ret.d;
+ list = ret.n;
+ }
+ }
+
+ return ret;
+}
+
+int
+phylo·getdistsfrom(bio·Node *node, int len, double *dist, bio·Node **list)
+{
+ struct Tuple ret;
+ // TODO: Better bounds checking.
+
+ ret = getdistsfrom(node, nil, 0.0, dist, list);
+
+ assert(ret.n - list == len);
+ assert(ret.d - dist == len);
+
+ return len;
+}
+
+/*
+static
+void
+disttoroot(bio·Node *clade, double anc, double *dists)
+{
+ double d;
+ bio·Node *it;
+
+ *dists++ = anc + clade->dist;
+ d = dists[-1];
+ for (it = clade->child; it != nil; it = it->sibling) {
+ disttoroot(it, d, ++dists);
+ }
+}
+
+void
+phylo·disttoroot(bio·Tree tree, double *dists)
+{
+ disttoroot(tree.root, 0.0, dists);
+}
+*/
+
+/*
+ * compute the path constituting the tree diameter
+ * returns the number of edges in the path
+ */
+
+static
+void
+sort·nodedists(uintptr len, double fs[], bio·Node* ns[])
+{
+ double f;
+ bio·Node *n;
+#define LESS(i, j) (fs[i] < fs[j])
+#define SWAP(i, j) (n = ns[i], f = fs[i], \
+ fs[i] = fs[j], ns[i] = ns[j], \
+ fs[j] = f, ns[j] = n)
+ QSORT(len, LESS, SWAP);
+#undef LESS
+#undef SWAP
+}
+
+#define BUFLEN 4096
+double
+phylo·diameter(bio·Tree tree, int *len, bio·Node **path)
+{
+ // TODO: deal with large tree > BUFLEN gracefully
+ int n;
+ double fbuf[BUFLEN];
+ bio·Node *nbuf[BUFLEN];
+
+ n = tree.root->nnode;
+
+ assert(n < BUFLEN);
+
+ n = phylo·getdistsfrom(tree.root, tree.root->nnode, fbuf, nbuf);
+ sort·nodedists(n, fbuf, nbuf);
+
+ path[0] = nbuf[n-1];
+ printf("first end '%s'\n", path[0]->name);
+
+ n = phylo·getdistsfrom(path[0], n, fbuf, nbuf);
+ sort·nodedists(n, fbuf, nbuf);
+ printf("second end '%s'\n", nbuf[n-1]->name);
+
+ *len = 0;
+
+ // TODO: Traverse up the tree from each node
+ // Find MRCA by intersection of nodes hit
+
+ return 0.0;
+}
+#undef BUFLEN
+
+/*
+ * reroot a tree on a new node
+ */
+static
+error
+rotateparent(bio·Node *node, bio·Node *to)
+{
+ error err;
+
+ // NOTE: will this ever be taken?
+ if (node->parent == to) {
+ return 0;
+ }
+
+ if (!node->parent) {
+ goto RMCHILD;
+ }
+
+ err = rotateparent(node->parent, node);
+ if (err) {
+ errorf("failure: broken tree");
+ return err;
+ }
+
+ err = phylo·addchild(node, node->parent);
+ if (err) {
+ errorf("inconsistent topology: could not add parent '%s' as child of '%s'", node->parent->name, node->name);
+ return err;
+ }
+
+RMCHILD:
+ err = phylo·rmchild(node, to);
+ if (err) {
+ errorf("inconsistent topology: could not remove child '%s' from '%s'", to->name, node->name);
+ return err;
+ }
+
+ node->parent = to;
+ return 0;
+}
+
+#define PREC .00000001
+error
+phylo·reroot(bio·Tree *tree, bio·Node *node, double d)
+{
+ bio·Node *new;
+
+ // TODO: should check that node is part of this tree?
+ // TODO: should we check if node->parent != nil?
+
+ if (fabs(d) < PREC) {
+ new = node;
+ rotateparent(node->parent, node);
+ } else if (fabs(d-node->dist) < PREC) {
+ new = node->parent;
+ if (new->parent->parent) {
+ rotateparent(new->parent->parent, new->parent);
+ }
+ } else {
+ new = tree->mem.alloc(tree->heap, 1, sizeof(*new));
+ memset(new, 0, sizeof(*new));
+
+ phylo·addchild(new, node);
+ node->parent = new;
+
+ phylo·addchild(new, node->parent);
+ if (node->parent->parent) {
+ rotateparent(node->parent->parent, node->parent);
+ }
+ node->parent->parent = new;
+ }
+
+ printf("number of children on old root: %d\n", tree->root->nchild);
+ tree->root = new;
+ tree->nleaf = 0;
+
+ phylo·countleafs(new, &tree->nleaf);
+ phylo·countnodes(new, &new->nnode);
+
+ return 0;
+}
+#undef PREC
diff --git a/src/libbio/rules.mk b/src/libbio/rules.mk
new file mode 100644
index 0000000..07ce97e
--- /dev/null
+++ b/src/libbio/rules.mk
@@ -0,0 +1,24 @@
+include share/push.mk
+
+# Local sources
+SRCS_$(d) := \
+ $(d)/fasta.c \
+ $(d)/newick.c \
+ $(d)/phylo.c
+LIBS_$(d) := $(d)/libbio.a
+BINS_$(d) :=
+# CHECK_$(d) := \
+# $(d)/test.c \
+# $(d)/simulate.c
+
+include share/paths.mk
+
+# Local rules
+$(LIBS_$(d)): $(OBJS_$(d)) $(OBJS_$(d)/io)
+ $(ARCHIVE)
+
+$(TEST_$(d)): TCLIBS = $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a
+$(TEST_$(d)): $(UNIT_$(d)) $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a
+ $(LINK)
+
+include share/pop.mk
diff --git a/src/libbio/simulate.c b/src/libbio/simulate.c
new file mode 100644
index 0000000..0f5a97e
--- /dev/null
+++ b/src/libbio/simulate.c
@@ -0,0 +1,120 @@
+#include <u.h>
+#include <libn.h>
+#include <libbio.h>
+
+#define SEQLEN 2560
+static byte *SEQ =
+"GGCGGCTTCGGTGCGCTGTGTGCATTGCCGCAAAAATATCGTGAACCCGTGCTGGTTTCCGGCACTGACGGCGTAGGTAC"
+"CAAGCTGCGTCTGGCAATGGACTTAAAACGTCACGACACCATTGGTATTGATCTGGTCGCCATGTGCGTTAATGACCTGG"
+"TGGTGCAAGGTGCGGAACCGCTGTTTTTCCTCGACTATTACGCAACCGGAAAACTGGATGTTGATACCGCTTCAGCGGTG"
+"ATCAGCGGCATTGCGGAAGGTTGTCTGCAATCGGGCTGTTCTCTGGTGGGTGGCGAAACGGCAGAAATGCCGGGGATGTA"
+"TCACGGTGAAGATTACGATGTCGCGGGTTTCTGCGTGGGCGTGGTAGAAAAATCAGAAATCATCGACGGCTCTAAAGTCA"
+"GCGACGGCGATGTGCTGATTGCACTCGGTTCCAGCGGTCCGCACTCGAACGGTTATTCGCTGGTGCGCAAAATTCTTGAA"
+"GTCAGCGGTTGTGATCCGCAAACCACCGAACTTGATGGTAAGCCATTAGCCGATCATCTGCTGGCACCGACCCGCATTTA"
+"CGTGAAGTCAGTGCTGGAGTTGATTGAAAAGGTCGATGTGCATGCCATTGCGCACCTGACCGGCGGCGGCTTCTGGGAAA"
+"ACATTCCGCGCGTATTGCCAGATAATACCCAGGCAGTGATTGATGAATCTTCCTGGCAGTGGCCGGAAGTGTTCAACTGG"
+"CTGCAAACGGCAGGTAACGTTGAGCGCCATGAAATGTATCGCACCTTCAACTGCGGCGTCGGGATGATTATCGCCCTGCC"
+"TGCTCCGGAAGTGGACAAAGCCCTCGCCCTGCTCAATGCCAACGGTGAAAACGCGTGGAAAATCGGTATCATCAAAGCCT"
+"CTGATTCCGAACAACGCGTGGTTATCGAATAATGAATATTGTGGTGCTTATTTCCGGCAACGGAAGTAATTTACAGGCAA"
+"TTATTGACGCCTGTAAAACCAACAAAATTAAAGGCACCGTACGGGCAGTTTTCAGCAATAAGGCCGACGCGTTCGGCCTT"
+"GAACGCGCCCGCCAGGCGGGTATTGCAACGCATACGCTCATCGCCAGCGCGTTTGACAGTCGTGAAGCCTATGACCGGGA"
+"GTTGATTCATGAAATCGACATGTACGCACCCGATGTGGTCGTGCTGGCTGGTTTTATGCGCATTCTCAGCCCGGCGTTTG"
+"TCTCCCACTATGCCGGGCGTTTGCTGAACATTCACCCTTCTCTGCTGCCGAAATATCCCGGATTACACACCCATCGTCAA"
+"GCGCTGGAAAATGGCGATGAAGAGCACGGTACATCGGTGCATTTCGTCACCGATGAACTGGACGGTGGCCCGGTTATTTT"
+"ACAGGCGAAAGTCCCGGTATTTGCTGGTGATACGGAAGATGACGTCACCGCCCGCGTGCAAACCCAGGAACACGCCATTT"
+"ATCCACTGGTGATTAGCTGGTTTGCCGATGGTCGTCTGAAAATGCACGAAAACGCCGCGTGGCTGGATGGTCAACGTCTG"
+"CCGCCGCAGGGCTACGCTGCCGACGAGTAATGCCCCCGTAGTTAAAGCGCCAGCTCTGCCGCTGGCGTTTTTCAATTCAC"
+"CTGTAAATCGCAAGCTCCAGCAGTTTTTTTCCCCCTTTTCTGGCATAGTTGGACATCTGCCAATATTGCTCGCCATAATA"
+"TCCAGGCAGTGTCCCGTGAATAAAACGGAGTAAAAGTGGTAATGGGTCAGGAAAAGCTATACATCGAAAAAGAGCTCAGT"
+"TGGTTATCGTTCAATGAACGCGTGCTTCAGGAAGCGGCGGACAAATCTAACCCGCTGATTGAAAGGATGCGTTTCCTGGG"
+"GATCTATTCCAATAACCTTGATGAGTTCTATAAAGTCCGCTTCGCTGAACTGAAGCGACGCATCATTATTAGCGAAGAAC"
+"AAGGCTCCAACTCTCATTCCCGCCATTTACTGGGCAAAATTCAGTCCCGGGTGCTGAAAGCCGATCAGGAATTCGACGGC"
+"CTCTACAACGAGCTATTGCTGGAGATGGCGCGCAACCAGATCTTCCTGATTAATGAACGCCAGCTCTCCGTCAATCAACA"
+"AAACTGGCTGCGTCATTATTTTAAGCAGTATCTGCGTCAGCACATTACGCCGATTTTAATCAATCCTGACACTGACTTAG"
+"TGCAGTTCCTGAAAGATGATTACACCTATCTGGCGGTGGAAATTATCCGTGGCGATACCATCCGTTACGCGCTTCTGGAG"
+"ATCCCATCAGATAAAGTGCCGCGCTTTGTGAATTTACCGCCAGAAGCGCCGCGTCGACGCAAGCCGATGATTCTTCTGGA"
+"TAACATTCTGCGTTACTGCCTTGATGATATTTTCAAAGGCTTCTTTGATTATGACGCGCTGAATGCCTATTCAATGAAGA"
+"TGACCCGCGATGCCGAATACGATTTAGTGCATGAGATGGAAGCCAGCCTGATGGAGTTGATGTCTTCCAGTCTCAAGCAG"
+"CGTTTAACTGCTGAGCCGGTGCGTTTTGTTTATCAGCGCGATATGCCCAATGCGCTGGTTGAAGTTTTACGCGAAAAACT";
+
+byte*
+modify(byte *seq, int *len, double p)
+{
+ byte *head, *new;
+
+ head = calloc(SEQLEN+1, sizeof(byte));
+ new = head;
+ for (; *seq != '\0'; seq++) {
+ if (rng·bernoulli(p)) {
+ switch (rng·randi(5)) {
+ case 0: *new++ = 'A'; break;
+ case 1: *new++ = 'C'; break;
+ case 2: *new++ = 'G'; break;
+ case 3: *new++ = 'T'; break;
+ case 4: continue;
+ }
+ } else {
+ *new++ = *seq;
+ }
+ }
+ *new = '\0';
+ *len = new - head;
+ return head;
+}
+
+#define NSEQS 20
+int
+main()
+{
+ int n, i, l, lens[NSEQS];
+ byte *seqs[NSEQS];
+
+ int locs[aln·N][NSEQS][aln·L];
+ int *loc[aln·N];
+ uint64 vals[aln·N][NSEQS][aln·L];
+ uint64 *val[aln·N];
+
+ rng·init(0);
+
+ seqs[0] = SEQ;
+ lens[0] = SEQLEN;
+
+ for (n = 0; n < aln·N; n++) {
+ for (i = 0; i < NSEQS; i++) {
+ for (l = 0; l < aln·L; l++) {
+ vals[n][i][l] = 0;
+ }
+ }
+ }
+
+ for (i = 1; i < NSEQS; i++) {
+ seqs[i] = modify(SEQ, lens + i, .01*i);
+ }
+
+ for (i = 0; i < NSEQS; i++) {
+ for (n = 0; n < aln·N; n++) {
+ val[n] = vals[n][i];
+ loc[n] = locs[n][i];
+ }
+ aln·sketch(seqs[i], aln·L, val, loc);
+ }
+
+ // for (n = 0; n < aln·N; n++) {
+ // printf("iteration %d\n", n);
+ // printf("[\n");
+ // for (i = 0; i < NSEQS; i++) {
+ // printf(" [");
+ // for (l = 0; l < aln·L; l++) {
+ // printf("%lu,", vals[n][i][l]);
+ // }
+ // printf("],\n");
+ // }
+ // printf("]\n");
+ // }
+
+ for (n = 0; n < aln·N; n++) {
+ aln·sort(NSEQS, aln·L, (uint64*)vals[n]);
+ }
+
+ return 0;
+}
diff --git a/src/libbio/test.c b/src/libbio/test.c
new file mode 100644
index 0000000..9926764
--- /dev/null
+++ b/src/libbio/test.c
@@ -0,0 +1,283 @@
+#include <u.h>
+#include <libn.h>
+#include <libbio.h>
+
+#include <time.h>
+
+// -----------------------------------------------------------------------
+// Global data
+
+static byte *SEQ[] = {
+"GGCGGCTTCGGTGCGCTGTGTGCATTGCCGCAAAAATATCGTGAACCCGTGCTGGTTTCCGGCACTGACGGCGTAGGTAC"
+"CAAGCTGCGTCTGGCAATGGACTTAAAACGTCACGACACCATTGGTATTGATCTGGTCGCCATGTGCGTTAATGACCTGG"
+"TGGTGCAAGGTGCGGAACCGCTGTTTTTCCTCGACTATTACGCAACCGGAAAACTGGATGTTGATACCGCTTCAGCGGTG"
+"ATCAGCGGCATTGCGGAAGGTTGTCTGCAATCGGGCTGTTCTCTGGTGGGTGGCGAAACGGCAGAAATGCCGGGGATGTA"
+"TCACGGTGAAGATTACGATGTCGCGGGTTTCTGCGTGGGCGTGGTAGAAAAATCAGAAATCATCGACGGCTCTAAAGTCA"
+"GCGACGGCGATGTGCTGATTGCACTCGGTTCCAGCGGTCCGCACTCGAACGGTTATTCGCTGGTGCGCAAAATTCTTGAA"
+"GTCAGCGGTTGTGATCCGCAAACCACCGAACTTGATGGTAAGCCATTAGCCGATCATCTGCTGGCACCGACCCGCATTTA"
+"CGTGAAGTCAGTGCTGGAGTTGATTGAAAAGGTCGATGTGCATGCCATTGCGCACCTGACCGGCGGCGGCTTCTGGGAAA"
+"ACATTCCGCGCGTATTGCCAGATAATACCCAGGCAGTGATTGATGAATCTTCCTGGCAGTGGCCGGAAGTGTTCAACTGG"
+"CTGCAAACGGCAGGTAACGTTGAGCGCCATGAAATGTATCGCACCTTCAACTGCGGCGTCGGGATGATTATCGCCCTGCC"
+"TGCTCCGGAAGTGGACAAAGCCCTCGCCCTGCTCAATGCCAACGGTGAAAACGCGTGGAAAATCGGTATCATCAAAGCCT"
+"CTGATTCCGAACAACGCGTGGTTATCGAATAATGAATATTGTGGTGCTTATTTCCGGCAACGGAAGTAATTTACAGGCAA"
+"TTATTGACGCCTGTAAAACCAACAAAATTAAAGGCACCGTACGGGCAGTTTTCAGCAATAAGGCCGACGCGTTCGGCCTT"
+"GAACGCGCCCGCCAGGCGGGTATTGCAACGCATACGCTCATCGCCAGCGCGTTTGACAGTCGTGAAGCCTATGACCGGGA"
+"GTTGATTCATGAAATCGACATGTACGCACCCGATGTGGTCGTGCTGGCTGGTTTTATGCGCATTCTCAGCCCGGCGTTTG"
+"TCTCCCACTATGCCGGGCGTTTGCTGAACATTCACCCTTCTCTGCTGCCGAAATATCCCGGATTACACACCCATCGTCAA"
+"GCGCTGGAAAATGGCGATGAAGAGCACGGTACATCGGTGCATTTCGTCACCGATGAACTGGACGGTGGCCCGGTTATTTT"
+"ACAGGCGAAAGTCCCGGTATTTGCTGGTGATACGGAAGATGACGTCACCGCCCGCGTGCAAACCCAGGAACACGCCATTT"
+"ATCCACTGGTGATTAGCTGGTTTGCCGATGGTCGTCTGAAAATGCACGAAAACGCCGCGTGGCTGGATGGTCAACGTCTG"
+"CCGCCGCAGGGCTACGCTGCCGACGAGTAATGCCCCCGTAGTTAAAGCGCCAGCTCTGCCGCTGGCGTTTTTCAATTCAC"
+"CTGTAAATCGCAAGCTCCAGCAGTTTTTTTCCCCCTTTTCTGGCATAGTTGGACATCTGCCAATATTGCTCGCCATAATA"
+"TCCAGGCAGTGTCCCGTGAATAAAACGGAGTAAAAGTGGTAATGGGTCAGGAAAAGCTATACATCGAAAAAGAGCTCAGT"
+"TGGTTATCGTTCAATGAACGCGTGCTTCAGGAAGCGGCGGACAAATCTAACCCGCTGATTGAAAGGATGCGTTTCCTGGG"
+"GATCTATTCCAATAACCTTGATGAGTTCTATAAAGTCCGCTTCGCTGAACTGAAGCGACGCATCATTATTAGCGAAGAAC"
+"AAGGCTCCAACTCTCATTCCCGCCATTTACTGGGCAAAATTCAGTCCCGGGTGCTGAAAGCCGATCAGGAATTCGACGGC"
+"CTCTACAACGAGCTATTGCTGGAGATGGCGCGCAACCAGATCTTCCTGATTAATGAACGCCAGCTCTCCGTCAATCAACA"
+"AAACTGGCTGCGTCATTATTTTAAGCAGTATCTGCGTCAGCACATTACGCCGATTTTAATCAATCCTGACACTGACTTAG"
+"TGCAGTTCCTGAAAGATGATTACACCTATCTGGCGGTGGAAATTATCCGTGGCGATACCATCCGTTACGCGCTTCTGGAG"
+"ATCCCATCAGATAAAGTGCCGCGCTTTGTGAATTTACCGCCAGAAGCGCCGCGTCGACGCAAGCCGATGATTCTTCTGGA"
+"TAACATTCTGCGTTACTGCCTTGATGATATTTTCAAAGGCTTCTTTGATTATGACGCGCTGAATGCCTATTCAATGAAGA"
+"TGACCCGCGATGCCGAATACGATTTAGTGCATGAGATGGAAGCCAGCCTGATGGAGTTGATGTCTTCCAGTCTCAAGCAG"
+"CGTTTAACTGCTGAGCCGGTGCGTTTTGTTTATCAGCGCGATATGCCCAATGCGCTGGTTGAAGTTTTACGCGAAAAACT",
+
+"GGCGGCTTCGGTGCGCTGTGTGCATTGCCGCAAAAATATCGTGAACCCGTGCTGGTTTCCGGCACTGACGGCGTAAATAC"
+"CAAGCTGCGTCTGGCAATGGACTTAAAACGTCACGACACCATTGGTATTGATCTGGTCGCCATGTGCGTTAATGACCTGG"
+"TGGTGCAAGGTGCGGAACCGCTGTTTTTCCTCGACTATTACGCACCGGAAAACTGGATGTTGATACCGCTTCAGCGGTG"
+"ATCAGCGGCATTGCGGAAGGTTGTCTGCAATCGGGCTGTTCTCTGGTGGGTGGCGAAACGGCAGAAATGCCGGGGATGTA"
+"TCACGGTGAAGATTACGATGTCGCGGGTTTCTGCGTGGGCGTGGTAGAAAAATCAGAAATCATCGACGGCAAAGTCA"
+"GCGACGGCGATGTGCTGATTGCACTCGGTTCCAGCGGTCCGCACTCGAACGGTTATTCGCTGGTGCGCAAAATTCTTGAA"
+"GTCAGCGGTTGTGATCCGCAAACCACCGAACTTGATGGTAAGCCATTAGCCGATCATCTGCTGGCACCGACCCGCATTTA"
+"ACATTCCGCGCGTATTGCCAGATAATACCCAGGCAGTGATTGATGAATCTTCCTGGCAGTGGCCGGAAGTGTTCAACTGG"
+"CTGCAAACGGCAGGTAACGTTGAGCGCCATGAAATGTATCGCACCTTCAACTGCGGCGTCGGGATGATTATCCCCTGCC"
+"TGCTCCGGAAGTGGACAAAGCCCTCGCCCTGCTCAATGCCAACGGTGAAAACGCGTGGAAAATCGGTATCATCAAAGCCT"
+"CTGATTCCGAACAACGCGTGGTTATCGAATAATGAATATTGTGTGCTTATTTCCGGCAACGGAAGTAATTTACAGGCAA"
+"TTATTGACGCCTGTAAAACCAACAAAATTAAAGGCACCGTACGGGCAGTTTTCAGCAATAAGGCCGACGCGCGGCCTT"
+"GAACGCGCCCGCCAGGCGGGTATTGCAACGCATACGCTCATCGCCAGCGCGTTTGACAGTCGTGAAGCCTATGACCGGGA"
+"GTTGATTCATGAAATCGACATGTACGCACCCGATGTGGTCGTGCTGGCTGGTTTTATGCGCATTCTCAGCCCGGCGTTTG"
+"TCTCCCACTATGCCGGGCGTTTGCTGAACATTCACCCTTCTCTGCTGCCGAAATATCCCGGATTACACACCCATCGTCAA"
+"GCGCTGGAAAATGGCGATGAAGAGCACGGTACATCGGGCATTTCGTCACCGATGAACTGGACGGTGGCCCGGTTATTTT"
+"ACAGTCGAAAGTCCCGGTATTTGCTGGTGATACGGAAGATGACGTCACCGCCCGCGTGCAAACCCAGGAACACGCCATTT"
+"ATCCTCTGGTGATTAGCTGGTTTGCCGATGGTCGTCTGAAAATGCACGAAAACGCCGCGTGGCTGGATGGTCAACGTCTG"
+"CCGCTGCAGGGCTACGCTGCCGACGAGTAATGCCCCCGTAGTTAAAGCGCCAGCTCTGCCGCTGGCGTTTTTCAATTCAC"
+"CTGTTAATCGCAAGCTCCAGCAGCCCCCCCCCCCCTTTTCTGCATAGTTGGACATCTGCCAATATTGCTCGCCATAATA"
+"TCCATGCAGTGTCCCGTGAATAAAACGGAGTAAAAGTGGTAATGGGTCAGGAAAAGCTATACATAAAAAGAGCTCAGT"
+"TGGTTATCGTTCAATGAACGCGTGCTTCAGGAAGCGGCGGACAAATCTAACCCGCTGATTGAAAGGATGCGTTTCCTGGG"
+"GATCTATTCCAATAACCTTGATGAGTTCTATAAAGTCCGCTTCGCTGAACTGAAGCGACGCATTATTAGCGAAGAAC"
+"AAGGTTCCAACTCTCATTCCCGCCATTTACTGGGAAAATTCAGTCCCGGGTGCTGAAAGCCGATCAGGAATTCGACGGC"
+"CTCTTCAACGAGCTATTGCTGGAGATGGCGCGCAACCAGATCTTCCTGATTAATGAACGCCAGCTCTCCGTCAATCAACA"
+"AAACTGGCTGCGTCATTATTTTAAGCAGTATCTGCGTCAGCACATTACGCCGATTTTAATCAATCCTGACACTGACTTAG"
+"TGCATTTCCTGAAAGATGATTACACCTATCTGGCGGTGGAAATTATCCGTGGCGATACCATCCGTTACGCGCTTCTGGAG"
+"ATCCCATCAGATAAAGTGCCGCGCTTTGTGAATTTACCGCAGAAGCGCCGCGTCGACGCAAGCCGATGATTCTTCTGGA"
+"TAACATTCTGCGTTACTGCCTTGATGATATTTTCAAAGGCTTCTTTGATTATGACGCGCTGAATGCCTATTCAATGAAGA"
+"TGACCCGCGATGCCGAATACGATTTAGTGCATGAGATGGAAGCCAGCCTGATGGAGTTGATGTCTTCCAGTCTCAAGCAG"
+"CGTTTAACTGCTGAGCCGGTGCGTTTTGTTTATCGCGCGATATGCCCAATGCGCTGGTTGAAGTTTTACGCGAAAAACT",
+};
+
+
+static
+int
+my_read(Stream *s, void *buf, int n)
+{
+ return io·read(s, 1, n, buf);
+}
+
+// -----------------------------------------------------------------------
+// Point of entry for testing
+
+error
+test·newick()
+{
+ error err;
+ bio·Tree t;
+ mem·Arena *heap;
+ Stream *fd[2];
+
+ io·Peeker rdr;
+ io·Putter wtr;
+
+ bio·Node **end, **it, **list;
+
+ heap = mem·makearena(mem·sys, nil);
+ rdr = (io·Peeker){.get = (byte (*)(void *))io·getbyte, .unget = (error (*)(void *, byte))io·ungetbyte};
+ wtr = (io·Putter){.put = (error (*)(void *, byte))io·putbyte, .putstr = (int (*)(void *, string))io·putstring};
+
+ fd[0] = io·open("/home/nolln/root/data/test/zika.nwk", "r");
+ fd[1] = io·open("/home/nolln/root/data/test/zika.proc.nwk", "w");
+
+ t.h = heap;
+ t.heap = (mem·Allocator){ .alloc = (void *(*)(void *, uint, ulong))mem·arenaalloc, .free = nil, };
+
+ if (err = bio·readnewick(rdr, fd[0], &t), err) {
+ errorf("failed to read newick");
+ return 1;
+ }
+ printf("number of children: %d\n", t.root->nchild);
+
+ phylo·ladderize(t.root);
+
+ list = mem·arenaalloc(heap, t.nleaf, sizeof(**list));
+ phylo·getleafs(t, list);
+ for (it = list, end = list + t.nleaf; it != end; ++it) {
+ printf("Leaf '%s'\n", (*it)->name);
+ }
+
+ bio·Node *path[100];
+ // phylo·diameter(t, path);
+
+ printf("Loaded tree with %d leafs and %d nodes\n", t.nleaf, t.root->nnode);
+ err = bio·writenewick(t, wtr, fd[1]);
+
+ io·flush(fd[1]);
+
+ io·close(fd[0]);
+ io·close(fd[1]);
+
+ mem·freearena(heap);
+ return 0;
+}
+
+error
+test·fasta()
+{
+ error err;
+ Stream *fd;
+
+ bio·Seq seq;
+ bio·FastaReader *rdr;
+
+ clock_t t;
+
+ fd = io·open("/home/nolln/root/data/test/zika.fa", "r");
+
+ /* Benchmark against Heng */
+#if 0
+ int n, slen;
+ kseq_t *kseq;
+
+ t = clock();
+ kseq = kseq_init(fd);
+ while (kseq_read(kseq) >= 0) {
+ ++n, slen += kseq->seq.l;
+ }
+ t = clock() - t;
+ printf("heng's code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC);
+
+ kseq_destroy(kseq);
+
+ io·seek(fd, 0, seek·set);
+#endif
+
+ rdr = bio·openfasta((io·Reader){.read = (int (*)(void *, int, int, void *))io·read}, fd, mem·sys, nil);
+
+ t = clock();
+ err = 0;
+ while (!err) {
+ err = bio·readfasta(rdr, &seq);
+ }
+ t = clock() - t;
+ printf("nick's code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC);
+ bio·closefasta(rdr);
+
+
+ io·close(fd);
+ return err <= 0 ? 0 : 1;
+}
+
+#define asrdr(x) (int (*)(void *, int, int, void *))(x)
+error
+test·fastq()
+{
+ error err;
+ Stream *fd;
+
+ bio·Seq seq;
+ bio·FastqReader *rdr;
+
+ clock_t t;
+
+ fd = io·open("/home/nolln/root/data/test/eg.fq", "r");
+
+ rdr = bio·openfastq((io·Reader){.read = asrdr(io·read)}, fd, mem·sys, nil);
+
+ t = clock();
+ err = 0;
+ while (!err) {
+ err = bio·readfastq(rdr, &seq);
+ }
+ t = clock() - t;
+ printf("nick's fastq code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC);
+ bio·closefastq(rdr);
+
+
+ io·close(fd);
+ return err <= 0 ? 0 : 1;
+}
+
+error
+test·align()
+{
+ double f;
+ error err;
+ int i, l, n;
+
+ uint64 mem[aln·N][arrlen(SEQ)][aln·L];
+ uint64 *phi[aln·N];
+ int loc[aln·N][arrlen(SEQ)][aln·L];
+ int *pos[aln·N];
+
+ for (i = 0; i < arrlen(SEQ); i++) {
+ for (n = 0; n < aln·N; n++) {
+ phi[n] = mem[n][i];
+ pos[n] = loc[n][i];
+ }
+
+ err = aln·sketch(SEQ[i], aln·L, phi, pos);
+ }
+
+ f = 0;
+ for (n = 0; n < aln·N; n++) {
+ aln·sort(arrlen(SEQ), aln·L, (uint64*)mem[n]);
+
+ if (!memcmp(mem[n][0], mem[n][1], sizeof(uint64)*aln·L)) {
+ f += 1.;
+ printf("True : ");
+ } else {
+ printf("False: ");
+ }
+ for (i = 0; i < arrlen(SEQ); i++) {
+ printf("[");
+ for (l = 0; l < aln·L; l++) {
+ printf("%lu,", mem[n][i][l]);
+ }
+ printf("]");
+ if (i == 0) printf(" ~ ");
+ }
+ printf("\n");
+ }
+
+ printf("Fraction hits %f\n", f/aln·N);
+ return err;
+
+}
+
+error
+main()
+{
+ error err;
+
+ if (err = test·newick(), err) {
+ errorf("test fail: newick");
+ }
+
+#if 0
+ if (err = test·fasta(), err) {
+ errorf("test fail: fasta");
+ }
+
+ if (err = test·fastq(), err) {
+ errorf("test fail: fastq");
+ }
+#endif
+}
+