From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- src/libbio/align.c | 178 +++++++++++++++++++++ src/libbio/fasta.c | 393 ++++++++++++++++++++++++++++++++++++++++++++++ src/libbio/newick.c | 414 ++++++++++++++++++++++++++++++++++++++++++++++++ src/libbio/phylo.c | 427 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/libbio/rules.mk | 24 +++ src/libbio/simulate.c | 120 ++++++++++++++ src/libbio/test.c | 283 +++++++++++++++++++++++++++++++++ 7 files changed, 1839 insertions(+) create mode 100644 src/libbio/align.c create mode 100644 src/libbio/fasta.c create mode 100644 src/libbio/newick.c create mode 100644 src/libbio/phylo.c create mode 100644 src/libbio/rules.mk create mode 100644 src/libbio/simulate.c create mode 100644 src/libbio/test.c (limited to 'src/libbio') diff --git a/src/libbio/align.c b/src/libbio/align.c new file mode 100644 index 0000000..20a57df --- /dev/null +++ b/src/libbio/align.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include + +// ----------------------------------------------------------------------- +// globals + +uint64 aln·shft = (2ULL * (aln·K - 1ULL)); +uint64 aln·mask = (1ULL << (2*aln·K)) - 1ULL; + +// ----------------------------------------------------------------------- +// static data + +static uint64 nuctab[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +// ----------------------------------------------------------------------- +// hash functions + +enum +{ + MURM641 = 0xff51afd7ed558ccd, + MURM642 = 0xc4ceb9fe1a85ec53 +}; + +static +uint64 +minihash(uint64 x, uint64 mask) +{ + x = (~x + (x << 21)) & mask; + x = x ^ (x >> 24); + x = (x + (x << 3) + (x << 8)) & mask; + x = x ^ (x >> 14); + x = (x + (x << 2) + (x << 4)) & mask; + x = x ^ (x >> 28); + x = (x + (x << 31)); + + return x; +} + +static +uint64 +murmurhash(uint64 x, uint64 mask) +{ + x = x ^ (x >> 33); + x = (x * MURM641); + x = x ^ (x >> 33); + x = (x * MURM642); + x = x ^ (x >> 33); + + return x; +} + +// ----------------------------------------------------------------------- +// locality sensitive hashing (with spatial extent) + +static +void +sortpos(uintptr len, uint64 vals[], int locs[]) +{ + int tmpi; + uint64 tmpu64; + +#define LESS(i, j) (locs[i] < locs[j]) +#define SWAP(i, j) (tmpu64 = vals[i], tmpi = locs[i], \ + vals[i] = vals[j], locs[i] = locs[j], \ + vals[j] = tmpu64 , locs[j] = tmpi) + QSORT(len, LESS, SWAP); +#undef LESS +#undef SWAP +} + +/* + * sketch + * @param seq: '0' terminated string + * @param len: number of sequential sketches to keep + * @param vals: buffer to store hashes of sketch. + * @param locs: buffer to store location of sketch hashes + */ +error +aln·sketch(byte *seq, int len, uint64 *vals[aln·N], int *locs[aln·N]) +{ + int i, n, l, *loc; + uint64 kmer, h[3], *val; + int tmpi[2]; + uint64 tmpu[2]; + + for(n = 0; n < aln·N; n++) { + for(l = 0; l < len; l++) { + vals[n][l] = UINT64_MAX; + } + } + + kmer = UINT64_MAX; + for(l = 0; *seq != '\0'; seq++, l++) { + kmer = ((kmer << 2) | nuctab[*seq]) & aln·mask; + + h[0] = minihash(kmer, aln·mask); + h[1] = murmurhash(kmer, aln·mask); + + for(n = 0; n < aln·N; n++) { + val = vals[n]; + loc = locs[n]; + + h[2] = (h[0] + n * h[1]) & aln·mask; + for (i = 0; i < len && h[2] < val[i]; i++) { + ; + } + + tmpu[1] = h[2]; + tmpi[1] = l; + for(i -= 1; i >= 0; i--) { + tmpu[0] = tmpu[1], tmpu[1] = val[i], val[i] = tmpu[0]; + tmpi[0] = tmpi[1], tmpi[1] = loc[i], loc[i] = tmpi[0]; + } + } + } + + for(n = 0; n < aln·N; n++) { + sortpos(len, vals[n], locs[n]); + } + + return 0; +} + +static +int +lessarrs(int len, uint64 a[], uint64 b[]) +{ + int l; + + for(l = 0; l < len; l++) { + if (a[l] < b[l]) return 1; + if (a[l] > b[l]) return 0; + } + + return 0; +} + +static +void +swaparrs(int len, uint64 a[], uint64 b[]) +{ + int l; + uint64 tmp; + + for(l = 0; l < len; l++) { + tmp = a[l], a[l] = b[l], b[l] = tmp; + } +} + +error +aln·sort(uintptr n, int len, uint64 *vals) +{ +#define LESS(i, j) (lessarrs(len, vals+((i)*len), vals+((j)*len))) +#define SWAP(i, j) (swaparrs(len, vals+((i)*len), vals+((j)*len))) + QSORT(n, LESS, SWAP); +#undef LESS +#undef SWAP + return 0; +} diff --git a/src/libbio/fasta.c b/src/libbio/fasta.c new file mode 100644 index 0000000..3788544 --- /dev/null +++ b/src/libbio/fasta.c @@ -0,0 +1,393 @@ +#include +#include +#include + +#define INIT_NM_SIZE 128 +#define INIT_SQ_SIZE 4096 + +struct SeqBuf +{ + mem·Allocator mem; + void *heap; + + int cap, off; + byte *it, b[]; +}; + +static +void +reset(struct SeqBuf *sb) +{ + sb->off = 0; + sb->it = sb->b; +} + +static +error +grow(struct SeqBuf **sb, int min) +{ + void* heap; + mem·Allocator mem; + + vlong newcap; + struct SeqBuf *old, *new; + + old = *sb; + mem = old->mem; + heap = old->heap; + + assert((*sb)->cap <= (SIZE_MAX - 1) / 2); + newcap = MAX(16, MAX(1 + 2*(*sb)->cap, (*sb)->cap+min)); + assert(newcap >= (*sb)->cap+min); + + if(new = mem.alloc(heap, 1, sizeof(*new)+newcap), !new) { + errorf("memory: could not allocate new buffer\n"); + return 1; + } + + memcpy(new, old, sizeof(*new) + (*sb)->cap); + + new->cap = newcap; + new->it = new->b + (old->it - old->b); + mem.free(heap, old); + + *sb = new; + return 0; +} + +static +error +put(struct SeqBuf **sb, byte c) +{ + int err; + struct SeqBuf *sq; + + sq = *sb; + if(sq->it < (sq->b + sq->cap)) { + *sq->it++ = c; + return 0; + } + + if(err = grow(sb, 1), err) { + errorf("memory fail: could not allocate more buffer\n"); + sq->mem.free(sq->heap, sq); + return 1; + } + + *((*sb)->it++) = c; + return 0; +} + +static +error +push(struct SeqBuf **sb, int n, void *buf) +{ + int d, err; + struct SeqBuf *seq; + + seq = *sb; + if(d = (seq->cap - (seq->it - seq->b)), d < n) { + assert(d >= 0); + if (err = grow(sb, n-d), err) { + errorf("memory fail: could not allocate more buffer\n"); + seq->mem.free(seq->heap, seq); + return 1; + } + } + seq = *sb; + + memcpy(seq->it, buf, n); + seq->it += n; + + return 0; +} + +// ----------------------------------------------------------------------- +// sequence data + +struct bio·SeqReader { + byte eof; + io·Reader rdr; + void *io; + + struct SeqBuf *seq; + + /* read buffer */ + byte *b, *bend, buf[4*4098]; +}; + +static +error +fill(bio·SeqReader *rdr) +{ + int n; + // NOTE: This could lead to an infinite loop. + if(rdr->eof) + return 0; + + n = rdr->rdr.read(rdr->io, 1, arrlen(rdr->buf), rdr->buf); + if(n < 0) { + errorf("read: no data obtained from reader\n"); + return 1; + } + rdr->b = rdr->buf; + rdr->bend = rdr->b + n; + if(rdr->eof = (n < arrlen(rdr->buf)), rdr->eof) + *rdr->bend++ = '\0'; + + return 0; +} + +bio·SeqReader* +bio·openseq(io·Reader rdr, void *io, mem·Allocator mem, void *heap) +{ + error err; + bio·SeqReader *r; + + r = mem.alloc(heap, 1, sizeof(bio·SeqReader)); + r->rdr = rdr; + r->io = io; + r->eof = 0; + + r->seq = mem.alloc(heap, 1, sizeof(*r->seq) + INIT_NM_SIZE + INIT_SQ_SIZE); + r->seq->mem = mem; + r->seq->heap = heap; + r->seq->it = r->seq->b; + r->seq->cap = INIT_NM_SIZE + INIT_SQ_SIZE; + + if (err=fill(r), err) { + errorf("fill: could not populate buffer\n"); + goto ERROR; + } + + return r; + +ERROR: + mem.free(heap, r->seq); + mem.free(heap, r); + return nil; +} + +error +bio·closeseq(bio·SeqReader *rdr) +{ + mem·Allocator mem; + void *heap; + + mem = rdr->seq->mem; + heap = rdr->seq->heap; + + mem.free(heap, rdr->seq); + mem.free(heap, rdr); + + return 0; +} + + +static +error +readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop) +{ + error err; + byte *beg; + + if(rdr->eof && rdr->b == rdr->bend-1) + return EOF; + + reset(rdr->seq); + + // NOTE: Can this case happen? + assert(rdr->b != rdr->bend); + if(*rdr->b++ != hdr) { + errorf("fasta/q format: expected '%c', found '%c'\n", hdr, *rdr->b--); + return 1; + } + +NAME: + beg = rdr->b; + while(rdr->b != rdr->bend) { + if(*rdr->b++ == '\n') { + push(&rdr->seq, (rdr->b - 1) - beg, beg); + goto SEQ; + } + } + push(&rdr->seq, rdr->b - beg, beg); + + if(err=fill(rdr), err) { + errorf("read: could not populate buffer\n"); + return 1; + } + goto NAME; + +SEQ: + put(&rdr->seq, '\0'); + rdr->seq->off = rdr->seq->it - rdr->seq->b; + +SEQLOOP: + beg = rdr->b; + while(rdr->b != rdr->bend) { + if(*rdr->b == '\n') { + push(&rdr->seq, rdr->b - beg, beg); + beg = rdr->b + 1; + } + + if(*rdr->b == stop || *rdr->b == '\0') + goto SUCCESS; + + rdr->b++; + } + + push(&rdr->seq, rdr->b - beg, beg); + + if(err=fill(rdr), err) { + errorf("read: could not populate buffer\n"); + return 1; + } + goto SEQLOOP; + +SUCCESS: + push(&rdr->seq, rdr->b - beg, beg); + put(&rdr->seq, '\0'); + + return 0; +} + +/* + * fasta files + */ + +error +bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq) +{ + error err; + + err = readfasta(rdr, seq, '>', '>'); + if(err && err != EOF) { + errorf("parse fail: could not read sequence of record\n"); + return err; + } + + seq->name = rdr->seq->b; + seq->s = rdr->seq->b + rdr->seq->off; + seq->len = rdr->seq->it - seq->s - 1; // shift by 1 as we pushed a '0' to end + seq->q = nil; + + return err; +} + +/* + * fastq files + */ + +error +bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq) +{ + int n; + byte *beg; + error err; + + err = readfasta(rdr, seq, '@', '+'); + if(err) { + errorf("parse fail: could not read sequence of record\n"); + return err; + } + + seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off); + + if(*rdr->b++ != '+') { + errorf("format error: no '+' character seperator found\n"); + return -1; + } + +EATLN: + while(rdr->b != rdr->bend) { + if (*rdr->b++ == '\n') { + n = 0; + goto QUAL; + } + } + + if(err = fill((bio·SeqReader*)rdr), err) { + errorf("read: could not populate buffer\n"); + return 1; + } + goto EATLN; + +QUAL: + beg = rdr->b; + while(rdr->b != rdr->bend) { + if(*rdr->b == '\n') { + push(&rdr->seq, rdr->b - beg, beg); + beg = rdr->b + 1; + } + + if(n++ == seq->len || *rdr->b == '\0') { + err = *rdr->b == '\0' ? EOF : 0; + goto SUCCESS; + } + + rdr->b++; + } + + push(&rdr->seq, rdr->b - beg, beg); + + if(err = fill((bio·SeqReader*)rdr), err) { + errorf("read: could not populate buffer\n"); + return 1; + } + goto QUAL; + + +SUCCESS: + push(&rdr->seq, rdr->b - beg, beg); + put(&rdr->seq, '\0'); + + seq->name = rdr->seq->b; + seq->s = rdr->seq->b + rdr->seq->off - 1; + seq->q = seq->s + seq->len + 1; + + return err; +} + +// ----------------------------------------------------------------------- +// sequence writing + +error +bio·writefasta(io·Writer io, void *wtr, bio·Seq seq) +{ + int i, j, d; + char buf[2048], *b = buf, *e = arrend(buf); + + *b++ = '>'; + while(*seq.name) { + *b++ = *seq.name++; + if(b == e) { + io.write(wtr, 1, arrlen(buf), buf); + b = buf; + } + } + + for(i=0; i +#include +#include + +// ----------------------------------------------------------------------- +// Tokens + +enum TokenKind +{ + tok·nil, + tok·eof, + tok·space, + tok·ident, + tok·number, + tok·lparen, + tok·rparen, + tok·lbrak, + tok·rbrak, + tok·comma, + tok·semi, + tok·colon, + + NUM_TOKENS, +}; + + +struct Token { + enum TokenKind kind; + union + { + byte *s; + double x; + } lit; +}; + +static +byte* +tokstr(struct Token tok) +{ + static byte b[50]; + switch (tok.kind) { + case tok·nil: return ""; + case tok·eof: return nil; + case tok·space: return " "; + case tok·ident: return tok.lit.s; + case tok·lparen: return "("; + case tok·rparen: return ")"; + case tok·lbrak: return "["; + case tok·rbrak: return "]"; + case tok·comma: return ","; + case tok·semi: return ";"; + case tok·colon: return ":"; + case tok·number: + snprintf(b, arrlen(b), "%f", tok.lit.x); + return b; + default: + return nil; + } +} + + +// ----------------------------------------------------------------------- +// Read + +// TODO: Bounds checking on buffer +static +struct Token +lex(io·Peeker s, void* fp) +{ +#define isvalidchar(C) ((C) == '!') || \ + ('\"' < (C) && (C) < '\'') || \ + (')' < (C) && (C) < '+') || \ + (',' < (C) && (C) < ':') || \ + (':' < (C) && (C) < '[') || \ + ((C) == '\\') || \ + (']' < (C) && (C) <= '~') + byte *c; + struct Token tok; + static byte b[1024]; + c = b; + *c = s.get(fp); + + if (isspace(*c)) { + while (isspace(*c)) { + *(++c) = s.get(fp); + } + + s.unget(fp, *c); + assert(c - b < 1024); + + *c = 0; + tok.kind = tok·space; + tok.lit.s = b; + return tok; + } + + switch (*c) { + case EOF: tok.kind = tok·eof; return tok; + case '(': tok.kind = tok·lparen; return tok; + case ')': tok.kind = tok·rparen; return tok; + case '[': tok.kind = tok·lbrak; return tok; + case ']': tok.kind = tok·rbrak; return tok; + case ',': tok.kind = tok·comma; return tok; + case ';': tok.kind = tok·semi; return tok; + case ':': tok.kind = tok·colon; return tok; + + case '.': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + while (isdigit(*c)) { + NUM: *(++c) = s.get(fp); + } + if (*c == '.') goto NUM; + if (isvalidchar(*c)) goto IDENT; + + s.unget(fp, *c); + assert(c - b < 1024); + + *c = 0; + tok.kind = tok·number; + tok.lit.x = atof(b); + return tok; + + case '\"': + while ((*c) != '\"') { + *(++c) = s.get(fp); + } + assert(c - b < 1024); + + *c = '\0'; + tok.kind = tok·ident; + tok.lit.s = b + 1; + return tok; + + default: + IDENT: + while (isvalidchar(*c)) { + *(++c) = s.get(fp); + } + s.unget(fp, *c); + assert(c - b < 1024); + + *c = '\0'; + tok.kind = tok·ident; + tok.lit.s = b; + return tok; + } +#undef isvalidchar +} + +static +struct Token +lex_nospace(io·Peeker s, void *impl) +{ + struct Token tok; + tok = lex(s, impl); + if (tok.kind == tok·space) { + tok = lex_nospace(s, impl); + } + + return tok; +} + +struct Parser +{ + int lev; + bio·Node *root; + struct Token tok; + + void *io; + io·Peeker file; + void *heap; + mem·Allocator mem; +}; + +static +error +parse(struct Parser *p) +{ + error err; + bio·Node *node; + bio·Node *root; + struct Token tok; + + node = p->root; + for (;;) { + tok = lex_nospace(p->file, p->io); + + switch (tok.kind) { + case tok·lparen: + if (!p->root && p->lev > 0) { + errorf("parse format: attempted to make root at non-zero level"); + goto ERROR; + } + + node = p->mem.alloc(p->heap, 1, sizeof(*node)); + memset(node, 0, sizeof(*node)); + + if (p->root) { + phylo·addchild(p->root, node); + root = p->root; + } else { + root = node; + } + + p->lev++; + p->root = node; + p->tok = tok; + err = parse(p); + if (err) { + goto ERROR; + } + if (p->tok.kind != tok·rparen) { + errorf("incorrect format: closing parentheses expected to proceed opening"); + goto ERROR; + } + p->root = root; + // NOTE(nnoll): We don't want to override the state of p->tok here! + // Jump straight to grabbing next token. + continue; + + case tok·rparen: + p->lev--; + goto DONE; + + /* Comments */ + case tok·lbrak: + if (!node) { + errorf("incorrect format: comment found in disallowed region"); + goto ERROR; + } + node->comment = str·make(""); + while (tok.kind != tok·rbrak) { + tok = lex_nospace(p->file, p->io); + if (tok.kind == tok·eof || tok.kind == tok·nil) { + errorf("incorrect format: unmatched comment bracket '['"); + goto ERROR; + } + str·append(&node->comment, tokstr(tok)); + } + break; + + case tok·rbrak: + errorf("incorrect format: end comment token found in disallowed region"); + goto ERROR; + break; + + case tok·colon: + tok = lex_nospace(p->file, p->io); + if (tok.kind != tok·number) { + errorf("incorrect format: expected number after colon"); + goto ERROR; + } + if (node == nil) { + errorf("parse error: attempting to set distance of nil node"); + goto ERROR; + } + node->dist = tok.lit.x; + break; + + case tok·comma: + node = nil; + break; + + case tok·ident: + if (p->tok.kind == tok·rparen) { + if (!node) { + errorf("parse error: attempting to set name of nil node"); + goto ERROR; + } + node->name = str·make(tok.lit.s); + } else { + if (p->tok.kind != tok·lparen && p->tok.kind != tok·comma) { + errorf("format error: misplaced identifier for leaf found"); + goto ERROR; + } + + if (!p->root) { + errorf("parse error: attempting to create child for no parent"); + goto ERROR; + } + + node = p->mem.alloc(p->heap, 1, sizeof(*node)); + memset(node, 0, sizeof(*node)); + + node->name = str·make(tok.lit.s); + + phylo·addchild(p->root, node); + } + break; + + case tok·number: + if (p->tok.kind == tok·rparen) { + if (p->lev == 0) { + errorf("format error: support value on root not supported"); + goto ERROR; + } + node->support = tok.lit.x; + } else { + errorf("format error: found number in unexpected location"); + goto ERROR; + } + break; + + case tok·semi: + p->file.unget(p->io, ';'); + if (p->lev) { + errorf("format error: uneven number of parentheses found at ';'"); + goto ERROR; + } + goto DONE; + + case tok·eof: + goto DONE; + + default: + errorf("parse error: unrecognized token"); + goto ERROR; + } + + p->tok = tok; + } + +DONE: + p->tok = tok; + return 0; +ERROR: + // TODO(nnoll): cleanup + return 1; +} + +int +bio·readnewick(io·Peeker stream, void *s, bio·Tree *tree) +{ + error err; + struct Parser p; + + if (!tree) { + errorf("tree pointer nil"); + return 0; + } + + p = (struct Parser){ + .lev = 0, + .root = nil, + .tok = (struct Token){ 0 }, + .io = s, + .file = stream, + .mem = tree->mem, + .heap = tree->heap, + }; + err = parse(&p); + if (err) { + errorf("parsing failed\n"); + return 0; + } + + tree->root = p.root; + tree->nleaf = 0; + tree->root->nnode = 0; + + phylo·countleafs(tree->root, &tree->nleaf); + phylo·countnodes(tree->root, &tree->root->nnode); + + return 1; +} + +// ----------------------------------------------------------------------- +// Write + +static +error +dump(bio·Node *node, void *impl, io·Putter out) +{ + byte b[24]; + + if (!node) { + return 1; + } + + bio·Node *child; + if (node->nchild) { + out.put(impl, '('); + + dump(node->child, impl, out); + for (child = node->child->sibling; child != nil; child = child->sibling) { + out.put(impl, ','); + dump(child, impl, out); + } + + out.put(impl, ')'); + } + if (node->name) { + out.puts(impl, node->name); + } + + if (node->parent) { + out.put(impl, ':'); + snprintf(b, arrlen(b), "%f", node->dist); + out.puts(impl, b); + } + + return 0; +} + +error +bio·writenewick(bio·Tree tree, io·Putter out, void* impl) +{ + dump(tree.root, impl, out); + out.put(impl, ';'); + out.put(impl, '\n'); + + return 0; +} diff --git a/src/libbio/phylo.c b/src/libbio/phylo.c new file mode 100644 index 0000000..d50934f --- /dev/null +++ b/src/libbio/phylo.c @@ -0,0 +1,427 @@ +#include +#include +#include +#include + +// ----------------------------------------------------------------------- +// subtree manipulation methods +// NOTE: As of now these don't update nnode & nleaf stats. +// It is the caller's responsibility to refresh counts. + +error +phylo·addchild(bio·Node* parent, bio·Node* child) +{ + bio·Node *it, *sibling; + if (!parent->nchild) { + parent->child = child; + goto SUCCESS; + } + + for (it = parent->child, sibling = it; it != nil; it = it->sibling) { + sibling = it; + } + sibling->sibling = child; + +SUCCESS: + child->parent = parent; + parent->nchild++; + return 0; +} + +error +phylo·rmchild(bio·Node *parent, bio·Node *child) +{ + bio·Node *it, *prev; + enum { + error·nil, + error·notfound, + error·nochildren, + }; + + prev = nil; + for (it = parent->child; it != nil; it = it->sibling) { + if (it == child) goto FOUND; + prev = it; + } + return error·notfound; + +FOUND: + if (prev == nil) { + parent->child = child->sibling; + } else { + prev->sibling = child->sibling; + } + + parent->nchild--; + return error·nil; +} + +// ----------------------------------------------------------------------- +// subtree statistics + +error +phylo·countnodes(bio·Node *node, int *n) +{ + int m; + error err; + bio·Node *child; + + m = *n; + for (child = node->child; child != nil; child = child->sibling) { + if (err = phylo·countnodes(child, n), err) { + errorf("node count: failure at '%s'", child->name); + return 1; + } + } + node->nnode = *n - m; + *n += 1; + + return 0; +} + +error +phylo·countleafs(bio·Node *node, int *n) +{ + error err; + bio·Node *child; + + if (!node->nchild) { + *n += 1; + } + + for (child = node->child; child != nil; child = child->sibling) { + if (err = phylo·countleafs(child, n), err) { + errorf("leaf count: failure at '%s'", child->name); + return 1; + } + } + + return 0; +} + +// ----------------------------------------------------------------------- +// generic operations on tree + +void* +phylo·postorder(bio·Node *clade, void *(*op)(bio·Node*, void*), void *ctx) +{ + bio·Node *it; + + for(it = clade->child; it != nil; it = it->sibling) { + ctx = phylo·postorder(it, op, ctx); + } + + return op(clade, ctx); +} + +void* +phylo·preorder(bio·Node *clade, void *(*op)(bio·Node*, void*), void *ctx) +{ + bio·Node *it; + + ctx = op(clade, ctx); + for(it = clade->child; it != nil; it = it->sibling) { + ctx = phylo·preorder(it, op, ctx); + } + + return ctx; +} + +int +phylo·collectpostorder(bio·Node *clade, bio·Node **list) +{ + bio·Node *it; + int n; + + for(n = 0, it = clade->child; it != nil; it = it->sibling) { + n += phylo·collectpostorder(it, list+n); + } + + return n; +} + +static +inline +void* +appendleaf(bio·Node *node, void* list) +{ + bio·Node **leafs; + + leafs = list; + if (!node->nchild) { + *leafs++ = node; + } + + return leafs; +} + +void +phylo·getleafs(bio·Tree tree, bio·Node **leafs) +{ + phylo·postorder(tree.root, &appendleaf, leafs); +} + +// ----------------------------------------------------------------------- +// tree editing + +static +void +sortnodelist(bio·Node **head, bio·Node *next) +{ + bio·Node tmp, *it; + + it = &tmp; + tmp.sibling = *head; + + while (it->sibling != nil && it->sibling->nnode < next->nnode) { + it = it->sibling; + } + + next->sibling = it->sibling; + it->sibling = next; + *head = tmp.sibling; +} + +error +phylo·ladderize(bio·Node *root) +{ + int i; + error err; + bio·Node *child, *sorted, *sibling; + + if (!root->nchild) return 0; + + // ladderize below + for (child = root->child; child != nil; child = child->sibling) { + if (err = phylo·ladderize(child), err) { + errorf("ladderize: failure at '%s'", child->name); + return 1; + } + } + + // ladderize yourself + sorted = nil; + child = root->child; + while (child != nil) { + sibling = child->sibling; + sortnodelist(&sorted, child); + child = sibling; + } + root->child = sorted; + + return 0; +} + +/* + * compute all distances from a given node + * must provide a working buffer + */ + +struct Tuple +{ + double *d; + bio·Node **n; +}; + +static +struct Tuple +getdistsfrom(bio·Node *node, bio·Node *prev, double curr, double *dist, bio·Node **list) +{ + bio·Node *it; + struct Tuple ret; + + *dist++ = curr; + *list++ = node; + + ret.d = dist; + ret.n = list; + + if (node->parent && node->parent != prev) { + ret = getdistsfrom(node->parent, node, curr + node->dist, dist, list); + + dist = ret.d; + list = ret.n; + } + + for (it = node->child; it != nil; it = it->sibling) { + if (it != prev) { + ret = getdistsfrom(it, node, curr + it->dist, dist, list); + + dist = ret.d; + list = ret.n; + } + } + + return ret; +} + +int +phylo·getdistsfrom(bio·Node *node, int len, double *dist, bio·Node **list) +{ + struct Tuple ret; + // TODO: Better bounds checking. + + ret = getdistsfrom(node, nil, 0.0, dist, list); + + assert(ret.n - list == len); + assert(ret.d - dist == len); + + return len; +} + +/* +static +void +disttoroot(bio·Node *clade, double anc, double *dists) +{ + double d; + bio·Node *it; + + *dists++ = anc + clade->dist; + d = dists[-1]; + for (it = clade->child; it != nil; it = it->sibling) { + disttoroot(it, d, ++dists); + } +} + +void +phylo·disttoroot(bio·Tree tree, double *dists) +{ + disttoroot(tree.root, 0.0, dists); +} +*/ + +/* + * compute the path constituting the tree diameter + * returns the number of edges in the path + */ + +static +void +sort·nodedists(uintptr len, double fs[], bio·Node* ns[]) +{ + double f; + bio·Node *n; +#define LESS(i, j) (fs[i] < fs[j]) +#define SWAP(i, j) (n = ns[i], f = fs[i], \ + fs[i] = fs[j], ns[i] = ns[j], \ + fs[j] = f, ns[j] = n) + QSORT(len, LESS, SWAP); +#undef LESS +#undef SWAP +} + +#define BUFLEN 4096 +double +phylo·diameter(bio·Tree tree, int *len, bio·Node **path) +{ + // TODO: deal with large tree > BUFLEN gracefully + int n; + double fbuf[BUFLEN]; + bio·Node *nbuf[BUFLEN]; + + n = tree.root->nnode; + + assert(n < BUFLEN); + + n = phylo·getdistsfrom(tree.root, tree.root->nnode, fbuf, nbuf); + sort·nodedists(n, fbuf, nbuf); + + path[0] = nbuf[n-1]; + printf("first end '%s'\n", path[0]->name); + + n = phylo·getdistsfrom(path[0], n, fbuf, nbuf); + sort·nodedists(n, fbuf, nbuf); + printf("second end '%s'\n", nbuf[n-1]->name); + + *len = 0; + + // TODO: Traverse up the tree from each node + // Find MRCA by intersection of nodes hit + + return 0.0; +} +#undef BUFLEN + +/* + * reroot a tree on a new node + */ +static +error +rotateparent(bio·Node *node, bio·Node *to) +{ + error err; + + // NOTE: will this ever be taken? + if (node->parent == to) { + return 0; + } + + if (!node->parent) { + goto RMCHILD; + } + + err = rotateparent(node->parent, node); + if (err) { + errorf("failure: broken tree"); + return err; + } + + err = phylo·addchild(node, node->parent); + if (err) { + errorf("inconsistent topology: could not add parent '%s' as child of '%s'", node->parent->name, node->name); + return err; + } + +RMCHILD: + err = phylo·rmchild(node, to); + if (err) { + errorf("inconsistent topology: could not remove child '%s' from '%s'", to->name, node->name); + return err; + } + + node->parent = to; + return 0; +} + +#define PREC .00000001 +error +phylo·reroot(bio·Tree *tree, bio·Node *node, double d) +{ + bio·Node *new; + + // TODO: should check that node is part of this tree? + // TODO: should we check if node->parent != nil? + + if (fabs(d) < PREC) { + new = node; + rotateparent(node->parent, node); + } else if (fabs(d-node->dist) < PREC) { + new = node->parent; + if (new->parent->parent) { + rotateparent(new->parent->parent, new->parent); + } + } else { + new = tree->mem.alloc(tree->heap, 1, sizeof(*new)); + memset(new, 0, sizeof(*new)); + + phylo·addchild(new, node); + node->parent = new; + + phylo·addchild(new, node->parent); + if (node->parent->parent) { + rotateparent(node->parent->parent, node->parent); + } + node->parent->parent = new; + } + + printf("number of children on old root: %d\n", tree->root->nchild); + tree->root = new; + tree->nleaf = 0; + + phylo·countleafs(new, &tree->nleaf); + phylo·countnodes(new, &new->nnode); + + return 0; +} +#undef PREC diff --git a/src/libbio/rules.mk b/src/libbio/rules.mk new file mode 100644 index 0000000..07ce97e --- /dev/null +++ b/src/libbio/rules.mk @@ -0,0 +1,24 @@ +include share/push.mk + +# Local sources +SRCS_$(d) := \ + $(d)/fasta.c \ + $(d)/newick.c \ + $(d)/phylo.c +LIBS_$(d) := $(d)/libbio.a +BINS_$(d) := +# CHECK_$(d) := \ +# $(d)/test.c \ +# $(d)/simulate.c + +include share/paths.mk + +# Local rules +$(LIBS_$(d)): $(OBJS_$(d)) $(OBJS_$(d)/io) + $(ARCHIVE) + +$(TEST_$(d)): TCLIBS = $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a +$(TEST_$(d)): $(UNIT_$(d)) $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a + $(LINK) + +include share/pop.mk diff --git a/src/libbio/simulate.c b/src/libbio/simulate.c new file mode 100644 index 0000000..0f5a97e --- /dev/null +++ b/src/libbio/simulate.c @@ -0,0 +1,120 @@ +#include +#include +#include + +#define SEQLEN 2560 +static byte *SEQ = +"GGCGGCTTCGGTGCGCTGTGTGCATTGCCGCAAAAATATCGTGAACCCGTGCTGGTTTCCGGCACTGACGGCGTAGGTAC" +"CAAGCTGCGTCTGGCAATGGACTTAAAACGTCACGACACCATTGGTATTGATCTGGTCGCCATGTGCGTTAATGACCTGG" +"TGGTGCAAGGTGCGGAACCGCTGTTTTTCCTCGACTATTACGCAACCGGAAAACTGGATGTTGATACCGCTTCAGCGGTG" +"ATCAGCGGCATTGCGGAAGGTTGTCTGCAATCGGGCTGTTCTCTGGTGGGTGGCGAAACGGCAGAAATGCCGGGGATGTA" +"TCACGGTGAAGATTACGATGTCGCGGGTTTCTGCGTGGGCGTGGTAGAAAAATCAGAAATCATCGACGGCTCTAAAGTCA" +"GCGACGGCGATGTGCTGATTGCACTCGGTTCCAGCGGTCCGCACTCGAACGGTTATTCGCTGGTGCGCAAAATTCTTGAA" +"GTCAGCGGTTGTGATCCGCAAACCACCGAACTTGATGGTAAGCCATTAGCCGATCATCTGCTGGCACCGACCCGCATTTA" +"CGTGAAGTCAGTGCTGGAGTTGATTGAAAAGGTCGATGTGCATGCCATTGCGCACCTGACCGGCGGCGGCTTCTGGGAAA" +"ACATTCCGCGCGTATTGCCAGATAATACCCAGGCAGTGATTGATGAATCTTCCTGGCAGTGGCCGGAAGTGTTCAACTGG" +"CTGCAAACGGCAGGTAACGTTGAGCGCCATGAAATGTATCGCACCTTCAACTGCGGCGTCGGGATGATTATCGCCCTGCC" +"TGCTCCGGAAGTGGACAAAGCCCTCGCCCTGCTCAATGCCAACGGTGAAAACGCGTGGAAAATCGGTATCATCAAAGCCT" +"CTGATTCCGAACAACGCGTGGTTATCGAATAATGAATATTGTGGTGCTTATTTCCGGCAACGGAAGTAATTTACAGGCAA" +"TTATTGACGCCTGTAAAACCAACAAAATTAAAGGCACCGTACGGGCAGTTTTCAGCAATAAGGCCGACGCGTTCGGCCTT" +"GAACGCGCCCGCCAGGCGGGTATTGCAACGCATACGCTCATCGCCAGCGCGTTTGACAGTCGTGAAGCCTATGACCGGGA" +"GTTGATTCATGAAATCGACATGTACGCACCCGATGTGGTCGTGCTGGCTGGTTTTATGCGCATTCTCAGCCCGGCGTTTG" +"TCTCCCACTATGCCGGGCGTTTGCTGAACATTCACCCTTCTCTGCTGCCGAAATATCCCGGATTACACACCCATCGTCAA" +"GCGCTGGAAAATGGCGATGAAGAGCACGGTACATCGGTGCATTTCGTCACCGATGAACTGGACGGTGGCCCGGTTATTTT" +"ACAGGCGAAAGTCCCGGTATTTGCTGGTGATACGGAAGATGACGTCACCGCCCGCGTGCAAACCCAGGAACACGCCATTT" +"ATCCACTGGTGATTAGCTGGTTTGCCGATGGTCGTCTGAAAATGCACGAAAACGCCGCGTGGCTGGATGGTCAACGTCTG" +"CCGCCGCAGGGCTACGCTGCCGACGAGTAATGCCCCCGTAGTTAAAGCGCCAGCTCTGCCGCTGGCGTTTTTCAATTCAC" +"CTGTAAATCGCAAGCTCCAGCAGTTTTTTTCCCCCTTTTCTGGCATAGTTGGACATCTGCCAATATTGCTCGCCATAATA" +"TCCAGGCAGTGTCCCGTGAATAAAACGGAGTAAAAGTGGTAATGGGTCAGGAAAAGCTATACATCGAAAAAGAGCTCAGT" +"TGGTTATCGTTCAATGAACGCGTGCTTCAGGAAGCGGCGGACAAATCTAACCCGCTGATTGAAAGGATGCGTTTCCTGGG" +"GATCTATTCCAATAACCTTGATGAGTTCTATAAAGTCCGCTTCGCTGAACTGAAGCGACGCATCATTATTAGCGAAGAAC" +"AAGGCTCCAACTCTCATTCCCGCCATTTACTGGGCAAAATTCAGTCCCGGGTGCTGAAAGCCGATCAGGAATTCGACGGC" +"CTCTACAACGAGCTATTGCTGGAGATGGCGCGCAACCAGATCTTCCTGATTAATGAACGCCAGCTCTCCGTCAATCAACA" +"AAACTGGCTGCGTCATTATTTTAAGCAGTATCTGCGTCAGCACATTACGCCGATTTTAATCAATCCTGACACTGACTTAG" +"TGCAGTTCCTGAAAGATGATTACACCTATCTGGCGGTGGAAATTATCCGTGGCGATACCATCCGTTACGCGCTTCTGGAG" +"ATCCCATCAGATAAAGTGCCGCGCTTTGTGAATTTACCGCCAGAAGCGCCGCGTCGACGCAAGCCGATGATTCTTCTGGA" +"TAACATTCTGCGTTACTGCCTTGATGATATTTTCAAAGGCTTCTTTGATTATGACGCGCTGAATGCCTATTCAATGAAGA" +"TGACCCGCGATGCCGAATACGATTTAGTGCATGAGATGGAAGCCAGCCTGATGGAGTTGATGTCTTCCAGTCTCAAGCAG" +"CGTTTAACTGCTGAGCCGGTGCGTTTTGTTTATCAGCGCGATATGCCCAATGCGCTGGTTGAAGTTTTACGCGAAAAACT"; + +byte* +modify(byte *seq, int *len, double p) +{ + byte *head, *new; + + head = calloc(SEQLEN+1, sizeof(byte)); + new = head; + for (; *seq != '\0'; seq++) { + if (rng·bernoulli(p)) { + switch (rng·randi(5)) { + case 0: *new++ = 'A'; break; + case 1: *new++ = 'C'; break; + case 2: *new++ = 'G'; break; + case 3: *new++ = 'T'; break; + case 4: continue; + } + } else { + *new++ = *seq; + } + } + *new = '\0'; + *len = new - head; + return head; +} + +#define NSEQS 20 +int +main() +{ + int n, i, l, lens[NSEQS]; + byte *seqs[NSEQS]; + + int locs[aln·N][NSEQS][aln·L]; + int *loc[aln·N]; + uint64 vals[aln·N][NSEQS][aln·L]; + uint64 *val[aln·N]; + + rng·init(0); + + seqs[0] = SEQ; + lens[0] = SEQLEN; + + for (n = 0; n < aln·N; n++) { + for (i = 0; i < NSEQS; i++) { + for (l = 0; l < aln·L; l++) { + vals[n][i][l] = 0; + } + } + } + + for (i = 1; i < NSEQS; i++) { + seqs[i] = modify(SEQ, lens + i, .01*i); + } + + for (i = 0; i < NSEQS; i++) { + for (n = 0; n < aln·N; n++) { + val[n] = vals[n][i]; + loc[n] = locs[n][i]; + } + aln·sketch(seqs[i], aln·L, val, loc); + } + + // for (n = 0; n < aln·N; n++) { + // printf("iteration %d\n", n); + // printf("[\n"); + // for (i = 0; i < NSEQS; i++) { + // printf(" ["); + // for (l = 0; l < aln·L; l++) { + // printf("%lu,", vals[n][i][l]); + // } + // printf("],\n"); + // } + // printf("]\n"); + // } + + for (n = 0; n < aln·N; n++) { + aln·sort(NSEQS, aln·L, (uint64*)vals[n]); + } + + return 0; +} diff --git a/src/libbio/test.c b/src/libbio/test.c new file mode 100644 index 0000000..9926764 --- /dev/null +++ b/src/libbio/test.c @@ -0,0 +1,283 @@ +#include +#include +#include + +#include + +// ----------------------------------------------------------------------- +// Global data + +static byte *SEQ[] = { +"GGCGGCTTCGGTGCGCTGTGTGCATTGCCGCAAAAATATCGTGAACCCGTGCTGGTTTCCGGCACTGACGGCGTAGGTAC" +"CAAGCTGCGTCTGGCAATGGACTTAAAACGTCACGACACCATTGGTATTGATCTGGTCGCCATGTGCGTTAATGACCTGG" +"TGGTGCAAGGTGCGGAACCGCTGTTTTTCCTCGACTATTACGCAACCGGAAAACTGGATGTTGATACCGCTTCAGCGGTG" +"ATCAGCGGCATTGCGGAAGGTTGTCTGCAATCGGGCTGTTCTCTGGTGGGTGGCGAAACGGCAGAAATGCCGGGGATGTA" +"TCACGGTGAAGATTACGATGTCGCGGGTTTCTGCGTGGGCGTGGTAGAAAAATCAGAAATCATCGACGGCTCTAAAGTCA" +"GCGACGGCGATGTGCTGATTGCACTCGGTTCCAGCGGTCCGCACTCGAACGGTTATTCGCTGGTGCGCAAAATTCTTGAA" +"GTCAGCGGTTGTGATCCGCAAACCACCGAACTTGATGGTAAGCCATTAGCCGATCATCTGCTGGCACCGACCCGCATTTA" +"CGTGAAGTCAGTGCTGGAGTTGATTGAAAAGGTCGATGTGCATGCCATTGCGCACCTGACCGGCGGCGGCTTCTGGGAAA" +"ACATTCCGCGCGTATTGCCAGATAATACCCAGGCAGTGATTGATGAATCTTCCTGGCAGTGGCCGGAAGTGTTCAACTGG" +"CTGCAAACGGCAGGTAACGTTGAGCGCCATGAAATGTATCGCACCTTCAACTGCGGCGTCGGGATGATTATCGCCCTGCC" +"TGCTCCGGAAGTGGACAAAGCCCTCGCCCTGCTCAATGCCAACGGTGAAAACGCGTGGAAAATCGGTATCATCAAAGCCT" +"CTGATTCCGAACAACGCGTGGTTATCGAATAATGAATATTGTGGTGCTTATTTCCGGCAACGGAAGTAATTTACAGGCAA" +"TTATTGACGCCTGTAAAACCAACAAAATTAAAGGCACCGTACGGGCAGTTTTCAGCAATAAGGCCGACGCGTTCGGCCTT" +"GAACGCGCCCGCCAGGCGGGTATTGCAACGCATACGCTCATCGCCAGCGCGTTTGACAGTCGTGAAGCCTATGACCGGGA" +"GTTGATTCATGAAATCGACATGTACGCACCCGATGTGGTCGTGCTGGCTGGTTTTATGCGCATTCTCAGCCCGGCGTTTG" +"TCTCCCACTATGCCGGGCGTTTGCTGAACATTCACCCTTCTCTGCTGCCGAAATATCCCGGATTACACACCCATCGTCAA" +"GCGCTGGAAAATGGCGATGAAGAGCACGGTACATCGGTGCATTTCGTCACCGATGAACTGGACGGTGGCCCGGTTATTTT" +"ACAGGCGAAAGTCCCGGTATTTGCTGGTGATACGGAAGATGACGTCACCGCCCGCGTGCAAACCCAGGAACACGCCATTT" +"ATCCACTGGTGATTAGCTGGTTTGCCGATGGTCGTCTGAAAATGCACGAAAACGCCGCGTGGCTGGATGGTCAACGTCTG" +"CCGCCGCAGGGCTACGCTGCCGACGAGTAATGCCCCCGTAGTTAAAGCGCCAGCTCTGCCGCTGGCGTTTTTCAATTCAC" +"CTGTAAATCGCAAGCTCCAGCAGTTTTTTTCCCCCTTTTCTGGCATAGTTGGACATCTGCCAATATTGCTCGCCATAATA" +"TCCAGGCAGTGTCCCGTGAATAAAACGGAGTAAAAGTGGTAATGGGTCAGGAAAAGCTATACATCGAAAAAGAGCTCAGT" +"TGGTTATCGTTCAATGAACGCGTGCTTCAGGAAGCGGCGGACAAATCTAACCCGCTGATTGAAAGGATGCGTTTCCTGGG" +"GATCTATTCCAATAACCTTGATGAGTTCTATAAAGTCCGCTTCGCTGAACTGAAGCGACGCATCATTATTAGCGAAGAAC" +"AAGGCTCCAACTCTCATTCCCGCCATTTACTGGGCAAAATTCAGTCCCGGGTGCTGAAAGCCGATCAGGAATTCGACGGC" +"CTCTACAACGAGCTATTGCTGGAGATGGCGCGCAACCAGATCTTCCTGATTAATGAACGCCAGCTCTCCGTCAATCAACA" +"AAACTGGCTGCGTCATTATTTTAAGCAGTATCTGCGTCAGCACATTACGCCGATTTTAATCAATCCTGACACTGACTTAG" +"TGCAGTTCCTGAAAGATGATTACACCTATCTGGCGGTGGAAATTATCCGTGGCGATACCATCCGTTACGCGCTTCTGGAG" +"ATCCCATCAGATAAAGTGCCGCGCTTTGTGAATTTACCGCCAGAAGCGCCGCGTCGACGCAAGCCGATGATTCTTCTGGA" +"TAACATTCTGCGTTACTGCCTTGATGATATTTTCAAAGGCTTCTTTGATTATGACGCGCTGAATGCCTATTCAATGAAGA" +"TGACCCGCGATGCCGAATACGATTTAGTGCATGAGATGGAAGCCAGCCTGATGGAGTTGATGTCTTCCAGTCTCAAGCAG" +"CGTTTAACTGCTGAGCCGGTGCGTTTTGTTTATCAGCGCGATATGCCCAATGCGCTGGTTGAAGTTTTACGCGAAAAACT", + +"GGCGGCTTCGGTGCGCTGTGTGCATTGCCGCAAAAATATCGTGAACCCGTGCTGGTTTCCGGCACTGACGGCGTAAATAC" +"CAAGCTGCGTCTGGCAATGGACTTAAAACGTCACGACACCATTGGTATTGATCTGGTCGCCATGTGCGTTAATGACCTGG" +"TGGTGCAAGGTGCGGAACCGCTGTTTTTCCTCGACTATTACGCACCGGAAAACTGGATGTTGATACCGCTTCAGCGGTG" +"ATCAGCGGCATTGCGGAAGGTTGTCTGCAATCGGGCTGTTCTCTGGTGGGTGGCGAAACGGCAGAAATGCCGGGGATGTA" +"TCACGGTGAAGATTACGATGTCGCGGGTTTCTGCGTGGGCGTGGTAGAAAAATCAGAAATCATCGACGGCAAAGTCA" +"GCGACGGCGATGTGCTGATTGCACTCGGTTCCAGCGGTCCGCACTCGAACGGTTATTCGCTGGTGCGCAAAATTCTTGAA" +"GTCAGCGGTTGTGATCCGCAAACCACCGAACTTGATGGTAAGCCATTAGCCGATCATCTGCTGGCACCGACCCGCATTTA" +"ACATTCCGCGCGTATTGCCAGATAATACCCAGGCAGTGATTGATGAATCTTCCTGGCAGTGGCCGGAAGTGTTCAACTGG" +"CTGCAAACGGCAGGTAACGTTGAGCGCCATGAAATGTATCGCACCTTCAACTGCGGCGTCGGGATGATTATCCCCTGCC" +"TGCTCCGGAAGTGGACAAAGCCCTCGCCCTGCTCAATGCCAACGGTGAAAACGCGTGGAAAATCGGTATCATCAAAGCCT" +"CTGATTCCGAACAACGCGTGGTTATCGAATAATGAATATTGTGTGCTTATTTCCGGCAACGGAAGTAATTTACAGGCAA" +"TTATTGACGCCTGTAAAACCAACAAAATTAAAGGCACCGTACGGGCAGTTTTCAGCAATAAGGCCGACGCGCGGCCTT" +"GAACGCGCCCGCCAGGCGGGTATTGCAACGCATACGCTCATCGCCAGCGCGTTTGACAGTCGTGAAGCCTATGACCGGGA" +"GTTGATTCATGAAATCGACATGTACGCACCCGATGTGGTCGTGCTGGCTGGTTTTATGCGCATTCTCAGCCCGGCGTTTG" +"TCTCCCACTATGCCGGGCGTTTGCTGAACATTCACCCTTCTCTGCTGCCGAAATATCCCGGATTACACACCCATCGTCAA" +"GCGCTGGAAAATGGCGATGAAGAGCACGGTACATCGGGCATTTCGTCACCGATGAACTGGACGGTGGCCCGGTTATTTT" +"ACAGTCGAAAGTCCCGGTATTTGCTGGTGATACGGAAGATGACGTCACCGCCCGCGTGCAAACCCAGGAACACGCCATTT" +"ATCCTCTGGTGATTAGCTGGTTTGCCGATGGTCGTCTGAAAATGCACGAAAACGCCGCGTGGCTGGATGGTCAACGTCTG" +"CCGCTGCAGGGCTACGCTGCCGACGAGTAATGCCCCCGTAGTTAAAGCGCCAGCTCTGCCGCTGGCGTTTTTCAATTCAC" +"CTGTTAATCGCAAGCTCCAGCAGCCCCCCCCCCCCTTTTCTGCATAGTTGGACATCTGCCAATATTGCTCGCCATAATA" +"TCCATGCAGTGTCCCGTGAATAAAACGGAGTAAAAGTGGTAATGGGTCAGGAAAAGCTATACATAAAAAGAGCTCAGT" +"TGGTTATCGTTCAATGAACGCGTGCTTCAGGAAGCGGCGGACAAATCTAACCCGCTGATTGAAAGGATGCGTTTCCTGGG" +"GATCTATTCCAATAACCTTGATGAGTTCTATAAAGTCCGCTTCGCTGAACTGAAGCGACGCATTATTAGCGAAGAAC" +"AAGGTTCCAACTCTCATTCCCGCCATTTACTGGGAAAATTCAGTCCCGGGTGCTGAAAGCCGATCAGGAATTCGACGGC" +"CTCTTCAACGAGCTATTGCTGGAGATGGCGCGCAACCAGATCTTCCTGATTAATGAACGCCAGCTCTCCGTCAATCAACA" +"AAACTGGCTGCGTCATTATTTTAAGCAGTATCTGCGTCAGCACATTACGCCGATTTTAATCAATCCTGACACTGACTTAG" +"TGCATTTCCTGAAAGATGATTACACCTATCTGGCGGTGGAAATTATCCGTGGCGATACCATCCGTTACGCGCTTCTGGAG" +"ATCCCATCAGATAAAGTGCCGCGCTTTGTGAATTTACCGCAGAAGCGCCGCGTCGACGCAAGCCGATGATTCTTCTGGA" +"TAACATTCTGCGTTACTGCCTTGATGATATTTTCAAAGGCTTCTTTGATTATGACGCGCTGAATGCCTATTCAATGAAGA" +"TGACCCGCGATGCCGAATACGATTTAGTGCATGAGATGGAAGCCAGCCTGATGGAGTTGATGTCTTCCAGTCTCAAGCAG" +"CGTTTAACTGCTGAGCCGGTGCGTTTTGTTTATCGCGCGATATGCCCAATGCGCTGGTTGAAGTTTTACGCGAAAAACT", +}; + + +static +int +my_read(Stream *s, void *buf, int n) +{ + return io·read(s, 1, n, buf); +} + +// ----------------------------------------------------------------------- +// Point of entry for testing + +error +test·newick() +{ + error err; + bio·Tree t; + mem·Arena *heap; + Stream *fd[2]; + + io·Peeker rdr; + io·Putter wtr; + + bio·Node **end, **it, **list; + + heap = mem·makearena(mem·sys, nil); + rdr = (io·Peeker){.get = (byte (*)(void *))io·getbyte, .unget = (error (*)(void *, byte))io·ungetbyte}; + wtr = (io·Putter){.put = (error (*)(void *, byte))io·putbyte, .putstr = (int (*)(void *, string))io·putstring}; + + fd[0] = io·open("/home/nolln/root/data/test/zika.nwk", "r"); + fd[1] = io·open("/home/nolln/root/data/test/zika.proc.nwk", "w"); + + t.h = heap; + t.heap = (mem·Allocator){ .alloc = (void *(*)(void *, uint, ulong))mem·arenaalloc, .free = nil, }; + + if (err = bio·readnewick(rdr, fd[0], &t), err) { + errorf("failed to read newick"); + return 1; + } + printf("number of children: %d\n", t.root->nchild); + + phylo·ladderize(t.root); + + list = mem·arenaalloc(heap, t.nleaf, sizeof(**list)); + phylo·getleafs(t, list); + for (it = list, end = list + t.nleaf; it != end; ++it) { + printf("Leaf '%s'\n", (*it)->name); + } + + bio·Node *path[100]; + // phylo·diameter(t, path); + + printf("Loaded tree with %d leafs and %d nodes\n", t.nleaf, t.root->nnode); + err = bio·writenewick(t, wtr, fd[1]); + + io·flush(fd[1]); + + io·close(fd[0]); + io·close(fd[1]); + + mem·freearena(heap); + return 0; +} + +error +test·fasta() +{ + error err; + Stream *fd; + + bio·Seq seq; + bio·FastaReader *rdr; + + clock_t t; + + fd = io·open("/home/nolln/root/data/test/zika.fa", "r"); + + /* Benchmark against Heng */ +#if 0 + int n, slen; + kseq_t *kseq; + + t = clock(); + kseq = kseq_init(fd); + while (kseq_read(kseq) >= 0) { + ++n, slen += kseq->seq.l; + } + t = clock() - t; + printf("heng's code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC); + + kseq_destroy(kseq); + + io·seek(fd, 0, seek·set); +#endif + + rdr = bio·openfasta((io·Reader){.read = (int (*)(void *, int, int, void *))io·read}, fd, mem·sys, nil); + + t = clock(); + err = 0; + while (!err) { + err = bio·readfasta(rdr, &seq); + } + t = clock() - t; + printf("nick's code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC); + bio·closefasta(rdr); + + + io·close(fd); + return err <= 0 ? 0 : 1; +} + +#define asrdr(x) (int (*)(void *, int, int, void *))(x) +error +test·fastq() +{ + error err; + Stream *fd; + + bio·Seq seq; + bio·FastqReader *rdr; + + clock_t t; + + fd = io·open("/home/nolln/root/data/test/eg.fq", "r"); + + rdr = bio·openfastq((io·Reader){.read = asrdr(io·read)}, fd, mem·sys, nil); + + t = clock(); + err = 0; + while (!err) { + err = bio·readfastq(rdr, &seq); + } + t = clock() - t; + printf("nick's fastq code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC); + bio·closefastq(rdr); + + + io·close(fd); + return err <= 0 ? 0 : 1; +} + +error +test·align() +{ + double f; + error err; + int i, l, n; + + uint64 mem[aln·N][arrlen(SEQ)][aln·L]; + uint64 *phi[aln·N]; + int loc[aln·N][arrlen(SEQ)][aln·L]; + int *pos[aln·N]; + + for (i = 0; i < arrlen(SEQ); i++) { + for (n = 0; n < aln·N; n++) { + phi[n] = mem[n][i]; + pos[n] = loc[n][i]; + } + + err = aln·sketch(SEQ[i], aln·L, phi, pos); + } + + f = 0; + for (n = 0; n < aln·N; n++) { + aln·sort(arrlen(SEQ), aln·L, (uint64*)mem[n]); + + if (!memcmp(mem[n][0], mem[n][1], sizeof(uint64)*aln·L)) { + f += 1.; + printf("True : "); + } else { + printf("False: "); + } + for (i = 0; i < arrlen(SEQ); i++) { + printf("["); + for (l = 0; l < aln·L; l++) { + printf("%lu,", mem[n][i][l]); + } + printf("]"); + if (i == 0) printf(" ~ "); + } + printf("\n"); + } + + printf("Fraction hits %f\n", f/aln·N); + return err; + +} + +error +main() +{ + error err; + + if (err = test·newick(), err) { + errorf("test fail: newick"); + } + +#if 0 + if (err = test·fasta(), err) { + errorf("test fail: fasta"); + } + + if (err = test·fastq(), err) { + errorf("test fail: fastq"); + } +#endif +} + -- cgit v1.2.1