From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- sys/libbio/align.c | 178 --------------------- sys/libbio/fasta.c | 393 ---------------------------------------------- sys/libbio/newick.c | 414 ------------------------------------------------ sys/libbio/phylo.c | 427 -------------------------------------------------- sys/libbio/rules.mk | 28 ---- sys/libbio/simulate.c | 120 -------------- sys/libbio/test.c | 283 --------------------------------- 7 files changed, 1843 deletions(-) delete mode 100644 sys/libbio/align.c delete mode 100644 sys/libbio/fasta.c delete mode 100644 sys/libbio/newick.c delete mode 100644 sys/libbio/phylo.c delete mode 100644 sys/libbio/rules.mk delete mode 100644 sys/libbio/simulate.c delete mode 100644 sys/libbio/test.c (limited to 'sys/libbio') diff --git a/sys/libbio/align.c b/sys/libbio/align.c deleted file mode 100644 index 20a57df..0000000 --- a/sys/libbio/align.c +++ /dev/null @@ -1,178 +0,0 @@ -#include -#include -#include -#include - -// ----------------------------------------------------------------------- -// globals - -uint64 aln·shft = (2ULL * (aln·K - 1ULL)); -uint64 aln·mask = (1ULL << (2*aln·K)) - 1ULL; - -// ----------------------------------------------------------------------- -// static data - -static uint64 nuctab[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, -}; - -// ----------------------------------------------------------------------- -// hash functions - -enum -{ - MURM641 = 0xff51afd7ed558ccd, - MURM642 = 0xc4ceb9fe1a85ec53 -}; - -static -uint64 -minihash(uint64 x, uint64 mask) -{ - x = (~x + (x << 21)) & mask; - x = x ^ (x >> 24); - x = (x + (x << 3) + (x << 8)) & mask; - x = x ^ (x >> 14); - x = (x + (x << 2) + (x << 4)) & mask; - x = x ^ (x >> 28); - x = (x + (x << 31)); - - return x; -} - -static -uint64 -murmurhash(uint64 x, uint64 mask) -{ - x = x ^ (x >> 33); - x = (x * MURM641); - x = x ^ (x >> 33); - x = (x * MURM642); - x = x ^ (x >> 33); - - return x; -} - -// ----------------------------------------------------------------------- -// locality sensitive hashing (with spatial extent) - -static -void -sortpos(uintptr len, uint64 vals[], int locs[]) -{ - int tmpi; - uint64 tmpu64; - -#define LESS(i, j) (locs[i] < locs[j]) -#define SWAP(i, j) (tmpu64 = vals[i], tmpi = locs[i], \ - vals[i] = vals[j], locs[i] = locs[j], \ - vals[j] = tmpu64 , locs[j] = tmpi) - QSORT(len, LESS, SWAP); -#undef LESS -#undef SWAP -} - -/* - * sketch - * @param seq: '0' terminated string - * @param len: number of sequential sketches to keep - * @param vals: buffer to store hashes of sketch. - * @param locs: buffer to store location of sketch hashes - */ -error -aln·sketch(byte *seq, int len, uint64 *vals[aln·N], int *locs[aln·N]) -{ - int i, n, l, *loc; - uint64 kmer, h[3], *val; - int tmpi[2]; - uint64 tmpu[2]; - - for(n = 0; n < aln·N; n++) { - for(l = 0; l < len; l++) { - vals[n][l] = UINT64_MAX; - } - } - - kmer = UINT64_MAX; - for(l = 0; *seq != '\0'; seq++, l++) { - kmer = ((kmer << 2) | nuctab[*seq]) & aln·mask; - - h[0] = minihash(kmer, aln·mask); - h[1] = murmurhash(kmer, aln·mask); - - for(n = 0; n < aln·N; n++) { - val = vals[n]; - loc = locs[n]; - - h[2] = (h[0] + n * h[1]) & aln·mask; - for (i = 0; i < len && h[2] < val[i]; i++) { - ; - } - - tmpu[1] = h[2]; - tmpi[1] = l; - for(i -= 1; i >= 0; i--) { - tmpu[0] = tmpu[1], tmpu[1] = val[i], val[i] = tmpu[0]; - tmpi[0] = tmpi[1], tmpi[1] = loc[i], loc[i] = tmpi[0]; - } - } - } - - for(n = 0; n < aln·N; n++) { - sortpos(len, vals[n], locs[n]); - } - - return 0; -} - -static -int -lessarrs(int len, uint64 a[], uint64 b[]) -{ - int l; - - for(l = 0; l < len; l++) { - if (a[l] < b[l]) return 1; - if (a[l] > b[l]) return 0; - } - - return 0; -} - -static -void -swaparrs(int len, uint64 a[], uint64 b[]) -{ - int l; - uint64 tmp; - - for(l = 0; l < len; l++) { - tmp = a[l], a[l] = b[l], b[l] = tmp; - } -} - -error -aln·sort(uintptr n, int len, uint64 *vals) -{ -#define LESS(i, j) (lessarrs(len, vals+((i)*len), vals+((j)*len))) -#define SWAP(i, j) (swaparrs(len, vals+((i)*len), vals+((j)*len))) - QSORT(n, LESS, SWAP); -#undef LESS -#undef SWAP - return 0; -} diff --git a/sys/libbio/fasta.c b/sys/libbio/fasta.c deleted file mode 100644 index 3788544..0000000 --- a/sys/libbio/fasta.c +++ /dev/null @@ -1,393 +0,0 @@ -#include -#include -#include - -#define INIT_NM_SIZE 128 -#define INIT_SQ_SIZE 4096 - -struct SeqBuf -{ - mem·Allocator mem; - void *heap; - - int cap, off; - byte *it, b[]; -}; - -static -void -reset(struct SeqBuf *sb) -{ - sb->off = 0; - sb->it = sb->b; -} - -static -error -grow(struct SeqBuf **sb, int min) -{ - void* heap; - mem·Allocator mem; - - vlong newcap; - struct SeqBuf *old, *new; - - old = *sb; - mem = old->mem; - heap = old->heap; - - assert((*sb)->cap <= (SIZE_MAX - 1) / 2); - newcap = MAX(16, MAX(1 + 2*(*sb)->cap, (*sb)->cap+min)); - assert(newcap >= (*sb)->cap+min); - - if(new = mem.alloc(heap, 1, sizeof(*new)+newcap), !new) { - errorf("memory: could not allocate new buffer\n"); - return 1; - } - - memcpy(new, old, sizeof(*new) + (*sb)->cap); - - new->cap = newcap; - new->it = new->b + (old->it - old->b); - mem.free(heap, old); - - *sb = new; - return 0; -} - -static -error -put(struct SeqBuf **sb, byte c) -{ - int err; - struct SeqBuf *sq; - - sq = *sb; - if(sq->it < (sq->b + sq->cap)) { - *sq->it++ = c; - return 0; - } - - if(err = grow(sb, 1), err) { - errorf("memory fail: could not allocate more buffer\n"); - sq->mem.free(sq->heap, sq); - return 1; - } - - *((*sb)->it++) = c; - return 0; -} - -static -error -push(struct SeqBuf **sb, int n, void *buf) -{ - int d, err; - struct SeqBuf *seq; - - seq = *sb; - if(d = (seq->cap - (seq->it - seq->b)), d < n) { - assert(d >= 0); - if (err = grow(sb, n-d), err) { - errorf("memory fail: could not allocate more buffer\n"); - seq->mem.free(seq->heap, seq); - return 1; - } - } - seq = *sb; - - memcpy(seq->it, buf, n); - seq->it += n; - - return 0; -} - -// ----------------------------------------------------------------------- -// sequence data - -struct bio·SeqReader { - byte eof; - io·Reader rdr; - void *io; - - struct SeqBuf *seq; - - /* read buffer */ - byte *b, *bend, buf[4*4098]; -}; - -static -error -fill(bio·SeqReader *rdr) -{ - int n; - // NOTE: This could lead to an infinite loop. - if(rdr->eof) - return 0; - - n = rdr->rdr.read(rdr->io, 1, arrlen(rdr->buf), rdr->buf); - if(n < 0) { - errorf("read: no data obtained from reader\n"); - return 1; - } - rdr->b = rdr->buf; - rdr->bend = rdr->b + n; - if(rdr->eof = (n < arrlen(rdr->buf)), rdr->eof) - *rdr->bend++ = '\0'; - - return 0; -} - -bio·SeqReader* -bio·openseq(io·Reader rdr, void *io, mem·Allocator mem, void *heap) -{ - error err; - bio·SeqReader *r; - - r = mem.alloc(heap, 1, sizeof(bio·SeqReader)); - r->rdr = rdr; - r->io = io; - r->eof = 0; - - r->seq = mem.alloc(heap, 1, sizeof(*r->seq) + INIT_NM_SIZE + INIT_SQ_SIZE); - r->seq->mem = mem; - r->seq->heap = heap; - r->seq->it = r->seq->b; - r->seq->cap = INIT_NM_SIZE + INIT_SQ_SIZE; - - if (err=fill(r), err) { - errorf("fill: could not populate buffer\n"); - goto ERROR; - } - - return r; - -ERROR: - mem.free(heap, r->seq); - mem.free(heap, r); - return nil; -} - -error -bio·closeseq(bio·SeqReader *rdr) -{ - mem·Allocator mem; - void *heap; - - mem = rdr->seq->mem; - heap = rdr->seq->heap; - - mem.free(heap, rdr->seq); - mem.free(heap, rdr); - - return 0; -} - - -static -error -readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop) -{ - error err; - byte *beg; - - if(rdr->eof && rdr->b == rdr->bend-1) - return EOF; - - reset(rdr->seq); - - // NOTE: Can this case happen? - assert(rdr->b != rdr->bend); - if(*rdr->b++ != hdr) { - errorf("fasta/q format: expected '%c', found '%c'\n", hdr, *rdr->b--); - return 1; - } - -NAME: - beg = rdr->b; - while(rdr->b != rdr->bend) { - if(*rdr->b++ == '\n') { - push(&rdr->seq, (rdr->b - 1) - beg, beg); - goto SEQ; - } - } - push(&rdr->seq, rdr->b - beg, beg); - - if(err=fill(rdr), err) { - errorf("read: could not populate buffer\n"); - return 1; - } - goto NAME; - -SEQ: - put(&rdr->seq, '\0'); - rdr->seq->off = rdr->seq->it - rdr->seq->b; - -SEQLOOP: - beg = rdr->b; - while(rdr->b != rdr->bend) { - if(*rdr->b == '\n') { - push(&rdr->seq, rdr->b - beg, beg); - beg = rdr->b + 1; - } - - if(*rdr->b == stop || *rdr->b == '\0') - goto SUCCESS; - - rdr->b++; - } - - push(&rdr->seq, rdr->b - beg, beg); - - if(err=fill(rdr), err) { - errorf("read: could not populate buffer\n"); - return 1; - } - goto SEQLOOP; - -SUCCESS: - push(&rdr->seq, rdr->b - beg, beg); - put(&rdr->seq, '\0'); - - return 0; -} - -/* - * fasta files - */ - -error -bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq) -{ - error err; - - err = readfasta(rdr, seq, '>', '>'); - if(err && err != EOF) { - errorf("parse fail: could not read sequence of record\n"); - return err; - } - - seq->name = rdr->seq->b; - seq->s = rdr->seq->b + rdr->seq->off; - seq->len = rdr->seq->it - seq->s - 1; // shift by 1 as we pushed a '0' to end - seq->q = nil; - - return err; -} - -/* - * fastq files - */ - -error -bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq) -{ - int n; - byte *beg; - error err; - - err = readfasta(rdr, seq, '@', '+'); - if(err) { - errorf("parse fail: could not read sequence of record\n"); - return err; - } - - seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off); - - if(*rdr->b++ != '+') { - errorf("format error: no '+' character seperator found\n"); - return -1; - } - -EATLN: - while(rdr->b != rdr->bend) { - if (*rdr->b++ == '\n') { - n = 0; - goto QUAL; - } - } - - if(err = fill((bio·SeqReader*)rdr), err) { - errorf("read: could not populate buffer\n"); - return 1; - } - goto EATLN; - -QUAL: - beg = rdr->b; - while(rdr->b != rdr->bend) { - if(*rdr->b == '\n') { - push(&rdr->seq, rdr->b - beg, beg); - beg = rdr->b + 1; - } - - if(n++ == seq->len || *rdr->b == '\0') { - err = *rdr->b == '\0' ? EOF : 0; - goto SUCCESS; - } - - rdr->b++; - } - - push(&rdr->seq, rdr->b - beg, beg); - - if(err = fill((bio·SeqReader*)rdr), err) { - errorf("read: could not populate buffer\n"); - return 1; - } - goto QUAL; - - -SUCCESS: - push(&rdr->seq, rdr->b - beg, beg); - put(&rdr->seq, '\0'); - - seq->name = rdr->seq->b; - seq->s = rdr->seq->b + rdr->seq->off - 1; - seq->q = seq->s + seq->len + 1; - - return err; -} - -// ----------------------------------------------------------------------- -// sequence writing - -error -bio·writefasta(io·Writer io, void *wtr, bio·Seq seq) -{ - int i, j, d; - char buf[2048], *b = buf, *e = arrend(buf); - - *b++ = '>'; - while(*seq.name) { - *b++ = *seq.name++; - if(b == e) { - io.write(wtr, 1, arrlen(buf), buf); - b = buf; - } - } - - for(i=0; i -#include -#include - -// ----------------------------------------------------------------------- -// Tokens - -enum TokenKind -{ - tok·nil, - tok·eof, - tok·space, - tok·ident, - tok·number, - tok·lparen, - tok·rparen, - tok·lbrak, - tok·rbrak, - tok·comma, - tok·semi, - tok·colon, - - NUM_TOKENS, -}; - - -struct Token { - enum TokenKind kind; - union - { - byte *s; - double x; - } lit; -}; - -static -byte* -tokstr(struct Token tok) -{ - static byte b[50]; - switch (tok.kind) { - case tok·nil: return ""; - case tok·eof: return nil; - case tok·space: return " "; - case tok·ident: return tok.lit.s; - case tok·lparen: return "("; - case tok·rparen: return ")"; - case tok·lbrak: return "["; - case tok·rbrak: return "]"; - case tok·comma: return ","; - case tok·semi: return ";"; - case tok·colon: return ":"; - case tok·number: - snprintf(b, arrlen(b), "%f", tok.lit.x); - return b; - default: - return nil; - } -} - - -// ----------------------------------------------------------------------- -// Read - -// TODO: Bounds checking on buffer -static -struct Token -lex(io·Peeker s, void* fp) -{ -#define isvalidchar(C) ((C) == '!') || \ - ('\"' < (C) && (C) < '\'') || \ - (')' < (C) && (C) < '+') || \ - (',' < (C) && (C) < ':') || \ - (':' < (C) && (C) < '[') || \ - ((C) == '\\') || \ - (']' < (C) && (C) <= '~') - byte *c; - struct Token tok; - static byte b[1024]; - c = b; - *c = s.get(fp); - - if (isspace(*c)) { - while (isspace(*c)) { - *(++c) = s.get(fp); - } - - s.unget(fp, *c); - assert(c - b < 1024); - - *c = 0; - tok.kind = tok·space; - tok.lit.s = b; - return tok; - } - - switch (*c) { - case EOF: tok.kind = tok·eof; return tok; - case '(': tok.kind = tok·lparen; return tok; - case ')': tok.kind = tok·rparen; return tok; - case '[': tok.kind = tok·lbrak; return tok; - case ']': tok.kind = tok·rbrak; return tok; - case ',': tok.kind = tok·comma; return tok; - case ';': tok.kind = tok·semi; return tok; - case ':': tok.kind = tok·colon; return tok; - - case '.': - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - while (isdigit(*c)) { - NUM: *(++c) = s.get(fp); - } - if (*c == '.') goto NUM; - if (isvalidchar(*c)) goto IDENT; - - s.unget(fp, *c); - assert(c - b < 1024); - - *c = 0; - tok.kind = tok·number; - tok.lit.x = atof(b); - return tok; - - case '\"': - while ((*c) != '\"') { - *(++c) = s.get(fp); - } - assert(c - b < 1024); - - *c = '\0'; - tok.kind = tok·ident; - tok.lit.s = b + 1; - return tok; - - default: - IDENT: - while (isvalidchar(*c)) { - *(++c) = s.get(fp); - } - s.unget(fp, *c); - assert(c - b < 1024); - - *c = '\0'; - tok.kind = tok·ident; - tok.lit.s = b; - return tok; - } -#undef isvalidchar -} - -static -struct Token -lex_nospace(io·Peeker s, void *impl) -{ - struct Token tok; - tok = lex(s, impl); - if (tok.kind == tok·space) { - tok = lex_nospace(s, impl); - } - - return tok; -} - -struct Parser -{ - int lev; - bio·Node *root; - struct Token tok; - - void *io; - io·Peeker file; - void *heap; - mem·Allocator mem; -}; - -static -error -parse(struct Parser *p) -{ - error err; - bio·Node *node; - bio·Node *root; - struct Token tok; - - node = p->root; - for (;;) { - tok = lex_nospace(p->file, p->io); - - switch (tok.kind) { - case tok·lparen: - if (!p->root && p->lev > 0) { - errorf("parse format: attempted to make root at non-zero level"); - goto ERROR; - } - - node = p->mem.alloc(p->heap, 1, sizeof(*node)); - memset(node, 0, sizeof(*node)); - - if (p->root) { - phylo·addchild(p->root, node); - root = p->root; - } else { - root = node; - } - - p->lev++; - p->root = node; - p->tok = tok; - err = parse(p); - if (err) { - goto ERROR; - } - if (p->tok.kind != tok·rparen) { - errorf("incorrect format: closing parentheses expected to proceed opening"); - goto ERROR; - } - p->root = root; - // NOTE(nnoll): We don't want to override the state of p->tok here! - // Jump straight to grabbing next token. - continue; - - case tok·rparen: - p->lev--; - goto DONE; - - /* Comments */ - case tok·lbrak: - if (!node) { - errorf("incorrect format: comment found in disallowed region"); - goto ERROR; - } - node->comment = str·make(""); - while (tok.kind != tok·rbrak) { - tok = lex_nospace(p->file, p->io); - if (tok.kind == tok·eof || tok.kind == tok·nil) { - errorf("incorrect format: unmatched comment bracket '['"); - goto ERROR; - } - str·append(&node->comment, tokstr(tok)); - } - break; - - case tok·rbrak: - errorf("incorrect format: end comment token found in disallowed region"); - goto ERROR; - break; - - case tok·colon: - tok = lex_nospace(p->file, p->io); - if (tok.kind != tok·number) { - errorf("incorrect format: expected number after colon"); - goto ERROR; - } - if (node == nil) { - errorf("parse error: attempting to set distance of nil node"); - goto ERROR; - } - node->dist = tok.lit.x; - break; - - case tok·comma: - node = nil; - break; - - case tok·ident: - if (p->tok.kind == tok·rparen) { - if (!node) { - errorf("parse error: attempting to set name of nil node"); - goto ERROR; - } - node->name = str·make(tok.lit.s); - } else { - if (p->tok.kind != tok·lparen && p->tok.kind != tok·comma) { - errorf("format error: misplaced identifier for leaf found"); - goto ERROR; - } - - if (!p->root) { - errorf("parse error: attempting to create child for no parent"); - goto ERROR; - } - - node = p->mem.alloc(p->heap, 1, sizeof(*node)); - memset(node, 0, sizeof(*node)); - - node->name = str·make(tok.lit.s); - - phylo·addchild(p->root, node); - } - break; - - case tok·number: - if (p->tok.kind == tok·rparen) { - if (p->lev == 0) { - errorf("format error: support value on root not supported"); - goto ERROR; - } - node->support = tok.lit.x; - } else { - errorf("format error: found number in unexpected location"); - goto ERROR; - } - break; - - case tok·semi: - p->file.unget(p->io, ';'); - if (p->lev) { - errorf("format error: uneven number of parentheses found at ';'"); - goto ERROR; - } - goto DONE; - - case tok·eof: - goto DONE; - - default: - errorf("parse error: unrecognized token"); - goto ERROR; - } - - p->tok = tok; - } - -DONE: - p->tok = tok; - return 0; -ERROR: - // TODO(nnoll): cleanup - return 1; -} - -int -bio·readnewick(io·Peeker stream, void *s, bio·Tree *tree) -{ - error err; - struct Parser p; - - if (!tree) { - errorf("tree pointer nil"); - return 0; - } - - p = (struct Parser){ - .lev = 0, - .root = nil, - .tok = (struct Token){ 0 }, - .io = s, - .file = stream, - .mem = tree->mem, - .heap = tree->heap, - }; - err = parse(&p); - if (err) { - errorf("parsing failed\n"); - return 0; - } - - tree->root = p.root; - tree->nleaf = 0; - tree->root->nnode = 0; - - phylo·countleafs(tree->root, &tree->nleaf); - phylo·countnodes(tree->root, &tree->root->nnode); - - return 1; -} - -// ----------------------------------------------------------------------- -// Write - -static -error -dump(bio·Node *node, void *impl, io·Putter out) -{ - byte b[24]; - - if (!node) { - return 1; - } - - bio·Node *child; - if (node->nchild) { - out.put(impl, '('); - - dump(node->child, impl, out); - for (child = node->child->sibling; child != nil; child = child->sibling) { - out.put(impl, ','); - dump(child, impl, out); - } - - out.put(impl, ')'); - } - if (node->name) { - out.puts(impl, node->name); - } - - if (node->parent) { - out.put(impl, ':'); - snprintf(b, arrlen(b), "%f", node->dist); - out.puts(impl, b); - } - - return 0; -} - -error -bio·writenewick(bio·Tree tree, io·Putter out, void* impl) -{ - dump(tree.root, impl, out); - out.put(impl, ';'); - out.put(impl, '\n'); - - return 0; -} diff --git a/sys/libbio/phylo.c b/sys/libbio/phylo.c deleted file mode 100644 index d50934f..0000000 --- a/sys/libbio/phylo.c +++ /dev/null @@ -1,427 +0,0 @@ -#include -#include -#include -#include - -// ----------------------------------------------------------------------- -// subtree manipulation methods -// NOTE: As of now these don't update nnode & nleaf stats. -// It is the caller's responsibility to refresh counts. - -error -phylo·addchild(bio·Node* parent, bio·Node* child) -{ - bio·Node *it, *sibling; - if (!parent->nchild) { - parent->child = child; - goto SUCCESS; - } - - for (it = parent->child, sibling = it; it != nil; it = it->sibling) { - sibling = it; - } - sibling->sibling = child; - -SUCCESS: - child->parent = parent; - parent->nchild++; - return 0; -} - -error -phylo·rmchild(bio·Node *parent, bio·Node *child) -{ - bio·Node *it, *prev; - enum { - error·nil, - error·notfound, - error·nochildren, - }; - - prev = nil; - for (it = parent->child; it != nil; it = it->sibling) { - if (it == child) goto FOUND; - prev = it; - } - return error·notfound; - -FOUND: - if (prev == nil) { - parent->child = child->sibling; - } else { - prev->sibling = child->sibling; - } - - parent->nchild--; - return error·nil; -} - -// ----------------------------------------------------------------------- -// subtree statistics - -error -phylo·countnodes(bio·Node *node, int *n) -{ - int m; - error err; - bio·Node *child; - - m = *n; - for (child = node->child; child != nil; child = child->sibling) { - if (err = phylo·countnodes(child, n), err) { - errorf("node count: failure at '%s'", child->name); - return 1; - } - } - node->nnode = *n - m; - *n += 1; - - return 0; -} - -error -phylo·countleafs(bio·Node *node, int *n) -{ - error err; - bio·Node *child; - - if (!node->nchild) { - *n += 1; - } - - for (child = node->child; child != nil; child = child->sibling) { - if (err = phylo·countleafs(child, n), err) { - errorf("leaf count: failure at '%s'", child->name); - return 1; - } - } - - return 0; -} - -// ----------------------------------------------------------------------- -// generic operations on tree - -void* -phylo·postorder(bio·Node *clade, void *(*op)(bio·Node*, void*), void *ctx) -{ - bio·Node *it; - - for(it = clade->child; it != nil; it = it->sibling) { - ctx = phylo·postorder(it, op, ctx); - } - - return op(clade, ctx); -} - -void* -phylo·preorder(bio·Node *clade, void *(*op)(bio·Node*, void*), void *ctx) -{ - bio·Node *it; - - ctx = op(clade, ctx); - for(it = clade->child; it != nil; it = it->sibling) { - ctx = phylo·preorder(it, op, ctx); - } - - return ctx; -} - -int -phylo·collectpostorder(bio·Node *clade, bio·Node **list) -{ - bio·Node *it; - int n; - - for(n = 0, it = clade->child; it != nil; it = it->sibling) { - n += phylo·collectpostorder(it, list+n); - } - - return n; -} - -static -inline -void* -appendleaf(bio·Node *node, void* list) -{ - bio·Node **leafs; - - leafs = list; - if (!node->nchild) { - *leafs++ = node; - } - - return leafs; -} - -void -phylo·getleafs(bio·Tree tree, bio·Node **leafs) -{ - phylo·postorder(tree.root, &appendleaf, leafs); -} - -// ----------------------------------------------------------------------- -// tree editing - -static -void -sortnodelist(bio·Node **head, bio·Node *next) -{ - bio·Node tmp, *it; - - it = &tmp; - tmp.sibling = *head; - - while (it->sibling != nil && it->sibling->nnode < next->nnode) { - it = it->sibling; - } - - next->sibling = it->sibling; - it->sibling = next; - *head = tmp.sibling; -} - -error -phylo·ladderize(bio·Node *root) -{ - int i; - error err; - bio·Node *child, *sorted, *sibling; - - if (!root->nchild) return 0; - - // ladderize below - for (child = root->child; child != nil; child = child->sibling) { - if (err = phylo·ladderize(child), err) { - errorf("ladderize: failure at '%s'", child->name); - return 1; - } - } - - // ladderize yourself - sorted = nil; - child = root->child; - while (child != nil) { - sibling = child->sibling; - sortnodelist(&sorted, child); - child = sibling; - } - root->child = sorted; - - return 0; -} - -/* - * compute all distances from a given node - * must provide a working buffer - */ - -struct Tuple -{ - double *d; - bio·Node **n; -}; - -static -struct Tuple -getdistsfrom(bio·Node *node, bio·Node *prev, double curr, double *dist, bio·Node **list) -{ - bio·Node *it; - struct Tuple ret; - - *dist++ = curr; - *list++ = node; - - ret.d = dist; - ret.n = list; - - if (node->parent && node->parent != prev) { - ret = getdistsfrom(node->parent, node, curr + node->dist, dist, list); - - dist = ret.d; - list = ret.n; - } - - for (it = node->child; it != nil; it = it->sibling) { - if (it != prev) { - ret = getdistsfrom(it, node, curr + it->dist, dist, list); - - dist = ret.d; - list = ret.n; - } - } - - return ret; -} - -int -phylo·getdistsfrom(bio·Node *node, int len, double *dist, bio·Node **list) -{ - struct Tuple ret; - // TODO: Better bounds checking. - - ret = getdistsfrom(node, nil, 0.0, dist, list); - - assert(ret.n - list == len); - assert(ret.d - dist == len); - - return len; -} - -/* -static -void -disttoroot(bio·Node *clade, double anc, double *dists) -{ - double d; - bio·Node *it; - - *dists++ = anc + clade->dist; - d = dists[-1]; - for (it = clade->child; it != nil; it = it->sibling) { - disttoroot(it, d, ++dists); - } -} - -void -phylo·disttoroot(bio·Tree tree, double *dists) -{ - disttoroot(tree.root, 0.0, dists); -} -*/ - -/* - * compute the path constituting the tree diameter - * returns the number of edges in the path - */ - -static -void -sort·nodedists(uintptr len, double fs[], bio·Node* ns[]) -{ - double f; - bio·Node *n; -#define LESS(i, j) (fs[i] < fs[j]) -#define SWAP(i, j) (n = ns[i], f = fs[i], \ - fs[i] = fs[j], ns[i] = ns[j], \ - fs[j] = f, ns[j] = n) - QSORT(len, LESS, SWAP); -#undef LESS -#undef SWAP -} - -#define BUFLEN 4096 -double -phylo·diameter(bio·Tree tree, int *len, bio·Node **path) -{ - // TODO: deal with large tree > BUFLEN gracefully - int n; - double fbuf[BUFLEN]; - bio·Node *nbuf[BUFLEN]; - - n = tree.root->nnode; - - assert(n < BUFLEN); - - n = phylo·getdistsfrom(tree.root, tree.root->nnode, fbuf, nbuf); - sort·nodedists(n, fbuf, nbuf); - - path[0] = nbuf[n-1]; - printf("first end '%s'\n", path[0]->name); - - n = phylo·getdistsfrom(path[0], n, fbuf, nbuf); - sort·nodedists(n, fbuf, nbuf); - printf("second end '%s'\n", nbuf[n-1]->name); - - *len = 0; - - // TODO: Traverse up the tree from each node - // Find MRCA by intersection of nodes hit - - return 0.0; -} -#undef BUFLEN - -/* - * reroot a tree on a new node - */ -static -error -rotateparent(bio·Node *node, bio·Node *to) -{ - error err; - - // NOTE: will this ever be taken? - if (node->parent == to) { - return 0; - } - - if (!node->parent) { - goto RMCHILD; - } - - err = rotateparent(node->parent, node); - if (err) { - errorf("failure: broken tree"); - return err; - } - - err = phylo·addchild(node, node->parent); - if (err) { - errorf("inconsistent topology: could not add parent '%s' as child of '%s'", node->parent->name, node->name); - return err; - } - -RMCHILD: - err = phylo·rmchild(node, to); - if (err) { - errorf("inconsistent topology: could not remove child '%s' from '%s'", to->name, node->name); - return err; - } - - node->parent = to; - return 0; -} - -#define PREC .00000001 -error -phylo·reroot(bio·Tree *tree, bio·Node *node, double d) -{ - bio·Node *new; - - // TODO: should check that node is part of this tree? - // TODO: should we check if node->parent != nil? - - if (fabs(d) < PREC) { - new = node; - rotateparent(node->parent, node); - } else if (fabs(d-node->dist) < PREC) { - new = node->parent; - if (new->parent->parent) { - rotateparent(new->parent->parent, new->parent); - } - } else { - new = tree->mem.alloc(tree->heap, 1, sizeof(*new)); - memset(new, 0, sizeof(*new)); - - phylo·addchild(new, node); - node->parent = new; - - phylo·addchild(new, node->parent); - if (node->parent->parent) { - rotateparent(node->parent->parent, node->parent); - } - node->parent->parent = new; - } - - printf("number of children on old root: %d\n", tree->root->nchild); - tree->root = new; - tree->nleaf = 0; - - phylo·countleafs(new, &tree->nleaf); - phylo·countnodes(new, &new->nnode); - - return 0; -} -#undef PREC diff --git a/sys/libbio/rules.mk b/sys/libbio/rules.mk deleted file mode 100644 index cbc6887..0000000 --- a/sys/libbio/rules.mk +++ /dev/null @@ -1,28 +0,0 @@ -include share/push.mk - -# Local sources -SRCS_$(d) := \ - $(d)/fasta.c \ - $(d)/newick.c \ - $(d)/phylo.c -LIBS_$(d) := $(d)/libbio.a -BINS_$(d) := -# TSTS_$(d) := \ -# $(d)/test.c \ -# $(d)/simulate.c - -include share/paths.mk - -# Local rules -# $(LIBS_$(d)) = TCFLAGS := -# $(LIBS_$(d)) = TCINCS := -# $(LIBS_$(d)) = TCLIBS := - -$(LIBS_$(d)): $(OBJS_$(d)) $(OBJS_$(d)/io) - $(ARCHIVE) - -$(UNTS_$(d)): TCLIBS := $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a -$(UNTS_$(d)): $(TOBJS_$(d)) $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a - $(LINK) - -include share/pop.mk diff --git a/sys/libbio/simulate.c b/sys/libbio/simulate.c deleted file mode 100644 index 0f5a97e..0000000 --- a/sys/libbio/simulate.c +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include - -#define SEQLEN 2560 -static byte *SEQ = -"GGCGGCTTCGGTGCGCTGTGTGCATTGCCGCAAAAATATCGTGAACCCGTGCTGGTTTCCGGCACTGACGGCGTAGGTAC" -"CAAGCTGCGTCTGGCAATGGACTTAAAACGTCACGACACCATTGGTATTGATCTGGTCGCCATGTGCGTTAATGACCTGG" -"TGGTGCAAGGTGCGGAACCGCTGTTTTTCCTCGACTATTACGCAACCGGAAAACTGGATGTTGATACCGCTTCAGCGGTG" -"ATCAGCGGCATTGCGGAAGGTTGTCTGCAATCGGGCTGTTCTCTGGTGGGTGGCGAAACGGCAGAAATGCCGGGGATGTA" -"TCACGGTGAAGATTACGATGTCGCGGGTTTCTGCGTGGGCGTGGTAGAAAAATCAGAAATCATCGACGGCTCTAAAGTCA" -"GCGACGGCGATGTGCTGATTGCACTCGGTTCCAGCGGTCCGCACTCGAACGGTTATTCGCTGGTGCGCAAAATTCTTGAA" -"GTCAGCGGTTGTGATCCGCAAACCACCGAACTTGATGGTAAGCCATTAGCCGATCATCTGCTGGCACCGACCCGCATTTA" -"CGTGAAGTCAGTGCTGGAGTTGATTGAAAAGGTCGATGTGCATGCCATTGCGCACCTGACCGGCGGCGGCTTCTGGGAAA" -"ACATTCCGCGCGTATTGCCAGATAATACCCAGGCAGTGATTGATGAATCTTCCTGGCAGTGGCCGGAAGTGTTCAACTGG" -"CTGCAAACGGCAGGTAACGTTGAGCGCCATGAAATGTATCGCACCTTCAACTGCGGCGTCGGGATGATTATCGCCCTGCC" -"TGCTCCGGAAGTGGACAAAGCCCTCGCCCTGCTCAATGCCAACGGTGAAAACGCGTGGAAAATCGGTATCATCAAAGCCT" -"CTGATTCCGAACAACGCGTGGTTATCGAATAATGAATATTGTGGTGCTTATTTCCGGCAACGGAAGTAATTTACAGGCAA" -"TTATTGACGCCTGTAAAACCAACAAAATTAAAGGCACCGTACGGGCAGTTTTCAGCAATAAGGCCGACGCGTTCGGCCTT" -"GAACGCGCCCGCCAGGCGGGTATTGCAACGCATACGCTCATCGCCAGCGCGTTTGACAGTCGTGAAGCCTATGACCGGGA" -"GTTGATTCATGAAATCGACATGTACGCACCCGATGTGGTCGTGCTGGCTGGTTTTATGCGCATTCTCAGCCCGGCGTTTG" -"TCTCCCACTATGCCGGGCGTTTGCTGAACATTCACCCTTCTCTGCTGCCGAAATATCCCGGATTACACACCCATCGTCAA" -"GCGCTGGAAAATGGCGATGAAGAGCACGGTACATCGGTGCATTTCGTCACCGATGAACTGGACGGTGGCCCGGTTATTTT" -"ACAGGCGAAAGTCCCGGTATTTGCTGGTGATACGGAAGATGACGTCACCGCCCGCGTGCAAACCCAGGAACACGCCATTT" -"ATCCACTGGTGATTAGCTGGTTTGCCGATGGTCGTCTGAAAATGCACGAAAACGCCGCGTGGCTGGATGGTCAACGTCTG" -"CCGCCGCAGGGCTACGCTGCCGACGAGTAATGCCCCCGTAGTTAAAGCGCCAGCTCTGCCGCTGGCGTTTTTCAATTCAC" -"CTGTAAATCGCAAGCTCCAGCAGTTTTTTTCCCCCTTTTCTGGCATAGTTGGACATCTGCCAATATTGCTCGCCATAATA" -"TCCAGGCAGTGTCCCGTGAATAAAACGGAGTAAAAGTGGTAATGGGTCAGGAAAAGCTATACATCGAAAAAGAGCTCAGT" -"TGGTTATCGTTCAATGAACGCGTGCTTCAGGAAGCGGCGGACAAATCTAACCCGCTGATTGAAAGGATGCGTTTCCTGGG" -"GATCTATTCCAATAACCTTGATGAGTTCTATAAAGTCCGCTTCGCTGAACTGAAGCGACGCATCATTATTAGCGAAGAAC" -"AAGGCTCCAACTCTCATTCCCGCCATTTACTGGGCAAAATTCAGTCCCGGGTGCTGAAAGCCGATCAGGAATTCGACGGC" -"CTCTACAACGAGCTATTGCTGGAGATGGCGCGCAACCAGATCTTCCTGATTAATGAACGCCAGCTCTCCGTCAATCAACA" -"AAACTGGCTGCGTCATTATTTTAAGCAGTATCTGCGTCAGCACATTACGCCGATTTTAATCAATCCTGACACTGACTTAG" -"TGCAGTTCCTGAAAGATGATTACACCTATCTGGCGGTGGAAATTATCCGTGGCGATACCATCCGTTACGCGCTTCTGGAG" -"ATCCCATCAGATAAAGTGCCGCGCTTTGTGAATTTACCGCCAGAAGCGCCGCGTCGACGCAAGCCGATGATTCTTCTGGA" -"TAACATTCTGCGTTACTGCCTTGATGATATTTTCAAAGGCTTCTTTGATTATGACGCGCTGAATGCCTATTCAATGAAGA" -"TGACCCGCGATGCCGAATACGATTTAGTGCATGAGATGGAAGCCAGCCTGATGGAGTTGATGTCTTCCAGTCTCAAGCAG" -"CGTTTAACTGCTGAGCCGGTGCGTTTTGTTTATCAGCGCGATATGCCCAATGCGCTGGTTGAAGTTTTACGCGAAAAACT"; - -byte* -modify(byte *seq, int *len, double p) -{ - byte *head, *new; - - head = calloc(SEQLEN+1, sizeof(byte)); - new = head; - for (; *seq != '\0'; seq++) { - if (rng·bernoulli(p)) { - switch (rng·randi(5)) { - case 0: *new++ = 'A'; break; - case 1: *new++ = 'C'; break; - case 2: *new++ = 'G'; break; - case 3: *new++ = 'T'; break; - case 4: continue; - } - } else { - *new++ = *seq; - } - } - *new = '\0'; - *len = new - head; - return head; -} - -#define NSEQS 20 -int -main() -{ - int n, i, l, lens[NSEQS]; - byte *seqs[NSEQS]; - - int locs[aln·N][NSEQS][aln·L]; - int *loc[aln·N]; - uint64 vals[aln·N][NSEQS][aln·L]; - uint64 *val[aln·N]; - - rng·init(0); - - seqs[0] = SEQ; - lens[0] = SEQLEN; - - for (n = 0; n < aln·N; n++) { - for (i = 0; i < NSEQS; i++) { - for (l = 0; l < aln·L; l++) { - vals[n][i][l] = 0; - } - } - } - - for (i = 1; i < NSEQS; i++) { - seqs[i] = modify(SEQ, lens + i, .01*i); - } - - for (i = 0; i < NSEQS; i++) { - for (n = 0; n < aln·N; n++) { - val[n] = vals[n][i]; - loc[n] = locs[n][i]; - } - aln·sketch(seqs[i], aln·L, val, loc); - } - - // for (n = 0; n < aln·N; n++) { - // printf("iteration %d\n", n); - // printf("[\n"); - // for (i = 0; i < NSEQS; i++) { - // printf(" ["); - // for (l = 0; l < aln·L; l++) { - // printf("%lu,", vals[n][i][l]); - // } - // printf("],\n"); - // } - // printf("]\n"); - // } - - for (n = 0; n < aln·N; n++) { - aln·sort(NSEQS, aln·L, (uint64*)vals[n]); - } - - return 0; -} diff --git a/sys/libbio/test.c b/sys/libbio/test.c deleted file mode 100644 index 9926764..0000000 --- a/sys/libbio/test.c +++ /dev/null @@ -1,283 +0,0 @@ -#include -#include -#include - -#include - -// ----------------------------------------------------------------------- -// Global data - -static byte *SEQ[] = { -"GGCGGCTTCGGTGCGCTGTGTGCATTGCCGCAAAAATATCGTGAACCCGTGCTGGTTTCCGGCACTGACGGCGTAGGTAC" -"CAAGCTGCGTCTGGCAATGGACTTAAAACGTCACGACACCATTGGTATTGATCTGGTCGCCATGTGCGTTAATGACCTGG" -"TGGTGCAAGGTGCGGAACCGCTGTTTTTCCTCGACTATTACGCAACCGGAAAACTGGATGTTGATACCGCTTCAGCGGTG" -"ATCAGCGGCATTGCGGAAGGTTGTCTGCAATCGGGCTGTTCTCTGGTGGGTGGCGAAACGGCAGAAATGCCGGGGATGTA" -"TCACGGTGAAGATTACGATGTCGCGGGTTTCTGCGTGGGCGTGGTAGAAAAATCAGAAATCATCGACGGCTCTAAAGTCA" -"GCGACGGCGATGTGCTGATTGCACTCGGTTCCAGCGGTCCGCACTCGAACGGTTATTCGCTGGTGCGCAAAATTCTTGAA" -"GTCAGCGGTTGTGATCCGCAAACCACCGAACTTGATGGTAAGCCATTAGCCGATCATCTGCTGGCACCGACCCGCATTTA" -"CGTGAAGTCAGTGCTGGAGTTGATTGAAAAGGTCGATGTGCATGCCATTGCGCACCTGACCGGCGGCGGCTTCTGGGAAA" -"ACATTCCGCGCGTATTGCCAGATAATACCCAGGCAGTGATTGATGAATCTTCCTGGCAGTGGCCGGAAGTGTTCAACTGG" -"CTGCAAACGGCAGGTAACGTTGAGCGCCATGAAATGTATCGCACCTTCAACTGCGGCGTCGGGATGATTATCGCCCTGCC" -"TGCTCCGGAAGTGGACAAAGCCCTCGCCCTGCTCAATGCCAACGGTGAAAACGCGTGGAAAATCGGTATCATCAAAGCCT" -"CTGATTCCGAACAACGCGTGGTTATCGAATAATGAATATTGTGGTGCTTATTTCCGGCAACGGAAGTAATTTACAGGCAA" -"TTATTGACGCCTGTAAAACCAACAAAATTAAAGGCACCGTACGGGCAGTTTTCAGCAATAAGGCCGACGCGTTCGGCCTT" -"GAACGCGCCCGCCAGGCGGGTATTGCAACGCATACGCTCATCGCCAGCGCGTTTGACAGTCGTGAAGCCTATGACCGGGA" -"GTTGATTCATGAAATCGACATGTACGCACCCGATGTGGTCGTGCTGGCTGGTTTTATGCGCATTCTCAGCCCGGCGTTTG" -"TCTCCCACTATGCCGGGCGTTTGCTGAACATTCACCCTTCTCTGCTGCCGAAATATCCCGGATTACACACCCATCGTCAA" -"GCGCTGGAAAATGGCGATGAAGAGCACGGTACATCGGTGCATTTCGTCACCGATGAACTGGACGGTGGCCCGGTTATTTT" -"ACAGGCGAAAGTCCCGGTATTTGCTGGTGATACGGAAGATGACGTCACCGCCCGCGTGCAAACCCAGGAACACGCCATTT" -"ATCCACTGGTGATTAGCTGGTTTGCCGATGGTCGTCTGAAAATGCACGAAAACGCCGCGTGGCTGGATGGTCAACGTCTG" -"CCGCCGCAGGGCTACGCTGCCGACGAGTAATGCCCCCGTAGTTAAAGCGCCAGCTCTGCCGCTGGCGTTTTTCAATTCAC" -"CTGTAAATCGCAAGCTCCAGCAGTTTTTTTCCCCCTTTTCTGGCATAGTTGGACATCTGCCAATATTGCTCGCCATAATA" -"TCCAGGCAGTGTCCCGTGAATAAAACGGAGTAAAAGTGGTAATGGGTCAGGAAAAGCTATACATCGAAAAAGAGCTCAGT" -"TGGTTATCGTTCAATGAACGCGTGCTTCAGGAAGCGGCGGACAAATCTAACCCGCTGATTGAAAGGATGCGTTTCCTGGG" -"GATCTATTCCAATAACCTTGATGAGTTCTATAAAGTCCGCTTCGCTGAACTGAAGCGACGCATCATTATTAGCGAAGAAC" -"AAGGCTCCAACTCTCATTCCCGCCATTTACTGGGCAAAATTCAGTCCCGGGTGCTGAAAGCCGATCAGGAATTCGACGGC" -"CTCTACAACGAGCTATTGCTGGAGATGGCGCGCAACCAGATCTTCCTGATTAATGAACGCCAGCTCTCCGTCAATCAACA" -"AAACTGGCTGCGTCATTATTTTAAGCAGTATCTGCGTCAGCACATTACGCCGATTTTAATCAATCCTGACACTGACTTAG" -"TGCAGTTCCTGAAAGATGATTACACCTATCTGGCGGTGGAAATTATCCGTGGCGATACCATCCGTTACGCGCTTCTGGAG" -"ATCCCATCAGATAAAGTGCCGCGCTTTGTGAATTTACCGCCAGAAGCGCCGCGTCGACGCAAGCCGATGATTCTTCTGGA" -"TAACATTCTGCGTTACTGCCTTGATGATATTTTCAAAGGCTTCTTTGATTATGACGCGCTGAATGCCTATTCAATGAAGA" -"TGACCCGCGATGCCGAATACGATTTAGTGCATGAGATGGAAGCCAGCCTGATGGAGTTGATGTCTTCCAGTCTCAAGCAG" -"CGTTTAACTGCTGAGCCGGTGCGTTTTGTTTATCAGCGCGATATGCCCAATGCGCTGGTTGAAGTTTTACGCGAAAAACT", - -"GGCGGCTTCGGTGCGCTGTGTGCATTGCCGCAAAAATATCGTGAACCCGTGCTGGTTTCCGGCACTGACGGCGTAAATAC" -"CAAGCTGCGTCTGGCAATGGACTTAAAACGTCACGACACCATTGGTATTGATCTGGTCGCCATGTGCGTTAATGACCTGG" -"TGGTGCAAGGTGCGGAACCGCTGTTTTTCCTCGACTATTACGCACCGGAAAACTGGATGTTGATACCGCTTCAGCGGTG" -"ATCAGCGGCATTGCGGAAGGTTGTCTGCAATCGGGCTGTTCTCTGGTGGGTGGCGAAACGGCAGAAATGCCGGGGATGTA" -"TCACGGTGAAGATTACGATGTCGCGGGTTTCTGCGTGGGCGTGGTAGAAAAATCAGAAATCATCGACGGCAAAGTCA" -"GCGACGGCGATGTGCTGATTGCACTCGGTTCCAGCGGTCCGCACTCGAACGGTTATTCGCTGGTGCGCAAAATTCTTGAA" -"GTCAGCGGTTGTGATCCGCAAACCACCGAACTTGATGGTAAGCCATTAGCCGATCATCTGCTGGCACCGACCCGCATTTA" -"ACATTCCGCGCGTATTGCCAGATAATACCCAGGCAGTGATTGATGAATCTTCCTGGCAGTGGCCGGAAGTGTTCAACTGG" -"CTGCAAACGGCAGGTAACGTTGAGCGCCATGAAATGTATCGCACCTTCAACTGCGGCGTCGGGATGATTATCCCCTGCC" -"TGCTCCGGAAGTGGACAAAGCCCTCGCCCTGCTCAATGCCAACGGTGAAAACGCGTGGAAAATCGGTATCATCAAAGCCT" -"CTGATTCCGAACAACGCGTGGTTATCGAATAATGAATATTGTGTGCTTATTTCCGGCAACGGAAGTAATTTACAGGCAA" -"TTATTGACGCCTGTAAAACCAACAAAATTAAAGGCACCGTACGGGCAGTTTTCAGCAATAAGGCCGACGCGCGGCCTT" -"GAACGCGCCCGCCAGGCGGGTATTGCAACGCATACGCTCATCGCCAGCGCGTTTGACAGTCGTGAAGCCTATGACCGGGA" -"GTTGATTCATGAAATCGACATGTACGCACCCGATGTGGTCGTGCTGGCTGGTTTTATGCGCATTCTCAGCCCGGCGTTTG" -"TCTCCCACTATGCCGGGCGTTTGCTGAACATTCACCCTTCTCTGCTGCCGAAATATCCCGGATTACACACCCATCGTCAA" -"GCGCTGGAAAATGGCGATGAAGAGCACGGTACATCGGGCATTTCGTCACCGATGAACTGGACGGTGGCCCGGTTATTTT" -"ACAGTCGAAAGTCCCGGTATTTGCTGGTGATACGGAAGATGACGTCACCGCCCGCGTGCAAACCCAGGAACACGCCATTT" -"ATCCTCTGGTGATTAGCTGGTTTGCCGATGGTCGTCTGAAAATGCACGAAAACGCCGCGTGGCTGGATGGTCAACGTCTG" -"CCGCTGCAGGGCTACGCTGCCGACGAGTAATGCCCCCGTAGTTAAAGCGCCAGCTCTGCCGCTGGCGTTTTTCAATTCAC" -"CTGTTAATCGCAAGCTCCAGCAGCCCCCCCCCCCCTTTTCTGCATAGTTGGACATCTGCCAATATTGCTCGCCATAATA" -"TCCATGCAGTGTCCCGTGAATAAAACGGAGTAAAAGTGGTAATGGGTCAGGAAAAGCTATACATAAAAAGAGCTCAGT" -"TGGTTATCGTTCAATGAACGCGTGCTTCAGGAAGCGGCGGACAAATCTAACCCGCTGATTGAAAGGATGCGTTTCCTGGG" -"GATCTATTCCAATAACCTTGATGAGTTCTATAAAGTCCGCTTCGCTGAACTGAAGCGACGCATTATTAGCGAAGAAC" -"AAGGTTCCAACTCTCATTCCCGCCATTTACTGGGAAAATTCAGTCCCGGGTGCTGAAAGCCGATCAGGAATTCGACGGC" -"CTCTTCAACGAGCTATTGCTGGAGATGGCGCGCAACCAGATCTTCCTGATTAATGAACGCCAGCTCTCCGTCAATCAACA" -"AAACTGGCTGCGTCATTATTTTAAGCAGTATCTGCGTCAGCACATTACGCCGATTTTAATCAATCCTGACACTGACTTAG" -"TGCATTTCCTGAAAGATGATTACACCTATCTGGCGGTGGAAATTATCCGTGGCGATACCATCCGTTACGCGCTTCTGGAG" -"ATCCCATCAGATAAAGTGCCGCGCTTTGTGAATTTACCGCAGAAGCGCCGCGTCGACGCAAGCCGATGATTCTTCTGGA" -"TAACATTCTGCGTTACTGCCTTGATGATATTTTCAAAGGCTTCTTTGATTATGACGCGCTGAATGCCTATTCAATGAAGA" -"TGACCCGCGATGCCGAATACGATTTAGTGCATGAGATGGAAGCCAGCCTGATGGAGTTGATGTCTTCCAGTCTCAAGCAG" -"CGTTTAACTGCTGAGCCGGTGCGTTTTGTTTATCGCGCGATATGCCCAATGCGCTGGTTGAAGTTTTACGCGAAAAACT", -}; - - -static -int -my_read(Stream *s, void *buf, int n) -{ - return io·read(s, 1, n, buf); -} - -// ----------------------------------------------------------------------- -// Point of entry for testing - -error -test·newick() -{ - error err; - bio·Tree t; - mem·Arena *heap; - Stream *fd[2]; - - io·Peeker rdr; - io·Putter wtr; - - bio·Node **end, **it, **list; - - heap = mem·makearena(mem·sys, nil); - rdr = (io·Peeker){.get = (byte (*)(void *))io·getbyte, .unget = (error (*)(void *, byte))io·ungetbyte}; - wtr = (io·Putter){.put = (error (*)(void *, byte))io·putbyte, .putstr = (int (*)(void *, string))io·putstring}; - - fd[0] = io·open("/home/nolln/root/data/test/zika.nwk", "r"); - fd[1] = io·open("/home/nolln/root/data/test/zika.proc.nwk", "w"); - - t.h = heap; - t.heap = (mem·Allocator){ .alloc = (void *(*)(void *, uint, ulong))mem·arenaalloc, .free = nil, }; - - if (err = bio·readnewick(rdr, fd[0], &t), err) { - errorf("failed to read newick"); - return 1; - } - printf("number of children: %d\n", t.root->nchild); - - phylo·ladderize(t.root); - - list = mem·arenaalloc(heap, t.nleaf, sizeof(**list)); - phylo·getleafs(t, list); - for (it = list, end = list + t.nleaf; it != end; ++it) { - printf("Leaf '%s'\n", (*it)->name); - } - - bio·Node *path[100]; - // phylo·diameter(t, path); - - printf("Loaded tree with %d leafs and %d nodes\n", t.nleaf, t.root->nnode); - err = bio·writenewick(t, wtr, fd[1]); - - io·flush(fd[1]); - - io·close(fd[0]); - io·close(fd[1]); - - mem·freearena(heap); - return 0; -} - -error -test·fasta() -{ - error err; - Stream *fd; - - bio·Seq seq; - bio·FastaReader *rdr; - - clock_t t; - - fd = io·open("/home/nolln/root/data/test/zika.fa", "r"); - - /* Benchmark against Heng */ -#if 0 - int n, slen; - kseq_t *kseq; - - t = clock(); - kseq = kseq_init(fd); - while (kseq_read(kseq) >= 0) { - ++n, slen += kseq->seq.l; - } - t = clock() - t; - printf("heng's code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC); - - kseq_destroy(kseq); - - io·seek(fd, 0, seek·set); -#endif - - rdr = bio·openfasta((io·Reader){.read = (int (*)(void *, int, int, void *))io·read}, fd, mem·sys, nil); - - t = clock(); - err = 0; - while (!err) { - err = bio·readfasta(rdr, &seq); - } - t = clock() - t; - printf("nick's code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC); - bio·closefasta(rdr); - - - io·close(fd); - return err <= 0 ? 0 : 1; -} - -#define asrdr(x) (int (*)(void *, int, int, void *))(x) -error -test·fastq() -{ - error err; - Stream *fd; - - bio·Seq seq; - bio·FastqReader *rdr; - - clock_t t; - - fd = io·open("/home/nolln/root/data/test/eg.fq", "r"); - - rdr = bio·openfastq((io·Reader){.read = asrdr(io·read)}, fd, mem·sys, nil); - - t = clock(); - err = 0; - while (!err) { - err = bio·readfastq(rdr, &seq); - } - t = clock() - t; - printf("nick's fastq code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC); - bio·closefastq(rdr); - - - io·close(fd); - return err <= 0 ? 0 : 1; -} - -error -test·align() -{ - double f; - error err; - int i, l, n; - - uint64 mem[aln·N][arrlen(SEQ)][aln·L]; - uint64 *phi[aln·N]; - int loc[aln·N][arrlen(SEQ)][aln·L]; - int *pos[aln·N]; - - for (i = 0; i < arrlen(SEQ); i++) { - for (n = 0; n < aln·N; n++) { - phi[n] = mem[n][i]; - pos[n] = loc[n][i]; - } - - err = aln·sketch(SEQ[i], aln·L, phi, pos); - } - - f = 0; - for (n = 0; n < aln·N; n++) { - aln·sort(arrlen(SEQ), aln·L, (uint64*)mem[n]); - - if (!memcmp(mem[n][0], mem[n][1], sizeof(uint64)*aln·L)) { - f += 1.; - printf("True : "); - } else { - printf("False: "); - } - for (i = 0; i < arrlen(SEQ); i++) { - printf("["); - for (l = 0; l < aln·L; l++) { - printf("%lu,", mem[n][i][l]); - } - printf("]"); - if (i == 0) printf(" ~ "); - } - printf("\n"); - } - - printf("Fraction hits %f\n", f/aln·N); - return err; - -} - -error -main() -{ - error err; - - if (err = test·newick(), err) { - errorf("test fail: newick"); - } - -#if 0 - if (err = test·fasta(), err) { - errorf("test fail: fasta"); - } - - if (err = test·fastq(), err) { - errorf("test fail: fastq"); - } -#endif -} - -- cgit v1.2.1