From 5d3642b8ef920316693031d2ea34b9def0b1abc5 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 22 Apr 2021 08:55:35 -0700 Subject: chore: rm unfinished projects --- sys/libbio/align.c | 24 +-- sys/libbio/io/fasta.c | 360 ------------------------------------------ sys/libbio/io/newick.c | 418 ------------------------------------------------- sys/libbio/io/rules.mk | 21 --- sys/libbio/rules.mk | 13 +- 5 files changed, 17 insertions(+), 819 deletions(-) delete mode 100644 sys/libbio/io/fasta.c delete mode 100644 sys/libbio/io/newick.c delete mode 100644 sys/libbio/io/rules.mk (limited to 'sys/libbio') diff --git a/sys/libbio/align.c b/sys/libbio/align.c index 0f00fce..20a57df 100644 --- a/sys/libbio/align.c +++ b/sys/libbio/align.c @@ -4,13 +4,13 @@ #include // ----------------------------------------------------------------------- -// Temporary globals +// globals uint64 aln·shft = (2ULL * (aln·K - 1ULL)); uint64 aln·mask = (1ULL << (2*aln·K)) - 1ULL; // ----------------------------------------------------------------------- -// Static data +// static data static uint64 nuctab[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -32,7 +32,7 @@ static uint64 nuctab[256] = { }; // ----------------------------------------------------------------------- -// Hash functions +// hash functions enum { @@ -69,7 +69,7 @@ murmurhash(uint64 x, uint64 mask) } // ----------------------------------------------------------------------- -// Locality sensitive hashing (with spatial extent) +// locality sensitive hashing (with spatial extent) static void @@ -102,20 +102,20 @@ aln·sketch(byte *seq, int len, uint64 *vals[aln·N], int *locs[aln·N]) int tmpi[2]; uint64 tmpu[2]; - for (n = 0; n < aln·N; n++) { - for (l = 0; l < len; l++) { + for(n = 0; n < aln·N; n++) { + for(l = 0; l < len; l++) { vals[n][l] = UINT64_MAX; } } kmer = UINT64_MAX; - for (l = 0; *seq != '\0'; seq++, l++) { + for(l = 0; *seq != '\0'; seq++, l++) { kmer = ((kmer << 2) | nuctab[*seq]) & aln·mask; h[0] = minihash(kmer, aln·mask); h[1] = murmurhash(kmer, aln·mask); - for (n = 0; n < aln·N; n++) { + for(n = 0; n < aln·N; n++) { val = vals[n]; loc = locs[n]; @@ -126,14 +126,14 @@ aln·sketch(byte *seq, int len, uint64 *vals[aln·N], int *locs[aln·N]) tmpu[1] = h[2]; tmpi[1] = l; - for (i -= 1; i >= 0; i--) { + for(i -= 1; i >= 0; i--) { tmpu[0] = tmpu[1], tmpu[1] = val[i], val[i] = tmpu[0]; tmpi[0] = tmpi[1], tmpi[1] = loc[i], loc[i] = tmpi[0]; } } } - for (n = 0; n < aln·N; n++) { + for(n = 0; n < aln·N; n++) { sortpos(len, vals[n], locs[n]); } @@ -146,7 +146,7 @@ lessarrs(int len, uint64 a[], uint64 b[]) { int l; - for (l = 0; l < len; l++) { + for(l = 0; l < len; l++) { if (a[l] < b[l]) return 1; if (a[l] > b[l]) return 0; } @@ -161,7 +161,7 @@ swaparrs(int len, uint64 a[], uint64 b[]) int l; uint64 tmp; - for (l = 0; l < len; l++) { + for(l = 0; l < len; l++) { tmp = a[l], a[l] = b[l], b[l] = tmp; } } diff --git a/sys/libbio/io/fasta.c b/sys/libbio/io/fasta.c deleted file mode 100644 index 28c9ca4..0000000 --- a/sys/libbio/io/fasta.c +++ /dev/null @@ -1,360 +0,0 @@ -#include -#include -#include - -#define INIT_NM_SIZE 128 -#define INIT_SQ_SIZE 4096 - -struct Seqbuf -{ - mem·Allocator heap; - void *h; - - int len; - int off; - byte *it; - byte b[]; -}; - -static -void -reset(struct Seqbuf *sb) -{ - sb->off = 0; - sb->it = sb->b; -} - -static -error -grow(struct Seqbuf **sb, int min) -{ - struct Seqbuf *old, *new; - vlong newlen; - - old = *sb; - assert((*sb)->len <= (SIZE_MAX - 1) / 2); - newlen = MAX(16, MAX(1 + 2*(*sb)->len, (*sb)->len+min)); - assert(newlen >= (*sb)->len+min); - - if (new = old->heap.alloc(old->h, 1, sizeof(*new)+newlen), !new) { - errorf("memory: could not allocate new buffer"); - return 1; - } - - memcpy(new, old, sizeof(*sb) + (*sb)->len); - new->len = newlen; - new->it = new->b + (old->len); - old->heap.free(old->h, old); - - *sb = new; - return 0; -} - -static -error -put(struct Seqbuf **sb, byte c) -{ - int err; - struct Seqbuf *sq; - - sq = *sb; - if (sq->it < (sq->b + sq->len)) { - *sq->it++ = c; - return 0; - } - - if (err = grow(sb, 1), err) { - errorf("memory fail: could not allocate more buffer"); - sq->heap.free(sq->h, sq); - return 1; - } - - *((*sb)->it++) = c; - return 0; -} - -static -error -push(struct Seqbuf **sb, int n, void *buf) -{ - int d, err; - struct Seqbuf *seq; - - seq = *sb; - if (d = seq->len - (seq->it - seq->b), d < n) { - assert(d > 0); - if (err = grow(sb, n-d), err) { - errorf("memory fail: could not allocate more buffer"); - seq->heap.free(seq->h, seq); - return 1; - } - } - - memcpy((*sb)->it, buf, n); - (*sb)->it += n; - - return 0; -} - -// ----------------------------------------------------------------------- -// Fasta files - -struct bio·FastaReader { - byte eof; - io·Reader file; - void *f; - - struct Seqbuf *seq; - - /* read buffer */ - byte *b, *bend; - byte buf[4*4098]; -}; - -static -error -fill(bio·FastaReader *rdr) -{ - int n; - // NOTE: This could lead to an infinite loop. - if (rdr->eof) { - return 0; - } - - n = rdr->file.read(rdr->f, 1, arrlen(rdr->buf), rdr->buf); - if (n < 0) { - errorf("read: no data obtained from reader"); - return 1; - } - rdr->b = rdr->buf; - rdr->bend = rdr->b + n; - if (rdr->eof = n < arrlen(rdr->buf), rdr->eof) { - *rdr->bend++ = '\0'; - } - - return 0; -} - -bio·FastaReader* -bio·openfasta(io·Reader file, void *f, mem·Allocator heap, void *h) -{ - error err; - bio·FastaReader *rdr; - - rdr = heap.alloc(h, 1, sizeof(bio·FastaReader)); - rdr->file = file; - rdr->f = f; - rdr->eof = 0; - - rdr->seq = heap.alloc(h, 1, sizeof(*rdr->seq) + INIT_NM_SIZE + INIT_SQ_SIZE); - rdr->seq->heap = heap; - rdr->seq->h = h; - rdr->seq->it = rdr->seq->b; - rdr->seq->len = INIT_NM_SIZE + INIT_SQ_SIZE; - - if (err = fill(rdr), err) { - errorf("fill: could not populate buffer"); - goto ERROR; - } - - return rdr; - -ERROR: - heap.free(h, rdr->seq); - heap.free(h, rdr); - return nil; -} - -static -error -readfasta(bio·FastaReader *rdr, bio·Seq *seq, byte hdr, byte stop) -{ - error err; - byte *beg; - - if (rdr->eof && rdr->b == rdr->bend-1) { - return EOF; - } - - reset(rdr->seq); - - // NOTE: Can this case happen? - assert(rdr->b != rdr->bend); - if (*rdr->b++ != hdr) { - errorf("fasta/q format: expected '%c', found '%c'", hdr, *rdr->b--); - return 1; - } - -NAME: - beg = rdr->b; - while (rdr->b != rdr->bend) { - if (*rdr->b++ == '\n') { - push(&rdr->seq, (rdr->b - 1) - beg, beg); - goto SEQ; - } - } - - push(&rdr->seq, rdr->b - beg, beg); - - if (err = fill(rdr), err) { - errorf("read: could not populate buffer"); - return 1; - } - goto NAME; - -SEQ: - put(&rdr->seq, '\0'); - rdr->seq->off = rdr->seq->it - rdr->seq->b; - -SEQL: - beg = rdr->b; - while (rdr->b != rdr->bend) { - if (*rdr->b == '\n') { - push(&rdr->seq, rdr->b - beg, beg); - beg = rdr->b + 1; - } - - if (*rdr->b == stop || *rdr->b == '\0') { - goto SUCCESS; - } - - rdr->b++; - } - - push(&rdr->seq, rdr->b - beg, beg); - - if (err = fill(rdr), err) { - errorf("read: could not populate buffer"); - return 1; - } - goto SEQL; - -SUCCESS: - push(&rdr->seq, rdr->b - beg, beg); - put(&rdr->seq, '\0'); - - return 0; -} - -error -bio·readfasta(bio·FastaReader *rdr, bio·Seq *seq) -{ - error err; - - err = readfasta(rdr, seq, '>', '>'); - if (err && err != EOF) { - errorf("parse fail: could not read sequence of record"); - return err; - } - - seq->name = rdr->seq->b; - seq->s = rdr->seq->b + rdr->seq->off; - seq->len = rdr->seq->it - seq->s; - seq->q = nil; - - return err; -} - -error -bio·closefasta(bio·FastaReader *rdr) -{ - mem·Allocator heap; - void *h; - - heap = rdr->seq->heap; - h = rdr->seq->h; - - heap.free(h, rdr->seq); - heap.free(h, rdr); - - return 0; -} - -// ----------------------------------------------------------------------- -// Fastq files - -struct bio·FastqReader { - struct bio·FastaReader; -}; - -bio·FastqReader* -bio·openfastq(io·Reader file, void *f, mem·Allocator heap, void *h) -{ - return (bio·FastqReader*)bio·openfasta(file, f, heap, h); -} - -error -bio·closefastq(bio·FastqReader *rdr) -{ - return bio·closefasta((bio·FastaReader*)rdr); -} - -error -bio·readfastq(bio·FastqReader *rdr, bio·Seq *seq) -{ - int n; - byte *beg; - error err; - - err = readfasta((bio·FastaReader*)rdr, seq, '@', '+'); - if (err) { - errorf("parse fail: could not read sequence of record"); - return err; - } - - seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off); - - if (*rdr->b++ != '+') { - errorf("format error: no '+' character seperator found"); - return -1; - } - -EATLN: - while (rdr->b != rdr->bend) { - if (*rdr->b++ == '\n') { - n = 0; - goto QUAL; - } - } - - if (err = fill((bio·FastaReader*)rdr), err) { - errorf("read: could not populate buffer"); - return 1; - } - goto EATLN; - -QUAL: - beg = rdr->b; - while (rdr->b != rdr->bend) { - if (*rdr->b == '\n') { - push(&rdr->seq, rdr->b - beg, beg); - beg = rdr->b + 1; - } - - if (n++ == seq->len || *rdr->b == '\0') { - err = *rdr->b == '\0' ? EOF : 0; - goto SUCCESS; - } - - rdr->b++; - } - - push(&rdr->seq, rdr->b - beg, beg); - - if (err = fill((bio·FastaReader*)rdr), err) { - errorf("read: could not populate buffer"); - return 1; - } - goto QUAL; - - -SUCCESS: - push(&rdr->seq, rdr->b - beg, beg); - put(&rdr->seq, '\0'); - - seq->name = rdr->seq->b; - seq->s = rdr->seq->b + rdr->seq->off; - seq->q = seq->s + seq->len + 1; - - return err; -} diff --git a/sys/libbio/io/newick.c b/sys/libbio/io/newick.c deleted file mode 100644 index 164516f..0000000 --- a/sys/libbio/io/newick.c +++ /dev/null @@ -1,418 +0,0 @@ -#include -#include -#include - -// ----------------------------------------------------------------------- -// Tokens - -enum TokenKind -{ - tok·nil, - tok·eof, - tok·space, - tok·ident, - tok·number, - tok·lparen, - tok·rparen, - tok·lbrak, - tok·rbrak, - tok·comma, - tok·semi, - tok·colon, - - NUM_TOKENS, -}; - - -struct Token { - enum TokenKind kind; - union - { - byte *s; - double x; - } lit; -}; - -static -byte* -tokstr(struct Token tok) -{ - static byte b[50]; - switch (tok.kind) { - case tok·nil: return ""; - case tok·eof: return nil; - case tok·space: return " "; - case tok·ident: return tok.lit.s; - case tok·lparen: return "("; - case tok·rparen: return ")"; - case tok·lbrak: return "["; - case tok·rbrak: return "]"; - case tok·comma: return ","; - case tok·semi: return ";"; - case tok·colon: return ":"; - case tok·number: - snprintf(b, arrlen(b), "%f", tok.lit.x); - return b; - default: - return nil; - } -} - - -// ----------------------------------------------------------------------- -// Read - -// TODO: Bounds checking on buffer -static -struct Token -lex(io·Peeker s, void* fp) -{ -#define isvalidchar(C) ((C) == '!') || \ - ('\"' < (C) && (C) < '\'') || \ - (')' < (C) && (C) < '+') || \ - (',' < (C) && (C) < ':') || \ - (':' < (C) && (C) < '[') || \ - ((C) == '\\') || \ - (']' < (C) && (C) <= '~') - byte *c; - struct Token tok; - static byte b[1024]; - c = b; - *c = s.get(fp); - - if (isspace(*c)) { - while (isspace(*c)) { - *(++c) = s.get(fp); - } - - s.unget(fp, *c); - assert(c - b < 1024); - - *c = 0; - tok.kind = tok·space; - tok.lit.s = b; - return tok; - } - - switch (*c) { - case EOF: tok.kind = tok·eof; return tok; - case '(': tok.kind = tok·lparen; return tok; - case ')': tok.kind = tok·rparen; return tok; - case '[': tok.kind = tok·lbrak; return tok; - case ']': tok.kind = tok·rbrak; return tok; - case ',': tok.kind = tok·comma; return tok; - case ';': tok.kind = tok·semi; return tok; - case ':': tok.kind = tok·colon; return tok; - - case '.': - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - while (isdigit(*c)) { - NUM: *(++c) = s.get(fp); - } - if (*c == '.') goto NUM; - if (isvalidchar(*c)) goto IDENT; - - s.unget(fp, *c); - assert(c - b < 1024); - - *c = 0; - tok.kind = tok·number; - tok.lit.x = atof(b); - return tok; - - case '\"': - while ((*c) != '\"') { - *(++c) = s.get(fp); - } - assert(c - b < 1024); - - *c = '\0'; - tok.kind = tok·ident; - tok.lit.s = b + 1; - return tok; - - default: - IDENT: - while (isvalidchar(*c)) { - *(++c) = s.get(fp); - } - s.unget(fp, *c); - assert(c - b < 1024); - - *c = '\0'; - tok.kind = tok·ident; - tok.lit.s = b; - return tok; - } -#undef isvalidchar -} - -static -struct Token -lex_nospace(io·Peeker s, void *impl) -{ - struct Token tok; - tok = lex(s, impl); - if (tok.kind == tok·space) { - tok = lex_nospace(s, impl); - } - - return tok; -} - -struct Parser -{ - int lev; - bio·Node *root; - struct Token tok; - - void *fimpl; - io·Peeker file; - void *himpl; - mem·Allocator heap; -}; - -static -error -parse(struct Parser *p) -{ - error err; - bio·Node *node; - bio·Node *root; - struct Token tok; - - node = p->root; - for (;;) { - tok = lex_nospace(p->file, p->fimpl); - - switch (tok.kind) { - case tok·lparen: - if (!p->root && p->lev > 0) { - errorf("parse format: attempted to make root at non-zero level"); - goto ERROR; - } - - node = p->heap.alloc(p->himpl, 1, sizeof(*node)); - memset(node, 0, sizeof(*node)); - - if (p->root) { - phylo·addchild(p->root, node); - root = p->root; - } else { - root = node; - } - - p->lev++; - p->root = node; - p->tok = tok; - err = parse(p); - if (err) { - goto ERROR; - } - if (p->tok.kind != tok·rparen) { - errorf("incorrect format: closing parentheses expected to proceed opening"); - goto ERROR; - } - p->root = root; - // NOTE(nnoll): We don't want to override the state of p->tok here! - // Jump straight to grabbing next token. - continue; - - case tok·rparen: - p->lev--; - goto DONE; - - /* Comments */ - case tok·lbrak: - if (!node) { - errorf("incorrect format: comment found in disallowed region"); - goto ERROR; - } - node->comment = str·make(""); - while (tok.kind != tok·rbrak) { - tok = lex_nospace(p->file, p->fimpl); - if (tok.kind == tok·eof || tok.kind == tok·nil) { - errorf("incorrect format: unmatched comment bracket '['"); - goto ERROR; - } - str·append(&node->comment, tokstr(tok)); - } - break; - - case tok·rbrak: - errorf("incorrect format: end comment token found in disallowed region"); - goto ERROR; - break; - - case tok·colon: - tok = lex_nospace(p->file, p->fimpl); - if (tok.kind != tok·number) { - errorf("incorrect format: expected number after colon"); - goto ERROR; - } - if (node == nil) { - errorf("parse error: attempting to set distance of nil node"); - goto ERROR; - } - node->dist = tok.lit.x; - break; - - case tok·comma: - node = nil; - break; - - case tok·ident: - if (p->tok.kind == tok·rparen) { - if (!node) { - errorf("parse error: attempting to set name of nil node"); - goto ERROR; - } - node->name = str·make(tok.lit.s); - } else { - if (p->tok.kind != tok·lparen && p->tok.kind != tok·comma) { - errorf("format error: misplaced identifier for leaf found"); - goto ERROR; - } - - if (!p->root) { - errorf("parse error: attempting to create child for no parent"); - goto ERROR; - } - - node = p->heap.alloc(p->himpl, 1, sizeof(*node)); - memset(node, 0, sizeof(*node)); - node->name = str·make(tok.lit.s); - - phylo·addchild(p->root, node); - } - break; - - case tok·number: - if (p->tok.kind == tok·rparen) { - if (p->lev == 0) { - errorf("format error: support value on root not supported"); - goto ERROR; - } - node->support = tok.lit.x; - } else { - errorf("format error: found number in unexpected location"); - goto ERROR; - } - break; - - case tok·semi: - p->file.unget(p->fimpl, ';'); - if (p->lev) { - errorf("format error: uneven number of parentheses found at ';'"); - goto ERROR; - } - goto DONE; - - case tok·eof: - goto DONE; - - default: - errorf("parse error: unrecognized token"); - goto ERROR; - } - - p->tok = tok; - } - -DONE: - p->tok = tok; - return 0; -ERROR: - // TODO(nnoll): cleanup - return 1; -} - -error -bio·readnewick(io·Peeker stream, void *s, bio·Tree *tree) -{ - error err; - struct Parser p; - enum - { - error·nil, - error·notree, - error·parse, - }; - - if (!tree) { - return error·notree; - } - - p = (struct Parser){ - .lev = 0, - .root = nil, - .tok = (struct Token){ 0 }, - .fimpl = s, - .file = stream, - .himpl = tree->h, - .heap = tree->heap, - }; - err = parse(&p); - if (err) { - errorf("parsing failed\n"); - return error·parse; - } - - tree->root = p.root; - tree->nleaf = 0; - tree->root->nnode = 0; - - phylo·countleafs(tree->root, &tree->nleaf); - phylo·countnodes(tree->root, &tree->root->nnode); - - return error·nil; -} - -// ----------------------------------------------------------------------- -// Write - -static -error -dump(bio·Node *node, void *impl, io·Putter out) -{ - byte b[24]; - - if (!node) { - return 1; - } - - bio·Node *child; - if (node->nchild) { - out.put(impl, '('); - - dump(node->child, impl, out); - for (child = node->child->sibling; child != nil; child = child->sibling) { - out.put(impl, ','); - dump(child, impl, out); - } - - out.put(impl, ')'); - } - if (node->name) { - out.putstr(impl, node->name); - } - - if (node->parent) { - out.put(impl, ':'); - snprintf(b, arrlen(b), "%f", node->dist); - out.putstr(impl, b); - } - - return 0; -} - -error -bio·writenewick(bio·Tree tree, io·Putter out, void* impl) -{ - dump(tree.root, impl, out); - out.put(impl, ';'); - out.put(impl, '\n'); - - return 0; -} diff --git a/sys/libbio/io/rules.mk b/sys/libbio/io/rules.mk deleted file mode 100644 index a4277fc..0000000 --- a/sys/libbio/io/rules.mk +++ /dev/null @@ -1,21 +0,0 @@ -include share/push.mk - -# Iterate through subdirectory tree - -# Local sources -SRCS_$(d) := $(wildcard $(d)/*.c) -LIBS_$(d) := -BINS_$(d) := -TSTS_$(d) := - -include share/paths.mk - -$(LIBS_$(d)): $(OBJS_$(d)) - $(ARCHIVE) - -# $(BINS_$(d)): TCLIBS := $(OBJ_DIR)/libn/libn.a -$(BINS_$(d)): $(OBJS_$(d)) - $(LINK) - -# ---- Pop off stack ---- -include share/pop.mk diff --git a/sys/libbio/rules.mk b/sys/libbio/rules.mk index 6a029b1..cbc6887 100644 --- a/sys/libbio/rules.mk +++ b/sys/libbio/rules.mk @@ -1,18 +1,15 @@ include share/push.mk -# Iterate through subdirectory tree -DIR := $(d)/io -include $(DIR)/rules.mk - # Local sources SRCS_$(d) := \ - $(d)/align.c \ + $(d)/fasta.c \ + $(d)/newick.c \ $(d)/phylo.c LIBS_$(d) := $(d)/libbio.a BINS_$(d) := -TSTS_$(d) := \ - $(d)/test.c \ - $(d)/simulate.c +# TSTS_$(d) := \ +# $(d)/test.c \ +# $(d)/simulate.c include share/paths.mk -- cgit v1.2.1