aboutsummaryrefslogtreecommitdiff
path: root/sys/libbio
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2021-04-22 08:55:35 -0700
committerNicholas Noll <nbnoll@eml.cc>2021-04-22 08:55:35 -0700
commit5d3642b8ef920316693031d2ea34b9def0b1abc5 (patch)
tree8100890ed5b2e4ecdbde09615e0820346ccc3f41 /sys/libbio
parente30f8b22069bec1a3fb68f089a9a7198671eb09a (diff)
chore: rm unfinished projects
Diffstat (limited to 'sys/libbio')
-rw-r--r--sys/libbio/align.c24
-rw-r--r--sys/libbio/io/fasta.c360
-rw-r--r--sys/libbio/io/newick.c418
-rw-r--r--sys/libbio/io/rules.mk21
-rw-r--r--sys/libbio/rules.mk13
5 files changed, 17 insertions, 819 deletions
diff --git a/sys/libbio/align.c b/sys/libbio/align.c
index 0f00fce..20a57df 100644
--- a/sys/libbio/align.c
+++ b/sys/libbio/align.c
@@ -4,13 +4,13 @@
#include <libbio.h>
// -----------------------------------------------------------------------
-// Temporary globals
+// globals
uint64 aln·shft = (2ULL * (aln·K - 1ULL));
uint64 aln·mask = (1ULL << (2*aln·K)) - 1ULL;
// -----------------------------------------------------------------------
-// Static data
+// static data
static uint64 nuctab[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
@@ -32,7 +32,7 @@ static uint64 nuctab[256] = {
};
// -----------------------------------------------------------------------
-// Hash functions
+// hash functions
enum
{
@@ -69,7 +69,7 @@ murmurhash(uint64 x, uint64 mask)
}
// -----------------------------------------------------------------------
-// Locality sensitive hashing (with spatial extent)
+// locality sensitive hashing (with spatial extent)
static
void
@@ -102,20 +102,20 @@ aln·sketch(byte *seq, int len, uint64 *vals[aln·N], int *locs[aln·N])
int tmpi[2];
uint64 tmpu[2];
- for (n = 0; n < aln·N; n++) {
- for (l = 0; l < len; l++) {
+ for(n = 0; n < aln·N; n++) {
+ for(l = 0; l < len; l++) {
vals[n][l] = UINT64_MAX;
}
}
kmer = UINT64_MAX;
- for (l = 0; *seq != '\0'; seq++, l++) {
+ for(l = 0; *seq != '\0'; seq++, l++) {
kmer = ((kmer << 2) | nuctab[*seq]) & aln·mask;
h[0] = minihash(kmer, aln·mask);
h[1] = murmurhash(kmer, aln·mask);
- for (n = 0; n < aln·N; n++) {
+ for(n = 0; n < aln·N; n++) {
val = vals[n];
loc = locs[n];
@@ -126,14 +126,14 @@ aln·sketch(byte *seq, int len, uint64 *vals[aln·N], int *locs[aln·N])
tmpu[1] = h[2];
tmpi[1] = l;
- for (i -= 1; i >= 0; i--) {
+ for(i -= 1; i >= 0; i--) {
tmpu[0] = tmpu[1], tmpu[1] = val[i], val[i] = tmpu[0];
tmpi[0] = tmpi[1], tmpi[1] = loc[i], loc[i] = tmpi[0];
}
}
}
- for (n = 0; n < aln·N; n++) {
+ for(n = 0; n < aln·N; n++) {
sortpos(len, vals[n], locs[n]);
}
@@ -146,7 +146,7 @@ lessarrs(int len, uint64 a[], uint64 b[])
{
int l;
- for (l = 0; l < len; l++) {
+ for(l = 0; l < len; l++) {
if (a[l] < b[l]) return 1;
if (a[l] > b[l]) return 0;
}
@@ -161,7 +161,7 @@ swaparrs(int len, uint64 a[], uint64 b[])
int l;
uint64 tmp;
- for (l = 0; l < len; l++) {
+ for(l = 0; l < len; l++) {
tmp = a[l], a[l] = b[l], b[l] = tmp;
}
}
diff --git a/sys/libbio/io/fasta.c b/sys/libbio/io/fasta.c
deleted file mode 100644
index 28c9ca4..0000000
--- a/sys/libbio/io/fasta.c
+++ /dev/null
@@ -1,360 +0,0 @@
-#include <u.h>
-#include <libn.h>
-#include <libbio.h>
-
-#define INIT_NM_SIZE 128
-#define INIT_SQ_SIZE 4096
-
-struct Seqbuf
-{
- mem·Allocator heap;
- void *h;
-
- int len;
- int off;
- byte *it;
- byte b[];
-};
-
-static
-void
-reset(struct Seqbuf *sb)
-{
- sb->off = 0;
- sb->it = sb->b;
-}
-
-static
-error
-grow(struct Seqbuf **sb, int min)
-{
- struct Seqbuf *old, *new;
- vlong newlen;
-
- old = *sb;
- assert((*sb)->len <= (SIZE_MAX - 1) / 2);
- newlen = MAX(16, MAX(1 + 2*(*sb)->len, (*sb)->len+min));
- assert(newlen >= (*sb)->len+min);
-
- if (new = old->heap.alloc(old->h, 1, sizeof(*new)+newlen), !new) {
- errorf("memory: could not allocate new buffer");
- return 1;
- }
-
- memcpy(new, old, sizeof(*sb) + (*sb)->len);
- new->len = newlen;
- new->it = new->b + (old->len);
- old->heap.free(old->h, old);
-
- *sb = new;
- return 0;
-}
-
-static
-error
-put(struct Seqbuf **sb, byte c)
-{
- int err;
- struct Seqbuf *sq;
-
- sq = *sb;
- if (sq->it < (sq->b + sq->len)) {
- *sq->it++ = c;
- return 0;
- }
-
- if (err = grow(sb, 1), err) {
- errorf("memory fail: could not allocate more buffer");
- sq->heap.free(sq->h, sq);
- return 1;
- }
-
- *((*sb)->it++) = c;
- return 0;
-}
-
-static
-error
-push(struct Seqbuf **sb, int n, void *buf)
-{
- int d, err;
- struct Seqbuf *seq;
-
- seq = *sb;
- if (d = seq->len - (seq->it - seq->b), d < n) {
- assert(d > 0);
- if (err = grow(sb, n-d), err) {
- errorf("memory fail: could not allocate more buffer");
- seq->heap.free(seq->h, seq);
- return 1;
- }
- }
-
- memcpy((*sb)->it, buf, n);
- (*sb)->it += n;
-
- return 0;
-}
-
-// -----------------------------------------------------------------------
-// Fasta files
-
-struct bio·FastaReader {
- byte eof;
- io·Reader file;
- void *f;
-
- struct Seqbuf *seq;
-
- /* read buffer */
- byte *b, *bend;
- byte buf[4*4098];
-};
-
-static
-error
-fill(bio·FastaReader *rdr)
-{
- int n;
- // NOTE: This could lead to an infinite loop.
- if (rdr->eof) {
- return 0;
- }
-
- n = rdr->file.read(rdr->f, 1, arrlen(rdr->buf), rdr->buf);
- if (n < 0) {
- errorf("read: no data obtained from reader");
- return 1;
- }
- rdr->b = rdr->buf;
- rdr->bend = rdr->b + n;
- if (rdr->eof = n < arrlen(rdr->buf), rdr->eof) {
- *rdr->bend++ = '\0';
- }
-
- return 0;
-}
-
-bio·FastaReader*
-bio·openfasta(io·Reader file, void *f, mem·Allocator heap, void *h)
-{
- error err;
- bio·FastaReader *rdr;
-
- rdr = heap.alloc(h, 1, sizeof(bio·FastaReader));
- rdr->file = file;
- rdr->f = f;
- rdr->eof = 0;
-
- rdr->seq = heap.alloc(h, 1, sizeof(*rdr->seq) + INIT_NM_SIZE + INIT_SQ_SIZE);
- rdr->seq->heap = heap;
- rdr->seq->h = h;
- rdr->seq->it = rdr->seq->b;
- rdr->seq->len = INIT_NM_SIZE + INIT_SQ_SIZE;
-
- if (err = fill(rdr), err) {
- errorf("fill: could not populate buffer");
- goto ERROR;
- }
-
- return rdr;
-
-ERROR:
- heap.free(h, rdr->seq);
- heap.free(h, rdr);
- return nil;
-}
-
-static
-error
-readfasta(bio·FastaReader *rdr, bio·Seq *seq, byte hdr, byte stop)
-{
- error err;
- byte *beg;
-
- if (rdr->eof && rdr->b == rdr->bend-1) {
- return EOF;
- }
-
- reset(rdr->seq);
-
- // NOTE: Can this case happen?
- assert(rdr->b != rdr->bend);
- if (*rdr->b++ != hdr) {
- errorf("fasta/q format: expected '%c', found '%c'", hdr, *rdr->b--);
- return 1;
- }
-
-NAME:
- beg = rdr->b;
- while (rdr->b != rdr->bend) {
- if (*rdr->b++ == '\n') {
- push(&rdr->seq, (rdr->b - 1) - beg, beg);
- goto SEQ;
- }
- }
-
- push(&rdr->seq, rdr->b - beg, beg);
-
- if (err = fill(rdr), err) {
- errorf("read: could not populate buffer");
- return 1;
- }
- goto NAME;
-
-SEQ:
- put(&rdr->seq, '\0');
- rdr->seq->off = rdr->seq->it - rdr->seq->b;
-
-SEQL:
- beg = rdr->b;
- while (rdr->b != rdr->bend) {
- if (*rdr->b == '\n') {
- push(&rdr->seq, rdr->b - beg, beg);
- beg = rdr->b + 1;
- }
-
- if (*rdr->b == stop || *rdr->b == '\0') {
- goto SUCCESS;
- }
-
- rdr->b++;
- }
-
- push(&rdr->seq, rdr->b - beg, beg);
-
- if (err = fill(rdr), err) {
- errorf("read: could not populate buffer");
- return 1;
- }
- goto SEQL;
-
-SUCCESS:
- push(&rdr->seq, rdr->b - beg, beg);
- put(&rdr->seq, '\0');
-
- return 0;
-}
-
-error
-bio·readfasta(bio·FastaReader *rdr, bio·Seq *seq)
-{
- error err;
-
- err = readfasta(rdr, seq, '>', '>');
- if (err && err != EOF) {
- errorf("parse fail: could not read sequence of record");
- return err;
- }
-
- seq->name = rdr->seq->b;
- seq->s = rdr->seq->b + rdr->seq->off;
- seq->len = rdr->seq->it - seq->s;
- seq->q = nil;
-
- return err;
-}
-
-error
-bio·closefasta(bio·FastaReader *rdr)
-{
- mem·Allocator heap;
- void *h;
-
- heap = rdr->seq->heap;
- h = rdr->seq->h;
-
- heap.free(h, rdr->seq);
- heap.free(h, rdr);
-
- return 0;
-}
-
-// -----------------------------------------------------------------------
-// Fastq files
-
-struct bio·FastqReader {
- struct bio·FastaReader;
-};
-
-bio·FastqReader*
-bio·openfastq(io·Reader file, void *f, mem·Allocator heap, void *h)
-{
- return (bio·FastqReader*)bio·openfasta(file, f, heap, h);
-}
-
-error
-bio·closefastq(bio·FastqReader *rdr)
-{
- return bio·closefasta((bio·FastaReader*)rdr);
-}
-
-error
-bio·readfastq(bio·FastqReader *rdr, bio·Seq *seq)
-{
- int n;
- byte *beg;
- error err;
-
- err = readfasta((bio·FastaReader*)rdr, seq, '@', '+');
- if (err) {
- errorf("parse fail: could not read sequence of record");
- return err;
- }
-
- seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off);
-
- if (*rdr->b++ != '+') {
- errorf("format error: no '+' character seperator found");
- return -1;
- }
-
-EATLN:
- while (rdr->b != rdr->bend) {
- if (*rdr->b++ == '\n') {
- n = 0;
- goto QUAL;
- }
- }
-
- if (err = fill((bio·FastaReader*)rdr), err) {
- errorf("read: could not populate buffer");
- return 1;
- }
- goto EATLN;
-
-QUAL:
- beg = rdr->b;
- while (rdr->b != rdr->bend) {
- if (*rdr->b == '\n') {
- push(&rdr->seq, rdr->b - beg, beg);
- beg = rdr->b + 1;
- }
-
- if (n++ == seq->len || *rdr->b == '\0') {
- err = *rdr->b == '\0' ? EOF : 0;
- goto SUCCESS;
- }
-
- rdr->b++;
- }
-
- push(&rdr->seq, rdr->b - beg, beg);
-
- if (err = fill((bio·FastaReader*)rdr), err) {
- errorf("read: could not populate buffer");
- return 1;
- }
- goto QUAL;
-
-
-SUCCESS:
- push(&rdr->seq, rdr->b - beg, beg);
- put(&rdr->seq, '\0');
-
- seq->name = rdr->seq->b;
- seq->s = rdr->seq->b + rdr->seq->off;
- seq->q = seq->s + seq->len + 1;
-
- return err;
-}
diff --git a/sys/libbio/io/newick.c b/sys/libbio/io/newick.c
deleted file mode 100644
index 164516f..0000000
--- a/sys/libbio/io/newick.c
+++ /dev/null
@@ -1,418 +0,0 @@
-#include <u.h>
-#include <libn.h>
-#include <libbio.h>
-
-// -----------------------------------------------------------------------
-// Tokens
-
-enum TokenKind
-{
- tok·nil,
- tok·eof,
- tok·space,
- tok·ident,
- tok·number,
- tok·lparen,
- tok·rparen,
- tok·lbrak,
- tok·rbrak,
- tok·comma,
- tok·semi,
- tok·colon,
-
- NUM_TOKENS,
-};
-
-
-struct Token {
- enum TokenKind kind;
- union
- {
- byte *s;
- double x;
- } lit;
-};
-
-static
-byte*
-tokstr(struct Token tok)
-{
- static byte b[50];
- switch (tok.kind) {
- case tok·nil: return "";
- case tok·eof: return nil;
- case tok·space: return " ";
- case tok·ident: return tok.lit.s;
- case tok·lparen: return "(";
- case tok·rparen: return ")";
- case tok·lbrak: return "[";
- case tok·rbrak: return "]";
- case tok·comma: return ",";
- case tok·semi: return ";";
- case tok·colon: return ":";
- case tok·number:
- snprintf(b, arrlen(b), "%f", tok.lit.x);
- return b;
- default:
- return nil;
- }
-}
-
-
-// -----------------------------------------------------------------------
-// Read
-
-// TODO: Bounds checking on buffer
-static
-struct Token
-lex(io·Peeker s, void* fp)
-{
-#define isvalidchar(C) ((C) == '!') || \
- ('\"' < (C) && (C) < '\'') || \
- (')' < (C) && (C) < '+') || \
- (',' < (C) && (C) < ':') || \
- (':' < (C) && (C) < '[') || \
- ((C) == '\\') || \
- (']' < (C) && (C) <= '~')
- byte *c;
- struct Token tok;
- static byte b[1024];
- c = b;
- *c = s.get(fp);
-
- if (isspace(*c)) {
- while (isspace(*c)) {
- *(++c) = s.get(fp);
- }
-
- s.unget(fp, *c);
- assert(c - b < 1024);
-
- *c = 0;
- tok.kind = tok·space;
- tok.lit.s = b;
- return tok;
- }
-
- switch (*c) {
- case EOF: tok.kind = tok·eof; return tok;
- case '(': tok.kind = tok·lparen; return tok;
- case ')': tok.kind = tok·rparen; return tok;
- case '[': tok.kind = tok·lbrak; return tok;
- case ']': tok.kind = tok·rbrak; return tok;
- case ',': tok.kind = tok·comma; return tok;
- case ';': tok.kind = tok·semi; return tok;
- case ':': tok.kind = tok·colon; return tok;
-
- case '.':
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- while (isdigit(*c)) {
- NUM: *(++c) = s.get(fp);
- }
- if (*c == '.') goto NUM;
- if (isvalidchar(*c)) goto IDENT;
-
- s.unget(fp, *c);
- assert(c - b < 1024);
-
- *c = 0;
- tok.kind = tok·number;
- tok.lit.x = atof(b);
- return tok;
-
- case '\"':
- while ((*c) != '\"') {
- *(++c) = s.get(fp);
- }
- assert(c - b < 1024);
-
- *c = '\0';
- tok.kind = tok·ident;
- tok.lit.s = b + 1;
- return tok;
-
- default:
- IDENT:
- while (isvalidchar(*c)) {
- *(++c) = s.get(fp);
- }
- s.unget(fp, *c);
- assert(c - b < 1024);
-
- *c = '\0';
- tok.kind = tok·ident;
- tok.lit.s = b;
- return tok;
- }
-#undef isvalidchar
-}
-
-static
-struct Token
-lex_nospace(io·Peeker s, void *impl)
-{
- struct Token tok;
- tok = lex(s, impl);
- if (tok.kind == tok·space) {
- tok = lex_nospace(s, impl);
- }
-
- return tok;
-}
-
-struct Parser
-{
- int lev;
- bio·Node *root;
- struct Token tok;
-
- void *fimpl;
- io·Peeker file;
- void *himpl;
- mem·Allocator heap;
-};
-
-static
-error
-parse(struct Parser *p)
-{
- error err;
- bio·Node *node;
- bio·Node *root;
- struct Token tok;
-
- node = p->root;
- for (;;) {
- tok = lex_nospace(p->file, p->fimpl);
-
- switch (tok.kind) {
- case tok·lparen:
- if (!p->root && p->lev > 0) {
- errorf("parse format: attempted to make root at non-zero level");
- goto ERROR;
- }
-
- node = p->heap.alloc(p->himpl, 1, sizeof(*node));
- memset(node, 0, sizeof(*node));
-
- if (p->root) {
- phylo·addchild(p->root, node);
- root = p->root;
- } else {
- root = node;
- }
-
- p->lev++;
- p->root = node;
- p->tok = tok;
- err = parse(p);
- if (err) {
- goto ERROR;
- }
- if (p->tok.kind != tok·rparen) {
- errorf("incorrect format: closing parentheses expected to proceed opening");
- goto ERROR;
- }
- p->root = root;
- // NOTE(nnoll): We don't want to override the state of p->tok here!
- // Jump straight to grabbing next token.
- continue;
-
- case tok·rparen:
- p->lev--;
- goto DONE;
-
- /* Comments */
- case tok·lbrak:
- if (!node) {
- errorf("incorrect format: comment found in disallowed region");
- goto ERROR;
- }
- node->comment = str·make("");
- while (tok.kind != tok·rbrak) {
- tok = lex_nospace(p->file, p->fimpl);
- if (tok.kind == tok·eof || tok.kind == tok·nil) {
- errorf("incorrect format: unmatched comment bracket '['");
- goto ERROR;
- }
- str·append(&node->comment, tokstr(tok));
- }
- break;
-
- case tok·rbrak:
- errorf("incorrect format: end comment token found in disallowed region");
- goto ERROR;
- break;
-
- case tok·colon:
- tok = lex_nospace(p->file, p->fimpl);
- if (tok.kind != tok·number) {
- errorf("incorrect format: expected number after colon");
- goto ERROR;
- }
- if (node == nil) {
- errorf("parse error: attempting to set distance of nil node");
- goto ERROR;
- }
- node->dist = tok.lit.x;
- break;
-
- case tok·comma:
- node = nil;
- break;
-
- case tok·ident:
- if (p->tok.kind == tok·rparen) {
- if (!node) {
- errorf("parse error: attempting to set name of nil node");
- goto ERROR;
- }
- node->name = str·make(tok.lit.s);
- } else {
- if (p->tok.kind != tok·lparen && p->tok.kind != tok·comma) {
- errorf("format error: misplaced identifier for leaf found");
- goto ERROR;
- }
-
- if (!p->root) {
- errorf("parse error: attempting to create child for no parent");
- goto ERROR;
- }
-
- node = p->heap.alloc(p->himpl, 1, sizeof(*node));
- memset(node, 0, sizeof(*node));
- node->name = str·make(tok.lit.s);
-
- phylo·addchild(p->root, node);
- }
- break;
-
- case tok·number:
- if (p->tok.kind == tok·rparen) {
- if (p->lev == 0) {
- errorf("format error: support value on root not supported");
- goto ERROR;
- }
- node->support = tok.lit.x;
- } else {
- errorf("format error: found number in unexpected location");
- goto ERROR;
- }
- break;
-
- case tok·semi:
- p->file.unget(p->fimpl, ';');
- if (p->lev) {
- errorf("format error: uneven number of parentheses found at ';'");
- goto ERROR;
- }
- goto DONE;
-
- case tok·eof:
- goto DONE;
-
- default:
- errorf("parse error: unrecognized token");
- goto ERROR;
- }
-
- p->tok = tok;
- }
-
-DONE:
- p->tok = tok;
- return 0;
-ERROR:
- // TODO(nnoll): cleanup
- return 1;
-}
-
-error
-bio·readnewick(io·Peeker stream, void *s, bio·Tree *tree)
-{
- error err;
- struct Parser p;
- enum
- {
- error·nil,
- error·notree,
- error·parse,
- };
-
- if (!tree) {
- return error·notree;
- }
-
- p = (struct Parser){
- .lev = 0,
- .root = nil,
- .tok = (struct Token){ 0 },
- .fimpl = s,
- .file = stream,
- .himpl = tree->h,
- .heap = tree->heap,
- };
- err = parse(&p);
- if (err) {
- errorf("parsing failed\n");
- return error·parse;
- }
-
- tree->root = p.root;
- tree->nleaf = 0;
- tree->root->nnode = 0;
-
- phylo·countleafs(tree->root, &tree->nleaf);
- phylo·countnodes(tree->root, &tree->root->nnode);
-
- return error·nil;
-}
-
-// -----------------------------------------------------------------------
-// Write
-
-static
-error
-dump(bio·Node *node, void *impl, io·Putter out)
-{
- byte b[24];
-
- if (!node) {
- return 1;
- }
-
- bio·Node *child;
- if (node->nchild) {
- out.put(impl, '(');
-
- dump(node->child, impl, out);
- for (child = node->child->sibling; child != nil; child = child->sibling) {
- out.put(impl, ',');
- dump(child, impl, out);
- }
-
- out.put(impl, ')');
- }
- if (node->name) {
- out.putstr(impl, node->name);
- }
-
- if (node->parent) {
- out.put(impl, ':');
- snprintf(b, arrlen(b), "%f", node->dist);
- out.putstr(impl, b);
- }
-
- return 0;
-}
-
-error
-bio·writenewick(bio·Tree tree, io·Putter out, void* impl)
-{
- dump(tree.root, impl, out);
- out.put(impl, ';');
- out.put(impl, '\n');
-
- return 0;
-}
diff --git a/sys/libbio/io/rules.mk b/sys/libbio/io/rules.mk
deleted file mode 100644
index a4277fc..0000000
--- a/sys/libbio/io/rules.mk
+++ /dev/null
@@ -1,21 +0,0 @@
-include share/push.mk
-
-# Iterate through subdirectory tree
-
-# Local sources
-SRCS_$(d) := $(wildcard $(d)/*.c)
-LIBS_$(d) :=
-BINS_$(d) :=
-TSTS_$(d) :=
-
-include share/paths.mk
-
-$(LIBS_$(d)): $(OBJS_$(d))
- $(ARCHIVE)
-
-# $(BINS_$(d)): TCLIBS := $(OBJ_DIR)/libn/libn.a
-$(BINS_$(d)): $(OBJS_$(d))
- $(LINK)
-
-# ---- Pop off stack ----
-include share/pop.mk
diff --git a/sys/libbio/rules.mk b/sys/libbio/rules.mk
index 6a029b1..cbc6887 100644
--- a/sys/libbio/rules.mk
+++ b/sys/libbio/rules.mk
@@ -1,18 +1,15 @@
include share/push.mk
-# Iterate through subdirectory tree
-DIR := $(d)/io
-include $(DIR)/rules.mk
-
# Local sources
SRCS_$(d) := \
- $(d)/align.c \
+ $(d)/fasta.c \
+ $(d)/newick.c \
$(d)/phylo.c
LIBS_$(d) := $(d)/libbio.a
BINS_$(d) :=
-TSTS_$(d) := \
- $(d)/test.c \
- $(d)/simulate.c
+# TSTS_$(d) := \
+# $(d)/test.c \
+# $(d)/simulate.c
include share/paths.mk