aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-04-21 19:19:05 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-04-21 19:19:05 -0700
commit583656a3537bc43a28c58111520143df04bf27f2 (patch)
tree8e75d98cea3d6af1d1b85d5c5b5e22ac5e0c4cde
parente596ebd0c129913b7135210b23a50336b6f8556f (diff)
feat: added skeleton of biological library
-rw-r--r--include/libbio.h29
-rw-r--r--sys/libbio/io/newick.c313
-rw-r--r--sys/libbio/io/rules.mk41
-rw-r--r--sys/libbio/phylo.c20
-rw-r--r--sys/libbio/rules.mk43
-rw-r--r--sys/libbio/test.c60
6 files changed, 506 insertions, 0 deletions
diff --git a/include/libbio.h b/include/libbio.h
new file mode 100644
index 0000000..d8430b3
--- /dev/null
+++ b/include/libbio.h
@@ -0,0 +1,29 @@
+#pragma once
+
+// -----------------------------------------------------------------------
+// Phylogenetics
+
+typedef struct bio·Node
+{
+ string name;
+ string comment;
+ double dist;
+ double support;
+ int nchild;
+ int ndescendent;
+ struct bio·Node *parent;
+ struct bio·Node *child[2];
+ struct bio·Node *sibling;
+} bio·Node;
+
+error phylo·addchild(bio·Node* parent, bio·Node* child);
+
+typedef struct bio·Tree
+{
+ bio·Node *root;
+} bio·Tree;
+
+bio·Tree bio·readnewick(Stream *file, mem·Allocator heap);
+
+// -----------------------------------------------------------------------
+// Sequences
diff --git a/sys/libbio/io/newick.c b/sys/libbio/io/newick.c
new file mode 100644
index 0000000..b81e1bd
--- /dev/null
+++ b/sys/libbio/io/newick.c
@@ -0,0 +1,313 @@
+#include <u.h>
+#include <libn.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// Tokens
+
+enum TokenKind
+{
+ tok·nil,
+ tok·eof,
+ tok·space,
+ tok·ident,
+ tok·number,
+ tok·lparen,
+ tok·rparen,
+ tok·lbrak,
+ tok·rbrak,
+ tok·comma,
+ tok·semi,
+ tok·colon,
+
+ NUM_TOKENS,
+};
+
+
+struct Token {
+ enum TokenKind kind;
+ union
+ {
+ byte *s;
+ double x;
+ } lit;
+};
+
+byte*
+tokstr(struct Token tok)
+{
+ static byte b[50];
+ switch (tok.kind) {
+ case tok·nil: return "";
+ case tok·eof: return nil;
+ case tok·space: return " ";
+ case tok·ident: return tok.lit.s;
+ case tok·lparen: return "(";
+ case tok·rparen: return ")";
+ case tok·lbrak: return "[";
+ case tok·rbrak: return "]";
+ case tok·comma: return ",";
+ case tok·semi: return ";";
+ case tok·colon: return ":";
+ case tok·number:
+ snprintf(b, arrlen(b), "%f", tok.lit.x);
+ return b;
+ default:
+ return nil;
+ }
+}
+
+
+// -----------------------------------------------------------------------
+// Read
+
+// TODO: Bounds checking on buffer
+struct Token
+lex(Stream *s)
+{
+ struct Token tok;
+ byte *c, b[1024];
+ c = b;
+ *c = io·getbyte(s);
+
+ if (isspace(*c)) {
+ while (isspace(*c)) {
+ *(++c) = io·getbyte(s);
+ }
+
+ io·ungetbyte(s, *c);
+ Assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·space;
+ tok.lit.s = b;
+ return tok;
+ }
+
+ switch (*c) {
+ case EOF: tok.kind = tok·eof; return tok;
+ case '(': tok.kind = tok·lparen; return tok;
+ case ')': tok.kind = tok·rparen; return tok;
+ case '[': tok.kind = tok·lbrak; return tok;
+ case ']': tok.kind = tok·rbrak; return tok;
+ case ',': tok.kind = tok·comma; return tok;
+ case ';': tok.kind = tok·semi; return tok;
+ case ':': tok.kind = tok·colon; return tok;
+
+ case '.':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ while (isdigit(*c)) {
+ NUM: *(++c) = io·getbyte(s);
+ }
+ if (*c == '.') goto NUM;
+
+ io·ungetbyte(s, *c);
+ Assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·number;
+ tok.lit.x = atof(b);
+ return tok;
+
+ default:
+ while (isalnum(*c)) {
+ *(++c) = io·getbyte(s);
+ }
+
+ io·ungetbyte(s, *c);
+ Assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·ident;
+ tok.lit.s = b;
+ return tok;
+ }
+}
+
+struct Token
+lex_nospace(Stream *s)
+{
+ struct Token tok;
+ tok = lex(s);
+ if (tok.kind == tok·space) {
+ lex_nospace(s);
+ }
+
+ return tok;
+}
+
+struct Parser
+{
+ int lev;
+ bio·Node *root;
+ struct Token tok;
+
+ Stream *file;
+ mem·Allocator heap;
+};
+
+error
+parse(struct Parser *p)
+{
+ error err;
+ bio·Node *node;
+ bio·Node *root;
+ struct Token tok;
+
+ for (;;) {
+ tok = lex_nospace(p->file);
+
+ switch (tok.kind) {
+ case tok·lparen:
+ if (p->lev > 0) {
+ errorf("incorrect format: opening another node before termination of last tree\n");
+ goto ERROR;
+ }
+ node = p->heap.alloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+ if (p->root) {
+ phylo·addchild(p->root, node);
+ root = p->root;
+ } else {
+ root = node;
+ }
+
+ p->lev++;
+ err = parse(p);
+ if (err) {
+ goto ERROR;
+ }
+ if (p->tok.kind != tok·rparen) {
+ errorf("incorrect format: closing parentheses expected to proceed opening\n");
+ goto ERROR;
+ }
+ p->root = root;
+
+ break;
+
+ case tok·rparen:
+ p->lev--;
+ goto DONE;
+
+ /* Comments */
+ case tok·lbrak:
+ if (!node) {
+ errorf("incorrect format: comment found in disallowed region\n");
+ goto ERROR;
+ }
+ node->comment = str·new("");
+ while (tok.kind != tok·rbrak) {
+ tok = lex_nospace(p->file);
+ if (tok.kind == tok·eof || tok.kind == tok·nil) {
+ errorf("incorrect format: unmatched comment bracket '['\n");
+ goto ERROR;
+ }
+ str·append(node->comment, tokstr(tok));
+ }
+ break;
+
+ case tok·rbrak:
+ errorf("incorrect format: end comment token found in disallowed region\n");
+ goto ERROR;
+ break;
+
+ case tok·colon:
+ tok = lex_nospace(p->file);
+ if (tok.kind != tok·number) {
+ errorf("incorrect format: expected number after colon\n");
+ goto ERROR;
+ }
+ if (node == nil) {
+ errorf("parse error: attempting to set distance of nil node\n");
+ goto ERROR;
+ }
+ node->dist = tok.lit.x;
+ break;
+
+ case tok·comma:
+ node = nil;
+ break;
+
+ case tok·ident:
+ if (p->tok.kind != tok·rparen) {
+ if (!node) {
+ errorf("parse error: attempting to set name of nil node\n");
+ goto ERROR;
+ }
+ node->name = str·new(tok.lit.s);
+ } else {
+ if (p->tok.kind != tok·comma) {
+ errorf("format error: misplaced identifier found\n");
+ goto ERROR;
+ }
+ if (!node) {
+ errorf("parse error: attempting to create child for no parent\n");
+ goto ERROR;
+ }
+ node = p->heap.alloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+ phylo·addchild(p->root, node);
+ }
+ break;
+
+ case tok·number:
+ if (p->tok.kind == tok·rparen) {
+ if (p->lev == 0) {
+ errorf("format error: support value on root not supported\n");
+ goto ERROR;
+ }
+ node->support = tok.lit.x;
+ } else {
+ errorf("format error: found number in unexpected location\n");
+ goto ERROR;
+ }
+ break;
+
+ case tok·semi:
+ io·ungetbyte(p->file, ';');
+ if (p->lev) {
+ errorf("format error: uneven number of parentheses found at ';'\n");
+ }
+ goto DONE;
+ break;
+
+ case tok·eof:
+ goto DONE;
+ break;
+
+ default:
+ goto ERROR;
+ }
+
+ p->tok = tok;
+ }
+
+
+DONE:
+ p->tok = tok;
+ return 0;
+ERROR:
+ // TODO(nnoll): cleanup
+ return 1;
+}
+
+bio·Tree
+bio·readnewick(Stream *file, mem·Allocator heap)
+{
+ error err;
+ struct Parser p;
+ bio·Tree tree;
+
+ err = parse(&p);
+ if (err) {
+ errorf("parsing failed\n");
+ return tree;
+ }
+ tree.root = p.root;
+
+ return tree;
+}
+
+// -----------------------------------------------------------------------
+// Write
diff --git a/sys/libbio/io/rules.mk b/sys/libbio/io/rules.mk
new file mode 100644
index 0000000..9ef6d00
--- /dev/null
+++ b/sys/libbio/io/rules.mk
@@ -0,0 +1,41 @@
+# ---- Push on stack ----
+SP := $(SP).x
+DIRSTACK_$(SP) := $(d)
+d := $(DIR)
+
+# Iterate through subdirectory tree
+
+# Local sources
+SRCS_$(d) := $(wildcard $(d)/*.c)
+OBJS_$(d) := $(SRCS_$(d):.c=.o)
+OBJS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(OBJS_$(d)))
+DEPS_$(d) := $(OBJS_$(d):.o=.d)
+
+OBJS := $(OBJS) $(OBJS_$(d))
+DEPS := $(DEPS) $(DEPS_$(d))
+
+# Local targets
+LIBS_$(d) :=
+LIBS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(LIBS_$(d)))
+LIBS := $(LIBS) $(LIBS_$(d))
+
+BINS_$(d) :=
+BINS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(BINS_$(d)))
+BINS := $(BINS) $(BINS_$(d))
+
+# Local rules
+# $(LIBS_$(d)) = TCFLAGS :=
+# $(LIBS_$(d)) = TCINCS :=
+
+$(LIBS_$(d)): $(OBJS_$(d))
+ $(ARCHIVE)
+
+# $(BINS_$(d)): TCLIBS := $(OBJ_DIR)/libn/libn.a
+$(BINS_$(d)): $(OBJS_$(d))
+ $(LINK)
+
+# ---- Pop off stack ----
+-include $(DEPS_$(d))
+
+d := $(DIRSTACK_$(SP))
+SP := $(basename $(SP))
diff --git a/sys/libbio/phylo.c b/sys/libbio/phylo.c
new file mode 100644
index 0000000..8033e35
--- /dev/null
+++ b/sys/libbio/phylo.c
@@ -0,0 +1,20 @@
+#include <u.h>
+#include <libn.h>
+#include <libbio.h>
+
+error
+phylo·addchild(bio·Node* parent, bio·Node* child)
+{
+ bio·Node *it, *sibling;
+ if (parent->nchild < 2) {
+ parent->child[parent->nchild++] = child;
+ } else {
+ for (it = parent->child[1]->sibling; it != nil; it = it->sibling) {
+ sibling = it;
+ }
+ sibling->sibling = child;
+ parent->nchild++;
+ }
+
+ return 0;
+}
diff --git a/sys/libbio/rules.mk b/sys/libbio/rules.mk
new file mode 100644
index 0000000..fd3e066
--- /dev/null
+++ b/sys/libbio/rules.mk
@@ -0,0 +1,43 @@
+# ---- Push on stack ----
+SP := $(SP).x
+DIRSTACK_$(SP) := $(d)
+d := $(DIR)
+
+# Iterate through subdirectory tree
+DIR := $(d)/io
+include $(DIR)/rules.mk
+
+# Local sources
+SRCS_$(d) := $(wildcard $(d)/*.c)
+OBJS_$(d) := $(SRCS_$(d):.c=.o)
+OBJS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(OBJS_$(d)))
+DEPS_$(d) := $(OBJS_$(d):.o=.d)
+
+OBJS := $(OBJS) $(OBJS_$(d))
+DEPS := $(DEPS) $(DEPS_$(d))
+
+# Local targets
+LIBS_$(d) := $(d)/libbio.a
+LIBS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(LIBS_$(d)))
+LIBS := $(LIBS) $(LIBS_$(d))
+
+BINS_$(d) := $(d)/test
+BINS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(BINS_$(d)))
+BINS := $(BINS) $(BINS_$(d))
+
+# Local rules
+# $(LIBS_$(d)) = TCFLAGS :=
+# $(LIBS_$(d)) = TCINCS :=
+# $(LIBS_$(d)) = TCLIBS :=
+
+$(LIBS_$(d)): $(OBJS_$(d)) $(OBJS_$(d)/io)
+ $(ARCHIVE)
+
+$(BINS_$(d)): $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a
+ $(LINK)
+
+# ---- Pop off stack ----
+-include $(DEPS_$(d))
+
+d := $(DIRSTACK_$(SP))
+SP := $(basename $(SP))
diff --git a/sys/libbio/test.c b/sys/libbio/test.c
new file mode 100644
index 0000000..00345c4
--- /dev/null
+++ b/sys/libbio/test.c
@@ -0,0 +1,60 @@
+#include <u.h>
+#include <libn.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// Arena allocator
+
+static mem·Arena* ARENA;
+
+static
+void*
+bio·alloc(ulong size)
+{
+ return mem·arenaalloc(ARENA, size);
+}
+
+static
+void*
+bio·realloc(void *ptr, ulong size)
+{
+ void* new = mem·arenaalloc(ARENA, size);
+ memcpy(new, ptr, size);
+
+ return new;
+}
+
+static
+void
+bio·free(void *ptr)
+{
+ /* stub */
+}
+
+static mem·Allocator arena = {.alloc = &bio·alloc, .realloc = &bio·realloc, .free = &bio·free };
+
+// -----------------------------------------------------------------------
+// Point of entry for testing
+
+void
+init()
+{
+ ARENA = mem·newarena(mem·sys);
+}
+
+int
+main()
+{
+ init();
+
+ bio·Tree t;
+ Stream *fd;
+
+ fd = io·open("/home/nolln/root/data/test/example.nwk", "r");
+ printf("starting\n");
+ t = bio·readnewick(fd, arena);
+ io·close(fd);
+ printf("ending\n");
+ return 0;
+}
+