aboutsummaryrefslogtreecommitdiff
path: root/src/libbio/newick.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libbio/newick.c')
-rw-r--r--src/libbio/newick.c414
1 files changed, 414 insertions, 0 deletions
diff --git a/src/libbio/newick.c b/src/libbio/newick.c
new file mode 100644
index 0000000..5e6d30a
--- /dev/null
+++ b/src/libbio/newick.c
@@ -0,0 +1,414 @@
+#include <u.h>
+#include <base.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// Tokens
+
+enum TokenKind
+{
+ tok·nil,
+ tok·eof,
+ tok·space,
+ tok·ident,
+ tok·number,
+ tok·lparen,
+ tok·rparen,
+ tok·lbrak,
+ tok·rbrak,
+ tok·comma,
+ tok·semi,
+ tok·colon,
+
+ NUM_TOKENS,
+};
+
+
+struct Token {
+ enum TokenKind kind;
+ union
+ {
+ byte *s;
+ double x;
+ } lit;
+};
+
+static
+byte*
+tokstr(struct Token tok)
+{
+ static byte b[50];
+ switch (tok.kind) {
+ case tok·nil: return "";
+ case tok·eof: return nil;
+ case tok·space: return " ";
+ case tok·ident: return tok.lit.s;
+ case tok·lparen: return "(";
+ case tok·rparen: return ")";
+ case tok·lbrak: return "[";
+ case tok·rbrak: return "]";
+ case tok·comma: return ",";
+ case tok·semi: return ";";
+ case tok·colon: return ":";
+ case tok·number:
+ snprintf(b, arrlen(b), "%f", tok.lit.x);
+ return b;
+ default:
+ return nil;
+ }
+}
+
+
+// -----------------------------------------------------------------------
+// Read
+
+// TODO: Bounds checking on buffer
+static
+struct Token
+lex(io·Peeker s, void* fp)
+{
+#define isvalidchar(C) ((C) == '!') || \
+ ('\"' < (C) && (C) < '\'') || \
+ (')' < (C) && (C) < '+') || \
+ (',' < (C) && (C) < ':') || \
+ (':' < (C) && (C) < '[') || \
+ ((C) == '\\') || \
+ (']' < (C) && (C) <= '~')
+ byte *c;
+ struct Token tok;
+ static byte b[1024];
+ c = b;
+ *c = s.get(fp);
+
+ if (isspace(*c)) {
+ while (isspace(*c)) {
+ *(++c) = s.get(fp);
+ }
+
+ s.unget(fp, *c);
+ assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·space;
+ tok.lit.s = b;
+ return tok;
+ }
+
+ switch (*c) {
+ case EOF: tok.kind = tok·eof; return tok;
+ case '(': tok.kind = tok·lparen; return tok;
+ case ')': tok.kind = tok·rparen; return tok;
+ case '[': tok.kind = tok·lbrak; return tok;
+ case ']': tok.kind = tok·rbrak; return tok;
+ case ',': tok.kind = tok·comma; return tok;
+ case ';': tok.kind = tok·semi; return tok;
+ case ':': tok.kind = tok·colon; return tok;
+
+ case '.':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ while (isdigit(*c)) {
+ NUM: *(++c) = s.get(fp);
+ }
+ if (*c == '.') goto NUM;
+ if (isvalidchar(*c)) goto IDENT;
+
+ s.unget(fp, *c);
+ assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·number;
+ tok.lit.x = atof(b);
+ return tok;
+
+ case '\"':
+ while ((*c) != '\"') {
+ *(++c) = s.get(fp);
+ }
+ assert(c - b < 1024);
+
+ *c = '\0';
+ tok.kind = tok·ident;
+ tok.lit.s = b + 1;
+ return tok;
+
+ default:
+ IDENT:
+ while (isvalidchar(*c)) {
+ *(++c) = s.get(fp);
+ }
+ s.unget(fp, *c);
+ assert(c - b < 1024);
+
+ *c = '\0';
+ tok.kind = tok·ident;
+ tok.lit.s = b;
+ return tok;
+ }
+#undef isvalidchar
+}
+
+static
+struct Token
+lex_nospace(io·Peeker s, void *impl)
+{
+ struct Token tok;
+ tok = lex(s, impl);
+ if (tok.kind == tok·space) {
+ tok = lex_nospace(s, impl);
+ }
+
+ return tok;
+}
+
+struct Parser
+{
+ int lev;
+ bio·Node *root;
+ struct Token tok;
+
+ void *io;
+ io·Peeker file;
+ void *heap;
+ mem·Allocator mem;
+};
+
+static
+error
+parse(struct Parser *p)
+{
+ error err;
+ bio·Node *node;
+ bio·Node *root;
+ struct Token tok;
+
+ node = p->root;
+ for (;;) {
+ tok = lex_nospace(p->file, p->io);
+
+ switch (tok.kind) {
+ case tok·lparen:
+ if (!p->root && p->lev > 0) {
+ errorf("parse format: attempted to make root at non-zero level");
+ goto ERROR;
+ }
+
+ node = p->mem.alloc(p->heap, 1, sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ if (p->root) {
+ phylo·addchild(p->root, node);
+ root = p->root;
+ } else {
+ root = node;
+ }
+
+ p->lev++;
+ p->root = node;
+ p->tok = tok;
+ err = parse(p);
+ if (err) {
+ goto ERROR;
+ }
+ if (p->tok.kind != tok·rparen) {
+ errorf("incorrect format: closing parentheses expected to proceed opening");
+ goto ERROR;
+ }
+ p->root = root;
+ // NOTE(nnoll): We don't want to override the state of p->tok here!
+ // Jump straight to grabbing next token.
+ continue;
+
+ case tok·rparen:
+ p->lev--;
+ goto DONE;
+
+ /* Comments */
+ case tok·lbrak:
+ if (!node) {
+ errorf("incorrect format: comment found in disallowed region");
+ goto ERROR;
+ }
+ node->comment = str·make("");
+ while (tok.kind != tok·rbrak) {
+ tok = lex_nospace(p->file, p->io);
+ if (tok.kind == tok·eof || tok.kind == tok·nil) {
+ errorf("incorrect format: unmatched comment bracket '['");
+ goto ERROR;
+ }
+ str·append(&node->comment, tokstr(tok));
+ }
+ break;
+
+ case tok·rbrak:
+ errorf("incorrect format: end comment token found in disallowed region");
+ goto ERROR;
+ break;
+
+ case tok·colon:
+ tok = lex_nospace(p->file, p->io);
+ if (tok.kind != tok·number) {
+ errorf("incorrect format: expected number after colon");
+ goto ERROR;
+ }
+ if (node == nil) {
+ errorf("parse error: attempting to set distance of nil node");
+ goto ERROR;
+ }
+ node->dist = tok.lit.x;
+ break;
+
+ case tok·comma:
+ node = nil;
+ break;
+
+ case tok·ident:
+ if (p->tok.kind == tok·rparen) {
+ if (!node) {
+ errorf("parse error: attempting to set name of nil node");
+ goto ERROR;
+ }
+ node->name = str·make(tok.lit.s);
+ } else {
+ if (p->tok.kind != tok·lparen && p->tok.kind != tok·comma) {
+ errorf("format error: misplaced identifier for leaf found");
+ goto ERROR;
+ }
+
+ if (!p->root) {
+ errorf("parse error: attempting to create child for no parent");
+ goto ERROR;
+ }
+
+ node = p->mem.alloc(p->heap, 1, sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->name = str·make(tok.lit.s);
+
+ phylo·addchild(p->root, node);
+ }
+ break;
+
+ case tok·number:
+ if (p->tok.kind == tok·rparen) {
+ if (p->lev == 0) {
+ errorf("format error: support value on root not supported");
+ goto ERROR;
+ }
+ node->support = tok.lit.x;
+ } else {
+ errorf("format error: found number in unexpected location");
+ goto ERROR;
+ }
+ break;
+
+ case tok·semi:
+ p->file.unget(p->io, ';');
+ if (p->lev) {
+ errorf("format error: uneven number of parentheses found at ';'");
+ goto ERROR;
+ }
+ goto DONE;
+
+ case tok·eof:
+ goto DONE;
+
+ default:
+ errorf("parse error: unrecognized token");
+ goto ERROR;
+ }
+
+ p->tok = tok;
+ }
+
+DONE:
+ p->tok = tok;
+ return 0;
+ERROR:
+ // TODO(nnoll): cleanup
+ return 1;
+}
+
+int
+bio·readnewick(io·Peeker stream, void *s, bio·Tree *tree)
+{
+ error err;
+ struct Parser p;
+
+ if (!tree) {
+ errorf("tree pointer nil");
+ return 0;
+ }
+
+ p = (struct Parser){
+ .lev = 0,
+ .root = nil,
+ .tok = (struct Token){ 0 },
+ .io = s,
+ .file = stream,
+ .mem = tree->mem,
+ .heap = tree->heap,
+ };
+ err = parse(&p);
+ if (err) {
+ errorf("parsing failed\n");
+ return 0;
+ }
+
+ tree->root = p.root;
+ tree->nleaf = 0;
+ tree->root->nnode = 0;
+
+ phylo·countleafs(tree->root, &tree->nleaf);
+ phylo·countnodes(tree->root, &tree->root->nnode);
+
+ return 1;
+}
+
+// -----------------------------------------------------------------------
+// Write
+
+static
+error
+dump(bio·Node *node, void *impl, io·Putter out)
+{
+ byte b[24];
+
+ if (!node) {
+ return 1;
+ }
+
+ bio·Node *child;
+ if (node->nchild) {
+ out.put(impl, '(');
+
+ dump(node->child, impl, out);
+ for (child = node->child->sibling; child != nil; child = child->sibling) {
+ out.put(impl, ',');
+ dump(child, impl, out);
+ }
+
+ out.put(impl, ')');
+ }
+ if (node->name) {
+ out.puts(impl, node->name);
+ }
+
+ if (node->parent) {
+ out.put(impl, ':');
+ snprintf(b, arrlen(b), "%f", node->dist);
+ out.puts(impl, b);
+ }
+
+ return 0;
+}
+
+error
+bio·writenewick(bio·Tree tree, io·Putter out, void* impl)
+{
+ dump(tree.root, impl, out);
+ out.put(impl, ';');
+ out.put(impl, '\n');
+
+ return 0;
+}