aboutsummaryrefslogtreecommitdiff
path: root/sys/libbio/io/newick.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/libbio/io/newick.c')
-rw-r--r--sys/libbio/io/newick.c313
1 files changed, 313 insertions, 0 deletions
diff --git a/sys/libbio/io/newick.c b/sys/libbio/io/newick.c
new file mode 100644
index 0000000..b81e1bd
--- /dev/null
+++ b/sys/libbio/io/newick.c
@@ -0,0 +1,313 @@
+#include <u.h>
+#include <libn.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// Tokens
+
+enum TokenKind
+{
+ tok·nil,
+ tok·eof,
+ tok·space,
+ tok·ident,
+ tok·number,
+ tok·lparen,
+ tok·rparen,
+ tok·lbrak,
+ tok·rbrak,
+ tok·comma,
+ tok·semi,
+ tok·colon,
+
+ NUM_TOKENS,
+};
+
+
+struct Token {
+ enum TokenKind kind;
+ union
+ {
+ byte *s;
+ double x;
+ } lit;
+};
+
+byte*
+tokstr(struct Token tok)
+{
+ static byte b[50];
+ switch (tok.kind) {
+ case tok·nil: return "";
+ case tok·eof: return nil;
+ case tok·space: return " ";
+ case tok·ident: return tok.lit.s;
+ case tok·lparen: return "(";
+ case tok·rparen: return ")";
+ case tok·lbrak: return "[";
+ case tok·rbrak: return "]";
+ case tok·comma: return ",";
+ case tok·semi: return ";";
+ case tok·colon: return ":";
+ case tok·number:
+ snprintf(b, arrlen(b), "%f", tok.lit.x);
+ return b;
+ default:
+ return nil;
+ }
+}
+
+
+// -----------------------------------------------------------------------
+// Read
+
+// TODO: Bounds checking on buffer
+struct Token
+lex(Stream *s)
+{
+ struct Token tok;
+ byte *c, b[1024];
+ c = b;
+ *c = io·getbyte(s);
+
+ if (isspace(*c)) {
+ while (isspace(*c)) {
+ *(++c) = io·getbyte(s);
+ }
+
+ io·ungetbyte(s, *c);
+ Assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·space;
+ tok.lit.s = b;
+ return tok;
+ }
+
+ switch (*c) {
+ case EOF: tok.kind = tok·eof; return tok;
+ case '(': tok.kind = tok·lparen; return tok;
+ case ')': tok.kind = tok·rparen; return tok;
+ case '[': tok.kind = tok·lbrak; return tok;
+ case ']': tok.kind = tok·rbrak; return tok;
+ case ',': tok.kind = tok·comma; return tok;
+ case ';': tok.kind = tok·semi; return tok;
+ case ':': tok.kind = tok·colon; return tok;
+
+ case '.':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ while (isdigit(*c)) {
+ NUM: *(++c) = io·getbyte(s);
+ }
+ if (*c == '.') goto NUM;
+
+ io·ungetbyte(s, *c);
+ Assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·number;
+ tok.lit.x = atof(b);
+ return tok;
+
+ default:
+ while (isalnum(*c)) {
+ *(++c) = io·getbyte(s);
+ }
+
+ io·ungetbyte(s, *c);
+ Assert(c - b < 1024);
+
+ *c = 0;
+ tok.kind = tok·ident;
+ tok.lit.s = b;
+ return tok;
+ }
+}
+
+struct Token
+lex_nospace(Stream *s)
+{
+ struct Token tok;
+ tok = lex(s);
+ if (tok.kind == tok·space) {
+ lex_nospace(s);
+ }
+
+ return tok;
+}
+
+struct Parser
+{
+ int lev;
+ bio·Node *root;
+ struct Token tok;
+
+ Stream *file;
+ mem·Allocator heap;
+};
+
+error
+parse(struct Parser *p)
+{
+ error err;
+ bio·Node *node;
+ bio·Node *root;
+ struct Token tok;
+
+ for (;;) {
+ tok = lex_nospace(p->file);
+
+ switch (tok.kind) {
+ case tok·lparen:
+ if (p->lev > 0) {
+ errorf("incorrect format: opening another node before termination of last tree\n");
+ goto ERROR;
+ }
+ node = p->heap.alloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+ if (p->root) {
+ phylo·addchild(p->root, node);
+ root = p->root;
+ } else {
+ root = node;
+ }
+
+ p->lev++;
+ err = parse(p);
+ if (err) {
+ goto ERROR;
+ }
+ if (p->tok.kind != tok·rparen) {
+ errorf("incorrect format: closing parentheses expected to proceed opening\n");
+ goto ERROR;
+ }
+ p->root = root;
+
+ break;
+
+ case tok·rparen:
+ p->lev--;
+ goto DONE;
+
+ /* Comments */
+ case tok·lbrak:
+ if (!node) {
+ errorf("incorrect format: comment found in disallowed region\n");
+ goto ERROR;
+ }
+ node->comment = str·new("");
+ while (tok.kind != tok·rbrak) {
+ tok = lex_nospace(p->file);
+ if (tok.kind == tok·eof || tok.kind == tok·nil) {
+ errorf("incorrect format: unmatched comment bracket '['\n");
+ goto ERROR;
+ }
+ str·append(node->comment, tokstr(tok));
+ }
+ break;
+
+ case tok·rbrak:
+ errorf("incorrect format: end comment token found in disallowed region\n");
+ goto ERROR;
+ break;
+
+ case tok·colon:
+ tok = lex_nospace(p->file);
+ if (tok.kind != tok·number) {
+ errorf("incorrect format: expected number after colon\n");
+ goto ERROR;
+ }
+ if (node == nil) {
+ errorf("parse error: attempting to set distance of nil node\n");
+ goto ERROR;
+ }
+ node->dist = tok.lit.x;
+ break;
+
+ case tok·comma:
+ node = nil;
+ break;
+
+ case tok·ident:
+ if (p->tok.kind != tok·rparen) {
+ if (!node) {
+ errorf("parse error: attempting to set name of nil node\n");
+ goto ERROR;
+ }
+ node->name = str·new(tok.lit.s);
+ } else {
+ if (p->tok.kind != tok·comma) {
+ errorf("format error: misplaced identifier found\n");
+ goto ERROR;
+ }
+ if (!node) {
+ errorf("parse error: attempting to create child for no parent\n");
+ goto ERROR;
+ }
+ node = p->heap.alloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+ phylo·addchild(p->root, node);
+ }
+ break;
+
+ case tok·number:
+ if (p->tok.kind == tok·rparen) {
+ if (p->lev == 0) {
+ errorf("format error: support value on root not supported\n");
+ goto ERROR;
+ }
+ node->support = tok.lit.x;
+ } else {
+ errorf("format error: found number in unexpected location\n");
+ goto ERROR;
+ }
+ break;
+
+ case tok·semi:
+ io·ungetbyte(p->file, ';');
+ if (p->lev) {
+ errorf("format error: uneven number of parentheses found at ';'\n");
+ }
+ goto DONE;
+ break;
+
+ case tok·eof:
+ goto DONE;
+ break;
+
+ default:
+ goto ERROR;
+ }
+
+ p->tok = tok;
+ }
+
+
+DONE:
+ p->tok = tok;
+ return 0;
+ERROR:
+ // TODO(nnoll): cleanup
+ return 1;
+}
+
+bio·Tree
+bio·readnewick(Stream *file, mem·Allocator heap)
+{
+ error err;
+ struct Parser p;
+ bio·Tree tree;
+
+ err = parse(&p);
+ if (err) {
+ errorf("parsing failed\n");
+ return tree;
+ }
+ tree.root = p.root;
+
+ return tree;
+}
+
+// -----------------------------------------------------------------------
+// Write