From 583656a3537bc43a28c58111520143df04bf27f2 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 21 Apr 2020 19:19:05 -0700 Subject: feat: added skeleton of biological library --- sys/libbio/io/newick.c | 313 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 313 insertions(+) create mode 100644 sys/libbio/io/newick.c (limited to 'sys/libbio/io/newick.c') diff --git a/sys/libbio/io/newick.c b/sys/libbio/io/newick.c new file mode 100644 index 0000000..b81e1bd --- /dev/null +++ b/sys/libbio/io/newick.c @@ -0,0 +1,313 @@ +#include +#include +#include + +// ----------------------------------------------------------------------- +// Tokens + +enum TokenKind +{ + tok·nil, + tok·eof, + tok·space, + tok·ident, + tok·number, + tok·lparen, + tok·rparen, + tok·lbrak, + tok·rbrak, + tok·comma, + tok·semi, + tok·colon, + + NUM_TOKENS, +}; + + +struct Token { + enum TokenKind kind; + union + { + byte *s; + double x; + } lit; +}; + +byte* +tokstr(struct Token tok) +{ + static byte b[50]; + switch (tok.kind) { + case tok·nil: return ""; + case tok·eof: return nil; + case tok·space: return " "; + case tok·ident: return tok.lit.s; + case tok·lparen: return "("; + case tok·rparen: return ")"; + case tok·lbrak: return "["; + case tok·rbrak: return "]"; + case tok·comma: return ","; + case tok·semi: return ";"; + case tok·colon: return ":"; + case tok·number: + snprintf(b, arrlen(b), "%f", tok.lit.x); + return b; + default: + return nil; + } +} + + +// ----------------------------------------------------------------------- +// Read + +// TODO: Bounds checking on buffer +struct Token +lex(Stream *s) +{ + struct Token tok; + byte *c, b[1024]; + c = b; + *c = io·getbyte(s); + + if (isspace(*c)) { + while (isspace(*c)) { + *(++c) = io·getbyte(s); + } + + io·ungetbyte(s, *c); + Assert(c - b < 1024); + + *c = 0; + tok.kind = tok·space; + tok.lit.s = b; + return tok; + } + + switch (*c) { + case EOF: tok.kind = tok·eof; return tok; + case '(': tok.kind = tok·lparen; return tok; + case ')': tok.kind = tok·rparen; return tok; + case '[': tok.kind = tok·lbrak; return tok; + case ']': tok.kind = tok·rbrak; return tok; + case ',': tok.kind = tok·comma; return tok; + case ';': tok.kind = tok·semi; return tok; + case ':': tok.kind = tok·colon; return tok; + + case '.': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + while (isdigit(*c)) { + NUM: *(++c) = io·getbyte(s); + } + if (*c == '.') goto NUM; + + io·ungetbyte(s, *c); + Assert(c - b < 1024); + + *c = 0; + tok.kind = tok·number; + tok.lit.x = atof(b); + return tok; + + default: + while (isalnum(*c)) { + *(++c) = io·getbyte(s); + } + + io·ungetbyte(s, *c); + Assert(c - b < 1024); + + *c = 0; + tok.kind = tok·ident; + tok.lit.s = b; + return tok; + } +} + +struct Token +lex_nospace(Stream *s) +{ + struct Token tok; + tok = lex(s); + if (tok.kind == tok·space) { + lex_nospace(s); + } + + return tok; +} + +struct Parser +{ + int lev; + bio·Node *root; + struct Token tok; + + Stream *file; + mem·Allocator heap; +}; + +error +parse(struct Parser *p) +{ + error err; + bio·Node *node; + bio·Node *root; + struct Token tok; + + for (;;) { + tok = lex_nospace(p->file); + + switch (tok.kind) { + case tok·lparen: + if (p->lev > 0) { + errorf("incorrect format: opening another node before termination of last tree\n"); + goto ERROR; + } + node = p->heap.alloc(sizeof(*node)); + memset(node, 0, sizeof(*node)); + if (p->root) { + phylo·addchild(p->root, node); + root = p->root; + } else { + root = node; + } + + p->lev++; + err = parse(p); + if (err) { + goto ERROR; + } + if (p->tok.kind != tok·rparen) { + errorf("incorrect format: closing parentheses expected to proceed opening\n"); + goto ERROR; + } + p->root = root; + + break; + + case tok·rparen: + p->lev--; + goto DONE; + + /* Comments */ + case tok·lbrak: + if (!node) { + errorf("incorrect format: comment found in disallowed region\n"); + goto ERROR; + } + node->comment = str·new(""); + while (tok.kind != tok·rbrak) { + tok = lex_nospace(p->file); + if (tok.kind == tok·eof || tok.kind == tok·nil) { + errorf("incorrect format: unmatched comment bracket '['\n"); + goto ERROR; + } + str·append(node->comment, tokstr(tok)); + } + break; + + case tok·rbrak: + errorf("incorrect format: end comment token found in disallowed region\n"); + goto ERROR; + break; + + case tok·colon: + tok = lex_nospace(p->file); + if (tok.kind != tok·number) { + errorf("incorrect format: expected number after colon\n"); + goto ERROR; + } + if (node == nil) { + errorf("parse error: attempting to set distance of nil node\n"); + goto ERROR; + } + node->dist = tok.lit.x; + break; + + case tok·comma: + node = nil; + break; + + case tok·ident: + if (p->tok.kind != tok·rparen) { + if (!node) { + errorf("parse error: attempting to set name of nil node\n"); + goto ERROR; + } + node->name = str·new(tok.lit.s); + } else { + if (p->tok.kind != tok·comma) { + errorf("format error: misplaced identifier found\n"); + goto ERROR; + } + if (!node) { + errorf("parse error: attempting to create child for no parent\n"); + goto ERROR; + } + node = p->heap.alloc(sizeof(*node)); + memset(node, 0, sizeof(*node)); + phylo·addchild(p->root, node); + } + break; + + case tok·number: + if (p->tok.kind == tok·rparen) { + if (p->lev == 0) { + errorf("format error: support value on root not supported\n"); + goto ERROR; + } + node->support = tok.lit.x; + } else { + errorf("format error: found number in unexpected location\n"); + goto ERROR; + } + break; + + case tok·semi: + io·ungetbyte(p->file, ';'); + if (p->lev) { + errorf("format error: uneven number of parentheses found at ';'\n"); + } + goto DONE; + break; + + case tok·eof: + goto DONE; + break; + + default: + goto ERROR; + } + + p->tok = tok; + } + + +DONE: + p->tok = tok; + return 0; +ERROR: + // TODO(nnoll): cleanup + return 1; +} + +bio·Tree +bio·readnewick(Stream *file, mem·Allocator heap) +{ + error err; + struct Parser p; + bio·Tree tree; + + err = parse(&p); + if (err) { + errorf("parsing failed\n"); + return tree; + } + tree.root = p.root; + + return tree; +} + +// ----------------------------------------------------------------------- +// Write -- cgit v1.2.1