From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- src/libbio/newick.c | 414 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 414 insertions(+) create mode 100644 src/libbio/newick.c (limited to 'src/libbio/newick.c') diff --git a/src/libbio/newick.c b/src/libbio/newick.c new file mode 100644 index 0000000..5e6d30a --- /dev/null +++ b/src/libbio/newick.c @@ -0,0 +1,414 @@ +#include +#include +#include + +// ----------------------------------------------------------------------- +// Tokens + +enum TokenKind +{ + tok·nil, + tok·eof, + tok·space, + tok·ident, + tok·number, + tok·lparen, + tok·rparen, + tok·lbrak, + tok·rbrak, + tok·comma, + tok·semi, + tok·colon, + + NUM_TOKENS, +}; + + +struct Token { + enum TokenKind kind; + union + { + byte *s; + double x; + } lit; +}; + +static +byte* +tokstr(struct Token tok) +{ + static byte b[50]; + switch (tok.kind) { + case tok·nil: return ""; + case tok·eof: return nil; + case tok·space: return " "; + case tok·ident: return tok.lit.s; + case tok·lparen: return "("; + case tok·rparen: return ")"; + case tok·lbrak: return "["; + case tok·rbrak: return "]"; + case tok·comma: return ","; + case tok·semi: return ";"; + case tok·colon: return ":"; + case tok·number: + snprintf(b, arrlen(b), "%f", tok.lit.x); + return b; + default: + return nil; + } +} + + +// ----------------------------------------------------------------------- +// Read + +// TODO: Bounds checking on buffer +static +struct Token +lex(io·Peeker s, void* fp) +{ +#define isvalidchar(C) ((C) == '!') || \ + ('\"' < (C) && (C) < '\'') || \ + (')' < (C) && (C) < '+') || \ + (',' < (C) && (C) < ':') || \ + (':' < (C) && (C) < '[') || \ + ((C) == '\\') || \ + (']' < (C) && (C) <= '~') + byte *c; + struct Token tok; + static byte b[1024]; + c = b; + *c = s.get(fp); + + if (isspace(*c)) { + while (isspace(*c)) { + *(++c) = s.get(fp); + } + + s.unget(fp, *c); + assert(c - b < 1024); + + *c = 0; + tok.kind = tok·space; + tok.lit.s = b; + return tok; + } + + switch (*c) { + case EOF: tok.kind = tok·eof; return tok; + case '(': tok.kind = tok·lparen; return tok; + case ')': tok.kind = tok·rparen; return tok; + case '[': tok.kind = tok·lbrak; return tok; + case ']': tok.kind = tok·rbrak; return tok; + case ',': tok.kind = tok·comma; return tok; + case ';': tok.kind = tok·semi; return tok; + case ':': tok.kind = tok·colon; return tok; + + case '.': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + while (isdigit(*c)) { + NUM: *(++c) = s.get(fp); + } + if (*c == '.') goto NUM; + if (isvalidchar(*c)) goto IDENT; + + s.unget(fp, *c); + assert(c - b < 1024); + + *c = 0; + tok.kind = tok·number; + tok.lit.x = atof(b); + return tok; + + case '\"': + while ((*c) != '\"') { + *(++c) = s.get(fp); + } + assert(c - b < 1024); + + *c = '\0'; + tok.kind = tok·ident; + tok.lit.s = b + 1; + return tok; + + default: + IDENT: + while (isvalidchar(*c)) { + *(++c) = s.get(fp); + } + s.unget(fp, *c); + assert(c - b < 1024); + + *c = '\0'; + tok.kind = tok·ident; + tok.lit.s = b; + return tok; + } +#undef isvalidchar +} + +static +struct Token +lex_nospace(io·Peeker s, void *impl) +{ + struct Token tok; + tok = lex(s, impl); + if (tok.kind == tok·space) { + tok = lex_nospace(s, impl); + } + + return tok; +} + +struct Parser +{ + int lev; + bio·Node *root; + struct Token tok; + + void *io; + io·Peeker file; + void *heap; + mem·Allocator mem; +}; + +static +error +parse(struct Parser *p) +{ + error err; + bio·Node *node; + bio·Node *root; + struct Token tok; + + node = p->root; + for (;;) { + tok = lex_nospace(p->file, p->io); + + switch (tok.kind) { + case tok·lparen: + if (!p->root && p->lev > 0) { + errorf("parse format: attempted to make root at non-zero level"); + goto ERROR; + } + + node = p->mem.alloc(p->heap, 1, sizeof(*node)); + memset(node, 0, sizeof(*node)); + + if (p->root) { + phylo·addchild(p->root, node); + root = p->root; + } else { + root = node; + } + + p->lev++; + p->root = node; + p->tok = tok; + err = parse(p); + if (err) { + goto ERROR; + } + if (p->tok.kind != tok·rparen) { + errorf("incorrect format: closing parentheses expected to proceed opening"); + goto ERROR; + } + p->root = root; + // NOTE(nnoll): We don't want to override the state of p->tok here! + // Jump straight to grabbing next token. + continue; + + case tok·rparen: + p->lev--; + goto DONE; + + /* Comments */ + case tok·lbrak: + if (!node) { + errorf("incorrect format: comment found in disallowed region"); + goto ERROR; + } + node->comment = str·make(""); + while (tok.kind != tok·rbrak) { + tok = lex_nospace(p->file, p->io); + if (tok.kind == tok·eof || tok.kind == tok·nil) { + errorf("incorrect format: unmatched comment bracket '['"); + goto ERROR; + } + str·append(&node->comment, tokstr(tok)); + } + break; + + case tok·rbrak: + errorf("incorrect format: end comment token found in disallowed region"); + goto ERROR; + break; + + case tok·colon: + tok = lex_nospace(p->file, p->io); + if (tok.kind != tok·number) { + errorf("incorrect format: expected number after colon"); + goto ERROR; + } + if (node == nil) { + errorf("parse error: attempting to set distance of nil node"); + goto ERROR; + } + node->dist = tok.lit.x; + break; + + case tok·comma: + node = nil; + break; + + case tok·ident: + if (p->tok.kind == tok·rparen) { + if (!node) { + errorf("parse error: attempting to set name of nil node"); + goto ERROR; + } + node->name = str·make(tok.lit.s); + } else { + if (p->tok.kind != tok·lparen && p->tok.kind != tok·comma) { + errorf("format error: misplaced identifier for leaf found"); + goto ERROR; + } + + if (!p->root) { + errorf("parse error: attempting to create child for no parent"); + goto ERROR; + } + + node = p->mem.alloc(p->heap, 1, sizeof(*node)); + memset(node, 0, sizeof(*node)); + + node->name = str·make(tok.lit.s); + + phylo·addchild(p->root, node); + } + break; + + case tok·number: + if (p->tok.kind == tok·rparen) { + if (p->lev == 0) { + errorf("format error: support value on root not supported"); + goto ERROR; + } + node->support = tok.lit.x; + } else { + errorf("format error: found number in unexpected location"); + goto ERROR; + } + break; + + case tok·semi: + p->file.unget(p->io, ';'); + if (p->lev) { + errorf("format error: uneven number of parentheses found at ';'"); + goto ERROR; + } + goto DONE; + + case tok·eof: + goto DONE; + + default: + errorf("parse error: unrecognized token"); + goto ERROR; + } + + p->tok = tok; + } + +DONE: + p->tok = tok; + return 0; +ERROR: + // TODO(nnoll): cleanup + return 1; +} + +int +bio·readnewick(io·Peeker stream, void *s, bio·Tree *tree) +{ + error err; + struct Parser p; + + if (!tree) { + errorf("tree pointer nil"); + return 0; + } + + p = (struct Parser){ + .lev = 0, + .root = nil, + .tok = (struct Token){ 0 }, + .io = s, + .file = stream, + .mem = tree->mem, + .heap = tree->heap, + }; + err = parse(&p); + if (err) { + errorf("parsing failed\n"); + return 0; + } + + tree->root = p.root; + tree->nleaf = 0; + tree->root->nnode = 0; + + phylo·countleafs(tree->root, &tree->nleaf); + phylo·countnodes(tree->root, &tree->root->nnode); + + return 1; +} + +// ----------------------------------------------------------------------- +// Write + +static +error +dump(bio·Node *node, void *impl, io·Putter out) +{ + byte b[24]; + + if (!node) { + return 1; + } + + bio·Node *child; + if (node->nchild) { + out.put(impl, '('); + + dump(node->child, impl, out); + for (child = node->child->sibling; child != nil; child = child->sibling) { + out.put(impl, ','); + dump(child, impl, out); + } + + out.put(impl, ')'); + } + if (node->name) { + out.puts(impl, node->name); + } + + if (node->parent) { + out.put(impl, ':'); + snprintf(b, arrlen(b), "%f", node->dist); + out.puts(impl, b); + } + + return 0; +} + +error +bio·writenewick(bio·Tree tree, io·Putter out, void* impl) +{ + dump(tree.root, impl, out); + out.put(impl, ';'); + out.put(impl, '\n'); + + return 0; +} -- cgit v1.2.1