#include #include #include // ----------------------------------------------------------------------- // Tokens enum TokenKind { tok·nil, tok·eof, tok·space, tok·ident, tok·number, tok·lparen, tok·rparen, tok·lbrak, tok·rbrak, tok·comma, tok·semi, tok·colon, NUM_TOKENS, }; struct Token { enum TokenKind kind; union { byte *s; double x; } lit; }; static byte* tokstr(struct Token tok) { static byte b[50]; switch (tok.kind) { case tok·nil: return ""; case tok·eof: return nil; case tok·space: return " "; case tok·ident: return tok.lit.s; case tok·lparen: return "("; case tok·rparen: return ")"; case tok·lbrak: return "["; case tok·rbrak: return "]"; case tok·comma: return ","; case tok·semi: return ";"; case tok·colon: return ":"; case tok·number: snprintf(b, arrlen(b), "%f", tok.lit.x); return b; default: return nil; } } // ----------------------------------------------------------------------- // Read // TODO: Bounds checking on buffer static struct Token lex(io·Peeker s, void* fp) { #define isvalidchar(C) ((C) == '!') || \ ('\"' < (C) && (C) < '\'') || \ (')' < (C) && (C) < '+') || \ (',' < (C) && (C) < ':') || \ (':' < (C) && (C) < '[') || \ ((C) == '\\') || \ (']' < (C) && (C) <= '~') byte *c; struct Token tok; static byte b[1024]; c = b; *c = s.get(fp); if (isspace(*c)) { while (isspace(*c)) { *(++c) = s.get(fp); } s.unget(fp, *c); assert(c - b < 1024); *c = 0; tok.kind = tok·space; tok.lit.s = b; return tok; } switch (*c) { case EOF: tok.kind = tok·eof; return tok; case '(': tok.kind = tok·lparen; return tok; case ')': tok.kind = tok·rparen; return tok; case '[': tok.kind = tok·lbrak; return tok; case ']': tok.kind = tok·rbrak; return tok; case ',': tok.kind = tok·comma; return tok; case ';': tok.kind = tok·semi; return tok; case ':': tok.kind = tok·colon; return tok; case '.': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': while (isdigit(*c)) { NUM: *(++c) = s.get(fp); } if (*c == '.') goto NUM; if (isvalidchar(*c)) goto IDENT; s.unget(fp, *c); assert(c - b < 1024); *c = 0; tok.kind = tok·number; tok.lit.x = atof(b); return tok; case '\"': while ((*c) != '\"') { *(++c) = s.get(fp); } assert(c - b < 1024); *c = '\0'; tok.kind = tok·ident; tok.lit.s = b + 1; return tok; default: IDENT: while (isvalidchar(*c)) { *(++c) = s.get(fp); } s.unget(fp, *c); assert(c - b < 1024); *c = '\0'; tok.kind = tok·ident; tok.lit.s = b; return tok; } #undef isvalidchar } static struct Token lex_nospace(io·Peeker s, void *impl) { struct Token tok; tok = lex(s, impl); if (tok.kind == tok·space) { tok = lex_nospace(s, impl); } return tok; } struct Parser { int lev; bio·Node *root; struct Token tok; void *io; io·Peeker file; void *heap; mem·Allocator mem; }; static error parse(struct Parser *p) { error err; bio·Node *node; bio·Node *root; struct Token tok; node = p->root; for (;;) { tok = lex_nospace(p->file, p->io); switch (tok.kind) { case tok·lparen: if (!p->root && p->lev > 0) { errorf("parse format: attempted to make root at non-zero level"); goto ERROR; } node = p->mem.alloc(p->heap, 1, sizeof(*node)); memset(node, 0, sizeof(*node)); if (p->root) { phylo·addchild(p->root, node); root = p->root; } else { root = node; } p->lev++; p->root = node; p->tok = tok; err = parse(p); if (err) { goto ERROR; } if (p->tok.kind != tok·rparen) { errorf("incorrect format: closing parentheses expected to proceed opening"); goto ERROR; } p->root = root; // NOTE(nnoll): We don't want to override the state of p->tok here! // Jump straight to grabbing next token. continue; case tok·rparen: p->lev--; goto DONE; /* Comments */ case tok·lbrak: if (!node) { errorf("incorrect format: comment found in disallowed region"); goto ERROR; } node->comment = str·make(""); while (tok.kind != tok·rbrak) { tok = lex_nospace(p->file, p->io); if (tok.kind == tok·eof || tok.kind == tok·nil) { errorf("incorrect format: unmatched comment bracket '['"); goto ERROR; } str·append(&node->comment, tokstr(tok)); } break; case tok·rbrak: errorf("incorrect format: end comment token found in disallowed region"); goto ERROR; break; case tok·colon: tok = lex_nospace(p->file, p->io); if (tok.kind != tok·number) { errorf("incorrect format: expected number after colon"); goto ERROR; } if (node == nil) { errorf("parse error: attempting to set distance of nil node"); goto ERROR; } node->dist = tok.lit.x; break; case tok·comma: node = nil; break; case tok·ident: if (p->tok.kind == tok·rparen) { if (!node) { errorf("parse error: attempting to set name of nil node"); goto ERROR; } node->name = str·make(tok.lit.s); } else { if (p->tok.kind != tok·lparen && p->tok.kind != tok·comma) { errorf("format error: misplaced identifier for leaf found"); goto ERROR; } if (!p->root) { errorf("parse error: attempting to create child for no parent"); goto ERROR; } node = p->mem.alloc(p->heap, 1, sizeof(*node)); memset(node, 0, sizeof(*node)); node->name = str·make(tok.lit.s); phylo·addchild(p->root, node); } break; case tok·number: if (p->tok.kind == tok·rparen) { if (p->lev == 0) { errorf("format error: support value on root not supported"); goto ERROR; } node->support = tok.lit.x; } else { errorf("format error: found number in unexpected location"); goto ERROR; } break; case tok·semi: p->file.unget(p->io, ';'); if (p->lev) { errorf("format error: uneven number of parentheses found at ';'"); goto ERROR; } goto DONE; case tok·eof: goto DONE; default: errorf("parse error: unrecognized token"); goto ERROR; } p->tok = tok; } DONE: p->tok = tok; return 0; ERROR: // TODO(nnoll): cleanup return 1; } int bio·readnewick(io·Peeker stream, void *s, bio·Tree *tree) { error err; struct Parser p; if (!tree) { errorf("tree pointer nil"); return 0; } p = (struct Parser){ .lev = 0, .root = nil, .tok = (struct Token){ 0 }, .io = s, .file = stream, .mem = tree->mem, .heap = tree->heap, }; err = parse(&p); if (err) { errorf("parsing failed\n"); return 0; } tree->root = p.root; tree->nleaf = 0; tree->root->nnode = 0; phylo·countleafs(tree->root, &tree->nleaf); phylo·countnodes(tree->root, &tree->root->nnode); return 1; } // ----------------------------------------------------------------------- // Write static error dump(bio·Node *node, void *impl, io·Putter out) { byte b[24]; if (!node) { return 1; } bio·Node *child; if (node->nchild) { out.put(impl, '('); dump(node->child, impl, out); for (child = node->child->sibling; child != nil; child = child->sibling) { out.put(impl, ','); dump(child, impl, out); } out.put(impl, ')'); } if (node->name) { out.puts(impl, node->name); } if (node->parent) { out.put(impl, ':'); snprintf(b, arrlen(b), "%f", node->dist); out.puts(impl, b); } return 0; } error bio·writenewick(bio·Tree tree, io·Putter out, void* impl) { dump(tree.root, impl, out); out.put(impl, ';'); out.put(impl, '\n'); return 0; }