From 583656a3537bc43a28c58111520143df04bf27f2 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 21 Apr 2020 19:19:05 -0700 Subject: feat: added skeleton of biological library --- include/libbio.h | 29 +++++ sys/libbio/io/newick.c | 313 +++++++++++++++++++++++++++++++++++++++++++++++++ sys/libbio/io/rules.mk | 41 +++++++ sys/libbio/phylo.c | 20 ++++ sys/libbio/rules.mk | 43 +++++++ sys/libbio/test.c | 60 ++++++++++ 6 files changed, 506 insertions(+) create mode 100644 include/libbio.h create mode 100644 sys/libbio/io/newick.c create mode 100644 sys/libbio/io/rules.mk create mode 100644 sys/libbio/phylo.c create mode 100644 sys/libbio/rules.mk create mode 100644 sys/libbio/test.c diff --git a/include/libbio.h b/include/libbio.h new file mode 100644 index 0000000..d8430b3 --- /dev/null +++ b/include/libbio.h @@ -0,0 +1,29 @@ +#pragma once + +// ----------------------------------------------------------------------- +// Phylogenetics + +typedef struct bio·Node +{ + string name; + string comment; + double dist; + double support; + int nchild; + int ndescendent; + struct bio·Node *parent; + struct bio·Node *child[2]; + struct bio·Node *sibling; +} bio·Node; + +error phylo·addchild(bio·Node* parent, bio·Node* child); + +typedef struct bio·Tree +{ + bio·Node *root; +} bio·Tree; + +bio·Tree bio·readnewick(Stream *file, mem·Allocator heap); + +// ----------------------------------------------------------------------- +// Sequences diff --git a/sys/libbio/io/newick.c b/sys/libbio/io/newick.c new file mode 100644 index 0000000..b81e1bd --- /dev/null +++ b/sys/libbio/io/newick.c @@ -0,0 +1,313 @@ +#include +#include +#include + +// ----------------------------------------------------------------------- +// Tokens + +enum TokenKind +{ + tok·nil, + tok·eof, + tok·space, + tok·ident, + tok·number, + tok·lparen, + tok·rparen, + tok·lbrak, + tok·rbrak, + tok·comma, + tok·semi, + tok·colon, + + NUM_TOKENS, +}; + + +struct Token { + enum TokenKind kind; + union + { + byte *s; + double x; + } lit; +}; + +byte* +tokstr(struct Token tok) +{ + static byte b[50]; + switch (tok.kind) { + case tok·nil: return ""; + case tok·eof: return nil; + case tok·space: return " "; + case tok·ident: return tok.lit.s; + case tok·lparen: return "("; + case tok·rparen: return ")"; + case tok·lbrak: return "["; + case tok·rbrak: return "]"; + case tok·comma: return ","; + case tok·semi: return ";"; + case tok·colon: return ":"; + case tok·number: + snprintf(b, arrlen(b), "%f", tok.lit.x); + return b; + default: + return nil; + } +} + + +// ----------------------------------------------------------------------- +// Read + +// TODO: Bounds checking on buffer +struct Token +lex(Stream *s) +{ + struct Token tok; + byte *c, b[1024]; + c = b; + *c = io·getbyte(s); + + if (isspace(*c)) { + while (isspace(*c)) { + *(++c) = io·getbyte(s); + } + + io·ungetbyte(s, *c); + Assert(c - b < 1024); + + *c = 0; + tok.kind = tok·space; + tok.lit.s = b; + return tok; + } + + switch (*c) { + case EOF: tok.kind = tok·eof; return tok; + case '(': tok.kind = tok·lparen; return tok; + case ')': tok.kind = tok·rparen; return tok; + case '[': tok.kind = tok·lbrak; return tok; + case ']': tok.kind = tok·rbrak; return tok; + case ',': tok.kind = tok·comma; return tok; + case ';': tok.kind = tok·semi; return tok; + case ':': tok.kind = tok·colon; return tok; + + case '.': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + while (isdigit(*c)) { + NUM: *(++c) = io·getbyte(s); + } + if (*c == '.') goto NUM; + + io·ungetbyte(s, *c); + Assert(c - b < 1024); + + *c = 0; + tok.kind = tok·number; + tok.lit.x = atof(b); + return tok; + + default: + while (isalnum(*c)) { + *(++c) = io·getbyte(s); + } + + io·ungetbyte(s, *c); + Assert(c - b < 1024); + + *c = 0; + tok.kind = tok·ident; + tok.lit.s = b; + return tok; + } +} + +struct Token +lex_nospace(Stream *s) +{ + struct Token tok; + tok = lex(s); + if (tok.kind == tok·space) { + lex_nospace(s); + } + + return tok; +} + +struct Parser +{ + int lev; + bio·Node *root; + struct Token tok; + + Stream *file; + mem·Allocator heap; +}; + +error +parse(struct Parser *p) +{ + error err; + bio·Node *node; + bio·Node *root; + struct Token tok; + + for (;;) { + tok = lex_nospace(p->file); + + switch (tok.kind) { + case tok·lparen: + if (p->lev > 0) { + errorf("incorrect format: opening another node before termination of last tree\n"); + goto ERROR; + } + node = p->heap.alloc(sizeof(*node)); + memset(node, 0, sizeof(*node)); + if (p->root) { + phylo·addchild(p->root, node); + root = p->root; + } else { + root = node; + } + + p->lev++; + err = parse(p); + if (err) { + goto ERROR; + } + if (p->tok.kind != tok·rparen) { + errorf("incorrect format: closing parentheses expected to proceed opening\n"); + goto ERROR; + } + p->root = root; + + break; + + case tok·rparen: + p->lev--; + goto DONE; + + /* Comments */ + case tok·lbrak: + if (!node) { + errorf("incorrect format: comment found in disallowed region\n"); + goto ERROR; + } + node->comment = str·new(""); + while (tok.kind != tok·rbrak) { + tok = lex_nospace(p->file); + if (tok.kind == tok·eof || tok.kind == tok·nil) { + errorf("incorrect format: unmatched comment bracket '['\n"); + goto ERROR; + } + str·append(node->comment, tokstr(tok)); + } + break; + + case tok·rbrak: + errorf("incorrect format: end comment token found in disallowed region\n"); + goto ERROR; + break; + + case tok·colon: + tok = lex_nospace(p->file); + if (tok.kind != tok·number) { + errorf("incorrect format: expected number after colon\n"); + goto ERROR; + } + if (node == nil) { + errorf("parse error: attempting to set distance of nil node\n"); + goto ERROR; + } + node->dist = tok.lit.x; + break; + + case tok·comma: + node = nil; + break; + + case tok·ident: + if (p->tok.kind != tok·rparen) { + if (!node) { + errorf("parse error: attempting to set name of nil node\n"); + goto ERROR; + } + node->name = str·new(tok.lit.s); + } else { + if (p->tok.kind != tok·comma) { + errorf("format error: misplaced identifier found\n"); + goto ERROR; + } + if (!node) { + errorf("parse error: attempting to create child for no parent\n"); + goto ERROR; + } + node = p->heap.alloc(sizeof(*node)); + memset(node, 0, sizeof(*node)); + phylo·addchild(p->root, node); + } + break; + + case tok·number: + if (p->tok.kind == tok·rparen) { + if (p->lev == 0) { + errorf("format error: support value on root not supported\n"); + goto ERROR; + } + node->support = tok.lit.x; + } else { + errorf("format error: found number in unexpected location\n"); + goto ERROR; + } + break; + + case tok·semi: + io·ungetbyte(p->file, ';'); + if (p->lev) { + errorf("format error: uneven number of parentheses found at ';'\n"); + } + goto DONE; + break; + + case tok·eof: + goto DONE; + break; + + default: + goto ERROR; + } + + p->tok = tok; + } + + +DONE: + p->tok = tok; + return 0; +ERROR: + // TODO(nnoll): cleanup + return 1; +} + +bio·Tree +bio·readnewick(Stream *file, mem·Allocator heap) +{ + error err; + struct Parser p; + bio·Tree tree; + + err = parse(&p); + if (err) { + errorf("parsing failed\n"); + return tree; + } + tree.root = p.root; + + return tree; +} + +// ----------------------------------------------------------------------- +// Write diff --git a/sys/libbio/io/rules.mk b/sys/libbio/io/rules.mk new file mode 100644 index 0000000..9ef6d00 --- /dev/null +++ b/sys/libbio/io/rules.mk @@ -0,0 +1,41 @@ +# ---- Push on stack ---- +SP := $(SP).x +DIRSTACK_$(SP) := $(d) +d := $(DIR) + +# Iterate through subdirectory tree + +# Local sources +SRCS_$(d) := $(wildcard $(d)/*.c) +OBJS_$(d) := $(SRCS_$(d):.c=.o) +OBJS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(OBJS_$(d))) +DEPS_$(d) := $(OBJS_$(d):.o=.d) + +OBJS := $(OBJS) $(OBJS_$(d)) +DEPS := $(DEPS) $(DEPS_$(d)) + +# Local targets +LIBS_$(d) := +LIBS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(LIBS_$(d))) +LIBS := $(LIBS) $(LIBS_$(d)) + +BINS_$(d) := +BINS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(BINS_$(d))) +BINS := $(BINS) $(BINS_$(d)) + +# Local rules +# $(LIBS_$(d)) = TCFLAGS := +# $(LIBS_$(d)) = TCINCS := + +$(LIBS_$(d)): $(OBJS_$(d)) + $(ARCHIVE) + +# $(BINS_$(d)): TCLIBS := $(OBJ_DIR)/libn/libn.a +$(BINS_$(d)): $(OBJS_$(d)) + $(LINK) + +# ---- Pop off stack ---- +-include $(DEPS_$(d)) + +d := $(DIRSTACK_$(SP)) +SP := $(basename $(SP)) diff --git a/sys/libbio/phylo.c b/sys/libbio/phylo.c new file mode 100644 index 0000000..8033e35 --- /dev/null +++ b/sys/libbio/phylo.c @@ -0,0 +1,20 @@ +#include +#include +#include + +error +phylo·addchild(bio·Node* parent, bio·Node* child) +{ + bio·Node *it, *sibling; + if (parent->nchild < 2) { + parent->child[parent->nchild++] = child; + } else { + for (it = parent->child[1]->sibling; it != nil; it = it->sibling) { + sibling = it; + } + sibling->sibling = child; + parent->nchild++; + } + + return 0; +} diff --git a/sys/libbio/rules.mk b/sys/libbio/rules.mk new file mode 100644 index 0000000..fd3e066 --- /dev/null +++ b/sys/libbio/rules.mk @@ -0,0 +1,43 @@ +# ---- Push on stack ---- +SP := $(SP).x +DIRSTACK_$(SP) := $(d) +d := $(DIR) + +# Iterate through subdirectory tree +DIR := $(d)/io +include $(DIR)/rules.mk + +# Local sources +SRCS_$(d) := $(wildcard $(d)/*.c) +OBJS_$(d) := $(SRCS_$(d):.c=.o) +OBJS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(OBJS_$(d))) +DEPS_$(d) := $(OBJS_$(d):.o=.d) + +OBJS := $(OBJS) $(OBJS_$(d)) +DEPS := $(DEPS) $(DEPS_$(d)) + +# Local targets +LIBS_$(d) := $(d)/libbio.a +LIBS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(LIBS_$(d))) +LIBS := $(LIBS) $(LIBS_$(d)) + +BINS_$(d) := $(d)/test +BINS_$(d) := $(patsubst $(SRC_DIR)/%, $(OBJ_DIR)/%, $(BINS_$(d))) +BINS := $(BINS) $(BINS_$(d)) + +# Local rules +# $(LIBS_$(d)) = TCFLAGS := +# $(LIBS_$(d)) = TCINCS := +# $(LIBS_$(d)) = TCLIBS := + +$(LIBS_$(d)): $(OBJS_$(d)) $(OBJS_$(d)/io) + $(ARCHIVE) + +$(BINS_$(d)): $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a + $(LINK) + +# ---- Pop off stack ---- +-include $(DEPS_$(d)) + +d := $(DIRSTACK_$(SP)) +SP := $(basename $(SP)) diff --git a/sys/libbio/test.c b/sys/libbio/test.c new file mode 100644 index 0000000..00345c4 --- /dev/null +++ b/sys/libbio/test.c @@ -0,0 +1,60 @@ +#include +#include +#include + +// ----------------------------------------------------------------------- +// Arena allocator + +static mem·Arena* ARENA; + +static +void* +bio·alloc(ulong size) +{ + return mem·arenaalloc(ARENA, size); +} + +static +void* +bio·realloc(void *ptr, ulong size) +{ + void* new = mem·arenaalloc(ARENA, size); + memcpy(new, ptr, size); + + return new; +} + +static +void +bio·free(void *ptr) +{ + /* stub */ +} + +static mem·Allocator arena = {.alloc = &bio·alloc, .realloc = &bio·realloc, .free = &bio·free }; + +// ----------------------------------------------------------------------- +// Point of entry for testing + +void +init() +{ + ARENA = mem·newarena(mem·sys); +} + +int +main() +{ + init(); + + bio·Tree t; + Stream *fd; + + fd = io·open("/home/nolln/root/data/test/example.nwk", "r"); + printf("starting\n"); + t = bio·readnewick(fd, arena); + io·close(fd); + printf("ending\n"); + return 0; +} + -- cgit v1.2.1