aboutsummaryrefslogtreecommitdiff
path: root/sys/libbio/io/fasta.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/libbio/io/fasta.c')
-rw-r--r--sys/libbio/io/fasta.c360
1 files changed, 0 insertions, 360 deletions
diff --git a/sys/libbio/io/fasta.c b/sys/libbio/io/fasta.c
deleted file mode 100644
index 28c9ca4..0000000
--- a/sys/libbio/io/fasta.c
+++ /dev/null
@@ -1,360 +0,0 @@
-#include <u.h>
-#include <libn.h>
-#include <libbio.h>
-
-#define INIT_NM_SIZE 128
-#define INIT_SQ_SIZE 4096
-
-struct Seqbuf
-{
- mem·Allocator heap;
- void *h;
-
- int len;
- int off;
- byte *it;
- byte b[];
-};
-
-static
-void
-reset(struct Seqbuf *sb)
-{
- sb->off = 0;
- sb->it = sb->b;
-}
-
-static
-error
-grow(struct Seqbuf **sb, int min)
-{
- struct Seqbuf *old, *new;
- vlong newlen;
-
- old = *sb;
- assert((*sb)->len <= (SIZE_MAX - 1) / 2);
- newlen = MAX(16, MAX(1 + 2*(*sb)->len, (*sb)->len+min));
- assert(newlen >= (*sb)->len+min);
-
- if (new = old->heap.alloc(old->h, 1, sizeof(*new)+newlen), !new) {
- errorf("memory: could not allocate new buffer");
- return 1;
- }
-
- memcpy(new, old, sizeof(*sb) + (*sb)->len);
- new->len = newlen;
- new->it = new->b + (old->len);
- old->heap.free(old->h, old);
-
- *sb = new;
- return 0;
-}
-
-static
-error
-put(struct Seqbuf **sb, byte c)
-{
- int err;
- struct Seqbuf *sq;
-
- sq = *sb;
- if (sq->it < (sq->b + sq->len)) {
- *sq->it++ = c;
- return 0;
- }
-
- if (err = grow(sb, 1), err) {
- errorf("memory fail: could not allocate more buffer");
- sq->heap.free(sq->h, sq);
- return 1;
- }
-
- *((*sb)->it++) = c;
- return 0;
-}
-
-static
-error
-push(struct Seqbuf **sb, int n, void *buf)
-{
- int d, err;
- struct Seqbuf *seq;
-
- seq = *sb;
- if (d = seq->len - (seq->it - seq->b), d < n) {
- assert(d > 0);
- if (err = grow(sb, n-d), err) {
- errorf("memory fail: could not allocate more buffer");
- seq->heap.free(seq->h, seq);
- return 1;
- }
- }
-
- memcpy((*sb)->it, buf, n);
- (*sb)->it += n;
-
- return 0;
-}
-
-// -----------------------------------------------------------------------
-// Fasta files
-
-struct bio·FastaReader {
- byte eof;
- io·Reader file;
- void *f;
-
- struct Seqbuf *seq;
-
- /* read buffer */
- byte *b, *bend;
- byte buf[4*4098];
-};
-
-static
-error
-fill(bio·FastaReader *rdr)
-{
- int n;
- // NOTE: This could lead to an infinite loop.
- if (rdr->eof) {
- return 0;
- }
-
- n = rdr->file.read(rdr->f, 1, arrlen(rdr->buf), rdr->buf);
- if (n < 0) {
- errorf("read: no data obtained from reader");
- return 1;
- }
- rdr->b = rdr->buf;
- rdr->bend = rdr->b + n;
- if (rdr->eof = n < arrlen(rdr->buf), rdr->eof) {
- *rdr->bend++ = '\0';
- }
-
- return 0;
-}
-
-bio·FastaReader*
-bio·openfasta(io·Reader file, void *f, mem·Allocator heap, void *h)
-{
- error err;
- bio·FastaReader *rdr;
-
- rdr = heap.alloc(h, 1, sizeof(bio·FastaReader));
- rdr->file = file;
- rdr->f = f;
- rdr->eof = 0;
-
- rdr->seq = heap.alloc(h, 1, sizeof(*rdr->seq) + INIT_NM_SIZE + INIT_SQ_SIZE);
- rdr->seq->heap = heap;
- rdr->seq->h = h;
- rdr->seq->it = rdr->seq->b;
- rdr->seq->len = INIT_NM_SIZE + INIT_SQ_SIZE;
-
- if (err = fill(rdr), err) {
- errorf("fill: could not populate buffer");
- goto ERROR;
- }
-
- return rdr;
-
-ERROR:
- heap.free(h, rdr->seq);
- heap.free(h, rdr);
- return nil;
-}
-
-static
-error
-readfasta(bio·FastaReader *rdr, bio·Seq *seq, byte hdr, byte stop)
-{
- error err;
- byte *beg;
-
- if (rdr->eof && rdr->b == rdr->bend-1) {
- return EOF;
- }
-
- reset(rdr->seq);
-
- // NOTE: Can this case happen?
- assert(rdr->b != rdr->bend);
- if (*rdr->b++ != hdr) {
- errorf("fasta/q format: expected '%c', found '%c'", hdr, *rdr->b--);
- return 1;
- }
-
-NAME:
- beg = rdr->b;
- while (rdr->b != rdr->bend) {
- if (*rdr->b++ == '\n') {
- push(&rdr->seq, (rdr->b - 1) - beg, beg);
- goto SEQ;
- }
- }
-
- push(&rdr->seq, rdr->b - beg, beg);
-
- if (err = fill(rdr), err) {
- errorf("read: could not populate buffer");
- return 1;
- }
- goto NAME;
-
-SEQ:
- put(&rdr->seq, '\0');
- rdr->seq->off = rdr->seq->it - rdr->seq->b;
-
-SEQL:
- beg = rdr->b;
- while (rdr->b != rdr->bend) {
- if (*rdr->b == '\n') {
- push(&rdr->seq, rdr->b - beg, beg);
- beg = rdr->b + 1;
- }
-
- if (*rdr->b == stop || *rdr->b == '\0') {
- goto SUCCESS;
- }
-
- rdr->b++;
- }
-
- push(&rdr->seq, rdr->b - beg, beg);
-
- if (err = fill(rdr), err) {
- errorf("read: could not populate buffer");
- return 1;
- }
- goto SEQL;
-
-SUCCESS:
- push(&rdr->seq, rdr->b - beg, beg);
- put(&rdr->seq, '\0');
-
- return 0;
-}
-
-error
-bio·readfasta(bio·FastaReader *rdr, bio·Seq *seq)
-{
- error err;
-
- err = readfasta(rdr, seq, '>', '>');
- if (err && err != EOF) {
- errorf("parse fail: could not read sequence of record");
- return err;
- }
-
- seq->name = rdr->seq->b;
- seq->s = rdr->seq->b + rdr->seq->off;
- seq->len = rdr->seq->it - seq->s;
- seq->q = nil;
-
- return err;
-}
-
-error
-bio·closefasta(bio·FastaReader *rdr)
-{
- mem·Allocator heap;
- void *h;
-
- heap = rdr->seq->heap;
- h = rdr->seq->h;
-
- heap.free(h, rdr->seq);
- heap.free(h, rdr);
-
- return 0;
-}
-
-// -----------------------------------------------------------------------
-// Fastq files
-
-struct bio·FastqReader {
- struct bio·FastaReader;
-};
-
-bio·FastqReader*
-bio·openfastq(io·Reader file, void *f, mem·Allocator heap, void *h)
-{
- return (bio·FastqReader*)bio·openfasta(file, f, heap, h);
-}
-
-error
-bio·closefastq(bio·FastqReader *rdr)
-{
- return bio·closefasta((bio·FastaReader*)rdr);
-}
-
-error
-bio·readfastq(bio·FastqReader *rdr, bio·Seq *seq)
-{
- int n;
- byte *beg;
- error err;
-
- err = readfasta((bio·FastaReader*)rdr, seq, '@', '+');
- if (err) {
- errorf("parse fail: could not read sequence of record");
- return err;
- }
-
- seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off);
-
- if (*rdr->b++ != '+') {
- errorf("format error: no '+' character seperator found");
- return -1;
- }
-
-EATLN:
- while (rdr->b != rdr->bend) {
- if (*rdr->b++ == '\n') {
- n = 0;
- goto QUAL;
- }
- }
-
- if (err = fill((bio·FastaReader*)rdr), err) {
- errorf("read: could not populate buffer");
- return 1;
- }
- goto EATLN;
-
-QUAL:
- beg = rdr->b;
- while (rdr->b != rdr->bend) {
- if (*rdr->b == '\n') {
- push(&rdr->seq, rdr->b - beg, beg);
- beg = rdr->b + 1;
- }
-
- if (n++ == seq->len || *rdr->b == '\0') {
- err = *rdr->b == '\0' ? EOF : 0;
- goto SUCCESS;
- }
-
- rdr->b++;
- }
-
- push(&rdr->seq, rdr->b - beg, beg);
-
- if (err = fill((bio·FastaReader*)rdr), err) {
- errorf("read: could not populate buffer");
- return 1;
- }
- goto QUAL;
-
-
-SUCCESS:
- push(&rdr->seq, rdr->b - beg, beg);
- put(&rdr->seq, '\0');
-
- seq->name = rdr->seq->b;
- seq->s = rdr->seq->b + rdr->seq->off;
- seq->q = seq->s + seq->len + 1;
-
- return err;
-}