From c0a7b53baf2a6e7bf9bc1fbec7ac05e43ac59154 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 8 Sep 2021 15:51:40 -0700 Subject: checkin --- sys/libbio/fasta.c | 95 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 30 deletions(-) (limited to 'sys/libbio/fasta.c') diff --git a/sys/libbio/fasta.c b/sys/libbio/fasta.c index 6b40e99..078325a 100644 --- a/sys/libbio/fasta.c +++ b/sys/libbio/fasta.c @@ -40,8 +40,8 @@ grow(struct SeqBuf **sb, int min) newcap = MAX(16, MAX(1 + 2*(*sb)->cap, (*sb)->cap+min)); assert(newcap >= (*sb)->cap+min); - if (new = mem.alloc(heap, 1, sizeof(*new)+newcap), !new) { - errorf("memory: could not allocate new buffer"); + if(new = mem.alloc(heap, 1, sizeof(*new)+newcap), !new) { + errorf("memory: could not allocate new buffer\n"); return 1; } @@ -63,13 +63,13 @@ put(struct SeqBuf **sb, byte c) struct SeqBuf *sq; sq = *sb; - if (sq->it < (sq->b + sq->cap)) { + if(sq->it < (sq->b + sq->cap)) { *sq->it++ = c; return 0; } - if (err = grow(sb, 1), err) { - errorf("memory fail: could not allocate more buffer"); + if(err = grow(sb, 1), err) { + errorf("memory fail: could not allocate more buffer\n"); sq->mem.free(sq->heap, sq); return 1; } @@ -85,21 +85,11 @@ push(struct SeqBuf **sb, int n, void *buf) int d, err; struct SeqBuf *seq; - char *cb = buf; - for(d=0; d < n; d++) { - if(cb[d] == 0) { - printf("ERROR: zero byte being copied @ pos %d/%d\n", d, n); - printf("ERROR: string afterwards is %s\n", cb+n+1); - printf("ERROR: string afterwards is %s\n", cb+n+2); - exit(1); - } - } - seq = *sb; - if(d = seq->cap - (seq->it - seq->b), d < n) { - assert(d > 0); + if(d = (seq->cap - (seq->it - seq->b)), d < n) { + assert(d >= 0); if (err = grow(sb, n-d), err) { - errorf("memory fail: could not allocate more buffer"); + errorf("memory fail: could not allocate more buffer\n"); seq->mem.free(seq->heap, seq); return 1; } @@ -137,7 +127,7 @@ fill(bio·SeqReader *rdr) n = rdr->rdr.read(rdr->io, 1, arrlen(rdr->buf), rdr->buf); if(n < 0) { - errorf("read: no data obtained from reader"); + errorf("read: no data obtained from reader\n"); return 1; } rdr->b = rdr->buf; @@ -166,7 +156,7 @@ bio·openseq(io·Reader rdr, void *io, mem·Allocator mem, void *heap) r->seq->cap = INIT_NM_SIZE + INIT_SQ_SIZE; if (err=fill(r), err) { - errorf("fill: could not populate buffer"); + errorf("fill: could not populate buffer\n"); goto ERROR; } @@ -209,7 +199,7 @@ readfasta(bio·SeqReader *rdr, bio·Seq *seq, byte hdr, byte stop) // NOTE: Can this case happen? assert(rdr->b != rdr->bend); if(*rdr->b++ != hdr) { - errorf("fasta/q format: expected '%c', found '%c'", hdr, *rdr->b--); + errorf("fasta/q format: expected '%c', found '%c'\n", hdr, *rdr->b--); return 1; } @@ -224,7 +214,7 @@ NAME: push(&rdr->seq, rdr->b - beg, beg); if(err=fill(rdr), err) { - errorf("read: could not populate buffer"); + errorf("read: could not populate buffer\n"); return 1; } goto NAME; @@ -246,16 +236,18 @@ SEQLOOP: rdr->b++; } +#if 0 for(byte *cb = rdr->seq->b+rdr->seq->off; cb != rdr->seq->it; ++cb) { if(*cb == 0) { printf("ERROR @ pos=%ld: Found zero byte\n", cb - rdr->seq->b+rdr->seq->off); } } +#endif push(&rdr->seq, rdr->b - beg, beg); if(err=fill(rdr), err) { - errorf("read: could not populate buffer"); + errorf("read: could not populate buffer\n"); return 1; } goto SEQLOOP; @@ -278,13 +270,13 @@ bio·readfasta(bio·SeqReader *rdr, bio·Seq *seq) err = readfasta(rdr, seq, '>', '>'); if(err && err != EOF) { - errorf("parse fail: could not read sequence of record"); + errorf("parse fail: could not read sequence of record\n"); return err; } seq->name = rdr->seq->b; seq->s = rdr->seq->b + rdr->seq->off; - seq->len = rdr->seq->it - seq->s - 1; + seq->len = rdr->seq->it - seq->s - 1; // shift by 1 as we pushed a '0' to end seq->q = nil; return err; @@ -303,14 +295,14 @@ bio·readfastq(bio·SeqReader *rdr, bio·Seq *seq) err = readfasta(rdr, seq, '@', '+'); if(err) { - errorf("parse fail: could not read sequence of record"); + errorf("parse fail: could not read sequence of record\n"); return err; } seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off); if(*rdr->b++ != '+') { - errorf("format error: no '+' character seperator found"); + errorf("format error: no '+' character seperator found\n"); return -1; } @@ -323,7 +315,7 @@ EATLN: } if(err = fill((bio·SeqReader*)rdr), err) { - errorf("read: could not populate buffer"); + errorf("read: could not populate buffer\n"); return 1; } goto EATLN; @@ -347,7 +339,7 @@ QUAL: push(&rdr->seq, rdr->b - beg, beg); if(err = fill((bio·SeqReader*)rdr), err) { - errorf("read: could not populate buffer"); + errorf("read: could not populate buffer\n"); return 1; } goto QUAL; @@ -358,8 +350,51 @@ SUCCESS: put(&rdr->seq, '\0'); seq->name = rdr->seq->b; - seq->s = rdr->seq->b + rdr->seq->off; + seq->s = rdr->seq->b + rdr->seq->off - 1; seq->q = seq->s + seq->len + 1; return err; } + +// ----------------------------------------------------------------------- +// sequence writing + +error +bio·writefasta(io·Writer io, void *wtr, bio·Seq seq) +{ + int i, j, d; + char buf[2048], *b = buf, *e = arrend(buf); + + *b++ = '>'; + while(*seq.name) { + *b++ = *seq.name++; + if(b == e) { + io.write(wtr, 1, arrlen(buf), buf); + b = buf; + } + } + + for(i=0; i