aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-04-25 14:13:13 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-04-25 14:13:13 -0700
commit98375c9c5e4bf26f8d238666d7233177dba787eb (patch)
tree32c23d1207e518935d8493ba7d5928d8f8ec2666 /sys
parent9ff2333d9fd84c3741bf71961532152976f8ddc7 (diff)
feat: parsing multiline fastq files
Diffstat (limited to 'sys')
-rw-r--r--sys/libbio/io/fasta.c98
-rw-r--r--sys/libbio/test.c59
2 files changed, 151 insertions, 6 deletions
diff --git a/sys/libbio/io/fasta.c b/sys/libbio/io/fasta.c
index e26fd70..bb6bfc7 100644
--- a/sys/libbio/io/fasta.c
+++ b/sys/libbio/io/fasta.c
@@ -167,7 +167,7 @@ ERROR:
static
error
-readfasta(bio·FastaReader *rdr, bio·Seq *seq, byte tok)
+readfasta(bio·FastaReader *rdr, bio·Seq *seq, byte hdr, byte stop)
{
error err;
byte *beg;
@@ -180,8 +180,8 @@ readfasta(bio·FastaReader *rdr, bio·Seq *seq, byte tok)
// NOTE: Can this case happen?
Assert(rdr->b != rdr->bend);
- if (*rdr->b++ != tok) {
- errorf("fasta format: expected '>', found '%c'", *rdr->b--);
+ if (*rdr->b++ != hdr) {
+ errorf("fasta/q format: expected '%c', found '%c'", hdr, *rdr->b--);
return 1;
}
@@ -214,7 +214,7 @@ SEQL:
beg = rdr->b + 1;
}
- if (*rdr->b == tok || *rdr->b == '\0') {
+ if (*rdr->b == stop || *rdr->b == '\0') {
goto SUCCESS;
}
@@ -241,9 +241,9 @@ bio·readfasta(bio·FastaReader *rdr, bio·Seq *seq)
{
error err;
- err = readfasta(rdr, seq, '>');
+ err = readfasta(rdr, seq, '>', '>');
if (err && err != EOF) {
- errorf("parse fail: could not read record");
+ errorf("parse fail: could not read sequence of record");
return err;
}
@@ -272,3 +272,89 @@ bio·closefasta(bio·FastaReader *rdr)
// -----------------------------------------------------------------------
// Fastq files
+
+struct bio·FastqReader {
+ struct bio·FastaReader;
+};
+
+bio·FastqReader*
+bio·openfastq(io·Reader file, void *f, mem·Allocator heap, void *h)
+{
+ return (bio·FastqReader*)bio·openfasta(file, f, heap, h);
+}
+
+error
+bio·closefastq(bio·FastqReader *rdr)
+{
+ return bio·closefasta((bio·FastaReader*)rdr);
+}
+
+error
+bio·readfastq(bio·FastqReader *rdr, bio·Seq *seq)
+{
+ int n;
+ byte *beg;
+ error err;
+
+ err = readfasta((bio·FastaReader*)rdr, seq, '@', '+');
+ if (err) {
+ errorf("parse fail: could not read sequence of record");
+ return err;
+ }
+
+ seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off);
+
+ if (*rdr->b++ != '+') {
+ errorf("format error: no '+' character seperator found");
+ return -1;
+ }
+
+EATLN:
+ while (rdr->b != rdr->bend) {
+ if (*rdr->b++ == '\n') {
+ n = 0;
+ goto QUAL;
+ }
+ }
+
+ if (err = fill((bio·FastaReader*)rdr), err) {
+ errorf("read: could not populate buffer");
+ return 1;
+ }
+ goto EATLN;
+
+QUAL:
+ beg = rdr->b;
+ while (rdr->b != rdr->bend) {
+ if (*rdr->b == '\n') {
+ push(&rdr->seq, rdr->b - beg, beg);
+ beg = rdr->b + 1;
+ }
+
+ if (n++ == seq->len || *rdr->b == '\0') {
+ err = *rdr->b == '\0' ? EOF : 0;
+ goto SUCCESS;
+ }
+
+ rdr->b++;
+ }
+
+ push(&rdr->seq, rdr->b - beg, beg);
+
+ if (err = fill((bio·FastaReader*)rdr), err) {
+ errorf("read: could not populate buffer");
+ return 1;
+ }
+ goto QUAL;
+
+
+SUCCESS:
+ push(&rdr->seq, rdr->b - beg, beg);
+ put(&rdr->seq, '\0');
+
+ seq->name = rdr->seq->b;
+ seq->s = rdr->seq->b + rdr->seq->off;
+ seq->q = seq->s + seq->len + 1;
+
+ return err;
+}
diff --git a/sys/libbio/test.c b/sys/libbio/test.c
index f301ead..e943153 100644
--- a/sys/libbio/test.c
+++ b/sys/libbio/test.c
@@ -3,6 +3,16 @@
#include <libbio.h>
#include <time.h>
+#include "kseq.h"
+
+static
+int
+my_read(Stream *s, void *buf, int n)
+{
+ return io·read(s, 1, n, buf);
+}
+
+KSEQ_INIT(Stream*, my_read)
// -----------------------------------------------------------------------
// Point of entry for testing
@@ -31,6 +41,7 @@ test·newick()
err = bio·writenewick(t, wtr, fd[1]);
io·flush(fd[1]);
+
io·close(fd[0]);
io·close(fd[1]);
@@ -86,6 +97,50 @@ test·fasta()
}
error
+test·fastq()
+{
+ error err;
+ Stream *fd;
+
+ bio·Seq seq;
+ bio·FastqReader *rdr;
+
+ clock_t t;
+
+ int n, slen;
+ kseq_t *kseq;
+
+ fd = io·open("/home/nolln/root/data/test/eg.fq", "r");
+
+ t = clock();
+ kseq = kseq_init(fd);
+ while (kseq_read(kseq) >= 0) {
+ ++n, slen += kseq->seq.l;
+ }
+ t = clock() - t;
+ printf("heng's fastq code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC);
+
+ kseq_destroy(kseq);
+
+ io·seek(fd, 0, seek·set);
+
+ rdr = bio·openfastq((io·Reader){.read = &io·read}, fd, mem·sys, nil);
+
+ t = clock();
+ err = 0;
+ while (!err) {
+ err = bio·readfastq(rdr, &seq);
+ }
+ t = clock() - t;
+ printf("nick's fastq code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC);
+ bio·closefastq(rdr);
+
+
+ io·close(fd);
+ return err <= 0 ? 0 : 1;
+}
+
+error
main()
{
error err;
@@ -96,5 +151,9 @@ main()
if (err = test·fasta(), err) {
errorf("test fail: fasta");
}
+
+ if (err = test·fastq(), err) {
+ errorf("test fail: fastq");
+ }
}