From 98375c9c5e4bf26f8d238666d7233177dba787eb Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Sat, 25 Apr 2020 14:13:13 -0700 Subject: feat: parsing multiline fastq files --- sys/libbio/io/fasta.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++---- sys/libbio/test.c | 59 +++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+), 6 deletions(-) (limited to 'sys') diff --git a/sys/libbio/io/fasta.c b/sys/libbio/io/fasta.c index e26fd70..bb6bfc7 100644 --- a/sys/libbio/io/fasta.c +++ b/sys/libbio/io/fasta.c @@ -167,7 +167,7 @@ ERROR: static error -readfasta(bio·FastaReader *rdr, bio·Seq *seq, byte tok) +readfasta(bio·FastaReader *rdr, bio·Seq *seq, byte hdr, byte stop) { error err; byte *beg; @@ -180,8 +180,8 @@ readfasta(bio·FastaReader *rdr, bio·Seq *seq, byte tok) // NOTE: Can this case happen? Assert(rdr->b != rdr->bend); - if (*rdr->b++ != tok) { - errorf("fasta format: expected '>', found '%c'", *rdr->b--); + if (*rdr->b++ != hdr) { + errorf("fasta/q format: expected '%c', found '%c'", hdr, *rdr->b--); return 1; } @@ -214,7 +214,7 @@ SEQL: beg = rdr->b + 1; } - if (*rdr->b == tok || *rdr->b == '\0') { + if (*rdr->b == stop || *rdr->b == '\0') { goto SUCCESS; } @@ -241,9 +241,9 @@ bio·readfasta(bio·FastaReader *rdr, bio·Seq *seq) { error err; - err = readfasta(rdr, seq, '>'); + err = readfasta(rdr, seq, '>', '>'); if (err && err != EOF) { - errorf("parse fail: could not read record"); + errorf("parse fail: could not read sequence of record"); return err; } @@ -272,3 +272,89 @@ bio·closefasta(bio·FastaReader *rdr) // ----------------------------------------------------------------------- // Fastq files + +struct bio·FastqReader { + struct bio·FastaReader; +}; + +bio·FastqReader* +bio·openfastq(io·Reader file, void *f, mem·Allocator heap, void *h) +{ + return (bio·FastqReader*)bio·openfasta(file, f, heap, h); +} + +error +bio·closefastq(bio·FastqReader *rdr) +{ + return bio·closefasta((bio·FastaReader*)rdr); +} + +error +bio·readfastq(bio·FastqReader *rdr, bio·Seq *seq) +{ + int n; + byte *beg; + error err; + + err = readfasta((bio·FastaReader*)rdr, seq, '@', '+'); + if (err) { + errorf("parse fail: could not read sequence of record"); + return err; + } + + seq->len = rdr->seq->it - (rdr->seq->b + rdr->seq->off); + + if (*rdr->b++ != '+') { + errorf("format error: no '+' character seperator found"); + return -1; + } + +EATLN: + while (rdr->b != rdr->bend) { + if (*rdr->b++ == '\n') { + n = 0; + goto QUAL; + } + } + + if (err = fill((bio·FastaReader*)rdr), err) { + errorf("read: could not populate buffer"); + return 1; + } + goto EATLN; + +QUAL: + beg = rdr->b; + while (rdr->b != rdr->bend) { + if (*rdr->b == '\n') { + push(&rdr->seq, rdr->b - beg, beg); + beg = rdr->b + 1; + } + + if (n++ == seq->len || *rdr->b == '\0') { + err = *rdr->b == '\0' ? EOF : 0; + goto SUCCESS; + } + + rdr->b++; + } + + push(&rdr->seq, rdr->b - beg, beg); + + if (err = fill((bio·FastaReader*)rdr), err) { + errorf("read: could not populate buffer"); + return 1; + } + goto QUAL; + + +SUCCESS: + push(&rdr->seq, rdr->b - beg, beg); + put(&rdr->seq, '\0'); + + seq->name = rdr->seq->b; + seq->s = rdr->seq->b + rdr->seq->off; + seq->q = seq->s + seq->len + 1; + + return err; +} diff --git a/sys/libbio/test.c b/sys/libbio/test.c index f301ead..e943153 100644 --- a/sys/libbio/test.c +++ b/sys/libbio/test.c @@ -3,6 +3,16 @@ #include #include +#include "kseq.h" + +static +int +my_read(Stream *s, void *buf, int n) +{ + return io·read(s, 1, n, buf); +} + +KSEQ_INIT(Stream*, my_read) // ----------------------------------------------------------------------- // Point of entry for testing @@ -31,6 +41,7 @@ test·newick() err = bio·writenewick(t, wtr, fd[1]); io·flush(fd[1]); + io·close(fd[0]); io·close(fd[1]); @@ -85,6 +96,50 @@ test·fasta() return err <= 0 ? 0 : 1; } +error +test·fastq() +{ + error err; + Stream *fd; + + bio·Seq seq; + bio·FastqReader *rdr; + + clock_t t; + + int n, slen; + kseq_t *kseq; + + fd = io·open("/home/nolln/root/data/test/eg.fq", "r"); + + t = clock(); + kseq = kseq_init(fd); + while (kseq_read(kseq) >= 0) { + ++n, slen += kseq->seq.l; + } + t = clock() - t; + printf("heng's fastq code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC); + + kseq_destroy(kseq); + + io·seek(fd, 0, seek·set); + + rdr = bio·openfastq((io·Reader){.read = &io·read}, fd, mem·sys, nil); + + t = clock(); + err = 0; + while (!err) { + err = bio·readfastq(rdr, &seq); + } + t = clock() - t; + printf("nick's fastq code took %f ms to execute\n", 1000.*t/CLOCKS_PER_SEC); + bio·closefastq(rdr); + + + io·close(fd); + return err <= 0 ? 0 : 1; +} + error main() { @@ -96,5 +151,9 @@ main() if (err = test·fasta(), err) { errorf("test fail: fasta"); } + + if (err = test·fastq(), err) { + errorf("test fail: fastq"); + } } -- cgit v1.2.1