diff options
author | Nicholas Noll <nbnoll@eml.cc> | 2020-08-19 12:05:56 -0700 |
---|---|---|
committer | Nicholas Noll <nbnoll@eml.cc> | 2020-08-19 12:05:56 -0700 |
commit | 8d5626243ce5560161534cba114301fd3a4dd382 (patch) | |
tree | 40ad227789f32c7634e29c5da603265653976870 | |
parent | bc17c2a9a5cd218180d272061d492dcbbf566144 (diff) |
fix: remove new lines from input stream
-rw-r--r-- | pangraph/utils.py | 6 | ||||
-rwxr-xr-x | scripts/filter_plasmids.py | 19 |
2 files changed, 10 insertions, 15 deletions
diff --git a/pangraph/utils.py b/pangraph/utils.py
index 3e553a0..e4b63f3 100644
--- a/pangraph/utils.py
+++ b/pangraph/utils.py
@@ -174,7 +174,9 @@ def parse_fasta(fh):
             self.meta = meta

         def __str__(self):
-            return f">{self.name} {self.meta}\n{self.seq[:77]}...\n"
+            NL = '\n'
+            nc = 80
+            return f">{self.name} {self.meta}\n{NL.join([self.seq[i:(i+nc)] for i in range(0, len(self.seq), nc)])}"

         def __repr__(self):
             return str(self)
@@ -187,7 +189,7 @@ def parse_fasta(fh):
             line = as_str(line)
             if line == "" or line[0] == ">":
                 break
-            seq.write(line)
+            seq.write(line[:-1])

         header = as_str(line)
         yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq.getvalue())
diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py
index 9461451..4ef2670 100755
--- a/scripts/filter_plasmids.py
+++ b/scripts/filter_plasmids.py
@@ -13,8 +13,6 @@ from glob import glob
 sys.path.insert(0, os.path.abspath('.')) # gross hack
 from pangraph.utils import parse_fasta, breakpoint

-from Bio import SeqIO
-
 def open(path, *args, **kwargs):
     if path.endswith('.gz'):
         return gzip.open(path, *args, **kwargs)
@@ -26,18 +24,13 @@ def open(path, *args, **kwargs):
 from time import time

 if __name__ == "__main__":
-    t0 = time()
-    for path in glob("data/staph/assemblies/*.fna.gz"):
-        with open(path, 'rt') as fd:
-            seqs = [record for record in SeqIO.parse(fd, 'fasta')]
-    t1 = time()
-    print(f"bio parser took {t1 - t0} seconds")
-
-    t0 = time()
     for path in glob("data/staph/assemblies/*.fna.gz"):
-        with open(path, 'rt') as fd:
+        with open(path, 'rt') as fd, open("test.fa", 'w') as wtr:
+            for rec in parse_fasta(fd):
+                # print(str(rec))
+                wtr.write(str(rec))
+                wtr.write('\n')
             seqs = [record for record in parse_fasta(fd)]
-    t1 = time()
-    print(f"my parser took {t1 - t0} seconds")
+        break

     # main(sys.argv[1:])