about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nicholas Noll <nbnoll@eml.cc> 2020-08-19 12:05:56 -0700
committer Nicholas Noll <nbnoll@eml.cc> 2020-08-19 12:05:56 -0700
commit 8d5626243ce5560161534cba114301fd3a4dd382 (patch)
tree 40ad227789f32c7634e29c5da603265653976870
parent bc17c2a9a5cd218180d272061d492dcbbf566144 (diff)
fix: remove new lines from input stream
-rw-r--r-- pangraph/utils.py 6
-rwxr-xr-x scripts/filter_plasmids.py 19
2 files changed, 10 insertions, 15 deletions
diff --git a/pangraph/utils.py b/pangraph/utils.py
index 3e553a0..e4b63f3 100644
--- a/pangraph/utils.py
+++ b/pangraph/utils.py
@@ -174,7 +174,9 @@ def parse_fasta(fh):
self.meta = meta
def __str__(self):
- return f">{self.name} {self.meta}\n{self.seq[:77]}...\n"
+ NL = '\n'
+ nc = 80
+ return f">{self.name} {self.meta}\n{NL.join([self.seq[i:(i+nc)] for i in range(0, len(self.seq), nc)])}"
def __repr__(self):
return str(self)
@@ -187,7 +189,7 @@ def parse_fasta(fh):
line = as_str(line)
if line == "" or line[0] == ">":
break
- seq.write(line)
+ seq.write(line[:-1])
header = as_str(line)
yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq.getvalue())
diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py
index 9461451..4ef2670 100755
--- a/scripts/filter_plasmids.py
+++ b/scripts/filter_plasmids.py
@@ -13,8 +13,6 @@ from glob import glob
sys.path.insert(0, os.path.abspath('.')) # gross hack
from pangraph.utils import parse_fasta, breakpoint
-from Bio import SeqIO
-
def open(path, *args, **kwargs):
if path.endswith('.gz'):
return gzip.open(path, *args, **kwargs)
@@ -26,18 +24,13 @@ def open(path, *args, **kwargs):
from time import time
if __name__ == "__main__":
- t0 = time()
- for path in glob("data/staph/assemblies/*.fna.gz"):
- with open(path, 'rt') as fd:
- seqs = [record for record in SeqIO.parse(fd, 'fasta')]
- t1 = time()
- print(f"bio parser took {t1 - t0} seconds")
-
- t0 = time()
for path in glob("data/staph/assemblies/*.fna.gz"):
- with open(path, 'rt') as fd:
+ with open(path, 'rt') as fd, open("test.fa", 'w') as wtr:
+ for rec in parse_fasta(fd):
+ # print(str(rec))
+ wtr.write(str(rec))
+ wtr.write('\n')
seqs = [record for record in parse_fasta(fd)]
- t1 = time()
- print(f"my parser took {t1 - t0} seconds")
+ break
# main(sys.argv[1:])