aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-08-19 11:54:29 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-08-19 11:54:29 -0700
commitbc17c2a9a5cd218180d272061d492dcbbf566144 (patch)
treeb38c3e52a8c4b381aa57fab3befbddff05b4bc28
parente01b49284da3d8384744695b7a5aeaa37bb47d4f (diff)
feat: added code to filter plasmids from chromosomes
-rwxr-xr-xscripts/filter_plasmids.py43
1 files changed, 43 insertions, 0 deletions
diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py
new file mode 100755
index 0000000..9461451
--- /dev/null
+++ b/scripts/filter_plasmids.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+script to filter plasmids and chromosomes from full genome assemblies
+"""
+
+import os
+import sys
+import gzip
+import builtins
+
+from glob import glob
+
+sys.path.insert(0, os.path.abspath('.')) # gross hack
+from pangraph.utils import parse_fasta, breakpoint
+
+from Bio import SeqIO
+
+def open(path, *args, **kwargs):
+    """Open `path`, going through gzip when the name ends in '.gz'."""
+    # Deliberately shadows the builtin; every other path uses builtins.open.
+    opener = gzip.open if path.endswith('.gz') else builtins.open
+    return opener(path, *args, **kwargs)
+
+# def main(args):
+# for d in args:
+
+from time import perf_counter
+if __name__ == "__main__":
+    # Ad-hoc benchmark: Biopython's SeqIO vs pangraph's parse_fasta, timed with the monotonic perf_counter.
+    t0 = perf_counter()
+    for path in glob("data/staph/assemblies/*.fna.gz"):
+        with open(path, 'rt') as fd:
+            seqs = [record for record in SeqIO.parse(fd, 'fasta')]
+    t1 = perf_counter()
+    print(f"bio parser took {t1 - t0} seconds")
+    # NOTE(review): this pass re-reads the same files, so it may see a warmer file cache -- confirm.
+    t0 = perf_counter()
+    for path in glob("data/staph/assemblies/*.fna.gz"):
+        with open(path, 'rt') as fd:
+            seqs = [record for record in parse_fasta(fd)]
+    t1 = perf_counter()
+    print(f"my parser took {t1 - t0} seconds")
+    # main(sys.argv[1:])