diff options
author | Nicholas Noll <nbnoll@eml.cc> | 2020-08-19 11:54:29 -0700 |
---|---|---|
committer | Nicholas Noll <nbnoll@eml.cc> | 2020-08-19 11:54:29 -0700 |
commit | bc17c2a9a5cd218180d272061d492dcbbf566144 (patch) | |
tree | b38c3e52a8c4b381aa57fab3befbddff05b4bc28 | |
parent | e01b49284da3d8384744695b7a5aeaa37bb47d4f (diff) |
feat: added code to filter plasmids from chromosomes
-rwxr-xr-x | scripts/filter_plasmids.py | 43 |
1 files changed, 43 insertions, 0 deletions
#!/usr/bin/env python3
"""
script to filter plasmids and chromosomes from full genome assemblies

NOTE: the filtering entry point is not implemented yet. Run as a script, this
file currently only times Biopython's FASTA parser against pangraph's
`parse_fasta` on the gzipped assemblies matched by `ASSEMBLIES`.
"""

import os
import sys
import gzip
import builtins

from glob import glob
from time import perf_counter, time  # noqa: F401 -- `time` kept: pre-existing module-level name

sys.path.insert(0, os.path.abspath('.')) # gross hack
from pangraph.utils import parse_fasta, breakpoint

from Bio import SeqIO

# Glob pattern (relative to the working directory) of the assemblies to parse.
ASSEMBLIES = "data/staph/assemblies/*.fna.gz"


def open(path, *args, **kwargs):
    """
    Open `path`, dispatching to `gzip.open` when its name ends in '.gz'.

    Intentionally shadows the builtin inside this module. All remaining
    arguments are forwarded untouched to whichever opener is selected.

    Besides `str`, `path` may be `bytes`, an `os.PathLike`, or an integer file
    descriptor; anything that is not a path (e.g. a descriptor) is handed to
    `builtins.open`.

    NOTE: the two openers do not share a default mode -- `gzip.open` defaults
    to binary ('rb') while `builtins.open` defaults to text ('r') -- so pass
    the mode explicitly (e.g. 'rt') to get the same kind of stream from both.
    """
    if isinstance(path, (str, bytes, os.PathLike)):
        name = os.fsdecode(path)
    else:
        name = ""

    opener = gzip.open if name.endswith('.gz') else builtins.open
    return opener(path, *args, **kwargs)


def _benchmark(parse, paths):
    """
    Return the seconds spent parsing every file in `paths` with `parse`.

    `parse` is called with an open text-mode file handle and must return an
    iterable of records.
    """
    start = perf_counter()
    for path in paths:
        with open(path, 'rt') as fd:
            # Materialize the records so that a lazy parser is forced to read
            # the whole file; the records themselves are not needed.
            list(parse(fd))
    return perf_counter() - start


def _compare_parsers(pattern=ASSEMBLIES):
    """
    Time both FASTA parsers over the files matching `pattern` and print the results.

    Exits with an error message when nothing matches, since timings over zero
    files would be meaningless.
    """
    # Resolve the inputs once, in a fixed order, so both parsers are handed
    # exactly the same files in the same sequence.
    paths = sorted(glob(pattern))
    if not paths:
        sys.exit(f"no assemblies matched {pattern!r}; nothing to benchmark")

    # NOTE(review): whichever parser runs first may also pay for a cold file
    # cache -- confirm before drawing conclusions from a single run.
    elapsed = _benchmark(lambda fd: SeqIO.parse(fd, 'fasta'), paths)
    print(f"bio parser took {elapsed} seconds")

    elapsed = _benchmark(parse_fasta, paths)
    print(f"my parser took {elapsed} seconds")


# TODO: implement `main(args)` (the plasmid/chromosome filter, iterating over
# the paths given on the command line) and call it with `sys.argv[1:]` below.

if __name__ == "__main__":
    _compare_parsers()