aboutsummaryrefslogtreecommitdiff
path: root/scripts/filter_plasmids.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/filter_plasmids.py')
-rwxr-xr-xscripts/filter_plasmids.py57
1 files changed, 57 insertions, 0 deletions
diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py
new file mode 100755
index 0000000..c309150
--- /dev/null
+++ b/scripts/filter_plasmids.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""
+script to filter plasmids and chromosomes from full genome assemblies
+"""
+
+import os
+import sys
+import gzip
+import builtins
+import argparse
+
+from glob import glob
+
+sys.path.insert(0, os.path.abspath('.')) # gross hack
+from pangraph.utils import parse_fasta, breakpoint
+
+def open(path, *args, **kwargs):
+ if path.endswith('.gz'):
+ return gzip.open(path, *args, **kwargs)
+ else:
+ return builtins.open(path, *args, **kwargs)
+
+def main(dirs, plasmids=True):
+ for d in dirs:
+ in_dir = f"data/{d}/assemblies"
+ if not os.path.exists(in_dir):
+ print(f"{in_dir} doesn't exist. skipping...")
+ continue
+
+ if plasmids:
+ out_dir = f"data/{d}-plasmid/assemblies"
+ else:
+ out_dir = f"data/{d}-chromosome/assemblies"
+
+ if not os.path.exists(out_dir):
+ os.makedirs(out_dir)
+
+ for path in glob(f"{in_dir}/*.f?a*"):
+ with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr:
+ for i, rec in enumerate(parse_fasta(fd)):
+ if i == 0:
+ if not plasmids:
+ wtr.write(str(rec))
+ wtr.write('\n')
+ break
+ continue
+
+ wtr.write(str(rec))
+ wtr.write('\n')
+
+parser = argparse.ArgumentParser(description='seperate plasmids from chromosomes')
+parser.add_argument('directories', metavar='dirs', nargs='+')
+parser.add_argument('--chromosomes', default=False, action='store_true')
+
+if __name__ == "__main__":
+ args = parser.parse_args()
+ main(args.directories, plasmids=not args.chromosomes)