Merge pull request #5 from nnoll/feat/fasta-parser

feat/fasta parser
author: Nicholas Noll <nbnoll@eml.cc> 2020-08-20 10:41:41 -0700
committer: GitHub <noreply@github.com> 2020-08-20 10:41:41 -0700
commit: 001aaaeaaed4847bff81f9c5373d5d5161d0e3ad (patch)
tree: d317c74f94b0f14a997a6d083c191e7850a47ceb
parent: bd21a4412a3e35f32ad84cf19328b7c89e0ed6e5 (diff)
parent: 80cede9edef586bbf68258f04ec3be8371afcc0c (diff)
5 files changed, 116 insertions, 16 deletions
diff --git a/Makefile b/Makefile
index 117dbc9..ab398b6 100644
--- a/Makefile
+++ b/Makefile
@@ -55,7 +55,7 @@ staph:
 	@echo "cluster    staph"; \
 	pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz
 	@echo "build      staph"; \
-	pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null
+	pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null
 
 # figures
 
diff --git a/pangraph/build.py b/pangraph/build.py
index 4577732..5c3a8ec 100644
--- a/pangraph/build.py
+++ b/pangraph/build.py
@@ -38,6 +38,16 @@ def register_args(parser):
                         type=int,
                         default=22,
                         help="energy cost for mutations (used during block merges)")
+    parser.add_argument("-w", "--window",
+                        metavar="edge window",
+                        type=int,
+                        default=1000,
+                        help="amount of sequence to align from for end repair")
+    parser.add_argument("-e", "--extend",
+                        metavar="edge extend",
+                        type=int,
+                        default=1000,
+                        help="amount of sequence to extend for end repair")
     parser.add_argument("-s", "--statistics",
                         default=False,
                         action='store_true',
@@ -71,12 +81,10 @@ def main(args):
     mkdir(tmp)
 
     log("aligning")
-    with cProfile.Profile() as pr:
-        T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics)
-        # TODO: when debugging phase is done, remove tmp directory
+    T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics)
+    # TODO: when debugging phase is done, remove tmp directory
 
-        graphs = T.collect()
-    pr.dump_stats("perf.prof")
+    graphs = T.collect()
 
     for i, g in enumerate(graphs):
         log(f"graph {i}: nseqs: {len(g.seqs)} nblks: {len(g.blks)}")
diff --git a/pangraph/graph.py b/pangraph/graph.py
index 1a67c16..83ef839 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -21,8 +21,8 @@ from .utils    import Strand, as_string, parse_paf, panic, as_record, new_strand
 # ------------------------------------------------------------------------
 # globals
 
-WINDOW = 1000
-EXTEND = 2500
+# WINDOW = 1000
+# EXTEND = 2500
 pp = pprint.PrettyPrinter(indent=4)
 
 # ------------------------------------------------------------------------
@@ -140,7 +140,7 @@ class Graph(object):
         graphs, names = [], []
         for name, path in G.seqs.items():
             blks = set([b.id for b in path.blocks()])
-            gi   = [ i for i, g in enumerate(graphs) if overlaps(blks, g)]
+            gi   = [i for i, g in enumerate(graphs) if overlaps(blks, g)]
             if len(gi) == 0:
                 graphs.append(blks)
                 names.append(set([name]))
@@ -163,7 +163,7 @@ class Graph(object):
     # ---------------
     # methods
 
-    def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False):
+    def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False):
         from seqanpy import align_global as align
 
         # ----------------------------------
@@ -324,7 +324,7 @@ class Graph(object):
                 continue
 
             merged = True
-            self.merge(proc(hit))
+            self.merge(proc(hit), edge_window, edge_extend)
             merged_blks.add(hit['ref']['name'])
             merged_blks.add(hit['qry']['name'])
 
@@ -414,7 +414,7 @@ class Graph(object):
             blks.update(path.blocks())
         self.blks = {b.id:self.blks[b.id] for b in blks}
 
-    def merge(self, hit):
+    def merge(self, hit, window, extend):
         old_ref = self.blks[hit['ref']['name']]
         old_qry = self.blks[hit['qry']['name']]
 
@@ -475,7 +475,7 @@ class Graph(object):
             # NOTE: this is a hack to deal with flipped orientations
             pos  = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0])
             beg, end  = pos[0], pos[-1]
-            blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
+            blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend]
             if first:
                 blk_list = set([b.id for b in blks])
                 first = False
@@ -489,7 +489,7 @@ class Graph(object):
             except:
                 breakpoint("bad find")
             beg, end  = pos[0], pos[-1]
-            blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
+            blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend]
             blk_list.intersection_update(set([b.id for b in blks]))
             num_seqs += 1
 
@@ -507,10 +507,10 @@ class Graph(object):
                         pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0])
                         beg, end = pos[0], pos[-1]
 
-                        for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]):
+                        for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]):
                             tmp[n].write(f">isolate_{i:04d}\n")
                             s = self.seqs[tag[0]].sequence_range(left,right)
-                            if len(s) > EXTEND + WINDOW:
+                            if len(s) > extend + window:
                                 breakpoint(f"bad sequence slicing: {len(s)}")
                             tmp[n].write(s + '\n')
 
diff --git a/pangraph/utils.py b/pangraph/utils.py
index 5d5e888..e4b63f3 100644
--- a/pangraph/utils.py
+++ b/pangraph/utils.py
@@ -3,6 +3,7 @@ import csv
 import gzip
 import numpy as np
 
+from io   import StringIO
 from enum import IntEnum
 
 from Bio import SeqIO
@@ -157,9 +158,43 @@ def getnwk(node, newick, parentdist, leaf_names):
         newick = "(%s" % (newick)
         return newick
 
+def as_str(s):
+    if isinstance(s, bytes):
+        return s.decode('utf-8')
+    return s
+
 # ------------------------------------------------------------------------
 # parsers
 
+def parse_fasta(fh):
+    class Record:
+        def __init__(self, name=None, meta=None, seq=None):
+            self.seq  = seq
+            self.name = name
+            self.meta = meta
+
+        def __str__(self):
+            NL = '\n'
+            nc = 80
+            return f">{self.name} {self.meta}\n{NL.join([self.seq[i:(i+nc)] for i in range(0, len(self.seq), nc)])}"
+
+        def __repr__(self):
+            return str(self)
+
+    header = as_str(fh.readline())
+    while header != "" and header[0] == ">":
+        name = header[1:].split()
+        seq  = StringIO()
+        for line in fh:
+            line = as_str(line)
+            if line == "" or line[0] == ">":
+                break
+            seq.write(line[:-1])
+
+        header = as_str(line)
+        yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq.getvalue())
+        seq.close()
+
 def parse_paf(fh):
     hits = []
     for line in fh:
diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py
new file mode 100755
index 0000000..c309150
--- /dev/null
+++ b/scripts/filter_plasmids.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""
+script to filter plasmids and chromosomes from full genome assemblies
+"""
+
+import os
+import sys
+import gzip
+import builtins
+import argparse
+
+from glob import glob
+
+sys.path.insert(0, os.path.abspath('.')) # gross hack
+from pangraph.utils import parse_fasta, breakpoint
+
+def open(path, *args, **kwargs):
+    if path.endswith('.gz'):
+        return gzip.open(path, *args, **kwargs)
+    else:
+        return builtins.open(path, *args, **kwargs)
+
+def main(dirs, plasmids=True):
+    for d in dirs:
+        in_dir = f"data/{d}/assemblies"
+        if not os.path.exists(in_dir):
+            print(f"{in_dir} doesn't exist. skipping...")
+            continue
+
+        if plasmids:
+            out_dir = f"data/{d}-plasmid/assemblies"
+        else:
+            out_dir = f"data/{d}-chromosome/assemblies"
+
+        if not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+
+        for path in glob(f"{in_dir}/*.f?a*"):
+            with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr:
+                for i, rec in enumerate(parse_fasta(fd)):
+                    if i == 0:
+                        if not plasmids:
+                            wtr.write(str(rec))
+                            wtr.write('\n')
+                            break
+                        continue
+
+                    wtr.write(str(rec))
+                    wtr.write('\n')
+
+parser = argparse.ArgumentParser(description='seperate plasmids from chromosomes')
+parser.add_argument('directories', metavar='dirs', nargs='+')
+parser.add_argument('--chromosomes', default=False, action='store_true')
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args.directories, plasmids=not args.chromosomes)
author	Nicholas Noll <nbnoll@eml.cc>	2020-08-20 10:41:41 -0700
committer	GitHub <noreply@github.com>	2020-08-20 10:41:41 -0700
commit	001aaaeaaed4847bff81f9c5373d5d5161d0e3ad (patch)
tree	d317c74f94b0f14a997a6d083c191e7850a47ceb
parent	bd21a4412a3e35f32ad84cf19328b7c89e0ed6e5 (diff)
parent	80cede9edef586bbf68258f04ec3be8371afcc0c (diff)