feat: window and extend for end repair now CLI parameters

author: Nicholas Noll <nbnoll@eml.cc> 2020-08-20 10:39:05 -0700
committer: Nicholas Noll <nbnoll@eml.cc> 2020-08-20 10:39:05 -0700
commit: 80cede9edef586bbf68258f04ec3be8371afcc0c (patch)
tree: d317c74f94b0f14a997a6d083c191e7850a47ceb
parent: 582f1f29268f3c3792c5b965d6948ff4b7633671 (diff)
3 files changed, 24 insertions, 16 deletions
diff --git a/Makefile b/Makefile
index 117dbc9..ab398b6 100644
--- a/Makefile
+++ b/Makefile
@@ -55,7 +55,7 @@ staph:
 	@echo "cluster    staph"; \
 	pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz
 	@echo "build      staph"; \
-	pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null
+	pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null
 
 # figures
 
diff --git a/pangraph/build.py b/pangraph/build.py
index 4577732..5c3a8ec 100644
--- a/pangraph/build.py
+++ b/pangraph/build.py
@@ -38,6 +38,16 @@ def register_args(parser):
                         type=int,
                         default=22,
                         help="energy cost for mutations (used during block merges)")
+    parser.add_argument("-w", "--window",
+                        metavar="edge window",
+                        type=int,
+                        default=1000,
+                        help="amount of sequence to align from for end repair")
+    parser.add_argument("-e", "--extend",
+                        metavar="edge extend",
+                        type=int,
+                        default=1000,
+                        help="amount of sequence to extend for end repair")
     parser.add_argument("-s", "--statistics",
                         default=False,
                         action='store_true',
@@ -71,12 +81,10 @@ def main(args):
     mkdir(tmp)
 
     log("aligning")
-    with cProfile.Profile() as pr:
-        T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics)
-        # TODO: when debugging phase is done, remove tmp directory
+    T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics)
+    # TODO: when debugging phase is done, remove tmp directory
 
-        graphs = T.collect()
-    pr.dump_stats("perf.prof")
+    graphs = T.collect()
 
     for i, g in enumerate(graphs):
         log(f"graph {i}: nseqs: {len(g.seqs)} nblks: {len(g.blks)}")
diff --git a/pangraph/graph.py b/pangraph/graph.py
index 1a67c16..83ef839 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -21,8 +21,8 @@ from .utils    import Strand, as_string, parse_paf, panic, as_record, new_strand
 # ------------------------------------------------------------------------
 # globals
 
-WINDOW = 1000
-EXTEND = 2500
+# WINDOW = 1000
+# EXTEND = 2500
 pp = pprint.PrettyPrinter(indent=4)
 
 # ------------------------------------------------------------------------
@@ -140,7 +140,7 @@ class Graph(object):
         graphs, names = [], []
         for name, path in G.seqs.items():
             blks = set([b.id for b in path.blocks()])
-            gi   = [ i for i, g in enumerate(graphs) if overlaps(blks, g)]
+            gi   = [i for i, g in enumerate(graphs) if overlaps(blks, g)]
             if len(gi) == 0:
                 graphs.append(blks)
                 names.append(set([name]))
@@ -163,7 +163,7 @@ class Graph(object):
     # ---------------
     # methods
 
-    def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False):
+    def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False):
         from seqanpy import align_global as align
 
         # ----------------------------------
@@ -324,7 +324,7 @@ class Graph(object):
                 continue
 
             merged = True
-            self.merge(proc(hit))
+            self.merge(proc(hit), edge_window, edge_extend)
             merged_blks.add(hit['ref']['name'])
             merged_blks.add(hit['qry']['name'])
 
@@ -414,7 +414,7 @@ class Graph(object):
             blks.update(path.blocks())
         self.blks = {b.id:self.blks[b.id] for b in blks}
 
-    def merge(self, hit):
+    def merge(self, hit, window, extend):
         old_ref = self.blks[hit['ref']['name']]
         old_qry = self.blks[hit['qry']['name']]
 
@@ -475,7 +475,7 @@ class Graph(object):
             # NOTE: this is a hack to deal with flipped orientations
             pos  = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0])
             beg, end  = pos[0], pos[-1]
-            blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
+            blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend]
             if first:
                 blk_list = set([b.id for b in blks])
                 first = False
@@ -489,7 +489,7 @@ class Graph(object):
             except:
                 breakpoint("bad find")
             beg, end  = pos[0], pos[-1]
-            blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
+            blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend]
             blk_list.intersection_update(set([b.id for b in blks]))
             num_seqs += 1
 
@@ -507,10 +507,10 @@ class Graph(object):
                         pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0])
                         beg, end = pos[0], pos[-1]
 
-                        for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]):
+                        for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]):
                             tmp[n].write(f">isolate_{i:04d}\n")
                             s = self.seqs[tag[0]].sequence_range(left,right)
-                            if len(s) > EXTEND + WINDOW:
+                            if len(s) > extend + window:
                                 breakpoint(f"bad sequence slicing: {len(s)}")
                             tmp[n].write(s + '\n')
author	Nicholas Noll <nbnoll@eml.cc>	2020-08-20 10:39:05 -0700
committer	Nicholas Noll <nbnoll@eml.cc>	2020-08-20 10:39:05 -0700
commit	80cede9edef586bbf68258f04ec3be8371afcc0c (patch)
tree	d317c74f94b0f14a997a6d083c191e7850a47ceb
parent	582f1f29268f3c3792c5b965d6948ff4b7633671 (diff)