diff options
author | Nicholas Noll <nbnoll@eml.cc> | 2020-08-20 10:39:05 -0700 |
---|---|---|
committer | Nicholas Noll <nbnoll@eml.cc> | 2020-08-20 10:39:05 -0700 |
commit | 80cede9edef586bbf68258f04ec3be8371afcc0c (patch) | |
tree | d317c74f94b0f14a997a6d083c191e7850a47ceb | |
parent | 582f1f29268f3c3792c5b965d6948ff4b7633671 (diff) |
feat: window and extend for end repair now CLI parameters
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | pangraph/build.py | 18 | ||||
-rw-r--r-- | pangraph/graph.py | 20 |
3 files changed, 24 insertions, 16 deletions
@@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null # figures diff --git a/pangraph/build.py b/pangraph/build.py index 4577732..5c3a8ec 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -38,6 +38,16 @@ def register_args(parser): type=int, default=22, help="energy cost for mutations (used during block merges)") + parser.add_argument("-w", "--window", + metavar="edge window", + type=int, + default=1000, + help="amount of sequence to align from for end repair") + parser.add_argument("-e", "--extend", + metavar="edge extend", + type=int, + default=1000, + help="amount of sequence to extend for end repair") parser.add_argument("-s", "--statistics", default=False, action='store_true', @@ -71,12 +81,10 @@ def main(args): mkdir(tmp) log("aligning") - with cProfile.Profile() as pr: - T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) - # TODO: when debugging phase is done, remove tmp directory + T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) + # TODO: when debugging phase is done, remove tmp directory - graphs = T.collect() - pr.dump_stats("perf.prof") + graphs = T.collect() for i, g in enumerate(graphs): log(f"graph {i}: nseqs: {len(g.seqs)} nblks: {len(g.blks)}") diff --git a/pangraph/graph.py b/pangraph/graph.py index 1a67c16..83ef839 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -21,8 +21,8 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand # ------------------------------------------------------------------------ # globals -WINDOW = 1000 -EXTEND = 2500 +# WINDOW = 1000 +# EXTEND = 2500 pp = pprint.PrettyPrinter(indent=4) # ------------------------------------------------------------------------ @@ -140,7 +140,7 @@ class Graph(object): graphs, names = [], [] for name, path in G.seqs.items(): blks = set([b.id for b in path.blocks()]) - gi = [ i for i, g in enumerate(graphs) if overlaps(blks, g)] + gi = [i for i, g in enumerate(graphs) if overlaps(blks, g)] if len(gi) == 0: graphs.append(blks) names.append(set([name])) @@ -163,7 +163,7 @@ class Graph(object): # --------------- # methods - def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False): + def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False): from seqanpy import align_global as align # ---------------------------------- @@ -324,7 +324,7 @@ class Graph(object): continue merged = True - self.merge(proc(hit)) + self.merge(proc(hit), edge_window, edge_extend) merged_blks.add(hit['ref']['name']) merged_blks.add(hit['qry']['name']) @@ -414,7 +414,7 @@ class Graph(object): blks.update(path.blocks()) self.blks = {b.id:self.blks[b.id] for b in blks} - def merge(self, hit): + def merge(self, hit, window, extend): old_ref = self.blks[hit['ref']['name']] old_qry = self.blks[hit['qry']['name']] @@ -475,7 +475,7 @@ class Graph(object): # NOTE: this is a hack to deal with flipped orientations pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] if first: blk_list = set([b.id for b in blks]) first = False @@ -489,7 +489,7 @@ class Graph(object): except: breakpoint("bad find") beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] blk_list.intersection_update(set([b.id for b in blks])) num_seqs += 1 @@ -507,10 +507,10 @@ class Graph(object): pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): + for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]): tmp[n].write(f">isolate_{i:04d}\n") s = self.seqs[tag[0]].sequence_range(left,right) - if len(s) > EXTEND + WINDOW: + if len(s) > extend + window: breakpoint(f"bad sequence slicing: {len(s)}") tmp[n].write(s + '\n') |