aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-08-20 10:39:05 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-08-20 10:39:05 -0700
commit80cede9edef586bbf68258f04ec3be8371afcc0c (patch)
treed317c74f94b0f14a997a6d083c191e7850a47ceb
parent582f1f29268f3c3792c5b965d6948ff4b7633671 (diff)
feat: window and extend for end repair now CLI parameters
-rw-r--r--Makefile2
-rw-r--r--pangraph/build.py18
-rw-r--r--pangraph/graph.py20
3 files changed, 24 insertions, 16 deletions
diff --git a/Makefile b/Makefile
index 117dbc9..ab398b6 100644
--- a/Makefile
+++ b/Makefile
@@ -55,7 +55,7 @@ staph:
@echo "cluster staph"; \
pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz
@echo "build staph"; \
- pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null
+ pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null
# figures
diff --git a/pangraph/build.py b/pangraph/build.py
index 4577732..5c3a8ec 100644
--- a/pangraph/build.py
+++ b/pangraph/build.py
@@ -38,6 +38,16 @@ def register_args(parser):
type=int,
default=22,
help="energy cost for mutations (used during block merges)")
+ parser.add_argument("-w", "--window",
+ metavar="edge window",
+ type=int,
+ default=1000,
+ help="amount of sequence to align from for end repair")
+ parser.add_argument("-e", "--extend",
+ metavar="edge extend",
+ type=int,
+ default=1000,
+ help="amount of sequence to extend for end repair")
parser.add_argument("-s", "--statistics",
default=False,
action='store_true',
@@ -71,12 +81,10 @@ def main(args):
mkdir(tmp)
log("aligning")
- with cProfile.Profile() as pr:
- T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics)
- # TODO: when debugging phase is done, remove tmp directory
+ T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics)
+ # TODO: when debugging phase is done, remove tmp directory
- graphs = T.collect()
- pr.dump_stats("perf.prof")
+ graphs = T.collect()
for i, g in enumerate(graphs):
log(f"graph {i}: nseqs: {len(g.seqs)} nblks: {len(g.blks)}")
diff --git a/pangraph/graph.py b/pangraph/graph.py
index 1a67c16..83ef839 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -21,8 +21,8 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand
# ------------------------------------------------------------------------
# globals
-WINDOW = 1000
-EXTEND = 2500
+# WINDOW = 1000
+# EXTEND = 2500
pp = pprint.PrettyPrinter(indent=4)
# ------------------------------------------------------------------------
@@ -140,7 +140,7 @@ class Graph(object):
graphs, names = [], []
for name, path in G.seqs.items():
blks = set([b.id for b in path.blocks()])
- gi = [ i for i, g in enumerate(graphs) if overlaps(blks, g)]
+ gi = [i for i, g in enumerate(graphs) if overlaps(blks, g)]
if len(gi) == 0:
graphs.append(blks)
names.append(set([name]))
@@ -163,7 +163,7 @@ class Graph(object):
# ---------------
# methods
- def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False):
+ def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False):
from seqanpy import align_global as align
# ----------------------------------
@@ -324,7 +324,7 @@ class Graph(object):
continue
merged = True
- self.merge(proc(hit))
+ self.merge(proc(hit), edge_window, edge_extend)
merged_blks.add(hit['ref']['name'])
merged_blks.add(hit['qry']['name'])
@@ -414,7 +414,7 @@ class Graph(object):
blks.update(path.blocks())
self.blks = {b.id:self.blks[b.id] for b in blks}
- def merge(self, hit):
+ def merge(self, hit, window, extend):
old_ref = self.blks[hit['ref']['name']]
old_qry = self.blks[hit['qry']['name']]
@@ -475,7 +475,7 @@ class Graph(object):
# NOTE: this is a hack to deal with flipped orientations
pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0])
beg, end = pos[0], pos[-1]
- blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
+ blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend]
if first:
blk_list = set([b.id for b in blks])
first = False
@@ -489,7 +489,7 @@ class Graph(object):
except:
breakpoint("bad find")
beg, end = pos[0], pos[-1]
- blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
+ blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend]
blk_list.intersection_update(set([b.id for b in blks]))
num_seqs += 1
@@ -507,10 +507,10 @@ class Graph(object):
pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0])
beg, end = pos[0], pos[-1]
- for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]):
+ for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]):
tmp[n].write(f">isolate_{i:04d}\n")
s = self.seqs[tag[0]].sequence_range(left,right)
- if len(s) > EXTEND + WINDOW:
+ if len(s) > extend + window:
breakpoint(f"bad sequence slicing: {len(s)}")
tmp[n].write(s + '\n')