aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-08-18 12:34:10 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-08-18 12:34:10 -0700
commitc5209e086026a57c03501ff9194e15b6b8b0f404 (patch)
tree53c13817f6fb0d7e4efc6c354e8dd6372dc0acec
parent6fed97e86a7e07713884efc9c485b2b7c2b66b68 (diff)
fix: slicing bug that lead to incorrect lengths
-rw-r--r--pangraph/graph.py43
-rw-r--r--pangraph/sequence.py23
2 files changed, 36 insertions, 30 deletions
diff --git a/pangraph/graph.py b/pangraph/graph.py
index 3395891..89ebadc 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -496,27 +496,30 @@ class Graph(object):
delta = len(blk_list)-len(shared_blks)
if delta > 0 and num_seqs > 1:
print(f"LEN: {delta}", end="\t")
- for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]):
+ try:
fd, path = tempfile.mkstemp()
- try:
- with os.fdopen(fd, 'w') as tmp:
- for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())):
- tmp.write(f">isolate_{i:04d}\n")
- if left == right:
- breakpoint("no difference")
- tmp.write(self.seqs[tag[0]].sequence_range(left,right))
- tmp.write('\n')
- tmp.flush()
- print(f"aligning {num_seqs} seqs")
- proc = subprocess.Popen(f"mafft --auto {path} | fasttree",
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- shell=True)
- out, err = proc.communicate()
- tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick')
- print(f"-> NUM {n}: {tree.total_branch_length()}")
- finally:
- os.remove(path)
+ with os.fdopen(fd, 'w') as tmp:
+ for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())):
+ pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0])
+ beg, end = pos[0], pos[-1]
+
+ tmp.write(f">isolate_{i:04d}\n")
+ for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW)]): #, (end[1]-WINDOW,end[1]+EXTEND)]):
+ s = self.seqs[tag[0]].sequence_range(left,right)
+ if len(s) > EXTEND + WINDOW:
+ breakpoint(f"bad sequence slicing: {len(s)}")
+ tmp.write(s + '\n')
+ tmp.flush()
+ print(f"aligning {num_seqs} seqs")
+ proc = subprocess.Popen(f"mafft --auto {path} | fasttree",
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ shell=True)
+ out, err = proc.communicate()
+ tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick')
+ print(f"-> NUM {n}: {tree.total_branch_length()}")
+ finally:
+ os.remove(path)
else:
print(f"NO MATCH")
diff --git a/pangraph/sequence.py b/pangraph/sequence.py
index 62bfd32..3660b71 100644
--- a/pangraph/sequence.py
+++ b/pangraph/sequence.py
@@ -161,6 +161,8 @@ class Path(object):
beg = start or 0
end = stop or self.position[-1]
l, r = "", ""
+ # if start == 159 and stop == 3659:
+ # breakpoint("test")
if beg < 0:
if len(self.nodes) > 1:
l = self.sequence_range(self.position[-1]+beg,self.position[-1])
@@ -176,16 +178,17 @@ class Path(object):
j = np.searchsorted(self.position, end, side='left')
m = ""
if i < j:
- if beg < self.position[i]:
- breakpoint("bad start search")
- if end > self.position[j]:
- breakpoint("bad end search")
- m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):]
- for n in self.nodes[i+1:j-1]:
- m += n.blk.extract(self.name, n.num)
- if j < len(self.nodes):
- b = self.nodes[j].blk.extract(self.name, self.nodes[j].num)
- m += b[0:(len(b)+self.position[j]-end)]
+ if j >= len(self.position):
+ breakpoint("what?")
+ if i == j - 1:
+ m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):(end-self.position[i])]
+ else:
+ m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):]
+ for n in self.nodes[i+1:j-1]:
+ m += n.blk.extract(self.name, n.num)
+ n = self.nodes[j-1]
+ b = n.blk.extract(self.name, n.num)
+ m += b[0:(end-self.position[j-1])]
return l + m + r
def __getitem__(self, index):