diff options
author | Nicholas Noll <nbnoll@eml.cc> | 2020-08-18 12:34:10 -0700 |
---|---|---|
committer | Nicholas Noll <nbnoll@eml.cc> | 2020-08-18 12:34:10 -0700 |
commit | c5209e086026a57c03501ff9194e15b6b8b0f404 (patch) | |
tree | 53c13817f6fb0d7e4efc6c354e8dd6372dc0acec | |
parent | 6fed97e86a7e07713884efc9c485b2b7c2b66b68 (diff) |
fix: slicing bug that lead to incorrect lengths
-rw-r--r-- | pangraph/graph.py | 43 | ||||
-rw-r--r-- | pangraph/sequence.py | 23 |
2 files changed, 36 insertions, 30 deletions
diff --git a/pangraph/graph.py b/pangraph/graph.py index 3395891..89ebadc 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -496,27 +496,30 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: print(f"LEN: {delta}", end="\t") - for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): + try: fd, path = tempfile.mkstemp() - try: - with os.fdopen(fd, 'w') as tmp: - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - tmp.write(f">isolate_{i:04d}\n") - if left == right: - breakpoint("no difference") - tmp.write(self.seqs[tag[0]].sequence_range(left,right)) - tmp.write('\n') - tmp.flush() - print(f"aligning {num_seqs} seqs") - proc = subprocess.Popen(f"mafft --auto {path} | fasttree", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - out, err = proc.communicate() - tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> NUM {n}: {tree.total_branch_length()}") - finally: - os.remove(path) + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] + + tmp.write(f">isolate_{i:04d}\n") + for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW)]): #, (end[1]-WINDOW,end[1]+EXTEND)]): + s = self.seqs[tag[0]].sequence_range(left,right) + if len(s) > EXTEND + WINDOW: + breakpoint(f"bad sequence slicing: {len(s)}") + tmp.write(s + '\n') + tmp.flush() + print(f"aligning {num_seqs} seqs") + proc = subprocess.Popen(f"mafft --auto {path} | fasttree", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out, err = proc.communicate() + tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + print(f"-> NUM {n}: {tree.total_branch_length()}") + finally: + os.remove(path) else: print(f"NO MATCH") diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 62bfd32..3660b71 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -161,6 +161,8 @@ class Path(object): beg = start or 0 end = stop or self.position[-1] l, r = "", "" + # if start == 159 and stop == 3659: + # breakpoint("test") if beg < 0: if len(self.nodes) > 1: l = self.sequence_range(self.position[-1]+beg,self.position[-1]) @@ -176,16 +178,17 @@ class Path(object): j = np.searchsorted(self.position, end, side='left') m = "" if i < j: - if beg < self.position[i]: - breakpoint("bad start search") - if end > self.position[j]: - breakpoint("bad end search") - m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):] - for n in self.nodes[i+1:j-1]: - m += n.blk.extract(self.name, n.num) - if j < len(self.nodes): - b = self.nodes[j].blk.extract(self.name, self.nodes[j].num) - m += b[0:(len(b)+self.position[j]-end)] + if j >= len(self.position): + breakpoint("what?") + if i == j - 1: + m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):(end-self.position[i])] + else: + m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):] + for n in self.nodes[i+1:j-1]: + m += n.blk.extract(self.name, n.num) + n = self.nodes[j-1] + b = n.blk.extract(self.name, n.num) + m += b[0:(end-self.position[j-1])] return l + m + r def __getitem__(self, index): |