diff options
Diffstat (limited to 'pangraph/graph.py')
-rw-r--r-- | pangraph/graph.py | 134 |
1 files changed, 61 insertions, 73 deletions
diff --git a/pangraph/graph.py b/pangraph/graph.py index d154f74..cc0c8e7 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -468,60 +468,77 @@ class Graph(object): new_blocks.extend(update(old_ref, new_refs, hit['ref'], Strand.Plus)) new_blocks.extend(update(old_qry, new_qrys, hit['qry'], hit['orientation'])) - blk_list = set() + lblks_set_x, rblks_set_x = set(), set() + lblks_set_s, rblks_set_s = set(), set() first = True num_seqs = 0 - for tag in ref.muts.keys(): + for tag in shared_blks[0].muts.keys(): # NOTE: this is a hack to deal with flipped orientations - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] + lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] + rblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] + + lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] if first: - blk_list = set([b.id for b in blks]) + lblks_set_x = set([b.id for b in lblks_x]) + rblks_set_x = set([b.id for b in rblks_x]) + + lblks_set_s = set([b.id for b in lblks_s]) + rblks_set_s = set([b.id for b in rblks_s]) + first = False else: - blk_list.intersection_update(set([b.id for b in blks])) - num_seqs += 1 + lblks_set_x.intersection_update(set([b.id for b in lblks_x])) + rblks_set_x.intersection_update(set([b.id for b in rblks_x])) - for tag in qry.muts.keys(): - try: - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_qrys], key=lambda x: x[0]) - except: - breakpoint("bad find") - beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] - blk_list.intersection_update(set([b.id for b in blks])) + lblks_set_s.intersection_update(set([b.id for b in lblks_s])) + rblks_set_s.intersection_update(set([b.id for b in rblks_s])) num_seqs += 1 - delta = len(blk_list)-len(shared_blks) - if delta > 0 and num_seqs > 1: - print(f">LEN={delta}", end=';') - fd = [None, None] - path = [None, None] - try: - fd[0], path[0] = tempfile.mkstemp() - fd[1], path[1] = tempfile.mkstemp() - with os.fdopen(fd[0], 'w') as tmp1, os.fdopen(fd[1], 'w') as tmp2: - tmp = [tmp1, tmp2] - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) - beg, end = pos[0], pos[-1] - - for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]): - tmp[n].write(f">isolate_{i:04d}\n") + def emit(side): + if side == 'left': + delta = len(lblks_set_x)-len(lblks_set_s) + elif side == 'right': + delta = len(rblks_set_x)-len(rblks_set_s) + else: + raise ValueError(f"unrecognized argument '{side}' for side") + + if delta > 0 and num_seqs > 1: + print(f">LEN={delta}", end=';') + try: + fd, path = tempfile.mkstemp() + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(merged_blks[0].muts.keys()): + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] + + if side == 'left': + left, right = beg[0]-extend,beg[0]+window + elif side == 'right': + left, right = end[1]-window,end[1]+extend + else: + raise ValueError(f"unrecognized argument '{side}' for side") + + iso_blks = self.seqs[tag[0]][left:right] + print("POSITIONS", pos) + print("LIST", lblks_set_x if side == 'left' else rblks_set_x) + print("SHARED", lblks_set_s if side == 'left' else rblks_set_s) + print("ISO", iso_blks) + breakpoint("stop") + tmp.write(f">isolate_{i:04d} {','.join(b.id for b in iso_blks)}\n") s = self.seqs[tag[0]].sequence_range(left,right) if len(s) > extend + window: breakpoint(f"bad sequence slicing: {len(s)}") - tmp[n].write(s + '\n') + tmp.write(s + '\n') - tmp[0].flush() - tmp[1].flush() + tmp.flush() - def make_tree(n): proc = [None, None] out = [None, None] err = [None, None] - proc[0] = subprocess.Popen(f"mafft --auto {path[n]}", + proc[0] = subprocess.Popen(f"mafft --auto {path}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) @@ -533,45 +550,16 @@ class Graph(object): out[0], err[0] = proc[0].communicate() out[1], err[1] = proc[1].communicate(input=out[0]) tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') - print(f"ALIGNMENT=\n{out[0]}") + print(f"ALIGNMENT=\n{out[0]}", end=";") print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + print("\n", end="") + finally: + os.remove(path) + else: + print(f">NO MATCH") - make_tree(0) - make_tree(1) - print("\n", end="") - finally: - os.remove(path[0]) - os.remove(path[1]) - else: - print(f">NO MATCH") - - # NOTE: debugging code - # if len(blk_list) < len(shared_blks): - # ref_list = set() - # first = True - # for tag in ref.muts.keys(): - # beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) - # end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) - # blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - # if first: - # ref_list = set([b.id for b in blks]) - # first = False - # else: - # ref_list.intersection_update(set([b.id for b in blks])) - - # qry_list = set() - # first = True - # for tag in qry.muts.keys(): - # beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) - # end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) - # blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - # if first: - # qry_list = set([b.id for b in blks]) - # first = False - # else: - # qry_list.intersection_update(set([b.id for b in blks])) - - # breakpoint("inconsistent number of blocks") + emit('left') + emit('right') self.prune_blks() |