From eddc70e299ca2822680f8b4ac9f277d88af002e5 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 11 Aug 2020 11:34:33 -0700 Subject: feat: blocks now store their position (not modulo length) along the path --- pangraph/block.py | 9 +++++++++ pangraph/graph.py | 18 +++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/pangraph/block.py b/pangraph/block.py index 05b7806..e3953bd 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -26,6 +26,7 @@ class Block(object): super(Block, self).__init__() self.id = randomid() if gen else 0 self.seq = None + self.pos = {} self.muts = {} def __str__(self): @@ -49,6 +50,10 @@ class Block(object): def isolates(self): return dict(Counter([k[0] for k in self.muts])) + @property + def positions(self): + return { tag:(pos, pos+self.len_of(*tag)) for tag, pos in self.pos.items() } + # ------------------ # static methods @@ -56,6 +61,7 @@ class Block(object): def from_seq(cls, name, seq): new_blk = cls() new_blk.seq = as_array(seq) + new_blk.pos = {(name, 0): 0} new_blk.muts = {(name, 0):{}} return new_blk @@ -69,6 +75,7 @@ class Block(object): B = Block() B.id = d['id'] B.seq = as_array(d['seq']) + B.pos = d['pos'] B.muts = {unpack(k):v for k, v in d['muts'].items()} return B @@ -222,6 +229,7 @@ class Block(object): return {'id' : self.id, 'seq' : "".join(str(n) for n in self.seq), + 'pos' : self.pos, 'muts' : {pack(k) : fix(v) for k, v in self.muts.items()}} def __len__(self): @@ -233,6 +241,7 @@ class Block(object): start = val.start or 0 stop = val.stop or len(self.seq) b.seq = self.seq[start:stop] + b.pos = { iso : start+val.start for iso,start in self.pos.items() } for s, _ in self.muts.items(): b.muts[s] = {p-start:c for p,c in self.muts[s].items() if p>=start and p Date: Tue, 11 Aug 2020 11:54:13 -0700 Subject: fix: unpack and pack tuple on json serialization for block positions --- pangraph/block.py | 4 ++-- pangraph/graph.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pangraph/block.py b/pangraph/block.py index e3953bd..ad4cd7c 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -75,7 +75,7 @@ class Block(object): B = Block() B.id = d['id'] B.seq = as_array(d['seq']) - B.pos = d['pos'] + B.pos = {unpack(k):tuple(v) for k, v in d['pos'].items()} B.muts = {unpack(k):v for k, v in d['muts'].items()} return B @@ -229,7 +229,7 @@ class Block(object): return {'id' : self.id, 'seq' : "".join(str(n) for n in self.seq), - 'pos' : self.pos, + 'pos' : {pack(k) : v for k,v in self.pos.items()}, 'muts' : {pack(k) : fix(v) for k, v in self.muts.items()}} def __len__(self): diff --git a/pangraph/graph.py b/pangraph/graph.py index 70fd102..4ae721d 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -418,9 +418,14 @@ class Graph(object): # This is why in from_aln(aln) we set the start index to 0 ref = old_ref[hit['ref']['start']:hit['ref']['end']] qry = old_qry[hit['qry']['start']:hit['qry']['end']] - print(ref.positions) - print(qry.positions) - breakpoint("test positions") + + # print(ref.positions) + # print(qry.positions) + # # TODO: check for out of bounds accesses + # for tag, item in ref.positions.items(): + # blks = self.seqs[tag[0]][item-EXTEND:item+EXTEND] + # print(f"Isolate: {tag[0]} blocks: {blks}") + # breakpoint("test positions") if hit["orientation"] == Strand.Minus: qry = qry.rev_cmpl() -- cgit v1.2.1 From da096f986322037af9ff351ba475085bbb0fd6d8 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 11 Aug 2020 12:42:40 -0700 Subject: fix: slicing of sequence 
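# ----------------------------------------------------------------------------
# A minimal sketch of the tuple-key packing that the serialization fix above
# relies on: JSON objects only allow string keys, while Block.pos and Block.muts
# are keyed by (isolate, occurrence) tuples. pack/unpack here are simplified
# stand-ins for pangraph's own helpers, and the '?' separator is an assumption.
import json

def pack(key):
    name, num = key
    return f"{name}?{num}"

def unpack(key):
    name, num = key.rsplit("?", 1)
    return name, int(num)

pos      = {("isolate_a", 0): (0, 1500), ("isolate_b", 1): (250, 1800)}
packed   = json.dumps({pack(k): v for k, v in pos.items()})
restored = {unpack(k): tuple(v) for k, v in json.loads(packed).items()}
assert restored == pos  # tuple values come back as lists, hence the tuple(v) cast
# ----------------------------------------------------------------------------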
paths --- pangraph/graph.py | 14 +++++++------- pangraph/sequence.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 4ae721d..24defa9 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -419,13 +419,13 @@ class Graph(object): ref = old_ref[hit['ref']['start']:hit['ref']['end']] qry = old_qry[hit['qry']['start']:hit['qry']['end']] - # print(ref.positions) - # print(qry.positions) - # # TODO: check for out of bounds accesses - # for tag, item in ref.positions.items(): - # blks = self.seqs[tag[0]][item-EXTEND:item+EXTEND] - # print(f"Isolate: {tag[0]} blocks: {blks}") - # breakpoint("test positions") + print(ref.positions) + print(qry.positions) + for tag, item in ref.positions.items(): + blks = self.seqs[tag[0]][item[0]-EXTEND:item[1]+EXTEND] + if len(blks) >= 3: + print(f"Isolate: {tag[0]} blocks: {','.join(b.id for b in blks)}") + breakpoint("test positions") if hit["orientation"] == Strand.Minus: qry = qry.rev_cmpl() diff --git a/pangraph/sequence.py b/pangraph/sequence.py index b2b5ce7..5414364 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -158,8 +158,14 @@ class Path(object): beg = index.start or 0 end = index.stop or self.position[-1] - i = np.searchsorted(self.position, beg, side='right') - j = np.searchsorted(self.position, end, side='right') + 1 + # TODO: circular slicing + beg = max(0, beg) + end = min(end, self.position[-1]) + # if index.start < 0 or end > self.position[-1]: + # breakpoint(f"{index}:: need to implement circular slicing") + + i = np.searchsorted(self.position, beg, side='right') - 1 + j = np.searchsorted(self.position, end, side='left') assert i < j, "sorted" return [n.blk for n in self.nodes[i:j]] elif isinstance(index, int): -- cgit v1.2.1 From e5c6778ca74ac0e8eb5bc09d6e7ddc54e460e951 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 11 Aug 2020 14:12:24 -0700 Subject: fix: circular slicing check for singleton paths --- pangraph/graph.py | 4 ++-- pangraph/sequence.py | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 24defa9..6b29b1e 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -419,11 +419,11 @@ class Graph(object): ref = old_ref[hit['ref']['start']:hit['ref']['end']] qry = old_qry[hit['qry']['start']:hit['qry']['end']] - print(ref.positions) - print(qry.positions) for tag, item in ref.positions.items(): blks = self.seqs[tag[0]][item[0]-EXTEND:item[1]+EXTEND] if len(blks) >= 3: + print(ref.positions) + print(qry.positions) print(f"Isolate: {tag[0]} blocks: {','.join(b.id for b in blks)}") breakpoint("test positions") diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 5414364..baff6a2 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -157,19 +157,22 @@ class Path(object): if isinstance(index, slice): beg = index.start or 0 end = index.stop or self.position[-1] - - # TODO: circular slicing - beg = max(0, beg) - end = min(end, self.position[-1]) - # if index.start < 0 or end > self.position[-1]: - # breakpoint(f"{index}:: need to implement circular slicing") + l, r = [], [] + if beg < 0: + if len(self.nodes) > 1: + l = self[self.position[-1] + beg:self.position[-1]] + beg = 0 + if end > self.position[-1]: + if len(self.nodes) > 1: + r = self[0:end-self.position[-1]] + end = self.position[-1] i = np.searchsorted(self.position, beg, side='right') - 1 j = np.searchsorted(self.position, end, side='left') assert i < j, "sorted" - return [n.blk for 
n in self.nodes[i:j]] + return l + [n.blk for n in self.nodes[i:j]] + r elif isinstance(index, int): - i = np.searchsorted(self.position, index, side='left') + i = np.searchsorted(self.position, index, side='right') - 1 return self.nodes[i].blk else: raise ValueError(f"type '{type(index)}' not supported as index") -- cgit v1.2.1 From c796f164ea6856af354b879abefceb342de2f27c Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 11 Aug 2020 15:57:50 -0700 Subject: fix: compute position of each block using path class to ensure its accurate --- pangraph/block.py | 1 + pangraph/graph.py | 31 +++++++++++++++++++++++-------- pangraph/sequence.py | 27 ++++++++++++++++++++------- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/pangraph/block.py b/pangraph/block.py index ad4cd7c..5c649b9 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -92,6 +92,7 @@ class Block(object): for s in nblk.muts: nblk.muts[s].update({p+offset:c for p,c in b.muts[s].items()}) offset += len(b) + nblk.pos = { k:v for k,v in blks[0].pos.items() } return nblk diff --git a/pangraph/graph.py b/pangraph/graph.py index 6b29b1e..c2c3cdb 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -419,14 +419,6 @@ class Graph(object): ref = old_ref[hit['ref']['start']:hit['ref']['end']] qry = old_qry[hit['qry']['start']:hit['qry']['end']] - for tag, item in ref.positions.items(): - blks = self.seqs[tag[0]][item[0]-EXTEND:item[1]+EXTEND] - if len(blks) >= 3: - print(ref.positions) - print(qry.positions) - print(f"Isolate: {tag[0]} blocks: {','.join(b.id for b in blks)}") - breakpoint("test positions") - if hit["orientation"] == Strand.Minus: qry = qry.rev_cmpl() @@ -473,6 +465,29 @@ class Graph(object): new_blocks.extend(update(old_qry, new_qrys, hit['qry'], hit['orientation'])) self.prune_blks() + shared_blks = set() + first = True + for tag in ref.muts.keys(): + pos = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) + blks = self.seqs[tag[0]][pos[0]-EXTEND:pos[1]+EXTEND] + if first: + shared_blks = set([b.id for b in blks]) + first = False + else: + shared_blks.intersection_update(set([b.id for b in blks])) + first = True + for tag in qry.muts.keys(): + pos = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) + blks = self.seqs[tag[0]][pos[0]-EXTEND:pos[1]+EXTEND] + if first: + shared_blks = set([b.id for b in blks]) + first = False + else: + shared_blks.intersection_update(set([b.id for b in blks])) + + print(f"LEN: {len(shared_blks)}") + if len(shared_blks) > 20: + breakpoint("big number of blocks") return [b[0] for b in new_blocks] def extract(self, name, strip_gaps=True, verbose=False): diff --git a/pangraph/sequence.py b/pangraph/sequence.py index baff6a2..185dd38 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -88,11 +88,11 @@ class Path(object): return seq - def position_of(self, blk): - for i, n in enumerate(self.nodes): - if n.blk == blk: - return i, n.num - raise ValueError("block not found in path") + # def position_of(self, blk): + # for i, n in enumerate(self.nodes): + # if n.blk == blk: + # return i, n.num + # raise ValueError("block not found in path") def rm_nil_blks(self): good, popped = [], set() @@ -153,6 +153,17 @@ class Path(object): self.nodes = new self.position = np.cumsum([0] + [n.length(self.name) for n in self.nodes]) + def position_of(self, blk, num): + index = [i for i, n in enumerate(self.nodes) if n.blk == blk] + if len(index) <= num: + # print(f"NODES: {self.nodes}") + # print(f"BLOCK: {blk}") + # print(f"POSITION: {self.position}") + # print(f"INDEX: 
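# ----------------------------------------------------------------------------
# A toy version of the position bookkeeping used by Path.__getitem__ above,
# assuming a linear path; the names and block lengths below are made up.
import numpy as np

lengths  = [700, 1200, 300, 2500]              # per-node block lengths along one path
position = np.cumsum([0] + lengths)            # cumulative starts: [0 700 1900 2200 4700]

def nodes_overlapping(beg, end):
    beg, end = max(beg, 0), min(end, position[-1])        # the real code wraps circularly instead
    i = np.searchsorted(position, beg, side='right') - 1   # first node touching beg
    j = np.searchsorted(position, end, side='left')        # one past the last node touching end
    return list(range(i, j))

print(nodes_overlapping(650, 1950))            # -> [0, 1, 2]
# ----------------------------------------------------------------------------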
{index}") + # breakpoint("break") + return None + return (self.position[index[num]], self.position[index[num]+1]) + def __getitem__(self, index): if isinstance(index, slice): beg = index.start or 0 @@ -164,12 +175,14 @@ class Path(object): beg = 0 if end > self.position[-1]: if len(self.nodes) > 1: - r = self[0:end-self.position[-1]] + r = self[0:(end-self.position[-1])] end = self.position[-1] + if beg > end: + beg, end = end, beg i = np.searchsorted(self.position, beg, side='right') - 1 j = np.searchsorted(self.position, end, side='left') - assert i < j, "sorted" + assert i < j, f"not sorted, {beg}-{end}" return l + [n.blk for n in self.nodes[i:j]] + r elif isinstance(index, int): i = np.searchsorted(self.position, index, side='right') - 1 -- cgit v1.2.1 From 9deb0978ba029fc59ee699e8280dd60fb47bee1b Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 11:37:05 -0700 Subject: fix: pull out interval of merged blocks and not just the first one --- Makefile | 2 +- pangraph/block.py | 4 +++- pangraph/graph.py | 38 +++++++++++++++++++------------------- pangraph/tree.py | 7 +++++++ 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 9681f3b..89f9e94 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json # 1>data/staph/pangraph.json + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json #1>staph.log 2>/dev/null # figures diff --git a/pangraph/block.py b/pangraph/block.py index 5c649b9..1bc501d 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -148,7 +148,9 @@ class Block(object): qryblks = qryblks[::-1] refblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None] - return newblks, qryblks, refblks, isomap + sharedblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None and qrys[i] is not None] + + return newblks, qryblks, refblks, sharedblks, isomap # -------------- # methods diff --git a/pangraph/graph.py b/pangraph/graph.py index c2c3cdb..fb46be9 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -433,7 +433,7 @@ class Graph(object): "qry_name" : hit["qry"]["name"], "orientation" : hit["orientation"]} - merged_blks, new_qrys, new_refs, blk_map = Block.from_aln(aln) + merged_blks, new_qrys, new_refs, shared_blks, blk_map = Block.from_aln(aln) for merged_blk in merged_blks: self.blks[merged_blk.id] = merged_blk @@ -463,31 +463,31 @@ class Graph(object): new_blocks = [] new_blocks.extend(update(old_ref, new_refs, hit['ref'], Strand.Plus)) new_blocks.extend(update(old_qry, new_qrys, hit['qry'], hit['orientation'])) - self.prune_blks() - shared_blks = set() - first = True + blk_list = set() + first = True for tag in ref.muts.keys(): - pos = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) - blks = self.seqs[tag[0]][pos[0]-EXTEND:pos[1]+EXTEND] + beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) + end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] if first: - shared_blks = set([b.id for b in blks]) + blk_list = set([b.id for b in blks]) first = False else: - shared_blks.intersection_update(set([b.id for b in blks])) - first = True + blk_list.intersection_update(set([b.id for b in blks])) + for tag in qry.muts.keys(): - pos = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) - blks = self.seqs[tag[0]][pos[0]-EXTEND:pos[1]+EXTEND] - if first: - shared_blks 
= set([b.id for b in blks]) - first = False - else: - shared_blks.intersection_update(set([b.id for b in blks])) + beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) + end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blk_list.intersection_update(set([b.id for b in blks])) + + print(f"LEN: {len(blk_list)-len(shared_blks)}") + if len(blk_list) < len(shared_blks): + breakpoint("inconsistent number of blocks") + + self.prune_blks() - print(f"LEN: {len(shared_blks)}") - if len(shared_blks) > 20: - breakpoint("big number of blocks") return [b[0] for b in new_blocks] def extract(self, name, strip_gaps=True, verbose=False): diff --git a/pangraph/tree.py b/pangraph/tree.py index 605bac3..64ac119 100644 --- a/pangraph/tree.py +++ b/pangraph/tree.py @@ -139,6 +139,11 @@ class Clade(object): 'fapath' : self.fapath, 'graph' : serialize(self.graph) if self.graph is not None else None } + def set_level(self, level): + for c in self.child: + c.set_level(level+1) + self.level = level + class Tree(object): # ------------------- # Class constructor @@ -288,6 +293,7 @@ class Tree(object): self.seqs = {leafs[name]:seq for name,seq in seqs.items()} def align(self, tmpdir, min_blk_len, mu, beta, extensive, log_stats=False, verbose=False): + self.root.set_level(0) # NOTE: for debug logging stats = {} # --------------------------------------------- # internal functions @@ -377,6 +383,7 @@ class Tree(object): for n in self.postorder(): if n.is_leaf(): continue + print(f"---NODE LEVEL {n.level}---") n.fapath = f"{tmpdir}/{n.name}" log(f"fusing {n.child[0].name} with {n.child[1].name} @ {n.name}") n.graph = merge(*n.child) -- cgit v1.2.1 From b498a39385f08f1ffa374b03637e2317da718004 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 11:58:02 -0700 Subject: fix: deal with inverted cases for block lists --- pangraph/graph.py | 33 +++++++++++++++++++++++++++++---- pangraph/sequence.py | 5 ----- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index fb46be9..2926d3f 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -467,8 +467,9 @@ class Graph(object): blk_list = set() first = True for tag in ref.muts.keys(): - beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) - end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) + # NOTE: this is a hack to deal with flipped orientations + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] if first: blk_list = set([b.id for b in blks]) @@ -477,13 +478,37 @@ class Graph(object): blk_list.intersection_update(set([b.id for b in blks])) for tag in qry.muts.keys(): - beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) - end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_qrys], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] blk_list.intersection_update(set([b.id for b in blks])) print(f"LEN: {len(blk_list)-len(shared_blks)}") if len(blk_list) < len(shared_blks): + ref_list = set() + first = True + for tag in ref.muts.keys(): + beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) + end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + if first: + ref_list = set([b.id for b in blks]) + first = False + else: + 
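# ----------------------------------------------------------------------------
# A minimal sketch of the consistency check assembled above: for every isolate
# touched by the merge, collect the block ids found within EXTEND of the merged
# region and intersect them. The neighborhoods below are invented data; in the
# real code they come from slicing self.seqs[isolate] around position_of.
neighborhoods = {
    "isolate_a": ["x", "m1", "m2", "y"],
    "isolate_b": ["z", "m1", "m2"],
    "isolate_c": ["m1", "m2", "w"],
}
shared = None
for ids in neighborhoods.values():
    shared = set(ids) if shared is None else shared & set(ids)
print(shared)   # -> {'m1', 'm2'}: only the merged blocks survive in every isolate
# ----------------------------------------------------------------------------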
ref_list.intersection_update(set([b.id for b in blks])) + + qry_list = set() + first = True + for tag in qry.muts.keys(): + beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) + end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + if first: + qry_list = set([b.id for b in blks]) + first = False + else: + qry_list.intersection_update(set([b.id for b in blks])) + breakpoint("inconsistent number of blocks") self.prune_blks() diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 185dd38..dd9677b 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -156,11 +156,6 @@ class Path(object): def position_of(self, blk, num): index = [i for i, n in enumerate(self.nodes) if n.blk == blk] if len(index) <= num: - # print(f"NODES: {self.nodes}") - # print(f"BLOCK: {blk}") - # print(f"POSITION: {self.position}") - # print(f"INDEX: {index}") - # breakpoint("break") return None return (self.position[index[num]], self.position[index[num]+1]) -- cgit v1.2.1 From b1af35822273ebe52f6fa089cfb5bc5f8ef542a3 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 12:52:20 -0700 Subject: fix: bug associated to position of using index absolutely instead of relative to true dictionary key --- pangraph/graph.py | 5 ++++- pangraph/sequence.py | 12 ++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 2926d3f..afff779 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -478,7 +478,10 @@ class Graph(object): blk_list.intersection_update(set([b.id for b in blks])) for tag in qry.muts.keys(): - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_qrys], key=lambda x: x[0]) + try: + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_qrys], key=lambda x: x[0]) + except: + breakpoint("bad find") beg, end = pos[0], pos[-1] blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] blk_list.intersection_update(set([b.id for b in blks])) diff --git a/pangraph/sequence.py b/pangraph/sequence.py index dd9677b..6b9baba 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -88,12 +88,6 @@ class Path(object): return seq - # def position_of(self, blk): - # for i, n in enumerate(self.nodes): - # if n.blk == blk: - # return i, n.num - # raise ValueError("block not found in path") - def rm_nil_blks(self): good, popped = [], set() for i, n in enumerate(self.nodes): @@ -121,6 +115,8 @@ class Path(object): try: i, j = ids.index(start[0]), ids.index(stop[0]) + if N > 0: + breakpoint("HIT") if self.nodes[i].strand == start[1]: beg, end, s = i, j, Strand.Plus else: @@ -154,8 +150,8 @@ class Path(object): self.position = np.cumsum([0] + [n.length(self.name) for n in self.nodes]) def position_of(self, blk, num): - index = [i for i, n in enumerate(self.nodes) if n.blk == blk] - if len(index) <= num: + index = { n.num:i for i, n in enumerate(self.nodes) if n.blk == blk } + if not num in index: return None return (self.position[index[num]], self.position[index[num]+1]) -- cgit v1.2.1 From 967b6412dddc99de429e692973819bb7147b0354 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 14:40:16 -0700 Subject: feat: prototype of mafft alignment of blocks --- Makefile | 2 +- pangraph/build.py | 3 ++- pangraph/graph.py | 21 ++++++++++++++++++++- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 89f9e94..117dbc9 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ 
pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json #1>staph.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null # figures diff --git a/pangraph/build.py b/pangraph/build.py index 14f96d3..4577732 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -85,6 +85,7 @@ def main(args): with open(f"{root}/graph_{i:03d}.fa", 'w') as fd: g.write_fasta(fd) - T.write_json(sys.stdout, no_seqs=True) + # NOTE: uncomment when done debugging + # T.write_json(sys.stdout, no_seqs=True) return 0 diff --git a/pangraph/graph.py b/pangraph/graph.py index afff779..f45cfd2 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -5,6 +5,8 @@ import pprint from glob import glob from collections import defaultdict, Counter +from itertools import chain +import subprocess from Bio import SeqIO, Phylo from Bio.Seq import Seq @@ -486,7 +488,24 @@ class Graph(object): blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] blk_list.intersection_update(set([b.id for b in blks])) - print(f"LEN: {len(blk_list)-len(shared_blks)}") + delta = len(blk_list)-len(shared_blks) + if delta > 0: + print(f"LEN: {delta}") + buf = "" + for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): + buf += ">isolate_{i:04d}\n" + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + for b in blks: + buf += b.extract(*tag) + buf += '\n' + proc = subprocess.run(["mafft", "-"], stdout=subprocess.PIPE, input=buf, encoding='ascii') + print(proc.stdout) + print(proc.stderr) + breakpoint('test') + else: + print(f"NO MATCH") + + # NOTE: debugging code if len(blk_list) < len(shared_blks): ref_list = set() first = True -- cgit v1.2.1 From 25b974618ec3242a9d7d7373b5c9a3a8bf0f0681 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 15:33:22 -0700 Subject: feat: now assess the alignment based on total tree length --- Makefile | 2 +- pangraph/graph.py | 42 +++++++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 117dbc9..6061cfc 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 2>/dev/null #1>staph.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index f45cfd2..5a3c618 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -1,12 +1,13 @@ -import os, sys +import io, os, sys import json import numpy as np import pprint +import subprocess +import tempfile from glob import glob from collections import defaultdict, Counter from itertools import chain -import subprocess from Bio import SeqIO, Phylo from Bio.Seq import Seq @@ -468,6 +469,7 @@ class Graph(object): blk_list = set() first = True + num_seqs = 0 for tag in ref.muts.keys(): # NOTE: this is a hack to deal with flipped orientations pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) @@ -478,6 +480,7 @@ class Graph(object): first = False else: blk_list.intersection_update(set([b.id for b in blks])) + num_seqs += 1 for tag in qry.muts.keys(): try: @@ -487,21 +490,30 @@ class Graph(object): beg, end = pos[0], pos[-1] blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] blk_list.intersection_update(set([b.id for b in 
blks])) + num_seqs += 1 delta = len(blk_list)-len(shared_blks) - if delta > 0: - print(f"LEN: {delta}") - buf = "" - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - buf += ">isolate_{i:04d}\n" - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - for b in blks: - buf += b.extract(*tag) - buf += '\n' - proc = subprocess.run(["mafft", "-"], stdout=subprocess.PIPE, input=buf, encoding='ascii') - print(proc.stdout) - print(proc.stderr) - breakpoint('test') + if delta > 0 and num_seqs > 1: + print(f"LEN: {delta}", end="\t") + fd, path = tempfile.mkstemp() + try: + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): + tmp.write(f">isolate_{i:04d}\n") + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + for b in blks: + tmp.write(b.extract(*tag)) + tmp.write('\n') + tmp.flush() + proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out, err = proc.communicate() + tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + print(tree.total_branch_length()) + finally: + os.remove(path) else: print(f"NO MATCH") -- cgit v1.2.1 From fe39e4fa983ace442e99a0e8811e69fef18c48d9 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 11:07:31 -0700 Subject: feat: use fixed width window into each block for end alignment instead of entire block --- Makefile | 2 +- pangraph/graph.py | 92 ++++++++++++++++++++++++++++--------------------------- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/Makefile b/Makefile index 6061cfc..60266a5 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 2>/dev/null #1>staph.log + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json #1>staph.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index 5a3c618..1664d03 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -495,55 +495,57 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: print(f"LEN: {delta}", end="\t") - fd, path = tempfile.mkstemp() - try: - with os.fdopen(fd, 'w') as tmp: - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - tmp.write(f">isolate_{i:04d}\n") - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - for b in blks: - tmp.write(b.extract(*tag)) - tmp.write('\n') - tmp.flush() - proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - out, err = proc.communicate() - tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(tree.total_branch_length()) - finally: - os.remove(path) + for i, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]), (end[1],end[1]+EXTEND)]): + fd, path = tempfile.mkstemp() + try: + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): + tmp.write(f">isolate_{i:04d}\n") + blks = self.seqs[tag[0]][left:right] + for b in blks: + tmp.write(b.extract(*tag)) + tmp.write('\n') + tmp.flush() + print(f"aligning {num_seqs} seqs") + proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out, err = proc.communicate() + tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + print(f"-> NUM {i}: 
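# ----------------------------------------------------------------------------
# A self-contained version of the scoring idea above, assuming `mafft` and
# `fasttree` are on PATH and `records` maps sequence names to strings; the
# score is the total branch length of the quick tree, normalized by the number
# of sequences so windows of different depth can be compared.
import io, os, subprocess, tempfile
from Bio import Phylo

def branch_length_score(records):
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'w') as tmp:
            for name, seq in records.items():
                tmp.write(f">{name}\n{seq}\n")
        proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree",
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                shell=True)
        out, _ = proc.communicate()
        tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick')
        return tree.total_branch_length() / (2*len(records))
    finally:
        os.remove(path)
# ----------------------------------------------------------------------------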
{tree.total_branch_length()}") + finally: + os.remove(path) else: print(f"NO MATCH") # NOTE: debugging code - if len(blk_list) < len(shared_blks): - ref_list = set() - first = True - for tag in ref.muts.keys(): - beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) - end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - if first: - ref_list = set([b.id for b in blks]) - first = False - else: - ref_list.intersection_update(set([b.id for b in blks])) - - qry_list = set() - first = True - for tag in qry.muts.keys(): - beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) - end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - if first: - qry_list = set([b.id for b in blks]) - first = False - else: - qry_list.intersection_update(set([b.id for b in blks])) - - breakpoint("inconsistent number of blocks") + # if len(blk_list) < len(shared_blks): + # ref_list = set() + # first = True + # for tag in ref.muts.keys(): + # beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) + # end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) + # blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + # if first: + # ref_list = set([b.id for b in blks]) + # first = False + # else: + # ref_list.intersection_update(set([b.id for b in blks])) + + # qry_list = set() + # first = True + # for tag in qry.muts.keys(): + # beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) + # end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) + # blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + # if first: + # qry_list = set([b.id for b in blks]) + # first = False + # else: + # qry_list.intersection_update(set([b.id for b in blks])) + + # breakpoint("inconsistent number of blocks") self.prune_blks() -- cgit v1.2.1 From da2f92077e0d773b065cea1b576d9f5171cd0091 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 11:20:37 -0700 Subject: fix: made window offset a global parameter in anticipation of a cli parameter --- pangraph/graph.py | 7 +++++-- pangraph/sequence.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 1664d03..a38a4e3 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -21,6 +21,7 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand # ------------------------------------------------------------------------ # globals +WINDOW = 1000 EXTEND = 2500 pp = pprint.PrettyPrinter(indent=4) @@ -495,12 +496,14 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: print(f"LEN: {delta}", end="\t") - for i, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]), (end[1],end[1]+EXTEND)]): + for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): fd, path = tempfile.mkstemp() try: with os.fdopen(fd, 'w') as tmp: for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): tmp.write(f">isolate_{i:04d}\n") + if left == right: + breakpoint("no difference") blks = self.seqs[tag[0]][left:right] for b in blks: tmp.write(b.extract(*tag)) @@ -513,7 +516,7 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> NUM {i}: {tree.total_branch_length()}") + print(f"-> NUM {n}: {tree.total_branch_length()}") finally: os.remove(path) else: diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 
6b9baba..c437602 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -173,7 +173,8 @@ class Path(object): i = np.searchsorted(self.position, beg, side='right') - 1 j = np.searchsorted(self.position, end, side='left') - assert i < j, f"not sorted, {beg}-{end}" + if i > j: + breakpoint(f"not sorted, {beg}-{end}") return l + [n.blk for n in self.nodes[i:j]] + r elif isinstance(index, int): i = np.searchsorted(self.position, index, side='right') - 1 -- cgit v1.2.1 From 6fed97e86a7e07713884efc9c485b2b7c2b66b68 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 11:55:01 -0700 Subject: fix: correctly deal with block offsets when slicing for sequences --- pangraph/graph.py | 6 ++---- pangraph/sequence.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index a38a4e3..3395891 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -504,13 +504,11 @@ class Graph(object): tmp.write(f">isolate_{i:04d}\n") if left == right: breakpoint("no difference") - blks = self.seqs[tag[0]][left:right] - for b in blks: - tmp.write(b.extract(*tag)) + tmp.write(self.seqs[tag[0]].sequence_range(left,right)) tmp.write('\n') tmp.flush() print(f"aligning {num_seqs} seqs") - proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree", + proc = subprocess.Popen(f"mafft --auto {path} | fasttree", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) diff --git a/pangraph/sequence.py b/pangraph/sequence.py index c437602..62bfd32 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -155,6 +155,39 @@ class Path(object): return None return (self.position[index[num]], self.position[index[num]+1]) + # TODO: pull out common functionality into a helper function + # TODO: merge with other sequence function + def sequence_range(self, start=None, stop=None): + beg = start or 0 + end = stop or self.position[-1] + l, r = "", "" + if beg < 0: + if len(self.nodes) > 1: + l = self.sequence_range(self.position[-1]+beg,self.position[-1]) + beg = 0 + if end > self.position[-1]: + if len(self.nodes) > 1: + r = self.sequence_range(0,end-self.position[-1]) + end = self.position[-1] + if beg > end: + beg, end = end, beg + + i = np.searchsorted(self.position, beg, side='right') - 1 + j = np.searchsorted(self.position, end, side='left') + m = "" + if i < j: + if beg < self.position[i]: + breakpoint("bad start search") + if end > self.position[j]: + breakpoint("bad end search") + m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):] + for n in self.nodes[i+1:j-1]: + m += n.blk.extract(self.name, n.num) + if j < len(self.nodes): + b = self.nodes[j].blk.extract(self.name, self.nodes[j].num) + m += b[0:(len(b)+self.position[j]-end)] + return l + m + r + def __getitem__(self, index): if isinstance(index, slice): beg = index.start or 0 @@ -162,7 +195,7 @@ class Path(object): l, r = [], [] if beg < 0: if len(self.nodes) > 1: - l = self[self.position[-1] + beg:self.position[-1]] + l = self[(self.position[-1]+beg):self.position[-1]] beg = 0 if end > self.position[-1]: if len(self.nodes) > 1: -- cgit v1.2.1 From c5209e086026a57c03501ff9194e15b6b8b0f404 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 12:34:10 -0700 Subject: fix: slicing bug that lead to incorrect lengths --- pangraph/graph.py | 43 +++++++++++++++++++++++-------------------- pangraph/sequence.py | 23 +++++++++++++---------- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git 
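# ----------------------------------------------------------------------------
# A toy version of the coordinate-to-sequence extraction being added above,
# assuming 'seqs' holds each node's contribution along one isolate's path
# (invented data; the real code calls Block.extract and handles circular wrap).
import numpy as np

seqs     = ["AAAA", "CCCCCC", "GG", "TTTTT"]
position = np.cumsum([0] + [len(s) for s in seqs])   # [0 4 10 12 17]

def sequence_range(beg, end):
    i = np.searchsorted(position, beg, side='right') - 1
    j = np.searchsorted(position, end, side='left')
    out = ""
    for k in range(i, j):
        lo = max(beg, position[k])   - position[k]
        hi = min(end, position[k+1]) - position[k]
        out += seqs[k][lo:hi]
    return out

assert sequence_range(2, 11) == "AACCCCCCG"
# ----------------------------------------------------------------------------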
a/pangraph/graph.py b/pangraph/graph.py index 3395891..89ebadc 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -496,27 +496,30 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: print(f"LEN: {delta}", end="\t") - for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): + try: fd, path = tempfile.mkstemp() - try: - with os.fdopen(fd, 'w') as tmp: - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - tmp.write(f">isolate_{i:04d}\n") - if left == right: - breakpoint("no difference") - tmp.write(self.seqs[tag[0]].sequence_range(left,right)) - tmp.write('\n') - tmp.flush() - print(f"aligning {num_seqs} seqs") - proc = subprocess.Popen(f"mafft --auto {path} | fasttree", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - out, err = proc.communicate() - tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> NUM {n}: {tree.total_branch_length()}") - finally: - os.remove(path) + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] + + tmp.write(f">isolate_{i:04d}\n") + for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW)]): #, (end[1]-WINDOW,end[1]+EXTEND)]): + s = self.seqs[tag[0]].sequence_range(left,right) + if len(s) > EXTEND + WINDOW: + breakpoint(f"bad sequence slicing: {len(s)}") + tmp.write(s + '\n') + tmp.flush() + print(f"aligning {num_seqs} seqs") + proc = subprocess.Popen(f"mafft --auto {path} | fasttree", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out, err = proc.communicate() + tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + print(f"-> NUM {n}: {tree.total_branch_length()}") + finally: + os.remove(path) else: print(f"NO MATCH") diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 62bfd32..3660b71 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -161,6 +161,8 @@ class Path(object): beg = start or 0 end = stop or self.position[-1] l, r = "", "" + # if start == 159 and stop == 3659: + # breakpoint("test") if beg < 0: if len(self.nodes) > 1: l = self.sequence_range(self.position[-1]+beg,self.position[-1]) @@ -176,16 +178,17 @@ class Path(object): j = np.searchsorted(self.position, end, side='left') m = "" if i < j: - if beg < self.position[i]: - breakpoint("bad start search") - if end > self.position[j]: - breakpoint("bad end search") - m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):] - for n in self.nodes[i+1:j-1]: - m += n.blk.extract(self.name, n.num) - if j < len(self.nodes): - b = self.nodes[j].blk.extract(self.name, self.nodes[j].num) - m += b[0:(len(b)+self.position[j]-end)] + if j >= len(self.position): + breakpoint("what?") + if i == j - 1: + m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):(end-self.position[i])] + else: + m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):] + for n in self.nodes[i+1:j-1]: + m += n.blk.extract(self.name, n.num) + n = self.nodes[j-1] + b = n.blk.extract(self.name, n.num) + m += b[0:(end-self.position[j-1])] return l + m + r def __getitem__(self, index): -- cgit v1.2.1 From 66bd4dc85b165756f4f1924ff953813c899a9782 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 13:12:46 -0700 Subject: fix: odd bug associated to 
non-unicode characters in stream --- pangraph/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pangraph/utils.py b/pangraph/utils.py index e0d85fa..5d5e888 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -97,7 +97,10 @@ def as_array(x): return np.array(list(x)) def as_string(x): - return x.view(f'U{x.size}')[0] + try: + return x.view(f'U{x.size}')[0] + except: + return "".join(str(c) for c in x) def flatten(x): return np.ndarray.flatten(x[:]) -- cgit v1.2.1 From 9fa0dc81e1a6162736fbefe7ffffde79f9f808ce Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 15:42:54 -0700 Subject: fix: added left and right extension back in --- pangraph/graph.py | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 89ebadc..2f6572a 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -496,30 +496,41 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: print(f"LEN: {delta}", end="\t") + fd = [None, None] + path = [None, None] try: - fd, path = tempfile.mkstemp() - with os.fdopen(fd, 'w') as tmp: + fd[0], path[0] = tempfile.mkstemp() + fd[1], path[1] = tempfile.mkstemp() + with os.fdopen(fd[0], 'w') as tmp1, os.fdopen(fd[1], 'w') as tmp2: + tmp = [tmp1, tmp2] for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - tmp.write(f">isolate_{i:04d}\n") - for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW)]): #, (end[1]-WINDOW,end[1]+EXTEND)]): + for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): + tmp[n].write(f">isolate_{i:04d}\n") s = self.seqs[tag[0]].sequence_range(left,right) if len(s) > EXTEND + WINDOW: breakpoint(f"bad sequence slicing: {len(s)}") - tmp.write(s + '\n') - tmp.flush() - print(f"aligning {num_seqs} seqs") - proc = subprocess.Popen(f"mafft --auto {path} | fasttree", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - out, err = proc.communicate() - tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> NUM {n}: {tree.total_branch_length()}") + tmp[n].write(s + '\n') + + tmp[0].flush() + tmp[1].flush() + + def make_tree(n): + proc = subprocess.Popen(f"mafft --auto {path[n]} | fasttree", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out, err = proc.communicate() + tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + print(f"-> {n} SCORE: {tree.total_branch_length()/(2*{num_seqs})}") + + make_tree(0) + make_tree(1) finally: - os.remove(path) + os.remove(path[0]) + os.remove(path[1]) else: print(f"NO MATCH") -- cgit v1.2.1 From bd21a4412a3e35f32ad84cf19328b7c89e0ed6e5 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 08:44:51 -0700 Subject: chore: update makefile to redirect to correct targets --- Makefile | 2 +- pangraph/graph.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 60266a5..117dbc9 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json #1>staph.log + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null # figures diff --git 
a/pangraph/graph.py b/pangraph/graph.py index 2f6572a..1a67c16 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -524,7 +524,7 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> {n} SCORE: {tree.total_branch_length()/(2*{num_seqs})}") + print(f"-> {n} SCORE: {tree.total_branch_length()/(2*num_seqs)}") make_tree(0) make_tree(1) -- cgit v1.2.1 From d1ac5cedcc325be21de8e75bed39ef7741a76a19 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 11:02:50 -0700 Subject: feat: added simple fasta parser --- pangraph/utils.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pangraph/utils.py b/pangraph/utils.py index 5d5e888..82e0e3c 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -160,6 +160,30 @@ def getnwk(node, newick, parentdist, leaf_names): # ------------------------------------------------------------------------ # parsers +def parse_fasta(fh): + class Record: + def __init__(self, name=None, meta=None, seq=None): + self.seq = seq + self.name = name + self.meta = meta + + header = fh.readline() + while True: + if header == "": + return 1, None + if header[0] != '>': + return 0, "improper fasta file syntax" + + name = header[1:].split() + seq = "" + for line in fh: + if line == "" or line[0] == ">": + break + seq += line + + header = line + yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq) + def parse_paf(fh): hits = [] for line in fh: -- cgit v1.2.1 From 91e389d6e3d0c33f02474804802b053bb67cc48d Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 11:40:11 -0700 Subject: fix: corrected the iterator --- pangraph/utils.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/pangraph/utils.py b/pangraph/utils.py index 82e0e3c..eff9432 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -157,6 +157,11 @@ def getnwk(node, newick, parentdist, leaf_names): newick = "(%s" % (newick) return newick +def as_str(s): + if isinstance(s, bytes): + return s.decode('utf-8') + return s + # ------------------------------------------------------------------------ # parsers @@ -167,21 +172,23 @@ def parse_fasta(fh): self.name = name self.meta = meta - header = fh.readline() - while True: - if header == "": - return 1, None - if header[0] != '>': - return 0, "improper fasta file syntax" + def __str__(self): + return f">{self.name} {self.meta}\n{self.seq[:77]}...\n" + + def __repr__(self): + return str(self) + header = as_str(fh.readline()) + while header != "" and header[0] == ">": name = header[1:].split() seq = "" for line in fh: + line = as_str(line) if line == "" or line[0] == ">": break seq += line - header = line + header = as_str(line) yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq) def parse_paf(fh): -- cgit v1.2.1 From e01b49284da3d8384744695b7a5aeaa37bb47d4f Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 11:52:05 -0700 Subject: fix: use a string buffer --- pangraph/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pangraph/utils.py b/pangraph/utils.py index eff9432..3e553a0 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -3,6 +3,7 @@ import csv import gzip import numpy as np +from io import StringIO from enum import IntEnum from Bio import SeqIO @@ -181,15 +182,16 @@ def parse_fasta(fh): header = as_str(fh.readline()) while header != "" and header[0] == ">": name = header[1:].split() - seq = "" + seq = 
StringIO() for line in fh: line = as_str(line) if line == "" or line[0] == ">": break - seq += line + seq.write(line) header = as_str(line) - yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq) + yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq.getvalue()) + seq.close() def parse_paf(fh): hits = [] -- cgit v1.2.1 From bc17c2a9a5cd218180d272061d492dcbbf566144 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 11:54:29 -0700 Subject: feat: added code to filter plasmids from chromosomes --- scripts/filter_plasmids.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100755 scripts/filter_plasmids.py diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py new file mode 100755 index 0000000..9461451 --- /dev/null +++ b/scripts/filter_plasmids.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +script to filter plasmids and chromosomes from full genome assemblies +""" + +import os +import sys +import gzip +import builtins + +from glob import glob + +sys.path.insert(0, os.path.abspath('.')) # gross hack +from pangraph.utils import parse_fasta, breakpoint + +from Bio import SeqIO + +def open(path, *args, **kwargs): + if path.endswith('.gz'): + return gzip.open(path, *args, **kwargs) + else: + return builtins.open(path, *args, **kwargs) + +# def main(args): +# for d in args: + +from time import time +if __name__ == "__main__": + t0 = time() + for path in glob("data/staph/assemblies/*.fna.gz"): + with open(path, 'rt') as fd: + seqs = [record for record in SeqIO.parse(fd, 'fasta')] + t1 = time() + print(f"bio parser took {t1 - t0} seconds") + + t0 = time() + for path in glob("data/staph/assemblies/*.fna.gz"): + with open(path, 'rt') as fd: + seqs = [record for record in parse_fasta(fd)] + t1 = time() + print(f"my parser took {t1 - t0} seconds") + + # main(sys.argv[1:]) -- cgit v1.2.1 From 8d5626243ce5560161534cba114301fd3a4dd382 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 12:05:56 -0700 Subject: fix: remove new lines from input stream --- pangraph/utils.py | 6 ++++-- scripts/filter_plasmids.py | 19 ++++++------------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/pangraph/utils.py b/pangraph/utils.py index 3e553a0..e4b63f3 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -174,7 +174,9 @@ def parse_fasta(fh): self.meta = meta def __str__(self): - return f">{self.name} {self.meta}\n{self.seq[:77]}...\n" + NL = '\n' + nc = 80 + return f">{self.name} {self.meta}\n{NL.join([self.seq[i:(i+nc)] for i in range(0, len(self.seq), nc)])}" def __repr__(self): return str(self) @@ -187,7 +189,7 @@ def parse_fasta(fh): line = as_str(line) if line == "" or line[0] == ">": break - seq.write(line) + seq.write(line[:-1]) header = as_str(line) yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq.getvalue()) diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py index 9461451..4ef2670 100755 --- a/scripts/filter_plasmids.py +++ b/scripts/filter_plasmids.py @@ -13,8 +13,6 @@ from glob import glob sys.path.insert(0, os.path.abspath('.')) # gross hack from pangraph.utils import parse_fasta, breakpoint -from Bio import SeqIO - def open(path, *args, **kwargs): if path.endswith('.gz'): return gzip.open(path, *args, **kwargs) @@ -26,18 +24,13 @@ def open(path, *args, **kwargs): from time import time if __name__ == "__main__": - t0 = time() - for path in glob("data/staph/assemblies/*.fna.gz"): - with open(path, 'rt') as fd: - seqs = [record for record in 
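# ----------------------------------------------------------------------------
# A condensed, standalone take on the generator-style parser introduced above
# (simplified: text-mode input only, no metadata; buffered through StringIO so
# long sequences are not rebuilt by repeated string concatenation).
from io import StringIO

def parse_fasta(fh):
    name, buf = None, StringIO()
    for line in fh:
        line = line.rstrip('\n')
        if line.startswith('>'):
            if name is not None:
                yield name, buf.getvalue()
            name, buf = line[1:].split()[0], StringIO()
        else:
            buf.write(line)
    if name is not None:
        yield name, buf.getvalue()

# usage sketch: with open("assembly.fna") as fh: seqs = dict(parse_fasta(fh))
# ----------------------------------------------------------------------------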
SeqIO.parse(fd, 'fasta')] - t1 = time() - print(f"bio parser took {t1 - t0} seconds") - - t0 = time() for path in glob("data/staph/assemblies/*.fna.gz"): - with open(path, 'rt') as fd: + with open(path, 'rt') as fd, open("test.fa", 'w') as wtr: + for rec in parse_fasta(fd): + # print(str(rec)) + wtr.write(str(rec)) + wtr.write('\n') seqs = [record for record in parse_fasta(fd)] - t1 = time() - print(f"my parser took {t1 - t0} seconds") + break # main(sys.argv[1:]) -- cgit v1.2.1 From 8f8ce839b3edebd591e83bad8628302e995ab2e2 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 12:16:21 -0700 Subject: feat: filter plasmids now works on the directory level --- scripts/filter_plasmids.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py index 4ef2670..71aa23f 100755 --- a/scripts/filter_plasmids.py +++ b/scripts/filter_plasmids.py @@ -19,18 +19,24 @@ def open(path, *args, **kwargs): else: return builtins.open(path, *args, **kwargs) -# def main(args): -# for d in args: +def main(args): + for arg in args: + in_dir = f"data/{arg}/assemblies" + if not os.path.exists(in_dir): + print(f"{in_dir} doesn't exist. skipping...") + continue + + out_dir = f"data/{arg}-plasmid/assemblies" + if not os.path.exists(out_dir): + os.mkdirs(out_dir) + + for path in glob(f"{in_dir}/*.f?a*"): + with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr: + for i, rec in enumerate(parse_fasta(fd)): + if i == 0: + continue + wtr.write(str(rec)) + wtr.write('\n') -from time import time if __name__ == "__main__": - for path in glob("data/staph/assemblies/*.fna.gz"): - with open(path, 'rt') as fd, open("test.fa", 'w') as wtr: - for rec in parse_fasta(fd): - # print(str(rec)) - wtr.write(str(rec)) - wtr.write('\n') - seqs = [record for record in parse_fasta(fd)] - break - - # main(sys.argv[1:]) + main(sys.argv[1:]) -- cgit v1.2.1 From 3f29dfc9a170bd5e78e8fa77c52a6f0450b00c63 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 12:20:35 -0700 Subject: fix: typo in function call --- scripts/filter_plasmids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py index 71aa23f..81a902d 100755 --- a/scripts/filter_plasmids.py +++ b/scripts/filter_plasmids.py @@ -28,7 +28,7 @@ def main(args): out_dir = f"data/{arg}-plasmid/assemblies" if not os.path.exists(out_dir): - os.mkdirs(out_dir) + os.makedirs(out_dir) for path in glob(f"{in_dir}/*.f?a*"): with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr: -- cgit v1.2.1 From 582f1f29268f3c3792c5b965d6948ff4b7633671 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 12:43:26 -0700 Subject: feat: allow for plasmids and chromosome filtering --- scripts/filter_plasmids.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py index 81a902d..c309150 100755 --- a/scripts/filter_plasmids.py +++ b/scripts/filter_plasmids.py @@ -7,6 +7,7 @@ import os import sys import gzip import builtins +import argparse from glob import glob @@ -19,14 +20,18 @@ def open(path, *args, **kwargs): else: return builtins.open(path, *args, **kwargs) -def main(args): - for arg in args: - in_dir = f"data/{arg}/assemblies" +def main(dirs, plasmids=True): + for d in dirs: + in_dir = f"data/{d}/assemblies" if not 
os.path.exists(in_dir): print(f"{in_dir} doesn't exist. skipping...") continue - out_dir = f"data/{arg}-plasmid/assemblies" + if plasmids: + out_dir = f"data/{d}-plasmid/assemblies" + else: + out_dir = f"data/{d}-chromosome/assemblies" + if not os.path.exists(out_dir): os.makedirs(out_dir) @@ -34,9 +39,19 @@ def main(args): with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr: for i, rec in enumerate(parse_fasta(fd)): if i == 0: + if not plasmids: + wtr.write(str(rec)) + wtr.write('\n') + break continue + wtr.write(str(rec)) wtr.write('\n') +parser = argparse.ArgumentParser(description='seperate plasmids from chromosomes') +parser.add_argument('directories', metavar='dirs', nargs='+') +parser.add_argument('--chromosomes', default=False, action='store_true') + if __name__ == "__main__": - main(sys.argv[1:]) + args = parser.parse_args() + main(args.directories, plasmids=not args.chromosomes) -- cgit v1.2.1 From 80cede9edef586bbf68258f04ec3be8371afcc0c Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 10:39:05 -0700 Subject: feat: window and extend for end repair now CLI parameters --- Makefile | 2 +- pangraph/build.py | 18 +++++++++++++----- pangraph/graph.py | 20 ++++++++++---------- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 117dbc9..ab398b6 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null # figures diff --git a/pangraph/build.py b/pangraph/build.py index 4577732..5c3a8ec 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -38,6 +38,16 @@ def register_args(parser): type=int, default=22, help="energy cost for mutations (used during block merges)") + parser.add_argument("-w", "--window", + metavar="edge window", + type=int, + default=1000, + help="amount of sequence to align from for end repair") + parser.add_argument("-e", "--extend", + metavar="edge extend", + type=int, + default=1000, + help="amount of sequence to extend for end repair") parser.add_argument("-s", "--statistics", default=False, action='store_true', @@ -71,12 +81,10 @@ def main(args): mkdir(tmp) log("aligning") - with cProfile.Profile() as pr: - T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) - # TODO: when debugging phase is done, remove tmp directory + T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) + # TODO: when debugging phase is done, remove tmp directory - graphs = T.collect() - pr.dump_stats("perf.prof") + graphs = T.collect() for i, g in enumerate(graphs): log(f"graph {i}: nseqs: {len(g.seqs)} nblks: {len(g.blks)}") diff --git a/pangraph/graph.py b/pangraph/graph.py index 1a67c16..83ef839 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -21,8 +21,8 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand # ------------------------------------------------------------------------ # globals -WINDOW = 1000 -EXTEND = 2500 +# WINDOW = 1000 +# EXTEND = 2500 pp = pprint.PrettyPrinter(indent=4) # ------------------------------------------------------------------------ @@ -140,7 +140,7 @@ class Graph(object): graphs, names = [], [] for name, path in G.seqs.items(): blks = set([b.id for b in 
path.blocks()]) - gi = [ i for i, g in enumerate(graphs) if overlaps(blks, g)] + gi = [i for i, g in enumerate(graphs) if overlaps(blks, g)] if len(gi) == 0: graphs.append(blks) names.append(set([name])) @@ -163,7 +163,7 @@ class Graph(object): # --------------- # methods - def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False): + def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False): from seqanpy import align_global as align # ---------------------------------- @@ -324,7 +324,7 @@ class Graph(object): continue merged = True - self.merge(proc(hit)) + self.merge(proc(hit), edge_window, edge_extend) merged_blks.add(hit['ref']['name']) merged_blks.add(hit['qry']['name']) @@ -414,7 +414,7 @@ class Graph(object): blks.update(path.blocks()) self.blks = {b.id:self.blks[b.id] for b in blks} - def merge(self, hit): + def merge(self, hit, window, extend): old_ref = self.blks[hit['ref']['name']] old_qry = self.blks[hit['qry']['name']] @@ -475,7 +475,7 @@ class Graph(object): # NOTE: this is a hack to deal with flipped orientations pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] if first: blk_list = set([b.id for b in blks]) first = False @@ -489,7 +489,7 @@ class Graph(object): except: breakpoint("bad find") beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] blk_list.intersection_update(set([b.id for b in blks])) num_seqs += 1 @@ -507,10 +507,10 @@ class Graph(object): pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): + for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]): tmp[n].write(f">isolate_{i:04d}\n") s = self.seqs[tag[0]].sequence_range(left,right) - if len(s) > EXTEND + WINDOW: + if len(s) > extend + window: breakpoint(f"bad sequence slicing: {len(s)}") tmp[n].write(s + '\n') -- cgit v1.2.1 From 0931ce34c979da9e376feeed515b8515748f9a6f Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 11:27:49 -0700 Subject: fix: pass in cli parameters into union function now --- Makefile | 2 +- pangraph/build.py | 2 +- pangraph/graph.py | 2 +- pangraph/tree.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index ab398b6..0fe7c0a 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 100 data/staph/guide.json # figures diff --git a/pangraph/build.py b/pangraph/build.py index 5c3a8ec..e744149 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -81,7 +81,7 @@ def main(args): mkdir(tmp) log("aligning") - T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) + T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.window, args.extend, args.statistics) # TODO: when debugging phase is done, remove tmp directory graphs = T.collect() diff --git a/pangraph/graph.py 
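# ----------------------------------------------------------------------------
# A minimal sketch of how the new end-repair knobs reach the aligner, assuming
# an argparse entry point like build.py's; the parse_args list below is only a
# stand-in for a real command line.
import argparse

parser = argparse.ArgumentParser(description="toy pangraph-build front end")
parser.add_argument("-w", "--window", type=int, default=1000,
                    help="amount of sequence to align from for end repair")
parser.add_argument("-e", "--extend", type=int, default=2500,
                    help="amount of sequence to extend for end repair")
args = parser.parse_args(["-w", "1000", "-e", "2500"])
# the values are then threaded through, per the patch:
#   T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.window, args.extend, args.statistics)
print(args.window, args.extend)
# ----------------------------------------------------------------------------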
b/pangraph/graph.py index 83ef839..f4afa97 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -163,7 +163,7 @@ class Graph(object): # --------------- # methods - def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False): + def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False, edge_window=1000, edge_extend=2500): from seqanpy import align_global as align # ---------------------------------- diff --git a/pangraph/tree.py b/pangraph/tree.py index 64ac119..eeb52c8 100644 --- a/pangraph/tree.py +++ b/pangraph/tree.py @@ -292,7 +292,7 @@ class Tree(object): leafs = {n.name: n for n in self.get_leafs()} self.seqs = {leafs[name]:seq for name,seq in seqs.items()} - def align(self, tmpdir, min_blk_len, mu, beta, extensive, log_stats=False, verbose=False): + def align(self, tmpdir, min_blk_len, mu, beta, extensive, edge_window, edge_extend, log_stats=False, verbose=False): self.root.set_level(0) # NOTE: for debug logging stats = {} # --------------------------------------------- @@ -351,7 +351,7 @@ class Tree(object): graph1, fapath1 = node1.graph, node1.fapath graph2, fapath2 = node2.graph, node2.fapath graph = Graph.fuse(graph1, graph2) - graph, _ = graph.union(fapath1, fapath2, f"{tmpdir}/{n.name}", min_blk_len, mu, beta, extensive) + graph, _ = graph.union(fapath1, fapath2, f"{tmpdir}/{n.name}", min_blk_len, mu, beta, extensive, edge_window, edge_extend) else: graph = node1.graph @@ -361,7 +361,7 @@ class Tree(object): itr = f"{tmpdir}/{n.name}_iter_{i}" with open(f"{itr}.fa", 'w') as fd: graph.write_fasta(fd) - graph, contin = graph.union(itr, itr, f"{tmpdir}/{n.name}_iter_{i}", min_blk_len, mu, beta, extensive) + graph, contin = graph.union(itr, itr, f"{tmpdir}/{n.name}_iter_{i}", min_blk_len, mu, beta, extensive, edge_window, edge_extend) if not contin: return graph return graph -- cgit v1.2.1 From 53dbbf03451cf9495fb9602257ecadf2e1f843e5 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 13:32:02 -0700 Subject: feat: added prototype for plotting log file data --- Makefile | 2 +- pangraph/graph.py | 7 ++++--- pangraph/tree.py | 2 +- scripts/parse_log.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 scripts/parse_log.py diff --git a/Makefile b/Makefile index 0fe7c0a..b3f1d20 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 100 data/staph/guide.json + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 2>/dev/null 1>staph-e2500-w1000.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index f4afa97..235ef31 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -495,7 +495,7 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: - print(f"LEN: {delta}", end="\t") + print(f">LEN: {delta}", end=';') fd = [None, None] path = [None, None] try: @@ -524,15 +524,16 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> {n} SCORE: {tree.total_branch_length()/(2*num_seqs)}") + print(f"-> {n} SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") make_tree(0) make_tree(1) + print("\n", end="") finally: os.remove(path[0]) os.remove(path[1]) else: - print(f"NO MATCH") + print(f">NO 
MATCH") # NOTE: debugging code # if len(blk_list) < len(shared_blks): diff --git a/pangraph/tree.py b/pangraph/tree.py index eeb52c8..d36b77d 100644 --- a/pangraph/tree.py +++ b/pangraph/tree.py @@ -383,7 +383,7 @@ class Tree(object): for n in self.postorder(): if n.is_leaf(): continue - print(f"---NODE LEVEL {n.level}---") + print(f"+++LEVEL={n.level}+++") n.fapath = f"{tmpdir}/{n.name}" log(f"fusing {n.child[0].name} with {n.child[1].name} @ {n.name}") n.graph = merge(*n.child) diff --git a/scripts/parse_log.py b/scripts/parse_log.py new file mode 100644 index 0000000..1caae07 --- /dev/null +++ b/scripts/parse_log.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +""" +script to process our end repair log files for plotting +""" +import argparse +from collections import defaultdict + +level_preset = "+++LEVEL=" +level_offset = len(level_preset) + +score_preset = "SCORE=" +score_offset = len(score_preset) + +def main(args): + for log_path in args: + with open(log_path) as log: + level = -1 + stats = defaultdict(lambda: {'hits':[], 'miss': 0}) + for line in log: + line.rstrip('\n') + if line[0] == "+": + assert line.startwith(level_preset), "check syntax in log file" + level = int(line[level_offset:line.find("+++", level_offset)]) + continue + if line[0] == ">": + if line[1:] == "NO MATCH": + stats[level]['miss'] += 1 + continue + if line[1:].startswith("LEN="): + offset = [line.find(";")] + offset.append(line.find(";", offset[0])) + score[0] = int(line[line.find(score_preset, offset[0])+1+score_offset:offset[1]) + score[1] = int(line[line.find(score_preset, offset[1])+1+score_offset:) + stats[level]['hits'].extend(score) + + raise ValueError(f"invalid syntax: {line}") + print(stats) + +parser = argparse.ArgumentParser(description='process our data log files on end repair') +parser.add_argument('files', type=str, nargs='+') + +if __name__ == "__main__": + args = parser.parse_args() + main(args.files) -- cgit v1.2.1 From 70e85b88c3038ad707d1e7a149be3b502c58f868 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 13:34:13 -0700 Subject: fix: small typos --- scripts/parse_log.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) mode change 100644 => 100755 scripts/parse_log.py diff --git a/scripts/parse_log.py b/scripts/parse_log.py old mode 100644 new mode 100755 index 1caae07..82d5e0d --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -19,21 +19,23 @@ def main(args): for line in log: line.rstrip('\n') if line[0] == "+": - assert line.startwith(level_preset), "check syntax in log file" + assert line.startswith(level_preset), "check syntax in log file" level = int(line[level_offset:line.find("+++", level_offset)]) continue if line[0] == ">": - if line[1:] == "NO MATCH": + if line[1:].startswith("NO MATCH"): stats[level]['miss'] += 1 continue if line[1:].startswith("LEN="): offset = [line.find(";")] offset.append(line.find(";", offset[0])) - score[0] = int(line[line.find(score_preset, offset[0])+1+score_offset:offset[1]) - score[1] = int(line[line.find(score_preset, offset[1])+1+score_offset:) + score[0] = int(line[line.find(score_preset, offset[0])+1+score_offset:offset[1]]) + score[1] = int(line[line.find(score_preset, offset[1])+1+score_offset:]) stats[level]['hits'].extend(score) + continue + + raise ValueError(f"invalid syntax: {line[1:]}") - raise ValueError(f"invalid syntax: {line}") print(stats) parser = argparse.ArgumentParser(description='process our data log files on end repair') -- cgit v1.2.1 From 7e711b0d4693b301f7877d570198843238f90960 Mon Sep 17 
00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 13:34:44 -0700 Subject: fix: harmonize log file syntax --- pangraph/graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 235ef31..500df82 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -524,7 +524,7 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> {n} SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + print(f"{n} SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") make_tree(0) make_tree(1) -- cgit v1.2.1 From 8b50c4516a6e53c9d6f06207a0c636f9348967e2 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 14:19:10 -0700 Subject: fix: reharmonize syntax of log file --- pangraph/graph.py | 4 ++-- scripts/parse_log.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 500df82..c4e3f83 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -495,7 +495,7 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: - print(f">LEN: {delta}", end=';') + print(f">LEN={delta}", end=';') fd = [None, None] path = [None, None] try: @@ -524,7 +524,7 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"{n} SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") make_tree(0) make_tree(1) diff --git a/scripts/parse_log.py b/scripts/parse_log.py index 82d5e0d..bbca0ea 100755 --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -28,16 +28,16 @@ def main(args): continue if line[1:].startswith("LEN="): offset = [line.find(";")] - offset.append(line.find(";", offset[0])) - score[0] = int(line[line.find(score_preset, offset[0])+1+score_offset:offset[1]]) - score[1] = int(line[line.find(score_preset, offset[1])+1+score_offset:]) + offset.append(line.find(";", offset[0]+1)) + offset.append(line.find(";", offset[1]+1)) + + score = [None, None] + score[0] = float(line[offset[0]+1+score_offset:offset[1]]) + score[1] = float(line[offset[1]+1+score_offset:offset[2]]) stats[level]['hits'].extend(score) continue - raise ValueError(f"invalid syntax: {line[1:]}") - print(stats) - parser = argparse.ArgumentParser(description='process our data log files on end repair') parser.add_argument('files', type=str, nargs='+') -- cgit v1.2.1 From 1c8a828dbe87d5adf6a0f528fe9f0e07ee32fa68 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 15:00:45 -0700 Subject: fix: allow manual override for tmp directory number --- pangraph/build.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pangraph/build.py b/pangraph/build.py index e744149..8cbfa1d 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -52,6 +52,10 @@ def register_args(parser): default=False, action='store_true', help="boolean flag that toggles whether the graph statistics are computed for intermediate graphs") + parser.add_argument("-n", "--tmp-dir-num", + type=int, + default=-1, + help="manually sets the tmp directory number. 
internal use only.") parser.add_argument("input", type=str, default="-", @@ -74,10 +78,13 @@ def main(args): root = args.dir.rstrip('/') tmp = f"{root}/tmp" - i = 0 - while os.path.isdir(tmp) and i < 32: - i += 1 - tmp = f"{root}/tmp{i:03d}" + if args.n == -1: + i = 0 + while os.path.isdir(tmp) and i < 64: + i += 1 + tmp = f"{root}/tmp{i:03d}" + else: + tmp = f"{root}/tmp{args.n:03d}" mkdir(tmp) log("aligning") -- cgit v1.2.1 From c712cd0f686df973f6281bbed98e821243bc3684 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 15:04:20 -0700 Subject: fix: small name change --- pangraph/build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangraph/build.py b/pangraph/build.py index 8cbfa1d..5caceb3 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -52,7 +52,7 @@ def register_args(parser): default=False, action='store_true', help="boolean flag that toggles whether the graph statistics are computed for intermediate graphs") - parser.add_argument("-n", "--tmp-dir-num", + parser.add_argument("-n", "--num", type=int, default=-1, help="manually sets the tmp directory number. internal use only.") @@ -78,7 +78,7 @@ def main(args): root = args.dir.rstrip('/') tmp = f"{root}/tmp" - if args.n == -1: + if args.num == -1: i = 0 while os.path.isdir(tmp) and i < 64: i += 1 -- cgit v1.2.1 From f90622c44bc0fe71a1e5a12a3dbc22d19ee5f770 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 15:05:37 -0700 Subject: fix: small name change v2 --- pangraph/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangraph/build.py b/pangraph/build.py index 5caceb3..92708a4 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -84,7 +84,7 @@ def main(args): i += 1 tmp = f"{root}/tmp{i:03d}" else: - tmp = f"{root}/tmp{args.n:03d}" + tmp = f"{root}/tmp{args.num:03d}" mkdir(tmp) log("aligning") -- cgit v1.2.1 From 78639f76e10810296607df2e9b3f839117185cde Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 2 Sep 2020 12:22:23 -0700 Subject: feat: added output of alignments to logging --- Makefile | 2 +- pangraph/graph.py | 16 +++++++++++++--- scripts/parse_log.py | 12 +++++++++--- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index b3f1d20..b0b9ffb 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 2>/dev/null 1>staph-e2500-w1000.log + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index c4e3f83..d154f74 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -518,12 +518,22 @@ class Graph(object): tmp[1].flush() def make_tree(n): - proc = subprocess.Popen(f"mafft --auto {path[n]} | fasttree", + proc = [None, None] + out = [None, None] + err = [None, None] + proc[0] = subprocess.Popen(f"mafft --auto {path[n]}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out, err = proc.communicate() - tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + proc[1] = subprocess.Popen(f"fasttree", + stdin =subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out[0], err[0] = proc[0].communicate() + out[1], err[1] = proc[1].communicate(input=out[0]) + tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), 
format='newick') + print(f"ALIGNMENT=\n{out[0]}") print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") make_tree(0) diff --git a/scripts/parse_log.py b/scripts/parse_log.py index bbca0ea..3a0aed0 100755 --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -12,10 +12,11 @@ score_preset = "SCORE=" score_offset = len(score_preset) def main(args): + results = {} for log_path in args: + stats = defaultdict(lambda: {'hits':[], 'miss': 0}) + level = -1 with open(log_path) as log: - level = -1 - stats = defaultdict(lambda: {'hits':[], 'miss': 0}) for line in log: line.rstrip('\n') if line[0] == "+": @@ -37,10 +38,15 @@ def main(args): stats[level]['hits'].extend(score) continue raise ValueError(f"invalid syntax: {line[1:]}") + if len(stats) > 0: + path = log_path.replace(".log", "").split("-") + e, w = int(path[1][1:]), int(path[2][1:]) + results[(e,w)] = dict(stats) + return results parser = argparse.ArgumentParser(description='process our data log files on end repair') parser.add_argument('files', type=str, nargs='+') if __name__ == "__main__": args = parser.parse_args() - main(args.files) + results = main(args.files) -- cgit v1.2.1 From d1712b5a2bbeb61c0660c57e76ccb0e62230edee Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 10 Sep 2020 13:55:29 -0700 Subject: fix: more consistent handling of left/right extension --- Makefile | 4 +- pangraph/graph.py | 134 +++++++++++++++++++++++---------------------------- scripts/parse_log.py | 50 ++++++++++++++++--- 3 files changed, 105 insertions(+), 83 deletions(-) diff --git a/Makefile b/Makefile index b0b9ffb..d1a401d 100644 --- a/Makefile +++ b/Makefile @@ -55,8 +55,8 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log - + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json +# 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log # figures # figs/figure1.png: $(STAT) diff --git a/pangraph/graph.py b/pangraph/graph.py index d154f74..cc0c8e7 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -468,60 +468,77 @@ class Graph(object): new_blocks.extend(update(old_ref, new_refs, hit['ref'], Strand.Plus)) new_blocks.extend(update(old_qry, new_qrys, hit['qry'], hit['orientation'])) - blk_list = set() + lblks_set_x, rblks_set_x = set(), set() + lblks_set_s, rblks_set_s = set(), set() first = True num_seqs = 0 - for tag in ref.muts.keys(): + for tag in shared_blks[0].muts.keys(): # NOTE: this is a hack to deal with flipped orientations - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] + lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] + rblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] + + lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] if first: - blk_list = set([b.id for b in blks]) + lblks_set_x = set([b.id for b in lblks_x]) + rblks_set_x = set([b.id for b in rblks_x]) + + lblks_set_s = set([b.id for b in lblks_s]) + rblks_set_s = set([b.id for b in rblks_s]) + first = False else: - blk_list.intersection_update(set([b.id for b in blks])) - num_seqs += 1 + lblks_set_x.intersection_update(set([b.id for b in lblks_x])) + 
rblks_set_x.intersection_update(set([b.id for b in rblks_x])) - for tag in qry.muts.keys(): - try: - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_qrys], key=lambda x: x[0]) - except: - breakpoint("bad find") - beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] - blk_list.intersection_update(set([b.id for b in blks])) + lblks_set_s.intersection_update(set([b.id for b in lblks_s])) + rblks_set_s.intersection_update(set([b.id for b in rblks_s])) num_seqs += 1 - delta = len(blk_list)-len(shared_blks) - if delta > 0 and num_seqs > 1: - print(f">LEN={delta}", end=';') - fd = [None, None] - path = [None, None] - try: - fd[0], path[0] = tempfile.mkstemp() - fd[1], path[1] = tempfile.mkstemp() - with os.fdopen(fd[0], 'w') as tmp1, os.fdopen(fd[1], 'w') as tmp2: - tmp = [tmp1, tmp2] - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) - beg, end = pos[0], pos[-1] - - for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]): - tmp[n].write(f">isolate_{i:04d}\n") + def emit(side): + if side == 'left': + delta = len(lblks_set_x)-len(lblks_set_s) + elif side == 'right': + delta = len(rblks_set_x)-len(rblks_set_s) + else: + raise ValueError(f"unrecognized argument '{side}' for side") + + if delta > 0 and num_seqs > 1: + print(f">LEN={delta}", end=';') + try: + fd, path = tempfile.mkstemp() + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(merged_blks[0].muts.keys()): + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] + + if side == 'left': + left, right = beg[0]-extend,beg[0]+window + elif side == 'right': + left, right = end[1]-window,end[1]+extend + else: + raise ValueError(f"unrecognized argument '{side}' for side") + + iso_blks = self.seqs[tag[0]][left:right] + print("POSITIONS", pos) + print("LIST", lblks_set_x if side == 'left' else rblks_set_x) + print("SHARED", lblks_set_s if side == 'left' else rblks_set_s) + print("ISO", iso_blks) + breakpoint("stop") + tmp.write(f">isolate_{i:04d} {','.join(b.id for b in iso_blks)}\n") s = self.seqs[tag[0]].sequence_range(left,right) if len(s) > extend + window: breakpoint(f"bad sequence slicing: {len(s)}") - tmp[n].write(s + '\n') + tmp.write(s + '\n') - tmp[0].flush() - tmp[1].flush() + tmp.flush() - def make_tree(n): proc = [None, None] out = [None, None] err = [None, None] - proc[0] = subprocess.Popen(f"mafft --auto {path[n]}", + proc[0] = subprocess.Popen(f"mafft --auto {path}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) @@ -533,45 +550,16 @@ class Graph(object): out[0], err[0] = proc[0].communicate() out[1], err[1] = proc[1].communicate(input=out[0]) tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') - print(f"ALIGNMENT=\n{out[0]}") + print(f"ALIGNMENT=\n{out[0]}", end=";") print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + print("\n", end="") + finally: + os.remove(path) + else: + print(f">NO MATCH") - make_tree(0) - make_tree(1) - print("\n", end="") - finally: - os.remove(path[0]) - os.remove(path[1]) - else: - print(f">NO MATCH") - - # NOTE: debugging code - # if len(blk_list) < len(shared_blks): - # ref_list = set() - # first = True - # for tag in ref.muts.keys(): - # beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) - # end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) - # blks = 
self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - # if first: - # ref_list = set([b.id for b in blks]) - # first = False - # else: - # ref_list.intersection_update(set([b.id for b in blks])) - - # qry_list = set() - # first = True - # for tag in qry.muts.keys(): - # beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) - # end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) - # blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - # if first: - # qry_list = set([b.id for b in blks]) - # first = False - # else: - # qry_list.intersection_update(set([b.id for b in blks])) - - # breakpoint("inconsistent number of blocks") + emit('left') + emit('right') self.prune_blks() diff --git a/scripts/parse_log.py b/scripts/parse_log.py index 3a0aed0..8532d3d 100755 --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -2,7 +2,11 @@ """ script to process our end repair log files for plotting """ +import ast import argparse + +from io import StringIO +from Bio import AlignIO from collections import defaultdict level_preset = "+++LEVEL=" @@ -11,6 +15,40 @@ level_offset = len(level_preset) score_preset = "SCORE=" score_offset = len(score_preset) +msaln_preset = "ALIGNMENT=" +msaln_offset = len(msaln_preset) + +def unpack(line): + offset = [line.find(";")] + offset.append(line.find(";", offset[0]+1)) + + align = StringIO(ast.literal_eval(line[msaln_offset:offset[0]]).decode('utf-8')) + align = next(AlignIO.parse(align, 'fasta')) + score = float(line[offset[0]+1+score_offset:offset[1]]) + + return align, score + +# TODO: remove hardcoded numbers +def save_aln_examples(results): + stats = results[(2500,1000)] + nums = [0, 0, 0] + for score, aln in stats[1]['hits']: + if len(aln) < 10: + continue + + if score < 1e-4: + with open(f"scratch/1/eg_{nums[0]}.fna", 'w') as fd: + AlignIO.write(aln, fd, "fasta") + nums[0] += 1 + elif score < 1e-2: + with open(f"scratch/2/eg_{nums[1]}.fna", 'w') as fd: + AlignIO.write(aln, fd, "fasta") + nums[1] += 1 + else: + with open(f"scratch/3/eg_{nums[2]}.fna", 'w') as fd: + AlignIO.write(aln, fd, "fasta") + nums[2] += 1 + def main(args): results = {} for log_path in args: @@ -28,14 +66,9 @@ def main(args): stats[level]['miss'] += 1 continue if line[1:].startswith("LEN="): - offset = [line.find(";")] - offset.append(line.find(";", offset[0]+1)) - offset.append(line.find(";", offset[1]+1)) - - score = [None, None] - score[0] = float(line[offset[0]+1+score_offset:offset[1]]) - score[1] = float(line[offset[1]+1+score_offset:offset[2]]) - stats[level]['hits'].extend(score) + laln, lscore = unpack(log.readline()) + raln, rscore = unpack(log.readline()) + stats[level]['hits'].extend([(lscore, laln), (rscore, raln)]) continue raise ValueError(f"invalid syntax: {line[1:]}") if len(stats) > 0: @@ -50,3 +83,4 @@ parser.add_argument('files', type=str, nargs='+') if __name__ == "__main__": args = parser.parse_args() results = main(args.files) + save_aln_examples(results) -- cgit v1.2.1 From e24f680eeb4aecefec3e06f4e9857657fc9e8d1c Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 10 Sep 2020 14:37:21 -0700 Subject: fix: proper handling of Z2 flipped orientations --- Makefile | 2 +- pangraph/graph.py | 33 ++++++++++++++++++++++++--------- pangraph/sequence.py | 6 ++++++ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index d1a401d..85c1b24 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph 
-m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json + pangraph build -d data/staph -m 500 -b 0 -e 25 -w 1000 data/staph/guide.json # 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index cc0c8e7..96cb721 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -473,14 +473,27 @@ class Graph(object): first = True num_seqs = 0 for tag in shared_blks[0].muts.keys(): - # NOTE: this is a hack to deal with flipped orientations - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) - beg, end = pos[0], pos[-1] - lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] - rblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] - - lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] - rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + pos = [self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks] + strand = [self.seqs[tag[0]].orientation_of(b, tag[1]) for b in shared_blks] + if strand[0] == Strand.Plus: + beg, end = pos[0], pos[-1] + + lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] + rblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] + + lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + elif strand[0] == Strand.Minus: + beg, end = pos[0], pos[-1] + + rblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] + lblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] + + rblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + lblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + else: + raise ValueError("unrecognized strand polarity") + if first: lblks_set_x = set([b.id for b in lblks_x]) rblks_set_x = set([b.id for b in rblks_x]) @@ -511,7 +524,8 @@ class Graph(object): fd, path = tempfile.mkstemp() with os.fdopen(fd, 'w') as tmp: for i, tag in enumerate(merged_blks[0].muts.keys()): - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) + pos = [self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks] + strand = [self.seqs[tag[0]].orientation_of(b, tag[1]) for b in shared_blks] beg, end = pos[0], pos[-1] if side == 'left': @@ -523,6 +537,7 @@ class Graph(object): iso_blks = self.seqs[tag[0]][left:right] print("POSITIONS", pos) + print("STRAND", strand) print("LIST", lblks_set_x if side == 'left' else rblks_set_x) print("SHARED", lblks_set_s if side == 'left' else rblks_set_s) print("ISO", iso_blks) diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 3660b71..98170b4 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -155,6 +155,12 @@ class Path(object): return None return (self.position[index[num]], self.position[index[num]+1]) + def orientation_of(self, blk, num): + orientation = { n.num:n.strand for i, n in enumerate(self.nodes) if n.blk == blk } + if not num in orientation: + return None + return orientation[num] + # TODO: pull out common functionality into a helper function # TODO: merge with other sequence function def sequence_range(self, start=None, stop=None): -- cgit v1.2.1 From 02d03656460253609daf2883de5518582b8b7af7 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 10 Sep 2020 15:04:40 -0700 Subject: fix: properly deal with precondition for edge repair. 
now only if block lists show diversity --- Makefile | 2 +- pangraph/block.py | 2 +- pangraph/graph.py | 68 +++++++++++++++++++++++++++++++++------------------- pangraph/sequence.py | 2 -- 4 files changed, 45 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 85c1b24..d1a401d 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 25 -w 1000 data/staph/guide.json + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json # 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log # figures diff --git a/pangraph/block.py b/pangraph/block.py index 1bc501d..92f9107 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -146,8 +146,8 @@ class Block(object): qryblks = [nb for i, nb in enumerate(newblks) if qrys[i] is not None] if aln['orientation'] == -1: qryblks = qryblks[::-1] - refblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None] + refblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None] sharedblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None and qrys[i] is not None] return newblks, qryblks, refblks, sharedblks, isomap diff --git a/pangraph/graph.py b/pangraph/graph.py index 96cb721..096d62a 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -475,22 +475,24 @@ class Graph(object): for tag in shared_blks[0].muts.keys(): pos = [self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks] strand = [self.seqs[tag[0]].orientation_of(b, tag[1]) for b in shared_blks] + beg, end = pos[0], pos[-1] if strand[0] == Strand.Plus: - beg, end = pos[0], pos[-1] + lwindow = min(window, shared_blks[0].len_of(*tag)) + rwindow = min(window, shared_blks[-1].len_of(*tag)) - lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] - rblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] + lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+lwindow] + rblks_x = self.seqs[tag[0]][end[1]-rwindow:end[1]+extend] - lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] - rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + # lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + # rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] elif strand[0] == Strand.Minus: - beg, end = pos[0], pos[-1] + lwindow = min(window, shared_blks[-1].len_of(*tag)) + rwindow = min(window, shared_blks[0].len_of(*tag)) + rblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+rwindow] + lblks_x = self.seqs[tag[0]][end[1]-lwindow:end[1]+extend] - rblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] - lblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] - - rblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] - lblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + # rblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + # lblks_s = self.seqs[tag[0]][end[1]-window:end[1]] else: raise ValueError("unrecognized strand polarity") @@ -498,23 +500,28 @@ class Graph(object): lblks_set_x = set([b.id for b in lblks_x]) rblks_set_x = set([b.id for b in rblks_x]) - lblks_set_s = set([b.id for b in lblks_s]) - rblks_set_s = set([b.id for b in rblks_s]) + # lblks_set_s = set([b.id for b in lblks_s]) + # rblks_set_s = set([b.id for b in rblks_s]) + + lblks_set_s = set([b.id for b in lblks_x]) + rblks_set_s = set([b.id for b in rblks_x]) first = False else: lblks_set_x.intersection_update(set([b.id for b in lblks_x])) rblks_set_x.intersection_update(set([b.id for b in rblks_x])) - lblks_set_s.intersection_update(set([b.id for b in 
lblks_s])) - rblks_set_s.intersection_update(set([b.id for b in rblks_s])) + lblks_set_s.update(set([b.id for b in lblks_x])) + rblks_set_s.update(set([b.id for b in rblks_x])) + # lblks_set_s.intersection_update(set([b.id for b in lblks_s])) + # rblks_set_s.intersection_update(set([b.id for b in rblks_s])) num_seqs += 1 def emit(side): if side == 'left': - delta = len(lblks_set_x)-len(lblks_set_s) + delta = len(lblks_set_s)-len(lblks_set_x) elif side == 'right': - delta = len(rblks_set_x)-len(rblks_set_s) + delta = len(lblks_set_s)-len(rblks_set_x) else: raise ValueError(f"unrecognized argument '{side}' for side") @@ -528,18 +535,29 @@ class Graph(object): strand = [self.seqs[tag[0]].orientation_of(b, tag[1]) for b in shared_blks] beg, end = pos[0], pos[-1] - if side == 'left': - left, right = beg[0]-extend,beg[0]+window - elif side == 'right': - left, right = end[1]-window,end[1]+extend - else: - raise ValueError(f"unrecognized argument '{side}' for side") + if strand[0] == Strand.Plus: + if side == 'left': + left, right = beg[0]-extend,beg[0]+min(window,shared_blks[0].len_of(*tag)) + elif side == 'right': + left, right = end[1]-min(window,shared_blks[-1].len_of(*tag)),end[1]+extend + else: + raise ValueError(f"unrecognized argument '{side}' for side") + + elif strand[0] == Strand.Minus: + if side == 'left': + left, right = end[1]-min(window,shared_blks[-1].len_of(*tag)),end[1]+extend + elif side == 'right': + left, right = beg[0]-extend,beg[0]+min(window, shared_blks[0].len_of(*tag)) + else: + raise ValueError(f"unrecognized argument '{side}' for side") iso_blks = self.seqs[tag[0]][left:right] print("POSITIONS", pos) print("STRAND", strand) - print("LIST", lblks_set_x if side == 'left' else rblks_set_x) - print("SHARED", lblks_set_s if side == 'left' else rblks_set_s) + print("LIST", shared_blks) + print("MERGED", merged_blks) + print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x) + print("UNION", lblks_set_s if side == 'left' else rblks_set_s) print("ISO", iso_blks) breakpoint("stop") tmp.write(f">isolate_{i:04d} {','.join(b.id for b in iso_blks)}\n") diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 98170b4..db2b3e6 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -167,8 +167,6 @@ class Path(object): beg = start or 0 end = stop or self.position[-1] l, r = "", "" - # if start == 159 and stop == 3659: - # breakpoint("test") if beg < 0: if len(self.nodes) > 1: l = self.sequence_range(self.position[-1]+beg,self.position[-1]) -- cgit v1.2.1 From 95e9b2408debb3d4c13d64f5bdef47f585d9e2f8 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 10 Sep 2020 15:40:48 -0700 Subject: fix: changed score from average tree branch length to average column entropy of msa --- pangraph/graph.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 096d62a..d561e6c 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -5,11 +5,12 @@ import pprint import subprocess import tempfile +from io import StringIO from glob import glob from collections import defaultdict, Counter from itertools import chain -from Bio import SeqIO, Phylo +from Bio import AlignIO, SeqIO, Phylo from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord @@ -25,6 +26,22 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand # EXTEND = 2500 pp = pprint.PrettyPrinter(indent=4) +# ------------------------------------------------------------------------ +# utility + +def 
entropy(s): + S = 0 + c = Counter(s) + S = sum((v/len(s))*np.log(len(s)/v) for v in c.values()) + return S + +def alignment_entropy(rdr): + aln = AlignIO.read(rdr, 'fasta') + S = 0 + for i in range(aln.get_alignment_length()): + S += entropy(aln[:,i]) + return S/aln.get_alignment_length() + # ------------------------------------------------------------------------ # Junction class # simple struct @@ -581,10 +598,13 @@ class Graph(object): stderr=subprocess.PIPE, shell=True) out[0], err[0] = proc[0].communicate() - out[1], err[1] = proc[1].communicate(input=out[0]) - tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') - print(f"ALIGNMENT=\n{out[0]}", end=";") - print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + # out[1], err[1] = proc[1].communicate(input=out[0]) + # tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') + print(f"ALIGNMENT=\n{out[0].decode('utf-8')}", end=";") + rdr = StringIO(out[0].decode('utf-8')) + print(f"SCORE={alignment_entropy(rdr)}", end=";") + rdr.close() + # print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") print("\n", end="") finally: os.remove(path) -- cgit v1.2.1 From 8f9a55297a46af2267896de9fb3e1e50d07d9783 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 15 Sep 2020 13:03:56 -0700 Subject: fix: moved to use numpy array instead of slicing AlignIO object --- pangraph/graph.py | 51 ++++++++++++++++++++++++++------------------------- scripts/parse_log.py | 45 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 62 insertions(+), 34 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index d561e6c..fbbb1e7 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -36,11 +36,15 @@ def entropy(s): return S def alignment_entropy(rdr): - aln = AlignIO.read(rdr, 'fasta') - S = 0 - for i in range(aln.get_alignment_length()): - S += entropy(aln[:,i]) - return S/aln.get_alignment_length() + try: + aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character) + S = 0 + for i in range(aln.shape[1]): + S += entropy(aln[:,i]) + return S/aln.shape[1] + except Exception as msg: + print(f"ERROR: {msg}") + return None # ------------------------------------------------------------------------ # Junction class @@ -569,14 +573,14 @@ class Graph(object): raise ValueError(f"unrecognized argument '{side}' for side") iso_blks = self.seqs[tag[0]][left:right] - print("POSITIONS", pos) - print("STRAND", strand) - print("LIST", shared_blks) - print("MERGED", merged_blks) - print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x) - print("UNION", lblks_set_s if side == 'left' else rblks_set_s) - print("ISO", iso_blks) - breakpoint("stop") + # print("POSITIONS", pos) + # print("STRAND", strand) + # print("LIST", shared_blks) + # print("MERGED", merged_blks) + # print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x) + # print("UNION", lblks_set_s if side == 'left' else rblks_set_s) + # print("ISO", iso_blks) + # breakpoint("stop") tmp.write(f">isolate_{i:04d} {','.join(b.id for b in iso_blks)}\n") s = self.seqs[tag[0]].sequence_range(left,right) if len(s) > extend + window: @@ -585,23 +589,20 @@ class Graph(object): tmp.flush() - proc = [None, None] - out = [None, None] - err = [None, None] - proc[0] = subprocess.Popen(f"mafft --auto {path}", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - proc[1] = subprocess.Popen(f"fasttree", - stdin =subprocess.PIPE, + proc = subprocess.Popen(f"mafft --auto {path}", stdout=subprocess.PIPE, 
stderr=subprocess.PIPE, shell=True) - out[0], err[0] = proc[0].communicate() + # proc[1] = subprocess.Popen(f"fasttree", + # stdin =subprocess.PIPE, + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE, + # shell=True) + out, err = proc.communicate() # out[1], err[1] = proc[1].communicate(input=out[0]) # tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') - print(f"ALIGNMENT=\n{out[0].decode('utf-8')}", end=";") - rdr = StringIO(out[0].decode('utf-8')) + print(f"ALIGNMENT={out}", end=";") + rdr = StringIO(out.decode('utf-8')) print(f"SCORE={alignment_entropy(rdr)}", end=";") rdr.close() # print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") diff --git a/scripts/parse_log.py b/scripts/parse_log.py index 8532d3d..65ef8fd 100755 --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -9,6 +9,9 @@ from io import StringIO from Bio import AlignIO from collections import defaultdict +import numpy as np +import matplotlib.pylab as plt + level_preset = "+++LEVEL=" level_offset = len(level_preset) @@ -21,31 +24,32 @@ msaln_offset = len(msaln_preset) def unpack(line): offset = [line.find(";")] offset.append(line.find(";", offset[0]+1)) + offset.append(line.find(";", offset[1]+1)) - align = StringIO(ast.literal_eval(line[msaln_offset:offset[0]]).decode('utf-8')) + align = StringIO(ast.literal_eval(line[msaln_offset+1+offset[0]:offset[1]]).decode('utf-8')) align = next(AlignIO.parse(align, 'fasta')) - score = float(line[offset[0]+1+score_offset:offset[1]]) + score = float(line[offset[1]+1+score_offset:offset[2]]) return align, score # TODO: remove hardcoded numbers def save_aln_examples(results): - stats = results[(2500,1000)] + stats = results[(500,1000)] nums = [0, 0, 0] for score, aln in stats[1]['hits']: if len(aln) < 10: continue if score < 1e-4: - with open(f"scratch/1/eg_{nums[0]}.fna", 'w') as fd: + with open(f"scratch/1/eg_{nums[0]}.fna", 'w+') as fd: AlignIO.write(aln, fd, "fasta") nums[0] += 1 elif score < 1e-2: - with open(f"scratch/2/eg_{nums[1]}.fna", 'w') as fd: + with open(f"scratch/2/eg_{nums[1]}.fna", 'w+') as fd: AlignIO.write(aln, fd, "fasta") nums[1] += 1 else: - with open(f"scratch/3/eg_{nums[2]}.fna", 'w') as fd: + with open(f"scratch/3/eg_{nums[2]}.fna", 'w+') as fd: AlignIO.write(aln, fd, "fasta") nums[2] += 1 @@ -66,9 +70,8 @@ def main(args): stats[level]['miss'] += 1 continue if line[1:].startswith("LEN="): - laln, lscore = unpack(log.readline()) - raln, rscore = unpack(log.readline()) - stats[level]['hits'].extend([(lscore, laln), (rscore, raln)]) + aln, score = unpack(line) + stats[level]['hits'].append((score, aln)) continue raise ValueError(f"invalid syntax: {line[1:]}") if len(stats) > 0: @@ -77,10 +80,34 @@ def main(args): results[(e,w)] = dict(stats) return results +def plot(results): + data = results[(500,1000)] + scores = [[] for _ in data.keys()] + fracs = np.zeros(len(data.keys())) + for i, lvl in enumerate(sorted(data.keys())): + scores[i] = [elt[0] for elt in data[lvl]['hits']] + fracs[i] = len(scores[i])/(len(scores[i]) + data[lvl]['miss']) + + cmap = plt.cm.get_cmap('plasma') + colors = [cmap(x) for x in np.linspace(0,1,len(scores[:-5]))] + fig, (ax1, ax2) = plt.subplots(1, 2) + ax1.plot(fracs[:-5]) + ax1.set_xlabel("tree level") + ax1.set_ylabel("fraction of good edges") + for i, score in enumerate(scores[:-5]): + ax2.plot(sorted(np.exp(score)), np.linspace(0,1,len(score)),color=colors[i],label=f"level={i+1}") + ax2.set_xlabel("average column entropy") + ax2.set_ylabel("CDF") + ax2.legend() + parser = 
argparse.ArgumentParser(description='process our data log files on end repair') parser.add_argument('files', type=str, nargs='+') +# ----------------------------------------------------------------------------- +# main point of entry + if __name__ == "__main__": args = parser.parse_args() results = main(args.files) save_aln_examples(results) + plot(results) -- cgit v1.2.1 From 26c9b16dd74767e4e99a7b771026e17b484016a7 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 15 Sep 2020 13:36:01 -0700 Subject: fix: vectorized entropy computation --- pangraph/graph.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index fbbb1e7..9f00587 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -14,6 +14,8 @@ from Bio import AlignIO, SeqIO, Phylo from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from scipy.stats import entropy + from . import suffix from .block import Block from .sequence import Node, Path @@ -29,18 +31,10 @@ pp = pprint.PrettyPrinter(indent=4) # ------------------------------------------------------------------------ # utility -def entropy(s): - S = 0 - c = Counter(s) - S = sum((v/len(s))*np.log(len(s)/v) for v in c.values()) - return S - def alignment_entropy(rdr): try: - aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character) - S = 0 - for i in range(aln.shape[1]): - S += entropy(aln[:,i]) + aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character).view(np.uint8) + S = sum(entropy(np.bincount(aln[:,i])/aln.shape[0]) for i in range(aln.shape[1])) return S/aln.shape[1] except Exception as msg: print(f"ERROR: {msg}") -- cgit v1.2.1
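The last two patches above replace the FastTree branch-length score with the mean per-column Shannon entropy of the mafft alignment. Below is a minimal, self-contained sketch of that scoring idea, assuming Biopython, numpy and scipy are installed; the function name mean_column_entropy and the toy alignment are illustrative and not taken verbatim from pangraph/graph.py.

import numpy as np
from io import StringIO
from Bio import AlignIO
from scipy.stats import entropy

def mean_column_entropy(fasta_handle):
    # read a FASTA multiple-sequence alignment and view each residue as its ASCII byte
    aln = AlignIO.read(fasta_handle, "fasta")
    mat = np.array([list(str(rec.seq)) for rec in aln], dtype="S1").view(np.uint8)
    # Shannon entropy of each column's character counts; scipy normalizes the counts
    per_column = (entropy(np.bincount(mat[:, i])) for i in range(mat.shape[1]))
    return sum(per_column) / mat.shape[1]

# toy usage on a hand-made three-sequence alignment
toy = StringIO(">a\nACGT-\n>b\nACGA-\n>c\nACGTT\n")
print(mean_column_entropy(toy))

In the pipeline itself the handle would be the decoded stdout of the "mafft --auto <tmpfile>" subprocess call shown in the diff, so a low score corresponds to nearly identical flanking sequence across isolates.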
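Commit 02d0365 gates end repair on whether the isolates actually disagree about a merged block's neighbourhood: for every isolate carrying the block, the IDs of the blocks falling inside the extended boundary window are collected, and repair is only attempted when the union of those ID sets is strictly larger than their intersection (and more than one isolate is involved). The toy below restates just that set logic; the isolate names, block IDs and the neighbourhoods dict are invented for illustration.

# block IDs each isolate sees inside the extended window at one block boundary
neighbourhoods = {
    "isolate_A": {"blk_1", "blk_2", "blk_9"},
    "isolate_B": {"blk_1", "blk_2", "blk_9"},
    "isolate_C": {"blk_1", "blk_7", "blk_9"},   # this isolate disagrees near the edge
}

union = set().union(*neighbourhoods.values())
intersection = set.intersection(*neighbourhoods.values())
delta = len(union) - len(intersection)

if delta > 0 and len(neighbourhoods) > 1:
    print(f">LEN={delta}", end=";")   # same record prefix the log parser looks for
    print()                           # followed by ALIGNMENT=...;SCORE=...; in a real run
else:
    print(">NO MATCH")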
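Several of the slices above (beg[0]-extend, end[1]+extend) can run past the ends of a path because the underlying genomes are circular; as the context lines of the sequence.py diff show, Path.sequence_range handles this for multi-block paths by pulling the overhang from the opposite end. A stripped-down version of that wrap-around rule on a plain string, with an invented helper name, might look like this:

def circular_slice(seq, beg, end):
    # out-of-range coordinates wrap around, as on a circular chromosome
    n = len(seq)
    prefix = seq[n + beg:] if beg < 0 else ""
    suffix = seq[:end - n] if end > n else ""
    return prefix + seq[max(beg, 0):min(end, n)] + suffix

assert circular_slice("ABCDEFGH", -2, 3) == "GHABC"   # wraps off the left edge
assert circular_slice("ABCDEFGH", 5, 10) == "FGHAB"   # wraps off the right edge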