From eddc70e299ca2822680f8b4ac9f277d88af002e5 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 11 Aug 2020 11:34:33 -0700 Subject: feat: blocks now store their position (not modulo length) along the path --- pangraph/block.py | 9 +++++++++ pangraph/graph.py | 18 +++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/pangraph/block.py b/pangraph/block.py index 05b7806..e3953bd 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -26,6 +26,7 @@ class Block(object): super(Block, self).__init__() self.id = randomid() if gen else 0 self.seq = None + self.pos = {} self.muts = {} def __str__(self): @@ -49,6 +50,10 @@ class Block(object): def isolates(self): return dict(Counter([k[0] for k in self.muts])) + @property + def positions(self): + return { tag:(pos, pos+self.len_of(*tag)) for tag, pos in self.pos.items() } + # ------------------ # static methods @@ -56,6 +61,7 @@ class Block(object): def from_seq(cls, name, seq): new_blk = cls() new_blk.seq = as_array(seq) + new_blk.pos = {(name, 0): 0} new_blk.muts = {(name, 0):{}} return new_blk @@ -69,6 +75,7 @@ class Block(object): B = Block() B.id = d['id'] B.seq = as_array(d['seq']) + B.pos = d['pos'] B.muts = {unpack(k):v for k, v in d['muts'].items()} return B @@ -222,6 +229,7 @@ class Block(object): return {'id' : self.id, 'seq' : "".join(str(n) for n in self.seq), + 'pos' : self.pos, 'muts' : {pack(k) : fix(v) for k, v in self.muts.items()}} def __len__(self): @@ -233,6 +241,7 @@ class Block(object): start = val.start or 0 stop = val.stop or len(self.seq) b.seq = self.seq[start:stop] + b.pos = { iso : start+val.start for iso,start in self.pos.items() } for s, _ in self.muts.items(): b.muts[s] = {p-start:c for p,c in self.muts[s].items() if p>=start and p Date: Tue, 11 Aug 2020 11:54:13 -0700 Subject: fix: unpack and pack tuple on json serialization for block positions --- pangraph/block.py | 4 ++-- pangraph/graph.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pangraph/block.py b/pangraph/block.py index e3953bd..ad4cd7c 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -75,7 +75,7 @@ class Block(object): B = Block() B.id = d['id'] B.seq = as_array(d['seq']) - B.pos = d['pos'] + B.pos = {unpack(k):tuple(v) for k, v in d['pos'].items()} B.muts = {unpack(k):v for k, v in d['muts'].items()} return B @@ -229,7 +229,7 @@ class Block(object): return {'id' : self.id, 'seq' : "".join(str(n) for n in self.seq), - 'pos' : self.pos, + 'pos' : {pack(k) : v for k,v in self.pos.items()}, 'muts' : {pack(k) : fix(v) for k, v in self.muts.items()}} def __len__(self): diff --git a/pangraph/graph.py b/pangraph/graph.py index 70fd102..4ae721d 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -418,9 +418,14 @@ class Graph(object): # This is why in from_aln(aln) we set the start index to 0 ref = old_ref[hit['ref']['start']:hit['ref']['end']] qry = old_qry[hit['qry']['start']:hit['qry']['end']] - print(ref.positions) - print(qry.positions) - breakpoint("test positions") + + # print(ref.positions) + # print(qry.positions) + # # TODO: check for out of bounds accesses + # for tag, item in ref.positions.items(): + # blks = self.seqs[tag[0]][item-EXTEND:item+EXTEND] + # print(f"Isolate: {tag[0]} blocks: {blks}") + # breakpoint("test positions") if hit["orientation"] == Strand.Minus: qry = qry.rev_cmpl() -- cgit v1.2.1 From da096f986322037af9ff351ba475085bbb0fd6d8 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 11 Aug 2020 12:42:40 -0700 Subject: fix: slicing of sequence 
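# ----------------------------------------------------------------------------
# A minimal sketch of the tuple-key packing that the serialization fix above
# relies on: JSON objects only allow string keys, while Block.pos and Block.muts
# are keyed by (isolate, occurrence) tuples. pack/unpack here are simplified
# stand-ins for pangraph's own helpers, and the '?' separator is an assumption.
import json

def pack(key):
    name, num = key
    return f"{name}?{num}"

def unpack(key):
    name, num = key.rsplit("?", 1)
    return name, int(num)

pos      = {("isolate_a", 0): (0, 1500), ("isolate_b", 1): (250, 1800)}
packed   = json.dumps({pack(k): v for k, v in pos.items()})
restored = {unpack(k): tuple(v) for k, v in json.loads(packed).items()}
assert restored == pos  # tuple values come back as lists, hence the tuple(v) cast
# ----------------------------------------------------------------------------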
paths --- pangraph/graph.py | 14 +++++++------- pangraph/sequence.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 4ae721d..24defa9 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -419,13 +419,13 @@ class Graph(object): ref = old_ref[hit['ref']['start']:hit['ref']['end']] qry = old_qry[hit['qry']['start']:hit['qry']['end']] - # print(ref.positions) - # print(qry.positions) - # # TODO: check for out of bounds accesses - # for tag, item in ref.positions.items(): - # blks = self.seqs[tag[0]][item-EXTEND:item+EXTEND] - # print(f"Isolate: {tag[0]} blocks: {blks}") - # breakpoint("test positions") + print(ref.positions) + print(qry.positions) + for tag, item in ref.positions.items(): + blks = self.seqs[tag[0]][item[0]-EXTEND:item[1]+EXTEND] + if len(blks) >= 3: + print(f"Isolate: {tag[0]} blocks: {','.join(b.id for b in blks)}") + breakpoint("test positions") if hit["orientation"] == Strand.Minus: qry = qry.rev_cmpl() diff --git a/pangraph/sequence.py b/pangraph/sequence.py index b2b5ce7..5414364 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -158,8 +158,14 @@ class Path(object): beg = index.start or 0 end = index.stop or self.position[-1] - i = np.searchsorted(self.position, beg, side='right') - j = np.searchsorted(self.position, end, side='right') + 1 + # TODO: circular slicing + beg = max(0, beg) + end = min(end, self.position[-1]) + # if index.start < 0 or end > self.position[-1]: + # breakpoint(f"{index}:: need to implement circular slicing") + + i = np.searchsorted(self.position, beg, side='right') - 1 + j = np.searchsorted(self.position, end, side='left') assert i < j, "sorted" return [n.blk for n in self.nodes[i:j]] elif isinstance(index, int): -- cgit v1.2.1 From e5c6778ca74ac0e8eb5bc09d6e7ddc54e460e951 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 11 Aug 2020 14:12:24 -0700 Subject: fix: circular slicing check for singleton paths --- pangraph/graph.py | 4 ++-- pangraph/sequence.py | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 24defa9..6b29b1e 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -419,11 +419,11 @@ class Graph(object): ref = old_ref[hit['ref']['start']:hit['ref']['end']] qry = old_qry[hit['qry']['start']:hit['qry']['end']] - print(ref.positions) - print(qry.positions) for tag, item in ref.positions.items(): blks = self.seqs[tag[0]][item[0]-EXTEND:item[1]+EXTEND] if len(blks) >= 3: + print(ref.positions) + print(qry.positions) print(f"Isolate: {tag[0]} blocks: {','.join(b.id for b in blks)}") breakpoint("test positions") diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 5414364..baff6a2 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -157,19 +157,22 @@ class Path(object): if isinstance(index, slice): beg = index.start or 0 end = index.stop or self.position[-1] - - # TODO: circular slicing - beg = max(0, beg) - end = min(end, self.position[-1]) - # if index.start < 0 or end > self.position[-1]: - # breakpoint(f"{index}:: need to implement circular slicing") + l, r = [], [] + if beg < 0: + if len(self.nodes) > 1: + l = self[self.position[-1] + beg:self.position[-1]] + beg = 0 + if end > self.position[-1]: + if len(self.nodes) > 1: + r = self[0:end-self.position[-1]] + end = self.position[-1] i = np.searchsorted(self.position, beg, side='right') - 1 j = np.searchsorted(self.position, end, side='left') assert i < j, "sorted" - return [n.blk for 
n in self.nodes[i:j]] + return l + [n.blk for n in self.nodes[i:j]] + r elif isinstance(index, int): - i = np.searchsorted(self.position, index, side='left') + i = np.searchsorted(self.position, index, side='right') - 1 return self.nodes[i].blk else: raise ValueError(f"type '{type(index)}' not supported as index") -- cgit v1.2.1 From c796f164ea6856af354b879abefceb342de2f27c Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 11 Aug 2020 15:57:50 -0700 Subject: fix: compute position of each block using path class to ensure its accurate --- pangraph/block.py | 1 + pangraph/graph.py | 31 +++++++++++++++++++++++-------- pangraph/sequence.py | 27 ++++++++++++++++++++------- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/pangraph/block.py b/pangraph/block.py index ad4cd7c..5c649b9 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -92,6 +92,7 @@ class Block(object): for s in nblk.muts: nblk.muts[s].update({p+offset:c for p,c in b.muts[s].items()}) offset += len(b) + nblk.pos = { k:v for k,v in blks[0].pos.items() } return nblk diff --git a/pangraph/graph.py b/pangraph/graph.py index 6b29b1e..c2c3cdb 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -419,14 +419,6 @@ class Graph(object): ref = old_ref[hit['ref']['start']:hit['ref']['end']] qry = old_qry[hit['qry']['start']:hit['qry']['end']] - for tag, item in ref.positions.items(): - blks = self.seqs[tag[0]][item[0]-EXTEND:item[1]+EXTEND] - if len(blks) >= 3: - print(ref.positions) - print(qry.positions) - print(f"Isolate: {tag[0]} blocks: {','.join(b.id for b in blks)}") - breakpoint("test positions") - if hit["orientation"] == Strand.Minus: qry = qry.rev_cmpl() @@ -473,6 +465,29 @@ class Graph(object): new_blocks.extend(update(old_qry, new_qrys, hit['qry'], hit['orientation'])) self.prune_blks() + shared_blks = set() + first = True + for tag in ref.muts.keys(): + pos = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) + blks = self.seqs[tag[0]][pos[0]-EXTEND:pos[1]+EXTEND] + if first: + shared_blks = set([b.id for b in blks]) + first = False + else: + shared_blks.intersection_update(set([b.id for b in blks])) + first = True + for tag in qry.muts.keys(): + pos = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) + blks = self.seqs[tag[0]][pos[0]-EXTEND:pos[1]+EXTEND] + if first: + shared_blks = set([b.id for b in blks]) + first = False + else: + shared_blks.intersection_update(set([b.id for b in blks])) + + print(f"LEN: {len(shared_blks)}") + if len(shared_blks) > 20: + breakpoint("big number of blocks") return [b[0] for b in new_blocks] def extract(self, name, strip_gaps=True, verbose=False): diff --git a/pangraph/sequence.py b/pangraph/sequence.py index baff6a2..185dd38 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -88,11 +88,11 @@ class Path(object): return seq - def position_of(self, blk): - for i, n in enumerate(self.nodes): - if n.blk == blk: - return i, n.num - raise ValueError("block not found in path") + # def position_of(self, blk): + # for i, n in enumerate(self.nodes): + # if n.blk == blk: + # return i, n.num + # raise ValueError("block not found in path") def rm_nil_blks(self): good, popped = [], set() @@ -153,6 +153,17 @@ class Path(object): self.nodes = new self.position = np.cumsum([0] + [n.length(self.name) for n in self.nodes]) + def position_of(self, blk, num): + index = [i for i, n in enumerate(self.nodes) if n.blk == blk] + if len(index) <= num: + # print(f"NODES: {self.nodes}") + # print(f"BLOCK: {blk}") + # print(f"POSITION: {self.position}") + # print(f"INDEX: 
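# ----------------------------------------------------------------------------
# A toy version of the position bookkeeping used by Path.__getitem__ above,
# assuming a linear path; the names and block lengths below are made up.
import numpy as np

lengths  = [700, 1200, 300, 2500]              # per-node block lengths along one path
position = np.cumsum([0] + lengths)            # cumulative starts: [0 700 1900 2200 4700]

def nodes_overlapping(beg, end):
    beg, end = max(beg, 0), min(end, position[-1])        # the real code wraps circularly instead
    i = np.searchsorted(position, beg, side='right') - 1   # first node touching beg
    j = np.searchsorted(position, end, side='left')        # one past the last node touching end
    return list(range(i, j))

print(nodes_overlapping(650, 1950))            # -> [0, 1, 2]
# ----------------------------------------------------------------------------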
{index}") + # breakpoint("break") + return None + return (self.position[index[num]], self.position[index[num]+1]) + def __getitem__(self, index): if isinstance(index, slice): beg = index.start or 0 @@ -164,12 +175,14 @@ class Path(object): beg = 0 if end > self.position[-1]: if len(self.nodes) > 1: - r = self[0:end-self.position[-1]] + r = self[0:(end-self.position[-1])] end = self.position[-1] + if beg > end: + beg, end = end, beg i = np.searchsorted(self.position, beg, side='right') - 1 j = np.searchsorted(self.position, end, side='left') - assert i < j, "sorted" + assert i < j, f"not sorted, {beg}-{end}" return l + [n.blk for n in self.nodes[i:j]] + r elif isinstance(index, int): i = np.searchsorted(self.position, index, side='right') - 1 -- cgit v1.2.1 From 9deb0978ba029fc59ee699e8280dd60fb47bee1b Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 11:37:05 -0700 Subject: fix: pull out interval of merged blocks and not just the first one --- Makefile | 2 +- pangraph/block.py | 4 +++- pangraph/graph.py | 38 +++++++++++++++++++------------------- pangraph/tree.py | 7 +++++++ 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 9681f3b..89f9e94 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json # 1>data/staph/pangraph.json + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json #1>staph.log 2>/dev/null # figures diff --git a/pangraph/block.py b/pangraph/block.py index 5c649b9..1bc501d 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -148,7 +148,9 @@ class Block(object): qryblks = qryblks[::-1] refblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None] - return newblks, qryblks, refblks, isomap + sharedblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None and qrys[i] is not None] + + return newblks, qryblks, refblks, sharedblks, isomap # -------------- # methods diff --git a/pangraph/graph.py b/pangraph/graph.py index c2c3cdb..fb46be9 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -433,7 +433,7 @@ class Graph(object): "qry_name" : hit["qry"]["name"], "orientation" : hit["orientation"]} - merged_blks, new_qrys, new_refs, blk_map = Block.from_aln(aln) + merged_blks, new_qrys, new_refs, shared_blks, blk_map = Block.from_aln(aln) for merged_blk in merged_blks: self.blks[merged_blk.id] = merged_blk @@ -463,31 +463,31 @@ class Graph(object): new_blocks = [] new_blocks.extend(update(old_ref, new_refs, hit['ref'], Strand.Plus)) new_blocks.extend(update(old_qry, new_qrys, hit['qry'], hit['orientation'])) - self.prune_blks() - shared_blks = set() - first = True + blk_list = set() + first = True for tag in ref.muts.keys(): - pos = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) - blks = self.seqs[tag[0]][pos[0]-EXTEND:pos[1]+EXTEND] + beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) + end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] if first: - shared_blks = set([b.id for b in blks]) + blk_list = set([b.id for b in blks]) first = False else: - shared_blks.intersection_update(set([b.id for b in blks])) - first = True + blk_list.intersection_update(set([b.id for b in blks])) + for tag in qry.muts.keys(): - pos = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) - blks = self.seqs[tag[0]][pos[0]-EXTEND:pos[1]+EXTEND] - if first: - shared_blks 
= set([b.id for b in blks]) - first = False - else: - shared_blks.intersection_update(set([b.id for b in blks])) + beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) + end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blk_list.intersection_update(set([b.id for b in blks])) + + print(f"LEN: {len(blk_list)-len(shared_blks)}") + if len(blk_list) < len(shared_blks): + breakpoint("inconsistent number of blocks") + + self.prune_blks() - print(f"LEN: {len(shared_blks)}") - if len(shared_blks) > 20: - breakpoint("big number of blocks") return [b[0] for b in new_blocks] def extract(self, name, strip_gaps=True, verbose=False): diff --git a/pangraph/tree.py b/pangraph/tree.py index 605bac3..64ac119 100644 --- a/pangraph/tree.py +++ b/pangraph/tree.py @@ -139,6 +139,11 @@ class Clade(object): 'fapath' : self.fapath, 'graph' : serialize(self.graph) if self.graph is not None else None } + def set_level(self, level): + for c in self.child: + c.set_level(level+1) + self.level = level + class Tree(object): # ------------------- # Class constructor @@ -288,6 +293,7 @@ class Tree(object): self.seqs = {leafs[name]:seq for name,seq in seqs.items()} def align(self, tmpdir, min_blk_len, mu, beta, extensive, log_stats=False, verbose=False): + self.root.set_level(0) # NOTE: for debug logging stats = {} # --------------------------------------------- # internal functions @@ -377,6 +383,7 @@ class Tree(object): for n in self.postorder(): if n.is_leaf(): continue + print(f"---NODE LEVEL {n.level}---") n.fapath = f"{tmpdir}/{n.name}" log(f"fusing {n.child[0].name} with {n.child[1].name} @ {n.name}") n.graph = merge(*n.child) -- cgit v1.2.1 From b498a39385f08f1ffa374b03637e2317da718004 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 11:58:02 -0700 Subject: fix: deal with inverted cases for block lists --- pangraph/graph.py | 33 +++++++++++++++++++++++++++++---- pangraph/sequence.py | 5 ----- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index fb46be9..2926d3f 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -467,8 +467,9 @@ class Graph(object): blk_list = set() first = True for tag in ref.muts.keys(): - beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) - end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) + # NOTE: this is a hack to deal with flipped orientations + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] if first: blk_list = set([b.id for b in blks]) @@ -477,13 +478,37 @@ class Graph(object): blk_list.intersection_update(set([b.id for b in blks])) for tag in qry.muts.keys(): - beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) - end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_qrys], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] blk_list.intersection_update(set([b.id for b in blks])) print(f"LEN: {len(blk_list)-len(shared_blks)}") if len(blk_list) < len(shared_blks): + ref_list = set() + first = True + for tag in ref.muts.keys(): + beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) + end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + if first: + ref_list = set([b.id for b in blks]) + first = False + else: + 
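# ----------------------------------------------------------------------------
# A minimal sketch of the consistency check assembled above: for every isolate
# touched by the merge, collect the block ids found within EXTEND of the merged
# region and intersect them. The neighborhoods below are invented data; in the
# real code they come from slicing self.seqs[isolate] around position_of.
neighborhoods = {
    "isolate_a": ["x", "m1", "m2", "y"],
    "isolate_b": ["z", "m1", "m2"],
    "isolate_c": ["m1", "m2", "w"],
}
shared = None
for ids in neighborhoods.values():
    shared = set(ids) if shared is None else shared & set(ids)
print(shared)   # -> {'m1', 'm2'}: only the merged blocks survive in every isolate
# ----------------------------------------------------------------------------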
ref_list.intersection_update(set([b.id for b in blks])) + + qry_list = set() + first = True + for tag in qry.muts.keys(): + beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) + end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + if first: + qry_list = set([b.id for b in blks]) + first = False + else: + qry_list.intersection_update(set([b.id for b in blks])) + breakpoint("inconsistent number of blocks") self.prune_blks() diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 185dd38..dd9677b 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -156,11 +156,6 @@ class Path(object): def position_of(self, blk, num): index = [i for i, n in enumerate(self.nodes) if n.blk == blk] if len(index) <= num: - # print(f"NODES: {self.nodes}") - # print(f"BLOCK: {blk}") - # print(f"POSITION: {self.position}") - # print(f"INDEX: {index}") - # breakpoint("break") return None return (self.position[index[num]], self.position[index[num]+1]) -- cgit v1.2.1 From b1af35822273ebe52f6fa089cfb5bc5f8ef542a3 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 12:52:20 -0700 Subject: fix: bug associated to position of using index absolutely instead of relative to true dictionary key --- pangraph/graph.py | 5 ++++- pangraph/sequence.py | 12 ++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 2926d3f..afff779 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -478,7 +478,10 @@ class Graph(object): blk_list.intersection_update(set([b.id for b in blks])) for tag in qry.muts.keys(): - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_qrys], key=lambda x: x[0]) + try: + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_qrys], key=lambda x: x[0]) + except: + breakpoint("bad find") beg, end = pos[0], pos[-1] blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] blk_list.intersection_update(set([b.id for b in blks])) diff --git a/pangraph/sequence.py b/pangraph/sequence.py index dd9677b..6b9baba 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -88,12 +88,6 @@ class Path(object): return seq - # def position_of(self, blk): - # for i, n in enumerate(self.nodes): - # if n.blk == blk: - # return i, n.num - # raise ValueError("block not found in path") - def rm_nil_blks(self): good, popped = [], set() for i, n in enumerate(self.nodes): @@ -121,6 +115,8 @@ class Path(object): try: i, j = ids.index(start[0]), ids.index(stop[0]) + if N > 0: + breakpoint("HIT") if self.nodes[i].strand == start[1]: beg, end, s = i, j, Strand.Plus else: @@ -154,8 +150,8 @@ class Path(object): self.position = np.cumsum([0] + [n.length(self.name) for n in self.nodes]) def position_of(self, blk, num): - index = [i for i, n in enumerate(self.nodes) if n.blk == blk] - if len(index) <= num: + index = { n.num:i for i, n in enumerate(self.nodes) if n.blk == blk } + if not num in index: return None return (self.position[index[num]], self.position[index[num]+1]) -- cgit v1.2.1 From 967b6412dddc99de429e692973819bb7147b0354 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 14:40:16 -0700 Subject: feat: prototype of mafft alignment of blocks --- Makefile | 2 +- pangraph/build.py | 3 ++- pangraph/graph.py | 21 ++++++++++++++++++++- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 89f9e94..117dbc9 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ 
pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json #1>staph.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null # figures diff --git a/pangraph/build.py b/pangraph/build.py index 14f96d3..4577732 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -85,6 +85,7 @@ def main(args): with open(f"{root}/graph_{i:03d}.fa", 'w') as fd: g.write_fasta(fd) - T.write_json(sys.stdout, no_seqs=True) + # NOTE: uncomment when done debugging + # T.write_json(sys.stdout, no_seqs=True) return 0 diff --git a/pangraph/graph.py b/pangraph/graph.py index afff779..f45cfd2 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -5,6 +5,8 @@ import pprint from glob import glob from collections import defaultdict, Counter +from itertools import chain +import subprocess from Bio import SeqIO, Phylo from Bio.Seq import Seq @@ -486,7 +488,24 @@ class Graph(object): blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] blk_list.intersection_update(set([b.id for b in blks])) - print(f"LEN: {len(blk_list)-len(shared_blks)}") + delta = len(blk_list)-len(shared_blks) + if delta > 0: + print(f"LEN: {delta}") + buf = "" + for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): + buf += ">isolate_{i:04d}\n" + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + for b in blks: + buf += b.extract(*tag) + buf += '\n' + proc = subprocess.run(["mafft", "-"], stdout=subprocess.PIPE, input=buf, encoding='ascii') + print(proc.stdout) + print(proc.stderr) + breakpoint('test') + else: + print(f"NO MATCH") + + # NOTE: debugging code if len(blk_list) < len(shared_blks): ref_list = set() first = True -- cgit v1.2.1 From 25b974618ec3242a9d7d7373b5c9a3a8bf0f0681 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 13 Aug 2020 15:33:22 -0700 Subject: feat: now assess the alignment based on total tree length --- Makefile | 2 +- pangraph/graph.py | 42 +++++++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 117dbc9..6061cfc 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 2>/dev/null #1>staph.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index f45cfd2..5a3c618 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -1,12 +1,13 @@ -import os, sys +import io, os, sys import json import numpy as np import pprint +import subprocess +import tempfile from glob import glob from collections import defaultdict, Counter from itertools import chain -import subprocess from Bio import SeqIO, Phylo from Bio.Seq import Seq @@ -468,6 +469,7 @@ class Graph(object): blk_list = set() first = True + num_seqs = 0 for tag in ref.muts.keys(): # NOTE: this is a hack to deal with flipped orientations pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) @@ -478,6 +480,7 @@ class Graph(object): first = False else: blk_list.intersection_update(set([b.id for b in blks])) + num_seqs += 1 for tag in qry.muts.keys(): try: @@ -487,21 +490,30 @@ class Graph(object): beg, end = pos[0], pos[-1] blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] blk_list.intersection_update(set([b.id for b in 
blks])) + num_seqs += 1 delta = len(blk_list)-len(shared_blks) - if delta > 0: - print(f"LEN: {delta}") - buf = "" - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - buf += ">isolate_{i:04d}\n" - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - for b in blks: - buf += b.extract(*tag) - buf += '\n' - proc = subprocess.run(["mafft", "-"], stdout=subprocess.PIPE, input=buf, encoding='ascii') - print(proc.stdout) - print(proc.stderr) - breakpoint('test') + if delta > 0 and num_seqs > 1: + print(f"LEN: {delta}", end="\t") + fd, path = tempfile.mkstemp() + try: + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): + tmp.write(f">isolate_{i:04d}\n") + blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + for b in blks: + tmp.write(b.extract(*tag)) + tmp.write('\n') + tmp.flush() + proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out, err = proc.communicate() + tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + print(tree.total_branch_length()) + finally: + os.remove(path) else: print(f"NO MATCH") -- cgit v1.2.1 From fe39e4fa983ace442e99a0e8811e69fef18c48d9 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 11:07:31 -0700 Subject: feat: use fixed width window into each block for end alignment instead of entire block --- Makefile | 2 +- pangraph/graph.py | 92 ++++++++++++++++++++++++++++--------------------------- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/Makefile b/Makefile index 6061cfc..60266a5 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 2>/dev/null #1>staph.log + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json #1>staph.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index 5a3c618..1664d03 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -495,55 +495,57 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: print(f"LEN: {delta}", end="\t") - fd, path = tempfile.mkstemp() - try: - with os.fdopen(fd, 'w') as tmp: - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - tmp.write(f">isolate_{i:04d}\n") - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - for b in blks: - tmp.write(b.extract(*tag)) - tmp.write('\n') - tmp.flush() - proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - out, err = proc.communicate() - tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(tree.total_branch_length()) - finally: - os.remove(path) + for i, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]), (end[1],end[1]+EXTEND)]): + fd, path = tempfile.mkstemp() + try: + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): + tmp.write(f">isolate_{i:04d}\n") + blks = self.seqs[tag[0]][left:right] + for b in blks: + tmp.write(b.extract(*tag)) + tmp.write('\n') + tmp.flush() + print(f"aligning {num_seqs} seqs") + proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out, err = proc.communicate() + tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + print(f"-> NUM {i}: 
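# ----------------------------------------------------------------------------
# A self-contained version of the scoring idea above, assuming `mafft` and
# `fasttree` are on PATH and `records` maps sequence names to strings; the
# score is the total branch length of the quick tree, normalized by the number
# of sequences so windows of different depth can be compared.
import io, os, subprocess, tempfile
from Bio import Phylo

def branch_length_score(records):
    fd, path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'w') as tmp:
            for name, seq in records.items():
                tmp.write(f">{name}\n{seq}\n")
        proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree",
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                shell=True)
        out, _ = proc.communicate()
        tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick')
        return tree.total_branch_length() / (2*len(records))
    finally:
        os.remove(path)
# ----------------------------------------------------------------------------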
{tree.total_branch_length()}") + finally: + os.remove(path) else: print(f"NO MATCH") # NOTE: debugging code - if len(blk_list) < len(shared_blks): - ref_list = set() - first = True - for tag in ref.muts.keys(): - beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) - end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - if first: - ref_list = set([b.id for b in blks]) - first = False - else: - ref_list.intersection_update(set([b.id for b in blks])) - - qry_list = set() - first = True - for tag in qry.muts.keys(): - beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) - end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - if first: - qry_list = set([b.id for b in blks]) - first = False - else: - qry_list.intersection_update(set([b.id for b in blks])) - - breakpoint("inconsistent number of blocks") + # if len(blk_list) < len(shared_blks): + # ref_list = set() + # first = True + # for tag in ref.muts.keys(): + # beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) + # end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) + # blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + # if first: + # ref_list = set([b.id for b in blks]) + # first = False + # else: + # ref_list.intersection_update(set([b.id for b in blks])) + + # qry_list = set() + # first = True + # for tag in qry.muts.keys(): + # beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) + # end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) + # blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + # if first: + # qry_list = set([b.id for b in blks]) + # first = False + # else: + # qry_list.intersection_update(set([b.id for b in blks])) + + # breakpoint("inconsistent number of blocks") self.prune_blks() -- cgit v1.2.1 From da2f92077e0d773b065cea1b576d9f5171cd0091 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 11:20:37 -0700 Subject: fix: made window offset a global parameter in anticipation of a cli parameter --- pangraph/graph.py | 7 +++++-- pangraph/sequence.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 1664d03..a38a4e3 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -21,6 +21,7 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand # ------------------------------------------------------------------------ # globals +WINDOW = 1000 EXTEND = 2500 pp = pprint.PrettyPrinter(indent=4) @@ -495,12 +496,14 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: print(f"LEN: {delta}", end="\t") - for i, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]), (end[1],end[1]+EXTEND)]): + for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): fd, path = tempfile.mkstemp() try: with os.fdopen(fd, 'w') as tmp: for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): tmp.write(f">isolate_{i:04d}\n") + if left == right: + breakpoint("no difference") blks = self.seqs[tag[0]][left:right] for b in blks: tmp.write(b.extract(*tag)) @@ -513,7 +516,7 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> NUM {i}: {tree.total_branch_length()}") + print(f"-> NUM {n}: {tree.total_branch_length()}") finally: os.remove(path) else: diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 
6b9baba..c437602 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -173,7 +173,8 @@ class Path(object): i = np.searchsorted(self.position, beg, side='right') - 1 j = np.searchsorted(self.position, end, side='left') - assert i < j, f"not sorted, {beg}-{end}" + if i > j: + breakpoint(f"not sorted, {beg}-{end}") return l + [n.blk for n in self.nodes[i:j]] + r elif isinstance(index, int): i = np.searchsorted(self.position, index, side='right') - 1 -- cgit v1.2.1 From 6fed97e86a7e07713884efc9c485b2b7c2b66b68 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 11:55:01 -0700 Subject: fix: correctly deal with block offsets when slicing for sequences --- pangraph/graph.py | 6 ++---- pangraph/sequence.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index a38a4e3..3395891 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -504,13 +504,11 @@ class Graph(object): tmp.write(f">isolate_{i:04d}\n") if left == right: breakpoint("no difference") - blks = self.seqs[tag[0]][left:right] - for b in blks: - tmp.write(b.extract(*tag)) + tmp.write(self.seqs[tag[0]].sequence_range(left,right)) tmp.write('\n') tmp.flush() print(f"aligning {num_seqs} seqs") - proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree", + proc = subprocess.Popen(f"mafft --auto {path} | fasttree", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) diff --git a/pangraph/sequence.py b/pangraph/sequence.py index c437602..62bfd32 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -155,6 +155,39 @@ class Path(object): return None return (self.position[index[num]], self.position[index[num]+1]) + # TODO: pull out common functionality into a helper function + # TODO: merge with other sequence function + def sequence_range(self, start=None, stop=None): + beg = start or 0 + end = stop or self.position[-1] + l, r = "", "" + if beg < 0: + if len(self.nodes) > 1: + l = self.sequence_range(self.position[-1]+beg,self.position[-1]) + beg = 0 + if end > self.position[-1]: + if len(self.nodes) > 1: + r = self.sequence_range(0,end-self.position[-1]) + end = self.position[-1] + if beg > end: + beg, end = end, beg + + i = np.searchsorted(self.position, beg, side='right') - 1 + j = np.searchsorted(self.position, end, side='left') + m = "" + if i < j: + if beg < self.position[i]: + breakpoint("bad start search") + if end > self.position[j]: + breakpoint("bad end search") + m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):] + for n in self.nodes[i+1:j-1]: + m += n.blk.extract(self.name, n.num) + if j < len(self.nodes): + b = self.nodes[j].blk.extract(self.name, self.nodes[j].num) + m += b[0:(len(b)+self.position[j]-end)] + return l + m + r + def __getitem__(self, index): if isinstance(index, slice): beg = index.start or 0 @@ -162,7 +195,7 @@ class Path(object): l, r = [], [] if beg < 0: if len(self.nodes) > 1: - l = self[self.position[-1] + beg:self.position[-1]] + l = self[(self.position[-1]+beg):self.position[-1]] beg = 0 if end > self.position[-1]: if len(self.nodes) > 1: -- cgit v1.2.1 From c5209e086026a57c03501ff9194e15b6b8b0f404 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 12:34:10 -0700 Subject: fix: slicing bug that lead to incorrect lengths --- pangraph/graph.py | 43 +++++++++++++++++++++++-------------------- pangraph/sequence.py | 23 +++++++++++++---------- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git 
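# ----------------------------------------------------------------------------
# A toy version of the coordinate-to-sequence extraction being added above,
# assuming 'seqs' holds each node's contribution along one isolate's path
# (invented data; the real code calls Block.extract and handles circular wrap).
import numpy as np

seqs     = ["AAAA", "CCCCCC", "GG", "TTTTT"]
position = np.cumsum([0] + [len(s) for s in seqs])   # [0 4 10 12 17]

def sequence_range(beg, end):
    i = np.searchsorted(position, beg, side='right') - 1
    j = np.searchsorted(position, end, side='left')
    out = ""
    for k in range(i, j):
        lo = max(beg, position[k])   - position[k]
        hi = min(end, position[k+1]) - position[k]
        out += seqs[k][lo:hi]
    return out

assert sequence_range(2, 11) == "AACCCCCCG"
# ----------------------------------------------------------------------------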
a/pangraph/graph.py b/pangraph/graph.py index 3395891..89ebadc 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -496,27 +496,30 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: print(f"LEN: {delta}", end="\t") - for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): + try: fd, path = tempfile.mkstemp() - try: - with os.fdopen(fd, 'w') as tmp: - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - tmp.write(f">isolate_{i:04d}\n") - if left == right: - breakpoint("no difference") - tmp.write(self.seqs[tag[0]].sequence_range(left,right)) - tmp.write('\n') - tmp.flush() - print(f"aligning {num_seqs} seqs") - proc = subprocess.Popen(f"mafft --auto {path} | fasttree", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - out, err = proc.communicate() - tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> NUM {n}: {tree.total_branch_length()}") - finally: - os.remove(path) + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] + + tmp.write(f">isolate_{i:04d}\n") + for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW)]): #, (end[1]-WINDOW,end[1]+EXTEND)]): + s = self.seqs[tag[0]].sequence_range(left,right) + if len(s) > EXTEND + WINDOW: + breakpoint(f"bad sequence slicing: {len(s)}") + tmp.write(s + '\n') + tmp.flush() + print(f"aligning {num_seqs} seqs") + proc = subprocess.Popen(f"mafft --auto {path} | fasttree", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out, err = proc.communicate() + tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + print(f"-> NUM {n}: {tree.total_branch_length()}") + finally: + os.remove(path) else: print(f"NO MATCH") diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 62bfd32..3660b71 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -161,6 +161,8 @@ class Path(object): beg = start or 0 end = stop or self.position[-1] l, r = "", "" + # if start == 159 and stop == 3659: + # breakpoint("test") if beg < 0: if len(self.nodes) > 1: l = self.sequence_range(self.position[-1]+beg,self.position[-1]) @@ -176,16 +178,17 @@ class Path(object): j = np.searchsorted(self.position, end, side='left') m = "" if i < j: - if beg < self.position[i]: - breakpoint("bad start search") - if end > self.position[j]: - breakpoint("bad end search") - m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):] - for n in self.nodes[i+1:j-1]: - m += n.blk.extract(self.name, n.num) - if j < len(self.nodes): - b = self.nodes[j].blk.extract(self.name, self.nodes[j].num) - m += b[0:(len(b)+self.position[j]-end)] + if j >= len(self.position): + breakpoint("what?") + if i == j - 1: + m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):(end-self.position[i])] + else: + m = self.nodes[i].blk.extract(self.name, self.nodes[i].num)[(beg-self.position[i]):] + for n in self.nodes[i+1:j-1]: + m += n.blk.extract(self.name, n.num) + n = self.nodes[j-1] + b = n.blk.extract(self.name, n.num) + m += b[0:(end-self.position[j-1])] return l + m + r def __getitem__(self, index): -- cgit v1.2.1 From 66bd4dc85b165756f4f1924ff953813c899a9782 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 13:12:46 -0700 Subject: fix: odd bug associated to 
non-unicode characters in stream --- pangraph/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pangraph/utils.py b/pangraph/utils.py index e0d85fa..5d5e888 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -97,7 +97,10 @@ def as_array(x): return np.array(list(x)) def as_string(x): - return x.view(f'U{x.size}')[0] + try: + return x.view(f'U{x.size}')[0] + except: + return "".join(str(c) for c in x) def flatten(x): return np.ndarray.flatten(x[:]) -- cgit v1.2.1 From 9fa0dc81e1a6162736fbefe7ffffde79f9f808ce Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 18 Aug 2020 15:42:54 -0700 Subject: fix: added left and right extension back in --- pangraph/graph.py | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 89ebadc..2f6572a 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -496,30 +496,41 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: print(f"LEN: {delta}", end="\t") + fd = [None, None] + path = [None, None] try: - fd, path = tempfile.mkstemp() - with os.fdopen(fd, 'w') as tmp: + fd[0], path[0] = tempfile.mkstemp() + fd[1], path[1] = tempfile.mkstemp() + with os.fdopen(fd[0], 'w') as tmp1, os.fdopen(fd[1], 'w') as tmp2: + tmp = [tmp1, tmp2] for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - tmp.write(f">isolate_{i:04d}\n") - for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW)]): #, (end[1]-WINDOW,end[1]+EXTEND)]): + for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): + tmp[n].write(f">isolate_{i:04d}\n") s = self.seqs[tag[0]].sequence_range(left,right) if len(s) > EXTEND + WINDOW: breakpoint(f"bad sequence slicing: {len(s)}") - tmp.write(s + '\n') - tmp.flush() - print(f"aligning {num_seqs} seqs") - proc = subprocess.Popen(f"mafft --auto {path} | fasttree", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - out, err = proc.communicate() - tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> NUM {n}: {tree.total_branch_length()}") + tmp[n].write(s + '\n') + + tmp[0].flush() + tmp[1].flush() + + def make_tree(n): + proc = subprocess.Popen(f"mafft --auto {path[n]} | fasttree", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out, err = proc.communicate() + tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + print(f"-> {n} SCORE: {tree.total_branch_length()/(2*{num_seqs})}") + + make_tree(0) + make_tree(1) finally: - os.remove(path) + os.remove(path[0]) + os.remove(path[1]) else: print(f"NO MATCH") -- cgit v1.2.1 From bd21a4412a3e35f32ad84cf19328b7c89e0ed6e5 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 08:44:51 -0700 Subject: chore: update makefile to redirect to correct targets --- Makefile | 2 +- pangraph/graph.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 60266a5..117dbc9 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json #1>staph.log + pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null # figures diff --git 
a/pangraph/graph.py b/pangraph/graph.py index 2f6572a..1a67c16 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -524,7 +524,7 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> {n} SCORE: {tree.total_branch_length()/(2*{num_seqs})}") + print(f"-> {n} SCORE: {tree.total_branch_length()/(2*num_seqs)}") make_tree(0) make_tree(1) -- cgit v1.2.1 From d1ac5cedcc325be21de8e75bed39ef7741a76a19 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 11:02:50 -0700 Subject: feat: added simple fasta parser --- pangraph/utils.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pangraph/utils.py b/pangraph/utils.py index 5d5e888..82e0e3c 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -160,6 +160,30 @@ def getnwk(node, newick, parentdist, leaf_names): # ------------------------------------------------------------------------ # parsers +def parse_fasta(fh): + class Record: + def __init__(self, name=None, meta=None, seq=None): + self.seq = seq + self.name = name + self.meta = meta + + header = fh.readline() + while True: + if header == "": + return 1, None + if header[0] != '>': + return 0, "improper fasta file syntax" + + name = header[1:].split() + seq = "" + for line in fh: + if line == "" or line[0] == ">": + break + seq += line + + header = line + yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq) + def parse_paf(fh): hits = [] for line in fh: -- cgit v1.2.1 From 91e389d6e3d0c33f02474804802b053bb67cc48d Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 11:40:11 -0700 Subject: fix: corrected the iterator --- pangraph/utils.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/pangraph/utils.py b/pangraph/utils.py index 82e0e3c..eff9432 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -157,6 +157,11 @@ def getnwk(node, newick, parentdist, leaf_names): newick = "(%s" % (newick) return newick +def as_str(s): + if isinstance(s, bytes): + return s.decode('utf-8') + return s + # ------------------------------------------------------------------------ # parsers @@ -167,21 +172,23 @@ def parse_fasta(fh): self.name = name self.meta = meta - header = fh.readline() - while True: - if header == "": - return 1, None - if header[0] != '>': - return 0, "improper fasta file syntax" + def __str__(self): + return f">{self.name} {self.meta}\n{self.seq[:77]}...\n" + + def __repr__(self): + return str(self) + header = as_str(fh.readline()) + while header != "" and header[0] == ">": name = header[1:].split() seq = "" for line in fh: + line = as_str(line) if line == "" or line[0] == ">": break seq += line - header = line + header = as_str(line) yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq) def parse_paf(fh): -- cgit v1.2.1 From e01b49284da3d8384744695b7a5aeaa37bb47d4f Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 11:52:05 -0700 Subject: fix: use a string buffer --- pangraph/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pangraph/utils.py b/pangraph/utils.py index eff9432..3e553a0 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -3,6 +3,7 @@ import csv import gzip import numpy as np +from io import StringIO from enum import IntEnum from Bio import SeqIO @@ -181,15 +182,16 @@ def parse_fasta(fh): header = as_str(fh.readline()) while header != "" and header[0] == ">": name = header[1:].split() - seq = "" + seq = 
StringIO() for line in fh: line = as_str(line) if line == "" or line[0] == ">": break - seq += line + seq.write(line) header = as_str(line) - yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq) + yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq.getvalue()) + seq.close() def parse_paf(fh): hits = [] -- cgit v1.2.1 From bc17c2a9a5cd218180d272061d492dcbbf566144 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 11:54:29 -0700 Subject: feat: added code to filter plasmids from chromosomes --- scripts/filter_plasmids.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100755 scripts/filter_plasmids.py diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py new file mode 100755 index 0000000..9461451 --- /dev/null +++ b/scripts/filter_plasmids.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +script to filter plasmids and chromosomes from full genome assemblies +""" + +import os +import sys +import gzip +import builtins + +from glob import glob + +sys.path.insert(0, os.path.abspath('.')) # gross hack +from pangraph.utils import parse_fasta, breakpoint + +from Bio import SeqIO + +def open(path, *args, **kwargs): + if path.endswith('.gz'): + return gzip.open(path, *args, **kwargs) + else: + return builtins.open(path, *args, **kwargs) + +# def main(args): +# for d in args: + +from time import time +if __name__ == "__main__": + t0 = time() + for path in glob("data/staph/assemblies/*.fna.gz"): + with open(path, 'rt') as fd: + seqs = [record for record in SeqIO.parse(fd, 'fasta')] + t1 = time() + print(f"bio parser took {t1 - t0} seconds") + + t0 = time() + for path in glob("data/staph/assemblies/*.fna.gz"): + with open(path, 'rt') as fd: + seqs = [record for record in parse_fasta(fd)] + t1 = time() + print(f"my parser took {t1 - t0} seconds") + + # main(sys.argv[1:]) -- cgit v1.2.1 From 8d5626243ce5560161534cba114301fd3a4dd382 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 12:05:56 -0700 Subject: fix: remove new lines from input stream --- pangraph/utils.py | 6 ++++-- scripts/filter_plasmids.py | 19 ++++++------------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/pangraph/utils.py b/pangraph/utils.py index 3e553a0..e4b63f3 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -174,7 +174,9 @@ def parse_fasta(fh): self.meta = meta def __str__(self): - return f">{self.name} {self.meta}\n{self.seq[:77]}...\n" + NL = '\n' + nc = 80 + return f">{self.name} {self.meta}\n{NL.join([self.seq[i:(i+nc)] for i in range(0, len(self.seq), nc)])}" def __repr__(self): return str(self) @@ -187,7 +189,7 @@ def parse_fasta(fh): line = as_str(line) if line == "" or line[0] == ">": break - seq.write(line) + seq.write(line[:-1]) header = as_str(line) yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq.getvalue()) diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py index 9461451..4ef2670 100755 --- a/scripts/filter_plasmids.py +++ b/scripts/filter_plasmids.py @@ -13,8 +13,6 @@ from glob import glob sys.path.insert(0, os.path.abspath('.')) # gross hack from pangraph.utils import parse_fasta, breakpoint -from Bio import SeqIO - def open(path, *args, **kwargs): if path.endswith('.gz'): return gzip.open(path, *args, **kwargs) @@ -26,18 +24,13 @@ def open(path, *args, **kwargs): from time import time if __name__ == "__main__": - t0 = time() - for path in glob("data/staph/assemblies/*.fna.gz"): - with open(path, 'rt') as fd: - seqs = [record for record in 
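# ----------------------------------------------------------------------------
# A condensed, standalone take on the generator-style parser introduced above
# (simplified: text-mode input only, no metadata; buffered through StringIO so
# long sequences are not rebuilt by repeated string concatenation).
from io import StringIO

def parse_fasta(fh):
    name, buf = None, StringIO()
    for line in fh:
        line = line.rstrip('\n')
        if line.startswith('>'):
            if name is not None:
                yield name, buf.getvalue()
            name, buf = line[1:].split()[0], StringIO()
        else:
            buf.write(line)
    if name is not None:
        yield name, buf.getvalue()

# usage sketch: with open("assembly.fna") as fh: seqs = dict(parse_fasta(fh))
# ----------------------------------------------------------------------------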
SeqIO.parse(fd, 'fasta')] - t1 = time() - print(f"bio parser took {t1 - t0} seconds") - - t0 = time() for path in glob("data/staph/assemblies/*.fna.gz"): - with open(path, 'rt') as fd: + with open(path, 'rt') as fd, open("test.fa", 'w') as wtr: + for rec in parse_fasta(fd): + # print(str(rec)) + wtr.write(str(rec)) + wtr.write('\n') seqs = [record for record in parse_fasta(fd)] - t1 = time() - print(f"my parser took {t1 - t0} seconds") + break # main(sys.argv[1:]) -- cgit v1.2.1 From 8f8ce839b3edebd591e83bad8628302e995ab2e2 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 12:16:21 -0700 Subject: feat: filter plasmids now works on the directory level --- scripts/filter_plasmids.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py index 4ef2670..71aa23f 100755 --- a/scripts/filter_plasmids.py +++ b/scripts/filter_plasmids.py @@ -19,18 +19,24 @@ def open(path, *args, **kwargs): else: return builtins.open(path, *args, **kwargs) -# def main(args): -# for d in args: +def main(args): + for arg in args: + in_dir = f"data/{arg}/assemblies" + if not os.path.exists(in_dir): + print(f"{in_dir} doesn't exist. skipping...") + continue + + out_dir = f"data/{arg}-plasmid/assemblies" + if not os.path.exists(out_dir): + os.mkdirs(out_dir) + + for path in glob(f"{in_dir}/*.f?a*"): + with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr: + for i, rec in enumerate(parse_fasta(fd)): + if i == 0: + continue + wtr.write(str(rec)) + wtr.write('\n') -from time import time if __name__ == "__main__": - for path in glob("data/staph/assemblies/*.fna.gz"): - with open(path, 'rt') as fd, open("test.fa", 'w') as wtr: - for rec in parse_fasta(fd): - # print(str(rec)) - wtr.write(str(rec)) - wtr.write('\n') - seqs = [record for record in parse_fasta(fd)] - break - - # main(sys.argv[1:]) + main(sys.argv[1:]) -- cgit v1.2.1 From 3f29dfc9a170bd5e78e8fa77c52a6f0450b00c63 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 12:20:35 -0700 Subject: fix: typo in function call --- scripts/filter_plasmids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py index 71aa23f..81a902d 100755 --- a/scripts/filter_plasmids.py +++ b/scripts/filter_plasmids.py @@ -28,7 +28,7 @@ def main(args): out_dir = f"data/{arg}-plasmid/assemblies" if not os.path.exists(out_dir): - os.mkdirs(out_dir) + os.makedirs(out_dir) for path in glob(f"{in_dir}/*.f?a*"): with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr: -- cgit v1.2.1 From 582f1f29268f3c3792c5b965d6948ff4b7633671 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 19 Aug 2020 12:43:26 -0700 Subject: feat: allow for plasmids and chromosome filtering --- scripts/filter_plasmids.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py index 81a902d..c309150 100755 --- a/scripts/filter_plasmids.py +++ b/scripts/filter_plasmids.py @@ -7,6 +7,7 @@ import os import sys import gzip import builtins +import argparse from glob import glob @@ -19,14 +20,18 @@ def open(path, *args, **kwargs): else: return builtins.open(path, *args, **kwargs) -def main(args): - for arg in args: - in_dir = f"data/{arg}/assemblies" +def main(dirs, plasmids=True): + for d in dirs: + in_dir = f"data/{d}/assemblies" if not 
os.path.exists(in_dir): print(f"{in_dir} doesn't exist. skipping...") continue - out_dir = f"data/{arg}-plasmid/assemblies" + if plasmids: + out_dir = f"data/{d}-plasmid/assemblies" + else: + out_dir = f"data/{d}-chromosome/assemblies" + if not os.path.exists(out_dir): os.makedirs(out_dir) @@ -34,9 +39,19 @@ def main(args): with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr: for i, rec in enumerate(parse_fasta(fd)): if i == 0: + if not plasmids: + wtr.write(str(rec)) + wtr.write('\n') + break continue + wtr.write(str(rec)) wtr.write('\n') +parser = argparse.ArgumentParser(description='seperate plasmids from chromosomes') +parser.add_argument('directories', metavar='dirs', nargs='+') +parser.add_argument('--chromosomes', default=False, action='store_true') + if __name__ == "__main__": - main(sys.argv[1:]) + args = parser.parse_args() + main(args.directories, plasmids=not args.chromosomes) -- cgit v1.2.1 From 80cede9edef586bbf68258f04ec3be8371afcc0c Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 10:39:05 -0700 Subject: feat: window and extend for end repair now CLI parameters --- Makefile | 2 +- pangraph/build.py | 18 +++++++++++++----- pangraph/graph.py | 20 ++++++++++---------- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 117dbc9..ab398b6 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null # figures diff --git a/pangraph/build.py b/pangraph/build.py index 4577732..5c3a8ec 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -38,6 +38,16 @@ def register_args(parser): type=int, default=22, help="energy cost for mutations (used during block merges)") + parser.add_argument("-w", "--window", + metavar="edge window", + type=int, + default=1000, + help="amount of sequence to align from for end repair") + parser.add_argument("-e", "--extend", + metavar="edge extend", + type=int, + default=1000, + help="amount of sequence to extend for end repair") parser.add_argument("-s", "--statistics", default=False, action='store_true', @@ -71,12 +81,10 @@ def main(args): mkdir(tmp) log("aligning") - with cProfile.Profile() as pr: - T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) - # TODO: when debugging phase is done, remove tmp directory + T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) + # TODO: when debugging phase is done, remove tmp directory - graphs = T.collect() - pr.dump_stats("perf.prof") + graphs = T.collect() for i, g in enumerate(graphs): log(f"graph {i}: nseqs: {len(g.seqs)} nblks: {len(g.blks)}") diff --git a/pangraph/graph.py b/pangraph/graph.py index 1a67c16..83ef839 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -21,8 +21,8 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand # ------------------------------------------------------------------------ # globals -WINDOW = 1000 -EXTEND = 2500 +# WINDOW = 1000 +# EXTEND = 2500 pp = pprint.PrettyPrinter(indent=4) # ------------------------------------------------------------------------ @@ -140,7 +140,7 @@ class Graph(object): graphs, names = [], [] for name, path in G.seqs.items(): blks = set([b.id for b in 
path.blocks()]) - gi = [ i for i, g in enumerate(graphs) if overlaps(blks, g)] + gi = [i for i, g in enumerate(graphs) if overlaps(blks, g)] if len(gi) == 0: graphs.append(blks) names.append(set([name])) @@ -163,7 +163,7 @@ class Graph(object): # --------------- # methods - def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False): + def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False): from seqanpy import align_global as align # ---------------------------------- @@ -324,7 +324,7 @@ class Graph(object): continue merged = True - self.merge(proc(hit)) + self.merge(proc(hit), edge_window, edge_extend) merged_blks.add(hit['ref']['name']) merged_blks.add(hit['qry']['name']) @@ -414,7 +414,7 @@ class Graph(object): blks.update(path.blocks()) self.blks = {b.id:self.blks[b.id] for b in blks} - def merge(self, hit): + def merge(self, hit, window, extend): old_ref = self.blks[hit['ref']['name']] old_qry = self.blks[hit['qry']['name']] @@ -475,7 +475,7 @@ class Graph(object): # NOTE: this is a hack to deal with flipped orientations pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] if first: blk_list = set([b.id for b in blks]) first = False @@ -489,7 +489,7 @@ class Graph(object): except: breakpoint("bad find") beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] blk_list.intersection_update(set([b.id for b in blks])) num_seqs += 1 @@ -507,10 +507,10 @@ class Graph(object): pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): + for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]): tmp[n].write(f">isolate_{i:04d}\n") s = self.seqs[tag[0]].sequence_range(left,right) - if len(s) > EXTEND + WINDOW: + if len(s) > extend + window: breakpoint(f"bad sequence slicing: {len(s)}") tmp[n].write(s + '\n') -- cgit v1.2.1 From 0931ce34c979da9e376feeed515b8515748f9a6f Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 11:27:49 -0700 Subject: fix: pass in cli parameters into union function now --- Makefile | 2 +- pangraph/build.py | 2 +- pangraph/graph.py | 2 +- pangraph/tree.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index ab398b6..0fe7c0a 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 100 data/staph/guide.json # figures diff --git a/pangraph/build.py b/pangraph/build.py index 5c3a8ec..e744149 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -81,7 +81,7 @@ def main(args): mkdir(tmp) log("aligning") - T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) + T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.window, args.extend, args.statistics) # TODO: when debugging phase is done, remove tmp directory graphs = T.collect() diff --git a/pangraph/graph.py 
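# ----------------------------------------------------------------------------
# A minimal sketch of how the new end-repair knobs reach the aligner, assuming
# an argparse entry point like build.py's; the parse_args list below is only a
# stand-in for a real command line.
import argparse

parser = argparse.ArgumentParser(description="toy pangraph-build front end")
parser.add_argument("-w", "--window", type=int, default=1000,
                    help="amount of sequence to align from for end repair")
parser.add_argument("-e", "--extend", type=int, default=2500,
                    help="amount of sequence to extend for end repair")
args = parser.parse_args(["-w", "1000", "-e", "2500"])
# the values are then threaded through, per the patch:
#   T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.window, args.extend, args.statistics)
print(args.window, args.extend)
# ----------------------------------------------------------------------------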
b/pangraph/graph.py index 83ef839..f4afa97 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -163,7 +163,7 @@ class Graph(object): # --------------- # methods - def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False): + def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False, edge_window=1000, edge_extend=2500): from seqanpy import align_global as align # ---------------------------------- diff --git a/pangraph/tree.py b/pangraph/tree.py index 64ac119..eeb52c8 100644 --- a/pangraph/tree.py +++ b/pangraph/tree.py @@ -292,7 +292,7 @@ class Tree(object): leafs = {n.name: n for n in self.get_leafs()} self.seqs = {leafs[name]:seq for name,seq in seqs.items()} - def align(self, tmpdir, min_blk_len, mu, beta, extensive, log_stats=False, verbose=False): + def align(self, tmpdir, min_blk_len, mu, beta, extensive, edge_window, edge_extend, log_stats=False, verbose=False): self.root.set_level(0) # NOTE: for debug logging stats = {} # --------------------------------------------- @@ -351,7 +351,7 @@ class Tree(object): graph1, fapath1 = node1.graph, node1.fapath graph2, fapath2 = node2.graph, node2.fapath graph = Graph.fuse(graph1, graph2) - graph, _ = graph.union(fapath1, fapath2, f"{tmpdir}/{n.name}", min_blk_len, mu, beta, extensive) + graph, _ = graph.union(fapath1, fapath2, f"{tmpdir}/{n.name}", min_blk_len, mu, beta, extensive, edge_window, edge_extend) else: graph = node1.graph @@ -361,7 +361,7 @@ class Tree(object): itr = f"{tmpdir}/{n.name}_iter_{i}" with open(f"{itr}.fa", 'w') as fd: graph.write_fasta(fd) - graph, contin = graph.union(itr, itr, f"{tmpdir}/{n.name}_iter_{i}", min_blk_len, mu, beta, extensive) + graph, contin = graph.union(itr, itr, f"{tmpdir}/{n.name}_iter_{i}", min_blk_len, mu, beta, extensive, edge_window, edge_extend) if not contin: return graph return graph -- cgit v1.2.1 From 53dbbf03451cf9495fb9602257ecadf2e1f843e5 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 13:32:02 -0700 Subject: feat: added prototype for plotting log file data --- Makefile | 2 +- pangraph/graph.py | 7 ++++--- pangraph/tree.py | 2 +- scripts/parse_log.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 scripts/parse_log.py diff --git a/Makefile b/Makefile index 0fe7c0a..b3f1d20 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 100 data/staph/guide.json + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 2>/dev/null 1>staph-e2500-w1000.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index f4afa97..235ef31 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -495,7 +495,7 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: - print(f"LEN: {delta}", end="\t") + print(f">LEN: {delta}", end=';') fd = [None, None] path = [None, None] try: @@ -524,15 +524,16 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> {n} SCORE: {tree.total_branch_length()/(2*num_seqs)}") + print(f"-> {n} SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") make_tree(0) make_tree(1) + print("\n", end="") finally: os.remove(path[0]) os.remove(path[1]) else: - print(f"NO MATCH") + print(f">NO 
MATCH") # NOTE: debugging code # if len(blk_list) < len(shared_blks): diff --git a/pangraph/tree.py b/pangraph/tree.py index eeb52c8..d36b77d 100644 --- a/pangraph/tree.py +++ b/pangraph/tree.py @@ -383,7 +383,7 @@ class Tree(object): for n in self.postorder(): if n.is_leaf(): continue - print(f"---NODE LEVEL {n.level}---") + print(f"+++LEVEL={n.level}+++") n.fapath = f"{tmpdir}/{n.name}" log(f"fusing {n.child[0].name} with {n.child[1].name} @ {n.name}") n.graph = merge(*n.child) diff --git a/scripts/parse_log.py b/scripts/parse_log.py new file mode 100644 index 0000000..1caae07 --- /dev/null +++ b/scripts/parse_log.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +""" +script to process our end repair log files for plotting +""" +import argparse +from collections import defaultdict + +level_preset = "+++LEVEL=" +level_offset = len(level_preset) + +score_preset = "SCORE=" +score_offset = len(score_preset) + +def main(args): + for log_path in args: + with open(log_path) as log: + level = -1 + stats = defaultdict(lambda: {'hits':[], 'miss': 0}) + for line in log: + line.rstrip('\n') + if line[0] == "+": + assert line.startwith(level_preset), "check syntax in log file" + level = int(line[level_offset:line.find("+++", level_offset)]) + continue + if line[0] == ">": + if line[1:] == "NO MATCH": + stats[level]['miss'] += 1 + continue + if line[1:].startswith("LEN="): + offset = [line.find(";")] + offset.append(line.find(";", offset[0])) + score[0] = int(line[line.find(score_preset, offset[0])+1+score_offset:offset[1]) + score[1] = int(line[line.find(score_preset, offset[1])+1+score_offset:) + stats[level]['hits'].extend(score) + + raise ValueError(f"invalid syntax: {line}") + print(stats) + +parser = argparse.ArgumentParser(description='process our data log files on end repair') +parser.add_argument('files', type=str, nargs='+') + +if __name__ == "__main__": + args = parser.parse_args() + main(args.files) -- cgit v1.2.1 From 70e85b88c3038ad707d1e7a149be3b502c58f868 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 13:34:13 -0700 Subject: fix: small typos --- scripts/parse_log.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) mode change 100644 => 100755 scripts/parse_log.py diff --git a/scripts/parse_log.py b/scripts/parse_log.py old mode 100644 new mode 100755 index 1caae07..82d5e0d --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -19,21 +19,23 @@ def main(args): for line in log: line.rstrip('\n') if line[0] == "+": - assert line.startwith(level_preset), "check syntax in log file" + assert line.startswith(level_preset), "check syntax in log file" level = int(line[level_offset:line.find("+++", level_offset)]) continue if line[0] == ">": - if line[1:] == "NO MATCH": + if line[1:].startswith("NO MATCH"): stats[level]['miss'] += 1 continue if line[1:].startswith("LEN="): offset = [line.find(";")] offset.append(line.find(";", offset[0])) - score[0] = int(line[line.find(score_preset, offset[0])+1+score_offset:offset[1]) - score[1] = int(line[line.find(score_preset, offset[1])+1+score_offset:) + score[0] = int(line[line.find(score_preset, offset[0])+1+score_offset:offset[1]]) + score[1] = int(line[line.find(score_preset, offset[1])+1+score_offset:]) stats[level]['hits'].extend(score) + continue + + raise ValueError(f"invalid syntax: {line[1:]}") - raise ValueError(f"invalid syntax: {line}") print(stats) parser = argparse.ArgumentParser(description='process our data log files on end repair') -- cgit v1.2.1 From 7e711b0d4693b301f7877d570198843238f90960 Mon Sep 17 
00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 13:34:44 -0700 Subject: fix: harmonize log file syntax --- pangraph/graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 235ef31..500df82 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -524,7 +524,7 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"-> {n} SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + print(f"{n} SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") make_tree(0) make_tree(1) -- cgit v1.2.1 From 8b50c4516a6e53c9d6f06207a0c636f9348967e2 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 14:19:10 -0700 Subject: fix: reharmonize syntax of log file --- pangraph/graph.py | 4 ++-- scripts/parse_log.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 500df82..c4e3f83 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -495,7 +495,7 @@ class Graph(object): delta = len(blk_list)-len(shared_blks) if delta > 0 and num_seqs > 1: - print(f">LEN: {delta}", end=';') + print(f">LEN={delta}", end=';') fd = [None, None] path = [None, None] try: @@ -524,7 +524,7 @@ class Graph(object): shell=True) out, err = proc.communicate() tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') - print(f"{n} SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") make_tree(0) make_tree(1) diff --git a/scripts/parse_log.py b/scripts/parse_log.py index 82d5e0d..bbca0ea 100755 --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -28,16 +28,16 @@ def main(args): continue if line[1:].startswith("LEN="): offset = [line.find(";")] - offset.append(line.find(";", offset[0])) - score[0] = int(line[line.find(score_preset, offset[0])+1+score_offset:offset[1]]) - score[1] = int(line[line.find(score_preset, offset[1])+1+score_offset:]) + offset.append(line.find(";", offset[0]+1)) + offset.append(line.find(";", offset[1]+1)) + + score = [None, None] + score[0] = float(line[offset[0]+1+score_offset:offset[1]]) + score[1] = float(line[offset[1]+1+score_offset:offset[2]]) stats[level]['hits'].extend(score) continue - raise ValueError(f"invalid syntax: {line[1:]}") - print(stats) - parser = argparse.ArgumentParser(description='process our data log files on end repair') parser.add_argument('files', type=str, nargs='+') -- cgit v1.2.1 From 1c8a828dbe87d5adf6a0f528fe9f0e07ee32fa68 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 15:00:45 -0700 Subject: fix: allow manual override for tmp directory number --- pangraph/build.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pangraph/build.py b/pangraph/build.py index e744149..8cbfa1d 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -52,6 +52,10 @@ def register_args(parser): default=False, action='store_true', help="boolean flag that toggles whether the graph statistics are computed for intermediate graphs") + parser.add_argument("-n", "--tmp-dir-num", + type=int, + default=-1, + help="manually sets the tmp directory number. 
internal use only.") parser.add_argument("input", type=str, default="-", @@ -74,10 +78,13 @@ def main(args): root = args.dir.rstrip('/') tmp = f"{root}/tmp" - i = 0 - while os.path.isdir(tmp) and i < 32: - i += 1 - tmp = f"{root}/tmp{i:03d}" + if args.n == -1: + i = 0 + while os.path.isdir(tmp) and i < 64: + i += 1 + tmp = f"{root}/tmp{i:03d}" + else: + tmp = f"{root}/tmp{args.n:03d}" mkdir(tmp) log("aligning") -- cgit v1.2.1 From c712cd0f686df973f6281bbed98e821243bc3684 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 15:04:20 -0700 Subject: fix: small name change --- pangraph/build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangraph/build.py b/pangraph/build.py index 8cbfa1d..5caceb3 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -52,7 +52,7 @@ def register_args(parser): default=False, action='store_true', help="boolean flag that toggles whether the graph statistics are computed for intermediate graphs") - parser.add_argument("-n", "--tmp-dir-num", + parser.add_argument("-n", "--num", type=int, default=-1, help="manually sets the tmp directory number. internal use only.") @@ -78,7 +78,7 @@ def main(args): root = args.dir.rstrip('/') tmp = f"{root}/tmp" - if args.n == -1: + if args.num == -1: i = 0 while os.path.isdir(tmp) and i < 64: i += 1 -- cgit v1.2.1 From f90622c44bc0fe71a1e5a12a3dbc22d19ee5f770 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 20 Aug 2020 15:05:37 -0700 Subject: fix: small name change v2 --- pangraph/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangraph/build.py b/pangraph/build.py index 5caceb3..92708a4 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -84,7 +84,7 @@ def main(args): i += 1 tmp = f"{root}/tmp{i:03d}" else: - tmp = f"{root}/tmp{args.n:03d}" + tmp = f"{root}/tmp{args.num:03d}" mkdir(tmp) log("aligning") -- cgit v1.2.1 From 78639f76e10810296607df2e9b3f839117185cde Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 2 Sep 2020 12:22:23 -0700 Subject: feat: added output of alignments to logging --- Makefile | 2 +- pangraph/graph.py | 16 +++++++++++++--- scripts/parse_log.py | 12 +++++++++--- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index b3f1d20..b0b9ffb 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 2>/dev/null 1>staph-e2500-w1000.log + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index c4e3f83..d154f74 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -518,12 +518,22 @@ class Graph(object): tmp[1].flush() def make_tree(n): - proc = subprocess.Popen(f"mafft --auto {path[n]} | fasttree", + proc = [None, None] + out = [None, None] + err = [None, None] + proc[0] = subprocess.Popen(f"mafft --auto {path[n]}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out, err = proc.communicate() - tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick') + proc[1] = subprocess.Popen(f"fasttree", + stdin =subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + out[0], err[0] = proc[0].communicate() + out[1], err[1] = proc[1].communicate(input=out[0]) + tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), 
format='newick') + print(f"ALIGNMENT=\n{out[0]}") print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") make_tree(0) diff --git a/scripts/parse_log.py b/scripts/parse_log.py index bbca0ea..3a0aed0 100755 --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -12,10 +12,11 @@ score_preset = "SCORE=" score_offset = len(score_preset) def main(args): + results = {} for log_path in args: + stats = defaultdict(lambda: {'hits':[], 'miss': 0}) + level = -1 with open(log_path) as log: - level = -1 - stats = defaultdict(lambda: {'hits':[], 'miss': 0}) for line in log: line.rstrip('\n') if line[0] == "+": @@ -37,10 +38,15 @@ def main(args): stats[level]['hits'].extend(score) continue raise ValueError(f"invalid syntax: {line[1:]}") + if len(stats) > 0: + path = log_path.replace(".log", "").split("-") + e, w = int(path[1][1:]), int(path[2][1:]) + results[(e,w)] = dict(stats) + return results parser = argparse.ArgumentParser(description='process our data log files on end repair') parser.add_argument('files', type=str, nargs='+') if __name__ == "__main__": args = parser.parse_args() - main(args.files) + results = main(args.files) -- cgit v1.2.1 From d1712b5a2bbeb61c0660c57e76ccb0e62230edee Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 10 Sep 2020 13:55:29 -0700 Subject: fix: more consistent handling of left/right extension --- Makefile | 4 +- pangraph/graph.py | 134 +++++++++++++++++++++++---------------------------- scripts/parse_log.py | 50 ++++++++++++++++--- 3 files changed, 105 insertions(+), 83 deletions(-) diff --git a/Makefile b/Makefile index b0b9ffb..d1a401d 100644 --- a/Makefile +++ b/Makefile @@ -55,8 +55,8 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log - + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json +# 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log # figures # figs/figure1.png: $(STAT) diff --git a/pangraph/graph.py b/pangraph/graph.py index d154f74..cc0c8e7 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -468,60 +468,77 @@ class Graph(object): new_blocks.extend(update(old_ref, new_refs, hit['ref'], Strand.Plus)) new_blocks.extend(update(old_qry, new_qrys, hit['qry'], hit['orientation'])) - blk_list = set() + lblks_set_x, rblks_set_x = set(), set() + lblks_set_s, rblks_set_s = set(), set() first = True num_seqs = 0 - for tag in ref.muts.keys(): + for tag in shared_blks[0].muts.keys(): # NOTE: this is a hack to deal with flipped orientations - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] + lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] + rblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] + + lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] if first: - blk_list = set([b.id for b in blks]) + lblks_set_x = set([b.id for b in lblks_x]) + rblks_set_x = set([b.id for b in rblks_x]) + + lblks_set_s = set([b.id for b in lblks_s]) + rblks_set_s = set([b.id for b in rblks_s]) + first = False else: - blk_list.intersection_update(set([b.id for b in blks])) - num_seqs += 1 + lblks_set_x.intersection_update(set([b.id for b in lblks_x])) + 
rblks_set_x.intersection_update(set([b.id for b in rblks_x])) - for tag in qry.muts.keys(): - try: - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_qrys], key=lambda x: x[0]) - except: - breakpoint("bad find") - beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] - blk_list.intersection_update(set([b.id for b in blks])) + lblks_set_s.intersection_update(set([b.id for b in lblks_s])) + rblks_set_s.intersection_update(set([b.id for b in rblks_s])) num_seqs += 1 - delta = len(blk_list)-len(shared_blks) - if delta > 0 and num_seqs > 1: - print(f">LEN={delta}", end=';') - fd = [None, None] - path = [None, None] - try: - fd[0], path[0] = tempfile.mkstemp() - fd[1], path[1] = tempfile.mkstemp() - with os.fdopen(fd[0], 'w') as tmp1, os.fdopen(fd[1], 'w') as tmp2: - tmp = [tmp1, tmp2] - for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())): - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) - beg, end = pos[0], pos[-1] - - for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]): - tmp[n].write(f">isolate_{i:04d}\n") + def emit(side): + if side == 'left': + delta = len(lblks_set_x)-len(lblks_set_s) + elif side == 'right': + delta = len(rblks_set_x)-len(rblks_set_s) + else: + raise ValueError(f"unrecognized argument '{side}' for side") + + if delta > 0 and num_seqs > 1: + print(f">LEN={delta}", end=';') + try: + fd, path = tempfile.mkstemp() + with os.fdopen(fd, 'w') as tmp: + for i, tag in enumerate(merged_blks[0].muts.keys()): + pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) + beg, end = pos[0], pos[-1] + + if side == 'left': + left, right = beg[0]-extend,beg[0]+window + elif side == 'right': + left, right = end[1]-window,end[1]+extend + else: + raise ValueError(f"unrecognized argument '{side}' for side") + + iso_blks = self.seqs[tag[0]][left:right] + print("POSITIONS", pos) + print("LIST", lblks_set_x if side == 'left' else rblks_set_x) + print("SHARED", lblks_set_s if side == 'left' else rblks_set_s) + print("ISO", iso_blks) + breakpoint("stop") + tmp.write(f">isolate_{i:04d} {','.join(b.id for b in iso_blks)}\n") s = self.seqs[tag[0]].sequence_range(left,right) if len(s) > extend + window: breakpoint(f"bad sequence slicing: {len(s)}") - tmp[n].write(s + '\n') + tmp.write(s + '\n') - tmp[0].flush() - tmp[1].flush() + tmp.flush() - def make_tree(n): proc = [None, None] out = [None, None] err = [None, None] - proc[0] = subprocess.Popen(f"mafft --auto {path[n]}", + proc[0] = subprocess.Popen(f"mafft --auto {path}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) @@ -533,45 +550,16 @@ class Graph(object): out[0], err[0] = proc[0].communicate() out[1], err[1] = proc[1].communicate(input=out[0]) tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') - print(f"ALIGNMENT=\n{out[0]}") + print(f"ALIGNMENT=\n{out[0]}", end=";") print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + print("\n", end="") + finally: + os.remove(path) + else: + print(f">NO MATCH") - make_tree(0) - make_tree(1) - print("\n", end="") - finally: - os.remove(path[0]) - os.remove(path[1]) - else: - print(f">NO MATCH") - - # NOTE: debugging code - # if len(blk_list) < len(shared_blks): - # ref_list = set() - # first = True - # for tag in ref.muts.keys(): - # beg = self.seqs[tag[0]].position_of(new_refs[0], tag[1]) - # end = self.seqs[tag[0]].position_of(new_refs[-1], tag[1]) - # blks = 
self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - # if first: - # ref_list = set([b.id for b in blks]) - # first = False - # else: - # ref_list.intersection_update(set([b.id for b in blks])) - - # qry_list = set() - # first = True - # for tag in qry.muts.keys(): - # beg = self.seqs[tag[0]].position_of(new_qrys[0], tag[1]) - # end = self.seqs[tag[0]].position_of(new_qrys[-1], tag[1]) - # blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] - # if first: - # qry_list = set([b.id for b in blks]) - # first = False - # else: - # qry_list.intersection_update(set([b.id for b in blks])) - - # breakpoint("inconsistent number of blocks") + emit('left') + emit('right') self.prune_blks() diff --git a/scripts/parse_log.py b/scripts/parse_log.py index 3a0aed0..8532d3d 100755 --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -2,7 +2,11 @@ """ script to process our end repair log files for plotting """ +import ast import argparse + +from io import StringIO +from Bio import AlignIO from collections import defaultdict level_preset = "+++LEVEL=" @@ -11,6 +15,40 @@ level_offset = len(level_preset) score_preset = "SCORE=" score_offset = len(score_preset) +msaln_preset = "ALIGNMENT=" +msaln_offset = len(msaln_preset) + +def unpack(line): + offset = [line.find(";")] + offset.append(line.find(";", offset[0]+1)) + + align = StringIO(ast.literal_eval(line[msaln_offset:offset[0]]).decode('utf-8')) + align = next(AlignIO.parse(align, 'fasta')) + score = float(line[offset[0]+1+score_offset:offset[1]]) + + return align, score + +# TODO: remove hardcoded numbers +def save_aln_examples(results): + stats = results[(2500,1000)] + nums = [0, 0, 0] + for score, aln in stats[1]['hits']: + if len(aln) < 10: + continue + + if score < 1e-4: + with open(f"scratch/1/eg_{nums[0]}.fna", 'w') as fd: + AlignIO.write(aln, fd, "fasta") + nums[0] += 1 + elif score < 1e-2: + with open(f"scratch/2/eg_{nums[1]}.fna", 'w') as fd: + AlignIO.write(aln, fd, "fasta") + nums[1] += 1 + else: + with open(f"scratch/3/eg_{nums[2]}.fna", 'w') as fd: + AlignIO.write(aln, fd, "fasta") + nums[2] += 1 + def main(args): results = {} for log_path in args: @@ -28,14 +66,9 @@ def main(args): stats[level]['miss'] += 1 continue if line[1:].startswith("LEN="): - offset = [line.find(";")] - offset.append(line.find(";", offset[0]+1)) - offset.append(line.find(";", offset[1]+1)) - - score = [None, None] - score[0] = float(line[offset[0]+1+score_offset:offset[1]]) - score[1] = float(line[offset[1]+1+score_offset:offset[2]]) - stats[level]['hits'].extend(score) + laln, lscore = unpack(log.readline()) + raln, rscore = unpack(log.readline()) + stats[level]['hits'].extend([(lscore, laln), (rscore, raln)]) continue raise ValueError(f"invalid syntax: {line[1:]}") if len(stats) > 0: @@ -50,3 +83,4 @@ parser.add_argument('files', type=str, nargs='+') if __name__ == "__main__": args = parser.parse_args() results = main(args.files) + save_aln_examples(results) -- cgit v1.2.1 From e24f680eeb4aecefec3e06f4e9857657fc9e8d1c Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 10 Sep 2020 14:37:21 -0700 Subject: fix: proper handling of Z2 flipped orientations --- Makefile | 2 +- pangraph/graph.py | 33 ++++++++++++++++++++++++--------- pangraph/sequence.py | 6 ++++++ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index d1a401d..85c1b24 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph 
-m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json + pangraph build -d data/staph -m 500 -b 0 -e 25 -w 1000 data/staph/guide.json # 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log # figures diff --git a/pangraph/graph.py b/pangraph/graph.py index cc0c8e7..96cb721 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -473,14 +473,27 @@ class Graph(object): first = True num_seqs = 0 for tag in shared_blks[0].muts.keys(): - # NOTE: this is a hack to deal with flipped orientations - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) - beg, end = pos[0], pos[-1] - lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] - rblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] - - lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] - rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + pos = [self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks] + strand = [self.seqs[tag[0]].orientation_of(b, tag[1]) for b in shared_blks] + if strand[0] == Strand.Plus: + beg, end = pos[0], pos[-1] + + lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] + rblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] + + lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + elif strand[0] == Strand.Minus: + beg, end = pos[0], pos[-1] + + rblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] + lblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] + + rblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + lblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + else: + raise ValueError("unrecognized strand polarity") + if first: lblks_set_x = set([b.id for b in lblks_x]) rblks_set_x = set([b.id for b in rblks_x]) @@ -511,7 +524,8 @@ class Graph(object): fd, path = tempfile.mkstemp() with os.fdopen(fd, 'w') as tmp: for i, tag in enumerate(merged_blks[0].muts.keys()): - pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) + pos = [self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks] + strand = [self.seqs[tag[0]].orientation_of(b, tag[1]) for b in shared_blks] beg, end = pos[0], pos[-1] if side == 'left': @@ -523,6 +537,7 @@ class Graph(object): iso_blks = self.seqs[tag[0]][left:right] print("POSITIONS", pos) + print("STRAND", strand) print("LIST", lblks_set_x if side == 'left' else rblks_set_x) print("SHARED", lblks_set_s if side == 'left' else rblks_set_s) print("ISO", iso_blks) diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 3660b71..98170b4 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -155,6 +155,12 @@ class Path(object): return None return (self.position[index[num]], self.position[index[num]+1]) + def orientation_of(self, blk, num): + orientation = { n.num:n.strand for i, n in enumerate(self.nodes) if n.blk == blk } + if not num in orientation: + return None + return orientation[num] + # TODO: pull out common functionality into a helper function # TODO: merge with other sequence function def sequence_range(self, start=None, stop=None): -- cgit v1.2.1 From 02d03656460253609daf2883de5518582b8b7af7 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 10 Sep 2020 15:04:40 -0700 Subject: fix: properly deal with precondition for edge repair. 
now only if block lists show diversity --- Makefile | 2 +- pangraph/block.py | 2 +- pangraph/graph.py | 68 +++++++++++++++++++++++++++++++++------------------- pangraph/sequence.py | 2 -- 4 files changed, 45 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 85c1b24..d1a401d 100644 --- a/Makefile +++ b/Makefile @@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 -e 25 -w 1000 data/staph/guide.json + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json # 2>staph-e2500-w1000.err 1>staph-e2500-w1000.log # figures diff --git a/pangraph/block.py b/pangraph/block.py index 1bc501d..92f9107 100644 --- a/pangraph/block.py +++ b/pangraph/block.py @@ -146,8 +146,8 @@ class Block(object): qryblks = [nb for i, nb in enumerate(newblks) if qrys[i] is not None] if aln['orientation'] == -1: qryblks = qryblks[::-1] - refblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None] + refblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None] sharedblks = [nb for i, nb in enumerate(newblks) if refs[i] is not None and qrys[i] is not None] return newblks, qryblks, refblks, sharedblks, isomap diff --git a/pangraph/graph.py b/pangraph/graph.py index 96cb721..096d62a 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -475,22 +475,24 @@ class Graph(object): for tag in shared_blks[0].muts.keys(): pos = [self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks] strand = [self.seqs[tag[0]].orientation_of(b, tag[1]) for b in shared_blks] + beg, end = pos[0], pos[-1] if strand[0] == Strand.Plus: - beg, end = pos[0], pos[-1] + lwindow = min(window, shared_blks[0].len_of(*tag)) + rwindow = min(window, shared_blks[-1].len_of(*tag)) - lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] - rblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] + lblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+lwindow] + rblks_x = self.seqs[tag[0]][end[1]-rwindow:end[1]+extend] - lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] - rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + # lblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + # rblks_s = self.seqs[tag[0]][end[1]-window:end[1]] elif strand[0] == Strand.Minus: - beg, end = pos[0], pos[-1] + lwindow = min(window, shared_blks[-1].len_of(*tag)) + rwindow = min(window, shared_blks[0].len_of(*tag)) + rblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+rwindow] + lblks_x = self.seqs[tag[0]][end[1]-lwindow:end[1]+extend] - rblks_x = self.seqs[tag[0]][beg[0]-extend:beg[0]+window] - lblks_x = self.seqs[tag[0]][end[1]-window:end[1]+extend] - - rblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] - lblks_s = self.seqs[tag[0]][end[1]-window:end[1]] + # rblks_s = self.seqs[tag[0]][beg[0]:beg[0]+window] + # lblks_s = self.seqs[tag[0]][end[1]-window:end[1]] else: raise ValueError("unrecognized strand polarity") @@ -498,23 +500,28 @@ class Graph(object): lblks_set_x = set([b.id for b in lblks_x]) rblks_set_x = set([b.id for b in rblks_x]) - lblks_set_s = set([b.id for b in lblks_s]) - rblks_set_s = set([b.id for b in rblks_s]) + # lblks_set_s = set([b.id for b in lblks_s]) + # rblks_set_s = set([b.id for b in rblks_s]) + + lblks_set_s = set([b.id for b in lblks_x]) + rblks_set_s = set([b.id for b in rblks_x]) first = False else: lblks_set_x.intersection_update(set([b.id for b in lblks_x])) rblks_set_x.intersection_update(set([b.id for b in rblks_x])) - lblks_set_s.intersection_update(set([b.id for b in 
lblks_s])) - rblks_set_s.intersection_update(set([b.id for b in rblks_s])) + lblks_set_s.update(set([b.id for b in lblks_x])) + rblks_set_s.update(set([b.id for b in rblks_x])) + # lblks_set_s.intersection_update(set([b.id for b in lblks_s])) + # rblks_set_s.intersection_update(set([b.id for b in rblks_s])) num_seqs += 1 def emit(side): if side == 'left': - delta = len(lblks_set_x)-len(lblks_set_s) + delta = len(lblks_set_s)-len(lblks_set_x) elif side == 'right': - delta = len(rblks_set_x)-len(rblks_set_s) + delta = len(lblks_set_s)-len(rblks_set_x) else: raise ValueError(f"unrecognized argument '{side}' for side") @@ -528,18 +535,29 @@ class Graph(object): strand = [self.seqs[tag[0]].orientation_of(b, tag[1]) for b in shared_blks] beg, end = pos[0], pos[-1] - if side == 'left': - left, right = beg[0]-extend,beg[0]+window - elif side == 'right': - left, right = end[1]-window,end[1]+extend - else: - raise ValueError(f"unrecognized argument '{side}' for side") + if strand[0] == Strand.Plus: + if side == 'left': + left, right = beg[0]-extend,beg[0]+min(window,shared_blks[0].len_of(*tag)) + elif side == 'right': + left, right = end[1]-min(window,shared_blks[-1].len_of(*tag)),end[1]+extend + else: + raise ValueError(f"unrecognized argument '{side}' for side") + + elif strand[0] == Strand.Minus: + if side == 'left': + left, right = end[1]-min(window,shared_blks[-1].len_of(*tag)),end[1]+extend + elif side == 'right': + left, right = beg[0]-extend,beg[0]+min(window, shared_blks[0].len_of(*tag)) + else: + raise ValueError(f"unrecognized argument '{side}' for side") iso_blks = self.seqs[tag[0]][left:right] print("POSITIONS", pos) print("STRAND", strand) - print("LIST", lblks_set_x if side == 'left' else rblks_set_x) - print("SHARED", lblks_set_s if side == 'left' else rblks_set_s) + print("LIST", shared_blks) + print("MERGED", merged_blks) + print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x) + print("UNION", lblks_set_s if side == 'left' else rblks_set_s) print("ISO", iso_blks) breakpoint("stop") tmp.write(f">isolate_{i:04d} {','.join(b.id for b in iso_blks)}\n") diff --git a/pangraph/sequence.py b/pangraph/sequence.py index 98170b4..db2b3e6 100644 --- a/pangraph/sequence.py +++ b/pangraph/sequence.py @@ -167,8 +167,6 @@ class Path(object): beg = start or 0 end = stop or self.position[-1] l, r = "", "" - # if start == 159 and stop == 3659: - # breakpoint("test") if beg < 0: if len(self.nodes) > 1: l = self.sequence_range(self.position[-1]+beg,self.position[-1]) -- cgit v1.2.1 From 95e9b2408debb3d4c13d64f5bdef47f585d9e2f8 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 10 Sep 2020 15:40:48 -0700 Subject: fix: changed score from average tree branch length to average column entropy of msa --- pangraph/graph.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 096d62a..d561e6c 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -5,11 +5,12 @@ import pprint import subprocess import tempfile +from io import StringIO from glob import glob from collections import defaultdict, Counter from itertools import chain -from Bio import SeqIO, Phylo +from Bio import AlignIO, SeqIO, Phylo from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord @@ -25,6 +26,22 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand # EXTEND = 2500 pp = pprint.PrettyPrinter(indent=4) +# ------------------------------------------------------------------------ +# utility + +def 
entropy(s): + S = 0 + c = Counter(s) + S = sum((v/len(s))*np.log(len(s)/v) for v in c.values()) + return S + +def alignment_entropy(rdr): + aln = AlignIO.read(rdr, 'fasta') + S = 0 + for i in range(aln.get_alignment_length()): + S += entropy(aln[:,i]) + return S/aln.get_alignment_length() + # ------------------------------------------------------------------------ # Junction class # simple struct @@ -581,10 +598,13 @@ class Graph(object): stderr=subprocess.PIPE, shell=True) out[0], err[0] = proc[0].communicate() - out[1], err[1] = proc[1].communicate(input=out[0]) - tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') - print(f"ALIGNMENT=\n{out[0]}", end=";") - print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + # out[1], err[1] = proc[1].communicate(input=out[0]) + # tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') + print(f"ALIGNMENT=\n{out[0].decode('utf-8')}", end=";") + rdr = StringIO(out[0].decode('utf-8')) + print(f"SCORE={alignment_entropy(rdr)}", end=";") + rdr.close() + # print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") print("\n", end="") finally: os.remove(path) -- cgit v1.2.1 From 8f9a55297a46af2267896de9fb3e1e50d07d9783 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 15 Sep 2020 13:03:56 -0700 Subject: fix: moved to use numpy array instead of slicing AlignIO object --- pangraph/graph.py | 51 ++++++++++++++++++++++++++------------------------- scripts/parse_log.py | 45 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 62 insertions(+), 34 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index d561e6c..fbbb1e7 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -36,11 +36,15 @@ def entropy(s): return S def alignment_entropy(rdr): - aln = AlignIO.read(rdr, 'fasta') - S = 0 - for i in range(aln.get_alignment_length()): - S += entropy(aln[:,i]) - return S/aln.get_alignment_length() + try: + aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character) + S = 0 + for i in range(aln.shape[1]): + S += entropy(aln[:,i]) + return S/aln.shape[1] + except Exception as msg: + print(f"ERROR: {msg}") + return None # ------------------------------------------------------------------------ # Junction class @@ -569,14 +573,14 @@ class Graph(object): raise ValueError(f"unrecognized argument '{side}' for side") iso_blks = self.seqs[tag[0]][left:right] - print("POSITIONS", pos) - print("STRAND", strand) - print("LIST", shared_blks) - print("MERGED", merged_blks) - print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x) - print("UNION", lblks_set_s if side == 'left' else rblks_set_s) - print("ISO", iso_blks) - breakpoint("stop") + # print("POSITIONS", pos) + # print("STRAND", strand) + # print("LIST", shared_blks) + # print("MERGED", merged_blks) + # print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x) + # print("UNION", lblks_set_s if side == 'left' else rblks_set_s) + # print("ISO", iso_blks) + # breakpoint("stop") tmp.write(f">isolate_{i:04d} {','.join(b.id for b in iso_blks)}\n") s = self.seqs[tag[0]].sequence_range(left,right) if len(s) > extend + window: @@ -585,23 +589,20 @@ class Graph(object): tmp.flush() - proc = [None, None] - out = [None, None] - err = [None, None] - proc[0] = subprocess.Popen(f"mafft --auto {path}", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - proc[1] = subprocess.Popen(f"fasttree", - stdin =subprocess.PIPE, + proc = subprocess.Popen(f"mafft --auto {path}", stdout=subprocess.PIPE, 
stderr=subprocess.PIPE, shell=True) - out[0], err[0] = proc[0].communicate() + # proc[1] = subprocess.Popen(f"fasttree", + # stdin =subprocess.PIPE, + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE, + # shell=True) + out, err = proc.communicate() # out[1], err[1] = proc[1].communicate(input=out[0]) # tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') - print(f"ALIGNMENT=\n{out[0].decode('utf-8')}", end=";") - rdr = StringIO(out[0].decode('utf-8')) + print(f"ALIGNMENT={out}", end=";") + rdr = StringIO(out.decode('utf-8')) print(f"SCORE={alignment_entropy(rdr)}", end=";") rdr.close() # print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") diff --git a/scripts/parse_log.py b/scripts/parse_log.py index 8532d3d..65ef8fd 100755 --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -9,6 +9,9 @@ from io import StringIO from Bio import AlignIO from collections import defaultdict +import numpy as np +import matplotlib.pylab as plt + level_preset = "+++LEVEL=" level_offset = len(level_preset) @@ -21,31 +24,32 @@ msaln_offset = len(msaln_preset) def unpack(line): offset = [line.find(";")] offset.append(line.find(";", offset[0]+1)) + offset.append(line.find(";", offset[1]+1)) - align = StringIO(ast.literal_eval(line[msaln_offset:offset[0]]).decode('utf-8')) + align = StringIO(ast.literal_eval(line[msaln_offset+1+offset[0]:offset[1]]).decode('utf-8')) align = next(AlignIO.parse(align, 'fasta')) - score = float(line[offset[0]+1+score_offset:offset[1]]) + score = float(line[offset[1]+1+score_offset:offset[2]]) return align, score # TODO: remove hardcoded numbers def save_aln_examples(results): - stats = results[(2500,1000)] + stats = results[(500,1000)] nums = [0, 0, 0] for score, aln in stats[1]['hits']: if len(aln) < 10: continue if score < 1e-4: - with open(f"scratch/1/eg_{nums[0]}.fna", 'w') as fd: + with open(f"scratch/1/eg_{nums[0]}.fna", 'w+') as fd: AlignIO.write(aln, fd, "fasta") nums[0] += 1 elif score < 1e-2: - with open(f"scratch/2/eg_{nums[1]}.fna", 'w') as fd: + with open(f"scratch/2/eg_{nums[1]}.fna", 'w+') as fd: AlignIO.write(aln, fd, "fasta") nums[1] += 1 else: - with open(f"scratch/3/eg_{nums[2]}.fna", 'w') as fd: + with open(f"scratch/3/eg_{nums[2]}.fna", 'w+') as fd: AlignIO.write(aln, fd, "fasta") nums[2] += 1 @@ -66,9 +70,8 @@ def main(args): stats[level]['miss'] += 1 continue if line[1:].startswith("LEN="): - laln, lscore = unpack(log.readline()) - raln, rscore = unpack(log.readline()) - stats[level]['hits'].extend([(lscore, laln), (rscore, raln)]) + aln, score = unpack(line) + stats[level]['hits'].append((score, aln)) continue raise ValueError(f"invalid syntax: {line[1:]}") if len(stats) > 0: @@ -77,10 +80,34 @@ def main(args): results[(e,w)] = dict(stats) return results +def plot(results): + data = results[(500,1000)] + scores = [[] for _ in data.keys()] + fracs = np.zeros(len(data.keys())) + for i, lvl in enumerate(sorted(data.keys())): + scores[i] = [elt[0] for elt in data[lvl]['hits']] + fracs[i] = len(scores[i])/(len(scores[i]) + data[lvl]['miss']) + + cmap = plt.cm.get_cmap('plasma') + colors = [cmap(x) for x in np.linspace(0,1,len(scores[:-5]))] + fig, (ax1, ax2) = plt.subplots(1, 2) + ax1.plot(fracs[:-5]) + ax1.set_xlabel("tree level") + ax1.set_ylabel("fraction of good edges") + for i, score in enumerate(scores[:-5]): + ax2.plot(sorted(np.exp(score)), np.linspace(0,1,len(score)),color=colors[i],label=f"level={i+1}") + ax2.set_xlabel("average column entropy") + ax2.set_ylabel("CDF") + ax2.legend() + parser = 
argparse.ArgumentParser(description='process our data log files on end repair') parser.add_argument('files', type=str, nargs='+') +# ----------------------------------------------------------------------------- +# main point of entry + if __name__ == "__main__": args = parser.parse_args() results = main(args.files) save_aln_examples(results) + plot(results) -- cgit v1.2.1 From 26c9b16dd74767e4e99a7b771026e17b484016a7 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 15 Sep 2020 13:36:01 -0700 Subject: fix: vectorized entropy computation --- pangraph/graph.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index fbbb1e7..9f00587 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -14,6 +14,8 @@ from Bio import AlignIO, SeqIO, Phylo from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord +from scipy.stats import entropy + from . import suffix from .block import Block from .sequence import Node, Path @@ -29,18 +31,10 @@ pp = pprint.PrettyPrinter(indent=4) # ------------------------------------------------------------------------ # utility -def entropy(s): - S = 0 - c = Counter(s) - S = sum((v/len(s))*np.log(len(s)/v) for v in c.values()) - return S - def alignment_entropy(rdr): try: - aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character) - S = 0 - for i in range(aln.shape[1]): - S += entropy(aln[:,i]) + aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character).view(np.uint8) + S = sum(entropy(np.bincount(aln[:,i])/aln.shape[0]) for i in range(aln.shape[1])) return S/aln.shape[1] except Exception as msg: print(f"ERROR: {msg}") -- cgit v1.2.1
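The last two patches above replace the FastTree branch-length score with the mean per-column Shannon entropy of the mafft alignment. Below is a minimal, self-contained sketch of that scoring idea, assuming Biopython, numpy and scipy are installed; the function name mean_column_entropy and the toy alignment are illustrative and not taken verbatim from pangraph/graph.py.

import numpy as np
from io import StringIO
from Bio import AlignIO
from scipy.stats import entropy

def mean_column_entropy(fasta_handle):
    # read a FASTA multiple-sequence alignment and view each residue as its ASCII byte
    aln = AlignIO.read(fasta_handle, "fasta")
    mat = np.array([list(str(rec.seq)) for rec in aln], dtype="S1").view(np.uint8)
    # Shannon entropy of each column's character counts; scipy normalizes the counts
    per_column = (entropy(np.bincount(mat[:, i])) for i in range(mat.shape[1]))
    return sum(per_column) / mat.shape[1]

# toy usage on a hand-made three-sequence alignment
toy = StringIO(">a\nACGT-\n>b\nACGA-\n>c\nACGTT\n")
print(mean_column_entropy(toy))

In the pipeline itself the handle would be the decoded stdout of the "mafft --auto <tmpfile>" subprocess call shown in the diff, so a low score corresponds to nearly identical flanking sequence across isolates.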
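Commit 02d0365 gates end repair on whether the isolates actually disagree about a merged block's neighbourhood: for every isolate carrying the block, the IDs of the blocks falling inside the extended boundary window are collected, and repair is only attempted when the union of those ID sets is strictly larger than their intersection (and more than one isolate is involved). The toy below restates just that set logic; the isolate names, block IDs and the neighbourhoods dict are invented for illustration.

# block IDs each isolate sees inside the extended window at one block boundary
neighbourhoods = {
    "isolate_A": {"blk_1", "blk_2", "blk_9"},
    "isolate_B": {"blk_1", "blk_2", "blk_9"},
    "isolate_C": {"blk_1", "blk_7", "blk_9"},   # this isolate disagrees near the edge
}

union = set().union(*neighbourhoods.values())
intersection = set.intersection(*neighbourhoods.values())
delta = len(union) - len(intersection)

if delta > 0 and len(neighbourhoods) > 1:
    print(f">LEN={delta}", end=";")   # same record prefix the log parser looks for
    print()                           # followed by ALIGNMENT=...;SCORE=...; in a real run
else:
    print(">NO MATCH")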
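Several of the slices above (beg[0]-extend, end[1]+extend) can run past the ends of a path because the underlying genomes are circular; as the context lines of the sequence.py diff show, Path.sequence_range handles this for multi-block paths by pulling the overhang from the opposite end. A stripped-down version of that wrap-around rule on a plain string, with an invented helper name, might look like this:

def circular_slice(seq, beg, end):
    # out-of-range coordinates wrap around, as on a circular chromosome
    n = len(seq)
    prefix = seq[n + beg:] if beg < 0 else ""
    suffix = seq[:end - n] if end > n else ""
    return prefix + seq[max(beg, 0):min(end, n)] + suffix

assert circular_slice("ABCDEFGH", -2, 3) == "GHABC"   # wraps off the left edge
assert circular_slice("ABCDEFGH", 5, 10) == "FGHAB"   # wraps off the right edge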