aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-08-13 15:33:22 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-08-13 15:33:22 -0700
commit25b974618ec3242a9d7d7373b5c9a3a8bf0f0681 (patch)
tree337fbcc23a117b5226c4ed8a0d53cb0416588d36
parent967b6412dddc99de429e692973819bb7147b0354 (diff)
feat: now assess the alignment based on total tree length
-rw-r--r--Makefile2
-rw-r--r--pangraph/graph.py42
2 files changed, 28 insertions, 16 deletions
diff --git a/Makefile b/Makefile
index 117dbc9..6061cfc 100644
--- a/Makefile
+++ b/Makefile
@@ -55,7 +55,7 @@ staph:
@echo "cluster staph"; \
pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz
@echo "build staph"; \
- pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null
+ pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 2>/dev/null #1>staph.log
# figures
diff --git a/pangraph/graph.py b/pangraph/graph.py
index f45cfd2..5a3c618 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -1,12 +1,13 @@
-import os, sys
+import io, os, sys
import json
import numpy as np
import pprint
+import subprocess
+import tempfile
from glob import glob
from collections import defaultdict, Counter
from itertools import chain
-import subprocess
from Bio import SeqIO, Phylo
from Bio.Seq import Seq
@@ -468,6 +469,7 @@ class Graph(object):
blk_list = set()
first = True
+ num_seqs = 0
for tag in ref.muts.keys():
# NOTE: this is a hack to deal with flipped orientations
pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0])
@@ -478,6 +480,7 @@ class Graph(object):
first = False
else:
blk_list.intersection_update(set([b.id for b in blks]))
+ num_seqs += 1
for tag in qry.muts.keys():
try:
@@ -487,21 +490,30 @@ class Graph(object):
beg, end = pos[0], pos[-1]
blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
blk_list.intersection_update(set([b.id for b in blks]))
+ num_seqs += 1
delta = len(blk_list)-len(shared_blks)
- if delta > 0:
- print(f"LEN: {delta}")
- buf = ""
- for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())):
- buf += ">isolate_{i:04d}\n"
- blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
- for b in blks:
- buf += b.extract(*tag)
- buf += '\n'
- proc = subprocess.run(["mafft", "-"], stdout=subprocess.PIPE, input=buf, encoding='ascii')
- print(proc.stdout)
- print(proc.stderr)
- breakpoint('test')
+ if delta > 0 and num_seqs > 1:
+ print(f"LEN: {delta}", end="\t")
+ fd, path = tempfile.mkstemp()
+ try:
+ with os.fdopen(fd, 'w') as tmp:
+ for i, tag in enumerate(chain(ref.muts.keys(), qry.muts.keys())):
+ tmp.write(f">isolate_{i:04d}\n")
+ blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
+ for b in blks:
+ tmp.write(b.extract(*tag))
+ tmp.write('\n')
+ tmp.flush()
+ proc = subprocess.Popen(f"mafft --auto --quiet {path} | fasttree",
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ shell=True)
+ out, err = proc.communicate()
+ tree = Phylo.read(io.StringIO(out.decode('utf-8')), format='newick')
+ print(tree.total_branch_length())
+ finally:
+ os.remove(path)
else:
print(f"NO MATCH")