aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-09-10 15:40:48 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-09-10 15:40:48 -0700
commit95e9b2408debb3d4c13d64f5bdef47f585d9e2f8 (patch)
tree207e559ce644cabae32727e9cd91359cdb6b8588
parent02d03656460253609daf2883de5518582b8b7af7 (diff)
fix: changed score from average tree branch length to average column entropy of msa
-rw-r--r--pangraph/graph.py30
1 files changed, 25 insertions, 5 deletions
diff --git a/pangraph/graph.py b/pangraph/graph.py
index 096d62a..d561e6c 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -5,11 +5,12 @@ import pprint
import subprocess
import tempfile
+from io import StringIO
from glob import glob
from collections import defaultdict, Counter
from itertools import chain
-from Bio import SeqIO, Phylo
+from Bio import AlignIO, SeqIO, Phylo
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
@@ -26,6 +27,22 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand
pp = pprint.PrettyPrinter(indent=4)
# ------------------------------------------------------------------------
+# utility
+
+def entropy(s):
+ S = 0
+ c = Counter(s)
+ S = sum((v/len(s))*np.log(len(s)/v) for v in c.values())
+ return S
+
+def alignment_entropy(rdr):
+ aln = AlignIO.read(rdr, 'fasta')
+ S = 0
+ for i in range(aln.get_alignment_length()):
+ S += entropy(aln[:,i])
+ return S/aln.get_alignment_length()
+
+# ------------------------------------------------------------------------
# Junction class
# simple struct
@@ -581,10 +598,13 @@ class Graph(object):
stderr=subprocess.PIPE,
shell=True)
out[0], err[0] = proc[0].communicate()
- out[1], err[1] = proc[1].communicate(input=out[0])
- tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick')
- print(f"ALIGNMENT=\n{out[0]}", end=";")
- print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";")
+ # out[1], err[1] = proc[1].communicate(input=out[0])
+ # tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick')
+ print(f"ALIGNMENT=\n{out[0].decode('utf-8')}", end=";")
+ rdr = StringIO(out[0].decode('utf-8'))
+ print(f"SCORE={alignment_entropy(rdr)}", end=";")
+ rdr.close()
+ # print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";")
print("\n", end="")
finally:
os.remove(path)