From 95e9b2408debb3d4c13d64f5bdef47f585d9e2f8 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 10 Sep 2020 15:40:48 -0700 Subject: fix: changed score from average tree branch length to average column entropy of msa --- pangraph/graph.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index 096d62a..d561e6c 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -5,11 +5,12 @@ import pprint import subprocess import tempfile +from io import StringIO from glob import glob from collections import defaultdict, Counter from itertools import chain -from Bio import SeqIO, Phylo +from Bio import AlignIO, SeqIO, Phylo from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord @@ -25,6 +26,22 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand # EXTEND = 2500 pp = pprint.PrettyPrinter(indent=4) +# ------------------------------------------------------------------------ +# utility + +def entropy(s): + S = 0 + c = Counter(s) + S = sum((v/len(s))*np.log(len(s)/v) for v in c.values()) + return S + +def alignment_entropy(rdr): + aln = AlignIO.read(rdr, 'fasta') + S = 0 + for i in range(aln.get_alignment_length()): + S += entropy(aln[:,i]) + return S/aln.get_alignment_length() + # ------------------------------------------------------------------------ # Junction class # simple struct @@ -581,10 +598,13 @@ class Graph(object): stderr=subprocess.PIPE, shell=True) out[0], err[0] = proc[0].communicate() - out[1], err[1] = proc[1].communicate(input=out[0]) - tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') - print(f"ALIGNMENT=\n{out[0]}", end=";") - print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") + # out[1], err[1] = proc[1].communicate(input=out[0]) + # tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') + print(f"ALIGNMENT=\n{out[0].decode('utf-8')}", end=";") + rdr = StringIO(out[0].decode('utf-8')) + print(f"SCORE={alignment_entropy(rdr)}", end=";") + rdr.close() + # print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") print("\n", end="") finally: os.remove(path) -- cgit v1.2.1