diff options
| | | |
|---|---|---|
| author | Nicholas Noll <nbnoll@eml.cc> | 2020-09-15 13:36:01 -0700 |
| committer | Nicholas Noll <nbnoll@eml.cc> | 2020-09-15 13:36:01 -0700 |
| commit | 26c9b16dd74767e4e99a7b771026e17b484016a7 (patch) | |
| tree | 2a5f4e4330a7d5bddffd23fd26320495a708b24a | |
| parent | 8f9a55297a46af2267896de9fb3e1e50d07d9783 (diff) | |
fix: vectorized entropy computation
-rw-r--r-- | pangraph/graph.py | 14 |
1 files changed, 4 insertions, 10 deletions
```diff
diff --git a/pangraph/graph.py b/pangraph/graph.py
index fbbb1e7..9f00587 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -14,6 +14,8 @@ from Bio import AlignIO, SeqIO, Phylo
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 
+from scipy.stats import entropy
+
 from . import suffix
 from .block import Block
 from .sequence import Node, Path
@@ -29,18 +31,10 @@ pp = pprint.PrettyPrinter(indent=4)
 # ------------------------------------------------------------------------
 # utility
 
-def entropy(s):
-    S = 0
-    c = Counter(s)
-    S = sum((v/len(s))*np.log(len(s)/v) for v in c.values())
-    return S
-
 def alignment_entropy(rdr):
     try:
-        aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character)
-        S = 0
-        for i in range(aln.shape[1]):
-            S += entropy(aln[:,i])
+        aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character).view(np.uint8)
+        S = sum(entropy(np.bincount(aln[:,i])/aln.shape[0]) for i in range(aln.shape[1]))
         return S/aln.shape[1]
     except Exception as msg:
         print(f"ERROR: {msg}")
```