about summary refs log tree commit diff
diff options
context:
space:
mode:
author Nicholas Noll <nbnoll@eml.cc> 2020-09-15 13:36:01 -0700
committer Nicholas Noll <nbnoll@eml.cc> 2020-09-15 13:36:01 -0700
commit 26c9b16dd74767e4e99a7b771026e17b484016a7 (patch)
tree 2a5f4e4330a7d5bddffd23fd26320495a708b24a
parent 8f9a55297a46af2267896de9fb3e1e50d07d9783 (diff)
fix: vectorized entropy computation
-rw-r--r-- pangraph/graph.py 14
1 files changed, 4 insertions, 10 deletions
diff --git a/pangraph/graph.py b/pangraph/graph.py
index fbbb1e7..9f00587 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -14,6 +14,8 @@ from Bio import AlignIO, SeqIO, Phylo
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
+from scipy.stats import entropy
+
from . import suffix
from .block import Block
from .sequence import Node, Path
@@ -29,18 +31,10 @@ pp = pprint.PrettyPrinter(indent=4)
# ------------------------------------------------------------------------
# utility
-def entropy(s):
- S = 0
- c = Counter(s)
- S = sum((v/len(s))*np.log(len(s)/v) for v in c.values())
- return S
-
def alignment_entropy(rdr):
try:
- aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character)
- S = 0
- for i in range(aln.shape[1]):
- S += entropy(aln[:,i])
+ aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character).view(np.uint8)
+ S = sum(entropy(np.bincount(aln[:,i])/aln.shape[0]) for i in range(aln.shape[1]))
return S/aln.shape[1]
except Exception as msg:
print(f"ERROR: {msg}")