aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-09-15 13:03:56 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-09-15 13:03:56 -0700
commit8f9a55297a46af2267896de9fb3e1e50d07d9783 (patch)
tree33cdd570f504eda657e0d63f7c8affb641201e6e
parent95e9b2408debb3d4c13d64f5bdef47f585d9e2f8 (diff)
fix: moved to use numpy array instead of slicing AlignIO object
-rw-r--r--pangraph/graph.py51
-rwxr-xr-xscripts/parse_log.py45
2 files changed, 62 insertions, 34 deletions
diff --git a/pangraph/graph.py b/pangraph/graph.py
index d561e6c..fbbb1e7 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -36,11 +36,15 @@ def entropy(s):
return S
def alignment_entropy(rdr):
- aln = AlignIO.read(rdr, 'fasta')
- S = 0
- for i in range(aln.get_alignment_length()):
- S += entropy(aln[:,i])
- return S/aln.get_alignment_length()
+ try:
+ aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character)
+ S = 0
+ for i in range(aln.shape[1]):
+ S += entropy(aln[:,i])
+ return S/aln.shape[1]
+ except Exception as msg:
+ print(f"ERROR: {msg}")
+ return None
# ------------------------------------------------------------------------
# Junction class
@@ -569,14 +573,14 @@ class Graph(object):
raise ValueError(f"unrecognized argument '{side}' for side")
iso_blks = self.seqs[tag[0]][left:right]
- print("POSITIONS", pos)
- print("STRAND", strand)
- print("LIST", shared_blks)
- print("MERGED", merged_blks)
- print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x)
- print("UNION", lblks_set_s if side == 'left' else rblks_set_s)
- print("ISO", iso_blks)
- breakpoint("stop")
+ # print("POSITIONS", pos)
+ # print("STRAND", strand)
+ # print("LIST", shared_blks)
+ # print("MERGED", merged_blks)
+ # print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x)
+ # print("UNION", lblks_set_s if side == 'left' else rblks_set_s)
+ # print("ISO", iso_blks)
+ # breakpoint("stop")
tmp.write(f">isolate_{i:04d} {','.join(b.id for b in iso_blks)}\n")
s = self.seqs[tag[0]].sequence_range(left,right)
if len(s) > extend + window:
@@ -585,23 +589,20 @@ class Graph(object):
tmp.flush()
- proc = [None, None]
- out = [None, None]
- err = [None, None]
- proc[0] = subprocess.Popen(f"mafft --auto {path}",
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- shell=True)
- proc[1] = subprocess.Popen(f"fasttree",
- stdin =subprocess.PIPE,
+ proc = subprocess.Popen(f"mafft --auto {path}",
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True)
- out[0], err[0] = proc[0].communicate()
+ # proc[1] = subprocess.Popen(f"fasttree",
+ # stdin =subprocess.PIPE,
+ # stdout=subprocess.PIPE,
+ # stderr=subprocess.PIPE,
+ # shell=True)
+ out, err = proc.communicate()
# out[1], err[1] = proc[1].communicate(input=out[0])
# tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick')
- print(f"ALIGNMENT=\n{out[0].decode('utf-8')}", end=";")
- rdr = StringIO(out[0].decode('utf-8'))
+ print(f"ALIGNMENT={out}", end=";")
+ rdr = StringIO(out.decode('utf-8'))
print(f"SCORE={alignment_entropy(rdr)}", end=";")
rdr.close()
# print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";")
diff --git a/scripts/parse_log.py b/scripts/parse_log.py
index 8532d3d..65ef8fd 100755
--- a/scripts/parse_log.py
+++ b/scripts/parse_log.py
@@ -9,6 +9,9 @@ from io import StringIO
from Bio import AlignIO
from collections import defaultdict
+import numpy as np
+import matplotlib.pylab as plt
+
level_preset = "+++LEVEL="
level_offset = len(level_preset)
@@ -21,31 +24,32 @@ msaln_offset = len(msaln_preset)
def unpack(line):
offset = [line.find(";")]
offset.append(line.find(";", offset[0]+1))
+ offset.append(line.find(";", offset[1]+1))
- align = StringIO(ast.literal_eval(line[msaln_offset:offset[0]]).decode('utf-8'))
+ align = StringIO(ast.literal_eval(line[msaln_offset+1+offset[0]:offset[1]]).decode('utf-8'))
align = next(AlignIO.parse(align, 'fasta'))
- score = float(line[offset[0]+1+score_offset:offset[1]])
+ score = float(line[offset[1]+1+score_offset:offset[2]])
return align, score
# TODO: remove hardcoded numbers
def save_aln_examples(results):
- stats = results[(2500,1000)]
+ stats = results[(500,1000)]
nums = [0, 0, 0]
for score, aln in stats[1]['hits']:
if len(aln) < 10:
continue
if score < 1e-4:
- with open(f"scratch/1/eg_{nums[0]}.fna", 'w') as fd:
+ with open(f"scratch/1/eg_{nums[0]}.fna", 'w+') as fd:
AlignIO.write(aln, fd, "fasta")
nums[0] += 1
elif score < 1e-2:
- with open(f"scratch/2/eg_{nums[1]}.fna", 'w') as fd:
+ with open(f"scratch/2/eg_{nums[1]}.fna", 'w+') as fd:
AlignIO.write(aln, fd, "fasta")
nums[1] += 1
else:
- with open(f"scratch/3/eg_{nums[2]}.fna", 'w') as fd:
+ with open(f"scratch/3/eg_{nums[2]}.fna", 'w+') as fd:
AlignIO.write(aln, fd, "fasta")
nums[2] += 1
@@ -66,9 +70,8 @@ def main(args):
stats[level]['miss'] += 1
continue
if line[1:].startswith("LEN="):
- laln, lscore = unpack(log.readline())
- raln, rscore = unpack(log.readline())
- stats[level]['hits'].extend([(lscore, laln), (rscore, raln)])
+ aln, score = unpack(line)
+ stats[level]['hits'].append((score, aln))
continue
raise ValueError(f"invalid syntax: {line[1:]}")
if len(stats) > 0:
@@ -77,10 +80,34 @@ def main(args):
results[(e,w)] = dict(stats)
return results
+def plot(results):
+ data = results[(500,1000)]
+ scores = [[] for _ in data.keys()]
+ fracs = np.zeros(len(data.keys()))
+ for i, lvl in enumerate(sorted(data.keys())):
+ scores[i] = [elt[0] for elt in data[lvl]['hits']]
+ fracs[i] = len(scores[i])/(len(scores[i]) + data[lvl]['miss'])
+
+ cmap = plt.cm.get_cmap('plasma')
+ colors = [cmap(x) for x in np.linspace(0,1,len(scores[:-5]))]
+ fig, (ax1, ax2) = plt.subplots(1, 2)
+ ax1.plot(fracs[:-5])
+ ax1.set_xlabel("tree level")
+ ax1.set_ylabel("fraction of good edges")
+ for i, score in enumerate(scores[:-5]):
+ ax2.plot(sorted(np.exp(score)), np.linspace(0,1,len(score)),color=colors[i],label=f"level={i+1}")
+ ax2.set_xlabel("average column entropy")
+ ax2.set_ylabel("CDF")
+ ax2.legend()
+
parser = argparse.ArgumentParser(description='process our data log files on end repair')
parser.add_argument('files', type=str, nargs='+')
+# -----------------------------------------------------------------------------
+# main point of entry
+
if __name__ == "__main__":
args = parser.parse_args()
results = main(args.files)
save_aln_examples(results)
+ plot(results)