From 8f9a55297a46af2267896de9fb3e1e50d07d9783 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 15 Sep 2020 13:03:56 -0700 Subject: fix: moved to use numpy array instead of slicing AlignIO object --- pangraph/graph.py | 51 ++++++++++++++++++++++++++------------------------- scripts/parse_log.py | 45 ++++++++++++++++++++++++++++++++++++--------- 2 files changed, 62 insertions(+), 34 deletions(-) diff --git a/pangraph/graph.py b/pangraph/graph.py index d561e6c..fbbb1e7 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -36,11 +36,15 @@ def entropy(s): return S def alignment_entropy(rdr): - aln = AlignIO.read(rdr, 'fasta') - S = 0 - for i in range(aln.get_alignment_length()): - S += entropy(aln[:,i]) - return S/aln.get_alignment_length() + try: + aln = np.array([list(rec) for rec in AlignIO.read(rdr, 'fasta')], np.character) + S = 0 + for i in range(aln.shape[1]): + S += entropy(aln[:,i]) + return S/aln.shape[1] + except Exception as msg: + print(f"ERROR: {msg}") + return None # ------------------------------------------------------------------------ # Junction class @@ -569,14 +573,14 @@ class Graph(object): raise ValueError(f"unrecognized argument '{side}' for side") iso_blks = self.seqs[tag[0]][left:right] - print("POSITIONS", pos) - print("STRAND", strand) - print("LIST", shared_blks) - print("MERGED", merged_blks) - print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x) - print("UNION", lblks_set_s if side == 'left' else rblks_set_s) - print("ISO", iso_blks) - breakpoint("stop") + # print("POSITIONS", pos) + # print("STRAND", strand) + # print("LIST", shared_blks) + # print("MERGED", merged_blks) + # print("INTERSECTION", lblks_set_x if side == 'left' else rblks_set_x) + # print("UNION", lblks_set_s if side == 'left' else rblks_set_s) + # print("ISO", iso_blks) + # breakpoint("stop") tmp.write(f">isolate_{i:04d} {','.join(b.id for b in iso_blks)}\n") s = self.seqs[tag[0]].sequence_range(left,right) if len(s) > extend + window: @@ -585,23 +589,20 @@ class Graph(object): tmp.flush() - proc = [None, None] - out = [None, None] - err = [None, None] - proc[0] = subprocess.Popen(f"mafft --auto {path}", - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - proc[1] = subprocess.Popen(f"fasttree", - stdin =subprocess.PIPE, + proc = subprocess.Popen(f"mafft --auto {path}", stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - out[0], err[0] = proc[0].communicate() + # proc[1] = subprocess.Popen(f"fasttree", + # stdin =subprocess.PIPE, + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE, + # shell=True) + out, err = proc.communicate() # out[1], err[1] = proc[1].communicate(input=out[0]) # tree = Phylo.read(io.StringIO(out[1].decode('utf-8')), format='newick') - print(f"ALIGNMENT=\n{out[0].decode('utf-8')}", end=";") - rdr = StringIO(out[0].decode('utf-8')) + print(f"ALIGNMENT={out}", end=";") + rdr = StringIO(out.decode('utf-8')) print(f"SCORE={alignment_entropy(rdr)}", end=";") rdr.close() # print(f"SCORE={tree.total_branch_length()/(2*num_seqs)}", end=";") diff --git a/scripts/parse_log.py b/scripts/parse_log.py index 8532d3d..65ef8fd 100755 --- a/scripts/parse_log.py +++ b/scripts/parse_log.py @@ -9,6 +9,9 @@ from io import StringIO from Bio import AlignIO from collections import defaultdict +import numpy as np +import matplotlib.pylab as plt + level_preset = "+++LEVEL=" level_offset = len(level_preset) @@ -21,31 +24,32 @@ msaln_offset = len(msaln_preset) def unpack(line): offset = [line.find(";")] offset.append(line.find(";", offset[0]+1)) + offset.append(line.find(";", offset[1]+1)) - align = StringIO(ast.literal_eval(line[msaln_offset:offset[0]]).decode('utf-8')) + align = StringIO(ast.literal_eval(line[msaln_offset+1+offset[0]:offset[1]]).decode('utf-8')) align = next(AlignIO.parse(align, 'fasta')) - score = float(line[offset[0]+1+score_offset:offset[1]]) + score = float(line[offset[1]+1+score_offset:offset[2]]) return align, score # TODO: remove hardcoded numbers def save_aln_examples(results): - stats = results[(2500,1000)] + stats = results[(500,1000)] nums = [0, 0, 0] for score, aln in stats[1]['hits']: if len(aln) < 10: continue if score < 1e-4: - with open(f"scratch/1/eg_{nums[0]}.fna", 'w') as fd: + with open(f"scratch/1/eg_{nums[0]}.fna", 'w+') as fd: AlignIO.write(aln, fd, "fasta") nums[0] += 1 elif score < 1e-2: - with open(f"scratch/2/eg_{nums[1]}.fna", 'w') as fd: + with open(f"scratch/2/eg_{nums[1]}.fna", 'w+') as fd: AlignIO.write(aln, fd, "fasta") nums[1] += 1 else: - with open(f"scratch/3/eg_{nums[2]}.fna", 'w') as fd: + with open(f"scratch/3/eg_{nums[2]}.fna", 'w+') as fd: AlignIO.write(aln, fd, "fasta") nums[2] += 1 @@ -66,9 +70,8 @@ def main(args): stats[level]['miss'] += 1 continue if line[1:].startswith("LEN="): - laln, lscore = unpack(log.readline()) - raln, rscore = unpack(log.readline()) - stats[level]['hits'].extend([(lscore, laln), (rscore, raln)]) + aln, score = unpack(line) + stats[level]['hits'].append((score, aln)) continue raise ValueError(f"invalid syntax: {line[1:]}") if len(stats) > 0: @@ -77,10 +80,34 @@ def main(args): results[(e,w)] = dict(stats) return results +def plot(results): + data = results[(500,1000)] + scores = [[] for _ in data.keys()] + fracs = np.zeros(len(data.keys())) + for i, lvl in enumerate(sorted(data.keys())): + scores[i] = [elt[0] for elt in data[lvl]['hits']] + fracs[i] = len(scores[i])/(len(scores[i]) + data[lvl]['miss']) + + cmap = plt.cm.get_cmap('plasma') + colors = [cmap(x) for x in np.linspace(0,1,len(scores[:-5]))] + fig, (ax1, ax2) = plt.subplots(1, 2) + ax1.plot(fracs[:-5]) + ax1.set_xlabel("tree level") + ax1.set_ylabel("fraction of good edges") + for i, score in enumerate(scores[:-5]): + ax2.plot(sorted(np.exp(score)), np.linspace(0,1,len(score)),color=colors[i],label=f"level={i+1}") + ax2.set_xlabel("average column entropy") + ax2.set_ylabel("CDF") + ax2.legend() + parser = argparse.ArgumentParser(description='process our data log files on end repair') parser.add_argument('files', type=str, nargs='+') +# ----------------------------------------------------------------------------- +# main point of entry + if __name__ == "__main__": args = parser.parse_args() results = main(args.files) save_aln_examples(results) + plot(results) -- cgit v1.2.1