scripts/parse_log.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

#!/usr/bin/env python3
"""
script to process our end repair log files for plotting
"""
import ast
import argparse

from io          import StringIO
from Bio         import AlignIO
from collections import defaultdict

import numpy as np
import matplotlib.pylab as plt

level_preset = "+++LEVEL="
level_offset = len(level_preset)

score_preset = "SCORE="
score_offset = len(score_preset)

msaln_preset = "ALIGNMENT="
msaln_offset = len(msaln_preset)

def unpack(line):
    offset = [line.find(";")]
    offset.append(line.find(";", offset[0]+1))
    offset.append(line.find(";", offset[1]+1))

    align = StringIO(ast.literal_eval(line[msaln_offset+1+offset[0]:offset[1]]).decode('utf-8'))
    align = next(AlignIO.parse(align, 'fasta'))
    score = float(line[offset[1]+1+score_offset:offset[2]])

    return align, score

# TODO: remove hardcoded numbers
def save_aln_examples(results):
    stats = results[(500,1000)]
    nums  = [0, 0, 0]
    for score, aln in stats[1]['hits']:
        if len(aln) < 10:
            continue

        if score < 1e-4:
            with open(f"scratch/1/eg_{nums[0]}.fna", 'w+') as fd:
                AlignIO.write(aln, fd, "fasta")
                nums[0] += 1
        elif score < 1e-2:
            with open(f"scratch/2/eg_{nums[1]}.fna", 'w+') as fd:
                AlignIO.write(aln, fd, "fasta")
                nums[1] += 1
        else:
            with open(f"scratch/3/eg_{nums[2]}.fna", 'w+') as fd:
                AlignIO.write(aln, fd, "fasta")
                nums[2] += 1

def main(args):
    results = {}
    for log_path in args:
        stats = defaultdict(lambda: {'hits':[], 'miss': 0})
        level = -1
        with open(log_path) as log:
            for line in log:
                line.rstrip('\n')
                if line[0] == "+":
                    assert line.startswith(level_preset), "check syntax in log file"
                    level = int(line[level_offset:line.find("+++", level_offset)])
                    continue
                if line[0] == ">":
                    if line[1:].startswith("NO MATCH"):
                        stats[level]['miss'] += 1
                        continue
                    if line[1:].startswith("LEN="):
                        aln, score = unpack(line)
                        stats[level]['hits'].append((score, aln))
                        continue
                raise ValueError(f"invalid syntax: {line[1:]}")
        if len(stats) > 0:
            path = log_path.replace(".log", "").split("-")
            e, w = int(path[1][1:]), int(path[2][1:])
            results[(e,w)] = dict(stats)
    return results

def plot(results):
    data   = results[(500,1000)]
    scores = [[] for _ in data.keys()]
    fracs  = np.zeros(len(data.keys()))
    for i, lvl in enumerate(sorted(data.keys())):
        scores[i] = [elt[0] for elt in data[lvl]['hits']]
        fracs[i]  = len(scores[i])/(len(scores[i]) + data[lvl]['miss'])

    cmap   = plt.cm.get_cmap('plasma')
    colors = [cmap(x) for x in np.linspace(0,1,len(scores[:-5]))]
    fig, (ax1, ax2) = plt.subplots(1, 2)
    ax1.plot(fracs[:-5])
    ax1.set_xlabel("tree level")
    ax1.set_ylabel("fraction of good edges")
    for i, score in enumerate(scores[:-5]):
        ax2.plot(sorted(np.exp(score)), np.linspace(0,1,len(score)),color=colors[i],label=f"level={i+1}")
    ax2.set_xlabel("average column entropy")
    ax2.set_ylabel("CDF")
    ax2.legend()

parser = argparse.ArgumentParser(description='process our data log files on end repair')
parser.add_argument('files', type=str, nargs='+')

# -----------------------------------------------------------------------------
# main point of entry

if __name__ == "__main__":
    args = parser.parse_args()
    results = main(args.files)
    save_aln_examples(results)
    plot(results)