scripts/parse_log.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

#!/usr/bin/env python3
"""
script to process our end repair log files for plotting
"""
import ast
import argparse

from io          import StringIO
from Bio         import AlignIO
from collections import defaultdict

level_preset = "+++LEVEL="
level_offset = len(level_preset)

score_preset = "SCORE="
score_offset = len(score_preset)

msaln_preset = "ALIGNMENT="
msaln_offset = len(msaln_preset)

def unpack(line):
    offset = [line.find(";")]
    offset.append(line.find(";", offset[0]+1))

    align = StringIO(ast.literal_eval(line[msaln_offset:offset[0]]).decode('utf-8'))
    align = next(AlignIO.parse(align, 'fasta'))
    score = float(line[offset[0]+1+score_offset:offset[1]])

    return align, score

# TODO: remove hardcoded numbers
def save_aln_examples(results):
    stats = results[(2500,1000)]
    nums  = [0, 0, 0]
    for score, aln in stats[1]['hits']:
        if len(aln) < 10:
            continue

        if score < 1e-4:
            with open(f"scratch/1/eg_{nums[0]}.fna", 'w') as fd:
                AlignIO.write(aln, fd, "fasta")
                nums[0] += 1
        elif score < 1e-2:
            with open(f"scratch/2/eg_{nums[1]}.fna", 'w') as fd:
                AlignIO.write(aln, fd, "fasta")
                nums[1] += 1
        else:
            with open(f"scratch/3/eg_{nums[2]}.fna", 'w') as fd:
                AlignIO.write(aln, fd, "fasta")
                nums[2] += 1

def main(args):
    results = {}
    for log_path in args:
        stats = defaultdict(lambda: {'hits':[], 'miss': 0})
        level = -1
        with open(log_path) as log:
            for line in log:
                line.rstrip('\n')
                if line[0] == "+":
                    assert line.startswith(level_preset), "check syntax in log file"
                    level = int(line[level_offset:line.find("+++", level_offset)])
                    continue
                if line[0] == ">":
                    if line[1:].startswith("NO MATCH"):
                        stats[level]['miss'] += 1
                        continue
                    if line[1:].startswith("LEN="):
                        laln, lscore = unpack(log.readline())
                        raln, rscore = unpack(log.readline())
                        stats[level]['hits'].extend([(lscore, laln), (rscore, raln)])
                        continue
                raise ValueError(f"invalid syntax: {line[1:]}")
        if len(stats) > 0:
            path = log_path.replace(".log", "").split("-")
            e, w = int(path[1][1:]), int(path[2][1:])
            results[(e,w)] = dict(stats)
    return results

parser = argparse.ArgumentParser(description='process our data log files on end repair')
parser.add_argument('files', type=str, nargs='+')

if __name__ == "__main__":
    args = parser.parse_args()
    results = main(args.files)
    save_aln_examples(results)