diff options
author | Nicholas Noll <nbnoll@eml.cc> | 2020-08-20 10:41:41 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-08-20 10:41:41 -0700 |
commit | 001aaaeaaed4847bff81f9c5373d5d5161d0e3ad (patch) | |
tree | d317c74f94b0f14a997a6d083c191e7850a47ceb | |
parent | bd21a4412a3e35f32ad84cf19328b7c89e0ed6e5 (diff) | |
parent | 80cede9edef586bbf68258f04ec3be8371afcc0c (diff) |
Merge pull request #5 from nnoll/feat/fasta-parser
feat/fasta parser
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | pangraph/build.py | 18 | ||||
-rw-r--r-- | pangraph/graph.py | 20 | ||||
-rw-r--r-- | pangraph/utils.py | 35 | ||||
-rwxr-xr-x | scripts/filter_plasmids.py | 57 |
5 files changed, 116 insertions, 16 deletions
@@ -55,7 +55,7 @@ staph: @echo "cluster staph"; \ pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz @echo "build staph"; \ - pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null + pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null # figures diff --git a/pangraph/build.py b/pangraph/build.py index 4577732..5c3a8ec 100644 --- a/pangraph/build.py +++ b/pangraph/build.py @@ -38,6 +38,16 @@ def register_args(parser): type=int, default=22, help="energy cost for mutations (used during block merges)") + parser.add_argument("-w", "--window", + metavar="edge window", + type=int, + default=1000, + help="amount of sequence to align from for end repair") + parser.add_argument("-e", "--extend", + metavar="edge extend", + type=int, + default=1000, + help="amount of sequence to extend for end repair") parser.add_argument("-s", "--statistics", default=False, action='store_true', @@ -71,12 +81,10 @@ def main(args): mkdir(tmp) log("aligning") - with cProfile.Profile() as pr: - T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) - # TODO: when debugging phase is done, remove tmp directory + T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics) + # TODO: when debugging phase is done, remove tmp directory - graphs = T.collect() - pr.dump_stats("perf.prof") + graphs = T.collect() for i, g in enumerate(graphs): log(f"graph {i}: nseqs: {len(g.seqs)} nblks: {len(g.blks)}") diff --git a/pangraph/graph.py b/pangraph/graph.py index 1a67c16..83ef839 100644 --- a/pangraph/graph.py +++ b/pangraph/graph.py @@ -21,8 +21,8 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand # ------------------------------------------------------------------------ # globals -WINDOW = 1000 -EXTEND = 2500 +# WINDOW = 1000 +# EXTEND = 2500 pp = pprint.PrettyPrinter(indent=4) # ------------------------------------------------------------------------ @@ -140,7 +140,7 @@ class Graph(object): graphs, names = [], [] for name, path in G.seqs.items(): blks = set([b.id for b in path.blocks()]) - gi = [ i for i, g in enumerate(graphs) if overlaps(blks, g)] + gi = [i for i, g in enumerate(graphs) if overlaps(blks, g)] if len(gi) == 0: graphs.append(blks) names.append(set([name])) @@ -163,7 +163,7 @@ class Graph(object): # --------------- # methods - def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False): + def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False): from seqanpy import align_global as align # ---------------------------------- @@ -324,7 +324,7 @@ class Graph(object): continue merged = True - self.merge(proc(hit)) + self.merge(proc(hit), edge_window, edge_extend) merged_blks.add(hit['ref']['name']) merged_blks.add(hit['qry']['name']) @@ -414,7 +414,7 @@ class Graph(object): blks.update(path.blocks()) self.blks = {b.id:self.blks[b.id] for b in blks} - def merge(self, hit): + def merge(self, hit, window, extend): old_ref = self.blks[hit['ref']['name']] old_qry = self.blks[hit['qry']['name']] @@ -475,7 +475,7 @@ class Graph(object): # NOTE: this is a hack to deal with flipped orientations pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] if first: blk_list = set([b.id for b in blks]) first = False @@ -489,7 +489,7 @@ class Graph(object): except: breakpoint("bad find") beg, end = pos[0], pos[-1] - blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND] + blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend] blk_list.intersection_update(set([b.id for b in blks])) num_seqs += 1 @@ -507,10 +507,10 @@ class Graph(object): pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0]) beg, end = pos[0], pos[-1] - for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]): + for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]): tmp[n].write(f">isolate_{i:04d}\n") s = self.seqs[tag[0]].sequence_range(left,right) - if len(s) > EXTEND + WINDOW: + if len(s) > extend + window: breakpoint(f"bad sequence slicing: {len(s)}") tmp[n].write(s + '\n') diff --git a/pangraph/utils.py b/pangraph/utils.py index 5d5e888..e4b63f3 100644 --- a/pangraph/utils.py +++ b/pangraph/utils.py @@ -3,6 +3,7 @@ import csv import gzip import numpy as np +from io import StringIO from enum import IntEnum from Bio import SeqIO @@ -157,9 +158,43 @@ def getnwk(node, newick, parentdist, leaf_names): newick = "(%s" % (newick) return newick +def as_str(s): + if isinstance(s, bytes): + return s.decode('utf-8') + return s + # ------------------------------------------------------------------------ # parsers +def parse_fasta(fh): + class Record: + def __init__(self, name=None, meta=None, seq=None): + self.seq = seq + self.name = name + self.meta = meta + + def __str__(self): + NL = '\n' + nc = 80 + return f">{self.name} {self.meta}\n{NL.join([self.seq[i:(i+nc)] for i in range(0, len(self.seq), nc)])}" + + def __repr__(self): + return str(self) + + header = as_str(fh.readline()) + while header != "" and header[0] == ">": + name = header[1:].split() + seq = StringIO() + for line in fh: + line = as_str(line) + if line == "" or line[0] == ">": + break + seq.write(line[:-1]) + + header = as_str(line) + yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq.getvalue()) + seq.close() + def parse_paf(fh): hits = [] for line in fh: diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py new file mode 100755 index 0000000..c309150 --- /dev/null +++ b/scripts/filter_plasmids.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +""" +script to filter plasmids and chromosomes from full genome assemblies +""" + +import os +import sys +import gzip +import builtins +import argparse + +from glob import glob + +sys.path.insert(0, os.path.abspath('.')) # gross hack +from pangraph.utils import parse_fasta, breakpoint + +def open(path, *args, **kwargs): + if path.endswith('.gz'): + return gzip.open(path, *args, **kwargs) + else: + return builtins.open(path, *args, **kwargs) + +def main(dirs, plasmids=True): + for d in dirs: + in_dir = f"data/{d}/assemblies" + if not os.path.exists(in_dir): + print(f"{in_dir} doesn't exist. skipping...") + continue + + if plasmids: + out_dir = f"data/{d}-plasmid/assemblies" + else: + out_dir = f"data/{d}-chromosome/assemblies" + + if not os.path.exists(out_dir): + os.makedirs(out_dir) + + for path in glob(f"{in_dir}/*.f?a*"): + with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr: + for i, rec in enumerate(parse_fasta(fd)): + if i == 0: + if not plasmids: + wtr.write(str(rec)) + wtr.write('\n') + break + continue + + wtr.write(str(rec)) + wtr.write('\n') + +parser = argparse.ArgumentParser(description='seperate plasmids from chromosomes') +parser.add_argument('directories', metavar='dirs', nargs='+') +parser.add_argument('--chromosomes', default=False, action='store_true') + +if __name__ == "__main__": + args = parser.parse_args() + main(args.directories, plasmids=not args.chromosomes) |