aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-08-20 10:41:41 -0700
committerGitHub <noreply@github.com>2020-08-20 10:41:41 -0700
commit001aaaeaaed4847bff81f9c5373d5d5161d0e3ad (patch)
treed317c74f94b0f14a997a6d083c191e7850a47ceb
parentbd21a4412a3e35f32ad84cf19328b7c89e0ed6e5 (diff)
parent80cede9edef586bbf68258f04ec3be8371afcc0c (diff)
Merge pull request #5 from nnoll/feat/fasta-parser
feat/fasta parser
-rw-r--r--Makefile2
-rw-r--r--pangraph/build.py18
-rw-r--r--pangraph/graph.py20
-rw-r--r--pangraph/utils.py35
-rwxr-xr-xscripts/filter_plasmids.py57
5 files changed, 116 insertions, 16 deletions
diff --git a/Makefile b/Makefile
index 117dbc9..ab398b6 100644
--- a/Makefile
+++ b/Makefile
@@ -55,7 +55,7 @@ staph:
@echo "cluster staph"; \
pangraph cluster -d data/staph data/staph/assemblies/*.fna.gz
@echo "build staph"; \
- pangraph build -d data/staph -m 500 -b 0 data/staph/guide.json 1>staph.log 2>/dev/null
+ pangraph build -d data/staph -m 500 -b 0 -e 2500 -w 1000 data/staph/guide.json 1>staph-e2500-w1000.log 2>/dev/null
# figures
diff --git a/pangraph/build.py b/pangraph/build.py
index 4577732..5c3a8ec 100644
--- a/pangraph/build.py
+++ b/pangraph/build.py
@@ -38,6 +38,16 @@ def register_args(parser):
type=int,
default=22,
help="energy cost for mutations (used during block merges)")
+ parser.add_argument("-w", "--window",
+ metavar="edge window",
+ type=int,
+ default=1000,
+ help="amount of sequence to align from for end repair")
+ parser.add_argument("-e", "--extend",
+ metavar="edge extend",
+ type=int,
+ default=1000,
+ help="amount of sequence to extend for end repair")
parser.add_argument("-s", "--statistics",
default=False,
action='store_true',
@@ -71,12 +81,10 @@ def main(args):
mkdir(tmp)
log("aligning")
- with cProfile.Profile() as pr:
- T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics)
- # TODO: when debugging phase is done, remove tmp directory
+ T.align(tmp, args.len, args.mu, args.beta, args.extensive, args.statistics)
+ # TODO: when debugging phase is done, remove tmp directory
- graphs = T.collect()
- pr.dump_stats("perf.prof")
+ graphs = T.collect()
for i, g in enumerate(graphs):
log(f"graph {i}: nseqs: {len(g.seqs)} nblks: {len(g.blks)}")
diff --git a/pangraph/graph.py b/pangraph/graph.py
index 1a67c16..83ef839 100644
--- a/pangraph/graph.py
+++ b/pangraph/graph.py
@@ -21,8 +21,8 @@ from .utils import Strand, as_string, parse_paf, panic, as_record, new_strand
# ------------------------------------------------------------------------
# globals
-WINDOW = 1000
-EXTEND = 2500
+# WINDOW = 1000
+# EXTEND = 2500
pp = pprint.PrettyPrinter(indent=4)
# ------------------------------------------------------------------------
@@ -140,7 +140,7 @@ class Graph(object):
graphs, names = [], []
for name, path in G.seqs.items():
blks = set([b.id for b in path.blocks()])
- gi = [ i for i, g in enumerate(graphs) if overlaps(blks, g)]
+ gi = [i for i, g in enumerate(graphs) if overlaps(blks, g)]
if len(gi) == 0:
graphs.append(blks)
names.append(set([name]))
@@ -163,7 +163,7 @@ class Graph(object):
# ---------------
# methods
- def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, extensive=False):
+ def union(self, qpath, rpath, out, cutoff=0, alpha=10, beta=2, edge_window=100, edge_extend=2500, extensive=False):
from seqanpy import align_global as align
# ----------------------------------
@@ -324,7 +324,7 @@ class Graph(object):
continue
merged = True
- self.merge(proc(hit))
+ self.merge(proc(hit), edge_window, edge_extend)
merged_blks.add(hit['ref']['name'])
merged_blks.add(hit['qry']['name'])
@@ -414,7 +414,7 @@ class Graph(object):
blks.update(path.blocks())
self.blks = {b.id:self.blks[b.id] for b in blks}
- def merge(self, hit):
+ def merge(self, hit, window, extend):
old_ref = self.blks[hit['ref']['name']]
old_qry = self.blks[hit['qry']['name']]
@@ -475,7 +475,7 @@ class Graph(object):
# NOTE: this is a hack to deal with flipped orientations
pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in new_refs], key=lambda x: x[0])
beg, end = pos[0], pos[-1]
- blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
+ blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend]
if first:
blk_list = set([b.id for b in blks])
first = False
@@ -489,7 +489,7 @@ class Graph(object):
except:
breakpoint("bad find")
beg, end = pos[0], pos[-1]
- blks = self.seqs[tag[0]][beg[0]-EXTEND:end[1]+EXTEND]
+ blks = self.seqs[tag[0]][beg[0]-extend:end[1]+extend]
blk_list.intersection_update(set([b.id for b in blks]))
num_seqs += 1
@@ -507,10 +507,10 @@ class Graph(object):
pos = sorted([self.seqs[tag[0]].position_of(b, tag[1]) for b in shared_blks], key=lambda x: x[0])
beg, end = pos[0], pos[-1]
- for n, (left, right) in enumerate([(beg[0]-EXTEND,beg[0]+WINDOW), (end[1]-WINDOW,end[1]+EXTEND)]):
+ for n, (left, right) in enumerate([(beg[0]-extend,beg[0]+window), (end[1]-window,end[1]+extend)]):
tmp[n].write(f">isolate_{i:04d}\n")
s = self.seqs[tag[0]].sequence_range(left,right)
- if len(s) > EXTEND + WINDOW:
+ if len(s) > extend + window:
breakpoint(f"bad sequence slicing: {len(s)}")
tmp[n].write(s + '\n')
diff --git a/pangraph/utils.py b/pangraph/utils.py
index 5d5e888..e4b63f3 100644
--- a/pangraph/utils.py
+++ b/pangraph/utils.py
@@ -3,6 +3,7 @@ import csv
import gzip
import numpy as np
+from io import StringIO
from enum import IntEnum
from Bio import SeqIO
@@ -157,9 +158,43 @@ def getnwk(node, newick, parentdist, leaf_names):
newick = "(%s" % (newick)
return newick
+def as_str(s):
+ if isinstance(s, bytes):
+ return s.decode('utf-8')
+ return s
+
# ------------------------------------------------------------------------
# parsers
+def parse_fasta(fh):
+ class Record:
+ def __init__(self, name=None, meta=None, seq=None):
+ self.seq = seq
+ self.name = name
+ self.meta = meta
+
+ def __str__(self):
+ NL = '\n'
+ nc = 80
+ return f">{self.name} {self.meta}\n{NL.join([self.seq[i:(i+nc)] for i in range(0, len(self.seq), nc)])}"
+
+ def __repr__(self):
+ return str(self)
+
+ header = as_str(fh.readline())
+ while header != "" and header[0] == ">":
+ name = header[1:].split()
+ seq = StringIO()
+ for line in fh:
+ line = as_str(line)
+ if line == "" or line[0] == ">":
+ break
+ seq.write(line[:-1])
+
+ header = as_str(line)
+ yield Record(name=name[0], meta=" ".join(name[1:]), seq=seq.getvalue())
+ seq.close()
+
def parse_paf(fh):
hits = []
for line in fh:
diff --git a/scripts/filter_plasmids.py b/scripts/filter_plasmids.py
new file mode 100755
index 0000000..c309150
--- /dev/null
+++ b/scripts/filter_plasmids.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""
+script to filter plasmids and chromosomes from full genome assemblies
+"""
+
+import os
+import sys
+import gzip
+import builtins
+import argparse
+
+from glob import glob
+
+sys.path.insert(0, os.path.abspath('.')) # gross hack
+from pangraph.utils import parse_fasta, breakpoint
+
+def open(path, *args, **kwargs):
+ if path.endswith('.gz'):
+ return gzip.open(path, *args, **kwargs)
+ else:
+ return builtins.open(path, *args, **kwargs)
+
+def main(dirs, plasmids=True):
+ for d in dirs:
+ in_dir = f"data/{d}/assemblies"
+ if not os.path.exists(in_dir):
+ print(f"{in_dir} doesn't exist. skipping...")
+ continue
+
+ if plasmids:
+ out_dir = f"data/{d}-plasmid/assemblies"
+ else:
+ out_dir = f"data/{d}-chromosome/assemblies"
+
+ if not os.path.exists(out_dir):
+ os.makedirs(out_dir)
+
+ for path in glob(f"{in_dir}/*.f?a*"):
+ with open(path, 'rt') as fd, open(f"{out_dir}/{os.path.basename(path).replace('.gz', '')}", 'w') as wtr:
+ for i, rec in enumerate(parse_fasta(fd)):
+ if i == 0:
+ if not plasmids:
+ wtr.write(str(rec))
+ wtr.write('\n')
+ break
+ continue
+
+ wtr.write(str(rec))
+ wtr.write('\n')
+
+parser = argparse.ArgumentParser(description='seperate plasmids from chromosomes')
+parser.add_argument('directories', metavar='dirs', nargs='+')
+parser.add_argument('--chromosomes', default=False, action='store_true')
+
+if __name__ == "__main__":
+ args = parser.parse_args()
+ main(args.directories, plasmids=not args.chromosomes)