diff options
Diffstat (limited to 'pangraph/utils.py')
-rw-r--r-- | pangraph/utils.py | 40 |
1 files changed, 39 insertions, 1 deletions
# Post-patch state of the definitions that are fully visible in the diff of
# pangraph/utils.py. The names `np` (numpy) is expected to come from the
# module's existing top-level imports.
#
# NOTE(review): the diff also showed the tail of `getnwk` and the first lines
# of `parse_paf` as unchanged context. Both are truncated in this view and are
# intentionally not reproduced here.


def as_array(x):
    """Return a NumPy array built from the elements of the iterable *x*."""
    return np.array(list(x))


def as_string(x):
    """Collapse a sequence of characters into a single ``str``.

    The fast path reinterprets *x* as one fixed-width unicode item via
    ``x.view(f'U{x.size}')`` (presumably *x* is a contiguous NumPy array of
    single characters -- confirm against callers). If that fails for any
    ordinary reason (no ``view``/``size`` attribute, layout not viewable,
    empty input, ...) the elements are stringified and joined instead.
    """
    try:
        return x.view(f'U{x.size}')[0]
    except Exception:
        # Deliberately broad best-effort fallback, but no longer a bare
        # `except:` so KeyboardInterrupt / SystemExit still propagate.
        return "".join(str(c) for c in x)


def flatten(x):
    """Return ``np.ndarray.flatten`` applied to ``x[:]``."""
    return np.ndarray.flatten(x[:])


def as_str(s):
    """Return *s* decoded as UTF-8 if it is ``bytes``; otherwise return it unchanged."""
    if isinstance(s, bytes):
        return s.decode('utf-8')
    return s

# ------------------------------------------------------------------------
# parsers


def parse_fasta(fh):
    """Lazily parse FASTA records from *fh*.

    Parameters
    ----------
    fh : iterable of str or bytes lines
        Typically an open text or binary (e.g. gzip) file handle. Each line
        is passed through ``as_str`` so bytes are decoded as UTF-8.

    Yields
    ------
    Record
        Object with ``name`` (first whitespace-separated token of the header),
        ``meta`` (remaining header tokens joined by single spaces) and ``seq``
        (all sequence lines of the record concatenated). ``str(record)``
        renders the record as FASTA text wrapped at 80 columns.

    Notes
    -----
    If the first line does not start with ``>`` nothing is yielded, matching
    the previous behaviour. Compared to the previous implementation this
    version:

    * strips only line terminators (``\\n`` / ``\\r\\n``), so the last base is
      kept when the final line has no trailing newline;
    * handles a header that has no sequence lines (including at end of file)
      instead of raising or yielding the same record forever;
    * tolerates a bare ``>`` header by using an empty name.
    """
    class Record:
        def __init__(self, name=None, meta=None, seq=None):
            self.seq = seq
            self.name = name
            self.meta = meta

        def __str__(self):
            nc = 80  # residues per output line
            wrapped = "\n".join(
                self.seq[i:(i + nc)] for i in range(0, len(self.seq), nc)
            )
            return f">{self.name} {self.meta}\n{wrapped}"

        def __repr__(self):
            return str(self)

    def make_record(header, chunks):
        # Split the header text (without the leading '>') into name + meta.
        fields = header.split()
        name = fields[0] if fields else ""
        return Record(name=name, meta=" ".join(fields[1:]), seq="".join(chunks))

    header = None   # header text of the record being accumulated
    chunks = []     # sequence lines collected for that record
    for raw in fh:
        line = as_str(raw).rstrip("\r\n")
        if line.startswith(">"):
            # A new header closes the record in progress, if any.
            if header is not None:
                yield make_record(header, chunks)
            header, chunks = line[1:], []
        elif header is None:
            # Input does not begin with a FASTA header: nothing to parse.
            return
        else:
            chunks.append(line)

    # Flush the final record; end of input is the only thing terminating it.
    if header is not None:
        yield make_record(header, chunks)