aboutsummaryrefslogtreecommitdiff
path: root/pangraph/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'pangraph/utils.py')
-rw-r--r--pangraph/utils.py40
1 files changed, 39 insertions, 1 deletions
diff --git a/pangraph/utils.py b/pangraph/utils.py
index e0d85fa..e4b63f3 100644
--- a/pangraph/utils.py
+++ b/pangraph/utils.py
@@ -3,6 +3,7 @@ import csv
import gzip
import numpy as np
+from io import StringIO
from enum import IntEnum
from Bio import SeqIO
@@ -97,7 +98,10 @@ def as_array(x):
return np.array(list(x))
def as_string(x):
    """Collapse an array of characters into a single string.

    The fast path reinterprets the array as one unicode scalar of width
    ``x.size`` via ``ndarray.view`` and returns its only element.
    NOTE(review): this presumably expects a contiguous 1-D array of
    single-character unicode strings (as built by ``as_array``) -- confirm
    against callers.

    If the view cannot be taken for any ordinary reason (for example the
    input is not an ndarray, or its dtype/layout does not permit the
    reinterpretation), the elements are stringified one by one and joined.

    Parameters
    ----------
    x : array-like of characters

    Returns
    -------
    str (or ``numpy.str_`` on the fast path)
    """
    try:
        return x.view(f'U{x.size}')[0]
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt / SystemExit
        # must propagate instead of silently triggering the slow path.
        return "".join(str(c) for c in x)
def flatten(x):
    """Return a flattened copy of ``x`` produced by ``numpy.ndarray.flatten``.

    ``x`` is sliced with ``[:]`` first and the slice is handed to the
    base-class ``ndarray.flatten`` method.
    """
    whole = x[:]
    return np.ndarray.flatten(whole)
@@ -154,9 +158,43 @@ def getnwk(node, newick, parentdist, leaf_names):
newick = "(%s" % (newick)
return newick
def as_str(s):
    """Return ``s`` as text.

    ``bytes`` input is decoded as UTF-8; any other value is handed back
    unchanged.
    """
    return s.decode('utf-8') if isinstance(s, bytes) else s
+
# ------------------------------------------------------------------------
# parsers
def parse_fasta(fh):
    """Lazily parse FASTA records from an open file handle.

    The handle may yield either ``str`` or ``bytes`` lines; every line is
    passed through ``as_str`` before inspection.

    Parsing starts at the first line of ``fh``. If that line does not begin
    with ``>`` nothing is yielded. Each header line is split on whitespace:
    the first token becomes the record name and the remaining tokens, joined
    by single spaces, become its meta string. All following lines up to the
    next header (or an empty string, or end of input) are concatenated into
    the sequence with their line terminators removed.

    Parameters
    ----------
    fh : iterable file-like object supporting ``readline()``

    Yields
    ------
    Record
        Object with ``name``, ``meta`` and ``seq`` attributes; ``str()`` of
        it renders the entry with the sequence wrapped at 80 columns.
    """

    class Record:
        """One FASTA entry: identifier, free-text description and sequence."""

        def __init__(self, name=None, meta=None, seq=None):
            self.seq = seq
            self.name = name
            self.meta = meta

        def __str__(self):
            width = 80
            wrapped = "\n".join(
                self.seq[i:i + width] for i in range(0, len(self.seq), width)
            )
            return f">{self.name} {self.meta}\n{wrapped}"

        def __repr__(self):
            return str(self)

    header = as_str(fh.readline())
    while header != "" and header[0] == ">":
        fields = header[1:].split()
        chunks = []

        # Assume end of input until the scan below finds another header.
        # Without this reset a header that is followed directly by EOF would
        # reuse a stale (or unbound) loop variable as the "next header".
        header = ""
        for line in fh:
            line = as_str(line)
            if line == "" or line[0] == ">":
                header = line
                break
            # Strip only the line terminator: a final line without a trailing
            # newline keeps its last base, and CRLF files do not leak '\r'.
            chunks.append(line.rstrip("\r\n"))

        yield Record(
            # A bare ">" header has no tokens; fall back to an empty name.
            name=fields[0] if fields else "",
            meta=" ".join(fields[1:]),
            seq="".join(chunks),
        )
+
def parse_paf(fh):
hits = []
for line in fh: