aboutsummaryrefslogtreecommitdiff
path: root/src/libbio/align.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libbio/align.c')
-rw-r--r--src/libbio/align.c178
1 files changed, 178 insertions, 0 deletions
diff --git a/src/libbio/align.c b/src/libbio/align.c
new file mode 100644
index 0000000..20a57df
--- /dev/null
+++ b/src/libbio/align.c
@@ -0,0 +1,178 @@
+#include <u.h>
+#include <libn.h>
+#include <libn/macro/qsort.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// globals
+
+uint64 aln·shft = (2ULL * (aln·K - 1ULL));
+uint64 aln·mask = (1ULL << (2*aln·K)) - 1ULL;
+
+// -----------------------------------------------------------------------
+// static data
+
+static uint64 nuctab[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+};
+
+// -----------------------------------------------------------------------
+// hash functions
+
+enum
+{
+ MURM641 = 0xff51afd7ed558ccd,
+ MURM642 = 0xc4ceb9fe1a85ec53
+};
+
+static
+uint64
+minihash(uint64 x, uint64 mask)
+{
+ x = (~x + (x << 21)) & mask;
+ x = x ^ (x >> 24);
+ x = (x + (x << 3) + (x << 8)) & mask;
+ x = x ^ (x >> 14);
+ x = (x + (x << 2) + (x << 4)) & mask;
+ x = x ^ (x >> 28);
+ x = (x + (x << 31));
+
+ return x;
+}
+
+static
+uint64
+murmurhash(uint64 x, uint64 mask)
+{
+ x = x ^ (x >> 33);
+ x = (x * MURM641);
+ x = x ^ (x >> 33);
+ x = (x * MURM642);
+ x = x ^ (x >> 33);
+
+ return x;
+}
+
+// -----------------------------------------------------------------------
+// locality sensitive hashing (with spatial extent)
+
+static
+void
+sortpos(uintptr len, uint64 vals[], int locs[])
+{
+ int tmpi;
+ uint64 tmpu64;
+
+#define LESS(i, j) (locs[i] < locs[j])
+#define SWAP(i, j) (tmpu64 = vals[i], tmpi = locs[i], \
+ vals[i] = vals[j], locs[i] = locs[j], \
+ vals[j] = tmpu64 , locs[j] = tmpi)
+ QSORT(len, LESS, SWAP);
+#undef LESS
+#undef SWAP
+}
+
+/*
+ * sketch
+ * @param seq: '0' terminated string
+ * @param len: number of sequential sketches to keep
+ * @param vals: buffer to store hashes of sketch.
+ * @param locs: buffer to store location of sketch hashes
+ */
+error
+aln·sketch(byte *seq, int len, uint64 *vals[aln·N], int *locs[aln·N])
+{
+ int i, n, l, *loc;
+ uint64 kmer, h[3], *val;
+ int tmpi[2];
+ uint64 tmpu[2];
+
+ for(n = 0; n < aln·N; n++) {
+ for(l = 0; l < len; l++) {
+ vals[n][l] = UINT64_MAX;
+ }
+ }
+
+ kmer = UINT64_MAX;
+ for(l = 0; *seq != '\0'; seq++, l++) {
+ kmer = ((kmer << 2) | nuctab[*seq]) & aln·mask;
+
+ h[0] = minihash(kmer, aln·mask);
+ h[1] = murmurhash(kmer, aln·mask);
+
+ for(n = 0; n < aln·N; n++) {
+ val = vals[n];
+ loc = locs[n];
+
+ h[2] = (h[0] + n * h[1]) & aln·mask;
+ for (i = 0; i < len && h[2] < val[i]; i++) {
+ ;
+ }
+
+ tmpu[1] = h[2];
+ tmpi[1] = l;
+ for(i -= 1; i >= 0; i--) {
+ tmpu[0] = tmpu[1], tmpu[1] = val[i], val[i] = tmpu[0];
+ tmpi[0] = tmpi[1], tmpi[1] = loc[i], loc[i] = tmpi[0];
+ }
+ }
+ }
+
+ for(n = 0; n < aln·N; n++) {
+ sortpos(len, vals[n], locs[n]);
+ }
+
+ return 0;
+}
+
+static
+int
+lessarrs(int len, uint64 a[], uint64 b[])
+{
+ int l;
+
+ for(l = 0; l < len; l++) {
+ if (a[l] < b[l]) return 1;
+ if (a[l] > b[l]) return 0;
+ }
+
+ return 0;
+}
+
+static
+void
+swaparrs(int len, uint64 a[], uint64 b[])
+{
+ int l;
+ uint64 tmp;
+
+ for(l = 0; l < len; l++) {
+ tmp = a[l], a[l] = b[l], b[l] = tmp;
+ }
+}
+
+error
+aln·sort(uintptr n, int len, uint64 *vals)
+{
+#define LESS(i, j) (lessarrs(len, vals+((i)*len), vals+((j)*len)))
+#define SWAP(i, j) (swaparrs(len, vals+((i)*len), vals+((j)*len)))
+ QSORT(n, LESS, SWAP);
+#undef LESS
+#undef SWAP
+ return 0;
+}