aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-04-26 18:14:25 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-04-26 18:14:25 -0700
commitea845a58272658b7716345855b05631bcb296605 (patch)
tree2e0a020e240d2f6a8aa61dd765237ceb29b6423e /sys
parentfe89402c696040d1f63f21bae30abd51532106ec (diff)
feat: begun work on edit distance aware local sensitive hashing algorithm
Diffstat (limited to 'sys')
-rw-r--r--sys/libbio/align.c144
1 files changed, 144 insertions, 0 deletions
diff --git a/sys/libbio/align.c b/sys/libbio/align.c
new file mode 100644
index 0000000..bcb6fd3
--- /dev/null
+++ b/sys/libbio/align.c
@@ -0,0 +1,144 @@
+#include <u.h>
+#include <libn.h>
+#include <libbio.h>
+
+// -----------------------------------------------------------------------
+// Temporary globals
+
+uint64 aln·shft = (2ULL * (aln·K - 1ULL));
+uint64 aln·mask = (1ULL << (2*aln·K)) - 1ULL;
+
+// -----------------------------------------------------------------------
+// Static data
+
+static uint64 nuctab[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+};
+
+// -----------------------------------------------------------------------
+// Hash functions
+
+enum
+{
+ MURM641 = 0xff51afd7ed558ccd,
+ MURM642 = 0xc4ceb9fe1a85ec53
+};
+
+static
+uint64
+minihash(uint64 x, uint64 mask)
+{
+ x = (~x + (x << 21)) & mask;
+ x = x ^ (x >> 24);
+ x = (x + (x << 3) + (x << 8)) & mask;
+ x = x ^ (x >> 14);
+ x = (x + (x << 2) + (x << 4)) & mask;
+ x = x ^ (x >> 28);
+ x = (x + (x << 31));
+
+ return x;
+}
+
+static
+uint64
+murmurhash(uint64 x, uint64 mask)
+{
+ x = x ^ (x >> 33);
+ x = (x * MURM641);
+ x = x ^ (x >> 33);
+ x = (x * MURM642);
+ x = x ^ (x >> 33);
+
+ return x;
+}
+
+// -----------------------------------------------------------------------
+// Locality sensitive hashing (with spatial extent)
+
+error
+aln·sketch(byte *seq, int L, uint64 *vals[aln·N], int *locs[aln·N])
+{
+ int i, n, l, *loc;
+ uint64 kmer, tmp[4], h[3], *val;
+
+ for (n = 0; n < aln·N; n++) {
+ val = vals[n];
+ for (l = 0; l < L; l++) {
+ val[l] = UINT64_MAX;
+ }
+ }
+
+ kmer = UINT64_MAX;
+ for (l = 0; *seq != '\0'; seq++, l++) {
+ kmer = ((kmer << 2) | nuctab[*seq]) & aln·mask;
+
+ h[0] = minihash(kmer, aln·mask);
+ h[1] = murmurhash(kmer, aln·mask);
+
+ for (n = 0; n < aln·N; n++) {
+ val = vals[n];
+ loc = locs[n];
+
+ h[2] = (h[0] + n * h[1]) & aln·mask;
+ for (i = 0; i < L && h[2] < val[i]; i++) {
+ ;
+ }
+
+ tmp[1] = h[2];
+ tmp[3] = l;
+ for (i -= 1; i >= 0; i--) {
+ tmp[0] = tmp[1];
+ tmp[1] = val[i];
+ val[i] = tmp[0];
+
+ tmp[2] = tmp[3];
+ tmp[3] = loc[i];
+ loc[i] = tmp[2];
+ }
+ }
+ }
+
+ // TODO: Sort vals by their locs
+
+ return 0;
+}
+
+static
+int
+compare(void *a, void *b)
+{
+ int l;
+ uint64 *a64, *b64;
+
+ a64 = (uint64*)a;
+ b64 = (uint64*)b;
+ for (l = 0; l < aln·L; l++) {
+ if (a64 < b64) return -1;
+ if (a64 > b64) return +1;
+ }
+
+ return 0;
+}
+
+// TODO: replace with inlined version
+error
+aln·sort(int len, int l, uint64 *vals)
+{
+ qsort(vals, len, l*sizeof(uint64), &compare);
+ return 0;
+}