From ea845a58272658b7716345855b05631bcb296605 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Sun, 26 Apr 2020 18:14:25 -0700 Subject: feat: begun work on edit distance aware local sensitive hashing algorithm --- sys/libbio/align.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 sys/libbio/align.c diff --git a/sys/libbio/align.c b/sys/libbio/align.c new file mode 100644 index 0000000..bcb6fd3 --- /dev/null +++ b/sys/libbio/align.c @@ -0,0 +1,144 @@ +#include +#include +#include + +// ----------------------------------------------------------------------- +// Temporary globals + +uint64 aln·shft = (2ULL * (aln·K - 1ULL)); +uint64 aln·mask = (1ULL << (2*aln·K)) - 1ULL; + +// ----------------------------------------------------------------------- +// Static data + +static uint64 nuctab[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +// ----------------------------------------------------------------------- +// Hash functions + +enum +{ + MURM641 = 0xff51afd7ed558ccd, + MURM642 = 0xc4ceb9fe1a85ec53 +}; + +static +uint64 +minihash(uint64 x, uint64 mask) +{ + x = (~x + (x << 21)) & mask; + x = x ^ (x >> 24); + x = (x + (x << 3) + (x << 8)) & mask; + x = x ^ (x >> 14); + x = (x + (x << 2) + (x << 4)) & mask; + x = x ^ (x >> 28); + x = (x + (x << 31)); + + return x; +} + +static +uint64 +murmurhash(uint64 x, uint64 mask) +{ + x = x ^ (x >> 33); + x = (x * MURM641); + x = x ^ (x >> 33); + x = (x * MURM642); + x = x ^ (x >> 33); + + return x; +} + +// ----------------------------------------------------------------------- +// Locality sensitive hashing (with spatial extent) + +error +aln·sketch(byte *seq, int L, uint64 *vals[aln·N], int *locs[aln·N]) +{ + int i, n, l, *loc; + uint64 kmer, tmp[4], h[3], *val; + + for (n = 0; n < aln·N; n++) { + val = vals[n]; + for (l = 0; l < L; l++) { + val[l] = UINT64_MAX; + } + } + + kmer = UINT64_MAX; + for (l = 0; *seq != '\0'; seq++, l++) { + kmer = ((kmer << 2) | nuctab[*seq]) & aln·mask; + + h[0] = minihash(kmer, aln·mask); + h[1] = murmurhash(kmer, aln·mask); + + for (n = 0; n < aln·N; n++) { + val = vals[n]; + loc = locs[n]; + + h[2] = (h[0] + n * h[1]) & aln·mask; + for (i = 0; i < L && h[2] < val[i]; i++) { + ; + } + + tmp[1] = h[2]; + tmp[3] = l; + for (i -= 1; i >= 0; i--) { + tmp[0] = tmp[1]; + tmp[1] = val[i]; + val[i] = tmp[0]; + + tmp[2] = tmp[3]; + tmp[3] = loc[i]; + loc[i] = tmp[2]; + } + } + } + + // TODO: Sort vals by their locs + + return 0; +} + +static +int +compare(void *a, void *b) +{ + int l; + uint64 *a64, *b64; + + a64 = (uint64*)a; + b64 = (uint64*)b; + for (l = 0; l < aln·L; l++) { + if (a64 < b64) return -1; + if (a64 > b64) return +1; + } + + return 0; +} + +// TODO: replace with inlined version +error +aln·sort(int len, int l, uint64 *vals) +{ + qsort(vals, len, l*sizeof(uint64), &compare); + return 0; +} -- cgit v1.2.1