From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- src/libbio/align.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 src/libbio/align.c (limited to 'src/libbio/align.c') diff --git a/src/libbio/align.c b/src/libbio/align.c new file mode 100644 index 0000000..20a57df --- /dev/null +++ b/src/libbio/align.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include + +// ----------------------------------------------------------------------- +// globals + +uint64 aln·shft = (2ULL * (aln·K - 1ULL)); +uint64 aln·mask = (1ULL << (2*aln·K)) - 1ULL; + +// ----------------------------------------------------------------------- +// static data + +static uint64 nuctab[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + +// ----------------------------------------------------------------------- +// hash functions + +enum +{ + MURM641 = 0xff51afd7ed558ccd, + MURM642 = 0xc4ceb9fe1a85ec53 +}; + +static +uint64 +minihash(uint64 x, uint64 mask) +{ + x = (~x + (x << 21)) & mask; + x = x ^ (x >> 24); + x = (x + (x << 3) + (x << 8)) & mask; + x = x ^ (x >> 14); + x = (x + (x << 2) + (x << 4)) & mask; + x = x ^ (x >> 28); + x = (x + (x << 31)); + + return x; +} + +static +uint64 +murmurhash(uint64 x, uint64 mask) +{ + x = x ^ (x >> 33); + x = (x * MURM641); + x = x ^ (x >> 33); + x = (x * MURM642); + x = x ^ (x >> 33); + + return x; +} + +// ----------------------------------------------------------------------- +// locality sensitive hashing (with spatial extent) + +static +void +sortpos(uintptr len, uint64 vals[], int locs[]) +{ + int tmpi; + uint64 tmpu64; + +#define LESS(i, j) (locs[i] < locs[j]) +#define SWAP(i, j) (tmpu64 = vals[i], tmpi = locs[i], \ + vals[i] = vals[j], locs[i] = locs[j], \ + vals[j] = tmpu64 , locs[j] = tmpi) + QSORT(len, LESS, SWAP); +#undef LESS +#undef SWAP +} + +/* + * sketch + * @param seq: '0' terminated string + * @param len: number of sequential sketches to keep + * @param vals: buffer to store hashes of sketch. + * @param locs: buffer to store location of sketch hashes + */ +error +aln·sketch(byte *seq, int len, uint64 *vals[aln·N], int *locs[aln·N]) +{ + int i, n, l, *loc; + uint64 kmer, h[3], *val; + int tmpi[2]; + uint64 tmpu[2]; + + for(n = 0; n < aln·N; n++) { + for(l = 0; l < len; l++) { + vals[n][l] = UINT64_MAX; + } + } + + kmer = UINT64_MAX; + for(l = 0; *seq != '\0'; seq++, l++) { + kmer = ((kmer << 2) | nuctab[*seq]) & aln·mask; + + h[0] = minihash(kmer, aln·mask); + h[1] = murmurhash(kmer, aln·mask); + + for(n = 0; n < aln·N; n++) { + val = vals[n]; + loc = locs[n]; + + h[2] = (h[0] + n * h[1]) & aln·mask; + for (i = 0; i < len && h[2] < val[i]; i++) { + ; + } + + tmpu[1] = h[2]; + tmpi[1] = l; + for(i -= 1; i >= 0; i--) { + tmpu[0] = tmpu[1], tmpu[1] = val[i], val[i] = tmpu[0]; + tmpi[0] = tmpi[1], tmpi[1] = loc[i], loc[i] = tmpi[0]; + } + } + } + + for(n = 0; n < aln·N; n++) { + sortpos(len, vals[n], locs[n]); + } + + return 0; +} + +static +int +lessarrs(int len, uint64 a[], uint64 b[]) +{ + int l; + + for(l = 0; l < len; l++) { + if (a[l] < b[l]) return 1; + if (a[l] > b[l]) return 0; + } + + return 0; +} + +static +void +swaparrs(int len, uint64 a[], uint64 b[]) +{ + int l; + uint64 tmp; + + for(l = 0; l < len; l++) { + tmp = a[l], a[l] = b[l], b[l] = tmp; + } +} + +error +aln·sort(uintptr n, int len, uint64 *vals) +{ +#define LESS(i, j) (lessarrs(len, vals+((i)*len), vals+((j)*len))) +#define SWAP(i, j) (swaparrs(len, vals+((i)*len), vals+((j)*len))) + QSORT(n, LESS, SWAP); +#undef LESS +#undef SWAP + return 0; +} -- cgit v1.2.1