aboutsummaryrefslogtreecommitdiff
path: root/sys/libmath/blas.c
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-05-07 15:35:06 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-05-07 15:35:06 -0700
commit36117f59ec77784c9ef77801d7c1cbf03a4c4a8b (patch)
tree18a7c3f373c5d9ac11cce320045713550ce674fa /sys/libmath/blas.c
parent46da413cf7c9c995842a83cd54b02a17cd9c7289 (diff)
wrap: elementary math functions for libmath
Diffstat (limited to 'sys/libmath/blas.c')
-rw-r--r--sys/libmath/blas.c237
1 files changed, 233 insertions, 4 deletions
diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c
index bbbe1c8..8343a42 100644
--- a/sys/libmath/blas.c
+++ b/sys/libmath/blas.c
@@ -1,5 +1,6 @@
#include <u.h>
#include <libn.h>
+#include <libmath.h>
#include <vendor/blas/cblas.h>
#include <x86intrin.h>
@@ -9,6 +10,176 @@
// level one
/*
+ * Rotate vector
+ * x = cos*x + sin*y
+ * y = cos*x - sin*y
+ */
+
+static
+void
+rot_kernel8_avx2(int n, double *x, double *y, double cos, double sin)
+{
+ register int i;
+ __m256d x256, y256;
+ __m128d cos128, sin128;
+ __m256d cos256, sin256;
+
+ cos128 = _mm_load_sd(&cos);
+ cos256 = _mm256_broadcastsd_pd(cos128);
+
+ sin128 = _mm_load_sd(&sin);
+ sin256 = _mm256_broadcastsd_pd(sin128);
+
+ for (i = 0; i < n; i+=8) {
+ x256 = _mm256_loadu_pd(x+i+0);
+ y256 = _mm256_loadu_pd(y+i+0);
+ _mm256_storeu_pd(x+i+0, cos256 * x256 + sin256 * y256);
+ _mm256_storeu_pd(y+i+0, cos256 * y256 - sin256 * x256);
+
+ x256 = _mm256_loadu_pd(x+i+4);
+ y256 = _mm256_loadu_pd(y+i+4);
+ _mm256_storeu_pd(x+i+4, cos256 * x256 + sin256 * y256);
+ _mm256_storeu_pd(y+i+4, cos256 * y256 - sin256 * x256);
+ }
+}
+
+static
+void
+rot_kernel8(int n, double *x, double *y, double cos, double sin)
+{
+ register int i;
+ register double tmp;
+
+ for (i = 0; i < n; i+=8) {
+ tmp = x[i+0], x[i+0] = cos*x[i+0] + sin*y[i+0], y[i+0] = cos*y[i+0] - sin*tmp;
+ tmp = x[i+1], x[i+1] = cos*x[i+1] + sin*y[i+1], y[i+1] = cos*y[i+1] - sin*tmp;
+ tmp = x[i+2], x[i+2] = cos*x[i+2] + sin*y[i+2], y[i+2] = cos*y[i+2] - sin*tmp;
+ tmp = x[i+3], x[i+3] = cos*x[i+3] + sin*y[i+3], y[i+3] = cos*y[i+3] - sin*tmp;
+ tmp = x[i+4], x[i+4] = cos*x[i+4] + sin*y[i+4], y[i+4] = cos*y[i+4] - sin*tmp;
+ tmp = x[i+5], x[i+5] = cos*x[i+5] + sin*y[i+5], y[i+5] = cos*y[i+5] - sin*tmp;
+ tmp = x[i+6], x[i+6] = cos*x[i+6] + sin*y[i+6], y[i+6] = cos*y[i+6] - sin*tmp;
+ tmp = x[i+7], x[i+7] = cos*x[i+7] + sin*y[i+7], y[i+7] = cos*y[i+7] - sin*tmp;
+ }
+}
+
+void
+blas·rot(int len, double *x, double *y, double cos, double sin)
+{
+ register int n;
+ register double tmp;
+
+ n = len & ~7;
+ rot_kernel8_avx2(n, x, y, cos, sin);
+
+ for (; n < len; n++) {
+ tmp = x[n], x[n] = cos*x[n] + sin*y[n], y[n] = cos*y[n]- sin*tmp;
+ }
+}
+
+/*
+ * compute givens rotate vector
+ * -- --
+ * |+cos -sin| | a | = | r |
+ * |-sin +cos| | b | = | 0 |
+ * -- --
+ */
+
+void
+blas·rotg(double *a, double *b, double *cos, double *sin)
+{
+ double abs_a, abs_b, r, rho, scale, z;
+
+ abs_a = math·abs(*a);
+ abs_b = math·abs(*b);
+ rho = abs_a > abs_b ? *a : *b;
+ scale = abs_a + abs_b;
+
+ if (scale == 0) {
+ *cos = 1, *sin = 0;
+ r = 0.;
+ z = 0.;
+ } else {
+ r = math·sgn(rho) * scale * math·sqrt(math·pow(abs_a/scale, 2) + math·pow(abs_b/scale, 2));
+ *cos = *a / r;
+ *sin = *b / r;
+ if (abs_a > abs_b)
+ z = *sin;
+ else if (abs_b >= abs_a && *cos != 0)
+ z = 1/(*cos);
+ else
+ z = 1.;
+ }
+ *a = r;
+ *b = z;
+}
+
+/*
+ * modified Givens rotation of points in plane
+ * operates on len points
+ *
+ * params = [flag, h11, h12, h21, h22]
+ * @flag = -1:
+ * H -> [ [h11, h12], [h21, h22] ]
+ * @flag = 0.0:
+ * H -> [ [1, h12], [h21, 1] ]
+ * @flag = +1:
+ * H -> [ [h11, 1], [-1, h22] ]
+ * @flag = -2:
+ * H -> [ [1, 0], [0, 1] ]
+ *
+ * Replaces:
+ * x -> H11 * x + H12 * y
+ * y -> H21 * x + H22 * y
+ */
+
+static
+void
+rotm_kernel8(int n, double *x, double *y, double H[4])
+{
+ register int i;
+ register double tmp;
+
+ for (i = 0; i < n; i+=8) {
+ tmp = x[i+0], x[i+0] = H[0]*x[i+0] + H[1]*y[i+0], y[i+0] = H[2]*y[i+0] + H[3]*tmp;
+ tmp = x[i+1], x[i+1] = H[0]*x[i+1] + H[1]*y[i+1], y[i+1] = H[2]*y[i+1] + H[3]*tmp;
+ tmp = x[i+2], x[i+2] = H[0]*x[i+2] + H[1]*y[i+2], y[i+2] = H[2]*y[i+2] + H[3]*tmp;
+ tmp = x[i+3], x[i+3] = H[0]*x[i+3] + H[1]*y[i+3], y[i+3] = H[2]*y[i+3] + H[3]*tmp;
+ tmp = x[i+4], x[i+4] = H[0]*x[i+4] + H[1]*y[i+4], y[i+4] = H[2]*y[i+4] + H[3]*tmp;
+ tmp = x[i+5], x[i+5] = H[0]*x[i+5] + H[1]*y[i+5], y[i+5] = H[2]*y[i+5] + H[3]*tmp;
+ tmp = x[i+6], x[i+6] = H[0]*x[i+6] + H[1]*y[i+6], y[i+6] = H[2]*y[i+6] + H[3]*tmp;
+ tmp = x[i+7], x[i+7] = H[0]*x[i+7] + H[1]*y[i+7], y[i+7] = H[2]*y[i+7] + H[3]*tmp;
+ }
+}
+
+error
+blas·rotm(int len, double *x, double *y, double p[5])
+{
+ int n, flag;
+ double tmp, H[4];
+
+ flag = math·round(p[0]);
+ switch (flag) {
+ case 0: H[0] = p[1], H[1] = p[2], H[2] = p[3], H[3] = p[4]; break;
+ case -1: H[0] = +1, H[1] = p[2], H[2] = p[3], H[3] = +1; break;
+ case +1: H[0] = p[1], H[1] = +1, H[2] = -1, H[3] = p[4]; break;
+ case -2: H[0] = +1, H[1] = 0, H[2] = 0, H[3] = +1; break;
+ default:
+ errorf("rotm: flag '%d' unrecognized", flag);
+ return 1;
+ }
+
+ n = len & ~7;
+ rotm_kernel8(n, x, y, H);
+
+ for (; n < len; n++) {
+ tmp = x[n], x[n] = H[0]*x[n] + H[1]*y[n], y[n] = H[2]*y[n] + H[3]*tmp;
+ }
+
+ return 0;
+}
+
+
+/*
* Scale vector
* x = ax
*/
@@ -47,7 +218,7 @@ scale_kernel8(int n, double *x, double a)
}
void
-blas·scalevec(int len, double *x, double a)
+blas·scale(int len, double *x, double a)
{
int n;
@@ -403,11 +574,11 @@ blas·gemm(int n1, int n2, int n3, double a, double *m1, double *m2, double b, d
}
#define NITER 5000
-#define NCOL 57
-#define NROW 57
+#define NCOL 10007
+#define NROW 10007
error
-main()
+test·gemm()
{
int i, n;
clock_t t;
@@ -468,3 +639,61 @@ main()
return 0;
}
+
+void
+print·array(int n, double *x)
+{
+ double *end;
+ printf("[");
+ for (end=x+n; x != end; ++x) {
+ printf("%f,", *x);
+ }
+ printf("]\n");
+}
+
+error
+main()
+{
+ int i, n;
+ double *x, *y;
+ double tprof[2];
+ clock_t t;
+
+ x = malloc(sizeof(*x)*NCOL);
+ y = malloc(sizeof(*x)*NCOL);
+
+ for (n = 0; n < NITER; n++) {
+ for (i = 0; i < NCOL; i++) {
+ x[i] = i*i+1;
+ y[i] = i+1;
+ }
+
+ t = clock();
+ blas·rot(NCOL, x, y, .707, .707);
+ t = clock() - t;
+ tprof[0] += 1000.*t/CLOCKS_PER_SEC;
+
+ for (i = 0; i < NCOL; i++) {
+ x[i] = i*i+1;
+ y[i] = i+1;
+ }
+
+ t = clock();
+ cblas_drot(NCOL, x, 1, y, 1, .707, .707);
+ t = clock() - t;
+ tprof[1] += 1000.*t/CLOCKS_PER_SEC;
+ }
+
+ printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER);
+ printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER);
+
+ double a, b, c, s;
+
+ a = 10.234, b = 2.;
+ cblas_drotg(&a, &b, &c, &s);
+ printf("%f, %f, %f, %f\n", a, b, c, s);
+
+ a = 10.234, b = 2.;
+ blas·rotg(&a, &b, &c, &s);
+ printf("%f, %f, %f, %f\n", a, b, c, s);
+}