From 36117f59ec77784c9ef77801d7c1cbf03a4c4a8b Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 7 May 2020 15:35:06 -0700 Subject: wrap: elementary math functions for libmath --- sys/libmath/blas.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 233 insertions(+), 4 deletions(-) (limited to 'sys/libmath/blas.c') diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c index bbbe1c8..8343a42 100644 --- a/sys/libmath/blas.c +++ b/sys/libmath/blas.c @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -8,6 +9,176 @@ // ----------------------------------------------------------------------- // level one +/* + * Rotate vector + * x = cos*x + sin*y + * y = cos*x - sin*y + */ + +static +void +rot_kernel8_avx2(int n, double *x, double *y, double cos, double sin) +{ + register int i; + __m256d x256, y256; + __m128d cos128, sin128; + __m256d cos256, sin256; + + cos128 = _mm_load_sd(&cos); + cos256 = _mm256_broadcastsd_pd(cos128); + + sin128 = _mm_load_sd(&sin); + sin256 = _mm256_broadcastsd_pd(sin128); + + for (i = 0; i < n; i+=8) { + x256 = _mm256_loadu_pd(x+i+0); + y256 = _mm256_loadu_pd(y+i+0); + _mm256_storeu_pd(x+i+0, cos256 * x256 + sin256 * y256); + _mm256_storeu_pd(y+i+0, cos256 * y256 - sin256 * x256); + + x256 = _mm256_loadu_pd(x+i+4); + y256 = _mm256_loadu_pd(y+i+4); + _mm256_storeu_pd(x+i+4, cos256 * x256 + sin256 * y256); + _mm256_storeu_pd(y+i+4, cos256 * y256 - sin256 * x256); + } +} + +static +void +rot_kernel8(int n, double *x, double *y, double cos, double sin) +{ + register int i; + register double tmp; + + for (i = 0; i < n; i+=8) { + tmp = x[i+0], x[i+0] = cos*x[i+0] + sin*y[i+0], y[i+0] = cos*y[i+0] - sin*tmp; + tmp = x[i+1], x[i+1] = cos*x[i+1] + sin*y[i+1], y[i+1] = cos*y[i+1] - sin*tmp; + tmp = x[i+2], x[i+2] = cos*x[i+2] + sin*y[i+2], y[i+2] = cos*y[i+2] - sin*tmp; + tmp = x[i+3], x[i+3] = cos*x[i+3] + sin*y[i+3], y[i+3] = cos*y[i+3] - sin*tmp; + tmp = x[i+4], x[i+4] = cos*x[i+4] + sin*y[i+4], y[i+4] = cos*y[i+4] - sin*tmp; + tmp = x[i+5], x[i+5] = cos*x[i+5] + sin*y[i+5], y[i+5] = cos*y[i+5] - sin*tmp; + tmp = x[i+6], x[i+6] = cos*x[i+6] + sin*y[i+6], y[i+6] = cos*y[i+6] - sin*tmp; + tmp = x[i+7], x[i+7] = cos*x[i+7] + sin*y[i+7], y[i+7] = cos*y[i+7] - sin*tmp; + } +} + +void +blas·rot(int len, double *x, double *y, double cos, double sin) +{ + register int n; + register double tmp; + + n = len & ~7; + rot_kernel8_avx2(n, x, y, cos, sin); + + for (; n < len; n++) { + tmp = x[n], x[n] = cos*x[n] + sin*y[n], y[n] = cos*y[n]- sin*tmp; + } +} + +/* + * compute givens rotate vector + * -- -- + * |+cos -sin| | a | = | r | + * |-sin +cos| | b | = | 0 | + * -- -- + */ + +void +blas·rotg(double *a, double *b, double *cos, double *sin) +{ + double abs_a, abs_b, r, rho, scale, z; + + abs_a = math·abs(*a); + abs_b = math·abs(*b); + rho = abs_a > abs_b ? *a : *b; + scale = abs_a + abs_b; + + if (scale == 0) { + *cos = 1, *sin = 0; + r = 0.; + z = 0.; + } else { + r = math·sgn(rho) * scale * math·sqrt(math·pow(abs_a/scale, 2) + math·pow(abs_b/scale, 2)); + *cos = *a / r; + *sin = *b / r; + if (abs_a > abs_b) + z = *sin; + else if (abs_b >= abs_a && *cos != 0) + z = 1/(*cos); + else + z = 1.; + } + *a = r; + *b = z; +} + +/* + * modified Givens rotation of points in plane + * operates on len points + * + * params = [flag, h11, h12, h21, h22] + * @flag = -1: + * H -> [ [h11, h12], [h21, h22] ] + * @flag = 0.0: + * H -> [ [1, h12], [h21, 1] ] + * @flag = +1: + * H -> [ [h11, 1], [-1, h22] ] + * @flag = -2: + * H -> [ [1, 0], [0, 1] ] + * + * Replaces: + * x -> H11 * x + H12 * y + * y -> H21 * x + H22 * y + */ + +static +void +rotm_kernel8(int n, double *x, double *y, double H[4]) +{ + register int i; + register double tmp; + + for (i = 0; i < n; i+=8) { + tmp = x[i+0], x[i+0] = H[0]*x[i+0] + H[1]*y[i+0], y[i+0] = H[2]*y[i+0] + H[3]*tmp; + tmp = x[i+1], x[i+1] = H[0]*x[i+1] + H[1]*y[i+1], y[i+1] = H[2]*y[i+1] + H[3]*tmp; + tmp = x[i+2], x[i+2] = H[0]*x[i+2] + H[1]*y[i+2], y[i+2] = H[2]*y[i+2] + H[3]*tmp; + tmp = x[i+3], x[i+3] = H[0]*x[i+3] + H[1]*y[i+3], y[i+3] = H[2]*y[i+3] + H[3]*tmp; + tmp = x[i+4], x[i+4] = H[0]*x[i+4] + H[1]*y[i+4], y[i+4] = H[2]*y[i+4] + H[3]*tmp; + tmp = x[i+5], x[i+5] = H[0]*x[i+5] + H[1]*y[i+5], y[i+5] = H[2]*y[i+5] + H[3]*tmp; + tmp = x[i+6], x[i+6] = H[0]*x[i+6] + H[1]*y[i+6], y[i+6] = H[2]*y[i+6] + H[3]*tmp; + tmp = x[i+7], x[i+7] = H[0]*x[i+7] + H[1]*y[i+7], y[i+7] = H[2]*y[i+7] + H[3]*tmp; + } +} + +error +blas·rotm(int len, double *x, double *y, double p[5]) +{ + int n, flag; + double tmp, H[4]; + + flag = math·round(p[0]); + switch (flag) { + case 0: H[0] = p[1], H[1] = p[2], H[2] = p[3], H[3] = p[4]; break; + case -1: H[0] = +1, H[1] = p[2], H[2] = p[3], H[3] = +1; break; + case +1: H[0] = p[1], H[1] = +1, H[2] = -1, H[3] = p[4]; break; + case -2: H[0] = +1, H[1] = 0, H[2] = 0, H[3] = +1; break; + default: + errorf("rotm: flag '%d' unrecognized", flag); + return 1; + } + + n = len & ~7; + rotm_kernel8(n, x, y, H); + + for (; n < len; n++) { + tmp = x[n], x[n] = H[0]*x[n] + H[1]*y[n], y[n] = H[2]*y[n] + H[3]*tmp; + } + + return 0; +} + + /* * Scale vector * x = ax @@ -47,7 +218,7 @@ scale_kernel8(int n, double *x, double a) } void -blas·scalevec(int len, double *x, double a) +blas·scale(int len, double *x, double a) { int n; @@ -403,11 +574,11 @@ blas·gemm(int n1, int n2, int n3, double a, double *m1, double *m2, double b, d } #define NITER 5000 -#define NCOL 57 -#define NROW 57 +#define NCOL 10007 +#define NROW 10007 error -main() +test·gemm() { int i, n; clock_t t; @@ -468,3 +639,61 @@ main() return 0; } + +void +print·array(int n, double *x) +{ + double *end; + printf("["); + for (end=x+n; x != end; ++x) { + printf("%f,", *x); + } + printf("]\n"); +} + +error +main() +{ + int i, n; + double *x, *y; + double tprof[2]; + clock_t t; + + x = malloc(sizeof(*x)*NCOL); + y = malloc(sizeof(*x)*NCOL); + + for (n = 0; n < NITER; n++) { + for (i = 0; i < NCOL; i++) { + x[i] = i*i+1; + y[i] = i+1; + } + + t = clock(); + blas·rot(NCOL, x, y, .707, .707); + t = clock() - t; + tprof[0] += 1000.*t/CLOCKS_PER_SEC; + + for (i = 0; i < NCOL; i++) { + x[i] = i*i+1; + y[i] = i+1; + } + + t = clock(); + cblas_drot(NCOL, x, 1, y, 1, .707, .707); + t = clock() - t; + tprof[1] += 1000.*t/CLOCKS_PER_SEC; + } + + printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); + printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); + + double a, b, c, s; + + a = 10.234, b = 2.; + cblas_drotg(&a, &b, &c, &s); + printf("%f, %f, %f, %f\n", a, b, c, s); + + a = 10.234, b = 2.; + blas·rotg(&a, &b, &c, &s); + printf("%f, %f, %f, %f\n", a, b, c, s); +} -- cgit v1.2.1