From 04c688f125069b65517b00660c31c81e210ddf3a Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Fri, 8 May 2020 21:33:24 -0700 Subject: Adding strided computation to blas kernels. I started implementing LQ factorization and immediately realized I needed strided views. For simplicity, I will just implement them in the most portable, C native way (no vectorization). Speed can come later. --- sys/libmath/blas.c | 228 ++++++++++++++++++++++++++++++++++++++++++--------- sys/libmath/linalg.c | 57 +++++++++++++ 2 files changed, 246 insertions(+), 39 deletions(-) (limited to 'sys') diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c index 227715c..f1ae09d 100644 --- a/sys/libmath/blas.c +++ b/sys/libmath/blas.c @@ -248,7 +248,7 @@ scale_kernel8(int n, double *x, double a) } void -blas·scale(int len, double *x, double a) +blas·scale(int len, double a, double *x) { int n; @@ -459,18 +459,121 @@ dot_kernel8(int len, double *x, double *y) } double -blas·dot(int len, double *x, double *y) +blas·dot(int len, double *x, int incx, double *y, int incy) +{ + int i, n, ix, iy; + double res, mul[4], sum[2]; + + if (len == 0) return 0; + + if (incx == 1 && incy == 1) { + n = len & ~15; // neat trick + res = dot_kernel8_fma3(n, x, y); + + for (i = n; i < len; i++) { + res += x[i] * y[i]; + } + return res; + } + + n = len & ~3; + for (i = 0, ix = 0, iy = 0; i < n; i += 4, ix += 4*incx, iy += 4*incy) { + mul[0] = x[ix+0*incx] * y[iy+0*incy]; + mul[1] = x[ix+1*incx] * y[iy+1*incy]; + mul[2] = x[ix+2*incx] * y[iy+2*incy]; + mul[3] = x[ix+3*incx] * y[iy+3*incy]; + + sum[0] += mul[0] + mul[2]; + sum[1] += mul[1] + mul[3]; + } + + for (; i < len; i++, ix += incx, iy += incy) { + sum[0] += x[ix] * y[iy]; + } + + res = sum[0] + sum[1]; + return res; +} + +/* + * euclidean norm + * ||x|| + */ +double +blas·norm(int len, double *x) +{ + double res; + + res = blas·dot(len, x, 1, x, 1); + res = math·sqrt(res); + + return res; +} + +static +double +sum_kernel8_avx2(int len, double *x) +{ + register int i; + __m256d sum[2]; + __m128d res; + + for (i = 0; i < arrlen(sum); i++) { + sum[i] = _mm256_setzero_pd(); + } + + for (i = 0; i < len; i += 8) { + sum[0] += _mm256_loadu_pd(x+i+0); + sum[1] += _mm256_loadu_pd(x+i+4); + } + + sum[0] += sum[1]; + + res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); + res = _mm_hadd_pd(res, res); + + return res[0]; +} + +static +double +sum_kernel8(int len, double *x, double *y) +{ + double res; + register int i; + + for (i = 0; i < len; i += 8) { + res += x[i] + + x[i+1] + + x[i+2] + + x[i+3] + + x[i+4] + + x[i+5] + + x[i+6] + + x[i+7]; + } + + return res; +} + + +/* + * L1 norm + * sum(x_i) + */ +double +blas·sum(int len, double *x) { int i, n; double res; if (len == 0) return 0; - n = len & ~15; // neat trick - res = dot_kernel8_fma3(n, x, y); + n = len & ~7; + res = sum_kernel8_avx2(n, x); for (i = n; i < len; i++) { - res += x[i] * y[i]; + res += x[i]; } return res; @@ -833,7 +936,7 @@ blas·tpmv(blas·Flag f, int n, double *m, double *x) { int i; for (i = 0; i < n; m += (n-i), ++x, ++i) { - *x = blas·dot(n-i, m, x); + *x = blas·dot(n-i, m, 1, x, 1); } } @@ -853,7 +956,7 @@ blas·tpsv(blas·Flag f, int n, double *m, double *x) x += (n - 1); m += ((n * (n+1))/2 - 1); for (i = n-1; i >= 0; --i, --x, m -= (n-i)) { - r = blas·dot(n-i-1, m+1, x+1); + r = blas·dot(n-i-1, m+1, 1, x+1, 1); *x = (*x - r) / *m; } } @@ -1098,13 +1201,13 @@ blas·trsm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2) } #define NITER 10000 -#define NCOL 5007 -#define NROW 5007 +#define NCOL 5000007 +#define NROW 57 error test·level3() { - int i, n; + vlong i, n; clock_t t; double *x, *y, *m[3]; @@ -1154,7 +1257,7 @@ test·level3() blas·gemm(NROW, NROW, NROW, 1.2, m[0], m[1], 2.8, m[2]); t = clock() - t; tprof[0] += 1000.*t/CLOCKS_PER_SEC; - res[0] = blas·dot(NROW*NCOL, m[2], m[2]); + res[0] = blas·dot(NROW*NCOL, m[2], 1, m[2], 1); } printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); printf("--> result (naive): %f\n", res[0]); @@ -1164,6 +1267,54 @@ test·level3() return 0; } +void +test·level2() +{ + int i, j, n, it; + clock_t t; + + double *x, *y, *m; + double tprof[2]; + + rng·init(0); + + tprof[0] = 0, tprof[1] = 0; + x = malloc(sizeof(*x)*NCOL); + y = malloc(sizeof(*x)*NCOL); + m = malloc(sizeof(*x)*NCOL*(NCOL+1)/2); + + for (it = 0; it < NITER; it++) { + n = 0; + for (i = 0; i < NCOL; i++) { + y[i] = rng·random(); + for (j = i; j < NCOL; j++) { + m[n++] = rng·random() + .1; // To ensure not singular + } + } + + memcpy(x, y, NCOL * sizeof(*x)); + + t = clock(); + blas·tpsv(0, NCOL, m, x); + t = clock() - t; + tprof[0] += 1000.*t/CLOCKS_PER_SEC; + + t = clock(); + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, NCOL, m, y, 1); + t = clock() - t; + tprof[1] += 1000.*t/CLOCKS_PER_SEC; + + for (i = 0; i < NCOL; i++) { + if (math·abs(x[i] - y[i])/math·abs(x[i]) > 1e-5) { + errorf("failure at index %d: %f != %f", i, x[i], y[i]); + } + } + } + + printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); + printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); +} + void print·array(int n, double *x) { @@ -1236,47 +1387,46 @@ test·level1() error main() { - int i, j, n, it; + int i, n; + double *x, *y; + double res[2], tprof[2]; clock_t t; - double *x, *y, *m; - double tprof[2]; - - rng·init(0); - - tprof[0] = 0, tprof[1] = 0; x = malloc(sizeof(*x)*NCOL); y = malloc(sizeof(*x)*NCOL); - m = malloc(sizeof(*x)*NCOL*(NCOL+1)/2); + rng·init(0); - for (it = 0; it < NITER; it++) { - n = 0; + for (n = 0; n < NITER; n++) { for (i = 0; i < NCOL; i++) { + x[i] = rng·random(); y[i] = rng·random(); - for (j = i; j < NCOL; j++) { - m[n++] = rng·random() + .1; // To ensure not singular - } } - memcpy(x, y, NCOL * sizeof(*x)); - - t = clock(); - blas·tpsv(0, NCOL, m, x); - t = clock() - t; - tprof[0] += 1000.*t/CLOCKS_PER_SEC; - - t = clock(); - cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, NCOL, m, y, 1); - t = clock() - t; + t = clock(); + res[1] += cblas_ddot(NCOL/4, x, 4, y, 4); + t = clock() - t; tprof[1] += 1000.*t/CLOCKS_PER_SEC; - for (i = 0; i < NCOL; i++) { - if (math·abs(x[i] - y[i])/math·abs(x[i]) > 1e-5) { - errorf("failure at index %d: %f != %f", i, x[i], y[i]); - } - } + t = clock(); + res[0] += blas·dot(NCOL/4, x, 4, y, 4); + t = clock() - t; + tprof[0] += 1000.*t/CLOCKS_PER_SEC; } + printf("%f, %f\n", res[0], res[1]); printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); + + double a, b, c, s; + + a = 10.234, b = 2.; + cblas_drotg(&a, &b, &c, &s); + printf("%f, %f, %f, %f\n", a, b, c, s); + + a = 10.234, b = 2.; + blas·rotg(&a, &b, &c, &s); + printf("%f, %f, %f, %f\n", a, b, c, s); + + return 0; + } diff --git a/sys/libmath/linalg.c b/sys/libmath/linalg.c index 57f799b..5a73527 100644 --- a/sys/libmath/linalg.c +++ b/sys/libmath/linalg.c @@ -2,4 +2,61 @@ #include #include +// ----------------------------------------------------------------------- +// Vector +void +linalg·normalize(math·Vector vec) +{ + double norm; + + norm = blas·norm(vec.len, vec.data); + blas·scale(vec.len, 1/norm, vec.data); +} +// TODO: Write blas wrappers that eat vectors for convenience + +// ----------------------------------------------------------------------- +// Matrix +// +// NOTE: all matrices are row major oriented + +/* + * linalg·lq + * computes the LQ decomposition of matrix M: M = LQ + * L is lower triangular + * Q is orthogonal -> transp(Q) * Q = I + * + * m: matrix to factorize. changes in place + * + lower triangle -> L + * + upper triangle -> all reflection vectors stored in rows + * w: working buffer: len = ncols! + */ +error +linalg·lq(math·Matrix m, math·Vector w) +{ + int i, j, len; + double *row, mag; + enum { + err·nil, + err·baddims, + }; + + if (m.dim[0] > m.dim[1]) { + return err·baddims; + } + + for (i = 0; i < m.dim[0]; i++, m.data += m.dim[1]) { + row = m.data + i; + len = m.dim[0] - i; + + // TODO: Don't want to compute norm twice!! + w.data[0] = math·sgn(row[0]) * blas·norm(len, row); + blas·axpy(len, 1.0, row, w.data); + mag = blas·norm(len, w.data); + blas·scale(len, 1/mag, w.data); + + blas·copy(len - m.dim[0], w.data, m.data + i); + } + + return err·nil; +} -- cgit v1.2.1