From 5355432d71cb1e3347b73536ce5be4af1aefcadc Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Fri, 1 May 2020 09:44:21 -0700 Subject: feat: level 3 funcs --- sys/libmath/blas.c | 89 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 19 deletions(-) (limited to 'sys/libmath/blas.c') diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c index b42a3b8..a672101 100644 --- a/sys/libmath/blas.c +++ b/sys/libmath/blas.c @@ -7,7 +7,7 @@ #include // ----------------------------------------------------------------------- -// Level One +// level one /* * Scale vector @@ -60,10 +60,10 @@ blas·scalevec(int len, double *x, double a) } } -/************************************************ +/* * Daxpy * y = ax + y - ***********************************************/ + */ static void @@ -208,7 +208,7 @@ blas·dot(int len, double *x, double *y) } // ----------------------------------------------------------------------- -// Level Two +// level two /* * Notation: (number of rows) x (number of columns) _ unroll factor @@ -327,6 +327,7 @@ blas·gemv(int nrow, int ncol, double a, double *m, double *x, double b, double /* * rank one addition * M = ax(y^T) + M + * TODO: vectorize kernel */ void @@ -344,6 +345,7 @@ blas·ger(int nrow, int ncol, double a, double *x, double *y, double *m) /* * symmetric rank one addition * M = ax(x^T) + M + * TODO: vectorize kernel */ void @@ -358,9 +360,52 @@ blas·syr(int nrow, int ncol, double a, double *x, double *m) } } -#define NITER 500 -#define NCOL 1007 -#define NROW 1007 +// ----------------------------------------------------------------------- +// level three + +/* + * matrix multiplication + * m3 = a(m1 * m2) + b(m3) + * einstein notation: + * m3_{ij} = a m1_{ik} m2_{kj} + b m3_{ij} + * + * n1 = # rows of m1 = # rows of m3 + * n2 = # cols of m2 = # cols of m3 + * n3 = # cols of m1 = # rows of m2 + * + * TODO: Right now we are 2x slower than OpenBLAS. + * This is because we use a very simple algorithm. + */ + +void +blas·gemm(int n1, int n2, int n3, double a, double *m1, double *m2, double b, double *m3) +{ + int i, j, k, len; + double prod; + + // TODO: Is there anyway this computation can be integrated into the one below? + for (i = 0; i < n1; i++) { + for (j = 0; j < n2; j++) { + m3[i + n2*j] *= b; + } + } + + len = n1 & ~7; + for (j = 0; j < n2; j++) { + for (k = 0; k < n3; k++) { + daxpy_kernel8_avx2(len, a * m2[k + n2*j], m1 + n3*k, m3 + n2*j); + + // remainder + for (i = len; i < n1; i++) { + m3[i + n2*j] += a * m1[i + n3*k] * m2[k + n2*j]; + } + } + } +} + +#define NITER 5000 +#define NCOL 57 +#define NROW 57 error main() @@ -368,14 +413,16 @@ main() int i, n; clock_t t; - double *x, *y, *m; + double *x, *y, *m[3]; double res[2], tprof[2]; openblas_set_num_threads(1); - x = malloc(sizeof(*x)*NCOL); - y = malloc(sizeof(*x)*NCOL); - m = malloc(sizeof(*x)*NCOL*NROW); + x = malloc(sizeof(*x)*NCOL); + y = malloc(sizeof(*x)*NCOL); + m[0] = malloc(sizeof(*x)*NCOL*NROW); + m[1] = malloc(sizeof(*x)*NCOL*NROW); + m[2] = malloc(sizeof(*x)*NCOL*NROW); tprof[0] = 0; tprof[1] = 0; @@ -387,14 +434,16 @@ main() } for (i = 0; i < NCOL*NROW; i++) { - m[i] = .1; + m[0][i] = i/(NCOL*NROW); + m[1][i] = i*i/(NCOL*NROW*NCOL*NROW); + m[2][i] = 1; } t = clock(); - cblas_dger(CblasRowMajor, NROW, NCOL, 1.2, x, 1, y, 1, m, NCOL); + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, NCOL, NROW, NROW, 1.2, m[0], NROW, m[1], NROW, 2.8, m[2], NROW); t = clock() - t; tprof[1] += 1000.*t/CLOCKS_PER_SEC; - res[1] = cblas_ddot(NROW*NCOL, m, 1, m, 1); + res[1] = cblas_ddot(NROW*NCOL, m[2], 1, m[2], 1); for (i = 0; i < NCOL; i++) { x[i] = i*i+1; @@ -402,18 +451,20 @@ main() } for (i = 0; i < NCOL*NROW; i++) { - m[i] = .1; + m[0][i] = i/(NCOL*NROW); + m[1][i] = i*i/(NCOL*NROW*NCOL*NROW); + m[2][i] = 1; } t = clock(); - blas·ger(NROW, NCOL, 1.2, x, y, m); + blas·gemm(NROW, NROW, NROW, 1.2, m[0], m[1], 2.8, m[2]); t = clock() - t; tprof[0] += 1000.*t/CLOCKS_PER_SEC; - res[0] = blas·dot(NROW*NCOL, m, m); + res[0] = blas·dot(NROW*NCOL, m[2], m[2]); } - printf("avg time elapsed/iteration (naive): %fms\n", tprof[0]/NITER); + printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); printf("--> result (naive): %f\n", res[0]); - printf("avg time elapsed/iteration (oblas): %fms\n", tprof[1]/NITER); + printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); printf("--> result (oblas): %f\n", res[1]); return 0; -- cgit v1.2.1