aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-05-01 09:44:21 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-05-01 09:44:21 -0700
commit5355432d71cb1e3347b73536ce5be4af1aefcadc (patch)
tree88317fbd6bf6845fb0e2b8a2bfae2d5699ddfe45 /sys
parent19e04dbf6d39f059f517973bd3c6e2bde448cd93 (diff)
feat: level 3 funcs
Diffstat (limited to 'sys')
-rw-r--r--sys/libmath/blas.c89
1 files changed, 70 insertions, 19 deletions
diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c
index b42a3b8..a672101 100644
--- a/sys/libmath/blas.c
+++ b/sys/libmath/blas.c
@@ -7,7 +7,7 @@
#include <time.h>
// -----------------------------------------------------------------------
-// Level One
+// level one
/*
* Scale vector
@@ -60,10 +60,10 @@ blas·scalevec(int len, double *x, double a)
}
}
-/************************************************
+/*
* Daxpy
* y = ax + y
- ***********************************************/
+ */
static
void
@@ -208,7 +208,7 @@ blas·dot(int len, double *x, double *y)
}
// -----------------------------------------------------------------------
-// Level Two
+// level two
/*
* Notation: (number of rows) x (number of columns) _ unroll factor
@@ -327,6 +327,7 @@ blas·gemv(int nrow, int ncol, double a, double *m, double *x, double b, double
/*
* rank one addition
* M = ax(y^T) + M
+ * TODO: vectorize kernel
*/
void
@@ -344,6 +345,7 @@ blas·ger(int nrow, int ncol, double a, double *x, double *y, double *m)
/*
* symmetric rank one addition
* M = ax(x^T) + M
+ * TODO: vectorize kernel
*/
void
@@ -358,9 +360,52 @@ blas·syr(int nrow, int ncol, double a, double *x, double *m)
}
}
-#define NITER 500
-#define NCOL 1007
-#define NROW 1007
+// -----------------------------------------------------------------------
+// level three
+
+/*
+ * matrix multiplication
+ * m3 = a(m1 * m2) + b(m3)
+ * einstein notation:
+ * m3_{ij} = a m1_{ik} m2_{kj} + b m3_{ij}
+ *
+ * n1 = # rows of m1 = # rows of m3
+ * n2 = # cols of m2 = # cols of m3
+ * n3 = # cols of m1 = # rows of m2
+ *
+ * TODO: Right now we are 2x slower than OpenBLAS.
+ * This is because we use a very simple algorithm.
+ */
+
+void
+blas·gemm(int n1, int n2, int n3, double a, double *m1, double *m2, double b, double *m3)
+{
+ int i, j, k, len;
+ double prod;
+
+ // TODO: Is there anyway this computation can be integrated into the one below?
+ for (i = 0; i < n1; i++) {
+ for (j = 0; j < n2; j++) {
+ m3[i + n2*j] *= b;
+ }
+ }
+
+ len = n1 & ~7;
+ for (j = 0; j < n2; j++) {
+ for (k = 0; k < n3; k++) {
+ daxpy_kernel8_avx2(len, a * m2[k + n2*j], m1 + n3*k, m3 + n2*j);
+
+ // remainder
+ for (i = len; i < n1; i++) {
+ m3[i + n2*j] += a * m1[i + n3*k] * m2[k + n2*j];
+ }
+ }
+ }
+}
+
+#define NITER 5000
+#define NCOL 57
+#define NROW 57
error
main()
@@ -368,14 +413,16 @@ main()
int i, n;
clock_t t;
- double *x, *y, *m;
+ double *x, *y, *m[3];
double res[2], tprof[2];
openblas_set_num_threads(1);
- x = malloc(sizeof(*x)*NCOL);
- y = malloc(sizeof(*x)*NCOL);
- m = malloc(sizeof(*x)*NCOL*NROW);
+ x = malloc(sizeof(*x)*NCOL);
+ y = malloc(sizeof(*x)*NCOL);
+ m[0] = malloc(sizeof(*x)*NCOL*NROW);
+ m[1] = malloc(sizeof(*x)*NCOL*NROW);
+ m[2] = malloc(sizeof(*x)*NCOL*NROW);
tprof[0] = 0;
tprof[1] = 0;
@@ -387,14 +434,16 @@ main()
}
for (i = 0; i < NCOL*NROW; i++) {
- m[i] = .1;
+ m[0][i] = i/(NCOL*NROW);
+ m[1][i] = i*i/(NCOL*NROW*NCOL*NROW);
+ m[2][i] = 1;
}
t = clock();
- cblas_dger(CblasRowMajor, NROW, NCOL, 1.2, x, 1, y, 1, m, NCOL);
+ cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, NCOL, NROW, NROW, 1.2, m[0], NROW, m[1], NROW, 2.8, m[2], NROW);
t = clock() - t;
tprof[1] += 1000.*t/CLOCKS_PER_SEC;
- res[1] = cblas_ddot(NROW*NCOL, m, 1, m, 1);
+ res[1] = cblas_ddot(NROW*NCOL, m[2], 1, m[2], 1);
for (i = 0; i < NCOL; i++) {
x[i] = i*i+1;
@@ -402,18 +451,20 @@ main()
}
for (i = 0; i < NCOL*NROW; i++) {
- m[i] = .1;
+ m[0][i] = i/(NCOL*NROW);
+ m[1][i] = i*i/(NCOL*NROW*NCOL*NROW);
+ m[2][i] = 1;
}
t = clock();
- blas·ger(NROW, NCOL, 1.2, x, y, m);
+ blas·gemm(NROW, NROW, NROW, 1.2, m[0], m[1], 2.8, m[2]);
t = clock() - t;
tprof[0] += 1000.*t/CLOCKS_PER_SEC;
- res[0] = blas·dot(NROW*NCOL, m, m);
+ res[0] = blas·dot(NROW*NCOL, m[2], m[2]);
}
- printf("avg time elapsed/iteration (naive): %fms\n", tprof[0]/NITER);
+ printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER);
printf("--> result (naive): %f\n", res[0]);
- printf("avg time elapsed/iteration (oblas): %fms\n", tprof[1]/NITER);
+ printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER);
printf("--> result (oblas): %f\n", res[1]);
return 0;