aboutsummaryrefslogtreecommitdiff
path: root/sys/libmath
diff options
context:
space:
mode:
Diffstat (limited to 'sys/libmath')
-rw-r--r--sys/libmath/basic.c531
-rw-r--r--sys/libmath/blas.c63
-rw-r--r--sys/libmath/blas1.c58
-rw-r--r--sys/libmath/blas1body215
-rw-r--r--sys/libmath/blas2.c222
-rw-r--r--sys/libmath/blas2body256
-rw-r--r--sys/libmath/blas3.c279
-rw-r--r--sys/libmath/lapack.c0
-rw-r--r--sys/libmath/linalg.c63
-rw-r--r--sys/libmath/loop.h114
-rw-r--r--sys/libmath/matrix.c176
-rw-r--r--sys/libmath/rules.mk24
-rw-r--r--sys/libmath/test.c471
13 files changed, 0 insertions, 2472 deletions
diff --git a/sys/libmath/basic.c b/sys/libmath/basic.c
deleted file mode 100644
index 1341f7b..0000000
--- a/sys/libmath/basic.c
+++ /dev/null
@@ -1,531 +0,0 @@
-#include <u.h>
-#include <base.h>
-#include <libmath.h>
-
-#include <math.h>
-
-// TODO(nnoll): Replace implementations with your own.
-
-double
-math·acos(double x)
-{
- return acos(x);
-}
-
-float
-math·acosf(float x)
-{
- return acosf(x);
-}
-
-
-double
-math·acosh(double x)
-{
- return acosh(x);
-}
-
-float
-math·acoshf(float x)
-{
- return acoshf(x);
-}
-
-
-double
-math·asin(double x)
-{
- return asin(x);
-}
-
-float
-math·asinf(float x)
-{
- return asinf(x);
-}
-
-
-double
-math·asinh(double x)
-{
- return asinh(x);
-}
-
-float
-math·asinhf(float x)
-{
- return asinhf(x);
-}
-
-
-double
-math·atan(double x)
-{
- return atan(x);
-}
-
-float
-math·atanf(float x)
-{
- return atanf(x);
-}
-
-
-double
-math·atan2(double x, double y)
-{
- return atan2(x, y);
-}
-
-float
-math·atan2f(float x, float y)
-{
- return atan2f(x, y);
-}
-
-double
-math·atanh(double x)
-{
- return atanh(x);
-}
-
-float
-math·atanhf(float x)
-{
- return atanhf(x);
-}
-
-
-double
-math·cbrt(double x)
-{
- return cbrt(x);
-}
-
-float
-math·cbrtf(float x)
-{
- return cbrtf(x);
-}
-
-
-double
-math·ceil(double x)
-{
- return ceil(x);
-}
-
-float
-math·ceilf(float x)
-{
- return ceilf(x);
-}
-
-double
-math·cos(double x)
-{
- return cos(x);
-}
-
-float
-math·cosf(float x)
-{
- return cosf(x);
-}
-
-
-double
-math·cosh(double x)
-{
- return cosh(x);
-}
-
-float
-math·coshf(float x)
-{
- return coshf(x);
-}
-
-
-double
-math·erf(double x)
-{
- return erf(x);
-}
-
-float
-math·erff(float x)
-{
- return erff(x);
-}
-
-
-double
-math·erfc(double x)
-{
- return erfc(x);
-}
-
-float
-math·erfcf(float x)
-{
- return erfcf(x);
-}
-
-
-double
-math·exp(double x)
-{
- return exp(x);
-}
-
-float
-math·expf(float x)
-{
- return expf(x);
-}
-
-
-double
-math·exp2(double x)
-{
- return exp2(x);
-}
-
-float
-math·exp2f(float x)
-{
- return exp2f(x);
-}
-
-
-double
-math·expm1(double x)
-{
- return expm1(x);
-}
-
-float
-math·expm1f(float x)
-{
- return expm1f(x);
-}
-
-
-double
-math·floor(double x)
-{
- return floor(x);
-}
-
-float
-math·floorf(float x)
-{
- return floorf(x);
-}
-
-
-int
-math·ilogb(double x)
-{
- return ilogb(x);
-}
-
-int
-math·ilogbf(float x)
-{
- return ilogbf(x);
-}
-
-double
-math·lgamma(double x)
-{
- return lgamma(x);
-}
-
-float
-math·lgammaf(float x)
-{
- return lgammaf(x);
-}
-
-
-vlong
-math·llrint(double x)
-{
- return math·llrint(x);
-}
-
-vlong
-math·llrintf(float x)
-{
- return math·llrintf(x);
-}
-
-
-vlong
-math·llround(double x)
-{
- return llround(x);
-}
-
-vlong
-math·llroundf(float x)
-{
- return llroundf(x);
-}
-
-
-double
-math·log(double x)
-{
- return log(x);
-}
-
-float
-math·logf(float x)
-{
- return logf(x);
-}
-
-
-double
-math·log10(double x)
-{
- return log10(x);
-}
-
-float
-math·log10f(float x)
-{
- return log10f(x);
-}
-
-
-double
-math·log1p(double x)
-{
- return log1p(x);
-}
-
-float
-math·log1pf(float x)
-{
- return log1pf(x);
-}
-
-
-double
-math·log2(double x)
-{
- return log2(x);
-}
-
-float
-math·log2f(float x)
-{
- return log2f(x);
-}
-
-
-double
-math·logb(double x)
-{
- return logb(x);
-}
-
-float
-math·logbf(float x)
-{
- return logbf(x);
-}
-
-
-long
-math·lrint(double x)
-{
- return lrint(x);
-}
-
-long
-math·lrintf(float x)
-{
- return lrintf(x);
-}
-
-
-long
-math·lround(double x)
-{
- return lround(x);
-}
-
-long
-math·lroundf(float x)
-{
- return lroundf(x);
-}
-
-
-double math·modf(double, double *);
-float math·modff(float, float *);
-
-double
-math·nan(const char * x)
-{
- return nan(x);
-}
-
-float
-math·nanf(const char * x)
-{
- return nanf(x);
-}
-
-
-double
-math·nearbyint(double x)
-{
- return nearbyint(x);
-}
-
-float
-math·nearbyintf(float x)
-{
- return nearbyintf(x);
-}
-
-
-double
-math·pow(double x, double exp)
-{
- return pow(x, exp);
-}
-
-float
-math·powf(float x, float exp)
-{
- return powf(x, exp);
-}
-
-double
-math·rint(double x)
-{
- return rint(x);
-}
-
-float
-math·rintf(float x)
-{
- return rintf(x);
-}
-
-
-double
-math·round(double x)
-{
- return round(x);
-}
-
-float
-math·roundf(float x)
-{
- return roundf(x);
-}
-
-
-double math·scalbln(double, long);
-float math·scalblnf(float, long);
-
-double math·scalbn(double, int);
-float math·scalbnf(float, int);
-
-double
-math·sin(double x)
-{
- return sin(x);
-}
-
-float
-math·sinf(float x)
-{
- return sinf(x);
-}
-
-
-double
-math·sinh(double x)
-{
- return sinh(x);
-}
-
-float
-math·sinhf(float x)
-{
- return sinhf(x);
-}
-
-
-double
-math·sqrt(double x)
-{
- return sqrt(x);
-}
-
-float
-math·sqrtf(float x)
-{
- return sqrtf(x);
-}
-
-
-double
-math·tan(double x)
-{
- return tan(x);
-}
-
-float
-math·tanf(float x)
-{
- return tanf(x);
-}
-
-
-double
-math·tanh(double x)
-{
- return tanh(x);
-}
-
-float
-math·tanhf(float x)
-{
- return tanhf(x);
-}
-
-
-double
-math·tgamma(double x)
-{
- return tgamma(x);
-}
-
-float
-math·tgammaf(float x)
-{
- return tgammaf(x);
-}
-
-
-double
-math·trunc(double x)
-{
- return trunc(x);
-}
-
-float
-math·truncf(float x)
-{
- return truncf(x);
-}
diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c
deleted file mode 100644
index 18f9760..0000000
--- a/sys/libmath/blas.c
+++ /dev/null
@@ -1,63 +0,0 @@
-#include <u.h>
-#include <base.h>
-#include <libmath.h>
-#include <libmath/blas.h>
-#include <time.h>
-
-/* #include <vendor/blas/cblas.h> */
-
-#define NCOL 2*512
-#define NROW 2*512
-#define NSUM 2*512
-#define NIT 10
-#define INC 1
-error
-main()
-{
- int i, j, nit;
- double *x, *y, *z, *w, res[2];
-
- clock_t t;
- double tprof[2] = { 0 };
-
- rng·init(0);
-
- x = malloc(sizeof(*x)*NROW*NCOL);
- y = malloc(sizeof(*x)*NROW*NCOL);
- z = malloc(sizeof(*x)*NROW*NCOL);
- w = malloc(sizeof(*x)*NROW*NCOL);
-
-#define DO_0 t = clock(); \
- blas·dgemm(0,0,NROW,NCOL,NSUM,10.1,x,NROW,y,NROW,1.2,z,NROW);\
- t = clock() - t; \
- res[0] += blas·dasum(NROW*NCOL,z,INC); \
- tprof[0] += 1000.*t/CLOCKS_PER_SEC; \
-
-#define DO_1 t = clock(); \
- cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,NROW,NCOL,NSUM,10.1,x,NROW,y,NROW,1.2,w,NROW);\
- t = clock() - t; \
- res[1] += cblas_dasum(NROW*NCOL,w,INC); \
- tprof[1] += 1000.*t/CLOCKS_PER_SEC;
-
- for (nit = 0; nit < NIT; nit++) {
- for (i = 0; i < NROW; i++) {
- for (j = 0; j < NCOL; j++) {
- x[j + NROW*i] = rng·random();
- y[j + NROW*i] = rng·random();
- z[j + NROW*i] = rng·random();
- w[j + NROW*i] = z[j + NROW*i];
- }
- }
-
- switch (nit % 2) {
- case 0: DO_0; DO_1; break;
- case 1: DO_1; DO_0; break;
- }
- }
- printf("mean time/iteration (mine): %fms\n", tprof[0]/NIT);
- printf("--> result (mine): %f\n", res[0]);
- printf("mean time/iteration (openblas): %fms\n", tprof[1]/NIT);
- printf("--> result (openblas): %f\n", res[1]);
-
- return 0;
-}
diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c
deleted file mode 100644
index a8ca085..0000000
--- a/sys/libmath/blas1.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <u.h>
-#include <libmath.h>
-
-// -----------------------------------------------------------------------
-// Templates
-
-#include "loop.h"
-#define BODY_XY() \
- LOOP(UNROLL, 0, INIT); \
- n = ROUNDBY(len, UNROLL); \
- if (incx == 1 && incy == 1) { \
- for (i = 0; i < n; i+=UNROLL) { \
- LOOP(UNROLL,0,KERNEL,1,1); \
- } \
- } else { \
- for (i = 0; i < n; i+=UNROLL) { \
- LOOP(UNROLL,0,KERNEL,incx,incy);\
- } \
- } \
- \
- for (; i < len; i++) { \
- LOOP(1,0,KERNEL,incx,incy); \
- }
-
-#define BODY_X() \
- LOOP(UNROLL, 0, INIT); \
- n = ROUNDBY(len, UNROLL); \
- if (incx == 1) { \
- for (i = 0; i < n; i+=UNROLL) { \
- LOOP(UNROLL,0,KERNEL,1); \
- } \
- } else { \
- for (i = 0; i < n; i+=UNROLL) { \
- LOOP(UNROLL,0,KERNEL,incx); \
- } \
- } \
- \
- for (; i < len; i++) { \
- LOOP(1,0,KERNEL,incx); \
- }
-
-// -----------------------------------------------------------------------
-// Implementation
-
-#define UNROLL 8
-#define INT int
-
-#define FLOAT double
-#define func(name) blas·d##name
-#include "blas1body"
-
-#undef FLOAT
-#undef func
-
-#define FLOAT float
-#define func(name) blas·f##name
-#include "blas1body"
-#undef FLOAT
diff --git a/sys/libmath/blas1body b/sys/libmath/blas1body
deleted file mode 100644
index de4b637..0000000
--- a/sys/libmath/blas1body
+++ /dev/null
@@ -1,215 +0,0 @@
-/* vim: set ft=c */
-// -----------------------------------------------------------------------
-// Function implementations
-
-FLOAT
-func(dot)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy)
-{
-#define INIT(I,...) reg[I] = 0;
-#define KERNEL(I, INCX, INCY) reg[I] += x[(INCX)*(i + I)] * y[(INCY)*(i + I)];
- INT i, n;
- FLOAT reg[UNROLL];
-
- BODY_XY()
-
- for (i = 1; i < UNROLL; i++)
- reg[0] += reg[i];
-
- return reg[0];
-#undef INIT
-#undef KERNEL
-}
-
-void
-func(copy)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy)
-{
-#define INIT(I,...)
-#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] = x[(INCX)*(i + I)];
- INT i, n;
-
- BODY_XY();
-
-#undef INIT
-#undef KERNEL
-}
-
-void
-func(swap)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy)
-{
-#define INIT(I,...)
-#define KERNEL(I, INCX, INCY) tmp[I] = x[(INCX)*(i + I)], x[(INCX)*(i + I)] = y[(INCY)*(i + I)], y[(INCY)*(i + I)] = tmp[I];
- INT i, n;
- FLOAT tmp[UNROLL];
-
- BODY_XY();
-
-#undef INIT
-#undef KERNEL
-}
-
-void
-func(axpy)(INT len, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy)
-{
-#define INIT(I,...)
-#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] += a*x[(INCX)*(i + I)];
- INT i, n;
-
- BODY_XY();
-
-#undef INIT
-#undef KERNEL
-}
-
-void
-func(axpby)(INT len, FLOAT a, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy)
-{
-#define INIT(I,...)
-#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] = a*x[(INCX)*(i + I)] + b*y[(INCY)*(i + I)];
- INT i, n;
-
- BODY_XY();
-
-#undef INIT
-#undef KERNEL
-}
-
-INT
-func(argmax)(INT len, FLOAT *x, INT incx)
-{
-#define INIT(I,...) max[I] = x[I], idx[I] = I;
-#define KERNEL(I, INCX) if (x[(INCX)*(i+I)] > max[I]) {max[i] = x[INCX*(i+I)]; idx[I] = i+I;}
- INT i, n;
- FLOAT max[UNROLL];
- INT idx[UNROLL];
-
- BODY_X();
-
- for (i = 1; i < UNROLL; i++)
- if (max[i] > max[0])
- idx[0] = idx[i];
-
- return idx[0];
-#undef INIT
-#undef KERNEL
-}
-
-INT
-func(argmin)(INT len, FLOAT *x, INT incx)
-{
-#define INIT(I,...) min[I] = x[I], idx[I] = I;
-#define KERNEL(I, INCX) if (x[INCX*(i+I)] < min[I]) {min[i] = x[INCX*(i+I)]; idx[I] = i+I;}
- INT i, n;
- FLOAT min[UNROLL];
- INT idx[UNROLL];
-
- BODY_X();
-
- for (i = 1; i < UNROLL; i++)
- if (min[i] < min[0])
- idx[0] = idx[i];
-
- return idx[0];
-#undef INIT
-#undef KERNEL
-}
-
-FLOAT
-func(asum)(INT len, FLOAT *x, INT incx)
-{
-#define INIT(I,...) sum[I] = 0;
-#define KERNEL(I, INCX) sum[I] += x[INCX*(i+I)] > 0 ? x[INCX*(i+I)] : -x[INCX*(i+I)];
- INT i, n;
- FLOAT sum[UNROLL];
-
- BODY_X();
-
- for (i = 1; i < UNROLL; i++)
- sum[0] += sum[i];
-
- return sum[0];
-
-#undef INIT
-#undef KERNEL
-}
-
-void
-func(scale)(INT len, FLOAT a, FLOAT *x, INT incx)
-{
-#define INIT(I, ...)
-#define KERNEL(I, INCX) x[INCX*(i+I)] *= a;
- INT i, n;
-
- BODY_X();
-
-#undef INIT
-#undef KERNEL
-}
-
-FLOAT
-func(norm)(INT len, FLOAT *x, INT incx)
-{
-#define INIT(I, ...)
-#define KERNEL(I, INCX) norm[I] += x[INCX*(i+I)] * x[INCX*(i+I)];
- INT i, n;
- FLOAT norm[UNROLL];
-
- BODY_X();
-
- for (i = 1; i < UNROLL; i++)
- norm[0] += norm[i];
-
- return math·sqrt(norm[0]);
-
-#undef INIT
-#undef KERNEL
-}
-
-void
-func(drot)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT cos, FLOAT sin)
-{
-#define INIT(I, ...)
-#define KERNEL(I, INCX, INCY) tmp[I] = x[INCX*(i+I)], x[INCX*(i+I)] = cos*x[INCX*(i+I)] + sin*y[INCY*(i+I)], y[INCY*(i+I)] = cos*y[INCY*(i+I)] - sin*tmp[I];
- INT i, n;
- FLOAT tmp[UNROLL];
-
- BODY_XY();
-
-#undef INIT
-#undef KERNEL
-}
-
-void
-func(rotm)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT H[5])
-{
-#define INIT(I, ...)
-#define KERNEL(I, INCX, INCY) tmp[I] = x[INCX*(i+I)], x[INCX*(i+I)] = H[1]*x[INCX*(i+I)] + H[2]*y[INCY*(i+I)], y[INCY*(i+I)] = H[3]*tmp[I] + H[4]*y[INCY*(i+I)];
- INT i, n, f;
- FLOAT tmp[UNROLL];
-
- f = (INT)H[0];
- switch (f) {
- case -2:
- H[1] = +1;
- H[2] = +0;
- H[3] = +0;
- H[4] = +1;
- break;
- case -1:
- break;
- case +0:
- H[1] = +1;
- H[4] = +1;
- break;
- case +1:
- H[2] = +1;
- H[3] = -1;
- break;
- default:
- return;
- }
-
- BODY_XY();
-
-#undef INIT
-#undef KERNEL
-}
diff --git a/sys/libmath/blas2.c b/sys/libmath/blas2.c
deleted file mode 100644
index 7e4b08e..0000000
--- a/sys/libmath/blas2.c
+++ /dev/null
@@ -1,222 +0,0 @@
-#include <u.h>
-#include <libmath/blas.h>
-#include "loop.h"
-
-// -----------------------------------------------------------------------
-// Templates
-
-#define BODY_RECT() \
- nr = ROUNDBY(nrow, UNROW); \
- nc = ROUNDBY(ncol, UNCOL); \
- if (incx == 1 && incy == 1) { \
- for (r = 0; r < nr; r += UNROW) { \
- LOOP(UNROW,0,INIT,1,1); \
- for (c = 0; c < nc; c += UNCOL) { \
- LOOP(UNROW,0,KERN,1,1,UNCOL); \
- } \
- for (; c < ncol; c++) { \
- LOOP(UNROW,0,KERN,1,1,1); \
- } \
- LOOP(UNROW,0,FINI,1,1); \
- } \
- } else { \
- for (r = 0; r < nr; r += UNROW) { \
- LOOP(UNROW,0,INIT,incx,incy); \
- for (c = 0; c < nc; c += UNCOL) { \
- LOOP(UNROW,0,KERN,incx,incy,UNCOL); \
- } \
- for (; c < ncol; c++) { \
- LOOP(UNROW,0,KERN,incx,incy,1); \
- } \
- LOOP(UNROW,0,FINI,incx,incy); \
- } \
- } \
- \
- for (; r < nrow; r++) { \
- LOOP(1,0,INIT,incx,incy); \
- for (c = 0; c < nc; c += UNCOL) { \
- LOOP(1,0,KERN,incx,incy,UNCOL); \
- } \
- for (; c < ncol; c++) { \
- LOOP(1,0,KERN,incx,incy,1); \
- } \
- LOOP(1,0,FINI,incx,incy); \
- }
-
-#define BODY_LOTRI() \
- nr = ROUNDBY(n, UNROW); \
- if (incx == 1) { \
- for (r = 0; r < nr; r += UNROW) { \
- LOOP(UNROW,0,INIT,1); \
- nc = ROUNDBY(r, UNCOL); \
- for (c = 0; c < nc; c += UNCOL) { \
- LOOP(UNROW,0,KERN,1,UNCOL); \
- } \
- for (; c < r; c++) { \
- LOOP(UNROW,0,KERN,1,1); \
- } \
- LOOP(UNROW,0,FINI,1); \
- } \
- } else { \
- for (r = 0; r < nr; r += UNROW) { \
- LOOP(UNROW,0,INIT,incx); \
- nc = ROUNDBY(r, UNCOL); \
- for (c = 0; c < nc; c += UNCOL) { \
- LOOP(UNROW,0,KERN,incx,UNCOL); \
- } \
- for (; c < r; c++) { \
- LOOP(UNROW,0,KERN,incx,1); \
- } \
- LOOP(UNROW,0,FINI,incx); \
- } \
- } \
- \
- for (; r < n; r++) { \
- LOOP(1,0,INIT,incx); \
- nc = ROUNDBY(r, UNCOL); \
- for (c = 0; c < nc; c += UNCOL) { \
- LOOP(1,0,KERN,incx,UNCOL); \
- } \
- for (; c < r; c++) { \
- LOOP(1,0,KERN,incx,1); \
- } \
- LOOP(1,0,FINI,incx); \
- }
-
-#define BODY_UPTRI() \
- nr = n - ROUNDBY(n, UNROW); \
- if (incx == 1) { \
- for (r = n-1; r >= nr; r -= UNROW) { \
- LOOP(UNROW,0,INIT,1); \
- nc = n - ROUNDBY(r, UNCOL); \
- for (c = n-1; c >= nc; c -= UNCOL) { \
- LOOP(UNROW,0,KERN,1,UNCOL); \
- } \
- for (; c > r; c--) { \
- LOOP(UNROW,0,KERN,1,1); \
- } \
- LOOP(UNROW,0,FINI,1); \
- } \
- } else { \
- for (r = n-1; r >= nr; r -= UNROW) { \
- LOOP(UNROW,0,INIT,incx); \
- nc = n - ROUNDBY(r, UNCOL); \
- for (c = n-1; c >= nc; c -= UNCOL) { \
- LOOP(UNROW,0,KERN,incx,UNCOL); \
- } \
- for (; c > r; c--) { \
- LOOP(UNROW,0,KERN,incx,1); \
- } \
- LOOP(UNROW,0,FINI,incx); \
- } \
- } \
- \
- for (; r >= 0; r--) { \
- LOOP(1,0,INIT,incx); \
- nc = n - ROUNDBY(r, UNCOL); \
- for (c = n-1; c >= nc; c -= UNCOL) { \
- LOOP(1,0,KERN,incx,UNCOL); \
- } \
- for (; c > r; c--) { \
- LOOP(1,0,KERN,incx,1); \
- } \
- LOOP(1,0,FINI,incx); \
- }
-
-#define BODY_LOTRI_XY() \
- nr = ROUNDBY(n, UNROW); \
- if (incx == 1 && incy == 1) { \
- for (r = 0; r < nr; r += UNROW) { \
- LOOP(UNROW,0,INIT,1,1); \
- nc = ROUNDBY(r, UNCOL); \
- for (c = 0; c < nc; c += UNCOL) { \
- LOOP(UNROW,0,KERN,1,1,UNCOL); \
- } \
- for (; c < r; c++) { \
- LOOP(UNROW,0,KERN,1,1,1); \
- } \
- LOOP(UNROW,0,FINI,1,1); \
- } \
- } else { \
- for (r = 0; r < nr; r += UNROW) { \
- LOOP(UNROW,0,INIT,incx,incy); \
- nc = ROUNDBY(r, UNCOL); \
- for (c = 0; c < nc; c += UNCOL) { \
- LOOP(UNROW,0,KERN,incx,incy,UNCOL); \
- } \
- for (; c < r; c++) { \
- LOOP(UNROW,0,KERN,incx,incy,1); \
- } \
- LOOP(UNROW,0,FINI,incx, incy); \
- } \
- } \
- \
- for (; r < n; r++) { \
- LOOP(1,0,INIT,incx,incy); \
- nc = ROUNDBY(r, UNCOL); \
- for (c = 0; c < nc; c += UNCOL) { \
- LOOP(1,0,KERN,incx,incy,UNCOL); \
- } \
- for (; c < r; c++) { \
- LOOP(1,0,KERN,incx,incy,1); \
- } \
- LOOP(1,0,FINI,incx,incy); \
- }
-
-#define BODY_UPTRI_XY() \
- nr = n - ROUNDBY(n, UNROW); \
- if (incx == 1 && incy == 1) { \
- for (r = n-1; r >= nr; r -= UNROW) { \
- LOOP(UNROW,0,INIT,1,1); \
- nc = n - ROUNDBY(r, UNCOL); \
- for (c = n-1; c >= nc; c -= UNCOL) { \
- LOOP(UNROW,0,KERN,1,1,UNCOL); \
- } \
- for (; c > r; c--) { \
- LOOP(UNROW,0,KERN,1,1,1); \
- } \
- LOOP(UNROW,0,FINI,1,1); \
- } \
- } else { \
- for (r = n-1; r >= nr; r -= UNROW) { \
- LOOP(UNROW,0,INIT,incx,incy); \
- nc = n - ROUNDBY(r, UNCOL); \
- for (c = n-1; c >= nc; c -= UNCOL) { \
- LOOP(UNROW,0,KERN,incx,incy,UNCOL); \
- } \
- for (; c > r; c--) { \
- LOOP(UNROW,0,KERN,incx,incy,1); \
- } \
- LOOP(UNROW,0,FINI,incx,incy); \
- } \
- } \
- \
- for (; r >= 0; r--) { \
- LOOP(1,0,INIT,incx,incy); \
- nc = n - ROUNDBY(r, UNCOL); \
- for (c = n-1; c >= nc; c -= UNCOL) { \
- LOOP(1,0,KERN,incx,incy,UNCOL); \
- } \
- for (; c > r; c--) { \
- LOOP(1,0,KERN,incx,incy,1); \
- } \
- LOOP(1,0,FINI,incx,incy); \
- }
-
-// -----------------------------------------------------------------------
-// implementation
-
-#define UNROW 4
-#define UNCOL 4
-
-#define INT int
-#define FLOAT double
-#define func(name) blas·d##name
-#include "blas2body"
-
-#undef FLOAT
-#undef func
-
-#define FLOAT float
-#define func(name) blas·f##name
-#include "blas2body"
diff --git a/sys/libmath/blas2body b/sys/libmath/blas2body
deleted file mode 100644
index 45baf67..0000000
--- a/sys/libmath/blas2body
+++ /dev/null
@@ -1,256 +0,0 @@
-/* general matrix multiply */
-error
-func(gemv)(uint flag, INT nrow, INT ncol, FLOAT a, FLOAT *m, INT incm, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy)
-{
- INT r, c, nr, nc;
- FLOAT *row[UNROW], reg[UNROW];
-#define INIT(I, INCX, INCY) row[I] = m + (r+I)*incm, reg[I] = 0;
-#define TERM(J, I, INCX, INCY) row[I][c+J] * x[INCX*(c+J)]
-#define KERN(I, INCX, INCY, LENGTH) reg[I] += EXPAND(LENGTH,0,TERM,+,I,INCX,INCY);
-#define FINI(I, INCX, INCY) y[INCY*(r+I)] = b*y[INCY*(r+I)] + a*reg[I];
-
- if (!flag) {
- BODY_RECT();
- } else {
- func(scale)(ncol, b, y, incy);
-#undef KERN
-#undef FINI
-#undef INIT
-#undef TERM
-#define INIT(I, INCX, INCY) row[I] = m + (r+I)*incm, reg[I] = a*x[INCX*(r+I)];
-#define TERM(J, I, INCX, INCY) row[I][c+J] * reg[I]
-#define KERN(I, INCX, INCY, LENGTH) y[INCY*(c+I)] += EXPAND(LENGTH,0,TERM,+,I,INCX,INCY);
-#define FINI(I, INCX, INCY)
- BODY_RECT();
- }
-
- return 0;
-#undef INIT
-#undef TERM
-#undef KERN
-#undef FINI
-}
-
-/* symmetric matrix vector multiply (different layouts) */
-void
-func(symv)(uint upper, INT n, FLOAT a, FLOAT *m, INT incm, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy)
-{
- INT r, c, nr, nc, i;
- FLOAT *row[UNROW], reg1[UNROW], reg2[UNROW];
-
-#define INIT(I, INCX, INCY) row[I] = m + (r XX I)*incm, reg1[I] = 0, reg2[I] = 0;
-#define TERM1(J, I, INCX, INCY) row[I][c XX J]*x[INCX*(c XX J)]
-#define TERM2(J, I, INCX, INCY) row[I][c XX J]*x[INCX*((n-c-1) XX J)]
-#define KERN(I, INCX, INCY, REPEAT) reg1[I] += EXPAND(REPEAT,0,TERM1,+,I,INCX,INCY), \
- reg2[I] += EXPAND(REPEAT,0,TERM2,+,I,INCX,INCY);
-#define FINI(I, INCX, INCY) y[INCY*(r+I)] += a*(reg1[I] + row[I][r]*x[INCX*r]), \
- y[INCY*(n-r-1+I)] += a*reg2[I];
-
- func(scale)(n, b, y, incy);
-#define XX +
- if (!upper) {
- BODY_LOTRI_XY();
- } else {
-#undef XX
-#define XX -
- BODY_UPTRI_XY();
- }
-#undef XX
-
-#undef INIT
-}
-
-void
-func(spmv)(uint upper, INT n, FLOAT a, FLOAT *m, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy)
-{
- INT r, c, nr, nc, i;
- FLOAT *row[UNROW], reg1[UNROW], reg2[UNROW];
-
-#define INIT(I, INCX, INCY) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg1[I] = 0, reg2[I] = 0;
-
- func(scale)(n, b, y, incy);
-#define XX +
- if (!upper) {
- BODY_LOTRI_XY();
- } else {
-#undef XX
-#undef INIT
-#define INIT(I, INCX, INCY) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg1[I] = 0, reg2[I] = 0;
-#define XX -
- BODY_UPTRI_XY();
- }
-#undef XX
-
-#undef TERM
-#undef INIT
-#undef KERN
-#undef FINI
-}
-
-/* triangular multiply (different layouts) */
-void
-func(trmv)(uint upper, INT n, FLOAT *m, INT incm, FLOAT *x, INT incx)
-{
- INT r, c, nr, nc, i;
- FLOAT *row[UNROW], reg[UNROW];
-#define INIT(I, INCX) row[I] = m + (r XX I)*incm, reg[I] = 0;
-#define TERM(J, I, INCX) row[I][c XX J]*x[INCX*(c XX J)]
-#define KERN(I, INCX, REPEAT) reg[I] += EXPAND(REPEAT,0,TERM,+,I,INCX);
-#define FINI(I, INCX) x[INCX*(r XX I)] = row[I][r XX I]*x[INCX*(r XX I)] + reg[I];
-
-#define XX +
- if (!upper) {
- BODY_LOTRI();
- } else {
-#undef XX
-#define XX -
- BODY_UPTRI();
- }
-#undef XX
-
-#undef INIT
-}
-
-void
-func(tpmv)(uint upper, INT n, FLOAT *m, FLOAT *x, INT incx)
-{
- INT r, c, nr, nc, i;
- FLOAT *row[UNROW], reg[UNROW];
-#define INIT(I, INCX) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg[I] = 0;
-
-#define XX +
- if (!upper) {
- BODY_LOTRI();
- } else {
-#undef XX
-#undef INIT
-#define INIT(I, INCX) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg[I] = 0;
-#define XX -
- BODY_UPTRI();
- }
-#undef XX
-
-#undef INIT
-#undef TERM
-#undef KERN
-#undef FINI
-}
-
-/* triangular solve (different layouts) */
-void
-func(trsv)(uint upper, INT n, FLOAT *m, INT incm, FLOAT *x, INT incx)
-{
- INT r, c, nr, nc, i;
- FLOAT *row[UNROW], reg[UNROW];
-#define INIT(I, INCX) row[I] = m + (r XX I)*incm, reg[I] = 0;
-#define TERM(J, I, INCX) row[I][c XX J]*x[c XX J]
-#define KERN(I, INCX, REPEAT) reg[I] += EXPAND(REPEAT,0,TERM,+,I,INCX);
-#define SOLN(J, I, INCX) reg[J] += row[I][r XX I]*x[INCX*(r XX I)]
-#define FINI(I, INCX) x[INCX*(r XX I)] = reg[I] / row[I][r XX I]; EXPAND_TRI(UNROW,INC(I),SOLN,;,I,INCX);
-
-#define XX +
- if (!upper) {
- BODY_LOTRI();
- } else {
-#undef XX
-#define XX -
- BODY_UPTRI();
- }
-#undef XX
-
-#undef INIT
-}
-
-void
-func(tpsv)(uint upper, INT n, FLOAT *m, FLOAT *x, INT incx)
-{
- INT r, c, nr, nc, i;
- FLOAT *row[UNROW], reg[UNROW];
-#define INIT(I, INCX) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg[I] = 0;
-
-#define XX +
- if (!upper) {
- BODY_LOTRI();
- } else {
-#undef XX
-#undef INIT
-#define INIT(I, INCX) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg[I] = 0;
-#define XX -
- BODY_UPTRI();
- }
-#undef XX
-
-#undef INIT
-#undef TERM
-#undef KERN
-#undef SOLN
-#undef FINI
-}
-
-/* rank 1 update */
-void
-func(ger)(INT nrow, INT ncol, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT *m, INT incm)
-{
- INT r, c, nr, nc;
- FLOAT *row[UNROW], reg[UNROW];
-#define INIT(I, INCX, INCY) row[I] = m + (r+I)*incm, reg[I] = a*x[INCX*(r+I)];
-#define TERM(J, I, INCX, INCY) row[I][c+J] += reg[I] * y[INCY*(c+J)]
-#define KERN(I, INCX, INCY, LENGTH) EXPAND(LENGTH,0,TERM,;,I,INCX, INCY);
-#define FINI(I, ...)
-
- BODY_RECT();
-
-#undef INIT
-#undef TERM
-#undef KERN
-#undef FINI
-}
-
-/* symmetric rank 1 update (different memory layouts) */
-void
-func(syr)(uint upper, INT n, FLOAT a, FLOAT *x, INT incx, FLOAT *m, INT incm)
-{
- INT r, c, nr, nc;
- FLOAT *row[UNROW], reg[UNROW];
-#define INIT(I, INCX) row[I] = m + (r XX I)*incm, reg[I] = a*x[INCX*(r XX I)];
-#define TERM(J, I, INCX) row[I][c XX J] += reg[I] * x[INCX*(c XX J)]
-#define KERN(I, INCX, LENGTH) EXPAND(LENGTH,0,TERM,;,I,INCX);
-#define FINI(I, ...)
-
-#define XX +
- if (!upper) {
- BODY_LOTRI();
- } else {
-#undef XX
-#define XX -
- BODY_UPTRI();
- }
-#undef XX
-
-#undef INIT
-}
-
-void
-func(spr)(uint upper, INT n, FLOAT a, FLOAT *x, INT incx, FLOAT *m)
-{
- INT r, c, nr, nc;
- FLOAT *row[UNROW], reg[UNROW];
-#define INIT(I, INCX) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg[I] = 0;
-
-#define XX +
- if (!upper) {
- BODY_LOTRI();
- } else {
-#undef XX
-#undef INIT
-#define INIT(I, INCX) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg[I] = 0;
-#define XX -
- BODY_UPTRI();
- }
-#undef XX
-
-#undef INIT
-#undef TERM
-#undef KERN
-#undef FINI
-}
diff --git a/sys/libmath/blas3.c b/sys/libmath/blas3.c
deleted file mode 100644
index b048c95..0000000
--- a/sys/libmath/blas3.c
+++ /dev/null
@@ -1,279 +0,0 @@
-#include <u.h>
-#include <base.h>
-#include <libmath.h>
-
-#define INT int
-#define FLOAT double
-#define func(name) blas·d##name
-
-#define X(i, j) x[j + incx*(i)]
-#define Y(i, j) y[j + incy*(i)]
-#define Z(i, j) z[j + incz*(i)]
-
-void
-func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz)
-{
- INT jj, jb, kk, kb, dk, i, j, k, end;
- FLOAT r0[8], r1[8], r2[8], r3[8], pf;
-
- for (i = 0; i < ni; i++) {
- for (j = 0; j < nj; j++) {
- Z(i,j) *= b;
- }
- }
-
- jb = MIN(256, nj);
- kb = MIN(48, nk);
- for (jj = 0; jj < nj; jj += jb) {
- for (kk = 0; kk < nk; kk += kb) {
- for (i = 0; i < ni; i += 4) {
- for (j = jj; j < jj + jb; j += 8) {
- r0[0] = Z(i+0, j+0); r0[1] = Z(i+0, j+1); r0[2] = Z(i+0, j+2); r0[3] = Z(i+0, j+3);
- r1[0] = Z(i+1, j+0); r1[1] = Z(i+1, j+1); r1[2] = Z(i+1, j+2); r1[3] = Z(i+1, j+3);
- r2[0] = Z(i+2, j+0); r2[1] = Z(i+2, j+1); r2[2] = Z(i+2, j+2); r2[3] = Z(i+2, j+3);
- r3[0] = Z(i+3, j+0); r3[1] = Z(i+3, j+1); r3[2] = Z(i+3, j+2); r3[3] = Z(i+3, j+3);
- end = MIN(nk, kk+kb);
- for (k = kk; k < end; k++) {
- pf = a * X(i, k);
- r0[0] += pf * Y(k, j+0); r0[1] += pf * Y(k, j+1); r0[2] += pf * Y(k, j+2); r0[3] += pf * Y(k, j+3);
-
- pf = a * X(i+1, k);
- r1[0] += pf * Y(k, j+0); r1[1] += pf * Y(k, j+1); r1[2] += pf * Y(k, j+2); r1[3] += pf * Y(k, j+3);
-
- pf = a * X(i+2, k);
- r1[0] += pf * Y(k, j+0); r1[1] += pf * Y(k, j+1); r1[2] += pf * Y(k, j+2); r1[3] += pf * Y(k, j+3);
-
- pf = a * X(i+3, k);
- r1[0] += pf * Y(k, j+0); r1[1] += pf * Y(k, j+1); r1[2] += pf * Y(k, j+2); r1[3] += pf * Y(k, j+3);
- }
- Z(i+0, j+0) = r0[0]; Z(i+0, j+1) = r0[1]; Z(i+0, j+2) = r0[2]; Z(i+0, j+3) = r0[3];
- Z(i+1, j+0) = r1[0]; Z(i+1, j+1) = r1[1]; Z(i+1, j+2) = r1[2]; Z(i+1, j+3) = r1[3];
- Z(i+2, j+0) = r2[0]; Z(i+2, j+1) = r2[1]; Z(i+2, j+2) = r2[2]; Z(i+2, j+3) = r2[3];
- Z(i+3, j+0) = r3[0]; Z(i+3, j+1) = r3[1]; Z(i+3, j+2) = r3[2]; Z(i+3, j+3) = r3[3];
- }
- }
- }
- }
-}
-
-#if 0
-void
-func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz)
-{
- int i, j, k;
- FLOAT w[nj*nk], acc[4][4];
-
- for (i = 0; i < ni; i++) {
- for (j = 0; j < nj; j++) {
- Z(i,j) *= b;
- W(i,j) = Y(j,i);
- }
- }
-
- for (i = 0; i < ni; i+=4) {
- for (j = 0; j < nj; j+=4) {
- memset(acc, 0, sizeof(acc));
- for (k = 0; k < nk; k+=4) {
- acc[0][0] += X(i+0,k)*W(j+0,k) + X(i+0,k+1)*W(j+0,k+1) + X(i+0,k+2)*W(j+0,k+2) + X(i+0,k+3)*W(j+0,k+3);
- acc[0][1] += X(i+0,k)*W(j+1,k) + X(i+0,k+1)*W(j+1,k+1) + X(i+0,k+2)*W(j+1,k+2) + X(i+0,k+3)*W(j+1,k+3);
- acc[0][2] += X(i+0,k)*W(j+2,k) + X(i+0,k+1)*W(j+2,k+1) + X(i+0,k+2)*W(j+2,k+2) + X(i+0,k+3)*W(j+2,k+3);
- acc[0][3] += X(i+0,k)*W(j+3,k) + X(i+0,k+1)*W(j+3,k+1) + X(i+0,k+2)*W(j+3,k+2) + X(i+0,k+3)*W(j+3,k+3);
-
- acc[1][0] += X(i+1,k)*W(j+0,k) + X(i+1,k+1)*W(j+0,k+1) + X(i+1,k+2)*W(j+0,k+2) + X(i+1,k+3)*W(j+0,k+3);
- acc[1][1] += X(i+1,k)*W(j+1,k) + X(i+1,k+1)*W(j+1,k+1) + X(i+1,k+2)*W(j+1,k+2) + X(i+1,k+3)*W(j+1,k+3);
- acc[1][2] += X(i+1,k)*W(j+2,k) + X(i+1,k+1)*W(j+2,k+1) + X(i+1,k+2)*W(j+2,k+2) + X(i+1,k+3)*W(j+2,k+3);
- acc[1][3] += X(i+1,k)*W(j+3,k) + X(i+1,k+1)*W(j+3,k+1) + X(i+1,k+2)*W(j+3,k+2) + X(i+1,k+3)*W(j+3,k+3);
-
- acc[2][0] += X(i+2,k)*W(j+0,k) + X(i+2,k+1)*W(j+0,k+1) + X(i+2,k+2)*W(j+0,k+2) + X(i+2,k+3)*W(j+0,k+3);
- acc[2][1] += X(i+2,k)*W(j+1,k) + X(i+2,k+1)*W(j+1,k+1) + X(i+2,k+2)*W(j+1,k+2) + X(i+2,k+3)*W(j+1,k+3);
- acc[2][2] += X(i+2,k)*W(j+2,k) + X(i+2,k+1)*W(j+2,k+1) + X(i+2,k+2)*W(j+2,k+2) + X(i+2,k+3)*W(j+2,k+3);
- acc[2][3] += X(i+2,k)*W(j+3,k) + X(i+2,k+1)*W(j+3,k+1) + X(i+2,k+2)*W(j+3,k+2) + X(i+2,k+3)*W(j+3,k+3);
-
- acc[2][0] += X(i+3,k)*W(j+0,k) + X(i+3,k+1)*W(j+0,k+1) + X(i+3,k+2)*W(j+0,k+2) + X(i+3,k+3)*W(j+0,k+3);
- acc[2][1] += X(i+3,k)*W(j+1,k) + X(i+3,k+1)*W(j+1,k+1) + X(i+3,k+2)*W(j+1,k+2) + X(i+3,k+3)*W(j+1,k+3);
- acc[2][2] += X(i+3,k)*W(j+2,k) + X(i+3,k+1)*W(j+2,k+1) + X(i+3,k+2)*W(j+2,k+2) + X(i+3,k+3)*W(j+2,k+3);
- acc[2][3] += X(i+3,k)*W(j+3,k) + X(i+3,k+1)*W(j+3,k+1) + X(i+3,k+2)*W(j+3,k+2) + X(i+3,k+3)*W(j+3,k+3);
- // Z(i,j) += X(i,k)*Y(k,j);
- }
- Z(i+0,j+1) = a*acc[0][0];
- Z(i+0,j+2) = a*acc[0][1];
- Z(i+0,j+3) = a*acc[0][2];
- Z(i+0,j+4) = a*acc[0][3];
-
- Z(i+1,j+1) = a*acc[1][0];
- Z(i+1,j+2) = a*acc[1][1];
- Z(i+1,j+3) = a*acc[1][2];
- Z(i+1,j+4) = a*acc[1][3];
-
- Z(i+2,j+1) = a*acc[2][0];
- Z(i+2,j+2) = a*acc[2][1];
- Z(i+2,j+3) = a*acc[2][2];
- Z(i+2,j+4) = a*acc[2][3];
-
- Z(i+3,j+1) = a*acc[3][0];
- Z(i+3,j+2) = a*acc[3][1];
- Z(i+3,j+3) = a*acc[3][2];
- Z(i+3,j+4) = a*acc[3][3];
- }
- }
-}
-#endif
-
-#if 0
-void
-func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz)
-{
- int i, j, k, ri, rj, rk;
- FLOAT reg[4][4], *xrow[4], *yrow[4];
-
- for (i = 0; i < ni; i++) {
- for (j = 0; j < nj; j++) {
- z[j + incz*i] *= b;
- }
- }
-
- for (i = 0; i < ni; i += 4) {
- xrow[0] = x + incx*(i+0);
- xrow[1] = x + incx*(i+1);
- xrow[2] = x + incx*(i+2);
- xrow[3] = x + incx*(i+3);
- for (k = 0; k < nk; k+=4) {
- yrow[0] = y + incy*(k+0);
- yrow[1] = y + incy*(k+1);
- yrow[2] = y + incy*(k+2);
- yrow[3] = y + incy*(k+3);
- reg[0][0] = a * xrow[0][k+0]; reg[0][1] = a * xrow[0][k+1]; reg[0][2] = a * xrow[0][k+2]; reg[0][3] = a * xrow[0][k+3];
- reg[1][0] = a * xrow[1][k+0]; reg[1][1] = a * xrow[1][k+1]; reg[1][2] = a * xrow[1][k+2]; reg[1][3] = a * xrow[1][k+3];
- reg[2][0] = a * xrow[2][k+0]; reg[2][1] = a * xrow[2][k+1]; reg[2][2] = a * xrow[2][k+2]; reg[2][3] = a * xrow[2][k+3];
- reg[3][0] = a * xrow[3][k+0]; reg[3][1] = a * xrow[3][k+1]; reg[3][2] = a * xrow[3][k+2]; reg[3][3] = a * xrow[3][k+3];
- for (j = 0; j < nj; j += 1) {
- z[j + incz*(i+0)] += (reg[0][0]*yrow[0][j]+reg[0][1]*yrow[1][j]+reg[0][2]*yrow[2][j]+reg[0][3]*yrow[3][j]);
- z[j + incz*(i+1)] += (reg[1][0]*yrow[0][j]+reg[1][1]*yrow[1][j]+reg[1][2]*yrow[2][j]+reg[1][3]*yrow[3][j]);
- z[j + incz*(i+2)] += (reg[2][0]*yrow[0][j]+reg[2][1]*yrow[1][j]+reg[2][2]*yrow[2][j]+reg[2][3]*yrow[3][j]);
- z[j + incz*(i+3)] += (reg[3][0]*yrow[0][j]+reg[3][1]*yrow[1][j]+reg[3][2]*yrow[2][j]+reg[3][3]*yrow[3][j]);
- }
- }
- }
-}
-#endif
-
-#if 0
-void
-func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz)
-{
- int i, j, k, ri, rj, rk;
- FLOAT r[4][4], *row[4];
-
- for (i = 0; i < ni; i++) {
- for (j = 0; j < nj; j++) {
- Z(i, j) *= b;
- }
- }
-
- for (i = 0; i < ni; i+=4) {
- for (j = 0; j < nj; j+=4) {
- r[0][0] = 0; r[0][1] = 0; r[0][2] = 0; r[0][3] = 0;
- r[1][0] = 0; r[1][1] = 0; r[1][2] = 0; r[1][3] = 0;
- r[2][0] = 0; r[2][1] = 0; r[2][2] = 0; r[2][3] = 0;
- r[3][0] = 0; r[3][1] = 0; r[3][2] = 0; r[3][3] = 0;
- row[0] = &X(i+0, 0);
- row[1] = &X(i+1, 0);
- row[2] = &X(i+2, 0);
- row[3] = &X(i+3, 0);
- for (k = 0; k < nk; k++) {
- r[0][0] += row[0][k]*Y(k,0); r[0][1] += row[0][k]*Y(k,1); r[0][2] += row[0][k]*Y(k,2); r[0][3] += row[0][k]*Y(k,3);
- r[1][0] += row[1][k]*Y(k,0); r[1][1] += row[1][k]*Y(k,1); r[1][2] += row[1][k]*Y(k,2); r[1][3] += row[1][k]*Y(k,3);
- r[2][0] += row[2][k]*Y(k,0); r[2][1] += row[2][k]*Y(k,1); r[2][2] += row[2][k]*Y(k,2); r[2][3] += row[2][k]*Y(k,3);
- r[3][0] += row[3][k]*Y(k,0); r[3][1] += row[3][k]*Y(k,1); r[3][2] += row[3][k]*Y(k,2); r[3][3] += row[3][k]*Y(k,3);
- }
- Z(i+0, j+0) += r[0][0]; Z(i+0, j+1) += r[0][1]; Z(i+0, j+2) += r[0][2]; Z(i+0, j+3) += r[0][3];
- Z(i+1, j+0) += r[1][0]; Z(i+1, j+1) += r[1][1]; Z(i+1, j+2) += r[1][2]; Z(i+1, j+3) += r[1][3];
- Z(i+2, j+0) += r[2][0]; Z(i+2, j+1) += r[2][1]; Z(i+2, j+2) += r[2][2]; Z(i+2, j+3) += r[2][3];
- Z(i+3, j+0) += r[3][0]; Z(i+3, j+1) += r[3][1]; Z(i+3, j+2) += r[3][2]; Z(i+3, j+3) += r[3][3];
- }
- }
-}
-#endif
-
-#if 0
-void
-func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz)
-{
- int i, j, k, ri, rj, rk;
- FLOAT *xrow[8], *yrow[8], reg;
-
- for (i = 0; i < ni; i++) {
- for (j = 0; j < nj; j++) {
- z[j + incz*i] *= b;
- }
- }
-
- ri = ni & ~7;
- rj = nj & ~7;
- for (i = 0; i < ri; i += 8) {
- xrow[0] = x + incx*(i+0);
- xrow[1] = x + incx*(i+1);
- xrow[2] = x + incx*(i+2);
- xrow[3] = x + incx*(i+3);
- xrow[4] = x + incx*(i+4);
- xrow[5] = x + incx*(i+5);
- xrow[6] = x + incx*(i+6);
- xrow[7] = x + incx*(i+7);
- for (j = 0; j < rj; j += 8) {
- yrow[0] = y + incy*(j+0);
- yrow[1] = y + incy*(j+1);
- yrow[2] = y + incy*(j+2);
- yrow[3] = y + incy*(j+3);
- yrow[4] = y + incy*(j+4);
- yrow[5] = y + incy*(j+5);
- yrow[6] = y + incy*(j+6);
- yrow[7] = y + incy*(j+7);
- for (k = 0; k < nk; k++) {
- reg = a*(yrow[0][k] + yrow[1][k] + yrow[2][k] + yrow[3][k] + yrow[4][k] + yrow[5][k] + yrow[6][k] + yrow[7][k]);
- z[k + incz*(i+0)] += xrow[0][k]*reg;
- z[k + incz*(i+1)] += xrow[1][k]*reg;
- z[k + incz*(i+2)] += xrow[2][k]*reg;
- z[k + incz*(i+3)] += xrow[3][k]*reg;
- z[k + incz*(i+4)] += xrow[4][k]*reg;
- z[k + incz*(i+5)] += xrow[5][k]*reg;
- z[k + incz*(i+6)] += xrow[6][k]*reg;
- z[k + incz*(i+7)] += xrow[7][k]*reg;
- }
- }
- for (; j < nj; j++) {
- for (k = 0; k < nk; k++) {
- reg = a*y[k+incy*j];
- z[k + incz*(i+0)] += xrow[0][k]*reg;
- z[k + incz*(i+1)] += xrow[1][k]*reg;
- z[k + incz*(i+2)] += xrow[2][k]*reg;
- z[k + incz*(i+3)] += xrow[3][k]*reg;
- z[k + incz*(i+4)] += xrow[4][k]*reg;
- z[k + incz*(i+5)] += xrow[5][k]*reg;
- z[k + incz*(i+6)] += xrow[6][k]*reg;
- z[k + incz*(i+7)] += xrow[7][k]*reg;
- }
- }
- }
-
- for (; i < ni; i++) {
- for (j = 0; j < rj; j += 8) {
- yrow[0] = y + incy*(j+0);
- yrow[1] = y + incy*(j+1);
- yrow[2] = y + incy*(j+2);
- yrow[3] = y + incy*(j+3);
- yrow[4] = y + incy*(j+4);
- yrow[5] = y + incy*(j+5);
- yrow[6] = y + incy*(j+6);
- yrow[7] = y + incy*(j+7);
- for (k = 0; k < nk; k++) {
- z[k + incz*(i)] += a*x[k + incx*i]*(yrow[0][k] + yrow[1][k] + yrow[2][k] + yrow[3][k] + yrow[4][k] + yrow[5][k] + yrow[6][k] + yrow[7][k]);
- }
- }
- for (; j < nj; j++) {
- for (k = 0; k < nk; k++) {
- z[k + incz*i] += a*x[k + incx*i]*y[k + incy*j];
- }
- }
- }
-}
-#endif
diff --git a/sys/libmath/lapack.c b/sys/libmath/lapack.c
deleted file mode 100644
index e69de29..0000000
--- a/sys/libmath/lapack.c
+++ /dev/null
diff --git a/sys/libmath/linalg.c b/sys/libmath/linalg.c
deleted file mode 100644
index 8551ff1..0000000
--- a/sys/libmath/linalg.c
+++ /dev/null
@@ -1,63 +0,0 @@
-#include <u.h>
-#include <libn.h>
-#include <libmath.h>
-#include <libmath/blas.h>
-
-// -----------------------------------------------------------------------
-// Vector
-
-void
-linalg·normalize(math·Vector vec)
-{
- double norm;
-
- norm = blas·normd(vec.len, vec.data, 1);
- blas·scaled(vec.len, 1/norm, vec.data, 1);
-}
-// TODO: Write blas wrappers that eat vectors for convenience
-
-// -----------------------------------------------------------------------
-// Matrix
-//
-// NOTE: all matrices are row major oriented
-
-/*
- * linalg·lq
- * computes the LQ decomposition of matrix M: M = LQ
- * L is lower triangular
- * Q is orthogonal -> transp(Q) * Q = I
- *
- * m: matrix to factorize. changes in place
- * + lower triangle -> L
- * + upper triangle -> all reflection vectors stored in rows
- * w: working buffer: len = ncols!
- */
-error
-linalg·lq(math·Matrix m, math·Vector w)
-{
- int i, j, len;
- double *row, mag;
- enum {
- err·nil,
- err·baddims,
- };
-
- if (m.dim[0] > m.dim[1]) {
- return err·baddims;
- }
-
- for (i = 0; i < m.dim[0]; i++, m.data += m.dim[1]) {
- row = m.data + i;
- len = m.dim[0] - i;
-
- // TODO: Don't want to compute norm twice!!
- w.data[0] = math·sgn(row[0]) * blas·normd(len, row, 1);
- blas·axpyd(len, 1.0, row, 1, w.data, 1);
- mag = blas·normd(len, w.data, 1);
- blas·scaled(len, 1/mag, w.data, 1);
-
- blas·copyd(len - m.dim[0], w.data, 1, m.data + i, 1);
- }
-
- return err·nil;
-}
diff --git a/sys/libmath/loop.h b/sys/libmath/loop.h
deleted file mode 100644
index a877d84..0000000
--- a/sys/libmath/loop.h
+++ /dev/null
@@ -1,114 +0,0 @@
-#pragma once
-
-/* increment operator */
-#define INC2(x) INC_##x
-#define INC1(x) INC2(x)
-#define INC(x) INC1(x)
-
-#define INC_0 1
-#define INC_1 2
-#define INC_2 3
-#define INC_3 4
-#define INC_4 5
-#define INC_5 6
-#define INC_6 7
-#define INC_7 8
-#define INC_8 9
-#define INC_9 10
-#define INC_10 11
-#define INC_11 12
-#define INC_12 13
-#define INC_13 14
-#define INC_14 15
-#define INC_15 16
-
-#define ROUNDBY(x, n) ((x) & ~((n)-1))
-
-/* subtraction tables */
-#define SUB2(x, y) SUB_##x##_##y
-#define SUB1(x, y) SUB2(x, y)
-#define SUB(x, y) SUB1(x, y)
-#define SUB_8_0 8
-#define SUB_8_1 7
-#define SUB_8_2 6
-#define SUB_8_3 5
-#define SUB_8_4 4
-#define SUB_8_5 3
-#define SUB_8_6 2
-#define SUB_8_7 1
-#define SUB_8_8 0
-#define SUB_7_0 7
-#define SUB_7_1 6
-#define SUB_7_2 5
-#define SUB_7_3 4
-#define SUB_7_4 3
-#define SUB_7_5 2
-#define SUB_7_6 1
-#define SUB_7_7 0
-#define SUB_6_0 6
-#define SUB_6_1 5
-#define SUB_6_2 4
-#define SUB_6_3 3
-#define SUB_6_4 2
-#define SUB_6_5 1
-#define SUB_6_6 0
-#define SUB_5_0 5
-#define SUB_5_1 4
-#define SUB_5_2 3
-#define SUB_5_3 2
-#define SUB_5_4 1
-#define SUB_5_5 0
-#define SUB_4_0 4
-#define SUB_4_1 3
-#define SUB_4_2 2
-#define SUB_4_3 1
-#define SUB_4_4 0
-#define SUB_3_0 3
-#define SUB_3_1 2
-#define SUB_3_2 1
-#define SUB_3_3 0
-#define SUB_2_0 2
-#define SUB_2_1 1
-#define SUB_2_2 0
-#define SUB_1_0 1
-#define SUB_1_1 0
-
-/* rounding operator */
-#define ROUNDBY(x, n) ((x) & ~((n)-1))
-
-/* loop unrolling (vertical) */
-#define LOOP1(I,STMT,...) STMT(I,__VA_ARGS__)
-#define LOOP2(I,STMT,...) STMT(I,__VA_ARGS__) LOOP1(INC(I),STMT,__VA_ARGS__)
-#define LOOP3(I,STMT,...) STMT(I,__VA_ARGS__) LOOP2(INC(I),STMT,__VA_ARGS__)
-#define LOOP4(I,STMT,...) STMT(I,__VA_ARGS__) LOOP3(INC(I),STMT,__VA_ARGS__)
-#define LOOP5(I,STMT,...) STMT(I,__VA_ARGS__) LOOP4(INC(I),STMT,__VA_ARGS__)
-#define LOOP6(I,STMT,...) STMT(I,__VA_ARGS__) LOOP5(INC(I),STMT,__VA_ARGS__)
-#define LOOP7(I,STMT,...) STMT(I,__VA_ARGS__) LOOP6(INC(I),STMT,__VA_ARGS__)
-#define LOOP8(I,STMT,...) STMT(I,__VA_ARGS__) LOOP7(INC(I),STMT,__VA_ARGS__)
-#define LOOP9(I,STMT,...) STMT(I,__VA_ARGS__) LOOP8(INC(I),STMT,__VA_ARGS__)
-#define LOOP10(I,STMT,...) STMT(I,__VA_ARGS__) LOOP9(INC(I),STMT,__VA_ARGS__)
-#define LOOP11(I,STMT,...) STMT(I,__VA_ARGS__) LOOP10(INC(I),STMT,__VA_ARGS__)
-#define LOOP12(I,STMT,...) STMT(I,__VA_ARGS__) LOOP11(INC(I),STMT,__VA_ARGS__)
-#define LOOP13(I,STMT,...) STMT(I,__VA_ARGS__) LOOP12(INC(I),STMT,__VA_ARGS__)
-#define LOOP14(I,STMT,...) STMT(I,__VA_ARGS__) LOOP13(INC(I),STMT,__VA_ARGS__)
-#define LOOP15(I,STMT,...) STMT(I,__VA_ARGS__) LOOP14(INC(I),STMT,__VA_ARGS__)
-#define LOOP16(I,STMT,...) STMT(I,__VA_ARGS__) LOOP15(INC(I),STMT,__VA_ARGS__)
-
-#define _LOOP_(n,I,STMT,...) LOOP##n(I,STMT,__VA_ARGS__)
-#define LOOP(n,I,STMT,...) _LOOP_(n,I,STMT,__VA_ARGS__)
-
-/* loop expansion (horizontal) */
-#define EXPAND0(I,TERM,OP,...)
-#define EXPAND1(I,TERM,OP,...) TERM(I,__VA_ARGS__)
-#define EXPAND2(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND1(INC(I),TERM,OP,__VA_ARGS__)
-#define EXPAND3(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND2(INC(I),TERM,OP,__VA_ARGS__)
-#define EXPAND4(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND3(INC(I),TERM,OP,__VA_ARGS__)
-#define EXPAND5(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND4(INC(I),TERM,OP,__VA_ARGS__)
-#define EXPAND6(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND5(INC(I),TERM,OP,__VA_ARGS__)
-#define EXPAND7(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND6(INC(I),TERM,OP,__VA_ARGS__)
-#define EXPAND8(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND7(INC(I),TERM,OP,__VA_ARGS__)
-
-#define _EXPAND_(n,I,TERM,OP,...) EXPAND##n(I,TERM,OP,__VA_ARGS__)
-#define EXPAND(n,I,TERM,OP,...) _EXPAND_(n,I,TERM,OP,__VA_ARGS__)
-#define EXPAND_TRI1(n,I,TERM,OP,...) EXPAND(n,I,TERM,OP,__VA_ARGS__)
-#define EXPAND_TRI(n,I,TERM,OP,...) EXPAND_TRI1(SUB(n,I),I,TERM,OP,__VA_ARGS__)
diff --git a/sys/libmath/matrix.c b/sys/libmath/matrix.c
deleted file mode 100644
index e8bca0b..0000000
--- a/sys/libmath/matrix.c
+++ /dev/null
@@ -1,176 +0,0 @@
-#include <u.h>
-#include <libn.h>
-#include <libmath.h>
-
-/* TODO: replace (incrementally) with native C version! */
-#include <vendor/blas/cblas.h>
-#include <vendor/blas/lapacke.h>
-
-// -----------------------------------------------------------------------
-// level 1
-
-error
-la·vecslice(math·Vector *x, int min, int max, int inc)
-{
- if (max > x->len || min < 0) {
- errorf("out of bounds: attempted to access vector past length");
- return 1;
- }
- x->len = (max - min) / inc;
- x->d += x->inc * min;
- x->inc *= inc;
-
- return 0;
-}
-
-/* simple blas wrappers */
-void
-la·veccopy(math·Vector *dst, math·Vector *src)
-{
- return cblas_dcopy(src->len, src->d, src->inc, dst->d, dst->inc);
-}
-
-double
-la·vecnorm(math·Vector *x)
-{
- return cblas_dnrm2(x->len, x->d, x->inc);
-}
-
-void
-la·vecscale(math·Vector *x, double a)
-{
- return cblas_dscal(x->len, a, x->d, x->inc);
-}
-
-double
-la·vecdot(math·Vector *x, math·Vector *y)
-{
- return cblas_ddot(x->len, x->d, x->inc, y->d, y->inc);
-}
-
-// -----------------------------------------------------------------------
-// level 2
-
-error
-la·vecmat(math·Vector *x, math·Matrix *M)
-{
- if (M->dim[1] != x->len) {
- errorf("incompatible matrix dimensions");
- return 1;
- }
- if (M->state & ~mat·trans)
- cblas_dgemv(CblasRowMajor,CblasNoTrans,M->dim[0],M->dim[1],1.,M->d,M->inc,x->d,x->inc,0.,x->d,x->inc);
- else
- cblas_dgemv(CblasRowMajor,CblasTrans,M->dim[0],M->dim[1],1.,M->d,M->inc,x->d,x->inc,0.,x->d,x->inc);
-
- return 0;
-}
-
-// -----------------------------------------------------------------------
-// level 3
-
-void
-la·transpose(math·Matrix *X)
-{
- int tmp;
- X->state ^= mat·trans;
- tmp = X->dim[0], X->dim[0] = X->dim[1], X->dim[1] = tmp;
-}
-
-error
-la·matrow(math·Matrix *X, int r, math·Vector *row)
-{
- if (r < 0 || r >= X->dim[0]) {
- errorf("out of bounds");
- return 1;
- }
-
- row->len = X->dim[1];
- row->inc = 1;
- row->d = X->d + X->dim[1] * r;
-
- return 0;
-}
-
-error
-la·matcol(math·Matrix *X, int c, math·Vector *col)
-{
- if (c < 0 || c >= X->dim[1]) {
- errorf("out of bounds");
- return 1;
- }
-
- col->len = X->dim[0];
- col->inc = X->dim[1];
- col->d = X->d + c;
-
- return 0;
-}
-
-error
-la·matslice(math·Matrix *X, int r[3], int c[3])
-{
- /* TODO */
- return 0;
-}
-
-error
-la·eig(math·Matrix *X)
-{
-
-}
-
-/* X = A*B */
-error
-la·matmul(math·Matrix *X, math·Matrix *A, math·Matrix *B)
-{
- if (A->dim[1] != B->dim[0]) {
- errorf("number of interior dimensions of A '%d' not equal to that of B '%d'", A->dim[1], B->dim[0]);
- return 1;
- }
- if (X->dim[0] != A->dim[0]) {
- errorf("number of exterior dimensions of X '%d' not equal to that of A '%d'", X->dim[0], A->dim[0]);
- return 1;
- }
- if (X->dim[1] != B->dim[1]) {
- errorf("number of exterior dimensions of X '%d' not equal to that of B '%d'", X->dim[1], B->dim[1]);
- return 1;
- }
-
- if (X->state & ~mat·trans)
- if (A->state & ~mat·trans)
- cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc);
- else
- cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc);
- else
- if (A->state & ~mat·trans)
- cblas_dgemm(CblasRowMajor,CblasTrans,CblasNoTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc);
- else
- cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc);
-
- return 0;
-}
-
-/*
- * solves A*X=B
- * pass in B via X
- */
-error
-la·solve(math·Matrix *X, math·Matrix *A)
-{
- error err;
- int n, *ipv;
- static int buf[512];
- if (n = A->dim[0], n < arrlen(buf)) {
- ipv = buf;
- n = 0;
- } else
- ipv = malloc(n*sizeof(*ipv));
-
- /* TODO: utilize more specific regimes if applicable */
- err = LAPACKE_dgesv(LAPACK_ROW_MAJOR,A->dim[0],X->dim[1],A->d,A->inc,ipv,X->d,X->inc);
-
- if (n)
- free(ipv);
- return err;
-}
diff --git a/sys/libmath/rules.mk b/sys/libmath/rules.mk
deleted file mode 100644
index 83945d7..0000000
--- a/sys/libmath/rules.mk
+++ /dev/null
@@ -1,24 +0,0 @@
-include share/push.mk
-
-# Iterate through subdirectory tree
-
-# Local sources
-SRCS_$(d) := \
- $(d)/basic.c \
- $(d)/blas1.c \
- $(d)/blas2.c \
- $(d)/blas3.c
-LIBS_$(d) := $(d)/libmath.a
-TSTS_$(d) :=
-
-include share/paths.mk
-
-$(LIBS_$(d)): $(OBJS_$(d))
- $(ARCHIVE)
-
-$(UNTS_$(d)): TCFLAGS := -D_GNU_SOURCE
-$(UNTS_$(d)): TCLIBS := -lpthread -lm $(LIB_DIR)/libblas.a $(OBJ_DIR)/sys/libn/libn.a $(LIBS_$(d))
-$(UNTS_$(d)): $(TOBJS_$(d)) $(LIBS_$(d)) $(OBJ_DIR)/sys/libn/libn.a
- $(LINK)
-
-include share/pop.mk
diff --git a/sys/libmath/test.c b/sys/libmath/test.c
deleted file mode 100644
index 66700f8..0000000
--- a/sys/libmath/test.c
+++ /dev/null
@@ -1,471 +0,0 @@
-#include <u.h>
-#include <base.h>
-/* #include <vendor/blas/cblas.h> */
-
-#include <x86intrin.h>
-
-#include <time.h>
-
-// -----------------------------------------------------------------------
-// Vectors
-
-/*
- * NOTE: I'm not sure I like stashing the header in _all_ vectors
- * The only way to fix is to have a library based allocator...
- */
-typedef struct math·Vec
-{
- struct {
- void *h;
- mem·Allocator heap;
- };
- int len;
- double *d;
-} math·Vec;
-
-math·Vec
-math·makevec(int len, mem·Allocator heap, void *h)
-{
- math·Vec v;
- v.len = len;
- v.heap = heap;
- v.h = h;
- v.d = heap.alloc(h, 1, len*sizeof(double));
-
- // memset(v.d, 0, len*sizeof(double));
-
- return v;
-}
-
-error
-math·freevec(math·Vec *v)
-{
- if (v->h == nil && v->heap.alloc == nil && v->heap.free == nil) {
- errorf("attempting to free a vector that doesn't own its data");
- return 1;
- }
- v->heap.free(v->h, v->d);
- v->d = nil;
- v->len = 0;
-
- return 0;
-}
-
-math·Vec
-math·copyvec(math·Vec v)
-{
- math·Vec cpy;
- cpy.heap = v.heap;
- cpy.h = v.h;
- cpy.len = v.len;
- cpy.d = cpy.heap.alloc(cpy.h, 1, v.len);
-
- memcpy(cpy.d, v.d, sizeof(double)*v.len);
- return cpy;
-}
-
-/*
- * Scale vector
- */
-
-static
-void
-scale_kernel8_avx2(int n, double *x, double a)
-{
- __m128d a128;
- __m256d a256;
- register int i;
-
- a128 = _mm_load_sd(&a);
- a256 = _mm256_broadcastsd_pd(a128);
- for (i = 0; i < n; i += 8) {
- _mm256_storeu_pd(x+i+0, a256 * _mm256_loadu_pd(x+i+0));
- _mm256_storeu_pd(x+i+4, a256 * _mm256_loadu_pd(x+i+4));
- }
-}
-
-static
-void
-scale_kernel8(int n, double *x, double a)
-{
- register int i;
- for (i = 0; i < n; i += 8) {
- x[i+0] *= a;
- x[i+1] *= a;
- x[i+2] *= a;
- x[i+3] *= a;
- x[i+4] *= a;
- x[i+5] *= a;
- x[i+6] *= a;
- x[i+7] *= a;
- }
-}
-
-void
-math·scalevec(math·Vec u, double a)
-{
- int n;
-
- n = u.len & ~7;
- scale_kernel8_avx2(n, u.d, a);
-
- for (; n < u.len; n++) {
- u.d[n] *= a;
- }
-}
-
-/*
- * Add scaled vector
- */
-
-static
-void
-daxpy_kernel8_avx2(int n, double *x, double *y, double a)
-{
- __m128d a128;
- __m256d a256;
- register int i;
-
- a128 = _mm_load_sd(&a);
- a256 = _mm256_broadcastsd_pd(a128);
- for (i = 0; i < n; i += 8) {
- _mm256_storeu_pd(x+i+0, _mm256_loadu_pd(x+i+0) + a256 * _mm256_loadu_pd(y+i+0));
- _mm256_storeu_pd(x+i+4, _mm256_loadu_pd(x+i+4) + a256 * _mm256_loadu_pd(y+i+4));
- }
-}
-
-static
-void
-daxpy_kernel8(int n, double *x, double *y, double a)
-{
- register int i;
- for (i = 0; i < n; i += 8) {
- x[i+0] += a*y[i+0];
- x[i+1] += a*y[i+1];
- x[i+2] += a*y[i+2];
- x[i+3] += a*y[i+3];
- x[i+4] += a*y[i+4];
- x[i+5] += a*y[i+5];
- x[i+6] += a*y[i+6];
- x[i+7] += a*y[i+7];
- }
-}
-
-/* performs u = u + a*v */
-void
-math·addvec(math·Vec u, math·Vec v, double a)
-{
- int n;
-
- n = u.len & ~7;
- daxpy_kernel8_avx2(n, u.d, v.d, a);
-
- for (; n < u.len; n++) {
- u.d[n] += a*v.d[n];
- }
-}
-
-/*
- * Dot product
- */
-
-static
-double
-dot_kernel8_avx2(int len, double *x, double *y)
-{
- register int i;
- __m256d sum[4];
- __m128d res;
-
- for (i = 0; i < arrlen(sum); i++) {
- sum[i] = _mm256_setzero_pd();
- }
-
- for (i = 0; i < len; i += 16) {
- sum[0] += _mm256_loadu_pd(x+i+0) * _mm256_loadu_pd(y+i+0);
- sum[1] += _mm256_loadu_pd(x+i+4) * _mm256_loadu_pd(y+i+4);
- sum[2] += _mm256_loadu_pd(x+i+8) * _mm256_loadu_pd(y+i+8);
- sum[3] += _mm256_loadu_pd(x+i+12) * _mm256_loadu_pd(y+i+12);
- }
-
- sum[0] += sum[1] + sum[2] + sum[3];
-
- res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1));
- res = _mm_hadd_pd(res, res);
-
- return res[0];
-}
-
-static
-double
-dot_kernel8_fma3(int len, double *x, double *y)
-{
- register int i;
- __m256d sum[4];
- __m128d res;
-
- for (i = 0; i < arrlen(sum); i++) {
- sum[i] = _mm256_setzero_pd();
- }
-
- for (i = 0; i < len; i += 16) {
- sum[0] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+0), _mm256_loadu_pd(y+i+0), sum[0]);
- sum[1] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+4), _mm256_loadu_pd(y+i+4), sum[1]);
- sum[2] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+8), _mm256_loadu_pd(y+i+8), sum[2]);
- sum[3] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+12), _mm256_loadu_pd(y+i+12), sum[3]);
- }
-
- sum[0] += sum[1] + sum[2] + sum[3];
-
- res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1));
- res = _mm_hadd_pd(res, res);
-
- return res[0];
-}
-
-static
-double
-dot_kernel8(int len, double *x, double *y)
-{
- double res;
- register int i;
-
- for (i = 0; i < len; i += 8) {
- res += x[i] * y[i] +
- x[i+1] * y[i+1] +
- x[i+2] * y[i+2] +
- x[i+3] * y[i+3] +
- x[i+4] * y[i+4] +
- x[i+5] * y[i+5] +
- x[i+6] * y[i+6] +
- x[i+7] * y[i+7];
- }
-
- return res;
-}
-
-double
-math·dot(math·Vec u, math·Vec v)
-{
- int i, len;
- double res;
-
- len = u.len & ~15; // neat trick
- res = dot_kernel8_fma3(len, u.d, v.d);
-
- for (i = len; i < u.len; i++) {
- res += u.d[i] * v.d[i];
- }
-
- return res;
-}
-
-// -----------------------------------------------------------------------
-// Matrix
-
-typedef struct math·Mtx
-{
- struct {
- void *h;
- mem·Allocator heap;
- };
- int dim[2];
- double *d;
-} math·Mtx;
-
-math·Mtx
-math·makemtx(int n, int m, mem·Allocator heap, void *h)
-{
- math·Mtx a;
- a.dim[0] = n;
- a.dim[1] = m;
- a.heap = heap;
- a.h = h;
- a.d = heap.alloc(h, 1, n*m*sizeof(double));
-
- // memset(a.d, 0, n*m*sizeof(double));
-
- return a;
-}
-
-error
-math·freemtx(math·Vec *m)
-{
- if (m->h == nil && m->heap.alloc == nil && m->heap.free == nil) {
- errorf("attempting to free a matrix that doesn't own its data");
- return 1;
- }
- m->heap.free(m->h, m->d);
- m->d = nil;
- m->len = 0;
-
- return 0;
-}
-
-/************************************************
- * multiply matrix to vector
- ***********************************************/
-
-/*
- * Notation: (number of rows) x (number of columns) _ unroll factor
- * N => variable we sum over
- */
-static
-void
-mtxvec_kernel4xN_4_avx2(int ncol, double **row, double *x, double *y)
-{
- int c;
- __m128d hr;
- __m256d x256, r256[4];
-
- for (c = 0; c < 4; c++) {
- r256[c] = _mm256_setzero_pd();
- }
-
- for (c = 0; c < ncol; c += 4) {
- x256 = _mm256_loadu_pd(x+c);
- r256[0] += x256 * _mm256_loadu_pd(row[0] + c);
- r256[1] += x256 * _mm256_loadu_pd(row[1] + c);
- r256[2] += x256 * _mm256_loadu_pd(row[2] + c);
- r256[3] += x256 * _mm256_loadu_pd(row[3] + c);
- }
-
- for (c = 0; c < 4; c++) {
- hr = _mm_add_pd(_mm256_extractf128_pd(r256[c], 0), _mm256_extractf128_pd(r256[c], 1));
- hr = _mm_hadd_pd(hr, hr);
- y[c] = hr[0];
- }
-}
-
-static
-void
-mtxvec_kernel4xN_4(int ncol, double **row, double *x, double *y)
-{
- int c;
- double res[4];
-
- res[0] = 0.;
- res[1] = 0.;
- res[2] = 0.;
- res[3] = 0.;
-
- for (c = 0; c < ncol; c += 4) {
- res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3];
- res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3];
- res[2] += row[2][c+0]*x[c+0] + row[2][c+1]*x[c+1] + row[2][c+2]*x[c+2] + row[2][c+3]*x[c+3];
- res[3] += row[3][c+0]*x[c+0] + row[3][c+1]*x[c+1] + row[3][c+2]*x[c+2] + row[3][c+3]*x[c+3];
- }
-
- y[0] = res[0];
- y[1] = res[1];
- y[2] = res[2];
- y[3] = res[3];
-}
-
-static
-void
-mtxvec_kernel1xN_4(int ncol, double *row, double *x, double *y)
-{
- int c;
- double res;
-
- res = 0.;
- for (c = 0; c < ncol; c += 4) {
- res += row[c+0]*x[c+0] + row[c+1]*x[c+1] + row[c+2]*x[c+2] + row[c+3]*x[c+3];
- }
-
- y[0] = res;
-}
-
-// y = a*mx + b*y
-error
-math·mtxvec(math·Mtx m, double a, math·Vec x, double b, math·Vec y)
-{
- int c, r, nrow, ncol;
- double *row[4], res[4];
-
- nrow = m.dim[0] & ~3;
- ncol = m.dim[1] & ~3;
- for (r = 0; r < nrow; r += 4) {
- row[0] = m.d + (r * (m.dim[1]+0));
- row[1] = m.d + (r * (m.dim[1]+1));
- row[2] = m.d + (r * (m.dim[1]+2));
- row[3] = m.d + (r * (m.dim[1]+3));
-
- mtxvec_kernel4xN_4_avx2(ncol, row, x.d + r, res);
-
- for (c = ncol; c < m.dim[1]; c++) {
- res[0] += row[0][c];
- res[1] += row[1][c];
- res[2] += row[2][c];
- res[3] += row[3][c];
- }
-
- y.d[r+0] = res[0] + b*y.d[r+0];
- y.d[r+1] = res[1] + b*y.d[r+1];
- y.d[r+2] = res[2] + b*y.d[r+2];
- y.d[r+3] = res[3] + b*y.d[r+3];
- }
-
- for (; r < m.dim[0]; r++) {
- mtxvec_kernel1xN_4(m.dim[0], m.d + (r * m.dim[1]), x.d + r, res);
- y.d[r] = res[0] + b*y.d[r];
- }
-
- return 0;
-}
-
-/************************************************
- * add matrix to vector outerproduct
- ***********************************************/
-
-#define NITER 50
-
-#if 0
-error
-main()
-{
- int i;
- clock_t t;
- double res;
-
- math·Mtx m;
- math·Vec x, y;
-
- openblas_set_num_threads(1);
-
- x = math·makevec(1000, mem·sys, nil);
- y = math·makevec(1000, mem·sys, nil);
- m = math·makemtx(1000, 1000, mem·sys, nil);
-
- for (i = 0; i < x.len; i++) {
- y.d[i] = i;
- }
-
- t = clock();
- for (i = 0; i < NITER; i++) {
- cblas_dgemv(CblasRowMajor, CblasNoTrans, m.dim[0], m.dim[1], 1.5, m.d, m.dim[1], x.d, 1, 2.5, y.d, 1);
- }
- t = clock() - t;
- res = math·dot(y, y);
- printf("the result is %f\n", res);
- printf("time elapsed (blas): %fms\n", 1000.*t/CLOCKS_PER_SEC);
-
- for (i = 0; i < x.len; i++) {
- y.d[i] = i;
- }
-
- t = clock();
- for (i = 0; i < NITER; i++) {
- math·mtxvec(m, 1.5, x, 2.5, y);
- }
- t = clock() - t;
- res = math·dot(y, y);
-
- printf("the dot product is %f\n", res);
- printf("time elapsed (naive): %fms\n", 1000.*t/CLOCKS_PER_SEC);
-
-
- return 0;
-}
-#endif