From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- sys/libmath/basic.c | 531 -------------------------------------------------- sys/libmath/blas.c | 63 ------ sys/libmath/blas1.c | 58 ------ sys/libmath/blas1body | 215 -------------------- sys/libmath/blas2.c | 222 --------------------- sys/libmath/blas2body | 256 ------------------------ sys/libmath/blas3.c | 279 -------------------------- sys/libmath/lapack.c | 0 sys/libmath/linalg.c | 63 ------ sys/libmath/loop.h | 114 ----------- sys/libmath/matrix.c | 176 ----------------- sys/libmath/rules.mk | 24 --- sys/libmath/test.c | 471 -------------------------------------------- 13 files changed, 2472 deletions(-) delete mode 100644 sys/libmath/basic.c delete mode 100644 sys/libmath/blas.c delete mode 100644 sys/libmath/blas1.c delete mode 100644 sys/libmath/blas1body delete mode 100644 sys/libmath/blas2.c delete mode 100644 sys/libmath/blas2body delete mode 100644 sys/libmath/blas3.c delete mode 100644 sys/libmath/lapack.c delete mode 100644 sys/libmath/linalg.c delete mode 100644 sys/libmath/loop.h delete mode 100644 sys/libmath/matrix.c delete mode 100644 sys/libmath/rules.mk delete mode 100644 sys/libmath/test.c (limited to 'sys/libmath') diff --git a/sys/libmath/basic.c b/sys/libmath/basic.c deleted file mode 100644 index 1341f7b..0000000 --- a/sys/libmath/basic.c +++ /dev/null @@ -1,531 +0,0 @@ -#include -#include -#include - -#include - -// TODO(nnoll): Replace implementations with your own. - -double -math·acos(double x) -{ - return acos(x); -} - -float -math·acosf(float x) -{ - return acosf(x); -} - - -double -math·acosh(double x) -{ - return acosh(x); -} - -float -math·acoshf(float x) -{ - return acoshf(x); -} - - -double -math·asin(double x) -{ - return asin(x); -} - -float -math·asinf(float x) -{ - return asinf(x); -} - - -double -math·asinh(double x) -{ - return asinh(x); -} - -float -math·asinhf(float x) -{ - return asinhf(x); -} - - -double -math·atan(double x) -{ - return atan(x); -} - -float -math·atanf(float x) -{ - return atanf(x); -} - - -double -math·atan2(double x, double y) -{ - return atan2(x, y); -} - -float -math·atan2f(float x, float y) -{ - return atan2f(x, y); -} - -double -math·atanh(double x) -{ - return atanh(x); -} - -float -math·atanhf(float x) -{ - return atanhf(x); -} - - -double -math·cbrt(double x) -{ - return cbrt(x); -} - -float -math·cbrtf(float x) -{ - return cbrtf(x); -} - - -double -math·ceil(double x) -{ - return ceil(x); -} - -float -math·ceilf(float x) -{ - return ceilf(x); -} - -double -math·cos(double x) -{ - return cos(x); -} - -float -math·cosf(float x) -{ - return cosf(x); -} - - -double -math·cosh(double x) -{ - return cosh(x); -} - -float -math·coshf(float x) -{ - return coshf(x); -} - - -double -math·erf(double x) -{ - return erf(x); -} - -float -math·erff(float x) -{ - return erff(x); -} - - -double -math·erfc(double x) -{ - return erfc(x); -} - -float -math·erfcf(float x) -{ - return erfcf(x); -} - - -double -math·exp(double x) -{ - return exp(x); -} - -float -math·expf(float x) -{ - return expf(x); -} - - -double -math·exp2(double x) -{ - return exp2(x); -} - -float -math·exp2f(float x) -{ - return exp2f(x); -} - - -double -math·expm1(double x) -{ - return expm1(x); -} - -float -math·expm1f(float x) -{ - return expm1f(x); -} - - -double -math·floor(double x) -{ - return floor(x); -} - -float -math·floorf(float x) -{ - return floorf(x); -} - - -int -math·ilogb(double x) -{ - return ilogb(x); -} - -int -math·ilogbf(float x) -{ - return ilogbf(x); -} - -double -math·lgamma(double x) -{ - return lgamma(x); -} - -float -math·lgammaf(float x) -{ - return lgammaf(x); -} - - -vlong -math·llrint(double x) -{ - return math·llrint(x); -} - -vlong -math·llrintf(float x) -{ - return math·llrintf(x); -} - - -vlong -math·llround(double x) -{ - return llround(x); -} - -vlong -math·llroundf(float x) -{ - return llroundf(x); -} - - -double -math·log(double x) -{ - return log(x); -} - -float -math·logf(float x) -{ - return logf(x); -} - - -double -math·log10(double x) -{ - return log10(x); -} - -float -math·log10f(float x) -{ - return log10f(x); -} - - -double -math·log1p(double x) -{ - return log1p(x); -} - -float -math·log1pf(float x) -{ - return log1pf(x); -} - - -double -math·log2(double x) -{ - return log2(x); -} - -float -math·log2f(float x) -{ - return log2f(x); -} - - -double -math·logb(double x) -{ - return logb(x); -} - -float -math·logbf(float x) -{ - return logbf(x); -} - - -long -math·lrint(double x) -{ - return lrint(x); -} - -long -math·lrintf(float x) -{ - return lrintf(x); -} - - -long -math·lround(double x) -{ - return lround(x); -} - -long -math·lroundf(float x) -{ - return lroundf(x); -} - - -double math·modf(double, double *); -float math·modff(float, float *); - -double -math·nan(const char * x) -{ - return nan(x); -} - -float -math·nanf(const char * x) -{ - return nanf(x); -} - - -double -math·nearbyint(double x) -{ - return nearbyint(x); -} - -float -math·nearbyintf(float x) -{ - return nearbyintf(x); -} - - -double -math·pow(double x, double exp) -{ - return pow(x, exp); -} - -float -math·powf(float x, float exp) -{ - return powf(x, exp); -} - -double -math·rint(double x) -{ - return rint(x); -} - -float -math·rintf(float x) -{ - return rintf(x); -} - - -double -math·round(double x) -{ - return round(x); -} - -float -math·roundf(float x) -{ - return roundf(x); -} - - -double math·scalbln(double, long); -float math·scalblnf(float, long); - -double math·scalbn(double, int); -float math·scalbnf(float, int); - -double -math·sin(double x) -{ - return sin(x); -} - -float -math·sinf(float x) -{ - return sinf(x); -} - - -double -math·sinh(double x) -{ - return sinh(x); -} - -float -math·sinhf(float x) -{ - return sinhf(x); -} - - -double -math·sqrt(double x) -{ - return sqrt(x); -} - -float -math·sqrtf(float x) -{ - return sqrtf(x); -} - - -double -math·tan(double x) -{ - return tan(x); -} - -float -math·tanf(float x) -{ - return tanf(x); -} - - -double -math·tanh(double x) -{ - return tanh(x); -} - -float -math·tanhf(float x) -{ - return tanhf(x); -} - - -double -math·tgamma(double x) -{ - return tgamma(x); -} - -float -math·tgammaf(float x) -{ - return tgammaf(x); -} - - -double -math·trunc(double x) -{ - return trunc(x); -} - -float -math·truncf(float x) -{ - return truncf(x); -} diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c deleted file mode 100644 index 18f9760..0000000 --- a/sys/libmath/blas.c +++ /dev/null @@ -1,63 +0,0 @@ -#include -#include -#include -#include -#include - -/* #include */ - -#define NCOL 2*512 -#define NROW 2*512 -#define NSUM 2*512 -#define NIT 10 -#define INC 1 -error -main() -{ - int i, j, nit; - double *x, *y, *z, *w, res[2]; - - clock_t t; - double tprof[2] = { 0 }; - - rng·init(0); - - x = malloc(sizeof(*x)*NROW*NCOL); - y = malloc(sizeof(*x)*NROW*NCOL); - z = malloc(sizeof(*x)*NROW*NCOL); - w = malloc(sizeof(*x)*NROW*NCOL); - -#define DO_0 t = clock(); \ - blas·dgemm(0,0,NROW,NCOL,NSUM,10.1,x,NROW,y,NROW,1.2,z,NROW);\ - t = clock() - t; \ - res[0] += blas·dasum(NROW*NCOL,z,INC); \ - tprof[0] += 1000.*t/CLOCKS_PER_SEC; \ - -#define DO_1 t = clock(); \ - cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,NROW,NCOL,NSUM,10.1,x,NROW,y,NROW,1.2,w,NROW);\ - t = clock() - t; \ - res[1] += cblas_dasum(NROW*NCOL,w,INC); \ - tprof[1] += 1000.*t/CLOCKS_PER_SEC; - - for (nit = 0; nit < NIT; nit++) { - for (i = 0; i < NROW; i++) { - for (j = 0; j < NCOL; j++) { - x[j + NROW*i] = rng·random(); - y[j + NROW*i] = rng·random(); - z[j + NROW*i] = rng·random(); - w[j + NROW*i] = z[j + NROW*i]; - } - } - - switch (nit % 2) { - case 0: DO_0; DO_1; break; - case 1: DO_1; DO_0; break; - } - } - printf("mean time/iteration (mine): %fms\n", tprof[0]/NIT); - printf("--> result (mine): %f\n", res[0]); - printf("mean time/iteration (openblas): %fms\n", tprof[1]/NIT); - printf("--> result (openblas): %f\n", res[1]); - - return 0; -} diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c deleted file mode 100644 index a8ca085..0000000 --- a/sys/libmath/blas1.c +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include - -// ----------------------------------------------------------------------- -// Templates - -#include "loop.h" -#define BODY_XY() \ - LOOP(UNROLL, 0, INIT); \ - n = ROUNDBY(len, UNROLL); \ - if (incx == 1 && incy == 1) { \ - for (i = 0; i < n; i+=UNROLL) { \ - LOOP(UNROLL,0,KERNEL,1,1); \ - } \ - } else { \ - for (i = 0; i < n; i+=UNROLL) { \ - LOOP(UNROLL,0,KERNEL,incx,incy);\ - } \ - } \ - \ - for (; i < len; i++) { \ - LOOP(1,0,KERNEL,incx,incy); \ - } - -#define BODY_X() \ - LOOP(UNROLL, 0, INIT); \ - n = ROUNDBY(len, UNROLL); \ - if (incx == 1) { \ - for (i = 0; i < n; i+=UNROLL) { \ - LOOP(UNROLL,0,KERNEL,1); \ - } \ - } else { \ - for (i = 0; i < n; i+=UNROLL) { \ - LOOP(UNROLL,0,KERNEL,incx); \ - } \ - } \ - \ - for (; i < len; i++) { \ - LOOP(1,0,KERNEL,incx); \ - } - -// ----------------------------------------------------------------------- -// Implementation - -#define UNROLL 8 -#define INT int - -#define FLOAT double -#define func(name) blas·d##name -#include "blas1body" - -#undef FLOAT -#undef func - -#define FLOAT float -#define func(name) blas·f##name -#include "blas1body" -#undef FLOAT diff --git a/sys/libmath/blas1body b/sys/libmath/blas1body deleted file mode 100644 index de4b637..0000000 --- a/sys/libmath/blas1body +++ /dev/null @@ -1,215 +0,0 @@ -/* vim: set ft=c */ -// ----------------------------------------------------------------------- -// Function implementations - -FLOAT -func(dot)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy) -{ -#define INIT(I,...) reg[I] = 0; -#define KERNEL(I, INCX, INCY) reg[I] += x[(INCX)*(i + I)] * y[(INCY)*(i + I)]; - INT i, n; - FLOAT reg[UNROLL]; - - BODY_XY() - - for (i = 1; i < UNROLL; i++) - reg[0] += reg[i]; - - return reg[0]; -#undef INIT -#undef KERNEL -} - -void -func(copy)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy) -{ -#define INIT(I,...) -#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] = x[(INCX)*(i + I)]; - INT i, n; - - BODY_XY(); - -#undef INIT -#undef KERNEL -} - -void -func(swap)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy) -{ -#define INIT(I,...) -#define KERNEL(I, INCX, INCY) tmp[I] = x[(INCX)*(i + I)], x[(INCX)*(i + I)] = y[(INCY)*(i + I)], y[(INCY)*(i + I)] = tmp[I]; - INT i, n; - FLOAT tmp[UNROLL]; - - BODY_XY(); - -#undef INIT -#undef KERNEL -} - -void -func(axpy)(INT len, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy) -{ -#define INIT(I,...) -#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] += a*x[(INCX)*(i + I)]; - INT i, n; - - BODY_XY(); - -#undef INIT -#undef KERNEL -} - -void -func(axpby)(INT len, FLOAT a, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy) -{ -#define INIT(I,...) -#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] = a*x[(INCX)*(i + I)] + b*y[(INCY)*(i + I)]; - INT i, n; - - BODY_XY(); - -#undef INIT -#undef KERNEL -} - -INT -func(argmax)(INT len, FLOAT *x, INT incx) -{ -#define INIT(I,...) max[I] = x[I], idx[I] = I; -#define KERNEL(I, INCX) if (x[(INCX)*(i+I)] > max[I]) {max[i] = x[INCX*(i+I)]; idx[I] = i+I;} - INT i, n; - FLOAT max[UNROLL]; - INT idx[UNROLL]; - - BODY_X(); - - for (i = 1; i < UNROLL; i++) - if (max[i] > max[0]) - idx[0] = idx[i]; - - return idx[0]; -#undef INIT -#undef KERNEL -} - -INT -func(argmin)(INT len, FLOAT *x, INT incx) -{ -#define INIT(I,...) min[I] = x[I], idx[I] = I; -#define KERNEL(I, INCX) if (x[INCX*(i+I)] < min[I]) {min[i] = x[INCX*(i+I)]; idx[I] = i+I;} - INT i, n; - FLOAT min[UNROLL]; - INT idx[UNROLL]; - - BODY_X(); - - for (i = 1; i < UNROLL; i++) - if (min[i] < min[0]) - idx[0] = idx[i]; - - return idx[0]; -#undef INIT -#undef KERNEL -} - -FLOAT -func(asum)(INT len, FLOAT *x, INT incx) -{ -#define INIT(I,...) sum[I] = 0; -#define KERNEL(I, INCX) sum[I] += x[INCX*(i+I)] > 0 ? x[INCX*(i+I)] : -x[INCX*(i+I)]; - INT i, n; - FLOAT sum[UNROLL]; - - BODY_X(); - - for (i = 1; i < UNROLL; i++) - sum[0] += sum[i]; - - return sum[0]; - -#undef INIT -#undef KERNEL -} - -void -func(scale)(INT len, FLOAT a, FLOAT *x, INT incx) -{ -#define INIT(I, ...) -#define KERNEL(I, INCX) x[INCX*(i+I)] *= a; - INT i, n; - - BODY_X(); - -#undef INIT -#undef KERNEL -} - -FLOAT -func(norm)(INT len, FLOAT *x, INT incx) -{ -#define INIT(I, ...) -#define KERNEL(I, INCX) norm[I] += x[INCX*(i+I)] * x[INCX*(i+I)]; - INT i, n; - FLOAT norm[UNROLL]; - - BODY_X(); - - for (i = 1; i < UNROLL; i++) - norm[0] += norm[i]; - - return math·sqrt(norm[0]); - -#undef INIT -#undef KERNEL -} - -void -func(drot)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT cos, FLOAT sin) -{ -#define INIT(I, ...) -#define KERNEL(I, INCX, INCY) tmp[I] = x[INCX*(i+I)], x[INCX*(i+I)] = cos*x[INCX*(i+I)] + sin*y[INCY*(i+I)], y[INCY*(i+I)] = cos*y[INCY*(i+I)] - sin*tmp[I]; - INT i, n; - FLOAT tmp[UNROLL]; - - BODY_XY(); - -#undef INIT -#undef KERNEL -} - -void -func(rotm)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT H[5]) -{ -#define INIT(I, ...) -#define KERNEL(I, INCX, INCY) tmp[I] = x[INCX*(i+I)], x[INCX*(i+I)] = H[1]*x[INCX*(i+I)] + H[2]*y[INCY*(i+I)], y[INCY*(i+I)] = H[3]*tmp[I] + H[4]*y[INCY*(i+I)]; - INT i, n, f; - FLOAT tmp[UNROLL]; - - f = (INT)H[0]; - switch (f) { - case -2: - H[1] = +1; - H[2] = +0; - H[3] = +0; - H[4] = +1; - break; - case -1: - break; - case +0: - H[1] = +1; - H[4] = +1; - break; - case +1: - H[2] = +1; - H[3] = -1; - break; - default: - return; - } - - BODY_XY(); - -#undef INIT -#undef KERNEL -} diff --git a/sys/libmath/blas2.c b/sys/libmath/blas2.c deleted file mode 100644 index 7e4b08e..0000000 --- a/sys/libmath/blas2.c +++ /dev/null @@ -1,222 +0,0 @@ -#include -#include -#include "loop.h" - -// ----------------------------------------------------------------------- -// Templates - -#define BODY_RECT() \ - nr = ROUNDBY(nrow, UNROW); \ - nc = ROUNDBY(ncol, UNCOL); \ - if (incx == 1 && incy == 1) { \ - for (r = 0; r < nr; r += UNROW) { \ - LOOP(UNROW,0,INIT,1,1); \ - for (c = 0; c < nc; c += UNCOL) { \ - LOOP(UNROW,0,KERN,1,1,UNCOL); \ - } \ - for (; c < ncol; c++) { \ - LOOP(UNROW,0,KERN,1,1,1); \ - } \ - LOOP(UNROW,0,FINI,1,1); \ - } \ - } else { \ - for (r = 0; r < nr; r += UNROW) { \ - LOOP(UNROW,0,INIT,incx,incy); \ - for (c = 0; c < nc; c += UNCOL) { \ - LOOP(UNROW,0,KERN,incx,incy,UNCOL); \ - } \ - for (; c < ncol; c++) { \ - LOOP(UNROW,0,KERN,incx,incy,1); \ - } \ - LOOP(UNROW,0,FINI,incx,incy); \ - } \ - } \ - \ - for (; r < nrow; r++) { \ - LOOP(1,0,INIT,incx,incy); \ - for (c = 0; c < nc; c += UNCOL) { \ - LOOP(1,0,KERN,incx,incy,UNCOL); \ - } \ - for (; c < ncol; c++) { \ - LOOP(1,0,KERN,incx,incy,1); \ - } \ - LOOP(1,0,FINI,incx,incy); \ - } - -#define BODY_LOTRI() \ - nr = ROUNDBY(n, UNROW); \ - if (incx == 1) { \ - for (r = 0; r < nr; r += UNROW) { \ - LOOP(UNROW,0,INIT,1); \ - nc = ROUNDBY(r, UNCOL); \ - for (c = 0; c < nc; c += UNCOL) { \ - LOOP(UNROW,0,KERN,1,UNCOL); \ - } \ - for (; c < r; c++) { \ - LOOP(UNROW,0,KERN,1,1); \ - } \ - LOOP(UNROW,0,FINI,1); \ - } \ - } else { \ - for (r = 0; r < nr; r += UNROW) { \ - LOOP(UNROW,0,INIT,incx); \ - nc = ROUNDBY(r, UNCOL); \ - for (c = 0; c < nc; c += UNCOL) { \ - LOOP(UNROW,0,KERN,incx,UNCOL); \ - } \ - for (; c < r; c++) { \ - LOOP(UNROW,0,KERN,incx,1); \ - } \ - LOOP(UNROW,0,FINI,incx); \ - } \ - } \ - \ - for (; r < n; r++) { \ - LOOP(1,0,INIT,incx); \ - nc = ROUNDBY(r, UNCOL); \ - for (c = 0; c < nc; c += UNCOL) { \ - LOOP(1,0,KERN,incx,UNCOL); \ - } \ - for (; c < r; c++) { \ - LOOP(1,0,KERN,incx,1); \ - } \ - LOOP(1,0,FINI,incx); \ - } - -#define BODY_UPTRI() \ - nr = n - ROUNDBY(n, UNROW); \ - if (incx == 1) { \ - for (r = n-1; r >= nr; r -= UNROW) { \ - LOOP(UNROW,0,INIT,1); \ - nc = n - ROUNDBY(r, UNCOL); \ - for (c = n-1; c >= nc; c -= UNCOL) { \ - LOOP(UNROW,0,KERN,1,UNCOL); \ - } \ - for (; c > r; c--) { \ - LOOP(UNROW,0,KERN,1,1); \ - } \ - LOOP(UNROW,0,FINI,1); \ - } \ - } else { \ - for (r = n-1; r >= nr; r -= UNROW) { \ - LOOP(UNROW,0,INIT,incx); \ - nc = n - ROUNDBY(r, UNCOL); \ - for (c = n-1; c >= nc; c -= UNCOL) { \ - LOOP(UNROW,0,KERN,incx,UNCOL); \ - } \ - for (; c > r; c--) { \ - LOOP(UNROW,0,KERN,incx,1); \ - } \ - LOOP(UNROW,0,FINI,incx); \ - } \ - } \ - \ - for (; r >= 0; r--) { \ - LOOP(1,0,INIT,incx); \ - nc = n - ROUNDBY(r, UNCOL); \ - for (c = n-1; c >= nc; c -= UNCOL) { \ - LOOP(1,0,KERN,incx,UNCOL); \ - } \ - for (; c > r; c--) { \ - LOOP(1,0,KERN,incx,1); \ - } \ - LOOP(1,0,FINI,incx); \ - } - -#define BODY_LOTRI_XY() \ - nr = ROUNDBY(n, UNROW); \ - if (incx == 1 && incy == 1) { \ - for (r = 0; r < nr; r += UNROW) { \ - LOOP(UNROW,0,INIT,1,1); \ - nc = ROUNDBY(r, UNCOL); \ - for (c = 0; c < nc; c += UNCOL) { \ - LOOP(UNROW,0,KERN,1,1,UNCOL); \ - } \ - for (; c < r; c++) { \ - LOOP(UNROW,0,KERN,1,1,1); \ - } \ - LOOP(UNROW,0,FINI,1,1); \ - } \ - } else { \ - for (r = 0; r < nr; r += UNROW) { \ - LOOP(UNROW,0,INIT,incx,incy); \ - nc = ROUNDBY(r, UNCOL); \ - for (c = 0; c < nc; c += UNCOL) { \ - LOOP(UNROW,0,KERN,incx,incy,UNCOL); \ - } \ - for (; c < r; c++) { \ - LOOP(UNROW,0,KERN,incx,incy,1); \ - } \ - LOOP(UNROW,0,FINI,incx, incy); \ - } \ - } \ - \ - for (; r < n; r++) { \ - LOOP(1,0,INIT,incx,incy); \ - nc = ROUNDBY(r, UNCOL); \ - for (c = 0; c < nc; c += UNCOL) { \ - LOOP(1,0,KERN,incx,incy,UNCOL); \ - } \ - for (; c < r; c++) { \ - LOOP(1,0,KERN,incx,incy,1); \ - } \ - LOOP(1,0,FINI,incx,incy); \ - } - -#define BODY_UPTRI_XY() \ - nr = n - ROUNDBY(n, UNROW); \ - if (incx == 1 && incy == 1) { \ - for (r = n-1; r >= nr; r -= UNROW) { \ - LOOP(UNROW,0,INIT,1,1); \ - nc = n - ROUNDBY(r, UNCOL); \ - for (c = n-1; c >= nc; c -= UNCOL) { \ - LOOP(UNROW,0,KERN,1,1,UNCOL); \ - } \ - for (; c > r; c--) { \ - LOOP(UNROW,0,KERN,1,1,1); \ - } \ - LOOP(UNROW,0,FINI,1,1); \ - } \ - } else { \ - for (r = n-1; r >= nr; r -= UNROW) { \ - LOOP(UNROW,0,INIT,incx,incy); \ - nc = n - ROUNDBY(r, UNCOL); \ - for (c = n-1; c >= nc; c -= UNCOL) { \ - LOOP(UNROW,0,KERN,incx,incy,UNCOL); \ - } \ - for (; c > r; c--) { \ - LOOP(UNROW,0,KERN,incx,incy,1); \ - } \ - LOOP(UNROW,0,FINI,incx,incy); \ - } \ - } \ - \ - for (; r >= 0; r--) { \ - LOOP(1,0,INIT,incx,incy); \ - nc = n - ROUNDBY(r, UNCOL); \ - for (c = n-1; c >= nc; c -= UNCOL) { \ - LOOP(1,0,KERN,incx,incy,UNCOL); \ - } \ - for (; c > r; c--) { \ - LOOP(1,0,KERN,incx,incy,1); \ - } \ - LOOP(1,0,FINI,incx,incy); \ - } - -// ----------------------------------------------------------------------- -// implementation - -#define UNROW 4 -#define UNCOL 4 - -#define INT int -#define FLOAT double -#define func(name) blas·d##name -#include "blas2body" - -#undef FLOAT -#undef func - -#define FLOAT float -#define func(name) blas·f##name -#include "blas2body" diff --git a/sys/libmath/blas2body b/sys/libmath/blas2body deleted file mode 100644 index 45baf67..0000000 --- a/sys/libmath/blas2body +++ /dev/null @@ -1,256 +0,0 @@ -/* general matrix multiply */ -error -func(gemv)(uint flag, INT nrow, INT ncol, FLOAT a, FLOAT *m, INT incm, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy) -{ - INT r, c, nr, nc; - FLOAT *row[UNROW], reg[UNROW]; -#define INIT(I, INCX, INCY) row[I] = m + (r+I)*incm, reg[I] = 0; -#define TERM(J, I, INCX, INCY) row[I][c+J] * x[INCX*(c+J)] -#define KERN(I, INCX, INCY, LENGTH) reg[I] += EXPAND(LENGTH,0,TERM,+,I,INCX,INCY); -#define FINI(I, INCX, INCY) y[INCY*(r+I)] = b*y[INCY*(r+I)] + a*reg[I]; - - if (!flag) { - BODY_RECT(); - } else { - func(scale)(ncol, b, y, incy); -#undef KERN -#undef FINI -#undef INIT -#undef TERM -#define INIT(I, INCX, INCY) row[I] = m + (r+I)*incm, reg[I] = a*x[INCX*(r+I)]; -#define TERM(J, I, INCX, INCY) row[I][c+J] * reg[I] -#define KERN(I, INCX, INCY, LENGTH) y[INCY*(c+I)] += EXPAND(LENGTH,0,TERM,+,I,INCX,INCY); -#define FINI(I, INCX, INCY) - BODY_RECT(); - } - - return 0; -#undef INIT -#undef TERM -#undef KERN -#undef FINI -} - -/* symmetric matrix vector multiply (different layouts) */ -void -func(symv)(uint upper, INT n, FLOAT a, FLOAT *m, INT incm, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy) -{ - INT r, c, nr, nc, i; - FLOAT *row[UNROW], reg1[UNROW], reg2[UNROW]; - -#define INIT(I, INCX, INCY) row[I] = m + (r XX I)*incm, reg1[I] = 0, reg2[I] = 0; -#define TERM1(J, I, INCX, INCY) row[I][c XX J]*x[INCX*(c XX J)] -#define TERM2(J, I, INCX, INCY) row[I][c XX J]*x[INCX*((n-c-1) XX J)] -#define KERN(I, INCX, INCY, REPEAT) reg1[I] += EXPAND(REPEAT,0,TERM1,+,I,INCX,INCY), \ - reg2[I] += EXPAND(REPEAT,0,TERM2,+,I,INCX,INCY); -#define FINI(I, INCX, INCY) y[INCY*(r+I)] += a*(reg1[I] + row[I][r]*x[INCX*r]), \ - y[INCY*(n-r-1+I)] += a*reg2[I]; - - func(scale)(n, b, y, incy); -#define XX + - if (!upper) { - BODY_LOTRI_XY(); - } else { -#undef XX -#define XX - - BODY_UPTRI_XY(); - } -#undef XX - -#undef INIT -} - -void -func(spmv)(uint upper, INT n, FLOAT a, FLOAT *m, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy) -{ - INT r, c, nr, nc, i; - FLOAT *row[UNROW], reg1[UNROW], reg2[UNROW]; - -#define INIT(I, INCX, INCY) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg1[I] = 0, reg2[I] = 0; - - func(scale)(n, b, y, incy); -#define XX + - if (!upper) { - BODY_LOTRI_XY(); - } else { -#undef XX -#undef INIT -#define INIT(I, INCX, INCY) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg1[I] = 0, reg2[I] = 0; -#define XX - - BODY_UPTRI_XY(); - } -#undef XX - -#undef TERM -#undef INIT -#undef KERN -#undef FINI -} - -/* triangular multiply (different layouts) */ -void -func(trmv)(uint upper, INT n, FLOAT *m, INT incm, FLOAT *x, INT incx) -{ - INT r, c, nr, nc, i; - FLOAT *row[UNROW], reg[UNROW]; -#define INIT(I, INCX) row[I] = m + (r XX I)*incm, reg[I] = 0; -#define TERM(J, I, INCX) row[I][c XX J]*x[INCX*(c XX J)] -#define KERN(I, INCX, REPEAT) reg[I] += EXPAND(REPEAT,0,TERM,+,I,INCX); -#define FINI(I, INCX) x[INCX*(r XX I)] = row[I][r XX I]*x[INCX*(r XX I)] + reg[I]; - -#define XX + - if (!upper) { - BODY_LOTRI(); - } else { -#undef XX -#define XX - - BODY_UPTRI(); - } -#undef XX - -#undef INIT -} - -void -func(tpmv)(uint upper, INT n, FLOAT *m, FLOAT *x, INT incx) -{ - INT r, c, nr, nc, i; - FLOAT *row[UNROW], reg[UNROW]; -#define INIT(I, INCX) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg[I] = 0; - -#define XX + - if (!upper) { - BODY_LOTRI(); - } else { -#undef XX -#undef INIT -#define INIT(I, INCX) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg[I] = 0; -#define XX - - BODY_UPTRI(); - } -#undef XX - -#undef INIT -#undef TERM -#undef KERN -#undef FINI -} - -/* triangular solve (different layouts) */ -void -func(trsv)(uint upper, INT n, FLOAT *m, INT incm, FLOAT *x, INT incx) -{ - INT r, c, nr, nc, i; - FLOAT *row[UNROW], reg[UNROW]; -#define INIT(I, INCX) row[I] = m + (r XX I)*incm, reg[I] = 0; -#define TERM(J, I, INCX) row[I][c XX J]*x[c XX J] -#define KERN(I, INCX, REPEAT) reg[I] += EXPAND(REPEAT,0,TERM,+,I,INCX); -#define SOLN(J, I, INCX) reg[J] += row[I][r XX I]*x[INCX*(r XX I)] -#define FINI(I, INCX) x[INCX*(r XX I)] = reg[I] / row[I][r XX I]; EXPAND_TRI(UNROW,INC(I),SOLN,;,I,INCX); - -#define XX + - if (!upper) { - BODY_LOTRI(); - } else { -#undef XX -#define XX - - BODY_UPTRI(); - } -#undef XX - -#undef INIT -} - -void -func(tpsv)(uint upper, INT n, FLOAT *m, FLOAT *x, INT incx) -{ - INT r, c, nr, nc, i; - FLOAT *row[UNROW], reg[UNROW]; -#define INIT(I, INCX) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg[I] = 0; - -#define XX + - if (!upper) { - BODY_LOTRI(); - } else { -#undef XX -#undef INIT -#define INIT(I, INCX) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg[I] = 0; -#define XX - - BODY_UPTRI(); - } -#undef XX - -#undef INIT -#undef TERM -#undef KERN -#undef SOLN -#undef FINI -} - -/* rank 1 update */ -void -func(ger)(INT nrow, INT ncol, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT *m, INT incm) -{ - INT r, c, nr, nc; - FLOAT *row[UNROW], reg[UNROW]; -#define INIT(I, INCX, INCY) row[I] = m + (r+I)*incm, reg[I] = a*x[INCX*(r+I)]; -#define TERM(J, I, INCX, INCY) row[I][c+J] += reg[I] * y[INCY*(c+J)] -#define KERN(I, INCX, INCY, LENGTH) EXPAND(LENGTH,0,TERM,;,I,INCX, INCY); -#define FINI(I, ...) - - BODY_RECT(); - -#undef INIT -#undef TERM -#undef KERN -#undef FINI -} - -/* symmetric rank 1 update (different memory layouts) */ -void -func(syr)(uint upper, INT n, FLOAT a, FLOAT *x, INT incx, FLOAT *m, INT incm) -{ - INT r, c, nr, nc; - FLOAT *row[UNROW], reg[UNROW]; -#define INIT(I, INCX) row[I] = m + (r XX I)*incm, reg[I] = a*x[INCX*(r XX I)]; -#define TERM(J, I, INCX) row[I][c XX J] += reg[I] * x[INCX*(c XX J)] -#define KERN(I, INCX, LENGTH) EXPAND(LENGTH,0,TERM,;,I,INCX); -#define FINI(I, ...) - -#define XX + - if (!upper) { - BODY_LOTRI(); - } else { -#undef XX -#define XX - - BODY_UPTRI(); - } -#undef XX - -#undef INIT -} - -void -func(spr)(uint upper, INT n, FLOAT a, FLOAT *x, INT incx, FLOAT *m) -{ - INT r, c, nr, nc; - FLOAT *row[UNROW], reg[UNROW]; -#define INIT(I, INCX) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg[I] = 0; - -#define XX + - if (!upper) { - BODY_LOTRI(); - } else { -#undef XX -#undef INIT -#define INIT(I, INCX) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg[I] = 0; -#define XX - - BODY_UPTRI(); - } -#undef XX - -#undef INIT -#undef TERM -#undef KERN -#undef FINI -} diff --git a/sys/libmath/blas3.c b/sys/libmath/blas3.c deleted file mode 100644 index b048c95..0000000 --- a/sys/libmath/blas3.c +++ /dev/null @@ -1,279 +0,0 @@ -#include -#include -#include - -#define INT int -#define FLOAT double -#define func(name) blas·d##name - -#define X(i, j) x[j + incx*(i)] -#define Y(i, j) y[j + incy*(i)] -#define Z(i, j) z[j + incz*(i)] - -void -func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) -{ - INT jj, jb, kk, kb, dk, i, j, k, end; - FLOAT r0[8], r1[8], r2[8], r3[8], pf; - - for (i = 0; i < ni; i++) { - for (j = 0; j < nj; j++) { - Z(i,j) *= b; - } - } - - jb = MIN(256, nj); - kb = MIN(48, nk); - for (jj = 0; jj < nj; jj += jb) { - for (kk = 0; kk < nk; kk += kb) { - for (i = 0; i < ni; i += 4) { - for (j = jj; j < jj + jb; j += 8) { - r0[0] = Z(i+0, j+0); r0[1] = Z(i+0, j+1); r0[2] = Z(i+0, j+2); r0[3] = Z(i+0, j+3); - r1[0] = Z(i+1, j+0); r1[1] = Z(i+1, j+1); r1[2] = Z(i+1, j+2); r1[3] = Z(i+1, j+3); - r2[0] = Z(i+2, j+0); r2[1] = Z(i+2, j+1); r2[2] = Z(i+2, j+2); r2[3] = Z(i+2, j+3); - r3[0] = Z(i+3, j+0); r3[1] = Z(i+3, j+1); r3[2] = Z(i+3, j+2); r3[3] = Z(i+3, j+3); - end = MIN(nk, kk+kb); - for (k = kk; k < end; k++) { - pf = a * X(i, k); - r0[0] += pf * Y(k, j+0); r0[1] += pf * Y(k, j+1); r0[2] += pf * Y(k, j+2); r0[3] += pf * Y(k, j+3); - - pf = a * X(i+1, k); - r1[0] += pf * Y(k, j+0); r1[1] += pf * Y(k, j+1); r1[2] += pf * Y(k, j+2); r1[3] += pf * Y(k, j+3); - - pf = a * X(i+2, k); - r1[0] += pf * Y(k, j+0); r1[1] += pf * Y(k, j+1); r1[2] += pf * Y(k, j+2); r1[3] += pf * Y(k, j+3); - - pf = a * X(i+3, k); - r1[0] += pf * Y(k, j+0); r1[1] += pf * Y(k, j+1); r1[2] += pf * Y(k, j+2); r1[3] += pf * Y(k, j+3); - } - Z(i+0, j+0) = r0[0]; Z(i+0, j+1) = r0[1]; Z(i+0, j+2) = r0[2]; Z(i+0, j+3) = r0[3]; - Z(i+1, j+0) = r1[0]; Z(i+1, j+1) = r1[1]; Z(i+1, j+2) = r1[2]; Z(i+1, j+3) = r1[3]; - Z(i+2, j+0) = r2[0]; Z(i+2, j+1) = r2[1]; Z(i+2, j+2) = r2[2]; Z(i+2, j+3) = r2[3]; - Z(i+3, j+0) = r3[0]; Z(i+3, j+1) = r3[1]; Z(i+3, j+2) = r3[2]; Z(i+3, j+3) = r3[3]; - } - } - } - } -} - -#if 0 -void -func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) -{ - int i, j, k; - FLOAT w[nj*nk], acc[4][4]; - - for (i = 0; i < ni; i++) { - for (j = 0; j < nj; j++) { - Z(i,j) *= b; - W(i,j) = Y(j,i); - } - } - - for (i = 0; i < ni; i+=4) { - for (j = 0; j < nj; j+=4) { - memset(acc, 0, sizeof(acc)); - for (k = 0; k < nk; k+=4) { - acc[0][0] += X(i+0,k)*W(j+0,k) + X(i+0,k+1)*W(j+0,k+1) + X(i+0,k+2)*W(j+0,k+2) + X(i+0,k+3)*W(j+0,k+3); - acc[0][1] += X(i+0,k)*W(j+1,k) + X(i+0,k+1)*W(j+1,k+1) + X(i+0,k+2)*W(j+1,k+2) + X(i+0,k+3)*W(j+1,k+3); - acc[0][2] += X(i+0,k)*W(j+2,k) + X(i+0,k+1)*W(j+2,k+1) + X(i+0,k+2)*W(j+2,k+2) + X(i+0,k+3)*W(j+2,k+3); - acc[0][3] += X(i+0,k)*W(j+3,k) + X(i+0,k+1)*W(j+3,k+1) + X(i+0,k+2)*W(j+3,k+2) + X(i+0,k+3)*W(j+3,k+3); - - acc[1][0] += X(i+1,k)*W(j+0,k) + X(i+1,k+1)*W(j+0,k+1) + X(i+1,k+2)*W(j+0,k+2) + X(i+1,k+3)*W(j+0,k+3); - acc[1][1] += X(i+1,k)*W(j+1,k) + X(i+1,k+1)*W(j+1,k+1) + X(i+1,k+2)*W(j+1,k+2) + X(i+1,k+3)*W(j+1,k+3); - acc[1][2] += X(i+1,k)*W(j+2,k) + X(i+1,k+1)*W(j+2,k+1) + X(i+1,k+2)*W(j+2,k+2) + X(i+1,k+3)*W(j+2,k+3); - acc[1][3] += X(i+1,k)*W(j+3,k) + X(i+1,k+1)*W(j+3,k+1) + X(i+1,k+2)*W(j+3,k+2) + X(i+1,k+3)*W(j+3,k+3); - - acc[2][0] += X(i+2,k)*W(j+0,k) + X(i+2,k+1)*W(j+0,k+1) + X(i+2,k+2)*W(j+0,k+2) + X(i+2,k+3)*W(j+0,k+3); - acc[2][1] += X(i+2,k)*W(j+1,k) + X(i+2,k+1)*W(j+1,k+1) + X(i+2,k+2)*W(j+1,k+2) + X(i+2,k+3)*W(j+1,k+3); - acc[2][2] += X(i+2,k)*W(j+2,k) + X(i+2,k+1)*W(j+2,k+1) + X(i+2,k+2)*W(j+2,k+2) + X(i+2,k+3)*W(j+2,k+3); - acc[2][3] += X(i+2,k)*W(j+3,k) + X(i+2,k+1)*W(j+3,k+1) + X(i+2,k+2)*W(j+3,k+2) + X(i+2,k+3)*W(j+3,k+3); - - acc[2][0] += X(i+3,k)*W(j+0,k) + X(i+3,k+1)*W(j+0,k+1) + X(i+3,k+2)*W(j+0,k+2) + X(i+3,k+3)*W(j+0,k+3); - acc[2][1] += X(i+3,k)*W(j+1,k) + X(i+3,k+1)*W(j+1,k+1) + X(i+3,k+2)*W(j+1,k+2) + X(i+3,k+3)*W(j+1,k+3); - acc[2][2] += X(i+3,k)*W(j+2,k) + X(i+3,k+1)*W(j+2,k+1) + X(i+3,k+2)*W(j+2,k+2) + X(i+3,k+3)*W(j+2,k+3); - acc[2][3] += X(i+3,k)*W(j+3,k) + X(i+3,k+1)*W(j+3,k+1) + X(i+3,k+2)*W(j+3,k+2) + X(i+3,k+3)*W(j+3,k+3); - // Z(i,j) += X(i,k)*Y(k,j); - } - Z(i+0,j+1) = a*acc[0][0]; - Z(i+0,j+2) = a*acc[0][1]; - Z(i+0,j+3) = a*acc[0][2]; - Z(i+0,j+4) = a*acc[0][3]; - - Z(i+1,j+1) = a*acc[1][0]; - Z(i+1,j+2) = a*acc[1][1]; - Z(i+1,j+3) = a*acc[1][2]; - Z(i+1,j+4) = a*acc[1][3]; - - Z(i+2,j+1) = a*acc[2][0]; - Z(i+2,j+2) = a*acc[2][1]; - Z(i+2,j+3) = a*acc[2][2]; - Z(i+2,j+4) = a*acc[2][3]; - - Z(i+3,j+1) = a*acc[3][0]; - Z(i+3,j+2) = a*acc[3][1]; - Z(i+3,j+3) = a*acc[3][2]; - Z(i+3,j+4) = a*acc[3][3]; - } - } -} -#endif - -#if 0 -void -func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) -{ - int i, j, k, ri, rj, rk; - FLOAT reg[4][4], *xrow[4], *yrow[4]; - - for (i = 0; i < ni; i++) { - for (j = 0; j < nj; j++) { - z[j + incz*i] *= b; - } - } - - for (i = 0; i < ni; i += 4) { - xrow[0] = x + incx*(i+0); - xrow[1] = x + incx*(i+1); - xrow[2] = x + incx*(i+2); - xrow[3] = x + incx*(i+3); - for (k = 0; k < nk; k+=4) { - yrow[0] = y + incy*(k+0); - yrow[1] = y + incy*(k+1); - yrow[2] = y + incy*(k+2); - yrow[3] = y + incy*(k+3); - reg[0][0] = a * xrow[0][k+0]; reg[0][1] = a * xrow[0][k+1]; reg[0][2] = a * xrow[0][k+2]; reg[0][3] = a * xrow[0][k+3]; - reg[1][0] = a * xrow[1][k+0]; reg[1][1] = a * xrow[1][k+1]; reg[1][2] = a * xrow[1][k+2]; reg[1][3] = a * xrow[1][k+3]; - reg[2][0] = a * xrow[2][k+0]; reg[2][1] = a * xrow[2][k+1]; reg[2][2] = a * xrow[2][k+2]; reg[2][3] = a * xrow[2][k+3]; - reg[3][0] = a * xrow[3][k+0]; reg[3][1] = a * xrow[3][k+1]; reg[3][2] = a * xrow[3][k+2]; reg[3][3] = a * xrow[3][k+3]; - for (j = 0; j < nj; j += 1) { - z[j + incz*(i+0)] += (reg[0][0]*yrow[0][j]+reg[0][1]*yrow[1][j]+reg[0][2]*yrow[2][j]+reg[0][3]*yrow[3][j]); - z[j + incz*(i+1)] += (reg[1][0]*yrow[0][j]+reg[1][1]*yrow[1][j]+reg[1][2]*yrow[2][j]+reg[1][3]*yrow[3][j]); - z[j + incz*(i+2)] += (reg[2][0]*yrow[0][j]+reg[2][1]*yrow[1][j]+reg[2][2]*yrow[2][j]+reg[2][3]*yrow[3][j]); - z[j + incz*(i+3)] += (reg[3][0]*yrow[0][j]+reg[3][1]*yrow[1][j]+reg[3][2]*yrow[2][j]+reg[3][3]*yrow[3][j]); - } - } - } -} -#endif - -#if 0 -void -func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) -{ - int i, j, k, ri, rj, rk; - FLOAT r[4][4], *row[4]; - - for (i = 0; i < ni; i++) { - for (j = 0; j < nj; j++) { - Z(i, j) *= b; - } - } - - for (i = 0; i < ni; i+=4) { - for (j = 0; j < nj; j+=4) { - r[0][0] = 0; r[0][1] = 0; r[0][2] = 0; r[0][3] = 0; - r[1][0] = 0; r[1][1] = 0; r[1][2] = 0; r[1][3] = 0; - r[2][0] = 0; r[2][1] = 0; r[2][2] = 0; r[2][3] = 0; - r[3][0] = 0; r[3][1] = 0; r[3][2] = 0; r[3][3] = 0; - row[0] = &X(i+0, 0); - row[1] = &X(i+1, 0); - row[2] = &X(i+2, 0); - row[3] = &X(i+3, 0); - for (k = 0; k < nk; k++) { - r[0][0] += row[0][k]*Y(k,0); r[0][1] += row[0][k]*Y(k,1); r[0][2] += row[0][k]*Y(k,2); r[0][3] += row[0][k]*Y(k,3); - r[1][0] += row[1][k]*Y(k,0); r[1][1] += row[1][k]*Y(k,1); r[1][2] += row[1][k]*Y(k,2); r[1][3] += row[1][k]*Y(k,3); - r[2][0] += row[2][k]*Y(k,0); r[2][1] += row[2][k]*Y(k,1); r[2][2] += row[2][k]*Y(k,2); r[2][3] += row[2][k]*Y(k,3); - r[3][0] += row[3][k]*Y(k,0); r[3][1] += row[3][k]*Y(k,1); r[3][2] += row[3][k]*Y(k,2); r[3][3] += row[3][k]*Y(k,3); - } - Z(i+0, j+0) += r[0][0]; Z(i+0, j+1) += r[0][1]; Z(i+0, j+2) += r[0][2]; Z(i+0, j+3) += r[0][3]; - Z(i+1, j+0) += r[1][0]; Z(i+1, j+1) += r[1][1]; Z(i+1, j+2) += r[1][2]; Z(i+1, j+3) += r[1][3]; - Z(i+2, j+0) += r[2][0]; Z(i+2, j+1) += r[2][1]; Z(i+2, j+2) += r[2][2]; Z(i+2, j+3) += r[2][3]; - Z(i+3, j+0) += r[3][0]; Z(i+3, j+1) += r[3][1]; Z(i+3, j+2) += r[3][2]; Z(i+3, j+3) += r[3][3]; - } - } -} -#endif - -#if 0 -void -func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) -{ - int i, j, k, ri, rj, rk; - FLOAT *xrow[8], *yrow[8], reg; - - for (i = 0; i < ni; i++) { - for (j = 0; j < nj; j++) { - z[j + incz*i] *= b; - } - } - - ri = ni & ~7; - rj = nj & ~7; - for (i = 0; i < ri; i += 8) { - xrow[0] = x + incx*(i+0); - xrow[1] = x + incx*(i+1); - xrow[2] = x + incx*(i+2); - xrow[3] = x + incx*(i+3); - xrow[4] = x + incx*(i+4); - xrow[5] = x + incx*(i+5); - xrow[6] = x + incx*(i+6); - xrow[7] = x + incx*(i+7); - for (j = 0; j < rj; j += 8) { - yrow[0] = y + incy*(j+0); - yrow[1] = y + incy*(j+1); - yrow[2] = y + incy*(j+2); - yrow[3] = y + incy*(j+3); - yrow[4] = y + incy*(j+4); - yrow[5] = y + incy*(j+5); - yrow[6] = y + incy*(j+6); - yrow[7] = y + incy*(j+7); - for (k = 0; k < nk; k++) { - reg = a*(yrow[0][k] + yrow[1][k] + yrow[2][k] + yrow[3][k] + yrow[4][k] + yrow[5][k] + yrow[6][k] + yrow[7][k]); - z[k + incz*(i+0)] += xrow[0][k]*reg; - z[k + incz*(i+1)] += xrow[1][k]*reg; - z[k + incz*(i+2)] += xrow[2][k]*reg; - z[k + incz*(i+3)] += xrow[3][k]*reg; - z[k + incz*(i+4)] += xrow[4][k]*reg; - z[k + incz*(i+5)] += xrow[5][k]*reg; - z[k + incz*(i+6)] += xrow[6][k]*reg; - z[k + incz*(i+7)] += xrow[7][k]*reg; - } - } - for (; j < nj; j++) { - for (k = 0; k < nk; k++) { - reg = a*y[k+incy*j]; - z[k + incz*(i+0)] += xrow[0][k]*reg; - z[k + incz*(i+1)] += xrow[1][k]*reg; - z[k + incz*(i+2)] += xrow[2][k]*reg; - z[k + incz*(i+3)] += xrow[3][k]*reg; - z[k + incz*(i+4)] += xrow[4][k]*reg; - z[k + incz*(i+5)] += xrow[5][k]*reg; - z[k + incz*(i+6)] += xrow[6][k]*reg; - z[k + incz*(i+7)] += xrow[7][k]*reg; - } - } - } - - for (; i < ni; i++) { - for (j = 0; j < rj; j += 8) { - yrow[0] = y + incy*(j+0); - yrow[1] = y + incy*(j+1); - yrow[2] = y + incy*(j+2); - yrow[3] = y + incy*(j+3); - yrow[4] = y + incy*(j+4); - yrow[5] = y + incy*(j+5); - yrow[6] = y + incy*(j+6); - yrow[7] = y + incy*(j+7); - for (k = 0; k < nk; k++) { - z[k + incz*(i)] += a*x[k + incx*i]*(yrow[0][k] + yrow[1][k] + yrow[2][k] + yrow[3][k] + yrow[4][k] + yrow[5][k] + yrow[6][k] + yrow[7][k]); - } - } - for (; j < nj; j++) { - for (k = 0; k < nk; k++) { - z[k + incz*i] += a*x[k + incx*i]*y[k + incy*j]; - } - } - } -} -#endif diff --git a/sys/libmath/lapack.c b/sys/libmath/lapack.c deleted file mode 100644 index e69de29..0000000 diff --git a/sys/libmath/linalg.c b/sys/libmath/linalg.c deleted file mode 100644 index 8551ff1..0000000 --- a/sys/libmath/linalg.c +++ /dev/null @@ -1,63 +0,0 @@ -#include -#include -#include -#include - -// ----------------------------------------------------------------------- -// Vector - -void -linalg·normalize(math·Vector vec) -{ - double norm; - - norm = blas·normd(vec.len, vec.data, 1); - blas·scaled(vec.len, 1/norm, vec.data, 1); -} -// TODO: Write blas wrappers that eat vectors for convenience - -// ----------------------------------------------------------------------- -// Matrix -// -// NOTE: all matrices are row major oriented - -/* - * linalg·lq - * computes the LQ decomposition of matrix M: M = LQ - * L is lower triangular - * Q is orthogonal -> transp(Q) * Q = I - * - * m: matrix to factorize. changes in place - * + lower triangle -> L - * + upper triangle -> all reflection vectors stored in rows - * w: working buffer: len = ncols! - */ -error -linalg·lq(math·Matrix m, math·Vector w) -{ - int i, j, len; - double *row, mag; - enum { - err·nil, - err·baddims, - }; - - if (m.dim[0] > m.dim[1]) { - return err·baddims; - } - - for (i = 0; i < m.dim[0]; i++, m.data += m.dim[1]) { - row = m.data + i; - len = m.dim[0] - i; - - // TODO: Don't want to compute norm twice!! - w.data[0] = math·sgn(row[0]) * blas·normd(len, row, 1); - blas·axpyd(len, 1.0, row, 1, w.data, 1); - mag = blas·normd(len, w.data, 1); - blas·scaled(len, 1/mag, w.data, 1); - - blas·copyd(len - m.dim[0], w.data, 1, m.data + i, 1); - } - - return err·nil; -} diff --git a/sys/libmath/loop.h b/sys/libmath/loop.h deleted file mode 100644 index a877d84..0000000 --- a/sys/libmath/loop.h +++ /dev/null @@ -1,114 +0,0 @@ -#pragma once - -/* increment operator */ -#define INC2(x) INC_##x -#define INC1(x) INC2(x) -#define INC(x) INC1(x) - -#define INC_0 1 -#define INC_1 2 -#define INC_2 3 -#define INC_3 4 -#define INC_4 5 -#define INC_5 6 -#define INC_6 7 -#define INC_7 8 -#define INC_8 9 -#define INC_9 10 -#define INC_10 11 -#define INC_11 12 -#define INC_12 13 -#define INC_13 14 -#define INC_14 15 -#define INC_15 16 - -#define ROUNDBY(x, n) ((x) & ~((n)-1)) - -/* subtraction tables */ -#define SUB2(x, y) SUB_##x##_##y -#define SUB1(x, y) SUB2(x, y) -#define SUB(x, y) SUB1(x, y) -#define SUB_8_0 8 -#define SUB_8_1 7 -#define SUB_8_2 6 -#define SUB_8_3 5 -#define SUB_8_4 4 -#define SUB_8_5 3 -#define SUB_8_6 2 -#define SUB_8_7 1 -#define SUB_8_8 0 -#define SUB_7_0 7 -#define SUB_7_1 6 -#define SUB_7_2 5 -#define SUB_7_3 4 -#define SUB_7_4 3 -#define SUB_7_5 2 -#define SUB_7_6 1 -#define SUB_7_7 0 -#define SUB_6_0 6 -#define SUB_6_1 5 -#define SUB_6_2 4 -#define SUB_6_3 3 -#define SUB_6_4 2 -#define SUB_6_5 1 -#define SUB_6_6 0 -#define SUB_5_0 5 -#define SUB_5_1 4 -#define SUB_5_2 3 -#define SUB_5_3 2 -#define SUB_5_4 1 -#define SUB_5_5 0 -#define SUB_4_0 4 -#define SUB_4_1 3 -#define SUB_4_2 2 -#define SUB_4_3 1 -#define SUB_4_4 0 -#define SUB_3_0 3 -#define SUB_3_1 2 -#define SUB_3_2 1 -#define SUB_3_3 0 -#define SUB_2_0 2 -#define SUB_2_1 1 -#define SUB_2_2 0 -#define SUB_1_0 1 -#define SUB_1_1 0 - -/* rounding operator */ -#define ROUNDBY(x, n) ((x) & ~((n)-1)) - -/* loop unrolling (vertical) */ -#define LOOP1(I,STMT,...) STMT(I,__VA_ARGS__) -#define LOOP2(I,STMT,...) STMT(I,__VA_ARGS__) LOOP1(INC(I),STMT,__VA_ARGS__) -#define LOOP3(I,STMT,...) STMT(I,__VA_ARGS__) LOOP2(INC(I),STMT,__VA_ARGS__) -#define LOOP4(I,STMT,...) STMT(I,__VA_ARGS__) LOOP3(INC(I),STMT,__VA_ARGS__) -#define LOOP5(I,STMT,...) STMT(I,__VA_ARGS__) LOOP4(INC(I),STMT,__VA_ARGS__) -#define LOOP6(I,STMT,...) STMT(I,__VA_ARGS__) LOOP5(INC(I),STMT,__VA_ARGS__) -#define LOOP7(I,STMT,...) STMT(I,__VA_ARGS__) LOOP6(INC(I),STMT,__VA_ARGS__) -#define LOOP8(I,STMT,...) STMT(I,__VA_ARGS__) LOOP7(INC(I),STMT,__VA_ARGS__) -#define LOOP9(I,STMT,...) STMT(I,__VA_ARGS__) LOOP8(INC(I),STMT,__VA_ARGS__) -#define LOOP10(I,STMT,...) STMT(I,__VA_ARGS__) LOOP9(INC(I),STMT,__VA_ARGS__) -#define LOOP11(I,STMT,...) STMT(I,__VA_ARGS__) LOOP10(INC(I),STMT,__VA_ARGS__) -#define LOOP12(I,STMT,...) STMT(I,__VA_ARGS__) LOOP11(INC(I),STMT,__VA_ARGS__) -#define LOOP13(I,STMT,...) STMT(I,__VA_ARGS__) LOOP12(INC(I),STMT,__VA_ARGS__) -#define LOOP14(I,STMT,...) STMT(I,__VA_ARGS__) LOOP13(INC(I),STMT,__VA_ARGS__) -#define LOOP15(I,STMT,...) STMT(I,__VA_ARGS__) LOOP14(INC(I),STMT,__VA_ARGS__) -#define LOOP16(I,STMT,...) STMT(I,__VA_ARGS__) LOOP15(INC(I),STMT,__VA_ARGS__) - -#define _LOOP_(n,I,STMT,...) LOOP##n(I,STMT,__VA_ARGS__) -#define LOOP(n,I,STMT,...) _LOOP_(n,I,STMT,__VA_ARGS__) - -/* loop expansion (horizontal) */ -#define EXPAND0(I,TERM,OP,...) -#define EXPAND1(I,TERM,OP,...) TERM(I,__VA_ARGS__) -#define EXPAND2(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND1(INC(I),TERM,OP,__VA_ARGS__) -#define EXPAND3(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND2(INC(I),TERM,OP,__VA_ARGS__) -#define EXPAND4(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND3(INC(I),TERM,OP,__VA_ARGS__) -#define EXPAND5(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND4(INC(I),TERM,OP,__VA_ARGS__) -#define EXPAND6(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND5(INC(I),TERM,OP,__VA_ARGS__) -#define EXPAND7(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND6(INC(I),TERM,OP,__VA_ARGS__) -#define EXPAND8(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND7(INC(I),TERM,OP,__VA_ARGS__) - -#define _EXPAND_(n,I,TERM,OP,...) EXPAND##n(I,TERM,OP,__VA_ARGS__) -#define EXPAND(n,I,TERM,OP,...) _EXPAND_(n,I,TERM,OP,__VA_ARGS__) -#define EXPAND_TRI1(n,I,TERM,OP,...) EXPAND(n,I,TERM,OP,__VA_ARGS__) -#define EXPAND_TRI(n,I,TERM,OP,...) EXPAND_TRI1(SUB(n,I),I,TERM,OP,__VA_ARGS__) diff --git a/sys/libmath/matrix.c b/sys/libmath/matrix.c deleted file mode 100644 index e8bca0b..0000000 --- a/sys/libmath/matrix.c +++ /dev/null @@ -1,176 +0,0 @@ -#include -#include -#include - -/* TODO: replace (incrementally) with native C version! */ -#include -#include - -// ----------------------------------------------------------------------- -// level 1 - -error -la·vecslice(math·Vector *x, int min, int max, int inc) -{ - if (max > x->len || min < 0) { - errorf("out of bounds: attempted to access vector past length"); - return 1; - } - x->len = (max - min) / inc; - x->d += x->inc * min; - x->inc *= inc; - - return 0; -} - -/* simple blas wrappers */ -void -la·veccopy(math·Vector *dst, math·Vector *src) -{ - return cblas_dcopy(src->len, src->d, src->inc, dst->d, dst->inc); -} - -double -la·vecnorm(math·Vector *x) -{ - return cblas_dnrm2(x->len, x->d, x->inc); -} - -void -la·vecscale(math·Vector *x, double a) -{ - return cblas_dscal(x->len, a, x->d, x->inc); -} - -double -la·vecdot(math·Vector *x, math·Vector *y) -{ - return cblas_ddot(x->len, x->d, x->inc, y->d, y->inc); -} - -// ----------------------------------------------------------------------- -// level 2 - -error -la·vecmat(math·Vector *x, math·Matrix *M) -{ - if (M->dim[1] != x->len) { - errorf("incompatible matrix dimensions"); - return 1; - } - if (M->state & ~mat·trans) - cblas_dgemv(CblasRowMajor,CblasNoTrans,M->dim[0],M->dim[1],1.,M->d,M->inc,x->d,x->inc,0.,x->d,x->inc); - else - cblas_dgemv(CblasRowMajor,CblasTrans,M->dim[0],M->dim[1],1.,M->d,M->inc,x->d,x->inc,0.,x->d,x->inc); - - return 0; -} - -// ----------------------------------------------------------------------- -// level 3 - -void -la·transpose(math·Matrix *X) -{ - int tmp; - X->state ^= mat·trans; - tmp = X->dim[0], X->dim[0] = X->dim[1], X->dim[1] = tmp; -} - -error -la·matrow(math·Matrix *X, int r, math·Vector *row) -{ - if (r < 0 || r >= X->dim[0]) { - errorf("out of bounds"); - return 1; - } - - row->len = X->dim[1]; - row->inc = 1; - row->d = X->d + X->dim[1] * r; - - return 0; -} - -error -la·matcol(math·Matrix *X, int c, math·Vector *col) -{ - if (c < 0 || c >= X->dim[1]) { - errorf("out of bounds"); - return 1; - } - - col->len = X->dim[0]; - col->inc = X->dim[1]; - col->d = X->d + c; - - return 0; -} - -error -la·matslice(math·Matrix *X, int r[3], int c[3]) -{ - /* TODO */ - return 0; -} - -error -la·eig(math·Matrix *X) -{ - -} - -/* X = A*B */ -error -la·matmul(math·Matrix *X, math·Matrix *A, math·Matrix *B) -{ - if (A->dim[1] != B->dim[0]) { - errorf("number of interior dimensions of A '%d' not equal to that of B '%d'", A->dim[1], B->dim[0]); - return 1; - } - if (X->dim[0] != A->dim[0]) { - errorf("number of exterior dimensions of X '%d' not equal to that of A '%d'", X->dim[0], A->dim[0]); - return 1; - } - if (X->dim[1] != B->dim[1]) { - errorf("number of exterior dimensions of X '%d' not equal to that of B '%d'", X->dim[1], B->dim[1]); - return 1; - } - - if (X->state & ~mat·trans) - if (A->state & ~mat·trans) - cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc); - else - cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc); - else - if (A->state & ~mat·trans) - cblas_dgemm(CblasRowMajor,CblasTrans,CblasNoTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc); - else - cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc); - - return 0; -} - -/* - * solves A*X=B - * pass in B via X - */ -error -la·solve(math·Matrix *X, math·Matrix *A) -{ - error err; - int n, *ipv; - static int buf[512]; - if (n = A->dim[0], n < arrlen(buf)) { - ipv = buf; - n = 0; - } else - ipv = malloc(n*sizeof(*ipv)); - - /* TODO: utilize more specific regimes if applicable */ - err = LAPACKE_dgesv(LAPACK_ROW_MAJOR,A->dim[0],X->dim[1],A->d,A->inc,ipv,X->d,X->inc); - - if (n) - free(ipv); - return err; -} diff --git a/sys/libmath/rules.mk b/sys/libmath/rules.mk deleted file mode 100644 index 83945d7..0000000 --- a/sys/libmath/rules.mk +++ /dev/null @@ -1,24 +0,0 @@ -include share/push.mk - -# Iterate through subdirectory tree - -# Local sources -SRCS_$(d) := \ - $(d)/basic.c \ - $(d)/blas1.c \ - $(d)/blas2.c \ - $(d)/blas3.c -LIBS_$(d) := $(d)/libmath.a -TSTS_$(d) := - -include share/paths.mk - -$(LIBS_$(d)): $(OBJS_$(d)) - $(ARCHIVE) - -$(UNTS_$(d)): TCFLAGS := -D_GNU_SOURCE -$(UNTS_$(d)): TCLIBS := -lpthread -lm $(LIB_DIR)/libblas.a $(OBJ_DIR)/sys/libn/libn.a $(LIBS_$(d)) -$(UNTS_$(d)): $(TOBJS_$(d)) $(LIBS_$(d)) $(OBJ_DIR)/sys/libn/libn.a - $(LINK) - -include share/pop.mk diff --git a/sys/libmath/test.c b/sys/libmath/test.c deleted file mode 100644 index 66700f8..0000000 --- a/sys/libmath/test.c +++ /dev/null @@ -1,471 +0,0 @@ -#include -#include -/* #include */ - -#include - -#include - -// ----------------------------------------------------------------------- -// Vectors - -/* - * NOTE: I'm not sure I like stashing the header in _all_ vectors - * The only way to fix is to have a library based allocator... - */ -typedef struct math·Vec -{ - struct { - void *h; - mem·Allocator heap; - }; - int len; - double *d; -} math·Vec; - -math·Vec -math·makevec(int len, mem·Allocator heap, void *h) -{ - math·Vec v; - v.len = len; - v.heap = heap; - v.h = h; - v.d = heap.alloc(h, 1, len*sizeof(double)); - - // memset(v.d, 0, len*sizeof(double)); - - return v; -} - -error -math·freevec(math·Vec *v) -{ - if (v->h == nil && v->heap.alloc == nil && v->heap.free == nil) { - errorf("attempting to free a vector that doesn't own its data"); - return 1; - } - v->heap.free(v->h, v->d); - v->d = nil; - v->len = 0; - - return 0; -} - -math·Vec -math·copyvec(math·Vec v) -{ - math·Vec cpy; - cpy.heap = v.heap; - cpy.h = v.h; - cpy.len = v.len; - cpy.d = cpy.heap.alloc(cpy.h, 1, v.len); - - memcpy(cpy.d, v.d, sizeof(double)*v.len); - return cpy; -} - -/* - * Scale vector - */ - -static -void -scale_kernel8_avx2(int n, double *x, double a) -{ - __m128d a128; - __m256d a256; - register int i; - - a128 = _mm_load_sd(&a); - a256 = _mm256_broadcastsd_pd(a128); - for (i = 0; i < n; i += 8) { - _mm256_storeu_pd(x+i+0, a256 * _mm256_loadu_pd(x+i+0)); - _mm256_storeu_pd(x+i+4, a256 * _mm256_loadu_pd(x+i+4)); - } -} - -static -void -scale_kernel8(int n, double *x, double a) -{ - register int i; - for (i = 0; i < n; i += 8) { - x[i+0] *= a; - x[i+1] *= a; - x[i+2] *= a; - x[i+3] *= a; - x[i+4] *= a; - x[i+5] *= a; - x[i+6] *= a; - x[i+7] *= a; - } -} - -void -math·scalevec(math·Vec u, double a) -{ - int n; - - n = u.len & ~7; - scale_kernel8_avx2(n, u.d, a); - - for (; n < u.len; n++) { - u.d[n] *= a; - } -} - -/* - * Add scaled vector - */ - -static -void -daxpy_kernel8_avx2(int n, double *x, double *y, double a) -{ - __m128d a128; - __m256d a256; - register int i; - - a128 = _mm_load_sd(&a); - a256 = _mm256_broadcastsd_pd(a128); - for (i = 0; i < n; i += 8) { - _mm256_storeu_pd(x+i+0, _mm256_loadu_pd(x+i+0) + a256 * _mm256_loadu_pd(y+i+0)); - _mm256_storeu_pd(x+i+4, _mm256_loadu_pd(x+i+4) + a256 * _mm256_loadu_pd(y+i+4)); - } -} - -static -void -daxpy_kernel8(int n, double *x, double *y, double a) -{ - register int i; - for (i = 0; i < n; i += 8) { - x[i+0] += a*y[i+0]; - x[i+1] += a*y[i+1]; - x[i+2] += a*y[i+2]; - x[i+3] += a*y[i+3]; - x[i+4] += a*y[i+4]; - x[i+5] += a*y[i+5]; - x[i+6] += a*y[i+6]; - x[i+7] += a*y[i+7]; - } -} - -/* performs u = u + a*v */ -void -math·addvec(math·Vec u, math·Vec v, double a) -{ - int n; - - n = u.len & ~7; - daxpy_kernel8_avx2(n, u.d, v.d, a); - - for (; n < u.len; n++) { - u.d[n] += a*v.d[n]; - } -} - -/* - * Dot product - */ - -static -double -dot_kernel8_avx2(int len, double *x, double *y) -{ - register int i; - __m256d sum[4]; - __m128d res; - - for (i = 0; i < arrlen(sum); i++) { - sum[i] = _mm256_setzero_pd(); - } - - for (i = 0; i < len; i += 16) { - sum[0] += _mm256_loadu_pd(x+i+0) * _mm256_loadu_pd(y+i+0); - sum[1] += _mm256_loadu_pd(x+i+4) * _mm256_loadu_pd(y+i+4); - sum[2] += _mm256_loadu_pd(x+i+8) * _mm256_loadu_pd(y+i+8); - sum[3] += _mm256_loadu_pd(x+i+12) * _mm256_loadu_pd(y+i+12); - } - - sum[0] += sum[1] + sum[2] + sum[3]; - - res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); - res = _mm_hadd_pd(res, res); - - return res[0]; -} - -static -double -dot_kernel8_fma3(int len, double *x, double *y) -{ - register int i; - __m256d sum[4]; - __m128d res; - - for (i = 0; i < arrlen(sum); i++) { - sum[i] = _mm256_setzero_pd(); - } - - for (i = 0; i < len; i += 16) { - sum[0] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+0), _mm256_loadu_pd(y+i+0), sum[0]); - sum[1] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+4), _mm256_loadu_pd(y+i+4), sum[1]); - sum[2] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+8), _mm256_loadu_pd(y+i+8), sum[2]); - sum[3] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+12), _mm256_loadu_pd(y+i+12), sum[3]); - } - - sum[0] += sum[1] + sum[2] + sum[3]; - - res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); - res = _mm_hadd_pd(res, res); - - return res[0]; -} - -static -double -dot_kernel8(int len, double *x, double *y) -{ - double res; - register int i; - - for (i = 0; i < len; i += 8) { - res += x[i] * y[i] + - x[i+1] * y[i+1] + - x[i+2] * y[i+2] + - x[i+3] * y[i+3] + - x[i+4] * y[i+4] + - x[i+5] * y[i+5] + - x[i+6] * y[i+6] + - x[i+7] * y[i+7]; - } - - return res; -} - -double -math·dot(math·Vec u, math·Vec v) -{ - int i, len; - double res; - - len = u.len & ~15; // neat trick - res = dot_kernel8_fma3(len, u.d, v.d); - - for (i = len; i < u.len; i++) { - res += u.d[i] * v.d[i]; - } - - return res; -} - -// ----------------------------------------------------------------------- -// Matrix - -typedef struct math·Mtx -{ - struct { - void *h; - mem·Allocator heap; - }; - int dim[2]; - double *d; -} math·Mtx; - -math·Mtx -math·makemtx(int n, int m, mem·Allocator heap, void *h) -{ - math·Mtx a; - a.dim[0] = n; - a.dim[1] = m; - a.heap = heap; - a.h = h; - a.d = heap.alloc(h, 1, n*m*sizeof(double)); - - // memset(a.d, 0, n*m*sizeof(double)); - - return a; -} - -error -math·freemtx(math·Vec *m) -{ - if (m->h == nil && m->heap.alloc == nil && m->heap.free == nil) { - errorf("attempting to free a matrix that doesn't own its data"); - return 1; - } - m->heap.free(m->h, m->d); - m->d = nil; - m->len = 0; - - return 0; -} - -/************************************************ - * multiply matrix to vector - ***********************************************/ - -/* - * Notation: (number of rows) x (number of columns) _ unroll factor - * N => variable we sum over - */ -static -void -mtxvec_kernel4xN_4_avx2(int ncol, double **row, double *x, double *y) -{ - int c; - __m128d hr; - __m256d x256, r256[4]; - - for (c = 0; c < 4; c++) { - r256[c] = _mm256_setzero_pd(); - } - - for (c = 0; c < ncol; c += 4) { - x256 = _mm256_loadu_pd(x+c); - r256[0] += x256 * _mm256_loadu_pd(row[0] + c); - r256[1] += x256 * _mm256_loadu_pd(row[1] + c); - r256[2] += x256 * _mm256_loadu_pd(row[2] + c); - r256[3] += x256 * _mm256_loadu_pd(row[3] + c); - } - - for (c = 0; c < 4; c++) { - hr = _mm_add_pd(_mm256_extractf128_pd(r256[c], 0), _mm256_extractf128_pd(r256[c], 1)); - hr = _mm_hadd_pd(hr, hr); - y[c] = hr[0]; - } -} - -static -void -mtxvec_kernel4xN_4(int ncol, double **row, double *x, double *y) -{ - int c; - double res[4]; - - res[0] = 0.; - res[1] = 0.; - res[2] = 0.; - res[3] = 0.; - - for (c = 0; c < ncol; c += 4) { - res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3]; - res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3]; - res[2] += row[2][c+0]*x[c+0] + row[2][c+1]*x[c+1] + row[2][c+2]*x[c+2] + row[2][c+3]*x[c+3]; - res[3] += row[3][c+0]*x[c+0] + row[3][c+1]*x[c+1] + row[3][c+2]*x[c+2] + row[3][c+3]*x[c+3]; - } - - y[0] = res[0]; - y[1] = res[1]; - y[2] = res[2]; - y[3] = res[3]; -} - -static -void -mtxvec_kernel1xN_4(int ncol, double *row, double *x, double *y) -{ - int c; - double res; - - res = 0.; - for (c = 0; c < ncol; c += 4) { - res += row[c+0]*x[c+0] + row[c+1]*x[c+1] + row[c+2]*x[c+2] + row[c+3]*x[c+3]; - } - - y[0] = res; -} - -// y = a*mx + b*y -error -math·mtxvec(math·Mtx m, double a, math·Vec x, double b, math·Vec y) -{ - int c, r, nrow, ncol; - double *row[4], res[4]; - - nrow = m.dim[0] & ~3; - ncol = m.dim[1] & ~3; - for (r = 0; r < nrow; r += 4) { - row[0] = m.d + (r * (m.dim[1]+0)); - row[1] = m.d + (r * (m.dim[1]+1)); - row[2] = m.d + (r * (m.dim[1]+2)); - row[3] = m.d + (r * (m.dim[1]+3)); - - mtxvec_kernel4xN_4_avx2(ncol, row, x.d + r, res); - - for (c = ncol; c < m.dim[1]; c++) { - res[0] += row[0][c]; - res[1] += row[1][c]; - res[2] += row[2][c]; - res[3] += row[3][c]; - } - - y.d[r+0] = res[0] + b*y.d[r+0]; - y.d[r+1] = res[1] + b*y.d[r+1]; - y.d[r+2] = res[2] + b*y.d[r+2]; - y.d[r+3] = res[3] + b*y.d[r+3]; - } - - for (; r < m.dim[0]; r++) { - mtxvec_kernel1xN_4(m.dim[0], m.d + (r * m.dim[1]), x.d + r, res); - y.d[r] = res[0] + b*y.d[r]; - } - - return 0; -} - -/************************************************ - * add matrix to vector outerproduct - ***********************************************/ - -#define NITER 50 - -#if 0 -error -main() -{ - int i; - clock_t t; - double res; - - math·Mtx m; - math·Vec x, y; - - openblas_set_num_threads(1); - - x = math·makevec(1000, mem·sys, nil); - y = math·makevec(1000, mem·sys, nil); - m = math·makemtx(1000, 1000, mem·sys, nil); - - for (i = 0; i < x.len; i++) { - y.d[i] = i; - } - - t = clock(); - for (i = 0; i < NITER; i++) { - cblas_dgemv(CblasRowMajor, CblasNoTrans, m.dim[0], m.dim[1], 1.5, m.d, m.dim[1], x.d, 1, 2.5, y.d, 1); - } - t = clock() - t; - res = math·dot(y, y); - printf("the result is %f\n", res); - printf("time elapsed (blas): %fms\n", 1000.*t/CLOCKS_PER_SEC); - - for (i = 0; i < x.len; i++) { - y.d[i] = i; - } - - t = clock(); - for (i = 0; i < NITER; i++) { - math·mtxvec(m, 1.5, x, 2.5, y); - } - t = clock() - t; - res = math·dot(y, y); - - printf("the dot product is %f\n", res); - printf("time elapsed (naive): %fms\n", 1000.*t/CLOCKS_PER_SEC); - - - return 0; -} -#endif -- cgit v1.2.1