From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- src/libmath/basic.c | 531 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/libmath/blas.c | 63 ++++++ src/libmath/blas1.c | 58 ++++++ src/libmath/blas1body | 215 ++++++++++++++++++++ src/libmath/blas2.c | 222 +++++++++++++++++++++ src/libmath/blas2body | 256 ++++++++++++++++++++++++ src/libmath/blas3.c | 279 ++++++++++++++++++++++++++ src/libmath/lapack.c | 0 src/libmath/linalg.c | 63 ++++++ src/libmath/loop.h | 114 +++++++++++ src/libmath/matrix.c | 176 +++++++++++++++++ src/libmath/rules.mk | 27 +++ src/libmath/test.c | 471 ++++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 2475 insertions(+) create mode 100644 src/libmath/basic.c create mode 100644 src/libmath/blas.c create mode 100644 src/libmath/blas1.c create mode 100644 src/libmath/blas1body create mode 100644 src/libmath/blas2.c create mode 100644 src/libmath/blas2body create mode 100644 src/libmath/blas3.c create mode 100644 src/libmath/lapack.c create mode 100644 src/libmath/linalg.c create mode 100644 src/libmath/loop.h create mode 100644 src/libmath/matrix.c create mode 100644 src/libmath/rules.mk create mode 100644 src/libmath/test.c (limited to 'src/libmath') diff --git a/src/libmath/basic.c b/src/libmath/basic.c new file mode 100644 index 0000000..1341f7b --- /dev/null +++ b/src/libmath/basic.c @@ -0,0 +1,531 @@ +#include +#include +#include + +#include + +// TODO(nnoll): Replace implementations with your own. + +double +math·acos(double x) +{ + return acos(x); +} + +float +math·acosf(float x) +{ + return acosf(x); +} + + +double +math·acosh(double x) +{ + return acosh(x); +} + +float +math·acoshf(float x) +{ + return acoshf(x); +} + + +double +math·asin(double x) +{ + return asin(x); +} + +float +math·asinf(float x) +{ + return asinf(x); +} + + +double +math·asinh(double x) +{ + return asinh(x); +} + +float +math·asinhf(float x) +{ + return asinhf(x); +} + + +double +math·atan(double x) +{ + return atan(x); +} + +float +math·atanf(float x) +{ + return atanf(x); +} + + +double +math·atan2(double x, double y) +{ + return atan2(x, y); +} + +float +math·atan2f(float x, float y) +{ + return atan2f(x, y); +} + +double +math·atanh(double x) +{ + return atanh(x); +} + +float +math·atanhf(float x) +{ + return atanhf(x); +} + + +double +math·cbrt(double x) +{ + return cbrt(x); +} + +float +math·cbrtf(float x) +{ + return cbrtf(x); +} + + +double +math·ceil(double x) +{ + return ceil(x); +} + +float +math·ceilf(float x) +{ + return ceilf(x); +} + +double +math·cos(double x) +{ + return cos(x); +} + +float +math·cosf(float x) +{ + return cosf(x); +} + + +double +math·cosh(double x) +{ + return cosh(x); +} + +float +math·coshf(float x) +{ + return coshf(x); +} + + +double +math·erf(double x) +{ + return erf(x); +} + +float +math·erff(float x) +{ + return erff(x); +} + + +double +math·erfc(double x) +{ + return erfc(x); +} + +float +math·erfcf(float x) +{ + return erfcf(x); +} + + +double +math·exp(double x) +{ + return exp(x); +} + +float +math·expf(float x) +{ + return expf(x); +} + + +double +math·exp2(double x) +{ + return exp2(x); +} + +float +math·exp2f(float x) +{ + return exp2f(x); +} + + +double +math·expm1(double x) +{ + return expm1(x); +} + +float +math·expm1f(float x) +{ + return expm1f(x); +} + + +double +math·floor(double x) +{ + return floor(x); +} + +float +math·floorf(float x) +{ + return floorf(x); +} + + +int +math·ilogb(double x) +{ + return ilogb(x); +} + +int +math·ilogbf(float x) +{ + return ilogbf(x); +} + +double +math·lgamma(double x) +{ + return lgamma(x); +} + +float +math·lgammaf(float x) +{ + return lgammaf(x); +} + + +vlong +math·llrint(double x) +{ + return math·llrint(x); +} + +vlong +math·llrintf(float x) +{ + return math·llrintf(x); +} + + +vlong +math·llround(double x) +{ + return llround(x); +} + +vlong +math·llroundf(float x) +{ + return llroundf(x); +} + + +double +math·log(double x) +{ + return log(x); +} + +float +math·logf(float x) +{ + return logf(x); +} + + +double +math·log10(double x) +{ + return log10(x); +} + +float +math·log10f(float x) +{ + return log10f(x); +} + + +double +math·log1p(double x) +{ + return log1p(x); +} + +float +math·log1pf(float x) +{ + return log1pf(x); +} + + +double +math·log2(double x) +{ + return log2(x); +} + +float +math·log2f(float x) +{ + return log2f(x); +} + + +double +math·logb(double x) +{ + return logb(x); +} + +float +math·logbf(float x) +{ + return logbf(x); +} + + +long +math·lrint(double x) +{ + return lrint(x); +} + +long +math·lrintf(float x) +{ + return lrintf(x); +} + + +long +math·lround(double x) +{ + return lround(x); +} + +long +math·lroundf(float x) +{ + return lroundf(x); +} + + +double math·modf(double, double *); +float math·modff(float, float *); + +double +math·nan(const char * x) +{ + return nan(x); +} + +float +math·nanf(const char * x) +{ + return nanf(x); +} + + +double +math·nearbyint(double x) +{ + return nearbyint(x); +} + +float +math·nearbyintf(float x) +{ + return nearbyintf(x); +} + + +double +math·pow(double x, double exp) +{ + return pow(x, exp); +} + +float +math·powf(float x, float exp) +{ + return powf(x, exp); +} + +double +math·rint(double x) +{ + return rint(x); +} + +float +math·rintf(float x) +{ + return rintf(x); +} + + +double +math·round(double x) +{ + return round(x); +} + +float +math·roundf(float x) +{ + return roundf(x); +} + + +double math·scalbln(double, long); +float math·scalblnf(float, long); + +double math·scalbn(double, int); +float math·scalbnf(float, int); + +double +math·sin(double x) +{ + return sin(x); +} + +float +math·sinf(float x) +{ + return sinf(x); +} + + +double +math·sinh(double x) +{ + return sinh(x); +} + +float +math·sinhf(float x) +{ + return sinhf(x); +} + + +double +math·sqrt(double x) +{ + return sqrt(x); +} + +float +math·sqrtf(float x) +{ + return sqrtf(x); +} + + +double +math·tan(double x) +{ + return tan(x); +} + +float +math·tanf(float x) +{ + return tanf(x); +} + + +double +math·tanh(double x) +{ + return tanh(x); +} + +float +math·tanhf(float x) +{ + return tanhf(x); +} + + +double +math·tgamma(double x) +{ + return tgamma(x); +} + +float +math·tgammaf(float x) +{ + return tgammaf(x); +} + + +double +math·trunc(double x) +{ + return trunc(x); +} + +float +math·truncf(float x) +{ + return truncf(x); +} diff --git a/src/libmath/blas.c b/src/libmath/blas.c new file mode 100644 index 0000000..18f9760 --- /dev/null +++ b/src/libmath/blas.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +/* #include */ + +#define NCOL 2*512 +#define NROW 2*512 +#define NSUM 2*512 +#define NIT 10 +#define INC 1 +error +main() +{ + int i, j, nit; + double *x, *y, *z, *w, res[2]; + + clock_t t; + double tprof[2] = { 0 }; + + rng·init(0); + + x = malloc(sizeof(*x)*NROW*NCOL); + y = malloc(sizeof(*x)*NROW*NCOL); + z = malloc(sizeof(*x)*NROW*NCOL); + w = malloc(sizeof(*x)*NROW*NCOL); + +#define DO_0 t = clock(); \ + blas·dgemm(0,0,NROW,NCOL,NSUM,10.1,x,NROW,y,NROW,1.2,z,NROW);\ + t = clock() - t; \ + res[0] += blas·dasum(NROW*NCOL,z,INC); \ + tprof[0] += 1000.*t/CLOCKS_PER_SEC; \ + +#define DO_1 t = clock(); \ + cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,NROW,NCOL,NSUM,10.1,x,NROW,y,NROW,1.2,w,NROW);\ + t = clock() - t; \ + res[1] += cblas_dasum(NROW*NCOL,w,INC); \ + tprof[1] += 1000.*t/CLOCKS_PER_SEC; + + for (nit = 0; nit < NIT; nit++) { + for (i = 0; i < NROW; i++) { + for (j = 0; j < NCOL; j++) { + x[j + NROW*i] = rng·random(); + y[j + NROW*i] = rng·random(); + z[j + NROW*i] = rng·random(); + w[j + NROW*i] = z[j + NROW*i]; + } + } + + switch (nit % 2) { + case 0: DO_0; DO_1; break; + case 1: DO_1; DO_0; break; + } + } + printf("mean time/iteration (mine): %fms\n", tprof[0]/NIT); + printf("--> result (mine): %f\n", res[0]); + printf("mean time/iteration (openblas): %fms\n", tprof[1]/NIT); + printf("--> result (openblas): %f\n", res[1]); + + return 0; +} diff --git a/src/libmath/blas1.c b/src/libmath/blas1.c new file mode 100644 index 0000000..a8ca085 --- /dev/null +++ b/src/libmath/blas1.c @@ -0,0 +1,58 @@ +#include +#include + +// ----------------------------------------------------------------------- +// Templates + +#include "loop.h" +#define BODY_XY() \ + LOOP(UNROLL, 0, INIT); \ + n = ROUNDBY(len, UNROLL); \ + if (incx == 1 && incy == 1) { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,1,1); \ + } \ + } else { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,incx,incy);\ + } \ + } \ + \ + for (; i < len; i++) { \ + LOOP(1,0,KERNEL,incx,incy); \ + } + +#define BODY_X() \ + LOOP(UNROLL, 0, INIT); \ + n = ROUNDBY(len, UNROLL); \ + if (incx == 1) { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,1); \ + } \ + } else { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,incx); \ + } \ + } \ + \ + for (; i < len; i++) { \ + LOOP(1,0,KERNEL,incx); \ + } + +// ----------------------------------------------------------------------- +// Implementation + +#define UNROLL 8 +#define INT int + +#define FLOAT double +#define func(name) blas·d##name +#include "blas1body" + +#undef FLOAT +#undef func + +#define FLOAT float +#define func(name) blas·f##name +#include "blas1body" +#undef FLOAT diff --git a/src/libmath/blas1body b/src/libmath/blas1body new file mode 100644 index 0000000..de4b637 --- /dev/null +++ b/src/libmath/blas1body @@ -0,0 +1,215 @@ +/* vim: set ft=c */ +// ----------------------------------------------------------------------- +// Function implementations + +FLOAT +func(dot)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy) +{ +#define INIT(I,...) reg[I] = 0; +#define KERNEL(I, INCX, INCY) reg[I] += x[(INCX)*(i + I)] * y[(INCY)*(i + I)]; + INT i, n; + FLOAT reg[UNROLL]; + + BODY_XY() + + for (i = 1; i < UNROLL; i++) + reg[0] += reg[i]; + + return reg[0]; +#undef INIT +#undef KERNEL +} + +void +func(copy)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy) +{ +#define INIT(I,...) +#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] = x[(INCX)*(i + I)]; + INT i, n; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +void +func(swap)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy) +{ +#define INIT(I,...) +#define KERNEL(I, INCX, INCY) tmp[I] = x[(INCX)*(i + I)], x[(INCX)*(i + I)] = y[(INCY)*(i + I)], y[(INCY)*(i + I)] = tmp[I]; + INT i, n; + FLOAT tmp[UNROLL]; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +void +func(axpy)(INT len, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy) +{ +#define INIT(I,...) +#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] += a*x[(INCX)*(i + I)]; + INT i, n; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +void +func(axpby)(INT len, FLOAT a, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy) +{ +#define INIT(I,...) +#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] = a*x[(INCX)*(i + I)] + b*y[(INCY)*(i + I)]; + INT i, n; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +INT +func(argmax)(INT len, FLOAT *x, INT incx) +{ +#define INIT(I,...) max[I] = x[I], idx[I] = I; +#define KERNEL(I, INCX) if (x[(INCX)*(i+I)] > max[I]) {max[i] = x[INCX*(i+I)]; idx[I] = i+I;} + INT i, n; + FLOAT max[UNROLL]; + INT idx[UNROLL]; + + BODY_X(); + + for (i = 1; i < UNROLL; i++) + if (max[i] > max[0]) + idx[0] = idx[i]; + + return idx[0]; +#undef INIT +#undef KERNEL +} + +INT +func(argmin)(INT len, FLOAT *x, INT incx) +{ +#define INIT(I,...) min[I] = x[I], idx[I] = I; +#define KERNEL(I, INCX) if (x[INCX*(i+I)] < min[I]) {min[i] = x[INCX*(i+I)]; idx[I] = i+I;} + INT i, n; + FLOAT min[UNROLL]; + INT idx[UNROLL]; + + BODY_X(); + + for (i = 1; i < UNROLL; i++) + if (min[i] < min[0]) + idx[0] = idx[i]; + + return idx[0]; +#undef INIT +#undef KERNEL +} + +FLOAT +func(asum)(INT len, FLOAT *x, INT incx) +{ +#define INIT(I,...) sum[I] = 0; +#define KERNEL(I, INCX) sum[I] += x[INCX*(i+I)] > 0 ? x[INCX*(i+I)] : -x[INCX*(i+I)]; + INT i, n; + FLOAT sum[UNROLL]; + + BODY_X(); + + for (i = 1; i < UNROLL; i++) + sum[0] += sum[i]; + + return sum[0]; + +#undef INIT +#undef KERNEL +} + +void +func(scale)(INT len, FLOAT a, FLOAT *x, INT incx) +{ +#define INIT(I, ...) +#define KERNEL(I, INCX) x[INCX*(i+I)] *= a; + INT i, n; + + BODY_X(); + +#undef INIT +#undef KERNEL +} + +FLOAT +func(norm)(INT len, FLOAT *x, INT incx) +{ +#define INIT(I, ...) +#define KERNEL(I, INCX) norm[I] += x[INCX*(i+I)] * x[INCX*(i+I)]; + INT i, n; + FLOAT norm[UNROLL]; + + BODY_X(); + + for (i = 1; i < UNROLL; i++) + norm[0] += norm[i]; + + return math·sqrt(norm[0]); + +#undef INIT +#undef KERNEL +} + +void +func(drot)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT cos, FLOAT sin) +{ +#define INIT(I, ...) +#define KERNEL(I, INCX, INCY) tmp[I] = x[INCX*(i+I)], x[INCX*(i+I)] = cos*x[INCX*(i+I)] + sin*y[INCY*(i+I)], y[INCY*(i+I)] = cos*y[INCY*(i+I)] - sin*tmp[I]; + INT i, n; + FLOAT tmp[UNROLL]; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +void +func(rotm)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT H[5]) +{ +#define INIT(I, ...) +#define KERNEL(I, INCX, INCY) tmp[I] = x[INCX*(i+I)], x[INCX*(i+I)] = H[1]*x[INCX*(i+I)] + H[2]*y[INCY*(i+I)], y[INCY*(i+I)] = H[3]*tmp[I] + H[4]*y[INCY*(i+I)]; + INT i, n, f; + FLOAT tmp[UNROLL]; + + f = (INT)H[0]; + switch (f) { + case -2: + H[1] = +1; + H[2] = +0; + H[3] = +0; + H[4] = +1; + break; + case -1: + break; + case +0: + H[1] = +1; + H[4] = +1; + break; + case +1: + H[2] = +1; + H[3] = -1; + break; + default: + return; + } + + BODY_XY(); + +#undef INIT +#undef KERNEL +} diff --git a/src/libmath/blas2.c b/src/libmath/blas2.c new file mode 100644 index 0000000..7e4b08e --- /dev/null +++ b/src/libmath/blas2.c @@ -0,0 +1,222 @@ +#include +#include +#include "loop.h" + +// ----------------------------------------------------------------------- +// Templates + +#define BODY_RECT() \ + nr = ROUNDBY(nrow, UNROW); \ + nc = ROUNDBY(ncol, UNCOL); \ + if (incx == 1 && incy == 1) { \ + for (r = 0; r < nr; r += UNROW) { \ + LOOP(UNROW,0,INIT,1,1); \ + for (c = 0; c < nc; c += UNCOL) { \ + LOOP(UNROW,0,KERN,1,1,UNCOL); \ + } \ + for (; c < ncol; c++) { \ + LOOP(UNROW,0,KERN,1,1,1); \ + } \ + LOOP(UNROW,0,FINI,1,1); \ + } \ + } else { \ + for (r = 0; r < nr; r += UNROW) { \ + LOOP(UNROW,0,INIT,incx,incy); \ + for (c = 0; c < nc; c += UNCOL) { \ + LOOP(UNROW,0,KERN,incx,incy,UNCOL); \ + } \ + for (; c < ncol; c++) { \ + LOOP(UNROW,0,KERN,incx,incy,1); \ + } \ + LOOP(UNROW,0,FINI,incx,incy); \ + } \ + } \ + \ + for (; r < nrow; r++) { \ + LOOP(1,0,INIT,incx,incy); \ + for (c = 0; c < nc; c += UNCOL) { \ + LOOP(1,0,KERN,incx,incy,UNCOL); \ + } \ + for (; c < ncol; c++) { \ + LOOP(1,0,KERN,incx,incy,1); \ + } \ + LOOP(1,0,FINI,incx,incy); \ + } + +#define BODY_LOTRI() \ + nr = ROUNDBY(n, UNROW); \ + if (incx == 1) { \ + for (r = 0; r < nr; r += UNROW) { \ + LOOP(UNROW,0,INIT,1); \ + nc = ROUNDBY(r, UNCOL); \ + for (c = 0; c < nc; c += UNCOL) { \ + LOOP(UNROW,0,KERN,1,UNCOL); \ + } \ + for (; c < r; c++) { \ + LOOP(UNROW,0,KERN,1,1); \ + } \ + LOOP(UNROW,0,FINI,1); \ + } \ + } else { \ + for (r = 0; r < nr; r += UNROW) { \ + LOOP(UNROW,0,INIT,incx); \ + nc = ROUNDBY(r, UNCOL); \ + for (c = 0; c < nc; c += UNCOL) { \ + LOOP(UNROW,0,KERN,incx,UNCOL); \ + } \ + for (; c < r; c++) { \ + LOOP(UNROW,0,KERN,incx,1); \ + } \ + LOOP(UNROW,0,FINI,incx); \ + } \ + } \ + \ + for (; r < n; r++) { \ + LOOP(1,0,INIT,incx); \ + nc = ROUNDBY(r, UNCOL); \ + for (c = 0; c < nc; c += UNCOL) { \ + LOOP(1,0,KERN,incx,UNCOL); \ + } \ + for (; c < r; c++) { \ + LOOP(1,0,KERN,incx,1); \ + } \ + LOOP(1,0,FINI,incx); \ + } + +#define BODY_UPTRI() \ + nr = n - ROUNDBY(n, UNROW); \ + if (incx == 1) { \ + for (r = n-1; r >= nr; r -= UNROW) { \ + LOOP(UNROW,0,INIT,1); \ + nc = n - ROUNDBY(r, UNCOL); \ + for (c = n-1; c >= nc; c -= UNCOL) { \ + LOOP(UNROW,0,KERN,1,UNCOL); \ + } \ + for (; c > r; c--) { \ + LOOP(UNROW,0,KERN,1,1); \ + } \ + LOOP(UNROW,0,FINI,1); \ + } \ + } else { \ + for (r = n-1; r >= nr; r -= UNROW) { \ + LOOP(UNROW,0,INIT,incx); \ + nc = n - ROUNDBY(r, UNCOL); \ + for (c = n-1; c >= nc; c -= UNCOL) { \ + LOOP(UNROW,0,KERN,incx,UNCOL); \ + } \ + for (; c > r; c--) { \ + LOOP(UNROW,0,KERN,incx,1); \ + } \ + LOOP(UNROW,0,FINI,incx); \ + } \ + } \ + \ + for (; r >= 0; r--) { \ + LOOP(1,0,INIT,incx); \ + nc = n - ROUNDBY(r, UNCOL); \ + for (c = n-1; c >= nc; c -= UNCOL) { \ + LOOP(1,0,KERN,incx,UNCOL); \ + } \ + for (; c > r; c--) { \ + LOOP(1,0,KERN,incx,1); \ + } \ + LOOP(1,0,FINI,incx); \ + } + +#define BODY_LOTRI_XY() \ + nr = ROUNDBY(n, UNROW); \ + if (incx == 1 && incy == 1) { \ + for (r = 0; r < nr; r += UNROW) { \ + LOOP(UNROW,0,INIT,1,1); \ + nc = ROUNDBY(r, UNCOL); \ + for (c = 0; c < nc; c += UNCOL) { \ + LOOP(UNROW,0,KERN,1,1,UNCOL); \ + } \ + for (; c < r; c++) { \ + LOOP(UNROW,0,KERN,1,1,1); \ + } \ + LOOP(UNROW,0,FINI,1,1); \ + } \ + } else { \ + for (r = 0; r < nr; r += UNROW) { \ + LOOP(UNROW,0,INIT,incx,incy); \ + nc = ROUNDBY(r, UNCOL); \ + for (c = 0; c < nc; c += UNCOL) { \ + LOOP(UNROW,0,KERN,incx,incy,UNCOL); \ + } \ + for (; c < r; c++) { \ + LOOP(UNROW,0,KERN,incx,incy,1); \ + } \ + LOOP(UNROW,0,FINI,incx, incy); \ + } \ + } \ + \ + for (; r < n; r++) { \ + LOOP(1,0,INIT,incx,incy); \ + nc = ROUNDBY(r, UNCOL); \ + for (c = 0; c < nc; c += UNCOL) { \ + LOOP(1,0,KERN,incx,incy,UNCOL); \ + } \ + for (; c < r; c++) { \ + LOOP(1,0,KERN,incx,incy,1); \ + } \ + LOOP(1,0,FINI,incx,incy); \ + } + +#define BODY_UPTRI_XY() \ + nr = n - ROUNDBY(n, UNROW); \ + if (incx == 1 && incy == 1) { \ + for (r = n-1; r >= nr; r -= UNROW) { \ + LOOP(UNROW,0,INIT,1,1); \ + nc = n - ROUNDBY(r, UNCOL); \ + for (c = n-1; c >= nc; c -= UNCOL) { \ + LOOP(UNROW,0,KERN,1,1,UNCOL); \ + } \ + for (; c > r; c--) { \ + LOOP(UNROW,0,KERN,1,1,1); \ + } \ + LOOP(UNROW,0,FINI,1,1); \ + } \ + } else { \ + for (r = n-1; r >= nr; r -= UNROW) { \ + LOOP(UNROW,0,INIT,incx,incy); \ + nc = n - ROUNDBY(r, UNCOL); \ + for (c = n-1; c >= nc; c -= UNCOL) { \ + LOOP(UNROW,0,KERN,incx,incy,UNCOL); \ + } \ + for (; c > r; c--) { \ + LOOP(UNROW,0,KERN,incx,incy,1); \ + } \ + LOOP(UNROW,0,FINI,incx,incy); \ + } \ + } \ + \ + for (; r >= 0; r--) { \ + LOOP(1,0,INIT,incx,incy); \ + nc = n - ROUNDBY(r, UNCOL); \ + for (c = n-1; c >= nc; c -= UNCOL) { \ + LOOP(1,0,KERN,incx,incy,UNCOL); \ + } \ + for (; c > r; c--) { \ + LOOP(1,0,KERN,incx,incy,1); \ + } \ + LOOP(1,0,FINI,incx,incy); \ + } + +// ----------------------------------------------------------------------- +// implementation + +#define UNROW 4 +#define UNCOL 4 + +#define INT int +#define FLOAT double +#define func(name) blas·d##name +#include "blas2body" + +#undef FLOAT +#undef func + +#define FLOAT float +#define func(name) blas·f##name +#include "blas2body" diff --git a/src/libmath/blas2body b/src/libmath/blas2body new file mode 100644 index 0000000..45baf67 --- /dev/null +++ b/src/libmath/blas2body @@ -0,0 +1,256 @@ +/* general matrix multiply */ +error +func(gemv)(uint flag, INT nrow, INT ncol, FLOAT a, FLOAT *m, INT incm, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy) +{ + INT r, c, nr, nc; + FLOAT *row[UNROW], reg[UNROW]; +#define INIT(I, INCX, INCY) row[I] = m + (r+I)*incm, reg[I] = 0; +#define TERM(J, I, INCX, INCY) row[I][c+J] * x[INCX*(c+J)] +#define KERN(I, INCX, INCY, LENGTH) reg[I] += EXPAND(LENGTH,0,TERM,+,I,INCX,INCY); +#define FINI(I, INCX, INCY) y[INCY*(r+I)] = b*y[INCY*(r+I)] + a*reg[I]; + + if (!flag) { + BODY_RECT(); + } else { + func(scale)(ncol, b, y, incy); +#undef KERN +#undef FINI +#undef INIT +#undef TERM +#define INIT(I, INCX, INCY) row[I] = m + (r+I)*incm, reg[I] = a*x[INCX*(r+I)]; +#define TERM(J, I, INCX, INCY) row[I][c+J] * reg[I] +#define KERN(I, INCX, INCY, LENGTH) y[INCY*(c+I)] += EXPAND(LENGTH,0,TERM,+,I,INCX,INCY); +#define FINI(I, INCX, INCY) + BODY_RECT(); + } + + return 0; +#undef INIT +#undef TERM +#undef KERN +#undef FINI +} + +/* symmetric matrix vector multiply (different layouts) */ +void +func(symv)(uint upper, INT n, FLOAT a, FLOAT *m, INT incm, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy) +{ + INT r, c, nr, nc, i; + FLOAT *row[UNROW], reg1[UNROW], reg2[UNROW]; + +#define INIT(I, INCX, INCY) row[I] = m + (r XX I)*incm, reg1[I] = 0, reg2[I] = 0; +#define TERM1(J, I, INCX, INCY) row[I][c XX J]*x[INCX*(c XX J)] +#define TERM2(J, I, INCX, INCY) row[I][c XX J]*x[INCX*((n-c-1) XX J)] +#define KERN(I, INCX, INCY, REPEAT) reg1[I] += EXPAND(REPEAT,0,TERM1,+,I,INCX,INCY), \ + reg2[I] += EXPAND(REPEAT,0,TERM2,+,I,INCX,INCY); +#define FINI(I, INCX, INCY) y[INCY*(r+I)] += a*(reg1[I] + row[I][r]*x[INCX*r]), \ + y[INCY*(n-r-1+I)] += a*reg2[I]; + + func(scale)(n, b, y, incy); +#define XX + + if (!upper) { + BODY_LOTRI_XY(); + } else { +#undef XX +#define XX - + BODY_UPTRI_XY(); + } +#undef XX + +#undef INIT +} + +void +func(spmv)(uint upper, INT n, FLOAT a, FLOAT *m, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy) +{ + INT r, c, nr, nc, i; + FLOAT *row[UNROW], reg1[UNROW], reg2[UNROW]; + +#define INIT(I, INCX, INCY) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg1[I] = 0, reg2[I] = 0; + + func(scale)(n, b, y, incy); +#define XX + + if (!upper) { + BODY_LOTRI_XY(); + } else { +#undef XX +#undef INIT +#define INIT(I, INCX, INCY) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg1[I] = 0, reg2[I] = 0; +#define XX - + BODY_UPTRI_XY(); + } +#undef XX + +#undef TERM +#undef INIT +#undef KERN +#undef FINI +} + +/* triangular multiply (different layouts) */ +void +func(trmv)(uint upper, INT n, FLOAT *m, INT incm, FLOAT *x, INT incx) +{ + INT r, c, nr, nc, i; + FLOAT *row[UNROW], reg[UNROW]; +#define INIT(I, INCX) row[I] = m + (r XX I)*incm, reg[I] = 0; +#define TERM(J, I, INCX) row[I][c XX J]*x[INCX*(c XX J)] +#define KERN(I, INCX, REPEAT) reg[I] += EXPAND(REPEAT,0,TERM,+,I,INCX); +#define FINI(I, INCX) x[INCX*(r XX I)] = row[I][r XX I]*x[INCX*(r XX I)] + reg[I]; + +#define XX + + if (!upper) { + BODY_LOTRI(); + } else { +#undef XX +#define XX - + BODY_UPTRI(); + } +#undef XX + +#undef INIT +} + +void +func(tpmv)(uint upper, INT n, FLOAT *m, FLOAT *x, INT incx) +{ + INT r, c, nr, nc, i; + FLOAT *row[UNROW], reg[UNROW]; +#define INIT(I, INCX) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg[I] = 0; + +#define XX + + if (!upper) { + BODY_LOTRI(); + } else { +#undef XX +#undef INIT +#define INIT(I, INCX) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg[I] = 0; +#define XX - + BODY_UPTRI(); + } +#undef XX + +#undef INIT +#undef TERM +#undef KERN +#undef FINI +} + +/* triangular solve (different layouts) */ +void +func(trsv)(uint upper, INT n, FLOAT *m, INT incm, FLOAT *x, INT incx) +{ + INT r, c, nr, nc, i; + FLOAT *row[UNROW], reg[UNROW]; +#define INIT(I, INCX) row[I] = m + (r XX I)*incm, reg[I] = 0; +#define TERM(J, I, INCX) row[I][c XX J]*x[c XX J] +#define KERN(I, INCX, REPEAT) reg[I] += EXPAND(REPEAT,0,TERM,+,I,INCX); +#define SOLN(J, I, INCX) reg[J] += row[I][r XX I]*x[INCX*(r XX I)] +#define FINI(I, INCX) x[INCX*(r XX I)] = reg[I] / row[I][r XX I]; EXPAND_TRI(UNROW,INC(I),SOLN,;,I,INCX); + +#define XX + + if (!upper) { + BODY_LOTRI(); + } else { +#undef XX +#define XX - + BODY_UPTRI(); + } +#undef XX + +#undef INIT +} + +void +func(tpsv)(uint upper, INT n, FLOAT *m, FLOAT *x, INT incx) +{ + INT r, c, nr, nc, i; + FLOAT *row[UNROW], reg[UNROW]; +#define INIT(I, INCX) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg[I] = 0; + +#define XX + + if (!upper) { + BODY_LOTRI(); + } else { +#undef XX +#undef INIT +#define INIT(I, INCX) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg[I] = 0; +#define XX - + BODY_UPTRI(); + } +#undef XX + +#undef INIT +#undef TERM +#undef KERN +#undef SOLN +#undef FINI +} + +/* rank 1 update */ +void +func(ger)(INT nrow, INT ncol, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT *m, INT incm) +{ + INT r, c, nr, nc; + FLOAT *row[UNROW], reg[UNROW]; +#define INIT(I, INCX, INCY) row[I] = m + (r+I)*incm, reg[I] = a*x[INCX*(r+I)]; +#define TERM(J, I, INCX, INCY) row[I][c+J] += reg[I] * y[INCY*(c+J)] +#define KERN(I, INCX, INCY, LENGTH) EXPAND(LENGTH,0,TERM,;,I,INCX, INCY); +#define FINI(I, ...) + + BODY_RECT(); + +#undef INIT +#undef TERM +#undef KERN +#undef FINI +} + +/* symmetric rank 1 update (different memory layouts) */ +void +func(syr)(uint upper, INT n, FLOAT a, FLOAT *x, INT incx, FLOAT *m, INT incm) +{ + INT r, c, nr, nc; + FLOAT *row[UNROW], reg[UNROW]; +#define INIT(I, INCX) row[I] = m + (r XX I)*incm, reg[I] = a*x[INCX*(r XX I)]; +#define TERM(J, I, INCX) row[I][c XX J] += reg[I] * x[INCX*(c XX J)] +#define KERN(I, INCX, LENGTH) EXPAND(LENGTH,0,TERM,;,I,INCX); +#define FINI(I, ...) + +#define XX + + if (!upper) { + BODY_LOTRI(); + } else { +#undef XX +#define XX - + BODY_UPTRI(); + } +#undef XX + +#undef INIT +} + +void +func(spr)(uint upper, INT n, FLOAT a, FLOAT *x, INT incx, FLOAT *m) +{ + INT r, c, nr, nc; + FLOAT *row[UNROW], reg[UNROW]; +#define INIT(I, INCX) row[I] = m + ((r+I)*(r+I+1))*(1/2), reg[I] = 0; + +#define XX + + if (!upper) { + BODY_LOTRI(); + } else { +#undef XX +#undef INIT +#define INIT(I, INCX) row[I] = m + ((2*n-r-I)*(r+I+1))*(1/2), reg[I] = 0; +#define XX - + BODY_UPTRI(); + } +#undef XX + +#undef INIT +#undef TERM +#undef KERN +#undef FINI +} diff --git a/src/libmath/blas3.c b/src/libmath/blas3.c new file mode 100644 index 0000000..b048c95 --- /dev/null +++ b/src/libmath/blas3.c @@ -0,0 +1,279 @@ +#include +#include +#include + +#define INT int +#define FLOAT double +#define func(name) blas·d##name + +#define X(i, j) x[j + incx*(i)] +#define Y(i, j) y[j + incy*(i)] +#define Z(i, j) z[j + incz*(i)] + +void +func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) +{ + INT jj, jb, kk, kb, dk, i, j, k, end; + FLOAT r0[8], r1[8], r2[8], r3[8], pf; + + for (i = 0; i < ni; i++) { + for (j = 0; j < nj; j++) { + Z(i,j) *= b; + } + } + + jb = MIN(256, nj); + kb = MIN(48, nk); + for (jj = 0; jj < nj; jj += jb) { + for (kk = 0; kk < nk; kk += kb) { + for (i = 0; i < ni; i += 4) { + for (j = jj; j < jj + jb; j += 8) { + r0[0] = Z(i+0, j+0); r0[1] = Z(i+0, j+1); r0[2] = Z(i+0, j+2); r0[3] = Z(i+0, j+3); + r1[0] = Z(i+1, j+0); r1[1] = Z(i+1, j+1); r1[2] = Z(i+1, j+2); r1[3] = Z(i+1, j+3); + r2[0] = Z(i+2, j+0); r2[1] = Z(i+2, j+1); r2[2] = Z(i+2, j+2); r2[3] = Z(i+2, j+3); + r3[0] = Z(i+3, j+0); r3[1] = Z(i+3, j+1); r3[2] = Z(i+3, j+2); r3[3] = Z(i+3, j+3); + end = MIN(nk, kk+kb); + for (k = kk; k < end; k++) { + pf = a * X(i, k); + r0[0] += pf * Y(k, j+0); r0[1] += pf * Y(k, j+1); r0[2] += pf * Y(k, j+2); r0[3] += pf * Y(k, j+3); + + pf = a * X(i+1, k); + r1[0] += pf * Y(k, j+0); r1[1] += pf * Y(k, j+1); r1[2] += pf * Y(k, j+2); r1[3] += pf * Y(k, j+3); + + pf = a * X(i+2, k); + r1[0] += pf * Y(k, j+0); r1[1] += pf * Y(k, j+1); r1[2] += pf * Y(k, j+2); r1[3] += pf * Y(k, j+3); + + pf = a * X(i+3, k); + r1[0] += pf * Y(k, j+0); r1[1] += pf * Y(k, j+1); r1[2] += pf * Y(k, j+2); r1[3] += pf * Y(k, j+3); + } + Z(i+0, j+0) = r0[0]; Z(i+0, j+1) = r0[1]; Z(i+0, j+2) = r0[2]; Z(i+0, j+3) = r0[3]; + Z(i+1, j+0) = r1[0]; Z(i+1, j+1) = r1[1]; Z(i+1, j+2) = r1[2]; Z(i+1, j+3) = r1[3]; + Z(i+2, j+0) = r2[0]; Z(i+2, j+1) = r2[1]; Z(i+2, j+2) = r2[2]; Z(i+2, j+3) = r2[3]; + Z(i+3, j+0) = r3[0]; Z(i+3, j+1) = r3[1]; Z(i+3, j+2) = r3[2]; Z(i+3, j+3) = r3[3]; + } + } + } + } +} + +#if 0 +void +func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) +{ + int i, j, k; + FLOAT w[nj*nk], acc[4][4]; + + for (i = 0; i < ni; i++) { + for (j = 0; j < nj; j++) { + Z(i,j) *= b; + W(i,j) = Y(j,i); + } + } + + for (i = 0; i < ni; i+=4) { + for (j = 0; j < nj; j+=4) { + memset(acc, 0, sizeof(acc)); + for (k = 0; k < nk; k+=4) { + acc[0][0] += X(i+0,k)*W(j+0,k) + X(i+0,k+1)*W(j+0,k+1) + X(i+0,k+2)*W(j+0,k+2) + X(i+0,k+3)*W(j+0,k+3); + acc[0][1] += X(i+0,k)*W(j+1,k) + X(i+0,k+1)*W(j+1,k+1) + X(i+0,k+2)*W(j+1,k+2) + X(i+0,k+3)*W(j+1,k+3); + acc[0][2] += X(i+0,k)*W(j+2,k) + X(i+0,k+1)*W(j+2,k+1) + X(i+0,k+2)*W(j+2,k+2) + X(i+0,k+3)*W(j+2,k+3); + acc[0][3] += X(i+0,k)*W(j+3,k) + X(i+0,k+1)*W(j+3,k+1) + X(i+0,k+2)*W(j+3,k+2) + X(i+0,k+3)*W(j+3,k+3); + + acc[1][0] += X(i+1,k)*W(j+0,k) + X(i+1,k+1)*W(j+0,k+1) + X(i+1,k+2)*W(j+0,k+2) + X(i+1,k+3)*W(j+0,k+3); + acc[1][1] += X(i+1,k)*W(j+1,k) + X(i+1,k+1)*W(j+1,k+1) + X(i+1,k+2)*W(j+1,k+2) + X(i+1,k+3)*W(j+1,k+3); + acc[1][2] += X(i+1,k)*W(j+2,k) + X(i+1,k+1)*W(j+2,k+1) + X(i+1,k+2)*W(j+2,k+2) + X(i+1,k+3)*W(j+2,k+3); + acc[1][3] += X(i+1,k)*W(j+3,k) + X(i+1,k+1)*W(j+3,k+1) + X(i+1,k+2)*W(j+3,k+2) + X(i+1,k+3)*W(j+3,k+3); + + acc[2][0] += X(i+2,k)*W(j+0,k) + X(i+2,k+1)*W(j+0,k+1) + X(i+2,k+2)*W(j+0,k+2) + X(i+2,k+3)*W(j+0,k+3); + acc[2][1] += X(i+2,k)*W(j+1,k) + X(i+2,k+1)*W(j+1,k+1) + X(i+2,k+2)*W(j+1,k+2) + X(i+2,k+3)*W(j+1,k+3); + acc[2][2] += X(i+2,k)*W(j+2,k) + X(i+2,k+1)*W(j+2,k+1) + X(i+2,k+2)*W(j+2,k+2) + X(i+2,k+3)*W(j+2,k+3); + acc[2][3] += X(i+2,k)*W(j+3,k) + X(i+2,k+1)*W(j+3,k+1) + X(i+2,k+2)*W(j+3,k+2) + X(i+2,k+3)*W(j+3,k+3); + + acc[2][0] += X(i+3,k)*W(j+0,k) + X(i+3,k+1)*W(j+0,k+1) + X(i+3,k+2)*W(j+0,k+2) + X(i+3,k+3)*W(j+0,k+3); + acc[2][1] += X(i+3,k)*W(j+1,k) + X(i+3,k+1)*W(j+1,k+1) + X(i+3,k+2)*W(j+1,k+2) + X(i+3,k+3)*W(j+1,k+3); + acc[2][2] += X(i+3,k)*W(j+2,k) + X(i+3,k+1)*W(j+2,k+1) + X(i+3,k+2)*W(j+2,k+2) + X(i+3,k+3)*W(j+2,k+3); + acc[2][3] += X(i+3,k)*W(j+3,k) + X(i+3,k+1)*W(j+3,k+1) + X(i+3,k+2)*W(j+3,k+2) + X(i+3,k+3)*W(j+3,k+3); + // Z(i,j) += X(i,k)*Y(k,j); + } + Z(i+0,j+1) = a*acc[0][0]; + Z(i+0,j+2) = a*acc[0][1]; + Z(i+0,j+3) = a*acc[0][2]; + Z(i+0,j+4) = a*acc[0][3]; + + Z(i+1,j+1) = a*acc[1][0]; + Z(i+1,j+2) = a*acc[1][1]; + Z(i+1,j+3) = a*acc[1][2]; + Z(i+1,j+4) = a*acc[1][3]; + + Z(i+2,j+1) = a*acc[2][0]; + Z(i+2,j+2) = a*acc[2][1]; + Z(i+2,j+3) = a*acc[2][2]; + Z(i+2,j+4) = a*acc[2][3]; + + Z(i+3,j+1) = a*acc[3][0]; + Z(i+3,j+2) = a*acc[3][1]; + Z(i+3,j+3) = a*acc[3][2]; + Z(i+3,j+4) = a*acc[3][3]; + } + } +} +#endif + +#if 0 +void +func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) +{ + int i, j, k, ri, rj, rk; + FLOAT reg[4][4], *xrow[4], *yrow[4]; + + for (i = 0; i < ni; i++) { + for (j = 0; j < nj; j++) { + z[j + incz*i] *= b; + } + } + + for (i = 0; i < ni; i += 4) { + xrow[0] = x + incx*(i+0); + xrow[1] = x + incx*(i+1); + xrow[2] = x + incx*(i+2); + xrow[3] = x + incx*(i+3); + for (k = 0; k < nk; k+=4) { + yrow[0] = y + incy*(k+0); + yrow[1] = y + incy*(k+1); + yrow[2] = y + incy*(k+2); + yrow[3] = y + incy*(k+3); + reg[0][0] = a * xrow[0][k+0]; reg[0][1] = a * xrow[0][k+1]; reg[0][2] = a * xrow[0][k+2]; reg[0][3] = a * xrow[0][k+3]; + reg[1][0] = a * xrow[1][k+0]; reg[1][1] = a * xrow[1][k+1]; reg[1][2] = a * xrow[1][k+2]; reg[1][3] = a * xrow[1][k+3]; + reg[2][0] = a * xrow[2][k+0]; reg[2][1] = a * xrow[2][k+1]; reg[2][2] = a * xrow[2][k+2]; reg[2][3] = a * xrow[2][k+3]; + reg[3][0] = a * xrow[3][k+0]; reg[3][1] = a * xrow[3][k+1]; reg[3][2] = a * xrow[3][k+2]; reg[3][3] = a * xrow[3][k+3]; + for (j = 0; j < nj; j += 1) { + z[j + incz*(i+0)] += (reg[0][0]*yrow[0][j]+reg[0][1]*yrow[1][j]+reg[0][2]*yrow[2][j]+reg[0][3]*yrow[3][j]); + z[j + incz*(i+1)] += (reg[1][0]*yrow[0][j]+reg[1][1]*yrow[1][j]+reg[1][2]*yrow[2][j]+reg[1][3]*yrow[3][j]); + z[j + incz*(i+2)] += (reg[2][0]*yrow[0][j]+reg[2][1]*yrow[1][j]+reg[2][2]*yrow[2][j]+reg[2][3]*yrow[3][j]); + z[j + incz*(i+3)] += (reg[3][0]*yrow[0][j]+reg[3][1]*yrow[1][j]+reg[3][2]*yrow[2][j]+reg[3][3]*yrow[3][j]); + } + } + } +} +#endif + +#if 0 +void +func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) +{ + int i, j, k, ri, rj, rk; + FLOAT r[4][4], *row[4]; + + for (i = 0; i < ni; i++) { + for (j = 0; j < nj; j++) { + Z(i, j) *= b; + } + } + + for (i = 0; i < ni; i+=4) { + for (j = 0; j < nj; j+=4) { + r[0][0] = 0; r[0][1] = 0; r[0][2] = 0; r[0][3] = 0; + r[1][0] = 0; r[1][1] = 0; r[1][2] = 0; r[1][3] = 0; + r[2][0] = 0; r[2][1] = 0; r[2][2] = 0; r[2][3] = 0; + r[3][0] = 0; r[3][1] = 0; r[3][2] = 0; r[3][3] = 0; + row[0] = &X(i+0, 0); + row[1] = &X(i+1, 0); + row[2] = &X(i+2, 0); + row[3] = &X(i+3, 0); + for (k = 0; k < nk; k++) { + r[0][0] += row[0][k]*Y(k,0); r[0][1] += row[0][k]*Y(k,1); r[0][2] += row[0][k]*Y(k,2); r[0][3] += row[0][k]*Y(k,3); + r[1][0] += row[1][k]*Y(k,0); r[1][1] += row[1][k]*Y(k,1); r[1][2] += row[1][k]*Y(k,2); r[1][3] += row[1][k]*Y(k,3); + r[2][0] += row[2][k]*Y(k,0); r[2][1] += row[2][k]*Y(k,1); r[2][2] += row[2][k]*Y(k,2); r[2][3] += row[2][k]*Y(k,3); + r[3][0] += row[3][k]*Y(k,0); r[3][1] += row[3][k]*Y(k,1); r[3][2] += row[3][k]*Y(k,2); r[3][3] += row[3][k]*Y(k,3); + } + Z(i+0, j+0) += r[0][0]; Z(i+0, j+1) += r[0][1]; Z(i+0, j+2) += r[0][2]; Z(i+0, j+3) += r[0][3]; + Z(i+1, j+0) += r[1][0]; Z(i+1, j+1) += r[1][1]; Z(i+1, j+2) += r[1][2]; Z(i+1, j+3) += r[1][3]; + Z(i+2, j+0) += r[2][0]; Z(i+2, j+1) += r[2][1]; Z(i+2, j+2) += r[2][2]; Z(i+2, j+3) += r[2][3]; + Z(i+3, j+0) += r[3][0]; Z(i+3, j+1) += r[3][1]; Z(i+3, j+2) += r[3][2]; Z(i+3, j+3) += r[3][3]; + } + } +} +#endif + +#if 0 +void +func(gemm)(uint trm, uint trn, INT ni, INT nj, INT nk, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT b, FLOAT *z, INT incz) +{ + int i, j, k, ri, rj, rk; + FLOAT *xrow[8], *yrow[8], reg; + + for (i = 0; i < ni; i++) { + for (j = 0; j < nj; j++) { + z[j + incz*i] *= b; + } + } + + ri = ni & ~7; + rj = nj & ~7; + for (i = 0; i < ri; i += 8) { + xrow[0] = x + incx*(i+0); + xrow[1] = x + incx*(i+1); + xrow[2] = x + incx*(i+2); + xrow[3] = x + incx*(i+3); + xrow[4] = x + incx*(i+4); + xrow[5] = x + incx*(i+5); + xrow[6] = x + incx*(i+6); + xrow[7] = x + incx*(i+7); + for (j = 0; j < rj; j += 8) { + yrow[0] = y + incy*(j+0); + yrow[1] = y + incy*(j+1); + yrow[2] = y + incy*(j+2); + yrow[3] = y + incy*(j+3); + yrow[4] = y + incy*(j+4); + yrow[5] = y + incy*(j+5); + yrow[6] = y + incy*(j+6); + yrow[7] = y + incy*(j+7); + for (k = 0; k < nk; k++) { + reg = a*(yrow[0][k] + yrow[1][k] + yrow[2][k] + yrow[3][k] + yrow[4][k] + yrow[5][k] + yrow[6][k] + yrow[7][k]); + z[k + incz*(i+0)] += xrow[0][k]*reg; + z[k + incz*(i+1)] += xrow[1][k]*reg; + z[k + incz*(i+2)] += xrow[2][k]*reg; + z[k + incz*(i+3)] += xrow[3][k]*reg; + z[k + incz*(i+4)] += xrow[4][k]*reg; + z[k + incz*(i+5)] += xrow[5][k]*reg; + z[k + incz*(i+6)] += xrow[6][k]*reg; + z[k + incz*(i+7)] += xrow[7][k]*reg; + } + } + for (; j < nj; j++) { + for (k = 0; k < nk; k++) { + reg = a*y[k+incy*j]; + z[k + incz*(i+0)] += xrow[0][k]*reg; + z[k + incz*(i+1)] += xrow[1][k]*reg; + z[k + incz*(i+2)] += xrow[2][k]*reg; + z[k + incz*(i+3)] += xrow[3][k]*reg; + z[k + incz*(i+4)] += xrow[4][k]*reg; + z[k + incz*(i+5)] += xrow[5][k]*reg; + z[k + incz*(i+6)] += xrow[6][k]*reg; + z[k + incz*(i+7)] += xrow[7][k]*reg; + } + } + } + + for (; i < ni; i++) { + for (j = 0; j < rj; j += 8) { + yrow[0] = y + incy*(j+0); + yrow[1] = y + incy*(j+1); + yrow[2] = y + incy*(j+2); + yrow[3] = y + incy*(j+3); + yrow[4] = y + incy*(j+4); + yrow[5] = y + incy*(j+5); + yrow[6] = y + incy*(j+6); + yrow[7] = y + incy*(j+7); + for (k = 0; k < nk; k++) { + z[k + incz*(i)] += a*x[k + incx*i]*(yrow[0][k] + yrow[1][k] + yrow[2][k] + yrow[3][k] + yrow[4][k] + yrow[5][k] + yrow[6][k] + yrow[7][k]); + } + } + for (; j < nj; j++) { + for (k = 0; k < nk; k++) { + z[k + incz*i] += a*x[k + incx*i]*y[k + incy*j]; + } + } + } +} +#endif diff --git a/src/libmath/lapack.c b/src/libmath/lapack.c new file mode 100644 index 0000000..e69de29 diff --git a/src/libmath/linalg.c b/src/libmath/linalg.c new file mode 100644 index 0000000..8551ff1 --- /dev/null +++ b/src/libmath/linalg.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +// ----------------------------------------------------------------------- +// Vector + +void +linalg·normalize(math·Vector vec) +{ + double norm; + + norm = blas·normd(vec.len, vec.data, 1); + blas·scaled(vec.len, 1/norm, vec.data, 1); +} +// TODO: Write blas wrappers that eat vectors for convenience + +// ----------------------------------------------------------------------- +// Matrix +// +// NOTE: all matrices are row major oriented + +/* + * linalg·lq + * computes the LQ decomposition of matrix M: M = LQ + * L is lower triangular + * Q is orthogonal -> transp(Q) * Q = I + * + * m: matrix to factorize. changes in place + * + lower triangle -> L + * + upper triangle -> all reflection vectors stored in rows + * w: working buffer: len = ncols! + */ +error +linalg·lq(math·Matrix m, math·Vector w) +{ + int i, j, len; + double *row, mag; + enum { + err·nil, + err·baddims, + }; + + if (m.dim[0] > m.dim[1]) { + return err·baddims; + } + + for (i = 0; i < m.dim[0]; i++, m.data += m.dim[1]) { + row = m.data + i; + len = m.dim[0] - i; + + // TODO: Don't want to compute norm twice!! + w.data[0] = math·sgn(row[0]) * blas·normd(len, row, 1); + blas·axpyd(len, 1.0, row, 1, w.data, 1); + mag = blas·normd(len, w.data, 1); + blas·scaled(len, 1/mag, w.data, 1); + + blas·copyd(len - m.dim[0], w.data, 1, m.data + i, 1); + } + + return err·nil; +} diff --git a/src/libmath/loop.h b/src/libmath/loop.h new file mode 100644 index 0000000..a877d84 --- /dev/null +++ b/src/libmath/loop.h @@ -0,0 +1,114 @@ +#pragma once + +/* increment operator */ +#define INC2(x) INC_##x +#define INC1(x) INC2(x) +#define INC(x) INC1(x) + +#define INC_0 1 +#define INC_1 2 +#define INC_2 3 +#define INC_3 4 +#define INC_4 5 +#define INC_5 6 +#define INC_6 7 +#define INC_7 8 +#define INC_8 9 +#define INC_9 10 +#define INC_10 11 +#define INC_11 12 +#define INC_12 13 +#define INC_13 14 +#define INC_14 15 +#define INC_15 16 + +#define ROUNDBY(x, n) ((x) & ~((n)-1)) + +/* subtraction tables */ +#define SUB2(x, y) SUB_##x##_##y +#define SUB1(x, y) SUB2(x, y) +#define SUB(x, y) SUB1(x, y) +#define SUB_8_0 8 +#define SUB_8_1 7 +#define SUB_8_2 6 +#define SUB_8_3 5 +#define SUB_8_4 4 +#define SUB_8_5 3 +#define SUB_8_6 2 +#define SUB_8_7 1 +#define SUB_8_8 0 +#define SUB_7_0 7 +#define SUB_7_1 6 +#define SUB_7_2 5 +#define SUB_7_3 4 +#define SUB_7_4 3 +#define SUB_7_5 2 +#define SUB_7_6 1 +#define SUB_7_7 0 +#define SUB_6_0 6 +#define SUB_6_1 5 +#define SUB_6_2 4 +#define SUB_6_3 3 +#define SUB_6_4 2 +#define SUB_6_5 1 +#define SUB_6_6 0 +#define SUB_5_0 5 +#define SUB_5_1 4 +#define SUB_5_2 3 +#define SUB_5_3 2 +#define SUB_5_4 1 +#define SUB_5_5 0 +#define SUB_4_0 4 +#define SUB_4_1 3 +#define SUB_4_2 2 +#define SUB_4_3 1 +#define SUB_4_4 0 +#define SUB_3_0 3 +#define SUB_3_1 2 +#define SUB_3_2 1 +#define SUB_3_3 0 +#define SUB_2_0 2 +#define SUB_2_1 1 +#define SUB_2_2 0 +#define SUB_1_0 1 +#define SUB_1_1 0 + +/* rounding operator */ +#define ROUNDBY(x, n) ((x) & ~((n)-1)) + +/* loop unrolling (vertical) */ +#define LOOP1(I,STMT,...) STMT(I,__VA_ARGS__) +#define LOOP2(I,STMT,...) STMT(I,__VA_ARGS__) LOOP1(INC(I),STMT,__VA_ARGS__) +#define LOOP3(I,STMT,...) STMT(I,__VA_ARGS__) LOOP2(INC(I),STMT,__VA_ARGS__) +#define LOOP4(I,STMT,...) STMT(I,__VA_ARGS__) LOOP3(INC(I),STMT,__VA_ARGS__) +#define LOOP5(I,STMT,...) STMT(I,__VA_ARGS__) LOOP4(INC(I),STMT,__VA_ARGS__) +#define LOOP6(I,STMT,...) STMT(I,__VA_ARGS__) LOOP5(INC(I),STMT,__VA_ARGS__) +#define LOOP7(I,STMT,...) STMT(I,__VA_ARGS__) LOOP6(INC(I),STMT,__VA_ARGS__) +#define LOOP8(I,STMT,...) STMT(I,__VA_ARGS__) LOOP7(INC(I),STMT,__VA_ARGS__) +#define LOOP9(I,STMT,...) STMT(I,__VA_ARGS__) LOOP8(INC(I),STMT,__VA_ARGS__) +#define LOOP10(I,STMT,...) STMT(I,__VA_ARGS__) LOOP9(INC(I),STMT,__VA_ARGS__) +#define LOOP11(I,STMT,...) STMT(I,__VA_ARGS__) LOOP10(INC(I),STMT,__VA_ARGS__) +#define LOOP12(I,STMT,...) STMT(I,__VA_ARGS__) LOOP11(INC(I),STMT,__VA_ARGS__) +#define LOOP13(I,STMT,...) STMT(I,__VA_ARGS__) LOOP12(INC(I),STMT,__VA_ARGS__) +#define LOOP14(I,STMT,...) STMT(I,__VA_ARGS__) LOOP13(INC(I),STMT,__VA_ARGS__) +#define LOOP15(I,STMT,...) STMT(I,__VA_ARGS__) LOOP14(INC(I),STMT,__VA_ARGS__) +#define LOOP16(I,STMT,...) STMT(I,__VA_ARGS__) LOOP15(INC(I),STMT,__VA_ARGS__) + +#define _LOOP_(n,I,STMT,...) LOOP##n(I,STMT,__VA_ARGS__) +#define LOOP(n,I,STMT,...) _LOOP_(n,I,STMT,__VA_ARGS__) + +/* loop expansion (horizontal) */ +#define EXPAND0(I,TERM,OP,...) +#define EXPAND1(I,TERM,OP,...) TERM(I,__VA_ARGS__) +#define EXPAND2(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND1(INC(I),TERM,OP,__VA_ARGS__) +#define EXPAND3(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND2(INC(I),TERM,OP,__VA_ARGS__) +#define EXPAND4(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND3(INC(I),TERM,OP,__VA_ARGS__) +#define EXPAND5(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND4(INC(I),TERM,OP,__VA_ARGS__) +#define EXPAND6(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND5(INC(I),TERM,OP,__VA_ARGS__) +#define EXPAND7(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND6(INC(I),TERM,OP,__VA_ARGS__) +#define EXPAND8(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND7(INC(I),TERM,OP,__VA_ARGS__) + +#define _EXPAND_(n,I,TERM,OP,...) EXPAND##n(I,TERM,OP,__VA_ARGS__) +#define EXPAND(n,I,TERM,OP,...) _EXPAND_(n,I,TERM,OP,__VA_ARGS__) +#define EXPAND_TRI1(n,I,TERM,OP,...) EXPAND(n,I,TERM,OP,__VA_ARGS__) +#define EXPAND_TRI(n,I,TERM,OP,...) EXPAND_TRI1(SUB(n,I),I,TERM,OP,__VA_ARGS__) diff --git a/src/libmath/matrix.c b/src/libmath/matrix.c new file mode 100644 index 0000000..e8bca0b --- /dev/null +++ b/src/libmath/matrix.c @@ -0,0 +1,176 @@ +#include +#include +#include + +/* TODO: replace (incrementally) with native C version! */ +#include +#include + +// ----------------------------------------------------------------------- +// level 1 + +error +la·vecslice(math·Vector *x, int min, int max, int inc) +{ + if (max > x->len || min < 0) { + errorf("out of bounds: attempted to access vector past length"); + return 1; + } + x->len = (max - min) / inc; + x->d += x->inc * min; + x->inc *= inc; + + return 0; +} + +/* simple blas wrappers */ +void +la·veccopy(math·Vector *dst, math·Vector *src) +{ + return cblas_dcopy(src->len, src->d, src->inc, dst->d, dst->inc); +} + +double +la·vecnorm(math·Vector *x) +{ + return cblas_dnrm2(x->len, x->d, x->inc); +} + +void +la·vecscale(math·Vector *x, double a) +{ + return cblas_dscal(x->len, a, x->d, x->inc); +} + +double +la·vecdot(math·Vector *x, math·Vector *y) +{ + return cblas_ddot(x->len, x->d, x->inc, y->d, y->inc); +} + +// ----------------------------------------------------------------------- +// level 2 + +error +la·vecmat(math·Vector *x, math·Matrix *M) +{ + if (M->dim[1] != x->len) { + errorf("incompatible matrix dimensions"); + return 1; + } + if (M->state & ~mat·trans) + cblas_dgemv(CblasRowMajor,CblasNoTrans,M->dim[0],M->dim[1],1.,M->d,M->inc,x->d,x->inc,0.,x->d,x->inc); + else + cblas_dgemv(CblasRowMajor,CblasTrans,M->dim[0],M->dim[1],1.,M->d,M->inc,x->d,x->inc,0.,x->d,x->inc); + + return 0; +} + +// ----------------------------------------------------------------------- +// level 3 + +void +la·transpose(math·Matrix *X) +{ + int tmp; + X->state ^= mat·trans; + tmp = X->dim[0], X->dim[0] = X->dim[1], X->dim[1] = tmp; +} + +error +la·matrow(math·Matrix *X, int r, math·Vector *row) +{ + if (r < 0 || r >= X->dim[0]) { + errorf("out of bounds"); + return 1; + } + + row->len = X->dim[1]; + row->inc = 1; + row->d = X->d + X->dim[1] * r; + + return 0; +} + +error +la·matcol(math·Matrix *X, int c, math·Vector *col) +{ + if (c < 0 || c >= X->dim[1]) { + errorf("out of bounds"); + return 1; + } + + col->len = X->dim[0]; + col->inc = X->dim[1]; + col->d = X->d + c; + + return 0; +} + +error +la·matslice(math·Matrix *X, int r[3], int c[3]) +{ + /* TODO */ + return 0; +} + +error +la·eig(math·Matrix *X) +{ + +} + +/* X = A*B */ +error +la·matmul(math·Matrix *X, math·Matrix *A, math·Matrix *B) +{ + if (A->dim[1] != B->dim[0]) { + errorf("number of interior dimensions of A '%d' not equal to that of B '%d'", A->dim[1], B->dim[0]); + return 1; + } + if (X->dim[0] != A->dim[0]) { + errorf("number of exterior dimensions of X '%d' not equal to that of A '%d'", X->dim[0], A->dim[0]); + return 1; + } + if (X->dim[1] != B->dim[1]) { + errorf("number of exterior dimensions of X '%d' not equal to that of B '%d'", X->dim[1], B->dim[1]); + return 1; + } + + if (X->state & ~mat·trans) + if (A->state & ~mat·trans) + cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc); + else + cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc); + else + if (A->state & ~mat·trans) + cblas_dgemm(CblasRowMajor,CblasTrans,CblasNoTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc); + else + cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,A->dim[0],B->dim[1],A->dim[1],1.,A->d,A->inc,B->d,B->inc,0.,X->d,X->inc); + + return 0; +} + +/* + * solves A*X=B + * pass in B via X + */ +error +la·solve(math·Matrix *X, math·Matrix *A) +{ + error err; + int n, *ipv; + static int buf[512]; + if (n = A->dim[0], n < arrlen(buf)) { + ipv = buf; + n = 0; + } else + ipv = malloc(n*sizeof(*ipv)); + + /* TODO: utilize more specific regimes if applicable */ + err = LAPACKE_dgesv(LAPACK_ROW_MAJOR,A->dim[0],X->dim[1],A->d,A->inc,ipv,X->d,X->inc); + + if (n) + free(ipv); + return err; +} diff --git a/src/libmath/rules.mk b/src/libmath/rules.mk new file mode 100644 index 0000000..577aba4 --- /dev/null +++ b/src/libmath/rules.mk @@ -0,0 +1,27 @@ +include share/push.mk + +# Iterate through subdirectory tree + +# local sources +SRCS_$(d):=\ + $(d)/basic.c\ + $(d)/blas1.c\ + $(d)/blas2.c\ + $(d)/blas3.c +CHECK_$(d):=\ + +# outputs +LIBS_$(d):=\ + $(d)/libmath.a + +include share/paths.mk + +$(LIBS_$(d)): $(OBJS_$(d)) + $(ARCHIVE) + +$(TEST_$(d)): TCFLAGS = -D_GNU_SOURCE +$(TEST_$(d)): TCLIBS = -lpthread -lm $(LIB_DIR)/libblas.a +$(TEST_$(d)): $(UNIT_$(d)) $(LIBS_$(d)) $(OBJ_DIR)/base/base.a + $(LINK) + +include share/pop.mk diff --git a/src/libmath/test.c b/src/libmath/test.c new file mode 100644 index 0000000..66700f8 --- /dev/null +++ b/src/libmath/test.c @@ -0,0 +1,471 @@ +#include +#include +/* #include */ + +#include + +#include + +// ----------------------------------------------------------------------- +// Vectors + +/* + * NOTE: I'm not sure I like stashing the header in _all_ vectors + * The only way to fix is to have a library based allocator... + */ +typedef struct math·Vec +{ + struct { + void *h; + mem·Allocator heap; + }; + int len; + double *d; +} math·Vec; + +math·Vec +math·makevec(int len, mem·Allocator heap, void *h) +{ + math·Vec v; + v.len = len; + v.heap = heap; + v.h = h; + v.d = heap.alloc(h, 1, len*sizeof(double)); + + // memset(v.d, 0, len*sizeof(double)); + + return v; +} + +error +math·freevec(math·Vec *v) +{ + if (v->h == nil && v->heap.alloc == nil && v->heap.free == nil) { + errorf("attempting to free a vector that doesn't own its data"); + return 1; + } + v->heap.free(v->h, v->d); + v->d = nil; + v->len = 0; + + return 0; +} + +math·Vec +math·copyvec(math·Vec v) +{ + math·Vec cpy; + cpy.heap = v.heap; + cpy.h = v.h; + cpy.len = v.len; + cpy.d = cpy.heap.alloc(cpy.h, 1, v.len); + + memcpy(cpy.d, v.d, sizeof(double)*v.len); + return cpy; +} + +/* + * Scale vector + */ + +static +void +scale_kernel8_avx2(int n, double *x, double a) +{ + __m128d a128; + __m256d a256; + register int i; + + a128 = _mm_load_sd(&a); + a256 = _mm256_broadcastsd_pd(a128); + for (i = 0; i < n; i += 8) { + _mm256_storeu_pd(x+i+0, a256 * _mm256_loadu_pd(x+i+0)); + _mm256_storeu_pd(x+i+4, a256 * _mm256_loadu_pd(x+i+4)); + } +} + +static +void +scale_kernel8(int n, double *x, double a) +{ + register int i; + for (i = 0; i < n; i += 8) { + x[i+0] *= a; + x[i+1] *= a; + x[i+2] *= a; + x[i+3] *= a; + x[i+4] *= a; + x[i+5] *= a; + x[i+6] *= a; + x[i+7] *= a; + } +} + +void +math·scalevec(math·Vec u, double a) +{ + int n; + + n = u.len & ~7; + scale_kernel8_avx2(n, u.d, a); + + for (; n < u.len; n++) { + u.d[n] *= a; + } +} + +/* + * Add scaled vector + */ + +static +void +daxpy_kernel8_avx2(int n, double *x, double *y, double a) +{ + __m128d a128; + __m256d a256; + register int i; + + a128 = _mm_load_sd(&a); + a256 = _mm256_broadcastsd_pd(a128); + for (i = 0; i < n; i += 8) { + _mm256_storeu_pd(x+i+0, _mm256_loadu_pd(x+i+0) + a256 * _mm256_loadu_pd(y+i+0)); + _mm256_storeu_pd(x+i+4, _mm256_loadu_pd(x+i+4) + a256 * _mm256_loadu_pd(y+i+4)); + } +} + +static +void +daxpy_kernel8(int n, double *x, double *y, double a) +{ + register int i; + for (i = 0; i < n; i += 8) { + x[i+0] += a*y[i+0]; + x[i+1] += a*y[i+1]; + x[i+2] += a*y[i+2]; + x[i+3] += a*y[i+3]; + x[i+4] += a*y[i+4]; + x[i+5] += a*y[i+5]; + x[i+6] += a*y[i+6]; + x[i+7] += a*y[i+7]; + } +} + +/* performs u = u + a*v */ +void +math·addvec(math·Vec u, math·Vec v, double a) +{ + int n; + + n = u.len & ~7; + daxpy_kernel8_avx2(n, u.d, v.d, a); + + for (; n < u.len; n++) { + u.d[n] += a*v.d[n]; + } +} + +/* + * Dot product + */ + +static +double +dot_kernel8_avx2(int len, double *x, double *y) +{ + register int i; + __m256d sum[4]; + __m128d res; + + for (i = 0; i < arrlen(sum); i++) { + sum[i] = _mm256_setzero_pd(); + } + + for (i = 0; i < len; i += 16) { + sum[0] += _mm256_loadu_pd(x+i+0) * _mm256_loadu_pd(y+i+0); + sum[1] += _mm256_loadu_pd(x+i+4) * _mm256_loadu_pd(y+i+4); + sum[2] += _mm256_loadu_pd(x+i+8) * _mm256_loadu_pd(y+i+8); + sum[3] += _mm256_loadu_pd(x+i+12) * _mm256_loadu_pd(y+i+12); + } + + sum[0] += sum[1] + sum[2] + sum[3]; + + res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); + res = _mm_hadd_pd(res, res); + + return res[0]; +} + +static +double +dot_kernel8_fma3(int len, double *x, double *y) +{ + register int i; + __m256d sum[4]; + __m128d res; + + for (i = 0; i < arrlen(sum); i++) { + sum[i] = _mm256_setzero_pd(); + } + + for (i = 0; i < len; i += 16) { + sum[0] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+0), _mm256_loadu_pd(y+i+0), sum[0]); + sum[1] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+4), _mm256_loadu_pd(y+i+4), sum[1]); + sum[2] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+8), _mm256_loadu_pd(y+i+8), sum[2]); + sum[3] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+12), _mm256_loadu_pd(y+i+12), sum[3]); + } + + sum[0] += sum[1] + sum[2] + sum[3]; + + res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); + res = _mm_hadd_pd(res, res); + + return res[0]; +} + +static +double +dot_kernel8(int len, double *x, double *y) +{ + double res; + register int i; + + for (i = 0; i < len; i += 8) { + res += x[i] * y[i] + + x[i+1] * y[i+1] + + x[i+2] * y[i+2] + + x[i+3] * y[i+3] + + x[i+4] * y[i+4] + + x[i+5] * y[i+5] + + x[i+6] * y[i+6] + + x[i+7] * y[i+7]; + } + + return res; +} + +double +math·dot(math·Vec u, math·Vec v) +{ + int i, len; + double res; + + len = u.len & ~15; // neat trick + res = dot_kernel8_fma3(len, u.d, v.d); + + for (i = len; i < u.len; i++) { + res += u.d[i] * v.d[i]; + } + + return res; +} + +// ----------------------------------------------------------------------- +// Matrix + +typedef struct math·Mtx +{ + struct { + void *h; + mem·Allocator heap; + }; + int dim[2]; + double *d; +} math·Mtx; + +math·Mtx +math·makemtx(int n, int m, mem·Allocator heap, void *h) +{ + math·Mtx a; + a.dim[0] = n; + a.dim[1] = m; + a.heap = heap; + a.h = h; + a.d = heap.alloc(h, 1, n*m*sizeof(double)); + + // memset(a.d, 0, n*m*sizeof(double)); + + return a; +} + +error +math·freemtx(math·Vec *m) +{ + if (m->h == nil && m->heap.alloc == nil && m->heap.free == nil) { + errorf("attempting to free a matrix that doesn't own its data"); + return 1; + } + m->heap.free(m->h, m->d); + m->d = nil; + m->len = 0; + + return 0; +} + +/************************************************ + * multiply matrix to vector + ***********************************************/ + +/* + * Notation: (number of rows) x (number of columns) _ unroll factor + * N => variable we sum over + */ +static +void +mtxvec_kernel4xN_4_avx2(int ncol, double **row, double *x, double *y) +{ + int c; + __m128d hr; + __m256d x256, r256[4]; + + for (c = 0; c < 4; c++) { + r256[c] = _mm256_setzero_pd(); + } + + for (c = 0; c < ncol; c += 4) { + x256 = _mm256_loadu_pd(x+c); + r256[0] += x256 * _mm256_loadu_pd(row[0] + c); + r256[1] += x256 * _mm256_loadu_pd(row[1] + c); + r256[2] += x256 * _mm256_loadu_pd(row[2] + c); + r256[3] += x256 * _mm256_loadu_pd(row[3] + c); + } + + for (c = 0; c < 4; c++) { + hr = _mm_add_pd(_mm256_extractf128_pd(r256[c], 0), _mm256_extractf128_pd(r256[c], 1)); + hr = _mm_hadd_pd(hr, hr); + y[c] = hr[0]; + } +} + +static +void +mtxvec_kernel4xN_4(int ncol, double **row, double *x, double *y) +{ + int c; + double res[4]; + + res[0] = 0.; + res[1] = 0.; + res[2] = 0.; + res[3] = 0.; + + for (c = 0; c < ncol; c += 4) { + res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3]; + res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3]; + res[2] += row[2][c+0]*x[c+0] + row[2][c+1]*x[c+1] + row[2][c+2]*x[c+2] + row[2][c+3]*x[c+3]; + res[3] += row[3][c+0]*x[c+0] + row[3][c+1]*x[c+1] + row[3][c+2]*x[c+2] + row[3][c+3]*x[c+3]; + } + + y[0] = res[0]; + y[1] = res[1]; + y[2] = res[2]; + y[3] = res[3]; +} + +static +void +mtxvec_kernel1xN_4(int ncol, double *row, double *x, double *y) +{ + int c; + double res; + + res = 0.; + for (c = 0; c < ncol; c += 4) { + res += row[c+0]*x[c+0] + row[c+1]*x[c+1] + row[c+2]*x[c+2] + row[c+3]*x[c+3]; + } + + y[0] = res; +} + +// y = a*mx + b*y +error +math·mtxvec(math·Mtx m, double a, math·Vec x, double b, math·Vec y) +{ + int c, r, nrow, ncol; + double *row[4], res[4]; + + nrow = m.dim[0] & ~3; + ncol = m.dim[1] & ~3; + for (r = 0; r < nrow; r += 4) { + row[0] = m.d + (r * (m.dim[1]+0)); + row[1] = m.d + (r * (m.dim[1]+1)); + row[2] = m.d + (r * (m.dim[1]+2)); + row[3] = m.d + (r * (m.dim[1]+3)); + + mtxvec_kernel4xN_4_avx2(ncol, row, x.d + r, res); + + for (c = ncol; c < m.dim[1]; c++) { + res[0] += row[0][c]; + res[1] += row[1][c]; + res[2] += row[2][c]; + res[3] += row[3][c]; + } + + y.d[r+0] = res[0] + b*y.d[r+0]; + y.d[r+1] = res[1] + b*y.d[r+1]; + y.d[r+2] = res[2] + b*y.d[r+2]; + y.d[r+3] = res[3] + b*y.d[r+3]; + } + + for (; r < m.dim[0]; r++) { + mtxvec_kernel1xN_4(m.dim[0], m.d + (r * m.dim[1]), x.d + r, res); + y.d[r] = res[0] + b*y.d[r]; + } + + return 0; +} + +/************************************************ + * add matrix to vector outerproduct + ***********************************************/ + +#define NITER 50 + +#if 0 +error +main() +{ + int i; + clock_t t; + double res; + + math·Mtx m; + math·Vec x, y; + + openblas_set_num_threads(1); + + x = math·makevec(1000, mem·sys, nil); + y = math·makevec(1000, mem·sys, nil); + m = math·makemtx(1000, 1000, mem·sys, nil); + + for (i = 0; i < x.len; i++) { + y.d[i] = i; + } + + t = clock(); + for (i = 0; i < NITER; i++) { + cblas_dgemv(CblasRowMajor, CblasNoTrans, m.dim[0], m.dim[1], 1.5, m.d, m.dim[1], x.d, 1, 2.5, y.d, 1); + } + t = clock() - t; + res = math·dot(y, y); + printf("the result is %f\n", res); + printf("time elapsed (blas): %fms\n", 1000.*t/CLOCKS_PER_SEC); + + for (i = 0; i < x.len; i++) { + y.d[i] = i; + } + + t = clock(); + for (i = 0; i < NITER; i++) { + math·mtxvec(m, 1.5, x, 2.5, y); + } + t = clock() - t; + res = math·dot(y, y); + + printf("the dot product is %f\n", res); + printf("time elapsed (naive): %fms\n", 1000.*t/CLOCKS_PER_SEC); + + + return 0; +} +#endif -- cgit v1.2.1