From 3eff49e57b9d122a1da1103988731c9ace69c304 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Mon, 25 May 2020 15:34:25 -0700 Subject: feat: added preprocessor macros to manually unroll loops --- sys/libmath/blas1body | 215 ++++++++++++++++++++++++++++++++++++++++++++++++++ sys/libmath/loop.h | 37 +++++++++ 2 files changed, 252 insertions(+) create mode 100644 sys/libmath/blas1body create mode 100644 sys/libmath/loop.h (limited to 'sys') diff --git a/sys/libmath/blas1body b/sys/libmath/blas1body new file mode 100644 index 0000000..de4b637 --- /dev/null +++ b/sys/libmath/blas1body @@ -0,0 +1,215 @@ +/* vim: set ft=c */ +// ----------------------------------------------------------------------- +// Function implementations + +FLOAT +func(dot)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy) +{ +#define INIT(I,...) reg[I] = 0; +#define KERNEL(I, INCX, INCY) reg[I] += x[(INCX)*(i + I)] * y[(INCY)*(i + I)]; + INT i, n; + FLOAT reg[UNROLL]; + + BODY_XY() + + for (i = 1; i < UNROLL; i++) + reg[0] += reg[i]; + + return reg[0]; +#undef INIT +#undef KERNEL +} + +void +func(copy)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy) +{ +#define INIT(I,...) +#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] = x[(INCX)*(i + I)]; + INT i, n; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +void +func(swap)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy) +{ +#define INIT(I,...) +#define KERNEL(I, INCX, INCY) tmp[I] = x[(INCX)*(i + I)], x[(INCX)*(i + I)] = y[(INCY)*(i + I)], y[(INCY)*(i + I)] = tmp[I]; + INT i, n; + FLOAT tmp[UNROLL]; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +void +func(axpy)(INT len, FLOAT a, FLOAT *x, INT incx, FLOAT *y, INT incy) +{ +#define INIT(I,...) +#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] += a*x[(INCX)*(i + I)]; + INT i, n; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +void +func(axpby)(INT len, FLOAT a, FLOAT *x, INT incx, FLOAT b, FLOAT *y, INT incy) +{ +#define INIT(I,...) +#define KERNEL(I, INCX, INCY) y[(INCY)*(i + I)] = a*x[(INCX)*(i + I)] + b*y[(INCY)*(i + I)]; + INT i, n; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +INT +func(argmax)(INT len, FLOAT *x, INT incx) +{ +#define INIT(I,...) max[I] = x[I], idx[I] = I; +#define KERNEL(I, INCX) if (x[(INCX)*(i+I)] > max[I]) {max[i] = x[INCX*(i+I)]; idx[I] = i+I;} + INT i, n; + FLOAT max[UNROLL]; + INT idx[UNROLL]; + + BODY_X(); + + for (i = 1; i < UNROLL; i++) + if (max[i] > max[0]) + idx[0] = idx[i]; + + return idx[0]; +#undef INIT +#undef KERNEL +} + +INT +func(argmin)(INT len, FLOAT *x, INT incx) +{ +#define INIT(I,...) min[I] = x[I], idx[I] = I; +#define KERNEL(I, INCX) if (x[INCX*(i+I)] < min[I]) {min[i] = x[INCX*(i+I)]; idx[I] = i+I;} + INT i, n; + FLOAT min[UNROLL]; + INT idx[UNROLL]; + + BODY_X(); + + for (i = 1; i < UNROLL; i++) + if (min[i] < min[0]) + idx[0] = idx[i]; + + return idx[0]; +#undef INIT +#undef KERNEL +} + +FLOAT +func(asum)(INT len, FLOAT *x, INT incx) +{ +#define INIT(I,...) sum[I] = 0; +#define KERNEL(I, INCX) sum[I] += x[INCX*(i+I)] > 0 ? x[INCX*(i+I)] : -x[INCX*(i+I)]; + INT i, n; + FLOAT sum[UNROLL]; + + BODY_X(); + + for (i = 1; i < UNROLL; i++) + sum[0] += sum[i]; + + return sum[0]; + +#undef INIT +#undef KERNEL +} + +void +func(scale)(INT len, FLOAT a, FLOAT *x, INT incx) +{ +#define INIT(I, ...) +#define KERNEL(I, INCX) x[INCX*(i+I)] *= a; + INT i, n; + + BODY_X(); + +#undef INIT +#undef KERNEL +} + +FLOAT +func(norm)(INT len, FLOAT *x, INT incx) +{ +#define INIT(I, ...) +#define KERNEL(I, INCX) norm[I] += x[INCX*(i+I)] * x[INCX*(i+I)]; + INT i, n; + FLOAT norm[UNROLL]; + + BODY_X(); + + for (i = 1; i < UNROLL; i++) + norm[0] += norm[i]; + + return math·sqrt(norm[0]); + +#undef INIT +#undef KERNEL +} + +void +func(drot)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT cos, FLOAT sin) +{ +#define INIT(I, ...) +#define KERNEL(I, INCX, INCY) tmp[I] = x[INCX*(i+I)], x[INCX*(i+I)] = cos*x[INCX*(i+I)] + sin*y[INCY*(i+I)], y[INCY*(i+I)] = cos*y[INCY*(i+I)] - sin*tmp[I]; + INT i, n; + FLOAT tmp[UNROLL]; + + BODY_XY(); + +#undef INIT +#undef KERNEL +} + +void +func(rotm)(INT len, FLOAT *x, INT incx, FLOAT *y, INT incy, FLOAT H[5]) +{ +#define INIT(I, ...) +#define KERNEL(I, INCX, INCY) tmp[I] = x[INCX*(i+I)], x[INCX*(i+I)] = H[1]*x[INCX*(i+I)] + H[2]*y[INCY*(i+I)], y[INCY*(i+I)] = H[3]*tmp[I] + H[4]*y[INCY*(i+I)]; + INT i, n, f; + FLOAT tmp[UNROLL]; + + f = (INT)H[0]; + switch (f) { + case -2: + H[1] = +1; + H[2] = +0; + H[3] = +0; + H[4] = +1; + break; + case -1: + break; + case +0: + H[1] = +1; + H[4] = +1; + break; + case +1: + H[2] = +1; + H[3] = -1; + break; + default: + return; + } + + BODY_XY(); + +#undef INIT +#undef KERNEL +} diff --git a/sys/libmath/loop.h b/sys/libmath/loop.h new file mode 100644 index 0000000..9a425a7 --- /dev/null +++ b/sys/libmath/loop.h @@ -0,0 +1,37 @@ +#pragma once + +#define ROUNDBY(x, n) ((x) & ~((n)-1)) + +/* loop unrolling (vertical) */ +#define LOOP1(I,STMT,...) STMT(I,__VA_ARGS__) +#define LOOP2(I,STMT,...) STMT(I,__VA_ARGS__) LOOP1((I+1),STMT,__VA_ARGS__) +#define LOOP3(I,STMT,...) STMT(I,__VA_ARGS__) LOOP2((I+1),STMT,__VA_ARGS__) +#define LOOP4(I,STMT,...) STMT(I,__VA_ARGS__) LOOP3((I+1),STMT,__VA_ARGS__) +#define LOOP5(I,STMT,...) STMT(I,__VA_ARGS__) LOOP4((I+1),STMT,__VA_ARGS__) +#define LOOP6(I,STMT,...) STMT(I,__VA_ARGS__) LOOP5((I+1),STMT,__VA_ARGS__) +#define LOOP7(I,STMT,...) STMT(I,__VA_ARGS__) LOOP6((I+1),STMT,__VA_ARGS__) +#define LOOP8(I,STMT,...) STMT(I,__VA_ARGS__) LOOP7((I+1),STMT,__VA_ARGS__) +#define LOOP9(I,STMT,...) STMT(I,__VA_ARGS__) LOOP8((I+1),STMT,__VA_ARGS__) +#define LOOP10(I,STMT,...) STMT(I,__VA_ARGS__) LOOP9((I+1),STMT,__VA_ARGS__) +#define LOOP11(I,STMT,...) STMT(I,__VA_ARGS__) LOOP10((I+1),STMT,__VA_ARGS__) +#define LOOP12(I,STMT,...) STMT(I,__VA_ARGS__) LOOP11((I+1),STMT,__VA_ARGS__) +#define LOOP13(I,STMT,...) STMT(I,__VA_ARGS__) LOOP12((I+1),STMT,__VA_ARGS__) +#define LOOP14(I,STMT,...) STMT(I,__VA_ARGS__) LOOP13((I+1),STMT,__VA_ARGS__) +#define LOOP15(I,STMT,...) STMT(I,__VA_ARGS__) LOOP14((I+1),STMT,__VA_ARGS__) +#define LOOP16(I,STMT,...) STMT(I,__VA_ARGS__) LOOP15((I+1),STMT,__VA_ARGS__) + +#define _LOOP_(n,I,STMT,...) LOOP##n(I,STMT,__VA_ARGS__) +#define LOOP(n,I,STMT,...) _LOOP_(n,I,STMT,__VA_ARGS__) + +/* loop expansion (horizontal) */ +#define EXPAND1(I,TERM,OP,...) TERM(I,__VA_ARGS__) +#define EXPAND2(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND1((I+1),TERM,OP,__VA_ARGS__) +#define EXPAND3(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND2((I+1),TERM,OP,__VA_ARGS__) +#define EXPAND4(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND3((I+1),TERM,OP,__VA_ARGS__) +#define EXPAND5(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND4((I+1),TERM,OP,__VA_ARGS__) +#define EXPAND6(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND5((I+1),TERM,OP,__VA_ARGS__) +#define EXPAND7(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND6((I+1),TERM,OP,__VA_ARGS__) +#define EXPAND8(I,TERM,OP,...) TERM(I,__VA_ARGS__) OP EXPAND7((I+1),TERM,OP,__VA_ARGS__) + +#define _EXPAND_(n,I,TERM,OP,...) EXPAND##n(I,TERM,OP,__VA_ARGS__) +#define EXPAND(n,I,TERM,OP, ...) _EXPAND_(n,I,TERM,OP,__VA_ARGS__) -- cgit v1.2.1