From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- sys/libmath/test.c | 471 ----------------------------------------------------- 1 file changed, 471 deletions(-) delete mode 100644 sys/libmath/test.c (limited to 'sys/libmath/test.c') diff --git a/sys/libmath/test.c b/sys/libmath/test.c deleted file mode 100644 index 66700f8..0000000 --- a/sys/libmath/test.c +++ /dev/null @@ -1,471 +0,0 @@ -#include -#include -/* #include */ - -#include - -#include - -// ----------------------------------------------------------------------- -// Vectors - -/* - * NOTE: I'm not sure I like stashing the header in _all_ vectors - * The only way to fix is to have a library based allocator... - */ -typedef struct math·Vec -{ - struct { - void *h; - mem·Allocator heap; - }; - int len; - double *d; -} math·Vec; - -math·Vec -math·makevec(int len, mem·Allocator heap, void *h) -{ - math·Vec v; - v.len = len; - v.heap = heap; - v.h = h; - v.d = heap.alloc(h, 1, len*sizeof(double)); - - // memset(v.d, 0, len*sizeof(double)); - - return v; -} - -error -math·freevec(math·Vec *v) -{ - if (v->h == nil && v->heap.alloc == nil && v->heap.free == nil) { - errorf("attempting to free a vector that doesn't own its data"); - return 1; - } - v->heap.free(v->h, v->d); - v->d = nil; - v->len = 0; - - return 0; -} - -math·Vec -math·copyvec(math·Vec v) -{ - math·Vec cpy; - cpy.heap = v.heap; - cpy.h = v.h; - cpy.len = v.len; - cpy.d = cpy.heap.alloc(cpy.h, 1, v.len); - - memcpy(cpy.d, v.d, sizeof(double)*v.len); - return cpy; -} - -/* - * Scale vector - */ - -static -void -scale_kernel8_avx2(int n, double *x, double a) -{ - __m128d a128; - __m256d a256; - register int i; - - a128 = _mm_load_sd(&a); - a256 = _mm256_broadcastsd_pd(a128); - for (i = 0; i < n; i += 8) { - _mm256_storeu_pd(x+i+0, a256 * _mm256_loadu_pd(x+i+0)); - _mm256_storeu_pd(x+i+4, a256 * _mm256_loadu_pd(x+i+4)); - } -} - -static -void -scale_kernel8(int n, double *x, double a) -{ - register int i; - for (i = 0; i < n; i += 8) { - x[i+0] *= a; - x[i+1] *= a; - x[i+2] *= a; - x[i+3] *= a; - x[i+4] *= a; - x[i+5] *= a; - x[i+6] *= a; - x[i+7] *= a; - } -} - -void -math·scalevec(math·Vec u, double a) -{ - int n; - - n = u.len & ~7; - scale_kernel8_avx2(n, u.d, a); - - for (; n < u.len; n++) { - u.d[n] *= a; - } -} - -/* - * Add scaled vector - */ - -static -void -daxpy_kernel8_avx2(int n, double *x, double *y, double a) -{ - __m128d a128; - __m256d a256; - register int i; - - a128 = _mm_load_sd(&a); - a256 = _mm256_broadcastsd_pd(a128); - for (i = 0; i < n; i += 8) { - _mm256_storeu_pd(x+i+0, _mm256_loadu_pd(x+i+0) + a256 * _mm256_loadu_pd(y+i+0)); - _mm256_storeu_pd(x+i+4, _mm256_loadu_pd(x+i+4) + a256 * _mm256_loadu_pd(y+i+4)); - } -} - -static -void -daxpy_kernel8(int n, double *x, double *y, double a) -{ - register int i; - for (i = 0; i < n; i += 8) { - x[i+0] += a*y[i+0]; - x[i+1] += a*y[i+1]; - x[i+2] += a*y[i+2]; - x[i+3] += a*y[i+3]; - x[i+4] += a*y[i+4]; - x[i+5] += a*y[i+5]; - x[i+6] += a*y[i+6]; - x[i+7] += a*y[i+7]; - } -} - -/* performs u = u + a*v */ -void -math·addvec(math·Vec u, math·Vec v, double a) -{ - int n; - - n = u.len & ~7; - daxpy_kernel8_avx2(n, u.d, v.d, a); - - for (; n < u.len; n++) { - u.d[n] += a*v.d[n]; - } -} - -/* - * Dot product - */ - -static -double -dot_kernel8_avx2(int len, double *x, double *y) -{ - register int i; - __m256d sum[4]; - __m128d res; - - for (i = 0; i < arrlen(sum); i++) { - sum[i] = _mm256_setzero_pd(); - } - - for (i = 0; i < len; i += 16) { - sum[0] += _mm256_loadu_pd(x+i+0) * _mm256_loadu_pd(y+i+0); - sum[1] += _mm256_loadu_pd(x+i+4) * _mm256_loadu_pd(y+i+4); - sum[2] += _mm256_loadu_pd(x+i+8) * _mm256_loadu_pd(y+i+8); - sum[3] += _mm256_loadu_pd(x+i+12) * _mm256_loadu_pd(y+i+12); - } - - sum[0] += sum[1] + sum[2] + sum[3]; - - res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); - res = _mm_hadd_pd(res, res); - - return res[0]; -} - -static -double -dot_kernel8_fma3(int len, double *x, double *y) -{ - register int i; - __m256d sum[4]; - __m128d res; - - for (i = 0; i < arrlen(sum); i++) { - sum[i] = _mm256_setzero_pd(); - } - - for (i = 0; i < len; i += 16) { - sum[0] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+0), _mm256_loadu_pd(y+i+0), sum[0]); - sum[1] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+4), _mm256_loadu_pd(y+i+4), sum[1]); - sum[2] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+8), _mm256_loadu_pd(y+i+8), sum[2]); - sum[3] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+12), _mm256_loadu_pd(y+i+12), sum[3]); - } - - sum[0] += sum[1] + sum[2] + sum[3]; - - res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); - res = _mm_hadd_pd(res, res); - - return res[0]; -} - -static -double -dot_kernel8(int len, double *x, double *y) -{ - double res; - register int i; - - for (i = 0; i < len; i += 8) { - res += x[i] * y[i] + - x[i+1] * y[i+1] + - x[i+2] * y[i+2] + - x[i+3] * y[i+3] + - x[i+4] * y[i+4] + - x[i+5] * y[i+5] + - x[i+6] * y[i+6] + - x[i+7] * y[i+7]; - } - - return res; -} - -double -math·dot(math·Vec u, math·Vec v) -{ - int i, len; - double res; - - len = u.len & ~15; // neat trick - res = dot_kernel8_fma3(len, u.d, v.d); - - for (i = len; i < u.len; i++) { - res += u.d[i] * v.d[i]; - } - - return res; -} - -// ----------------------------------------------------------------------- -// Matrix - -typedef struct math·Mtx -{ - struct { - void *h; - mem·Allocator heap; - }; - int dim[2]; - double *d; -} math·Mtx; - -math·Mtx -math·makemtx(int n, int m, mem·Allocator heap, void *h) -{ - math·Mtx a; - a.dim[0] = n; - a.dim[1] = m; - a.heap = heap; - a.h = h; - a.d = heap.alloc(h, 1, n*m*sizeof(double)); - - // memset(a.d, 0, n*m*sizeof(double)); - - return a; -} - -error -math·freemtx(math·Vec *m) -{ - if (m->h == nil && m->heap.alloc == nil && m->heap.free == nil) { - errorf("attempting to free a matrix that doesn't own its data"); - return 1; - } - m->heap.free(m->h, m->d); - m->d = nil; - m->len = 0; - - return 0; -} - -/************************************************ - * multiply matrix to vector - ***********************************************/ - -/* - * Notation: (number of rows) x (number of columns) _ unroll factor - * N => variable we sum over - */ -static -void -mtxvec_kernel4xN_4_avx2(int ncol, double **row, double *x, double *y) -{ - int c; - __m128d hr; - __m256d x256, r256[4]; - - for (c = 0; c < 4; c++) { - r256[c] = _mm256_setzero_pd(); - } - - for (c = 0; c < ncol; c += 4) { - x256 = _mm256_loadu_pd(x+c); - r256[0] += x256 * _mm256_loadu_pd(row[0] + c); - r256[1] += x256 * _mm256_loadu_pd(row[1] + c); - r256[2] += x256 * _mm256_loadu_pd(row[2] + c); - r256[3] += x256 * _mm256_loadu_pd(row[3] + c); - } - - for (c = 0; c < 4; c++) { - hr = _mm_add_pd(_mm256_extractf128_pd(r256[c], 0), _mm256_extractf128_pd(r256[c], 1)); - hr = _mm_hadd_pd(hr, hr); - y[c] = hr[0]; - } -} - -static -void -mtxvec_kernel4xN_4(int ncol, double **row, double *x, double *y) -{ - int c; - double res[4]; - - res[0] = 0.; - res[1] = 0.; - res[2] = 0.; - res[3] = 0.; - - for (c = 0; c < ncol; c += 4) { - res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3]; - res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3]; - res[2] += row[2][c+0]*x[c+0] + row[2][c+1]*x[c+1] + row[2][c+2]*x[c+2] + row[2][c+3]*x[c+3]; - res[3] += row[3][c+0]*x[c+0] + row[3][c+1]*x[c+1] + row[3][c+2]*x[c+2] + row[3][c+3]*x[c+3]; - } - - y[0] = res[0]; - y[1] = res[1]; - y[2] = res[2]; - y[3] = res[3]; -} - -static -void -mtxvec_kernel1xN_4(int ncol, double *row, double *x, double *y) -{ - int c; - double res; - - res = 0.; - for (c = 0; c < ncol; c += 4) { - res += row[c+0]*x[c+0] + row[c+1]*x[c+1] + row[c+2]*x[c+2] + row[c+3]*x[c+3]; - } - - y[0] = res; -} - -// y = a*mx + b*y -error -math·mtxvec(math·Mtx m, double a, math·Vec x, double b, math·Vec y) -{ - int c, r, nrow, ncol; - double *row[4], res[4]; - - nrow = m.dim[0] & ~3; - ncol = m.dim[1] & ~3; - for (r = 0; r < nrow; r += 4) { - row[0] = m.d + (r * (m.dim[1]+0)); - row[1] = m.d + (r * (m.dim[1]+1)); - row[2] = m.d + (r * (m.dim[1]+2)); - row[3] = m.d + (r * (m.dim[1]+3)); - - mtxvec_kernel4xN_4_avx2(ncol, row, x.d + r, res); - - for (c = ncol; c < m.dim[1]; c++) { - res[0] += row[0][c]; - res[1] += row[1][c]; - res[2] += row[2][c]; - res[3] += row[3][c]; - } - - y.d[r+0] = res[0] + b*y.d[r+0]; - y.d[r+1] = res[1] + b*y.d[r+1]; - y.d[r+2] = res[2] + b*y.d[r+2]; - y.d[r+3] = res[3] + b*y.d[r+3]; - } - - for (; r < m.dim[0]; r++) { - mtxvec_kernel1xN_4(m.dim[0], m.d + (r * m.dim[1]), x.d + r, res); - y.d[r] = res[0] + b*y.d[r]; - } - - return 0; -} - -/************************************************ - * add matrix to vector outerproduct - ***********************************************/ - -#define NITER 50 - -#if 0 -error -main() -{ - int i; - clock_t t; - double res; - - math·Mtx m; - math·Vec x, y; - - openblas_set_num_threads(1); - - x = math·makevec(1000, mem·sys, nil); - y = math·makevec(1000, mem·sys, nil); - m = math·makemtx(1000, 1000, mem·sys, nil); - - for (i = 0; i < x.len; i++) { - y.d[i] = i; - } - - t = clock(); - for (i = 0; i < NITER; i++) { - cblas_dgemv(CblasRowMajor, CblasNoTrans, m.dim[0], m.dim[1], 1.5, m.d, m.dim[1], x.d, 1, 2.5, y.d, 1); - } - t = clock() - t; - res = math·dot(y, y); - printf("the result is %f\n", res); - printf("time elapsed (blas): %fms\n", 1000.*t/CLOCKS_PER_SEC); - - for (i = 0; i < x.len; i++) { - y.d[i] = i; - } - - t = clock(); - for (i = 0; i < NITER; i++) { - math·mtxvec(m, 1.5, x, 2.5, y); - } - t = clock() - t; - res = math·dot(y, y); - - printf("the dot product is %f\n", res); - printf("time elapsed (naive): %fms\n", 1000.*t/CLOCKS_PER_SEC); - - - return 0; -} -#endif -- cgit v1.2.1