1 files changed, 0 insertions, 471 deletions
diff --git a/sys/libmath/test.c b/sys/libmath/test.c
deleted file mode 100644
index 66700f8..0000000
--- a/sys/libmath/test.c
+++ /dev/null
@@ -1,471 +0,0 @@
-#include <u.h>
-#include <base.h>
-/* #include <vendor/blas/cblas.h> */
-
-#include <x86intrin.h>
-
-#include <time.h>
-
-// -----------------------------------------------------------------------
-// Vectors
-
-/* 
- * NOTE: I'm not sure I like stashing the header in _all_ vectors
- *       The only way to fix is to have a library based allocator...      
- */
-typedef struct math·Vec
-{
-    struct {
-        void         *h;
-        mem·Allocator heap;
-    };
-    int     len;
-    double *d;
-} math·Vec;
-
-math·Vec 
-math·makevec(int len, mem·Allocator heap, void *h)
-{
-    math·Vec v;
-    v.len  = len;
-    v.heap = heap;
-    v.h    = h;
-    v.d    = heap.alloc(h, 1, len*sizeof(double));
-    
-    // memset(v.d, 0, len*sizeof(double));
-
-    return v;
-}
-
-error
-math·freevec(math·Vec *v)
-{
-    if (v->h == nil && v->heap.alloc == nil && v->heap.free == nil) {
-        errorf("attempting to free a vector that doesn't own its data");
-        return 1;
-    }
-    v->heap.free(v->h, v->d);
-    v->d = nil;
-    v->len = 0;
-
-    return 0;
-}
-
-math·Vec
-math·copyvec(math·Vec v)
-{
-    math·Vec cpy;
-    cpy.heap = v.heap;
-    cpy.h    = v.h;
-    cpy.len  = v.len;
-    cpy.d    = cpy.heap.alloc(cpy.h, 1, v.len);
-    
-    memcpy(cpy.d, v.d, sizeof(double)*v.len);
-    return cpy;
-}
-
-/*
- * Scale vector
- */
-
-static
-void
-scale_kernel8_avx2(int n, double *x, double a)
-{
-    __m128d a128;
-    __m256d a256;
-    register int i;
-
-    a128 = _mm_load_sd(&a);
-    a256 = _mm256_broadcastsd_pd(a128);
-    for (i = 0; i < n; i += 8) {
-        _mm256_storeu_pd(x+i+0, a256 * _mm256_loadu_pd(x+i+0));
-        _mm256_storeu_pd(x+i+4, a256 * _mm256_loadu_pd(x+i+4));
-    }
-}
-
-static
-void
-scale_kernel8(int n, double *x, double a)
-{
-    register int i;
-    for (i = 0; i < n; i += 8) {
-        x[i+0] *= a;
-        x[i+1] *= a;
-        x[i+2] *= a;
-        x[i+3] *= a;
-        x[i+4] *= a;
-        x[i+5] *= a;
-        x[i+6] *= a;
-        x[i+7] *= a;
-    }
-}
-
-void
-math·scalevec(math·Vec u, double a)
-{
-    int n;
-
-    n = u.len & ~7;
-    scale_kernel8_avx2(n, u.d, a);
-
-    for (; n < u.len; n++) {
-        u.d[n] *= a;
-    }
-}
-
-/*
- * Add scaled vector
- */
-
-static
-void
-daxpy_kernel8_avx2(int n, double *x, double *y, double a)
-{
-    __m128d a128;
-    __m256d a256;
-    register int i;
-
-    a128 = _mm_load_sd(&a);
-    a256 = _mm256_broadcastsd_pd(a128);
-    for (i = 0; i < n; i += 8) {
-        _mm256_storeu_pd(x+i+0, _mm256_loadu_pd(x+i+0) + a256 * _mm256_loadu_pd(y+i+0));
-        _mm256_storeu_pd(x+i+4, _mm256_loadu_pd(x+i+4) + a256 * _mm256_loadu_pd(y+i+4));
-    }
-}
-
-static
-void
-daxpy_kernel8(int n, double *x, double *y, double a)
-{
-    register int i;
-    for (i = 0; i < n; i += 8) {
-        x[i+0] += a*y[i+0];
-        x[i+1] += a*y[i+1];
-        x[i+2] += a*y[i+2];
-        x[i+3] += a*y[i+3];
-        x[i+4] += a*y[i+4];
-        x[i+5] += a*y[i+5];
-        x[i+6] += a*y[i+6];
-        x[i+7] += a*y[i+7];
-    }
-}
-
-/* performs u = u + a*v */
-void
-math·addvec(math·Vec u, math·Vec v, double a)
-{
-    int n;
-
-    n = u.len & ~7;
-    daxpy_kernel8_avx2(n, u.d, v.d, a);
-
-    for (; n < u.len; n++) {
-        u.d[n] += a*v.d[n];
-    }
-}
-
-/*
- * Dot product
- */
-
-static
-double
-dot_kernel8_avx2(int len, double *x, double *y)
-{
-    register int i;
-    __m256d sum[4];
-    __m128d res;
-
-    for (i = 0; i < arrlen(sum); i++) {
-        sum[i] = _mm256_setzero_pd();
-    }
-
-    for (i = 0; i < len; i += 16) {
-        sum[0] += _mm256_loadu_pd(x+i+0)  * _mm256_loadu_pd(y+i+0);
-        sum[1] += _mm256_loadu_pd(x+i+4)  * _mm256_loadu_pd(y+i+4);
-        sum[2] += _mm256_loadu_pd(x+i+8)  * _mm256_loadu_pd(y+i+8);
-        sum[3] += _mm256_loadu_pd(x+i+12) * _mm256_loadu_pd(y+i+12);
-    }
-
-    sum[0] += sum[1] + sum[2] + sum[3];
-
-    res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1));
-    res = _mm_hadd_pd(res, res);
-
-    return res[0];
-}
-
-static
-double
-dot_kernel8_fma3(int len, double *x, double *y)
-{
-    register int i;
-    __m256d sum[4];
-    __m128d res;
-
-    for (i = 0; i < arrlen(sum); i++) {
-        sum[i] = _mm256_setzero_pd();
-    }
-
-    for (i = 0; i < len; i += 16) {
-        sum[0] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+0),  _mm256_loadu_pd(y+i+0), sum[0]);
-        sum[1] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+4),  _mm256_loadu_pd(y+i+4), sum[1]);
-        sum[2] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+8),  _mm256_loadu_pd(y+i+8), sum[2]);
-        sum[3] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+12), _mm256_loadu_pd(y+i+12), sum[3]);
-    }
-
-    sum[0] += sum[1] + sum[2] + sum[3];
-
-    res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1));
-    res = _mm_hadd_pd(res, res);
-
-    return res[0];
-}
-
-static
-double
-dot_kernel8(int len, double *x, double *y)
-{
-    double res;
-    register int i;
-
-    for (i = 0; i < len; i += 8) {
-        res += x[i]   * y[i]   +
-               x[i+1] * y[i+1] +
-               x[i+2] * y[i+2] +
-               x[i+3] * y[i+3] +
-               x[i+4] * y[i+4] +
-               x[i+5] * y[i+5] +
-               x[i+6] * y[i+6] +
-               x[i+7] * y[i+7];
-    }
-
-    return res;
-}
-
-double
-math·dot(math·Vec u, math·Vec v)
-{
-    int i, len;
-    double res;
-
-    len = u.len & ~15; // neat trick
-    res = dot_kernel8_fma3(len, u.d, v.d);
-
-    for (i = len; i < u.len; i++) {
-        res += u.d[i] * v.d[i];
-    }
-
-    return res;
-}
-
-// -----------------------------------------------------------------------
-// Matrix
-
-typedef struct math·Mtx
-{
-    struct {
-        void         *h;
-        mem·Allocator heap;
-    };
-    int dim[2];
-    double  *d;
-} math·Mtx;
-
-math·Mtx
-math·makemtx(int n, int m, mem·Allocator heap, void *h)
-{
-    math·Mtx a;
-    a.dim[0] = n;
-    a.dim[1] = m;
-    a.heap = heap;
-    a.h    = h;
-    a.d    = heap.alloc(h, 1, n*m*sizeof(double));
-    
-    // memset(a.d, 0, n*m*sizeof(double));
-
-    return a;
-}
-
-error
-math·freemtx(math·Vec *m)
-{
-    if (m->h == nil && m->heap.alloc == nil && m->heap.free == nil) {
-        errorf("attempting to free a matrix that doesn't own its data");
-        return 1;
-    }
-    m->heap.free(m->h, m->d);
-    m->d = nil;
-    m->len = 0;
-
-    return 0;
-}
-
-/************************************************
- * multiply matrix to vector
- ***********************************************/
-
-/* 
- * Notation: (number of rows) x (number of columns) _ unroll factor
- *            N => variable we sum over
- */
-static
-void
-mtxvec_kernel4xN_4_avx2(int ncol, double **row, double *x, double *y)
-{
-    int c;
-    __m128d hr;
-    __m256d x256, r256[4];
-
-    for (c = 0; c < 4; c++) {
-        r256[c] = _mm256_setzero_pd();
-    }
-
-    for (c = 0; c < ncol; c += 4) {
-        x256     = _mm256_loadu_pd(x+c);
-        r256[0] += x256 * _mm256_loadu_pd(row[0] + c);
-        r256[1] += x256 * _mm256_loadu_pd(row[1] + c);
-        r256[2] += x256 * _mm256_loadu_pd(row[2] + c);
-        r256[3] += x256 * _mm256_loadu_pd(row[3] + c);
-    }
-
-    for (c = 0; c < 4; c++) {
-        hr   = _mm_add_pd(_mm256_extractf128_pd(r256[c], 0), _mm256_extractf128_pd(r256[c], 1));
-        hr   = _mm_hadd_pd(hr, hr);
-        y[c] = hr[0];
-    }
-}
-
-static
-void
-mtxvec_kernel4xN_4(int ncol, double **row, double *x, double *y)
-{
-    int c;
-    double res[4];
-
-    res[0] = 0.;
-    res[1] = 0.;
-    res[2] = 0.;
-    res[3] = 0.;
-
-    for (c = 0; c < ncol; c += 4) {
-        res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3];
-        res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3];
-        res[2] += row[2][c+0]*x[c+0] + row[2][c+1]*x[c+1] + row[2][c+2]*x[c+2] + row[2][c+3]*x[c+3];
-        res[3] += row[3][c+0]*x[c+0] + row[3][c+1]*x[c+1] + row[3][c+2]*x[c+2] + row[3][c+3]*x[c+3];
-    }
-
-    y[0] = res[0];
-    y[1] = res[1];
-    y[2] = res[2];
-    y[3] = res[3];
-}
-
-static
-void
-mtxvec_kernel1xN_4(int ncol, double *row, double *x, double *y)
-{
-    int c;
-    double res;
-
-    res = 0.;
-    for (c = 0; c < ncol; c += 4) {
-        res += row[c+0]*x[c+0] + row[c+1]*x[c+1] + row[c+2]*x[c+2] + row[c+3]*x[c+3];
-    }
-
-    y[0] = res;
-}
-
-// y = a*mx + b*y
-error
-math·mtxvec(math·Mtx m, double a, math·Vec x, double b, math·Vec y)
-{
-    int c, r, nrow, ncol;
-    double *row[4], res[4];
-
-    nrow = m.dim[0] & ~3;
-    ncol = m.dim[1] & ~3;
-    for (r = 0; r < nrow; r += 4) {
-        row[0] = m.d + (r * (m.dim[1]+0));
-        row[1] = m.d + (r * (m.dim[1]+1));
-        row[2] = m.d + (r * (m.dim[1]+2));
-        row[3] = m.d + (r * (m.dim[1]+3));
-
-        mtxvec_kernel4xN_4_avx2(ncol, row, x.d + r, res);
-
-        for (c = ncol; c < m.dim[1]; c++) {
-            res[0] += row[0][c];
-            res[1] += row[1][c];
-            res[2] += row[2][c];
-            res[3] += row[3][c];
-        }
-
-        y.d[r+0] = res[0] + b*y.d[r+0];
-        y.d[r+1] = res[1] + b*y.d[r+1];
-        y.d[r+2] = res[2] + b*y.d[r+2];
-        y.d[r+3] = res[3] + b*y.d[r+3];
-    }
-
-    for (; r < m.dim[0]; r++) {
-        mtxvec_kernel1xN_4(m.dim[0], m.d + (r * m.dim[1]), x.d + r, res);
-        y.d[r] = res[0] + b*y.d[r];
-    }
-
-    return 0;
-}
-
-/************************************************
- * add matrix to vector outerproduct
- ***********************************************/
-
-#define NITER 50
-
-#if 0
-error
-main()
-{
-    int i;
-    clock_t t;
-    double  res;
-
-    math·Mtx m;
-    math·Vec x, y;
-
-    openblas_set_num_threads(1);
-
-    x = math·makevec(1000, mem·sys, nil);
-    y = math·makevec(1000, mem·sys, nil);
-    m = math·makemtx(1000, 1000, mem·sys, nil);
-
-    for (i = 0; i < x.len; i++) {
-        y.d[i] = i;
-    }
-
-    t = clock();
-    for (i = 0; i < NITER; i++) {
-        cblas_dgemv(CblasRowMajor, CblasNoTrans, m.dim[0], m.dim[1], 1.5, m.d, m.dim[1], x.d, 1, 2.5, y.d, 1);
-    }
-    t   = clock() - t;
-    res = math·dot(y, y);
-    printf("the result is %f\n", res);
-    printf("time elapsed (blas): %fms\n", 1000.*t/CLOCKS_PER_SEC);
-
-    for (i = 0; i < x.len; i++) {
-        y.d[i] = i;
-    }
-
-    t = clock();
-    for (i = 0; i < NITER; i++) {
-        math·mtxvec(m, 1.5, x, 2.5, y);
-    }
-    t   = clock() - t;
-    res = math·dot(y, y);
-
-    printf("the dot product is %f\n", res);
-    printf("time elapsed (naive): %fms\n", 1000.*t/CLOCKS_PER_SEC);
-
-
-    return 0;
-}
-#endif