#include #include #include #include #include // ----------------------------------------------------------------------- // Level One /* * Scale vector * x = ax */ static void scale_kernel8_avx2(int n, double *x, double a) { __m128d a128; __m256d a256; register int i; a128 = _mm_load_sd(&a); a256 = _mm256_broadcastsd_pd(a128); for (i = 0; i < n; i += 8) { _mm256_storeu_pd(x+i+0, a256 * _mm256_loadu_pd(x+i+0)); _mm256_storeu_pd(x+i+4, a256 * _mm256_loadu_pd(x+i+4)); } } static void scale_kernel8(int n, double *x, double a) { register int i; for (i = 0; i < n; i += 8) { x[i+0] *= a; x[i+1] *= a; x[i+2] *= a; x[i+3] *= a; x[i+4] *= a; x[i+5] *= a; x[i+6] *= a; x[i+7] *= a; } } void blas·scalevec(int len, double *x, double a) { int n; n = len & ~7; scale_kernel8_avx2(n, x, a); for (; n < len; n++) { x[n] *= a; } } /************************************************ * Daxpy * y = ax + y ***********************************************/ static void daxpy_kernel8_avx2(int n, double a, double *x, double *y) { __m128d a128; __m256d a256; register int i; a128 = _mm_load_sd(&a); a256 = _mm256_broadcastsd_pd(a128); for (i = 0; i < n; i += 8) { _mm256_storeu_pd(y+i+0, _mm256_loadu_pd(y+i+0) + a256 * _mm256_loadu_pd(x+i+0)); _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(y+i+4) + a256 * _mm256_loadu_pd(x+i+4)); } } static void daxpy_kernel8(int n, double a, double *x, double *y) { register int i; for (i = 0; i < n; i += 8) { y[i+0] += a*x[i+0]; y[i+1] += a*x[i+1]; y[i+2] += a*x[i+2]; y[i+3] += a*x[i+3]; y[i+4] += a*x[i+4]; y[i+5] += a*x[i+5]; y[i+6] += a*x[i+6]; y[i+7] += a*x[i+7]; } } void blas·daxpy(int len, double a, double *x, double *y) { int n; n = len & ~7; daxpy_kernel8_avx2(n, a, x, y); for (; n < len; n++) { y[n] += a*x[n]; } } /************************************************ * Dot product * x·y ***********************************************/ static double dot_kernel8_avx2(int len, double *x, double *y) { register int i; __m256d sum[4]; __m128d res; for (i = 0; i < arrlen(sum); i++) { sum[i] = _mm256_setzero_pd(); } for (i = 0; i < len; i += 16) { sum[0] += _mm256_loadu_pd(x+i+0) * _mm256_loadu_pd(y+i+0); sum[1] += _mm256_loadu_pd(x+i+4) * _mm256_loadu_pd(y+i+4); sum[2] += _mm256_loadu_pd(x+i+8) * _mm256_loadu_pd(y+i+8); sum[3] += _mm256_loadu_pd(x+i+12) * _mm256_loadu_pd(y+i+12); } sum[0] += sum[1] + sum[2] + sum[3]; res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); res = _mm_hadd_pd(res, res); return res[0]; } static double dot_kernel8_fma3(int len, double *x, double *y) { register int i; __m256d sum[4]; __m128d res; for (i = 0; i < arrlen(sum); i++) { sum[i] = _mm256_setzero_pd(); } for (i = 0; i < len; i += 16) { sum[0] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+0), _mm256_loadu_pd(y+i+0), sum[0]); sum[1] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+4), _mm256_loadu_pd(y+i+4), sum[1]); sum[2] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+8), _mm256_loadu_pd(y+i+8), sum[2]); sum[3] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+12), _mm256_loadu_pd(y+i+12), sum[3]); } sum[0] += sum[1] + sum[2] + sum[3]; res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); res = _mm_hadd_pd(res, res); return res[0]; } static double dot_kernel8(int len, double *x, double *y) { double res; register int i; for (i = 0; i < len; i += 8) { res += x[i] * y[i] + x[i+1] * y[i+1] + x[i+2] * y[i+2] + x[i+3] * y[i+3] + x[i+4] * y[i+4] + x[i+5] * y[i+5] + x[i+6] * y[i+6] + x[i+7] * y[i+7]; } return res; } double blas·dot(int len, double *x, double *y) { int i, n; double res; n = len & ~15; // neat trick res = dot_kernel8_fma3(n, x, y); for (i = n; i < len; i++) { res += x[i] * y[i]; } return res; } // ----------------------------------------------------------------------- // Level Two /* * Notation: (number of rows) x (number of columns) _ unroll factor * N => variable we sum over */ /* * Affine transformation * y = aMx + by */ static void gemv_kernel4xN_4_avx2(int ncol, double **row, double *x, double *y) { int c; __m128d hr; __m256d x256, r256[4]; for (c = 0; c < 4; c++) { r256[c] = _mm256_setzero_pd(); } for (c = 0; c < ncol; c += 4) { x256 = _mm256_loadu_pd(x+c); r256[0] += x256 * _mm256_loadu_pd(row[0] + c); r256[1] += x256 * _mm256_loadu_pd(row[1] + c); r256[2] += x256 * _mm256_loadu_pd(row[2] + c); r256[3] += x256 * _mm256_loadu_pd(row[3] + c); } for (c = 0; c < 4; c++) { hr = _mm_add_pd(_mm256_extractf128_pd(r256[c], 0), _mm256_extractf128_pd(r256[c], 1)); hr = _mm_hadd_pd(hr, hr); y[c] = hr[0]; } } static void gemv_kernel4xN_4(int ncol, double **row, double *x, double *y) { int c; double res[4]; res[0] = 0.; res[1] = 0.; res[2] = 0.; res[3] = 0.; for (c = 0; c < ncol; c += 4) { res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3]; res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3]; res[2] += row[2][c+0]*x[c+0] + row[2][c+1]*x[c+1] + row[2][c+2]*x[c+2] + row[2][c+3]*x[c+3]; res[3] += row[3][c+0]*x[c+0] + row[3][c+1]*x[c+1] + row[3][c+2]*x[c+2] + row[3][c+3]*x[c+3]; } y[0] = res[0]; y[1] = res[1]; y[2] = res[2]; y[3] = res[3]; } static void gemv_kernel1xN_4(int ncol, double *row, double *x, double *y) { int c; double res; res = 0.; for (c = 0; c < ncol; c += 4) { res += row[c+0]*x[c+0] + row[c+1]*x[c+1] + row[c+2]*x[c+2] + row[c+3]*x[c+3]; } y[0] = res; } error blas·gemv(int nrow, int ncol, double a, double *m, double *x, double b, double *y) { int c, r, nr, nc; double *row[4], res[4]; nr = nrow & ~3; nc = ncol & ~3; for (r = 0; r < nrow; r += 4) { row[0] = m + (r * (ncol+0)); row[1] = m + (r * (ncol+1)); row[2] = m + (r * (ncol+2)); row[3] = m + (r * (ncol+3)); gemv_kernel4xN_4_avx2(ncol, row, x + r, res); for (c = nc; c < ncol; c++) { res[0] += row[0][c]; res[1] += row[1][c]; res[2] += row[2][c]; res[3] += row[3][c]; } y[r+0] = res[0] + b*y[r+0]; y[r+1] = res[1] + b*y[r+1]; y[r+2] = res[2] + b*y[r+2]; y[r+3] = res[3] + b*y[r+3]; } for (; r < nrow; r++) { gemv_kernel1xN_4(nrow, m + (r * ncol), x + r, res); y[r] = res[0] + b*y[r]; } return 0; } /* * rank one addition * M = ax(y^T) + M */ void blas·ger(int nrow, int ncol, double a, double *x, double *y, double *m) { int i, j; for (i = 0; i < nrow; i++) { for (j = 0; j < ncol; j++) { m[i+ncol*j] += a * x[i] * y[j]; } } } /* * symmetric rank one addition * M = ax(x^T) + M */ void blas·syr(int nrow, int ncol, double a, double *x, double *m) { int i, j; for (i = 0; i < nrow; i++) { for (j = 0; j < ncol; j++) { m[i+ncol*j] += a * x[i] * x[j]; } } } #define NITER 500 #define NCOL 1007 #define NROW 1007 error main() { int i, n; clock_t t; double *x, *y, *m; double res[2], tprof[2]; openblas_set_num_threads(1); x = malloc(sizeof(*x)*NCOL); y = malloc(sizeof(*x)*NCOL); m = malloc(sizeof(*x)*NCOL*NROW); tprof[0] = 0; tprof[1] = 0; for (n = 0; n < NITER; n++) { for (i = 0; i < NCOL; i++) { x[i] = i*i+1; y[i] = i+1; } for (i = 0; i < NCOL*NROW; i++) { m[i] = .1; } t = clock(); cblas_dger(CblasRowMajor, NROW, NCOL, 1.2, x, 1, y, 1, m, NCOL); t = clock() - t; tprof[1] += 1000.*t/CLOCKS_PER_SEC; res[1] = cblas_ddot(NROW*NCOL, m, 1, m, 1); for (i = 0; i < NCOL; i++) { x[i] = i*i+1; y[i] = i+1; } for (i = 0; i < NCOL*NROW; i++) { m[i] = .1; } t = clock(); blas·ger(NROW, NCOL, 1.2, x, y, m); t = clock() - t; tprof[0] += 1000.*t/CLOCKS_PER_SEC; res[0] = blas·dot(NROW*NCOL, m, m); } printf("avg time elapsed/iteration (naive): %fms\n", tprof[0]/NITER); printf("--> result (naive): %f\n", res[0]); printf("avg time elapsed/iteration (oblas): %fms\n", tprof[1]/NITER); printf("--> result (oblas): %f\n", res[1]); return 0; }