#include #include #include #include #include #include // ----------------------------------------------------------------------- // level one /* * rotate vector * x = cos*x + sin*y * y = cos*x - sin*y */ static void rot_kernel8_avx2(int n, double *x, double *y, double cos, double sin) { register int i; __m256d x256, y256; __m128d cos128, sin128; __m256d cos256, sin256; cos128 = _mm_load_sd(&cos); cos256 = _mm256_broadcastsd_pd(cos128); sin128 = _mm_load_sd(&sin); sin256 = _mm256_broadcastsd_pd(sin128); for (i = 0; i < n; i+=8) { x256 = _mm256_loadu_pd(x+i+0); y256 = _mm256_loadu_pd(y+i+0); _mm256_storeu_pd(x+i+0, cos256 * x256 + sin256 * y256); _mm256_storeu_pd(y+i+0, cos256 * y256 - sin256 * x256); x256 = _mm256_loadu_pd(x+i+4); y256 = _mm256_loadu_pd(y+i+4); _mm256_storeu_pd(x+i+4, cos256 * x256 + sin256 * y256); _mm256_storeu_pd(y+i+4, cos256 * y256 - sin256 * x256); } } static void rot_kernel8(int n, double *x, double *y, double cos, double sin) { register int i; register double tmp; for (i = 0; i < n; i+=8) { tmp = x[i+0], x[i+0] = cos*x[i+0] + sin*y[i+0], y[i+0] = cos*y[i+0] - sin*tmp; tmp = x[i+1], x[i+1] = cos*x[i+1] + sin*y[i+1], y[i+1] = cos*y[i+1] - sin*tmp; tmp = x[i+2], x[i+2] = cos*x[i+2] + sin*y[i+2], y[i+2] = cos*y[i+2] - sin*tmp; tmp = x[i+3], x[i+3] = cos*x[i+3] + sin*y[i+3], y[i+3] = cos*y[i+3] - sin*tmp; tmp = x[i+4], x[i+4] = cos*x[i+4] + sin*y[i+4], y[i+4] = cos*y[i+4] - sin*tmp; tmp = x[i+5], x[i+5] = cos*x[i+5] + sin*y[i+5], y[i+5] = cos*y[i+5] - sin*tmp; tmp = x[i+6], x[i+6] = cos*x[i+6] + sin*y[i+6], y[i+6] = cos*y[i+6] - sin*tmp; tmp = x[i+7], x[i+7] = cos*x[i+7] + sin*y[i+7], y[i+7] = cos*y[i+7] - sin*tmp; } } void blas·rot(int len, double *x, double *y, double cos, double sin) { register int n; register double tmp; n = len & ~7; rot_kernel8_avx2(n, x, y, cos, sin); for (; n < len; n++) { tmp = x[n], x[n] = cos*x[n] + sin*y[n], y[n] = cos*y[n]- sin*tmp; } } /* * compute givens rotate vector * -- -- * |+cos -sin| | a | = | r | * |-sin +cos| | b | = | 0 | * -- -- */ void blas·rotg(double *a, double *b, double *cos, double *sin) { double abs_a, abs_b, r, rho, scale, z; abs_a = math·abs(*a); abs_b = math·abs(*b); rho = abs_a > abs_b ? *a : *b; scale = abs_a + abs_b; if (scale == 0) { *cos = 1, *sin = 0; r = 0.; z = 0.; } else { r = math·sgn(rho) * scale * math·sqrt(math·pow(abs_a/scale, 2) + math·pow(abs_b/scale, 2)); *cos = *a / r; *sin = *b / r; if (abs_a > abs_b) z = *sin; else if (abs_b >= abs_a && *cos != 0) z = 1/(*cos); else z = 1.; } *a = r; *b = z; } /* * modified Givens rotation of points in plane * operates on len points * * params = [flag, h11, h12, h21, h22] * NOTE: This is row major as opposed to other implementations * * Flags correspond to: * @flag = -1: * H -> [ [h11, h12], [h21, h22] ] * @flag = 0.0: * H -> [ [1, h12], [h21, 1] ] * @flag = +1: * H -> [ [h11, 1], [-1, h22] ] * @flag = -2: * H -> [ [1, 0], [0, 1] ] * @flag = * * return error * * Replaces: * x -> H11 * x + H12 * y * y -> H21 * x + H22 * y */ static void rotm_kernel8_avx2(int n, double *x, double *y, double H[4]) { register int i; __m256d x256, y256; __m256d H256[4]; for (i = 0; i < 4; i++) { H256[i] = _mm256_broadcastsd_pd(_mm_load_sd(H+i)); } for (i = 0; i < n; i+=8) { x256 = _mm256_loadu_pd(x+i+0); y256 = _mm256_loadu_pd(y+i+0); _mm256_storeu_pd(x+i+0, H256[0] * x256 + H256[1] * y256); _mm256_storeu_pd(y+i+0, H256[2] * y256 - H256[3] * x256); x256 = _mm256_loadu_pd(x+i+4); y256 = _mm256_loadu_pd(y+i+4); _mm256_storeu_pd(x+i+4, H256[0] * x256 + H256[1] * y256); _mm256_storeu_pd(y+i+4, H256[2] * y256 - H256[3] * x256); } } static void rotm_kernel8(int n, double *x, double *y, double H[4]) { register int i; register double tmp; for (i = 0; i < n; i+=8) { tmp = x[i+0], x[i+0] = H[0]*x[i+0] + H[1]*y[i+0], y[i+0] = H[2]*y[i+0] + H[3]*tmp; tmp = x[i+1], x[i+1] = H[0]*x[i+1] + H[1]*y[i+1], y[i+1] = H[2]*y[i+1] + H[3]*tmp; tmp = x[i+2], x[i+2] = H[0]*x[i+2] + H[1]*y[i+2], y[i+2] = H[2]*y[i+2] + H[3]*tmp; tmp = x[i+3], x[i+3] = H[0]*x[i+3] + H[1]*y[i+3], y[i+3] = H[2]*y[i+3] + H[3]*tmp; tmp = x[i+4], x[i+4] = H[0]*x[i+4] + H[1]*y[i+4], y[i+4] = H[2]*y[i+4] + H[3]*tmp; tmp = x[i+5], x[i+5] = H[0]*x[i+5] + H[1]*y[i+5], y[i+5] = H[2]*y[i+5] + H[3]*tmp; tmp = x[i+6], x[i+6] = H[0]*x[i+6] + H[1]*y[i+6], y[i+6] = H[2]*y[i+6] + H[3]*tmp; tmp = x[i+7], x[i+7] = H[0]*x[i+7] + H[1]*y[i+7], y[i+7] = H[2]*y[i+7] + H[3]*tmp; } } error blas·rotm(int len, double *x, double *y, double p[5]) { int n, flag; double tmp, H[4]; flag = math·round(p[0]); switch (flag) { case -1: H[0] = p[1], H[1] = p[2], H[2] = p[3], H[3] = p[4]; break; case 0: H[0] = +1, H[1] = p[2], H[2] = p[3], H[3] = +1; break; case +1: H[0] = p[1], H[1] = +1, H[2] = -1, H[3] = p[4]; break; case -2: H[0] = +1, H[1] = 0, H[2] = 0, H[3] = +1; break; default: errorf("rotm: flag '%d' unrecognized", flag); return 1; } n = len & ~7; rotm_kernel8_avx2(n, x, y, H); for (; n < len; n++) { tmp = x[n], x[n] = H[0]*x[n] + H[1]*y[n], y[n] = H[2]*y[n] + H[3]*tmp; } return 0; } /* * scale vector * x = ax */ static void scale_kernel8_avx2(int n, double *x, double a) { __m128d a128; __m256d a256; register int i; a128 = _mm_load_sd(&a); a256 = _mm256_broadcastsd_pd(a128); for (i = 0; i < n; i += 8) { _mm256_storeu_pd(x+i+0, a256 * _mm256_loadu_pd(x+i+0)); _mm256_storeu_pd(x+i+4, a256 * _mm256_loadu_pd(x+i+4)); } } static void scale_kernel8(int n, double *x, double a) { register int i; for (i = 0; i < n; i += 8) { x[i+0] *= a; x[i+1] *= a; x[i+2] *= a; x[i+3] *= a; x[i+4] *= a; x[i+5] *= a; x[i+6] *= a; x[i+7] *= a; } } void blas·scale(int len, double a, double *x) { int n; n = len & ~7; scale_kernel8_avx2(n, x, a); for (; n < len; n++) { x[n] *= a; } } /* * copy * y = x */ void blas·copy(int len, double *x, double *y) { memcpy(y, x, sizeof(*x) * len); } /* * swap * y <=> x */ static void swap_kernel8_avx2(int n, double *x, double *y) { register int i; __m256d tmp[2]; for (i = 0; i < n; i += 8) { tmp[0] = _mm256_loadu_pd(x+i+0); tmp[1] = _mm256_loadu_pd(y+i+0); _mm256_storeu_pd(x+i+0, tmp[1]); _mm256_storeu_pd(y+i+0, tmp[0]); tmp[0] = _mm256_loadu_pd(x+i+4); tmp[1] = _mm256_loadu_pd(y+i+4); _mm256_storeu_pd(x+i+4, tmp[1]); _mm256_storeu_pd(y+i+4, tmp[0]); } } static void swap_kernel8(int n, double *x, double *y) { register int i; register double tmp; for (i = 0; i < n; i += 8) { tmp = x[i+0], x[i+0] = y[i+0], y[i+0] = tmp; tmp = x[i+1], x[i+1] = y[i+1], y[i+1] = tmp; tmp = x[i+2], x[i+2] = y[i+2], y[i+2] = tmp; tmp = x[i+3], x[i+3] = y[i+3], y[i+3] = tmp; tmp = x[i+4], x[i+4] = y[i+4], y[i+4] = tmp; tmp = x[i+5], x[i+5] = y[i+5], y[i+5] = tmp; tmp = x[i+6], x[i+6] = y[i+6], y[i+6] = tmp; tmp = x[i+7], x[i+7] = y[i+7], y[i+7] = tmp; } } void blas·swap(int len, double *x, double *y) { int n; double tmp; n = len & ~7; swap_kernel8(n, x, y); for (; n < len; n++) { tmp = x[n], x[n] = y[n], y[n] = tmp; } } /* * daxpy * y = ax + y */ static void axpy_kernel8_avx2(int n, double a, double *x, double *y) { __m128d a128; __m256d a256; register int i; a128 = _mm_load_sd(&a); a256 = _mm256_broadcastsd_pd(a128); for (i = 0; i < n; i += 8) { _mm256_storeu_pd(y+i+0, _mm256_loadu_pd(y+i+0) + a256 * _mm256_loadu_pd(x+i+0)); _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(y+i+4) + a256 * _mm256_loadu_pd(x+i+4)); } } static void axpy_kernel8(int n, double a, double *x, double *y) { register int i; for (i = 0; i < n; i += 8) { y[i+0] += a*x[i+0]; y[i+1] += a*x[i+1]; y[i+2] += a*x[i+2]; y[i+3] += a*x[i+3]; y[i+4] += a*x[i+4]; y[i+5] += a*x[i+5]; y[i+6] += a*x[i+6]; y[i+7] += a*x[i+7]; } } void blas·axpy(int len, double a, double *x, double *y) { int n; n = len & ~7; axpy_kernel8_avx2(n, a, x, y); for (; n < len; n++) { y[n] += a*x[n]; } } /* * dot product * x·y */ static double dot_kernel8_avx2(int len, double *x, double *y) { register int i; __m256d sum[4]; __m128d res; for (i = 0; i < arrlen(sum); i++) { sum[i] = _mm256_setzero_pd(); } for (i = 0; i < len; i += 16) { sum[0] += _mm256_loadu_pd(x+i+0) * _mm256_loadu_pd(y+i+0); sum[1] += _mm256_loadu_pd(x+i+4) * _mm256_loadu_pd(y+i+4); sum[2] += _mm256_loadu_pd(x+i+8) * _mm256_loadu_pd(y+i+8); sum[3] += _mm256_loadu_pd(x+i+12) * _mm256_loadu_pd(y+i+12); } sum[0] += sum[1] + sum[2] + sum[3]; res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); res = _mm_hadd_pd(res, res); return res[0]; } static double dot_kernel8_fma3(int len, double *x, double *y) { register int i; __m256d sum[4]; __m128d res; for (i = 0; i < arrlen(sum); i++) { sum[i] = _mm256_setzero_pd(); } for (i = 0; i < len; i += 16) { sum[0] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+0), _mm256_loadu_pd(y+i+0), sum[0]); sum[1] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+4), _mm256_loadu_pd(y+i+4), sum[1]); sum[2] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+8), _mm256_loadu_pd(y+i+8), sum[2]); sum[3] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+12), _mm256_loadu_pd(y+i+12), sum[3]); } sum[0] += sum[1] + sum[2] + sum[3]; res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); res = _mm_hadd_pd(res, res); return res[0]; } static double dot_kernel8(int len, double *x, double *y) { double res; register int i; for (i = 0; i < len; i += 8) { res += x[i] * y[i] + x[i+1] * y[i+1] + x[i+2] * y[i+2] + x[i+3] * y[i+3] + x[i+4] * y[i+4] + x[i+5] * y[i+5] + x[i+6] * y[i+6] + x[i+7] * y[i+7]; } return res; } double blas·dot(int len, double *x, int incx, double *y, int incy) { int i, n, ix, iy; double res, mul[4], sum[2]; if (len == 0) return 0; if (incx == 1 && incy == 1) { n = len & ~15; // neat trick res = dot_kernel8_fma3(n, x, y); for (i = n; i < len; i++) { res += x[i] * y[i]; } return res; } n = len & ~3; for (i = 0, ix = 0, iy = 0; i < n; i += 4, ix += 4*incx, iy += 4*incy) { mul[0] = x[ix+0*incx] * y[iy+0*incy]; mul[1] = x[ix+1*incx] * y[iy+1*incy]; mul[2] = x[ix+2*incx] * y[iy+2*incy]; mul[3] = x[ix+3*incx] * y[iy+3*incy]; sum[0] += mul[0] + mul[2]; sum[1] += mul[1] + mul[3]; } for (; i < len; i++, ix += incx, iy += incy) { sum[0] += x[ix] * y[iy]; } res = sum[0] + sum[1]; return res; } /* * euclidean norm * ||x|| */ double blas·norm(int len, double *x) { double res; res = blas·dot(len, x, 1, x, 1); res = math·sqrt(res); return res; } static double sum_kernel8_avx2(int len, double *x) { register int i; __m256d sum[2]; __m128d res; for (i = 0; i < arrlen(sum); i++) { sum[i] = _mm256_setzero_pd(); } for (i = 0; i < len; i += 8) { sum[0] += _mm256_loadu_pd(x+i+0); sum[1] += _mm256_loadu_pd(x+i+4); } sum[0] += sum[1]; res = _mm_add_pd(_mm256_extractf128_pd(sum[0], 0), _mm256_extractf128_pd(sum[0], 1)); res = _mm_hadd_pd(res, res); return res[0]; } static double sum_kernel8(int len, double *x, double *y) { double res; register int i; for (i = 0; i < len; i += 8) { res += x[i] + x[i+1] + x[i+2] + x[i+3] + x[i+4] + x[i+5] + x[i+6] + x[i+7]; } return res; } /* * L1 norm * sum(x_i) */ double blas·sum(int len, double *x) { int i, n; double res; if (len == 0) return 0; n = len & ~7; res = sum_kernel8_avx2(n, x); for (i = n; i < len; i++) { res += x[i]; } return res; } /* * argmax * i = argmax(x) */ int argmax_kernel8_avx2(int n, double *x) { register int i, msk, maxidx[4]; __m256d val, cmp, max; maxidx[0] = 0, maxidx[1] = 0, maxidx[2] = 0, maxidx[3] = 0; max = _mm256_loadu_pd(x); for (i = 0; i < n; i += 4) { val = _mm256_loadu_pd(x+i); cmp = _mm256_cmp_pd(val, max, _CMP_GT_OQ); msk = _mm256_movemask_pd(cmp); switch (msk) { case 1: max = _mm256_blend_pd(max, val, 1); maxidx[0] = i; break; case 2: max = _mm256_blend_pd(max, val, 2); maxidx[1] = i; break; case 3: max = _mm256_blend_pd(max, val, 3); maxidx[0] = i; maxidx[1] = i; break; case 4: max = _mm256_blend_pd(max, val, 4); maxidx[2] = i; break; case 5: max = _mm256_blend_pd(max, val, 5); maxidx[2] = i; maxidx[0] = i; break; case 6: max = _mm256_blend_pd(max, val, 6); maxidx[2] = i; maxidx[1] = i; break; case 7: max = _mm256_blend_pd(max, val, 7); maxidx[2] = i; maxidx[1] = i; maxidx[0] = i; break; case 8: max = _mm256_blend_pd(max, val, 8); maxidx[3] = i; break; case 9: max = _mm256_blend_pd(max, val, 9); maxidx[3] = i; maxidx[0] = i; break; case 10: max = _mm256_blend_pd(max, val, 10); maxidx[3] = i; maxidx[1] = i; break; case 11: max = _mm256_blend_pd(max, val, 11); maxidx[3] = i; maxidx[1] = i; maxidx[0] = i; break; case 12: max = _mm256_blend_pd(max, val, 12); maxidx[3] = i; maxidx[2] = i; break; case 13: max = _mm256_blend_pd(max, val, 13); maxidx[3] = i; maxidx[2] = i; maxidx[0] = i; break; case 14: max = _mm256_blend_pd(max, val, 14); maxidx[3] = i; maxidx[2] = i; maxidx[1] = i; break; case 15: max = _mm256_blend_pd(max, val, 15); maxidx[3] = i; maxidx[2] = i; maxidx[1] = i; maxidx[0] = i; break; case 0: default: ; } } maxidx[0] = (x[maxidx[0]+0] > x[maxidx[1]+1]) ? maxidx[0]+0 : maxidx[1]+1; maxidx[1] = (x[maxidx[2]+2] > x[maxidx[3]+3]) ? maxidx[2]+2 : maxidx[3]+3; maxidx[0] = (x[maxidx[0]] > x[maxidx[1]]) ? maxidx[0] : maxidx[1]; return maxidx[0]; } int argmax_kernel8(int n, double *x) { #define SET(d) idx[d] = d, max[d] = x[d] #define PUT(d) if (x[i+d] > max[d]) idx[d] = i+d, max[d] = x[i+d] int i, idx[8]; double max[8]; SET(0); SET(1); SET(2); SET(3); SET(4); SET(5); SET(6); SET(7); for (i = 0; i < n; i += 8) { PUT(0); PUT(1); PUT(2); PUT(3); PUT(4); PUT(5); PUT(6); PUT(7); } n = 0; for (i = 1; i < 8; i++ ) { if (max[i] > max[n]) { n = i; } } return idx[n]; #undef PUT #undef SET } int blas·argmax(int len, double *x) { int i, n; double max; i = len & ~7; n = argmax_kernel8_avx2(i, x); max = x[n]; for (; i < len; i++) { if (x[i] > max) { n = i; max = x[i]; } } return n; } int argmin_kernel8_avx2(int n, double *x) { register int i, msk, minidx[4]; __m256d val, cmp, min; minidx[0] = 0, minidx[1] = 0, minidx[2] = 0, minidx[3] = 0; min = _mm256_loadu_pd(x); for (i = 0; i < n; i += 4) { val = _mm256_loadu_pd(x+i); cmp = _mm256_cmp_pd(val, min, _CMP_LT_OS); msk = _mm256_movemask_pd(cmp); switch (msk) { case 1: min = _mm256_blend_pd(min, val, 1); minidx[0] = i; break; case 2: min = _mm256_blend_pd(min, val, 2); minidx[1] = i; break; case 3: min = _mm256_blend_pd(min, val, 3); minidx[0] = i; minidx[1] = i; break; case 4: min = _mm256_blend_pd(min, val, 4); minidx[2] = i; break; case 5: min = _mm256_blend_pd(min, val, 5); minidx[2] = i; minidx[0] = i; break; case 6: min = _mm256_blend_pd(min, val, 6); minidx[2] = i; minidx[1] = i; break; case 7: min = _mm256_blend_pd(min, val, 7); minidx[2] = i; minidx[1] = i; minidx[0] = i; break; case 8: min = _mm256_blend_pd(min, val, 8); minidx[3] = i; break; case 9: min = _mm256_blend_pd(min, val, 9); minidx[3] = i; minidx[0] = i; break; case 10: min = _mm256_blend_pd(min, val, 10); minidx[3] = i; minidx[1] = i; break; case 11: min = _mm256_blend_pd(min, val, 11); minidx[3] = i; minidx[1] = i; minidx[0] = i; break; case 12: min = _mm256_blend_pd(min, val, 12); minidx[3] = i; minidx[2] = i; break; case 13: min = _mm256_blend_pd(min, val, 13); minidx[3] = i; minidx[2] = i; minidx[0] = i; break; case 14: min = _mm256_blend_pd(min, val, 14); minidx[3] = i; minidx[2] = i; minidx[1] = i; break; case 15: min = _mm256_blend_pd(min, val, 15); minidx[3] = i; minidx[2] = i; minidx[1] = i; minidx[0] = i; break; case 0: default: ; } } minidx[0] = (x[minidx[0]+0] < x[minidx[1]+1]) ? minidx[0]+0 : minidx[1]+1; minidx[1] = (x[minidx[2]+2] < x[minidx[3]+3]) ? minidx[2]+2 : minidx[3]+3; minidx[0] = (x[minidx[0]] < x[minidx[1]]) ? minidx[0] : minidx[1]; return minidx[0]; } int argmin_kernel8(int n, double *x) { #define SET(d) idx[d] = d, min[d] = x[d] #define PUT(d) if (x[i+d] < min[d]) idx[d] = i+d, min[d] = x[i+d] int i, idx[8]; double min[8]; SET(0); SET(1); SET(2); SET(3); SET(4); SET(5); SET(6); SET(7); for (i = 0; i < n; i += 8) { PUT(0); PUT(1); PUT(2); PUT(3); PUT(4); PUT(5); PUT(6); PUT(7); } n = 0; for (i = 1; i < 8; i++) { if (min[i] < min[n]) { n = i; } } return idx[n]; #undef PUT #undef SET } int blas·argmin(int len, double *x) { int i, n; double min; i = len & ~7; n = argmin_kernel8_avx2(i, x); min = x[n]; for (; i < len; i++) { if (x[i] < min) { n = i; min = x[i]; } } return n; } // ----------------------------------------------------------------------- // level two /* * Notation: (number of rows) x (number of columns) _ unroll factor * N => variable we sum over */ // NOTE: All triangular matrix methods are assumed packed and upper for now! /* * triangular shaped transformation * x = Mx * @M: square triangular * TODO(PERF): Diagnose speed issues * TODO: Finish all other flag cases! */ void blas·tpmv(blas·Flag f, int n, double *m, double *x) { int i; for (i = 0; i < n; m += (n-i), ++x, ++i) { *x = blas·dot(n-i, m, 1, x, 1); } } /* * solve triangular set of equations * x = M^{-1}b * @M: square triangular * TODO(PERF): Diagnose speed issues * TODO: Finish all other flag cases! */ void blas·tpsv(blas·Flag f, int n, double *m, double *x) { int i; double r; x += (n - 1); m += ((n * (n+1))/2 - 1); for (i = n-1; i >= 0; --i, --x, m -= (n-i)) { r = blas·dot(n-i-1, m+1, 1, x+1, 1); *x = (*x - r) / *m; } } /* * general affine transformation * y = aMx + by */ static void gemv_kernel4xN_4_avx2(int ncol, double **row, double *x, double *y) { int c; __m128d hr; __m256d x256, r256[4]; for (c = 0; c < 4; c++) { r256[c] = _mm256_setzero_pd(); } for (c = 0; c < ncol; c += 4) { x256 = _mm256_loadu_pd(x+c); r256[0] += x256 * _mm256_loadu_pd(row[0] + c); r256[1] += x256 * _mm256_loadu_pd(row[1] + c); r256[2] += x256 * _mm256_loadu_pd(row[2] + c); r256[3] += x256 * _mm256_loadu_pd(row[3] + c); } for (c = 0; c < 4; c++) { hr = _mm_add_pd(_mm256_extractf128_pd(r256[c], 0), _mm256_extractf128_pd(r256[c], 1)); hr = _mm_hadd_pd(hr, hr); y[c] = hr[0]; } } static void gemv_kernel4xN_4(int ncol, double **row, double *x, double *y) { int c; double res[4]; res[0] = 0.; res[1] = 0.; res[2] = 0.; res[3] = 0.; for (c = 0; c < ncol; c += 4) { res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3]; res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3]; res[2] += row[2][c+0]*x[c+0] + row[2][c+1]*x[c+1] + row[2][c+2]*x[c+2] + row[2][c+3]*x[c+3]; res[3] += row[3][c+0]*x[c+0] + row[3][c+1]*x[c+1] + row[3][c+2]*x[c+2] + row[3][c+3]*x[c+3]; } y[0] = res[0]; y[1] = res[1]; y[2] = res[2]; y[3] = res[3]; } static void gemv_kernel1xN_4(int ncol, double *row, double *x, double *y) { int c; double res; res = 0.; for (c = 0; c < ncol; c += 4) { res += row[c+0]*x[c+0] + row[c+1]*x[c+1] + row[c+2]*x[c+2] + row[c+3]*x[c+3]; } y[0] = res; } error blas·gemv(int nrow, int ncol, double a, double *m, double *x, double b, double *y) { int c, r, nr, nc; double *row[4], res[4]; nr = nrow & ~3; nc = ncol & ~3; for (r = 0; r < nrow; r += 4) { row[0] = m + (r * (ncol+0)); row[1] = m + (r * (ncol+1)); row[2] = m + (r * (ncol+2)); row[3] = m + (r * (ncol+3)); gemv_kernel4xN_4_avx2(ncol, row, x + r, res); for (c = nc; c < ncol; c++) { res[0] += row[0][c]; res[1] += row[1][c]; res[2] += row[2][c]; res[3] += row[3][c]; } y[r+0] = res[0] + b*y[r+0]; y[r+1] = res[1] + b*y[r+1]; y[r+2] = res[2] + b*y[r+2]; y[r+3] = res[3] + b*y[r+3]; } for (; r < nrow; r++) { gemv_kernel1xN_4(nrow, m + (r * ncol), x + r, res); y[r] = res[0] + b*y[r]; } return 0; } /* * rank one addition * M = ax(y^T) + M * TODO: vectorize kernel */ void blas·ger(int nrow, int ncol, double a, double *x, double *y, double *m) { int i, j; for (i = 0; i < nrow; i++) { for (j = 0; j < ncol; j++) { m[i+ncol*j] += a * x[i] * y[j]; } } } /* * rank one addition * M = ax(x^T) + M */ void blas·her(int n, double a, double *x, double *m) { blas·ger(n, n, a, x, x, m); } /* * symmetric rank one addition * M = ax(x^T) + M * TODO: vectorize kernel */ void blas·syr(int nrow, int ncol, double a, double *x, double *m) { int i, j; for (i = 0; i < nrow; i++) { for (j = 0; j < ncol; j++) { m[i+ncol*j] += a * x[i] * x[j]; } } } // ----------------------------------------------------------------------- // level three /* * matrix multiplication * m3 = a(m1 * m2) + b(m3) * einstein notation: * m3_{ij} = a m1_{ik} m2_{kj} + b m3_{ij} * * n1 = # rows of m1 = # rows of m3 * n2 = # cols of m2 = # cols of m3 * n3 = # cols of m1 = # rows of m2 * * TODO: Right now we are 2x slower than OpenBLAS. * This is because we use a very simple algorithm. */ void blas·gemm(int n1, int n2, int n3, double a, double *m1, double *m2, double b, double *m3) { int i, j, k, len; // TODO: Is there anyway this computation can be integrated into the one below? for (i = 0; i < n1; i++) { for (j = 0; j < n2; j++) { m3[i + n2*j] *= b; } } len = n1 & ~7; for (j = 0; j < n2; j++) { for (k = 0; k < n3; k++) { axpy_kernel8_avx2(len, a * m2[k + n2*j], m1 + n3*k, m3 + n2*j); // remainder for (i = len; i < n1; i++) { m3[i + n2*j] += a * m1[i + n3*k] * m2[k + n2*j]; } } } } /* * triangular matrix multiplication * m2 = a * m1 * m2 _OR_ a * m2 * m1 * m1 is assumed triangular * einstein notation: * m2_{ij} = a m1_{ik} m2_{kj} _OR_ a m1_{kj} m2_{ik} * * nrow = # rows of m2 * ncol = # cols of m2 * TODO(PERF): make compute kernel * TODO: finish all other flags */ void blas·trmm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2) { int i, j, k, len; for (i = 0; i < nrow; i++) { for (j = 0; j < ncol; j++) { for (k = i; k < ncol; k++) { m2[i + ncol*j] += a * m1[i + nrow*k] * m2[k + ncol*j]; } } } } /* * solve triangular matrix system of equations * m2 = a * m1^{-1L} _OR_ a * m2 * m1 * m1 is assumed triangular * * nrow = # rows of m2 * ncol = # cols of m2 * TODO: complete stub * TODO: finish all other flags */ void blas·trsm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2) { } #define NITER 10000 #define NCOL 5000007 #define NROW 57 error test·level3() { vlong i, n; clock_t t; double *x, *y, *m[3]; double res[2], tprof[2]; // openblas_set_num_threads(1); x = malloc(sizeof(*x)*NCOL); y = malloc(sizeof(*x)*NCOL); m[0] = malloc(sizeof(*x)*NCOL*NROW); m[1] = malloc(sizeof(*x)*NCOL*NROW); m[2] = malloc(sizeof(*x)*NCOL*NROW); tprof[0] = 0; tprof[1] = 0; for (n = 0; n < NITER; n++) { for (i = 0; i < NCOL; i++) { x[i] = i*i+1; y[i] = i+1; } for (i = 0; i < NCOL*NROW; i++) { m[0][i] = i/(NCOL*NROW); m[1][i] = i*i/(NCOL*NROW*NCOL*NROW); m[2][i] = 1; } t = clock(); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, NCOL, NROW, NROW, 1.2, m[0], NROW, m[1], NROW, 2.8, m[2], NROW); t = clock() - t; tprof[1] += 1000.*t/CLOCKS_PER_SEC; res[1] = cblas_ddot(NROW*NCOL, m[2], 1, m[2], 1); for (i = 0; i < NCOL; i++) { x[i] = i*i+1; y[i] = i+1; } for (i = 0; i < NCOL*NROW; i++) { m[0][i] = i/(NCOL*NROW); m[1][i] = i*i/(NCOL*NROW*NCOL*NROW); m[2][i] = 1; } t = clock(); blas·gemm(NROW, NROW, NROW, 1.2, m[0], m[1], 2.8, m[2]); t = clock() - t; tprof[0] += 1000.*t/CLOCKS_PER_SEC; res[0] = blas·dot(NROW*NCOL, m[2], 1, m[2], 1); } printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); printf("--> result (naive): %f\n", res[0]); printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); printf("--> result (oblas): %f\n", res[1]); return 0; } void test·level2() { int i, j, n, it; clock_t t; double *x, *y, *m; double tprof[2]; rng·init(0); tprof[0] = 0, tprof[1] = 0; x = malloc(sizeof(*x)*NCOL); y = malloc(sizeof(*x)*NCOL); m = malloc(sizeof(*x)*NCOL*(NCOL+1)/2); for (it = 0; it < NITER; it++) { n = 0; for (i = 0; i < NCOL; i++) { y[i] = rng·random(); for (j = i; j < NCOL; j++) { m[n++] = rng·random() + .1; // To ensure not singular } } memcpy(x, y, NCOL * sizeof(*x)); t = clock(); blas·tpsv(0, NCOL, m, x); t = clock() - t; tprof[0] += 1000.*t/CLOCKS_PER_SEC; t = clock(); cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, NCOL, m, y, 1); t = clock() - t; tprof[1] += 1000.*t/CLOCKS_PER_SEC; for (i = 0; i < NCOL; i++) { if (math·abs(x[i] - y[i])/math·abs(x[i]) > 1e-5) { errorf("failure at index %d: %f != %f", i, x[i], y[i]); } } } printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); } void print·array(int n, double *x) { double *end; printf("["); for (end=x+n; x != end; ++x) { printf("%f,", *x); } printf("]\n"); } error test·level1() { int ai, ai2, i, n; double *x, *y; double tprof[2]; // double params[5]; clock_t t; x = malloc(sizeof(*x)*NCOL); y = malloc(sizeof(*x)*NCOL); rng·init(0); // params[0] = -1.; // params[1] = 100; params[2] = 20; params[3] = 30; params[4] = 10; for (n = 0; n < NITER; n++) { for (i = 0; i < NCOL; i++) { y[i] = rng·random(); } memcpy(x, y, sizeof(*x)*NCOL); t = clock(); ai = blas·argmin(NCOL, x); t = clock() - t; tprof[0] += 1000.*t/CLOCKS_PER_SEC; if (n == 20729) { printf("[%d]=%f vs [%d]=%f\n", 74202, x[74202], 3, x[3]); } memcpy(x, y, sizeof(*x)*NCOL); t = clock(); ai2 = cblas_idamin(NCOL, x, 1); t = clock() - t; tprof[1] += 1000.*t/CLOCKS_PER_SEC; if (ai != ai2) { printf("iteration %d: %d not equal to %d. %f vs %f\n", n, ai, ai2, x[ai], x[ai2]); } } printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); double a, b, c, s; a = 10.234, b = 2.; cblas_drotg(&a, &b, &c, &s); printf("%f, %f, %f, %f\n", a, b, c, s); a = 10.234, b = 2.; blas·rotg(&a, &b, &c, &s); printf("%f, %f, %f, %f\n", a, b, c, s); return 0; } error main() { int i, n; double *x, *y; double res[2], tprof[2]; clock_t t; x = malloc(sizeof(*x)*NCOL); y = malloc(sizeof(*x)*NCOL); rng·init(0); for (n = 0; n < NITER; n++) { for (i = 0; i < NCOL; i++) { x[i] = rng·random(); y[i] = rng·random(); } t = clock(); res[1] += cblas_ddot(NCOL/4, x, 4, y, 4); t = clock() - t; tprof[1] += 1000.*t/CLOCKS_PER_SEC; t = clock(); res[0] += blas·dot(NCOL/4, x, 4, y, 4); t = clock() - t; tprof[0] += 1000.*t/CLOCKS_PER_SEC; } printf("%f, %f\n", res[0], res[1]); printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); double a, b, c, s; a = 10.234, b = 2.; cblas_drotg(&a, &b, &c, &s); printf("%f, %f, %f, %f\n", a, b, c, s); a = 10.234, b = 2.; blas·rotg(&a, &b, &c, &s); printf("%f, %f, %f, %f\n", a, b, c, s); return 0; }