From 5e53685221576ad6ec53bafffd14bc7d5e01fa30 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Mon, 25 May 2020 15:33:54 -0700 Subject: deprecated old python generation files --- sys/libmath/blas1.c | 2135 ++------------------------------------------------- 1 file changed, 56 insertions(+), 2079 deletions(-) (limited to 'sys/libmath/blas1.c') diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c index 1907179..d9792f6 100644 --- a/sys/libmath/blas1.c +++ b/sys/libmath/blas1.c @@ -1,2082 +1,59 @@ #include -#include #include -/*********************************************************/ -/* THIS CODE IS GENERATED BY GEN1.PY! DON'T EDIT BY HAND */ -/*********************************************************/ - - -static -int -argminf_kernel16(int *ip, float *x) -{ - int ix[16]; - float min[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - ix[i] = 0; - } - for (i = 0; i < 16; i++) { - min[i] = x[0]; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - if (x[i + 0] < min[0]) { - ix[0] = (i + 0); - min[0] = x[ix[0]]; - } - if (x[i + 1] < min[1]) { - ix[1] = (i + 1); - min[1] = x[ix[1]]; - } - if (x[i + 2] < min[2]) { - ix[2] = (i + 2); - min[2] = x[ix[2]]; - } - if (x[i + 3] < min[3]) { - ix[3] = (i + 3); - min[3] = x[ix[3]]; - } - if (x[i + 4] < min[4]) { - ix[4] = (i + 4); - min[4] = x[ix[4]]; - } - if (x[i + 5] < min[5]) { - ix[5] = (i + 5); - min[5] = x[ix[5]]; - } - if (x[i + 6] < min[6]) { - ix[6] = (i + 6); - min[6] = x[ix[6]]; - } - if (x[i + 7] < min[7]) { - ix[7] = (i + 7); - min[7] = x[ix[7]]; - } - if (x[i + 8] < min[8]) { - ix[8] = (i + 8); - min[8] = x[ix[8]]; - } - if (x[i + 9] < min[9]) { - ix[9] = (i + 9); - min[9] = x[ix[9]]; - } - if (x[i + 10] < min[10]) { - ix[10] = (i + 10); - min[10] = x[ix[10]]; - } - if (x[i + 11] < min[11]) { - ix[11] = (i + 11); - min[11] = x[ix[11]]; - } - if (x[i + 12] < min[12]) { - ix[12] = (i + 12); - min[12] = x[ix[12]]; - } - if (x[i + 13] < min[13]) { - ix[13] = (i + 13); - min[13] = x[ix[13]]; - } - if (x[i + 14] < min[14]) { - ix[14] = (i + 14); - min[14] = x[ix[14]]; - } - if (x[i + 15] < min[15]) { - ix[15] = (i + 15); - min[15] = x[ix[15]]; - } - } - *ip = i; - for (i = 1; i < 16; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -static -int -argminf_s_kernel8(int *ip, int incx, float *x) -{ - int ix[8]; - float min[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - ix[i] = 0; - } - for (i = 0; i < 8; i++) { - min[i] = x[0]; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - if (x[incx * (i + 0)] < min[0]) { - ix[0] = (i + 0); - min[0] = x[incx * ix[0]]; - } - if (x[incx * (i + 1)] < min[1]) { - ix[1] = (i + 1); - min[1] = x[incx * ix[1]]; - } - if (x[incx * (i + 2)] < min[2]) { - ix[2] = (i + 2); - min[2] = x[incx * ix[2]]; - } - if (x[incx * (i + 3)] < min[3]) { - ix[3] = (i + 3); - min[3] = x[incx * ix[3]]; - } - if (x[incx * (i + 4)] < min[4]) { - ix[4] = (i + 4); - min[4] = x[incx * ix[4]]; - } - if (x[incx * (i + 5)] < min[5]) { - ix[5] = (i + 5); - min[5] = x[incx * ix[5]]; - } - if (x[incx * (i + 6)] < min[6]) { - ix[6] = (i + 6); - min[6] = x[incx * ix[6]]; - } - if (x[incx * (i + 7)] < min[7]) { - ix[7] = (i + 7); - min[7] = x[incx * ix[7]]; - } - } - *ip = i; - for (i = 1; i < 8; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -int -blas·argminf(int len, float *x, int incx) -{ - int i; - int ix; - float min; - - if (incx == 1) { - i = len; - ix = argminf_kernel16(&i, x); - } else { - i = len; - ix = argminf_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - if (x[incx * i] < min) { - ix = i; - min = x[incx * ix]; - } - } - for (; i < len; i++) { - if (x[i] < min) { - ix = i; - min = x[ix]; - } - } - - return ix; -} - -static -int -argmaxf_kernel16(int *ip, float *x) -{ - int ix[16]; - float max[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - ix[i] = 0; - } - for (i = 0; i < 16; i++) { - max[i] = x[0]; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - if (x[i + 0] > max[0]) { - ix[0] = (i + 0); - max[0] = x[ix[0]]; - } - if (x[i + 1] > max[1]) { - ix[1] = (i + 1); - max[1] = x[ix[1]]; - } - if (x[i + 2] > max[2]) { - ix[2] = (i + 2); - max[2] = x[ix[2]]; - } - if (x[i + 3] > max[3]) { - ix[3] = (i + 3); - max[3] = x[ix[3]]; - } - if (x[i + 4] > max[4]) { - ix[4] = (i + 4); - max[4] = x[ix[4]]; - } - if (x[i + 5] > max[5]) { - ix[5] = (i + 5); - max[5] = x[ix[5]]; - } - if (x[i + 6] > max[6]) { - ix[6] = (i + 6); - max[6] = x[ix[6]]; - } - if (x[i + 7] > max[7]) { - ix[7] = (i + 7); - max[7] = x[ix[7]]; - } - if (x[i + 8] > max[8]) { - ix[8] = (i + 8); - max[8] = x[ix[8]]; - } - if (x[i + 9] > max[9]) { - ix[9] = (i + 9); - max[9] = x[ix[9]]; - } - if (x[i + 10] > max[10]) { - ix[10] = (i + 10); - max[10] = x[ix[10]]; - } - if (x[i + 11] > max[11]) { - ix[11] = (i + 11); - max[11] = x[ix[11]]; - } - if (x[i + 12] > max[12]) { - ix[12] = (i + 12); - max[12] = x[ix[12]]; - } - if (x[i + 13] > max[13]) { - ix[13] = (i + 13); - max[13] = x[ix[13]]; - } - if (x[i + 14] > max[14]) { - ix[14] = (i + 14); - max[14] = x[ix[14]]; - } - if (x[i + 15] > max[15]) { - ix[15] = (i + 15); - max[15] = x[ix[15]]; - } - } - *ip = i; - for (i = 1; i < 16; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -static -int -argmaxf_s_kernel8(int *ip, int incx, float *x) -{ - int ix[8]; - float max[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - ix[i] = 0; - } - for (i = 0; i < 8; i++) { - max[i] = x[0]; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - if (x[incx * (i + 0)] > max[0]) { - ix[0] = (i + 0); - max[0] = x[incx * ix[0]]; - } - if (x[incx * (i + 1)] > max[1]) { - ix[1] = (i + 1); - max[1] = x[incx * ix[1]]; - } - if (x[incx * (i + 2)] > max[2]) { - ix[2] = (i + 2); - max[2] = x[incx * ix[2]]; - } - if (x[incx * (i + 3)] > max[3]) { - ix[3] = (i + 3); - max[3] = x[incx * ix[3]]; - } - if (x[incx * (i + 4)] > max[4]) { - ix[4] = (i + 4); - max[4] = x[incx * ix[4]]; - } - if (x[incx * (i + 5)] > max[5]) { - ix[5] = (i + 5); - max[5] = x[incx * ix[5]]; - } - if (x[incx * (i + 6)] > max[6]) { - ix[6] = (i + 6); - max[6] = x[incx * ix[6]]; - } - if (x[incx * (i + 7)] > max[7]) { - ix[7] = (i + 7); - max[7] = x[incx * ix[7]]; - } - } - *ip = i; - for (i = 1; i < 8; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -int -blas·argmaxf(int len, float *x, int incx) -{ - int i; - int ix; - float max; - - if (incx == 1) { - i = len; - ix = argmaxf_kernel16(&i, x); - } else { - i = len; - ix = argmaxf_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - if (x[incx * i] > max) { - ix = i; - max = x[incx * ix]; - } - } - for (; i < len; i++) { - if (x[i] > max) { - ix = i; - max = x[ix]; - } - } - - return ix; -} - -static -void -copyf_kernel16(int *ip, float *x, float *y) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = x[i + 0]; - y[i + 1] = x[i + 1]; - y[i + 2] = x[i + 2]; - y[i + 3] = x[i + 3]; - y[i + 4] = x[i + 4]; - y[i + 5] = x[i + 5]; - y[i + 6] = x[i + 6]; - y[i + 7] = x[i + 7]; - y[i + 8] = x[i + 8]; - y[i + 9] = x[i + 9]; - y[i + 10] = x[i + 10]; - y[i + 11] = x[i + 11]; - y[i + 12] = x[i + 12]; - y[i + 13] = x[i + 13]; - y[i + 14] = x[i + 14]; - y[i + 15] = x[i + 15]; - } - *ip = i; -} - -static -void -copyf_s_kernel8(int *ip, int incx, int incy, float *x, float *y) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = x[incx * (i + 0)]; - y[incy * (i + 1)] = x[incx * (i + 1)]; - y[incy * (i + 2)] = x[incx * (i + 2)]; - y[incy * (i + 3)] = x[incx * (i + 3)]; - y[incy * (i + 4)] = x[incx * (i + 4)]; - y[incy * (i + 5)] = x[incx * (i + 5)]; - y[incy * (i + 6)] = x[incx * (i + 6)]; - y[incy * (i + 7)] = x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·copyf(int len, float *x, int incx, float *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - copyf_kernel16(&i, x, y); - } else { - i = len; - copyf_s_kernel8(&i, incx, incy, x, y); - } - for (; i < len; i++) { - y[incy * i] = x[incx * i]; - } -} - -static -void -axpyf_kernel16(int *ip, float *x, float a, float *y) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = y[i + 0] + a * x[i + 0]; - y[i + 1] = y[i + 1] + a * x[i + 1]; - y[i + 2] = y[i + 2] + a * x[i + 2]; - y[i + 3] = y[i + 3] + a * x[i + 3]; - y[i + 4] = y[i + 4] + a * x[i + 4]; - y[i + 5] = y[i + 5] + a * x[i + 5]; - y[i + 6] = y[i + 6] + a * x[i + 6]; - y[i + 7] = y[i + 7] + a * x[i + 7]; - y[i + 8] = y[i + 8] + a * x[i + 8]; - y[i + 9] = y[i + 9] + a * x[i + 9]; - y[i + 10] = y[i + 10] + a * x[i + 10]; - y[i + 11] = y[i + 11] + a * x[i + 11]; - y[i + 12] = y[i + 12] + a * x[i + 12]; - y[i + 13] = y[i + 13] + a * x[i + 13]; - y[i + 14] = y[i + 14] + a * x[i + 14]; - y[i + 15] = y[i + 15] + a * x[i + 15]; - } - *ip = i; -} - -static -void -axpyf_s_kernel8(int *ip, float *x, float a, int incx, float *y, int incy) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)]; - y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)]; - y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)]; - y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)]; - y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)]; - y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)]; - y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)]; - y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·axpyf(int len, float a, float *x, int incx, float *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - axpyf_kernel16(&i, x, a, y); - } else { - i = len; - axpyf_s_kernel8(&i, x, a, incx, y, incy); - } - for (; i < len; i++) { - y[incy * i] = y[incy * i] + a * x[incx * i]; - } -} - -static -void -axpbyf_kernel16(int *ip, float a, float *y, float b, float *x) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = b * y[i + 0] + a * x[i + 0]; - y[i + 1] = b * y[i + 1] + a * x[i + 1]; - y[i + 2] = b * y[i + 2] + a * x[i + 2]; - y[i + 3] = b * y[i + 3] + a * x[i + 3]; - y[i + 4] = b * y[i + 4] + a * x[i + 4]; - y[i + 5] = b * y[i + 5] + a * x[i + 5]; - y[i + 6] = b * y[i + 6] + a * x[i + 6]; - y[i + 7] = b * y[i + 7] + a * x[i + 7]; - y[i + 8] = b * y[i + 8] + a * x[i + 8]; - y[i + 9] = b * y[i + 9] + a * x[i + 9]; - y[i + 10] = b * y[i + 10] + a * x[i + 10]; - y[i + 11] = b * y[i + 11] + a * x[i + 11]; - y[i + 12] = b * y[i + 12] + a * x[i + 12]; - y[i + 13] = b * y[i + 13] + a * x[i + 13]; - y[i + 14] = b * y[i + 14] + a * x[i + 14]; - y[i + 15] = b * y[i + 15] + a * x[i + 15]; - } - *ip = i; -} - -static -void -axpbyf_s_kernel8(int *ip, float a, int incx, float *y, float b, int incy, float *x) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)]; - y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)]; - y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)]; - y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)]; - y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)]; - y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)]; - y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)]; - y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·axpbyf(int len, float a, float *x, int incx, float b, float *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - axpbyf_kernel16(&i, a, y, b, x); - } else { - i = len; - axpbyf_s_kernel8(&i, a, incx, y, b, incy, x); - } - for (; i < len; i++) { - y[incy * i] = b * y[incy * i] + a * x[incx * i]; - } -} - -static -float -dotf_kernel16(int *ip, float *y, float *x) -{ - float sum[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - sum[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - sum[0] += x[i + 0] * y[i + 0]; - sum[1] += x[i + 1] * y[i + 1]; - sum[2] += x[i + 2] * y[i + 2]; - sum[3] += x[i + 3] * y[i + 3]; - sum[4] += x[i + 4] * y[i + 4]; - sum[5] += x[i + 5] * y[i + 5]; - sum[6] += x[i + 6] * y[i + 6]; - sum[7] += x[i + 7] * y[i + 7]; - sum[8] += x[i + 8] * y[i + 8]; - sum[9] += x[i + 9] * y[i + 9]; - sum[10] += x[i + 10] * y[i + 10]; - sum[11] += x[i + 11] * y[i + 11]; - sum[12] += x[i + 12] * y[i + 12]; - sum[13] += x[i + 13] * y[i + 13]; - sum[14] += x[i + 14] * y[i + 14]; - sum[15] += x[i + 15] * y[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -static -float -dotf_s_kernel8(int *ip, int incy, float *x, float *y, int incx) -{ - float sum[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - sum[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)]; - sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)]; - sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)]; - sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)]; - sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)]; - sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)]; - sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)]; - sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -float -blas·dotf(int len, float *x, int incx, float *y, int incy) -{ - int i; - float sum; - - if (incx == 1 && incy == 1) { - i = len; - sum = dotf_kernel16(&i, y, x); - } else { - i = len; - sum = dotf_s_kernel8(&i, incy, x, y, incx); - } - for (; i < len; i++) { - sum += x[incx * i] * y[incy * i]; - } - - return sum; -} - -static -float -sumf_kernel16(int *ip, float *x) -{ - float sum[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - sum[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - sum[0] += x[i + 0]; - sum[1] += x[i + 1]; - sum[2] += x[i + 2]; - sum[3] += x[i + 3]; - sum[4] += x[i + 4]; - sum[5] += x[i + 5]; - sum[6] += x[i + 6]; - sum[7] += x[i + 7]; - sum[8] += x[i + 8]; - sum[9] += x[i + 9]; - sum[10] += x[i + 10]; - sum[11] += x[i + 11]; - sum[12] += x[i + 12]; - sum[13] += x[i + 13]; - sum[14] += x[i + 14]; - sum[15] += x[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -static -float -sumf_s_kernel8(int *ip, int incx, float *x) -{ - float sum[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - sum[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - sum[0] += x[incx * (i + 0)]; - sum[1] += x[incx * (i + 1)]; - sum[2] += x[incx * (i + 2)]; - sum[3] += x[incx * (i + 3)]; - sum[4] += x[incx * (i + 4)]; - sum[5] += x[incx * (i + 5)]; - sum[6] += x[incx * (i + 6)]; - sum[7] += x[incx * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -float -blas·sumf(int len, float *x, int incx) -{ - int i; - float sum; - - if (incx == 1) { - i = len; - sum = sumf_kernel16(&i, x); - } else { - i = len; - sum = sumf_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - sum += x[incx * i]; - } - - return sum; -} - -static -float -normf_kernel16(int *ip, float *x) -{ - int i; - float nrm[16]; - int len; - - for (i = 0; i < 16; i++) { - nrm[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - nrm[0] += x[i + 0] * x[i + 0]; - nrm[1] += x[i + 1] * x[i + 1]; - nrm[2] += x[i + 2] * x[i + 2]; - nrm[3] += x[i + 3] * x[i + 3]; - nrm[4] += x[i + 4] * x[i + 4]; - nrm[5] += x[i + 5] * x[i + 5]; - nrm[6] += x[i + 6] * x[i + 6]; - nrm[7] += x[i + 7] * x[i + 7]; - nrm[8] += x[i + 8] * x[i + 8]; - nrm[9] += x[i + 9] * x[i + 9]; - nrm[10] += x[i + 10] * x[i + 10]; - nrm[11] += x[i + 11] * x[i + 11]; - nrm[12] += x[i + 12] * x[i + 12]; - nrm[13] += x[i + 13] * x[i + 13]; - nrm[14] += x[i + 14] * x[i + 14]; - nrm[15] += x[i + 15] * x[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - nrm[0] += nrm[i]; - } - - return nrm[0]; -} - -static -float -normf_s_kernel8(int *ip, int incx, float *x) -{ - int i; - float nrm[8]; - int len; - - for (i = 0; i < 8; i++) { - nrm[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)]; - nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)]; - nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)]; - nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)]; - nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)]; - nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)]; - nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)]; - nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - nrm[0] += nrm[i]; - } - - return nrm[0]; -} - -float -blas·normf(int len, float *x, int incx) -{ - int i; - float nrm; - - if (incx == 1) { - i = len; - nrm = normf_kernel16(&i, x); - } else { - i = len; - nrm = normf_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - nrm += x[incx * i] * x[incx * i]; - } - - return math·sqrtf(nrm); -} - -static -void -scalef_kernel16(int *ip, float *x, float a) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - x[i + 0] = a * x[i + 0]; - x[i + 1] = a * x[i + 1]; - x[i + 2] = a * x[i + 2]; - x[i + 3] = a * x[i + 3]; - x[i + 4] = a * x[i + 4]; - x[i + 5] = a * x[i + 5]; - x[i + 6] = a * x[i + 6]; - x[i + 7] = a * x[i + 7]; - x[i + 8] = a * x[i + 8]; - x[i + 9] = a * x[i + 9]; - x[i + 10] = a * x[i + 10]; - x[i + 11] = a * x[i + 11]; - x[i + 12] = a * x[i + 12]; - x[i + 13] = a * x[i + 13]; - x[i + 14] = a * x[i + 14]; - x[i + 15] = a * x[i + 15]; - } - *ip = i; -} - -static -void -scalef_s_kernel8(int *ip, float *x, int incx, float a) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - x[incx * (i + 0)] = a * x[incx * (i + 0)]; - x[incx * (i + 1)] = a * x[incx * (i + 1)]; - x[incx * (i + 2)] = a * x[incx * (i + 2)]; - x[incx * (i + 3)] = a * x[incx * (i + 3)]; - x[incx * (i + 4)] = a * x[incx * (i + 4)]; - x[incx * (i + 5)] = a * x[incx * (i + 5)]; - x[incx * (i + 6)] = a * x[incx * (i + 6)]; - x[incx * (i + 7)] = a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·scalef(int len, float *x, int incx, float a) -{ - int i; - - if (incx == 1) { - i = len; - scalef_kernel16(&i, x, a); - } else { - i = len; - scalef_s_kernel8(&i, x, incx, a); - } - for (; i < len; i++) { - x[incx * i] = a * x[incx * i]; - } -} - -static -void -rotf_kernel16(int *ip, float sin, float *x, float cos, float *y) -{ - int i; - float tmp[16]; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0]; - tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1]; - tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2]; - tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3]; - tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4]; - tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5]; - tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6]; - tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7]; - tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8]; - tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9]; - tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10]; - tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11]; - tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12]; - tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13]; - tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14]; - tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15]; - } - *ip = i; -} - -static -void -rotf_s_kernel8(int *ip, int incy, float cos, float sin, int incx, float *x, float *y) -{ - int i; - float tmp[8]; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0]; - tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1]; - tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2]; - tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3]; - tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4]; - tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5]; - tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6]; - tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7]; - } - *ip = i; -} - -void -blas·rotf(int len, float *x, int incx, float *y, int incy, float cos, float sin) -{ - int i; - float tmp; - - if (incx == 1 && incy == 1) { - i = len; - rotf_kernel16(&i, sin, x, cos, y); - } else { - i = len; - rotf_s_kernel8(&i, incy, cos, sin, incx, x, y); - } - for (; i < len; i++) { - tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp; - } -} - -static -void -rotgf_kernel16(int *ip, float *x, float H[5], float *y) -{ - float tmp[16]; - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0]; - tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1]; - tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2]; - tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3]; - tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4]; - tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5]; - tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6]; - tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7]; - tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8]; - tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9]; - tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10]; - tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11]; - tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12]; - tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13]; - tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14]; - tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15]; - } - *ip = i; -} - -static -void -rotgf_s_kernel8(int *ip, int incx, int incy, float *x, float H[5], float *y) -{ - float tmp[8]; - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0]; - tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1]; - tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2]; - tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3]; - tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4]; - tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5]; - tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6]; - tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7]; - } - *ip = i; -} - -void -blas·rotgf(int len, float *x, int incx, float *y, int incy, float H[5]) -{ - int i; - float tmp; - - if (incx == 1 && incy == 1) { - i = len; - rotgf_kernel16(&i, x, H, y); - } else { - i = len; - rotgf_s_kernel8(&i, incx, incy, x, H, y); - } - for (; i < len; i++) { - tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp; - } -} - -static -int -argmind_kernel16(int *ip, double *x) -{ - double min[16]; - int i; - int ix[16]; - int len; - - for (i = 0; i < 16; i++) { - min[i] = x[0]; - } - for (i = 0; i < 16; i++) { - ix[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - if (x[i + 0] < min[0]) { - ix[0] = (i + 0); - min[0] = x[ix[0]]; - } - if (x[i + 1] < min[1]) { - ix[1] = (i + 1); - min[1] = x[ix[1]]; - } - if (x[i + 2] < min[2]) { - ix[2] = (i + 2); - min[2] = x[ix[2]]; - } - if (x[i + 3] < min[3]) { - ix[3] = (i + 3); - min[3] = x[ix[3]]; - } - if (x[i + 4] < min[4]) { - ix[4] = (i + 4); - min[4] = x[ix[4]]; - } - if (x[i + 5] < min[5]) { - ix[5] = (i + 5); - min[5] = x[ix[5]]; - } - if (x[i + 6] < min[6]) { - ix[6] = (i + 6); - min[6] = x[ix[6]]; - } - if (x[i + 7] < min[7]) { - ix[7] = (i + 7); - min[7] = x[ix[7]]; - } - if (x[i + 8] < min[8]) { - ix[8] = (i + 8); - min[8] = x[ix[8]]; - } - if (x[i + 9] < min[9]) { - ix[9] = (i + 9); - min[9] = x[ix[9]]; - } - if (x[i + 10] < min[10]) { - ix[10] = (i + 10); - min[10] = x[ix[10]]; - } - if (x[i + 11] < min[11]) { - ix[11] = (i + 11); - min[11] = x[ix[11]]; - } - if (x[i + 12] < min[12]) { - ix[12] = (i + 12); - min[12] = x[ix[12]]; - } - if (x[i + 13] < min[13]) { - ix[13] = (i + 13); - min[13] = x[ix[13]]; - } - if (x[i + 14] < min[14]) { - ix[14] = (i + 14); - min[14] = x[ix[14]]; - } - if (x[i + 15] < min[15]) { - ix[15] = (i + 15); - min[15] = x[ix[15]]; - } - } - *ip = i; - for (i = 1; i < 16; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -static -int -argmind_s_kernel8(int *ip, double *x, int incx) -{ - double min[8]; - int i; - int ix[8]; - int len; - - for (i = 0; i < 8; i++) { - min[i] = x[0]; - } - for (i = 0; i < 8; i++) { - ix[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - if (x[incx * (i + 0)] < min[0]) { - ix[0] = (i + 0); - min[0] = x[incx * ix[0]]; - } - if (x[incx * (i + 1)] < min[1]) { - ix[1] = (i + 1); - min[1] = x[incx * ix[1]]; - } - if (x[incx * (i + 2)] < min[2]) { - ix[2] = (i + 2); - min[2] = x[incx * ix[2]]; - } - if (x[incx * (i + 3)] < min[3]) { - ix[3] = (i + 3); - min[3] = x[incx * ix[3]]; - } - if (x[incx * (i + 4)] < min[4]) { - ix[4] = (i + 4); - min[4] = x[incx * ix[4]]; - } - if (x[incx * (i + 5)] < min[5]) { - ix[5] = (i + 5); - min[5] = x[incx * ix[5]]; - } - if (x[incx * (i + 6)] < min[6]) { - ix[6] = (i + 6); - min[6] = x[incx * ix[6]]; - } - if (x[incx * (i + 7)] < min[7]) { - ix[7] = (i + 7); - min[7] = x[incx * ix[7]]; - } - } - *ip = i; - for (i = 1; i < 8; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -int -blas·argmind(int len, double *x, int incx) -{ - int i; - int ix; - double min; - - if (incx == 1) { - i = len; - ix = argmind_kernel16(&i, x); - } else { - i = len; - ix = argmind_s_kernel8(&i, x, incx); - } - for (; i < len; i++) { - if (x[incx * i] < min) { - ix = i; - min = x[incx * ix]; - } - } - for (; i < len; i++) { - if (x[i] < min) { - ix = i; - min = x[ix]; - } - } - - return ix; -} - -static -int -argmaxd_kernel16(int *ip, double *x) -{ - int ix[16]; - int i; - double max[16]; - int len; - - for (i = 0; i < 16; i++) { - ix[i] = 0; - } - for (i = 0; i < 16; i++) { - max[i] = x[0]; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - if (x[i + 0] > max[0]) { - ix[0] = (i + 0); - max[0] = x[ix[0]]; - } - if (x[i + 1] > max[1]) { - ix[1] = (i + 1); - max[1] = x[ix[1]]; - } - if (x[i + 2] > max[2]) { - ix[2] = (i + 2); - max[2] = x[ix[2]]; - } - if (x[i + 3] > max[3]) { - ix[3] = (i + 3); - max[3] = x[ix[3]]; - } - if (x[i + 4] > max[4]) { - ix[4] = (i + 4); - max[4] = x[ix[4]]; - } - if (x[i + 5] > max[5]) { - ix[5] = (i + 5); - max[5] = x[ix[5]]; - } - if (x[i + 6] > max[6]) { - ix[6] = (i + 6); - max[6] = x[ix[6]]; - } - if (x[i + 7] > max[7]) { - ix[7] = (i + 7); - max[7] = x[ix[7]]; - } - if (x[i + 8] > max[8]) { - ix[8] = (i + 8); - max[8] = x[ix[8]]; - } - if (x[i + 9] > max[9]) { - ix[9] = (i + 9); - max[9] = x[ix[9]]; - } - if (x[i + 10] > max[10]) { - ix[10] = (i + 10); - max[10] = x[ix[10]]; - } - if (x[i + 11] > max[11]) { - ix[11] = (i + 11); - max[11] = x[ix[11]]; - } - if (x[i + 12] > max[12]) { - ix[12] = (i + 12); - max[12] = x[ix[12]]; - } - if (x[i + 13] > max[13]) { - ix[13] = (i + 13); - max[13] = x[ix[13]]; - } - if (x[i + 14] > max[14]) { - ix[14] = (i + 14); - max[14] = x[ix[14]]; - } - if (x[i + 15] > max[15]) { - ix[15] = (i + 15); - max[15] = x[ix[15]]; - } - } - *ip = i; - for (i = 1; i < 16; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -static -int -argmaxd_s_kernel8(int *ip, int incx, double *x) -{ - int ix[8]; - int i; - double max[8]; - int len; - - for (i = 0; i < 8; i++) { - ix[i] = 0; - } - for (i = 0; i < 8; i++) { - max[i] = x[0]; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - if (x[incx * (i + 0)] > max[0]) { - ix[0] = (i + 0); - max[0] = x[incx * ix[0]]; - } - if (x[incx * (i + 1)] > max[1]) { - ix[1] = (i + 1); - max[1] = x[incx * ix[1]]; - } - if (x[incx * (i + 2)] > max[2]) { - ix[2] = (i + 2); - max[2] = x[incx * ix[2]]; - } - if (x[incx * (i + 3)] > max[3]) { - ix[3] = (i + 3); - max[3] = x[incx * ix[3]]; - } - if (x[incx * (i + 4)] > max[4]) { - ix[4] = (i + 4); - max[4] = x[incx * ix[4]]; - } - if (x[incx * (i + 5)] > max[5]) { - ix[5] = (i + 5); - max[5] = x[incx * ix[5]]; - } - if (x[incx * (i + 6)] > max[6]) { - ix[6] = (i + 6); - max[6] = x[incx * ix[6]]; - } - if (x[incx * (i + 7)] > max[7]) { - ix[7] = (i + 7); - max[7] = x[incx * ix[7]]; - } - } - *ip = i; - for (i = 1; i < 8; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -int -blas·argmaxd(int len, double *x, int incx) -{ - int i; - int ix; - double max; - - if (incx == 1) { - i = len; - ix = argmaxd_kernel16(&i, x); - } else { - i = len; - ix = argmaxd_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - if (x[incx * i] > max) { - ix = i; - max = x[incx * ix]; - } - } - for (; i < len; i++) { - if (x[i] > max) { - ix = i; - max = x[ix]; - } - } - - return ix; -} - -static -void -copyd_kernel16(int *ip, double *y, double *x) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = x[i + 0]; - y[i + 1] = x[i + 1]; - y[i + 2] = x[i + 2]; - y[i + 3] = x[i + 3]; - y[i + 4] = x[i + 4]; - y[i + 5] = x[i + 5]; - y[i + 6] = x[i + 6]; - y[i + 7] = x[i + 7]; - y[i + 8] = x[i + 8]; - y[i + 9] = x[i + 9]; - y[i + 10] = x[i + 10]; - y[i + 11] = x[i + 11]; - y[i + 12] = x[i + 12]; - y[i + 13] = x[i + 13]; - y[i + 14] = x[i + 14]; - y[i + 15] = x[i + 15]; - } - *ip = i; -} - -static -void -copyd_s_kernel8(int *ip, double *y, int incx, int incy, double *x) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = x[incx * (i + 0)]; - y[incy * (i + 1)] = x[incx * (i + 1)]; - y[incy * (i + 2)] = x[incx * (i + 2)]; - y[incy * (i + 3)] = x[incx * (i + 3)]; - y[incy * (i + 4)] = x[incx * (i + 4)]; - y[incy * (i + 5)] = x[incx * (i + 5)]; - y[incy * (i + 6)] = x[incx * (i + 6)]; - y[incy * (i + 7)] = x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·copyd(int len, double *x, int incx, double *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - copyd_kernel16(&i, y, x); - } else { - i = len; - copyd_s_kernel8(&i, y, incx, incy, x); - } - for (; i < len; i++) { - y[incy * i] = x[incx * i]; - } -} - -static -void -axpyd_kernel16(int *ip, double *x, double *y, double a) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = y[i + 0] + a * x[i + 0]; - y[i + 1] = y[i + 1] + a * x[i + 1]; - y[i + 2] = y[i + 2] + a * x[i + 2]; - y[i + 3] = y[i + 3] + a * x[i + 3]; - y[i + 4] = y[i + 4] + a * x[i + 4]; - y[i + 5] = y[i + 5] + a * x[i + 5]; - y[i + 6] = y[i + 6] + a * x[i + 6]; - y[i + 7] = y[i + 7] + a * x[i + 7]; - y[i + 8] = y[i + 8] + a * x[i + 8]; - y[i + 9] = y[i + 9] + a * x[i + 9]; - y[i + 10] = y[i + 10] + a * x[i + 10]; - y[i + 11] = y[i + 11] + a * x[i + 11]; - y[i + 12] = y[i + 12] + a * x[i + 12]; - y[i + 13] = y[i + 13] + a * x[i + 13]; - y[i + 14] = y[i + 14] + a * x[i + 14]; - y[i + 15] = y[i + 15] + a * x[i + 15]; - } - *ip = i; -} - -static -void -axpyd_s_kernel8(int *ip, double *x, int incy, double *y, double a, int incx) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)]; - y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)]; - y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)]; - y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)]; - y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)]; - y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)]; - y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)]; - y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·axpyd(int len, double a, double *x, int incx, double *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - axpyd_kernel16(&i, x, y, a); - } else { - i = len; - axpyd_s_kernel8(&i, x, incy, y, a, incx); - } - for (; i < len; i++) { - y[incy * i] = y[incy * i] + a * x[incx * i]; - } -} - -static -void -axpbyd_kernel16(int *ip, double a, double b, double *x, double *y) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = b * y[i + 0] + a * x[i + 0]; - y[i + 1] = b * y[i + 1] + a * x[i + 1]; - y[i + 2] = b * y[i + 2] + a * x[i + 2]; - y[i + 3] = b * y[i + 3] + a * x[i + 3]; - y[i + 4] = b * y[i + 4] + a * x[i + 4]; - y[i + 5] = b * y[i + 5] + a * x[i + 5]; - y[i + 6] = b * y[i + 6] + a * x[i + 6]; - y[i + 7] = b * y[i + 7] + a * x[i + 7]; - y[i + 8] = b * y[i + 8] + a * x[i + 8]; - y[i + 9] = b * y[i + 9] + a * x[i + 9]; - y[i + 10] = b * y[i + 10] + a * x[i + 10]; - y[i + 11] = b * y[i + 11] + a * x[i + 11]; - y[i + 12] = b * y[i + 12] + a * x[i + 12]; - y[i + 13] = b * y[i + 13] + a * x[i + 13]; - y[i + 14] = b * y[i + 14] + a * x[i + 14]; - y[i + 15] = b * y[i + 15] + a * x[i + 15]; - } - *ip = i; -} - -static -void -axpbyd_s_kernel8(int *ip, double a, int incy, double b, double *x, int incx, double *y) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)]; - y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)]; - y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)]; - y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)]; - y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)]; - y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)]; - y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)]; - y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·axpbyd(int len, double a, double *x, int incx, double b, double *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - axpbyd_kernel16(&i, a, b, x, y); - } else { - i = len; - axpbyd_s_kernel8(&i, a, incy, b, x, incx, y); - } - for (; i < len; i++) { - y[incy * i] = b * y[incy * i] + a * x[incx * i]; - } -} - -static -double -dotd_kernel16(int *ip, double *y, double *x) -{ - double sum[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - sum[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - sum[0] += x[i + 0] * y[i + 0]; - sum[1] += x[i + 1] * y[i + 1]; - sum[2] += x[i + 2] * y[i + 2]; - sum[3] += x[i + 3] * y[i + 3]; - sum[4] += x[i + 4] * y[i + 4]; - sum[5] += x[i + 5] * y[i + 5]; - sum[6] += x[i + 6] * y[i + 6]; - sum[7] += x[i + 7] * y[i + 7]; - sum[8] += x[i + 8] * y[i + 8]; - sum[9] += x[i + 9] * y[i + 9]; - sum[10] += x[i + 10] * y[i + 10]; - sum[11] += x[i + 11] * y[i + 11]; - sum[12] += x[i + 12] * y[i + 12]; - sum[13] += x[i + 13] * y[i + 13]; - sum[14] += x[i + 14] * y[i + 14]; - sum[15] += x[i + 15] * y[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -static -double -dotd_s_kernel8(int *ip, double *y, int incx, int incy, double *x) -{ - double sum[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - sum[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)]; - sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)]; - sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)]; - sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)]; - sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)]; - sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)]; - sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)]; - sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -double -blas·dotd(int len, double *x, int incx, double *y, int incy) -{ - int i; - double sum; - - if (incx == 1 && incy == 1) { - i = len; - sum = dotd_kernel16(&i, y, x); - } else { - i = len; - sum = dotd_s_kernel8(&i, y, incx, incy, x); - } - for (; i < len; i++) { - sum += x[incx * i] * y[incy * i]; - } - - return sum; -} - -static -double -sumd_kernel16(int *ip, double *x) -{ - int i; - double sum[16]; - int len; - - for (i = 0; i < 16; i++) { - sum[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - sum[0] += x[i + 0]; - sum[1] += x[i + 1]; - sum[2] += x[i + 2]; - sum[3] += x[i + 3]; - sum[4] += x[i + 4]; - sum[5] += x[i + 5]; - sum[6] += x[i + 6]; - sum[7] += x[i + 7]; - sum[8] += x[i + 8]; - sum[9] += x[i + 9]; - sum[10] += x[i + 10]; - sum[11] += x[i + 11]; - sum[12] += x[i + 12]; - sum[13] += x[i + 13]; - sum[14] += x[i + 14]; - sum[15] += x[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -static -double -sumd_s_kernel8(int *ip, int incx, double *x) -{ - int i; - double sum[8]; - int len; - - for (i = 0; i < 8; i++) { - sum[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - sum[0] += x[incx * (i + 0)]; - sum[1] += x[incx * (i + 1)]; - sum[2] += x[incx * (i + 2)]; - sum[3] += x[incx * (i + 3)]; - sum[4] += x[incx * (i + 4)]; - sum[5] += x[incx * (i + 5)]; - sum[6] += x[incx * (i + 6)]; - sum[7] += x[incx * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -double -blas·sumd(int len, double *x, int incx) -{ - int i; - double sum; - - if (incx == 1) { - i = len; - sum = sumd_kernel16(&i, x); - } else { - i = len; - sum = sumd_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - sum += x[incx * i]; - } - - return sum; -} - -static -double -normd_kernel16(int *ip, double *x) -{ - int i; - double nrm[16]; - int len; - - for (i = 0; i < 16; i++) { - nrm[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - nrm[0] += x[i + 0] * x[i + 0]; - nrm[1] += x[i + 1] * x[i + 1]; - nrm[2] += x[i + 2] * x[i + 2]; - nrm[3] += x[i + 3] * x[i + 3]; - nrm[4] += x[i + 4] * x[i + 4]; - nrm[5] += x[i + 5] * x[i + 5]; - nrm[6] += x[i + 6] * x[i + 6]; - nrm[7] += x[i + 7] * x[i + 7]; - nrm[8] += x[i + 8] * x[i + 8]; - nrm[9] += x[i + 9] * x[i + 9]; - nrm[10] += x[i + 10] * x[i + 10]; - nrm[11] += x[i + 11] * x[i + 11]; - nrm[12] += x[i + 12] * x[i + 12]; - nrm[13] += x[i + 13] * x[i + 13]; - nrm[14] += x[i + 14] * x[i + 14]; - nrm[15] += x[i + 15] * x[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - nrm[0] += nrm[i]; - } - - return nrm[0]; -} - -static -double -normd_s_kernel8(int *ip, int incx, double *x) -{ - double nrm[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - nrm[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)]; - nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)]; - nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)]; - nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)]; - nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)]; - nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)]; - nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)]; - nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - nrm[0] += nrm[i]; - } - - return nrm[0]; -} - -double -blas·normd(int len, double *x, int incx) -{ - int i; - double nrm; - - if (incx == 1) { - i = len; - nrm = normd_kernel16(&i, x); - } else { - i = len; - nrm = normd_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - nrm += x[incx * i] * x[incx * i]; - } - - return math·sqrt(nrm); -} - -static -void -scaled_kernel16(int *ip, double a, double *x) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - x[i + 0] = a * x[i + 0]; - x[i + 1] = a * x[i + 1]; - x[i + 2] = a * x[i + 2]; - x[i + 3] = a * x[i + 3]; - x[i + 4] = a * x[i + 4]; - x[i + 5] = a * x[i + 5]; - x[i + 6] = a * x[i + 6]; - x[i + 7] = a * x[i + 7]; - x[i + 8] = a * x[i + 8]; - x[i + 9] = a * x[i + 9]; - x[i + 10] = a * x[i + 10]; - x[i + 11] = a * x[i + 11]; - x[i + 12] = a * x[i + 12]; - x[i + 13] = a * x[i + 13]; - x[i + 14] = a * x[i + 14]; - x[i + 15] = a * x[i + 15]; - } - *ip = i; -} - -static -void -scaled_s_kernel8(int *ip, double *x, int incx, double a) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - x[incx * (i + 0)] = a * x[incx * (i + 0)]; - x[incx * (i + 1)] = a * x[incx * (i + 1)]; - x[incx * (i + 2)] = a * x[incx * (i + 2)]; - x[incx * (i + 3)] = a * x[incx * (i + 3)]; - x[incx * (i + 4)] = a * x[incx * (i + 4)]; - x[incx * (i + 5)] = a * x[incx * (i + 5)]; - x[incx * (i + 6)] = a * x[incx * (i + 6)]; - x[incx * (i + 7)] = a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·scaled(int len, double *x, int incx, double a) -{ - int i; - - if (incx == 1) { - i = len; - scaled_kernel16(&i, a, x); - } else { - i = len; - scaled_s_kernel8(&i, x, incx, a); - } - for (; i < len; i++) { - x[incx * i] = a * x[incx * i]; - } -} - -static -void -rotd_kernel16(int *ip, double sin, double cos, double *x, double *y) -{ - double tmp[16]; - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0]; - tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1]; - tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2]; - tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3]; - tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4]; - tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5]; - tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6]; - tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7]; - tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8]; - tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9]; - tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10]; - tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11]; - tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12]; - tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13]; - tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14]; - tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15]; - } - *ip = i; -} - -static -void -rotd_s_kernel8(int *ip, double sin, double *x, double *y, double cos, int incy, int incx) -{ - double tmp[8]; - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0]; - tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1]; - tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2]; - tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3]; - tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4]; - tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5]; - tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6]; - tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7]; - } - *ip = i; -} - -void -blas·rotd(int len, double *x, int incx, double *y, int incy, double cos, double sin) -{ - int i; - double tmp; - - if (incx == 1 && incy == 1) { - i = len; - rotd_kernel16(&i, sin, cos, x, y); - } else { - i = len; - rotd_s_kernel8(&i, sin, x, y, cos, incy, incx); - } - for (; i < len; i++) { - tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp; - } -} - -static -void -rotgd_kernel16(int *ip, double H[5], double *y, double *x) -{ - int i; - double tmp[16]; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0]; - tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1]; - tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2]; - tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3]; - tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4]; - tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5]; - tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6]; - tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7]; - tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8]; - tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9]; - tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10]; - tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11]; - tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12]; - tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13]; - tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14]; - tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15]; - } - *ip = i; -} - -static -void -rotgd_s_kernel8(int *ip, double H[5], int incx, double *y, double *x, int incy) -{ - int i; - double tmp[8]; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0]; - tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1]; - tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2]; - tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3]; - tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4]; - tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5]; - tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6]; - tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7]; - } - *ip = i; -} - -void -blas·rotgd(int len, double *x, int incx, double *y, int incy, double H[5]) -{ - int i; - double tmp; - - if (incx == 1 && incy == 1) { - i = len; - rotgd_kernel16(&i, H, y, x); - } else { - i = len; - rotgd_s_kernel8(&i, H, incx, y, x, incy); - } - for (; i < len; i++) { - tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp; - } -} - - +#define UNROLL 8 +#define INT uintptr + +// ----------------------------------------------------------------------- +// Templates + +#include "loop.h" +#define BODY_XY() \ + LOOP(UNROLL, 0, INIT); \ + n = ROUNDBY(len, UNROLL); \ + if (incx == 1 && incy == 1) { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,1,1); \ + } \ + } else { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,incx,incy);\ + } \ + } \ + \ + for (; i < len; i++) { \ + LOOP(1,0,KERNEL,incx,incy); \ + } + +#define BODY_X() \ + LOOP(UNROLL, 0, INIT); \ + n = ROUNDBY(len, UNROLL); \ + if (incx == 1) { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,1); \ + } \ + } else { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,incx); \ + } \ + } \ + \ + for (; i < len; i++) { \ + LOOP(1,0,KERNEL,incx); \ + } + +// ----------------------------------------------------------------------- +// Implementation + +#define FLOAT double +#define func(name) blas·d##name +#include "blas1body" +#undef FLOAT + +#undef FLOAT +#undef func + +#define FLOAT float +#define func(name) blas·f##name +#include "blas1body" +#undef FLOAT -- cgit v1.2.1