From d982e7c2fdebf560ccce193cb98b85d4fac28a45 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 13 May 2020 17:30:19 -0700 Subject: blas 1 generation code complete --- sys/libmath/blas1.c | 2062 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2062 insertions(+) create mode 100644 sys/libmath/blas1.c (limited to 'sys/libmath/blas1.c') diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c new file mode 100644 index 0000000..8cc70eb --- /dev/null +++ b/sys/libmath/blas1.c @@ -0,0 +1,2062 @@ +#include +#include +#include + +/*********************************************************/ +/* THIS CODE IS GENERATED BY GEN1.PY! DON'T EDIT BY HAND */ +/*********************************************************/ + + +static +int +argminf_kernel16(int *ip, float *x) +{ + int ix[16]; + float min[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + ix[i] = 0; + } + for (i = 0; i < 16; i++) { + min[i] = x[0]; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + if (x[i + 0] < min[0]) { + ix[0] = (i + 0); + min[0] = x[ix[0]]; + } + if (x[i + 1] < min[1]) { + ix[1] = (i + 1); + min[1] = x[ix[1]]; + } + if (x[i + 2] < min[2]) { + ix[2] = (i + 2); + min[2] = x[ix[2]]; + } + if (x[i + 3] < min[3]) { + ix[3] = (i + 3); + min[3] = x[ix[3]]; + } + if (x[i + 4] < min[4]) { + ix[4] = (i + 4); + min[4] = x[ix[4]]; + } + if (x[i + 5] < min[5]) { + ix[5] = (i + 5); + min[5] = x[ix[5]]; + } + if (x[i + 6] < min[6]) { + ix[6] = (i + 6); + min[6] = x[ix[6]]; + } + if (x[i + 7] < min[7]) { + ix[7] = (i + 7); + min[7] = x[ix[7]]; + } + if (x[i + 8] < min[8]) { + ix[8] = (i + 8); + min[8] = x[ix[8]]; + } + if (x[i + 9] < min[9]) { + ix[9] = (i + 9); + min[9] = x[ix[9]]; + } + if (x[i + 10] < min[10]) { + ix[10] = (i + 10); + min[10] = x[ix[10]]; + } + if (x[i + 11] < min[11]) { + ix[11] = (i + 11); + min[11] = x[ix[11]]; + } + if (x[i + 12] < min[12]) { + ix[12] = (i + 12); + min[12] = x[ix[12]]; + } + if (x[i + 13] < min[13]) { + ix[13] = (i + 13); + min[13] = x[ix[13]]; + } + if (x[i + 14] < min[14]) { + ix[14] = (i + 14); + min[14] = x[ix[14]]; + } + if (x[i + 15] < min[15]) { + ix[15] = (i + 15); + min[15] = x[ix[15]]; + } + } + *ip = i; + for (i = 1; i < 16; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +static +int +argminf_s_kernel8(int *ip, int incx, float *x) +{ + int ix[8]; + float min[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + ix[i] = 0; + } + for (i = 0; i < 8; i++) { + min[i] = x[0]; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + if (x[incx * (i + 0)] < min[0]) { + ix[0] = (i + 0); + min[0] = x[incx * ix[0]]; + } + if (x[incx * (i + 1)] < min[1]) { + ix[1] = (i + 1); + min[1] = x[incx * ix[1]]; + } + if (x[incx * (i + 2)] < min[2]) { + ix[2] = (i + 2); + min[2] = x[incx * ix[2]]; + } + if (x[incx * (i + 3)] < min[3]) { + ix[3] = (i + 3); + min[3] = x[incx * ix[3]]; + } + if (x[incx * (i + 4)] < min[4]) { + ix[4] = (i + 4); + min[4] = x[incx * ix[4]]; + } + if (x[incx * (i + 5)] < min[5]) { + ix[5] = (i + 5); + min[5] = x[incx * ix[5]]; + } + if (x[incx * (i + 6)] < min[6]) { + ix[6] = (i + 6); + min[6] = x[incx * ix[6]]; + } + if (x[incx * (i + 7)] < min[7]) { + ix[7] = (i + 7); + min[7] = x[incx * ix[7]]; + } + } + *ip = i; + for (i = 1; i < 8; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +int +blas·argminf(int len, float *x, int incx) +{ + int i; + int ix; + float min; + + if (incx == 1) { + i = len; + ix = argminf_kernel16(&i, x); + } else { + i = len; + ix = argminf_s_kernel8(&i, incx, x); + } + min = x[ix]; + for (; i < len; i++) { + if (x[i] < min) { + ix = i; + min = x[ix]; + } + } + + return ix; +} + +static +int +argmaxf_kernel16(int *ip, float *x) +{ + float max[16]; + int i; + int ix[16]; + int len; + + for (i = 0; i < 16; i++) { + max[i] = x[0]; + } + for (i = 0; i < 16; i++) { + ix[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + if (x[i + 0] > max[0]) { + ix[0] = (i + 0); + max[0] = x[ix[0]]; + } + if (x[i + 1] > max[1]) { + ix[1] = (i + 1); + max[1] = x[ix[1]]; + } + if (x[i + 2] > max[2]) { + ix[2] = (i + 2); + max[2] = x[ix[2]]; + } + if (x[i + 3] > max[3]) { + ix[3] = (i + 3); + max[3] = x[ix[3]]; + } + if (x[i + 4] > max[4]) { + ix[4] = (i + 4); + max[4] = x[ix[4]]; + } + if (x[i + 5] > max[5]) { + ix[5] = (i + 5); + max[5] = x[ix[5]]; + } + if (x[i + 6] > max[6]) { + ix[6] = (i + 6); + max[6] = x[ix[6]]; + } + if (x[i + 7] > max[7]) { + ix[7] = (i + 7); + max[7] = x[ix[7]]; + } + if (x[i + 8] > max[8]) { + ix[8] = (i + 8); + max[8] = x[ix[8]]; + } + if (x[i + 9] > max[9]) { + ix[9] = (i + 9); + max[9] = x[ix[9]]; + } + if (x[i + 10] > max[10]) { + ix[10] = (i + 10); + max[10] = x[ix[10]]; + } + if (x[i + 11] > max[11]) { + ix[11] = (i + 11); + max[11] = x[ix[11]]; + } + if (x[i + 12] > max[12]) { + ix[12] = (i + 12); + max[12] = x[ix[12]]; + } + if (x[i + 13] > max[13]) { + ix[13] = (i + 13); + max[13] = x[ix[13]]; + } + if (x[i + 14] > max[14]) { + ix[14] = (i + 14); + max[14] = x[ix[14]]; + } + if (x[i + 15] > max[15]) { + ix[15] = (i + 15); + max[15] = x[ix[15]]; + } + } + *ip = i; + for (i = 1; i < 16; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +static +int +argmaxf_s_kernel8(int *ip, int incx, float *x) +{ + float max[8]; + int i; + int ix[8]; + int len; + + for (i = 0; i < 8; i++) { + max[i] = x[0]; + } + for (i = 0; i < 8; i++) { + ix[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + if (x[incx * (i + 0)] > max[0]) { + ix[0] = (i + 0); + max[0] = x[incx * ix[0]]; + } + if (x[incx * (i + 1)] > max[1]) { + ix[1] = (i + 1); + max[1] = x[incx * ix[1]]; + } + if (x[incx * (i + 2)] > max[2]) { + ix[2] = (i + 2); + max[2] = x[incx * ix[2]]; + } + if (x[incx * (i + 3)] > max[3]) { + ix[3] = (i + 3); + max[3] = x[incx * ix[3]]; + } + if (x[incx * (i + 4)] > max[4]) { + ix[4] = (i + 4); + max[4] = x[incx * ix[4]]; + } + if (x[incx * (i + 5)] > max[5]) { + ix[5] = (i + 5); + max[5] = x[incx * ix[5]]; + } + if (x[incx * (i + 6)] > max[6]) { + ix[6] = (i + 6); + max[6] = x[incx * ix[6]]; + } + if (x[incx * (i + 7)] > max[7]) { + ix[7] = (i + 7); + max[7] = x[incx * ix[7]]; + } + } + *ip = i; + for (i = 1; i < 8; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +int +blas·argmaxf(int len, float *x, int incx) +{ + int i; + int ix; + float max; + + if (incx == 1) { + i = len; + ix = argmaxf_kernel16(&i, x); + } else { + i = len; + ix = argmaxf_s_kernel8(&i, incx, x); + } + max = x[ix]; + for (; i < len; i++) { + if (x[i] > max) { + ix = i; + max = x[ix]; + } + } + + return ix; +} + +static +void +copyf_kernel16(int *ip, float *x, float *y) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = x[i + 0]; + y[i + 1] = x[i + 1]; + y[i + 2] = x[i + 2]; + y[i + 3] = x[i + 3]; + y[i + 4] = x[i + 4]; + y[i + 5] = x[i + 5]; + y[i + 6] = x[i + 6]; + y[i + 7] = x[i + 7]; + y[i + 8] = x[i + 8]; + y[i + 9] = x[i + 9]; + y[i + 10] = x[i + 10]; + y[i + 11] = x[i + 11]; + y[i + 12] = x[i + 12]; + y[i + 13] = x[i + 13]; + y[i + 14] = x[i + 14]; + y[i + 15] = x[i + 15]; + } + *ip = i; +} + +static +void +copyf_s_kernel8(int *ip, int incx, float *x, int incy, float *y) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = x[incx * (i + 0)]; + y[incy * (i + 1)] = x[incx * (i + 1)]; + y[incy * (i + 2)] = x[incx * (i + 2)]; + y[incy * (i + 3)] = x[incx * (i + 3)]; + y[incy * (i + 4)] = x[incx * (i + 4)]; + y[incy * (i + 5)] = x[incx * (i + 5)]; + y[incy * (i + 6)] = x[incx * (i + 6)]; + y[incy * (i + 7)] = x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·copyf(int len, float *x, int incx, float *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + copyf_kernel16(&i, x, y); + } else { + i = len; + copyf_s_kernel8(&i, incx, x, incy, y); + } + for (; i < len; i++) { + y[i] = x[i]; + } +} + +static +void +axpyf_kernel16(int *ip, float a, float *x, float *y) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = y[i + 0] + a * x[i + 0]; + y[i + 1] = y[i + 1] + a * x[i + 1]; + y[i + 2] = y[i + 2] + a * x[i + 2]; + y[i + 3] = y[i + 3] + a * x[i + 3]; + y[i + 4] = y[i + 4] + a * x[i + 4]; + y[i + 5] = y[i + 5] + a * x[i + 5]; + y[i + 6] = y[i + 6] + a * x[i + 6]; + y[i + 7] = y[i + 7] + a * x[i + 7]; + y[i + 8] = y[i + 8] + a * x[i + 8]; + y[i + 9] = y[i + 9] + a * x[i + 9]; + y[i + 10] = y[i + 10] + a * x[i + 10]; + y[i + 11] = y[i + 11] + a * x[i + 11]; + y[i + 12] = y[i + 12] + a * x[i + 12]; + y[i + 13] = y[i + 13] + a * x[i + 13]; + y[i + 14] = y[i + 14] + a * x[i + 14]; + y[i + 15] = y[i + 15] + a * x[i + 15]; + } + *ip = i; +} + +static +void +axpyf_s_kernel8(int *ip, float a, float *x, int incy, float *y, int incx) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)]; + y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)]; + y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)]; + y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)]; + y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)]; + y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)]; + y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)]; + y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·axpyf(int len, float a, float *x, int incx, float *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + axpyf_kernel16(&i, a, x, y); + } else { + i = len; + axpyf_s_kernel8(&i, a, x, incy, y, incx); + } + for (; i < len; i++) { + y[i] = y[i] + a * x[i]; + } +} + +static +void +axpbyf_kernel16(int *ip, float *x, float a, float *y, float b) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = b * y[i + 0] + a * x[i + 0]; + y[i + 1] = b * y[i + 1] + a * x[i + 1]; + y[i + 2] = b * y[i + 2] + a * x[i + 2]; + y[i + 3] = b * y[i + 3] + a * x[i + 3]; + y[i + 4] = b * y[i + 4] + a * x[i + 4]; + y[i + 5] = b * y[i + 5] + a * x[i + 5]; + y[i + 6] = b * y[i + 6] + a * x[i + 6]; + y[i + 7] = b * y[i + 7] + a * x[i + 7]; + y[i + 8] = b * y[i + 8] + a * x[i + 8]; + y[i + 9] = b * y[i + 9] + a * x[i + 9]; + y[i + 10] = b * y[i + 10] + a * x[i + 10]; + y[i + 11] = b * y[i + 11] + a * x[i + 11]; + y[i + 12] = b * y[i + 12] + a * x[i + 12]; + y[i + 13] = b * y[i + 13] + a * x[i + 13]; + y[i + 14] = b * y[i + 14] + a * x[i + 14]; + y[i + 15] = b * y[i + 15] + a * x[i + 15]; + } + *ip = i; +} + +static +void +axpbyf_s_kernel8(int *ip, float *x, float a, float *y, float b, int incx, int incy) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)]; + y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)]; + y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)]; + y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)]; + y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)]; + y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)]; + y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)]; + y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·axpbyf(int len, float a, float *x, int incx, float b, float *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + axpbyf_kernel16(&i, x, a, y, b); + } else { + i = len; + axpbyf_s_kernel8(&i, x, a, y, b, incx, incy); + } + for (; i < len; i++) { + y[i] = b * y[i] + a * x[i]; + } +} + +static +float +dotf_kernel16(int *ip, float *x, float *y) +{ + int i; + float sum[16]; + int len; + + for (i = 0; i < 16; i++) { + sum[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + sum[0] += x[i + 0] * y[i + 0]; + sum[1] += x[i + 1] * y[i + 1]; + sum[2] += x[i + 2] * y[i + 2]; + sum[3] += x[i + 3] * y[i + 3]; + sum[4] += x[i + 4] * y[i + 4]; + sum[5] += x[i + 5] * y[i + 5]; + sum[6] += x[i + 6] * y[i + 6]; + sum[7] += x[i + 7] * y[i + 7]; + sum[8] += x[i + 8] * y[i + 8]; + sum[9] += x[i + 9] * y[i + 9]; + sum[10] += x[i + 10] * y[i + 10]; + sum[11] += x[i + 11] * y[i + 11]; + sum[12] += x[i + 12] * y[i + 12]; + sum[13] += x[i + 13] * y[i + 13]; + sum[14] += x[i + 14] * y[i + 14]; + sum[15] += x[i + 15] * y[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +static +float +dotf_s_kernel8(int *ip, float *x, int incy, float *y, int incx) +{ + int i; + float sum[8]; + int len; + + for (i = 0; i < 8; i++) { + sum[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)]; + sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)]; + sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)]; + sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)]; + sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)]; + sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)]; + sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)]; + sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +float +blas·dotf(int len, float *x, int incx, float *y, int incy) +{ + int i; + float sum; + + if (incx == 1 && incy == 1) { + i = len; + sum = dotf_kernel16(&i, x, y); + } else { + i = len; + sum = dotf_s_kernel8(&i, x, incy, y, incx); + } + for (; i < len; i++) { + sum += x[i] * y[i]; + } + + return sum; +} + +static +float +sumf_kernel16(int *ip, float *x) +{ + int i; + float sum[16]; + int len; + + for (i = 0; i < 16; i++) { + sum[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + sum[0] += x[i + 0]; + sum[1] += x[i + 1]; + sum[2] += x[i + 2]; + sum[3] += x[i + 3]; + sum[4] += x[i + 4]; + sum[5] += x[i + 5]; + sum[6] += x[i + 6]; + sum[7] += x[i + 7]; + sum[8] += x[i + 8]; + sum[9] += x[i + 9]; + sum[10] += x[i + 10]; + sum[11] += x[i + 11]; + sum[12] += x[i + 12]; + sum[13] += x[i + 13]; + sum[14] += x[i + 14]; + sum[15] += x[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +static +float +sumf_s_kernel8(int *ip, float *x, int incx) +{ + float sum[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + sum[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + sum[0] += x[incx * (i + 0)]; + sum[1] += x[incx * (i + 1)]; + sum[2] += x[incx * (i + 2)]; + sum[3] += x[incx * (i + 3)]; + sum[4] += x[incx * (i + 4)]; + sum[5] += x[incx * (i + 5)]; + sum[6] += x[incx * (i + 6)]; + sum[7] += x[incx * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +float +blas·sumf(int len, float *x, int incx) +{ + int i; + float sum; + + if (incx == 1) { + i = len; + sum = sumf_kernel16(&i, x); + } else { + i = len; + sum = sumf_s_kernel8(&i, x, incx); + } + for (; i < len; i++) { + sum += x[i]; + } + + return sum; +} + +static +float +normf_kernel16(int *ip, float *x) +{ + float nrm[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + nrm[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + nrm[0] += x[i + 0] * x[i + 0]; + nrm[1] += x[i + 1] * x[i + 1]; + nrm[2] += x[i + 2] * x[i + 2]; + nrm[3] += x[i + 3] * x[i + 3]; + nrm[4] += x[i + 4] * x[i + 4]; + nrm[5] += x[i + 5] * x[i + 5]; + nrm[6] += x[i + 6] * x[i + 6]; + nrm[7] += x[i + 7] * x[i + 7]; + nrm[8] += x[i + 8] * x[i + 8]; + nrm[9] += x[i + 9] * x[i + 9]; + nrm[10] += x[i + 10] * x[i + 10]; + nrm[11] += x[i + 11] * x[i + 11]; + nrm[12] += x[i + 12] * x[i + 12]; + nrm[13] += x[i + 13] * x[i + 13]; + nrm[14] += x[i + 14] * x[i + 14]; + nrm[15] += x[i + 15] * x[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + nrm[0] += nrm[i]; + } + + return nrm[0]; +} + +static +float +normf_s_kernel8(int *ip, int incx, float *x) +{ + float nrm[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + nrm[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)]; + nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)]; + nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)]; + nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)]; + nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)]; + nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)]; + nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)]; + nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + nrm[0] += nrm[i]; + } + + return nrm[0]; +} + +float +blas·normf(int len, float *x, int incx) +{ + int i; + float nrm; + + if (incx == 1) { + i = len; + nrm = normf_kernel16(&i, x); + } else { + i = len; + nrm = normf_s_kernel8(&i, incx, x); + } + for (; i < len; i++) { + nrm += x[i] * x[i]; + } + + return math·sqrtf(nrm); +} + +static +void +scalef_kernel16(int *ip, float *x, float a) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + x[i + 0] = a * x[i + 0]; + x[i + 1] = a * x[i + 1]; + x[i + 2] = a * x[i + 2]; + x[i + 3] = a * x[i + 3]; + x[i + 4] = a * x[i + 4]; + x[i + 5] = a * x[i + 5]; + x[i + 6] = a * x[i + 6]; + x[i + 7] = a * x[i + 7]; + x[i + 8] = a * x[i + 8]; + x[i + 9] = a * x[i + 9]; + x[i + 10] = a * x[i + 10]; + x[i + 11] = a * x[i + 11]; + x[i + 12] = a * x[i + 12]; + x[i + 13] = a * x[i + 13]; + x[i + 14] = a * x[i + 14]; + x[i + 15] = a * x[i + 15]; + } + *ip = i; +} + +static +void +scalef_s_kernel8(int *ip, int incx, float *x, float a) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + x[incx * (i + 0)] = a * x[incx * (i + 0)]; + x[incx * (i + 1)] = a * x[incx * (i + 1)]; + x[incx * (i + 2)] = a * x[incx * (i + 2)]; + x[incx * (i + 3)] = a * x[incx * (i + 3)]; + x[incx * (i + 4)] = a * x[incx * (i + 4)]; + x[incx * (i + 5)] = a * x[incx * (i + 5)]; + x[incx * (i + 6)] = a * x[incx * (i + 6)]; + x[incx * (i + 7)] = a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·scalef(int len, float *x, int incx, float a) +{ + int i; + + if (incx == 1) { + i = len; + scalef_kernel16(&i, x, a); + } else { + i = len; + scalef_s_kernel8(&i, incx, x, a); + } + for (; i < len; i++) { + x[i] = a * x[i]; + } +} + +static +void +rotf_kernel16(int *ip, float *y, float cos, float sin, float *x) +{ + int i; + float tmp[16]; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0]; + tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1]; + tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2]; + tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3]; + tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4]; + tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5]; + tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6]; + tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7]; + tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8]; + tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9]; + tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10]; + tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11]; + tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12]; + tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13]; + tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14]; + tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15]; + } + *ip = i; +} + +static +void +rotf_s_kernel8(int *ip, float *y, float sin, float cos, float *x, int incx, int incy) +{ + int i; + float tmp[8]; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0]; + tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1]; + tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2]; + tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3]; + tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4]; + tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5]; + tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6]; + tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7]; + } + *ip = i; +} + +void +blas·rotf(int len, float *x, int incx, float *y, int incy, float cos, float sin) +{ + int i; + float tmp; + + if (incx == 1 && incy == 1) { + i = len; + rotf_kernel16(&i, y, cos, sin, x); + } else { + i = len; + rotf_s_kernel8(&i, y, sin, cos, x, incx, incy); + } + for (; i < len; i++) { + tmp = x[i], x[i] = cos * x[i] + sin * y[i], y[i] = cos * y[i] - sin * tmp; + } +} + +static +void +rotgf_kernel16(int *ip, float H[5], float *y, float *x) +{ + float tmp[16]; + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0]; + tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1]; + tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2]; + tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3]; + tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4]; + tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5]; + tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6]; + tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7]; + tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8]; + tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9]; + tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10]; + tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11]; + tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12]; + tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13]; + tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14]; + tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15]; + } + *ip = i; +} + +static +void +rotgf_s_kernel8(int *ip, int incy, int incx, float H[5], float *y, float *x) +{ + float tmp[8]; + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0]; + tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1]; + tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2]; + tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3]; + tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4]; + tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5]; + tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6]; + tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7]; + } + *ip = i; +} + +void +blas·rotgf(int len, float *x, int incx, float *y, int incy, float H[5]) +{ + int i; + float tmp; + + if (incx == 1 && incy == 1) { + i = len; + rotgf_kernel16(&i, H, y, x); + } else { + i = len; + rotgf_s_kernel8(&i, incy, incx, H, y, x); + } + for (; i < len; i++) { + tmp = x[i], x[i] = H[1] * x[i] + H[2] * y[i], y[i] = H[3] * y[i] + H[4] * tmp; + } +} + +static +int +argmind_kernel16(int *ip, double *x) +{ + int ix[16]; + double min[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + ix[i] = 0; + } + for (i = 0; i < 16; i++) { + min[i] = x[0]; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + if (x[i + 0] < min[0]) { + ix[0] = (i + 0); + min[0] = x[ix[0]]; + } + if (x[i + 1] < min[1]) { + ix[1] = (i + 1); + min[1] = x[ix[1]]; + } + if (x[i + 2] < min[2]) { + ix[2] = (i + 2); + min[2] = x[ix[2]]; + } + if (x[i + 3] < min[3]) { + ix[3] = (i + 3); + min[3] = x[ix[3]]; + } + if (x[i + 4] < min[4]) { + ix[4] = (i + 4); + min[4] = x[ix[4]]; + } + if (x[i + 5] < min[5]) { + ix[5] = (i + 5); + min[5] = x[ix[5]]; + } + if (x[i + 6] < min[6]) { + ix[6] = (i + 6); + min[6] = x[ix[6]]; + } + if (x[i + 7] < min[7]) { + ix[7] = (i + 7); + min[7] = x[ix[7]]; + } + if (x[i + 8] < min[8]) { + ix[8] = (i + 8); + min[8] = x[ix[8]]; + } + if (x[i + 9] < min[9]) { + ix[9] = (i + 9); + min[9] = x[ix[9]]; + } + if (x[i + 10] < min[10]) { + ix[10] = (i + 10); + min[10] = x[ix[10]]; + } + if (x[i + 11] < min[11]) { + ix[11] = (i + 11); + min[11] = x[ix[11]]; + } + if (x[i + 12] < min[12]) { + ix[12] = (i + 12); + min[12] = x[ix[12]]; + } + if (x[i + 13] < min[13]) { + ix[13] = (i + 13); + min[13] = x[ix[13]]; + } + if (x[i + 14] < min[14]) { + ix[14] = (i + 14); + min[14] = x[ix[14]]; + } + if (x[i + 15] < min[15]) { + ix[15] = (i + 15); + min[15] = x[ix[15]]; + } + } + *ip = i; + for (i = 1; i < 16; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +static +int +argmind_s_kernel8(int *ip, double *x, int incx) +{ + int ix[8]; + double min[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + ix[i] = 0; + } + for (i = 0; i < 8; i++) { + min[i] = x[0]; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + if (x[incx * (i + 0)] < min[0]) { + ix[0] = (i + 0); + min[0] = x[incx * ix[0]]; + } + if (x[incx * (i + 1)] < min[1]) { + ix[1] = (i + 1); + min[1] = x[incx * ix[1]]; + } + if (x[incx * (i + 2)] < min[2]) { + ix[2] = (i + 2); + min[2] = x[incx * ix[2]]; + } + if (x[incx * (i + 3)] < min[3]) { + ix[3] = (i + 3); + min[3] = x[incx * ix[3]]; + } + if (x[incx * (i + 4)] < min[4]) { + ix[4] = (i + 4); + min[4] = x[incx * ix[4]]; + } + if (x[incx * (i + 5)] < min[5]) { + ix[5] = (i + 5); + min[5] = x[incx * ix[5]]; + } + if (x[incx * (i + 6)] < min[6]) { + ix[6] = (i + 6); + min[6] = x[incx * ix[6]]; + } + if (x[incx * (i + 7)] < min[7]) { + ix[7] = (i + 7); + min[7] = x[incx * ix[7]]; + } + } + *ip = i; + for (i = 1; i < 8; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +int +blas·argmind(int len, double *x, int incx) +{ + int i; + int ix; + double min; + + if (incx == 1) { + i = len; + ix = argmind_kernel16(&i, x); + } else { + i = len; + ix = argmind_s_kernel8(&i, x, incx); + } + min = x[ix]; + for (; i < len; i++) { + if (x[i] < min) { + ix = i; + min = x[ix]; + } + } + + return ix; +} + +static +int +argmaxd_kernel16(int *ip, double *x) +{ + int i; + int ix[16]; + double max[16]; + int len; + + for (i = 0; i < 16; i++) { + ix[i] = 0; + } + for (i = 0; i < 16; i++) { + max[i] = x[0]; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + if (x[i + 0] > max[0]) { + ix[0] = (i + 0); + max[0] = x[ix[0]]; + } + if (x[i + 1] > max[1]) { + ix[1] = (i + 1); + max[1] = x[ix[1]]; + } + if (x[i + 2] > max[2]) { + ix[2] = (i + 2); + max[2] = x[ix[2]]; + } + if (x[i + 3] > max[3]) { + ix[3] = (i + 3); + max[3] = x[ix[3]]; + } + if (x[i + 4] > max[4]) { + ix[4] = (i + 4); + max[4] = x[ix[4]]; + } + if (x[i + 5] > max[5]) { + ix[5] = (i + 5); + max[5] = x[ix[5]]; + } + if (x[i + 6] > max[6]) { + ix[6] = (i + 6); + max[6] = x[ix[6]]; + } + if (x[i + 7] > max[7]) { + ix[7] = (i + 7); + max[7] = x[ix[7]]; + } + if (x[i + 8] > max[8]) { + ix[8] = (i + 8); + max[8] = x[ix[8]]; + } + if (x[i + 9] > max[9]) { + ix[9] = (i + 9); + max[9] = x[ix[9]]; + } + if (x[i + 10] > max[10]) { + ix[10] = (i + 10); + max[10] = x[ix[10]]; + } + if (x[i + 11] > max[11]) { + ix[11] = (i + 11); + max[11] = x[ix[11]]; + } + if (x[i + 12] > max[12]) { + ix[12] = (i + 12); + max[12] = x[ix[12]]; + } + if (x[i + 13] > max[13]) { + ix[13] = (i + 13); + max[13] = x[ix[13]]; + } + if (x[i + 14] > max[14]) { + ix[14] = (i + 14); + max[14] = x[ix[14]]; + } + if (x[i + 15] > max[15]) { + ix[15] = (i + 15); + max[15] = x[ix[15]]; + } + } + *ip = i; + for (i = 1; i < 16; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +static +int +argmaxd_s_kernel8(int *ip, int incx, double *x) +{ + int i; + int ix[8]; + double max[8]; + int len; + + for (i = 0; i < 8; i++) { + ix[i] = 0; + } + for (i = 0; i < 8; i++) { + max[i] = x[0]; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + if (x[incx * (i + 0)] > max[0]) { + ix[0] = (i + 0); + max[0] = x[incx * ix[0]]; + } + if (x[incx * (i + 1)] > max[1]) { + ix[1] = (i + 1); + max[1] = x[incx * ix[1]]; + } + if (x[incx * (i + 2)] > max[2]) { + ix[2] = (i + 2); + max[2] = x[incx * ix[2]]; + } + if (x[incx * (i + 3)] > max[3]) { + ix[3] = (i + 3); + max[3] = x[incx * ix[3]]; + } + if (x[incx * (i + 4)] > max[4]) { + ix[4] = (i + 4); + max[4] = x[incx * ix[4]]; + } + if (x[incx * (i + 5)] > max[5]) { + ix[5] = (i + 5); + max[5] = x[incx * ix[5]]; + } + if (x[incx * (i + 6)] > max[6]) { + ix[6] = (i + 6); + max[6] = x[incx * ix[6]]; + } + if (x[incx * (i + 7)] > max[7]) { + ix[7] = (i + 7); + max[7] = x[incx * ix[7]]; + } + } + *ip = i; + for (i = 1; i < 8; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +int +blas·argmaxd(int len, double *x, int incx) +{ + int i; + int ix; + double max; + + if (incx == 1) { + i = len; + ix = argmaxd_kernel16(&i, x); + } else { + i = len; + ix = argmaxd_s_kernel8(&i, incx, x); + } + max = x[ix]; + for (; i < len; i++) { + if (x[i] > max) { + ix = i; + max = x[ix]; + } + } + + return ix; +} + +static +void +copyd_kernel16(int *ip, double *y, double *x) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = x[i + 0]; + y[i + 1] = x[i + 1]; + y[i + 2] = x[i + 2]; + y[i + 3] = x[i + 3]; + y[i + 4] = x[i + 4]; + y[i + 5] = x[i + 5]; + y[i + 6] = x[i + 6]; + y[i + 7] = x[i + 7]; + y[i + 8] = x[i + 8]; + y[i + 9] = x[i + 9]; + y[i + 10] = x[i + 10]; + y[i + 11] = x[i + 11]; + y[i + 12] = x[i + 12]; + y[i + 13] = x[i + 13]; + y[i + 14] = x[i + 14]; + y[i + 15] = x[i + 15]; + } + *ip = i; +} + +static +void +copyd_s_kernel8(int *ip, double *y, double *x, int incy, int incx) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = x[incx * (i + 0)]; + y[incy * (i + 1)] = x[incx * (i + 1)]; + y[incy * (i + 2)] = x[incx * (i + 2)]; + y[incy * (i + 3)] = x[incx * (i + 3)]; + y[incy * (i + 4)] = x[incx * (i + 4)]; + y[incy * (i + 5)] = x[incx * (i + 5)]; + y[incy * (i + 6)] = x[incx * (i + 6)]; + y[incy * (i + 7)] = x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·copyd(int len, double *x, int incx, double *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + copyd_kernel16(&i, y, x); + } else { + i = len; + copyd_s_kernel8(&i, y, x, incy, incx); + } + for (; i < len; i++) { + y[i] = x[i]; + } +} + +static +void +axpyd_kernel16(int *ip, double a, double *y, double *x) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = y[i + 0] + a * x[i + 0]; + y[i + 1] = y[i + 1] + a * x[i + 1]; + y[i + 2] = y[i + 2] + a * x[i + 2]; + y[i + 3] = y[i + 3] + a * x[i + 3]; + y[i + 4] = y[i + 4] + a * x[i + 4]; + y[i + 5] = y[i + 5] + a * x[i + 5]; + y[i + 6] = y[i + 6] + a * x[i + 6]; + y[i + 7] = y[i + 7] + a * x[i + 7]; + y[i + 8] = y[i + 8] + a * x[i + 8]; + y[i + 9] = y[i + 9] + a * x[i + 9]; + y[i + 10] = y[i + 10] + a * x[i + 10]; + y[i + 11] = y[i + 11] + a * x[i + 11]; + y[i + 12] = y[i + 12] + a * x[i + 12]; + y[i + 13] = y[i + 13] + a * x[i + 13]; + y[i + 14] = y[i + 14] + a * x[i + 14]; + y[i + 15] = y[i + 15] + a * x[i + 15]; + } + *ip = i; +} + +static +void +axpyd_s_kernel8(int *ip, double a, int incx, double *y, double *x, int incy) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)]; + y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)]; + y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)]; + y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)]; + y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)]; + y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)]; + y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)]; + y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·axpyd(int len, double a, double *x, int incx, double *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + axpyd_kernel16(&i, a, y, x); + } else { + i = len; + axpyd_s_kernel8(&i, a, incx, y, x, incy); + } + for (; i < len; i++) { + y[i] = y[i] + a * x[i]; + } +} + +static +void +axpbyd_kernel16(int *ip, double a, double b, double *x, double *y) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = b * y[i + 0] + a * x[i + 0]; + y[i + 1] = b * y[i + 1] + a * x[i + 1]; + y[i + 2] = b * y[i + 2] + a * x[i + 2]; + y[i + 3] = b * y[i + 3] + a * x[i + 3]; + y[i + 4] = b * y[i + 4] + a * x[i + 4]; + y[i + 5] = b * y[i + 5] + a * x[i + 5]; + y[i + 6] = b * y[i + 6] + a * x[i + 6]; + y[i + 7] = b * y[i + 7] + a * x[i + 7]; + y[i + 8] = b * y[i + 8] + a * x[i + 8]; + y[i + 9] = b * y[i + 9] + a * x[i + 9]; + y[i + 10] = b * y[i + 10] + a * x[i + 10]; + y[i + 11] = b * y[i + 11] + a * x[i + 11]; + y[i + 12] = b * y[i + 12] + a * x[i + 12]; + y[i + 13] = b * y[i + 13] + a * x[i + 13]; + y[i + 14] = b * y[i + 14] + a * x[i + 14]; + y[i + 15] = b * y[i + 15] + a * x[i + 15]; + } + *ip = i; +} + +static +void +axpbyd_s_kernel8(int *ip, double a, double b, double *x, int incx, int incy, double *y) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)]; + y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)]; + y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)]; + y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)]; + y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)]; + y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)]; + y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)]; + y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·axpbyd(int len, double a, double *x, int incx, double b, double *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + axpbyd_kernel16(&i, a, b, x, y); + } else { + i = len; + axpbyd_s_kernel8(&i, a, b, x, incx, incy, y); + } + for (; i < len; i++) { + y[i] = b * y[i] + a * x[i]; + } +} + +static +double +dotd_kernel16(int *ip, double *x, double *y) +{ + double sum[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + sum[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + sum[0] += x[i + 0] * y[i + 0]; + sum[1] += x[i + 1] * y[i + 1]; + sum[2] += x[i + 2] * y[i + 2]; + sum[3] += x[i + 3] * y[i + 3]; + sum[4] += x[i + 4] * y[i + 4]; + sum[5] += x[i + 5] * y[i + 5]; + sum[6] += x[i + 6] * y[i + 6]; + sum[7] += x[i + 7] * y[i + 7]; + sum[8] += x[i + 8] * y[i + 8]; + sum[9] += x[i + 9] * y[i + 9]; + sum[10] += x[i + 10] * y[i + 10]; + sum[11] += x[i + 11] * y[i + 11]; + sum[12] += x[i + 12] * y[i + 12]; + sum[13] += x[i + 13] * y[i + 13]; + sum[14] += x[i + 14] * y[i + 14]; + sum[15] += x[i + 15] * y[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +static +double +dotd_s_kernel8(int *ip, int incx, double *x, int incy, double *y) +{ + double sum[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + sum[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)]; + sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)]; + sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)]; + sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)]; + sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)]; + sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)]; + sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)]; + sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +double +blas·dotd(int len, double *x, int incx, double *y, int incy) +{ + int i; + double sum; + + if (incx == 1 && incy == 1) { + i = len; + sum = dotd_kernel16(&i, x, y); + } else { + i = len; + sum = dotd_s_kernel8(&i, incx, x, incy, y); + } + for (; i < len; i++) { + sum += x[i] * y[i]; + } + + return sum; +} + +static +double +sumd_kernel16(int *ip, double *x) +{ + int i; + double sum[16]; + int len; + + for (i = 0; i < 16; i++) { + sum[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + sum[0] += x[i + 0]; + sum[1] += x[i + 1]; + sum[2] += x[i + 2]; + sum[3] += x[i + 3]; + sum[4] += x[i + 4]; + sum[5] += x[i + 5]; + sum[6] += x[i + 6]; + sum[7] += x[i + 7]; + sum[8] += x[i + 8]; + sum[9] += x[i + 9]; + sum[10] += x[i + 10]; + sum[11] += x[i + 11]; + sum[12] += x[i + 12]; + sum[13] += x[i + 13]; + sum[14] += x[i + 14]; + sum[15] += x[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +static +double +sumd_s_kernel8(int *ip, double *x, int incx) +{ + int i; + double sum[8]; + int len; + + for (i = 0; i < 8; i++) { + sum[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + sum[0] += x[incx * (i + 0)]; + sum[1] += x[incx * (i + 1)]; + sum[2] += x[incx * (i + 2)]; + sum[3] += x[incx * (i + 3)]; + sum[4] += x[incx * (i + 4)]; + sum[5] += x[incx * (i + 5)]; + sum[6] += x[incx * (i + 6)]; + sum[7] += x[incx * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +double +blas·sumd(int len, double *x, int incx) +{ + int i; + double sum; + + if (incx == 1) { + i = len; + sum = sumd_kernel16(&i, x); + } else { + i = len; + sum = sumd_s_kernel8(&i, x, incx); + } + for (; i < len; i++) { + sum += x[i]; + } + + return sum; +} + +static +double +normd_kernel16(int *ip, double *x) +{ + double nrm[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + nrm[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + nrm[0] += x[i + 0] * x[i + 0]; + nrm[1] += x[i + 1] * x[i + 1]; + nrm[2] += x[i + 2] * x[i + 2]; + nrm[3] += x[i + 3] * x[i + 3]; + nrm[4] += x[i + 4] * x[i + 4]; + nrm[5] += x[i + 5] * x[i + 5]; + nrm[6] += x[i + 6] * x[i + 6]; + nrm[7] += x[i + 7] * x[i + 7]; + nrm[8] += x[i + 8] * x[i + 8]; + nrm[9] += x[i + 9] * x[i + 9]; + nrm[10] += x[i + 10] * x[i + 10]; + nrm[11] += x[i + 11] * x[i + 11]; + nrm[12] += x[i + 12] * x[i + 12]; + nrm[13] += x[i + 13] * x[i + 13]; + nrm[14] += x[i + 14] * x[i + 14]; + nrm[15] += x[i + 15] * x[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + nrm[0] += nrm[i]; + } + + return nrm[0]; +} + +static +double +normd_s_kernel8(int *ip, int incx, double *x) +{ + int i; + double nrm[8]; + int len; + + for (i = 0; i < 8; i++) { + nrm[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)]; + nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)]; + nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)]; + nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)]; + nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)]; + nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)]; + nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)]; + nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + nrm[0] += nrm[i]; + } + + return nrm[0]; +} + +double +blas·normd(int len, double *x, int incx) +{ + int i; + double nrm; + + if (incx == 1) { + i = len; + nrm = normd_kernel16(&i, x); + } else { + i = len; + nrm = normd_s_kernel8(&i, incx, x); + } + for (; i < len; i++) { + nrm += x[i] * x[i]; + } + + return math·sqrt(nrm); +} + +static +void +scaled_kernel16(int *ip, double a, double *x) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + x[i + 0] = a * x[i + 0]; + x[i + 1] = a * x[i + 1]; + x[i + 2] = a * x[i + 2]; + x[i + 3] = a * x[i + 3]; + x[i + 4] = a * x[i + 4]; + x[i + 5] = a * x[i + 5]; + x[i + 6] = a * x[i + 6]; + x[i + 7] = a * x[i + 7]; + x[i + 8] = a * x[i + 8]; + x[i + 9] = a * x[i + 9]; + x[i + 10] = a * x[i + 10]; + x[i + 11] = a * x[i + 11]; + x[i + 12] = a * x[i + 12]; + x[i + 13] = a * x[i + 13]; + x[i + 14] = a * x[i + 14]; + x[i + 15] = a * x[i + 15]; + } + *ip = i; +} + +static +void +scaled_s_kernel8(int *ip, double a, int incx, double *x) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + x[incx * (i + 0)] = a * x[incx * (i + 0)]; + x[incx * (i + 1)] = a * x[incx * (i + 1)]; + x[incx * (i + 2)] = a * x[incx * (i + 2)]; + x[incx * (i + 3)] = a * x[incx * (i + 3)]; + x[incx * (i + 4)] = a * x[incx * (i + 4)]; + x[incx * (i + 5)] = a * x[incx * (i + 5)]; + x[incx * (i + 6)] = a * x[incx * (i + 6)]; + x[incx * (i + 7)] = a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·scaled(int len, double *x, int incx, double a) +{ + int i; + + if (incx == 1) { + i = len; + scaled_kernel16(&i, a, x); + } else { + i = len; + scaled_s_kernel8(&i, a, incx, x); + } + for (; i < len; i++) { + x[i] = a * x[i]; + } +} + +static +void +rotd_kernel16(int *ip, double *x, double *y, double cos, double sin) +{ + int i; + double tmp[16]; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0]; + tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1]; + tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2]; + tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3]; + tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4]; + tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5]; + tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6]; + tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7]; + tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8]; + tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9]; + tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10]; + tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11]; + tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12]; + tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13]; + tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14]; + tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15]; + } + *ip = i; +} + +static +void +rotd_s_kernel8(int *ip, double *y, int incx, double sin, int incy, double *x, double cos) +{ + int i; + double tmp[8]; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0]; + tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1]; + tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2]; + tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3]; + tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4]; + tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5]; + tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6]; + tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7]; + } + *ip = i; +} + +void +blas·rotd(int len, double *x, int incx, double *y, int incy, double cos, double sin) +{ + int i; + double tmp; + + if (incx == 1 && incy == 1) { + i = len; + rotd_kernel16(&i, x, y, cos, sin); + } else { + i = len; + rotd_s_kernel8(&i, y, incx, sin, incy, x, cos); + } + for (; i < len; i++) { + tmp = x[i], x[i] = cos * x[i] + sin * y[i], y[i] = cos * y[i] - sin * tmp; + } +} + +static +void +rotgd_kernel16(int *ip, double *y, double H[5], double *x) +{ + double tmp[16]; + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0]; + tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1]; + tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2]; + tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3]; + tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4]; + tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5]; + tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6]; + tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7]; + tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8]; + tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9]; + tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10]; + tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11]; + tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12]; + tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13]; + tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14]; + tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15]; + } + *ip = i; +} + +static +void +rotgd_s_kernel8(int *ip, int incx, double *y, int incy, double H[5], double *x) +{ + double tmp[8]; + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0]; + tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1]; + tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2]; + tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3]; + tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4]; + tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5]; + tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6]; + tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7]; + } + *ip = i; +} + +void +blas·rotgd(int len, double *x, int incx, double *y, int incy, double H[5]) +{ + int i; + double tmp; + + if (incx == 1 && incy == 1) { + i = len; + rotgd_kernel16(&i, y, H, x); + } else { + i = len; + rotgd_s_kernel8(&i, incx, y, incy, H, x); + } + for (; i < len; i++) { + tmp = x[i], x[i] = H[1] * x[i] + H[2] * y[i], y[i] = H[3] * y[i] + H[4] * tmp; + } +} + + -- cgit v1.2.1