aboutsummaryrefslogtreecommitdiff
path: root/sys/libmath/blas1.c
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-05-25 15:33:54 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-05-25 15:33:54 -0700
commit5e53685221576ad6ec53bafffd14bc7d5e01fa30 (patch)
tree9780092401ccd8a116fbc33a50756cf2a90985b6 /sys/libmath/blas1.c
parent70148cca0463cc45408401a0f2d76ba805d0a157 (diff)
deprecated old python generation files
Diffstat (limited to 'sys/libmath/blas1.c')
-rw-r--r--sys/libmath/blas1.c2135
1 files changed, 56 insertions, 2079 deletions
diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c
index 1907179..d9792f6 100644
--- a/sys/libmath/blas1.c
+++ b/sys/libmath/blas1.c
@@ -1,2082 +1,59 @@
#include <u.h>
-#include <libn.h>
#include <libmath.h>
-/*********************************************************/
-/* THIS CODE IS GENERATED BY GEN1.PY! DON'T EDIT BY HAND */
-/*********************************************************/
-
-
-static
-int
-argminf_kernel16(int *ip, float *x)
-{
- int ix[16];
- float min[16];
- int i;
- int len;
-
- for (i = 0; i < 16; i++) {
- ix[i] = 0;
- }
- for (i = 0; i < 16; i++) {
- min[i] = x[0];
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- if (x[i + 0] < min[0]) {
- ix[0] = (i + 0);
- min[0] = x[ix[0]];
- }
- if (x[i + 1] < min[1]) {
- ix[1] = (i + 1);
- min[1] = x[ix[1]];
- }
- if (x[i + 2] < min[2]) {
- ix[2] = (i + 2);
- min[2] = x[ix[2]];
- }
- if (x[i + 3] < min[3]) {
- ix[3] = (i + 3);
- min[3] = x[ix[3]];
- }
- if (x[i + 4] < min[4]) {
- ix[4] = (i + 4);
- min[4] = x[ix[4]];
- }
- if (x[i + 5] < min[5]) {
- ix[5] = (i + 5);
- min[5] = x[ix[5]];
- }
- if (x[i + 6] < min[6]) {
- ix[6] = (i + 6);
- min[6] = x[ix[6]];
- }
- if (x[i + 7] < min[7]) {
- ix[7] = (i + 7);
- min[7] = x[ix[7]];
- }
- if (x[i + 8] < min[8]) {
- ix[8] = (i + 8);
- min[8] = x[ix[8]];
- }
- if (x[i + 9] < min[9]) {
- ix[9] = (i + 9);
- min[9] = x[ix[9]];
- }
- if (x[i + 10] < min[10]) {
- ix[10] = (i + 10);
- min[10] = x[ix[10]];
- }
- if (x[i + 11] < min[11]) {
- ix[11] = (i + 11);
- min[11] = x[ix[11]];
- }
- if (x[i + 12] < min[12]) {
- ix[12] = (i + 12);
- min[12] = x[ix[12]];
- }
- if (x[i + 13] < min[13]) {
- ix[13] = (i + 13);
- min[13] = x[ix[13]];
- }
- if (x[i + 14] < min[14]) {
- ix[14] = (i + 14);
- min[14] = x[ix[14]];
- }
- if (x[i + 15] < min[15]) {
- ix[15] = (i + 15);
- min[15] = x[ix[15]];
- }
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- if (x[ix[i]] > x[ix[0]]) {
- ix[0] = ix[i];
- }
- }
-
- return ix[0];
-}
-
-static
-int
-argminf_s_kernel8(int *ip, int incx, float *x)
-{
- int ix[8];
- float min[8];
- int i;
- int len;
-
- for (i = 0; i < 8; i++) {
- ix[i] = 0;
- }
- for (i = 0; i < 8; i++) {
- min[i] = x[0];
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- if (x[incx * (i + 0)] < min[0]) {
- ix[0] = (i + 0);
- min[0] = x[incx * ix[0]];
- }
- if (x[incx * (i + 1)] < min[1]) {
- ix[1] = (i + 1);
- min[1] = x[incx * ix[1]];
- }
- if (x[incx * (i + 2)] < min[2]) {
- ix[2] = (i + 2);
- min[2] = x[incx * ix[2]];
- }
- if (x[incx * (i + 3)] < min[3]) {
- ix[3] = (i + 3);
- min[3] = x[incx * ix[3]];
- }
- if (x[incx * (i + 4)] < min[4]) {
- ix[4] = (i + 4);
- min[4] = x[incx * ix[4]];
- }
- if (x[incx * (i + 5)] < min[5]) {
- ix[5] = (i + 5);
- min[5] = x[incx * ix[5]];
- }
- if (x[incx * (i + 6)] < min[6]) {
- ix[6] = (i + 6);
- min[6] = x[incx * ix[6]];
- }
- if (x[incx * (i + 7)] < min[7]) {
- ix[7] = (i + 7);
- min[7] = x[incx * ix[7]];
- }
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- if (x[ix[i]] > x[ix[0]]) {
- ix[0] = ix[i];
- }
- }
-
- return ix[0];
-}
-
-int
-blas·argminf(int len, float *x, int incx)
-{
- int i;
- int ix;
- float min;
-
- if (incx == 1) {
- i = len;
- ix = argminf_kernel16(&i, x);
- } else {
- i = len;
- ix = argminf_s_kernel8(&i, incx, x);
- }
- for (; i < len; i++) {
- if (x[incx * i] < min) {
- ix = i;
- min = x[incx * ix];
- }
- }
- for (; i < len; i++) {
- if (x[i] < min) {
- ix = i;
- min = x[ix];
- }
- }
-
- return ix;
-}
-
-static
-int
-argmaxf_kernel16(int *ip, float *x)
-{
- int ix[16];
- float max[16];
- int i;
- int len;
-
- for (i = 0; i < 16; i++) {
- ix[i] = 0;
- }
- for (i = 0; i < 16; i++) {
- max[i] = x[0];
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- if (x[i + 0] > max[0]) {
- ix[0] = (i + 0);
- max[0] = x[ix[0]];
- }
- if (x[i + 1] > max[1]) {
- ix[1] = (i + 1);
- max[1] = x[ix[1]];
- }
- if (x[i + 2] > max[2]) {
- ix[2] = (i + 2);
- max[2] = x[ix[2]];
- }
- if (x[i + 3] > max[3]) {
- ix[3] = (i + 3);
- max[3] = x[ix[3]];
- }
- if (x[i + 4] > max[4]) {
- ix[4] = (i + 4);
- max[4] = x[ix[4]];
- }
- if (x[i + 5] > max[5]) {
- ix[5] = (i + 5);
- max[5] = x[ix[5]];
- }
- if (x[i + 6] > max[6]) {
- ix[6] = (i + 6);
- max[6] = x[ix[6]];
- }
- if (x[i + 7] > max[7]) {
- ix[7] = (i + 7);
- max[7] = x[ix[7]];
- }
- if (x[i + 8] > max[8]) {
- ix[8] = (i + 8);
- max[8] = x[ix[8]];
- }
- if (x[i + 9] > max[9]) {
- ix[9] = (i + 9);
- max[9] = x[ix[9]];
- }
- if (x[i + 10] > max[10]) {
- ix[10] = (i + 10);
- max[10] = x[ix[10]];
- }
- if (x[i + 11] > max[11]) {
- ix[11] = (i + 11);
- max[11] = x[ix[11]];
- }
- if (x[i + 12] > max[12]) {
- ix[12] = (i + 12);
- max[12] = x[ix[12]];
- }
- if (x[i + 13] > max[13]) {
- ix[13] = (i + 13);
- max[13] = x[ix[13]];
- }
- if (x[i + 14] > max[14]) {
- ix[14] = (i + 14);
- max[14] = x[ix[14]];
- }
- if (x[i + 15] > max[15]) {
- ix[15] = (i + 15);
- max[15] = x[ix[15]];
- }
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- if (x[ix[i]] > x[ix[0]]) {
- ix[0] = ix[i];
- }
- }
-
- return ix[0];
-}
-
-static
-int
-argmaxf_s_kernel8(int *ip, int incx, float *x)
-{
- int ix[8];
- float max[8];
- int i;
- int len;
-
- for (i = 0; i < 8; i++) {
- ix[i] = 0;
- }
- for (i = 0; i < 8; i++) {
- max[i] = x[0];
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- if (x[incx * (i + 0)] > max[0]) {
- ix[0] = (i + 0);
- max[0] = x[incx * ix[0]];
- }
- if (x[incx * (i + 1)] > max[1]) {
- ix[1] = (i + 1);
- max[1] = x[incx * ix[1]];
- }
- if (x[incx * (i + 2)] > max[2]) {
- ix[2] = (i + 2);
- max[2] = x[incx * ix[2]];
- }
- if (x[incx * (i + 3)] > max[3]) {
- ix[3] = (i + 3);
- max[3] = x[incx * ix[3]];
- }
- if (x[incx * (i + 4)] > max[4]) {
- ix[4] = (i + 4);
- max[4] = x[incx * ix[4]];
- }
- if (x[incx * (i + 5)] > max[5]) {
- ix[5] = (i + 5);
- max[5] = x[incx * ix[5]];
- }
- if (x[incx * (i + 6)] > max[6]) {
- ix[6] = (i + 6);
- max[6] = x[incx * ix[6]];
- }
- if (x[incx * (i + 7)] > max[7]) {
- ix[7] = (i + 7);
- max[7] = x[incx * ix[7]];
- }
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- if (x[ix[i]] > x[ix[0]]) {
- ix[0] = ix[i];
- }
- }
-
- return ix[0];
-}
-
-int
-blas·argmaxf(int len, float *x, int incx)
-{
- int i;
- int ix;
- float max;
-
- if (incx == 1) {
- i = len;
- ix = argmaxf_kernel16(&i, x);
- } else {
- i = len;
- ix = argmaxf_s_kernel8(&i, incx, x);
- }
- for (; i < len; i++) {
- if (x[incx * i] > max) {
- ix = i;
- max = x[incx * ix];
- }
- }
- for (; i < len; i++) {
- if (x[i] > max) {
- ix = i;
- max = x[ix];
- }
- }
-
- return ix;
-}
-
-static
-void
-copyf_kernel16(int *ip, float *x, float *y)
-{
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- y[i + 0] = x[i + 0];
- y[i + 1] = x[i + 1];
- y[i + 2] = x[i + 2];
- y[i + 3] = x[i + 3];
- y[i + 4] = x[i + 4];
- y[i + 5] = x[i + 5];
- y[i + 6] = x[i + 6];
- y[i + 7] = x[i + 7];
- y[i + 8] = x[i + 8];
- y[i + 9] = x[i + 9];
- y[i + 10] = x[i + 10];
- y[i + 11] = x[i + 11];
- y[i + 12] = x[i + 12];
- y[i + 13] = x[i + 13];
- y[i + 14] = x[i + 14];
- y[i + 15] = x[i + 15];
- }
- *ip = i;
-}
-
-static
-void
-copyf_s_kernel8(int *ip, int incx, int incy, float *x, float *y)
-{
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- y[incy * (i + 0)] = x[incx * (i + 0)];
- y[incy * (i + 1)] = x[incx * (i + 1)];
- y[incy * (i + 2)] = x[incx * (i + 2)];
- y[incy * (i + 3)] = x[incx * (i + 3)];
- y[incy * (i + 4)] = x[incx * (i + 4)];
- y[incy * (i + 5)] = x[incx * (i + 5)];
- y[incy * (i + 6)] = x[incx * (i + 6)];
- y[incy * (i + 7)] = x[incx * (i + 7)];
- }
- *ip = i;
-}
-
-void
-blas·copyf(int len, float *x, int incx, float *y, int incy)
-{
- int i;
-
- if (incx == 1 && incy == 1) {
- i = len;
- copyf_kernel16(&i, x, y);
- } else {
- i = len;
- copyf_s_kernel8(&i, incx, incy, x, y);
- }
- for (; i < len; i++) {
- y[incy * i] = x[incx * i];
- }
-}
-
-static
-void
-axpyf_kernel16(int *ip, float *x, float a, float *y)
-{
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- y[i + 0] = y[i + 0] + a * x[i + 0];
- y[i + 1] = y[i + 1] + a * x[i + 1];
- y[i + 2] = y[i + 2] + a * x[i + 2];
- y[i + 3] = y[i + 3] + a * x[i + 3];
- y[i + 4] = y[i + 4] + a * x[i + 4];
- y[i + 5] = y[i + 5] + a * x[i + 5];
- y[i + 6] = y[i + 6] + a * x[i + 6];
- y[i + 7] = y[i + 7] + a * x[i + 7];
- y[i + 8] = y[i + 8] + a * x[i + 8];
- y[i + 9] = y[i + 9] + a * x[i + 9];
- y[i + 10] = y[i + 10] + a * x[i + 10];
- y[i + 11] = y[i + 11] + a * x[i + 11];
- y[i + 12] = y[i + 12] + a * x[i + 12];
- y[i + 13] = y[i + 13] + a * x[i + 13];
- y[i + 14] = y[i + 14] + a * x[i + 14];
- y[i + 15] = y[i + 15] + a * x[i + 15];
- }
- *ip = i;
-}
-
-static
-void
-axpyf_s_kernel8(int *ip, float *x, float a, int incx, float *y, int incy)
-{
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)];
- y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)];
- y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)];
- y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)];
- y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)];
- y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)];
- y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)];
- y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)];
- }
- *ip = i;
-}
-
-void
-blas·axpyf(int len, float a, float *x, int incx, float *y, int incy)
-{
- int i;
-
- if (incx == 1 && incy == 1) {
- i = len;
- axpyf_kernel16(&i, x, a, y);
- } else {
- i = len;
- axpyf_s_kernel8(&i, x, a, incx, y, incy);
- }
- for (; i < len; i++) {
- y[incy * i] = y[incy * i] + a * x[incx * i];
- }
-}
-
-static
-void
-axpbyf_kernel16(int *ip, float a, float *y, float b, float *x)
-{
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- y[i + 0] = b * y[i + 0] + a * x[i + 0];
- y[i + 1] = b * y[i + 1] + a * x[i + 1];
- y[i + 2] = b * y[i + 2] + a * x[i + 2];
- y[i + 3] = b * y[i + 3] + a * x[i + 3];
- y[i + 4] = b * y[i + 4] + a * x[i + 4];
- y[i + 5] = b * y[i + 5] + a * x[i + 5];
- y[i + 6] = b * y[i + 6] + a * x[i + 6];
- y[i + 7] = b * y[i + 7] + a * x[i + 7];
- y[i + 8] = b * y[i + 8] + a * x[i + 8];
- y[i + 9] = b * y[i + 9] + a * x[i + 9];
- y[i + 10] = b * y[i + 10] + a * x[i + 10];
- y[i + 11] = b * y[i + 11] + a * x[i + 11];
- y[i + 12] = b * y[i + 12] + a * x[i + 12];
- y[i + 13] = b * y[i + 13] + a * x[i + 13];
- y[i + 14] = b * y[i + 14] + a * x[i + 14];
- y[i + 15] = b * y[i + 15] + a * x[i + 15];
- }
- *ip = i;
-}
-
-static
-void
-axpbyf_s_kernel8(int *ip, float a, int incx, float *y, float b, int incy, float *x)
-{
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)];
- y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)];
- y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)];
- y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)];
- y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)];
- y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)];
- y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)];
- y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)];
- }
- *ip = i;
-}
-
-void
-blas·axpbyf(int len, float a, float *x, int incx, float b, float *y, int incy)
-{
- int i;
-
- if (incx == 1 && incy == 1) {
- i = len;
- axpbyf_kernel16(&i, a, y, b, x);
- } else {
- i = len;
- axpbyf_s_kernel8(&i, a, incx, y, b, incy, x);
- }
- for (; i < len; i++) {
- y[incy * i] = b * y[incy * i] + a * x[incx * i];
- }
-}
-
-static
-float
-dotf_kernel16(int *ip, float *y, float *x)
-{
- float sum[16];
- int i;
- int len;
-
- for (i = 0; i < 16; i++) {
- sum[i] = 0;
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- sum[0] += x[i + 0] * y[i + 0];
- sum[1] += x[i + 1] * y[i + 1];
- sum[2] += x[i + 2] * y[i + 2];
- sum[3] += x[i + 3] * y[i + 3];
- sum[4] += x[i + 4] * y[i + 4];
- sum[5] += x[i + 5] * y[i + 5];
- sum[6] += x[i + 6] * y[i + 6];
- sum[7] += x[i + 7] * y[i + 7];
- sum[8] += x[i + 8] * y[i + 8];
- sum[9] += x[i + 9] * y[i + 9];
- sum[10] += x[i + 10] * y[i + 10];
- sum[11] += x[i + 11] * y[i + 11];
- sum[12] += x[i + 12] * y[i + 12];
- sum[13] += x[i + 13] * y[i + 13];
- sum[14] += x[i + 14] * y[i + 14];
- sum[15] += x[i + 15] * y[i + 15];
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- sum[0] += sum[i];
- }
-
- return sum[0];
-}
-
-static
-float
-dotf_s_kernel8(int *ip, int incy, float *x, float *y, int incx)
-{
- float sum[8];
- int i;
- int len;
-
- for (i = 0; i < 8; i++) {
- sum[i] = 0;
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)];
- sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)];
- sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)];
- sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)];
- sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)];
- sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)];
- sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)];
- sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)];
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- sum[0] += sum[i];
- }
-
- return sum[0];
-}
-
-float
-blas·dotf(int len, float *x, int incx, float *y, int incy)
-{
- int i;
- float sum;
-
- if (incx == 1 && incy == 1) {
- i = len;
- sum = dotf_kernel16(&i, y, x);
- } else {
- i = len;
- sum = dotf_s_kernel8(&i, incy, x, y, incx);
- }
- for (; i < len; i++) {
- sum += x[incx * i] * y[incy * i];
- }
-
- return sum;
-}
-
-static
-float
-sumf_kernel16(int *ip, float *x)
-{
- float sum[16];
- int i;
- int len;
-
- for (i = 0; i < 16; i++) {
- sum[i] = 0;
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- sum[0] += x[i + 0];
- sum[1] += x[i + 1];
- sum[2] += x[i + 2];
- sum[3] += x[i + 3];
- sum[4] += x[i + 4];
- sum[5] += x[i + 5];
- sum[6] += x[i + 6];
- sum[7] += x[i + 7];
- sum[8] += x[i + 8];
- sum[9] += x[i + 9];
- sum[10] += x[i + 10];
- sum[11] += x[i + 11];
- sum[12] += x[i + 12];
- sum[13] += x[i + 13];
- sum[14] += x[i + 14];
- sum[15] += x[i + 15];
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- sum[0] += sum[i];
- }
-
- return sum[0];
-}
-
-static
-float
-sumf_s_kernel8(int *ip, int incx, float *x)
-{
- float sum[8];
- int i;
- int len;
-
- for (i = 0; i < 8; i++) {
- sum[i] = 0;
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- sum[0] += x[incx * (i + 0)];
- sum[1] += x[incx * (i + 1)];
- sum[2] += x[incx * (i + 2)];
- sum[3] += x[incx * (i + 3)];
- sum[4] += x[incx * (i + 4)];
- sum[5] += x[incx * (i + 5)];
- sum[6] += x[incx * (i + 6)];
- sum[7] += x[incx * (i + 7)];
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- sum[0] += sum[i];
- }
-
- return sum[0];
-}
-
-float
-blas·sumf(int len, float *x, int incx)
-{
- int i;
- float sum;
-
- if (incx == 1) {
- i = len;
- sum = sumf_kernel16(&i, x);
- } else {
- i = len;
- sum = sumf_s_kernel8(&i, incx, x);
- }
- for (; i < len; i++) {
- sum += x[incx * i];
- }
-
- return sum;
-}
-
-static
-float
-normf_kernel16(int *ip, float *x)
-{
- int i;
- float nrm[16];
- int len;
-
- for (i = 0; i < 16; i++) {
- nrm[i] = 0;
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- nrm[0] += x[i + 0] * x[i + 0];
- nrm[1] += x[i + 1] * x[i + 1];
- nrm[2] += x[i + 2] * x[i + 2];
- nrm[3] += x[i + 3] * x[i + 3];
- nrm[4] += x[i + 4] * x[i + 4];
- nrm[5] += x[i + 5] * x[i + 5];
- nrm[6] += x[i + 6] * x[i + 6];
- nrm[7] += x[i + 7] * x[i + 7];
- nrm[8] += x[i + 8] * x[i + 8];
- nrm[9] += x[i + 9] * x[i + 9];
- nrm[10] += x[i + 10] * x[i + 10];
- nrm[11] += x[i + 11] * x[i + 11];
- nrm[12] += x[i + 12] * x[i + 12];
- nrm[13] += x[i + 13] * x[i + 13];
- nrm[14] += x[i + 14] * x[i + 14];
- nrm[15] += x[i + 15] * x[i + 15];
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- nrm[0] += nrm[i];
- }
-
- return nrm[0];
-}
-
-static
-float
-normf_s_kernel8(int *ip, int incx, float *x)
-{
- int i;
- float nrm[8];
- int len;
-
- for (i = 0; i < 8; i++) {
- nrm[i] = 0;
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)];
- nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)];
- nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)];
- nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)];
- nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)];
- nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)];
- nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)];
- nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)];
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- nrm[0] += nrm[i];
- }
-
- return nrm[0];
-}
-
-float
-blas·normf(int len, float *x, int incx)
-{
- int i;
- float nrm;
-
- if (incx == 1) {
- i = len;
- nrm = normf_kernel16(&i, x);
- } else {
- i = len;
- nrm = normf_s_kernel8(&i, incx, x);
- }
- for (; i < len; i++) {
- nrm += x[incx * i] * x[incx * i];
- }
-
- return math·sqrtf(nrm);
-}
-
-static
-void
-scalef_kernel16(int *ip, float *x, float a)
-{
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- x[i + 0] = a * x[i + 0];
- x[i + 1] = a * x[i + 1];
- x[i + 2] = a * x[i + 2];
- x[i + 3] = a * x[i + 3];
- x[i + 4] = a * x[i + 4];
- x[i + 5] = a * x[i + 5];
- x[i + 6] = a * x[i + 6];
- x[i + 7] = a * x[i + 7];
- x[i + 8] = a * x[i + 8];
- x[i + 9] = a * x[i + 9];
- x[i + 10] = a * x[i + 10];
- x[i + 11] = a * x[i + 11];
- x[i + 12] = a * x[i + 12];
- x[i + 13] = a * x[i + 13];
- x[i + 14] = a * x[i + 14];
- x[i + 15] = a * x[i + 15];
- }
- *ip = i;
-}
-
-static
-void
-scalef_s_kernel8(int *ip, float *x, int incx, float a)
-{
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- x[incx * (i + 0)] = a * x[incx * (i + 0)];
- x[incx * (i + 1)] = a * x[incx * (i + 1)];
- x[incx * (i + 2)] = a * x[incx * (i + 2)];
- x[incx * (i + 3)] = a * x[incx * (i + 3)];
- x[incx * (i + 4)] = a * x[incx * (i + 4)];
- x[incx * (i + 5)] = a * x[incx * (i + 5)];
- x[incx * (i + 6)] = a * x[incx * (i + 6)];
- x[incx * (i + 7)] = a * x[incx * (i + 7)];
- }
- *ip = i;
-}
-
-void
-blas·scalef(int len, float *x, int incx, float a)
-{
- int i;
-
- if (incx == 1) {
- i = len;
- scalef_kernel16(&i, x, a);
- } else {
- i = len;
- scalef_s_kernel8(&i, x, incx, a);
- }
- for (; i < len; i++) {
- x[incx * i] = a * x[incx * i];
- }
-}
-
-static
-void
-rotf_kernel16(int *ip, float sin, float *x, float cos, float *y)
-{
- int i;
- float tmp[16];
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0];
- tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1];
- tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2];
- tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3];
- tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4];
- tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5];
- tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6];
- tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7];
- tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8];
- tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9];
- tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10];
- tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11];
- tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12];
- tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13];
- tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14];
- tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15];
- }
- *ip = i;
-}
-
-static
-void
-rotf_s_kernel8(int *ip, int incy, float cos, float sin, int incx, float *x, float *y)
-{
- int i;
- float tmp[8];
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0];
- tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1];
- tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2];
- tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3];
- tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4];
- tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5];
- tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6];
- tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7];
- }
- *ip = i;
-}
-
-void
-blas·rotf(int len, float *x, int incx, float *y, int incy, float cos, float sin)
-{
- int i;
- float tmp;
-
- if (incx == 1 && incy == 1) {
- i = len;
- rotf_kernel16(&i, sin, x, cos, y);
- } else {
- i = len;
- rotf_s_kernel8(&i, incy, cos, sin, incx, x, y);
- }
- for (; i < len; i++) {
- tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp;
- }
-}
-
-static
-void
-rotgf_kernel16(int *ip, float *x, float H[5], float *y)
-{
- float tmp[16];
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0];
- tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1];
- tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2];
- tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3];
- tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4];
- tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5];
- tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6];
- tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7];
- tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8];
- tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9];
- tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10];
- tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11];
- tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12];
- tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13];
- tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14];
- tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15];
- }
- *ip = i;
-}
-
-static
-void
-rotgf_s_kernel8(int *ip, int incx, int incy, float *x, float H[5], float *y)
-{
- float tmp[8];
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0];
- tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1];
- tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2];
- tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3];
- tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4];
- tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5];
- tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6];
- tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7];
- }
- *ip = i;
-}
-
-void
-blas·rotgf(int len, float *x, int incx, float *y, int incy, float H[5])
-{
- int i;
- float tmp;
-
- if (incx == 1 && incy == 1) {
- i = len;
- rotgf_kernel16(&i, x, H, y);
- } else {
- i = len;
- rotgf_s_kernel8(&i, incx, incy, x, H, y);
- }
- for (; i < len; i++) {
- tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp;
- }
-}
-
-static
-int
-argmind_kernel16(int *ip, double *x)
-{
- double min[16];
- int i;
- int ix[16];
- int len;
-
- for (i = 0; i < 16; i++) {
- min[i] = x[0];
- }
- for (i = 0; i < 16; i++) {
- ix[i] = 0;
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- if (x[i + 0] < min[0]) {
- ix[0] = (i + 0);
- min[0] = x[ix[0]];
- }
- if (x[i + 1] < min[1]) {
- ix[1] = (i + 1);
- min[1] = x[ix[1]];
- }
- if (x[i + 2] < min[2]) {
- ix[2] = (i + 2);
- min[2] = x[ix[2]];
- }
- if (x[i + 3] < min[3]) {
- ix[3] = (i + 3);
- min[3] = x[ix[3]];
- }
- if (x[i + 4] < min[4]) {
- ix[4] = (i + 4);
- min[4] = x[ix[4]];
- }
- if (x[i + 5] < min[5]) {
- ix[5] = (i + 5);
- min[5] = x[ix[5]];
- }
- if (x[i + 6] < min[6]) {
- ix[6] = (i + 6);
- min[6] = x[ix[6]];
- }
- if (x[i + 7] < min[7]) {
- ix[7] = (i + 7);
- min[7] = x[ix[7]];
- }
- if (x[i + 8] < min[8]) {
- ix[8] = (i + 8);
- min[8] = x[ix[8]];
- }
- if (x[i + 9] < min[9]) {
- ix[9] = (i + 9);
- min[9] = x[ix[9]];
- }
- if (x[i + 10] < min[10]) {
- ix[10] = (i + 10);
- min[10] = x[ix[10]];
- }
- if (x[i + 11] < min[11]) {
- ix[11] = (i + 11);
- min[11] = x[ix[11]];
- }
- if (x[i + 12] < min[12]) {
- ix[12] = (i + 12);
- min[12] = x[ix[12]];
- }
- if (x[i + 13] < min[13]) {
- ix[13] = (i + 13);
- min[13] = x[ix[13]];
- }
- if (x[i + 14] < min[14]) {
- ix[14] = (i + 14);
- min[14] = x[ix[14]];
- }
- if (x[i + 15] < min[15]) {
- ix[15] = (i + 15);
- min[15] = x[ix[15]];
- }
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- if (x[ix[i]] > x[ix[0]]) {
- ix[0] = ix[i];
- }
- }
-
- return ix[0];
-}
-
-static
-int
-argmind_s_kernel8(int *ip, double *x, int incx)
-{
- double min[8];
- int i;
- int ix[8];
- int len;
-
- for (i = 0; i < 8; i++) {
- min[i] = x[0];
- }
- for (i = 0; i < 8; i++) {
- ix[i] = 0;
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- if (x[incx * (i + 0)] < min[0]) {
- ix[0] = (i + 0);
- min[0] = x[incx * ix[0]];
- }
- if (x[incx * (i + 1)] < min[1]) {
- ix[1] = (i + 1);
- min[1] = x[incx * ix[1]];
- }
- if (x[incx * (i + 2)] < min[2]) {
- ix[2] = (i + 2);
- min[2] = x[incx * ix[2]];
- }
- if (x[incx * (i + 3)] < min[3]) {
- ix[3] = (i + 3);
- min[3] = x[incx * ix[3]];
- }
- if (x[incx * (i + 4)] < min[4]) {
- ix[4] = (i + 4);
- min[4] = x[incx * ix[4]];
- }
- if (x[incx * (i + 5)] < min[5]) {
- ix[5] = (i + 5);
- min[5] = x[incx * ix[5]];
- }
- if (x[incx * (i + 6)] < min[6]) {
- ix[6] = (i + 6);
- min[6] = x[incx * ix[6]];
- }
- if (x[incx * (i + 7)] < min[7]) {
- ix[7] = (i + 7);
- min[7] = x[incx * ix[7]];
- }
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- if (x[ix[i]] > x[ix[0]]) {
- ix[0] = ix[i];
- }
- }
-
- return ix[0];
-}
-
-int
-blas·argmind(int len, double *x, int incx)
-{
- int i;
- int ix;
- double min;
-
- if (incx == 1) {
- i = len;
- ix = argmind_kernel16(&i, x);
- } else {
- i = len;
- ix = argmind_s_kernel8(&i, x, incx);
- }
- for (; i < len; i++) {
- if (x[incx * i] < min) {
- ix = i;
- min = x[incx * ix];
- }
- }
- for (; i < len; i++) {
- if (x[i] < min) {
- ix = i;
- min = x[ix];
- }
- }
-
- return ix;
-}
-
-static
-int
-argmaxd_kernel16(int *ip, double *x)
-{
- int ix[16];
- int i;
- double max[16];
- int len;
-
- for (i = 0; i < 16; i++) {
- ix[i] = 0;
- }
- for (i = 0; i < 16; i++) {
- max[i] = x[0];
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- if (x[i + 0] > max[0]) {
- ix[0] = (i + 0);
- max[0] = x[ix[0]];
- }
- if (x[i + 1] > max[1]) {
- ix[1] = (i + 1);
- max[1] = x[ix[1]];
- }
- if (x[i + 2] > max[2]) {
- ix[2] = (i + 2);
- max[2] = x[ix[2]];
- }
- if (x[i + 3] > max[3]) {
- ix[3] = (i + 3);
- max[3] = x[ix[3]];
- }
- if (x[i + 4] > max[4]) {
- ix[4] = (i + 4);
- max[4] = x[ix[4]];
- }
- if (x[i + 5] > max[5]) {
- ix[5] = (i + 5);
- max[5] = x[ix[5]];
- }
- if (x[i + 6] > max[6]) {
- ix[6] = (i + 6);
- max[6] = x[ix[6]];
- }
- if (x[i + 7] > max[7]) {
- ix[7] = (i + 7);
- max[7] = x[ix[7]];
- }
- if (x[i + 8] > max[8]) {
- ix[8] = (i + 8);
- max[8] = x[ix[8]];
- }
- if (x[i + 9] > max[9]) {
- ix[9] = (i + 9);
- max[9] = x[ix[9]];
- }
- if (x[i + 10] > max[10]) {
- ix[10] = (i + 10);
- max[10] = x[ix[10]];
- }
- if (x[i + 11] > max[11]) {
- ix[11] = (i + 11);
- max[11] = x[ix[11]];
- }
- if (x[i + 12] > max[12]) {
- ix[12] = (i + 12);
- max[12] = x[ix[12]];
- }
- if (x[i + 13] > max[13]) {
- ix[13] = (i + 13);
- max[13] = x[ix[13]];
- }
- if (x[i + 14] > max[14]) {
- ix[14] = (i + 14);
- max[14] = x[ix[14]];
- }
- if (x[i + 15] > max[15]) {
- ix[15] = (i + 15);
- max[15] = x[ix[15]];
- }
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- if (x[ix[i]] > x[ix[0]]) {
- ix[0] = ix[i];
- }
- }
-
- return ix[0];
-}
-
-static
-int
-argmaxd_s_kernel8(int *ip, int incx, double *x)
-{
- int ix[8];
- int i;
- double max[8];
- int len;
-
- for (i = 0; i < 8; i++) {
- ix[i] = 0;
- }
- for (i = 0; i < 8; i++) {
- max[i] = x[0];
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- if (x[incx * (i + 0)] > max[0]) {
- ix[0] = (i + 0);
- max[0] = x[incx * ix[0]];
- }
- if (x[incx * (i + 1)] > max[1]) {
- ix[1] = (i + 1);
- max[1] = x[incx * ix[1]];
- }
- if (x[incx * (i + 2)] > max[2]) {
- ix[2] = (i + 2);
- max[2] = x[incx * ix[2]];
- }
- if (x[incx * (i + 3)] > max[3]) {
- ix[3] = (i + 3);
- max[3] = x[incx * ix[3]];
- }
- if (x[incx * (i + 4)] > max[4]) {
- ix[4] = (i + 4);
- max[4] = x[incx * ix[4]];
- }
- if (x[incx * (i + 5)] > max[5]) {
- ix[5] = (i + 5);
- max[5] = x[incx * ix[5]];
- }
- if (x[incx * (i + 6)] > max[6]) {
- ix[6] = (i + 6);
- max[6] = x[incx * ix[6]];
- }
- if (x[incx * (i + 7)] > max[7]) {
- ix[7] = (i + 7);
- max[7] = x[incx * ix[7]];
- }
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- if (x[ix[i]] > x[ix[0]]) {
- ix[0] = ix[i];
- }
- }
-
- return ix[0];
-}
-
-int
-blas·argmaxd(int len, double *x, int incx)
-{
- int i;
- int ix;
- double max;
-
- if (incx == 1) {
- i = len;
- ix = argmaxd_kernel16(&i, x);
- } else {
- i = len;
- ix = argmaxd_s_kernel8(&i, incx, x);
- }
- for (; i < len; i++) {
- if (x[incx * i] > max) {
- ix = i;
- max = x[incx * ix];
- }
- }
- for (; i < len; i++) {
- if (x[i] > max) {
- ix = i;
- max = x[ix];
- }
- }
-
- return ix;
-}
-
-static
-void
-copyd_kernel16(int *ip, double *y, double *x)
-{
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- y[i + 0] = x[i + 0];
- y[i + 1] = x[i + 1];
- y[i + 2] = x[i + 2];
- y[i + 3] = x[i + 3];
- y[i + 4] = x[i + 4];
- y[i + 5] = x[i + 5];
- y[i + 6] = x[i + 6];
- y[i + 7] = x[i + 7];
- y[i + 8] = x[i + 8];
- y[i + 9] = x[i + 9];
- y[i + 10] = x[i + 10];
- y[i + 11] = x[i + 11];
- y[i + 12] = x[i + 12];
- y[i + 13] = x[i + 13];
- y[i + 14] = x[i + 14];
- y[i + 15] = x[i + 15];
- }
- *ip = i;
-}
-
-static
-void
-copyd_s_kernel8(int *ip, double *y, int incx, int incy, double *x)
-{
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- y[incy * (i + 0)] = x[incx * (i + 0)];
- y[incy * (i + 1)] = x[incx * (i + 1)];
- y[incy * (i + 2)] = x[incx * (i + 2)];
- y[incy * (i + 3)] = x[incx * (i + 3)];
- y[incy * (i + 4)] = x[incx * (i + 4)];
- y[incy * (i + 5)] = x[incx * (i + 5)];
- y[incy * (i + 6)] = x[incx * (i + 6)];
- y[incy * (i + 7)] = x[incx * (i + 7)];
- }
- *ip = i;
-}
-
-void
-blas·copyd(int len, double *x, int incx, double *y, int incy)
-{
- int i;
-
- if (incx == 1 && incy == 1) {
- i = len;
- copyd_kernel16(&i, y, x);
- } else {
- i = len;
- copyd_s_kernel8(&i, y, incx, incy, x);
- }
- for (; i < len; i++) {
- y[incy * i] = x[incx * i];
- }
-}
-
-static
-void
-axpyd_kernel16(int *ip, double *x, double *y, double a)
-{
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- y[i + 0] = y[i + 0] + a * x[i + 0];
- y[i + 1] = y[i + 1] + a * x[i + 1];
- y[i + 2] = y[i + 2] + a * x[i + 2];
- y[i + 3] = y[i + 3] + a * x[i + 3];
- y[i + 4] = y[i + 4] + a * x[i + 4];
- y[i + 5] = y[i + 5] + a * x[i + 5];
- y[i + 6] = y[i + 6] + a * x[i + 6];
- y[i + 7] = y[i + 7] + a * x[i + 7];
- y[i + 8] = y[i + 8] + a * x[i + 8];
- y[i + 9] = y[i + 9] + a * x[i + 9];
- y[i + 10] = y[i + 10] + a * x[i + 10];
- y[i + 11] = y[i + 11] + a * x[i + 11];
- y[i + 12] = y[i + 12] + a * x[i + 12];
- y[i + 13] = y[i + 13] + a * x[i + 13];
- y[i + 14] = y[i + 14] + a * x[i + 14];
- y[i + 15] = y[i + 15] + a * x[i + 15];
- }
- *ip = i;
-}
-
-static
-void
-axpyd_s_kernel8(int *ip, double *x, int incy, double *y, double a, int incx)
-{
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)];
- y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)];
- y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)];
- y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)];
- y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)];
- y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)];
- y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)];
- y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)];
- }
- *ip = i;
-}
-
-void
-blas·axpyd(int len, double a, double *x, int incx, double *y, int incy)
-{
- int i;
-
- if (incx == 1 && incy == 1) {
- i = len;
- axpyd_kernel16(&i, x, y, a);
- } else {
- i = len;
- axpyd_s_kernel8(&i, x, incy, y, a, incx);
- }
- for (; i < len; i++) {
- y[incy * i] = y[incy * i] + a * x[incx * i];
- }
-}
-
-static
-void
-axpbyd_kernel16(int *ip, double a, double b, double *x, double *y)
-{
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- y[i + 0] = b * y[i + 0] + a * x[i + 0];
- y[i + 1] = b * y[i + 1] + a * x[i + 1];
- y[i + 2] = b * y[i + 2] + a * x[i + 2];
- y[i + 3] = b * y[i + 3] + a * x[i + 3];
- y[i + 4] = b * y[i + 4] + a * x[i + 4];
- y[i + 5] = b * y[i + 5] + a * x[i + 5];
- y[i + 6] = b * y[i + 6] + a * x[i + 6];
- y[i + 7] = b * y[i + 7] + a * x[i + 7];
- y[i + 8] = b * y[i + 8] + a * x[i + 8];
- y[i + 9] = b * y[i + 9] + a * x[i + 9];
- y[i + 10] = b * y[i + 10] + a * x[i + 10];
- y[i + 11] = b * y[i + 11] + a * x[i + 11];
- y[i + 12] = b * y[i + 12] + a * x[i + 12];
- y[i + 13] = b * y[i + 13] + a * x[i + 13];
- y[i + 14] = b * y[i + 14] + a * x[i + 14];
- y[i + 15] = b * y[i + 15] + a * x[i + 15];
- }
- *ip = i;
-}
-
-static
-void
-axpbyd_s_kernel8(int *ip, double a, int incy, double b, double *x, int incx, double *y)
-{
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)];
- y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)];
- y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)];
- y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)];
- y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)];
- y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)];
- y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)];
- y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)];
- }
- *ip = i;
-}
-
-void
-blas·axpbyd(int len, double a, double *x, int incx, double b, double *y, int incy)
-{
- int i;
-
- if (incx == 1 && incy == 1) {
- i = len;
- axpbyd_kernel16(&i, a, b, x, y);
- } else {
- i = len;
- axpbyd_s_kernel8(&i, a, incy, b, x, incx, y);
- }
- for (; i < len; i++) {
- y[incy * i] = b * y[incy * i] + a * x[incx * i];
- }
-}
-
-static
-double
-dotd_kernel16(int *ip, double *y, double *x)
-{
- double sum[16];
- int i;
- int len;
-
- for (i = 0; i < 16; i++) {
- sum[i] = 0;
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- sum[0] += x[i + 0] * y[i + 0];
- sum[1] += x[i + 1] * y[i + 1];
- sum[2] += x[i + 2] * y[i + 2];
- sum[3] += x[i + 3] * y[i + 3];
- sum[4] += x[i + 4] * y[i + 4];
- sum[5] += x[i + 5] * y[i + 5];
- sum[6] += x[i + 6] * y[i + 6];
- sum[7] += x[i + 7] * y[i + 7];
- sum[8] += x[i + 8] * y[i + 8];
- sum[9] += x[i + 9] * y[i + 9];
- sum[10] += x[i + 10] * y[i + 10];
- sum[11] += x[i + 11] * y[i + 11];
- sum[12] += x[i + 12] * y[i + 12];
- sum[13] += x[i + 13] * y[i + 13];
- sum[14] += x[i + 14] * y[i + 14];
- sum[15] += x[i + 15] * y[i + 15];
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- sum[0] += sum[i];
- }
-
- return sum[0];
-}
-
-static
-double
-dotd_s_kernel8(int *ip, double *y, int incx, int incy, double *x)
-{
- double sum[8];
- int i;
- int len;
-
- for (i = 0; i < 8; i++) {
- sum[i] = 0;
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)];
- sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)];
- sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)];
- sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)];
- sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)];
- sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)];
- sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)];
- sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)];
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- sum[0] += sum[i];
- }
-
- return sum[0];
-}
-
-double
-blas·dotd(int len, double *x, int incx, double *y, int incy)
-{
- int i;
- double sum;
-
- if (incx == 1 && incy == 1) {
- i = len;
- sum = dotd_kernel16(&i, y, x);
- } else {
- i = len;
- sum = dotd_s_kernel8(&i, y, incx, incy, x);
- }
- for (; i < len; i++) {
- sum += x[incx * i] * y[incy * i];
- }
-
- return sum;
-}
-
-static
-double
-sumd_kernel16(int *ip, double *x)
-{
- int i;
- double sum[16];
- int len;
-
- for (i = 0; i < 16; i++) {
- sum[i] = 0;
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- sum[0] += x[i + 0];
- sum[1] += x[i + 1];
- sum[2] += x[i + 2];
- sum[3] += x[i + 3];
- sum[4] += x[i + 4];
- sum[5] += x[i + 5];
- sum[6] += x[i + 6];
- sum[7] += x[i + 7];
- sum[8] += x[i + 8];
- sum[9] += x[i + 9];
- sum[10] += x[i + 10];
- sum[11] += x[i + 11];
- sum[12] += x[i + 12];
- sum[13] += x[i + 13];
- sum[14] += x[i + 14];
- sum[15] += x[i + 15];
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- sum[0] += sum[i];
- }
-
- return sum[0];
-}
-
-static
-double
-sumd_s_kernel8(int *ip, int incx, double *x)
-{
- int i;
- double sum[8];
- int len;
-
- for (i = 0; i < 8; i++) {
- sum[i] = 0;
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- sum[0] += x[incx * (i + 0)];
- sum[1] += x[incx * (i + 1)];
- sum[2] += x[incx * (i + 2)];
- sum[3] += x[incx * (i + 3)];
- sum[4] += x[incx * (i + 4)];
- sum[5] += x[incx * (i + 5)];
- sum[6] += x[incx * (i + 6)];
- sum[7] += x[incx * (i + 7)];
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- sum[0] += sum[i];
- }
-
- return sum[0];
-}
-
-double
-blas·sumd(int len, double *x, int incx)
-{
- int i;
- double sum;
-
- if (incx == 1) {
- i = len;
- sum = sumd_kernel16(&i, x);
- } else {
- i = len;
- sum = sumd_s_kernel8(&i, incx, x);
- }
- for (; i < len; i++) {
- sum += x[incx * i];
- }
-
- return sum;
-}
-
-static
-double
-normd_kernel16(int *ip, double *x)
-{
- int i;
- double nrm[16];
- int len;
-
- for (i = 0; i < 16; i++) {
- nrm[i] = 0;
- }
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- nrm[0] += x[i + 0] * x[i + 0];
- nrm[1] += x[i + 1] * x[i + 1];
- nrm[2] += x[i + 2] * x[i + 2];
- nrm[3] += x[i + 3] * x[i + 3];
- nrm[4] += x[i + 4] * x[i + 4];
- nrm[5] += x[i + 5] * x[i + 5];
- nrm[6] += x[i + 6] * x[i + 6];
- nrm[7] += x[i + 7] * x[i + 7];
- nrm[8] += x[i + 8] * x[i + 8];
- nrm[9] += x[i + 9] * x[i + 9];
- nrm[10] += x[i + 10] * x[i + 10];
- nrm[11] += x[i + 11] * x[i + 11];
- nrm[12] += x[i + 12] * x[i + 12];
- nrm[13] += x[i + 13] * x[i + 13];
- nrm[14] += x[i + 14] * x[i + 14];
- nrm[15] += x[i + 15] * x[i + 15];
- }
- *ip = i;
- for (i = 1; i < 16; i++) {
- nrm[0] += nrm[i];
- }
-
- return nrm[0];
-}
-
-static
-double
-normd_s_kernel8(int *ip, int incx, double *x)
-{
- double nrm[8];
- int i;
- int len;
-
- for (i = 0; i < 8; i++) {
- nrm[i] = 0;
- }
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)];
- nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)];
- nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)];
- nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)];
- nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)];
- nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)];
- nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)];
- nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)];
- }
- *ip = i;
- for (i = 1; i < 8; i++) {
- nrm[0] += nrm[i];
- }
-
- return nrm[0];
-}
-
-double
-blas·normd(int len, double *x, int incx)
-{
- int i;
- double nrm;
-
- if (incx == 1) {
- i = len;
- nrm = normd_kernel16(&i, x);
- } else {
- i = len;
- nrm = normd_s_kernel8(&i, incx, x);
- }
- for (; i < len; i++) {
- nrm += x[incx * i] * x[incx * i];
- }
-
- return math·sqrt(nrm);
-}
-
-static
-void
-scaled_kernel16(int *ip, double a, double *x)
-{
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- x[i + 0] = a * x[i + 0];
- x[i + 1] = a * x[i + 1];
- x[i + 2] = a * x[i + 2];
- x[i + 3] = a * x[i + 3];
- x[i + 4] = a * x[i + 4];
- x[i + 5] = a * x[i + 5];
- x[i + 6] = a * x[i + 6];
- x[i + 7] = a * x[i + 7];
- x[i + 8] = a * x[i + 8];
- x[i + 9] = a * x[i + 9];
- x[i + 10] = a * x[i + 10];
- x[i + 11] = a * x[i + 11];
- x[i + 12] = a * x[i + 12];
- x[i + 13] = a * x[i + 13];
- x[i + 14] = a * x[i + 14];
- x[i + 15] = a * x[i + 15];
- }
- *ip = i;
-}
-
-static
-void
-scaled_s_kernel8(int *ip, double *x, int incx, double a)
-{
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- x[incx * (i + 0)] = a * x[incx * (i + 0)];
- x[incx * (i + 1)] = a * x[incx * (i + 1)];
- x[incx * (i + 2)] = a * x[incx * (i + 2)];
- x[incx * (i + 3)] = a * x[incx * (i + 3)];
- x[incx * (i + 4)] = a * x[incx * (i + 4)];
- x[incx * (i + 5)] = a * x[incx * (i + 5)];
- x[incx * (i + 6)] = a * x[incx * (i + 6)];
- x[incx * (i + 7)] = a * x[incx * (i + 7)];
- }
- *ip = i;
-}
-
-void
-blas·scaled(int len, double *x, int incx, double a)
-{
- int i;
-
- if (incx == 1) {
- i = len;
- scaled_kernel16(&i, a, x);
- } else {
- i = len;
- scaled_s_kernel8(&i, x, incx, a);
- }
- for (; i < len; i++) {
- x[incx * i] = a * x[incx * i];
- }
-}
-
-static
-void
-rotd_kernel16(int *ip, double sin, double cos, double *x, double *y)
-{
- double tmp[16];
- int i;
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0];
- tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1];
- tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2];
- tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3];
- tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4];
- tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5];
- tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6];
- tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7];
- tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8];
- tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9];
- tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10];
- tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11];
- tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12];
- tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13];
- tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14];
- tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15];
- }
- *ip = i;
-}
-
-static
-void
-rotd_s_kernel8(int *ip, double sin, double *x, double *y, double cos, int incy, int incx)
-{
- double tmp[8];
- int i;
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0];
- tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1];
- tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2];
- tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3];
- tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4];
- tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5];
- tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6];
- tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7];
- }
- *ip = i;
-}
-
-void
-blas·rotd(int len, double *x, int incx, double *y, int incy, double cos, double sin)
-{
- int i;
- double tmp;
-
- if (incx == 1 && incy == 1) {
- i = len;
- rotd_kernel16(&i, sin, cos, x, y);
- } else {
- i = len;
- rotd_s_kernel8(&i, sin, x, y, cos, incy, incx);
- }
- for (; i < len; i++) {
- tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp;
- }
-}
-
-static
-void
-rotgd_kernel16(int *ip, double H[5], double *y, double *x)
-{
- int i;
- double tmp[16];
- int len;
-
- len = *ip & ~15;
- for (i = 0; i < len; i += 16) {
- tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0];
- tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1];
- tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2];
- tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3];
- tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4];
- tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5];
- tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6];
- tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7];
- tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8];
- tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9];
- tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10];
- tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11];
- tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12];
- tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13];
- tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14];
- tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15];
- }
- *ip = i;
-}
-
-static
-void
-rotgd_s_kernel8(int *ip, double H[5], int incx, double *y, double *x, int incy)
-{
- int i;
- double tmp[8];
- int len;
-
- len = *ip & ~7;
- for (i = 0; i < len; i += 8) {
- tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0];
- tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1];
- tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2];
- tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3];
- tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4];
- tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5];
- tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6];
- tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7];
- }
- *ip = i;
-}
-
-void
-blas·rotgd(int len, double *x, int incx, double *y, int incy, double H[5])
-{
- int i;
- double tmp;
-
- if (incx == 1 && incy == 1) {
- i = len;
- rotgd_kernel16(&i, H, y, x);
- } else {
- i = len;
- rotgd_s_kernel8(&i, H, incx, y, x, incy);
- }
- for (; i < len; i++) {
- tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp;
- }
-}
-
-
+#define UNROLL 8
+#define INT uintptr
+
+// -----------------------------------------------------------------------
+// Templates
+
+#include "loop.h"
+#define BODY_XY() \
+ LOOP(UNROLL, 0, INIT); \
+ n = ROUNDBY(len, UNROLL); \
+ if (incx == 1 && incy == 1) { \
+ for (i = 0; i < n; i+=UNROLL) { \
+ LOOP(UNROLL,0,KERNEL,1,1); \
+ } \
+ } else { \
+ for (i = 0; i < n; i+=UNROLL) { \
+ LOOP(UNROLL,0,KERNEL,incx,incy);\
+ } \
+ } \
+ \
+ for (; i < len; i++) { \
+ LOOP(1,0,KERNEL,incx,incy); \
+ }
+
+#define BODY_X() \
+ LOOP(UNROLL, 0, INIT); \
+ n = ROUNDBY(len, UNROLL); \
+ if (incx == 1) { \
+ for (i = 0; i < n; i+=UNROLL) { \
+ LOOP(UNROLL,0,KERNEL,1); \
+ } \
+ } else { \
+ for (i = 0; i < n; i+=UNROLL) { \
+ LOOP(UNROLL,0,KERNEL,incx); \
+ } \
+ } \
+ \
+ for (; i < len; i++) { \
+ LOOP(1,0,KERNEL,incx); \
+ }
+
+// -----------------------------------------------------------------------
+// Implementation
+
+#define FLOAT double
+#define func(name) blas·d##name
+#include "blas1body"
+#undef FLOAT
+
+#undef FLOAT
+#undef func
+
+#define FLOAT float
+#define func(name) blas·f##name
+#include "blas1body"
+#undef FLOAT