aboutsummaryrefslogtreecommitdiff
path: root/sys/libmath/blas1.c
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2020-05-14 18:15:23 -0700
committerNicholas Noll <nbnoll@eml.cc>2020-05-14 18:15:23 -0700
commit463ed852261da4d1dd1b859fa717a1d683306c9d (patch)
treed832d08663dcb53f86073d4fbb1609712fe5513f /sys/libmath/blas1.c
parentd982e7c2fdebf560ccce193cb98b85d4fac28a45 (diff)
feat: begun work on final blas level 2
Diffstat (limited to 'sys/libmath/blas1.c')
-rw-r--r--sys/libmath/blas1.c214
1 files changed, 117 insertions, 97 deletions
diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c
index 8cc70eb..1907179 100644
--- a/sys/libmath/blas1.c
+++ b/sys/libmath/blas1.c
@@ -173,7 +173,12 @@ blas·argminf(int len, float *x, int incx)
i = len;
ix = argminf_s_kernel8(&i, incx, x);
}
- min = x[ix];
+ for (; i < len; i++) {
+ if (x[incx * i] < min) {
+ ix = i;
+ min = x[incx * ix];
+ }
+ }
for (; i < len; i++) {
if (x[i] < min) {
ix = i;
@@ -188,16 +193,16 @@ static
int
argmaxf_kernel16(int *ip, float *x)
{
+ int ix[16];
float max[16];
int i;
- int ix[16];
int len;
for (i = 0; i < 16; i++) {
- max[i] = x[0];
+ ix[i] = 0;
}
for (i = 0; i < 16; i++) {
- ix[i] = 0;
+ max[i] = x[0];
}
len = *ip & ~15;
for (i = 0; i < len; i += 16) {
@@ -280,16 +285,16 @@ static
int
argmaxf_s_kernel8(int *ip, int incx, float *x)
{
+ int ix[8];
float max[8];
int i;
- int ix[8];
int len;
for (i = 0; i < 8; i++) {
- max[i] = x[0];
+ ix[i] = 0;
}
for (i = 0; i < 8; i++) {
- ix[i] = 0;
+ max[i] = x[0];
}
len = *ip & ~7;
for (i = 0; i < len; i += 8) {
@@ -350,7 +355,12 @@ blas·argmaxf(int len, float *x, int incx)
i = len;
ix = argmaxf_s_kernel8(&i, incx, x);
}
- max = x[ix];
+ for (; i < len; i++) {
+ if (x[incx * i] > max) {
+ ix = i;
+ max = x[incx * ix];
+ }
+ }
for (; i < len; i++) {
if (x[i] > max) {
ix = i;
@@ -392,7 +402,7 @@ copyf_kernel16(int *ip, float *x, float *y)
static
void
-copyf_s_kernel8(int *ip, int incx, float *x, int incy, float *y)
+copyf_s_kernel8(int *ip, int incx, int incy, float *x, float *y)
{
int i;
int len;
@@ -421,16 +431,16 @@ blas·copyf(int len, float *x, int incx, float *y, int incy)
copyf_kernel16(&i, x, y);
} else {
i = len;
- copyf_s_kernel8(&i, incx, x, incy, y);
+ copyf_s_kernel8(&i, incx, incy, x, y);
}
for (; i < len; i++) {
- y[i] = x[i];
+ y[incy * i] = x[incx * i];
}
}
static
void
-axpyf_kernel16(int *ip, float a, float *x, float *y)
+axpyf_kernel16(int *ip, float *x, float a, float *y)
{
int i;
int len;
@@ -459,7 +469,7 @@ axpyf_kernel16(int *ip, float a, float *x, float *y)
static
void
-axpyf_s_kernel8(int *ip, float a, float *x, int incy, float *y, int incx)
+axpyf_s_kernel8(int *ip, float *x, float a, int incx, float *y, int incy)
{
int i;
int len;
@@ -485,19 +495,19 @@ blas·axpyf(int len, float a, float *x, int incx, float *y, int incy)
if (incx == 1 && incy == 1) {
i = len;
- axpyf_kernel16(&i, a, x, y);
+ axpyf_kernel16(&i, x, a, y);
} else {
i = len;
- axpyf_s_kernel8(&i, a, x, incy, y, incx);
+ axpyf_s_kernel8(&i, x, a, incx, y, incy);
}
for (; i < len; i++) {
- y[i] = y[i] + a * x[i];
+ y[incy * i] = y[incy * i] + a * x[incx * i];
}
}
static
void
-axpbyf_kernel16(int *ip, float *x, float a, float *y, float b)
+axpbyf_kernel16(int *ip, float a, float *y, float b, float *x)
{
int i;
int len;
@@ -526,7 +536,7 @@ axpbyf_kernel16(int *ip, float *x, float a, float *y, float b)
static
void
-axpbyf_s_kernel8(int *ip, float *x, float a, float *y, float b, int incx, int incy)
+axpbyf_s_kernel8(int *ip, float a, int incx, float *y, float b, int incy, float *x)
{
int i;
int len;
@@ -552,22 +562,22 @@ blas·axpbyf(int len, float a, float *x, int incx, float b, float *y, int incy)
if (incx == 1 && incy == 1) {
i = len;
- axpbyf_kernel16(&i, x, a, y, b);
+ axpbyf_kernel16(&i, a, y, b, x);
} else {
i = len;
- axpbyf_s_kernel8(&i, x, a, y, b, incx, incy);
+ axpbyf_s_kernel8(&i, a, incx, y, b, incy, x);
}
for (; i < len; i++) {
- y[i] = b * y[i] + a * x[i];
+ y[incy * i] = b * y[incy * i] + a * x[incx * i];
}
}
static
float
-dotf_kernel16(int *ip, float *x, float *y)
+dotf_kernel16(int *ip, float *y, float *x)
{
- int i;
float sum[16];
+ int i;
int len;
for (i = 0; i < 16; i++) {
@@ -602,10 +612,10 @@ dotf_kernel16(int *ip, float *x, float *y)
static
float
-dotf_s_kernel8(int *ip, float *x, int incy, float *y, int incx)
+dotf_s_kernel8(int *ip, int incy, float *x, float *y, int incx)
{
- int i;
float sum[8];
+ int i;
int len;
for (i = 0; i < 8; i++) {
@@ -638,13 +648,13 @@ blas·dotf(int len, float *x, int incx, float *y, int incy)
if (incx == 1 && incy == 1) {
i = len;
- sum = dotf_kernel16(&i, x, y);
+ sum = dotf_kernel16(&i, y, x);
} else {
i = len;
- sum = dotf_s_kernel8(&i, x, incy, y, incx);
+ sum = dotf_s_kernel8(&i, incy, x, y, incx);
}
for (; i < len; i++) {
- sum += x[i] * y[i];
+ sum += x[incx * i] * y[incy * i];
}
return sum;
@@ -654,8 +664,8 @@ static
float
sumf_kernel16(int *ip, float *x)
{
- int i;
float sum[16];
+ int i;
int len;
for (i = 0; i < 16; i++) {
@@ -690,7 +700,7 @@ sumf_kernel16(int *ip, float *x)
static
float
-sumf_s_kernel8(int *ip, float *x, int incx)
+sumf_s_kernel8(int *ip, int incx, float *x)
{
float sum[8];
int i;
@@ -729,10 +739,10 @@ blas·sumf(int len, float *x, int incx)
sum = sumf_kernel16(&i, x);
} else {
i = len;
- sum = sumf_s_kernel8(&i, x, incx);
+ sum = sumf_s_kernel8(&i, incx, x);
}
for (; i < len; i++) {
- sum += x[i];
+ sum += x[incx * i];
}
return sum;
@@ -742,8 +752,8 @@ static
float
normf_kernel16(int *ip, float *x)
{
- float nrm[16];
int i;
+ float nrm[16];
int len;
for (i = 0; i < 16; i++) {
@@ -780,8 +790,8 @@ static
float
normf_s_kernel8(int *ip, int incx, float *x)
{
- float nrm[8];
int i;
+ float nrm[8];
int len;
for (i = 0; i < 8; i++) {
@@ -820,7 +830,7 @@ blas·normf(int len, float *x, int incx)
nrm = normf_s_kernel8(&i, incx, x);
}
for (; i < len; i++) {
- nrm += x[i] * x[i];
+ nrm += x[incx * i] * x[incx * i];
}
return math·sqrtf(nrm);
@@ -857,7 +867,7 @@ scalef_kernel16(int *ip, float *x, float a)
static
void
-scalef_s_kernel8(int *ip, int incx, float *x, float a)
+scalef_s_kernel8(int *ip, float *x, int incx, float a)
{
int i;
int len;
@@ -886,16 +896,16 @@ blas·scalef(int len, float *x, int incx, float a)
scalef_kernel16(&i, x, a);
} else {
i = len;
- scalef_s_kernel8(&i, incx, x, a);
+ scalef_s_kernel8(&i, x, incx, a);
}
for (; i < len; i++) {
- x[i] = a * x[i];
+ x[incx * i] = a * x[incx * i];
}
}
static
void
-rotf_kernel16(int *ip, float *y, float cos, float sin, float *x)
+rotf_kernel16(int *ip, float sin, float *x, float cos, float *y)
{
int i;
float tmp[16];
@@ -925,7 +935,7 @@ rotf_kernel16(int *ip, float *y, float cos, float sin, float *x)
static
void
-rotf_s_kernel8(int *ip, float *y, float sin, float cos, float *x, int incx, int incy)
+rotf_s_kernel8(int *ip, int incy, float cos, float sin, int incx, float *x, float *y)
{
int i;
float tmp[8];
@@ -953,19 +963,19 @@ blas·rotf(int len, float *x, int incx, float *y, int incy, float cos, float sin
if (incx == 1 && incy == 1) {
i = len;
- rotf_kernel16(&i, y, cos, sin, x);
+ rotf_kernel16(&i, sin, x, cos, y);
} else {
i = len;
- rotf_s_kernel8(&i, y, sin, cos, x, incx, incy);
+ rotf_s_kernel8(&i, incy, cos, sin, incx, x, y);
}
for (; i < len; i++) {
- tmp = x[i], x[i] = cos * x[i] + sin * y[i], y[i] = cos * y[i] - sin * tmp;
+ tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp;
}
}
static
void
-rotgf_kernel16(int *ip, float H[5], float *y, float *x)
+rotgf_kernel16(int *ip, float *x, float H[5], float *y)
{
float tmp[16];
int i;
@@ -995,7 +1005,7 @@ rotgf_kernel16(int *ip, float H[5], float *y, float *x)
static
void
-rotgf_s_kernel8(int *ip, int incy, int incx, float H[5], float *y, float *x)
+rotgf_s_kernel8(int *ip, int incx, int incy, float *x, float H[5], float *y)
{
float tmp[8];
int i;
@@ -1023,13 +1033,13 @@ blas·rotgf(int len, float *x, int incx, float *y, int incy, float H[5])
if (incx == 1 && incy == 1) {
i = len;
- rotgf_kernel16(&i, H, y, x);
+ rotgf_kernel16(&i, x, H, y);
} else {
i = len;
- rotgf_s_kernel8(&i, incy, incx, H, y, x);
+ rotgf_s_kernel8(&i, incx, incy, x, H, y);
}
for (; i < len; i++) {
- tmp = x[i], x[i] = H[1] * x[i] + H[2] * y[i], y[i] = H[3] * y[i] + H[4] * tmp;
+ tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp;
}
}
@@ -1037,16 +1047,16 @@ static
int
argmind_kernel16(int *ip, double *x)
{
- int ix[16];
double min[16];
int i;
+ int ix[16];
int len;
for (i = 0; i < 16; i++) {
- ix[i] = 0;
+ min[i] = x[0];
}
for (i = 0; i < 16; i++) {
- min[i] = x[0];
+ ix[i] = 0;
}
len = *ip & ~15;
for (i = 0; i < len; i += 16) {
@@ -1129,16 +1139,16 @@ static
int
argmind_s_kernel8(int *ip, double *x, int incx)
{
- int ix[8];
double min[8];
int i;
+ int ix[8];
int len;
for (i = 0; i < 8; i++) {
- ix[i] = 0;
+ min[i] = x[0];
}
for (i = 0; i < 8; i++) {
- min[i] = x[0];
+ ix[i] = 0;
}
len = *ip & ~7;
for (i = 0; i < len; i += 8) {
@@ -1199,7 +1209,12 @@ blas·argmind(int len, double *x, int incx)
i = len;
ix = argmind_s_kernel8(&i, x, incx);
}
- min = x[ix];
+ for (; i < len; i++) {
+ if (x[incx * i] < min) {
+ ix = i;
+ min = x[incx * ix];
+ }
+ }
for (; i < len; i++) {
if (x[i] < min) {
ix = i;
@@ -1214,8 +1229,8 @@ static
int
argmaxd_kernel16(int *ip, double *x)
{
- int i;
int ix[16];
+ int i;
double max[16];
int len;
@@ -1306,8 +1321,8 @@ static
int
argmaxd_s_kernel8(int *ip, int incx, double *x)
{
- int i;
int ix[8];
+ int i;
double max[8];
int len;
@@ -1376,7 +1391,12 @@ blas·argmaxd(int len, double *x, int incx)
i = len;
ix = argmaxd_s_kernel8(&i, incx, x);
}
- max = x[ix];
+ for (; i < len; i++) {
+ if (x[incx * i] > max) {
+ ix = i;
+ max = x[incx * ix];
+ }
+ }
for (; i < len; i++) {
if (x[i] > max) {
ix = i;
@@ -1418,7 +1438,7 @@ copyd_kernel16(int *ip, double *y, double *x)
static
void
-copyd_s_kernel8(int *ip, double *y, double *x, int incy, int incx)
+copyd_s_kernel8(int *ip, double *y, int incx, int incy, double *x)
{
int i;
int len;
@@ -1447,16 +1467,16 @@ blas·copyd(int len, double *x, int incx, double *y, int incy)
copyd_kernel16(&i, y, x);
} else {
i = len;
- copyd_s_kernel8(&i, y, x, incy, incx);
+ copyd_s_kernel8(&i, y, incx, incy, x);
}
for (; i < len; i++) {
- y[i] = x[i];
+ y[incy * i] = x[incx * i];
}
}
static
void
-axpyd_kernel16(int *ip, double a, double *y, double *x)
+axpyd_kernel16(int *ip, double *x, double *y, double a)
{
int i;
int len;
@@ -1485,7 +1505,7 @@ axpyd_kernel16(int *ip, double a, double *y, double *x)
static
void
-axpyd_s_kernel8(int *ip, double a, int incx, double *y, double *x, int incy)
+axpyd_s_kernel8(int *ip, double *x, int incy, double *y, double a, int incx)
{
int i;
int len;
@@ -1511,13 +1531,13 @@ blas·axpyd(int len, double a, double *x, int incx, double *y, int incy)
if (incx == 1 && incy == 1) {
i = len;
- axpyd_kernel16(&i, a, y, x);
+ axpyd_kernel16(&i, x, y, a);
} else {
i = len;
- axpyd_s_kernel8(&i, a, incx, y, x, incy);
+ axpyd_s_kernel8(&i, x, incy, y, a, incx);
}
for (; i < len; i++) {
- y[i] = y[i] + a * x[i];
+ y[incy * i] = y[incy * i] + a * x[incx * i];
}
}
@@ -1552,7 +1572,7 @@ axpbyd_kernel16(int *ip, double a, double b, double *x, double *y)
static
void
-axpbyd_s_kernel8(int *ip, double a, double b, double *x, int incx, int incy, double *y)
+axpbyd_s_kernel8(int *ip, double a, int incy, double b, double *x, int incx, double *y)
{
int i;
int len;
@@ -1581,16 +1601,16 @@ blas·axpbyd(int len, double a, double *x, int incx, double b, double *y, int in
axpbyd_kernel16(&i, a, b, x, y);
} else {
i = len;
- axpbyd_s_kernel8(&i, a, b, x, incx, incy, y);
+ axpbyd_s_kernel8(&i, a, incy, b, x, incx, y);
}
for (; i < len; i++) {
- y[i] = b * y[i] + a * x[i];
+ y[incy * i] = b * y[incy * i] + a * x[incx * i];
}
}
static
double
-dotd_kernel16(int *ip, double *x, double *y)
+dotd_kernel16(int *ip, double *y, double *x)
{
double sum[16];
int i;
@@ -1628,7 +1648,7 @@ dotd_kernel16(int *ip, double *x, double *y)
static
double
-dotd_s_kernel8(int *ip, int incx, double *x, int incy, double *y)
+dotd_s_kernel8(int *ip, double *y, int incx, int incy, double *x)
{
double sum[8];
int i;
@@ -1664,13 +1684,13 @@ blas·dotd(int len, double *x, int incx, double *y, int incy)
if (incx == 1 && incy == 1) {
i = len;
- sum = dotd_kernel16(&i, x, y);
+ sum = dotd_kernel16(&i, y, x);
} else {
i = len;
- sum = dotd_s_kernel8(&i, incx, x, incy, y);
+ sum = dotd_s_kernel8(&i, y, incx, incy, x);
}
for (; i < len; i++) {
- sum += x[i] * y[i];
+ sum += x[incx * i] * y[incy * i];
}
return sum;
@@ -1716,7 +1736,7 @@ sumd_kernel16(int *ip, double *x)
static
double
-sumd_s_kernel8(int *ip, double *x, int incx)
+sumd_s_kernel8(int *ip, int incx, double *x)
{
int i;
double sum[8];
@@ -1755,10 +1775,10 @@ blas·sumd(int len, double *x, int incx)
sum = sumd_kernel16(&i, x);
} else {
i = len;
- sum = sumd_s_kernel8(&i, x, incx);
+ sum = sumd_s_kernel8(&i, incx, x);
}
for (; i < len; i++) {
- sum += x[i];
+ sum += x[incx * i];
}
return sum;
@@ -1768,8 +1788,8 @@ static
double
normd_kernel16(int *ip, double *x)
{
- double nrm[16];
int i;
+ double nrm[16];
int len;
for (i = 0; i < 16; i++) {
@@ -1806,8 +1826,8 @@ static
double
normd_s_kernel8(int *ip, int incx, double *x)
{
- int i;
double nrm[8];
+ int i;
int len;
for (i = 0; i < 8; i++) {
@@ -1846,7 +1866,7 @@ blas·normd(int len, double *x, int incx)
nrm = normd_s_kernel8(&i, incx, x);
}
for (; i < len; i++) {
- nrm += x[i] * x[i];
+ nrm += x[incx * i] * x[incx * i];
}
return math·sqrt(nrm);
@@ -1883,7 +1903,7 @@ scaled_kernel16(int *ip, double a, double *x)
static
void
-scaled_s_kernel8(int *ip, double a, int incx, double *x)
+scaled_s_kernel8(int *ip, double *x, int incx, double a)
{
int i;
int len;
@@ -1912,19 +1932,19 @@ blas·scaled(int len, double *x, int incx, double a)
scaled_kernel16(&i, a, x);
} else {
i = len;
- scaled_s_kernel8(&i, a, incx, x);
+ scaled_s_kernel8(&i, x, incx, a);
}
for (; i < len; i++) {
- x[i] = a * x[i];
+ x[incx * i] = a * x[incx * i];
}
}
static
void
-rotd_kernel16(int *ip, double *x, double *y, double cos, double sin)
+rotd_kernel16(int *ip, double sin, double cos, double *x, double *y)
{
- int i;
double tmp[16];
+ int i;
int len;
len = *ip & ~15;
@@ -1951,10 +1971,10 @@ rotd_kernel16(int *ip, double *x, double *y, double cos, double sin)
static
void
-rotd_s_kernel8(int *ip, double *y, int incx, double sin, int incy, double *x, double cos)
+rotd_s_kernel8(int *ip, double sin, double *x, double *y, double cos, int incy, int incx)
{
- int i;
double tmp[8];
+ int i;
int len;
len = *ip & ~7;
@@ -1979,22 +1999,22 @@ blas·rotd(int len, double *x, int incx, double *y, int incy, double cos, double
if (incx == 1 && incy == 1) {
i = len;
- rotd_kernel16(&i, x, y, cos, sin);
+ rotd_kernel16(&i, sin, cos, x, y);
} else {
i = len;
- rotd_s_kernel8(&i, y, incx, sin, incy, x, cos);
+ rotd_s_kernel8(&i, sin, x, y, cos, incy, incx);
}
for (; i < len; i++) {
- tmp = x[i], x[i] = cos * x[i] + sin * y[i], y[i] = cos * y[i] - sin * tmp;
+ tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp;
}
}
static
void
-rotgd_kernel16(int *ip, double *y, double H[5], double *x)
+rotgd_kernel16(int *ip, double H[5], double *y, double *x)
{
- double tmp[16];
int i;
+ double tmp[16];
int len;
len = *ip & ~15;
@@ -2021,10 +2041,10 @@ rotgd_kernel16(int *ip, double *y, double H[5], double *x)
static
void
-rotgd_s_kernel8(int *ip, int incx, double *y, int incy, double H[5], double *x)
+rotgd_s_kernel8(int *ip, double H[5], int incx, double *y, double *x, int incy)
{
- double tmp[8];
int i;
+ double tmp[8];
int len;
len = *ip & ~7;
@@ -2049,13 +2069,13 @@ blas·rotgd(int len, double *x, int incx, double *y, int incy, double H[5])
if (incx == 1 && incy == 1) {
i = len;
- rotgd_kernel16(&i, y, H, x);
+ rotgd_kernel16(&i, H, y, x);
} else {
i = len;
- rotgd_s_kernel8(&i, incx, y, incy, H, x);
+ rotgd_s_kernel8(&i, H, incx, y, x, incy);
}
for (; i < len; i++) {
- tmp = x[i], x[i] = H[1] * x[i] + H[2] * y[i], y[i] = H[3] * y[i] + H[4] * tmp;
+ tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp;
}
}