From 463ed852261da4d1dd1b859fa717a1d683306c9d Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Thu, 14 May 2020 18:15:23 -0700 Subject: feat: begun work on final blas level 2 --- sys/libmath/blas1.c | 214 ++++++++++++++++++++++++++++------------------------ 1 file changed, 117 insertions(+), 97 deletions(-) (limited to 'sys/libmath/blas1.c') diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c index 8cc70eb..1907179 100644 --- a/sys/libmath/blas1.c +++ b/sys/libmath/blas1.c @@ -173,7 +173,12 @@ blas·argminf(int len, float *x, int incx) i = len; ix = argminf_s_kernel8(&i, incx, x); } - min = x[ix]; + for (; i < len; i++) { + if (x[incx * i] < min) { + ix = i; + min = x[incx * ix]; + } + } for (; i < len; i++) { if (x[i] < min) { ix = i; @@ -188,16 +193,16 @@ static int argmaxf_kernel16(int *ip, float *x) { + int ix[16]; float max[16]; int i; - int ix[16]; int len; for (i = 0; i < 16; i++) { - max[i] = x[0]; + ix[i] = 0; } for (i = 0; i < 16; i++) { - ix[i] = 0; + max[i] = x[0]; } len = *ip & ~15; for (i = 0; i < len; i += 16) { @@ -280,16 +285,16 @@ static int argmaxf_s_kernel8(int *ip, int incx, float *x) { + int ix[8]; float max[8]; int i; - int ix[8]; int len; for (i = 0; i < 8; i++) { - max[i] = x[0]; + ix[i] = 0; } for (i = 0; i < 8; i++) { - ix[i] = 0; + max[i] = x[0]; } len = *ip & ~7; for (i = 0; i < len; i += 8) { @@ -350,7 +355,12 @@ blas·argmaxf(int len, float *x, int incx) i = len; ix = argmaxf_s_kernel8(&i, incx, x); } - max = x[ix]; + for (; i < len; i++) { + if (x[incx * i] > max) { + ix = i; + max = x[incx * ix]; + } + } for (; i < len; i++) { if (x[i] > max) { ix = i; @@ -392,7 +402,7 @@ copyf_kernel16(int *ip, float *x, float *y) static void -copyf_s_kernel8(int *ip, int incx, float *x, int incy, float *y) +copyf_s_kernel8(int *ip, int incx, int incy, float *x, float *y) { int i; int len; @@ -421,16 +431,16 @@ blas·copyf(int len, float *x, int incx, float *y, int incy) copyf_kernel16(&i, x, y); } else { i = len; - copyf_s_kernel8(&i, incx, x, incy, y); + copyf_s_kernel8(&i, incx, incy, x, y); } for (; i < len; i++) { - y[i] = x[i]; + y[incy * i] = x[incx * i]; } } static void -axpyf_kernel16(int *ip, float a, float *x, float *y) +axpyf_kernel16(int *ip, float *x, float a, float *y) { int i; int len; @@ -459,7 +469,7 @@ axpyf_kernel16(int *ip, float a, float *x, float *y) static void -axpyf_s_kernel8(int *ip, float a, float *x, int incy, float *y, int incx) +axpyf_s_kernel8(int *ip, float *x, float a, int incx, float *y, int incy) { int i; int len; @@ -485,19 +495,19 @@ blas·axpyf(int len, float a, float *x, int incx, float *y, int incy) if (incx == 1 && incy == 1) { i = len; - axpyf_kernel16(&i, a, x, y); + axpyf_kernel16(&i, x, a, y); } else { i = len; - axpyf_s_kernel8(&i, a, x, incy, y, incx); + axpyf_s_kernel8(&i, x, a, incx, y, incy); } for (; i < len; i++) { - y[i] = y[i] + a * x[i]; + y[incy * i] = y[incy * i] + a * x[incx * i]; } } static void -axpbyf_kernel16(int *ip, float *x, float a, float *y, float b) +axpbyf_kernel16(int *ip, float a, float *y, float b, float *x) { int i; int len; @@ -526,7 +536,7 @@ axpbyf_kernel16(int *ip, float *x, float a, float *y, float b) static void -axpbyf_s_kernel8(int *ip, float *x, float a, float *y, float b, int incx, int incy) +axpbyf_s_kernel8(int *ip, float a, int incx, float *y, float b, int incy, float *x) { int i; int len; @@ -552,22 +562,22 @@ blas·axpbyf(int len, float a, float *x, int incx, float b, float *y, int incy) if (incx == 1 && incy == 1) { i = len; - axpbyf_kernel16(&i, x, a, y, b); + axpbyf_kernel16(&i, a, y, b, x); } else { i = len; - axpbyf_s_kernel8(&i, x, a, y, b, incx, incy); + axpbyf_s_kernel8(&i, a, incx, y, b, incy, x); } for (; i < len; i++) { - y[i] = b * y[i] + a * x[i]; + y[incy * i] = b * y[incy * i] + a * x[incx * i]; } } static float -dotf_kernel16(int *ip, float *x, float *y) +dotf_kernel16(int *ip, float *y, float *x) { - int i; float sum[16]; + int i; int len; for (i = 0; i < 16; i++) { @@ -602,10 +612,10 @@ dotf_kernel16(int *ip, float *x, float *y) static float -dotf_s_kernel8(int *ip, float *x, int incy, float *y, int incx) +dotf_s_kernel8(int *ip, int incy, float *x, float *y, int incx) { - int i; float sum[8]; + int i; int len; for (i = 0; i < 8; i++) { @@ -638,13 +648,13 @@ blas·dotf(int len, float *x, int incx, float *y, int incy) if (incx == 1 && incy == 1) { i = len; - sum = dotf_kernel16(&i, x, y); + sum = dotf_kernel16(&i, y, x); } else { i = len; - sum = dotf_s_kernel8(&i, x, incy, y, incx); + sum = dotf_s_kernel8(&i, incy, x, y, incx); } for (; i < len; i++) { - sum += x[i] * y[i]; + sum += x[incx * i] * y[incy * i]; } return sum; @@ -654,8 +664,8 @@ static float sumf_kernel16(int *ip, float *x) { - int i; float sum[16]; + int i; int len; for (i = 0; i < 16; i++) { @@ -690,7 +700,7 @@ sumf_kernel16(int *ip, float *x) static float -sumf_s_kernel8(int *ip, float *x, int incx) +sumf_s_kernel8(int *ip, int incx, float *x) { float sum[8]; int i; @@ -729,10 +739,10 @@ blas·sumf(int len, float *x, int incx) sum = sumf_kernel16(&i, x); } else { i = len; - sum = sumf_s_kernel8(&i, x, incx); + sum = sumf_s_kernel8(&i, incx, x); } for (; i < len; i++) { - sum += x[i]; + sum += x[incx * i]; } return sum; @@ -742,8 +752,8 @@ static float normf_kernel16(int *ip, float *x) { - float nrm[16]; int i; + float nrm[16]; int len; for (i = 0; i < 16; i++) { @@ -780,8 +790,8 @@ static float normf_s_kernel8(int *ip, int incx, float *x) { - float nrm[8]; int i; + float nrm[8]; int len; for (i = 0; i < 8; i++) { @@ -820,7 +830,7 @@ blas·normf(int len, float *x, int incx) nrm = normf_s_kernel8(&i, incx, x); } for (; i < len; i++) { - nrm += x[i] * x[i]; + nrm += x[incx * i] * x[incx * i]; } return math·sqrtf(nrm); @@ -857,7 +867,7 @@ scalef_kernel16(int *ip, float *x, float a) static void -scalef_s_kernel8(int *ip, int incx, float *x, float a) +scalef_s_kernel8(int *ip, float *x, int incx, float a) { int i; int len; @@ -886,16 +896,16 @@ blas·scalef(int len, float *x, int incx, float a) scalef_kernel16(&i, x, a); } else { i = len; - scalef_s_kernel8(&i, incx, x, a); + scalef_s_kernel8(&i, x, incx, a); } for (; i < len; i++) { - x[i] = a * x[i]; + x[incx * i] = a * x[incx * i]; } } static void -rotf_kernel16(int *ip, float *y, float cos, float sin, float *x) +rotf_kernel16(int *ip, float sin, float *x, float cos, float *y) { int i; float tmp[16]; @@ -925,7 +935,7 @@ rotf_kernel16(int *ip, float *y, float cos, float sin, float *x) static void -rotf_s_kernel8(int *ip, float *y, float sin, float cos, float *x, int incx, int incy) +rotf_s_kernel8(int *ip, int incy, float cos, float sin, int incx, float *x, float *y) { int i; float tmp[8]; @@ -953,19 +963,19 @@ blas·rotf(int len, float *x, int incx, float *y, int incy, float cos, float sin if (incx == 1 && incy == 1) { i = len; - rotf_kernel16(&i, y, cos, sin, x); + rotf_kernel16(&i, sin, x, cos, y); } else { i = len; - rotf_s_kernel8(&i, y, sin, cos, x, incx, incy); + rotf_s_kernel8(&i, incy, cos, sin, incx, x, y); } for (; i < len; i++) { - tmp = x[i], x[i] = cos * x[i] + sin * y[i], y[i] = cos * y[i] - sin * tmp; + tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp; } } static void -rotgf_kernel16(int *ip, float H[5], float *y, float *x) +rotgf_kernel16(int *ip, float *x, float H[5], float *y) { float tmp[16]; int i; @@ -995,7 +1005,7 @@ rotgf_kernel16(int *ip, float H[5], float *y, float *x) static void -rotgf_s_kernel8(int *ip, int incy, int incx, float H[5], float *y, float *x) +rotgf_s_kernel8(int *ip, int incx, int incy, float *x, float H[5], float *y) { float tmp[8]; int i; @@ -1023,13 +1033,13 @@ blas·rotgf(int len, float *x, int incx, float *y, int incy, float H[5]) if (incx == 1 && incy == 1) { i = len; - rotgf_kernel16(&i, H, y, x); + rotgf_kernel16(&i, x, H, y); } else { i = len; - rotgf_s_kernel8(&i, incy, incx, H, y, x); + rotgf_s_kernel8(&i, incx, incy, x, H, y); } for (; i < len; i++) { - tmp = x[i], x[i] = H[1] * x[i] + H[2] * y[i], y[i] = H[3] * y[i] + H[4] * tmp; + tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp; } } @@ -1037,16 +1047,16 @@ static int argmind_kernel16(int *ip, double *x) { - int ix[16]; double min[16]; int i; + int ix[16]; int len; for (i = 0; i < 16; i++) { - ix[i] = 0; + min[i] = x[0]; } for (i = 0; i < 16; i++) { - min[i] = x[0]; + ix[i] = 0; } len = *ip & ~15; for (i = 0; i < len; i += 16) { @@ -1129,16 +1139,16 @@ static int argmind_s_kernel8(int *ip, double *x, int incx) { - int ix[8]; double min[8]; int i; + int ix[8]; int len; for (i = 0; i < 8; i++) { - ix[i] = 0; + min[i] = x[0]; } for (i = 0; i < 8; i++) { - min[i] = x[0]; + ix[i] = 0; } len = *ip & ~7; for (i = 0; i < len; i += 8) { @@ -1199,7 +1209,12 @@ blas·argmind(int len, double *x, int incx) i = len; ix = argmind_s_kernel8(&i, x, incx); } - min = x[ix]; + for (; i < len; i++) { + if (x[incx * i] < min) { + ix = i; + min = x[incx * ix]; + } + } for (; i < len; i++) { if (x[i] < min) { ix = i; @@ -1214,8 +1229,8 @@ static int argmaxd_kernel16(int *ip, double *x) { - int i; int ix[16]; + int i; double max[16]; int len; @@ -1306,8 +1321,8 @@ static int argmaxd_s_kernel8(int *ip, int incx, double *x) { - int i; int ix[8]; + int i; double max[8]; int len; @@ -1376,7 +1391,12 @@ blas·argmaxd(int len, double *x, int incx) i = len; ix = argmaxd_s_kernel8(&i, incx, x); } - max = x[ix]; + for (; i < len; i++) { + if (x[incx * i] > max) { + ix = i; + max = x[incx * ix]; + } + } for (; i < len; i++) { if (x[i] > max) { ix = i; @@ -1418,7 +1438,7 @@ copyd_kernel16(int *ip, double *y, double *x) static void -copyd_s_kernel8(int *ip, double *y, double *x, int incy, int incx) +copyd_s_kernel8(int *ip, double *y, int incx, int incy, double *x) { int i; int len; @@ -1447,16 +1467,16 @@ blas·copyd(int len, double *x, int incx, double *y, int incy) copyd_kernel16(&i, y, x); } else { i = len; - copyd_s_kernel8(&i, y, x, incy, incx); + copyd_s_kernel8(&i, y, incx, incy, x); } for (; i < len; i++) { - y[i] = x[i]; + y[incy * i] = x[incx * i]; } } static void -axpyd_kernel16(int *ip, double a, double *y, double *x) +axpyd_kernel16(int *ip, double *x, double *y, double a) { int i; int len; @@ -1485,7 +1505,7 @@ axpyd_kernel16(int *ip, double a, double *y, double *x) static void -axpyd_s_kernel8(int *ip, double a, int incx, double *y, double *x, int incy) +axpyd_s_kernel8(int *ip, double *x, int incy, double *y, double a, int incx) { int i; int len; @@ -1511,13 +1531,13 @@ blas·axpyd(int len, double a, double *x, int incx, double *y, int incy) if (incx == 1 && incy == 1) { i = len; - axpyd_kernel16(&i, a, y, x); + axpyd_kernel16(&i, x, y, a); } else { i = len; - axpyd_s_kernel8(&i, a, incx, y, x, incy); + axpyd_s_kernel8(&i, x, incy, y, a, incx); } for (; i < len; i++) { - y[i] = y[i] + a * x[i]; + y[incy * i] = y[incy * i] + a * x[incx * i]; } } @@ -1552,7 +1572,7 @@ axpbyd_kernel16(int *ip, double a, double b, double *x, double *y) static void -axpbyd_s_kernel8(int *ip, double a, double b, double *x, int incx, int incy, double *y) +axpbyd_s_kernel8(int *ip, double a, int incy, double b, double *x, int incx, double *y) { int i; int len; @@ -1581,16 +1601,16 @@ blas·axpbyd(int len, double a, double *x, int incx, double b, double *y, int in axpbyd_kernel16(&i, a, b, x, y); } else { i = len; - axpbyd_s_kernel8(&i, a, b, x, incx, incy, y); + axpbyd_s_kernel8(&i, a, incy, b, x, incx, y); } for (; i < len; i++) { - y[i] = b * y[i] + a * x[i]; + y[incy * i] = b * y[incy * i] + a * x[incx * i]; } } static double -dotd_kernel16(int *ip, double *x, double *y) +dotd_kernel16(int *ip, double *y, double *x) { double sum[16]; int i; @@ -1628,7 +1648,7 @@ dotd_kernel16(int *ip, double *x, double *y) static double -dotd_s_kernel8(int *ip, int incx, double *x, int incy, double *y) +dotd_s_kernel8(int *ip, double *y, int incx, int incy, double *x) { double sum[8]; int i; @@ -1664,13 +1684,13 @@ blas·dotd(int len, double *x, int incx, double *y, int incy) if (incx == 1 && incy == 1) { i = len; - sum = dotd_kernel16(&i, x, y); + sum = dotd_kernel16(&i, y, x); } else { i = len; - sum = dotd_s_kernel8(&i, incx, x, incy, y); + sum = dotd_s_kernel8(&i, y, incx, incy, x); } for (; i < len; i++) { - sum += x[i] * y[i]; + sum += x[incx * i] * y[incy * i]; } return sum; @@ -1716,7 +1736,7 @@ sumd_kernel16(int *ip, double *x) static double -sumd_s_kernel8(int *ip, double *x, int incx) +sumd_s_kernel8(int *ip, int incx, double *x) { int i; double sum[8]; @@ -1755,10 +1775,10 @@ blas·sumd(int len, double *x, int incx) sum = sumd_kernel16(&i, x); } else { i = len; - sum = sumd_s_kernel8(&i, x, incx); + sum = sumd_s_kernel8(&i, incx, x); } for (; i < len; i++) { - sum += x[i]; + sum += x[incx * i]; } return sum; @@ -1768,8 +1788,8 @@ static double normd_kernel16(int *ip, double *x) { - double nrm[16]; int i; + double nrm[16]; int len; for (i = 0; i < 16; i++) { @@ -1806,8 +1826,8 @@ static double normd_s_kernel8(int *ip, int incx, double *x) { - int i; double nrm[8]; + int i; int len; for (i = 0; i < 8; i++) { @@ -1846,7 +1866,7 @@ blas·normd(int len, double *x, int incx) nrm = normd_s_kernel8(&i, incx, x); } for (; i < len; i++) { - nrm += x[i] * x[i]; + nrm += x[incx * i] * x[incx * i]; } return math·sqrt(nrm); @@ -1883,7 +1903,7 @@ scaled_kernel16(int *ip, double a, double *x) static void -scaled_s_kernel8(int *ip, double a, int incx, double *x) +scaled_s_kernel8(int *ip, double *x, int incx, double a) { int i; int len; @@ -1912,19 +1932,19 @@ blas·scaled(int len, double *x, int incx, double a) scaled_kernel16(&i, a, x); } else { i = len; - scaled_s_kernel8(&i, a, incx, x); + scaled_s_kernel8(&i, x, incx, a); } for (; i < len; i++) { - x[i] = a * x[i]; + x[incx * i] = a * x[incx * i]; } } static void -rotd_kernel16(int *ip, double *x, double *y, double cos, double sin) +rotd_kernel16(int *ip, double sin, double cos, double *x, double *y) { - int i; double tmp[16]; + int i; int len; len = *ip & ~15; @@ -1951,10 +1971,10 @@ rotd_kernel16(int *ip, double *x, double *y, double cos, double sin) static void -rotd_s_kernel8(int *ip, double *y, int incx, double sin, int incy, double *x, double cos) +rotd_s_kernel8(int *ip, double sin, double *x, double *y, double cos, int incy, int incx) { - int i; double tmp[8]; + int i; int len; len = *ip & ~7; @@ -1979,22 +1999,22 @@ blas·rotd(int len, double *x, int incx, double *y, int incy, double cos, double if (incx == 1 && incy == 1) { i = len; - rotd_kernel16(&i, x, y, cos, sin); + rotd_kernel16(&i, sin, cos, x, y); } else { i = len; - rotd_s_kernel8(&i, y, incx, sin, incy, x, cos); + rotd_s_kernel8(&i, sin, x, y, cos, incy, incx); } for (; i < len; i++) { - tmp = x[i], x[i] = cos * x[i] + sin * y[i], y[i] = cos * y[i] - sin * tmp; + tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp; } } static void -rotgd_kernel16(int *ip, double *y, double H[5], double *x) +rotgd_kernel16(int *ip, double H[5], double *y, double *x) { - double tmp[16]; int i; + double tmp[16]; int len; len = *ip & ~15; @@ -2021,10 +2041,10 @@ rotgd_kernel16(int *ip, double *y, double H[5], double *x) static void -rotgd_s_kernel8(int *ip, int incx, double *y, int incy, double H[5], double *x) +rotgd_s_kernel8(int *ip, double H[5], int incx, double *y, double *x, int incy) { - double tmp[8]; int i; + double tmp[8]; int len; len = *ip & ~7; @@ -2049,13 +2069,13 @@ blas·rotgd(int len, double *x, int incx, double *y, int incy, double H[5]) if (incx == 1 && incy == 1) { i = len; - rotgd_kernel16(&i, y, H, x); + rotgd_kernel16(&i, H, y, x); } else { i = len; - rotgd_s_kernel8(&i, incx, y, incy, H, x); + rotgd_s_kernel8(&i, H, incx, y, x, incy); } for (; i < len; i++) { - tmp = x[i], x[i] = H[1] * x[i] + H[2] * y[i], y[i] = H[3] * y[i] + H[4] * tmp; + tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp; } } -- cgit v1.2.1