From 5e53685221576ad6ec53bafffd14bc7d5e01fa30 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Mon, 25 May 2020 15:33:54 -0700 Subject: deprecated old python generation files --- sys/cmd/cc/ast.c | 167 +++- sys/cmd/cc/cc.c | 3 + sys/cmd/cc/cc.h | 42 +- sys/cmd/rules.mk | 4 +- sys/libmath/blas.c | 18 +- sys/libmath/blas1.c | 2135 ++------------------------------------------------ sys/libmath/gen1.py | 360 --------- sys/libmath/gen2.py | 410 ---------- sys/libmath/rules.mk | 4 +- 9 files changed, 257 insertions(+), 2886 deletions(-) delete mode 100755 sys/libmath/gen1.py delete mode 100755 sys/libmath/gen2.py (limited to 'sys') diff --git a/sys/cmd/cc/ast.c b/sys/cmd/cc/ast.c index 49e35fd..0547712 100644 --- a/sys/cmd/cc/ast.c +++ b/sys/cmd/cc/ast.c @@ -17,15 +17,20 @@ // ----------------------------------------------------------------------- // helper functions +static string nameof(Name *n) { - if (n->kind & Nident) + switch (n->kind) { + case Nident: return n->ident; - if (n->kind & Nparen) + case Nparen: return nameof(n->paren->name); - - panicf("ill-formed declarator name"); + case Nindex: + case Ncall: + return nameof(n->sfx.name); + } + panicf("unreachable"); } static @@ -201,8 +206,12 @@ nomatch(Token t, vlong kind) static error spec(Parser *, Lexer *, uint64 *); static error dtor(Parser *p, Lexer *lx, Dtor *d, int ab); +static Type *typeofspec(Parser *, Lexer *, uint64 *); +static Type *typeofdtor(Dtor *, Type *); + static Decl *decl(Parser *, Lexer *); static Expr *expr(Parser *, Lexer *); + static error blkstmt(Parser *, Lexer *, Stmt **); #define MAKEX(x, state) alloc((x)), (x)->kind = X##state @@ -434,6 +443,7 @@ Bad: return nil; } +#if 0 static Type* type(Parser *p, Lexer *lx) @@ -462,6 +472,7 @@ Bad: errorat(lx->pos, "failed to parse type expression"); return nil; } +#endif static Expr* @@ -1005,6 +1016,107 @@ Bad: // ----------------------------------------------------------------------- // declarations +static +Type * +typeofname(Name *name, Type *type) +{ + switch (name->kind) { + case Nident: + return type; + case Nparen: + return typeofdtor(name->paren, type); + case Nindex: + return typeofname(name->sfx.name, array(type, name->sfx.idx.q, name->sfx.idx.x)); + case Ncall: + return typeofname(name->sfx.name, func(type, name->sfx.call.n, name->sfx.call.arg, name->sfx.call.dots)); + default: + panicf("unreachable"); + } +} + +static +Type * +typeofdtor(Dtor *decl, Type* type) +{ + int n; + Ptr *p; + uint64 b, tmp; + + n = 0; + p = &decl->ptr; + b = p->kind; + while (b & 1) { + type = ptr(type, b >> 1); + if (++n >= 8) { + p = p->link; + b = p->kind; + } else { + b >>= 6; + } + } + + return typeofname(decl->name, type); +} + +static +Type * +aggr(Parser *p, Lexer *lx, string name, int kind) +{ + int n; + uint64 s; + Dtor *dt; + Token tk; + Type *t; + Type *fs[1024]; + byte *nm[1024]; + Expr *cx[1024]; + + n = -1; + for (;tk.kind != Arbrace && n < arrlen(fs); n++) { + t = typeofspec(p, lx, &s); + + Field: + dt = getdtor(p); + dtor(p, lx, dt, 0); + fs[n] = typeofdtor(dt, t); + nm[n] = nameof(dt->name); + putdtor(p); + + tk = peek(p, 0); + switch (tk.kind) { + case Acolon: + advance(p, lx); + cx[n] = expr(p, lx); + tk = peek(p, 0); + if (tk.kind == Asemi) { + tk = advance(p, lx); + continue; + } + if (tk.kind != Acomma) { + errorat(tk.pos, "unrecognized token %s in struct field declaration", tokens[tk.kind]); + goto Bad; + } + /* fallthrough */ + case Acomma: + advance(p, lx); + n++; + goto Field; + + case Asemi: + tk = advance(p, lx); + continue; + + default: + errorat(tk.pos, "unrecognized token %s in struct field declaration", tokens[tk.kind]); + goto Bad; + } + } + +Bad: + errorat(tk.pos, "failed to parse aggregate declaration"); + return nil; +} + static error spec(Parser *p, Lexer *lx, uint64 *spec) @@ -1012,6 +1124,8 @@ spec(Parser *p, Lexer *lx, uint64 *spec) Token t; int n; Sym *typ; + string name; + Type *tag; uint64 s, sm; s = 0; @@ -1073,11 +1187,46 @@ spec(Parser *p, Lexer *lx, uint64 *spec) errorat(lx->pos, "more than one base type specified"); goto Bad; } + s += Bit(n); break; case Kstruct: case Kunion: + if (s & (Tstruct | Tunion)) { + errorat(lx->pos, "more than one aggregate type specified"); + goto Bad; + } + s += Bit(n); + t = advance(p, lx); + if (t.kind != Aident && t.kind != Albrace) { + errorat(t.pos, "struct specifier missing valid declaration"); + goto Bad; + } + + name = nil; + tag = nil; + if (t.kind == Aident) { + name = t.val.s; + t = advance(p, lx); + } + if (t.kind == Albrace) { + t = advance(p, lx); + tag = aggr(p, lx, name, s & (Tstruct | Tunion)); + + if (nomatch(t, Arbrace)) { + errorat(t.pos, "invalid token %s in struct/union declaration", tokens[t.kind]); + goto Bad; + } + } + break; + case Kenum: + if (s & Tenum) { + errorat(lx->pos, "more than one enum type specifier found"); + goto Bad; + } + s += Bit(n); panicf("need to implement"); + break; default: errorat(t.pos, "invalid keyword '%s' found in declaration specifier", keywords[n]); @@ -1100,7 +1249,7 @@ Bad: static Type* -typespec(Parser *p, Lexer *lx, uint64 *s) +typeofspec(Parser *p, Lexer *lx, uint64 *s) { int n; uint64 m; @@ -1123,7 +1272,9 @@ typespec(Parser *p, Lexer *lx, uint64 *s) } return C.type.info + indextypespec[n]; } - panicf("unreachable"); + + errorat(lx->pos, "invalid type specifier"); + return nil; } @@ -1301,7 +1452,7 @@ name(Parser *p, Lexer *lx, Name **nmp, int ab) } while (t = peek(p, 0), t.kind == Acomma); if (t.kind == Aellip) { - nm->sfx.call.vararg = 1; + nm->sfx.call.dots = 1; t = advance(p, lx); } @@ -1362,7 +1513,7 @@ Key: switch (k = t.kind) { case Akeywd: if (Kconst <= k && k <= Katomic) - ptr->kind |= Bit(n + (t.val.i - Kconst + 1)); + ptr->kind |= Bit(6*n + (t.val.i - Kconst + 1)); else { errorat(lx->pos, "invalid keyword '%s' modifies pointer", keywords[t.val.i]); goto Bad; diff --git a/sys/cmd/cc/cc.c b/sys/cmd/cc/cc.c index 24d8e45..d5cd995 100644 --- a/sys/cmd/cc/cc.c +++ b/sys/cmd/cc/cc.c @@ -91,6 +91,9 @@ END: return C.strs.vals[i]; } +// ----------------------------------------------------------------------- +// type interning + // ----------------------------------------------------------------------- // universal compiler builtins diff --git a/sys/cmd/cc/cc.h b/sys/cmd/cc/cc.h index e8ab05e..ce7eca7 100644 --- a/sys/cmd/cc/cc.h +++ b/sys/cmd/cc/cc.h @@ -374,7 +374,7 @@ enum Dvar, Dvars, - /* names (shouldn't interact with base AST node enumeration */ + /* names */ Nident, Nparen, Nindex, @@ -577,7 +577,7 @@ struct Name } idx; struct { int n; - int vararg : 1; + int dots : 1; Obj *arg; } call; }; @@ -641,7 +641,7 @@ struct Type uintptr size; uintptr max; uint16 align : 8; - uint16 sign : 2; + uint8 sign : 2; union { struct { uint32 qual; @@ -655,12 +655,17 @@ struct Type int len; Obj *f; Expr *x; - } agr; + } aggr; struct { int len; string *s; Expr *x; } enm; + struct { + Type *ret; + int n; + Type **arg; + } func; }; }; @@ -701,7 +706,7 @@ struct Parser error parse(Parser *, Lexer *); // ----------------------------------------------------------------------- -// global compiler +// global compiler data struct StrTab { @@ -714,14 +719,19 @@ struct StrTab int32 *vals; }; -/* cc.c string functions */ -int32 intern(byte **str); -string internview(byte* beg, byte *end); +struct TypeSet +{ + int32 n_buckets; + int32 size; + int32 n_occupied; + int32 upper_bound; + int32 *flags; + Type **keys; +}; /* main data */ struct Compiler { - /* storage */ mem·Arena *heap; StrTab strs; string outfile; @@ -739,15 +749,23 @@ struct Compiler } inc; struct { + struct TypeSet p; // pointer + struct TypeSet a; // array + struct TypeSet f; // funcs + struct TypeSet s; // structs + struct TypeSet u; // unions + struct TypeSet e; // enums + int cap; int len; Type *info; } type; -}; +}; extern Compiler C; -/* cc.c compiler functions */ -void init(); +/* cc.c functions */ +void init(); +int32 intern(byte **str); #undef iota diff --git a/sys/cmd/rules.mk b/sys/cmd/rules.mk index 4905a5f..81c1cbe 100644 --- a/sys/cmd/rules.mk +++ b/sys/cmd/rules.mk @@ -5,8 +5,8 @@ include share/push.mk DIR := $(d)/cat include $(DIR)/rules.mk -DIR := $(d)/cc -include $(DIR)/rules.mk +# DIR := $(d)/cc +# include $(DIR)/rules.mk DIR := $(d)/rc include $(DIR)/rules.mk diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c index 43b0e70..224480b 100644 --- a/sys/libmath/blas.c +++ b/sys/libmath/blas.c @@ -6,15 +6,15 @@ #include -#define NCOL 2000 -#define NROW 2000 +#define NCOL 20000 +#define NROW 20000 #define NIT 2000 -#define INC 2 +#define INC 1 error main() { int i, j, nit; - double *x, *y, *m, res[2]; + double *x, *y, res[2]; clock_t t; double tprof[2] = { 0 }; @@ -23,17 +23,14 @@ main() x = malloc(sizeof(*x)*NCOL); y = malloc(sizeof(*x)*NROW); - m = malloc(sizeof(*x)*NROW*NCOL); #define DO_0 t = clock(); \ - blas·gerd(NROW/INC, NCOL/INC, 1.2, x, INC, y, INC, m, NCOL); \ - res[0] += m[0]; \ + res[0] += blas·dasum(NROW/INC, x, INC); \ t = clock() - t; \ tprof[0] += 1000.*t/CLOCKS_PER_SEC; \ #define DO_1 t = clock(); \ - cblas_dger(CblasRowMajor, NROW/INC, NCOL/INC, 1.2, x, INC, y, INC, m, NCOL); \ - res[1] += m[0]; \ + res[1] += cblas_dasum(NROW/INC, x, INC); \ t = clock() - t; \ tprof[1] += 1000.*t/CLOCKS_PER_SEC; @@ -41,9 +38,6 @@ main() for (i = 0; i < NROW; i++) { x[i] = rng·random(); y[i] = rng·random(); - for (j = 0; j < NCOL; j++) { - m[j + NCOL*i] = rng·random(); - } } switch (nit % 2) { diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c index 1907179..d9792f6 100644 --- a/sys/libmath/blas1.c +++ b/sys/libmath/blas1.c @@ -1,2082 +1,59 @@ #include -#include #include -/*********************************************************/ -/* THIS CODE IS GENERATED BY GEN1.PY! DON'T EDIT BY HAND */ -/*********************************************************/ - - -static -int -argminf_kernel16(int *ip, float *x) -{ - int ix[16]; - float min[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - ix[i] = 0; - } - for (i = 0; i < 16; i++) { - min[i] = x[0]; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - if (x[i + 0] < min[0]) { - ix[0] = (i + 0); - min[0] = x[ix[0]]; - } - if (x[i + 1] < min[1]) { - ix[1] = (i + 1); - min[1] = x[ix[1]]; - } - if (x[i + 2] < min[2]) { - ix[2] = (i + 2); - min[2] = x[ix[2]]; - } - if (x[i + 3] < min[3]) { - ix[3] = (i + 3); - min[3] = x[ix[3]]; - } - if (x[i + 4] < min[4]) { - ix[4] = (i + 4); - min[4] = x[ix[4]]; - } - if (x[i + 5] < min[5]) { - ix[5] = (i + 5); - min[5] = x[ix[5]]; - } - if (x[i + 6] < min[6]) { - ix[6] = (i + 6); - min[6] = x[ix[6]]; - } - if (x[i + 7] < min[7]) { - ix[7] = (i + 7); - min[7] = x[ix[7]]; - } - if (x[i + 8] < min[8]) { - ix[8] = (i + 8); - min[8] = x[ix[8]]; - } - if (x[i + 9] < min[9]) { - ix[9] = (i + 9); - min[9] = x[ix[9]]; - } - if (x[i + 10] < min[10]) { - ix[10] = (i + 10); - min[10] = x[ix[10]]; - } - if (x[i + 11] < min[11]) { - ix[11] = (i + 11); - min[11] = x[ix[11]]; - } - if (x[i + 12] < min[12]) { - ix[12] = (i + 12); - min[12] = x[ix[12]]; - } - if (x[i + 13] < min[13]) { - ix[13] = (i + 13); - min[13] = x[ix[13]]; - } - if (x[i + 14] < min[14]) { - ix[14] = (i + 14); - min[14] = x[ix[14]]; - } - if (x[i + 15] < min[15]) { - ix[15] = (i + 15); - min[15] = x[ix[15]]; - } - } - *ip = i; - for (i = 1; i < 16; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -static -int -argminf_s_kernel8(int *ip, int incx, float *x) -{ - int ix[8]; - float min[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - ix[i] = 0; - } - for (i = 0; i < 8; i++) { - min[i] = x[0]; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - if (x[incx * (i + 0)] < min[0]) { - ix[0] = (i + 0); - min[0] = x[incx * ix[0]]; - } - if (x[incx * (i + 1)] < min[1]) { - ix[1] = (i + 1); - min[1] = x[incx * ix[1]]; - } - if (x[incx * (i + 2)] < min[2]) { - ix[2] = (i + 2); - min[2] = x[incx * ix[2]]; - } - if (x[incx * (i + 3)] < min[3]) { - ix[3] = (i + 3); - min[3] = x[incx * ix[3]]; - } - if (x[incx * (i + 4)] < min[4]) { - ix[4] = (i + 4); - min[4] = x[incx * ix[4]]; - } - if (x[incx * (i + 5)] < min[5]) { - ix[5] = (i + 5); - min[5] = x[incx * ix[5]]; - } - if (x[incx * (i + 6)] < min[6]) { - ix[6] = (i + 6); - min[6] = x[incx * ix[6]]; - } - if (x[incx * (i + 7)] < min[7]) { - ix[7] = (i + 7); - min[7] = x[incx * ix[7]]; - } - } - *ip = i; - for (i = 1; i < 8; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -int -blas·argminf(int len, float *x, int incx) -{ - int i; - int ix; - float min; - - if (incx == 1) { - i = len; - ix = argminf_kernel16(&i, x); - } else { - i = len; - ix = argminf_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - if (x[incx * i] < min) { - ix = i; - min = x[incx * ix]; - } - } - for (; i < len; i++) { - if (x[i] < min) { - ix = i; - min = x[ix]; - } - } - - return ix; -} - -static -int -argmaxf_kernel16(int *ip, float *x) -{ - int ix[16]; - float max[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - ix[i] = 0; - } - for (i = 0; i < 16; i++) { - max[i] = x[0]; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - if (x[i + 0] > max[0]) { - ix[0] = (i + 0); - max[0] = x[ix[0]]; - } - if (x[i + 1] > max[1]) { - ix[1] = (i + 1); - max[1] = x[ix[1]]; - } - if (x[i + 2] > max[2]) { - ix[2] = (i + 2); - max[2] = x[ix[2]]; - } - if (x[i + 3] > max[3]) { - ix[3] = (i + 3); - max[3] = x[ix[3]]; - } - if (x[i + 4] > max[4]) { - ix[4] = (i + 4); - max[4] = x[ix[4]]; - } - if (x[i + 5] > max[5]) { - ix[5] = (i + 5); - max[5] = x[ix[5]]; - } - if (x[i + 6] > max[6]) { - ix[6] = (i + 6); - max[6] = x[ix[6]]; - } - if (x[i + 7] > max[7]) { - ix[7] = (i + 7); - max[7] = x[ix[7]]; - } - if (x[i + 8] > max[8]) { - ix[8] = (i + 8); - max[8] = x[ix[8]]; - } - if (x[i + 9] > max[9]) { - ix[9] = (i + 9); - max[9] = x[ix[9]]; - } - if (x[i + 10] > max[10]) { - ix[10] = (i + 10); - max[10] = x[ix[10]]; - } - if (x[i + 11] > max[11]) { - ix[11] = (i + 11); - max[11] = x[ix[11]]; - } - if (x[i + 12] > max[12]) { - ix[12] = (i + 12); - max[12] = x[ix[12]]; - } - if (x[i + 13] > max[13]) { - ix[13] = (i + 13); - max[13] = x[ix[13]]; - } - if (x[i + 14] > max[14]) { - ix[14] = (i + 14); - max[14] = x[ix[14]]; - } - if (x[i + 15] > max[15]) { - ix[15] = (i + 15); - max[15] = x[ix[15]]; - } - } - *ip = i; - for (i = 1; i < 16; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -static -int -argmaxf_s_kernel8(int *ip, int incx, float *x) -{ - int ix[8]; - float max[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - ix[i] = 0; - } - for (i = 0; i < 8; i++) { - max[i] = x[0]; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - if (x[incx * (i + 0)] > max[0]) { - ix[0] = (i + 0); - max[0] = x[incx * ix[0]]; - } - if (x[incx * (i + 1)] > max[1]) { - ix[1] = (i + 1); - max[1] = x[incx * ix[1]]; - } - if (x[incx * (i + 2)] > max[2]) { - ix[2] = (i + 2); - max[2] = x[incx * ix[2]]; - } - if (x[incx * (i + 3)] > max[3]) { - ix[3] = (i + 3); - max[3] = x[incx * ix[3]]; - } - if (x[incx * (i + 4)] > max[4]) { - ix[4] = (i + 4); - max[4] = x[incx * ix[4]]; - } - if (x[incx * (i + 5)] > max[5]) { - ix[5] = (i + 5); - max[5] = x[incx * ix[5]]; - } - if (x[incx * (i + 6)] > max[6]) { - ix[6] = (i + 6); - max[6] = x[incx * ix[6]]; - } - if (x[incx * (i + 7)] > max[7]) { - ix[7] = (i + 7); - max[7] = x[incx * ix[7]]; - } - } - *ip = i; - for (i = 1; i < 8; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -int -blas·argmaxf(int len, float *x, int incx) -{ - int i; - int ix; - float max; - - if (incx == 1) { - i = len; - ix = argmaxf_kernel16(&i, x); - } else { - i = len; - ix = argmaxf_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - if (x[incx * i] > max) { - ix = i; - max = x[incx * ix]; - } - } - for (; i < len; i++) { - if (x[i] > max) { - ix = i; - max = x[ix]; - } - } - - return ix; -} - -static -void -copyf_kernel16(int *ip, float *x, float *y) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = x[i + 0]; - y[i + 1] = x[i + 1]; - y[i + 2] = x[i + 2]; - y[i + 3] = x[i + 3]; - y[i + 4] = x[i + 4]; - y[i + 5] = x[i + 5]; - y[i + 6] = x[i + 6]; - y[i + 7] = x[i + 7]; - y[i + 8] = x[i + 8]; - y[i + 9] = x[i + 9]; - y[i + 10] = x[i + 10]; - y[i + 11] = x[i + 11]; - y[i + 12] = x[i + 12]; - y[i + 13] = x[i + 13]; - y[i + 14] = x[i + 14]; - y[i + 15] = x[i + 15]; - } - *ip = i; -} - -static -void -copyf_s_kernel8(int *ip, int incx, int incy, float *x, float *y) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = x[incx * (i + 0)]; - y[incy * (i + 1)] = x[incx * (i + 1)]; - y[incy * (i + 2)] = x[incx * (i + 2)]; - y[incy * (i + 3)] = x[incx * (i + 3)]; - y[incy * (i + 4)] = x[incx * (i + 4)]; - y[incy * (i + 5)] = x[incx * (i + 5)]; - y[incy * (i + 6)] = x[incx * (i + 6)]; - y[incy * (i + 7)] = x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·copyf(int len, float *x, int incx, float *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - copyf_kernel16(&i, x, y); - } else { - i = len; - copyf_s_kernel8(&i, incx, incy, x, y); - } - for (; i < len; i++) { - y[incy * i] = x[incx * i]; - } -} - -static -void -axpyf_kernel16(int *ip, float *x, float a, float *y) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = y[i + 0] + a * x[i + 0]; - y[i + 1] = y[i + 1] + a * x[i + 1]; - y[i + 2] = y[i + 2] + a * x[i + 2]; - y[i + 3] = y[i + 3] + a * x[i + 3]; - y[i + 4] = y[i + 4] + a * x[i + 4]; - y[i + 5] = y[i + 5] + a * x[i + 5]; - y[i + 6] = y[i + 6] + a * x[i + 6]; - y[i + 7] = y[i + 7] + a * x[i + 7]; - y[i + 8] = y[i + 8] + a * x[i + 8]; - y[i + 9] = y[i + 9] + a * x[i + 9]; - y[i + 10] = y[i + 10] + a * x[i + 10]; - y[i + 11] = y[i + 11] + a * x[i + 11]; - y[i + 12] = y[i + 12] + a * x[i + 12]; - y[i + 13] = y[i + 13] + a * x[i + 13]; - y[i + 14] = y[i + 14] + a * x[i + 14]; - y[i + 15] = y[i + 15] + a * x[i + 15]; - } - *ip = i; -} - -static -void -axpyf_s_kernel8(int *ip, float *x, float a, int incx, float *y, int incy) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)]; - y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)]; - y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)]; - y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)]; - y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)]; - y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)]; - y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)]; - y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·axpyf(int len, float a, float *x, int incx, float *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - axpyf_kernel16(&i, x, a, y); - } else { - i = len; - axpyf_s_kernel8(&i, x, a, incx, y, incy); - } - for (; i < len; i++) { - y[incy * i] = y[incy * i] + a * x[incx * i]; - } -} - -static -void -axpbyf_kernel16(int *ip, float a, float *y, float b, float *x) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = b * y[i + 0] + a * x[i + 0]; - y[i + 1] = b * y[i + 1] + a * x[i + 1]; - y[i + 2] = b * y[i + 2] + a * x[i + 2]; - y[i + 3] = b * y[i + 3] + a * x[i + 3]; - y[i + 4] = b * y[i + 4] + a * x[i + 4]; - y[i + 5] = b * y[i + 5] + a * x[i + 5]; - y[i + 6] = b * y[i + 6] + a * x[i + 6]; - y[i + 7] = b * y[i + 7] + a * x[i + 7]; - y[i + 8] = b * y[i + 8] + a * x[i + 8]; - y[i + 9] = b * y[i + 9] + a * x[i + 9]; - y[i + 10] = b * y[i + 10] + a * x[i + 10]; - y[i + 11] = b * y[i + 11] + a * x[i + 11]; - y[i + 12] = b * y[i + 12] + a * x[i + 12]; - y[i + 13] = b * y[i + 13] + a * x[i + 13]; - y[i + 14] = b * y[i + 14] + a * x[i + 14]; - y[i + 15] = b * y[i + 15] + a * x[i + 15]; - } - *ip = i; -} - -static -void -axpbyf_s_kernel8(int *ip, float a, int incx, float *y, float b, int incy, float *x) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)]; - y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)]; - y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)]; - y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)]; - y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)]; - y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)]; - y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)]; - y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·axpbyf(int len, float a, float *x, int incx, float b, float *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - axpbyf_kernel16(&i, a, y, b, x); - } else { - i = len; - axpbyf_s_kernel8(&i, a, incx, y, b, incy, x); - } - for (; i < len; i++) { - y[incy * i] = b * y[incy * i] + a * x[incx * i]; - } -} - -static -float -dotf_kernel16(int *ip, float *y, float *x) -{ - float sum[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - sum[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - sum[0] += x[i + 0] * y[i + 0]; - sum[1] += x[i + 1] * y[i + 1]; - sum[2] += x[i + 2] * y[i + 2]; - sum[3] += x[i + 3] * y[i + 3]; - sum[4] += x[i + 4] * y[i + 4]; - sum[5] += x[i + 5] * y[i + 5]; - sum[6] += x[i + 6] * y[i + 6]; - sum[7] += x[i + 7] * y[i + 7]; - sum[8] += x[i + 8] * y[i + 8]; - sum[9] += x[i + 9] * y[i + 9]; - sum[10] += x[i + 10] * y[i + 10]; - sum[11] += x[i + 11] * y[i + 11]; - sum[12] += x[i + 12] * y[i + 12]; - sum[13] += x[i + 13] * y[i + 13]; - sum[14] += x[i + 14] * y[i + 14]; - sum[15] += x[i + 15] * y[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -static -float -dotf_s_kernel8(int *ip, int incy, float *x, float *y, int incx) -{ - float sum[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - sum[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)]; - sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)]; - sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)]; - sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)]; - sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)]; - sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)]; - sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)]; - sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -float -blas·dotf(int len, float *x, int incx, float *y, int incy) -{ - int i; - float sum; - - if (incx == 1 && incy == 1) { - i = len; - sum = dotf_kernel16(&i, y, x); - } else { - i = len; - sum = dotf_s_kernel8(&i, incy, x, y, incx); - } - for (; i < len; i++) { - sum += x[incx * i] * y[incy * i]; - } - - return sum; -} - -static -float -sumf_kernel16(int *ip, float *x) -{ - float sum[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - sum[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - sum[0] += x[i + 0]; - sum[1] += x[i + 1]; - sum[2] += x[i + 2]; - sum[3] += x[i + 3]; - sum[4] += x[i + 4]; - sum[5] += x[i + 5]; - sum[6] += x[i + 6]; - sum[7] += x[i + 7]; - sum[8] += x[i + 8]; - sum[9] += x[i + 9]; - sum[10] += x[i + 10]; - sum[11] += x[i + 11]; - sum[12] += x[i + 12]; - sum[13] += x[i + 13]; - sum[14] += x[i + 14]; - sum[15] += x[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -static -float -sumf_s_kernel8(int *ip, int incx, float *x) -{ - float sum[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - sum[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - sum[0] += x[incx * (i + 0)]; - sum[1] += x[incx * (i + 1)]; - sum[2] += x[incx * (i + 2)]; - sum[3] += x[incx * (i + 3)]; - sum[4] += x[incx * (i + 4)]; - sum[5] += x[incx * (i + 5)]; - sum[6] += x[incx * (i + 6)]; - sum[7] += x[incx * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -float -blas·sumf(int len, float *x, int incx) -{ - int i; - float sum; - - if (incx == 1) { - i = len; - sum = sumf_kernel16(&i, x); - } else { - i = len; - sum = sumf_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - sum += x[incx * i]; - } - - return sum; -} - -static -float -normf_kernel16(int *ip, float *x) -{ - int i; - float nrm[16]; - int len; - - for (i = 0; i < 16; i++) { - nrm[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - nrm[0] += x[i + 0] * x[i + 0]; - nrm[1] += x[i + 1] * x[i + 1]; - nrm[2] += x[i + 2] * x[i + 2]; - nrm[3] += x[i + 3] * x[i + 3]; - nrm[4] += x[i + 4] * x[i + 4]; - nrm[5] += x[i + 5] * x[i + 5]; - nrm[6] += x[i + 6] * x[i + 6]; - nrm[7] += x[i + 7] * x[i + 7]; - nrm[8] += x[i + 8] * x[i + 8]; - nrm[9] += x[i + 9] * x[i + 9]; - nrm[10] += x[i + 10] * x[i + 10]; - nrm[11] += x[i + 11] * x[i + 11]; - nrm[12] += x[i + 12] * x[i + 12]; - nrm[13] += x[i + 13] * x[i + 13]; - nrm[14] += x[i + 14] * x[i + 14]; - nrm[15] += x[i + 15] * x[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - nrm[0] += nrm[i]; - } - - return nrm[0]; -} - -static -float -normf_s_kernel8(int *ip, int incx, float *x) -{ - int i; - float nrm[8]; - int len; - - for (i = 0; i < 8; i++) { - nrm[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)]; - nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)]; - nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)]; - nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)]; - nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)]; - nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)]; - nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)]; - nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - nrm[0] += nrm[i]; - } - - return nrm[0]; -} - -float -blas·normf(int len, float *x, int incx) -{ - int i; - float nrm; - - if (incx == 1) { - i = len; - nrm = normf_kernel16(&i, x); - } else { - i = len; - nrm = normf_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - nrm += x[incx * i] * x[incx * i]; - } - - return math·sqrtf(nrm); -} - -static -void -scalef_kernel16(int *ip, float *x, float a) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - x[i + 0] = a * x[i + 0]; - x[i + 1] = a * x[i + 1]; - x[i + 2] = a * x[i + 2]; - x[i + 3] = a * x[i + 3]; - x[i + 4] = a * x[i + 4]; - x[i + 5] = a * x[i + 5]; - x[i + 6] = a * x[i + 6]; - x[i + 7] = a * x[i + 7]; - x[i + 8] = a * x[i + 8]; - x[i + 9] = a * x[i + 9]; - x[i + 10] = a * x[i + 10]; - x[i + 11] = a * x[i + 11]; - x[i + 12] = a * x[i + 12]; - x[i + 13] = a * x[i + 13]; - x[i + 14] = a * x[i + 14]; - x[i + 15] = a * x[i + 15]; - } - *ip = i; -} - -static -void -scalef_s_kernel8(int *ip, float *x, int incx, float a) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - x[incx * (i + 0)] = a * x[incx * (i + 0)]; - x[incx * (i + 1)] = a * x[incx * (i + 1)]; - x[incx * (i + 2)] = a * x[incx * (i + 2)]; - x[incx * (i + 3)] = a * x[incx * (i + 3)]; - x[incx * (i + 4)] = a * x[incx * (i + 4)]; - x[incx * (i + 5)] = a * x[incx * (i + 5)]; - x[incx * (i + 6)] = a * x[incx * (i + 6)]; - x[incx * (i + 7)] = a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·scalef(int len, float *x, int incx, float a) -{ - int i; - - if (incx == 1) { - i = len; - scalef_kernel16(&i, x, a); - } else { - i = len; - scalef_s_kernel8(&i, x, incx, a); - } - for (; i < len; i++) { - x[incx * i] = a * x[incx * i]; - } -} - -static -void -rotf_kernel16(int *ip, float sin, float *x, float cos, float *y) -{ - int i; - float tmp[16]; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0]; - tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1]; - tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2]; - tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3]; - tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4]; - tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5]; - tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6]; - tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7]; - tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8]; - tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9]; - tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10]; - tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11]; - tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12]; - tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13]; - tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14]; - tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15]; - } - *ip = i; -} - -static -void -rotf_s_kernel8(int *ip, int incy, float cos, float sin, int incx, float *x, float *y) -{ - int i; - float tmp[8]; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0]; - tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1]; - tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2]; - tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3]; - tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4]; - tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5]; - tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6]; - tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7]; - } - *ip = i; -} - -void -blas·rotf(int len, float *x, int incx, float *y, int incy, float cos, float sin) -{ - int i; - float tmp; - - if (incx == 1 && incy == 1) { - i = len; - rotf_kernel16(&i, sin, x, cos, y); - } else { - i = len; - rotf_s_kernel8(&i, incy, cos, sin, incx, x, y); - } - for (; i < len; i++) { - tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp; - } -} - -static -void -rotgf_kernel16(int *ip, float *x, float H[5], float *y) -{ - float tmp[16]; - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0]; - tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1]; - tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2]; - tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3]; - tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4]; - tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5]; - tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6]; - tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7]; - tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8]; - tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9]; - tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10]; - tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11]; - tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12]; - tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13]; - tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14]; - tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15]; - } - *ip = i; -} - -static -void -rotgf_s_kernel8(int *ip, int incx, int incy, float *x, float H[5], float *y) -{ - float tmp[8]; - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0]; - tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1]; - tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2]; - tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3]; - tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4]; - tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5]; - tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6]; - tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7]; - } - *ip = i; -} - -void -blas·rotgf(int len, float *x, int incx, float *y, int incy, float H[5]) -{ - int i; - float tmp; - - if (incx == 1 && incy == 1) { - i = len; - rotgf_kernel16(&i, x, H, y); - } else { - i = len; - rotgf_s_kernel8(&i, incx, incy, x, H, y); - } - for (; i < len; i++) { - tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp; - } -} - -static -int -argmind_kernel16(int *ip, double *x) -{ - double min[16]; - int i; - int ix[16]; - int len; - - for (i = 0; i < 16; i++) { - min[i] = x[0]; - } - for (i = 0; i < 16; i++) { - ix[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - if (x[i + 0] < min[0]) { - ix[0] = (i + 0); - min[0] = x[ix[0]]; - } - if (x[i + 1] < min[1]) { - ix[1] = (i + 1); - min[1] = x[ix[1]]; - } - if (x[i + 2] < min[2]) { - ix[2] = (i + 2); - min[2] = x[ix[2]]; - } - if (x[i + 3] < min[3]) { - ix[3] = (i + 3); - min[3] = x[ix[3]]; - } - if (x[i + 4] < min[4]) { - ix[4] = (i + 4); - min[4] = x[ix[4]]; - } - if (x[i + 5] < min[5]) { - ix[5] = (i + 5); - min[5] = x[ix[5]]; - } - if (x[i + 6] < min[6]) { - ix[6] = (i + 6); - min[6] = x[ix[6]]; - } - if (x[i + 7] < min[7]) { - ix[7] = (i + 7); - min[7] = x[ix[7]]; - } - if (x[i + 8] < min[8]) { - ix[8] = (i + 8); - min[8] = x[ix[8]]; - } - if (x[i + 9] < min[9]) { - ix[9] = (i + 9); - min[9] = x[ix[9]]; - } - if (x[i + 10] < min[10]) { - ix[10] = (i + 10); - min[10] = x[ix[10]]; - } - if (x[i + 11] < min[11]) { - ix[11] = (i + 11); - min[11] = x[ix[11]]; - } - if (x[i + 12] < min[12]) { - ix[12] = (i + 12); - min[12] = x[ix[12]]; - } - if (x[i + 13] < min[13]) { - ix[13] = (i + 13); - min[13] = x[ix[13]]; - } - if (x[i + 14] < min[14]) { - ix[14] = (i + 14); - min[14] = x[ix[14]]; - } - if (x[i + 15] < min[15]) { - ix[15] = (i + 15); - min[15] = x[ix[15]]; - } - } - *ip = i; - for (i = 1; i < 16; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -static -int -argmind_s_kernel8(int *ip, double *x, int incx) -{ - double min[8]; - int i; - int ix[8]; - int len; - - for (i = 0; i < 8; i++) { - min[i] = x[0]; - } - for (i = 0; i < 8; i++) { - ix[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - if (x[incx * (i + 0)] < min[0]) { - ix[0] = (i + 0); - min[0] = x[incx * ix[0]]; - } - if (x[incx * (i + 1)] < min[1]) { - ix[1] = (i + 1); - min[1] = x[incx * ix[1]]; - } - if (x[incx * (i + 2)] < min[2]) { - ix[2] = (i + 2); - min[2] = x[incx * ix[2]]; - } - if (x[incx * (i + 3)] < min[3]) { - ix[3] = (i + 3); - min[3] = x[incx * ix[3]]; - } - if (x[incx * (i + 4)] < min[4]) { - ix[4] = (i + 4); - min[4] = x[incx * ix[4]]; - } - if (x[incx * (i + 5)] < min[5]) { - ix[5] = (i + 5); - min[5] = x[incx * ix[5]]; - } - if (x[incx * (i + 6)] < min[6]) { - ix[6] = (i + 6); - min[6] = x[incx * ix[6]]; - } - if (x[incx * (i + 7)] < min[7]) { - ix[7] = (i + 7); - min[7] = x[incx * ix[7]]; - } - } - *ip = i; - for (i = 1; i < 8; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -int -blas·argmind(int len, double *x, int incx) -{ - int i; - int ix; - double min; - - if (incx == 1) { - i = len; - ix = argmind_kernel16(&i, x); - } else { - i = len; - ix = argmind_s_kernel8(&i, x, incx); - } - for (; i < len; i++) { - if (x[incx * i] < min) { - ix = i; - min = x[incx * ix]; - } - } - for (; i < len; i++) { - if (x[i] < min) { - ix = i; - min = x[ix]; - } - } - - return ix; -} - -static -int -argmaxd_kernel16(int *ip, double *x) -{ - int ix[16]; - int i; - double max[16]; - int len; - - for (i = 0; i < 16; i++) { - ix[i] = 0; - } - for (i = 0; i < 16; i++) { - max[i] = x[0]; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - if (x[i + 0] > max[0]) { - ix[0] = (i + 0); - max[0] = x[ix[0]]; - } - if (x[i + 1] > max[1]) { - ix[1] = (i + 1); - max[1] = x[ix[1]]; - } - if (x[i + 2] > max[2]) { - ix[2] = (i + 2); - max[2] = x[ix[2]]; - } - if (x[i + 3] > max[3]) { - ix[3] = (i + 3); - max[3] = x[ix[3]]; - } - if (x[i + 4] > max[4]) { - ix[4] = (i + 4); - max[4] = x[ix[4]]; - } - if (x[i + 5] > max[5]) { - ix[5] = (i + 5); - max[5] = x[ix[5]]; - } - if (x[i + 6] > max[6]) { - ix[6] = (i + 6); - max[6] = x[ix[6]]; - } - if (x[i + 7] > max[7]) { - ix[7] = (i + 7); - max[7] = x[ix[7]]; - } - if (x[i + 8] > max[8]) { - ix[8] = (i + 8); - max[8] = x[ix[8]]; - } - if (x[i + 9] > max[9]) { - ix[9] = (i + 9); - max[9] = x[ix[9]]; - } - if (x[i + 10] > max[10]) { - ix[10] = (i + 10); - max[10] = x[ix[10]]; - } - if (x[i + 11] > max[11]) { - ix[11] = (i + 11); - max[11] = x[ix[11]]; - } - if (x[i + 12] > max[12]) { - ix[12] = (i + 12); - max[12] = x[ix[12]]; - } - if (x[i + 13] > max[13]) { - ix[13] = (i + 13); - max[13] = x[ix[13]]; - } - if (x[i + 14] > max[14]) { - ix[14] = (i + 14); - max[14] = x[ix[14]]; - } - if (x[i + 15] > max[15]) { - ix[15] = (i + 15); - max[15] = x[ix[15]]; - } - } - *ip = i; - for (i = 1; i < 16; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -static -int -argmaxd_s_kernel8(int *ip, int incx, double *x) -{ - int ix[8]; - int i; - double max[8]; - int len; - - for (i = 0; i < 8; i++) { - ix[i] = 0; - } - for (i = 0; i < 8; i++) { - max[i] = x[0]; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - if (x[incx * (i + 0)] > max[0]) { - ix[0] = (i + 0); - max[0] = x[incx * ix[0]]; - } - if (x[incx * (i + 1)] > max[1]) { - ix[1] = (i + 1); - max[1] = x[incx * ix[1]]; - } - if (x[incx * (i + 2)] > max[2]) { - ix[2] = (i + 2); - max[2] = x[incx * ix[2]]; - } - if (x[incx * (i + 3)] > max[3]) { - ix[3] = (i + 3); - max[3] = x[incx * ix[3]]; - } - if (x[incx * (i + 4)] > max[4]) { - ix[4] = (i + 4); - max[4] = x[incx * ix[4]]; - } - if (x[incx * (i + 5)] > max[5]) { - ix[5] = (i + 5); - max[5] = x[incx * ix[5]]; - } - if (x[incx * (i + 6)] > max[6]) { - ix[6] = (i + 6); - max[6] = x[incx * ix[6]]; - } - if (x[incx * (i + 7)] > max[7]) { - ix[7] = (i + 7); - max[7] = x[incx * ix[7]]; - } - } - *ip = i; - for (i = 1; i < 8; i++) { - if (x[ix[i]] > x[ix[0]]) { - ix[0] = ix[i]; - } - } - - return ix[0]; -} - -int -blas·argmaxd(int len, double *x, int incx) -{ - int i; - int ix; - double max; - - if (incx == 1) { - i = len; - ix = argmaxd_kernel16(&i, x); - } else { - i = len; - ix = argmaxd_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - if (x[incx * i] > max) { - ix = i; - max = x[incx * ix]; - } - } - for (; i < len; i++) { - if (x[i] > max) { - ix = i; - max = x[ix]; - } - } - - return ix; -} - -static -void -copyd_kernel16(int *ip, double *y, double *x) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = x[i + 0]; - y[i + 1] = x[i + 1]; - y[i + 2] = x[i + 2]; - y[i + 3] = x[i + 3]; - y[i + 4] = x[i + 4]; - y[i + 5] = x[i + 5]; - y[i + 6] = x[i + 6]; - y[i + 7] = x[i + 7]; - y[i + 8] = x[i + 8]; - y[i + 9] = x[i + 9]; - y[i + 10] = x[i + 10]; - y[i + 11] = x[i + 11]; - y[i + 12] = x[i + 12]; - y[i + 13] = x[i + 13]; - y[i + 14] = x[i + 14]; - y[i + 15] = x[i + 15]; - } - *ip = i; -} - -static -void -copyd_s_kernel8(int *ip, double *y, int incx, int incy, double *x) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = x[incx * (i + 0)]; - y[incy * (i + 1)] = x[incx * (i + 1)]; - y[incy * (i + 2)] = x[incx * (i + 2)]; - y[incy * (i + 3)] = x[incx * (i + 3)]; - y[incy * (i + 4)] = x[incx * (i + 4)]; - y[incy * (i + 5)] = x[incx * (i + 5)]; - y[incy * (i + 6)] = x[incx * (i + 6)]; - y[incy * (i + 7)] = x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·copyd(int len, double *x, int incx, double *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - copyd_kernel16(&i, y, x); - } else { - i = len; - copyd_s_kernel8(&i, y, incx, incy, x); - } - for (; i < len; i++) { - y[incy * i] = x[incx * i]; - } -} - -static -void -axpyd_kernel16(int *ip, double *x, double *y, double a) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = y[i + 0] + a * x[i + 0]; - y[i + 1] = y[i + 1] + a * x[i + 1]; - y[i + 2] = y[i + 2] + a * x[i + 2]; - y[i + 3] = y[i + 3] + a * x[i + 3]; - y[i + 4] = y[i + 4] + a * x[i + 4]; - y[i + 5] = y[i + 5] + a * x[i + 5]; - y[i + 6] = y[i + 6] + a * x[i + 6]; - y[i + 7] = y[i + 7] + a * x[i + 7]; - y[i + 8] = y[i + 8] + a * x[i + 8]; - y[i + 9] = y[i + 9] + a * x[i + 9]; - y[i + 10] = y[i + 10] + a * x[i + 10]; - y[i + 11] = y[i + 11] + a * x[i + 11]; - y[i + 12] = y[i + 12] + a * x[i + 12]; - y[i + 13] = y[i + 13] + a * x[i + 13]; - y[i + 14] = y[i + 14] + a * x[i + 14]; - y[i + 15] = y[i + 15] + a * x[i + 15]; - } - *ip = i; -} - -static -void -axpyd_s_kernel8(int *ip, double *x, int incy, double *y, double a, int incx) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)]; - y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)]; - y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)]; - y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)]; - y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)]; - y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)]; - y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)]; - y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·axpyd(int len, double a, double *x, int incx, double *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - axpyd_kernel16(&i, x, y, a); - } else { - i = len; - axpyd_s_kernel8(&i, x, incy, y, a, incx); - } - for (; i < len; i++) { - y[incy * i] = y[incy * i] + a * x[incx * i]; - } -} - -static -void -axpbyd_kernel16(int *ip, double a, double b, double *x, double *y) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - y[i + 0] = b * y[i + 0] + a * x[i + 0]; - y[i + 1] = b * y[i + 1] + a * x[i + 1]; - y[i + 2] = b * y[i + 2] + a * x[i + 2]; - y[i + 3] = b * y[i + 3] + a * x[i + 3]; - y[i + 4] = b * y[i + 4] + a * x[i + 4]; - y[i + 5] = b * y[i + 5] + a * x[i + 5]; - y[i + 6] = b * y[i + 6] + a * x[i + 6]; - y[i + 7] = b * y[i + 7] + a * x[i + 7]; - y[i + 8] = b * y[i + 8] + a * x[i + 8]; - y[i + 9] = b * y[i + 9] + a * x[i + 9]; - y[i + 10] = b * y[i + 10] + a * x[i + 10]; - y[i + 11] = b * y[i + 11] + a * x[i + 11]; - y[i + 12] = b * y[i + 12] + a * x[i + 12]; - y[i + 13] = b * y[i + 13] + a * x[i + 13]; - y[i + 14] = b * y[i + 14] + a * x[i + 14]; - y[i + 15] = b * y[i + 15] + a * x[i + 15]; - } - *ip = i; -} - -static -void -axpbyd_s_kernel8(int *ip, double a, int incy, double b, double *x, int incx, double *y) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)]; - y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)]; - y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)]; - y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)]; - y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)]; - y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)]; - y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)]; - y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·axpbyd(int len, double a, double *x, int incx, double b, double *y, int incy) -{ - int i; - - if (incx == 1 && incy == 1) { - i = len; - axpbyd_kernel16(&i, a, b, x, y); - } else { - i = len; - axpbyd_s_kernel8(&i, a, incy, b, x, incx, y); - } - for (; i < len; i++) { - y[incy * i] = b * y[incy * i] + a * x[incx * i]; - } -} - -static -double -dotd_kernel16(int *ip, double *y, double *x) -{ - double sum[16]; - int i; - int len; - - for (i = 0; i < 16; i++) { - sum[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - sum[0] += x[i + 0] * y[i + 0]; - sum[1] += x[i + 1] * y[i + 1]; - sum[2] += x[i + 2] * y[i + 2]; - sum[3] += x[i + 3] * y[i + 3]; - sum[4] += x[i + 4] * y[i + 4]; - sum[5] += x[i + 5] * y[i + 5]; - sum[6] += x[i + 6] * y[i + 6]; - sum[7] += x[i + 7] * y[i + 7]; - sum[8] += x[i + 8] * y[i + 8]; - sum[9] += x[i + 9] * y[i + 9]; - sum[10] += x[i + 10] * y[i + 10]; - sum[11] += x[i + 11] * y[i + 11]; - sum[12] += x[i + 12] * y[i + 12]; - sum[13] += x[i + 13] * y[i + 13]; - sum[14] += x[i + 14] * y[i + 14]; - sum[15] += x[i + 15] * y[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -static -double -dotd_s_kernel8(int *ip, double *y, int incx, int incy, double *x) -{ - double sum[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - sum[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)]; - sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)]; - sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)]; - sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)]; - sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)]; - sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)]; - sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)]; - sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -double -blas·dotd(int len, double *x, int incx, double *y, int incy) -{ - int i; - double sum; - - if (incx == 1 && incy == 1) { - i = len; - sum = dotd_kernel16(&i, y, x); - } else { - i = len; - sum = dotd_s_kernel8(&i, y, incx, incy, x); - } - for (; i < len; i++) { - sum += x[incx * i] * y[incy * i]; - } - - return sum; -} - -static -double -sumd_kernel16(int *ip, double *x) -{ - int i; - double sum[16]; - int len; - - for (i = 0; i < 16; i++) { - sum[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - sum[0] += x[i + 0]; - sum[1] += x[i + 1]; - sum[2] += x[i + 2]; - sum[3] += x[i + 3]; - sum[4] += x[i + 4]; - sum[5] += x[i + 5]; - sum[6] += x[i + 6]; - sum[7] += x[i + 7]; - sum[8] += x[i + 8]; - sum[9] += x[i + 9]; - sum[10] += x[i + 10]; - sum[11] += x[i + 11]; - sum[12] += x[i + 12]; - sum[13] += x[i + 13]; - sum[14] += x[i + 14]; - sum[15] += x[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -static -double -sumd_s_kernel8(int *ip, int incx, double *x) -{ - int i; - double sum[8]; - int len; - - for (i = 0; i < 8; i++) { - sum[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - sum[0] += x[incx * (i + 0)]; - sum[1] += x[incx * (i + 1)]; - sum[2] += x[incx * (i + 2)]; - sum[3] += x[incx * (i + 3)]; - sum[4] += x[incx * (i + 4)]; - sum[5] += x[incx * (i + 5)]; - sum[6] += x[incx * (i + 6)]; - sum[7] += x[incx * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - sum[0] += sum[i]; - } - - return sum[0]; -} - -double -blas·sumd(int len, double *x, int incx) -{ - int i; - double sum; - - if (incx == 1) { - i = len; - sum = sumd_kernel16(&i, x); - } else { - i = len; - sum = sumd_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - sum += x[incx * i]; - } - - return sum; -} - -static -double -normd_kernel16(int *ip, double *x) -{ - int i; - double nrm[16]; - int len; - - for (i = 0; i < 16; i++) { - nrm[i] = 0; - } - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - nrm[0] += x[i + 0] * x[i + 0]; - nrm[1] += x[i + 1] * x[i + 1]; - nrm[2] += x[i + 2] * x[i + 2]; - nrm[3] += x[i + 3] * x[i + 3]; - nrm[4] += x[i + 4] * x[i + 4]; - nrm[5] += x[i + 5] * x[i + 5]; - nrm[6] += x[i + 6] * x[i + 6]; - nrm[7] += x[i + 7] * x[i + 7]; - nrm[8] += x[i + 8] * x[i + 8]; - nrm[9] += x[i + 9] * x[i + 9]; - nrm[10] += x[i + 10] * x[i + 10]; - nrm[11] += x[i + 11] * x[i + 11]; - nrm[12] += x[i + 12] * x[i + 12]; - nrm[13] += x[i + 13] * x[i + 13]; - nrm[14] += x[i + 14] * x[i + 14]; - nrm[15] += x[i + 15] * x[i + 15]; - } - *ip = i; - for (i = 1; i < 16; i++) { - nrm[0] += nrm[i]; - } - - return nrm[0]; -} - -static -double -normd_s_kernel8(int *ip, int incx, double *x) -{ - double nrm[8]; - int i; - int len; - - for (i = 0; i < 8; i++) { - nrm[i] = 0; - } - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)]; - nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)]; - nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)]; - nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)]; - nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)]; - nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)]; - nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)]; - nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)]; - } - *ip = i; - for (i = 1; i < 8; i++) { - nrm[0] += nrm[i]; - } - - return nrm[0]; -} - -double -blas·normd(int len, double *x, int incx) -{ - int i; - double nrm; - - if (incx == 1) { - i = len; - nrm = normd_kernel16(&i, x); - } else { - i = len; - nrm = normd_s_kernel8(&i, incx, x); - } - for (; i < len; i++) { - nrm += x[incx * i] * x[incx * i]; - } - - return math·sqrt(nrm); -} - -static -void -scaled_kernel16(int *ip, double a, double *x) -{ - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - x[i + 0] = a * x[i + 0]; - x[i + 1] = a * x[i + 1]; - x[i + 2] = a * x[i + 2]; - x[i + 3] = a * x[i + 3]; - x[i + 4] = a * x[i + 4]; - x[i + 5] = a * x[i + 5]; - x[i + 6] = a * x[i + 6]; - x[i + 7] = a * x[i + 7]; - x[i + 8] = a * x[i + 8]; - x[i + 9] = a * x[i + 9]; - x[i + 10] = a * x[i + 10]; - x[i + 11] = a * x[i + 11]; - x[i + 12] = a * x[i + 12]; - x[i + 13] = a * x[i + 13]; - x[i + 14] = a * x[i + 14]; - x[i + 15] = a * x[i + 15]; - } - *ip = i; -} - -static -void -scaled_s_kernel8(int *ip, double *x, int incx, double a) -{ - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - x[incx * (i + 0)] = a * x[incx * (i + 0)]; - x[incx * (i + 1)] = a * x[incx * (i + 1)]; - x[incx * (i + 2)] = a * x[incx * (i + 2)]; - x[incx * (i + 3)] = a * x[incx * (i + 3)]; - x[incx * (i + 4)] = a * x[incx * (i + 4)]; - x[incx * (i + 5)] = a * x[incx * (i + 5)]; - x[incx * (i + 6)] = a * x[incx * (i + 6)]; - x[incx * (i + 7)] = a * x[incx * (i + 7)]; - } - *ip = i; -} - -void -blas·scaled(int len, double *x, int incx, double a) -{ - int i; - - if (incx == 1) { - i = len; - scaled_kernel16(&i, a, x); - } else { - i = len; - scaled_s_kernel8(&i, x, incx, a); - } - for (; i < len; i++) { - x[incx * i] = a * x[incx * i]; - } -} - -static -void -rotd_kernel16(int *ip, double sin, double cos, double *x, double *y) -{ - double tmp[16]; - int i; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0]; - tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1]; - tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2]; - tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3]; - tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4]; - tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5]; - tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6]; - tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7]; - tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8]; - tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9]; - tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10]; - tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11]; - tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12]; - tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13]; - tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14]; - tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15]; - } - *ip = i; -} - -static -void -rotd_s_kernel8(int *ip, double sin, double *x, double *y, double cos, int incy, int incx) -{ - double tmp[8]; - int i; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0]; - tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1]; - tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2]; - tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3]; - tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4]; - tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5]; - tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6]; - tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7]; - } - *ip = i; -} - -void -blas·rotd(int len, double *x, int incx, double *y, int incy, double cos, double sin) -{ - int i; - double tmp; - - if (incx == 1 && incy == 1) { - i = len; - rotd_kernel16(&i, sin, cos, x, y); - } else { - i = len; - rotd_s_kernel8(&i, sin, x, y, cos, incy, incx); - } - for (; i < len; i++) { - tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp; - } -} - -static -void -rotgd_kernel16(int *ip, double H[5], double *y, double *x) -{ - int i; - double tmp[16]; - int len; - - len = *ip & ~15; - for (i = 0; i < len; i += 16) { - tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0]; - tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1]; - tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2]; - tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3]; - tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4]; - tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5]; - tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6]; - tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7]; - tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8]; - tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9]; - tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10]; - tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11]; - tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12]; - tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13]; - tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14]; - tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15]; - } - *ip = i; -} - -static -void -rotgd_s_kernel8(int *ip, double H[5], int incx, double *y, double *x, int incy) -{ - int i; - double tmp[8]; - int len; - - len = *ip & ~7; - for (i = 0; i < len; i += 8) { - tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0]; - tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1]; - tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2]; - tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3]; - tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4]; - tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5]; - tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6]; - tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7]; - } - *ip = i; -} - -void -blas·rotgd(int len, double *x, int incx, double *y, int incy, double H[5]) -{ - int i; - double tmp; - - if (incx == 1 && incy == 1) { - i = len; - rotgd_kernel16(&i, H, y, x); - } else { - i = len; - rotgd_s_kernel8(&i, H, incx, y, x, incy); - } - for (; i < len; i++) { - tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp; - } -} - - +#define UNROLL 8 +#define INT uintptr + +// ----------------------------------------------------------------------- +// Templates + +#include "loop.h" +#define BODY_XY() \ + LOOP(UNROLL, 0, INIT); \ + n = ROUNDBY(len, UNROLL); \ + if (incx == 1 && incy == 1) { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,1,1); \ + } \ + } else { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,incx,incy);\ + } \ + } \ + \ + for (; i < len; i++) { \ + LOOP(1,0,KERNEL,incx,incy); \ + } + +#define BODY_X() \ + LOOP(UNROLL, 0, INIT); \ + n = ROUNDBY(len, UNROLL); \ + if (incx == 1) { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,1); \ + } \ + } else { \ + for (i = 0; i < n; i+=UNROLL) { \ + LOOP(UNROLL,0,KERNEL,incx); \ + } \ + } \ + \ + for (; i < len; i++) { \ + LOOP(1,0,KERNEL,incx); \ + } + +// ----------------------------------------------------------------------- +// Implementation + +#define FLOAT double +#define func(name) blas·d##name +#include "blas1body" +#undef FLOAT + +#undef FLOAT +#undef func + +#define FLOAT float +#define func(name) blas·f##name +#include "blas1body" +#undef FLOAT diff --git a/sys/libmath/gen1.py b/sys/libmath/gen1.py deleted file mode 100755 index 936bc50..0000000 --- a/sys/libmath/gen1.py +++ /dev/null @@ -1,360 +0,0 @@ -#!/bin/python - -from C import * - -NUNROLL = 16 -def pkg(name): - return f"blas·{name}" - -def typeify(string, kind): - if (kind == Float32): - return f"{string}f" - if (kind == Float64): - return f"{string}d" - -def fini(func, loop, strided, calls, ret=[]): - func.execute(*calls[:2]) - func, scall = Strided(func, loop, NUNROLL//2, strided, *ret) - calls[2] = scall[0] - func.execute(*calls[2:]) - func.emit() - -# ------------------------------------------------------------------------ -# Blas level 1 functions - -def copy(kind): - name = typeify("copy", kind) - F = Func(pkg(name), Void, - Params( - (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y") - ), - Vars( - (Int, "i") - ) - ) - - len, x, y, i = F.variables("len", "x", "y", "i") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - Set(Index(y, i), Index(x, i)) - ) - - kernel, calls = Unroll(name, loop, NUNROLL) - kernel.emit() - - # F.execute(*calls) - # F.emit() - fini(F, loop, F.params[1:3], calls) - -def swap(kind): - name = typeify("swap", kind) - F = Func(pkg(name), Void, - Params( - (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y") - ), - Vars( - (Int, "i"), (kind, "tmp") - ) - ) - - len, x, y, i, tmp = F.variables("len", "x", "y", "i", "tmp") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - Swap(Index(x, i), Index(y, i), tmp) - ) - - kernel, calls = Unroll(name, loop, NUNROLL) - kernel.emit() - - # F.execute(*calls) - # F.emit() - fini(F, loop, F.params[1:3], calls) - -def axpby(kind): - name = typeify("axpby", kind) - F = Func(pkg(name), Void, - Params( - (Int, "len"), (kind, "a"), (Ptr(kind), "x"), (kind, "b"), (Ptr(kind), "y") - ), - Vars( - (Int, "i"), - ) - ) - - len, x, y, i, a, b = F.variables("len", "x", "y", "i", "a", "b") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - Set(Index(y, i), - Mul(b, - Add(Index(y, i), - Mul(a, Index(x, i)), - ) - ) - ) - ) - kernel, calls = Unroll(name, loop, NUNROLL) - kernel.emit() - - # F.execute(*calls) - # F.emit() - fini(F, loop, [F.params[2], F.params[4]], calls) - -def axpy(kind): - name = typeify("axpy", kind) - F = Func(pkg(name), Void, - Params( - (Int, "len"), (kind, "a"), (Ptr(kind), "x"), (Ptr(kind), "y") - ), - Vars( - (Int, "i"), - ) - ) - - len, x, y, i, a = F.variables("len", "x", "y", "i", "a") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - Set(Index(y, i), - Add(Index(y, i), - Mul(a, Index(x, i)), - ) - ) - ) - kernel, calls = Unroll(name, loop, NUNROLL) - kernel.emit() - - fini(F, loop, F.params[2:4], calls) - # F.execute(*calls) - # F.emit() - -def argminmax(kind): - for operation in [("min", LT), ("max", GT)]: - name = typeify(f"arg{operation[0]}", kind) - F = Func(pkg(name), Int, - Params( - (Int, "len"), (Ptr(kind), "x"), - ), - Vars( - (Int, "i"), (Int, "ix"), (kind, operation[0]) - ) - ) - len, x, i, ix, op = F.variables("len", "x", "i", "ix", operation[0]) - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - If(operation[1](Index(x, i), op), - Block( - Set(ix, i), - Set(op, Index(x, ix)), - ) - ) - ) - - ret = (ix, lambda ires, icur, node: - If(GT(Index(x, Index(node, icur)), Index(x, Index(node, ires))), - Block(Set(Index(node, ires), Index(node, icur))) - ) - ) - - kernel, calls = Unroll(name, loop, NUNROLL, *ret) - kernel.emit() - - # F.execute(*calls, Return(ix)) - # F.emit() - fini(F, loop, [F.params[1]], calls[:2] + [Set(op, Index(x, ix))] + calls[2:] + [Return(ix)], ret) - -def dot(kind): - nm = typeify("dot", kind) - F = Func(pkg(nm), kind, - Params( - (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y") - ), - Vars( - (Int, "i"), (kind, "sum"), - ) - ) - len, x, i, y, sum = F.variables("len", "x", "i", "y", "sum") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - AddSet(sum, Mul(Index(x, i), Index(y, i))) - ) - ret = (sum, lambda ires, icur, node: StmtExpr(AddSet(Index(node, ires), Index(node, icur)))) - - kernel, calls = Unroll(nm, loop, NUNROLL, *ret) - kernel.emit() - - # F.execute(*calls, Return(sum)) - fini(F, loop, F.params[1:3], calls + [Return(sum)], ret) - -def norm(kind): - nm = typeify("norm", kind) - F = Func(pkg(nm), kind, - Params( - (Int, "len"), (Ptr(kind), "x") - ), - Vars( - (Int, "i"), (kind, "nrm"), - ) - ) - len, x, i, nrm = F.variables("len", "x", "i", "nrm") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - AddSet(nrm, Mul(Index(x, i), Index(x, i))) - ) - - ret = (nrm, lambda ires, icur, node: - StmtExpr(AddSet(Index(node, ires), Index(node, icur))) - ) - - kernel, calls = Unroll(nm, loop, NUNROLL, *ret) - kernel.emit() - - if kind == Float64: - sqrt = Func("math·sqrt", kind) - elif kind == Float32: - sqrt = Func("math·sqrtf", kind) - else: - raise ValueError(f"no sqrt for type {kind}") - - # F.execute(*calls, Return(Call(sqrt, [nrm]))) - # F.emit() - fini(F, loop, [F.params[1]], calls + [Return(Call(sqrt, [nrm]))], ret) - -def sum(kind): - name = typeify("sum", kind) - F = Func(pkg(name), kind, - Params( - (Int, "len"), (Ptr(kind), "x"), - ), - Vars( - (Int, "i"), (kind, "sum") - ) - ) - - len, x, i, sum = F.variables("len", "x", "i", "sum") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - AddSet(sum, Index(x, i)) - ) - - ret = (sum, lambda ires, icur, node: - StmtExpr(AddSet(Index(node, ires), Index(node, icur))) - ) - - kernel, calls = Unroll(name, loop, NUNROLL, *ret) - kernel.emit() - - fini(F, loop, [F.params[1]], calls + [Return(sum)], ret) - # F.execute(*calls, Return(sum)) - # F.emit() - -def scale(kind): - name = typeify("scale", kind) - F = Func(pkg(name), Void, - Params( - (Int, "len"), (Ptr(kind), "x"), (kind, "a") - ), - Vars( - (Int, "i"), - ) - ) - - len, a, x, i = F.variables("len", "a", "x", "i") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute(Set(Index(x, i), Mul(a, Index(x, i)))) - - kernel, calls = Unroll(name, loop, NUNROLL) - kernel.emit() - - fini(F, loop, [F.params[1]], calls) - # F.execute(*calls) - # F.emit() - -def rot(kind): - name = typeify("rot", kind) - F = Func(pkg(name), Void, - Params( - (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y"), (kind, "cos"), (kind, "sin") - ), - Vars( - (Int, "i"), (kind, "tmp") - ) - ) - - len, x, y, i, tmp, cos, sin = F.variables("len", "x", "y", "i", "tmp", "cos", "sin") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - Comma(Set(tmp, Index(x, i)), - Comma( - Set(Index(x, i), Add(Mul(cos, Index(x, i)), Mul(sin, Index(y, i)))), - Set(Index(y, i), Sub(Mul(cos, Index(y, i)), Mul(sin, tmp))) - ) - ) - ) - - kernel, calls = Unroll(name, loop, NUNROLL) - kernel.emit() - - fini(F, loop, [F.params[1], F.params[2]], calls) - # F.execute(*calls) - # F.emit() - -def rotm(kind): - name = typeify("rotg", kind) - F = Func(pkg(name), Void, - Params( - (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y"), (Array(kind, 5), "H") - ), - Vars( - (Int, "i"), (kind, "tmp") - ) - ) - - len, x, y, i, tmp, H = F.variables("len", "x", "y", "i", "tmp", "H") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - Comma(Set(tmp, Index(x, i)), - Comma( - Set(Index(x, i), Add(Mul(Index(H, I(1)), Index(x, i)), Mul(Index(H, I(2)), Index(y, i)))), - Set(Index(y, i), Add(Mul(Index(H, I(3)), Index(y, i)), Mul(Index(H, I(4)), tmp))) - ) - ) - ) - - kernel, calls = Unroll(name, loop, NUNROLL) - kernel.emit() - fini(F, loop, F.params[1:3], calls) - -if __name__ == "__main__": - emit("#include \n") - emit("#include \n") - emit("#include \n") - emitln() - emit("/*********************************************************/\n") - emit("/* THIS CODE IS GENERATED BY GEN1.PY! DON'T EDIT BY HAND */\n") - emit("/*********************************************************/\n") - emitln(2) - - # TODO: Implement rotg/rotmg - for kind in [Float32, Float64]: - argminmax(kind) - copy(kind) - axpy(kind) - axpby(kind) - dot(kind) - sum(kind) - norm(kind) - scale(kind) - rot(kind) - rotm(kind) - - flush() diff --git a/sys/libmath/gen2.py b/sys/libmath/gen2.py deleted file mode 100755 index 2afbe1d..0000000 --- a/sys/libmath/gen2.py +++ /dev/null @@ -1,410 +0,0 @@ -from C import * -import copy - -ROW = 4 -COL = 4 - -def pkg(name): - return f"blas·{name}" - -def typeify(string, kind): - if (kind == Float32): - return f"{string}f" - if (kind == Float64): - return f"{string}d" - -# ------------------------------------------------------------------------ -# Helpers (abandoning the automatic unroll from level 1) - -def toarray(len: int, *args): - return [Param(Array(arg.type, len), arg.name) for arg in args] - -def TryIndex(x, i): - if IsArrayType(x.var.type): - return Index(x, i) - return x - -def AddElts(root, *vars): - for var in vars: - root = Add(root ,var) - return root - -def Round(store, number, by): - return Set(store, And(number, Negate(I(by-1)))) - -def UnitIncs(root, *incs): - root = EQ(root, I(1)) - for inc in incs: - root = AndAnd(root, EQ(inc, I(1))) - return root - -def IsInc(p): - return p.name != "incx" and p.name != "incy" - -def Identity(p): - return True - -def FilterParams(params, func): - return [p for p in params if func(p)] - -def StrideAllIndexedTerms(stmts, var, itor, inc): - def is_hit(x): - if isinstance(x, Index): - if isinstance(x.i, BinaryOp) and x.x == var: - return x.i.l == itor - - return False - - terms = [] - for stmt in stmts: - Visit(stmt, lambda node: Filter(node, is_hit, terms)) - - for term in terms: - term.i = Mul(Paren(term.i), inc) - -def AsStrided(stmts, var, itor, inc): - def increment(x): - if isinstance(x, Index): - if isinstance(x.i, BinaryOp) and x.x == var: - return Index(x.x, Mul(Paren(x.i), inc)) - - return copy.copy(x) - - if isinstance(stmts, Block): - return Block(*[Make(stmt, lambda node: Transform(node, increment)) for stmt in stmts.stmts]) - elif isinstance(stmts, list): - return [Make(stmt, lambda node: Transform(node, increment(node))) for stmt in stmts] - else: - raise TypeError("unrecognized stmts type") - -class Iter(object): - def __init__(self, it, end, len, inc): - self.it = it - self.end = end - self.len = len - self.inc = inc - -def DoubleLoop(top, bot, Kernel, Preamble=[], Postamble=[]): - def Step(it, inc): - if inc == 1: - return Inc(it) - else: - return AddSet(it, I(inc)) - - return For(Set(top.it, I(0)), LT(top.it, top.end), Step(top.it, top.inc), - Block(*[ - *[func(i) for func in Preamble for i in range(top.inc)], - For(Set(bot.it, I(0)), LT(bot.it, bot.end), Step(bot.it, bot.inc), - Block(*[func for i in range(top.inc) for func in Kernel(top.it, bot.it, i, bot.inc)]) - ), - For(None, LT(bot.it, bot.len), Inc(bot.it), - Block(*[func for i in range(top.inc) for func in Kernel(top.it, bot.it, i, 1)]) - ), - *[func(i) for func in Postamble for i in range(bot.inc)] - ]) - ) - -def TriangularLoop(top, bot, Kernel, Preamble=[], Postamble=lambda i: None, upper=True): - # --------------------------------------------------------------- - # Helper functions - - def Finish(j, upper=False): - if upper: - if j == 0: - return Block(For(None, GT(bot.it, top.it), Dec(bot.it), - Block(*[func for i in range(j, top.inc) for func in Kernel(top.it, bot.it, i, 1)]) - ), Postamble(j) - ) - return Block(*[func for i in range(j, top.inc) for func in Kernel(top.it, bot.it, i, 1)], Dec(bot.it), Postamble(j)) - if not upper: - if j == 0: - return Block(For(None, LT(bot.it, top.it), Inc(bot.it), - Block(*[func for i in range(j, top.inc) for func in Kernel(top.it, bot.it, i, 1)]) - ), Postamble(j)) - return Block(*[func for i in range(j, top.inc) for func in Kernel(top.it, bot.it, i, 1)], Inc(bot.it), Postamble(j)) - - def Step(it, inc, down=False): - if not down: - if inc == 1: - return Inc(it) - else: - return AddSet(it, I(inc)) - else: - if inc == 1: - return Dec(it) - else: - return SubSet(it, I(inc)) - - # --------------------------------------------------------------- - # Main body - - if upper: - return For(Set(top.it, Sub(top.end, I(1))), GE(top.it, I(0)), Step(top.it, top.inc, down=True), - Block(*[ - *[func(i) for func in Preamble for i in range(top.inc)], - Set(bot.end, Sub(Paren(EvenTo(Paren(Sub(top.it, Paren(Sub(bot.len, I(1))))), bot.inc)), Paren(Sub(bot.len, I(1))))), - For(Set(bot.it, Sub(bot.len, I(1))), GT(bot.it, bot.end), Step(bot.it, bot.inc, down=True), - Block(*[func for i in range(top.inc) for func in Kernel(top.it, bot.it, i, bot.inc)]) - ), - *[ Finish(j, upper) if bot.inc > 1 else Postamble(j) for j in range(top.inc) ], - # *[func(i) for func in Postamble for i in range(bot.inc)] - ]) - ) - else: - return For(Set(top.it, I(0)), LT(top.it, top.end), Step(top.it, top.inc), - Block(*[ - *[func(i) for func in Preamble for i in range(top.inc)], - Set(bot.end, EvenTo(top.it, bot.inc)), - For(Set(bot.it, I(0)), LT(bot.it, bot.end), Step(bot.it, bot.inc), - Block(*[func for i in range(top.inc) for func in Kernel(top.it, bot.it, i, bot.inc)]) - ), - *[ Finish(j) if bot.inc > 1 else Postamble(j) for j in range(top.inc) ], - # *[func(i) for func in Postamble for i in range(bot.inc)] - ]) - ) - -def Shift(x, i, uplo): - if uplo == "upper": - return Sub(x, I(i)) - if uplo == "lower": - return Add(x, I(i)) - raise ValueError("unrecognized value") - - -def ToKernel(name, loop): - vars = VarsUsed(StmtExpr(loop.init)) | VarsUsed(StmtExpr(loop.cond)) | \ - VarsUsed(StmtExpr(loop.step)) | VarsUsed(loop.body) - -def ExpandAdd(row, x, i: int, c: Emitter, inc: int, uplo): - offset = Add(c, I(0)) - root = Mul(Index(Index(row, I(i)), offset), Index(x, offset)) - for n in range(1, inc): - offset = Shift(c, I(n), uplo) - root = Add(root, Mul(Index(Index(row, I(i)), offset), Index(x, offset))) - return root - -# ------------------------------------------------------------------------ -# Blas level 2 functions - -def trsv(kind): - name = typeify("trsv", kind) - F = Func(pkg(name), Void, - Params( - (UInt32, "flag"), (Int, "len"), (Ptr(kind), "m"), (Int, "incm"), (Ptr(kind), "x"), (Int, "incx") - ), - Vars( - (Int, "r"), (Int, "c"), (Int, "nr"), (Int, "nc"), (Array(Ptr(kind), ROW), "row"), (Array(kind, COL), "res") - ) - ) - - r, c, nr, nc, row, res = F.variables("r", "c", "nr", "nc", "row", "res") - flag, _len, m, incm, x = F.variables("flag", "len", "m", "incm", "x") - incx = F.variables("incx") - - rows, cols = lambda inc_r: Iter(r, nr, _len, inc_r), lambda inc_c: Iter(c, nc, _len, inc_c) - - template = lambda inc_r, inc_c, upper: TriangularLoop(rows(inc_r), cols(inc_c), - Kernel = lambda r, c, i, inc: [AddSet(Index(res, I(i)), ExpandAdd(row, x, i, c, inc, upper))], - Preamble = [lambda i: Set(Index(row, I(i)), Add(m, Mul(Paren(Shift(r, i, upper)), incm))), - lambda i: Set(Index(res, I(i)), I(0))], - Postamble = lambda i: Set(Index(x, Shift(c, I(0), upper)), Div(Index(res, I(i)), Index(Index(row, I(i)), Shift(c, I(0), upper)))), - upper = upper == "upper" - ) - - loop = template(1, 1, "upper") - loop.emit() - -def syr(kind): - name = typeify("syr", kind) - F = Func(pkg(name), Void, - Params( - (UInt32, "flag"), (Int, "len"), (kind, "a"), - (Ptr(kind), "x"), (Int, "incx"), (Ptr(kind), "m"), (Int, "incm"), - ), - Vars( - (Int, "r"), (Int, "c"), (Int, "nr"), (Int, "nc"), (Array(Ptr(kind), ROW), "row"), (Array(kind, COL), "res") - ) - ) - - r, c, nr, nc, row, res = F.variables("r", "c", "nr", "nc", "row", "res") - flag, _len, a, m, incm, x = F.variables("flag", "len", "a", "m", "incm", "x") - incx = F.variables("incx") - - rows, cols = lambda inc_r: Iter(r, nr, _len, inc_r), lambda inc_c: Iter(c, nc, _len, inc_c) - - template = lambda inc_r, inc_c, upper: TriangularLoop(rows(inc_r), cols(inc_c), - Kernel = lambda r, c, i, inc: [AddSet(Index(Index(row, I(i)), Shift(c, j, upper)), Mul(Index(res, I(i)), Index(x, Shift(c, j, upper)))) for j in range(inc)], - Preamble = [lambda i: Set(Index(row, I(i)), Add(m, Mul(Paren(Shift(r, i, upper)), incm))), - lambda i: Set(Index(res, I(i)), Mul(a, Index(x, Shift(r, i, upper))))], - upper = upper == "upper" - ) - - blocks = [] - for layout in ["lower", "upper"]: - floop = template(2, 2, layout) - sloop = template(2, 2, layout) - sloop.body = AsStrided(sloop.body, x, c, incx) - - fini = template(1, 1, layout) - fini.init = None - fini.body = AsStrided(fini.body, x, c, incx) - fini.cond = LT(r, _len) - - blocks.append( - Block( - If(UnitIncs(incx), Block(floop), Block(sloop)), - fini, - Return(), - ) - ) - F.execute( - Set(nr, EvenTo(_len, ROW)), - If(flag, blocks[0], blocks[1]) - ) - F.emit() - -def ger(kind): - name = typeify("ger", kind) - F = Func(pkg(name), Void, - Params( - (Int, "nrow"), (Int, "ncol"), (kind, "a"), - (Ptr(kind), "x"), (Int, "incx"), (Ptr(kind), "y"), (Int, "incy"), (Ptr(kind), "m"), (Int, "incm"), - ), - Vars( - (Int, "r"), (Int, "c"), (Int, "nr"), (Int, "nc"), (Array(Ptr(kind), ROW), "row"), (Array(kind, COL), "res") - ) - ) - - r, c, nr, nc, row, res = F.variables("r", "c", "nr", "nc", "row", "res") - nrow, ncol, a, m, incm, x, y = F.variables("nrow", "ncol", "a", "m", "incm", "x", "y") - incx, incy = F.variables("incx", "incy") - - rows, cols = lambda incr: Iter(r, nr, nrow, incr), lambda incc: Iter(c, nc, ncol, incc) - - template = lambda incr, incc: DoubleLoop(rows(incr), cols(incc), - Kernel = lambda r, c, i, inc: [AddSet(Index(Index(row, I(i)), Add(c, I(j))), Mul(Index(res, I(i)), Index(y, Add(c, I(j))))) for j in range(inc)], - Preamble = [lambda i: Set(Index(row, I(i)), Add(m, Mul(Paren(Add(r, I(i))), incm))), - lambda i: Set(Index(res, I(i)), Mul(a, Index(x, Add(r, I(i)))))], - ) - - # loop = template(1, 1) - # F.execute(loop) - # F.emit() - floop = template(ROW, COL) - sloop = template(ROW, COL) - sloop.body = AsStrided(AsStrided(sloop.body, x, c, incx), y, r, incy) - - fini = template(1, 2*COL) - fini.init = None - fini.body = AsStrided(AsStrided(fini.body, x, c, incx), y, r, incy) - fini.cond = LT(r, nrow) - - F.execute( - Set(nr, EvenTo(nrow, ROW)), - Set(nc, EvenTo(ncol, COL)), - If(UnitIncs(incx, incy), Block(floop), Block(sloop)) - ) - F.execute(fini) - F.emit() - -def gemv(kind): - name = typeify("gemv", kind) - params = Params( - (Int, "nrow"), (Int, "ncol"), (kind, "a"), (Ptr(kind), "m"), (Int, "incm"), - (Ptr(kind), "x"), (Int, "incx"), (kind, "b"), (Ptr(kind), "y"), (Int, "incy") - ) - stack = Vars((Int, "r"), (Int, "c"), (Ptr(kind), "row"), (kind, "res")) - F = Func(pkg(name), Void, params, stack) - - # --------------------- - # Kernel - - def innerloop(rinc, cit, cend, cinc): - return For(Set(cit, I(0)), LT(cit, cend), AddSet(cit, I(cinc)), - Block(*[AddSet(TryIndex(res, I(i)), - AddElts(*(Mul( - Index(TryIndex(row, I(i)), Add(cit, I(j))), - Index(x, Add(cit, I(j)))) for j in range(cinc) - ) - ) - ) for i in range(rinc)]) - ) - - def tryloop(rinc, cit, cend, cinc): - if cinc > 1: - loop = innerloop(rinc, cit, cend, 1) - loop.init = None - return loop - - def outerloop(rit, rlen, rinc, cit, cend, clen, cinc, row, res): - return For(Set(rit, I(0)), LT(rit, rlen), AddSet(r, I(rinc)), - Block( - *[Set(TryIndex(row, I(i)), Add(m, Mul(Paren(Add(rit, I(i))), incm))) for i in range(rinc)], - *[Set(TryIndex(res, I(i)), I(0)) for i in range(rinc)], - innerloop(rinc, cit, cend, cinc), - tryloop(rinc, cit, clen, cinc), - *[Set(Index(y, Add(rit, I(i))), Add(Mul(a, TryIndex(res, I(i))), Mul(b, Index(y, Add(rit, I(i)))))) for i in range(rinc)] - ) - ) - - kerns = [] - for func, sfx in [(IsInc, ""), (Identity, "_s")]: - kern = Func(f"{name}{sfx}_{ROW}x{COL}kernel", Void, FilterParams(params, func), stack[0:2] + toarray(ROW, *stack[2:]), static=True) - r, c, row, res = kern.variables("r", "c", "row", "res") - nrow, ncol, a, m, incm, x, b, y = kern.variables("nrow", "ncol", "a", "m", "incm", "x", "b", "y") - - ncolr = kern.declare(Var(Int, "ncolr")) - loop = outerloop(r, nrow, ROW, c, ncolr, ncol, COL, row, res) - - kern.execute(Round(ncolr, ncol, COL)) - kern.execute(loop) - if "_s" in sfx: - incx, incy = kern.variables("incx", "incy") - StrideAllIndexedTerms(kern.stmts, x, c, incx) - StrideAllIndexedTerms(kern.stmts, y, r, incy) - - kern.emit() - - kerns.append(kern) - - r, c, row, res = F.variables("r", "c", "row", "res") - nrow, ncol, a, m, incm, x, b, y = F.variables("nrow", "ncol", "a", "m", "incm", "x", "b", "y") - incx, incy = F.variables("incx", "incy") - F.execute(Round(r, nrow, ROW)) - F.execute( - If(UnitIncs(incx, incy), - Block(Call(kerns[0], [r, ncol, a, m, incm, x, b, y])), - Block(Call(kerns[1], [r, ncol, a, m, incm, x, incx, b, y, incy])), - ) - ) - - F.params = Params((UInt32, "flag")) + F.params - - remainder = outerloop(r, nrow, 1, c, ncol, ncol, COL, row, res) - remainder.init = None - F.execute(remainder) - StrideAllIndexedTerms(F, x, c, incx) - StrideAllIndexedTerms(F, y, r, incy) - - F.emit() - -# ------------------------------------------------------------------------ -# Code Generation - -if __name__ == "__main__": - emit("#include \n") - emit("#include \n") - emit("#include \n") - emitln() - emit("/*********************************************************/\n") - emit("/* THIS CODE IS GENERATED BY GEN2.PY! DON'T EDIT BY HAND */\n") - emit("/*********************************************************/\n") - emitln(2) - - for kind in [Float64]: #[Float32, Float64]: - trsv(kind) - # syr(kind) - # ger(kind) - # gemv(kind) - - flush() diff --git a/sys/libmath/rules.mk b/sys/libmath/rules.mk index c7d8d61..fddd74a 100644 --- a/sys/libmath/rules.mk +++ b/sys/libmath/rules.mk @@ -5,9 +5,7 @@ include share/push.mk # Local sources SRCS_$(d) := \ $(d)/basic.c \ - $(d)/blas1.c \ - $(d)/blas2.c \ - $(d)/linalg.c + $(d)/blas1.c LIBS_$(d) := $(d)/libmath.a BINS_$(d) := TSTS_$(d) := \ -- cgit v1.2.1