From d982e7c2fdebf560ccce193cb98b85d4fac28a45 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Wed, 13 May 2020 17:30:19 -0700 Subject: blas 1 generation code complete --- include/libmath.h | 46 +- lib/c.py | 1579 -------------------------------------- rules.mk | 17 +- sys/libbio/rules.mk | 6 +- sys/libmath/blas.c | 127 +--- sys/libmath/blas1.c | 2062 ++++++++++++++++++++++++++++++++++++++++++++++++++ sys/libmath/gen1.py | 357 +++++++++ sys/libmath/linalg.c | 15 +- sys/libmath/proto.c | 1761 ------------------------------------------ sys/libmath/rules.mk | 10 +- sys/libn/rules.mk | 6 +- 11 files changed, 2480 insertions(+), 3506 deletions(-) delete mode 100644 lib/c.py create mode 100644 sys/libmath/blas1.c create mode 100755 sys/libmath/gen1.py delete mode 100644 sys/libmath/proto.c diff --git a/include/libmath.h b/include/libmath.h index 5a7dc4e..40ae4ee 100644 --- a/include/libmath.h +++ b/include/libmath.h @@ -130,49 +130,7 @@ double math·trunc(double); float math·truncf(float); // ----------------------------------------------------------------------- -// basic linear algebra compute kernels - -// TODO: think of better names -enum -{ - blas·LowerTri = 1u, - blas·Transpose = 2u, - blas·ConjTranspose = 4u, - blas·DiagOnes = 8u, - blas·LeftSide = 16u, -}; - -typedef uint32 blas·Flag; - -/* level 1 */ -void blas·rot(int len, double *x, int incx, double *y, int incy, double cos, double sin); -void blas·rotg(double *a, double *b, double *cos, double *sin); -error blas·rotm(int len, double *x, int incx, double *y, int incy, double p[5]); -void blas·scale(int len, double a, double *x, int inc); -void blas·copy(int len, double *x, int incx, double *y, int incy); -void blas·swap(int len, double *x, int incx, double *y, int incy); -void blas·axpy(int len, double a, double *x, int incx, double *y, int incy); -double blas·dot(int len, double *x, int incx, double *y, int incy); -double blas·norm(int len, double *x, int inc); -double blas·sum(int len, double *x, int inc); -int blas·argmax(int len, double *x, int inc); -int blas·argmin(int len, double *x, int inc); - -/* level 2 */ -void blas·tpmv(blas·Flag f, int n, double *m, double *x); -error blas·gemv(int nrow, int ncol, double a, double *m, int incm, double *x, int incx, double b, double *y, int incy) ; -void blas·tpsv(blas·Flag f, int n, double *m, double *x); -void blas·ger(int nrow, int ncol, double a, double *x, double *y, double *m); -void blas·her(int n, double a, double *x, double *m); -void blas·syr(int nrow, int ncol, double a, double *x, double *m); - -/* level 3 */ -void blas·gemm(int n1, int n2, int n3, double a, double *m1, double *m2, double b, double *m3); -void blas·trmm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2); -void blas·trsm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2); - -// ----------------------------------------------------------------------- -// higher level linear algebra +// linear algebra typedef struct math·Vector { @@ -185,7 +143,7 @@ typedef struct math·Vector typedef struct math·Matrix { double *data; - blas·Flag kind; + uint32 kind; int dim[2]; } math·Matrix; diff --git a/lib/c.py b/lib/c.py deleted file mode 100644 index bbc9ce3..0000000 --- a/lib/c.py +++ /dev/null @@ -1,1579 +0,0 @@ -#! 
/bin/python3 -from __future__ import annotations - -import os -import sys - -import sympy - -from enum import Enum -from typing import Callable, List, Tuple, Dict -from functools import singledispatch - -numel = len - -# ------------------------------------------------------------------------ -# String buffer - -level = 0 -buffer = "" - -def emit(s: str): - global buffer - buffer += s - -def emitln(n=1): - global buffer - buffer += "\n"*n + (" " * level) - -def enter_scope(): - global level - emit("{") - level += 1 - emitln() - -def exits_scope(): - global level - level -= 1 - emitln() - emit("}") - -def emitheader(): - emit("#include \n") - emit("#include \n") - emit("#include \n") - emitln() - -# ------------------------------------------------------------------------ -# Simple C AST -# TODO: Type checking - -# Abstract class everything will derive from -# All AST nodes will have an "emit" function that outputs formatted C code -class Emitter(object): - def emit(self): - pass - -# ------------------------------------------ -# Representation of a C type - -class Type(Emitter): - def emit(self): - pass - - def emitspec(self, var): - pass - -class Base(Type): - def __init__(self, name: str): - self.name = name - - def __hash__(self): - return hash(self.name) - - def __eq__(self, other): - return type(self) == type(other) and self.name == other.name - - def __str__(self): - return self.name - - def emit(self): - emit(self.name) - - def emitspec(self, ident): - emit(f"{ident}") - -# TODO: Operation lookup tables... - -class Ptr(Type): - def __init__(self, to: Type): - self.to = to - - def __str__(self): - return f"*{self.to}" - - def __eq__(self, other): - return type(self) == type(other) and self.to == other.to - - def __hash__(self): - return hash((hash(self.to), "*")) - - def emit(self): - self.to.emit() - - def emitspec(self, ident): - emit("*") - self.to.emitspec(ident) - -class Array(Type): - def __init__(self, base: Type, len: int): - self.base = base - self.len = len - - def __str__(self): - return f"[{self.len}]{self.base}" - - def __eq__(self, other): - return type(self) == type(other) and self.base == other.base - - def __hash__(self): - return hash((hash(self.base), self.len)) - - def emit(self): - self.base.emit() - - def emitspec(self, ident): - self.base.emitspec(ident) - emit(f"[{self.len}]") - -# Machine primitive types -Void = Base("void") -Error = Base("error") - -Byte = Base("byte") -String = Ptr(Byte) - -Int = Base("int") -Int8 = Base("int8") -Int16 = Base("int16") -Int32 = Base("int32") -Int64 = Base("int64") -Int32x4 = Base("__mm128i") -Int32x8 = Base("__mm256i") -Int64x2 = Base("__mm128i") -Int64x4 = Base("__mm256i") - -Float32 = Base("float") -Float64 = Base("double") -Float32x4 = Base("__m128") -Float32x8 = Base("__mm256") -Float64x2 = Base("__m128d") -Float64x4 = Base("__mm256d") - -def IsVectorType(kind): - if kind is Float32x4 or \ - kind is Float32x8 or \ - kind is Float64x2 or \ - kind is Float64x4 or \ - kind is Int32x4 or \ - kind is Int32x8 or \ - kind is Int64x2 or \ - kind is Int64x4: - return True - - if type(kind) == Ptr: - return IsVectorType(kind.to) - - return False - -def IsArrayType(kind): - return isinstance(kind, Array) - -# TODO: Make this ARCH dependent -BitDepth= { - Void: 8, - Int: 32, - Int32: 32, - Int64: 64, - Float32: 64, - Float64: 64, -} - -class SIMD(Enum): - SSE = "sse" - SSE2 = "sse2" - AVX = "avx" - AVX2 = "avx2" - FMA3 = "fma3" - AVX5 = "avx512" - -RegisterSize = { - SIMD.SSE: 128, - SIMD.SSE2: 128, - SIMD.AVX: 256, - SIMD.AVX2: 256, 
- SIMD.FMA3: 256, - SIMD.AVX5: 512, -} - -SIMDSupport = { - SIMD.SSE: set([Float32]), - SIMD.SSE2: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]), - SIMD.AVX: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]), - SIMD.AVX2: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]), - SIMD.FMA3: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]), - SIMD.AVX5: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]), -} - -# TODO: Think of a better way to handle this -def emit·load128(l, r): - if l is not None: - l.emit() - emit(" = ") - emit("_mm_loadu_sd(") - r.emit() - emit(")") - -def emit·broadcast256(l, r): - l.emit() - emit(" = _mm256_broadcastsd_pd(") - r.emit() - emit(")") - -def emit·load256(l, r): - if l is not None: - l.emit() - emit(" = ") - emit("_mm256_loadu_pd(") - r.emit() - emit(")") - -def emit·store256(l, r): - emit("_mm256_storeu_pd(") - l.emit() - emit(", ") - r.emit() - emit(")") - -def emit·copy256(l, r): - emit("_mm256_storeu_pd(") - l.emit() - emit(", ") - emit("_mm256_loadu_pd(") - r.emit() - emit("))") - -# TODO: Typedefs... - -# ------------------------------------------ -# C expressions -class Expr(Emitter): - def emit(): - pass - -# Literals -class Literal(Expr): - def emit(self): - emit(f"{self}") - -class I(Literal, int): - def __new__(cls, i: int): - return super(I, cls).__new__(cls, i) - -class F(Literal, float): - def __new__(self, f: float): - return super(F, self).__new__(cls, f) - -class S(Literal, str): - def __new__(self, s: str): - return super(S, self).__new__(cls, s) - -# Ident of symbol -class Ident(Expr): - def __init__(self, var): - self.name = var.name - self.var = var - - def __str__(self): - return str(self.name) - - def __hash__(self): - return hash(self.name) - - def __eq__(self, other): - return type(self) == type(other) and self.name == other.name - - def emit(self): - emit(f"{self.name}") - - -# Unary operators -class UnaryOp(Expr): - def __init__(self, x: Expr): - self.x = x - - def emit(self): - pass - -class Deref(UnaryOp): - method = { - Ptr(Float64x4) : lambda x: emit·load256(None, x), - } - - def emit(self): - kind = GetType(self.x) - if kind in self.method: - self.method[kind](self.x) - else: - emit("*") - self.x.emit() - -class Negate(UnaryOp): - def emit(self): - emit("~") - self.x.emit() - -class Ref(UnaryOp): - def emit(self): - emit("&") - self.x.emit() - -class Inc(UnaryOp): - def __init__(self, x: Expr, pre=False): - self.x = x - self.pre = pre - - def emit(self): - if self.pre: - emit("++") - self.x.emit() - else: - self.x.emit() - emit("++") - -class Dec(UnaryOp): - def __init__(self, x: Expr, pre=False): - self.x = x - self.pre = pre - - def emit(self): - if self.pre: - emit("--") - self.x.emit() - else: - self.x.emit() - emit("--") - -# Binary operators -class BinaryOp(Expr): - def __init__(self, left: Expr, right: Expr): - self.l = left - self.r = right - - def emit(self): - pass - -# TODO: check types if they are vectorized and emit correct intrinsic -class Add(BinaryOp): - def emit(self): - self.l.emit() - emit(f" + ") - self.r.emit() - -class Sub(BinaryOp): - def emit(self): - self.l.emit() - emit(f" - ") - self.r.emit() - -class Mul(BinaryOp): - def emit(self): - self.l.emit() - emit(f" * ") - self.r.emit() - -class Div(BinaryOp): - def emit(self): - self.l.emit() - emit(f" / ") - self.r.emit() - -class And(BinaryOp): - def emit(self): - self.l.emit() - emit(f" & ") - self.r.emit() - -class Xor(BinaryOp): - def emit(self): - self.l.emit() - emit(f" ^ ") - self.r.emit() - -class GT(BinaryOp): - def 
emit(self): - self.l.emit() - emit(f" > ") - self.r.emit() - -class LT(BinaryOp): - def emit(self): - self.l.emit() - emit(f" < ") - self.r.emit() - -class GE(BinaryOp): - def emit(self): - self.l.emit() - emit(f" >= ") - self.r.emit() - -class LE(BinaryOp): - def emit(self): - self.l.emit() - emit(f" <= ") - self.r.emit() - -class EQ(BinaryOp): - def emit(self): - self.l.emit() - emit(f" == ") - self.r.emit() - -# Loads a scalar -class SIMDLoad(BinaryOp): - def emit(self): - self.l.emit() - self.r.emit() - -# Loads data at address -class SIMDLoadAt(BinaryOp): - def emit(self): - self.l.emit() - emit(" = _mm256_loadu_pd(") - self.r.emit() - emit(")") - -# Assignment (stores) -class Assign(Expr): - def __init__(self, lhs: Expr, rhs: Expr): - self.lhs = lhs - self.rhs = rhs - -class Set(Assign): - method = { - (Float64x2, Ptr(Float64)) : emit·load128, - (Float64x4, Float64x2) : emit·broadcast256, - (Float64x4, Ptr(Float64x4)) : emit·load256, - (Ptr(Float64x4), Float64x4) : emit·store256, - (Ptr(Float64x4), Ptr(Float64x4)) : emit·copy256, - } - def emit(self): - lhs = GetType(self.lhs) - rhs = GetType(self.rhs) - if (lhs, rhs) in self.method: - self.method[(lhs, rhs)](self.lhs, self.rhs) - else: - self.lhs.emit() - emit(f" = ") - self.rhs.emit() - -class AddSet(Assign): - def emit(self): - self.lhs.emit() - emit(f" += ") - self.rhs.emit() - -class SubSet(Assign): - def emit(self): - self.lhs.emit() - emit(f" -= ") - self.rhs.emit() - -class MulSet(Assign): - def emit(self): - self.lhs.emit() - emit(f" *= ") - self.rhs.emit() - -class DivSet(Assign): - def emit(self): - self.lhs.emit() - emit(f" /= ") - self.rhs.emit() - -class Comma(Expr): - def __init__(self, x: Expr, next: Expr): - self.expr = (x, next) - - def emit(self): - self.expr[0].emit() - emit(", ") - self.expr[1].emit() - -class Index(Expr): - def __init__(self, x: Expr, i: Expr): - self.x = x - self.i = i - - def emit(self): - self.x.emit() - emit("[") - self.i.emit() - emit("]") - -class Paren(Expr): - def __init__(self, x: Expr): - self.x = x - - def emit(self): - emit("(") - self.x.emit() - emit(")") - -class Call(Expr): - def __init__(self, func: Func, args: List[Param]): - self.func = func - self.args = args - - def emit(self): - emit(self.func.name) - emit("(") - if numel(self.args) > 0: - self.args[0].emit() - for arg in self.args[1:]: - emit(", ") - arg.emit() - emit(")") - -def ExprList(*expr: Expr | List[Expr]): - if numel(expr) > 1: - x = expr[0] - return Comma(x, ExprList(*expr[1:])) - else: - return expr[0] - -# Common assignments are kept globally -# C statements - -class Stmt(Emitter): - def emit(self): - emit(";") - -class Empty(Stmt): - pass - -class Block(Stmt): - def __init__(self, *stmts: List[Stmt | Expr]): - self.stmts = [ s if isinstance(s, Stmt) else StmtExpr(s) for s in stmts ] - - def emit(self): - enter_scope() - for i, stmt in enumerate(self.stmts): - stmt.emit() - if i < numel(self.stmts) - 1: - emitln() - - exits_scope() - -class If(Stmt): - def __init__(self, cond: Expr | List[Expr], then: Stmt | List[Stmt], orelse: Stmt | List[Stmt] | None = None): - self.cond = cond if isinstance(cond, Expr) else ExprList(*cond) - self.then = Block(then) if isinstance(then, list) else then - self.orelse = Block(orelse) if isinstance(orelse, list) else orelse - - def emit(self): - emit("if (") - self.cond.emit() - emit(") ") - self.then.emit() - if self.orelse is not None: - self.orelse.emit() - -class For(Stmt): - def __init__(self, init: Expr | List[Expr], cond: Expr | List[Expr], step: Expr | List[Expr], body: 
Stmt| None = None): - self.init = init if isinstance(init, Expr) else ExprList(*init) - self.cond = cond if isinstance(cond, Expr) else ExprList(*cond) - self.step = step if isinstance(step, Expr) else ExprList(*step) - self.body = body if body is not None else Empty() - - def emit(self): - emit("for (") - if self.init is not None: - self.init.emit() - emit("; ") - if self.cond is not None: - self.cond.emit() - emit("; ") - if self.step is not None: - self.step.emit() - emit(") ") - - if isinstance(self.body, Block): - self.body.emit() - else: - enter_scope() - self.body.emit() - exits_scope() - - def execute(self, stmt: Stmt | Expr): - if isinstance(stmt, Expr): - stmt = StmtExpr(stmt) - - if isinstance(self.body, Empty): - self.body = stmt - return - elif not isinstance(self.body, Block): - self.body = Block(self.body) - self.body.stmts.append(stmt) - -class Return(Stmt): - def __init__(self, val: Expr): - self.val = val - - def emit(self): - emitln() - emit("return ") - self.val.emit() - super(Return, self).emit() - -class StmtExpr(Stmt): - def __init__(self, x: Expr): - self.x = x - - def emit(self): - self.x.emit() - super(StmtExpr, self).emit() - -class Mem(Enum): - Auto = "" - Static = "static" - Register = "register" - Typedef = "typedef" - External = "extern" - -class Decl(Emitter): - def __init__(self): - pass - - def emit(self): - pass - -class Func(Decl): - def __init__(self, name: str, ret: Expr = Void, params: List[Param] = None, vars: List[Var | List[Var]] = None, body: List[Stmt] = None): - self.name = name - self.ret = ret - self.params = params if params is not None else [] - self.vars = vars if vars is not None else [] - self.stmts = body if body is not None else [] - - def emit(self): - self.ret.emit() - emitln() - emit(self.name) - emit("(") - for i, p in enumerate(self.params): - p.emittype() - emit(" ") - p.emitspec() - if i < numel(self.params) - 1: - emit(", ") - emit(")\n") - - enter_scope() - - for var in self.vars: - if isinstance(var, list): - v = var[0] - v.emittype() - emit(" ") - v.emitspec() - for v in var[1:]: - emit(", ") - v.emitspec() - else: - var.emittype() - emit(" ") - var.emitspec() - - emit(";") - emitln() - - if numel(self.vars) > 0: - emitln() - - for stmt in self.stmts[:-1]: - stmt.emit() - emitln() - if numel(self.stmts) > 0: - self.stmts[-1].emit() - - exits_scope() - emitln(2) - - def declare(self, var: Var, *vars: List[Var | List[Var]]) -> Expr | List[Expr]: - if var.name in [v.name for v in self.vars]: - return - - self.vars.append(var) - if numel(vars) == 0: - return Ident(var) - - self.vars.extend(vars) - - idents = [Ident(var)] - idents += [Ident(v) for v in vars] - return idents - - def execute(self, stmt: Stmt | Expr, *args: List[Stmt | Expr]): - def push(n): - if isinstance(n, Stmt): - self.stmts.append(n) - elif isinstance(n, Expr): - self.stmts.append(StmtExpr(n)) - else: - raise TypeError("unrecognized type for function") - push(stmt) - for arg in args: - push(arg) - - def variables(self, *idents: List[str]) -> List[Expr]: - vars = {v.name : v for v in self.vars + self.params} - - if numel(idents) == 1: - return Ident(vars[idents[0]]) - return [Ident(vars[ident]) for ident in idents] - -class Var(Decl): - def __init__(self, type: Type, name: str, storage: Mem = Mem.Auto): - self.name = name - self.type = type - self.storage = storage - - def emit(self): - emit(f"{self.name}") - - def emittype(self): - if self.storage != Mem.Auto: - emit(self.storage.value) - emit(" ") - self.type.emit() - - def emitspec(self): - 
self.type.emitspec(self.name) - - @classmethod - def copy(cls, other: Var): - return cls(other.type, other.name, other.storage) - -class Param(Var): - def __init__(self, type: Type, name: str): - return super(Param, self).__init__(type, name, Mem.Auto) - - @classmethod - def copy(cls, other: Param): - return cls(other.type, other.name) - -def Params(*ps: List[Tuple(Type, str)]) -> List[Param]: - return [Param(p[0], p[1]) for p in ps] - -def Vars(*ps: List[Tuple(Type, str)]) -> List[Var]: - return [Var(p[0], p[1]) for p in ps] - -# ------------------------------------------------------------------------ -# AST modification/production functions - -# ------------------------------------------ -# basic (non-recursive) commands - -def Swap(x: Var, y: Var, tmp: Var) -> Stmt: - return StmtExpr(ExprList(Set(tmp, x), Set(x, y), Set(y, tmp))) - -def IsLoop(s: Stmt) -> bool: - return type(s) == For - -def EvenTo(x: Var, n: int) -> Var: - return And(x, Negate(I(n-1))) - -# ------------------------------------------ -# Expand: takes statement, indexed, and expands it x times - -# def Expand(s: Stmt, times: int) -> Block: -# if not isinstance(s, StmtExpr): -# raise TypeError(f"{type(x)} not supported by Expand operation") - -# return Block(StmtExpr(Step(s.x, i)) for i in range(times)) - -# ------------------------------------------ -# repeat: takes command on an array and repeats it - -@singledispatch -def Repeat(x: Expr, times: int) -> Expr | List[Expr]: - raise TypeError(f"{type(x)} not supported by Repeat operation") - -@Repeat.register -def _(x: Inc, times: int): - return AddSet(x.x, I(times)) - -@Repeat.register -def _(x: Dec, times: int): - return DecSet(x.x, I(times)) - -@Repeat.register -def _(x: Comma, times: int): - return Comma(Repeat(x.expr[0], times), Repeat(x.expr[1], times)) - -@singledispatch -def Repeat(x: Expr, times: int) -> Expr | List[Expr]: - raise TypeError(f"{type(x)} not supported by Repeat operation") - -@Repeat.register -def _(x: Inc, times: int): - return AddSet(x.x, I(times)) - -@Repeat.register -def _(x: Dec, times: int): - return DecSet(x.x, I(times)) - -@Repeat.register -def _(x: Comma, times: int): - return Comma(Repeat(x.expr[0], times), Repeat(x.expr[1], times)) - -# ------------------------------------------ -# step: indexes an expression by i - -@singledispatch -def Step(x: Expr, i: int) -> Expr: - raise TypeError(f"{type(x)} not supported by Step operation") - -@Step.register -def _(x: Comma, i: int): - return Comma(Step(x.expr[0], i), Step(x.expr[1], i)) - -@Step.register -def _(x: Assign, i: int): - return type(x)(Step(x.lhs, i), Step(x.rhs, i)) - -@Step.register -def _(x: BinaryOp, i: int): - return type(x)(Step(x.l, i), Step(x.r, i)) - -@Step.register -def _(x: UnaryOp, i: int): - return type(x)(Step(x.x, i)) - -@Step.register -def _(x: Ident, i: int): - return x - -@Step.register -def _(x: Deref, i: int): - return Index(x.x, I(i)) - -@Step.register -def _(ix: Index, i: int): - return Index(ix.x, Add(ix.i, I(i))) - -# ------------------------------------------ -# bfs search on statements in ast - -@singledispatch -def Visit(s: Stmt, func): - raise TypeError(f"{type(s)} not supported by Visit operation") - -@Visit.register -def _(s: Empty, func): - return - -@Visit.register -def _(blk: Block, func): - for stmt in blk.stmts: - Visit(stmt, func) - -@Visit.register -def _(jmp: If, func): - func(jmp.cond) - Visit(jmp.then, func) - if jmp.orelse is not None: - Visit(jmp.orelse, func) - -@Visit.register -def _(loop: For, func): - func(loop.init) - func(loop.cond) - 
func(loop.step) - Visit(loop.body, func) - -@Visit.register -def _(ret: Return, func): - func(ret.val) - -@Visit.register -def _(x: StmtExpr, func): - func(x.x) - -# ------------------------------------------ -# recreates a piece of an AST, allowing an arbitrary transformation of expression nodes - -@singledispatch -def Make(s: Stmt, func): - raise TypeError(f"{type(s)} not supported by Make operation") - -@Make.register -def _(s: Empty, func): - return Empty() - -@Make.register -def _(blk: Block, func): - return Block(*(Make(stmt, func) for stmt in blk.stmts)) - -@Make.register -def _(loop: For, func): - return For(func(loop.init), func(loop.cond), func(loop.step), Make(loop.body, func)) - -@Make.register -def _(jmp: If, func): - if jmp.orelse is not None: - return If(func(jmp.cond), Make(jmp.then, func), Make(jmp.orelse, func)) - else: - return If(func(jmp.cond), Make(jmp.then, func)) - -@Make.register -def _(ret: Return, func): - return Return(func(ret.val)) - -@Make.register -def _(x: StmtExpr, func): - return StmtExpr(func(x.x)) - -# ------------------------------------------ -# GetType function - -@singledispatch -def GetType(x: Expr) -> Type: - if x is None: - return - raise TypeError(f"{type(x)} not supported by GetType operation") - -@GetType.register -def _(x: Empty): - return Void - -@GetType.register -def _(x: Empty): - return Void - -@GetType.register -def _(x: S): - return Ptr(Byte) - -@GetType.register -def _(x: I): - return Int - -@GetType.register -def _(x: F): - return Float64 - -@GetType.register -def _(x: Ident): - return x.var.type - -@GetType.register -def _(x: UnaryOp): - return GetType(x.x) - -@GetType.register -def _(x: Deref): - return GetType(x.x).to - -@GetType.register -def _(x: Ref): - return Ptr(GetType(x.x)) - -@GetType.register -def _(x: Index): - base = GetType(x.x) - if isinstance(base, Ptr): - return base.to - elif isinstance(base, Array): - return base.base - else: - pass - # raise TypeError(f"attempting to index type {base} of node {x.x}") - -# TODO: type checking for both - -@GetType.register -def _(x: BinaryOp): - lhs = GetType(x.l) - rhs = GetType(x.r) - return lhs - -@GetType.register -def _(x: Var): - return x.type - -@GetType.register -def _(x: Assign): - lhs = GetType(x.lhs) - rhs = GetType(x.rhs) - return lhs - -@GetType.register -def _(x: Comma): - return GetType(x.expr[1]) - -@GetType.register -def _(x: Paren): - return GetType(x.x) - -@GetType.register -def _(x: Call): - return x.func.ret - -# ------------------------------------------ -# Transform function -# Recurses down AST, and modifies Expr nodes -# Returns a brand new tree structure - -@singledispatch -def Transform(x: Expr, func: Callable[Expr, Expr]): - raise TypeError(f"{type(x)} not supported by Transform operation") - -@Transform.register -def _(x: Empty, func): - return func(Empty()) - -@Transform.register -def _(x: Literal, func): - return func(type(x)(x)) - -@Transform.register -def _(x: Ident, func): - return func(Ident(x.var)) - -@Transform.register -def _(x: UnaryOp, func): - return func(UnaryOp(Transform(x.x, func))) - -@Transform.register -def _(x: Assign, func): - return func(type(x)(Transform(x.lhs, func), Transform(x.rhs, func))) - -@Transform.register -def _(x: Deref, func): - return func(Transform(Deref(x.x, func))) - -@Transform.register -def _(x: Index, func): - return func(Index(Transform(x.x, func), Transform(x.i, func))) - -@Transform.register -def _(x: Comma, func): - return func(Comma(Transform(x.expr[0], func), Transform(x.expr[1], func))) - 
-@Transform.register -def _(x: BinaryOp, func): - return func(type(x)(Transform(x.l, func), Transform(x.r, func))) - -# ------------------------------------------ -# Filter function -# Recurses down AST, and stores Expr nodes that satisfy condition in results - -@singledispatch -def Filter(x: Expr, cond, results: List[Expr]): - raise TypeError(f"{type(x)} not supported by Filter operation") - -@Filter.register -def _(x: Empty, cond, results: List[Expr]): - if cond(x): - results.append(x) - -@Filter.register(Ident) -@Filter.register(Literal) -def _(x, cond, results: List[Expr]): - if cond(x): - results.append(x) - -@Filter.register -def _(x: UnaryOp, cond, results: List[Expr]): - if cond(x): - results.append(x) - Filter(op.x, cond, results) - -@Filter.register -def _(s: Assign, cond, results: List[Expr]): - if cond(s): - results.append(s) - Filter(s.lhs, cond, results) - Filter(s.rhs, cond, results) - -@Filter.register -def _(v: Deref, cond, results: List[Expr]): - if cond(v): - results.append(v) - Filter(v.x, cond, results) - -@Filter.register -def _(i: Index, cond, results: List[Expr]): - if cond(i): - results.append(i) - Filter(i.x, cond, results) - Filter(i.i, cond, results) - -@Filter.register -def _(comma: Comma, cond, results: List[Expr]): - if cond(comma): - results.append(comma) - Filter(comma.expr[0], cond, results) - Filter(comma.expr[1], cond, results) - -@Filter.register -def _(op: BinaryOp, cond, results: List[Expr]): - if cond(op): - results.append(op) - Filter(op.l, cond, results) - Filter(op.r, cond, results) - -# ------------------------------------------ -# Eval function -# Recurses down AST, and evaluates Expr nodes -# Throws an error if tree can't be evaluated at compile time - -@singledispatch -def Eval(x: Expr) -> object: - raise TypeError(f"{type(x)} not supported by Eval operation") - -@Eval.register -def _(x: Empty): - pass - -@Eval.register(Ident) -@Eval.register(S) -def _(x): - return sympy.symbols(f"{x}") - -@Eval.register(float) -@Eval.register(int) -@Eval.register(I) -@Eval.register(F) -def _(x): - return x - -@Eval.register -def _(x: Inc): - return Eval(Add(Eval(op.x, cond, results), I(1))) - -@Eval.register -def _(x: Dec): - return Eval(Sub(Eval(op.x, cond, results), I(1))) - -# TODO: This won't work in general (if we have things like sizeof(x) + 1 - sizeof(x)) -@Eval.register -def _(op: Add): - l = Eval(op.l) - r = Eval(op.r) - return sympy.simplify(l + r) - -@Eval.register -def _(op: Sub): - l = Eval(op.l) - r = Eval(op.r) - return sympy.simplify(l - r) - -@Eval.register -def _(op: Mul): - l = Eval(op.l) - r = Eval(op.r) - return sympy.simplify(l * r) - -@Eval.register -def _(op: Div): - l = Eval(op.l) - r = Eval(op.r) - return sympy.simplify(l / r) - -@Eval.register -def _(s: Assign): - return Eval(s.rhs) - -@Eval.register -def _(comma: Comma): - return Eval(comma.expr[1]) - -# ------------------------------------------ -# Leaf traversal - -def VarsUsed(stmt: Stmt) -> List[Var]: - vars = [] - Visit(stmt, lambda node: Filter(node, lambda x: isinstance(x, Ident), vars)) - vars = set([v.var for v in vars]) - - return vars - -def AddrAccessed(stmt: Stmt): - scalars = [] # variables that are accessed as scalars (single sites) - vectors = {} # variables that are accessed as vectors (indexed/dereferenced) - nodes = [] - Visit(stmt, lambda node: Filter(node, lambda x: isinstance(x, Ident), scalars)) - Visit(stmt, lambda node: Filter(node, lambda x: isinstance(x, Index), nodes)) - - for node in nodes: - vars = [] - Filter(node.x, lambda x: isinstance(x, Ident), 
vars) - if numel(vars) != 1: - raise ValueError("multiple variables used in index expression not supported") - vectors[vars[0]] = node.i - if vars[0] in scalars: - scalars.remove(vars[0]) - - return set(scalars), vectors - -# ------------------------------------------ -# Large scale functions - -def Unroll(name: str, loop: For, times: int, ret: Ident = None, accumulator = None) -> (Func, List[Stmt]): - # TODO: More sophisticated computation for length of loop - if not isinstance(loop.cond, LE) and not isinstance(loop.cond, LT): - raise TypeError(f"{type(loop.cond)} not supported in loop unrolling") - - # pull off needed features of the loop - i = loop.init.lhs.var - vars = VarsUsed(loop.body) - - def asvector(v): - if v != i: - return Var(Array(v.type, times), v.name, v.storage) - else: - return Var.copy(v) - - n = loop.cond.r.var - param = [Param(Ptr(i.type), f"{i.name}p")] + [Param.copy(v) for v in vars if (type(v) == Param and v != n)] - stack = {v.name: asvector(v) for v in vars if type(v) == Var} - - # TODO: More sophisticated type checking - if (type(n) != Param): - raise TypeError(f"{type(n)} not implemented yet") - - if ret is None: - kernel = Func(f"{name}_kernel{times}", Void, param, list(stack.values())) - else: - kernel = Func(f"{name}_kernel{times}", ret.var.type, param, list(stack.values())) - - len = kernel.declare(n) - - body = loop.body - itor, itorp = kernel.variables("i", "ip") - - def mkarray(x: Expr, times: int): - if isinstance(x, Ident): - if type(x.var) == Var: - x.var = stack[x.name] - return x - - def step(x: Expr, times: int): - if x == itor: - return Add(x, I(times)) - if isinstance(x, Ident): - if type(x.var) == Var: - return Index(x, I(times)) - if not isinstance(x, Expr): - raise ValueError(f"panic, hit type {type(x)}") - - return x - - expandedloop = Make(loop.body, lambda node: Transform(node, lambda x: mkarray(x, times))) - kernel.execute( - Set(len, EvenTo(Deref(itorp), times)), - For(Set(itor, I(0)), LT(itor, len), Repeat(loop.step, times), - body = Block(* - (Make(expandedloop, lambda node: - Transform(node, lambda x: step(x, i))) for i in range(times) - ) - ) - ), - Set(Deref(itorp), itor) - ) - - if ret is not None: - k = kernel.variables(ret.name) - if k.var.type != ret.var.type: - if accumulator is None: - raise ValueError("If loop returns a value, an accumulator must be given") - else: - accumulator = For(Set(itor, I(1)), LT(itor, I(times)), Inc(itor), - body = accumulator(I(0), itor, k) - ) - kernel.execute(accumulator) - - kernel.execute(Return(Index(ret, I(0)))) - - loop.init = None - if ret is None: - return kernel, [Set(itor, len), Call(kernel, [Ref(itor)] + param[1:]), loop] - else: - return kernel, [Set(itor, len), Set(ret, Call(kernel, [Ref(itor)] + param[1:])), loop] - -# Replaces all vectorizable loops inside Func with vectorized variants -# Returns the new vectorized function (tagged by the SIMD chosen) - -class SymTab(object): - def __init__(self): - self.stack = {} - self.addrs = {} - - def have(self, name): - return name in self.stack or name in self.addrs - -def Vectorize(func: Func, isa: SIMD) -> Func: - if isa != SIMD.AVX2: - raise ValueError(f"ISA '{isa}' not currently implemented") - - vfunc = Func(f"{func.name}_{isa.value}", func.ret, func.params) - for stmt in func.stmts: - if IsLoop(stmt): - loop = For(stmt.init, stmt.cond, stmt.step) - iterator = set([stmt.init.lhs]) - - body = stmt.body - if type(body) != Block: - # TODO: Think through this carefully... 
- # This is coded for the accumulation step at the end of a kernel - loop.cond.r = I(Eval(Div(loop.cond.r, 4))) - loop.body = body - else: - instr = body.stmts - # TODO: Remove hardcoded 4 -> should be function of types! - # As of now, we have harcoded AVX2 <-> float64 relationship - if numel(instr) % 4 != 0: - raise ValueError("loop can not be vectorized, instructions can not be globbed equally") - - # TODO: Allow for non-sequential accesses? - for i in range(0, numel(instr), 4): - scalars = [] - vectors = [] - for j in range(4): - s, v = AddrAccessed(instr[i+j]) - s -= iterator # TODO: This is hacky - scalars.append(frozenset(s)) - vectors.append(v) - - # Test if code in uniform to allow for vectorization - if numel(set(scalars)) != 1: - raise ValueError("non uniform scalar accesses in consecutive line. can not vectorize") - - for j in range(1, 4): - for v, idx in vectors[j].items(): - if (delta := (Eval(Sub(idx, vectors[j-1][v])))) != 1: - print(f"{delta}") - raise ValueError("non uniform vector accesses in consecutive line. can not vectorize") - - # If we made it to here, we have passed all checks. vectorize! - vecs = vectors[0] - if i == 0: - syms = SymTab() - for s in scalars[0]: - intermediate = vfunc.declare(Var(Float64x2, f"{s.name}128")) - syms.stack[s.name] = vfunc.declare(Var(Float64x4, f"{s.name}256")) - vfunc.execute(Set(intermediate, Ref(s))) - vfunc.execute(Set(syms.stack[s.name], intermediate)) - - for v in vecs.keys(): - # All params are treated AS addresses to load into vectorized registers - if type(v.var) == Param: - syms.addrs[v.name] = Var(Ptr(Float64x4), f"{v.name}256") - # All stack declared parameters MUST be moved to vectorized registers - else: - assert IsArrayType(v.var.type), f"must be an array type, instead got {v.var.type}" - nreg = v.var.type.len // 4 - if nreg > 1: - syms.stack[v.name] = vfunc.declare(Var(Array(Float64x4, nreg), f"{v.name}256")) - else: - syms.stack[v.name] = vfunc.declare(Var(Float64x4, f"{v.name}256")) - - - # IMPORTANT: We do a post-order traversal. - # We transforms leaves (identifiers) first and then move back up the root - def translate(x): - if isinstance(x, Ident): - if x.name in syms.stack: - return syms.stack[x.name] - elif x.name in syms.addrs: - x.var = syms.addrs[x.name] - if isinstance(x, Index): - if type(x.x) == Ident: - if x.x.name in syms.addrs: - return Add(x.x, x.i) - elif f"{x.x.name[:-3]}" in syms.stack: #NOTE: This is hacky. Think of something better - return Index(x.x, I(Eval(Div(x.i, 4)))) - if isinstance(x, BinaryOp): - l, r = GetType(x.l), GetType(x.r) - if IsVectorType(l) or IsVectorType(r): - if type(l) == Ptr: - x.l = Deref(x.l) - if type(r) == Ptr: - x.r = Deref(x.r) - - return x - - loop.execute(Make(instr[i], lambda node: Transform(node, translate))) - - vfunc.execute(loop) - else: - vfunc.execute(stmt) - - return vfunc - - -def Strided(func: Func) -> Func: - pass - -# ------------------------------------------------------------------------ -# Point of testing - -def copy(): - F = Func("blas·copy", Void, - Params( - (Int, "len"), (Ptr(Float64), "x"), (Ptr(Float64), "y") - ), - Vars( - (Int, "i"), (Float64, "tmp") - ) - ) - # could also declare like - # F.declare( ... 
) - - len, x, y, i, tmp = F.variables("len", "x", "y", "i", "tmp") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - Swap(Index(x, i), Index(y, i), tmp) - ) - - kernel, calls = Unroll("copy", loop, 8) - kernel.emit() - - avx256kernel = Vectorize(kernel, SIMD.AVX2) - avx256kernel.emit() - - F.execute(*calls) - F.emit() - -def axpby(): - F = Func("blas·axpby", Void, - Params( - (Int, "len"), (Float64, "a"), (Ptr(Float64), "x"), (Float64, "b"), (Ptr(Float64), "y") - ), - Vars( - (Int, "i"), #(Float64, "tmp") - ) - ) - # could also declare like - # F.declare( ... ) - - # TODO: Increase ergonomics here... - len, x, y, i, a, b = F.variables("len", "x", "y", "i", "a", "b") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - Set(Index(y, i), - Mul(b, - Add(Index(y, i), - Mul(a, Index(x, i)), - ) - ) - ) - ) - - kernel, calls = Unroll("axpby", loop, 8) - kernel.emit() - - avx256kernel = Vectorize(kernel, SIMD.AVX2) - avx256kernel.emit() - - F.execute(*calls) - F.emit() - -def argmax(): - F = Func("blas·argmax", Int, - Params( - (Int, "len"), (Ptr(Float64), "x"), - ), - Vars( - (Int, "i"), (Int, "ix"), (Float64, "max") - ) - ) - len, x, i, ix, max = F.variables("len", "x", "i", "ix", "max") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - If(GT(Index(x, i), max), - Block( - Set(ix, i), - Set(max, Index(x, ix)), - ) - ) - ) - kernel, calls = Unroll("argmax", loop, 8, ix, - lambda ires, icur, node: - If(GT(Index(x, Index(node, icur)), Index(x, Index(node, ires))), - Block(Set(Index(node, ires), Index(node, icur))) - ) - ) - kernel.emit() - - # avx256kernel = Vectorize(kernel, SIMD.AVX2) - # avx256kernel.emit() - - F.execute(*calls, Return(ix)) - - F.execute(loop) - F.emit() - -def dot(): - F = Func("blas·dot", Float64, - Params( - (Int, "len"), (Ptr(Float64), "x"), (Ptr(Float64), "y") - ), - Vars( - (Int, "i"), (Float64, "sum"), - ) - ) - len, x, i, y, sum = F.variables("len", "x", "i", "y", "sum") - - loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) - loop.execute( - AddSet(sum, Mul(Index(x, i), Index(y, i))) - ) - - kernel, calls = Unroll("dot", loop, 16, sum, - lambda ires, icur, node: - StmtExpr(AddSet(Index(node, ires), Index(node, icur))) - ) - kernel.emit() - - avx256kernel = Vectorize(kernel, SIMD.AVX2) - avx256kernel.emit() - - F.execute(*calls, Return(sum)) - - F.emit() - -def gemv(): - F = Func("blas·gemv", Void, - Params( - (Int, "nrow"), (Int, "ncol"), (Float64, "a"), (Ptr(Float64), "m"), (Int, "incm"), - (Ptr(Float64), "x"), (Float64, "b"), (Ptr(Float64), "y") - ), - Vars( - (Int, "r"), (Int, "c"), (Ptr(Float64), "row"), (Float64, "res") - ) - ) - r, c, row, res = F.variables("r", "c", "row", "res") - nrow, ncol, a, m, incm, x, b, y = F.variables("nrow", "ncol", "a", "m", "incm", "x", "b", "y") - loop = For(Set(r, I(0)), LT(r, nrow), Inc(r), - Block( - Set(row, Add(m, incm)), - Set(res, I(0)), - For(Set(c, I(0)), LT(c, ncol), Inc(c), - AddSet(res, Mul(Index(row, c), Index(x, c))) - ), - Set(Index(y, r), Add(Mul(a, res), Mul(b, Index(y, r)))) - ) - ) - - F.execute(loop) - F.emit() - - -if __name__ == "__main__": - emitheader() - - gemv() - # dot() - # argmax() - # copy() - # axpby() - - print(buffer) diff --git a/rules.mk b/rules.mk index 7fc6a1f..ce74da2 100644 --- a/rules.mk +++ b/rules.mk @@ -8,7 +8,7 @@ all: targets debug: CFLAGS += -DDEBUG debug: targets -release: CFLAGS += -DNDEBUG -O3 +release: CFLAGS += -O3 -flto #-DNDEBUG release: targets # Targets & array of sources & intermediates @@ -25,19 +25,24 @@ include $(DIR)/rules.mk 
# Generic rules %.a: %.o - $(ARCHIVE) + @echo AR $@ $^ + @$(ARCHIVE) $(OBJ_DIR)/%.o: $(SRC_DIR)/%.c - $(COMPILE) + @echo CC $^ + @$(COMPILE) $(OBJ_DIR)/%.o: $(SRC_DIR)/%.s - $(ASSEMBLE) + @echo AS $^ + @$(ASSEMBLE) %: %.o - $(LINK) + @echo CC $^ + @$(LINK) $(OBJ_DIR)/%: $(SRC_DIR)/%.c - $(COMPLNK) + @echo CC $^ + @$(COMPLNK) .PHONY: targets targets: $(LIBS) $(BINS) diff --git a/sys/libbio/rules.mk b/sys/libbio/rules.mk index 0e3d482..ff71746 100644 --- a/sys/libbio/rules.mk +++ b/sys/libbio/rules.mk @@ -31,10 +31,12 @@ BINS := $(BINS) $(BINS_$(d)) # $(LIBS_$(d)) = TCLIBS := $(LIBS_$(d)): $(OBJS_$(d)) $(OBJS_$(d)/io) - $(ARCHIVE) + @echo LIB $@ + @$(ARCHIVE) $(BINS_$(d)): $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a - $(LINK) + @echo BIN $@ + @$(LINK) # ---- Pop off stack ---- -include $(DEPS_$(d)) diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c index f6eb830..f5392c8 100644 --- a/sys/libmath/blas.c +++ b/sys/libmath/blas.c @@ -1,132 +1,53 @@ #include #include +#include +#include #include -#include - -int -argmax2(int len, double *x) -{ - int i, ix[8]; - double *end; - double max[8]; - - max[0] = x[0]; max[1] = x[1]; max[2] = x[2]; max[3] = x[3]; - max[4] = x[4]; max[5] = x[5]; max[6] = x[6]; max[7] = x[7]; - for (i = 0; i < len; i+=8) { - if (x[i+0] > max[0]) { - max[0] = *x; - ix[0] = i; - } - if (x[i+1] > max[1]) { - max[1] = *x; - ix[1] = i; - } - if (x[i+2] > max[2]) { - max[2] = *x; - ix[2] = i; - } - if (x[i+3] > max[3]) { - max[3] = *x; - ix[3] = i; - } - if (x[i+4] > max[4]) { - max[4] = *x; - ix[4] = i; - } - if (x[i+5] > max[5]) { - max[5] = *x; - ix[5] = i; - } - if (x[i+6] > max[6]) { - max[6] = *x; - ix[6] = i; - } - if (x[i+7] > max[7]) { - max[7] = *x; - ix[7] = i; - } - } - for (i = 1; i < 8; i++) { - if (max[i] > max[0]) { - max[0] = max[i]; - ix[0] = ix[i] + i; - } - } - - return ix[0]; -} - -int -argmax(int len, double *x) -{ - int i, ix; - double *end; - double max; - - max = *x; - for (i = 0; i < len; i++) { - if (x[i] > max) { - max = *x; - ix = i; - } - } - - return ix; -} +#include -#define LEN 1000000 +#define LEN 2000000 #define NIT 2000 +#define INC 2 error main() { int i, nit; - double *x, *y, res[3]; + double *x, *y, res[2]; clock_t t; - double tprof[3] = { 0 }; + double tprof[2] = { 0 }; rng·init(0); x = malloc(sizeof(*x)*LEN); - // y = malloc(sizeof(*x)*LEN); - -#define DO_0 t = clock(); \ - res[0] += argmax(LEN, x); \ - t = clock() - t; \ - tprof[0] += 1000.*t/CLOCKS_PER_SEC; \ + y = malloc(sizeof(*x)*LEN); -#define DO_1 t = clock(); \ - res[1] += argmax2(LEN, x); \ - t = clock() - t; \ - tprof[1] += 1000.*t/CLOCKS_PER_SEC; \ +#define DO_0 t = clock(); \ + res[0] += blas·sumd(LEN/INC, x, INC); \ + t = clock() - t; \ + tprof[0] += 1000.*t/CLOCKS_PER_SEC; \ -#define DO_2 t = clock(); \ - res[2] += cblas_idamax(LEN, x, 1); \ - t = clock() - t; \ - tprof[2] += 1000.*t/CLOCKS_PER_SEC; +#define DO_1 t = clock(); \ + res[1] += cblas_dsum(LEN/INC, x, INC); \ + t = clock() - t; \ + tprof[1] += 1000.*t/CLOCKS_PER_SEC; for (nit = 0; nit < NIT; nit++) { for (i = 0; i < LEN; i++) { x[i] = rng·random(); - // y[i] = rng·random(); + y[i] = rng·random(); } - switch (nit % 6) { - case 0: DO_0; DO_1; DO_2; break; - case 1: DO_0; DO_2; DO_1; break; - case 2: DO_1; DO_0; DO_2; break; - case 3: DO_1; DO_2; DO_0; break; - case 4: DO_2; DO_0; DO_1; break; - case 5: DO_2; DO_1; DO_0; break; + switch (nit % 2) { + case 0: DO_0; DO_1; break; + case 1: DO_1; DO_0; break; } } - printf("mean time/iteration (naive): %fms\n", tprof[0]/NIT); - printf("--> result (naive): %f\n", res[0]); - 
printf("mean time/iteration (unrolled): %fms\n", tprof[1]/NIT); - printf("--> result (unrolled): %f\n", res[1]); - printf("mean time/iteration (openblas): %fms\n", tprof[2]/NIT); - printf("--> result (openblas): %f\n", res[2]); + printf("mean time/iteration (mine): %fms\n", tprof[0]/NIT); + printf("--> result (mine): %f\n", res[0]); + printf("mean time/iteration (openblas): %fms\n", tprof[1]/NIT); + printf("--> result (openblas): %f\n", res[1]); return 0; } diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c new file mode 100644 index 0000000..8cc70eb --- /dev/null +++ b/sys/libmath/blas1.c @@ -0,0 +1,2062 @@ +#include +#include +#include + +/*********************************************************/ +/* THIS CODE IS GENERATED BY GEN1.PY! DON'T EDIT BY HAND */ +/*********************************************************/ + + +static +int +argminf_kernel16(int *ip, float *x) +{ + int ix[16]; + float min[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + ix[i] = 0; + } + for (i = 0; i < 16; i++) { + min[i] = x[0]; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + if (x[i + 0] < min[0]) { + ix[0] = (i + 0); + min[0] = x[ix[0]]; + } + if (x[i + 1] < min[1]) { + ix[1] = (i + 1); + min[1] = x[ix[1]]; + } + if (x[i + 2] < min[2]) { + ix[2] = (i + 2); + min[2] = x[ix[2]]; + } + if (x[i + 3] < min[3]) { + ix[3] = (i + 3); + min[3] = x[ix[3]]; + } + if (x[i + 4] < min[4]) { + ix[4] = (i + 4); + min[4] = x[ix[4]]; + } + if (x[i + 5] < min[5]) { + ix[5] = (i + 5); + min[5] = x[ix[5]]; + } + if (x[i + 6] < min[6]) { + ix[6] = (i + 6); + min[6] = x[ix[6]]; + } + if (x[i + 7] < min[7]) { + ix[7] = (i + 7); + min[7] = x[ix[7]]; + } + if (x[i + 8] < min[8]) { + ix[8] = (i + 8); + min[8] = x[ix[8]]; + } + if (x[i + 9] < min[9]) { + ix[9] = (i + 9); + min[9] = x[ix[9]]; + } + if (x[i + 10] < min[10]) { + ix[10] = (i + 10); + min[10] = x[ix[10]]; + } + if (x[i + 11] < min[11]) { + ix[11] = (i + 11); + min[11] = x[ix[11]]; + } + if (x[i + 12] < min[12]) { + ix[12] = (i + 12); + min[12] = x[ix[12]]; + } + if (x[i + 13] < min[13]) { + ix[13] = (i + 13); + min[13] = x[ix[13]]; + } + if (x[i + 14] < min[14]) { + ix[14] = (i + 14); + min[14] = x[ix[14]]; + } + if (x[i + 15] < min[15]) { + ix[15] = (i + 15); + min[15] = x[ix[15]]; + } + } + *ip = i; + for (i = 1; i < 16; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +static +int +argminf_s_kernel8(int *ip, int incx, float *x) +{ + int ix[8]; + float min[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + ix[i] = 0; + } + for (i = 0; i < 8; i++) { + min[i] = x[0]; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + if (x[incx * (i + 0)] < min[0]) { + ix[0] = (i + 0); + min[0] = x[incx * ix[0]]; + } + if (x[incx * (i + 1)] < min[1]) { + ix[1] = (i + 1); + min[1] = x[incx * ix[1]]; + } + if (x[incx * (i + 2)] < min[2]) { + ix[2] = (i + 2); + min[2] = x[incx * ix[2]]; + } + if (x[incx * (i + 3)] < min[3]) { + ix[3] = (i + 3); + min[3] = x[incx * ix[3]]; + } + if (x[incx * (i + 4)] < min[4]) { + ix[4] = (i + 4); + min[4] = x[incx * ix[4]]; + } + if (x[incx * (i + 5)] < min[5]) { + ix[5] = (i + 5); + min[5] = x[incx * ix[5]]; + } + if (x[incx * (i + 6)] < min[6]) { + ix[6] = (i + 6); + min[6] = x[incx * ix[6]]; + } + if (x[incx * (i + 7)] < min[7]) { + ix[7] = (i + 7); + min[7] = x[incx * ix[7]]; + } + } + *ip = i; + for (i = 1; i < 8; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +int +blas·argminf(int len, float *x, int incx) +{ + int i; + int ix; + float min; + + if 
(incx == 1) { + i = len; + ix = argminf_kernel16(&i, x); + } else { + i = len; + ix = argminf_s_kernel8(&i, incx, x); + } + min = x[ix]; + for (; i < len; i++) { + if (x[i] < min) { + ix = i; + min = x[ix]; + } + } + + return ix; +} + +static +int +argmaxf_kernel16(int *ip, float *x) +{ + float max[16]; + int i; + int ix[16]; + int len; + + for (i = 0; i < 16; i++) { + max[i] = x[0]; + } + for (i = 0; i < 16; i++) { + ix[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + if (x[i + 0] > max[0]) { + ix[0] = (i + 0); + max[0] = x[ix[0]]; + } + if (x[i + 1] > max[1]) { + ix[1] = (i + 1); + max[1] = x[ix[1]]; + } + if (x[i + 2] > max[2]) { + ix[2] = (i + 2); + max[2] = x[ix[2]]; + } + if (x[i + 3] > max[3]) { + ix[3] = (i + 3); + max[3] = x[ix[3]]; + } + if (x[i + 4] > max[4]) { + ix[4] = (i + 4); + max[4] = x[ix[4]]; + } + if (x[i + 5] > max[5]) { + ix[5] = (i + 5); + max[5] = x[ix[5]]; + } + if (x[i + 6] > max[6]) { + ix[6] = (i + 6); + max[6] = x[ix[6]]; + } + if (x[i + 7] > max[7]) { + ix[7] = (i + 7); + max[7] = x[ix[7]]; + } + if (x[i + 8] > max[8]) { + ix[8] = (i + 8); + max[8] = x[ix[8]]; + } + if (x[i + 9] > max[9]) { + ix[9] = (i + 9); + max[9] = x[ix[9]]; + } + if (x[i + 10] > max[10]) { + ix[10] = (i + 10); + max[10] = x[ix[10]]; + } + if (x[i + 11] > max[11]) { + ix[11] = (i + 11); + max[11] = x[ix[11]]; + } + if (x[i + 12] > max[12]) { + ix[12] = (i + 12); + max[12] = x[ix[12]]; + } + if (x[i + 13] > max[13]) { + ix[13] = (i + 13); + max[13] = x[ix[13]]; + } + if (x[i + 14] > max[14]) { + ix[14] = (i + 14); + max[14] = x[ix[14]]; + } + if (x[i + 15] > max[15]) { + ix[15] = (i + 15); + max[15] = x[ix[15]]; + } + } + *ip = i; + for (i = 1; i < 16; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +static +int +argmaxf_s_kernel8(int *ip, int incx, float *x) +{ + float max[8]; + int i; + int ix[8]; + int len; + + for (i = 0; i < 8; i++) { + max[i] = x[0]; + } + for (i = 0; i < 8; i++) { + ix[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + if (x[incx * (i + 0)] > max[0]) { + ix[0] = (i + 0); + max[0] = x[incx * ix[0]]; + } + if (x[incx * (i + 1)] > max[1]) { + ix[1] = (i + 1); + max[1] = x[incx * ix[1]]; + } + if (x[incx * (i + 2)] > max[2]) { + ix[2] = (i + 2); + max[2] = x[incx * ix[2]]; + } + if (x[incx * (i + 3)] > max[3]) { + ix[3] = (i + 3); + max[3] = x[incx * ix[3]]; + } + if (x[incx * (i + 4)] > max[4]) { + ix[4] = (i + 4); + max[4] = x[incx * ix[4]]; + } + if (x[incx * (i + 5)] > max[5]) { + ix[5] = (i + 5); + max[5] = x[incx * ix[5]]; + } + if (x[incx * (i + 6)] > max[6]) { + ix[6] = (i + 6); + max[6] = x[incx * ix[6]]; + } + if (x[incx * (i + 7)] > max[7]) { + ix[7] = (i + 7); + max[7] = x[incx * ix[7]]; + } + } + *ip = i; + for (i = 1; i < 8; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +int +blas·argmaxf(int len, float *x, int incx) +{ + int i; + int ix; + float max; + + if (incx == 1) { + i = len; + ix = argmaxf_kernel16(&i, x); + } else { + i = len; + ix = argmaxf_s_kernel8(&i, incx, x); + } + max = x[ix]; + for (; i < len; i++) { + if (x[i] > max) { + ix = i; + max = x[ix]; + } + } + + return ix; +} + +static +void +copyf_kernel16(int *ip, float *x, float *y) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = x[i + 0]; + y[i + 1] = x[i + 1]; + y[i + 2] = x[i + 2]; + y[i + 3] = x[i + 3]; + y[i + 4] = x[i + 4]; + y[i + 5] = x[i + 5]; + y[i + 6] = x[i + 6]; + y[i + 7] = x[i + 7]; + y[i + 8] = x[i + 8]; + y[i + 9] = x[i + 9]; + 
y[i + 10] = x[i + 10]; + y[i + 11] = x[i + 11]; + y[i + 12] = x[i + 12]; + y[i + 13] = x[i + 13]; + y[i + 14] = x[i + 14]; + y[i + 15] = x[i + 15]; + } + *ip = i; +} + +static +void +copyf_s_kernel8(int *ip, int incx, float *x, int incy, float *y) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = x[incx * (i + 0)]; + y[incy * (i + 1)] = x[incx * (i + 1)]; + y[incy * (i + 2)] = x[incx * (i + 2)]; + y[incy * (i + 3)] = x[incx * (i + 3)]; + y[incy * (i + 4)] = x[incx * (i + 4)]; + y[incy * (i + 5)] = x[incx * (i + 5)]; + y[incy * (i + 6)] = x[incx * (i + 6)]; + y[incy * (i + 7)] = x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·copyf(int len, float *x, int incx, float *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + copyf_kernel16(&i, x, y); + } else { + i = len; + copyf_s_kernel8(&i, incx, x, incy, y); + } + for (; i < len; i++) { + y[i] = x[i]; + } +} + +static +void +axpyf_kernel16(int *ip, float a, float *x, float *y) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = y[i + 0] + a * x[i + 0]; + y[i + 1] = y[i + 1] + a * x[i + 1]; + y[i + 2] = y[i + 2] + a * x[i + 2]; + y[i + 3] = y[i + 3] + a * x[i + 3]; + y[i + 4] = y[i + 4] + a * x[i + 4]; + y[i + 5] = y[i + 5] + a * x[i + 5]; + y[i + 6] = y[i + 6] + a * x[i + 6]; + y[i + 7] = y[i + 7] + a * x[i + 7]; + y[i + 8] = y[i + 8] + a * x[i + 8]; + y[i + 9] = y[i + 9] + a * x[i + 9]; + y[i + 10] = y[i + 10] + a * x[i + 10]; + y[i + 11] = y[i + 11] + a * x[i + 11]; + y[i + 12] = y[i + 12] + a * x[i + 12]; + y[i + 13] = y[i + 13] + a * x[i + 13]; + y[i + 14] = y[i + 14] + a * x[i + 14]; + y[i + 15] = y[i + 15] + a * x[i + 15]; + } + *ip = i; +} + +static +void +axpyf_s_kernel8(int *ip, float a, float *x, int incy, float *y, int incx) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)]; + y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)]; + y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)]; + y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)]; + y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)]; + y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)]; + y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)]; + y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·axpyf(int len, float a, float *x, int incx, float *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + axpyf_kernel16(&i, a, x, y); + } else { + i = len; + axpyf_s_kernel8(&i, a, x, incy, y, incx); + } + for (; i < len; i++) { + y[i] = y[i] + a * x[i]; + } +} + +static +void +axpbyf_kernel16(int *ip, float *x, float a, float *y, float b) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = b * y[i + 0] + a * x[i + 0]; + y[i + 1] = b * y[i + 1] + a * x[i + 1]; + y[i + 2] = b * y[i + 2] + a * x[i + 2]; + y[i + 3] = b * y[i + 3] + a * x[i + 3]; + y[i + 4] = b * y[i + 4] + a * x[i + 4]; + y[i + 5] = b * y[i + 5] + a * x[i + 5]; + y[i + 6] = b * y[i + 6] + a * x[i + 6]; + y[i + 7] = b * y[i + 7] + a * x[i + 7]; + y[i + 8] = b * y[i + 8] + a * x[i + 8]; + y[i + 9] = b * y[i + 9] + a * x[i + 9]; + y[i + 10] = b * y[i + 10] + a * x[i + 10]; + y[i + 11] = b * y[i + 11] + a * x[i + 11]; + y[i + 12] = b * y[i + 12] + a * x[i + 12]; + y[i + 13] = b * y[i + 13] + a * x[i + 13]; + y[i + 14] = b * y[i + 14] + a * x[i + 14]; + y[i + 15] = b * y[i 
+ 15] + a * x[i + 15]; + } + *ip = i; +} + +static +void +axpbyf_s_kernel8(int *ip, float *x, float a, float *y, float b, int incx, int incy) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)]; + y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)]; + y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)]; + y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)]; + y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)]; + y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)]; + y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)]; + y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·axpbyf(int len, float a, float *x, int incx, float b, float *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + axpbyf_kernel16(&i, x, a, y, b); + } else { + i = len; + axpbyf_s_kernel8(&i, x, a, y, b, incx, incy); + } + for (; i < len; i++) { + y[i] = b * y[i] + a * x[i]; + } +} + +static +float +dotf_kernel16(int *ip, float *x, float *y) +{ + int i; + float sum[16]; + int len; + + for (i = 0; i < 16; i++) { + sum[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + sum[0] += x[i + 0] * y[i + 0]; + sum[1] += x[i + 1] * y[i + 1]; + sum[2] += x[i + 2] * y[i + 2]; + sum[3] += x[i + 3] * y[i + 3]; + sum[4] += x[i + 4] * y[i + 4]; + sum[5] += x[i + 5] * y[i + 5]; + sum[6] += x[i + 6] * y[i + 6]; + sum[7] += x[i + 7] * y[i + 7]; + sum[8] += x[i + 8] * y[i + 8]; + sum[9] += x[i + 9] * y[i + 9]; + sum[10] += x[i + 10] * y[i + 10]; + sum[11] += x[i + 11] * y[i + 11]; + sum[12] += x[i + 12] * y[i + 12]; + sum[13] += x[i + 13] * y[i + 13]; + sum[14] += x[i + 14] * y[i + 14]; + sum[15] += x[i + 15] * y[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +static +float +dotf_s_kernel8(int *ip, float *x, int incy, float *y, int incx) +{ + int i; + float sum[8]; + int len; + + for (i = 0; i < 8; i++) { + sum[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)]; + sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)]; + sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)]; + sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)]; + sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)]; + sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)]; + sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)]; + sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +float +blas·dotf(int len, float *x, int incx, float *y, int incy) +{ + int i; + float sum; + + if (incx == 1 && incy == 1) { + i = len; + sum = dotf_kernel16(&i, x, y); + } else { + i = len; + sum = dotf_s_kernel8(&i, x, incy, y, incx); + } + for (; i < len; i++) { + sum += x[i] * y[i]; + } + + return sum; +} + +static +float +sumf_kernel16(int *ip, float *x) +{ + int i; + float sum[16]; + int len; + + for (i = 0; i < 16; i++) { + sum[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + sum[0] += x[i + 0]; + sum[1] += x[i + 1]; + sum[2] += x[i + 2]; + sum[3] += x[i + 3]; + sum[4] += x[i + 4]; + sum[5] += x[i + 5]; + sum[6] += x[i + 6]; + sum[7] += x[i + 7]; + sum[8] += x[i + 8]; + sum[9] += x[i + 9]; + sum[10] += x[i + 10]; + sum[11] += x[i + 11]; + sum[12] += x[i + 12]; + sum[13] += x[i + 13]; + sum[14] += x[i + 14]; + sum[15] += x[i + 15]; + } + *ip = i; + for (i 
= 1; i < 16; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +static +float +sumf_s_kernel8(int *ip, float *x, int incx) +{ + float sum[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + sum[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + sum[0] += x[incx * (i + 0)]; + sum[1] += x[incx * (i + 1)]; + sum[2] += x[incx * (i + 2)]; + sum[3] += x[incx * (i + 3)]; + sum[4] += x[incx * (i + 4)]; + sum[5] += x[incx * (i + 5)]; + sum[6] += x[incx * (i + 6)]; + sum[7] += x[incx * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +float +blas·sumf(int len, float *x, int incx) +{ + int i; + float sum; + + if (incx == 1) { + i = len; + sum = sumf_kernel16(&i, x); + } else { + i = len; + sum = sumf_s_kernel8(&i, x, incx); + } + for (; i < len; i++) { + sum += x[i]; + } + + return sum; +} + +static +float +normf_kernel16(int *ip, float *x) +{ + float nrm[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + nrm[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + nrm[0] += x[i + 0] * x[i + 0]; + nrm[1] += x[i + 1] * x[i + 1]; + nrm[2] += x[i + 2] * x[i + 2]; + nrm[3] += x[i + 3] * x[i + 3]; + nrm[4] += x[i + 4] * x[i + 4]; + nrm[5] += x[i + 5] * x[i + 5]; + nrm[6] += x[i + 6] * x[i + 6]; + nrm[7] += x[i + 7] * x[i + 7]; + nrm[8] += x[i + 8] * x[i + 8]; + nrm[9] += x[i + 9] * x[i + 9]; + nrm[10] += x[i + 10] * x[i + 10]; + nrm[11] += x[i + 11] * x[i + 11]; + nrm[12] += x[i + 12] * x[i + 12]; + nrm[13] += x[i + 13] * x[i + 13]; + nrm[14] += x[i + 14] * x[i + 14]; + nrm[15] += x[i + 15] * x[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + nrm[0] += nrm[i]; + } + + return nrm[0]; +} + +static +float +normf_s_kernel8(int *ip, int incx, float *x) +{ + float nrm[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + nrm[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)]; + nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)]; + nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)]; + nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)]; + nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)]; + nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)]; + nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)]; + nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + nrm[0] += nrm[i]; + } + + return nrm[0]; +} + +float +blas·normf(int len, float *x, int incx) +{ + int i; + float nrm; + + if (incx == 1) { + i = len; + nrm = normf_kernel16(&i, x); + } else { + i = len; + nrm = normf_s_kernel8(&i, incx, x); + } + for (; i < len; i++) { + nrm += x[i] * x[i]; + } + + return math·sqrtf(nrm); +} + +static +void +scalef_kernel16(int *ip, float *x, float a) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + x[i + 0] = a * x[i + 0]; + x[i + 1] = a * x[i + 1]; + x[i + 2] = a * x[i + 2]; + x[i + 3] = a * x[i + 3]; + x[i + 4] = a * x[i + 4]; + x[i + 5] = a * x[i + 5]; + x[i + 6] = a * x[i + 6]; + x[i + 7] = a * x[i + 7]; + x[i + 8] = a * x[i + 8]; + x[i + 9] = a * x[i + 9]; + x[i + 10] = a * x[i + 10]; + x[i + 11] = a * x[i + 11]; + x[i + 12] = a * x[i + 12]; + x[i + 13] = a * x[i + 13]; + x[i + 14] = a * x[i + 14]; + x[i + 15] = a * x[i + 15]; + } + *ip = i; +} + +static +void +scalef_s_kernel8(int *ip, int incx, float *x, float a) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + x[incx * (i + 0)] = a * x[incx * (i + 0)]; + x[incx * (i + 1)] = a * x[incx * (i + 1)]; + x[incx * (i + 2)] = a * x[incx * (i 
+ 2)]; + x[incx * (i + 3)] = a * x[incx * (i + 3)]; + x[incx * (i + 4)] = a * x[incx * (i + 4)]; + x[incx * (i + 5)] = a * x[incx * (i + 5)]; + x[incx * (i + 6)] = a * x[incx * (i + 6)]; + x[incx * (i + 7)] = a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·scalef(int len, float *x, int incx, float a) +{ + int i; + + if (incx == 1) { + i = len; + scalef_kernel16(&i, x, a); + } else { + i = len; + scalef_s_kernel8(&i, incx, x, a); + } + for (; i < len; i++) { + x[i] = a * x[i]; + } +} + +static +void +rotf_kernel16(int *ip, float *y, float cos, float sin, float *x) +{ + int i; + float tmp[16]; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0]; + tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1]; + tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2]; + tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3]; + tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4]; + tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5]; + tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6]; + tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7]; + tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8]; + tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9]; + tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10]; + tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11]; + tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12]; + tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13]; + tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14]; + tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15]; + } + *ip = i; +} + +static +void +rotf_s_kernel8(int *ip, float *y, float sin, float cos, float *x, int incx, int incy) +{ + int i; + float tmp[8]; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0]; + tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1]; + tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2]; + tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3]; + tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4]; + tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], 
y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5]; + tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6]; + tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7]; + } + *ip = i; +} + +void +blas·rotf(int len, float *x, int incx, float *y, int incy, float cos, float sin) +{ + int i; + float tmp; + + if (incx == 1 && incy == 1) { + i = len; + rotf_kernel16(&i, y, cos, sin, x); + } else { + i = len; + rotf_s_kernel8(&i, y, sin, cos, x, incx, incy); + } + for (; i < len; i++) { + tmp = x[i], x[i] = cos * x[i] + sin * y[i], y[i] = cos * y[i] - sin * tmp; + } +} + +static +void +rotgf_kernel16(int *ip, float H[5], float *y, float *x) +{ + float tmp[16]; + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0]; + tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1]; + tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2]; + tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3]; + tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4]; + tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5]; + tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6]; + tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7]; + tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8]; + tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9]; + tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10]; + tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11]; + tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12]; + tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13]; + tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14]; + tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15]; + } + *ip = i; +} + +static +void +rotgf_s_kernel8(int *ip, int incy, int incx, float H[5], float *y, float *x) +{ + float tmp[8]; + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0]; + tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1]; + tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2]; + tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] 
* y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3]; + tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4]; + tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5]; + tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6]; + tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7]; + } + *ip = i; +} + +void +blas·rotgf(int len, float *x, int incx, float *y, int incy, float H[5]) +{ + int i; + float tmp; + + if (incx == 1 && incy == 1) { + i = len; + rotgf_kernel16(&i, H, y, x); + } else { + i = len; + rotgf_s_kernel8(&i, incy, incx, H, y, x); + } + for (; i < len; i++) { + tmp = x[i], x[i] = H[1] * x[i] + H[2] * y[i], y[i] = H[3] * y[i] + H[4] * tmp; + } +} + +static +int +argmind_kernel16(int *ip, double *x) +{ + int ix[16]; + double min[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + ix[i] = 0; + } + for (i = 0; i < 16; i++) { + min[i] = x[0]; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + if (x[i + 0] < min[0]) { + ix[0] = (i + 0); + min[0] = x[ix[0]]; + } + if (x[i + 1] < min[1]) { + ix[1] = (i + 1); + min[1] = x[ix[1]]; + } + if (x[i + 2] < min[2]) { + ix[2] = (i + 2); + min[2] = x[ix[2]]; + } + if (x[i + 3] < min[3]) { + ix[3] = (i + 3); + min[3] = x[ix[3]]; + } + if (x[i + 4] < min[4]) { + ix[4] = (i + 4); + min[4] = x[ix[4]]; + } + if (x[i + 5] < min[5]) { + ix[5] = (i + 5); + min[5] = x[ix[5]]; + } + if (x[i + 6] < min[6]) { + ix[6] = (i + 6); + min[6] = x[ix[6]]; + } + if (x[i + 7] < min[7]) { + ix[7] = (i + 7); + min[7] = x[ix[7]]; + } + if (x[i + 8] < min[8]) { + ix[8] = (i + 8); + min[8] = x[ix[8]]; + } + if (x[i + 9] < min[9]) { + ix[9] = (i + 9); + min[9] = x[ix[9]]; + } + if (x[i + 10] < min[10]) { + ix[10] = (i + 10); + min[10] = x[ix[10]]; + } + if (x[i + 11] < min[11]) { + ix[11] = (i + 11); + min[11] = x[ix[11]]; + } + if (x[i + 12] < min[12]) { + ix[12] = (i + 12); + min[12] = x[ix[12]]; + } + if (x[i + 13] < min[13]) { + ix[13] = (i + 13); + min[13] = x[ix[13]]; + } + if (x[i + 14] < min[14]) { + ix[14] = (i + 14); + min[14] = x[ix[14]]; + } + if (x[i + 15] < min[15]) { + ix[15] = (i + 15); + min[15] = x[ix[15]]; + } + } + *ip = i; + for (i = 1; i < 16; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +static +int +argmind_s_kernel8(int *ip, double *x, int incx) +{ + int ix[8]; + double min[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + ix[i] = 0; + } + for (i = 0; i < 8; i++) { + min[i] = x[0]; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + if (x[incx * (i + 0)] < min[0]) { + ix[0] = (i + 0); + min[0] = x[incx * ix[0]]; + } + if (x[incx * (i + 1)] < min[1]) { + ix[1] = (i + 1); + min[1] = x[incx * ix[1]]; + } + if (x[incx * (i + 2)] < min[2]) { + ix[2] = (i + 2); + min[2] = x[incx * ix[2]]; + } + if (x[incx * (i + 3)] < min[3]) { + ix[3] = (i + 3); + min[3] = x[incx * ix[3]]; + } + if (x[incx * (i + 4)] < min[4]) { + ix[4] = (i + 4); + min[4] = x[incx * ix[4]]; + } + if (x[incx * (i + 5)] < min[5]) { + ix[5] = (i + 5); + min[5] = x[incx * ix[5]]; + } + if (x[incx * (i + 6)] < min[6]) { + ix[6] = (i + 6); + min[6] = x[incx * ix[6]]; + } + if 
(x[incx * (i + 7)] < min[7]) { + ix[7] = (i + 7); + min[7] = x[incx * ix[7]]; + } + } + *ip = i; + for (i = 1; i < 8; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +int +blas·argmind(int len, double *x, int incx) +{ + int i; + int ix; + double min; + + if (incx == 1) { + i = len; + ix = argmind_kernel16(&i, x); + } else { + i = len; + ix = argmind_s_kernel8(&i, x, incx); + } + min = x[ix]; + for (; i < len; i++) { + if (x[i] < min) { + ix = i; + min = x[ix]; + } + } + + return ix; +} + +static +int +argmaxd_kernel16(int *ip, double *x) +{ + int i; + int ix[16]; + double max[16]; + int len; + + for (i = 0; i < 16; i++) { + ix[i] = 0; + } + for (i = 0; i < 16; i++) { + max[i] = x[0]; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + if (x[i + 0] > max[0]) { + ix[0] = (i + 0); + max[0] = x[ix[0]]; + } + if (x[i + 1] > max[1]) { + ix[1] = (i + 1); + max[1] = x[ix[1]]; + } + if (x[i + 2] > max[2]) { + ix[2] = (i + 2); + max[2] = x[ix[2]]; + } + if (x[i + 3] > max[3]) { + ix[3] = (i + 3); + max[3] = x[ix[3]]; + } + if (x[i + 4] > max[4]) { + ix[4] = (i + 4); + max[4] = x[ix[4]]; + } + if (x[i + 5] > max[5]) { + ix[5] = (i + 5); + max[5] = x[ix[5]]; + } + if (x[i + 6] > max[6]) { + ix[6] = (i + 6); + max[6] = x[ix[6]]; + } + if (x[i + 7] > max[7]) { + ix[7] = (i + 7); + max[7] = x[ix[7]]; + } + if (x[i + 8] > max[8]) { + ix[8] = (i + 8); + max[8] = x[ix[8]]; + } + if (x[i + 9] > max[9]) { + ix[9] = (i + 9); + max[9] = x[ix[9]]; + } + if (x[i + 10] > max[10]) { + ix[10] = (i + 10); + max[10] = x[ix[10]]; + } + if (x[i + 11] > max[11]) { + ix[11] = (i + 11); + max[11] = x[ix[11]]; + } + if (x[i + 12] > max[12]) { + ix[12] = (i + 12); + max[12] = x[ix[12]]; + } + if (x[i + 13] > max[13]) { + ix[13] = (i + 13); + max[13] = x[ix[13]]; + } + if (x[i + 14] > max[14]) { + ix[14] = (i + 14); + max[14] = x[ix[14]]; + } + if (x[i + 15] > max[15]) { + ix[15] = (i + 15); + max[15] = x[ix[15]]; + } + } + *ip = i; + for (i = 1; i < 16; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +static +int +argmaxd_s_kernel8(int *ip, int incx, double *x) +{ + int i; + int ix[8]; + double max[8]; + int len; + + for (i = 0; i < 8; i++) { + ix[i] = 0; + } + for (i = 0; i < 8; i++) { + max[i] = x[0]; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + if (x[incx * (i + 0)] > max[0]) { + ix[0] = (i + 0); + max[0] = x[incx * ix[0]]; + } + if (x[incx * (i + 1)] > max[1]) { + ix[1] = (i + 1); + max[1] = x[incx * ix[1]]; + } + if (x[incx * (i + 2)] > max[2]) { + ix[2] = (i + 2); + max[2] = x[incx * ix[2]]; + } + if (x[incx * (i + 3)] > max[3]) { + ix[3] = (i + 3); + max[3] = x[incx * ix[3]]; + } + if (x[incx * (i + 4)] > max[4]) { + ix[4] = (i + 4); + max[4] = x[incx * ix[4]]; + } + if (x[incx * (i + 5)] > max[5]) { + ix[5] = (i + 5); + max[5] = x[incx * ix[5]]; + } + if (x[incx * (i + 6)] > max[6]) { + ix[6] = (i + 6); + max[6] = x[incx * ix[6]]; + } + if (x[incx * (i + 7)] > max[7]) { + ix[7] = (i + 7); + max[7] = x[incx * ix[7]]; + } + } + *ip = i; + for (i = 1; i < 8; i++) { + if (x[ix[i]] > x[ix[0]]) { + ix[0] = ix[i]; + } + } + + return ix[0]; +} + +int +blas·argmaxd(int len, double *x, int incx) +{ + int i; + int ix; + double max; + + if (incx == 1) { + i = len; + ix = argmaxd_kernel16(&i, x); + } else { + i = len; + ix = argmaxd_s_kernel8(&i, incx, x); + } + max = x[ix]; + for (; i < len; i++) { + if (x[i] > max) { + ix = i; + max = x[ix]; + } + } + + return ix; +} + +static +void +copyd_kernel16(int *ip, double *y, double *x) +{ + int 
i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = x[i + 0]; + y[i + 1] = x[i + 1]; + y[i + 2] = x[i + 2]; + y[i + 3] = x[i + 3]; + y[i + 4] = x[i + 4]; + y[i + 5] = x[i + 5]; + y[i + 6] = x[i + 6]; + y[i + 7] = x[i + 7]; + y[i + 8] = x[i + 8]; + y[i + 9] = x[i + 9]; + y[i + 10] = x[i + 10]; + y[i + 11] = x[i + 11]; + y[i + 12] = x[i + 12]; + y[i + 13] = x[i + 13]; + y[i + 14] = x[i + 14]; + y[i + 15] = x[i + 15]; + } + *ip = i; +} + +static +void +copyd_s_kernel8(int *ip, double *y, double *x, int incy, int incx) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = x[incx * (i + 0)]; + y[incy * (i + 1)] = x[incx * (i + 1)]; + y[incy * (i + 2)] = x[incx * (i + 2)]; + y[incy * (i + 3)] = x[incx * (i + 3)]; + y[incy * (i + 4)] = x[incx * (i + 4)]; + y[incy * (i + 5)] = x[incx * (i + 5)]; + y[incy * (i + 6)] = x[incx * (i + 6)]; + y[incy * (i + 7)] = x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·copyd(int len, double *x, int incx, double *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + copyd_kernel16(&i, y, x); + } else { + i = len; + copyd_s_kernel8(&i, y, x, incy, incx); + } + for (; i < len; i++) { + y[i] = x[i]; + } +} + +static +void +axpyd_kernel16(int *ip, double a, double *y, double *x) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = y[i + 0] + a * x[i + 0]; + y[i + 1] = y[i + 1] + a * x[i + 1]; + y[i + 2] = y[i + 2] + a * x[i + 2]; + y[i + 3] = y[i + 3] + a * x[i + 3]; + y[i + 4] = y[i + 4] + a * x[i + 4]; + y[i + 5] = y[i + 5] + a * x[i + 5]; + y[i + 6] = y[i + 6] + a * x[i + 6]; + y[i + 7] = y[i + 7] + a * x[i + 7]; + y[i + 8] = y[i + 8] + a * x[i + 8]; + y[i + 9] = y[i + 9] + a * x[i + 9]; + y[i + 10] = y[i + 10] + a * x[i + 10]; + y[i + 11] = y[i + 11] + a * x[i + 11]; + y[i + 12] = y[i + 12] + a * x[i + 12]; + y[i + 13] = y[i + 13] + a * x[i + 13]; + y[i + 14] = y[i + 14] + a * x[i + 14]; + y[i + 15] = y[i + 15] + a * x[i + 15]; + } + *ip = i; +} + +static +void +axpyd_s_kernel8(int *ip, double a, int incx, double *y, double *x, int incy) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)]; + y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)]; + y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)]; + y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)]; + y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)]; + y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)]; + y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)]; + y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·axpyd(int len, double a, double *x, int incx, double *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + axpyd_kernel16(&i, a, y, x); + } else { + i = len; + axpyd_s_kernel8(&i, a, incx, y, x, incy); + } + for (; i < len; i++) { + y[i] = y[i] + a * x[i]; + } +} + +static +void +axpbyd_kernel16(int *ip, double a, double b, double *x, double *y) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + y[i + 0] = b * y[i + 0] + a * x[i + 0]; + y[i + 1] = b * y[i + 1] + a * x[i + 1]; + y[i + 2] = b * y[i + 2] + a * x[i + 2]; + y[i + 3] = b * y[i + 3] + a * x[i + 3]; + y[i + 4] = b * y[i + 4] + a * x[i + 4]; + y[i + 5] = b * y[i + 5] + a * x[i + 5]; + y[i + 6] = b * y[i + 6] + a * x[i + 6]; + y[i + 7] = b * y[i + 7] + a * x[i + 7]; + y[i + 8] = 
b * y[i + 8] + a * x[i + 8]; + y[i + 9] = b * y[i + 9] + a * x[i + 9]; + y[i + 10] = b * y[i + 10] + a * x[i + 10]; + y[i + 11] = b * y[i + 11] + a * x[i + 11]; + y[i + 12] = b * y[i + 12] + a * x[i + 12]; + y[i + 13] = b * y[i + 13] + a * x[i + 13]; + y[i + 14] = b * y[i + 14] + a * x[i + 14]; + y[i + 15] = b * y[i + 15] + a * x[i + 15]; + } + *ip = i; +} + +static +void +axpbyd_s_kernel8(int *ip, double a, double b, double *x, int incx, int incy, double *y) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)]; + y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)]; + y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)]; + y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)]; + y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)]; + y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)]; + y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)]; + y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·axpbyd(int len, double a, double *x, int incx, double b, double *y, int incy) +{ + int i; + + if (incx == 1 && incy == 1) { + i = len; + axpbyd_kernel16(&i, a, b, x, y); + } else { + i = len; + axpbyd_s_kernel8(&i, a, b, x, incx, incy, y); + } + for (; i < len; i++) { + y[i] = b * y[i] + a * x[i]; + } +} + +static +double +dotd_kernel16(int *ip, double *x, double *y) +{ + double sum[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + sum[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + sum[0] += x[i + 0] * y[i + 0]; + sum[1] += x[i + 1] * y[i + 1]; + sum[2] += x[i + 2] * y[i + 2]; + sum[3] += x[i + 3] * y[i + 3]; + sum[4] += x[i + 4] * y[i + 4]; + sum[5] += x[i + 5] * y[i + 5]; + sum[6] += x[i + 6] * y[i + 6]; + sum[7] += x[i + 7] * y[i + 7]; + sum[8] += x[i + 8] * y[i + 8]; + sum[9] += x[i + 9] * y[i + 9]; + sum[10] += x[i + 10] * y[i + 10]; + sum[11] += x[i + 11] * y[i + 11]; + sum[12] += x[i + 12] * y[i + 12]; + sum[13] += x[i + 13] * y[i + 13]; + sum[14] += x[i + 14] * y[i + 14]; + sum[15] += x[i + 15] * y[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +static +double +dotd_s_kernel8(int *ip, int incx, double *x, int incy, double *y) +{ + double sum[8]; + int i; + int len; + + for (i = 0; i < 8; i++) { + sum[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)]; + sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)]; + sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)]; + sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)]; + sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)]; + sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)]; + sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)]; + sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +double +blas·dotd(int len, double *x, int incx, double *y, int incy) +{ + int i; + double sum; + + if (incx == 1 && incy == 1) { + i = len; + sum = dotd_kernel16(&i, x, y); + } else { + i = len; + sum = dotd_s_kernel8(&i, incx, x, incy, y); + } + for (; i < len; i++) { + sum += x[i] * y[i]; + } + + return sum; +} + +static +double +sumd_kernel16(int *ip, double *x) +{ + int i; + double sum[16]; + int len; + + for (i = 0; i < 16; i++) { + sum[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + sum[0] += x[i + 0]; + sum[1] += x[i + 1]; + 
sum[2] += x[i + 2]; + sum[3] += x[i + 3]; + sum[4] += x[i + 4]; + sum[5] += x[i + 5]; + sum[6] += x[i + 6]; + sum[7] += x[i + 7]; + sum[8] += x[i + 8]; + sum[9] += x[i + 9]; + sum[10] += x[i + 10]; + sum[11] += x[i + 11]; + sum[12] += x[i + 12]; + sum[13] += x[i + 13]; + sum[14] += x[i + 14]; + sum[15] += x[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +static +double +sumd_s_kernel8(int *ip, double *x, int incx) +{ + int i; + double sum[8]; + int len; + + for (i = 0; i < 8; i++) { + sum[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + sum[0] += x[incx * (i + 0)]; + sum[1] += x[incx * (i + 1)]; + sum[2] += x[incx * (i + 2)]; + sum[3] += x[incx * (i + 3)]; + sum[4] += x[incx * (i + 4)]; + sum[5] += x[incx * (i + 5)]; + sum[6] += x[incx * (i + 6)]; + sum[7] += x[incx * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + sum[0] += sum[i]; + } + + return sum[0]; +} + +double +blas·sumd(int len, double *x, int incx) +{ + int i; + double sum; + + if (incx == 1) { + i = len; + sum = sumd_kernel16(&i, x); + } else { + i = len; + sum = sumd_s_kernel8(&i, x, incx); + } + for (; i < len; i++) { + sum += x[i]; + } + + return sum; +} + +static +double +normd_kernel16(int *ip, double *x) +{ + double nrm[16]; + int i; + int len; + + for (i = 0; i < 16; i++) { + nrm[i] = 0; + } + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + nrm[0] += x[i + 0] * x[i + 0]; + nrm[1] += x[i + 1] * x[i + 1]; + nrm[2] += x[i + 2] * x[i + 2]; + nrm[3] += x[i + 3] * x[i + 3]; + nrm[4] += x[i + 4] * x[i + 4]; + nrm[5] += x[i + 5] * x[i + 5]; + nrm[6] += x[i + 6] * x[i + 6]; + nrm[7] += x[i + 7] * x[i + 7]; + nrm[8] += x[i + 8] * x[i + 8]; + nrm[9] += x[i + 9] * x[i + 9]; + nrm[10] += x[i + 10] * x[i + 10]; + nrm[11] += x[i + 11] * x[i + 11]; + nrm[12] += x[i + 12] * x[i + 12]; + nrm[13] += x[i + 13] * x[i + 13]; + nrm[14] += x[i + 14] * x[i + 14]; + nrm[15] += x[i + 15] * x[i + 15]; + } + *ip = i; + for (i = 1; i < 16; i++) { + nrm[0] += nrm[i]; + } + + return nrm[0]; +} + +static +double +normd_s_kernel8(int *ip, int incx, double *x) +{ + int i; + double nrm[8]; + int len; + + for (i = 0; i < 8; i++) { + nrm[i] = 0; + } + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)]; + nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)]; + nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)]; + nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)]; + nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)]; + nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)]; + nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)]; + nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)]; + } + *ip = i; + for (i = 1; i < 8; i++) { + nrm[0] += nrm[i]; + } + + return nrm[0]; +} + +double +blas·normd(int len, double *x, int incx) +{ + int i; + double nrm; + + if (incx == 1) { + i = len; + nrm = normd_kernel16(&i, x); + } else { + i = len; + nrm = normd_s_kernel8(&i, incx, x); + } + for (; i < len; i++) { + nrm += x[i] * x[i]; + } + + return math·sqrt(nrm); +} + +static +void +scaled_kernel16(int *ip, double a, double *x) +{ + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + x[i + 0] = a * x[i + 0]; + x[i + 1] = a * x[i + 1]; + x[i + 2] = a * x[i + 2]; + x[i + 3] = a * x[i + 3]; + x[i + 4] = a * x[i + 4]; + x[i + 5] = a * x[i + 5]; + x[i + 6] = a * x[i + 6]; + x[i + 7] = a * x[i + 7]; + x[i + 8] = a * x[i + 8]; + x[i + 9] = a * x[i + 9]; + x[i + 10] = a * x[i + 10]; + x[i + 11] = a * x[i + 11]; + x[i + 12] = a * x[i + 12]; + x[i + 13] = a * x[i + 
13]; + x[i + 14] = a * x[i + 14]; + x[i + 15] = a * x[i + 15]; + } + *ip = i; +} + +static +void +scaled_s_kernel8(int *ip, double a, int incx, double *x) +{ + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + x[incx * (i + 0)] = a * x[incx * (i + 0)]; + x[incx * (i + 1)] = a * x[incx * (i + 1)]; + x[incx * (i + 2)] = a * x[incx * (i + 2)]; + x[incx * (i + 3)] = a * x[incx * (i + 3)]; + x[incx * (i + 4)] = a * x[incx * (i + 4)]; + x[incx * (i + 5)] = a * x[incx * (i + 5)]; + x[incx * (i + 6)] = a * x[incx * (i + 6)]; + x[incx * (i + 7)] = a * x[incx * (i + 7)]; + } + *ip = i; +} + +void +blas·scaled(int len, double *x, int incx, double a) +{ + int i; + + if (incx == 1) { + i = len; + scaled_kernel16(&i, a, x); + } else { + i = len; + scaled_s_kernel8(&i, a, incx, x); + } + for (; i < len; i++) { + x[i] = a * x[i]; + } +} + +static +void +rotd_kernel16(int *ip, double *x, double *y, double cos, double sin) +{ + int i; + double tmp[16]; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0]; + tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1]; + tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2]; + tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3]; + tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4]; + tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5]; + tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6]; + tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7]; + tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8]; + tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9]; + tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10]; + tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11]; + tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12]; + tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13]; + tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14]; + tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15]; + } + *ip = i; +} + +static +void +rotd_s_kernel8(int *ip, double *y, int incx, double sin, int incy, double *x, double cos) +{ + int i; + double tmp[8]; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0]; + tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1]; + tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2]; + tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * 
x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3]; + tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4]; + tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5]; + tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6]; + tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7]; + } + *ip = i; +} + +void +blas·rotd(int len, double *x, int incx, double *y, int incy, double cos, double sin) +{ + int i; + double tmp; + + if (incx == 1 && incy == 1) { + i = len; + rotd_kernel16(&i, x, y, cos, sin); + } else { + i = len; + rotd_s_kernel8(&i, y, incx, sin, incy, x, cos); + } + for (; i < len; i++) { + tmp = x[i], x[i] = cos * x[i] + sin * y[i], y[i] = cos * y[i] - sin * tmp; + } +} + +static +void +rotgd_kernel16(int *ip, double *y, double H[5], double *x) +{ + double tmp[16]; + int i; + int len; + + len = *ip & ~15; + for (i = 0; i < len; i += 16) { + tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0]; + tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1]; + tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2]; + tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3]; + tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4]; + tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5]; + tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6]; + tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7]; + tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8]; + tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9]; + tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10]; + tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11]; + tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12]; + tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13]; + tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14]; + tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15]; + } + *ip = i; +} + +static +void +rotgd_s_kernel8(int *ip, int incx, double *y, int incy, double H[5], double *x) +{ + double tmp[8]; + int i; + int len; + + len = *ip & ~7; + for (i = 0; i < len; i += 8) { + tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0]; + tmp[1] = x[incx * (i + 1)], 
x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1]; + tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2]; + tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3]; + tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4]; + tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5]; + tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6]; + tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7]; + } + *ip = i; +} + +void +blas·rotgd(int len, double *x, int incx, double *y, int incy, double H[5]) +{ + int i; + double tmp; + + if (incx == 1 && incy == 1) { + i = len; + rotgd_kernel16(&i, y, H, x); + } else { + i = len; + rotgd_s_kernel8(&i, incx, y, incy, H, x); + } + for (; i < len; i++) { + tmp = x[i], x[i] = H[1] * x[i] + H[2] * y[i], y[i] = H[3] * y[i] + H[4] * tmp; + } +} + + diff --git a/sys/libmath/gen1.py b/sys/libmath/gen1.py new file mode 100755 index 0000000..b0f9ecc --- /dev/null +++ b/sys/libmath/gen1.py @@ -0,0 +1,357 @@ +from C import * + +NUNROLL = 16 +def pkg(name): + return f"blas·{name}" + +def typeify(string, kind): + if (kind == Float32): + return f"{string}f" + if (kind == Float64): + return f"{string}d" + +def fini(func, loop, strided, calls, ret=[]): + func.execute(*calls[:2]) + func = Strided(func, loop, NUNROLL//2, strided, *ret) + func.execute(*calls[2:]) + func.emit() + +# ------------------------------------------------------------------------ +# Blas level 1 functions + +def copy(kind): + name = typeify("copy", kind) + F = Func(pkg(name), Void, + Params( + (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y") + ), + Vars( + (Int, "i") + ) + ) + + len, x, y, i = F.variables("len", "x", "y", "i") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + Set(Index(y, i), Index(x, i)) + ) + + kernel, calls = Unroll(name, loop, NUNROLL) + kernel.emit() + + # F.execute(*calls) + # F.emit() + fini(F, loop, F.params[1:3], calls) + +def swap(kind): + name = typeify("swap", kind) + F = Func(pkg(name), Void, + Params( + (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y") + ), + Vars( + (Int, "i"), (kind, "tmp") + ) + ) + + len, x, y, i, tmp = F.variables("len", "x", "y", "i", "tmp") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + Swap(Index(x, i), Index(y, i), tmp) + ) + + kernel, calls = Unroll(name, loop, NUNROLL) + kernel.emit() + + # F.execute(*calls) + # F.emit() + fini(F, loop, F.params[1:3], calls) + +def axpby(kind): + name = typeify("axpby", kind) + F = Func(pkg(name), Void, + Params( + (Int, "len"), (kind, "a"), (Ptr(kind), "x"), (kind, "b"), (Ptr(kind), "y") + ), + Vars( + (Int, "i"), + ) + ) + + len, x, y, i, a, b = F.variables("len", "x", "y", "i", "a", "b") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + Set(Index(y, i), + Mul(b, + Add(Index(y, i), + Mul(a, Index(x, i)), + ) + ) + ) + ) + kernel, calls 
= Unroll(name, loop, NUNROLL) + kernel.emit() + + # F.execute(*calls) + # F.emit() + fini(F, loop, [F.params[2], F.params[4]], calls) + +def axpy(kind): + name = typeify("axpy", kind) + F = Func(pkg(name), Void, + Params( + (Int, "len"), (kind, "a"), (Ptr(kind), "x"), (Ptr(kind), "y") + ), + Vars( + (Int, "i"), + ) + ) + + len, x, y, i, a = F.variables("len", "x", "y", "i", "a") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + Set(Index(y, i), + Add(Index(y, i), + Mul(a, Index(x, i)), + ) + ) + ) + kernel, calls = Unroll(name, loop, NUNROLL) + kernel.emit() + + fini(F, loop, F.params[2:4], calls) + # F.execute(*calls) + # F.emit() + +def argminmax(kind): + for operation in [("min", LT), ("max", GT)]: + name = typeify(f"arg{operation[0]}", kind) + F = Func(pkg(name), Int, + Params( + (Int, "len"), (Ptr(kind), "x"), + ), + Vars( + (Int, "i"), (Int, "ix"), (kind, operation[0]) + ) + ) + len, x, i, ix, op = F.variables("len", "x", "i", "ix", operation[0]) + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + If(operation[1](Index(x, i), op), + Block( + Set(ix, i), + Set(op, Index(x, ix)), + ) + ) + ) + + ret = (ix, lambda ires, icur, node: + If(GT(Index(x, Index(node, icur)), Index(x, Index(node, ires))), + Block(Set(Index(node, ires), Index(node, icur))) + ) + ) + + kernel, calls = Unroll(name, loop, NUNROLL, *ret) + kernel.emit() + + # F.execute(*calls, Return(ix)) + # F.emit() + fini(F, loop, [F.params[1]], calls[:2] + [Set(op, Index(x, ix))] + calls[2:] + [Return(ix)], ret) + +def dot(kind): + nm = typeify("dot", kind) + F = Func(pkg(nm), kind, + Params( + (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y") + ), + Vars( + (Int, "i"), (kind, "sum"), + ) + ) + len, x, i, y, sum = F.variables("len", "x", "i", "y", "sum") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + AddSet(sum, Mul(Index(x, i), Index(y, i))) + ) + ret = (sum, lambda ires, icur, node: StmtExpr(AddSet(Index(node, ires), Index(node, icur)))) + + kernel, calls = Unroll(nm, loop, NUNROLL, *ret) + kernel.emit() + + # F.execute(*calls, Return(sum)) + fini(F, loop, F.params[1:3], calls + [Return(sum)], ret) + +def norm(kind): + nm = typeify("norm", kind) + F = Func(pkg(nm), kind, + Params( + (Int, "len"), (Ptr(kind), "x") + ), + Vars( + (Int, "i"), (kind, "nrm"), + ) + ) + len, x, i, nrm = F.variables("len", "x", "i", "nrm") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + AddSet(nrm, Mul(Index(x, i), Index(x, i))) + ) + + ret = (nrm, lambda ires, icur, node: + StmtExpr(AddSet(Index(node, ires), Index(node, icur))) + ) + + kernel, calls = Unroll(nm, loop, NUNROLL, *ret) + kernel.emit() + + if kind == Float64: + sqrt = Func("math·sqrt", kind) + elif kind == Float32: + sqrt = Func("math·sqrtf", kind) + else: + raise ValueError(f"no sqrt for type {kind}") + + # F.execute(*calls, Return(Call(sqrt, [nrm]))) + # F.emit() + fini(F, loop, [F.params[1]], calls + [Return(Call(sqrt, [nrm]))], ret) + +def sum(kind): + name = typeify("sum", kind) + F = Func(pkg(name), kind, + Params( + (Int, "len"), (Ptr(kind), "x"), + ), + Vars( + (Int, "i"), (kind, "sum") + ) + ) + + len, x, i, sum = F.variables("len", "x", "i", "sum") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + AddSet(sum, Index(x, i)) + ) + + ret = (sum, lambda ires, icur, node: + StmtExpr(AddSet(Index(node, ires), Index(node, icur))) + ) + + kernel, calls = Unroll(name, loop, NUNROLL, *ret) + kernel.emit() + + fini(F, loop, [F.params[1]], calls + [Return(sum)], ret) + # F.execute(*calls, 
Return(sum)) + # F.emit() + +def scale(kind): + name = typeify("scale", kind) + F = Func(pkg(name), Void, + Params( + (Int, "len"), (Ptr(kind), "x"), (kind, "a") + ), + Vars( + (Int, "i"), + ) + ) + + len, a, x, i = F.variables("len", "a", "x", "i") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute(Set(Index(x, i), Mul(a, Index(x, i)))) + + kernel, calls = Unroll(name, loop, NUNROLL) + kernel.emit() + + fini(F, loop, [F.params[1]], calls) + # F.execute(*calls) + # F.emit() + +def rot(kind): + name = typeify("rot", kind) + F = Func(pkg(name), Void, + Params( + (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y"), (kind, "cos"), (kind, "sin") + ), + Vars( + (Int, "i"), (kind, "tmp") + ) + ) + + len, x, y, i, tmp, cos, sin = F.variables("len", "x", "y", "i", "tmp", "cos", "sin") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + Comma(Set(tmp, Index(x, i)), + Comma( + Set(Index(x, i), Add(Mul(cos, Index(x, i)), Mul(sin, Index(y, i)))), + Set(Index(y, i), Sub(Mul(cos, Index(y, i)), Mul(sin, tmp))) + ) + ) + ) + + kernel, calls = Unroll(name, loop, NUNROLL) + kernel.emit() + + fini(F, loop, [F.params[1], F.params[2]], calls) + # F.execute(*calls) + # F.emit() + +def rotm(kind): + name = typeify("rotg", kind) + F = Func(pkg(name), Void, + Params( + (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y"), (Array(kind, 5), "H") + ), + Vars( + (Int, "i"), (kind, "tmp") + ) + ) + + len, x, y, i, tmp, H = F.variables("len", "x", "y", "i", "tmp", "H") + + loop = For(Set(i, I(0)), LT(i, len), [Inc(i)]) + loop.execute( + Comma(Set(tmp, Index(x, i)), + Comma( + Set(Index(x, i), Add(Mul(Index(H, I(1)), Index(x, i)), Mul(Index(H, I(2)), Index(y, i)))), + Set(Index(y, i), Add(Mul(Index(H, I(3)), Index(y, i)), Mul(Index(H, I(4)), tmp))) + ) + ) + ) + + kernel, calls = Unroll(name, loop, NUNROLL) + kernel.emit() + fini(F, loop, F.params[1:3], calls) + +if __name__ == "__main__": + emit("#include \n") + emit("#include \n") + emit("#include \n") + emitln() + emit("/*********************************************************/\n") + emit("/* THIS CODE IS GENERATED BY GEN1.PY! DON'T EDIT BY HAND */\n") + emit("/*********************************************************/\n") + emitln(2) + + # TODO: Implement rotg/rotmg + for kind in [Float32, Float64]: + argminmax(kind) + copy(kind) + axpy(kind) + axpby(kind) + dot(kind) + sum(kind) + norm(kind) + scale(kind) + rot(kind) + rotm(kind) + + flush() diff --git a/sys/libmath/linalg.c b/sys/libmath/linalg.c index 09e100c..8551ff1 100644 --- a/sys/libmath/linalg.c +++ b/sys/libmath/linalg.c @@ -1,6 +1,7 @@ #include #include #include +#include // ----------------------------------------------------------------------- // Vector @@ -10,8 +11,8 @@ linalg·normalize(math·Vector vec) { double norm; - norm = blas·norm(vec.len, vec.data, 1); - blas·scale(vec.len, 1/norm, vec.data, 1); + norm = blas·normd(vec.len, vec.data, 1); + blas·scaled(vec.len, 1/norm, vec.data, 1); } // TODO: Write blas wrappers that eat vectors for convenience @@ -50,12 +51,12 @@ linalg·lq(math·Matrix m, math·Vector w) len = m.dim[0] - i; // TODO: Don't want to compute norm twice!! 
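	// The calls below appear to build the Householder reflector for this row:
	// w[0] is seeded with sgn(row[0])*||row||, the row is accumulated into w via
	// axpy, and w is then rescaled by 1/||w|| -- i.e. (assuming w enters this
	// iteration cleared) the usual reflector w = row + sgn(row[0])*||row||*e1,
	// normalized to unit length, used to introduce zeros in the LQ sweep.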
- w.data[0] = math·sgn(row[0]) * blas·norm(len, row, 1); - blas·axpy(len, 1.0, row, 1, w.data, 1); - mag = blas·norm(len, w.data, 1); - blas·scale(len, 1/mag, w.data, 1); + w.data[0] = math·sgn(row[0]) * blas·normd(len, row, 1); + blas·axpyd(len, 1.0, row, 1, w.data, 1); + mag = blas·normd(len, w.data, 1); + blas·scaled(len, 1/mag, w.data, 1); - blas·copy(len - m.dim[0], w.data, 1, m.data + i, 1); + blas·copyd(len - m.dim[0], w.data, 1, m.data + i, 1); } return err·nil; diff --git a/sys/libmath/proto.c b/sys/libmath/proto.c deleted file mode 100644 index 63f8856..0000000 --- a/sys/libmath/proto.c +++ /dev/null @@ -1,1761 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#define EVEN_BY(x, n) (x) & ~((n)-1) - -// ----------------------------------------------------------------------- -// misc functions - -static -inline -double -hsum_avx2(__m256d x) -{ - __m128d lo128, hi128, hi64; - - lo128 = _mm256_castpd256_pd128(x); - hi128 = _mm256_extractf128_pd(x, 1); - lo128 = _mm_add_pd(lo128, hi128); - - hi64 = _mm_unpackhi_pd(lo128, lo128); - - return _mm_cvtsd_f64(_mm_add_sd(lo128, hi64)); -} - - -// ----------------------------------------------------------------------- -// level one - -/* - * rotate vector - * x = cos*x + sin*y - * y = cos*x - sin*y - */ - -static -void -rot_kernel8_avx2(int n, double *x, double *y, double cos, double sin) -{ - register int i; - __m256d x256, y256; - __m128d cos128, sin128; - __m256d cos256, sin256; - - cos128 = _mm_load_sd(&cos); - cos256 = _mm256_broadcastsd_pd(cos128); - - sin128 = _mm_load_sd(&sin); - sin256 = _mm256_broadcastsd_pd(sin128); - - for (i = 0; i < n; i+=8) { - x256 = _mm256_loadu_pd(x+i+0); - y256 = _mm256_loadu_pd(y+i+0); - _mm256_storeu_pd(x+i+0, cos256 * x256 + sin256 * y256); - _mm256_storeu_pd(y+i+0, cos256 * y256 - sin256 * x256); - - x256 = _mm256_loadu_pd(x+i+4); - y256 = _mm256_loadu_pd(y+i+4); - _mm256_storeu_pd(x+i+4, cos256 * x256 + sin256 * y256); - _mm256_storeu_pd(y+i+4, cos256 * y256 - sin256 * x256); - } -} - -static -void -rot_kernel8(int n, double *x, double *y, double cos, double sin) -{ - register int i; - register double tmp; - - for (i = 0; i < n; i+=8) { - tmp = x[i+0], x[i+0] = cos*x[i+0] + sin*y[i+0], y[i+0] = cos*y[i+0] - sin*tmp; - tmp = x[i+1], x[i+1] = cos*x[i+1] + sin*y[i+1], y[i+1] = cos*y[i+1] - sin*tmp; - tmp = x[i+2], x[i+2] = cos*x[i+2] + sin*y[i+2], y[i+2] = cos*y[i+2] - sin*tmp; - tmp = x[i+3], x[i+3] = cos*x[i+3] + sin*y[i+3], y[i+3] = cos*y[i+3] - sin*tmp; - tmp = x[i+4], x[i+4] = cos*x[i+4] + sin*y[i+4], y[i+4] = cos*y[i+4] - sin*tmp; - tmp = x[i+5], x[i+5] = cos*x[i+5] + sin*y[i+5], y[i+5] = cos*y[i+5] - sin*tmp; - tmp = x[i+6], x[i+6] = cos*x[i+6] + sin*y[i+6], y[i+6] = cos*y[i+6] - sin*tmp; - tmp = x[i+7], x[i+7] = cos*x[i+7] + sin*y[i+7], y[i+7] = cos*y[i+7] - sin*tmp; - } -} - -void -blas·rot(int len, double *x, int incx, double *y, int incy, double cos, double sin) -{ - register int i, n; - register double tmp; - - if (incx == 1 && incy == 1) { - n = EVEN_BY(len, 8); - rot_kernel8_avx2(n, x, y, cos, sin); - x += n; - y += n; - } else { - n = EVEN_BY(len, 4); - for (i = 0; i < n; i += 4, x += 4*incx, y += 4*incy) { - tmp = x[0*incx], x[0*incx] = cos*x[0*incx] + sin*y[0*incy], y[0*incy] = cos*y[0*incy] - sin*tmp; - tmp = x[1*incx], x[1*incx] = cos*x[1*incx] + sin*y[1*incy], y[1*incy] = cos*y[1*incy] - sin*tmp; - tmp = x[2*incx], x[2*incx] = cos*x[2*incx] + sin*y[2*incy], y[2*incy] = cos*y[2*incy] - sin*tmp; - tmp = x[3*incx], x[3*incx] = cos*x[3*incx] + sin*y[3*incy], 
y[3*incy] = cos*y[3*incy] - sin*tmp; - } - } - - for (; n < len; n++, x += incx, y += incy) { - tmp = x[0], x[0] = cos*x[0] + sin*y[0], y[0] = cos*y[0] - sin*tmp; - } - -} - -/* - * compute givens rotate vector - * -- -- - * |+cos -sin| | a | = | r | - * |-sin +cos| | b | = | 0 | - * -- -- - */ - -void -blas·rotg(double *a, double *b, double *cos, double *sin) -{ - double abs_a, abs_b, r, rho, scale, z; - - abs_a = math·abs(*a); - abs_b = math·abs(*b); - rho = abs_a > abs_b ? *a : *b; - scale = abs_a + abs_b; - - if (scale == 0) { - *cos = 1, *sin = 0; - r = 0.; - z = 0.; - } else { - r = math·sgn(rho) * scale * math·sqrt(math·pow(abs_a/scale, 2) + math·pow(abs_b/scale, 2)); - *cos = *a / r; - *sin = *b / r; - if (abs_a > abs_b) - z = *sin; - else if (abs_b >= abs_a && *cos != 0) - z = 1/(*cos); - else - z = 1.; - } - *a = r; - *b = z; -} - -/* - * modified Givens rotation of points in plane - * operates on len points - * - * params = [flag, h11, h12, h21, h22] - * NOTE: This is row major as opposed to other implementations - * - * Flags correspond to: - * @flag = -1: - * H -> [ [h11, h12], [h21, h22] ] - * @flag = 0.0: - * H -> [ [1, h12], [h21, 1] ] - * @flag = +1: - * H -> [ [h11, 1], [-1, h22] ] - * @flag = -2: - * H -> [ [1, 0], [0, 1] ] - * @flag = * - * return error - * - * Replaces: - * x -> H11 * x + H12 * y - * y -> H21 * x + H22 * y - */ - -static -void -rotm_kernel8_avx2(int n, double *x, double *y, double H[4]) -{ - register int i; - __m256d x256, y256; - __m256d H256[4]; - - for (i = 0; i < 4; i++) { - H256[i] = _mm256_broadcastsd_pd(_mm_load_sd(H+i)); - } - - for (i = 0; i < n; i+=8) { - x256 = _mm256_loadu_pd(x+i+0); - y256 = _mm256_loadu_pd(y+i+0); - _mm256_storeu_pd(x+i+0, H256[0] * x256 + H256[1] * y256); - _mm256_storeu_pd(y+i+0, H256[2] * x256 + H256[3] * y256); - - x256 = _mm256_loadu_pd(x+i+4); - y256 = _mm256_loadu_pd(y+i+4); - _mm256_storeu_pd(x+i+4, H256[0] * x256 + H256[1] * y256); - _mm256_storeu_pd(y+i+4, H256[2] * x256 + H256[3] * y256); - } -} - -static -void -rotm_kernel8(int n, double *x, double *y, double H[4]) -{ - register int i; - register double tmp; - - for (i = 0; i < n; i+=8) { - tmp = x[i+0], x[i+0] = H[0]*x[i+0] + H[1]*y[i+0], y[i+0] = H[2]*tmp + H[3]*y[i+0]; - tmp = x[i+1], x[i+1] = H[0]*x[i+1] + H[1]*y[i+1], y[i+1] = H[2]*tmp + H[3]*y[i+1]; - tmp = x[i+2], x[i+2] = H[0]*x[i+2] + H[1]*y[i+2], y[i+2] = H[2]*tmp + H[3]*y[i+2]; - tmp = x[i+3], x[i+3] = H[0]*x[i+3] + H[1]*y[i+3], y[i+3] = H[2]*tmp + H[3]*y[i+3]; - tmp = x[i+4], x[i+4] = H[0]*x[i+4] + H[1]*y[i+4], y[i+4] = H[2]*tmp + H[3]*y[i+4]; - tmp = x[i+5], x[i+5] = H[0]*x[i+5] + H[1]*y[i+5], y[i+5] = H[2]*tmp + H[3]*y[i+5]; - tmp = x[i+6], x[i+6] = H[0]*x[i+6] + H[1]*y[i+6], y[i+6] = H[2]*tmp + H[3]*y[i+6]; - tmp = x[i+7], x[i+7] = H[0]*x[i+7] + H[1]*y[i+7], y[i+7] = H[2]*tmp + H[3]*y[i+7]; - } -} - -error -blas·rotm(int len, double *x, int incx, double *y, int incy, double p[5]) -{ - int i, n, flag; - double tmp, H[4]; - - flag = math·round(p[0]); - switch (flag) { - case -1: H[0] = p[1], H[1] = p[2], H[2] = p[3], H[3] = p[4]; break; - case 0: H[0] = +1, H[1] = p[2], H[2] = p[3], H[3] = +1; break; - case +1: H[0] = p[1], H[1] = +1, H[2] = -1, H[3] = p[4]; break; - case -2: H[0] = +1, H[1] = 0, H[2] = 0, H[3] = +1; break; - default: - errorf("rotm: flag '%d' unrecognized", flag); - return 1; - } - - if (incx == 1 && incy == 1) { - n = EVEN_BY(len, 8); - rotm_kernel8_avx2(n, x, y, H); - x += n; - y += n; - } else { - n = EVEN_BY(len, 4); - for (i = 0; i < n; i += 4, x += 4*incx, y += 4*incy) { - 
tmp = x[0*incx], x[0*incx] = H[0]*x[0*incx] + H[1]*y[0*incy], y[0*incy] = H[2]*tmp + H[3]*y[0*incy]; - tmp = x[1*incx], x[1*incx] = H[0]*x[1*incx] + H[1]*y[1*incy], y[1*incy] = H[2]*tmp + H[3]*y[1*incy]; - tmp = x[2*incx], x[2*incx] = H[0]*x[2*incx] + H[1]*y[2*incy], y[2*incy] = H[2]*tmp + H[3]*y[2*incy]; - tmp = x[3*incx], x[3*incx] = H[0]*x[3*incx] + H[1]*y[3*incy], y[3*incy] = H[2]*tmp + H[3]*y[3*incy]; - } - } - - for (; n < len; n++, x += incx, y += incy) { - tmp = x[0], x[0] = H[0]*x[0] + H[1]*y[0], y[0] = H[2]*tmp + H[3]*y[0]; - } - - return 0; -} - - -/* - * scale vector - * x = ax - */ - -static -void -scale_kernel8_avx2(int n, double *x, double a) -{ - __m128d a128; - __m256d a256; - register int i; - - a128 = _mm_load_sd(&a); - a256 = _mm256_broadcastsd_pd(a128); - for (i = 0; i < n; i += 8) { - _mm256_storeu_pd(x+i+0, a256 * _mm256_loadu_pd(x+i+0)); - _mm256_storeu_pd(x+i+4, a256 * _mm256_loadu_pd(x+i+4)); - } -} - -static -void -scale_kernel8(int n, double *x, double a) -{ - register int i; - for (i = 0; i < n; i += 8) { - x[i+0] *= a; - x[i+1] *= a; - x[i+2] *= a; - x[i+3] *= a; - x[i+4] *= a; - x[i+5] *= a; - x[i+6] *= a; - x[i+7] *= a; - } -} - -void -blas·scale(int len, double a, double *x, int inc) -{ - int n, ix; - - if (inc == 1) { - n = EVEN_BY(len, 8); - scale_kernel8_avx2(n, x, a); - ix = n; - } else { - n = EVEN_BY(len, 4); - for (ix = 0; ix < n*inc; ix += 4*inc) { - x[ix+0*inc] *= a; - x[ix+1*inc] *= a; - x[ix+2*inc] *= a; - x[ix+3*inc] *= a; - } - } - for (; n < len; n++, ix += inc) { - x[ix] *= a; - } -} - -/* - * copy - * y = x - */ - -void -blas·copy(int len, double *x, int incx, double *y, int incy) -{ - int n, i, ix, iy; - if (incx == 1 && incy == 1) { - memcpy(y, x, sizeof(*x) * len); - return; - } - - n = EVEN_BY(len, 4); - for (i = 0, incx = 0, incy = 0; i < n; i+=4, ix+=4*incx, iy+=4*incy) { - y[iy+0*incy] = x[ix+0*incx]; - y[iy+1*incy] = x[ix+1*incx]; - y[iy+2*incy] = x[ix+2*incx]; - y[iy+3*incy] = x[ix+3*incx]; - } - - for (; n < len; n++, ix+=incx, iy+=incy) { - y[iy] = x[ix]; - } -} - -/* - * swap - * y <=> x - */ - -static -void -swap_kernel8_avx2(int n, double *x, double *y) -{ - register int i; - __m256d tmp[2]; - for (i = 0; i < n; i += 8) { - tmp[0] = _mm256_loadu_pd(x+i+0); - tmp[1] = _mm256_loadu_pd(y+i+0); - _mm256_storeu_pd(x+i+0, tmp[1]); - _mm256_storeu_pd(y+i+0, tmp[0]); - - tmp[0] = _mm256_loadu_pd(x+i+4); - tmp[1] = _mm256_loadu_pd(y+i+4); - _mm256_storeu_pd(x+i+4, tmp[1]); - _mm256_storeu_pd(y+i+4, tmp[0]); - } -} - -static -void -swap_kernel8(int n, double *x, double *y) -{ - register int i; - register double tmp; - for (i = 0; i < n; i += 8) { - tmp = x[i+0], x[i+0] = y[i+0], y[i+0] = tmp; - tmp = x[i+1], x[i+1] = y[i+1], y[i+1] = tmp; - tmp = x[i+2], x[i+2] = y[i+2], y[i+2] = tmp; - tmp = x[i+3], x[i+3] = y[i+3], y[i+3] = tmp; - tmp = x[i+4], x[i+4] = y[i+4], y[i+4] = tmp; - tmp = x[i+5], x[i+5] = y[i+5], y[i+5] = tmp; - tmp = x[i+6], x[i+6] = y[i+6], y[i+6] = tmp; - tmp = x[i+7], x[i+7] = y[i+7], y[i+7] = tmp; - } -} - -void -blas·swap(int len, double *x, int incx, double *y, int incy) -{ - int n, i, ix, iy; - double tmp; - - if (incx == 1 && incy == 1) { - n = EVEN_BY(len, 8); - swap_kernel8(n, x, y); - ix = n; - iy = n; - } else { - n = EVEN_BY(len, 4); - for (i = 0, ix = 0, iy = 0; i < n; i += 4, ix += 4*incx, iy += 4*incy) { - tmp = x[ix + 0*incx], x[ix + 0*incx] = y[iy + 0*incy], y[iy + 0*incy] = tmp; - tmp = x[ix + 1*incx], x[ix + 1*incx] = y[iy + 1*incy], y[iy + 1*incy] = tmp; - tmp = x[ix + 2*incx], x[ix + 2*incx] = y[iy + 
2*incy], y[iy + 2*incy] = tmp; - tmp = x[ix + 3*incx], x[ix + 3*incx] = y[iy + 3*incy], y[iy + 3*incy] = tmp; - } - } - - for (; n < len; n++, ix += incx, iy += incy) { - tmp = x[ix], x[ix] = y[iy], y[iy] = tmp; - } -} - -/* - * daxpy - * y = ax + y - */ - - -static -void -axpy_kernel8_avx2(int n, double a, double *x, double *y) -{ - __m128d a128; - __m256d a256; - register int i; - - a128 = _mm_load_sd(&a); - a256 = _mm256_broadcastsd_pd(a128); - for (i = 0; i < n; i += 8) { - _mm256_storeu_pd(y+i+0, _mm256_loadu_pd(y+i+0) + a256 * _mm256_loadu_pd(x+i+0)); - _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(y+i+4) + a256 * _mm256_loadu_pd(x+i+4)); - } -} - -static -void -axpy_kernel8(int n, double a, double *x, double *y) -{ - register int i; - for (i = 0; i < n; i += 8) { - y[i+0] += a*x[i+0]; - y[i+1] += a*x[i+1]; - y[i+2] += a*x[i+2]; - y[i+3] += a*x[i+3]; - y[i+4] += a*x[i+4]; - y[i+5] += a*x[i+5]; - y[i+6] += a*x[i+6]; - y[i+7] += a*x[i+7]; - } -} - -void -blas·axpy(int len, double a, double *x, int incx, double *y, int incy) -{ - int n, i; - - if (incx == 1 && incy == 1) { - n = EVEN_BY(len, 8); - axpy_kernel8_avx2(n, a, x, y); - x += n; - y += n; - } else { - n = EVEN_BY(len, 4); - for (i = 0; i < n; i += 4, x += 4*incx, y += 4*incy) { - y[0*incy] += a*x[0*incx]; - y[1*incy] += a*x[1*incx]; - y[2*incy] += a*x[2*incx]; - y[3*incy] += a*x[3*incx]; - } - } - - for (; n < len; n++, x+=incx, y+=incy) { - *y += a*(*x); - } -} - -/* - * dot product - * x·y - */ - -static -double -dot_kernel8_fma3(int len, double *x, double *y) -{ - register int i; - __m256d sum[4]; - __m128d res; - - for (i = 0; i < arrlen(sum); i++) { - sum[i] = _mm256_setzero_pd(); - } - - for (i = 0; i < len; i += 16) { - sum[0] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+0), _mm256_loadu_pd(y+i+0), sum[0]); - sum[1] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+4), _mm256_loadu_pd(y+i+4), sum[1]); - sum[2] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+8), _mm256_loadu_pd(y+i+8), sum[2]); - sum[3] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+12), _mm256_loadu_pd(y+i+12), sum[3]); - } - - sum[0] += sum[1] + sum[2] + sum[3]; - - return hsum_avx2(sum[0]); -} - -static -double -dot_kernel8_avx2(int len, double *x, double *y) -{ - register int i; - __m256d sum[4]; - __m128d res; - - for (i = 0; i < arrlen(sum); i++) { - sum[i] = _mm256_setzero_pd(); - } - - for (i = 0; i < len; i += 16) { - sum[0] += _mm256_loadu_pd(x+i+0) * _mm256_loadu_pd(y+i+0); - sum[1] += _mm256_loadu_pd(x+i+4) * _mm256_loadu_pd(y+i+4); - sum[2] += _mm256_loadu_pd(x+i+8) * _mm256_loadu_pd(y+i+8); - sum[3] += _mm256_loadu_pd(x+i+12) * _mm256_loadu_pd(y+i+12); - } - - sum[0] += sum[1] + sum[2] + sum[3]; - - return hsum_avx2(sum[0]); -} - -static -double -dot_kernel8(int len, double *x, double *y) -{ - double res; - register int i; - - for (i = 0; i < len; i += 8) { - res += x[i] * y[i] + - x[i+1] * y[i+1] + - x[i+2] * y[i+2] + - x[i+3] * y[i+3] + - x[i+4] * y[i+4] + - x[i+5] * y[i+5] + - x[i+6] * y[i+6] + - x[i+7] * y[i+7]; - } - - return res; -} - -double -blas·dot(int len, double *x, int incx, double *y, int incy) -{ - int i, n; - double mul[4], sum[2]; - if (len == 0) return 0; - - sum[0] = 0, sum[1] = 0; - if (incx == 1 && incy == 1) { - n = EVEN_BY(len, 16); - sum[0] = dot_kernel8_fma3(n, x, y); - - x += n; - y += n; - } else { - n = EVEN_BY(len, 4); - for (i = 0; i < n; i += 4, x += 4*incx, y += 4*incy) { - mul[0] = x[0*incx] * y[0*incy]; - mul[1] = x[1*incx] * y[1*incy]; - mul[2] = x[2*incx] * y[2*incy]; - mul[3] = x[3*incx] * y[3*incy]; - - sum[0] += mul[0] + mul[2]; - sum[1] += 
mul[1] + mul[3]; - } - } - - for (; n < len; n++, x += incx, y += incy) { - sum[0] += x[0] * y[0]; - } - - sum[0] += sum[1]; - return sum[0]; -} - -/* - * euclidean norm - * ||x|| - */ -double -blas·norm(int len, double *x, int incx) -{ - double res; - - res = blas·dot(len, x, incx, x, incx); - res = math·sqrt(res); - - return res; -} - -static -double -sum_kernel8_avx2(int len, double *x) -{ - register int i; - __m256d sum[2]; - __m128d res; - - for (i = 0; i < arrlen(sum); i++) { - sum[i] = _mm256_setzero_pd(); - } - - for (i = 0; i < len; i += 8) { - sum[0] += _mm256_loadu_pd(x+i+0); - sum[1] += _mm256_loadu_pd(x+i+4); - } - - sum[0] += sum[1]; - - return hsum_avx2(sum[0]); -} - -static -double -sum_kernel8(int len, double *x, double *y) -{ - double res; - register int i; - - for (i = 0; i < len; i += 8) { - res += x[i] + - x[i+1] + - x[i+2] + - x[i+3] + - x[i+4] + - x[i+5] + - x[i+6] + - x[i+7]; - } - - return res; -} - - -/* - * L1 norm - * sum(x_i) - */ -double -blas·sum(int len, double *x, int inc) -{ - int i, n; - double res; - - if (len == 0) return 0; - - if (inc == 1) { - n = EVEN_BY(len, 8); - res = sum_kernel8_avx2(n, x); - } else { - n = EVEN_BY(len, 4); - for (i = 0; i < n; i++, x += 4*inc) { - res += x[0*inc]; - res += x[1*inc]; - res += x[2*inc]; - res += x[3*inc]; - } - } - - for (i = n; i < len; i++, x += inc) { - res += x[0]; - } - - return res; -} - -/* - * argmax - * i = argmax(x) - */ - -int -argmax_kernel8_avx2(int n, double *x) -{ - register int i, msk, maxidx[4]; - __m256d val, cmp, max; - - maxidx[0] = 0, maxidx[1] = 0, maxidx[2] = 0, maxidx[3] = 0; - max = _mm256_loadu_pd(x); - - for (i = 0; i < n; i += 4) { - val = _mm256_loadu_pd(x+i); - cmp = _mm256_cmp_pd(val, max, _CMP_GT_OQ); - msk = _mm256_movemask_pd(cmp); - switch (msk) { - case 1: - max = _mm256_blend_pd(max, val, 1); - maxidx[0] = i; - break; - - case 2: - max = _mm256_blend_pd(max, val, 2); - maxidx[1] = i; - break; - - case 3: - max = _mm256_blend_pd(max, val, 3); - maxidx[0] = i; - maxidx[1] = i; - break; - - case 4: - max = _mm256_blend_pd(max, val, 4); - maxidx[2] = i; - break; - - case 5: - max = _mm256_blend_pd(max, val, 5); - maxidx[2] = i; - maxidx[0] = i; - break; - - case 6: - max = _mm256_blend_pd(max, val, 6); - maxidx[2] = i; - maxidx[1] = i; - break; - - case 7: - max = _mm256_blend_pd(max, val, 7); - maxidx[2] = i; - maxidx[1] = i; - maxidx[0] = i; - break; - - case 8: - max = _mm256_blend_pd(max, val, 8); - maxidx[3] = i; - break; - - case 9: - max = _mm256_blend_pd(max, val, 9); - maxidx[3] = i; - maxidx[0] = i; - break; - - case 10: - max = _mm256_blend_pd(max, val, 10); - maxidx[3] = i; - maxidx[1] = i; - break; - - case 11: - max = _mm256_blend_pd(max, val, 11); - maxidx[3] = i; - maxidx[1] = i; - maxidx[0] = i; - break; - - case 12: - max = _mm256_blend_pd(max, val, 12); - maxidx[3] = i; - maxidx[2] = i; - break; - - case 13: - max = _mm256_blend_pd(max, val, 13); - maxidx[3] = i; - maxidx[2] = i; - maxidx[0] = i; - break; - - case 14: - max = _mm256_blend_pd(max, val, 14); - maxidx[3] = i; - maxidx[2] = i; - maxidx[1] = i; - break; - - case 15: - max = _mm256_blend_pd(max, val, 15); - maxidx[3] = i; - maxidx[2] = i; - maxidx[1] = i; - maxidx[0] = i; - break; - - case 0: - default: ; - } - } - maxidx[0] = (x[maxidx[0]+0] > x[maxidx[1]+1]) ? maxidx[0]+0 : maxidx[1]+1; - maxidx[1] = (x[maxidx[2]+2] > x[maxidx[3]+3]) ? maxidx[2]+2 : maxidx[3]+3; - maxidx[0] = (x[maxidx[0]] > x[maxidx[1]]) ? 
maxidx[0] : maxidx[1]; - - return maxidx[0]; -} - -int -argmax_kernel8(int n, double *x) -{ -#define SET(d) idx[d] = d, max[d] = x[d] -#define PUT(d) if (x[i+d] > max[d]) idx[d] = i+d, max[d] = x[i+d] - int i, idx[8]; - double max[8]; - - SET(0); SET(1); SET(2); SET(3); - SET(4); SET(5); SET(6); SET(7); - - for (i = 0; i < n; i += 8) { - PUT(0); PUT(1); PUT(2); PUT(3); - PUT(4); PUT(5); PUT(6); PUT(7); - } - - n = 0; - for (i = 1; i < 8; i++ ) { - if (max[i] > max[n]) { - n = i; - } - } - return idx[n]; -#undef PUT -#undef SET -} - -int -blas·argmax(int len, double *x, int inc) -{ - int i, ix, n; - double max; - - if (len == 0) { - return -1; - } - - if (inc == 1) { - n = EVEN_BY(len, 8); - ix = argmax_kernel8_avx2(n, x); - max = x[ix]; - x += n; - } else { - n = EVEN_BY(len, 4); - ix = 0; - max = x[ix]; - for (i = 0; i < n; i += 4, x += 4*inc) { - if (x[0*inc] > max) { - ix = i; - max = x[0*inc]; - } - if (x[1*inc] > max) { - ix = i+1; - max = x[1*inc]; - } - if (x[2*inc] > max) { - ix = i+2; - max = x[2*inc]; - } - if (x[3*inc] > max) { - ix = i+3; - max = x[3*inc]; - } - } - } - - for (; n < len; n++, x += inc) { - if (*x > max) { - ix = n; - max = *x; - } - } - - return ix; -} - -int -argmin_kernel8_avx2(int n, double *x) -{ - register int i, msk, minidx[4]; - __m256d val, cmp, min; - - minidx[0] = 0, minidx[1] = 0, minidx[2] = 0, minidx[3] = 0; - min = _mm256_loadu_pd(x); - - for (i = 0; i < n; i += 4) { - val = _mm256_loadu_pd(x+i); - cmp = _mm256_cmp_pd(val, min, _CMP_LT_OS); - msk = _mm256_movemask_pd(cmp); - switch (msk) { - case 1: - min = _mm256_blend_pd(min, val, 1); - minidx[0] = i; - break; - - case 2: - min = _mm256_blend_pd(min, val, 2); - minidx[1] = i; - break; - - case 3: - min = _mm256_blend_pd(min, val, 3); - minidx[0] = i; - minidx[1] = i; - break; - - case 4: - min = _mm256_blend_pd(min, val, 4); - minidx[2] = i; - break; - - case 5: - min = _mm256_blend_pd(min, val, 5); - minidx[2] = i; - minidx[0] = i; - break; - - case 6: - min = _mm256_blend_pd(min, val, 6); - minidx[2] = i; - minidx[1] = i; - break; - - case 7: - min = _mm256_blend_pd(min, val, 7); - minidx[2] = i; - minidx[1] = i; - minidx[0] = i; - break; - - case 8: - min = _mm256_blend_pd(min, val, 8); - minidx[3] = i; - break; - - case 9: - min = _mm256_blend_pd(min, val, 9); - minidx[3] = i; - minidx[0] = i; - break; - - case 10: - min = _mm256_blend_pd(min, val, 10); - minidx[3] = i; - minidx[1] = i; - break; - - case 11: - min = _mm256_blend_pd(min, val, 11); - minidx[3] = i; - minidx[1] = i; - minidx[0] = i; - break; - - case 12: - min = _mm256_blend_pd(min, val, 12); - minidx[3] = i; - minidx[2] = i; - break; - - case 13: - min = _mm256_blend_pd(min, val, 13); - minidx[3] = i; - minidx[2] = i; - minidx[0] = i; - break; - - case 14: - min = _mm256_blend_pd(min, val, 14); - minidx[3] = i; - minidx[2] = i; - minidx[1] = i; - break; - - case 15: - min = _mm256_blend_pd(min, val, 15); - minidx[3] = i; - minidx[2] = i; - minidx[1] = i; - minidx[0] = i; - break; - - case 0: - default: ; - } - } - minidx[0] = (x[minidx[0]+0] < x[minidx[1]+1]) ? minidx[0]+0 : minidx[1]+1; - minidx[1] = (x[minidx[2]+2] < x[minidx[3]+3]) ? minidx[2]+2 : minidx[3]+3; - minidx[0] = (x[minidx[0]] < x[minidx[1]]) ? 
minidx[0] : minidx[1]; - - return minidx[0]; -} - - -int -argmin_kernel8(int n, double *x) -{ -#define SET(d) idx[d] = d, min[d] = x[d] -#define PUT(d) if (x[i+d] < min[d]) idx[d] = i+d, min[d] = x[i+d] - int i, idx[8]; - double min[8]; - - SET(0); SET(1); SET(2); SET(3); - SET(4); SET(5); SET(6); SET(7); - - for (i = 0; i < n; i += 8) { - PUT(0); PUT(1); PUT(2); PUT(3); - PUT(4); PUT(5); PUT(6); PUT(7); - } - - n = 0; - for (i = 1; i < 8; i++) { - if (min[i] < min[n]) { - n = i; - } - } - return idx[n]; -#undef PUT -#undef SET -} - -int -blas·argmin(int len, double *x, int inc) -{ - double min; - int i, ix, n; - - if (len == 0) { - return -1; - } - - if (inc == 1) { - n = EVEN_BY(len, 8); - ix = argmin_kernel8_avx2(n, x); - min = x[ix]; - x += n; - } else { - n = EVEN_BY(len, 4); - ix = 0; - min = x[ix]; - for (i = 0; i < n; i += 4, x += 4*inc) { - if (x[0*inc] < min) { - ix = i; - min = x[0*inc]; - } - if (x[1*inc] < min) { - ix = i+1; - min = x[1*inc]; - } - if (x[2*inc] < min) { - ix = i+2; - min = x[2*inc]; - } - if (x[3*inc] < min) { - ix = i+3; - min = x[3*inc]; - } - } - } - for (; n < len; n++, x += inc) { - if (*x < min) { - ix = n; - min = *x; - } - - } - - return ix; -} - -// ----------------------------------------------------------------------- -// level two - -/* - * Notation: (number of rows) x (number of columns) _ unroll factor - * N => variable we sum over - */ - - -// NOTE: All triangular matrix methods are assumed packed and upper for now! - -/* - * triangular shaped transformation - * x = Mx - * @M: square triangular - * TODO(PERF): Diagnose speed issues - * TODO: Finish all other flag cases! - */ -void -blas·tpmv(blas·Flag f, int n, double *m, double *x) -{ - int i; - for (i = 0; i < n; m += (n-i), ++x, ++i) { - *x = blas·dot(n-i, m, 1, x, 1); - } -} - -/* - * solve triangular set of equations - * x = M^{-1}b - * @M: square triangular - * TODO(PERF): Diagnose speed issues - * TODO: Finish all other flag cases! 
- */ -void -blas·tpsv(blas·Flag f, int n, double *m, double *x) -{ - int i; - double r; - - x += (n - 1); - m += ((n * (n+1))/2 - 1); - for (i = n-1; i >= 0; --i, --x, m -= (n-i)) { - r = blas·dot(n-i-1, m+1, 1, x+1, 1); - *x = (*x - r) / *m; - } -} - -/* - * general affine transformation - * y = aMx + by - */ - -static -void -gemv_8xN_kernel4_avx2(int ncol, double *row[8], double *x, double *y) -{ - int c; - __m128d hr; - __m256d x256, r256[8]; - - for (c = 0; c < 8; c++) { - r256[c] = _mm256_setzero_pd(); - } - - for (c = 0; c < ncol; c += 4) { - x256 = _mm256_loadu_pd(x+c); - r256[0] += x256 * _mm256_loadu_pd(row[0] + c); - r256[1] += x256 * _mm256_loadu_pd(row[1] + c); - r256[2] += x256 * _mm256_loadu_pd(row[2] + c); - r256[3] += x256 * _mm256_loadu_pd(row[3] + c); - r256[4] += x256 * _mm256_loadu_pd(row[4] + c); - r256[5] += x256 * _mm256_loadu_pd(row[5] + c); - r256[6] += x256 * _mm256_loadu_pd(row[6] + c); - r256[7] += x256 * _mm256_loadu_pd(row[7] + c); - } - - y[0] = hsum_avx2(r256[0]); - y[1] = hsum_avx2(r256[1]); - y[2] = hsum_avx2(r256[2]); - y[3] = hsum_avx2(r256[3]); - y[4] = hsum_avx2(r256[4]); - y[5] = hsum_avx2(r256[5]); - y[6] = hsum_avx2(r256[6]); - y[7] = hsum_avx2(r256[7]); -} - -static -void -gemv_4xN_kernel4_avx2(int ncol, double *row[4], double *x, double *y) -{ - int c; - __m128d hr; - __m256d x256, r256[4]; - - for (c = 0; c < 4; c++) { - r256[c] = _mm256_setzero_pd(); - } - - for (c = 0; c < ncol; c += 4) { - x256 = _mm256_loadu_pd(x+c); - r256[0] += x256 * _mm256_loadu_pd(row[0] + c); - r256[1] += x256 * _mm256_loadu_pd(row[1] + c); - r256[2] += x256 * _mm256_loadu_pd(row[2] + c); - r256[3] += x256 * _mm256_loadu_pd(row[3] + c); - } - - y[0] = hsum_avx2(r256[0]); - y[1] = hsum_avx2(r256[1]); - y[2] = hsum_avx2(r256[2]); - y[3] = hsum_avx2(r256[3]); -} - -static -void -gemv_4xN_kernel4(int n, double *row[4], double *x, double *y) -{ - int c; - double res[4]; - - res[0] = 0.0; - res[1] = 0.0; - res[2] = 0.0; - res[3] = 0.0; - - for (c = 0; c < n; c += 4) { - res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3]; - res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3]; - res[2] += row[2][c+0]*x[c+0] + row[2][c+1]*x[c+1] + row[2][c+2]*x[c+2] + row[2][c+3]*x[c+3]; - res[3] += row[3][c+0]*x[c+0] + row[3][c+1]*x[c+1] + row[3][c+2]*x[c+2] + row[3][c+3]*x[c+3]; - } - - y[0] = res[0]; - y[1] = res[1]; - y[2] = res[2]; - y[3] = res[3]; -} - -static -void -gemv_2xN_kernel4_avx2(int n, double *row[2], double *x, double *y) -{ - int c; - __m128d hr; - __m256d x256, r256[2]; - - for (c = 0; c < 2; c++) { - r256[c] = _mm256_setzero_pd(); - } - - for (c = 0; c < n; c += 4) { - x256 = _mm256_loadu_pd(x+c); - r256[0] += x256 * _mm256_loadu_pd(row[0] + c); - r256[1] += x256 * _mm256_loadu_pd(row[1] + c); - } - - y[0] = hsum_avx2(r256[0]); - y[1] = hsum_avx2(r256[1]); -} - -static -void -gemv_2xN_kernel4(int n, double *row[2], double *x, double *y) -{ - int c; - double res[2]; - - res[0] = 0.0; - res[1] = 0.0; - - for (c = 0; c < n; c += 4) { - res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3]; - res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3]; - } - - y[0] = res[0]; - y[1] = res[1]; -} - -static -void -gemv_1xN_kernel4_avx2(int n, double *row, double *x, double *y) -{ - int c; - __m128d r128; - __m256d r256; - - r256 = _mm256_setzero_pd(); - for (c = 0; c < n; c += 4) { - r256 += _mm256_loadu_pd(row+c) * 
_mm256_loadu_pd(x+c); - } - - r128 = _mm_add_pd(_mm256_extractf128_pd(r256, 0), _mm256_extractf128_pd(r256, 1)); - r128 = _mm_hadd_pd(r128, r128); - - *y = r128[0]; - *y = hsum_avx2(r256); -} - -static -void -gemv_1xN_kernel4(int n, double *row, double *x, double *y) -{ - int c; - double res; - - res = 0.; - for (c = 0; c < n; c += 4) { - res += row[c+0]*x[c+0] + row[c+1]*x[c+1] + row[c+2]*x[c+2] + row[c+3]*x[c+3]; - } - - *y = res; -} - -error -blas·gemv(int nrow, int ncol, double a, double *m, int incm, double *x, int incx, double b, double *y, int incy) -{ - int c, r, nr, nc; - double *row[8], res[8]; - enum { - err·nil, - err·incm, - }; - - if (incm < ncol) { - errorf("aliased matrix: inc = %d < ncols = %d", incm, ncol); - return err·incm; - } - - if (incx == 1 && incy == 1) { - nc = EVEN_BY(ncol, 4); - - nr = EVEN_BY(nrow, 8); - for (r = 0; r < nr; r += 8) { - row[0] = m + ((r+0) * incm); - row[1] = m + ((r+1) * incm); - row[2] = m + ((r+2) * incm); - row[3] = m + ((r+3) * incm); - row[4] = m + ((r+4) * incm); - row[5] = m + ((r+5) * incm); - row[6] = m + ((r+6) * incm); - row[7] = m + ((r+7) * incm); - - gemv_8xN_kernel4_avx2(nc, row, x, res); - - for (c = nc; c < ncol; c++) { - res[0] += row[0][c]*x[c]; - res[1] += row[1][c]*x[c]; - res[2] += row[2][c]*x[c]; - res[3] += row[3][c]*x[c]; - res[4] += row[4][c]*x[c]; - res[5] += row[5][c]*x[c]; - res[6] += row[6][c]*x[c]; - res[7] += row[7][c]*x[c]; - } - - y[r+0] = a*res[0] + b*y[r+0]; - y[r+1] = a*res[1] + b*y[r+1]; - y[r+2] = a*res[2] + b*y[r+2]; - y[r+3] = a*res[3] + b*y[r+3]; - y[r+4] = a*res[4] + b*y[r+4]; - y[r+5] = a*res[5] + b*y[r+5]; - y[r+6] = a*res[6] + b*y[r+6]; - y[r+7] = a*res[7] + b*y[r+7]; - } - - nr = EVEN_BY(nrow, 4); - for (; r < nr; r += 4) { - row[0] = m + ((r+0) * incm); - row[1] = m + ((r+1) * incm); - row[2] = m + ((r+2) * incm); - row[3] = m + ((r+3) * incm); - - gemv_4xN_kernel4_avx2(nc, row, x, res); - - for (c = nc; c < ncol; c++) { - res[0] += row[0][c]*x[c]; - res[1] += row[1][c]*x[c]; - res[2] += row[2][c]*x[c]; - res[3] += row[3][c]*x[c]; - } - - y[r+0] = a*res[0] + b*y[r+0]; - y[r+1] = a*res[1] + b*y[r+1]; - y[r+2] = a*res[2] + b*y[r+2]; - y[r+3] = a*res[3] + b*y[r+3]; - } - - nr = EVEN_BY(nrow, 2); - for (; r < nr; r += 2) { - row[0] = m + ((r+0) * incm); - row[1] = m + ((r+1) * incm); - gemv_2xN_kernel4_avx2(nc, row, x, res); - - for (c = nc; c < ncol; c++) { - res[0] += row[0][c]*x[c]; - res[1] += row[1][c]*x[c]; - } - - y[r+0] = a*res[0] + b*y[r+0]; - y[r+1] = a*res[1] + b*y[r+1]; - } - - for (; r < nrow; r++) { - row[0] = m + ((r+0) * ncol); - res[0] = blas·dot(ncol, row[0], 1, x, 1); - y[r] = a*res[0] + b*y[r]; - } - } - - return 0; -} - -/* - * rank one addition - * M = ax(y^T) + M - * TODO: vectorize kernel - */ - -void -blas·ger(int nrow, int ncol, double a, double *x, double *y, double *m) -{ - int i, j; - - for (i = 0; i < nrow; i++) { - for (j = 0; j < ncol; j++) { - m[i+ncol*j] += a * x[i] * y[j]; - } - } -} - -/* - * rank one addition - * M = ax(x^T) + M - */ - -void -blas·her(int n, double a, double *x, double *m) -{ - blas·ger(n, n, a, x, x, m); -} - -/* - * symmetric rank one addition - * M = ax(x^T) + M - * TODO: vectorize kernel - */ - -void -blas·syr(int nrow, int ncol, double a, double *x, double *m) -{ - int i, j; - - for (i = 0; i < nrow; i++) { - for (j = 0; j < ncol; j++) { - m[i+ncol*j] += a * x[i] * x[j]; - } - } -} - - -// ----------------------------------------------------------------------- -// level three - -/* - * matrix multiplication - * m3 = a(m1 * m2) + b(m3) - * 
einstein notation: - * m3_{ij} = a m1_{ik} m2_{kj} + b m3_{ij} - * - * n1 = # rows of m1 = # rows of m3 - * n2 = # cols of m2 = # cols of m3 - * n3 = # cols of m1 = # rows of m2 - * - * TODO: Right now we are 2x slower than OpenBLAS. - * This is because we use a very simple algorithm. - */ - -void -blas·gemm(int n1, int n2, int n3, double a, double *m1, double *m2, double b, double *m3) -{ - int i, j, k, len; - - // TODO: Is there anyway this computation can be integrated into the one below? - for (i = 0; i < n1; i++) { - for (j = 0; j < n2; j++) { - m3[i + n2*j] *= b; - } - } - - len = n1 & ~7; - for (j = 0; j < n2; j++) { - for (k = 0; k < n3; k++) { - axpy_kernel8_avx2(len, a * m2[k + n2*j], m1 + n3*k, m3 + n2*j); - - // remainder - for (i = len; i < n1; i++) { - m3[i + n2*j] += a * m1[i + n3*k] * m2[k + n2*j]; - } - } - } -} - -/* - * triangular matrix multiplication - * m2 = a * m1 * m2 _OR_ a * m2 * m1 - * m1 is assumed triangular - * einstein notation: - * m2_{ij} = a m1_{ik} m2_{kj} _OR_ a m1_{kj} m2_{ik} - * - * nrow = # rows of m2 - * ncol = # cols of m2 - * TODO(PERF): make compute kernel - * TODO: finish all other flags - */ -void -blas·trmm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2) -{ - int i, j, k, len; - - for (i = 0; i < nrow; i++) { - for (j = 0; j < ncol; j++) { - for (k = i; k < ncol; k++) { - m2[i + ncol*j] += a * m1[i + nrow*k] * m2[k + ncol*j]; - } - } - } -} - -/* - * solve triangular matrix system of equations - * m2 = a * m1^{-1L} _OR_ a * m2 * m1 - * m1 is assumed triangular - * - * nrow = # rows of m2 - * ncol = # cols of m2 - * TODO: complete stub - * TODO: finish all other flags - */ -void -blas·trsm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2) -{ -} - -#define NITER 1000 -#define NCOL 2005 -#define NROW 2005 - -error -test·level3() -{ - vlong i, n; - clock_t t; - - double *x, *y, *m[3]; - double res[2], tprof[2]; - - // openblas_set_num_threads(1); - - x = malloc(sizeof(*x)*NCOL); - y = malloc(sizeof(*x)*NCOL); - m[0] = malloc(sizeof(*x)*NCOL*NROW); - m[1] = malloc(sizeof(*x)*NCOL*NROW); - m[2] = malloc(sizeof(*x)*NCOL*NROW); - - tprof[0] = 0; - tprof[1] = 0; - - for (n = 0; n < NITER; n++) { - for (i = 0; i < NCOL; i++) { - x[i] = i*i+1; - y[i] = i+1; - } - - for (i = 0; i < NCOL*NROW; i++) { - m[0][i] = i/(NCOL*NROW); - m[1][i] = i*i/(NCOL*NROW*NCOL*NROW); - m[2][i] = 1; - } - - t = clock(); - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, NCOL, NROW, NROW, 1.2, m[0], NROW, m[1], NROW, 2.8, m[2], NROW); - t = clock() - t; - tprof[1] += 1000.*t/CLOCKS_PER_SEC; - res[1] = cblas_ddot(NROW*NCOL, m[2], 1, m[2], 1); - - for (i = 0; i < NCOL; i++) { - x[i] = i*i+1; - y[i] = i+1; - } - - for (i = 0; i < NCOL*NROW; i++) { - m[0][i] = i/(NCOL*NROW); - m[1][i] = i*i/(NCOL*NROW*NCOL*NROW); - m[2][i] = 1; - } - - t = clock(); - blas·gemm(NROW, NROW, NROW, 1.2, m[0], m[1], 2.8, m[2]); - t = clock() - t; - tprof[0] += 1000.*t/CLOCKS_PER_SEC; - res[0] = blas·dot(NROW*NCOL, m[2], 1, m[2], 1); - } - printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); - printf("--> result (naive): %f\n", res[0]); - printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); - printf("--> result (oblas): %f\n", res[1]); - - return 0; -} - -void -test·level2() -{ - int i, j, n, it; - clock_t t; - - double *x, *y, *z, *m; - double tprof[2]; - - rng·init(0); - - tprof[0] = 0; - tprof[1] = 0; - - x = malloc(sizeof(*x)*NCOL); - y = malloc(sizeof(*x)*NCOL); - z = malloc(sizeof(*x)*NCOL); - m = malloc(sizeof(*x)*NROW*NCOL); - - for 
(it = 0; it < NITER; it++) { - n = 0; - for (i = 0; i < NROW; i++) { - x[i] = rng·random(); - y[i] = rng·random(); - for (j = 0; j < NCOL; j++) { - m[n++] = rng·random() + .1; - } - } - - memcpy(z, y, NCOL * sizeof(*y)); - - t = clock(); - blas·gemv(NROW, NCOL, 2, m, NCOL, x, 1, 1.0, y, 1); - t = clock() - t; - - tprof[0] += 1000.*t/CLOCKS_PER_SEC; - - t = clock(); - cblas_dgemv(CblasRowMajor, CblasNoTrans, NROW, NCOL, 2, m, NROW, x, 1, 1.0, z, 1); - t = clock() - t; - - tprof[1] += 1000.*t/CLOCKS_PER_SEC; - - for (i = 0; i < NCOL; i++) { - if (math·abs(z[i] - y[i])/math·abs(x[i]) > 1e-5) { - errorf("failure at index %d: %f != %f", i, z[i], y[i]); - } - } - } - - printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); - printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); -} - -void -print·array(int n, double *x) -{ - double *end; - printf("["); - for (end=x+n; x != end; ++x) { - printf("%f,", *x); - } - printf("]\n"); -} - -error -test·level1() -{ - int ai, ai2, i, n; - double *x, *y; - double tprof[2]; - // double params[5]; - clock_t t; - - x = malloc(sizeof(*x)*NCOL); - y = malloc(sizeof(*x)*NCOL); - rng·init(0); - - // params[0] = -1.; - // params[1] = 100; params[2] = 20; params[3] = 30; params[4] = 10; - - for (n = 0; n < NITER; n++) { - for (i = 0; i < NCOL; i++) { - y[i] = rng·random(); - } - memcpy(x, y, sizeof(*x)*NCOL); - - t = clock(); - ai = blas·argmin(NCOL, x, 1); - t = clock() - t; - tprof[0] += 1000.*t/CLOCKS_PER_SEC; - - if (n == 20729) { - printf("[%d]=%f vs [%d]=%f\n", 74202, x[74202], 3, x[3]); - } - memcpy(x, y, sizeof(*x)*NCOL); - - t = clock(); - ai2 = cblas_idamin(NCOL, x, 1); - t = clock() - t; - tprof[1] += 1000.*t/CLOCKS_PER_SEC; - - if (ai != ai2) { - printf("iteration %d: %d not equal to %d. %f vs %f\n", n, ai, ai2, x[ai], x[ai2]); - } - } - - printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER); - printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER); - - double a, b, c, s; - - a = 10.234, b = 2.; - cblas_drotg(&a, &b, &c, &s); - printf("%f, %f, %f, %f\n", a, b, c, s); - - a = 10.234, b = 2.; - blas·rotg(&a, &b, &c, &s); - printf("%f, %f, %f, %f\n", a, b, c, s); - - return 0; -} - -#define STEP 1 -error -test·argmax() -{ - int i, n; - double *x, *y, *w, *z; - double res[2], tprof[2]; - int idx[2]; - clock_t t; - - x = malloc(sizeof(*x)*NCOL); - y = malloc(sizeof(*x)*NCOL); - w = malloc(sizeof(*x)*NCOL); - z = malloc(sizeof(*x)*NCOL); - rng·init(0); - - for (n = 0; n < NITER; n++) { - for (i = 0; i < NCOL; i++) { - x[i] = rng·random(); - y[i] = rng·random(); - - w[i] = x[i]; - z[i] = y[i]; - } - - t = clock(); - idx[0] = cblas_idamin(NCOL/STEP, w, STEP); - t = clock() - t; - tprof[1] += 1000.*t/CLOCKS_PER_SEC; - - t = clock(); - idx[1] = blas·argmin(NCOL/STEP, x, STEP); - t = clock() - t; - tprof[0] += 1000.*t/CLOCKS_PER_SEC; - - if (idx[0] != idx[1]) { - errorf("%d != %d", idx[0], idx[1]); - } - // if (math·abs(res[0] - res[1])/math·abs(res[0]) > 1e-4) { - // errorf("%f != %f", res[0], res[1]); - // } - // for (i = 0; i < NCOL; i++) { - // if (math·abs(x[i] - w[i]) + math·abs(y[i] - z[i]) > 1e-4) { - // errorf("%f != %f & %f != %f at index %d", x[i], w[i], y[i], z[i], i); - // } - // } - } - - return 0; -} - -error -main() -{ - test·level2(); - return 0; -} diff --git a/sys/libmath/rules.mk b/sys/libmath/rules.mk index 049092a..73d63e3 100644 --- a/sys/libmath/rules.mk +++ b/sys/libmath/rules.mk @@ -28,13 +28,19 @@ BINS := $(BINS) $(BINS_$(d)) # $(LIBS_$(d)) = TCINCS := # $(LIBS_$(d)) = TCLIBS := +GENERATE = python $^ > $@ 
+$(d)/blas1.c: $(d)/gen1.py + @$(GENERATE) + $(LIBS_$(d)): $(OBJS_$(d)) - $(ARCHIVE) + @echo AR $^ + @$(ARCHIVE) $(BINS_$(d)): TCFLAGS := -D_GNU_SOURCE $(BINS_$(d)): TCLIBS := -lpthread -lm $(BINS_$(d)): $(OBJS_$(d)) $(OBJ_DIR)/libn/libn.a $(LIB_DIR)/vendor/libblas.a - $(LINK) + @echo BIN $@ + @$(LINK) # ---- Pop off stack ---- -include $(DEPS_$(d)) diff --git a/sys/libn/rules.mk b/sys/libn/rules.mk index a60a3d5..7efcae6 100644 --- a/sys/libn/rules.mk +++ b/sys/libn/rules.mk @@ -34,11 +34,13 @@ BINS := $(BINS) $(BINS_$(d)) # $(LIBS_$(d)) := TCLIBS := $(LIBS_$(d)): $(OBJS_$(d)) - $(ARCHIVE) + @echo LIB $@ + @$(ARCHIVE) $(BINS_$(d)): TCLIBS := $(LIBS_$(d)) $(LIB_DIR)/vendor/libz.a $(BINS_$(d)): $(OBJ_DIR)/libn/test.o $(LIBS_$(d)) - $(LINK) + @echo BIN $@ + @$(LINK) # ---- Pop off stack ---- -include $(DEPS_$(d)) -- cgit v1.2.1
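
The level-1 routines removed above all share one driver shape: round the length down to a multiple of the unroll width, run an unrolled (or AVX) kernel over that prefix, then finish the remainder with a plain scalar loop. Below is a minimal, self-contained C sketch of that pattern, assuming EVEN_BY(n, k) rounds n down to a multiple of k for power-of-two k (the macro's real definition sits outside this hunk); the scale example and its names are illustrative only, not the library's API.

/* Sketch of the unrolled-prefix + scalar-tail pattern used by the
 * level-1 kernels above.  EVEN_BY is an assumed definition here:
 * it masks n down to the nearest multiple of k (k a power of two). */
#include <stdio.h>

#define EVEN_BY(n, k) ((n) & ~((k) - 1))

static void
scale_kernel4(int n, double a, double *x)
{
    int i;

    /* n is a multiple of 4, so the body can be unrolled 4-wide */
    for (i = 0; i < n; i += 4) {
        x[i+0] *= a;
        x[i+1] *= a;
        x[i+2] *= a;
        x[i+3] *= a;
    }
}

static void
scale(int len, double a, double *x)
{
    int n;

    n = EVEN_BY(len, 4);        /* unrolled prefix */
    scale_kernel4(n, a, x);

    for (; n < len; n++) {      /* scalar remainder */
        x[n] *= a;
    }
}

int
main(void)
{
    int i;
    double x[7] = {1, 2, 3, 4, 5, 6, 7};

    scale(7, 2.0, x);
    for (i = 0; i < 7; i++)
        printf("%g ", x[i]);    /* prints: 2 4 6 8 10 12 14 */
    printf("\n");
    return 0;
}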