author    Nicholas Noll <nbnoll@eml.cc>  2020-05-13 17:30:19 -0700
committer Nicholas Noll <nbnoll@eml.cc>  2020-05-13 17:30:19 -0700
commit    d982e7c2fdebf560ccce193cb98b85d4fac28a45 (patch)
tree      b18902eea12a2d55a24994ca0681ca1a369631aa
parent    c9d4b2d7dd1d9a46571e5d2b2cf6ce10a9d9ebea (diff)
blas 1 generation code complete
-rw-r--r--  include/libmath.h      46
-rw-r--r--  lib/c.py             1579
-rw-r--r--  rules.mk               17
-rw-r--r--  sys/libbio/rules.mk     6
-rw-r--r--  sys/libmath/blas.c    127
-rw-r--r--  sys/libmath/blas1.c  2062
-rwxr-xr-x  sys/libmath/gen1.py   357
-rw-r--r--  sys/libmath/linalg.c   15
-rw-r--r--  sys/libmath/proto.c  1761
-rw-r--r--  sys/libmath/rules.mk   10
-rw-r--r--  sys/libn/rules.mk       6
11 files changed, 2480 insertions, 3506 deletions
diff --git a/include/libmath.h b/include/libmath.h
index 5a7dc4e..40ae4ee 100644
--- a/include/libmath.h
+++ b/include/libmath.h
@@ -130,49 +130,7 @@ double math·trunc(double);
float math·truncf(float);
// -----------------------------------------------------------------------
-// basic linear algebra compute kernels
-
-// TODO: think of better names
-enum
-{
- blas·LowerTri = 1u,
- blas·Transpose = 2u,
- blas·ConjTranspose = 4u,
- blas·DiagOnes = 8u,
- blas·LeftSide = 16u,
-};
-
-typedef uint32 blas·Flag;
-
-/* level 1 */
-void blas·rot(int len, double *x, int incx, double *y, int incy, double cos, double sin);
-void blas·rotg(double *a, double *b, double *cos, double *sin);
-error blas·rotm(int len, double *x, int incx, double *y, int incy, double p[5]);
-void blas·scale(int len, double a, double *x, int inc);
-void blas·copy(int len, double *x, int incx, double *y, int incy);
-void blas·swap(int len, double *x, int incx, double *y, int incy);
-void blas·axpy(int len, double a, double *x, int incx, double *y, int incy);
-double blas·dot(int len, double *x, int incx, double *y, int incy);
-double blas·norm(int len, double *x, int inc);
-double blas·sum(int len, double *x, int inc);
-int blas·argmax(int len, double *x, int inc);
-int blas·argmin(int len, double *x, int inc);
-
-/* level 2 */
-void blas·tpmv(blas·Flag f, int n, double *m, double *x);
-error blas·gemv(int nrow, int ncol, double a, double *m, int incm, double *x, int incx, double b, double *y, int incy) ;
-void blas·tpsv(blas·Flag f, int n, double *m, double *x);
-void blas·ger(int nrow, int ncol, double a, double *x, double *y, double *m);
-void blas·her(int n, double a, double *x, double *m);
-void blas·syr(int nrow, int ncol, double a, double *x, double *m);
-
-/* level 3 */
-void blas·gemm(int n1, int n2, int n3, double a, double *m1, double *m2, double b, double *m3);
-void blas·trmm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2);
-void blas·trsm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2);
-
-// -----------------------------------------------------------------------
-// higher level linear algebra
+// linear algebra
typedef struct math·Vector
{
@@ -185,7 +143,7 @@ typedef struct math·Vector
typedef struct math·Matrix
{
double *data;
- blas·Flag kind;
+ uint32 kind;
int dim[2];
} math·Matrix;
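
The level-1 prototypes dropped from libmath.h above reappear in this commit as type-suffixed routines generated into sys/libmath/blas1.c (blas·dotf, blas·axpyf, blas·normf, and so on; see the new file below). A minimal usage sketch against those generated signatures — the exact declarations exported through libmath.h are assumed here:

#include <u.h>
#include <libn.h>
#include <libmath.h>

/* sketch only: signatures taken from the generated blas1.c below */
static void
level1_example(int len, float *x, float *y)
{
	float d, nrm;
	int   imax;

	blas·axpyf(len, 2.f, x, 1, y, 1);    /* y <- y + 2x, unit strides  */
	d    = blas·dotf(len, x, 1, y, 1);   /* inner product              */
	nrm  = blas·normf(len, x, 1);        /* Euclidean norm             */
	imax = blas·argmaxf(len, x, 1);      /* index of the largest entry */
	(void)d; (void)nrm; (void)imax;
}
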
diff --git a/lib/c.py b/lib/c.py
deleted file mode 100644
index bbc9ce3..0000000
--- a/lib/c.py
+++ /dev/null
@@ -1,1579 +0,0 @@
-#! /bin/python3
-from __future__ import annotations
-
-import os
-import sys
-
-import sympy
-
-from enum import Enum
-from typing import Callable, List, Tuple, Dict
-from functools import singledispatch
-
-numel = len
-
-# ------------------------------------------------------------------------
-# String buffer
-
-level = 0
-buffer = ""
-
-def emit(s: str):
- global buffer
- buffer += s
-
-def emitln(n=1):
- global buffer
- buffer += "\n"*n + (" " * level)
-
-def enter_scope():
- global level
- emit("{")
- level += 1
- emitln()
-
-def exits_scope():
- global level
- level -= 1
- emitln()
- emit("}")
-
-def emitheader():
- emit("#include <u.h>\n")
- emit("#include <libn.h>\n")
- emit("#include <libmath.h>\n")
- emitln()
-
-# ------------------------------------------------------------------------
-# Simple C AST
-# TODO: Type checking
-
-# Abstract class everything will derive from
-# All AST nodes will have an "emit" function that outputs formatted C code
-class Emitter(object):
- def emit(self):
- pass
-
-# ------------------------------------------
-# Representation of a C type
-
-class Type(Emitter):
- def emit(self):
- pass
-
- def emitspec(self, var):
- pass
-
-class Base(Type):
- def __init__(self, name: str):
- self.name = name
-
- def __hash__(self):
- return hash(self.name)
-
- def __eq__(self, other):
- return type(self) == type(other) and self.name == other.name
-
- def __str__(self):
- return self.name
-
- def emit(self):
- emit(self.name)
-
- def emitspec(self, ident):
- emit(f"{ident}")
-
-# TODO: Operation lookup tables...
-
-class Ptr(Type):
- def __init__(self, to: Type):
- self.to = to
-
- def __str__(self):
- return f"*{self.to}"
-
- def __eq__(self, other):
- return type(self) == type(other) and self.to == other.to
-
- def __hash__(self):
- return hash((hash(self.to), "*"))
-
- def emit(self):
- self.to.emit()
-
- def emitspec(self, ident):
- emit("*")
- self.to.emitspec(ident)
-
-class Array(Type):
- def __init__(self, base: Type, len: int):
- self.base = base
- self.len = len
-
- def __str__(self):
- return f"[{self.len}]{self.base}"
-
- def __eq__(self, other):
- return type(self) == type(other) and self.base == other.base
-
- def __hash__(self):
- return hash((hash(self.base), self.len))
-
- def emit(self):
- self.base.emit()
-
- def emitspec(self, ident):
- self.base.emitspec(ident)
- emit(f"[{self.len}]")
-
-# Machine primitive types
-Void = Base("void")
-Error = Base("error")
-
-Byte = Base("byte")
-String = Ptr(Byte)
-
-Int = Base("int")
-Int8 = Base("int8")
-Int16 = Base("int16")
-Int32 = Base("int32")
-Int64 = Base("int64")
-Int32x4 = Base("__m128i")
-Int32x8 = Base("__m256i")
-Int64x2 = Base("__m128i")
-Int64x4 = Base("__m256i")
-
-Float32 = Base("float")
-Float64 = Base("double")
-Float32x4 = Base("__m128")
-Float32x8 = Base("__m256")
-Float64x2 = Base("__m128d")
-Float64x4 = Base("__m256d")
-
-def IsVectorType(kind):
- if kind is Float32x4 or \
- kind is Float32x8 or \
- kind is Float64x2 or \
- kind is Float64x4 or \
- kind is Int32x4 or \
- kind is Int32x8 or \
- kind is Int64x2 or \
- kind is Int64x4:
- return True
-
- if type(kind) == Ptr:
- return IsVectorType(kind.to)
-
- return False
-
-def IsArrayType(kind):
- return isinstance(kind, Array)
-
-# TODO: Make this ARCH dependent
-BitDepth= {
- Void: 8,
- Int: 32,
- Int32: 32,
- Int64: 64,
- Float32: 32,
- Float64: 64,
-}
-
-class SIMD(Enum):
- SSE = "sse"
- SSE2 = "sse2"
- AVX = "avx"
- AVX2 = "avx2"
- FMA3 = "fma3"
- AVX5 = "avx512"
-
-RegisterSize = {
- SIMD.SSE: 128,
- SIMD.SSE2: 128,
- SIMD.AVX: 256,
- SIMD.AVX2: 256,
- SIMD.FMA3: 256,
- SIMD.AVX5: 512,
-}
-
-SIMDSupport = {
- SIMD.SSE: set([Float32]),
- SIMD.SSE2: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]),
- SIMD.AVX: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]),
- SIMD.AVX2: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]),
- SIMD.FMA3: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]),
- SIMD.AVX5: set([Float32, Float64, Int, Int8, Int16, Int32, Int64]),
-}
-
-# TODO: Think of a better way to handle this
-def emit·load128(l, r):
- if l is not None:
- l.emit()
- emit(" = ")
- emit("_mm_loadu_sd(")
- r.emit()
- emit(")")
-
-def emit·broadcast256(l, r):
- l.emit()
- emit(" = _mm256_broadcastsd_pd(")
- r.emit()
- emit(")")
-
-def emit·load256(l, r):
- if l is not None:
- l.emit()
- emit(" = ")
- emit("_mm256_loadu_pd(")
- r.emit()
- emit(")")
-
-def emit·store256(l, r):
- emit("_mm256_storeu_pd(")
- l.emit()
- emit(", ")
- r.emit()
- emit(")")
-
-def emit·copy256(l, r):
- emit("_mm256_storeu_pd(")
- l.emit()
- emit(", ")
- emit("_mm256_loadu_pd(")
- r.emit()
- emit("))")
-
-# TODO: Typedefs...
-
-# ------------------------------------------
-# C expressions
-class Expr(Emitter):
- def emit():
- pass
-
-# Literals
-class Literal(Expr):
- def emit(self):
- emit(f"{self}")
-
-class I(Literal, int):
- def __new__(cls, i: int):
- return super(I, cls).__new__(cls, i)
-
-class F(Literal, float):
- def __new__(cls, f: float):
- return super(F, cls).__new__(cls, f)
-
-class S(Literal, str):
- def __new__(cls, s: str):
- return super(S, cls).__new__(cls, s)
-
-# Ident of symbol
-class Ident(Expr):
- def __init__(self, var):
- self.name = var.name
- self.var = var
-
- def __str__(self):
- return str(self.name)
-
- def __hash__(self):
- return hash(self.name)
-
- def __eq__(self, other):
- return type(self) == type(other) and self.name == other.name
-
- def emit(self):
- emit(f"{self.name}")
-
-
-# Unary operators
-class UnaryOp(Expr):
- def __init__(self, x: Expr):
- self.x = x
-
- def emit(self):
- pass
-
-class Deref(UnaryOp):
- method = {
- Ptr(Float64x4) : lambda x: emit·load256(None, x),
- }
-
- def emit(self):
- kind = GetType(self.x)
- if kind in self.method:
- self.method[kind](self.x)
- else:
- emit("*")
- self.x.emit()
-
-class Negate(UnaryOp):
- def emit(self):
- emit("~")
- self.x.emit()
-
-class Ref(UnaryOp):
- def emit(self):
- emit("&")
- self.x.emit()
-
-class Inc(UnaryOp):
- def __init__(self, x: Expr, pre=False):
- self.x = x
- self.pre = pre
-
- def emit(self):
- if self.pre:
- emit("++")
- self.x.emit()
- else:
- self.x.emit()
- emit("++")
-
-class Dec(UnaryOp):
- def __init__(self, x: Expr, pre=False):
- self.x = x
- self.pre = pre
-
- def emit(self):
- if self.pre:
- emit("--")
- self.x.emit()
- else:
- self.x.emit()
- emit("--")
-
-# Binary operators
-class BinaryOp(Expr):
- def __init__(self, left: Expr, right: Expr):
- self.l = left
- self.r = right
-
- def emit(self):
- pass
-
-# TODO: check types if they are vectorized and emit correct intrinsic
-class Add(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" + ")
- self.r.emit()
-
-class Sub(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" - ")
- self.r.emit()
-
-class Mul(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" * ")
- self.r.emit()
-
-class Div(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" / ")
- self.r.emit()
-
-class And(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" & ")
- self.r.emit()
-
-class Xor(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" ^ ")
- self.r.emit()
-
-class GT(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" > ")
- self.r.emit()
-
-class LT(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" < ")
- self.r.emit()
-
-class GE(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" >= ")
- self.r.emit()
-
-class LE(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" <= ")
- self.r.emit()
-
-class EQ(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(f" == ")
- self.r.emit()
-
-# Loads a scalar
-class SIMDLoad(BinaryOp):
- def emit(self):
- self.l.emit()
- self.r.emit()
-
-# Loads data at address
-class SIMDLoadAt(BinaryOp):
- def emit(self):
- self.l.emit()
- emit(" = _mm256_loadu_pd(")
- self.r.emit()
- emit(")")
-
-# Assignment (stores)
-class Assign(Expr):
- def __init__(self, lhs: Expr, rhs: Expr):
- self.lhs = lhs
- self.rhs = rhs
-
-class Set(Assign):
- method = {
- (Float64x2, Ptr(Float64)) : emit·load128,
- (Float64x4, Float64x2) : emit·broadcast256,
- (Float64x4, Ptr(Float64x4)) : emit·load256,
- (Ptr(Float64x4), Float64x4) : emit·store256,
- (Ptr(Float64x4), Ptr(Float64x4)) : emit·copy256,
- }
- def emit(self):
- lhs = GetType(self.lhs)
- rhs = GetType(self.rhs)
- if (lhs, rhs) in self.method:
- self.method[(lhs, rhs)](self.lhs, self.rhs)
- else:
- self.lhs.emit()
- emit(f" = ")
- self.rhs.emit()
-
-class AddSet(Assign):
- def emit(self):
- self.lhs.emit()
- emit(f" += ")
- self.rhs.emit()
-
-class SubSet(Assign):
- def emit(self):
- self.lhs.emit()
- emit(f" -= ")
- self.rhs.emit()
-
-class MulSet(Assign):
- def emit(self):
- self.lhs.emit()
- emit(f" *= ")
- self.rhs.emit()
-
-class DivSet(Assign):
- def emit(self):
- self.lhs.emit()
- emit(f" /= ")
- self.rhs.emit()
-
-class Comma(Expr):
- def __init__(self, x: Expr, next: Expr):
- self.expr = (x, next)
-
- def emit(self):
- self.expr[0].emit()
- emit(", ")
- self.expr[1].emit()
-
-class Index(Expr):
- def __init__(self, x: Expr, i: Expr):
- self.x = x
- self.i = i
-
- def emit(self):
- self.x.emit()
- emit("[")
- self.i.emit()
- emit("]")
-
-class Paren(Expr):
- def __init__(self, x: Expr):
- self.x = x
-
- def emit(self):
- emit("(")
- self.x.emit()
- emit(")")
-
-class Call(Expr):
- def __init__(self, func: Func, args: List[Param]):
- self.func = func
- self.args = args
-
- def emit(self):
- emit(self.func.name)
- emit("(")
- if numel(self.args) > 0:
- self.args[0].emit()
- for arg in self.args[1:]:
- emit(", ")
- arg.emit()
- emit(")")
-
-def ExprList(*expr: Expr | List[Expr]):
- if numel(expr) > 1:
- x = expr[0]
- return Comma(x, ExprList(*expr[1:]))
- else:
- return expr[0]
-
-# Common assignments are kept globally
-# C statements
-
-class Stmt(Emitter):
- def emit(self):
- emit(";")
-
-class Empty(Stmt):
- pass
-
-class Block(Stmt):
- def __init__(self, *stmts: List[Stmt | Expr]):
- self.stmts = [ s if isinstance(s, Stmt) else StmtExpr(s) for s in stmts ]
-
- def emit(self):
- enter_scope()
- for i, stmt in enumerate(self.stmts):
- stmt.emit()
- if i < numel(self.stmts) - 1:
- emitln()
-
- exits_scope()
-
-class If(Stmt):
- def __init__(self, cond: Expr | List[Expr], then: Stmt | List[Stmt], orelse: Stmt | List[Stmt] | None = None):
- self.cond = cond if isinstance(cond, Expr) else ExprList(*cond)
- self.then = Block(then) if isinstance(then, list) else then
- self.orelse = Block(orelse) if isinstance(orelse, list) else orelse
-
- def emit(self):
- emit("if (")
- self.cond.emit()
- emit(") ")
- self.then.emit()
- if self.orelse is not None:
- self.orelse.emit()
-
-class For(Stmt):
- def __init__(self, init: Expr | List[Expr], cond: Expr | List[Expr], step: Expr | List[Expr], body: Stmt| None = None):
- self.init = init if isinstance(init, Expr) else ExprList(*init)
- self.cond = cond if isinstance(cond, Expr) else ExprList(*cond)
- self.step = step if isinstance(step, Expr) else ExprList(*step)
- self.body = body if body is not None else Empty()
-
- def emit(self):
- emit("for (")
- if self.init is not None:
- self.init.emit()
- emit("; ")
- if self.cond is not None:
- self.cond.emit()
- emit("; ")
- if self.step is not None:
- self.step.emit()
- emit(") ")
-
- if isinstance(self.body, Block):
- self.body.emit()
- else:
- enter_scope()
- self.body.emit()
- exits_scope()
-
- def execute(self, stmt: Stmt | Expr):
- if isinstance(stmt, Expr):
- stmt = StmtExpr(stmt)
-
- if isinstance(self.body, Empty):
- self.body = stmt
- return
- elif not isinstance(self.body, Block):
- self.body = Block(self.body)
- self.body.stmts.append(stmt)
-
-class Return(Stmt):
- def __init__(self, val: Expr):
- self.val = val
-
- def emit(self):
- emitln()
- emit("return ")
- self.val.emit()
- super(Return, self).emit()
-
-class StmtExpr(Stmt):
- def __init__(self, x: Expr):
- self.x = x
-
- def emit(self):
- self.x.emit()
- super(StmtExpr, self).emit()
-
-class Mem(Enum):
- Auto = ""
- Static = "static"
- Register = "register"
- Typedef = "typedef"
- External = "extern"
-
-class Decl(Emitter):
- def __init__(self):
- pass
-
- def emit(self):
- pass
-
-class Func(Decl):
- def __init__(self, name: str, ret: Expr = Void, params: List[Param] = None, vars: List[Var | List[Var]] = None, body: List[Stmt] = None):
- self.name = name
- self.ret = ret
- self.params = params if params is not None else []
- self.vars = vars if vars is not None else []
- self.stmts = body if body is not None else []
-
- def emit(self):
- self.ret.emit()
- emitln()
- emit(self.name)
- emit("(")
- for i, p in enumerate(self.params):
- p.emittype()
- emit(" ")
- p.emitspec()
- if i < numel(self.params) - 1:
- emit(", ")
- emit(")\n")
-
- enter_scope()
-
- for var in self.vars:
- if isinstance(var, list):
- v = var[0]
- v.emittype()
- emit(" ")
- v.emitspec()
- for v in var[1:]:
- emit(", ")
- v.emitspec()
- else:
- var.emittype()
- emit(" ")
- var.emitspec()
-
- emit(";")
- emitln()
-
- if numel(self.vars) > 0:
- emitln()
-
- for stmt in self.stmts[:-1]:
- stmt.emit()
- emitln()
- if numel(self.stmts) > 0:
- self.stmts[-1].emit()
-
- exits_scope()
- emitln(2)
-
- def declare(self, var: Var, *vars: List[Var | List[Var]]) -> Expr | List[Expr]:
- if var.name in [v.name for v in self.vars]:
- return
-
- self.vars.append(var)
- if numel(vars) == 0:
- return Ident(var)
-
- self.vars.extend(vars)
-
- idents = [Ident(var)]
- idents += [Ident(v) for v in vars]
- return idents
-
- def execute(self, stmt: Stmt | Expr, *args: List[Stmt | Expr]):
- def push(n):
- if isinstance(n, Stmt):
- self.stmts.append(n)
- elif isinstance(n, Expr):
- self.stmts.append(StmtExpr(n))
- else:
- raise TypeError("unrecognized type for function")
- push(stmt)
- for arg in args:
- push(arg)
-
- def variables(self, *idents: List[str]) -> List[Expr]:
- vars = {v.name : v for v in self.vars + self.params}
-
- if numel(idents) == 1:
- return Ident(vars[idents[0]])
- return [Ident(vars[ident]) for ident in idents]
-
-class Var(Decl):
- def __init__(self, type: Type, name: str, storage: Mem = Mem.Auto):
- self.name = name
- self.type = type
- self.storage = storage
-
- def emit(self):
- emit(f"{self.name}")
-
- def emittype(self):
- if self.storage != Mem.Auto:
- emit(self.storage.value)
- emit(" ")
- self.type.emit()
-
- def emitspec(self):
- self.type.emitspec(self.name)
-
- @classmethod
- def copy(cls, other: Var):
- return cls(other.type, other.name, other.storage)
-
-class Param(Var):
- def __init__(self, type: Type, name: str):
- return super(Param, self).__init__(type, name, Mem.Auto)
-
- @classmethod
- def copy(cls, other: Param):
- return cls(other.type, other.name)
-
-def Params(*ps: List[Tuple(Type, str)]) -> List[Param]:
- return [Param(p[0], p[1]) for p in ps]
-
-def Vars(*ps: List[Tuple(Type, str)]) -> List[Var]:
- return [Var(p[0], p[1]) for p in ps]
-
-# ------------------------------------------------------------------------
-# AST modification/production functions
-
-# ------------------------------------------
-# basic (non-recursive) commands
-
-def Swap(x: Var, y: Var, tmp: Var) -> Stmt:
- return StmtExpr(ExprList(Set(tmp, x), Set(x, y), Set(y, tmp)))
-
-def IsLoop(s: Stmt) -> bool:
- return type(s) == For
-
-def EvenTo(x: Var, n: int) -> Var:
- return And(x, Negate(I(n-1)))
-
-# ------------------------------------------
-# Expand: takes statement, indexed, and expands it x times
-
-# def Expand(s: Stmt, times: int) -> Block:
-# if not isinstance(s, StmtExpr):
-# raise TypeError(f"{type(x)} not supported by Expand operation")
-
-# return Block(StmtExpr(Step(s.x, i)) for i in range(times))
-
-# ------------------------------------------
-# repeat: takes command on an array and repeats it
-
-@singledispatch
-def Repeat(x: Expr, times: int) -> Expr | List[Expr]:
- raise TypeError(f"{type(x)} not supported by Repeat operation")
-
-@Repeat.register
-def _(x: Inc, times: int):
- return AddSet(x.x, I(times))
-
-@Repeat.register
-def _(x: Dec, times: int):
- return SubSet(x.x, I(times))
-
-@Repeat.register
-def _(x: Comma, times: int):
- return Comma(Repeat(x.expr[0], times), Repeat(x.expr[1], times))
-
-# ------------------------------------------
-# step: indexes an expression by i
-
-@singledispatch
-def Step(x: Expr, i: int) -> Expr:
- raise TypeError(f"{type(x)} not supported by Step operation")
-
-@Step.register
-def _(x: Comma, i: int):
- return Comma(Step(x.expr[0], i), Step(x.expr[1], i))
-
-@Step.register
-def _(x: Assign, i: int):
- return type(x)(Step(x.lhs, i), Step(x.rhs, i))
-
-@Step.register
-def _(x: BinaryOp, i: int):
- return type(x)(Step(x.l, i), Step(x.r, i))
-
-@Step.register
-def _(x: UnaryOp, i: int):
- return type(x)(Step(x.x, i))
-
-@Step.register
-def _(x: Ident, i: int):
- return x
-
-@Step.register
-def _(x: Deref, i: int):
- return Index(x.x, I(i))
-
-@Step.register
-def _(ix: Index, i: int):
- return Index(ix.x, Add(ix.i, I(i)))
-
-# ------------------------------------------
-# bfs search on statements in ast
-
-@singledispatch
-def Visit(s: Stmt, func):
- raise TypeError(f"{type(s)} not supported by Visit operation")
-
-@Visit.register
-def _(s: Empty, func):
- return
-
-@Visit.register
-def _(blk: Block, func):
- for stmt in blk.stmts:
- Visit(stmt, func)
-
-@Visit.register
-def _(jmp: If, func):
- func(jmp.cond)
- Visit(jmp.then, func)
- if jmp.orelse is not None:
- Visit(jmp.orelse, func)
-
-@Visit.register
-def _(loop: For, func):
- func(loop.init)
- func(loop.cond)
- func(loop.step)
- Visit(loop.body, func)
-
-@Visit.register
-def _(ret: Return, func):
- func(ret.val)
-
-@Visit.register
-def _(x: StmtExpr, func):
- func(x.x)
-
-# ------------------------------------------
-# recreates a piece of an AST, allowing an arbitrary transformation of expression nodes
-
-@singledispatch
-def Make(s: Stmt, func):
- raise TypeError(f"{type(s)} not supported by Make operation")
-
-@Make.register
-def _(s: Empty, func):
- return Empty()
-
-@Make.register
-def _(blk: Block, func):
- return Block(*(Make(stmt, func) for stmt in blk.stmts))
-
-@Make.register
-def _(loop: For, func):
- return For(func(loop.init), func(loop.cond), func(loop.step), Make(loop.body, func))
-
-@Make.register
-def _(jmp: If, func):
- if jmp.orelse is not None:
- return If(func(jmp.cond), Make(jmp.then, func), Make(jmp.orelse, func))
- else:
- return If(func(jmp.cond), Make(jmp.then, func))
-
-@Make.register
-def _(ret: Return, func):
- return Return(func(ret.val))
-
-@Make.register
-def _(x: StmtExpr, func):
- return StmtExpr(func(x.x))
-
-# ------------------------------------------
-# GetType function
-
-@singledispatch
-def GetType(x: Expr) -> Type:
- if x is None:
- return
- raise TypeError(f"{type(x)} not supported by GetType operation")
-
-@GetType.register
-def _(x: Empty):
- return Void
-
-@GetType.register
-def _(x: Empty):
- return Void
-
-@GetType.register
-def _(x: S):
- return Ptr(Byte)
-
-@GetType.register
-def _(x: I):
- return Int
-
-@GetType.register
-def _(x: F):
- return Float64
-
-@GetType.register
-def _(x: Ident):
- return x.var.type
-
-@GetType.register
-def _(x: UnaryOp):
- return GetType(x.x)
-
-@GetType.register
-def _(x: Deref):
- return GetType(x.x).to
-
-@GetType.register
-def _(x: Ref):
- return Ptr(GetType(x.x))
-
-@GetType.register
-def _(x: Index):
- base = GetType(x.x)
- if isinstance(base, Ptr):
- return base.to
- elif isinstance(base, Array):
- return base.base
- else:
- pass
- # raise TypeError(f"attempting to index type {base} of node {x.x}")
-
-# TODO: type checking for both
-
-@GetType.register
-def _(x: BinaryOp):
- lhs = GetType(x.l)
- rhs = GetType(x.r)
- return lhs
-
-@GetType.register
-def _(x: Var):
- return x.type
-
-@GetType.register
-def _(x: Assign):
- lhs = GetType(x.lhs)
- rhs = GetType(x.rhs)
- return lhs
-
-@GetType.register
-def _(x: Comma):
- return GetType(x.expr[1])
-
-@GetType.register
-def _(x: Paren):
- return GetType(x.x)
-
-@GetType.register
-def _(x: Call):
- return x.func.ret
-
-# ------------------------------------------
-# Transform function
-# Recurses down AST, and modifies Expr nodes
-# Returns a brand new tree structure
-
-@singledispatch
-def Transform(x: Expr, func: Callable[Expr, Expr]):
- raise TypeError(f"{type(x)} not supported by Transform operation")
-
-@Transform.register
-def _(x: Empty, func):
- return func(Empty())
-
-@Transform.register
-def _(x: Literal, func):
- return func(type(x)(x))
-
-@Transform.register
-def _(x: Ident, func):
- return func(Ident(x.var))
-
-@Transform.register
-def _(x: UnaryOp, func):
- return func(type(x)(Transform(x.x, func)))
-
-@Transform.register
-def _(x: Assign, func):
- return func(type(x)(Transform(x.lhs, func), Transform(x.rhs, func)))
-
-@Transform.register
-def _(x: Deref, func):
- return func(Deref(Transform(x.x, func)))
-
-@Transform.register
-def _(x: Index, func):
- return func(Index(Transform(x.x, func), Transform(x.i, func)))
-
-@Transform.register
-def _(x: Comma, func):
- return func(Comma(Transform(x.expr[0], func), Transform(x.expr[1], func)))
-
-@Transform.register
-def _(x: BinaryOp, func):
- return func(type(x)(Transform(x.l, func), Transform(x.r, func)))
-
-# ------------------------------------------
-# Filter function
-# Recurses down AST, and stores Expr nodes that satisfy condition in results
-
-@singledispatch
-def Filter(x: Expr, cond, results: List[Expr]):
- raise TypeError(f"{type(x)} not supported by Filter operation")
-
-@Filter.register
-def _(x: Empty, cond, results: List[Expr]):
- if cond(x):
- results.append(x)
-
-@Filter.register(Ident)
-@Filter.register(Literal)
-def _(x, cond, results: List[Expr]):
- if cond(x):
- results.append(x)
-
-@Filter.register
-def _(x: UnaryOp, cond, results: List[Expr]):
- if cond(x):
- results.append(x)
- Filter(x.x, cond, results)
-
-@Filter.register
-def _(s: Assign, cond, results: List[Expr]):
- if cond(s):
- results.append(s)
- Filter(s.lhs, cond, results)
- Filter(s.rhs, cond, results)
-
-@Filter.register
-def _(v: Deref, cond, results: List[Expr]):
- if cond(v):
- results.append(v)
- Filter(v.x, cond, results)
-
-@Filter.register
-def _(i: Index, cond, results: List[Expr]):
- if cond(i):
- results.append(i)
- Filter(i.x, cond, results)
- Filter(i.i, cond, results)
-
-@Filter.register
-def _(comma: Comma, cond, results: List[Expr]):
- if cond(comma):
- results.append(comma)
- Filter(comma.expr[0], cond, results)
- Filter(comma.expr[1], cond, results)
-
-@Filter.register
-def _(op: BinaryOp, cond, results: List[Expr]):
- if cond(op):
- results.append(op)
- Filter(op.l, cond, results)
- Filter(op.r, cond, results)
-
-# ------------------------------------------
-# Eval function
-# Recurses down AST, and evaluates Expr nodes
-# Throws an error if tree can't be evaluated at compile time
-
-@singledispatch
-def Eval(x: Expr) -> object:
- raise TypeError(f"{type(x)} not supported by Eval operation")
-
-@Eval.register
-def _(x: Empty):
- pass
-
-@Eval.register(Ident)
-@Eval.register(S)
-def _(x):
- return sympy.symbols(f"{x}")
-
-@Eval.register(float)
-@Eval.register(int)
-@Eval.register(I)
-@Eval.register(F)
-def _(x):
- return x
-
-@Eval.register
-def _(x: Inc):
- return Eval(Add(Eval(x.x), I(1)))
-
-@Eval.register
-def _(x: Dec):
- return Eval(Sub(Eval(x.x), I(1)))
-
-# TODO: This won't work in general (if we have things like sizeof(x) + 1 - sizeof(x))
-@Eval.register
-def _(op: Add):
- l = Eval(op.l)
- r = Eval(op.r)
- return sympy.simplify(l + r)
-
-@Eval.register
-def _(op: Sub):
- l = Eval(op.l)
- r = Eval(op.r)
- return sympy.simplify(l - r)
-
-@Eval.register
-def _(op: Mul):
- l = Eval(op.l)
- r = Eval(op.r)
- return sympy.simplify(l * r)
-
-@Eval.register
-def _(op: Div):
- l = Eval(op.l)
- r = Eval(op.r)
- return sympy.simplify(l / r)
-
-@Eval.register
-def _(s: Assign):
- return Eval(s.rhs)
-
-@Eval.register
-def _(comma: Comma):
- return Eval(comma.expr[1])
-
-# ------------------------------------------
-# Leaf traversal
-
-def VarsUsed(stmt: Stmt) -> List[Var]:
- vars = []
- Visit(stmt, lambda node: Filter(node, lambda x: isinstance(x, Ident), vars))
- vars = set([v.var for v in vars])
-
- return vars
-
-def AddrAccessed(stmt: Stmt):
- scalars = [] # variables that are accessed as scalars (single sites)
- vectors = {} # variables that are accessed as vectors (indexed/dereferenced)
- nodes = []
- Visit(stmt, lambda node: Filter(node, lambda x: isinstance(x, Ident), scalars))
- Visit(stmt, lambda node: Filter(node, lambda x: isinstance(x, Index), nodes))
-
- for node in nodes:
- vars = []
- Filter(node.x, lambda x: isinstance(x, Ident), vars)
- if numel(vars) != 1:
- raise ValueError("multiple variables used in index expression not supported")
- vectors[vars[0]] = node.i
- if vars[0] in scalars:
- scalars.remove(vars[0])
-
- return set(scalars), vectors
-
-# ------------------------------------------
-# Large scale functions
-
-def Unroll(name: str, loop: For, times: int, ret: Ident = None, accumulator = None) -> (Func, List[Stmt]):
- # TODO: More sophisticated computation for length of loop
- if not isinstance(loop.cond, LE) and not isinstance(loop.cond, LT):
- raise TypeError(f"{type(loop.cond)} not supported in loop unrolling")
-
- # pull off needed features of the loop
- i = loop.init.lhs.var
- vars = VarsUsed(loop.body)
-
- def asvector(v):
- if v != i:
- return Var(Array(v.type, times), v.name, v.storage)
- else:
- return Var.copy(v)
-
- n = loop.cond.r.var
- param = [Param(Ptr(i.type), f"{i.name}p")] + [Param.copy(v) for v in vars if (type(v) == Param and v != n)]
- stack = {v.name: asvector(v) for v in vars if type(v) == Var}
-
- # TODO: More sophisticated type checking
- if (type(n) != Param):
- raise TypeError(f"{type(n)} not implemented yet")
-
- if ret is None:
- kernel = Func(f"{name}_kernel{times}", Void, param, list(stack.values()))
- else:
- kernel = Func(f"{name}_kernel{times}", ret.var.type, param, list(stack.values()))
-
- len = kernel.declare(n)
-
- body = loop.body
- itor, itorp = kernel.variables("i", "ip")
-
- def mkarray(x: Expr, times: int):
- if isinstance(x, Ident):
- if type(x.var) == Var:
- x.var = stack[x.name]
- return x
-
- def step(x: Expr, times: int):
- if x == itor:
- return Add(x, I(times))
- if isinstance(x, Ident):
- if type(x.var) == Var:
- return Index(x, I(times))
- if not isinstance(x, Expr):
- raise ValueError(f"panic, hit type {type(x)}")
-
- return x
-
- expandedloop = Make(loop.body, lambda node: Transform(node, lambda x: mkarray(x, times)))
- kernel.execute(
- Set(len, EvenTo(Deref(itorp), times)),
- For(Set(itor, I(0)), LT(itor, len), Repeat(loop.step, times),
- body = Block(*
- (Make(expandedloop, lambda node:
- Transform(node, lambda x: step(x, i))) for i in range(times)
- )
- )
- ),
- Set(Deref(itorp), itor)
- )
-
- if ret is not None:
- k = kernel.variables(ret.name)
- if k.var.type != ret.var.type:
- if accumulator is None:
- raise ValueError("If loop returns a value, an accumulator must be given")
- else:
- accumulator = For(Set(itor, I(1)), LT(itor, I(times)), Inc(itor),
- body = accumulator(I(0), itor, k)
- )
- kernel.execute(accumulator)
-
- kernel.execute(Return(Index(ret, I(0))))
-
- loop.init = None
- if ret is None:
- return kernel, [Set(itor, len), Call(kernel, [Ref(itor)] + param[1:]), loop]
- else:
- return kernel, [Set(itor, len), Set(ret, Call(kernel, [Ref(itor)] + param[1:])), loop]
-
-# Replaces all vectorizable loops inside Func with vectorized variants
-# Returns the new vectorized function (tagged by the SIMD chosen)
-
-class SymTab(object):
- def __init__(self):
- self.stack = {}
- self.addrs = {}
-
- def have(self, name):
- return name in self.stack or name in self.addrs
-
-def Vectorize(func: Func, isa: SIMD) -> Func:
- if isa != SIMD.AVX2:
- raise ValueError(f"ISA '{isa}' not currently implemented")
-
- vfunc = Func(f"{func.name}_{isa.value}", func.ret, func.params)
- for stmt in func.stmts:
- if IsLoop(stmt):
- loop = For(stmt.init, stmt.cond, stmt.step)
- iterator = set([stmt.init.lhs])
-
- body = stmt.body
- if type(body) != Block:
- # TODO: Think through this carefully...
- # This is coded for the accumulation step at the end of a kernel
- loop.cond.r = I(Eval(Div(loop.cond.r, 4)))
- loop.body = body
- else:
- instr = body.stmts
- # TODO: Remove hardcoded 4 -> should be function of types!
- # As of now, we have hardcoded the AVX2 <-> float64 relationship
- if numel(instr) % 4 != 0:
- raise ValueError("loop can not be vectorized, instructions can not be globbed equally")
-
- # TODO: Allow for non-sequential accesses?
- for i in range(0, numel(instr), 4):
- scalars = []
- vectors = []
- for j in range(4):
- s, v = AddrAccessed(instr[i+j])
- s -= iterator # TODO: This is hacky
- scalars.append(frozenset(s))
- vectors.append(v)
-
- # Test if code in uniform to allow for vectorization
- if numel(set(scalars)) != 1:
- raise ValueError("non uniform scalar accesses in consecutive line. can not vectorize")
-
- for j in range(1, 4):
- for v, idx in vectors[j].items():
- if (delta := (Eval(Sub(idx, vectors[j-1][v])))) != 1:
- print(f"{delta}")
- raise ValueError("non uniform vector accesses in consecutive line. can not vectorize")
-
- # If we made it to here, we have passed all checks. vectorize!
- vecs = vectors[0]
- if i == 0:
- syms = SymTab()
- for s in scalars[0]:
- intermediate = vfunc.declare(Var(Float64x2, f"{s.name}128"))
- syms.stack[s.name] = vfunc.declare(Var(Float64x4, f"{s.name}256"))
- vfunc.execute(Set(intermediate, Ref(s)))
- vfunc.execute(Set(syms.stack[s.name], intermediate))
-
- for v in vecs.keys():
- # All params are treated AS addresses to load into vectorized registers
- if type(v.var) == Param:
- syms.addrs[v.name] = Var(Ptr(Float64x4), f"{v.name}256")
- # All stack declared parameters MUST be moved to vectorized registers
- else:
- assert IsArrayType(v.var.type), f"must be an array type, instead got {v.var.type}"
- nreg = v.var.type.len // 4
- if nreg > 1:
- syms.stack[v.name] = vfunc.declare(Var(Array(Float64x4, nreg), f"{v.name}256"))
- else:
- syms.stack[v.name] = vfunc.declare(Var(Float64x4, f"{v.name}256"))
-
-
- # IMPORTANT: We do a post-order traversal.
- # We transform leaves (identifiers) first and then move back up to the root
- def translate(x):
- if isinstance(x, Ident):
- if x.name in syms.stack:
- return syms.stack[x.name]
- elif x.name in syms.addrs:
- x.var = syms.addrs[x.name]
- if isinstance(x, Index):
- if type(x.x) == Ident:
- if x.x.name in syms.addrs:
- return Add(x.x, x.i)
- elif f"{x.x.name[:-3]}" in syms.stack: #NOTE: This is hacky. Think of something better
- return Index(x.x, I(Eval(Div(x.i, 4))))
- if isinstance(x, BinaryOp):
- l, r = GetType(x.l), GetType(x.r)
- if IsVectorType(l) or IsVectorType(r):
- if type(l) == Ptr:
- x.l = Deref(x.l)
- if type(r) == Ptr:
- x.r = Deref(x.r)
-
- return x
-
- loop.execute(Make(instr[i], lambda node: Transform(node, translate)))
-
- vfunc.execute(loop)
- else:
- vfunc.execute(stmt)
-
- return vfunc
-
-
-def Strided(func: Func) -> Func:
- pass
-
-# ------------------------------------------------------------------------
-# Point of testing
-
-def copy():
- F = Func("blas·copy", Void,
- Params(
- (Int, "len"), (Ptr(Float64), "x"), (Ptr(Float64), "y")
- ),
- Vars(
- (Int, "i"), (Float64, "tmp")
- )
- )
- # could also declare like
- # F.declare( ... )
-
- len, x, y, i, tmp = F.variables("len", "x", "y", "i", "tmp")
-
- loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
- loop.execute(
- Swap(Index(x, i), Index(y, i), tmp)
- )
-
- kernel, calls = Unroll("copy", loop, 8)
- kernel.emit()
-
- avx256kernel = Vectorize(kernel, SIMD.AVX2)
- avx256kernel.emit()
-
- F.execute(*calls)
- F.emit()
-
-def axpby():
- F = Func("blas·axpby", Void,
- Params(
- (Int, "len"), (Float64, "a"), (Ptr(Float64), "x"), (Float64, "b"), (Ptr(Float64), "y")
- ),
- Vars(
- (Int, "i"), #(Float64, "tmp")
- )
- )
- # could also declare like
- # F.declare( ... )
-
- # TODO: Increase ergonomics here...
- len, x, y, i, a, b = F.variables("len", "x", "y", "i", "a", "b")
-
- loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
- loop.execute(
- Set(Index(y, i),
- Mul(b,
- Add(Index(y, i),
- Mul(a, Index(x, i)),
- )
- )
- )
- )
-
- kernel, calls = Unroll("axpby", loop, 8)
- kernel.emit()
-
- avx256kernel = Vectorize(kernel, SIMD.AVX2)
- avx256kernel.emit()
-
- F.execute(*calls)
- F.emit()
-
-def argmax():
- F = Func("blas·argmax", Int,
- Params(
- (Int, "len"), (Ptr(Float64), "x"),
- ),
- Vars(
- (Int, "i"), (Int, "ix"), (Float64, "max")
- )
- )
- len, x, i, ix, max = F.variables("len", "x", "i", "ix", "max")
-
- loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
- loop.execute(
- If(GT(Index(x, i), max),
- Block(
- Set(ix, i),
- Set(max, Index(x, ix)),
- )
- )
- )
- kernel, calls = Unroll("argmax", loop, 8, ix,
- lambda ires, icur, node:
- If(GT(Index(x, Index(node, icur)), Index(x, Index(node, ires))),
- Block(Set(Index(node, ires), Index(node, icur)))
- )
- )
- kernel.emit()
-
- # avx256kernel = Vectorize(kernel, SIMD.AVX2)
- # avx256kernel.emit()
-
- F.execute(*calls, Return(ix))
-
- F.execute(loop)
- F.emit()
-
-def dot():
- F = Func("blas·dot", Float64,
- Params(
- (Int, "len"), (Ptr(Float64), "x"), (Ptr(Float64), "y")
- ),
- Vars(
- (Int, "i"), (Float64, "sum"),
- )
- )
- len, x, i, y, sum = F.variables("len", "x", "i", "y", "sum")
-
- loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
- loop.execute(
- AddSet(sum, Mul(Index(x, i), Index(y, i)))
- )
-
- kernel, calls = Unroll("dot", loop, 16, sum,
- lambda ires, icur, node:
- StmtExpr(AddSet(Index(node, ires), Index(node, icur)))
- )
- kernel.emit()
-
- avx256kernel = Vectorize(kernel, SIMD.AVX2)
- avx256kernel.emit()
-
- F.execute(*calls, Return(sum))
-
- F.emit()
-
-def gemv():
- F = Func("blas·gemv", Void,
- Params(
- (Int, "nrow"), (Int, "ncol"), (Float64, "a"), (Ptr(Float64), "m"), (Int, "incm"),
- (Ptr(Float64), "x"), (Float64, "b"), (Ptr(Float64), "y")
- ),
- Vars(
- (Int, "r"), (Int, "c"), (Ptr(Float64), "row"), (Float64, "res")
- )
- )
- r, c, row, res = F.variables("r", "c", "row", "res")
- nrow, ncol, a, m, incm, x, b, y = F.variables("nrow", "ncol", "a", "m", "incm", "x", "b", "y")
- loop = For(Set(r, I(0)), LT(r, nrow), Inc(r),
- Block(
- Set(row, Add(m, incm)),
- Set(res, I(0)),
- For(Set(c, I(0)), LT(c, ncol), Inc(c),
- AddSet(res, Mul(Index(row, c), Index(x, c)))
- ),
- Set(Index(y, r), Add(Mul(a, res), Mul(b, Index(y, r))))
- )
- )
-
- F.execute(loop)
- F.emit()
-
-
-if __name__ == "__main__":
- emitheader()
-
- gemv()
- # dot()
- # argmax()
- # copy()
- # axpby()
-
- print(buffer)
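
Both the generator deleted here and its replacement, sys/libmath/gen1.py (added by this commit), emit every level-1 routine in the same shape: a kernel rounds its trip count down to a multiple of the unroll factor (len & ~(times-1), the EvenTo helper above), runs the unrolled body, and reports how far it got through the int *ip in/out parameter; a thin driver then finishes the scalar remainder. A condensed, hand-written sketch of that shape with hypothetical names — the real generated output is in blas1.c below:

static void
scale_kernel4(int *ip, double *x, double a)
{
	int i, len;

	len = *ip & ~3;	/* EvenTo: largest multiple of 4 <= *ip */
	for (i = 0; i < len; i += 4) {
		x[i+0] = a * x[i+0];
		x[i+1] = a * x[i+1];
		x[i+2] = a * x[i+2];
		x[i+3] = a * x[i+3];
	}
	*ip = i;	/* tell the driver where the kernel stopped */
}

void
scale(int len, double *x, double a)
{
	int i;

	i = len;
	scale_kernel4(&i, x, a);	/* unrolled bulk    */
	for (; i < len; i++)    	/* scalar remainder */
		x[i] = a * x[i];
}
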
diff --git a/rules.mk b/rules.mk
index 7fc6a1f..ce74da2 100644
--- a/rules.mk
+++ b/rules.mk
@@ -8,7 +8,7 @@ all: targets
debug: CFLAGS += -DDEBUG
debug: targets
-release: CFLAGS += -DNDEBUG -O3
+release: CFLAGS += -O3 -flto #-DNDEBUG
release: targets
# Targets & array of sources & intermediates
@@ -25,19 +25,24 @@ include $(DIR)/rules.mk
# Generic rules
%.a: %.o
- $(ARCHIVE)
+ @echo AR $@ $^
+ @$(ARCHIVE)
$(OBJ_DIR)/%.o: $(SRC_DIR)/%.c
- $(COMPILE)
+ @echo CC $^
+ @$(COMPILE)
$(OBJ_DIR)/%.o: $(SRC_DIR)/%.s
- $(ASSEMBLE)
+ @echo AS $^
+ @$(ASSEMBLE)
%: %.o
- $(LINK)
+ @echo CC $^
+ @$(LINK)
$(OBJ_DIR)/%: $(SRC_DIR)/%.c
- $(COMPLNK)
+ @echo CC $^
+ @$(COMPLNK)
.PHONY: targets
targets: $(LIBS) $(BINS)
diff --git a/sys/libbio/rules.mk b/sys/libbio/rules.mk
index 0e3d482..ff71746 100644
--- a/sys/libbio/rules.mk
+++ b/sys/libbio/rules.mk
@@ -31,10 +31,12 @@ BINS := $(BINS) $(BINS_$(d))
# $(LIBS_$(d)) = TCLIBS :=
$(LIBS_$(d)): $(OBJS_$(d)) $(OBJS_$(d)/io)
- $(ARCHIVE)
+ @echo LIB $@
+ @$(ARCHIVE)
$(BINS_$(d)): $(LIBS_$(d)) $(OBJ_DIR)/libn/libn.a
- $(LINK)
+ @echo BIN $@
+ @$(LINK)
# ---- Pop off stack ----
-include $(DEPS_$(d))
diff --git a/sys/libmath/blas.c b/sys/libmath/blas.c
index f6eb830..f5392c8 100644
--- a/sys/libmath/blas.c
+++ b/sys/libmath/blas.c
@@ -1,132 +1,53 @@
#include <u.h>
#include <libn.h>
+#include <libmath.h>
+#include <libmath/blas.h>
#include <time.h>
-#include <vendor/blas/cblas.h>
-
-int
-argmax2(int len, double *x)
-{
- int i, ix[8];
- double *end;
- double max[8];
-
- max[0] = x[0]; max[1] = x[1]; max[2] = x[2]; max[3] = x[3];
- max[4] = x[4]; max[5] = x[5]; max[6] = x[6]; max[7] = x[7];
- for (i = 0; i < len; i+=8) {
- if (x[i+0] > max[0]) {
- max[0] = *x;
- ix[0] = i;
- }
- if (x[i+1] > max[1]) {
- max[1] = *x;
- ix[1] = i;
- }
- if (x[i+2] > max[2]) {
- max[2] = *x;
- ix[2] = i;
- }
- if (x[i+3] > max[3]) {
- max[3] = *x;
- ix[3] = i;
- }
- if (x[i+4] > max[4]) {
- max[4] = *x;
- ix[4] = i;
- }
- if (x[i+5] > max[5]) {
- max[5] = *x;
- ix[5] = i;
- }
- if (x[i+6] > max[6]) {
- max[6] = *x;
- ix[6] = i;
- }
- if (x[i+7] > max[7]) {
- max[7] = *x;
- ix[7] = i;
- }
- }
- for (i = 1; i < 8; i++) {
- if (max[i] > max[0]) {
- max[0] = max[i];
- ix[0] = ix[i] + i;
- }
- }
-
- return ix[0];
-}
-
-int
-argmax(int len, double *x)
-{
- int i, ix;
- double *end;
- double max;
-
- max = *x;
- for (i = 0; i < len; i++) {
- if (x[i] > max) {
- max = *x;
- ix = i;
- }
- }
-
- return ix;
-}
+#include <vendor/blas/cblas.h>
-#define LEN 1000000
+#define LEN 2000000
#define NIT 2000
+#define INC 2
error
main()
{
int i, nit;
- double *x, *y, res[3];
+ double *x, *y, res[2];
clock_t t;
- double tprof[3] = { 0 };
+ double tprof[2] = { 0 };
rng·init(0);
x = malloc(sizeof(*x)*LEN);
- // y = malloc(sizeof(*x)*LEN);
-
-#define DO_0 t = clock(); \
- res[0] += argmax(LEN, x); \
- t = clock() - t; \
- tprof[0] += 1000.*t/CLOCKS_PER_SEC; \
+ y = malloc(sizeof(*x)*LEN);
-#define DO_1 t = clock(); \
- res[1] += argmax2(LEN, x); \
- t = clock() - t; \
- tprof[1] += 1000.*t/CLOCKS_PER_SEC; \
+#define DO_0 t = clock(); \
+ res[0] += blas·sumd(LEN/INC, x, INC); \
+ t = clock() - t; \
+ tprof[0] += 1000.*t/CLOCKS_PER_SEC; \
-#define DO_2 t = clock(); \
- res[2] += cblas_idamax(LEN, x, 1); \
- t = clock() - t; \
- tprof[2] += 1000.*t/CLOCKS_PER_SEC;
+#define DO_1 t = clock(); \
+ res[1] += cblas_dsum(LEN/INC, x, INC); \
+ t = clock() - t; \
+ tprof[1] += 1000.*t/CLOCKS_PER_SEC;
for (nit = 0; nit < NIT; nit++) {
for (i = 0; i < LEN; i++) {
x[i] = rng·random();
- // y[i] = rng·random();
+ y[i] = rng·random();
}
- switch (nit % 6) {
- case 0: DO_0; DO_1; DO_2; break;
- case 1: DO_0; DO_2; DO_1; break;
- case 2: DO_1; DO_0; DO_2; break;
- case 3: DO_1; DO_2; DO_0; break;
- case 4: DO_2; DO_0; DO_1; break;
- case 5: DO_2; DO_1; DO_0; break;
+ switch (nit % 2) {
+ case 0: DO_0; DO_1; break;
+ case 1: DO_1; DO_0; break;
}
}
- printf("mean time/iteration (naive): %fms\n", tprof[0]/NIT);
- printf("--> result (naive): %f\n", res[0]);
- printf("mean time/iteration (unrolled): %fms\n", tprof[1]/NIT);
- printf("--> result (unrolled): %f\n", res[1]);
- printf("mean time/iteration (openblas): %fms\n", tprof[2]/NIT);
- printf("--> result (openblas): %f\n", res[2]);
+ printf("mean time/iteration (mine): %fms\n", tprof[0]/NIT);
+ printf("--> result (mine): %f\n", res[0]);
+ printf("mean time/iteration (openblas): %fms\n", tprof[1]/NIT);
+ printf("--> result (openblas): %f\n", res[1]);
return 0;
}
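
The harness above only cross-checks blas·sumd against cblas_dsum through the accumulated res[] values. A hedged sketch of a direct sanity check one could run before timing — refsumd is a hypothetical reference, not part of this commit, and the tolerance is loose because the unrolled kernel accumulates in a different order:

#include <math.h>

/* straightforward strided sum, used only to validate the fast path */
static double
refsumd(int len, double *x, int inc)
{
	int i;
	double s;

	s = 0;
	for (i = 0; i < len; i++)
		s += x[i*inc];
	return s;
}

/* e.g. inside main(), after filling x:
 *	double want = refsumd(LEN/INC, x, INC);
 *	double got  = blas·sumd(LEN/INC, x, INC);
 *	if (fabs(got - want) > 1e-8 * fabs(want))
 *		printf("sumd mismatch: %f vs %f\n", got, want);
 */
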
diff --git a/sys/libmath/blas1.c b/sys/libmath/blas1.c
new file mode 100644
index 0000000..8cc70eb
--- /dev/null
+++ b/sys/libmath/blas1.c
@@ -0,0 +1,2062 @@
+#include <u.h>
+#include <libn.h>
+#include <libmath.h>
+
+/*********************************************************/
+/* THIS CODE IS GENERATED BY GEN1.PY! DON'T EDIT BY HAND */
+/*********************************************************/
+
+
+static
+int
+argminf_kernel16(int *ip, float *x)
+{
+ int ix[16];
+ float min[16];
+ int i;
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ ix[i] = 0;
+ }
+ for (i = 0; i < 16; i++) {
+ min[i] = x[0];
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ if (x[i + 0] < min[0]) {
+ ix[0] = (i + 0);
+ min[0] = x[ix[0]];
+ }
+ if (x[i + 1] < min[1]) {
+ ix[1] = (i + 1);
+ min[1] = x[ix[1]];
+ }
+ if (x[i + 2] < min[2]) {
+ ix[2] = (i + 2);
+ min[2] = x[ix[2]];
+ }
+ if (x[i + 3] < min[3]) {
+ ix[3] = (i + 3);
+ min[3] = x[ix[3]];
+ }
+ if (x[i + 4] < min[4]) {
+ ix[4] = (i + 4);
+ min[4] = x[ix[4]];
+ }
+ if (x[i + 5] < min[5]) {
+ ix[5] = (i + 5);
+ min[5] = x[ix[5]];
+ }
+ if (x[i + 6] < min[6]) {
+ ix[6] = (i + 6);
+ min[6] = x[ix[6]];
+ }
+ if (x[i + 7] < min[7]) {
+ ix[7] = (i + 7);
+ min[7] = x[ix[7]];
+ }
+ if (x[i + 8] < min[8]) {
+ ix[8] = (i + 8);
+ min[8] = x[ix[8]];
+ }
+ if (x[i + 9] < min[9]) {
+ ix[9] = (i + 9);
+ min[9] = x[ix[9]];
+ }
+ if (x[i + 10] < min[10]) {
+ ix[10] = (i + 10);
+ min[10] = x[ix[10]];
+ }
+ if (x[i + 11] < min[11]) {
+ ix[11] = (i + 11);
+ min[11] = x[ix[11]];
+ }
+ if (x[i + 12] < min[12]) {
+ ix[12] = (i + 12);
+ min[12] = x[ix[12]];
+ }
+ if (x[i + 13] < min[13]) {
+ ix[13] = (i + 13);
+ min[13] = x[ix[13]];
+ }
+ if (x[i + 14] < min[14]) {
+ ix[14] = (i + 14);
+ min[14] = x[ix[14]];
+ }
+ if (x[i + 15] < min[15]) {
+ ix[15] = (i + 15);
+ min[15] = x[ix[15]];
+ }
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ if (x[ix[i]] < x[ix[0]]) {
+ ix[0] = ix[i];
+ }
+ }
+
+ return ix[0];
+}
+
+static
+int
+argminf_s_kernel8(int *ip, int incx, float *x)
+{
+ int ix[8];
+ float min[8];
+ int i;
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ ix[i] = 0;
+ }
+ for (i = 0; i < 8; i++) {
+ min[i] = x[0];
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ if (x[incx * (i + 0)] < min[0]) {
+ ix[0] = (i + 0);
+ min[0] = x[incx * ix[0]];
+ }
+ if (x[incx * (i + 1)] < min[1]) {
+ ix[1] = (i + 1);
+ min[1] = x[incx * ix[1]];
+ }
+ if (x[incx * (i + 2)] < min[2]) {
+ ix[2] = (i + 2);
+ min[2] = x[incx * ix[2]];
+ }
+ if (x[incx * (i + 3)] < min[3]) {
+ ix[3] = (i + 3);
+ min[3] = x[incx * ix[3]];
+ }
+ if (x[incx * (i + 4)] < min[4]) {
+ ix[4] = (i + 4);
+ min[4] = x[incx * ix[4]];
+ }
+ if (x[incx * (i + 5)] < min[5]) {
+ ix[5] = (i + 5);
+ min[5] = x[incx * ix[5]];
+ }
+ if (x[incx * (i + 6)] < min[6]) {
+ ix[6] = (i + 6);
+ min[6] = x[incx * ix[6]];
+ }
+ if (x[incx * (i + 7)] < min[7]) {
+ ix[7] = (i + 7);
+ min[7] = x[incx * ix[7]];
+ }
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ if (x[incx * ix[i]] < x[incx * ix[0]]) {
+ ix[0] = ix[i];
+ }
+ }
+
+ return ix[0];
+}
+
+int
+blas·argminf(int len, float *x, int incx)
+{
+ int i;
+ int ix;
+ float min;
+
+ if (incx == 1) {
+ i = len;
+ ix = argminf_kernel16(&i, x);
+ } else {
+ i = len;
+ ix = argminf_s_kernel8(&i, incx, x);
+ }
+ min = x[incx * ix];
+ for (; i < len; i++) {
+ if (x[incx * i] < min) {
+ ix = i;
+ min = x[incx * ix];
+ }
+ }
+
+ return ix;
+}
+
+static
+int
+argmaxf_kernel16(int *ip, float *x)
+{
+ float max[16];
+ int i;
+ int ix[16];
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ max[i] = x[0];
+ }
+ for (i = 0; i < 16; i++) {
+ ix[i] = 0;
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ if (x[i + 0] > max[0]) {
+ ix[0] = (i + 0);
+ max[0] = x[ix[0]];
+ }
+ if (x[i + 1] > max[1]) {
+ ix[1] = (i + 1);
+ max[1] = x[ix[1]];
+ }
+ if (x[i + 2] > max[2]) {
+ ix[2] = (i + 2);
+ max[2] = x[ix[2]];
+ }
+ if (x[i + 3] > max[3]) {
+ ix[3] = (i + 3);
+ max[3] = x[ix[3]];
+ }
+ if (x[i + 4] > max[4]) {
+ ix[4] = (i + 4);
+ max[4] = x[ix[4]];
+ }
+ if (x[i + 5] > max[5]) {
+ ix[5] = (i + 5);
+ max[5] = x[ix[5]];
+ }
+ if (x[i + 6] > max[6]) {
+ ix[6] = (i + 6);
+ max[6] = x[ix[6]];
+ }
+ if (x[i + 7] > max[7]) {
+ ix[7] = (i + 7);
+ max[7] = x[ix[7]];
+ }
+ if (x[i + 8] > max[8]) {
+ ix[8] = (i + 8);
+ max[8] = x[ix[8]];
+ }
+ if (x[i + 9] > max[9]) {
+ ix[9] = (i + 9);
+ max[9] = x[ix[9]];
+ }
+ if (x[i + 10] > max[10]) {
+ ix[10] = (i + 10);
+ max[10] = x[ix[10]];
+ }
+ if (x[i + 11] > max[11]) {
+ ix[11] = (i + 11);
+ max[11] = x[ix[11]];
+ }
+ if (x[i + 12] > max[12]) {
+ ix[12] = (i + 12);
+ max[12] = x[ix[12]];
+ }
+ if (x[i + 13] > max[13]) {
+ ix[13] = (i + 13);
+ max[13] = x[ix[13]];
+ }
+ if (x[i + 14] > max[14]) {
+ ix[14] = (i + 14);
+ max[14] = x[ix[14]];
+ }
+ if (x[i + 15] > max[15]) {
+ ix[15] = (i + 15);
+ max[15] = x[ix[15]];
+ }
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ if (x[ix[i]] > x[ix[0]]) {
+ ix[0] = ix[i];
+ }
+ }
+
+ return ix[0];
+}
+
+static
+int
+argmaxf_s_kernel8(int *ip, int incx, float *x)
+{
+ float max[8];
+ int i;
+ int ix[8];
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ max[i] = x[0];
+ }
+ for (i = 0; i < 8; i++) {
+ ix[i] = 0;
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ if (x[incx * (i + 0)] > max[0]) {
+ ix[0] = (i + 0);
+ max[0] = x[incx * ix[0]];
+ }
+ if (x[incx * (i + 1)] > max[1]) {
+ ix[1] = (i + 1);
+ max[1] = x[incx * ix[1]];
+ }
+ if (x[incx * (i + 2)] > max[2]) {
+ ix[2] = (i + 2);
+ max[2] = x[incx * ix[2]];
+ }
+ if (x[incx * (i + 3)] > max[3]) {
+ ix[3] = (i + 3);
+ max[3] = x[incx * ix[3]];
+ }
+ if (x[incx * (i + 4)] > max[4]) {
+ ix[4] = (i + 4);
+ max[4] = x[incx * ix[4]];
+ }
+ if (x[incx * (i + 5)] > max[5]) {
+ ix[5] = (i + 5);
+ max[5] = x[incx * ix[5]];
+ }
+ if (x[incx * (i + 6)] > max[6]) {
+ ix[6] = (i + 6);
+ max[6] = x[incx * ix[6]];
+ }
+ if (x[incx * (i + 7)] > max[7]) {
+ ix[7] = (i + 7);
+ max[7] = x[incx * ix[7]];
+ }
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ if (x[incx * ix[i]] > x[incx * ix[0]]) {
+ ix[0] = ix[i];
+ }
+ }
+
+ return ix[0];
+}
+
+int
+blas·argmaxf(int len, float *x, int incx)
+{
+ int i;
+ int ix;
+ float max;
+
+ if (incx == 1) {
+ i = len;
+ ix = argmaxf_kernel16(&i, x);
+ } else {
+ i = len;
+ ix = argmaxf_s_kernel8(&i, incx, x);
+ }
+ max = x[incx * ix];
+ for (; i < len; i++) {
+ if (x[incx * i] > max) {
+ ix = i;
+ max = x[incx * ix];
+ }
+ }
+
+ return ix;
+}
+
+static
+void
+copyf_kernel16(int *ip, float *x, float *y)
+{
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ y[i + 0] = x[i + 0];
+ y[i + 1] = x[i + 1];
+ y[i + 2] = x[i + 2];
+ y[i + 3] = x[i + 3];
+ y[i + 4] = x[i + 4];
+ y[i + 5] = x[i + 5];
+ y[i + 6] = x[i + 6];
+ y[i + 7] = x[i + 7];
+ y[i + 8] = x[i + 8];
+ y[i + 9] = x[i + 9];
+ y[i + 10] = x[i + 10];
+ y[i + 11] = x[i + 11];
+ y[i + 12] = x[i + 12];
+ y[i + 13] = x[i + 13];
+ y[i + 14] = x[i + 14];
+ y[i + 15] = x[i + 15];
+ }
+ *ip = i;
+}
+
+static
+void
+copyf_s_kernel8(int *ip, int incx, float *x, int incy, float *y)
+{
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ y[incy * (i + 0)] = x[incx * (i + 0)];
+ y[incy * (i + 1)] = x[incx * (i + 1)];
+ y[incy * (i + 2)] = x[incx * (i + 2)];
+ y[incy * (i + 3)] = x[incx * (i + 3)];
+ y[incy * (i + 4)] = x[incx * (i + 4)];
+ y[incy * (i + 5)] = x[incx * (i + 5)];
+ y[incy * (i + 6)] = x[incx * (i + 6)];
+ y[incy * (i + 7)] = x[incx * (i + 7)];
+ }
+ *ip = i;
+}
+
+void
+blas·copyf(int len, float *x, int incx, float *y, int incy)
+{
+ int i;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ copyf_kernel16(&i, x, y);
+ } else {
+ i = len;
+ copyf_s_kernel8(&i, incx, x, incy, y);
+ }
+ for (; i < len; i++) {
+ y[incy * i] = x[incx * i];
+ }
+}
+
+static
+void
+axpyf_kernel16(int *ip, float a, float *x, float *y)
+{
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ y[i + 0] = y[i + 0] + a * x[i + 0];
+ y[i + 1] = y[i + 1] + a * x[i + 1];
+ y[i + 2] = y[i + 2] + a * x[i + 2];
+ y[i + 3] = y[i + 3] + a * x[i + 3];
+ y[i + 4] = y[i + 4] + a * x[i + 4];
+ y[i + 5] = y[i + 5] + a * x[i + 5];
+ y[i + 6] = y[i + 6] + a * x[i + 6];
+ y[i + 7] = y[i + 7] + a * x[i + 7];
+ y[i + 8] = y[i + 8] + a * x[i + 8];
+ y[i + 9] = y[i + 9] + a * x[i + 9];
+ y[i + 10] = y[i + 10] + a * x[i + 10];
+ y[i + 11] = y[i + 11] + a * x[i + 11];
+ y[i + 12] = y[i + 12] + a * x[i + 12];
+ y[i + 13] = y[i + 13] + a * x[i + 13];
+ y[i + 14] = y[i + 14] + a * x[i + 14];
+ y[i + 15] = y[i + 15] + a * x[i + 15];
+ }
+ *ip = i;
+}
+
+static
+void
+axpyf_s_kernel8(int *ip, float a, float *x, int incy, float *y, int incx)
+{
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)];
+ y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)];
+ y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)];
+ y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)];
+ y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)];
+ y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)];
+ y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)];
+ y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)];
+ }
+ *ip = i;
+}
+
+void
+blas·axpyf(int len, float a, float *x, int incx, float *y, int incy)
+{
+ int i;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ axpyf_kernel16(&i, a, x, y);
+ } else {
+ i = len;
+ axpyf_s_kernel8(&i, a, x, incy, y, incx);
+ }
+ for (; i < len; i++) {
+ y[incy * i] = y[incy * i] + a * x[incx * i];
+ }
+}
+
+static
+void
+axpbyf_kernel16(int *ip, float *x, float a, float *y, float b)
+{
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ y[i + 0] = b * y[i + 0] + a * x[i + 0];
+ y[i + 1] = b * y[i + 1] + a * x[i + 1];
+ y[i + 2] = b * y[i + 2] + a * x[i + 2];
+ y[i + 3] = b * y[i + 3] + a * x[i + 3];
+ y[i + 4] = b * y[i + 4] + a * x[i + 4];
+ y[i + 5] = b * y[i + 5] + a * x[i + 5];
+ y[i + 6] = b * y[i + 6] + a * x[i + 6];
+ y[i + 7] = b * y[i + 7] + a * x[i + 7];
+ y[i + 8] = b * y[i + 8] + a * x[i + 8];
+ y[i + 9] = b * y[i + 9] + a * x[i + 9];
+ y[i + 10] = b * y[i + 10] + a * x[i + 10];
+ y[i + 11] = b * y[i + 11] + a * x[i + 11];
+ y[i + 12] = b * y[i + 12] + a * x[i + 12];
+ y[i + 13] = b * y[i + 13] + a * x[i + 13];
+ y[i + 14] = b * y[i + 14] + a * x[i + 14];
+ y[i + 15] = b * y[i + 15] + a * x[i + 15];
+ }
+ *ip = i;
+}
+
+static
+void
+axpbyf_s_kernel8(int *ip, float *x, float a, float *y, float b, int incx, int incy)
+{
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)];
+ y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)];
+ y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)];
+ y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)];
+ y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)];
+ y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)];
+ y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)];
+ y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)];
+ }
+ *ip = i;
+}
+
+void
+blas·axpbyf(int len, float a, float *x, int incx, float b, float *y, int incy)
+{
+ int i;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ axpbyf_kernel16(&i, x, a, y, b);
+ } else {
+ i = len;
+ axpbyf_s_kernel8(&i, x, a, y, b, incx, incy);
+ }
+ for (; i < len; i++) {
+ y[incy * i] = b * y[incy * i] + a * x[incx * i];
+ }
+}
+
+static
+float
+dotf_kernel16(int *ip, float *x, float *y)
+{
+ int i;
+ float sum[16];
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ sum[i] = 0;
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ sum[0] += x[i + 0] * y[i + 0];
+ sum[1] += x[i + 1] * y[i + 1];
+ sum[2] += x[i + 2] * y[i + 2];
+ sum[3] += x[i + 3] * y[i + 3];
+ sum[4] += x[i + 4] * y[i + 4];
+ sum[5] += x[i + 5] * y[i + 5];
+ sum[6] += x[i + 6] * y[i + 6];
+ sum[7] += x[i + 7] * y[i + 7];
+ sum[8] += x[i + 8] * y[i + 8];
+ sum[9] += x[i + 9] * y[i + 9];
+ sum[10] += x[i + 10] * y[i + 10];
+ sum[11] += x[i + 11] * y[i + 11];
+ sum[12] += x[i + 12] * y[i + 12];
+ sum[13] += x[i + 13] * y[i + 13];
+ sum[14] += x[i + 14] * y[i + 14];
+ sum[15] += x[i + 15] * y[i + 15];
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ sum[0] += sum[i];
+ }
+
+ return sum[0];
+}
+
+static
+float
+dotf_s_kernel8(int *ip, float *x, int incy, float *y, int incx)
+{
+ int i;
+ float sum[8];
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ sum[i] = 0;
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)];
+ sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)];
+ sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)];
+ sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)];
+ sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)];
+ sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)];
+ sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)];
+ sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)];
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ sum[0] += sum[i];
+ }
+
+ return sum[0];
+}
+
+float
+blas·dotf(int len, float *x, int incx, float *y, int incy)
+{
+ int i;
+ float sum;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ sum = dotf_kernel16(&i, x, y);
+ } else {
+ i = len;
+ sum = dotf_s_kernel8(&i, x, incy, y, incx);
+ }
+ for (; i < len; i++) {
+ sum += x[incx * i] * y[incy * i];
+ }
+
+ return sum;
+}
+
+static
+float
+sumf_kernel16(int *ip, float *x)
+{
+ int i;
+ float sum[16];
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ sum[i] = 0;
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ sum[0] += x[i + 0];
+ sum[1] += x[i + 1];
+ sum[2] += x[i + 2];
+ sum[3] += x[i + 3];
+ sum[4] += x[i + 4];
+ sum[5] += x[i + 5];
+ sum[6] += x[i + 6];
+ sum[7] += x[i + 7];
+ sum[8] += x[i + 8];
+ sum[9] += x[i + 9];
+ sum[10] += x[i + 10];
+ sum[11] += x[i + 11];
+ sum[12] += x[i + 12];
+ sum[13] += x[i + 13];
+ sum[14] += x[i + 14];
+ sum[15] += x[i + 15];
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ sum[0] += sum[i];
+ }
+
+ return sum[0];
+}
+
+static
+float
+sumf_s_kernel8(int *ip, float *x, int incx)
+{
+ float sum[8];
+ int i;
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ sum[i] = 0;
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ sum[0] += x[incx * (i + 0)];
+ sum[1] += x[incx * (i + 1)];
+ sum[2] += x[incx * (i + 2)];
+ sum[3] += x[incx * (i + 3)];
+ sum[4] += x[incx * (i + 4)];
+ sum[5] += x[incx * (i + 5)];
+ sum[6] += x[incx * (i + 6)];
+ sum[7] += x[incx * (i + 7)];
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ sum[0] += sum[i];
+ }
+
+ return sum[0];
+}
+
+float
+blas·sumf(int len, float *x, int incx)
+{
+ int i;
+ float sum;
+
+ if (incx == 1) {
+ i = len;
+ sum = sumf_kernel16(&i, x);
+ } else {
+ i = len;
+ sum = sumf_s_kernel8(&i, x, incx);
+ }
+ for (; i < len; i++) {
+ sum += x[incx * i];
+ }
+
+ return sum;
+}
+
+static
+float
+normf_kernel16(int *ip, float *x)
+{
+ float nrm[16];
+ int i;
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ nrm[i] = 0;
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ nrm[0] += x[i + 0] * x[i + 0];
+ nrm[1] += x[i + 1] * x[i + 1];
+ nrm[2] += x[i + 2] * x[i + 2];
+ nrm[3] += x[i + 3] * x[i + 3];
+ nrm[4] += x[i + 4] * x[i + 4];
+ nrm[5] += x[i + 5] * x[i + 5];
+ nrm[6] += x[i + 6] * x[i + 6];
+ nrm[7] += x[i + 7] * x[i + 7];
+ nrm[8] += x[i + 8] * x[i + 8];
+ nrm[9] += x[i + 9] * x[i + 9];
+ nrm[10] += x[i + 10] * x[i + 10];
+ nrm[11] += x[i + 11] * x[i + 11];
+ nrm[12] += x[i + 12] * x[i + 12];
+ nrm[13] += x[i + 13] * x[i + 13];
+ nrm[14] += x[i + 14] * x[i + 14];
+ nrm[15] += x[i + 15] * x[i + 15];
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ nrm[0] += nrm[i];
+ }
+
+ return nrm[0];
+}
+
+static
+float
+normf_s_kernel8(int *ip, int incx, float *x)
+{
+ float nrm[8];
+ int i;
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ nrm[i] = 0;
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)];
+ nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)];
+ nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)];
+ nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)];
+ nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)];
+ nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)];
+ nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)];
+ nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)];
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ nrm[0] += nrm[i];
+ }
+
+ return nrm[0];
+}
+
+float
+blas·normf(int len, float *x, int incx)
+{
+ int i;
+ float nrm;
+
+ if (incx == 1) {
+ i = len;
+ nrm = normf_kernel16(&i, x);
+ } else {
+ i = len;
+ nrm = normf_s_kernel8(&i, incx, x);
+ }
+ for (; i < len; i++) {
+ nrm += x[incx * i] * x[incx * i];
+ }
+
+ return math·sqrtf(nrm);
+}
+
+static
+void
+scalef_kernel16(int *ip, float *x, float a)
+{
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ x[i + 0] = a * x[i + 0];
+ x[i + 1] = a * x[i + 1];
+ x[i + 2] = a * x[i + 2];
+ x[i + 3] = a * x[i + 3];
+ x[i + 4] = a * x[i + 4];
+ x[i + 5] = a * x[i + 5];
+ x[i + 6] = a * x[i + 6];
+ x[i + 7] = a * x[i + 7];
+ x[i + 8] = a * x[i + 8];
+ x[i + 9] = a * x[i + 9];
+ x[i + 10] = a * x[i + 10];
+ x[i + 11] = a * x[i + 11];
+ x[i + 12] = a * x[i + 12];
+ x[i + 13] = a * x[i + 13];
+ x[i + 14] = a * x[i + 14];
+ x[i + 15] = a * x[i + 15];
+ }
+ *ip = i;
+}
+
+static
+void
+scalef_s_kernel8(int *ip, int incx, float *x, float a)
+{
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ x[incx * (i + 0)] = a * x[incx * (i + 0)];
+ x[incx * (i + 1)] = a * x[incx * (i + 1)];
+ x[incx * (i + 2)] = a * x[incx * (i + 2)];
+ x[incx * (i + 3)] = a * x[incx * (i + 3)];
+ x[incx * (i + 4)] = a * x[incx * (i + 4)];
+ x[incx * (i + 5)] = a * x[incx * (i + 5)];
+ x[incx * (i + 6)] = a * x[incx * (i + 6)];
+ x[incx * (i + 7)] = a * x[incx * (i + 7)];
+ }
+ *ip = i;
+}
+
+void
+blas·scalef(int len, float *x, int incx, float a)
+{
+ int i;
+
+ if (incx == 1) {
+ i = len;
+ scalef_kernel16(&i, x, a);
+ } else {
+ i = len;
+ scalef_s_kernel8(&i, incx, x, a);
+ }
+ for (; i < len; i++) {
+ x[incx * i] = a * x[incx * i];
+ }
+}
+
+static
+void
+rotf_kernel16(int *ip, float *y, float cos, float sin, float *x)
+{
+ int i;
+ float tmp[16];
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0];
+ tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1];
+ tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2];
+ tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3];
+ tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4];
+ tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5];
+ tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6];
+ tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7];
+ tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8];
+ tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9];
+ tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10];
+ tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11];
+ tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12];
+ tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13];
+ tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14];
+ tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15];
+ }
+ *ip = i;
+}
+
+static
+void
+rotf_s_kernel8(int *ip, float *y, float sin, float cos, float *x, int incx, int incy)
+{
+ int i;
+ float tmp[8];
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0];
+ tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1];
+ tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2];
+ tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3];
+ tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4];
+ tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5];
+ tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6];
+ tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7];
+ }
+ *ip = i;
+}
+
+void
+blas·rotf(int len, float *x, int incx, float *y, int incy, float cos, float sin)
+{
+ int i;
+ float tmp;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ rotf_kernel16(&i, y, cos, sin, x);
+ } else {
+ i = len;
+ rotf_s_kernel8(&i, y, sin, cos, x, incx, incy);
+ }
+ for (; i < len; i++) {
+ tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp;
+ }
+}
+
+static
+void
+rotgf_kernel16(int *ip, float H[5], float *y, float *x)
+{
+ float tmp[16];
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0];
+ tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1];
+ tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2];
+ tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3];
+ tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4];
+ tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5];
+ tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6];
+ tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7];
+ tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8];
+ tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9];
+ tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10];
+ tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11];
+ tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12];
+ tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13];
+ tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14];
+ tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15];
+ }
+ *ip = i;
+}
+
+static
+void
+rotgf_s_kernel8(int *ip, int incy, int incx, float H[5], float *y, float *x)
+{
+ float tmp[8];
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0];
+ tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1];
+ tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2];
+ tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3];
+ tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4];
+ tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5];
+ tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6];
+ tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7];
+ }
+ *ip = i;
+}
+
+void
+blas·rotgf(int len, float *x, int incx, float *y, int incy, float H[5])
+{
+ int i;
+ float tmp;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ rotgf_kernel16(&i, H, y, x);
+ } else {
+ i = len;
+ rotgf_s_kernel8(&i, incy, incx, H, y, x);
+ }
+ for (; i < len; i++) {
+ tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp;
+ }
+}
+
+static
+int
+argmind_kernel16(int *ip, double *x)
+{
+ int ix[16];
+ double min[16];
+ int i;
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ ix[i] = 0;
+ }
+ for (i = 0; i < 16; i++) {
+ min[i] = x[0];
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ if (x[i + 0] < min[0]) {
+ ix[0] = (i + 0);
+ min[0] = x[ix[0]];
+ }
+ if (x[i + 1] < min[1]) {
+ ix[1] = (i + 1);
+ min[1] = x[ix[1]];
+ }
+ if (x[i + 2] < min[2]) {
+ ix[2] = (i + 2);
+ min[2] = x[ix[2]];
+ }
+ if (x[i + 3] < min[3]) {
+ ix[3] = (i + 3);
+ min[3] = x[ix[3]];
+ }
+ if (x[i + 4] < min[4]) {
+ ix[4] = (i + 4);
+ min[4] = x[ix[4]];
+ }
+ if (x[i + 5] < min[5]) {
+ ix[5] = (i + 5);
+ min[5] = x[ix[5]];
+ }
+ if (x[i + 6] < min[6]) {
+ ix[6] = (i + 6);
+ min[6] = x[ix[6]];
+ }
+ if (x[i + 7] < min[7]) {
+ ix[7] = (i + 7);
+ min[7] = x[ix[7]];
+ }
+ if (x[i + 8] < min[8]) {
+ ix[8] = (i + 8);
+ min[8] = x[ix[8]];
+ }
+ if (x[i + 9] < min[9]) {
+ ix[9] = (i + 9);
+ min[9] = x[ix[9]];
+ }
+ if (x[i + 10] < min[10]) {
+ ix[10] = (i + 10);
+ min[10] = x[ix[10]];
+ }
+ if (x[i + 11] < min[11]) {
+ ix[11] = (i + 11);
+ min[11] = x[ix[11]];
+ }
+ if (x[i + 12] < min[12]) {
+ ix[12] = (i + 12);
+ min[12] = x[ix[12]];
+ }
+ if (x[i + 13] < min[13]) {
+ ix[13] = (i + 13);
+ min[13] = x[ix[13]];
+ }
+ if (x[i + 14] < min[14]) {
+ ix[14] = (i + 14);
+ min[14] = x[ix[14]];
+ }
+ if (x[i + 15] < min[15]) {
+ ix[15] = (i + 15);
+ min[15] = x[ix[15]];
+ }
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ if (x[ix[i]] < x[ix[0]]) {
+ ix[0] = ix[i];
+ }
+ }
+
+ return ix[0];
+}
+
+static
+int
+argmind_s_kernel8(int *ip, double *x, int incx)
+{
+ int ix[8];
+ double min[8];
+ int i;
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ ix[i] = 0;
+ }
+ for (i = 0; i < 8; i++) {
+ min[i] = x[0];
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ if (x[incx * (i + 0)] < min[0]) {
+ ix[0] = (i + 0);
+ min[0] = x[incx * ix[0]];
+ }
+ if (x[incx * (i + 1)] < min[1]) {
+ ix[1] = (i + 1);
+ min[1] = x[incx * ix[1]];
+ }
+ if (x[incx * (i + 2)] < min[2]) {
+ ix[2] = (i + 2);
+ min[2] = x[incx * ix[2]];
+ }
+ if (x[incx * (i + 3)] < min[3]) {
+ ix[3] = (i + 3);
+ min[3] = x[incx * ix[3]];
+ }
+ if (x[incx * (i + 4)] < min[4]) {
+ ix[4] = (i + 4);
+ min[4] = x[incx * ix[4]];
+ }
+ if (x[incx * (i + 5)] < min[5]) {
+ ix[5] = (i + 5);
+ min[5] = x[incx * ix[5]];
+ }
+ if (x[incx * (i + 6)] < min[6]) {
+ ix[6] = (i + 6);
+ min[6] = x[incx * ix[6]];
+ }
+ if (x[incx * (i + 7)] < min[7]) {
+ ix[7] = (i + 7);
+ min[7] = x[incx * ix[7]];
+ }
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ if (x[incx * ix[i]] < x[incx * ix[0]]) {
+ ix[0] = ix[i];
+ }
+ }
+
+ return ix[0];
+}
+
+int
+blas·argmind(int len, double *x, int incx)
+{
+ int i;
+ int ix;
+ double min;
+
+ if (incx == 1) {
+ i = len;
+ ix = argmind_kernel16(&i, x);
+ } else {
+ i = len;
+ ix = argmind_s_kernel8(&i, x, incx);
+ }
+ min = x[incx * ix];
+ for (; i < len; i++) {
+ if (x[incx * i] < min) {
+ ix = i;
+ min = x[incx * ix];
+ }
+ }
+
+ return ix;
+}
+
+static
+int
+argmaxd_kernel16(int *ip, double *x)
+{
+ int i;
+ int ix[16];
+ double max[16];
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ ix[i] = 0;
+ }
+ for (i = 0; i < 16; i++) {
+ max[i] = x[0];
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ if (x[i + 0] > max[0]) {
+ ix[0] = (i + 0);
+ max[0] = x[ix[0]];
+ }
+ if (x[i + 1] > max[1]) {
+ ix[1] = (i + 1);
+ max[1] = x[ix[1]];
+ }
+ if (x[i + 2] > max[2]) {
+ ix[2] = (i + 2);
+ max[2] = x[ix[2]];
+ }
+ if (x[i + 3] > max[3]) {
+ ix[3] = (i + 3);
+ max[3] = x[ix[3]];
+ }
+ if (x[i + 4] > max[4]) {
+ ix[4] = (i + 4);
+ max[4] = x[ix[4]];
+ }
+ if (x[i + 5] > max[5]) {
+ ix[5] = (i + 5);
+ max[5] = x[ix[5]];
+ }
+ if (x[i + 6] > max[6]) {
+ ix[6] = (i + 6);
+ max[6] = x[ix[6]];
+ }
+ if (x[i + 7] > max[7]) {
+ ix[7] = (i + 7);
+ max[7] = x[ix[7]];
+ }
+ if (x[i + 8] > max[8]) {
+ ix[8] = (i + 8);
+ max[8] = x[ix[8]];
+ }
+ if (x[i + 9] > max[9]) {
+ ix[9] = (i + 9);
+ max[9] = x[ix[9]];
+ }
+ if (x[i + 10] > max[10]) {
+ ix[10] = (i + 10);
+ max[10] = x[ix[10]];
+ }
+ if (x[i + 11] > max[11]) {
+ ix[11] = (i + 11);
+ max[11] = x[ix[11]];
+ }
+ if (x[i + 12] > max[12]) {
+ ix[12] = (i + 12);
+ max[12] = x[ix[12]];
+ }
+ if (x[i + 13] > max[13]) {
+ ix[13] = (i + 13);
+ max[13] = x[ix[13]];
+ }
+ if (x[i + 14] > max[14]) {
+ ix[14] = (i + 14);
+ max[14] = x[ix[14]];
+ }
+ if (x[i + 15] > max[15]) {
+ ix[15] = (i + 15);
+ max[15] = x[ix[15]];
+ }
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ if (x[ix[i]] > x[ix[0]]) {
+ ix[0] = ix[i];
+ }
+ }
+
+ return ix[0];
+}
+
+static
+int
+argmaxd_s_kernel8(int *ip, int incx, double *x)
+{
+ int i;
+ int ix[8];
+ double max[8];
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ ix[i] = 0;
+ }
+ for (i = 0; i < 8; i++) {
+ max[i] = x[0];
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ if (x[incx * (i + 0)] > max[0]) {
+ ix[0] = (i + 0);
+ max[0] = x[incx * ix[0]];
+ }
+ if (x[incx * (i + 1)] > max[1]) {
+ ix[1] = (i + 1);
+ max[1] = x[incx * ix[1]];
+ }
+ if (x[incx * (i + 2)] > max[2]) {
+ ix[2] = (i + 2);
+ max[2] = x[incx * ix[2]];
+ }
+ if (x[incx * (i + 3)] > max[3]) {
+ ix[3] = (i + 3);
+ max[3] = x[incx * ix[3]];
+ }
+ if (x[incx * (i + 4)] > max[4]) {
+ ix[4] = (i + 4);
+ max[4] = x[incx * ix[4]];
+ }
+ if (x[incx * (i + 5)] > max[5]) {
+ ix[5] = (i + 5);
+ max[5] = x[incx * ix[5]];
+ }
+ if (x[incx * (i + 6)] > max[6]) {
+ ix[6] = (i + 6);
+ max[6] = x[incx * ix[6]];
+ }
+ if (x[incx * (i + 7)] > max[7]) {
+ ix[7] = (i + 7);
+ max[7] = x[incx * ix[7]];
+ }
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ if (x[incx * ix[i]] > x[incx * ix[0]]) {
+ ix[0] = ix[i];
+ }
+ }
+
+ return ix[0];
+}
+
+int
+blas·argmaxd(int len, double *x, int incx)
+{
+ int i;
+ int ix;
+ double max;
+
+ if (incx == 1) {
+ i = len;
+ ix = argmaxd_kernel16(&i, x);
+ } else {
+ i = len;
+ ix = argmaxd_s_kernel8(&i, incx, x);
+ }
+ max = x[incx * ix];
+ for (; i < len; i++) {
+ if (x[incx * i] > max) {
+ ix = i;
+ max = x[incx * ix];
+ }
+ }
+
+ return ix;
+}
+
+static
+void
+copyd_kernel16(int *ip, double *y, double *x)
+{
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ y[i + 0] = x[i + 0];
+ y[i + 1] = x[i + 1];
+ y[i + 2] = x[i + 2];
+ y[i + 3] = x[i + 3];
+ y[i + 4] = x[i + 4];
+ y[i + 5] = x[i + 5];
+ y[i + 6] = x[i + 6];
+ y[i + 7] = x[i + 7];
+ y[i + 8] = x[i + 8];
+ y[i + 9] = x[i + 9];
+ y[i + 10] = x[i + 10];
+ y[i + 11] = x[i + 11];
+ y[i + 12] = x[i + 12];
+ y[i + 13] = x[i + 13];
+ y[i + 14] = x[i + 14];
+ y[i + 15] = x[i + 15];
+ }
+ *ip = i;
+}
+
+static
+void
+copyd_s_kernel8(int *ip, double *y, double *x, int incy, int incx)
+{
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ y[incy * (i + 0)] = x[incx * (i + 0)];
+ y[incy * (i + 1)] = x[incx * (i + 1)];
+ y[incy * (i + 2)] = x[incx * (i + 2)];
+ y[incy * (i + 3)] = x[incx * (i + 3)];
+ y[incy * (i + 4)] = x[incx * (i + 4)];
+ y[incy * (i + 5)] = x[incx * (i + 5)];
+ y[incy * (i + 6)] = x[incx * (i + 6)];
+ y[incy * (i + 7)] = x[incx * (i + 7)];
+ }
+ *ip = i;
+}
+
+void
+blas·copyd(int len, double *x, int incx, double *y, int incy)
+{
+ int i;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ copyd_kernel16(&i, y, x);
+ } else {
+ i = len;
+ copyd_s_kernel8(&i, y, x, incy, incx);
+ }
+ for (; i < len; i++) {
+ y[incy * i] = x[incx * i];
+ }
+}
+
+static
+void
+axpyd_kernel16(int *ip, double a, double *y, double *x)
+{
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ y[i + 0] = y[i + 0] + a * x[i + 0];
+ y[i + 1] = y[i + 1] + a * x[i + 1];
+ y[i + 2] = y[i + 2] + a * x[i + 2];
+ y[i + 3] = y[i + 3] + a * x[i + 3];
+ y[i + 4] = y[i + 4] + a * x[i + 4];
+ y[i + 5] = y[i + 5] + a * x[i + 5];
+ y[i + 6] = y[i + 6] + a * x[i + 6];
+ y[i + 7] = y[i + 7] + a * x[i + 7];
+ y[i + 8] = y[i + 8] + a * x[i + 8];
+ y[i + 9] = y[i + 9] + a * x[i + 9];
+ y[i + 10] = y[i + 10] + a * x[i + 10];
+ y[i + 11] = y[i + 11] + a * x[i + 11];
+ y[i + 12] = y[i + 12] + a * x[i + 12];
+ y[i + 13] = y[i + 13] + a * x[i + 13];
+ y[i + 14] = y[i + 14] + a * x[i + 14];
+ y[i + 15] = y[i + 15] + a * x[i + 15];
+ }
+ *ip = i;
+}
+
+static
+void
+axpyd_s_kernel8(int *ip, double a, int incx, double *y, double *x, int incy)
+{
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ y[incy * (i + 0)] = y[incy * (i + 0)] + a * x[incx * (i + 0)];
+ y[incy * (i + 1)] = y[incy * (i + 1)] + a * x[incx * (i + 1)];
+ y[incy * (i + 2)] = y[incy * (i + 2)] + a * x[incx * (i + 2)];
+ y[incy * (i + 3)] = y[incy * (i + 3)] + a * x[incx * (i + 3)];
+ y[incy * (i + 4)] = y[incy * (i + 4)] + a * x[incx * (i + 4)];
+ y[incy * (i + 5)] = y[incy * (i + 5)] + a * x[incx * (i + 5)];
+ y[incy * (i + 6)] = y[incy * (i + 6)] + a * x[incx * (i + 6)];
+ y[incy * (i + 7)] = y[incy * (i + 7)] + a * x[incx * (i + 7)];
+ }
+ *ip = i;
+}
+
+void
+blas·axpyd(int len, double a, double *x, int incx, double *y, int incy)
+{
+ int i;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ axpyd_kernel16(&i, a, y, x);
+ } else {
+ i = len;
+ axpyd_s_kernel8(&i, a, incx, y, x, incy);
+ }
+ for (; i < len; i++) {
+ y[incy * i] = y[incy * i] + a * x[incx * i];
+ }
+}
+
+static
+void
+axpbyd_kernel16(int *ip, double a, double b, double *x, double *y)
+{
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ y[i + 0] = b * y[i + 0] + a * x[i + 0];
+ y[i + 1] = b * y[i + 1] + a * x[i + 1];
+ y[i + 2] = b * y[i + 2] + a * x[i + 2];
+ y[i + 3] = b * y[i + 3] + a * x[i + 3];
+ y[i + 4] = b * y[i + 4] + a * x[i + 4];
+ y[i + 5] = b * y[i + 5] + a * x[i + 5];
+ y[i + 6] = b * y[i + 6] + a * x[i + 6];
+ y[i + 7] = b * y[i + 7] + a * x[i + 7];
+ y[i + 8] = b * y[i + 8] + a * x[i + 8];
+ y[i + 9] = b * y[i + 9] + a * x[i + 9];
+ y[i + 10] = b * y[i + 10] + a * x[i + 10];
+ y[i + 11] = b * y[i + 11] + a * x[i + 11];
+ y[i + 12] = b * y[i + 12] + a * x[i + 12];
+ y[i + 13] = b * y[i + 13] + a * x[i + 13];
+ y[i + 14] = b * y[i + 14] + a * x[i + 14];
+ y[i + 15] = b * y[i + 15] + a * x[i + 15];
+ }
+ *ip = i;
+}
+
+static
+void
+axpbyd_s_kernel8(int *ip, double a, double b, double *x, int incx, int incy, double *y)
+{
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ y[incy * (i + 0)] = b * y[incy * (i + 0)] + a * x[incx * (i + 0)];
+ y[incy * (i + 1)] = b * y[incy * (i + 1)] + a * x[incx * (i + 1)];
+ y[incy * (i + 2)] = b * y[incy * (i + 2)] + a * x[incx * (i + 2)];
+ y[incy * (i + 3)] = b * y[incy * (i + 3)] + a * x[incx * (i + 3)];
+ y[incy * (i + 4)] = b * y[incy * (i + 4)] + a * x[incx * (i + 4)];
+ y[incy * (i + 5)] = b * y[incy * (i + 5)] + a * x[incx * (i + 5)];
+ y[incy * (i + 6)] = b * y[incy * (i + 6)] + a * x[incx * (i + 6)];
+ y[incy * (i + 7)] = b * y[incy * (i + 7)] + a * x[incx * (i + 7)];
+ }
+ *ip = i;
+}
+
+void
+blas·axpbyd(int len, double a, double *x, int incx, double b, double *y, int incy)
+{
+ int i;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ axpbyd_kernel16(&i, a, b, x, y);
+ } else {
+ i = len;
+ axpbyd_s_kernel8(&i, a, b, x, incx, incy, y);
+ }
+ for (; i < len; i++) {
+ y[incy * i] = b * y[incy * i] + a * x[incx * i];
+ }
+}
+
+static
+double
+dotd_kernel16(int *ip, double *x, double *y)
+{
+ double sum[16];
+ int i;
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ sum[i] = 0;
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ sum[0] += x[i + 0] * y[i + 0];
+ sum[1] += x[i + 1] * y[i + 1];
+ sum[2] += x[i + 2] * y[i + 2];
+ sum[3] += x[i + 3] * y[i + 3];
+ sum[4] += x[i + 4] * y[i + 4];
+ sum[5] += x[i + 5] * y[i + 5];
+ sum[6] += x[i + 6] * y[i + 6];
+ sum[7] += x[i + 7] * y[i + 7];
+ sum[8] += x[i + 8] * y[i + 8];
+ sum[9] += x[i + 9] * y[i + 9];
+ sum[10] += x[i + 10] * y[i + 10];
+ sum[11] += x[i + 11] * y[i + 11];
+ sum[12] += x[i + 12] * y[i + 12];
+ sum[13] += x[i + 13] * y[i + 13];
+ sum[14] += x[i + 14] * y[i + 14];
+ sum[15] += x[i + 15] * y[i + 15];
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ sum[0] += sum[i];
+ }
+
+ return sum[0];
+}
+
+static
+double
+dotd_s_kernel8(int *ip, int incx, double *x, int incy, double *y)
+{
+ double sum[8];
+ int i;
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ sum[i] = 0;
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ sum[0] += x[incx * (i + 0)] * y[incy * (i + 0)];
+ sum[1] += x[incx * (i + 1)] * y[incy * (i + 1)];
+ sum[2] += x[incx * (i + 2)] * y[incy * (i + 2)];
+ sum[3] += x[incx * (i + 3)] * y[incy * (i + 3)];
+ sum[4] += x[incx * (i + 4)] * y[incy * (i + 4)];
+ sum[5] += x[incx * (i + 5)] * y[incy * (i + 5)];
+ sum[6] += x[incx * (i + 6)] * y[incy * (i + 6)];
+ sum[7] += x[incx * (i + 7)] * y[incy * (i + 7)];
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ sum[0] += sum[i];
+ }
+
+ return sum[0];
+}
+
+double
+blas·dotd(int len, double *x, int incx, double *y, int incy)
+{
+ int i;
+ double sum;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ sum = dotd_kernel16(&i, x, y);
+ } else {
+ i = len;
+ sum = dotd_s_kernel8(&i, incx, x, incy, y);
+ }
+ for (; i < len; i++) {
+ sum += x[incx * i] * y[incy * i];
+ }
+
+ return sum;
+}
+
+static
+double
+sumd_kernel16(int *ip, double *x)
+{
+ int i;
+ double sum[16];
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ sum[i] = 0;
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ sum[0] += x[i + 0];
+ sum[1] += x[i + 1];
+ sum[2] += x[i + 2];
+ sum[3] += x[i + 3];
+ sum[4] += x[i + 4];
+ sum[5] += x[i + 5];
+ sum[6] += x[i + 6];
+ sum[7] += x[i + 7];
+ sum[8] += x[i + 8];
+ sum[9] += x[i + 9];
+ sum[10] += x[i + 10];
+ sum[11] += x[i + 11];
+ sum[12] += x[i + 12];
+ sum[13] += x[i + 13];
+ sum[14] += x[i + 14];
+ sum[15] += x[i + 15];
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ sum[0] += sum[i];
+ }
+
+ return sum[0];
+}
+
+static
+double
+sumd_s_kernel8(int *ip, double *x, int incx)
+{
+ int i;
+ double sum[8];
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ sum[i] = 0;
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ sum[0] += x[incx * (i + 0)];
+ sum[1] += x[incx * (i + 1)];
+ sum[2] += x[incx * (i + 2)];
+ sum[3] += x[incx * (i + 3)];
+ sum[4] += x[incx * (i + 4)];
+ sum[5] += x[incx * (i + 5)];
+ sum[6] += x[incx * (i + 6)];
+ sum[7] += x[incx * (i + 7)];
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ sum[0] += sum[i];
+ }
+
+ return sum[0];
+}
+
+double
+blas·sumd(int len, double *x, int incx)
+{
+ int i;
+ double sum;
+
+ if (incx == 1) {
+ i = len;
+ sum = sumd_kernel16(&i, x);
+ } else {
+ i = len;
+ sum = sumd_s_kernel8(&i, x, incx);
+ }
+ for (; i < len; i++) {
+ sum += x[incx * i];
+ }
+
+ return sum;
+}
+
+static
+double
+normd_kernel16(int *ip, double *x)
+{
+ double nrm[16];
+ int i;
+ int len;
+
+ for (i = 0; i < 16; i++) {
+ nrm[i] = 0;
+ }
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ nrm[0] += x[i + 0] * x[i + 0];
+ nrm[1] += x[i + 1] * x[i + 1];
+ nrm[2] += x[i + 2] * x[i + 2];
+ nrm[3] += x[i + 3] * x[i + 3];
+ nrm[4] += x[i + 4] * x[i + 4];
+ nrm[5] += x[i + 5] * x[i + 5];
+ nrm[6] += x[i + 6] * x[i + 6];
+ nrm[7] += x[i + 7] * x[i + 7];
+ nrm[8] += x[i + 8] * x[i + 8];
+ nrm[9] += x[i + 9] * x[i + 9];
+ nrm[10] += x[i + 10] * x[i + 10];
+ nrm[11] += x[i + 11] * x[i + 11];
+ nrm[12] += x[i + 12] * x[i + 12];
+ nrm[13] += x[i + 13] * x[i + 13];
+ nrm[14] += x[i + 14] * x[i + 14];
+ nrm[15] += x[i + 15] * x[i + 15];
+ }
+ *ip = i;
+ for (i = 1; i < 16; i++) {
+ nrm[0] += nrm[i];
+ }
+
+ return nrm[0];
+}
+
+static
+double
+normd_s_kernel8(int *ip, int incx, double *x)
+{
+ int i;
+ double nrm[8];
+ int len;
+
+ for (i = 0; i < 8; i++) {
+ nrm[i] = 0;
+ }
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ nrm[0] += x[incx * (i + 0)] * x[incx * (i + 0)];
+ nrm[1] += x[incx * (i + 1)] * x[incx * (i + 1)];
+ nrm[2] += x[incx * (i + 2)] * x[incx * (i + 2)];
+ nrm[3] += x[incx * (i + 3)] * x[incx * (i + 3)];
+ nrm[4] += x[incx * (i + 4)] * x[incx * (i + 4)];
+ nrm[5] += x[incx * (i + 5)] * x[incx * (i + 5)];
+ nrm[6] += x[incx * (i + 6)] * x[incx * (i + 6)];
+ nrm[7] += x[incx * (i + 7)] * x[incx * (i + 7)];
+ }
+ *ip = i;
+ for (i = 1; i < 8; i++) {
+ nrm[0] += nrm[i];
+ }
+
+ return nrm[0];
+}
+
+double
+blas·normd(int len, double *x, int incx)
+{
+ int i;
+ double nrm;
+
+ if (incx == 1) {
+ i = len;
+ nrm = normd_kernel16(&i, x);
+ } else {
+ i = len;
+ nrm = normd_s_kernel8(&i, incx, x);
+ }
+ for (; i < len; i++) {
+ nrm += x[incx * i] * x[incx * i];
+ }
+
+ return math·sqrt(nrm);
+}
+
+static
+void
+scaled_kernel16(int *ip, double a, double *x)
+{
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ x[i + 0] = a * x[i + 0];
+ x[i + 1] = a * x[i + 1];
+ x[i + 2] = a * x[i + 2];
+ x[i + 3] = a * x[i + 3];
+ x[i + 4] = a * x[i + 4];
+ x[i + 5] = a * x[i + 5];
+ x[i + 6] = a * x[i + 6];
+ x[i + 7] = a * x[i + 7];
+ x[i + 8] = a * x[i + 8];
+ x[i + 9] = a * x[i + 9];
+ x[i + 10] = a * x[i + 10];
+ x[i + 11] = a * x[i + 11];
+ x[i + 12] = a * x[i + 12];
+ x[i + 13] = a * x[i + 13];
+ x[i + 14] = a * x[i + 14];
+ x[i + 15] = a * x[i + 15];
+ }
+ *ip = i;
+}
+
+static
+void
+scaled_s_kernel8(int *ip, double a, int incx, double *x)
+{
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ x[incx * (i + 0)] = a * x[incx * (i + 0)];
+ x[incx * (i + 1)] = a * x[incx * (i + 1)];
+ x[incx * (i + 2)] = a * x[incx * (i + 2)];
+ x[incx * (i + 3)] = a * x[incx * (i + 3)];
+ x[incx * (i + 4)] = a * x[incx * (i + 4)];
+ x[incx * (i + 5)] = a * x[incx * (i + 5)];
+ x[incx * (i + 6)] = a * x[incx * (i + 6)];
+ x[incx * (i + 7)] = a * x[incx * (i + 7)];
+ }
+ *ip = i;
+}
+
+void
+blas·scaled(int len, double *x, int incx, double a)
+{
+ int i;
+
+ if (incx == 1) {
+ i = len;
+ scaled_kernel16(&i, a, x);
+ } else {
+ i = len;
+ scaled_s_kernel8(&i, a, incx, x);
+ }
+ for (; i < len; i++) {
+ x[incx * i] = a * x[incx * i];
+ }
+}
+
+static
+void
+rotd_kernel16(int *ip, double *x, double *y, double cos, double sin)
+{
+ int i;
+ double tmp[16];
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ tmp[0] = x[i + 0], x[i + 0] = cos * x[i + 0] + sin * y[i + 0], y[i + 0] = cos * y[i + 0] - sin * tmp[0];
+ tmp[1] = x[i + 1], x[i + 1] = cos * x[i + 1] + sin * y[i + 1], y[i + 1] = cos * y[i + 1] - sin * tmp[1];
+ tmp[2] = x[i + 2], x[i + 2] = cos * x[i + 2] + sin * y[i + 2], y[i + 2] = cos * y[i + 2] - sin * tmp[2];
+ tmp[3] = x[i + 3], x[i + 3] = cos * x[i + 3] + sin * y[i + 3], y[i + 3] = cos * y[i + 3] - sin * tmp[3];
+ tmp[4] = x[i + 4], x[i + 4] = cos * x[i + 4] + sin * y[i + 4], y[i + 4] = cos * y[i + 4] - sin * tmp[4];
+ tmp[5] = x[i + 5], x[i + 5] = cos * x[i + 5] + sin * y[i + 5], y[i + 5] = cos * y[i + 5] - sin * tmp[5];
+ tmp[6] = x[i + 6], x[i + 6] = cos * x[i + 6] + sin * y[i + 6], y[i + 6] = cos * y[i + 6] - sin * tmp[6];
+ tmp[7] = x[i + 7], x[i + 7] = cos * x[i + 7] + sin * y[i + 7], y[i + 7] = cos * y[i + 7] - sin * tmp[7];
+ tmp[8] = x[i + 8], x[i + 8] = cos * x[i + 8] + sin * y[i + 8], y[i + 8] = cos * y[i + 8] - sin * tmp[8];
+ tmp[9] = x[i + 9], x[i + 9] = cos * x[i + 9] + sin * y[i + 9], y[i + 9] = cos * y[i + 9] - sin * tmp[9];
+ tmp[10] = x[i + 10], x[i + 10] = cos * x[i + 10] + sin * y[i + 10], y[i + 10] = cos * y[i + 10] - sin * tmp[10];
+ tmp[11] = x[i + 11], x[i + 11] = cos * x[i + 11] + sin * y[i + 11], y[i + 11] = cos * y[i + 11] - sin * tmp[11];
+ tmp[12] = x[i + 12], x[i + 12] = cos * x[i + 12] + sin * y[i + 12], y[i + 12] = cos * y[i + 12] - sin * tmp[12];
+ tmp[13] = x[i + 13], x[i + 13] = cos * x[i + 13] + sin * y[i + 13], y[i + 13] = cos * y[i + 13] - sin * tmp[13];
+ tmp[14] = x[i + 14], x[i + 14] = cos * x[i + 14] + sin * y[i + 14], y[i + 14] = cos * y[i + 14] - sin * tmp[14];
+ tmp[15] = x[i + 15], x[i + 15] = cos * x[i + 15] + sin * y[i + 15], y[i + 15] = cos * y[i + 15] - sin * tmp[15];
+ }
+ *ip = i;
+}
+
+static
+void
+rotd_s_kernel8(int *ip, double *y, int incx, double sin, int incy, double *x, double cos)
+{
+ int i;
+ double tmp[8];
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = cos * x[incx * (i + 0)] + sin * y[incy * (i + 0)], y[incy * (i + 0)] = cos * y[incy * (i + 0)] - sin * tmp[0];
+ tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = cos * x[incx * (i + 1)] + sin * y[incy * (i + 1)], y[incy * (i + 1)] = cos * y[incy * (i + 1)] - sin * tmp[1];
+ tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = cos * x[incx * (i + 2)] + sin * y[incy * (i + 2)], y[incy * (i + 2)] = cos * y[incy * (i + 2)] - sin * tmp[2];
+ tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = cos * x[incx * (i + 3)] + sin * y[incy * (i + 3)], y[incy * (i + 3)] = cos * y[incy * (i + 3)] - sin * tmp[3];
+ tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = cos * x[incx * (i + 4)] + sin * y[incy * (i + 4)], y[incy * (i + 4)] = cos * y[incy * (i + 4)] - sin * tmp[4];
+ tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = cos * x[incx * (i + 5)] + sin * y[incy * (i + 5)], y[incy * (i + 5)] = cos * y[incy * (i + 5)] - sin * tmp[5];
+ tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = cos * x[incx * (i + 6)] + sin * y[incy * (i + 6)], y[incy * (i + 6)] = cos * y[incy * (i + 6)] - sin * tmp[6];
+ tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = cos * x[incx * (i + 7)] + sin * y[incy * (i + 7)], y[incy * (i + 7)] = cos * y[incy * (i + 7)] - sin * tmp[7];
+ }
+ *ip = i;
+}
+
+void
+blas·rotd(int len, double *x, int incx, double *y, int incy, double cos, double sin)
+{
+ int i;
+ double tmp;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ rotd_kernel16(&i, x, y, cos, sin);
+ } else {
+ i = len;
+ rotd_s_kernel8(&i, y, incx, sin, incy, x, cos);
+ }
+ for (; i < len; i++) {
+ tmp = x[incx * i], x[incx * i] = cos * x[incx * i] + sin * y[incy * i], y[incy * i] = cos * y[incy * i] - sin * tmp;
+ }
+}
+
+static
+void
+rotgd_kernel16(int *ip, double *y, double H[5], double *x)
+{
+ double tmp[16];
+ int i;
+ int len;
+
+ len = *ip & ~15;
+ for (i = 0; i < len; i += 16) {
+ tmp[0] = x[i + 0], x[i + 0] = H[1] * x[i + 0] + H[2] * y[i + 0], y[i + 0] = H[3] * y[i + 0] + H[4] * tmp[0];
+ tmp[1] = x[i + 1], x[i + 1] = H[1] * x[i + 1] + H[2] * y[i + 1], y[i + 1] = H[3] * y[i + 1] + H[4] * tmp[1];
+ tmp[2] = x[i + 2], x[i + 2] = H[1] * x[i + 2] + H[2] * y[i + 2], y[i + 2] = H[3] * y[i + 2] + H[4] * tmp[2];
+ tmp[3] = x[i + 3], x[i + 3] = H[1] * x[i + 3] + H[2] * y[i + 3], y[i + 3] = H[3] * y[i + 3] + H[4] * tmp[3];
+ tmp[4] = x[i + 4], x[i + 4] = H[1] * x[i + 4] + H[2] * y[i + 4], y[i + 4] = H[3] * y[i + 4] + H[4] * tmp[4];
+ tmp[5] = x[i + 5], x[i + 5] = H[1] * x[i + 5] + H[2] * y[i + 5], y[i + 5] = H[3] * y[i + 5] + H[4] * tmp[5];
+ tmp[6] = x[i + 6], x[i + 6] = H[1] * x[i + 6] + H[2] * y[i + 6], y[i + 6] = H[3] * y[i + 6] + H[4] * tmp[6];
+ tmp[7] = x[i + 7], x[i + 7] = H[1] * x[i + 7] + H[2] * y[i + 7], y[i + 7] = H[3] * y[i + 7] + H[4] * tmp[7];
+ tmp[8] = x[i + 8], x[i + 8] = H[1] * x[i + 8] + H[2] * y[i + 8], y[i + 8] = H[3] * y[i + 8] + H[4] * tmp[8];
+ tmp[9] = x[i + 9], x[i + 9] = H[1] * x[i + 9] + H[2] * y[i + 9], y[i + 9] = H[3] * y[i + 9] + H[4] * tmp[9];
+ tmp[10] = x[i + 10], x[i + 10] = H[1] * x[i + 10] + H[2] * y[i + 10], y[i + 10] = H[3] * y[i + 10] + H[4] * tmp[10];
+ tmp[11] = x[i + 11], x[i + 11] = H[1] * x[i + 11] + H[2] * y[i + 11], y[i + 11] = H[3] * y[i + 11] + H[4] * tmp[11];
+ tmp[12] = x[i + 12], x[i + 12] = H[1] * x[i + 12] + H[2] * y[i + 12], y[i + 12] = H[3] * y[i + 12] + H[4] * tmp[12];
+ tmp[13] = x[i + 13], x[i + 13] = H[1] * x[i + 13] + H[2] * y[i + 13], y[i + 13] = H[3] * y[i + 13] + H[4] * tmp[13];
+ tmp[14] = x[i + 14], x[i + 14] = H[1] * x[i + 14] + H[2] * y[i + 14], y[i + 14] = H[3] * y[i + 14] + H[4] * tmp[14];
+ tmp[15] = x[i + 15], x[i + 15] = H[1] * x[i + 15] + H[2] * y[i + 15], y[i + 15] = H[3] * y[i + 15] + H[4] * tmp[15];
+ }
+ *ip = i;
+}
+
+static
+void
+rotgd_s_kernel8(int *ip, int incx, double *y, int incy, double H[5], double *x)
+{
+ double tmp[8];
+ int i;
+ int len;
+
+ len = *ip & ~7;
+ for (i = 0; i < len; i += 8) {
+ tmp[0] = x[incx * (i + 0)], x[incx * (i + 0)] = H[1] * x[incx * (i + 0)] + H[2] * y[incy * (i + 0)], y[incy * (i + 0)] = H[3] * y[incy * (i + 0)] + H[4] * tmp[0];
+ tmp[1] = x[incx * (i + 1)], x[incx * (i + 1)] = H[1] * x[incx * (i + 1)] + H[2] * y[incy * (i + 1)], y[incy * (i + 1)] = H[3] * y[incy * (i + 1)] + H[4] * tmp[1];
+ tmp[2] = x[incx * (i + 2)], x[incx * (i + 2)] = H[1] * x[incx * (i + 2)] + H[2] * y[incy * (i + 2)], y[incy * (i + 2)] = H[3] * y[incy * (i + 2)] + H[4] * tmp[2];
+ tmp[3] = x[incx * (i + 3)], x[incx * (i + 3)] = H[1] * x[incx * (i + 3)] + H[2] * y[incy * (i + 3)], y[incy * (i + 3)] = H[3] * y[incy * (i + 3)] + H[4] * tmp[3];
+ tmp[4] = x[incx * (i + 4)], x[incx * (i + 4)] = H[1] * x[incx * (i + 4)] + H[2] * y[incy * (i + 4)], y[incy * (i + 4)] = H[3] * y[incy * (i + 4)] + H[4] * tmp[4];
+ tmp[5] = x[incx * (i + 5)], x[incx * (i + 5)] = H[1] * x[incx * (i + 5)] + H[2] * y[incy * (i + 5)], y[incy * (i + 5)] = H[3] * y[incy * (i + 5)] + H[4] * tmp[5];
+ tmp[6] = x[incx * (i + 6)], x[incx * (i + 6)] = H[1] * x[incx * (i + 6)] + H[2] * y[incy * (i + 6)], y[incy * (i + 6)] = H[3] * y[incy * (i + 6)] + H[4] * tmp[6];
+ tmp[7] = x[incx * (i + 7)], x[incx * (i + 7)] = H[1] * x[incx * (i + 7)] + H[2] * y[incy * (i + 7)], y[incy * (i + 7)] = H[3] * y[incy * (i + 7)] + H[4] * tmp[7];
+ }
+ *ip = i;
+}
+
+void
+blas·rotgd(int len, double *x, int incx, double *y, int incy, double H[5])
+{
+ int i;
+ double tmp;
+
+ if (incx == 1 && incy == 1) {
+ i = len;
+ rotgd_kernel16(&i, y, H, x);
+ } else {
+ i = len;
+ rotgd_s_kernel8(&i, incx, y, incy, H, x);
+ }
+ for (; i < len; i++) {
+ tmp = x[incx * i], x[incx * i] = H[1] * x[incx * i] + H[2] * y[incy * i], y[incy * i] = H[3] * y[incy * i] + H[4] * tmp;
+ }
+}
+
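The routines above follow the usual BLAS level-1 calling convention: an element count, raw pointers, and explicit strides, with float and double variants distinguished by the f/d suffix. A minimal caller sketch, in which the function orthogonalize is hypothetical (not part of the library) and <libmath/blas.h> is assumed to carry the prototypes, as the linalg.c hunk further down suggests:

#include <u.h>
#include <libn.h>
#include <libmath.h>
#include <libmath/blas.h>

/* remove from x its component along y (one Gram-Schmidt step), unit strides */
void
orthogonalize(int n, double *x, double *y)
{
	double nrm, coef;

	nrm  = blas·normd(n, y, 1);                    /* ||y|| */
	coef = blas·dotd(n, x, 1, y, 1) / (nrm * nrm); /* (x·y) / ||y||^2 */
	blas·axpyd(n, -coef, y, 1, x, 1);              /* x <- x - coef*y */
}

The same call shapes carry over to the float variants (blas·normf, blas·dotf, blas·axpyf).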
+
diff --git a/sys/libmath/gen1.py b/sys/libmath/gen1.py
new file mode 100755
index 0000000..b0f9ecc
--- /dev/null
+++ b/sys/libmath/gen1.py
@@ -0,0 +1,357 @@
+from C import *
+
+NUNROLL = 16
+def pkg(name):
+ return f"blas·{name}"
+
+def typeify(string, kind):
+ if (kind == Float32):
+ return f"{string}f"
+ if (kind == Float64):
+ return f"{string}d"
+
+def fini(func, loop, strided, calls, ret=()):
+ func.execute(*calls[:2])
+ func = Strided(func, loop, NUNROLL//2, strided, *ret)
+ func.execute(*calls[2:])
+ func.emit()
+
+# ------------------------------------------------------------------------
+# Blas level 1 functions
+
+def copy(kind):
+ name = typeify("copy", kind)
+ F = Func(pkg(name), Void,
+ Params(
+ (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y")
+ ),
+ Vars(
+ (Int, "i")
+ )
+ )
+
+ len, x, y, i = F.variables("len", "x", "y", "i")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ Set(Index(y, i), Index(x, i))
+ )
+
+ kernel, calls = Unroll(name, loop, NUNROLL)
+ kernel.emit()
+
+ # F.execute(*calls)
+ # F.emit()
+ fini(F, loop, F.params[1:3], calls)
+
+def swap(kind):
+ name = typeify("swap", kind)
+ F = Func(pkg(name), Void,
+ Params(
+ (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y")
+ ),
+ Vars(
+ (Int, "i"), (kind, "tmp")
+ )
+ )
+
+ len, x, y, i, tmp = F.variables("len", "x", "y", "i", "tmp")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ Swap(Index(x, i), Index(y, i), tmp)
+ )
+
+ kernel, calls = Unroll(name, loop, NUNROLL)
+ kernel.emit()
+
+ # F.execute(*calls)
+ # F.emit()
+ fini(F, loop, F.params[1:3], calls)
+
+def axpby(kind):
+ name = typeify("axpby", kind)
+ F = Func(pkg(name), Void,
+ Params(
+ (Int, "len"), (kind, "a"), (Ptr(kind), "x"), (kind, "b"), (Ptr(kind), "y")
+ ),
+ Vars(
+ (Int, "i"),
+ )
+ )
+
+ len, x, y, i, a, b = F.variables("len", "x", "y", "i", "a", "b")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ Set(Index(y, i),
+ Mul(b,
+ Add(Index(y, i),
+ Mul(a, Index(x, i)),
+ )
+ )
+ )
+ )
+ kernel, calls = Unroll(name, loop, NUNROLL)
+ kernel.emit()
+
+ # F.execute(*calls)
+ # F.emit()
+ fini(F, loop, [F.params[2], F.params[4]], calls)
+
+def axpy(kind):
+ name = typeify("axpy", kind)
+ F = Func(pkg(name), Void,
+ Params(
+ (Int, "len"), (kind, "a"), (Ptr(kind), "x"), (Ptr(kind), "y")
+ ),
+ Vars(
+ (Int, "i"),
+ )
+ )
+
+ len, x, y, i, a = F.variables("len", "x", "y", "i", "a")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ Set(Index(y, i),
+ Add(Index(y, i),
+ Mul(a, Index(x, i)),
+ )
+ )
+ )
+ kernel, calls = Unroll(name, loop, NUNROLL)
+ kernel.emit()
+
+ fini(F, loop, F.params[2:4], calls)
+ # F.execute(*calls)
+ # F.emit()
+
+def argminmax(kind):
+ for operation in [("min", LT), ("max", GT)]:
+ name = typeify(f"arg{operation[0]}", kind)
+ F = Func(pkg(name), Int,
+ Params(
+ (Int, "len"), (Ptr(kind), "x"),
+ ),
+ Vars(
+ (Int, "i"), (Int, "ix"), (kind, operation[0])
+ )
+ )
+ len, x, i, ix, op = F.variables("len", "x", "i", "ix", operation[0])
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ If(operation[1](Index(x, i), op),
+ Block(
+ Set(ix, i),
+ Set(op, Index(x, ix)),
+ )
+ )
+ )
+
+ ret = (ix, lambda ires, icur, node:
+ If(operation[1](Index(x, Index(node, icur)), Index(x, Index(node, ires))),
+ Block(Set(Index(node, ires), Index(node, icur)))
+ )
+ )
+
+ kernel, calls = Unroll(name, loop, NUNROLL, *ret)
+ kernel.emit()
+
+ # F.execute(*calls, Return(ix))
+ # F.emit()
+ fini(F, loop, [F.params[1]], calls[:2] + [Set(op, Index(x, ix))] + calls[2:] + [Return(ix)], ret)
+
+def dot(kind):
+ nm = typeify("dot", kind)
+ F = Func(pkg(nm), kind,
+ Params(
+ (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y")
+ ),
+ Vars(
+ (Int, "i"), (kind, "sum"),
+ )
+ )
+ len, x, i, y, sum = F.variables("len", "x", "i", "y", "sum")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ AddSet(sum, Mul(Index(x, i), Index(y, i)))
+ )
+ ret = (sum, lambda ires, icur, node: StmtExpr(AddSet(Index(node, ires), Index(node, icur))))
+
+ kernel, calls = Unroll(nm, loop, NUNROLL, *ret)
+ kernel.emit()
+
+ # F.execute(*calls, Return(sum))
+ fini(F, loop, F.params[1:3], calls + [Return(sum)], ret)
+
+def norm(kind):
+ nm = typeify("norm", kind)
+ F = Func(pkg(nm), kind,
+ Params(
+ (Int, "len"), (Ptr(kind), "x")
+ ),
+ Vars(
+ (Int, "i"), (kind, "nrm"),
+ )
+ )
+ len, x, i, nrm = F.variables("len", "x", "i", "nrm")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ AddSet(nrm, Mul(Index(x, i), Index(x, i)))
+ )
+
+ ret = (nrm, lambda ires, icur, node:
+ StmtExpr(AddSet(Index(node, ires), Index(node, icur)))
+ )
+
+ kernel, calls = Unroll(nm, loop, NUNROLL, *ret)
+ kernel.emit()
+
+ if kind == Float64:
+ sqrt = Func("math·sqrt", kind)
+ elif kind == Float32:
+ sqrt = Func("math·sqrtf", kind)
+ else:
+ raise ValueError(f"no sqrt for type {kind}")
+
+ # F.execute(*calls, Return(Call(sqrt, [nrm])))
+ # F.emit()
+ fini(F, loop, [F.params[1]], calls + [Return(Call(sqrt, [nrm]))], ret)
+
+def sum(kind):
+ name = typeify("sum", kind)
+ F = Func(pkg(name), kind,
+ Params(
+ (Int, "len"), (Ptr(kind), "x"),
+ ),
+ Vars(
+ (Int, "i"), (kind, "sum")
+ )
+ )
+
+ len, x, i, sum = F.variables("len", "x", "i", "sum")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ AddSet(sum, Index(x, i))
+ )
+
+ ret = (sum, lambda ires, icur, node:
+ StmtExpr(AddSet(Index(node, ires), Index(node, icur)))
+ )
+
+ kernel, calls = Unroll(name, loop, NUNROLL, *ret)
+ kernel.emit()
+
+ fini(F, loop, [F.params[1]], calls + [Return(sum)], ret)
+ # F.execute(*calls, Return(sum))
+ # F.emit()
+
+def scale(kind):
+ name = typeify("scale", kind)
+ F = Func(pkg(name), Void,
+ Params(
+ (Int, "len"), (Ptr(kind), "x"), (kind, "a")
+ ),
+ Vars(
+ (Int, "i"),
+ )
+ )
+
+ len, a, x, i = F.variables("len", "a", "x", "i")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(Set(Index(x, i), Mul(a, Index(x, i))))
+
+ kernel, calls = Unroll(name, loop, NUNROLL)
+ kernel.emit()
+
+ fini(F, loop, [F.params[1]], calls)
+ # F.execute(*calls)
+ # F.emit()
+
+def rot(kind):
+ name = typeify("rot", kind)
+ F = Func(pkg(name), Void,
+ Params(
+ (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y"), (kind, "cos"), (kind, "sin")
+ ),
+ Vars(
+ (Int, "i"), (kind, "tmp")
+ )
+ )
+
+ len, x, y, i, tmp, cos, sin = F.variables("len", "x", "y", "i", "tmp", "cos", "sin")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ Comma(Set(tmp, Index(x, i)),
+ Comma(
+ Set(Index(x, i), Add(Mul(cos, Index(x, i)), Mul(sin, Index(y, i)))),
+ Set(Index(y, i), Sub(Mul(cos, Index(y, i)), Mul(sin, tmp)))
+ )
+ )
+ )
+
+ kernel, calls = Unroll(name, loop, NUNROLL)
+ kernel.emit()
+
+ fini(F, loop, [F.params[1], F.params[2]], calls)
+ # F.execute(*calls)
+ # F.emit()
+
+def rotm(kind):
+ name = typeify("rotg", kind)
+ F = Func(pkg(name), Void,
+ Params(
+ (Int, "len"), (Ptr(kind), "x"), (Ptr(kind), "y"), (Array(kind, 5), "H")
+ ),
+ Vars(
+ (Int, "i"), (kind, "tmp")
+ )
+ )
+
+ len, x, y, i, tmp, H = F.variables("len", "x", "y", "i", "tmp", "H")
+
+ loop = For(Set(i, I(0)), LT(i, len), [Inc(i)])
+ loop.execute(
+ Comma(Set(tmp, Index(x, i)),
+ Comma(
+ Set(Index(x, i), Add(Mul(Index(H, I(1)), Index(x, i)), Mul(Index(H, I(2)), Index(y, i)))),
+ Set(Index(y, i), Add(Mul(Index(H, I(3)), Index(y, i)), Mul(Index(H, I(4)), tmp)))
+ )
+ )
+ )
+
+ kernel, calls = Unroll(name, loop, NUNROLL)
+ kernel.emit()
+ fini(F, loop, F.params[1:3], calls)
+
+if __name__ == "__main__":
+ emit("#include <u.h>\n")
+ emit("#include <libn.h>\n")
+ emit("#include <libmath.h>\n")
+ emitln()
+ emit("/*********************************************************/\n")
+ emit("/* THIS CODE IS GENERATED BY GEN1.PY! DON'T EDIT BY HAND */\n")
+ emit("/*********************************************************/\n")
+ emitln(2)
+
+ # TODO: Implement rotg/rotmg
+ for kind in [Float32, Float64]:
+ argminmax(kind)
+ copy(kind)
+ axpy(kind)
+ axpby(kind)
+ dot(kind)
+ sum(kind)
+ norm(kind)
+ scale(kind)
+ rot(kind)
+ rotm(kind)
+
+ flush()
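Every routine pushed through fini() above lands in blas1.c with the same three-part shape: a 16-way unrolled unit-stride kernel, an 8-way strided kernel, and a public wrapper that runs one kernel over the largest multiple of the unroll width (len & ~15 or len & ~7) and then sweeps up the tail with a scalar loop, the int pointer carrying the resume index back out of the kernel. A hand-written miniature of that pattern, using a hypothetical blas·negd and a 4-way unroll for brevity (the strided kernel is omitted, so non-unit strides fall through to the tail loop):

static void
negd_kernel4(int *ip, double *x)
{
	int i, len;

	len = *ip & ~3;                 /* largest multiple of 4 not exceeding *ip */
	for (i = 0; i < len; i += 4) {
		x[i + 0] = -x[i + 0];
		x[i + 1] = -x[i + 1];
		x[i + 2] = -x[i + 2];
		x[i + 3] = -x[i + 3];
	}
	*ip = i;                        /* resume index for the caller's tail loop */
}

void
blas·negd(int len, double *x, int incx)
{
	int i;

	if (incx == 1) {
		i = len;
		negd_kernel4(&i, x);    /* fast path; a strided kernel would slot in here otherwise */
	} else {
		i = 0;                  /* no strided kernel in this sketch: handle everything below */
	}
	for (; i < len; i++)            /* scalar tail over the remaining elements */
		x[incx * i] = -x[incx * i];
}

The generated wrappers differ only in that the 8-way strided kernel takes the else branch.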
diff --git a/sys/libmath/linalg.c b/sys/libmath/linalg.c
index 09e100c..8551ff1 100644
--- a/sys/libmath/linalg.c
+++ b/sys/libmath/linalg.c
@@ -1,6 +1,7 @@
#include <u.h>
#include <libn.h>
#include <libmath.h>
+#include <libmath/blas.h>
// -----------------------------------------------------------------------
// Vector
@@ -10,8 +11,8 @@ linalg·normalize(math·Vector vec)
{
double norm;
- norm = blas·norm(vec.len, vec.data, 1);
- blas·scale(vec.len, 1/norm, vec.data, 1);
+ norm = blas·normd(vec.len, vec.data, 1);
+ blas·scaled(vec.len, vec.data, 1, 1/norm);
}
// TODO: Write blas wrappers that eat vectors for convenience
@@ -50,12 +51,12 @@ linalg·lq(math·Matrix m, math·Vector w)
len = m.dim[0] - i;
// TODO: Don't want to compute norm twice!!
- w.data[0] = math·sgn(row[0]) * blas·norm(len, row, 1);
- blas·axpy(len, 1.0, row, 1, w.data, 1);
- mag = blas·norm(len, w.data, 1);
- blas·scale(len, 1/mag, w.data, 1);
+ w.data[0] = math·sgn(row[0]) * blas·normd(len, row, 1);
+ blas·axpyd(len, 1.0, row, 1, w.data, 1);
+ mag = blas·normd(len, w.data, 1);
+ blas·scaled(len, w.data, 1, 1/mag);
- blas·copy(len - m.dim[0], w.data, 1, m.data + i, 1);
+ blas·copyd(len - m.dim[0], w.data, 1, m.data + i, 1);
}
return err·nil;
diff --git a/sys/libmath/proto.c b/sys/libmath/proto.c
deleted file mode 100644
index 63f8856..0000000
--- a/sys/libmath/proto.c
+++ /dev/null
@@ -1,1761 +0,0 @@
-#include <u.h>
-#include <libn.h>
-#include <libmath.h>
-#include <vendor/blas/cblas.h>
-
-#include <x86intrin.h>
-#include <time.h>
-
-#define EVEN_BY(x, n) (x) & ~((n)-1)
-
-// -----------------------------------------------------------------------
-// misc functions
-
-static
-inline
-double
-hsum_avx2(__m256d x)
-{
- __m128d lo128, hi128, hi64;
-
- lo128 = _mm256_castpd256_pd128(x);
- hi128 = _mm256_extractf128_pd(x, 1);
- lo128 = _mm_add_pd(lo128, hi128);
-
- hi64 = _mm_unpackhi_pd(lo128, lo128);
-
- return _mm_cvtsd_f64(_mm_add_sd(lo128, hi64));
-}
-
-
-// -----------------------------------------------------------------------
-// level one
-
-/*
- * rotate vector
- * x = cos*x + sin*y
- * y = cos*x - sin*y
- */
-
-static
-void
-rot_kernel8_avx2(int n, double *x, double *y, double cos, double sin)
-{
- register int i;
- __m256d x256, y256;
- __m128d cos128, sin128;
- __m256d cos256, sin256;
-
- cos128 = _mm_load_sd(&cos);
- cos256 = _mm256_broadcastsd_pd(cos128);
-
- sin128 = _mm_load_sd(&sin);
- sin256 = _mm256_broadcastsd_pd(sin128);
-
- for (i = 0; i < n; i+=8) {
- x256 = _mm256_loadu_pd(x+i+0);
- y256 = _mm256_loadu_pd(y+i+0);
- _mm256_storeu_pd(x+i+0, cos256 * x256 + sin256 * y256);
- _mm256_storeu_pd(y+i+0, cos256 * y256 - sin256 * x256);
-
- x256 = _mm256_loadu_pd(x+i+4);
- y256 = _mm256_loadu_pd(y+i+4);
- _mm256_storeu_pd(x+i+4, cos256 * x256 + sin256 * y256);
- _mm256_storeu_pd(y+i+4, cos256 * y256 - sin256 * x256);
- }
-}
-
-static
-void
-rot_kernel8(int n, double *x, double *y, double cos, double sin)
-{
- register int i;
- register double tmp;
-
- for (i = 0; i < n; i+=8) {
- tmp = x[i+0], x[i+0] = cos*x[i+0] + sin*y[i+0], y[i+0] = cos*y[i+0] - sin*tmp;
- tmp = x[i+1], x[i+1] = cos*x[i+1] + sin*y[i+1], y[i+1] = cos*y[i+1] - sin*tmp;
- tmp = x[i+2], x[i+2] = cos*x[i+2] + sin*y[i+2], y[i+2] = cos*y[i+2] - sin*tmp;
- tmp = x[i+3], x[i+3] = cos*x[i+3] + sin*y[i+3], y[i+3] = cos*y[i+3] - sin*tmp;
- tmp = x[i+4], x[i+4] = cos*x[i+4] + sin*y[i+4], y[i+4] = cos*y[i+4] - sin*tmp;
- tmp = x[i+5], x[i+5] = cos*x[i+5] + sin*y[i+5], y[i+5] = cos*y[i+5] - sin*tmp;
- tmp = x[i+6], x[i+6] = cos*x[i+6] + sin*y[i+6], y[i+6] = cos*y[i+6] - sin*tmp;
- tmp = x[i+7], x[i+7] = cos*x[i+7] + sin*y[i+7], y[i+7] = cos*y[i+7] - sin*tmp;
- }
-}
-
-void
-blas·rot(int len, double *x, int incx, double *y, int incy, double cos, double sin)
-{
- register int i, n;
- register double tmp;
-
- if (incx == 1 && incy == 1) {
- n = EVEN_BY(len, 8);
- rot_kernel8_avx2(n, x, y, cos, sin);
- x += n;
- y += n;
- } else {
- n = EVEN_BY(len, 4);
- for (i = 0; i < n; i += 4, x += 4*incx, y += 4*incy) {
- tmp = x[0*incx], x[0*incx] = cos*x[0*incx] + sin*y[0*incy], y[0*incy] = cos*y[0*incy] - sin*tmp;
- tmp = x[1*incx], x[1*incx] = cos*x[1*incx] + sin*y[1*incy], y[1*incy] = cos*y[1*incy] - sin*tmp;
- tmp = x[2*incx], x[2*incx] = cos*x[2*incx] + sin*y[2*incy], y[2*incy] = cos*y[2*incy] - sin*tmp;
- tmp = x[3*incx], x[3*incx] = cos*x[3*incx] + sin*y[3*incy], y[3*incy] = cos*y[3*incy] - sin*tmp;
- }
- }
-
- for (; n < len; n++, x += incx, y += incy) {
- tmp = x[0], x[0] = cos*x[0] + sin*y[0], y[0] = cos*y[0] - sin*tmp;
- }
-
-}
-
-/*
- * compute givens rotate vector
- * -- --
- * |+cos -sin| | a | = | r |
- * |-sin +cos| | b | = | 0 |
- * -- --
- */
-
-void
-blas·rotg(double *a, double *b, double *cos, double *sin)
-{
- double abs_a, abs_b, r, rho, scale, z;
-
- abs_a = math·abs(*a);
- abs_b = math·abs(*b);
- rho = abs_a > abs_b ? *a : *b;
- scale = abs_a + abs_b;
-
- if (scale == 0) {
- *cos = 1, *sin = 0;
- r = 0.;
- z = 0.;
- } else {
- r = math·sgn(rho) * scale * math·sqrt(math·pow(abs_a/scale, 2) + math·pow(abs_b/scale, 2));
- *cos = *a / r;
- *sin = *b / r;
- if (abs_a > abs_b)
- z = *sin;
- else if (abs_b >= abs_a && *cos != 0)
- z = 1/(*cos);
- else
- z = 1.;
- }
- *a = r;
- *b = z;
-}
-
-/*
- * modified Givens rotation of points in plane
- * operates on len points
- *
- * params = [flag, h11, h12, h21, h22]
- * NOTE: This is row major as opposed to other implementations
- *
- * Flags correspond to:
- * @flag = -1:
- * H -> [ [h11, h12], [h21, h22] ]
- * @flag = 0.0:
- * H -> [ [1, h12], [h21, 1] ]
- * @flag = +1:
- * H -> [ [h11, 1], [-1, h22] ]
- * @flag = -2:
- * H -> [ [1, 0], [0, 1] ]
- * @flag = *
- * return error
- *
- * Replaces:
- * x -> H11 * x + H12 * y
- * y -> H21 * x + H22 * y
- */
-
-static
-void
-rotm_kernel8_avx2(int n, double *x, double *y, double H[4])
-{
- register int i;
- __m256d x256, y256;
- __m256d H256[4];
-
- for (i = 0; i < 4; i++) {
- H256[i] = _mm256_broadcastsd_pd(_mm_load_sd(H+i));
- }
-
- for (i = 0; i < n; i+=8) {
- x256 = _mm256_loadu_pd(x+i+0);
- y256 = _mm256_loadu_pd(y+i+0);
- _mm256_storeu_pd(x+i+0, H256[0] * x256 + H256[1] * y256);
- _mm256_storeu_pd(y+i+0, H256[2] * x256 + H256[3] * y256);
-
- x256 = _mm256_loadu_pd(x+i+4);
- y256 = _mm256_loadu_pd(y+i+4);
- _mm256_storeu_pd(x+i+4, H256[0] * x256 + H256[1] * y256);
- _mm256_storeu_pd(y+i+4, H256[2] * x256 + H256[3] * y256);
- }
-}
-
-static
-void
-rotm_kernel8(int n, double *x, double *y, double H[4])
-{
- register int i;
- register double tmp;
-
- for (i = 0; i < n; i+=8) {
- tmp = x[i+0], x[i+0] = H[0]*x[i+0] + H[1]*y[i+0], y[i+0] = H[2]*tmp + H[3]*y[i+0];
- tmp = x[i+1], x[i+1] = H[0]*x[i+1] + H[1]*y[i+1], y[i+1] = H[2]*tmp + H[3]*y[i+1];
- tmp = x[i+2], x[i+2] = H[0]*x[i+2] + H[1]*y[i+2], y[i+2] = H[2]*tmp + H[3]*y[i+2];
- tmp = x[i+3], x[i+3] = H[0]*x[i+3] + H[1]*y[i+3], y[i+3] = H[2]*tmp + H[3]*y[i+3];
- tmp = x[i+4], x[i+4] = H[0]*x[i+4] + H[1]*y[i+4], y[i+4] = H[2]*tmp + H[3]*y[i+4];
- tmp = x[i+5], x[i+5] = H[0]*x[i+5] + H[1]*y[i+5], y[i+5] = H[2]*tmp + H[3]*y[i+5];
- tmp = x[i+6], x[i+6] = H[0]*x[i+6] + H[1]*y[i+6], y[i+6] = H[2]*tmp + H[3]*y[i+6];
- tmp = x[i+7], x[i+7] = H[0]*x[i+7] + H[1]*y[i+7], y[i+7] = H[2]*tmp + H[3]*y[i+7];
- }
-}
-
-error
-blas·rotm(int len, double *x, int incx, double *y, int incy, double p[5])
-{
- int i, n, flag;
- double tmp, H[4];
-
- flag = math·round(p[0]);
- switch (flag) {
- case -1: H[0] = p[1], H[1] = p[2], H[2] = p[3], H[3] = p[4]; break;
- case 0: H[0] = +1, H[1] = p[2], H[2] = p[3], H[3] = +1; break;
- case +1: H[0] = p[1], H[1] = +1, H[2] = -1, H[3] = p[4]; break;
- case -2: H[0] = +1, H[1] = 0, H[2] = 0, H[3] = +1; break;
- default:
- errorf("rotm: flag '%d' unrecognized", flag);
- return 1;
- }
-
- if (incx == 1 && incy == 1) {
- n = EVEN_BY(len, 8);
- rotm_kernel8_avx2(n, x, y, H);
- x += n;
- y += n;
- } else {
- n = EVEN_BY(len, 4);
- for (i = 0; i < n; i += 4, x += 4*incx, y += 4*incy) {
- tmp = x[0*incx], x[0*incx] = H[0]*x[0*incx] + H[1]*y[0*incy], y[0*incy] = H[2]*tmp + H[3]*y[0*incy];
- tmp = x[1*incx], x[1*incx] = H[0]*x[1*incx] + H[1]*y[1*incy], y[1*incy] = H[2]*tmp + H[3]*y[1*incy];
- tmp = x[2*incx], x[2*incx] = H[0]*x[2*incx] + H[1]*y[2*incy], y[2*incy] = H[2]*tmp + H[3]*y[2*incy];
- tmp = x[3*incx], x[3*incx] = H[0]*x[3*incx] + H[1]*y[3*incy], y[3*incy] = H[2]*tmp + H[3]*y[3*incy];
- }
- }
-
- for (; n < len; n++, x += incx, y += incy) {
- tmp = x[0], x[0] = H[0]*x[0] + H[1]*y[0], y[0] = H[2]*tmp + H[3]*y[0];
- }
-
- return 0;
-}
-
-
-/*
- * scale vector
- * x = ax
- */
-
-static
-void
-scale_kernel8_avx2(int n, double *x, double a)
-{
- __m128d a128;
- __m256d a256;
- register int i;
-
- a128 = _mm_load_sd(&a);
- a256 = _mm256_broadcastsd_pd(a128);
- for (i = 0; i < n; i += 8) {
- _mm256_storeu_pd(x+i+0, a256 * _mm256_loadu_pd(x+i+0));
- _mm256_storeu_pd(x+i+4, a256 * _mm256_loadu_pd(x+i+4));
- }
-}
-
-static
-void
-scale_kernel8(int n, double *x, double a)
-{
- register int i;
- for (i = 0; i < n; i += 8) {
- x[i+0] *= a;
- x[i+1] *= a;
- x[i+2] *= a;
- x[i+3] *= a;
- x[i+4] *= a;
- x[i+5] *= a;
- x[i+6] *= a;
- x[i+7] *= a;
- }
-}
-
-void
-blas·scale(int len, double a, double *x, int inc)
-{
- int n, ix;
-
- if (inc == 1) {
- n = EVEN_BY(len, 8);
- scale_kernel8_avx2(n, x, a);
- ix = n;
- } else {
- n = EVEN_BY(len, 4);
- for (ix = 0; ix < n*inc; ix += 4*inc) {
- x[ix+0*inc] *= a;
- x[ix+1*inc] *= a;
- x[ix+2*inc] *= a;
- x[ix+3*inc] *= a;
- }
- }
- for (; n < len; n++, ix += inc) {
- x[ix] *= a;
- }
-}
-
-/*
- * copy
- * y = x
- */
-
-void
-blas·copy(int len, double *x, int incx, double *y, int incy)
-{
- int n, i, ix, iy;
- if (incx == 1 && incy == 1) {
- memcpy(y, x, sizeof(*x) * len);
- return;
- }
-
- n = EVEN_BY(len, 4);
- for (i = 0, incx = 0, incy = 0; i < n; i+=4, ix+=4*incx, iy+=4*incy) {
- y[iy+0*incy] = x[ix+0*incx];
- y[iy+1*incy] = x[ix+1*incx];
- y[iy+2*incy] = x[ix+2*incx];
- y[iy+3*incy] = x[ix+3*incx];
- }
-
- for (; n < len; n++, ix+=incx, iy+=incy) {
- y[iy] = x[ix];
- }
-}
-
-/*
- * swap
- * y <=> x
- */
-
-static
-void
-swap_kernel8_avx2(int n, double *x, double *y)
-{
- register int i;
- __m256d tmp[2];
- for (i = 0; i < n; i += 8) {
- tmp[0] = _mm256_loadu_pd(x+i+0);
- tmp[1] = _mm256_loadu_pd(y+i+0);
- _mm256_storeu_pd(x+i+0, tmp[1]);
- _mm256_storeu_pd(y+i+0, tmp[0]);
-
- tmp[0] = _mm256_loadu_pd(x+i+4);
- tmp[1] = _mm256_loadu_pd(y+i+4);
- _mm256_storeu_pd(x+i+4, tmp[1]);
- _mm256_storeu_pd(y+i+4, tmp[0]);
- }
-}
-
-static
-void
-swap_kernel8(int n, double *x, double *y)
-{
- register int i;
- register double tmp;
- for (i = 0; i < n; i += 8) {
- tmp = x[i+0], x[i+0] = y[i+0], y[i+0] = tmp;
- tmp = x[i+1], x[i+1] = y[i+1], y[i+1] = tmp;
- tmp = x[i+2], x[i+2] = y[i+2], y[i+2] = tmp;
- tmp = x[i+3], x[i+3] = y[i+3], y[i+3] = tmp;
- tmp = x[i+4], x[i+4] = y[i+4], y[i+4] = tmp;
- tmp = x[i+5], x[i+5] = y[i+5], y[i+5] = tmp;
- tmp = x[i+6], x[i+6] = y[i+6], y[i+6] = tmp;
- tmp = x[i+7], x[i+7] = y[i+7], y[i+7] = tmp;
- }
-}
-
-void
-blas·swap(int len, double *x, int incx, double *y, int incy)
-{
- int n, i, ix, iy;
- double tmp;
-
- if (incx == 1 && incy == 1) {
- n = EVEN_BY(len, 8);
- swap_kernel8(n, x, y);
- ix = n;
- iy = n;
- } else {
- n = EVEN_BY(len, 4);
- for (i = 0, ix = 0, iy = 0; i < n; i += 4, ix += 4*incx, iy += 4*incy) {
- tmp = x[ix + 0*incx], x[ix + 0*incx] = y[iy + 0*incy], y[iy + 0*incy] = tmp;
- tmp = x[ix + 1*incx], x[ix + 1*incx] = y[iy + 1*incy], y[iy + 1*incy] = tmp;
- tmp = x[ix + 2*incx], x[ix + 2*incx] = y[iy + 2*incy], y[iy + 2*incy] = tmp;
- tmp = x[ix + 3*incx], x[ix + 3*incx] = y[iy + 3*incy], y[iy + 3*incy] = tmp;
- }
- }
-
- for (; n < len; n++, ix += incx, iy += incy) {
- tmp = x[ix], x[ix] = y[iy], y[iy] = tmp;
- }
-}
-
-/*
- * daxpy
- * y = ax + y
- */
-
-
-static
-void
-axpy_kernel8_avx2(int n, double a, double *x, double *y)
-{
- __m128d a128;
- __m256d a256;
- register int i;
-
- a128 = _mm_load_sd(&a);
- a256 = _mm256_broadcastsd_pd(a128);
- for (i = 0; i < n; i += 8) {
- _mm256_storeu_pd(y+i+0, _mm256_loadu_pd(y+i+0) + a256 * _mm256_loadu_pd(x+i+0));
- _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(y+i+4) + a256 * _mm256_loadu_pd(x+i+4));
- }
-}
-
-static
-void
-axpy_kernel8(int n, double a, double *x, double *y)
-{
- register int i;
- for (i = 0; i < n; i += 8) {
- y[i+0] += a*x[i+0];
- y[i+1] += a*x[i+1];
- y[i+2] += a*x[i+2];
- y[i+3] += a*x[i+3];
- y[i+4] += a*x[i+4];
- y[i+5] += a*x[i+5];
- y[i+6] += a*x[i+6];
- y[i+7] += a*x[i+7];
- }
-}
-
-void
-blas·axpy(int len, double a, double *x, int incx, double *y, int incy)
-{
- int n, i;
-
- if (incx == 1 && incy == 1) {
- n = EVEN_BY(len, 8);
- axpy_kernel8_avx2(n, a, x, y);
- x += n;
- y += n;
- } else {
- n = EVEN_BY(len, 4);
- for (i = 0; i < n; i += 4, x += 4*incx, y += 4*incy) {
- y[0*incy] += a*x[0*incx];
- y[1*incy] += a*x[1*incx];
- y[2*incy] += a*x[2*incx];
- y[3*incy] += a*x[3*incx];
- }
- }
-
- for (; n < len; n++, x+=incx, y+=incy) {
- *y += a*(*x);
- }
-}
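-
-/*
- * Illustrative usage sketch, not part of the library: blas·axpy as a fused
- * scale-and-accumulate, first contiguous, then strided.  The helper name
- * example·axpy_usage and the literal values are hypothetical.
- */
-static
-void
-example·axpy_usage(void)
-{
-	double x[4] = { 1, 2, 3, 4 };
-	double y[4] = { 10, 20, 30, 40 };
-
-	blas·axpy(4, 2.5, x, 1, y, 1);	/* y = 2.5x + y over all four elements */
-	blas·axpy(2, -1.0, x, 2, y, 2);	/* stride 2: only y[0] and y[2] change */
-}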
-
-/*
- * dot product
- * x·y
- */
-
-static
-double
-dot_kernel8_fma3(int len, double *x, double *y)
-{
- register int i;
- __m256d sum[4];
- __m128d res;
-
- for (i = 0; i < arrlen(sum); i++) {
- sum[i] = _mm256_setzero_pd();
- }
-
- for (i = 0; i < len; i += 16) {
- sum[0] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+0), _mm256_loadu_pd(y+i+0), sum[0]);
- sum[1] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+4), _mm256_loadu_pd(y+i+4), sum[1]);
- sum[2] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+8), _mm256_loadu_pd(y+i+8), sum[2]);
- sum[3] = _mm256_fmadd_pd(_mm256_loadu_pd(x+i+12), _mm256_loadu_pd(y+i+12), sum[3]);
- }
-
- sum[0] += sum[1] + sum[2] + sum[3];
-
- return hsum_avx2(sum[0]);
-}
-
-static
-double
-dot_kernel8_avx2(int len, double *x, double *y)
-{
- register int i;
- __m256d sum[4];
- __m128d res;
-
- for (i = 0; i < arrlen(sum); i++) {
- sum[i] = _mm256_setzero_pd();
- }
-
- for (i = 0; i < len; i += 16) {
- sum[0] += _mm256_loadu_pd(x+i+0) * _mm256_loadu_pd(y+i+0);
- sum[1] += _mm256_loadu_pd(x+i+4) * _mm256_loadu_pd(y+i+4);
- sum[2] += _mm256_loadu_pd(x+i+8) * _mm256_loadu_pd(y+i+8);
- sum[3] += _mm256_loadu_pd(x+i+12) * _mm256_loadu_pd(y+i+12);
- }
-
- sum[0] += sum[1] + sum[2] + sum[3];
-
- return hsum_avx2(sum[0]);
-}
-
-static
-double
-dot_kernel8(int len, double *x, double *y)
-{
-	double res;
-	register int i;
-
-	res = 0;
-	for (i = 0; i < len; i += 8) {
- res += x[i] * y[i] +
- x[i+1] * y[i+1] +
- x[i+2] * y[i+2] +
- x[i+3] * y[i+3] +
- x[i+4] * y[i+4] +
- x[i+5] * y[i+5] +
- x[i+6] * y[i+6] +
- x[i+7] * y[i+7];
- }
-
- return res;
-}
-
-double
-blas·dot(int len, double *x, int incx, double *y, int incy)
-{
- int i, n;
- double mul[4], sum[2];
- if (len == 0) return 0;
-
- sum[0] = 0, sum[1] = 0;
- if (incx == 1 && incy == 1) {
- n = EVEN_BY(len, 16);
- sum[0] = dot_kernel8_fma3(n, x, y);
-
- x += n;
- y += n;
- } else {
- n = EVEN_BY(len, 4);
- for (i = 0; i < n; i += 4, x += 4*incx, y += 4*incy) {
- mul[0] = x[0*incx] * y[0*incy];
- mul[1] = x[1*incx] * y[1*incy];
- mul[2] = x[2*incx] * y[2*incy];
- mul[3] = x[3*incx] * y[3*incy];
-
- sum[0] += mul[0] + mul[2];
- sum[1] += mul[1] + mul[3];
- }
- }
-
- for (; n < len; n++, x += incx, y += incy) {
- sum[0] += x[0] * y[0];
- }
-
- sum[0] += sum[1];
- return sum[0];
-}
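-
-/*
- * Illustrative sketch, not part of the library: a scalar reference for
- * blas·dot with strided access.  blas·dot should agree with this up to
- * reassociation of its partial sums.  example·dot_check is a hypothetical
- * helper name.
- */
-static
-double
-example·dot_check(int len, double *x, int incx, double *y, int incy)
-{
-	int i;
-	double ref;
-
-	ref = 0;
-	for (i = 0; i < len; i++) {
-		ref += x[i*incx] * y[i*incy];
-	}
-	return ref;
-}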
-
-/*
- * euclidean norm
- * ||x||
- */
-double
-blas·norm(int len, double *x, int incx)
-{
- double res;
-
- res = blas·dot(len, x, incx, x, incx);
- res = math·sqrt(res);
-
- return res;
-}
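-
-/*
- * Illustrative sketch, not part of the library: blas·norm squares every
- * element before summing, so it can overflow or underflow for extreme
- * inputs.  A scaled two-pass variant along these lines trades speed for
- * range; example·norm_scaled is a hypothetical helper name.
- */
-static
-double
-example·norm_scaled(int len, double *x, int inc)
-{
-	int i;
-	double big, r, t;
-
-	big = 0;
-	for (i = 0; i < len; i++) {
-		if (math·abs(x[i*inc]) > big) {
-			big = math·abs(x[i*inc]);
-		}
-	}
-	if (big == 0) {
-		return 0;
-	}
-
-	r = 0;
-	for (i = 0; i < len; i++) {
-		t = x[i*inc] / big;	/* every term is now in [-1, 1] */
-		r += t*t;
-	}
-	return big * math·sqrt(r);
-}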
-
-static
-double
-sum_kernel8_avx2(int len, double *x)
-{
- register int i;
- __m256d sum[2];
- __m128d res;
-
- for (i = 0; i < arrlen(sum); i++) {
- sum[i] = _mm256_setzero_pd();
- }
-
- for (i = 0; i < len; i += 8) {
- sum[0] += _mm256_loadu_pd(x+i+0);
- sum[1] += _mm256_loadu_pd(x+i+4);
- }
-
- sum[0] += sum[1];
-
- return hsum_avx2(sum[0]);
-}
-
-static
-double
-sum_kernel8(int len, double *x, double *y)
-{
- double res;
- register int i;
-
- for (i = 0; i < len; i += 8) {
- res += x[i] +
- x[i+1] +
- x[i+2] +
- x[i+3] +
- x[i+4] +
- x[i+5] +
- x[i+6] +
- x[i+7];
- }
-
- return res;
-}
-
-
-/*
- * element sum
- * sum(x_i)
- */
-double
-blas·sum(int len, double *x, int inc)
-{
- int i, n;
-	double res;
-
-	if (len == 0) return 0;
-
-	res = 0;
-	if (inc == 1) {
-		n = EVEN_BY(len, 8);
-		res = sum_kernel8_avx2(n, x);
-		x += n;
-	} else {
-		n = EVEN_BY(len, 4);
-		for (i = 0; i < n; i += 4, x += 4*inc) {
-			res += x[0*inc];
-			res += x[1*inc];
-			res += x[2*inc];
-			res += x[3*inc];
-		}
-	}
-
- for (i = n; i < len; i++, x += inc) {
- res += x[0];
- }
-
- return res;
-}
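-
-/*
- * Illustrative usage sketch, not part of the library: the stride argument
- * lets blas·sum reduce one column of a row-major nrow x ncol matrix.  The
- * helper name example·colsum is hypothetical.
- */
-static
-double
-example·colsum(int nrow, int ncol, double *m, int col)
-{
-	/* column `col` starts at m+col and consecutive entries sit ncol apart */
-	return blas·sum(nrow, m + col, ncol);
-}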
-
-/*
- * argmax
- * i = argmax(x)
- */
-
-int
-argmax_kernel8_avx2(int n, double *x)
-{
-	register int i, msk;
-	int maxidx[4];
- __m256d val, cmp, max;
-
- maxidx[0] = 0, maxidx[1] = 0, maxidx[2] = 0, maxidx[3] = 0;
- max = _mm256_loadu_pd(x);
-
- for (i = 0; i < n; i += 4) {
- val = _mm256_loadu_pd(x+i);
- cmp = _mm256_cmp_pd(val, max, _CMP_GT_OQ);
- msk = _mm256_movemask_pd(cmp);
- switch (msk) {
- case 1:
- max = _mm256_blend_pd(max, val, 1);
- maxidx[0] = i;
- break;
-
- case 2:
- max = _mm256_blend_pd(max, val, 2);
- maxidx[1] = i;
- break;
-
- case 3:
- max = _mm256_blend_pd(max, val, 3);
- maxidx[0] = i;
- maxidx[1] = i;
- break;
-
- case 4:
- max = _mm256_blend_pd(max, val, 4);
- maxidx[2] = i;
- break;
-
- case 5:
- max = _mm256_blend_pd(max, val, 5);
- maxidx[2] = i;
- maxidx[0] = i;
- break;
-
- case 6:
- max = _mm256_blend_pd(max, val, 6);
- maxidx[2] = i;
- maxidx[1] = i;
- break;
-
- case 7:
- max = _mm256_blend_pd(max, val, 7);
- maxidx[2] = i;
- maxidx[1] = i;
- maxidx[0] = i;
- break;
-
- case 8:
- max = _mm256_blend_pd(max, val, 8);
- maxidx[3] = i;
- break;
-
- case 9:
- max = _mm256_blend_pd(max, val, 9);
- maxidx[3] = i;
- maxidx[0] = i;
- break;
-
- case 10:
- max = _mm256_blend_pd(max, val, 10);
- maxidx[3] = i;
- maxidx[1] = i;
- break;
-
- case 11:
- max = _mm256_blend_pd(max, val, 11);
- maxidx[3] = i;
- maxidx[1] = i;
- maxidx[0] = i;
- break;
-
- case 12:
- max = _mm256_blend_pd(max, val, 12);
- maxidx[3] = i;
- maxidx[2] = i;
- break;
-
- case 13:
- max = _mm256_blend_pd(max, val, 13);
- maxidx[3] = i;
- maxidx[2] = i;
- maxidx[0] = i;
- break;
-
- case 14:
- max = _mm256_blend_pd(max, val, 14);
- maxidx[3] = i;
- maxidx[2] = i;
- maxidx[1] = i;
- break;
-
- case 15:
- max = _mm256_blend_pd(max, val, 15);
- maxidx[3] = i;
- maxidx[2] = i;
- maxidx[1] = i;
- maxidx[0] = i;
- break;
-
- case 0:
- default: ;
- }
- }
- maxidx[0] = (x[maxidx[0]+0] > x[maxidx[1]+1]) ? maxidx[0]+0 : maxidx[1]+1;
- maxidx[1] = (x[maxidx[2]+2] > x[maxidx[3]+3]) ? maxidx[2]+2 : maxidx[3]+3;
- maxidx[0] = (x[maxidx[0]] > x[maxidx[1]]) ? maxidx[0] : maxidx[1];
-
- return maxidx[0];
-}
-
-int
-argmax_kernel8(int n, double *x)
-{
-#define SET(d) idx[d] = d, max[d] = x[d]
-#define PUT(d) if (x[i+d] > max[d]) idx[d] = i+d, max[d] = x[i+d]
- int i, idx[8];
- double max[8];
-
- SET(0); SET(1); SET(2); SET(3);
- SET(4); SET(5); SET(6); SET(7);
-
- for (i = 0; i < n; i += 8) {
- PUT(0); PUT(1); PUT(2); PUT(3);
- PUT(4); PUT(5); PUT(6); PUT(7);
- }
-
- n = 0;
- for (i = 1; i < 8; i++ ) {
- if (max[i] > max[n]) {
- n = i;
- }
- }
- return idx[n];
-#undef PUT
-#undef SET
-}
-
-int
-blas·argmax(int len, double *x, int inc)
-{
- int i, ix, n;
- double max;
-
- if (len == 0) {
- return -1;
- }
-
- if (inc == 1) {
- n = EVEN_BY(len, 8);
- ix = argmax_kernel8_avx2(n, x);
- max = x[ix];
- x += n;
- } else {
- n = EVEN_BY(len, 4);
- ix = 0;
- max = x[ix];
- for (i = 0; i < n; i += 4, x += 4*inc) {
- if (x[0*inc] > max) {
- ix = i;
- max = x[0*inc];
- }
- if (x[1*inc] > max) {
- ix = i+1;
- max = x[1*inc];
- }
- if (x[2*inc] > max) {
- ix = i+2;
- max = x[2*inc];
- }
- if (x[3*inc] > max) {
- ix = i+3;
- max = x[3*inc];
- }
- }
- }
-
- for (; n < len; n++, x += inc) {
- if (*x > max) {
- ix = n;
- max = *x;
- }
- }
-
- return ix;
-}
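-
-/*
- * Illustrative usage sketch, not part of the library: blas·argmax returns
- * a logical element index, so the winner lives at offset index*inc in
- * memory.  example·maxval is a hypothetical helper name.
- */
-static
-double
-example·maxval(int len, double *x, int inc)
-{
-	int i;
-
-	i = blas·argmax(len, x, inc);
-	return (i < 0) ? 0 : x[i*inc];
-}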
-
-int
-argmin_kernel8_avx2(int n, double *x)
-{
-	register int i, msk;
-	int minidx[4];
- __m256d val, cmp, min;
-
- minidx[0] = 0, minidx[1] = 0, minidx[2] = 0, minidx[3] = 0;
- min = _mm256_loadu_pd(x);
-
- for (i = 0; i < n; i += 4) {
- val = _mm256_loadu_pd(x+i);
- cmp = _mm256_cmp_pd(val, min, _CMP_LT_OS);
- msk = _mm256_movemask_pd(cmp);
- switch (msk) {
- case 1:
- min = _mm256_blend_pd(min, val, 1);
- minidx[0] = i;
- break;
-
- case 2:
- min = _mm256_blend_pd(min, val, 2);
- minidx[1] = i;
- break;
-
- case 3:
- min = _mm256_blend_pd(min, val, 3);
- minidx[0] = i;
- minidx[1] = i;
- break;
-
- case 4:
- min = _mm256_blend_pd(min, val, 4);
- minidx[2] = i;
- break;
-
- case 5:
- min = _mm256_blend_pd(min, val, 5);
- minidx[2] = i;
- minidx[0] = i;
- break;
-
- case 6:
- min = _mm256_blend_pd(min, val, 6);
- minidx[2] = i;
- minidx[1] = i;
- break;
-
- case 7:
- min = _mm256_blend_pd(min, val, 7);
- minidx[2] = i;
- minidx[1] = i;
- minidx[0] = i;
- break;
-
- case 8:
- min = _mm256_blend_pd(min, val, 8);
- minidx[3] = i;
- break;
-
- case 9:
- min = _mm256_blend_pd(min, val, 9);
- minidx[3] = i;
- minidx[0] = i;
- break;
-
- case 10:
- min = _mm256_blend_pd(min, val, 10);
- minidx[3] = i;
- minidx[1] = i;
- break;
-
- case 11:
- min = _mm256_blend_pd(min, val, 11);
- minidx[3] = i;
- minidx[1] = i;
- minidx[0] = i;
- break;
-
- case 12:
- min = _mm256_blend_pd(min, val, 12);
- minidx[3] = i;
- minidx[2] = i;
- break;
-
- case 13:
- min = _mm256_blend_pd(min, val, 13);
- minidx[3] = i;
- minidx[2] = i;
- minidx[0] = i;
- break;
-
- case 14:
- min = _mm256_blend_pd(min, val, 14);
- minidx[3] = i;
- minidx[2] = i;
- minidx[1] = i;
- break;
-
- case 15:
- min = _mm256_blend_pd(min, val, 15);
- minidx[3] = i;
- minidx[2] = i;
- minidx[1] = i;
- minidx[0] = i;
- break;
-
- case 0:
- default: ;
- }
- }
- minidx[0] = (x[minidx[0]+0] < x[minidx[1]+1]) ? minidx[0]+0 : minidx[1]+1;
- minidx[1] = (x[minidx[2]+2] < x[minidx[3]+3]) ? minidx[2]+2 : minidx[3]+3;
- minidx[0] = (x[minidx[0]] < x[minidx[1]]) ? minidx[0] : minidx[1];
-
- return minidx[0];
-}
-
-
-int
-argmin_kernel8(int n, double *x)
-{
-#define SET(d) idx[d] = d, min[d] = x[d]
-#define PUT(d) if (x[i+d] < min[d]) idx[d] = i+d, min[d] = x[i+d]
- int i, idx[8];
- double min[8];
-
- SET(0); SET(1); SET(2); SET(3);
- SET(4); SET(5); SET(6); SET(7);
-
- for (i = 0; i < n; i += 8) {
- PUT(0); PUT(1); PUT(2); PUT(3);
- PUT(4); PUT(5); PUT(6); PUT(7);
- }
-
- n = 0;
- for (i = 1; i < 8; i++) {
- if (min[i] < min[n]) {
- n = i;
- }
- }
- return idx[n];
-#undef PUT
-#undef SET
-}
-
-int
-blas·argmin(int len, double *x, int inc)
-{
- double min;
- int i, ix, n;
-
- if (len == 0) {
- return -1;
- }
-
- if (inc == 1) {
- n = EVEN_BY(len, 8);
- ix = argmin_kernel8_avx2(n, x);
- min = x[ix];
- x += n;
- } else {
- n = EVEN_BY(len, 4);
- ix = 0;
- min = x[ix];
- for (i = 0; i < n; i += 4, x += 4*inc) {
- if (x[0*inc] < min) {
- ix = i;
- min = x[0*inc];
- }
- if (x[1*inc] < min) {
- ix = i+1;
- min = x[1*inc];
- }
- if (x[2*inc] < min) {
- ix = i+2;
- min = x[2*inc];
- }
- if (x[3*inc] < min) {
- ix = i+3;
- min = x[3*inc];
- }
- }
- }
- for (; n < len; n++, x += inc) {
- if (*x < min) {
- ix = n;
- min = *x;
- }
-
- }
-
- return ix;
-}
-
-// -----------------------------------------------------------------------
-// level two
-
-/*
- * Notation: (number of rows) x (number of columns) _ unroll factor
- * N => variable we sum over
- */
-
-
-// NOTE: All triangular matrix methods are assumed packed and upper for now!
-
-/*
- * triangular shaped transformation
- * x = Mx
- * @M: square triangular
- * TODO(PERF): Diagnose speed issues
- * TODO: Finish all other flag cases!
- */
-void
-blas·tpmv(blas·Flag f, int n, double *m, double *x)
-{
- int i;
- for (i = 0; i < n; m += (n-i), ++x, ++i) {
- *x = blas·dot(n-i, m, 1, x, 1);
- }
-}
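-
-/*
- * Illustrative usage sketch, not part of the library: the packed upper
- * triangular storage assumed above.  For n = 3 the rows {1 2 3}, {4 5},
- * {6} are laid out back to back, n*(n+1)/2 doubles in total.  The helper
- * name example·tpmv_usage and the literal values are hypothetical.
- */
-static
-void
-example·tpmv_usage(void)
-{
-	double m[6] = { 1, 2, 3, 4, 5, 6 };	/* packed upper 3x3 */
-	double x[3] = { 1, 1, 1 };
-
-	blas·tpmv(0, 3, m, x);	/* x = Mx = {6, 9, 6}; flag ignored for now */
-}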
-
-/*
- * solve triangular set of equations
- * x = M^{-1}b
- * @M: square triangular
- * TODO(PERF): Diagnose speed issues
- * TODO: Finish all other flag cases!
- */
-void
-blas·tpsv(blas·Flag f, int n, double *m, double *x)
-{
- int i;
- double r;
-
- x += (n - 1);
- m += ((n * (n+1))/2 - 1);
- for (i = n-1; i >= 0; --i, --x, m -= (n-i)) {
- r = blas·dot(n-i-1, m+1, 1, x+1, 1);
- *x = (*x - r) / *m;
- }
-}
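-
-/*
- * Illustrative usage sketch, not part of the library: blas·tpsv undoes the
- * transformation applied by blas·tpmv, so solving against the packed matrix
- * from the sketch above recovers the original vector.  example·tpsv_usage
- * is a hypothetical helper name.
- */
-static
-void
-example·tpsv_usage(void)
-{
-	double m[6] = { 1, 2, 3, 4, 5, 6 };	/* packed upper 3x3 */
-	double x[3] = { 6, 9, 6 };		/* M * {1, 1, 1}    */
-
-	blas·tpsv(0, 3, m, x);	/* x = M^{-1}x = {1, 1, 1} */
-}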
-
-/*
- * general affine transformation
- * y = aMx + by
- */
-
-static
-void
-gemv_8xN_kernel4_avx2(int ncol, double *row[8], double *x, double *y)
-{
- int c;
- __m128d hr;
- __m256d x256, r256[8];
-
- for (c = 0; c < 8; c++) {
- r256[c] = _mm256_setzero_pd();
- }
-
- for (c = 0; c < ncol; c += 4) {
- x256 = _mm256_loadu_pd(x+c);
- r256[0] += x256 * _mm256_loadu_pd(row[0] + c);
- r256[1] += x256 * _mm256_loadu_pd(row[1] + c);
- r256[2] += x256 * _mm256_loadu_pd(row[2] + c);
- r256[3] += x256 * _mm256_loadu_pd(row[3] + c);
- r256[4] += x256 * _mm256_loadu_pd(row[4] + c);
- r256[5] += x256 * _mm256_loadu_pd(row[5] + c);
- r256[6] += x256 * _mm256_loadu_pd(row[6] + c);
- r256[7] += x256 * _mm256_loadu_pd(row[7] + c);
- }
-
- y[0] = hsum_avx2(r256[0]);
- y[1] = hsum_avx2(r256[1]);
- y[2] = hsum_avx2(r256[2]);
- y[3] = hsum_avx2(r256[3]);
- y[4] = hsum_avx2(r256[4]);
- y[5] = hsum_avx2(r256[5]);
- y[6] = hsum_avx2(r256[6]);
- y[7] = hsum_avx2(r256[7]);
-}
-
-static
-void
-gemv_4xN_kernel4_avx2(int ncol, double *row[4], double *x, double *y)
-{
- int c;
- __m128d hr;
- __m256d x256, r256[4];
-
- for (c = 0; c < 4; c++) {
- r256[c] = _mm256_setzero_pd();
- }
-
- for (c = 0; c < ncol; c += 4) {
- x256 = _mm256_loadu_pd(x+c);
- r256[0] += x256 * _mm256_loadu_pd(row[0] + c);
- r256[1] += x256 * _mm256_loadu_pd(row[1] + c);
- r256[2] += x256 * _mm256_loadu_pd(row[2] + c);
- r256[3] += x256 * _mm256_loadu_pd(row[3] + c);
- }
-
- y[0] = hsum_avx2(r256[0]);
- y[1] = hsum_avx2(r256[1]);
- y[2] = hsum_avx2(r256[2]);
- y[3] = hsum_avx2(r256[3]);
-}
-
-static
-void
-gemv_4xN_kernel4(int n, double *row[4], double *x, double *y)
-{
- int c;
- double res[4];
-
- res[0] = 0.0;
- res[1] = 0.0;
- res[2] = 0.0;
- res[3] = 0.0;
-
- for (c = 0; c < n; c += 4) {
- res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3];
- res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3];
- res[2] += row[2][c+0]*x[c+0] + row[2][c+1]*x[c+1] + row[2][c+2]*x[c+2] + row[2][c+3]*x[c+3];
- res[3] += row[3][c+0]*x[c+0] + row[3][c+1]*x[c+1] + row[3][c+2]*x[c+2] + row[3][c+3]*x[c+3];
- }
-
- y[0] = res[0];
- y[1] = res[1];
- y[2] = res[2];
- y[3] = res[3];
-}
-
-static
-void
-gemv_2xN_kernel4_avx2(int n, double *row[2], double *x, double *y)
-{
- int c;
- __m128d hr;
- __m256d x256, r256[2];
-
- for (c = 0; c < 2; c++) {
- r256[c] = _mm256_setzero_pd();
- }
-
- for (c = 0; c < n; c += 4) {
- x256 = _mm256_loadu_pd(x+c);
- r256[0] += x256 * _mm256_loadu_pd(row[0] + c);
- r256[1] += x256 * _mm256_loadu_pd(row[1] + c);
- }
-
- y[0] = hsum_avx2(r256[0]);
- y[1] = hsum_avx2(r256[1]);
-}
-
-static
-void
-gemv_2xN_kernel4(int n, double *row[2], double *x, double *y)
-{
- int c;
- double res[2];
-
- res[0] = 0.0;
- res[1] = 0.0;
-
- for (c = 0; c < n; c += 4) {
- res[0] += row[0][c+0]*x[c+0] + row[0][c+1]*x[c+1] + row[0][c+2]*x[c+2] + row[0][c+3]*x[c+3];
- res[1] += row[1][c+0]*x[c+0] + row[1][c+1]*x[c+1] + row[1][c+2]*x[c+2] + row[1][c+3]*x[c+3];
- }
-
- y[0] = res[0];
- y[1] = res[1];
-}
-
-static
-void
-gemv_1xN_kernel4_avx2(int n, double *row, double *x, double *y)
-{
-	int c;
-	__m256d r256;
-
-	r256 = _mm256_setzero_pd();
-	for (c = 0; c < n; c += 4) {
-		r256 += _mm256_loadu_pd(row+c) * _mm256_loadu_pd(x+c);
-	}
-
-	*y = hsum_avx2(r256);
-}
-
-static
-void
-gemv_1xN_kernel4(int n, double *row, double *x, double *y)
-{
- int c;
- double res;
-
- res = 0.;
- for (c = 0; c < n; c += 4) {
- res += row[c+0]*x[c+0] + row[c+1]*x[c+1] + row[c+2]*x[c+2] + row[c+3]*x[c+3];
- }
-
- *y = res;
-}
-
-error
-blas·gemv(int nrow, int ncol, double a, double *m, int incm, double *x, int incx, double b, double *y, int incy)
-{
- int c, r, nr, nc;
- double *row[8], res[8];
- enum {
- err·nil,
- err·incm,
- };
-
- if (incm < ncol) {
- errorf("aliased matrix: inc = %d < ncols = %d", incm, ncol);
- return err·incm;
- }
-
- if (incx == 1 && incy == 1) {
- nc = EVEN_BY(ncol, 4);
-
- nr = EVEN_BY(nrow, 8);
- for (r = 0; r < nr; r += 8) {
- row[0] = m + ((r+0) * incm);
- row[1] = m + ((r+1) * incm);
- row[2] = m + ((r+2) * incm);
- row[3] = m + ((r+3) * incm);
- row[4] = m + ((r+4) * incm);
- row[5] = m + ((r+5) * incm);
- row[6] = m + ((r+6) * incm);
- row[7] = m + ((r+7) * incm);
-
- gemv_8xN_kernel4_avx2(nc, row, x, res);
-
- for (c = nc; c < ncol; c++) {
- res[0] += row[0][c]*x[c];
- res[1] += row[1][c]*x[c];
- res[2] += row[2][c]*x[c];
- res[3] += row[3][c]*x[c];
- res[4] += row[4][c]*x[c];
- res[5] += row[5][c]*x[c];
- res[6] += row[6][c]*x[c];
- res[7] += row[7][c]*x[c];
- }
-
- y[r+0] = a*res[0] + b*y[r+0];
- y[r+1] = a*res[1] + b*y[r+1];
- y[r+2] = a*res[2] + b*y[r+2];
- y[r+3] = a*res[3] + b*y[r+3];
- y[r+4] = a*res[4] + b*y[r+4];
- y[r+5] = a*res[5] + b*y[r+5];
- y[r+6] = a*res[6] + b*y[r+6];
- y[r+7] = a*res[7] + b*y[r+7];
- }
-
- nr = EVEN_BY(nrow, 4);
- for (; r < nr; r += 4) {
- row[0] = m + ((r+0) * incm);
- row[1] = m + ((r+1) * incm);
- row[2] = m + ((r+2) * incm);
- row[3] = m + ((r+3) * incm);
-
- gemv_4xN_kernel4_avx2(nc, row, x, res);
-
- for (c = nc; c < ncol; c++) {
- res[0] += row[0][c]*x[c];
- res[1] += row[1][c]*x[c];
- res[2] += row[2][c]*x[c];
- res[3] += row[3][c]*x[c];
- }
-
- y[r+0] = a*res[0] + b*y[r+0];
- y[r+1] = a*res[1] + b*y[r+1];
- y[r+2] = a*res[2] + b*y[r+2];
- y[r+3] = a*res[3] + b*y[r+3];
- }
-
- nr = EVEN_BY(nrow, 2);
- for (; r < nr; r += 2) {
- row[0] = m + ((r+0) * incm);
- row[1] = m + ((r+1) * incm);
- gemv_2xN_kernel4_avx2(nc, row, x, res);
-
- for (c = nc; c < ncol; c++) {
- res[0] += row[0][c]*x[c];
- res[1] += row[1][c]*x[c];
- }
-
- y[r+0] = a*res[0] + b*y[r+0];
- y[r+1] = a*res[1] + b*y[r+1];
- }
-
- for (; r < nrow; r++) {
-			row[0] = m + ((r+0) * incm);
- res[0] = blas·dot(ncol, row[0], 1, x, 1);
- y[r] = a*res[0] + b*y[r];
- }
- }
-
- return 0;
-}
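-
-/*
- * Illustrative usage sketch, not part of the library: driving blas·gemv on
- * a 2x3 row-major matrix.  incm is the leading dimension, i.e. the distance
- * between the starts of consecutive rows, and must be at least ncol.  The
- * helper name example·gemv_usage and the literal values are hypothetical.
- */
-static
-void
-example·gemv_usage(void)
-{
-	double m[6] = {
-		1, 2, 3,
-		4, 5, 6,
-	};
-	double x[3] = { 1, 1, 1 };
-	double y[2] = { 0, 0 };
-
-	blas·gemv(2, 3, 1.0, m, 3, x, 1, 0.0, y, 1);	/* y = Mx = {6, 15} */
-}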
-
-/*
- * rank one addition
- * M = ax(y^T) + M
- * TODO: vectorize kernel
- */
-
-void
-blas·ger(int nrow, int ncol, double a, double *x, double *y, double *m)
-{
- int i, j;
-
- for (i = 0; i < nrow; i++) {
- for (j = 0; j < ncol; j++) {
- m[i+ncol*j] += a * x[i] * y[j];
- }
- }
-}
-
-/*
- * rank one addition
- * M = ax(x^T) + M
- */
-
-void
-blas·her(int n, double a, double *x, double *m)
-{
- blas·ger(n, n, a, x, x, m);
-}
-
-/*
- * symmetric rank one addition
- * M = ax(x^T) + M
- * TODO: vectorize kernel
- */
-
-void
-blas·syr(int nrow, int ncol, double a, double *x, double *m)
-{
- int i, j;
-
- for (i = 0; i < nrow; i++) {
- for (j = 0; j < ncol; j++) {
- m[i+ncol*j] += a * x[i] * x[j];
- }
- }
-}
-
-
-// -----------------------------------------------------------------------
-// level three
-
-/*
- * matrix multiplication
- * m3 = a(m1 * m2) + b(m3)
- * einstein notation:
- * m3_{ij} = a m1_{ik} m2_{kj} + b m3_{ij}
- *
- * n1 = # rows of m1 = # rows of m3
- * n2 = # cols of m2 = # cols of m3
- * n3 = # cols of m1 = # rows of m2
- *
- * TODO: Right now we are 2x slower than OpenBLAS.
- * This is because we use a very simple algorithm.
- */
-
-void
-blas·gemm(int n1, int n2, int n3, double a, double *m1, double *m2, double b, double *m3)
-{
- int i, j, k, len;
-
- // TODO: Is there anyway this computation can be integrated into the one below?
- for (i = 0; i < n1; i++) {
- for (j = 0; j < n2; j++) {
- m3[i + n2*j] *= b;
- }
- }
-
- len = n1 & ~7;
- for (j = 0; j < n2; j++) {
- for (k = 0; k < n3; k++) {
- axpy_kernel8_avx2(len, a * m2[k + n2*j], m1 + n3*k, m3 + n2*j);
-
- // remainder
- for (i = len; i < n1; i++) {
- m3[i + n2*j] += a * m1[i + n3*k] * m2[k + n2*j];
- }
- }
- }
-}
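-
-/*
- * Illustrative sketch, not part of the library: the same loop ordering as
- * above, written without the axpy kernel.  For each (j, k) pair, the n1
- * consecutive doubles of m3 starting at offset n2*j accumulate
- * a*m2[k + n2*j] times the n1 doubles of m1 starting at n3*k, which is
- * exactly one blas·axpy per pair.  example·gemm_reference is a
- * hypothetical helper name.
- */
-static
-void
-example·gemm_reference(int n1, int n2, int n3, double a, double *m1, double *m2, double b, double *m3)
-{
-	int i, j, k;
-
-	for (i = 0; i < n1; i++) {
-		for (j = 0; j < n2; j++) {
-			m3[i + n2*j] *= b;
-		}
-	}
-
-	for (j = 0; j < n2; j++) {
-		for (k = 0; k < n3; k++) {
-			for (i = 0; i < n1; i++) {
-				m3[i + n2*j] += a * m1[i + n3*k] * m2[k + n2*j];
-			}
-		}
-	}
-}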
-
-/*
- * triangular matrix multiplication
- * m2 = a * m1 * m2 _OR_ a * m2 * m1
- * m1 is assumed triangular
- * einstein notation:
- * m2_{ij} = a m1_{ik} m2_{kj} _OR_ a m1_{kj} m2_{ik}
- *
- * nrow = # rows of m2
- * ncol = # cols of m2
- * TODO(PERF): make compute kernel
- * TODO: finish all other flags
- */
-void
-blas·trmm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2)
-{
- int i, j, k, len;
-
- for (i = 0; i < nrow; i++) {
- for (j = 0; j < ncol; j++) {
- for (k = i; k < ncol; k++) {
- m2[i + ncol*j] += a * m1[i + nrow*k] * m2[k + ncol*j];
- }
- }
- }
-}
-
-/*
- * solve triangular matrix system of equations
- * m2 = a * m1^{-1} * m2  _OR_  a * m2 * m1^{-1}
- * m1 is assumed triangular
- *
- * nrow = # rows of m2
- * ncol = # cols of m2
- * TODO: complete stub
- * TODO: finish all other flags
- */
-void
-blas·trsm(blas·Flag f, int nrow, int ncol, double a, double *m1, double *m2)
-{
-}
-
-#define NITER 1000
-#define NCOL 2005
-#define NROW 2005
-
-error
-test·level3()
-{
- vlong i, n;
- clock_t t;
-
- double *x, *y, *m[3];
- double res[2], tprof[2];
-
- // openblas_set_num_threads(1);
-
- x = malloc(sizeof(*x)*NCOL);
- y = malloc(sizeof(*x)*NCOL);
- m[0] = malloc(sizeof(*x)*NCOL*NROW);
- m[1] = malloc(sizeof(*x)*NCOL*NROW);
- m[2] = malloc(sizeof(*x)*NCOL*NROW);
-
- tprof[0] = 0;
- tprof[1] = 0;
-
- for (n = 0; n < NITER; n++) {
- for (i = 0; i < NCOL; i++) {
- x[i] = i*i+1;
- y[i] = i+1;
- }
-
- for (i = 0; i < NCOL*NROW; i++) {
-			m[0][i] = (double)i/(NCOL*NROW);
-			m[1][i] = (double)i*i/((double)NCOL*NROW*NCOL*NROW);
- m[2][i] = 1;
- }
-
- t = clock();
- cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, NCOL, NROW, NROW, 1.2, m[0], NROW, m[1], NROW, 2.8, m[2], NROW);
- t = clock() - t;
- tprof[1] += 1000.*t/CLOCKS_PER_SEC;
- res[1] = cblas_ddot(NROW*NCOL, m[2], 1, m[2], 1);
-
- for (i = 0; i < NCOL; i++) {
- x[i] = i*i+1;
- y[i] = i+1;
- }
-
- for (i = 0; i < NCOL*NROW; i++) {
-			m[0][i] = (double)i/(NCOL*NROW);
-			m[1][i] = (double)i*i/((double)NCOL*NROW*NCOL*NROW);
- m[2][i] = 1;
- }
-
- t = clock();
- blas·gemm(NROW, NROW, NROW, 1.2, m[0], m[1], 2.8, m[2]);
- t = clock() - t;
- tprof[0] += 1000.*t/CLOCKS_PER_SEC;
- res[0] = blas·dot(NROW*NCOL, m[2], 1, m[2], 1);
- }
- printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER);
- printf("--> result (naive): %f\n", res[0]);
- printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER);
- printf("--> result (oblas): %f\n", res[1]);
-
- return 0;
-}
-
-void
-test·level2()
-{
- int i, j, n, it;
- clock_t t;
-
- double *x, *y, *z, *m;
- double tprof[2];
-
- rng·init(0);
-
- tprof[0] = 0;
- tprof[1] = 0;
-
- x = malloc(sizeof(*x)*NCOL);
- y = malloc(sizeof(*x)*NCOL);
- z = malloc(sizeof(*x)*NCOL);
- m = malloc(sizeof(*x)*NROW*NCOL);
-
- for (it = 0; it < NITER; it++) {
- n = 0;
- for (i = 0; i < NROW; i++) {
- x[i] = rng·random();
- y[i] = rng·random();
- for (j = 0; j < NCOL; j++) {
- m[n++] = rng·random() + .1;
- }
- }
-
- memcpy(z, y, NCOL * sizeof(*y));
-
- t = clock();
- blas·gemv(NROW, NCOL, 2, m, NCOL, x, 1, 1.0, y, 1);
- t = clock() - t;
-
- tprof[0] += 1000.*t/CLOCKS_PER_SEC;
-
- t = clock();
-		cblas_dgemv(CblasRowMajor, CblasNoTrans, NROW, NCOL, 2, m, NCOL, x, 1, 1.0, z, 1);
- t = clock() - t;
-
- tprof[1] += 1000.*t/CLOCKS_PER_SEC;
-
- for (i = 0; i < NCOL; i++) {
- if (math·abs(z[i] - y[i])/math·abs(x[i]) > 1e-5) {
- errorf("failure at index %d: %f != %f", i, z[i], y[i]);
- }
- }
- }
-
- printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER);
- printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER);
-}
-
-void
-print·array(int n, double *x)
-{
- double *end;
- printf("[");
- for (end=x+n; x != end; ++x) {
- printf("%f,", *x);
- }
- printf("]\n");
-}
-
-error
-test·level1()
-{
- int ai, ai2, i, n;
- double *x, *y;
- double tprof[2];
- // double params[5];
- clock_t t;
-
-	x = malloc(sizeof(*x)*NCOL);
-	y = malloc(sizeof(*x)*NCOL);
-	rng·init(0);
-
-	tprof[0] = 0;
-	tprof[1] = 0;
-
- // params[0] = -1.;
- // params[1] = 100; params[2] = 20; params[3] = 30; params[4] = 10;
-
- for (n = 0; n < NITER; n++) {
- for (i = 0; i < NCOL; i++) {
- y[i] = rng·random();
- }
- memcpy(x, y, sizeof(*x)*NCOL);
-
- t = clock();
- ai = blas·argmin(NCOL, x, 1);
- t = clock() - t;
- tprof[0] += 1000.*t/CLOCKS_PER_SEC;
-
- if (n == 20729) {
- printf("[%d]=%f vs [%d]=%f\n", 74202, x[74202], 3, x[3]);
- }
- memcpy(x, y, sizeof(*x)*NCOL);
-
- t = clock();
- ai2 = cblas_idamin(NCOL, x, 1);
- t = clock() - t;
- tprof[1] += 1000.*t/CLOCKS_PER_SEC;
-
- if (ai != ai2) {
- printf("iteration %d: %d not equal to %d. %f vs %f\n", n, ai, ai2, x[ai], x[ai2]);
- }
- }
-
- printf("mean time/iteration (naive): %fms\n", tprof[0]/NITER);
- printf("mean time/iteration (oblas): %fms\n", tprof[1]/NITER);
-
- double a, b, c, s;
-
- a = 10.234, b = 2.;
- cblas_drotg(&a, &b, &c, &s);
- printf("%f, %f, %f, %f\n", a, b, c, s);
-
- a = 10.234, b = 2.;
- blas·rotg(&a, &b, &c, &s);
- printf("%f, %f, %f, %f\n", a, b, c, s);
-
- return 0;
-}
-
-#define STEP 1
-error
-test·argmax()
-{
- int i, n;
- double *x, *y, *w, *z;
- double res[2], tprof[2];
- int idx[2];
- clock_t t;
-
- x = malloc(sizeof(*x)*NCOL);
- y = malloc(sizeof(*x)*NCOL);
- w = malloc(sizeof(*x)*NCOL);
- z = malloc(sizeof(*x)*NCOL);
-	rng·init(0);
-
-	tprof[0] = 0;
-	tprof[1] = 0;
-
- for (n = 0; n < NITER; n++) {
- for (i = 0; i < NCOL; i++) {
- x[i] = rng·random();
- y[i] = rng·random();
-
- w[i] = x[i];
- z[i] = y[i];
- }
-
- t = clock();
- idx[0] = cblas_idamin(NCOL/STEP, w, STEP);
- t = clock() - t;
- tprof[1] += 1000.*t/CLOCKS_PER_SEC;
-
- t = clock();
- idx[1] = blas·argmin(NCOL/STEP, x, STEP);
- t = clock() - t;
- tprof[0] += 1000.*t/CLOCKS_PER_SEC;
-
- if (idx[0] != idx[1]) {
- errorf("%d != %d", idx[0], idx[1]);
- }
- // if (math·abs(res[0] - res[1])/math·abs(res[0]) > 1e-4) {
- // errorf("%f != %f", res[0], res[1]);
- // }
- // for (i = 0; i < NCOL; i++) {
- // if (math·abs(x[i] - w[i]) + math·abs(y[i] - z[i]) > 1e-4) {
- // errorf("%f != %f & %f != %f at index %d", x[i], w[i], y[i], z[i], i);
- // }
- // }
- }
-
- return 0;
-}
-
-error
-main()
-{
- test·level2();
- return 0;
-}
diff --git a/sys/libmath/rules.mk b/sys/libmath/rules.mk
index 049092a..73d63e3 100644
--- a/sys/libmath/rules.mk
+++ b/sys/libmath/rules.mk
@@ -28,13 +28,19 @@ BINS := $(BINS) $(BINS_$(d))
# $(LIBS_$(d)) = TCINCS :=
# $(LIBS_$(d)) = TCLIBS :=
+GENERATE = python $^ > $@
+$(d)/blas1.c: $(d)/gen1.py
+ @$(GENERATE)
+
$(LIBS_$(d)): $(OBJS_$(d))
- $(ARCHIVE)
+ @echo AR $^
+ @$(ARCHIVE)
$(BINS_$(d)): TCFLAGS := -D_GNU_SOURCE
$(BINS_$(d)): TCLIBS := -lpthread -lm
$(BINS_$(d)): $(OBJS_$(d)) $(OBJ_DIR)/libn/libn.a $(LIB_DIR)/vendor/libblas.a
- $(LINK)
+ @echo BIN $@
+ @$(LINK)
# ---- Pop off stack ----
-include $(DEPS_$(d))
diff --git a/sys/libn/rules.mk b/sys/libn/rules.mk
index a60a3d5..7efcae6 100644
--- a/sys/libn/rules.mk
+++ b/sys/libn/rules.mk
@@ -34,11 +34,13 @@ BINS := $(BINS) $(BINS_$(d))
# $(LIBS_$(d)) := TCLIBS :=
$(LIBS_$(d)): $(OBJS_$(d))
- $(ARCHIVE)
+ @echo LIB $@
+ @$(ARCHIVE)
$(BINS_$(d)): TCLIBS := $(LIBS_$(d)) $(LIB_DIR)/vendor/libz.a
$(BINS_$(d)): $(OBJ_DIR)/libn/test.o $(LIBS_$(d))
- $(LINK)
+ @echo BIN $@
+ @$(LINK)
# ---- Pop off stack ----
-include $(DEPS_$(d))