15 files changed, 1459 insertions, 0 deletions
diff --git a/src/libutf/canfit.c b/src/libutf/canfit.c
new file mode 100644
index 0000000..4579ab3
--- /dev/null
+++ b/src/libutf/canfit.c
@@ -0,0 +1,23 @@
+#include "internal.h"
+
+/*
+ * Reports whether the n bytes at s are enough to decode one rune.
+ * Only the lead byte is inspected: TByte1/TByte2/TByte3 are the headers of
+ * 2-, 3- and 4-byte sequences (see their use in utf8·encode).
+ * Returns 1 when long enough, 0 otherwise (including n <= 0).
+ */
+int
+utf8·canfit(byte* s, int n)
+{
+    rune c;
+
+    if(n <= 0)
+        return 0;
+
+    c = *(ubyte*)s;
+    if(c < TByte1) /* bytes below 0xC0 never start a multi-byte sequence */
+        return 1;
+    if(c < TByte2) return n >= 2; /* 110xxxxx */
+    if(c < TByte3) return n >= 3; /* 1110xxxx */
+    return n >= UTFmax;           /* 11110xxx and anything above */
+}
diff --git a/src/libutf/decode.c b/src/libutf/decode.c
new file mode 100644
index 0000000..01797f1
--- /dev/null
+++ b/src/libutf/decode.c
@@ -0,0 +1,98 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static uint8 decode[] = {
+    /*
+     * the first part of the table maps each byte to a character class, which
+     * shrinks the transition table and serves as the shift for the lead-byte mask
+     */
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+    /*
+     * the second part is a transition table, indexed by state + class, that
+     * yields the next state (0 = ACCEPT, 12 = REJECT; states are multiples of 12)
+     */
+     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+    12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+/*
+ * Decodes the rune at the start of s with the table-driven automaton above.
+ * Stores the rune in *r and returns the number of bytes it occupies; on a
+ * malformed sequence stores RuneErr and returns 1.
+ */
+int
+utf8·decode(char *s, rune *r)
+{
+    uint8 *p = (uint8 *)s;
+    uint8 cls, state;
+    rune  acc;
+    int   len = 1;
+
+    /* lead byte: its class is also the shift that strips the header bits */
+    cls   = decode[p[0]];
+    acc   = (0xFF >> cls) & p[0];
+    state = decode[256+ACCEPT+cls];
+    while(state > REJECT && len < UTFmax){ /* continuation bytes add 6 bits each */
+        cls   = decode[p[len]];
+        acc   = (acc << 6) | (p[len] & TMask);
+        state = decode[256+state+cls];
+        len++;
+    }
+    if(state == ACCEPT){ *r = acc; return len; }
+
+    *r = RuneErr; return 1;
+}
+
+#if 0
+int
+utf8·decode(byte *s, rune *r)
+{
+    int c[UTFmax], i;
+    rune l;
+
+    c[0] = *(ubyte*)(s);
+    if(c[0] < Tx){
+        *r = c[0];
+        return 1;
+    }
+
+    l = c[0];
+    for(i = 1; i < UTFmax; i++){
+        c[i]  = *(ubyte*)(s+i);
+        c[i] ^= Tx;
+        if(c[i] & Testx) goto bad;
+
+        l = (l << Bitx) | c[i];
+        if(c[0] < Tbyte(i + 2)){
+            l &= RuneX(i + 1);
+            if(i == 1){
+                if(c[0] < Tbyte(2) || l <= Rune1)
+                    goto bad;
+            }else if(l <= RuneX(i) || l > RuneMax)
+                goto bad;
+
+            if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+                goto bad;
+
+            *r = l;
+            return i + 1;
+        }
+    }
+bad:
+    *r = RuneErr;
+    return 1;
+}
+#endif
diff --git a/src/libutf/decodeprev.c b/src/libutf/decodeprev.c
new file mode 100644
index 0000000..27dced6
--- /dev/null
+++ b/src/libutf/decodeprev.c
@@ -0,0 +1,60 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static uint8 decode[] = {
+    /*
+     * the first part of the table maps bytes to character classes, which
+     * reduces the size of the transition table and creates bitmasks.
+     */
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+         7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+         8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+    /*
+     * The second part is a transition table that maps a combination
+     * of a state of the automaton and a character class to a state.
+     */
+    //   0  1  2  3  4  5  6  7  8  9 10 11
+         0,24,12,12,12,12,12,24,12,24,12,12,
+         0,24,12,12,12,12,12,24,12,24,12,12,
+        12,36, 0,12,12,12,12,48,12,36,12,12,
+        12,60,12, 0, 0,12,12,72,12,72,12,12,
+        12,60,12, 0,12,12,12,72,12,72, 0,12,
+        12,12,12,12,12, 0, 0,12,12,12,12,12,
+        12,12,12,12,12,12,12,12,12,12,12, 0
+};
+
+/*
+ * Decodes the rune that ends at byte s[0], walking backwards through memory
+ * (s[0], s[-1], ...) with the reverse automaton above.
+ * Stores the rune, or RuneErr when the automaton does not accept, in *r and
+ * returns the number of bytes examined.
+ */
+int
+utf8·decodeprev(byte *s, rune *r)
+{
+    uint8 *p = (uint8 *)s;
+    uint8 b, cls, shift = 0, state = ACCEPT;
+    rune  acc = 0;
+    int   len = 0;
+
+    for(;;){
+        b     = p[-len++];
+        cls   = decode[b];
+        state = decode[256+state+cls];
+        if(state <= REJECT || len >= UTFmax)
+            break;
+        acc   |= (b & TMask) << shift; /* continuation byte: 6 payload bits */
+        shift += 6;
+    }
+
+    if(state != ACCEPT) *r = RuneErr;
+    else                *r = acc | (((0xFFu >> cls) & b) << shift); /* lead byte */
+    return len;
+}
diff --git a/src/libutf/encode.c b/src/libutf/encode.c
new file mode 100644
index 0000000..fa7c93e
--- /dev/null
+++ b/src/libutf/encode.c
@@ -0,0 +1,69 @@
+#include "internal.h"
+
+/*
+ * Encodes rune *r as UTF-8 into s (room for 4 bytes required) and returns
+ * the byte count. Runes above RuneMax or in the surrogate range become RuneErr.
+ */
+int
+utf8·encode(rune *r, byte *s)
+{
+    rune c;
+
+    c = *r;
+    /* validate first so a substituted RuneErr is encoded by the branch matching its size */
+    if(c > RuneMax || (RuneSurrogateMin <= c && c <= RuneSurrogateMax))
+        c = RuneErr;
+
+    if(c < Rune1Byte){ // 7 bits
+        s[0] = (uint8)c;
+        return 1;
+    }
+    if(c < Rune2Byte){ // 11 bits
+        s[0] = TByte1 | (c >> 6);
+        s[1] = Tx     | (c & TMask);
+        return 2;
+    }
+    if(c < Rune3Byte){ // 16 bits
+        s[0] = TByte2 | ((c >> 12));
+        s[1] = Tx     | ((c >> 6) & TMask);
+        s[2] = Tx     | ((c)      & TMask);
+        return 3;
+    }
+    s[0] = TByte3 | ((c >> 18)); // 21 bits
+    s[1] = Tx     | ((c >> 12) & TMask);
+    s[2] = Tx     | ((c >> 6)  & TMask);
+    s[3] = Tx     | ((c)       & TMask);
+    return 4;
+}
+
+#if 0
+int
+utf8·encode(rune* r, byte* s)
+{
+    int i, j;
+    rune c;
+
+    c = *r;
+    if(c <= Rune1) {
+        s[0] = c;
+        return 1;
+    }
+
+    for(i = 2; i < UTFmax + 1; i++){
+        if(i == 3){
+            if(c > RuneMax)
+                c = RuneErr;
+            if(SurrogateMin <= c && c <= SurrogateMax)
+                c = RuneErr;
+        }
+        if(c <= RuneX(i) || i == UTFmax) {
+            s[0] = Tbyte(i) |  (c >> (i - 1)*Bitx);
+            for(j = 1; j < i; j++)
+                s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
+            return i;
+        }
+    }
+
+    return UTFmax;
+}
+#endif
diff --git a/src/libutf/find.c b/src/libutf/find.c
new file mode 100644
index 0000000..d75feb8
--- /dev/null
+++ b/src/libutf/find.c
@@ -0,0 +1,31 @@
+#include "internal.h"
+
+/*
+ * Returns a pointer to the first occurrence of rune c in the NUL-terminated
+ * string s, or nil when c does not occur.
+ * Runes below Tx are single bytes and are delegated to strchr.
+ */
+byte*
+utf8·find(byte* s, rune c)
+{
+    long lead;
+    rune cur;
+    int  width;
+
+    if(c < Tx)
+        return strchr(s, c);
+
+    while((lead = *(ubyte*)s) != 0){
+        if(lead < Tx){
+            if(lead == c)
+                return s;
+            s++;
+        }else{
+            width = utf8·decode(s, &cur);
+            if(cur == c)
+                return s;
+            s += width;
+        }
+    }
+    return nil;
+}
diff --git a/src/libutf/findlast.c b/src/libutf/findlast.c
new file mode 100644
index 0000000..ab25ab2
--- /dev/null
+++ b/src/libutf/findlast.c
@@ -0,0 +1,32 @@
+#include "internal.h"
+
+/*
+ * Returns a pointer to the last occurrence of rune c in the NUL-terminated
+ * string s, or nil when c does not occur.
+ * Runes below Tx are single bytes and are delegated to strrchr.
+ */
+byte*
+utf8·findlast(byte* s, rune c)
+{
+    long lead, width;
+    rune cur;
+    byte *last = nil;
+
+    if(c < Tx)
+        return strrchr(s, c);
+
+    while((lead = *(ubyte*)s) != 0){
+        if(lead < Tx){
+            if(lead == c)
+                last = s;
+            s++;
+        }else{
+            width = utf8·decode(s, &cur);
+            if(cur == c)
+                last = s;
+            s += width;
+        }
+    }
+
+    return last;
+}
diff --git a/src/libutf/internal.h b/src/libutf/internal.h
new file mode 100644
index 0000000..9719977
--- /dev/null
+++ b/src/libutf/internal.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <u.h>
+#include <base.h>
+#include <libutf.h>
+
+/*
+ * NOTE: we use the preprocessor to ensure we have unsigned constants.
+ * UTF-8 code:
+ * 1 byte:
+ * 0xxxxxxx
+ * 2 byte:
+ * 110xxxxx 10xxxxxx
+ * 3 byte:
+ * 1110xxxx 10xxxxxx 10xxxxxx
+ * 4 byte:
+ * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+#define Tx     0x80u // 0b10000000 transfer header
+#define TMask  0x3Fu // 0b00111111 transfer mask
+
+#define TByte1 0xC0u // 0b11000000
+#define TByte2 0xE0u // 0b11100000
+#define TByte3 0xF0u // 0b11110000
+#define TByte4 0xF8u // 0b11111000
+
+#define RuneMask  0x1FFFFFu
+
+#define Rune1Byte 0x000080u  // 1 << 7   first rune that needs 2 bytes
+#define Rune2Byte 0x000800u  // 1 << 11  first rune that needs 3 bytes
+#define Rune3Byte 0x010000u  // 1 << 16  first rune that needs 4 bytes
+#define Rune4Byte 0x200000u  // 1 << 21  first value beyond 4 bytes
+
+/* the exclusive bounds above follow the 7/11/16/21 payload bits per length */
+/* UTF-16 surrogate range, not encodable in UTF-8 */
+#define RuneSurrogateMin 0x00D800
+#define RuneSurrogateMax 0x00DFFF
diff --git a/src/libutf/len.c b/src/libutf/len.c
new file mode 100644
index 0000000..8fbd679
--- /dev/null
+++ b/src/libutf/len.c
@@ -0,0 +1,21 @@
+#include "internal.h"
+
+/*
+ * Counts the runes in the NUL-terminated string s.
+ * Each malformed byte counts as one rune, since utf8·decode advances by
+ * one byte on error.
+ */
+int
+utf8·len(char *s)
+{
+    long count = 0;
+    rune unused;
+    int  lead;
+
+    for(; (lead = *(uchar*)s) != 0; count++){
+        if(lead < Tx) s++;
+        else          s += utf8·decode(s, &unused);
+    }
+
+    return count;
+}
diff --git a/src/libutf/rules.mk b/src/libutf/rules.mk
new file mode 100644
index 0000000..aeb86b2
--- /dev/null
+++ b/src/libutf/rules.mk
@@ -0,0 +1,76 @@
+include share/push.mk
+
+UNICODE=14.0.0
+
+SRCS_$(d):=\
+	$(d)/encode.c\
+	$(d)/decode.c\
+	$(d)/decodeprev.c\
+	$(d)/find.c\
+	$(d)/findlast.c\
+	$(d)/canfit.c\
+	$(d)/runelen.c\
+	$(d)/len.c\
+	$(d)/runetype-$(UNICODE).c\
+	$(d)/runewidth-$(UNICODE).c
+
+LIBS_$(d):=$(d)/libutf.a
+
+include share/paths.mk
+
+# ========================================================================
+# table generation
+
+$(d)/vendor/common.o: $(d)/vendor/common.c
+	$(COMPILE)
+
+# rune categories
+$(d)/vendor/UnicodeData-$(UNICODE).txt: # fetched on demand, a failed download removes the partial target
+	@echo "GET	UnicodeData.txt";\
+	curl -f https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@ || { rm -f $@; exit 1; }
+
+$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o $(OBJ_DIR)/base/base.a
+	$(COMPLINK)
+
+GENS += $(d)/vendor/mkrunetype
+
+$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
+	@$(dir $@)vendor/mkrunetype $< > $@
+
+# rune widths
+$(d)/vendor/EastAsianWidth-$(UNICODE).txt: # fetched on demand, a failed download removes the partial target
+	@echo "GET	EastAsianWidth.txt";\
+	curl -f https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@ || { rm -f $@; exit 1; }
+
+$(d)/vendor/EmojiData-$(UNICODE).txt: # fetched on demand, a failed download removes the partial target
+	@echo "GET	EmojiData.txt";\
+	curl -f https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt > $@ || { rm -f $@; exit 1; }
+
+$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/base/base.a
+	$(COMPLINK)
+
+GENS += $(d)/vendor/mkrunewidth
+
+$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt
+	@$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@
+
+# grapheme boundaries
+$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt: # fetched on demand, a failed download removes the partial target
+	@echo "GET	GraphemeBreakProperty.txt";\
+	curl -f https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@ || { rm -f $@; exit 1; }
+
+$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/base/base.a
+	$(COMPLINK)
+
+$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
+	$^ > $@
+
+GENS += $(d)/vendor/mkgraphemedata
+
+# ========================================================================
+# normal operations
+
+$(LIBS_$(d)): $(OBJS_$(d))
+	$(ARCHIVE)
+
+include share/pop.mk
diff --git a/src/libutf/runelen.c b/src/libutf/runelen.c
new file mode 100644
index 0000000..dac7f15
--- /dev/null
+++ b/src/libutf/runelen.c
@@ -0,0 +1,8 @@
+#include "internal.h"
+
+int /* number of bytes utf8·encode writes for r */
+utf8·runelen(rune r)
+{
+    byte scratch[10]; /* throwaway output buffer; only the length is wanted */
+    return utf8·encode(&r, scratch);
+}
diff --git a/src/libutf/vendor/common.c b/src/libutf/vendor/common.c
new file mode 100644
index 0000000..5a03a50
--- /dev/null
+++ b/src/libutf/vendor/common.c
@@ -0,0 +1,220 @@
+#include "common.h"
+
+// -----------------------------------------------------------------------
+// input functions
+
+/*
+ * Reads one line from io into line (capacity len) and splits it in place on ';'
+ * into exactly nfield fields whose starts are stored in field[]. Returns ParseEOF
+ * when io·readln yields <= 0, ParseSkip for empty or '#' lines, else ParseOK.
+ */
+int
+parse(io·Stream *io, int nfield, char **field, int len, char *line)
+{
+    int n;
+    if((n=io·readln(io, len, line)) <= 0)
+        return ParseEOF;
+    if(n == len)
+        panicf("line too long");
+    if(line[n-1] != '\n') /* report the byte that was actually inspected */
+        panicf("invalid line: expected '\n', found '%c'", line[n-1]);
+    line[n-1] = 0;
+    if(line[0] == '#' || line[0] == 0)
+        return ParseSkip;
+
+    /* tokenize line into fields, never writing past field[nfield-1] */
+    n = 0;
+    field[n] = line;
+    for(; *line; line++){
+        if(*line != ';')
+            continue;
+        if(n+1 >= nfield)
+            panicf("expected %d number of fields, got more: %s", nfield, field[0]);
+        *line = 0;
+        field[++n] = line+1;
+    }
+    if(n != nfield-1)
+        panicf("expected %d number of fields, got %d: %s", nfield, n+1, field[0]);
+    return ParseOK;
+}
+
+/*
+ * Parses s as an uppercase hexadecimal number and returns its value;
+ * an empty string yields 0. Panics on any other character.
+ */
+int
+codepoint(char *s)
+{
+    int value = 0, ch, digit;
+
+    for(; (ch = *s) != 0; s++){
+        if('0' <= ch && ch <= '9')      digit = ch - '0';
+        else if('A' <= ch && ch <= 'F') digit = ch - 'A' + 10;
+        else{ digit = 0; panicf("bad codepoint char '%c'", ch); }
+        value = (value << 4) + digit;
+    }
+
+    return value;
+}
+
+/*
+ * Computes the inclusive range [*start, *stop] named by the record in field[]; a
+ * ", First>" record is paired with the next record read from utf8, which must be
+ * its ", Last>". On entry *stop holds the previous range end, for the order check.
+ */
+void
+codepointrange(io·Stream *utf8, char *field[NumFields], int *start,  int *stop)
+{
+    int e, c;
+    char *other[NumFields], line[1024];
+    e = *stop; /* previous range end, supplied by the caller */
+    c = codepoint(field[Fcode]);
+    if(c >= NumRunes)
+        panicf("unexpected large codepoint %x", c);
+    if(c <= e)
+        panicf("bad code sequence: %x then %x", e, c);
+    e = c;
+    if(strstr(field[Fname], ", First>") != nil){
+        if(!parse(utf8, arrlen(other), other, arrlen(line), line))
+            panicf("range start at end of file");
+        if(strstr(other[Fname], ", Last>") == nil)
+            panicf("range start not followed by range end");
+        e = codepoint(other[Fcode]);
+        if(e <= c)
+            panicf("bad code sequence: %x then %x", c, e);
+        if(e >= NumRunes) /* callers index NumRunes-sized tables up to e */
+            panicf("unexpected large codepoint %x", e);
+        if(strcmp(field[Fcategory], other[Fcategory]) != 0)
+            panicf("range with mismatched category");
+    }
+    *start = c;
+    *stop  = e;
+}
+
+// -----------------------------------------------------------------------
+// output functions
+
+/* prints the preamble of a generated file: two includes and the static rangesearch helper */
+void
+putsearch(void)
+{
+    puts(
+        "#include <u.h>\n"
+        "#include <libutf.h>\n"
+        "\n"
+        "static\n"
+        "rune*\n"
+        "rangesearch(rune c, rune *t, int n, int ne)\n"
+        "{\n"
+        "   rune *p;\n"
+        "   int m;\n"
+        "   while(n > 1) {\n"
+        "   m = n >> 1;\n"
+        "   p = t + m*ne;\n"
+        "   if(c >= p[0]){\n"
+        "       t = p;\n"
+        "       n = n-m;\n"
+        "   }else\n"
+        "       n = m;\n"
+        "   }\n"
+        "   if(n && c >= t[0])\n"
+        "       return t;\n"
+        "   return 0;\n"
+        "}\n"
+    );
+}
+
+/*
+ * Prints the runs of consecutive set entries in prop as a table named ident of
+ * (first, last) pairs and clears them; returns 1 if a table was printed, else 0.
+ */
+int
+putrange(char *ident, char *prop, int force)
+{
+    int l, r, start;
+    start = 0;
+    for(l = 0; l < NumRunes;) {
+        if(!prop[l]){
+            l++;
+            continue;
+        }
+        /* extend [l, r) over the run, clearing every entry after the first */
+        for(r = l+1; r < NumRunes; r++){
+            if(!prop[r])
+                break;
+            prop[r] = 0;
+        }
+        /* a lone entry stays set in prop (for later passes) unless force is given */
+        if(force || r > l + 1){
+            if(!start){
+                printf("static rune %s[] = {\n", ident);
+                start = 1;
+            }
+            prop[l] = 0;
+            printf("\t0x%.4x, 0x%.4x,\n", l, r-1);
+        }
+        l = r;
+    }
+    if(start)
+        printf("};\n\n");
+    return start;
+}
+
+/*
+ * Prints runs of set entries spaced two apart in prop as the table ident of
+ * (first, last) pairs and clears the printed entries; returns 1 if printed.
+ */
+int
+putpair(char *ident, char *prop)
+{
+    int l, r, start;
+    start = 0;
+    for(l=0; l+2 < NumRunes; ){
+        if(!prop[l]){
+            l++;
+            continue;
+        }
+        /* walk l+2, l+4, ... while set; a run needs at least two members */
+        for(r = l + 2; r < NumRunes; r += 2){
+            if(!prop[r])
+                break;
+            prop[r] = 0;
+        }
+        if(r != l + 2){
+            if(!start){
+                printf("static rune %s[] = {\n", ident);
+                start = 1;
+            }
+            prop[l] = 0;
+            printf("\t0x%.4x, 0x%.4x,\n", l, r - 2);
+        }
+        l = r;
+    }
+    if(start)
+        printf("};\n\n");
+    return start;
+}
+
+/*
+ * Prints every entry still set in prop as a one-column table named ident and
+ * clears it. Returns 1 if the table was printed, 0 if prop had no set entries.
+ */
+int
+putsingle(char *ident, char *prop)
+{
+    int c, opened = 0;
+
+    for(c = 0; c < NumRunes; c++) {
+        if(prop[c]){
+            if(!opened)
+                printf("static rune %s[] = {\n", ident);
+            opened  = 1;
+            prop[c] = 0;
+            printf("\t0x%.4x,\n", c);
+        }
+    }
+    if(opened)
+        printf("};\n\n");
+
+    return opened;
+}
diff --git a/src/libutf/vendor/common.h b/src/libutf/vendor/common.h
new file mode 100644
index 0000000..62f6c5b
--- /dev/null
+++ b/src/libutf/vendor/common.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <u.h>
+#include <base.h>
+#include <libutf.h>
+
+enum
+{
+    // Fields inside UnicodeData.txt; used as indices into the field[] array filled by parse()
+    Fcode,
+    Fname,
+    Fcategory,
+    Fcombine,
+    Fbidir,
+    Fdecomp,
+    Fdecimal,
+    Fdigit,
+    Fnumeric,
+    Fmirror,
+    Foldname,
+    Fcomment,
+    Fupper,
+    Flower,
+    Ftitle,
+
+    NumFields,          // number of field indices declared above
+    NumRunes = 1 << 21, // length of every per-rune table in the generators
+};
+
+/* input functions */
+enum
+{
+    ParseEOF,  /* 0: callers test the result of parse() for truth to stop reading */
+    ParseOK,   /* the line was split into fields */
+    ParseSkip, /* empty line or '#' comment; field[] was not filled */
+};
+
+int  parse(io·Stream *io, int nfield, char **field, int len, char *line);
+int  codepoint(char *s);
+void codepointrange(io·Stream *utf8, char *field[NumFields], int *start,  int *stop);
+
+/* output functions */
+void putsearch(void);
+int  putrange(char *ident, char *prop, int force);
+int  putpair(char *ident, char *prop);
+int  putsingle(char *ident, char *prop);
diff --git a/src/libutf/vendor/mkgraphemedata.c b/src/libutf/vendor/mkgraphemedata.c
new file mode 100644
index 0000000..ce5a952
--- /dev/null
+++ b/src/libutf/vendor/mkgraphemedata.c
@@ -0,0 +1,24 @@
+#include <u.h>
+#include <base.h>
+#include <libutf.h>
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+static
+void
+usage(void)
+{
+    fprintf(stderr, "usage: mkgraphemedata <GraphemeBreakProperty.txt>\n");
+    exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+    io·Stream *utf8;
+    char line[1024];
+    /* NOTE(review): stub; no options are handled and utf8/line are not used yet */
+    ARGBEGIN{
+    }ARGEND;
+}
diff --git a/src/libutf/vendor/mkrunetype.c b/src/libutf/vendor/mkrunetype.c
new file mode 100644
index 0000000..9f939f4
--- /dev/null
+++ b/src/libutf/vendor/mkrunetype.c
@@ -0,0 +1,388 @@
+#include "common.h"
+
+// -----------------------------------------------------------------------
+// globals
+
+#define OFFSET (1 << 20) /* bias in every stored delta; the code printed by mkto subtracts it again */
+#define DELTA(mapx, x)  ((1 << 20) + (mapx) - (x)) /* biased distance from x to its mapping mapx */
+
+// TODO: use bitarrays. will reduce executable size 8x
+struct Table
+{
+    /* properties */
+    char isspace[NumRunes];
+    char isalpha[NumRunes];
+    char ismark[NumRunes];
+    char isdigit[NumRunes];
+    char isupper[NumRunes];
+    char islower[NumRunes];
+    char istitle[NumRunes];
+    char ispunct[NumRunes];
+    char issymbl[NumRunes];
+    char iscntrl[NumRunes];
+
+    char combine[NumRunes];
+
+    /* transformations */
+    int  toupper[NumRunes];
+    int  tolower[NumRunes];
+    int  totitle[NumRunes];
+};
+
+static struct Table table;
+
+// -----------------------------------------------------------------------
+// internal functions
+
+/* names the table "is<label>_range" (panics if it would be truncated) and prints it via putrange */
+static
+int
+isrange(char *label, char *prop, int force)
+{
+    char ident[128];
+    int  n = snprintf(ident, arrlen(ident), "is%s_range", label); /* untruncated length */
+    if(n < 0 || n >= (int)arrlen(ident)) panicf("out of identifier space\n");
+    return putrange(ident, prop, force);
+}
+
+/* names the table "is<label>_pair" (panics if it would be truncated) and prints it via putpair */
+static
+int
+ispair(char *label, char *prop)
+{
+    char ident[128];
+    int  n = snprintf(ident, arrlen(ident), "is%s_pair", label); /* untruncated length */
+    if(n < 0 || n >= (int)arrlen(ident)) panicf("out of identifier space\n");
+    return putpair(ident, prop);
+}
+
+/* names the table "is<label>_single" (panics if it would be truncated) and prints it via putsingle */
+static
+int
+issingle(char *label, char *prop)
+{
+    char ident[128];
+    int  n = snprintf(ident, arrlen(ident), "is%s_single", label); /* untruncated length */
+    if(n < 0 || n >= (int)arrlen(ident)) panicf("out of identifier space\n");
+    return putsingle(ident, prop);
+}
+
+/*
+ * Prints the tables for property `label` followed by the predicate utf8·is<label>.
+ * Range tables are always attempted; pair and single tables only when !onlyranges.
+ */
+static
+void
+makeis(char *label, char *table, int pairs, int onlyranges)
+{
+    int hasr, hasp=0, hass=0;
+    /* each pass clears the entries it prints, so the order of the passes matters */
+    hasr = isrange(label, table, onlyranges);
+    if(!onlyranges && pairs)
+        hasp = ispair(label, table);
+    if(!onlyranges)
+        hass = issingle(label, table);
+    /* the generated predicate only probes the tables that were actually printed */
+    printf(
+        "int\n"
+        "utf8·is%s(rune c)\n"
+        "{\n"
+        "    rune *p;\n"
+        "\n",
+        label);
+    if(hasr){
+        printf(
+            "   p = rangesearch(c, is%s_range, arrlen(is%s_range)/2, 2);\n"
+            "   if(p && c >= p[0] && c <= p[1])\n"
+            "       return 1;\n",
+            label, label);
+    }
+    if(hasp){
+        printf(
+            "   p = rangesearch(c, is%s_pair, arrlen(is%s_pair)/2, 2);\n"
+            "   if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+            "       return 1;\n",
+            label, label);
+    }
+    if(hass)
+        printf(
+            "   p = rangesearch(c, is%s_single, arrlen(is%s_single), 1);\n"
+            "   if(p && c == p[0])\n"
+            "       return 1;\n",
+            label, label);
+    printf(
+        "   return 0;\n"
+        "}\n"
+        "\n");
+}
+
+/*
+ * Prints runs of consecutive runes sharing one mapping delta as the table
+ * to<label>_range of (first, last, delta) triples and resets the printed
+ * entries to identity. Returns 1 if a table was printed, 0 otherwise.
+ */
+static
+int
+torange(char *label, int *index, int force)
+{
+    int l, r, d, start = 0;
+    for(l = 0; l < NumRunes; ){
+        if(index[l] == l){
+            l++;
+            continue;
+        }
+        d = DELTA(index[l], l);
+        if(d != (rune)d)
+            panicf("bad map delta %d", d);
+        for(r = l+1; r < NumRunes; r++){
+            if(DELTA(index[r], r) != d)
+                break;
+            index[r] = r;
+        }
+        if(force || r != l + 1){
+            if(!start){
+                printf("static rune to%s_range[] = {\n", label);
+                start = 1;
+            }
+            index[l] = l;
+            printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-1, d);
+        }
+        l = r;
+    }
+    if(start)
+        printf("};\n\n");
+    return start;
+}
+
+/*
+ * Prints runs of runes spaced two apart that share one mapping delta as the
+ * table to<label>_pair of (first, last, delta) triples and resets the printed
+ * entries to identity. Returns 1 if a table was printed, 0 otherwise.
+ */
+static
+int
+topair(char *label, int *index)
+{
+    int l, r, d, start = 0;
+    for(l = 0; l + 2 < NumRunes; ){
+        if(index[l] == l){
+            l++;
+            continue;
+        }
+        d = DELTA(index[l], l);
+        if(d != (rune)d)
+            panicf("bad delta %d", d);
+        /* step by two: only every other rune is expected to share the delta */
+        for(r = l+2; r < NumRunes; r += 2){
+            if(DELTA(index[r], r) != d)
+                break;
+            index[r] = r;
+        }
+        if(r > l+2){
+            if(!start){
+                printf("static rune to%s_pair[] = {\n", label);
+                start = 1;
+            }
+            index[l] = l;
+            printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-2, d);
+        }
+        l = r;
+    }
+    if(start)
+        printf("};\n\n");
+    return start;
+}
+
+/*
+ * Prints every rune that still maps away from itself as the table
+ * to<label>_single of (rune, delta) pairs and resets it; returns 1 if printed.
+ */
+static
+int
+tosingle(char *label, int *index)
+{
+    int i, d, start = 0;
+    for(i=0; i < NumRunes; i++) {
+        if(index[i] == i)
+            continue;
+        d = DELTA(index[i], i);
+        if(d != (rune)d)
+            panicf("bad map delta %d", d);
+        if(!start){
+            printf("static rune to%s_single[] = {\n", label);
+            start = 1;
+        }
+        index[i] = i;
+        printf("\t0x%.4x, %d,\n", i, d);
+    }
+    if(start)
+        printf("};\n\n");
+    return start;
+}
+
+/*
+ * Prints the mapping tables for `label` followed by the function utf8·to<label>,
+ * which returns c plus the stored delta minus OFFSET, or c itself when unmapped.
+ */
+static
+void
+mkto(char *label, int *index, int pairs, int onlyrange)
+{
+    int hasr, hasp=0, hass=0;
+    /* NOTE(review): force is !onlyrange here while makeis passes onlyranges as-is; */
+    /* with onlyrange == 0 torange takes every mapping, leaving none for pairs/singles */
+    hasr = torange(label, index, !onlyrange);
+    if(!onlyrange && pairs)
+        hasp = topair(label, index);
+    if(!onlyrange)
+        hass = tosingle(label, index);
+
+    printf(
+        "rune\n"
+        "utf8·to%s(rune c)\n"
+        "{\n"
+        "	rune *p;\n"
+        "\n",
+        label);
+    if(hasr)
+        printf(
+            "   p = rangesearch(c, to%s_range, arrlen(to%s_range)/3, 3);\n"
+            "   if(p && c >= p[0] && c <= p[1])\n"
+            "       return c + p[2] - %d;\n",
+            label, label, OFFSET);
+    if(hasp)
+        printf(
+            "   p = rangesearch(c, to%s_pair, arrlen(to%s_pair)/3, 3);\n"
+            "   if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+            "       return c + p[2] - %d;\n",
+            label, label, OFFSET);
+    if(hass)
+        printf(
+            "   p = rangesearch(c, to%s_single, arrlen(to%s_single)/2, 2);\n"
+            "   if(p && c == p[0])\n"
+            "       return c + p[1] - %d;\n",
+            label, label, OFFSET);
+
+    printf(
+        "   return c;\n"
+        "}\n"
+        "\n"
+    );
+}
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+static
+void
+usage(void)
+{
+    fprintf(stderr, "usage: mkrunetype <UnicodeData.txt>\n");
+    exit(1);
+}
+
+/*
+ * Reads the UnicodeData file named by the single argument and prints the
+ * generated rune classification and case-mapping source on standard output.
+ */
+int
+main(int argc, char *argv[])
+{
+    int i, sc, c, ec, status;
+    io·Stream *utf8;
+    char *prop, *field[NumFields], line[1024];
+
+    ARGBEGIN{
+    }ARGEND;
+
+    if(argc != 1)
+        usage();
+
+    if(!(utf8 = io·open(argv[0], "r")))
+        panicf("can't open %s\n", argv[0]);
+
+    /* by default each character maps to itself */
+    for(i = 0; i < NumRunes; i++) {
+        table.toupper[i] = i;
+        table.tolower[i] = i;
+        table.totitle[i] = i;
+    }
+
+    /* ensure all C local white space characters pass */
+    table.isspace['\t'] = 1;
+    table.isspace['\n'] = 1;
+    table.isspace['\r'] = 1;
+    table.isspace['\f'] = 1;
+    table.isspace['\v'] = 1;
+    table.isspace[0x85] = 1;
+
+    ec = -1;
+    /* ParseSkip (empty or comment line) leaves field[] unfilled, so skip it */
+    while((status = parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+        if(status == ParseSkip)
+            continue;
+        /* parse unicode range */
+        codepointrange(utf8, field, &sc, &ec);
+        prop = field[Fcategory];
+
+        for(c = sc; c <= ec; c++){
+            /* grab properties */
+            switch(prop[0]){
+            case 'L':
+                table.isalpha[c] = 1;
+                switch(prop[1]){
+                case 'u': table.isupper[c] = 1; break;
+                case 'l': table.islower[c] = 1; break;
+                case 't': table.istitle[c] = 1; break;
+                case 'm': break; // modifier letters
+                case 'o': break; // ideograph letters
+                default:
+                    goto badproperty;
+                }
+                break;
+            case 'Z':
+                table.isspace[c] = 1;
+                break;
+            case 'M':
+                table.ismark[c] = 1;
+                break;
+            case 'N':
+                table.isdigit[c] = 1;
+                break;
+            case 'P':
+                table.ispunct[c] = 1;
+                break;
+            case 'S':
+                table.issymbl[c] = 1;
+                break;
+            case 'C':
+                table.iscntrl[c] = 1;
+                break;
+            default: badproperty:
+                panicf("unrecognized category '%s'", prop);
+            }
+            /* grab transformations */
+            if(*field[Fupper])
+                table.toupper[c] = codepoint(field[Fupper]);
+            if(*field[Flower])
+                table.tolower[c] = codepoint(field[Flower]);
+            if(*field[Ftitle])
+                table.totitle[c] = codepoint(field[Ftitle]);
+        }
+    }
+    io·close(utf8);
+
+    /* emit the shared search helper, then one function per property or mapping */
+    putsearch();
+
+    makeis("space", table.isspace, 0, 1);
+    makeis("digit", table.isdigit, 0, 1);
+    makeis("alpha", table.isalpha, 0, 0);
+    makeis("upper", table.isupper, 1, 0);
+    makeis("lower", table.islower, 1, 0);
+    makeis("title", table.istitle, 1, 0);
+    makeis("punct", table.ispunct, 1, 0);
+
+    mkto("upper", table.toupper, 1, 0);
+    mkto("lower", table.tolower, 1, 0);
+    mkto("title", table.totitle, 1, 0);
+}
diff --git a/src/libutf/vendor/mkrunewidth.c b/src/libutf/vendor/mkrunewidth.c
new file mode 100644
index 0000000..14e6973
--- /dev/null
+++ b/src/libutf/vendor/mkrunewidth.c
@@ -0,0 +1,325 @@
+#include "common.h"
+
+/*
+ * inspired by design choices in utf8proc/charwidths.jl
+ * all widths default to 1 unless they fall within the categories:
+ *      1. Mn 2. Mc 3. Me 4. Zl
+ *      5. Zp 6. Cc 7. Cf 8. Cs
+ * these default to zero width
+ */
+enum /* East Asian Width classes; the parenthesized codes are the ones parse_eawidths switches on */
+{
+    /* width ? (classes that parse_eawidths leaves untouched) */
+    WidthNeutral,   /* (N) practically treated like narrow but unclear ... */
+    WidthAmbiguous, /* (A) sometimes wide and sometimes not... */
+    /* width 1 */
+    WidthHalf,      /* (H) = to narrow (compatibility equivalent)  */
+    WidthNarrow,    /* (Na) ASCII width */
+    /* width 2 */
+    WidthWide,      /* (W) 2x width */
+    WidthFull,      /* (F) = to wide (compatibility equivalent) */
+};
+
+struct Table    /* membership flags for the three display-width classes */
+{
+    char width[3][NumRunes];    /* width[w][c] == 1 marks rune c as w columns wide */
+};
+/* the single instance: filled by the parse_* functions and by main's overrides */
+static struct Table table;
+
+// -----------------------------------------------------------------------
+// internal functions
+
+/*
+ * parse_category: seed the width tables from the general category field of
+ * every record in path (main passes its first argument, UnicodeData.txt).
+ *
+ * Categories Mn, Mc, Me, Zl, Zp, Cc, Cf and Cs are flagged in table.width[0];
+ * any other category is flagged in table.width[1].  Nothing is cleared here,
+ * which is why main runs this pass before the ones that override widths.
+ *
+ * Panics if path cannot be opened.
+ */
+static
+void
+parse_category(char *path)
+{
+    int sc, c, ec, w;
+    io·Stream *utf8;
+    char *prop, *field[NumFields], line[1024];
+
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+
+    // NOTE: we don't check for comments here
+
+    /* NOTE(review): ec is preset to -1 and handed to codepointrange by address; */
+    /* presumably it carries the previous record's end -- confirm in common.h    */
+    ec = -1;
+    while(parse(utf8, arrlen(field), field, arrlen(line), line)){
+        codepointrange(utf8, field, &sc, &ec);
+
+        prop = field[Fcategory];
+
+        /* w becomes 0 only for the zero-width subcategories, otherwise 1 */
+        switch(prop[0]){
+        case 'M':
+            w = (prop[1] == 'n' || prop[1] == 'c' || prop[1] == 'e') ? 0 : 1;
+            break;
+
+        case 'Z':
+            w = (prop[1] == 'l' || prop[1] == 'p') ? 0 : 1;
+            break;
+
+        case 'C':
+            w = (prop[1] == 'c' || prop[1] == 'f' || prop[1] == 's') ? 0 : 1;
+            /* this break was missing: control fell into default and w was always 1 */
+            break;
+
+        default:
+            w = 1;
+            break;
+        }
+
+        /* sc..ec is the inclusive code point range produced by codepointrange */
+        for(c = sc; c <= ec; c++)
+            table.width[w][c] = 1;
+    }
+
+    io·close(utf8);
+}
+
+/* coderange: split "LO" or "LO..HI" into inclusive bounds *l and *r; a ".." in field is overwritten with NULs */
+static
+void
+coderange(char *field, int *l, int *r)
+{
+    char *dots = strstr(field, "..");
+
+    if(dots){
+        dots[0] = dots[1] = 0;          /* terminate the low half in place */
+        *l = codepoint(field);
+        *r = codepoint(dots + 2);
+    }else
+        *l = *r = codepoint(field);
+}
+
+/* parse_eawidths: apply the East Asian Width classes read from path on top of the category widths. */
+/* H and Na select width 1, W and F select width 2; A and N records leave the table untouched.      */
+static
+void
+parse_eawidths(char *path)
+{
+    int at, w, l, c, r;
+    io·Stream *utf8;
+    char *field[2], line[1024];
+
+    /* fail loudly like parse_category instead of handing a null stream to parse() */
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+
+    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+        if(at == ParseSkip)
+            continue;
+        switch(field[1][0]){
+        case 'A': continue;
+        case 'N':
+            if(field[1][1] != 'a')
+                continue;
+        /* fallthrough */
+        case 'H': w = 1; break;
+        case 'W': /* fallthrough */
+        case 'F': w = 2; break;
+        default:
+            panicf("malformed east asian width class: %s\n", field[1]);
+        }
+
+        coderange(field[0], &l, &r);
+        for(c=l; c <= r; c++){
+            /* ensure it only exists in one table */
+            table.width[w][c]       = 1;
+            table.width[(w+1)%3][c] = 0;
+            table.width[(w+2)%3][c] = 0;
+        }
+    }
+    io·close(utf8);
+}
+
+/* parse_emoji: force width 2 for every code point whose property field in path */
+/* contains "Emoji_Presentation"; all other records are ignored.                 */
+static
+void
+parse_emoji(char *path)
+{
+    int at, l, c, r;
+    io·Stream *utf8;
+    char *s, *field[2], line[1024];
+
+    /* fail loudly like parse_category instead of handing a null stream to parse() */
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+
+    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+        if(at == ParseSkip)
+            continue;
+        /* only override emoji presentation */
+        if(!strstr(field[1], "Emoji_Presentation"))
+            continue;
+
+        /* trim trailing space: every blank in the code field becomes a NUL */
+        for(s=field[0]; *s; s++){
+            if(*s == ' ')
+                *s = 0;
+        }
+        coderange(field[0], &l, &r);
+        for(c=l; c <= r; c++){
+            table.width[0][c] = 0;
+            table.width[1][c] = 0;
+            table.width[2][c] = 1;
+        }
+    }
+    io·close(utf8);
+}
+
+/* output functions */
+/*
+ * maketable: print the <label>_range/_pair/_single arrays through putrange,
+ * putpair and putsingle, then a static is<label>(rune) predicate that searches
+ * each array whose put* call returned nonzero.  The pair array is attempted only
+ * when pairs is set and onlyranges is clear; the single array when onlyranges is clear.
+ */
+static
+void
+maketable(char *label, char *table, int pairs, int onlyranges)
+{
+    int r, p=0, s=0;
+    char ident[3][128];
+    enum { Irange, Ipair, Isingle };
+
+    /* snprintf returns the untruncated length, so >= the buffer size means the name was cut */
+    if(snprintf(ident[Irange], arrlen(ident[Irange]), "%s_range", label) >= (int)arrlen(ident[Irange]))
+        panicf("out of identifier space\n");
+    r = putrange(ident[Irange], table, onlyranges);
+
+    if(!onlyranges && pairs){
+        if(snprintf(ident[Ipair], arrlen(ident[Ipair]), "%s_pair", label) >= (int)arrlen(ident[Ipair]))
+            panicf("out of identifier space\n");
+        p = putpair(ident[Ipair], table);
+    }
+    if(!onlyranges){
+        if(snprintf(ident[Isingle], arrlen(ident[Isingle]), "%s_single", label) >= (int)arrlen(ident[Isingle]))
+            panicf("out of identifier space\n");
+
+        s = putsingle(ident[Isingle], table);
+    }
+
+    printf(
+        "static int\n"
+        "is%s(rune c)\n"
+        "{\n"
+        "    rune *p;\n"
+        "\n",
+        label);
+
+    if(r){
+        printf(
+            "   p = rangesearch(c, %s, arrlen(%s)/2, 2);\n"
+            "   if(p && c >= p[0] && c <= p[1])\n"
+            "       return 1;\n",
+            ident[Irange], ident[Irange]);
+    }
+
+    if(p){
+        printf(
+            "   p = rangesearch(c, %s, arrlen(%s)/2, 2);\n"
+            "   if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+            "       return 1;\n",
+            ident[Ipair], ident[Ipair]);
+    }
+
+    if(s)
+        printf(
+            "   p = rangesearch(c, %s, arrlen(%s), 1);\n"
+            "   if(p && c == p[0])\n"
+            "       return 1;\n",
+            ident[Isingle], ident[Isingle]);
+
+    printf(
+        "   return 0;\n"
+        "}\n"
+        "\n");
+}
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+/* usage: print the expected command line to stderr and exit with status 1 */
+static void
+usage(void)
+{
+    fputs("usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt> <EmojiData.txt>\n", stderr);
+    exit(1);
+}
+
+/*
+ * SETWn(c): force code point c to display width n by setting its flag in
+ * table.width[n] and clearing it in the other two tables.
+ * Each macro expands to one parenthesized expression with no trailing
+ * semicolon, so `SETW0(x);` is a single statement; c is evaluated three times.
+ */
+#define SETW0(c) \
+    (table.width[0][(c)] = 1, table.width[1][(c)] = 0, table.width[2][(c)] = 0)
+
+#define SETW1(c) \
+    (table.width[0][(c)] = 0, table.width[1][(c)] = 1, table.width[2][(c)] = 0)
+
+#define SETW2(c) \
+    (table.width[0][(c)] = 0, table.width[1][(c)] = 0, table.width[2][(c)] = 1)
+
+
+/* main: build the width tables from the three data files (each later pass overrides */
+/* the earlier ones), apply fixed overrides, then print the generated C lookup code. */
+int
+main(int argc, char *argv[])
+{
+    int cp, hits;
+
+    ARGBEGIN{
+    }ARGEND;
+    if(argc != 3)
+        usage();
+
+    parse_category(argv[0]);
+    parse_eawidths(argv[1]);
+    parse_emoji(argv[2]);
+
+    /* overrides */
+    SETW0(0x2028);
+    SETW0(0x2029);
+    SETW1(0x00AD);
+
+    /* simple checking: each rune may be flagged in at most one width table */
+    for(cp=0; cp<NumRunes; cp++){
+        hits = table.width[0][cp] + table.width[1][cp] + table.width[2][cp];
+        if(hits > 1)
+            panicf("improper table state");
+    }
+
+    putsearch();
+    maketable("width0", table.width[0], 1, 0);
+    maketable("width1", table.width[1], 1, 0);
+    maketable("width2", table.width[2], 1, 0);
+
+    puts(
+        "\n"
+        "int\n"
+        "utf8·runewidth(rune c)\n"
+        "{\n"
+        "   if(iswidth1(c))\n"
+        "       return 1;\n"
+        "   if(iswidth2(c))\n"
+        "       return 2;\n"
+        "   return 0;\n"
+        "}"
+    );
+}