chore: libunicode -> libutf

author: Nicholas <nbnoll@eml.cc> 2021-11-10 20:12:45 -0800
committer: Nicholas <nbnoll@eml.cc> 2021-11-11 08:16:47 -0800
commit: 7ea1cdb7d31f00024f5a1d124b42cd19a03b959a (patch)
tree: f65abf9fa32856287de586129d4ed7c1c473864a /sys/libutf
parent: 43688fe7190d0350349d47727c3663421d5618dc (diff)
12 files changed, 638 insertions, 0 deletions
diff --git a/sys/libutf/canfit.c b/sys/libutf/canfit.c
new file mode 100644
index 0000000..4579ab3
--- /dev/null
+++ b/sys/libutf/canfit.c
@@ -0,0 +1,23 @@
+#include "internal.h"
+
+/*
+ * returns 1 if the n bytes at s are enough to decode the rune announced
+ * by the lead byte s[0], 0 otherwise (always 0 when n <= 0).
+ * only s[0] is inspected; continuation bytes are not validated here.
+ */
+int
+utf8·canfit(byte* s, int n)
+{
+    rune c;
+
+    if(n <= 0)
+        return 0;
+    c = *(ubyte*)s;
+    if(c < TByte1)  /* ascii or a stray continuation byte: nothing longer to wait for */
+        return 1;
+    if(c < TByte2)  /* 110xxxxx leads a 2 byte sequence */
+        return n >= 2;
+    if(c < TByte3)  /* 1110xxxx leads a 3 byte sequence */
+        return n >= 3;
+    return n >= UTFmax;  /* 11110xxx and above: ask for the maximum */
+}
diff --git a/sys/libutf/decode.c b/sys/libutf/decode.c
new file mode 100644
index 0000000..01797f1
--- /dev/null
+++ b/sys/libutf/decode.c
@@ -0,0 +1,98 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static uint8 decode[] = {
+    /*
+     * the first part of the table maps each byte to a character class,
+     * which shrinks the transition table and is also used as a mask shift
+     */
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 00-1f */
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 20-3f */
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 40-5f */
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* bytes 60-7f */
+     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* bytes 80-9f */
+     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* bytes a0-bf */
+     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* bytes c0-df */
+    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /* bytes e0-ff */
+
+    /*
+     * the second part is a transition table, read at index 256 + state + class:
+     * twelve classes per state, so states are multiples of 12 (two states per line)
+     */
+     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, /* states  0, 12 */
+    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, /* states 24, 36 */
+    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, /* states 48, 60 */
+    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, /* states 72, 84 */
+    12,36,12,12,12,12,12,12,12,12,12,12, /* state 96 */
+};
+
+int /* decodes one rune from s into *r; returns the bytes consumed */
+utf8·decode(char *s, rune *r)
+{
+    uint8 *p = (uint8 *)s;
+    uint8 cls, state;
+    rune  acc;
+    int   len = 1;
+
+    /* lead byte: shifting 0xFF by its class masks off the length prefix */
+    cls   = decode[p[0]];
+    acc   = (0xFF >> cls) & p[0];
+    state = decode[256 + ACCEPT + cls];
+
+    /* a state above REJECT still expects continuation bytes */
+    while(state > REJECT && len < UTFmax){
+        cls   = decode[p[len]];
+        acc   = (acc << 6) | (p[len] & TMask);
+        state = decode[256 + state + cls];
+        len++;
+    }
+    if(state == ACCEPT){
+        *r = acc;
+        return len;
+    }
+    *r = RuneErr;   /* anything but ACCEPT: report the error and consume one byte */
+    return 1;
+}
+
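+#if 0 /* earlier branch-based decoder, disabled; the table-driven utf8·decode above is the one compiled */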
+int
+utf8·decode(byte *s, rune *r)
+{
+    int c[UTFmax], i;
+    rune l;
+
+    c[0] = *(ubyte*)(s);
+    if(c[0] < Tx){
+        *r = c[0];
+        return 1;
+    }
+
+    l = c[0];
+    for(i = 1; i < UTFmax; i++){
+        c[i]  = *(ubyte*)(s+i);
+        c[i] ^= Tx;
+        if(c[i] & Testx) goto bad;
+
+        l = (l << Bitx) | c[i];
+        if(c[0] < Tbyte(i + 2)){
+            l &= RuneX(i + 1);
+            if(i == 1){
+                if(c[0] < Tbyte(2) || l <= Rune1)
+                    goto bad;
+            }else if(l <= RuneX(i) || l > RuneMax)
+                goto bad;
+
+            if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+                goto bad;
+
+            *r = l;
+            return i + 1;
+        }
+    }
+bad:
+    *r = RuneErr;
+    return 1;
+}
+#endif
diff --git a/sys/libutf/decodeprev.c b/sys/libutf/decodeprev.c
new file mode 100644
index 0000000..27dced6
--- /dev/null
+++ b/sys/libutf/decodeprev.c
@@ -0,0 +1,60 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static uint8 decode[] = {
+    /*
+     * the first part of the table maps each byte to a character class,
+     * which shrinks the transition table and is also used as a mask shift.
+     */
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+         7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+         8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+    /*
+     * The second part is a transition table for reading a sequence back to front,
+     * read at index 256 + state + class: one row of twelve classes per state.
+     */
+    //   0  1  2  3  4  5  6  7  8  9 10 11
+         0,24,12,12,12,12,12,24,12,24,12,12, /* state  0 (ACCEPT) */
+         0,24,12,12,12,12,12,24,12,24,12,12, /* state 12 (REJECT) */
+        12,36, 0,12,12,12,12,48,12,36,12,12, /* state 24 */
+        12,60,12, 0, 0,12,12,72,12,72,12,12, /* state 36 */
+        12,60,12, 0,12,12,12,72,12,72, 0,12, /* state 48 */
+        12,12,12,12,12, 0, 0,12,12,12,12,12, /* state 60 */
+        12,12,12,12,12,12,12,12,12,12,12, 0  /* state 72 */
+};
+
+/* decodes the rune whose last byte is s[0] by scanning backwards; returns the bytes examined */
+int
+utf8·decodeprev(byte *s, rune *r)
+{
+    uint8 *end = (uint8 *)s;
+    uint8 byt, cls, shift = 0, state = ACCEPT;
+    rune  acc = 0;
+    int   len = 0;
+
+    for(;;){
+        byt   = end[-len];
+        len++;
+        cls   = decode[byt];
+        state = decode[256 + state + cls];
+        /* stop on accept, on reject, or once UTFmax bytes were examined */
+        if(state <= REJECT || len >= UTFmax)
+            break;
+        acc   |= (byt & TMask) << shift;   /* continuation payload, low bits first */
+        shift += 6;
+    }
+
+    if(state == ACCEPT)
+        *r = acc | (((0xFFu >> cls) & byt) << shift);
+    else
+        *r = RuneErr;
+
+    return len;
+}
diff --git a/sys/libutf/encode.c b/sys/libutf/encode.c
new file mode 100644
index 0000000..fa7c93e
--- /dev/null
+++ b/sys/libutf/encode.c
@@ -0,0 +1,69 @@
+#include "internal.h"
+
+/*
+ * encodes rune *r as utf8 into s and returns the number of bytes
+ * written (1-4); s must have room for 4 bytes. values above RuneMax
+ * and utf16 surrogates are replaced by RuneErr before encoding.
+ */
+int
+utf8·encode(rune *r, byte *s)
+{
+    rune c;
+
+    c = *r;
+    /* validate up front: the old trailing check never saw surrogates and forced RuneErr into 4 bytes */
+    if(c > RuneMax || (RuneSurrogateMin <= c && c <= RuneSurrogateMax))
+        c = RuneErr;
+    if(c < Rune1Byte){ // 7 bits
+        s[0] = (uint8)c;
+        return 1;
+    }
+    if(c < Rune2Byte){ // 11 bits
+        s[0] = TByte1 | (c >> 6);
+        s[1] = Tx     | (c & TMask);
+        return 2;
+    }
+    if(c < Rune3Byte){ // 16 bits
+        s[0] = TByte2 | ((c >> 12));
+        s[1] = Tx     | ((c >> 6) & TMask);
+        s[2] = Tx     | ((c)      & TMask);
+        return 3;
+    }
+    s[0] = TByte3 | ((c >> 18)); // 21 bits
+    s[1] = Tx     | ((c >> 12) & TMask);
+    s[2] = Tx     | ((c >> 6)  & TMask);
+    s[3] = Tx     | ((c)       & TMask);
+    return 4;
+}
+
+#if 0 /* earlier loop-based encoder, disabled; the unrolled utf8·encode above is the one compiled */
+int
+utf8·encode(rune* r, byte* s)
+{
+    int i, j;
+    rune c;
+
+    c = *r;
+    if(c <= Rune1) {
+        s[0] = c;
+        return 1;
+    }
+
+    for(i = 2; i < UTFmax + 1; i++){
+        if(i == 3){
+            if(c > RuneMax)
+                c = RuneErr;
+            if(SurrogateMin <= c && c <= SurrogateMax)
+                c = RuneErr;
+        }
+        if(c <= RuneX(i) || i == UTFmax) {
+            s[0] = Tbyte(i) |  (c >> (i - 1)*Bitx);
+            for(j = 1; j < i; j++)
+                s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
+            return i;
+        }
+    }
+
+    return UTFmax;
+}
+#endif
diff --git a/sys/libutf/find.c b/sys/libutf/find.c
new file mode 100644
index 0000000..d75feb8
--- /dev/null
+++ b/sys/libutf/find.c
@@ -0,0 +1,31 @@
+#include "internal.h"
+
+/*
+ * finds the first occurrence of rune c in the nul terminated utf8
+ * string s; returns a pointer to its first byte, or nil if absent.
+ */
+byte*
+utf8·find(byte* s, rune c)
+{
+    rune  got;
+    ubyte lead;
+    int   step;
+
+    /* runes below Tx are a single byte: hand them to strchr */
+    if(c < Tx)
+        return strchr(s, c);
+
+    while((lead = *(ubyte*)s) != 0){
+        if(lead < Tx){
+            /* a byte below Tx can never equal c, which is not below Tx here */
+            s++;
+            continue;
+        }
+        step = utf8·decode(s, &got);
+        if(got == c)
+            return s;
+        s += step;
+    }
+
+    return nil;
+}
diff --git a/sys/libutf/findlast.c b/sys/libutf/findlast.c
new file mode 100644
index 0000000..ab25ab2
--- /dev/null
+++ b/sys/libutf/findlast.c
@@ -0,0 +1,32 @@
+#include "internal.h"
+
+/*
+ * finds the last occurrence of rune c in the nul terminated utf8
+ * string s; returns a pointer to its first byte, or nil if absent.
+ */
+byte*
+utf8·findlast(byte* s, rune c)
+{
+    rune  got;
+    ubyte lead;
+    byte *last = nil;
+    int   step;
+
+    /* runes below Tx are a single byte: hand them to strrchr */
+    if(c < Tx)
+        return strrchr(s, c);
+
+    /* walk the whole string, remembering the most recent match */
+    while((lead = *(ubyte*)s) != 0){
+        if(lead < Tx){
+            s++;
+            continue;
+        }
+        step = utf8·decode(s, &got);
+        if(got == c)
+            last = s;
+        s += step;
+    }
+
+    return last;
+}
diff --git a/sys/libutf/internal.h b/sys/libutf/internal.h
new file mode 100644
index 0000000..9719977
--- /dev/null
+++ b/sys/libutf/internal.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <u.h>
+#include <base.h>
+#include <libutf.h>
+
+/*
+ * NOTE: we use the preprocessor to ensure we have unsigned constants.
+ * UTF-8 code:
+ * 1 byte:
+ * 0xxxxxxx
+ * 2 byte:
+ * 110xxxxx 10xxxxxx
+ * 3 byte:
+ * 1110xxxx 10xxxxxx 10xxxxxx
+ * 4 byte:
+ * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+#define Tx     0x80u // 0b10000000 continuation byte header
+#define TMask  0x3Fu // 0b00111111 continuation payload mask
+
+#define TByte1 0xC0u // 0b11000000 lead byte of a 2 byte sequence
+#define TByte2 0xE0u // 0b11100000 lead byte of a 3 byte sequence
+#define TByte3 0xF0u // 0b11110000 lead byte of a 4 byte sequence
+#define TByte4 0xF8u // 0b11111000 first value past the 4 byte leads
+
+#define RuneMask  0x1FFFFFu  // 21 payload bits of a 4 byte sequence
+
+// exclusive upper bounds: the first rune that no longer fits in N bytes
+#define Rune1Byte 0x000080u  // 1 << 7   (1 byte)
+#define Rune2Byte 0x000800u  // 1 << 11  (2 bytes)
+#define Rune3Byte 0x010000u  // 1 << 16  (3 bytes)
+#define Rune4Byte 0x200000u  // 1 << 21  (4 bytes)
+
+/* UTF-16 nonsense: U+D800..U+DFFF are reserved for surrogate pairs */
+#define RuneSurrogateMin 0x00D800
+#define RuneSurrogateMax 0x00DFFF
diff --git a/sys/libutf/len.c b/sys/libutf/len.c
new file mode 100644
index 0000000..8fbd679
--- /dev/null
+++ b/sys/libutf/len.c
@@ -0,0 +1,21 @@
+#include "internal.h"
+
+/* counts the runes in the nul terminated utf8 string s */
+int
+utf8·len(char *s)
+{
+    long  count = 0;
+    rune  scratch;
+    uchar lead;
+
+    while((lead = *(uchar*)s) != 0){
+        /* bytes below Tx advance by one, anything else by what decode consumed */
+        if(lead < Tx)
+            s += 1;
+        else
+            s += utf8·decode(s, &scratch);
+        count++;
+    }
+
+    return count;
+}
diff --git a/sys/libutf/rules.mk b/sys/libutf/rules.mk
new file mode 100644
index 0000000..53ff8cf
--- /dev/null
+++ b/sys/libutf/rules.mk
@@ -0,0 +1,76 @@
+include share/push.mk
+
+UNICODE = 14.0.0
+
+SRCS_$(d) := \
+	$(d)/encode.c \
+	$(d)/decode.c \
+	$(d)/decodeprev.c \
+	$(d)/find.c \
+	$(d)/findlast.c \
+	$(d)/canfit.c \
+	$(d)/runelen.c \
+	$(d)/len.c \
+	$(d)/runetype-$(UNICODE).c \
+	$(d)/runewidth-$(UNICODE).c
+
+LIBS_$(d) := $(d)/libutf.a
+
+include share/paths.mk
+
+# ========================================================================
+# table generation
+
+$(d)/vendor/common.o: $(d)/vendor/common.c
+	$(COMPILE)
+
+# rune categories (curl -f plus cleanup: an HTTP error must not leave a file that satisfies the target)
+$(d)/vendor/UnicodeData-$(UNICODE).txt:
+	@echo "GET	UnicodeData.txt";\
+	curl -f https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@ || { rm -f $@; exit 1; }
+
+$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+GENS += $(d)/vendor/mkrunetype
+
+$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
+	@$(dir $@)vendor/mkrunetype $< > $@
+
+# rune widths (curl -f plus cleanup: an HTTP error must not leave a file that satisfies the target)
+$(d)/vendor/EastAsianWidth-$(UNICODE).txt:
+	@echo "GET	EastAsianWidth.txt";\
+	curl -f https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@ || { rm -f $@; exit 1; }
+# emoji data (curl -f plus cleanup: an HTTP error must not leave a file that satisfies the target)
+$(d)/vendor/EmojiData-$(UNICODE).txt:
+	@echo "GET	EmojiData.txt";\
+	curl -f https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt > $@ || { rm -f $@; exit 1; }
+
+$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+GENS += $(d)/vendor/mkrunewidth
+
+$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt
+	@$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@
+
+# grapheme boundaries (curl -f plus cleanup: an HTTP error must not leave a file that satisfies the target)
+$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt:
+	@echo "GET	GraphemeBreakProperty.txt";\
+	curl -f https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@ || { rm -f $@; exit 1; }
+
+$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
+	$^ > $@
+
+GENS += $(d)/vendor/mkgraphemedata
+
+# ========================================================================
+# normal operations
+
+$(LIBS_$(d)): $(OBJS_$(d))
+	$(ARCHIVE)
+
+include share/pop.mk
diff --git a/sys/libutf/runelen.c b/sys/libutf/runelen.c
new file mode 100644
index 0000000..dac7f15
--- /dev/null
+++ b/sys/libutf/runelen.c
@@ -0,0 +1,8 @@
+#include "internal.h"
+
+int /* number of bytes utf8·encode emits for r */
+utf8·runelen(rune r)
+{
+    byte scratch[10]; /* throwaway output buffer; only the length is kept */
+    return utf8·encode(&r, scratch);
+}
diff --git a/sys/libutf/runetype-14.0.0.c b/sys/libutf/runetype-14.0.0.c
new file mode 100644
index 0000000..6f4469d
--- /dev/null
+++ b/sys/libutf/runetype-14.0.0.c
@@ -0,0 +1,111 @@
+#include <u.h>
+#include <libutf.h>
+
+/* binary search: t holds n records of ne runes, expected sorted by first rune; returns the last record starting at or below c, else 0 */
+static
+rune*
+rangesearch(rune c, rune *t, int n, int ne)
+{
+    while(n > 1){
+        int   half = n / 2;
+        rune *mid  = t + half*ne;
+        if(c < mid[0])
+            n = half;
+        else{
+            t  = mid;
+            n -= half;
+        }
+    }
+    if(n == 0 || c < t[0])
+        return 0;
+    return t;
+}
+
+static rune isspace_range[] = { /* inclusive [lo, hi] pairs; NOTE(review): U+0020 is not covered — confirm the generator output */
+	0x0009, 0x000d,
+	0x0085, 0x0085,
+};
+
+int /* 1 when c falls inside one of the isspace ranges, else 0 */
+utf8·isspace(rune c)
+{
+    rune *hit;
+
+    hit = rangesearch(c, isspace_range, arrlen(isspace_range)/2, 2);
+    if(hit == 0 || c < hit[0] || c > hit[1])
+        return 0;
+    return 1;
+}
+
+int
+utf8·isdigit(rune c)
+{
+    /* NOTE(review): generated stub with no table; every rune is reported as not a digit */
+
+    return 0;
+}
+
+int
+utf8·isalpha(rune c)
+{
+    /* NOTE(review): generated stub with no table; every rune is reported as not alphabetic */
+
+    return 0;
+}
+
+int
+utf8·isupper(rune c)
+{
+    /* NOTE(review): generated stub with no table; every rune is reported as not upper case */
+
+    return 0;
+}
+
+int
+utf8·islower(rune c)
+{
+    /* NOTE(review): generated stub with no table; every rune is reported as not lower case */
+
+    return 0;
+}
+
+int
+utf8·istitle(rune c)
+{
+    /* NOTE(review): generated stub with no table; every rune is reported as not title case */
+
+    return 0;
+}
+
+int
+utf8·ispunct(rune c)
+{
+    /* NOTE(review): generated stub with no table; every rune is reported as not punctuation */
+
+    return 0;
+}
+
+rune
+utf8·toupper(rune c)
+{
+    /* NOTE(review): generated stub with no mapping table; c is returned unchanged */
+
+    return c;
+}
+
+rune
+utf8·tolower(rune c)
+{
+    /* NOTE(review): generated stub with no mapping table; c is returned unchanged */
+
+    return c;
+}
+
+rune
+utf8·totitle(rune c)
+{
+    /* NOTE(review): generated stub with no mapping table; c is returned unchanged */
+
+    return c;
+}
+
diff --git a/sys/libutf/runewidth-14.0.0.c b/sys/libutf/runewidth-14.0.0.c
new file mode 100644
index 0000000..113c35e
--- /dev/null
+++ b/sys/libutf/runewidth-14.0.0.c
@@ -0,0 +1,71 @@
+#include <u.h>
+#include <libutf.h>
+
+/* binary search: t holds n records of ne runes, expected sorted by first rune; returns the last record starting at or below c, else 0 */
+static
+rune*
+rangesearch(rune c, rune *t, int n, int ne)
+{
+    while(n > 1){
+        int   half = n / 2;
+        rune *mid  = t + half*ne;
+        if(c < mid[0])
+            n = half;
+        else{
+            t  = mid;
+            n -= half;
+        }
+    }
+    if(n == 0 || c < t[0])
+        return 0;
+    return t;
+}
+
+static rune width0_range[] = { /* inclusive [lo, hi] pairs */
+	0x2028, 0x2029,
+};
+
+static int /* 1 when c falls inside one of the width0 ranges, else 0 */
+iswidth0(rune c)
+{
+    rune *hit;
+
+    hit = rangesearch(c, width0_range, arrlen(width0_range)/2, 2);
+    if(hit == 0 || c < hit[0] || c > hit[1])
+        return 0;
+    return 1;
+}
+
+static rune width1_single[] = { /* individual runes, one per record */
+	0x00ad,
+};
+
+static int /* 1 when c is listed in width1_single, else 0 */
+iswidth1(rune c)
+{
+    rune *hit;
+
+    hit = rangesearch(c, width1_single, arrlen(width1_single), 1);
+    if(hit == 0 || c != hit[0])
+        return 0;
+    return 1;
+}
+
+static int
+iswidth2(rune c)
+{
+    /* NOTE(review): generated stub with no table; no rune is reported as width 2 */
+
+    return 0;
+}
+
+
+int
+utf8·runewidth(rune c)
+{
+    /*
+     * 1 or 2 when c is found by iswidth1 or iswidth2 respectively, otherwise 0.
+     * NOTE(review): iswidth0 is never consulted and unlisted runes also get 0 — confirm intended.
+     */
+    return iswidth1(c) ? 1 : iswidth2(c) ? 2 : 0;
+}
author	Nicholas <nbnoll@eml.cc>	2021-11-10 20:12:45 -0800
committer	Nicholas <nbnoll@eml.cc>	2021-11-11 08:16:47 -0800
commit	7ea1cdb7d31f00024f5a1d124b42cd19a03b959a (patch)
tree	f65abf9fa32856287de586129d4ed7c1c473864a /sys/libutf
parent	43688fe7190d0350349d47727c3663421d5618dc (diff)