Feat(libunicode): cleaned up decode/encode

Additionally, a rune can now be decoded backwards from a byte string (utf8·decodeprev).
author: Nicholas Noll <nbnoll@eml.cc> 2021-10-28 11:55:47 -0700
committer: Nicholas Noll <nbnoll@eml.cc> 2021-10-28 11:55:47 -0700
commit: 6318ddb69ad8380694bbca35ca5a72230c2ee694 (patch)
tree: e9ef65d79c264d88c2d02402a587c2561b5b1299
parent: 189b9e23edfe60b7e82c4c7b6071a3f98799653a (diff)
11 files changed, 227 insertions, 52 deletions
diff --git a/include/libunicode.h b/include/libunicode.h
index 5e69fd1..00e5bc0 100644
--- a/include/libunicode.h
+++ b/include/libunicode.h
@@ -2,16 +2,8 @@
 
 typedef uint32 rune;
 
-/*
- * we have to use the preprocessor to ensure we have unsigned constants.
- */
-
 #define UTFmax   4
-#define RuneSync 0x80u
-#define RuneSelf 0x80u
 #define RuneErr  0xFFFDu
-#define RuneMax  0x10FFFFu
-#define RuneMask 0x1FFFFFu
 
 /*
  * UTF-8 functions.
@@ -19,8 +11,9 @@ typedef uint32 rune;
 int     utf8·len(char *s);            // returns number of runes
 int     utf8·runelen(rune r);         // returns number of bytes for rune
 
-int     utf8·decode(char *, rune *);  // decode 1 rune from char stream, store into rune, return number of bytes
-int     utf8·encode(rune *, char *);  // encode 1 rune from rune stream, store into char, return number of bytes
+int     utf8·decode(char *, rune *);       // decode 1 rune from char stream, store into rune, return number of bytes
+int     utf8·encode(rune *, char *);       // encode 1 rune from rune stream, store into char, return number of bytes
+int     utf8·decodeprev(char *s, rune *r); // decode 1 rune from char stream, reading backwards, store into rune, return number of bytes
 
 char   *utf8·find(char *s, rune);     // find rune in char stream
 char   *utf8·findlast(char* s, rune); // find last rune in char stream
@@ -31,6 +24,7 @@ int     utf8·isletter(rune r);
 int     utf8·isdigit(rune r);
 int     utf8·isspace(rune r);
 int     utf8·istitle(rune r);
+int     utf8·ispunct(rune r);
 
 rune    utf8·toupper(rune r);
 rune    utf8·tolower(rune r);
diff --git a/sys/libunicode/canfit.c b/sys/libunicode/canfit.c
index d44c9e6..4579ab3 100644
--- a/sys/libunicode/canfit.c
+++ b/sys/libunicode/canfit.c
@@ -7,14 +7,17 @@ utf8·canfit(byte* s, int n)
     int  i;
     rune c;
 
-    if(n <= 0) return 0;
-    c = *(ubyte*) s;
-    if(c < Tx) return 1;
+    if(n <= 0)
+        return 0;
 
-    for(i = 3; i < UTFmax + 1; i++){
-        if(c < Tbyte(i))
-            return n >= i - 1;
-    }
+    c = *(ubyte*)s;
+    if(c < TByte1)
+        return 1;
+
+    if(c < TByte3)
+        return n >= 2;
+    if(c < TByte4)
+        return n >= 3;
 
     return n >= UTFmax;
 }
diff --git a/sys/libunicode/decode.c b/sys/libunicode/decode.c
index 79271f2..01797f1 100644
--- a/sys/libunicode/decode.c
+++ b/sys/libunicode/decode.c
@@ -1,32 +1,90 @@
 #include "internal.h"
 
+#define ACCEPT 0  // automaton state: a complete, well-formed rune was read
+#define REJECT 12 // automaton state: the input is malformed
+
+static uint8 decode[] = {
+    /*
+     * the first part of the table maps each byte to a character class; this
+     * reduces the size of the transition table and yields the lead byte mask
+     */
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+    /*
+     * the second part is a transition table, indexed by 256 + state + class,
+     * that maps a state of the automaton and a character class to a state
+     */
+     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+    12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+int
+utf8·decode(char *s, rune *r)
+{
+    // runs the table automaton over at most UTFmax bytes of s; stores the
+    // decoded rune (RuneErr if malformed) in *r, returns the bytes consumed
+    uint8 *p = (uint8 *)s;
+    uint8 cls, state;
+    rune  cp;
+    int   len = 1;
+
+    cls   = decode[p[0]];
+    cp    = (0xFF >> cls) & p[0]; // the class of a lead byte is its mask shift
+    state = decode[256+ACCEPT+cls];
+    while(state > REJECT && len < UTFmax){
+        cls   = decode[p[len]];
+        cp    = (cp << 6) | (p[len] & TMask);
+        state = decode[256+state+cls];
+        len++;
+    }
+
+    if(state == ACCEPT){
+        *r = cp;
+        return len;
+    }
+    *r = RuneErr; // malformed input always consumes exactly one byte
+    return 1;
+}
+
+#if 0
 int
-utf8·decode(byte *s, rune* r)
+utf8·decode(byte *s, rune *r)
 {
     int c[UTFmax], i;
     rune l;
 
     c[0] = *(ubyte*)(s);
-    if(c[0] < Tx) {
+    if(c[0] < Tx){
         *r = c[0];
         return 1;
     }
 
     l = c[0];
-    for(i = 1; i < UTFmax; i++) {
+    for(i = 1; i < UTFmax; i++){
         c[i]  = *(ubyte*)(s+i);
         c[i] ^= Tx;
-        if (c[i] & Testx) goto bad;
+        if(c[i] & Testx) goto bad;
 
         l = (l << Bitx) | c[i];
-        if(c[0] < Tbyte(i + 2)) {
+        if(c[0] < Tbyte(i + 2)){
             l &= RuneX(i + 1);
-            if (i == 1) {
-                if (c[0] < Tbyte(2) || l <= Rune1)
+            if(i == 1){
+                if(c[0] < Tbyte(2) || l <= Rune1)
                     goto bad;
-            } else if (l <= RuneX(i) || l > RuneMax)
+            }else if(l <= RuneX(i) || l > RuneMax)
                 goto bad;
-            if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+
+            if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
                 goto bad;
 
             *r = l;
@@ -37,4 +95,4 @@ bad:
     *r = RuneErr;
     return 1;
 }
-
+#endif
diff --git a/sys/libunicode/decodeprev.c b/sys/libunicode/decodeprev.c
new file mode 100644
index 0000000..27dced6
--- /dev/null
+++ b/sys/libunicode/decodeprev.c
@@ -0,0 +1,60 @@
+#include "internal.h"
+
+#define ACCEPT 0  // automaton state: a complete, well-formed rune was read
+#define REJECT 12 // automaton state: the input is malformed
+
+static uint8 decode[] = {
+    /*
+     * the first part of the table maps each byte to a character class; this
+     * reduces the size of the transition table and yields the lead byte mask.
+     */
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+         7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+         8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+    /*
+     * The second part is a transition table, indexed by 256 + state + class,
+     * that maps a state of the automaton and a character class to a state.
+     */
+    //   0  1  2  3  4  5  6  7  8  9 10 11
+         0,24,12,12,12,12,12,24,12,24,12,12,
+         0,24,12,12,12,12,12,24,12,24,12,12,
+        12,36, 0,12,12,12,12,48,12,36,12,12,
+        12,60,12, 0, 0,12,12,72,12,72,12,12,
+        12,60,12, 0,12,12,12,72,12,72, 0,12,
+        12,12,12,12,12, 0, 0,12,12,12,12,12,
+        12,12,12,12,12,12,12,12,12,12,12, 0
+};
+
+int
+utf8·decodeprev(byte *s, rune *r)
+{
+    // decodes the rune whose final byte is s[0] by reading s[0], s[-1], ...;
+    // stores it in *r and returns the number of bytes it occupies
+    int   n;
+    rune  v;
+    uint8 b, t, d, x=ACCEPT;
+
+    v=0, n=0, d=0;
+nextbyte:
+    b = ((uint8 *)s)[-n++];
+    t = decode[b];
+    x = decode[256+x+t];
+    if(x > REJECT && n < UTFmax){
+        v = v | ((b & TMask) << d);
+        d += 6;
+        goto nextbyte;
+    }
+    // malformed input: report RuneErr and step back a single byte, like
+    // utf8·decode, so bytes peeked at while scanning can be decoded again
+    if(x != ACCEPT){
+        *r = RuneErr;
+        return 1;
+    }
+    *r = v | (((0xFFu >> t) & b) << d);
+    return n;
+}
diff --git a/sys/libunicode/encode.c b/sys/libunicode/encode.c
index 8f4d212..fa7c93e 100644
--- a/sys/libunicode/encode.c
+++ b/sys/libunicode/encode.c
@@ -1,6 +1,43 @@
 #include "internal.h"
 
 int
+utf8·encode(rune *r, byte *s)
+{
+    rune c;
+    /*
+     * writes the UTF-8 encoding of *r into s and returns the byte count.
+     * invalid runes are swapped for RuneErr before the length is chosen,
+     * so the replacement is never emitted as an overlong 4 byte sequence.
+     */
+    c = *r;
+    if(c > RuneMax || (RuneSurrogateMin <= c && c <= RuneSurrogateMax))
+        c = RuneErr;
+
+    if(c < Rune1Byte){ // 7 bits
+        s[0] = (uint8)c;
+        return 1;
+    }
+    if(c < Rune2Byte){ // 11 bits
+        s[0] = TByte1 | (c >> 6);
+        s[1] = Tx     | (c & TMask);
+        return 2;
+    }
+    if(c < Rune3Byte){ // 16 bits
+        s[0] = TByte2 | ((c >> 12));
+        s[1] = Tx     | ((c >> 6) & TMask);
+        s[2] = Tx     | ((c)      & TMask);
+        return 3;
+    }
+    // 21 bits
+    s[0] = TByte3 | ((c >> 18));
+    s[1] = Tx     | ((c >> 12) & TMask);
+    s[2] = Tx     | ((c >> 6)  & TMask);
+    s[3] = Tx     | ((c)       & TMask);
+    return 4;
+}
+
+#if 0
+int
 utf8·encode(rune* r, byte* s)
 {
     int i, j;
@@ -29,3 +66,4 @@ utf8·encode(rune* r, byte* s)
 
     return UTFmax;
 }
+#endif
diff --git a/sys/libunicode/find.c b/sys/libunicode/find.c
index 659ab5b..d75feb8 100644
--- a/sys/libunicode/find.c
+++ b/sys/libunicode/find.c
@@ -7,17 +7,18 @@ utf8·find(byte* s, rune c)
     rune r;
     int  n;
 
-    if(c < RuneSync)
+    if(c < Tx)
         return strchr(s, c);
 
     for(;;){
         c1 = *(ubyte*)s;
-        if(c1 < RuneSelf) {
+        if(c1 < Tx){
             if(c1 == 0) return nil;
             if(c1 == c) return s;
             s++;
             continue;
         }
+
         n = utf8·decode(s, &r);
 
         if(r == c)
diff --git a/sys/libunicode/findlast.c b/sys/libunicode/findlast.c
index 3a4ed38..ab25ab2 100644
--- a/sys/libunicode/findlast.c
+++ b/sys/libunicode/findlast.c
@@ -7,19 +7,21 @@ utf8·findlast(byte* s, rune c)
     rune r;
     byte *l;
 
-    if(c < RuneSync)
+    if(c < Tx)
         return strrchr(s, c);
 
     l = nil;
     for(;;){
         c1 = *(ubyte*)s;
-        if(c1 < RuneSelf) {
+        if(c1 < Tx){
             if(c1 == 0) return l;
             if(c1 == c) l = s;
             s++;
             continue;
         }
+
         c1 = utf8·decode(s, &r);
+
         if(r == c)
             l = s;
 
diff --git a/sys/libunicode/internal.h b/sys/libunicode/internal.h
index 31ed2ae..6b0657b 100644
--- a/sys/libunicode/internal.h
+++ b/sys/libunicode/internal.h
@@ -4,7 +4,9 @@
 #include <base.h>
 #include <libunicode.h>
 
-/* UTF-8 code
+/*
+ * NOTE: we use the preprocessor to ensure we have unsigned constants.
+ * UTF-8 code:
  * 1 byte:
  * 0xxxxxxx
  * 2 byte:
@@ -14,22 +16,23 @@
  * 4 byte:
  * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  */
-#define Bit(i) (7-(i))
-/* 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */
-#define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
-/* 0000 0000 0000 0111 1111 1111 */
-#define	RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
 
-enum
-{
-    Bitx  = Bit(1),
-    Tx    = Tbyte(1),
-    Rune1 = (1 << (Bit(0)+0*Bitx)) - 1,
+#define Tx     0x80u // 0b10000000 transfer header
+#define TMask  0x3Fu // 0b00111111 transfer mask
 
-    Maskx = (1 << Bitx) - 1, /* 0011 1111 */
-    Testx = Maskx ^ 0xff,    /* 1100 0000 */
+#define TByte1 0xC0u // 0b11000000 lead byte tag of a 2 byte sequence
+#define TByte2 0xE0u // 0b11100000 lead byte tag of a 3 byte sequence
+#define TByte3 0xF0u // 0b11110000 lead byte tag of a 4 byte sequence
+#define TByte4 0xF8u // 0b11111000 one past the 4 byte lead byte range
 
-    SurrogateMin = 0xD800,
-    SurrogateMax = 0xDFFF,
-    Bad = RuneErr,
-};
+#define Rune1Byte 0x000080u  // 1 << 7   (exclusive limit of 1 byte runes)
+#define Rune2Byte 0x000800u  // 1 << 11  (exclusive limit of 2 byte runes)
+#define Rune3Byte 0x010000u  // 1 << 16  (exclusive limit of 3 byte runes)
+#define Rune4Byte 0x200000u  // 1 << 21  (exclusive limit of 4 byte runes)
+
+#define RuneMax   0x10FFFFu
+#define RuneMask  0x1FFFFFu
+
+/* UTF-16 surrogate halves: never valid as standalone runes */
+#define RuneSurrogateMin 0x00D800u
+#define RuneSurrogateMax 0x00DFFFu
diff --git a/sys/libunicode/len.c b/sys/libunicode/len.c
index c461bf3..8fbd679 100644
--- a/sys/libunicode/len.c
+++ b/sys/libunicode/len.c
@@ -10,11 +10,11 @@ utf8·len(char *s)
     n = 0;
     for(;;){
         c = *(uchar*)s;
-        if(c < RuneSelf) {
+        if(c < Tx){
             if(c == 0)
                 return n;
             s++;
-        } else
+        }else
             s += utf8·decode(s, &r);
         n++;
     }
diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk
index 100697b..aca7348 100644
--- a/sys/libunicode/rules.mk
+++ b/sys/libunicode/rules.mk
@@ -5,6 +5,7 @@ UNICODE = 14.0.0
 SRCS_$(d) := \
 	$(d)/encode.c \
 	$(d)/decode.c \
+	$(d)/decodeprev.c \
 	$(d)/find.c \
 	$(d)/findlast.c \
 	$(d)/canfit.c \
@@ -16,6 +17,7 @@ LIBS_$(d) := $(d)/libunicode.a
 
 include share/paths.mk
 
+# rune categories
 $(d)/vendor/UnicodeData-$(UNICODE).txt:
 	@echo "GET	UnicodeData.txt";\
 	curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@
@@ -23,10 +25,23 @@ $(d)/vendor/UnicodeData-$(UNICODE).txt:
 $(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(OBJ_DIR)/sys/base/base.a
 	$(COMPLINK)
 
+GENS += $(d)/vendor/mkrunetype
+
 $(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
 	@$(dir $@)vendor/mkrunetype $< > $@
 
-GENS += $(d)/vendor/mkrunetype
+# grapheme boundaries
+$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt:
+	@echo "GET	GraphemeBreakProperty.txt";\
+	curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@
+
+$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
+	$^ > $@
+
+GENS += $(d)/vendor/mkgraphemedata
 
 $(LIBS_$(d)): $(OBJS_$(d))
 	$(ARCHIVE)
diff --git a/sys/libunicode/vendor/mkrunetype.c b/sys/libunicode/vendor/mkrunetype.c
index 0372e2b..fc4301c 100644
--- a/sys/libunicode/vendor/mkrunetype.c
+++ b/sys/libunicode/vendor/mkrunetype.c
@@ -418,7 +418,7 @@ static
 void
 usage(void)
 {
-    fprintf(stderr, "usage: mkutf8data <UnicodeData.txt>\n");
+    fprintf(stderr, "usage: mkrunetype <UnicodeData.txt>\n");
     exit(1);
 }
 
@@ -564,6 +564,7 @@ main(int argc, char *argv[])
     makeis("upper", table.isupper, 1, 0);
     makeis("lower", table.islower, 1, 0);
     makeis("title", table.istitle, 1, 0);
+    makeis("punct", table.ispunct, 1, 0);
 
     mkto("upper", table.toupper, 1, 0);
     mkto("lower", table.tolower, 1, 0);
author	Nicholas Noll <nbnoll@eml.cc>	2021-10-28 11:55:47 -0700
committer	Nicholas Noll <nbnoll@eml.cc>	2021-10-28 11:55:47 -0700
commit	6318ddb69ad8380694bbca35ca5a72230c2ee694 (patch)
tree	e9ef65d79c264d88c2d02402a587c2561b5b1299
parent	189b9e23edfe60b7e82c4c7b6071a3f98799653a (diff)