Chore: reorganize libutf and libfmt into base

I found the split to be arbitrary. Better to include the functionality in the standard library. I also split the headers to allow for more granular inclusion (but the library is still monolithic). The only ugliness is the circular dependency introduced with libutf's generated functions. We put explicit prereqs with the necessary object files instead.
author: Nicholas <nbnoll@eml.cc> 2021-11-20 10:53:19 -0800
committer: Nicholas <nbnoll@eml.cc> 2021-11-20 10:53:19 -0800
commit: a9bfe650038afea8b751175cac16f6027345e45f (patch)
tree: 9a7f9feb76a64bb3efe573036d80b7bdbf8a59a5 /src/base/utf/decode.c
parent: 1c8d4e69205fd875f6bec3fa3bd929c2e7f52f62 (diff)
1 files changed, 98 insertions, 0 deletions
diff --git a/src/base/utf/decode.c b/src/base/utf/decode.c
new file mode 100644
index 0000000..01797f1
--- /dev/null
+++ b/src/base/utf/decode.c
@@ -0,0 +1,98 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static uint8 decode[] = {
+    /*
+     * the first part of the table maps bytes to character classes that
+     * to reduce the size of the transition table and create bitmasks
+     */
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+    /*
+     * the second part is a transition table that maps a combination
+     * of a state of the automaton and a character class to a state
+     */
+     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+    12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+int
+utf8·decode(char *s, rune *r)
+{
+    int   n;
+    rune  v;
+    uint8 b, t, x=ACCEPT;
+
+    b = ((uint8 *)s)[0];
+    t = decode[b];
+    v = (0xFF >> t) & b;
+    x = decode[256+x+t];
+
+    for(n=1; x > REJECT && n < UTFmax; n++){
+        b = ((uint8 *)s)[n];
+        t = decode[b];
+        v = (v << 6) | (b & TMask);
+        x = decode[256+x+t];
+    }
+
+    if(x != ACCEPT){
+        *r = RuneErr;
+        return 1;
+    }
+
+    *r = v;
+    return n;
+}
+
+#if 0
+int
+utf8·decode(byte *s, rune *r)
+{
+    int c[UTFmax], i;
+    rune l;
+
+    c[0] = *(ubyte*)(s);
+    if(c[0] < Tx){
+        *r = c[0];
+        return 1;
+    }
+
+    l = c[0];
+    for(i = 1; i < UTFmax; i++){
+        c[i]  = *(ubyte*)(s+i);
+        c[i] ^= Tx;
+        if(c[i] & Testx) goto bad;
+
+        l = (l << Bitx) | c[i];
+        if(c[0] < Tbyte(i + 2)){
+            l &= RuneX(i + 1);
+            if(i == 1){
+                if(c[0] < Tbyte(2) || l <= Rune1)
+                    goto bad;
+            }else if(l <= RuneX(i) || l > RuneMax)
+                goto bad;
+
+            if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+                goto bad;
+
+            *r = l;
+            return i + 1;
+        }
+    }
+bad:
+    *r = RuneErr;
+    return 1;
+}
+#endif
author	Nicholas <nbnoll@eml.cc>	2021-11-20 10:53:19 -0800
committer	Nicholas <nbnoll@eml.cc>	2021-11-20 10:53:19 -0800
commit	a9bfe650038afea8b751175cac16f6027345e45f (patch)
tree	9a7f9feb76a64bb3efe573036d80b7bdbf8a59a5 /src/base/utf/decode.c
parent	1c8d4e69205fd875f6bec3fa3bd929c2e7f52f62 (diff)