about | summary | refs | log | tree | commit | diff
path: root/sys/libunicode/decode.c
diff options
context:
space:
mode:
author: Nicholas Noll <nbnoll@eml.cc> 2021-10-28 11:55:47 -0700
committer: Nicholas Noll <nbnoll@eml.cc> 2021-10-28 11:55:47 -0700
commit: 6318ddb69ad8380694bbca35ca5a72230c2ee694 (patch)
tree: e9ef65d79c264d88c2d02402a587c2561b5b1299 /sys/libunicode/decode.c
parent: 189b9e23edfe60b7e82c4c7b6071a3f98799653a (diff)
Feat(libunicode): cleaned up decode/encode
Additionally, decode can now apply backwards on a byte string.
Diffstat (limited to 'sys/libunicode/decode.c')
-rw-r--r-- sys/libunicode/decode.c 78
1 files changed, 68 insertions, 10 deletions
diff --git a/sys/libunicode/decode.c b/sys/libunicode/decode.c
index 79271f2..01797f1 100644
--- a/sys/libunicode/decode.c
+++ b/sys/libunicode/decode.c
@@ -1,32 +1,90 @@
#include "internal.h"
+#define ACCEPT 0 /* automaton state: a complete sequence has been consumed */
+#define REJECT 12 /* automaton state: the bytes seen so far cannot form a valid sequence */
+
+static uint8 decode[] = {
+ /*
+ * the first part of the table (indices 0-255) maps each byte value to
+ * a character class in the range 0-11. grouping bytes into classes
+ * keeps the transition table small, and utf8·decode also uses the
+ * class of the leading byte as a shift count (0xFF >> class) to build
+ * the bitmask applied to that byte
+ */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+ /*
+ * the second part (starting at index 256) is a transition table that
+ * maps a combination of a state of the automaton and a character
+ * class to the next state. states are stored as multiples of 12, one
+ * row of 12 class entries per state, so the lookup is simply
+ * decode[256 + state + class]. row 0 is ACCEPT; row 12 is REJECT and
+ * every entry in it is 12 again
+ */
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
+};
+/*
+ * utf8·decode: decode a single rune from the bytes starting at s by
+ * running them through the automaton stored in decode[].
+ *
+ * on success the rune is stored in *r and the number of bytes consumed
+ * is returned. if the automaton does not end in ACCEPT, *r is set to
+ * RuneErr and 1 is returned. at most UTFmax bytes of s are read.
+ */
+int
+utf8·decode(char *s, rune *r)
+{
+ int n;
+ rune v;
+ uint8 b, t, x=ACCEPT;
+ /*
+ * leading byte: look up its class t, keep only the bits of b selected
+ * by the class-dependent mask (0xFF >> t), then take the first
+ * transition out of the initial state
+ */
+ b = ((uint8 *)s)[0];
+ t = decode[b];
+ v = (0xFF >> t) & b;
+ x = decode[256+x+t];
+ /*
+ * following bytes: REJECT (12) is the smallest non-zero state in the
+ * table, so x > REJECT means "neither accepted nor rejected yet".
+ * each pass shifts v left by 6 and ors in b & TMask; TMask is defined
+ * outside this file (presumably the low six bits - confirm in internal.h)
+ */
+ for(n=1; x > REJECT && n < UTFmax; n++){
+ b = ((uint8 *)s)[n];
+ t = decode[b];
+ v = (v << 6) | (b & TMask);
+ x = decode[256+x+t];
+ }
+ /* rejected, or still unfinished after UTFmax bytes: report one bad byte */
+ if(x != ACCEPT){
+ *r = RuneErr;
+ return 1;
+ }
+
+ *r = v;
+ return n;
+}
+
+#if 0
int
-utf8·decode(byte *s, rune* r)
+utf8·decode(byte *s, rune *r)
{
int c[UTFmax], i;
rune l;
c[0] = *(ubyte*)(s);
- if(c[0] < Tx) {
+ if(c[0] < Tx){
*r = c[0];
return 1;
}
l = c[0];
- for(i = 1; i < UTFmax; i++) {
+ for(i = 1; i < UTFmax; i++){
c[i] = *(ubyte*)(s+i);
c[i] ^= Tx;
- if (c[i] & Testx) goto bad;
+ if(c[i] & Testx) goto bad;
l = (l << Bitx) | c[i];
- if(c[0] < Tbyte(i + 2)) {
+ if(c[0] < Tbyte(i + 2)){
l &= RuneX(i + 1);
- if (i == 1) {
- if (c[0] < Tbyte(2) || l <= Rune1)
+ if(i == 1){
+ if(c[0] < Tbyte(2) || l <= Rune1)
goto bad;
- } else if (l <= RuneX(i) || l > RuneMax)
+ }else if(l <= RuneX(i) || l > RuneMax)
goto bad;
- if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+
+ if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
goto bad;
*r = l;
@@ -37,4 +95,4 @@ bad:
*r = RuneErr;
return 1;
}
-
+#endif