diff options
author | Nicholas Noll <nbnoll@eml.cc> | 2021-10-28 11:55:47 -0700 |
---|---|---|
committer | Nicholas Noll <nbnoll@eml.cc> | 2021-10-28 11:55:47 -0700 |
commit | 6318ddb69ad8380694bbca35ca5a72230c2ee694 (patch) | |
tree | e9ef65d79c264d88c2d02402a587c2561b5b1299 /sys/libunicode/decode.c | |
parent | 189b9e23edfe60b7e82c4c7b6071a3f98799653a (diff) |
Feat(libunicode): cleaned up decode/encode
Additionally, decode can now apply backwards on a byte string.
Diffstat (limited to 'sys/libunicode/decode.c')
-rw-r--r-- | sys/libunicode/decode.c | 78 |
1 files changed, 68 insertions, 10 deletions
diff --git a/sys/libunicode/decode.c b/sys/libunicode/decode.c index 79271f2..01797f1 100644 --- a/sys/libunicode/decode.c +++ b/sys/libunicode/decode.c @@ -1,32 +1,90 @@ #include "internal.h" +#define ACCEPT 0 +#define REJECT 12 + +static uint8 decode[] = { + /* + * the first part of the table (256 entries) maps each byte to a character class; + * this shrinks the transition table, and the class doubles as the lead-byte mask shift + */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + /* + * the second part is a transition table that maps a combination of a state + * of the automaton and a character class to a state, indexed as decode[256+state+class] + */ + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + +int +utf8·decode(char *s, rune *r) +{ + int n; + rune v; + uint8 b, t, x=ACCEPT; + + b = ((uint8 *)s)[0]; + t = decode[b]; + v = (0xFF >> t) & b; + x = decode[256+x+t]; + + for(n=1; x > REJECT && n < UTFmax; n++){ + b = ((uint8 *)s)[n]; + t = decode[b]; + v = (v << 6) | (b & TMask); + x = decode[256+x+t]; + } + + if(x != ACCEPT){ + *r = RuneErr; + return 1; + } + + *r = v; + return n; +} + +#if 0 int -utf8·decode(byte *s, rune* r) +utf8·decode(byte *s, rune *r) { int c[UTFmax], i; rune l; c[0] = *(ubyte*)(s); - if(c[0] < Tx) { + if(c[0] < Tx){ *r = 
c[0]; return 1; } l = c[0]; - for(i = 1; i < UTFmax; i++) { + for(i = 1; i < UTFmax; i++){ c[i] = *(ubyte*)(s+i); c[i] ^= Tx; - if (c[i] & Testx) goto bad; + if(c[i] & Testx) goto bad; l = (l << Bitx) | c[i]; - if(c[0] < Tbyte(i + 2)) { + if(c[0] < Tbyte(i + 2)){ l &= RuneX(i + 1); - if (i == 1) { - if (c[0] < Tbyte(2) || l <= Rune1) + if(i == 1){ + if(c[0] < Tbyte(2) || l <= Rune1) goto bad; - } else if (l <= RuneX(i) || l > RuneMax) + }else if(l <= RuneX(i) || l > RuneMax) goto bad; - if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) + + if(i == 2 && SurrogateMin <= l && l <= SurrogateMax) goto bad; *r = l; @@ -37,4 +95,4 @@ bad: *r = RuneErr; return 1; } - +#endif |