diff options
author | Nicholas <nbnoll@eml.cc> | 2021-11-20 10:53:19 -0800 |
---|---|---|
committer | Nicholas <nbnoll@eml.cc> | 2021-11-20 10:53:19 -0800 |
commit | a9bfe650038afea8b751175cac16f6027345e45f (patch) | |
tree | 9a7f9feb76a64bb3efe573036d80b7bdbf8a59a5 /src/base/utf/decode.c | |
parent | 1c8d4e69205fd875f6bec3fa3bd929c2e7f52f62 (diff) |
Chore: reorganize libutf and libfmt into base
I found the split to be arbitrary. Better to include the functionality
in the standard library. I also split the headers to allow for more
granular inclusion (but the library is still monolithic). The only
ugliness is the circular dependency introduced with libutf's generated
functions. We put explicit prereqs with the necessary object files
instead.
Diffstat (limited to 'src/base/utf/decode.c')
-rw-r--r-- | src/base/utf/decode.c | 98 |
1 files changed, 98 insertions, 0 deletions
diff --git a/src/base/utf/decode.c b/src/base/utf/decode.c new file mode 100644 index 0000000..01797f1 --- /dev/null +++ b/src/base/utf/decode.c @@ -0,0 +1,98 @@ +#include "internal.h" + +#define ACCEPT 0 +#define REJECT 12 + +static uint8 decode[] = { + /* + * the first part of the table maps bytes to character classes that + * to reduce the size of the transition table and create bitmasks + */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + /* + * the second part is a transition table that maps a combination + * of a state of the automaton and a character class to a state + */ + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + +int +utf8·decode(char *s, rune *r) +{ + int n; + rune v; + uint8 b, t, x=ACCEPT; + + b = ((uint8 *)s)[0]; + t = decode[b]; + v = (0xFF >> t) & b; + x = decode[256+x+t]; + + for(n=1; x > REJECT && n < UTFmax; n++){ + b = ((uint8 *)s)[n]; + t = decode[b]; + v = (v << 6) | (b & TMask); + x = decode[256+x+t]; + } + + if(x != ACCEPT){ + *r = RuneErr; + return 1; + } + + *r = v; + return n; +} + +#if 0 +int +utf8·decode(byte *s, rune *r) +{ + int c[UTFmax], i; + rune l; + + c[0] = *(ubyte*)(s); + if(c[0] < Tx){ + *r = c[0]; + return 1; + } + + l = c[0]; + for(i = 1; i < UTFmax; i++){ + c[i] = *(ubyte*)(s+i); + c[i] ^= Tx; + if(c[i] & Testx) goto bad; + + l = (l << Bitx) | c[i]; + if(c[0] < Tbyte(i + 2)){ + l &= RuneX(i + 1); + if(i == 1){ + if(c[0] < Tbyte(2) || l <= Rune1) + goto bad; + }else if(l <= RuneX(i) || l > RuneMax) + goto bad; + + if(i == 2 && SurrogateMin <= l && l <= SurrogateMax) + goto bad; + + *r = l; + return i + 1; + } + } +bad: + *r = RuneErr; + return 1; +} +#endif |