From 6318ddb69ad8380694bbca35ca5a72230c2ee694 Mon Sep 17 00:00:00 2001
From: Nicholas Noll
Date: Thu, 28 Oct 2021 11:55:47 -0700
Subject: Feat(libunicode): cleaned up decode/encode

Additionally, decode can now apply backwards on a byte string.
---
 sys/libunicode/canfit.c            | 17 +++++----
 sys/libunicode/decode.c            | 78 +++++++++++++++++++++++++++++++++-----
 sys/libunicode/decodeprev.c        | 60 +++++++++++++++++++++++++++++
 sys/libunicode/encode.c            | 38 +++++++++++++++++++
 sys/libunicode/find.c              |  5 ++-
 sys/libunicode/findlast.c          |  6 ++-
 sys/libunicode/internal.h          | 37 +++++++++---------
 sys/libunicode/len.c               |  4 +-
 sys/libunicode/rules.mk            | 17 ++++++++-
 sys/libunicode/vendor/mkrunetype.c |  3 +-
 10 files changed, 223 insertions(+), 42 deletions(-)
 create mode 100644 sys/libunicode/decodeprev.c

(limited to 'sys')

diff --git a/sys/libunicode/canfit.c b/sys/libunicode/canfit.c
index d44c9e6..4579ab3 100644
--- a/sys/libunicode/canfit.c
+++ b/sys/libunicode/canfit.c
@@ -7,14 +7,17 @@ utf8·canfit(byte* s, int n)
 	int i;
 	rune c;
 
-	if(n <= 0) return 0;
-	c = *(ubyte*) s;
-	if(c < Tx) return 1;
+	if(n <= 0)
+		return 0;
 
-	for(i = 3; i < UTFmax + 1; i++){
-		if(c < Tbyte(i))
-			return n >= i - 1;
-	}
+	c = *(ubyte*)s;
+	if(c < TByte1) // ascii, or a stray continuation byte (decodes as a 1 byte error)
+		return 1;
+
+	if(c < TByte2) // 110xxxxx: leader of a 2 byte sequence
+		return n >= 2;
+	if(c < TByte3) // 1110xxxx: leader of a 3 byte sequence
+		return n >= 3;
 
 	return n >= UTFmax;
 }
diff --git a/sys/libunicode/decode.c b/sys/libunicode/decode.c
index 79271f2..01797f1 100644
--- a/sys/libunicode/decode.c
+++ b/sys/libunicode/decode.c
@@ -1,32 +1,90 @@
 #include "internal.h"
 
+#define ACCEPT 0
+#define REJECT 12
+
+static uint8 decode[] = {
+	/*
+	 * the first part of the table maps bytes to character classes, which
+	 * reduces the size of the transition table and doubles as a bitmask shift
+	 */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+	/*
+	 * the second part is a transition table that maps a combination
+	 * of a state of the automaton and a character class to a state
+	 */
+	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+	12,36,12,12,12,12,12,12,12,12,12,12,
+};
+/* decode the rune starting at s into *r; returns bytes consumed, or 1 with *r = RuneErr if malformed */
+int
+utf8·decode(char *s, rune *r)
+{
+	int n;
+	rune v;
+	uint8 b, t, x=ACCEPT;
+
+	b = ((uint8 *)s)[0];
+	t = decode[b];
+	v = (0xFF >> t) & b; // the class of a leader byte is also the shift that strips its header bits
+	x = decode[256+x+t];
+
+	for(n=1; x > REJECT && n < UTFmax; n++){
+		b = ((uint8 *)s)[n];
+		t = decode[b];
+		v = (v << 6) | (b & TMask);
+		x = decode[256+x+t];
+	}
+
+	if(x != ACCEPT){
+		*r = RuneErr;
+		return 1;
+	}
+
+	*r = v;
+	return n;
+}
+
+#if 0
 int
-utf8·decode(byte *s, rune* r)
+utf8·decode(byte *s, rune *r)
 {
 	int c[UTFmax], i;
 	rune l;
 
 	c[0] = *(ubyte*)(s);
-	if(c[0] < Tx) {
+	if(c[0] < Tx){
 		*r = c[0];
 		return 1;
 	}
 
 	l = c[0];
-	for(i = 1; i < UTFmax; i++) {
+	for(i = 1; i < UTFmax; i++){
 		c[i] = *(ubyte*)(s+i);
 		c[i] ^= Tx;
-		if (c[i] & Testx) goto bad;
+		if(c[i] & Testx) goto bad;
 
 		l = (l << Bitx) | c[i];
-		if(c[0] < Tbyte(i + 2)) {
+		if(c[0] < Tbyte(i + 2)){
 			l &= RuneX(i + 1);
-			if (i == 1) {
-				if (c[0] < Tbyte(2) || l <= Rune1)
+			if(i == 1){
+				if(c[0] < Tbyte(2) || l <= Rune1)
 					goto bad;
-			} else if (l <= RuneX(i) || l > RuneMax)
+			}else if(l <= RuneX(i) || l > RuneMax)
 				goto bad;
-			if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+
+			if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
 				goto bad;
 
 			*r = l;
@@ -37,4 +95,4 @@ bad:
 	*r = RuneErr;
 	return 1;
 }
-
+#endif
diff --git a/sys/libunicode/decodeprev.c b/sys/libunicode/decodeprev.c
new file mode 100644
index 0000000..27dced6
--- /dev/null
+++ b/sys/libunicode/decodeprev.c
@@ -0,0 +1,60 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static uint8 decode[] = {
+	/*
+	 * the first part of the table maps bytes to character classes, which
+	 * reduces the size of the transition table and doubles as a bitmask shift.
+	 */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+	/*
+	 * The second part is a transition table that maps a combination
+	 * of a state of the automaton and a character class to a state.
+	 */
+	// columns are character classes 0..11; one row per state 0,12,24,...,72
+	0,24,12,12,12,12,12,24,12,24,12,12, //  0: start, expecting the last byte of a rune
+	0,24,12,12,12,12,12,24,12,24,12,12, // 12: reject (row is never consulted)
+	12,36, 0,12,12,12,12,48,12,36,12,12, // 24: 1 trailing byte seen
+	12,60,12, 0, 0,12,12,72,12,72,12,12, // 36: 2 seen, the nearest to the leader is 80..9F
+	12,60,12, 0,12,12,12,72,12,72, 0,12, // 48: 2 seen, the nearest to the leader is A0..BF
+	12,12,12,12,12, 0, 0,12,12,12,12,12, // 60: 3 seen, the nearest to the leader is 80..8F
+	12,12,12,12,12,12, 0,12,12,12,12, 0  // 72: 3 seen, the nearest to the leader is 90..BF (F0 and F1..F3 accept)
+};
+/* decode the rune whose last byte is s[0], scanning backwards; returns its byte length, or 1 with *r = RuneErr */
+int
+utf8·decodeprev(byte *s, rune *r)
+{
+	int n;
+	rune v;
+	uint8 b, t, d, x=ACCEPT;
+
+	v=0, n=0, d=0;
+nextbyte:
+	b = ((uint8 *)s)[-n++]; // reads s[0], s[-1], ... (at most UTFmax bytes)
+	t = decode[b];
+	x = decode[256+x+t];
+
+	if(x > REJECT && n < UTFmax){ // trailing byte: bank its 6 payload bits and keep walking
+		v = v | ((b & TMask) << d);
+		d += 6;
+		goto nextbyte;
+	}
+
+	if(x != ACCEPT){ // malformed: consume a single byte so the caller can resynchronize
+		*r = RuneErr;
+		return 1;
+	}
+
+	v |= (((0xFFu >> t) & b) << d); // leader byte: its class is the shift that strips the header bits
+	*r = v;
+	return n;
+}
diff --git a/sys/libunicode/encode.c b/sys/libunicode/encode.c
index 8f4d212..fa7c93e 100644
--- a/sys/libunicode/encode.c
+++ b/sys/libunicode/encode.c
@@ -1,5 +1,42 @@
 #include "internal.h"
 
+/* encode *r into s as utf8; returns bytes written (1 to 4). surrogates and runes > RuneMax are written as RuneErr */
+int
+utf8·encode(rune *r, byte *s)
+{
+	rune c;
+
+	c = *r;
+	if(c > RuneMax || (RuneSurrogateMin <= c && c <= RuneSurrogateMax))
+		c = RuneErr; // validate first so the replacement gets its own shortest encoding
+
+	if(c < Rune1Byte){ // 7 bits
+		s[0] = (uint8)c;
+		return 1;
+	}
+
+	if(c < Rune2Byte){ // 11 bits
+		s[0] = TByte1 | (c >> 6);
+		s[1] = Tx | (c & TMask);
+		return 2;
+	}
+
+	if(c < Rune3Byte){ // 16 bits
+		s[0] = TByte2 | ((c >> 12));
+		s[1] = Tx | ((c >> 6) & TMask);
+		s[2] = Tx | ((c) & TMask);
+		return 3;
+	}
+
+	// 21 bits
+	s[0] = TByte3 | ((c >> 18));
+	s[1] = Tx | ((c >> 12) & TMask);
+	s[2] = Tx | ((c >> 6) & TMask);
+	s[3] = Tx | ((c) & TMask);
+	return 4;
+}
+
+#if 0
 int
 utf8·encode(rune* r, byte* s)
 {
@@ -29,3 +66,4 @@ utf8·encode(rune* r, byte* s)
 
 	return UTFmax;
 }
+#endif
diff --git a/sys/libunicode/find.c b/sys/libunicode/find.c
index 659ab5b..d75feb8 100644
--- a/sys/libunicode/find.c
+++ b/sys/libunicode/find.c
@@ -7,17 +7,18 @@ utf8·find(byte* s, rune c)
 	rune r;
 	int n;
 
-	if(c < RuneSync)
+	if(c < Tx)
 		return strchr(s, c);
 
 	for(;;){
 		c1 = *(ubyte*)s;
-		if(c1 < RuneSelf) {
+		if(c1 < Tx){
 			if(c1 == 0) return nil;
 			if(c1 == c) return s;
 			s++;
 			continue;
 		}
+
 		n = utf8·decode(s, &r);
 
 		if(r == c)
diff --git a/sys/libunicode/findlast.c b/sys/libunicode/findlast.c
index 3a4ed38..ab25ab2 100644
--- a/sys/libunicode/findlast.c
+++ b/sys/libunicode/findlast.c
@@ -7,19 +7,21 @@ utf8·findlast(byte* s, rune c)
 	rune r;
 	byte *l;
 
-	if(c < RuneSync)
+	if(c < Tx)
 		return strrchr(s, c);
 
 	l = nil;
 	for(;;){
 		c1 = *(ubyte*)s;
-		if(c1 < RuneSelf) {
+		if(c1 < Tx){
 			if(c1 == 0) return l;
 			if(c1 == c) l = s;
 			s++;
 			continue;
 		}
+
 		c1 = utf8·decode(s, &r);
+
 		if(r == c)
 			l = s;
 
diff --git a/sys/libunicode/internal.h b/sys/libunicode/internal.h
index 31ed2ae..6b0657b 100644
--- a/sys/libunicode/internal.h
+++ b/sys/libunicode/internal.h
@@ -4,7 +4,9 @@
 #include
 #include
 
-/* UTF-8 code
+/*
+ * NOTE: we use the preprocessor to ensure we have unsigned constants.
+ * UTF-8 code:
  * 1 byte:
  * 0xxxxxxx
  * 2 byte:
@@ -14,22 +16,23 @@
  * 4 byte:
  * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  */
-#define Bit(i) (7-(i))
-/* 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */
-#define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
-/* 0000 0000 0000 0111 1111 1111 */
-#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
 
-enum
-{
-	Bitx = Bit(1),
-	Tx = Tbyte(1),
-	Rune1 = (1 << (Bit(0)+0*Bitx)) - 1,
+#define Tx 0x80u // 0b10000000 transfer header
+#define TMask 0x3Fu // 0b00111111 transfer mask
 
-	Maskx = (1 << Bitx) - 1, /* 0011 1111 */
-	Testx = Maskx ^ 0xff, /* 1100 0000 */
+#define TByte1 0xC0u // 0b11000000 header of a 2 byte leader
+#define TByte2 0xE0u // 0b11100000 header of a 3 byte leader
+#define TByte3 0xF0u // 0b11110000 header of a 4 byte leader
+#define TByte4 0xF8u // 0b11111000 past the last 4 byte leader
 
-	SurrogateMin = 0xD800,
-	SurrogateMax = 0xDFFF,
-	Bad = RuneErr,
-};
+#define Rune1Byte 0x000080u // 1 << 7: runes below this take 1 byte
+#define Rune2Byte 0x000800u // 1 << 11: runes below this take 2 bytes
+#define Rune3Byte 0x010000u // 1 << 16: runes below this take 3 bytes
+#define Rune4Byte 0x200000u // 1 << 21: runes below this take 4 bytes
+
+#define RuneMax 0x10FFFFu
+#define RuneMask 0x1FFFFFu
+
+/* UTF-16 nonsense */
+#define RuneSurrogateMin 0x00D800u
+#define RuneSurrogateMax 0x00DFFFu
diff --git a/sys/libunicode/len.c b/sys/libunicode/len.c
index c461bf3..8fbd679 100644
--- a/sys/libunicode/len.c
+++ b/sys/libunicode/len.c
@@ -10,11 +10,11 @@ utf8·len(char *s)
 	n = 0;
 	for(;;){
 		c = *(uchar*)s;
-		if(c < RuneSelf) {
+		if(c < Tx){
 			if(c == 0)
 				return n;
 			s++;
-		} else
+		}else
 			s += utf8·decode(s, &r);
 		n++;
 	}
diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk
index 100697b..aca7348 100644
--- a/sys/libunicode/rules.mk
+++ b/sys/libunicode/rules.mk
@@ -5,6 +5,7 @@ UNICODE = 14.0.0
 SRCS_$(d) := \
 	$(d)/encode.c \
 	$(d)/decode.c \
+	$(d)/decodeprev.c \
 	$(d)/find.c \
 	$(d)/findlast.c \
 	$(d)/canfit.c \
@@ -16,6 +17,7 @@ LIBS_$(d) := $(d)/libunicode.a
 
 include share/paths.mk
 
+# rune categories
 $(d)/vendor/UnicodeData-$(UNICODE).txt:
 	@echo "GET UnicodeData.txt";\
 	curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@
@@ -23,10 +25,23 @@ $(d)/vendor/UnicodeData-$(UNICODE).txt:
 $(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(OBJ_DIR)/sys/base/base.a
 	$(COMPLINK)
 
+GENS += $(d)/vendor/mkrunetype
+
 $(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
 	@$(dir $@)vendor/mkrunetype $< > $@
 
-GENS += $(d)/vendor/mkrunetype
+# grapheme boundaries
+$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt:
+	@echo "GET GraphemeBreakProperty.txt";\
+	curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@
+
+$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
+	$^ > $@
+
+GENS += $(d)/vendor/mkgraphemedata
 
 $(LIBS_$(d)): $(OBJS_$(d))
 	$(ARCHIVE)
diff --git a/sys/libunicode/vendor/mkrunetype.c b/sys/libunicode/vendor/mkrunetype.c
index 0372e2b..fc4301c 100644
--- a/sys/libunicode/vendor/mkrunetype.c
+++ b/sys/libunicode/vendor/mkrunetype.c
@@ -418,7 +418,7 @@ static
 void
 usage(void)
 {
-	fprintf(stderr, "usage: mkutf8data \n");
+	fprintf(stderr, "usage: mkrunetype \n");
 	exit(1);
 }
 
@@ -564,6 +564,7 @@ main(int argc, char *argv[])
 	makeis("upper", table.isupper, 1, 0);
 	makeis("lower", table.islower, 1, 0);
 	makeis("title", table.istitle, 1, 0);
+	makeis("punct", table.ispunct, 1, 0);
 
 	mkto("upper", table.toupper, 1, 0);
 	mkto("lower", table.tolower, 1, 0);
-- 
cgit v1.2.1