From 7ea1cdb7d31f00024f5a1d124b42cd19a03b959a Mon Sep 17 00:00:00 2001
From: Nicholas
Date: Wed, 10 Nov 2021 20:12:45 -0800
Subject: chore: libunicode -> libutf

---
 sys/libutf/canfit.c           |  23 +++++++++
 sys/libutf/decode.c           |  98 +++++++++++++++++++++++++++++++++++++
 sys/libutf/decodeprev.c       |  60 +++++++++++++++++++++++
 sys/libutf/encode.c           |  69 ++++++++++++++++++++++++++
 sys/libutf/find.c             |  31 ++++++++++++
 sys/libutf/findlast.c         |  32 ++++++++++++
 sys/libutf/internal.h         |  38 +++++++++++++++
 sys/libutf/len.c              |  21 ++++++++
 sys/libutf/rules.mk           |  76 +++++++++++++++++++++++++++++
 sys/libutf/runelen.c          |   8 +++
 sys/libutf/runetype-14.0.0.c  | 111 ++++++++++++++++++++++++++++++++++++++++++
 sys/libutf/runewidth-14.0.0.c |  71 +++++++++++++++++++++++++++
 12 files changed, 638 insertions(+)
 create mode 100644 sys/libutf/canfit.c
 create mode 100644 sys/libutf/decode.c
 create mode 100644 sys/libutf/decodeprev.c
 create mode 100644 sys/libutf/encode.c
 create mode 100644 sys/libutf/find.c
 create mode 100644 sys/libutf/findlast.c
 create mode 100644 sys/libutf/internal.h
 create mode 100644 sys/libutf/len.c
 create mode 100644 sys/libutf/rules.mk
 create mode 100644 sys/libutf/runelen.c
 create mode 100644 sys/libutf/runetype-14.0.0.c
 create mode 100644 sys/libutf/runewidth-14.0.0.c

(limited to 'sys/libutf')

diff --git a/sys/libutf/canfit.c b/sys/libutf/canfit.c
new file mode 100644
index 0000000..4579ab3
--- /dev/null
+++ b/sys/libutf/canfit.c
@@ -0,0 +1,23 @@
+#include "internal.h"
+
+/* returns 1 if the n bytes at s are enough to hold the sequence announced by s[0], else 0 */
+int
+utf8·canfit(byte* s, int n)
+{
+	/* c is the lead byte; each threshold below is the first lead byte of the next longer form */
+	rune c;
+
+	if(n <= 0)
+		return 0;
+
+	c = *(ubyte*)s;
+	if(c < TByte1)	/* 0xxxxxxx, or a stray 10xxxxxx: a single byte can be consumed */
+		return 1;
+
+	if(c < TByte2)	/* 110xxxxx: 2-byte sequence */
+		return n >= 2;
+	if(c < TByte3)	/* 1110xxxx: 3-byte sequence */
+		return n >= 3;
+
+	return n >= UTFmax;	/* 11110xxx and anything above it */
+}
diff --git a/sys/libutf/decode.c b/sys/libutf/decode.c
new file mode 100644
index 0000000..01797f1
--- /dev/null
+++ b/sys/libutf/decode.c
@@ -0,0 +1,98 @@
+#include "internal.h"
+
+#define ACCEPT 0	/* automaton state: a complete rune has been read */
+#define REJECT 12	/* automaton state: the bytes read cannot form a rune */
+
+static uint8 decode[] = {
+	/*
+	 * the first part of the table maps bytes to character classes,
+	 * to reduce the size of the transition table and create bitmasks
+	 */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 00..1F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 20..3F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 40..5F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 60..7F */
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,	/* 80..9F */
+	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* A0..BF */
+	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,	/* C0..DF */
+	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,	/* E0..FF */
+	/*
+	 * the second part is a transition table that maps a combination
+	 * of a state of the automaton and a character class to a state
+	 */
+	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+	12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+/* decodes the rune at the start of s into *r and returns its length in bytes; malformed input yields RuneErr and length 1 */
+int
+utf8·decode(char *s, rune *r)
+{
+	int n;
+	rune v;
+	uint8 b, t, x=ACCEPT;
+
+	b = ((uint8 *)s)[0];
+	t = decode[b];
+	v = (0xFF >> t) & b;	/* the class doubles as the shift that masks off the length marker */
+	x = decode[256+x+t];
+
+	for(n=1; x > REJECT && n < UTFmax; n++){	/* states above REJECT still expect continuation bytes */
+		b = ((uint8 *)s)[n];
+		t = decode[b];
+		v = (v << 6) | (b & TMask);
+		x = decode[256+x+t];
+	}
+
+	if(x != ACCEPT){
+		*r = RuneErr;
+		return 1;
+	}
+
+	*r = v;
+	return n;
+}
+
+#if 0
+int
+utf8·decode(byte *s, rune *r)
+{
+	int c[UTFmax], i;
+	rune l;
+
+	c[0] = *(ubyte*)(s);
+	if(c[0] < Tx){
+		*r = c[0];
+		return 1;
+	}
+
+	l = c[0];
+	for(i = 1; i < UTFmax; i++){
+		c[i] = *(ubyte*)(s+i);
+		c[i] ^= Tx;
+		if(c[i] & Testx) goto bad;
+
+		l = (l << Bitx) | c[i];
+		if(c[0] < Tbyte(i + 2)){
+			l &= RuneX(i + 1);
+			if(i == 1){
+				if(c[0] < Tbyte(2) || l <= Rune1)
+					goto bad;
+			}else if(l <= RuneX(i) || l > RuneMax)
+				goto bad;
+
+			if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+				goto bad;
+
+			*r = l;
+			return i + 1;
+		}
+	}
+bad:
+	*r = RuneErr;
+	return 1;
+}
+#endif
diff --git a/sys/libutf/decodeprev.c b/sys/libutf/decodeprev.c
new file mode 100644
index 0000000..27dced6
--- /dev/null
+++ b/sys/libutf/decodeprev.c
@@ -0,0 +1,60 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static uint8 decode[] = {
+	/*
+	 * the first part of the table maps bytes to character classes,
+	 * to reduce the size of the transition table and create bitmasks.
+	 */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 00..1F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 20..3F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 40..5F */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,	/* 60..7F */
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,	/* 80..9F */
+	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* A0..BF */
+	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,	/* C0..DF */
+	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,	/* E0..FF */
+	/*
+	 * The second part is a transition table that maps a combination
+	 * of a state of the automaton and a character class to a state; bytes are fed last-to-first.
+	 */
+	// 0  1  2  3  4  5  6  7  8  9 10 11
+	   0,24,12,12,12,12,12,24,12,24,12,12,	/*  0: start */
+	   0,24,12,12,12,12,12,24,12,24,12,12,	/* 12: REJECT; never consulted, the scan stops on entering it */
+	  12,36, 0,12,12,12,12,48,12,36,12,12,	/* 24: s[0] was a continuation byte */
+	  12,60,12, 0, 0,12,12,72,12,72,12,12,	/* 36: two continuation bytes, s[-1] in 80..9F */
+	  12,60,12, 0,12,12,12,72,12,72, 0,12,	/* 48: two continuation bytes, s[-1] in A0..BF */
+	  12,12,12,12,12, 0, 0,12,12,12,12,12,	/* 60: three continuation bytes, s[-2] in 80..8F: leads F1..F3, F4 */
+	  12,12,12,12,12,12, 0,12,12,12,12, 0	/* 72: three continuation bytes, s[-2] in 90..BF: leads F1..F3, F0 */
+};
+
+/* decodes the rune whose last byte is s[0], reading s[0], s[-1], ... into *r; returns its length in bytes, or 1 with *r = RuneErr when no valid sequence ends at s */
+int
+utf8·decodeprev(byte *s, rune *r)
+{
+	int n;
+	rune v;
+	uint8 b, t, d, x=ACCEPT;
+
+	v=0, n=0, d=0;
+nextbyte:
+	b = ((uint8 *)s)[-n++];
+	t = decode[b];
+	x = decode[256+x+t];
+
+	if(x > REJECT && n < UTFmax){	/* continuation byte: bank its 6 payload bits and step back one byte */
+		v = v | ((b & TMask) << d);
+		d += 6;
+		goto nextbyte;
+	}
+
+	if(x != ACCEPT){	/* consume a single byte on error, as utf8·decode does, so a valid neighbour is not swallowed */
+		*r = RuneErr;
+		return 1;
+	}
+
+	*r = v | (((0xFFu >> t) & b) << d);	/* lead byte: mask off the length marker, place its bits on top */
+	return n;
+}
diff --git a/sys/libutf/encode.c b/sys/libutf/encode.c
new file mode 100644
index 0000000..fa7c93e
--- /dev/null
+++ b/sys/libutf/encode.c
@@ -0,0 +1,69 @@
+#include "internal.h"
+
+/* encodes *r as UTF-8 into s and returns the number of bytes stored (1 to 4); s must have room for 4 bytes */
+int
+utf8·encode(rune *r, byte *s)
+{
+	rune c;
+
+	c = *r;
+	if(c < Rune1Byte){ // 7 bits
+		s[0] = (uint8)c;
+		return 1;
+	}
+
+	if(c < Rune2Byte){ // 11 bits
+		s[0] = TByte1 | (c >> 6);
+		s[1] = Tx | (c & TMask);
+		return 2;
+	}
+
+	// out-of-range runes and surrogate halves become RuneErr; tested before the 3-byte case (as at i == 3 in the reference below) so that RuneErr (defined elsewhere, presumably U+FFFD) is not forced into the 4-byte form
+	if(c > RuneMax || (RuneSurrogateMin <= c && c <= RuneSurrogateMax))
+		c = RuneErr;
+
+	if(c < Rune3Byte){ // 16 bits
+		s[0] = TByte2 | ((c >> 12));
+		s[1] = Tx | ((c >> 6) & TMask);
+		s[2] = Tx | ((c) & TMask);
+		return 3;
+	}
+
+	s[0] = TByte3 | ((c >> 18)); // 21 bits
+	s[1] = Tx | ((c >> 12) & TMask);
+	s[2] = Tx | ((c >> 6) & TMask);
+	s[3] = Tx | ((c) & TMask);
+	return 4;
+}
+
+#if 0
+int
+utf8·encode(rune* r, byte* s)
+{
+	int i, j;
+	rune c;
+
+	c = *r;
+	if(c <= Rune1) {
+		s[0] = c;
+		return 1;
+	}
+
+	for(i = 2; i < UTFmax + 1; i++){
+		if(i == 3){
+			if(c > RuneMax)
+				c = RuneErr;
+			if(SurrogateMin <= c && c <= SurrogateMax)
+				c = RuneErr;
+		}
+		if(c <= RuneX(i) || i == UTFmax) {
+			s[0] = Tbyte(i) | (c >> (i - 1)*Bitx);
+			for(j = 1; j < i; j++)
+				s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
+			return i;
+		}
+	}
+
+	return UTFmax;
+}
+#endif
diff --git
a/sys/libutf/find.c b/sys/libutf/find.c
new file mode 100644
index 0000000..d75feb8
--- /dev/null
+++ b/sys/libutf/find.c
@@ -0,0 +1,31 @@
+#include "internal.h"
+
+/* returns a pointer to the first occurrence of rune c in the NUL-terminated string s, or nil if there is none */
+byte*
+utf8·find(byte* s, rune c)
+{
+	long c1;
+	rune r;
+	int n;
+
+	if(c < Tx)	/* c is encoded as a single byte: a plain byte search finds it */
+		return strchr(s, c);
+
+	for(;;){
+		c1 = *(ubyte*)s;
+		if(c1 < Tx){	/* one-byte rune; it cannot equal c, which is >= Tx here */
+			if(c1 == 0) return nil;
+			s++;
+			continue;
+		}
+
+		n = utf8·decode(s, &r);
+
+		if(r == c)
+			return s;
+
+		s += n;
+	}
+
+	return nil;
+}
diff --git a/sys/libutf/findlast.c b/sys/libutf/findlast.c
new file mode 100644
index 0000000..ab25ab2
--- /dev/null
+++ b/sys/libutf/findlast.c
@@ -0,0 +1,32 @@
+#include "internal.h"
+
+/* returns a pointer to the last occurrence of rune c in the NUL-terminated string s, or nil if there is none */
+byte*
+utf8·findlast(byte* s, rune c)
+{
+	long c1;
+	rune r;
+	byte *l;
+
+	if(c < Tx)	/* c is encoded as a single byte: a plain byte search finds it */
+		return strrchr(s, c);
+
+	l = nil;	/* most recent match seen so far */
+	for(;;){
+		c1 = *(ubyte*)s;
+		if(c1 < Tx){	/* one-byte rune; it cannot equal c, which is >= Tx here */
+			if(c1 == 0) return l;
+			s++;
+			continue;
+		}
+
+		c1 = utf8·decode(s, &r);	/* c1 is reused as the length of the decoded sequence */
+
+		if(r == c)
+			l = s;
+
+		s += c1;
+	}
+
+	return nil;
+}
diff --git a/sys/libutf/internal.h b/sys/libutf/internal.h
new file mode 100644
index 0000000..9719977
--- /dev/null
+++ b/sys/libutf/internal.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include	/* NOTE(review): the header names of these three #include lines are missing in this copy of the patch */
+#include
+#include
+
+/*
+ * NOTE: we use the preprocessor to ensure we have unsigned constants.
+ * UTF-8 code:
+ * 1 byte:
+ *   0xxxxxxx
+ * 2 byte:
+ *   110xxxxx 10xxxxxx
+ * 3 byte:
+ *   1110xxxx 10xxxxxx 10xxxxxx
+ * 4 byte:
+ *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+#define Tx    0x80u // 0b10000000 transfer header
+#define TMask 0x3Fu // 0b00111111 transfer mask
+
+#define TByte1 0xC0u // 0b11000000 lead byte of a 2-byte sequence
+#define TByte2 0xE0u // 0b11100000 lead byte of a 3-byte sequence
+#define TByte3 0xF0u // 0b11110000 lead byte of a 4-byte sequence
+#define TByte4 0xF8u // 0b11111000 first byte value above every 4-byte lead
+
+#define RuneMask 0x1FFFFFu
+
+#define Rune1Byte 0x000080u // 1 << 7  (smallest rune that does not fit in 1 byte)
+#define Rune2Byte 0x000800u // 1 << 11 (smallest rune that does not fit in 2 bytes)
+#define Rune3Byte 0x010000u // 1 << 16 (smallest rune that does not fit in 3 bytes)
+#define Rune4Byte 0x200000u // 1 << 21 (smallest value that does not fit in 4 bytes)
+
+
+/* UTF-16 nonsense: U+D800..U+DFFF are surrogate halves, not characters */
+#define RuneSurrogateMin 0x00D800u
+#define RuneSurrogateMax 0x00DFFFu
diff --git a/sys/libutf/len.c b/sys/libutf/len.c
new file mode 100644
index 0000000..8fbd679
--- /dev/null
+++ b/sys/libutf/len.c
@@ -0,0 +1,21 @@
+#include "internal.h"
+
+/* returns the number of runes in the NUL-terminated string s; a byte that starts no valid sequence counts as one rune */
+int
+utf8·len(char *s)
+{
+	int c;
+	long n;
+	rune r;
+
+	for(n = 0;; n++){
+		/* bytes below Tx are counted directly; anything else advances by what the decoder consumed */
+		c = *(ubyte*)s;
+		if(c < Tx){
+			if(c == 0)
+				return n;
+			s++;
+		}else
+			s += utf8·decode(s, &r);
+	}
+}
diff --git a/sys/libutf/rules.mk b/sys/libutf/rules.mk
new file mode 100644
index 0000000..53ff8cf
--- /dev/null
+++ b/sys/libutf/rules.mk
@@ -0,0 +1,76 @@
+include share/push.mk
+
+UNICODE = 14.0.0
+
+SRCS_$(d) := \
+	$(d)/encode.c \
+	$(d)/decode.c \
+	$(d)/decodeprev.c \
+	$(d)/find.c \
+	$(d)/findlast.c \
+	$(d)/canfit.c \
+	$(d)/runelen.c \
+	$(d)/len.c \
+	$(d)/runetype-$(UNICODE).c \
+	$(d)/runewidth-$(UNICODE).c
+
+LIBS_$(d) := $(d)/libutf.a
+
+include share/paths.mk
+
+# ========================================================================
+# table generation: fetch the Unicode data files, build the generators, write the versioned .c tables
+
+$(d)/vendor/common.o: $(d)/vendor/common.c
+	$(COMPILE)
+
+# rune categories (runetype-$(UNICODE).c, from UnicodeData.txt)
+$(d)/vendor/UnicodeData-$(UNICODE).txt:
+	@echo "GET UnicodeData.txt";\
+	curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@
+
+$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+GENS += $(d)/vendor/mkrunetype
+
+$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
+	@$(dir $@)vendor/mkrunetype $< > $@
+
+# rune widths (runewidth-$(UNICODE).c; the generator is the first prerequisite, the data files are its arguments)
+$(d)/vendor/EastAsianWidth-$(UNICODE).txt:
+	@echo "GET EastAsianWidth.txt";\
+	curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@
+
+$(d)/vendor/EmojiData-$(UNICODE).txt:
+	@echo "GET EmojiData.txt";\
+	curl https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt > $@
+
+$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+GENS += $(d)/vendor/mkrunewidth
+
+$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt
+	@$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@
+
+# grapheme boundaries (graphemedata-$(UNICODE).c is generated here but is not listed in SRCS above)
+$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt:
+	@echo "GET GraphemeBreakProperty.txt";\
+	curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@
+
+$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
+	$^ > $@
+
+GENS += $(d)/vendor/mkgraphemedata
+
+# ========================================================================
+# normal operations: archive the objects into the library
+
+$(LIBS_$(d)): $(OBJS_$(d))
+	$(ARCHIVE)
+
+include share/pop.mk
diff --git a/sys/libutf/runelen.c b/sys/libutf/runelen.c
new file mode 100644
index 0000000..dac7f15
--- /dev/null
+++ b/sys/libutf/runelen.c
@@ -0,0 +1,8 @@
+#include "internal.h"
+
+int
+utf8·runelen(rune r)	/* returns the number of bytes utf8·encode stores for r */
+{
+	byte s[10];	/* scratch output; utf8·encode writes at most 4 bytes */
+	return utf8·encode(&r, s);
+}
diff --git a/sys/libutf/runetype-14.0.0.c b/sys/libutf/runetype-14.0.0.c
new file mode 100644
index 0000000..6f4469d
--- /dev/null
+++ b/sys/libutf/runetype-14.0.0.c
@@
-0,0 +1,111 @@
+#include	/* NOTE(review): the header names of both #include lines are missing in this copy of the patch */
+#include
+
+static
+rune*
+rangesearch(rune c, rune *t, int n, int ne)	/* binary search in t, a table of n entries of ne runes each (assumed sorted by first rune) */
+{
+	rune *p;
+	int m;
+	while(n > 1) {
+		m = n >> 1;
+		p = t + m*ne;	/* first rune of the middle entry */
+		if(c >= p[0]){
+			t = p;
+			n = n-m;
+		}else
+			n = m;
+	}
+	if(n && c >= t[0])
+		return t;	/* last entry whose first rune is <= c; callers check the rest of the entry */
+	return 0;
+}
+
+static rune isspace_range[] = {	/* inclusive [first, last] pairs; NOTE(review): U+0020 is absent -- looks like incomplete generator output, confirm */
+	0x0009, 0x000d,
+	0x0085, 0x0085,
+};
+
+int
+utf8·isspace(rune c)	/* returns 1 if c lies inside one of the isspace_range pairs, else 0 */
+{
+	rune *p;
+
+	p = rangesearch(c, isspace_range, arrlen(isspace_range)/2, 2);
+	if(p && c >= p[0] && c <= p[1])
+		return 1;
+	return 0;
+}
+
+int
+utf8·isdigit(rune c)
+{
+	rune *p;	/* unused */
+
+	return 0;	/* this file carries no digit table: every rune reports false */
+}
+
+int
+utf8·isalpha(rune c)
+{
+	rune *p;	/* unused */
+
+	return 0;	/* this file carries no alphabetic table: every rune reports false */
+}
+
+int
+utf8·isupper(rune c)
+{
+	rune *p;	/* unused */
+
+	return 0;	/* this file carries no uppercase table: every rune reports false */
+}
+
+int
+utf8·islower(rune c)
+{
+	rune *p;	/* unused */
+
+	return 0;	/* this file carries no lowercase table: every rune reports false */
+}
+
+int
+utf8·istitle(rune c)
+{
+	rune *p;	/* unused */
+
+	return 0;	/* this file carries no titlecase table: every rune reports false */
+}
+
+int
+utf8·ispunct(rune c)
+{
+	rune *p;	/* unused */
+
+	return 0;	/* this file carries no punctuation table: every rune reports false */
+}
+
+rune
+utf8·toupper(rune c)
+{
+	rune *p;	/* unused */
+
+	return c;	/* this file carries no case-mapping table: c is returned unchanged */
+}
+
+rune
+utf8·tolower(rune c)
+{
+	rune *p;	/* unused */
+
+	return c;	/* this file carries no case-mapping table: c is returned unchanged */
+}
+
+rune
+utf8·totitle(rune c)
+{
+	rune *p;	/* unused */
+
+	return c;	/* this file carries no case-mapping table: c is returned unchanged */
+}
+
diff --git a/sys/libutf/runewidth-14.0.0.c b/sys/libutf/runewidth-14.0.0.c
new file mode 100644
index 0000000..113c35e
--- /dev/null
+++ b/sys/libutf/runewidth-14.0.0.c
@@ -0,0 +1,71 @@
+#include	/* NOTE(review): the header names of both #include lines are missing in this copy of the patch */
+#include
+
+static
+rune*
+rangesearch(rune c, rune *t, int n, int ne)	/* binary search in t, a table of n entries of ne runes each (assumed sorted by first rune) */
+{
+	rune *p;
+	int m;
+	while(n > 1) {
+		m = n >> 1;
+		p = t + m*ne;	/* first rune of the middle entry */
+		if(c >= p[0]){
+			t = p;
+			n = n-m;
+		}else
+			n = m;
+	}
+	if(n && c >= t[0])
+		return t;	/* last entry whose first rune is <= c; callers check the rest of the entry */
+	return 0;
+}
+
+static rune width0_range[] = {	/* inclusive [first, last] pairs */
+	0x2028, 0x2029,
+};
+
+static int
+iswidth0(rune c)	/* returns 1 if c lies inside a width0_range pair; NOTE(review): nothing in this file calls it */
+{
+	rune *p;
+
+	p = rangesearch(c, width0_range, arrlen(width0_range)/2, 2);
+	if(p && c >= p[0] && c <= p[1])
+		return 1;
+	return 0;
+}
+
+static rune width1_single[] = {	/* individual runes, one per entry */
+	0x00ad,
+};
+
+static int
+iswidth1(rune c)	/* returns 1 if c is listed in width1_single, else 0 */
+{
+	rune *p;
+
+	p = rangesearch(c, width1_single, arrlen(width1_single), 1);
+	if(p && c == p[0])
+		return 1;
+	return 0;
+}
+
+static int
+iswidth2(rune c)
+{
+	rune *p;	/* unused */
+
+	return 0;	/* this file carries no width-2 table: every rune reports false */
+}
+
+
+int
+utf8·runewidth(rune c)	/* returns 1 or 2 when iswidth1 or iswidth2 claims c, otherwise 0 */
+{
+	if(iswidth1(c))
+		return 1;
+	if(iswidth2(c))
+		return 2;
+	return 0;
+}
-- 
cgit v1.2.1