From 189b9e23edfe60b7e82c4c7b6071a3f98799653a Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 26 Oct 2021 21:32:55 -0700 Subject: fix(unicode): renamed functions to be easier to understand their functions --- sys/cmd/dwm/drw.c | 2 +- sys/cmd/term/term.c | 10 +++++----- sys/cmd/term/x.c | 2 +- sys/cmd/wm/main.c | 2 +- sys/libunicode/bytetorune.c | 40 ---------------------------------------- sys/libunicode/canfit.c | 20 ++++++++++++++++++++ sys/libunicode/decode.c | 40 ++++++++++++++++++++++++++++++++++++++++ sys/libunicode/encode.c | 31 +++++++++++++++++++++++++++++++ sys/libunicode/find.c | 30 ++++++++++++++++++++++++++++++ sys/libunicode/findlast.c | 30 ++++++++++++++++++++++++++++++ sys/libunicode/findlastrune.c | 29 ----------------------------- sys/libunicode/findrune.c | 26 -------------------------- sys/libunicode/fullrune.c | 18 ------------------ sys/libunicode/internal.h | 12 +++++++++++- sys/libunicode/len.c | 21 +++++++++++++++++++++ sys/libunicode/rules.mk | 14 ++++++++------ sys/libunicode/runelen.c | 2 +- sys/libunicode/runetobyte.c | 31 ------------------------------- 18 files changed, 200 insertions(+), 160 deletions(-) delete mode 100644 sys/libunicode/bytetorune.c create mode 100644 sys/libunicode/canfit.c create mode 100644 sys/libunicode/decode.c create mode 100644 sys/libunicode/encode.c create mode 100644 sys/libunicode/find.c create mode 100644 sys/libunicode/findlast.c delete mode 100644 sys/libunicode/findlastrune.c delete mode 100644 sys/libunicode/findrune.c delete mode 100644 sys/libunicode/fullrune.c create mode 100644 sys/libunicode/len.c delete mode 100644 sys/libunicode/runetobyte.c (limited to 'sys') diff --git a/sys/cmd/dwm/drw.c b/sys/cmd/dwm/drw.c index 825e2f4..a6d6902 100644 --- a/sys/cmd/dwm/drw.c +++ b/sys/cmd/dwm/drw.c @@ -227,7 +227,7 @@ drw_text(Drw *drw, int x, int y, unsigned int w, unsigned int h, unsigned int lp utf8str = text; nextfont = NULL; while (*text) { - utf8charlen = utf8·bytetorune(&utf8codepoint, text); + utf8charlen = utf8·decode(text, &utf8codepoint); for (curfont = drw->fonts; curfont; curfont = curfont->next) { charexists = charexists || XftCharExists(drw->dpy, curfont->xfont, utf8codepoint); if (charexists) { diff --git a/sys/cmd/term/term.c b/sys/cmd/term/term.c index 6a73a8b..b08915e 100644 --- a/sys/cmd/term/term.c +++ b/sys/cmd/term/term.c @@ -410,7 +410,7 @@ getsel(void) if (gp->mode & Gwdummy) continue; - ptr += utf8·runetobyte(ptr, &gp->u); + ptr += utf8·encode(&gp->u, ptr); } /* @@ -993,7 +993,7 @@ tsetchar(rune u, Letter *attr, int x, int y) */ if (term.trantbl[term.charset] == CSgfx0 && BETWEEN(u, 0x41, 0x7e) && vt100_0[u - 0x41]) - utf8·bytetorune(&u, vt100_0[u - 0x41]); + utf8·decode(vt100_0[u - 0x41], &u); if (term.line[y][x].mode & Gwide) { if (x+1 < term.col) { @@ -1801,7 +1801,7 @@ tdumpline(int n) end = &bp[MIN(tlinelen(n), term.col) - 1]; if (bp != end || bp->u != ' ') { for ( ; bp <= end; ++bp) - tprinter(buf, utf8·runetobyte(buf, &bp->u)); + tprinter(buf, utf8·encode(&bp->u, buf)); } tprinter("\n", 1); } @@ -2090,7 +2090,7 @@ tputc(rune u) c[0] = u; width = len = 1; } else { - len = utf8·runetobyte(c, &u); + len = utf8·encode(&u, c); if(!control && (width = wcwidth(u)) == -1) width = 1; } @@ -2257,7 +2257,7 @@ twrite(char *buf, int buflen, int show_ctrl) for (n = 0; n < buflen; n += charsize) { if(IS_SET(Tutf8) && !IS_SET(Tsixel)) { /* process a complete utf8 char */ - charsize = utf8·bytetorune(&u, buf + n); + charsize = utf8·decode(buf + n, &u); if(charsize == 0) break; } else { diff --git a/sys/cmd/term/x.c b/sys/cmd/term/x.c index 3079f38..ea64bff 100644 --- a/sys/cmd/term/x.c +++ b/sys/cmd/term/x.c @@ -1824,7 +1824,7 @@ kpress(XEvent *ev) if (IS_SET(W8bit)) { if (*buf < 0177) { c = *buf | 0x80; - len = utf8·runetobyte(buf, &c); + len = utf8·encode(&c, buf); } } else { buf[1] = buf[0]; diff --git a/sys/cmd/wm/main.c b/sys/cmd/wm/main.c index a447d80..2607801 100644 --- a/sys/cmd/wm/main.c +++ b/sys/cmd/wm/main.c @@ -133,7 +133,7 @@ main(int argc, char *argv[]) { char *socket, *cmd=nil; - ARGBEGIN { + ARGBEGIN{ case 's': cmd = ARGF(); break; diff --git a/sys/libunicode/bytetorune.c b/sys/libunicode/bytetorune.c deleted file mode 100644 index fe41e7a..0000000 --- a/sys/libunicode/bytetorune.c +++ /dev/null @@ -1,40 +0,0 @@ -#include "internal.h" - -int -utf8·bytetorune(rune* r, byte* s) -{ - int c[UTFmax], i; - rune l; - - c[0] = *(ubyte*)(s); - if(c[0] < Tx) { - *r = c[0]; - return 1; - } - - l = c[0]; - for(i = 1; i < UTFmax; i++) { - c[i] = *(ubyte*)(s+i); - c[i] ^= Tx; - if (c[i] & Testx) goto bad; - - l = (l << Bitx) | c[i]; - if(c[0] < Tbyte(i + 2)) { - l &= RuneX(i + 1); - if (i == 1) { - if (c[0] < Tbyte(2) || l <= Rune1) - goto bad; - } else if (l <= RuneX(i) || l > RuneMax) - goto bad; - if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) - goto bad; - - *r = l; - return i + 1; - } - } -bad: - *r = RuneErr; - return 1; -} - diff --git a/sys/libunicode/canfit.c b/sys/libunicode/canfit.c new file mode 100644 index 0000000..d44c9e6 --- /dev/null +++ b/sys/libunicode/canfit.c @@ -0,0 +1,20 @@ +#include "internal.h" + +/* returns 1 if string of length n is long enough to be decoded */ +int +utf8·canfit(byte* s, int n) +{ + int i; + rune c; + + if(n <= 0) return 0; + c = *(ubyte*) s; + if(c < Tx) return 1; + + for(i = 3; i < UTFmax + 1; i++){ + if(c < Tbyte(i)) + return n >= i - 1; + } + + return n >= UTFmax; +} diff --git a/sys/libunicode/decode.c b/sys/libunicode/decode.c new file mode 100644 index 0000000..79271f2 --- /dev/null +++ b/sys/libunicode/decode.c @@ -0,0 +1,40 @@ +#include "internal.h" + +int +utf8·decode(byte *s, rune* r) +{ + int c[UTFmax], i; + rune l; + + c[0] = *(ubyte*)(s); + if(c[0] < Tx) { + *r = c[0]; + return 1; + } + + l = c[0]; + for(i = 1; i < UTFmax; i++) { + c[i] = *(ubyte*)(s+i); + c[i] ^= Tx; + if (c[i] & Testx) goto bad; + + l = (l << Bitx) | c[i]; + if(c[0] < Tbyte(i + 2)) { + l &= RuneX(i + 1); + if (i == 1) { + if (c[0] < Tbyte(2) || l <= Rune1) + goto bad; + } else if (l <= RuneX(i) || l > RuneMax) + goto bad; + if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) + goto bad; + + *r = l; + return i + 1; + } + } +bad: + *r = RuneErr; + return 1; +} + diff --git a/sys/libunicode/encode.c b/sys/libunicode/encode.c new file mode 100644 index 0000000..8f4d212 --- /dev/null +++ b/sys/libunicode/encode.c @@ -0,0 +1,31 @@ +#include "internal.h" + +int +utf8·encode(rune* r, byte* s) +{ + int i, j; + rune c; + + c = *r; + if(c <= Rune1) { + s[0] = c; + return 1; + } + + for(i = 2; i < UTFmax + 1; i++){ + if(i == 3){ + if(c > RuneMax) + c = RuneErr; + if(SurrogateMin <= c && c <= SurrogateMax) + c = RuneErr; + } + if(c <= RuneX(i) || i == UTFmax) { + s[0] = Tbyte(i) | (c >> (i - 1)*Bitx); + for(j = 1; j < i; j++) + s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); + return i; + } + } + + return UTFmax; +} diff --git a/sys/libunicode/find.c b/sys/libunicode/find.c new file mode 100644 index 0000000..659ab5b --- /dev/null +++ b/sys/libunicode/find.c @@ -0,0 +1,30 @@ +#include "internal.h" + +byte* +utf8·find(byte* s, rune c) +{ + long c1; + rune r; + int n; + + if(c < RuneSync) + return strchr(s, c); + + for(;;){ + c1 = *(ubyte*)s; + if(c1 < RuneSelf) { + if(c1 == 0) return nil; + if(c1 == c) return s; + s++; + continue; + } + n = utf8·decode(s, &r); + + if(r == c) + return s; + + s += n; + } + + return nil; +} diff --git a/sys/libunicode/findlast.c b/sys/libunicode/findlast.c new file mode 100644 index 0000000..3a4ed38 --- /dev/null +++ b/sys/libunicode/findlast.c @@ -0,0 +1,30 @@ +#include "internal.h" + +byte* +utf8·findlast(byte* s, rune c) +{ + long c1; + rune r; + byte *l; + + if(c < RuneSync) + return strrchr(s, c); + + l = nil; + for(;;){ + c1 = *(ubyte*)s; + if(c1 < RuneSelf) { + if(c1 == 0) return l; + if(c1 == c) l = s; + s++; + continue; + } + c1 = utf8·decode(s, &r); + if(r == c) + l = s; + + s += c1; + } + + return nil; +} diff --git a/sys/libunicode/findlastrune.c b/sys/libunicode/findlastrune.c deleted file mode 100644 index 0dc5032..0000000 --- a/sys/libunicode/findlastrune.c +++ /dev/null @@ -1,29 +0,0 @@ -#include "internal.h" - -byte* -utf8·findlastrune(byte* s, long c) -{ - long c1; - rune r; - byte *l; - - if (c < RuneSync) - return strrchr(s, c); - - l = nil; - for(;;){ - c1 = *(ubyte*)s; - if (c1 < RuneSelf) { - if (c1 == 0) return l; - if (c1 == c) l = s; - s++; - continue; - } - c1 = utf8·bytetorune(&r, s); - if (r == c) - l = s; - s += c1; - } - - return nil; -} diff --git a/sys/libunicode/findrune.c b/sys/libunicode/findrune.c deleted file mode 100644 index 97edc3c..0000000 --- a/sys/libunicode/findrune.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "internal.h" - -byte* -utf8·findrune(byte* s, long c) -{ - long c1; - rune r; - int n; - - if (c < RuneSync) return strchr(s, c); - - for (;;) { - c1 = *(ubyte*)s; - if (c1 < RuneSelf) { - if (c1 == 0) return nil; - if (c1 == c) return s; - s++; - continue; - } - n = utf8·bytetorune(&r, s); - if (r == c) return s; - s += n; - } - - return nil; -} diff --git a/sys/libunicode/fullrune.c b/sys/libunicode/fullrune.c deleted file mode 100644 index e5cf314..0000000 --- a/sys/libunicode/fullrune.c +++ /dev/null @@ -1,18 +0,0 @@ -#include "internal.h" - -int -utf8·fullrune(byte* s, int n) -{ - int i; - rune c; - - if (n <= 0) return 0; - c = *(ubyte*) s; - if (c < Tx) return 1; - - for (i = 3; i < UTFmax + 1; i++) { - if (c < Tbyte(i)) return n >= i - 1; - } - - return n >= UTFmax; -} diff --git a/sys/libunicode/internal.h b/sys/libunicode/internal.h index bf376b2..31ed2ae 100644 --- a/sys/libunicode/internal.h +++ b/sys/libunicode/internal.h @@ -4,8 +4,18 @@ #include #include +/* UTF-8 code + * 1 byte: + * 0xxxxxxx + * 2 byte: + * 110xxxxx 10xxxxxx + * 3 byte: + * 1110xxxx 10xxxxxx 10xxxxxx + * 4 byte: + * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + */ #define Bit(i) (7-(i)) -/* N 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */ +/* 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */ #define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) /* 0000 0000 0000 0111 1111 1111 */ #define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) diff --git a/sys/libunicode/len.c b/sys/libunicode/len.c new file mode 100644 index 0000000..c461bf3 --- /dev/null +++ b/sys/libunicode/len.c @@ -0,0 +1,21 @@ +#include "internal.h" + +int +utf8·len(char *s) +{ + int c; + long n; + rune r; + + n = 0; + for(;;){ + c = *(uchar*)s; + if(c < RuneSelf) { + if(c == 0) + return n; + s++; + } else + s += utf8·decode(s, &r); + n++; + } +} diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk index fb83819..100697b 100644 --- a/sys/libunicode/rules.mk +++ b/sys/libunicode/rules.mk @@ -3,12 +3,13 @@ include share/push.mk UNICODE = 14.0.0 SRCS_$(d) := \ - $(d)/runetobyte.c \ - $(d)/bytetorune.c \ - $(d)/findrune.c \ - $(d)/findlastrune.c \ - $(d)/fullrune.c \ + $(d)/encode.c \ + $(d)/decode.c \ + $(d)/find.c \ + $(d)/findlast.c \ + $(d)/canfit.c \ $(d)/runelen.c \ + $(d)/len.c \ $(d)/runetype-$(UNICODE).c LIBS_$(d) := $(d)/libunicode.a @@ -16,13 +17,14 @@ LIBS_$(d) := $(d)/libunicode.a include share/paths.mk $(d)/vendor/UnicodeData-$(UNICODE).txt: + @echo "GET UnicodeData.txt";\ curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@ $(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(OBJ_DIR)/sys/base/base.a $(COMPLINK) $(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype - $(dir $@)/vendor/mkrunetype $< > $@ + @$(dir $@)vendor/mkrunetype $< > $@ GENS += $(d)/vendor/mkrunetype diff --git a/sys/libunicode/runelen.c b/sys/libunicode/runelen.c index 682400c..dac7f15 100644 --- a/sys/libunicode/runelen.c +++ b/sys/libunicode/runelen.c @@ -4,5 +4,5 @@ int utf8·runelen(rune r) { byte s[10]; - return utf8·runetobyte(s, &r); + return utf8·encode(&r, s); } diff --git a/sys/libunicode/runetobyte.c b/sys/libunicode/runetobyte.c deleted file mode 100644 index 27f252b..0000000 --- a/sys/libunicode/runetobyte.c +++ /dev/null @@ -1,31 +0,0 @@ -#include "internal.h" - -int -utf8·runetobyte(byte* s, rune* r) -{ - int i, j; - rune c; - - c = *r; - if(c <= Rune1) { - s[0] = c; - return 1; - } - - for(i = 2; i < UTFmax + 1; i++){ - if(i == 3){ - if(c > RuneMax) - c = RuneErr; - if(SurrogateMin <= c && c <= SurrogateMax) - c = RuneErr; - } - if(c <= RuneX(i) || i == UTFmax) { - s[0] = Tbyte(i) | (c >> (i - 1)*Bitx); - for(j = 1; j < i; j++) - s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); - return i; - } - } - - return UTFmax; -} -- cgit v1.2.1