From 7ea1cdb7d31f00024f5a1d124b42cd19a03b959a Mon Sep 17 00:00:00 2001
From: Nicholas
Date: Wed, 10 Nov 2021 20:12:45 -0800
Subject: chore: libunicode -> libutf

---
 sys/libutf/canfit.c | 23 +++++++++
 sys/libutf/decode.c | 98 +++++++++++++++++++++++++++++++++++++
 sys/libutf/decodeprev.c | 60 +++++++++++++++++++++++
 sys/libutf/encode.c | 69 ++++++++++++++++++++++++++
 sys/libutf/find.c | 31 ++++++++++++
 sys/libutf/findlast.c | 32 ++++++++++++
 sys/libutf/internal.h | 38 +++++++++++++++
 sys/libutf/len.c | 21 ++++++++
 sys/libutf/rules.mk | 76 +++++++++++++++++++++++++++++
 sys/libutf/runelen.c | 8 +++
 sys/libutf/runetype-14.0.0.c | 111 ++++++++++++++++++++++++++++++++++++++++++
 sys/libutf/runewidth-14.0.0.c | 71 +++++++++++++++++++++++++++
 12 files changed, 638 insertions(+)
 create mode 100644 sys/libutf/canfit.c
 create mode 100644 sys/libutf/decode.c
 create mode 100644 sys/libutf/decodeprev.c
 create mode 100644 sys/libutf/encode.c
 create mode 100644 sys/libutf/find.c
 create mode 100644 sys/libutf/findlast.c
 create mode 100644 sys/libutf/internal.h
 create mode 100644 sys/libutf/len.c
 create mode 100644 sys/libutf/rules.mk
 create mode 100644 sys/libutf/runelen.c
 create mode 100644 sys/libutf/runetype-14.0.0.c
 create mode 100644 sys/libutf/runewidth-14.0.0.c

(limited to 'sys/libutf')

diff --git a/sys/libutf/canfit.c b/sys/libutf/canfit.c
new file mode 100644
index 0000000..4579ab3
--- /dev/null
+++ b/sys/libutf/canfit.c
@@ -0,0 +1,23 @@
+#include "internal.h"
+
+/* returns 1 if the n bytes at s can hold the whole sequence announced by the lead byte s[0];
+ * bytes below TByte1 (ASCII and stray continuation bytes) count as a single byte */
+int
+utf8·canfit(byte* s, int n)
+{
+	rune c;
+
+	if(n <= 0)
+		return 0;
+
+	c = *(ubyte*)s;
+	if(c < TByte1)
+		return 1;
+
+	if(c < TByte2)	/* 110xxxxx: one continuation byte follows */
+		return n >= 2;
+	if(c < TByte3)	/* 1110xxxx: two continuation bytes follow */
+		return n >= 3;
+
+	return n >= UTFmax;
+}
diff --git a/sys/libutf/decode.c b/sys/libutf/decode.c
new file mode 100644
index 0000000..01797f1
--- /dev/null
+++ b/sys/libutf/decode.c
@@ -0,0 +1,98 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+static uint8 decode[] = { + /* + * the first part of the table maps bytes to character classes that + * to reduce the size of the transition table and create bitmasks + */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + /* + * the second part is a transition table that maps a combination + * of a state of the automaton and a character class to a state + */ + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + +int +utf8·decode(char *s, rune *r) +{ + int n; + rune v; + uint8 b, t, x=ACCEPT; + + b = ((uint8 *)s)[0]; + t = decode[b]; + v = (0xFF >> t) & b; + x = decode[256+x+t]; + + for(n=1; x > REJECT && n < UTFmax; n++){ + b = ((uint8 *)s)[n]; + t = decode[b]; + v = (v << 6) | (b & TMask); + x = decode[256+x+t]; + } + + if(x != ACCEPT){ + *r = RuneErr; + return 1; + } + + *r = v; + return n; +} + +#if 0 +int +utf8·decode(byte *s, rune *r) +{ + int c[UTFmax], i; + rune l; + + c[0] = *(ubyte*)(s); + if(c[0] < Tx){ + *r = c[0]; + return 1; + } + + l = c[0]; + for(i = 1; i < UTFmax; i++){ + c[i] = *(ubyte*)(s+i); + c[i] ^= Tx; + if(c[i] & Testx) goto bad; + + l = (l << Bitx) | c[i]; + if(c[0] < Tbyte(i + 2)){ + l &= RuneX(i + 1); + if(i == 1){ + if(c[0] < Tbyte(2) || l <= Rune1) + goto 
bad; + }else if(l <= RuneX(i) || l > RuneMax) + goto bad; + + if(i == 2 && SurrogateMin <= l && l <= SurrogateMax) + goto bad; + + *r = l; + return i + 1; + } + } +bad: + *r = RuneErr; + return 1; +} +#endif diff --git a/sys/libutf/decodeprev.c b/sys/libutf/decodeprev.c new file mode 100644 index 0000000..27dced6 --- /dev/null +++ b/sys/libutf/decodeprev.c @@ -0,0 +1,60 @@ +#include "internal.h" + +#define ACCEPT 0 +#define REJECT 12 + +static uint8 decode[] = { + /* + * the first part of the table maps bytes to character classes that + * to reduce the size of the transition table and create bitmasks. + */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + /* + * The second part is a transition table that maps a combination + * of a state of the automaton and a character class to a state. 
+	 */
+	// 0  1  2  3  4  5  6  7  8  9 10 11
+	 0,24,12,12,12,12,12,24,12,24,12,12,
+	 0,24,12,12,12,12,12,24,12,24,12,12,
+	12,36, 0,12,12,12,12,48,12,36,12,12,
+	12,60,12, 0, 0,12,12,72,12,72,12,12,
+	12,60,12, 0,12,12,12,72,12,72, 0,12,
+	12,12,12,12,12, 0, 0,12,12,12,12,12,
+	12,12,12,12,12,12,12,12,12,12,12, 0
+};
+
+int
+utf8·decodeprev(byte *s, rune *r)
+{
+	int n;
+	rune v;
+	uint8 b, t, d, x=ACCEPT;
+
+	v=0, n=0, d=0;
+nextbyte:
+	b = ((uint8 *)s)[-n++];
+	t = decode[b];
+	x = decode[256+x+t];
+
+	if(x > REJECT && n < UTFmax){
+		v = v | ((b & TMask) << d);
+		d += 6;
+		goto nextbyte;
+	}
+
+	if(x != ACCEPT)
+		*r = RuneErr;
+	else{
+		v |= (((0xFFu >> t) & b) << d);
+		*r = v;
+	}
+
+	return n;
+}
diff --git a/sys/libutf/encode.c b/sys/libutf/encode.c
new file mode 100644
index 0000000..fa7c93e
--- /dev/null
+++ b/sys/libutf/encode.c
@@ -0,0 +1,69 @@
+#include "internal.h"
+
+int
+utf8·encode(rune *r, byte *s)
+{
+	rune c;
+
+	c = *r;
+	if(c < Rune1Byte){ // 7 bits
+		s[0] = (uint8)c;
+		return 1;
+	}
+
+	if(c < Rune2Byte){ // 11 bits
+		s[0] = TByte1 | (c >> 6);
+		s[1] = Tx | (c & TMask);
+		return 2;
+	}
+
+	// out-of-range runes and surrogates become RuneErr before a length is chosen
+	if(c > RuneMax || (RuneSurrogateMin <= c && c <= RuneSurrogateMax))
+		c = RuneErr;
+
+	if(c < Rune3Byte){ // 16 bits
+		s[0] = TByte2 | ((c >> 12));
+		s[1] = Tx | ((c >> 6) & TMask);
+		s[2] = Tx | ((c) & TMask);
+		return 3;
+	}
+
+	s[0] = TByte3 | ((c >> 18)); // 21 bits
+	s[1] = Tx | ((c >> 12) & TMask);
+	s[2] = Tx | ((c >> 6) & TMask);
+	s[3] = Tx | ((c) & TMask);
+
+	return 4;
+}
+
+#if 0
+int
+utf8·encode(rune* r, byte* s)
+{
+	int i, j;
+	rune c;
+
+	c = *r;
+	if(c <= Rune1) {
+		s[0] = c;
+		return 1;
+	}
+
+	for(i = 2; i < UTFmax + 1; i++){
+		if(i == 3){
+			if(c > RuneMax)
+				c = RuneErr;
+			if(SurrogateMin <= c && c <= SurrogateMax)
+				c = RuneErr;
+		}
+		if(c <= RuneX(i) || i == UTFmax) {
+			s[0] = Tbyte(i) | (c >> (i - 1)*Bitx);
+			for(j = 1; j < i; j++)
+				s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
+			return i;
+		}
+	}
+
+	return UTFmax;
+}
+#endif
diff --git
a/sys/libutf/find.c b/sys/libutf/find.c new file mode 100644 index 0000000..d75feb8 --- /dev/null +++ b/sys/libutf/find.c @@ -0,0 +1,31 @@ +#include "internal.h" + +byte* +utf8·find(byte* s, rune c) +{ + long c1; + rune r; + int n; + + if(c < Tx) + return strchr(s, c); + + for(;;){ + c1 = *(ubyte*)s; + if(c1 < Tx){ + if(c1 == 0) return nil; + if(c1 == c) return s; + s++; + continue; + } + + n = utf8·decode(s, &r); + + if(r == c) + return s; + + s += n; + } + + return nil; +} diff --git a/sys/libutf/findlast.c b/sys/libutf/findlast.c new file mode 100644 index 0000000..ab25ab2 --- /dev/null +++ b/sys/libutf/findlast.c @@ -0,0 +1,32 @@ +#include "internal.h" + +byte* +utf8·findlast(byte* s, rune c) +{ + long c1; + rune r; + byte *l; + + if(c < Tx) + return strrchr(s, c); + + l = nil; + for(;;){ + c1 = *(ubyte*)s; + if(c1 < Tx){ + if(c1 == 0) return l; + if(c1 == c) l = s; + s++; + continue; + } + + c1 = utf8·decode(s, &r); + + if(r == c) + l = s; + + s += c1; + } + + return nil; +} diff --git a/sys/libutf/internal.h b/sys/libutf/internal.h new file mode 100644 index 0000000..9719977 --- /dev/null +++ b/sys/libutf/internal.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include + +/* + * NOTE: we use the preprocessor to ensure we have unsigned constants. 
+ * UTF-8 code:
+ * 1 byte:
+ *	0xxxxxxx
+ * 2 byte:
+ *	110xxxxx 10xxxxxx
+ * 3 byte:
+ *	1110xxxx 10xxxxxx 10xxxxxx
+ * 4 byte:
+ *	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+
+#define Tx     0x80u // 0b10000000 transfer header
+#define TMask  0x3Fu // 0b00111111 transfer mask
+
+#define TByte1 0xC0u // 0b11000000
+#define TByte2 0xE0u // 0b11100000
+#define TByte3 0xF0u // 0b11110000
+#define TByte4 0xF8u // 0b11111000
+
+#define RuneMask 0x1FFFFFu
+
+#define Rune1Byte 0x000080u // 1 << 7  (first rune too large for 1 byte)
+#define Rune2Byte 0x000800u // 1 << 11 (first rune too large for 2 bytes)
+#define Rune3Byte 0x010000u // 1 << 16 (first rune too large for 3 bytes)
+#define Rune4Byte 0x200000u // 1 << 21 (first rune too large for 4 bytes)
+
+
+/* UTF-16 surrogate range (inclusive) */
+#define RuneSurrogateMin 0x00D800
+#define RuneSurrogateMax 0x00DFFF
diff --git a/sys/libutf/len.c b/sys/libutf/len.c
new file mode 100644
index 0000000..8fbd679
--- /dev/null
+++ b/sys/libutf/len.c
@@ -0,0 +1,21 @@
+#include "internal.h"
+
+int
+utf8·len(char *s)
+{
+	int c;
+	long n;
+	rune r;
+
+	n = 0;
+	for(;;){
+		c = *(uchar*)s;
+		if(c < Tx){
+			if(c == 0)
+				return n;
+			s++;
+		}else
+			s += utf8·decode(s, &r);
+		n++;
+	}
+}
diff --git a/sys/libutf/rules.mk b/sys/libutf/rules.mk
new file mode 100644
index 0000000..53ff8cf
--- /dev/null
+++ b/sys/libutf/rules.mk
@@ -0,0 +1,76 @@
+include share/push.mk
+
+UNICODE = 14.0.0
+
+SRCS_$(d) := \
+	$(d)/encode.c \
+	$(d)/decode.c \
+	$(d)/decodeprev.c \
+	$(d)/find.c \
+	$(d)/findlast.c \
+	$(d)/canfit.c \
+	$(d)/runelen.c \
+	$(d)/len.c \
+	$(d)/runetype-$(UNICODE).c \
+	$(d)/runewidth-$(UNICODE).c
+
+LIBS_$(d) := $(d)/libutf.a
+
+include share/paths.mk
+
+# ========================================================================
+# table generation
+
+$(d)/vendor/common.o: $(d)/vendor/common.c
+	$(COMPILE)
+
+# rune categories
+$(d)/vendor/UnicodeData-$(UNICODE).txt:
+	@echo "GET UnicodeData.txt";\
+	curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@
+
+$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o
$(OBJ_DIR)/sys/base/base.a + $(COMPLINK) + +GENS += $(d)/vendor/mkrunetype + +$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype + @$(dir $@)vendor/mkrunetype $< > $@ + +# rune widths +$(d)/vendor/EastAsianWidth-$(UNICODE).txt: + @echo "GET EastAsianWidth.txt";\ + curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@ + +$(d)/vendor/EmojiData-$(UNICODE).txt: + @echo "GET EmojiData.txt";\ + curl https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt > $@ + +$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a + $(COMPLINK) + +GENS += $(d)/vendor/mkrunewidth + +$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt + @$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@ + +# grapheme boundaries +$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt: + @echo "GET GraphemeBreakProperty.txt";\ + curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@ + +$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a + $(COMPLINK) + +$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt + $^ > $@ + +GENS += $(d)/vendor/mkgraphemedata + +# ======================================================================== +# normal operations + +$(LIBS_$(d)): $(OBJS_$(d)) + $(ARCHIVE) + +include share/pop.mk diff --git a/sys/libutf/runelen.c b/sys/libutf/runelen.c new file mode 100644 index 0000000..dac7f15 --- /dev/null +++ b/sys/libutf/runelen.c @@ -0,0 +1,8 @@ +#include "internal.h" + +int +utf8·runelen(rune r) +{ + byte s[10]; + return utf8·encode(&r, s); +} diff --git a/sys/libutf/runetype-14.0.0.c b/sys/libutf/runetype-14.0.0.c new file mode 100644 index 0000000..6f4469d --- /dev/null +++ b/sys/libutf/runetype-14.0.0.c @@ 
-0,0 +1,111 @@ +#include +#include + +static +rune* +rangesearch(rune c, rune *t, int n, int ne) +{ + rune *p; + int m; + while(n > 1) { + m = n >> 1; + p = t + m*ne; + if(c >= p[0]){ + t = p; + n = n-m; + }else + n = m; + } + if(n && c >= t[0]) + return t; + return 0; +} + +static rune isspace_range[] = { + 0x0009, 0x000d, + 0x0085, 0x0085, +}; + +int +utf8·isspace(rune c) +{ + rune *p; + + p = rangesearch(c, isspace_range, arrlen(isspace_range)/2, 2); + if(p && c >= p[0] && c <= p[1]) + return 1; + return 0; +} + +int +utf8·isdigit(rune c) +{ + rune *p; + + return 0; +} + +int +utf8·isalpha(rune c) +{ + rune *p; + + return 0; +} + +int +utf8·isupper(rune c) +{ + rune *p; + + return 0; +} + +int +utf8·islower(rune c) +{ + rune *p; + + return 0; +} + +int +utf8·istitle(rune c) +{ + rune *p; + + return 0; +} + +int +utf8·ispunct(rune c) +{ + rune *p; + + return 0; +} + +rune +utf8·toupper(rune c) +{ + rune *p; + + return c; +} + +rune +utf8·tolower(rune c) +{ + rune *p; + + return c; +} + +rune +utf8·totitle(rune c) +{ + rune *p; + + return c; +} + diff --git a/sys/libutf/runewidth-14.0.0.c b/sys/libutf/runewidth-14.0.0.c new file mode 100644 index 0000000..113c35e --- /dev/null +++ b/sys/libutf/runewidth-14.0.0.c @@ -0,0 +1,71 @@ +#include +#include + +static +rune* +rangesearch(rune c, rune *t, int n, int ne) +{ + rune *p; + int m; + while(n > 1) { + m = n >> 1; + p = t + m*ne; + if(c >= p[0]){ + t = p; + n = n-m; + }else + n = m; + } + if(n && c >= t[0]) + return t; + return 0; +} + +static rune width0_range[] = { + 0x2028, 0x2029, +}; + +static int +iswidth0(rune c) +{ + rune *p; + + p = rangesearch(c, width0_range, arrlen(width0_range)/2, 2); + if(p && c >= p[0] && c <= p[1]) + return 1; + return 0; +} + +static rune width1_single[] = { + 0x00ad, +}; + +static int +iswidth1(rune c) +{ + rune *p; + + p = rangesearch(c, width1_single, arrlen(width1_single), 1); + if(p && c == p[0]) + return 1; + return 0; +} + +static int +iswidth2(rune c) +{ + rune *p; + + 
return 0;
+}
+
+
+int
+utf8·runewidth(rune c)
+{
+	if(iswidth1(c))
+		return 1;
+	if(iswidth2(c))
+		return 2;
+	return 0;
+}
-- 
cgit v1.2.1

From d584897456ef419f066be962f2c2628eeaef75bc Mon Sep 17 00:00:00 2001
From: Nicholas
Date: Thu, 11 Nov 2021 08:21:23 -0800
Subject: chore: forgot to move old generators

---
 sys/libutf/vendor/common.c | 220 +++++++++++++++++++++
 sys/libutf/vendor/common.h | 46 +++++
 sys/libutf/vendor/mkgraphemedata.c | 24 +++
 sys/libutf/vendor/mkrunetype.c | 388 +++++++++++++++++++++++++++++++++++++
 sys/libutf/vendor/mkrunewidth.c | 280 ++++++++++++++++++++++++++
 5 files changed, 958 insertions(+)
 create mode 100644 sys/libutf/vendor/common.c
 create mode 100644 sys/libutf/vendor/common.h
 create mode 100644 sys/libutf/vendor/mkgraphemedata.c
 create mode 100644 sys/libutf/vendor/mkrunetype.c
 create mode 100644 sys/libutf/vendor/mkrunewidth.c

(limited to 'sys/libutf')

diff --git a/sys/libutf/vendor/common.c b/sys/libutf/vendor/common.c
new file mode 100644
index 0000000..5a03a50
--- /dev/null
+++ b/sys/libutf/vendor/common.c
@@ -0,0 +1,220 @@
+#include "common.h"
+
+// -----------------------------------------------------------------------
+// input functions
+/* parse: reads one line; ParseEOF at end of input, ParseSkip for blank or '#' lines, else splits it on ';' into exactly nfield fields */
+int
+parse(io·Stream *io, int nfield, char **field, int len, char *line)
+{
+	int n;
+	if((n=io·readln(io, len, line)) <= 0)
+		return ParseEOF;
+
+	if(n == len)
+		panicf("line too long");
+
+	if(line[n-1] != '\n')
+		panicf("invalid line: expected '\\n', found '%c'", line[n-1]);
+
+	line[n-1] = 0;
+
+	if(line[0] == '#' || line[0] == 0)
+		return ParseSkip;
+
+	/* tokenize line into fields; separators past the last slot are counted but never stored */
+	n = 0;
+	field[n] = line;
+	while(*line){
+		if(*line == ';'){
+			*line = 0;
+			if(++n < nfield) field[n] = line+1;
+		}
+		line++;
+	}
+
+	if(n != nfield-1)
+		panicf("expected %d number of fields, got %d: %s", nfield, n+1, field[0]);
+
+	return ParseOK;
+}
+
+int
+codepoint(char *s)
+{
+	int c, b;
+
+	c = 0;
+	while((b=*s++)){
+		c <<= 4;
+		if(b >= '0' && b <= '9')
+			c += b - '0';
+		else if(b >= 'A' && b <= 'F')
+			c += b - 'A' +
10; + else + panicf("bad codepoint char '%c'", b); + } + + return c; +} + +void +codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop) +{ + int e, c; + char *other[NumFields], line[1024]; + + // XXX: the stop variable passes in the previous stopping character + e = *stop; + c = codepoint(field[Fcode]); + + if(c >= NumRunes) + panicf("unexpected large codepoint %x", c); + if(c <= e) + panicf("bad code sequence: %x then %x", e, c); + e = c; + + if(strstr(field[Fname], ", First>") != nil){ + if(!parse(utf8, arrlen(other), other, arrlen(line), line)) + panicf("range start at end of file"); + if(strstr(other[Fname], ", Last>") == nil) + panicf("range start not followed by range end"); + + e = codepoint(other[Fcode]); + + if(e <= c) + panicf("bad code sequence: %x then %x", c, e); + if(strcmp(field[Fcategory], other[Fcategory]) != 0) + panicf("range with mismatched category"); + } + + *start = c; + *stop = e; +} + +// ----------------------------------------------------------------------- +// output functions + +void +putsearch(void) +{ + puts( + "#include \n" + "#include \n" + "\n" + "static\n" + "rune*\n" + "rangesearch(rune c, rune *t, int n, int ne)\n" + "{\n" + " rune *p;\n" + " int m;\n" + " while(n > 1) {\n" + " m = n >> 1;\n" + " p = t + m*ne;\n" + " if(c >= p[0]){\n" + " t = p;\n" + " n = n-m;\n" + " }else\n" + " n = m;\n" + " }\n" + " if(n && c >= t[0])\n" + " return t;\n" + " return 0;\n" + "}\n" + ); + +} + +int +putrange(char *ident, char *prop, int force) +{ + int l, r, start; + + start = 0; + for(l = 0; l < NumRunes;) { + if(!prop[l]){ + l++; + continue; + } + + for(r = l+1; r < NumRunes; r++){ + if(!prop[r]) + break; + prop[r] = 0; + } + + if(force || r > l + 1){ + if(!start){ + printf("static rune %s[] = {\n", ident); + start = 1; + } + prop[l] = 0; + printf("\t0x%.4x, 0x%.4x,\n", l, r-1); + } + + l = r; + } + + if(start) + printf("};\n\n"); + + return start; +} + +int +putpair(char *ident, char *prop) +{ + int l, r, start; + + 
start = 0; + for(l=0; l+2 < NumRunes; ){ + if(!prop[l]){ + l++; + continue; + } + + for(r = l + 2; r < NumRunes; r += 2){ + if(!prop[r]) + break; + prop[r] = 0; + } + + if(r != l + 2){ + if(!start){ + printf("static rune %s[] = {\n", ident); + start = 1; + } + prop[l] = 0; + printf("\t0x%.4x, 0x%.4x,\n", l, r - 2); + } + + l = r; + } + + if(start) + printf("};\n\n"); + return start; +} + +int +putsingle(char *ident, char *prop) +{ + int i, start; + + start = 0; + for(i = 0; i < NumRunes; i++) { + if(!prop[i]) + continue; + + if(!start){ + printf("static rune %s[] = {\n", ident); + start = 1; + } + prop[i] = 0; + printf("\t0x%.4x,\n", i); + } + + if(start) + printf("};\n\n"); + + return start; +} diff --git a/sys/libutf/vendor/common.h b/sys/libutf/vendor/common.h new file mode 100644 index 0000000..62f6c5b --- /dev/null +++ b/sys/libutf/vendor/common.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include + +enum +{ + // Fields inside UnicodeData.txt + Fcode, + Fname, + Fcategory, + Fcombine, + Fbidir, + Fdecomp, + Fdecimal, + Fdigit, + Fnumeric, + Fmirror, + Foldname, + Fcomment, + Fupper, + Flower, + Ftitle, + + NumFields, + NumRunes = 1 << 21, +}; + +/* input functions */ +enum +{ + ParseEOF, + ParseOK, + ParseSkip, +}; + +int parse(io·Stream *io, int nfield, char **field, int len, char *line); +int codepoint(char *s); +void codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop); + +/* output functions */ +void putsearch(void); +int putrange(char *ident, char *prop, int force); +int putpair(char *ident, char *prop); +int putsingle(char *ident, char *prop); diff --git a/sys/libutf/vendor/mkgraphemedata.c b/sys/libutf/vendor/mkgraphemedata.c new file mode 100644 index 0000000..ce5a952 --- /dev/null +++ b/sys/libutf/vendor/mkgraphemedata.c @@ -0,0 +1,24 @@ +#include +#include +#include + +// ----------------------------------------------------------------------- +// main point of entry + +static +void +usage(void) +{ + 
fprintf(stderr, "usage: mkgraphemedata \n"); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + io·Stream *utf8; + char line[1024]; + + ARGBEGIN{ + }ARGEND; +} diff --git a/sys/libutf/vendor/mkrunetype.c b/sys/libutf/vendor/mkrunetype.c new file mode 100644 index 0000000..9f939f4 --- /dev/null +++ b/sys/libutf/vendor/mkrunetype.c @@ -0,0 +1,388 @@ +#include "common.h" + +// ----------------------------------------------------------------------- +// globals + +#define OFFSET (1 << 20) +#define DELTA(mapx, x) ((1 << 20) + (mapx) - (x)) + +// TODO: use bitarrays. will reduce executable size 8x +struct Table +{ + /* properties */ + char isspace[NumRunes]; + char isalpha[NumRunes]; + char ismark[NumRunes]; + char isdigit[NumRunes]; + char isupper[NumRunes]; + char islower[NumRunes]; + char istitle[NumRunes]; + char ispunct[NumRunes]; + char issymbl[NumRunes]; + char iscntrl[NumRunes]; + + char combine[NumRunes]; + + /* transformations */ + int toupper[NumRunes]; + int tolower[NumRunes]; + int totitle[NumRunes]; +}; + +static struct Table table; + +// ----------------------------------------------------------------------- +// internal functions + +static +int +isrange(char *label, char *prop, int force) +{ + char ident[128]; + if(snprintf(ident, arrlen(ident), "is%s_range", label) == arrlen(ident)) + panicf("out of identifier space\n"); + + return putrange(ident, prop, force); +} + +static +int +ispair(char *label, char *prop) +{ + char ident[128]; + if(snprintf(ident, arrlen(ident), "is%s_pair", label) == arrlen(ident)) + panicf("out of identifier space\n"); + + return putpair(ident, prop); +} + +static +int +issingle(char *label, char *prop) +{ + char ident[128]; + if(snprintf(ident, arrlen(ident), "is%s_single", label) == arrlen(ident)) + panicf("out of identifier space\n"); + + return putsingle(ident, prop); +} + +static +void +makeis(char *label, char *table, int pairs, int onlyranges) +{ + int hasr, hasp=0, hass=0; + + hasr = isrange(label, table, onlyranges); 
+ if(!onlyranges && pairs) + hasp = ispair(label, table); + if(!onlyranges) + hass = issingle(label, table); + + printf( + "int\n" + "utf8·is%s(rune c)\n" + "{\n" + " rune *p;\n" + "\n", + label); + + if(hasr){ + printf( + " p = rangesearch(c, is%s_range, arrlen(is%s_range)/2, 2);\n" + " if(p && c >= p[0] && c <= p[1])\n" + " return 1;\n", + label, label); + } + + if(hasp){ + printf( + " p = rangesearch(c, is%s_pair, arrlen(is%s_pair)/2, 2);\n" + " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" + " return 1;\n", + label, label); + } + + if(hass) + printf( + " p = rangesearch(c, is%s_single, arrlen(is%s_single), 1);\n" + " if(p && c == p[0])\n" + " return 1;\n", + label, label); + + printf( + " return 0;\n" + "}\n" + "\n"); +} + +static +int +torange(char *label, int *index, int force) +{ + int l, r, d, start = 0; + + for(l = 0; l < NumRunes; ){ + if(index[l] == l){ + l++; + continue; + } + + d = DELTA(index[l], l); + if(d != (rune)d) + panicf("bad map delta %d", d); + + for(r = l+1; r < NumRunes; r++){ + if(DELTA(index[r], r) != d) + break; + index[r] = r; + } + + if(force || r != l + 1){ + if(!start){ + printf("static rune to%s_range[] = {\n", label); + start = 1; + } + index[l] = l; + printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-1, d); + } + l = r; + } + if(start) + printf("};\n\n"); + + return start; +} + +static +int +topair(char *label, int *index) +{ + int l, r, d, start = 0; + + for(l = 0; l + 2 < NumRunes; ){ + if(index[l] == l){ + l++; + continue; + } + + d = DELTA(index[l], l); + if(d != (rune)d) + panicf("bad delta %d", d); + + for(r = l+2; r < NumRunes; r += 2){ + if(DELTA(index[r], r) != d) + break; + index[r] = r; + } + + if(r > l+2){ + if(!start){ + printf("static rune to%s_pair[] = {\n", label); + start = 1; + } + index[l] = l; + printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-2, d); + } + + l = r; + } + if(start) + printf("};\n\n"); + + return start; +} + +static +int +tosingle(char *label, int *index) +{ + int i, d, start = 0; + + for(i=0; i < NumRunes; 
i++) { + if(index[i] == i) + continue; + + d = DELTA(index[i], i); + if(d != (rune)d) + panicf("bad map delta %d", d); + + if(!start){ + printf("static rune to%s_single[] = {\n", label); + start = 1; + } + index[i] = i; + printf("\t0x%.4x, %d,\n", i, d); + } + if(start) + printf("};\n\n"); + + return start; +} + +static +void +mkto(char *label, int *index, int pairs, int onlyrange) +{ + int hasr, hasp=0, hass=0; + + hasr = torange(label, index, !onlyrange); + if(!onlyrange && pairs) + hasp = topair(label, index); + if(!onlyrange) + hass = tosingle(label, index); + + printf( + "rune\n" + "utf8·to%s(rune c)\n" + "{\n" + " rune *p;\n" + "\n", + label); + + if(hasr) + printf( + " p = rangesearch(c, to%s_range, arrlen(to%s_range)/3, 3);\n" + " if(p && c >= p[0] && c <= p[1])\n" + " return c + p[2] - %d;\n", + label, label, OFFSET); + + if(hasp) + printf( + " p = rangesearch(c, to%s_pair, arrlen(to%s_pair)/3, 3);\n" + " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" + " return c + p[2] - %d;\n", + label, label, OFFSET); + + if(hass) + printf( + " p = rangesearch(c, to%s_single, arrlen(to%s_single)/2, 2);\n" + " if(p && c == p[0])\n" + " return c + p[1] - %d;\n", + label, label, OFFSET); + + + printf( + " return c;\n" + "}\n" + "\n" + ); +} + +// ----------------------------------------------------------------------- +// main point of entry + +static +void +usage(void) +{ + fprintf(stderr, "usage: mkrunetype \n"); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + int i, sc, c, ec; + io·Stream *utf8; + char *prop, *field[NumFields], line[1024]; + + ARGBEGIN{ + }ARGEND; + + if(argc != 1) + usage(); + + if(!(utf8 = io·open(argv[0], "r"))) + panicf("can't open %s\n", argv[0]); + + /* by default each character maps to itself */ + for(i = 0; i < NumRunes; i++) { + table.toupper[i] = i; + table.tolower[i] = i; + table.totitle[i] = i; + } + + /* ensure all C local white space characters pass */ + table.isspace['\t'] = 1; + table.isspace['\n'] = 1; + 
table.isspace['\r'] = 1; + table.isspace['\f'] = 1; + table.isspace['\v'] = 1; + table.isspace[0x85] = 1; + + ec = -1; + // NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any + while(parse(utf8, arrlen(field), field, arrlen(line), line)){ + /* parse unicode range */ + codepointrange(utf8, field, &sc, &ec); + prop = field[Fcategory]; + + for(c = sc; c <= ec; c++){ + /* grab properties */ + switch(prop[0]){ + case 'L': + table.isalpha[c] = 1; + switch(prop[1]){ + case 'u': table.isupper[c] = 1; break; + case 'l': table.islower[c] = 1; break; + case 't': table.istitle[c] = 1; break; + case 'm': break; // modifier letters + case 'o': break; // ideograph letters + default: + goto badproperty; + } + break; + + case 'Z': + table.isspace[c] = 1; + break; + + case 'M': + table.ismark[c] = 1; + break; + + case 'N': + table.isdigit[c] = 1; + break; + + case 'P': + table.ispunct[c] = 1; + break; + + case 'S': + table.issymbl[c] = 1; + break; + + case 'C': + table.iscntrl[c] = 1; + break; + + default: badproperty: + panicf("unrecognized category '%s'", prop); + } + /* grab transformations */ + if(*field[Fupper]) + table.toupper[c] = codepoint(field[Fupper]); + if(*field[Flower]) + table.tolower[c] = codepoint(field[Flower]); + if(*field[Ftitle]) + table.totitle[c] = codepoint(field[Ftitle]); + } + } + io·close(utf8); + + putsearch(); + + makeis("space", table.isspace, 0, 1); + makeis("digit", table.isdigit, 0, 1); + makeis("alpha", table.isalpha, 0, 0); + makeis("upper", table.isupper, 1, 0); + makeis("lower", table.islower, 1, 0); + makeis("title", table.istitle, 1, 0); + makeis("punct", table.ispunct, 1, 0); + + mkto("upper", table.toupper, 1, 0); + mkto("lower", table.tolower, 1, 0); + mkto("title", table.totitle, 1, 0); +} diff --git a/sys/libutf/vendor/mkrunewidth.c b/sys/libutf/vendor/mkrunewidth.c new file mode 100644 index 0000000..43feb5e --- /dev/null +++ b/sys/libutf/vendor/mkrunewidth.c @@ -0,0 +1,280 @@ +#include "common.h" + +/* + * 
inspired by design choices in utf8proc/charwidths.jl
+ * all widths default to 1 unless they fall within the categories:
+ * 1. Mn 2. Mc 3. Me 4. Zl
+ * 5. Zp 6. Cc 7. Cf 8. Cs
+ * these default to zero width
+ */
+enum
+{
+	/* width ? */
+	WidthNeutral,   /* (N) practically treated like narrow but unclear ... */
+	WidthAmbiguous, /* (A) sometimes wide and sometimes not... */
+	/* width 1 */
+	WidthHalf,      /* (H) = to narrow (compatibility equivalent) */
+	WidthNarrow,    /* (Na) ASCII width */
+	/* width 2 */
+	WidthWide,      /* (W) 2x width */
+	WidthFull,      /* (F) = to wide (compatibility equivalent) */
+};
+
+struct Table
+{
+	char width[3][NumRunes];
+};
+
+static struct Table table;
+
+// -----------------------------------------------------------------------
+// internal functions
+
+static
+void
+parse_category(char *path)
+{
+	int sc, c, ec, w;
+	io·Stream *utf8;
+	char *prop, *field[NumFields], line[1024];
+
+	if(!(utf8 = io·open(path, "r")))
+		panicf("can't open %s\n", path);
+
+	// NOTE: we don't check for comments here
+	ec = -1;
+	while(parse(utf8, arrlen(field), field, arrlen(line), line)){
+		codepointrange(utf8, field, &sc, &ec);
+
+		prop = field[Fcategory];
+
+		switch(prop[0]){
+		case 'M':
+			switch(prop[1]){
+			case 'n': case 'c': case 'e':
+				w = 0;
+				break;
+			default:
+				w = 1;
+				break;
+			}
+			break;
+		case 'Z':
+			switch(prop[1]){
+			case 'l': case 'p':
+				w = 0;
+				break;
+			default:
+				w = 1;
+				break;
+			}
+			break;
+		case 'C':
+			switch(prop[1]){
+			case 'c': case 'f': case 's':
+				w = 0;
+				break;
+			default:
+				w = 1;
+				break;
+			}
+			break; /* without this, Cc/Cf/Cs fell into default and got width 1 */
+		default: w = 1;
+		}
+
+		for(c = sc; c <= ec; c++)
+			table.width[w][c] = 1;
+	}
+
+	io·close(utf8);
+}
+
+static
+void
+parse_eawidths(char *path)
+{
+	int at, w;
+	int l, c, r;
+	io·Stream *utf8;
+	char *s, *field[2], line[1024];
+
+	utf8 = io·open(path, "r");
+	while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+		if(at == ParseSkip)
+			continue;
+
+		switch(field[1][0]){
+		case 'A': continue;
+		case 'N':
+
if(field[1][1] != 'a') + continue; + /* fallthrough */ + case 'H': w = 1; break; + + case 'W': /* fallthrough */ + case 'F': w = 2; break; + + default: + panicf("malformed east asian width class: %s\n", field[1]); + } + + if(!(s = strstr(field[0], ".."))) + l=codepoint(field[0]), r=l; + else{ + *s++ = 0, *s++ = 0; + l=codepoint(field[0]); + r=codepoint(s); + } + + for(c=l; c <= r; c++){ + /* ensure it only exists in one table */ + table.width[w][c] = 1; + table.width[(w+1)%3][c] = 0; + table.width[(w+2)%3][c] = 0; + } + } + io·close(utf8); +} + +static +void +maketable(char *label, char *table, int pairs, int onlyranges) +{ + int r, p=0, s=0; + char ident[3][128]; + + enum + { + Irange, + Ipair, + Isingle, + }; + + /* ranges */ + if(snprintf(ident[Irange], arrlen(ident[Irange]), "%s_range", label) == arrlen(ident[Irange])) + panicf("out of identifier space\n"); + r = putrange(ident[Irange], table, onlyranges); + + if(!onlyranges && pairs){ + if(snprintf(ident[Ipair], arrlen(ident[Ipair]), "%s_pair", label) == arrlen(ident[Ipair])) + panicf("out of identifier space\n"); + p = putpair(ident[Ipair], table); + } + if(!onlyranges){ + if(snprintf(ident[Isingle], arrlen(ident[Isingle]), "%s_single", label) == arrlen(ident[Isingle])) + panicf("out of identifier space\n"); + + s = putsingle(ident[Isingle], table); + } + + printf( + "static int\n" + "is%s(rune c)\n" + "{\n" + " rune *p;\n" + "\n", + label); + + if(r){ + printf( + " p = rangesearch(c, %s, arrlen(%s)/2, 2);\n" + " if(p && c >= p[0] && c <= p[1])\n" + " return 1;\n", + ident[Irange], ident[Irange]); + } + + if(p){ + printf( + " p = rangesearch(c, %s, arrlen(%s)/2, 2);\n" + " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" + " return 1;\n", + ident[Ipair], ident[Ipair]); + } + + if(s) + printf( + " p = rangesearch(c, %s, arrlen(%s), 1);\n" + " if(p && c == p[0])\n" + " return 1;\n", + ident[Isingle], ident[Isingle]); + + printf( + " return 0;\n" + "}\n" + "\n"); +} + + + +// 
-----------------------------------------------------------------------
+// main point of entry
+
+static
+void
+usage(void)
+{
+	fprintf(stderr, "usage: mkrunewidth \n");
+	exit(1);
+}
+
+#define SETW0(c) \
+	table.width[0][(c)] = 1, \
+	table.width[1][(c)] = 0, \
+	table.width[2][(c)] = 0;
+
+#define SETW1(c) \
+	table.width[0][(c)] = 0, \
+	table.width[1][(c)] = 1, \
+	table.width[2][(c)] = 0;
+
+#define SETW2(c) \
+	table.width[0][(c)] = 0, \
+	table.width[1][(c)] = 0, \
+	table.width[2][(c)] = 1;
+
+
+int
+main(int argc, char *argv[])
+{
+	int c;
+
+	ARGBEGIN{
+	}ARGEND;
+
+	if(argc != 2)
+		usage();
+
+	parse_category(*argv++);
+	parse_eawidths(*argv);
+
+	/* overrides */
+	SETW0(0x2028);
+	SETW0(0x2029);
+
+	SETW1(0x00AD);
+
+	/* simple checking */
+	for(c=0; c<NumRunes; c++){
+		if(table.width[0][c] + table.width[1][c] + table.width[2][c] > 1)
+			panicf("improper table state");
+	}
+
+	putsearch();
+
+	maketable("width0", table.width[0], 1, 0);
+	maketable("width1", table.width[1], 1, 0);
+	maketable("width2", table.width[2], 1, 0);
+
+	puts(
+		"\n"
+		"int\n"
+		"utf8·runewidth(rune c)\n"
+		"{\n"
+		" if(iswidth1(c))\n"
+		" return 1;\n"
+		" if(iswidth2(c))\n"
+		" return 2;\n"
+		" return 0;\n"
+		"}"
+	);
+}
-- 
cgit v1.2.1

From da8ac7ebf766051245a14da1542634c6ab788457 Mon Sep 17 00:00:00 2001
From: Nicholas
Date: Thu, 11 Nov 2021 08:31:46 -0800
Subject: chore: updated old copy of mkrunewidth

---
 sys/libutf/runetype-14.0.0.c | 111 ----------------------------------------
 sys/libutf/runewidth-14.0.0.c | 71 -------------------------
 sys/libutf/vendor/mkrunewidth.c | 71 ++++++++++++++++++++-----
 3 files changed, 58 insertions(+), 195 deletions(-)
 delete mode 100644 sys/libutf/runetype-14.0.0.c
 delete mode 100644 sys/libutf/runewidth-14.0.0.c

(limited to 'sys/libutf')

diff --git a/sys/libutf/runetype-14.0.0.c b/sys/libutf/runetype-14.0.0.c
deleted file mode 100644
index 6f4469d..0000000
--- a/sys/libutf/runetype-14.0.0.c
+++ /dev/null
@@ -1,111 +0,0 @@
-#include
-#include
-
-static
-rune*
-rangesearch(rune c, rune *t, int n, int ne)
-{
-	rune *p;
-
int m; - while(n > 1) { - m = n >> 1; - p = t + m*ne; - if(c >= p[0]){ - t = p; - n = n-m; - }else - n = m; - } - if(n && c >= t[0]) - return t; - return 0; -} - -static rune isspace_range[] = { - 0x0009, 0x000d, - 0x0085, 0x0085, -}; - -int -utf8·isspace(rune c) -{ - rune *p; - - p = rangesearch(c, isspace_range, arrlen(isspace_range)/2, 2); - if(p && c >= p[0] && c <= p[1]) - return 1; - return 0; -} - -int -utf8·isdigit(rune c) -{ - rune *p; - - return 0; -} - -int -utf8·isalpha(rune c) -{ - rune *p; - - return 0; -} - -int -utf8·isupper(rune c) -{ - rune *p; - - return 0; -} - -int -utf8·islower(rune c) -{ - rune *p; - - return 0; -} - -int -utf8·istitle(rune c) -{ - rune *p; - - return 0; -} - -int -utf8·ispunct(rune c) -{ - rune *p; - - return 0; -} - -rune -utf8·toupper(rune c) -{ - rune *p; - - return c; -} - -rune -utf8·tolower(rune c) -{ - rune *p; - - return c; -} - -rune -utf8·totitle(rune c) -{ - rune *p; - - return c; -} - diff --git a/sys/libutf/runewidth-14.0.0.c b/sys/libutf/runewidth-14.0.0.c deleted file mode 100644 index 113c35e..0000000 --- a/sys/libutf/runewidth-14.0.0.c +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include - -static -rune* -rangesearch(rune c, rune *t, int n, int ne) -{ - rune *p; - int m; - while(n > 1) { - m = n >> 1; - p = t + m*ne; - if(c >= p[0]){ - t = p; - n = n-m; - }else - n = m; - } - if(n && c >= t[0]) - return t; - return 0; -} - -static rune width0_range[] = { - 0x2028, 0x2029, -}; - -static int -iswidth0(rune c) -{ - rune *p; - - p = rangesearch(c, width0_range, arrlen(width0_range)/2, 2); - if(p && c >= p[0] && c <= p[1]) - return 1; - return 0; -} - -static rune width1_single[] = { - 0x00ad, -}; - -static int -iswidth1(rune c) -{ - rune *p; - - p = rangesearch(c, width1_single, arrlen(width1_single), 1); - if(p && c == p[0]) - return 1; - return 0; -} - -static int -iswidth2(rune c) -{ - rune *p; - - return 0; -} - - -int -utf8·runewidth(rune c) -{ - if(iswidth1(c)) - return 1; - if(iswidth2(c)) - return 2; - 
return 0; -} diff --git a/sys/libutf/vendor/mkrunewidth.c b/sys/libutf/vendor/mkrunewidth.c index 43feb5e..14e6973 100644 --- a/sys/libutf/vendor/mkrunewidth.c +++ b/sys/libutf/vendor/mkrunewidth.c @@ -89,6 +89,21 @@ parse_category(char *path) io·close(utf8); } +static +void +coderange(char *field, int *l, int *r) +{ + char *s; + + if(!(s = strstr(field, ".."))) + *l=*r=codepoint(field); + else{ + *s++ = 0, *s++ = 0; + *l=codepoint(field); + *r=codepoint(s); + } +} + static void parse_eawidths(char *path) @@ -96,7 +111,7 @@ parse_eawidths(char *path) int at, w; int l, c, r; io·Stream *utf8; - char *s, *field[2], line[1024]; + char *field[2], line[1024]; utf8 = io·open(path, "r"); while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){ @@ -118,13 +133,7 @@ parse_eawidths(char *path) panicf("malformed east asian width class: %s\n", field[1]); } - if(!(s = strstr(field[0], ".."))) - l=codepoint(field[0]), r=l; - else{ - *s++ = 0, *s++ = 0; - l=codepoint(field[0]); - r=codepoint(s); - } + coderange(field[0], &l, &r); for(c=l; c <= r; c++){ /* ensure it only exists in one table */ @@ -136,6 +145,43 @@ parse_eawidths(char *path) io·close(utf8); } +static +void +parse_emoji(char *path) +{ + int at, w; + int l, c, r; + io·Stream *utf8; + char *s, *field[2], line[1024]; + + utf8 = io·open(path, "r"); + while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){ + if(at == ParseSkip) + continue; + + /* only override emoji presentation */ + if(!strstr(field[1], "Emoji_Presentation")) + continue; + + /* trim trailing space */ + for(s=field[0]; *s; s++){ + if(*s == ' ') + *s = 0; + } + + coderange(field[0], &l, &r); + + for(c=l; c <= r; c++){ + table.width[0][c] = 0; + table.width[1][c] = 0; + table.width[2][c] = 1; + } + } + + io·close(utf8); +} + +/* output functions */ static void maketable(char *label, char *table, int pairs, int onlyranges) @@ -204,8 +250,6 @@ maketable(char *label, char *table, int pairs, int onlyranges) "\n"); } 
- - // ----------------------------------------------------------------------- // main point of entry @@ -213,7 +257,7 @@ static void usage(void) { - fprintf(stderr, "usage: mkrunewidth \n"); + fprintf(stderr, "usage: mkrunewidth \n"); exit(1); } @@ -241,11 +285,12 @@ main(int argc, char *argv[]) ARGBEGIN{ }ARGEND; - if(argc != 2) + if(argc != 3) usage(); parse_category(*argv++); - parse_eawidths(*argv); + parse_eawidths(*argv++); + parse_emoji(*argv); /* overrides */ SETW0(0x2028); -- cgit v1.2.1