From 6db99f51209ebde97311df3aac081f82589319cb Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Fri, 29 Oct 2021 11:10:29 -0700 Subject: implemented functionality to compute rune widths from unicode data --- sys/libunicode/rules.mk | 29 ++++- sys/libunicode/vendor/common.c | 220 +++++++++++++++++++++++++++++++ sys/libunicode/vendor/common.h | 46 +++++++ sys/libunicode/vendor/mkgraphemedata.c | 24 ++++ sys/libunicode/vendor/mkrunetype.c | 230 ++++----------------------------- sys/libunicode/vendor/mkrunewidth.c | 173 +++++++++++++++++++++++++ 6 files changed, 512 insertions(+), 210 deletions(-) create mode 100644 sys/libunicode/vendor/common.c create mode 100644 sys/libunicode/vendor/common.h create mode 100644 sys/libunicode/vendor/mkgraphemedata.c create mode 100644 sys/libunicode/vendor/mkrunewidth.c (limited to 'sys/libunicode') diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk index aca7348..5ba9b89 100644 --- a/sys/libunicode/rules.mk +++ b/sys/libunicode/rules.mk @@ -11,18 +11,25 @@ SRCS_$(d) := \ $(d)/canfit.c \ $(d)/runelen.c \ $(d)/len.c \ - $(d)/runetype-$(UNICODE).c + $(d)/runetype-$(UNICODE).c \ + $(d)/runewidth-$(UNICODE).c LIBS_$(d) := $(d)/libunicode.a include share/paths.mk +# ======================================================================== +# table generation + +$(d)/vendor/common.o: $(d)/vendor/common.c + $(COMPILE) + # rune categories $(d)/vendor/UnicodeData-$(UNICODE).txt: @echo "GET UnicodeData.txt";\ curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@ -$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(OBJ_DIR)/sys/base/base.a +$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a $(COMPLINK) GENS += $(d)/vendor/mkrunetype @@ -30,12 +37,25 @@ GENS += $(d)/vendor/mkrunetype $(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype @$(dir $@)vendor/mkrunetype $< > $@ +# rune widths +$(d)/vendor/EastAsianWidth-$(UNICODE).txt: + @echo "GET EastAsianWidth.txt";\ + curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@ + +$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a + $(COMPLINK) + +GENS += $(d)/vendor/mkrunewidth + +$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt + @$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) #> $@ + # grapheme boundaries $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt: @echo "GET GraphemeBreakProperty.txt";\ curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@ -$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(OBJ_DIR)/sys/base/base.a +$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a $(COMPLINK) $(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt @@ -43,6 +63,9 @@ $(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeB GENS += $(d)/vendor/mkgraphemedata +# ======================================================================== +# normal operations + $(LIBS_$(d)): $(OBJS_$(d)) $(ARCHIVE) diff --git a/sys/libunicode/vendor/common.c b/sys/libunicode/vendor/common.c new file mode 100644 index 0000000..6b5d1b3 --- /dev/null +++ b/sys/libunicode/vendor/common.c @@ -0,0 +1,220 @@ +#include "common.h" + +// ----------------------------------------------------------------------- +// input functions + +int +parse(io·Stream *io, int nfield, char **field, int len, char *line) +{ + int n; + if((n=io·readln(io, len, line)) <= 0) + return ParseEOF; + + if(n == len) + panicf("line too long"); + + if(line[n-1] != '\n') + panicf("invalid line: expected '\n', found '%c'", line[n]); + + line[n-1] = 0; + + if(line[0] == '#' || line[0] == 0) + return ParseSkip; + + /* tokenize line into fields */ + n = 0; + field[n] = line; + while(*line){ + if(*line == ';'){ + *line = 0; + field[++n] = line+1; + } + line++; + } + + if(n != nfield-1) + panicf("expected %d number of fields, got %d: %s", nfield, n, line); + + return ParseOK; +} + +int +codepoint(char *s) +{ + int c, b; + + c = 0; + while((b=*s++)){ + c <<= 4; + if(b >= '0' && b <= '9') + c += b - '0'; + else if(b >= 'A' && b <= 'F') + c += b - 'A' + 10; + else + panicf("bad codepoint char '%c'", b); + } + + return c; +} + +void +codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop) +{ + int e, c; + char *other[NumFields], line[1024]; + + // XXX: the stop variable passes in the previous stopping character + e = *stop; + c = codepoint(field[Fcode]); + + if(c >= NumRunes) + panicf("unexpected large codepoint %x", c); + if(c <= e) + panicf("bad code sequence: %x then %x", e, c); + e = c; + + if(strstr(field[Fname], ", First>") != nil){ + if(!parse(utf8, arrlen(other), other, arrlen(line), line)) + panicf("range start at end of file"); + if(strstr(other[Fname], ", Last>") == nil) + panicf("range start not followed by range end"); + + e = codepoint(other[Fcode]); + + if(e <= c) + panicf("bad code sequence: %x then %x", c, e); + if(strcmp(field[Fcategory], other[Fcategory]) != 0) + panicf("range with mismatched category"); + } + + *start = c; + *stop = e; +} + +// ----------------------------------------------------------------------- +// output functions + +void +putsearch(void) +{ + puts( + "#include \n" + "#include \n" + "\n" + "static\n" + "rune*\n" + "rangesearch(rune c, rune *t, int n, int ne)\n" + "{\n" + " rune *p;\n" + " int m;\n" + " while(n > 1) {\n" + " m = n >> 1;\n" + " p = t + m*ne;\n" + " if(c >= p[0]){\n" + " t = p;\n" + " n = n-m;\n" + " }else\n" + " n = m;\n" + " }\n" + " if(n && c >= t[0])\n" + " return t;\n" + " return 0;\n" + "}\n" + ); + +} + +int +putrange(char *ident, char *prop, int force) +{ + int l, r, start; + + start = 0; + for(l = 0; l < NumRunes;) { + if(!prop[l]){ + l++; + continue; + } + + for(r = l+1; r < NumRunes; r++){ + if(!prop[r]) + break; + prop[r] = 0; + } + + if(force || r > l + 1){ + if(!start){ + printf("static rune %s[] = {\n", ident); + start = 1; + } + prop[l] = 0; + printf("\t0x%.4x, 0x%.4x,\n", l, r-1); + } + + l = r; + } + + if(start) + printf("};\n\n"); + + return start; +} + +int +putpair(char *ident, char *prop) +{ + int l, r, start; + + start = 0; + for(l=0; l+2 < NumRunes; ){ + if(!prop[l]){ + l++; + continue; + } + + for(r = l + 2; r < NumRunes; r += 2){ + if(!prop[r]) + break; + prop[r] = 0; + } + + if(r != l + 2){ + if(!start){ + printf("static rune %s[] = {\n", ident); + start = 1; + } + prop[l] = 0; + printf("\t0x%.4x, 0x%.4x,\n", l, r - 2); + } + + l = r; + } + + if(start) + printf("};\n\n"); + return start; +} + +int +putsingle(char *ident, char *prop) +{ + int i, start; + + start = 0; + for(i = 0; i < NumRunes; i++) { + if(!prop[i]) + continue; + + if(!start){ + printf("static rune %s[] = {\n", ident); + start = 1; + } + prop[i] = 0; + printf("\t0x%.4x,\n", i); + } + + if(start) + printf("};\n\n"); + + return start; +} diff --git a/sys/libunicode/vendor/common.h b/sys/libunicode/vendor/common.h new file mode 100644 index 0000000..17cb113 --- /dev/null +++ b/sys/libunicode/vendor/common.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include + +enum +{ + // Fields inside UnicodeData.txt + Fcode, + Fname, + Fcategory, + Fcombine, + Fbidir, + Fdecomp, + Fdecimal, + Fdigit, + Fnumeric, + Fmirror, + Foldname, + Fcomment, + Fupper, + Flower, + Ftitle, + + NumFields, + NumRunes = 1 << 21, +}; + +/* input functions */ +enum +{ + ParseEOF, + ParseOK, + ParseSkip, +}; + +int parse(io·Stream *io, int nfield, char **field, int len, char *line); +int codepoint(char *s); +void codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop); + +/* output functions */ +void putsearch(void); +int putrange(char *ident, char *prop, int force); +int putpair(char *ident, char *prop); +int putsingle(char *ident, char *prop); diff --git a/sys/libunicode/vendor/mkgraphemedata.c b/sys/libunicode/vendor/mkgraphemedata.c new file mode 100644 index 0000000..f79d851 --- /dev/null +++ b/sys/libunicode/vendor/mkgraphemedata.c @@ -0,0 +1,24 @@ +#include +#include +#include + +// ----------------------------------------------------------------------- +// main point of entry + +static +void +usage(void) +{ + fprintf(stderr, "usage: mkgraphemedata \n"); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + io·Stream *utf8; + char line[1024]; + + ARGBEGIN{ + }ARGEND; +} diff --git a/sys/libunicode/vendor/mkrunetype.c b/sys/libunicode/vendor/mkrunetype.c index fc4301c..9f939f4 100644 --- a/sys/libunicode/vendor/mkrunetype.c +++ b/sys/libunicode/vendor/mkrunetype.c @@ -1,6 +1,4 @@ -#include -#include -#include +#include "common.h" // ----------------------------------------------------------------------- // globals @@ -8,29 +6,6 @@ #define OFFSET (1 << 20) #define DELTA(mapx, x) ((1 << 20) + (mapx) - (x)) -enum -{ - // Fields inside UnicodeData.txt - Fcode, - Fname, - Fcategory, - Fcombine, - Fbidir, - Fdecomp, - Fdecimal, - Fdigit, - Fnumeric, - Fmirror, - Foldname, - Fcomment, - Fupper, - Flower, - Ftitle, - - NumFields, - NumRunes = 1 << 21, -}; - // TODO: use bitarrays. will reduce executable size 8x struct Table { @@ -59,154 +34,37 @@ static struct Table table; // ----------------------------------------------------------------------- // internal functions -int -parse(io·Stream *io, char **field, int len, char *line) -{ - int n; - if((n=io·readln(io, len, line)) <= 0) - return 0; - - if(n == len) - panicf("line too long"); - - if(line[n-1] != '\n') - panicf("invalid line: expected '\n', found '%c'", line[n]); - - line[n-1] = 0; - - /* tokenize line into fields */ - n = 0; - field[n] = line; - while(*line){ - if(*line == ';'){ - *line = 0; - field[++n] = line+1; - } - line++; - } - - if(n != NumFields-1) - panicf("expected %d number of fields, got %d", NumFields, n); - - return 1; -} - -static -int -codepoint(char *s) -{ - int c, b; - - c = 0; - while((b=*s++)){ - c <<= 4; - if(b >= '0' && b <= '9') - c += b - '0'; - else if(b >= 'A' && b <= 'F') - c += b - 'A' + 10; - else - panicf("bad codepoint char '%c'", b); - } - - return c; -} - static int isrange(char *label, char *prop, int force) { - int l, r, start; - - start = 0; - for(l = 0; l < NumRunes;) { - if(!prop[l]){ - l++; - continue; - } - - for(r = l+1; r < NumRunes; r++){ - if(!prop[r]) - break; - prop[r] = 0; - } - - if(force || r > l + 1){ - if(!start){ - printf("static rune is%s_range[] = {\n", label); - start = 1; - } - prop[l] = 0; - printf("\t0x%.4x, 0x%.4x,\n", l, r-1); - } - - l = r; - } - - if(start) - printf("};\n\n"); + char ident[128]; + if(snprintf(ident, arrlen(ident), "is%s_range", label) == arrlen(ident)) + panicf("out of identifier space\n"); - return start; + return putrange(ident, prop, force); } static int ispair(char *label, char *prop) { - int l, r, start; - - start = 0; - for(l=0; l+2 < NumRunes; ){ - if(!prop[l]){ - l++; - continue; - } - - for(r = l + 2; r < NumRunes; r += 2){ - if(!prop[r]) - break; - prop[r] = 0; - } - - if(r != l + 2){ - if(!start){ - printf("static rune is%s_pair[] = {\n", label); - start = 1; - } - prop[l] = 0; - printf("\t0x%.4x, 0x%.4x,\n", l, r - 2); - } - - l = r; - } + char ident[128]; + if(snprintf(ident, arrlen(ident), "is%s_pair", label) == arrlen(ident)) + panicf("out of identifier space\n"); - if(start) - printf("};\n\n"); - return start; + return putpair(ident, prop); } static int issingle(char *label, char *prop) { - int i, start; + char ident[128]; + if(snprintf(ident, arrlen(ident), "is%s_single", label) == arrlen(ident)) + panicf("out of identifier space\n"); - start = 0; - for(i = 0; i < NumRunes; i++) { - if(!prop[i]) - continue; - - if(!start){ - printf("static rune is%s_single[] = {\n", label); - start = 1; - } - prop[i] = 0; - printf("\t0x%.4x,\n", i); - } - - if(start) - printf("};\n\n"); - - return start; + return putsingle(ident, prop); } static @@ -425,9 +283,9 @@ usage(void) int main(int argc, char *argv[]) { - int i, c, pc; + int i, sc, c, ec; io·Stream *utf8; - char *prop, *field[NumFields], *field2[NumFields], line[1024], line2[1024]; + char *prop, *field[NumFields], line[1024]; ARGBEGIN{ }ARGEND; @@ -453,33 +311,14 @@ main(int argc, char *argv[]) table.isspace['\v'] = 1; table.isspace[0x85] = 1; - pc = -1; - while(parse(utf8, field, arrlen(line), line)){ - /* parse unicode value */ - c = codepoint(field[Fcode]); - if(c >= NumRunes) - panicf("unexpected large codepoint %x", c); - if(c <= pc) - panicf("bad code sequence: %x then %x", pc, c); - pc = c; - - /* see if this is a range */ - if(strstr(field[Fname], ", First>") != nil){ - if(!parse(utf8, field2, arrlen(line2), line2)) - panicf("range start at end of file"); - if(strstr(field2[Fname], ", Last>") == nil) - panicf("range start not followed by range end"); - - pc = codepoint(field2[Fcode]); - - if(pc <= c) - panicf("bad code sequence: %x then %x", c, pc); - if(strcmp(field[Fcategory], field2[Fcategory]) != 0) - panicf("range with mismatched category"); - } - + ec = -1; + // NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any + while(parse(utf8, arrlen(field), field, arrlen(line), line)){ + /* parse unicode range */ + codepointrange(utf8, field, &sc, &ec); prop = field[Fcategory]; - for(; c <= pc; c++){ + + for(c = sc; c <= ec; c++){ /* grab properties */ switch(prop[0]){ case 'L': @@ -533,30 +372,7 @@ main(int argc, char *argv[]) } io·close(utf8); - puts( - "#include \n" - "#include \n" - "\n" - "static\n" - "rune*\n" - "rangesearch(rune c, rune *t, int n, int ne)\n" - "{\n" - " rune *p;\n" - " int m;\n" - " while(n > 1) {\n" - " m = n >> 1;\n" - " p = t + m*ne;\n" - " if(c >= p[0]){\n" - " t = p;\n" - " n = n-m;\n" - " }else\n" - " n = m;\n" - " }\n" - " if(n && c >= t[0])\n" - " return t;\n" - " return 0;\n" - "}\n" - ); + putsearch(); makeis("space", table.isspace, 0, 1); makeis("digit", table.isdigit, 0, 1); diff --git a/sys/libunicode/vendor/mkrunewidth.c b/sys/libunicode/vendor/mkrunewidth.c new file mode 100644 index 0000000..d821863 --- /dev/null +++ b/sys/libunicode/vendor/mkrunewidth.c @@ -0,0 +1,173 @@ +#include "common.h" + +/* + * inspired by design choices in utf8proc/charwidths.jl + * all widths default to 1 unless they fall within the categories: + * 1. Mn 2. Mc 3. Me 4. Zl + * 5. Zp 6. Cc 7. Cf 8. Cs + * these default to zero width + */ +enum +{ + /* width ? */ + WidthNeutral, /* (N) practially treated like narrow but unclear ... */ + WidthAmbiguous, /* (A) sometimes wide and sometimes not... */ + /* width 1 */ + WidthHalf, /* (H) = to narrow (compatability equivalent) */ + WidthNarrow, /* (Na) ASCII width */ + /* width 2 */ + WidthWide, /* (W) 2x width */ + WidthFull, /* (F) = to wide (compatability equivalent) */ +}; + +struct Table +{ + char width[3][NumRunes]; +}; + +static struct Table table; + +// ----------------------------------------------------------------------- +// internal functions + +static +void +parse_category(char *path) +{ + int sc, c, ec, w; + io·Stream *utf8; + char *prop, *field[NumFields], line[1024]; + + if(!(utf8 = io·open(path, "r"))) + panicf("can't open %s\n", path); + + // NOTE: we don't check for comments here + ec = -1; + while(parse(utf8, arrlen(field), field, arrlen(line), line)){ + codepointrange(utf8, field, &sc, &ec); + + prop = field[Fcategory]; + + switch(prop[0]){ + case 'M': + switch(prop[1]){ + case 'n': case 'c': case 'e': + w = 0; + break; + default: + w = 1; + break; + } + break; + case 'Z': + switch(prop[1]){ + case 'l': case 'p': + w = 0; + break; + default: + w = 1; + break; + } + break; + case 'C': + switch(prop[1]){ + case 'c': case 'f': case 's': + w = 0; + break; + default: + w = 1; + break; + } + default: + w = 1; + } + + for(c = sc; c <= ec; c++) + table.width[w][c] = 1; + } + + io·close(utf8); +} + +static +void +parse_eawidths(char *path) +{ + int at, w; + int l, c, r; + io·Stream *utf8; + char *s, *field[2], line[1024]; + + utf8 = io·open(path, "r"); + while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){ + if(at == ParseSkip) + continue; + + switch(field[1][0]){ + case 'A': continue; + case 'N': + if(field[1][1] != 'a') + continue; + /* fallthrough */ + case 'H': w = 1; break; + + case 'W': /* fallthrough */ + case 'F': w = 2; break; + + default: + panicf("malformed east asian width class: %s\n", field[1]); + } + + if(!(s = strstr(field[0], ".."))) + l=codepoint(field[0]), r=l; + else{ + *s++ = 0, *s++ = 0; + l=codepoint(field[0]); + r=codepoint(s); + } + + for(c=l; c <= r; c++){ + /* ensure it only exists in one table */ + table.width[w][c] = 1; + table.width[(w+1)%3][c] = 0; + table.width[(w+2)%3][c] = 0; + } + } + io·close(utf8); +} + +// ----------------------------------------------------------------------- +// main point of entry + +static +void +usage(void) +{ + fprintf(stderr, "usage: mkrunewidth \n"); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + int c; + + ARGBEGIN{ + }ARGEND; + + if(argc != 2) + usage(); + + parse_category(*argv++); + parse_eawidths(*argv); + + /* simple checking */ + for(c=0; c 1) + panicf("improper table state"); + } + + putrange("zero", table.width[0], 1); + putrange("single", table.width[1], 1); + putrange("double", table.width[2], 1); +} -- cgit v1.2.1