From 6db99f51209ebde97311df3aac081f82589319cb Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Fri, 29 Oct 2021 11:10:29 -0700 Subject: implemented functionality to compute rune widths from unicode data --- sys/libunicode/vendor/mkrunewidth.c | 173 ++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 sys/libunicode/vendor/mkrunewidth.c (limited to 'sys/libunicode/vendor/mkrunewidth.c') diff --git a/sys/libunicode/vendor/mkrunewidth.c b/sys/libunicode/vendor/mkrunewidth.c new file mode 100644 index 0000000..d821863 --- /dev/null +++ b/sys/libunicode/vendor/mkrunewidth.c @@ -0,0 +1,173 @@ +#include "common.h" + +/* + * inspired by design choices in utf8proc/charwidths.jl + * all widths default to 1 unless they fall within the categories: + * 1. Mn 2. Mc 3. Me 4. Zl + * 5. Zp 6. Cc 7. Cf 8. Cs + * these default to zero width + */ +enum +{ + /* width ? */ + WidthNeutral, /* (N) practially treated like narrow but unclear ... */ + WidthAmbiguous, /* (A) sometimes wide and sometimes not... */ + /* width 1 */ + WidthHalf, /* (H) = to narrow (compatability equivalent) */ + WidthNarrow, /* (Na) ASCII width */ + /* width 2 */ + WidthWide, /* (W) 2x width */ + WidthFull, /* (F) = to wide (compatability equivalent) */ +}; + +struct Table +{ + char width[3][NumRunes]; +}; + +static struct Table table; + +// ----------------------------------------------------------------------- +// internal functions + +static +void +parse_category(char *path) +{ + int sc, c, ec, w; + io·Stream *utf8; + char *prop, *field[NumFields], line[1024]; + + if(!(utf8 = io·open(path, "r"))) + panicf("can't open %s\n", path); + + // NOTE: we don't check for comments here + ec = -1; + while(parse(utf8, arrlen(field), field, arrlen(line), line)){ + codepointrange(utf8, field, &sc, &ec); + + prop = field[Fcategory]; + + switch(prop[0]){ + case 'M': + switch(prop[1]){ + case 'n': case 'c': case 'e': + w = 0; + break; + default: + w = 1; + break; + } + break; + case 'Z': + switch(prop[1]){ + case 'l': case 'p': + w = 0; + break; + default: + w = 1; + break; + } + break; + case 'C': + switch(prop[1]){ + case 'c': case 'f': case 's': + w = 0; + break; + default: + w = 1; + break; + } + default: + w = 1; + } + + for(c = sc; c <= ec; c++) + table.width[w][c] = 1; + } + + io·close(utf8); +} + +static +void +parse_eawidths(char *path) +{ + int at, w; + int l, c, r; + io·Stream *utf8; + char *s, *field[2], line[1024]; + + utf8 = io·open(path, "r"); + while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){ + if(at == ParseSkip) + continue; + + switch(field[1][0]){ + case 'A': continue; + case 'N': + if(field[1][1] != 'a') + continue; + /* fallthrough */ + case 'H': w = 1; break; + + case 'W': /* fallthrough */ + case 'F': w = 2; break; + + default: + panicf("malformed east asian width class: %s\n", field[1]); + } + + if(!(s = strstr(field[0], ".."))) + l=codepoint(field[0]), r=l; + else{ + *s++ = 0, *s++ = 0; + l=codepoint(field[0]); + r=codepoint(s); + } + + for(c=l; c <= r; c++){ + /* ensure it only exists in one table */ + table.width[w][c] = 1; + table.width[(w+1)%3][c] = 0; + table.width[(w+2)%3][c] = 0; + } + } + io·close(utf8); +} + +// ----------------------------------------------------------------------- +// main point of entry + +static +void +usage(void) +{ + fprintf(stderr, "usage: mkrunewidth \n"); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + int c; + + ARGBEGIN{ + }ARGEND; + + if(argc != 2) + usage(); + + parse_category(*argv++); + parse_eawidths(*argv); + + /* simple checking */ + for(c=0; c 1) + panicf("improper table state"); + } + + putrange("zero", table.width[0], 1); + putrange("single", table.width[1], 1); + putrange("double", table.width[2], 1); +} -- cgit v1.2.1