diff options
author | Nicholas <nbnoll@eml.cc> | 2021-11-20 10:53:19 -0800 |
---|---|---|
committer | Nicholas <nbnoll@eml.cc> | 2021-11-20 10:53:19 -0800 |
commit | a9bfe650038afea8b751175cac16f6027345e45f (patch) | |
tree | 9a7f9feb76a64bb3efe573036d80b7bdbf8a59a5 /src/base/utf/vendor/mkrunetype.c | |
parent | 1c8d4e69205fd875f6bec3fa3bd929c2e7f52f62 (diff) |
Chore: reorganize libutf and libfmt into base
I found the split to be arbitrary. Better to include the functionality
in the standard library. I also split the headers to allow for more
granular inclusion (but the library is still monolithic). The only
ugliness is the circular dependency introduced with libutf's generated
functions. We put explicit prereqs with the necessary object files
instead.
Diffstat (limited to 'src/base/utf/vendor/mkrunetype.c')
-rw-r--r-- | src/base/utf/vendor/mkrunetype.c | 390 |
1 files changed, 390 insertions, 0 deletions
diff --git a/src/base/utf/vendor/mkrunetype.c b/src/base/utf/vendor/mkrunetype.c new file mode 100644 index 0000000..b33df32 --- /dev/null +++ b/src/base/utf/vendor/mkrunetype.c @@ -0,0 +1,390 @@ +#include "common.h" + +// ----------------------------------------------------------------------- +// globals + +#define OFFSET (1 << 20) +#define DELTA(mapx, x) ((1 << 20) + (mapx) - (x)) + +// TODO: use bitarrays. will reduce executable size 8x +struct Table +{ + /* properties */ + char isspace[NumRunes]; + char isalpha[NumRunes]; + char ismark[NumRunes]; + char isdigit[NumRunes]; + char isupper[NumRunes]; + char islower[NumRunes]; + char istitle[NumRunes]; + char ispunct[NumRunes]; + char issymbl[NumRunes]; + char iscntrl[NumRunes]; + + char combine[NumRunes]; + + /* transformations */ + int toupper[NumRunes]; + int tolower[NumRunes]; + int totitle[NumRunes]; +}; + +static struct Table table; + +// ----------------------------------------------------------------------- +// internal functions + +static +int +isrange(char *label, char *prop, int force) +{ + char ident[128]; + if(snprintf(ident, arrlen(ident), "is%s_range", label) == arrlen(ident)) + panicf("out of identifier space\n"); + + return putrange(ident, prop, force); +} + +static +int +ispair(char *label, char *prop) +{ + char ident[128]; + if(snprintf(ident, arrlen(ident), "is%s_pair", label) == arrlen(ident)) + panicf("out of identifier space\n"); + + return putpair(ident, prop); +} + +static +int +issingle(char *label, char *prop) +{ + char ident[128]; + if(snprintf(ident, arrlen(ident), "is%s_single", label) == arrlen(ident)) + panicf("out of identifier space\n"); + + return putsingle(ident, prop); +} + +static +void +makeis(char *label, char *table, int pairs, int onlyranges) +{ + int hasr, hasp=0, hass=0; + + hasr = isrange(label, table, onlyranges); + if(!onlyranges && pairs) + hasp = ispair(label, table); + if(!onlyranges) + hass = issingle(label, table); + + printf( + "int\n" + "utf8·is%s(rune c)\n" + "{\n" + " rune *p;\n" + "\n", + label); + + if(hasr){ + printf( + " p = rangesearch(c, is%s_range, arrlen(is%s_range)/2, 2);\n" + " if(p && c >= p[0] && c <= p[1])\n" + " return 1;\n", + label, label); + } + + if(hasp){ + printf( + " p = rangesearch(c, is%s_pair, arrlen(is%s_pair)/2, 2);\n" + " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" + " return 1;\n", + label, label); + } + + if(hass) + printf( + " p = rangesearch(c, is%s_single, arrlen(is%s_single), 1);\n" + " if(p && c == p[0])\n" + " return 1;\n", + label, label); + + printf( + " return 0;\n" + "}\n" + "\n"); +} + +static +int +torange(char *label, int *index, int force) +{ + int l, r, d, start = 0; + + for(l = 0; l < NumRunes; ){ + if(index[l] == l){ + l++; + continue; + } + + d = DELTA(index[l], l); + if(d != (rune)d) + panicf("bad map delta %d", d); + + for(r = l+1; r < NumRunes; r++){ + if(DELTA(index[r], r) != d) + break; + index[r] = r; + } + + if(force || r != l + 1){ + if(!start){ + printf("static rune to%s_range[] = {\n", label); + start = 1; + } + index[l] = l; + printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-1, d); + } + l = r; + } + if(start) + printf("};\n\n"); + + return start; +} + +static +int +topair(char *label, int *index) +{ + int l, r, d, start = 0; + + for(l = 0; l + 2 < NumRunes; ){ + if(index[l] == l){ + l++; + continue; + } + + d = DELTA(index[l], l); + if(d != (rune)d) + panicf("bad delta %d", d); + + for(r = l+2; r < NumRunes; r += 2){ + if(DELTA(index[r], r) != d) + break; + index[r] = r; + } + + if(r > l+2){ + if(!start){ + printf("static rune to%s_pair[] = {\n", label); + start = 1; + } + index[l] = l; + printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-2, d); + } + + l = r; + } + if(start) + printf("};\n\n"); + + return start; +} + +static +int +tosingle(char *label, int *index) +{ + int i, d, start = 0; + + for(i=0; i < NumRunes; i++) { + if(index[i] == i) + continue; + + d = DELTA(index[i], i); + if(d != (rune)d) + panicf("bad map delta %d", d); + + if(!start){ + printf("static rune to%s_single[] = {\n", label); + start = 1; + } + index[i] = i; + printf("\t0x%.4x, %d,\n", i, d); + } + if(start) + printf("};\n\n"); + + return start; +} + +static +void +mkto(char *label, int *index, int pairs, int onlyrange) +{ + int hasr, hasp=0, hass=0; + + hasr = torange(label, index, !onlyrange); + if(!onlyrange && pairs) + hasp = topair(label, index); + if(!onlyrange) + hass = tosingle(label, index); + + printf( + "rune\n" + "utf8·to%s(rune c)\n" + "{\n" + " rune *p;\n" + "\n", + label); + + if(hasr) + printf( + " p = rangesearch(c, to%s_range, arrlen(to%s_range)/3, 3);\n" + " if(p && c >= p[0] && c <= p[1])\n" + " return c + p[2] - %d;\n", + label, label, OFFSET); + + if(hasp) + printf( + " p = rangesearch(c, to%s_pair, arrlen(to%s_pair)/3, 3);\n" + " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" + " return c + p[2] - %d;\n", + label, label, OFFSET); + + if(hass) + printf( + " p = rangesearch(c, to%s_single, arrlen(to%s_single)/2, 2);\n" + " if(p && c == p[0])\n" + " return c + p[1] - %d;\n", + label, label, OFFSET); + + + printf( + " return c;\n" + "}\n" + "\n" + ); +} + +// ----------------------------------------------------------------------- +// main point of entry + +static +void +usage(void) +{ + fprintf(stderr, "usage: mkrunetype <UnicodeData.txt>\n"); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + int i, sc, c, ec; + io·Stream *utf8; + char *prop, *field[NumFields], line[1024]; + + ARGBEGIN{ + }ARGEND; + + if(argc != 1) + usage(); + + if(!(utf8 = io·open(argv[0], "r"))) + panicf("can't open %s\n", argv[0]); + + /* by default each character maps to itself */ + for(i = 0; i < NumRunes; i++) { + table.toupper[i] = i; + table.tolower[i] = i; + table.totitle[i] = i; + } + + /* ensure all C local white space characters pass */ + table.isspace['\t'] = 1; + table.isspace['\n'] = 1; + table.isspace['\r'] = 1; + table.isspace['\f'] = 1; + table.isspace['\v'] = 1; + table.isspace[0x85] = 1; + + ec = -1; + // NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any + while(parse(utf8, arrlen(field), field, arrlen(line), line)){ + /* parse unicode range */ + codepointrange(utf8, field, &sc, &ec); + prop = field[Fcategory]; + + for(c = sc; c <= ec; c++){ + /* grab properties */ + switch(prop[0]){ + case 'L': + table.isalpha[c] = 1; + switch(prop[1]){ + case 'u': table.isupper[c] = 1; break; + case 'l': table.islower[c] = 1; break; + case 't': table.istitle[c] = 1; break; + case 'm': break; // modifier letters + case 'o': break; // ideograph letters + default: + goto badproperty; + } + break; + + case 'Z': + table.isspace[c] = 1; + break; + + case 'M': + table.ismark[c] = 1; + break; + + case 'N': + table.isdigit[c] = 1; + break; + + case 'P': + table.ispunct[c] = 1; + break; + + case 'S': + table.issymbl[c] = 1; + break; + + case 'C': + table.iscntrl[c] = 1; + break; + + default: badproperty: + panicf("unrecognized category '%s'", prop); + } + /* grab transformations */ + if(*field[Fupper]) + table.toupper[c] = codepoint(field[Fupper]); + if(*field[Flower]) + table.tolower[c] = codepoint(field[Flower]); + if(*field[Ftitle]) + table.totitle[c] = codepoint(field[Ftitle]); + } + } + io·close(utf8); + + putsearch(); + + makeis("space", table.isspace, 0, 1); + makeis("digit", table.isdigit, 0, 1); + makeis("alpha", table.isalpha, 0, 0); + makeis("upper", table.isupper, 1, 0); + makeis("lower", table.islower, 1, 0); + makeis("title", table.istitle, 1, 0); + makeis("punct", table.ispunct, 1, 0); + + mkto("upper", table.toupper, 1, 0); + mkto("lower", table.tolower, 1, 0); + mkto("title", table.totitle, 1, 0); + + return 0; +} |