diff options
Diffstat (limited to 'sys/libunicode/vendor/mkrunetype.c')
-rw-r--r-- | sys/libunicode/vendor/mkrunetype.c | 230 |
1 files changed, 23 insertions, 207 deletions
diff --git a/sys/libunicode/vendor/mkrunetype.c b/sys/libunicode/vendor/mkrunetype.c
index fc4301c..9f939f4 100644
--- a/sys/libunicode/vendor/mkrunetype.c
+++ b/sys/libunicode/vendor/mkrunetype.c
@@ -1,6 +1,4 @@
-#include <u.h>
-#include <base.h>
-#include <libunicode.h>
+#include "common.h"
 
 // -----------------------------------------------------------------------
 // globals
@@ -8,29 +6,6 @@
 #define OFFSET (1 << 20)
 #define DELTA(mapx, x) ((1 << 20) + (mapx) - (x))
 
-enum
-{
-	// Fields inside UnicodeData.txt
-	Fcode,
-	Fname,
-	Fcategory,
-	Fcombine,
-	Fbidir,
-	Fdecomp,
-	Fdecimal,
-	Fdigit,
-	Fnumeric,
-	Fmirror,
-	Foldname,
-	Fcomment,
-	Fupper,
-	Flower,
-	Ftitle,
-
-	NumFields,
-	NumRunes = 1 << 21,
-};
-
 // TODO: use bitarrays. will reduce executable size 8x
 struct Table
 {
@@ -59,154 +34,37 @@ static struct Table table;
 // -----------------------------------------------------------------------
 // internal functions
 
-int
-parse(io·Stream *io, char **field, int len, char *line)
-{
-	int n;
-	if((n=io·readln(io, len, line)) <= 0)
-		return 0;
-
-	if(n == len)
-		panicf("line too long");
-
-	if(line[n-1] != '\n')
-		panicf("invalid line: expected '\n', found '%c'", line[n]);
-
-	line[n-1] = 0;
-
-	/* tokenize line into fields */
-	n = 0;
-	field[n] = line;
-	while(*line){
-		if(*line == ';'){
-			*line = 0;
-			field[++n] = line+1;
-		}
-		line++;
-	}
-
-	if(n != NumFields-1)
-		panicf("expected %d number of fields, got %d", NumFields, n);
-
-	return 1;
-}
-
-static
-int
-codepoint(char *s)
-{
-	int c, b;
-
-	c = 0;
-	while((b=*s++)){
-		c <<= 4;
-		if(b >= '0' && b <= '9')
-			c += b - '0';
-		else if(b >= 'A' && b <= 'F')
-			c += b - 'A' + 10;
-		else
-			panicf("bad codepoint char '%c'", b);
-	}
-
-	return c;
-}
-
 static
 int
 isrange(char *label, char *prop, int force)
 {
-	int l, r, start;
-
-	start = 0;
-	for(l = 0; l < NumRunes;) {
-		if(!prop[l]){
-			l++;
-			continue;
-		}
-
-		for(r = l+1; r < NumRunes; r++){
-			if(!prop[r])
-				break;
-			prop[r] = 0;
-		}
-
-		if(force || r > l + 1){
-			if(!start){
-				printf("static rune is%s_range[] = {\n", label);
-				start = 1;
-			}
-			prop[l] = 0;
-			printf("\t0x%.4x, 0x%.4x,\n", l, r-1);
-		}
-
-		l = r;
-	}
-
-	if(start)
-		printf("};\n\n");
+	char ident[128];
+	if(snprintf(ident, arrlen(ident), "is%s_range", label) >= arrlen(ident)) // snprintf returns the untruncated length; >= size means truncation
+		panicf("out of identifier space\n");
 
-	return start;
+	return putrange(ident, prop, force);
 }
 
 static
 int
 ispair(char *label, char *prop)
 {
-	int l, r, start;
-
-	start = 0;
-	for(l=0; l+2 < NumRunes; ){
-		if(!prop[l]){
-			l++;
-			continue;
-		}
-
-		for(r = l + 2; r < NumRunes; r += 2){
-			if(!prop[r])
-				break;
-			prop[r] = 0;
-		}
-
-		if(r != l + 2){
-			if(!start){
-				printf("static rune is%s_pair[] = {\n", label);
-				start = 1;
-			}
-			prop[l] = 0;
-			printf("\t0x%.4x, 0x%.4x,\n", l, r - 2);
-		}
-
-		l = r;
-	}
+	char ident[128];
+	if(snprintf(ident, arrlen(ident), "is%s_pair", label) >= arrlen(ident)) // snprintf returns the untruncated length; >= size means truncation
+		panicf("out of identifier space\n");
 
-	if(start)
-		printf("};\n\n");
-	return start;
+	return putpair(ident, prop);
 }
 
 static
 int
 issingle(char *label, char *prop)
 {
-	int i, start;
+	char ident[128];
+	if(snprintf(ident, arrlen(ident), "is%s_single", label) >= arrlen(ident)) // snprintf returns the untruncated length; >= size means truncation
+		panicf("out of identifier space\n");
 
-	start = 0;
-	for(i = 0; i < NumRunes; i++) {
-		if(!prop[i])
-			continue;
-
-		if(!start){
-			printf("static rune is%s_single[] = {\n", label);
-			start = 1;
-		}
-		prop[i] = 0;
-		printf("\t0x%.4x,\n", i);
-	}
-
-	if(start)
-		printf("};\n\n");
-
-	return start;
+	return putsingle(ident, prop);
 }
 
 static
@@ -425,9 +283,9 @@ usage(void)
 int
 main(int argc, char *argv[])
 {
-	int i, c, pc;
+	int i, sc, c, ec;
 	io·Stream *utf8;
-	char *prop, *field[NumFields], *field2[NumFields], line[1024], line2[1024];
+	char *prop, *field[NumFields], line[1024];
 
 	ARGBEGIN{
 	}ARGEND;
@@ -453,33 +311,14 @@ main(int argc, char *argv[])
 	table.isspace['\v'] = 1;
 	table.isspace[0x85] = 1;
 
-	pc = -1;
-	while(parse(utf8, field, arrlen(line), line)){
-		/* parse unicode value */
-		c = codepoint(field[Fcode]);
-		if(c >= NumRunes)
-			panicf("unexpected large codepoint %x", c);
-		if(c <= pc)
-			panicf("bad code sequence: %x then %x", pc, c);
-		pc = c;
-
-		/* see if this is a range */
-		if(strstr(field[Fname], ", First>") != nil){
-			if(!parse(utf8, field2, arrlen(line2), line2))
-				panicf("range start at end of file");
-			if(strstr(field2[Fname], ", Last>") == nil)
-				panicf("range start not followed by range end");
-
-			pc = codepoint(field2[Fcode]);
-
-			if(pc <= c)
-				panicf("bad code sequence: %x then %x", c, pc);
-			if(strcmp(field[Fcategory], field2[Fcategory]) != 0)
-				panicf("range with mismatched category");
-		}
-
+	ec = -1;
+	// NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any
+	while(parse(utf8, arrlen(field), field, arrlen(line), line)){
+		/* parse unicode range */
+		codepointrange(utf8, field, &sc, &ec);
 		prop = field[Fcategory];
-		for(; c <= pc; c++){
+
+		for(c = sc; c <= ec; c++){
 			/* grab properties */
 			switch(prop[0]){
 			case 'L':
@@ -533,30 +372,7 @@ main(int argc, char *argv[])
 	}
 	io·close(utf8);
 
-	puts(
-		"#include <u.h>\n"
-		"#include <libunicode.h>\n"
-		"\n"
-		"static\n"
-		"rune*\n"
-		"rangesearch(rune c, rune *t, int n, int ne)\n"
-		"{\n"
-		" rune *p;\n"
-		" int m;\n"
-		" while(n > 1) {\n"
-		" m = n >> 1;\n"
-		" p = t + m*ne;\n"
-		" if(c >= p[0]){\n"
-		" t = p;\n"
-		" n = n-m;\n"
-		" }else\n"
-		" n = m;\n"
-		" }\n"
-		" if(n && c >= t[0])\n"
-		" return t;\n"
-		" return 0;\n"
-		"}\n"
-	);
+	putsearch();
 
 	makeis("space", table.isspace, 0, 1);
 	makeis("digit", table.isdigit, 0, 1);