#include #include #include // ----------------------------------------------------------------------- // globals #define OFFSET (1 << 20) #define DELTA(mapx, x) ((1 << 20) + (mapx) - (x)) enum { // Fields inside UnicodeData.txt Fcode, Fname, Fcategory, Fcombine, Fbidir, Fdecomp, Fdecimal, Fdigit, Fnumeric, Fmirror, Foldname, Fcomment, Fupper, Flower, Ftitle, NumFields, NumRunes = 1 << 21, }; // TODO: use bitarrays. will reduce executable size 8x struct Table { /* properties */ char isspace[NumRunes]; char isalpha[NumRunes]; char ismark[NumRunes]; char isdigit[NumRunes]; char isupper[NumRunes]; char islower[NumRunes]; char istitle[NumRunes]; char ispunct[NumRunes]; char issymbl[NumRunes]; char iscntrl[NumRunes]; char combine[NumRunes]; /* transformations */ int toupper[NumRunes]; int tolower[NumRunes]; int totitle[NumRunes]; }; static struct Table table; // ----------------------------------------------------------------------- // internal functions int parse(io·Stream *io, char **field, int len, char *line) { int n; if((n=io·readln(io, len, line)) <= 0) return 0; if(n == len) panicf("line too long"); if(line[n-1] != '\n') panicf("invalid line: expected '\n', found '%c'", line[n]); line[n-1] = 0; /* tokenize line into fields */ n = 0; field[n] = line; while(*line){ if(*line == ';'){ *line = 0; field[++n] = line+1; } line++; } if(n != NumFields-1) panicf("expected %d number of fields, got %d", NumFields, n); return 1; } static int codepoint(char *s) { int c, b; c = 0; while((b=*s++)){ c <<= 4; if(b >= '0' && b <= '9') c += b - '0'; else if(b >= 'A' && b <= 'F') c += b - 'A' + 10; else panicf("bad codepoint char '%c'", b); } return c; } static int isrange(char *label, char *prop, int force) { int l, r, start; start = 0; for(l = 0; l < NumRunes;) { if(!prop[l]){ l++; continue; } for(r = l+1; r < NumRunes; r++){ if(!prop[r]) break; prop[r] = 0; } if(force || r > l + 1){ if(!start){ printf("static rune is%s_range[] = {\n", label); start = 1; } prop[l] = 0; printf("\t0x%.4x, 0x%.4x,\n", l, r-1); } l = r; } if(start) printf("};\n\n"); return start; } static int ispair(char *label, char *prop) { int l, r, start; start = 0; for(l=0; l+2 < NumRunes; ){ if(!prop[l]){ l++; continue; } for(r = l + 2; r < NumRunes; r += 2){ if(!prop[r]) break; prop[r] = 0; } if(r != l + 2){ if(!start){ printf("static rune is%s_pair[] = {\n", label); start = 1; } prop[l] = 0; printf("\t0x%.4x, 0x%.4x,\n", l, r - 2); } l = r; } if(start) printf("};\n\n"); return start; } static int issingle(char *label, char *prop) { int i, start; start = 0; for(i = 0; i < NumRunes; i++) { if(!prop[i]) continue; if(!start){ printf("static rune is%s_single[] = {\n", label); start = 1; } prop[i] = 0; printf("\t0x%.4x,\n", i); } if(start) printf("};\n\n"); return start; } static void makeis(char *label, char *table, int pairs, int onlyranges) { int hasr, hasp=0, hass=0; hasr = isrange(label, table, onlyranges); if(!onlyranges && pairs) hasp = ispair(label, table); if(!onlyranges) hass = issingle(label, table); printf( "int\n" "utf8·is%s(rune c)\n" "{\n" " rune *p;\n" "\n", label); if(hasr){ printf( " p = rangesearch(c, is%s_range, arrlen(is%s_range)/2, 2);\n" " if(p && c >= p[0] && c <= p[1])\n" " return 1;\n", label, label); } if(hasp){ printf( " p = rangesearch(c, is%s_pair, arrlen(is%s_pair)/2, 2);\n" " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" " return 1;\n", label, label); } if(hass) printf( " p = rangesearch(c, is%s_single, arrlen(is%s_single), 1);\n" " if(p && c == p[0])\n" " return 1;\n", label, label); printf( " return 0;\n" "}\n" "\n"); } static int torange(char *label, int *index, int force) { int l, r, d, start = 0; for(l = 0; l < NumRunes; ){ if(index[l] == l){ l++; continue; } d = DELTA(index[l], l); if(d != (rune)d) panicf("bad map delta %d", d); for(r = l+1; r < NumRunes; r++){ if(DELTA(index[r], r) != d) break; index[r] = r; } if(force || r != l + 1){ if(!start){ printf("static rune to%s_range[] = {\n", label); start = 1; } index[l] = l; printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-1, d); } l = r; } if(start) printf("};\n\n"); return start; } static int topair(char *label, int *index) { int l, r, d, start = 0; for(l = 0; l + 2 < NumRunes; ){ if(index[l] == l){ l++; continue; } d = DELTA(index[l], l); if(d != (rune)d) panicf("bad delta %d", d); for(r = l+2; r < NumRunes; r += 2){ if(DELTA(index[r], r) != d) break; index[r] = r; } if(r > l+2){ if(!start){ printf("static rune to%s_pair[] = {\n", label); start = 1; } index[l] = l; printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-2, d); } l = r; } if(start) printf("};\n\n"); return start; } static int tosingle(char *label, int *index) { int i, d, start = 0; for(i=0; i < NumRunes; i++) { if(index[i] == i) continue; d = DELTA(index[i], i); if(d != (rune)d) panicf("bad map delta %d", d); if(!start){ printf("static rune to%s_single[] = {\n", label); start = 1; } index[i] = i; printf("\t0x%.4x, %d,\n", i, d); } if(start) printf("};\n\n"); return start; } static void mkto(char *label, int *index, int pairs, int onlyrange) { int hasr, hasp=0, hass=0; hasr = torange(label, index, !onlyrange); if(!onlyrange && pairs) hasp = topair(label, index); if(!onlyrange) hass = tosingle(label, index); printf( "rune\n" "utf8·to%s(rune c)\n" "{\n" " rune *p;\n" "\n", label); if(hasr) printf( " p = rangesearch(c, to%s_range, arrlen(to%s_range)/3, 3);\n" " if(p && c >= p[0] && c <= p[1])\n" " return c + p[2] - %d;\n", label, label, OFFSET); if(hasp) printf( " p = rangesearch(c, to%s_pair, arrlen(to%s_pair)/3, 3);\n" " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n" " return c + p[2] - %d;\n", label, label, OFFSET); if(hass) printf( " p = rangesearch(c, to%s_single, arrlen(to%s_single)/2, 2);\n" " if(p && c == p[0])\n" " return c + p[1] - %d;\n", label, label, OFFSET); printf( " return c;\n" "}\n" "\n" ); } // ----------------------------------------------------------------------- // main point of entry static void usage(void) { fprintf(stderr, "usage: mkutf8data \n"); exit(1); } int main(int argc, char *argv[]) { int i, c, pc; io·Stream *utf8; char *prop, *field[NumFields], *field2[NumFields], line[1024], line2[1024]; ARGBEGIN{ }ARGEND; if(argc != 1) usage(); if(!(utf8 = io·open(argv[0], "r"))) panicf("can't open %s\n", argv[0]); /* by default each character maps to itself */ for(i = 0; i < NumRunes; i++) { table.toupper[i] = i; table.tolower[i] = i; table.totitle[i] = i; } /* ensure all C local white space characters pass */ table.isspace['\t'] = 1; table.isspace['\n'] = 1; table.isspace['\r'] = 1; table.isspace['\f'] = 1; table.isspace['\v'] = 1; table.isspace[0x85] = 1; pc = -1; while(parse(utf8, field, arrlen(line), line)){ /* parse unicode value */ c = codepoint(field[Fcode]); if(c >= NumRunes) panicf("unexpected large codepoint %x", c); if(c <= pc) panicf("bad code sequence: %x then %x", pc, c); pc = c; /* see if this is a range */ if(strstr(field[Fname], ", First>") != nil){ if(!parse(utf8, field2, arrlen(line2), line2)) panicf("range start at end of file"); if(strstr(field2[Fname], ", Last>") == nil) panicf("range start not followed by range end"); pc = codepoint(field2[Fcode]); if(pc <= c) panicf("bad code sequence: %x then %x", c, pc); if(strcmp(field[Fcategory], field2[Fcategory]) != 0) panicf("range with mismatched category"); } prop = field[Fcategory]; for(; c <= pc; c++){ /* grab properties */ switch(prop[0]){ case 'L': table.isalpha[c] = 1; switch(prop[1]){ case 'u': table.isupper[c] = 1; break; case 'l': table.islower[c] = 1; break; case 't': table.istitle[c] = 1; break; case 'm': break; // modifier letters case 'o': break; // ideograph letters default: goto badproperty; } break; case 'Z': table.isspace[c] = 1; break; case 'M': table.ismark[c] = 1; break; case 'N': table.isdigit[c] = 1; break; case 'P': table.ispunct[c] = 1; break; case 'S': table.issymbl[c] = 1; break; case 'C': table.iscntrl[c] = 1; break; default: badproperty: panicf("unrecognized category '%s'", prop); } /* grab transformations */ if(*field[Fupper]) table.toupper[c] = codepoint(field[Fupper]); if(*field[Flower]) table.tolower[c] = codepoint(field[Flower]); if(*field[Ftitle]) table.totitle[c] = codepoint(field[Ftitle]); } } io·close(utf8); puts( "#include \n" "#include \n" "\n" "static\n" "rune*\n" "rangesearch(rune c, rune *t, int n, int ne)\n" "{\n" " rune *p;\n" " int m;\n" " while(n > 1) {\n" " m = n >> 1;\n" " p = t + m*ne;\n" " if(c >= p[0]){\n" " t = p;\n" " n = n-m;\n" " }else\n" " n = m;\n" " }\n" " if(n && c >= t[0])\n" " return t;\n" " return 0;\n" "}\n" ); makeis("space", table.isspace, 0, 1); makeis("digit", table.isdigit, 0, 1); makeis("alpha", table.isalpha, 0, 0); makeis("upper", table.isupper, 1, 0); makeis("lower", table.islower, 1, 0); makeis("title", table.istitle, 1, 0); mkto("upper", table.toupper, 1, 0); mkto("lower", table.tolower, 1, 0); mkto("title", table.totitle, 1, 0); }