#include "common.h"

// -----------------------------------------------------------------------
// globals

/*
 * Deltas between a code point and its mapped code point are stored biased by
 * OFFSET so that the emitted table values stay non-negative; the generated
 * lookup code subtracts OFFSET again.
 */
#define OFFSET (1 << 20)
#define DELTA(mapx, x) (OFFSET + (mapx) - (x))

// TODO: use bitarrays. will reduce executable size 8x
/* One entry per code point, filled in by main() from the input file. */
struct Table
{
    /* properties */
    char isspace[NumRunes];
    char isalpha[NumRunes];
    char ismark[NumRunes];
    char isdigit[NumRunes];
    char isupper[NumRunes];
    char islower[NumRunes];
    char istitle[NumRunes];
    char ispunct[NumRunes];
    char issymbl[NumRunes];
    char iscntrl[NumRunes];
    char combine[NumRunes];

    /* transformations */
    int toupper[NumRunes];
    int tolower[NumRunes];
    int totitle[NumRunes];
};

static struct Table table;

// -----------------------------------------------------------------------
// internal functions

/*
 * Format the table identifier "is<label>_<kind>" into ident (len bytes).
 * Panics if the identifier does not fit or snprintf reports an error.
 *
 * snprintf returns the length the full string would have had, so any value
 * >= len means the result was truncated.
 */
static void
mkident(char *ident, int len, char *label, char *kind)
{
    int n;

    n = snprintf(ident, len, "is%s_%s", label, kind);
    if(n < 0 || n >= len)
        panicf("out of identifier space\n");
}

/* Emit the "is<label>_range" table via putrange; returns putrange's result. */
static int
isrange(char *label, char *prop, int force)
{
    char ident[128];

    mkident(ident, arrlen(ident), label, "range");
    return putrange(ident, prop, force);
}

/* Emit the "is<label>_pair" table via putpair; returns putpair's result. */
static int
ispair(char *label, char *prop)
{
    char ident[128];

    mkident(ident, arrlen(ident), label, "pair");
    return putpair(ident, prop);
}

/* Emit the "is<label>_single" table via putsingle; returns putsingle's result. */
static int
issingle(char *label, char *prop)
{
    char ident[128];

    mkident(ident, arrlen(ident), label, "single");
    return putsingle(ident, prop);
}

/*
 * Print the lookup tables for one property followed by the C source of the
 * predicate utf8·is<label>(rune).
 *
 *   label      - suffix used in the generated identifiers
 *   table      - per-code-point property flags
 *   pairs      - if non-zero (and !onlyranges), also emit a pair table
 *   onlyranges - if non-zero, emit only the range table (passed to
 *                isrange as its force argument)
 *
 * The generated function only searches the tables that were reported as
 * emitted (non-zero return from isrange/ispair/issingle).
 */
static void
makeis(char *label, char *table, int pairs, int onlyranges)
{
    int hasr, hasp=0, hass=0;

    hasr = isrange(label, table, onlyranges);
    if(!onlyranges && pairs)
        hasp = ispair(label, table);
    if(!onlyranges)
        hass = issingle(label, table);

    printf(
        "int\n"
        "utf8·is%s(rune c)\n"
        "{\n"
        " rune *p;\n"
        "\n",
        label);

    if(hasr){
        printf(
            " p = rangesearch(c, is%s_range, arrlen(is%s_range)/2, 2);\n"
            " if(p && c >= p[0] && c <= p[1])\n"
            " return 1;\n",
            label, label);
    }

    if(hasp){
        printf(
            " p = rangesearch(c, is%s_pair, arrlen(is%s_pair)/2, 2);\n"
            " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
            " return 1;\n",
            label, label);
    }

    if(hass){
        printf(
            " p = rangesearch(c, is%s_single, arrlen(is%s_single), 1);\n"
            " if(p && c == p[0])\n"
            " return 1;\n",
            label, label);
    }

    printf(
        " return 0;\n"
        "}\n"
        "\n");
}

/*
 * Emit "to<label>_range": one {lo, hi, biased delta} triple per run of
 * consecutive code points that share the same mapping delta.
 *
 *   index - mapping table; entries equal to their own position are skipped.
 *           Entries consumed by an emitted run are reset to identity so that
 *           later passes (topair/tosingle) do not see them again.
 *   force - if non-zero, runs of length one are emitted as well.
 *
 * Returns 1 if the table was printed, 0 if there was nothing to emit.
 */
static int
torange(char *label, int *index, int force)
{
    int l, r, d, start = 0;

    for(l = 0; l < NumRunes; ){
        if(index[l] == l){
            l++;
            continue;
        }

        d = DELTA(index[l], l);
        if(d != (rune)d)
            panicf("bad map delta %d", d);

        /* extend the run while the delta stays the same */
        for(r = l+1; r < NumRunes; r++){
            if(DELTA(index[r], r) != d)
                break;
            index[r] = r;
        }

        if(force || r != l + 1){
            if(!start){
                printf("static rune to%s_range[] = {\n", label);
                start = 1;
            }
            index[l] = l;
            printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-1, d);
        }
        l = r;
    }

    if(start)
        printf("};\n\n");

    return start;
}

/*
 * Emit "to<label>_pair": {lo, hi, biased delta} triples for runs in which
 * every second code point (lo, lo+2, lo+4, ...) shares the same delta.
 * Only runs with at least two members are emitted; consumed entries of
 * index are reset to identity.
 *
 * Returns 1 if the table was printed, 0 otherwise.
 */
static int
topair(char *label, int *index)
{
    int l, r, d, start = 0;

    for(l = 0; l + 2 < NumRunes; ){
        if(index[l] == l){
            l++;
            continue;
        }

        d = DELTA(index[l], l);
        if(d != (rune)d)
            panicf("bad delta %d", d);

        /* step by two: the code points in between are not part of the run */
        for(r = l+2; r < NumRunes; r += 2){
            if(DELTA(index[r], r) != d)
                break;
            index[r] = r;
        }

        if(r > l+2){
            if(!start){
                printf("static rune to%s_pair[] = {\n", label);
                start = 1;
            }
            index[l] = l;
            printf("\t0x%.4x, 0x%.4x, %d,\n", l, r-2, d);
        }

        l = r;
    }

    if(start)
        printf("};\n\n");

    return start;
}

/*
 * Emit "to<label>_single": one {code point, biased delta} pair for every
 * entry of index that still differs from identity. Each emitted entry is
 * reset to identity.
 *
 * Returns 1 if the table was printed, 0 otherwise.
 */
static int
tosingle(char *label, int *index)
{
    int i, d, start = 0;

    for(i=0; i < NumRunes; i++) {
        if(index[i] == i)
            continue;

        d = DELTA(index[i], i);
        if(d != (rune)d)
            panicf("bad map delta %d", d);

        if(!start){
            printf("static rune to%s_single[] = {\n", label);
            start = 1;
        }
        index[i] = i;
        printf("\t0x%.4x, %d,\n", i, d);
    }

    if(start)
        printf("};\n\n");

    return start;
}

/*
 * Print the mapping tables for one transformation followed by the C source
 * of utf8·to<label>(rune), which returns the mapped rune or c itself when no
 * table matches.
 *
 *   label     - suffix used in the generated identifiers
 *   index     - per-code-point mapping; modified (reset to identity) as
 *               entries are emitted
 *   pairs     - if non-zero (and !onlyrange), also emit a pair table
 *   onlyrange - if non-zero, emit only the range table
 */
static void
mkto(char *label, int *index, int pairs, int onlyrange)
{
    int hasr, hasp=0, hass=0;

    /*
     * NOTE(review): force is passed as !onlyrange here, whereas makeis passes
     * onlyranges unchanged. With onlyrange == 0 this forces every mapping,
     * including runs of length one, into the range table, so topair and
     * tosingle find nothing left to emit. Kept as-is to avoid changing the
     * generated tables; confirm whether this is intended.
     */
    hasr = torange(label, index, !onlyrange);
    if(!onlyrange && pairs)
        hasp = topair(label, index);
    if(!onlyrange)
        hass = tosingle(label, index);

    printf(
        "rune\n"
        "utf8·to%s(rune c)\n"
        "{\n"
        " rune *p;\n"
        "\n",
        label);

    if(hasr){
        printf(
            " p = rangesearch(c, to%s_range, arrlen(to%s_range)/3, 3);\n"
            " if(p && c >= p[0] && c <= p[1])\n"
            " return c + p[2] - %d;\n",
            label, label, OFFSET);
    }

    if(hasp){
        printf(
            " p = rangesearch(c, to%s_pair, arrlen(to%s_pair)/3, 3);\n"
            " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
            " return c + p[2] - %d;\n",
            label, label, OFFSET);
    }

    if(hass){
        printf(
            " p = rangesearch(c, to%s_single, arrlen(to%s_single)/2, 2);\n"
            " if(p && c == p[0])\n"
            " return c + p[1] - %d;\n",
            label, label, OFFSET);
    }

    printf(
        " return c;\n"
        "}\n"
        "\n"
    );
}

// -----------------------------------------------------------------------
// main point of entry

/* Print the command-line synopsis to stderr and exit with status 1. */
static void
usage(void)
{
    /* exactly one positional argument is required: the data file to read */
    fprintf(stderr, "usage: mkrunetype <UnicodeData.txt>\n");
    exit(1);
}

/*
 * Read the file named by the single command-line argument, record the
 * category and case mappings of every code point in the global table, then
 * print the generated search helper, predicates and mapping functions to
 * stdout.
 */
int
main(int argc, char *argv[])
{
    int i, sc, c, ec;
    io·Stream *utf8;
    char *prop, *field[NumFields], line[1024];

    ARGBEGIN{
    }ARGEND;

    if(argc != 1)
        usage();

    if(!(utf8 = io·open(argv[0], "r")))
        panicf("can't open %s\n", argv[0]);

    /* by default each character maps to itself */
    for(i = 0; i < NumRunes; i++) {
        table.toupper[i] = i;
        table.tolower[i] = i;
        table.totitle[i] = i;
    }

    /* ensure all C local white space characters pass */
    table.isspace['\t'] = 1;
    table.isspace['\n'] = 1;
    table.isspace['\r'] = 1;
    table.isspace['\f'] = 1;
    table.isspace['\v'] = 1;
    table.isspace[0x85] = 1;

    ec = -1;
    // NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any
    while(parse(utf8, arrlen(field), field, arrlen(line), line)){
        /* parse unicode range */
        codepointrange(utf8, field, &sc, &ec);

        /* sc..ec index the NumRunes-sized tables below; refuse to write past them */
        if(sc < 0 || ec >= NumRunes)
            panicf("code point range 0x%x-0x%x outside of table\n", sc, ec);

        prop = field[Fcategory];

        for(c = sc; c <= ec; c++){
            /* grab properties */
            switch(prop[0]){
            case 'L':
                table.isalpha[c] = 1;
                switch(prop[1]){
                case 'u': table.isupper[c] = 1; break;
                case 'l': table.islower[c] = 1; break;
                case 't': table.istitle[c] = 1; break;
                case 'm': break; // modifier letters
                case 'o': break; // ideograph letters
                default:
                    goto badproperty;
                }
                break;
            case 'Z': table.isspace[c] = 1; break;
            case 'M': table.ismark[c] = 1; break;
            case 'N': table.isdigit[c] = 1; break;
            case 'P': table.ispunct[c] = 1; break;
            case 'S': table.issymbl[c] = 1; break;
            case 'C': table.iscntrl[c] = 1; break;
            default:
            badproperty:
                panicf("unrecognized category '%s'", prop);
            }

            /* grab transformations */
            if(*field[Fupper])
                table.toupper[c] = codepoint(field[Fupper]);
            if(*field[Flower])
                table.tolower[c] = codepoint(field[Flower]);
            if(*field[Ftitle])
                table.totitle[c] = codepoint(field[Ftitle]);
        }
    }
    io·close(utf8);

    putsearch();

    makeis("space", table.isspace, 0, 1);
    makeis("digit", table.isdigit, 0, 1);
    makeis("alpha", table.isalpha, 0, 0);
    makeis("upper", table.isupper, 1, 0);
    makeis("lower", table.islower, 1, 0);
    makeis("title", table.istitle, 1, 0);
    makeis("punct", table.ispunct, 1, 0);

    mkto("upper", table.toupper, 1, 0);
    mkto("lower", table.tolower, 1, 0);
    mkto("title", table.totitle, 1, 0);

    return 0;
}