#include "common.h"

/*
 * Generator for the rune width lookup tables.
 *
 * inspired by design choices in utf8proc/charwidths.jl
 * all widths default to 1 unless they fall within the categories:
 *   1. Mn 2. Mc 3. Me 4. Zl
 *   5. Zp 6. Cc 7. Cf 8. Cs
 * these default to zero width
 */
enum
{
    /* width ? */
    WidthNeutral,   /* (N) practically treated like narrow but unclear ... */
    WidthAmbiguous, /* (A) sometimes wide and sometimes not... */
    /* width 1 */
    WidthHalf,      /* (H) = to narrow (compatibility equivalent) */
    WidthNarrow,    /* (Na) ASCII width */
    /* width 2 */
    WidthWide,      /* (W) 2x width */
    WidthFull,      /* (F) = to wide (compatibility equivalent) */
};

/*
 * width[w][c] is nonzero when codepoint c has been assigned display width w
 * (w = 0, 1 or 2). A codepoint is expected to be flagged in at most one row;
 * main() verifies this before emitting anything.
 */
struct Table
{
    char width[3][NumRunes];
};

static struct Table table;

// -----------------------------------------------------------------------
// internal functions

/*
 * Reads the file at path and assigns every codepoint range it describes a
 * default width based on the first two letters of its general category
 * (field[Fcategory]): Mn, Mc, Me, Zl, Zp, Cc, Cf and Cs get width 0,
 * everything else gets width 1.
 *
 * Panics if the file cannot be opened.
 */
static void
parse_category(char *path)
{
    int sc, c, ec, w;
    io·Stream *utf8;
    char *prop, *field[NumFields], line[1024];

    if(!(utf8 = io·open(path, "r")))
        panicf("can't open %s\n", path);

    // NOTE: we don't check for comments here
    ec = -1; /* NOTE(review): presumably seeds codepointrange's notion of the previous range end - confirm */
    while(parse(utf8, arrlen(field), field, arrlen(line), line)){
        codepointrange(utf8, field, &sc, &ec);

        prop = field[Fcategory];

        switch(prop[0]){
        case 'M':
            switch(prop[1]){
            case 'n': case 'c': case 'e':
                w = 0;
                break;
            default:
                w = 1;
                break;
            }
            break;
        case 'Z':
            switch(prop[1]){
            case 'l': case 'p':
                w = 0;
                break;
            default:
                w = 1;
                break;
            }
            break;
        case 'C':
            switch(prop[1]){
            case 'c': case 'f': case 's':
                w = 0;
                break;
            default:
                w = 1;
                break;
            }
            /*
             * This break used to be missing: control flow fell into the
             * default label and forced w = 1, so Cc/Cf/Cs never ended up
             * in the zero width table.
             */
            break;
        default:
            w = 1;
            break;
        }

        for(c = sc; c <= ec; c++)
            table.width[w][c] = 1;
    }
    io·close(utf8);
}

/*
 * Splits a codepoint field of the form "XXXX" or "XXXX..YYYY" into its
 * inclusive bounds *l and *r. For a single codepoint both bounds are equal.
 * The field is modified in place: the ".." separator is overwritten with
 * NUL bytes so each half can be handed to codepoint() on its own.
 */
static void
coderange(char *field, int *l, int *r)
{
    char *s;

    if(!(s = strstr(field, "..")))
        *l = *r = codepoint(field);
    else{
        *s++ = 0, *s++ = 0;
        *l = codepoint(field);
        *r = codepoint(s);
    }
}

/*
 * Reads the east asian width file at path and overrides the category based
 * defaults: classes Na and H become width 1, classes W and F become width 2.
 * Classes A (ambiguous) and N (neutral) are skipped, leaving whatever width
 * was assigned earlier. Any other class is treated as malformed input.
 *
 * Panics if the file cannot be opened.
 */
static void
parse_eawidths(char *path)
{
    int at, w;
    int l, c, r;
    io·Stream *utf8;
    char *field[2], line[1024];

    /* checked for consistency with parse_category */
    if(!(utf8 = io·open(path, "r")))
        panicf("can't open %s\n", path);

    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
        if(at == ParseSkip)
            continue;

        switch(field[1][0]){
        case 'A':
            continue;
        case 'N':
            /* plain "N" is neutral and skipped, "Na" is narrow */
            if(field[1][1] != 'a')
                continue;
            /* fallthrough */
        case 'H':
            w = 1;
            break;
        case 'W': /* fallthrough */
        case 'F':
            w = 2;
            break;
        default:
            panicf("malformed east asian width class: %s\n", field[1]);
        }

        coderange(field[0], &l, &r);

        for(c=l; c <= r; c++){
            /* ensure it only exists in one table */
            table.width[w][c] = 1;
            table.width[(w+1)%3][c] = 0;
            table.width[(w+2)%3][c] = 0;
        }
    }
    io·close(utf8);
}

/*
 * Reads the emoji data file at path and forces every codepoint whose
 * property field contains "Emoji_Presentation" to width 2.
 *
 * Panics if the file cannot be opened.
 */
static void
parse_emoji(char *path)
{
    int at;
    int l, c, r;
    io·Stream *utf8;
    char *s, *field[2], line[1024];

    /* checked for consistency with parse_category */
    if(!(utf8 = io·open(path, "r")))
        panicf("can't open %s\n", path);

    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
        if(at == ParseSkip)
            continue;

        /* only override emoji presentation */
        if(!strstr(field[1], "Emoji_Presentation"))
            continue;

        /* trim trailing space: every blank in the field becomes a terminator */
        for(s=field[0]; *s; s++){
            if(*s == ' ')
                *s = 0;
        }

        coderange(field[0], &l, &r);

        for(c=l; c <= r; c++){
            table.width[0][c] = 0;
            table.width[1][c] = 0;
            table.width[2][c] = 1;
        }
    }
    io·close(utf8);
}

/* output functions */

/*
 * Emits the lookup arrays for one width class followed by a predicate
 * "static int is<label>(rune c)" that searches them.
 *
 *   label      - suffix of the generated predicate and prefix of the arrays
 *   table      - per codepoint flags for this width class
 *   pairs      - when nonzero (and !onlyranges) also emit a "<label>_pair" array
 *   onlyranges - when nonzero emit only the "<label>_range" array
 *
 * A search stanza is printed only for arrays whose put* call returned
 * nonzero (NOTE(review): presumably the number of entries written - confirm
 * against putrange/putpair/putsingle).
 *
 * NOTE(review): the leading whitespace inside the emitted string literals is
 * reproduced exactly as found; confirm it matches the indentation wanted in
 * the generated file.
 */
static void
maketable(char *label, char *table, int pairs, int onlyranges)
{
    int n, r, p=0, s=0;
    char ident[3][128];

    enum
    {
        Irange,
        Ipair,
        Isingle,
    };

    /*
     * snprintf returns the length the full string would have had, so
     * truncation is signalled by any value >= the buffer size (not only
     * equality), and a negative value signals an output error.
     */

    /* ranges */
    n = snprintf(ident[Irange], arrlen(ident[Irange]), "%s_range", label);
    if(n < 0 || n >= (int)arrlen(ident[Irange]))
        panicf("out of identifier space\n");
    r = putrange(ident[Irange], table, onlyranges);

    if(!onlyranges && pairs){
        n = snprintf(ident[Ipair], arrlen(ident[Ipair]), "%s_pair", label);
        if(n < 0 || n >= (int)arrlen(ident[Ipair]))
            panicf("out of identifier space\n");
        p = putpair(ident[Ipair], table);
    }

    if(!onlyranges){
        n = snprintf(ident[Isingle], arrlen(ident[Isingle]), "%s_single", label);
        if(n < 0 || n >= (int)arrlen(ident[Isingle]))
            panicf("out of identifier space\n");
        s = putsingle(ident[Isingle], table);
    }

    printf(
        "static int\n"
        "is%s(rune c)\n"
        "{\n"
        " rune *p;\n"
        "\n",
        label);

    if(r){
        printf(
            " p = rangesearch(c, %s, arrlen(%s)/2, 2);\n"
            " if(p && c >= p[0] && c <= p[1])\n"
            " return 1;\n",
            ident[Irange], ident[Irange]);
    }

    if(p){
        /* the emitted test accepts every second codepoint counted from the start of the matched entry */
        printf(
            " p = rangesearch(c, %s, arrlen(%s)/2, 2);\n"
            " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
            " return 1;\n",
            ident[Ipair], ident[Ipair]);
    }

    if(s){
        printf(
            " p = rangesearch(c, %s, arrlen(%s), 1);\n"
            " if(p && c == p[0])\n"
            " return 1;\n",
            ident[Isingle], ident[Isingle]);
    }

    printf(
        " return 0;\n"
        "}\n"
        "\n");
}

// -----------------------------------------------------------------------
// main point of entry

/*
 * Prints the command line synopsis to stderr and exits with status 1.
 *
 * NOTE(review): the argument placeholders were missing from the message;
 * they are restored here from the order in which main() consumes its three
 * paths. Confirm the wording.
 */
static void
usage(void)
{
    fprintf(stderr, "usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt> <emoji-data.txt>\n");
    exit(1);
}

/*
 * Force codepoint c into exactly one width table.
 * Wrapped in do{}while(0) so each use behaves as a single statement.
 */
#define SETW0(c) \
    do{ \
        table.width[0][(c)] = 1; \
        table.width[1][(c)] = 0; \
        table.width[2][(c)] = 0; \
    }while(0)

#define SETW1(c) \
    do{ \
        table.width[0][(c)] = 0; \
        table.width[1][(c)] = 1; \
        table.width[2][(c)] = 0; \
    }while(0)

#define SETW2(c) \
    do{ \
        table.width[0][(c)] = 0; \
        table.width[1][(c)] = 0; \
        table.width[2][(c)] = 1; \
    }while(0)

/*
 * Expects exactly three paths after option processing, consumed in order by
 * parse_category, parse_eawidths and parse_emoji. Applies a few manual
 * overrides, sanity checks the tables and writes the generated C source to
 * stdout.
 */
int
main(int argc, char *argv[])
{
    int c;

    ARGBEGIN{
    }ARGEND;

    if(argc != 3)
        usage();

    parse_category(*argv++);
    parse_eawidths(*argv++);
    parse_emoji(*argv);

    /* overrides */
    SETW0(0x2028);
    SETW0(0x2029);
    SETW1(0x00AD);

    /*
     * simple checking: no codepoint may be flagged in more than one table
     * NOTE(review): the loop header and condition were damaged in the text
     * this was recovered from; verify the bound and the test.
     */
    for(c=0; c < NumRunes; c++){
        if(table.width[0][c] + table.width[1][c] + table.width[2][c] > 1)
            panicf("improper table state");
    }

    putsearch();

    maketable("width0", table.width[0], 1, 0);
    maketable("width1", table.width[1], 1, 0);
    maketable("width2", table.width[2], 1, 0);

    /* the generated lookup: anything in neither the width 1 nor the width 2 table reports 0 */
    puts(
        "\n"
        "int\n"
        "utf8·runewidth(rune c)\n"
        "{\n"
        " if(iswidth1(c))\n"
        " return 1;\n"
        " if(iswidth2(c))\n"
        " return 2;\n"
        " return 0;\n"
        "}"
    );

    return 0;
}