aboutsummaryrefslogtreecommitdiff
path: root/src/libutf/vendor/mkrunetype.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libutf/vendor/mkrunetype.c')
-rw-r--r--src/libutf/vendor/mkrunetype.c388
1 files changed, 388 insertions, 0 deletions
diff --git a/src/libutf/vendor/mkrunetype.c b/src/libutf/vendor/mkrunetype.c
new file mode 100644
index 0000000..9f939f4
--- /dev/null
+++ b/src/libutf/vendor/mkrunetype.c
@@ -0,0 +1,388 @@
+#include "common.h"
+
+// -----------------------------------------------------------------------
+// globals
+
+/*
+ * Case-mapping deltas are written to the generated tables biased by
+ * OFFSET: DELTA() adds the bias when a table entry is computed, and the
+ * generated utf8·to*() functions subtract OFFSET again after the lookup.
+ * DELTA is defined in terms of OFFSET so the two sides cannot drift apart.
+ * NOTE(review): the bias presumably keeps negative deltas representable
+ * in the `rune` tables -- confirm against the definition of `rune`.
+ */
+#define OFFSET (1 << 20)
+#define DELTA(mapx, x) (OFFSET + (mapx) - (x))
+
+// TODO: use bitarrays. will reduce executable size 8x
+/*
+ * Per-code-point scratch tables, one slot per rune in [0, NumRunes).
+ * main() fills them while reading the input file; makeis() and mkto()
+ * then turn them into the generated lookup tables.
+ */
+struct Table
+{
+ /* properties */
+ /* each flag is set to 1 from the first letter(s) of the category field */
+ char isspace[NumRunes]; /* category 'Z', plus the C whitespace bytes and 0x85 */
+ char isalpha[NumRunes]; /* category 'L' */
+ char ismark[NumRunes]; /* category 'M' */
+ char isdigit[NumRunes]; /* category 'N' */
+ char isupper[NumRunes]; /* category "Lu" */
+ char islower[NumRunes]; /* category "Ll" */
+ char istitle[NumRunes]; /* category "Lt" */
+ char ispunct[NumRunes]; /* category 'P' */
+ char issymbl[NumRunes]; /* category 'S' */
+ char iscntrl[NumRunes]; /* category 'C' */
+
+ /* NOTE(review): not read or written anywhere in this file */
+ char combine[NumRunes];
+
+ /* transformations */
+ /* initialised to the identity map, then overwritten from the case-mapping fields */
+ int toupper[NumRunes];
+ int tolower[NumRunes];
+ int totitle[NumRunes];
+};
+
+/* single file-scope instance; static storage, so every flag starts at zero */
+static struct Table table;
+
+// -----------------------------------------------------------------------
+// internal functions
+
+/*
+ * isrange: build the identifier "is<label>_range" and hand it, together
+ * with the property array, to putrange().
+ *
+ * label: property name spliced into the identifier
+ * prop:  per-rune property flags
+ * force: passed through to putrange() unchanged
+ *
+ * Returns putrange()'s result; makeis() treats non-zero as "a range
+ * table was emitted". Panics if the identifier does not fit.
+ */
+static
+int
+isrange(char *label, char *prop, int force)
+{
+	char ident[128];
+	int n;
+
+	/*
+	 * snprintf reports the length it wanted to write, so any value at or
+	 * above the buffer size means truncation; negative means an error.
+	 */
+	n = snprintf(ident, arrlen(ident), "is%s_range", label);
+	if(n < 0 || n >= (int)arrlen(ident))
+		panicf("out of identifier space\n");
+
+	return putrange(ident, prop, force);
+}
+
+/*
+ * ispair: build the identifier "is<label>_pair" and hand it, together
+ * with the property array, to putpair().
+ *
+ * label: property name spliced into the identifier
+ * prop:  per-rune property flags
+ *
+ * Returns putpair()'s result; makeis() treats non-zero as "a pair table
+ * was emitted". Panics if the identifier does not fit.
+ */
+static
+int
+ispair(char *label, char *prop)
+{
+	char ident[128];
+	int n;
+
+	/*
+	 * snprintf reports the length it wanted to write, so any value at or
+	 * above the buffer size means truncation; negative means an error.
+	 */
+	n = snprintf(ident, arrlen(ident), "is%s_pair", label);
+	if(n < 0 || n >= (int)arrlen(ident))
+		panicf("out of identifier space\n");
+
+	return putpair(ident, prop);
+}
+
+/*
+ * issingle: build the identifier "is<label>_single" and hand it,
+ * together with the property array, to putsingle().
+ *
+ * label: property name spliced into the identifier
+ * prop:  per-rune property flags
+ *
+ * Returns putsingle()'s result; makeis() treats non-zero as "a singles
+ * table was emitted". Panics if the identifier does not fit.
+ */
+static
+int
+issingle(char *label, char *prop)
+{
+	char ident[128];
+	int n;
+
+	/*
+	 * snprintf reports the length it wanted to write, so any value at or
+	 * above the buffer size means truncation; negative means an error.
+	 */
+	n = snprintf(ident, arrlen(ident), "is%s_single", label);
+	if(n < 0 || n >= (int)arrlen(ident))
+		panicf("out of identifier space\n");
+
+	return putsingle(ident, prop);
+}
+
+/*
+ * makeis: write the tables for one boolean property to stdout, followed
+ * by the generated predicate utf8·is<label>() that searches them.
+ *
+ * label:      property name used in every generated identifier
+ * table:      per-rune property flags
+ * pairs:      non-zero to also attempt a pair table
+ * onlyranges: non-zero to emit a range table only; the flag is also
+ *             forwarded to isrange() as its force argument
+ *
+ * The generated function only contains a lookup stanza for a table that
+ * the corresponding helper reported as emitted.
+ */
+static
+void
+makeis(char *label, char *table, int pairs, int onlyranges)
+{
+	int ranges, alternating, singles;
+
+	alternating = 0;
+	singles = 0;
+
+	/* same call order as always: range first, then pair, then single */
+	ranges = isrange(label, table, onlyranges);
+	if(!onlyranges){
+		if(pairs)
+			alternating = ispair(label, table);
+		singles = issingle(label, table);
+	}
+
+	/* function header */
+	printf("int\n" "utf8·is%s(rune c)\n" "{\n" " rune *p;\n" "\n", label);
+
+	/* closed interval [p[0], p[1]] */
+	if(ranges)
+		printf(
+			" p = rangesearch(c, is%s_range, arrlen(is%s_range)/2, 2);\n"
+			" if(p && c >= p[0] && c <= p[1])\n"
+			" return 1;\n",
+			label, label);
+
+	/* every second rune of the interval, counted from p[0] */
+	if(alternating)
+		printf(
+			" p = rangesearch(c, is%s_pair, arrlen(is%s_pair)/2, 2);\n"
+			" if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+			" return 1;\n",
+			label, label);
+
+	/* exact match */
+	if(singles)
+		printf(
+			" p = rangesearch(c, is%s_single, arrlen(is%s_single), 1);\n"
+			" if(p && c == p[0])\n"
+			" return 1;\n",
+			label, label);
+
+	/* function trailer */
+	printf(" return 0;\n" "}\n" "\n");
+}
+
+/*
+ * torange: emit "to<label>_range", a table of (first, last, delta)
+ * triples, one per run of consecutive runes that share the same DELTA.
+ *
+ * label: mapping name used in the table identifier
+ * index: rune -> mapped rune; entries written to the table are reset to
+ *        the identity so that later passes skip them
+ * force: non-zero to emit runs of length one as well
+ *
+ * Returns 1 if a table was written, 0 otherwise.
+ */
+static
+int
+torange(char *label, int *index, int force)
+{
+	int lo, hi, delta, opened;
+
+	opened = 0;
+	lo = 0;
+	while(lo < NumRunes){
+		/* identity entries carry no information */
+		if(index[lo] == lo){
+			lo++;
+			continue;
+		}
+
+		delta = DELTA(index[lo], lo);
+		if(delta != (rune)delta)
+			panicf("bad map delta %d", delta);
+
+		/* extend the run while the delta holds, consuming entries as we go */
+		hi = lo + 1;
+		while(hi < NumRunes && DELTA(index[hi], hi) == delta){
+			index[hi] = hi;
+			hi++;
+		}
+
+		if(force || hi - lo > 1){
+			if(!opened){
+				printf("static rune to%s_range[] = {\n", label);
+				opened = 1;
+			}
+			index[lo] = lo;
+			printf("\t0x%.4x, 0x%.4x, %d,\n", lo, hi-1, delta);
+		}
+		lo = hi;
+	}
+
+	if(opened)
+		printf("};\n\n");
+
+	return opened;
+}
+
+/*
+ * topair: emit "to<label>_pair", a table of (first, last, delta) triples
+ * for runs in which every second rune shares the same DELTA.
+ *
+ * label: mapping name used in the table identifier
+ * index: rune -> mapped rune; entries written to the table are reset to
+ *        the identity so that later passes skip them
+ *
+ * Returns 1 if a table was written, 0 otherwise.
+ */
+static
+int
+topair(char *label, int *index)
+{
+	int lo, next, delta, opened;
+
+	opened = 0;
+	lo = 0;
+	while(lo + 2 < NumRunes){
+		/* identity entries carry no information */
+		if(index[lo] == lo){
+			lo++;
+			continue;
+		}
+
+		delta = DELTA(index[lo], lo);
+		if(delta != (rune)delta)
+			panicf("bad delta %d", delta);
+
+		/* step two runes at a time while the delta holds */
+		next = lo + 2;
+		while(next < NumRunes && DELTA(index[next], next) == delta){
+			index[next] = next;
+			next += 2;
+		}
+
+		/* only worth a table row if at least one further entry matched */
+		if(next - lo > 2){
+			if(!opened){
+				printf("static rune to%s_pair[] = {\n", label);
+				opened = 1;
+			}
+			index[lo] = lo;
+			printf("\t0x%.4x, 0x%.4x, %d,\n", lo, next-2, delta);
+		}
+
+		lo = next;
+	}
+
+	if(opened)
+		printf("};\n\n");
+
+	return opened;
+}
+
+/*
+ * tosingle: emit "to<label>_single", a table of (rune, delta) couples
+ * for every entry of index that is still not the identity.
+ *
+ * label: mapping name used in the table identifier
+ * index: rune -> mapped rune; each emitted entry is reset to the identity
+ *
+ * Returns 1 if a table was written, 0 otherwise.
+ */
+static
+int
+tosingle(char *label, int *index)
+{
+	int cp, delta, opened;
+
+	opened = 0;
+	for(cp = 0; cp < NumRunes; cp++){
+		if(index[cp] != cp){
+			delta = DELTA(index[cp], cp);
+			if(delta != (rune)delta)
+				panicf("bad map delta %d", delta);
+
+			/* open the array lazily, on the first entry */
+			if(!opened){
+				printf("static rune to%s_single[] = {\n", label);
+				opened = 1;
+			}
+			index[cp] = cp;
+			printf("\t0x%.4x, %d,\n", cp, delta);
+		}
+	}
+
+	if(opened)
+		printf("};\n\n");
+
+	return opened;
+}
+
+/*
+ * mkto: write the tables for one case mapping to stdout, followed by the
+ * generated function utf8·to<label>() that searches them.
+ *
+ * label:     mapping name used in every generated identifier
+ * index:     rune -> mapped rune; consumed (reset towards the identity)
+ *            by the table passes
+ * pairs:     non-zero to also attempt a pair table
+ * onlyrange: non-zero to emit a range table only
+ *
+ * The generated function only contains a lookup stanza for a table that
+ * the corresponding pass reported as emitted, and returns its argument
+ * unchanged when no table matches.
+ */
+static
+void
+mkto(char *label, int *index, int pairs, int onlyrange)
+{
+	int hasr, hasp=0, hass=0;
+
+	/*
+	 * Lone mappings are forced into the range table only when no pair or
+	 * single pass will follow; otherwise they would never be emitted.
+	 * When those passes do run, lone mappings are left for them so the
+	 * range table stays compact. This matches how makeis() drives
+	 * isrange().
+	 */
+	hasr = torange(label, index, onlyrange);
+	if(!onlyrange && pairs)
+		hasp = topair(label, index);
+	if(!onlyrange)
+		hass = tosingle(label, index);
+
+	printf(
+		"rune\n"
+		"utf8·to%s(rune c)\n"
+		"{\n"
+		" rune *p;\n"
+		"\n",
+		label);
+
+	/* every stanza removes the OFFSET bias that DELTA() added */
+	if(hasr)
+		printf(
+			" p = rangesearch(c, to%s_range, arrlen(to%s_range)/3, 3);\n"
+			" if(p && c >= p[0] && c <= p[1])\n"
+			" return c + p[2] - %d;\n",
+			label, label, OFFSET);
+
+	if(hasp)
+		printf(
+			" p = rangesearch(c, to%s_pair, arrlen(to%s_pair)/3, 3);\n"
+			" if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+			" return c + p[2] - %d;\n",
+			label, label, OFFSET);
+
+	if(hass)
+		printf(
+			" p = rangesearch(c, to%s_single, arrlen(to%s_single)/2, 2);\n"
+			" if(p && c == p[0])\n"
+			" return c + p[1] - %d;\n",
+			label, label, OFFSET);
+
+	printf(
+		" return c;\n"
+		"}\n"
+		"\n"
+	);
+}
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+/*
+ * usage: print the command synopsis on stderr and exit with status 1.
+ */
+static
+void
+usage(void)
+{
+	fputs("usage: mkrunetype <UnicodeData.txt>\n", stderr);
+	exit(1);
+}
+
+/*
+ * main: read the data file named by the single command-line argument,
+ * fill the global `table` from each record's category and case-mapping
+ * fields, then write the generated search routine, predicates and case
+ * conversion functions to stdout.
+ */
+int
+main(int argc, char *argv[])
+{
+	/* whitespace bytes that must pass isspace regardless of the data file */
+	static const int cspace[] = { '\t', '\n', '\r', '\f', '\v', 0x85 };
+	int i, lo, hi, cp;
+	io·Stream *data;
+	char *cat, *field[NumFields], line[1024];
+
+	ARGBEGIN{
+	}ARGEND;
+
+	if(argc != 1)
+		usage();
+
+	data = io·open(argv[0], "r");
+	if(!data)
+		panicf("can't open %s\n", argv[0]);
+
+	/* start from the identity: every rune maps to itself */
+	for(i = 0; i < NumRunes; i++)
+		table.toupper[i] = table.tolower[i] = table.totitle[i] = i;
+
+	for(i = 0; i < (int)arrlen(cspace); i++)
+		table.isspace[cspace[i]] = 1;
+
+	hi = -1;
+	/* NOTE: comment lines are not handled; UnicodeData.txt is assumed to have none */
+	while(parse(data, arrlen(field), field, arrlen(line), line)){
+		/* code point interval covered by this record */
+		codepointrange(data, field, &lo, &hi);
+		cat = field[Fcategory];
+
+		for(cp = lo; cp <= hi; cp++){
+			/* properties, keyed on the first category letter */
+			switch(cat[0]){
+			case 'Z': table.isspace[cp] = 1; break;
+			case 'M': table.ismark[cp]  = 1; break;
+			case 'N': table.isdigit[cp] = 1; break;
+			case 'P': table.ispunct[cp] = 1; break;
+			case 'S': table.issymbl[cp] = 1; break;
+			case 'C': table.iscntrl[cp] = 1; break;
+
+			case 'L':
+				table.isalpha[cp] = 1;
+				if(cat[1] == 'u')
+					table.isupper[cp] = 1;
+				else if(cat[1] == 'l')
+					table.islower[cp] = 1;
+				else if(cat[1] == 't')
+					table.istitle[cp] = 1;
+				else if(cat[1] != 'm' && cat[1] != 'o')
+					/* 'm' (modifier) and 'o' (ideograph) letters are alpha only */
+					goto badproperty;
+				break;
+
+			default:
+			badproperty:
+				panicf("unrecognized category '%s'", cat);
+			}
+
+			/* transformations: an empty field leaves the identity in place */
+			if(*field[Fupper])
+				table.toupper[cp] = codepoint(field[Fupper]);
+			if(*field[Flower])
+				table.tolower[cp] = codepoint(field[Flower]);
+			if(*field[Ftitle])
+				table.totitle[cp] = codepoint(field[Ftitle]);
+		}
+	}
+	io·close(data);
+
+	putsearch();
+
+	/* predicates: (label, flags, pairs, onlyranges) */
+	makeis("space", table.isspace, 0, 1);
+	makeis("digit", table.isdigit, 0, 1);
+	makeis("alpha", table.isalpha, 0, 0);
+	makeis("upper", table.isupper, 1, 0);
+	makeis("lower", table.islower, 1, 0);
+	makeis("title", table.istitle, 1, 0);
+	makeis("punct", table.ispunct, 1, 0);
+
+	/* case conversions: (label, map, pairs, onlyrange) */
+	mkto("upper", table.toupper, 1, 0);
+	mkto("lower", table.tolower, 1, 0);
+	mkto("title", table.totitle, 1, 0);
+
+	return 0;
+}