1 files changed, 23 insertions, 207 deletions
diff --git a/sys/libunicode/vendor/mkrunetype.c b/sys/libunicode/vendor/mkrunetype.c
index fc4301c..9f939f4 100644
--- a/sys/libunicode/vendor/mkrunetype.c
+++ b/sys/libunicode/vendor/mkrunetype.c
@@ -1,6 +1,4 @@
-#include <u.h>
-#include <base.h>
-#include <libunicode.h>
+#include "common.h"
 
 // -----------------------------------------------------------------------
 // globals
@@ -8,29 +6,6 @@
 #define OFFSET (1 << 20)
 #define DELTA(mapx, x)  ((1 << 20) + (mapx) - (x))
 
-enum
-{
-    // Fields inside UnicodeData.txt
-    Fcode,
-    Fname,
-    Fcategory,
-    Fcombine,
-    Fbidir,
-    Fdecomp,
-    Fdecimal,
-    Fdigit,
-    Fnumeric,
-    Fmirror,
-    Foldname,
-    Fcomment,
-    Fupper,
-    Flower,
-    Ftitle,
-
-    NumFields,
-    NumRunes = 1 << 21,
-};
-
 // TODO: use bitarrays. will reduce executable size 8x
 struct Table
 {
@@ -59,154 +34,37 @@ static struct Table table;
 // -----------------------------------------------------------------------
 // internal functions
 
-int
-parse(io·Stream *io, char **field, int len, char *line)
-{
-    int n;
-    if((n=io·readln(io, len, line)) <= 0)
-        return 0;
-
-    if(n == len)
-        panicf("line too long");
-
-    if(line[n-1] != '\n')
-        panicf("invalid line: expected '\n', found '%c'", line[n]);
-
-    line[n-1] = 0;
-
-    /* tokenize line into fields */
-    n = 0;
-    field[n] = line;
-    while(*line){
-        if(*line == ';'){
-            *line = 0;
-            field[++n] = line+1;
-        }
-        line++;
-    }
-
-    if(n != NumFields-1)
-        panicf("expected %d number of fields, got %d", NumFields, n);
-
-    return 1;
-}
-
-static
-int
-codepoint(char *s)
-{
-    int c, b;
-
-    c = 0;
-    while((b=*s++)){
-        c <<= 4;
-        if(b >= '0' && b <= '9')
-            c += b - '0';
-        else if(b >= 'A' && b <= 'F')
-            c += b - 'A' + 10;
-        else
-            panicf("bad codepoint char '%c'", b);
-    }
-
-    return c;
-}
-
 static
 int
 isrange(char *label, char *prop, int force)
 {
-    int l, r, start;
-
-    start = 0;
-    for(l = 0; l < NumRunes;) {
-        if(!prop[l]){
-            l++;
-            continue;
-        }
-
-        for(r = l+1; r < NumRunes; r++){
-            if(!prop[r])
-                break;
-            prop[r] = 0;
-        }
-
-        if(force || r > l + 1){
-            if(!start){
-                printf("static rune is%s_range[] = {\n", label);
-                start = 1;
-            }
-            prop[l] = 0;
-            printf("\t0x%.4x, 0x%.4x,\n", l, r-1);
-        }
-
-        l = r;
-    }
-
-    if(start)
-        printf("};\n\n");
+    char ident[128];
+    if(snprintf(ident, arrlen(ident), "is%s_range", label) >= arrlen(ident)) // return >= size means truncated
+        panicf("out of identifier space\n");
 
-    return start;
+    return putrange(ident, prop, force);
 }
 
 static
 int
 ispair(char *label, char *prop)
 {
-    int l, r, start;
-
-    start = 0;
-    for(l=0; l+2 < NumRunes; ){
-        if(!prop[l]){
-            l++;
-            continue;
-        }
-
-        for(r = l + 2; r < NumRunes; r += 2){
-            if(!prop[r])
-                break;
-            prop[r] = 0;
-        }
-
-        if(r != l + 2){
-            if(!start){
-                printf("static rune is%s_pair[] = {\n", label);
-                start = 1;
-            }
-            prop[l] = 0;
-            printf("\t0x%.4x, 0x%.4x,\n", l, r - 2);
-        }
-
-        l = r;
-    }
+    char ident[128];
+    if(snprintf(ident, arrlen(ident), "is%s_pair", label) >= arrlen(ident)) // return >= size means truncated
+        panicf("out of identifier space\n");
 
-    if(start)
-        printf("};\n\n");
-    return start;
+    return putpair(ident, prop);
 }
 
 static
 int
 issingle(char *label, char *prop)
 {
-    int i, start;
+    char ident[128];
+    if(snprintf(ident, arrlen(ident), "is%s_single", label) >= arrlen(ident)) // return >= size means truncated
+        panicf("out of identifier space\n");
 
-    start = 0;
-    for(i = 0; i < NumRunes; i++) {
-        if(!prop[i])
-            continue;
-
-        if(!start){
-            printf("static rune is%s_single[] = {\n", label);
-            start = 1;
-        }
-        prop[i] = 0;
-        printf("\t0x%.4x,\n", i);
-    }
-
-    if(start)
-        printf("};\n\n");
-
-    return start;
+    return putsingle(ident, prop);
 }
 
 static
@@ -425,9 +283,9 @@ usage(void)
 int
 main(int argc, char *argv[])
 {
-    int i, c, pc;
+    int i, sc, c, ec;
     io·Stream *utf8;
-    char *prop, *field[NumFields], *field2[NumFields], line[1024], line2[1024];
+    char *prop, *field[NumFields], line[1024];
 
     ARGBEGIN{
     }ARGEND;
@@ -453,33 +311,14 @@ main(int argc, char *argv[])
     table.isspace['\v'] = 1;
     table.isspace[0x85] = 1;
 
-    pc = -1;
-    while(parse(utf8, field, arrlen(line), line)){
-        /* parse unicode value */
-        c = codepoint(field[Fcode]);
-        if(c >= NumRunes)
-            panicf("unexpected large codepoint %x", c);
-        if(c <= pc)
-            panicf("bad code sequence: %x then %x", pc, c);
-        pc = c;
-
-        /* see if this is a range */
-        if(strstr(field[Fname], ", First>") != nil){
-            if(!parse(utf8, field2, arrlen(line2), line2))
-                panicf("range start at end of file");
-            if(strstr(field2[Fname], ", Last>") == nil)
-                panicf("range start not followed by range end");
-
-            pc = codepoint(field2[Fcode]);
-
-            if(pc <= c)
-                panicf("bad code sequence: %x then %x", c, pc);
-            if(strcmp(field[Fcategory], field2[Fcategory]) != 0)
-                panicf("range with mismatched category");
-        }
-
+    ec = -1;
+    // NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any
+    while(parse(utf8, arrlen(field), field, arrlen(line), line)){
+        /* parse unicode range */
+        codepointrange(utf8, field, &sc, &ec);
         prop = field[Fcategory];
-        for(; c <= pc; c++){
+
+        for(c = sc; c <= ec; c++){
             /* grab properties */
             switch(prop[0]){
             case 'L':
@@ -533,30 +372,7 @@ main(int argc, char *argv[])
     }
     io·close(utf8);
 
-    puts(
-        "#include <u.h>\n"
-        "#include <libunicode.h>\n"
-        "\n"
-        "static\n"
-        "rune*\n"
-        "rangesearch(rune c, rune *t, int n, int ne)\n"
-        "{\n"
-        "   rune *p;\n"
-        "   int m;\n"
-        "   while(n > 1) {\n"
-        "   m = n >> 1;\n"
-        "   p = t + m*ne;\n"
-        "   if(c >= p[0]){\n"
-        "       t = p;\n"
-        "       n = n-m;\n"
-        "   }else\n"
-        "       n = m;\n"
-        "   }\n"
-        "   if(n && c >= t[0])\n"
-        "       return t;\n"
-        "   return 0;\n"
-        "}\n"
-    );
+    putsearch();
 
     makeis("space", table.isspace, 0, 1);
     makeis("digit", table.isdigit, 0, 1);