implemented functionality to compute rune widths from unicode data

author: Nicholas Noll <nbnoll@eml.cc> 2021-10-29 11:10:29 -0700
committer: Nicholas Noll <nbnoll@eml.cc> 2021-10-29 11:10:29 -0700
commit: 6db99f51209ebde97311df3aac081f82589319cb (patch)
tree: 8a3e67a2c3ff4087379fb584a88073f0e43696e1
parent: b42145545b434a40d6ad50cf3ac6cb6e04ae0500 (diff)
6 files changed, 512 insertions, 210 deletions
diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk
index aca7348..5ba9b89 100644
--- a/sys/libunicode/rules.mk
+++ b/sys/libunicode/rules.mk
@@ -11,18 +11,25 @@ SRCS_$(d) := \
 	$(d)/canfit.c \
 	$(d)/runelen.c \
 	$(d)/len.c \
-	$(d)/runetype-$(UNICODE).c
+	$(d)/runetype-$(UNICODE).c \
+	$(d)/runewidth-$(UNICODE).c
 
 LIBS_$(d) := $(d)/libunicode.a
 
 include share/paths.mk
 
+# ========================================================================
+# table generation
+
+$(d)/vendor/common.o: $(d)/vendor/common.c
+	$(COMPILE)
+
 # rune categories
 $(d)/vendor/UnicodeData-$(UNICODE).txt:
 	@echo "GET	UnicodeData.txt";\
 	curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@
 
-$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(OBJ_DIR)/sys/base/base.a
+$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
 	$(COMPLINK)
 
 GENS += $(d)/vendor/mkrunetype
@@ -30,12 +37,25 @@ GENS += $(d)/vendor/mkrunetype
 $(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
 	@$(dir $@)vendor/mkrunetype $< > $@
 
+# rune widths
+$(d)/vendor/EastAsianWidth-$(UNICODE).txt:
+	@echo "GET	EastAsianWidth.txt";\
+	curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@
+
+$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+	$(COMPLINK)
+
+GENS += $(d)/vendor/mkrunewidth
+
+$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt
+	@$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) #> $@
+
 # grapheme boundaries
 $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt:
 	@echo "GET	GraphemeBreakProperty.txt";\
 	curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@
 
-$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(OBJ_DIR)/sys/base/base.a
+$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
 	$(COMPLINK)
 
 $(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
@@ -43,6 +63,9 @@ $(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeB
 
 GENS += $(d)/vendor/mkgraphemedata
 
+# ========================================================================
+# normal operations
+
 $(LIBS_$(d)): $(OBJS_$(d))
 	$(ARCHIVE)
 
diff --git a/sys/libunicode/vendor/common.c b/sys/libunicode/vendor/common.c
new file mode 100644
index 0000000..6b5d1b3
--- /dev/null
+++ b/sys/libunicode/vendor/common.c
@@ -0,0 +1,220 @@
+#include "common.h"
+
+// -----------------------------------------------------------------------
+// input functions
+
+/* parse: read one line from io into line (capacity len) and split it in place on ';' into field[].
+ * returns ParseEOF when io·readln yields <= 0, ParseSkip for empty or '#' lines, else ParseOK */
+int
+parse(io·Stream *io, int nfield, char **field, int len, char *line)
+{
+    int n;
+    if((n=io·readln(io, len, line)) <= 0)
+        return ParseEOF;
+    if(n == len)
+        panicf("line too long");
+    /* the terminator is checked at n-1, so report that same byte on mismatch */
+    if(line[n-1] != '\n')
+        panicf("invalid line: expected '\\n', found '%c'", line[n-1]);
+    line[n-1] = 0;
+    if(line[0] == '#' || line[0] == 0)
+        return ParseSkip;
+
+    /* tokenize line into fields */
+    n = 0;
+    field[n] = line;
+    while(*line){
+        if(*line == ';'){
+            if(n+1 >= nfield) /* never store past field[nfield-1] */
+                panicf("expected %d number of fields, got more: %s", nfield, field[0]);
+            *line = 0;
+            field[++n] = line+1;
+        }
+        line++;
+    }
+    /* n counts separators and line now points at the end, so report n+1 and field[0] */
+    if(n != nfield-1)
+        panicf("expected %d number of fields, got %d: %s", nfield, n+1, field[0]);
+    return ParseOK;
+}
+
+/* codepoint: convert a string of uppercase hex digits to its value; calls panicf on any other byte */
+int
+codepoint(char *s)
+{
+    int value = 0, digit, ch;
+
+    for(ch = *s++; ch; ch = *s++){
+        digit = 0;
+        if('0' <= ch && ch <= '9')
+            digit = ch - '0';
+        else if('A' <= ch && ch <= 'F')
+            digit = ch - 'A' + 10;
+        else
+            panicf("bad codepoint char '%c'", ch);
+        value = (value << 4) + digit;
+    }
+    return value;
+}
+
+/* codepointrange: compute the inclusive rune range [*start, *stop] described by a UnicodeData.txt
+ * record; a ", First>" record consumes the following ", Last>" record from utf8 as the range end.
+ * on entry *stop must hold the end of the previous range (callers start at -1) for the order check */
+void
+codepointrange(io·Stream *utf8, char *field[NumFields], int *start,  int *stop)
+{
+    int e, c;
+    char *other[NumFields], line[1024];
+
+    e = *stop;
+    c = codepoint(field[Fcode]);
+    if(c >= NumRunes)
+        panicf("unexpected large codepoint %x", c);
+    if(c <= e)
+        panicf("bad code sequence: %x then %x", e, c);
+    e = c;
+    if(strstr(field[Fname], ", First>") != nil){
+        /* anything but ParseOK (EOF or a skipped line) leaves other[] unusable */
+        if(parse(utf8, arrlen(other), other, arrlen(line), line) != ParseOK)
+            panicf("range start at end of file");
+        if(strstr(other[Fname], ", Last>") == nil)
+            panicf("range start not followed by range end");
+        e = codepoint(other[Fcode]);
+        if(e <= c)
+            panicf("bad code sequence: %x then %x", c, e);
+        if(e >= NumRunes) /* callers index their tables up to and including e */
+            panicf("unexpected large codepoint %x", e);
+        if(strcmp(field[Fcategory], other[Fcategory]) != 0)
+            panicf("range with mismatched category");
+    }
+    *start = c;
+    *stop  = e;
+}
+
+// -----------------------------------------------------------------------
+// output functions
+
+void
+putsearch(void) /* print the generated file's prelude: two includes and a static rangesearch() */
+{
+    puts( /* the text below is emitted verbatim; its indentation is part of the output */
+        "#include <u.h>\n"
+        "#include <libunicode.h>\n"
+        "\n"
+        "static\n"
+        "rune*\n"
+        "rangesearch(rune c, rune *t, int n, int ne)\n"
+        "{\n"
+        "   rune *p;\n"
+        "   int m;\n"
+        "   while(n > 1) {\n"
+        "   m = n >> 1;\n"
+        "   p = t + m*ne;\n"
+        "   if(c >= p[0]){\n"
+        "       t = p;\n"
+        "       n = n-m;\n"
+        "   }else\n"
+        "       n = m;\n"
+        "   }\n"
+        "   if(n && c >= t[0])\n"
+        "       return t;\n"
+        "   return 0;\n"
+        "}\n"
+    );
+
+}
+
+/* putrange: print maximal runs of set entries in prop as "first, last" rows of a static rune
+ * array named ident; a run of one entry is only printed (and cleared) when force is set.
+ * every entry of a run after its first is always cleared. returns 1 if anything was printed */
+int
+putrange(char *ident, char *prop, int force)
+{
+    int lo, hi, opened = 0;
+
+    lo = 0;
+    while(lo < NumRunes){
+        if(!prop[lo]){
+            lo++;
+            continue;
+        }
+        /* extend the run to the right, consuming its tail as we go */
+        hi = lo + 1;
+        while(hi < NumRunes && prop[hi])
+            prop[hi++] = 0;
+
+        if(force || hi - lo > 1){
+            if(!opened){
+                opened = 1;
+                printf("static rune %s[] = {\n", ident);
+            }
+            prop[lo] = 0;
+            printf("\t0x%.4x, 0x%.4x,\n", lo, hi-1);
+        }
+        lo = hi;
+    }
+
+    if(opened)
+        printf("};\n\n");
+
+    return opened;
+}
+
+/* putpair: find runs of set entries spaced two apart (lo, lo+2, ...) in prop; runs with at least
+ * two members are printed as "first, last" rows of static rune array ident and cleared.
+ * returns 1 if anything was printed */
+int
+putpair(char *ident, char *prop)
+{
+    int lo, hi, opened = 0;
+
+    lo = 0;
+    while(lo+2 < NumRunes){
+        if(!prop[lo]){
+            lo++;
+            continue;
+        }
+        /* walk every second entry, clearing the members after the first */
+        hi = lo + 2;
+        while(hi < NumRunes && prop[hi]){
+            prop[hi] = 0;
+            hi += 2;
+        }
+        if(hi != lo + 2){
+            if(!opened){
+                opened = 1;
+                printf("static rune %s[] = {\n", ident);
+            }
+            prop[lo] = 0;
+            printf("\t0x%.4x, 0x%.4x,\n", lo, hi - 2);
+        }
+        lo = hi;
+    }
+    if(opened)
+        printf("};\n\n");
+    return opened;
+}
+
+/* putsingle: print every index still set in prop as an element of static rune array ident,
+ * clearing each entry as it is written. returns 1 if anything was printed */
+int
+putsingle(char *ident, char *prop)
+{
+    int r, opened = 0;
+
+    for(r = 0; r < NumRunes; r++){
+        if(prop[r]){
+            if(!opened){
+                opened = 1;
+                printf("static rune %s[] = {\n", ident);
+            }
+            prop[r] = 0;
+            printf("\t0x%.4x,\n", r);
+        }
+    }
+
+    if(opened)
+        printf("};\n\n");
+
+    return opened;
+}
diff --git a/sys/libunicode/vendor/common.h b/sys/libunicode/vendor/common.h
new file mode 100644
index 0000000..17cb113
--- /dev/null
+++ b/sys/libunicode/vendor/common.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <u.h>
+#include <base.h>
+#include <libunicode.h>
+
+enum
+{
+    // Indices of the ';'-separated fields inside a UnicodeData.txt record
+    Fcode,      // hex code point, parsed with codepoint()
+    Fname,      // name; codepointrange() looks for ", First>" / ", Last>" here
+    Fcategory,  // general category; must match across both ends of a range
+    Fcombine,
+    Fbidir,
+    Fdecomp,
+    Fdecimal,
+    Fdigit,
+    Fnumeric,
+    Fmirror,
+    Foldname,
+    Fcomment,
+    Fupper,
+    Flower,
+    Ftitle,
+
+    NumFields,          // number of fields above; used to size field[] arrays
+    NumRunes = 1 << 21, // exclusive upper bound on code points and length of per-rune tables
+};
+
+/* input functions */
+enum /* return values of parse() */
+{
+    ParseEOF,  /* nothing read; it is 0, so callers may test !parse(...) */
+    ParseOK,   /* field[] was filled in */
+    ParseSkip, /* empty line or line starting with '#'; field[] untouched */
+};
+
+int  parse(io·Stream *io, int nfield, char **field, int len, char *line); /* read and split one line on ';' */
+int  codepoint(char *s); /* uppercase hex string to integer */
+void codepointrange(io·Stream *utf8, char *field[NumFields], int *start,  int *stop); /* *stop is in/out */
+
+/* output functions: each put* prints a static rune array named ident and clears what it printed */
+void putsearch(void);
+int  putrange(char *ident, char *prop, int force);
+int  putpair(char *ident, char *prop);
+int  putsingle(char *ident, char *prop);
diff --git a/sys/libunicode/vendor/mkgraphemedata.c b/sys/libunicode/vendor/mkgraphemedata.c
new file mode 100644
index 0000000..f79d851
--- /dev/null
+++ b/sys/libunicode/vendor/mkgraphemedata.c
@@ -0,0 +1,24 @@
+#include <u.h>
+#include <base.h>
+#include <libunicode.h>
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+static
+void
+usage(void) /* print the expected command line to stderr and exit with status 1 */
+{
+    fprintf(stderr, "usage: mkgraphemedata <GraphemeBreakProperty.txt>\n");
+    exit(1);
+}
+
+int
+main(int argc, char *argv[]) /* stub: no input is opened and nothing is generated yet */
+{
+    io·Stream *utf8; /* declared but not used yet */
+    char line[1024]; /* declared but not used yet */
+
+    ARGBEGIN{ /* no flags are defined */
+    }ARGEND;
+}
diff --git a/sys/libunicode/vendor/mkrunetype.c b/sys/libunicode/vendor/mkrunetype.c
index fc4301c..9f939f4 100644
--- a/sys/libunicode/vendor/mkrunetype.c
+++ b/sys/libunicode/vendor/mkrunetype.c
@@ -1,6 +1,4 @@
-#include <u.h>
-#include <base.h>
-#include <libunicode.h>
+#include "common.h"
 
 // -----------------------------------------------------------------------
 // globals
@@ -8,29 +6,6 @@
 #define OFFSET (1 << 20)
 #define DELTA(mapx, x)  ((1 << 20) + (mapx) - (x))
 
-enum
-{
-    // Fields inside UnicodeData.txt
-    Fcode,
-    Fname,
-    Fcategory,
-    Fcombine,
-    Fbidir,
-    Fdecomp,
-    Fdecimal,
-    Fdigit,
-    Fnumeric,
-    Fmirror,
-    Foldname,
-    Fcomment,
-    Fupper,
-    Flower,
-    Ftitle,
-
-    NumFields,
-    NumRunes = 1 << 21,
-};
-
 // TODO: use bitarrays. will reduce executable size 8x
 struct Table
 {
@@ -59,154 +34,37 @@ static struct Table table;
 // -----------------------------------------------------------------------
 // internal functions
 
-int
-parse(io·Stream *io, char **field, int len, char *line)
-{
-    int n;
-    if((n=io·readln(io, len, line)) <= 0)
-        return 0;
-
-    if(n == len)
-        panicf("line too long");
-
-    if(line[n-1] != '\n')
-        panicf("invalid line: expected '\n', found '%c'", line[n]);
-
-    line[n-1] = 0;
-
-    /* tokenize line into fields */
-    n = 0;
-    field[n] = line;
-    while(*line){
-        if(*line == ';'){
-            *line = 0;
-            field[++n] = line+1;
-        }
-        line++;
-    }
-
-    if(n != NumFields-1)
-        panicf("expected %d number of fields, got %d", NumFields, n);
-
-    return 1;
-}
-
-static
-int
-codepoint(char *s)
-{
-    int c, b;
-
-    c = 0;
-    while((b=*s++)){
-        c <<= 4;
-        if(b >= '0' && b <= '9')
-            c += b - '0';
-        else if(b >= 'A' && b <= 'F')
-            c += b - 'A' + 10;
-        else
-            panicf("bad codepoint char '%c'", b);
-    }
-
-    return c;
-}
-
 static
 int
 isrange(char *label, char *prop, int force)
 {
-    int l, r, start;
-
-    start = 0;
-    for(l = 0; l < NumRunes;) {
-        if(!prop[l]){
-            l++;
-            continue;
-        }
-
-        for(r = l+1; r < NumRunes; r++){
-            if(!prop[r])
-                break;
-            prop[r] = 0;
-        }
-
-        if(force || r > l + 1){
-            if(!start){
-                printf("static rune is%s_range[] = {\n", label);
-                start = 1;
-            }
-            prop[l] = 0;
-            printf("\t0x%.4x, 0x%.4x,\n", l, r-1);
-        }
-
-        l = r;
-    }
-
-    if(start)
-        printf("};\n\n");
+    char ident[128];
+    if(snprintf(ident, arrlen(ident), "is%s_range", label) >= arrlen(ident)) /* result >= size means truncated */
+        panicf("out of identifier space\n");
 
-    return start;
+    return putrange(ident, prop, force);
 }
 
 static
 int
 ispair(char *label, char *prop)
 {
-    int l, r, start;
-
-    start = 0;
-    for(l=0; l+2 < NumRunes; ){
-        if(!prop[l]){
-            l++;
-            continue;
-        }
-
-        for(r = l + 2; r < NumRunes; r += 2){
-            if(!prop[r])
-                break;
-            prop[r] = 0;
-        }
-
-        if(r != l + 2){
-            if(!start){
-                printf("static rune is%s_pair[] = {\n", label);
-                start = 1;
-            }
-            prop[l] = 0;
-            printf("\t0x%.4x, 0x%.4x,\n", l, r - 2);
-        }
-
-        l = r;
-    }
+    char ident[128];
+    if(snprintf(ident, arrlen(ident), "is%s_pair", label) >= arrlen(ident)) /* result >= size means truncated */
+        panicf("out of identifier space\n");
 
-    if(start)
-        printf("};\n\n");
-    return start;
+    return putpair(ident, prop);
 }
 
 static
 int
 issingle(char *label, char *prop)
 {
-    int i, start;
+    char ident[128];
+    if(snprintf(ident, arrlen(ident), "is%s_single", label) >= arrlen(ident)) /* result >= size means truncated */
+        panicf("out of identifier space\n");
 
-    start = 0;
-    for(i = 0; i < NumRunes; i++) {
-        if(!prop[i])
-            continue;
-
-        if(!start){
-            printf("static rune is%s_single[] = {\n", label);
-            start = 1;
-        }
-        prop[i] = 0;
-        printf("\t0x%.4x,\n", i);
-    }
-
-    if(start)
-        printf("};\n\n");
-
-    return start;
+    return putsingle(ident, prop);
 }
 
 static
@@ -425,9 +283,9 @@ usage(void)
 int
 main(int argc, char *argv[])
 {
-    int i, c, pc;
+    int i, sc, c, ec;
     io·Stream *utf8;
-    char *prop, *field[NumFields], *field2[NumFields], line[1024], line2[1024];
+    char *prop, *field[NumFields], line[1024];
 
     ARGBEGIN{
     }ARGEND;
@@ -453,33 +311,14 @@ main(int argc, char *argv[])
     table.isspace['\v'] = 1;
     table.isspace[0x85] = 1;
 
-    pc = -1;
-    while(parse(utf8, field, arrlen(line), line)){
-        /* parse unicode value */
-        c = codepoint(field[Fcode]);
-        if(c >= NumRunes)
-            panicf("unexpected large codepoint %x", c);
-        if(c <= pc)
-            panicf("bad code sequence: %x then %x", pc, c);
-        pc = c;
-
-        /* see if this is a range */
-        if(strstr(field[Fname], ", First>") != nil){
-            if(!parse(utf8, field2, arrlen(line2), line2))
-                panicf("range start at end of file");
-            if(strstr(field2[Fname], ", Last>") == nil)
-                panicf("range start not followed by range end");
-
-            pc = codepoint(field2[Fcode]);
-
-            if(pc <= c)
-                panicf("bad code sequence: %x then %x", c, pc);
-            if(strcmp(field[Fcategory], field2[Fcategory]) != 0)
-                panicf("range with mismatched category");
-        }
-
+    ec = -1;
+    // NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any
+    while(parse(utf8, arrlen(field), field, arrlen(line), line)){
+        /* parse unicode range */
+        codepointrange(utf8, field, &sc, &ec);
         prop = field[Fcategory];
-        for(; c <= pc; c++){
+
+        for(c = sc; c <= ec; c++){
             /* grab properties */
             switch(prop[0]){
             case 'L':
@@ -533,30 +372,7 @@ main(int argc, char *argv[])
     }
     io·close(utf8);
 
-    puts(
-        "#include <u.h>\n"
-        "#include <libunicode.h>\n"
-        "\n"
-        "static\n"
-        "rune*\n"
-        "rangesearch(rune c, rune *t, int n, int ne)\n"
-        "{\n"
-        "   rune *p;\n"
-        "   int m;\n"
-        "   while(n > 1) {\n"
-        "   m = n >> 1;\n"
-        "   p = t + m*ne;\n"
-        "   if(c >= p[0]){\n"
-        "       t = p;\n"
-        "       n = n-m;\n"
-        "   }else\n"
-        "       n = m;\n"
-        "   }\n"
-        "   if(n && c >= t[0])\n"
-        "       return t;\n"
-        "   return 0;\n"
-        "}\n"
-    );
+    putsearch();
 
     makeis("space", table.isspace, 0, 1);
     makeis("digit", table.isdigit, 0, 1);
diff --git a/sys/libunicode/vendor/mkrunewidth.c b/sys/libunicode/vendor/mkrunewidth.c
new file mode 100644
index 0000000..d821863
--- /dev/null
+++ b/sys/libunicode/vendor/mkrunewidth.c
@@ -0,0 +1,173 @@
+#include "common.h"
+
+/*
+ * inspired by design choices in utf8proc/charwidths.jl
+ * all widths default to 1 unless they fall within the categories:
+ *      1. Mn 2. Mc 3. Me 4. Zl
+ *      5. Zp 6. Cc 7. Cf 8. Cs
+ * these default to zero width
+ */
+enum /* East Asian Width classes, grouped by the display width they imply */
+{
+    /* width ? */
+    WidthNeutral,   /* (N) practically treated like narrow but unclear ... */
+    WidthAmbiguous, /* (A) sometimes wide and sometimes not... */
+    /* width 1 */
+    WidthHalf,      /* (H) = to narrow (compatibility equivalent)  */
+    WidthNarrow,    /* (Na) ASCII width */
+    /* width 2 */
+    WidthWide,      /* (W) 2x width */
+    WidthFull,      /* (F) = to wide (compatibility equivalent) */
+};
+
+struct Table
+{
+    char width[3][NumRunes]; /* width[w][c] is 1 when rune c is assigned display width w (0, 1 or 2) */
+};
+
+static struct Table table; /* filled by parse_category and parse_eawidths, printed by main */
+
+// -----------------------------------------------------------------------
+// internal functions
+
+/*
+ * parse_category: read UnicodeData.txt at path and seed the width tables from each rune's
+ * general category: Mn/Mc/Me, Zl/Zp and Cc/Cf/Cs go to table.width[0], all others to width[1]
+ */
+static
+void
+parse_category(char *path)
+{
+    int at, sc, c, ec, w;
+    io·Stream *utf8;
+    char *prop, *field[NumFields], line[1024];
+
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+
+    /* ec carries the previous range end into codepointrange for its ordering check */
+    ec = -1;
+    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+        if(at == ParseSkip) /* blank/comment line: field[] was not filled in */
+            continue;
+
+        codepointrange(utf8, field, &sc, &ec);
+        prop = field[Fcategory];
+
+        /* width 1 unless the category is one of the zero-width ones listed above */
+        w = 1;
+        switch(prop[0]){
+        case 'M':
+            switch(prop[1]){
+            case 'n': case 'c': case 'e':
+                w = 0;
+                break;
+            }
+            break;
+        case 'Z':
+            switch(prop[1]){
+            case 'l': case 'p':
+                w = 0;
+                break;
+            }
+            break;
+        case 'C':
+            switch(prop[1]){
+            case 'c': case 'f': case 's':
+                w = 0;
+                break;
+            }
+            break;
+        default:
+            break;
+        }
+
+        for(c = sc; c <= ec; c++)
+            table.width[w][c] = 1;
+    }
+
+    io·close(utf8);
+}
+
+/* parse_eawidths: overlay EastAsianWidth.txt at path onto the width tables: Na/H runes become
+ * width 1 and W/F runes width 2; A and plain N entries are skipped and keep their prior table */
+static
+void
+parse_eawidths(char *path)
+{
+    int at, w, l, c, r;
+    io·Stream *utf8;
+    char *s, *field[2], line[1024];
+
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+        if(at == ParseSkip)
+            continue;
+
+        switch(field[1][0]){
+        case 'A': continue;
+        case 'N':
+            if(field[1][1] != 'a')
+                continue;
+        /* fallthrough */
+        case 'H': w = 1; break;
+        case 'W': /* fallthrough */
+        case 'F': w = 2; break;
+        default:
+            panicf("malformed east asian width class: %s\n", field[1]);
+        }
+        if(!(s = strstr(field[0], "..")))
+            l=codepoint(field[0]), r=l;
+        else{
+            *s++ = 0, *s++ = 0;
+            l=codepoint(field[0]);
+            r=codepoint(s);
+        }
+        if(l > r || r >= NumRunes) /* keep the table writes below in bounds */
+            panicf("bad codepoint range: %x to %x", l, r);
+        for(c=l; c <= r; c++){
+            /* ensure it only exists in one table */
+            table.width[w][c]       = 1;
+            table.width[(w+1)%3][c] = 0;
+            table.width[(w+2)%3][c] = 0;
+        }
+    }
+    io·close(utf8);
+}
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+static
+void
+usage(void) /* print the expected command line to stderr and exit with status 1 */
+{
+    fprintf(stderr, "usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt>\n");
+    exit(1);
+}
+
+int
+main(int argc, char *argv[]) /* build the three width tables from both inputs and print them */
+{
+    int c;
+
+    ARGBEGIN{ /* no flags; NOTE(review): presumably leaves only positional args in argc/argv - confirm */
+    }ARGEND;
+
+    if(argc != 2) /* exactly <UnicodeData.txt> <EastAsianWidth.txt>, see usage() */
+        usage();
+
+    parse_category(*argv++); /* first pass: widths from the general category */
+    parse_eawidths(*argv);   /* second pass: East Asian Width entries replace the first pass */
+
+    /* simple checking: no rune may be flagged in more than one width table */
+    for(c=0; c<NumRunes; c++){
+        if(table.width[0][c] + table.width[1][c] + table.width[2][c] > 1)
+            panicf("improper table state");
+    }
+
+    putrange("zero",   table.width[0], 1); /* force=1: runs of a single rune are printed as ranges too */
+    putrange("single", table.width[1], 1);
+    putrange("double", table.width[2], 1);
+}
author	Nicholas Noll <nbnoll@eml.cc>	2021-10-29 11:10:29 -0700
committer	Nicholas Noll <nbnoll@eml.cc>	2021-10-29 11:10:29 -0700
commit	6db99f51209ebde97311df3aac081f82589319cb (patch)
tree	8a3e67a2c3ff4087379fb584a88073f0e43696e1
parent	b42145545b434a40d6ad50cf3ac6cb6e04ae0500 (diff)