From 6db99f51209ebde97311df3aac081f82589319cb Mon Sep 17 00:00:00 2001
From: Nicholas Noll <nbnoll@eml.cc>
Date: Fri, 29 Oct 2021 11:10:29 -0700
Subject: implemented functionality to compute rune widths from unicode data

---
 sys/libunicode/vendor/mkrunewidth.c | 173 ++++++++++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 sys/libunicode/vendor/mkrunewidth.c

(limited to 'sys/libunicode/vendor/mkrunewidth.c')

diff --git a/sys/libunicode/vendor/mkrunewidth.c b/sys/libunicode/vendor/mkrunewidth.c
new file mode 100644
index 0000000..d821863
--- /dev/null
+++ b/sys/libunicode/vendor/mkrunewidth.c
@@ -0,0 +1,173 @@
+#include "common.h"
+
+/*
+ * inspired by design choices in utf8proc/charwidths.jl
+ * all widths default to 1 unless they fall within the categories:
+ *      1. Mn 2. Mc 3. Me 4. Zl
+ *      5. Zp 6. Cc 7. Cf 8. Cs
+ * these default to zero width
+ */
+enum
+{
+    /* width ? (parse_eawidths skips both, keeping the category default) */
+    WidthNeutral,   /* (N) practically treated like narrow but unclear ... */
+    WidthAmbiguous, /* (A) sometimes wide and sometimes not... */
+    /* width 1 */
+    WidthHalf,      /* (H) = to narrow (compatibility equivalent)  */
+    WidthNarrow,    /* (Na) ASCII width */
+    /* width 2 */
+    WidthWide,      /* (W) 2x width */
+    WidthFull,      /* (F) = to wide (compatibility equivalent) */
+};
+
+struct Table
+{
+    char width[3][NumRunes]; /* width[w][c] != 0 marks rune c as having width w (0, 1 or 2) */
+};
+
+static struct Table table; /* file scope, hence zero-initialized: no rune is marked yet */
+
+// -----------------------------------------------------------------------
+// internal functions
+
+/*
+ * Record the width implied by each rune's general category.
+ *
+ * path names the file to read (UnicodeData.txt per usage()). Runes whose
+ * category is one of Mn Mc Me Zl Zp Cc Cf Cs are marked in table.width[0],
+ * all other listed runes in table.width[1]; table.width[2] is untouched.
+ * Panics if the file can't be opened.
+ */
+static
+void
+parse_category(char *path)
+{
+    int sc, c, ec, w;
+    io·Stream *utf8;
+    char *prop, *field[NumFields], line[1024];
+
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+
+    // NOTE: we don't check for comments here
+    // ec is seeded because codepointrange gets it by pointer on every row;
+    // presumably it reads the previous end -- TODO confirm in common.h
+    ec = -1;
+    while(parse(utf8, arrlen(field), field, arrlen(line), line)){
+        /* presumably yields the inclusive rune range [sc, ec] of this row */
+        codepointrange(utf8, field, &sc, &ec);
+
+        prop = field[Fcategory];
+
+        /* single width unless the category is one of the zero width ones */
+        w = 1;
+        switch(prop[0]){
+        case 'M':
+            /* marks: Mn, Mc, Me */
+            if(prop[1] == 'n' || prop[1] == 'c' || prop[1] == 'e')
+                w = 0;
+            break;
+        case 'Z':
+            /* separators: Zl, Zp */
+            if(prop[1] == 'l' || prop[1] == 'p')
+                w = 0;
+            break;
+        case 'C':
+            /* Cc, Cf, Cs; must break here: falling into default forces width 1 */
+            if(prop[1] == 'c' || prop[1] == 'f' || prop[1] == 's')
+                w = 0;
+            break;
+        default:
+            break;
+        }
+
+        /* flag every rune of the row in the table for its width */
+        for(c = sc; c <= ec; c++)
+            table.width[w][c] = 1;
+    }
+
+    io·close(utf8);
+}
+
+/*
+ * Override widths with the East Asian Width classes read from path:
+ * Na and H rows become width 1, W and F width 2; A and N rows are skipped.
+ * Panics if the file can't be opened or a row has an unrecognized class.
+ */
+static
+void
+parse_eawidths(char *path)
+{
+    int at, w, l, c, r;
+    io·Stream *utf8;
+    char *s, *field[2], line[1024];
+
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+
+    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+        if(at == ParseSkip)
+            continue;
+
+        switch(field[1][0]){
+        case 'A': continue;
+        case 'N':
+            if(field[1][1] != 'a')
+                continue;
+        /* fallthrough */
+        case 'H': w = 1; break;
+        case 'W': /* fallthrough */
+        case 'F': w = 2; break;
+        default:
+            panicf("malformed east asian width class: %s\n", field[1]);
+        }
+
+        if((s = strstr(field[0], "..")))   /* "first..last": cut at the dots */
+            *s++ = 0, *s++ = 0;
+        l = codepoint(field[0]);
+        r = s ? codepoint(s) : l;          /* a lone code point is its own range */
+
+        for(c=l; c <= r; c++){
+            /* ensure it only exists in one table */
+            table.width[w][c] = 1;
+            table.width[(w+1)%3][c] = table.width[(w+2)%3][c] = 0;
+        }
+    }
+    io·close(utf8);
+}
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+/* write the expected command line to stderr, then exit with status 1 */
+static void
+usage(void)
+{
+    fputs("usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt>\n", stderr);
+    exit(1);
+}
+
+/* categories first, then East Asian widths override them; emit via putrange */
+int
+main(int argc, char *argv[])
+{
+    int c;
+
+    ARGBEGIN{
+    }ARGEND;
+    if(argc != 2)
+        usage();
+
+    parse_category(*argv++);
+    parse_eawidths(*argv);
+
+    /* simple checking: no rune may sit in more than one width table */
+    for(c=0; c<NumRunes; c++)
+        if(table.width[0][c] + table.width[1][c] + table.width[2][c] > 1)
+            panicf("improper table state\n");
+
+    putrange("zero",   table.width[0], 1);
+    putrange("single", table.width[1], 1);
+    putrange("double", table.width[2], 1);
+    return 0;
+}
-- 
cgit v1.2.1