aboutsummaryrefslogtreecommitdiff
path: root/src/base/utf/vendor/mkrunewidth.c
diff options
context:
space:
mode:
author Nicholas <nbnoll@eml.cc> 2021-11-20 10:53:19 -0800
committer Nicholas <nbnoll@eml.cc> 2021-11-20 10:53:19 -0800
commit a9bfe650038afea8b751175cac16f6027345e45f (patch)
tree 9a7f9feb76a64bb3efe573036d80b7bdbf8a59a5 /src/base/utf/vendor/mkrunewidth.c
parent 1c8d4e69205fd875f6bec3fa3bd929c2e7f52f62 (diff)
Chore: reorganize libutf and libfmt into base
I found the split to be arbitrary. Better to include the functionality in the standard library. I also split the headers to allow for more granular inclusion (but the library is still monolithic). The only ugliness is the circular dependency introduced with libutf's generated functions. We put explicit prereqs with the necessary object files instead.
Diffstat (limited to 'src/base/utf/vendor/mkrunewidth.c')
-rw-r--r-- src/base/utf/vendor/mkrunewidth.c 325
1 files changed, 325 insertions, 0 deletions
diff --git a/src/base/utf/vendor/mkrunewidth.c b/src/base/utf/vendor/mkrunewidth.c
new file mode 100644
index 0000000..14e6973
--- /dev/null
+++ b/src/base/utf/vendor/mkrunewidth.c
@@ -0,0 +1,325 @@
+#include "common.h"
+
+/*
+ * inspired by design choices in utf8proc/charwidths.jl
+ * all widths default to 1 unless they fall within the categories:
+ * 1. Mn 2. Mc 3. Me 4. Zl
+ * 5. Zp 6. Cc 7. Cf 8. Cs
+ * these default to zero width
+ */
+enum
+{
+ /* width ? */
+ WidthNeutral, /* (N) practically treated like narrow but unclear ... */
+ WidthAmbiguous, /* (A) sometimes wide and sometimes not... */
+ /* width 1 */
+ WidthHalf, /* (H) = to narrow (compatibility equivalent) */
+ WidthNarrow, /* (Na) ASCII width */
+ /* width 2 */
+ WidthWide, /* (W) 2x width */
+ WidthFull, /* (F) = to wide (compatibility equivalent) */
+};
+
+struct Table
+{
+ char width[3][NumRunes]; /* width[w][c] is set to 1 when rune c is assigned column width w (0, 1 or 2) */
+};
+
+static struct Table table; /* single instance: written by the parse_* passes, read back in main */
+
+// -----------------------------------------------------------------------
+// internal functions
+
+/*
+ * Reads the unicode data file at path (first command line argument) and
+ * seeds the width tables from each entry's general category:
+ *   Mn, Mc, Me, Zl, Zp, Cc, Cf, Cs -> width 0
+ *   everything else                -> width 1
+ * Panics if the file cannot be opened.
+ */
+static
+void
+parse_category(char *path)
+{
+    int sc, c, ec, w;
+    io·Stream *utf8;
+    char *prop, *field[NumFields], line[1024];
+
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+
+    // NOTE: we don't check for comments here
+    ec = -1; /* primed before the first codepointrange call, which updates sc and ec */
+    while(parse(utf8, arrlen(field), field, arrlen(line), line)){
+        codepointrange(utf8, field, &sc, &ec);
+
+        prop = field[Fcategory];
+
+        /* narrow unless the category is one of the zero-width classes */
+        w = 1;
+        switch(prop[0]){
+        case 'M':
+            if(prop[1] == 'n' || prop[1] == 'c' || prop[1] == 'e')
+                w = 0;
+            break;
+        case 'Z':
+            if(prop[1] == 'l' || prop[1] == 'p')
+                w = 0;
+            break;
+        case 'C':
+            if(prop[1] == 'c' || prop[1] == 'f' || prop[1] == 's')
+                w = 0;
+            break;
+        default:
+            break;
+        }
+
+        /* flag every rune of the entry's range in the table for its width */
+        for(c = sc; c <= ec; c++)
+            table.width[w][c] = 1;
+    }
+
+    io·close(utf8);
+}
+
+/*
+ * Splits a field of the form "X" or "X..Y" into its two end points.
+ * A lone value is stored in both *l and *r. For a range the ".." separator
+ * is overwritten with NULs in place, so field is modified.
+ */
+static
+void
+coderange(char *field, int *l, int *r)
+{
+    char *sep = strstr(field, "..");
+
+    /* single code point: both ends are the same */
+    if(!sep){
+        *l = *r = codepoint(field);
+        return;
+    }
+
+    /* terminate the left half and step over both dots */
+    sep[0] = 0;
+    sep[1] = 0;
+
+    *l = codepoint(field);
+    *r = codepoint(sep + 2);
+}
+
+/*
+ * Reads the east asian width file at path (second command line argument)
+ * and overrides the width tables for every entry whose class fixes a width:
+ *   H, Na -> width 1
+ *   W, F  -> width 2
+ * Ambiguous (A) and neutral (N) entries are skipped, so those runes keep
+ * whatever width was recorded before this pass.
+ * Panics if the file cannot be opened or a class is not recognized.
+ */
+static
+void
+parse_eawidths(char *path)
+{
+    int at, w;
+    int l, c, r;
+    io·Stream *utf8;
+    char *field[2], line[1024];
+
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+
+    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+        if(at == ParseSkip)
+            continue;
+
+        switch(field[1][0]){
+        case 'A': continue;
+        case 'N':
+            /* plain "N" is neutral and skipped; only "Na" carries a width */
+            if(field[1][1] != 'a')
+                continue;
+            /* fallthrough */
+        case 'H': w = 1; break;
+
+        case 'W': /* fallthrough */
+        case 'F': w = 2; break;
+
+        default:
+            panicf("malformed east asian width class: %s\n", field[1]);
+        }
+
+        coderange(field[0], &l, &r);
+
+        for(c=l; c <= r; c++){
+            /* ensure it only exists in one table */
+            table.width[w][c] = 1;
+            table.width[(w+1)%3][c] = 0;
+            table.width[(w+2)%3][c] = 0;
+        }
+    }
+    io·close(utf8);
+}
+
+/*
+ * Reads the emoji data file at path (third command line argument) and
+ * forces width 2 for every rune whose property field contains
+ * "Emoji_Presentation". All other entries are ignored.
+ * Panics if the file cannot be opened.
+ */
+static
+void
+parse_emoji(char *path)
+{
+    int at;
+    int l, c, r;
+    io·Stream *utf8;
+    char *s, *field[2], line[1024];
+
+    if(!(utf8 = io·open(path, "r")))
+        panicf("can't open %s\n", path);
+
+    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+        if(at == ParseSkip)
+            continue;
+
+        /* only override emoji presentation */
+        if(!strstr(field[1], "Emoji_Presentation"))
+            continue;
+
+        /* trim trailing space: every blank in the code field becomes a terminator */
+        for(s=field[0]; *s; s++){
+            if(*s == ' ')
+                *s = 0;
+        }
+
+        coderange(field[0], &l, &r);
+
+        /* move the whole range into the width 2 table */
+        for(c=l; c <= r; c++){
+            table.width[0][c] = 0;
+            table.width[1][c] = 0;
+            table.width[2][c] = 1;
+        }
+    }
+
+    io·close(utf8);
+}
+
+/* output functions */
+/*
+ * Emits the lookup tables for one membership set on stdout, followed by a
+ * static predicate `is<label>(rune c)` that searches them.
+ *
+ *   label      - prefix of the generated table identifiers and suffix of
+ *                the predicate name
+ *   table      - per-rune flag array handed to putrange/putpair/putsingle
+ *   pairs      - when nonzero (and onlyranges is zero) a pair table is emitted
+ *   onlyranges - when nonzero only the range table is emitted
+ *
+ * A search is only generated for a table whose put* helper returned nonzero.
+ * Panics if a generated identifier does not fit in its buffer.
+ */
+static
+void
+maketable(char *label, char *table, int pairs, int onlyranges)
+{
+    int n, r, p=0, s=0;
+    char ident[3][128];
+
+    enum
+    {
+        Irange,
+        Ipair,
+        Isingle,
+    };
+
+    /*
+     * snprintf returns the length it wanted to write: negative on error and
+     * at least the buffer size whenever the output was truncated.
+     */
+
+    /* ranges */
+    n = snprintf(ident[Irange], arrlen(ident[Irange]), "%s_range", label);
+    if(n < 0 || n >= (int)arrlen(ident[Irange]))
+        panicf("out of identifier space\n");
+    r = putrange(ident[Irange], table, onlyranges);
+
+    if(!onlyranges && pairs){
+        n = snprintf(ident[Ipair], arrlen(ident[Ipair]), "%s_pair", label);
+        if(n < 0 || n >= (int)arrlen(ident[Ipair]))
+            panicf("out of identifier space\n");
+        p = putpair(ident[Ipair], table);
+    }
+    if(!onlyranges){
+        n = snprintf(ident[Isingle], arrlen(ident[Isingle]), "%s_single", label);
+        if(n < 0 || n >= (int)arrlen(ident[Isingle]))
+            panicf("out of identifier space\n");
+
+        s = putsingle(ident[Isingle], table);
+    }
+
+    printf(
+        "static int\n"
+        "is%s(rune c)\n"
+        "{\n"
+        " rune *p;\n"
+        "\n",
+        label);
+
+    if(r){
+        printf(
+            " p = rangesearch(c, %s, arrlen(%s)/2, 2);\n"
+            " if(p && c >= p[0] && c <= p[1])\n"
+            " return 1;\n",
+            ident[Irange], ident[Irange]);
+    }
+
+    if(p){
+        /* the generated test only accepts runes at an even offset from the range start */
+        printf(
+            " p = rangesearch(c, %s, arrlen(%s)/2, 2);\n"
+            " if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
+            " return 1;\n",
+            ident[Ipair], ident[Ipair]);
+    }
+
+    if(s){
+        printf(
+            " p = rangesearch(c, %s, arrlen(%s), 1);\n"
+            " if(p && c == p[0])\n"
+            " return 1;\n",
+            ident[Isingle], ident[Isingle]);
+    }
+
+    printf(
+        " return 0;\n"
+        "}\n"
+        "\n");
+}
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+/* Writes the command line synopsis to stderr and exits with status 1. */
+static
+void
+usage(void)
+{
+    fputs("usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt> <EmojiData.txt>\n", stderr);
+    exit(1);
+}
+
+/*
+ * SETWn(c): place rune c in the width n table and clear it from the other
+ * two. Each macro is a single parenthesized expression with no trailing
+ * semicolon, so `SETWn(c);` is exactly one statement. Note that c is
+ * evaluated three times.
+ */
+#define SETW0(c) \
+    (table.width[0][(c)] = 1, \
+     table.width[1][(c)] = 0, \
+     table.width[2][(c)] = 0)
+
+#define SETW1(c) \
+    (table.width[0][(c)] = 0, \
+     table.width[1][(c)] = 1, \
+     table.width[2][(c)] = 0)
+
+#define SETW2(c) \
+    (table.width[0][(c)] = 0, \
+     table.width[1][(c)] = 0, \
+     table.width[2][(c)] = 1)
+
+
+/*
+ * mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt> <EmojiData.txt>
+ *
+ * Builds the three width tables from the input files, applies a few
+ * hard-coded overrides, verifies that no rune is flagged in more than one
+ * table, then prints the generated lookup code on stdout.
+ */
+int
+main(int argc, char *argv[])
+{
+    int c;
+
+    ARGBEGIN{
+    }ARGEND;
+
+    if(argc != 3)
+        usage();
+
+    /* order matters: each later pass overrides the earlier ones */
+    parse_category(*argv++);
+    parse_eawidths(*argv++);
+    parse_emoji(*argv);
+
+    /* overrides */
+    SETW0(0x2028);
+    SETW0(0x2029);
+
+    SETW1(0x00AD);
+
+    /* simple checking */
+    for(c=0; c<NumRunes; c++){
+        if(table.width[0][c] + table.width[1][c] + table.width[2][c] > 1)
+            panicf("improper table state\n");
+    }
+
+    putsearch();
+
+    maketable("width0", table.width[0], 1, 0);
+    maketable("width1", table.width[1], 1, 0);
+    maketable("width2", table.width[2], 1, 0);
+
+    /* the generated entry point returns 0 for any rune in neither the width 1 nor the width 2 table */
+    puts(
+        "\n"
+        "int\n"
+        "utf8·runewidth(rune c)\n"
+        "{\n"
+        " if(iswidth1(c))\n"
+        " return 1;\n"
+        " if(iswidth2(c))\n"
+        " return 2;\n"
+        " return 0;\n"
+        "}"
+    );
+}