aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2021-10-29 11:10:29 -0700
committerNicholas Noll <nbnoll@eml.cc>2021-10-29 11:10:29 -0700
commit6db99f51209ebde97311df3aac081f82589319cb (patch)
tree8a3e67a2c3ff4087379fb584a88073f0e43696e1
parentb42145545b434a40d6ad50cf3ac6cb6e04ae0500 (diff)
implemented functionality to compute rune widths from unicode data
-rw-r--r--sys/libunicode/rules.mk29
-rw-r--r--sys/libunicode/vendor/common.c220
-rw-r--r--sys/libunicode/vendor/common.h46
-rw-r--r--sys/libunicode/vendor/mkgraphemedata.c24
-rw-r--r--sys/libunicode/vendor/mkrunetype.c230
-rw-r--r--sys/libunicode/vendor/mkrunewidth.c173
6 files changed, 512 insertions, 210 deletions
diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk
index aca7348..5ba9b89 100644
--- a/sys/libunicode/rules.mk
+++ b/sys/libunicode/rules.mk
@@ -11,18 +11,25 @@ SRCS_$(d) := \
$(d)/canfit.c \
$(d)/runelen.c \
$(d)/len.c \
- $(d)/runetype-$(UNICODE).c
+ $(d)/runetype-$(UNICODE).c \
+ $(d)/runewidth-$(UNICODE).c
LIBS_$(d) := $(d)/libunicode.a
include share/paths.mk
+# ========================================================================
+# table generation
+
+$(d)/vendor/common.o: $(d)/vendor/common.c
+ $(COMPILE)
+
# rune categories
$(d)/vendor/UnicodeData-$(UNICODE).txt:
@echo "GET UnicodeData.txt";\
curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@
-$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(OBJ_DIR)/sys/base/base.a
+$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
$(COMPLINK)
GENS += $(d)/vendor/mkrunetype
@@ -30,12 +37,25 @@ GENS += $(d)/vendor/mkrunetype
$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
@$(dir $@)vendor/mkrunetype $< > $@
+# rune widths (NOTE(review): the last recipe's redirect into $@ is commented out, so the tables go to stdout and the target file is not created by this rule -- confirm this is intentional)
+$(d)/vendor/EastAsianWidth-$(UNICODE).txt:
+ @echo "GET EastAsianWidth.txt";\
+ curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@
+
+$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
+ $(COMPLINK)
+
+GENS += $(d)/vendor/mkrunewidth
+
+$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt
+ @$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) #> $@
+
# grapheme boundaries
$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt:
@echo "GET GraphemeBreakProperty.txt";\
curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@
-$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(OBJ_DIR)/sys/base/base.a
+$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
$(COMPLINK)
$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
@@ -43,6 +63,9 @@ $(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeB
GENS += $(d)/vendor/mkgraphemedata
+# ========================================================================
+# normal operations
+
$(LIBS_$(d)): $(OBJS_$(d))
$(ARCHIVE)
diff --git a/sys/libunicode/vendor/common.c b/sys/libunicode/vendor/common.c
new file mode 100644
index 0000000..6b5d1b3
--- /dev/null
+++ b/sys/libunicode/vendor/common.c
@@ -0,0 +1,220 @@
+#include "common.h"
+
+// -----------------------------------------------------------------------
+// input functions
+
+/*
+ * parse reads one line from io into line (capacity len) and splits it in
+ * place on ';' into exactly nfield fields, storing a pointer to each in field.
+ *
+ * returns ParseEOF when io·readln yields nothing, ParseSkip for empty lines
+ * and lines starting with '#' (field is left untouched), ParseOK otherwise.
+ * panics on an overlong line, a missing trailing newline or a field count
+ * different from nfield.
+ */
+int
+parse(io·Stream *io, int nfield, char **field, int len, char *line)
+{
+	int n;
+	char *s;
+
+	if((n=io·readln(io, len, line)) <= 0)
+		return ParseEOF;
+
+	if(n == len)
+		panicf("line too long");
+
+	/* report the character actually found where the newline should be */
+	if(line[n-1] != '\n')
+		panicf("invalid line: expected '\\n', found '%c'", line[n-1]);
+
+	line[n-1] = 0;
+
+	if(line[0] == '#' || line[0] == 0)
+		return ParseSkip;
+
+	/* tokenize line into fields; line itself keeps pointing at the first field */
+	n = 0;
+	field[n] = line;
+	for(s = line; *s; s++){
+		if(*s != ';')
+			continue;
+
+		/* never store past the nfield slots the caller provided */
+		if(n+1 >= nfield)
+			panicf("expected %d fields, found more: %s", nfield, line);
+
+		*s = 0;
+		field[++n] = s+1;
+	}
+
+	if(n != nfield-1)
+		panicf("expected %d fields, got %d: %s", nfield, n+1, line);
+
+	return ParseOK;
+}
+
+/*
+ * codepoint converts the string s of uppercase hexadecimal digits into its
+ * integer value. any other character is reported through panicf.
+ */
+int
+codepoint(char *s)
+{
+	int value, digit, ch;
+
+	for(value = 0; (ch = *s) != 0; s++){
+		digit = 0;
+		if('0' <= ch && ch <= '9')
+			digit = ch - '0';
+		else if('A' <= ch && ch <= 'F')
+			digit = ch - 'A' + 10;
+		else
+			panicf("bad codepoint char '%c'", ch);
+
+		/* make room for the next nibble, then add it */
+		value = (value << 4) + digit;
+	}
+
+	return value;
+}
+
+/*
+ * codepointrange decodes the code point range described by the UnicodeData
+ * record held in field.
+ *
+ * on entry *stop holds the last code point of the previous record; the new
+ * code point must be strictly larger. if the record name contains ", First>"
+ * the next line is consumed from utf8 and must be the matching ", Last>"
+ * record with the same category. on return [*start, *stop] is the inclusive
+ * range covered, and both ends are below NumRunes.
+ */
+void
+codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop)
+{
+	int at, e, c;
+	char *other[NumFields], line[1024];
+
+	// XXX: the stop variable passes in the previous stopping character
+	e = *stop;
+	c = codepoint(field[Fcode]);
+
+	if(c >= NumRunes)
+		panicf("unexpected large codepoint %x", c);
+	if(c <= e)
+		panicf("bad code sequence: %x then %x", e, c);
+	e = c;
+
+	if(strstr(field[Fname], ", First>") != nil){
+		at = parse(utf8, arrlen(other), other, arrlen(line), line);
+		if(at == ParseEOF)
+			panicf("range start at end of file");
+		/* a skipped line leaves other unset, so it must not be inspected */
+		if(at != ParseOK || strstr(other[Fname], ", Last>") == nil)
+			panicf("range start not followed by range end");
+
+		e = codepoint(other[Fcode]);
+
+		if(e <= c)
+			panicf("bad code sequence: %x then %x", c, e);
+		/* callers index NumRunes-sized tables up to and including e */
+		if(e >= NumRunes)
+			panicf("unexpected large codepoint %x", e);
+		if(strcmp(field[Fcategory], other[Fcategory]) != 0)
+			panicf("range with mismatched category");
+	}
+
+	*start = c;
+	*stop = e;
+}
+
+// -----------------------------------------------------------------------
+// output functions
+
+/*
+ * putsearch writes the prelude of a generated table file to stdout: the
+ * includes followed by the static rangesearch() lookup helper.
+ */
+void
+putsearch(void)
+{
+	static char prelude[] =
+		"#include <u.h>\n"
+		"#include <libunicode.h>\n"
+		"\n"
+		"static\n"
+		"rune*\n"
+		"rangesearch(rune c, rune *t, int n, int ne)\n"
+		"{\n"
+		" rune *p;\n"
+		" int m;\n"
+		" while(n > 1) {\n"
+		" m = n >> 1;\n"
+		" p = t + m*ne;\n"
+		" if(c >= p[0]){\n"
+		" t = p;\n"
+		" n = n-m;\n"
+		" }else\n"
+		" n = m;\n"
+		" }\n"
+		" if(n && c >= t[0])\n"
+		" return t;\n"
+		" return 0;\n"
+		"}\n";
+
+	/* puts appends the final newline that separates the helper from the tables */
+	puts(prelude);
+}
+
+/*
+ * putrange prints every run of consecutive set entries in prop as a
+ * "first, last" row of the static rune array named ident.
+ *
+ * runs of length one are only printed when force is set; otherwise they are
+ * left set in prop. every printed entry is cleared from prop.
+ * returns 1 if the array was emitted, 0 if nothing was printed.
+ */
+int
+putrange(char *ident, char *prop, int force)
+{
+	int lo, hi, opened;
+
+	opened = 0;
+	lo = 0;
+	while(lo < NumRunes){
+		if(!prop[lo]){
+			lo++;
+			continue;
+		}
+
+		/* swallow the tail of the run; hi ends one past its last member */
+		hi = lo + 1;
+		while(hi < NumRunes && prop[hi])
+			prop[hi++] = 0;
+
+		if(force || hi - lo > 1){
+			if(!opened){
+				printf("static rune %s[] = {\n", ident);
+				opened = 1;
+			}
+			prop[lo] = 0;
+			printf("\t0x%.4x, 0x%.4x,\n", lo, hi-1);
+		}
+
+		lo = hi;
+	}
+
+	if(opened)
+		printf("};\n\n");
+
+	return opened;
+}
+
+/*
+ * putpair prints runs of set entries spaced two apart (c, c+2, c+4, ...) as
+ * "first, last" rows of the static rune array named ident.
+ *
+ * only runs with at least two members are printed; printed entries are
+ * cleared from prop. returns 1 if the array was emitted, 0 otherwise.
+ */
+int
+putpair(char *ident, char *prop)
+{
+	int first, next, opened;
+
+	opened = 0;
+	first = 0;
+	while(first+2 < NumRunes){
+		if(!prop[first]){
+			first++;
+			continue;
+		}
+
+		/* follow the stride-2 chain; next ends on the first missing member */
+		next = first + 2;
+		while(next < NumRunes && prop[next]){
+			prop[next] = 0;
+			next += 2;
+		}
+
+		if(next != first + 2){
+			if(!opened){
+				printf("static rune %s[] = {\n", ident);
+				opened = 1;
+			}
+			prop[first] = 0;
+			printf("\t0x%.4x, 0x%.4x,\n", first, next - 2);
+		}
+
+		first = next;
+	}
+
+	if(opened)
+		printf("};\n\n");
+	return opened;
+}
+
+/*
+ * putsingle prints every entry still set in prop as one row of the static
+ * rune array named ident, clearing it as it goes.
+ * returns 1 if the array was emitted, 0 if prop was empty.
+ */
+int
+putsingle(char *ident, char *prop)
+{
+	int c, opened;
+
+	opened = 0;
+	for(c = 0; c < NumRunes; c++){
+		if(prop[c]){
+			/* the array header is only written once something needs it */
+			if(!opened){
+				printf("static rune %s[] = {\n", ident);
+				opened = 1;
+			}
+			prop[c] = 0;
+			printf("\t0x%.4x,\n", c);
+		}
+	}
+
+	if(opened)
+		printf("};\n\n");
+
+	return opened;
+}
diff --git a/sys/libunicode/vendor/common.h b/sys/libunicode/vendor/common.h
new file mode 100644
index 0000000..17cb113
--- /dev/null
+++ b/sys/libunicode/vendor/common.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <u.h>
+#include <base.h>
+#include <libunicode.h>
+
+enum
+{
+ // Fields inside UnicodeData.txt, in column order; each value indexes the field array filled by parse()
+ Fcode, // code point in uppercase hex, decoded with codepoint()
+ Fname, // name; range records contain ", First>" / ", Last>"
+ Fcategory, // general category, e.g. "Mn"
+ Fcombine,
+ Fbidir,
+ Fdecomp,
+ Fdecimal,
+ Fdigit,
+ Fnumeric,
+ Fmirror,
+ Foldname,
+ Fcomment,
+ Fupper,
+ Flower,
+ Ftitle,
+
+ NumFields, // number of ';'-separated fields expected per record
+ NumRunes = 1 << 21, // size of the per-rune tables; codepointrange() rejects record code points >= NumRunes
+};
+
+/* input functions: parse() status codes followed by the shared readers */
+enum
+{
+ ParseEOF, // nothing left to read; zero, so while(parse(...)) loops stop here
+ ParseOK, // line was split into fields
+ ParseSkip, // empty line or line starting with '#'; fields were not filled in
+};
+
+int parse(io·Stream *io, int nfield, char **field, int len, char *line); // read one line and split it on ';' into nfield fields
+int codepoint(char *s); // value of an uppercase hex string
+void codepointrange(io·Stream *utf8, char *field[NumFields], int *start, int *stop); // inclusive code point range of a UnicodeData record; *stop carries the previous end in
+
+/* output functions: each put* returns 1 if it printed an array, 0 otherwise */
+void putsearch(void); // prints the includes and the rangesearch() helper
+int putrange(char *ident, char *prop, int force); // consecutive runs as first,last rows; single entries only if force
+int putpair(char *ident, char *prop); // stride-2 runs as first,last rows
+int putsingle(char *ident, char *prop); // every remaining entry, one per row
diff --git a/sys/libunicode/vendor/mkgraphemedata.c b/sys/libunicode/vendor/mkgraphemedata.c
new file mode 100644
index 0000000..f79d851
--- /dev/null
+++ b/sys/libunicode/vendor/mkgraphemedata.c
@@ -0,0 +1,24 @@
+#include <u.h>
+#include <base.h>
+#include <libunicode.h>
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+/* usage prints the command line synopsis on stderr and exits with status 1 */
+static
+void
+usage(void)
+{
+	fputs("usage: mkgraphemedata <GraphemeBreakProperty.txt>\n", stderr);
+	exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ io·Stream *utf8; /* not used yet */
+ char line[1024]; /* not used yet */
+
+ ARGBEGIN{
+ }ARGEND; /* NOTE(review): stub -- no input is opened, nothing is generated and usage() is never called */
+}
diff --git a/sys/libunicode/vendor/mkrunetype.c b/sys/libunicode/vendor/mkrunetype.c
index fc4301c..9f939f4 100644
--- a/sys/libunicode/vendor/mkrunetype.c
+++ b/sys/libunicode/vendor/mkrunetype.c
@@ -1,6 +1,4 @@
-#include <u.h>
-#include <base.h>
-#include <libunicode.h>
+#include "common.h"
// -----------------------------------------------------------------------
// globals
@@ -8,29 +6,6 @@
#define OFFSET (1 << 20)
#define DELTA(mapx, x) ((1 << 20) + (mapx) - (x))
-enum
-{
- // Fields inside UnicodeData.txt
- Fcode,
- Fname,
- Fcategory,
- Fcombine,
- Fbidir,
- Fdecomp,
- Fdecimal,
- Fdigit,
- Fnumeric,
- Fmirror,
- Foldname,
- Fcomment,
- Fupper,
- Flower,
- Ftitle,
-
- NumFields,
- NumRunes = 1 << 21,
-};
-
// TODO: use bitarrays. will reduce executable size 8x
struct Table
{
@@ -59,154 +34,37 @@ static struct Table table;
// -----------------------------------------------------------------------
// internal functions
-int
-parse(io·Stream *io, char **field, int len, char *line)
-{
- int n;
- if((n=io·readln(io, len, line)) <= 0)
- return 0;
-
- if(n == len)
- panicf("line too long");
-
- if(line[n-1] != '\n')
- panicf("invalid line: expected '\n', found '%c'", line[n]);
-
- line[n-1] = 0;
-
- /* tokenize line into fields */
- n = 0;
- field[n] = line;
- while(*line){
- if(*line == ';'){
- *line = 0;
- field[++n] = line+1;
- }
- line++;
- }
-
- if(n != NumFields-1)
- panicf("expected %d number of fields, got %d", NumFields, n);
-
- return 1;
-}
-
-static
-int
-codepoint(char *s)
-{
- int c, b;
-
- c = 0;
- while((b=*s++)){
- c <<= 4;
- if(b >= '0' && b <= '9')
- c += b - '0';
- else if(b >= 'A' && b <= 'F')
- c += b - 'A' + 10;
- else
- panicf("bad codepoint char '%c'", b);
- }
-
- return c;
-}
-
static
int
isrange(char *label, char *prop, int force)
{
- int l, r, start;
-
- start = 0;
- for(l = 0; l < NumRunes;) {
- if(!prop[l]){
- l++;
- continue;
- }
-
- for(r = l+1; r < NumRunes; r++){
- if(!prop[r])
- break;
- prop[r] = 0;
- }
-
- if(force || r > l + 1){
- if(!start){
- printf("static rune is%s_range[] = {\n", label);
- start = 1;
- }
- prop[l] = 0;
- printf("\t0x%.4x, 0x%.4x,\n", l, r-1);
- }
-
- l = r;
- }
-
- if(start)
- printf("};\n\n");
+ char ident[128];
+ if(snprintf(ident, arrlen(ident), "is%s_range", label) == arrlen(ident))
+ panicf("out of identifier space\n");
- return start;
+ return putrange(ident, prop, force);
}
static
int
ispair(char *label, char *prop)
{
- int l, r, start;
-
- start = 0;
- for(l=0; l+2 < NumRunes; ){
- if(!prop[l]){
- l++;
- continue;
- }
-
- for(r = l + 2; r < NumRunes; r += 2){
- if(!prop[r])
- break;
- prop[r] = 0;
- }
-
- if(r != l + 2){
- if(!start){
- printf("static rune is%s_pair[] = {\n", label);
- start = 1;
- }
- prop[l] = 0;
- printf("\t0x%.4x, 0x%.4x,\n", l, r - 2);
- }
-
- l = r;
- }
+ char ident[128];
+ if(snprintf(ident, arrlen(ident), "is%s_pair", label) == arrlen(ident))
+ panicf("out of identifier space\n");
- if(start)
- printf("};\n\n");
- return start;
+ return putpair(ident, prop);
}
static
int
issingle(char *label, char *prop)
{
- int i, start;
+ char ident[128];
+ if(snprintf(ident, arrlen(ident), "is%s_single", label) == arrlen(ident))
+ panicf("out of identifier space\n");
- start = 0;
- for(i = 0; i < NumRunes; i++) {
- if(!prop[i])
- continue;
-
- if(!start){
- printf("static rune is%s_single[] = {\n", label);
- start = 1;
- }
- prop[i] = 0;
- printf("\t0x%.4x,\n", i);
- }
-
- if(start)
- printf("};\n\n");
-
- return start;
+ return putsingle(ident, prop);
}
static
@@ -425,9 +283,9 @@ usage(void)
int
main(int argc, char *argv[])
{
- int i, c, pc;
+ int i, sc, c, ec;
io·Stream *utf8;
- char *prop, *field[NumFields], *field2[NumFields], line[1024], line2[1024];
+ char *prop, *field[NumFields], line[1024];
ARGBEGIN{
}ARGEND;
@@ -453,33 +311,14 @@ main(int argc, char *argv[])
table.isspace['\v'] = 1;
table.isspace[0x85] = 1;
- pc = -1;
- while(parse(utf8, field, arrlen(line), line)){
- /* parse unicode value */
- c = codepoint(field[Fcode]);
- if(c >= NumRunes)
- panicf("unexpected large codepoint %x", c);
- if(c <= pc)
- panicf("bad code sequence: %x then %x", pc, c);
- pc = c;
-
- /* see if this is a range */
- if(strstr(field[Fname], ", First>") != nil){
- if(!parse(utf8, field2, arrlen(line2), line2))
- panicf("range start at end of file");
- if(strstr(field2[Fname], ", Last>") == nil)
- panicf("range start not followed by range end");
-
- pc = codepoint(field2[Fcode]);
-
- if(pc <= c)
- panicf("bad code sequence: %x then %x", c, pc);
- if(strcmp(field[Fcategory], field2[Fcategory]) != 0)
- panicf("range with mismatched category");
- }
-
+ ec = -1;
+ // NOTE: we don't check for comments here: assume UnicodeData.txt doesn't have any
+ while(parse(utf8, arrlen(field), field, arrlen(line), line)){
+ /* parse unicode range */
+ codepointrange(utf8, field, &sc, &ec);
prop = field[Fcategory];
- for(; c <= pc; c++){
+
+ for(c = sc; c <= ec; c++){
/* grab properties */
switch(prop[0]){
case 'L':
@@ -533,30 +372,7 @@ main(int argc, char *argv[])
}
io·close(utf8);
- puts(
- "#include <u.h>\n"
- "#include <libunicode.h>\n"
- "\n"
- "static\n"
- "rune*\n"
- "rangesearch(rune c, rune *t, int n, int ne)\n"
- "{\n"
- " rune *p;\n"
- " int m;\n"
- " while(n > 1) {\n"
- " m = n >> 1;\n"
- " p = t + m*ne;\n"
- " if(c >= p[0]){\n"
- " t = p;\n"
- " n = n-m;\n"
- " }else\n"
- " n = m;\n"
- " }\n"
- " if(n && c >= t[0])\n"
- " return t;\n"
- " return 0;\n"
- "}\n"
- );
+ putsearch();
makeis("space", table.isspace, 0, 1);
makeis("digit", table.isdigit, 0, 1);
diff --git a/sys/libunicode/vendor/mkrunewidth.c b/sys/libunicode/vendor/mkrunewidth.c
new file mode 100644
index 0000000..d821863
--- /dev/null
+++ b/sys/libunicode/vendor/mkrunewidth.c
@@ -0,0 +1,173 @@
+#include "common.h"
+
+/*
+ * inspired by design choices in utf8proc/charwidths.jl
+ * all widths default to 1 unless they fall within the categories:
+ * 1. Mn 2. Mc 3. Me 4. Zl
+ * 5. Zp 6. Cc 7. Cf 8. Cs
+ * these default to zero width
+ */
+enum
+{
+ /* width ? (NOTE(review): these constants are not referenced by the code visible in this file) */
+ WidthNeutral, /* (N) practically treated like narrow but unclear ... */
+ WidthAmbiguous, /* (A) sometimes wide and sometimes not... */
+ /* width 1 */
+ WidthHalf, /* (H) = to narrow (compatibility equivalent) */
+ WidthNarrow, /* (Na) ASCII width */
+ /* width 2 */
+ WidthWide, /* (W) 2x width */
+ WidthFull, /* (F) = to wide (compatibility equivalent) */
+};
+
+struct Table
+{
+ char width[3][NumRunes]; /* width[w][c] != 0 marks rune c as having display width w (0, 1 or 2) */
+};
+
+static struct Table table; /* filled by parse_category and parse_eawidths, printed by main */
+
+// -----------------------------------------------------------------------
+// internal functions
+
+/*
+ * parse_category reads UnicodeData.txt from path and marks every listed rune
+ * in table.width[0] or table.width[1] according to its general category:
+ * Mn, Mc, Me, Zl, Zp, Cc, Cf and Cs are zero width, everything else is
+ * width 1. panics if the file can't be opened.
+ */
+static
+void
+parse_category(char *path)
+{
+	int at, sc, c, ec, w;
+	io·Stream *utf8;
+	char *prop, *field[NumFields], line[1024];
+
+	if(!(utf8 = io·open(path, "r")))
+		panicf("can't open %s\n", path);
+
+	ec = -1;
+	while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+		/* empty and '#' lines leave field untouched: never interpret them */
+		if(at == ParseSkip)
+			continue;
+
+		codepointrange(utf8, field, &sc, &ec);
+
+		prop = field[Fcategory];
+
+		/* width 1 unless the category is one of the zero width ones */
+		w = 1;
+		switch(prop[0]){
+		case 'M':
+			if(prop[1] == 'n' || prop[1] == 'c' || prop[1] == 'e')
+				w = 0;
+			break;
+		case 'Z':
+			if(prop[1] == 'l' || prop[1] == 'p')
+				w = 0;
+			break;
+		case 'C':
+			/* this case used to fall into the default and always ended up with width 1 */
+			if(prop[1] == 'c' || prop[1] == 'f' || prop[1] == 's')
+				w = 0;
+			break;
+		default:
+			break;
+		}
+
+		for(c = sc; c <= ec; c++)
+			table.width[w][c] = 1;
+	}
+
+	io·close(utf8);
+}
+
+/*
+ * parse_eawidths reads EastAsianWidth.txt from path and moves every rune with
+ * class Na or H (width 1) or W or F (width 2) into the matching width table,
+ * clearing it from the other two. entries of class A or N are skipped and
+ * leave the tables untouched. panics if the file can't be opened, on an
+ * unknown class, or on a range that does not fit the tables.
+ */
+static
+void
+parse_eawidths(char *path)
+{
+	int at, w;
+	int l, c, r;
+	io·Stream *utf8;
+	char *s, *field[2], line[1024];
+
+	if(!(utf8 = io·open(path, "r")))
+		panicf("can't open %s\n", path);
+
+	while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+		if(at == ParseSkip)
+			continue;
+
+		switch(field[1][0]){
+		case 'A': continue;
+		case 'N':
+			if(field[1][1] != 'a')
+				continue;
+			/* fallthrough */
+		case 'H': w = 1; break;
+
+		case 'W': /* fallthrough */
+		case 'F': w = 2; break;
+
+		default:
+			panicf("malformed east asian width class: %s\n", field[1]);
+		}
+
+		/* field 0 is either a single code point or "first..last" */
+		if(!(s = strstr(field[0], "..")))
+			l=codepoint(field[0]), r=l;
+		else{
+			*s++ = 0, *s++ = 0;
+			l=codepoint(field[0]);
+			r=codepoint(s);
+		}
+
+		/* the tables hold NumRunes entries: refuse ranges that would index past them */
+		if(l < 0 || l > r || r >= NumRunes)
+			panicf("bad codepoint range: %x..%x", l, r);
+
+		for(c=l; c <= r; c++){
+			/* ensure it only exists in one table */
+			table.width[w][c] = 1;
+			table.width[(w+1)%3][c] = 0;
+			table.width[(w+2)%3][c] = 0;
+		}
+	}
+	io·close(utf8);
+}
+
+// -----------------------------------------------------------------------
+// main point of entry
+
+/* usage prints the command line synopsis on stderr and exits with status 1 */
+static
+void
+usage(void)
+{
+	fputs("usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt>\n", stderr);
+	exit(1);
+}
+
+/*
+ * mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt>
+ * fills the width tables from both inputs, checks that no rune sits in more
+ * than one table and prints the "zero", "single" and "double" range arrays.
+ */
+int
+main(int argc, char *argv[])
+{
+	int c, hits;
+
+	ARGBEGIN{
+	}ARGEND;
+
+	if(argc != 2)
+		usage();
+
+	parse_category(argv[0]);
+	parse_eawidths(argv[1]);
+
+	/* simple checking */
+	for(c=0; c<NumRunes; c++){
+		hits = table.width[0][c] + table.width[1][c] + table.width[2][c];
+		if(hits > 1)
+			panicf("improper table state");
+	}
+
+	putrange("zero", table.width[0], 1);
+	putrange("single", table.width[1], 1);
+	putrange("double", table.width[2], 1);
+
+	return 0;
+}