From 1455834a50b8b6a15567e971db664fe7a6cdfaf6 Mon Sep 17 00:00:00 2001
From: Nicholas Noll <nbnoll@eml.cc>
Date: Fri, 29 Oct 2021 17:46:41 -0700
Subject: fix(unicode): correct emoji widths and make readline move in a more
 Unicode-aware manner

---
 sys/cmd/rc/input.c                  | 34 ++++++++++++------
 sys/libunicode/rules.mk             |  6 +++-
 sys/libunicode/vendor/mkrunewidth.c | 71 ++++++++++++++++++++++++++++++-------
 3 files changed, 87 insertions(+), 24 deletions(-)

(limited to 'sys')

diff --git a/sys/cmd/rc/input.c b/sys/cmd/rc/input.c
index 08363f0..5d30ccd 100644
--- a/sys/cmd/rc/input.c
+++ b/sys/cmd/rc/input.c
@@ -341,14 +341,16 @@ refreshsingleline(struct TerminalState *term)
         w = utf8·runewidth(r);
 
         buf+=n, len-=n;
-        pos-=w,  col-=w;
+        pos-=w, col-=w;
     }
 
     assert(buf <= term->edit.buf + len);
 
     while(off+col > term->cursor.cap){
         n = utf8·decodeprev(buf+len-1, &r);
-        len-=n, col--;
+        w = utf8·runewidth(r);
+
+        len-=n, col-=w;
     }
     assert(len >= 0);
 
@@ -510,8 +512,8 @@ insertrune(struct TerminalState *term, int n, char *c)
             memmove(term->edit.buf+term->edit.len+n, term->edit.buf+term->edit.len, term->edit.len-term->edit.pos);
             memcpy(term->edit.buf+term->edit.pos, c, n);
 
-            term->edit.pos+=n,   term->edit.len+=n;
-            term->cursor.pos++, term->cursor.len++;
+            term->edit.pos   += n,   term->edit.len   += n;
+            term->cursor.pos += w,   term->cursor.len += w;
 
             term->edit.buf[term->edit.len] = '\0';
             refreshline(term);
@@ -664,15 +666,21 @@ static
 Position
 left(struct TerminalState *term, int n)
 {
+    int  w, d;
     rune r;
     Position pos = CURRENT(term);
     char *buf = term->edit.buf + term->edit.pos;
 
-    pos.cursor = MAX(pos.cursor-n, 0);
-
-    while(n-- > 0 && buf > term->edit.buf)
+    d = 0;
+    while(n > 0 && buf > term->edit.buf){
         buf -= utf8·decodeprev(buf-1, &r);
 
+        w  = utf8·runewidth(r);
+        n -= w;
+        d += w;
+    }
+
+    pos.cursor = MAX(pos.cursor-d, 0);
     pos.buffer = MAX(buf-term->edit.buf, 0);
     return pos;
 }
@@ -682,16 +690,22 @@ static
 Position
 right(struct TerminalState *term, int n)
 {
+    int  w, d;
     rune r;
     Position pos = CURRENT(term);
     char *end = term->edit.buf + term->edit.len;
     char *buf = term->edit.buf + term->edit.pos;
 
-    pos.cursor = MIN(pos.cursor+n, term->cursor.len);
-
-    while(n-- > 0 && buf < end)
+    d = 0;
+    while(n > 0 && buf < end){
         buf += utf8·decode(buf, &r);
 
+        w  = utf8·runewidth(r);
+        n -= w;
+        d += w;
+    }
+
+    pos.cursor = MIN(pos.cursor+d, term->cursor.len);
     pos.buffer = MIN(buf-term->edit.buf, term->edit.len);
     return pos;
 }
diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk
index acfce32..1d714fe 100644
--- a/sys/libunicode/rules.mk
+++ b/sys/libunicode/rules.mk
@@ -42,12 +42,16 @@ $(d)/vendor/EastAsianWidth-$(UNICODE).txt:
 	@echo "GET	EastAsianWidth.txt";\
 	curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@
 
+$(d)/vendor/EmojiData-$(UNICODE).txt:
+	@echo "GET	EmojiData.txt";\
+	curl https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt > $@
+
 $(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
 	$(COMPLINK)
 
 GENS += $(d)/vendor/mkrunewidth
 
-$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt
+$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt
 	@$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@
 
 # grapheme boundaries
diff --git a/sys/libunicode/vendor/mkrunewidth.c b/sys/libunicode/vendor/mkrunewidth.c
index 43feb5e..14e6973 100644
--- a/sys/libunicode/vendor/mkrunewidth.c
+++ b/sys/libunicode/vendor/mkrunewidth.c
@@ -89,6 +89,21 @@ parse_category(char *path)
     io·close(utf8);
 }
 
+static
+void
+coderange(char *field, int *l, int *r)
+{
+    char *s;
+
+    if(!(s = strstr(field, "..")))
+        *l=*r=codepoint(field);
+    else{
+        *s++ = 0, *s++ = 0;
+        *l=codepoint(field);
+        *r=codepoint(s);
+    }
+}
+
 static
 void
 parse_eawidths(char *path)
@@ -96,7 +111,7 @@ parse_eawidths(char *path)
     int at, w;
     int l, c, r;
     io·Stream *utf8;
-    char *s, *field[2], line[1024];
+    char *field[2], line[1024];
 
     utf8 = io·open(path, "r");
     while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
@@ -118,13 +133,7 @@ parse_eawidths(char *path)
             panicf("malformed east asian width class: %s\n", field[1]);
         }
 
-        if(!(s = strstr(field[0], "..")))
-            l=codepoint(field[0]), r=l;
-        else{
-            *s++ = 0, *s++ = 0;
-            l=codepoint(field[0]);
-            r=codepoint(s);
-        }
+        coderange(field[0], &l, &r);
 
         for(c=l; c <= r; c++){
             /* ensure it only exists in one table */
@@ -136,6 +145,43 @@ parse_eawidths(char *path)
     io·close(utf8);
 }
 
+static
+void
+parse_emoji(char *path)
+{
+    int at, w;
+    int l, c, r;
+    io·Stream *utf8;
+    char *s, *field[2], line[1024];
+
+    utf8 = io·open(path, "r");
+    while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+        if(at == ParseSkip)
+            continue;
+
+        /* only override emoji presentation */
+        if(!strstr(field[1], "Emoji_Presentation"))
+            continue;
+
+        /* trim trailing space */
+        for(s=field[0]; *s; s++){
+            if(*s == ' ')
+                *s = 0;
+        }
+
+        coderange(field[0], &l, &r);
+
+        for(c=l; c <= r; c++){
+            table.width[0][c] = 0;
+            table.width[1][c] = 0;
+            table.width[2][c] = 1;
+        }
+    }
+
+    io·close(utf8);
+}
+
+/* output functions */
 static
 void
 maketable(char *label, char *table, int pairs, int onlyranges)
@@ -204,8 +250,6 @@ maketable(char *label, char *table, int pairs, int onlyranges)
         "\n");
 }
 
-
-
 // -----------------------------------------------------------------------
 // main point of entry
 
@@ -213,7 +257,7 @@ static
 void
 usage(void)
 {
-    fprintf(stderr, "usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt>\n");
+    fprintf(stderr, "usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt> <EmojiData.txt>\n");
     exit(1);
 }
 
@@ -241,11 +285,12 @@ main(int argc, char *argv[])
     ARGBEGIN{
     }ARGEND;
 
-    if(argc != 2)
+    if(argc != 3)
         usage();
 
     parse_category(*argv++);
-    parse_eawidths(*argv);
+    parse_eawidths(*argv++);
+    parse_emoji(*argv);
 
     /* overrides */
     SETW0(0x2028);
-- 
cgit v1.2.1