about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nicholas Noll <nbnoll@eml.cc> 2021-10-29 17:46:41 -0700
committer Nicholas Noll <nbnoll@eml.cc> 2021-10-29 17:46:41 -0700
commit 1455834a50b8b6a15567e971db664fe7a6cdfaf6 (patch)
tree c787f7db4b2b2a277351d0c90a9829119df677d4
parent 5d1e21d28a93415b66401d27a30ee8eb6b2b66a8 (diff)
fix(unicode): emoji widths and readline now moves in a more unicode aware manner
-rw-r--r-- sys/cmd/rc/input.c 34
-rw-r--r-- sys/libunicode/rules.mk 6
-rw-r--r-- sys/libunicode/vendor/mkrunewidth.c 71
3 files changed, 87 insertions, 24 deletions
diff --git a/sys/cmd/rc/input.c b/sys/cmd/rc/input.c
index 08363f0..5d30ccd 100644
--- a/sys/cmd/rc/input.c
+++ b/sys/cmd/rc/input.c
@@ -341,14 +341,16 @@ refreshsingleline(struct TerminalState *term)
w = utf8·runewidth(r);
buf+=n, len-=n;
- pos-=w, col-=w;
+ pos-=w, col-=w;
}
assert(buf <= term->edit.buf + len);
while(off+col > term->cursor.cap){
n = utf8·decodeprev(buf+len-1, &r);
- len-=n, col--;
+ w = utf8·runewidth(r);
+
+ len-=n, col-=w;
}
assert(len >= 0);
@@ -510,8 +512,8 @@ insertrune(struct TerminalState *term, int n, char *c)
memmove(term->edit.buf+term->edit.len+n, term->edit.buf+term->edit.len, term->edit.len-term->edit.pos);
memcpy(term->edit.buf+term->edit.pos, c, n);
- term->edit.pos+=n, term->edit.len+=n;
- term->cursor.pos++, term->cursor.len++;
+ term->edit.pos += n, term->edit.len += n;
+ term->cursor.pos += w, term->cursor.len += w;
term->edit.buf[term->edit.len] = '\0';
refreshline(term);
@@ -664,15 +666,21 @@ static
Position
left(struct TerminalState *term, int n)
{
+ int w, d;
rune r;
Position pos = CURRENT(term);
char *buf = term->edit.buf + term->edit.pos;
- pos.cursor = MAX(pos.cursor-n, 0);
-
- while(n-- > 0 && buf > term->edit.buf)
+ d = 0;
+ while(n > 0 && buf > term->edit.buf){
buf -= utf8·decodeprev(buf-1, &r);
+ w = utf8·runewidth(r);
+ n -= w;
+ d += w;
+ }
+
+ pos.cursor = MAX(pos.cursor-d, 0);
pos.buffer = MAX(buf-term->edit.buf, 0);
return pos;
}
@@ -682,16 +690,22 @@ static
Position
right(struct TerminalState *term, int n)
{
+ int w, d;
rune r;
Position pos = CURRENT(term);
char *end = term->edit.buf + term->edit.len;
char *buf = term->edit.buf + term->edit.pos;
- pos.cursor = MIN(pos.cursor+n, term->cursor.len);
-
- while(n-- > 0 && buf < end)
+ d = 0;
+ while(n > 0 && buf < end){
buf += utf8·decode(buf, &r);
+ w = utf8·runewidth(r);
+ n -= w;
+ d += w;
+ }
+
+ pos.cursor = MIN(pos.cursor+d, term->cursor.len);
pos.buffer = MIN(buf-term->edit.buf, term->edit.len);
return pos;
}
diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk
index acfce32..1d714fe 100644
--- a/sys/libunicode/rules.mk
+++ b/sys/libunicode/rules.mk
@@ -42,12 +42,16 @@ $(d)/vendor/EastAsianWidth-$(UNICODE).txt:
@echo "GET EastAsianWidth.txt";\
curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@
+$(d)/vendor/EmojiData-$(UNICODE).txt:
+ @echo "GET EmojiData.txt";\
+ curl https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt > $@
+
$(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a
$(COMPLINK)
GENS += $(d)/vendor/mkrunewidth
-$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt
+$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt
@$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@
# grapheme boundaries
diff --git a/sys/libunicode/vendor/mkrunewidth.c b/sys/libunicode/vendor/mkrunewidth.c
index 43feb5e..14e6973 100644
--- a/sys/libunicode/vendor/mkrunewidth.c
+++ b/sys/libunicode/vendor/mkrunewidth.c
@@ -91,12 +91,27 @@ parse_category(char *path)
static
void
+coderange(char *field, int *l, int *r) /* parse field, either "X" or "X..Y", into the bounds *l and *r (callers iterate them inclusively) */
+{
+ char *s;
+
+ if(!(s = strstr(field, ".."))) /* no ".." separator: field names a single code point, so both bounds are equal */
+ *l=*r=codepoint(field);
+ else{
+ *s++ = 0, *s++ = 0; /* overwrite both dots with NUL, splitting field in place; s ends up just past them */
+ *l=codepoint(field); /* text before the ".." */
+ *r=codepoint(s); /* text after the ".." */
+ }
+}
+
+static
+void
parse_eawidths(char *path)
{
int at, w;
int l, c, r;
io·Stream *utf8;
- char *s, *field[2], line[1024];
+ char *field[2], line[1024];
utf8 = io·open(path, "r");
while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
@@ -118,13 +133,7 @@ parse_eawidths(char *path)
panicf("malformed east asian width class: %s\n", field[1]);
}
- if(!(s = strstr(field[0], "..")))
- l=codepoint(field[0]), r=l;
- else{
- *s++ = 0, *s++ = 0;
- l=codepoint(field[0]);
- r=codepoint(s);
- }
+ coderange(field[0], &l, &r);
for(c=l; c <= r; c++){
/* ensure it only exists in one table */
@@ -138,6 +147,43 @@ parse_eawidths(char *path)
static
void
+parse_emoji(char *path) /* read the file at path and, for each entry tagged Emoji_Presentation, rewrite its code points in table.width */
+{
+ int at, w; /* NOTE(review): w is never used in this function */
+ int l, c, r;
+ io·Stream *utf8;
+ char *s, *field[2], line[1024];
+
+ utf8 = io·open(path, "r"); /* NOTE(review): result is used without a check — confirm io·open cannot fail silently here */
+ while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){
+ if(at == ParseSkip)
+ continue;
+
+ /* skip every entry whose second field does not contain "Emoji_Presentation" */
+ if(!strstr(field[1], "Emoji_Presentation"))
+ continue;
+
+ /* replace every space in the first field with NUL, so the string ends at its first space */
+ for(s=field[0]; *s; s++){
+ if(*s == ' ')
+ *s = 0;
+ }
+
+ coderange(field[0], &l, &r);
+
+ for(c=l; c <= r; c++){ /* l and r are both included */
+ table.width[0][c] = 0;
+ table.width[1][c] = 0;
+ table.width[2][c] = 1; /* code point ends up flagged only in table.width[2] — presumably the width-2 table; confirm against table's declaration */
+ }
+ }
+
+ io·close(utf8);
+}
+
+/* output functions */
+static
+void
maketable(char *label, char *table, int pairs, int onlyranges)
{
int r, p=0, s=0;
@@ -204,8 +250,6 @@ maketable(char *label, char *table, int pairs, int onlyranges)
"\n");
}
-
-
// -----------------------------------------------------------------------
// main point of entry
@@ -213,7 +257,7 @@ static
void
usage(void)
{
- fprintf(stderr, "usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt>\n");
+ fprintf(stderr, "usage: mkrunewidth <UnicodeData.txt> <EastAsianWidth.txt> <EmojiData.txt>\n");
exit(1);
}
@@ -241,11 +285,12 @@ main(int argc, char *argv[])
ARGBEGIN{
}ARGEND;
- if(argc != 2)
+ if(argc != 3)
usage();
parse_category(*argv++);
- parse_eawidths(*argv);
+ parse_eawidths(*argv++);
+ parse_emoji(*argv);
/* overrides */
SETW0(0x2028);