From 1455834a50b8b6a15567e971db664fe7a6cdfaf6 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Fri, 29 Oct 2021 17:46:41 -0700 Subject: fix(unicode): emoji widths and readline now moves in a more unicode aware manner --- sys/cmd/rc/input.c | 34 ++++++++++++------ sys/libunicode/rules.mk | 6 +++- sys/libunicode/vendor/mkrunewidth.c | 71 ++++++++++++++++++++++++++++++------- 3 files changed, 87 insertions(+), 24 deletions(-) (limited to 'sys') diff --git a/sys/cmd/rc/input.c b/sys/cmd/rc/input.c index 08363f0..5d30ccd 100644 --- a/sys/cmd/rc/input.c +++ b/sys/cmd/rc/input.c @@ -341,14 +341,16 @@ refreshsingleline(struct TerminalState *term) w = utf8·runewidth(r); buf+=n, len-=n; - pos-=w, col-=w; + pos-=w, col-=w; } assert(buf <= term->edit.buf + len); while(off+col > term->cursor.cap){ n = utf8·decodeprev(buf+len-1, &r); - len-=n, col--; + w = utf8·runewidth(r); + + len-=n, col-=w; } assert(len >= 0); @@ -510,8 +512,8 @@ insertrune(struct TerminalState *term, int n, char *c) memmove(term->edit.buf+term->edit.len+n, term->edit.buf+term->edit.len, term->edit.len-term->edit.pos); memcpy(term->edit.buf+term->edit.pos, c, n); - term->edit.pos+=n, term->edit.len+=n; - term->cursor.pos++, term->cursor.len++; + term->edit.pos += n, term->edit.len += n; + term->cursor.pos += w, term->cursor.len += w; term->edit.buf[term->edit.len] = '\0'; refreshline(term); @@ -664,15 +666,21 @@ static Position left(struct TerminalState *term, int n) { + int w, d; rune r; Position pos = CURRENT(term); char *buf = term->edit.buf + term->edit.pos; - pos.cursor = MAX(pos.cursor-n, 0); - - while(n-- > 0 && buf > term->edit.buf) + d = 0; + while(n > 0 && buf > term->edit.buf){ buf -= utf8·decodeprev(buf-1, &r); + w = utf8·runewidth(r); + n -= w; + d += w; + } + + pos.cursor = MAX(pos.cursor-d, 0); pos.buffer = MAX(buf-term->edit.buf, 0); return pos; } @@ -682,16 +690,22 @@ static Position right(struct TerminalState *term, int n) { + int w, d; rune r; Position pos = CURRENT(term); char *end = term->edit.buf + term->edit.len; char *buf = term->edit.buf + term->edit.pos; - pos.cursor = MIN(pos.cursor+n, term->cursor.len); - - while(n-- > 0 && buf < end) + d = 0; + while(n > 0 && buf < end){ buf += utf8·decode(buf, &r); + w = utf8·runewidth(r); + n -= w; + d += w; + } + + pos.cursor = MIN(pos.cursor+d, term->cursor.len); pos.buffer = MIN(buf-term->edit.buf, term->edit.len); return pos; } diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk index acfce32..1d714fe 100644 --- a/sys/libunicode/rules.mk +++ b/sys/libunicode/rules.mk @@ -42,12 +42,16 @@ $(d)/vendor/EastAsianWidth-$(UNICODE).txt: @echo "GET EastAsianWidth.txt";\ curl https://www.unicode.org/Public/$(UNICODE)/ucd/EastAsianWidth.txt > $@ +$(d)/vendor/EmojiData-$(UNICODE).txt: + @echo "GET EmojiData.txt";\ + curl https://www.unicode.org/Public/$(UNICODE)/ucd/emoji/emoji-data.txt > $@ + $(d)/vendor/mkrunewidth: $(d)/vendor/mkrunewidth.c $(d)/vendor/common.o $(OBJ_DIR)/sys/base/base.a $(COMPLINK) GENS += $(d)/vendor/mkrunewidth -$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt +$(d)/runewidth-$(UNICODE).c: $(d)/vendor/mkrunewidth $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/EastAsianWidth-$(UNICODE).txt $(d)/vendor/EmojiData-$(UNICODE).txt @$(dir $@)vendor/mkrunewidth $(filter-out $<, $^) > $@ # grapheme boundaries diff --git a/sys/libunicode/vendor/mkrunewidth.c b/sys/libunicode/vendor/mkrunewidth.c index 43feb5e..14e6973 100644 --- a/sys/libunicode/vendor/mkrunewidth.c +++ b/sys/libunicode/vendor/mkrunewidth.c @@ -89,6 +89,21 @@ parse_category(char *path) io·close(utf8); } +static +void +coderange(char *field, int *l, int *r) +{ + char *s; + + if(!(s = strstr(field, ".."))) + *l=*r=codepoint(field); + else{ + *s++ = 0, *s++ = 0; + *l=codepoint(field); + *r=codepoint(s); + } +} + static void parse_eawidths(char *path) @@ -96,7 +111,7 @@ parse_eawidths(char *path) int at, w; int l, c, r; io·Stream *utf8; - char *s, *field[2], line[1024]; + char *field[2], line[1024]; utf8 = io·open(path, "r"); while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){ @@ -118,13 +133,7 @@ parse_eawidths(char *path) panicf("malformed east asian width class: %s\n", field[1]); } - if(!(s = strstr(field[0], ".."))) - l=codepoint(field[0]), r=l; - else{ - *s++ = 0, *s++ = 0; - l=codepoint(field[0]); - r=codepoint(s); - } + coderange(field[0], &l, &r); for(c=l; c <= r; c++){ /* ensure it only exists in one table */ @@ -136,6 +145,43 @@ parse_eawidths(char *path) io·close(utf8); } +static +void +parse_emoji(char *path) +{ + int at, w; + int l, c, r; + io·Stream *utf8; + char *s, *field[2], line[1024]; + + utf8 = io·open(path, "r"); + while((at=parse(utf8, arrlen(field), field, arrlen(line), line)) != ParseEOF){ + if(at == ParseSkip) + continue; + + /* only override emoji presentation */ + if(!strstr(field[1], "Emoji_Presentation")) + continue; + + /* trim trailing space */ + for(s=field[0]; *s; s++){ + if(*s == ' ') + *s = 0; + } + + coderange(field[0], &l, &r); + + for(c=l; c <= r; c++){ + table.width[0][c] = 0; + table.width[1][c] = 0; + table.width[2][c] = 1; + } + } + + io·close(utf8); +} + +/* output functions */ static void maketable(char *label, char *table, int pairs, int onlyranges) @@ -204,8 +250,6 @@ maketable(char *label, char *table, int pairs, int onlyranges) "\n"); } - - // ----------------------------------------------------------------------- // main point of entry @@ -213,7 +257,7 @@ static void usage(void) { - fprintf(stderr, "usage: mkrunewidth \n"); + fprintf(stderr, "usage: mkrunewidth \n"); exit(1); } @@ -241,11 +285,12 @@ main(int argc, char *argv[]) ARGBEGIN{ }ARGEND; - if(argc != 2) + if(argc != 3) usage(); parse_category(*argv++); - parse_eawidths(*argv); + parse_eawidths(*argv++); + parse_emoji(*argv); /* overrides */ SETW0(0x2028); -- cgit v1.2.1