From b12609cd69ff39f8a221b12230becadfe3e1fc57 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Fri, 29 Oct 2021 08:44:47 -0700 Subject: Prototype of unicode readline support The readline functionality operated on the assumption that 1 byte = 1 character. This is obviously wrong as soon as the input contains a non-ASCII character. This commit temporarily removes a lot of functionality but parses input bytes in a Unicode-aware manner. The outstanding problem now is that 1 Unicode rune != 1 column. There are double-wide characters, as well as zero-width runes, that further break our assumption that 1 rune = 1 character = 1 column. Handling column width is left for the next iteration. --- sys/cmd/rc/lex.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'sys/cmd/rc/lex.c') diff --git a/sys/cmd/rc/lex.c b/sys/cmd/rc/lex.c index 9a58f21..9ca2453 100644 --- a/sys/cmd/rc/lex.c +++ b/sys/cmd/rc/lex.c @@ -142,28 +142,23 @@ putbyte(char *buf, int c) return buf; } -#define onebyte(c) ((c&0x80)==0x00) -#define twobyte(c) ((c&0xe0)==0xc0) -#define threebyte(c) ((c&0xf0)==0xe0) -#define fourbyte(c) ((c&0xf8)==0xf0) - static char * putrune(char *buf, int c) { buf = putbyte(buf, c); - if(onebyte(c)) + if(utf8·onebyte(c)) return buf; - if(twobyte(c)) - return putbyte(buf,c); - if(threebyte(c)){ - buf = putbyte(buf,c); - return putbyte(buf,c); + if(utf8·twobyte(c)) + return putbyte(buf,advance()); + if(utf8·threebyte(c)){ + buf = putbyte(buf,advance()); + return putbyte(buf,advance()); } - if(fourbyte(c)){ - buf = putbyte(buf,c); - buf = putbyte(buf,c); - return putbyte(buf,c); + if(utf8·fourbyte(c)){ + buf = putbyte(buf,advance()); + buf = putbyte(buf,advance()); + return putbyte(buf,advance()); } fatal("malformed utf8 stream"); -- cgit v1.2.1