From b12609cd69ff39f8a221b12230becadfe3e1fc57 Mon Sep 17 00:00:00 2001
From: Nicholas Noll <nbnoll@eml.cc>
Date: Fri, 29 Oct 2021 08:44:47 -0700
Subject: Prototype of unicode readline support

The readline functionality operated on the assumption that 1 byte = 1
character. This is wrong as soon as the input contains a non-ASCII
character, which UTF-8 encodes as multiple bytes. This commit temporarily
removes a lot of functionality but parses input bytes in a Unicode-aware
manner.

The outstanding problem now is that 1 Unicode rune != 1 column. There are
double-wide characters, as well as zero-width runes, that further break
our assumption that 1 rune = 1 character = 1 column. Handling this is the
next iteration.
---
 sys/cmd/rc/lex.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'sys/cmd/rc/lex.c')

diff --git a/sys/cmd/rc/lex.c b/sys/cmd/rc/lex.c
index 9a58f21..9ca2453 100644
--- a/sys/cmd/rc/lex.c
+++ b/sys/cmd/rc/lex.c
@@ -142,28 +142,23 @@ putbyte(char *buf, int c)
     return buf;
 }
 
-#define	onebyte(c)	     ((c&0x80)==0x00)
-#define	twobyte(c)	     ((c&0xe0)==0xc0)
-#define	threebyte(c)	 ((c&0xf0)==0xe0)
-#define	fourbyte(c)	     ((c&0xf8)==0xf0)
-
 static
 char *
 putrune(char *buf, int c)
 {
     buf = putbyte(buf, c);
-    if(onebyte(c))
+    if(utf8·onebyte(c))
         return buf;
-    if(twobyte(c))
-        return putbyte(buf,c);
-    if(threebyte(c)){
-        buf = putbyte(buf,c);
-        return putbyte(buf,c);
+    if(utf8·twobyte(c))
+        return putbyte(buf,advance());
+    if(utf8·threebyte(c)){
+        buf = putbyte(buf,advance());
+        return putbyte(buf,advance());
     }
-    if(fourbyte(c)){
-        buf = putbyte(buf,c);
-        buf = putbyte(buf,c);
-        return putbyte(buf,c);
+    if(utf8·fourbyte(c)){
+        buf = putbyte(buf,advance());
+        buf = putbyte(buf,advance());
+        return putbyte(buf,advance());
     }
     fatal("malformed utf8 stream");
 
-- 
cgit v1.2.1