aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Noll <nbnoll@eml.cc>2021-10-28 11:55:47 -0700
committerNicholas Noll <nbnoll@eml.cc>2021-10-28 11:55:47 -0700
commit6318ddb69ad8380694bbca35ca5a72230c2ee694 (patch)
treee9ef65d79c264d88c2d02402a587c2561b5b1299
parent189b9e23edfe60b7e82c4c7b6071a3f98799653a (diff)
Feat(libunicode): cleaned up decode/encode
Additionally, decode can now apply backwards on a byte string.
-rw-r--r--include/libunicode.h14
-rw-r--r--sys/libunicode/canfit.c17
-rw-r--r--sys/libunicode/decode.c78
-rw-r--r--sys/libunicode/decodeprev.c60
-rw-r--r--sys/libunicode/encode.c38
-rw-r--r--sys/libunicode/find.c5
-rw-r--r--sys/libunicode/findlast.c6
-rw-r--r--sys/libunicode/internal.h37
-rw-r--r--sys/libunicode/len.c4
-rw-r--r--sys/libunicode/rules.mk17
-rw-r--r--sys/libunicode/vendor/mkrunetype.c3
11 files changed, 227 insertions, 52 deletions
diff --git a/include/libunicode.h b/include/libunicode.h
index 5e69fd1..00e5bc0 100644
--- a/include/libunicode.h
+++ b/include/libunicode.h
@@ -2,16 +2,8 @@
typedef uint32 rune;
-/*
- * we have to use the preprocessor to ensure we have unsigned constants.
- */
-
#define UTFmax 4
-#define RuneSync 0x80u
-#define RuneSelf 0x80u
#define RuneErr 0xFFFDu
-#define RuneMax 0x10FFFFu
-#define RuneMask 0x1FFFFFu
/*
* UTF-8 functions.
@@ -19,8 +11,9 @@ typedef uint32 rune;
int utf8·len(char *s); // returns number of runes
int utf8·runelen(rune r); // returns number of bytes for rune
-int utf8·decode(char *, rune *); // decode 1 rune from char stream, store into rune, return number of bytes
-int utf8·encode(rune *, char *); // encode 1 rune from rune stream, store into char, return number of bytes
+int utf8·decode(char *, rune *); // decode 1 rune from char stream, store into rune, return number of bytes
+int utf8·encode(rune *, char *); // encode 1 rune from rune stream, store into char, return number of bytes
+int utf8·decodeprev(char *s, rune *r); // decode 1 rune from char stream, reading backwards, store into rune, return number of bytes
char *utf8·find(char *s, rune); // find rune in char stream
char *utf8·findlast(char* s, rune); // find last rune in char stream
@@ -31,6 +24,7 @@ int utf8·isletter(rune r);
int utf8·isdigit(rune r);
int utf8·isspace(rune r);
int utf8·istitle(rune r);
+int utf8·ispunct(rune r);
rune utf8·toupper(rune r);
rune utf8·tolower(rune r);
diff --git a/sys/libunicode/canfit.c b/sys/libunicode/canfit.c
index d44c9e6..4579ab3 100644
--- a/sys/libunicode/canfit.c
+++ b/sys/libunicode/canfit.c
@@ -7,14 +7,17 @@ utf8·canfit(byte* s, int n)
- int i;
rune c;
- if(n <= 0) return 0;
- c = *(ubyte*) s;
- if(c < Tx) return 1;
+    /* an empty buffer cannot hold any rune */
+    if(n <= 0)
+        return 0;
- for(i = 3; i < UTFmax + 1; i++){
- if(c < Tbyte(i))
- return n >= i - 1;
- }
+    /*
+     * the first byte alone tells us how many bytes the sequence claims;
+     * compare it against the lead-byte headers to pick that length.
+     */
+    c = *(ubyte*)s;
+    /* below the 2-byte lead header: one byte is all that is consumed */
+    if(c < TByte1)
+        return 1;
+
+    /* 110xxxxx lead: 2-byte sequence */
+    if(c < TByte2)
+        return n >= 2;
+    /* 1110xxxx lead: 3-byte sequence */
+    if(c < TByte3)
+        return n >= 3;
+    /* anything else is treated as a 4-byte (UTFmax) sequence */
return n >= UTFmax;
}
diff --git a/sys/libunicode/decode.c b/sys/libunicode/decode.c
index 79271f2..01797f1 100644
--- a/sys/libunicode/decode.c
+++ b/sys/libunicode/decode.c
@@ -1,32 +1,90 @@
#include "internal.h"
+#define ACCEPT 0
+#define REJECT 12
+
+/*
+ * Lookup table driving the table-based UTF-8 decoder in utf8·decode.
+ * It is only ever read, hence const.
+ */
+static const uint8 decode[] = {
+    /*
+     * the first part of the table (entries 0-255) maps each byte to a
+     * character class. classes keep the transition table small, and the
+     * class of a leading byte is also the shift applied to 0xFF to mask
+     * out that byte's payload bits.
+     */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+    /*
+     * the second part (from entry 256 on) is the transition table: it
+     * maps the sum of the current automaton state and a character class
+     * to the next state. states are multiples of 12, with 0 meaning
+     * accept and 12 meaning reject.
+     */
+    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+    12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+/*
+ * utf8·decode: decode one rune starting at s[0].
+ *
+ * Runs the table-driven automaton over at most UTFmax bytes. On success
+ * the rune is stored in *r and the number of bytes consumed is returned.
+ * On any malformed sequence *r is set to RuneErr and 1 is returned.
+ */
+int
+utf8·decode(char *s, rune *r)
+{
+    uint8 *p;
+    uint8 cls, state;
+    rune acc;
+    int len;
+
+    p = (uint8 *)s;
+
+    /* leading byte: its class doubles as the shift that masks the payload */
+    cls   = decode[p[0]];
+    acc   = (0xFF >> cls) & p[0];
+    state = decode[256+ACCEPT+cls];
+    len   = 1;
+
+    /* keep consuming bytes while the automaton sits in an intermediate state */
+    while(state > REJECT && len < UTFmax){
+        cls   = decode[p[len]];
+        acc   = (acc << 6) | (p[len] & TMask);
+        state = decode[256+state+cls];
+        len++;
+    }
+
+    if(state == ACCEPT){
+        *r = acc;
+        return len;
+    }
+
+    /* rejected or truncated: report the error and skip a single byte */
+    *r = RuneErr;
+    return 1;
+}
+
+#if 0
int
-utf8·decode(byte *s, rune* r)
+utf8·decode(byte *s, rune *r)
{
int c[UTFmax], i;
rune l;
c[0] = *(ubyte*)(s);
- if(c[0] < Tx) {
+ if(c[0] < Tx){
*r = c[0];
return 1;
}
l = c[0];
- for(i = 1; i < UTFmax; i++) {
+ for(i = 1; i < UTFmax; i++){
c[i] = *(ubyte*)(s+i);
c[i] ^= Tx;
- if (c[i] & Testx) goto bad;
+ if(c[i] & Testx) goto bad;
l = (l << Bitx) | c[i];
- if(c[0] < Tbyte(i + 2)) {
+ if(c[0] < Tbyte(i + 2)){
l &= RuneX(i + 1);
- if (i == 1) {
- if (c[0] < Tbyte(2) || l <= Rune1)
+ if(i == 1){
+ if(c[0] < Tbyte(2) || l <= Rune1)
goto bad;
- } else if (l <= RuneX(i) || l > RuneMax)
+ }else if(l <= RuneX(i) || l > RuneMax)
goto bad;
- if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+
+ if(i == 2 && SurrogateMin <= l && l <= SurrogateMax)
goto bad;
*r = l;
@@ -37,4 +95,4 @@ bad:
*r = RuneErr;
return 1;
}
-
+#endif
diff --git a/sys/libunicode/decodeprev.c b/sys/libunicode/decodeprev.c
new file mode 100644
index 0000000..27dced6
--- /dev/null
+++ b/sys/libunicode/decodeprev.c
@@ -0,0 +1,60 @@
+#include "internal.h"
+
+#define ACCEPT 0
+#define REJECT 12
+
+/*
+ * Lookup table driving the backwards decoder utf8·decodeprev.
+ * It is only ever read, hence const.
+ */
+static const uint8 decode[] = {
+    /*
+     * the first part of the table (entries 0-255) maps each byte to a
+     * character class. classes keep the transition table small, and the
+     * class of a leading byte is also the shift applied to 0xFF to mask
+     * out that byte's payload bits.
+     */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+    /*
+     * the second part (from entry 256 on) is the transition table for
+     * reading a sequence from its last byte towards its first. each row
+     * is one state (state value = 12 * row index), each column one
+     * character class. 0 means accept and 12 means reject.
+     */
+    // 0 1 2 3 4 5 6 7 8 9 10 11
+    0,24,12,12,12,12,12,24,12,24,12,12,
+    0,24,12,12,12,12,12,24,12,24,12,12,
+    12,36, 0,12,12,12,12,48,12,36,12,12,
+    12,60,12, 0, 0,12,12,72,12,72,12,12,
+    12,60,12, 0,12,12,12,72,12,72, 0,12,
+    12,12,12,12,12, 0, 0,12,12,12,12,12,
+    12,12,12,12,12,12,12,12,12,12,12, 0
+};
+
+/*
+ * utf8·decodeprev: decode the rune whose LAST byte is s[0], reading
+ * backwards through s[-1], s[-2], ... (at most UTFmax bytes in total).
+ *
+ * On success the rune is stored in *r and the number of bytes it occupies
+ * is returned. On a malformed sequence *r is set to RuneErr and 1 is
+ * returned, mirroring utf8·decode, so that a caller stepping backwards
+ * skips only the offending byte and never a valid neighbour.
+ *
+ * NOTE(review): nothing here stops the scan at the start of the buffer;
+ * presumably the caller guarantees enough readable bytes before s.
+ */
+int
+utf8·decodeprev(byte *s, rune *r)
+{
+    int n;
+    rune v;
+    uint8 b, t, d, x=ACCEPT;
+
+    v=0, n=0, d=0;
+nextbyte:
+    /* fetch the byte n positions before s, then advance the count */
+    b = ((uint8 *)s)[-n++];
+    t = decode[b];
+    x = decode[256+x+t];
+
+    /* intermediate state: b is a continuation byte, bank its 6 payload bits */
+    if(x > REJECT && n < UTFmax){
+        v = v | ((b & TMask) << d);
+        d += 6;
+        goto nextbyte;
+    }
+
+    if(x != ACCEPT){
+        *r = RuneErr;
+        return 1;
+    }
+
+    /* b is the leading byte: its class is the shift that masks its payload */
+    v |= (((0xFFu >> t) & b) << d);
+    *r = v;
+    return n;
+}
diff --git a/sys/libunicode/encode.c b/sys/libunicode/encode.c
index 8f4d212..fa7c93e 100644
--- a/sys/libunicode/encode.c
+++ b/sys/libunicode/encode.c
@@ -1,6 +1,43 @@
#include "internal.h"
+/*
+ * utf8·encode: encode the rune *r as UTF-8 into s.
+ *
+ * Writes between 1 and UTFmax bytes to s (no terminator is written) and
+ * returns the number of bytes written. Runes above RuneMax or inside the
+ * surrogate range are replaced by RuneErr before encoding, so the
+ * replacement character is emitted in its regular form.
+ */
int
+utf8·encode(rune *r, byte *s)
+{
+    rune c;
+
+    c = *r;
+
+    /*
+     * validate before choosing a length: doing this only in the 4-byte
+     * branch would miss surrogates and would encode RuneErr as an
+     * overlong 4-byte sequence.
+     */
+    if(c > RuneMax || (RuneSurrogateMin <= c && c <= RuneSurrogateMax))
+        c = RuneErr;
+
+    if(c < Rune1Byte){ // 7 bits
+        s[0] = (uint8)c;
+        return 1;
+    }
+
+    if(c < Rune2Byte){ // 11 bits
+        s[0] = TByte1 | (c >> 6);
+        s[1] = Tx | (c & TMask);
+        return 2;
+    }
+
+    if(c < Rune3Byte){ // 16 bits
+        s[0] = TByte2 | ((c >> 12));
+        s[1] = Tx | ((c >> 6) & TMask);
+        s[2] = Tx | ((c) & TMask);
+        return 3;
+    }
+
+    // 21 bits
+    s[0] = TByte3 | ((c >> 18));
+    s[1] = Tx | ((c >> 12) & TMask);
+    s[2] = Tx | ((c >> 6) & TMask);
+    s[3] = Tx | ((c) & TMask);
+
+    return 4;
+}
+
+#if 0
+int
utf8·encode(rune* r, byte* s)
{
int i, j;
@@ -29,3 +66,4 @@ utf8·encode(rune* r, byte* s)
return UTFmax;
}
+#endif
diff --git a/sys/libunicode/find.c b/sys/libunicode/find.c
index 659ab5b..d75feb8 100644
--- a/sys/libunicode/find.c
+++ b/sys/libunicode/find.c
@@ -7,17 +7,18 @@ utf8·find(byte* s, rune c)
rune r;
int n;
- if(c < RuneSync)
+ if(c < Tx)
return strchr(s, c);
for(;;){
c1 = *(ubyte*)s;
- if(c1 < RuneSelf) {
+ if(c1 < Tx){
if(c1 == 0) return nil;
if(c1 == c) return s;
s++;
continue;
}
+
n = utf8·decode(s, &r);
if(r == c)
diff --git a/sys/libunicode/findlast.c b/sys/libunicode/findlast.c
index 3a4ed38..ab25ab2 100644
--- a/sys/libunicode/findlast.c
+++ b/sys/libunicode/findlast.c
@@ -7,19 +7,21 @@ utf8·findlast(byte* s, rune c)
rune r;
byte *l;
- if(c < RuneSync)
+ if(c < Tx)
return strrchr(s, c);
l = nil;
for(;;){
c1 = *(ubyte*)s;
- if(c1 < RuneSelf) {
+ if(c1 < Tx){
if(c1 == 0) return l;
if(c1 == c) l = s;
s++;
continue;
}
+
c1 = utf8·decode(s, &r);
+
if(r == c)
l = s;
diff --git a/sys/libunicode/internal.h b/sys/libunicode/internal.h
index 31ed2ae..6b0657b 100644
--- a/sys/libunicode/internal.h
+++ b/sys/libunicode/internal.h
@@ -4,7 +4,9 @@
#include <base.h>
#include <libunicode.h>
-/* UTF-8 code
+/*
+ * NOTE: we use the preprocessor to ensure we have unsigned constants.
+ * UTF-8 code:
* 1 byte:
* 0xxxxxxx
* 2 byte:
@@ -14,22 +16,23 @@
* 4 byte:
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
-#define Bit(i) (7-(i))
-/* 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */
-#define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
-/* 0000 0000 0000 0111 1111 1111 */
-#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
-enum
-{
- Bitx = Bit(1),
- Tx = Tbyte(1),
- Rune1 = (1 << (Bit(0)+0*Bitx)) - 1,
+#define Tx 0x80u // 0b10000000 transfer header
+#define TMask 0x3Fu // 0b00111111 transfer mask
- Maskx = (1 << Bitx) - 1, /* 0011 1111 */
- Testx = Maskx ^ 0xff, /* 1100 0000 */
+#define TByte1 0xC0u // 0b11000000
+#define TByte2 0xE0u // 0b11100000
+#define TByte3 0xF0u // 0b11110000
+#define TByte4 0xF8u // 0b11111000
- SurrogateMin = 0xD800,
- SurrogateMax = 0xDFFF,
- Bad = RuneErr,
-};
+/*
+ * exclusive upper bounds: a rune strictly below RuneNByte fits in N bytes.
+ * payload bits per length are 7, 5+6 = 11, 4+6+6 = 16 and 3+6+6+6 = 21.
+ */
+#define Rune1Byte 0x000080u // 1 << 7  (1 byte)
+#define Rune2Byte 0x000800u // 1 << 11 (2 bytes)
+#define Rune3Byte 0x010000u // 1 << 16 (3 bytes)
+#define Rune4Byte 0x200000u // 1 << 21 (4 bytes)
+
+#define RuneMax 0x10FFFFu
+#define RuneMask 0x1FFFFFu
+
+/* UTF-16 surrogate halves (U+D800..U+DFFF): not valid runes on their own */
+#define RuneSurrogateMin 0x00D800u
+#define RuneSurrogateMax 0x00DFFFu
diff --git a/sys/libunicode/len.c b/sys/libunicode/len.c
index c461bf3..8fbd679 100644
--- a/sys/libunicode/len.c
+++ b/sys/libunicode/len.c
@@ -10,11 +10,11 @@ utf8·len(char *s)
n = 0;
for(;;){
c = *(uchar*)s;
- if(c < RuneSelf) {
+ if(c < Tx){
if(c == 0)
return n;
s++;
- } else
+ }else
s += utf8·decode(s, &r);
n++;
}
diff --git a/sys/libunicode/rules.mk b/sys/libunicode/rules.mk
index 100697b..aca7348 100644
--- a/sys/libunicode/rules.mk
+++ b/sys/libunicode/rules.mk
@@ -5,6 +5,7 @@ UNICODE = 14.0.0
SRCS_$(d) := \
$(d)/encode.c \
$(d)/decode.c \
+ $(d)/decodeprev.c \
$(d)/find.c \
$(d)/findlast.c \
$(d)/canfit.c \
@@ -16,6 +17,7 @@ LIBS_$(d) := $(d)/libunicode.a
include share/paths.mk
+# rune categories
$(d)/vendor/UnicodeData-$(UNICODE).txt:
@echo "GET UnicodeData.txt";\
curl https://www.unicode.org/Public/$(UNICODE)/ucd/UnicodeData.txt > $@
@@ -23,10 +25,23 @@ $(d)/vendor/UnicodeData-$(UNICODE).txt:
$(d)/vendor/mkrunetype: $(d)/vendor/mkrunetype.c $(OBJ_DIR)/sys/base/base.a
$(COMPLINK)
+GENS += $(d)/vendor/mkrunetype
+
$(d)/runetype-$(UNICODE).c: $(d)/vendor/UnicodeData-$(UNICODE).txt $(d)/vendor/mkrunetype
@$(dir $@)vendor/mkrunetype $< > $@
-GENS += $(d)/vendor/mkrunetype
+# grapheme boundaries
+$(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt:
+ @echo "GET GraphemeBreakProperty.txt";\
+ curl https://www.unicode.org/Public/$(UNICODE)/ucd/auxiliary/GraphemeBreakProperty.txt > $@
+
+$(d)/vendor/mkgraphemedata: $(d)/vendor/mkgraphemedata.c $(OBJ_DIR)/sys/base/base.a
+ $(COMPLINK)
+
+$(d)/graphemedata-$(UNICODE).c: $(d)/vendor/mkgraphemedata $(d)/vendor/GraphemeBreakProperty-$(UNICODE).txt
+ $^ > $@
+
+GENS += $(d)/vendor/mkgraphemedata
$(LIBS_$(d)): $(OBJS_$(d))
$(ARCHIVE)
diff --git a/sys/libunicode/vendor/mkrunetype.c b/sys/libunicode/vendor/mkrunetype.c
index 0372e2b..fc4301c 100644
--- a/sys/libunicode/vendor/mkrunetype.c
+++ b/sys/libunicode/vendor/mkrunetype.c
@@ -418,7 +418,7 @@ static
void
usage(void)
{
- fprintf(stderr, "usage: mkutf8data <UnicodeData.txt>\n");
+ fprintf(stderr, "usage: mkrunetype <UnicodeData.txt>\n");
exit(1);
}
@@ -564,6 +564,7 @@ main(int argc, char *argv[])
makeis("upper", table.isupper, 1, 0);
makeis("lower", table.islower, 1, 0);
makeis("title", table.istitle, 1, 0);
+ makeis("punct", table.ispunct, 1, 0);
mkto("upper", table.toupper, 1, 0);
mkto("lower", table.tolower, 1, 0);