From 6318ddb69ad8380694bbca35ca5a72230c2ee694 Mon Sep 17 00:00:00 2001
From: Nicholas Noll
Date: Thu, 28 Oct 2021 11:55:47 -0700
Subject: Feat(libunicode): cleaned up decode/encode

Additionally, decode can now apply backwards on a byte string.
---
 sys/libunicode/internal.h | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

(limited to 'sys/libunicode/internal.h')

diff --git a/sys/libunicode/internal.h b/sys/libunicode/internal.h
index 31ed2ae..6b0657b 100644
--- a/sys/libunicode/internal.h
+++ b/sys/libunicode/internal.h
@@ -4,7 +4,9 @@
 #include
 #include
 
-/* UTF-8 code
+/*
+ * NOTE: we use the preprocessor to ensure we have unsigned constants.
+ * UTF-8 code:
  * 1 byte:
  * 0xxxxxxx
  * 2 byte:
@@ -14,22 +16,23 @@
  * 4 byte:
  * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  */
-#define Bit(i) (7-(i))
-/* 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */
-#define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
-/* 0000 0000 0000 0111 1111 1111 */
-#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
 
-enum
-{
-	Bitx = Bit(1),
-	Tx = Tbyte(1),
-	Rune1 = (1 << (Bit(0)+0*Bitx)) - 1,
+#define Tx 0x80u // 0b10000000 transfer header
+#define TMask 0x3Fu // 0b00111111 transfer mask
 
-	Maskx = (1 << Bitx) - 1, /* 0011 1111 */
-	Testx = Maskx ^ 0xff, /* 1100 0000 */
+#define TByte1 0xC0u // 0b11000000
+#define TByte2 0xE0u // 0b11100000
+#define TByte3 0xF0u // 0b11110000
+#define TByte4 0xF8u // 0b11111000
 
-	SurrogateMin = 0xD800,
-	SurrogateMax = 0xDFFF,
-	Bad = RuneErr,
-};
+#define Rune1Byte 0x000080u // 1 << 7  (1 byte:  7 payload bits, max 0x7F)
+#define Rune2Byte 0x000800u // 1 << 11 (2 bytes: 11 payload bits, max 0x7FF)
+#define Rune3Byte 0x010000u // 1 << 16 (3 bytes: 16 payload bits, max 0xFFFF)
+#define Rune4Byte 0x200000u // 1 << 21 (4 bytes: 21 payload bits, max 0x1FFFFF)
+
+#define RuneMax 0x10FFFFu
+#define RuneMask 0x1FFFFFu
+
+/* UTF-16 nonsense */
+#define RuneSurrogateMin 0x00D800u
+#define RuneSurrogateMax 0x00DFFFu
--
cgit v1.2.1