From 29b56ef4e4113bcd091b19d6926f18814162ca53 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 26 Oct 2021 21:01:41 -0700 Subject: Feat(libunicode): Added an explicit unicode library Refactored code to pull out utf8 functions from base into a standalone library. Also left the required function inside arg.c so that code that calls ARG_BEGIN doesn't have to link to libunicode. --- sys/base/string.c | 174 ------------------------------------------------------ 1 file changed, 174 deletions(-) (limited to 'sys/base/string.c') diff --git a/sys/base/string.c b/sys/base/string.c index 8973a4e..1ef9068 100644 --- a/sys/base/string.c +++ b/sys/base/string.c @@ -10,180 +10,6 @@ typedef struct Hdr byte buf[]; } Hdr; -// ------------------------------------------------------------------------- -// UTF-8 functions - -#define Bit(i) (7-(i)) -/* N 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */ -#define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) -/* 0000 0000 0000 0111 1111 1111 */ -#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) - -enum -{ - Bitx = Bit(1), - Tx = Tbyte(1), - Rune1 = (1 << (Bit(0)+0*Bitx)) - 1, - - Maskx = (1 << Bitx) - 1, /* 0011 1111 */ - Testx = Maskx ^ 0xff, /* 1100 0000 */ - - SurrogateMin = 0xD800, - SurrogateMax = 0xDFFF, - Bad = RuneErr, -}; - -int -utf8·bytetorune(rune* r, byte* s) -{ - int c[UTFmax], i; - rune l; - - c[0] = *(ubyte*)(s); - if(c[0] < Tx) { - *r = c[0]; - return 1; - } - - l = c[0]; - for(i = 1; i < UTFmax; i++) { - c[i] = *(ubyte*)(s+i); - c[i] ^= Tx; - if (c[i] & Testx) goto bad; - - l = (l << Bitx) | c[i]; - if(c[0] < Tbyte(i + 2)) { - l &= RuneX(i + 1); - if (i == 1) { - if (c[0] < Tbyte(2) || l <= Rune1) - goto bad; - } else if (l <= RuneX(i) || l > RuneMax) - goto bad; - if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) - goto bad; - - *r = l; - return i + 1; - } - } -bad: - *r = RuneErr; - return 1; -} - -int -utf8·runetobyte(byte* s, rune* r) -{ - int i, j; - rune c; - - c = *r; - if (c <= Rune1) { - s[0] = c; - return 1; - } - - for (i = 2; i < UTFmax + 1; i++){ - if (i == 3){ - if (c > RuneMax) - c = RuneErr; - if (SurrogateMin <= c && c <= SurrogateMax) - c = RuneErr; - } - if (c <= RuneX(i) || i == UTFmax) { - s[0] = Tbyte(i) | (c >> (i - 1)*Bitx); - for(j = 1; j < i; j++) - s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); - return i; - } - } - - return UTFmax; -} - -int -utf8·runelen(rune r) -{ - byte s[10]; - return utf8·runetobyte(s, &r); -} - -int -utf8·fullrune(byte* s, int n) -{ - int i; - rune c; - - if (n <= 0) return 0; - c = *(ubyte*) s; - if (c < Tx) return 1; - - for (i = 3; i < UTFmax + 1; i++) { - if (c < Tbyte(i)) return n >= i - 1; - } - - return n >= UTFmax; -} - -byte* -utf8·findrune(byte* s, long c) -{ - long c1; - rune r; - int n; - - if (c < RuneSync) return strchr(s, c); - - for (;;) { - c1 = *(ubyte*)s; - if (c1 < RuneSelf) { - if (c1 == 0) return nil; - if (c1 == c) return s; - s++; - continue; - } - n = utf8·bytetorune(&r, s); - if (r == c) return s; - s += n; - } - - return nil; -} - -byte* -utf8·findrrune(byte* s, long c) -{ - long c1; - rune r; - byte *l; - - if (c < RuneSync) - return strrchr(s, c); - - l = nil; - for (;;) { - c1 = *(ubyte*)s; - if (c1 < RuneSelf) { - if (c1 == 0) return l; - if (c1 == c) l = s; - s++; - continue; - } - c1 = utf8·bytetorune(&r, s); - if (r == c) - l = s; - s += c1; - } - - return nil; -} - -#undef Bit -#undef Tbyte -#undef RuneX - -#include ".generated/utf8.c" - // ------------------------------------------------------------------------- // Dynamic string functions -- cgit v1.2.1