#include #include // NOTE: this utf8 bit is copied from libunicode to remove the hard dependency just for ARG_BEGIN. #define UTFmax 4 #define RuneSync 0x80u #define RuneSelf 0x80u #define RuneErr 0xFFFDu #define RuneMax 0x10FFFFu #define RuneMask 0x1FFFFFu #define Bit(i) (7-(i)) /* N 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */ #define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) /* 0000 0000 0000 0111 1111 1111 */ #define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) enum { Bitx = Bit(1), Tx = Tbyte(1), Rune1 = (1 << (Bit(0)+0*Bitx)) - 1, Maskx = (1 << Bitx) - 1, /* 0011 1111 */ Testx = Maskx ^ 0xff, /* 1100 0000 */ SurrogateMin = 0xD800, SurrogateMax = 0xDFFF, Bad = RuneErr, }; int arg·bytetorune(uint32* r, byte* s) { int c[4], i; uint32 l; c[0] = *(ubyte*)(s); if(c[0] < Tx) { *r = c[0]; return 1; } l = c[0]; for(i = 1; i < UTFmax; i++) { c[i] = *(ubyte*)(s+i); c[i] ^= Tx; if (c[i] & Testx) goto bad; l = (l << Bitx) | c[i]; if(c[0] < Tbyte(i + 2)) { l &= RuneX(i + 1); if (i == 1) { if (c[0] < Tbyte(2) || l <= Rune1) goto bad; } else if (l <= RuneX(i) || l > RuneMax) goto bad; if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) goto bad; *r = l; return i + 1; } } bad: *r = RuneErr; return 1; } char *argv0;