#include #define MAX_STRING_ALLOC 1024 * 1024 // ------------------------------------------------------------------------- // UTF-8 functions #define Bit(i) (7-(i)) /* N 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */ #define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) /* 0000 0000 0000 0111 1111 1111 */ #define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) enum { Bitx = Bit(1), Tx = Tbyte(1), Rune1 = (1 << (Bit(0)+0*Bitx)) - 1, Maskx = (1 << Bitx) - 1, /* 0011 1111 */ Testx = Maskx ^ 0xff, /* 1100 0000 */ SurrogateMin = 0xD800, SurrogateMax = 0xDFFF, Bad = RuneErr, }; int utf8·charToRune(Rune* r, byte* s) { int c[UTFmax], i; Rune l; c[0] = *(ubyte*)(s); if (c[0] < Tx) { *r = c[0]; return 1; } l = c[0]; for (i = 1; i < UTFmax; i++) { c[i] = *(ubyte*)(s+i); c[i] ^= Tx; if (c[i] & Testx) goto bad; l = (l << Bitx) | c[i]; if (c[0] < Tbyte(i + 2)) { l &= RuneX(i + 1); if (i == 1) { if (c[0] < Tbyte(2) || l <= Rune1) goto bad; } else if (l <= RuneX(i) || l > RuneMax) goto bad; if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) goto bad; *r = l; return i + 1; } } bad: *r = RuneErr; return 1; } int utf8·runeToChar(byte* s, Rune* r) { int i, j; Rune c; c = *r; if (c <= Rune1) { s[0] = c; return 1; } for (i = 2; i < UTFmax + 1; i++){ if (i == 3){ if (c > RuneMax) c = RuneErr; if (SurrogateMin <= c && c <= SurrogateMax) c = RuneErr; } if (c <= RuneX(i) || i == UTFmax) { s[0] = Tbyte(i) | (c >> (i - 1)*Bitx); for(j = 1; j < i; j++) s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); return i; } } return UTFmax; } int utf8·runeLen(Rune r) { byte s[10]; return utf8·runeToChar(s, &r); } int utf8·fullRune(byte* s, int n) { int i; Rune c; if (n <= 0) return 0; c = *(ubyte*) s; if (c < Tx) return 1; for (i = 3; i < UTFmax + 1; i++) { if (c < Tbyte(i)) return n >= i - 1; } return n >= UTFmax; } byte* utf8·findRune(byte* s, long c) { long c1; Rune r; int n; if (c < RuneSync) return strchr(s, c); for (;;) { c1 = *(ubyte*)s; if (c1 < RuneSelf) { if (c1 == 0) return nil; if (c1 == c) 
return s; s++; continue; } n = utf8·charToRune(&r, s); if (r == c) return s; s += n; } return nil; } #undef Bit #undef Tbyte #undef RuneX #include ".generated/utf8.c" // ------------------------------------------------------------------------- // Dynamic string functions // New returns a new dynamic string object, initialized from the given C string. // len defines the length of the C substring that we will copy into our buffer. // The backing buffer will have capacity cap. string str·newCap(const byte* s, vlong len, vlong cap) { struct str·Hdr* h; h = malloc(sizeof(*h) + cap + 1); if (s == nil) memset(h, 0, sizeof(*h)); if (h == nil) return nil; // Allocation failed. h->len = (s == nil) ? 0 : len; h->cap = cap; if (cap < h->len) goto cleanup; if (s != nil && cap > 0) { memcpy(h->buf, s, h->len); memset(h->buf + h->len, '\0', h->cap - h->len + 1); } return h->buf; cleanup: free(h); panicf("Attempted to create a string with less capacity than length"); } // New returns a new dynamic string object, initialized from the given C string. // The backing buffer capacity is equivalent to the string length. string str·newLen(const byte* s, vlong len) { vlong sl = (s == nil) ? 0 : strlen(s); if (sl < len) panicf("attempted to take a bigger substring than string length"); vlong cap = (len == 0) ? 1 : len; return str·newCap(s, len, cap); } // New returns a new dynamic string object, initialized from the given C string. // The backing buffer capacity is equivalent to the string length. string str·new(const byte* s) { vlong len = (s == nil) ? 0 : strlen(s); return str·newLen(s, len); } // Newf returns a new dynamic string object string str·newf(const byte* fmt, ...) { va_list args; va_start(args, fmt); vlong n = vsnprintf(nil, 0, fmt, args); va_end(args); string s = str·newCap(nil, 0, n); va_start(args, fmt); vsnprintf(s, n + 1, fmt, args); va_end(args); str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr)); h->len = n; return s; } // Free returns memory associated to the buffer. 
void str·free(string s) { free(s - sizeof(str·Hdr)); } // Len returns the length of the string. int str·len(const string s) { str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr)); return h->len; } // Cap returns the capacity of the string buffer. int str·cap(const string s) { str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr)); return h->cap; } string str·clear(string s) { str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr)); h->len = 0; *s = 0; return s; } // Grow ensures that the string can encompass AT LEAST delta bytes. // If it already can, this is a NO OP. // If it can't, the string will be reallocated. string str·grow(string s, vlong delta) { str·Hdr *h, *newh; vlong cap = str·cap(s); vlong len = str·len(s); Assert(cap >= len); // To prevent unsigned behavior if (cap - len >= delta) return s; h = (str·Hdr*)(s - sizeof(str·Hdr)); vlong newCap = cap + delta; Assert(newCap >= cap); // To prevent unsigned behavior if (newCap < MAX_STRING_ALLOC) { newCap *= 2; } else newCap += MAX_STRING_ALLOC; newh = (str·Hdr*)realloc(h, sizeof(*h) + newCap + 1); if (newh == nil) return nil; memset(newh->buf + len, '\0', newCap - len); newh->cap = newCap; newh->len = len; return newh->buf; } // Fit reallocates the string such that the buffer is exactly sized for the // buffer. If the capacity equals the length, then the function is a NOOP. The // byte array is unchanged. string str·fit(string s) { str·Hdr* h; vlong cap = str·cap(s); vlong len = str·len(s); if (cap == len) return s; h = (str·Hdr*)(s - sizeof(str·Hdr)); h = realloc(h, sizeof(*h) + len + 1); h->cap = len; return h->buf; } // Append will append the given null terminated C string to the string data // structure. This variant can append a substring of length len of the given // string to our buffer. The result is reallocated if not enough room is present // in the buffer. 
string str·appendCount(string s, vlong n, const byte* b) { vlong bl = strlen(b); if (n > bl) panicf("attempted to make a substring longer than string"); s = str·grow(s, n); if (s == nil) return nil; str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr)); memcpy(s + str·len(s), b, n); h->len += n; s[h->len] = '\0'; return s; } // Append will append the given null terminated C string to the string data // structure. This variant will append the entire string. string str·append(string s, const byte* b) { return str·appendCount(s, strlen(b), b); } // AppendByte will append the given byte to our string. // NOTE: As the byte is on the stack, it is not null-terminated. // Can not pass to the above functions. string str·appendByte(string s, const byte b) { s = str·grow(s, 1); if (s == nil) return nil; str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr)); *(s + str·len(s)) = b; h->len++; s[h->len] = '\0'; // NOTE: I don't think an explicit zero is required..? return s; } // Equals returns true if string s and t are equivalent. bool str·equals(const string s, const string t) { vlong sL = str·len(s); vlong tL = str·len(t); if (sL != tL) return false; return memcmp(s, t, sL) == 0; } //------------------------------------------------------------------------ // Utility Methods /* * Appendf will append the given formatted string to our buffer. * Returns the newly minted string */ string str·appendf(string s, const byte* fmt, ...) { va_list args; va_start(args, fmt); int remain = str·cap(s) - str·len(s); int n = vsnprintf(s + str·len(s), remain + 1, fmt, args); va_end(args); if (n > remain) { // If the first write was incomplete, we overwite the data again. s = str·grow(s, n); va_list args; va_start(args, fmt); n = vsnprintf(s + str·len(s), n + 1, fmt, args); Assert(n - remain <= str·cap(s)); va_end(args); } str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr)); h->len += n; return s; } // Find will find the first occurence of // substr in the string Returns -1 if nothing was found. 
int
str·find(string s, const byte* substr)
{
    // The result is the byte offset of the match from the start of s.
    byte* loc = strstr(s, substr);
    if (loc == nil) return -1;
    return (int)(loc - s);
}

//
// Lower will force all bytes in the string to be lowercase, in place.
// Conversion is per byte via tolower, so only single-byte characters known to
// the C locale are affected.
void
str·Lower(string s)
{
    byte *b, *e;
    b = s;
    e = b + str·len(s);
    // Visit every byte in [b, e); the cast keeps negative byte values out of
    // tolower, whose argument must be representable as unsigned char.
    for (; b != e; b++)
        *b = tolower((ubyte)*b);
}

// Upper will force all bytes in the string to be uppercase, in place.
// Conversion is per byte via toupper, so only single-byte characters known to
// the C locale are affected.
void
str·upper(string s)
{
    byte *b, *e;
    b = s;
    e = b + str·len(s);
    // Visit every byte in [b, e); the cast keeps negative byte values out of
    // toupper, whose argument must be representable as unsigned char.
    for (; b != e; b++)
        *b = toupper((ubyte)*b);
}

// Replace translates bytes in place: every byte of s equal to from[j] is
// replaced by to[j], using the first j that matches. This is a per-byte
// mapping, not a substring replacement.
// NOTE: As of now strings from and to must be the same size; panics otherwise.
void
str·replace(string s, const byte* from, const byte* to)
{
    vlong fromL = strlen(from);
    vlong toL   = strlen(to);
    if (toL != fromL) {
        panicf("different sized replacement string not supported");
    }

    vlong l = str·len(s);
    for (vlong i = 0; i < l; i++) {
        for (vlong j = 0; j < toL; j++) {
            if (s[i] == from[j]) {
                s[i] = to[j];
                break; // One translation per byte.
            }
        }
    }
}

// Split will split the string by the given token.
// Returns a stretchy buffer of strings that result from the partition.
// It is the caller's responsibility to clean the memory.
string*
str·split(string s, const byte* tok)
{
    string* fields = nil;
    vlong   start  = 0;

    vlong sL   = str·len(s);
    vlong tokL = strlen(tok);
    // An empty string or empty token yields no fields at all.
    if (sL == 0 || tokL == 0) return nil;

    buffit(fields, 5);

    // Consider every position where a whole token still fits, including the
    // very end of the string (which produces a trailing empty field).
    for (vlong i = 0; i + tokL <= sL; i++) {
        if (memcmp(s + i, tok, tokL) == 0) {
            bufpush(fields, str·newLen(s + start, i - start));
            if (fields[buflen(fields) - 1] == nil) goto cleanup;

            start = i + tokL;
            i += tokL - 1; // Skip the rest of the token; the loop adds 1.
        }
    }

    // Whatever follows the last token is the final field.
    bufpush(fields, str·newLen(s + start, sL - start));
    if (fields[buflen(fields) - 1] == nil) goto cleanup;

    return fields;

cleanup:
    // The entry that failed to allocate is nil; skip it.
    for (vlong i = 0; i < buflen(fields); i++) {
        if (fields[i] != nil) str·free(fields[i]);
    }
    buffree(fields);
    return nil;
}

// Join concatenates the len C strings in fields, placing sep between
// consecutive elements. Returns a new dynamic string (empty when len <= 0),
// or nil if an allocation fails. The caller owns the result.
string
str·join(vlong len, byte** fields, const byte* sep)
{
    string s = str·newCap(nil, 0, 10);
    if (s == nil) return nil;
    s[0] = '\0'; // Valid empty C string even if nothing gets appended.

    for (vlong j = 0; j < len; j++) {
        string next = str·append(s, fields[j]);
        if (next == nil) goto fail;
        s = next;

        if (j < len - 1) {
            // Append the whole separator, not just its first byte.
            next = str·append(s, sep);
            if (next == nil) goto fail;
            s = next;
        }
    }

    return s;

fail:
    // A failed append leaves the previous buffer valid; release it.
    str·free(s);
    return nil;
}