From c8e1e71eb526475dd431443345262c2e5a627831 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Sat, 23 Oct 2021 11:17:25 -0700 Subject: chore(rename): libn -> base --- sys/base/string.c | 560 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 560 insertions(+) create mode 100644 sys/base/string.c (limited to 'sys/base/string.c') diff --git a/sys/base/string.c b/sys/base/string.c new file mode 100644 index 0000000..8973a4e --- /dev/null +++ b/sys/base/string.c @@ -0,0 +1,560 @@ +#include +#include + +#define MAX_STRING_ALLOC 1024 * 1024 + +typedef struct Hdr +{ + vlong len; + vlong cap; + byte buf[]; +} Hdr; + +// ------------------------------------------------------------------------- +// UTF-8 functions + +#define Bit(i) (7-(i)) +/* N 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */ +#define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF) +/* 0000 0000 0000 0111 1111 1111 */ +#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1) + +enum +{ + Bitx = Bit(1), + Tx = Tbyte(1), + Rune1 = (1 << (Bit(0)+0*Bitx)) - 1, + + Maskx = (1 << Bitx) - 1, /* 0011 1111 */ + Testx = Maskx ^ 0xff, /* 1100 0000 */ + + SurrogateMin = 0xD800, + SurrogateMax = 0xDFFF, + Bad = RuneErr, +}; + +int +utf8·bytetorune(rune* r, byte* s) +{ + int c[UTFmax], i; + rune l; + + c[0] = *(ubyte*)(s); + if(c[0] < Tx) { + *r = c[0]; + return 1; + } + + l = c[0]; + for(i = 1; i < UTFmax; i++) { + c[i] = *(ubyte*)(s+i); + c[i] ^= Tx; + if (c[i] & Testx) goto bad; + + l = (l << Bitx) | c[i]; + if(c[0] < Tbyte(i + 2)) { + l &= RuneX(i + 1); + if (i == 1) { + if (c[0] < Tbyte(2) || l <= Rune1) + goto bad; + } else if (l <= RuneX(i) || l > RuneMax) + goto bad; + if (i == 2 && SurrogateMin <= l && l <= SurrogateMax) + goto bad; + + *r = l; + return i + 1; + } + } +bad: + *r = RuneErr; + return 1; +} + +int +utf8·runetobyte(byte* s, rune* r) +{ + int i, j; + rune c; + + c = *r; + if (c <= Rune1) { + s[0] = c; + return 1; + } + + for (i = 2; i < UTFmax + 1; i++){ + if (i == 3){ + if (c > RuneMax) + c = RuneErr; + if (SurrogateMin <= c && c <= SurrogateMax) + c = RuneErr; + } + if (c <= RuneX(i) || i == UTFmax) { + s[0] = Tbyte(i) | (c >> (i - 1)*Bitx); + for(j = 1; j < i; j++) + s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx); + return i; + } + } + + return UTFmax; +} + +int +utf8·runelen(rune r) +{ + byte s[10]; + return utf8·runetobyte(s, &r); +} + +int +utf8·fullrune(byte* s, int n) +{ + int i; + rune c; + + if (n <= 0) return 0; + c = *(ubyte*) s; + if (c < Tx) return 1; + + for (i = 3; i < UTFmax + 1; i++) { + if (c < Tbyte(i)) return n >= i - 1; + } + + return n >= UTFmax; +} + +byte* +utf8·findrune(byte* s, long c) +{ + long c1; + rune r; + int n; + + if (c < RuneSync) return strchr(s, c); + + for (;;) { + c1 = *(ubyte*)s; + if (c1 < RuneSelf) { + if (c1 == 0) return nil; + if (c1 == c) return s; + s++; + continue; + } + n = utf8·bytetorune(&r, s); + if (r == c) return s; + s += n; + } + + return nil; +} + +byte* +utf8·findrrune(byte* s, long c) +{ + long c1; + rune r; + byte *l; + + if (c < RuneSync) + return strrchr(s, c); + + l = nil; + for (;;) { + c1 = *(ubyte*)s; + if (c1 < RuneSelf) { + if (c1 == 0) return l; + if (c1 == c) l = s; + s++; + continue; + } + c1 = utf8·bytetorune(&r, s); + if (r == c) + l = s; + s += c1; + } + + return nil; +} + +#undef Bit +#undef Tbyte +#undef RuneX + +#include ".generated/utf8.c" + +// ------------------------------------------------------------------------- +// Dynamic string functions + +// New returns a new dynamic string object, initialized from the given C string. +// len defines the length of the C substring that we will copy into our buffer. +// The backing buffer will have capacity cap. +string +str·makecap(const byte *s, vlong len, vlong cap) +{ + struct Hdr* h; + + h = malloc(sizeof(*h) + cap + 1); + if (s == nil) memset(h, 0, sizeof(*h)); + + if (h == nil) return nil; // Allocation failed. + + h->len = (s == nil) ? 0 : len; + h->cap = cap; + + if (cap < h->len) goto cleanup; + + if (s != nil && cap > 0) { + memcpy(h->buf, s, h->len); + memset(h->buf + h->len, '\0', h->cap - h->len + 1); + } + + return h->buf; + +cleanup: + free(h); + panicf("Attempted to create a string with less capacity than length"); + return nil; +} + +// New returns a new dynamic string object, initialized from the given C string. +// The backing buffer capacity is equivalent to the string length. +string +str·makelen(const byte *s, vlong len) +{ + vlong sl = (!s) ? 0 : strlen(s); + if (sl < len) panicf("attempted to take a bigger substring than string length"); + + vlong cap = (len == 0) ? 1 : len; + return str·makecap(s, len, cap); +} + +// New returns a new dynamic string object, initialized from the given C string. +// The backing buffer capacity is equivalent to the string length. +string +str·make(const byte *s) +{ + vlong len = (!s) ? 0 : strlen(s); + return str·makelen(s, len); +} + +// Newf returns a new dynamic string object +string +str·makef(const byte *fmt, ...) +{ + vlong n; + string s; + va_list args; + + va_start(args, fmt); + n = vsnprintf(nil, 0, fmt, args); + va_end(args); + + s = str·makecap(nil, 0, n); + + va_start(args, fmt); + vsnprintf(s, n + 1, fmt, args); + va_end(args); + + Hdr* h = (Hdr*)(s - sizeof(Hdr)); + h->len = n; + + return s; +} + +// Free returns memory associated to the buffer. +void +str·free(string s) +{ + free(s - sizeof(Hdr)); +} + +// Len returns the length of the string. +int +str·len(const string s) +{ + Hdr* h = (Hdr*)(s - sizeof(Hdr)); + return h->len; +} + +// Cap returns the capacity of the string buffer. +int +str·cap(const string s) +{ + Hdr* h = (Hdr*)(s - sizeof(Hdr)); + return h->cap; +} + +void +str·clear(string *s) +{ + Hdr* h = (Hdr*)(*s - sizeof(Hdr)); + h->len = 0; + *s[0] = '\0'; +} + +// Grow ensures that the string can encompass AT LEAST delta bytes. +// If it already can, this is a NO OP. +// If it can't, the string will be reallocated. +void +str·grow(string *s, vlong delta) +{ + Hdr *h, *newh; + vlong cap = str·cap(*s); + vlong len = str·len(*s); + assert(cap >= len); // To prevent unsigned behavior + + if (cap - len >= delta) return; + + h = (Hdr*)(*s - sizeof(Hdr)); + + vlong newCap = cap + delta; + assert(newCap >= cap); // To prevent unsigned behavior + if (newCap < MAX_STRING_ALLOC) { + newCap *= 2; + } else + newCap += MAX_STRING_ALLOC; + + newh = (Hdr*)realloc(h, sizeof(*h) + newCap + 1); + if (newh == nil) return; + + memset(newh->buf + len, '\0', newCap - len); + newh->cap = newCap; + newh->len = len; + + *s = newh->buf; +} + +// Fit reallocates the string such that the buffer is exactly sized for the +// buffer. If the capacity equals the length, then the function is a NOOP. The +// byte array is unchanged. +void +str·fit(string *s) +{ + Hdr* h; + vlong cap = str·cap(*s); + vlong len = str·len(*s); + + if (cap == len) return; + + h = (Hdr*)(s - sizeof(Hdr)); + h = realloc(h, sizeof(*h) + len + 1); + h->cap = len; + + *s = h->buf; +} + +// Append will append the given null terminated C string to the string data +// structure. This variant can append a substring of length len of the given +// string to our buffer. The result is reallocated if not enough room is present +// in the buffer. +int +str·appendlen(string *s, vlong n, const byte* b) +{ + /* + bl = strlen(b); + if (n > bl) panicf("attempted to make a substring longer than string"); + */ + + str·grow(s, n); + if (*s == nil) return 0; + + Hdr* h = (Hdr*)(*s - sizeof(Hdr)); + + memcpy(*s + str·len(*s), b, n); + h->len += n; + (*s)[h->len] = '\0'; + + return n; +} + +// Append will append the given null terminated C string to the string data +// structure. This variant will append the entire string. +int +str·append(string *s, const byte* b) +{ + return str·appendlen(s, strlen(b), b); +} + +// AppendByte will append the given byte to our string. +// NOTE: As the byte is on the stack, it is not null-terminated. +// Can not pass to the above functions. +int +str·appendbyte(string *s, const byte b) +{ + str·grow(s, 1); + if (*s == nil) return 0; + + Hdr* h = (Hdr*)(*s - sizeof(Hdr)); + + *(*s + str·len(*s)) = b; + h->len++; + (*s)[h->len] = '\0'; // NOTE: I don't think an explicit zero is required..? + + return 1; +} + +/* + * Appendf will append the given formatted string to our buffer. + * Returns the newly minted string + */ + +int +str·appendf(string *s, const byte* fmt, ...) +{ + va_list args; + va_start(args, fmt); + int remain = str·cap(*s) - str·len(*s); + int n = vsnprintf(*s + str·len(*s), remain + 1, fmt, args); + va_end(args); + + if (n > remain) { + // If the first write was incomplete, we overwite the data again. + str·grow(s, n); + va_list args; + va_start(args, fmt); + n = vsnprintf(*s + str·len(*s), n + 1, fmt, args); + assert(n - remain <= str·cap(*s)); + va_end(args); + } + + Hdr* h = (Hdr*)(*s - sizeof(Hdr)); + h->len += n; + + return n; +} + +// Equals returns true if string s and t are equivalent. +bool +str·equals(const string s, const string t) +{ + vlong sL = str·len(s); + vlong tL = str·len(t); + if (sL != tL) return false; + + return memcmp(s, t, sL) == 0; +} + +//------------------------------------------------------------------------ +// Utility Methods + +int +str·read(string s, int size, int n, void *buf) +{ + int len; + + len = MIN(n * size, str·len(s)); + memcpy(buf, s, len); + + return len; +} + +// Find will find the first occurence of +// substr in the string Returns -1 if nothing was found. +int +str·find(string s, const byte* substr) +{ + byte* loc = strstr(s, substr); + if (loc == nil) return -1; + return (int)(loc - s); +} + +// +// Lower will force all runes in the string to be lowercase. +void +str·lower(string s) +{ + byte *b, *e; + b = s; + e = b + str·len(s); + while (b++ != e) + *b = tolower(*b); +} + +// Upper will force all runes in the string to be uppercase. +void +str·upper(string s) +{ + byte *b, *e; + b = s; + e = b + str·len(s); + while (b++ != e) + *b = toupper(*b); +} + +// Replace will replace all occurences of the given bytes 'from' to bytes 'to' +// Edits are done in place and modify the string. +// NOTE: As of now strings from and to must be the same size. +void +str·replace(string s, const byte* from, const byte* to) +{ + vlong fromL = strlen(from); + vlong toL = strlen(to); + if (toL != fromL) { panicf("different sized replacement string not supported"); } + + vlong l = str·len(s); + vlong i = l; + vlong j = l; + + for (i = 0; i < l; i++) { + for (j = 0; j < toL; j++) { + if (s[i] == from[j]) { + s[i] = to[j]; + break; + } + } + } +} + +// Split will split the string by the given token. +// Returns a stretchy buffer of strings that result from the partition. +// It is the caller's responsibility to clean the memory. +string* +str·split(string s, const byte* tok) +{ + string* fields = nil; + vlong start = 0; + + vlong sL = str·len(s); + vlong tokL = strlen(tok); + if (sL == 0 || tokL == 0) return nil; + + buffit(fields, 5); + + for (vlong i = 0; i < sL - tokL; i++) { + if ((tokL == 1 && s[i] == tokL) || !memcmp(s + i, tok, tokL)) { + bufpush(fields, str·makelen(s + start, i - start)); + if (fields[buflen(fields) - 1] == nil) goto cleanup; + + start = i + tokL; + i += tokL - 1; + } + } + + bufpush(fields, str·makelen(s + start, sL - start)); + + return fields; + +cleanup: + for (vlong i = 0; i < buflen(fields); i++) { + str·free(fields[i]); + } + buffree(fields); + return nil; +} + +string +str·join(vlong len, byte** fields, const byte* sep) +{ + string s = str·makecap("", 0, 10); + int j = 0; + + for (j = 0; j < len; j++) { + str·append(&s, fields[j]); + if (j < len - 1) + str·appendlen(&s, 1, sep); + } + + return s; +} -- cgit v1.2.1