aboutsummaryrefslogtreecommitdiff
path: root/src/str.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/str.c')
-rw-r--r--src/str.c504
1 files changed, 504 insertions, 0 deletions
diff --git a/src/str.c b/src/str.c
new file mode 100644
index 0000000..9aa29b3
--- /dev/null
+++ b/src/str.c
@@ -0,0 +1,504 @@
+#include <u.h>
+
+#define MAX_STRING_ALLOC 1024 * 1024
+
+// -------------------------------------------------------------------------
+// UTF-8 functions
+
+#define Bit(i) (7-(i))
+/* N 0's preceded by i 1's e.g. T(Bit(2)) is 1100 0000 */
+#define Tbyte(i) (((1 << (Bit(i)+1))-1) ^ 0xFF)
+/* 0000 0000 0000 0111 1111 1111 */
+#define RuneX(i) ((1 << (Bit(i) + ((i)-1)*Bitx))-1)
+
+enum
+{
+ Bitx = Bit(1),
+ Tx = Tbyte(1),
+ Rune1 = (1 << (Bit(0)+0*Bitx)) - 1,
+
+ Maskx = (1 << Bitx) - 1, /* 0011 1111 */
+ Testx = Maskx ^ 0xff, /* 1100 0000 */
+
+ SurrogateMin = 0xD800,
+ SurrogateMax = 0xDFFF,
+ Bad = RuneErr,
+};
+
+int
+utf8·CharToRune(Rune* r, byte* s)
+{
+ int c[UTFmax], i;
+ Rune l;
+
+ c[0] = *(ubyte*)(s);
+ if (c[0] < Tx) {
+ *r = c[0];
+ return 1;
+ }
+
+ l = c[0];
+ for (i = 1; i < UTFmax; i++) {
+ c[i] = *(ubyte*)(s+i);
+ c[i] ^= Tx;
+ if (c[i] & Testx) goto bad;
+
+ l = (l << Bitx) | c[i];
+ if (c[0] < Tbyte(i + 2)) {
+ l &= RuneX(i + 1);
+ if (i == 1) {
+ if (c[0] < Tbyte(2) || l <= Rune1)
+ goto bad;
+ } else if (l <= RuneX(i) || l > RuneMax)
+ goto bad;
+ if (i == 2 && SurrogateMin <= l && l <= SurrogateMax)
+ goto bad;
+
+ *r = l;
+ return i + 1;
+ }
+ }
+bad:
+ *r = RuneErr;
+ return 1;
+}
+
+int
+utf8·RuneToChar(byte* s, Rune* r)
+{
+ int i, j;
+ Rune c;
+
+ c = *r;
+ if (c <= Rune1) {
+ s[0] = c;
+ return 1;
+ }
+
+ for (i = 2; i < UTFmax + 1; i++){
+ if (i == 3){
+ if (c > RuneMax)
+ c = RuneErr;
+ if (SurrogateMin <= c && c <= SurrogateMax)
+ c = RuneErr;
+ }
+ if (c <= RuneX(i) || i == UTFmax) {
+ s[0] = Tbyte(i) | (c >> (i - 1)*Bitx);
+ for(j = 1; j < i; j++)
+ s[j] = Tx | ((c >> (i - j - 1)*Bitx) & Maskx);
+ return i;
+ }
+ }
+
+ return UTFmax;
+}
+
+int
+utf8·RuneLen(Rune r)
+{
+ byte s[10];
+ return utf8·RuneToChar(s, &r);
+}
+
+int
+utf8·FullRune(byte* s, int n)
+{
+ int i;
+ Rune c;
+
+ if (n <= 0) return 0;
+ c = *(ubyte*) s;
+ if (c < Tx) return 1;
+
+ for (i = 3; i < UTFmax + 1; i++) {
+ if (c < Tbyte(i)) return n >= i - 1;
+ }
+
+ return n >= UTFmax;
+}
+
+byte*
+utf8·FindRune(byte* s, long c)
+{
+ long c1;
+ Rune r;
+ int n;
+
+ if (c < RuneSync) return strchr(s, c);
+
+ for (;;) {
+ c1 = *(ubyte*)s;
+ if (c1 < RuneSelf) {
+ if (c1 == 0) return nil;
+ if (c1 == c) return s;
+ s++;
+ continue;
+ }
+ n = utf8·CharToRune(&r, s);
+ if (r == c) return s;
+ s += n;
+ }
+
+ return nil;
+}
+
+#undef Bit
+#undef Tbyte
+#undef RuneX
+
+#include "generated/utf8.c"
+
+// -------------------------------------------------------------------------
+// Dynamic string functions
+
+// New returns a new dynamic string object, initialized from the given C string.
+// len defines the length of the C substring that we will copy into our buffer.
+// The backing buffer will have capacity cap.
+string
+str·NewCap(const byte* s, vlong len, vlong cap)
+{
+ struct str·Hdr* h;
+
+ h = malloc(sizeof(*h) + cap + 1);
+ if (s == nil) memset(h, 0, sizeof(*h));
+
+ if (h == nil) return nil; // Allocation failed.
+
+ h->len = (s == nil) ? 0 : len;
+ h->cap = cap;
+
+ if (cap < h->len) goto cleanup;
+
+ if (s != nil && cap > 0) {
+ memcpy(h->buf, s, h->len);
+ memset(h->buf + h->len, '\0', h->cap - h->len + 1);
+ }
+
+ return h->buf;
+
+cleanup:
+ free(h);
+ panicf("Attempted to create a string with less capacity than length");
+}
+
+// New returns a new dynamic string object, initialized from the given C string.
+// The backing buffer capacity is equivalent to the string length.
+string
+str·NewLen(const byte* s, vlong len)
+{
+ vlong sl = (s == nil) ? 0 : strlen(s);
+ if (sl < len) panicf("attempted to take a bigger substring than string length");
+
+ vlong cap = (len == 0) ? 1 : len;
+ return str·NewCap(s, len, cap);
+}
+
+// New returns a new dynamic string object, initialized from the given C string.
+// The backing buffer capacity is equivalent to the string length.
+string
+str·New(const byte* s)
+{
+ vlong len = (s == nil) ? 0 : strlen(s);
+ return str·NewLen(s, len);
+}
+
+// Newf returns a new dynamic string object
+string
+str·Newf(const byte* fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ vlong n = vsnprintf(nil, 0, fmt, args);
+ va_end(args);
+
+ string s = str·NewCap(nil, 0, n);
+
+ va_start(args, fmt);
+ vsnprintf(s, n + 1, fmt, args);
+ va_end(args);
+
+ str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr));
+ h->len = n;
+
+ return s;
+}
+
+// Free returns memory associated to the buffer.
+void
+str·Free(string s)
+{
+ free(s - sizeof(str·Hdr));
+}
+
+// Len returns the length of the string.
+int
+str·Len(const string s)
+{
+ str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr));
+ return h->len;
+}
+
+// Cap returns the capacity of the string buffer.
+int
+str·Cap(const string s)
+{
+ str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr));
+ return h->cap;
+}
+
+string
+str·Clear(string s)
+{
+ str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr));
+ h->len = 0;
+ *s = 0;
+
+ return s;
+}
+
+// Grow ensures that the string can encompass AT LEAST delta bytes.
+// If it already can, this is a NO OP.
+// If it can't, the string will be reallocated.
+string
+str·Grow(string s, vlong delta)
+{
+ str·Hdr *h, *newh;
+ vlong cap = str·Cap(s);
+ vlong len = str·Len(s);
+ Assert(cap >= len); // To prevent unsigned behavior
+
+ if (cap - len >= delta) return s;
+
+ h = (str·Hdr*)(s - sizeof(str·Hdr));
+
+ vlong newCap = cap + delta;
+ Assert(newCap >= cap); // To prevent unsigned behavior
+ if (newCap < MAX_STRING_ALLOC) {
+ newCap *= 2;
+ } else
+ newCap += MAX_STRING_ALLOC;
+
+ newh = (str·Hdr*)realloc(h, sizeof(*h) + newCap + 1);
+ if (newh == nil) return nil;
+
+ memset(newh->buf + len, '\0', newCap - len);
+ newh->cap = newCap;
+ newh->len = len;
+
+ return newh->buf;
+}
+
+// Fit reallocates the string such that the buffer is exactly sized for the
+// buffer. If the capacity equals the length, then the function is a NOOP. The
+// byte array is unchanged.
+string
+str·Fit(string s)
+{
+ str·Hdr* h;
+ vlong cap = str·Cap(s);
+ vlong len = str·Len(s);
+
+ if (cap == len) return s;
+
+ h = (str·Hdr*)(s - sizeof(str·Hdr));
+ h = realloc(h, sizeof(*h) + len + 1);
+ h->cap = len;
+
+ return h->buf;
+}
+
+// Append will append the given null terminated C string to the string data
+// structure. This variant can append a substring of length len of the given
+// string to our buffer. The result is reallocated if not enough room is present
+// in the buffer.
+string
+str·AppendCount(string s, const byte* b, vlong n)
+{
+ vlong bl = strlen(b);
+ if (n > bl) panicf("attempted to make a substring longer than string");
+
+ s = str·Grow(s, n);
+ if (s == nil) return nil;
+
+ str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr));
+
+ memcpy(s + str·Len(s), b, n);
+ h->len += n;
+ s[h->len] = '\0';
+
+ return s;
+}
+
+// Append will append the given null terminated C string to the string data
+// structure. This variant will append the entire string.
+string
+str·Append(string s, const byte* b)
+{
+ return str·AppendCount(s, b, strlen(b));
+}
+
+// AppendByte will append the given byte to our string.
+// NOTE: As the byte is on the stack, it is not null-terminated.
+// Can not pass to the above functions.
+string
+str·AppendByte(string s, const byte b)
+{
+ s = str·Grow(s, 1);
+ if (s == nil) return nil;
+
+ str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr));
+
+ *(s + str·Len(s)) = b;
+ h->len++;
+ s[h->len] = '\0'; // NOTE: I don't think an explicit zero is required..?
+
+ return s;
+}
+
+// Equals returns true if string s and t are equivalent.
+bool
+str·Equals(const string s, const string t)
+{
+ vlong sL = str·Len(s);
+ vlong tL = str·Len(t);
+ if (sL != tL) return false;
+
+ return memcmp(s, t, sL) == 0;
+}
+
+// Utility Methods ------------------------------------
+//
+// Appendf will append the given formatted string to our buffer.
+// Returns the newly minted string.
+string
+str·Appendf(string s, const byte* fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ int remain = str·Cap(s) - str·Len(s);
+ int n = vsnprintf(s + str·Len(s), remain + 1, fmt, args);
+ va_end(args);
+
+ if (n > remain) {
+ // If the first write was incomplete, we overwite the data again.
+ s = str·Grow(s, n);
+ va_list args;
+ va_start(args, fmt);
+ n = vsnprintf(s + str·Len(s), n + 1, fmt, args);
+ Assert(n - remain <= str·Cap(s));
+ va_end(args);
+ }
+
+ str·Hdr* h = (str·Hdr*)(s - sizeof(str·Hdr));
+ h->len += n;
+
+ return s;
+}
+
+// Find will find the first occurence of
+// substr in the string Returns -1 if nothing was found.
+int
+str·Find(string s, const byte* substr)
+{
+ byte* loc = strstr(s, substr);
+ if (loc == nil) return -1;
+ return (int)(loc - s);
+}
+
+//
+// Lower will force all runes in the string to be lowercase.
+void
+str·Lower(string s)
+{
+ byte *b, *e;
+ b = s;
+ e = b + str·Len(s);
+ while (b++ != e)
+ *b = tolower(*b);
+}
+
+// Upper will force all runes in the string to be uppercase.
+void
+str·Upper(string s)
+{
+ byte *b, *e;
+ b = s;
+ e = b + str·Len(s);
+ while (b++ != e)
+ *b = toupper(*b);
+}
+
+// Replace will replace all occurences of the given bytes 'from' to bytes 'to'
+// Edits are done in place and modify the string.
+// NOTE: As of now strings from and to must be the same size.
+void
+str·Replace(string s, const byte* from, const byte* to)
+{
+ vlong fromL = strlen(from);
+ vlong toL = strlen(to);
+ if (toL != fromL) { panicf("different sized replacement string not supported"); }
+
+ vlong l = str·Len(s);
+ vlong i = l;
+ vlong j = l;
+
+ for (i = 0; i < l; i++) {
+ for (j = 0; j < toL; j++) {
+ if (s[i] == from[j]) {
+ s[i] = to[j];
+ break;
+ }
+ }
+ }
+}
+
+// Split will split the string by the given token.
+// Returns a stretchy buffer of strings that result from the partition.
+// It is the caller's responsibility to clean the memory.
+string*
+str·Split(string s, const byte* tok)
+{
+ string* fields = nil;
+ vlong start = 0;
+
+ vlong sL = str·Len(s);
+ vlong tokL = strlen(tok);
+ if (sL == 0 || tokL == 0) return nil;
+
+ buffit(fields, 5);
+
+ for (vlong i = 0; i < sL - tokL; i++) {
+ if ((tokL == 1 && s[i] == tokL) || !memcmp(s + i, tok, tokL)) {
+ bufpush(fields, str·NewLen(s + start, i - start));
+ if (fields[buflen(fields) - 1] == nil) goto cleanup;
+
+ start = i + tokL;
+ i += tokL - 1;
+ }
+ }
+
+ bufpush(fields, str·NewLen(s + start, sL - start));
+
+ return fields;
+
+cleanup:
+ for (vlong i = 0; i < buflen(fields); i++) {
+ str·Free(fields[i]);
+ }
+ buffree(fields);
+ return nil;
+}
+
+string
+str·Join(byte** fields, vlong numFields, const byte* sep)
+{
+ string s = str·NewCap(nil, 0, 10);
+ int j = 0;
+
+ for (j = 0; j < numFields; j++) {
+ s = str·Append(s, fields[j]);
+ if (j < numFields - 1) { s = str·AppendCount(s, sep, 1); }
+ }
+
+ return s;
+}