aboutsummaryrefslogtreecommitdiff
path: root/sys/cmd/cc/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/cmd/cc/lex.c')
-rw-r--r--sys/cmd/cc/lex.c649
1 files changed, 630 insertions, 19 deletions
diff --git a/sys/cmd/cc/lex.c b/sys/cmd/cc/lex.c
index af3bbf3..6b85d8c 100644
--- a/sys/cmd/cc/lex.c
+++ b/sys/cmd/cc/lex.c
@@ -1,38 +1,649 @@
#include "cc.h"
-static
-void
-errorat(Pos x, byte *fmt, ...)
+#include <libn/macro/map.h>
+
+// -----------------------------------------------------------------------
+// simple wrappers
+
+byte
+getbyte(Lexer *l)
{
- va_list args;
- va_start(args, fmt);
- printf("error %d:", x.line);
- vprintf(fmt, args);
- va_end(args);
+ return bufio·getbyte(&l->io->buf);
+}
+
+byte
+getnsbyte(Lexer *l)
+{
+ byte b;
+ while (b = bufio·getbyte(&l->io->buf), isspace(b));
+ return b;
+}
+
+rune
+getrune(Lexer *l)
+{
+ return bufio·getrune(&l->io->buf);
}
-static
byte
-getbyte(struct Lexer *lex)
+ungetbyte(Lexer *lx)
{
- return bufio·getbyte(&lex->buf);
+ byte b;
+ return bufio·ungetbyte(&lx->io->buf, b);
}
+rune
+ungetrune(Lexer *l, rune r)
+{
+ return bufio·ungetrune(&l->io->buf, r);
+}
+
+// -----------------------------------------------------------------------
+// main lexer
+
+#define TOK(a, b) b,
+byte *tokens[NUM_TOKENS] = { TOKENS };
+#undef TOK
+
+static uint8 Atoi[256] =
+{
+ ['0'] = 0, ['1'] = 1, ['2'] = 2, ['3'] = 3, ['4'] = 4, ['5'] = 5,
+ ['6'] = 6, ['7'] = 7, ['8'] = 8, ['9'] = 9, ['a'] = 10, ['A'] = 10,
+ ['b'] = 11, ['B'] = 11, ['c'] = 12, ['C'] = 12, ['d'] = 13, ['D'] = 13,
+ ['e'] = 14, ['E'] = 14, ['f'] = 15, ['F'] = 15,
+};
+
static
error
-ungetbyte(struct Lexer *lex, byte b)
+escape(Lexer *lx, int x, int *flag, vlong *val)
{
- return bufio·ungetbyte(&lex->buf, b);
+ int i, u, c;
+ vlong l;
+
+ c = getrune(lx);
+
+ switch (c) {
+ case EOF:
+ errorat(lx->pos, "EOF in string");
+ return 1;
+ case '\n':
+ errorat(lx->pos, "newline in string");
+ return 1;
+ case '\\':
+ break;
+ default:
+ if (c == x)
+ return 1;
+ *val = c;
+ return 0;
+ }
+
+ u = 0;
+ c = getrune(lx);
+
+ switch(c) {
+ case 'x':
+ i = 2;
+ *flag = 1;
+ goto hex;
+
+ case 'u':
+ i = 4;
+ u = 1;
+ goto hex;
+
+ case 'U':
+ i = 8;
+ u = 1;
+ goto hex;
+
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ *flag = 1;
+ goto oct;
+
+ case 'a': c = '\a'; break;
+ case 'b': c = '\b'; break;
+ case 'f': c = '\f'; break;
+ case 'n': c = '\n'; break;
+ case 'r': c = '\r'; break;
+ case 't': c = '\t'; break;
+ case 'v': c = '\v'; break;
+ case '\\': c = '\\'; break;
+
+ default:
+ if(c != x) errorat(lx->pos, "unknown escape sequence: %c", c);
+ }
+ *val = c;
+ return 0;
+
+hex:
+ l = 0;
+ for(; i > 0; i--) {
+ c = getbyte(lx);
+ if (c >= '0' && c <= '9') {
+ l = l*16 + c-'0';
+ continue;
+ }
+ if (c >= 'a' && c <= 'f') {
+ l = l*16 + c-'a' + 10;
+ continue;
+ }
+ if (c >= 'A' && c <= 'F') {
+ l = l*16 + c-'A' + 10;
+ continue;
+ }
+ errorat(lx->pos, "non-hex character in escape sequence: %c", c);
+ ungetbyte(lx);
+ break;
+ }
+ if (u && (l > RuneMax || (0xd800 <= l && l < 0xe000))) {
+ errorat(lx->pos, "invalid unicode code point in escape sequence: %#llx", l);
+ l = RuneErr;
+ }
+ *val = l;
+ return 0;
+
+oct:
+ l = c - '0';
+ for (i = 2; i > 0; i--) {
+ c = getbyte(lx);
+ if (c >= '0' && c <= '7') {
+ l = l*8 + c-'0';
+ continue;
+ }
+ errorat(lx->pos, "non-octal character in escape sequence: %c", c);
+ ungetbyte(lx);
+ }
+ if (l > 255) errorat(lx->pos, "octal escape value > 255: %d", l);
+
+ *val = l;
+ return 0;
}
-void
-lex(struct Lexer *lex)
+#define CASE1(stmt1, kind1) \
+ case stmt1: \
+ tok.kind = kind1; \
+ break;
+
+#define CASE2(stmt1, kind1, b1, kind2) \
+ case stmt1: \
+ tok.kind = kind1; \
+ b = getbyte(lx); \
+ if (b == b1) \
+ tok.kind = kind2; \
+ else \
+ ungetbyte(lx); \
+ break;
+
+#define CASE3(stmt1, kind1, b1, kind2, b2, kind3) \
+ case stmt1: \
+ tok.kind = kind1; \
+ b = getbyte(lx); \
+ if (b == b1) \
+ tok.kind = kind2; \
+ else if (b == b2) \
+ tok.kind = kind3; \
+ else \
+ ungetbyte(lx); \
+ break;
+
+#define CASE4(stmt1, kind1, b1, kind2, b2, kind3, b3, type4) \
+ case stmt1: \
+ tok.kind = kind1; \
+ b = getbyte(lx); \
+ if (b == b1) \
+ tok.kind = kind2; \
+ else if (b == b2) \
+ tok.kind = kind3; \
+ else if (b == b3) \
+ tok.kind = type4; \
+ else \
+ ungetbyte(lx); \
+ break;
+
+
+Token
+lex(Lexer *lx)
{
- int b;
+ int b, n, f;
+ vlong v;
+ uint u;
+ rune r;
+ string s;
+ double d;
+ byte *e;
+ Token tok;
+ Sym *sym;
+ Io *io;
+
+GetByte:
+ b = getbyte(lx);
+Dispatch:
+ tok.pos.beg = lx->pos;
+
+ if (b >= RuneSelf || isalpha(b))
+ goto TAlpha;
+ if (isdigit(b))
+ goto TNum;
- b = getbyte(lex);
-TOP:
switch (b) {
+ case ' ': case '\n': case '\r': case '\t': case '\v': case '\f':
+ while (b = getbyte(lx), isspace(b))
+ if (b == '\n') lx->pos.line++;
+ goto Dispatch;
+
+ case '\'':
+ if (escape(lx, '\'', &f, &v)) {
+ errorat(lx->pos, "empty literal or escaped ' in char literal");
+ v = '\'';
+ }
+ if (!escape(lx, '\'', &f, &v)) {
+ errorat(lx->pos, "missing '");
+ ungetbyte(lx);
+ }
+
+ if (v > 0xff) {
+ errorat(lx->pos, "overflowed character literal");
+ v = 0;
+ }
+ tok.kind = Alit | Vchar;
+ tok.val.c = v;
+ break;
+
+ case '"':
+ s = str·makecap("", 0, 8);
+ for (;;) {
+ if (escape(lx, '"', &f, &v))
+ break;
+
+ if (v < RuneSelf || f)
+ str·appendbyte(&s, v);
+ else {
+ r = v;
+ b = utf8·runelen(r);
+ utf8·runetochar(lx->buf, &r);
+ str·appendlen(&s, b, lx->buf);
+ }
+ }
+ tok.kind = Alit | Vstr;
+ tok.val.s = s;
+ intern(&tok.val.s);
+
+ str·free(s);
+ break;
+ case '.':
+ tok.kind = Adot;
+ b = getbyte(lx);
+
+ if (isdigit(b)) {
+ // *lx->b++ = b;
+ goto TFlt;
+ } else if (b == '.') {
+ b = getbyte(lx);
+ if (b != '.') {
+ errorat(lx->pos, "invalid token '..'");
+ tok.kind = Aellip;
+ break;
+ }
+ }
+ ungetbyte(lx);
+ break;
+
+ case '<':
+ tok.kind = Alt;
+ b = getbyte(lx);
+
+ if (b == '<') {
+ tok.kind = Alsft;
+ b = getbyte(lx);
+ if (b == '=')
+ tok.kind = Alsftasn;
+ else
+ ungetbyte(lx);
+ } else if (b == '=')
+ tok.kind = Alteq;
+ else
+ ungetbyte(lx);
+ break;
+
+ case '>':
+ tok.kind = Agt;
+ b = getbyte(lx);
+
+ if (b == '>') {
+ tok.kind = Arsft;
+ b = getbyte(lx);
+ if (b == '=')
+ tok.kind = Arsftasn;
+ else
+ ungetbyte(lx);
+ } else if (b == '=')
+ tok.kind = Agteq;
+ else
+ ungetbyte(lx);
+ break;
+
+ case '/':
+ tok.kind = Adiv;
+ b = getbyte(lx);
+
+ if (b == '=')
+ tok.kind = Adivasn;
+ else if (b == '/') {
+ while (b != EOF && b != '\n')
+ b = getbyte(lx);
+ lx->pos.line++;
+ goto Dispatch;
+ } else if (b == '*') {
+ int level = 1;
+ b = getbyte(lx);
+ while (b != EOF && level > 0) {
+ if (b == '/') {
+ b = getbyte(lx);
+ if (b == '*')
+ level++;
+ } else if (b == '*') {
+ b = getbyte(lx);
+ if (b == '/')
+ level--;
+ }
+ if (b == '\n') lx->pos.line++;
+ b = getbyte(lx);
+ }
+ goto Dispatch;
+ } else
+ ungetbyte(lx);
+ break;
+
+ case '#':
+ if (domacro(lx)) {
+ tok.kind = Anil;
+ errorat(lx->pos, "failed to perform preprocessor directive");
+ return tok;
+ }
+ goto GetByte;
+ break;
+
+ case EOF:
+ panicf("need to implement popio");
+
+ CASE1('(', Alparen)
+ CASE1(')', Arparen)
+ CASE1('{', Albrace)
+ CASE1('}', Arbrace)
+ CASE1('[', Albrakt)
+ CASE1(']', Arbrakt)
+ CASE1(',', Acomma)
+ CASE1('?', Aqmark)
+ CASE1(';', Asemi)
+ CASE1('~', Aneg)
+ CASE1(':', Acolon)
+ CASE2('^', Axor, '=', Axorasn)
+ CASE2('!', Anot, '=', Aneq)
+ CASE2('*', Astar,'=', Amulasn)
+ CASE2('=', Aasn, '=', Aeq)
+ CASE2('%', Amod, '=', Amodasn)
+ CASE3('+', Aadd, '=', Aaddasn, '+', Ainc)
+ CASE3('&', Aand, '=', Aandasn, '&', Aandand)
+ CASE3('|', Aor, '=', Aorasn, '|', Aoror)
+ CASE4('-', Asub, '=', Asubasn, '-', Adec, '>', Aarrow)
+
+ default:
+ tok.kind = Anil;
+ errorat(lx->pos, "invalid token, crashing");
+ abort();
}
-}
+
+ goto Return;
+
+ TNum:
+ e = lx->buf + arrlen(lx->buf);
+ do {
+ if (lx->b >= e) {
+ errorat(lx->pos, "number overflows lexer buffer");
+ goto Nospace;
+ }
+ *lx->b++ = b;
+ } while (b = getbyte(lx), isdigit(b) || b == '_');
+
+ if (b == '.' || tolower(b) == 'e')
+ goto TFlt;
+ TInt:
+ r = b;
+ n = 10;
+ s = lx->buf;
+ if (*s == '0') {
+ b = *++s;
+ switch (b) {
+ case 'x': n = 16; break;
+ case 'b': n = 2; break;
+ case 'o': n = 8; break;
+ default: --s;
+ }
+ if (s >= e) {
+ errorat(lx->pos, "number overflows lexer buffer");
+ goto Nospace;
+ }
+ }
+
+ v = 0;
+ for (; s != lx->b ; s++) {
+ b = *s;
+ if (b == '_') continue;
+
+ f = Atoi[b];
+ if (f == 0 && b != '0')
+ break;
+
+ if (f >= n) {
+ errorat(lx->pos, "digit '%c' out of range for base %d", b, n);
+ f = 0;
+ }
+
+ if (v > (UINT64_MAX - f) / n) {
+ errorat(lx->pos, "integer literal overflow");
+ v = 0;
+ break;
+ }
+
+ v = v * n + f;
+ }
+ b = r;
+ tok.kind = Alit | Vint;
+ tok.val.i = v;
+ /* TODO: Suffixes!
+ if (tolower(b) == 'u') {
+ tok.kind |= Vusgn;
+ b = getbyte(lx);
+ }
+ */
+ goto Return;
+
+ TFlt:
+ if (b == '.') {
+ *lx->b++ = b;
+ b = getbyte(lx);
+ }
+
+ while (isdigit(b)) {
+ *lx->b++ = b;
+
+ if (lx->b >= e) {
+ errorat(lx->pos, "number overflows lexer buffer");
+ goto Nospace;
+ }
+ }
+
+ if (tolower(b) == 'e') {
+ b = getbyte(lx);
+ if (b == '-' || b == '+')
+ b = getbyte(lx);
+
+ if (!isdigit(b))
+ errorat(lx->pos, "expected number after exponent, found %c", b);
+
+ do {
+ *lx->b++ = b;
+ } while (b = getbyte(lx), isdigit(b));
+ }
+ *lx->b = '\0';
+ d = strtod(lx->buf, nil);
+
+ tok.kind = Alit | Vfloat;
+ tok.val.f = d;
+
+ goto Return;
+
+ TAlpha:
+ u = b;
+ s = lx->buf;
+ e = lx->buf + arrlen(lx->buf);
+ for (;;) {
+ if (s >= e) {
+ errorat(lx->pos, "identifier too long for buffer: %s", s);
+ goto Nospace;
+ }
+ if (u >= RuneSelf) {
+ ungetbyte(lx);
+ r = getrune(lx);
+ if (!utf8·isletter(r) && !utf8·isdigit(r) && r != 0xb7) {
+ errorat(lx->pos, "invalid identifier character %d", r);
+ }
+ s += utf8·runetochar(s, &r);
+ } else if (!isalnum(u) && u != '_')
+ break;
+ else
+ *s++ = u;
+ u = getbyte(lx);
+ }
+ *s = '\0';
+ tok.kind = Aident;
+ tok.val.s = lx->buf;
+
+ n = intern(&tok.val.s);
+ if (n < arrlen(keywords)) {
+ tok.kind = Akeywd;
+ }
+
+ sym = lookup(lx->sym, tok.val.s);
+ if (sym) {
+ io = makeio();
+ io->buf.end += expandmacro(lx, sym, io->b);
+ pushio(lx, io);
+ goto GetByte;
+ }
+
+Return:
+ lx->b = lx->buf;
+ tok.pos.end = lx->pos;
+ return tok;
+
+Nospace:
+ panicf("aborting compilation");
+}
+
+#undef CASE4
+#undef CASE3
+#undef CASE2
+#undef CASE1
+
+// -----------------------------------------------------------------------
+// push/pop io objects
+
+void
+pushio(Lexer *lx, Io *new)
+{
+ new->link = lx->io;
+ lx->io->store = lx->pos;
+ lx->io = new;
+
+ lx->pos = (Pos){
+ .line = 0,
+ .col = 0,
+ .path = new->path,
+ };
+}
+
+void
+popio(Lexer *lx)
+{
+ Io *prev;
+
+ prev = lx->io->link;
+ if (!prev) {
+ panicf("no buffer left");
+ }
+
+ lx->pos = prev->store;
+ lx->io = prev;
+}
+
+// -----------------------------------------------------------------------
+// symbol tables
+
+#define PTR_HASH(p) (uintptr)(p)
+#define PTR_EQUAL(p1, p2) ((uintptr)(p1) == (uintptr)(p2))
+
+struct SymTab
+{
+ MAP_STRUCT_BODY(string, Sym*);
+};
+
+Sym*
+lookup(SymTab *tab, string ident)
+{
+ int idx;
+ MAP_GET(idx, tab, ident, PTR_HASH, PTR_EQUAL);
+
+ if (idx < tab->n_buckets)
+ return tab->vals[idx];
+
+ return nil;
+}
+
+static
+int
+moresymtab(SymTab *tab, int n)
+{
+ MAP_GROW(tab, string, Sym*, n, PTR_HASH, mem·sys.alloc, mem·sys.free, nil);
+}
+
+static
+int
+putsym(SymTab *tab, Sym *sym, error *err)
+{
+ MAP_PUT(tab, sym->name, sym, PTR_HASH, PTR_EQUAL, moresymtab, err);
+}
+
+Sym*
+define(SymTab *tab, string name, int kind)
+{
+ Sym *sym;
+ error err;
+
+ sym = mem·arenaalloc(C.heap, 1, sizeof(*sym));
+ sym->name = name;
+ sym->kind = kind;
+
+ putsym(tab, sym, &err);
+
+ return sym;
+}
+
+// -----------------------------------------------------------------------
+// error reporting
+
+void
+errorat(Pos x, byte *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+
+ printf("error %d: ", x.line);
+
+ vprintf(fmt, args);
+ va_end(args);
+}
+