#include "cc.h" #include // ----------------------------------------------------------------------- // printing functions void puttok(Token tok) { if (tok.kind < Alit) printf("%s", tokens[tok.kind]); else if (tok.kind & Alit) { if (tok.kind & Vchar) if (tok.kind & Vint) if (tok.kind & Vlong) if (tok.kind & Vvlong) printf("literal <%lld>", tok.val.i); if (tok.kind & Vfloat) printf("literal <%f>", tok.val.f); printf("literal <%s>", tok.val.s); } else printf("ident <%s>", tok.val.s); } // ----------------------------------------------------------------------- // simple wrappers byte getbyte(Lexer *lx) { return bufio·getbyte(&lx->io->buf); } byte getnsbyte(Lexer *lx) { int b; b = getbyte(lx); for (;;) { if (b >= RuneSelf || !isspace(b)) return b; if (b == '\n') { lx->pos.line++; return b; } b = getbyte(lx); } return b; } rune getrune(Lexer *lx) { return bufio·getrune(&lx->io->buf); } byte ungetbyte(Lexer *lx) { byte b; return bufio·ungetbyte(&lx->io->buf, b); } rune ungetrune(Lexer *l, rune r) { return bufio·ungetrune(&l->io->buf, r); } // ----------------------------------------------------------------------- // main lexer #define TOK(a, b) b, byte *tokens[NUM_TOKENS] = { TOKENS }; #undef TOK static uint8 Atoi[256] = { ['0'] = 0, ['1'] = 1, ['2'] = 2, ['3'] = 3, ['4'] = 4, ['5'] = 5, ['6'] = 6, ['7'] = 7, ['8'] = 8, ['9'] = 9, ['a'] = 10, ['A'] = 10, ['b'] = 11, ['B'] = 11, ['c'] = 12, ['C'] = 12, ['d'] = 13, ['D'] = 13, ['e'] = 14, ['E'] = 14, ['f'] = 15, ['F'] = 15, }; static error escape(Lexer *lx, int x, int *flag, vlong *val) { int i, u, c; vlong l; c = getrune(lx); switch (c) { case EOF: errorat(lx->pos, "EOF in string"); return 1; case '\n': errorat(lx->pos, "newline in string"); return 1; case '\\': break; default: if (c == x) return 1; *val = c; return 0; } u = 0; c = getrune(lx); switch(c) { case 'x': i = 2; *flag = 1; goto hex; case 'u': i = 4; u = 1; goto hex; case 'U': i = 8; u = 1; goto hex; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': *flag = 1; goto oct; case 'a': c = '\a'; break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; case '\\': c = '\\'; break; default: if(c != x) errorat(lx->pos, "unknown escape sequence: %c", c); } *val = c; return 0; hex: l = 0; for(; i > 0; i--) { c = getbyte(lx); if (c >= '0' && c <= '9') { l = l*16 + c-'0'; continue; } if (c >= 'a' && c <= 'f') { l = l*16 + c-'a' + 10; continue; } if (c >= 'A' && c <= 'F') { l = l*16 + c-'A' + 10; continue; } errorat(lx->pos, "non-hex character in escape sequence: %c", c); ungetbyte(lx); break; } if (u && (l > RuneMax || (0xd800 <= l && l < 0xe000))) { errorat(lx->pos, "invalid unicode code point in escape sequence: %#llx", l); l = RuneErr; } *val = l; return 0; oct: l = c - '0'; for (i = 2; i > 0; i--) { c = getbyte(lx); if (c >= '0' && c <= '7') { l = l*8 + c-'0'; continue; } errorat(lx->pos, "non-octal character in escape sequence: %c", c); ungetbyte(lx); } if (l > 255) errorat(lx->pos, "octal escape value > 255: %d", l); *val = l; return 0; } #define CASE1(stmt1, kind1) \ case stmt1: \ tok.kind = kind1; \ break; #define CASE2(stmt1, kind1, b1, kind2) \ case stmt1: \ tok.kind = kind1; \ b = getbyte(lx); \ if (b == b1) \ tok.kind = kind2; \ else \ ungetbyte(lx); \ break; #define CASE3(stmt1, kind1, b1, kind2, b2, kind3) \ case stmt1: \ tok.kind = kind1; \ b = getbyte(lx); \ if (b == b1) \ tok.kind = kind2; \ else if (b == b2) \ tok.kind = kind3; \ else \ ungetbyte(lx); \ break; #define CASE4(stmt1, kind1, b1, kind2, b2, kind3, b3, type4) \ case stmt1: \ tok.kind = kind1; \ b = getbyte(lx); \ if (b == b1) \ tok.kind = kind2; \ else if (b == b2) \ tok.kind = kind3; \ else if (b == b3) \ tok.kind = type4; \ else \ ungetbyte(lx); \ break; Token lex(Lexer *lx) { int b, n, f; vlong v; uint u; rune r; string s; double d; byte *e; Token tok; Sym *sym; Io *io; GetByte: b = getbyte(lx); Dispatch: tok.pos.beg = lx->pos; if (b >= RuneSelf || isalpha(b) || b == '_') goto TAlpha; if (isdigit(b)) goto TNum; switch (b) { case ' ': case '\n': case '\r': case '\t': case '\v': case '\f': while (b = getbyte(lx), isspace(b)) if (b == '\n') lx->pos.line++; goto Dispatch; case '\'': if (escape(lx, '\'', &f, &v)) { errorat(lx->pos, "empty literal or escaped ' in char literal"); v = '\''; } if (!escape(lx, '\'', &f, &v)) { errorat(lx->pos, "missing '"); ungetbyte(lx); } if (v > 0xff) { errorat(lx->pos, "overflowed character literal"); v = 0; } tok.kind = Alit | Vchar; tok.val.c = v; break; case '"': s = str·makecap("", 0, 8); for (;;) { if (escape(lx, '"', &f, &v)) break; if (v < RuneSelf || f) str·appendbyte(&s, v); else { r = v; b = utf8·runelen(r); utf8·runetochar(lx->buf, &r); str·appendlen(&s, b, lx->buf); } } tok.kind = Alit | Vstr; tok.val.s = s; intern(&tok.val.s); str·free(s); break; case '.': tok.kind = Adot; b = getbyte(lx); if (isdigit(b)) { // *lx->b++ = b; goto TFlt; } else if (b == '.') { b = getbyte(lx); if (b != '.') { errorat(lx->pos, "invalid token '..'"); tok.kind = Aellip; break; } } ungetbyte(lx); break; case '<': tok.kind = Alt; b = getbyte(lx); if (b == '<') { tok.kind = Alsft; b = getbyte(lx); if (b == '=') tok.kind = Alsftasn; else ungetbyte(lx); } else if (b == '=') tok.kind = Alteq; else ungetbyte(lx); break; case '>': tok.kind = Agt; b = getbyte(lx); if (b == '>') { tok.kind = Arsft; b = getbyte(lx); if (b == '=') tok.kind = Arsftasn; else ungetbyte(lx); } else if (b == '=') tok.kind = Agteq; else ungetbyte(lx); break; case '/': tok.kind = Adiv; b = getbyte(lx); if (b == '=') tok.kind = Adivasn; else if (b == '/') { while (b != EOF && b != '\n') b = getbyte(lx); lx->pos.line++; goto Dispatch; } else if (b == '*') { int level = 1; b = getbyte(lx); while (b != EOF && level > 0) { if (b == '/') { b = getbyte(lx); if (b == '*') level++; } else if (b == '*') { b = getbyte(lx); if (b == '/') level--; } if (b == '\n') lx->pos.line++; b = getbyte(lx); } goto Dispatch; } else ungetbyte(lx); break; case '#': if (domacro(lx)) { tok.kind = Anil; errorat(lx->pos, "failed to perform preprocessor directive"); return tok; } goto GetByte; break; case EOF: panicf("need to implement popio"); CASE1('(', Alparen) CASE1(')', Arparen) CASE1('{', Albrace) CASE1('}', Arbrace) CASE1('[', Albrakt) CASE1(']', Arbrakt) CASE1(',', Acomma) CASE1('?', Aqmark) CASE1(';', Asemi) CASE1('~', Aneg) CASE1(':', Acolon) CASE2('^', Axor, '=', Axorasn) CASE2('!', Anot, '=', Aneq) CASE2('*', Astar,'=', Amulasn) CASE2('=', Aasn, '=', Aeq) CASE2('%', Amod, '=', Amodasn) CASE3('+', Aadd, '=', Aaddasn, '+', Ainc) CASE3('&', Aand, '=', Aandasn, '&', Aandand) CASE3('|', Aor, '=', Aorasn, '|', Aoror) CASE4('-', Asub, '=', Asubasn, '-', Adec, '>', Aarrow) default: tok.kind = Anil; errorat(lx->pos, "invalid token, crashing"); abort(); } goto Return; TNum: e = lx->buf + arrlen(lx->buf); do { if (lx->b >= e) { errorat(lx->pos, "number overflows lexer buffer"); goto Nospace; } *lx->b++ = b; } while (b = getbyte(lx), isdigit(b) || b == '_'); if (b == '.' || tolower(b) == 'e') goto TFlt; TInt: r = b; n = 10; s = lx->buf; ungetbyte(lx); if (*s == '0') { b = *++s; switch (b) { case 'x': n = 16; break; case 'b': n = 2; break; case 'o': n = 8; break; default: --s; } if (s >= e) { errorat(lx->pos, "number overflows lexer buffer"); goto Nospace; } } v = 0; for (; s != lx->b ; s++) { b = *s; if (b == '_') continue; f = Atoi[b]; if (f == 0 && b != '0') break; if (f >= n) { errorat(lx->pos, "digit '%c' out of range for base %d", b, n); f = 0; } if (v > (UINT64_MAX - f) / n) { errorat(lx->pos, "integer literal overflow"); v = 0; break; } v = v * n + f; } b = r; tok.kind = Alit | Vint; tok.val.i = v; /* TODO: Suffixes! if (tolower(b) == 'u') { tok.kind |= Vusgn; b = getbyte(lx); } */ goto Return; TFlt: if (b == '.') { *lx->b++ = b; b = getbyte(lx); } while (isdigit(b)) { *lx->b++ = b; if (lx->b >= e) { errorat(lx->pos, "number overflows lexer buffer"); goto Nospace; } } if (tolower(b) == 'e') { b = getbyte(lx); if (b == '-' || b == '+') b = getbyte(lx); if (!isdigit(b)) errorat(lx->pos, "expected number after exponent, found %c", b); do { *lx->b++ = b; } while (b = getbyte(lx), isdigit(b)); } *lx->b = '\0'; d = strtod(lx->buf, nil); ungetbyte(lx); tok.kind = Alit | Vfloat; tok.val.f = d; goto Return; TAlpha: u = b; s = lx->buf; e = lx->buf + arrlen(lx->buf); for (;;) { if (s >= e) { errorat(lx->pos, "identifier too long for buffer: %s", s); goto Nospace; } if (u >= RuneSelf) { ungetbyte(lx); r = getrune(lx); if (!utf8·isletter(r) && !utf8·isdigit(r) && r != 0xb7) { errorat(lx->pos, "invalid identifier character %d", r); } s += utf8·runetochar(s, &r); } else if (!isalnum(u) && u != '_') break; else *s++ = u; u = getbyte(lx); } *s = '\0'; ungetbyte(lx); tok.kind = Aident; tok.val.s = lx->buf; n = intern(&tok.val.s); if (n < arrlen(keywords)) { tok.kind = Akeywd; } sym = lookup(&lx->sym, tok.val.s); if (sym) { io = makeio(); io->buf.end += expandmacro(lx, sym, io->b); pushio(lx, io); goto GetByte; } Return: lx->b = lx->buf; tok.pos.end = lx->pos; return tok; Nospace: panicf("aborting compilation"); } #undef CASE4 #undef CASE3 #undef CASE2 #undef CASE1 // ----------------------------------------------------------------------- // push/pop io objects void pushio(Lexer *lx, Io *new) { new->link = lx->io; lx->io->store = lx->pos; lx->io = new; lx->pos = (Pos){ .line = 0, .col = 0, .path = new->path, }; } void popio(Lexer *lx) { Io *prev; prev = lx->io->link; if (!prev) { panicf("no buffer left"); } lx->pos = prev->store; lx->io = prev; } // ----------------------------------------------------------------------- // symbol tables #define PTR_HASH(p) (uintptr)(p) #define PTR_EQUAL(p1, p2) ((uintptr)(p1) == (uintptr)(p2)) Sym* lookup(SymTab *tab, string ident) { int idx; MAP_GET(idx, tab, ident, PTR_HASH, PTR_EQUAL); if (idx < tab->n_buckets) return tab->vals[idx]; return nil; } static int moresymtab(SymTab *tab, int n) { MAP_GROW(tab, string, Sym*, n, PTR_HASH, ·calloc, ·free, nil); } static int putsym(SymTab *tab, Sym *sym, error *err) { MAP_PUT(tab, sym->name, sym, PTR_HASH, PTR_EQUAL, moresymtab, err); } Sym* define(SymTab *tab, string name, int kind) { int i; Sym *sym; error err; sym = mem·arenaalloc(C.heap, 1, sizeof(*sym)); sym->name = name; sym->kind = kind; i = putsym(tab, sym, &err); tab->vals[i] = sym; return sym; } // ----------------------------------------------------------------------- // error reporting void errorat(Pos x, byte *fmt, ...) { va_list args; va_start(args, fmt); printf("error %d: ", x.line); vprintf(fmt, args); printf("\n"); va_end(args); }