#include "cc.h" #include // ----------------------------------------------------------------------- // printing functions void puttok(Token tok) { if (tok.kind < Alit) printf("%s", tokens[tok.kind]); else if (tok.kind & Alit) { if (tok.kind & Vchar) if (tok.kind & Vint) if (tok.kind & Vlong) if (tok.kind & Vvlong) printf("literal <%lld>", tok.val.i); if (tok.kind & Vfloat) printf("literal <%f>", tok.val.f); printf("literal <%s>", tok.val.s); } else printf("ident <%s>", tok.val.s); } // ----------------------------------------------------------------------- // io buffer management #define asrdr(x) (io·Reader){(int (*)(void *, int, int, void *))x} // path should be absolute Io* openio(Lexer *lx, byte *path) { string *it, *end; intern(&path); // See if we have already opened file; // If so, and it hasn't been flagged return it for (it = lx->omit.path, end = it + lx->omit.len; it < end; ++it) { if ((uintptr)(*it) == (uintptr)(path)) return nil; } // TODO: See if we have already loaded the file if ((lx->new - lx->iostk) >= arrlen(lx->iostk)-1) panicf("out of I/O space!"); lx->new->f = io·open(path, "r"); if (!lx->new->f) panicf("file %s not found", path); lx->new->kind = IOfile; lx->new->path = path; bufio·initreader(&lx->new->rdr, asrdr(io·read), lx->new->f); return lx->new++; } static Io* makeio(Lexer *lx, byte *name) { if ((lx->new - lx->iostk) >= arrlen(lx->iostk)-1) panicf("out of I/O space!"); lx->new->path = name; lx->new->rdr = (io·Buffer) { .state = bufio·rdr | bufio·end, .runesize = 0, .h = nil, .size = bufio·size, .beg = lx->new->rdr.buf + bufio·ungets, .pos = lx->new->rdr.buf + bufio·ungets, .end = lx->new->rdr.buf + bufio·ungets, }; lx->new->b = lx->new->rdr.beg; return lx->new++; } #undef asrdr static void freeio(Lexer *lx, Io *io) { if (io->kind & IOfile) { io·close(io->f); } io->rdr.state = 0; io->kind = 0; io->link = nil; io->path = nil; io->store = (Pos){ 0 }; io->path = ""; } void pushio(Lexer *lx, Io *new) { new->link = lx->io; lx->io->store = lx->pos; lx->io = new; lx->pos = (Pos){ .line = 1, .col = 1, .path = new->path, }; } void popio(Lexer *lx) { Io *prev; assert(lx->io == lx->new-1); --lx->new; prev = lx->io->link; freeio(lx, lx->io); lx->io = prev; if (!prev) { return; } lx->pos = prev->store; } // ----------------------------------------------------------------------- // simple wrappers int getbyte(Lexer *lx) { return bufio·getbyte(&lx->io->rdr); } int getnsbyte(Lexer *lx) { int b; b = getbyte(lx); for (;;) { if (b == EOF) { if (lx->io->link) { popio(lx); assert(lx->io); b = getbyte(lx); continue; } else return b; } if (b >= RuneSelf || !isspace(b)) return b; if (b == '\n') return b; b = getbyte(lx); } return b; } rune getrune(Lexer *lx) { return bufio·getrune(&lx->io->rdr); } byte ungetbyte(Lexer *lx) { byte b; return bufio·ungetbyte(&lx->io->rdr, b); } rune ungetrune(Lexer *l, rune r) { return bufio·ungetrune(&l->io->rdr, r); } // ----------------------------------------------------------------------- // main lexer #define TOK(a, b) b, byte *tokens[NUM_TOKENS] = { TOKENS }; #undef TOK static uint8 Atoi[256] = { ['0'] = 0, ['1'] = 1, ['2'] = 2, ['3'] = 3, ['4'] = 4, ['5'] = 5, ['6'] = 6, ['7'] = 7, ['8'] = 8, ['9'] = 9, ['a'] = 10, ['A'] = 10, ['b'] = 11, ['B'] = 11, ['c'] = 12, ['C'] = 12, ['d'] = 13, ['D'] = 13, ['e'] = 14, ['E'] = 14, ['f'] = 15, ['F'] = 15, }; static error escapechar(Lexer *lx, int x, int islong, int esc, vlong *val) { int i, u, c; vlong l; c = getrune(lx); switch (c) { case '\\': break; case EOF: errorat(lx->pos, "EOF in string"); return 1; case '\n': errorat(lx->pos, "newline in string"); return 1; default: if (c == x) return 1; *val = c; return 0; } u = 0; c = getrune(lx); switch(c) { case 'x': i = islong ? 4 : 2; goto hex; case 'u': i = islong ? 8 : 4; u = 1; goto hex; case 'U': i = 8; u = 1; goto hex; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': i = islong ? 4 : 2; goto oct; case 'a': c = '\a'; break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; case '\\':c = '\\'; break; default: if(c != x) errorat(lx->pos, "unknown escape sequence: %c", c); } *val = c; return 0; hex: l = 0; for(; i > 0; i--) { c = getbyte(lx); if (c >= '0' && c <= '9') { l = l*16 + c-'0'; continue; } if (c >= 'a' && c <= 'f') { l = l*16 + c-'a' + 10; continue; } if (c >= 'A' && c <= 'F') { l = l*16 + c-'A' + 10; continue; } ungetbyte(lx); break; } if (u && (l > RuneMax || (0xd800 <= l && l < 0xe000))) { errorat(lx->pos, "invalid unicode code point in escape sequence: %#llx", l); l = RuneErr; } *val = l; if (esc) *val |= RuneMask + 1; return 0; oct: l = c - '0'; for (; i > 0; i--) { c = getbyte(lx); if (c >= '0' && c <= '7') { l = l*8 + c-'0'; continue; } ungetbyte(lx); break; } if (l > 255) errorat(lx->pos, "octal escape value > 255: %d", l); *val = l; if (esc) *val |= RuneMask + 1; return 0; } #define CASE1(stmt1, kind1) \ case stmt1: \ tok.kind = kind1; \ goto Return #define CASE2(stmt1, kind1, b1, kind2) \ case stmt1: \ tok.kind = kind1; \ b = getbyte(lx); \ if (b == b1) \ tok.kind = kind2; \ else \ ungetbyte(lx); \ goto Return #define CASE3(stmt1, kind1, b1, kind2, b2, kind3) \ case stmt1: \ tok.kind = kind1; \ b = getbyte(lx); \ if (b == b1) \ tok.kind = kind2; \ else if (b == b2) \ tok.kind = kind3; \ else \ ungetbyte(lx); \ goto Return #define CASE4(stmt1, kind1, b1, kind2, b2, kind3, b3, type4) \ case stmt1: \ tok.kind = kind1; \ b = getbyte(lx); \ if (b == b1) \ tok.kind = kind2; \ else if (b == b2) \ tok.kind = kind3; \ else if (b == b3) \ tok.kind = type4; \ else \ ungetbyte(lx); \ goto Return Token lex(Lexer *lx) { int b, n, f; vlong v, _; rune r; string s; double d; byte *e; Token tok; Sym *sym; Io *io; GetByte: b = getbyte(lx); Dispatch: tok.pos = lx->pos; if ((b != EOF && b >= RuneSelf) || b == '_') goto Talpha; if (isalpha(b)) { if (b != 'L') goto Talpha; n = b; b = getbyte(lx); if (b == '\'') { if (escapechar(lx, '\'', 1, 0, &v)) b = '\''; if (!escapechar(lx, '\'', 1, 0, &_)) { errorat(lx->pos, "missing ' at end of character constant"); } tok.kind = Alit | Vrune; tok.val.r = v; goto Return; } if (b == '"') goto TLstr; ungetbyte(lx); b = n; goto Talpha; } if (isdigit(b)) goto Tnum; switch (b) { case '\n': lx->pos.line++; case ' ': case '\r': case '\t': case '\v': case '\f': while (b = getbyte(lx), isspace(b)) if (b == '\n') lx->pos.line++; goto Dispatch; case '\\': b = getbyte(lx); if (b != '\n') errorat(lx->pos, "'\\' without a trailing newline"); goto GetByte; Tchar: case '\'': if (escapechar(lx, '\'', 0, 0, &v)) { errorat(lx->pos, "empty literal or escaped ' in char literal"); v = '\''; } if (!escapechar(lx, '\'', 0, 0, &_)) { errorat(lx->pos, "missing '"); ungetbyte(lx); } if (v > 0xff) { errorat(lx->pos, "overflowed character literal"); v = 0; } tok.kind = Alit | Vchar; tok.val.c = v; goto Return; case '"': s = str·makecap("", 0, 8); for (;;) { if (escapechar(lx, '"', 0, 1, &v)) break; if (v & (RuneMask + 1)) str·appendbyte(&s, v); else { r = v; b = utf8·runelen(r); utf8·runetobyte(lx->buf, &r); str·appendlen(&s, b, lx->buf); } } tok.kind = Alit | Vstr; tok.val.s = s; intern(&tok.val.s); str·free(s); goto Return; TLstr: s = str·makecap("", 0, 8); // NOTE: this violates strict aliasing for (;;) { if (escapechar(lx, '"', 1, 0, &v)) break; str·appendlen(&s, sizeof(wchar_t), (byte*)&v); } tok.kind = Alit | Vwstr; tok.val.s = s; intern(&tok.val.s); str·free(s); goto Return; case '.': tok.kind = Adot; b = getbyte(lx); if (isdigit(b)) { // *lx->b++ = b; goto Tflt; } else if (b == '.') { b = getbyte(lx); if (b != '.') { errorat(lx->pos, "invalid token '..'"); tok.kind = Aellip; break; } } ungetbyte(lx); goto Return; case '<': tok.kind = Alt; b = getbyte(lx); if (b == '<') { tok.kind = Alsft; b = getbyte(lx); if (b == '=') tok.kind = Alsftasn; else ungetbyte(lx); } else if (b == '=') tok.kind = Alteq; else ungetbyte(lx); goto Return; case '>': tok.kind = Agt; b = getbyte(lx); if (b == '>') { tok.kind = Arsft; b = getbyte(lx); if (b == '=') tok.kind = Arsftasn; else ungetbyte(lx); } else if (b == '=') tok.kind = Agteq; else ungetbyte(lx); goto Return; case '/': tok.kind = Adiv; b = getbyte(lx); if (b == '=') tok.kind = Adivasn; else if (b == '/') { while (b != EOF && b != '\n') b = getbyte(lx); goto Dispatch; } else if (b == '*') { int level = 1; b = getbyte(lx); while (b != EOF && level > 0) { if (b == '/') { b = getbyte(lx); if (b == '*') level++; } else if (b == '*') { b = getbyte(lx); if (b == '/') level--; } if (b == '\n') lx->pos.line++; b = getbyte(lx); } goto Dispatch; } else ungetbyte(lx); goto Return; case '#': if (domacro(lx)) { tok.kind = Anil; errorat(lx->pos, "failed to perform preprocessor directive"); return tok; } goto GetByte; case EOF: popio(lx); if (lx->io) goto GetByte; tok.kind = Aeof; goto Return; CASE1('(', Alparen); CASE1(')', Arparen); CASE1('{', Albrace); CASE1('}', Arbrace); CASE1('[', Albrakt); CASE1(']', Arbrakt); CASE1(',', Acomma); CASE1('?', Aqmark); CASE1(';', Asemi); CASE1('~', Aneg); CASE1(':', Acolon); CASE2('^', Axor, '=', Axorasn); CASE2('!', Anot, '=', Aneq); CASE2('*', Astar,'=', Amulasn); CASE2('=', Aasn, '=', Aeq); CASE2('%', Amod, '=', Amodasn); CASE3('+', Aadd, '=', Aaddasn, '+', Ainc); CASE3('&', Aand, '=', Aandasn, '&', Aandand); CASE3('|', Aor, '=', Aorasn, '|', Aoror); CASE4('-', Asub, '=', Asubasn, '-', Adec, '>', Aarrow); Tnum: e = lx->buf + arrlen(lx->buf); do { if (lx->b >= e) { errorat(lx->pos, "number overflows lexer buffer"); goto Nospace; } *lx->b++ = b; } while (b = getbyte(lx), isdigit(b) || b == '_'); if (b == '.' || tolower(b) == 'e') goto Tflt; Tint: n = 10; s = lx->buf; if (*s == '0') { switch (b) { case 'x': n = 16; break; case 'b': n = 2; break; case 'o': n = 8; break; default: goto Rint; } lx->b = s; /* reparse number, now with base info */ while (b = getbyte(lx), (isdigit(b) || ('a' <= b && b <= 'f') || ('A' <= b && b <= 'F') || b == '_')) *lx->b++ = b; } Rint: v = 0; r = b; for (; s != lx->b ; s++) { b = *s; if (b == '_') continue; f = Atoi[b]; if (f == 0 && b != '0') break; if (f >= n) { errorat(lx->pos, "digit '%c' out of range for base %d", b, n); f = 0; } if (v > (UINT64_MAX - f) / n) { errorat(lx->pos, "integer literal overflow"); v = 0; break; } v = v * n + f; } b = r; tok.kind = Alit; tok.val.i = v; if (b == 'u' || b == 'U') { tok.kind |= Vun; b = getbyte(lx); } if (b == 'l' || b == 'L') { r = getbyte(lx); if (r == 'l' || r == 'L') { if (r != b) errorat(lx->pos, "mismatched case on long long integer suffix"); tok.kind |= Vvlong; r = getbyte(lx); } else tok.kind |= Vlong; if (r == 'u' || r == 'U') { if (tok.kind & Vun) errorat(lx->pos, "multiple unsigned designators on integer suffix"); tok.kind |= Vun; goto Return; } ungetbyte(lx); goto Return; } tok.kind |= Vint; ungetbyte(lx); goto Return; Tflt: if (b == '.') { *lx->b++ = b; b = getbyte(lx); } while (isdigit(b)) { *lx->b++ = b; if (lx->b >= e) { errorat(lx->pos, "number overflows lexer buffer"); goto Nospace; } } if (tolower(b) == 'e') { b = getbyte(lx); if (b == '-' || b == '+') b = getbyte(lx); if (!isdigit(b)) errorat(lx->pos, "expected number after exponent, found %c", b); do { *lx->b++ = b; } while (b = getbyte(lx), isdigit(b)); } *lx->b = '\0'; d = strtod(lx->buf, nil); ungetbyte(lx); tok.kind = Alit | Vfloat; tok.val.f = d; goto Return; Talpha: s = lx->buf; e = lx->buf + arrlen(lx->buf); for (;;) { if (s >= e) { errorat(lx->pos, "identifier too long for buffer: %s", s); goto Nospace; } if (b != EOF && b >= RuneSelf) { ungetbyte(lx); r = getrune(lx); if (!utf8·isletter(r) && !utf8·isdigit(r) && r != 0xb7) { errorat(lx->pos, "invalid identifier character %d", r); } s += utf8·runetobyte(s, &r); } else if (!isalnum(b) && b != '_') break; else *s++ = b; b = getbyte(lx); } *s = '\0'; ungetbyte(lx); tok.kind = Aident; tok.val.s = lx->buf; n = intern(&tok.val.s); if (n < arrlen(keywords)) { tok.kind = Akeywd; tok.val.i = n; goto Return; } sym = lookup(&lx->sym, tok.val.s); if (sym && ((uintptr)sym->name != (uintptr)lx->io->path)) { if ((uintptr)sym == lx->macline) { tok.kind = Alit | Vint; tok.val.i = lx->pos.line; goto Return; } if ((uintptr)sym == lx->macfile) { tok.kind = Alit | Vstr; tok.val.s = lx->pos.path; goto Return; } io = makeio(lx, sym->name); io->rdr.end += expandmacro(lx, sym, io->b); printf("EXPANDED %s: %s\n", sym->name, io->rdr.beg); *io->rdr.end++ = EOF; pushio(lx, io); goto GetByte; } goto Return; default: tok.kind = Anil; errorat(lx->pos, "invalid token, crashing"); abort(); } Return: lx->b = lx->buf; return tok; Nospace: panicf("aborting compilation"); exit(1); } #undef CASE4 #undef CASE3 #undef CASE2 #undef CASE1 // ----------------------------------------------------------------------- // symbol tables #define PTR_HASH(p) (uintptr)(p) #define PTR_EQUAL(p1, p2) ((uintptr)(p1) == (uintptr)(p2)) static void ·free(void* _, void* ptr) { return free(ptr); } static void * ·alloc(void* _, uint n, ulong size) { return malloc(n*size); } static void * ·calloc(void* _, uint n, ulong size) { return calloc(n, size); } static int moresymtab(SymTab *tab, int n) { MAP_GROW(tab, string, Sym*, n, PTR_HASH, sys·Memory, nil); } static int putsym(SymTab *tab, Sym *sym, error *err) { MAP_PUT(tab, sym->name, sym, PTR_HASH, PTR_EQUAL, moresymtab, err); } Sym* define(SymTab *tab, string name, uint32 kind) { int i; Sym *sym; error err; sym = mem·arenaalloc(C.heap, 1, sizeof(*sym)); sym->name = name; sym->kind = kind; i = putsym(tab, sym, &err); tab->vals[i] = sym; return sym; } Sym* lookup(SymTab *tab, string ident) { int idx; MAP_GET(idx, tab, ident, PTR_HASH, PTR_EQUAL); if (idx < tab->n_buckets) return tab->vals[idx]; return nil; } error forget(SymTab *tab, string ident) { int idx; MAP_GET(idx, tab, ident, PTR_HASH, PTR_EQUAL); if (idx < tab->n_buckets) { MAP_DEL(tab, idx); return 0; } return 1; } void forgetall(SymTab *tab) { MAP_RESET(tab); }