From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- src/cmd/cc/lex.c | 873 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 873 insertions(+) create mode 100644 src/cmd/cc/lex.c (limited to 'src/cmd/cc/lex.c') diff --git a/src/cmd/cc/lex.c b/src/cmd/cc/lex.c new file mode 100644 index 0000000..33fc5d0 --- /dev/null +++ b/src/cmd/cc/lex.c @@ -0,0 +1,873 @@ +#include "cc.h" +#include + +// ----------------------------------------------------------------------- +// printing functions + +void +puttok(Token tok) +{ + if (tok.kind < Alit) + printf("%s", tokens[tok.kind]); + else if (tok.kind & Alit) { + if (tok.kind & Vchar) + if (tok.kind & Vint) + if (tok.kind & Vlong) + if (tok.kind & Vvlong) + printf("literal <%lld>", tok.val.i); + if (tok.kind & Vfloat) + printf("literal <%f>", tok.val.f); + printf("literal <%s>", tok.val.s); + } else + printf("ident <%s>", tok.val.s); +} + +// ----------------------------------------------------------------------- +// io buffer management + +#define asrdr(x) (io·Reader){(int (*)(void *, int, int, void *))x} + +// path should be absolute +Io* +openio(Lexer *lx, byte *path) +{ + string *it, *end; + + intern(&path); + + // See if we have already opened file; + // If so, and it hasn't been flagged return it + for (it = lx->omit.path, end = it + lx->omit.len; it < end; ++it) { + if ((uintptr)(*it) == (uintptr)(path)) + return nil; + } + + // TODO: See if we have already loaded the file + + if ((lx->new - lx->iostk) >= arrlen(lx->iostk)-1) + panicf("out of I/O space!"); + + lx->new->f = io·open(path, "r"); + if (!lx->new->f) + panicf("file %s not found", path); + + lx->new->kind = IOfile; + lx->new->path = path; + bufio·initreader(&lx->new->rdr, asrdr(io·read), lx->new->f); + + return lx->new++; +} + +static +Io* +makeio(Lexer *lx, byte *name) +{ + if ((lx->new - lx->iostk) >= arrlen(lx->iostk)-1) + panicf("out of I/O space!"); + + lx->new->path = name; + lx->new->rdr = (io·Buffer) { + .state = bufio·rdr | bufio·end, + .runesize = 0, + .h = nil, + .size = bufio·size, + .beg = lx->new->rdr.buf + bufio·ungets, + .pos = lx->new->rdr.buf + bufio·ungets, + .end = lx->new->rdr.buf + bufio·ungets, + }; + lx->new->b = lx->new->rdr.beg; + + return lx->new++; +} +#undef asrdr + +static +void +freeio(Lexer *lx, Io *io) +{ + if (io->kind & IOfile) { + io·close(io->f); + } + + io->rdr.state = 0; + io->kind = 0; + io->link = nil; + io->path = nil; + io->store = (Pos){ 0 }; + io->path = ""; +} + +void +pushio(Lexer *lx, Io *new) +{ + new->link = lx->io; + lx->io->store = lx->pos; + lx->io = new; + + lx->pos = (Pos){ + .line = 1, + .col = 1, + .path = new->path, + }; +} + +void +popio(Lexer *lx) +{ + Io *prev; + + assert(lx->io == lx->new-1); + --lx->new; + + prev = lx->io->link; + freeio(lx, lx->io); + + lx->io = prev; + if (!prev) { + return; + } + + lx->pos = prev->store; +} + +// ----------------------------------------------------------------------- +// simple wrappers + +int +getbyte(Lexer *lx) +{ + return bufio·getbyte(&lx->io->rdr); +} + +int +getnsbyte(Lexer *lx) +{ + int b; + b = getbyte(lx); + for (;;) { + if (b == EOF) { + if (lx->io->link) { + popio(lx); + assert(lx->io); + b = getbyte(lx); + continue; + } else + return b; + } + if (b >= RuneSelf || !isspace(b)) + return b; + if (b == '\n') + return b; + b = getbyte(lx); + } + return b; +} + +rune +getrune(Lexer *lx) +{ + return bufio·getrune(&lx->io->rdr); +} + +byte +ungetbyte(Lexer *lx) +{ + byte b; + return bufio·ungetbyte(&lx->io->rdr, b); +} + +rune +ungetrune(Lexer *l, rune r) +{ + return bufio·ungetrune(&l->io->rdr, r); +} + +// ----------------------------------------------------------------------- +// main lexer + +#define TOK(a, b) b, +byte *tokens[NUM_TOKENS] = { TOKENS }; +#undef TOK + +static uint8 Atoi[256] = +{ + ['0'] = 0, ['1'] = 1, ['2'] = 2, ['3'] = 3, ['4'] = 4, ['5'] = 5, + ['6'] = 6, ['7'] = 7, ['8'] = 8, ['9'] = 9, ['a'] = 10, ['A'] = 10, + ['b'] = 11, ['B'] = 11, ['c'] = 12, ['C'] = 12, ['d'] = 13, ['D'] = 13, + ['e'] = 14, ['E'] = 14, ['f'] = 15, ['F'] = 15, +}; + +static +error +escapechar(Lexer *lx, int x, int islong, int esc, vlong *val) +{ + int i, u, c; + vlong l; + + c = getrune(lx); + + switch (c) { + case '\\': + break; + case EOF: + errorat(lx->pos, "EOF in string"); + return 1; + case '\n': + errorat(lx->pos, "newline in string"); + return 1; + default: + if (c == x) + return 1; + *val = c; + return 0; + } + + u = 0; + c = getrune(lx); + + switch(c) { + case 'x': + i = islong ? 4 : 2; + goto hex; + + case 'u': + i = islong ? 8 : 4; + u = 1; + goto hex; + + case 'U': + i = 8; + u = 1; + goto hex; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + i = islong ? 4 : 2; + goto oct; + + case 'a': c = '\a'; break; + case 'b': c = '\b'; break; + case 'f': c = '\f'; break; + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; + case 'v': c = '\v'; break; + case '\\':c = '\\'; break; + + default: + if(c != x) errorat(lx->pos, "unknown escape sequence: %c", c); + } + *val = c; + return 0; + +hex: + l = 0; + for(; i > 0; i--) { + c = getbyte(lx); + if (c >= '0' && c <= '9') { + l = l*16 + c-'0'; + continue; + } + if (c >= 'a' && c <= 'f') { + l = l*16 + c-'a' + 10; + continue; + } + if (c >= 'A' && c <= 'F') { + l = l*16 + c-'A' + 10; + continue; + } + ungetbyte(lx); + break; + } + if (u && (l > RuneMax || (0xd800 <= l && l < 0xe000))) { + errorat(lx->pos, "invalid unicode code point in escape sequence: %#llx", l); + l = RuneErr; + } + *val = l; + if (esc) + *val |= RuneMask + 1; + return 0; + +oct: + l = c - '0'; + for (; i > 0; i--) { + c = getbyte(lx); + if (c >= '0' && c <= '7') { + l = l*8 + c-'0'; + continue; + } + ungetbyte(lx); + break; + } + if (l > 255) errorat(lx->pos, "octal escape value > 255: %d", l); + + *val = l; + if (esc) + *val |= RuneMask + 1; + return 0; +} + +#define CASE1(stmt1, kind1) \ + case stmt1: \ + tok.kind = kind1; \ + goto Return + +#define CASE2(stmt1, kind1, b1, kind2) \ + case stmt1: \ + tok.kind = kind1; \ + b = getbyte(lx); \ + if (b == b1) \ + tok.kind = kind2; \ + else \ + ungetbyte(lx); \ + goto Return + +#define CASE3(stmt1, kind1, b1, kind2, b2, kind3) \ + case stmt1: \ + tok.kind = kind1; \ + b = getbyte(lx); \ + if (b == b1) \ + tok.kind = kind2; \ + else if (b == b2) \ + tok.kind = kind3; \ + else \ + ungetbyte(lx); \ + goto Return + +#define CASE4(stmt1, kind1, b1, kind2, b2, kind3, b3, type4) \ + case stmt1: \ + tok.kind = kind1; \ + b = getbyte(lx); \ + if (b == b1) \ + tok.kind = kind2; \ + else if (b == b2) \ + tok.kind = kind3; \ + else if (b == b3) \ + tok.kind = type4; \ + else \ + ungetbyte(lx); \ + goto Return + + +Token +lex(Lexer *lx) +{ + int b, n, f; + vlong v, _; + rune r; + string s; + double d; + byte *e; + Token tok; + Sym *sym; + Io *io; + +GetByte: + b = getbyte(lx); +Dispatch: + tok.pos = lx->pos; + + if ((b != EOF && b >= RuneSelf) || b == '_') + goto Talpha; + if (isalpha(b)) { + if (b != 'L') + goto Talpha; + + n = b; + b = getbyte(lx); + if (b == '\'') { + if (escapechar(lx, '\'', 1, 0, &v)) + b = '\''; + if (!escapechar(lx, '\'', 1, 0, &_)) { + errorat(lx->pos, "missing ' at end of character constant"); + } + tok.kind = Alit | Vrune; + tok.val.r = v; + goto Return; + } + if (b == '"') + goto TLstr; + ungetbyte(lx); + b = n; + + goto Talpha; + } + if (isdigit(b)) + goto Tnum; + + switch (b) { + case '\n': + lx->pos.line++; + case ' ': case '\r': case '\t': case '\v': case '\f': + while (b = getbyte(lx), isspace(b)) + if (b == '\n') + lx->pos.line++; + goto Dispatch; + + case '\\': + b = getbyte(lx); + if (b != '\n') + errorat(lx->pos, "'\\' without a trailing newline"); + goto GetByte; + + Tchar: + case '\'': + if (escapechar(lx, '\'', 0, 0, &v)) { + errorat(lx->pos, "empty literal or escaped ' in char literal"); + v = '\''; + } + if (!escapechar(lx, '\'', 0, 0, &_)) { + errorat(lx->pos, "missing '"); + ungetbyte(lx); + } + + if (v > 0xff) { + errorat(lx->pos, "overflowed character literal"); + v = 0; + } + tok.kind = Alit | Vchar; + tok.val.c = v; + goto Return; + + case '"': + s = str·makecap("", 0, 8); + for (;;) { + if (escapechar(lx, '"', 0, 1, &v)) + break; + + if (v & (RuneMask + 1)) + str·appendbyte(&s, v); + else { + r = v; + b = utf8·runelen(r); + utf8·runetobyte(lx->buf, &r); + str·appendlen(&s, b, lx->buf); + } + } + tok.kind = Alit | Vstr; + tok.val.s = s; + intern(&tok.val.s); + + str·free(s); + goto Return; + + TLstr: + s = str·makecap("", 0, 8); + // NOTE: this violates strict aliasing + for (;;) { + if (escapechar(lx, '"', 1, 0, &v)) + break; + str·appendlen(&s, sizeof(wchar_t), (byte*)&v); + } + tok.kind = Alit | Vwstr; + tok.val.s = s; + intern(&tok.val.s); + + str·free(s); + goto Return; + + case '.': + tok.kind = Adot; + b = getbyte(lx); + + if (isdigit(b)) { + // *lx->b++ = b; + goto Tflt; + } else if (b == '.') { + b = getbyte(lx); + if (b != '.') { + errorat(lx->pos, "invalid token '..'"); + tok.kind = Aellip; + break; + } + } + ungetbyte(lx); + goto Return; + + case '<': + tok.kind = Alt; + b = getbyte(lx); + + if (b == '<') { + tok.kind = Alsft; + b = getbyte(lx); + if (b == '=') + tok.kind = Alsftasn; + else + ungetbyte(lx); + } else if (b == '=') + tok.kind = Alteq; + else + ungetbyte(lx); + goto Return; + + case '>': + tok.kind = Agt; + b = getbyte(lx); + + if (b == '>') { + tok.kind = Arsft; + b = getbyte(lx); + if (b == '=') + tok.kind = Arsftasn; + else + ungetbyte(lx); + } else if (b == '=') + tok.kind = Agteq; + else + ungetbyte(lx); + goto Return; + + case '/': + tok.kind = Adiv; + b = getbyte(lx); + + if (b == '=') + tok.kind = Adivasn; + else if (b == '/') { + while (b != EOF && b != '\n') + b = getbyte(lx); + goto Dispatch; + } else if (b == '*') { + int level = 1; + b = getbyte(lx); + while (b != EOF && level > 0) { + if (b == '/') { + b = getbyte(lx); + if (b == '*') + level++; + } else if (b == '*') { + b = getbyte(lx); + if (b == '/') + level--; + } + if (b == '\n') + lx->pos.line++; + b = getbyte(lx); + } + goto Dispatch; + } else + ungetbyte(lx); + goto Return; + + case '#': + if (domacro(lx)) { + tok.kind = Anil; + errorat(lx->pos, "failed to perform preprocessor directive"); + return tok; + } + goto GetByte; + + case EOF: + popio(lx); + if (lx->io) + goto GetByte; + tok.kind = Aeof; + goto Return; + + CASE1('(', Alparen); + CASE1(')', Arparen); + CASE1('{', Albrace); + CASE1('}', Arbrace); + CASE1('[', Albrakt); + CASE1(']', Arbrakt); + CASE1(',', Acomma); + CASE1('?', Aqmark); + CASE1(';', Asemi); + CASE1('~', Aneg); + CASE1(':', Acolon); + CASE2('^', Axor, '=', Axorasn); + CASE2('!', Anot, '=', Aneq); + CASE2('*', Astar,'=', Amulasn); + CASE2('=', Aasn, '=', Aeq); + CASE2('%', Amod, '=', Amodasn); + CASE3('+', Aadd, '=', Aaddasn, '+', Ainc); + CASE3('&', Aand, '=', Aandasn, '&', Aandand); + CASE3('|', Aor, '=', Aorasn, '|', Aoror); + CASE4('-', Asub, '=', Asubasn, '-', Adec, '>', Aarrow); + + Tnum: + e = lx->buf + arrlen(lx->buf); + do { + if (lx->b >= e) { + errorat(lx->pos, "number overflows lexer buffer"); + goto Nospace; + } + *lx->b++ = b; + } while (b = getbyte(lx), isdigit(b) || b == '_'); + + if (b == '.' || tolower(b) == 'e') + goto Tflt; + Tint: + n = 10; + s = lx->buf; + if (*s == '0') { + switch (b) { + case 'x': n = 16; break; + case 'b': n = 2; break; + case 'o': n = 8; break; + default: goto Rint; + } + lx->b = s; + /* reparse number, now with base info */ + while (b = getbyte(lx), (isdigit(b) || + ('a' <= b && b <= 'f') || + ('A' <= b && b <= 'F') || + b == '_')) + *lx->b++ = b; + } + Rint: + v = 0; + r = b; + for (; s != lx->b ; s++) { + b = *s; + if (b == '_') continue; + + f = Atoi[b]; + if (f == 0 && b != '0') + break; + + if (f >= n) { + errorat(lx->pos, "digit '%c' out of range for base %d", b, n); + f = 0; + } + + if (v > (UINT64_MAX - f) / n) { + errorat(lx->pos, "integer literal overflow"); + v = 0; + break; + } + + v = v * n + f; + } + + b = r; + tok.kind = Alit; + tok.val.i = v; + + if (b == 'u' || b == 'U') { + tok.kind |= Vun; + b = getbyte(lx); + } + if (b == 'l' || b == 'L') { + r = getbyte(lx); + if (r == 'l' || r == 'L') { + if (r != b) + errorat(lx->pos, "mismatched case on long long integer suffix"); + tok.kind |= Vvlong; + r = getbyte(lx); + } else + tok.kind |= Vlong; + + if (r == 'u' || r == 'U') { + if (tok.kind & Vun) + errorat(lx->pos, "multiple unsigned designators on integer suffix"); + tok.kind |= Vun; + goto Return; + } + + ungetbyte(lx); + goto Return; + } + + tok.kind |= Vint; + ungetbyte(lx); + goto Return; + + Tflt: + if (b == '.') { + *lx->b++ = b; + b = getbyte(lx); + } + + while (isdigit(b)) { + *lx->b++ = b; + + if (lx->b >= e) { + errorat(lx->pos, "number overflows lexer buffer"); + goto Nospace; + } + } + + if (tolower(b) == 'e') { + b = getbyte(lx); + if (b == '-' || b == '+') + b = getbyte(lx); + + if (!isdigit(b)) + errorat(lx->pos, "expected number after exponent, found %c", b); + + do { + *lx->b++ = b; + } while (b = getbyte(lx), isdigit(b)); + } + *lx->b = '\0'; + d = strtod(lx->buf, nil); + ungetbyte(lx); + + tok.kind = Alit | Vfloat; + tok.val.f = d; + + goto Return; + + Talpha: + s = lx->buf; + e = lx->buf + arrlen(lx->buf); + for (;;) { + if (s >= e) { + errorat(lx->pos, "identifier too long for buffer: %s", s); + goto Nospace; + } + if (b != EOF && b >= RuneSelf) { + ungetbyte(lx); + r = getrune(lx); + if (!utf8·isletter(r) && !utf8·isdigit(r) && r != 0xb7) { + errorat(lx->pos, "invalid identifier character %d", r); + } + s += utf8·runetobyte(s, &r); + } else if (!isalnum(b) && b != '_') + break; + else + *s++ = b; + b = getbyte(lx); + } + *s = '\0'; + ungetbyte(lx); + + tok.kind = Aident; + tok.val.s = lx->buf; + + n = intern(&tok.val.s); + if (n < arrlen(keywords)) { + tok.kind = Akeywd; + tok.val.i = n; + goto Return; + } + + sym = lookup(&lx->sym, tok.val.s); + if (sym && ((uintptr)sym->name != (uintptr)lx->io->path)) { + if ((uintptr)sym == lx->macline) { + tok.kind = Alit | Vint; + tok.val.i = lx->pos.line; + goto Return; + } + if ((uintptr)sym == lx->macfile) { + tok.kind = Alit | Vstr; + tok.val.s = lx->pos.path; + goto Return; + } + io = makeio(lx, sym->name); + io->rdr.end += expandmacro(lx, sym, io->b); + printf("EXPANDED %s: %s\n", sym->name, io->rdr.beg); + *io->rdr.end++ = EOF; + pushio(lx, io); + goto GetByte; + } + goto Return; + + default: + tok.kind = Anil; + errorat(lx->pos, "invalid token, crashing"); + abort(); + } + +Return: + lx->b = lx->buf; + return tok; + +Nospace: + panicf("aborting compilation"); + exit(1); +} + +#undef CASE4 +#undef CASE3 +#undef CASE2 +#undef CASE1 + +// ----------------------------------------------------------------------- +// symbol tables + +#define PTR_HASH(p) (uintptr)(p) +#define PTR_EQUAL(p1, p2) ((uintptr)(p1) == (uintptr)(p2)) + +static +void +·free(void* _, void* ptr) { + return free(ptr); +} + +static +void * +·alloc(void* _, uint n, ulong size) { + return malloc(n*size); +} + +static +void * +·calloc(void* _, uint n, ulong size) { + return calloc(n, size); +} + +static +int +moresymtab(SymTab *tab, int n) +{ + MAP_GROW(tab, string, Sym*, n, PTR_HASH, sys·Memory, nil); +} + +static +int +putsym(SymTab *tab, Sym *sym, error *err) +{ + MAP_PUT(tab, sym->name, sym, PTR_HASH, PTR_EQUAL, moresymtab, err); +} + +Sym* +define(SymTab *tab, string name, uint32 kind) +{ + int i; + Sym *sym; + error err; + + sym = mem·arenaalloc(C.heap, 1, sizeof(*sym)); + sym->name = name; + sym->kind = kind; + + i = putsym(tab, sym, &err); + tab->vals[i] = sym; + + return sym; +} + +Sym* +lookup(SymTab *tab, string ident) +{ + int idx; + MAP_GET(idx, tab, ident, PTR_HASH, PTR_EQUAL); + + if (idx < tab->n_buckets) + return tab->vals[idx]; + + return nil; +} + + +error +forget(SymTab *tab, string ident) +{ + int idx; + MAP_GET(idx, tab, ident, PTR_HASH, PTR_EQUAL); + + if (idx < tab->n_buckets) { + MAP_DEL(tab, idx); + return 0; + } + return 1; +} + +void +forgetall(SymTab *tab) +{ + MAP_RESET(tab); +} -- cgit v1.2.1