From 9ec5bed6a7d715ffa69851569485a685dd69db2e Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Sun, 17 May 2020 16:19:17 -0700 Subject: prototype of front end cli --- sys/cmd/cc/lex.c | 649 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 630 insertions(+), 19 deletions(-) (limited to 'sys/cmd/cc/lex.c') diff --git a/sys/cmd/cc/lex.c b/sys/cmd/cc/lex.c index af3bbf3..6b85d8c 100644 --- a/sys/cmd/cc/lex.c +++ b/sys/cmd/cc/lex.c @@ -1,38 +1,649 @@ #include "cc.h" -static -void -errorat(Pos x, byte *fmt, ...) +#include + +// ----------------------------------------------------------------------- +// simple wrappers + +byte +getbyte(Lexer *l) { - va_list args; - va_start(args, fmt); - printf("error %d:", x.line); - vprintf(fmt, args); - va_end(args); + return bufio·getbyte(&l->io->buf); +} + +byte +getnsbyte(Lexer *l) +{ + byte b; + while (b = bufio·getbyte(&l->io->buf), isspace(b)); + return b; +} + +rune +getrune(Lexer *l) +{ + return bufio·getrune(&l->io->buf); } -static byte -getbyte(struct Lexer *lex) +ungetbyte(Lexer *lx) { - return bufio·getbyte(&lex->buf); + byte b; + return bufio·ungetbyte(&lx->io->buf, b); } +rune +ungetrune(Lexer *l, rune r) +{ + return bufio·ungetrune(&l->io->buf, r); +} + +// ----------------------------------------------------------------------- +// main lexer + +#define TOK(a, b) b, +byte *tokens[NUM_TOKENS] = { TOKENS }; +#undef TOK + +static uint8 Atoi[256] = +{ + ['0'] = 0, ['1'] = 1, ['2'] = 2, ['3'] = 3, ['4'] = 4, ['5'] = 5, + ['6'] = 6, ['7'] = 7, ['8'] = 8, ['9'] = 9, ['a'] = 10, ['A'] = 10, + ['b'] = 11, ['B'] = 11, ['c'] = 12, ['C'] = 12, ['d'] = 13, ['D'] = 13, + ['e'] = 14, ['E'] = 14, ['f'] = 15, ['F'] = 15, +}; + static error -ungetbyte(struct Lexer *lex, byte b) +escape(Lexer *lx, int x, int *flag, vlong *val) { - return bufio·ungetbyte(&lex->buf, b); + int i, u, c; + vlong l; + + c = getrune(lx); + + switch (c) { + case EOF: + errorat(lx->pos, "EOF in string"); + return 1; + case '\n': + errorat(lx->pos, "newline in string"); + return 1; + case '\\': + break; + default: + if (c == x) + return 1; + *val = c; + return 0; + } + + u = 0; + c = getrune(lx); + + switch(c) { + case 'x': + i = 2; + *flag = 1; + goto hex; + + case 'u': + i = 4; + u = 1; + goto hex; + + case 'U': + i = 8; + u = 1; + goto hex; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + *flag = 1; + goto oct; + + case 'a': c = '\a'; break; + case 'b': c = '\b'; break; + case 'f': c = '\f'; break; + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; + case 'v': c = '\v'; break; + case '\\': c = '\\'; break; + + default: + if(c != x) errorat(lx->pos, "unknown escape sequence: %c", c); + } + *val = c; + return 0; + +hex: + l = 0; + for(; i > 0; i--) { + c = getbyte(lx); + if (c >= '0' && c <= '9') { + l = l*16 + c-'0'; + continue; + } + if (c >= 'a' && c <= 'f') { + l = l*16 + c-'a' + 10; + continue; + } + if (c >= 'A' && c <= 'F') { + l = l*16 + c-'A' + 10; + continue; + } + errorat(lx->pos, "non-hex character in escape sequence: %c", c); + ungetbyte(lx); + break; + } + if (u && (l > RuneMax || (0xd800 <= l && l < 0xe000))) { + errorat(lx->pos, "invalid unicode code point in escape sequence: %#llx", l); + l = RuneErr; + } + *val = l; + return 0; + +oct: + l = c - '0'; + for (i = 2; i > 0; i--) { + c = getbyte(lx); + if (c >= '0' && c <= '7') { + l = l*8 + c-'0'; + continue; + } + errorat(lx->pos, "non-octal character in escape sequence: %c", c); + ungetbyte(lx); + } + if (l > 255) errorat(lx->pos, "octal escape value > 255: %d", l); + + *val = l; + return 0; } -void -lex(struct Lexer *lex) +#define CASE1(stmt1, kind1) \ + case stmt1: \ + tok.kind = kind1; \ + break; + +#define CASE2(stmt1, kind1, b1, kind2) \ + case stmt1: \ + tok.kind = kind1; \ + b = getbyte(lx); \ + if (b == b1) \ + tok.kind = kind2; \ + else \ + ungetbyte(lx); \ + break; + +#define CASE3(stmt1, kind1, b1, kind2, b2, kind3) \ + case stmt1: \ + tok.kind = kind1; \ + b = getbyte(lx); \ + if (b == b1) \ + tok.kind = kind2; \ + else if (b == b2) \ + tok.kind = kind3; \ + else \ + ungetbyte(lx); \ + break; + +#define CASE4(stmt1, kind1, b1, kind2, b2, kind3, b3, type4) \ + case stmt1: \ + tok.kind = kind1; \ + b = getbyte(lx); \ + if (b == b1) \ + tok.kind = kind2; \ + else if (b == b2) \ + tok.kind = kind3; \ + else if (b == b3) \ + tok.kind = type4; \ + else \ + ungetbyte(lx); \ + break; + + +Token +lex(Lexer *lx) { - int b; + int b, n, f; + vlong v; + uint u; + rune r; + string s; + double d; + byte *e; + Token tok; + Sym *sym; + Io *io; + +GetByte: + b = getbyte(lx); +Dispatch: + tok.pos.beg = lx->pos; + + if (b >= RuneSelf || isalpha(b)) + goto TAlpha; + if (isdigit(b)) + goto TNum; - b = getbyte(lex); -TOP: switch (b) { + case ' ': case '\n': case '\r': case '\t': case '\v': case '\f': + while (b = getbyte(lx), isspace(b)) + if (b == '\n') lx->pos.line++; + goto Dispatch; + + case '\'': + if (escape(lx, '\'', &f, &v)) { + errorat(lx->pos, "empty literal or escaped ' in char literal"); + v = '\''; + } + if (!escape(lx, '\'', &f, &v)) { + errorat(lx->pos, "missing '"); + ungetbyte(lx); + } + + if (v > 0xff) { + errorat(lx->pos, "overflowed character literal"); + v = 0; + } + tok.kind = Alit | Vchar; + tok.val.c = v; + break; + + case '"': + s = str·makecap("", 0, 8); + for (;;) { + if (escape(lx, '"', &f, &v)) + break; + + if (v < RuneSelf || f) + str·appendbyte(&s, v); + else { + r = v; + b = utf8·runelen(r); + utf8·runetochar(lx->buf, &r); + str·appendlen(&s, b, lx->buf); + } + } + tok.kind = Alit | Vstr; + tok.val.s = s; + intern(&tok.val.s); + + str·free(s); + break; + case '.': + tok.kind = Adot; + b = getbyte(lx); + + if (isdigit(b)) { + // *lx->b++ = b; + goto TFlt; + } else if (b == '.') { + b = getbyte(lx); + if (b != '.') { + errorat(lx->pos, "invalid token '..'"); + tok.kind = Aellip; + break; + } + } + ungetbyte(lx); + break; + + case '<': + tok.kind = Alt; + b = getbyte(lx); + + if (b == '<') { + tok.kind = Alsft; + b = getbyte(lx); + if (b == '=') + tok.kind = Alsftasn; + else + ungetbyte(lx); + } else if (b == '=') + tok.kind = Alteq; + else + ungetbyte(lx); + break; + + case '>': + tok.kind = Agt; + b = getbyte(lx); + + if (b == '>') { + tok.kind = Arsft; + b = getbyte(lx); + if (b == '=') + tok.kind = Arsftasn; + else + ungetbyte(lx); + } else if (b == '=') + tok.kind = Agteq; + else + ungetbyte(lx); + break; + + case '/': + tok.kind = Adiv; + b = getbyte(lx); + + if (b == '=') + tok.kind = Adivasn; + else if (b == '/') { + while (b != EOF && b != '\n') + b = getbyte(lx); + lx->pos.line++; + goto Dispatch; + } else if (b == '*') { + int level = 1; + b = getbyte(lx); + while (b != EOF && level > 0) { + if (b == '/') { + b = getbyte(lx); + if (b == '*') + level++; + } else if (b == '*') { + b = getbyte(lx); + if (b == '/') + level--; + } + if (b == '\n') lx->pos.line++; + b = getbyte(lx); + } + goto Dispatch; + } else + ungetbyte(lx); + break; + + case '#': + if (domacro(lx)) { + tok.kind = Anil; + errorat(lx->pos, "failed to perform preprocessor directive"); + return tok; + } + goto GetByte; + break; + + case EOF: + panicf("need to implement popio"); + + CASE1('(', Alparen) + CASE1(')', Arparen) + CASE1('{', Albrace) + CASE1('}', Arbrace) + CASE1('[', Albrakt) + CASE1(']', Arbrakt) + CASE1(',', Acomma) + CASE1('?', Aqmark) + CASE1(';', Asemi) + CASE1('~', Aneg) + CASE1(':', Acolon) + CASE2('^', Axor, '=', Axorasn) + CASE2('!', Anot, '=', Aneq) + CASE2('*', Astar,'=', Amulasn) + CASE2('=', Aasn, '=', Aeq) + CASE2('%', Amod, '=', Amodasn) + CASE3('+', Aadd, '=', Aaddasn, '+', Ainc) + CASE3('&', Aand, '=', Aandasn, '&', Aandand) + CASE3('|', Aor, '=', Aorasn, '|', Aoror) + CASE4('-', Asub, '=', Asubasn, '-', Adec, '>', Aarrow) + + default: + tok.kind = Anil; + errorat(lx->pos, "invalid token, crashing"); + abort(); } -} + + goto Return; + + TNum: + e = lx->buf + arrlen(lx->buf); + do { + if (lx->b >= e) { + errorat(lx->pos, "number overflows lexer buffer"); + goto Nospace; + } + *lx->b++ = b; + } while (b = getbyte(lx), isdigit(b) || b == '_'); + + if (b == '.' || tolower(b) == 'e') + goto TFlt; + TInt: + r = b; + n = 10; + s = lx->buf; + if (*s == '0') { + b = *++s; + switch (b) { + case 'x': n = 16; break; + case 'b': n = 2; break; + case 'o': n = 8; break; + default: --s; + } + if (s >= e) { + errorat(lx->pos, "number overflows lexer buffer"); + goto Nospace; + } + } + + v = 0; + for (; s != lx->b ; s++) { + b = *s; + if (b == '_') continue; + + f = Atoi[b]; + if (f == 0 && b != '0') + break; + + if (f >= n) { + errorat(lx->pos, "digit '%c' out of range for base %d", b, n); + f = 0; + } + + if (v > (UINT64_MAX - f) / n) { + errorat(lx->pos, "integer literal overflow"); + v = 0; + break; + } + + v = v * n + f; + } + b = r; + tok.kind = Alit | Vint; + tok.val.i = v; + /* TODO: Suffixes! + if (tolower(b) == 'u') { + tok.kind |= Vusgn; + b = getbyte(lx); + } + */ + goto Return; + + TFlt: + if (b == '.') { + *lx->b++ = b; + b = getbyte(lx); + } + + while (isdigit(b)) { + *lx->b++ = b; + + if (lx->b >= e) { + errorat(lx->pos, "number overflows lexer buffer"); + goto Nospace; + } + } + + if (tolower(b) == 'e') { + b = getbyte(lx); + if (b == '-' || b == '+') + b = getbyte(lx); + + if (!isdigit(b)) + errorat(lx->pos, "expected number after exponent, found %c", b); + + do { + *lx->b++ = b; + } while (b = getbyte(lx), isdigit(b)); + } + *lx->b = '\0'; + d = strtod(lx->buf, nil); + + tok.kind = Alit | Vfloat; + tok.val.f = d; + + goto Return; + + TAlpha: + u = b; + s = lx->buf; + e = lx->buf + arrlen(lx->buf); + for (;;) { + if (s >= e) { + errorat(lx->pos, "identifier too long for buffer: %s", s); + goto Nospace; + } + if (u >= RuneSelf) { + ungetbyte(lx); + r = getrune(lx); + if (!utf8·isletter(r) && !utf8·isdigit(r) && r != 0xb7) { + errorat(lx->pos, "invalid identifier character %d", r); + } + s += utf8·runetochar(s, &r); + } else if (!isalnum(u) && u != '_') + break; + else + *s++ = u; + u = getbyte(lx); + } + *s = '\0'; + tok.kind = Aident; + tok.val.s = lx->buf; + + n = intern(&tok.val.s); + if (n < arrlen(keywords)) { + tok.kind = Akeywd; + } + + sym = lookup(lx->sym, tok.val.s); + if (sym) { + io = makeio(); + io->buf.end += expandmacro(lx, sym, io->b); + pushio(lx, io); + goto GetByte; + } + +Return: + lx->b = lx->buf; + tok.pos.end = lx->pos; + return tok; + +Nospace: + panicf("aborting compilation"); +} + +#undef CASE4 +#undef CASE3 +#undef CASE2 +#undef CASE1 + +// ----------------------------------------------------------------------- +// push/pop io objects + +void +pushio(Lexer *lx, Io *new) +{ + new->link = lx->io; + lx->io->store = lx->pos; + lx->io = new; + + lx->pos = (Pos){ + .line = 0, + .col = 0, + .path = new->path, + }; +} + +void +popio(Lexer *lx) +{ + Io *prev; + + prev = lx->io->link; + if (!prev) { + panicf("no buffer left"); + } + + lx->pos = prev->store; + lx->io = prev; +} + +// ----------------------------------------------------------------------- +// symbol tables + +#define PTR_HASH(p) (uintptr)(p) +#define PTR_EQUAL(p1, p2) ((uintptr)(p1) == (uintptr)(p2)) + +struct SymTab +{ + MAP_STRUCT_BODY(string, Sym*); +}; + +Sym* +lookup(SymTab *tab, string ident) +{ + int idx; + MAP_GET(idx, tab, ident, PTR_HASH, PTR_EQUAL); + + if (idx < tab->n_buckets) + return tab->vals[idx]; + + return nil; +} + +static +int +moresymtab(SymTab *tab, int n) +{ + MAP_GROW(tab, string, Sym*, n, PTR_HASH, mem·sys.alloc, mem·sys.free, nil); +} + +static +int +putsym(SymTab *tab, Sym *sym, error *err) +{ + MAP_PUT(tab, sym->name, sym, PTR_HASH, PTR_EQUAL, moresymtab, err); +} + +Sym* +define(SymTab *tab, string name, int kind) +{ + Sym *sym; + error err; + + sym = mem·arenaalloc(C.heap, 1, sizeof(*sym)); + sym->name = name; + sym->kind = kind; + + putsym(tab, sym, &err); + + return sym; +} + +// ----------------------------------------------------------------------- +// error reporting + +void +errorat(Pos x, byte *fmt, ...) +{ + va_list args; + va_start(args, fmt); + + printf("error %d: ", x.line); + + vprintf(fmt, args); + va_end(args); +} + -- cgit v1.2.1