aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/cc/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/cmd/cc/lex.c')
-rw-r--r--src/cmd/cc/lex.c873
1 files changed, 873 insertions, 0 deletions
diff --git a/src/cmd/cc/lex.c b/src/cmd/cc/lex.c
new file mode 100644
index 0000000..33fc5d0
--- /dev/null
+++ b/src/cmd/cc/lex.c
@@ -0,0 +1,873 @@
+#include "cc.h"
+#include <libn/macro/map.h>
+
+// -----------------------------------------------------------------------
+// printing functions
+
+void
+puttok(Token tok)
+{
+ if (tok.kind < Alit)
+ printf("%s", tokens[tok.kind]);
+ else if (tok.kind & Alit) {
+ if (tok.kind & Vchar)
+ if (tok.kind & Vint)
+ if (tok.kind & Vlong)
+ if (tok.kind & Vvlong)
+ printf("literal <%lld>", tok.val.i);
+ if (tok.kind & Vfloat)
+ printf("literal <%f>", tok.val.f);
+ printf("literal <%s>", tok.val.s);
+ } else
+ printf("ident <%s>", tok.val.s);
+}
+
+// -----------------------------------------------------------------------
+// io buffer management
+
+#define asrdr(x) (io·Reader){(int (*)(void *, int, int, void *))x}
+
+// path should be absolute
+Io*
+openio(Lexer *lx, byte *path)
+{
+ string *it, *end;
+
+ intern(&path);
+
+ // See if we have already opened file;
+ // If so, and it hasn't been flagged return it
+ for (it = lx->omit.path, end = it + lx->omit.len; it < end; ++it) {
+ if ((uintptr)(*it) == (uintptr)(path))
+ return nil;
+ }
+
+ // TODO: See if we have already loaded the file
+
+ if ((lx->new - lx->iostk) >= arrlen(lx->iostk)-1)
+ panicf("out of I/O space!");
+
+ lx->new->f = io·open(path, "r");
+ if (!lx->new->f)
+ panicf("file %s not found", path);
+
+ lx->new->kind = IOfile;
+ lx->new->path = path;
+ bufio·initreader(&lx->new->rdr, asrdr(io·read), lx->new->f);
+
+ return lx->new++;
+}
+
+static
+Io*
+makeio(Lexer *lx, byte *name)
+{
+ if ((lx->new - lx->iostk) >= arrlen(lx->iostk)-1)
+ panicf("out of I/O space!");
+
+ lx->new->path = name;
+ lx->new->rdr = (io·Buffer) {
+ .state = bufio·rdr | bufio·end,
+ .runesize = 0,
+ .h = nil,
+ .size = bufio·size,
+ .beg = lx->new->rdr.buf + bufio·ungets,
+ .pos = lx->new->rdr.buf + bufio·ungets,
+ .end = lx->new->rdr.buf + bufio·ungets,
+ };
+ lx->new->b = lx->new->rdr.beg;
+
+ return lx->new++;
+}
+#undef asrdr
+
+static
+void
+freeio(Lexer *lx, Io *io)
+{
+ if (io->kind & IOfile) {
+ io·close(io->f);
+ }
+
+ io->rdr.state = 0;
+ io->kind = 0;
+ io->link = nil;
+ io->path = nil;
+ io->store = (Pos){ 0 };
+ io->path = "<empty>";
+}
+
+void
+pushio(Lexer *lx, Io *new)
+{
+ new->link = lx->io;
+ lx->io->store = lx->pos;
+ lx->io = new;
+
+ lx->pos = (Pos){
+ .line = 1,
+ .col = 1,
+ .path = new->path,
+ };
+}
+
+void
+popio(Lexer *lx)
+{
+ Io *prev;
+
+ assert(lx->io == lx->new-1);
+ --lx->new;
+
+ prev = lx->io->link;
+ freeio(lx, lx->io);
+
+ lx->io = prev;
+ if (!prev) {
+ return;
+ }
+
+ lx->pos = prev->store;
+}
+
+// -----------------------------------------------------------------------
+// simple wrappers
+
+int
+getbyte(Lexer *lx)
+{
+ return bufio·getbyte(&lx->io->rdr);
+}
+
+int
+getnsbyte(Lexer *lx)
+{
+ int b;
+ b = getbyte(lx);
+ for (;;) {
+ if (b == EOF) {
+ if (lx->io->link) {
+ popio(lx);
+ assert(lx->io);
+ b = getbyte(lx);
+ continue;
+ } else
+ return b;
+ }
+ if (b >= RuneSelf || !isspace(b))
+ return b;
+ if (b == '\n')
+ return b;
+ b = getbyte(lx);
+ }
+ return b;
+}
+
+rune
+getrune(Lexer *lx)
+{
+ return bufio·getrune(&lx->io->rdr);
+}
+
+byte
+ungetbyte(Lexer *lx)
+{
+ byte b;
+ return bufio·ungetbyte(&lx->io->rdr, b);
+}
+
+rune
+ungetrune(Lexer *l, rune r)
+{
+ return bufio·ungetrune(&l->io->rdr, r);
+}
+
+// -----------------------------------------------------------------------
+// main lexer
+
+#define TOK(a, b) b,
+byte *tokens[NUM_TOKENS] = { TOKENS };
+#undef TOK
+
+static uint8 Atoi[256] =
+{
+ ['0'] = 0, ['1'] = 1, ['2'] = 2, ['3'] = 3, ['4'] = 4, ['5'] = 5,
+ ['6'] = 6, ['7'] = 7, ['8'] = 8, ['9'] = 9, ['a'] = 10, ['A'] = 10,
+ ['b'] = 11, ['B'] = 11, ['c'] = 12, ['C'] = 12, ['d'] = 13, ['D'] = 13,
+ ['e'] = 14, ['E'] = 14, ['f'] = 15, ['F'] = 15,
+};
+
+static
+error
+escapechar(Lexer *lx, int x, int islong, int esc, vlong *val)
+{
+ int i, u, c;
+ vlong l;
+
+ c = getrune(lx);
+
+ switch (c) {
+ case '\\':
+ break;
+ case EOF:
+ errorat(lx->pos, "EOF in string");
+ return 1;
+ case '\n':
+ errorat(lx->pos, "newline in string");
+ return 1;
+ default:
+ if (c == x)
+ return 1;
+ *val = c;
+ return 0;
+ }
+
+ u = 0;
+ c = getrune(lx);
+
+ switch(c) {
+ case 'x':
+ i = islong ? 4 : 2;
+ goto hex;
+
+ case 'u':
+ i = islong ? 8 : 4;
+ u = 1;
+ goto hex;
+
+ case 'U':
+ i = 8;
+ u = 1;
+ goto hex;
+
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ i = islong ? 4 : 2;
+ goto oct;
+
+ case 'a': c = '\a'; break;
+ case 'b': c = '\b'; break;
+ case 'f': c = '\f'; break;
+ case 'n': c = '\n'; break;
+ case 'r': c = '\r'; break;
+ case 't': c = '\t'; break;
+ case 'v': c = '\v'; break;
+ case '\\':c = '\\'; break;
+
+ default:
+ if(c != x) errorat(lx->pos, "unknown escape sequence: %c", c);
+ }
+ *val = c;
+ return 0;
+
+hex:
+ l = 0;
+ for(; i > 0; i--) {
+ c = getbyte(lx);
+ if (c >= '0' && c <= '9') {
+ l = l*16 + c-'0';
+ continue;
+ }
+ if (c >= 'a' && c <= 'f') {
+ l = l*16 + c-'a' + 10;
+ continue;
+ }
+ if (c >= 'A' && c <= 'F') {
+ l = l*16 + c-'A' + 10;
+ continue;
+ }
+ ungetbyte(lx);
+ break;
+ }
+ if (u && (l > RuneMax || (0xd800 <= l && l < 0xe000))) {
+ errorat(lx->pos, "invalid unicode code point in escape sequence: %#llx", l);
+ l = RuneErr;
+ }
+ *val = l;
+ if (esc)
+ *val |= RuneMask + 1;
+ return 0;
+
+oct:
+ l = c - '0';
+ for (; i > 0; i--) {
+ c = getbyte(lx);
+ if (c >= '0' && c <= '7') {
+ l = l*8 + c-'0';
+ continue;
+ }
+ ungetbyte(lx);
+ break;
+ }
+ if (l > 255) errorat(lx->pos, "octal escape value > 255: %d", l);
+
+ *val = l;
+ if (esc)
+ *val |= RuneMask + 1;
+ return 0;
+}
+
+#define CASE1(stmt1, kind1) \
+ case stmt1: \
+ tok.kind = kind1; \
+ goto Return
+
+#define CASE2(stmt1, kind1, b1, kind2) \
+ case stmt1: \
+ tok.kind = kind1; \
+ b = getbyte(lx); \
+ if (b == b1) \
+ tok.kind = kind2; \
+ else \
+ ungetbyte(lx); \
+ goto Return
+
+#define CASE3(stmt1, kind1, b1, kind2, b2, kind3) \
+ case stmt1: \
+ tok.kind = kind1; \
+ b = getbyte(lx); \
+ if (b == b1) \
+ tok.kind = kind2; \
+ else if (b == b2) \
+ tok.kind = kind3; \
+ else \
+ ungetbyte(lx); \
+ goto Return
+
+#define CASE4(stmt1, kind1, b1, kind2, b2, kind3, b3, type4) \
+ case stmt1: \
+ tok.kind = kind1; \
+ b = getbyte(lx); \
+ if (b == b1) \
+ tok.kind = kind2; \
+ else if (b == b2) \
+ tok.kind = kind3; \
+ else if (b == b3) \
+ tok.kind = type4; \
+ else \
+ ungetbyte(lx); \
+ goto Return
+
+
+Token
+lex(Lexer *lx)
+{
+ int b, n, f;
+ vlong v, _;
+ rune r;
+ string s;
+ double d;
+ byte *e;
+ Token tok;
+ Sym *sym;
+ Io *io;
+
+GetByte:
+ b = getbyte(lx);
+Dispatch:
+ tok.pos = lx->pos;
+
+ if ((b != EOF && b >= RuneSelf) || b == '_')
+ goto Talpha;
+ if (isalpha(b)) {
+ if (b != 'L')
+ goto Talpha;
+
+ n = b;
+ b = getbyte(lx);
+ if (b == '\'') {
+ if (escapechar(lx, '\'', 1, 0, &v))
+ b = '\'';
+ if (!escapechar(lx, '\'', 1, 0, &_)) {
+ errorat(lx->pos, "missing ' at end of character constant");
+ }
+ tok.kind = Alit | Vrune;
+ tok.val.r = v;
+ goto Return;
+ }
+ if (b == '"')
+ goto TLstr;
+ ungetbyte(lx);
+ b = n;
+
+ goto Talpha;
+ }
+ if (isdigit(b))
+ goto Tnum;
+
+ switch (b) {
+ case '\n':
+ lx->pos.line++;
+ case ' ': case '\r': case '\t': case '\v': case '\f':
+ while (b = getbyte(lx), isspace(b))
+ if (b == '\n')
+ lx->pos.line++;
+ goto Dispatch;
+
+ case '\\':
+ b = getbyte(lx);
+ if (b != '\n')
+ errorat(lx->pos, "'\\' without a trailing newline");
+ goto GetByte;
+
+ Tchar:
+ case '\'':
+ if (escapechar(lx, '\'', 0, 0, &v)) {
+ errorat(lx->pos, "empty literal or escaped ' in char literal");
+ v = '\'';
+ }
+ if (!escapechar(lx, '\'', 0, 0, &_)) {
+ errorat(lx->pos, "missing '");
+ ungetbyte(lx);
+ }
+
+ if (v > 0xff) {
+ errorat(lx->pos, "overflowed character literal");
+ v = 0;
+ }
+ tok.kind = Alit | Vchar;
+ tok.val.c = v;
+ goto Return;
+
+ case '"':
+ s = str·makecap("", 0, 8);
+ for (;;) {
+ if (escapechar(lx, '"', 0, 1, &v))
+ break;
+
+ if (v & (RuneMask + 1))
+ str·appendbyte(&s, v);
+ else {
+ r = v;
+ b = utf8·runelen(r);
+ utf8·runetobyte(lx->buf, &r);
+ str·appendlen(&s, b, lx->buf);
+ }
+ }
+ tok.kind = Alit | Vstr;
+ tok.val.s = s;
+ intern(&tok.val.s);
+
+ str·free(s);
+ goto Return;
+
+ TLstr:
+ s = str·makecap("", 0, 8);
+ // NOTE: this violates strict aliasing
+ for (;;) {
+ if (escapechar(lx, '"', 1, 0, &v))
+ break;
+ str·appendlen(&s, sizeof(wchar_t), (byte*)&v);
+ }
+ tok.kind = Alit | Vwstr;
+ tok.val.s = s;
+ intern(&tok.val.s);
+
+ str·free(s);
+ goto Return;
+
+ case '.':
+ tok.kind = Adot;
+ b = getbyte(lx);
+
+ if (isdigit(b)) {
+ // *lx->b++ = b;
+ goto Tflt;
+ } else if (b == '.') {
+ b = getbyte(lx);
+ if (b != '.') {
+ errorat(lx->pos, "invalid token '..'");
+ tok.kind = Aellip;
+ break;
+ }
+ }
+ ungetbyte(lx);
+ goto Return;
+
+ case '<':
+ tok.kind = Alt;
+ b = getbyte(lx);
+
+ if (b == '<') {
+ tok.kind = Alsft;
+ b = getbyte(lx);
+ if (b == '=')
+ tok.kind = Alsftasn;
+ else
+ ungetbyte(lx);
+ } else if (b == '=')
+ tok.kind = Alteq;
+ else
+ ungetbyte(lx);
+ goto Return;
+
+ case '>':
+ tok.kind = Agt;
+ b = getbyte(lx);
+
+ if (b == '>') {
+ tok.kind = Arsft;
+ b = getbyte(lx);
+ if (b == '=')
+ tok.kind = Arsftasn;
+ else
+ ungetbyte(lx);
+ } else if (b == '=')
+ tok.kind = Agteq;
+ else
+ ungetbyte(lx);
+ goto Return;
+
+ case '/':
+ tok.kind = Adiv;
+ b = getbyte(lx);
+
+ if (b == '=')
+ tok.kind = Adivasn;
+ else if (b == '/') {
+ while (b != EOF && b != '\n')
+ b = getbyte(lx);
+ goto Dispatch;
+ } else if (b == '*') {
+ int level = 1;
+ b = getbyte(lx);
+ while (b != EOF && level > 0) {
+ if (b == '/') {
+ b = getbyte(lx);
+ if (b == '*')
+ level++;
+ } else if (b == '*') {
+ b = getbyte(lx);
+ if (b == '/')
+ level--;
+ }
+ if (b == '\n')
+ lx->pos.line++;
+ b = getbyte(lx);
+ }
+ goto Dispatch;
+ } else
+ ungetbyte(lx);
+ goto Return;
+
+ case '#':
+ if (domacro(lx)) {
+ tok.kind = Anil;
+ errorat(lx->pos, "failed to perform preprocessor directive");
+ return tok;
+ }
+ goto GetByte;
+
+ case EOF:
+ popio(lx);
+ if (lx->io)
+ goto GetByte;
+ tok.kind = Aeof;
+ goto Return;
+
+ CASE1('(', Alparen);
+ CASE1(')', Arparen);
+ CASE1('{', Albrace);
+ CASE1('}', Arbrace);
+ CASE1('[', Albrakt);
+ CASE1(']', Arbrakt);
+ CASE1(',', Acomma);
+ CASE1('?', Aqmark);
+ CASE1(';', Asemi);
+ CASE1('~', Aneg);
+ CASE1(':', Acolon);
+ CASE2('^', Axor, '=', Axorasn);
+ CASE2('!', Anot, '=', Aneq);
+ CASE2('*', Astar,'=', Amulasn);
+ CASE2('=', Aasn, '=', Aeq);
+ CASE2('%', Amod, '=', Amodasn);
+ CASE3('+', Aadd, '=', Aaddasn, '+', Ainc);
+ CASE3('&', Aand, '=', Aandasn, '&', Aandand);
+ CASE3('|', Aor, '=', Aorasn, '|', Aoror);
+ CASE4('-', Asub, '=', Asubasn, '-', Adec, '>', Aarrow);
+
+ Tnum:
+ e = lx->buf + arrlen(lx->buf);
+ do {
+ if (lx->b >= e) {
+ errorat(lx->pos, "number overflows lexer buffer");
+ goto Nospace;
+ }
+ *lx->b++ = b;
+ } while (b = getbyte(lx), isdigit(b) || b == '_');
+
+ if (b == '.' || tolower(b) == 'e')
+ goto Tflt;
+ Tint:
+ n = 10;
+ s = lx->buf;
+ if (*s == '0') {
+ switch (b) {
+ case 'x': n = 16; break;
+ case 'b': n = 2; break;
+ case 'o': n = 8; break;
+ default: goto Rint;
+ }
+ lx->b = s;
+ /* reparse number, now with base info */
+ while (b = getbyte(lx), (isdigit(b) ||
+ ('a' <= b && b <= 'f') ||
+ ('A' <= b && b <= 'F') ||
+ b == '_'))
+ *lx->b++ = b;
+ }
+ Rint:
+ v = 0;
+ r = b;
+ for (; s != lx->b ; s++) {
+ b = *s;
+ if (b == '_') continue;
+
+ f = Atoi[b];
+ if (f == 0 && b != '0')
+ break;
+
+ if (f >= n) {
+ errorat(lx->pos, "digit '%c' out of range for base %d", b, n);
+ f = 0;
+ }
+
+ if (v > (UINT64_MAX - f) / n) {
+ errorat(lx->pos, "integer literal overflow");
+ v = 0;
+ break;
+ }
+
+ v = v * n + f;
+ }
+
+ b = r;
+ tok.kind = Alit;
+ tok.val.i = v;
+
+ if (b == 'u' || b == 'U') {
+ tok.kind |= Vun;
+ b = getbyte(lx);
+ }
+ if (b == 'l' || b == 'L') {
+ r = getbyte(lx);
+ if (r == 'l' || r == 'L') {
+ if (r != b)
+ errorat(lx->pos, "mismatched case on long long integer suffix");
+ tok.kind |= Vvlong;
+ r = getbyte(lx);
+ } else
+ tok.kind |= Vlong;
+
+ if (r == 'u' || r == 'U') {
+ if (tok.kind & Vun)
+ errorat(lx->pos, "multiple unsigned designators on integer suffix");
+ tok.kind |= Vun;
+ goto Return;
+ }
+
+ ungetbyte(lx);
+ goto Return;
+ }
+
+ tok.kind |= Vint;
+ ungetbyte(lx);
+ goto Return;
+
+ Tflt:
+ if (b == '.') {
+ *lx->b++ = b;
+ b = getbyte(lx);
+ }
+
+ while (isdigit(b)) {
+ *lx->b++ = b;
+
+ if (lx->b >= e) {
+ errorat(lx->pos, "number overflows lexer buffer");
+ goto Nospace;
+ }
+ }
+
+ if (tolower(b) == 'e') {
+ b = getbyte(lx);
+ if (b == '-' || b == '+')
+ b = getbyte(lx);
+
+ if (!isdigit(b))
+ errorat(lx->pos, "expected number after exponent, found %c", b);
+
+ do {
+ *lx->b++ = b;
+ } while (b = getbyte(lx), isdigit(b));
+ }
+ *lx->b = '\0';
+ d = strtod(lx->buf, nil);
+ ungetbyte(lx);
+
+ tok.kind = Alit | Vfloat;
+ tok.val.f = d;
+
+ goto Return;
+
+ Talpha:
+ s = lx->buf;
+ e = lx->buf + arrlen(lx->buf);
+ for (;;) {
+ if (s >= e) {
+ errorat(lx->pos, "identifier too long for buffer: %s", s);
+ goto Nospace;
+ }
+ if (b != EOF && b >= RuneSelf) {
+ ungetbyte(lx);
+ r = getrune(lx);
+ if (!utf8·isletter(r) && !utf8·isdigit(r) && r != 0xb7) {
+ errorat(lx->pos, "invalid identifier character %d", r);
+ }
+ s += utf8·runetobyte(s, &r);
+ } else if (!isalnum(b) && b != '_')
+ break;
+ else
+ *s++ = b;
+ b = getbyte(lx);
+ }
+ *s = '\0';
+ ungetbyte(lx);
+
+ tok.kind = Aident;
+ tok.val.s = lx->buf;
+
+ n = intern(&tok.val.s);
+ if (n < arrlen(keywords)) {
+ tok.kind = Akeywd;
+ tok.val.i = n;
+ goto Return;
+ }
+
+ sym = lookup(&lx->sym, tok.val.s);
+ if (sym && ((uintptr)sym->name != (uintptr)lx->io->path)) {
+ if ((uintptr)sym == lx->macline) {
+ tok.kind = Alit | Vint;
+ tok.val.i = lx->pos.line;
+ goto Return;
+ }
+ if ((uintptr)sym == lx->macfile) {
+ tok.kind = Alit | Vstr;
+ tok.val.s = lx->pos.path;
+ goto Return;
+ }
+ io = makeio(lx, sym->name);
+ io->rdr.end += expandmacro(lx, sym, io->b);
+ printf("EXPANDED %s: %s\n", sym->name, io->rdr.beg);
+ *io->rdr.end++ = EOF;
+ pushio(lx, io);
+ goto GetByte;
+ }
+ goto Return;
+
+ default:
+ tok.kind = Anil;
+ errorat(lx->pos, "invalid token, crashing");
+ abort();
+ }
+
+Return:
+ lx->b = lx->buf;
+ return tok;
+
+Nospace:
+ panicf("aborting compilation");
+ exit(1);
+}
+
+#undef CASE4
+#undef CASE3
+#undef CASE2
+#undef CASE1
+
+// -----------------------------------------------------------------------
+// symbol tables
+
+#define PTR_HASH(p) (uintptr)(p)
+#define PTR_EQUAL(p1, p2) ((uintptr)(p1) == (uintptr)(p2))
+
+static
+void
+·free(void* _, void* ptr) {
+ return free(ptr);
+}
+
+static
+void *
+·alloc(void* _, uint n, ulong size) {
+ return malloc(n*size);
+}
+
+static
+void *
+·calloc(void* _, uint n, ulong size) {
+ return calloc(n, size);
+}
+
+static
+int
+moresymtab(SymTab *tab, int n)
+{
+ MAP_GROW(tab, string, Sym*, n, PTR_HASH, sys·Memory, nil);
+}
+
+static
+int
+putsym(SymTab *tab, Sym *sym, error *err)
+{
+ MAP_PUT(tab, sym->name, sym, PTR_HASH, PTR_EQUAL, moresymtab, err);
+}
+
+Sym*
+define(SymTab *tab, string name, uint32 kind)
+{
+ int i;
+ Sym *sym;
+ error err;
+
+ sym = mem·arenaalloc(C.heap, 1, sizeof(*sym));
+ sym->name = name;
+ sym->kind = kind;
+
+ i = putsym(tab, sym, &err);
+ tab->vals[i] = sym;
+
+ return sym;
+}
+
+Sym*
+lookup(SymTab *tab, string ident)
+{
+ int idx;
+ MAP_GET(idx, tab, ident, PTR_HASH, PTR_EQUAL);
+
+ if (idx < tab->n_buckets)
+ return tab->vals[idx];
+
+ return nil;
+}
+
+
+error
+forget(SymTab *tab, string ident)
+{
+ int idx;
+ MAP_GET(idx, tab, ident, PTR_HASH, PTR_EQUAL);
+
+ if (idx < tab->n_buckets) {
+ MAP_DEL(tab, idx);
+ return 0;
+ }
+ return 1;
+}
+
+void
+forgetall(SymTab *tab)
+{
+ MAP_RESET(tab);
+}