From 732c0f6ed6bc072638a66e8ed6802968e18ceb98 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Tue, 19 May 2020 12:13:05 -0700 Subject: feat: cleaned up character lexing --- sys/cmd/cc/cc.h | 18 ++++---- sys/cmd/cc/lex.c | 133 +++++++++++++++++++++++++++++++++++-------------------- sys/cmd/cc/pp.c | 8 ++-- 3 files changed, 101 insertions(+), 58 deletions(-) (limited to 'sys') diff --git a/sys/cmd/cc/cc.h b/sys/cmd/cc/cc.h index 0e525be..f5d3d28 100644 --- a/sys/cmd/cc/cc.h +++ b/sys/cmd/cc/cc.h @@ -179,12 +179,13 @@ enum NUM_TOKENS, Vchar = iota(8), - Vint = iota(9), - Vlong = iota(10), - Vvlong = iota(11), - Vusgn = iota(12), - Vfloat = iota(13), - Vstr = iota(14), + Vrune = iota(9), + Vint = iota(10), + Vlong = iota(11), + Vvlong = iota(12), + Vun = iota(13), + Vfloat = iota(14), + Vstr = iota(15), Vmask = Vchar - 1, }; @@ -202,8 +203,9 @@ struct Token double f; vlong i; uvlong ui; - byte c; - ubyte uc; + int32 c; + uint32 uc; + rune r; } val; }; diff --git a/sys/cmd/cc/lex.c b/sys/cmd/cc/lex.c index 20c6f9c..03aba4b 100644 --- a/sys/cmd/cc/lex.c +++ b/sys/cmd/cc/lex.c @@ -82,7 +82,7 @@ static uint8 Atoi[256] = static error -escape(Lexer *lx, int x, int *flag, vlong *val) +escapechar(Lexer *lx, int x, int islong, int esc, vlong *val) { int i, u, c; vlong l; @@ -90,14 +90,14 @@ escape(Lexer *lx, int x, int *flag, vlong *val) c = getrune(lx); switch (c) { + case '\\': + break; case EOF: errorat(lx->pos, "EOF in string"); return 1; case '\n': errorat(lx->pos, "newline in string"); return 1; - case '\\': - break; default: if (c == x) return 1; @@ -110,12 +110,11 @@ escape(Lexer *lx, int x, int *flag, vlong *val) switch(c) { case 'x': - i = 2; - *flag = 1; + i = islong ? 4 : 2; goto hex; case 'u': - i = 4; + i = islong ? 8 : 4; u = 1; goto hex; @@ -126,17 +125,17 @@ escape(Lexer *lx, int x, int *flag, vlong *val) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': - *flag = 1; + i = islong ? 4 : 2; goto oct; - case 'a': c = '\a'; break; - case 'b': c = '\b'; break; - case 'f': c = '\f'; break; - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - case 'v': c = '\v'; break; - case '\\': c = '\\'; break; + case 'a': c = '\a'; break; + case 'b': c = '\b'; break; + case 'f': c = '\f'; break; + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; + case 'v': c = '\v'; break; + case '\\':c = '\\'; break; default: if(c != x) errorat(lx->pos, "unknown escape sequence: %c", c); @@ -160,7 +159,6 @@ hex: l = l*16 + c-'A' + 10; continue; } - errorat(lx->pos, "non-hex character in escape sequence: %c", c); ungetbyte(lx); break; } @@ -169,22 +167,25 @@ hex: l = RuneErr; } *val = l; + if (esc) + *val |= RuneMask + 1; return 0; oct: l = c - '0'; - for (i = 2; i > 0; i--) { + for (; i > 0; i--) { c = getbyte(lx); if (c >= '0' && c <= '7') { l = l*8 + c-'0'; continue; } - errorat(lx->pos, "non-octal character in escape sequence: %c", c); ungetbyte(lx); } if (l > 255) errorat(lx->pos, "octal escape value > 255: %d", l); *val = l; + if (esc) + *val |= RuneMask + 1; return 0; } @@ -234,7 +235,7 @@ Token lex(Lexer *lx) { int b, n, f; - vlong v; + vlong v, _; uint u; rune r; string s; @@ -249,10 +250,28 @@ GetByte: Dispatch: tok.pos.beg = lx->pos; - if (b >= RuneSelf || isalpha(b) || b == '_') - goto TAlpha; + if (b >= RuneSelf || b == '_') + goto Talpha; + if (isalpha(b)) { + goto Talpha; + + n = b; + b = getbyte(lx); + if (b == '\'') { + if (escapechar(lx, '\'', 1, 0, &v)) + b = '\''; + if (!escapechar(lx, '\'', 1, 0, &_)) { + errorat(lx->pos, "missing ' at end of character constant"); + } + tok.kind = Alit | Vrune; + tok.val.r = v; + goto Return; + } + ungetbyte(lx); + b = n; + } if (isdigit(b)) - goto TNum; + goto Tnum; switch (b) { case '\n': @@ -263,12 +282,13 @@ Dispatch: lx->pos.line++; goto Dispatch; + Tchar: case '\'': - if (escape(lx, '\'', &f, &v)) { + if (escapechar(lx, '\'', 0, 0, &v)) { errorat(lx->pos, "empty literal or escaped ' in char literal"); v = '\''; } - if (!escape(lx, '\'', &f, &v)) { + if (!escapechar(lx, '\'', 0, 0, &_)) { errorat(lx->pos, "missing '"); ungetbyte(lx); } @@ -279,15 +299,15 @@ Dispatch: } tok.kind = Alit | Vchar; tok.val.c = v; - break; + break; case '"': s = str·makecap("", 0, 8); for (;;) { - if (escape(lx, '"', &f, &v)) + if (escapechar(lx, '"', 0, 1, &v)) break; - if (v < RuneSelf || f) + if (v & (RuneMask + 1)) str·appendbyte(&s, v); else { r = v; @@ -301,7 +321,7 @@ Dispatch: intern(&tok.val.s); str·free(s); - break; + break; case '.': tok.kind = Adot; @@ -309,7 +329,7 @@ Dispatch: if (isdigit(b)) { // *lx->b++ = b; - goto TFlt; + goto Tflt; } else if (b == '.') { b = getbyte(lx); if (b != '.') { @@ -319,7 +339,7 @@ Dispatch: } } ungetbyte(lx); - break; + break; case '<': tok.kind = Alt; @@ -336,7 +356,7 @@ Dispatch: tok.kind = Alteq; else ungetbyte(lx); - break; + break; case '>': tok.kind = Agt; @@ -353,7 +373,7 @@ Dispatch: tok.kind = Agteq; else ungetbyte(lx); - break; + break; case '/': tok.kind = Adiv; @@ -385,7 +405,7 @@ Dispatch: goto Dispatch; } else ungetbyte(lx); - break; + break; case '#': if (domacro(lx)) { @@ -394,7 +414,6 @@ Dispatch: return tok; } goto GetByte; - break; case EOF: popio(lx); @@ -432,7 +451,7 @@ Dispatch: goto Return; - TNum: + Tnum: e = lx->buf + arrlen(lx->buf); do { if (lx->b >= e) { @@ -443,12 +462,11 @@ Dispatch: } while (b = getbyte(lx), isdigit(b) || b == '_'); if (b == '.' || tolower(b) == 'e') - goto TFlt; - TInt: + goto Tflt; + Tint: r = b; n = 10; s = lx->buf; - ungetbyte(lx); if (*s == '0') { b = *++s; switch (b) { @@ -485,18 +503,39 @@ Dispatch: v = v * n + f; } + b = r; - tok.kind = Alit | Vint; + tok.kind = Alit; tok.val.i = v; - /* TODO: Suffixes! - if (tolower(b) == 'u') { - tok.kind |= Vusgn; + + if (b == 'u' || b == 'U') { + tok.kind |= Vun; b = getbyte(lx); } - */ - goto Return; + if (b == 'l' || b == 'L') { + r = getbyte(lx); + if (r == 'l' || r == 'L') { + if (r != b) + errorat(lx->pos, "mismatched case on long long integer suffix"); + tok.kind |= Vvlong; + r = getbyte(lx); + } else + tok.kind |= Vlong; + + if (r == 'u' || r == 'U') { + if (tok.kind & Vun) + errorat(lx->pos, "multiple unsigned designators on integer suffix"); + tok.kind |= Vun; + goto Return; + } + ungetbyte(lx); + } else + tok.kind |= Vint; + + ungetbyte(lx); + goto Return; - TFlt: + Tflt: if (b == '.') { *lx->b++ = b; b = getbyte(lx); @@ -530,9 +569,9 @@ Dispatch: tok.kind = Alit | Vfloat; tok.val.f = d; - goto Return; + goto Return; - TAlpha: + Talpha: u = b; s = lx->buf; e = lx->buf + arrlen(lx->buf); diff --git a/sys/cmd/cc/pp.c b/sys/cmd/cc/pp.c index 30bb3d4..8de4792 100644 --- a/sys/cmd/cc/pp.c +++ b/sys/cmd/cc/pp.c @@ -62,7 +62,7 @@ defmacro(Lexer *lx, string name, string macro) { Sym *mac; - printf("DEFINING MACRO %s, ON LINE %d\n", name, lx->pos.line); + printf("DEFINING MACRO %s = %s, ON LINE %d\n", name, macro+1, lx->pos.line); mac = define(&lx->sym, name, Smacro); mac->macro = macro; @@ -108,12 +108,14 @@ opand(Lexer *lx) case Alit: switch (tok.kind & ~Vmask) { - case Vint: + case Vint: case Vlong: case Vvlong: return tok.val.i; + case Vint | Vun: case Vlong | Vun: case Vvlong | Vun: + return tok.val.ui; case Vchar: return tok.val.c; default: - errorat(lx->pos, "invalid literal of type '%d' in conditional macro", tok.kind & ~Vmask); + errorat(lx->pos, "invalid literal of type '%s' in conditional macro", tokens[tok.kind & ~Vmask]); return 0; } -- cgit v1.2.1