about | summary | refs | log | tree | commit | diff
path: root/sys
diff options
context:
space:
mode:
author Nicholas Noll <nbnoll@eml.cc> 2020-05-19 12:13:05 -0700
committer Nicholas Noll <nbnoll@eml.cc> 2020-05-19 12:13:05 -0700
commit 732c0f6ed6bc072638a66e8ed6802968e18ceb98 (patch)
tree 8bfc25aff1903f452006f31ff6a2f906303e2017 /sys
parent 5c3114c81477298e9eb78ed3a070752f8d24b8a5 (diff)
feat: cleaned up character lexing
Diffstat (limited to 'sys')
-rw-r--r-- sys/cmd/cc/cc.h 18
-rw-r--r-- sys/cmd/cc/lex.c 133
-rw-r--r-- sys/cmd/cc/pp.c 8
3 files changed, 101 insertions, 58 deletions
diff --git a/sys/cmd/cc/cc.h b/sys/cmd/cc/cc.h
index 0e525be..f5d3d28 100644
--- a/sys/cmd/cc/cc.h
+++ b/sys/cmd/cc/cc.h
@@ -179,12 +179,13 @@ enum
NUM_TOKENS,
Vchar = iota(8),
- Vint = iota(9),
- Vlong = iota(10),
- Vvlong = iota(11),
- Vusgn = iota(12),
- Vfloat = iota(13),
- Vstr = iota(14),
+ Vrune = iota(9),
+ Vint = iota(10),
+ Vlong = iota(11),
+ Vvlong = iota(12),
+ Vun = iota(13),
+ Vfloat = iota(14),
+ Vstr = iota(15),
Vmask = Vchar - 1,
};
@@ -202,8 +203,9 @@ struct Token
double f;
vlong i;
uvlong ui;
- byte c;
- ubyte uc;
+ int32 c;
+ uint32 uc;
+ rune r;
} val;
};
diff --git a/sys/cmd/cc/lex.c b/sys/cmd/cc/lex.c
index 20c6f9c..03aba4b 100644
--- a/sys/cmd/cc/lex.c
+++ b/sys/cmd/cc/lex.c
@@ -82,7 +82,7 @@ static uint8 Atoi[256] =
static
error
-escape(Lexer *lx, int x, int *flag, vlong *val)
+escapechar(Lexer *lx, int x, int islong, int esc, vlong *val)
{
int i, u, c;
vlong l;
@@ -90,14 +90,14 @@ escape(Lexer *lx, int x, int *flag, vlong *val)
c = getrune(lx);
switch (c) {
+ case '\\':
+ break;
case EOF:
errorat(lx->pos, "EOF in string");
return 1;
case '\n':
errorat(lx->pos, "newline in string");
return 1;
- case '\\':
- break;
default:
if (c == x)
return 1;
@@ -110,12 +110,11 @@ escape(Lexer *lx, int x, int *flag, vlong *val)
switch(c) {
case 'x':
- i = 2;
- *flag = 1;
+ i = islong ? 4 : 2;
goto hex;
case 'u':
- i = 4;
+ i = islong ? 8 : 4;
u = 1;
goto hex;
@@ -126,17 +125,17 @@ escape(Lexer *lx, int x, int *flag, vlong *val)
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
- *flag = 1;
+ i = islong ? 4 : 2;
goto oct;
- case 'a': c = '\a'; break;
- case 'b': c = '\b'; break;
- case 'f': c = '\f'; break;
- case 'n': c = '\n'; break;
- case 'r': c = '\r'; break;
- case 't': c = '\t'; break;
- case 'v': c = '\v'; break;
- case '\\': c = '\\'; break;
+ case 'a': c = '\a'; break;
+ case 'b': c = '\b'; break;
+ case 'f': c = '\f'; break;
+ case 'n': c = '\n'; break;
+ case 'r': c = '\r'; break;
+ case 't': c = '\t'; break;
+ case 'v': c = '\v'; break;
+ case '\\':c = '\\'; break;
default:
if(c != x) errorat(lx->pos, "unknown escape sequence: %c", c);
@@ -160,7 +159,6 @@ hex:
l = l*16 + c-'A' + 10;
continue;
}
- errorat(lx->pos, "non-hex character in escape sequence: %c", c);
ungetbyte(lx);
break;
}
@@ -169,22 +167,25 @@ hex:
l = RuneErr;
}
*val = l;
+ if (esc)
+ *val |= RuneMask + 1;
return 0;
oct:
l = c - '0';
- for (i = 2; i > 0; i--) {
+ for (; i > 0; i--) {
c = getbyte(lx);
if (c >= '0' && c <= '7') {
l = l*8 + c-'0';
continue;
}
- errorat(lx->pos, "non-octal character in escape sequence: %c", c);
ungetbyte(lx);
}
if (l > 255) errorat(lx->pos, "octal escape value > 255: %d", l);
*val = l;
+ if (esc)
+ *val |= RuneMask + 1;
return 0;
}
@@ -234,7 +235,7 @@ Token
lex(Lexer *lx)
{
int b, n, f;
- vlong v;
+ vlong v, _;
uint u;
rune r;
string s;
@@ -249,10 +250,28 @@ GetByte:
Dispatch:
tok.pos.beg = lx->pos;
- if (b >= RuneSelf || isalpha(b) || b == '_')
- goto TAlpha;
+ if (b >= RuneSelf || b == '_')
+ goto Talpha;
+ if (isalpha(b)) {
+ goto Talpha;
+
+ n = b;
+ b = getbyte(lx);
+ if (b == '\'') {
+ if (escapechar(lx, '\'', 1, 0, &v))
+ b = '\'';
+ if (!escapechar(lx, '\'', 1, 0, &_)) {
+ errorat(lx->pos, "missing ' at end of character constant");
+ }
+ tok.kind = Alit | Vrune;
+ tok.val.r = v;
+ goto Return;
+ }
+ ungetbyte(lx);
+ b = n;
+ }
if (isdigit(b))
- goto TNum;
+ goto Tnum;
switch (b) {
case '\n':
@@ -263,12 +282,13 @@ Dispatch:
lx->pos.line++;
goto Dispatch;
+ Tchar:
case '\'':
- if (escape(lx, '\'', &f, &v)) {
+ if (escapechar(lx, '\'', 0, 0, &v)) {
errorat(lx->pos, "empty literal or escaped ' in char literal");
v = '\'';
}
- if (!escape(lx, '\'', &f, &v)) {
+ if (!escapechar(lx, '\'', 0, 0, &_)) {
errorat(lx->pos, "missing '");
ungetbyte(lx);
}
@@ -279,15 +299,15 @@ Dispatch:
}
tok.kind = Alit | Vchar;
tok.val.c = v;
- break;
+ break;
case '"':
s = str·makecap("", 0, 8);
for (;;) {
- if (escape(lx, '"', &f, &v))
+ if (escapechar(lx, '"', 0, 1, &v))
break;
- if (v < RuneSelf || f)
+ if (v & (RuneMask + 1))
str·appendbyte(&s, v);
else {
r = v;
@@ -301,7 +321,7 @@ Dispatch:
intern(&tok.val.s);
str·free(s);
- break;
+ break;
case '.':
tok.kind = Adot;
@@ -309,7 +329,7 @@ Dispatch:
if (isdigit(b)) {
// *lx->b++ = b;
- goto TFlt;
+ goto Tflt;
} else if (b == '.') {
b = getbyte(lx);
if (b != '.') {
@@ -319,7 +339,7 @@ Dispatch:
}
}
ungetbyte(lx);
- break;
+ break;
case '<':
tok.kind = Alt;
@@ -336,7 +356,7 @@ Dispatch:
tok.kind = Alteq;
else
ungetbyte(lx);
- break;
+ break;
case '>':
tok.kind = Agt;
@@ -353,7 +373,7 @@ Dispatch:
tok.kind = Agteq;
else
ungetbyte(lx);
- break;
+ break;
case '/':
tok.kind = Adiv;
@@ -385,7 +405,7 @@ Dispatch:
goto Dispatch;
} else
ungetbyte(lx);
- break;
+ break;
case '#':
if (domacro(lx)) {
@@ -394,7 +414,6 @@ Dispatch:
return tok;
}
goto GetByte;
- break;
case EOF:
popio(lx);
@@ -432,7 +451,7 @@ Dispatch:
goto Return;
- TNum:
+ Tnum:
e = lx->buf + arrlen(lx->buf);
do {
if (lx->b >= e) {
@@ -443,12 +462,11 @@ Dispatch:
} while (b = getbyte(lx), isdigit(b) || b == '_');
if (b == '.' || tolower(b) == 'e')
- goto TFlt;
- TInt:
+ goto Tflt;
+ Tint:
r = b;
n = 10;
s = lx->buf;
- ungetbyte(lx);
if (*s == '0') {
b = *++s;
switch (b) {
@@ -485,18 +503,39 @@ Dispatch:
v = v * n + f;
}
+
b = r;
- tok.kind = Alit | Vint;
+ tok.kind = Alit;
tok.val.i = v;
- /* TODO: Suffixes!
- if (tolower(b) == 'u') {
- tok.kind |= Vusgn;
+
+ if (b == 'u' || b == 'U') {
+ tok.kind |= Vun;
b = getbyte(lx);
}
- */
- goto Return;
+ if (b == 'l' || b == 'L') {
+ r = getbyte(lx);
+ if (r == 'l' || r == 'L') {
+ if (r != b)
+ errorat(lx->pos, "mismatched case on long long integer suffix");
+ tok.kind |= Vvlong;
+ r = getbyte(lx);
+ } else
+ tok.kind |= Vlong;
+
+ if (r == 'u' || r == 'U') {
+ if (tok.kind & Vun)
+ errorat(lx->pos, "multiple unsigned designators on integer suffix");
+ tok.kind |= Vun;
+ goto Return;
+ }
+ ungetbyte(lx);
+ } else
+ tok.kind |= Vint;
+
+ ungetbyte(lx);
+ goto Return;
- TFlt:
+ Tflt:
if (b == '.') {
*lx->b++ = b;
b = getbyte(lx);
@@ -530,9 +569,9 @@ Dispatch:
tok.kind = Alit | Vfloat;
tok.val.f = d;
- goto Return;
+ goto Return;
- TAlpha:
+ Talpha:
u = b;
s = lx->buf;
e = lx->buf + arrlen(lx->buf);
diff --git a/sys/cmd/cc/pp.c b/sys/cmd/cc/pp.c
index 30bb3d4..8de4792 100644
--- a/sys/cmd/cc/pp.c
+++ b/sys/cmd/cc/pp.c
@@ -62,7 +62,7 @@ defmacro(Lexer *lx, string name, string macro)
{
Sym *mac;
- printf("DEFINING MACRO %s, ON LINE %d\n", name, lx->pos.line);
+ printf("DEFINING MACRO %s = %s, ON LINE %d\n", name, macro+1, lx->pos.line);
mac = define(&lx->sym, name, Smacro);
mac->macro = macro;
@@ -108,12 +108,14 @@ opand(Lexer *lx)
case Alit:
switch (tok.kind & ~Vmask) {
- case Vint:
+ case Vint: case Vlong: case Vvlong:
return tok.val.i;
+ case Vint | Vun: case Vlong | Vun: case Vvlong | Vun:
+ return tok.val.ui;
case Vchar:
return tok.val.c;
default:
- errorat(lx->pos, "invalid literal of type '%d' in conditional macro", tok.kind & ~Vmask);
+ errorat(lx->pos, "invalid literal of type '%s' in conditional macro", tokens[tok.kind & ~Vmask]);
return 0;
}