about summary refs log tree commit diff
path: root/sys/cmd/rc/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/cmd/rc/lex.c')
-rw-r--r-- sys/cmd/rc/lex.c 688
1 files changed, 321 insertions, 367 deletions
diff --git a/sys/cmd/rc/lex.c b/sys/cmd/rc/lex.c
index f6e2b4e..253d05a 100644
--- a/sys/cmd/rc/lex.c
+++ b/sys/cmd/rc/lex.c
@@ -1,417 +1,371 @@
#include "rc.h"
+#include "exec.h"
+#include "io.h"
+#include "getflags.h"
+#include "fns.h"
-#define onebyte(c) ((c&0x80)==0x00)
-#define twobyte(c) ((c&0xe0)==0xc0)
-#define threebyte(c) ((c&0xf0)==0xe0)
-#define fourbyte(c) ((c&0xf8)==0xf0)
+char *promptstr;
+int doprompt;
+char tok[NTOK];
-// -----------------------------------------------------------------------
-// globals
-
-static int lastc, nextc=EOF, lastdol, lastword, doprompt = 1;
-static char buf[8*1024];
-
-// -----------------------------------------------------------------------
-// utilities
-
-static uchar nwordc[256] =
-{
- 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
- 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
+int getnext(void);
int
wordchr(int c)
{
- return !nwordc[c] && c!=EOF;
+ return !strchr("\n \t#;&|^$=`'{}()<>", c) && c!=EOF;
}
-
-static uchar nquotec[256] =
-{
- 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
int
-quotechr(char c)
+idchr(int c)
{
- return !nquotec[c] && c!=EOF;
+ /*
+ * Formerly:
+ * return 'a'<=c && c<='z' || 'A'<=c && c<='Z' || '0'<=c && c<='9'
+ * || c=='_' || c=='*';
+ */
+ return c>' ' && !strchr("!\"#$%&'()+,-./:;<=>?@[\\]^`{|}~", c);
}
-
-static uchar nvarc[256] =
-{
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
- 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
- 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
+int future = EOF;
+int doprompt = 1;
+int inquote;
+int incomm;
+/*
+ * Look ahead in the input stream
+ */
int
-varchr(char c)
+nextc(void)
{
- return !nvarc[c] && c!=EOF;
+ if(future==EOF)
+ future = getnext();
+ return future;
}
+/*
+ * Consume the lookahead character.
+ */
-static
-void
-prompt(void)
-{
- shell->cmd.line++;
- doprompt = 0;
-}
-
-/* lookahead one byte */
-static
int
-lookahead(void)
+advance(void)
{
- int c;
-
- if(nextc != EOF)
- return nextc;
- if(shell->cmd.eof)
- return EOF;
-
- if(doprompt)
- prompt();
-
- c = rchr(shell->cmd.io);
- doprompt = c == '\n' || c == EOF;
-
- if(c == EOF)
- shell->cmd.eof++;
-
- return nextc = c;
+ int c = nextc();
+ lastc = future;
+ future = EOF;
+ return c;
}
+/*
+ * read a character from the input stream
+ */
-/* consumes the lookahead */
-static
int
-advance(void)
+getnext(void)
{
- int c = lookahead();
- lastc = nextc, nextc = EOF;
-
- return c;
+ int c;
+ static int peekc = EOF;
+ if(peekc!=EOF){
+ c = peekc;
+ peekc = EOF;
+ return c;
+ }
+ if(runq->eof)
+ return EOF;
+ if(doprompt)
+ pprompt();
+ c = rchr(runq->cmdfd);
+ if(!inquote && c=='\\'){
+ c = rchr(runq->cmdfd);
+ if(c=='\n' && !incomm){ /* don't continue a comment */
+ doprompt = 1;
+ c=' ';
+ }
+ else{
+ peekc = c;
+ c='\\';
+ }
+ }
+ doprompt = doprompt || c=='\n' || c==EOF;
+ if(c==EOF)
+ runq->eof++;
+ else if(flag['V'] || ndot>=2 && flag['v']) pchr(err, c);
+ return c;
}
-/*
- * advance until we no longer hit horizontal space
- * consumes all comments
- */
-static
void
-skipws(void)
+skipwhite(void)
{
- int c;
- for(;;) {
- c = lookahead();
- if(c=='#'){
- for(;;){
- c = lookahead();
- if(c=='\n' || c==EOF)
- break;
- advance();
- }
- }
- if(c==' ' || c=='\t')
- advance();
- else
- return;
- }
+ int c;
+ for(;;){
+ c = nextc();
+ /* Why did this used to be if(!inquote && c=='#') ?? */
+ if(c=='#'){
+ incomm = 1;
+ for(;;){
+ c = nextc();
+ if(c=='\n' || c==EOF) {
+ incomm = 0;
+ break;
+ }
+ advance();
+ }
+ }
+ if(c==' ' || c=='\t')
+ advance();
+ else return;
+ }
}
-/* advance until we no longer hit any space */
void
skipnl(void)
{
- int c;
- for(;;) {
- skipws();
- if ((c = lookahead()) != '\n')
- return;
- advance();
- }
+ int c;
+ for(;;){
+ skipwhite();
+ c = nextc();
+ if(c!='\n')
+ return;
+ advance();
+ }
}
-/* advance if next char is equal to c */
-static
int
nextis(int c)
{
- if(lookahead()==c) {
- advance();
- return 1;
- }
- return 0;
+ if(nextc()==c){
+ advance();
+ return 1;
+ }
+ return 0;
}
-/* functions to append to our write buffer */
-static
char*
-putbyte(char *s, int c)
+addtok(char *p, int val)
{
- if(!s)
- return s;
- if(s == arrend(buf)){
- *s = 0;
- rcerror("out of buffer space");
- return nil;
- }
- *s++ = c;
- return s;
+ if(p==0)
+ return 0;
+ if(p==&tok[NTOK-1]){
+ *p = 0;
+ yyerror("token buffer too short");
+ return 0;
+ }
+ *p++=val;
+ return p;
}
-static
char*
-putrune(char *s, int c)
+addutf(char *p, int c)
{
- s = putbyte(s, c);
- if (onebyte(c))
- return s;
- if (twobyte(c))
- return putbyte(s, advance());
- if (threebyte(c)) {
- putbyte(s, advance());
- return putbyte(s, advance());
- }
- if (fourbyte(c)) {
- putbyte(s, advance());
- putbyte(s, advance());
- return putbyte(s, advance());
- }
- rcerror("malformed utf8 stream");
- return nil;
+ p = addtok(p, c);
+ if(twobyte(c)) /* 2-byte escape */
+ return addtok(p, advance());
+ if(threebyte(c)){ /* 3-byte escape */
+ p = addtok(p, advance());
+ return addtok(p, advance());
+ }
+ if(fourbyte(c)){ /* 4-byte escape */
+ p = addtok(p, advance());
+ p = addtok(p, advance());
+ return addtok(p, advance());
+ }
+ return p;
}
+int lastdol; /* was the last token read '$' or '$#' or '"'? */
+int lastword; /* was the last token read a word or compound word terminator? */
-// -----------------------------------------------------------------------
-// main exports
-
-void
-rcerror(char *fmt, ...)
-{
- va_list args;
-
- pfmt(errio, "rc:");
- if (shell->cmd.io)
- pfmt(errio, "%s:%d ", shell->cmd.name, shell->cmd.line);
-
- va_start(args, fmt);
- vpfmt(errio, fmt, args);
- va_end(args);
-
- pfmt(errio, "\n");
-
- flush(&errio);
- lastword = lastdol = 0;
- while (lastc != '\n' && lastc != EOF)
- advance();
- /* for debugging only */
- abort();
-}
-
-/* word is only modified in the event of a lexed word */
int
-lex(Tree **node)
+yylex(void)
{
- int c;
- char *w = buf;
- /*
- * NOTE:
- * we inject tokens into the lexer based on context if last token = word:
- * if we see a (, then we interpret that as a subscript
- * otherwise, if the next character is the first char of a word, we return a ^ operator.
- */
- if(lastword){
- lastword=0;
- c = lookahead();
- if(c=='('){
- advance();
- return Tlparen;
- }
- if(quotechr(c))
- return Tcarot;
- }
-
- skipws();
- switch(c=advance()) {
- case EOF:
- lastdol = 0;
- return EOF;
- case '$':
- lastdol = 1;
- if(nextis('#'))
- return Tcount;
- if (nextis('"'))
- return Tquote;
- return Tdol;
- case '&':
- lastdol = 0;
- if(nextis('&'))
- return Tandand;
- return Tand;
-
- case '!':
- return Tbang;
- case '@':
- return Tsubshell;
- case '~':
- return Ttwiddle;
-
- case '|':
- lastdol = 0;
- if(nextis('|')){
- skipnl();
- return Toror;
- }
- (*node) = newtree();
- (*node)->type = Tpipe;
- (*node)->redir.fd[0] = 0;
- (*node)->redir.fd[1] = 1;
- goto redir;
- case '>':
- (*node) = newtree();
- (*node)->type = Tredir;
- if (nextis(c))
- (*node)->redir.type = Rappend;
- else
- (*node)->redir.type = Rwrite;
- (*node)->redir.fd[0] = 1;
- goto redir;
- case '<':
- (*node) = newtree();
- (*node)->type = Tredir;
- if(nextis(c))
- (*node)->redir.type = Rhere;
- else if(nextis('>'))
- (*node)->redir.type = Rrdwr;
- else
- (*node)->redir.type = Rread;
- (*node)->redir.fd[0] = 0;
- /* fallthrough */
- redir:
- if(nextis('[')) {
- c = advance();
- if(c < '0' || '9' < c) {
- redirerr:
- rcerror("incorrect redirection syntax");
- return EOF;
- }
- (*node)->redir.fd[0] = 0;
- do {
- (*node)->redir.fd[0] = 10*(*node)->redir.fd[0]+(c-'0');
- c = advance();
- } while('0'<=c && c<='9');
-
- if(c == '=') {
- if((*node)->type == Tredir)
- (*node)->type = Tdup;
- c = advance();
- if('0'<=c && c<='9') {
- (*node)->redir.type = Rdupfd;
- (*node)->redir.fd[1] = (*node)->redir.fd[0];
- (*node)->redir.fd[0] = 0;
- do {
- (*node)->redir.fd[0] = 10*(*node)->redir.fd[0]+(c-'0');
- c = advance();
- } while('0'<=c && c<='9');
- } else {
- if((*node)->type == Tpipe)
- goto redirerr;
- (*node)->redir.type = Rclose;
- }
- }
- if (c != ']'
- ||(*node)->type==Tdup && ((*node)->redir.type==Rhere || (*node)->redir.type==Rappend))
- goto redirerr;
- }
- if ((c = ((*node)->type)) == Tpipe)
- skipnl();
- return c;
-
- case '\'':
- lastdol = 0;
- lastword = 1;
- for(;;){
- c = advance();
- if(c==EOF)
- break;
- if(c=='\''){
- if(lookahead()!='\'')
- break;
- advance();
- }
- w = putrune(w, c);
- }
- *w = 0;
- *node = wordnode(buf);
- (*node)->quoted = 1;
- return Tword;
- }
- if (!wordchr(c)) {
- lastdol = 0;
- return c;
- }
- for(;;){
- if(c=='*'||c=='['||c=='?'||c==GLOB)
- w = putbyte(w, GLOB);
- w = putrune(w, c);
- c = lookahead();
- if(lastdol?!varchr(c):!wordchr(c))
- break;
- advance();
- }
- *w = 0;
-
- if ((c = kwlookup(buf)) == -1) {
- (*node) = wordnode(buf);
- (*node)->type = c = Tword;
- (*node)->quoted = 0;
- lastword = 1;
- }
-
- lastdol = 0;
- return c;
+ int c, d = nextc();
+ char *w = tok;
+ struct tree *t;
+ yylval.tree = 0;
+ /*
+ * Embarrassing sneakiness: if the last token read was a quoted or unquoted
+ * WORD then we alter the meaning of what follows. If the next character
+ * is `(', we return SUB (a subscript paren) and consume the `('. Otherwise,
+ * if the next character is the first character of a simple or compound word,
+ * we insert a `^' before it.
+ */
+ if(lastword){
+ lastword = 0;
+ if(d=='('){
+ advance();
+ strcpy(tok, "( [SUB]");
+ return SUB;
+ }
+ if(wordchr(d) || d=='\'' || d=='`' || d=='$' || d=='"'){
+ strcpy(tok, "^");
+ return '^';
+ }
+ }
+ inquote = 0;
+ skipwhite();
+ switch(c = advance()){
+ case EOF:
+ lastdol = 0;
+ strcpy(tok, "EOF");
+ return EOF;
+ case '$':
+ lastdol = 1;
+ if(nextis('#')){
+ strcpy(tok, "$#");
+ return COUNT;
+ }
+ if(nextis('"')){
+ strcpy(tok, "$\"");
+ return '"';
+ }
+ strcpy(tok, "$");
+ return '$';
+ case '&':
+ lastdol = 0;
+ if(nextis('&')){
+ skipnl();
+ strcpy(tok, "&&");
+ return ANDAND;
+ }
+ strcpy(tok, "&");
+ return '&';
+ case '|':
+ lastdol = 0;
+ if(nextis(c)){
+ skipnl();
+ strcpy(tok, "||");
+ return OROR;
+ }
+ case '<':
+ case '>':
+ lastdol = 0;
+ /*
+ * funny redirection tokens:
+ * redir: arrow | arrow '[' fd ']'
+ * arrow: '<' | '<<' | '>' | '>>' | '|'
+ * fd: digit | digit '=' | digit '=' digit
+ * digit: '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7'|'8'|'9'
+ * some possibilities are nonsensical and get a message.
+ */
+ *w++=c;
+ t = newtree();
+ switch(c){
+ case '|':
+ t->type = PIPE;
+ t->fd0 = 1;
+ t->fd1 = 0;
+ break;
+ case '>':
+ t->type = REDIR;
+ if(nextis(c)){
+ t->rtype = APPEND;
+ *w++=c;
+ }
+ else t->rtype = WRITE;
+ t->fd0 = 1;
+ break;
+ case '<':
+ t->type = REDIR;
+ if(nextis(c)){
+ t->rtype = HERE;
+ *w++=c;
+ } else if (nextis('>')){
+ t->rtype = RDWR;
+ *w++=c;
+ } else t->rtype = READ;
+ t->fd0 = 0;
+ break;
+ }
+ if(nextis('[')){
+ *w++='[';
+ c = advance();
+ *w++=c;
+ if(c<'0' || '9'<c){
+ RedirErr:
+ *w = 0;
+ yyerror(t->type==PIPE?"pipe syntax"
+ :"redirection syntax");
+ return EOF;
+ }
+ t->fd0 = 0;
+ do{
+ t->fd0 = t->fd0*10+c-'0';
+ *w++=c;
+ c = advance();
+ }while('0'<=c && c<='9');
+ if(c=='='){
+ *w++='=';
+ if(t->type==REDIR)
+ t->type = DUP;
+ c = advance();
+ if('0'<=c && c<='9'){
+ t->rtype = DUPFD;
+ t->fd1 = t->fd0;
+ t->fd0 = 0;
+ do{
+ t->fd0 = t->fd0*10+c-'0';
+ *w++=c;
+ c = advance();
+ }while('0'<=c && c<='9');
+ }
+ else{
+ if(t->type==PIPE)
+ goto RedirErr;
+ t->rtype = CLOSE;
+ }
+ }
+ if(c!=']'
+ || t->type==DUP && (t->rtype==HERE || t->rtype==APPEND))
+ goto RedirErr;
+ *w++=']';
+ }
+ *w='\0';
+ yylval.tree = t;
+ if(t->type==PIPE)
+ skipnl();
+ return t->type;
+ case '\'':
+ lastdol = 0;
+ lastword = 1;
+ inquote = 1;
+ for(;;){
+ c = advance();
+ if(c==EOF)
+ break;
+ if(c=='\''){
+ if(nextc()!='\'')
+ break;
+ advance();
+ }
+ w = addutf(w, c);
+ }
+ if(w!=0)
+ *w='\0';
+ t = token(tok, WORD);
+ t->quoted = 1;
+ yylval.tree = t;
+ return t->type;
+ }
+ if(!wordchr(c)){
+ lastdol = 0;
+ tok[0] = c;
+ tok[1]='\0';
+ return c;
+ }
+ for(;;){
+ /* next line should have (char)c==GLOB, but ken's compiler is broken */
+ if(c=='*' || c=='[' || c=='?' || c==(unsigned char)GLOB)
+ w = addtok(w, GLOB);
+ w = addutf(w, c);
+ c = nextc();
+ if(lastdol?!idchr(c):!wordchr(c)) break;
+ advance();
+ }
+
+ lastword = 1;
+ lastdol = 0;
+ if(w!=0)
+ *w='\0';
+ t = klook(tok);
+ if(t->type!=WORD)
+ lastword = 0;
+ t->quoted = 0;
+ yylval.tree = t;
+ return t->type;
}