aboutsummaryrefslogtreecommitdiff
path: root/src/libsre/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libsre/lex.c')
-rw-r--r--src/libsre/lex.c246
1 files changed, 246 insertions, 0 deletions
diff --git a/src/libsre/lex.c b/src/libsre/lex.c
new file mode 100644
index 0000000..f4c6ac2
--- /dev/null
+++ b/src/libsre/lex.c
@@ -0,0 +1,246 @@
+#include "sre.h"
+
+static
+State *
+state(Machine *m, int t)
+{
+ if (m->state >= m->statestk + arrlen(m->statestk))
+ panicf("regexp vm: out of state space");
+
+ m->state->type = t;
+ m->state->l = nil;
+ m->state->r = nil;
+
+ return m->state++;
+}
+
+static
+int
+poptor(Parser *p)
+{
+ if (p->optor <= p->optorstk)
+ panicf("regexp parser: opand stack underflow");
+
+ return *--p->optor;
+}
+
+static
+void
+pushtor(Parser *p, int t)
+{
+ if (p->optor >= arrend(p->optorstk))
+ panicf("regexp parser: opand stack overflow");
+
+ *p->optor++ = t;
+}
+
+static
+void
+pushand(Parser *p, State *beg, State *end)
+{
+ if (p->node >= arrend(p->nodestk))
+ panicf("regexp parser: opand stack overflow");
+
+ p->node->beg = beg;
+ p->node->end = end;
+
+ p->node++;
+}
+
+static
+Node *
+popand(Parser *p)
+{
+ if (p->node <= p->nodestk)
+ panicf("regexp parser: opand stack underflow");
+
+ return --p->node;
+}
+
+static
+void
+operateuntil(Parser *p, int prec)
+{
+ Node *o1, *o2, *t;
+ State *s1, *s2;
+
+ while (prec == Trparen || p->optor[-1] >= prec) {
+ switch (poptor(p)) {
+ case Tor:
+ o1 = popand(p);
+ o2 = popand(p);
+
+ s1 = state(p->mach, Tor);
+ s2 = state(p->mach, Tnop);
+ s1->l = o1->beg;
+ s1->r = o2->beg;
+
+ o1->end->out = s2;
+ o2->end->out = s2;
+
+ pushand(p, s1, s2);
+ break;
+
+ case Tcat:
+ o1 = popand(p);
+ o2 = popand(p);
+
+ o1->end->out = o2->beg;
+ pushand(p, o1->beg, o2->end);
+ break;
+
+ case Tstar:
+ o1 = popand(p);
+ s1 = state(p->mach, Tor);
+ o1->end->out = s1;
+ s1->l = o1->beg;
+ pushand(p, s1, s1);
+ break;
+
+ case Tplus:
+ o1 = popand(p);
+ s1 = state(p->mach, Tor);
+ o1->end->out = s1;
+ s1->l = o1->beg;
+ pushand(p, o1->beg, s1);
+ break;
+
+ case Tqmark:
+ o1 = popand(p);
+ s1 = state(p->mach, Tor);
+ s2 = state(p->mach, Tnop);
+ s1->l = o1->beg;
+ s1->r = s2;
+ o1->end->out = s2;
+ pushand(p, s1, s2);
+ break;
+
+ default:
+ panicf("unsupported regexp operator");
+ }
+ }
+}
+
+static
+void
+operator(Parser *p, int t)
+{
+ operateuntil(p, t);
+ pushtor(p, t);
+ p->wasopand = (t != Tstar && t != Tqmark && t != Tplus && t != Trparen);
+}
+
+static
+void
+operand(Parser *p, int t)
+{
+ State *new;
+ if (p->wasopand)
+ operator(p, Tcat);
+
+ new = state(p->mach, t);
+ pushand(p, new, new);
+ p->wasopand = true;
+}
+
+#define cinc 20
+int
+lex(Parser *p)
+{
+ int c, t;
+ byte *class;
+ long n, cap;
+
+ c = *p->re++;
+ switch (c) {
+ case '\\':
+ if (*p->re)
+ if ((c = *p->re++) == '\n')
+ c = '\n';
+ break;
+ case '\0':
+ c = Tend,
+ --p->re;
+ break;
+ case '*':
+ c = Tstar;
+ break;
+ case '?':
+ c = Tqmark;
+ break;
+ case '+':
+ c = Tplus;
+ break;
+ case '|':
+ c = Tor;
+ break;
+ case '.':
+ c = Tany;
+ break;
+ case '(':
+ c = Tlparen;
+ break;
+ case ')':
+ c = Trparen;
+ break;
+ case '^':
+ c = Tbol;
+ break;
+ case '$':
+ c = Teol;
+ break;
+ case '[':
+ goto charclass;
+ default:
+ ;
+ }
+ return c;
+
+charclass:
+ panicf("to implement");
+}
+
+#undef cinc
+
+static
+State*
+optimize(State *entry)
+{
+ State *curr, *next;
+ for (curr=entry; curr->type != Tend; curr++) {
+ next = curr->out;
+ while (next->type == Tnop)
+ next = next->out;
+ curr->out = next;
+ }
+
+ return entry;
+}
+
+void
+sre·compile(Machine *mach, byte *regexp)
+{
+ int tok;
+ Parser p;
+ Node *prog;
+
+ p = (Parser) {
+ .re = regexp,
+ .mach = mach,
+ .node = p.nodestk,
+ };
+
+ pushtor(&p, Tstart - 1);
+ while ((tok = lex(&p)) != Tend) {
+ if ((tok & isoptor) == Toperator)
+ operator(&p, tok);
+ else
+ operand(&p, tok);
+ }
+ operateuntil(&p, Tstart);
+ operand(&p, Tend);
+ operateuntil(&p, Tstart);
+
+ prog = popand(&p);
+ mach->entry = optimize(prog->beg);
+}