From ce05175372a9ddca1a225db0765ace1127a39293 Mon Sep 17 00:00:00 2001 From: Nicholas Date: Fri, 12 Nov 2021 09:22:01 -0800 Subject: chore: simplified organizational structure --- src/libsre/lex.c | 246 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 src/libsre/lex.c (limited to 'src/libsre/lex.c') diff --git a/src/libsre/lex.c b/src/libsre/lex.c new file mode 100644 index 0000000..f4c6ac2 --- /dev/null +++ b/src/libsre/lex.c @@ -0,0 +1,246 @@ +#include "sre.h" + +static +State * +state(Machine *m, int t) +{ + if (m->state >= m->statestk + arrlen(m->statestk)) + panicf("regexp vm: out of state space"); + + m->state->type = t; + m->state->l = nil; + m->state->r = nil; + + return m->state++; +} + +static +int +poptor(Parser *p) +{ + if (p->optor <= p->optorstk) + panicf("regexp parser: opand stack underflow"); + + return *--p->optor; +} + +static +void +pushtor(Parser *p, int t) +{ + if (p->optor >= arrend(p->optorstk)) + panicf("regexp parser: opand stack overflow"); + + *p->optor++ = t; +} + +static +void +pushand(Parser *p, State *beg, State *end) +{ + if (p->node >= arrend(p->nodestk)) + panicf("regexp parser: opand stack overflow"); + + p->node->beg = beg; + p->node->end = end; + + p->node++; +} + +static +Node * +popand(Parser *p) +{ + if (p->node <= p->nodestk) + panicf("regexp parser: opand stack underflow"); + + return --p->node; +} + +static +void +operateuntil(Parser *p, int prec) +{ + Node *o1, *o2, *t; + State *s1, *s2; + + while (prec == Trparen || p->optor[-1] >= prec) { + switch (poptor(p)) { + case Tor: + o1 = popand(p); + o2 = popand(p); + + s1 = state(p->mach, Tor); + s2 = state(p->mach, Tnop); + s1->l = o1->beg; + s1->r = o2->beg; + + o1->end->out = s2; + o2->end->out = s2; + + pushand(p, s1, s2); + break; + + case Tcat: + o1 = popand(p); + o2 = popand(p); + + o1->end->out = o2->beg; + pushand(p, o1->beg, o2->end); + break; + + case Tstar: + o1 = popand(p); + s1 = state(p->mach, Tor); + o1->end->out = s1; + s1->l = o1->beg; + pushand(p, s1, s1); + break; + + case Tplus: + o1 = popand(p); + s1 = state(p->mach, Tor); + o1->end->out = s1; + s1->l = o1->beg; + pushand(p, o1->beg, s1); + break; + + case Tqmark: + o1 = popand(p); + s1 = state(p->mach, Tor); + s2 = state(p->mach, Tnop); + s1->l = o1->beg; + s1->r = s2; + o1->end->out = s2; + pushand(p, s1, s2); + break; + + default: + panicf("unsupported regexp operator"); + } + } +} + +static +void +operator(Parser *p, int t) +{ + operateuntil(p, t); + pushtor(p, t); + p->wasopand = (t != Tstar && t != Tqmark && t != Tplus && t != Trparen); +} + +static +void +operand(Parser *p, int t) +{ + State *new; + if (p->wasopand) + operator(p, Tcat); + + new = state(p->mach, t); + pushand(p, new, new); + p->wasopand = true; +} + +#define cinc 20 +int +lex(Parser *p) +{ + int c, t; + byte *class; + long n, cap; + + c = *p->re++; + switch (c) { + case '\\': + if (*p->re) + if ((c = *p->re++) == '\n') + c = '\n'; + break; + case '\0': + c = Tend, + --p->re; + break; + case '*': + c = Tstar; + break; + case '?': + c = Tqmark; + break; + case '+': + c = Tplus; + break; + case '|': + c = Tor; + break; + case '.': + c = Tany; + break; + case '(': + c = Tlparen; + break; + case ')': + c = Trparen; + break; + case '^': + c = Tbol; + break; + case '$': + c = Teol; + break; + case '[': + goto charclass; + default: + ; + } + return c; + +charclass: + panicf("to implement"); +} + +#undef cinc + +static +State* +optimize(State *entry) +{ + State *curr, *next; + for (curr=entry; curr->type != Tend; curr++) { + next = curr->out; + while (next->type == Tnop) + next = next->out; + curr->out = next; + } + + return entry; +} + +void +sre·compile(Machine *mach, byte *regexp) +{ + int tok; + Parser p; + Node *prog; + + p = (Parser) { + .re = regexp, + .mach = mach, + .node = p.nodestk, + }; + + pushtor(&p, Tstart - 1); + while ((tok = lex(&p)) != Tend) { + if ((tok & isoptor) == Toperator) + operator(&p, tok); + else + operand(&p, tok); + } + operateuntil(&p, Tstart); + operand(&p, Tend); + operateuntil(&p, Tstart); + + prog = popand(&p); + mach->entry = optimize(prog->beg); +} -- cgit v1.2.1