From 1ef4974d3a84345c545b62ecc42c9f05bf630bf5 Mon Sep 17 00:00:00 2001 From: Nicholas Noll Date: Sun, 31 May 2020 14:53:26 -0700 Subject: structural regular expressions prototype --- sys/libsre/lex.c | 246 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ sys/libsre/sre.h | 93 +++++++++++++++++++++ 2 files changed, 339 insertions(+) create mode 100644 sys/libsre/lex.c create mode 100644 sys/libsre/sre.h (limited to 'sys') diff --git a/sys/libsre/lex.c b/sys/libsre/lex.c new file mode 100644 index 0000000..f4c6ac2 --- /dev/null +++ b/sys/libsre/lex.c @@ -0,0 +1,246 @@ +#include "sre.h" + +static +State * +state(Machine *m, int t) +{ + if (m->state >= m->statestk + arrlen(m->statestk)) + panicf("regexp vm: out of state space"); + + m->state->type = t; + m->state->l = nil; + m->state->r = nil; + + return m->state++; +} + +static +int +poptor(Parser *p) +{ + if (p->optor <= p->optorstk) + panicf("regexp parser: opand stack underflow"); + + return *--p->optor; +} + +static +void +pushtor(Parser *p, int t) +{ + if (p->optor >= arrend(p->optorstk)) + panicf("regexp parser: opand stack overflow"); + + *p->optor++ = t; +} + +static +void +pushand(Parser *p, State *beg, State *end) +{ + if (p->node >= arrend(p->nodestk)) + panicf("regexp parser: opand stack overflow"); + + p->node->beg = beg; + p->node->end = end; + + p->node++; +} + +static +Node * +popand(Parser *p) +{ + if (p->node <= p->nodestk) + panicf("regexp parser: opand stack underflow"); + + return --p->node; +} + +static +void +operateuntil(Parser *p, int prec) +{ + Node *o1, *o2, *t; + State *s1, *s2; + + while (prec == Trparen || p->optor[-1] >= prec) { + switch (poptor(p)) { + case Tor: + o1 = popand(p); + o2 = popand(p); + + s1 = state(p->mach, Tor); + s2 = state(p->mach, Tnop); + s1->l = o1->beg; + s1->r = o2->beg; + + o1->end->out = s2; + o2->end->out = s2; + + pushand(p, s1, s2); + break; + + case Tcat: + o1 = popand(p); + o2 = popand(p); + + o1->end->out = o2->beg; + pushand(p, o1->beg, o2->end); + break; + + case Tstar: + o1 = popand(p); + s1 = state(p->mach, Tor); + o1->end->out = s1; + s1->l = o1->beg; + pushand(p, s1, s1); + break; + + case Tplus: + o1 = popand(p); + s1 = state(p->mach, Tor); + o1->end->out = s1; + s1->l = o1->beg; + pushand(p, o1->beg, s1); + break; + + case Tqmark: + o1 = popand(p); + s1 = state(p->mach, Tor); + s2 = state(p->mach, Tnop); + s1->l = o1->beg; + s1->r = s2; + o1->end->out = s2; + pushand(p, s1, s2); + break; + + default: + panicf("unsupported regexp operator"); + } + } +} + +static +void +operator(Parser *p, int t) +{ + operateuntil(p, t); + pushtor(p, t); + p->wasopand = (t != Tstar && t != Tqmark && t != Tplus && t != Trparen); +} + +static +void +operand(Parser *p, int t) +{ + State *new; + if (p->wasopand) + operator(p, Tcat); + + new = state(p->mach, t); + pushand(p, new, new); + p->wasopand = true; +} + +#define cinc 20 +int +lex(Parser *p) +{ + int c, t; + byte *class; + long n, cap; + + c = *p->re++; + switch (c) { + case '\\': + if (*p->re) + if ((c = *p->re++) == '\n') + c = '\n'; + break; + case '\0': + c = Tend, + --p->re; + break; + case '*': + c = Tstar; + break; + case '?': + c = Tqmark; + break; + case '+': + c = Tplus; + break; + case '|': + c = Tor; + break; + case '.': + c = Tany; + break; + case '(': + c = Tlparen; + break; + case ')': + c = Trparen; + break; + case '^': + c = Tbol; + break; + case '$': + c = Teol; + break; + case '[': + goto charclass; + default: + ; + } + return c; + +charclass: + panicf("to implement"); +} + +#undef cinc + +static +State* +optimize(State *entry) +{ + State *curr, *next; + for (curr=entry; curr->type != Tend; curr++) { + next = curr->out; + while (next->type == Tnop) + next = next->out; + curr->out = next; + } + + return entry; +} + +void +sreĀ·compile(Machine *mach, byte *regexp) +{ + int tok; + Parser p; + Node *prog; + + p = (Parser) { + .re = regexp, + .mach = mach, + .node = p.nodestk, + }; + + pushtor(&p, Tstart - 1); + while ((tok = lex(&p)) != Tend) { + if ((tok & isoptor) == Toperator) + operator(&p, tok); + else + operand(&p, tok); + } + operateuntil(&p, Tstart); + operand(&p, Tend); + operateuntil(&p, Tstart); + + prog = popand(&p); + mach->entry = optimize(prog->beg); +} diff --git a/sys/libsre/sre.h b/sys/libsre/sre.h new file mode 100644 index 0000000..a7ace1a --- /dev/null +++ b/sys/libsre/sre.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include + +enum +{ + Toperator = RuneMask + 1, + Tstart = Toperator, + Trparen, + Tlparen, + Tor, + Tcat, + Tstar, + Tplus, + Tqmark, + + Tany = Toperator << 1, + Tnop, + Tbol, + Teol, + Tcclass, + Tnclass, + Tend, + + isoptor = Toperator, + isopand = Toperator << 1, +}; + +typedef struct Class Class; +typedef struct State State; +typedef struct Patch Patch; +typedef struct Node Node; + +typedef struct Parser Parser; +typedef struct Machine Machine; + +struct Class +{ + rune *end; + rune span[64]; +}; + +struct State +{ + int type; + union { + State *l; + }; + union { + State *r; + State *out; + }; +}; + +struct Patch +{ + State *s; + Patch *link; +}; + +struct Node +{ + State *beg; + State *end; +}; + +struct Parser +{ + Machine *mach; + byte *re; + int wasopand : 1; + int *optor, optorstk[1000]; + Node *node, nodestk[1000]; +}; + +struct Machine +{ + /* memory buffers */ + struct { + void *heap; + memĀ·Reallocator; + }; + State *state, statestk[1000]; + + struct { + int cap; + int len; + Class *c; + } class; + + State *entry; +}; -- cgit v1.2.1