aboutsummaryrefslogtreecommitdiff
path: root/src/libsre
diff options
context:
space:
mode:
Diffstat (limited to 'src/libsre')
-rw-r--r--src/libsre/lex.c246
-rw-r--r--src/libsre/sre.h93
2 files changed, 339 insertions, 0 deletions
diff --git a/src/libsre/lex.c b/src/libsre/lex.c
new file mode 100644
index 0000000..f4c6ac2
--- /dev/null
+++ b/src/libsre/lex.c
@@ -0,0 +1,246 @@
+#include "sre.h"
+
+static
+State *
+state(Machine *m, int t)
+{
+ if (m->state >= m->statestk + arrlen(m->statestk))
+ panicf("regexp vm: out of state space");
+
+ m->state->type = t;
+ m->state->l = nil;
+ m->state->r = nil;
+
+ return m->state++;
+}
+
+static
+int
+poptor(Parser *p)
+{
+ if (p->optor <= p->optorstk)
+ panicf("regexp parser: opand stack underflow");
+
+ return *--p->optor;
+}
+
+static
+void
+pushtor(Parser *p, int t)
+{
+ if (p->optor >= arrend(p->optorstk))
+ panicf("regexp parser: opand stack overflow");
+
+ *p->optor++ = t;
+}
+
+static
+void
+pushand(Parser *p, State *beg, State *end)
+{
+ if (p->node >= arrend(p->nodestk))
+ panicf("regexp parser: opand stack overflow");
+
+ p->node->beg = beg;
+ p->node->end = end;
+
+ p->node++;
+}
+
+static
+Node *
+popand(Parser *p)
+{
+ if (p->node <= p->nodestk)
+ panicf("regexp parser: opand stack underflow");
+
+ return --p->node;
+}
+
+static
+void
+operateuntil(Parser *p, int prec)
+{
+ Node *o1, *o2, *t;
+ State *s1, *s2;
+
+ while (prec == Trparen || p->optor[-1] >= prec) {
+ switch (poptor(p)) {
+ case Tor:
+ o1 = popand(p);
+ o2 = popand(p);
+
+ s1 = state(p->mach, Tor);
+ s2 = state(p->mach, Tnop);
+ s1->l = o1->beg;
+ s1->r = o2->beg;
+
+ o1->end->out = s2;
+ o2->end->out = s2;
+
+ pushand(p, s1, s2);
+ break;
+
+ case Tcat:
+ o1 = popand(p);
+ o2 = popand(p);
+
+ o1->end->out = o2->beg;
+ pushand(p, o1->beg, o2->end);
+ break;
+
+ case Tstar:
+ o1 = popand(p);
+ s1 = state(p->mach, Tor);
+ o1->end->out = s1;
+ s1->l = o1->beg;
+ pushand(p, s1, s1);
+ break;
+
+ case Tplus:
+ o1 = popand(p);
+ s1 = state(p->mach, Tor);
+ o1->end->out = s1;
+ s1->l = o1->beg;
+ pushand(p, o1->beg, s1);
+ break;
+
+ case Tqmark:
+ o1 = popand(p);
+ s1 = state(p->mach, Tor);
+ s2 = state(p->mach, Tnop);
+ s1->l = o1->beg;
+ s1->r = s2;
+ o1->end->out = s2;
+ pushand(p, s1, s2);
+ break;
+
+ default:
+ panicf("unsupported regexp operator");
+ }
+ }
+}
+
+static
+void
+operator(Parser *p, int t)
+{
+ operateuntil(p, t);
+ pushtor(p, t);
+ p->wasopand = (t != Tstar && t != Tqmark && t != Tplus && t != Trparen);
+}
+
+static
+void
+operand(Parser *p, int t)
+{
+ State *new;
+ if (p->wasopand)
+ operator(p, Tcat);
+
+ new = state(p->mach, t);
+ pushand(p, new, new);
+ p->wasopand = true;
+}
+
+#define cinc 20
+int
+lex(Parser *p)
+{
+ int c, t;
+ byte *class;
+ long n, cap;
+
+ c = *p->re++;
+ switch (c) {
+ case '\\':
+ if (*p->re)
+ if ((c = *p->re++) == '\n')
+ c = '\n';
+ break;
+ case '\0':
+ c = Tend,
+ --p->re;
+ break;
+ case '*':
+ c = Tstar;
+ break;
+ case '?':
+ c = Tqmark;
+ break;
+ case '+':
+ c = Tplus;
+ break;
+ case '|':
+ c = Tor;
+ break;
+ case '.':
+ c = Tany;
+ break;
+ case '(':
+ c = Tlparen;
+ break;
+ case ')':
+ c = Trparen;
+ break;
+ case '^':
+ c = Tbol;
+ break;
+ case '$':
+ c = Teol;
+ break;
+ case '[':
+ goto charclass;
+ default:
+ ;
+ }
+ return c;
+
+charclass:
+ panicf("to implement");
+}
+
+#undef cinc
+
+static
+State*
+optimize(State *entry)
+{
+ State *curr, *next;
+ for (curr=entry; curr->type != Tend; curr++) {
+ next = curr->out;
+ while (next->type == Tnop)
+ next = next->out;
+ curr->out = next;
+ }
+
+ return entry;
+}
+
+void
+sreĀ·compile(Machine *mach, byte *regexp)
+{
+ int tok;
+ Parser p;
+ Node *prog;
+
+ p = (Parser) {
+ .re = regexp,
+ .mach = mach,
+ .node = p.nodestk,
+ };
+
+ pushtor(&p, Tstart - 1);
+ while ((tok = lex(&p)) != Tend) {
+ if ((tok & isoptor) == Toperator)
+ operator(&p, tok);
+ else
+ operand(&p, tok);
+ }
+ operateuntil(&p, Tstart);
+ operand(&p, Tend);
+ operateuntil(&p, Tstart);
+
+ prog = popand(&p);
+ mach->entry = optimize(prog->beg);
+}
diff --git a/src/libsre/sre.h b/src/libsre/sre.h
new file mode 100644
index 0000000..a7ace1a
--- /dev/null
+++ b/src/libsre/sre.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <u.h>
+#include <libn.h>
+
+enum
+{
+ Toperator = RuneMask + 1,
+ Tstart = Toperator,
+ Trparen,
+ Tlparen,
+ Tor,
+ Tcat,
+ Tstar,
+ Tplus,
+ Tqmark,
+
+ Tany = Toperator << 1,
+ Tnop,
+ Tbol,
+ Teol,
+ Tcclass,
+ Tnclass,
+ Tend,
+
+ isoptor = Toperator,
+ isopand = Toperator << 1,
+};
+
+typedef struct Class Class;
+typedef struct State State;
+typedef struct Patch Patch;
+typedef struct Node Node;
+
+typedef struct Parser Parser;
+typedef struct Machine Machine;
+
+struct Class
+{
+ rune *end;
+ rune span[64];
+};
+
+struct State
+{
+ int type;
+ union {
+ State *l;
+ };
+ union {
+ State *r;
+ State *out;
+ };
+};
+
+struct Patch
+{
+ State *s;
+ Patch *link;
+};
+
+struct Node
+{
+ State *beg;
+ State *end;
+};
+
+struct Parser
+{
+ Machine *mach;
+ byte *re;
+ int wasopand : 1;
+ int *optor, optorstk[1000];
+ Node *node, nodestk[1000];
+};
+
+struct Machine
+{
+ /* memory buffers */
+ struct {
+ void *heap;
+ memĀ·Reallocator;
+ };
+ State *state, statestk[1000];
+
+ struct {
+ int cap;
+ int len;
+ Class *c;
+ } class;
+
+ State *entry;
+};