ref: 750f64d65eaa7af67dc7377ee948dda823c68476
dir: /compile.myr/
use std use "types.use" pkg regex = const compile : (re : byte[:] -> std.error(regex#, status)) ;; type tree = union /* basic string building */ `Alt [tree#, tree#] `Cat [tree#, tree#] /* repetition */ `Star tree# `Plus tree# `Quest tree# /* end matches */ `Byte byte `Chr char `Class [char, char][:] `Dot /* meta */ `Cap tree# `Bol /* beginning of line */ `Eol /* end of line */ ;; type parseresult = union `Some tree# `None `Fail status ;; const compile = {pat var re : regex# re = std.zalloc() re.pat = pat re.nmatch = 1 /* whole match */ match parse(re) `None: -> `std.Failure (`Earlystop);; `Fail f: -> `std.Failure f;; `Some t: dump(t, 0) append(re, `Ilbra 0) gen(re, t) append(re, `Irbra 0) append(re, `Imatch) idump(re) -> `std.Success re ;; ;; -> `std.Failure (`Noimpl) } const gen = {re, t var m match t# `Alt (a, b): genalt(re, a, b);; `Cat (a, b): gen(re, a); gen(re, b);; /* repetition */ `Star a: genstar(re, a);; `Plus a: gen(re, a); genstar(re, a);; `Quest a: genquest(re, a);; /* end matches */ `Class sl: std.die("Can't gen class\n") ;; `Byte b: append(re, `Ibyte b);; `Chr c: genchar(re, c);; `Dot: genrange(re, 0, std.Maxcharval);; /* meta */ `Bol: append(re, `Ibol) ;; `Eol: append(re, `Ibol) ;; `Cap a: m = re.nmatch++ append(re, `Ilbra m) gen(re, a) append(re, `Irbra m) ;; ;; -> re.proglen } const genrange = {re, lo, hi var charrng = [ 0, 0x80, 0x800, 0x10000, 0x200000, -1 ] var lbuf : byte[4] var hbuf : byte[4] var lsz var hsz var end var sz var d var i var j lsz = std.charlen(lo) hsz = std.charlen(hi) charrng[lsz - 1] = lo charrng[hsz] = hi if lsz == 1 && hsz == 1 append(re, `Irange (lo castto(byte), hi castto(byte))) else for i = hsz; i > lsz; i-- std.put("i = %z\n", i - 2) d = re.proglen + i - 1 append(re, `Ifork (re.proglen + 1, jmpdist(i) + d)) ;; end = re.proglen + jmpdist(hsz + 1); for i = 0; i < hsz; i++ std.put("lo[%z] = %i\n", i, charrng[i] castto(int)) std.put("hi[%z] = %i\n", i, (charrng[i + 1] - 1) castto(int)) sz = std.encode(lbuf[:], charrng[i]) std.encode(hbuf[:], charrng[i + 1] - 1) for j = 0; j < sz; j++ append(re, `Irange (lbuf[j], hbuf[j])) ;; append(re, `Ijmp (end)) ;; ;; -> re.proglen } const jmpdist = {n var d var i d = n - 1 for i = n - 1; i > 0; i-- d += i ;; -> d } const genalt = {re, l, r var alt var jmp var l0 var l1 var l2 alt = re.proglen l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */ gen(re, l) jmp = re.proglen l1 = append(re, `Ijmp -1) /* needs to be replaced */ l2 = gen(re, r) re.prog[alt] = `Ifork(l0, l1) re.prog[jmp] = `Ijmp l2 -> re.proglen } const genstar = {re, rep var alt var jmp var l0 var l1 var l2 l0 = re.proglen alt = re.proglen l1 = append(re, `Ifork (-1, -1)) /* needs to be replaced */ jmp = gen(re, rep) l2 = append(re, `Ijmp -1) re.prog[alt] = `Ifork (l1, l2) re.prog[jmp] = `Ijmp l0 -> re.proglen } const genquest = {re, q var alt var l0 var l1 alt = re.proglen l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */ l1 = gen(re, q) re.prog[alt] = `Ifork (l0, l1) -> re.proglen } const genchar = {re, c var b : byte[4] var n var i n = std.encode(b[:], c) for i = 0; i < n; i++ append(re, `Ibyte b[i]) ;; -> re.proglen } const append = {re, insn if re.proglen == re.prog.len re.prog = std.slgrow(re.prog, std.max(1, 2*re.proglen)) ;; re.prog[re.proglen] = insn re.proglen++ -> re.proglen } const idump = {re var i for i = 0; i < re.proglen; i++ std.put("%i:\t", i) match re.prog[i] /* Char matching. Consume exactly one byte from the string. */ `Ibyte b: std.put("`Ibyte %b (%c)\n", b, b castto(char)) ;; `Irange (start, end): std.put("`Irange (%b,%b)\n", start, end) ;; `Idot: std.put("`Idot\n") ;; /* capture groups */ `Ilbra m: std.put("`Ilbra %z\n", m) ;; `Irbra m: std.put("`Irbra %z\n", m) ;; /* anchors */ `Ibol: std.put("`Ibol\n");; `Ieol: std.put("`Ieol\n");; /* control flow */ `Ifork (lip, rip): std.put("`Ifork (%z,%z)\n", lip, rip) ;; `Ijmp ip: std.put("`Ijmp %z\n", ip) ;; `Imatch: std.put("`Imatch\n") ;; ;; ;; } const dump = {t, indent var i for i = 0; i < indent; i++ std.put(" ") ;; match t# `Alt (a, b): std.put("Alt\n") dump(a, indent + 1) dump(b, indent + 1) ;; `Cat (a, b): std.put("Cat\n") dump(a, indent + 1) dump(b, indent + 1) ;; /* repetition */ `Star a: std.put("Star\n") dump(a, indent + 1) ;; `Plus a: std.put("Plus\n") dump(a, indent + 1) ;; `Quest a: std.put("Quest\n") dump(a, indent + 1) ;; `Bol: std.put("Bol\n") ;; `Eol: std.put("Eol\n") ;; /* end matches */ `Class sl: std.put("Class [a..b]\n") ;; `Byte b: std.put("Byte %b\n", b) ;; `Chr c: std.put("Char %c\n", c) ;; `Dot: std.put("Dot\n") ;; /* meta */ `Cap a: std.put("Cap\n") dump(a, indent + 1) ;; ;; } const parse = {re match altexpr(re) `Some t: if re.pat.len == 0 -> `Some t else free(t) -> `Fail (`Earlystop) ;; ;; `None: -> `None ;; ;; } const altexpr = {re var ret : tree# match catexpr(re) `Some t: ret = t if matchc(re, '|') match altexpr(re) `Some rhs: ret = mk(`Alt (ret, rhs)) ;; `None: free(ret) -> `Fail (`Earlystop) ;; `Fail f: -> `Fail f ;; ;; ;; ;; other: -> other;; ;; -> `Some ret } const catexpr = {re var ret match repexpr(re) `Some t: ret = t match catexpr(re) `Some rhs: ret = mk(`Cat (t, rhs)) ;; `Fail f: -> `Fail f;; `None: /* nothing */;; ;; ;; other: -> other;; ;; -> `Some ret } const repexpr = {re var ret match baseexpr(re) `Some t: if matchc(re, '*') ret = mk(`Star t) elif matchc(re, '+') ret = mk(`Plus t) elif matchc(re, '?') ret = mk(`Quest t) else ret = t ;; ;; other: -> other;; ;; -> `Some ret } const baseexpr = {re var ret if re.pat.len == 0 -> `None ;; match peekc(re) /* lower prec operators */ '|': -> `None;; ')': -> `None;; '*': -> `Fail (`Badrep);; '+': -> `Fail (`Badrep);; '?': -> `Fail (`Badrep);; '[': ret = chrclass(re);; '.': getc(re); ret = mk(`Dot);; '^': getc(re); ret = mk(`Bol);; '$': getc(re); ret = mk(`Eol);; '(': getc(re) match altexpr(re) `Some s: ret = mk(`Cap s);; `None: -> `Fail (`Emptyparen);; ;; if !matchc(re, ')') free(ret) -> `Fail (`Unbalanced) ;; ;; c: getc(re) if c == '\\' if re.pat.len == 0 -> `Fail (`Earlystop) ;; c = getc(re) ;; ret = mk(`Chr c) ;; ;; dump(ret, 0) -> `Some ret } const chrclass = {pat var t t = std.alloc() t# = `Class [][:] -> t } const matchc = {re, c var str var chr (chr, str) = std.striter(re.pat) if chr != c -> false ;; re.pat = str -> true } const getc = {re var c (c, re.pat) = std.striter(re.pat) -> c } const peekc = {re var c var _ (c, _) = std.striter(re.pat) -> c } const mk = {v var t t = std.alloc() t# = v -> t } const free = {t match t# `Alt (a, b): free(a); free(b);; `Cat (a, b): free(a); free(b);; /* repetition */ `Star a: free(a);; `Plus a: free(a);; `Quest a: free(a);; /* end matches */ `Class sl: std.slfree(sl);; `Byte b: ;; `Chr c: ;; `Dot: ;; /* meta */ `Cap a: free(a);; ;; std.free(t) }