ref: 363389e2b3fc56ad52922dc1903ca96129a3f5ed
parent: a2da7312d03152baea7311d4e0263f42fbc9a2e6
author: Ori Bernstein <[email protected]>
date: Tue Oct 22 10:30:39 EDT 2013
Add compilation of regexes.
--- a/compile.myr
+++ b/compile.myr
@@ -3,20 +3,415 @@
use "types.use"
pkg regex =
- const compile : (re : byte[:] -> regex#)
+ const compile : (re : byte[:] -> std.error(regex#, status))
;;
-const compile = {re
- var re
+type tree = union
+ /* basic string building */
+ `Alt [tree#, tree#]
+ `Cat [tree#, tree#]
+
+ /* repetition */
+ `Star tree#
+ `Plus tree#
+ `Quest tree#
+
+ /* end matches */
+ `Byte byte
+ `Chr char
+ `Class [char, char][:]
+ `Dot
+
+ /* meta */
+ `Cap tree#
+ `Astart
+ `Aend
+;;
+
+type parseresult = union
+ `Some tree#
+ `None
+ `Fail status
+;;
+
+const compile = {pat
+ var re : regex#
+
re = std.zalloc()
- re.prog = std.slalloc(4)
- /* compiled regex for a* */
- re.prog[0] = `Byte ('a' castto(byte))
- re.prog[1] = `Byte ('a' castto(byte))
- re.prog[2] = `Split (0, 3)
- re.prog[3] = `Match
+ re.pat = pat
+ match parse(re)
+ `None: std.put("Empty parse\n");;
+ `Fail f: -> `std.Failure f;;
+ `Some t:
+ dump(t, 0)
+ gen(re, t)
+ append(re, `Imatch)
+ idump(re)
+ ;;
+ ;;
+ -> `std.Failure (`Noimpl)
+}
- -> re
+const gen = {re, t
+ match t#
+ `Alt (a, b): genalt(re, a, b);;
+ `Cat (a, b): gen(re, a); gen(re, b);;
+ /* repetition */
+ `Star a: genstar(re, a);;
+ `Plus a: gen(re, a); genstar(re, a);;
+ `Quest a: genquest(re, a);;
+
+ /* end matches */
+ `Class sl:
+ std.die("Can't gen class\n")
+ ;;
+ `Byte b: append(re, `Ibyte b);;
+ `Chr c: genchar(re, c);;
+ `Dot: append(re, `Idot);;
+
+ /* meta */
+ `Cap a:
+ std.put("WARNING: Capture not implemented")
+ gen(re, a)
+ ;;
+ ;;
+ -> re.proglen
}
+const genalt = {re, l, r
+ var alt
+ var jmp
+ var l0
+ var l1
+ var l2
+ alt = re.proglen
+ l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
+ gen(re, l)
+ jmp = re.proglen
+ l1 = append(re, `Ijmp -1) /* needs to be replaced */
+ l2 = gen(re, l)
+
+ re.prog[alt] = `Ifork(l0, l1)
+ re.prog[jmp] = `Ijmp l2
+ -> re.proglen
+}
+
+const genstar = {re, rep
+ var alt
+ var jmp
+ var l0
+ var l1
+ var l2
+
+ l0 = re.proglen
+ alt = re.proglen
+ l1 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
+ jmp = gen(re, rep)
+ l2 = append(re, `Ijmp -1)
+
+
+ re.prog[alt] = `Ifork (l1, l2)
+ re.prog[jmp] = `Ijmp l0
+ -> re.proglen
+}
+
+const genquest = {re, q
+ var alt
+ var l0
+ var l1
+
+ alt = re.proglen
+ l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
+ l1 = gen(re, q)
+ re.prog[alt] = `Ifork (l0, l1)
+ -> re.proglen
+}
+
+const genchar = {re, c
+ var b : byte[4]
+ var n
+ var i
+
+ n = std.encode(b[:], c)
+ for i = 0; i < n; i++
+ append(re, `Ibyte b[i])
+ ;;
+ -> re.proglen
+}
+
+const append = {re, insn
+ if re.proglen == re.prog.len
+ re.prog = std.slgrow(re.prog, std.max(1, 2*re.proglen))
+ ;;
+ re.prog[re.proglen] = insn
+ re.proglen++
+ -> re.proglen
+}
+
+const idump = {re
+ var i
+
+ for i = 0; i < re.proglen; i++
+ std.put("%i:\t", i)
+ match re.prog[i]
+ /* Char matching. Consume exactly one byte from the string. */
+ `Ibyte b:
+ std.put("`Ibyte %b (%c)\n", b, b castto(char))
+ ;;
+ `Irange (start, end):
+ std.put("`Irange (%b,%b)\n", start, end)
+ ;;
+ `Idot:
+ std.put("`Idot\n")
+ ;;
+ /*
+ Control flow. All of these recursively call step() until
+ exactly one byte is consumed from the string.
+ */
+ `Ifork (lip, rip):
+ std.put("`Ifork (%z,%z)\n", lip, rip)
+ ;;
+ `Ijmp ip:
+ std.put("`Ijmp %z\n", ip)
+ ;;
+ `Imatch:
+ std.put("`Imatch\n")
+ ;;
+ ;;
+ ;;
+}
+
+const dump = {t, indent
+ var i
+
+ for i = 0; i < indent; i++
+ std.put(" ")
+ ;;
+ match t#
+ `Alt (a, b):
+ std.put("Alt\n")
+ dump(a, indent + 1)
+ dump(b, indent + 1)
+ ;;
+ `Cat (a, b):
+ std.put("Cat\n")
+ dump(a, indent + 1)
+ dump(b, indent + 1)
+ ;;
+ /* repetition */
+ `Star a:
+ std.put("Star\n")
+ dump(a, indent + 1)
+ ;;
+ `Plus a:
+ std.put("Plus\n")
+ dump(a, indent + 1)
+ ;;
+ `Quest a:
+ std.put("Quest\n")
+ dump(a, indent + 1)
+ ;;
+ /* end matches */
+ `Class sl:
+ std.put("Class [a..b]\n")
+ ;;
+ `Byte b:
+ std.put("Byte %b\n", b)
+ ;;
+ `Chr c:
+ std.put("Char %c\n", c)
+ ;;
+ `Dot:
+ std.put("Dot\n")
+ ;;
+
+ /* meta */
+ `Cap a:
+ std.put("Cap\n")
+ dump(a, indent + 1)
+ ;;
+ ;;
+}
+
+const parse = {re
+ match altexpr(re)
+ `Some t:
+ if re.pat.len == 0
+ -> `Some t
+ else
+ free(t)
+ -> `Fail (`Earlystop)
+ ;;
+ ;;
+ `None:
+ -> `None
+ ;;
+ ;;
+}
+
+const altexpr = {re
+ var ret : tree#
+
+ std.put("-> alt\n")
+ match catexpr(re)
+ `Some t:
+ ret = t
+ if matchc(re, '|')
+ match altexpr(re)
+ `Some rhs:
+ ret = mk(`Alt (ret, rhs))
+ ;;
+ `None:
+ free(ret)
+ -> `Fail (`Earlystop)
+ ;;
+ `Fail f:
+ -> `Fail f
+ ;;
+ ;;
+
+ ;;
+ ;;
+ other: -> other;;
+ ;;
+ -> `Some ret
+}
+
+const catexpr = {re
+ var ret
+
+ std.put("-> cat\n")
+ match repexpr(re)
+ `Some t:
+ ret = t
+ match catexpr(re)
+ `Some rhs:
+ ret = mk(`Cat (t, rhs))
+ ;;
+ `Fail f: -> `Fail f;;
+ `None: /* nothing */;;
+ ;;
+ ;;
+ other: -> other;;
+ ;;
+ -> `Some ret
+}
+
+const repexpr = {re
+ var ret
+
+ std.put("-> rep\n")
+ match baseexpr(re)
+ `Some t:
+ if matchc(re, '*')
+ ret = mk(`Star t)
+ elif matchc(re, '+')
+ ret = mk(`Plus t)
+ elif matchc(re, '?')
+ ret = mk(`Quest t)
+ else
+ ret = t
+ ;;
+ ;;
+ other: -> other;;
+ ;;
+ -> `Some ret
+}
+
+const baseexpr = {re
+ var ret
+
+ std.put("-> base\n")
+ if re.pat.len == 0
+ -> `None
+ ;;
+ match getc(re)
+ '[': ret = chrclass(re);;
+ '.': ret = mk(`Dot);;
+ '^': ret = mk(`Astart);;
+ '$': ret = mk(`Aend);;
+ '(':
+ match altexpr(re)
+ `Some s: ret = mk(`Cap s);;
+ `None: -> `Fail (`Emptyparen);;
+ ;;
+ if !matchc(re, ')')
+ free(ret)
+ -> `Fail (`Unbalanced)
+ ;;
+ ;;
+ '*': -> `Fail (`Badrep);;
+ '+': -> `Fail (`Badrep);;
+ '?': -> `Fail (`Badrep);;
+ c:
+ if c == '\\'
+ if re.pat.len == 0
+ -> `Fail (`Earlystop)
+ ;;
+ c = getc(re)
+ ;;
+ ret = mk(`Chr c)
+ ;;
+ ;;
+ std.put("<- base\n")
+ dump(ret, 0)
+ -> `Some ret
+}
+
+const chrclass = {pat
+ var t
+
+ t = std.alloc()
+ t# = `Class [][:]
+ -> t
+}
+
+const matchc = {re, c
+ var str
+ var chr
+
+ (chr, str) = std.striter(re.pat)
+ if chr != c
+ -> false
+ ;;
+ re.pat = str
+ -> true
+}
+
+const getc = {re
+ var c
+
+ (c, re.pat) = std.striter(re.pat)
+ -> c
+}
+
+
+const mk = {v
+ var t
+
+ t = std.alloc()
+ t# = v
+ -> t
+}
+
+const free = {t
+ match t#
+ `Alt (a, b): free(a); free(b);;
+ `Cat (a, b): free(a); free(b);;
+ /* repetition */
+ `Star a: free(a);;
+ `Plus a: free(a);;
+ `Quest a: free(a);;
+
+ /* end matches */
+ `Class sl: std.slfree(sl);;
+ `Byte b: ;;
+ `Chr c: ;;
+ `Dot: ;;
+
+ /* meta */
+ `Cap a: free(a);;
+ ;;
+ std.free(t)
+}
--- a/interp.myr
+++ b/interp.myr
@@ -35,33 +35,36 @@
thr = re.thr[tid]
str = re.str
match re.prog[thr.ip]
- /* Char matching */
- `Byte b:
+ /* Char matching. Consume exactly one byte from the string. */
+ `Ibyte b:
if !in(re, str) || b != str[re.strp]
- kill(re, tid, "not char")
+ kill(re, tid, "not right char")
else
std.put("matched %b with %b\n", b, str[re.strp])
;;
;;
- `Range (start, end):
+ `Irange (start, end):
if !in(re, str) || start > str[re.strp] || end < str[re.strp]
kill(re, tid, "bad range")
;;
;;
- `Dot:
+ `Idot:
if !in(re, str)
kill(re, tid, "past end")
;;
;;
- /* Control flow */
- `Split (lip, rip):
+ /*
+ Control flow. All of these recursively call step() until
+ exactly one byte is consumed from the string.
+ */
+ `Ifork (lip, rip):
spawn(re, lip)
jmp(re, tid, rip)
;;
- `Jmp ip:
+ `Ijmp ip:
jmp(re, tid, ip)
;;
- `Match:
+ `Imatch:
finish(re, tid)
;;
;;
@@ -91,7 +94,6 @@
re.thr[re.nthr] = thr
re.nthr++
-
step(re, tid)
}
--- a/main.myr
+++ b/main.myr
@@ -1,13 +1,13 @@
+use regex
use std
-use regex
-
const main = {
+ regex.compile("a+")
+ /*
var re
var found
-
- re = regex.compile("a+")
found = regex.exec(re, "aaa")
std.put("Found: len = %z\n", re.strp, found)
+ */
}
--- a/types.myr
+++ b/types.myr
@@ -1,7 +1,18 @@
use std
pkg regex =
+ type status = union
+ `Earlystop
+ `Unbalanced
+ `Emptyparen
+ `Badrep
+ `Noimpl
+ ;;
+
type regex = struct
+ pat : byte[:]
+ /* VM state */
+ proglen : std.size
prog : reinst[:]
nthr : std.size
thr : rethread#[:]
@@ -17,13 +28,13 @@
type reinst = union
/* direct consumers */
- `Byte byte
- `Range [byte, byte]
- `Dot
+ `Ibyte byte
+ `Irange [byte, byte]
+ `Idot
- `Match /* found the end of the expr */
+ `Imatch /* found the end of the expr */
- `Split [std.size, std.size]
- `Jmp std.size
+ `Ifork [std.size, std.size]
+ `Ijmp std.size
;;
;;