ref: 64a31ef2fd606a419f9e79ad56509febbff54f17
parent: 595655d194bef8eb4505ffe38af641f78aa157fe
author: Ori Bernstein <[email protected]>
date: Thu Dec 26 19:31:45 EST 2013
Add in loop detection for regexes. It used to be that '(a*)*' would make our regex interpreter loop indefinitely because we had a path through the automaton that would return to the start without consuming any characters.
--- a/interp.myr
+++ b/interp.myr
@@ -15,6 +15,7 @@
run(re)
match re.matched
| `std.Some thr:
+ trace(re, thr, "mstart: %z, mend: %z, str.len: %z\n", thr.mstart[0], thr.mend[0], str.len)
if thr.mend[0] != str.len
-> `std.None
else
@@ -41,7 +42,8 @@
}
const run = {re
- var i, consumed
+ var i, ip
+ var consumed
var thr
re.matched = `std.None
@@ -58,10 +60,11 @@
thr = re.runq
re.runq = thr.next
- trace(re, thr, "\nrunning tid=%z, ip=%z, c=%c\n", thr.tid, thr.ip, re.str[re.strp] castto(char))
- consumed = step(re, thr)
+ trace(re, thr, "\nrunning tid=%z, ip=%z, s[%z]=%c\n", thr.tid, thr.ip, re.strp, std.decode(re.str[re.strp:]))
+ ip = thr.ip
+ consumed = step(re, thr, -1)
while !consumed
- consumed = step(re, thr)
+ consumed = step(re, thr, ip)
;;
if thr.dead
@@ -71,11 +74,10 @@
re.expired = thr
;;
;;
+ trace(re, thr, "SWITCH\n")
re.runq = re.expired
re.expired = Zthr
- if re.nthr > 0
- re.strp++
- ;;
+ re.strp++
;;
}
@@ -83,7 +85,7 @@
Steps forward one instruction. Returns true if a byte of input was
consumed, false otherwise.
*/
-const step = {re, thr
+const step = {re, thr, curip
var str
var mstart
var mend
@@ -142,7 +144,7 @@
trace(re, thr, "\t%z:\tFork (%z, %z)\n", thr.ip, lip, rip)
mstart = std.sldup(thr.mstart)
mend = std.sldup(thr.mend)
- fork(re, thr, rip, mstart, mend)
+ fork(re, thr, rip, curip, mstart, mend)
thr.ip = lip
-> false
| `Ijmp ip:
@@ -157,9 +159,12 @@
-> true
}
-const fork = {re, thr, ip, mstart, mend
+const fork = {re, thr, ip, curip, mstart, mend
var thr
+ if ip == curip /* loop detection */
+ ->
+ ;;
thr = mkthread(re, ip)
thr.next = re.runq
thr.mstart = mstart
@@ -174,7 +179,7 @@
}
const finish = {re, thr
- trace(re, thr, "finish %z\n", tid)
+ trace(re, thr, "finish %z\n", thr.tid)
match re.matched
| `std.Some t: thrfree(re, t)
| `std.None:
@@ -184,7 +189,7 @@
re.nthr--
}
-var tid = 0
+var nexttid = 0
const mkthread = {re, ip
var thr : rethread#
@@ -194,7 +199,7 @@
thr.next = Zthr
thr.ip = ip
- thr.tid = tid++
+ thr.tid = nexttid++
thr.dead = false
thr.matched = false
--- a/test/data/regex-basic-expected
+++ b/test/data/regex-basic-expected
@@ -1,2 +1,5 @@
Matched. 1 matches
match 0: Abc
+Matched. 2 matches
+match 0: a
+match 1: a
--- a/test/regex-basic.myr
+++ b/test/regex-basic.myr
@@ -4,4 +4,5 @@
const main = {
testmatch(".*bc", "Abc")
+ testmatch("(a*)*", "a")
}
--- a/test/testmatch.myr
+++ b/test/testmatch.myr
@@ -3,11 +3,20 @@
pkg =
const testmatch : (pat : byte[:], text : byte[:] -> void)
+ const dbgmatch : (pat : byte[:], text : byte[:] -> void)
;;
const testmatch = {pat, text
+ run(regex.compile(pat), text)
+}
+
+const dbgmatch = {pat, text
+ run(regex.dbgcompile(pat), text)
+}
+
+const run = {regex, text
var i
- match regex.compile(pat)
+ match regex
| `std.Success re:
match regex.exec(re, text)
| `std.Some m: