ref: 1b08f4376253dca73868af564b44a7680a02667c
parent: 560a2b7de036a9f21e0a86f8656ca9b4cd2be478
author: Ori Bernstein <[email protected]>
date: Sun Sep 7 18:11:10 EDT 2014
Add support for word boundary metacharacter.
--- a/compile.myr
+++ b/compile.myr
@@ -30,6 +30,8 @@
`Cap (std.size, tree#) /* id, tree */
`Bol /* beginning of line */
`Eol /* end of line */
+ `Bow /* beginning of word */
+ `Eow /* end of word */
;;
type parseresult = union
@@ -107,6 +109,8 @@
/* meta */
|`Bol: append(re, `Ibol)
|`Eol: append(re, `Ibol)
+ |`Bow: append(re, `Ibow)
+ |`Eow: append(re, `Ieow)
|`Cap (m, a):
append(re, `Ilbra m)
gen(re, a)
@@ -312,6 +316,7 @@
var i
n = std.encode(b[:], c)
+ std.assert(n > 0 && n <= 4, "non-utf character in regex\n") /* a utf8 encoding is 1..4 bytes; 4 is valid */
for i = 0; i < n; i++
append(re, `Ibyte b[i])
;;
@@ -352,6 +357,8 @@
/* anchors */
| `Ibol: std.put("`Ibol\n")
| `Ieol: std.put("`Ieol\n")
+ | `Ibow: std.put("`Ibow\n")
+ | `Ieow: std.put("`Ieow\n")
/* control flow */
| `Ifork (lip, rip): std.put("`Ifork (%z,%z)\n", lip, rip)
| `Ijmp ip: std.put("`Ijmp %z\n", ip)
@@ -399,6 +406,10 @@
std.put("Bol\n")
| `Eol:
std.put("Eol\n")
+ | `Bow:
+ std.put("Bow\n")
+ | `Eow:
+ std.put("Eow\n")
/* end matches */
| `Byte b:
std.put("Byte %b\n", b)
@@ -566,6 +577,10 @@
/* unicode character classes */
| 'p': ret = unicodeclass(re, false)
| 'P': ret = unicodeclass(re, true)
+
+ /* operators that need an escape */
+ | '<': ret = `Some mk(`Bow)
+ | '>': ret = `Some mk(`Eow)
/* escaped metachars */
| '^': ret = `Some mk(`Chr '^')
--- a/interp.myr
+++ b/interp.myr
@@ -163,10 +163,31 @@
| `Ieol:
trace(re, thr, "\t%z:\tEol\n", thr.ip)
if re.strp == str.len || str[re.strp] == '\n' castto(byte)
+ thr.ip++
-> false
else
die(re, thr, "not end of line")
;;
+ /* check for word characters */
+ | `Ibow:
+ trace(re, thr, "\t%z:\tBow\n", thr.ip)
+ if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp)))
+ thr.ip++
+ -> false
+ else
+ die(re, thr, "not beginning of word")
+ ;;
+	| `Ieow:
+		trace(re, thr, "\t%z:\tEow\n", thr.ip)
+		/* end of word: the previous char is a word char and the next one, if any, is not.
+		   re.strp > 0 is tested first so prevchar's i >= 1 requirement always holds; with an
+		   empty subject (re.strp == str.len == 0) the old first branch tripped its assert. */
+		if re.strp > 0 && iswordchar(prevchar(str, re.strp)) && (re.strp == str.len || !iswordchar(str[re.strp:]))
+			thr.ip++
+			-> false
+		else
+			die(re, thr, "not end of word")
+		;;
| `Ilbra m:
trace(re, thr, "\t%z:\tLbra %z\n", thr.ip, m)
trace(re, thr, "\t\tmatch start = %z\n", re.strp)
@@ -267,4 +288,21 @@
if re.debug
std.putv(msg, std.vastart(&args))
;;
+}
+
+/* returns the tail of s that starts at the utf8 char just before byte offset i; needs i >= 1 */
+const prevchar = {s, i
+	std.assert(i != 0, "prevchar must be called with i >= 1\n")
+	i--
+	while i != 0 && (s[i] & 0xc0) == 0x80 /* skip continuation bytes (10xxxxxx) only, stop on a lead byte */
+		i--
+	;;
+	-> s[i:]
+}
+
+const iswordchar = {s
+	var ch
+	/* only the first char decoded from s is examined: '_' or anything std.isalpha/std.isdigit accept */
+	ch = std.decode(s)
+	-> ch == '_' || std.isalpha(ch) || std.isdigit(ch)
}
--- /dev/null
+++ b/test/data/regex-boundaries-expected
@@ -1,0 +1,28 @@
+Matched abcdef via \<([a-z]*)\> : 2
+ match 0: abcdef
+ match 1: abcdef
+Matched !m! via .*(\<.*\>).* : 2
+ match 0: !m!
+ match 1: m
+Matched !m via .*(\<.*\>).* : 2
+ match 0: !m
+ match 1: m
+Matched m! via .*(\<.*\>).* : 2
+ match 0: m!
+ match 1: m
+Matched !@#!!matches!!%! via .*(\<.*\>).* : 2
+ match 0: !@#!!matches!!%!
+ match 1: matches
+Matched matches!!%! via .*(\<.*\>).* : 2
+ match 0: matches!!%!
+ match 1: matches
+Matched !@#!!matches via .*(\<.*\>).* : 2
+ match 0: !@#!!matches
+ match 1: matches
+Matched !@#!!matches!!%!foo via .*(\<.*\>).* : 2
+ match 0: !@#!!matches!!%!foo
+ match 1: foo
+Matched 123 via .*(\<.*\>).* : 2
+ match 0: 123
+ match 1: 123
+No match of abcdefoo via \<([a-z]*)\>foo
--- /dev/null
+++ b/test/regex-boundaries.myr
@@ -1,0 +1,17 @@
+use "testmatch.use"
+
+const main = {
+ /* expected matches: the capture group should hold exactly one word */
+ testmatch("\\<([a-z]*)\\>", "abcdef") /* whole subject is a single word */
+ testmatch(".*(\\<.*\\>).*", "!m!") /* single char word in midstring */
+ testmatch(".*(\\<.*\\>).*", "!m") /* single char word at end of string */
+ testmatch(".*(\\<.*\\>).*", "m!") /* single char word at start of string */
+ testmatch(".*(\\<.*\\>).*", "!@#!!matches!!%!") /* word in midstring */
+ testmatch(".*(\\<.*\\>).*", "matches!!%!") /* word at start of string */
+ testmatch(".*(\\<.*\\>).*", "!@#!!matches") /* word at end of string */
+ testmatch(".*(\\<.*\\>).*", "!@#!!matches!!%!foo") /* captures the last word in the string */
+ testmatch(".*(\\<.*\\>).*", "123") /* digits count as word characters too */
+
+ /* expected non-matches */
+ testmatch("\\<([a-z]*)\\>foo", "abcdefoo") /* \> cannot match between "abcde" and "foo": both sides are word chars */
+}
--- a/test/tests
+++ b/test/tests
@@ -21,8 +21,9 @@
# What we compare with. This should be self-
# evident.
B regex-basic C
-B regex-class C
-B regex-negclass C
+B regex-boundaries C
B regex-capture C
+B regex-class C
B regex-failmatch C
+B regex-negclass C
B regex-unicode C
--- a/types.myr
+++ b/types.myr
@@ -51,6 +51,8 @@
/* anchors */
`Ibol
`Ieol
+ `Ibow
+ `Ieow
/* control flow */
`Ifork (std.size, std.size)