shithub: mc

Download patch

ref: 1b08f4376253dca73868af564b44a7680a02667c
parent: 560a2b7de036a9f21e0a86f8656ca9b4cd2be478
author: Ori Bernstein <[email protected]>
date: Sun Sep 7 18:11:10 EDT 2014

Add support for word boundary metacharacter.

--- a/compile.myr
+++ b/compile.myr
@@ -30,6 +30,8 @@
 	`Cap	(std.size, tree#) /* id, tree */
 	`Bol	/* beginning of line */
 	`Eol	/* end of line */
+	`Bow	/* beginning of word */
+	`Eow	/* end of word */
 ;;
 
 type parseresult = union
@@ -107,6 +109,8 @@
 	/* meta */
 	|`Bol:	append(re, `Ibol)
 	|`Eol:	append(re, `Ibol)
+	|`Bow:	append(re, `Ibow)
+	|`Eow:	append(re, `Ieow)
 	|`Cap	(m, a):
 		append(re, `Ilbra m)
 		gen(re, a)
@@ -312,6 +316,7 @@
 	var i
 
 	n = std.encode(b[:], c)
+	std.assert(n > 0 && n < 4, "non-utf character in regex\n")
 	for i = 0; i < n; i++
 		append(re, `Ibyte b[i])
 	;;
@@ -352,6 +357,8 @@
 		/* anchors */
 		| `Ibol:			std.put("`Ibol\n")
 		| `Ieol:			std.put("`Ieol\n")
+		| `Ibow:			std.put("`Ibow\n")
+		| `Ieow:			std.put("`Ieow\n")
 		/* control flow */
 		| `Ifork	(lip, rip):	std.put("`Ifork (%z,%z)\n", lip, rip) 
 		| `Ijmp ip:		std.put("`Ijmp %z\n", ip) 
@@ -399,6 +406,10 @@
 		std.put("Bol\n")
 	| `Eol:
 		std.put("Eol\n")
+	| `Bow:
+		std.put("Bow\n")
+	| `Eow:
+		std.put("Eow\n")
 	/* end matches */
 	| `Byte	b:
 		std.put("Byte %b\n", b)
@@ -566,6 +577,10 @@
 	/* unicode character classes */
 	| 'p':	ret = unicodeclass(re, false)
 	| 'P':  ret = unicodeclass(re, true)
+
+	/* operators that need an escape */
+	| '<': ret = `Some mk(`Bow)
+	| '>': ret = `Some mk(`Eow)
 
 	/* escaped metachars */
 	| '^': ret = `Some mk(`Chr '^')
--- a/interp.myr
+++ b/interp.myr
@@ -163,10 +163,31 @@
 	| `Ieol:
 		trace(re, thr, "\t%z:\tEol\n", thr.ip)
 		if re.strp == str.len || str[re.strp] == '\n' castto(byte)
+			thr.ip++
 			-> false
 		else
 			die(re, thr, "not end of line")
 		;;
+	/* check for word characters */
+	| `Ibow:
+		trace(re, thr, "\t%z:\tBow\n", thr.ip)
+		if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp)))
+			thr.ip++
+			-> false
+		else
+			die(re, thr, "not beginning of word")
+		;;
+	| `Ieow:
+		trace(re, thr, "\t%z:\tEow\n", thr.ip)
+		/* end of word: the previous char is a word char, and we are at the end of
+		   the input or the next char is not a word char. checking re.strp > 0
+		   first keeps prevchar() from asserting on an empty or unconsumed input. */
+		if re.strp > 0 && iswordchar(prevchar(str, re.strp)) && (re.strp == str.len || !iswordchar(str[re.strp:]))
+			thr.ip++
+			-> false
+		else
+			die(re, thr, "not end of word")
+		;;
 	| `Ilbra m:
 		trace(re, thr, "\t%z:\tLbra %z\n", thr.ip, m)
 		trace(re, thr, "\t\tmatch start = %z\n", re.strp)
@@ -267,4 +288,21 @@
 	if re.debug
 		std.putv(msg, std.vastart(&args))
 	;;
+}
+
+/* returns the tail of s starting at the utf8 character that ends just before byte offset i; must be called with i >= 1 */
+const prevchar = {s, i
+	std.assert(i != 0, "prevchar must be called with i >= 1\n")
+	i--
+	while i != 0 && (s[i] & 0xc0) == 0x80	/* back up over continuation bytes (10xxxxxx) only, stopping on the lead byte */
+		i--
+	;;
+	-> s[i:]
+}
+
+/* true if the character std.decode reads from the front of s is '_', alphabetic (std.isalpha), or a digit (std.isdigit) */
+const iswordchar = {s
+	var ch
+	ch = std.decode(s)
+	-> ch == '_' || std.isalpha(ch) || std.isdigit(ch)
+}
--- /dev/null
+++ b/test/data/regex-boundaries-expected
@@ -1,0 +1,28 @@
+Matched abcdef via \<([a-z]*)\> : 2
+	match 0: abcdef
+	match 1: abcdef
+Matched !m! via .*(\<.*\>).* : 2
+	match 0: !m!
+	match 1: m
+Matched !m via .*(\<.*\>).* : 2
+	match 0: !m
+	match 1: m
+Matched m! via .*(\<.*\>).* : 2
+	match 0: m!
+	match 1: m
+Matched !@#!!matches!!%! via .*(\<.*\>).* : 2
+	match 0: !@#!!matches!!%!
+	match 1: matches
+Matched matches!!%! via .*(\<.*\>).* : 2
+	match 0: matches!!%!
+	match 1: matches
+Matched !@#!!matches via .*(\<.*\>).* : 2
+	match 0: !@#!!matches
+	match 1: matches
+Matched !@#!!matches!!%!foo via .*(\<.*\>).* : 2
+	match 0: !@#!!matches!!%!foo
+	match 1: foo
+Matched 123 via .*(\<.*\>).* : 2
+	match 0: 123
+	match 1: 123
+No match of abcdefoo via \<([a-z]*)\>foo
--- /dev/null
+++ b/test/regex-boundaries.myr
@@ -1,0 +1,17 @@
+use "testmatch.use"
+
+const main = {
+	/* expected matches: capture group 1 should hold the word delimited by \< and \> */
+	testmatch("\\<([a-z]*)\\>", "abcdef")	/* whole string is a single word */
+	testmatch(".*(\\<.*\\>).*", "!m!")	/* single char word in midstring */
+	testmatch(".*(\\<.*\\>).*", "!m")	/* single char word at end of string */
+	testmatch(".*(\\<.*\\>).*", "m!")	/* single char word at start of string */
+	testmatch(".*(\\<.*\\>).*", "!@#!!matches!!%!")	/* word in midstring */
+	testmatch(".*(\\<.*\\>).*", "matches!!%!")	/* word at start of string */
+	testmatch(".*(\\<.*\\>).*", "!@#!!matches")	/* word at end of string */
+	testmatch(".*(\\<.*\\>).*", "!@#!!matches!!%!foo")	/* two words present: the capture is the last one, "foo" */
+	testmatch(".*(\\<.*\\>).*", "123")	/* digits count as word characters too */
+	
+	/* nonmatches */
+	testmatch("\\<([a-z]*)\\>foo", "abcdefoo")	/* no word boundary inside "abcdefoo", so \> cannot match before "foo" */
+}
--- a/test/tests
+++ b/test/tests
@@ -21,8 +21,9 @@
 #	What we compare with. This should be self-
 #	evident.
 B regex-basic		C
-B regex-class		C
-B regex-negclass	C
+B regex-boundaries	C
 B regex-capture         C
+B regex-class		C
 B regex-failmatch	C
+B regex-negclass	C
 B regex-unicode		C
--- a/types.myr
+++ b/types.myr
@@ -51,6 +51,8 @@
 		/* anchors */
 		`Ibol
 		`Ieol
+		`Ibow
+		`Ieow
 
 		/* control flow */
 		`Ifork	(std.size, std.size)