shithub: mc

--- a/compile.myr

+++ b/compile.myr

@@ -30,6 +30,8 @@

 	`Cap	(std.size, tree#) /* id, tree */

 	`Bol	/* beginning of line */

 	`Eol	/* end of line */

+	`Bow	/* beginning of word */

+	`Eow	/* end of word */

;;

 type parseresult = union

@@ -107,6 +109,8 @@

 	/* meta */

 	|`Bol:	append(re, `Ibol)

 	|`Eol:	append(re, `Ibol)

+	|`Bow:	append(re, `Ibow)

+	|`Eow:	append(re, `Ieow)

 	|`Cap	(m, a):

 		append(re, `Ilbra m)

 		gen(re, a)

@@ -312,6 +316,7 @@

 	var i

 	n = std.encode(b[:], c)

+	/* a utf8 encoding is between 1 and 4 bytes long, inclusive; n <= 0 means encoding failed */
+	std.assert(n > 0 && n <= 4, "non-utf character in regex\n")

 	for i = 0; i < n; i++

 		append(re, `Ibyte b[i])

;;

@@ -352,6 +357,8 @@

 		/* anchors */

 		| `Ibol:			std.put("`Ibol\n")

 		| `Ieol:			std.put("`Ieol\n")

+		| `Ibow:			std.put("`Ibow\n")

+		| `Ieow:			std.put("`Ieow\n")

 		/* control flow */

 		| `Ifork	(lip, rip):	std.put("`Ifork (%z,%z)\n", lip, rip)

 		| `Ijmp ip:		std.put("`Ijmp %z\n", ip)

@@ -399,6 +406,10 @@

 		std.put("Bol\n")

 	| `Eol:

 		std.put("Eol\n")

+	| `Bow:

+		std.put("Bow\n")

+	| `Eow:

+		std.put("Eow\n")

 	/* end matches */

 	| `Byte	b:

 		std.put("Byte %b\n", b)

@@ -566,6 +577,10 @@

 	/* unicode character classes */

 	| 'p':	ret = unicodeclass(re, false)

 	| 'P':  ret = unicodeclass(re, true)

+	/* operators that need an escape */

+	| '<': ret = `Some mk(`Bow)

+	| '>': ret = `Some mk(`Eow)

 	/* escaped metachars */

 	| '^': ret = `Some mk(`Chr '^')

--- a/interp.myr

+++ b/interp.myr

@@ -163,10 +163,31 @@

 	| `Ieol:

 		trace(re, thr, "\t%z:\tEol\n", thr.ip)

 		if re.strp == str.len || str[re.strp] == '\n' castto(byte)

+			thr.ip++

 			-> false

 		else

 			die(re, thr, "not end of line")

;;

+	/*
+	 * beginning of word: the character at strp must be a word character,
+	 * and strp must either be the start of the input or be preceded by a
+	 * non-word character.
+	 */
+	| `Ibow:
+		trace(re, thr, "\t%z:\tBow\n", thr.ip)
+		if !iswordchar(str[re.strp:]) || (re.strp != 0 && iswordchar(prevchar(str, re.strp)))
+			die(re, thr, "not beginning of word")
+		else
+			thr.ip++
+			-> false
+		;;

+	/*
+	 * end of word: the character before strp must be a word character,
+	 * and strp must either be the end of the input or sit on a non-word
+	 * character. The strp > 0 test comes first so that prevchar() is never
+	 * called with offset 0 (it asserts on that), which previously happened
+	 * when the subject string was empty.
+	 */
+	| `Ieow:
+		trace(re, thr, "\t%z:\tEow\n", thr.ip)
+		if re.strp > 0 && iswordchar(prevchar(str, re.strp)) && (re.strp == str.len || !iswordchar(str[re.strp:]))
+			thr.ip++
+			-> false
+		else
+			die(re, thr, "not end of word")
+		;;

 	| `Ilbra m:

 		trace(re, thr, "\t%z:\tLbra %z\n", thr.ip, m)

 		trace(re, thr, "\t\tmatch start = %z\n", re.strp)

@@ -267,4 +288,21 @@

 	if re.debug

 		std.putv(msg, std.vastart(&args))

;;

+}

+/*
+ * Returns the tail of s that begins at the character immediately before
+ * byte offset i. Must be called with i >= 1; this is enforced with an
+ * assertion.
+ *
+ * Only utf8 continuation bytes (bit pattern 10xxxxxx) are stepped over when
+ * scanning backwards. Lead bytes of multibyte sequences are also >= 0x80,
+ * so testing against 0x80 alone would walk through every adjacent
+ * multibyte character instead of stopping at the start of the previous one.
+ */
+const prevchar = {s, i
+	std.assert(i != 0, "prevchar must be called with i >= 1\n")
+	i--
+	while i != 0 && (s[i] & 0xc0) == 0x80
+		i--
+	;;
+	-> s[i:]
+}

+/*
+ * Decodes the first character of s and reports whether it is a word
+ * character: alphabetic, a digit, or an underscore.
+ */
+const iswordchar = {s
+	var c
+	c = std.decode(s)
+	-> std.isalpha(c) || std.isdigit(c) || c == '_'
+}

--- /dev/null

+++ b/test/data/regex-boundaries-expected

@@ -1,0 +1,28 @@

+Matched abcdef via \<([a-z]*)\> : 2

+	match 0: abcdef

+	match 1: abcdef

+Matched !m! via .*(\<.*\>).* : 2

+	match 0: !m!

+	match 1: m

+Matched !m via .*(\<.*\>).* : 2

+	match 0: !m

+	match 1: m

+Matched m! via .*(\<.*\>).* : 2

+	match 0: m!

+	match 1: m

+Matched !@#!!matches!!%! via .*(\<.*\>).* : 2

+	match 0: !@#!!matches!!%!

+	match 1: matches

+Matched matches!!%! via .*(\<.*\>).* : 2

+	match 0: matches!!%!

+	match 1: matches

+Matched !@#!!matches via .*(\<.*\>).* : 2

+	match 0: !@#!!matches

+	match 1: matches

+Matched !@#!!matches!!%!foo via .*(\<.*\>).* : 2

+	match 0: !@#!!matches!!%!foo

+	match 1: foo

+Matched 123 via .*(\<.*\>).* : 2

+	match 0: 123

+	match 1: 123

+No match of abcdefoo via \<([a-z]*)\>foo

--- /dev/null

+++ b/test/regex-boundaries.myr

@@ -1,0 +1,17 @@

+use "testmatch.use"

+/* runs the \< and \> word-boundary anchors against a fixed set of subjects */
+const main = {
+	var anyword
+	/* a word anywhere in the subject; capture group 1 holds the word that was found */
+	anyword = ".*(\\<.*\\>).*"
+	/* expected matches */
+	testmatch("\\<([a-z]*)\\>", "abcdef")	/* whole word */
+	testmatch(anyword, "!m!")	/* single char word in midstring */
+	testmatch(anyword, "!m")	/* single char word at end of string */
+	testmatch(anyword, "m!")	/* single char word at start of string */
+	testmatch(anyword, "!@#!!matches!!%!")	/* word in midstring */
+	testmatch(anyword, "matches!!%!")	/* word at start of string */
+	testmatch(anyword, "!@#!!matches")	/* word at end of string */
+	testmatch(anyword, "!@#!!matches!!%!foo")	/* matches last word in string */
+	testmatch(anyword, "123")	/* numbers are also word bounds */
+	/* nonmatches */
+	testmatch("\\<([a-z]*)\\>foo", "abcdefoo")	/* word boundary needed in midstring */
+}

--- a/test/tests

+++ b/test/tests

@@ -21,8 +21,9 @@

 #	What we compare with. This should be self-

 #	evident.

 B regex-basic		C

-B regex-class		C

-B regex-negclass	C

+B regex-boundaries	C

 B regex-capture         C

+B regex-class		C

 B regex-failmatch	C

+B regex-negclass	C

 B regex-unicode		C

--- a/types.myr

+++ b/types.myr

@@ -51,6 +51,8 @@

 		/* anchors */

 		`Ibol

 		`Ieol

+		`Ibow

+		`Ieow

 		/* control flow */

 		`Ifork	(std.size, std.size)