shithub: mc

Download patch

ref: fb41bc022ada401c69bad8da6802762a7a114803
parent: 89598b02bebd8005be984af85a20e8bbdee9da8a
author: Ori Bernstein <[email protected]>
date: Sun Jan 26 18:55:13 EST 2014

Fix capture groups with '+'

    Because of the way we generated the group id when compiling,
    and the way it interacted with the duplication of the subnodes
    for '+', we generated two capture groups for one source group. This is because

      '(a)+' became '(a)(a)*'.

    This change fixes it so that both capture groups, while generated
    the same way, get the same capture id.

--- a/compile.myr
+++ b/compile.myr
@@ -25,7 +25,7 @@
 	`Class	[char, char]
 
 	/* meta */
-	`Cap	tree#
+	`Cap	[std.size, tree#] /* id, tree */
 	`Bol	/* beginning of line */
 	`Eol	/* end of line */
 ;;
@@ -87,8 +87,6 @@
 
 /* generates bytecode from an AST */
 const gen = {re, t
-	var m
-
 	match t#
 	|`Alt	(a, b): genalt(re, a, b)
 	|`Cat	(a, b): gen(re, a); gen(re, b)
@@ -105,8 +103,7 @@
 	/* meta */
 	|`Bol:	append(re, `Ibol)
 	|`Eol:	append(re, `Ibol)
-	|`Cap	a:
-		m = re.nmatch++
+	|`Cap	(m, a):
 		append(re, `Ilbra m)
 		gen(re, a)
 		append(re, `Irbra m)
@@ -325,8 +322,8 @@
 		std.put("Class (%c-%c)\n", a, b)
 
 	/* meta */
-	| `Cap	a:
-		std.put("Cap\n")
+	| `Cap	(m, a):
+		std.put("Cap %i\n", m)
 		dump(re, a, indent + 1)
 	;;
 }
@@ -417,9 +414,9 @@
 	/* lower prec operators */
 	| '|':	-> `None
 	| ')':	-> `None
-	| '*':	-> `Fail (`Badrep)
-	| '+':	-> `Fail (`Badrep)
-	| '?':	-> `Fail (`Badrep)
+	| '*':	-> `Fail `Badrep
+	| '+':	-> `Fail `Badrep
+	| '?':	-> `Fail `Badrep
 	| '[':	-> chrclass(re)
 	| '.':	getc(re); ret = mk(`Class (0, std.Maxcharval))
 	| '^':	getc(re); ret = mk(`Bol)
@@ -427,17 +424,18 @@
 	| '(':	
 		getc(re)
 		match altexpr(re)
-		| `Some s:	ret = mk(`Cap s)
-		| `None:	-> `Fail (`Emptyparen)
+		| `Some s:
+			if matchc(re, ')')
+				-> `Some mk(`Cap (re.nmatch++, s))
+			else
+				-> `Fail `Unbalanced
+			;;
+		| `None:	-> `Fail `Emptyparen
 		;;
-		if !matchc(re, ')')
-			astfree(ret)
-			-> `Fail (`Unbalanced)
-		;;
 	| '\\':
 		getc(re) /* consume the slash */
 		if re.pat.len == 0
-			-> `Fail (`Earlystop)
+			-> `Fail `Earlystop
 		;;
 		-> escaped(re)
 	| c:
@@ -700,7 +698,7 @@
 	| `Class (a, b):	
 
 	/* meta */
-	| `Cap	a:	astfree(a)
+	| `Cap	(m, a):	astfree(a)
 	;;
 	std.free(t)
 }
--- a/test/data/regex-capture-expected
+++ b/test/data/regex-capture-expected
@@ -4,10 +4,13 @@
 Matched. 2 matches
 match 0: Abcde
 match 1: bcd
+Matched. 2 matches
+match 0: abab
+match 1: b
 Matched. 3 matches
 match 0: Abcde
-match 1: bcd
-match 2: c
+match 1: c
+match 2: bcd
 Matched. 4 matches
 match 0: aaaa
 match 1: a
--- a/test/regex-capture.myr
+++ b/test/regex-capture.myr
@@ -3,6 +3,7 @@
 const main = {
 	testmatch("A(.*)", "Abc")
 	testmatch("A(.*)e", "Abcde")
+	testmatch("(a|b)+", "abab")
 	testmatch("A(b(.*)d)e", "Abcde")
 	testmatch("(a?)(a*)(a?)", "aaaa")
 }