shithub: mc

Download patch

ref: 5782871df2ffdc7d41c42762ef3878acc9529b43
parent: 6f1d771f9e6fee4b3f9d84da05bf1c96f964df70
parent: 8d54eba9360a95c0ae4a3676fe9bbc9d347b998f
author: Ori Bernstein <[email protected]>
date: Thu Jan 9 05:50:45 EST 2014

Merge branch 'master' of git+ssh://git.eigenstate.org/git/ori/libregex

--- a/compile.myr
+++ b/compile.myr
@@ -58,6 +58,13 @@
 	| `None:	-> `std.Failure (`Earlystop)
 	| `Fail f:	-> `std.Failure f
 	| `Some t:
+		/*
+		we can stop early if we get 
+		an incorrectly encoded char
+		*/
+		if re.pat.len > 0
+			-> `std.Failure (`Earlystop)
+		;;
 		dump(re, t, 0)
 		append(re, `Ilbra 0)
 		gen(re, t)
@@ -459,12 +466,12 @@
 	| '+': ret = mk(`Chr '+')
 	| '?': ret = mk(`Chr '?')
 
-	/* FIXME: implement this later.
-	| 'W': ret = negranges(re, _ranges.asciiword[:])
-	| 'S': ret = negranges(re, _ranges.asciispace[:])
-	| 'D': ret = negranges(re, _ranges.asciidigit[:])[:])
-	| 'X': ret = negranges(re, _ranges.xdigit[:])[:])
-	| 'H': ret = negranges(re, _ranges.asciihspace[:])
+	| 'W': ret = negranges(re, _ranges.tabasciiword[:])
+	| 'S': ret = negranges(re, _ranges.tabasciispace[:])
+	| 'D': ret = negranges(re, _ranges.tabasciidigit[:])
+	| 'X': ret = negranges(re, _ranges.tabasciixdigit[:])
+	| 'H': ret = negranges(re, _ranges.tabasciiblank[:])
+	/*
 	| 'p':	unicodeclass(re)
 	| 'P':  negate(unicodeclass(re))
 	*/
@@ -472,44 +479,45 @@
 	-> ret
 }
 
-const ranges = {re, rng
-	var ret
-	var lhs
-	var rhs
-
-	if rng.len == 1
-		ret = mk(`Class (rng[0][0], rng[0][1]))
-	else
-		lhs = ranges(re, rng[0:rng.len/2])
-		rhs = ranges(re, rng[rng.len/2:rng.len])
-		ret = mk(`Alt (lhs, rhs))
-	;;
-	-> ret
-}
-
 const chrclass = {re
-	var r
+	var rl, m
+	var neg
 	var t
 
 	/* we know we saw '[' on entry */
 	matchc(re, '[')
+	neg = false
 	if matchc(re, '^')
-		std.die("negation of character classes not yet supported")
+		neg = true
 	;;
-	t = rangematch(re)
+	rl = rangematch(re, [][:])
 	while peekc(re) != ']'
-		r = rangematch(re)
-		t = mk(`Alt (t, r))
+		rl = rangematch(re, rl)
 	;;
 	if !matchc(re, ']')
-		astfree(t)
+		std.slfree(rl)
 		-> `Fail (`Earlystop)
+	;;
+	if neg
+		std.sort(rl, {a, b;
+			if a[0] < b[0]
+				-> `std.Before
+			elif a[0] == b[0]
+				-> `std.Equal
+			else
+				-> `std.After
+			;;})
+		m = merge(rl)
+		t = negranges(re, m)
+		std.slfree(m)
 	else
-		-> `Some t
+		t = ranges(re, rl)
 	;;
+	std.slfree(rl)
+	-> `Some t
 }
 
-const rangematch = {re
+const rangematch = {re, sl
 	var lo
 	var hi
 
@@ -516,11 +524,81 @@
 	lo = getc(re)
 	if matchc(re, '-')
 		hi = getc(re)
-		-> mk(`Class (lo, hi))
+		if lo <= hi
+			-> std.slpush(sl, [lo, hi])
+		else
+			-> std.slpush(sl, [hi, lo])
+		;;
 	else
-		-> mk(`Chr lo)
+		-> std.slpush(sl, [lo, lo])
 	;;
 }
+
+const ranges = {re, rng
+	var ret
+	var lhs
+	var rhs
+
+	if rng.len == 1
+		ret = mk(`Class (rng[0][0], rng[0][1]))
+	else
+		lhs = ranges(re, rng[0:rng.len/2])
+		rhs = ranges(re, rng[rng.len/2:rng.len])
+		ret = mk(`Alt (lhs, rhs))
+	;;
+	-> ret
+}
+
+const negranges = {re, rng
+	var neg, ret
+
+	neg = negate(rng)
+	ret = ranges(re, neg)
+	std.slfree(neg)
+	-> ret
+}
+
+/* complement of rng over [0, std.Maxcharval]; assumes rng is sorted and non-overlapping */
+const negate = {rng
+	var start, end, next, neg
+	neg = [][:]
+	start = 0 /* lowest char not yet accounted for */
+	for r in rng
+		(end, next) = (r[0], r[1])
+		if end > start
+			neg = std.slpush(neg, [start, end - 1])
+		;;
+		start = next + 1
+	;;
+	neg = std.slpush(neg, [start, std.Maxcharval]) /* start is still 0 if rng was empty */
+	-> neg
+}
+
+/* rl must be sorted by range start; returns a new list with overlapping or abutting ranges coalesced */
+const merge = {rl
+	var lo, hi
+	var ret
+
+	if rl.len == 0
+		-> [][:]
+	;;
+	ret = [][:]
+	lo = rl[0][0]
+	hi = rl[0][1]
+	rl = rl[1:] /* BUG: compiler wants an rval in loop range */
+	for r in rl
+		/* a gap closes the current range; otherwise extend it, but never shrink it */
+		if r[0] > hi + 1
+			ret = std.slpush(ret, [lo, hi])
+			lo = r[0]
+			hi = r[1]
+		elif r[1] > hi
+			hi = r[1]
+		;;
+	;;
+	-> std.slpush(ret, [lo, hi])
+}
+
 
 const matchc = {re, c
 	var str
--- a/main.myr
+++ /dev/null
@@ -1,16 +1,0 @@
-use regex
-use std
-
-const main = {
-	var found
-	match regex.compile(".*bc")
-	| `std.Success re:
-		found = regex.exec(re, "Abc")
-		std.put("Found = %t: len = %z\n", found, re.strp)
-		-> 0
-	| `std.Failure err:
-		std.put("failed to compile regex")
-		-> 1
-	;;
-}
-
--- /dev/null
+++ b/test/data/regex-class-expected
@@ -1,0 +1,51 @@
+Matched. 1 matches
+match 0: 1
+Matched. 1 matches
+match 0: 13
+Matched. 1 matches
+match 0: 13688
+No match
+No match
+No match
+Matched. 1 matches
+match 0: a
+Matched. 1 matches
+match 0: 1F
+Matched. 1 matches
+match 0: 13b8cDEf
+No match
+No match
+No match
+Matched. 1 matches
+match 0:  
+Matched. 1 matches
+match 0: 	
+
+Matched. 1 matches
+match 0: 	
+
 	
+No match
+No match
+No match
+Matched. 1 matches
+match 0: abcABC0123_
+No match
+Matched. 1 matches
+match 0:  
+Matched. 1 matches
+match 0: 	 
+Matched. 1 matches
+match 0: 	 	 
+No match
+No match
+No match
+Matched. 1 matches
+match 0: abcd
+Matched. 1 matches
+match 0: abCD
+Matched. 1 matches
+match 0: _abCD018
+Matched. 1 matches
+match 0: abba
+Matched. 1 matches
+match 0: abBa
--- /dev/null
+++ b/test/data/regex-negclass-expected
@@ -1,0 +1,50 @@
+Matched. 1 matches
+match 0: x
+Matched. 1 matches
+match 0: xa!#^cs
+No match
+No match
+No match
+No match
+Matched. 1 matches
+match 0: Z
+Matched. 1 matches
+match 0: gg
+No match
+No match
+Matched. 1 matches
+match 0: a
+Matched. 1 matches
+match 0: i%
+Matched. 1 matches
+match 0: alskd690!#!!
+No match
+No match
+No match
+Matched. 1 matches
+match 0: !%!^^@@!^
+No match
+Matched. 1 matches
+match 0: 
+
+Matched. 1 matches
+match 0: 
+
+No match
+No match
+No match
+Matched. 1 matches
+match 0: ABCD
+Matched. 1 matches
+match 0: 1234
+Matched. 1 matches
+match 0: -^^-
+Matched. 1 matches
+match 0: d6d
+Matched. 1 matches
+match 0: !^!!))#
+No match
+No match
+No match
+No match
+No match
--- /dev/null
+++ b/test/regex-class.myr
@@ -1,0 +1,67 @@
+use std
+
+use "testmatch.use"
+
+const main = {
+	asciiclass()
+	set()
+	/*
+	unicodeclass()
+	negasciiclass()
+	negasciirange()
+	negset()
+	*/
+}
+
+const asciiclass = {
+	/* \d success */
+	testmatch("\\d", "1")
+	testmatch("\\d\\d", "13")
+	testmatch("\\d+", "13688")
+	/* \d fail */
+	testmatch("\\d", "x")
+	testmatch("\\d\\d", "x3")
+	testmatch("\\d+", "1368f")
+
+	/* \x success */
+	testmatch("\\x", "a")
+	testmatch("\\x\\x", "1F")
+	testmatch("\\x+", "13b8cDEf")
+	/* \x fail */
+	testmatch("\\x", "Z")
+	testmatch("\\x\\x", "fg")
+	testmatch("\\x+", "13b8cg")
+
+	/* \s success */
+	testmatch("\\s", " ")
+	testmatch("\\s\\s", "\t\n")
+	testmatch("\\s+", "\t\n\r \t")
+	/* \s fail */
+	testmatch("\\s", "a")
+	testmatch("\\s\\s", "i\n")
+	testmatch("\\s+", "\t\n\r.\t")
+
+	/* word success */
+	testmatch("\\w+", "abcABC0123_")
+	/* word fail */
+	testmatch("\\w+", "abcABC0123_.")
+
+	/* \h success */
+	testmatch("\\h", " ")
+	testmatch("\\h\\h", "\t ")
+	testmatch("\\h+", "\t \t ")
+	/* \h fail */
+	testmatch("\\h", "\n")
+	testmatch("\\h\\h", "\t\r")
+	testmatch("\\h+", "\t \t.")
+}
+
+const set = {
+	/* ranges */
+	testmatch("[a-z]*", "abcd")
+	testmatch("[a-zA-Z]*", "abCD")
+	testmatch("[a-zA-Z0-9_]*", "_abCD018")
+
+	testmatch("[abc]*", "abba")
+	testmatch("[a-zABC]*", "abBa")
+}
--- /dev/null
+++ b/test/regex-negclass.myr
@@ -1,0 +1,72 @@
+use std
+
+use "testmatch.use"
+
+const main = {
+	asciiclass()
+	set()
+	/*
+	unicodeclass()
+	negasciiclass()
+	negasciirange()
+	negset()
+	*/
+}
+
+const asciiclass = {
+	/* \D success */
+	testmatch("\\D", "x")
+	testmatch("\\D+", "xa!#^cs")
+
+	/* \D fail: end of ranges chars */
+	testmatch("\\D", "0")
+	testmatch("\\D", "9")
+	testmatch("\\D+", "a35x")
+	testmatch("\\D+", "13688")
+
+	/* \X success */
+	testmatch("\\X", "Z")
+	testmatch("\\X\\X", "gg")
+	/* \X fail */
+	testmatch("\\X", "a")
+	testmatch("\\X+", "zz13b8cDEf")
+
+	/* \S success */
+	testmatch("\\S", "a")
+	testmatch("\\S\\S", "i%")
+	testmatch("\\S+", "alskd690!#!!")
+
+	/* \S fail */
+	testmatch("\\S", " ")
+	testmatch("\\S\\S", "\t\n")
+	testmatch("\\S+", "\t \nkait")
+
+	/* word success */
+	testmatch("\\W+", "!%!^^@@!^")
+	/* word fail */
+	testmatch("\\W+", "a^#$bcABC0123_")
+
+	/* \H success */
+	testmatch("\\H", "\n")
+	testmatch("\\H\\H", "\n\r")
+	/* \H fail */
+	testmatch("\\H+", "\t \t.")
+	testmatch("\\H\\H", "\t ")
+	testmatch("\\H+", "\ta35 \t ")
+}
+
+const set = {
+	/* ranges: should succeed */
+	testmatch("[^a-z]*", "ABCD")
+	testmatch("[^a-zA-Z]*", "1234")
+	testmatch("[^a-zA-Z0-9_]*", "-^^-")
+	testmatch("[^abc]*", "d6d")
+	testmatch("[^a-zABC]*", "!^!!))#")
+
+	/* ranges: should fail */
+	testmatch("[^a-z]*", "abcd")
+	testmatch("[^a-zA-Z]*", "abCD")
+	testmatch("[^a-zA-Z0-9_]*", "_abCD018")
+	testmatch("[^abc]*", "abba")
+	testmatch("[^a-zABC]*", "abBa")
+}
--- a/test/tests
+++ b/test/tests
@@ -22,6 +22,7 @@
 #	evident.
 B regex-basic		C
 B regex-class		C
+B regex-negclass	C
 B regex-capture         C
 B regex-failmatch	C
 B regex-unicode		C