shithub: mc

--- a/compile.myr

+++ b/compile.myr

@@ -58,6 +58,13 @@

 	| `None:	-> `std.Failure (`Earlystop)

 	| `Fail f:	-> `std.Failure f

 	| `Some t:

+		/*

+		we can stop early if we get

+		an incorrectly encoded char

+		*/

+		if re.pat.len > 0

+			-> `std.Failure (`Earlystop)

+		;;

 		dump(re, t, 0)

 		append(re, `Ilbra 0)

 		gen(re, t)

@@ -459,12 +466,12 @@

 	| '+': ret = mk(`Chr '+')

 	| '?': ret = mk(`Chr '?')

-	/* FIXME: implement this later.

-	| 'W': ret = negranges(re, _ranges.asciiword[:])

-	| 'S': ret = negranges(re, _ranges.asciispace[:])

-	| 'D': ret = negranges(re, _ranges.asciidigit[:])[:])

-	| 'X': ret = negranges(re, _ranges.xdigit[:])[:])

-	| 'H': ret = negranges(re, _ranges.asciihspace[:])

+	| 'W': ret = negranges(re, _ranges.tabasciiword[:])

+	| 'S': ret = negranges(re, _ranges.tabasciispace[:])

+	| 'D': ret = negranges(re, _ranges.tabasciidigit[:])

+	| 'X': ret = negranges(re, _ranges.tabasciixdigit[:])

+	| 'H': ret = negranges(re, _ranges.tabasciiblank[:])

+	/*

 	| 'p':	unicodeclass(re)

 	| 'P':  negate(unicodeclass(re))

*/

@@ -472,44 +479,45 @@

 	-> ret

-const ranges = {re, rng

-	var ret

-	var lhs

-	var rhs

-	if rng.len == 1

-		ret = mk(`Class (rng[0][0], rng[0][1]))

-	else

-		lhs = ranges(re, rng[0:rng.len/2])

-		rhs = ranges(re, rng[rng.len/2:rng.len])

-		ret = mk(`Alt (lhs, rhs))

-	;;

-	-> ret

-}

 const chrclass = {re

-	var r

+	var rl, m

+	var neg

 	var t

 	/* we know we saw '[' on entry */

 	matchc(re, '[')

+	neg = false

 	if matchc(re, '^')

-		std.die("negation of character classes not yet supported")

+		neg = true

;;

-	t = rangematch(re)

+	rl = rangematch(re, [][:])

 	while peekc(re) != ']'

-		r = rangematch(re)

-		t = mk(`Alt (t, r))

+		rl = rangematch(re, rl)

;;

 	if !matchc(re, ']')

-		astfree(t)

+		std.slfree(rl)

 		-> `Fail (`Earlystop)

+	;;

+	if neg

+		std.sort(rl, {a, b;

+			if a[0] < b[0]

+				-> `std.Before

+			elif a[0] == b[0]

+				-> `std.Equal

+			else

+				-> `std.After

+			;;})

+		m = merge(rl)

+		t = negranges(re, m)

+		std.slfree(m)

 	else

-		-> `Some t

+		t = ranges(re, rl)

;;

+	std.slfree(rl)

+	-> `Some t

-const rangematch = {re

+const rangematch = {re, sl

 	var lo

 	var hi

@@ -516,11 +524,81 @@

 	lo = getc(re)

 	if matchc(re, '-')

 		hi = getc(re)

-		-> mk(`Class (lo, hi))

+		if lo <= hi

+			-> std.slpush(sl, [lo, hi])

+		else

+			-> std.slpush(sl, [hi, lo])

+		;;

 	else

-		-> mk(`Chr lo)

+		-> std.slpush(sl, [lo, lo])

;;

+/*

+ Builds the match tree for a non-empty list of [lo, hi] ranges: each

+ range becomes a `Class node, and the list is split in half repeatedly

+ with the two halves joined by an `Alt node. Within this function, re

+ is only passed along to the recursive calls.

+ */

+const ranges = {re, rng

+	var mid

+	var lhs, rhs

+	/* a single range is a leaf of the tree */

+	if rng.len == 1

+		-> mk(`Class (rng[0][0], rng[0][1]))

+	;;

+	/* otherwise build the lower half first, then the upper half */

+	mid = rng.len/2

+	lhs = ranges(re, rng[0:mid])

+	rhs = ranges(re, rng[mid:rng.len])

+	-> mk(`Alt (lhs, rhs))

+}

+/*

+ Builds the match tree for the complement of rng: the list is inverted

+ with negate(), the inverted list is turned into a tree by ranges(),

+ and the temporary inverted list is freed before the tree is returned.

+ rng itself is not freed here.

+ */

+const negranges = {re, rng

+	var inverted

+	var tree

+	inverted = negate(rng)

+	tree = ranges(re, inverted)

+	std.slfree(inverted)

+	-> tree

+}

+/*

+ Returns a newly allocated list holding the gaps between the ranges in

+ rng, i.e. [0, std.Maxcharval] with every range of rng removed. rng is

+ expected to be sorted by low bound and non-overlapping (chrclass sorts

+ and merges its list before negating it).

+

+ The result always holds at least one element, which ranges() needs

+ because it has no case for an empty list. If the last range already

+ reaches std.Maxcharval, that final element has a low bound above its

+ high bound. negranges() releases the result with std.slfree.

+ */

+const negate = {rng

+	var start

+	var neg

+	neg = [][:]

+	/* lowest character not yet accounted for */

+	start = 0

+	for r in rng

+		/*

+		 only emit a gap when there is one: a range that begins

+		 at start (in particular at 0) would otherwise produce

+		 [start, start - 1], which is not a valid range.

+		 */

+		if r[0] > start

+			neg = std.slpush(neg, [start, r[0] - 1])

+		;;

+		start = r[1] + 1

+	;;

+	/* with no ranges at all this is the full [0, std.Maxcharval] */

+	neg = std.slpush(neg, [start, std.Maxcharval])

+	-> neg

+}

+/*

+ Collapses rl, a list of [lo, hi] ranges sorted by lo, into a newly

+ allocated list in which overlapping or adjacent ranges are combined.

+ Returns an empty slice for an empty list. The elements of rl are only

+ read, never written.

+ */

+const merge = {rl

+	var lo, hi

+	var ret

+	if rl.len == 0

+		-> [][:]

+	;;

+	ret = [][:]

+	lo = rl[0][0]

+	hi = rl[0][1]

+	rl = rl[1:] /* BUG: compiler wants an rval in loop range */

+	for r in rl

+		/* if it overlaps or abuts, merge */

+		if r[0] <= hi + 1

+			/*

+			 r may lie entirely inside [lo, hi], as in

+			 [a-zc-d], so only ever grow the upper bound.

+			 */

+			if r[1] > hi

+				hi = r[1]

+			;;

+		else

+			ret = std.slpush(ret, [lo, hi])

+			lo = r[0]

+			hi = r[1]

+		;;

+	;;

+	-> std.slpush(ret, [lo, hi])

+}

 const matchc = {re, c

 	var str

--- a/main.myr

+++ /dev/null

@@ -1,16 +1,0 @@

-use regex

-use std

-const main = {

-	var found

-	match regex.compile(".*bc")

-	| `std.Success re:

-		found = regex.exec(re, "Abc")

-		std.put("Found = %t: len = %z\n", found, re.strp)

-		-> 0

-	| `std.Failure err:

-		std.put("failed to compile regex")

-		-> 1

-	;;

-}

--- /dev/null

+++ b/test/data/regex-class-expected

@@ -1,0 +1,51 @@

+Matched. 1 matches

+match 0: 1

+Matched. 1 matches

+match 0: 13

+Matched. 1 matches

+match 0: 13688

+No match

+No match

+No match

+Matched. 1 matches

+match 0: a

+Matched. 1 matches

+match 0: 1F

+Matched. 1 matches

+match 0: 13b8cDEf

+No match

+No match

+No match

+Matched. 1 matches

+match 0:

+Matched. 1 matches

+match 0:

+Matched. 1 matches

+match 0:

+No match

+No match

+No match

+Matched. 1 matches

+match 0: abcABC0123_

+No match

+Matched. 1 matches

+match 0:

+Matched. 1 matches

+match 0:

+Matched. 1 matches

+match 0:

+No match

+No match

+No match

+Matched. 1 matches

+match 0: abcd

+Matched. 1 matches

+match 0: abCD

+Matched. 1 matches

+match 0: _abCD018

+Matched. 1 matches

+match 0: abba

+Matched. 1 matches

+match 0: abBa

--- /dev/null

+++ b/test/data/regex-negclass-expected

@@ -1,0 +1,50 @@

+Matched. 1 matches

+match 0: x

+Matched. 1 matches

+match 0: xa!#^cs

+No match

+No match

+No match

+No match

+Matched. 1 matches

+match 0: Z

+Matched. 1 matches

+match 0: gg

+No match

+No match

+Matched. 1 matches

+match 0: a

+Matched. 1 matches

+match 0: i%

+Matched. 1 matches

+match 0: alskd690!#!!

+No match

+No match

+No match

+Matched. 1 matches

+match 0: !%!^^@@!^

+No match

+Matched. 1 matches

+match 0:

+Matched. 1 matches

+match 0:

+No match

+No match

+No match

+Matched. 1 matches

+match 0: ABCD

+Matched. 1 matches

+match 0: 1234

+Matched. 1 matches

+match 0: -^^-

+Matched. 1 matches

+match 0: d6d

+Matched. 1 matches

+match 0: !^!!))#

+No match

+No match

+No match

+No match

+No match

--- /dev/null

+++ b/test/regex-class.myr

@@ -1,0 +1,67 @@

+use std

+use "testmatch.use"

+/*

+ Entry point of the character class test: runs the escape-class cases

+ and then the bracket-set cases, in that order. The calls inside the

+ comment below are disabled.

+ */

+const main = {

+	asciiclass()

+	set()

+	/*

+	unicodeclass()

+	negasciiclass()

+	negasciirange()

+	negset()

+	*/

+}

+/*

+ Exercises the escape classes \d, \x, \s, \w and \h through testmatch,

+ each with inputs labelled as expected to succeed followed by inputs

+ labelled as expected to fail. Presumably the order of the calls must

+ match test/data/regex-class-expected; confirm before reordering.

+ */

+const asciiclass = {

+	/* \d success */

+	testmatch("\\d", "1")

+	testmatch("\\d\\d", "13")

+	testmatch("\\d+", "13688")

+	/* \d fail */

+	testmatch("\\d", "x")

+	testmatch("\\d\\d", "x3")

+	testmatch("\\d+", "1368f")

+	/* \x success */

+	testmatch("\\x", "a")

+	testmatch("\\x\\x", "1F")

+	testmatch("\\x+", "13b8cDEf")

+	/* \x fail */

+	testmatch("\\x", "Z")

+	testmatch("\\x\\x", "fg")

+	testmatch("\\x+", "13b8cg")

+	/* \s success */

+	testmatch("\\s", " ")

+	testmatch("\\s\\s", "\t\n")

+	testmatch("\\s+", "\t\n\r \t")

+	/* \s fail */

+	testmatch("\\s", "a")

+	testmatch("\\s\\s", "i\n")

+	testmatch("\\s+", "\t\n\r.\t")

+	/* \w success */

+	testmatch("\\w+", "abcABC0123_")

+	/* \w fail: trailing '.' is not a word character */

+	testmatch("\\w+", "abcABC0123_.")

+	/* \h success */

+	testmatch("\\h", " ")

+	testmatch("\\h\\h", "\t ")

+	testmatch("\\h+", "\t \t ")

+	/* \h fail */

+	testmatch("\\h", "\n")

+	testmatch("\\h\\h", "\t\r")

+	testmatch("\\h+", "\t \t.")

+}

+/*

+ Exercises bracket sets through testmatch: a single range, several

+ ranges, ranges mixed with single characters, and single characters

+ only. Every input consists solely of characters from its set.

+ */

+const set = {

+	/* ranges */

+	testmatch("[a-z]*", "abcd")

+	testmatch("[a-zA-Z]*", "abCD")

+	testmatch("[a-zA-Z0-9_]*", "_abCD018")

+	testmatch("[abc]*", "abba")

+	testmatch("[a-zABC]*", "abBa")

+}

--- /dev/null

+++ b/test/regex-negclass.myr

@@ -1,0 +1,72 @@

+use std

+use "testmatch.use"

+/*

+ Entry point of the negated character class test: runs the negated

+ escape-class cases and then the negated bracket-set cases, in that

+ order. The calls inside the comment below are disabled.

+ */

+const main = {

+	asciiclass()

+	set()

+	/*

+	unicodeclass()

+	negasciiclass()

+	negasciirange()

+	negset()

+	*/

+}

+/*

+ Exercises the negated escape classes \D, \X, \S, \W and \H through

+ testmatch, each with inputs labelled as expected to succeed followed

+ by inputs labelled as expected to fail. Presumably the order of the

+ calls must match test/data/regex-negclass-expected; confirm before

+ reordering.

+ */

+const asciiclass = {

+	/* \D success */

+	testmatch("\\D", "x")

+	testmatch("\\D+", "xa!#^cs")

+	/* \D fail: includes the endpoints '0' and '9' of the digit range */

+	testmatch("\\D", "0")

+	testmatch("\\D", "9")

+	testmatch("\\D+", "a35x")

+	testmatch("\\D+", "13688")

+	/* \X success */

+	testmatch("\\X", "Z")

+	testmatch("\\X\\X", "gg")

+	/* \X fail */

+	testmatch("\\X", "a")

+	testmatch("\\X+", "zz13b8cDEf")

+	/* \S success */

+	testmatch("\\S", "a")

+	testmatch("\\S\\S", "i%")

+	testmatch("\\S+", "alskd690!#!!")

+	/* \S fail */

+	testmatch("\\S", " ")

+	testmatch("\\S\\S", "\t\n")

+	testmatch("\\S+", "\t \nkait")

+	/* \W success */

+	testmatch("\\W+", "!%!^^@@!^")

+	/* \W fail: input contains word characters */

+	testmatch("\\W+", "a^#$bcABC0123_")

+	/* \H success */

+	testmatch("\\H", "\n")

+	testmatch("\\H\\H", "\n\r")

+	/* \H fail */

+	testmatch("\\H+", "\t \t.")

+	testmatch("\\H\\H", "\t ")

+	testmatch("\\H+", "\ta35 \t ")

+}

+/*

+ Exercises negated bracket sets through testmatch. The first group

+ feeds each pattern only characters outside its set; the second group

+ feeds the same patterns only characters inside the set.

+ */

+const set = {

+	/* ranges: should succeed */

+	testmatch("[^a-z]*", "ABCD")

+	testmatch("[^a-zA-Z]*", "1234")

+	testmatch("[^a-zA-Z0-9_]*", "-^^-")

+	testmatch("[^abc]*", "d6d")

+	testmatch("[^a-zABC]*", "!^!!))#")

+	/* ranges: should fail */

+	testmatch("[^a-z]*", "abcd")

+	testmatch("[^a-zA-Z]*", "abCD")

+	testmatch("[^a-zA-Z0-9_]*", "_abCD018")

+	testmatch("[^abc]*", "abba")

+	testmatch("[^a-zABC]*", "abBa")

+}

--- a/test/tests

+++ b/test/tests

@@ -22,6 +22,7 @@

 #	evident.

 B regex-basic		C

 B regex-class		C

+B regex-negclass	C

 B regex-capture         C

 B regex-failmatch	C

 B regex-unicode		C