shithub: mc

--- a/Makefile

+++ b/Makefile

@@ -3,6 +3,7 @@

 MYRSRC= \

 	compile.myr \

 	interp.myr \

+	ranges.myr \

 	types.myr \

 include config.mk

--- a/compile.myr

+++ b/compile.myr

@@ -1,6 +1,7 @@

 use std

 use "types.use"

+use "ranges.use"

 pkg regex =

 	const compile	: (re : byte[:] -> std.error(regex#, status))

@@ -426,14 +427,14 @@

 			astfree(ret)

 			-> `Fail (`Unbalanced)

;;

+	| '\\':

+		getc(re) /* consume the slash */

+		if re.pat.len == 0

+			-> `Fail (`Earlystop)

+		;;

+		ret = escaped(re)

 	| c:

 		getc(re)

-		if c == '\\'

-			if re.pat.len == 0

-				-> `Fail (`Earlystop)

-			;;

-			c = getc(re)

-		;;

 		ret = mk(`Chr c)

;;

 	dump(re, ret, 0)

@@ -440,11 +441,61 @@

 	-> `Some ret

+/*

+  Builds the AST node for an escape sequence. The caller has already

+  consumed the backslash and checked that the pattern is not empty; this

+  reads the escaped character with getc. The class letters d, x, s, w and

+  h expand to an alternation over the matching _ranges table. Every other

+  character, metacharacter or not, becomes a literal `Chr node.

+*/

+const escaped = {re

+	var ret

+	match getc(re)

+	/* character classes */

+	| 'd': ret = ranges(re, _ranges.asciidigit[:])

+	| 'x': ret = ranges(re, _ranges.asciixdigit[:])

+	| 's': ret = ranges(re, _ranges.asciispace[:])

+	| 'w': ret = ranges(re, _ranges.asciiword[:])

+	| 'h': ret = ranges(re, _ranges.asciihspace[:])

+	/* escaped metachars */

+	| '^': ret = mk(`Chr '^')

+	| '$': ret = mk(`Chr '$')

+	| '.': ret = mk(`Chr '.')

+	| '+': ret = mk(`Chr '+')

+	| '?': ret = mk(`Chr '?')

+	/* FIXME: implement this later.

+	| 'W': ret = negranges(re, _ranges.asciiword[:])

+	| 'S': ret = negranges(re, _ranges.asciispace[:])

+	| 'D': ret = negranges(re, _ranges.asciidigit[:])

+	| 'X': ret = negranges(re, _ranges.asciixdigit[:])

+	| 'H': ret = negranges(re, _ranges.asciihspace[:])

+	| 'p': ret = unicodeclass(re)

+	| 'P': ret = negate(unicodeclass(re))

+	*/

+	/*

+	  Anything else stands for itself, so '*', '|', '(', ')', '[', ']'

+	  and the backslash can still be matched literally. Until the

+	  classes in the FIXME above exist, their letters land here too.

+	*/

+	| c: ret = mk(`Chr c)

+	;;

+	-> ret

+}

+/*

+  Turns a table of inclusive [first, last] character ranges into an AST.

+  A single range becomes one `Class node; a longer table is split in half

+  and the two halves are joined with an `Alt node. rng must hold at least

+  one range: an empty slice never reaches the single-range case. re is

+  only handed on to the recursive calls.

+*/

+const ranges = {re, rng

+	var mid

+	var left

+	var right

+	if rng.len == 1

+		-> mk(`Class (rng[0][0], rng[0][1]))

+	;;

+	mid = rng.len/2

+	left = ranges(re, rng[0:mid])

+	right = ranges(re, rng[mid:rng.len])

+	-> mk(`Alt (left, right))

+}

 const chrclass = {re

 	var r

 	var t

+	/* we know we saw '[' on entry */

 	matchc(re, '[')

+	if matchc(re, '^')

+		std.die("negation of character classes not yet supported")

+	;;

 	t = rangematch(re)

 	while peekc(re) != ']'

 		r = rangematch(re)

--- a/doc/myr-regex.3

+++ b/doc/myr-regex.3

@@ -46,6 +46,90 @@

 within the string, instead of attempting to find a match spanning the whole

 string.

+.SH REGEX SYNTAX

+.PP

+The grammar used by libregex is below:

+.EX

+    regex       : altexpr

+    altexpr     : catexpr ('|' altexpr)?

+    catexpr     : repexpr (catexpr)?

+    repexpr     : baseexpr[*+?]?

+    baseexpr    : literal

+                | charclass

+                | charrange

+                | '.'

+                | '^'

+                | '$'

+                | '(' regex ')'

+    charclass   : see below

+    charrange   : '[' (literal('-' literal)?)+']'

+.EE

+The following metacharacters have the meanings listed below:

+.TP

+\&.

+Matches a single Unicode character

+.TP

+^

+Matches the beginning of a line. Does not consume any characters.

+.TP

+$

+Matches the end of a line. Does not consume any characters.

+.TP

+*

+Matches any number of repetitions of the preceding regex fragment.

+.TP

++

+Matches one or more repetitions of the preceding regex fragment.

+.TP

+?

+Matches zero or one of the preceding regex fragment.

+.PP

+In order to match a literal metacharacter, it needs to be preceded by

+a '\\' character.

+The following character classes are supported:

+.TP

+\\d

+ASCII digits

+.TP

+\\D

+Negation of ASCII digits

+.TP

+\\x

+ASCII Hex digits

+.TP

+\\X

+Negation of ASCII Hex digits

+.TP

+\\s

+ASCII spaces

+.TP

+\\S

+Negation of ASCII spaces

+.TP

+\\w

+ASCII word characters

+.TP

+\\W

+Negation of ASCII word characters

+.TP

+\\h

+ASCII horizontal whitespace characters (tab and space)

+.TP

+\\H

+Negation of ASCII horizontal whitespace characters

+.TP

+\\pX

+Characters with unicode property 'X'

+.TP

+\\PX

+Negation of characters with unicode property 'X'.

+.PP

+The only properties that are currently supported are 'Z' (space) and 'L' (letter).

 .SH EXAMPLE

.EX

         use std

--- /dev/null

+++ b/ranges.myr

@@ -1,0 +1,33 @@

+/*

+  Tables of ASCII character ranges. Each row is a [first, last] pair of

+  code points, both ends inclusive. The second array dimension in each

+  declaration is the number of rows in that table.

+*/

+pkg _ranges =

+	const asciidigit 	: char[2][1]

+	const asciixdigit	: char[2][3]

+	const asciispace	: char[2][2]

+	const asciiword		: char[2][4]

+	const asciihspace	: char[2][2]

+;;

+/* decimal digits */

+const asciidigit = [

+	[0x30,0x39]	/* 0-9 */

+]

+/* hexadecimal digits, both letter cases */

+const asciixdigit = [

+	[0x30,0x39],	/* 0-9 */

+	[0x41,0x46],	/* A-F */

+	[0x61,0x66]	/* a-f */

+]

+/* whitespace, including line breaks */

+const asciispace = [

+	[0x9, 0xd],	/* \t \n \v \f \r */

+	[0x20, 0x20]	/* ' ' */

+]

+/* word characters: digits, letters and underscore */

+const asciiword = [

+	[0x30,0x39],	/* 0-9 */

+	[0x41, 0x5a],	/* A-Z */

+	[0x5f, 0x5f],	/* _ */

+	[0x61, 0x7a]	/* a-z */

+]

+/* horizontal whitespace only: no line breaks */

+const asciihspace = [

+	[0x9, 0x9],	/* \t */

+	[0x20, 0x20]	/* ' ' */

+]

--- a/test/tests

+++ b/test/tests

@@ -21,6 +21,7 @@

 #	What we compare with. This should be self-

 #	evident.

 B regex-basic		C

+B regex-class		C

 B regex-capture         C

 B regex-failmatch	C

 B regex-unicode		C