shithub: mc

Download patch

ref: 595655d194bef8eb4505ffe38af641f78aa157fe
parent: 9c6298c0d9ea5f9c0e5944995670fd16581e91fb
author: Ori Bernstein <[email protected]>
date: Thu Dec 26 16:47:43 EST 2013

Add support for some basic character ranges.

--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@
 MYRSRC= \
 	compile.myr \
 	interp.myr \
+	ranges.myr \
 	types.myr \
 
 include config.mk
--- a/compile.myr
+++ b/compile.myr
@@ -1,6 +1,7 @@
 use std
 
 use "types.use"
+use "ranges.use"
 
 pkg regex =
 	const compile	: (re : byte[:] -> std.error(regex#, status))
@@ -426,14 +427,14 @@
 			astfree(ret)
 			-> `Fail (`Unbalanced)
 		;;
+	| '\\':
+		getc(re) /* consume the slash */
+		if re.pat.len == 0
+			-> `Fail (`Earlystop)
+		;;
+		ret = escaped(re)
 	| c:
 		getc(re)
-		if c == '\\'
-			if re.pat.len == 0
-				-> `Fail (`Earlystop)
-			;;
-			c = getc(re)
-		;;
 		ret = mk(`Chr c)
 	;;
 	dump(re, ret, 0)
@@ -440,11 +441,61 @@
 	-> `Some ret
 }
 
+const escaped = {re
+	var ret
+	/* The caller has already consumed the '\\'; the next char selects the node to build. */
+	match getc(re)
+	/* character classes */
+	| 'd': ret = ranges(re, _ranges.asciidigit[:])
+	| 'x': ret = ranges(re, _ranges.asciixdigit[:])
+	| 's': ret = ranges(re, _ranges.asciispace[:])
+	| 'w': ret = ranges(re, _ranges.asciiword[:])
+	| 'h': ret = ranges(re, _ranges.asciihspace[:])
+
+	/* escaped metachars */
+	| '^': ret = mk(`Chr '^')
+	| '$': ret = mk(`Chr '$')
+	| '.': ret = mk(`Chr '.')
+	| '+': ret = mk(`Chr '+')
+	| '?': ret = mk(`Chr '?')
+	/* FIXME: implement this later.
+	| 'W': ret = negranges(re, _ranges.asciiword[:])
+	| 'S': ret = negranges(re, _ranges.asciispace[:])
+	| 'D': ret = negranges(re, _ranges.asciidigit[:])
+	| 'X': ret = negranges(re, _ranges.asciixdigit[:])
+	| 'H': ret = negranges(re, _ranges.asciihspace[:])
+	| 'p':	ret = unicodeclass(re)
+	| 'P':  ret = negate(unicodeclass(re))
+	Until then they, like '*', '|', '(', ')', '[', ']', '\\' and any other escaped char, are literals. */
+	| c: ret = mk(`Chr c)
+	;;
+	-> ret
+}
+
+const ranges = {re, rng
+	var mid
+	var left
+	var right
+	/* rng is a non-empty list of [lo, hi] pairs (an empty one would recurse forever) */
+	if rng.len == 1
+		-> mk(`Class (rng[0][0], rng[0][1]))
+	;;
+	/* more than one pair: split the list in half and join the two subtrees with `Alt */
+	mid = rng.len/2
+	left = ranges(re, rng[0:mid])
+	right = ranges(re, rng[mid:rng.len])
+	-> mk(`Alt (left, right))
+}
+
 const chrclass = {re
 	var r
 	var t
 
+	/* we know we saw '[' on entry */
 	matchc(re, '[')
+	if matchc(re, '^')
+		std.die("negation of character classes not yet supported")
+	;;
 	t = rangematch(re)
 	while peekc(re) != ']'
 		r = rangematch(re)
--- a/doc/myr-regex.3
+++ b/doc/myr-regex.3
@@ -46,6 +46,90 @@
 within the string, instead of attempting to find a match spanning the whole
 string.
 
+.SH REGEX SYNTAX
+.PP
+The grammar used by libregex is below:
+
+.EX
+    regex       : altexpr
+    altexpr     : catexpr ('|' altexpr)?
+    catexpr     : repexpr (catexpr)?
+    repexpr     : baseexpr [*+?]?
+    baseexpr    : literal
+                | charclass
+                | charrange
+                | '.'
+                | '^'
+                | '$'
+                | '(' regex ')'
+    charclass   : see below
+    charrange   : '[' (literal('-' literal)?)+']'
+.EE
+
+The following metacharacters have the meanings listed below:
+.TP
+\&.
+Matches a single unicode character
+.TP
+^
+Matches the beginning of a line. Does not consume any characters.
+.TP
+$
+Matches the end of a line. Does not consume any characters.
+.TP
+*
+Matches zero or more repetitions of the preceding regex fragment.
+.TP
++
+Matches one or more repetitions of the preceding regex fragment.
+.TP
+?
+Matches zero or one occurrence of the preceding regex fragment.
+
+.PP
+In order to match a literal metacharacter, it needs to be preceded by
+a '\\' character.
+
+The following character classes are defined. The negated classes and the unicode property classes (\\pX, \\PX) are not implemented yet:
+.TP
+\\d
+ASCII digits
+.TP
+\\D
+Negation of ASCII digits
+.TP
+\\x
+ASCII Hex digits
+.TP
+\\X
+Negation of ASCII Hex digits
+.TP
+\\s
+ASCII spaces
+.TP
+\\S
+Negation of ASCII spaces
+.TP
+\\w
+ASCII word characters
+.TP
+\\W
+Negation of ASCII word characters
+.TP
+\\h
+ASCII horizontal whitespace characters (tab and space)
+.TP
+\\H
+Negation of ASCII horizontal whitespace characters
+.TP
+\\pX
+Characters with unicode property 'X'
+.TP
+\\PX
+Negation of characters with unicode property 'X'.
+.PP
+The only properties that are currently supported are 'Z' (space) and 'L' (letter).
+
 .SH EXAMPLE
 .EX
         use std
--- /dev/null
+++ b/ranges.myr
@@ -1,0 +1,33 @@
+pkg _ranges =	/* tables of inclusive [lo, hi] character ranges used by the regex escapes */
+	const asciidigit 	: char[2][1]	/* \d */
+	const asciixdigit	: char[2][3]	/* \x */
+	const asciispace	: char[2][2]	/* \s */
+	const asciiword		: char[2][4]	/* \w */
+	const asciihspace	: char[2][2]	/* \h */
+;;
+const asciidigit = [
+	[0x30,0x39]	/* 0-9 */
+]
+
+const asciixdigit = [
+	[0x30,0x39],	/* 0-9 */
+	[0x41,0x46],	/* A-F */
+	[0x61,0x66]	/* a-f */
+]
+
+const asciispace = [
+	[0x9, 0xd],	/* \t \n \v \f \r */
+	[0x20, 0x20]	/* ' ' */
+]
+
+const asciiword = [
+	[0x30,0x39],	/* 0-9 */
+	[0x41, 0x5a],	/* A-Z */
+	[0x5f, 0x5f],	/* _ */
+	[0x61, 0x7a]	/* a-z */
+]
+
+const asciihspace = [
+	[0x9, 0x9],	/* \t */
+	[0x20, 0x20]	/* ' ' */
+]
--- a/test/tests
+++ b/test/tests
@@ -21,6 +21,7 @@
 #	What we compare with. This should be self-
 #	evident.
 B regex-basic		C
+B regex-class		C
 B regex-capture         C
 B regex-failmatch	C
 B regex-unicode		C