ref: 595655d194bef8eb4505ffe38af641f78aa157fe
parent: 9c6298c0d9ea5f9c0e5944995670fd16581e91fb
author: Ori Bernstein <[email protected]>
date: Thu Dec 26 16:47:43 EST 2013
Add support for some basic character ranges.
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@
MYRSRC= \
compile.myr \
interp.myr \
+ ranges.myr \
types.myr \
include config.mk
--- a/compile.myr
+++ b/compile.myr
@@ -1,6 +1,7 @@
use std
use "types.use"
+use "ranges.use"
pkg regex =
const compile : (re : byte[:] -> std.error(regex#, status))
@@ -426,14 +427,14 @@
astfree(ret)
-> `Fail (`Unbalanced)
;;
+ | '\\':
+ getc(re) /* consume the slash */
+ if re.pat.len == 0
+ -> `Fail (`Earlystop)
+ ;;
+ ret = escaped(re)
| c:
getc(re)
- if c == '\\'
- if re.pat.len == 0
- -> `Fail (`Earlystop)
- ;;
- c = getc(re)
- ;;
ret = mk(`Chr c)
;;
dump(re, ret, 0)
@@ -440,11 +441,61 @@
-> `Some ret
}
+/*
+ * Parses the character that follows a '\\' and returns the AST node it
+ * stands for. The caller has already consumed the slash and checked
+ * that the pattern is not empty, so exactly one character is read here.
+ *
+ *	re : the regex being compiled; one character is consumed from it.
+ *
+ * Class escapes expand to an alternation of character ranges. The
+ * negated and unicode class escapes abort, since they are not
+ * implemented yet. Every other character is taken literally.
+ */
+const escaped = {re
+	var ret
+
+	match getc(re)
+	/* character classes */
+	| 'd': ret = ranges(re, _ranges.asciidigit[:])
+	| 'x': ret = ranges(re, _ranges.asciixdigit[:])
+	| 's': ret = ranges(re, _ranges.asciispace[:])
+	| 'w': ret = ranges(re, _ranges.asciiword[:])
+	| 'h': ret = ranges(re, _ranges.asciihspace[:])
+
+	/*
+	 FIXME: implement these later. Until then, die the same way
+	 chrclass does for a negated class, instead of falling through
+	 to the literal case and silently matching the letter itself.
+		'W': ret = negranges(re, _ranges.asciiword[:])
+		'S': ret = negranges(re, _ranges.asciispace[:])
+		'D': ret = negranges(re, _ranges.asciidigit[:])
+		'X': ret = negranges(re, _ranges.asciixdigit[:])
+		'H': ret = negranges(re, _ranges.asciihspace[:])
+		'p': unicodeclass(re)
+		'P': negate(unicodeclass(re))
+	*/
+	| 'W': std.die("negation of character classes not yet supported")
+	| 'S': std.die("negation of character classes not yet supported")
+	| 'D': std.die("negation of character classes not yet supported")
+	| 'X': std.die("negation of character classes not yet supported")
+	| 'H': std.die("negation of character classes not yet supported")
+	| 'p': std.die("unicode character classes not yet supported")
+	| 'P': std.die("unicode character classes not yet supported")
+
+	/*
+	 escaped metachars ('^', '$', '.', '+', '?', '*', '(', ')', '[',
+	 ']', '|', '\\') and any other character: match the character
+	 itself. Without this arm an unlisted escape would leave ret
+	 unset.
+	*/
+	| c: ret = mk(`Chr c)
+	;;
+	-> ret
+}
+
+/*
+ * Builds the AST node that matches any character covered by rng, a
+ * table of [lo, hi] character pairs.
+ *
+ *	re  : the regex being compiled; only handed on to the recursive calls.
+ *	rng : the table of pairs. It must not be empty: an empty table
+ *	      would keep splitting into empty halves and never return.
+ *
+ * A one-row table becomes a single `Class node. A longer table is cut
+ * at its midpoint and the nodes for the two halves are joined with `Alt.
+ */
+const ranges = {re, rng
+	var mid
+	var left
+	var right
+
+	/* base case: one pair maps directly onto one class node */
+	if rng.len == 1
+		-> mk(`Class (rng[0][0], rng[0][1]))
+	;;
+
+	mid = rng.len/2
+	left = ranges(re, rng[0:mid])
+	right = ranges(re, rng[mid:rng.len])
+	-> mk(`Alt (left, right))
+}
+
const chrclass = {re
var r
var t
+ /* we know we saw '[' on entry */
matchc(re, '[')
+ if matchc(re, '^')
+ std.die("negation of character classes not yet supported")
+ ;;
t = rangematch(re)
while peekc(re) != ']'
r = rangematch(re)
--- a/doc/myr-regex.3
+++ b/doc/myr-regex.3
@@ -46,6 +46,90 @@
within the string, instead of attempting to find a match spanning the whole
string.
+.SH REGEX SYNTAX
+.PP
+The grammar used by libregex is below:
+
+.EX
+ regex : altexpr
+ altexpr : catexpr ('|' altexpr)?
+ catexpr : repexpr (catexpr)?
+ repexpr : baseexpr[*+?]?
+ baseexpr : literal
+ | charclass
+ | charrange
+ | '.'
+ | '^'
+ | '$'
+ | '(' regex ')'
+ charclass : see below
+ charrange : '[' (literal('-' literal)?)+']'
+.EE
+
+The following metacharacters have the meanings listed below:
+.TP
+\&.
+Matches a single unicode character
+.TP
+^
+Matches the beginning of a line. Does not consume any characters.
+.TP
+$
+Matches the end of a line. Does not consume any characters.
+.TP
+*
+Matches any number of repetitions of the preceding regex fragment.
+.TP
++
+Matches one or more repetitions of the preceding regex fragment.
+.TP
+?
+Matches zero or one of the preceding regex fragment.
+
+.PP
+In order to match a literal metacharacter, it needs to be preceded by
+a '\\' character.
+
+The following character classes are supported:
+.TP
+\\d
+ASCII digits
+.TP
+\\D
+Negation of ASCII digits
+.TP
+\\x
+ASCII Hex digits
+.TP
+\\X
+Negation of ASCII Hex digits
+.TP
+\\s
+ASCII spaces
+.TP
+\\S
+Negation of ASCII spaces
+.TP
+\\w
+ASCII word characters
+.TP
+\\W
+Negation of ASCII word characters
+.TP
+\\h
+ASCII horizontal whitespace characters (tab and space)
+.TP
+\\H
+Negation of ASCII horizontal whitespace characters
+.TP
+\\pX
+Characters with unicode property 'X'
+.TP
+\\PX
+Negation of characters with unicode property 'X'.
+.PP
+The only properties that are currently supported are 'Z' (space) and 'L' (letter).
+
.SH EXAMPLE
.EX
use std
--- /dev/null
+++ b/ranges.myr
@@ -1,0 +1,33 @@
+pkg _ranges = /* ASCII character class tables; each row is an inclusive [lo, hi] pair */
+ const asciidigit : char[2][1] /* used for the \d escape */
+ const asciixdigit : char[2][3] /* used for the \x escape */
+ const asciispace : char[2][2] /* used for the \s escape */
+ const asciiword : char[2][4] /* used for the \w escape */
+ const asciihspace : char[2][2] /* used for the \h escape */
+;;
+const asciidigit = [
+ [0x30,0x39] /* 0-9 */
+]
+
+const asciixdigit = [
+ [0x30,0x39], /* 0-9 */
+ [0x41,0x46], /* A-F */
+ [0x61,0x66] /* a-f */
+]
+
+const asciispace = [
+ [0x9, 0xd], /* \t \n \v \f \r */
+ [0x20, 0x20] /* ' ' */
+]
+
+const asciiword = [
+ [0x30,0x39], /* 0-9 */
+ [0x41, 0x5a], /* A-Z */
+ [0x5f, 0x5f], /* _ */
+ [0x61, 0x7a] /* a-z */
+]
+
+const asciihspace = [
+ [0x9, 0x9], /* \t */
+ [0x20, 0x20] /* ' ' */
+]
--- a/test/tests
+++ b/test/tests
@@ -21,6 +21,7 @@
# What we compare with. This should be self-
# evident.
B regex-basic C
+B regex-class C
B regex-capture C
B regex-failmatch C
B regex-unicode C