ref: 5782871df2ffdc7d41c42762ef3878acc9529b43
parent: 6f1d771f9e6fee4b3f9d84da05bf1c96f964df70
parent: 8d54eba9360a95c0ae4a3676fe9bbc9d347b998f
author: Ori Bernstein <[email protected]>
date: Thu Jan 9 05:50:45 EST 2014
Merge branch 'master' of git+ssh://git.eigenstate.org/git/ori/libregex
--- a/compile.myr
+++ b/compile.myr
@@ -58,6 +58,13 @@
| `None: -> `std.Failure (`Earlystop)
| `Fail f: -> `std.Failure f
| `Some t:
+ /*
+ we can stop early if we get
+ an incorrectly encoded char
+ */
+ if re.pat.len > 0
+ -> `std.Failure (`Earlystop)
+ ;;
dump(re, t, 0)
append(re, `Ilbra 0)
gen(re, t)
@@ -459,12 +466,12 @@
| '+': ret = mk(`Chr '+')
| '?': ret = mk(`Chr '?')
- /* FIXME: implement this later.
- | 'W': ret = negranges(re, _ranges.asciiword[:])
- | 'S': ret = negranges(re, _ranges.asciispace[:])
- | 'D': ret = negranges(re, _ranges.asciidigit[:])[:])
- | 'X': ret = negranges(re, _ranges.xdigit[:])[:])
- | 'H': ret = negranges(re, _ranges.asciihspace[:])
+ | 'W': ret = negranges(re, _ranges.tabasciiword[:])
+ | 'S': ret = negranges(re, _ranges.tabasciispace[:])
+ | 'D': ret = negranges(re, _ranges.tabasciidigit[:])
+ | 'X': ret = negranges(re, _ranges.tabasciixdigit[:])
+ | 'H': ret = negranges(re, _ranges.tabasciiblank[:])
+ /*
| 'p': unicodeclass(re)
| 'P': negate(unicodeclass(re))
*/
@@ -472,44 +479,45 @@
-> ret
}
-const ranges = {re, rng
- var ret
- var lhs
- var rhs
-
- if rng.len == 1
- ret = mk(`Class (rng[0][0], rng[0][1]))
- else
- lhs = ranges(re, rng[0:rng.len/2])
- rhs = ranges(re, rng[rng.len/2:rng.len])
- ret = mk(`Alt (lhs, rhs))
- ;;
- -> ret
-}
-
const chrclass = {re
- var r
+ var rl, m
+ var neg
var t
/* we know we saw '[' on entry */
matchc(re, '[')
+ neg = false
if matchc(re, '^')
- std.die("negation of character classes not yet supported")
+ neg = true
;;
- t = rangematch(re)
+ rl = rangematch(re, [][:])
while peekc(re) != ']'
- r = rangematch(re)
- t = mk(`Alt (t, r))
+ rl = rangematch(re, rl)
;;
if !matchc(re, ']')
- astfree(t)
+ std.slfree(rl)
-> `Fail (`Earlystop)
+ ;;
+ if neg
+ std.sort(rl, {a, b;
+ if a[0] < b[0]
+ -> `std.Before
+ elif a[0] == b[0]
+ -> `std.Equal
+ else
+ -> `std.After
+ ;;})
+ m = merge(rl)
+ t = negranges(re, m)
+ std.slfree(m)
else
- -> `Some t
+ t = ranges(re, rl)
;;
+ std.slfree(rl)
+ -> `Some t
}
-const rangematch = {re
+const rangematch = {re, sl
var lo
var hi
@@ -516,11 +524,81 @@
lo = getc(re)
if matchc(re, '-')
hi = getc(re)
- -> mk(`Class (lo, hi))
+ if lo <= hi
+ -> std.slpush(sl, [lo, hi])
+ else
+ -> std.slpush(sl, [hi, lo])
+ ;;
else
- -> mk(`Chr lo)
+ -> std.slpush(sl, [lo, lo])
;;
}
+
+/*
+ builds one tree node covering every range in rng. each
+ element of rng is a [lo, hi] pair, which becomes a
+ `Class leaf; leaves are joined pairwise with `Alt nodes.
+ rng is halved at every step, so the `Alt tree stays
+ balanced instead of degenerating into a list.
+ NOTE(review): rng is expected to be non-empty; an empty
+ slice never reaches the len == 1 base case.
+ */
+const ranges = {re, rng
+	var mid
+	var left, right
+
+	/* base case: a single range is a leaf */
+	if rng.len == 1
+		-> mk(`Class (rng[0][0], rng[0][1]))
+	;;
+	/* recursive case: build each half, then join them */
+	mid = rng.len/2
+	left = ranges(re, rng[0:mid])
+	right = ranges(re, rng[mid:])
+	-> mk(`Alt (left, right))
+}
+
+/*
+ builds the tree for the complement of rng: negate
+ computes the complementary range list, ranges turns
+ that list into a tree, and the temporary list is then
+ released. rng itself is not freed here; it stays owned
+ by the caller.
+ */
+const negranges = {re, rng
+	var inverted
+	var tree
+
+	inverted = negate(rng)
+	tree = ranges(re, inverted)
+	/* the bounds were passed to mk by value, so the list can go */
+	std.slfree(inverted)
+	-> tree
+}
+
+/*
+ computes the complement of rng over [0, std.Maxcharval].
+ rng is expected to be sorted by lower bound with no
+ overlapping ranges, which is what merge produces. the
+ result is a newly allocated slice; the caller releases
+ it with std.slfree, as negranges does.
+ */
+const negate = {rng
+	var start
+	var neg
+
+	neg = [][:]
+	/* lowest char not yet covered by an input range */
+	start = 0
+	for r in rng
+		/*
+		 only emit a gap when one exists. a range that
+		 begins exactly at 'start' (the first range
+		 starting at 0, or a range abutting the previous
+		 one) would otherwise yield [start, start - 1],
+		 which is not a valid range.
+		 */
+		if r[0] > start
+			neg = std.slpush(neg, [start, r[0] - 1])
+		;;
+		start = r[1] + 1
+	;;
+	/*
+	 everything above the last range. using 'start' here
+	 keeps char 0 in the result when rng is empty.
+	 NOTE(review): assumes the last range ends below
+	 std.Maxcharval; confirm against callers.
+	 */
+	neg = std.slpush(neg, [start, std.Maxcharval])
+	-> neg
+}
+
+/*
+ collapses a list of [lo, hi] ranges into an equivalent
+ list in which no two ranges overlap or abut. rl must be
+ sorted by lower bound. returns a newly allocated slice
+ (empty when rl is empty) that the caller frees; the
+ elements of rl are only read, never written.
+ */
+const merge = {rl
+	var lo, hi
+	var ret
+
+	if rl.len == 0
+		-> [][:]
+	;;
+	ret = [][:]
+	/* [lo, hi] is the run currently being accumulated */
+	lo = rl[0][0]
+	hi = rl[0][1]
+	rl = rl[1:] /* BUG: compiler wants an rval in loop range */
+	for r in rl
+		if r[0] > hi + 1
+			/* gap before r: emit the finished run, start a new one */
+			ret = std.slpush(ret, [lo, hi])
+			lo = r[0]
+			hi = r[1]
+		elif r[1] > hi
+			/* r overlaps or abuts the run and reaches further: extend */
+			hi = r[1]
+		;;
+		/*
+		 otherwise r lies entirely inside [lo, hi]. the
+		 upper bound must not shrink to r[1], or chars
+		 in (r[1], hi] would drop out of the run.
+		 */
+	;;
+	-> std.slpush(ret, [lo, hi])
+}
+
const matchc = {re, c
var str
--- a/main.myr
+++ /dev/null
@@ -1,16 +1,0 @@
-use regex
-use std
-
-const main = {
- var found
- match regex.compile(".*bc")
- | `std.Success re:
- found = regex.exec(re, "Abc")
- std.put("Found = %t: len = %z\n", found, re.strp)
- -> 0
- | `std.Failure err:
- std.put("failed to compile regex")
- -> 1
- ;;
-}
-
--- /dev/null
+++ b/test/data/regex-class-expected
@@ -1,0 +1,51 @@
+Matched. 1 matches
+match 0: 1
+Matched. 1 matches
+match 0: 13
+Matched. 1 matches
+match 0: 13688
+No match
+No match
+No match
+Matched. 1 matches
+match 0: a
+Matched. 1 matches
+match 0: 1F
+Matched. 1 matches
+match 0: 13b8cDEf
+No match
+No match
+No match
+Matched. 1 matches
+match 0:
+Matched. 1 matches
+match 0:
+
+Matched. 1 matches
+match 0:
+
+No match
+No match
+No match
+Matched. 1 matches
+match 0: abcABC0123_
+No match
+Matched. 1 matches
+match 0:
+Matched. 1 matches
+match 0:
+Matched. 1 matches
+match 0:
+No match
+No match
+No match
+Matched. 1 matches
+match 0: abcd
+Matched. 1 matches
+match 0: abCD
+Matched. 1 matches
+match 0: _abCD018
+Matched. 1 matches
+match 0: abba
+Matched. 1 matches
+match 0: abBa
--- /dev/null
+++ b/test/data/regex-negclass-expected
@@ -1,0 +1,50 @@
+Matched. 1 matches
+match 0: x
+Matched. 1 matches
+match 0: xa!#^cs
+No match
+No match
+No match
+No match
+Matched. 1 matches
+match 0: Z
+Matched. 1 matches
+match 0: gg
+No match
+No match
+Matched. 1 matches
+match 0: a
+Matched. 1 matches
+match 0: i%
+Matched. 1 matches
+match 0: alskd690!#!!
+No match
+No match
+No match
+Matched. 1 matches
+match 0: !%!^^@@!^
+No match
+Matched. 1 matches
+match 0:
+
+Matched. 1 matches
+match 0:
+
+No match
+No match
+No match
+Matched. 1 matches
+match 0: ABCD
+Matched. 1 matches
+match 0: 1234
+Matched. 1 matches
+match 0: -^^-
+Matched. 1 matches
+match 0: d6d
+Matched. 1 matches
+match 0: !^!!))#
+No match
+No match
+No match
+No match
+No match
--- /dev/null
+++ b/test/regex-class.myr
@@ -1,0 +1,67 @@
+use std
+
+use "testmatch.use"
+
+/*
+ entry point for the positive character class tests:
+ runs the escape class tests, then the bracketed set
+ tests.
+ NOTE(review): test/data/regex-class-expected appears to
+ list one result per testmatch call, in call order; keep
+ the two in sync when adding or reordering tests.
+ */
+const main = {
+ asciiclass()
+ set()
+ /*
+ disabled: none of these functions is defined in
+ this file yet.
+ unicodeclass()
+ negasciiclass()
+ negasciirange()
+ negset()
+ */
+}
+
+/*
+ exercises the lowercase escape classes \d, \x, \s, \w
+ and \h. for each class there is a group of inputs that
+ should match in full, followed by a group that should
+ not match.
+ */
+const asciiclass = {
+ /* \d success */
+ testmatch("\\d", "1")
+ testmatch("\\d\\d", "13")
+ testmatch("\\d+", "13688")
+ /* \d fail: a non-digit at the start or at the end */
+ testmatch("\\d", "x")
+ testmatch("\\d\\d", "x3")
+ testmatch("\\d+", "1368f")
+
+ /* \x success: both upper and lower case hex digits */
+ testmatch("\\x", "a")
+ testmatch("\\x\\x", "1F")
+ testmatch("\\x+", "13b8cDEf")
+ /* \x fail: letters just past 'f' */
+ testmatch("\\x", "Z")
+ testmatch("\\x\\x", "fg")
+ testmatch("\\x+", "13b8cg")
+
+ /* \s success */
+ testmatch("\\s", " ")
+ testmatch("\\s\\s", "\t\n")
+ testmatch("\\s+", "\t\n\r \t")
+ /* \s fail */
+ testmatch("\\s", "a")
+ testmatch("\\s\\s", "i\n")
+ testmatch("\\s+", "\t\n\r.\t")
+
+ /* \w success: letters, digits and underscore */
+ testmatch("\\w+", "abcABC0123_")
+ /* \w fail: trailing '.' */
+ testmatch("\\w+", "abcABC0123_.")
+
+ /* \h success: spaces and tabs only */
+ testmatch("\\h", " ")
+ testmatch("\\h\\h", "\t ")
+ testmatch("\\h+", "\t \t ")
+ /* \h fail: newline, carriage return, '.' */
+ testmatch("\\h", "\n")
+ testmatch("\\h\\h", "\t\r")
+ testmatch("\\h+", "\t \t.")
+}
+
+/*
+ exercises bracketed sets: plain ranges, several ranges
+ in one set, single chars, and ranges mixed with single
+ chars. every input here is expected to match in full.
+ */
+const set = {
+ /* ranges */
+ testmatch("[a-z]*", "abcd")
+ testmatch("[a-zA-Z]*", "abCD")
+ testmatch("[a-zA-Z0-9_]*", "_abCD018")
+
+ /* single chars, alone and mixed with a range */
+ testmatch("[abc]*", "abba")
+ testmatch("[a-zABC]*", "abBa")
+}
--- /dev/null
+++ b/test/regex-negclass.myr
@@ -1,0 +1,72 @@
+use std
+
+use "testmatch.use"
+
+/*
+ entry point for the negated character class tests:
+ runs the uppercase (negated) escape class tests, then
+ the negated bracketed set tests.
+ NOTE(review): test/data/regex-negclass-expected appears
+ to list one result per testmatch call, in call order;
+ keep the two in sync when adding or reordering tests.
+ */
+const main = {
+ asciiclass()
+ set()
+ /*
+ disabled: none of these functions is defined in
+ this file yet.
+ unicodeclass()
+ negasciiclass()
+ negasciirange()
+ negset()
+ */
+}
+
+/*
+ exercises the uppercase escape classes \D, \X, \S, \W
+ and \H, each the negation of its lowercase counterpart.
+ for each class there is a group of inputs that should
+ match in full, followed by a group that should not.
+ */
+const asciiclass = {
+ /* \D success */
+ testmatch("\\D", "x")
+ testmatch("\\D+", "xa!#^cs")
+
+ /* \D fail: end of ranges chars */
+ testmatch("\\D", "0")
+ testmatch("\\D", "9")
+ testmatch("\\D+", "a35x")
+ testmatch("\\D+", "13688")
+
+ /* \X success */
+ testmatch("\\X", "Z")
+ testmatch("\\X\\X", "gg")
+ /* \X fail */
+ testmatch("\\X", "a")
+ testmatch("\\X+", "zz13b8cDEf")
+
+ /* \S success */
+ testmatch("\\S", "a")
+ testmatch("\\S\\S", "i%")
+ testmatch("\\S+", "alskd690!#!!")
+
+ /* \S fail */
+ testmatch("\\S", " ")
+ testmatch("\\S\\S", "\t\n")
+ testmatch("\\S+", "\t \nkait")
+
+ /* \W success */
+ testmatch("\\W+", "!%!^^@@!^")
+ /* \W fail: word chars mixed in */
+ testmatch("\\W+", "a^#$bcABC0123_")
+
+ /* \H success */
+ testmatch("\\H", "\n")
+ testmatch("\\H\\H", "\n\r")
+ /* \H fail */
+ testmatch("\\H+", "\t \t.")
+ testmatch("\\H\\H", "\t ")
+ testmatch("\\H+", "\ta35 \t ")
+}
+
+/*
+ exercises negated bracketed sets. the same five patterns
+ are run twice: first against inputs made only of chars
+ outside the set, then against inputs made of chars
+ inside it.
+ */
+const set = {
+ /* ranges: should succeed */
+ testmatch("[^a-z]*", "ABCD")
+ testmatch("[^a-zA-Z]*", "1234")
+ testmatch("[^a-zA-Z0-9_]*", "-^^-")
+ testmatch("[^abc]*", "d6d")
+ testmatch("[^a-zABC]*", "!^!!))#")
+
+ /* ranges: should fail */
+ testmatch("[^a-z]*", "abcd")
+ testmatch("[^a-zA-Z]*", "abCD")
+ testmatch("[^a-zA-Z0-9_]*", "_abCD018")
+ testmatch("[^abc]*", "abba")
+ testmatch("[^a-zABC]*", "abBa")
+}
--- a/test/tests
+++ b/test/tests
@@ -22,6 +22,7 @@
# evident.
B regex-basic C
B regex-class C
+B regex-negclass C
B regex-capture C
B regex-failmatch C
B regex-unicode C