ref: 655d23e95323b6e3b99c7b35bf807c277475721b
parent: 55949ed458f9014e42c404a832332a66bb999d51
author: Ori Bernstein <[email protected]>
date: Sun Oct 12 23:16:02 EDT 2014
Generate appropriate jumps for unicode ranges. We are no longer jumping out after the first byte of a multibyte character. That was embarrassing.
--- a/compile.myr
+++ b/compile.myr
@@ -145,6 +145,9 @@
;;
rtinsert(rt, lbuf[:lsz], hbuf[:hsz])
;;
+ if re.debug
+ rtdump(rt, 0)
+ ;;
rangegen(re, rt, rt.ranges, rt.link, rangeprogsize(rt) + re.proglen)
rtfree(rt)
-> re.proglen
@@ -156,11 +159,38 @@
end : bool
;;
+const rtdump = {rt, ind
+ var i
+ var l, h
+
+ indent(ind)
+ std.put("Range (end = %t) {\n", rt.end)
+ for i = 0; i < rt.ranges.len; i++
+ indent(ind + 1)
+ (l, h) = rt.ranges[i]
+ std.put("0x%xb-0x%xb: \n", l, h)
+ rtdump(rt.link[i], ind + 1)
+ ;;
+ indent(ind)
+ std.put("}\n")
+}
+
+const indent = {ind
+ var i
+ for i = 0; i < ind; i++
+ std.put("\t")
+ ;;
+}
+
const rtinsert = {rt, lo, hi
var a, b
var n
std.assert(lo.len == hi.len, "range sizes differ")
+ if lo.len == 0
+ rt.end = true
+ ->
+ ;;
n = rt.ranges.len
if n == 0
@@ -179,11 +209,7 @@
;;
;;
- if lo.len == 1
- rt.end = true
- else
- rtinsert(rt.link[rt.link.len - 1], lo[1:], hi[1:])
- ;;
+ rtinsert(rt.link[rt.link.len - 1], lo[1:], hi[1:])
}
const rtfree = {rt
@@ -206,10 +232,12 @@
elif n == 1
(a, b) = ranges[0]
append(re, `Irange (a, b))
- if links[0].ranges.len > 0 && rt.end
- append(re, `Ifork (re.prog.len + 1, end))
- elif rt.end
- append(re, `Ijmp end)
+ if links[0].end
+ if links[0].ranges.len > 0
+ append(re, `Ifork (re.prog.len + 1, end))
+ else
+ append(re, `Ijmp end)
+ ;;
;;
rangegen(re, links[0], links[0].ranges, links[0].link, end)
else
@@ -226,17 +254,17 @@
var sz
if rt.ranges.len == 0
- -> 0
+ sz = 0
else
sz = 2*rt.ranges.len - 1
for l in rt.link
sz += rangeprogsize(l)
;;
- if rt.end
- sz += rt.ranges.len
- ;;
- -> sz
;;
+ if rt.end
+ sz += 1
+ ;;
+ -> sz
}
/* calculates the forward jump distance for a utf8 character range */
--- a/test/data/unicode-expected
+++ b/test/data/unicode-expected
@@ -9,3 +9,5 @@
Matched Aabæc%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! via (\p{Letter}*)bæc\P{Uppercase_Letter}* : 2
match 0: Aabæc%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
match 1: Aa
+Matched æ via . : 1
+ match 0: æ
--- a/test/unicode.myr
+++ b/test/unicode.myr
@@ -9,7 +9,5 @@
/* test various syntaxen */
testmatch("(\\pL*)bæc\\PL*", "Aabæc%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
testmatch("(\\p{Letter}*)bæc\\P{Uppercase_Letter}*", "Aabæc%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
- /* BUGGERED
testmatch(".", "æ")
- */
}