shithub: mc

Download patch

ref: 499d959560bcabe68a811bb7a865018bd2ea72df
parent: f7f2a75344fcb2739350d9a1e1d9b464b732a7f1
author: Ori Bernstein <[email protected]>
date: Wed Jan 29 05:22:54 EST 2014

Make unicode patterns usefully fast.

    Instead of eagerly forking for each byte and generating something
    that looks like this:

      fork-- abc
        \
         fork -- abd
          \
           fork -- bcd

    We generate something more like this:

       fork ab -- fork -- c
          \          \
            bcd       d

    Which means that far fewer threads are live at any time, and our
    runqueues are probably at least 10 times shorter.

--- a/compile.myr
+++ b/compile.myr
@@ -22,7 +22,7 @@
 	/* end matches */
 	`Byte	byte
 	`Chr	char
-	`Class	[char, char]
+	`Ranges	char[2][:]
 
 	/* meta */
 	`Cap	[std.size, tree#] /* id, tree */
@@ -98,7 +98,7 @@
 	/* end matches */
 	|`Byte	b: 	append(re, `Ibyte b)
 	|`Chr	c:	genchar(re, c)
-	|`Class  (a, b):	genrange(re, a, b)
+	|`Ranges  sl:	genranges(re, sl)
 
 	/* meta */
 	|`Bol:	append(re, `Ibol)
@@ -111,60 +111,126 @@
 	-> re.proglen
 }
 
-/*
- converts a codepoint range spanning multiple utf8 byte lenghts into a
- set of utf8 ranges. Eg:
- 	[0x00-0x2000]  => [0x00-0x7F]|[0xC2-0xDF][0x80-0x8F]
-*/
-const genrange = {re, lo, hi
-	/* the transitions between different char lenghts for unicode
-	   characters, needed so that we know how to generate the
-	   different size categories */
-	var charrng = [
-		0,
-		0x80,
-		0x800,
-		0x10000,
-		0x200000,
+const genranges = {re, sl
+	const charbounds = [
+		0,		/* len = 0: bug if used for hi */
+		0x80,		/* len = 1 */
+		0x800,		/* len = 2 */
+		0x10000,	/* len = 3 */
+		0x200000,	/* len = 4 */
 		-1
 	]
-	var lbuf : byte[4], hbuf : byte[4]
-	var lsz, hsz
-	var sz, end
-	var d
-	var i, j
+	var lbuf : byte[4], hbuf : byte[4], boundbuf : byte[4]
+	var lsz, hsz, bsz
+	var rt : rangetrie#
+	var i
 
-	lsz = std.charlen(lo)
-	hsz = std.charlen(hi)
-	charrng[lsz - 1] = lo
-	charrng[hsz] = hi
-	if lsz == 1 && hsz == 1
-		append(re, `Irange (lo castto(byte), hi castto(byte)))
+	/* generate a trie of ranges */
+	rt = std.zalloc()
+	for r in sl
+		lsz = std.encode(lbuf[:], r[0])
+		hsz = std.encode(hbuf[:], r[1])
+		for i = lsz; i < hsz; i++
+			bsz = std.encode(boundbuf[:], charbounds[i] - 1)
+			rtinsert(rt, lbuf[:lsz], boundbuf[:bsz])
+			lsz = std.encode(lbuf[:], charbounds[i])
+		;;
+		rtinsert(rt, lbuf[:lsz], hbuf[:hsz])
+	;;
+	rangegen(re, rt, rt.ranges, rt.link, rangeprogsize(rt) + re.proglen)
+	rtfree(rt)
+	-> re.proglen
+}
+
+type rangetrie = struct
+	ranges	: [byte, byte][:]
+	link	: rangetrie#[:]
+	end	: bool
+;;
+
+const rtinsert = {rt, lo, hi
+	var a, b
+	var n
+
+	std.assert(lo.len == hi.len, "range sizes differ")
+
+	n = rt.ranges.len
+	if n == 0
+		rt.ranges = std.slpush(rt.ranges, (lo[0], hi[0]))
+		rt.link = std.slpush(rt.link, std.zalloc())
 	else
-		for i = hsz; i > lsz; i--
-			if re.debug
-				std.put("range size = %z\n", i - 2)
-			;;
-			d = re.proglen + i - 1
-			append(re, `Ifork (re.proglen + 1, jmpdist(i) + d))
+		/*
+		this is a safe way to compare because we know that ranges
+		should always be coming in ordered. This means that equal
+		values will be added one after the other.
+		*/
+		(a, b) = rt.ranges[n - 1]
+		if a != lo[0] || b != hi[0]
+			rt.ranges = std.slpush(rt.ranges, (lo[0], hi[0]))
+			rt.link = std.slpush(rt.link, std.zalloc())
 		;;
-		end = re.proglen + jmpdist(hsz + 1);
-		for i = 0; i < hsz; i++
-			if re.debug
-				std.put("lo[%z] = %i\n", i, charrng[i] castto(int))
-				std.put("hi[%z] = %i\n", i, (charrng[i + 1] - 1) castto(int))
-			;;
-			sz = std.encode(lbuf[:], charrng[i])
-			std.encode(hbuf[:], charrng[i + 1] - 1)
-			for j = 0; j < sz; j++
-				append(re, `Irange (lbuf[j], hbuf[j]))
-			;;
-			append(re, `Ijmp (end))
+	;;
+
+	if lo.len == 1
+		rt.end = true
+	else
+		rtinsert(rt.link[rt.link.len - 1], lo[1:], hi[1:])
+	;;
+}
+
+const rtfree = {rt
+	for l in rt.link
+		rtfree(l)
+	;;
+	std.slfree(rt.link)
+	std.slfree(rt.ranges)
+	std.free(rt)
+}
+
+const rangegen = {re, rt, ranges, links, end
+	var alt, l0, l1, l2
+	var a, b
+	var n
+
+	n = ranges.len
+	if n == 0
+		-> re.proglen
+	elif n == 1
+		(a, b) = ranges[0]
+		append(re, `Irange (a, b))
+		if links[0].ranges.len > 0 && rt.end
+			append(re, `Ifork (re.prog.len + 1, end))
+		elif rt.end
+			append(re, `Ijmp end)
 		;;
+		rangegen(re, links[0], links[0].ranges, links[0].link, end)
+	else
+		alt = re.proglen
+		l0 = append(re, `Ifork (-1, -1))
+		l1 = rangegen(re, rt, ranges[0:n/2], links[0:n/2], end)
+		l2 = rangegen(re, rt, ranges[n/2:n], links[n/2:n], end)
+		re.prog[alt] = `Ifork (l0, l1)
 	;;
 	-> re.proglen
 }
 
+const rangeprogsize = {rt
+	var sz
+
+	if rt.ranges.len == 0
+		-> 0
+	else
+		sz = 2*rt.ranges.len - 1
+		for l in rt.link
+			sz += rangeprogsize(l)
+		;;
+		if rt.end
+			sz += rt.ranges.len
+		;;
+		-> sz
+	;;
+}
+
 /* calculates the forward jump distance for a utf8 character range */
 const jmpdist = {n
 	var d
@@ -265,7 +331,12 @@
 		match re.prog[i]
 		/* Char matching. Consume exactly one byte from the string. */
 		| `Ibyte b:		std.put("`Ibyte %ub (%c)\n", b, b castto(char)) 
-		| `Irange (start, end):	std.put("`Irange (%ub,%ub)\n", start, end) 
+		| `Irange (start, end):	
+			std.put("`Irange (%ub,%ub)", start, end) 
+			if std.isalnum(start castto(char)) && std.isalnum(end castto(char))
+				std.put("\t/* %c-%c */", start castto(char), end castto(char))
+			;;
+			std.put("\n")
 		/* capture groups */
 		| `Ilbra m:		std.put("`Ilbra %z\n", m) 
 		| `Irbra m:		std.put("`Irbra %z\n", m) 
@@ -318,8 +389,14 @@
 		std.put("Byte %b\n", b)
 	| `Chr	c:
 		std.put("Char %c\n", c)
-	| `Class (a, b):
-		std.put("Class (%c-%c)\n", a, b)
+	| `Ranges rl:
+                std.put("Ranges")
+		for r in rl
+			for i = 0; i < indent + 1; i++
+				std.put("  ")
+			;;
+			std.put("\t(%ui-%ui)\n", r[0], r[1])
+		;;
 
 	/* meta */
 	| `Cap	(m, a):
@@ -418,7 +495,7 @@
 	| '+':	-> `Fail `Badrep
 	| '?':	-> `Fail `Badrep
 	| '[':	-> chrclass(re)
-	| '.':	getc(re); ret = mk(`Class (0, std.Maxcharval))
+	| '.':	getc(re); ret = mk(`Ranges std.slpush([][:], [0, std.Maxcharval]))
 	| '^':	getc(re); ret = mk(`Bol)
 	| '$':	getc(re); ret = mk(`Eol)
 	| '(':	
@@ -450,18 +527,18 @@
 
 	match getc(re)
 	/* character classes */
-	| 'd': ret = `Some ranges(re, _ranges.tabasciidigit[:])
-	| 'x': ret = `Some ranges(re, _ranges.tabasciixdigit[:])
-	| 's': ret = `Some ranges(re, _ranges.tabasciispace[:])
-	| 'w': ret = `Some ranges(re, _ranges.tabasciiword[:])
-	| 'h': ret = `Some ranges(re, _ranges.tabasciiblank[:])
+	| 'd': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciidigit[:]))
+	| 'x': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciixdigit[:]))
+	| 's': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciispace[:]))
+	| 'w': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciiword[:]))
+	| 'h': ret = `Some mk(`Ranges std.sldup(_ranges.tabasciiblank[:]))
 
 	/* negated character classes */
-	| 'W': ret = `Some negranges(re, _ranges.tabasciiword[:])
-	| 'S': ret = `Some negranges(re, _ranges.tabasciispace[:])
-	| 'D': ret = `Some negranges(re, _ranges.tabasciidigit[:])
-	| 'X': ret = `Some negranges(re, _ranges.tabasciixdigit[:])
-	| 'H': ret = `Some negranges(re, _ranges.tabasciiblank[:])
+	| 'W': ret = `Some mk(`Ranges negate(_ranges.tabasciiword[:]))
+	| 'S': ret = `Some mk(`Ranges negate(_ranges.tabasciispace[:]))
+	| 'D': ret = `Some mk(`Ranges negate(_ranges.tabasciidigit[:]))
+	| 'X': ret = `Some mk(`Ranges negate(_ranges.tabasciixdigit[:]))
+	| 'H': ret = `Some mk(`Ranges negate(_ranges.tabasciiblank[:]))
 
 	/* unicode character classes */
 	| 'p':	ret = unicodeclass(re, false)
@@ -480,6 +557,7 @@
 const unicodeclass = {re, neg
 	var c, s
 	var tab
+	var t
 	var n
 
 	if re.pat.len == 0
@@ -490,6 +568,7 @@
 	/* either a single char pattern, or {pat} */
 	match getc(re)
 	| '{':
+		s = s[1:]
 		while re.pat.len > 0
 			c = getc(re)
 			if c == '}'
@@ -501,6 +580,7 @@
 		n += std.charlen(r)
 	;;
 	s = s[:n]
+	std.put("s = %s\n", s)
 	/* letters */
 	if std.sleq(s, "L") || std.sleq(s, "Letter")
 		tab = _ranges.tabalpha[:]
@@ -521,14 +601,15 @@
 		-> `Fail (`Badrange)
 	;;
 	if !neg
-		-> `Some ranges(re, tab)
+		t = mk(`Ranges std.sldup(tab))
 	else
-		-> `Some negranges(re, tab)
+		t = mk(`Ranges negate(tab))
 	;;
+	-> `Some t
 }
 
 const chrclass = {re
-	var rl, m
+	var rl, m, n
 	var neg
 	var t
 
@@ -546,22 +627,24 @@
 		std.slfree(rl)
 		-> `Fail (`Earlystop)
 	;;
+
+	std.sort(rl, {a, b;
+		if a[0] < b[0]
+			-> `std.Before
+		elif a[0] == b[0]
+			-> `std.Equal
+		else
+			-> `std.After
+		;;})
+	m = merge(rl)
+	std.slfree(rl)
 	if neg
-		std.sort(rl, {a, b;
-			if a[0] < b[0]
-				-> `std.Before
-			elif a[0] == b[0]
-				-> `std.Equal
-			else
-				-> `std.After
-			;;})
-		m = merge(rl)
-		t = negranges(re, m)
+		n = negate(m)
 		std.slfree(m)
+		t = mk(`Ranges n)
 	else
-		t = ranges(re, rl)
+		t = mk(`Ranges m)
 	;;
-	std.slfree(rl)
 	-> `Some t
 }
 
@@ -582,30 +665,6 @@
 	;;
 }
 
-const ranges = {re, rng
-	var ret
-	var lhs
-	var rhs
-
-	if rng.len == 1
-		ret = mk(`Class (rng[0][0], rng[0][1]))
-	else
-		lhs = ranges(re, rng[0:rng.len/2])
-		rhs = ranges(re, rng[rng.len/2:rng.len])
-		ret = mk(`Alt (lhs, rhs))
-	;;
-	-> ret
-}
-
-const negranges = {re, rng
-	var neg, ret
-
-	neg = negate(rng)
-	ret = ranges(re, neg)
-	std.slfree(neg)
-	-> ret
-}
-
 const negate = {rng
 	var start, end, next
 	var neg
@@ -695,7 +754,7 @@
 	/* end matches */
 	| `Byte	b:	
 	| `Chr	c:	
-	| `Class (a, b):	
+	| `Ranges rl:	std.slfree(rl)
 
 	/* meta */
 	| `Cap	(m, a):	astfree(a)
--- a/interp.myr
+++ b/interp.myr
@@ -128,7 +128,7 @@
 	match re.prog[thr.ip]
 	/* Char matching. Consume exactly one byte from the string. */
 	| `Ibyte b:
-		trace(re, thr, "\t%z:\tByte %b (%c)\n", thr.ip, b, b castto(char))
+		trace(re, thr, "\t%z:\tByte %ub (%c)\n", thr.ip, b, b castto(char))
 		if !within(re, str)
 			die(re, thr, "end of string")
 		elif b != str[re.strp]
@@ -138,7 +138,7 @@
 			trace(re, thr, "\t\tmatched %b with %b\n", b, str[re.strp])
 		;;
 	| `Irange (start, end):
-		trace(re, thr, "\t%z:\tRange (%b, %b)\n", thr.ip, start, end)
+		trace(re, thr, "\t%z:\tRange (%ub, %ub) /* %c - %c */\n", thr.ip, start, end, start castto(char), end castto(char))
 		if !within(re, str) || start > str[re.strp] || end < str[re.strp]
 			die(re, thr, "bad range")
 		else
--- a/ranges.myr
+++ b/ranges.myr
@@ -66,6 +66,7 @@
 
 const tabalpha = [
 	['\u{41}','\u{5a}'],
+	['\u{61}','\u{7a}'],
 	['\u{c0}','\u{d6}'],
 	['\u{d8}','\u{de}'],
 	['\u{100}','\u{100}'],
--- a/test/data/regex-unicode-expected
+++ b/test/data/regex-unicode-expected
@@ -1,2 +1,7 @@
 Matched. 1 matches
 match 0: Abæc
+s = L
+s = L
+Matched. 2 matches
+match 0: Aabæc%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+match 1: Aa
--- a/test/regex-unicode.myr
+++ b/test/regex-unicode.myr
@@ -5,4 +5,5 @@
 
 const main = {
 	testmatch(".*bæc", "Abæc")
+	testmatch("(\\p{L}*)bæc\\P{L}*", "Aabæc%!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
 }