shithub: mc

Download patch

ref: 5e60759629a43ed5b5aeb5c831cadc516bc3c980
parent: 655d23e95323b6e3b99c7b35bf807c277475721b
author: Ori Bernstein <[email protected]>
date: Mon Oct 13 01:22:25 EDT 2014

Fix up Unicode boundary generation.

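    Using the encodings of real boundary characters doesn't guarantee a
    match -- while the encoded values are in lexicographic order, their
    individual bytes do not always form the right per-byte ranges (for
    example, U+0800 encodes as E0 A0 80, so its second byte is not the
    smallest continuation byte, 0x80).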

--- a/compile.myr
+++ b/compile.myr
@@ -120,28 +120,26 @@
 }
 
 const genranges = {re, sl
-	const charbounds = [
-		0,		/* len = 0: bug if used for hi */
-		0x80,		/* len = 1 */
-		0x800,		/* len = 2 */
-		0x10000,	/* len = 3 */
-		0x200000,	/* len = 4 */
-		-1
-	]
 	var lbuf : byte[4], hbuf : byte[4], boundbuf : byte[4]
-	var lsz, hsz, bsz
+	var lsz, hsz, bsz, i
 	var rt : rangetrie#
-	var i
 
 	/* generate a trie of ranges */
 	rt = std.zalloc()
 	for r in sl
+		/* 
+		encode:
+			lo => bounds[loidx] - 1
+			bounds[loidx] => bounds[loidx + 1] - 1
+			...
+			bounds[hiidx - 1] => hi
+		*/
 		lsz = std.encode(lbuf[:], r[0])
 		hsz = std.encode(hbuf[:], r[1])
 		for i = lsz; i < hsz; i++
-			bsz = std.encode(boundbuf[:], charbounds[i] - 1)
+			bsz = bound(boundbuf[:], i, 0xff)
 			rtinsert(rt, lbuf[:lsz], boundbuf[:bsz])
-			lsz = std.encode(lbuf[:], charbounds[i])
+			lsz = bound(lbuf[:], i + 1, 0x00)
 		;;
 		rtinsert(rt, lbuf[:lsz], hbuf[:hsz])
 	;;
@@ -151,6 +149,21 @@
 	rangegen(re, rt, rt.ranges, rt.link, rangeprogsize(rt) + re.proglen)
 	rtfree(rt)
 	-> re.proglen
+}
+
+/*
+ * Writes a boundary byte pattern for a `len`-byte UTF-8 sequence into
+ * `buf` and returns `len`.
+ *
+ * `fill` supplies the payload bits: 0x00 produces the lowest pattern of
+ * that length and 0xff the highest. The result is a byte-wise bound, not
+ * necessarily a well-formed encoding (e.g. len = 2, fill = 0x00 gives
+ * C0 80). Other fill values are not used by the visible caller.
+ *
+ * `buf` must hold at least `len` bytes.
+ */
+const bound = {buf, len, fill
+	var i, s
+
+	if len == 1
+		/*
+		 * One-byte sequences are 0xxxxxxx: keep the top bit clear and
+		 * take the rest from fill, so fill = 0x00 yields 0x00 and
+		 * fill = 0xff yields 0x7f (previously always 0x7f).
+		 */
+		buf[0] = fill & 0x7f
+	else
+		s = len castto(byte)
+		/* lead byte: `len` one-bits, a zero bit, then payload bits from fill */
+		buf[0] = (0xff << (8 - s)) | (fill >> (s + 1))
+		for i = 1; i < len; i++
+			/* continuation byte: 10xxxxxx with six payload bits from fill */
+			buf[i] = 0x80 | (fill >> 2)
+		;;
+	;;
+	-> len
 }
 
 type rangetrie = struct