shithub: mc

Download patch

ref: 64c5e8f1227c1a98ab996e95425ab43791c0ee2e
parent: c039e23ae4fd65431fb40f6d556d72a977725b0f
author: S. Gilles <[email protected]>
date: Fri Jun 7 20:04:12 EDT 2019

Rework pown to be less embarrassingly slow.

--- a/lib/math/pown-impl.myr
+++ b/lib/math/pown-impl.myr
@@ -1,8 +1,9 @@
 use std
 
 use "fpmath"
-use "log-impl"
+use "impls"
 use "log-overkill"
+use "log-impl"
 use "sum-impl"
 use "util"
 
@@ -32,7 +33,11 @@
 	neginf : @u
 	magcmp : (f : @f, g : @f -> std.order)
 	two_by_two : (x : @f, y : @f -> (@f, @f))
+	split_add : (x_h : @f, x_l : @f, y_h : @f, y_l : @f -> (@f, @f))
+	split_mul : (x_h : @f, x_l : @f, y_h : @f, y_l : @f -> (@f, @f))
+	floor : (x : @f -> @f)
 	log_overkill : (x : @f -> (@f, @f))
+	precision : @i
 	emin : @i
 	emax : @i
 	imax : @i
@@ -52,7 +57,11 @@
 	.neginf = 0xff800000,
 	.magcmp = mag_cmp32,
 	.two_by_two = two_by_two32,
+	.split_add = split_add32,
+	.split_mul = split_mul32,
+	.floor = floor32,
 	.log_overkill = logoverkill32,
+	.precision = 24,
 	.emin = -126,
 	.emax = 127,
 	.imax = 2147483647, /* For detecting overflow in final exponent */
@@ -72,7 +81,11 @@
 	.neginf = 0xfff0000000000000,
 	.magcmp = mag_cmp64,
 	.two_by_two = two_by_two64,
+	.split_add = hl_add,
+	.split_mul = hl_mult,
+	.floor = floor64,
 	.log_overkill = logoverkill64,
+	.precision = 53,
 	.emin = -1022,
 	.emax = 1023,
 	.imax = 9223372036854775807,
@@ -79,6 +92,24 @@
 	.imin = -9223372036854775808,
 ]
 
+const split_add32 = {x_h : flt32, x_l : flt32, y_h : flt32, y_l : flt32
+	var x : flt64 = (x_h : flt64) + (x_l : flt64)
+	var y : flt64 = (y_h : flt64) + (y_l : flt64)
+	var z = x + y
+	var z_h : flt32 = (z : flt32)
+	var z_l : flt32 = ((z - (z_h : flt64)) : flt32)
+	-> (z_h, z_l)
+}
+
+const split_mul32 = {x_h : flt32, x_l : flt32, y_h : flt32, y_l : flt32
+	var x : flt64 = (x_h : flt64) + (x_l : flt64)
+	var y : flt64 = (y_h : flt64) + (y_l : flt64)
+	var z = x * y
+	var z_h : flt32 = (z : flt32)
+	var z_l : flt32 = ((z - (z_h : flt64)) : flt32)
+	-> (z_h, z_l)
+}
+
 const pown32 = {x : flt32, n : int32
 	-> powngen(x, n, desc32)
 }
@@ -123,6 +154,9 @@
 	elif n == 1
 		/* Anything^1 is itself */
 		-> x
+	elif n == -1
+		/* The CPU is probably better at division than we are at pow(). */
+		-> 1.0/x
 	;;
 
 	/* (-f)^n = (-1)^n * (f)^n. Figure this out now, then pretend f >= 0.0 */
@@ -142,62 +176,55 @@
 	   Since n and e, and I are all integers, we can get the last part from
 	   scale2. The hard part is computing I and F, and then computing 2^F.
 	 */
+	if xe > 0
+		/*
+		   But first: do some rough calculations: if we can show n*log(xs) has the
+		   same sign as n*e, and n*e would cause overflow, then we might as well
+		   return right now.
+		 */
+		var exp_rough_estimate = n * xe
+		if n > 0 && (exp_rough_estimate > d.emax + 1 || (exp_rough_estimate / n != xe))
+			-> ult_sgn * d.frombits(d.inf)
+		elif n < 0 && (exp_rough_estimate < d.emin - d.precision - 1 || (exp_rough_estimate / n != xe))
+			-> ult_sgn * 0.0
+		;;
+	elif xe < 0
+		/*
+		   Also, if consider xs/2 and xe + 1, we can analyze the case in which
+		   n*log(xs) has a different sign from n*e.
+	         */
+		var exp_rough_estimate = n * (xe + 1)
+		if n > 0 && (exp_rough_estimate < d.emin - d.precision - 1 || (exp_rough_estimate / n != (xe + 1)))
+			-> ult_sgn * 0.0
+		elif n < 0 && (exp_rough_estimate > d.emax + 1 || (exp_rough_estimate / n != (xe + 1)))
+			-> ult_sgn * d.frombits(d.inf)
+		;;
+	;;
+
 	var ln_xs_hi, ln_xs_lo
 	(ln_xs_hi, ln_xs_lo) = d.log_overkill(d.assem(false, 0, xs))
 
 	/* Now x^n = 2^(n * [ ln_xs / ln(2) ]) * 2^(n + e) */
+	var E1, E2
+	(E1, E2) = d.split_mul(ln_xs_hi, ln_xs_lo, d.frombits(d.one_over_ln2_hi), d.frombits(d.one_over_ln2_lo))
 
-	var ls1 : @f[8]
-	(ls1[0], ls1[1]) = d.two_by_two(ln_xs_hi, d.frombits(d.one_over_ln2_hi))
-	(ls1[2], ls1[3]) = d.two_by_two(ln_xs_hi, d.frombits(d.one_over_ln2_lo))
-	(ls1[4], ls1[5]) = d.two_by_two(ln_xs_lo, d.frombits(d.one_over_ln2_hi))
-	(ls1[6], ls1[7]) = d.two_by_two(ln_xs_lo, d.frombits(d.one_over_ln2_lo))
-
 	/*
-	   Now log2(xs) = Sum(ls1), so
+	   Now log2(xs) = E1 + E2, so
 
-	     x^n = 2^(n * Sum(ls1)) * 2^(n * e)
+	     x^n = 2^(n * E1 + E2) * 2^(n * e)
 	 */
-	var E1, E2
-	(E1, E2) = double_compensated_sum(ls1[0:8])
-	var ls2 : @f[5]
-	var ls2s : @f[5]
-	var I = 0
-	(ls2[0], ls2[1]) = d.two_by_two(E1, nf)
-	(ls2[2], ls2[3]) = d.two_by_two(E2, nf)
-	ls2[4] = 0.0
 
-	/* Now x^n = 2^(Sum(ls2)) * 2^(n + e) */
-
-	for var j = 0; j < 5; ++j
-		var i = rn(ls2[j])
-		I += i
-		ls2[j] -= (i : @f)
-	;;
-
 	var F1, F2
-	std.slcp(ls2s[0:5], ls2[0:5])
-	std.sort(ls2s[0:5], d.magcmp)
-	(F1, F2) = double_compensated_sum(ls2s[0:5])
+	(F1, F2) = d.split_mul(E1, E2, nf, 0.0)
 
-	if (F1 < 0.0 || F1 > 1.0)
-		var i = rn(F1)
-		I += i
-		ls2[4] -= (i : @f)
-		std.slcp(ls2s[0:5], ls2[0:5])
-		std.sort(ls2s[0:5], d.magcmp)
-		(F1, F2) = double_compensated_sum(ls2s[0:5])
-	;;
+	var I = rn(F1)
+	(F1, F2) = d.split_add(-1.0 * (I : @f), 0.0, F1, F2)
 
 	/* Now, x^n = 2^(F1 + F2) * 2^(I + n*e). */
-	var ls3 : @f[6]
 	var log2_hi, log2_lo
 	(log2_hi, log2_lo) = d.C[128]
-	(ls3[0], ls3[1]) = d.two_by_two(F1, d.frombits(log2_hi))
-	(ls3[2], ls3[3]) = d.two_by_two(F1, d.frombits(log2_lo))
-	(ls3[4], ls3[5]) = d.two_by_two(F2, d.frombits(log2_hi))
 	var G1, G2
-	(G1, G2) = double_compensated_sum(ls3[0:6])
+	(G1, G2) = d.split_mul(F1, F2, d.frombits(log2_hi), d.frombits(log2_lo))
 
 	var base = exp(G1) + G2
 	var pow_xen = xe * n
--- a/lib/math/powr-impl.myr
+++ b/lib/math/powr-impl.myr
@@ -230,7 +230,7 @@
 
 	/*
 	   y could actually be above integer infinity, in which
-	   case x^y is most certainly infinity of 0. More importantly,
+	   case x^y is most certainly infinity or 0. More importantly,
 	   we can't safely compute M (below).
 	 */
 	if x > (1.0 : @f)
--- a/lib/math/test/pown-impl.myr
+++ b/lib/math/test/pown-impl.myr
@@ -100,6 +100,14 @@
 		(0xc017043172d0152b, 0x00000000000000e9, 0xe4b2c1666379afdc),
 		(0xc0325800cfeffb8e, 0x00000000000000d8, 0x78983c24a5e29e19),
 		(0xbfee2ae3cd3208ec, 0x00000000000006b7, 0xb6cb06585f39893d),
+		(0x3f7dd2994731f21f, 0x0000000000000097, 0x0000000000000003),
+		(0x61696e53830d02af, 0xfffffffffffffffe, 0x0000000000000006),
+		(0xc0e60abfce171c2e, 0xffffffffffffffbb, 0x800000000000008a),
+		(0x32dbf16a23293407, 0x0000000000000005, 0x00000000103f2cd6),
+		(0xb95741e695eb8ab2, 0x000000000000000a, 0x00000000000a873c),
+		(0x000aa88b5c2dd078, 0xffffffffffffffff, 0x7fd804c764025003),
+		(0x800cd2d56c4a4074, 0xffffffffffffffff, 0xffd3f696f65f6596),
+		(0x8000d6838a5a8463, 0xffffffffffffffff, 0xfff0000000000000),
 	][:]
 
 	for (x, y, z) : inputs