ref: 64c5e8f1227c1a98ab996e95425ab43791c0ee2e
parent: c039e23ae4fd65431fb40f6d556d72a977725b0f
author: S. Gilles <[email protected]>
date: Fri Jun 7 20:04:12 EDT 2019
Rework pown to be less embarrassingly slow.
--- a/lib/math/pown-impl.myr
+++ b/lib/math/pown-impl.myr
@@ -1,8 +1,9 @@
use std
use "fpmath"
-use "log-impl"
+use "impls"
use "log-overkill"
+use "log-impl"
use "sum-impl"
use "util"
@@ -32,7 +33,11 @@
neginf : @u
magcmp : (f : @f, g : @f -> std.order)
two_by_two : (x : @f, y : @f -> (@f, @f))
+ split_add : (x_h : @f, x_l : @f, y_h : @f, y_l : @f -> (@f, @f))
+ split_mul : (x_h : @f, x_l : @f, y_h : @f, y_l : @f -> (@f, @f))
+ floor : (x : @f -> @f)
log_overkill : (x : @f -> (@f, @f))
+ precision : @i
emin : @i
emax : @i
imax : @i
@@ -52,7 +57,11 @@
.neginf = 0xff800000,
.magcmp = mag_cmp32,
.two_by_two = two_by_two32,
+ .split_add = split_add32,
+ .split_mul = split_mul32,
+ .floor = floor32,
.log_overkill = logoverkill32,
+ .precision = 24,
.emin = -126,
.emax = 127,
.imax = 2147483647, /* For detecting overflow in final exponent */
@@ -72,7 +81,11 @@
.neginf = 0xfff0000000000000,
.magcmp = mag_cmp64,
.two_by_two = two_by_two64,
+ .split_add = hl_add,
+ .split_mul = hl_mult,
+ .floor = floor64,
.log_overkill = logoverkill64,
+ .precision = 53,
.emin = -1022,
.emax = 1023,
.imax = 9223372036854775807,
@@ -79,6 +92,24 @@
.imin = -9223372036854775808,
]
+const split_add32 = {x_h : flt32, x_l : flt32, y_h : flt32, y_l : flt32 /* add the pairs (x_h, x_l) and (y_h, y_l) via flt64; returns a (high, low) flt32 pair */
+ var x : flt64 = (x_h : flt64) + (x_l : flt64) /* x_h + x_l, formed in flt64 */
+ var y : flt64 = (y_h : flt64) + (y_l : flt64) /* y_h + y_l, formed in flt64 */
+ var z = x + y /* the sum, still in flt64 */
+ var z_h : flt32 = (z : flt32) /* high part: z cast down to flt32 */
+ var z_l : flt32 = ((z - (z_h : flt64)) : flt32) /* low part: what the cast dropped, itself cast to flt32 */
+ -> (z_h, z_l)
+}
+
+const split_mul32 = {x_h : flt32, x_l : flt32, y_h : flt32, y_l : flt32 /* multiply the pairs (x_h, x_l) and (y_h, y_l) via flt64; returns a (high, low) flt32 pair */
+ var x : flt64 = (x_h : flt64) + (x_l : flt64) /* x_h + x_l, formed in flt64 */
+ var y : flt64 = (y_h : flt64) + (y_l : flt64) /* y_h + y_l, formed in flt64 */
+ var z = x * y /* the product, still in flt64 */
+ var z_h : flt32 = (z : flt32) /* high part: z cast down to flt32 */
+ var z_l : flt32 = ((z - (z_h : flt64)) : flt32) /* low part: what the cast dropped, itself cast to flt32 */
+ -> (z_h, z_l)
+}
+
const pown32 = {x : flt32, n : int32
-> powngen(x, n, desc32)
}
@@ -123,6 +154,9 @@
elif n == 1
/* Anything^1 is itself */
-> x
+ elif n == -1
+ /* The CPU is probably better at division than we are at pow(). */
+ -> 1.0/x
;;
/* (-f)^n = (-1)^n * (f)^n. Figure this out now, then pretend f >= 0.0 */
@@ -142,62 +176,55 @@
Since n and e, and I are all integers, we can get the last part from
scale2. The hard part is computing I and F, and then computing 2^F.
*/
+ if xe > 0
+ /*
+ But first: do some rough calculations: if we can show n*log(xs) has the
+ same sign as n*e, and n*e would cause overflow or underflow, then we
+ might as well return right now.
+ */
+ var exp_rough_estimate = n * xe
+ if n > 0 && (exp_rough_estimate > d.emax + 1 || (exp_rough_estimate / n != xe))
+ -> ult_sgn * d.frombits(d.inf)
+ elif n < 0 && (exp_rough_estimate < d.emin - d.precision - 1 || (exp_rough_estimate / n != xe))
+ -> ult_sgn * 0.0
+ ;;
+ elif xe < 0
+ /*
+ Also, if we consider xs/2 and xe + 1, we can analyze the case in which
+ n*log(xs) has a different sign from n*e.
+ */
+ var exp_rough_estimate = n * (xe + 1)
+ if n > 0 && (exp_rough_estimate < d.emin - d.precision - 1 || (exp_rough_estimate / n != (xe + 1)))
+ -> ult_sgn * 0.0
+ elif n < 0 && (exp_rough_estimate > d.emax + 1 || (exp_rough_estimate / n != (xe + 1)))
+ -> ult_sgn * d.frombits(d.inf)
+ ;;
+ ;;
+
var ln_xs_hi, ln_xs_lo
(ln_xs_hi, ln_xs_lo) = d.log_overkill(d.assem(false, 0, xs))
/* Now x^n = 2^(n * [ ln_xs / ln(2) ]) * 2^(n + e) */
+ var E1, E2
+ (E1, E2) = d.split_mul(ln_xs_hi, ln_xs_lo, d.frombits(d.one_over_ln2_hi), d.frombits(d.one_over_ln2_lo))
- var ls1 : @f[8]
- (ls1[0], ls1[1]) = d.two_by_two(ln_xs_hi, d.frombits(d.one_over_ln2_hi))
- (ls1[2], ls1[3]) = d.two_by_two(ln_xs_hi, d.frombits(d.one_over_ln2_lo))
- (ls1[4], ls1[5]) = d.two_by_two(ln_xs_lo, d.frombits(d.one_over_ln2_hi))
- (ls1[6], ls1[7]) = d.two_by_two(ln_xs_lo, d.frombits(d.one_over_ln2_lo))
-
/*
- Now log2(xs) = Sum(ls1), so
+ Now log2(xs) = E1 + E2, so
- x^n = 2^(n * Sum(ls1)) * 2^(n * e)
+ x^n = 2^(n * (E1 + E2)) * 2^(n * e)
*/
- var E1, E2
- (E1, E2) = double_compensated_sum(ls1[0:8])
- var ls2 : @f[5]
- var ls2s : @f[5]
- var I = 0
- (ls2[0], ls2[1]) = d.two_by_two(E1, nf)
- (ls2[2], ls2[3]) = d.two_by_two(E2, nf)
- ls2[4] = 0.0
- /* Now x^n = 2^(Sum(ls2)) * 2^(n + e) */
-
- for var j = 0; j < 5; ++j
- var i = rn(ls2[j])
- I += i
- ls2[j] -= (i : @f)
- ;;
-
var F1, F2
- std.slcp(ls2s[0:5], ls2[0:5])
- std.sort(ls2s[0:5], d.magcmp)
- (F1, F2) = double_compensated_sum(ls2s[0:5])
+ (F1, F2) = d.split_mul(E1, E2, nf, 0.0)
- if (F1 < 0.0 || F1 > 1.0)
- var i = rn(F1)
- I += i
- ls2[4] -= (i : @f)
- std.slcp(ls2s[0:5], ls2[0:5])
- std.sort(ls2s[0:5], d.magcmp)
- (F1, F2) = double_compensated_sum(ls2s[0:5])
- ;;
+ var I = rn(F1)
+ (F1, F2) = d.split_add(-1.0 * (I : @f), 0.0, F1, F2)
/* Now, x^n = 2^(F1 + F2) * 2^(I + n*e). */
- var ls3 : @f[6]
var log2_hi, log2_lo
(log2_hi, log2_lo) = d.C[128]
- (ls3[0], ls3[1]) = d.two_by_two(F1, d.frombits(log2_hi))
- (ls3[2], ls3[3]) = d.two_by_two(F1, d.frombits(log2_lo))
- (ls3[4], ls3[5]) = d.two_by_two(F2, d.frombits(log2_hi))
var G1, G2
- (G1, G2) = double_compensated_sum(ls3[0:6])
+ (G1, G2) = d.split_mul(F1, F2, d.frombits(log2_hi), d.frombits(log2_lo))
var base = exp(G1) + G2
var pow_xen = xe * n
--- a/lib/math/powr-impl.myr
+++ b/lib/math/powr-impl.myr
@@ -230,7 +230,7 @@
/*
y could actually be above integer infinity, in which
- case x^y is most certainly infinity of 0. More importantly,
+ case x^y is most certainly infinity or 0. More importantly,
we can't safely compute M (below).
*/
if x > (1.0 : @f)
--- a/lib/math/test/pown-impl.myr
+++ b/lib/math/test/pown-impl.myr
@@ -100,6 +100,14 @@
(0xc017043172d0152b, 0x00000000000000e9, 0xe4b2c1666379afdc),
(0xc0325800cfeffb8e, 0x00000000000000d8, 0x78983c24a5e29e19),
(0xbfee2ae3cd3208ec, 0x00000000000006b7, 0xb6cb06585f39893d),
+ (0x3f7dd2994731f21f, 0x0000000000000097, 0x0000000000000003),
+ (0x61696e53830d02af, 0xfffffffffffffffe, 0x0000000000000006),
+ (0xc0e60abfce171c2e, 0xffffffffffffffbb, 0x800000000000008a),
+ (0x32dbf16a23293407, 0x0000000000000005, 0x00000000103f2cd6),
+ (0xb95741e695eb8ab2, 0x000000000000000a, 0x00000000000a873c),
+ (0x000aa88b5c2dd078, 0xffffffffffffffff, 0x7fd804c764025003),
+ (0x800cd2d56c4a4074, 0xffffffffffffffff, 0xffd3f696f65f6596),
+ (0x8000d6838a5a8463, 0xffffffffffffffff, 0xfff0000000000000),
][:]
for (x, y, z) : inputs