shithub: mc

Download patch

ref: 0dd6c9d5124707e4c69116745469e21f7e076429
parent: e406ac08a6f68623f0454f2983b88575e69ca62b
author: S. Gilles <[email protected]>
date: Fri Apr 20 10:33:46 EDT 2018

Implement most of expm1. Errors in huge numbers.

[Tan92] finishes off calculation of expm1(f) by range expansion,
which involves multiplying a number by 2^M, where M grows roughly
linearly with f. If f is very large (say, 0x0x42b17200 in
single-precision), M may grow to be as larger than the maximal
exponent (in this case, M = 128).

If 2^M * P is calculated in the normal way, the intermediate value
2^M is not representable, causing mangled output. However, the
result may still be computed: the value P is of magnitude < 1.
Therefore, a scalb function will be necessary to perform this
multiplication without creating unrepresentable intermediate values={

--- a/lib/math/exp-impl.myr
+++ b/lib/math/exp-impl.myr
@@ -1,11 +1,20 @@
 use std
 
 use "fpmath"
+use "util"
 
-/* See [Mul16] (6.2.1), and in particular [Tan89] (which this notation follows) */
+/*
+    See [Mul16] (6.2.1), [Tan89], and [Tan92]. While the exp and
+    expm1 functions are similar enough to be in the same file (Tang's
+    algorithms use the same S constants), they are not quite similar
+    enough to be in the same function.
+ */
 pkg math =
 	pkglocal const exp32 : (f : flt32 -> flt32)
 	pkglocal const exp64 : (f : flt64 -> flt64)
+
+	pkglocal const expm132 : (f : flt32 -> flt32)
+	pkglocal const expm164 : (f : flt64 -> flt64)
 ;;
 
 extern const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
@@ -22,37 +31,63 @@
 	tobits : (f : @f -> @u)
 	frombits : (u : @u -> @f)
 	inf : @u
+	nan : @u
+
+	/* For exp */
 	thresh_1_min : @u
 	thresh_1_max : @u
 	thresh_2 : @u
+	Ai : @u[:]
+
+	/* For expm1 */
+	thresh_tiny : @u
+	thresh_huge_neg : @u
+	Log3_4 : @u
+	Log5_4 : @u
+	Bi : @u[:]
+	exactness_thresh : @u
+
+	/* For both */
+	nabs : @u
 	inv_L : @u
 	L1 : @u
 	L2 : @u
-	nabs : @u
-	Ai : @u[:]
 	S : (@u, @u)[32]
+
 ;;
 
-/* Coefficients for p(t), locally approximating exp1m */
-const Ai32 : uint32[:] = [
-		0x00000000,
-		0x00000000,
+const desc32 : fltdesc(flt32, uint32, int32) = [
+	.explode = std.flt32explode,
+	.assem = std.flt32assem,
+	.fma = fma32,
+	.horner = horner_polyu32,
+	.sgnmask = (1 << 31),
+	.tobits = std.flt32bits,
+	.frombits = std.flt32frombits,
+	.inf = 0x7f800000,
+	.nan = 0x7fc0000,
+	.thresh_1_min = 0xc2cff1b4, /* ln(2^(-127 - 23)) ~= -103.9720770839917 */
+	.thresh_1_max = 0x42b17218, /* ln([2 - 2^(-24)] * 2^127) ~= 88.72283905206 */
+	.inv_L = 0x4238aa3b, /* L = log(2) / 32, this is 1/L in working precision */
+	.L1 = 0x3cb17200, /* L1 and L2 sum to give L in high precision, */
+	.L2 = 0x333fbe8e, /* and L1 has some trailing zeroes. */
+	.nabs = 9, /* L1 has 9 trailing zeroes in binary */
+	.Ai = [ /* Coefficients for approximating expm1 in a tiny range */
 		0x3f000044,
 		0x3e2aaaec,
-	][:]
-
-const Ai64 : uint64[:] = [
-		0x0000000000000000,
-		0x0000000000000000,
-		0x3fe0000000000000,
-		0x3fc5555555548f7c,
-		0x3fa5555555545d4e,
-		0x3f811115b7aa905e,
-		0x3f56c1728d739765,
-	][:]
-
-/* Double-precise expansions of 2^(J/32) */
-const S32 : (uint32, uint32)[32] = [
+	][:],
+	.Log3_4 = 0xbe934b11, /* log(1 - 1/4) < x < log(1 + 1/4) */
+	.Log5_4 = 0x3e647fbf, /* triggers special expm1 case */
+	.thresh_tiny = 0x33000000, /* similar to thresh_1_{min,max}, but for expm1 */
+	.thresh_huge_neg = 0xc18aa122,
+	.Bi = [ /* Coefficients for approximating expm1 between log(3/4) and log(5/4) */
+		0x3e2aaaaa,
+		0x3d2aaaa0,
+		0x3c0889ff,
+		0x3ab64de5,
+		0x394ab327,
+	][:],
+	.S = [ /* Double-precise expansions of 2^(J/32) */
 		(0x3f800000, 0x00000000),
 		(0x3f82cd80, 0x35531585),
 		(0x3f85aac0, 0x34d9f312),
@@ -85,9 +120,49 @@
 		(0x3fefe480, 0x36e66f73),
 		(0x3ff52540, 0x36f45492),
 		(0x3ffa8380, 0x36cb6dc9),
-	]
+	],
+	.exactness_thresh = 24, /* threshold to prevent underflow in a S[high] + 2^-m */
+]
 
-const S64 : (uint64, uint64)[32] = [
+const desc64 : fltdesc(flt64, uint64, int64) = [
+	.explode = std.flt64explode,
+	.assem = std.flt64assem,
+	.fma = fma64,
+	.horner = horner_polyu64,
+	.sgnmask = (1 << 63),
+	.tobits = std.flt64bits,
+	.frombits = std.flt64frombits,
+	.inf = 0x7ff0000000000000,
+	.nan = 0x7ff8000000000000,
+	.thresh_1_min = 0xc0874910d52d3052, /* ln(2^(-1023 - 52)) ~= -745.1332191019 */
+	.thresh_1_max = 0x40862e42fefa39ef, /* ln([2 - 2^(-53)] * 2^1023) ~= 709.78271289 */
+	.inv_L = 0x40471547652b82fe, /* below as in exp32 */
+	.L1 = 0x3f962e42fef00000,
+	.L2 = 0x3d8473de6af278ed,
+	.nabs = 20,
+	.Ai = [
+		0x3fe0000000000000,
+		0x3fc5555555548f7c,
+		0x3fa5555555545d4e,
+		0x3f811115b7aa905e,
+		0x3f56c1728d739765,
+	][:],
+	.Log3_4 = 0xbfd269621134db93,
+	.Log5_4 = 0x3fcc8ff7c79a9a22,
+	.thresh_tiny = 0x3c90000000000000,
+	.thresh_huge_neg = 0xc042b708872320e1,
+	.Bi = [
+		0x3fc5555555555549,
+		0x3fa55555555554b6,
+		0x3f8111111111a9f3,
+		0x3f56c16c16ce14c6,
+		0x3f2a01a01159dd2d,
+		0x3efa019f635825c4,
+		0x3ec71e14bfe3db59,
+		0x3e928295484734ea,
+		0x3e5a2836aa646b96,
+	][:],
+	.S = [
 		(0x3ff0000000000000, 0x0000000000000000),
 		(0x3ff059b0d3158540, 0x3d0a1d73e2a475b4),
 		(0x3ff0b5586cf98900, 0x3ceec5317256e308),
@@ -120,52 +195,16 @@
 		(0x3ffdfc97337b9b40, 0x3cfeb968cac39ed3),
 		(0x3ffea4afa2a490c0, 0x3cf9858f73a18f5e),
 		(0x3fff50765b6e4540, 0x3c99d3e12dd8a18b),
-	]
+	],
+	.exactness_thresh = 53,
+]
 
 const exp32 = {f : flt32
-	const d : fltdesc(flt32, uint32, int32) = [
-		.explode = std.flt32explode,
-		.assem = std.flt32assem,
-		.fma = fma32,
-		.horner = horner_polyu32,
-		.sgnmask = (1 << 31),
-		.tobits = std.flt32bits,
-		.frombits = std.flt32frombits,
-		.inf = 0x7f800000,
-		.thresh_1_min = 0xc2cff1b4, /* ln(2^(-127 - 23)) ~= -103.9720770839917 */
-		.thresh_1_max = 0x42b17218, /* ln([2 - 2^(-24)] * 2^127) ~= 88.72283905206 */
-		.inv_L = 0x4238aa3b, /* L = log(2) / 32, this is 1/L in working precision */
-		.L1 = 0x3cb17200, /* L1 and L2 sum to give L in high precision, and L1 */
-		.L2 = 0x333fbe8e, /* has some trailing zeroes. */
-		.nabs = 9, /* L1 has 9 trailing zeroes in binary */
-		.Ai = Ai32,
-		.S = S32,
-	]
-
-	-> expgen(f, d)
+	-> expgen(f, desc32)
 }
 
 const exp64 = {f : flt64
-	const d : fltdesc(flt64, uint64, int64) = [
-		.explode = std.flt64explode,
-		.assem = std.flt64assem,
-		.fma = fma64,
-		.horner = horner_polyu64,
-		.sgnmask = (1 << 63),
-		.tobits = std.flt64bits,
-		.frombits = std.flt64frombits,
-		.inf = 0x7ff0000000000000,
-		.thresh_1_min = 0xc0874910d52d3052, /* ln(2^(-1023 - 52)) ~= -745.1332191019 */
-		.thresh_1_max = 0x40862e42fefa39ef, /* ln([2 - 2^(-53)] * 2^1023) ~= 709.78271289 */
-		.inv_L = 0x40471547652b82fe, /* as in exp32 */
-		.L1 = 0x3f962e42fef00000,
-		.L2 = 0x3d8473de6af278ed,
-		.nabs = 20,
-		.Ai = Ai64,
-		.S = S64,
-	]
-
-	-> expgen(f, d)
+	-> expgen(f, desc64)
 }
 
 generic expgen = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i, roundable @f -> @i
@@ -180,7 +219,7 @@
 	 */
 	if !n && b >= d.thresh_1_max
 		-> d.frombits(d.inf)
-	elif n && b >= d.thresh_1_min
+	elif n && b > d.thresh_1_min
 		-> (0.0 : @f)
 	;;
 
@@ -191,9 +230,8 @@
 
 	/* Argument reduction to [ -ln(2)/64, ln(2)/64 ] */
 	var inv_L = d.frombits(d.inv_L)
-	var finv_L = f * inv_L
 
-	var N = rn(finv_L)
+	var N = rn(f * inv_L)
 	var N2  = N % (32 : @i)
 	if N2 < 0
 		N2 += (32 : @i)
@@ -220,7 +258,7 @@
 
 	/* Compute a polynomial approximation of exp1m */
 	var R = R1 + R2
-	var Q = d.horner(R, d.Ai)
+	var Q = R * R * d.horner(R, d.Ai)
 	var P = R1 + (R2 + Q)
 
 	/* Expand out from reduced range */
@@ -234,6 +272,127 @@
 	var nu, eu, su
 	(nu, eu, su) = d.explode(unscaled)
 	var exp = d.assem(nu, eu + M, su)
+	if (d.tobits(exp) == 0)
+		/* Make sure we don't quite return 0 */
+		-> d.frombits(1)
+	;;
 
 	-> exp
+}
+
+const expm132 = {f : flt32
+	-> expm1gen(f, desc32)
+}
+
+const expm164 = {f : flt64
+	-> expm1gen(f, desc64)
+}
+
+generic expm1gen = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i, roundable @f -> @i
+	var b = d.tobits(f)
+	var n, e, s
+	(n, e, s) = d.explode(f)
+
+	/* Special cases: +/- 0, inf, NaN, tiny, and huge */
+	if (b & ~d.sgnmask == 0)
+		-> f
+	elif n && (b & ~d.sgnmask == d.inf)
+		-> (-1.0 : @f)
+	elif (b & ~d.sgnmask == d.inf)
+		-> f
+	elif std.isnan(f)
+		-> d.frombits(d.nan)
+	elif (b & ~d.sgnmask) <= d.thresh_tiny
+		var two_to_large = d.assem(false, 100, 0)
+		var two_to_small = d.assem(false, -100, 0)
+		var abs_f = d.assem(false, e, s)
+		-> (two_to_large * f + abs_f) * two_to_small
+	elif !n && b >= d.thresh_1_max /* exp(x) = oo <=> expm1(x) = oo, as it turns out */
+		-> d.frombits(d.inf)
+	elif n && b >= d.thresh_huge_neg
+		-> (-1.0 : @f)
+	;;
+
+	if ((n && b < d.Log3_4) || (!n && b < d.Log5_4))
+		/* Procedure 2 */
+
+		/* compute x^2  / 2 with extra precision */
+		var u = round(f, d)
+		var v = f - u
+		var y = u * u * (0.5 : @f)
+		var z = v * (f + u) * (0.5 : @f)
+		var q = f * f * f * d.horner(f, d.Bi)
+
+		var yn, ye, ys
+		(yn, ye, ys) = d.explode(y)
+		if (ye >= -7)
+			-> (u + y) + (q + (v  + z))
+		else
+			-> f + (y + (q + z))
+		;;
+	;;
+
+	/* Procedure 1 */
+	var inv_L = d.frombits(d.inv_L)
+
+	var N = rn(f * inv_L)
+	var N2 = N % (32 : @i)
+	if N2 < 0
+		N2 += (32 : @i)
+	;;
+	var N1 = N - N2
+
+	var R1 : @f, R2 : @f
+
+	/*
+	   As in the exp case, R1 + R2 very well approximates f
+	   reduced into [ -ln(2)/64, ln(2)/64 ]
+	 */
+	if std.abs(N) >= (1 << d.nabs)
+		R1 = (f - (N1 : @f) * d.frombits(d.L1)) - ((N2 : @f) * d.frombits(d.L1))
+	else
+		R1 = f - (N : @f) * d.frombits(d.L1)
+	;;
+	R2 = -1.0 * (N : @f) * d.frombits(d.L2)
+
+	var M = (N1 : @i) / 32
+	var J = (N2 : std.size)
+
+	/* Compute a polynomial approximation of exp1m */
+	var R = R1 + R2
+	var Q = R * R * d.horner(R, d.Ai)
+	var P = R1 + (R2 + Q)
+
+	/* Expand out */
+	var Su_hi, Su_lo
+	(Su_hi, Su_lo) = d.S[J]
+	var S_hi = d.frombits(Su_hi)
+	var S_lo = d.frombits(Su_lo)
+	var S = S_hi + S_lo
+
+	/*
+	   Note that [Tan92] includes cases to avoid tripping the
+	   underflow flag when not technically required. We currently
+	   do not care about such flags, so that logic is omitted.
+	 */
+	var two_m = d.assem(false, M, 0)
+	var two_neg_m = d.assem(false, -M, 0)
+	if M >= d.exactness_thresh
+		-> two_m * (S_hi  + (S * P + (S_lo - two_neg_m)))
+	elif M <= -8
+		-> two_m * (S_hi  + (S * P + S_lo)) - (1.0 : @f)
+	;;
+
+	-> two_m * ((S_hi - two_neg_m) + (S_hi * P + S_lo * (P + (1.0 : @f))))
+}
+
+generic round = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i, roundable @f -> @i
+	var b = d.tobits(f)
+	var n, e, s
+	(n, e, s) = d.explode(f)
+	var mask = ~((1 << d.nabs) - 1)
+	if need_round_away(0, ((s & mask) : uint64), (d.nabs : int64) + 1)
+		-> d.assem(n, e, 1 + s & mask)
+	;;
+	-> d.assem(n, e, s & mask)
 }
--- a/lib/math/fpmath.myr
+++ b/lib/math/fpmath.myr
@@ -5,6 +5,7 @@
 
 		/* exp-impl */
 		exp : (f : @f -> @f)
+		expm1 : (f : @f -> @f)
 
 		/* fma-impl */
 		fma : (x : @f, y : @f, z : @f -> @f)
@@ -67,6 +68,7 @@
 	fma = {x, y, z; -> fma32(x, y, z)}
 
 	exp = {f; -> exp32(f)}
+	expm1 = {f; -> expm132(f)}
 
 	horner_poly = {f, a; -> horner_poly32(f, a)}
 	horner_polyu = {f, a; -> horner_polyu32(f, a)}
@@ -86,6 +88,7 @@
 	fma = {x, y, z; -> fma64(x, y, z)}
 
 	exp = {f; -> exp64(f)}
+	expm1 = {f; -> expm164(f)}
 
 	horner_poly = {f, a; -> horner_poly64(f, a)}
 	horner_polyu = {f, a; -> horner_polyu64(f, a)}
@@ -105,6 +108,9 @@
 
 extern const exp32 : (x : flt32 -> flt32)
 extern const exp64 : (x : flt64 -> flt64)
+
+extern const expm132 : (x : flt32 -> flt32)
+extern const expm164 : (x : flt64 -> flt64)
 
 extern const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
 extern const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
--- a/lib/math/references
+++ b/lib/math/references
@@ -19,3 +19,9 @@
 Function in IEEE Floating-point Arithmetic”. In: ACM Trans. Math.
 Softw. 15.2 (June 1989), pp. 144–157. issn: 0098-3500.  doi:
 10.1145/63522.214389. url: http://doi.acm.org/10.1145/63522.214389.
+
+[Tan92]
+Ping Tak Peter Tang. “Table-driven Implementation of the Expm1
+Function in IEEE Floating- point Arithmetic”. In: ACM Trans. Math.
+Softw. 18.2 (June 1992), pp. 211–222. issn: 0098-3500.  doi:
+10.1145/146847.146928. url: http://doi.acm.org/10.1145/146847.146928.
--- a/lib/math/test/exp-impl.myr
+++ b/lib/math/test/exp-impl.myr
@@ -8,6 +8,7 @@
 		[.name="exp-02", .fn = exp02],
 		[.name="exp-03", .fn = exp03],
 		[.name="exp-04", .fn = exp04],
+		[.name="expm1-01", .fn = expm101],
 	][:])
 }
 
@@ -19,6 +20,15 @@
 		(0x42000000, 0x568fa1fe),
 		(0xc2b00000, 0x0041edc4),
 		(0xc2b20000, 0x001840fc),
+		(0x7f7fffff, 0x7f800000),
+		(0x7f800000, 0x7f800000),
+		(0x7f800001, 0x7fc00000),
+		(0xc2cff1b3, 0x00000001),
+		(0xc2cff1b4, 0x00000001),
+		(0xc2cff1b5, 0000000000),
+		(0x42b17217, 0x7f7fff84),
+		(0x42b17218, 0x7f800000),
+		(0x42b17219, 0x7f800000),
 	][:]
 
 	for (x, y) : inputs
@@ -99,3 +109,28 @@
 		;;
 	;;
 }
+
+const expm101 = {c
+	var inputs : (uint32, uint32)[:] = [
+		(0x00000000, 0x00000000),
+		(0x3f000000, 0x3f261299),
+		(0x34000000, 0x34000000),
+		(0x3c000000, 0x3c008056),
+		(0x42000000, 0x568fa1fe),
+		(0xc2b00000, 0xbf800000),
+		(0xc2b20000, 0xbf800000),
+		(0x01000000, 0x01000000),
+		(0x40000000, 0x40cc7326),
+		(0x42b17200, 0x7f7d7740),
+	][:]
+
+	for (x, y) : inputs
+		var xf : flt32 = std.flt32frombits(x)
+		var yf : flt32 = std.flt32frombits(y)
+		var rf = math.expm1(xf)
+		testr.check(c, rf == yf,
+			"expm1(0x{b=16,w=8,p=0}) should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
+			x, y, std.flt32bits(rf))
+	;;
+}
+