ref: b889eefb13a9f1e41a3d0a85b23b69c058847ef3
parent: 218258e4ac2940303be597272245cb09662fd759
author: S. Gilles <[email protected]>
date: Wed May 1 12:01:01 EDT 2019
Add a log function which returns far too much precision. This makes accurately computing pown and powr easier.
--- /dev/null
+++ b/lib/math/ancillary/log-overkill-constants.c
@@ -1,0 +1,88 @@
+/* cc -o log-overkill-constants log-overkill-constants.c -lmpfr */
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <mpfr.h>
+
+#define FLT64_TO_UINT64(f) (*((uint64_t *) ((char *) &f)))
+#define UINT64_TO_FLT64(u) (*((double *) ((char *) &u)))
+
+#define FLT32_TO_UINT32(f) (*((uint32_t *) ((char *) &f)))
+#define UINT32_TO_FLT32(u) (*((double *) ((char *) &u)))
+
+int main(void)
+{
+ mpfr_t one;
+ mpfr_t two_to_the_minus_k;
+ mpfr_t one_plus_two_to_the_minus_k;
+ mpfr_t ln_one_plus_two_to_the_minus_k;
+ mpfr_t one_minus_two_to_the_minus_k;
+ mpfr_t ln_one_minus_two_to_the_minus_k;
+ mpfr_t t1;
+ mpfr_t t2;
+ double d = 0;
+ uint64_t u = 0;
+
+ mpfr_init2(one, 10000);
+ mpfr_init2(two_to_the_minus_k, 10000);
+ mpfr_init2(one_plus_two_to_the_minus_k, 10000);
+ mpfr_init2(ln_one_plus_two_to_the_minus_k, 10000);
+ mpfr_init2(one_minus_two_to_the_minus_k, 10000);
+ mpfr_init2(ln_one_minus_two_to_the_minus_k, 10000);
+ mpfr_init2(t1, 10000);
+ mpfr_init2(t2, 10000);
+
+ printf("/* C_plus */\n");
+ printf("(0x0000000000000000, 0x0000000000000000, 0x0000000000000000), /* dummy */\n");
+
+ for (size_t k = 1; k <= 27; ++k) {
+ mpfr_set_si(one, 1, MPFR_RNDN);
+ mpfr_mul_2si(two_to_the_minus_k, one, -k, MPFR_RNDN);
+ mpfr_add(one_plus_two_to_the_minus_k, one, two_to_the_minus_k, MPFR_RNDN);
+ mpfr_log(ln_one_plus_two_to_the_minus_k, one_plus_two_to_the_minus_k, MPFR_RNDN);
+
+ mpfr_set(t1, ln_one_plus_two_to_the_minus_k, MPFR_RNDN);
+ d = mpfr_get_d(t1, MPFR_RNDN);
+ u = FLT64_TO_UINT64(d);
+ printf("(%#018lx, ", u);
+ mpfr_set_d(t2, d, MPFR_RNDN);
+ mpfr_sub(t1, t1, t2, MPFR_RNDN);
+ d = mpfr_get_d(t1, MPFR_RNDN);
+ u = FLT64_TO_UINT64(d);
+ printf("%#018lx, ", u);
+ mpfr_set_d(t2, d, MPFR_RNDN);
+ mpfr_sub(t1, t1, t2, MPFR_RNDN);
+ d = mpfr_get_d(t1, MPFR_RNDN);
+ u = FLT64_TO_UINT64(d);
+ printf("%#018lx), /* k = %zu */\n", u, k);
+ }
+
+ printf("\n");
+ printf("/* C_minus */\n");
+ printf("(0x0000000000000000, 0x0000000000000000, 0x0000000000000000), /* dummy */\n");
+
+ for (size_t k = 1; k <= 27; ++k) {
+ mpfr_set_si(one, 1, MPFR_RNDN);
+ mpfr_mul_2si(two_to_the_minus_k, one, -k, MPFR_RNDN);
+ mpfr_sub(one_minus_two_to_the_minus_k, one, two_to_the_minus_k, MPFR_RNDN);
+ mpfr_log(ln_one_minus_two_to_the_minus_k, one_minus_two_to_the_minus_k, MPFR_RNDN);
+
+ mpfr_set(t1, ln_one_minus_two_to_the_minus_k, MPFR_RNDN);
+ d = mpfr_get_d(t1, MPFR_RNDN);
+ u = FLT64_TO_UINT64(d);
+ printf("(%#018lx, ", u);
+ mpfr_set_d(t2, d, MPFR_RNDN);
+ mpfr_sub(t1, t1, t2, MPFR_RNDN);
+ d = mpfr_get_d(t1, MPFR_RNDN);
+ u = FLT64_TO_UINT64(d);
+ printf("%#018lx, ", u);
+ mpfr_set_d(t2, d, MPFR_RNDN);
+ mpfr_sub(t1, t1, t2, MPFR_RNDN);
+ d = mpfr_get_d(t1, MPFR_RNDN);
+ u = FLT64_TO_UINT64(d);
+ printf("%#018lx), /* k = %zu */\n", u, k);
+ }
+
+ return 0;
+}
--- /dev/null
+++ b/lib/math/ancillary/log-overkill-tests.c
@@ -1,0 +1,106 @@
+/* cc -o log-overkill-tests log-overkill-tests.c -lmpfr */
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <mpfr.h>
+
+#define FLT64_TO_UINT64(f) (*((uint64_t *) ((char *) &f)))
+#define UINT64_TO_FLT64(u) (*((double *) ((char *) &u)))
+
+#define FLT32_TO_UINT32(f) (*((uint32_t *) ((char *) &f)))
+#define UINT32_TO_FLT32(u) (*((float *) ((char *) &u)))
+
+/*
+ xoroshiro64**, from http://xoshiro.di.unimi.it/xoroshiro64starstar.c
+ and in public domain.
+ */
+static inline uint32_t
+rotl(const uint32_t x, int k)
+{
+ return (x << k) | (x >> (32 - k));
+}
+
+static uint32_t s[2];
+
+uint32_t
+next(void)
+{
+ const uint32_t s0 = s[0];
+ uint32_t s1 = s[1];
+ const uint32_t result_starstar = rotl(s0 * 0x9E3779BB, 5) * 5;
+
+ s1 ^= s0;
+ s[0] = rotl(s0, 26) ^ s1 ^ (s1 << 9);
+ s[1] = rotl(s1, 13);
+
+ return result_starstar;
+}
+
+int
+main(void)
+{
+ mpfr_t x;
+ mpfr_t lnx;
+ mpfr_t t;
+ double d = 0;
+ float f = 0;
+ double t1 = 0;
+ double t2 = 0;
+ float t3 = 0;
+ float t4 = 0;
+ uint64_t u = 0;
+ uint32_t v = 0;
+
+ /* Seed xoroshiro64** */
+ s[1] = 1234;
+ s[2] = 1234;
+
+ mpfr_init2(x, 10000);
+ mpfr_init2(t, 10000);
+ mpfr_init2(lnx, 10000);
+
+ size_t n = 0;
+ while (n < 100) {
+ u = (((uint64_t)next()) << 32) | ((uint64_t)next());
+ d = UINT64_TO_FLT64(u);
+ if (d < 0.001 || d > 100) {
+ continue;
+ }
+
+ printf("(%#018lx, ", u);
+ mpfr_set_d(x, d, MPFR_RNDN);
+ mpfr_log(lnx, x, MPFR_RNDN);
+
+ t1 = mpfr_get_d(lnx, MPFR_RNDN);
+ mpfr_set_d(t, t1, MPFR_RNDN);
+ mpfr_sub(lnx, lnx, t, MPFR_RNDN);
+ t2 = mpfr_get_d(lnx, MPFR_RNDN);
+
+ printf("%#018lx, %#018lx),\n", FLT64_TO_UINT64(t1), FLT64_TO_UINT64(t2));
+ n++;
+ }
+
+ n = 0;
+ while (n < 100) {
+ v = next();
+ f = UINT32_TO_FLT32(v);
+ if (f < 0.001 || f > 100) {
+ continue;
+ }
+
+ printf("(%#010x, ", v);
+ mpfr_set_d(x, (double) f, MPFR_RNDN);
+ mpfr_log(lnx, x, MPFR_RNDN);
+
+ t3 = (float)mpfr_get_d(lnx, MPFR_RNDN);
+ mpfr_set_d(t, (double) t3, MPFR_RNDN);
+ mpfr_sub(lnx, lnx, t, MPFR_RNDN);
+ t4 = (float)mpfr_get_d(lnx, MPFR_RNDN);
+
+ printf("%#010x, %#010x),\n", FLT32_TO_UINT32(t3), FLT32_TO_UINT32(t4));
+ n++;
+ }
+
+ return 0;
+}
--- a/lib/math/bld.sub
+++ b/lib/math/bld.sub
@@ -22,6 +22,9 @@
# polynomial evaluation methods
poly-impl.myr
+ # x^n
+ log-overkill.myr
+
# x^y
powr-impl.myr
--- /dev/null
+++ b/lib/math/log-overkill.myr
@@ -1,0 +1,302 @@
+use std
+
+use "fpmath"
+use "log-impl"
+use "exp-impl"
+use "sum-impl"
+use "util"
+
+/*
+ This is an implementation of log following [Mul16] 8.3.2, returning
+ far too much precision. These are slower than the
+ table-and-low-degree-polynomial implementations of exp-impl.myr and
+ log-impl.myr, but are needed to handle the powr, pown, and rootn
+ functions.
+
+ This is only for flt64, because if you want this for flt32 you should
+ just cast to flt64, use the fast functions, and then split back.
+
+ Note that the notation of [Mul16] 8.3.2 is confusing, to say the
+ least. [NM96] is, perhaps, a clearer presentation.
+
+ To recap, we use an iteration, starting with t_1 = 0, L_1 = x, and
+ iterate as
+
+ t_{n+1} = t_{n} - ln(1 + d_n 2^{-n})
+ L_{n+1} = L_n (1 + d_n 2^{-n})
+
+ where d_n is in {-1, 0, 1}, chosen so that as n -> oo, L_n approaches
+ 1, then t_n approaches ln(x).
+
+ If we let l_n = L_n - 1, we initialize l_1 = x - 1 and iterate as
+
+ l_{n+1} = l_n (1 + d_n 2^{-n}) + d_n 2^{-n}
+
+ If we further consider l'_n = 2^n l_n, we initialize l'_1 = x - 1,
+ and iterate as
+
+ l'_{n + 1} = 2 l'_{n} (1 + d_n 2^{-n}) + 2 d_n
+
+ The nice thing about this is that we can pick d_n easily based on
+ truncating l'_n to the first fractional digit: [l'_n]. Then
+
+ { +1 if [l'_n] <= -1/2
+ d_n = { 0 if [l'_n] = 0 or 1/2
+ { -1 if [l'_n] >= 1
+ */
+pkg math =
+ pkglocal const logoverkill32 : (x : flt32 -> (flt32, flt32))
+ /*pkglocal*/ const logoverkill64 : (x : flt64 -> (flt64, flt64))
+;;
+
+/*
+ Tables of 1 +/- 2^-k, for k = 0 to 159, with k = 0 a dummy row. 159
+ is chosen as the first k such that the error between 2^(-53 * 2) and
+ 2^(-53 * 2) + log(1 + 2^(-k)) is less than 1 ulp, therefore we'll
+ have a full 53 * 2 bits of precision available with these tables. The
+ layout for C_plus is
+
+ ( ln(1 + 2^-k)[hi], ln(1 + 2^-k)[lo], ln(1 + 2^-k)[very lo]) ,
+
+ and for C_minus it is similar, but handling ln(1 - 2^-k).
+
+ They are generated by ancillary/log-overkill-constants.c.
+
+ Conveniently, for k > 27, we can calculate the entry exactly using a
+ few terms of the Taylor expansion for ln(1 + x), with the x^{2+}
+ terms vanishing past k = 53. This is possible since we only care
+ about two flt64s worth of precision.
+ */
+const C_plus : (uint64, uint64, uint64)[28] = [
+ (0x0000000000000000, 0x0000000000000000, 0x0000000000000000), /* dummy */
+ (0x3fd9f323ecbf984c, 0xbc4a92e513217f5c, 0x38e0c0cfa41ff669), /* k = 1 */
+ (0x3fcc8ff7c79a9a22, 0xbc64f689f8434012, 0x390a24ae3b2f53a1), /* k = 2 */
+ (0x3fbe27076e2af2e6, 0xbc361578001e0162, 0x38b55db94ebc4018), /* k = 3 */
+ (0x3faf0a30c01162a6, 0x3c485f325c5bbacd, 0xb8e0ece597165991), /* k = 4 */
+ (0x3f9f829b0e783300, 0x3c333e3f04f1ef23, 0xb8d814544147acc9), /* k = 5 */
+ (0x3f8fc0a8b0fc03e4, 0xbc183092c59642a1, 0xb8b52414fc416fc2), /* k = 6 */
+ (0x3f7fe02a6b106789, 0xbbce44b7e3711ebf, 0x386a567b6587df34), /* k = 7 */
+ (0x3f6ff00aa2b10bc0, 0x3c02821ad5a6d353, 0xb8912dcccb588a4a), /* k = 8 */
+ (0x3f5ff802a9ab10e6, 0x3bfe29e3a153e3b2, 0xb89538d49c4f745e), /* k = 9 */
+ (0x3f4ffc00aa8ab110, 0xbbe0fecbeb9b6cdb, 0xb877171cf29e89d1), /* k = 10 */
+ (0x3f3ffe002aa6ab11, 0x3b999e2b62cc632d, 0xb81eae851c58687c), /* k = 11 */
+ (0x3f2fff000aaa2ab1, 0x3ba0bbc04dc4e3dc, 0x38152723342e000b), /* k = 12 */
+ (0x3f1fff8002aa9aab, 0x3b910e6678af0afc, 0x382ed521af29bc8d), /* k = 13 */
+ (0x3f0fffc000aaa8ab, 0xbba3bbc110fec82c, 0xb84f79185e42fbaf), /* k = 14 */
+ (0x3effffe0002aaa6b, 0xbb953bbbe6661d42, 0xb835071791df7d3e), /* k = 15 */
+ (0x3eeffff0000aaaa3, 0xbb8553bbbd110fec, 0xb82ff1fae6cea01a), /* k = 16 */
+ (0x3edffff80002aaaa, 0xbb75553bbbc66662, 0x3805f05f166325ff), /* k = 17 */
+ (0x3ecffffc0000aaab, 0xbb6d5553bbbc1111, 0x37a380f8138f70f4), /* k = 18 */
+ (0x3ebffffe00002aab, 0xbb5655553bbbbe66, 0xb7f987507707503c), /* k = 19 */
+ (0x3eafffff00000aab, 0xbb45755553bbbbd1, 0xb7c10fec7ed7ec7e), /* k = 20 */
+ (0x3e9fffff800002ab, 0xbb355955553bbbbc, 0xb7d9999875075875), /* k = 21 */
+ (0x3e8fffffc00000ab, 0xbb2555d55553bbbc, 0x37bf7777809c09a1), /* k = 22 */
+ (0x3e7fffffe000002b, 0xbb15556555553bbc, 0x37b106666678af8b), /* k = 23 */
+ (0x3e6ffffff000000b, 0xbb055557555553bc, 0x37a110bbbbbc04e0), /* k = 24 */
+ (0x3e5ffffff8000003, 0xbaf555559555553c, 0x3791110e6666678b), /* k = 25 */
+ (0x3e4ffffffc000001, 0xbae555555d555554, 0x37811110fbbbbbc0), /* k = 26 */
+ (0x3e3ffffffe000000, 0x3ac5555553555556, 0xb76ddddddf333333), /* k = 27 */
+]
+
+const C_minus : (uint64, uint64, uint64)[28] = [
+ (0x0000000000000000, 0x0000000000000000, 0x0000000000000000), /* dummy */
+ (0xbfe62e42fefa39ef, 0xbc7abc9e3b39803f, 0xb907b57a079a1934), /* k = 1 */
+ (0xbfd269621134db92, 0xbc7e0efadd9db02b, 0x39163d5cf0b6f233), /* k = 2 */
+ (0xbfc1178e8227e47c, 0x3c50e63a5f01c691, 0xb8f03c776a3fb0f1), /* k = 3 */
+ (0xbfb08598b59e3a07, 0x3c5dd7009902bf32, 0x38ea7da07274e01d), /* k = 4 */
+ (0xbfa0415d89e74444, 0xbc4c05cf1d753622, 0xb8d3bc1c184cef0a), /* k = 5 */
+ (0xbf90205658935847, 0xbc327c8e8416e71f, 0x38b19642aac1310f), /* k = 6 */
+ (0xbf8010157588de71, 0xbc146662d417ced0, 0xb87e91702f8418af), /* k = 7 */
+ (0xbf70080559588b35, 0xbc1f96638cf63677, 0x38a90badb5e868b4), /* k = 8 */
+ (0xbf60040155d5889e, 0x3be8f98e1113f403, 0x38601ac2204fbf4b), /* k = 9 */
+ (0xbf50020055655889, 0xbbe9abe6bf0fa436, 0x3867c7d335b216f3), /* k = 10 */
+ (0xbf40010015575589, 0x3bec8863f23ef222, 0x38852c36a3d20146), /* k = 11 */
+ (0xbf30008005559559, 0x3bddd332a0e20e2f, 0x385c8b6b9ff05329), /* k = 12 */
+ (0xbf20004001555d56, 0x3bcddd88863f53f6, 0xb859332cbe6e6ac5), /* k = 13 */
+ (0xbf10002000555655, 0xbbb62224ccd5f17f, 0xb8366327cc029156), /* k = 14 */
+ (0xbf00001000155575, 0xbba5622237779c0a, 0xb7d38f7110a9391d), /* k = 15 */
+ (0xbef0000800055559, 0xbb95562222cccd5f, 0xb816715f87b8e1ee), /* k = 16 */
+ (0xbee0000400015556, 0x3b75553bbbb1110c, 0x381fb17b1f791778), /* k = 17 */
+ (0xbed0000200005555, 0xbb79555622224ccd, 0x3805074f75071791), /* k = 18 */
+ (0xbec0000100001555, 0xbb65d55562222377, 0xb80de7027127028d), /* k = 19 */
+ (0xbeb0000080000555, 0xbb5565555622222d, 0x37e9995075035075), /* k = 20 */
+ (0xbea0000040000155, 0xbb45575555622222, 0xb7edddde70270670), /* k = 21 */
+ (0xbe90000020000055, 0xbb35559555562222, 0xb7c266666af8af9b), /* k = 22 */
+ (0xbe80000010000015, 0xbb25555d55556222, 0xb7b11bbbbbce04e0), /* k = 23 */
+ (0xbe70000008000005, 0xbb15555655555622, 0xb7a111666666af8b), /* k = 24 */
+ (0xbe60000004000001, 0xbb05555575555562, 0xb7911113bbbbbce0), /* k = 25 */
+ (0xbe50000002000000, 0xbaf5555559555556, 0xb78111112666666b), /* k = 26 */
+ (0xbe40000001000000, 0xbac5555557555556, 0x376ddddddc888888), /* k = 27 */
+]
+
+const logoverkill32 = {x : flt32
+ var x64 : flt64 = (x : flt64)
+ var l64 : flt64 = log64(x64)
+ var y1 : flt32 = (l64 : flt32)
+ var y2 : flt32 = ((l64 - (y1 : flt64)) : flt32)
+ -> (y1, y2)
+}
+
+const logoverkill64 = {x : flt64
+ var xn, xe, xs
+ (xn, xe, xs) = std.flt64explode(x)
+
+ /* Special cases */
+ if std.isnan(x)
+ -> (std.flt64frombits(0x7ff8000000000000), std.flt64frombits(0x7ff8000000000000))
+ elif xe == -1024 && xs == 0
+ /* log (+/- 0) is -infinity */
+ -> (std.flt64frombits(0xfff0000000000000), std.flt64frombits(0xfff0000000000000))
+ elif xe == 1023
+ -> (std.flt64frombits(0xfff8000000000000), std.flt64frombits(0xfff8000000000000))
+ elif xn
+ /* log(-anything) is NaN */
+ -> (std.flt64frombits(0x7ff8000000000000), std.flt64frombits(0x7ff8000000000000))
+ ;;
+
+ /*
+ Deal with 2^xe up front: multiply xe by a high-precision log(2). We'll
+ add them back in to the giant mess of tis later.
+ */
+ var xef : flt64 = (xe : flt64)
+ var log_2_hi, log_2_lo
+ (log_2_hi, log_2_lo) = accurate_logs64[128] /* See log-impl.myr */
+ var lxe_1, lxe_2, lxe_3, lxe_4
+ (lxe_1, lxe_2) = two_by_two64(xef, std.flt64frombits(log_2_hi))
+ (lxe_3, lxe_4) = two_by_two64(xef, std.flt64frombits(log_2_lo))
+
+ /*
+ We split t into three parts, so that we can gradually build up two
+ flt64s worth of information
+ */
+ var t1 = 0.0
+ var t2 = 0.0
+ var t3 = 0.0
+
+ /*
+ We also split lprime.
+ */
+ var lprime1
+ var lprime2
+ (lprime1, lprime2) = slow2sum(std.flt64assem(false, 1, xs), -2.0)
+ var lprime3 = 0.0
+
+ for var k = 1; k <= 107; ++k
+ /* Calculate d_k and some quanitites for iteration */
+ var d = 0.0
+ var ln_hi : flt64, ln_mi : flt64, ln_lo : flt64
+
+ /*
+ Note the truncation method for [Mul16] is for signed-digit systems,
+ which we don't have. This comparison follows from the remark following
+ (8.23), though.
+ */
+ if lprime1 <= -0.5
+ d = 1.0
+ (ln_hi, ln_mi, ln_lo) = get_C_plus(k)
+ elif lprime1 < 1.0
+ d = 0.0
+
+ /* In this case, t_n is unchanged, and we just scale lprime by 2 */
+ lprime1 = lprime1 * 2.0
+ lprime2 = lprime2 * 2.0
+ lprime3 = lprime3 * 2.0
+
+ /*
+ If you're looking for a way to speed this up, calculate how many k we
+ can skip here, preferably by a lookup table.
+ */
+ continue
+ else
+ d = -1.0
+ (ln_hi, ln_mi, ln_lo) = get_C_minus(k)
+ ;;
+
+ /* t_{n + 1} */
+ (t1, t2, t3) = foursum(t1, t2, t3, -1.0 * ln_hi)
+ (t1, t2, t3) = foursum(t1, t2, t3, -1.0 * ln_mi)
+ (t1, t2, t3) = foursum(t1, t2, t3, -1.0 * ln_lo)
+
+ /* lprime_{n + 1} */
+ lprime1 *= 2.0
+ lprime2 *= 2.0
+ lprime3 *= 2.0
+
+ var lp1m = d * scale2(lprime1, -k)
+ var lp2m = d * scale2(lprime2, -k)
+ var lp3m = d * scale2(lprime3, -k)
+ (lprime1, lprime2, lprime3) = foursum(lprime1, lprime2, lprime3, lp1m)
+ (lprime1, lprime2, lprime3) = foursum(lprime1, lprime2, lprime3, lp2m)
+ (lprime1, lprime2, lprime3) = foursum(lprime1, lprime2, lprime3, lp3m)
+ (lprime1, lprime2, lprime3) = foursum(lprime1, lprime2, lprime3, 2.0 * d)
+ ;;
+
+ var l : flt64[:] = [t1, t2, t3, lxe_1, lxe_2, lxe_3, lxe_4][:]
+ std.sort(l, mag_cmp64)
+ -> double_compensated_sum(l)
+}
+
+/* significand for 1/3 (if you reconstruct without compensating, you get 4/3) */
+const one_third_sig = 0x0015555555555555
+
+/* and for 1/5 (if you reconstruct, you get 8/5) */
+const one_fifth_sig = 0x001999999999999a
+
+/*
+ These calculations are incredibly slow. Somebody should speed them up.
+ */
+const get_C_plus = {k : int64
+ if k < 0
+ -> (0.0, 0.0, 0.0)
+ elif k < 28
+ var t1, t2, t3
+ (t1, t2, t3) = C_plus[k]
+ -> (std.flt64frombits(t1), std.flt64frombits(t2), std.flt64frombits(t3))
+ elif k < 54
+ var t1 = std.flt64assem(false, -k, 1 << 53) /* x [ = 2^-k ] */
+ var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
+ var t3 = std.flt64assem(false, -3*k - 2, one_third_sig) /* x^3 / 3 */
+ var t4 = std.flt64assem(true, -4*k - 2, 1 << 53) /* -x^4 / 4 */
+ var t5 = std.flt64assem(false, -5*k - 3, one_fifth_sig) /* x^5 / 5 */
+ -> triple_compensated_sum([t1, t2, t3, t4, t5][:])
+ else
+ var t1 = std.flt64assem(false, -k, 1 << 53) /* x [ = 2^-k ] */
+ var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
+ var t3 = std.flt64assem(false, -3*k - 2, one_third_sig) /* x^3 / 3 */
+ -> (t1, t2, t3)
+ ;;
+}
+
+const get_C_minus = {k : int64
+ if k < 0
+ -> (0.0, 0.0, 0.0)
+ elif k < 28
+ var t1, t2, t3
+ (t1, t2, t3) = C_minus[k]
+ -> (std.flt64frombits(t1), std.flt64frombits(t2), std.flt64frombits(t3))
+ elif k < 54
+ var t1 = std.flt64assem(true, -k, 1 << 53) /* x [ = 2^-k ] */
+ var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
+ var t3 = std.flt64assem(true, -3*k - 2, one_third_sig) /* x^3 / 3 */
+ var t4 = std.flt64assem(true, -4*k - 2, 1 << 53) /* -x^4 / 4 */
+ var t5 = std.flt64assem(true, -5*k - 3, one_fifth_sig) /* x^5 / 5 */
+ -> triple_compensated_sum([t1, t2, t3, t4, t5][:])
+ else
+ var t1 = std.flt64assem(true, -k, 1 << 53) /* x [ = 2^-k ] */
+ var t2 = std.flt64assem(true, -2*k - 1, 1 << 53) /* -x^2 / 2 */
+ var t3 = std.flt64assem(true, -3*k - 2, one_third_sig) /* x^3 / 3 */
+ -> (t1, t2, t3)
+ ;;
+}
+
+const foursum = {a1 : flt64, a2 : flt64, a3 : flt64, x : flt64
+ var t1, t2, t3, t4, t5, t6, s1, s2, s3, s4
+
+ (t5, t6) = slow2sum(a3, x)
+ (t3, t4) = slow2sum(a2, t5)
+ (t1, t2) = slow2sum(a1, t3)
+ (s3, s4) = slow2sum(t4, t6)
+ (s1, s2) = slow2sum(t2, s3)
+
+ -> (t1, s1, s2 + s4)
+}
--- a/lib/math/references
+++ b/lib/math/references
@@ -29,6 +29,12 @@
J. M. Muller. Elementary functions : algorithms and implementation.
Third edition. New York: Birkhäuser, 2016. isbn: 9781489979810.
+[NM96]
+Asger Munk Nielsen and Jean-Michel Muller. “On-line algorithms for
+computing exponentials and logarithms”. In: Lecture Notes in Computer
+Science. Springer Berlin Heidelberg, 1996, pp. 165–174. doi:
+10.1007/bfb0024699.
+
[Tan89]
Ping-Tak Peter Tang. “Table-driven Implementation of the Exponential
Function in IEEE Floating-point Arithmetic”. In: ACM Trans. Math.
--- /dev/null
+++ b/lib/math/test/log-overkill.myr
@@ -1,0 +1,242 @@
+use std
+use math
+use testr
+
+/*
+ Test the extra-precision log() function of log-overkill.myr. We only
+ test inputs in a reasonable range, because this function is only used
+ by the various pow* functions, so its input should already be normalized.
+ */
+const main = {
+ testr.run([
+ [.name="log-overkill-01", .fn = log01],
+ [.name="log-overkill-02", .fn = log02],
+ ][:])
+}
+
+/* Commented-out entries are off by one or two ulps. */
+const log01 = {c
+ var inputs : (uint32, uint32, uint32)[:] = [
+ (0x3e064666, 0xc0020570, 0xb31d3367),
+ (0x41a210cc, 0x40408c3e, 0x32d67b14),
+ (0x3d4179b1, 0xc0435e10, 0x33ddd9f2),
+ (0x41f80c99, 0x405bc9b2, 0x33e2ee76),
+ (0x3bf31225, 0xc09cec61, 0x34666578),
+ (0x7fc5acf4, 0xffc00000, 0xffc00000),
+ (0x3d951adb, 0xc027ad92, 0x329b2b14),
+ (0x406c0805, 0x3fa70ce9, 0xb317a93b),
+ //(0x3dc554d4, 0xc015be36, 0xb3b98877),
+ (0x4168b510, 0x402b5720, 0xb3a1b4f0),
+ (0x40de6245, 0x3ff8264f, 0x32ca972e),
+ (0x3f3e70d0, 0xbe9777e9, 0xb1abc5cb),
+ (0x3ce907fb, 0xc063d2cc, 0xb3c0b1e5),
+ (0x42620a10, 0x408119ed, 0xb46e5f23),
+ //(0x3d66c8cb, 0xc0381503, 0x30678ced),
+ (0x3d9b70a7, 0xc02503d5, 0x32985d8e),
+ (0xffa9f86b, 0xffc00000, 0xffc00000),
+ (0xffb90a0e, 0xffc00000, 0xffc00000),
+ (0x3ac5e138, 0xc0cfddf1, 0xb3183fc3),
+ (0x3e4d6b8e, 0xbfcd9efd, 0xb34112b5),
+ (0x41c68585, 0x404d887f, 0x33ec7a86),
+ (0x3e1ff255, 0xbfeda61c, 0x32a4597e),
+ (0x40212290, 0x3f6c614a, 0xb260dbed),
+ (0x3ffef4dc, 0x3f306668, 0x3251ff94),
+ (0x3c2acf70, 0xc0920840, 0x3423757f),
+ (0x3eca034d, 0xbf6e1407, 0xb1e20a45),
+ (0x401f1faa, 0x3f692a1b, 0xb0e6f386),
+ (0xff924ed6, 0xffc00000, 0xffc00000),
+ (0x3dde9a94, 0xc00e07ca, 0x32cac4f6),
+ (0x3f5a3ea5, 0xbe2363d5, 0x31d40c16),
+ (0x3d0d721e, 0xc0576a15, 0xb3267b54),
+ (0x4096bb03, 0x3fc65e76, 0xb35d04c9),
+ (0x3c286893, 0xc0927c42, 0x3452ed38),
+ (0x409d6c90, 0x3fcbee38, 0xb0812022),
+ (0x410e42f3, 0x400bd853, 0xb34ea1b1),
+ (0x3c83eeb8, 0xc0841dae, 0x3361e7f0),
+ (0x3ec6be28, 0xbf724193, 0xb2e12e9f),
+ //(0x42a04df2, 0x408c4923, 0xb3123a17),
+ (0x3dae72a5, 0xc01da1ae, 0xb3ef93ad),
+ (0x40a564e7, 0x3fd24092, 0xb2b04b3a),
+ (0xffdf964f, 0xffc00000, 0xffc00000),
+ (0x3efbab20, 0xbf35d075, 0x30c29d03),
+ (0x3bd1c6ca, 0xc0a1a325, 0x34346c03),
+ (0x420b22d1, 0x40632566, 0xb3d096e6),
+ (0x427215df, 0x40834bbf, 0xb26b7be4),
+ (0x3cfe7d24, 0xc05e2f9e, 0xb3a67d1c),
+ (0x7ff7c700, 0xffc00000, 0xffc00000),
+ //(0x3b6d287b, 0xc0b3e460, 0x332811e1),
+ (0x3da95f09, 0xc01f858c, 0x3203e75d),
+ (0x40550116, 0x3f99e932, 0xb37c7f51),
+ (0x3ab855e3, 0xc0d222c6, 0x343b7062),
+ (0x3eb3b944, 0xbf8600f3, 0xb2aa74ad),
+ (0x3a832581, 0xc0dd07ad, 0xb392c172),
+ (0x3dededb8, 0xc009c4fe, 0x333fcc75),
+ (0x4241474c, 0x40782e7f, 0xb3b85075),
+ (0x3b51550e, 0xc0b7e2c6, 0x33c8ea73),
+ (0x3feb708b, 0x3f1c033a, 0xb16315be),
+ (0x4109dcd1, 0x4009d5b5, 0xb3bc8763),
+ (0x3f608f42, 0xbe062e61, 0x31df27bb),
+ (0x402b87ff, 0x3f7c62c8, 0x32400557),
+ (0x3c5161c6, 0xc08b844e, 0xb38b064e),
+ (0x3abdaffc, 0xc0d13851, 0x34226ee0),
+ (0x3e0a9b16, 0xbffffab0, 0x33153c0a),
+ (0x3f86422e, 0x3d4387c2, 0xb09e0810),
+ (0x3c4943fe, 0xc08cc82c, 0xb3b272a4),
+ (0x3c09995c, 0xc098f370, 0x337aae82),
+ (0x3dcc9670, 0xc0136e8d, 0xb2fa279c),
+ (0x3b4a5d8c, 0xc0b8f80d, 0xb44e2bdd),
+ (0x3b10ef54, 0xc0c3a677, 0x330fe58e),
+ (0x40a4299a, 0x3fd14ba4, 0x3373219b),
+ //(0x4174c7f5, 0x402e93e0, 0xb2e8d1a7),
+ (0x7fdafcd0, 0xffc00000, 0xffc00000),
+ (0x4141df1c, 0x401fa7a4, 0xb293e445),
+ (0x3eb7f2cc, 0xbf830798, 0x3326b7a4),
+ (0x3c4f0693, 0xc08be104, 0x33457d70),
+ (0x3a996967, 0xc0d80316, 0xb2f994c1),
+ (0x3f0b1dc5, 0xbf1c2043, 0x3255d211),
+ (0x7fa77367, 0xffc00000, 0xffc00000),
+ (0x42091ff2, 0x406236d6, 0x33aaeb0c),
+ //(0x3f9075dd, 0x3df7c1d6, 0x2b437f70),
+ (0x3ec76138, 0xbf716fdf, 0xb1cf4b2a),
+ (0x411e876a, 0x4012c639, 0x32cdf984),
+ (0x3e3435cb, 0xbfde616d, 0xb36a44c5),
+ (0x7fde9793, 0xffc00000, 0xffc00000),
+ (0x41903949, 0x4039154b, 0xb3e93b77),
+ (0x418a88de, 0x403681e8, 0xb1c35fed),
+ (0x42195823, 0x40695e79, 0x33da2247),
+ (0x3e23b534, 0xbfeaac7a, 0x333b9184),
+ (0x3f38811c, 0xbea7aeab, 0xb2035974),
+ (0x4149bb60, 0x402232cf, 0xb3a796da),
+ (0x41f1eb7a, 0x405a2fc2, 0xb36edf06),
+ (0x3d4581d4, 0xc0420c26, 0x3371bdc3),
+ (0x4167d2ce, 0x402b18c7, 0x33e01e7d),
+ //(0x414b2e8b, 0x4022a824, 0xb2d79adf),
+ (0x3ab71624, 0xc0d25a78, 0x3407187d),
+ (0xffc0330b, 0xffc00000, 0xffc00000),
+ (0x3bf24b92, 0xc09d0690, 0x34428dae),
+ (0x3b649052, 0xc0b512c2, 0x3364cef1),
+ (0x3d9075aa, 0xc029b420, 0x32fff057),
+ (0x4207cfdd, 0x40619939, 0xb190080f),
+ ][:]
+
+ for (x, y1, y2) : inputs
+ var xf : flt32 = std.flt32frombits(x)
+ var y1f : flt32 = std.flt32frombits(y1)
+ var y2f : flt32 = std.flt32frombits(y2)
+ var r1f, r2f
+ (r1f, r2f) = math.logoverkill32(xf)
+ testr.check(c, r1f == y1f && r2f == y2f,
+ "log(0x{b=16,w=8,p=0}) should be (0x{b=16,w=8,p=0}, 0x{b=16,w=8,p=0}), was (0x{b=16,w=8,p=0}, 0x{b=16,w=8,p=0})",
+ x, y1, y2, std.flt32bits(r1f), std.flt32bits(r2f))
+ ;;
+}
+
+const log02 = {c
+ var inputs : (uint64, uint64, uint64)[:] = [
+ (0x3f8a92cfe4c879dd, 0xc01160fa5e8a9274, 0xbc917bec31c59733),
+ (0x3fdcbde7752d4339, 0xbfe99df158595cd2, 0x3c5e051326211e5f),
+ (0x4049c21ac69da8fb, 0x400f890367199d9f, 0x3ca3c7bc43b3a7bf),
+ (0x3fdf7df87f170e95, 0xbfe6b15582a8021a, 0x3c59170acc3a93ac),
+ (0x3fa33efe56ef1925, 0xc00a3f8645c8921d, 0xbcab31ca72888736),
+ (0x3f8eb16a31ea9437, 0xc010cd65c2f67333, 0xbcb05fe9a1ef1357),
+ (0x3f974b5311aeae57, 0xc00e4420e231d7f0, 0xbc9d614ed9b94484),
+ (0x3fe28aed659dab73, 0xbfe1760d162fed7e, 0xbc64a0ff30250148),
+ (0x403273d9892e62d3, 0x40075255633e0533, 0xbc91eb9834046d7b),
+ (0x3fee1d239d2061d7, 0xbfaf1ad3961ab8ba, 0xbbc9bff82ae3fde7),
+ (0x3fbc0666ebc60265, 0xc001b257198142d0, 0xbca1cf93360a27f6),
+ (0x3f53267a24ceab6a, 0xc01b01c8ad09c3c1, 0xbca0d85af74df975),
+ (0x3fd2005446cb268e, 0xbff44b879f2ec561, 0x3c66e8eff64f40a1),
+ (0x404c495cb7ea6e6b, 0x401024631de2a59a, 0xbcb0dc3bd3a88f14),
+ (0x3fc37680a1b7c852, 0xbffe22e609516976, 0xbc9b49bc37601215),
+ (0x3fe26c01523f67a8, 0xbfe1ab96ed675629, 0xbc8cb9b8209aac7c),
+ (0x3facd27e97d8c4b3, 0xc007047576bb7dae, 0xbc8e4a38613d1b43),
+ (0x400f56fb7037ba7d, 0x3ff5d8de6ad03a3d, 0x3c8c1c5a6d1511a9),
+ (0x3fc570f145460d05, 0xbffc966460e1709c, 0x3c94af6de9212790),
+ (0x3fa809e6ed829a68, 0xc0087822f56ac395, 0x3c8678342157ac85),
+ (0x3f55823e1e200f42, 0xc01a8adac8967a45, 0x3cb7be1b6b7dd05e),
+ (0x405784744f92d247, 0x40122d177db51497, 0xbcb68ecdab1f3e8a),
+ (0x3fab4cbea9976254, 0xc0077399de5b1e6c, 0x3cab9184e0d9f693),
+ (0x3fdea414a2b5f667, 0xbfe791c90d39b559, 0x3c7fccdcd93c8396),
+ (0x4058c25cbee955fc, 0x401261c8d103d77f, 0x3cbb3825321123b7),
+ (0x3ff30000aa9a48c0, 0x3fc5ff34edfa415b, 0x3c05ef4da5cc3d05),
+ (0x3fccce5d53963f87, 0xbff7dcf30350acb1, 0x3c9f40d3134e3ad5),
+ (0x3fc670550e615cc0, 0xbffbdc1d2b8a9b59, 0xbc919aeccfafca2c),
+ (0x3f85b54e302d0f6a, 0xc012300dd72719c1, 0xbc89f5a3d1cdb2e1),
+ (0x402386e5450b51a7, 0x40023aab9fb1da19, 0x3ca2c7af66ae419b),
+ (0x3ff6959f36e58464, 0x3fd60f20e1496691, 0xbc6018ad87980799),
+ (0x3f600e959bdc4215, 0xc018f067a04992e6, 0xbc9e3fe4581527a9),
+ (0x3f568e6f133c989b, 0xc01a5a27ce23b3c6, 0x3ca06da90a0ee1a3),
+ (0x402c19721717a0ed, 0x4005240bf65143ae, 0x3cae244b763ae814),
+ (0x3fbb13aebb1a4839, 0xc001f8d370035c97, 0x3ca0b45e0003fc1b),
+ (0x4033102153118b25, 0x400794fdcf559b7e, 0xbc96afebbfc675de),
+ (0x3f537adaddec30d9, 0xc01af04f3bdc1d51, 0x3cb11840ac75087e),
+ (0x3f60bfa935803114, 0xc018c537670bd5fd, 0x3cbffed98ba70ea0),
+ (0x40359fdaa8225f24, 0x40089730e1a85732, 0x3c5a95afca1885f9),
+ (0x3fc175000ebd1ad0, 0xbfffe06913afd5fe, 0xbc987c68046a4b2b),
+ (0x3f972d70572c34da, 0xc00e4e6affcb0d69, 0x3c944da2b4a32e8b),
+ (0x404517d0a0f954f1, 0x400defccb99937fb, 0xbca1c136d4f696bd),
+ (0x3fdfb653305a1ee4, 0xbfe6784521bc975c, 0xbc740e603a0d13fa),
+ (0x3fae6163881229a6, 0xc00698a129128185, 0xbca942e03278e8ab),
+ (0x3fc9b9b239873842, 0xbff9ac3f1cc4718f, 0xbc92a473acf76c1a),
+ (0x4003e596f5fa87a5, 0x3fed27e34824474b, 0x3c7f931c3eb17713),
+ (0x3f745e0770c0cfac, 0xc0153720a362d3a6, 0x3cbc60e53acef442),
+ (0x3ff5475469f15847, 0x3fd23f517f2dbe19, 0x3c5121e6c154dd81),
+ (0x3f9e13761b9e29aa, 0xc00c38d1a2b3226f, 0xbc9005203d9c6052),
+ (0x40295d7732b1928b, 0x400452628d50de4a, 0xbc4ed3676ab974b5),
+ (0x3f62950d7337c671, 0xc0185ad612d393eb, 0x3cbeb1d1a30a44c1),
+ (0x3fd2bde1daa23c6a, 0xbff3a66c4d1ad065, 0x3c6923aafd35c010),
+ (0x405549c0fb4d986f, 0x4011c71bf85e6e4a, 0xbcb733ae08222b6e),
+ (0x3f5f4efc3afddc5d, 0xc0190a69efff1b0d, 0x3c9fbcb89efdcd50),
+ (0x404c494cad96babd, 0x40102460d9352cb6, 0x3cb4422e754c9f46),
+ (0x3feca6325d6abdee, 0xbfbc50f2c5fd3c19, 0xbc324eb80eb13744),
+ (0x4033b1089f54810d, 0x4007d76d5362f9ab, 0x3ca2797a76df7f34),
+ (0x3fb568b670585cfd, 0xc003d9d59db58016, 0x3c96e6b1033b5db7),
+ (0x3f7a4b16328a04e3, 0xc014319d5482cfa4, 0xbca5c490a1c64c36),
+ (0x403860ebb2cc92ab, 0x40098cb57f4e07d2, 0xbcac4c5cb21f0a09),
+ (0x3f8f699b31fc25d3, 0xc010b5ab9279b85e, 0xbc8f89278a4cb33e),
+ (0x402960d16df7e842, 0x4004537129c98c04, 0xbcaa05a89ebbaeb6),
+ (0x3f7b1ee11dc6efb9, 0xc01411e41d504509, 0x3cbc00518be08e27),
+ (0x3f689b59b7cedd72, 0xc0173b474018f50c, 0xbcb2bca19c384d9b),
+ (0x3fffea8b7062e501, 0x3fe618c73ad1ebbe, 0x3c7fd2ed59c5733b),
+ (0x3f5444bf49407dcb, 0xc01ac7ab90ee3df2, 0xbcae95724cec72f7),
+ (0x3fd9d0459601ade3, 0xbfed0e32876b58cc, 0x3c887d50c70101d8),
+ (0x3fa6149e59686077, 0xc009262661604245, 0x3caa9c6eae5c2bf0),
+ (0x400f2eef1f07ec55, 0x3ff5c45f2900b421, 0x3c9b3e071d8ba3fb),
+ (0x4049fe32d48952f4, 0x400f9b97bfc6db20, 0xbca2bde351b359d4),
+ (0x3fa71132a433e561, 0xc008cc9fa0c868b2, 0x3c7c8ee247413c51),
+ (0x3fe9e1080fe367d7, 0xbfcb2cbe2a374cc5, 0xbc66334680dbd7b4),
+ (0x3f82b3495c06976d, 0xc012c8c882486192, 0xbca509424bf6cd86),
+ (0x3f98b38fffa575d7, 0xc00dcc00f9c3d7b0, 0xbc97c42e61bc1601),
+ (0x3fd37b54f8eb6485, 0xbff307cca9e335a2, 0xbc8aa7fac0e02bcb),
+ (0x404ee2915c521fe7, 0x40107e617f569191, 0xbcbcc3b1d0962048),
+ (0x3f53f44cc701e150, 0xc01ad7abbb5f390d, 0xbcb0de2e1a1f03c6),
+ (0x401966b9da967882, 0x3ffd9379ec0424c4, 0x3c5bfd6202281021),
+ (0x4021d99afb55ff01, 0x400182c7b9c135e4, 0x3ca1f0c43451a487),
+ (0x3fb9ac4522c48658, 0xc00265de40742912, 0x3ca9241f75f83a46),
+ (0x4017ce46954f5151, 0x3ffc89c32c306e36, 0x3c94073d23df5f61),
+ (0x3f95f1017c2d7cae, 0xc00ebea8b58ec314, 0x3ca5ba6bc5a40177),
+ (0x40196a636809aa8c, 0x3ffd95c84f74799e, 0xbc966910c2a6b4a9),
+ (0x4032fb17a164e627, 0x40078c24c23d49e9, 0xbcabd33227e7440a),
+ (0x401527d0beb5b398, 0x3ffaa65372b6ed05, 0x3c9c092468a3d811),
+ (0x3fb0070ab95eda9b, 0xc0062abe686b059c, 0x3ca08e8de0559f9b),
+ (0x3ff7b00fd8bccd43, 0x3fd91c92bbe66ba2, 0x3c71ec38fe18a6f3),
+ (0x4021b9bc8823cfde, 0x4001747278d08206, 0xbca0dfe566dc4cf5),
+ (0x4014a5ce06d7df05, 0x3ffa42cc8df38d10, 0xbc5e54e0ca2ed44c),
+ (0x3f68d1447d5a29e8, 0xc017328d1195bac4, 0x3cb4fd5db024d1da),
+ (0x404963f9a80919eb, 0x400f6b9160b05ec4, 0x3c4526374db12c53),
+ ][:]
+
+ for (x, y1, y2) : inputs
+ var xf : flt64 = std.flt64frombits(x)
+ var r1f : flt64, r2f : flt64, r1u : uint64, r2u : uint64
+ (r1f, r2f) = math.logoverkill64(xf)
+ r1u = std.flt64bits(r1f)
+ r2u = std.flt64bits(r2f)
+
+ /* Cut ourselves some slack on the second component */
+ testr.check(c, r1u == y1 && (r2u & 0xfffffffffff00000) == (y2 & 0xfffffffffff00000),
+ "log(0x{b=16,w=16,p=0}) should be (0x{b=16,w=16,p=0}, 0x{b=16,w=16,p=0}), was (0x{b=16,w=16,p=0}, 0x{b=16,w=16,p=0})",
+ x, y1, y2, r1u, r2u)
+ ;;
+}
+