ref: e09c6b53f3b1928d3752c53c813025dbcbb976e0
parent: d52fd3b32109f12e529a7f5e380c5ee8f06aefcf
author: S. Gilles <[email protected]>
date: Sun Jun 9 09:47:22 EDT 2019
Use fma instead of rote multiplication in powr's final calculation. Since the exp() we use is sometimes wrong by 1 ulp, the final multiplication has a possibility of increasing that to 2 ulps. Using fma instead makes such a thing much less likely. This makes some of our off-by-0s off-by-1s, and vice versa, but on the whole it is a definite improvement.
--- a/lib/math/powr-impl.myr
+++ b/lib/math/powr-impl.myr
@@ -1,6 +1,7 @@
use std
use "fpmath"
+use "impls"
use "log-impl"
use "log-overkill"
use "util"
@@ -34,6 +35,7 @@
emin : @i
sgnmask : @u
log_overkill : (x : @f -> (@f, @f))
+ fma : (x : @f, y : @f, z : @f -> @f)
split_mul : (x_h : @f, x_l : @f, y_h : @f, y_l : @f -> (@f, @f))
exp_zero_border : @u
;;
@@ -51,6 +53,7 @@
.emin = -126,
.sgnmask = 1 << 31,
.log_overkill = logoverkill32,
+ .fma = fma32,
.split_mul = split_mul32,
.exp_zero_border = 0xc2cff1b4, /* minimal y such that e^y > 0 */
]
@@ -68,6 +71,7 @@
.emin = -1022,
.sgnmask = 1 << 63,
.log_overkill = logoverkill64,
+ .fma = fma64,
.split_mul = hl_mult,
.exp_zero_border = 0xc0874910d52d3052, /* minimal y such that e^y > 0 */
]
@@ -198,5 +202,5 @@
-> z_hi
;;
- -> z_hi * (1.0 + final_exp_lo)
+ -> d.fma(z_hi, final_exp_lo, z_hi)
}
--- a/lib/math/test/powr-impl.myr
+++ b/lib/math/test/powr-impl.myr
@@ -50,6 +50,13 @@
(0x000342cdeeb18fc9, 0xbe645d513ed4670d, 0x3ff0001c3d5eaa62),
(0x3e086c8c9160ccfe, 0xc027b2f7021e0508, 0x567134b75886e1ce),
(0x3d799a9014c5c710, 0xc0294dc8ea46772e, 0x5f068df47efc8583),
+ (0x3fca93f7f9ca25b6, 0x3ff5847df0338da2, 0x3fbee98550a085a0),
+ (0x3fabb5869cd3c59e, 0x3ff36f822e577f69, 0x3f9da0409e2f391c),
+ (0x3f77f7f7131769cb, 0x3feaf3f718dd7ebe, 0x3f8af6517cbfdc36),
+ (0x3fcac3d4060de9e2, 0x4005f5e897c9aeb8, 0x3f8be74e7fa8bdd6),
+ (0x406f0b978c302c5e, 0x4035705781be3d35, 0x4a97d0db24be1855),
+ (0x40644d87e8676e6c, 0x404b0add51b6a1db, 0x58c21ad2028c4bcc),
+ (0x4196f9c67efe9a1f, 0x3fbb8bebec6ccd29, 0x401ceae1ef1736e2),
/* x in (-0.1, 0.1), y in (-50, 50) */
(0x85c8e7c348119f30, 0x895b661cd3f6bf12, 0xfff8000000000000),
@@ -105,9 +112,7 @@
(0x41094a4327327bff, 0xc011131c033fdfd5, 0x3b3879e23ee63805),
(0x40f8a138d93dfa6f, 0xc001413283b35d5f, 0x3db1bbb215bf42ae),
(0x414539311cf511ff, 0xbfbca848c2d205ed, 0x3fc84fc70f28d73e),
- (0x417c6bc9e89d9c1d, 0xc01992c87dc70f2c, 0x36032a3faeb94526),
(0xfff3bc381986a30c, 0xbfdca95e011cebe9, 0xfff8000000000000),
- (0x41c7ac7f3f65ea65, 0xc01e7dbcfb5c724c, 0x31d8c5031b0424e3),
(0x412e44a5346a1be6, 0xbffdcb72cdad7356, 0x3d9dfbadec90e3fa),
(0x418e5e64d3edf4c6, 0xc01674baa108856c, 0x36d602089075174d),
(0x416b58e89ad71a84, 0xbfca78a8fcb854b3, 0x3fa0f412b7918686),
@@ -115,7 +120,6 @@
/* x and y both in (0, 10) */
(0x7ff693c2af30864a, 0x3fa16af316e2d88c, 0xfff8000000000000),
(0x3fdde3d0f357d227, 0x3fd4e93bd57a2981, 0x3fe8f3d387c3cfe2),
- (0x3fa351c48c3746ce, 0x3fe729ab4b99e801, 0x3fb7e116428f44c0),
(0x3fea1f57254b2d12, 0x3ff33aaf58690aa3, 0x3fe912f746775394),
(0x3fd19a840c634304, 0x3fba064281d93290, 0x3fec1099957576a6),
(0x3fce944eca01df35, 0x3f746940bfb8cacf, 0x3fefc5c3319cd7dd),
@@ -140,12 +144,10 @@
(0x3f6d1ed0f9c04a9f, 0x3f62ffb9e40680c7, 0x3fef958dbae35a0f),
(0xfffe3ddc1a7f09a7, 0x3f845a6294f3a890, 0xfff8000000000000),
(0x3ff041e289ac5b8c, 0x3f57490a9fd58f95, 0x3ff00017c7d7ad4b),
- (0x3fc9ffe250089ea1, 0x3ff88bf79a9dc33c, 0x3fb63170e54074b8),
(0x4020bc3c2d25bb94, 0xfff903dc5ef7b5da, 0xfff8000000000000),
(0x4003d467c525071c, 0x3f98c1fe89a5ecc4, 0x3ff05ae362b9a84a),
(0x3fc238b70f4e7216, 0x3f8a75e2fd6e64f4, 0x3fef343ee42df045),
(0x3fe001d1da500969, 0x400ff32df1b36945, 0x3fb0191d9c03801f),
- (0x3f55ee3142fec6bf, 0x401cdc101b6b2276, 0x3ba18abf782d7bc4),
(0x3ffb954dbc226673, 0x400408855a1fc270, 0x400f49e30f335764),
(0x400bb9f936dad207, 0xfffb58709426aaec, 0xfff8000000000000),
(0x3f7bbe6db78c8014, 0xfff26b1c8b0746a0, 0xfff8000000000000),
@@ -200,13 +202,6 @@
const powr04 = {c
var inputs : (uint64, uint64, uint64, uint64)[:] = [
(0x3f8627bbf0b2534e, 0x3fab532501422efb, 0x3fe921e86671e519, 0x3fe921e86671e518),
- (0x3fca93f7f9ca25b6, 0x3ff5847df0338da2, 0x3fbee98550a085a0, 0x3fbee98550a085a1),
- (0x3fabb5869cd3c59e, 0x3ff36f822e577f69, 0x3f9da0409e2f391c, 0x3f9da0409e2f391b),
- (0x3f77f7f7131769cb, 0x3feaf3f718dd7ebe, 0x3f8af6517cbfdc36, 0x3f8af6517cbfdc35),
- (0x3fcac3d4060de9e2, 0x4005f5e897c9aeb8, 0x3f8be74e7fa8bdd6, 0x3f8be74e7fa8bdd5),
- (0x406f0b978c302c5e, 0x4035705781be3d35, 0x4a97d0db24be1855, 0x4a97d0db24be1854),
- (0x40644d87e8676e6c, 0x404b0add51b6a1db, 0x58c21ad2028c4bcc, 0x58c21ad2028c4bcd),
- (0x4196f9c67efe9a1f, 0x3fbb8bebec6ccd29, 0x401ceae1ef1736e2, 0x401ceae1ef1736e1),
(0x41c84ac138a030ef, 0x3f91da7f2b3e4605, 0x3ff6e1b47e9ed782, 0x3ff6e1b47e9ed781),
(0x41ca9d0efec9e036, 0x3fa8f2f672d68769, 0x4005d70b6fe1084b, 0x4005d70b6fe1084a),
(0x40f949e1394ba90c, 0x3fb52c7ac7e9fb25, 0x4004cad8a0151dff, 0x4004cad8a0151e00),
@@ -214,6 +209,11 @@
(0x41242aa370d444b6, 0xbff214c0dfc7a867, 0x3e91c53dfd314590, 0x3e91c53dfd31458f),
(0x418f00a1df7a23d0, 0xc033fc3dd88f7505, 0x1f83a0afed046038, 0x1f83a0afed046039),
(0x4169769b9e71e521, 0xc00ddf0ecaa33de6, 0x3a6889acb36be574, 0x3a6889acb36be573),
+ (0x417c6bc9e89d9c1d, 0xc01992c87dc70f2c, 0x36032a3faeb94526, 0x36032a3faeb94525),
+ (0x41c7ac7f3f65ea65, 0xc01e7dbcfb5c724c, 0x31d8c5031b0424e3, 0x31d8c5031b0424e2),
+ (0x3fa351c48c3746ce, 0x3fe729ab4b99e801, 0x3fb7e116428f44c0, 0x3fb7e116428f44c1),
+ (0x3fc9ffe250089ea1, 0x3ff88bf79a9dc33c, 0x3fb63170e54074b8, 0x3fb63170e54074b9),
+ (0x3f55ee3142fec6bf, 0x401cdc101b6b2276, 0x3ba18abf782d7bc4, 0x3ba18abf782d7bc5),
][:]
for (x, y, z_perfect, z_accepted) : inputs