shithub: mc

Download patch

ref: 3665783a1d1fe03db70cca452c615b439310a329
parent: 0f956d6bb55e0455326f4627024c1fa8c8adbff8
parent: 37ba2ec87069c9ca9bfeac71ec4dba660fcadf60
author: Ori Bernstein <[email protected]>
date: Wed May 9 15:21:16 EDT 2018

Merge remote-tracking branch 'npnth/libmath'

diff: cannot open b/lib/math/test//null: file does not exist: 'b/lib/math/test//null' diff: cannot open b/lib/math//null: file does not exist: 'b/lib/math//null'
--- a/6/isel.c
+++ b/6/isel.c
@@ -939,8 +939,8 @@
 		/* These operators should never show up in the reduced trees,
 		 * since they should have been replaced with more primitive
 		 * expressions by now */
-	case Obad: case Opreinc: case Opostinc: case Opredec: case Otern:
-	case Opostdec: case Olor: case Oland: case Oaddeq:
+	case Obad: case Oauto: case Opreinc: case Opostinc: case Opredec:
+	case Otern: case Opostdec: case Olor: case Oland: case Oaddeq:
 	case Osubeq: case Omuleq: case Odiveq: case Omodeq: case Oboreq:
 	case Obandeq: case Obxoreq: case Obsleq: case Obsreq: case Omemb:
 	case Oslbase: case Osllen: case Ocast: case Outag: case Oudata:
--- a/lib/bld.sub
+++ b/lib/bld.sub
@@ -8,6 +8,7 @@
 	inifile
 	iter
 	json
+	math
 	regex
 	std
 	sys
--- /dev/null
+++ b/lib/math/bld.sub
@@ -1,0 +1,36 @@
+lib math =
+	fpmath.myr
+
+	# exp
+	exp-impl.myr
+
+	# rounding (to actual integers)
+	round-impl+posixy-x64-sse4.s
+	round-impl.myr
+
+	# fused-multiply-add
+	fma-impl+posixy-x64-fma.s
+	fma-impl.myr
+
+	# polynomial evaluation methods
+	poly-impl.myr
+
+	# scalb (multiply x by 2^m)
+	scale2-impl.myr
+
+	# sqrt
+	sqrt-impl+posixy-x64-sse2.s
+	sqrt-impl.myr
+
+	# summation
+	sum-impl.myr
+
+	# trunc, floor, ceil
+	trunc-impl+posixy-x64-sse4.s
+	trunc-impl.myr
+
+	# util
+	util.myr
+
+	lib ../std:std
+;;
--- /dev/null
+++ b/lib/math/exp-impl.myr
@@ -1,0 +1,396 @@
+use std
+
+use "fpmath"
+use "util"
+
+/*
+    See [Mul16] (6.2.1), [Tan89], and [Tan92]. While the exp and
+    expm1 functions are similar enough to be in the same file (Tang's
+    algorithms use the same S constants), they are not quite similar
+    enough to be in the same function.
+ */
+pkg math =
+	pkglocal const exp32 : (f : flt32 -> flt32)
+	pkglocal const exp64 : (f : flt64 -> flt64)
+
+	pkglocal const expm132 : (f : flt32 -> flt32)
+	pkglocal const expm164 : (f : flt64 -> flt64)
+;;
+
+extern const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
+extern const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
+extern const horner_polyu32 : (f : flt32, a : uint32[:] -> flt32)
+extern const horner_polyu64 : (f : flt64, a : uint64[:] -> flt64)
+
+type fltdesc(@f, @u, @i) = struct
+	explode : (f : @f -> (bool, @i, @u))
+	assem : (n : bool, e : @i, s : @u -> @f)
+	fma : (x : @f, y : @f, z : @f -> @f)
+	horner : (f : @f, a : @u[:] -> @f)
+	sgnmask : @u
+	tobits : (f : @f -> @u)
+	frombits : (u : @u -> @f)
+	inf : @u
+	nan : @u
+
+	/* For exp */
+	thresh_1_min : @u
+	thresh_1_max : @u
+	thresh_2 : @u
+	Ai : @u[:]
+
+	/* For expm1 */
+	thresh_tiny : @u
+	thresh_huge_neg : @u
+	Log3_4 : @u
+	Log5_4 : @u
+	Bi : @u[:]
+	precision : @u
+
+	/* For both */
+	nabs : @u
+	inv_L : @u
+	L1 : @u
+	L2 : @u
+	S : (@u, @u)[32]
+
+;;
+
+const desc32 : fltdesc(flt32, uint32, int32) = [
+	.explode = std.flt32explode,
+	.assem = std.flt32assem,
+	.fma = fma32,
+	.horner = horner_polyu32,
+	.sgnmask = (1 << 31),
+	.tobits = std.flt32bits,
+	.frombits = std.flt32frombits,
+	.inf = 0x7f800000,
+	.nan = 0x7fc00000,
+	.thresh_1_min = 0xc2cff1b4, /* ln(2^(-127 - 23)) ~= -103.9720770839917 */
+	.thresh_1_max = 0x42b17218, /* ln([2 - 2^(-24)] * 2^127) ~= 88.72283905206 */
+	.inv_L = 0x4238aa3b, /* L = log(2) / 32, this is 1/L in working precision */
+	.L1 = 0x3cb17200, /* L1 and L2 sum to give L in high precision, */
+	.L2 = 0x333fbe8e, /* and L1 has some trailing zeroes. */
+	.nabs = 9, /* L1 has 9 trailing zeroes in binary */
+	.Ai = [ /* Coefficients for approximating expm1 in a tiny range */
+		0x3f000044,
+		0x3e2aaaec,
+	][:],
+	.Log3_4 = 0xbe934b11, /* log(1 - 1/4) < x < log(1 + 1/4) */
+	.Log5_4 = 0x3e647fbf, /* triggers special expm1 case */
+	.thresh_tiny = 0x33000000, /* similar to thresh_1_{min,max}, but for expm1 */
+	.thresh_huge_neg = 0xc18aa122,
+	.Bi = [ /* Coefficients for approximating expm1 between log(3/4) and log(5/4) */
+		0x3e2aaaaa,
+		0x3d2aaaa0,
+		0x3c0889ff,
+		0x3ab64de5,
+		0x394ab327,
+	][:],
+	.S = [ /* Double-precise expansions of 2^(J/32) */
+		(0x3f800000, 0x00000000),
+		(0x3f82cd80, 0x35531585),
+		(0x3f85aac0, 0x34d9f312),
+		(0x3f889800, 0x35e8092e),
+		(0x3f8b95c0, 0x3471f546),
+		(0x3f8ea400, 0x36e62d17),
+		(0x3f91c3c0, 0x361b9d59),
+		(0x3f94f4c0, 0x36bea3fc),
+		(0x3f9837c0, 0x36c14637),
+		(0x3f9b8d00, 0x36e6e755),
+		(0x3f9ef500, 0x36c98247),
+		(0x3fa27040, 0x34c0c312),
+		(0x3fa5fec0, 0x36354d8b),
+		(0x3fa9a140, 0x3655a754),
+		(0x3fad5800, 0x36fba90b),
+		(0x3fb123c0, 0x36d6074b),
+		(0x3fb504c0, 0x36cccfe7),
+		(0x3fb8fb80, 0x36bd1d8c),
+		(0x3fbd0880, 0x368e7d60),
+		(0x3fc12c40, 0x35cca667),
+		(0x3fc56700, 0x36a84554),
+		(0x3fc9b980, 0x36f619b9),
+		(0x3fce2480, 0x35c151f8),
+		(0x3fd2a800, 0x366c8f89),
+		(0x3fd744c0, 0x36f32b5a),
+		(0x3fdbfb80, 0x36de5f6c),
+		(0x3fe0ccc0, 0x36776155),
+		(0x3fe5b900, 0x355cef90),
+		(0x3feac0c0, 0x355cfba5),
+		(0x3fefe480, 0x36e66f73),
+		(0x3ff52540, 0x36f45492),
+		(0x3ffa8380, 0x36cb6dc9),
+	],
+	.precision = 24, /* threshold to prevent underflow in a S[high] + 2^-m */
+]
+
+const desc64 : fltdesc(flt64, uint64, int64) = [
+	.explode = std.flt64explode,
+	.assem = std.flt64assem,
+	.fma = fma64,
+	.horner = horner_polyu64,
+	.sgnmask = (1 << 63),
+	.tobits = std.flt64bits,
+	.frombits = std.flt64frombits,
+	.inf = 0x7ff0000000000000,
+	.nan = 0x7ff8000000000000,
+	.thresh_1_min = 0xc0874910d52d3052, /* ln(2^(-1023 - 52)) ~= -745.1332191019 */
+	.thresh_1_max = 0x40862e42fefa39ef, /* ln([2 - 2^(-53)] * 2^1023) ~= 709.78271289 */
+	.inv_L = 0x40471547652b82fe, /* below as in exp32 */
+	.L1 = 0x3f962e42fef00000,
+	.L2 = 0x3d8473de6af278ed,
+	.nabs = 20,
+	.Ai = [
+		0x3fe0000000000000,
+		0x3fc5555555548f7c,
+		0x3fa5555555545d4e,
+		0x3f811115b7aa905e,
+		0x3f56c1728d739765,
+	][:],
+	.Log3_4 = 0xbfd269621134db93,
+	.Log5_4 = 0x3fcc8ff7c79a9a22,
+	.thresh_tiny = 0x3c90000000000000,
+	.thresh_huge_neg = 0xc042b708872320e1,
+	.Bi = [
+		0x3fc5555555555549,
+		0x3fa55555555554b6,
+		0x3f8111111111a9f3,
+		0x3f56c16c16ce14c6,
+		0x3f2a01a01159dd2d,
+		0x3efa019f635825c4,
+		0x3ec71e14bfe3db59,
+		0x3e928295484734ea,
+		0x3e5a2836aa646b96,
+	][:],
+	.S = [
+		(0x3ff0000000000000, 0x0000000000000000),
+		(0x3ff059b0d3158540, 0x3d0a1d73e2a475b4),
+		(0x3ff0b5586cf98900, 0x3ceec5317256e308),
+		(0x3ff11301d0125b40, 0x3cf0a4ebbf1aed93),
+		(0x3ff172b83c7d5140, 0x3d0d6e6fbe462876),
+		(0x3ff1d4873168b980, 0x3d053c02dc0144c8),
+		(0x3ff2387a6e756200, 0x3d0c3360fd6d8e0b),
+		(0x3ff29e9df51fdec0, 0x3d009612e8afad12),
+		(0x3ff306fe0a31b700, 0x3cf52de8d5a46306),
+		(0x3ff371a7373aa9c0, 0x3ce54e28aa05e8a9),
+		(0x3ff3dea64c123400, 0x3d011ada0911f09f),
+		(0x3ff44e0860618900, 0x3d068189b7a04ef8),
+		(0x3ff4bfdad5362a00, 0x3d038ea1cbd7f621),
+		(0x3ff5342b569d4f80, 0x3cbdf0a83c49d86a),
+		(0x3ff5ab07dd485400, 0x3d04ac64980a8c8f),
+		(0x3ff6247eb03a5580, 0x3cd2c7c3e81bf4b7),
+		(0x3ff6a09e667f3bc0, 0x3ce921165f626cdd),
+		(0x3ff71f75e8ec5f40, 0x3d09ee91b8797785),
+		(0x3ff7a11473eb0180, 0x3cdb5f54408fdb37),
+		(0x3ff82589994cce00, 0x3cf28acf88afab35),
+		(0x3ff8ace5422aa0c0, 0x3cfb5ba7c55a192d),
+		(0x3ff93737b0cdc5c0, 0x3d027a280e1f92a0),
+		(0x3ff9c49182a3f080, 0x3cf01c7c46b071f3),
+		(0x3ffa5503b23e2540, 0x3cfc8b424491caf8),
+		(0x3ffae89f995ad380, 0x3d06af439a68bb99),
+		(0x3ffb7f76f2fb5e40, 0x3cdbaa9ec206ad4f),
+		(0x3ffc199bdd855280, 0x3cfc2220cb12a092),
+		(0x3ffcb720dcef9040, 0x3d048a81e5e8f4a5),
+		(0x3ffd5818dcfba480, 0x3cdc976816bad9b8),
+		(0x3ffdfc97337b9b40, 0x3cfeb968cac39ed3),
+		(0x3ffea4afa2a490c0, 0x3cf9858f73a18f5e),
+		(0x3fff50765b6e4540, 0x3c99d3e12dd8a18b),
+	],
+	.precision = 53,
+]
+
+const exp32 = {f : flt32
+	-> expgen(f, desc32)
+}
+
+const exp64 = {f : flt64
+	-> expgen(f, desc64)
+}
+
+generic expgen = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i, roundable @f -> @i
+	var b = d.tobits(f)
+	var n, e, s
+	(n, e, s) = d.explode(f)
+
+	/*
+	   Detect if exp(f) would round to outside representability.
+	   We don't do bias adjustment, so Tang's thresholds can
+	   be tightened.
+	 */
+	if !n && b >= d.thresh_1_max
+		-> d.frombits(d.inf)
+	elif n && b > d.thresh_1_min
+		-> (0.0 : @f)
+	;;
+
+	/* Detect if exp(f) is close to 1. */
+	if (b & ~d.sgnmask) <= d.thresh_2
+		-> (1.0 : @f)
+	;;
+
+	/* Argument reduction to [ -ln(2)/64, ln(2)/64 ] */
+	var inv_L = d.frombits(d.inv_L)
+
+	var N = rn(f * inv_L)
+	var N2  = N % (32 : @i)
+	if N2 < 0
+		N2 += (32 : @i)
+	;;
+	var N1 = N - N2
+
+	var R1 : @f, R2 : @f
+	var Nf = (N : @f)
+
+	/*
+	   There are few enough significant bits that these are all
+	   exact, without need for an FMA. R1 + R2 will approximate
+	   (very well) f reduced into [ -ln(2)/64, ln(2)/64 ]
+	 */
+	if std.abs(N) >= (1 << d.nabs)
+		R1 = (f - (N1 : @f) * d.frombits(d.L1)) - ((N2 : @f) * d.frombits(d.L1))
+	else
+		R1 = f - (N : @f) * d.frombits(d.L1)
+	;;
+	R2 = -1.0 * (N : @f) * d.frombits(d.L2)
+
+	var M = (N1 : @i) / 32
+	var J = (N2 : std.size)
+
+	/* Compute a polynomial approximation of exp1m */
+	var R = R1 + R2
+	var Q = R * R * d.horner(R, d.Ai)
+	var P = R1 + (R2 + Q)
+
+	/* Expand out from reduced range */
+	var Su_hi, Su_lo
+	(Su_hi, Su_lo) = d.S[J]
+	var S_hi = d.frombits(Su_hi)
+	var S_lo = d.frombits(Su_lo)
+
+	var S = S_hi + S_lo
+	var unscaled = S_hi + (S_lo + (P * S))
+	var nu, eu, su
+	(nu, eu, su) = d.explode(unscaled)
+	var exp = d.assem(nu, eu + M, su)
+	if (d.tobits(exp) == 0)
+		/* Make sure we don't quite return 0 */
+		-> d.frombits(1)
+	;;
+
+	-> exp
+}
+
+const expm132 = {f : flt32
+	-> expm1gen(f, desc32)
+}
+
+const expm164 = {f : flt64
+	-> expm1gen(f, desc64)
+}
+
+generic expm1gen = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i, roundable @f -> @i
+	var b = d.tobits(f)
+	var n, e, s
+	(n, e, s) = d.explode(f)
+
+	/* Special cases: +/- 0, inf, NaN, tiny, and huge */
+	if (b & ~d.sgnmask == 0)
+		-> f
+	elif n && (b & ~d.sgnmask == d.inf)
+		-> (-1.0 : @f)
+	elif (b & ~d.sgnmask == d.inf)
+		-> f
+	elif std.isnan(f)
+		-> d.frombits(d.nan)
+	elif (b & ~d.sgnmask) <= d.thresh_tiny
+		var two_to_large = d.assem(false, 100, 0)
+		var two_to_small = d.assem(false, -100, 0)
+		var abs_f = d.assem(false, e, s)
+		-> (two_to_large * f + abs_f) * two_to_small
+	elif !n && b >= d.thresh_1_max /* exp(x) = oo <=> expm1(x) = oo, as it turns out */
+		-> d.frombits(d.inf)
+	elif n && b >= d.thresh_huge_neg
+		-> (-1.0 : @f)
+	;;
+
+	if ((n && b < d.Log3_4) || (!n && b < d.Log5_4))
+		/* Procedure 2 */
+
+		/* compute x^2  / 2 with extra precision */
+		var u = round(f, d)
+		var v = f - u
+		var y = u * u * (0.5 : @f)
+		var z = v * (f + u) * (0.5 : @f)
+		var q = f * f * f * d.horner(f, d.Bi)
+
+		var yn, ye, ys
+		(yn, ye, ys) = d.explode(y)
+		if (ye >= -7)
+			-> (u + y) + (q + (v  + z))
+		else
+			-> f + (y + (q + z))
+		;;
+	;;
+
+	/* Procedure 1 */
+	var inv_L = d.frombits(d.inv_L)
+
+	var N = rn(f * inv_L)
+	var N2 = N % (32 : @i)
+	if N2 < 0
+		N2 += (32 : @i)
+	;;
+	var N1 = N - N2
+
+	var R1 : @f, R2 : @f
+
+	/*
+	   As in the exp case, R1 + R2 very well approximates f
+	   reduced into [ -ln(2)/64, ln(2)/64 ]
+	 */
+	if std.abs(N) >= (1 << d.nabs)
+		R1 = (f - (N1 : @f) * d.frombits(d.L1)) - ((N2 : @f) * d.frombits(d.L1))
+	else
+		R1 = f - (N : @f) * d.frombits(d.L1)
+	;;
+	R2 = -1.0 * (N : @f) * d.frombits(d.L2)
+
+	var M = (N1 : @i) / 32
+	var J = (N2 : std.size)
+
+	/* Compute a polynomial approximation of exp1m */
+	var R = R1 + R2
+	var Q = R * R * d.horner(R, d.Ai)
+	var P = R1 + (R2 + Q)
+
+	/* Expand out */
+	var Su_hi, Su_lo
+	(Su_hi, Su_lo) = d.S[J]
+	var S_hi = d.frombits(Su_hi)
+	var S_lo = d.frombits(Su_lo)
+	var S = S_hi + S_lo
+
+	/*
+	   Note that [Tan92] includes cases to avoid tripping the
+	   underflow flag when not technically required. We currently
+	   do not care about such flags, so that logic is omitted.
+	 */
+	if M >= d.precision
+		-> scale2(S_hi + (S * P + (S_lo - scale2(1.0, -M))), M)
+	elif M <= -8
+		-> scale2(S_hi + (S * P + S_lo), M) - (1.0 : @f)
+	;;
+
+	-> scale2((S_hi - scale2(1.0, -M)) + (S_hi * P + S_lo * (P + (1.0 : @f))), M)
+}
+
+generic round = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i, roundable @f -> @i
+	var b = d.tobits(f)
+	var n, e, s
+	(n, e, s) = d.explode(f)
+	var mask = ~((1 << d.nabs) - 1)
+	if need_round_away(0, ((s & mask) : uint64), (d.nabs : int64) + 1)
+		-> d.assem(n, e, 1 + s & mask)
+	;;
+	-> d.assem(n, e, s & mask)
+}
--- /dev/null
+++ b/lib/math/fma-impl+posixy-x64-fma.s
@@ -1,0 +1,13 @@
+.globl math$fma32
+.globl _math$fma32
+math$fma32:
+_math$fma32:
+	vfmadd132ss %xmm1, %xmm2, %xmm0
+	ret
+
+.globl math$fma64
+.globl _math$fma64
+math$fma64:
+_math$fma64:
+	vfmadd132sd %xmm1, %xmm2, %xmm0
+	ret
--- /dev/null
+++ b/lib/math/fma-impl.myr
@@ -1,0 +1,446 @@
+use std
+
+use "util"
+
+pkg math =
+	pkglocal const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
+	pkglocal const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
+;;
+
+const exp_mask32 : uint32 = 0xff << 23
+const exp_mask64 : uint64 = 0x7ff << 52
+
+pkglocal const fma32 = {x : flt32, y : flt32, z : flt32
+	var xn, yn
+	(xn, _, _) = std.flt32explode(x)
+	(yn, _, _) = std.flt32explode(y)
+	var xd : flt64 = flt64fromflt32(x)
+	var yd : flt64 = flt64fromflt32(y)
+	var zd : flt64 = flt64fromflt32(z)
+	var prod : flt64 = xd * yd
+	var pn, pe, ps
+	(pn, pe, ps) = std.flt64explode(prod)
+	if pe == -1023
+		pe = -1022
+	;;
+	if pn != (xn != yn)
+		/* In case of NaNs, sign might not have been preserved */
+		pn = (xn != yn)
+		prod = std.flt64assem(pn, pe, ps)
+	;;
+
+	var r : flt64 = prod + zd
+	var rn, re, rs
+	(rn, re, rs) = std.flt64explode(r)
+
+	/*
+	   At this point, r is probably the correct answer. The
+	   only issue is the rounding.
+
+	   Ex 1: If x*y > 0 and z is a tiny, negative number, then
+	   adding z probably does no rounding. However, if
+	   truncating to 23 bits of precision would cause round-to-even,
+	   and that round would be upwards, then we need to remember
+	   those trailing bits of z and cancel the rounding.
+
+	   Ex 2: If x, y, z > 0, and z is small, with
+	                 last bit in flt64 |
+	          last bit in flt32 v      v
+	   x * y = ...............101011..11
+	       z =                          10000...,
+	   then x * y + z will be rounded to
+	           ...............101100..00,
+	   and then as a flt32 it will become
+	           ...............110,
+	   Even though, looking at the original bits, it doesn't
+	   "deserve" the final rounding.
+
+	   These can only happen if r is non-inf, non-NaN, and the
+	   lower 29 bits correspond to "exactly halfway".
+	 */
+	if re == 1024 || rs & 0x1fffffff != 0x10000000
+		-> flt32fromflt64(r)
+	;;
+
+	/*
+	   At this point, a rounding is about to happen. We need
+	   to know what direction that rounding is, so that we can
+	   tell if it's wrong. +1 means "away from 0", -1 means
+	   "towards 0".
+	 */
+	var zn, ze, zs
+	(zn, ze, zs) = std.flt64explode(zd)
+	var round_direction = 0
+	if rs & 0x20000000 == 0
+		round_direction = -1
+	else
+		round_direction = 1
+	;;
+
+	var smaller, larger, smaller_e, larger_e
+	if pe > ze || (pe == ze && ps > zs)
+		(smaller, larger, smaller_e, larger_e) = (zs, ps, ze, pe)
+	else
+		(smaller, larger, smaller_e, larger_e) = (ps, zs, pe, ze)
+	;;
+	var mask = shr((-1 : uint64), 64 - std.min(64, larger_e - smaller_e))
+	var prevent_rounding = false
+	if (round_direction > 0 && pn != zn) || (round_direction < 0 && pn == zn)
+		/*
+		   The prospective rounding disagrees with the
+		   signage. We are potentially in the case of Ex
+		   1.
+
+		   Look at the bits (of the smaller flt64) that are
+		   outside the range of r. If there are any such
+		   bits, we need to cancel the rounding.
+
+		   We certainly need to consider bits very far to
+		   the right, but there's an awkwardness concerning
+		   the bit just outside the flt64 range: it governed
+		   round-to-even, so it might have had an effect.
+		   We only care about bits which did not have an
+		   effect. Therefore, we perform the subtraction
+		   using only the bits from smaller that lie in
+		   larger's range, then check whether the result
+		   is susceptible to round-to-even.
+
+		   (Since we only care about the last bit, and the
+		   base is 2, subtraction or addition are equally
+		   useful.)
+		*/
+		if (larger ^ shr(smaller, larger_e - smaller_e)) & 0x1 == 0
+			prevent_rounding = smaller & mask != 0
+		;;
+	else
+		/*
+		   The prospective rounding agrees with the signage.
+		   We are potentially in the case of Ex 2.
+
+		   We just need to check if r was obtained by
+		   rounding in the addition step. In this case, we
+		   still check the smaller/larger, and we only
+		   care about round-to-even. Any
+		   rounding that happened previously is enough
+		   reason to disqualify this next rounding.
+		*/
+		prevent_rounding = (larger ^ shr(smaller, larger_e - smaller_e)) & 0x1 != 0
+	;;
+
+	if prevent_rounding
+		if round_direction > 0
+			rs--
+		else
+			rs++
+		;;
+	;;
+
+	-> flt32fromflt64(std.flt64assem(rn, re, rs))
+}
+
+pkglocal const fma64 = {x : flt64, y : flt64, z : flt64 /* fused multiply-add: x*y + z with one rounding */
+	var xn : bool, yn : bool, zn : bool
+	var xe : int64, ye : int64, ze : int64
+	var xs : uint64, ys : uint64, zs : uint64
+
+	var xb : uint64 = std.flt64bits(x)
+	var yb : uint64 = std.flt64bits(y)
+	var zb : uint64 = std.flt64bits(z)
+
+	/* check for both NaNs and infinities */
+	if xb & exp_mask64 == exp_mask64 || \
+	   yb & exp_mask64 == exp_mask64
+		-> x * y + z
+	elif z == 0.0 || z == -0.0 || x * y == 0.0 || x * y == -0.0
+		-> x * y + z
+	elif zb & exp_mask64 == exp_mask64
+		-> z
+	;;
+
+	(xn, xe, xs) = std.flt64explode(x)
+	(yn, ye, ys) = std.flt64explode(y)
+	(zn, ze, zs) = std.flt64explode(z)
+	if xe == -1023
+		xe = -1022
+	;;
+	if ye == -1023
+		ye = -1022
+	;;
+	if ze == -1023
+		ze = -1022
+	;;
+
+        /* Keep product in high/low uint64s */
+	var xs_h : uint64 = xs >> 32
+	var ys_h : uint64 = ys >> 32
+	var xs_l : uint64 = xs & 0xffffffff
+	var ys_l : uint64 = ys & 0xffffffff
+
+	var t_l : uint64 = xs_l * ys_l
+	var t_m : uint64 = xs_l * ys_h + xs_h * ys_l
+	var t_h : uint64 = xs_h * ys_h
+
+	var prod_l : uint64 = t_l + (t_m << 32)
+	var prod_h : uint64 = t_h + (t_m >> 32)
+	if t_l > prod_l
+		prod_h++ /* carry from low into high word */
+	;;
+
+	var prod_n = xn != yn
+	var prod_lastbit_e = (xe - 52) + (ye - 52)
+	var prod_first1 = find_first1_64_hl(prod_h, prod_l, 105)
+	var prod_firstbit_e = prod_lastbit_e + prod_first1
+
+	var z_firstbit_e = ze
+	var z_lastbit_e = ze - 52
+	var z_first1 = 52
+
+	/* subnormals could throw firstbit_e calculations out of whack */
+	if (zb & exp_mask64 == 0)
+		z_first1 = find_first1_64(zs, z_first1)
+		z_firstbit_e = z_lastbit_e + z_first1
+	;;
+
+	var res_n
+	var res_h = 0
+	var res_l = 0
+	var res_first1
+	var res_lastbit_e
+	var res_firstbit_e
+
+	if prod_n == zn
+		res_n = prod_n
+
+		/*
+		   Align prod and z so that the top bit of the
+		   result is either 53 or 54, then add.
+		 */
+		if prod_firstbit_e >= z_firstbit_e
+			/*
+			    [ prod_h ][ prod_l ]
+			         [ z...
+			 */
+			res_lastbit_e = prod_lastbit_e
+			(res_h, res_l) = (prod_h, prod_l)
+			(res_h, res_l) = add_shifted(res_h, res_l, zs, z_lastbit_e - prod_lastbit_e)
+		else
+			/*
+			        [ prod_h ][ prod_l ]
+			    [ z...
+			 */
+			res_lastbit_e = z_lastbit_e - 64
+			res_h = zs
+			res_l = 0
+			if prod_lastbit_e >= res_lastbit_e + 64
+				/* In this situation, prod must be extremely subnormal */
+				res_h += shl(prod_l, prod_lastbit_e - res_lastbit_e - 64)
+			elif prod_lastbit_e >= res_lastbit_e
+				res_h += shl(prod_h, prod_lastbit_e - res_lastbit_e)
+				res_h += shr(prod_l, res_lastbit_e + 64 - prod_lastbit_e)
+				res_l += shl(prod_l, prod_lastbit_e - res_lastbit_e)
+			elif prod_lastbit_e + 64 >= res_lastbit_e
+				res_h += shr(prod_h, res_lastbit_e - prod_lastbit_e)
+				var l1 = shl(prod_h, prod_lastbit_e + 64 - res_lastbit_e)
+				var l2 = shr(prod_l, res_lastbit_e - prod_lastbit_e)
+				res_l = l1 + l2
+				if res_l < l1
+					res_h++ /* carry from low into high word */
+				;;
+			elif prod_lastbit_e + 128 >= res_lastbit_e
+				res_l += shr(prod_h, res_lastbit_e - prod_lastbit_e - 64)
+			;;
+		;;
+	else
+		match compare_hl_z(prod_h, prod_l, prod_firstbit_e, prod_lastbit_e, zs, z_firstbit_e, z_lastbit_e)
+		| `std.Equal: -> 0.0
+		| `std.Before:
+			/* prod > z */
+			res_n = prod_n
+			res_lastbit_e = prod_lastbit_e
+			(res_h, res_l) = sub_shifted(prod_h, prod_l, zs, z_lastbit_e - prod_lastbit_e)
+		| `std.After:
+			/* z > prod */
+			res_n = zn
+			res_lastbit_e = z_lastbit_e - 64
+			(res_h, res_l) = sub_shifted(zs, 0, prod_h, prod_lastbit_e + 64 - (z_lastbit_e - 64))
+			(res_h, res_l) = sub_shifted(res_h, res_l, prod_l, prod_lastbit_e - (z_lastbit_e - 64))
+		;;
+	;;
+
+	res_first1 = 64 + find_first1_64(res_h, 55)
+	if res_first1 == 63
+		res_first1 = find_first1_64(res_l, 63)
+	;;
+	res_firstbit_e = res_first1 + res_lastbit_e
+
+	/*
+	   Finally, res_h and res_l are the high and low bits of
+	   the result. They now need to be assembled into a flt64.
+	   Subnormals and infinities could be a problem.
+	 */
+	var res_s = 0
+	if res_firstbit_e <= -1023
+		/* Subnormal case */
+		if res_lastbit_e + 128 < 12 - 1022
+			res_s = shr(res_h, 12 - 1022 - (res_lastbit_e + 128))
+			res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
+		elif res_lastbit_e + 64 < 12 - 1022
+			res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
+			res_s |= shr(res_l, 12 - 1022 - (res_lastbit_e + 64))
+		else
+			res_s = shl(res_h, -12 + (res_lastbit_e + 128) - (-1022))
+			res_s |= shl(res_l, -12 + (res_lastbit_e + 64) - (-1022))
+		;;
+
+		if need_round_away(res_h, res_l, res_first1 + (-1074 - res_firstbit_e))
+			res_s++
+		;;
+
+		/* No need for exponents, they are all zero */
+		var res = res_s
+		if res_n
+			res |= (1 << 63)
+		;;
+		-> std.flt64frombits(res)
+	;;
+
+	if res_firstbit_e >= 1024
+		/* Infinity case */
+		if res_n
+			-> std.flt64frombits(0xfff0000000000000)
+		else
+			-> std.flt64frombits(0x7ff0000000000000)
+		;;
+	;;
+
+	if res_first1 - 52 >= 64
+		res_s = shr(res_h, (res_first1 : int64) - 64 - 52)
+		if need_round_away(res_h, res_l, res_first1 - 52)
+			res_s++
+		;;
+	elif res_first1 - 52 >= 0
+		res_s = shl(res_h, 64 - (res_first1 - 52))
+		res_s |= shr(res_l, res_first1 - 52)
+		if need_round_away(res_h, res_l, res_first1 - 52)
+			res_s++
+		;;
+	else
+		res_s = shl(res_h, res_first1 - 52)
+	;;
+
+	/* The res_s++s might have messed everything up */
+	if res_s & (1 << 53) != 0
+		res_s >>= 1 /* renormalize; was `res_s >= 1`, a no-op comparison */
+		res_firstbit_e++
+		if res_firstbit_e >= 1024
+			if res_n
+				-> std.flt64frombits(0xfff0000000000000)
+			else
+				-> std.flt64frombits(0x7ff0000000000000)
+			;;
+		;;
+	;;
+
+	-> std.flt64assem(res_n, res_firstbit_e, res_s)
+}
+
+/*
+   Add (a << s) to [ h ][ l ], where if s < 0 then a corresponding
+   right-shift is used. This is aligned such that if s == 0, then
+   the result is [ h ][ l + a ]
+ */
+const add_shifted = {h : uint64, l : uint64, a : uint64, s : int64
+	if s >= 64
+		-> (h + shl(a, s - 64), l)
+	elif s >= 0
+		var new_h = h + shr(a, 64 - s)
+		var sa = shl(a, s)
+		var new_l = l + sa
+		if new_l < l
+			new_h++
+		;;
+		-> (new_h, new_l)
+	else
+		var new_h = h
+		var sa = shr(a, -s)
+		var new_l = l + sa
+		if new_l < l
+			new_h++
+		;;
+		-> (new_h, new_l)
+	;;
+}
+
+/* As above, but subtract (a << s) */
+const sub_shifted = {h : uint64, l : uint64, a : uint64, s : int64
+	if s >= 64
+		-> (h - shl(a, s - 64), l)
+	elif s >= 0
+		var new_h = h - shr(a, 64 - s)
+		var sa = shl(a, s)
+		var new_l = l - sa
+		if sa > l
+			new_h--
+		;;
+		-> (new_h, new_l)
+	else
+		var new_h = h
+		var sa = shr(a, -s)
+		var new_l = l - sa
+		if sa > l
+			new_h--
+		;;
+		-> (new_h, new_l)
+	;;
+}
+
+const compare_hl_z = {h : uint64, l : uint64, hl_firstbit_e : int64, hl_lastbit_e : int64, z : uint64, z_firstbit_e : int64, z_lastbit_e : int64
+	if hl_firstbit_e > z_firstbit_e
+		-> `std.Before
+	elif hl_firstbit_e < z_firstbit_e
+		-> `std.After
+	;;
+
+	var h_k : int64 = (hl_firstbit_e - hl_lastbit_e - 64)
+	var z_k : int64 = (z_firstbit_e - z_lastbit_e)
+	while h_k >= 0 && z_k >= 0
+		var h1 = h & shl(1, h_k) != 0
+		var z1 = z & shl(1, z_k) != 0
+		if h1 && !z1
+			-> `std.Before
+		elif !h1 && z1
+			-> `std.After
+		;;
+		h_k--
+		z_k--
+	;;
+
+	if z_k < 0
+		if (h & shr((-1 : uint64), 64 - h_k) != 0) || (l != 0)
+			-> `std.Before
+		else
+			-> `std.Equal
+		;;
+	;;
+
+	var l_k : int64 = 63
+	while l_k >= 0 && z_k >= 0
+		var l1 = l & shl(1, l_k) != 0
+		var z1 = z & shl(1, z_k) != 0
+		if l1 && !z1
+			-> `std.Before
+		elif !l1 && z1
+			-> `std.After
+		;;
+		l_k--
+		z_k--
+	;;
+
+	if (z_k < 0) && (l & shr((-1 : uint64), 64 - l_k) != 0)
+		-> `std.Before
+	elif (l_k < 0) && (z & shr((-1 : uint64), 64 - z_k) != 0)
+		-> `std.After
+	;;
+
+	-> `std.Equal
+}
--- /dev/null
+++ b/lib/math/fpmath.myr
@@ -1,0 +1,150 @@
+use std
+
+pkg math =
+	trait fpmath @f =
+
+		/* exp-impl */
+		exp : (f : @f -> @f)
+		expm1 : (f : @f -> @f)
+
+		/* fma-impl */
+		fma : (x : @f, y : @f, z : @f -> @f)
+
+		/* poly-impl */
+		horner_poly : (x : @f, a : @f[:] -> @f)
+		horner_polyu : (x : @f, a : @u[:] -> @f)
+
+		/* scale2-impl */
+		scale2 : (f : @f, m : @i -> @f)
+
+		/* sqrt-impl */
+		sqrt : (f : @f -> @f)
+
+		/* sum-impl */
+		kahan_sum : (a : @f[:] -> @f)
+		priest_sum : (a : @f[:] -> @f)
+
+		/* trunc-impl */
+		trunc : (f : @f -> @f)
+		ceil  : (f : @f -> @f)
+		floor : (f : @f -> @f)
+	;;
+
+	trait roundable @f -> @i =
+		/* round-impl */
+		rn : (f : @f -> @i)
+	;;
+
+	impl std.equatable flt32
+	impl std.equatable flt64
+	impl roundable flt64 -> int64
+	impl roundable flt32 -> int32
+	impl fpmath flt32
+	impl fpmath flt64
+;;
+
+/*
+   We consider two floating-point numbers equal if their bits are
+   equal. This does not treat NaNs specially: two distinct NaNs may
+   compare equal, or they may compare distinct (if they arise from
+   different bit patterns).
+
+   Additionally, +0.0 and -0.0 compare differently.
+ */
+impl std.equatable flt32 =
+	eq = {a : flt32, b : flt32; -> std.flt32bits(a) == std.flt32bits(b)}
+;;
+
+impl std.equatable flt64 =
+	eq = {a : flt64, b : flt64; -> std.flt64bits(a) == std.flt64bits(b)}
+;;
+
+impl roundable flt32 -> int32 =
+	rn = {f : flt32; -> rn32(f) }
+;;
+
+impl roundable flt64 -> int64 =
+	rn = {f : flt64; -> rn64(f) }
+;;
+
+impl fpmath flt32 =
+	fma = {x, y, z; -> fma32(x, y, z)}
+
+	exp = {f; -> exp32(f)}
+	expm1 = {f; -> expm132(f)}
+
+	horner_poly = {f, a; -> horner_poly32(f, a)}
+	horner_polyu = {f, a; -> horner_polyu32(f, a)}
+
+	scale2 = {f, m; -> scale232(f, m)}
+
+	sqrt = {f; -> sqrt32(f)}
+
+	kahan_sum = {l; -> kahan_sum32(l) }
+	priest_sum = {l; -> priest_sum32(l) }
+
+	trunc = {f; -> trunc32(f)}
+	floor = {f; -> floor32(f)}
+	ceil  = {f; -> ceil32(f)}
+
+;;
+
+impl fpmath flt64 =
+	fma = {x, y, z; -> fma64(x, y, z)}
+
+	exp = {f; -> exp64(f)}
+	expm1 = {f; -> expm164(f)}
+
+	horner_poly = {f, a; -> horner_poly64(f, a)}
+	horner_polyu = {f, a; -> horner_polyu64(f, a)}
+
+	scale2 = {f, m; -> scale264(f, m)}
+
+	sqrt = {f; -> sqrt64(f)}
+
+	kahan_sum = {l; -> kahan_sum64(l) }
+	priest_sum = {l; -> priest_sum64(l) }
+
+	trunc = {f; -> trunc64(f)}
+	floor = {f; -> floor64(f)}
+	ceil  = {f; -> ceil64(f)}
+;;
+
+extern const rn32 : (f : flt32 -> int32)
+extern const rn64 : (f : flt64 -> int64)
+
+extern const exp32 : (x : flt32 -> flt32)
+extern const exp64 : (x : flt64 -> flt64)
+
+extern const expm132 : (x : flt32 -> flt32)
+extern const expm164 : (x : flt64 -> flt64)
+
+extern const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
+extern const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
+
+extern const horner_poly32 : (f : flt32, a : flt32[:] -> flt32)
+extern const horner_poly64 : (f : flt64, a : flt64[:] -> flt64)
+
+extern const horner_polyu32 : (f : flt32, a : uint32[:] -> flt32)
+extern const horner_polyu64 : (f : flt64, a : uint64[:] -> flt64)
+
+extern const scale232 : (f : flt32, m : int32 -> flt32)
+extern const scale264 : (f : flt64, m : int64 -> flt64)
+
+extern const sqrt32 : (x : flt32 -> flt32)
+extern const sqrt64 : (x : flt64 -> flt64)
+
+extern const kahan_sum32 : (l : flt32[:] -> flt32)
+extern const kahan_sum64 : (l : flt64[:] -> flt64)
+
+extern const priest_sum32 : (l : flt32[:] -> flt32)
+extern const priest_sum64 : (l : flt64[:] -> flt64)
+
+extern const trunc32 : (f : flt32 -> flt32)
+extern const trunc64 : (f : flt64 -> flt64)
+
+extern const floor32 : (f : flt32 -> flt32)
+extern const floor64 : (f : flt64 -> flt64)
+
+extern const ceil32  : (f : flt32 -> flt32)
+extern const ceil64  : (f : flt64 -> flt64)
--- /dev/null
+++ b/lib/math/poly-impl.myr
@@ -1,0 +1,47 @@
+use std
+
+/* See [Mul16], section 5.1 */
+pkg math =
+        pkglocal const horner_poly32 : (f : flt32, a : flt32[:] -> flt32)
+        pkglocal const horner_poly64 : (f : flt64, a : flt64[:] -> flt64)
+
+        pkglocal const horner_polyu32 : (f : flt32, a : uint32[:] -> flt32)
+        pkglocal const horner_polyu64 : (f : flt64, a : uint64[:] -> flt64)
+;;
+
+extern const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
+extern const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
+
+const horner_poly32 = {f : flt32, a : flt32[:]
+        var r : flt32 = 0.0
+        for var j = a.len - 1; j >= 0; j--
+                r = fma32(r, f, a[j])
+        ;;
+        -> r
+}
+
+const horner_poly64 = {f : flt64, a : flt64[:]
+        var r : flt64 = 0.0
+        for var j = a.len - 1; j >= 0; j--
+                r = fma64(r, f, a[j])
+        ;;
+        -> r
+}
+
+const horner_polyu32 = {f : flt32, a : uint32[:]
+        var r : flt32 = 0.0
+        for var j = a.len - 1; j >= 0; j--
+                r = fma32(r, f, std.flt32frombits(a[j]))
+        ;;
+        -> r
+}
+
+const horner_polyu64 = {f : flt64, a : uint64[:]
+        var r : flt64 = 0.0
+        for var j = a.len - 1; j >= 0; j--
+                r = fma64(r, f, std.flt64frombits(a[j]))
+        ;;
+        -> r
+}
+
+/* TODO: add Knuth's method */
--- /dev/null
+++ b/lib/math/references
@@ -1,0 +1,27 @@
+References
+
+[KM06]
+Peter Kornerup and Jean-Michel Muller. “Choosing starting values
+for certain Newton–Raphson iterations”. In: Theoretical Computer
+Science 351 (1 2006), pp. 101–110. doi:
+https://doi.org/10.1016/j.tcs.2005.09.056.
+
+[Mul+10]
+Jean-Michel Muller et al. Handbook of floating-point arithmetic.
+Boston: Birkhäuser, 2010. isbn: 9780817647049.
+
+[Mul16]
+J. M. Muller. Elementary functions : algorithms and implementation.
+Third edition. New York: Birkhäuser, 2016. isbn: 9781489979810.
+
+[Tan89]
+Ping-Tak Peter Tang. “Table-driven Implementation of the Exponential
+Function in IEEE Floating-point Arithmetic”. In: ACM Trans. Math.
+Softw. 15.2 (June 1989), pp. 144–157. issn: 0098-3500.  doi:
+10.1145/63522.214389. url: http://doi.acm.org/10.1145/63522.214389.
+
+[Tan92]
+Ping Tak Peter Tang. “Table-driven Implementation of the Expm1
+Function in IEEE Floating- point Arithmetic”. In: ACM Trans. Math.
+Softw. 18.2 (June 1992), pp. 211–222. issn: 0098-3500.  doi:
+10.1145/146847.146928. url: http://doi.acm.org/10.1145/146847.146928.
--- /dev/null
+++ b/lib/math/round-impl+posixy-x64-sse4.s
@@ -1,0 +1,32 @@
+.globl math$rn32
+.globl _math$rn32
+math$rn32:
+_math$rn32:
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$16, %rsp
+	fnstcw	(%rsp)
+	fnstcw	8(%rsp)
+	andl	$0xf3ff, 8(%rsp)
+	fldcw	8(%rsp)
+	cvtss2si	%xmm0, %eax
+	fldcw	(%rsp)
+	leave
+	ret
+
+.globl math$rn64
+.globl _math$rn64
+math$rn64:
+_math$rn64:
+	pushq	%rbp
+	movq	%rsp, %rbp
+	subq	$16, %rsp
+	fnstcw	(%rsp)
+	fnstcw	8(%rsp)
+	andl	$0xf3ff, 8(%rsp)
+	fldcw	8(%rsp)
+	cvtsd2si	%xmm0, %rax
+	fldcw	(%rsp)
+	leave
+	ret
+
--- /dev/null
+++ b/lib/math/round-impl.myr
@@ -1,0 +1,70 @@
+use std
+
+use "util"
+
+pkg math =
+	const rn64 : (f : flt64 -> int64)
+	const rn32 : (f : flt32 -> int32)
+;;
+
+const rn64 = {f : flt64
+	var n : bool, e : int64, s : uint64
+
+	(n, e, s) = std.flt64explode(f)	/* sign, unbiased exponent, significand */
+
+	if e >= 63	/* |f| >= 2^63 (incl. inf/NaN bit patterns): out of int64 range; tests pin int64 min */
+		-> -9223372036854775808
+	elif e >= 52	/* already integral: the 52-bit significand shifts left without losing bits */
+		var shifted : int64 = (( s << (e - 52 : uint64)) : int64)
+		if n
+			-> -1 * shifted
+		else
+			-> shifted
+		;;
+	elif e < -1	/* |f| < 0.5: nearest integer is 0 */
+		-> 0
+	;;
+
+	/* fractional case: drop the low (52 - e) fraction bits... */
+	var integral_s = (s >> (52 - e : uint64) : int64)
+
+	/* ...then bump by one when the discarded bits require rounding away (presumably round-to-nearest-even — confirm against need_round_away in util) */
+	if need_round_away(0, s, 52 - e)
+		integral_s++
+	;;
+
+	if n
+		-> -integral_s
+	else
+		-> integral_s
+	;;
+}
+
+const rn32 = {f : flt32
+	var n : bool, e : int32, s : uint32
+
+	(n, e, s) = std.flt32explode(f)
+
+	if e >= 31
+		-> -2147483648
+	elif e >= 23
+		var shifted : int32 = (( s << (e - 23 : uint32)) : int32)
+		if n
+			->  -1 * shifted
+		else
+			->  shifted
+		;;
+	elif e < -1
+		-> 0
+	;;
+
+	var integral_s = (s >> (23 - e : uint32) : int32)
+
+	if need_round_away(0, (s : uint64), (23 - e : int64))
+		integral_s++
+	;;
+
+	if n
+		-> -integral_s
+	else
+		-> integral_s
+	;;
+}
--- /dev/null
+++ b/lib/math/scale2-impl.myr
@@ -1,0 +1,73 @@
+use std
+
+use "fpmath"
+use "util"
+
+/*
+   The scaleB function recommended by the 2008 revision of IEEE
+   754. Since only integer exponents are considered, the naive
+   approach works quite well.
+ */
+pkg math =
+	const scale232 : (f : flt32, m : int32 -> flt32)
+	const scale264 : (f : flt64, m : int64 -> flt64)
+;;
+
+const scale232 = {f : flt32, m : int32
+	var n, e, s
+	(n, e, s) = std.flt32explode(f)
+	(n, e, s) = scale2gen(n, e, s, -126, 127, 24, m)
+	-> std.flt32assem(n, e, s)
+}
+
+const scale264 = {f : flt64, m : int64
+	var n, e, s
+	(n, e, s) = std.flt64explode(f)
+	(n, e, s) = scale2gen(n, e, s, -1022, 1023, 53, m)
+	-> std.flt64assem(n, e, s)
+}
+
+generic scale2gen = {n : bool, e : @i, s : @u, emin : @i, emax : @i, p : @u, m : @i :: numeric, integral @i, numeric, integral @u
+	if e < emin && s == 0	/* +/- 0.0 is unchanged by any scaling */
+		-> (n, e, s)
+	elif m == 0	/* scaling by 2^0 is the identity */
+		-> (n, e, s)
+	elif m < 0	/* shrinking: right-shifting the significand may require rounding */
+		var sprime = s
+		var eprime = e
+		if e < emin	/* subnormal input stays (or becomes more) subnormal */
+			sprime = s >> (-m)
+			if need_round_away(0, (s : uint64), (-m : int64))
+				sprime++
+			;;
+			eprime = emin - 1	/* e = emin - 1 marks a subnormal result for flt*assem */
+		elif e + m < emin	/* normal input scaled down into the subnormal range */
+			sprime = s >> (emin - m - e)
+			if need_round_away(0, (s : uint64), ((emin - m - e) : int64))
+				sprime++
+			;;
+			eprime = emin - 1
+		else	/* result stays normal: exponent arithmetic only, no bits lost */
+			eprime = e + m
+		;;
+		-> (n, eprime, sprime)
+	;;
+
+	/* m > 0 from here. A subnormal input may first need normalizing. */
+	if e < emin
+		var first_1 = find_first1_64((s : uint64), (p : int64))
+		var shift = p - (first_1 : @u) - 1	/* distance to put the leading 1 in the implicit-bit position */
+		if m >= (shift : @i)	/* enough headroom to fully normalize */
+			s = s << shift
+			m -= (shift : @i)
+			e = emin
+		else	/* still subnormal after scaling: shift in place and return */
+			-> (n, e, s << m)
+		;;
+	;;
+
+	if e + m > emax && s != 0	/* overflow: e = emax + 1 with bare implicit bit — presumably assembles to +/-inf; confirm against flt*assem */
+		-> (n, emax + 1, 1 << (p - 1))
+	;;
+
+	-> (n, e + m, s)
+}
--- /dev/null
+++ b/lib/math/sqrt-impl+posixy-x64-sse2.s
@@ -1,0 +1,13 @@
+.globl math$sqrt32
+.globl _math$sqrt32
+math$sqrt32:
+_math$sqrt32:
+	sqrtss %xmm0, %xmm0
+	ret
+
+.globl math$sqrt64
+.globl _math$sqrt64
+math$sqrt64:
+_math$sqrt64:
+	sqrtsd %xmm0, %xmm0
+	ret
--- /dev/null
+++ b/lib/math/sqrt-impl.myr
@@ -1,0 +1,189 @@
+use std
+
+use "fpmath"
+
+/* See [Mul+10], sections 5.4 and 8.7 */
+pkg math =
+	pkglocal const sqrt32 : (f : flt32 -> flt32)
+	pkglocal const sqrt64 : (f : flt64 -> flt64)
+;;
+
+extern const fma32 : (x : flt32, y : flt32, z : flt32 -> flt32)
+extern const fma64 : (x : flt64, y : flt64, z : flt64 -> flt64)
+
+type fltdesc(@f, @u, @i) = struct
+	explode : (f : @f -> (bool, @i, @u))
+	assem : (n : bool, e : @i, s : @u -> @f)
+	fma : (x : @f, y : @f, z : @f -> @f)
+	tobits : (f : @f -> @u)
+	frombits : (u : @u -> @f)
+	nan : @u
+	emin : @i
+	emax : @i
+	normmask : @u
+	sgnmask : @u
+	ab : (@u, @u)[:]
+	iterlim : int
+;;
+
+/*
+   The starting point of the N-R iteration of 1/sqrt, after significand
+   has been normalized to [1, 4).
+
+   See [KM06] for the construction and notation. Case p = -2. The
+   dividers (left values) are chosen roughly so that maximal error
+   of N-R, after 3 iterations, starting with the right value, is
+   less than an ulp (of the result). If g falls in [a_i, a_{i+1}),
+   N-R should start with b_{i+1}.
+
+   In the flt64 case, we need only one more iteration.
+ */
+const ab32 : (uint32, uint32)[:] = [
+	(0x3f800000, 0x3f800000), /* Nothing should ever get normalized to < 1.0 */
+	(0x3fa66666, 0x3f6f30ae), /* [1.0,  1.3 ) -> 0.9343365431 */
+	(0x3fd9999a, 0x3f5173ca), /* [1.3,  1.7 ) -> 0.8181730509 */
+	(0x40100000, 0x3f3691d3), /* [1.7,  2.25) -> 0.713162601  */
+	(0x40333333, 0x3f215342), /* [2.25, 2.8 ) -> 0.6301766634 */
+	(0x4059999a, 0x3f118e0e), /* [2.8,  3.4 ) -> 0.5685738325 */
+	(0x40800000, 0x3f053049), /* [3.4,  4.0 ) -> 0.520268023  */
+][:]
+
+const ab64 : (uint64, uint64)[:] = [
+	(0x3ff0000000000000, 0x3ff0000000000000), /* < 1.0 */
+	(0x3ff3333333333333, 0x3fee892ce1608cbc), /* [1.0,  1.2)  -> 0.954245033445111356940060431953 */
+	(0x3ff6666666666666, 0x3fec1513a2184094), /* [1.2,  1.4)  -> 0.877572838393478438234751592972 */
+	(0x3ffc000000000000, 0x3fe9878eb3e9ba20), /* [1.4,  1.75) -> 0.797797538178034670863780775107 */
+	(0x400199999999999a, 0x3fe6ccb14eeb238d), /* [1.75, 2.2)  -> 0.712486890924184046447464879748 */
+	(0x400599999999999a, 0x3fe47717c17cd34f), /* [2.2,  2.7)  -> 0.639537694840876969060161627567 */
+	(0x400b333333333333, 0x3fe258df212a8e9a), /* [2.7,  3.4)  -> 0.573348583963212421465982515656 */
+	(0x4010000000000000, 0x3fe0a5989f2dc59a), /* [3.4,  4.0)  -> 0.520214377304159869552790951275 */
+][:]
+
+const sqrt32 = {f : flt32
+	const d : fltdesc(flt32, uint32, int32) =  [
+		.explode = std.flt32explode,
+		.assem = std.flt32assem,
+		.fma = fma32,
+		.tobits = std.flt32bits,
+		.frombits = std.flt32frombits,
+		.nan = 0x7fc00000,
+		.emin = -127,
+		.emax = 128,
+		.normmask = 1 << 23,
+		.sgnmask = 1 << 31,
+		.ab = ab32,
+		.iterlim = 3,
+	]
+	-> sqrtgen(f, d)
+}
+
+const sqrt64 = {f : flt64
+	const d : fltdesc(flt64, uint64, int64) =  [
+		.explode = std.flt64explode,
+		.assem = std.flt64assem,
+		.fma = fma64,
+		.tobits = std.flt64bits,
+		.frombits = std.flt64frombits,
+		.nan = 0x7ff8000000000000,
+		.emin = -1023,
+		.emax = 1024,
+		.normmask = 1 << 52,
+		.sgnmask = 1 << 63,
+		.ab = ab64,
+		.iterlim = 4,
+	]
+	-> sqrtgen(f, d)
+}
+
+generic sqrtgen = {f : @f, d : fltdesc(@f, @u, @i) :: numeric,floating,std.equatable @f, numeric,integral @u, numeric,integral @i
+	var n : bool, e : @i, s : @u, e2 : @i
+	(n, e, s) = d.explode(f)
+
+	/* Special cases: +/- 0.0, negative, NaN, and +inf */
+	if e == d.emin && s == 0
+		-> f
+	elif n || std.isnan(f)
+		/* Make sure to return a quiet NaN */
+		-> d.frombits(d.nan)
+	elif e == d.emax
+		-> f
+	;;
+
+	/*
+	   Remove a factor of 2^(even) to normalize significand.
+	 */
+	if e == d.emin
+		e = d.emin + 1
+		while s & d.normmask == 0
+			s <<= 1
+			e--
+		;;
+	;;
+	if e % 2 != 0
+		e2 = e - 1
+		e = 1
+	else
+		e2 = e
+		e = 0
+	;;
+
+	var a : @f = d.assem(false, e, s)
+	var au : @u = d.tobits(a)
+
+        /*
+           We shall perform iterated Newton-Raphson in order to
+           compute 1/sqrt(g), then multiply by g to obtain sqrt(g).
+           This is faster than calculating sqrt(g) directly because
+           it avoids division. (The multiplication by g is built
+           into Markstein's r, g, n variables.)
+         */
+	var xn : @f = d.frombits(0)
+	for (ai, beta) : d.ab
+		if au <= ai
+			xn = d.frombits(beta)
+			break
+		;;
+	;;
+
+	/* split up "x_{n+1} = x_n (3 - ax_n^2)/2" */
+	var epsn = d.fma(-1.0 * a, xn * xn, 1.0)
+	var rn = 0.5 * epsn
+	var gn = a * xn
+	var hn = 0.5 * xn
+	for var j = 0; j < d.iterlim; ++j
+		rn = d.fma(-1.0 * gn, hn, 0.5)
+		gn = d.fma(gn, rn, gn)
+		hn = d.fma(hn, rn, hn)
+	;;
+
+	/*
+	   gn is almost what we want, except that we might want to
+	   adjust by an ulp in one direction or the other. This is
+	   the Tuckerman test.
+
+	   Exhaustive testing has shown that we need only 3 adjustments
+	   in the flt32 case (and it should be 4 in the flt64 case).
+	 */
+	(_, e, s) = d.explode(gn)
+	e += (e2 / 2)
+	var r : @f = d.assem(false, e, s)
+
+	for var j = 0; j < d.iterlim; ++j
+		var r_plus_ulp : @f = d.frombits(d.tobits(r) + 1)
+		var r_minus_ulp : @f = d.frombits(d.tobits(r) - 1)
+
+		var delta_1 = d.fma(r, r_minus_ulp, -1.0 * f)
+		if d.tobits(delta_1) & d.sgnmask == 0
+			r = r_minus_ulp
+		else
+			var delta_2 = d.fma(r, r_plus_ulp, -1.0 * f)
+			if d.tobits(delta_2) & d.sgnmask != 0
+				r = r_plus_ulp
+			else
+				-> r
+			;;
+		;;
+	;;
+
+	-> r
+}
--- /dev/null
+++ b/lib/math/sum-impl.myr
@@ -1,0 +1,105 @@
+use std
+
+/* For references, see [Mul+10] section 6.3 */
+pkg math =
+	pkglocal const kahan_sum32 : (l : flt32[:] -> flt32)
+	pkglocal const priest_sum32 : (l : flt32[:] -> flt32)
+
+	pkglocal const kahan_sum64: (l : flt64[:] -> flt64)
+	pkglocal const priest_sum64 : (l : flt64[:] -> flt64)
+;;
+
+type doomed_flt32_arr = flt32[:]
+type doomed_flt64_arr = flt64[:]
+
+impl disposable doomed_flt32_arr =
+	__dispose__ = {a : doomed_flt32_arr; std.slfree((a : flt32[:])) }
+;;
+
+impl disposable doomed_flt64_arr =
+	__dispose__ = {a : doomed_flt64_arr; std.slfree((a : flt64[:])) }
+;;
+
+/*
+   Kahan's compensated summation. Fast and reasonably accurate,
+   although cancellation can cause relative error blowup. For
+   something slower, but more accurate, use something like Priest's
+   doubly compensated sums.
+ */
+pkglocal const kahan_sum32 = {l; -> kahan_sum_gen(l, (0.0 : flt32))}
+pkglocal const kahan_sum64 = {l; -> kahan_sum_gen(l, (0.0 : flt64))}
+
+generic kahan_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
+	if l.len == 0
+		-> zero
+	;;
+
+	var s = zero
+	var c = zero
+	var y = zero
+	var t = zero
+
+	for x : l
+		y = x - c
+		t = s + y
+		c = (t - s) - y
+		s = t
+	;;
+
+	-> s
+}
+
+/*
+   Priest's doubly compensated summation. Extremely accurate, but
+   relatively slow. For situations in which cancellation is not
+   expected, something like Kahan's compensated summation may be
+   more useful.
+ */
+pkglocal const priest_sum32 = {l : flt32[:]
+	var l2 = std.sldup(l)
+	std.sort(l2, mag_cmp32)
+	auto (l2 : doomed_flt32_arr)
+	-> priest_sum_gen(l2, (0.0 : flt32))
+}
+
+const mag_cmp32 = {f : flt32, g : flt32
+	var u = std.flt32bits(f) & ~(1 << 31)
+	var v = std.flt32bits(g) & ~(1 << 31)
+	-> std.numcmp(v, u)
+}
+
+pkglocal const priest_sum64 = {l : flt64[:]
+	var l2 = std.sldup(l)
+	std.sort(l2, mag_cmp64)	/* sort the copy, not the caller's slice (matches priest_sum32) */
+	auto (l2 : doomed_flt64_arr)
+	-> priest_sum_gen(l2, (0.0 : flt64))
+}
+
+const mag_cmp64 = {f : flt64, g : flt64
+	var u = std.flt64bits(f) & ~(1 << 63)
+	var v = std.flt64bits(g) & ~(1 << 63)
+	-> std.numcmp(v, u)
+}
+
+generic priest_sum_gen = {l : @f[:], zero : @f :: numeric,floating @f
+	/* l should be sorted in descending order */
+	if l.len == 0
+		-> zero
+	;;
+
+	var s = zero
+	var c = zero
+
+	for x : l
+		var y = c + x
+		var u = x - (y - c)
+		var t = (y + s)
+		var v = (y - (t - s))
+		var z = u + v
+		s = t + z
+		c = z - (s - t)
+	;;
+
+	-> s
+}
+
--- /dev/null
+++ b/lib/math/test/exp-impl.myr
@@ -1,0 +1,285 @@
+use std
+use math
+use testr
+
+/*
+   Note: a major part of the algorithms are the S constants. They
+   are tested extensively in expm101 and expm102.
+ */
+const main = {
+	testr.run([
+		[.name="exp-01", .fn = exp01],
+		[.name="exp-02", .fn = exp02],
+		[.name="exp-03", .fn = exp03],
+		[.name="exp-04", .fn = exp04],
+		[.name="expm1-01", .fn = expm101],
+		[.name="expm1-02", .fn = expm102],
+		[.name="expm1-03", .fn = expm103],
+		[.name="expm1-04", .fn = expm104],
+	][:])
+}
+
+const exp01 = {c
+	/* (input bits, expected exp() bits); covers identity, overflow to inf, NaN, and underflow to 0 */
+	var inputs : (uint32, uint32)[:] = [
+		(0x00000000, 0x3f800000),
+		(0x34000000, 0x3f800001),
+		(0x3c000000, 0x3f810101),
+		(0x42000000, 0x568fa1fe),
+		(0xc2b00000, 0x0041edc4),
+		(0xc2b20000, 0x001840fc),
+		(0x7f7fffff, 0x7f800000),
+		(0x7f800000, 0x7f800000),
+		(0x7f800001, 0x7fc00000),
+		(0xc2cff1b3, 0x00000001),
+		(0xc2cff1b4, 0x00000001),
+		(0xc2cff1b5, 0x00000000), /* fully underflows to +0.0; was a bare decimal `0000000000`, hex for consistency */
+		(0x42b17217, 0x7f7fff84),
+		(0x42b17218, 0x7f800000),
+		(0x42b17219, 0x7f800000),
+	][:]
+
+	for (x, y) : inputs
+		var xf : flt32 = std.flt32frombits(x)
+		var yf : flt32 = std.flt32frombits(y)
+		var rf = math.exp(xf)
+		testr.check(c, rf == yf,
+			"exp(0x{b=16,w=8,p=0}) should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
+			x, y, std.flt32bits(rf))
+	;;
+}
+
+const exp02 = {c
+	var inputs : (uint64, uint64)[:] = [
+		(0x0000000000000000, 0x3ff0000000000000),
+		(0x3e50000000000000, 0x3ff0000004000000),
+	][:]
+
+	for (x, y) : inputs
+		var xf : flt64 = std.flt64frombits(x)
+		var yf : flt64 = std.flt64frombits(y)
+		var rf = math.exp(xf)
+		testr.check(c, rf == yf,
+			"exp(0x{b=16,w=16,p=0}) should be 0x{b=16,w=16,p=0}, was 0x{b=16,w=16,p=0}",
+			x, y, std.flt64bits(rf))
+	;;
+}
+
+const exp03 = {c
+	/*
+	   Tang's algorithm has an error of up to 0.77 ulps. This
+	   is not terrible (musl appears to follow it, for example).
+	   Here we quarantine off some known-bad results.
+	 */
+
+	var inputs : (uint32, uint32, uint32)[:] = [
+		(0x42020000, 0x56eccf79, 0x56eccf78),
+		(0x3ec40600, 0x3fbbb54b, 0x3fbbb54c),
+	][:]
+
+	for (x, y_perfect, y_acceptable) : inputs
+		var xf : flt32 = std.flt32frombits(x)
+		var ypf : flt32 = std.flt32frombits(y_perfect)
+		var yaf : flt32 = std.flt32frombits(y_acceptable)
+		var rf = math.exp(xf)
+		if rf != ypf && rf != yaf
+		testr.fail(c, "exp(0x{b=16,w=8,p=0}) was 0x{b=16,w=8,p=0}. It should have been 0x{b=16,w=8,p=0}, although we will also accept 0x{b=16,w=8,p=0}",
+			x, std.flt32bits(rf), y_perfect, y_acceptable)
+		;;
+	;;
+}
+
+const exp04 = {c
+	/*
+	   Tang's algorithm has an error of up to 0.77 ulps. This
+	   is not terrible (musl appears to follow it, for example).
+	   Here we quarantine off some known-bad results.
+	 */
+
+	var inputs : (uint64, uint64, uint64)[:] = [
+		(0x3cda000000000000, 0x3ff0000000000006, 0x3ff0000000000007),
+		(0x3d57020000000000, 0x3ff00000000005c0, 0x3ff00000000005c1),
+		(0x3d58020000000000, 0x3ff0000000000600, 0x3ff0000000000601),
+		(0xc087030000000000, 0x0000000000000c6d, 0x0000000000000c6e),
+		(0xc011070000000000, 0x3f8d039e34c59187, 0x3f8d039e34c59186),
+		(0xbd50070000000000, 0x3feffffffffff7fc, 0x3feffffffffff7fd),
+		(0xbd430e0000000000, 0x3feffffffffffb3c, 0x3feffffffffffb3d),
+	][:]
+
+	for (x, y_perfect, y_acceptable) : inputs
+		var xf : flt64 = std.flt64frombits(x)
+		var ypf : flt64 = std.flt64frombits(y_perfect)
+		var yaf : flt64 = std.flt64frombits(y_acceptable)
+		var rf = math.exp(xf)
+		if rf != ypf && rf != yaf
+		testr.fail(c, "exp(0x{b=16,w=16,p=0}) was 0x{b=16,w=16,p=0}. It should have been 0x{b=16,w=16,p=0}, although we will also accept 0x{b=16,w=16,p=0}",
+			x, std.flt64bits(rf), y_perfect, y_acceptable)
+		;;
+	;;
+}
+
+const expm101 = {c
+	var inputs : (uint32, uint32)[:] = [
+		(0x00000000, 0x00000000),
+		(0x80000000, 0x80000000),
+		(0x3f000000, 0x3f261299),
+		(0x3c000000, 0x3c008056),
+		(0x42000000, 0x568fa1fe),
+		(0xc2b00000, 0xbf800000),
+		(0xc2b20000, 0xbf800000),
+		(0x01000000, 0x01000000),
+		(0x40000000, 0x40cc7326),
+		(0x42b17200, 0x7f7ff404),
+		(0x415a3cf2, 0x494cd0e3),
+		(0x7f800000, 0x7f800000),
+		(0xff800000, 0xbf800000),
+		(0x7a2028b1, 0x7f800000),
+		(0xa201a23a, 0xa201a23a),
+		(0xc0000000, 0xbf5d5aab),
+		(0xbe934b10, 0xbe7fffff),
+		(0xbe934b11, 0xbe800000),
+		(0xbe934b12, 0xbe800001),
+		(0x3e647fbe, 0x3e800000),
+		(0x3e647fbf, 0x3e800000),
+		(0x3e647fc0, 0x3e800001),
+		(0xc0f744f5, 0xbf7fe31e),
+		(0x4210297a, 0x597f31f5), /* J = 0 */
+		(0x3f34c3cd, 0x3f83573d), /* J = 1 */
+		(0x3f3a52b6, 0x3f89087b), /* J = 2 */
+		(0xbf20e72b, 0xbeeee940), /*  ...  */
+		(0x41f4bd2a, 0x558c999f),
+		(0xc02a0418, 0xbf6e07cd),
+		(0xc0293a2a, 0xbf6dcec1),
+		(0x40b62779, 0x4393ca4b),
+		(0x3fc680ac, 0x406dc6a4),
+		(0x3fc9d2c6, 0x4075b516),
+		(0xbfedd645, 0xbf581273),
+		(0x3e70e5d1, 0x3e87cbdb),
+		(0xbeddcacc, 0xbeb3ffd9),
+		(0x3e8beb21, 0x3ea0e776),
+		(0x3e9ded31, 0x3eb8fe1f),
+		(0x40e8503c, 0x44b19ed8),
+		(0x40d265cb, 0x4432f91f),
+		(0xbea0f9bc, 0xbe8a2036),
+		(0x3ec42672, 0x3eef04c4),
+		(0xc140e8cc, 0xbf7fff9f),
+		(0x4117320e, 0x46467e73),
+		(0x3ee8b75d, 0x3f134ef0),
+		(0xc03f51f1, 0xbf731e4e),
+		(0x42733615, 0x6b52d3d4),
+		(0x3f02f2b5, 0x3f2af617),
+		(0xbf5ac925, 0xbf131660),
+		(0x40813277, 0x425eb7ac),
+		(0x41842e94, 0x4b64b1dd),
+		(0x41b0ba81, 0x4f6a0cc1),
+		(0xc061d7c2, 0xbf787d28),
+		(0xc0611682, 0xbf786657),
+		(0x40dcd7e9, 0x447827c5), /* J = 31 */
+	][:]
+
+	for (x, y) : inputs
+		var xf : flt32 = std.flt32frombits(x)
+		var yf : flt32 = std.flt32frombits(y)
+		var rf = math.expm1(xf)
+		testr.check(c, rf == yf,
+			"expm1(0x{b=16,w=8,p=0}) should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
+			x, y, std.flt32bits(rf))
+	;;
+}
+
+const expm102 = {c
+	var inputs : (uint64, uint64)[:] = [
+		(0x0000000000000000, 0x0000000000000000),
+		(0x404ef04831cb65ed, 0x45834ac37c44b3d3),
+		(0x7ff0000000000000, 0x7ff0000000000000),
+		(0xfff0000000000000, 0xbff0000000000000),
+		(0x80318a89f290021a, 0x80318a89f290021a),
+		(0xc0180881a9e73af6, 0xbfefebdcaf24d5fe),
+		(0xc020cedaedb028c9, 0xbfeffe2a4ee5ba79),
+		(0xbfe62812ff80cb9e, 0xbfdff9cf6758cc6a), /* J = 0 */
+		(0xbfe526dab7e5054c, 0xbfdef44fe4876d02), /* J = 1 */
+		(0x3ff6ea5c51cbf0f2, 0x400980f836e2c055), /* J = 2 */
+		(0x403f4315360b2cdc, 0x42c12ad5f692ffff), /*  ...  */
+		(0x3fe93f778a9dc013, 0x3ff3381168aca01e),
+		(0x405023f373e01862, 0x45c1aa771de3ba9c),
+		(0xbfff56366f3b92de, 0xbfeb7c693f8bdbb8),
+		(0xc0159bcd07244a34, 0xbfefdb1461286124),
+		(0x40079bbc23a67f90, 0x40322039bbbb5e2e),
+		(0xbffe1933d2c0ed70, 0xbfeb1f6c166582bb),
+		(0xc009ea7d09d84479, 0xbfeebf01fa92741c),
+		(0x405d84f7563a2612, 0x4a946476fb27817e),
+		(0x4045f770a45f8e7f, 0x43e4da111dae0dfb),
+		(0x4070e9a83b352180, 0x585516d4e37bd9f6),
+		(0x3fefcdb2a528c6e8, 0x3ffb39ec926d8a50),
+		(0x3fd52b5e7995f00c, 0x3fd91739183464e0),
+		(0xbfd625845bbaf98f, 0xbfd2b893d222b8f2),
+		(0xbfd43c0407d114d5, 0xbfd15909a4bea38e),
+		(0x3fd93f8288681821, 0x3fdef406a9f7c4b1),
+		(0x3fdaad696fbfea76, 0x3fe08c8035acbd0b),
+		(0x405e9b6f4b4bc7f3, 0x4af8b6ad079946a7),
+		(0x3fdc85cf6b85bfaf, 0x3fe1f811193e5cf5),
+		(0xbfed55f9b317d7eb, 0xbfe334b0378703a0),
+		(0x406ee7f396a46c9b, 0x563a11333c75a10f),
+		(0x4031266dc880810e, 0x417ac458c1525e64),
+		(0xc019905c018b6d96, 0xbfeff243dffe774d),
+		(0x404dc17e83d7ee6b, 0x454cfbf6572d627f),
+		(0xc013dcd6fae06405, 0xbfefc6dfe34e5408),
+		(0x402762d99fa5cfda, 0x40fd3b9a2004f68b),
+		(0xbfe89074f132e353, 0xbfe12602fb3c9806),
+		(0x4077c7ea24627ae7, 0x623ea61f88281bd5),
+		(0xbff6a873237d8072, 0xbfe83c311800b6ee), /* J = 31 */
+	][:]
+
+	for (x, y) : inputs
+		var xf : flt64 = std.flt64frombits(x)
+		var yf : flt64 = std.flt64frombits(y)
+		var rf = math.expm1(xf)
+		testr.check(c, rf == yf,
+			"expm1(0x{b=16,w=16,p=0}) should be 0x{b=16,w=16,p=0}, was 0x{b=16,w=16,p=0}",
+			x, y, std.flt64bits(rf))
+	;;
+}
+
+const expm103 = {c
+	/*
+	   As with exp, there is some accepted error in expm1.
+	 */
+
+	var inputs : (uint32, uint32, uint32)[:] = [
+		(0x34000000, 0x34000001, 0x34000000),
+		(0xbe651dea, 0xbe4d4b4d, 0xbe4d4b4c),
+	][:]
+
+	for (x, y_perfect, y_acceptable) : inputs
+		var xf : flt32 = std.flt32frombits(x)
+		var ypf : flt32 = std.flt32frombits(y_perfect)
+		var yaf : flt32 = std.flt32frombits(y_acceptable)
+		var rf = math.expm1(xf)
+		if rf != ypf && rf != yaf
+		testr.fail(c, "expm1(0x{b=16,w=8,p=0}) was 0x{b=16,w=8,p=0}. It should have been 0x{b=16,w=8,p=0}, although we will also accept 0x{b=16,w=8,p=0}",
+			x, std.flt32bits(rf), y_perfect, y_acceptable)
+		;;
+	;;
+}
+
+const expm104 = {c
+	/*
+	   As with exp, there is some accepted error in expm1.
+	 */
+
+	var inputs : (uint64, uint64, uint64)[:] = [
+		(0xbf9d0b5aadc4d0ac, 0xbf9ca2e5b7bfa859, 0xbf9ca2e5b7bfa85a),
+		(0x3fc2dbb101fe0392, 0x3fc451731cc0e358, 0x3fc451731cc0e359),
+		(0x3fc8a39bc9c32fec, 0x3fcb2b988c3e0b2f, 0x3fcb2b988c3e0b30),
+	][:]
+
+	for (x, y_perfect, y_acceptable) : inputs
+		var xf : flt64 = std.flt64frombits(x)
+		var ypf : flt64 = std.flt64frombits(y_perfect)
+		var yaf : flt64 = std.flt64frombits(y_acceptable)
+		var rf = math.expm1(xf)
+		if rf != ypf && rf != yaf
+		testr.fail(c, "expm1(0x{b=16,w=16,p=0}) was 0x{b=16,w=16,p=0}. It should have been 0x{b=16,w=16,p=0}, although we will also accept 0x{b=16,w=16,p=0}",
+			x, std.flt64bits(rf), y_perfect, y_acceptable)
+		;;
+	;;
+}
--- /dev/null
+++ b/lib/math/test/fma-impl.myr
@@ -1,0 +1,104 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name="fma-01", .fn = fma01],
+		[.name="fma-02", .fn = fma02],
+	][:])
+}
+
+const fma01 = {c
+	var inputs : (uint32, uint32, uint32, uint32)[:] = [
+		/*
+		   These are mostly obtained by running fpmath-consensus
+		   with seed 1234. Each (mostly) covers a different
+		   corner case.
+		 */
+		(0x000009a4, 0x00000000, 0x00000002, 0x00000002),
+		(0x69000000, 0x90008002, 0x68348026, 0x68348026),
+		(0x334802ab, 0x49113e8d, 0x90aea62e, 0x3ce2f4c3),
+		(0x5c35d8c1, 0x12dcb6e2, 0x6c1a8cc2, 0x6c1a8cc2),
+		(0xf6266d83, 0x2b3e04e8, 0x62f99bda, 0x62bbd79e),
+		(0x7278e907, 0x75f6c0f1, 0xf6f9b8e0, 0x7f800000),
+		(0xd7748eeb, 0x6737b23e, 0x68e3bbc7, 0xff2f7c71),
+		(0x7f373de4, 0x3dcf90f0, 0xd22ac17c, 0x7d9492ca),
+		(0xb50fce04, 0x00cd486d, 0x03800000, 0x03800000),
+		(0xbb600000, 0x43b7161a, 0x8684d442, 0xbfa03357),
+		(0xf26f8a00, 0x4bfac000, 0xc74ba9fc, 0xfeeaa06c),
+		(0x55d1fa60, 0x32f20000, 0x1b1fea3d, 0x49467eaf),
+		(0x29e26c00, 0x62352000, 0xa0e845af, 0x4ca032a9),
+		(0x287650f8, 0x7cd00000, 0x94e85d5e, 0x65c821c9),
+		(0x7689f580, 0x91418000, 0xaa2822ae, 0xc8508e21),
+		(0xbd813cc0, 0x421f0000, 0x9f098e17, 0xc0208977),
+		(0x3745461a, 0x4db9b736, 0xb6d7deff, 0x458f1cd8),
+		(0xa3ccfd37, 0x7f800000, 0xed328e70, 0xff800000),
+		(0xa3790205, 0x5033a3e6, 0xa001fd11, 0xb42ebbd5),
+		(0x83dd6ede, 0x31ddf8e6, 0x01fea4c8, 0x01fea4c7),
+		(0xa4988128, 0x099a41ad, 0x00800000, 0x00800000),
+		(0x1e0479cd, 0x91d5fcb4, 0x00800000, 0x00800000),
+		(0x2f413021, 0x0a3f5a4e, 0x80800483, 0x80800000),
+		(0x144dcd10, 0x12f4aba0, 0x80800000, 0x80800000),
+		(0x0d580b86, 0x435768a8, 0x966c8d6f, 0x966c5ffd),
+		(0xa19e9a6f, 0xb49af3e3, 0xa2468b59, 0xa2468b57),
+		(0xd119e996, 0x8e5ad0e3, 0x247e0028, 0x247e83b7),
+		(0x381adbc6, 0x00ee4f61, 0x005f2aeb, 0x005f2d2c),
+		(0x7008233c, 0x2a9613fb, 0x46affd02, 0x5b1f9e8a),
+		(0xe85018a1, 0x2cbd53ed, 0x3fcffab8, 0xd599e668),
+
+		/* These ones are especially tricky */
+		(0x65dbf098, 0xd5beb8b4, 0x7c23db61, 0x73027654),
+		(0xa4932927, 0xc565bc34, 0x316887af, 0x31688bcf),
+		(0xb080a420, 0x09e2e5ca, 0x807ff1bf, 0x80800000),
+	][:]
+
+	for (x, y, z, r) : inputs
+		var xf : flt32 = std.flt32frombits(x)
+		var yf : flt32 = std.flt32frombits(y)
+		var zf : flt32 = std.flt32frombits(z)
+		var rf = math.fma(xf, yf, zf)
+		testr.check(c, rf == std.flt32frombits(r),
+			"0x{b=16,w=8,p=0} * 0x{b=16,w=8,p=0} + 0x{b=16,w=8,p=0} should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
+			x, y, z, r, std.flt32bits(rf))
+	;;
+}
+
+const fma02 = {c
+	var inputs : (uint64, uint64, uint64, uint64)[:] = [
+		/*
+		   These are mostly obtained by running fpmath-consensus
+		   with seed 1234. Each (mostly) covers a different
+		   corner case.
+		 */
+		(0x0000000000000000, 0x0000000000000000, 0x0100000000000000, 0x0100000000000000),
+		(0x0000000000000000, 0x0000000000000000, 0x0200000000000000, 0x0200000000000000),
+		(0x00000000000009a4, 0x6900000000000002, 0x6834802690008002, 0x6834802690008002),
+		(0x49113e8d334802ab, 0x5c35d8c190aea62e, 0x6c1a8cc212dcb6e2, 0x6c1a8cc212dcb6e2),
+		(0x2b3e04e8f6266d83, 0xae84e20f62f99bda, 0xc9115a1ccea6ce27, 0xc9115a1ccea6ce27),
+		(0xa03ea9e9b09d932c, 0xded7bc19edcbf0c7, 0xbbc4c1f83b3f8f2e, 0x3f26be5f0c7b48e3),
+		(0xa5ec2141c1e6f339, 0xa2d80fc217f57b61, 0x00b3484b473ef1b8, 0x08d526cb86ee748d),
+		(0xccc6600ee88bb67c, 0xc692eeec9b51cf0f, 0xbf5f1ae3486401b0, 0x536a7a30857129db),
+		(0x5f9b9e449db17602, 0xbef22ae5b6a2b1c5, 0x6133e925e6bf8a12, 0x6133e925e6bf823b),
+		(0x7f851249841b6278, 0x3773388e53a375f4, 0x761c27fc2ffa57be, 0x7709506b0e99dc30),
+		(0x7c7cb20f3ca8af93, 0x800fd7f5cfd5baae, 0x14e4c09c9bb1e17e, 0xbc9c6a3fd0e58682),
+		(0xb5e8db2107f4463f, 0x614af740c0d7eb3b, 0xd7e3d25c4daa81e0, 0xd7e3d798d3ccdffb),
+		(0xae62c8be4cb45168, 0x90cc5236f3516c90, 0x0007f8b14f684558, 0x0007f9364eb1a815),
+		(0x5809f53e32a7e1ba, 0xcc647611ccaa5bf4, 0xdfbdb5c345ce7a56, 0xe480990da5526103),
+		(0xbb889d7f826438e1, 0x03bdaff82129696d, 0x000000dacab276ae, 0x8000009296c962f8),
+		(0x003d95525e2b057a, 0xbef738ea5717d89a, 0x800000089763d88c, 0x800000b456ed1a9c),
+		(0x0be868cb5a7180c8, 0x3357a30707ed947c, 0x80000050d6b86ac6, 0x000000cfa41cb229),
+		(0xbe535f4f8a7498af, 0x00d24adee12217b8, 0x0000005729e93fb0, 0x800000016d975af3),
+		(0x39d1968eb883f088, 0x856f286e3b268f0e, 0x800000d7cdd0ed70, 0x800001e9cf01a0ae),
+	][:]
+
+	for (x, y, z, r) : inputs
+		var xf : flt64 = std.flt64frombits(x)
+		var yf : flt64 = std.flt64frombits(y)
+		var zf : flt64 = std.flt64frombits(z)
+		var rf = math.fma(xf, yf, zf)
+		testr.check(c, rf == std.flt64frombits(r),
+			"0x{b=16,w=16,p=0} * 0x{b=16,w=16,p=0} + 0x{b=16,w=16,p=0} should be 0x{b=16,w=16,p=0}, was 0x{b=16,w=16,p=0}",
+			x, y, z, r, std.flt64bits(rf))
+	;;
+}
--- /dev/null
+++ b/lib/math/test/poly-impl.myr
@@ -1,0 +1,66 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name="horner-01", .fn = horner01],
+		[.name="horner-02", .fn = horner02],
+	][:])
+}
+
+const horner01 = {c
+	var inputs : (uint32, uint32[:], uint32)[:] = [
+		(0x00000000, [][:], 0x00000000),
+		(0xbf800000, [0x3f715545, 0x3f715546, 0x3f715544, 0x3f715545][:], 0xb4000000),
+		(0xbf800001, [0x3f715545, 0x3f715546, 0x3f715544, 0x3f715545][:], 0xb4bc5552),
+	][:]
+
+	for (f, a, r) : inputs
+		var f2 = std.flt32frombits(f)
+		var a2 = [][:]
+		for aa : a
+			std.slpush(&a2, std.flt32frombits(aa))
+		;;
+		var sf = math.horner_poly(f2, a2)
+		var s : uint32 = std.flt32bits(sf)
+		testr.eq(c, s, r)
+	;;
+}
+
+const horner02 = {c
+	var inputs : (uint64, uint64[:], uint64)[:] = [
+		(0x0000000000000000, [][:], 0x0000000000000000),
+		(0xc0003d3368ee1111, [
+			0x3ff0000000000000,
+			0x3fe0000000000000,
+			0x3fc5555555555555,
+			0x3fa5555555555555,
+			0x3f81a7b9611a7b96,
+			0x3f59c2d14ee4a102,
+			0x3f3136f054ff42a4,
+			0x3f05902ed525b6ee
+		][:], 0x3fdb64730ab8fa29),
+		(0x40003d3368ee1111, [
+			0x3ff0000000000000,
+			0x3fe0000000000000,
+			0x3fc5555555555555,
+			0x3fa5555555555555,
+			0x3f81a7b9611a7b96,
+			0x3f59c2d14ee4a102,
+			0x3f3136f054ff42a4,
+			0x3f05902ed525b6ee
+		][:], 0x400a331575ee40db),
+	][:]
+
+	for (f, a, r) : inputs
+		var f2 = std.flt64frombits(f)
+		var a2 = [][:]
+		for aa : a
+			std.slpush(&a2, std.flt64frombits(aa))
+		;;
+		var sf = math.horner_poly(f2, a2)
+		var s : uint64 = std.flt64bits(sf)
+		testr.eq(c, s, r)
+	;;
+}
--- /dev/null
+++ b/lib/math/test/round-impl.myr
@@ -1,0 +1,82 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name = "round-01",    .fn = round01],
+		[.name = "round-02",    .fn = round02],
+	][:])
+}
+
+const round01 = {c
+	var inputs : (flt32, int32)[:] = [
+		(123.4, 123),
+		(0.0, 0),
+		(-0.0, 0),
+		(1.0, 1),
+		(1.1, 1),
+		(0.9, 1),
+		(15.3, 15),
+		(15.5, 16),
+		(15.7, 16),
+		(16.3, 16),
+		(16.5, 16),
+		(16.7, 17),
+		(-102.1, -102),
+		(-102.5, -102),
+		(-102.7, -103),
+		(-103.1, -103),
+		(-103.5, -104),
+		(-103.7, -104),
+		(2147483641.5, -2147483648),
+		(2147483646.5, -2147483648),
+		(2147483647.5, -2147483648),
+		(2147483649.0, -2147483648),
+		(-2147483641.5, -2147483648),
+		(-2147483646.5, -2147483648),
+		(-2147483647.5, -2147483648),
+		(-2147483649.0, -2147483648),
+	][:]
+
+	for (f, g) : inputs
+		testr.eq(c, math.rn(f), g)
+	;;
+}
+
+const round02 = {c
+	var inputs : (flt64, int64)[:] = [
+		(123.4, 123),
+		(0.0, 0),
+		(-0.0, 0),
+		(1.0, 1),
+		(1.1, 1),
+		(0.9, 1),
+		(15.3, 15),
+		(15.5, 16),
+		(15.7, 16),
+		(16.3, 16),
+		(16.5, 16),
+		(16.7, 17),
+		(-102.1, -102),
+		(-102.5, -102),
+		(-102.7, -103),
+		(-103.1, -103),
+		(-103.5, -104),
+		(-103.7, -104),
+		(2147483641.5, 2147483642),
+		(2147483646.5, 2147483646),
+		(2147483647.5, 2147483648),
+		(2147483649.0, 2147483649),
+		(-2147483641.5, -2147483642),
+		(-2147483646.5, -2147483646),
+		(-2147483647.5, -2147483648),
+		(-2147483649.0, -2147483649),
+		(9223372036854775806.1, -9223372036854775808),
+		(-9223372036854775806.1, -9223372036854775808),
+	][:]
+
+	for (f, g) : inputs
+		testr.eq(c, math.rn(f), g)
+	;;
+}
--- /dev/null
+++ b/lib/math/test/scale2-impl.myr
@@ -1,0 +1,96 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name = "scale2-01",    .fn = scale201],
+		[.name = "scale2-02",    .fn = scale202],
+		[.name = "scale2-03",    .fn = scale203],
+		[.name = "scale2-04",    .fn = scale204],
+	][:])
+}
+
+const scale201 = {c
+	var inputsf : (flt32, int32, flt32)[:] = [
+		(0.0, 1, 0.0),
+		(-0.0, 2, -0.0),
+		(1.0, 3, 8.0),
+		(1.0, -3, 0.125),
+		(23.0, 2, 92.0),
+		(184.2, 10, 188620.8),
+		(0.00000234, 15, 0.07667712),
+		(1834.2, -31, 0.0000008541159331798554),
+		(4321.22341, 0, 4321.22341),
+	][:]
+
+	for (f, m, g) : inputsf
+		testr.eq(c, math.scale2(f, m), g)
+	;;
+}
+
+const scale202 = {c
+	var inputsf : (flt64, int64, flt64)[:] = [
+		(0.0, 1, 0.0),
+		(-0.0, 2, -0.0),
+		(1.0, 3, 8.0),
+		(1.0, -3, 0.125),
+		(23.0, 2, 92.0),
+		(184.2, 10, 188620.8),
+		(0.00000234, 15, 0.07667712),
+		(1834.2, -31, 0.0000008541159331798554),
+		(4321.22341, 0, 4321.22341),
+	][:]
+
+	for (f, m, g) : inputsf
+		testr.eq(c, math.scale2(f, m), g)
+	;;
+}
+
+const scale203 = {c
+	var inputsb : (uint32, int32, uint32)[:] = [
+		(0x00000000,   1, 0x00000000),
+		(0x7f38aa32,   0, 0x7f38aa32),
+		(0xaaaaaaaa,   0, 0xaaaaaaaa),
+		(0x43000000,  -3, 0x41800000),
+		(0x000030a0,  -8, 0x00000031),
+		(0x002f3030,  -8, 0x00002f30),
+		(0x032f3030, -20, 0x0000015e),
+		(0x032aafff,  -8, 0x00155600),
+		(0x002aafff,   8, 0x03aabffc),
+		(0x0000af31,   2, 0x0002bcc4),
+		(0x0000af31, 260, 0x7eaf3100),
+		(0x0000af31, 266, 0x7f800000),
+		(0x3f7ff404, 128, 0x7f7ff404),
+	][:]
+
+	for (u, m, v) : inputsb
+		var f = std.flt32frombits(u)
+		var g = math.scale2(f, m)
+		var w = std.flt32bits(g)
+		testr.check(c, v == w, "scale2(0x{w=8,b=16,p=0}, {}) should be 0x{w=8,b=16,p=0}, was 0x{w=8,b=16,p=0}", u, m, v, w)
+	;;
+}
+
+const scale204 = {c
+	var inputsb : (uint64, int64, uint64)[:] = [
+		(0x0000000000000000,     1, 0x0000000000000000),
+		(0x7f83785551aa873c,     0, 0x7f83785551aa873c),
+		(0xc2b00000aabbccdd, -1080, 0x800000400002aaef),
+		(0xc644fa802f33cfbd,    -1, 0xc634fa802f33cfbd),
+		(0x8004fa802f33cfbd,    -1, 0x80027d401799e7de),
+		(0x8004fa8fffffffff,    -1, 0x80027d4800000000),
+		(0x0082aaffffffffff,     8, 0x0102aaffffffffff), 
+		(0x000000ffffffffff,     1, 0x000001fffffffffe),
+		(0x000000ffffffffff,  1000, 0x3dcfffffffffe000),
+		(0x000000ffffffffff,  2000, 0x7c4fffffffffe000),
+		(0x000000ffffffffff,  2400, 0x7ff0000000000000),
+	][:]
+
+	for (u, m, v) : inputsb
+		var f = std.flt64frombits(u)
+		var g = math.scale2(f, m)
+		var w = std.flt64bits(g)
+		testr.check(c, v == w, "scale2(0x{w=16,b=16,p=0}, {}) should be 0x{w=16,b=16,p=0}, was 0x{w=16,b=16,p=0}", u, m, v, w)
+	;;
+}
--- /dev/null
+++ b/lib/math/test/sqrt-impl.myr
@@ -1,0 +1,83 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name="sqrt-01", .fn = sqrt01],
+		[.name="sqrt-02", .fn = sqrt02],
+	][:])
+}
+
+const sqrt01 = {c
+	var inputs : (uint32, uint32)[:] = [
+		(0x00000000, 0x00000000),
+		(0x80000000, 0x80000000),
+		(0x80000001, 0x7ff80000),
+		(0x8aaaaaaa, 0x7ff80000),
+		(0x3f800000, 0x3f800000),
+		(0x40800000, 0x40000000),
+		(0x41100000, 0x40400000),
+		(0x3e800000, 0x3f000000),
+		(0x3a3a0000, 0x3cda35fe),
+		(0x017a1000, 0x207d038b),
+		(0x00fc0500, 0x20339b45),
+		(0x160b0000, 0x2abca321),
+		(0x00000800, 0x1d000000),
+		(0x7f690a00, 0x5f743ff8),
+		(0x7f5c0e00, 0x5f6d590c),
+	][:]
+
+	for (x, y) : inputs
+		var xf : flt32 = std.flt32frombits(x)
+		var yf : flt32 = std.flt32frombits(y)
+		var rf = math.sqrt(xf)
+		testr.check(c, rf == yf,
+			"sqrt(0x{b=16,w=8,p=0}) should be 0x{b=16,w=8,p=0}, was 0x{b=16,w=8,p=0}",
+			x, y, std.flt32bits(rf))
+	;;
+}
+
+const sqrt02 = {c
+	var inputs : (uint64, uint64)[:] = [
+		(0x0000000000000000ul, 0x0000000000000000ul),
+		(0x8000000000000000ul, 0x8000000000000000ul),
+		(0x8000000000000001ul, 0x7ff8000000000000ul),
+		(0x8aaaaaaaaaaaaaaaul, 0x7ff8000000000000ul),
+		(0x0606e437817acd16ul, 0x22fb10b36e4ae795ul),
+		(0x3ff0000000000000ul, 0x3ff0000000000000ul),
+		(0x4010000000000000ul, 0x4000000000000000ul),
+		(0x3fd0000000000000ul, 0x3fe0000000000000ul),
+		(0x1bbffa831c8f220eul, 0x2dd69ead9d353d6cul),
+		(0x3f0e0f7339499f0bul, 0x3f7f03d8229c8b81ul),
+		(0x3ca510f548e0f3ecul, 0x3e49f6bcadd1e806ul),
+		(0x044ef24a3cca214bul, 0x221f780430319d58ul),
+		(0x7ab034357a1e0474ul, 0x5d501a0593fd8d49ul),
+		(0x216b2df113b38de7ul, 0x30ad7dcc6f26285aul),
+		(0x2e2de34118496c06ul, 0x370eed0301fdade1ul),
+		(0x155bf26b4fb0b2c8ul, 0x2aa5255cf9bd799cul),
+		(0x4b8004df0ac137aaul, 0x45b6a40fee232f2aul),
+		(0x1acaf23d7b0bf80cul, 0x2d5d5d56beda3392ul),
+		(0x3f97ea4c6399a8e6ul, 0x3fc38fb000d55805ul),
+		(0x78f36ea1656dec48ul, 0x5c71a1fce3f370e4ul),
+		(0x409636d438489edbul, 0x4042da4eeac985aaul),
+		(0x72dfd27869ffd768ul, 0x5966907fc9668f57ul),
+		(0x1f483c585e4f03dcul, 0x2f9bd93c3bd1f884ul),
+		(0x7ade25ea6bb6464eul, 0x5d65f681bdbcdf4eul),
+		(0x24ffe5593b0836dbul, 0x3276973038d3bbddul),
+		(0x03e92ac739ec355eul, 0x21ec60eea1d102e8ul),
+		(0x76b656a961a4f64eul, 0x5b52e7cc1d30f55bul),
+		(0x5bc2fac208381d11ul, 0x4dd8a4f5203ab3d2ul),
+		(0x000578e105ac27aaul, 0x1ff2b6d3204e206eul),
+		(0x00057e1016b7c1edul, 0x1ff2bfae3a8e21bbul),
+	][:]
+
+	for (x, y) : inputs
+		var xf : flt64 = std.flt64frombits(x)
+		var yf : flt64 = std.flt64frombits(y)
+		var rf = math.sqrt(xf)
+		testr.check(c, rf == yf,
+			"sqrt(0x{b=16,w=16,p=0}) should be 0x{b=16,w=16,p=0}, was 0x{b=16,w=16,p=0}",
+			x, y, std.flt64bits(rf))
+	;;
+}
--- /dev/null
+++ b/lib/math/test/sum-impl.myr
@@ -1,0 +1,36 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name = "sums-kahan-01", .fn = sums_kahan_01],
+		[.name = "sums-priest-01", .fn = sums_priest_01],
+	][:])
+}
+
+const sums_kahan_01 = {c
+	var sums : (flt32[:], flt32)[:] = [
+		([1.0, 2.0, 3.0][:], 6.0),
+
+		/* Naive summing gives 1.0, actual answer is 2.0 */
+		([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 3.0)
+	][:]
+
+	for (a, r) : sums
+		var s = math.kahan_sum(a)
+		testr.eq(c, r, s)
+	;;
+}
+
+const sums_priest_01 = {c
+	var sums : (flt32[:], flt32)[:] = [
+		([1.0, 2.0, 3.0][:], 6.0),
+		([33554432.0, 33554430.0, -16777215.0, -16777215.0, -16777215.0, -16777215.0][:], 2.0)
+	][:]
+
+	for (a, r) : sums
+		var s = math.priest_sum(a)
+		testr.eq(c, r, s)
+	;;
+}
--- /dev/null
+++ b/lib/math/test/trunc-impl.myr
@@ -1,0 +1,124 @@
+use std
+use math
+use testr
+
+const main = {
+	testr.run([
+		[.name = "trunc-01",    .fn = trunc01],
+		[.name = "trunc-02",    .fn = trunc02],
+		[.name = "floor-01",    .fn = floor01],
+		[.name = "floor-02",    .fn = floor02],
+		[.name = "ceil-01",     .fn = ceil01],
+		[.name = "ceil-02",     .fn = ceil02],
+	][:])
+}
+
+const trunc01 = {c
+	var flt32s : (flt32, flt32)[:] = [
+		(123.4, 123.0),
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(1.0, 1.0),
+		(1.1, 1.0),
+		(0.9, 0.0),
+		(10664524000000000000.0, 10664524000000000000.0),
+		(-3.5, -3.0),
+		(101.999, 101.0),
+		(std.flt32nan(), std.flt32nan()),
+	][:]
+
+	for (f, g) : flt32s
+		testr.eq(c, math.trunc(f), g)
+	;;
+}
+
+const trunc02 = {c
+	var flt64s : (flt64, flt64)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(1.0, 1.0),
+		(1.1, 1.0),
+		(0.9, 0.0),
+		(13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+		(-3.5, -3.0),
+		(101.999, 101.0),
+		(std.flt64nan(), std.flt64nan()),
+	][:]
+
+	for (f, g) : flt64s
+		testr.eq(c, math.trunc(f), g)
+	;;
+}
+
+const floor01 = {c
+	var flt32s : (flt32, flt32)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(0.5, 0.0),
+		(1.1, 1.0),
+		(10664524000000000000.0, 10664524000000000000.0),
+		(-3.5, -4.0),
+		(-101.999, -102.0),
+		(-126.999, -127.0),
+		(-127.999, -128.0),
+		(-128.999, -129.0),
+		(std.flt32nan(), std.flt32nan()),
+	][:]
+
+	for (f, g) : flt32s
+		testr.eq(c, math.floor(f), g)
+	;;
+}
+
+const floor02 = {c
+	var flt64s : (flt64, flt64)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(0.5, 0.0),
+		(1.1, 1.0),
+		(13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+		(-3.5, -4.0),
+		(-101.999, -102.0),
+		(std.flt64nan(), std.flt64nan()),
+	][:]
+
+	for (f, g) : flt64s
+		testr.eq(c, math.floor(f), g)
+	;;
+}
+
+const ceil01 = {c
+	var flt32s : (flt32, flt32)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(0.5, 1.0),
+		(-0.1, -0.0),
+		(1.1, 2.0),
+		(10664524000000000000.0, 10664524000000000000.0),
+		(-3.5, -3.0),
+		(-101.999, -101.0),
+		(std.flt32nan(), std.flt32nan()),
+	][:]
+
+	for (f, g) : flt32s
+		testr.eq(c, math.ceil(f), g)
+	;;
+}
+
+const ceil02 = {c
+	var flt64s : (flt64, flt64)[:] = [
+		(0.0, 0.0),
+		(-0.0, -0.0),
+		(0.5, 1.0),
+		(-0.1, -0.0),
+		(1.1, 2.0),
+		(13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+		(-3.5, -3.0),
+		(-101.999, -101.0),
+		(std.flt64nan(), std.flt64nan()),
+	][:]
+
+	for (f, g) : flt64s
+		testr.eq(c, math.ceil(f), g)
+	;;
+}
--- /dev/null
+++ b/lib/math/trunc-impl+posixy-x64-sse4.s
@@ -1,0 +1,41 @@
+.globl math$trunc32
+.globl _math$trunc32
+math$trunc32:
+_math$trunc32:
+	roundss $0x03, %xmm0, %xmm0
+	ret
+
+.globl math$floor32
+.globl _math$floor32
+math$floor32:
+_math$floor32:
+	roundss $0x01, %xmm0, %xmm0
+	ret
+
+.globl math$ceil32
+.globl _math$ceil32
+math$ceil32:
+_math$ceil32:
+	roundss $0x02, %xmm0, %xmm0
+	ret
+
+.globl math$trunc64
+.globl _math$trunc64
+math$trunc64:
+_math$trunc64:
+	roundsd $0x03, %xmm0, %xmm0
+	ret
+
+.globl math$floor64
+.globl _math$floor64
+math$floor64:
+_math$floor64:
+	roundsd $0x01, %xmm0, %xmm0
+	ret
+
+.globl math$ceil64
+.globl _math$ceil64
+math$ceil64:
+_math$ceil64:
+	roundsd $0x02, %xmm0, %xmm0
+	ret
--- /dev/null
+++ b/lib/math/trunc-impl.myr
@@ -1,0 +1,103 @@
+use std
+
+pkg math =
+	pkglocal const trunc32 : (f : flt32 -> flt32)
+	pkglocal const floor32 : (f : flt32 -> flt32)
+	pkglocal const ceil32  : (f : flt32 -> flt32)
+	pkglocal const trunc64 : (f : flt64 -> flt64)
+	pkglocal const floor64 : (f : flt64 -> flt64)
+	pkglocal const ceil64  : (f : flt64 -> flt64)
+;;
+
+const Flt32NegMask : uint32 = (1 << 31)
+const Flt32SigMask : uint32 = (1 << 23) - 1
+
+const Flt64NegMask : uint64 = (1 << 63)
+const Flt64SigMask : uint64 = (1 << 52) - 1
+
+pkglocal const floor32 = {f : flt32
+	var n, e, s
+	(n, e, s) = std.flt32explode(f)
+
+	/* Many special cases */
+	if e >= 23 || f == -0.0
+		-> f
+	elif e < 0
+		if n
+			-> -1.0
+		else
+			-> 0.0
+		;;
+	;;
+
+	if n
+		var fractional_mask = Flt32SigMask >> (e : uint32)
+		if s & fractional_mask == 0
+			-> f
+		else
+			/* Turns out the packing of exp and sig is useful */
+			var u : uint32 = std.flt32bits(f) & ~fractional_mask
+			u += ((1 << 23) >> (e : uint32))
+			-> std.flt32frombits(u)
+		;;
+	;;
+
+	var m : uint32 = (Flt32SigMask >> (e : uint32))
+	-> std.flt32assem(n, e, s & ~m)
+}
+
+pkglocal const trunc32 = {f : flt32
+	if std.flt32bits(f) & Flt32NegMask != 0
+		-> -floor32(-f)
+	else
+		-> floor32(f)
+	;;
+}
+
+pkglocal const ceil32 = {f : flt32
+	-> -floor32(-f)
+}
+
+pkglocal const floor64 = {f : flt64
+	var n, e, s
+	(n, e, s) = std.flt64explode(f)
+
+	/* Many special cases */
+	if e >= 52 || f == -0.0
+		-> f
+	elif e < 0
+		if n
+			-> -1.0
+		else
+			-> 0.0
+		;;
+	;;
+
+	if n
+		var fractional_mask = Flt64SigMask >> (e : uint64)
+		if s & fractional_mask == 0
+			-> f
+		else
+			/* Turns out the packing of exp and sig is useful */
+			var u : uint64 = std.flt64bits(f) & ~fractional_mask
+			u += ((1 << 52) >> (e : uint64))
+			-> std.flt64frombits(u)
+		;;
+	;;
+
+	var m : uint64 = (Flt64SigMask >> (e : uint64))
+	-> std.flt64assem(n, e, s & ~m)
+}
+
+pkglocal const trunc64 = {f : flt64
+	if std.flt64bits(f) & Flt64NegMask != 0
+		-> -floor64(-f)
+	else
+		-> floor64(f)
+	;;
+}
+
+pkglocal const ceil64 = {f : flt64
+	-> -floor64(-f)
+}
+
--- /dev/null
+++ b/lib/math/util.myr
@@ -1,0 +1,174 @@
+use std
+
+pkg math =
+	const flt32fromflt64 : (f : flt64 -> flt32)
+	const flt64fromflt32 : (x : flt32 -> flt64)
+
+	/* For use in various normalizations */
+	const find_first1_64 : (b : uint64, start : int64 -> int64)
+	const find_first1_64_hl : (h : uint64, l : uint64, start : int64 -> int64)
+
+	/* >> and <<, but without wrapping when the shift is >= 64 */
+	const shr : (u : uint64, s : int64 -> uint64)
+	const shl : (u : uint64, s : int64 -> uint64)
+
+	/* Whether RN() requires incrementing after truncating */
+	const need_round_away : (h : uint64, l : uint64, bitpos_last : int64 -> bool)
+;;
+
+const flt64fromflt32 = {f : flt32
+	var n, e, s
+	(n, e, s) = std.flt32explode(f)
+	var xs : uint64 = (s : uint64)
+	var xe : int64 = (e : int64)
+
+	if e == 128
+		-> std.flt64assem(n, 1024, xs)
+	elif e == -127
+		/*
+		  All subnormals in single precision (except 0.0s)
+		  can be upgraded to double precision, since the
+		  exponent range is so much wider.
+		 */
+		var first1 = find_first1_64(xs, 23)
+		if first1 < 0
+			-> std.flt64assem(n, -1023, 0)
+		;;
+		xs = xs << (52 - (first1 : uint64))
+		xe = -126 - (23 - first1)
+		-> std.flt64assem(n, xe, xs)
+	;;
+
+	-> std.flt64assem(n, xe, xs << (52 - 23))
+}
+
+const flt32fromflt64 = {f : flt64
+	var n : bool, e : int64, s : uint64
+	(n, e, s) = std.flt64explode(f)
+	var ts : uint32
+	var te : int32 = (e : int32)
+
+	if e >= 128
+		if e == 1023 && s != 0
+			/* NaN */
+			-> std.flt32assem(n, 128, 1)
+		else
+			/* infinity */
+			-> std.flt32assem(n, 128, 0)
+		;;
+	;;
+
+	if e >= -127
+		/* normal */
+		ts = ((s >> (52 - 23)) : uint32)
+		if need_round_away(0, s, 52 - 23)
+			ts++
+			if ts & (1 << 24) != 0
+				ts >>= 1
+				te++
+			;;
+		;;
+		if te >= -126
+			-> std.flt32assem(n, te, ts)
+		;;
+	;;
+
+	/* subnormal already, will have to go to 0 */
+	if e == -1023
+		-> std.flt32assem(n, -127, 0)
+	;;
+
+	/* subnormal (at least, it will be) */
+	te = -127
+	var shift : int64 = (52 - 23) + (-126 - e)
+	var ts1 = shr(s, shift)
+	ts = (ts1 : uint32)
+	if need_round_away(0, s, shift)
+		ts++
+		if ts & (1 << 23) != 0
+			/* false alarm, it's normal again */
+			te++
+		;;
+	;;
+	-> std.flt32assem(n, te, ts)
+}
+
+/* >> and <<, but without wrapping when the shift is >= 64 */
+const shr = {u : uint64, s : int64
+	if (s : uint64) >= 64
+		-> 0
+	else
+		-> u >> (s : uint64)
+	;;
+}
+
+const shl = {u : uint64, s : int64
+	if (s : uint64) >= 64
+		-> 0
+	else
+		-> u << (s : uint64)
+	;;
+}
+
+/* Find the first 1 bit in a bitstring */
+const find_first1_64 = {b : uint64, start : int64
+	for var j = start; j >= 0; --j
+		var m = shl(1, j)
+		if b & m != 0
+			-> j
+		;;
+	;;
+
+	-> -1
+}
+
+const find_first1_64_hl = {h, l, start
+	var first1_h = find_first1_64(h, start - 64)
+	if first1_h >= 0
+		-> first1_h + 64
+	;;
+
+	-> find_first1_64(l, 63)
+}
+
+/*
+   For [ h ][ l ], where bitpos_last is the position of the last
+   bit that was included in the truncated result (l's last bit has
+   position 0), decide whether rounding up/away is needed. This is
+   true if
+
+    - following bitpos_last is a 1, then a non-zero sequence, or
+
+    - following bitpos_last is a 1, then a zero sequence, and the
+      round would be to even
+ */
+const need_round_away = {h : uint64, l : uint64, bitpos_last : int64
+	var first_omitted_is_1 = false
+	var nonzero_beyond = false
+	if bitpos_last > 64
+		first_omitted_is_1 = h & shl(1, bitpos_last - 1 - 64) != 0
+		nonzero_beyond = nonzero_beyond || h & shr((-1 : uint64), 2 + 64 - (bitpos_last - 64)) != 0
+		nonzero_beyond = nonzero_beyond || (l != 0)
+	else
+		first_omitted_is_1 = l & shl(1, bitpos_last - 1) != 0
+		nonzero_beyond = nonzero_beyond || l & shr((-1 : uint64), 1 + 64 - bitpos_last) != 0
+	;;
+
+	if !first_omitted_is_1
+		-> false
+	;;
+
+	if nonzero_beyond
+		-> true
+	;;
+
+	var hl_is_odd = false
+
+	if bitpos_last >= 64
+		hl_is_odd = h & shl(1, bitpos_last - 64) != 0
+	else
+		hl_is_odd = l & shl(1, bitpos_last) != 0
+	;;
+
+	-> hl_is_odd
+}
--- a/lib/std/fltbits.myr
+++ b/lib/std/fltbits.myr
@@ -9,10 +9,10 @@
 	generic isnan		: (f : @a -> bool) ::floating @a
 	const flt64frombits	: (bits : uint64 -> flt64)
 	const flt32frombits	: (bits : uint32 -> flt32)
-	const flt64explode	: (flt : flt64 -> (bool, uint64, int64))
-	const flt32explode	: (flt : flt32 -> (bool, uint32, int32))
-	const flt64assem	: (sign : bool, mant : uint64, exp : int64 -> flt64)
-	const flt32assem	: (sign : bool, mant : uint32, exp : int32 -> flt32)
+	const flt64explode	: (flt : flt64 -> (bool, int64, uint64))
+	const flt32explode	: (flt : flt32 -> (bool, int32, uint32))
+	const flt64assem	: (sign : bool, exp : int64, mant : uint64 -> flt64)
+	const flt32assem	: (sign : bool, exp : int32, mant : uint32 -> flt32)
 ;;
 
 const flt64bits	= {flt;	-> (&flt : uint64#)#}
@@ -20,6 +20,9 @@
 const flt64frombits	= {bits;	-> (&bits : flt64#)#}
 const flt32frombits	= {bits;	-> (&bits : flt32#)#}
 
+const Dblbias = 1023
+const Fltbias = 127
+
 const flt64explode = {flt
 	var bits, isneg, mant, uexp, exp
 
@@ -31,17 +34,11 @@
 	/* add back the implicit bit if this is not a denormal */
 	if uexp != 0
 		mant |= 1ul << 52
-		exp = (uexp : int64)
-	else
-		exp = 1
 	;;
-	/*
-	   adjust for exponent bias. nb: because we are
-	   treating the mantissa as m.0 instead of 0.m,
-	   our exponent bias needs to be offset by the
-	   size of m
-	*/
-	-> (isneg, mant, exp)
+
+	/* adjust for exponent bias */
+	exp = (uexp : int64) - Dblbias
+	-> (isneg, exp, mant)
 }
 
 const flt32explode = {flt
@@ -55,33 +52,61 @@
 	/* add back the implicit bit if this is not a denormal */
 	if uexp != 0
 		mant |= 1 << 23
-		exp = (uexp : int32)
-	else
-		exp = 1
 	;;
-	/*
-	   adjust for exponent bias. nb: because we are
-	   treating the mantissa as m.0 instead of 0.m,
-	   our exponent bias needs to be offset by the
-	   size of m
-	*/
-	-> (isneg, mant, exp)
+
+	/* adjust for exponent bias */
+	exp = (uexp : int32) - Fltbias
+	-> (isneg, exp, mant)
 }
 
-const flt64assem = {sign, mant, exp
+const flt64assem = {sign, exp, mant
 	var s, m, e
 
+	if exp <= -Dblbias && (mant & (1ul << 52) != 0)
+		var roundup = false
+		var shift : uint64 = ((1 - Dblbias - exp) : uint64)
+		var firstcut = mant & (1 << shift)
+		var restcut = mant & ((1 << shift) - 1)
+		var lastkept = mant & (1 << (shift + 1))
+		roundup = firstcut != 0 && (lastkept != 0 || restcut != 0)
+		mant >>= shift
+		exp = -Dblbias
+		if roundup
+			mant++
+			if (mant & (1ul << 52) != 0)
+				exp++
+			;;
+		;;
+	;;
+
 	s = (sign : uint64)
-	e = (exp : uint64) & 0x7ff
+	e = (exp + Dblbias : uint64) & 0x7ff
 	m = (mant : uint64) & ((1ul<<52) - 1)
 	-> std.flt64frombits((s << 63) | (e << 52) | m)
 }
 
-const flt32assem = {sign, mant, exp
+const flt32assem = {sign, exp, mant
 	var s, m, e
 
+	if exp <= -Fltbias && (mant & (1 << 23) != 0)
+		var roundup = false
+		var shift : uint32 = ((1 - Fltbias - exp) : uint32)
+		var firstcut = mant & (1 << shift)
+		var restcut = mant & ((1 << shift) - 1)
+		var lastkept = mant & (1 << (shift + 1))
+		roundup = firstcut != 0 && (lastkept != 0 || restcut != 0)
+		mant >>= shift
+		exp = -Fltbias
+		if roundup
+			mant++
+			if (mant & (1 << 23) != 0)
+				exp++
+			;;
+		;;
+	;;
+
 	s = (sign : uint32)
-	e = (exp : uint32) & 0xff
+	e = (exp + Fltbias : uint32) & 0xff
 	m = (mant : uint32) & ((1<<23) - 1)
 	-> std.flt32frombits(s << 31 | e << 23 | m)
 
--- a/lib/std/fltfmt.myr
+++ b/lib/std/fltfmt.myr
@@ -24,14 +24,16 @@
 const flt64bfmt = {sb, val, mode, precision
 	var isneg, exp, mant
 
-	(isneg, mant, exp) = flt64explode(val)
-	dragon4(sb, isneg, mant, (exp - 52 : int64), Dblbias, mode, precision)
+	(isneg, exp, mant) = flt64explode(val)
+	exp = max(exp, 1 - Dblbias)
+	dragon4(sb, isneg, mant, exp - 52, Dblbias, mode, precision)
 }
 
 const flt32bfmt = {sb, val, mode, precision
 	var isneg, exp, mant
 
-	(isneg, mant, exp) = flt32explode(val)
+	(isneg, exp, mant) = flt32explode(val)
+	exp = (max((exp : int64), 1 - Fltbias) : int32)
 	dragon4(sb, isneg, (mant : uint64), (exp - 23 : int64), Fltbias, mode, precision)
 }
 
@@ -64,9 +66,9 @@
 	/* initialize */
 	roundup = false
 	u = mkbigint(0)
-	r = bigshli(mkbigint(f), max(e - p, 0))
-	s = bigshli(mkbigint(1), max(0, -(e - p)))
-	mm = bigshli(mkbigint(1), max((e - p), 0))
+	r = bigshli(mkbigint(f), max(e, 0))
+	s = bigshli(mkbigint(1), max(0, -e))
+	mm = bigshli(mkbigint(1), max(e, 0))
 	mp = bigdup(mm)
 
 	/* fixup: unequal gaps */
--- a/lib/std/fltparse.myr
+++ b/lib/std/fltparse.myr
@@ -264,7 +264,7 @@
 	var sign, mant, exp
 	var za
 
-	(sign, mant, exp) = std.flt64explode(z)
+	(sign, exp, mant) = std.flt64explode(z)
 	if std.abs((mant : int64) - (1l << 52) - 1) < (lim.nextinc : int64)
 		mant = 0
 		exp++
@@ -271,7 +271,7 @@
 	else
 		mant += lim.nextinc
 	;;
-	za = std.flt64assem(sign, mant, exp)
+	za = std.flt64assem(sign, exp, mant)
 	-> za
 }
 
--- a/lib/std/test/fltbits.myr
+++ b/lib/std/test/fltbits.myr
@@ -19,6 +19,8 @@
 		[.name = "bits-roundtrip-64", .fn = bitsround64],
 		[.name = "flt32bits", .fn = flt32bits],
 		[.name = "flt64bits", .fn = flt64bits],
+		[.name = "explode-roundtrip-32", .fn = exploderound32],
+		[.name = "explode-roundtrip-64", .fn = exploderound64],
 	][:])
 }
 
@@ -95,6 +97,7 @@
 			(1.0, 0x3f800000),
 			(0.0000123, 0x374e5c19),
 			(-993.83, 0xc478751f),
+			(0.000000000000000000000000000000000000006054601, 0x0041edc4),
 		     ][:]
 		var uprime = std.flt32bits(f)
 		testr.check(c, u == uprime, "flt32bits wrong for {}:  0x{x} != 0x{x}", f, u, uprime)
@@ -112,3 +115,65 @@
 		testr.check(c, u == uprime, "flt64bits wrong for {}:  0x{x} != 0x{x}", f, u, uprime)
 	;;
 }
+
+const exploderound32 = {c
+	for f : [1.0, 0.00001, 123.45, 1111111111111111.2, -1.9, -0.0001, 0.000000000000000000000000000000000000006054601, std.flt32nan()][:]
+		var n, e, s
+		(n, e, s) = std.flt32explode(f)
+		var g = std.flt32assem(n, e, s)
+		testr.check(c, f == g, "assem o explode non-identity: {} != {}", f, g)
+	;;
+
+	/*
+	   The exponents and significands need to be rather specific
+	   in order for flt32assem to work as expected
+	 */
+	for (n, e, s) : [
+			(false, -127, 0),
+			(true, -127, 0),
+			(false, -127, 0x399),
+			(true, -127, 0x23),
+			(false, 45, (1 << 23) | 0x23),
+			(true, -12, (1 << 23) | 0x3a2),
+			(true, -126, (1 << 23) | 0x3a1),
+			(false, -127, 4320708),
+		][:]
+		var m, f, t
+		(m, f, t) = std.flt32explode(std.flt32assem(n, e, s))
+		testr.check(c, n == m, "explode o assem non-identity: {} != {}", (n, e, s), (m, f, t))
+		testr.check(c, e == f, "explode o assem non-identity: {} != {}", (n, e, s), (m, f, t))
+		testr.check(c, s == t, "explode o assem non-identity: {} != {}", (n, e, s), (m, f, t))
+	;;
+}
+
+const exploderound64 = {c
+	for f : [1.0, 0.00001, 123.45, 1111111111111e+309, -1.9, -0.0001, std.flt64nan()][:]
+		var n, e, s
+		(n, e, s) = std.flt64explode(f)
+		var g = std.flt64assem(n, e, s)
+		testr.check(c, f == g, "assem o explode non-identity: {} != {}", f, g)
+	;;
+
+	/*
+	   The exponents and significands need to be rather specific
+	   in order for flt64assem to work as expected
+	 */
+	for (n, e, s) : [
+			(false, -1023, 0),
+			(true, -1023, 0),
+			(false, -1023, 0x399),
+			(true, -1023, 0x23),
+			(false, 45, (1 << 52) | 0xa33bc),
+			(true, -12, (1 << 52) | 0x3),
+			(true, -200, (1 << 52) | 0x11aabbcc),
+			(true, 543, (1 << 52) | 0x3a1),
+			(true, 1001, (1 << 52) | 0x99aa228),
+		][:]
+		var m, f, t
+		(m, f, t) = std.flt64explode(std.flt64assem(n, e, s))
+		testr.check(c, n == m, "explode o assem non-identity: {} != {}", (n, e, s), (m, f, t))
+		testr.check(c, e == f, "explode o assem non-identity: {} != {}", (n, e, s), (m, f, t))
+		testr.check(c, s == t, "explode o assem non-identity: {} != {}", (n, e, s), (m, f, t))
+	;;
+}
+
--- a/lib/std/test/fmt.myr
+++ b/lib/std/test/fmt.myr
@@ -66,6 +66,7 @@
 	check("7b", "{x}", 123)
 	check("0x7b", "0x{x}", 123)
 	check("0.0", "{}", 0.0)
+	check("-0.0", "{}", -0.0)
 	check("0.3", "{}", 0.3)
 	check("0.3", "{}", (0.3 : flt32))
 	check("1.0", "{}", 1.0)
--- a/mbld/opts.myr
+++ b/mbld/opts.myr
@@ -34,7 +34,15 @@
 	const parseversion	: (v : byte[:] -> (int, int, int))
 
 	/* not exactly portable, but good enough for now */
+	const CpuidSSE2 : uint64= 0x400000000000000
 	const CpuidSSE4 : uint64= 0x180000
+
+	/*
+	   Intel manuals (vol 1, 14.5.3) say AVX, OSXSAVE also
+	   needed. For full portability, XGETBV also needs to be
+	   checked, though it isn't right now.
+	 */
+	const CpuidFMA  : uint64= 0x18001000
 	extern const cpufeatures : (-> uint64)
 ;;
 
--- a/mbld/syssel.myr
+++ b/mbld/syssel.myr
@@ -162,8 +162,14 @@
 		match opt_arch
 		| "x64":	
 			tag(b, "x64")
+			if opt_cpufeatures & CpuidSSE2 == CpuidSSE2
+				tag(b, "sse2")
+			;;
 			if opt_cpufeatures & CpuidSSE4 == CpuidSSE4
 				tag(b, "sse4")
+			;;
+			if opt_cpufeatures & CpuidFMA == CpuidFMA
+				tag(b, "fma")
 			;;
 		| unknown:
 			std.fatal("unknown architecture {}\n", unknown)
--- a/mi/flatten.c
+++ b/mi/flatten.c
@@ -114,10 +114,10 @@
 static Node *
 temp(Flattenctx *flatten, Node *e)
 {
-	Node *t, *dcl;
+	Node *t;
 
 	assert(e->type == Nexpr);
-	t = gentemp(e->loc, e->expr.type, &dcl);
+	t = gentemp(e->loc, e->expr.type, NULL);
 	return t;
 }
 
@@ -225,23 +225,20 @@
 }
 
 static void
-dispose(Flattenctx *s, Stab *st)
+dispose(Flattenctx *s, Stab *st, size_t n)
 {
-	Node *d, *call, *func, *val;
+	Node *e, *call, *func;
 	Trait *tr;
 	Type *ty;
 	size_t i;
 
 	tr = traittab[Tcdisp];
-	/* dispose in reverse order of declaration */
-	for (i = st->nautodcl; i-- > 0;) {
-		d = st->autodcl[i];
-		ty = decltype(d);
-		val = mkexpr(Zloc, Ovar, d->decl.name, NULL);
-		val->expr.type = ty;
-		val->expr.did = d->decl.did;
+	/* dispose in reverse order of appearance */
+	for (i = st->nautotmp; i-- > n;) {
+		e = st->autotmp[i];
+		ty = exprtype(e);
 		func = traitfn(Zloc, tr, "__dispose__", ty);
-		call = mkexpr(Zloc, Ocall, func, val, NULL);
+		call = mkexpr(Zloc, Ocall, func, e, NULL);
 		call->expr.type = mktype(Zloc, Tyvoid);
 		flatten(s, call);
 	}
@@ -460,18 +457,23 @@
 
 /* returns 1 when the exit jump needs to be emitted */
 static int
-exitscope(Flattenctx *s, Stab *stop, Srcloc loc, int x)
+exitscope(Flattenctx *s, Stab *stop, Srcloc loc, Exit x)
 {
+	Node *exit;
 	Stab *st;
 
 	for (st = s->curst;; st = st->super) {
-		if (st->exit[x]) {
-			jmp(s, st->exit[x]);
+		exit = st->exit[x];
+		if (st->ndisposed[x] < st->nautotmp) {
+			st->exit[x] = genlbl(loc);
+			flatten(s, st->exit[x]);
+			dispose(s, st, st->ndisposed[x]);
+			st->ndisposed[x] = st->nautotmp;
+		}
+		if (exit) {
+			jmp(s, exit);
 			return 0;
 		}
-		st->exit[x] = genlbl(loc);
-		flatten(s, st->exit[x]);
-		dispose(s, st);
 		if ((!stop && st->isfunc) || st == stop) {
 			return 1;
 		}
@@ -503,6 +505,12 @@
 	r = NULL;
 	args = n->expr.args;
 	switch (exprop(n)) {
+	case Oauto:
+		r = rval(s, n->expr.args[0]);
+		t = temp(s, r);
+		r = asn(t, r);
+		lappend(&s->curst->autotmp, &s->curst->nautotmp, t);
+		break;
 	case Osize:
 		r = n;	/* don't touch subexprs; they're a pseudo decl */
 		break;
@@ -696,7 +704,10 @@
 		flatten(s, n->block.stmts[i]);
 	}
 	assert(s->curst == n->block.scope);
-	dispose(s, s->curst);
+	if (st->isfunc)
+		exitscope(s, NULL, Zloc, Xret);
+	else
+		dispose(s, s->curst, 0);
 	s->curst = st;
 }
 
--- a/parse/dump.c
+++ b/parse/dump.c
@@ -186,7 +186,6 @@
 		findentf(fd, depth + 1, "isimport=%d\n", n->decl.isimport);
 		findentf(fd, depth + 1, "isnoret=%d\n", n->decl.isnoret);
 		findentf(fd, depth + 1, "isexportinit=%d\n", n->decl.isexportinit);
-		findentf(fd, depth + 1, "isauto=%d\n", n->decl.isauto);
 		findentf(fd, depth, ")\n");
 		outsym(n, fd, depth + 1);
 		outnode(n->decl.init, fd, depth + 1);
--- a/parse/gram.y
+++ b/parse/gram.y
@@ -148,7 +148,7 @@
 %type<node> littok literal lorexpr landexpr borexpr strlit bandexpr
 %type<node> cmpexpr addexpr mulexpr shiftexpr prefixexpr ternexpr
 %type<node> postfixexpr funclit seqlit tuplit name block stmt label
-%type<node> use fnparam declbody declcore typedeclcore autodecl structent
+%type<node> use fnparam declbody declcore typedeclcore structent
 %type<node> arrayelt structelt tuphead ifstmt forstmt whilestmt
 %type<node> matchstmt elifs optexprln loopcond optexpr match
 
@@ -420,8 +420,8 @@
 	}
 	;
 
-declbody: autodecl Tasn expr {$$ = $1; $1->decl.init = $3;}
-	| autodecl
+declbody: declcore Tasn expr {$$ = $1; $1->decl.init = $3;}
+	| declcore
 	;
 
 declcore: name {$$ = mkdecl($1->loc, $1, mktyvar($1->loc));}
@@ -432,10 +432,6 @@
 	: name Tcolon type {$$ = mkdecl($1->loc, $1, $3);}
 	;
 
-autodecl: Tauto declcore {$$ = $2; $$->decl.isauto = 1;}
-	| declcore
-	;
-
 name	: Tident {$$ = mkname($1->loc, $1->id);}
 	| Tident Tdot Tident {
 		$$ = mknsname($3->loc, $1->id, $3->id);
@@ -771,7 +767,8 @@
 shiftop : Tbsl | Tbsr;
 
 prefixexpr
-	: Tinc prefixexpr	{$$ = mkexpr($1->loc, Opreinc, $2, NULL);}
+	: Tauto prefixexpr	{$$ = mkexpr($1->loc, Oauto, $2, NULL);}
+	| Tinc prefixexpr	{$$ = mkexpr($1->loc, Opreinc, $2, NULL);}
 	| Tdec prefixexpr	{$$ = mkexpr($1->loc, Opredec, $2, NULL);}
 	| Tband prefixexpr	{$$ = mkexpr($1->loc, Oaddr, $2, NULL);}
 	| Tlnot prefixexpr	{$$ = mkexpr($1->loc, Olnot, $2, NULL);}
@@ -930,7 +927,7 @@
 	| /* empty */ {$$.nl = NULL; $$.nn = 0;}
 	;
 
-fnparam : autodecl {$$ = $1;}
+fnparam : declcore {$$ = $1;}
 	| Tgap { $$ = mkpseudodecl($1->loc, mktyvar($1->loc)); }
 	| Tgap Tcolon type { $$ = mkpseudodecl($1->loc, $3); }
 	;
--- a/parse/infer.c
+++ b/parse/infer.c
@@ -263,7 +263,7 @@
 	Type *ty;
 
 	tr = traittab[Tcdisp];
-	ty = decltype(n);
+	ty = exprtype(n);
 	assert(tr->nproto == 1);
 	if (hthas(tr->proto[0]->decl.impls, ty))
 		return;
@@ -883,7 +883,7 @@
 				if (update)
 					bsput(ty->trneed, tr->uid);
 				return 1;
-			} 
+			}
 			if (bshas(tm->traits, tr->uid))
 				return 1;
 			if (tm->name && ty->type == Tyname) {
@@ -1641,6 +1641,13 @@
 	infernode(&n->expr.idx, NULL, NULL);
 	n = checkns(n, np);
 	switch (exprop(n)) {
+	case Oauto:	/* @a -> @a */
+		infersub(n, ret, sawret, &isconst);
+		t = type(args[0]);
+		constrain(n, t, traittab[Tcdisp]);
+		n->expr.isconst = isconst;
+		settype(n, t);
+		break;
 		/* all operands are same type */
 	case Oadd:	/* @a + @a -> @a */
 	case Osub:	/* @a - @a -> @a */
@@ -2058,8 +2065,6 @@
 		inferdecl(n);
 		if (hasparams(type(n)) && !ingeneric)
 			fatal(n, "generic type in non-generic near %s", ctxstr(n));
-		if (n->decl.isauto)
-			constrain(n, type(n), traittab[Tcdisp]);
 		popenv(n->decl.env);
 		indentdepth--;
 		if (n->decl.isgeneric)
@@ -2618,8 +2623,6 @@
 		if (streq(declname(n), "__init__"))
 			if (!initcompatible(tybase(decltype(n))))
 				fatal(n, "__init__ must be (->void), got %s", tystr(decltype(n)));
-		if (n->decl.isauto)
-			adddispspecialization(n, curstab());
 		popenv(n->decl.env);
 		break;
 	case Nblock:
@@ -2666,6 +2669,8 @@
 			settype(n->expr.args[0], exprtype(n));
 			settype(n->expr.args[0]->expr.args[0], exprtype(n));
 		}
+		if (exprop(n) == Oauto)
+			adddispspecialization(n, curstab());
 		for (i = 0; i < n->expr.nargs; i++)
 			typesub(n->expr.args[i], noerr);
 		if (!noerr)
@@ -2731,7 +2736,15 @@
 	for (i = 0; i < nspecializations; i++) {
 		pushstab(specializationscope[i]);
 		n = specializations[i];
-		if (n->type == Nexpr) {
+		if (n->type == Nexpr && exprop(n) == Oauto) {
+			tr = traittab[Tcdisp];
+			assert(tr->nproto == 1);
+			ty = exprtype(n);
+			dt = mktyfunc(n->loc, NULL, 0, mktype(n->loc, Tyvoid));
+			lappend(&dt->sub, &dt->nsub, ty);
+			d = specializedcl(tr->proto[0], ty, dt, &name);
+			htput(tr->proto[0]->decl.impls, ty, d);
+		} else if (n->type == Nexpr && exprop(n) == Ovar) {
 			d = specializedcl(genericdecls[i], n->expr.param, n->expr.type, &name);
 			n->expr.args[0] = name;
 			n->expr.did = d->decl.did;
@@ -2753,14 +2766,6 @@
 			it = itertype(n->iterstmt.seq, mktype(n->loc, Tyvoid));
 			d = specializedcl(tr->proto[1], ty, it, &name);
 			htput(tr->proto[1]->decl.impls, ty, d);
-		} else if (n->type == Ndecl && n->decl.isauto) {
-			tr = traittab[Tcdisp];
-			assert(tr->nproto == 1);
-			ty = decltype(n);
-			dt = mktyfunc(n->loc, NULL, 0, mktype(n->loc, Tyvoid));
-			lappend(&dt->sub, &dt->nsub, ty);
-			d = specializedcl(tr->proto[0], ty, dt, &name);
-			htput(tr->proto[0]->decl.impls, ty, d);
 		} else {
 			die("unknown node for specialization\n");
 		}
--- a/parse/ops.def
+++ b/parse/ops.def
@@ -1,5 +1,6 @@
 /* operator name, is it pure, pretty name */
 O(Obad,         1,	OTmisc,	"BAD")
+O(Oauto,	1,	OTpre,	"auto")
 O(Oadd,	        1,	OTbin,	"+")
 O(Osub,	        1,	OTbin,	"-")
 O(Omul,	        1,	OTbin,	"*")
@@ -105,4 +106,3 @@
 O(Ouge,	        1,	OTmisc, NULL)
 O(Oult,	        1,	OTmisc, NULL)
 O(Oule,	        1,	OTmisc, NULL)
-
--- a/parse/parse.h
+++ b/parse/parse.h
@@ -108,10 +108,12 @@
 	Htab *lbl;	/* labels */
 	Htab *impl;	/* trait implementations: really a set of implemented traits. */
 
-	Node **autodcl;	/* declarations in dcl marked 'auto' */
-	size_t nautodcl;
+	/* See mi/flatten.c for the following. */
+	Node **autotmp; /* temporaries for 'auto' expressions */
+	size_t nautotmp;
 
 	Node *exit[Nexits];
+	size_t ndisposed[Nexits];
 };
 
 struct Tyenv {
@@ -331,7 +333,6 @@
 			char isnoret;
 			char isexportinit;
 			char isinit;
-			char isauto;
 		} decl;
 
 		struct {
--- a/parse/stab.c
+++ b/parse/stab.c
@@ -417,10 +417,6 @@
 
 	st = findstab(st, s->decl.name);
 	old = htget(st->dcl, s->decl.name);
-	if (s->decl.isauto) {
-		assert(!old);
-		lappend(&st->autodcl, &st->nautodcl, s);
-	}
 	if (!old)
 		forcedcl(st, s);
 	else if (!mergedecl(old, s))
--- a/test/pkgtrait.myr
+++ b/test/pkgtrait.myr
@@ -8,7 +8,6 @@
 ;;
 
 const main = {
-	var auto r : regex.regex#
-	r = std.try(regex.compile(".*"))
+	auto std.try(regex.compile(".*"))
 	std.exit(42)
 }