shithub: dav1d

Download patch

ref: 6a10a981e3e1b15b4a2986b1853f4bac229a10ff
parent: 12dc2be0e918efd204d47eb5d9686a2d277aee71
author: Ronald S. Bultje <[email protected]>
date: Fri Dec 14 10:38:57 EST 2018

Rewrite inverse transforms to prevent integer overflows

The basic idea is that with intermediates of 19+sign bits and
multipliers of 12+sign bits, the products are 19+12=31+sign
bits, and adding two of these together can overflow, which is UB
in C. These are not valid AV1 streams, but they are codable, and
so although we don't particularly care about the pixel-level
output for such streams, we do want to prevent triggering UB,
since that could be considered a security vulnerability.

To resolve this, we limit all multipliers to 11 bits by inverting
them:

(a * constant_1 + b * constant_2 + 2048) >> 12, where
constant_1 < 2048 but constant_2 >= 2048, is identical to:
((a * constant_1 + b * (4096 - constant_2) + 2048) >> 12) + b,
and 4096 - constant_2 < 2048. In other places, where both
constants are a multiple of 2, we can reduce the magnitude of
both and round/shift by 11 instead of 12.

Do this in dct4,8,16,32,64 as well as adst8,16. Also slightly
simplify the final phase of idct64_1d by moving the add/sub to
before the multiply.

The adst4 is rewritten to be shaped like a matrix-multiply, and
then use the same idea on all 4 multipliers in the matrix, since
the sum of all 4 multipliers is still under 4096 in all cases.

Fixes clusterfuzz-testcase-minimized-dav1d_fuzzer-5709759466962944,
credits to oss-fuzz. Also fixes #223.

--- a/src/itx_1d.c
+++ b/src/itx_1d.c
@@ -34,6 +34,32 @@
 
 #define CLIP(a) iclip(a, min, max)
 
+/*
+ * In some places, we use a pattern like this:
+ * t2 = ((in1 *  1567         - in3 * (3784 - 4096) + 2048) >> 12) - in3;
+ * even though the reference code might use something like:
+ * t2 =  (in1 *  1567         - in3 *  3784         + 2048) >> 12;
+ *
+ * The reason for this is that for 12 bits/component bitstreams (corrupt/
+ * invalid ones, but they are codable nonetheless), each coefficient or
+ * input can be 19(+sign) bits, and therefore if the combination of the
+ * two multipliers (each 12 bits) is >= 4096, the result of the add/sub
+ * after the pair of multiplies will exceed the 31+sign bit range. Signed
+ * integer overflows are UB in C, and we'd like to prevent that.
+ *
+ * To work around this, we invert one of the two coefficients (or, if both are
+ * multiples of 2, we reduce their magnitude by one bit). It should be noted
+ * that SIMD implementations do not have to follow this exact behaviour. The
+ * AV1 spec clearly states that the result of the multiply/add pairs should
+ * fit in 31+sign bit intermediates, and that streams violating this convention
+ * are not AV1-compliant. So, as long as we don't trigger UB (which some people
+ * would consider a security vulnerability), we're fine. So, SIMD can simply
+ * use the faster implementation, even if that might in some cases result in
+ * integer overflows, since these are not considered valid AV1 anyway, and in
+ * e.g. x86 assembly, integer overflows are not considered UB, but they merely
+ * wrap around.
+ */
+
 static void NOINLINE
 inv_dct4_1d(const coef *const in, const ptrdiff_t in_s,
             coef *const out, const ptrdiff_t out_s, const int max)
@@ -44,8 +70,8 @@
 
     int t0 = ((in0 + in2) * 181 + 128) >> 8;
     int t1 = ((in0 - in2) * 181 + 128) >> 8;
-    int t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
-    int t3 = (in1 * 3784 + in3 * 1567 + 2048) >> 12;
+    int t2 = ((in1 *  1567         - in3 * (3784 - 4096) + 2048) >> 12) - in3;
+    int t3 = ((in1 * (3784 - 4096) + in3 *  1567         + 2048) >> 12) + in1;
 
     out[0 * out_s] = CLIP(t0 + t3);
     out[1 * out_s] = CLIP(t1 + t2);
@@ -65,10 +91,10 @@
     const int in1 = in[1 * in_s], in3 = in[3 * in_s];
     const int in5 = in[5 * in_s], in7 = in[7 * in_s];
 
-    int t4a = (in1 *  799 - in7 * 4017 + 2048) >> 12;
-    int t5a = (in5 * 3406 - in3 * 2276 + 2048) >> 12;
-    int t6a = (in5 * 2276 + in3 * 3406 + 2048) >> 12;
-    int t7a = (in1 * 4017 + in7 *  799 + 2048) >> 12;
+    int t4a = ((in1 *   799         - in7 * (4017 - 4096) + 2048) >> 12) - in7;
+    int t5a =  (in5 *  1703         - in3 *  1138         + 1024) >> 11;
+    int t6a =  (in5 *  1138         + in3 *  1703         + 1024) >> 11;
+    int t7a = ((in1 * (4017 - 4096) + in7 *  799          + 2048) >> 12) + in1;
 
     int t4  = CLIP(t4a + t5a);
         t5a = CLIP(t4a - t5a);
@@ -102,14 +128,14 @@
     const int in9  = in[ 9 * in_s], in11 = in[11 * in_s];
     const int in13 = in[13 * in_s], in15 = in[15 * in_s];
 
-    int t8a  = (in1  *  401 - in15 * 4076 + 2048) >> 12;
-    int t15a = (in1  * 4076 + in15 *  401 + 2048) >> 12;
-    int t9a  = (in9  * 3166 - in7  * 2598 + 2048) >> 12;
-    int t14a = (in9  * 2598 + in7  * 3166 + 2048) >> 12;
-    int t10a = (in5  * 1931 - in11 * 3612 + 2048) >> 12;
-    int t13a = (in5  * 3612 + in11 * 1931 + 2048) >> 12;
-    int t11a = (in13 * 3920 - in3  * 1189 + 2048) >> 12;
-    int t12a = (in13 * 1189 + in3  * 3920 + 2048) >> 12;
+    int t8a  = ((in1  *   401         - in15 * (4076 - 4096) + 2048) >> 12) - in15;
+    int t15a = ((in1  * (4076 - 4096) + in15 *   401         + 2048) >> 12) + in1;
+    int t9a  =  (in9  *  1583         - in7  *  1299         + 1024) >> 11;
+    int t14a =  (in9  *  1299         + in7  *  1583         + 1024) >> 11;
+    int t10a = ((in5  *  1931         - in11 * (3612 - 4096) + 2048) >> 12) - in11;
+    int t13a = ((in5  * (3612 - 4096) + in11 *  1931         + 2048) >> 12) + in5;
+    int t11a = ((in13 * (3920 - 4096) - in3  *  1189         + 2048) >> 12) + in13;
+    int t12a = ((in13 *  1189         + in3  * (3920 - 4096) + 2048) >> 12) + in3;
 
     int t8  = CLIP(t8a  + t9a);
     int t9  = CLIP(t8a  - t9a);
@@ -120,10 +146,10 @@
     int t14 = CLIP(t15a - t14a);
     int t15 = CLIP(t15a + t14a);
 
-    t9a  = (  t14 * 1567 - t9  * 3784  + 2048) >> 12;
-    t14a = (  t14 * 3784 + t9  * 1567  + 2048) >> 12;
-    t10a = (-(t13 * 3784 + t10 * 1567) + 2048) >> 12;
-    t13a = (  t13 * 1567 - t10 * 3784  + 2048) >> 12;
+    t9a  = ((  t14 *  1567         - t9  * (3784 - 4096)  + 2048) >> 12) - t9;
+    t14a = ((  t14 * (3784 - 4096) + t9  *  1567          + 2048) >> 12) + t14;
+    t10a = ((-(t13 * (3784 - 4096) + t10 *  1567)         + 2048) >> 12) - t13;
+    t13a = ((  t13 *  1567         - t10 * (3784 - 4096)  + 2048) >> 12) - t10;
 
     t8a  = CLIP(t8   + t11);
     t9   = CLIP(t9a  + t10a);
@@ -175,22 +201,22 @@
     const int in25 = in[25 * in_s], in27 = in[27 * in_s];
     const int in29 = in[29 * in_s], in31 = in[31 * in_s];
 
-    int t16a = (in1  *  201 - in31 * 4091 + 2048) >> 12;
-    int t31a = (in1  * 4091 + in31 *  201 + 2048) >> 12;
-    int t17a = (in17 * 3035 - in15 * 2751 + 2048) >> 12;
-    int t30a = (in17 * 2751 + in15 * 3035 + 2048) >> 12;
-    int t18a = (in9  * 1751 - in23 * 3703 + 2048) >> 12;
-    int t29a = (in9  * 3703 + in23 * 1751 + 2048) >> 12;
-    int t19a = (in25 * 3857 - in7  * 1380 + 2048) >> 12;
-    int t28a = (in25 * 1380 + in7  * 3857 + 2048) >> 12;
-    int t20a = (in5  *  995 - in27 * 3973 + 2048) >> 12;
-    int t27a = (in5  * 3973 + in27 *  995 + 2048) >> 12;
-    int t21a = (in21 * 3513 - in11 * 2106 + 2048) >> 12;
-    int t26a = (in21 * 2106 + in11 * 3513 + 2048) >> 12;
-    int t22a = (in13 * 2440 - in19 * 3290 + 2048) >> 12;
-    int t25a = (in13 * 3290 + in19 * 2440 + 2048) >> 12;
-    int t23a = (in29 * 4052 - in3  *  601 + 2048) >> 12;
-    int t24a = (in29 *  601 + in3  * 4052 + 2048) >> 12;
+    int t16a = ((in1  *   201         - in31 * (4091 - 4096) + 2048) >> 12) - in31;
+    int t31a = ((in1  * (4091 - 4096) + in31 *   201         + 2048) >> 12) + in1;
+    int t17a = ((in17 * (3035 - 4096) - in15 *  2751         + 2048) >> 12) + in17;
+    int t30a = ((in17 *  2751         + in15 * (3035 - 4096) + 2048) >> 12) + in15;
+    int t18a = ((in9  *  1751         - in23 * (3703 - 4096) + 2048) >> 12) - in23;
+    int t29a = ((in9  * (3703 - 4096) + in23 *  1751         + 2048) >> 12) + in9;
+    int t19a = ((in25 * (3857 - 4096) - in7  *  1380         + 2048) >> 12) + in25;
+    int t28a = ((in25 *  1380         + in7  * (3857 - 4096) + 2048) >> 12) + in7;
+    int t20a = ((in5  *   995         - in27 * (3973 - 4096) + 2048) >> 12) - in27;
+    int t27a = ((in5  * (3973 - 4096) + in27 *   995         + 2048) >> 12) + in5;
+    int t21a = ((in21 * (3513 - 4096) - in11 *  2106         + 2048) >> 12) + in21;
+    int t26a = ((in21 *  2106         + in11 * (3513 - 4096) + 2048) >> 12) + in11;
+    int t22a =  (in13 *  1220         - in19 *  1645         + 1024) >> 11;
+    int t25a =  (in13 *  1645         + in19 *  1220         + 1024) >> 11;
+    int t23a = ((in29 * (4052 - 4096) - in3  *   601         + 2048) >> 12) + in29;
+    int t24a = ((in29 *   601         + in3  * (4052 - 4096) + 2048) >> 12) + in3;
 
     int t16 = CLIP(t16a + t17a);
     int t17 = CLIP(t16a - t17a);
@@ -209,14 +235,14 @@
     int t30 = CLIP(t31a - t30a);
     int t31 = CLIP(t31a + t30a);
 
-    t17a = (  t30 *  799 - t17 * 4017  + 2048) >> 12;
-    t30a = (  t30 * 4017 + t17 *  799  + 2048) >> 12;
-    t18a = (-(t29 * 4017 + t18 *  799) + 2048) >> 12;
-    t29a = (  t29 *  799 - t18 * 4017  + 2048) >> 12;
-    t21a = (  t26 * 3406 - t21 * 2276  + 2048) >> 12;
-    t26a = (  t26 * 2276 + t21 * 3406  + 2048) >> 12;
-    t22a = (-(t25 * 2276 + t22 * 3406) + 2048) >> 12;
-    t25a = (  t25 * 3406 - t22 * 2276  + 2048) >> 12;
+    t17a = ((  t30 *   799         - t17 * (4017 - 4096)  + 2048) >> 12) - t17;
+    t30a = ((  t30 * (4017 - 4096) + t17 *   799          + 2048) >> 12) + t30;
+    t18a = ((-(t29 * (4017 - 4096) + t18 *   799)         + 2048) >> 12) - t29;
+    t29a = ((  t29 *   799         - t18 * (4017 - 4096)  + 2048) >> 12) - t18;
+    t21a =  (  t26 *  1703         - t21 *  1138          + 1024) >> 11;
+    t26a =  (  t26 *  1138         + t21 *  1703          + 1024) >> 11;
+    t22a =  (-(t25 *  1138         + t22 *  1703        ) + 1024) >> 11;
+    t25a =  (  t25 *  1703         - t22 *  1138          + 1024) >> 11;
 
     t16a = CLIP(t16  + t19);
     t17  = CLIP(t17a + t18a);
@@ -235,14 +261,14 @@
     t30  = CLIP(t30a + t29a);
     t31a = CLIP(t31  + t28);
 
-    t18a = (  t29  * 1567 - t18  * 3784  + 2048) >> 12;
-    t29a = (  t29  * 3784 + t18  * 1567  + 2048) >> 12;
-    t19  = (  t28a * 1567 - t19a * 3784  + 2048) >> 12;
-    t28  = (  t28a * 3784 + t19a * 1567  + 2048) >> 12;
-    t20  = (-(t27a * 3784 + t20a * 1567) + 2048) >> 12;
-    t27  = (  t27a * 1567 - t20a * 3784  + 2048) >> 12;
-    t21a = (-(t26  * 3784 + t21  * 1567) + 2048) >> 12;
-    t26a = (  t26  * 1567 - t21  * 3784  + 2048) >> 12;
+    t18a = ((  t29  *  1567         - t18  * (3784 - 4096)  + 2048) >> 12) - t18;
+    t29a = ((  t29  * (3784 - 4096) + t18  *  1567          + 2048) >> 12) + t29;
+    t19  = ((  t28a *  1567         - t19a * (3784 - 4096)  + 2048) >> 12) - t19a;
+    t28  = ((  t28a * (3784 - 4096) + t19a *  1567          + 2048) >> 12) + t28a;
+    t20  = ((-(t27a * (3784 - 4096) + t20a *  1567)         + 2048) >> 12) - t27a;
+    t27  = ((  t27a *  1567         - t20a * (3784 - 4096)  + 2048) >> 12) - t20a;
+    t21a = ((-(t26  * (3784 - 4096) + t21  *  1567)         + 2048) >> 12) - t26;
+    t26a = ((  t26  *  1567         - t21  * (3784 - 4096)  + 2048) >> 12) - t21;
 
     t16  = CLIP(t16a + t23a);
     t17a = CLIP(t17  + t22);
@@ -330,38 +356,38 @@
     const int in57 = in[57 * in_s], in59 = in[59 * in_s];
     const int in61 = in[61 * in_s], in63 = in[63 * in_s];
 
-    int t32a = (in1  *  101 - in63 * 4095 + 2048) >> 12;
-    int t33a = (in33 * 2967 - in31 * 2824 + 2048) >> 12;
-    int t34a = (in17 * 1660 - in47 * 3745 + 2048) >> 12;
-    int t35a = (in49 * 3822 - in15 * 1474 + 2048) >> 12;
-    int t36a = (in9  *  897 - in55 * 3996 + 2048) >> 12;
-    int t37a = (in41 * 3461 - in23 * 2191 + 2048) >> 12;
-    int t38a = (in25 * 2359 - in39 * 3349 + 2048) >> 12;
-    int t39a = (in57 * 4036 - in7  *  700 + 2048) >> 12;
-    int t40a = (in5  *  501 - in59 * 4065 + 2048) >> 12;
-    int t41a = (in37 * 3229 - in27 * 2520 + 2048) >> 12;
-    int t42a = (in21 * 2019 - in43 * 3564 + 2048) >> 12;
-    int t43a = (in53 * 3948 - in11 * 1092 + 2048) >> 12;
-    int t44a = (in13 * 1285 - in51 * 3889 + 2048) >> 12;
-    int t45a = (in45 * 3659 - in19 * 1842 + 2048) >> 12;
-    int t46a = (in29 * 2675 - in35 * 3102 + 2048) >> 12;
-    int t47a = (in61 * 4085 - in3  *  301 + 2048) >> 12;
-    int t48a = (in61 *  301 + in3  * 4085 + 2048) >> 12;
-    int t49a = (in29 * 3102 + in35 * 2675 + 2048) >> 12;
-    int t50a = (in45 * 1842 + in19 * 3659 + 2048) >> 12;
-    int t51a = (in13 * 3889 + in51 * 1285 + 2048) >> 12;
-    int t52a = (in53 * 1092 + in11 * 3948 + 2048) >> 12;
-    int t53a = (in21 * 3564 + in43 * 2019 + 2048) >> 12;
-    int t54a = (in37 * 2520 + in27 * 3229 + 2048) >> 12;
-    int t55a = (in5  * 4065 + in59 *  501 + 2048) >> 12;
-    int t56a = (in57 *  700 + in7  * 4036 + 2048) >> 12;
-    int t57a = (in25 * 3349 + in39 * 2359 + 2048) >> 12;
-    int t58a = (in41 * 2191 + in23 * 3461 + 2048) >> 12;
-    int t59a = (in9  * 3996 + in55 *  897 + 2048) >> 12;
-    int t60a = (in49 * 1474 + in15 * 3822 + 2048) >> 12;
-    int t61a = (in17 * 3745 + in47 * 1660 + 2048) >> 12;
-    int t62a = (in33 * 2824 + in31 * 2967 + 2048) >> 12;
-    int t63a = (in1  * 4095 + in63 *  101 + 2048) >> 12;
+    int t32a = ((in1  *   101         - in63 * (4095 - 4096) + 2048) >> 12) - in63;
+    int t33a = ((in33 * (2967 - 4096) - in31 *  2824         + 2048) >> 12) + in33;
+    int t34a = ((in17 *  1660         - in47 * (3745 - 4096) + 2048) >> 12) - in47;
+    int t35a =  (in49 *  1911         - in15 *   737         + 1024) >> 11;
+    int t36a = ((in9  *   897         - in55 * (3996 - 4096) + 2048) >> 12) - in55;
+    int t37a = ((in41 * (3461 - 4096) - in23 *  2191         + 2048) >> 12) + in41;
+    int t38a = ((in25 *  2359         - in39 * (3349 - 4096) + 2048) >> 12) - in39;
+    int t39a =  (in57 *  2018         - in7  *   350         + 1024) >> 11;
+    int t40a = ((in5  *   501         - in59 * (4065 - 4096) + 2048) >> 12) - in59;
+    int t41a = ((in37 * (3229 - 4096) - in27 *  2520         + 2048) >> 12) + in37;
+    int t42a = ((in21 *  2019         - in43 * (3564 - 4096) + 2048) >> 12) - in43;
+    int t43a =  (in53 *  1974         - in11 *   546         + 1024) >> 11;
+    int t44a = ((in13 *  1285         - in51 * (3889 - 4096) + 2048) >> 12) - in51;
+    int t45a = ((in45 * (3659 - 4096) - in19 *  1842         + 2048) >> 12) + in45;
+    int t46a = ((in29 *  2675         - in35 * (3102 - 4096) + 2048) >> 12) - in35;
+    int t47a = ((in61 * (4085 - 4096) - in3  *   301         + 2048) >> 12) + in61;
+    int t48a = ((in61 *   301         + in3  * (4085 - 4096) + 2048) >> 12) + in3;
+    int t49a = ((in29 * (3102 - 4096) + in35 *  2675         + 2048) >> 12) + in29;
+    int t50a = ((in45 *  1842         + in19 * (3659 - 4096) + 2048) >> 12) + in19;
+    int t51a = ((in13 * (3889 - 4096) + in51 *  1285         + 2048) >> 12) + in13;
+    int t52a =  (in53 *   546         + in11 *  1974         + 1024) >> 11;
+    int t53a = ((in21 * (3564 - 4096) + in43 *  2019         + 2048) >> 12) + in21;
+    int t54a = ((in37 *  2520         + in27 * (3229 - 4096) + 2048) >> 12) + in27;
+    int t55a = ((in5  * (4065 - 4096) + in59 *   501         + 2048) >> 12) + in5;
+    int t56a =  (in57 *   350         + in7  *  2018         + 1024) >> 11;
+    int t57a = ((in25 * (3349 - 4096) + in39 *  2359         + 2048) >> 12) + in25;
+    int t58a = ((in41 *  2191         + in23 * (3461 - 4096) + 2048) >> 12) + in23;
+    int t59a = ((in9  * (3996 - 4096) + in55 *   897         + 2048) >> 12) + in9;
+    int t60a =  (in49 *   737         + in15 *  1911         + 1024) >> 11;
+    int t61a = ((in17 * (3745 - 4096) + in47 *  1660         + 2048) >> 12) + in17;
+    int t62a = ((in33 *  2824         + in31 * (2967 - 4096) + 2048) >> 12) + in31;
+    int t63a = ((in1  * (4095 - 4096) + in63 *   101         + 2048) >> 12) + in1;
 
     int t32 = CLIP(t32a + t33a);
     int t33 = CLIP(t32a - t33a);
@@ -396,22 +422,22 @@
     int t62 = CLIP(t63a - t62a);
     int t63 = CLIP(t63a + t62a);
 
-    t33a = (t33 * -4076 + t62 *   401 + 2048) >> 12;
-    t34a = (t34 * - 401 + t61 * -4076 + 2048) >> 12;
-    t37a = (t37 * -2598 + t58 *  3166 + 2048) >> 12;
-    t38a = (t38 * -3166 + t57 * -2598 + 2048) >> 12;
-    t41a = (t41 * -3612 + t54 *  1931 + 2048) >> 12;
-    t42a = (t42 * -1931 + t53 * -3612 + 2048) >> 12;
-    t45a = (t45 * -1189 + t50 *  3920 + 2048) >> 12;
-    t46a = (t46 * -3920 + t49 * -1189 + 2048) >> 12;
-    t49a = (t46 * -1189 + t49 *  3920 + 2048) >> 12;
-    t50a = (t45 *  3920 + t50 *  1189 + 2048) >> 12;
-    t53a = (t42 * -3612 + t53 *  1931 + 2048) >> 12;
-    t54a = (t41 *  1931 + t54 *  3612 + 2048) >> 12;
-    t57a = (t38 * -2598 + t57 *  3166 + 2048) >> 12;
-    t58a = (t37 *  3166 + t58 *  2598 + 2048) >> 12;
-    t61a = (t34 * -4076 + t61 *   401 + 2048) >> 12;
-    t62a = (t33 *   401 + t62 *  4076 + 2048) >> 12;
+    t33a = ((t33 * (4096 - 4076) + t62 *   401         + 2048) >> 12) - t33;
+    t34a = ((t34 *  -401         + t61 * (4096 - 4076) + 2048) >> 12) - t61;
+    t37a =  (t37 * -1299         + t58 *  1583         + 1024) >> 11;
+    t38a =  (t38 * -1583         + t57 * -1299         + 1024) >> 11;
+    t41a = ((t41 * (4096 - 3612) + t54 *  1931         + 2048) >> 12) - t41;
+    t42a = ((t42 * -1931         + t53 * (4096 - 3612) + 2048) >> 12) - t53;
+    t45a = ((t45 * -1189         + t50 * (3920 - 4096) + 2048) >> 12) + t50;
+    t46a = ((t46 * (4096 - 3920) + t49 * -1189         + 2048) >> 12) - t46;
+    t49a = ((t46 * -1189         + t49 * (3920 - 4096) + 2048) >> 12) + t49;
+    t50a = ((t45 * (3920 - 4096) + t50 *  1189         + 2048) >> 12) + t45;
+    t53a = ((t42 * (4096 - 3612) + t53 *  1931         + 2048) >> 12) - t42;
+    t54a = ((t41 *  1931         + t54 * (3612 - 4096) + 2048) >> 12) + t54;
+    t57a =  (t38 * -1299         + t57 *  1583         + 1024) >> 11;
+    t58a =  (t37 *  1583         + t58 *  1299         + 1024) >> 11;
+    t61a = ((t34 * (4096 - 4076) + t61 *   401         + 2048) >> 12) - t34;
+    t62a = ((t33 *   401         + t62 * (4076 - 4096) + 2048) >> 12) + t62;
 
     t32a = CLIP(t32  + t35);
     t33  = CLIP(t33a + t34a);
@@ -446,22 +472,22 @@
     t62  = CLIP(t62a + t61a);
     t63a = CLIP(t63  + t60);
 
-    t34a = (t34  * -4017 + t61  *   799 + 2048) >> 12;
-    t35  = (t35a * -4017 + t60a *   799 + 2048) >> 12;
-    t36  = (t36a * - 799 + t59a * -4017 + 2048) >> 12;
-    t37a = (t37  * - 799 + t58  * -4017 + 2048) >> 12;
-    t42a = (t42  * -2276 + t53  *  3406 + 2048) >> 12;
-    t43  = (t43a * -2276 + t52a *  3406 + 2048) >> 12;
-    t44  = (t44a * -3406 + t51a * -2276 + 2048) >> 12;
-    t45a = (t45  * -3406 + t50  * -2276 + 2048) >> 12;
-    t50a = (t45  * -2276 + t50  *  3406 + 2048) >> 12;
-    t51  = (t44a * -2276 + t51a *  3406 + 2048) >> 12;
-    t52  = (t43a *  3406 + t52a *  2276 + 2048) >> 12;
-    t53a = (t42  *  3406 + t53  *  2276 + 2048) >> 12;
-    t58a = (t37  * -4017 + t58  *   799 + 2048) >> 12;
-    t59  = (t36a * -4017 + t59a *   799 + 2048) >> 12;
-    t60  = (t35a *   799 + t60a *  4017 + 2048) >> 12;
-    t61a = (t34  *   799 + t61  *  4017 + 2048) >> 12;
+    t34a = ((t34  * (4096 - 4017) + t61  *   799         + 2048) >> 12) - t34;
+    t35  = ((t35a * (4096 - 4017) + t60a *   799         + 2048) >> 12) - t35a;
+    t36  = ((t36a *  -799         + t59a * (4096 - 4017) + 2048) >> 12) - t59a;
+    t37a = ((t37  *  -799         + t58  * (4096 - 4017) + 2048) >> 12) - t58;
+    t42a =  (t42  * -1138         + t53  *  1703         + 1024) >> 11;
+    t43  =  (t43a * -1138         + t52a *  1703         + 1024) >> 11;
+    t44  =  (t44a * -1703         + t51a * -1138         + 1024) >> 11;
+    t45a =  (t45  * -1703         + t50  * -1138         + 1024) >> 11;
+    t50a =  (t45  * -1138         + t50  *  1703         + 1024) >> 11;
+    t51  =  (t44a * -1138         + t51a *  1703         + 1024) >> 11;
+    t52  =  (t43a *  1703         + t52a *  1138         + 1024) >> 11;
+    t53a =  (t42  *  1703         + t53  *  1138         + 1024) >> 11;
+    t58a = ((t37  * (4096 - 4017) + t58  *   799         + 2048) >> 12) - t37;
+    t59  = ((t36a * (4096 - 4017) + t59a *   799         + 2048) >> 12) - t36a;
+    t60  = ((t35a *   799         + t60a * (4017 - 4096) + 2048) >> 12) + t60a;
+    t61a = ((t34  *   799         + t61  * (4017 - 4096) + 2048) >> 12) + t61;
 
     t32  = CLIP(t32a + t39a);
     t33a = CLIP(t33  + t38);
@@ -496,22 +522,22 @@
     t62a = CLIP(t62  + t57);
     t63  = CLIP(t63a + t56a);
 
-    t36  = (t36a * -3784 + t59a *  1567 + 2048) >> 12;
-    t37a = (t37  * -3784 + t58  *  1567 + 2048) >> 12;
-    t38  = (t38a * -3784 + t57a *  1567 + 2048) >> 12;
-    t39a = (t39  * -3784 + t56  *  1567 + 2048) >> 12;
-    t40a = (t40  * -1567 + t55  * -3784 + 2048) >> 12;
-    t41  = (t41a * -1567 + t54a * -3784 + 2048) >> 12;
-    t42a = (t42  * -1567 + t53  * -3784 + 2048) >> 12;
-    t43  = (t43a * -1567 + t52a * -3784 + 2048) >> 12;
-    t52  = (t43a * -3784 + t52a *  1567 + 2048) >> 12;
-    t53a = (t42  * -3784 + t53  *  1567 + 2048) >> 12;
-    t54  = (t41a * -3784 + t54a *  1567 + 2048) >> 12;
-    t55a = (t40  * -3784 + t55  *  1567 + 2048) >> 12;
-    t56a = (t39  *  1567 + t56  *  3784 + 2048) >> 12;
-    t57  = (t38a *  1567 + t57a *  3784 + 2048) >> 12;
-    t58a = (t37  *  1567 + t58  *  3784 + 2048) >> 12;
-    t59  = (t36a *  1567 + t59a *  3784 + 2048) >> 12;
+    t36  = ((t36a * (4096 - 3784) + t59a *  1567         + 2048) >> 12) - t36a;
+    t37a = ((t37  * (4096 - 3784) + t58  *  1567         + 2048) >> 12) - t37;
+    t38  = ((t38a * (4096 - 3784) + t57a *  1567         + 2048) >> 12) - t38a;
+    t39a = ((t39  * (4096 - 3784) + t56  *  1567         + 2048) >> 12) - t39;
+    t40a = ((t40  * -1567         + t55  * (4096 - 3784) + 2048) >> 12) - t55;
+    t41  = ((t41a * -1567         + t54a * (4096 - 3784) + 2048) >> 12) - t54a;
+    t42a = ((t42  * -1567         + t53  * (4096 - 3784) + 2048) >> 12) - t53;
+    t43  = ((t43a * -1567         + t52a * (4096 - 3784) + 2048) >> 12) - t52a;
+    t52  = ((t43a * (4096 - 3784) + t52a *  1567         + 2048) >> 12) - t43a;
+    t53a = ((t42  * (4096 - 3784) + t53  *  1567         + 2048) >> 12) - t42;
+    t54  = ((t41a * (4096 - 3784) + t54a *  1567         + 2048) >> 12) - t41a;
+    t55a = ((t40  * (4096 - 3784) + t55  *  1567         + 2048) >> 12) - t40;
+    t56a = ((t39  *  1567         + t56  * (3784 - 4096) + 2048) >> 12) + t56;
+    t57  = ((t38a *  1567         + t57a * (3784 - 4096) + 2048) >> 12) + t57a;
+    t58a = ((t37  *  1567         + t58  * (3784 - 4096) + 2048) >> 12) + t58;
+    t59  = ((t36a *  1567         + t59a * (3784 - 4096) + 2048) >> 12) + t59a;
 
     t32a = CLIP(t32  + t47);
     t33  = CLIP(t33a + t46a);
@@ -546,22 +572,22 @@
     t62  = CLIP(t62a + t49a);
     t63a = CLIP(t63  + t48);
 
-    t40a = (t40  * -181 + t55  * 181 + 128) >> 8;
-    t41  = (t41a * -181 + t54a * 181 + 128) >> 8;
-    t42a = (t42  * -181 + t53  * 181 + 128) >> 8;
-    t43  = (t43a * -181 + t52a * 181 + 128) >> 8;
-    t44a = (t44  * -181 + t51  * 181 + 128) >> 8;
-    t45  = (t45a * -181 + t50a * 181 + 128) >> 8;
-    t46a = (t46  * -181 + t49  * 181 + 128) >> 8;
-    t47  = (t47a * -181 + t48a * 181 + 128) >> 8;
-    t48  = (t47a *  181 + t48a * 181 + 128) >> 8;
-    t49a = (t46  *  181 + t49  * 181 + 128) >> 8;
-    t50  = (t45a *  181 + t50a * 181 + 128) >> 8;
-    t51a = (t44  *  181 + t51  * 181 + 128) >> 8;
-    t52  = (t43a *  181 + t52a * 181 + 128) >> 8;
-    t53a = (t42  *  181 + t53  * 181 + 128) >> 8;
-    t54  = (t41a *  181 + t54a * 181 + 128) >> 8;
-    t55a = (t40  *  181 + t55  * 181 + 128) >> 8;
+    t40a = ((t55  - t40 ) * 181 + 128) >> 8;
+    t41  = ((t54a - t41a) * 181 + 128) >> 8;
+    t42a = ((t53  - t42 ) * 181 + 128) >> 8;
+    t43  = ((t52a - t43a) * 181 + 128) >> 8;
+    t44a = ((t51  - t44 ) * 181 + 128) >> 8;
+    t45  = ((t50a - t45a) * 181 + 128) >> 8;
+    t46a = ((t49  - t46 ) * 181 + 128) >> 8;
+    t47  = ((t48a - t47a) * 181 + 128) >> 8;
+    t48  = ((t47a + t48a) * 181 + 128) >> 8;
+    t49a = ((t46  + t49 ) * 181 + 128) >> 8;
+    t50  = ((t45a + t50a) * 181 + 128) >> 8;
+    t51a = ((t44  + t51 ) * 181 + 128) >> 8;
+    t52  = ((t43a + t52a) * 181 + 128) >> 8;
+    t53a = ((t42  + t53 ) * 181 + 128) >> 8;
+    t54  = ((t41a + t54a) * 181 + 128) >> 8;
+    t55a = ((t40  + t55 ) * 181 + 128) >> 8;
 
     out[ 0 * out_s] = CLIP(tmp[ 0] + t63a);
     out[ 1 * out_s] = CLIP(tmp[ 1] + t62);
@@ -636,15 +662,16 @@
     const int in0 = in[0 * in_s], in1 = in[1 * in_s];
     const int in2 = in[2 * in_s], in3 = in[3 * in_s];
 
-    int t0 = 1321 * in0 + 3803 * in2 + 2482 * in3;
-    int t1 = 2482 * in0 - 1321 * in2 - 3803 * in3;
-    int t2 = 3344 * (in0 - in2 + in3);
-    int t3 = 3344 * in1;
-
-    out[0 * out_s] = (t0 + t3      + 2048) >> 12;
-    out[1 * out_s] = (t1 + t3      + 2048) >> 12;
-    out[2 * out_s] = (t2           + 2048) >> 12;
-    out[3 * out_s] = (t0 + t1 - t3 + 2048) >> 12;
+    out[0 * out_s] = (( 1321         * in0 + (3803 - 4096) * in2 +
+                       (2482 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) +
+                     in2 + in3 + in1;
+    out[1 * out_s] = (((2482 - 4096) * in0 -  1321         * in2 -
+                       (3803 - 4096) * in3 + (3344 - 4096) * in1 + 2048) >> 12) +
+                     in0 - in3 + in1;
+    out[2 * out_s] = (209 * (in0 - in2 + in3) + 128) >> 8;
+    out[3 * out_s] = (((3803 - 4096) * in0 + (2482 - 4096) * in2 -
+                        1321         * in3 - (3344 - 4096) * in1 + 2048) >> 12) +
+                     in0 + in2 - in1;
 }
 
 static void NOINLINE
@@ -657,14 +684,14 @@
     const int in4 = in[4 * in_s], in5 = in[5 * in_s];
     const int in6 = in[6 * in_s], in7 = in[7 * in_s];
 
-    int t0a = (4076 * in7 +  401 * in0 + 2048) >> 12;
-    int t1a = ( 401 * in7 - 4076 * in0 + 2048) >> 12;
-    int t2a = (3612 * in5 + 1931 * in2 + 2048) >> 12;
-    int t3a = (1931 * in5 - 3612 * in2 + 2048) >> 12;
-    int t4a = (2598 * in3 + 3166 * in4 + 2048) >> 12;
-    int t5a = (3166 * in3 - 2598 * in4 + 2048) >> 12;
-    int t6a = (1189 * in1 + 3920 * in6 + 2048) >> 12;
-    int t7a = (3920 * in1 - 1189 * in6 + 2048) >> 12;
+    int t0a = (((4076 - 4096) * in7 +   401         * in0 + 2048) >> 12) + in7;
+    int t1a = ((  401         * in7 - (4076 - 4096) * in0 + 2048) >> 12) - in0;
+    int t2a = (((3612 - 4096) * in5 +  1931         * in2 + 2048) >> 12) + in5;
+    int t3a = (( 1931         * in5 - (3612 - 4096) * in2 + 2048) >> 12) - in2;
+    int t4a =  ( 1299         * in3 +  1583         * in4 + 1024) >> 11;
+    int t5a =  ( 1583         * in3 -  1299         * in4 + 1024) >> 11;
+    int t6a = (( 1189         * in1 + (3920 - 4096) * in6 + 2048) >> 12) + in6;
+    int t7a = (((3920 - 4096) * in1 -  1189         * in6 + 2048) >> 12) + in1;
 
     int t0 = CLIP(t0a + t4a);
     int t1 = CLIP(t1a + t5a);
@@ -675,10 +702,10 @@
     int t6 = CLIP(t2a - t6a);
     int t7 = CLIP(t3a - t7a);
 
-    t4a = (3784 * t4 + 1567 * t5 + 2048) >> 12;
-    t5a = (1567 * t4 - 3784 * t5 + 2048) >> 12;
-    t6a = (3784 * t7 - 1567 * t6 + 2048) >> 12;
-    t7a = (1567 * t7 + 3784 * t6 + 2048) >> 12;
+    t4a = (((3784 - 4096) * t4 +  1567         * t5 + 2048) >> 12) + t4;
+    t5a = (( 1567         * t4 - (3784 - 4096) * t5 + 2048) >> 12) - t5;
+    t6a = (((3784 - 4096) * t7 -  1567         * t6 + 2048) >> 12) + t7;
+    t7a = (( 1567         * t7 + (3784 - 4096) * t6 + 2048) >> 12) + t6;
 
     out[0 * out_s] = CLIP(  t0 + t2);
     out[7 * out_s] = CLIP(-(t1 + t3));
@@ -710,22 +737,22 @@
     const int in12 = in[12 * in_s], in13 = in[13 * in_s];
     const int in14 = in[14 * in_s], in15 = in[15 * in_s];
 
-    int t0  = (in15 * 4091 + in0  *  201 + 2048) >> 12;
-    int t1  = (in15 *  201 - in0  * 4091 + 2048) >> 12;
-    int t2  = (in13 * 3973 + in2  *  995 + 2048) >> 12;
-    int t3  = (in13 *  995 - in2  * 3973 + 2048) >> 12;
-    int t4  = (in11 * 3703 + in4  * 1751 + 2048) >> 12;
-    int t5  = (in11 * 1751 - in4  * 3703 + 2048) >> 12;
-    int t6  = (in9  * 3290 + in6  * 2440 + 2048) >> 12;
-    int t7  = (in9  * 2440 - in6  * 3290 + 2048) >> 12;
-    int t8  = (in7  * 2751 + in8  * 3035 + 2048) >> 12;
-    int t9  = (in7  * 3035 - in8  * 2751 + 2048) >> 12;
-    int t10 = (in5  * 2106 + in10 * 3513 + 2048) >> 12;
-    int t11 = (in5  * 3513 - in10 * 2106 + 2048) >> 12;
-    int t12 = (in3  * 1380 + in12 * 3857 + 2048) >> 12;
-    int t13 = (in3  * 3857 - in12 * 1380 + 2048) >> 12;
-    int t14 = (in1  *  601 + in14 * 4052 + 2048) >> 12;
-    int t15 = (in1  * 4052 - in14 *  601 + 2048) >> 12;
+    int t0  = ((in15 * (4091 - 4096) + in0  *   201         + 2048) >> 12) + in15;
+    int t1  = ((in15 *   201         - in0  * (4091 - 4096) + 2048) >> 12) - in0;
+    int t2  = ((in13 * (3973 - 4096) + in2  *   995         + 2048) >> 12) + in13;
+    int t3  = ((in13 *   995         - in2  * (3973 - 4096) + 2048) >> 12) - in2;
+    int t4  = ((in11 * (3703 - 4096) + in4  *  1751         + 2048) >> 12) + in11;
+    int t5  = ((in11 *  1751         - in4  * (3703 - 4096) + 2048) >> 12) - in4;
+    int t6  =  (in9  *  1645         + in6  *  1220         + 1024) >> 11;
+    int t7  =  (in9  *  1220         - in6  *  1645         + 1024) >> 11;
+    int t8  = ((in7  *  2751         + in8  * (3035 - 4096) + 2048) >> 12) + in8;
+    int t9  = ((in7  * (3035 - 4096) - in8  *  2751         + 2048) >> 12) + in7;
+    int t10 = ((in5  *  2106         + in10 * (3513 - 4096) + 2048) >> 12) + in10;
+    int t11 = ((in5  * (3513 - 4096) - in10 *  2106         + 2048) >> 12) + in5;
+    int t12 = ((in3  *  1380         + in12 * (3857 - 4096) + 2048) >> 12) + in12;
+    int t13 = ((in3  * (3857 - 4096) - in12 *  1380         + 2048) >> 12) + in3;
+    int t14 = ((in1  *   601         + in14 * (4052 - 4096) + 2048) >> 12) + in14;
+    int t15 = ((in1  * (4052 - 4096) - in14 *   601         + 2048) >> 12) + in1;
 
     int t0a  = CLIP(t0 + t8 );
     int t1a  = CLIP(t1 + t9 );
@@ -744,14 +771,14 @@
     int t14a = CLIP(t6 - t14);
     int t15a = CLIP(t7 - t15);
 
-    t8   = (t8a  * 4017 + t9a  *  799 + 2048) >> 12;
-    t9   = (t8a  *  799 - t9a  * 4017 + 2048) >> 12;
-    t10  = (t10a * 2276 + t11a * 3406 + 2048) >> 12;
-    t11  = (t10a * 3406 - t11a * 2276 + 2048) >> 12;
-    t12  = (t13a * 4017 - t12a *  799 + 2048) >> 12;
-    t13  = (t13a *  799 + t12a * 4017 + 2048) >> 12;
-    t14  = (t15a * 2276 - t14a * 3406 + 2048) >> 12;
-    t15  = (t15a * 3406 + t14a * 2276 + 2048) >> 12;
+    t8   = ((t8a  * (4017 - 4096) + t9a  *   799         + 2048) >> 12) + t8a;
+    t9   = ((t8a  *   799         - t9a  * (4017 - 4096) + 2048) >> 12) - t9a;
+    t10  = ((t10a *  2276         + t11a * (3406 - 4096) + 2048) >> 12) + t11a;
+    t11  = ((t10a * (3406 - 4096) - t11a *  2276         + 2048) >> 12) + t10a;
+    t12  = ((t13a * (4017 - 4096) - t12a *   799         + 2048) >> 12) + t13a;
+    t13  = ((t13a *   799         + t12a * (4017 - 4096) + 2048) >> 12) + t12a;
+    t14  = ((t15a *  2276         - t14a * (3406 - 4096) + 2048) >> 12) - t14a;
+    t15  = ((t15a * (3406 - 4096) + t14a *  2276         + 2048) >> 12) + t15a;
 
     t0   = CLIP(t0a + t4a);
     t1   = CLIP(t1a + t5a);
@@ -770,14 +797,14 @@
     t14a = CLIP(t10 - t14);
     t15a = CLIP(t11 - t15);
 
-    t4a  = (t4   * 3784 + t5   * 1567 + 2048) >> 12;
-    t5a  = (t4   * 1567 - t5   * 3784 + 2048) >> 12;
-    t6a  = (t7   * 3784 - t6   * 1567 + 2048) >> 12;
-    t7a  = (t7   * 1567 + t6   * 3784 + 2048) >> 12;
-    t12  = (t12a * 3784 + t13a * 1567 + 2048) >> 12;
-    t13  = (t12a * 1567 - t13a * 3784 + 2048) >> 12;
-    t14  = (t15a * 3784 - t14a * 1567 + 2048) >> 12;
-    t15  = (t15a * 1567 + t14a * 3784 + 2048) >> 12;
+    t4a  = ((t4   * (3784 - 4096) + t5   *  1567         + 2048) >> 12) + t4;
+    t5a  = ((t4   *  1567         - t5   * (3784 - 4096) + 2048) >> 12) - t5;
+    t6a  = ((t7   * (3784 - 4096) - t6   *  1567         + 2048) >> 12) + t7;
+    t7a  = ((t7   *  1567         + t6   * (3784 - 4096) + 2048) >> 12) + t6;
+    t12  = ((t12a * (3784 - 4096) + t13a *  1567         + 2048) >> 12) + t12a;
+    t13  = ((t12a *  1567         - t13a * (3784 - 4096) + 2048) >> 12) - t13a;
+    t14  = ((t15a * (3784 - 4096) - t14a *  1567         + 2048) >> 12) + t15a;
+    t15  = ((t15a *  1567         + t14a * (3784 - 4096) + 2048) >> 12) + t14a;
 
     out[ 0 * out_s] = CLIP(  t0  + t2   );
     out[15 * out_s] = CLIP(-(t1  + t3)  );