shithub: dav1d

Download patch

ref: d4dfa85cf1cdcc08820f864f00b0eeb0492a8855
parent: de561b3ba1598e269b00847406181158b1f91d1f
author: Henrik Gramner <[email protected]>
date: Tue Oct 1 14:13:46 EDT 2019

x86: Increase precision of SSSE3 IDCT intermediates

--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -202,7 +202,7 @@
     ret
 %endmacro
 
-; flags: 1 = swap, 2: coef_regs
+; flags: 1 = swap, 2: coef_regs, 4: no_pack
 %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
 %if %6 & 2
     pmaddwd              m%2, m%4, m%1
@@ -218,24 +218,17 @@
     paddd                m%1, m%3
     psrad                m%2, 12
     psrad                m%1, 12
+%if %6 & 4 == 0
     packssdw             m%1, m%2
+%endif
 %endmacro
 
 %macro IDCT4_1D_PACKED 0-1   ;pw_2896x8
-    punpckhwd            m2, m0, m1            ;unpacked in1 in3
-    psubw                m3, m0, m1
-    paddw                m0, m1
-    punpcklqdq           m0, m3                ;high: in0-in2 ;low: in0+in2
-
     mova                 m3, [o(pd_2048)]
+    punpckhwd            m2, m0, m1            ;unpacked in1 in3
+    punpcklwd            m0, m1                ;unpacked in0 in2
     ITX_MUL2X_PACK        2, 1, 3, 1567, 3784
-
-%if %0 == 1
-    pmulhrsw             m0, m%1
-%else
-    pmulhrsw             m0, [o(pw_2896x8)]    ;high: t1 ;low: t0
-%endif
-
+    ITX_MUL2X_PACK        0, 1, 3, 2896, 2896
     psubsw               m1, m0, m2            ;high: out2 ;low: out3
     paddsw               m0, m2                ;high: out1 ;low: out0
 %endmacro
@@ -499,79 +492,81 @@
 
 %macro IDCT8_1D_PACKED 0
     mova                 m6, [o(pd_2048)]
-    punpckhwd            m5, m0, m3                 ;unpacked in1 in7
-    punpckhwd            m4, m2, m1                 ;unpacked in5 in3
+    punpckhwd            m4, m0, m3                 ;unpacked in1 in7
+    punpcklwd            m0, m2                     ;unpacked in0 in4
+    punpckhwd            m2, m1                     ;unpacked in5 in3
     punpcklwd            m1, m3                     ;unpacked in2 in6
-    psubw                m3, m0, m2
-    paddw                m0, m2
-    punpcklqdq           m0, m3                     ;low: in0+in4 high: in0-in4
-    ITX_MUL2X_PACK        5, 2, 6,  799, 4017, 1    ;low: t4a high: t7a
-    ITX_MUL2X_PACK        4, 2, 6, 3406, 2276, 1    ;low: t5a high: t6a
-    ITX_MUL2X_PACK        1, 2, 6, 1567, 3784       ;low: t3  high: t2
-    mova                 m6, [o(pw_2896x8)]
-    psubsw               m2, m5, m4                 ;low: t5a high: t6a
-    paddsw               m5, m4                     ;low: t4  high: t7
-    punpckhqdq           m4, m2, m2                 ;low: t6a high: t6a
-    psubw                m3, m4, m2                 ;low: t6a - t5a
-    paddw                m4, m2                     ;low: t6a + t5a
-    punpcklqdq           m4, m3                     ;low: t6a + t5a high: t6a - t5a
-    pmulhrsw             m0, m6                     ;low: t0   high: t1
-    pmulhrsw             m4, m6                     ;low: t6   high: t5
-    shufps               m2, m5, m4, q1032          ;low: t7   high: t6
-    shufps               m5, m4, q3210              ;low: t4   high: t5
-    psubsw               m4, m0, m1                 ;low: tmp3 high: tmp2
+    ITX_MUL2X_PACK        4, 3, 6,  799, 4017       ;low: t7a high: t4a
+    ITX_MUL2X_PACK        2, 3, 6, 3406, 2276       ;low: t6a high: t5a
+    ITX_MUL2X_PACK        1, 3, 6, 1567, 3784       ;low: t3  high: t2
+    psubsw               m3, m4, m2                 ;low: t6a high: t5a
+    paddsw               m4, m2                     ;low: t7  high: t4
+    pshufb               m3, [o(deint_shuf1)]
+    ITX_MUL2X_PACK        0, 2, 6, 2896, 2896       ;low: t0  high: t1
+    ITX_MUL2X_PACK        3, 2, 6, 2896, 2896       ;low: t6  high: t5
+    psubsw               m2, m0, m1                 ;low: tmp3 high: tmp2
     paddsw               m0, m1                     ;low: tmp0 high: tmp1
-    psubsw               m3, m0, m2                 ;low: out7 high: out6
-    paddsw               m0, m2                     ;low: out0 high: out1
-    psubsw               m2, m4, m5                 ;low: out4 high: out5
-    paddsw               m1, m4, m5                 ;low: out3 high: out2
+    punpcklqdq           m1, m4, m3                 ;low: t7   high: t6
+    punpckhqdq           m4, m3                     ;low: t4   high: t5
+    psubsw               m3, m0, m1                 ;low: out7 high: out6
+    paddsw               m0, m1                     ;low: out0 high: out1
+    paddsw               m1, m2, m4                 ;low: out3 high: out2
+    psubsw               m2, m4                     ;low: out4 high: out5
 %endmacro
 
 ;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
 ;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
-%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
-    punpckhwd           m%3, m%1, m%2
+%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
+    punpckhwd           m%4, m%1, m%2
     punpcklwd           m%1, m%2
 %if %7 < 8
     pmaddwd             m%2, m%7, m%1
-    pmaddwd             m%4, m%7, m%3
+    pmaddwd             m%3, m%7, m%4
 %else
     mova                m%2, [o(pw_%7_%6)]
-    pmaddwd             m%4, m%3, m%2
+%if %8
+    pmaddwd             m%3, m%1, m%2
+    pmaddwd             m%2, m%4
+%else
+    pmaddwd             m%3, m%4, m%2
     pmaddwd             m%2, m%1
 %endif
-    paddd               m%4, m%5
+%endif
+    paddd               m%3, m%5
     paddd               m%2, m%5
-    psrad               m%4, 12
+    psrad               m%3, 12
     psrad               m%2, 12
-    packssdw            m%2, m%4                 ;dst2
+%if %8
+    packssdw            m%3, m%2
+%else
+    packssdw            m%2, m%3                 ;dst2
+%endif
 %if %7 < 8
-    pmaddwd             m%3, m%6
+    pmaddwd             m%4, m%6
     pmaddwd             m%1, m%6
+%elif %8
+    mova                m%2, [o(pw_%6_m%7)]
+    pmaddwd             m%4, m%2
+    pmaddwd             m%1, m%2
 %else
-    mova                m%4, [o(pw_%6_m%7)]
-    pmaddwd             m%3, m%4
-    pmaddwd             m%1, m%4
+    mova                m%3, [o(pw_%6_m%7)]
+    pmaddwd             m%4, m%3
+    pmaddwd             m%1, m%3
 %endif
-    paddd               m%3, m%5
+    paddd               m%4, m%5
     paddd               m%1, m%5
-    psrad               m%3, 12
+    psrad               m%4, 12
     psrad               m%1, 12
-    packssdw            m%1, m%3                 ;dst1
+    packssdw            m%1, m%4                 ;dst1
 %endmacro
 
 %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
-    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784   ;t2, t3
-    mova                m%6, [o(pw_2896x8)]
-    paddw               m%5, m%1, m%3
-    psubw               m%1, m%3
-    pmulhrsw            m%1, m%6                          ;t1
-    pmulhrsw            m%5, m%6                          ;t0
-    psubsw              m%3, m%1, m%2                     ;out2
-    paddsw              m%2, m%1                          ;out1
-    paddsw              m%1, m%5, m%4                     ;out0
-    psubsw              m%5, m%4                          ;out3
-    mova                m%4, m%5
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
+    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
+    psubsw              m%3, m%1, m%2                      ;out2
+    paddsw              m%2, m%1                           ;out1
+    paddsw              m%1, m%5, m%4                      ;out0
+    psubsw              m%4, m%5                           ;out3
 %endmacro
 
 %macro WRITE_4X8 4 ;row[1-4]
@@ -1286,17 +1281,13 @@
 %endmacro
 
 %macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
-    ITX_MULSUB_2W        %1, %4, %5, %6, %7,  799, 4017   ;t4a, t7a
-    ITX_MULSUB_2W        %3, %2, %5, %6, %7, 3406, 2276   ;t5a, t6a
-    psubsw               m%5, m%1, m%3                    ;t5a
-    paddsw               m%1, m%3                         ;t4
-    psubsw               m%6, m%4, m%2                    ;t6a
-    paddsw               m%4, m%2                         ;t7
-    mova                 m%3, [o(pw_2896x8)]
-    psubw                m%2, m%6, m%5                    ;t6a - t5a
-    paddw                m%6, m%5                         ;t6a + t5a
-    pmulhrsw             m%2, m%3                         ;t5
-    pmulhrsw             m%3, m%6                         ;t6
+    ITX_MULSUB_2W         %1, %4, %5, %6, %7,  799, 4017    ;t4a, t7a
+    ITX_MULSUB_2W         %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
+    psubsw               m%2, m%4, m%5                      ;t6a
+    paddsw               m%4, m%5                           ;t7
+    psubsw               m%5, m%1, m%3                      ;t5a
+    paddsw               m%1, m%3                           ;t4
+    ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
 %endmacro
 
 INV_TXFM_8X8_FN dct, dct,      0
@@ -2063,37 +2054,34 @@
 %macro IDCT16_1D_PACKED_ODDHALF 7  ;src[1-4], tmp[1-3]
     punpckhwd            m%5, m%4, m%1                ;packed in13 in3
     punpcklwd            m%1, m%4                     ;packed in1  in15
-    punpcklwd            m%6, m%3, m%2                ;packed in9  in7
+    punpcklwd            m%4, m%3, m%2                ;packed in9  in7
     punpckhwd            m%2, m%3                     ;packed in5  in11
-
     mova                 m%7, [o(pd_2048)]
-    ITX_MUL2X_PACK        %1, %4, %7,  401, 4076, 1    ;low: t8a   high: t15a
-    ITX_MUL2X_PACK        %6, %4, %7, 3166, 2598, 1    ;low: t9a   high: t14a
-    ITX_MUL2X_PACK        %2, %4, %7, 1931, 3612, 1    ;low: t10a  high: t13a
-    ITX_MUL2X_PACK        %5, %4, %7, 3920, 1189, 1    ;low: t11a  high: t12a
-    psubsw               m%4, m%1, m%6                 ;low: t9    high: t14
-    paddsw               m%1, m%6                      ;low: t8    high: t15
+    ITX_MUL2X_PACK        %1, %6, %7,  401, 4076, 1    ;low: t8a   high: t15a
+    ITX_MUL2X_PACK        %4, %6, %7, 3166, 2598, 1    ;low: t9a   high: t14a
+    ITX_MUL2X_PACK        %2, %6, %7, 1931, 3612, 1    ;low: t10a  high: t13a
+    ITX_MUL2X_PACK        %5, %6, %7, 3920, 1189, 1    ;low: t11a  high: t12a
+    psubsw               m%6, m%1, m%4                 ;low: t9    high: t14
+    paddsw               m%1, m%4                      ;low: t8    high: t15
     psubsw               m%3, m%5, m%2                 ;low: t10   high: t13
-    paddsw               m%2, m%5                      ;low: t11   high: t12
-    punpcklqdq           m%5, m%4, m%3                 ;low: t9    high: t10
-    punpckhqdq           m%4, m%3                      ;low: t14   high: t13
-    punpcklwd            m%6, m%4, m%5                 ;packed t14 t9
-    punpckhwd            m%5, m%4                      ;packed t10 t13
+    paddsw               m%5, m%2                      ;low: t11   high: t12
+    mova                 m%2, [o(deint_shuf2)]
+    pshufb               m%6, m%2
+    pshufb               m%3, [o(deint_shuf1)]
     pxor                 m%4, m%4
-    psubw                m%4, m%5                      ;packed -t10 -t13
+    psubw                m%4, m%3                      ;packed -t10 -t13
     ITX_MUL2X_PACK        %6, %3, %7, 1567, 3784, 1    ;low: t9a   high: t14a
     ITX_MUL2X_PACK        %4, %3, %7, 3784, 1567       ;low: t10a  high: t13a
-    psubsw               m%3, m%1, m%2                 ;low: t11a  high: t12a
-    paddsw               m%1, m%2                      ;low: t8a   high: t15a
+    psubsw               m%3, m%1, m%5                 ;low: t11a  high: t12a
+    paddsw               m%1, m%5                      ;low: t8a   high: t15a
     psubsw               m%5, m%6, m%4                 ;low: t10   high: t13
     paddsw               m%6, m%4                      ;low: t9    high: t14
-    mova                 m%7, [o(pw_2896x8)]
-    punpckhqdq           m%4, m%3, m%5                 ;low: t12a  high: t13
-    punpcklqdq           m%3, m%5                      ;low: t11a  high: t10
-    psubw                m%2, m%4, m%3
-    paddw                m%3, m%4
-    pmulhrsw             m%2, m%7                      ;low: t11   high: t10a
-    pmulhrsw             m%3, m%7                      ;low: t12   high: t13a
+    pshufb               m%3, m%2
+    pshufb               m%5, m%2
+    ITX_MUL2X_PACK        %3, %2, %7, 2896, 2896, 4    ;t12,  t11
+    ITX_MUL2X_PACK        %5, %4, %7, 2896, 2896, 4    ;t13a, t10a
+    packssdw             m%2, m%4                      ;low: t11   high: t10a
+    packssdw             m%3, m%5                      ;low: t12   high: t13a
     punpckhqdq           m%4, m%1, m%6                 ;low: t15a  high: t14
     punpcklqdq           m%1, m%6                      ;low: t8a   high: t9
 %endmacro
@@ -2918,19 +2906,14 @@
     mova                   m0, [rsp+gprsize*2+16*1]
     mova                   m2, [rsp+gprsize*2+16*2]
     mova [rsp+gprsize*2+16*1], m4
-    psubsw                 m4, m0, m3                   ;t13
+    psubsw                 m5, m0, m3                   ;t13
     paddsw                 m0, m3                       ;t14
-    psubsw                 m3, m2, m1                   ;t12a
+    mova                   m3, [o(pd_2048)]
+    psubsw                 m4, m2, m1                   ;t12a
     paddsw                 m1, m2                       ;t15a
-    mova                   m5, [o(pw_2896x8)]
-    psubw                  m2, m4, m7                   ;t13-t10
-    paddw                  m7, m4                       ;t13+t10
-    psubw                  m4, m3, m6                   ;t12a-t11a
-    paddw                  m6, m3                       ;t12a+t11a
-    pmulhrsw               m7, m5                       ;t13a
-    pmulhrsw               m4, m5                       ;t11
-    pmulhrsw               m6, m5                       ;t12
-    pmulhrsw               m5, m2                       ;t10a
+    mova [rsp+gprsize*2+16*2], m1
+    ITX_MULSUB_2W           5, 7, 1, 2, 3, 2896, 2896   ;t10a, t13a
+    ITX_MULSUB_2W           4, 6, 1, 2, 3, 2896, 2896   ;t11,  t12
     mova                   m3, [rsp+gprsize*2+16*8]
     psubsw                 m2, m3, m5                   ;out10
     paddsw                 m3, m5                       ;out5
@@ -2950,6 +2933,7 @@
     mova [rsp+gprsize*2+16*5], m6
     psubsw                 m6, m7, m0                   ;out14
     paddsw                 m7, m0                       ;out1
+    mova                   m1, [rsp+gprsize*2+16*2]
     mova                   m0, [rsp+gprsize*2+16*3]
     mova [rsp+gprsize*2+16*4], m7
     psubsw                 m7, m0, m1                   ;out15
@@ -4211,35 +4195,30 @@
     psubsw                  m5, m3, m2                    ;t28a
     paddsw                  m3, m2                        ;t31a
     ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t19, t28
-
     mova                    m2, [rsp+gprsize*2+16*15]     ;tmp12
     psubsw                  m1, m5, m6                    ;t20a
     paddsw                  m5, m6                        ;t19a
     psubsw                  m6, m2, m5                    ;out19
     paddsw                  m2, m5                        ;out12
+    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
     mova [rsp+gprsize*2+16*22], m6                        ;out19
     mova [rsp+gprsize*2+16*15], m2                        ;out12
-    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
     psubsw                  m6, m4, m5                    ;t27a
     paddsw                  m4, m5                        ;t28a
+    ITX_MULSUB_2W            6, 1, 2, 5, 7, 2896, 2896    ;t20, t27
     mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp3
-    mova                    m7, [o(pw_2896x8)]
-    psubw                   m5, m6, m1                    ;t27a - t20a
-    paddw                   m6, m1                        ;t27a + t20a
-    psubsw                  m1, m2, m4                    ;out28
+    psubsw                  m5, m2, m4                    ;out28
     paddsw                  m2, m4                        ;out3
-    pmulhrsw                m5, m7                        ;t20
-    pmulhrsw                m6, m7                        ;t27
     mova                    m4, [rsp+gprsize*2+16*14]     ;tmp11
-    mova [rsp+gprsize*2+16*31], m1                        ;out28
+    mova [rsp+gprsize*2+16*31], m5                        ;out28
     mova [rsp+gprsize*2+16*6 ], m2                        ;out3
-    psubsw                  m1, m4, m5                    ;out20
-    paddsw                  m4, m5                        ;out11
+    psubsw                  m5, m4, m6                    ;out20
+    paddsw                  m4, m6                        ;out11
     mova                    m2, [rsp+gprsize*2+16*7 ]     ;tmp4
-    mova [rsp+gprsize*2+16*23], m1                        ;out20
+    mova [rsp+gprsize*2+16*23], m5                        ;out20
     mova [rsp+gprsize*2+16*14], m4                        ;out11
-    psubsw                  m5, m2, m6                    ;out27
-    paddsw                  m2, m6                        ;out4
+    psubsw                  m5, m2, m1                    ;out27
+    paddsw                  m2, m1                        ;out4
     mova                    m1, [rsp+gprsize*2+16*26]     ;t23a
     mova                    m4, [rsp+gprsize*2+16*27]     ;t24a
     mova [rsp+gprsize*2+16*30], m5                        ;out27
@@ -4248,27 +4227,24 @@
     paddsw                  m0, m1                        ;t16
     psubsw                  m2, m3, m4                    ;t24
     paddsw                  m3, m4                        ;t31
+    ITX_MULSUB_2W            2, 5, 4, 6, 7, 2896, 2896    ;t23a, t24a
     mova                    m6, [rsp+gprsize*2+16*18]     ;tmp15
-    psubw                   m1, m2, m5                    ;t24  - t23
-    paddw                   m2, m5                        ;t24  + t23
     psubsw                  m4, m6, m0                    ;out16
     paddsw                  m6, m0                        ;out15
-    pmulhrsw                m1, m7                        ;t23a
-    pmulhrsw                m2, m7                        ;t24a
     mova                    m0, [rsp+gprsize*2+16*3 ]     ;tmp0
-    mova                    m5, [rsp+gprsize*2+16*11]     ;tmp8
+    mova                    m1, [rsp+gprsize*2+16*11]     ;tmp8
     mova [rsp+gprsize*2+16*18], m6                        ;out15
     mova [rsp+gprsize*2+16*19], m4                        ;out16
     psubsw                  m6, m0, m3                    ;out31
     paddsw                  m0, m3                        ;out0
-    psubsw                  m4, m5, m1                    ;out23
-    paddsw                  m5, m1                        ;out8
+    psubsw                  m4, m1, m2                    ;out23
+    paddsw                  m1, m2                        ;out8
     mova                    m3, [rsp+gprsize*2+16*10]     ;tmp7
     mova [rsp+gprsize*2+16*34], m6                        ;out31
-    mova [rsp+gprsize*2+16*11], m5                        ;out8
+    mova [rsp+gprsize*2+16*11], m1                        ;out8
     mova [rsp+gprsize*2+16*26], m4                        ;out23
-    paddsw                  m6, m3, m2                    ;out7
-    psubsw                  m3, m2                        ;out24
+    paddsw                  m6, m3, m5                    ;out7
+    psubsw                  m3, m5                        ;out24
     mova                    m1, [rsp+gprsize*2+16*20]     ;t17
     mova                    m5, [rsp+gprsize*2+16*25]     ;t22
     mova                    m2, [rsp+gprsize*2+16*17]     ;tmp14
@@ -4283,23 +4259,20 @@
     mova [rsp+gprsize*2+16*20], m3                        ;out17
     psubsw                  m2, m1, m5                    ;t25a
     paddsw                  m1, m5                        ;t30a
-    psubw                   m3, m2, m4                    ;t25a - t22a
-    paddw                   m2, m4                        ;t25a + t22a
+    ITX_MULSUB_2W            2, 4, 3, 5, 7, 2896, 2896    ;t22, t25
     mova                    m5, [rsp+gprsize*2+16*4 ]     ;tmp1
-    pmulhrsw                m3, m7                        ;t22
-    pmulhrsw                m2, m7                        ;t25
-    psubsw                  m4, m5, m1                    ;out30
+    psubsw                  m3, m5, m1                    ;out30
     paddsw                  m5, m1                        ;out1
     mova                    m1, [rsp+gprsize*2+16*12]     ;tmp9
-    mova [rsp+gprsize*2+16*33], m4                        ;out30
+    mova [rsp+gprsize*2+16*33], m3                        ;out30
     mova [rsp+gprsize*2+16*4 ], m5                        ;out1
-    psubsw                  m4, m1, m3                    ;out22
-    paddsw                  m1, m3                        ;out9
+    psubsw                  m3, m1, m2                    ;out22
+    paddsw                  m1, m2                        ;out9
     mova                    m5, [rsp+gprsize*2+16*9 ]     ;tmp6
-    mova [rsp+gprsize*2+16*25], m4                        ;out22
+    mova [rsp+gprsize*2+16*25], m3                        ;out22
     mova [rsp+gprsize*2+16*12], m1                        ;out9
-    psubsw                  m3, m5, m2                    ;out25
-    paddsw                  m5, m2                        ;out6
+    psubsw                  m3, m5, m4                    ;out25
+    paddsw                  m5, m4                        ;out6
     mova                    m4, [rsp+gprsize*2+16*21]     ;t18a
     mova                    m1, [rsp+gprsize*2+16*24]     ;t21a
     mova                    m2, [rsp+gprsize*2+16*16]     ;tmp13
@@ -4315,17 +4288,14 @@
     mova [rsp+gprsize*2+16*16], m2                        ;out13
     psubsw                  m5, m3, m1                    ;t26
     paddsw                  m3, m1                        ;t29
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 2896, 2896    ;t21a, t26a
     mova                    m2, [rsp+gprsize*2+16*5 ]     ;tmp2
-    psubw                   m1, m5, m4                    ;t26 - t21
-    paddw                   m4, m5                        ;t26 + t21
-    psubsw                  m5, m2, m3                    ;out29
+    psubsw                  m1, m2, m3                    ;out29
     paddsw                  m2, m3                        ;out2
-    pmulhrsw                m1, m7                        ;t21a
-    pmulhrsw                m4, m7                        ;t26a
     mova                    m3, [rsp+gprsize*2+16*13]     ;tmp10
-    mova [rsp+gprsize*2+16*32], m5                        ;out29
-    psubsw                  m7, m3, m1                    ;out21
-    paddsw                  m3, m1                        ;out10
+    mova [rsp+gprsize*2+16*32], m1                        ;out29
+    psubsw                  m7, m3, m5                    ;out21
+    paddsw                  m3, m5                        ;out10
     mova                    m5, [rsp+gprsize*2+16*8 ]     ;tmp5
     mova [rsp+gprsize*2+16*24], m7                        ;out21
     mova [rsp+gprsize*2+16*13], m3                        ;out10
@@ -6010,7 +5980,6 @@
     psubw                   m5, m6, m3
     ITX_MULSUB_2W            5, 4, 2, 3, 7, 1567, 3784    ;t43, t52
 
-    mova                    m7, [o(pw_2896x8)]
     mova                    m2, [rsp+gprsize*2+16*38]     ;t35a
     mova                    m3, [rsp+gprsize*2+16*31]     ;tmp[28]
     psubsw                  m6, m2, m0                    ;t44
@@ -6017,90 +5986,81 @@
     paddsw                  m2, m0                        ;t35
     psubsw                  m0, m3, m2                    ;out35
     paddsw                  m2, m3                        ;out28
+    mova                    m3, [rsp+gprsize*2+16*63]     ;t60a
     mova [rsp+gprsize*2+16*38], m0                        ;out35
     mova [rsp+gprsize*2+16*31], m2                        ;out28
-    mova                    m3, [rsp+gprsize*2+16*63]     ;t60a
-    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3]
     psubsw                  m0, m3, m1                    ;t51
     paddsw                  m3, m1                        ;t60
-    psubw                   m1, m0, m6                    ;t44a
-    paddw                   m0, m6                        ;t51a
-    psubsw                  m6, m2, m3                    ;out60
+    ITX_MULSUB_2W            0, 6, 1, 2, 7, 2896, 2896    ;t44a, t51a
+    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp[3]
+    psubsw                  m1, m2, m3                    ;out60
     paddsw                  m2, m3                        ;out3
-    pmulhrsw                m1, m7                        ;t44a
-    pmulhrsw                m0, m7                        ;t51a
     mova                    m3, [rsp+gprsize*2+16*22]     ;tmp[19]
-    mova [rsp+gprsize*2+16*63], m6                        ;out60
+    mova [rsp+gprsize*2+16*63], m1                        ;out60
     mova [rsp+gprsize*2+16*6 ], m2                        ;out3
-    psubsw                  m6, m3, m1                    ;out44
-    paddsw                  m3, m1                        ;out19
+    psubsw                  m1, m3, m0                    ;out44
+    paddsw                  m3, m0                        ;out19
     mova                    m2, [rsp+gprsize*2+16*15]     ;tmp[12]
-    mova [rsp+gprsize*2+16*47], m6                        ;out44
-    mova [rsp+gprsize*2+16*22], m3                        ;out19
-    psubsw                  m1, m2, m0                    ;out51
-    paddsw                  m2, m0                        ;out12
-    mova [rsp+gprsize*2+16*54], m1                        ;out51
-    mova [rsp+gprsize*2+16*15], m2                        ;out12
 
     mova                    m0, [rsp+gprsize*2+16*39]     ;t36
+    mova [rsp+gprsize*2+16*47], m1                        ;out44
+    mova [rsp+gprsize*2+16*22], m3                        ;out19
     mova                    m1, [rsp+gprsize*2+16*62]     ;t59
+    psubsw                  m3, m2, m6                    ;out51
+    paddsw                  m2, m6                        ;out12
+    mova [rsp+gprsize*2+16*54], m3                        ;out51
+    mova [rsp+gprsize*2+16*15], m2                        ;out12
     psubsw                  m2, m0, m5                    ;t43a
     paddsw                  m0, m5                        ;t36a
+    mova                    m5, [rsp+gprsize*2+16*30]     ;tmp[27]
     psubsw                  m3, m1, m4                    ;t52a
     paddsw                  m1, m4                        ;t59a
-    psubw                   m5, m3, m2                    ;t43
-    paddw                   m3, m2                        ;t52
-    mova                    m2, [rsp+gprsize*2+16*30]     ;tmp[27]
+    ITX_MULSUB_2W            3, 2, 4, 6, 7, 2896, 2896    ;t43, t52
     mova                    m4, [rsp+gprsize*2+16*7 ]     ;tmp[4 ]
-    pmulhrsw                m5, m7                        ;t43
-    pmulhrsw                m3, m7                        ;t52
-    psubsw                  m6, m2, m0                    ;out36
-    paddsw                  m2, m0                        ;out27
+    psubsw                  m6, m5, m0                    ;out36
+    paddsw                  m5, m0                        ;out27
     psubsw                  m0, m4, m1                    ;out59
     paddsw                  m4, m1                        ;out4
     mova [rsp+gprsize*2+16*39], m6                        ;out36
-    mova [rsp+gprsize*2+16*30], m2                        ;out27
+    mova [rsp+gprsize*2+16*30], m5                        ;out27
     mova [rsp+gprsize*2+16*62], m0                        ;out59
     mova [rsp+gprsize*2+16*7 ], m4                        ;out4
     mova                    m0, [rsp+gprsize*2+16*23]     ;tmp[20]
-    mova                    m2, [rsp+gprsize*2+16*14]     ;tmp[11]
-    psubsw                  m4, m0, m5                    ;out43
-    paddsw                  m0, m5                        ;out20
-    psubsw                  m6, m2, m3                    ;out52
-    paddsw                  m2, m3                        ;out11
+    mova                    m5, [rsp+gprsize*2+16*14]     ;tmp[11]
+    psubsw                  m4, m0, m3                    ;out43
+    paddsw                  m0, m3                        ;out20
+    psubsw                  m6, m5, m2                    ;out52
+    paddsw                  m5, m2                        ;out11
     mova [rsp+gprsize*2+16*46], m4                        ;out43
     mova [rsp+gprsize*2+16*23], m0                        ;out20
     mova [rsp+gprsize*2+16*55], m6                        ;out52
-    mova [rsp+gprsize*2+16*14], m2                        ;out11
+    mova [rsp+gprsize*2+16*14], m5                        ;out11
 
     mova                    m0, [rsp+gprsize*2+16*40]     ;t37a
-    mova                    m2, [rsp+gprsize*2+16*45]     ;t42a
+    mova                    m5, [rsp+gprsize*2+16*45]     ;t42a
     mova                    m3, [rsp+gprsize*2+16*56]     ;t53a
     mova                    m1, [rsp+gprsize*2+16*61]     ;t58a
-    psubsw                  m4, m0, m2                    ;t42
-    paddsw                  m0, m2                        ;t37
+    mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26]
+    psubsw                  m4, m0, m5                    ;t42
+    paddsw                  m0, m5                        ;t37
     psubsw                  m5, m1, m3                    ;t53
     paddsw                  m1, m3                        ;t58
-    psubw                   m6, m5, m4                    ;t42a
-    paddw                   m5, m4                        ;t53a
-    mova                    m2, [rsp+gprsize*2+16*29]     ;tmp[26]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t43, t52
     mova                    m3, [rsp+gprsize*2+16*8 ]     ;tmp[5 ]
-    pmulhrsw                m6, m7                        ;t42a
-    pmulhrsw                m5, m7                        ;t53a
-    psubsw                  m4, m2, m0                    ;out37
+    psubsw                  m6, m2, m0                    ;out37
     paddsw                  m2, m0                        ;out26
     psubsw                  m0, m3, m1                    ;out58
     paddsw                  m3, m1                        ;out5
-    mova [rsp+gprsize*2+16*40], m4                        ;out37
+    mova [rsp+gprsize*2+16*40], m6                        ;out37
     mova [rsp+gprsize*2+16*29], m2                        ;out26
     mova [rsp+gprsize*2+16*61], m0                        ;out58
     mova [rsp+gprsize*2+16*8 ], m3                        ;out5
     mova                    m0, [rsp+gprsize*2+16*24]     ;tmp[21]
     mova                    m1, [rsp+gprsize*2+16*13]     ;tmp[10]
-    psubsw                  m2, m0, m6                    ;out42
-    paddsw                  m0, m6                        ;out21
-    psubsw                  m3, m1, m5                    ;out53
-    paddsw                  m1, m5                        ;out10
+    psubsw                  m2, m0, m5                    ;out42
+    paddsw                  m0, m5                        ;out21
+    psubsw                  m3, m1, m4                    ;out53
+    paddsw                  m1, m4                        ;out10
     mova [rsp+gprsize*2+16*45], m2                        ;out42
     mova [rsp+gprsize*2+16*24], m0                        ;out21
     mova [rsp+gprsize*2+16*56], m3                        ;out53
@@ -6107,33 +6067,30 @@
     mova [rsp+gprsize*2+16*13], m1                        ;out10
 
     mova                    m0, [rsp+gprsize*2+16*41]     ;t38
-    mova                    m2, [rsp+gprsize*2+16*44]     ;t41
+    mova                    m5, [rsp+gprsize*2+16*44]     ;t41
     mova                    m3, [rsp+gprsize*2+16*57]     ;t54
     mova                    m1, [rsp+gprsize*2+16*60]     ;t57
-    psubsw                  m4, m0, m2                    ;t41a
-    paddsw                  m0, m2                        ;t38a
+    mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25]
+    psubsw                  m4, m0, m5                    ;t41a
+    paddsw                  m0, m5                        ;t38a
     psubsw                  m5, m1, m3                    ;t54a
     paddsw                  m1, m3                        ;t57a
-    psubw                   m6, m5, m4                    ;t41
-    paddw                   m5, m4                        ;t54
-    mova                    m2, [rsp+gprsize*2+16*28]     ;tmp[25]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t41a, t54a
     mova                    m3, [rsp+gprsize*2+16*9 ]     ;tmp[6 ]
-    pmulhrsw                m6, m7                        ;t41a
-    pmulhrsw                m5, m7                        ;t54a
-    psubsw                  m4, m2, m0                    ;out38
+    psubsw                  m6, m2, m0                    ;out38
     paddsw                  m2, m0                        ;out25
     psubsw                  m0, m3, m1                    ;out57
     paddsw                  m3, m1                        ;out6
-    mova [rsp+gprsize*2+16*41], m4                        ;out38
+    mova [rsp+gprsize*2+16*41], m6                        ;out38
     mova [rsp+gprsize*2+16*28], m2                        ;out25
     mova [rsp+gprsize*2+16*60], m0                        ;out57
     mova [rsp+gprsize*2+16*9 ], m3                        ;out6
     mova                    m0, [rsp+gprsize*2+16*25]     ;tmp[22]
     mova                    m1, [rsp+gprsize*2+16*12]     ;tmp[9 ]
-    psubsw                  m2, m0, m6                    ;out41
-    paddsw                  m0, m6                        ;out22
-    psubsw                  m3, m1, m5                    ;out54
-    paddsw                  m1, m5                        ;out9
+    psubsw                  m2, m0, m5                    ;out41
+    paddsw                  m0, m5                        ;out22
+    psubsw                  m3, m1, m4                    ;out54
+    paddsw                  m1, m4                        ;out9
     mova [rsp+gprsize*2+16*44], m2                        ;out41
     mova [rsp+gprsize*2+16*25], m0                        ;out22
     mova [rsp+gprsize*2+16*57], m3                        ;out54
@@ -6140,33 +6097,30 @@
     mova [rsp+gprsize*2+16*12], m1                        ;out9
 
     mova                    m0, [rsp+gprsize*2+16*42]     ;t39a
-    mova                    m2, [rsp+gprsize*2+16*43]     ;t40a
+    mova                    m5, [rsp+gprsize*2+16*43]     ;t40a
     mova                    m3, [rsp+gprsize*2+16*58]     ;t55a
     mova                    m1, [rsp+gprsize*2+16*59]     ;t56a
-    psubsw                  m4, m0, m2                    ;t40
-    paddsw                  m0, m2                        ;t39
+    mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24]
+    psubsw                  m4, m0, m5                    ;t40
+    paddsw                  m0, m5                        ;t39
     psubsw                  m5, m1, m3                    ;t55
     paddsw                  m1, m3                        ;t56
-    psubw                   m6, m5, m4                    ;t40a
-    paddw                   m5, m4                        ;t55a
-    mova                    m2, [rsp+gprsize*2+16*27]     ;tmp[24]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t40a, t55a
     mova                    m3, [rsp+gprsize*2+16*10]     ;tmp[7 ]
-    pmulhrsw                m6, m7                        ;t40a
-    pmulhrsw                m5, m7                        ;t55a
-    psubsw                  m4, m2, m0                    ;out39
+    psubsw                  m6, m2, m0                    ;out39
     paddsw                  m2, m0                        ;out24
     psubsw                  m0, m3, m1                    ;out56
     paddsw                  m3, m1                        ;out7
-    mova [rsp+gprsize*2+16*42], m4                        ;out39
+    mova [rsp+gprsize*2+16*42], m6                        ;out39
     mova [rsp+gprsize*2+16*27], m2                        ;out24
     mova [rsp+gprsize*2+16*59], m0                        ;out56
     mova [rsp+gprsize*2+16*10], m3                        ;out7
     mova                    m0, [rsp+gprsize*2+16*26]     ;tmp[23]
     mova                    m1, [rsp+gprsize*2+16*11]     ;tmp[8 ]
-    psubsw                  m2, m0, m6                    ;out40
-    paddsw                  m0, m6                        ;out23
-    psubsw                  m3, m1, m5                    ;out55
-    paddsw                  m1, m5                        ;out8
+    psubsw                  m2, m0, m5                    ;out40
+    paddsw                  m0, m5                        ;out23
+    psubsw                  m3, m1, m4                    ;out55
+    paddsw                  m1, m4                        ;out8
     mova [rsp+gprsize*2+16*43], m2                        ;out40
     mova [rsp+gprsize*2+16*26], m0                        ;out23
     mova [rsp+gprsize*2+16*58], m3                        ;out55
@@ -6173,33 +6127,30 @@
     mova [rsp+gprsize*2+16*11], m1                        ;out8
 
     mova                    m0, [rsp+gprsize*2+16*37]     ;t34
-    mova                    m2, [rsp+gprsize*2+16*48]     ;t45
+    mova                    m5, [rsp+gprsize*2+16*48]     ;t45
     mova                    m3, [rsp+gprsize*2+16*53]     ;t50
     mova                    m1, [rsp+gprsize*2+16*64]     ;t61
-    psubsw                  m4, m0, m2                    ;t45a
-    paddsw                  m0, m2                        ;t34a
+    mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29]
+    psubsw                  m4, m0, m5                    ;t45a
+    paddsw                  m0, m5                        ;t34a
     psubsw                  m5, m1, m3                    ;t50a
     paddsw                  m1, m3                        ;t61a
-    psubw                   m6, m5, m4                    ;t45
-    paddw                   m5, m4                        ;t50
-    mova                    m2, [rsp+gprsize*2+16*32]     ;tmp[29]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
     mova                    m3, [rsp+gprsize*2+16*5 ]     ;tmp[2 ]
-    pmulhrsw                m6, m7                        ;t45
-    pmulhrsw                m5, m7                        ;t50
-    psubsw                  m4, m2, m0                    ;out34
+    psubsw                  m6, m2, m0                    ;out34
     paddsw                  m2, m0                        ;out29
     psubsw                  m0, m3, m1                    ;out61
     paddsw                  m3, m1                        ;out2
-    mova [rsp+gprsize*2+16*37], m4                        ;out34
+    mova [rsp+gprsize*2+16*37], m6                        ;out34
     mova [rsp+gprsize*2+16*32], m2                        ;out29
     mova [rsp+gprsize*2+16*64], m0                        ;out61
     mova [rsp+gprsize*2+16*5 ], m3                        ;out2
     mova                    m0, [rsp+gprsize*2+16*21]     ;tmp[18]
     mova                    m1, [rsp+gprsize*2+16*16]     ;tmp[13]
-    psubsw                  m2, m0, m6                    ;out45
-    paddsw                  m0, m6                        ;out18
-    psubsw                  m3, m1, m5                    ;out50
-    paddsw                  m1, m5                        ;out13
+    psubsw                  m2, m0, m5                    ;out45
+    paddsw                  m0, m5                        ;out18
+    psubsw                  m3, m1, m4                    ;out50
+    paddsw                  m1, m4                        ;out13
     mova [rsp+gprsize*2+16*48], m2                        ;out45
     mova [rsp+gprsize*2+16*21], m0                        ;out18
     mova [rsp+gprsize*2+16*53], m3                        ;out50
@@ -6206,33 +6157,30 @@
     mova [rsp+gprsize*2+16*16], m1                        ;out13
 
     mova                    m0, [rsp+gprsize*2+16*36]     ;t33a
-    mova                    m2, [rsp+gprsize*2+16*49]     ;t46a
+    mova                    m5, [rsp+gprsize*2+16*49]     ;t46a
     mova                    m3, [rsp+gprsize*2+16*52]     ;t49a
     mova                    m1, [rsp+gprsize*2+16*65]     ;t62a
-    psubsw                  m4, m0, m2                    ;t46
-    paddsw                  m0, m2                        ;t33
+    mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30]
+    psubsw                  m4, m0, m5                    ;t46
+    paddsw                  m0, m5                        ;t33
     psubsw                  m5, m1, m3                    ;t49
     paddsw                  m1, m3                        ;t62
-    psubw                   m6, m5, m4                    ;t46a
-    paddw                   m5, m4                        ;t49a
-    mova                    m2, [rsp+gprsize*2+16*33]     ;tmp[30]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t45, t50
     mova                    m3, [rsp+gprsize*2+16*4 ]     ;tmp[1 ]
-    pmulhrsw                m6, m7                        ;t46a
-    pmulhrsw                m5, m7                        ;t49a
-    psubsw                  m4, m2, m0                    ;out33
+    psubsw                  m6, m2, m0                    ;out33
     paddsw                  m2, m0                        ;out30
     psubsw                  m0, m3, m1                    ;out62
     paddsw                  m3, m1                        ;out1
-    mova [rsp+gprsize*2+16*36], m4                        ;out33
+    mova [rsp+gprsize*2+16*36], m6                        ;out33
     mova [rsp+gprsize*2+16*33], m2                        ;out30
     mova [rsp+gprsize*2+16*65], m0                        ;out62
     mova [rsp+gprsize*2+16*4 ], m3                        ;out1
     mova                    m0, [rsp+gprsize*2+16*20]     ;tmp[17]
     mova                    m1, [rsp+gprsize*2+16*17]     ;tmp[14]
-    psubsw                  m2, m0, m6                    ;out46
-    paddsw                  m0, m6                        ;out17
-    psubsw                  m3, m1, m5                    ;out49
-    paddsw                  m1, m5                        ;out14
+    psubsw                  m2, m0, m5                    ;out46
+    paddsw                  m0, m5                        ;out17
+    psubsw                  m3, m1, m4                    ;out49
+    paddsw                  m1, m4                        ;out14
     mova [rsp+gprsize*2+16*49], m2                        ;out46
     mova [rsp+gprsize*2+16*20], m0                        ;out17
     mova [rsp+gprsize*2+16*52], m3                        ;out49
@@ -6239,39 +6187,35 @@
     mova [rsp+gprsize*2+16*17], m1                        ;out14
 
     mova                    m0, [rsp+gprsize*2+16*35]     ;t32
-    mova                    m2, [rsp+gprsize*2+16*50]     ;t47
+    mova                    m5, [rsp+gprsize*2+16*50]     ;t47
     mova                    m3, [rsp+gprsize*2+16*51]     ;t48
     mova                    m1, [rsp+gprsize*2+16*66]     ;t63
-    psubsw                  m4, m0, m2                    ;t47a
-    paddsw                  m0, m2                        ;t32a
+    mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31]
+    psubsw                  m4, m0, m5                    ;t47a
+    paddsw                  m0, m5                        ;t32a
     psubsw                  m5, m1, m3                    ;t48a
     paddsw                  m1, m3                        ;t63a
-    psubw                   m6, m5, m4                    ;t47
-    paddw                   m5, m4                        ;t48
-    mova                    m2, [rsp+gprsize*2+16*34]     ;tmp[31]
+    ITX_MULSUB_2W            5, 4, 3, 6, 7, 2896, 2896    ;t47, t48
     mova                    m3, [rsp+gprsize*2+16*3 ]     ;tmp[0 ]
-    pmulhrsw                m6, m7                        ;t47
-    pmulhrsw                m5, m7                        ;t48
-    psubsw                  m4, m2, m0                    ;out32
+    psubsw                  m6, m2, m0                    ;out32
     paddsw                  m2, m0                        ;out31
     psubsw                  m0, m3, m1                    ;out63
     paddsw                  m3, m1                        ;out0
-    mova [rsp+gprsize*2+16*35], m4                        ;out32
+    mova [rsp+gprsize*2+16*35], m6                        ;out32
     mova [rsp+gprsize*2+16*34], m2                        ;out31
     mova [rsp+gprsize*2+16*66], m0                        ;out63
     mova [rsp+gprsize*2+16*3 ], m3                        ;out0
     mova                    m0, [rsp+gprsize*2+16*19]     ;tmp[16]
     mova                    m1, [rsp+gprsize*2+16*18]     ;tmp[15]
-    psubsw                  m2, m0, m6                    ;out47
-    paddsw                  m0, m6                        ;out16
-    psubsw                  m3, m1, m5                    ;out48
-    paddsw                  m1, m5                        ;out15
+    psubsw                  m2, m0, m5                    ;out47
+    paddsw                  m0, m5                        ;out16
+    psubsw                  m3, m1, m4                    ;out48
+    paddsw                  m1, m4                        ;out15
     mova [rsp+gprsize*2+16*50], m2                        ;out47
     mova [rsp+gprsize*2+16*19], m0                        ;out16
     mova [rsp+gprsize*2+16*51], m3                        ;out48
     mova [rsp+gprsize*2+16*18], m1                        ;out15
     ret
-
 
 
 cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2