shithub: dav1d

Download patch

ref: 585ac4624890355f6df219cf081d055d367347c2
parent: 5d944dc6cf5a5590bafc8e1eb483bc442c079b02
author: Liwei Wang <[email protected]>
date: Fri Mar 15 09:25:19 EDT 2019

Add SSSE3 implementation for the 8x32 and 32x8 blocks in itx

Cycle times:
inv_txfm_add_8x32_dct_dct_0_8bpc_c: 1164.7
inv_txfm_add_8x32_dct_dct_0_8bpc_ssse3: 79.5
inv_txfm_add_8x32_dct_dct_1_8bpc_c: 11291.6
inv_txfm_add_8x32_dct_dct_1_8bpc_ssse3: 508.5
inv_txfm_add_8x32_dct_dct_2_8bpc_c: 10720.4
inv_txfm_add_8x32_dct_dct_2_8bpc_ssse3: 507.9
inv_txfm_add_8x32_dct_dct_3_8bpc_c: 12351.5
inv_txfm_add_8x32_dct_dct_3_8bpc_ssse3: 687.2
inv_txfm_add_8x32_dct_dct_4_8bpc_c: 10402.3
inv_txfm_add_8x32_dct_dct_4_8bpc_ssse3: 687.9
inv_txfm_add_8x32_identity_identity_0_8bpc_c: 3485.0
inv_txfm_add_8x32_identity_identity_0_8bpc_ssse3: 97.7
inv_txfm_add_8x32_identity_identity_1_8bpc_c: 3495.7
inv_txfm_add_8x32_identity_identity_1_8bpc_ssse3: 97.7
inv_txfm_add_8x32_identity_identity_2_8bpc_c: 3503.7
inv_txfm_add_8x32_identity_identity_2_8bpc_ssse3: 97.8
inv_txfm_add_8x32_identity_identity_3_8bpc_c: 3489.5
inv_txfm_add_8x32_identity_identity_3_8bpc_ssse3: 184.4
inv_txfm_add_8x32_identity_identity_4_8bpc_c: 3498.1
inv_txfm_add_8x32_identity_identity_4_8bpc_ssse3: 182.8
inv_txfm_add_32x8_dct_dct_0_8bpc_c: 1220.4
inv_txfm_add_32x8_dct_dct_0_8bpc_ssse3: 65.6
inv_txfm_add_32x8_dct_dct_1_8bpc_c: 11120.7
inv_txfm_add_32x8_dct_dct_1_8bpc_ssse3: 623.8
inv_txfm_add_32x8_dct_dct_2_8bpc_c: 12236.3
inv_txfm_add_32x8_dct_dct_2_8bpc_ssse3: 624.7
inv_txfm_add_32x8_dct_dct_3_8bpc_c: 10866.3
inv_txfm_add_32x8_dct_dct_3_8bpc_ssse3: 694.1
inv_txfm_add_32x8_dct_dct_4_8bpc_c: 10322.8
inv_txfm_add_32x8_dct_dct_4_8bpc_ssse3: 692.5
inv_txfm_add_32x8_identity_identity_0_8bpc_c: 3368.1
inv_txfm_add_32x8_identity_identity_0_8bpc_ssse3: 98.6
inv_txfm_add_32x8_identity_identity_1_8bpc_c: 3381.1
inv_txfm_add_32x8_identity_identity_1_8bpc_ssse3: 98.3
inv_txfm_add_32x8_identity_identity_2_8bpc_c: 3376.6
inv_txfm_add_32x8_identity_identity_2_8bpc_ssse3: 98.3
inv_txfm_add_32x8_identity_identity_3_8bpc_c: 3364.3
inv_txfm_add_32x8_identity_identity_3_8bpc_ssse3: 182.2
inv_txfm_add_32x8_identity_identity_4_8bpc_c: 3390.0
inv_txfm_add_32x8_identity_identity_4_8bpc_ssse3: 182.2

--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -86,6 +86,8 @@
 decl_itx16_fns( 8, 16, ssse3);
 decl_itx16_fns(16,  8, ssse3);
 decl_itx12_fns(16, 16, ssse3);
+decl_itx2_fns ( 8, 32, ssse3);
+decl_itx2_fns (32,  8, ssse3);
 
 void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
 #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@@ -138,6 +140,8 @@
     assign_itx16_fn(R,  8, 16, ssse3);
     assign_itx16_fn(R, 16,  8, ssse3);
     assign_itx12_fn(,  16, 16, ssse3);
+    assign_itx2_fn (R,  8, 32, ssse3);
+    assign_itx2_fn (R, 32,  8, ssse3);
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -76,6 +76,23 @@
 pw_5793x4:      times 8 dw  5793*4
 pw_8192:        times 8 dw  8192
 pw_m8192:       times 8 dw -8192
+pw_5:           times 8 dw  5
+pw_201x8:       times 8 dw   201*8
+pw_4091x8:      times 8 dw  4091*8
+pw_m2751x8:     times 8 dw -2751*8
+pw_3035x8:      times 8 dw  3035*8
+pw_1751x8:      times 8 dw  1751*8
+pw_3703x8:      times 8 dw  3703*8
+pw_m1380x8:     times 8 dw -1380*8
+pw_3857x8:      times 8 dw  3857*8
+pw_995x8:       times 8 dw   995*8
+pw_3973x8:      times 8 dw  3973*8
+pw_m2106x8:     times 8 dw -2106*8
+pw_3513x8:      times 8 dw  3513*8
+pw_2440x8:      times 8 dw  2440*8
+pw_3290x8:      times 8 dw  3290*8
+pw_m601x8:      times 8 dw  -601*8
+pw_4052x8:      times 8 dw  4052*8
 
 iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
 iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
@@ -1949,6 +1966,16 @@
     mova                 m6, [%1+%2*6]
 %endmacro
 
+%macro SAVE_7ROWS 2 ;dst, stride: store m0-m6 to [dst+stride*0..6] (m7 is not written)
+    mova          [%1+%2*0], m0
+    mova          [%1+%2*1], m1
+    mova          [%1+%2*2], m2
+    mova          [%1+%2*3], m3
+    mova          [%1+%2*4], m4
+    mova          [%1+%2*5], m5
+    mova          [%1+%2*6], m6
+%endmacro
+
 %macro IDCT16_1D_PACKED_ODDHALF 7  ;src[1-4], tmp[1-3]
     punpckhwd            m%5, m%4, m%1                ;packed in13 in3
     punpcklwd            m%1, m%4                     ;packed in1  in15
@@ -1993,7 +2020,7 @@
 INV_TXFM_16X4_FN dct, identity, 3
 
 cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    LOAD_7ROWS coeffq, 16
+    LOAD_7ROWS        coeffq, 16
     call .main
 
 .pass1_end:
@@ -2098,7 +2125,7 @@
 INV_TXFM_16X4_FN adst, identity
 
 cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    LOAD_7ROWS coeffq, 16
+    LOAD_7ROWS        coeffq, 16
     call .main
 
     punpckhwd             m6, m7, m0                 ;packed -out11, -out15
@@ -2236,7 +2263,7 @@
 INV_TXFM_16X4_FN flipadst, identity
 
 cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    LOAD_7ROWS coeffq, 16
+    LOAD_7ROWS        coeffq, 16
     call m(iadst_16x4_internal).main
 
     punpcklwd             m6, m7, m0                 ;packed  out11,  out15
@@ -2266,7 +2293,7 @@
 INV_TXFM_16X4_FN identity, identity
 
 cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    LOAD_7ROWS coeffq, 16
+    LOAD_7ROWS        coeffq, 16
     mova                  m7, [o(pw_5793x4)]
     REPX    {psllw    x, 2 }, m0, m1, m2, m3, m4, m5, m6
     REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
@@ -2299,17 +2326,6 @@
     mova                 [%1+%2*7], m7
 %endmacro
 
-%macro ITX_8X16_LOAD_STACK_COEFS 0
-    mova                   m0, [rsp+gprsize+16*3]
-    mova                   m1, [rsp+gprsize+16*4]
-    mova                   m2, [rsp+gprsize+16*5]
-    mova                   m3, [rsp+gprsize+16*6]
-    mova                   m4, [rsp+gprsize+16*7]
-    mova                   m5, [rsp+gprsize+16*8]
-    mova                   m6, [rsp+gprsize+16*9]
-    mova                   m7, [rsp+gprsize+32*5]
-%endmacro
-
 %macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
     INV_TXFM_FN          %1, %2, %3, 8x16, 8, 16*12
 %ifidn %1_%2, dct_dct
@@ -2435,14 +2451,7 @@
 .pass2_main:
     call m(idct_8x8_internal).main
 
-    mova   [rsp+gprsize+16*3], m0
-    mova   [rsp+gprsize+16*4], m1
-    mova   [rsp+gprsize+16*5], m2
-    mova   [rsp+gprsize+16*6], m3
-    mova   [rsp+gprsize+16*7], m4
-    mova   [rsp+gprsize+16*8], m5
-    mova   [rsp+gprsize+16*9], m6
-
+    SAVE_7ROWS   rsp+gprsize+16*3, 16
     mova                   m0, [coeffq+16*2 ]
     mova                   m1, [coeffq+16*6 ]
     mova                   m2, [coeffq+16*10]
@@ -2458,7 +2467,7 @@
     jmp  m(idct_8x8_internal).end
 
 .end:
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
     lea                  tx2q, [o(m(idct_8x16_internal).end1)]
     mov                  dstq, r3
@@ -2512,7 +2521,7 @@
     jmp m(iadst_8x8_internal).end
 
 .end:
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
     lea                  tx2q, [o(m(idct_8x16_internal).end1)]
     mov                  dstq, r3
@@ -2560,7 +2569,7 @@
     jmp  m(iflipadst_8x8_internal).end
 
 .end:
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(idct_8x16_internal).end1)]
     mov                   dstq, r3
@@ -2703,13 +2712,7 @@
 cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    coeffq+16*0, 32, 1
     call m(idct_8x8_internal).main
-    mova   [rsp+gprsize+16*3], m0
-    mova   [rsp+gprsize+16*4], m1
-    mova   [rsp+gprsize+16*5], m2
-    mova   [rsp+gprsize+16*6], m3
-    mova   [rsp+gprsize+16*7], m4
-    mova   [rsp+gprsize+16*8], m5
-    mova   [rsp+gprsize+16*9], m6
+    SAVE_7ROWS   rsp+gprsize+16*3, 16
 
     LOAD_8ROWS    coeffq+16*1, 32, 1
     call  .main
@@ -2719,7 +2722,7 @@
 
 .pass1_end:
     SAVE_8ROWS    coeffq+16*1, 32
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
     mov                  tx2q, r3
     jmp  m(idct_8x8_internal).pass1_end
@@ -2863,7 +2866,7 @@
 
 .pass1_end:
     SAVE_8ROWS    coeffq+16*1, 32
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
     mov                  tx2q, r3
     jmp m(iadst_8x8_internal).pass1_end
@@ -3067,7 +3070,7 @@
 
     mova                    m7, [rsp+gprsize+16*0]
     SAVE_8ROWS     coeffq+16*0, 32
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mov                     r3, tx2q
     lea                   tx2q, [o(m(iflipadst_16x8_internal).pass1_end)]
@@ -3098,15 +3101,7 @@
 INV_TXFM_16X8_FN identity, identity
 
 cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    mova                   m7, [o(pw_2896x8)]
-    pmulhrsw               m0, m7, [coeffq+16*8 ]
-    pmulhrsw               m1, m7, [coeffq+16*9 ]
-    pmulhrsw               m2, m7, [coeffq+16*10]
-    pmulhrsw               m3, m7, [coeffq+16*11]
-    pmulhrsw               m4, m7, [coeffq+16*12]
-    pmulhrsw               m5, m7, [coeffq+16*13]
-    pmulhrsw               m6, m7, [coeffq+16*14]
-    pmulhrsw               m7,     [coeffq+16*15]
+    LOAD_8ROWS    coeffq+16*8, 16, 1
 
     mov                    r3, tx2q
     lea                  tx2q, [o(m(iidentity_16x8_internal).pass1_end)]
@@ -3266,30 +3261,10 @@
 INV_TXFM_16X16_FN dct, flipadst
 
 cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    mova                    m0, [coeffq+16*1 ]
-    mova                    m1, [coeffq+16*5 ]
-    mova                    m2, [coeffq+16*9 ]
-    mova                    m3, [coeffq+16*13]
-    mova                    m4, [coeffq+16*17]
-    mova                    m5, [coeffq+16*21]
-    mova                    m6, [coeffq+16*25]
-    mova                    m7, [coeffq+16*29]
+    LOAD_8ROWS     coeffq+16*1, 64
     call  m(idct_8x8_internal).main
-    mova    [rsp+gprsize+16*3], m0
-    mova    [rsp+gprsize+16*4], m1
-    mova    [rsp+gprsize+16*5], m2
-    mova    [rsp+gprsize+16*6], m3
-    mova    [rsp+gprsize+16*7], m4
-    mova    [rsp+gprsize+16*8], m5
-    mova    [rsp+gprsize+16*9], m6
-    mova                    m0, [coeffq+16*3 ]
-    mova                    m1, [coeffq+16*7 ]
-    mova                    m2, [coeffq+16*11]
-    mova                    m3, [coeffq+16*15]
-    mova                    m4, [coeffq+16*19]
-    mova                    m5, [coeffq+16*23]
-    mova                    m6, [coeffq+16*27]
-    mova                    m7, [coeffq+16*31]
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*3, 64
     call m(idct_16x8_internal).main
     mov                     r3, tx2q
     lea                   tx2q, [o(m(idct_16x16_internal).pass1_end)]
@@ -3298,7 +3273,7 @@
 
 .pass1_end:
     SAVE_8ROWS    coeffq+16*17, 32
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(idct_16x16_internal).pass1_end1)]
     mova                    m7, [o(pw_8192)]
@@ -3306,30 +3281,10 @@
 
 .pass1_end1:
     SAVE_8ROWS     coeffq+16*1, 32
-    mova                    m0, [coeffq+16*0 ]
-    mova                    m1, [coeffq+16*4 ]
-    mova                    m2, [coeffq+16*8 ]
-    mova                    m3, [coeffq+16*12]
-    mova                    m4, [coeffq+16*16]
-    mova                    m5, [coeffq+16*20]
-    mova                    m6, [coeffq+16*24]
-    mova                    m7, [coeffq+16*28]
+    LOAD_8ROWS     coeffq+16*0, 64
     call  m(idct_8x8_internal).main
-    mova    [rsp+gprsize+16*3], m0
-    mova    [rsp+gprsize+16*4], m1
-    mova    [rsp+gprsize+16*5], m2
-    mova    [rsp+gprsize+16*6], m3
-    mova    [rsp+gprsize+16*7], m4
-    mova    [rsp+gprsize+16*8], m5
-    mova    [rsp+gprsize+16*9], m6
-    mova                    m0, [coeffq+16*2 ]
-    mova                    m1, [coeffq+16*6 ]
-    mova                    m2, [coeffq+16*10]
-    mova                    m3, [coeffq+16*14]
-    mova                    m4, [coeffq+16*18]
-    mova                    m5, [coeffq+16*22]
-    mova                    m6, [coeffq+16*26]
-    mova                    m7, [coeffq+16*30]
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*2, 64
     call m(idct_16x8_internal).main
     lea                   tx2q, [o(m(idct_16x16_internal).pass1_end2)]
     mova                    m7, [o(pw_8192)]
@@ -3337,7 +3292,7 @@
 
 .pass1_end2:
     SAVE_8ROWS    coeffq+16*16, 32
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mov                   tx2q, r3
     mova                    m7, [o(pw_8192)]
@@ -3348,7 +3303,7 @@
     jmp  m(idct_8x16_internal).pass2_pre
 
 .end:
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(idct_16x16_internal).end1)]
     mov                   dstq, r3
@@ -3443,7 +3398,7 @@
 
 .pass1_end:
     SAVE_8ROWS    coeffq+16*17, 32
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(iadst_16x16_internal).pass1_end1)]
     mova                    m7, [o(pw_8192)]
@@ -3460,7 +3415,7 @@
 
 .pass1_end2:
     SAVE_8ROWS    coeffq+16*16, 32
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mov                   tx2q, r3
     mova                    m7, [o(pw_8192)]
@@ -3471,7 +3426,7 @@
     jmp m(iadst_8x16_internal).pass2_pre
 
 .end:
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(iadst_16x16_internal).end1)]
     mov                   dstq, r3
@@ -3516,7 +3471,7 @@
 
 .pass1_end:
     SAVE_8ROWS     coeffq+16*1, 32
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(iflipadst_16x16_internal).pass1_end1)]
     mova                    m7, [o(pw_m8192)]
@@ -3529,7 +3484,7 @@
 
     mova                    m7, [rsp+gprsize+16*0]
     SAVE_8ROWS     coeffq+16*0, 32
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(iflipadst_16x16_internal).pass1_end2)]
     mova                    m7, [o(pw_m8192)]
@@ -3549,7 +3504,7 @@
     jmp m(iflipadst_8x16_internal).pass2_pre
 
 .end:
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(iflipadst_16x16_internal).end1)]
     lea                   dstq, [dstq+strideq*2]
@@ -3579,7 +3534,7 @@
     jmp m(iflipadst_8x16_internal).pass2_main
 
 .end2:
-    ITX_8X16_LOAD_STACK_COEFS
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(idct_8x16_internal).end1)]
     lea                   dstq, [dstq+strideq*2]
@@ -3661,3 +3616,686 @@
     lea                   tx2q, [o(m(idct_8x16_internal).end1)]
     lea                   dstq, [dstq+strideq*2]
     jmp .end
+
+; inv_txfm_add_dct_dct_8x32: entry point; eob == 0 takes the .dconly shortcut, otherwise idct_8x32_internal is called.
+cglobal inv_txfm_add_dct_dct_8x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                     r5, $$                        ;NOTE(review): presumably the base register used by o() - confirm
+%endif
+    test                  eobd, eobd
+    jz .dconly                                            ;eob == 0
+    call  m(idct_8x32_internal)
+    RET
+
+.dconly:
+    movd                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1, [coeffq]                 ;dc * 2896 / 4096 (rounded)
+    movd                 m2, [o(pw_8192)]
+    mov            [coeffq], eobd                         ;eobd is 0 on this path: clears the dc coefficient
+    pmulhrsw             m0, m2                           ;(x + 2) >> 2
+    psrlw                m2, 2            ;pw_2048
+    pmulhrsw             m0, m1                           ;second * 2896 / 4096
+    pmulhrsw             m0, m2                           ;(x + 8) >> 4
+    pshuflw              m0, m0, q0000
+    punpcklwd            m0, m0                           ;word 0 now replicated into all 8 words
+    mov                 r3d, 8                            ;NOTE(review): presumably the iteration count read by the 8x8 .loop - confirm
+    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x32).end)] ;NOTE(review): presumably where .loop jumps when done - confirm
+    jmp m(inv_txfm_add_dct_dct_8x8).loop
+
+.end:
+    RET
+
+; idct_8x32_internal: runs idct_8x8_internal on each 8x8 coefficient group, parks the results in stack slots,
+; then combines them via .main/.main_fast. eob <= 106 skips the groups at coeffq+16*2/16*3 (in16-in31 taken as 0).
+cglobal idct_8x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+    cmp                   eobd, 106
+    jle .fast                                             ;eob <= 106: only the first two groups are processed
+
+    LOAD_8ROWS     coeffq+16*3, 64                        ;group at coeffq+16*3 (stored below as in24-in31)
+    call  m(idct_8x8_internal).main
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_8x32_internal).pass1)] ;NOTE(review): presumably pass1_end1 returns via jmp tx2q - confirm
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1:
+    mova   [rsp+gprsize+16*9 ], m0                        ;in24
+    mova   [rsp+gprsize+16*10], m4                        ;in28
+    mova   [rsp+gprsize+16*17], m2                        ;in26
+    mova   [rsp+gprsize+16*18], m6                        ;in30
+    mova   [rsp+gprsize+16*31], m1                        ;in25
+    mova   [rsp+gprsize+16*30], m3                        ;in27
+    mova   [rsp+gprsize+16*27], m5                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+    LOAD_8ROWS     coeffq+16*2, 64                        ;group at coeffq+16*2 (stored below as in16-in23)
+    call  m(idct_8x8_internal).main
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_8x32_internal).pass1_1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_1:
+    mova   [rsp+gprsize+16*7 ], m0                        ;in16
+    mova   [rsp+gprsize+16*8 ], m4                        ;in20
+    mova   [rsp+gprsize+16*15], m2                        ;in18
+    mova   [rsp+gprsize+16*16], m6                        ;in22
+    mova   [rsp+gprsize+16*33], m1                        ;in17
+    mova   [rsp+gprsize+16*28], m3                        ;in19
+    mova   [rsp+gprsize+16*29], m5                        ;in21
+    mova   [rsp+gprsize+16*32], m7                        ;in23
+
+.fast:
+    LOAD_8ROWS     coeffq+16*1, 64                        ;group at coeffq+16*1 (stored below as in8-in15)
+    call  m(idct_8x8_internal).main
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_8x32_internal).pass1_end)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end:
+    mova   [rsp+gprsize+16*5 ], m0                        ;in8
+    mova   [rsp+gprsize+16*6 ], m4                        ;in12
+    mova   [rsp+gprsize+16*13], m2                        ;in10
+    mova   [rsp+gprsize+16*14], m6                        ;in14
+    mova   [rsp+gprsize+16*21], m1                        ;in9
+    mova   [rsp+gprsize+16*24], m3                        ;in11
+    mova   [rsp+gprsize+16*25], m5                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+    LOAD_8ROWS     coeffq+16*0, 64                        ;group at coeffq+16*0 (in0-in7)
+    call  m(idct_8x8_internal).main
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_8x32_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end1:
+    mova   [rsp+gprsize+16*11], m2                        ;in2
+    mova   [rsp+gprsize+16*12], m6                        ;in6
+    mova   [rsp+gprsize+16*19], m1                        ;in1
+    mova   [rsp+gprsize+16*26], m3                        ;in3
+    mova   [rsp+gprsize+16*23], m5                        ;in5
+    mova   [rsp+gprsize+16*22], m7                        ;in7
+    mova                    m1, m4                        ;in4
+    mova                    m2, [rsp+gprsize+16*5 ]       ;in8
+    mova                    m3, [rsp+gprsize+16*6 ]       ;in12
+
+    cmp                   eobd, 106
+    jg .full                                              ;eob > 106: in16-in31 were computed above
+
+    pxor                    m4, m4                        ;fast path: in16/in20/in24/in28 replaced by zero
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS   rsp+gprsize+16*3 , 16                    ;m0-m6 -> stack slots 3-9
+    mova                    m0, [rsp+gprsize+16*11]       ;in2
+    mova                    m1, [rsp+gprsize+16*12]       ;in6
+    mova                    m2, [rsp+gprsize+16*13]       ;in10
+    mova                    m3, [rsp+gprsize+16*14]       ;in14
+    pxor                    m4, m4                        ;fast path: in18/in22/in26/in30 replaced by zero
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]        ;NOTE(review): presumably the 8th result left in slot 0 by .main - confirm
+    SAVE_8ROWS   rsp+gprsize+16*11, 16                    ;m0-m7 -> stack slots 11-18
+
+    call .main_fast                                       ;callee uses gprsize*2: same slots, one more return address on the stack
+    jmp  .pass2
+
+.full:
+    mova                    m4, [rsp+gprsize+16*7 ]       ;in16
+    mova                    m5, [rsp+gprsize+16*8 ]       ;in20
+    mova                    m6, [rsp+gprsize+16*9 ]       ;in24
+    mova                    m7, [rsp+gprsize+16*10]       ;in28
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS   rsp+gprsize+16*3 , 16                    ;m0-m6 -> stack slots 3-9
+    LOAD_8ROWS   rsp+gprsize+16*11, 16                    ;slots 11-18: in2, in6, in10, in14, in18, in22, in26, in30
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]        ;NOTE(review): presumably the 8th result left in slot 0 by .main - confirm
+    SAVE_8ROWS   rsp+gprsize+16*11, 16                    ;m0-m7 -> stack slots 11-18
+    call .main                                            ;callee uses gprsize*2: same slots, one more return address on the stack
+
+.pass2:
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_8x32_internal).end1)]
+
+.end:
+    pxor                    m7, m7                        ;zero all 32 coefficient rows (16 bytes each), then continue at tx2q
+    REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \
+                                     8,  9,  10, 11, 12, 13, 14, 15, \
+                                     16, 17, 18, 19, 20, 21, 22, 23, \
+                                     24, 25, 26, 27, 28, 29, 30, 31
+
+    jmp                   tx2q
+
+.end1:
+    lea                   tx2q, [o(m(idct_8x32_internal).end2)]
+    jmp   m(idct_8x8_internal).end                        ;NOTE(review): presumably finishes m0-m6 + slot 0 and returns via tx2q - confirm
+
+.end2:
+    LOAD_8ROWS   rsp+gprsize+16*11, 16                    ;stack slots 11-18
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   dstq, [dstq+strideq*2]
+    lea                   tx2q, [o(m(idct_8x32_internal).end3)]
+    jmp   m(idct_8x8_internal).end
+
+.end3:
+    LOAD_8ROWS   rsp+gprsize+16*19, 16                    ;stack slots 19-26
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   dstq, [dstq+strideq*2]
+    lea                   tx2q, [o(m(idct_8x32_internal).end4)]
+    jmp   m(idct_8x8_internal).end
+
+.end4:
+    LOAD_8ROWS   rsp+gprsize+16*27, 16                    ;stack slots 27-34
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   dstq, [dstq+strideq*2]
+    lea                   tx2q, [o(m(idct_8x32_internal).end5)]
+    jmp   m(idct_8x8_internal).end
+
+.end5:
+    ret
+
+ALIGN function_align
+.main_fast: ;bottom half is zero
+    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
+    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
+    pmulhrsw                m3, m0, [o(pw_4091x8)]        ;t31a
+    pmulhrsw                m0, [o(pw_201x8)]             ;t16a
+    pmulhrsw                m2, m1, [o(pw_3035x8)]        ;t30a
+    pmulhrsw                m1, [o(pw_m2751x8)]           ;t17a
+    mova                    m7, [o(pd_2048)]
+    psubsw                  m4, m0, m1                    ;t17
+    paddsw                  m0, m1                        ;t16
+    psubsw                  m5, m3, m2                    ;t30
+    paddsw                  m3, m2                        ;t31
+    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
+    mova [rsp+gprsize*2+16*19], m0                        ;t16
+    mova [rsp+gprsize*2+16*20], m5                        ;t17a
+    mova [rsp+gprsize*2+16*33], m4                        ;t30a
+    mova [rsp+gprsize*2+16*34], m3                        ;t31
+    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
+    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
+    pmulhrsw                m3, m0, [o(pw_3703x8)]        ;t29a
+    pmulhrsw                m0, [o(pw_1751x8)]            ;t18a
+    pmulhrsw                m2, m1, [o(pw_3857x8)]        ;t28a
+    pmulhrsw                m1, [o(pw_m1380x8)]           ;t19a
+    psubsw                  m4, m1, m0                    ;t18
+    paddsw                  m0, m1                        ;t19
+    psubsw                  m5, m2, m3                    ;t29
+    paddsw                  m3, m2                        ;t28
+    pxor                    m2, m2
+    psubw                   m2, m4                        ;-t18
+    ITX_MULSUB_2W            2, 5, 1, 4, 7,  799, 4017    ;t18a, t29a
+    mova [rsp+gprsize*2+16*21], m2                        ;t18a
+    mova [rsp+gprsize*2+16*22], m0                        ;t19
+    mova [rsp+gprsize*2+16*31], m3                        ;t28
+    mova [rsp+gprsize*2+16*32], m5                        ;t29a
+    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
+    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
+    pmulhrsw                m3, m0, [o(pw_3973x8)]        ;t27a
+    pmulhrsw                m0, [o(pw_995x8)]             ;t20a
+    pmulhrsw                m2, m1, [o(pw_3513x8)]        ;t26a
+    pmulhrsw                m1, [o(pw_m2106x8)]           ;t21a
+    psubsw                  m4, m0, m1                    ;t21
+    paddsw                  m0, m1                        ;t20
+    psubsw                  m5, m3, m2                    ;t26
+    paddsw                  m3, m2                        ;t27
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
+    mova [rsp+gprsize*2+16*23], m0                        ;t20
+    mova [rsp+gprsize*2+16*24], m5                        ;t21a
+    mova [rsp+gprsize*2+16*29], m4                        ;t26a
+    mova [rsp+gprsize*2+16*30], m3                        ;t27
+    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
+    mova                    m2, [rsp+gprsize*2+16*26]     ;in3
+    pmulhrsw                m3, m0, [o(pw_3290x8)]        ;t25a
+    pmulhrsw                m0, [o(pw_2440x8)]            ;t22a
+    pmulhrsw                m1, m2, [o(pw_4052x8)]        ;t24a
+    pmulhrsw                m2, [o(pw_m601x8)]            ;t23a
+    jmp .main2                                            ;m0-m3 now match what .main holds on reaching .main2
+
+ALIGN function_align
+.main:
+    mova                    m7, [o(pd_2048)]
+    mova                    m0, [rsp+gprsize*2+16*19]     ;in1
+    mova                    m1, [rsp+gprsize*2+16*20]     ;in15
+    mova                    m2, [rsp+gprsize*2+16*33]     ;in17
+    mova                    m3, [rsp+gprsize*2+16*34]     ;in31
+    ITX_MULSUB_2W            0, 3, 4, 5, 7,  201, 4091    ;t16a, t31a
+    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3035, 2751    ;t17a, t30a
+    psubsw                  m4, m0, m2                    ;t17
+    paddsw                  m0, m2                        ;t16
+    psubsw                  m5, m3, m1                    ;t30
+    paddsw                  m3, m1                        ;t31
+    ITX_MULSUB_2W            5, 4, 1, 2, 7,  799, 4017    ;t17a, t30a
+    mova [rsp+gprsize*2+16*19], m0                        ;t16
+    mova [rsp+gprsize*2+16*20], m5                        ;t17a
+    mova [rsp+gprsize*2+16*33], m4                        ;t30a
+    mova [rsp+gprsize*2+16*34], m3                        ;t31
+    mova                    m0, [rsp+gprsize*2+16*21]     ;in9
+    mova                    m1, [rsp+gprsize*2+16*22]     ;in7
+    mova                    m2, [rsp+gprsize*2+16*31]     ;in25
+    mova                    m3, [rsp+gprsize*2+16*32]     ;in23
+    ITX_MULSUB_2W            0, 3, 4, 5, 7, 1751, 3703    ;t18a, t29a
+    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3857, 1380    ;t19a, t28a
+    psubsw                  m4, m2, m0                    ;t18
+    paddsw                  m0, m2                        ;t19
+    psubsw                  m5, m1, m3                    ;t29
+    paddsw                  m3, m1                        ;t28
+    pxor                    m2, m2
+    psubw                   m2, m4                        ;-t18
+    ITX_MULSUB_2W            2, 5, 1, 4, 7,  799, 4017    ;t18a, t29a
+    mova [rsp+gprsize*2+16*21], m2                        ;t18a
+    mova [rsp+gprsize*2+16*22], m0                        ;t19
+    mova [rsp+gprsize*2+16*31], m3                        ;t28
+    mova [rsp+gprsize*2+16*32], m5                        ;t29a
+    mova                    m0, [rsp+gprsize*2+16*23]     ;in5
+    mova                    m1, [rsp+gprsize*2+16*24]     ;in11
+    mova                    m2, [rsp+gprsize*2+16*29]     ;in21
+    mova                    m3, [rsp+gprsize*2+16*30]     ;in27
+    ITX_MULSUB_2W            0, 3, 4, 5, 7,  995, 3973    ;t20a, t27a
+    ITX_MULSUB_2W            2, 1, 4, 5, 7, 3513, 2106    ;t21a, t26a
+    psubsw                  m4, m0, m2                    ;t21
+    paddsw                  m0, m2                        ;t20
+    psubsw                  m5, m3, m1                    ;t26
+    paddsw                  m3, m1                        ;t27
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 3406, 2276    ;t21a, t26a
+    mova [rsp+gprsize*2+16*23], m0                        ;t20
+    mova [rsp+gprsize*2+16*24], m5                        ;t21a
+    mova [rsp+gprsize*2+16*29], m4                        ;t26a
+    mova [rsp+gprsize*2+16*30], m3                        ;t27
+    mova                    m0, [rsp+gprsize*2+16*25]     ;in13
+    mova                    m1, [rsp+gprsize*2+16*26]     ;in3
+    mova                    m2, [rsp+gprsize*2+16*27]     ;in29
+    mova                    m3, [rsp+gprsize*2+16*28]     ;in19
+    ITX_MULSUB_2W            0, 3, 4, 5, 7, 2440, 3290    ;t22a, t25a
+    ITX_MULSUB_2W            2, 1, 4, 5, 7, 4052,  601    ;t23a, t24a
+
+.main2:
+    psubsw                  m4, m2, m0                    ;t22
+    paddsw                  m0, m2                        ;t23
+    psubsw                  m5, m1, m3                    ;t25
+    paddsw                  m3, m1                        ;t24
+    pxor                    m6, m6
+    psubw                   m2, m6, m4
+    ITX_MULSUB_2W            2, 5, 1, 4, 7, 3406, 2276    ;t22a, t25a
+
+    mova                    m4, [rsp+gprsize*2+16*24]     ;t21a
+    psubsw                  m1, m2, m4                    ;t21
+    paddsw                  m2, m4                        ;t22
+    psubw                   m4, m6, m1                    ;-t21
+    mova [rsp+gprsize*2+16*25], m2                        ;t22
+    mova                    m1, [rsp+gprsize*2+16*29]     ;t26a
+    psubsw                  m2, m5, m1                    ;t26
+    paddsw                  m5, m1                        ;t25
+    mova [rsp+gprsize*2+16*28], m5                        ;t25
+    ITX_MULSUB_2W            4, 2, 1, 5, 7, 1567, 3784    ;t21a, t26a
+    mova [rsp+gprsize*2+16*24], m4                        ;t21a
+    mova [rsp+gprsize*2+16*29], m2                        ;t26a
+
+    mova                    m1, [rsp+gprsize*2+16*23]     ;t20
+    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
+    psubsw                  m2, m0, m1                    ;t20a
+    paddsw                  m0, m1                        ;t23a
+    psubsw                  m4, m3, m5                    ;t27a
+    paddsw                  m3, m5                        ;t24a
+    psubw                   m6, m2                        ;-t20a
+    ITX_MULSUB_2W            6, 4, 1, 5, 7, 1567, 3784    ;t20, t27
+    mova [rsp+gprsize*2+16*26], m0                        ;t23a
+    mova [rsp+gprsize*2+16*27], m3                        ;t24a
+    mova [rsp+gprsize*2+16*30], m4                        ;t27
+
+    mova                    m0, [rsp+gprsize*2+16*20]     ;t17a
+    mova                    m1, [rsp+gprsize*2+16*21]     ;t18a
+    mova                    m2, [rsp+gprsize*2+16*32]     ;t29a
+    mova                    m3, [rsp+gprsize*2+16*33]     ;t30a
+    psubsw                  m4, m0, m1                    ;t18
+    paddsw                  m0, m1                        ;t17
+    psubsw                  m5, m3, m2                    ;t29
+    paddsw                  m3, m2                        ;t30
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t18a, t29a
+    mova [rsp+gprsize*2+16*20], m0                        ;t17
+    mova [rsp+gprsize*2+16*21], m5                        ;t18a
+    mova [rsp+gprsize*2+16*32], m4                        ;t29a
+    mova [rsp+gprsize*2+16*33], m3                        ;t30
+    mova                    m0, [rsp+gprsize*2+16*19]     ;t16
+    mova                    m1, [rsp+gprsize*2+16*22]     ;t19
+    mova                    m2, [rsp+gprsize*2+16*31]     ;t28
+    mova                    m3, [rsp+gprsize*2+16*34]     ;t31
+    psubsw                  m4, m0, m1                    ;t19a
+    paddsw                  m0, m1                        ;t16a
+    psubsw                  m5, m3, m2                    ;t28a
+    paddsw                  m3, m2                        ;t31a
+    ITX_MULSUB_2W            5, 4, 1, 2, 7, 1567, 3784    ;t19, t28
+
+    mova                    m2, [rsp+gprsize*2+16*15]     ;tmp12
+    psubsw                  m1, m5, m6                    ;t20a
+    paddsw                  m5, m6                        ;t19a
+    psubsw                  m6, m2, m5                    ;out19
+    paddsw                  m2, m5                        ;out12
+    mova [rsp+gprsize*2+16*22], m6                        ;out19
+    mova [rsp+gprsize*2+16*15], m2                        ;out12
+    mova                    m5, [rsp+gprsize*2+16*30]     ;t27
+    psubsw                  m6, m4, m5                    ;t27a
+    paddsw                  m4, m5                        ;t28a
+    mova                    m2, [rsp+gprsize*2+16*6 ]     ;tmp3
+    mova                    m7, [o(pw_2896x8)]
+    psubw                   m5, m6, m1                    ;t27a - t20a
+    paddw                   m6, m1                        ;t27a + t20a
+    psubsw                  m1, m2, m4                    ;out28
+    paddsw                  m2, m4                        ;out3
+    pmulhrsw                m5, m7                        ;t20
+    pmulhrsw                m6, m7                        ;t27
+    mova                    m4, [rsp+gprsize*2+16*14]     ;tmp11
+    mova [rsp+gprsize*2+16*31], m1                        ;out28
+    mova [rsp+gprsize*2+16*6 ], m2                        ;out3
+    psubsw                  m1, m4, m5                    ;out20
+    paddsw                  m4, m5                        ;out11
+    mova                    m2, [rsp+gprsize*2+16*7 ]     ;tmp4
+    mova [rsp+gprsize*2+16*23], m1                        ;out20
+    mova [rsp+gprsize*2+16*14], m4                        ;out11
+    psubsw                  m5, m2, m6                    ;out27
+    paddsw                  m2, m6                        ;out4
+    mova                    m1, [rsp+gprsize*2+16*26]     ;t23a
+    mova                    m4, [rsp+gprsize*2+16*27]     ;t24a
+    mova [rsp+gprsize*2+16*30], m5                        ;out27
+    mova [rsp+gprsize*2+16*7 ], m2                        ;out4
+    psubsw                  m5, m0, m1                    ;t23
+    paddsw                  m0, m1                        ;t16
+    psubsw                  m2, m3, m4                    ;t24
+    paddsw                  m3, m4                        ;t31
+    mova                    m6, [rsp+gprsize*2+16*18]     ;tmp15
+    psubw                   m1, m2, m5                    ;t24  - t23
+    paddw                   m2, m5                        ;t24  + t23
+    psubsw                  m4, m6, m0                    ;out16
+    paddsw                  m6, m0                        ;out15
+    pmulhrsw                m1, m7                        ;t23a
+    pmulhrsw                m2, m7                        ;t24a
+    mova                    m0, [rsp+gprsize*2+16*3 ]     ;tmp0
+    mova                    m5, [rsp+gprsize*2+16*11]     ;tmp8
+    mova [rsp+gprsize*2+16*18], m6                        ;out15
+    mova [rsp+gprsize*2+16*19], m4                        ;out16
+    psubsw                  m6, m0, m3                    ;out31
+    paddsw                  m0, m3                        ;out0
+    psubsw                  m4, m5, m1                    ;out23
+    paddsw                  m5, m1                        ;out8
+    mova                    m3, [rsp+gprsize*2+16*10]     ;tmp7
+    mova [rsp+gprsize*2+16*34], m6                        ;out31
+    mova [rsp+gprsize*2+16*11], m5                        ;out8
+    mova [rsp+gprsize*2+16*26], m4                        ;out23
+    paddsw                  m6, m3, m2                    ;out7
+    psubsw                  m3, m2                        ;out24
+    mova                    m1, [rsp+gprsize*2+16*20]     ;t17
+    mova                    m5, [rsp+gprsize*2+16*25]     ;t22
+    mova                    m2, [rsp+gprsize*2+16*17]     ;tmp14
+    mova [rsp+gprsize*2+16*27], m3                        ;out24
+    psubsw                  m4, m1, m5                    ;t22a
+    paddsw                  m1, m5                        ;t17a
+    psubsw                  m3, m2, m1                    ;out17
+    paddsw                  m2, m1                        ;out14
+    mova                    m5, [rsp+gprsize*2+16*28]     ;t25
+    mova                    m1, [rsp+gprsize*2+16*33]     ;t30
+    mova [rsp+gprsize*2+16*17], m2                        ;out14
+    mova [rsp+gprsize*2+16*20], m3                        ;out17
+    psubsw                  m2, m1, m5                    ;t25a
+    paddsw                  m1, m5                        ;t30a
+    psubw                   m3, m2, m4                    ;t25a - t22a
+    paddw                   m2, m4                        ;t25a + t22a
+    mova                    m5, [rsp+gprsize*2+16*4 ]     ;tmp1
+    pmulhrsw                m3, m7                        ;t22
+    pmulhrsw                m2, m7                        ;t25
+    psubsw                  m4, m5, m1                    ;out30
+    paddsw                  m5, m1                        ;out1
+    mova                    m1, [rsp+gprsize*2+16*12]     ;tmp9
+    mova [rsp+gprsize*2+16*33], m4                        ;out30
+    mova [rsp+gprsize*2+16*4 ], m5                        ;out1
+    psubsw                  m4, m1, m3                    ;out22
+    paddsw                  m1, m3                        ;out9
+    mova                    m5, [rsp+gprsize*2+16*9 ]     ;tmp6
+    mova [rsp+gprsize*2+16*25], m4                        ;out22
+    mova [rsp+gprsize*2+16*12], m1                        ;out9
+    psubsw                  m3, m5, m2                    ;out25
+    paddsw                  m5, m2                        ;out6
+    mova                    m4, [rsp+gprsize*2+16*21]     ;t18a
+    mova                    m1, [rsp+gprsize*2+16*24]     ;t21a
+    mova                    m2, [rsp+gprsize*2+16*16]     ;tmp13
+    mova [rsp+gprsize*2+16*28], m3                        ;out25
+    mova [rsp+gprsize*2+16*9 ], m5                        ;out6
+    paddsw                  m3, m4, m1                    ;t18
+    psubsw                  m4, m1                        ;t21
+    psubsw                  m5, m2, m3                    ;out18
+    paddsw                  m2, m3                        ;out13
+    mova                    m1, [rsp+gprsize*2+16*29]     ;t26a
+    mova                    m3, [rsp+gprsize*2+16*32]     ;t29a
+    mova [rsp+gprsize*2+16*21], m5                        ;out18
+    mova [rsp+gprsize*2+16*16], m2                        ;out13
+    psubsw                  m5, m3, m1                    ;t26
+    paddsw                  m3, m1                        ;t29
+    mova                    m2, [rsp+gprsize*2+16*5 ]     ;tmp2
+    psubw                   m1, m5, m4                    ;t26 - t21
+    paddw                   m4, m5                        ;t26 + t21
+    psubsw                  m5, m2, m3                    ;out29
+    paddsw                  m2, m3                        ;out2
+    pmulhrsw                m1, m7                        ;t21a
+    pmulhrsw                m4, m7                        ;t26a
+    mova                    m3, [rsp+gprsize*2+16*13]     ;tmp10
+    mova [rsp+gprsize*2+16*32], m5                        ;out29
+    psubsw                  m7, m3, m1                    ;out21
+    paddsw                  m3, m1                        ;out10
+    mova                    m5, [rsp+gprsize*2+16*8 ]     ;tmp5
+    mova [rsp+gprsize*2+16*24], m7                        ;out21
+    mova [rsp+gprsize*2+16*13], m3                        ;out10
+    psubsw                  m1, m5, m4                    ;out26
+    paddsw                  m5, m4                        ;out5
+    mova                    m7, m6                        ;out7
+    mova                    m3, [rsp+gprsize*2+16*6 ]     ;out3
+    mova                    m4, [rsp+gprsize*2+16*7 ]     ;out4
+    mova [rsp+gprsize*2+16*29], m1                        ;out26
+    mova                    m6, [rsp+gprsize*2+16*9 ]     ;out6
+    mova                    m1, [rsp+gprsize*2+16*4 ]     ;out1
+    ret
+
+
+cglobal inv_txfm_add_dct_dct_32x8, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 ; 32x8 DCT_DCT inverse transform, result added into dst
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    test                  eobd, eobd                      ; eob == 0 -> take the DC-only shortcut below
+    jz .dconly
+    call  m(idct_32x8_internal)                           ; general path; the callee addresses slots 0..34 of the 16*36-byte stack area
+    RET
+; DC-only path: a single value derived from word 0 at coeffq is added to 8 rows of 32 pixels
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]              ; m0 = coefficients scaled by m1; only word 0 survives the broadcast below
+    movd                    m2, [o(pw_8192)]
+    mov               [coeffq], eobd                      ; eobd is 0 on this path, so this clears the first dword of coeff
+    mov                    r3d, 8                         ; row counter consumed by .loop
+; NOTE(review): .body is a separate label, presumably so other block sizes can enter with their own m2/r3d - confirm
+.body:
+    pmulhrsw                m0, m2                        ; second rounding multiply, constant supplied in m2
+    movd                    m2, [o(pw_2048)]  ;intentionally rip-relative
+    pmulhrsw                m0, m1                        ; multiply by the m1 constant once more
+    pmulhrsw                m0, m2                        ; final rounding multiply by pw_2048
+    pshuflw                 m0, m0, q0000                 ; copy word 0 into the low four words...
+    punpcklwd               m0, m0                        ; ...then into all eight words
+    pxor                    m5, m5                        ; zero register for byte -> word unpacking
+; each iteration adds m0 to one 32-pixel row of dst and repacks with unsigned saturation
+.loop:
+    mova                    m1, [dstq+16*0]               ; bytes 0-15 of the row
+    mova                    m3, [dstq+16*1]               ; bytes 16-31 of the row
+    punpckhbw               m2, m1, m5                    ; widen the high 8 bytes to words
+    punpcklbw               m1, m5                        ; widen the low 8 bytes to words
+    punpckhbw               m4, m3, m5
+    punpcklbw               m3, m5
+    paddw                   m2, m0                        ; add the broadcast value to every pixel
+    paddw                   m1, m0
+    paddw                   m4, m0
+    paddw                   m3, m0
+    packuswb                m1, m2                        ; back to bytes, clamped to 0..255
+    packuswb                m3, m4
+    mova           [dstq+16*0], m1
+    mova           [dstq+16*1], m3
+    add                   dstq, strideq                   ; next row
+    dec                    r3d
+    jg .loop                                              ; repeat until the row counter reaches 0
+    RET
+
+
+cglobal idct_32x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ; worker behind inv_txfm_add_dct_dct_32x8, entered with call
+    %undef cmp                                            ; NOTE(review): presumably drops a macro wrapper so the raw cmp below is emitted - confirm
+    LOAD_8ROWS     coeffq+16*0, 64                        ; presumably eight 16-byte units taken every 64 bytes from coeffq
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16                    ; spill to stack slots from 3 up; gprsize presumably skips the return address
+; next input group starts at coeffq+16*2 and uses the same 64-byte step
+    LOAD_8ROWS     coeffq+16*2, 64
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]        ; reload m7 from slot 0 before all eight registers are saved
+    SAVE_8ROWS   rsp+gprsize+16*11, 16                    ; spill to stack slots from 11 up
+; odd inputs in1..in15 (32-byte step) are scattered into stack slots 19..26 in the order listed
+    LOAD_8ROWS     coeffq+16*1, 32
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+; eob <= 106: in17..in31 are never loaded and the main_fast variant is called instead
+    cmp                   eobd, 106
+    jg  .full
+    call m(idct_8x32_internal).main_fast
+    jmp .pass2
+; full path: inputs in17..in31 fill stack slots 27..34 before the complete main routine runs
+.full:
+    LOAD_8ROWS    coeffq+16*17, 32
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+    call m(idct_8x32_internal).main
+; from here on tx2q carries the label to continue at; presumably each helper jumped to ends with jmp tx2q - confirm
+.pass2:
+    mova   [rsp+gprsize+16*0 ], m7                        ; park m7 in slot 0
+    lea                   tx2q, [o(m(idct_32x8_internal).end)]
+    jmp  m(idct_8x32_internal).end
+; first group of eight rows: pass1_end1 is entered with m7 = pw_8192 and continues at .end1
+.end:
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x8_internal).end1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+; r3 remembers the destination of the next group, 8 bytes to the right of the current one
+.end1:
+    lea                     r3, [dstq+8]
+    lea                   tx2q, [o(m(idct_32x8_internal).end2)]
+    jmp   m(idct_8x8_internal).pass2_main
+; second group: rows reloaded from stack slots 11..18
+.end2:
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova   [rsp+gprsize+16*0 ], m7                        ; m7 is parked in slot 0 because the register is needed for the constant
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x8_internal).end3)]
+    jmp   m(idct_8x8_internal).pass1_end1
+; switch dst to the saved position and advance r3 by another 8 bytes
+.end3:
+    mov                   dstq, r3
+    lea                     r3, [r3+8]
+    lea                   tx2q, [o(m(idct_32x8_internal).end4)]
+    jmp   m(idct_8x8_internal).pass2_main
+; third group: rows reloaded from stack slots 19..26
+.end4:
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x8_internal).end5)]
+    jmp   m(idct_8x8_internal).pass1_end1
+; same destination bookkeeping as .end3
+.end5:
+    mov                   dstq, r3
+    lea                     r3, [r3+8]
+    lea                   tx2q, [o(m(idct_32x8_internal).end6)]
+    jmp   m(idct_8x8_internal).pass2_main
+; fourth and last group: rows reloaded from stack slots 27..34
+.end6:
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x8_internal).end7)]
+    jmp   m(idct_8x8_internal).pass1_end1
+; last group: r3 is not advanced any further
+.end7:
+    mov                   dstq, r3
+    lea                   tx2q, [o(m(idct_32x8_internal).end8)]
+    jmp   m(idct_8x8_internal).pass2_main
+; final continuation: a bare ret, also used as a tx2q target by the identity entry points
+.end8:
+    ret
+
+
+cglobal inv_txfm_add_identity_identity_8x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 ; 8x32 identity/identity inverse transform
+    mov                    r5d, 4
+    mov                   tx2d, 2
+    cmp                   eobd, 106
+    cmovg                 tx2d, r5d                       ; 4 iterations when eob > 106, otherwise 2
+    mov                    r3d, tx2d                      ; r3d = loop counter; tx2 and r5 are overwritten again below
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+    lea                   tx2q, [o(m(idct_32x8_internal).end8)]
+; tx2q now points at idct_32x8_internal.end8, which is only a ret; presumably the helpers called below leave through tx2q
+.loop:
+    LOAD_8ROWS     coeffq+16*0, 64
+    paddw                   m6, [o(pw_5)]                 ; bias m6 first, straight from memory...
+    mova            [rsp+16*1], m6                        ; ...and park it so m6 can hold the constant
+    mova                    m6, [o(pw_5)]
+    REPX         {paddw x, m6}, m0, m1, m2, m3, m4, m5, m7
+; NOTE(review): pass1_end3 presumably picks the parked m6 up from stack slot 1 - confirm against idct_8x8_internal
+    call  m(idct_8x8_internal).pass1_end3
+    REPX         {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+; with the pw_5 bias added earlier, the shift above yields (x + 5) >> 3 for every coefficient
+    mova            [rsp+16*2], m5                        ; m5..m7 go to stack slots 2, 1, 0 before end3 is called
+    mova            [rsp+16*1], m6
+    mova            [rsp+16*0], m7
+    call  m(idct_8x8_internal).end3
+    lea                   dstq, [dstq+strideq*2]          ; add 2*stride on top of whatever end3 left in dstq
+; clear the eight 16-byte chunks at coeffq+64*x (the same offsets passed to LOAD_8ROWS above)
+    pxor                    m7, m7
+    REPX   {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+; advance to the next 16-byte column of chunks
+    add                 coeffq, 16
+    dec                    r3d
+    jg .loop                                              ; repeat until the counter reaches 0
+    RET
+
+cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 ; 32x8 identity/identity inverse transform
+    mov                    r5d, 4
+    mov                   tx2d, 2
+    cmp                   eobd, 106
+    cmovg                 tx2d, r5d                       ; 4 iterations when eob > 106, otherwise 2
+    mov                    r3d, tx2d                      ; r3d = loop counter; tx2 and r5 are overwritten again below
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+; each iteration consumes 16*8 bytes of coefficients and moves dst 8 bytes to the right
+.loop:
+    LOAD_8ROWS     coeffq+16*0, 16
+    pmulhrsw                m6, [o(pw_4096)]              ; scale m6 first, straight from memory...
+    mova            [rsp+16*1], m6                        ; ...and park it so m6 can hold the constant
+    mova                    m6, [o(pw_4096)]
+    REPX      {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+    lea                   tx2q, [o(m(idct_32x8_internal).end8)]  ; .end8 is a bare ret
+    call  m(idct_8x8_internal).pass1_end3
+; dstq is kept in stack slot 3 across the end3 call and restored afterwards
+    mov             [rsp+16*3], dstq
+    mova            [rsp+16*2], m5                        ; m5..m7 go to stack slots 2, 1, 0 before end3 is called
+    mova            [rsp+16*1], m6
+    mova            [rsp+16*0], m7
+    lea                   tx2q, [o(m(idct_8x8_internal).end4)]
+    call  m(idct_8x8_internal).end3
+; advance to the next block of coefficients and the next 8 destination bytes
+    add                 coeffq, 16*8
+    mov                   dstq, [rsp+16*3]
+    lea                   dstq, [dstq+8]
+    dec                    r3d
+    jg .loop                                              ; the only loop exit test: stop once the counter reaches 0
+    ; no carry-based branch here: dec leaves CF alone, so CF is still the (always clear) carry of the add above
+    RET