shithub: dav1d

Download patch

ref: bd12b1ec70308dc87c7e53bd6c3e0f32c7c53c3c
parent: 72f8cc62ee7b9784b8f96e63474fb4d077fee1ad
author: Liwei Wang <[email protected]>
date: Mon Mar 25 10:59:58 EDT 2019

Add SSSE3 implementation for the 16x32, 32x16 and 32x32 blocks in itx

Cycle times:
inv_txfm_add_16x32_dct_dct_0_8bpc_c: 2464.6
inv_txfm_add_16x32_dct_dct_0_8bpc_ssse3: 121.6
inv_txfm_add_16x32_dct_dct_1_8bpc_c: 24751.6
inv_txfm_add_16x32_dct_dct_1_8bpc_ssse3: 1101.9
inv_txfm_add_16x32_dct_dct_2_8bpc_c: 24377.0
inv_txfm_add_16x32_dct_dct_2_8bpc_ssse3: 1117.2
inv_txfm_add_16x32_dct_dct_3_8bpc_c: 24155.6
inv_txfm_add_16x32_dct_dct_3_8bpc_ssse3: 2349.3
inv_txfm_add_16x32_dct_dct_4_8bpc_c: 24175.6
inv_txfm_add_16x32_dct_dct_4_8bpc_ssse3: 1642.0
inv_txfm_add_16x32_identity_identity_0_8bpc_c: 10304.7
inv_txfm_add_16x32_identity_identity_0_8bpc_ssse3: 137.7
inv_txfm_add_16x32_identity_identity_1_8bpc_c: 10341.6
inv_txfm_add_16x32_identity_identity_1_8bpc_ssse3: 137.9
inv_txfm_add_16x32_identity_identity_2_8bpc_c: 10299.9
inv_txfm_add_16x32_identity_identity_2_8bpc_ssse3: 253.9
inv_txfm_add_16x32_identity_identity_3_8bpc_c: 10331.4
inv_txfm_add_16x32_identity_identity_3_8bpc_ssse3: 369.7
inv_txfm_add_16x32_identity_identity_4_8bpc_c: 10360.4
inv_txfm_add_16x32_identity_identity_4_8bpc_ssse3: 484.0
inv_txfm_add_32x16_dct_dct_0_8bpc_c: 2288.4
inv_txfm_add_32x16_dct_dct_0_8bpc_ssse3: 142.3
inv_txfm_add_32x16_dct_dct_1_8bpc_c: 23819.9
inv_txfm_add_32x16_dct_dct_1_8bpc_ssse3: 1740.1
inv_txfm_add_32x16_dct_dct_2_8bpc_c: 23755.8
inv_txfm_add_32x16_dct_dct_2_8bpc_ssse3: 1641.4
inv_txfm_add_32x16_dct_dct_3_8bpc_c: 23839.9
inv_txfm_add_32x16_dct_dct_3_8bpc_ssse3: 1559.0
inv_txfm_add_32x16_dct_dct_4_8bpc_c: 23757.7
inv_txfm_add_32x16_dct_dct_4_8bpc_ssse3: 1579.0
inv_txfm_add_32x16_identity_identity_0_8bpc_c: 10381.7
inv_txfm_add_32x16_identity_identity_0_8bpc_ssse3: 126.3
inv_txfm_add_32x16_identity_identity_1_8bpc_c: 10402.5
inv_txfm_add_32x16_identity_identity_1_8bpc_ssse3: 126.5
inv_txfm_add_32x16_identity_identity_2_8bpc_c: 10429.2
inv_txfm_add_32x16_identity_identity_2_8bpc_ssse3: 244.9
inv_txfm_add_32x16_identity_identity_3_8bpc_c: 10382.0
inv_txfm_add_32x16_identity_identity_3_8bpc_ssse3: 491.0
inv_txfm_add_32x16_identity_identity_4_8bpc_c: 10381.0
inv_txfm_add_32x16_identity_identity_4_8bpc_ssse3: 468.0
inv_txfm_add_32x32_dct_dct_0_8bpc_c: 4168.2
inv_txfm_add_32x32_dct_dct_0_8bpc_ssse3: 204.0
inv_txfm_add_32x32_dct_dct_1_8bpc_c: 46306.2
inv_txfm_add_32x32_dct_dct_1_8bpc_ssse3: 2216.0
inv_txfm_add_32x32_dct_dct_2_8bpc_c: 46300.2
inv_txfm_add_32x32_dct_dct_2_8bpc_ssse3: 2194.2
inv_txfm_add_32x32_dct_dct_3_8bpc_c: 46350.1
inv_txfm_add_32x32_dct_dct_3_8bpc_ssse3: 3484.4
inv_txfm_add_32x32_dct_dct_4_8bpc_c: 46318.1
inv_txfm_add_32x32_dct_dct_4_8bpc_ssse3: 3440.9
inv_txfm_add_32x32_identity_identity_0_8bpc_c: 14663.1
inv_txfm_add_32x32_identity_identity_0_8bpc_ssse3: 179.0
inv_txfm_add_32x32_identity_identity_1_8bpc_c: 14737.0
inv_txfm_add_32x32_identity_identity_1_8bpc_ssse3: 179.2
inv_txfm_add_32x32_identity_identity_2_8bpc_c: 14640.4
inv_txfm_add_32x32_identity_identity_2_8bpc_ssse3: 179.1
inv_txfm_add_32x32_identity_identity_3_8bpc_c: 14638.5
inv_txfm_add_32x32_identity_identity_3_8bpc_ssse3: 663.8
inv_txfm_add_32x32_identity_identity_4_8bpc_c: 14635.6
inv_txfm_add_32x32_identity_identity_4_8bpc_ssse3: 663.9

--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -88,6 +88,9 @@
 decl_itx12_fns(16, 16, ssse3);
 decl_itx2_fns ( 8, 32, ssse3);
 decl_itx2_fns (32,  8, ssse3);
+decl_itx2_fns (16, 32, ssse3);
+decl_itx2_fns (32, 16, ssse3);
+decl_itx2_fns (32, 32, ssse3);
 
 void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
 #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@@ -142,6 +145,9 @@
     assign_itx12_fn(,  16, 16, ssse3);
     assign_itx2_fn (R,  8, 32, ssse3);
     assign_itx2_fn (R, 32,  8, ssse3);
+    assign_itx2_fn (R, 16, 32, ssse3);
+    assign_itx2_fn (R, 32, 16, ssse3);
+    assign_itx2_fn (,  32, 32, ssse3);
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -3750,10 +3750,13 @@
     call .main
 
 .pass2:
-    mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_8x32_internal).end1)]
+    lea                     r3, [o(m(idct_8x32_internal).end6)]
 
 .end:
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_8x32_internal).end2)]
+
+.end1:
     pxor                    m7, m7
     REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \
                                      8,  9,  10, 11, 12, 13, 14, 15, \
@@ -3762,19 +3765,12 @@
 
     jmp                   tx2q
 
-.end1:
-    lea                   tx2q, [o(m(idct_8x32_internal).end2)]
-    jmp   m(idct_8x8_internal).end
-
 .end2:
-    LOAD_8ROWS   rsp+gprsize+16*11, 16
-    mova   [rsp+gprsize+16*0 ], m7
-    lea                   dstq, [dstq+strideq*2]
     lea                   tx2q, [o(m(idct_8x32_internal).end3)]
     jmp   m(idct_8x8_internal).end
 
 .end3:
-    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
     lea                   dstq, [dstq+strideq*2]
     lea                   tx2q, [o(m(idct_8x32_internal).end4)]
@@ -3781,7 +3777,7 @@
     jmp   m(idct_8x8_internal).end
 
 .end4:
-    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
     lea                   dstq, [dstq+strideq*2]
     lea                   tx2q, [o(m(idct_8x32_internal).end5)]
@@ -3788,6 +3784,13 @@
     jmp   m(idct_8x8_internal).end
 
 .end5:
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   dstq, [dstq+strideq*2]
+    mov                   tx2q, r3
+    jmp   m(idct_8x8_internal).end
+
+.end6:
     ret
 
 ALIGN function_align
@@ -4177,7 +4180,7 @@
 .pass2:
     mova   [rsp+gprsize+16*0 ], m7
     lea                   tx2q, [o(m(idct_32x8_internal).end)]
-    jmp  m(idct_8x32_internal).end
+    jmp  m(idct_8x32_internal).end1
 
 .end:
     mova                    m7, [o(pw_8192)]
@@ -4298,4 +4301,809 @@
     dec                    r3d
     jg .loop
     jnc .loop
+    RET
+
+
+cglobal inv_txfm_add_dct_dct_16x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2  ; 16x32 dct_dct entry: eob == 0 -> .dconly, else idct_16x32_internal
+%if ARCH_X86_32
+    LEA                     r5, $$  ; x86-32 only: r5 = section start; presumably the base used by o() - TODO confirm
+%endif
+    test                  eobd, eobd
+    jz .dconly  ; eob == 0: take the .dconly path below
+    call  m(idct_16x32_internal)
+    RET
+
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]  ; first multiply of the coefficients at coeffq by pw_2896x8
+    movd                    m2, [o(pw_16384)]
+    mov               [coeffq], eobd  ; eobd is 0 on this path (jz above), so this zeroes 4 bytes at coeffq
+    pmulhrsw                m0, m1  ; second multiply by pw_2896x8 (the 32x32 dconly path has only one)
+    mov                    r2d, 16  ; r2 is the coeff argument register; reused here, presumably as a count for the shared code - TODO confirm
+    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x32).end)]  ; continuation handed to the shared 16x4 dconly code
+    jmp m(inv_txfm_add_dct_dct_16x4).dconly
+
+.end:
+    RET
+
+cglobal idct_16x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2  ; worker for inv_txfm_add_dct_dct_16x32: pass 1, then pass 2 once for dst and once for dst+8
+    %undef cmp
+
+    LOAD_8ROWS     coeffq+16*1, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*5, 128, 1
+    call m(idct_16x8_internal).main
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end)]  ; tx2q = label to continue at after the shared pass1_end code
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end:
+    SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end1:
+    mova        [coeffq+16*1 ], m0                        ;in8
+    mova        [coeffq+16*5 ], m4                        ;in12
+    mova   [rsp+gprsize+16*13], m2                        ;in10
+    mova   [rsp+gprsize+16*14], m6                        ;in14
+    mova   [rsp+gprsize+16*21], m1                        ;in9
+    mova   [rsp+gprsize+16*24], m3                        ;in11
+    mova   [rsp+gprsize+16*25], m5                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+    LOAD_8ROWS     coeffq+16*0, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*4, 128, 1
+    call m(idct_16x8_internal).main
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end3:
+    mova   [rsp+gprsize+16*11], m2                        ;in2
+    mova   [rsp+gprsize+16*12], m6                        ;in6
+    mova   [rsp+gprsize+16*19], m1                        ;in1
+    mova   [rsp+gprsize+16*26], m3                        ;in3
+    mova   [rsp+gprsize+16*23], m5                        ;in5
+    mova   [rsp+gprsize+16*22], m7                        ;in7
+
+    cmp                eobd, 150  ; eob > 150 -> .full; otherwise the reduced path below
+    jg .full
+
+    mova                    m1, m4                        ;in4
+    mova                    m2, [coeffq+16*1 ]            ;in8
+    mova                    m3, [coeffq+16*5 ]            ;in12
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7  ; m4..m7 = 0: the remaining four inputs are zero on this path
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    mova                    m0, [rsp+gprsize+16*11]       ;in2
+    mova                    m1, [rsp+gprsize+16*12]       ;in6
+    mova                    m2, [rsp+gprsize+16*13]       ;in10
+    mova                    m3, [rsp+gprsize+16*14]       ;in14
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7  ; m4..m7 = 0 again for the second helper call
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main_fast  ; reduced path uses main_fast; .full uses main
+    jmp  .pass2
+
+.full:
+    mova        [coeffq+16*0 ], m0                        ;in0
+    mova        [coeffq+16*4 ], m4                        ;in4
+
+    LOAD_8ROWS     coeffq+16*2, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*6, 128, 1
+    call m(idct_16x8_internal).main
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end4:
+    SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end5)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end5:
+    mova        [coeffq+16*2 ], m0                        ;in16
+    mova        [coeffq+16*6 ], m4                        ;in20
+    mova   [rsp+gprsize+16*15], m2                        ;in18
+    mova   [rsp+gprsize+16*16], m6                        ;in22
+    mova   [rsp+gprsize+16*33], m1                        ;in17
+    mova   [rsp+gprsize+16*28], m3                        ;in19
+    mova   [rsp+gprsize+16*29], m5                        ;in21
+    mova   [rsp+gprsize+16*32], m7                        ;in23
+
+    LOAD_8ROWS     coeffq+16*3, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+16*7, 128, 1
+    call m(idct_16x8_internal).main
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end6)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end6:
+    SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31
+    LOAD_8ROWS    rsp+gprsize+16*3, 16
+    mova    [rsp+gprsize+16*0], m7
+    lea                   tx2q, [o(m(idct_16x32_internal).pass1_end7)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end7:
+    mova   [rsp+gprsize+16*17], m2                        ;in26
+    mova   [rsp+gprsize+16*18], m6                        ;in30
+    mova   [rsp+gprsize+16*31], m1                        ;in25
+    mova   [rsp+gprsize+16*30], m3                        ;in27
+    mova   [rsp+gprsize+16*27], m5                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+
+    mova                    m6, m0                        ;in24
+    mova                    m7, m4                        ;in28
+    mova                    m0, [coeffq+16*0 ]            ;in0
+    mova                    m1, [coeffq+16*4 ]            ;in4
+    mova                    m2, [coeffq+16*1 ]            ;in8
+    mova                    m3, [coeffq+16*5 ]            ;in12
+    mova                    m4, [coeffq+16*2 ]            ;in16
+    mova                    m5, [coeffq+16*6 ]            ;in20
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS   rsp+gprsize+16*3 , 16
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main
+
+.pass2:
+    mov  [rsp+gprsize*1+16*35], eobd  ; spill eob; reloaded at .end
+    lea                     r3, [dstq+8]
+    mov  [rsp+gprsize*2+16*35], r3  ; spill dst+8; becomes dstq at .end
+    lea                     r3, [o(m(idct_16x32_internal).end)]  ; r3 = final continuation: idct_8x32_internal.end5 does mov tx2q, r3
+    jmp  m(idct_8x32_internal).end
+
+.end:
+    mov                   dstq, [rsp+gprsize*2+16*35]  ; dstq = original dst + 8
+    mov                   eobd, [rsp+gprsize*1+16*35]
+    add                 coeffq, 16*32  ; move to the region written by the SAVE_8ROWS coeffq+16*32..35 calls in pass 1
+
+    mova                    m0, [coeffq+16*4 ]            ;in1
+    mova                    m1, [coeffq+16*12]            ;in3
+    mova                    m2, [coeffq+16*20]            ;in5
+    mova                    m3, [coeffq+16*28]            ;in7
+    mova                    m4, [coeffq+16*5 ]            ;in9
+    mova                    m5, [coeffq+16*13]            ;in11
+    mova                    m6, [coeffq+16*21]            ;in13
+    mova                    m7, [coeffq+16*29]            ;in15
+
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    mova                    m0, [coeffq+16*0 ]            ;in0
+    mova                    m1, [coeffq+16*16]            ;in4
+    mova                    m2, [coeffq+16*1 ]            ;in8
+    mova                    m3, [coeffq+16*17]            ;in12
+
+    cmp                   eobd, 150  ; same eob threshold as in pass 1
+    jg .full1
+
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7  ; m4..m7 = 0 on the reduced path
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    mova                    m0, [coeffq+16*8 ]            ;in2
+    mova                    m1, [coeffq+16*24]            ;in6
+    mova                    m2, [coeffq+16*9 ]            ;in10
+    mova                    m3, [coeffq+16*25]            ;in14
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main_fast
+    jmp  .end1
+
+.full1:
+    mova                    m4, [coeffq+16*2 ]            ;in16
+    mova                    m5, [coeffq+16*18]            ;in20
+    mova                    m6, [coeffq+16*3 ]            ;in24
+    mova                    m7, [coeffq+16*19]            ;in28
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    mova                    m0, [coeffq+16*8 ]            ;in2
+    mova                    m1, [coeffq+16*24]            ;in6
+    mova                    m2, [coeffq+16*9 ]            ;in10
+    mova                    m3, [coeffq+16*25]            ;in14
+    mova                    m4, [coeffq+16*10]            ;in18
+    mova                    m5, [coeffq+16*26]            ;in22
+    mova                    m6, [coeffq+16*11]            ;in26
+    mova                    m7, [coeffq+16*27]            ;in30
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    mova                    m0, [coeffq+16*6 ]            ;in17
+    mova                    m1, [coeffq+16*14]            ;in19
+    mova                    m2, [coeffq+16*22]            ;in21
+    mova                    m3, [coeffq+16*30]            ;in23
+    mova                    m4, [coeffq+16*7 ]            ;in25
+    mova                    m5, [coeffq+16*15]            ;in27
+    mova                    m6, [coeffq+16*23]            ;in29
+    mova                    m7, [coeffq+16*31]            ;in31
+
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+
+    call m(idct_8x32_internal).main
+
+.end1:
+    jmp m(idct_8x32_internal).pass2  ; tail jump: idct_8x32_internal.pass2 sets r3 to its own .end6, which is a plain ret
+
+
+
+cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2  ; 32x16 dct_dct entry: eob == 0 -> .dconly, else four rounds of idct_8x16_internal.pass2
+%if ARCH_X86_32
+    LEA                     r5, $$  ; x86-32 only: r5 = section start; presumably the base used by o() - TODO confirm
+%endif
+    test                  eobd, eobd
+    jz .dconly  ; eob == 0: take the .dconly path below
+
+    call m(idct_32x16_internal)
+    call m(idct_8x16_internal).pass2  ; round 1 of 4, on the registers left by idct_32x16_internal
+
+    add                 coeffq, 16*16
+    lea                   dstq, [r3+8]  ; NOTE(review): relies on r3 holding the previous round's dst after .pass2 - confirm
+    LOAD_8ROWS       rsp+16*11, 16  ; same slots the callee addresses as rsp+gprsize+16*11 (its call pushed a return address)
+    mova            [rsp+16*0], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]  ; that label is a bare ret; presumably pass1_end leaves via jmp tx2q, returning here
+    call  m(idct_8x8_internal).pass1_end
+    call m(idct_8x16_internal).pass2  ; round 2 of 4
+
+    add                 coeffq, 16*16
+    lea                   dstq, [r3+8]
+    LOAD_8ROWS       rsp+16*19, 16
+    mova            [rsp+16*0], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]
+    call  m(idct_8x8_internal).pass1_end
+    call m(idct_8x16_internal).pass2  ; round 3 of 4
+
+    add                 coeffq, 16*16
+    lea                   dstq, [r3+8]
+    LOAD_8ROWS       rsp+16*27, 16
+    mova            [rsp+16*0], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]
+    call  m(idct_8x8_internal).pass1_end
+    call m(idct_8x16_internal).pass2  ; round 4 of 4
+    RET
+
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]  ; first multiply of the coefficients at coeffq by pw_2896x8
+    movd                    m2, [o(pw_16384)]
+    mov               [coeffq], eobd  ; eobd is 0 on this path (jz above), so this zeroes 4 bytes at coeffq
+    pmulhrsw                m0, m1  ; second multiply by pw_2896x8 (the 32x32 dconly path has only one)
+    mov                    r3d, 16  ; presumably a count consumed by the shared 32x8 .body code - TODO confirm
+    jmp m(inv_txfm_add_dct_dct_32x8).body
+
+
+cglobal idct_32x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2  ; runs .pass1 twice: first on coeffq+16, then on coeffq+0
+    %undef cmp
+
+    add                 coeffq, 16  ; first iteration works at coeffq+16
+    lea                     r3, [o(m(idct_32x16_internal).pass1_end1)]  ; r3 = where .pass1_end continues on the first iteration
+.pass1:
+    LOAD_8ROWS     coeffq+16*0, 128, 1
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    LOAD_8ROWS     coeffq+16*4, 128, 1
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    LOAD_8ROWS     coeffq+16*2, 64, 1
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    LOAD_8ROWS    coeffq+16*34, 64, 1
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+    call m(idct_8x32_internal).main
+
+.pass1_end:
+    mova   [rsp+gprsize+16*0 ], m7
+    mov                   tx2q, r3  ; .pass1_end1 on the first iteration, .end (ret) on the second
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+16*0, 32  ; first iteration only: results are stored back into the coefficient buffer
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end2:
+    SAVE_8ROWS    coeffq+16*16, 32
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end3:
+    SAVE_8ROWS    coeffq+16*32, 32
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova   [rsp+gprsize+16*0 ], m7
+    lea                   tx2q, [o(m(idct_32x16_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end
+
+.pass1_end4:
+    SAVE_8ROWS    coeffq+16*48, 32
+
+    sub                 coeffq, 16  ; back to coeffq+0 for the second iteration
+    lea                     r3, [o(m(idct_32x16_internal).end)]  ; second iteration leaves through .end after its first pass1_end
+    jmp .pass1
+
+.end:
+    ret  ; also used by other functions as a tx2q target that simply returns
+
+
+cglobal inv_txfm_add_identity_identity_16x32, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2  ; 16x32 identity_identity: same eob-derived loop count for dst and for dst+8
+    %undef cmp
+
+    mov                     r4, 1                 ;r4 = iteration_count, 1..4 depending on eob
+    mov                     r5, 2
+    cmp                   eobd, 43                ;if (eob > 43)
+    cmovg                   r4, r5                ;  iteration_count++
+    inc                     r5
+    cmp                   eobd, 150               ;if (eob > 150)
+    cmovg                   r4, r5                ;  iteration_count++
+    inc                     r5
+    cmp                   eobd, 278               ;if (eob > 278)
+    cmovg                   r4, r5                ;  iteration_count++
+
+%if ARCH_X86_32
+    LEA                     r5, $$  ; x86-32 only: r5 = section start; presumably the base used by o() - TODO confirm
+%endif
+    lea                     r3, [dstq+8]
+    mov             [rsp+16*3], r3  ; saved dst+8, loaded into dstq for the second half
+    mov                     r3, r4  ; r3 = live loop counter
+    mov     [rsp+gprsize+16*3], r4  ; saved copy of the count, reloaded for the second half
+    mov   [rsp+gprsize*2+16*3], coeffq  ; saved coeff base; NOTE(review): if gprsize is 8 this slot starts at offset 16*4, the declared stack size - confirm the frame has room
+
+.loop:
+    LOAD_8ROWS          coeffq, 64, 1
+    REPX      {psllw    x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+    mova            [rsp+16*1], m6
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]  ; that label is a bare ret; presumably pass1_end3 leaves via jmp tx2q, returning here
+    call  m(idct_8x8_internal).pass1_end3
+    pmulhrsw                m7, [o(pw_5793x4)]  ; m7 is processed first and spilled because it is reused as the constant register below
+    paddw                   m7, [o(pw_5)]
+    psraw                   m7, 3
+    mova            [rsp+16*0], m7
+    mova                    m7, [o(pw_5793x4)]
+    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    mova                    m7, [o(pw_5)]
+    REPX      {paddw    x, m7}, m0, m1, m2, m3, m4, m5, m6
+    REPX      {psraw    x, 3 }, m0, m1, m2, m3, m4, m5, m6
+    mova            [rsp+16*2], m5
+    mova            [rsp+16*1], m6
+    call  m(idct_8x8_internal).end3
+    lea                   dstq, [dstq+strideq*2]
+
+    pxor                    m7, m7
+    REPX   {mova [coeffq+64*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7  ; zero the eight 16-byte rows loaded at the top of the loop
+
+    add                 coeffq, 16
+    dec                     r3
+    jg .loop
+
+    mov                 coeffq, [rsp+gprsize*2+16*3]
+    add                 coeffq, 64*8  ; second half starts 64*8 bytes after the saved coeff base
+    mov                     r3, [rsp+gprsize+16*3]  ; reload the count (already 0 on the second time through)
+    xor                   dstq, dstq
+    mov     [rsp+gprsize+16*3], dstq  ; zero the saved count so the second time through the test below exits
+    mov                   dstq, [rsp+16*3]  ; dstq = original dst + 8
+    test                    r3, r3
+    jnz .loop
+
+    RET
+
+
+cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2  ; 32x16 identity_identity: r3 bitmask is shifted right by 2 per iteration (2, 4 or 8 iterations)
+    %undef cmp
+
+    mov                     r4, 12                ;1100b
+    mov                     r5, 136               ;1000 1000b
+    cmp                   eobd, 43                ;if (eob > 43)
+    cmovg                   r4, r5                ;  iteration_count+2
+    mov                     r5, 34952             ;1000 1000 1000 1000b
+    cmp                   eobd, 150               ;if (eob > 150)
+    cmovg                   r4, r5                ;  iteration_count += 4
+
+%if ARCH_X86_32
+    LEA                     r5, $$  ; x86-32 only: r5 = section start; presumably the base used by o() - TODO confirm
+%endif
+    lea                     r3, [dstq+8]
+    mov             [rsp+16*3], r3  ; saved dst for the next column (dst+8)
+    mov                     r3, r4  ; r3 = loop-control bitmask
+
+.loop:
+    LOAD_8ROWS          coeffq, 32, 1
+    REPX         {psllw  x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+    mova            [rsp+16*1], m6
+    lea                   tx2q, [o(m(idct_32x16_internal).end)]  ; that label is a bare ret; presumably pass1_end3 leaves via jmp tx2q, returning here
+    call  m(idct_8x8_internal).pass1_end3
+    pmulhrsw                m7, [o(pw_5793x4)]  ; m7 is processed first and spilled because it is reused as the constant register below
+    pmulhrsw                m7, [o(pw_2048)]
+    mova            [rsp+16*0], m7
+    mova                    m7, [o(pw_5793x4)]
+    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    mova                    m7, [o(pw_2048)]
+    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    mova            [rsp+16*2], m5
+    mova            [rsp+16*1], m6
+    call  m(idct_8x8_internal).end3
+    lea                   dstq, [dstq+strideq*2]
+
+    pxor                    m7, m7
+    REPX   {mova [coeffq+32*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7  ; zero the eight 16-byte rows loaded at the top of the loop
+
+.loop_end:
+    add                 coeffq, 16
+    shr                     r3, 2  ; consume two mask bits per iteration
+    test                    r3, r3
+    jz .ret  ; mask exhausted: done
+    test                    r3, 2
+    jnz .loop  ; bit 1 set: continue in the same column (dstq already advanced)
+    mov                     r4, r3
+    and                     r4, 1
+    shl                     r4, 3
+    add                 coeffq, r4  ; (r3 & 1) << 3; evaluates to 0 for all three masks loaded above
+    add                 coeffq, 32*7
+    mov                   dstq, [rsp+16*3]  ; switch to the saved next-column dst
+    lea                     r4, [dstq+8]
+    mov             [rsp+16*3], r4  ; and save the column after that
+    jmp .loop
+
+.ret:
+    RET
+
+
+cglobal inv_txfm_add_dct_dct_32x32, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2  ; 32x32 dct_dct entry: eob == 0 -> .dconly, else idct_32x32_internal
+%if ARCH_X86_32
+    LEA                     r5, $$  ; x86-32 only: r5 = section start; presumably the base used by o() - TODO confirm
+%endif
+    test                  eobd, eobd
+    jz .dconly  ; eob == 0: take the .dconly path below
+
+    call m(idct_32x32_internal)
+    RET
+
+.dconly:
+    movd                    m1, [o(pw_2896x8)]
+    pmulhrsw                m0, m1, [coeffq]  ; single multiply by pw_2896x8 (the 16x32/32x16 dconly paths apply it twice)
+    movd                    m2, [o(pw_8192)]  ; pw_8192 here, where the 16x32/32x16 dconly paths load pw_16384
+    mov               [coeffq], eobd  ; eobd is 0 on this path (jz above), so this zeroes 4 bytes at coeffq
+    mov                    r3d, 32  ; presumably a count consumed by the shared 32x8 .body code - TODO confirm
+    jmp m(inv_txfm_add_dct_dct_32x8).body
+
+
+cglobal idct_32x32_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    %undef cmp
+
+    mov                     r5, 4
+    mov                     r4, 2
+    sub                   eobd, 136
+    cmovge                  r4, r5
+
+%if ARCH_X86_32
+    LEA                     r5, $$
+%endif
+
+    mov  [rsp+gprsize*1+16*35], eobd
+    mov                     r3, r4
+    mov  [rsp+gprsize*2+16*35], coeffq
+
+.pass1_loop:
+    LOAD_8ROWS     coeffq+64*1, 64*2
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    mov                   tx2d, [rsp+gprsize*1+16*35]
+    test                  tx2d, tx2d
+    jl .fast
+
+.full:
+    LOAD_8ROWS     coeffq+64*0, 64*4
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_8ROWS     coeffq+64*2, 64*4
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    LOAD_8ROWS    coeffq+64*17, 64*2
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+
+    call m(idct_8x32_internal).main
+    jmp .pass1_end
+
+.fast:
+    mova                    m0, [coeffq+256*0]
+    mova                    m1, [coeffq+256*1]
+    mova                    m2, [coeffq+256*2]
+    mova                    m3, [coeffq+256*3]
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    mova                    m0, [coeffq+128*1]
+    mova                    m1, [coeffq+128*3]
+    mova                    m2, [coeffq+128*5]
+    mova                    m3, [coeffq+128*7]
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main_fast
+
+.pass1_end:
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end1)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end1:
+    SAVE_8ROWS     coeffq+64*0, 64
+    LOAD_8ROWS   rsp+gprsize+16*11, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end2)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end2:
+    SAVE_8ROWS     coeffq+64*8, 64
+    LOAD_8ROWS   rsp+gprsize+16*19, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end3)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end3:
+    SAVE_8ROWS    coeffq+64*16, 64
+    LOAD_8ROWS   rsp+gprsize+16*27, 16
+    mova    [rsp+gprsize+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    lea                   tx2q, [o(m(idct_32x32_internal).pass1_end4)]
+    jmp   m(idct_8x8_internal).pass1_end1
+
+.pass1_end4:
+    SAVE_8ROWS    coeffq+64*24, 64
+
+    add                 coeffq, 16
+    dec                     r3
+    jg .pass1_loop
+
+
+.pass2:
+    mov                 coeffq, [rsp+gprsize*2+16*35]
+    mov                     r3, 4
+
+.pass2_loop:
+    lea                     r4, [dstq+8]
+    mov  [rsp+gprsize*2+16*35], r4
+
+    mova                    m0, [coeffq+16*4 ]
+    mova                    m1, [coeffq+16*12]
+    mova                    m2, [coeffq+16*20]
+    mova                    m3, [coeffq+16*28]
+    mova                    m4, [coeffq+16*5 ]
+    mova                    m5, [coeffq+16*13]
+    mova                    m6, [coeffq+16*21]
+    mova                    m7, [coeffq+16*29]
+    mova   [rsp+gprsize+16*19], m0                        ;in1
+    mova   [rsp+gprsize+16*26], m1                        ;in3
+    mova   [rsp+gprsize+16*23], m2                        ;in5
+    mova   [rsp+gprsize+16*22], m3                        ;in7
+    mova   [rsp+gprsize+16*21], m4                        ;in9
+    mova   [rsp+gprsize+16*24], m5                        ;in11
+    mova   [rsp+gprsize+16*25], m6                        ;in13
+    mova   [rsp+gprsize+16*20], m7                        ;in15
+
+    mov                   tx2d, [rsp+gprsize*1+16*35]
+    test                  tx2d, tx2d
+    jl .fast1
+
+.full1:
+    mova                    m0, [coeffq+16*0 ]
+    mova                    m1, [coeffq+16*16]
+    mova                    m2, [coeffq+16*1 ]
+    mova                    m3, [coeffq+16*17]
+    mova                    m4, [coeffq+16*2 ]
+    mova                    m5, [coeffq+16*18]
+    mova                    m6, [coeffq+16*3 ]
+    mova                    m7, [coeffq+16*19]
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    mova                    m0, [coeffq+16*8 ]
+    mova                    m1, [coeffq+16*24]
+    mova                    m2, [coeffq+16*9 ]
+    mova                    m3, [coeffq+16*25]
+    mova                    m4, [coeffq+16*10]
+    mova                    m5, [coeffq+16*26]
+    mova                    m6, [coeffq+16*11]
+    mova                    m7, [coeffq+16*27]
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    mova                    m0, [coeffq+16*6 ]
+    mova                    m1, [coeffq+16*14]
+    mova                    m2, [coeffq+16*22]
+    mova                    m3, [coeffq+16*30]
+    mova                    m4, [coeffq+16*7 ]
+    mova                    m5, [coeffq+16*15]
+    mova                    m6, [coeffq+16*23]
+    mova                    m7, [coeffq+16*31]
+    mova   [rsp+gprsize+16*33], m0                        ;in17
+    mova   [rsp+gprsize+16*28], m1                        ;in19
+    mova   [rsp+gprsize+16*29], m2                        ;in21
+    mova   [rsp+gprsize+16*32], m3                        ;in23
+    mova   [rsp+gprsize+16*31], m4                        ;in25
+    mova   [rsp+gprsize+16*30], m5                        ;in27
+    mova   [rsp+gprsize+16*27], m6                        ;in29
+    mova   [rsp+gprsize+16*34], m7                        ;in31
+
+    call m(idct_8x32_internal).main
+    jmp .pass2_end
+
+.fast1:
+    mova                    m0, [coeffq+16*0 ]
+    mova                    m1, [coeffq+16*16]
+    mova                    m2, [coeffq+16*1 ]
+    mova                    m3, [coeffq+16*17]
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+
+    mova                    m0, [coeffq+16*8 ]
+    mova                    m1, [coeffq+16*24]
+    mova                    m2, [coeffq+16*9 ]
+    mova                    m3, [coeffq+16*25]
+    pxor                    m4, m4
+    REPX          {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal).main
+    mova                    m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS   rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal).main_fast
+
+.pass2_end:
+    mov  [rsp+gprsize*3+16*35], r3
+    lea                     r3, [o(m(idct_32x32_internal).pass2_end1)]
+    jmp  m(idct_8x32_internal).end
+
+.pass2_end1:
+    add                 coeffq, 16*32
+    mov                   dstq, [rsp+gprsize*2+16*35]
+    mov                     r3, [rsp+gprsize*3+16*35]
+    dec                     r3
+    jg .pass2_loop
+
+    RET
+
+
+cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
+    %undef cmp
+    ;Walks the coefficients in tiles of 8 rows x 16 bytes: r4 tiles per pass, r4 passes (r4 = 2 or 4)
+    mov                     r4, 2
+    mov                     r5, 4
+    cmp                   eobd, 136
+    cmovge                  r4, r5                        ;r4 = (eob >= 136) ? 4 : 2
+
+%if ARCH_X86_32
+    LEA                     r5, $$                        ;presumably the base used by o() - confirm
+%endif
+    ;Stack: [rsp+16*0..2] xmm scratch, [rsp+16*3+gprsize*0..3] GPR slots (48+4*8 = 80 bytes on x86-64)
+    lea                     r3, [dstq+8]                  ;eob no longer needed, r3 is free
+    mov   [rsp+gprsize*0+16*3], r3                        ;slot 0: dst of the next pass (dst+8)
+    mov   [rsp+gprsize*1+16*3], r4                        ;slot 1: tiles per pass (reload value)
+    mov   [rsp+gprsize*2+16*3], r4                        ;slot 2: passes remaining
+    mov   [rsp+gprsize*3+16*3], coeffq                    ;slot 3: coeff base of this pass (needs 16*5)
+    mov                     r3, r4                        ;r3 = tile counter for the first pass
+
+.loop:
+    LOAD_8ROWS          coeffq, 64
+    mova            [rsp+16*1], m6
+    lea                   tx2q, [o(m(idct_32x16_internal).end)] ;tx2q is r4, so r4 is clobbered here
+    call  m(idct_8x8_internal).pass1_end3
+    pmulhrsw                m7, [o(pw_8192)]              ;(x+2)>>2, assuming pw_8192 = 8192 per word
+    mova            [rsp+16*0], m7
+    mova                    m7, [o(pw_8192)]
+    REPX      {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    mova            [rsp+16*1], m6
+    mova            [rsp+16*2], m5
+    call  m(idct_8x8_internal).end3                       ;presumably adds the tile to dst - confirm
+    lea                   dstq, [dstq+strideq*2]
+
+    pxor                    m7, m7
+    REPX   {mova [coeffq+64*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7 ;zero the 8 rows just read
+
+    add                 coeffq, 16                        ;next 16-byte group of the same rows
+    dec                     r3
+    jg .loop
+    ;pass finished: reload the pass counter, r4 was overwritten through tx2q
+    mov                     r4, [rsp+gprsize*2+16*3]
+    dec                     r4
+    jle .ret                                              ;no passes left
+
+    mov                   dstq, [rsp+gprsize*0+16*3]      ;dst saved for this pass
+    mov                 coeffq, [rsp+gprsize*3+16*3]      ;coeff base of the previous pass
+    mov   [rsp+gprsize*2+16*3], r4                        ;store the decremented pass counter
+    lea                     r3, [dstq+8]
+    add                 coeffq, 64*8                      ;skip the 8 rows of 64 bytes just handled
+    mov   [rsp+gprsize*0+16*3], r3                        ;dst for the pass after this one
+    mov                     r3, [rsp+gprsize*1+16*3]      ;restart the tile counter
+    mov   [rsp+gprsize*3+16*3], coeffq
+    jmp .loop
+
+.ret:
     RET