shithub: dav1d

Download patch

ref: 5fa6c44a61fbf946646899d9db24d92cdce478ac
parent: 71e13008acae0a3cf6cee1790914248ee1140c89
author: Liwei Wang <[email protected]>
date: Wed Dec 26 09:57:39 EST 2018

Add SSSE3 implementation for the 8x8 blocks in itx

Cycle times:
inv_txfm_add_8x8_adst_adst_0_8bpc_c: 2165.6
inv_txfm_add_8x8_adst_adst_0_8bpc_ssse3: 194.5
inv_txfm_add_8x8_adst_adst_1_8bpc_c: 2158.3
inv_txfm_add_8x8_adst_adst_1_8bpc_ssse3: 194.7
inv_txfm_add_8x8_adst_dct_0_8bpc_c: 2241.0
inv_txfm_add_8x8_adst_dct_0_8bpc_ssse3: 165.1
inv_txfm_add_8x8_adst_dct_1_8bpc_c: 2242.6
inv_txfm_add_8x8_adst_dct_1_8bpc_ssse3: 164.2
inv_txfm_add_8x8_adst_flipadst_0_8bpc_c: 2178.2
inv_txfm_add_8x8_adst_flipadst_0_8bpc_ssse3: 194.4
inv_txfm_add_8x8_adst_flipadst_1_8bpc_c: 2183.0
inv_txfm_add_8x8_adst_flipadst_1_8bpc_ssse3: 194.2
inv_txfm_add_8x8_adst_identity_0_8bpc_c: 1592.1
inv_txfm_add_8x8_adst_identity_0_8bpc_ssse3: 125.2
inv_txfm_add_8x8_adst_identity_1_8bpc_c: 1597.7
inv_txfm_add_8x8_adst_identity_1_8bpc_ssse3: 126.3
inv_txfm_add_8x8_dct_adst_0_8bpc_c: 2214.1
inv_txfm_add_8x8_dct_adst_0_8bpc_ssse3: 162.0
inv_txfm_add_8x8_dct_adst_1_8bpc_c: 2221.5
inv_txfm_add_8x8_dct_adst_1_8bpc_ssse3: 161.9
inv_txfm_add_8x8_dct_dct_0_8bpc_c: 2247.8
inv_txfm_add_8x8_dct_dct_0_8bpc_ssse3: 34.0
inv_txfm_add_8x8_dct_dct_1_8bpc_c: 2243.1
inv_txfm_add_8x8_dct_dct_1_8bpc_ssse3: 133.7
inv_txfm_add_8x8_dct_flipadst_0_8bpc_c: 2255.1
inv_txfm_add_8x8_dct_flipadst_0_8bpc_ssse3: 161.2
inv_txfm_add_8x8_dct_flipadst_1_8bpc_c: 2244.6
inv_txfm_add_8x8_dct_flipadst_1_8bpc_ssse3: 161.8
inv_txfm_add_8x8_dct_identity_0_8bpc_c: 1632.3
inv_txfm_add_8x8_dct_identity_0_8bpc_ssse3: 41.3
inv_txfm_add_8x8_dct_identity_1_8bpc_c: 1629.6
inv_txfm_add_8x8_dct_identity_1_8bpc_ssse3: 97.7
inv_txfm_add_8x8_flipadst_adst_0_8bpc_c: 2185.6
inv_txfm_add_8x8_flipadst_adst_0_8bpc_ssse3: 191.0
inv_txfm_add_8x8_flipadst_adst_1_8bpc_c: 2165.7
inv_txfm_add_8x8_flipadst_adst_1_8bpc_ssse3: 191.6
inv_txfm_add_8x8_flipadst_dct_0_8bpc_c: 2246.4
inv_txfm_add_8x8_flipadst_dct_0_8bpc_ssse3: 162.8
inv_txfm_add_8x8_flipadst_dct_1_8bpc_c: 2252.1
inv_txfm_add_8x8_flipadst_dct_1_8bpc_ssse3: 163.9
inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_c: 2180.9
inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_ssse3: 196.3
inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_c: 2192.2
inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_ssse3: 194.5
inv_txfm_add_8x8_flipadst_identity_0_8bpc_c: 1600.9
inv_txfm_add_8x8_flipadst_identity_0_8bpc_ssse3: 126.6
inv_txfm_add_8x8_flipadst_identity_1_8bpc_c: 1600.5
inv_txfm_add_8x8_flipadst_identity_1_8bpc_ssse3: 126.4
inv_txfm_add_8x8_identity_adst_0_8bpc_c: 1558.0
inv_txfm_add_8x8_identity_adst_0_8bpc_ssse3: 120.7
inv_txfm_add_8x8_identity_adst_1_8bpc_c: 1556.7
inv_txfm_add_8x8_identity_adst_1_8bpc_ssse3: 121.0
inv_txfm_add_8x8_identity_dct_0_8bpc_c: 1600.8
inv_txfm_add_8x8_identity_dct_0_8bpc_ssse3: 37.9
inv_txfm_add_8x8_identity_dct_1_8bpc_c: 1599.5
inv_txfm_add_8x8_identity_dct_1_8bpc_ssse3: 90.3
inv_txfm_add_8x8_identity_flipadst_0_8bpc_c: 1584.9
inv_txfm_add_8x8_identity_flipadst_0_8bpc_ssse3: 120.2
inv_txfm_add_8x8_identity_flipadst_1_8bpc_c: 1584.3
inv_txfm_add_8x8_identity_flipadst_1_8bpc_ssse3: 120.5
inv_txfm_add_8x8_identity_identity_0_8bpc_c: 975.9
inv_txfm_add_8x8_identity_identity_0_8bpc_ssse3: 54.7
inv_txfm_add_8x8_identity_identity_1_8bpc_c: 975.7
inv_txfm_add_8x8_identity_identity_1_8bpc_ssse3: 54.7

--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -80,6 +80,7 @@
 decl_itx17_fns(4, 4, ssse3);
 decl_itx16_fns(4, 8, ssse3);
 decl_itx16_fns(8, 4, ssse3);
+decl_itx16_fns(8, 8, ssse3);
 
 void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
 #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@@ -126,6 +127,7 @@
     assign_itx17_fn(,  4, 4, ssse3);
     assign_itx16_fn(R, 4, 8, ssse3);
     assign_itx16_fn(R, 8, 4, ssse3);
+    assign_itx16_fn(,  8, 8, ssse3);
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -58,6 +58,8 @@
 pd_2048:        times 4 dd  2048
 pw_2048:        times 8 dw  2048
 pw_4096:        times 8 dw  4096
+pw_16384:       times 8 dw  16384
+pw_m16384:      times 8 dw  -16384
 pw_2896x8:      times 8 dw  2896*8
 pw_3344x8:      times 8 dw  3344*8
 pw_5793x4:      times 8 dw  5793*4
@@ -69,6 +71,14 @@
 
 SECTION .text
 
+%macro REPX 2-*                        ;instruction template written in terms of x, then 1+ operands
+    %xdefine %%f(x) %1
+%rep %0 - 1
+    %rotate 1
+    %%f(%1)                            ;emit the template once with x replaced by the current operand
+%endrep
+%endmacro
+
 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
 
 %if ARCH_X86_64
@@ -841,23 +851,31 @@
 
 
 %macro WRITE_8X2 5       ;coefs[1-2], tmp[1-3]
-    movq                 m%3, [dstq   ]
+    movq                 m%3, [dstq        ]      ;8 pixels of the first row
     movq                 m%4, [dstq+strideq]
     pxor                 m%5, m%5
     punpcklbw            m%3, m%5                 ;extend byte to word
     punpcklbw            m%4, m%5                 ;extend byte to word
+%ifnum %1
     paddw                m%3, m%1
+%else ; %1 is not a register number
+    paddw                m%3, %1                  ;use the argument verbatim, e.g. a [mem] operand
+%endif
+%ifnum %2
     paddw                m%4, m%2
+%else ; %2 is not a register number
+    paddw                m%4, %2                  ;use the argument verbatim, e.g. a [mem] operand
+%endif
     packuswb             m%3, m%4
-    movq           [dstq   ], m%3
+    movq      [dstq        ], m%3                 ;low qword = first row, saturated to bytes
     punpckhqdq           m%3, m%3
     movq      [dstq+strideq], m%3
 %endmacro
 
 %macro WRITE_8X4 7      ;coefs[1-4], tmp[1-3]
-    WRITE_8X2             0, 1, 4, 5, 6
+    WRITE_8X2             %1, %2, %5, %6, %7      ;rows 0-1: coefs and temps now come from the macro arguments
     lea                dstq, [dstq+strideq*2]
-    WRITE_8X2             2, 3, 4, 5, 6
+    WRITE_8X2             %3, %4, %5, %6, %7      ;rows 2-1+2: second pair of coefs, same temps
 %endmacro
 
 %macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
@@ -1023,6 +1041,7 @@
     mova      [coeffq+16*1], m6
     mova      [coeffq+16*2], m6
     mova      [coeffq+16*3], m6
+.end3:
     WRITE_8X4             0, 1, 2, 3, 4, 5, 6
     RET
 
@@ -1116,3 +1135,344 @@
     pmulhrsw             m2, m4
     pmulhrsw             m3, m4
     jmp m(iadst_8x4_internal).end
+
+%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 8x8, 8
+%ifidn %1_%2, dct_identity
+    mova                 m0, [o(pw_2896x8)]
+    pmulhrsw             m0, [coeffq]                     ;scale the first 8 coefs by pw_2896x8
+    mova                 m1, [o(pw_16384)]
+    pmulhrsw             m0, m1
+    psrlw                m1, 2                            ;16384 >> 2 = 4096
+    pmulhrsw             m0, m1
+    punpckhwd            m7, m0, m0                       ;words 4-7, each doubled; kept for the second half
+    punpcklwd            m0, m0                           ;words 0-3, each doubled
+    pshufd               m3, m0, q3333                    ;m0-m3 = one doubled word of m0 spread over the register
+    pshufd               m2, m0, q2222
+    pshufd               m1, m0, q1111
+    pshufd               m0, m0, q0000
+    call m(iadst_8x4_internal).end2                       ;NOTE(review): label not visible here, presumably adds m0-m3 to 4 rows
+    pshufd               m3, m7, q3333                    ;same spreading for the words saved in m7
+    pshufd               m2, m7, q2222
+    pshufd               m1, m7, q1111
+    pshufd               m0, m7, q0000
+    lea                dstq, [dstq+strideq*2]
+    TAIL_CALL m(iadst_8x4_internal).end3
+%elif %3 >= 0
+%ifidn %1, dct
+    pshuflw              m0, [coeffq], q0000              ;copy coef 0 into the low 4 words
+    punpcklwd            m0, m0                           ;and from there into all 8 words
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    mova                 m2, [o(pw_16384)]
+    mov            [coeffq], eobd                         ;overwrite coef 0 with eob; presumably eob == 0 on this path - TODO confirm
+    pmulhrsw             m0, m2
+    psrlw                m2, 3                            ;16384 >> 3 = 2048
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m2
+.end:
+    mov                 r2d, 2                            ;2 iterations x WRITE_8X4 = 8 rows
+.end2:
+    lea                  r3, [strideq*3]                  ;r3 = stride*3; not read by the code visible in this macro
+.loop:
+    WRITE_8X4             0, 0, 0, 0, 1, 2, 3             ;add the same m0 to 4 rows; m1-m3 are temps
+    lea                dstq, [dstq+strideq*2]             ;WRITE_8X4 already stepped 2 rows, step 2 more
+    dec                 r2d
+    jg .loop
+    RET
+%else ; identity
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    mova                 m2, [coeffq+16*2]
+    mova                 m3, [coeffq+16*3]
+    punpcklwd            m0, [coeffq+16*4]                ;00 40 01 41 02 42 03 43
+    punpcklwd            m1, [coeffq+16*5]                ;10 50 11 51 12 52 13 53
+    punpcklwd            m2, [coeffq+16*6]                ;20 60 21 61 22 62 23 63
+    punpcklwd            m3, [coeffq+16*7]                ;30 70 31 71 32 72 33 73
+    punpcklwd            m0, m2                           ;00 20 40 60 01 21 41 61
+    punpcklwd            m1, m3                           ;10 30 50 70 11 31 51 71
+    punpcklwd            m0, m1                           ;00 10 20 30 40 50 60 70
+    pmulhrsw             m0, [o(pw_2896x8)]
+    pmulhrsw             m0, [o(pw_2048)]
+    pxor                 m4, m4
+    REPX {mova [coeffq+16*x], m4}, 0,  1,  2,  3,  4,  5,  6,  7
+    jmp m(inv_txfm_add_dct_dct_8x8).end                   ;reuse the dct_dct loop that adds m0 to all 8 rows
+%endif
+%endif
+%endmacro
+
+%macro ITX_8X8_LOAD_COEFS 0
+    mova                 m0, [coeffq+16*0]                ;rows 0-6 of the 8x8 block go to m0-m6
+    mova                 m1, [coeffq+16*1]
+    mova                 m2, [coeffq+16*2]
+    mova                 m3, [coeffq+16*3]
+    mova                 m4, [coeffq+16*4]
+    mova                 m5, [coeffq+16*5]
+    mova                 m6, [coeffq+16*6]                ;row 7 is not loaded: users read [coeffq+16*7] themselves
+%endmacro
+
+%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %1, %4, %5, %6, %7,  799, 4017   ;t4a, t7a
+    ITX_MULSUB_2W        %3, %2, %5, %6, %7, 3406, 2276   ;t5a, t6a
+    psubsw               m%5, m%1, m%3                    ;t5a
+    paddsw               m%1, m%3                         ;t4
+    psubsw               m%6, m%4, m%2                    ;t6a
+    paddsw               m%4, m%2                         ;t7
+    mova                 m%3, [o(pw_2896x8)]              ;src3 is consumed above, reuse it for the constant
+    psubw                m%2, m%6, m%5                    ;t6a - t5a
+    paddw                m%6, m%5                         ;t6a + t5a
+    pmulhrsw             m%2, m%3                         ;t5
+    pmulhrsw             m%3, m%6                         ;t6
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct,      0
+INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    ITX_8X8_LOAD_COEFS
+    call .main                                          ;first 1-D pass; out7 comes back in [coeffq+16*7]
+
+.pass1_end:
+    mova                  m7, [o(pw_16384)]             ;pmulhrsw by 16384 == rounded >> 1
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*6], m6                        ;spill row 6: m6 is a temp in the transpose below
+
+.pass1_end2:                                            ;entry: m7 = scale for rows 1/3/5/7, row 6 and row 7 in memory
+    REPX    {pmulhrsw x, m7}, m1, m3, m5
+    pmulhrsw              m7, [coeffq+16*7]             ;m7 = scaled row 7
+
+.pass1_end3:                                            ;entry: rows 0-5 in m0-m5, row 7 in m7, row 6 in [coeffq+16*6]
+    punpcklwd             m6, m1, m5             ;10 50 11 51 12 52 13 53
+    punpckhwd             m1, m5                 ;14 54 15 55 16 56 17 57
+    punpckhwd             m5, m0, m4             ;04 44 05 45 06 46 07 47
+    punpcklwd             m0, m4                 ;00 40 01 41 02 42 03 43
+    punpckhwd             m4, m3, m7             ;34 74 35 75 36 76 37 77
+    punpcklwd             m3, m7                 ;30 70 31 71 32 72 33 73
+    punpckhwd             m7, m1, m4             ;16 36 56 76 17 37 57 77
+    punpcklwd             m1, m4                 ;14 34 54 74 15 35 55 75
+    punpckhwd             m4, m6, m3             ;12 32 52 72 13 33 53 73
+    punpcklwd             m6, m3                 ;10 30 50 70 11 31 51 71
+    mova       [coeffq+16*5], m6                 ;park this vector, m6 is needed for row 6
+    mova                  m6, [coeffq+16*6]      ;reload the spilled row 6
+    punpckhwd             m3, m2, m6             ;24 64 25 65 26 66 27 67
+    punpcklwd             m2, m6                 ;20 60 21 61 22 62 23 63
+    punpckhwd             m6, m5, m3             ;06 26 46 66 07 27 47 67
+    punpcklwd             m5, m3                 ;04 24 44 64 05 25 45 65
+    punpckhwd             m3, m0, m2             ;02 22 42 62 03 23 43 63
+    punpcklwd             m0, m2                 ;00 20 40 60 01 21 41 61
+
+    punpckhwd             m2, m6, m7             ;07 17 27 37 47 57 67 77
+    punpcklwd             m6, m7                 ;06 16 26 36 46 56 66 76
+    mova       [coeffq+16*7], m2                 ;transposed vector 7 is handed on in memory
+    punpcklwd             m2, m3, m4             ;02 12 22 32 42 52 62 72
+    punpckhwd             m3, m4                 ;03 13 23 33 43 53 63 73
+    punpcklwd             m4, m5, m1             ;04 14 24 34 44 54 64 74
+    punpckhwd             m5, m1                 ;05 15 25 35 45 55 65 75
+    mova                  m7, [coeffq+16*5]      ;10 30 50 70 11 31 51 71 (parked above)
+    punpckhwd             m1, m0, m7             ;01 11 21 31 41 51 61 71
+    punpcklwd             m0, m7                 ;00 10 20 30 40 50 60 70
+    jmp                tx2q                      ;continue at the address in tx2q (set outside this view, presumably a .pass2)
+
+.pass2:
+    call .main
+
+.end:
+    mova                  m7, [o(pw_2048)]              ;pmulhrsw by 2048 == rounded >> 4
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*6], m6                        ;row 6 is passed to WRITE_8X4 as a memory operand
+
+.end2:                                                  ;entry: m7 = scale for rows 1/3/5/7, row 6 and row 7 in memory
+    REPX    {pmulhrsw x, m7}, m1, m3, m5
+    pmulhrsw              m7, [coeffq+16*7]
+    mova       [coeffq+16*5], m5                        ;rows 5 and 7 go to memory too: m5-m7 become temps
+    mova       [coeffq+16*7], m7
+
+.end3:                                                  ;entry: rows 0-4 in m0-m4, rows 5-7 in [coeffq+16*5..7]
+    WRITE_8X4             0, 1, 2, 3, 5, 6, 7
+    lea                dstq, [dstq+strideq*2]           ;WRITE_8X4 stepped 2 rows, step 2 more
+    WRITE_8X4             4, [coeffq+16*5], [coeffq+16*6], [coeffq+16*7], 5, 6, 7
+
+    pxor                 m7, m7
+    REPX {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
+    ret
+
+ALIGN function_align
+.main:                                                  ;in: m0-m6 + [coeffq+16*7]; out: m0-m6 + [coeffq+16*7]
+    mova       [coeffq+16*6], m3                        ;spill input 3
+    mova       [coeffq+16*5], m1                        ;spill input 1
+    mova                  m7, [o(pd_2048)]
+    IDCT4_1D               0, 2, 4, 6, 1, 3, 7
+    mova                  m3, [coeffq+16*5]             ;m3 = input 1
+    mova       [coeffq+16*5], m2                        ;save m2 from IDCT4_1D (combined with t6 below)
+    mova                  m2, [coeffq+16*6]             ;m2 = input 3
+    mova       [coeffq+16*6], m4                        ;save m4 from IDCT4_1D (combined with t5 below)
+    mova                  m4, [coeffq+16*7]             ;m4 = input 7
+    mova       [coeffq+16*7], m6                        ;save m6 from IDCT4_1D (combined with t4 below)
+    IDCT8_1D_ODDHALF       3, 2, 5, 4, 1, 6, 7          ;-> m3 = t4, m2 = t5, m5 = t6, m4 = t7
+    mova                  m6, [coeffq+16*7]
+    psubsw                m7, m0, m4                    ;out7
+    paddsw                m0, m4                        ;out0
+    mova       [coeffq+16*7], m7                        ;out7 is returned in memory
+    mova                  m1, [coeffq+16*5]
+    psubsw                m4, m6, m3                    ;out4
+    paddsw                m3, m6                        ;out3
+    mova                  m7, [coeffq+16*6]
+    psubsw                m6, m1, m5                    ;out6
+    paddsw                m1, m5                        ;out1
+    psubsw                m5, m7, m2                    ;out5
+    paddsw                m2, m7                        ;out2
+    ret
+
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    ITX_8X8_LOAD_COEFS
+    call .main                                          ;returns rows 1/3/5/7 negated, row 7 in [coeffq+16*7]
+    mova                  m7, [o(pw_16384)]
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*6], m6                        ;spill row 6 as idct's .pass1_end2 expects
+    pxor                  m6, m6
+    psubw                 m6, m7                        ;0 - 16384
+    mova                  m7, m6                        ;m7 = -16384: scales and un-negates rows 1/3/5/7
+    jmp m(idct_8x8_internal).pass1_end2
+
+ALIGN function_align
+.pass2:
+    call .main
+    mova                  m7, [o(pw_2048)]
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*6], m6                        ;row 6 in memory as idct's .end2 expects
+    pxor                  m6, m6
+    psubw                 m6, m7                        ;0 - 2048
+    mova                  m7, m6                        ;m7 = -2048: scales and un-negates rows 1/3/5/7
+    jmp m(idct_8x8_internal).end2
+
+ALIGN function_align
+.main:                                                  ;in: m0-m6 + [coeffq+16*7]; out: m0-m6 + [coeffq+16*7]
+    mova       [coeffq+16*6], m3                        ;spill input 3: m3/m4 are temps below
+    mova       [coeffq+16*5], m4                        ;spill input 4
+    mova                  m7, [o(pd_2048)]
+    ITX_MULSUB_2W          5, 2, 3, 4, 7, 1931, 3612    ;t3a, t2a
+    ITX_MULSUB_2W          1, 6, 3, 4, 7, 3920, 1189    ;t7a, t6a
+    paddsw                m3, m2, m6                    ;t2
+    psubsw                m2, m6                        ;t6
+    paddsw                m4, m5, m1                    ;t3
+    psubsw                m5, m1                        ;t7
+    ITX_MULSUB_2W          5, 2, 1, 6, 7, 3784, 1567    ;t6a, t7a
+
+    mova                  m6, [coeffq+16*5]             ;m6 = input 4
+    mova       [coeffq+16*5], m5                        ;save t6a
+    mova                  m1, [coeffq+16*6]             ;m1 = input 3
+    mova       [coeffq+16*6], m2                        ;save t7a
+    mova                  m5, [coeffq+16*7]             ;m5 = input 7
+    mova       [coeffq+16*7], m3                        ;save t2
+    ITX_MULSUB_2W          5, 0, 2, 3, 7,  401, 4076    ;t1a, t0a
+    ITX_MULSUB_2W          1, 6, 2, 3, 7, 3166, 2598    ;t5a, t4a
+    psubsw                m2, m0, m6                    ;t4
+    paddsw                m0, m6                        ;t0
+    paddsw                m3, m5, m1                    ;t1
+    psubsw                m5, m1                        ;t5
+    ITX_MULSUB_2W          2, 5, 1, 6, 7, 1567, 3784    ;t5a, t4a
+
+    mova                  m7, [coeffq+16*7]             ;reload the value saved from m3 above
+    paddsw                m1, m3, m4                    ;-out7
+    psubsw                m3, m4                        ;t3
+    mova       [coeffq+16*7], m1                        ;-out7 is returned in memory
+    psubsw                m4, m0, m7                    ;t2
+    paddsw                m0, m7                        ;out0
+    mova                  m6, [coeffq+16*5]             ;reload the value saved from m5 above
+    mova                  m7, [coeffq+16*6]             ;reload the value saved from m2 above
+    paddsw                m1, m5, m6                    ;-out1
+    psubsw                m5, m6                        ;t6
+    paddsw                m6, m2, m7                    ;out6
+    psubsw                m2, m7                        ;t7
+    paddw                 m7, m4, m3                    ;t2 + t3
+    psubw                 m4, m3                        ;t2 - t3
+    paddw                 m3, m5, m2                    ;t6 + t7
+    psubw                 m5, m2                        ;t6 - t7
+    mova                  m2, [o(pw_2896x8)]
+    pmulhrsw              m4, m2                        ;out4
+    pmulhrsw              m5, m2                        ;-out5
+    pmulhrsw              m7, m2                        ;-out3
+    pmulhrsw              m2, m3                        ;out2
+    mova                  m3, m7                        ;move -out3 into its row register
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    ITX_8X8_LOAD_COEFS
+    call m(iadst_8x8_internal).main                     ;m0..m6 = out0,-out1,out2,-out3,out4,-out5,out6; mem7 = -out7
+    mova                  m7, [o(pw_m16384)]            ;negative scale for the negated outputs
+    pmulhrsw              m1, m7
+    mova       [coeffq+16*6], m1                        ;row 6 = out1 (in memory, as .pass1_end3 expects)
+    mova                  m1, m6                        ;row 1 = out6 (scaled below)
+    mova                  m6, m2                        ;hold out2
+    pmulhrsw              m2, m5, m7                    ;row 2 = out5
+    mova                  m5, m6                        ;row 5 = out2 (scaled below)
+    mova                  m6, m4                        ;hold out4
+    pmulhrsw              m4, m3, m7                    ;row 4 = out3
+    mova                  m3, m6                        ;row 3 = out4 (scaled below)
+    mova                  m6, m0                        ;hold out0
+    mova                  m0, m7
+    pxor                  m7, m7
+    psubw                 m7, m0                        ;m7 = +16384 for the non-negated outputs
+    pmulhrsw              m0, [coeffq+16*7]             ;row 0 = out7 (-16384 * -out7)
+    REPX    {pmulhrsw x, m7}, m1, m3, m5
+    pmulhrsw              m7, m6                        ;row 7 = out0
+    jmp m(idct_8x8_internal).pass1_end3
+
+ALIGN function_align
+.pass2:
+    call m(iadst_8x8_internal).main
+    mova                  m7, [o(pw_2048)]
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*5], m2                        ;row 5 = out2
+    mova                  m2, m0                        ;hold scaled out0
+    pxor                  m0, m0
+    psubw                 m0, m7                        ;m0 = -2048 for the negated outputs
+    mova                  m7, m2                        ;m7 = scaled out0
+    pmulhrsw              m1, m0
+    pmulhrsw              m2, m5, m0                    ;row 2 = out5
+    mova       [coeffq+16*6], m1                        ;row 6 = out1
+    mova                  m5, m4                        ;hold scaled out4
+    mova                  m1, m6                        ;row 1 = out6
+    pmulhrsw              m4, m3, m0                    ;row 4 = out3
+    pmulhrsw              m0, [coeffq+16*7]             ;row 0 = out7
+    mova                  m3, m5                        ;row 3 = out4
+    mova       [coeffq+16*7], m7                        ;row 7 = out0
+    jmp m(idct_8x8_internal).end3
+
+INV_TXFM_8X8_FN identity, dct,      7
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]                ;first pass only loads the rows, no arithmetic here
+    mova                 m1, [coeffq+16*1]
+    mova                 m2, [coeffq+16*2]
+    mova                 m3, [coeffq+16*3]
+    mova                 m4, [coeffq+16*4]
+    mova                 m5, [coeffq+16*5]
+    mova                 m7, [coeffq+16*7]                ;row 6 stays in memory: .pass1_end3 loads it itself
+    jmp m(idct_8x8_internal).pass1_end3
+
+ALIGN function_align
+.pass2:
+    mova                  m7, [o(pw_4096)]              ;pmulhrsw by 4096 == rounded >> 3
+    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulhrsw              m7, [coeffq+16*7]
+    mova       [coeffq+16*5], m5                        ;idct's .end3 reads rows 5-7 from memory
+    mova       [coeffq+16*6], m6
+    mova       [coeffq+16*7], m7
+    jmp m(idct_8x8_internal).end3