shithub: dav1d

ref: 61dcd11ba7364a0c6a9cef49e7b99722feee133e
dir: /src/x86/itx.asm/

View raw version
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

; Note: The order of (at least some of) those constants matter!

iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
iadst4_dconly1a: dw 10568, 19856, 26752, 30424
iadst4_dconly1b: dw 30424, 26752, 19856, 10568

deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15

%macro COEF_PAIR 2
pw_%1_%2:  dw  %1, %2
pw_m%2_%1: dw -%2, %1
%endmacro

; ADST-only
pw_3803_1321:   dw  3803,  1321
pw_m1321_2482:  dw -1321,  2482
pw_2482_3344:   dw  2482,  3344
pw_m3803_3344:  dw -3803,  3344
pw_m3803_m6688: dw -3803, -6688
%define pw_3344x8 iadst4_dconly2b

pw_5:      times 2 dw 5
pw_2048:   times 2 dw 2048
pw_4096:   times 2 dw 4096
pw_8192:   times 2 dw 8192
pw_16384:  times 2 dw 16384
pw_2896x8: times 2 dw 2896*8
pw_5793x4: times 2 dw 5793*4

pd_2048: dd 2048

COEF_PAIR 1567, 3784
COEF_PAIR 3784, 1567
COEF_PAIR  201, 4091
COEF_PAIR  995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 2440, 3290
COEF_PAIR 3035, 2751
COEF_PAIR 3513, 2106
COEF_PAIR 3857, 1380
COEF_PAIR 4052,  601
COEF_PAIR  401, 4076
COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189
COEF_PAIR  799, 4017
COEF_PAIR 3406, 2276
pw_m799_m4017:  dw  -799, -4017
pw_m1567_m3784: dw -1567, -3784
pw_m3406_m2276: dw -3406, -2276
pw_m401_m4076:  dw  -401, -4076
pw_m3166_m2598: dw -3166, -2598
pw_m1931_m3612: dw -1931, -3612
pw_m3920_m1189: dw -3920, -1189
COEF_PAIR 2276, 3406
COEF_PAIR 4017,  799

%macro COEF_X8 1-*
%rep %0
    dw %1*8, %1*8
    %rotate 1
%endrep
%endmacro

pw_3703x8:  COEF_X8  3703
pw_1751x8:  COEF_X8  1751
pw_m1380x8: COEF_X8 -1380
pw_3857x8:  COEF_X8  3857
pw_3973x8:  COEF_X8  3973
pw_995x8:   COEF_X8   995
pw_m2106x8: COEF_X8 -2106
pw_3513x8:  COEF_X8  3513
pw_3290x8:  COEF_X8  3290
pw_2440x8:  COEF_X8  2440
pw_m601x8:  COEF_X8  -601
pw_4052x8:  COEF_X8  4052

idct64_mul: COEF_X8  4095,   101,  4065,   501,  2967, -2824,  3229, -2520
            COEF_X8  3745,  1660,  3564,  2019,  3822, -1474,  3948, -1092
            COEF_X8  3996,   897,  3889,  1285,  3461, -2191,  3659, -1842
            COEF_X8  3349,  2359,  3102,  2675,  4036,  -700,  4085,  -301

pw_201_4091x8:   dw   201*8, 4091*8
pw_m601_4052x8:  dw  -601*8, 4052*8
pw_995_3973x8:   dw   995*8, 3973*8
pw_m1380_3857x8: dw -1380*8, 3857*8
pw_1751_3703x8:  dw  1751*8, 3703*8
pw_m2106_3513x8: dw -2106*8, 3513*8
pw_2440_3290x8:  dw  2440*8, 3290*8
pw_m2751_3035x8: dw -2751*8, 3035*8

%define o_idct64_offset idct64_mul - (o_base) - 8

SECTION .text

; Code size reduction trickery: Intead of using rip-relative loads with
; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
; single rip-relative lea and then address things relative from that with
; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
%define o_base iadst4_dconly2a + 128
%define o(x) (rax - (o_base) + (x))

%macro REPX 2-*
    %xdefine %%f(x) %1
%rep %0 - 1
    %rotate 1
    %%f(%1)
%endrep
%endmacro

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; flags: 1 = swap, 2 = interleave, 4: coef_regs
%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
%if %7 & 4
    pmaddwd             m%2, m%5, m%1
    pmaddwd             m%1, m%6
%else
%if %7 & 1
    vpbroadcastd        m%2, [o(pw_%5_%6)]
    vpbroadcastd        m%3, [o(pw_m%6_%5)]
%else
    vpbroadcastd        m%2, [o(pw_m%6_%5)]
    vpbroadcastd        m%3, [o(pw_%5_%6)]
%endif
    pmaddwd             m%2, m%1
    pmaddwd             m%1, m%3
%endif
    paddd               m%2, m%4
    paddd               m%1, m%4
%if %7 & 2
    pslld               m%2, 4
    psrld               m%1, 12
    pblendw             m%1, m%2, 0xaa
%else
    psrad               m%2, 12
    psrad               m%1, 12
    packssdw            m%1, m%2
%endif
%endmacro

; flags: 1 = swap, 2 = interleave, 4 = coef_regs
%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
%if %10 & 1
    vpbroadcastd        m%3, [o(pw_%8_%9)]
    vpbroadcastd        m%4, [o(pw_m%9_%8)]
    vpbroadcastd       xm%2, [o(pw_%6_%7)]
    vpblendd            m%2, m%2, m%3, 0xf0
    vpbroadcastd       xm%3, [o(pw_m%7_%6)]
%else
    vpbroadcastd        m%3, [o(pw_m%9_%8)]
    vpbroadcastd        m%4, [o(pw_%8_%9)]
    vpbroadcastd       xm%2, [o(pw_m%7_%6)]
    vpblendd            m%2, m%2, m%3, 0xf0
    vpbroadcastd       xm%3, [o(pw_%6_%7)]
%endif
    vpblendd            m%3, m%3, m%4, 0xf0
    ITX_MUL2X_PACK       %1, %4, _, %5, %2, %3, (4|%10)
%endmacro

; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
    punpckhwd           m%3, m%2, m%1
    punpcklwd           m%2, m%1
%if %7 < 32
    pmaddwd             m%1, m%7, m%2
    pmaddwd             m%4, m%7, m%3
%else
    vpbroadcastd        m%1, [o(pw_m%7_%6)]
    pmaddwd             m%4, m%3, m%1
    pmaddwd             m%1, m%2
%endif
    paddd               m%4, m%5
    paddd               m%1, m%5
    psrad               m%4, 12
    psrad               m%1, 12
    packssdw            m%1, m%4
%if %7 < 32
    pmaddwd             m%3, m%6
    pmaddwd             m%2, m%6
%else
    vpbroadcastd        m%4, [o(pw_%6_%7)]
    pmaddwd             m%3, m%4
    pmaddwd             m%2, m%4
%endif
    paddd               m%3, m%5
    paddd               m%2, m%5
    psrad               m%3, 12
    psrad               m%2, 12
    packssdw            m%2, m%3
%endmacro

%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
    vpbroadcastd        m%6, [o(pw_2896x8)]
    paddw               m%5, m%1, m%3
    psubw               m%1, m%3
    pmulhrsw            m%1, m%6 ; t1
    pmulhrsw            m%5, m%6 ; t0
    psubsw              m%3, m%1, m%2
    paddsw              m%2, m%1
    paddsw              m%1, m%5, m%4
    psubsw              m%4, m%5, m%4
%endmacro

%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
    ITX_MULSUB_2W        %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
    ITX_MULSUB_2W        %2, %8, %9, %10, %11,  799, 4017 ; t4a, t7a
    ITX_MULSUB_2W        %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
    paddsw              m%9, m%2, m%6  ; t4
    psubsw              m%2, m%6       ; t5a
    paddsw             m%10, m%8, m%4  ; t7
    psubsw              m%8, m%4       ; t6a
    vpbroadcastd        m%4, [o(pw_2896x8)]
    psubw               m%6, m%1, m%5
    paddw               m%1, m%5
    psubw               m%5, m%8, m%2
    paddw               m%8, m%2
    pmulhrsw            m%1, m%4       ; t0
    pmulhrsw            m%6, m%4       ; t1
    pmulhrsw            m%8, m%4       ; t6
    pmulhrsw            m%5, m%4       ; t5
    psubsw              m%4, m%1, m%7  ; dct4 out3
    paddsw              m%1, m%7       ; dct4 out0
    paddsw              m%7, m%6, m%3  ; dct4 out1
    psubsw              m%6, m%3       ; dct4 out2
    paddsw              m%2, m%7, m%8  ; out1
    psubsw              m%7, m%8       ; out6
    psubsw              m%8, m%1, m%10 ; out7
    paddsw              m%1, m%10      ; out0
    paddsw              m%3, m%6, m%5  ; out2
    psubsw              m%6, m%5       ; out5
    psubsw              m%5, m%4, m%9  ; out4
    paddsw              m%4, m%9       ; out3
%endmacro

; in1 = %1, in3  = %2, in5  = %3, in7  = %4
; in9 = %5, in11 = %6, in13 = %7, in15 = %8
%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
    ITX_MULSUB_2W        %1, %8, %9, %10, %11,  401, 4076 ; t8a,  t15a
    ITX_MULSUB_2W        %5, %4, %9, %10, %11, 3166, 2598 ; t9a,  t14a
    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
    ITX_MULSUB_2W        %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
    psubsw              m%9, m%2, m%6 ; t13
    paddsw              m%6, m%2      ; t12
    psubsw              m%2, m%8, m%4 ; t14
    paddsw              m%8, m%4      ; t15
    psubsw              m%4, m%7, m%3 ; t10
    paddsw              m%3, m%7      ; t11
    psubsw              m%7, m%1, m%5 ; t9
    paddsw              m%1, m%5      ; t8
    ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
    ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
    vpbroadcastd       m%10, [o(pw_2896x8)]
    psubsw              m%5, m%2, m%9 ; t10
    paddsw              m%2, m%9      ; t9
    psubsw              m%9, m%1, m%3 ; t11a
    paddsw              m%1, m%3      ; t8a
    psubsw              m%3, m%7, m%4 ; t13
    paddsw              m%7, m%4      ; t14
    psubsw              m%4, m%8, m%6 ; t12a
    paddsw              m%8, m%6      ; t15a
    paddw               m%6, m%3, m%5 ; t13a
    psubw               m%3, m%5      ; t10a
    paddw               m%5, m%4, m%9 ; t12
    psubw               m%4, m%9      ; t11
    REPX {pmulhrsw x, m%10}, m%6, m%3, m%5, m%4
%endmacro

%macro WRAP_XMM 1+
    INIT_XMM cpuname
    %1
    INIT_YMM cpuname
%endmacro

%macro ITX4_END 4-5 2048 ; row[1-4], rnd
%if %5
    vpbroadcastd         m2, [o(pw_%5)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
%endif
    lea                  r2, [dstq+strideq*2]
%assign %%i 1
%rep 4
    %if %1 & 2
        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
    %else
        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
    %endif
    %assign %%i %%i + 1
    %rotate 1
%endrep
    movd                 m2, [%%row_adr1]
    pinsrd               m2, [%%row_adr2], 1
    movd                 m3, [%%row_adr3]
    pinsrd               m3, [%%row_adr4], 1
    pmovzxbw             m2, m2
    pmovzxbw             m3, m3
    paddw                m0, m2
    paddw                m1, m3
    packuswb             m0, m1
    movd       [%%row_adr1], m0
    pextrd     [%%row_adr2], m0, 1
    pextrd     [%%row_adr3], m0, 2
    pextrd     [%%row_adr4], m0, 3
    ret
%endmacro

%macro IWHT4_1D_PACKED 0
    punpckhqdq           m3, m0, m1 ; in1 in3
    punpcklqdq           m0, m1     ; in0 in2
    psubw                m2, m0, m3
    paddw                m0, m3
    punpckhqdq           m2, m2     ; t2 t2
    punpcklqdq           m0, m0     ; t0 t0
    psubw                m1, m0, m2
    psraw                m1, 1
    psubw                m1, m3     ; t1 t3
    psubw                m0, m1     ; ____ out0
    paddw                m2, m1     ; out3 ____
%endmacro

INIT_XMM avx2
cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c
    mova                 m0, [cq+16*0]
    mova                 m1, [cq+16*1]
    pxor                 m2, m2
    mova          [cq+16*0], m2
    mova          [cq+16*1], m2
    psraw                m0, 2
    psraw                m1, 2
    IWHT4_1D_PACKED
    punpckhwd            m0, m1
    punpcklwd            m3, m1, m2
    punpckhdq            m1, m0, m3
    punpckldq            m0, m3
    IWHT4_1D_PACKED
    vpblendd             m0, m0, m2, 0x03
    ITX4_END              3, 0, 2, 1, 0

%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, c, eob, tx2
    %undef cmp
    %define %%p1 m(i%1_%4_internal)
    lea                 rax, [o_base]
    ; Jump to the 1st txfm function if we're not taking the fast path, which
    ; in turn performs an indirect jump to the 2nd txfm function.
    lea tx2q, [m(i%2_%4_internal).pass2]
%if %3 > 0
    cmp                eobd, %3
    jg %%p1
%elif %3 == 0
    test               eobd, eobd
    jnz %%p1
%else
    ; jump to the 1st txfm function unless it's located directly after this
    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro

%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
    INV_TXFM_FN          %1, %2, %3, 4x4
%ifidn %1_%2, dct_identity
    vpbroadcastd         m0, [o(pw_2896x8)]
    pmulhrsw             m0, [cq]
    vpbroadcastd         m1, [o(pw_5793x4)]
    paddw                m0, m0
    pmulhrsw             m0, m1
    punpcklwd            m0, m0
    punpckhdq            m1, m0, m0
    punpckldq            m0, m0
    jmp m(iadst_4x4_internal).end
%elifidn %1_%2, identity_dct
    mova                 m0, [cq+16*0]
    packusdw             m0, [cq+16*1]
    vpbroadcastd         m2, [o(pw_5793x4)]
    vpbroadcastd         m3, [o(pw_2896x8)]
    packusdw             m0, m0
    paddw                m0, m0
    pmulhrsw             m0, m2
    pmulhrsw             m0, m3
    mova                 m1, m0
    jmp m(iadst_4x4_internal).end
%elif %3 >= 0
    vpbroadcastw         m0, [cq]
%ifidn %1, dct
    vpbroadcastd         m1, [o(pw_2896x8)]
    pmulhrsw             m0, m1
%elifidn %1, adst
    movddup              m1, [o(iadst4_dconly1a)]
    pmulhrsw             m0, m1
%elifidn %1, flipadst
    movddup              m1, [o(iadst4_dconly1b)]
    pmulhrsw             m0, m1
%endif
    mov                [cq], eobd ; 0
%ifidn %2, dct
%ifnidn %1, dct
    vpbroadcastd         m1, [o(pw_2896x8)]
%endif
    pmulhrsw             m0, m1
    mova                 m1, m0
    jmp m(iadst_4x4_internal).end2
%else ; adst / flipadst
    pmulhrsw             m1, m0, [o(iadst4_dconly2b)]
    pmulhrsw             m0, [o(iadst4_dconly2a)]
    jmp m(i%2_4x4_internal).end2
%endif
%endif
%endmacro

%macro IDCT4_1D_PACKED 0-1 ; pw_2896x8
    vpbroadcastd         m4, [o(pd_2048)]
    punpckhwd            m2, m1, m0
    psubw                m3, m0, m1
    paddw                m0, m1
    punpcklqdq           m0, m3
    ITX_MUL2X_PACK        2, 1, 3, 4, 1567, 3784
%if %0 == 1
    pmulhrsw             m0, m%1
%else
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m0, m4     ; t0 t1
%endif
    psubsw               m1, m0, m2 ; out3 out2
    paddsw               m0, m2     ; out0 out1
%endmacro

%macro IADST4_1D_PACKED 0
    punpcklwd            m2, m1, m0
    punpckhwd            m3, m1, m0
    psubw                m0, m1
    punpckhqdq           m1, m1
    paddw                m1, m0 ; in0 - in2 + in3
    vpbroadcastd         m0, [o(pw_3803_1321)]
    vpbroadcastd         m4, [o(pw_m1321_2482)]
    pmaddwd              m0, m2
    pmaddwd              m2, m4
    vpbroadcastd         m4, [o(pw_2482_3344)]
    vpbroadcastd         m5, [o(pw_m3803_3344)]
    pmaddwd              m4, m3
    pmaddwd              m5, m3
    paddd                m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
    vpbroadcastd         m0, [o(pw_m3803_m6688)]
    pmaddwd              m3, m0
    vpbroadcastd         m0, [o(pw_3344x8)]
    pmulhrsw             m1, m0 ; out2 ____
    vpbroadcastd         m0, [o(pd_2048)]
    paddd                m2, m0
    paddd                m0, m4
    paddd                m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
    paddd                m2, m4
    paddd                m2, m3
    psrad                m0, 12
    psrad                m5, 12
    psrad                m2, 12
    packssdw             m0, m5 ; out0 out1
    packssdw             m2, m2 ; out3 out3
%endmacro

INV_TXFM_4X4_FN dct, dct,      0
INV_TXFM_4X4_FN dct, adst,     0
INV_TXFM_4X4_FN dct, flipadst, 0
INV_TXFM_4X4_FN dct, identity, 3

cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
    mova                 m0, [cq+16*0]
    mova                 m1, [cq+16*1]
    IDCT4_1D_PACKED
    mova                 m2, [o(deint_shuf)]
    shufps               m3, m0, m1, q1331
    shufps               m0, m0, m1, q0220
    pshufb               m0, m2
    pshufb               m1, m3, m2
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    pxor                 m2, m2
    mova          [cq+16*0], m2
    mova          [cq+16*1], m2
    ITX4_END              0, 1, 3, 2

INV_TXFM_4X4_FN adst, dct,      0
INV_TXFM_4X4_FN adst, adst,     0
INV_TXFM_4X4_FN adst, flipadst, 0
INV_TXFM_4X4_FN adst, identity

cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
    mova                 m0, [cq+16*0]
    mova                 m1, [cq+16*1]
    call .main
    punpckhwd            m3, m0, m2
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    call .main
    vpblendd             m1, m1, m2, 0x0c ; out2 out3
.end:
    pxor                 m2, m2
    mova          [cq+16*0], m2
    mova          [cq+16*1], m2
.end2:
    ITX4_END              0, 1, 2, 3
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret

INV_TXFM_4X4_FN flipadst, dct,      0
INV_TXFM_4X4_FN flipadst, adst,     0
INV_TXFM_4X4_FN flipadst, flipadst, 0
INV_TXFM_4X4_FN flipadst, identity

cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
    mova                 m0, [cq+16*0]
    mova                 m1, [cq+16*1]
    call m(iadst_4x4_internal).main
    punpcklwd            m1, m0
    punpckhwd            m2, m0
    punpcklwd            m0, m2, m1
    punpckhwd            m1, m2, m1
    jmp                tx2q
.pass2:
    call m(iadst_4x4_internal).main
    vpblendd             m1, m1, m2, 0x0c ; out2 out3
.end:
    pxor                 m2, m2
    mova          [cq+16*0], m2
    mova          [cq+16*1], m2
.end2:
    ITX4_END              3, 2, 1, 0

INV_TXFM_4X4_FN identity, dct,      3
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
    mova                 m0, [cq+16*0]
    mova                 m1, [cq+16*1]
    vpbroadcastd         m2, [o(pw_5793x4)]
    paddw                m0, m0
    paddw                m1, m1
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    punpckhwd            m2, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m2
    punpcklwd            m0, m2
    jmp                tx2q
.pass2:
    vpbroadcastd         m2, [o(pw_5793x4)]
    paddw                m0, m0
    paddw                m1, m1
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    jmp m(iadst_4x4_internal).end

%macro WRITE_4X8 2 ; coefs[1-2]
    movd                xm4, [dstq+strideq*0]
    pinsrd              xm4, [dstq+strideq*1], 1
    movd                xm5, [dstq+strideq*2]
    pinsrd              xm5, [dstq+r3       ], 1
    pinsrd              xm4, [r2  +strideq*0], 2
    pinsrd              xm4, [r2  +strideq*1], 3
    pinsrd              xm5, [r2  +strideq*2], 2
    pinsrd              xm5, [r2  +r3       ], 3
    pmovzxbw             m4, xm4
    pmovzxbw             m5, xm5
    paddw                m4, m%1
    paddw                m5, m%2
    packuswb             m4, m5
    vextracti128        xm5, m4, 1
    movd   [dstq+strideq*0], xm4
    pextrd [dstq+strideq*1], xm4, 1
    pextrd [dstq+strideq*2], xm4, 2
    pextrd [dstq+r3       ], xm4, 3
    movd   [r2  +strideq*0], xm5
    pextrd [r2  +strideq*1], xm5, 1
    pextrd [r2  +strideq*2], xm5, 2
    pextrd [r2  +r3       ], xm5, 3
%endmacro

%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
    INV_TXFM_FN          %1, %2, %3, 4x8
%if %3 >= 0
%ifidn %1_%2, dct_identity
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pmulhrsw            xm1, xm0, [cq]
    vpbroadcastd        xm2, [o(pw_4096)]
    pmulhrsw            xm1, xm0
    pmulhrsw            xm1, xm2
    vpermq               m1, m1, q1100
    punpcklwd            m1, m1
    punpckldq            m0, m1, m1
    punpckhdq            m1, m1
    jmp m(iadst_4x8_internal).end3
%elifidn %1_%2, identity_dct
    movd                xm0, [cq+16*0]
    punpcklwd           xm0, [cq+16*1]
    movd                xm1, [cq+16*2]
    punpcklwd           xm1, [cq+16*3]
    vpbroadcastd        xm2, [o(pw_2896x8)]
    vpbroadcastd        xm3, [o(pw_5793x4)]
    vpbroadcastd        xm4, [o(pw_2048)]
    punpckldq           xm0, xm1
    pmulhrsw            xm0, xm2
    paddw               xm0, xm0
    pmulhrsw            xm0, xm3
    pmulhrsw            xm0, xm2
    pmulhrsw            xm0, xm4
    vpbroadcastq         m0, xm0
    mova                 m1, m0
    jmp m(iadst_4x8_internal).end3
%elifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_2048)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    mova                 m1, m0
    jmp m(iadst_4x8_internal).end4
%else ; adst_dct / flipadst_dct
    vpbroadcastw        xm0, [cq]
    vpbroadcastd        xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, [o(iadst4_dconly1a)]
    vpbroadcastd        xm2, [o(pw_2048)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
%ifidn %1, adst
    vpbroadcastq         m0, xm0
%else ; flipadst
    vpermq               m0, m0, q1111
%endif
    mova                 m1, m0
    jmp m(iadst_4x8_internal).end4
%endif
%endif
%endmacro

%macro IDCT8_1D_PACKED 0
    vpbroadcastd         m6, [o(pd_2048)]
    punpckhwd            m5, m3, m0 ; in7 in1
    punpckhwd            m4, m1, m2 ; in3 in5
    punpcklwd            m3, m1     ; in2 in6
    psubw                m1, m0, m2
    paddw                m0, m2
    punpcklqdq           m0, m1     ; in0+in4 in0-in4
    ITX_MUL2X_PACK        5, 1, 2, 6,  799, 4017, 1 ; t4a t7a
    ITX_MUL2X_PACK        4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
    ITX_MUL2X_PACK        3, 1, 2, 6, 1567, 3784    ; t3 t2
    vpbroadcastd         m6, [o(pw_2896x8)]
    psubsw               m2, m5, m4 ; t4 t7
    paddsw               m5, m4     ; t5a t6a
    pshufd               m4, m2, q1032
    psubw                m1, m2, m4
    paddw                m4, m2
    vpblendd             m4, m4, m1, 0xcc
    pmulhrsw             m0, m6     ; t0 t1
    pmulhrsw             m4, m6     ; t6 t5
    psubsw               m1, m0, m3 ; tmp3 tmp2
    paddsw               m0, m3     ; tmp0 tmp1
    shufps               m2, m5, m4, q1032 ; t7 t6
    vpblendd             m5, m5, m4, 0xcc  ; t4 t5
    psubsw               m3, m0, m2 ; out7 out6
    paddsw               m0, m2     ; out0 out1
    psubsw               m2, m1, m5 ; out4 out5
    paddsw               m1, m5     ; out3 out2
%endmacro

%macro IADST8_1D_PACKED 0
    vpbroadcastd         m6, [o(pd_2048)]
    punpckhwd            m0, m4, m3 ; 0 7
    punpckhwd            m1, m5, m2 ; 2 5
    punpcklwd            m2, m5     ; 4 3
    punpcklwd            m3, m4     ; 6 1
    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
    psubsw               m4, m0, m2 ; t4 t5
    paddsw               m0, m2     ; t0 t1
    psubsw               m5, m1, m3 ; t6 t7
    paddsw               m1, m3     ; t2 t3
    shufps               m2, m5, m4, q1032
    punpckhwd            m4, m2
    punpcklwd            m5, m2
    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567    ; t7a t6a
    psubsw               m2, m0, m1        ; t2 t3
    paddsw               m0, m1            ; out0 -out7
    psubsw               m1, m4, m5        ; t7 t6
    paddsw               m4, m5            ; out6 -out1
    vpbroadcastd         m5, [o(pw_2896x8)]
    vpblendd             m3, m0, m4, 0x33  ; out6 -out7
    vpblendd             m0, m0, m4, 0xcc  ; out0 -out1
    shufps               m4, m2, m1, q1032 ; t3 t7
    vpblendd             m1, m2, m1, 0xcc  ; t2 t6
    psubw                m2, m1, m4        ; t2-t3 t6-t7
    paddw                m1, m4            ; t2+t3 t6+t7
    pmulhrsw             m2, m5            ; out4 -out5
    pshufd               m1, m1, q1032
    pmulhrsw             m1, m5            ; out2 -out3
%endmacro

INIT_YMM avx2
INV_TXFM_4X8_FN dct, dct,      0
INV_TXFM_4X8_FN dct, identity, 7
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst

cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120
    vpermq               m1, [cq+32*1], q3120
    vpbroadcastd         m5, [o(pw_2896x8)]
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    IDCT4_1D_PACKED       5
    vbroadcasti128       m2, [o(deint_shuf)]
    shufps               m3, m0, m1, q1331
    shufps               m0, m0, m1, q0220
    pshufb               m0, m2
    pshufb               m1, m3, m2
    jmp                tx2q
.pass2:
    vextracti128        xm2, m0, 1
    vextracti128        xm3, m1, 1
    call .main
    vpbroadcastd         m4, [o(pw_2048)]
    vinserti128          m0, m0, xm2, 1
    vinserti128          m1, m1, xm3, 1
    pshufd               m1, m1, q1032
    jmp m(iadst_4x8_internal).end2
ALIGN function_align
.main:
    WRAP_XMM IDCT8_1D_PACKED
    ret

INV_TXFM_4X8_FN adst, dct,      0
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity

cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120
    vpermq               m1, [cq+32*1], q3120
    vpbroadcastd         m2, [o(pw_2896x8)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    call m(iadst_8x4_internal).main
    punpckhwd            m3, m0, m2
    punpcklwd            m0, m1
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    vextracti128        xm2, m0, 1
    vextracti128        xm3, m1, 1
    pshufd              xm4, xm0, q1032
    pshufd              xm5, xm1, q1032
    call .main
    vpbroadcastd         m4, [o(pw_2048)]
    vinserti128          m0, m0, xm2, 1
    vinserti128          m1, m1, xm3, 1
    pxor                 m5, m5
    psubw                m5, m4
.end:
    vpblendd             m4, m4, m5, 0xcc
.end2:
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    WIN64_RESTORE_XMM
.end3:
    pxor                 m2, m2
    mova          [cq+32*0], m2
    mova          [cq+32*1], m2
.end4:
    lea                  r2, [dstq+strideq*4]
    lea                  r3, [strideq*3]
    WRITE_4X8             0, 1
    RET
ALIGN function_align
.main:
    WRAP_XMM IADST8_1D_PACKED
    ret

INV_TXFM_4X8_FN flipadst, dct,      0
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity

cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120
    vpermq               m1, [cq+32*1], q3120
    vpbroadcastd         m2, [o(pw_2896x8)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    call m(iadst_8x4_internal).main
    punpcklwd            m3, m1, m0
    punpckhwd            m1, m2, m0
    punpcklwd            m0, m1, m3
    punpckhwd            m1, m3
    jmp                tx2q
.pass2:
    vextracti128        xm2, m0, 1
    vextracti128        xm3, m1, 1
    pshufd              xm4, xm0, q1032
    pshufd              xm5, xm1, q1032
    call m(iadst_4x8_internal).main
    vpbroadcastd         m5, [o(pw_2048)]
    vinserti128          m3, m3, xm1, 1
    vinserti128          m2, m2, xm0, 1
    pxor                 m4, m4
    psubw                m4, m5
    pshufd               m0, m3, q1032
    pshufd               m1, m2, q1032
    jmp m(iadst_4x8_internal).end

INV_TXFM_4X8_FN identity, dct,      3
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity

cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpermq               m2, [cq+32*0], q3120
    vpermq               m0, [cq+32*1], q3120
    vpbroadcastd         m3, [o(pw_2896x8)]
    vpbroadcastd         m4, [o(pw_5793x4)]
    punpcklwd            m1, m2, m0
    punpckhwd            m2, m0
    pmulhrsw             m1, m3
    pmulhrsw             m2, m3
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    paddw                m0, m0
    paddw                m1, m1
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    jmp                tx2q
.pass2:
    vpbroadcastd         m4, [o(pw_4096)]
    jmp m(iadst_4x8_internal).end2

%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
    INV_TXFM_FN          %1, %2, %3, 4x16
%if %3 >= 0
%ifidn %1_%2, dct_identity
    vpbroadcastd         m0, [o(pw_2896x8)]
    pmulhrsw             m0, [cq]
    vpbroadcastd         m1, [o(pw_16384)]
    vpbroadcastd         m2, [o(pw_5793x4)]
    vpbroadcastd         m3, [o(pw_2048)]
    pmulhrsw             m0, m1
    psllw                m0, 2
    pmulhrsw             m0, m2
    pmulhrsw             m3, m0
    punpcklwd            m1, m3, m3
    punpckhwd            m3, m3
    punpckldq            m0, m1, m1
    punpckhdq            m1, m1
    punpckldq            m2, m3, m3
    punpckhdq            m3, m3
    jmp m(iadst_4x16_internal).end3
%elifidn %1_%2, identity_dct
    movd                xm0, [cq+32*0]
    punpcklwd           xm0, [cq+32*1]
    movd                xm1, [cq+32*2]
    punpcklwd           xm1, [cq+32*3]
    vpbroadcastd        xm2, [o(pw_5793x4)]
    vpbroadcastd        xm3, [o(pw_16384)]
    vpbroadcastd        xm4, [o(pw_2896x8)]
    punpckldq           xm0, xm1
    paddw               xm0, xm0
    pmulhrsw            xm0, xm2
    pmulhrsw            xm0, xm3
    psrlw               xm3, 3 ; pw_2048
    pmulhrsw            xm0, xm4
    pmulhrsw            xm0, xm3
    vpbroadcastq         m0, xm0
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp m(iadst_4x16_internal).end3
%elifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    movd                xm3, [o(pw_2048)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm2
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm3
    vpbroadcastw         m0, xm0
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp m(iadst_4x16_internal).end4
%else ; adst_dct / flipadst_dct
    vpbroadcastw        xm0, [cq]
    pmulhrsw            xm0, [o(iadst4_dconly1a)]
    vpbroadcastd        xm1, [o(pw_16384)]
    vpbroadcastd        xm2, [o(pw_2896x8)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    psrlw               xm1, 3 ; pw_2048
    pmulhrsw            xm0, xm2
    pmulhrsw            xm0, xm1
%ifidn %1, adst
    vpbroadcastq         m0, xm0
%else ; flipadst
    vpermq               m0, m0, q1111
%endif
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp m(iadst_4x16_internal).end4
%endif
%endif
%endmacro

%macro IDCT16_1D_PACKED 0
    vpbroadcastd        m10, [o(pd_2048)]
.main2:
    punpckhwd            m8, m7, m0 ; dct16 in15 in1
    paddw                m9, m0, m4
    psubw                m0, m4
    punpcklqdq           m9, m0     ; dct4  in0+in2 in0-in2
    punpckhwd            m0, m3, m4 ; dct16 in7  in9
    punpcklwd            m7, m1     ; dct8  in7  in1
    punpckhwd            m1, m6     ; dct16 in3  in13
    punpcklwd            m3, m5     ; dct8  in3  in5
    punpckhwd            m5, m2     ; dct16 in11 in5
    punpcklwd            m6, m2     ; dct4  in3  in1
    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 3 ; t8a  t15a
    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 1 ; t4a  t7a
    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 1 ; t5a  t6a
    ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
    psubsw               m2, m8, m0 ; t9  t14
    paddsw               m8, m0     ; t8  t15
    psubsw               m0, m1, m5 ; t10 t13
    paddsw               m1, m5     ; t11 t12
%if mmsize > 16
    vbroadcasti128       m5, [o(deint_shuf)]
%else
    mova                 m5, [o(deint_shuf)]
%endif
    pshufb               m8, m5
    pshufb               m1, m5
    vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 4   ; t9a  t14a
    vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 4   ; t10a t13a
    psubsw               m5, m7, m3 ; t5a t6a
    paddsw               m7, m3     ; t4  t7
    psubsw               m4, m8, m1 ; t11a t12a
    paddsw               m8, m1     ; t8a  t15a
    paddsw               m1, m2, m0 ; t9   t14
    psubsw               m2, m0     ; t10  t13
    punpckhqdq           m0, m8, m1 ; t15a t14
    punpcklqdq           m8, m1     ; t8a  t9
    pshufd               m3, m5, q1032
    psubw                m1, m5, m3
    paddw                m3, m5
    vpblendd             m3, m3, m1, 0xcc ; t6 t5
    vpbroadcastd         m1, [o(pw_2896x8)]
    punpckhqdq           m5, m4, m2 ; t12a t13
    punpcklqdq           m2, m4, m2 ; t11a t10
    psubw                m4, m5, m2
    paddw                m5, m2
    pmulhrsw             m9, m1     ; t0   t1
    pmulhrsw             m3, m1     ; t6   t5
    pmulhrsw             m4, m1     ; t11  t10a
    pmulhrsw             m5, m1     ; t12  t13a
    shufps               m2, m7, m3, q1032 ; t7 t6
    vpblendd             m7, m7, m3, 0xcc  ; t4 t5
    psubsw               m1, m9, m6 ; dct4 out3 out2
    paddsw               m9, m6     ; dct4 out0 out1
    psubsw               m3, m9, m2 ; dct8 out7 out6
    paddsw               m9, m2     ; dct8 out0 out1
    psubsw               m2, m1, m7 ; dct8 out4 out5
    paddsw               m1, m7     ; dct8 out3 out2
    psubsw               m7, m9, m0 ; out15 out14
    paddsw               m0, m9     ; out0  out1
    psubsw               m6, m1, m5 ; out12 out13
    paddsw               m1, m5     ; out3  out2
    psubsw               m5, m2, m4 ; out11 out10
    paddsw               m2, m4     ; out4  out5
    psubsw               m4, m3, m8 ; out8  out9
    paddsw               m3, m8     ; out7  out6
%endmacro

INV_TXFM_4X16_FN dct, dct,      0
INV_TXFM_4X16_FN dct, identity, 15
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst

cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
    mova                 m0, [cq+32*0]
    mova                 m1, [cq+32*1]
    mova                 m2, [cq+32*2]
    mova                 m3, [cq+32*3]
    call m(idct_16x4_internal).main
    vpbroadcastd         m5, [o(pw_16384)]
    punpckhwd            m4, m2, m3
    punpcklwd            m2, m3
    punpckhwd            m3, m0, m1
    punpcklwd            m0, m1
    REPX   {pmulhrsw x, m5}, m0, m4, m2, m3
    punpckhdq            m1, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    jmp                tx2q
.pass2:
    vextracti128        xm4, m0, 1
    vextracti128        xm5, m1, 1
    vextracti128        xm6, m2, 1
    vextracti128        xm7, m3, 1
    call .main
    vinserti128          m0, m0, xm4, 1
    vinserti128          m1, m1, xm5, 1
    vpbroadcastd         m5, [o(pw_2048)]
    vinserti128          m2, m2, xm6, 1
    vinserti128          m3, m3, xm7, 1
    pshufd               m1, m1, q1032
    pshufd               m3, m3, q1032
    jmp m(iadst_4x16_internal).end2
ALIGN function_align
.main:
    WRAP_XMM IDCT16_1D_PACKED
    ret

INV_TXFM_4X16_FN adst, dct,      0
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
    mova                 m0, [cq+32*0]
    mova                 m1, [cq+32*1]
    mova                 m2, [cq+32*2]
    mova                 m3, [cq+32*3]
    call m(iadst_16x4_internal).main
    vpbroadcastd         m5, [o(pw_16384)]
    punpckhwd            m4, m2, m3
    punpcklwd            m2, m3
    punpckhwd            m3, m0, m1
    punpcklwd            m0, m1
    REPX   {pmulhrsw x, m5}, m4, m2, m3, m0
    punpckhdq            m1, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    jmp                tx2q
.pass2:
    call .main
    pshufd               m1, m1, q1032
    vpbroadcastd         m5, [o(pw_2048)]
    vpblendd             m4, m1, m0, 0x33
    vpblendd             m0, m0, m2, 0x33
    vpblendd             m2, m2, m3, 0x33
    vpblendd             m3, m3, m1, 0x33
    vpermq               m0, m0, q2031
    vpermq               m1, m2, q1302
    vpermq               m2, m3, q3120
    vpermq               m3, m4, q0213
    psubw                m6, m7, m5
.end:
    vpblendd             m5, m5, m6, 0xcc
.end2:
    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
    WIN64_RESTORE_XMM
.end3:
    pxor                 m4, m4
    mova          [cq+32*0], m4
    mova          [cq+32*1], m4
    mova          [cq+32*2], m4
    mova          [cq+32*3], m4
.end4:
    lea                  r2, [dstq+strideq*8]
    lea                  r3, [strideq*3]
    WRITE_4X8             0, 1
    lea                dstq, [dstq+strideq*4]
    lea                  r2, [r2  +strideq*4]
    WRITE_4X8             2, 3
    RET
ALIGN function_align
.main:
    vpblendd             m4, m1, m0, 0xcc
    vpblendd             m1, m1, m0, 0x33
    vpblendd             m5, m2, m3, 0xcc
    vpblendd             m2, m2, m3, 0x33
    vperm2i128           m3, m5, m2, 0x31
    vinserti128          m0, m1, xm4, 1 ; in0  in3  in2  in1
    vperm2i128           m4, m1, m4, 0x31
    vinserti128          m1, m5, xm2, 1 ; in4  in7  in6  in5
    pshufd               m3, m3, q1032  ; in12 in15 in13 in14
    pshufd               m2, m4, q1032  ; in11 in8  in9  in10
.main2:
    vpbroadcastd         m8, [o(pd_2048)]
    pxor                 m7, m7
    punpckhwd            m4, m3, m0 ; in12 in3  in14 in1
    punpcklwd            m0, m3     ; in0  in15 in2  in13
    punpckhwd            m3, m2, m1 ; in8  in7  in10 in5
    punpcklwd            m1, m2     ; in4  in11 in6  in9
    ITX_MUL4X_PACK        0, 2, 5, 6, 8,  201, 4091,  995, 3973, 3
    ITX_MUL4X_PACK        1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
    ITX_MUL4X_PACK        3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
    ITX_MUL4X_PACK        4, 2, 5, 6, 8, 3857, 1380, 4052,  601, 3
    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
    paddsw               m1, m4     ; t5a  t4a  t7a  t6a
    ITX_MUL4X_PACK        2, 4, 5, 6, 8,  799, 4017, 3406, 2276, 3
    psubw                m6, m7, m5
    ITX_MUL2X_PACK        3, 5, _, 8, 6, 4, 6
    vpbroadcastd         m6, [o(pw_m3784_1567)]
    vpbroadcastd         m5, [o(pw_1567_3784)]
    psubsw               m4, m0, m1 ; t5   t4   t7   t6
    paddsw               m0, m1     ; t1   t0   t3   t2
    psubsw               m1, m2, m3 ; t13a t12a t15a t14a
    paddsw               m2, m3     ; t9a  t8a  t11a t10a
    psubw                m3, m7, m6 ; pw_3784_m1567
    vpblendd             m6, m6, m3, 0xf0
    ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
    ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
    vbroadcasti128       m5, [o(deint_shuf)]
    pshufb               m0, m5
    pshufb               m2, m5
    vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
    vinserti128          m0, m0, xm2, 1    ; t1   t0   t9a  t8a
    vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
    vinserti128          m4, m4, xm1, 1    ; t4a  t5a  t12  t13
    vpbroadcastd         m5, [o(pw_2896x8)]
    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
    psubsw               m1, m0, m3        ; t3a t2a t11 t10
    paddsw               m0, m3     ; -out15  out0   out14 -out1
    paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
    psubsw               m4, m2            ; t6 t7 t14a t15a
    shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
    vpblendd             m4, m4, m1, 0x33  ; t3a t7  t11 t15a
    paddw                m1, m2, m4
    psubw                m2, m4
    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
    ret

INV_TXFM_4X16_FN flipadst, dct,      0
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity

cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
    mova                 m0, [cq+32*0]
    mova                 m1, [cq+32*1]
    mova                 m2, [cq+32*2]
    mova                 m3, [cq+32*3]
    call m(iadst_16x4_internal).main
    vpbroadcastd         m5, [o(pw_16384)]
    punpcklwd            m4, m1, m0
    punpckhwd            m1, m0
    punpcklwd            m0, m3, m2
    punpckhwd            m3, m2
    REPX   {pmulhrsw x, m5}, m4, m1, m0, m3
    punpckldq            m2, m3, m1
    punpckhdq            m3, m1
    punpckhdq            m1, m0, m4
    punpckldq            m0, m4
    jmp                tx2q
.pass2:
    call m(iadst_4x16_internal).main
    pshufd               m1, m1, q1032
    vpbroadcastd         m6, [o(pw_2048)]
    vpblendd             m4, m0, m2, 0x33
    vpblendd             m0, m0, m1, 0xcc
    vpblendd             m1, m1, m3, 0xcc
    vpblendd             m2, m2, m3, 0x33
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q0213
    vpermq               m2, m2, q2031
    vpermq               m3, m4, q1302
    psubw                m5, m7, m6
    jmp m(iadst_4x16_internal).end

INV_TXFM_4X16_FN identity, dct,      3
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity

cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
    mova                 m3, [cq+32*0]
    mova                 m2, [cq+32*1]
    mova                 m4, [cq+32*2]
    mova                 m0, [cq+32*3]
    vpbroadcastd         m5, [o(pw_5793x4)]
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    REPX   {paddw    x, x }, m1, m2, m3, m4
    REPX   {pmulhrsw x, m5}, m1, m2, m3, m4
    vpbroadcastd         m5, [o(pw_16384)]
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
    jmp                tx2q
.pass2:
    vpbroadcastd         m4, [o(pw_5793x4)]
    vpbroadcastd         m5, [o(pw_2048)]
    REPX   {psllw    x, 2 }, m0, m1, m2, m3
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    jmp m(iadst_4x16_internal).end2

%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]
    movq               xm%3, [dstq   ]
    movhps             xm%3, [dstq+%5]
    movq               xm%4, [dstq+%6]
    movhps             xm%4, [dstq+%7]
    pmovzxbw            m%3, xm%3
    pmovzxbw            m%4, xm%4
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1
%endif
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4
    vextracti128       xm%4, m%3, 1
    movq          [dstq   ], xm%3
    movhps        [dstq+%6], xm%3
    movq          [dstq+%5], xm%4
    movhps        [dstq+%7], xm%4
%endmacro

%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
    INV_TXFM_FN          %1, %2, %3, 8x4
%if %3 >= 0
%ifidn %1_%2, dct_identity
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pmulhrsw            xm1, xm0, [cq]
    vpbroadcastd        xm2, [o(pw_5793x4)]
    vpbroadcastd        xm3, [o(pw_2048)]
    pmulhrsw            xm1, xm0
    paddw               xm1, xm1
    pmulhrsw            xm1, xm2
    pmulhrsw            xm1, xm3
    punpcklwd           xm1, xm1
    punpckldq           xm0, xm1, xm1
    punpckhdq           xm1, xm1
    vpermq               m0, m0, q1100
    vpermq               m1, m1, q1100
%elifidn %1_%2, identity_dct
    mova                xm0, [cq+16*0]
    packusdw            xm0, [cq+16*1]
    mova                xm1, [cq+16*2]
    packusdw            xm1, [cq+16*3]
    vpbroadcastd        xm2, [o(pw_2896x8)]
    vpbroadcastd        xm3, [o(pw_2048)]
    packusdw            xm0, xm1
    pmulhrsw            xm0, xm2
    paddw               xm0, xm0
    pmulhrsw            xm0, xm2
    pmulhrsw            xm0, xm3
    vinserti128          m0, m0, xm0, 1
    mova                 m1, m0
%else
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    pmulhrsw            xm0, xm1
%ifidn %2, dct
    movd                xm2, [o(pw_2048)]
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    mova                 m1, m0
%else ; adst / flipadst
    vpbroadcastw         m0, xm0
    pmulhrsw             m0, [o(iadst4_dconly2a)]
    vpbroadcastd         m1, [o(pw_2048)]
    pmulhrsw             m1, m0
%ifidn %2, adst
    vpermq               m0, m1, q1100
    vpermq               m1, m1, q3322
%else ; flipadst
    vpermq               m0, m1, q2233
    vpermq               m1, m1, q0011
%endif
%endif
%endif
    jmp m(iadst_8x4_internal).end3
%endif
%endmacro

INV_TXFM_8X4_FN dct, dct,      0
INV_TXFM_8X4_FN dct, adst,     0
INV_TXFM_8X4_FN dct, flipadst, 0
INV_TXFM_8X4_FN dct, identity, 3

cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpbroadcastd        xm3, [o(pw_2896x8)]
    pmulhrsw            xm0, xm3, [cq+16*0]
    pmulhrsw            xm1, xm3, [cq+16*1]
    pmulhrsw            xm2, xm3, [cq+16*2]
    pmulhrsw            xm3,      [cq+16*3]
    call m(idct_4x8_internal).main
    vbroadcasti128       m4, [o(deint_shuf)]
    vinserti128          m3, m1, xm3, 1
    vinserti128          m1, m0, xm2, 1
    shufps               m0, m1, m3, q0220
    shufps               m1, m1, m3, q1331
    pshufb               m0, m4
    pshufb               m1, m4
    jmp                tx2q
.pass2:
    IDCT4_1D_PACKED
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q2031
    jmp m(iadst_8x4_internal).end2

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pshufd              xm4,      [cq+16*0], q1032
    pmulhrsw            xm3, xm0, [cq+16*3]
    pshufd              xm5,      [cq+16*1], q1032
    pmulhrsw            xm2, xm0, [cq+16*2]
    pmulhrsw            xm4, xm0
    pmulhrsw            xm5, xm0
    call m(iadst_4x8_internal).main
    vinserti128        m0, m0, xm2, 1
    vinserti128        m1, m1, xm3, 1
    punpckhwd          m2, m0, m1
    punpcklwd          m0, m1
    pxor               m3, m3
    psubw              m3, m2
    punpckhwd          m1, m0, m3
    punpcklwd          m0, m3
    jmp              tx2q
.pass2:
    call .main
    vpblendd             m1, m1, m2, 0xcc
.end:
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q3120
.end2:
    vpbroadcastd         m2, [o(pw_2048)]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    WIN64_RESTORE_XMM
.end3:
    pxor                 m2, m2
    mova          [cq+32*0], m2
    mova          [cq+32*1], m2
    lea                  r3, [strideq*3]
    WRITE_8X4             0, 1, 4, 5
    RET
ALIGN function_align
.main:
    IADST4_1D_PACKED
    ret

INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pshufd              xm4,      [cq+16*0], q1032
    pmulhrsw            xm3, xm0, [cq+16*3]
    pshufd              xm5,      [cq+16*1], q1032
    pmulhrsw            xm2, xm0, [cq+16*2]
    pmulhrsw            xm4, xm0
    pmulhrsw            xm5, xm0
    call m(iadst_4x8_internal).main
    vinserti128          m3, m3, xm1, 1
    vinserti128          m2, m2, xm0, 1
    punpckhwd            m1, m3, m2
    punpcklwd            m3, m2
    pxor                 m0, m0
    psubw                m0, m1
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    jmp                tx2q
.pass2:
    call m(iadst_8x4_internal).main
    vpblendd             m2, m2, m1, 0x33
    vpermq               m1, m0, q2031
    vpermq               m0, m2, q2031
    jmp m(iadst_8x4_internal).end2

INV_TXFM_8X4_FN identity, dct,      7
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
    mova                xm2,     [cq+16*0]
    mova                xm0,     [cq+16*1]
    vinserti128          m2, m2, [cq+16*2], 1
    vinserti128          m0, m0, [cq+16*3], 1
    vpbroadcastd         m3, [o(pw_2896x8)]
    punpcklwd            m1, m2, m0
    punpckhwd            m2, m0
    pmulhrsw             m1, m3
    pmulhrsw             m2, m3
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    paddw                m0, m0
    paddw                m1, m1
    jmp                tx2q
.pass2:
    vpbroadcastd         m2, [o(pw_5793x4)]
    paddw                m0, m0
    paddw                m1, m1
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    jmp m(iadst_8x4_internal).end

%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
    INV_TXFM_FN          %1, %2, %3, 8x8
%ifidn %1_%2, dct_identity
    vpbroadcastd        xm0, [o(pw_2896x8)]
    pmulhrsw            xm0, [cq]
    vpbroadcastd        xm1, [o(pw_16384)]
    pmulhrsw            xm0, xm1
    psrlw               xm1, 2 ; pw_4096
    pmulhrsw            xm0, xm1
    pshufb              xm0, [o(deint_shuf)]
    vpermq               m3, m0, q1100
    punpcklwd            m3, m3
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    jmp m(iadst_8x8_internal).end4
%elif %3 >= 0
%ifidn %1, dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm2
    psrlw               xm2, 3 ; pw_2048
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
.end:
    mov                 r2d, 2
.end2:
    lea                  r3, [strideq*3]
.loop:
    WRITE_8X4             0, 0, 1, 2
    lea                dstq, [dstq+strideq*4]
    dec                 r2d
    jg .loop
    RET
%else ; identity
    mova                 m0, [cq+32*0]
    punpcklwd            m0, [cq+32*1]
    mova                 m1, [cq+32*2]
    punpcklwd            m1, [cq+32*3]
    vpbroadcastd         m2, [o(pw_2896x8)]
    vpbroadcastd         m3, [o(pw_2048)]
    pxor                 m4, m4
    mova          [cq+32*0], m4
    mova          [cq+32*1], m4
    mova          [cq+32*2], m4
    mova          [cq+32*3], m4
    punpckldq            m0, m1
    vpermq               m1, m0, q3232
    vpermq               m0, m0, q1010
    punpcklwd            m0, m1
    pmulhrsw             m0, m2
    pmulhrsw             m0, m3
    jmp m(inv_txfm_add_dct_dct_8x8).end
%endif
%endif
%endmacro

INV_TXFM_8X8_FN dct, dct,      0
INV_TXFM_8X8_FN dct, identity, 7
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst

cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q3120 ; 0 1
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    vpermq               m1, [cq+32*1], q3120 ; 2 3
    call .main
    shufps               m4, m0, m1, q0220
    shufps               m5, m0, m1, q1331
    shufps               m1, m2, m3, q0220
    shufps               m3, m2, m3, q1331
    vbroadcasti128       m0, [o(deint_shuf)]
    vpbroadcastd         m2, [o(pw_16384)]
    REPX   {pshufb   x, m0}, m4, m5, m1, m3
    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3
    vinserti128          m0, m4, xm1, 1
    vperm2i128           m2, m4, m1, 0x31
    vinserti128          m1, m5, xm3, 1
    vperm2i128           m3, m5, m3, 0x31
    jmp                tx2q
.pass2:
    call .main
    vpbroadcastd         m4, [o(pw_2048)]
    vpermq               m0, m0, q3120
    vpermq               m1, m1, q2031
    vpermq               m2, m2, q3120
    vpermq               m3, m3, q2031
    jmp m(iadst_8x8_internal).end2
ALIGN function_align
.main:
    IDCT8_1D_PACKED
    ret

INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpermq               m4, [cq+32*0], q1302 ; 1 0
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m5, [cq+32*1], q1302 ; 3 2
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    call .main
    vpbroadcastd         m5, [o(pw_16384)]
    punpcklwd            m4, m0, m1
    punpckhwd            m0, m1
    punpcklwd            m1, m2, m3
    punpckhwd            m2, m3
    pxor                 m3, m3
    psubw                m3, m5 ; negate odd elements during rounding
    pmulhrsw             m4, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m5
    pmulhrsw             m2, m3
    punpcklwd            m3, m4, m0
    punpckhwd            m4, m0
    punpcklwd            m0, m1, m2
    punpckhwd            m1, m2
    vperm2i128           m2, m3, m0, 0x31
    vinserti128          m0, m3, xm0, 1
    vperm2i128           m3, m4, m1, 0x31
    vinserti128          m1, m4, xm1, 1
    jmp                tx2q
.pass2:
    pshufd               m4, m0, q1032
    pshufd               m5, m1, q1032
    call .main
    vpbroadcastd         m5, [o(pw_2048)]
    vpbroadcastd        xm4, [o(pw_4096)]
    psubw                m4, m5 ; lower half = 2048, upper half = -2048
.end:
    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
.end2:
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
.end3:
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    WIN64_RESTORE_XMM
.end4:
    pxor                 m4, m4
    mova          [cq+32*0], m4
    mova          [cq+32*1], m4
    mova          [cq+32*2], m4
    mova          [cq+32*3], m4
    lea                  r3, [strideq*3]
    WRITE_8X4             0, 1, 4, 5
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4             2, 3, 4, 5
    RET
ALIGN function_align
.main:
    IADST8_1D_PACKED
    ret

INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity

cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    vpermq               m4, [cq+32*0], q1302 ; 1 0
    vpermq               m3, [cq+32*3], q3120 ; 6 7
    vpermq               m5, [cq+32*1], q1302 ; 3 2
    vpermq               m2, [cq+32*2], q3120 ; 4 5
    call m(iadst_8x8_internal).main
    vpbroadcastd         m5, [o(pw_16384)]
    punpckhwd            m4, m3, m2
    punpcklwd            m3, m2
    punpckhwd            m2, m1, m0
    punpcklwd            m1, m0
    pxor                 m0, m0
    psubw                m0, m5
    pmulhrsw             m4, m0
    pmulhrsw             m3, m5
    pmulhrsw             m2, m0
    pmulhrsw             m1, m5
    punpckhwd            m0, m4, m3
    punpcklwd            m4, m3
    punpckhwd            m3, m2, m1
    punpcklwd            m2, m1
    vinserti128          m1, m0, xm3, 1
    vperm2i128           m3, m0, m3, 0x31
    vinserti128          m0, m4, xm2, 1
    vperm2i128           m2, m4, m2, 0x31
    jmp                tx2q
.pass2:
    pshufd               m4, m0, q1032
    pshufd               m5, m1, q1032
    call m(iadst_8x8_internal).main
    vpbroadcastd         m4, [o(pw_2048)]
    vpbroadcastd        xm5, [o(pw_4096)]
    psubw                m4, m5 ; lower half = -2048, upper half = 2048
    vpermq               m5, m3, q2031
    vpermq               m3, m0, q2031
    vpermq               m0, m2, q2031
    vpermq               m2, m1, q2031
    pmulhrsw             m1, m0, m4
    pmulhrsw             m0, m5, m4
    jmp m(iadst_8x8_internal).end3

INV_TXFM_8X8_FN identity, dct,      7
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
    mova                xm3,     [cq+16*0]
    mova                xm2,     [cq+16*1]
    vinserti128          m3, m3, [cq+16*4], 1
    vinserti128          m2, m2, [cq+16*5], 1
    mova                xm4,     [cq+16*2]
    mova                xm0,     [cq+16*3]
    vinserti128          m4, m4, [cq+16*6], 1
    vinserti128          m0, m0, [cq+16*7], 1
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    jmp                tx2q
.pass2:
    vpbroadcastd         m4, [o(pw_4096)]
    jmp m(iadst_8x8_internal).end

%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
    INV_TXFM_FN          %1, %2, %3, 8x16
%ifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    psrlw               xm2, 3 ; pw_2048
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    mov                 r2d, 4
    jmp m(inv_txfm_add_dct_dct_8x8).end2
%elifidn %1_%2, dct_identity
    WIN64_SPILL_XMM      13
    vpbroadcastd         m0, [o(pw_2896x8)]
    pmulhrsw             m7, m0, [cq]
    vpbroadcastd         m1, [o(pw_16384)]
    vpbroadcastd         m2, [o(pw_5793x4)]
    pxor                 m3, m3
    mova               [cq], m3
    pmulhrsw             m7, m0
    pmulhrsw             m7, m1
    psrlw                m1, 3 ; pw_2048
    psllw                m7, 2
    pmulhrsw             m7, m2
    pmulhrsw             m7, m1
    punpcklwd            m5, m7, m7
    punpckhwd            m7, m7
    punpcklwd            m4, m5, m5
    punpckhwd            m5, m5
    punpcklwd            m6, m7, m7
    punpckhwd            m7, m7
    vpermq               m0, m4, q1100
    vpermq               m1, m5, q1100
    vpermq               m2, m6, q1100
    vpermq               m3, m7, q1100
    vpermq               m4, m4, q3322
    vpermq               m5, m5, q3322
    vpermq               m6, m6, q3322
    vpermq               m7, m7, q3322
    jmp m(idct_8x16_internal).end4
%elifidn %1_%2, identity_dct
    movd                xm0, [cq+32*0]
    punpcklwd           xm0, [cq+32*1]
    movd                xm2, [cq+32*2]
    punpcklwd           xm2, [cq+32*3]
    add                  cq, 32*4
    movd                xm1, [cq+32*0]
    punpcklwd           xm1, [cq+32*1]
    movd                xm3, [cq+32*2]
    punpcklwd           xm3, [cq+32*3]
    vpbroadcastd        xm4, [o(pw_2896x8)]
    vpbroadcastd        xm5, [o(pw_2048)]
    xor                 eax, eax
    mov           [cq-32*4], eax
    mov           [cq-32*3], eax
    mov           [cq-32*2], eax
    mov           [cq-32*1], eax
    punpckldq           xm0, xm2
    punpckldq           xm1, xm3
    punpcklqdq          xm0, xm1
    pmulhrsw            xm0, xm4
    pmulhrsw            xm0, xm4
    pmulhrsw            xm0, xm5
    mov           [cq+32*0], eax
    mov           [cq+32*1], eax
    mov           [cq+32*2], eax
    mov           [cq+32*3], eax
    vinserti128          m0, m0, xm0, 1
    mov                 r2d, 4
    jmp m(inv_txfm_add_dct_dct_8x8).end2
%endif
%endmacro

%macro ITX_8X16_LOAD_COEFS 0
    vpbroadcastd         m4, [o(pw_2896x8)]
    pmulhrsw             m0, m4, [cq+32*0]
    add                  cq, 32*4
    pmulhrsw             m7, m4, [cq+32*3]
    pmulhrsw             m1, m4, [cq-32*3]
    pmulhrsw             m6, m4, [cq+32*2]
    pmulhrsw             m2, m4, [cq-32*2]
    pmulhrsw             m5, m4, [cq+32*1]
    pmulhrsw             m3, m4, [cq-32*1]
    pmulhrsw             m4,     [cq+32*0]
%endmacro

INV_TXFM_8X16_FN dct, dct,      0
INV_TXFM_8X16_FN dct, identity, 15
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst

cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
    ITX_8X16_LOAD_COEFS
    call m(idct_16x8_internal).main
    vpbroadcastd        m10, [o(pw_16384)]
.pass1_end:
    vperm2i128           m9, m3, m7, 0x31
    vinserti128          m3, m3, xm7, 1
    vperm2i128           m8, m2, m6, 0x31
    vinserti128          m2, m2, xm6, 1
    vperm2i128           m6, m1, m5, 0x31
    vinserti128          m1, m1, xm5, 1
    vperm2i128           m5, m0, m4, 0x31
    vinserti128          m0, m0, xm4, 1
    punpckhwd            m4, m2, m3
    punpcklwd            m2, m3
    punpckhwd            m3, m0, m1
    punpcklwd            m0, m1
.pass1_end2:
    punpckhwd            m7, m5, m6
    punpcklwd            m5, m6
    punpcklwd            m6, m8, m9
    punpckhwd            m8, m9
    REPX  {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
    punpckhdq            m1, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    punpckldq            m4, m5, m6
    punpckhdq            m5, m6
    punpckldq            m6, m7, m8
    punpckhdq            m7, m8
    jmp                tx2q
.pass2:
    call .main
    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
.end:
    vpbroadcastd         m8, [o(pw_2048)]
.end2:
    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
.end3:
    pxor                 m8, m8
    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
.end4:
    lea                  r3, [strideq*3]
    WRITE_8X4             0, 1, 8, 9
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4             2, 3, 0, 1
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4             4, 5, 0, 1
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4             6, 7, 0, 1
    RET
ALIGN function_align
.main:
    IDCT16_1D_PACKED
    ret

INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity

cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
    ITX_8X16_LOAD_COEFS
    call m(iadst_16x8_internal).main
    vpbroadcastd        m10, [o(pw_16384)]
    pslld                m9, m10, 17
    psubw               m10, m9 ; 16384, -16384
    jmp m(idct_8x16_internal).pass1_end
ALIGN function_align
.pass2:
    call .main
    vpbroadcastd         m9, [o(pw_2048)]
    vpbroadcastd        xm8, [o(pw_4096)]
    psubw                m8, m9
    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
    jmp m(idct_8x16_internal).end2
ALIGN function_align
.main:
    REPX {pshufd x, x, q1032}, m7, m1, m5, m3
.main2:
    vpbroadcastd        m10, [o(pd_2048)]
    punpckhwd            m8, m7, m0 ; in14 in1
    punpcklwd            m0, m7     ; in0  in15
    punpcklwd            m7, m6, m1 ; in12 in3
    punpckhwd            m1, m6     ; in2  in13
    punpckhwd            m6, m5, m2 ; in10 in5
    punpcklwd            m2, m5     ; in4  in11
    punpcklwd            m5, m4, m3 ; in8  in7
    punpckhwd            m3, m4     ; in6  in9
    ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 3 ; t0  t1
    ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 3 ; t2  t3
    ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 3 ; t4  t5
    ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 3 ; t6  t7
    ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 3 ; t8  t9
    ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
    ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
    ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 3 ; t14 t15
    psubsw               m4, m0, m5 ; t9a  t8a
    paddsw               m0, m5     ; t1a  t0a
    psubsw               m5, m1, m6 ; t11a t10a
    paddsw               m1, m6     ; t3a  t2a
    psubsw               m6, m2, m7 ; t13a t12a
    paddsw               m2, m7     ; t5a  t4a
    psubsw               m7, m3, m8 ; t15a t14a
    paddsw               m3, m8     ; t7a  t6a
    vpbroadcastd        m11, [o(pw_m4017_799)]
    vpbroadcastd        m12, [o(pw_799_4017)]
    pxor                 m9, m9
    ITX_MUL2X_PACK        4, 8, _, 10, 11, 12, 6 ; t8  t9
    psubw                m8, m9, m11 ; pw_4017_m799
    ITX_MUL2X_PACK        6, 12, _, 10, 12, 8, 6 ; t12 t13
    vpbroadcastd        m11, [o(pw_m2276_3406)]
    vpbroadcastd        m12, [o(pw_3406_2276)]
    ITX_MUL2X_PACK        5, 8, _, 10, 11, 12, 6 ; t10 t11
    psubw                m8, m9, m11 ; pw_2276_m3406
    ITX_MUL2X_PACK        7, 12, _, 10, 12, 8, 6 ; t14 t15
    psubsw               m8, m1, m3 ; t7   t6
    paddsw               m1, m3     ; t3   t2
    psubsw               m3, m0, m2 ; t5   t4
    paddsw               m0, m2     ; t1   t0
    psubsw               m2, m5, m7 ; t14a t15a
    paddsw               m7, m5     ; t10a t11a
    psubsw               m5, m4, m6 ; t12a t13a
    paddsw               m4, m6     ; t8a  t9a
    vpbroadcastd        m11, [o(pw_m3784_1567)]
    vpbroadcastd        m12, [o(pw_1567_3784)]
    ITX_MUL2X_PACK        3, 6, _, 10, 11, 12, 4 ; t4a t5a
    psubw                m6, m9, m11 ; pw_3784_m1567
    ITX_MUL2X_PACK        8, 12, _, 10, 12, 6, 4 ; t6a t7a
    vpbroadcastd        m11, [o(pw_m1567_3784)]
    vpbroadcastd        m12, [o(pw_3784_1567)]
    ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 4 ; t15 t14
    psubw                m6, m9, m11 ; pw_1567_m3784
    ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 4 ; t13 t12
    vbroadcasti128      m11, [o(deint_shuf)]
    vpbroadcastd        m12, [o(pw_2896x8)]
    psubsw               m6, m0, m1        ;  t3a    t2a
    paddsw               m0, m1            ; -out15  out0
    paddsw               m1, m2, m5        ; -out13  out2
    psubsw               m5, m2            ;  t15a   t14a
    paddsw               m2, m4, m7        ; -out1  out14
    psubsw               m4, m7            ;  t10    t11
    psubsw               m7, m3, m8        ;  t6     t7
    paddsw               m8, m3            ; -out3   out12
    REPX    {pshufb x, m11}, m6, m4, m0, m2
    vpblendd             m3, m6, m4, 0xcc  ;  t3a    t11
    shufps               m6, m6, m4, q1032 ;  t2a    t10
    vpblendd             m4, m5, m7, 0xcc  ;  t15a   t7
    shufps               m5, m5, m7, q1032 ;  t14a   t6
    shufps               m7, m2, m0, q1032 ;  out14 -out15
    vpblendd             m0, m0, m2, 0x33  ; -out1   out0
    paddw                m2, m5, m4        ; -out5   out4
    psubw                m5, m4            ;  out10 -out11
    psubw                m4, m6, m3        ;  out8  -out9
    paddw                m3, m6            ; -out7   out6
    shufps               m6, m8, m1, q1032 ;  out12 -out13
    vpblendd             m1, m1, m8, 0x33  ; -out3   out2
    REPX  {pmulhrsw x, m12}, m2, m3, m4, m5
    ret

INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity

cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
    ITX_8X16_LOAD_COEFS
    call m(iadst_16x8_internal).main
    vpbroadcastd         m9, [o(pw_16384)]
    pslld               m10, m9, 17
    psubw               m10, m9 ; -16384, 16384
    vperm2i128           m9, m4, m0, 0x31
    vinserti128          m0, m4, xm0, 1
    vperm2i128           m8, m5, m1, 0x31
    vinserti128          m4, m5, xm1, 1
    vperm2i128           m5, m7, m3, 0x31
    vinserti128          m3, m7, xm3, 1
    vinserti128          m1, m6, xm2, 1
    vperm2i128           m6, m6, m2, 0x31
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    punpcklwd            m0, m3, m1
    punpckhwd            m3, m1
    jmp m(idct_8x16_internal).pass1_end2
.pass2:
    call m(iadst_8x16_internal).main
    vpbroadcastd         m8, [o(pw_2048)]
    vpbroadcastd        xm9, [o(pw_4096)]
    psubw                m8, m9
    vpermq               m9, m0, q3120
    vpermq               m0, m7, q2031
    vpermq               m7, m1, q3120
    vpermq               m1, m6, q2031
    vpermq               m6, m2, q3120
    vpermq               m2, m5, q2031
    vpermq               m5, m3, q3120
    vpermq               m3, m4, q2031
    pmulhrsw             m0, m8
    pmulhrsw             m1, m8
    pmulhrsw             m2, m8
    pmulhrsw             m3, m8
    pmulhrsw             m4, m5, m8
    pmulhrsw             m5, m6, m8
    pmulhrsw             m6, m7, m8
    pmulhrsw             m7, m9, m8
    jmp m(idct_8x16_internal).end3

INV_TXFM_8X16_FN identity, dct,      7
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
    mova                xm3,     [cq+16*0]
    mova                xm2,     [cq+16*2]
    add                  cq, 16*8
    vinserti128          m3, m3, [cq+16*0], 1
    vinserti128          m2, m2, [cq+16*2], 1
    vpbroadcastd         m9, [o(pw_2896x8)]
    mova                xm4,     [cq-16*4]
    mova                xm5,     [cq-16*2]
    vinserti128          m4, m4, [cq+16*4], 1
    vinserti128          m5, m5, [cq+16*6], 1
    mova                xm7,     [cq-16*7]
    mova                xm6,     [cq-16*5]
    vinserti128          m7, m7, [cq+16*1], 1
    vinserti128          m6, m6, [cq+16*3], 1
    mova                xm8,     [cq-16*3]
    mova                xm0,     [cq-16*1]
    vinserti128          m8, m8, [cq+16*5], 1
    vinserti128          m0, m0, [cq+16*7], 1
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m5
    punpckhwd            m4, m5
    punpcklwd            m5, m7, m6
    punpckhwd            m7, m6
    punpcklwd            m6, m8, m0
    punpckhwd            m8, m0
    REPX   {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    punpckldq            m4, m5, m6
    punpckhdq            m5, m6
    punpckldq            m6, m7, m8
    punpckhdq            m7, m8
    jmp                tx2q
.pass2:
    vpbroadcastd         m8, [o(pw_5793x4)]
    REPX {psllw    x, 2       }, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {vpermq   x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX {pmulhrsw x, m8      }, m0, m1, m2, m3, m4, m5, m6, m7
    jmp m(idct_8x16_internal).end

%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
    pmovzxbw            m%3, [dstq+%5]
%ifnum %1
    paddw               m%3, m%1
%else
    paddw               m%3, %1
%endif
    pmovzxbw            m%4, [dstq+%6]
%ifnum %2
    paddw               m%4, m%2
%else
    paddw               m%4, %2
%endif
    packuswb            m%3, m%4
    vpermq              m%3, m%3, q3120
    mova          [dstq+%5], xm%3
    vextracti128  [dstq+%6], m%3, 1
%endmacro

%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
    INV_TXFM_FN          %1, %2, %3, 16x4
%if %3 >= 0
%ifidn %1_%2, dct_identity
    vpbroadcastd        xm3, [o(pw_2896x8)]
    pmulhrsw            xm3, [cq]
    vpbroadcastd        xm0, [o(pw_16384)]
    vpbroadcastd        xm1, [o(pw_5793x4)]
    pmulhrsw            xm3, xm0
    psrlw               xm0, 3 ; pw_2048
    paddw               xm3, xm3
    pmulhrsw            xm3, xm1
    pmulhrsw            xm3, xm0
    punpcklwd           xm3, xm3
    punpckldq           xm1, xm3, xm3
    punpckhdq           xm3, xm3
    vpbroadcastq         m0, xm1
    vpermq               m1, m1, q1111
    vpbroadcastq         m2, xm3
    vpermq               m3, m3, q1111
    jmp m(iadst_16x4_internal).end2
%elifidn %1_%2, identity_dct
    mova                xm0,     [cq+16*0]
    mova                xm2,     [cq+16*1]
    vinserti128          m0, m0, [cq+16*4], 1
    vinserti128          m2, m2, [cq+16*5], 1
    mova                xm1,     [cq+16*2]
    mova                xm3,     [cq+16*3]
    vinserti128          m1, m1, [cq+16*6], 1
    vinserti128          m3, m3, [cq+16*7], 1
    vpbroadcastd         m4, [o(pw_5793x4)]
    vpbroadcastd         m5, [o(pw_16384)]
    packusdw             m0, m2
    packusdw             m1, m3
    packusdw             m0, m1
    vpbroadcastd         m1, [o(pw_2896x8)]
    psllw                m0, 2
    pmulhrsw             m0, m4
    pmulhrsw             m0, m5
    psrlw                m5, 3 ; pw_2048
    pmulhrsw             m0, m1
    pmulhrsw             m0, m5
    mov                 r3d, 2
.end:
    pxor                 m3, m3
.end_loop:
    mova          [cq+32*0], m3
    mova          [cq+32*1], m3
    add                  cq, 32*2
    WRITE_16X2            0, 0, 1, 2, strideq*0, strideq*1
    lea                dstq, [dstq+strideq*2]
    dec                 r3d
    jg .end_loop
    RET
%else
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
%ifidn %2, dct
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    mov                 r2d, 2
.dconly:
    pmulhrsw            xm0, xm2
    movd                xm2, [pw_2048] ; intentionally rip-relative
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    pxor                 m3, m3
.dconly_loop:
    mova                xm1, [dstq]
    vinserti128          m1, m1, [dstq+strideq], 1
    punpckhbw            m2, m1, m3
    punpcklbw            m1, m3
    paddw                m2, m0
    paddw                m1, m0
    packuswb             m1, m2
    mova             [dstq], xm1
    vextracti128 [dstq+strideq], m1, 1
    lea                dstq, [dstq+strideq*2]
    dec                 r2d
    jg .dconly_loop
    RET
%else ; adst / flipadst
    movd                xm2, [o(pw_16384)]
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    pmulhrsw             m0, [o(iadst4_dconly2a)]
    vpbroadcastd         m3, [o(pw_2048)]
    mov                [cq], eobd
    pmulhrsw             m3, m0
%ifidn %2, adst
    vpbroadcastq         m0, xm3
    vpermq               m1, m3, q1111
    vpermq               m2, m3, q2222
    vpermq               m3, m3, q3333
%else ; flipadst
    vpermq               m0, m3, q3333
    vpermq               m1, m3, q2222
    vpermq               m2, m3, q1111
    vpbroadcastq         m3, xm3
%endif
    jmp m(iadst_16x4_internal).end3
%endif
%endif
%endif
%endmacro

INV_TXFM_16X4_FN dct, dct,      0
INV_TXFM_16X4_FN dct, adst,     0
INV_TXFM_16X4_FN dct, flipadst, 0
INV_TXFM_16X4_FN dct, identity, 3

cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
    mova                xm0, [cq+16*0]
    mova                xm1, [cq+16*1]
    mova                xm2, [cq+16*2]
    mova                xm3, [cq+16*3]
    mova                xm4, [cq+16*4]
    mova                xm5, [cq+16*5]
    mova                xm6, [cq+16*6]
    mova                xm7, [cq+16*7]
    call m(idct_4x16_internal).main
    vinserti128          m6, m2, xm6, 1
    vinserti128          m2, m0, xm4, 1
    vinserti128          m0, m1, xm5, 1
    vinserti128          m1, m3, xm7, 1
    punpcklwd            m3, m2, m6
    punpckhwd            m2, m6
    vpbroadcastd         m6, [o(pw_16384)]
    punpckhwd            m4, m0, m1
    punpcklwd            m0, m1
    mova                 m1, m6
    jmp m(iadst_16x4_internal).pass1_end
.pass2:
    call .main
    jmp m(iadst_16x4_internal).end
ALIGN function_align
.main:
    vpbroadcastd         m6, [o(pd_2048)]
    IDCT4_1D              0, 1, 2, 3, 4, 5, 6
    ret

INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity

cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q1230
    vpermq               m3, [cq+32*3], q2103
    vpermq               m1, [cq+32*1], q1230
    vpermq               m2, [cq+32*2], q2103
    call m(iadst_4x16_internal).main2
    pshufd               m2, m2, q1032
    punpcklwd            m4, m3, m1
    punpcklwd            m5, m2, m0
    punpckhwd            m0, m1
    punpckhwd            m2, m3
    vpbroadcastd         m1, [o(pw_16384)]
    vinserti128          m3, m0, xm2, 1
    vperm2i128           m2, m0, m2, 0x31
    vinserti128          m0, m4, xm5, 1
    vperm2i128           m4, m4, m5, 0x31
    psubw                m6, m7, m1
.pass1_end:
    pmulhrsw             m3, m1
    pmulhrsw             m2, m6
    pmulhrsw             m4, m1
    pmulhrsw             m0, m6
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    punpckldq            m2, m3, m4
    punpckhdq            m3, m4
    jmp                tx2q
.pass2:
    call .main
.end:
    vpbroadcastd         m4, [o(pw_2048)]
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    WIN64_RESTORE_XMM
.end2:
    pxor                 m4, m4
    mova          [cq+32*0], m4
    mova          [cq+32*1], m4
    mova          [cq+32*2], m4
    mova          [cq+32*3], m4
.end3:
    WRITE_16X2            0, 1, 4, 5, strideq*0, strideq*1
    lea                dstq, [dstq+strideq*2]
    WRITE_16X2            2, 3, 4, 5, strideq*0, strideq*1
    RET
ALIGN function_align
.main:
    vpbroadcastd         m7, [o(pw_3803_1321)]
    vpbroadcastd         m8, [o(pw_m1321_2482)]
    vpbroadcastd         m9, [o(pw_2482_3344)]
    punpcklwd            m4, m2, m0 ; in2 in0 l
    psubw                m6, m0, m2
    punpckhwd            m2, m0     ; in2 in0 h
    paddw                m6, m3     ; t2
    pmaddwd              m0, m7, m4 ; t0:02 l
    pmaddwd              m7, m2     ; t0:02 h
    pmaddwd              m4, m8     ; t1:02 l
    pmaddwd              m8, m2     ; t1:02 h
    punpckhwd            m2, m3, m1 ; in3 in1 h
    punpcklwd            m3, m1     ; in3 in1 l
    vpbroadcastd         m1, [o(pd_2048)]
    pmaddwd              m5, m9, m3
    pmaddwd              m9, m2
    paddd                m0, m1
    paddd                m7, m1
    paddd                m0, m5     ; t0 + t3 + 2048 l
    paddd                m7, m9     ; t0 + t3 + 2048 h
    vpbroadcastd         m9, [o(pw_m3803_3344)]
    pmaddwd              m5, m9, m2
    pmaddwd              m9, m3
    paddd                m5, m1     ; t1:13 + 2048 h
    paddd                m1, m9     ; t1:13 + 2048 l
    vpbroadcastd         m9, [o(pw_m3803_m6688)]
    pmaddwd              m2, m9
    pmaddwd              m3, m9
    paddd                m5, m8     ; t1 + t3 + 2048 h
    paddd                m1, m4     ; t1 + t3 + 2048 l
    paddd                m8, m7
    paddd                m4, m0
    paddd                m2, m8     ; t0 + t1 - t3 + 2048 h
    paddd                m3, m4     ; t0 + t1 - t3 + 2048 l
    REPX      {psrad x, 12}, m0, m7, m5, m1, m2, m3
    packssdw             m0, m7
    packssdw             m1, m5
    packssdw             m3, m2
    vpbroadcastd         m2, [o(pw_3344x8)]
    pmulhrsw             m2, m6
    ret

INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity

cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
    vpermq               m0, [cq+32*0], q1230
    vpermq               m3, [cq+32*3], q2103
    vpermq               m1, [cq+32*1], q1230
    vpermq               m2, [cq+32*2], q2103
    call m(iadst_4x16_internal).main2
    pshufd               m2, m2, q1032
    punpckhwd            m4, m3, m2
    punpckhwd            m5, m1, m0
    punpcklwd            m0, m2
    punpcklwd            m1, m3
    vpbroadcastd         m6, [o(pw_16384)]
    vinserti128          m3, m0, xm1, 1
    vperm2i128           m2, m0, m1, 0x31
    vinserti128          m0, m4, xm5, 1
    vperm2i128           m4, m4, m5, 0x31
    psubw                m1, m7, m6
    jmp m(iadst_16x4_internal).pass1_end
ALIGN function_align
.pass2:
    call m(iadst_16x4_internal).main
    vpbroadcastd         m4, [o(pw_2048)]
    REPX   {pmulhrsw x, m4}, m3, m2, m1, m0
    pxor                 m4, m4
    mova          [cq+32*0], m4
    mova          [cq+32*1], m4
    mova          [cq+32*2], m4
    mova          [cq+32*3], m4
    WRITE_16X2            3, 2, 4, 5, strideq*0, strideq*1
    lea                dstq, [dstq+strideq*2]
    WRITE_16X2            1, 0, 4, 5, strideq*0, strideq*1
    RET

INV_TXFM_16X4_FN identity, dct,      15
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity

cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
    mova                xm2,     [cq+16*0]
    mova                xm4,     [cq+16*1]
    vinserti128          m2, m2, [cq+16*4], 1
    vinserti128          m4, m4, [cq+16*5], 1
    mova                xm0,     [cq+16*2]
    mova                xm1,     [cq+16*3]
    vinserti128          m0, m0, [cq+16*6], 1
    vinserti128          m1, m1, [cq+16*7], 1
    vpbroadcastd         m5, [o(pw_5793x4)]
    punpcklwd            m3, m2, m4
    punpckhwd            m2, m4
    punpcklwd            m4, m0, m1
    punpckhwd            m0, m1
    REPX       {psllw x, 2}, m3, m2, m4, m0
    punpcklwd            m1, m3, m2
    punpckhwd            m3, m2
    punpcklwd            m2, m4, m0
    punpckhwd            m4, m0
    REPX   {pmulhrsw x, m5}, m1, m3, m2, m4
    vpbroadcastd         m5, [o(pw_16384)]
    punpcklqdq           m0, m1, m2
    punpckhqdq           m1, m2
    punpcklqdq           m2, m3, m4
    punpckhqdq           m3, m4
    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
    jmp                tx2q
.pass2:
    vpbroadcastd         m4, [o(pw_5793x4)]
    REPX   {paddw    x, x }, m0, m1, m2, m3
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    jmp m(iadst_16x4_internal).end

%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
    INV_TXFM_FN          %1, %2, %3, 16x8
%ifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    mov                 r2d, 4
    jmp m(inv_txfm_add_dct_dct_16x4).dconly
%elifidn %1_%2, dct_identity
    WIN64_SPILL_XMM      13
    vbroadcasti128       m7, [cq]
    vpbroadcastd         m0, [o(pw_2896x8)]
    vpbroadcastd         m1, [o(pw_16384)]
    pxor                xm2, xm2
    mova               [cq], xm2
    pmulhrsw             m7, m0
    pmulhrsw             m7, m0
    pmulhrsw             m7, m1
    psrlw                m1, 2 ; pw_4096
    pmulhrsw             m7, m1
    punpcklwd            m3, m7, m7
    punpckhwd            m7, m7
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    pshufd               m4, m7, q0000
    pshufd               m5, m7, q1111
    pshufd               m6, m7, q2222
    pshufd               m7, m7, q3333
    lea                  r3, [strideq*3]
    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
    jmp m(idct_16x8_internal).end4
%elifidn %1_%2, identity_dct
    mova                 m0, [cq+32*0]
    packusdw             m0, [cq+32*1]
    mova                 m2, [cq+32*2]
    packusdw             m2, [cq+32*3]
    mova                 m1, [cq+32*4]
    packusdw             m1, [cq+32*5]
    mova                 m3, [cq+32*6]
    packusdw             m3, [cq+32*7]
    vpbroadcastd         m4, [o(pw_2896x8)]
    vpbroadcastd         m5, [o(pw_5793x4)]
    packusdw             m0, m2
    packusdw             m1, m3
    vpbroadcastd         m2, [o(pw_16384)]
    packusdw             m0, m1
    vpermq               m1, m0, q3322
    vpermq               m0, m0, q1100
    punpcklwd            m0, m1
    pmulhrsw             m0, m4
    psllw                m0, 2
    pmulhrsw             m0, m5
    pmulhrsw             m0, m2
    psrlw                m2, 3 ; pw_2048
    pmulhrsw             m0, m4
    pmulhrsw             m0, m2
    mov                 r3d, 4
    jmp m(inv_txfm_add_identity_dct_16x4).end
%endif
%endmacro

%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
    vpbroadcastd         m8, [o(pw_2896x8)]
    vpermq               m0, [cq+32*0], q3120
    add                  cq, 32*4
    vpermq               m7, [cq+32*3], q%1
    vpermq               m1, [cq-32*3], q%1
    vpermq               m6, [cq+32*2], q3120
    vpermq               m2, [cq-32*2], q3120
    vpermq               m5, [cq+32*1], q%1
    vpermq               m3, [cq-32*1], q%1
    vpermq               m4, [cq+32*0], q3120
    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
%endmacro

INV_TXFM_16X8_FN dct, dct,      0
INV_TXFM_16X8_FN dct, identity, 7
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst

cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
    ITX_16X8_LOAD_COEFS 3120
    call m(idct_8x16_internal).main
    vpbroadcastd        m10, [o(pw_16384)]
    punpckhwd            m8, m0, m2
    punpcklwd            m0, m2
    punpckhwd            m2, m1, m3
    punpcklwd            m1, m3
    punpcklwd            m9, m4, m6
    punpckhwd            m4, m6
    punpcklwd            m6, m5, m7
    punpckhwd            m5, m7
    REPX  {pmulhrsw x, m10}, m8, m1, m4, m6
.pass1_end:
    REPX  {pmulhrsw x, m10}, m0, m2, m9, m5
    punpckhwd            m3, m0, m8
    punpcklwd            m0, m8
    punpckhwd            m8, m2, m1
    punpcklwd            m2, m1
    punpcklwd            m7, m9, m4
    punpckhwd            m9, m4
    punpcklwd            m4, m5, m6
    punpckhwd            m5, m6
    punpckhdq            m1, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m3, m8
    punpckhdq            m3, m8
    punpckldq            m6, m7, m4
    punpckhdq            m7, m4
    punpckldq            m8, m9, m5
    punpckhdq            m9, m5
    vperm2i128           m4, m0, m6, 0x31
    vinserti128          m0, m0, xm6, 1
    vperm2i128           m5, m1, m7, 0x31
    vinserti128          m1, m1, xm7, 1
    vperm2i128           m6, m2, m8, 0x31
    vinserti128          m2, m2, xm8, 1
    vperm2i128           m7, m3, m9, 0x31
    vinserti128          m3, m3, xm9, 1
    jmp                tx2q
.pass2:
    call .main
    vpbroadcastd         m8, [o(pw_2048)]
.end:
    REPX   {pmulhrsw x, m8}, m0, m2, m4, m6
.end2:
    REPX   {pmulhrsw x, m8}, m1, m3, m5, m7
    lea                  r3, [strideq*3]
    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
.end3:
    pxor                 m0, m0
    REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
.end4:
    lea                dstq, [dstq+strideq*4]
    WRITE_16X2            4, 5, 0, 1, strideq*0, strideq*1
    WRITE_16X2            6, 7, 0, 1, strideq*2, r3
    RET
ALIGN function_align
.main:
    vpbroadcastd        m10, [o(pd_2048)]
.main2:
    IDCT8_1D              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
    ret

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
    ITX_16X8_LOAD_COEFS 1302
    call m(iadst_8x16_internal).main2
    vpbroadcastd        m10, [o(pw_16384)]
    psubw               m11, m9, m10
    punpcklwd            m8, m0, m2
    punpckhwd            m0, m2
    punpckhwd            m2, m1, m3
    punpcklwd            m1, m3
    punpcklwd            m9, m4, m6
    punpckhwd            m4, m6
    punpckhwd            m6, m5, m7
    punpcklwd            m5, m7
    REPX  {pmulhrsw x, m11}, m8, m1, m4, m6
    jmp m(idct_16x8_internal).pass1_end
ALIGN function_align
.pass2:
    call .main
    vpbroadcastd         m9, [o(pw_2048)]
    pxor                 m8, m8
    psubw                m8, m9
    REPX   {pmulhrsw x, m9}, m0, m2, m4, m6
    jmp m(idct_16x8_internal).end2
ALIGN function_align
.main:
    vpbroadcastd        m10, [o(pd_2048)]
    ITX_MULSUB_2W         7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
    ITX_MULSUB_2W         3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
    ITX_MULSUB_2W         1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
    ITX_MULSUB_2W         5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
    psubsw               m8, m2, m6 ; t6
    paddsw               m2, m6     ; t2
    psubsw               m6, m0, m4 ; t4
    paddsw               m0, m4     ; t0
    psubsw               m4, m5, m1 ; t7
    paddsw               m5, m1     ; t3
    psubsw               m1, m7, m3 ; t5
    paddsw               m7, m3     ; t1
    ITX_MULSUB_2W         6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
    ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
    psubsw               m9, m6, m8 ;  t7
    paddsw               m6, m8     ;  out6
    vpbroadcastd         m8, [o(pw_2896x8)]
    psubsw               m3, m7, m5 ;  t3
    paddsw               m7, m5     ; -out7
    psubsw               m5, m0, m2 ;  t2
    paddsw               m0, m2     ;  out0
    psubsw               m2, m1, m4 ;  t6
    paddsw               m1, m4     ; -out1
    psubw                m4, m5, m3
    paddw                m3, m5
    psubw                m5, m2, m9
    paddw                m2, m9
    pmulhrsw             m2, m8     ;  out2
    pmulhrsw             m3, m8     ; -out3
    pmulhrsw             m4, m8     ;  out4
    pmulhrsw             m5, m8     ; -out5
    ret

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
    ITX_16X8_LOAD_COEFS 1302
    call m(iadst_8x16_internal).main2
    vpbroadcastd        m10, [o(pw_16384)]
    psubw                m9, m10
    punpcklwd            m8, m6, m4
    punpckhwd            m6, m4
    punpcklwd            m4, m7, m5
    punpckhwd            m7, m5
    punpckhwd            m5, m3, m1
    punpcklwd            m3, m1
    punpckhwd            m1, m2, m0
    punpcklwd            m2, m0
    REPX  {pmulhrsw x, m10}, m8, m4, m5, m1
    REPX  {pmulhrsw x, m9 }, m6, m7, m3, m2
    punpcklwd            m0, m7, m4
    punpckhwd            m7, m4
    punpckhwd            m4, m6, m8
    punpcklwd            m6, m8
    punpckhwd            m8, m3, m5
    punpcklwd            m3, m5
    punpcklwd            m5, m2, m1
    punpckhwd            m2, m1
    punpckhdq            m1, m0, m6
    punpckldq            m0, m6
    punpckldq            m6, m7, m4
    punpckhdq            m7, m4
    punpckhdq            m4, m3, m5
    punpckldq            m3, m5
    punpckldq            m5, m8, m2
    punpckhdq            m8, m2
    vinserti128          m2, m6, xm5, 1
    vperm2i128           m6, m6, m5, 0x31
    vperm2i128           m5, m1, m4, 0x31
    vinserti128          m1, m1, xm4, 1
    vperm2i128           m4, m0, m3, 0x31
    vinserti128          m0, m0, xm3, 1
    vinserti128          m3, m7, xm8, 1
    vperm2i128           m7, m7, m8, 0x31
    jmp                tx2q
.pass2:
    call m(iadst_16x8_internal).main
    vpbroadcastd         m9, [o(pw_2048)]
    pxor                 m8, m8
    psubw                m8, m9
    pmulhrsw            m10, m7, m8
    pmulhrsw             m7, m0, m9
    pmulhrsw             m0, m6, m9
    pmulhrsw             m6, m1, m8
    pmulhrsw             m1, m5, m8
    pmulhrsw             m5, m2, m9
    pmulhrsw             m2, m4, m9
    pmulhrsw             m4, m3, m8
    lea                  r3, [strideq*3]
    WRITE_16X2           10, 0, 8, 9, strideq*0, strideq*1
    WRITE_16X2            1, 2, 0, 1, strideq*2, r3
    jmp m(idct_16x8_internal).end3

INV_TXFM_16X8_FN identity, dct,      15
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
    mova                xm7,     [cq+16*0]
    mova                xm2,     [cq+16*1]
    add                  cq, 16*8
    vpbroadcastd         m3, [o(pw_2896x8)]
    vinserti128          m7, m7, [cq+16*0], 1
    vinserti128          m2, m2, [cq+16*1], 1
    mova                xm6,     [cq-16*6]
    mova                xm4,     [cq-16*5]
    vinserti128          m6, m6, [cq+16*2], 1
    vinserti128          m4, m4, [cq+16*3], 1
    mova                xm8,     [cq-16*4]
    mova                xm5,     [cq-16*3]
    vinserti128          m8, m8, [cq+16*4], 1
    vinserti128          m5, m5, [cq+16*5], 1
    mova                xm0,     [cq-16*2]
    mova                xm1,     [cq-16*1]
    vinserti128          m0, m0, [cq+16*6], 1
    vinserti128          m1, m1, [cq+16*7], 1
    vpbroadcastd         m9, [o(pw_5793x4)]
    vpbroadcastd        m10, [o(pw_16384)]
    REPX   {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
    punpcklwd            m3, m7, m2
    punpckhwd            m7, m2
    punpcklwd            m2, m6, m4
    punpckhwd            m6, m4
    punpcklwd            m4, m8, m5
    punpckhwd            m8, m5
    punpcklwd            m5, m0, m1
    punpckhwd            m0, m1
    REPX       {psllw x, 2}, m3, m7, m2, m6, m4, m8, m5, m0
    punpckldq            m1, m3, m2
    punpckhdq            m3, m2
    punpckldq            m2, m4, m5
    punpckhdq            m4, m5
    punpckldq            m5, m7, m6
    punpckhdq            m7, m6
    punpckldq            m6, m8, m0
    punpckhdq            m8, m0
    REPX   {pmulhrsw x, m9}, m1, m3, m2, m4, m5, m7, m6, m8
    punpcklqdq           m0, m1, m2
    punpckhqdq           m1, m2
    punpcklqdq           m2, m3, m4
    punpckhqdq           m3, m4
    punpcklqdq           m4, m5, m6
    punpckhqdq           m5, m6
    punpcklqdq           m6, m7, m8
    punpckhqdq           m7, m8
    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    jmp                tx2q
.pass2:
    vpbroadcastd         m8, [o(pw_4096)]
    jmp m(idct_16x8_internal).end

%define o_base pw_5 + 128

%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
    INV_TXFM_FN          %1, %2, %3, 16x16
%ifidn %1_%2, dct_dct
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_8192)]
    mov                [cq], eobd
    mov                 r2d, 8
    jmp m(inv_txfm_add_dct_dct_16x4).dconly
%elifidn %1_%2, dct_identity
    WIN64_SPILL_XMM       7
    vpbroadcastd         m3, [o(pw_2896x8)]
    pmulhrsw             m3, [cq]
    vpbroadcastd         m0, [o(pw_8192)]
    vpbroadcastd         m1, [o(pw_5793x4)]
    vpbroadcastw         m4, [o(deint_shuf)] ; pb_0_1
    pcmpeqb              m5, m5
    pxor                 m6, m6
    mova               [cq], m6
    paddb                m5, m5 ; pb_m2
    pmulhrsw             m3, m0
    psrlw                m0, 2  ; pw_2048
    psllw                m3, 2
    pmulhrsw             m3, m1
    pmulhrsw             m3, m0
    mov                 r3d, 8
.loop:
    mova                xm1, [dstq]
    vinserti128          m1, m1, [dstq+strideq*8], 1
    pshufb               m0, m3, m4
    psubb                m4, m5 ; += 2
    punpckhbw            m2, m1, m6
    punpcklbw            m1, m6
    paddw                m2, m0
    paddw                m1, m0
    packuswb             m1, m2
    mova             [dstq], xm1
    vextracti128 [dstq+strideq*8], m1, 1
    add                dstq, strideq
    dec                 r3d
    jg .loop
    RET
%elifidn %1_%2, identity_dct
    movd                xm0,     [cq+32*0 ]
    movd                xm2,     [cq+32*1 ]
    movd                xm1,     [cq+32*2 ]
    movd                xm3,     [cq+32*3 ]
    vinserti128          m0, m0, [cq+32*8 ], 1
    vinserti128          m2, m2, [cq+32*9 ], 1
    vinserti128          m1, m1, [cq+32*10], 1
    vinserti128          m3, m3, [cq+32*11], 1
    punpcklwd            m0, m2
    punpcklwd            m1, m3
    punpckldq            m0, m1
    movd                xm1,     [cq+32*4 ]
    movd                xm3,     [cq+32*5 ]
    movd                xm2,     [cq+32*6 ]
    movd                xm4,     [cq+32*7 ]
    vinserti128          m1, m1, [cq+32*12], 1
    vinserti128          m3, m3, [cq+32*13], 1
    vinserti128          m2, m2, [cq+32*14], 1
    vinserti128          m4, m4, [cq+32*15], 1
    punpcklwd            m1, m3
    vpbroadcastd         m3, [o(pw_5793x4)]
    punpcklwd            m2, m4
    vpbroadcastd         m4, [o(pw_8192)]
    punpckldq            m1, m2
    vpbroadcastd         m2, [o(pw_2896x8)]
    punpcklqdq           m0, m1
    psllw                m0, 2
    pmulhrsw             m0, m3
    pmulhrsw             m0, m4
    psrlw                m4, 2 ; pw_2048
    pmulhrsw             m0, m2
    pmulhrsw             m0, m4
    mov                 r3d, 8
    jmp m(inv_txfm_add_identity_dct_16x4).end
%endif
%endmacro

%macro ITX_16X16_LOAD_COEFS 0
    mova                 m0, [cq+32*0]
    mova                 m1, [cq+32*1]
    mova                 m2, [cq+32*2]
    mova                 m3, [cq+32*3]
    add                  cq, 32*8
    mova                 m4, [cq-32*4]
    mova                 m5, [cq-32*3]
    mova                 m6, [cq-32*2]
    mova                 m7, [cq-32*1]
    mova                 m8, [cq+32*0]
    mova                 m9, [cq+32*1]
    mova                m10, [cq+32*2]
    mova                m11, [cq+32*3]
    mova                m12, [cq+32*4]
    mova                m13, [cq+32*5]
    mova                m14, [cq+32*6]
    mova                m15, [cq+32*7]
    mova              [rsp], m15
%endmacro

INV_TXFM_16X16_FN dct, dct,      0
INV_TXFM_16X16_FN dct, identity, 15
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst

cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
    ITX_16X16_LOAD_COEFS
    call .main
.pass1_end:
    vpbroadcastd         m1, [o(pw_8192)]
    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
    vextracti128 [rsp+16*5], m8, 1
    mova         [rsp+16*1], xm8
.pass1_end2:
    vextracti128 [rsp+16*4], m0, 1
    mova         [rsp+16*0], xm0
    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
    pmulhrsw             m1, [rsp+32*1]
    vperm2i128           m8, m1, m9, 0x31
    vinserti128          m1, m1, xm9, 1
    vperm2i128           m9, m2, m10, 0x31
    vinserti128          m2, m2, xm10, 1
    vperm2i128          m10, m3, m11, 0x31
    vinserti128          m3, m3, xm11, 1
    vperm2i128          m11, m4, m12, 0x31
    vinserti128          m4, m4, xm12, 1
    vperm2i128          m12, m5, m13, 0x31
    vinserti128          m5, m5, xm13, 1
    vperm2i128          m13, m6, m14, 0x31
    vinserti128          m6, m6, xm14, 1
    vperm2i128          m14, m7, m15, 0x31
    vinserti128          m7, m7, xm15, 1
    mova                m15, [rsp+32*2]
.pass1_end3:
    punpcklwd            m0, m9, m10
    punpckhwd            m9, m10
    punpcklwd           m10, m15, m8
    punpckhwd           m15, m8
    punpckhwd            m8, m11, m12
    punpcklwd           m11, m12
    punpckhwd           m12, m13, m14
    punpcklwd           m13, m14
    punpckhdq           m14, m11, m13
    punpckldq           m11, m13
    punpckldq           m13, m15, m9
    punpckhdq           m15, m9
    punpckldq            m9, m10, m0
    punpckhdq           m10, m0
    punpckhdq            m0, m8, m12
    punpckldq            m8, m12
    punpcklqdq          m12, m13, m8
    punpckhqdq          m13, m8
    punpcklqdq           m8, m9, m11
    punpckhqdq           m9, m11
    punpckhqdq          m11, m10, m14
    punpcklqdq          m10, m14
    punpcklqdq          m14, m15, m0
    punpckhqdq          m15, m0
    mova                 m0, [rsp]
    mova              [rsp], m15
    punpckhwd           m15, m4, m5
    punpcklwd            m4, m5
    punpckhwd            m5, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m6, m7
    punpcklwd            m6, m7
    punpckhwd            m7, m2, m3
    punpcklwd            m2, m3
    punpckhdq            m3, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m4, m6
    punpckhdq            m4, m6
    punpckhdq            m6, m5, m7
    punpckldq            m5, m7
    punpckldq            m7, m15, m1
    punpckhdq           m15, m1
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m4
    punpckhqdq           m3, m4
    punpcklqdq           m4, m5, m7
    punpckhqdq           m5, m7
    punpckhqdq           m7, m6, m15
    punpcklqdq           m6, m15
    jmp                tx2q
.pass2:
    call .main
.end:
    vpbroadcastd         m1, [o(pw_2048)]
    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
    mova              [rsp], m6
.end2:
    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
    pmulhrsw             m1, [rsp+32*1]
    lea                  r3, [strideq*3]
    WRITE_16X2            0,  1,  6,  0, strideq*0, strideq*1
    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
    lea                dstq, [dstq+strideq*4]
    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
    WRITE_16X2        [rsp],  7,  0,  1, strideq*2, r3
.end3:
    pxor                 m2, m2
    REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
    lea                dstq, [dstq+strideq*4]
    WRITE_16X2            8,  9,  0,  1, strideq*0, strideq*1
    WRITE_16X2           10, 11,  0,  1, strideq*2, r3
    REPX {mova [cq+32*x], m2},  0,  1,  2,  3,  4,  5,  6,  7
    lea                dstq, [dstq+strideq*4]
    WRITE_16X2           12, 13,  0,  1, strideq*0, strideq*1
    WRITE_16X2           14, 15,  0,  1, strideq*2, r3
    RET
ALIGN function_align
.main:
    vpbroadcastd        m15, [o(pd_2048)]
    mova [rsp+gprsize+32*1], m1
    mova [rsp+gprsize+32*2], m9
    IDCT8_1D              0,  2,  4,  6,  8, 10, 12, 14,  1,  9, 15
    mova                 m1, [rsp+gprsize+32*2] ; in9
    mova [rsp+gprsize+32*2], m14 ; tmp7
    mova                 m9, [rsp+gprsize+32*1] ; in1
    mova [rsp+gprsize+32*1], m10 ; tmp5
    mova                m14, [rsp+gprsize+32*0] ; in15
    mova [rsp+gprsize+32*0], m6  ; tmp3
    IDCT16_1D_ODDHALF     9,  3,  5,  7,  1, 11, 13, 14,  6, 10, 15
    mova                 m6, [rsp+gprsize+32*1] ; tmp5
    psubsw              m15, m0, m14  ; out15
    paddsw               m0, m14      ; out0
    psubsw              m14, m2, m13  ; out14
    paddsw               m2, m13      ; out1
    mova [rsp+gprsize+32*1], m2
    psubsw              m13, m4, m11  ; out13
    paddsw               m2, m4, m11  ; out2
    psubsw              m11, m8, m7   ; out11
    paddsw               m4, m8, m7   ; out4
    mova                 m7, [rsp+gprsize+32*2] ; tmp7
    psubsw              m10, m6, m5   ; out10
    paddsw               m5, m6       ; out5
    psubsw               m8, m7, m9   ; out8
    paddsw               m7, m9       ; out7
    psubsw               m9, m12, m3  ; out9
    paddsw               m6, m12, m3  ; out6
    mova                 m3, [rsp+gprsize+32*0] ; tmp3
    psubsw              m12, m3, m1   ; out12
    paddsw               m3, m1       ; out3
    ret

INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst

cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
    ITX_16X16_LOAD_COEFS
    call .main
    vpbroadcastd         m1, [o(pw_8192)]
    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
    vextracti128 [rsp+16*5], m8, 1
    mova         [rsp+16*1], xm8
    pxor                 m8, m8
    psubw                m1, m8, m1
    jmp m(idct_16x16_internal).pass1_end2
ALIGN function_align
.pass2:
    call .main
    vpbroadcastd         m1, [o(pw_2048)]
    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
    mova         [rsp+32*0], m6
    pxor                 m6, m6
    psubw                m1, m6, m1
    jmp m(idct_16x16_internal).end2
ALIGN function_align
.main:
    vpbroadcastd        m15, [o(pd_2048)]
    mova [rsp+gprsize+32*1], m0
    mova [rsp+gprsize+32*2], m4
    ITX_MULSUB_2W        13,  2,  0,  4, 15,  995, 3973 ; t3,  t2
    ITX_MULSUB_2W         9,  6,  0,  4, 15, 2440, 3290 ; t7,  t6
    ITX_MULSUB_2W         5, 10,  0,  4, 15, 3513, 2106 ; t11, t10
    ITX_MULSUB_2W         1, 14,  0,  4, 15, 4052,  601 ; t15, t14
    psubsw               m0, m2, m10  ; t10a
    paddsw               m2, m10      ; t2a
    psubsw              m10, m13, m5  ; t11a
    paddsw              m13, m5       ; t3a
    psubsw               m5, m6, m14  ; t14a
    paddsw               m6, m14      ; t6a
    psubsw              m14, m9, m1   ; t15a
    paddsw               m9, m1       ; t7a
    ITX_MULSUB_2W         0, 10,  1,  4, 15, 3406, 2276 ; t11, t10
    ITX_MULSUB_2W        14,  5,  1,  4, 15, 2276, 3406 ; t14, t15
    psubsw               m1, m10, m14 ; t14a
    paddsw              m10, m14      ; t10a
    psubsw              m14, m0, m5   ; t15a
    paddsw               m0, m5       ; t11a
    psubsw               m5, m2, m6   ; t6
    paddsw               m2, m6       ; t2
    psubsw               m6, m13, m9  ; t7
    paddsw              m13, m9       ; t3
    ITX_MULSUB_2W         6,  5,  4,  9, 15, 3784, 1567 ; t6a, t7a
    ITX_MULSUB_2W        14,  1,  4,  9, 15, 3784, 1567 ; t14, t15
    mova                 m9, [rsp+gprsize+32*0] ; in15
    mova [rsp+gprsize+32*0], m10 ; t10a
    mova                 m4, [rsp+gprsize+32*1] ; in0
    mova [rsp+gprsize+32*1], m6  ; t6a
    mova                 m6, [rsp+gprsize+32*2] ; in4
    mova [rsp+gprsize+32*2], m2  ; t2
    ITX_MULSUB_2W         9,  4,  2, 10, 15,  201, 4091 ; t1,  t0
    ITX_MULSUB_2W        11,  6,  2, 10, 15, 1751, 3703 ; t5,  t4
    ITX_MULSUB_2W         7,  8,  2, 10, 15, 3035, 2751 ; t9,  t8
    ITX_MULSUB_2W         3, 12,  2, 10, 15, 3857, 1380 ; t13, t12
    psubsw              m10, m4, m8  ; t8a
    paddsw               m8, m4      ; t0a
    psubsw               m4, m9, m7  ; t9a
    paddsw               m9, m7      ; t1a
    psubsw               m7, m6, m12 ; t12a
    paddsw               m6, m12     ; t4a
    psubsw              m12, m11, m3 ; t13a
    paddsw              m11, m3      ; t5a
    ITX_MULSUB_2W        10,  4,  2,  3, 15,  799, 4017 ; t9,  t8
    ITX_MULSUB_2W        12,  7,  2,  3, 15, 4017,  799 ; t12, t13
    psubsw               m3, m9, m11 ; t5
    paddsw               m9, m11     ; t1
    psubsw              m11, m4, m12 ; t12a
    paddsw               m4, m12     ; t8a
    paddsw              m12, m8, m6  ; t0
    psubsw               m8, m6      ; t4
    paddsw               m6, m10, m7 ; t9a
    psubsw              m10, m7      ; t13a
    ITX_MULSUB_2W         8,  3,  2,  7, 15, 1567, 3784 ; t5a, t4a
    ITX_MULSUB_2W        11, 10,  2,  7, 15, 1567, 3784 ; t13, t12
    mova                 m7, [rsp+gprsize+32*0] ; t10a
    mova                 m2, [rsp+gprsize+32*1] ; t6a
    paddsw              m15, m9, m13  ; -out15
    psubsw               m9, m13      ;  t3a
    paddsw              m13, m11, m1  ; -out13
    psubsw              m11, m1       ;  t15a
    psubsw               m1, m4, m7   ;  t10
    paddsw               m7, m4       ; -out1
    psubsw               m4, m3, m2   ;  t6
    paddsw               m3, m2       ; -out3
    paddsw               m2, m10, m14 ;  out2
    psubsw              m10, m14      ;  t14a
    paddsw              m14, m6, m0   ;  out14
    psubsw               m6, m0       ;  t11
    mova                 m0, [rsp+gprsize+32*2] ; t2
    mova [rsp+gprsize+32*1], m7
    psubsw               m7, m12, m0  ;  t2a
    paddsw               m0, m12      ;  out0
    paddsw              m12, m8, m5   ;  out12
    psubsw               m8, m5       ;  t7
    paddw                m5, m10, m11 ; -out5
    psubw               m10, m11      ;  out10
    psubw               m11, m4, m8   ; -out11
    paddw                m4, m8       ;  out4
    psubw                m8, m7, m9   ;  out8
    paddw                m7, m9       ; -out7
    psubw                m9, m1, m6   ; -out9
    paddw                m6, m1       ;  out6
    vpbroadcastd         m1, [o(pw_2896x8)]
    REPX   {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
    ret

INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst

cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
    ITX_16X16_LOAD_COEFS
    call m(iadst_16x16_internal).main
    vpbroadcastd         m1, [o(pw_8192)]
    pmulhrsw             m6, m1
    mova         [rsp+32*2], m6
    pmulhrsw             m6, m1, m4
    pmulhrsw             m4, m1, m10
    pmulhrsw            m10, m1, m12
    pmulhrsw            m12, m1, m2
    pmulhrsw             m2, m1, m8
    pmulhrsw             m8, m1, m14
    pmulhrsw            m14, m1, m0
    pxor                 m0, m0
    psubw                m0, m1
    REPX   {pmulhrsw x, m0}, m3, m5, m7, m11, m15
    pmulhrsw             m1, m0, m9
    pmulhrsw             m9, m0, m13
    pmulhrsw             m0, [rsp+32*1]
    mova         [rsp+16*0], xm15
    mova         [rsp+16*1], xm7
    vperm2i128          m15, m15, m7, 0x31
    vinserti128          m7, m2, xm14, 1
    vperm2i128          m14, m2, m14, 0x31
    vinserti128          m2, m9, xm5, 1
    vperm2i128           m9, m9, m5, 0x31
    vinserti128          m5, m4, xm12, 1
    vperm2i128          m12, m4, m12, 0x31
    vinserti128          m4, m11, xm3, 1
    vperm2i128          m11, m11, m3, 0x31
    vinserti128          m3, m10, xm6, 1
    vperm2i128          m10, m10, m6, 0x31
    vinserti128          m6, m1, xm0, 1
    vperm2i128          m13, m1, m0, 0x31
    vinserti128          m1, m8, [rsp+32*2], 1
    vperm2i128           m8, m8, [rsp+32*2], 0x31
    jmp m(idct_16x16_internal).pass1_end3
.pass2:
    call m(iadst_16x16_internal).main
    vpbroadcastd         m1, [o(pw_2048)]
    pmulhrsw             m0, m1
    pmulhrsw             m8, m1
    mova         [rsp+32*0], m0
    mova         [rsp+32*2], m8
    pxor                 m0, m0
    psubw                m0, m1
    pmulhrsw             m8, m0, m7
    pmulhrsw             m7, m0, m9
    pmulhrsw             m9, m1, m6
    pmulhrsw             m6, m1, m10
    pmulhrsw            m10, m0, m5
    pmulhrsw             m5, m0, m11
    pmulhrsw            m11, m1, m4
    pmulhrsw             m4, m1, m12
    pmulhrsw            m12, m0, m3
    pmulhrsw             m3, m0, m13
    pmulhrsw            m13, m1, m2
    pmulhrsw             m1, m14
    pmulhrsw            m14, m0, [rsp+32*1]
    pmulhrsw             m0, m15
    lea                  r3, [strideq*3]
    WRITE_16X2            0,  1,  2,  0, strideq*0, strideq*1
    mova                m15, [rsp+32*0]
    WRITE_16X2            3,  4,  0,  1, strideq*2, r3
    lea                dstq, [dstq+strideq*4]
    WRITE_16X2            5,  6,  0,  1, strideq*0, strideq*1
    WRITE_16X2            7, [rsp+32*2],  0,  1, strideq*2, r3
    jmp m(idct_16x16_internal).end3

INV_TXFM_16X16_FN identity, dct,      15
INV_TXFM_16X16_FN identity, identity

cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
    mova                xm0,      [cq+16*0]
    mova               xm15,      [cq+16*1]
    mova                xm1,      [cq+16*2]
    mova                xm8,      [cq+16*3]
    mova                xm2,      [cq+16*4]
    mova                xm9,      [cq+16*5]
    mova                xm3,      [cq+16*6]
    mova               xm10,      [cq+16*7]
    add                  cq, 16*16
    vinserti128          m0, m0,  [cq+16*0], 1
    vinserti128         m15, m15, [cq+16*1], 1
    mova                xm4,      [cq-16*8]
    mova               xm11,      [cq-16*7]
    vinserti128          m1, m1,  [cq+16*2], 1
    vinserti128          m8, m8,  [cq+16*3], 1
    mova                xm5,      [cq-16*6]
    mova               xm12,      [cq-16*5]
    vinserti128          m2, m2,  [cq+16*4], 1
    vinserti128          m9, m9,  [cq+16*5], 1
    mova                xm6,      [cq-16*4]
    mova               xm13,      [cq-16*3]
    vinserti128          m3, m3,  [cq+16*6], 1
    vinserti128         m10, m10, [cq+16*7], 1
    mova                xm7,      [cq-16*2]
    mova               xm14,      [cq-16*1]
    vinserti128          m4, m4,  [cq+16*8], 1
    vinserti128         m11, m11, [cq+16*9], 1
    vinserti128          m5, m5,  [cq+16*10], 1
    vinserti128         m12, m12, [cq+16*11], 1
    vinserti128          m6, m6,  [cq+16*12], 1
    vinserti128         m13, m13, [cq+16*13], 1
    vinserti128          m7, m7,  [cq+16*14], 1
    vinserti128         m14, m14, [cq+16*15], 1
    REPX   {psllw    x, 2 }, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
                             m8,  m9,  m10, m11, m12, m13, m14, m15
    mova              [rsp], m0
    vpbroadcastd         m0, [o(pw_5793x4)]
    REPX   {pmulhrsw x, m0},      m1,  m2,  m3,  m4,  m5,  m6,  m7, \
                             m8,  m9,  m10, m11, m12, m13, m14, m15
    pmulhrsw             m0, [rsp]
    mova              [rsp], m1
    vpbroadcastd         m1, [o(pw_8192)]
    REPX   {pmulhrsw x, m1}, m0,       m2,  m3,  m4,  m5,  m6,  m7, \
                             m8,  m9,  m10, m11, m12, m13, m14, m15
    pmulhrsw             m1, [rsp]
    mova              [rsp], m0
    jmp m(idct_16x16_internal).pass1_end3
ALIGN function_align
.pass2:
    vpbroadcastd        m15, [o(pw_5793x4)]
    REPX  {psllw    x, 2  }, m0, m1, m2, m3, m4, m5, m6, m7
    REPX  {pmulhrsw x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
    mova         [rsp+32*1], m1
    mova                 m1, [rsp+32*0]
    REPX  {psllw    x, 2  }, m8, m9, m10, m11, m12, m13, m14, m1
    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
    pmulhrsw            m15, m1
    jmp m(idct_16x16_internal).end

%define o_base iadst4_dconly2a + 128

%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
%if %3
    vpbroadcastd        m15, [o(pw_2896x8)]
    pmulhrsw             m0, m15, [%1+%2*0]
    pmulhrsw             m1, m15, [%1+%2*1]
    pmulhrsw             m2, m15, [%1+%2*2]
    pmulhrsw             m3, m15, [%1+%2*3]
    pmulhrsw             m4, m15, [%1+%2*4]
    pmulhrsw             m5, m15, [%1+%2*5]
    pmulhrsw             m6, m15, [%1+%2*6]
    pmulhrsw             m7, m15, [%1+%2*7]
%else
    mova                 m0, [%1+%2*0]
    mova                 m1, [%1+%2*1]
    mova                 m2, [%1+%2*2]
    mova                 m3, [%1+%2*3]
    mova                 m4, [%1+%2*4]
    mova                 m5, [%1+%2*5]
    mova                 m6, [%1+%2*6]
    mova                 m7, [%1+%2*7]
%endif
%endmacro

%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
%if %3
%if %3 == 1
    vpbroadcastd        m15, [o(pw_2896x8)]
%endif
    pmulhrsw             m8, m15, [%1+%2*0]
    pmulhrsw             m9, m15, [%1+%2*1]
    pmulhrsw            m10, m15, [%1+%2*2]
    pmulhrsw            m11, m15, [%1+%2*3]
    pmulhrsw            m12, m15, [%1+%2*4]
    pmulhrsw            m13, m15, [%1+%2*5]
    pmulhrsw            m14, m15, [%1+%2*6]
    pmulhrsw            m15,      [%1+%2*7]
%else
    mova                 m8, [%1+%2*0]
    mova                 m9, [%1+%2*1]
    mova                m10, [%1+%2*2]
    mova                m11, [%1+%2*3]
    mova                m12, [%1+%2*4]
    mova                m13, [%1+%2*5]
    mova                m14, [%1+%2*6]
    mova                m15, [%1+%2*7]
%endif
%endmacro

%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
    punpcklwd           m%1, m%2, m%2
    pmulhrsw            m%1, m%3
    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
    punpckhwd           m%2, m%2
    pmulhrsw            m%2, m%3
%endmacro

cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jz .dconly
    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
    %undef cmp
    cmp                eobd, 106
    jle .fast
    LOAD_8ROWS      cq+32*1, 32*2
    call m(idct_16x8_internal).main
    vperm2i128          m11, m0, m4, 0x31
    vinserti128          m0, m0, xm4, 1
    vperm2i128           m4, m1, m5, 0x31
    vinserti128          m1, m1, xm5, 1
    vperm2i128           m5, m2, m6, 0x31
    vinserti128          m2, m2, xm6, 1
    vperm2i128           m6, m3, m7, 0x31
    vinserti128          m3, m3, xm7, 1
    pxor                 m7, m7
    REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
    punpckhwd            m7, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m2, m3
    punpcklwd            m2, m3
    punpcklwd            m3, m11, m4
    punpckhwd           m11, m4
    punpckhwd            m4, m5, m6
    punpcklwd            m5, m6
    punpckhdq            m6, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m3, m5
    punpckhdq            m3, m5
    punpckhdq            m5, m11, m4
    punpckldq           m11, m4
    punpckldq            m4, m7, m1
    punpckhdq            m7, m1
    punpckhqdq          m12, m6, m0
    punpcklqdq           m0, m6     ; out4
    punpckhqdq          m13, m7, m4
    punpcklqdq           m4, m7     ; out5
    punpckhqdq          m14, m3, m2
    punpcklqdq           m2, m3     ; out6
    punpckhqdq          m15, m5, m11
    punpcklqdq          m11, m5     ; out7
    mova         [rsp+32*0], m0
    mova         [rsp+32*1], m4
    mova         [rsp+32*2], m2
.fast:
    LOAD_8ROWS      cq+32*0, 32*2
    call m(idct_16x8_internal).main
    vperm2i128           m8, m0, m4, 0x31
    vinserti128          m0, m0, xm4, 1
    vperm2i128           m4, m1, m5, 0x31
    vinserti128          m1, m1, xm5, 1
    vperm2i128           m5, m2, m6, 0x31
    vinserti128          m2, m2, xm6, 1
    vperm2i128           m6, m3, m7, 0x31
    vinserti128          m3, m3, xm7, 1
    vpbroadcastd         m9, [o(pw_8192)]
    pxor                 m7, m7
    REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
    punpckhwd            m7, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m2, m3
    punpcklwd            m2, m3
    punpckhwd            m3, m8, m4
    punpcklwd            m8, m4
    punpckhwd            m4, m5, m6
    punpcklwd            m5, m6
    punpckhdq            m6, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m8, m5
    punpckhdq            m8, m5
    punpckhdq            m5, m3, m4
    punpckldq            m3, m4
    punpckhdq            m4, m7, m1
    punpckldq            m7, m1
    punpcklqdq           m1, m7, m4
    punpckhqdq           m7, m4     ; out9
    punpckhqdq           m4, m2, m8 ; out10
    punpcklqdq           m2, m8
    punpckhqdq           m8, m3, m5
    punpcklqdq           m3, m5
    punpckhqdq           m5, m0, m6 ; out8
    punpcklqdq           m0, m6
    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
    cmp                eobd, 106
    jg .full
    mova         [rsp+32*0], m5
    mova         [rsp+32*1], m7
    mova         [rsp+32*2], m4
    pmulhrsw            m11, m9, m8
    pxor                 m4, m4
    REPX       {mova x, m4}, m5, m6, m7
    call .main_fast
    jmp .pass2
.dconly:
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_8192)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm2
    psrlw               xm2, 2 ; pw_2048
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    mov                 r2d, 8
    jmp m(inv_txfm_add_dct_dct_8x8).end2
.full:
    REPX   {pmulhrsw x, m9}, m12, m13, m14, m15
    pmulhrsw             m6, m9, [rsp+32*2]
    mova         [rsp+32*2], m4
    pmulhrsw             m4, m9, [rsp+32*0]
    mova         [rsp+32*0], m5
    pmulhrsw             m5, m9, [rsp+32*1]
    mova         [rsp+32*1], m7
    pmulhrsw             m7, m9, m11
    pmulhrsw            m11, m9, m8
    call .main
.pass2:
    vpbroadcastd        m12, [o(pw_2048)]
    REPX  {pmulhrsw x, m12}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
                             m8,  m9,  m10, m11,      m13, m14, m15
    pmulhrsw            m12, [rsp]
    REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
    REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
    mova         [rsp+32*0], m4
    mova         [rsp+32*1], m6
    lea                  r3, [strideq*3]
    WRITE_8X4             0,  1,  4,  6
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4             2,  3,  4,  6
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4    [rsp+32*0],  5,  4,  6
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4    [rsp+32*1],  7,  4,  6
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4             8,  9,  4,  6
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4            10, 11,  4,  6
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4            12, 13,  4,  6
    lea                dstq, [dstq+strideq*4]
    WRITE_8X4            14, 15,  4,  6
    RET
ALIGN function_align
.main_fast: ; bottom half is zero
    call m(idct_8x16_internal).main
    mova                 m8, [rsp+gprsize+0*32]
    mova [rsp+gprsize+0*32], m0
    mova                 m9, [rsp+gprsize+1*32]
    mova [rsp+gprsize+1*32], m1
    mova                 m0, [rsp+gprsize+2*32]
    mova [rsp+gprsize+2*32], m6
    lea                  r5, [rax-(o_base)+pw_201_4091x8]
    ITX_UNPACK_MULHRSW    1,  8,  6,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
    ITX_UNPACK_MULHRSW   15,  9,  6,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
    ITX_UNPACK_MULHRSW   14,  0,  6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
    ITX_UNPACK_MULHRSW   13, 11,  6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
    jmp .main2
ALIGN function_align
.main:
    call m(idct_8x16_internal).main
    mova                 m8, [rsp+gprsize+0*32]
    mova [rsp+gprsize+0*32], m0
    mova                 m9, [rsp+gprsize+1*32]
    mova [rsp+gprsize+1*32], m1
    mova                 m0, [rsp+gprsize+2*32]
    mova [rsp+gprsize+2*32], m6
    punpcklwd            m1, m15, m8  ; in31 in1
    punpckhwd            m8, m15      ; in3  in29
    punpcklwd           m15, m14, m9  ; in27 in5
    punpckhwd            m9, m14      ; in7  in25
    punpcklwd           m14, m13, m0  ; in23 in9
    punpckhwd            m0, m13      ; in11 in21
    punpcklwd           m13, m12, m11 ; in19 in13
    punpckhwd           m11, m12      ; in15 in17
    ITX_MUL2X_PACK        1,  6, 12, 10,  201, 4091, 3 ; t16a, t31a
    ITX_MUL2X_PACK        8,  6, 12, 10, 4052,  601, 3 ; t23a, t24a
    ITX_MUL2X_PACK       15,  6, 12, 10,  995, 3973, 3 ; t20a, t27a
    ITX_MUL2X_PACK        9,  6, 12, 10, 3857, 1380, 3 ; t19a, t28a
    ITX_MUL2X_PACK       14,  6, 12, 10, 1751, 3703, 3 ; t18a, t29a
    ITX_MUL2X_PACK        0,  6, 12, 10, 3513, 2106, 3 ; t21a, t26a
    ITX_MUL2X_PACK       13,  6, 12, 10, 2440, 3290, 3 ; t22a, t25a
    ITX_MUL2X_PACK       11,  6, 12, 10, 3035, 2751, 3 ; t17a, t30a
.main2:
    psubsw               m6, m1, m11  ; t17 t30
    paddsw               m1, m11      ; t16 t31
    psubsw              m11, m9, m14  ; t18 t29
    paddsw               m9, m14      ; t19 t28
    psubsw              m14, m15, m0  ; t21 t26
    paddsw              m15, m0       ; t20 t27
    psubsw               m0, m8, m13  ; t22 t25
    paddsw               m8, m13      ; t23 t24
    ITX_MUL2X_PACK        6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
    ITX_MUL2X_PACK       11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
    ITX_MUL2X_PACK       14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
    ITX_MUL2X_PACK        0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
    psubsw              m13, m1, m9   ; t19a t28a
    paddsw               m1, m9       ; t16a t31a
    psubsw               m9, m8, m15  ; t20a t27a
    paddsw               m8, m15      ; t23a t24a
    psubsw              m15, m6, m11  ; t18  t29
    paddsw               m6, m11      ; t17  t30
    psubsw              m11, m0, m14  ; t21  t26
    paddsw               m0, m14      ; t22  t25
    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 1 ; t18a t29a
    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 1 ; t19  t28
    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 1 ; t20  t27
    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
    vbroadcasti128      m12, [o(deint_shuf)]
    REPX    {pshufb x, m12}, m0, m1, m6, m8
    psubsw              m14, m1, m8   ; t23  t24
    paddsw               m1, m8       ; t16  t31
    psubsw               m8, m6, m0   ; t22a t25a
    paddsw               m6, m0       ; t17a t30a
    psubsw               m0, m15, m11 ; t21  t26
    paddsw              m15, m11      ; t18  t29
    psubsw              m11, m13, m9  ; t20a t27a
    paddsw              m13, m9       ; t19a t28a
    vpbroadcastd        m12, [o(pw_2896x8)]
    punpcklqdq           m9, m11, m0  ; t20a t21
    punpckhqdq          m11, m0       ; t27a t26
    punpcklqdq           m0, m14, m8  ; t23  t22a
    punpckhqdq          m14, m8       ; t24  t25a
    psubw                m8, m11, m9  ; t20  t21a
    paddw               m11, m9       ; t27  t26a
    psubw                m9, m14, m0  ; t23a t22
    paddw               m14, m0       ; t24a t25
    REPX  {pmulhrsw x, m12}, m8, m9, m14, m11
    punpcklqdq           m0, m1, m6   ; t16  t17a
    punpckhqdq           m1, m6       ; t31  t30a
    psubsw              m10, m5, m8   ; out20 out21
    paddsw               m5, m8       ; out11 out10
    psubsw               m6, m3, m14  ; out24 out25
    paddsw               m3, m14      ; out7  out6
    psubsw               m8, m7, m0   ; out16 out17
    paddsw               m7, m0       ; out15 out14
    mova                 m0, [rsp+gprsize+0*32]
    punpcklqdq          m12, m13, m15 ; t19a t18
    punpckhqdq          m13, m15      ; t28a t29
    psubsw              m15, m0, m1   ; out31 out30
    paddsw               m0, m1       ; out0  out1
    mova                 m1, [rsp+gprsize+1*32]
    mova [rsp+gprsize+0*32], m6
    mova                 m6, [rsp+gprsize+2*32]
    psubsw              m14, m1, m13  ; out28 out29
    paddsw               m1, m13      ; out3  out2
    psubsw              m13, m2, m11  ; out27 out26
    paddsw               m2, m11      ; out4  out5
    psubsw              m11, m4, m9   ; out23 out22
    paddsw               m4, m9       ; out8  out9
    psubsw               m9, m6, m12  ; out19 out18
    paddsw               m6, m12      ; out12 out13
    ret

%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
    vbroadcasti128      m%1, [cq+16*%3]
    vbroadcasti128      m%2, [cq+16*%4]
    shufpd              m%1, m%1, m%2, 0x0c
%endmacro

cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jnz .normal
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_8192)]
    mov                [cq], eobd
    mov                 r2d, 8
.dconly:
    pmulhrsw            xm0, xm2
    movd                xm2, [pw_2048] ; intentionally rip-relative
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    pxor                 m3, m3
.dconly_loop:
    mova                 m1, [dstq]
    punpckhbw            m2, m1, m3
    punpcklbw            m1, m3
    paddw                m2, m0
    paddw                m1, m0
    packuswb             m1, m2
    mova             [dstq], m1
    add                dstq, strideq
    dec                 r2d
    jg .dconly_loop
    RET
.normal:
    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
    %undef cmp
    LOAD_PACKED_16X2      0,  7,  0,  2 ; in0  in2
    LOAD_PACKED_16X2      4,  7,  1,  3 ; in1  in3
    LOAD_PACKED_16X2      1,  7,  4,  6 ; in4  in6
    LOAD_PACKED_16X2      5,  7,  5,  7 ; in5  in7
    pxor                 m8, m8
    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
    add                  cq, 16*16
    LOAD_PACKED_16X2      2,  7, -8, -6 ; in8  in10
    LOAD_PACKED_16X2      6,  7, -7, -5 ; in9  in11
    LOAD_PACKED_16X2      3,  7, -4, -2 ; in12 in14
    LOAD_PACKED_16X2     11,  7, -3, -1 ; in13 in15
    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
    mova         [rsp+32*0], m4
    mova         [rsp+32*1], m5
    mova         [rsp+32*2], m6
    cmp                eobd, 106
    jg .full
    pxor                 m4, m4
    REPX       {mova x, m4}, m5, m6, m7
    call m(inv_txfm_add_dct_dct_8x32).main_fast
    jmp .pass2
.full:
    LOAD_PACKED_16X2      4,  7,  0,  2 ; in16 in18
    LOAD_PACKED_16X2     12,  7,  3,  1 ; in19 in17
    LOAD_PACKED_16X2      5,  7,  4,  6 ; in20 in22
    LOAD_PACKED_16X2     13,  7,  7,  5 ; in23 in21
    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
    add                  cq, 16*8
    LOAD_PACKED_16X2      6,  7,  0,  2 ; in24 in26
    LOAD_PACKED_16X2     14,  7,  3,  1 ; in27 in25
    LOAD_PACKED_16X2      7,  8,  4,  6 ; in28 in30
    LOAD_PACKED_16X2     15,  8,  7,  5 ; in31 in29
    pxor                 m8, m8
    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
    call m(inv_txfm_add_dct_dct_8x32).main
.pass2:
    vpbroadcastd        m12, [o(pw_8192)]
    REPX  {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
    mova         [rsp+32*1], m9
    mova         [rsp+32*2], m10
    punpckhwd            m9, m0, m2
    punpcklwd            m0, m2
    punpckhwd            m2, m1, m3
    punpcklwd            m1, m3
    punpcklwd           m10, m4, m6
    punpckhwd            m4, m6
    punpcklwd            m6, m5, m7
    punpckhwd            m5, m7
    punpckhwd            m3, m0, m9
    punpcklwd            m0, m9
    punpckhwd            m9, m2, m1
    punpcklwd            m2, m1
    punpcklwd            m7, m10, m4
    punpckhwd           m10, m4
    punpcklwd            m4, m5, m6
    punpckhwd            m5, m6
    punpckhdq            m1, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m3, m9
    punpckhdq            m3, m9
    punpckldq            m6, m7, m4
    punpckhdq            m7, m4
    punpckldq            m9, m10, m5
    punpckhdq           m10, m5
    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
    pmulhrsw            m12, [rsp+32*0]
    mova         [rsp+32*0], m8
    vperm2i128           m4, m0, m6, 0x31
    vinserti128          m0, m0, xm6, 1
    vperm2i128           m5, m1, m7, 0x31
    vinserti128          m1, m1, xm7, 1
    vperm2i128           m6, m2, m9, 0x31
    vinserti128          m2, m2, xm9, 1
    vperm2i128           m7, m3, m10, 0x31
    vinserti128          m3, m3, xm10, 1
    call m(idct_16x8_internal).main
    vpbroadcastd         m8, [o(pw_2048)]
    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    lea                  r2, [strideq*3]
    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
    lea                  r3, [dstq+strideq*4]
    %define dstq r3
    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
    mova                 m0, [rsp+32*0]
    mova                 m1, [rsp+32*1]
    mova                 m2, [rsp+32*2]
    punpckhwd            m7, m0, m2
    punpcklwd            m0, m2
    punpckhwd            m2, m1, m11
    punpcklwd            m1, m11
    punpckhwd            m4, m12, m14
    punpcklwd           m12, m14
    punpckhwd            m5, m13, m15
    punpcklwd           m13, m15
    punpckhwd            m3, m0, m7
    punpcklwd            m0, m7
    punpckhwd            m9, m2, m1
    punpcklwd            m2, m1
    punpcklwd            m7, m12, m4
    punpckhwd           m12, m4
    punpcklwd            m4, m5, m13
    punpckhwd            m5, m13
    punpckhdq            m1, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m3, m9
    punpckhdq            m3, m9
    punpckldq            m6, m7, m4
    punpckhdq            m7, m4
    punpckldq            m9, m12, m5
    punpckhdq           m12, m5
    vperm2i128           m4, m0, m6, 0x31
    vinserti128          m0, m0, xm6, 1
    vperm2i128           m5, m1, m7, 0x31
    vinserti128          m1, m1, xm7, 1
    vperm2i128           m6, m2, m9, 0x31
    vinserti128          m2, m2, xm9, 1
    vperm2i128           m7, m3, m12, 0x31
    vinserti128          m3, m3, xm12, 1
    call m(idct_16x8_internal).main2
    vpbroadcastd         m8, [o(pw_2048)]
    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    add                  r0, 16
    add                  r3, 16
    %define dstq r0
    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
    %define dstq r3
    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
    RET

cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob
    vpbroadcastd         m9, [pw_5]
    lea                  r4, [strideq*3]
    sub                eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
.loop:
    mova                xm0,     [cq+16* 0]
    mova                xm1,     [cq+16* 4]
    vinserti128          m0, m0, [cq+16* 1], 1
    vinserti128          m1, m1, [cq+16* 5], 1
    pxor                 m8, m8
    mova          [cq+32*0], m8
    mova          [cq+32*2], m8
    add                  cq, 16*16
    mova                xm2,     [cq-16* 8]
    mova                xm3,     [cq-16* 4]
    vinserti128          m2, m2, [cq-16* 7], 1
    vinserti128          m3, m3, [cq-16* 3], 1
    mova                xm4,     [cq+16* 0]
    mova                xm5,     [cq+16* 4]
    vinserti128          m4, m4, [cq+16* 1], 1
    vinserti128          m5, m5, [cq+16* 5], 1
    mova                xm6,     [cq+16* 8]
    mova                xm7,     [cq+16*12]
    vinserti128          m6, m6, [cq+16* 9], 1
    vinserti128          m7, m7, [cq+16*13], 1
    REPX {mova [cq+32*x], m8}, -4, -2,  0,  2,  4,  6
    REPX  {paddw     x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
    call .transpose8x8
    REPX  {psraw     x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
    WRITE_8X4             0,  4,  8, 10, strideq*8, strideq*4, r4*4
    add                dstq, strideq
    WRITE_8X4             1,  5,  0,  4, strideq*8, strideq*4, r4*4
    add                dstq, strideq
    WRITE_8X4             2,  6,  0,  4, strideq*8, strideq*4, r4*4
    add                dstq, strideq
    WRITE_8X4             3,  7,  0,  4, strideq*8, strideq*4, r4*4
    add                dstq, strideq
    sub                  cq, 16*16-32
    lea                dstq, [dstq+r4*4]
    add                eobd, 0x80000000
    jnc .loop
    RET
ALIGN function_align
.transpose8x8:
    punpckhwd            m8, m4, m5
    punpcklwd            m4, m5
    punpckhwd            m5, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m6, m7
    punpcklwd            m6, m7
    punpckhwd            m7, m2, m3
    punpcklwd            m2, m3
    punpckhdq            m3, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m4, m6
    punpckhdq            m4, m6
    punpckhdq            m6, m5, m7
    punpckldq            m5, m7
    punpckldq            m7, m8, m1
    punpckhdq            m8, m1
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m4
    punpckhqdq           m3, m4
    punpcklqdq           m4, m5, m7
    punpckhqdq           m5, m7
    punpckhqdq           m7, m6, m8
    punpcklqdq           m6, m8
    ret

cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob
    add                  cq, 16*8
    vpbroadcastd         m9, [pw_4096]
    lea                  r4, [strideq*3]
    lea                  r5, [dstq+strideq*4]
    sub                eobd, 107
.loop:
    mova                xm0,     [cq-16*8]
    mova                xm1,     [cq-16*7]
    vinserti128          m0, m0, [cq+16*0], 1
    vinserti128          m1, m1, [cq+16*1], 1
    mova                xm2,     [cq-16*6]
    mova                xm3,     [cq-16*5]
    vinserti128          m2, m2, [cq+16*2], 1
    vinserti128          m3, m3, [cq+16*3], 1
    mova                xm4,     [cq-16*4]
    mova                xm5,     [cq-16*3]
    vinserti128          m4, m4, [cq+16*4], 1
    vinserti128          m5, m5, [cq+16*5], 1
    mova                xm6,     [cq-16*2]
    mova                xm7,     [cq-16*1]
    vinserti128          m6, m6, [cq+16*6], 1
    vinserti128          m7, m7, [cq+16*7], 1
    pxor                 m8, m8
    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1,  0,  1,  2,  3
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
    %define dstq r5
    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
    add                  cq, 16*16
    add                  r0, 16
    add                  r5, 16
    add                eobd, 0x80000000
    jnc .loop
    RET

%define o_base pw_5 + 128

%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
%if %3
    vpbroadcastd        m15, [o(pw_2896x8)]
    pmulhrsw             m0, m15, [%1+%2* 0]
    pmulhrsw             m1, m15, [%1+%2* 1]
    pmulhrsw             m2, m15, [%1+%2* 2]
    pmulhrsw             m3, m15, [%1+%2* 3]
    pmulhrsw             m4, m15, [%1+%2* 4]
    pmulhrsw             m5, m15, [%1+%2* 5]
    pmulhrsw             m6, m15, [%1+%2* 6]
    pmulhrsw             m7, m15, [%1+%2* 7]
    pmulhrsw             m8, m15, [%1+%2* 8]
    pmulhrsw             m9, m15, [%1+%2* 9]
    pmulhrsw            m10, m15, [%1+%2*10]
    pmulhrsw            m11, m15, [%1+%2*11]
    pmulhrsw            m12, m15, [%1+%2*12]
    pmulhrsw            m13, m15, [%1+%2*13]
    pmulhrsw            m14, m15, [%1+%2*14]
    pmulhrsw            m15,      [%1+%2*15]
%else
    mova                 m0, [%1+%2* 0]
    mova                 m1, [%1+%2* 1]
    mova                 m2, [%1+%2* 2]
    mova                 m3, [%1+%2* 3]
    mova                 m4, [%1+%2* 4]
    mova                 m5, [%1+%2* 5]
    mova                 m6, [%1+%2* 6]
    mova                 m7, [%1+%2* 7]
    mova                 m8, [%1+%2* 8]
    mova                 m9, [%1+%2* 9]
    mova                m10, [%1+%2*10]
    mova                m11, [%1+%2*11]
    mova                m12, [%1+%2*12]
    mova                m13, [%1+%2*13]
    mova                m14, [%1+%2*14]
    mova                m15, [%1+%2*15]
%endif
    mova              [rsp], m15
%if %4
    pxor                m15, m15
    REPX {mova [%1+%2*x], m15}, 0,  1,  2,  3,  4,  5,  6,  7, \
                                8,  9, 10, 11, 12, 13, 14, 15
%endif
%endmacro

%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
    mova                m%4, [%2]
    paddsw              m%3, m%1, m%4
    psubsw              m%1, m%4
    pmovzxbw            m%4, [dstq+%6]
    pmulhrsw            m%3, m%5
    pmulhrsw            m%1, m%5
    paddw               m%3, m%4
    pmovzxbw            m%4, [r2+%7]
    paddw               m%1, m%4
    packuswb            m%3, m%1
    vpermq              m%3, m%3, q3120
    mova          [dstq+%6], xm%3
    vextracti128    [r2+%7], m%3, 1
%endmacro

cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jz .dconly
    PROLOGUE              0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
                                           base, tmp3
    %undef cmp
    LOAD_16ROWS          cq, 64, 1
    call m(idct_16x16_internal).main
    lea               tmp1q, [rsp+32*7]
    lea               tmp2q, [tmp1q+32*8]
    lea               tmp3q, [tmp1q+32*16]
    mova                 m1, [rsp+32*1]
    mova         [rsp+32*0], m6
    mova         [rsp+32*1], m7
    vpbroadcastd         m7, [o(pw_16384)]
    call .transpose_2x8x8_round
    mova                m15, [rsp+32*0]
    mova         [tmp3q-32*4+ 0], xm0
    vextracti128 [tmp3q+32*0+ 0], m0, 1
    mova         [tmp3q-32*3+ 0], xm2
    vextracti128 [tmp3q+32*1+ 0], m2, 1
    mova         [tmp3q-32*2+ 0], xm4
    vextracti128 [tmp3q+32*2+ 0], m4, 1
    mova         [tmp3q-32*1+ 0], xm6
    vextracti128 [tmp3q+32*3+ 0], m6, 1
    mova         [tmp3q-32*4+16], xm8
    vextracti128 [tmp3q+32*0+16], m8, 1
    mova         [tmp3q-32*3+16], xm10
    vextracti128 [tmp3q+32*1+16], m10, 1
    mova         [tmp3q-32*2+16], xm12
    vextracti128 [tmp3q+32*2+16], m12, 1
    mova         [tmp3q-32*1+16], xm14
    vextracti128 [tmp3q+32*3+16], m14, 1
    cmp                eobd, 150
    jg .full
    vinserti128          m0, m1, xm9, 1
    vperm2i128           m4, m1, m9, 0x31
    vinserti128          m2, m5, xm13, 1
    vperm2i128           m6, m5, m13, 0x31
    vinserti128          m1, m3, xm11, 1
    vperm2i128           m5, m3, m11, 0x31
    vinserti128          m3, m7, xm15, 1
    vperm2i128           m7, m7, m15, 0x31
    call .main_oddhalf_fast
    pxor                 m8, m8
    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
    jmp .idct16
.dconly:
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    mov                 r2d, 16
    jmp m(inv_txfm_add_dct_dct_16x4).dconly
.full:
    mova       [tmp1q-32*4], m1
    mova       [tmp1q-32*3], m3
    mova       [tmp1q-32*2], m5
    mova       [tmp1q-32*1], m7
    mova       [tmp1q+32*0], m9
    mova       [tmp1q+32*1], m11
    mova       [tmp1q+32*2], m13
    mova       [tmp1q+32*3], m15
    LOAD_16ROWS       cq+32, 64, 1
    call m(idct_16x16_internal).main
    lea                  r2, [tmp3q+32*8]
    mova                 m1, [rsp+32*1]
    mova         [rsp+32*0], m6
    mova         [rsp+32*1], m7
    vpbroadcastd         m7, [o(pw_16384)]
    call .transpose_2x8x8_round
    mova                m15, [rsp+32*0]
    mova         [r2-32*4+ 0], xm0
    vextracti128 [r2+32*0+ 0], m0, 1
    mova         [r2-32*3+ 0], xm2
    vextracti128 [r2+32*1+ 0], m2, 1
    mova         [r2-32*2+ 0], xm4
    vextracti128 [r2+32*2+ 0], m4, 1
    mova         [r2-32*1+ 0], xm6
    vextracti128 [r2+32*3+ 0], m6, 1
    mova         [r2-32*4+16], xm8
    vextracti128 [r2+32*0+16], m8, 1
    mova         [r2-32*3+16], xm10
    vextracti128 [r2+32*1+16], m10, 1
    mova         [r2-32*2+16], xm12
    vextracti128 [r2+32*2+16], m12, 1
    mova         [r2-32*1+16], xm14
    vextracti128 [r2+32*3+16], m14, 1
    vinserti128          m8, m1, xm9, 1
    vperm2i128          m12, m1, m9, 0x31
    mova                xm0,     [tmp1q-32*4]
    mova                xm1,     [tmp1q-32*3]
    vinserti128          m0, m0, [tmp1q+32*0], 1
    vinserti128          m1, m1, [tmp1q+32*1], 1
    vinserti128         m10, m5, xm13, 1
    vperm2i128          m14, m5, m13, 0x31
    mova                xm4,     [tmp1q-32*4+16]
    mova                xm5,     [tmp1q-32*3+16]
    vinserti128          m4, m4, [tmp1q+32*0+16], 1
    vinserti128          m5, m5, [tmp1q+32*1+16], 1
    vinserti128          m9, m3, xm11, 1
    vperm2i128          m13, m3, m11, 0x31
    mova                xm2,     [tmp1q-32*2]
    mova                xm3,     [tmp1q-32*1]
    vinserti128          m2, m2, [tmp1q+32*2], 1
    vinserti128          m3, m3, [tmp1q+32*3], 1
    vinserti128         m11, m7, xm15, 1
    vperm2i128          m15, m7, m15, 0x31
    mova                xm6,     [tmp1q-32*2+16]
    mova                xm7,     [tmp1q-32*1+16]
    vinserti128          m6, m6, [tmp1q+32*2+16], 1
    vinserti128          m7, m7, [tmp1q+32*3+16], 1
    call .main_oddhalf
    LOAD_8ROWS_H    r2-32*4, 32
.idct16:
    LOAD_8ROWS   tmp3q-32*4, 32
    mova              [rsp], m15
    call m(idct_16x16_internal).main
    imul                 r2, strideq, 19
    lea                  r3, [strideq*3]
    add                  r2, dstq
    call .pass2_end
    RET
ALIGN function_align
.main_oddhalf_fast: ; lower half is zero
    mova [rsp+gprsize+32*1], m7
    pxor                 m7, m7
    mova [rsp+gprsize+32*0], m7
    mova [rsp+gprsize+32*2], m7
    vpbroadcastd        m11, [o(pw_3703x8)]
    vpbroadcastd         m7, [o(pw_1751x8)]
    vpbroadcastd        m12, [o(pw_m1380x8)]
    vpbroadcastd         m8, [o(pw_3857x8)]
    vpbroadcastd        m13, [o(pw_3973x8)]
    vpbroadcastd        m15, [o(pw_995x8)]
    pmulhrsw            m11, m4  ; t29a
    pmulhrsw             m4, m7  ; t18a
    pmulhrsw            m12, m3  ; t19a
    pmulhrsw             m3, m8  ; t28a
    pmulhrsw            m13, m2  ; t27a
    pmulhrsw             m2, m15 ; t20a
    vpbroadcastd        m10, [o(pw_m2106x8)]
    vpbroadcastd         m7, [o(pw_3513x8)]
    vpbroadcastd         m9, [o(pw_3290x8)]
    vpbroadcastd         m8, [o(pw_2440x8)]
    vpbroadcastd        m14, [o(pw_m601x8)]
    vpbroadcastd        m15, [o(pw_4052x8)]
    pmulhrsw            m10, m5  ; t21a
    pmulhrsw             m5, m7  ; t26a
    pmulhrsw             m9, m6  ; t25a
    pmulhrsw             m6, m8  ; t22a
    pmulhrsw            m14, m1  ; t23a
    pmulhrsw             m1, m15 ; t24a
    vpbroadcastd        m15, [o(pd_2048)]
    jmp .main2
ALIGN function_align
.main_oddhalf:
    mova [rsp+gprsize+32*0], m15
    mova [rsp+gprsize+32*1], m7
    mova [rsp+gprsize+32*2], m8
    vpbroadcastd        m15, [o(pd_2048)]
    ITX_MULSUB_2W         4, 11,  7,  8, 15, 1751, 3703 ; t18a, t29a
    ITX_MULSUB_2W        12,  3,  7,  8, 15, 3857, 1380 ; t19a, t28a
    ITX_MULSUB_2W         2, 13,  7,  8, 15,  995, 3973 ; t20a, t27a
    ITX_MULSUB_2W        10,  5,  7,  8, 15, 3513, 2106 ; t21a, t26a
    ITX_MULSUB_2W         6,  9,  7,  8, 15, 2440, 3290 ; t22a, t25a
    ITX_MULSUB_2W        14,  1,  7,  8, 15, 4052,  601 ; t23a, t24a
.main2:
    psubsw               m7, m12, m4  ; t18
    paddsw              m12, m4       ; t19
    psubsw               m4, m2, m10  ; t21
    paddsw               m2, m10      ; t20
    psubsw              m10, m14, m6  ; t22
    paddsw              m14, m6       ; t23
    psubsw               m6, m1, m9   ; t25
    paddsw               m1, m9       ; t24
    psubsw               m9, m13, m5  ; t26
    paddsw              m13, m5       ; t27
    psubsw               m5, m3, m11  ; t29
    paddsw               m3, m11      ; t28
    ITX_MULSUB_2W         5,  7,  8, 11, 15, m4017,  799 ; t18a, t29a
    ITX_MULSUB_2W         9,  4,  8, 11, 15,  3406, 2276 ; t21a, t26a
    ITX_MULSUB_2W         6, 10,  8, 11, 15, m2276, 3406 ; t22a, t25a
    psubsw               m8, m14, m2  ; t20a
    paddsw              m14, m2       ; t23a
    psubsw               m2, m1, m13  ; t27a
    paddsw               m1, m13      ; t24a
    psubsw              m13, m6, m9   ; t21
    paddsw               m6, m9       ; t22
    psubsw               m9, m10, m4  ; t26
    paddsw              m10, m4       ; t25
    ITX_MULSUB_2W         2,  8,  4, 11, 15, m3784, 1567 ; t20,  t27
    ITX_MULSUB_2W         9, 13,  4, 11, 15, m3784, 1567 ; t21a, t26a
    mova                 m4, [rsp+gprsize+32*0] ; in31
    mova [rsp+gprsize+32*0], m6  ; t22
    mova                 m6, [rsp+gprsize+32*1] ; in15
    mova [rsp+gprsize+32*1], m14 ; t23a
    mova                m14, [rsp+gprsize+32*2] ; in17
    mova [rsp+gprsize+32*2], m1  ; t24a
    ITX_MULSUB_2W         0,  4,  1, 11, 15,  201, 4091 ; t16a, t31a
    ITX_MULSUB_2W        14,  6,  1, 11, 15, 3035, 2751 ; t17a, t30a
    psubsw               m1, m0, m14  ; t17
    paddsw               m0, m14      ; t16
    psubsw              m14, m4, m6   ; t30
    paddsw               m4, m6       ; t31
    ITX_MULSUB_2W        14,  1,  6, 11, 15,  799, 4017 ; t17a, t30a
    psubsw               m6, m0, m12  ; t19a
    paddsw               m0, m12      ; t16a
    psubsw              m12, m4, m3   ; t28a
    paddsw               m4, m3       ; t31a
    psubsw               m3, m14, m5  ; t18
    paddsw              m14, m5       ; t17
    psubsw               m5, m1, m7   ; t29
    paddsw               m1, m7       ; t30
    ITX_MULSUB_2W         5,  3,  7, 11, 15, 1567, 3784 ; t18a, t29a
    ITX_MULSUB_2W        12,  6,  7, 11, 15, 1567, 3784 ; t19,  t28
    psubsw               m7, m1, m10  ; t25a
    paddsw               m1, m10      ; t30a
    psubsw              m10, m5, m9   ; t21
    paddsw               m5, m9       ; t18
    psubsw               m9, m12, m2  ; t20a
    paddsw              m12, m2       ; t19a
    psubsw               m2, m3, m13  ; t26
    paddsw               m3, m13      ; t29
    psubsw              m13, m6, m8   ; t27a
    paddsw               m6, m8       ; t28a
    mova       [tmp1q-32*2], m5
    mova       [tmp1q-32*1], m12
    mova       [tmp2q+32*0], m6
    mova       [tmp2q+32*1], m3
    mova       [tmp2q+32*2], m1
    mova                 m5, [rsp+gprsize+32*0] ; t22
    mova                 m6, [rsp+gprsize+32*1] ; t23
    mova                 m3, [rsp+gprsize+32*2] ; t24a
    vpbroadcastd         m8, [o(pw_2896x8)]
    psubsw               m1, m14, m5  ; t22a
    paddsw              m14, m5       ; t17a
    psubsw               m5, m0, m6   ; t23
    paddsw               m0, m6       ; t16
    psubsw               m6, m4, m3   ; t24
    paddsw               m4, m3       ; t31
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m14
    mova       [tmp2q+32*3], m4
    psubw                m3, m13, m9  ; t20
    paddw               m13, m9       ; t27
    psubw                m9, m2, m10  ; t21a
    paddw                m2, m10      ; t26a
    psubw               m10, m7, m1   ; t22
    paddw                m7, m1       ; t25
    psubw                m1, m6, m5   ; t23a
    paddw                m6, m5       ; t24a
    REPX   {pmulhrsw x, m8}, m3, m13, m9, m2, m10, m7, m1, m6
    mova       [tmp1q+32*0], m3
    mova       [tmp1q+32*1], m9
    mova       [tmp1q+32*2], m10
    mova       [tmp1q+32*3], m1
    mova       [tmp2q-32*4], m6
    mova       [tmp2q-32*3], m7
    mova       [tmp2q-32*2], m2
    mova       [tmp2q-32*1], m13
    ret
ALIGN function_align
.transpose_2x8x8_round:
    punpckhwd            m6, m12, m13
    punpcklwd           m12, m13
    punpckhwd           m13, m8, m9
    punpcklwd            m8, m9
    punpckhwd            m9, m14, m15
    punpcklwd           m14, m15
    punpckhwd           m15, m10, m11
    punpcklwd           m10, m11
    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5
    punpckhdq           m11, m8, m10
    punpckldq            m8, m10
    punpckldq           m10, m12, m14
    punpckhdq           m12, m14
    punpckhdq           m14, m13, m15
    punpckldq           m13, m15
    punpckldq           m15, m6, m9
    punpckhdq            m6, m9
    punpckhqdq           m9, m8, m10
    punpcklqdq           m8, m10
    punpcklqdq          m10, m11, m12
    punpckhqdq          m11, m12
    punpcklqdq          m12, m13, m15
    punpckhqdq          m13, m15
    punpckhqdq          m15, m14, m6
    punpcklqdq          m14, m6
    pmulhrsw             m6, m7, [rsp+gprsize+32*0]
    REPX   {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
    pmulhrsw             m7, [rsp+gprsize+32*1]
    mova [rsp+gprsize+32*0], m15
    punpckhwd           m15, m4, m5
    punpcklwd            m4, m5
    punpckhwd            m5, m0, m1
    punpcklwd            m0, m1
    punpckhwd            m1, m6, m7
    punpcklwd            m6, m7
    punpckhwd            m7, m2, m3
    punpcklwd            m2, m3
    punpckhdq            m3, m0, m2
    punpckldq            m0, m2
    punpckldq            m2, m4, m6
    punpckhdq            m4, m6
    punpckhdq            m6, m5, m7
    punpckldq            m5, m7
    punpckldq            m7, m15, m1
    punpckhdq           m15, m1
    punpckhqdq           m1, m0, m2
    punpcklqdq           m0, m2
    punpcklqdq           m2, m3, m4
    punpckhqdq           m3, m4
    punpcklqdq           m4, m5, m7
    punpckhqdq           m5, m7
    punpckhqdq           m7, m6, m15
    punpcklqdq           m6, m15
    ret
ALIGN function_align
.pass2_end:
    mova [rsp+gprsize+32*0], m7
    mova [rsp+gprsize+32*2], m15
    vpbroadcastd        m15, [o(pw_2048)]
    IDCT32_PASS2_END      0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
    IDCT32_PASS2_END      4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
    IDCT32_PASS2_END      8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
    IDCT32_PASS2_END     12, tmp1q-32*1, 0, 4, 15, r3*4,      strideq*0
    add                dstq, strideq
    sub                  r2, strideq
    mova                 m1, [rsp+gprsize+32*1]
    IDCT32_PASS2_END      1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
    IDCT32_PASS2_END      5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
    IDCT32_PASS2_END      9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
    IDCT32_PASS2_END     13, tmp1q-32*2, 0, 4, 15, r3*4,      strideq*0
    add                dstq, strideq
    sub                  r2, strideq
    IDCT32_PASS2_END      2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
    IDCT32_PASS2_END      6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
    IDCT32_PASS2_END     10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
    IDCT32_PASS2_END     14, tmp1q-32*3, 0, 4, 15, r3*4,      strideq*0
    add                dstq, strideq
    sub                  r2, strideq
    mova                 m7, [rsp+gprsize+32*0]
    mova                 m1, [rsp+gprsize+32*2]
    IDCT32_PASS2_END      3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
    IDCT32_PASS2_END      7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
    IDCT32_PASS2_END     11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
    IDCT32_PASS2_END      1, tmp1q-32*4, 0, 4, 15, r3*4,      strideq*0
    ret

; Perform the final sumsub step and YMM lane shuffling
%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
    mova                m%3, [tmp2q+32*( 3-%1)]
    psubsw              m%4, m%1, m%3
    paddsw              m%1, m%3
    mova                m%3, [tmp1q+32*(11-%2)]
    mova         [tmp1q+32*(11-%2)+16], xm%4
    vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
    paddsw              m%4, m%2, m%3
    psubsw              m%2, m%3
    mova         [tmp1q+32*(11-%2)], xm%2
    vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
    vperm2i128          m%2, m%1, m%4, 0x31
    vinserti128         m%1, m%1, xm%4, 1
%endmacro

cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jnz .normal
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    mov                 r2d, 16
    jmp m(inv_txfm_add_dct_dct_32x8).dconly
.normal:
    PROLOGUE              0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
    vpbroadcastd        m15, [o(pw_2896x8)]
    pmulhrsw             m0, m15, [cq+32* 1]
    pmulhrsw             m1, m15, [cq+32* 3]
    pmulhrsw             m2, m15, [cq+32* 5]
    pmulhrsw             m3, m15, [cq+32* 7]
    pmulhrsw             m4, m15, [cq+32* 9]
    pmulhrsw             m5, m15, [cq+32*11]
    pmulhrsw             m6, m15, [cq+32*13]
    pmulhrsw             m7, m15, [cq+32*15]
    pmulhrsw             m8, m15, [cq+32*17]
    pmulhrsw             m9, m15, [cq+32*19]
    pmulhrsw            m10, m15, [cq+32*21]
    pmulhrsw            m11, m15, [cq+32*23]
    pmulhrsw            m12, m15, [cq+32*25]
    pmulhrsw            m13, m15, [cq+32*27]
    pmulhrsw            m14, m15, [cq+32*29]
    pmulhrsw            m15,      [cq+32*31]
    lea               tmp1q, [rsp+32*7]
    lea               tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
    LOAD_16ROWS     cq+32*0, 32*2, 1, 0
    pxor                m15, m15
    mov                 r3d, 8
.zero_loop:
    mova          [cq+32*0], m15
    mova          [cq+32*1], m15
    mova          [cq+32*2], m15
    mova          [cq+32*3], m15
    add                  cq, 32*4
    dec                 r3d
    jg .zero_loop
    call m(idct_16x16_internal).main
    call .pass1_end
    lea                  r2, [strideq*3]
    mov                  r3, dstq
.pass2:
    vpbroadcastd         m7, [o(pw_16384)]
    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
    call m(idct_16x16_internal).main
    mova         [rsp+32*2], m15
    vpbroadcastd        m15, [o(pw_2048)]
    REPX  {pmulhrsw x, m15}, m2, m3, m0
    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
    pmulhrsw             m1, m15, [rsp+32*1]
    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
    lea                dstq, [dstq+strideq*4]
    REPX  {pmulhrsw x, m15}, m4, m5, m6, m7
    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
    lea                dstq, [dstq+strideq*4]
    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11
    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
    lea                dstq, [dstq+strideq*4]
    REPX  {pmulhrsw x, m15}, m11, m12, m13, m14
    pmulhrsw            m15, [rsp+32*2]
    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
    test                 r3, r3
    jnz .right_half
    RET
.right_half:
    LOAD_8ROWS   tmp1q-32*4, 32
    LOAD_8ROWS_H tmp2q-32*4, 32
    lea                dstq, [r3+16]
    xor                 r3d, r3d
    mova         [rsp+32*0], m6
    mova         [rsp+32*1], m7
    jmp .pass2
ALIGN function_align
.pass1_end:
    mova [rsp+gprsize+32*0], m9
    IDCT32_PASS1_END      0,  8,  1,  9
    IDCT32_PASS1_END      2, 10,  1,  9
    IDCT32_PASS1_END      3, 11,  1,  9
    IDCT32_PASS1_END      4, 12,  1,  9
    IDCT32_PASS1_END      5, 13,  1,  9
    IDCT32_PASS1_END      6, 14,  1,  9
    IDCT32_PASS1_END      7, 15,  1,  9
    mova                 m1, [rsp+gprsize+32*1]
    mova                 m9, [rsp+gprsize+32*0]
    mova [rsp+gprsize+32*0], m6
    mova [rsp+gprsize+32*1], m7
    IDCT32_PASS1_END      1,  9,  6,  7
    ret

cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 12, dst, stride, c, eob
%undef cmp
    lea                 rax, [o_base]
    vpbroadcastd         m9, [o(pw_2896x8)]
    vpbroadcastd        m10, [o(pw_5793x4)]
    vpbroadcastd        m11, [o(pw_5)]
    cmp                eobd, 43   ; if (eob > 43)
    setg                r4b       ;   iteration_count++
    cmp                eobd, 150  ; if (eob > 150)
    setg                 al       ;   iteration_count++
    add                eobd, -279 ; if (eob > 278)
    adc                 r4b, al   ;   iteration_count++
    lea                  r3, [strideq*3]
    mov                 rax, cq
.loop:
    mova                xm0,     [cq+64* 0]
    mova                xm1,     [cq+64* 1]
    vinserti128          m0, m0, [cq+64* 8], 1
    vinserti128          m1, m1, [cq+64* 9], 1
    mova                xm2,     [cq+64* 2]
    mova                xm3,     [cq+64* 3]
    vinserti128          m2, m2, [cq+64*10], 1
    vinserti128          m3, m3, [cq+64*11], 1
    mova                xm4,     [cq+64* 4]
    mova                xm5,     [cq+64* 5]
    vinserti128          m4, m4, [cq+64*12], 1
    vinserti128          m5, m5, [cq+64*13], 1
    mova                xm6,     [cq+64* 6]
    mova                xm7,     [cq+64* 7]
    vinserti128          m6, m6, [cq+64*14], 1
    vinserti128          m7, m7, [cq+64*15], 1
    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
    REPX  {psllw    x, 2  }, m0, m1, m2, m3, m4, m5, m6, m7
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX  {paddw    x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX  {psraw    x, 3  }, m0, m1, m2, m3, m4, m5, m6, m7
    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
    lea                dstq, [dstq+strideq*4]
    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
    lea                dstq, [dstq+strideq*4]
    add                  cq, 16
    dec                 r4b
    jge .loop
    sub                  cq, 32
    pxor                 m0, m0
    mov                 r0d, 8
    cmp                  cq, rax
    jg .zero_loop
.zero_loop_half:
    mova         [rax+64*0], m0
    mova         [rax+64*1], m0
    mova         [rax+64*2], m0
    mova         [rax+64*3], m0
    add                 rax, 64*4
    sub                 r0d, 2
    jg .zero_loop_half
    RET
.zero_loop:
    mova         [rax+32*0], m0
    mova         [rax+32*1], m0
    mova         [rax+32*2], m0
    mova         [rax+32*3], m0
    add                 rax, 32*4
    dec                 r0d
    jg .zero_loop
    RET

cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
%undef cmp
    lea                 rax, [o_base]
    vpbroadcastd         m9, [o(pw_2896x8)]
    vpbroadcastd        m10, [o(pw_5793x4)]
    vpbroadcastd        m11, [o(pw_2048)]
    cmp                eobd, 35  ; if (eob > 35)
    setg                r4b      ;   iteration_count++
    cmp                eobd, 150 ; if (eob > 150)
    setg                r3b      ;   iteration_count += 2
    lea                 r4d, [r4+r3*2]
    lea                  r3, [strideq*3]
    mov                  r5, dstq
    mov                 rax, cq
.loop:
    mova                xm0,     [cq+32* 0]
    mova                xm1,     [cq+32* 1]
    vinserti128          m0, m0, [cq+32* 8], 1
    vinserti128          m1, m1, [cq+32* 9], 1
    mova                xm2,     [cq+32* 2]
    mova                xm3,     [cq+32* 3]
    vinserti128          m2, m2, [cq+32*10], 1
    vinserti128          m3, m3, [cq+32*11], 1
    mova                xm4,     [cq+32* 4]
    mova                xm5,     [cq+32* 5]
    vinserti128          m4, m4, [cq+32*12], 1
    vinserti128          m5, m5, [cq+32*13], 1
    mova                xm6,     [cq+32* 6]
    mova                xm7,     [cq+32* 7]
    vinserti128          m6, m6, [cq+32*14], 1
    vinserti128          m7, m7, [cq+32*15], 1
    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
    REPX  {psllw    x, 3  }, m0, m1, m2, m3, m4, m5, m6, m7
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
    lea                dstq, [dstq+strideq*4]
    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
    lea                dstq, [dstq+strideq*4]
    add                  cq, 16
    dec                 r4b
    jl .ret
    test                r4b, 1
    jz .loop
    add                  cq, 32*15
    lea                dstq, [r5+16]
    jmp .loop
.ret:
    sub                  cq, 32
    pxor                 m0, m0
    mov                 r0d, 4
    mov                 r1d, 8
    cmp                  cq, rax
    cmovg               r0d, r1d
.zero_loop:
    mova         [rax+32*0], m0
    mova         [rax+32*1], m0
    mova         [rax+32*2], m0
    mova         [rax+32*3], m0
    add                 rax, 32*4
    dec                 r0d
    jg .zero_loop
    RET

cglobal inv_txfm_add_dct_dct_32x32, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jnz .normal
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_8192)]
    mov                [cq], eobd
    mov                 r2d, 32
    jmp m(inv_txfm_add_dct_dct_32x8).dconly
.normal:
    PROLOGUE              0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
                                           base, tmp3, tmp4
    %undef cmp
    lea               tmp1q, [rsp+32*7]
    lea               tmp2q, [tmp1q+32*8]
    sub                eobd, 136
    mov               tmp4d, eobd
.pass1_loop:
    LOAD_8ROWS      cq+64*1, 64*2
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
    test              tmp4d, tmp4d
    jl .fast
    LOAD_8ROWS_H   cq+64*17, 64*2
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
    LOAD_8ROWS_H   cq+64*16, 64*2
    pxor                 m0, m0
    REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
                               24, 25, 26, 27, 28, 29, 30, 31
    mova              [rsp], m15
    jmp .idct16
.fast:
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    pxor                 m8, m8
    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova              [rsp], m8
.idct16:
    LOAD_8ROWS      cq+64*0, 64*2
    pxor                m15, m15
    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
    call m(idct_16x16_internal).main
    call m(inv_txfm_add_dct_dct_32x16).pass1_end
    vpbroadcastd         m7, [o(pw_8192)]
    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
    lea               tmp3q, [tmp1q+32*32]
    mova                m15, [rsp]
    mova       [tmp3q-32*4], m0
    mova       [tmp3q-32*3], m2
    mova       [tmp3q-32*2], m4
    mova       [tmp3q-32*1], m6
    mova       [tmp3q+32*0], m8
    mova       [tmp3q+32*1], m10
    mova       [tmp3q+32*2], m12
    mova       [tmp3q+32*3], m14
    add               tmp3q, 32*8
    mova       [tmp3q-32*4], m1
    mova       [tmp3q-32*3], m3
    mova       [tmp3q-32*2], m5
    mova       [tmp3q-32*1], m7
    mova       [tmp3q+32*0], m9
    mova       [tmp3q+32*1], m11
    mova       [tmp3q+32*2], m13
    mova       [tmp3q+32*3], m15
    vpbroadcastd         m9, [o(pw_8192)]
    pmulhrsw             m0, m9, [tmp1q-32*4]
    pmulhrsw             m1, m9, [tmp1q-32*3]
    pmulhrsw             m2, m9, [tmp1q-32*2]
    pmulhrsw             m3, m9, [tmp1q-32*1]
    pmulhrsw             m4, m9, [tmp1q+32*0]
    pmulhrsw             m5, m9, [tmp1q+32*1]
    pmulhrsw             m6, m9, [tmp1q+32*2]
    pmulhrsw             m7, m9, [tmp1q+32*3]
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    mova       [tmp1q-32*4], m0
    pmulhrsw             m0, m9, [tmp2q-32*4]
    mova       [tmp2q-32*4], m1
    pmulhrsw             m1, m9, [tmp2q-32*3]
    mova       [tmp1q-32*3], m2
    pmulhrsw             m2, m9, [tmp2q-32*2]
    mova       [tmp2q-32*3], m3
    pmulhrsw             m3, m9, [tmp2q-32*1]
    mova       [tmp1q-32*2], m4
    pmulhrsw             m4, m9, [tmp2q+32*0]
    mova       [tmp2q-32*2], m5
    pmulhrsw             m5, m9, [tmp2q+32*1]
    mova       [tmp1q-32*1], m6
    pmulhrsw             m6, m9, [tmp2q+32*2]
    mova       [tmp2q-32*1], m7
    pmulhrsw             m7, m9, [tmp2q+32*3]
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    mova       [tmp1q+32*0], m0
    mova       [tmp2q+32*0], m1
    mova       [tmp1q+32*1], m2
    mova       [tmp2q+32*1], m3
    mova       [tmp1q+32*2], m4
    mova       [tmp2q+32*2], m5
    mova       [tmp1q+32*3], m6
    mova       [tmp2q+32*3], m7
    add                  cq, 32
    add               tmp1q, 32*16
    add               tmp2q, 32*16
    add                eobd, 0x80000000
    jnc .pass1_loop
    add               tmp1q, 32*24
    imul                 r2, strideq, 19
    lea                  r3, [strideq*3]
    add                  r2, dstq
    test              tmp4d, tmp4d
    jge .pass2_loop
    add               tmp1q, 32*16
    add               tmp2q, 32*16
    add               tmp3q, 32*16
.pass2_loop:
    LOAD_8ROWS   tmp2q-32*4, 32
    test              tmp4d, tmp4d
    jl .fast2
    LOAD_8ROWS_H tmp3q-32*4, 32
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
    sub               tmp3q, 32*8
    LOAD_8ROWS_H tmp3q-32*4, 32
    sub               tmp3q, 32*16
    jmp .pass2_loop_end
.fast2:
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    sub               tmp3q, 32*24
    pxor                 m8, m8
    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
.pass2_loop_end:
    LOAD_8ROWS   tmp3q-32*4, 32
    mova              [rsp], m15
    call m(idct_16x16_internal).main
    call m(inv_txfm_add_dct_dct_16x32).pass2_end
    lea               tmp3q, [tmp1q-32*32]
    cmp               tmp2q, tmp3q
    jl .ret
    sub               tmp2q, 32*32
    sub                dstq, r3
    lea                  r2, [r2+r3+16]
    add                dstq, 16
    jmp .pass2_loop
.ret:
    RET

cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob
    %undef cmp
    vpbroadcastd         m9, [pw_8192]
    sub                eobd, 136 ; if (eob < 136)
    shr                eobd, 30  ;     topleft 16x16 only
    lea                eobd, [eobq*2-8]
    lea                  r4, [strideq*3]
    mov                  r5, dstq
    lea                 rax, [cq+32]
.loop:
    mova                xm0,     [cq+64* 0]
    mova                xm1,     [cq+64* 1]
    vinserti128          m0, m0, [cq+64* 8], 1
    vinserti128          m1, m1, [cq+64* 9], 1
    mova                xm2,     [cq+64* 2]
    mova                xm3,     [cq+64* 3]
    vinserti128          m2, m2, [cq+64*10], 1
    vinserti128          m3, m3, [cq+64*11], 1
    mova                xm4,     [cq+64* 4]
    mova                xm5,     [cq+64* 5]
    vinserti128          m4, m4, [cq+64*12], 1
    vinserti128          m5, m5, [cq+64*13], 1
    mova                xm6,     [cq+64* 6]
    mova                xm7,     [cq+64* 7]
    vinserti128          m6, m6, [cq+64*14], 1
    vinserti128          m7, m7, [cq+64*15], 1
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
    lea                dstq, [dstq+strideq*4]
    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
    lea                dstq, [dstq+strideq*4]
    add                  cq, 16
    inc                eobd
    jz .ret
    test               eobd, 3
    jnz .loop
    add                  cq, 64*15
    lea                dstq, [r5+16]
    jmp .loop
.ret:
    pxor                 m0, m0
    mov                 r0d, 16
    cmp                  cq, rax
    jne .zero_loop
.zero_loop_topleft:
    mova         [rax-32*1], m0
    mova         [rax+32*1], m0
    mova         [rax+32*3], m0
    mova         [rax+32*5], m0
    add                 rax, 64*4
    sub                 r0d, 4
    jg .zero_loop_topleft
    RET
.zero_loop:
    mova         [rax-32*1], m0
    mova         [rax+32*0], m0
    mova         [rax+32*1], m0
    mova         [rax+32*2], m0
    add                 rax, 32*4
    dec                 r0d
    jg .zero_loop
    RET

%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
%if %1 & 1
    mova                m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
    mova                m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
%else
    mova                m%5, [tmp1q-32*(45-%1)]
    mova                m%4, [tmp2q-32*(20+%1)]
%endif
    psubsw              m%6, m%5, m%4 ; idct32 out31-n
    paddsw              m%5, m%4      ; idct32 out 0+n
    psubsw              m%4, m%6, m%3 ; out32+n
    paddsw              m%6, m%3      ; out31-n
    psubsw              m%3, m%5, m%2 ; out63-n
    paddsw              m%5, m%2      ; out 0+n
%if %0 == 6 ; pass 1
%if %1 & 1
    mova [tmp2q-32*(19-%1)], m%4
    mova [tmp1q-32*(14+%1)], m%6
    mova [tmp1q+32*(18-%1)], m%3
    mova [tmp2q-32*(51-%1)], m%5
%else
    mova [tmp1q-32*(13-%1)], m%4
    mova [tmp2q-32*(20+%1)], m%6
    mova [tmp2q+32*(12-%1)], m%3
    mova [tmp1q-32*(45-%1)], m%5
%endif
%else ; pass 2
    REPX  {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
%if %1 & 1
    %define %%d0 r2
    %define %%d1 dstq
%else
    %define %%d0 dstq
    %define %%d1 r2
%endif
    pmovzxbw            m%2, [%%d0+%9 ]
    paddw               m%2, m%4
    pmovzxbw            m%4, [%%d1+%8 ]
    paddw               m%4, m%6
    pmovzxbw            m%6, [%%d1+%10]
    paddw               m%3, m%6
    pmovzxbw            m%6, [%%d0+%7 ]
    paddw               m%5, m%6
    packuswb            m%2, m%4
    packuswb            m%3, m%5
    vpermq              m%2, m%2, q3120
    vpermq              m%3, m%3, q3120
    mova         [%%d0+%9 ], xm%2
    vextracti128 [%%d1+%8 ], m%2, 1
    mova         [%%d1+%10], xm%3
    vextracti128 [%%d0+%7 ], m%3, 1
%endif
%endmacro

cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jnz .normal
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_8192)]
    mov                [cq], eobd
    mov                 r2d, 32
    jmp m(inv_txfm_add_dct_dct_16x4).dconly
.normal:
    PROLOGUE              0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
    %undef cmp
    lea               tmp1q, [rsp+32*23]
    lea               tmp2q, [tmp1q+32*24]
    sub                eobd, 151
    mov                 r7d, eobd
.pass1_loop:
    LOAD_16ROWS          cq, 64
    call m(idct_16x16_internal).main
    mova                 m1, [rsp+32*1]
    mova         [rsp+32*0], m6
    mova         [rsp+32*1], m7
    vpbroadcastd         m7, [o(pw_8192)]
    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
    mova                m15, [rsp+32*0]
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m2
    mova       [tmp1q-32*2], m4
    mova       [tmp1q-32*1], m6
    mova       [tmp1q+32*0], m8
    mova       [tmp1q+32*1], m10
    mova       [tmp1q+32*2], m12
    mova       [tmp1q+32*3], m14
    mova       [tmp2q-32*4], m1
    mova       [tmp2q-32*3], m3
    mova       [tmp2q-32*2], m5
    mova       [tmp2q-32*1], m7
    mova       [tmp2q+32*0], m9
    mova       [tmp2q+32*1], m11
    mova       [tmp2q+32*2], m13
    mova       [tmp2q+32*3], m15
    add                  cq, 32
    add               tmp1q, 32*8
    add               tmp2q, 32*8
    add                eobd, 0x80000000
    jnc .pass1_loop
    lea                  r2, [rsp+32*23]
    mova                xm0,     [r2-32*4+ 0]
    mova                xm1,     [r2-32*2+ 0]
    vinserti128          m0, m0, [r2+32*0+ 0], 1
    vinserti128          m1, m1, [r2+32*2+ 0], 1
    mova                xm2,     [r2-32*4+16]
    mova                xm3,     [r2-32*2+16]
    vinserti128          m2, m2, [r2+32*0+16], 1
    vinserti128          m3, m3, [r2+32*2+16], 1
    pxor                 m4, m4
    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
    test                r7d, r7d
    jl .fast
    lea                  r3, [r2+32*8]
    mova                xm4,     [r3-32*4+ 0]
    mova                xm5,     [r3-32*2+ 0]
    vinserti128          m4, m4, [r3+32*0+ 0], 1
    vinserti128          m5, m5, [r3+32*2+ 0], 1
    mova                xm6,     [r3-32*4+16]
    mova                xm7,     [r3-32*2+16]
    vinserti128          m6, m6, [r3+32*0+16], 1
    vinserti128          m7, m7, [r3+32*2+16], 1
.fast:
    mova              [rsp], m8
    lea               tmp1q, [rsp+32*7]
    call m(idct_16x16_internal).main
    mova                 m1, [rsp+32*1]
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m1
    mova       [tmp1q-32*2], m2
    mova       [tmp1q-32*1], m3
    mova       [tmp1q+32*0], m4
    mova       [tmp1q+32*1], m5
    mova       [tmp1q+32*2], m6
    mova       [tmp1q+32*3], m7
    add               tmp1q, 32*8
    mova       [tmp1q-32*4], m8
    mova       [tmp1q-32*3], m9
    mova       [tmp1q-32*2], m10
    mova       [tmp1q-32*1], m11
    mova       [tmp1q+32*0], m12
    mova       [tmp1q+32*1], m13
    mova       [tmp1q+32*2], m14
    mova       [tmp1q+32*3], m15
    mova                xm0,     [r2-32*3+ 0]
    mova                xm1,     [r2-32*1+ 0]
    vinserti128          m0, m0, [r2+32*1+ 0], 1
    vinserti128          m1, m1, [r2+32*3+ 0], 1
    mova                xm2,     [r2-32*3+16]
    mova                xm3,     [r2-32*1+16]
    vinserti128          m2, m2, [r2+32*1+16], 1
    vinserti128          m3, m3, [r2+32*3+16], 1
    pxor                 m4, m4
    REPX       {mova x, m4}, m5, m6, m7
    test                r7d, r7d
    jl .fast2
    mova                xm4,     [r3-32*3+ 0]
    mova                xm5,     [r3-32*1+ 0]
    vinserti128          m4, m4, [r3+32*1+ 0], 1
    vinserti128          m5, m5, [r3+32*3+ 0], 1
    mova                xm6,     [r3-32*3+16]
    mova                xm7,     [r3-32*1+16]
    vinserti128          m6, m6, [r3+32*1+16], 1
    vinserti128          m7, m7, [r3+32*3+16], 1
.fast2:
    add               tmp1q, 32*8
    lea               tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    add                  r2, 32*24
    vpbroadcastd        m15, [o(pd_2048)]
    add               tmp1q, 32*16
    add               tmp2q, 32*32
    mova                xm0,     [r2-32*4+ 0]
    mova                xm3,     [r2-32*1+16]
    vinserti128          m0, m0, [r2+32*0+ 0], 1
    vinserti128          m3, m3, [r2+32*3+16], 1
    mova                xm4,     [r2-32*4+16]
    mova                xm7,     [r2-32*1+ 0]
    vinserti128          m4, m4, [r2+32*0+16], 1
    vinserti128          m7, m7, [r2+32*3+ 0], 1
    pxor                 m1, m1
    REPX       {mova x, m1}, m2, m5, m6
    test                r7d, r7d
    jl .fast3
    add                  r3, 32*24
    mova                xm1,     [r3-32*1+16]
    mova                xm2,     [r3-32*4+ 0]
    vinserti128          m1, m1, [r3+32*3+16], 1
    vinserti128          m2, m2, [r3+32*0+ 0], 1
    mova                xm5,     [r3-32*1+ 0]
    mova                xm6,     [r3-32*4+16]
    vinserti128          m5, m5, [r3+32*3+ 0], 1
    vinserti128          m6, m6, [r3+32*0+16], 1
.fast3:
    add                 rax, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    add                 rax, 8
    add               tmp1q, 32*8
    sub               tmp2q, 32*8
    mova                xm0,     [r2-32*2+ 0]
    mova                xm3,     [r2-32*3+16]
    vinserti128          m0, m0, [r2+32*2+ 0], 1
    vinserti128          m3, m3, [r2+32*1+16], 1
    mova                xm4,     [r2-32*2+16]
    mova                xm7,     [r2-32*3+ 0]
    vinserti128          m4, m4, [r2+32*2+16], 1
    vinserti128          m7, m7, [r2+32*1+ 0], 1
    pxor                 m1, m1
    REPX       {mova x, m1}, m2, m5, m6
    test                r7d, r7d
    jl .fast4
    mova                xm1,     [r3-32*3+16]
    mova                xm2,     [r3-32*2+ 0]
    vinserti128          m1, m1, [r3+32*1+16], 1
    vinserti128          m2, m2, [r3+32*2+ 0], 1
    mova                xm5,     [r3-32*3+ 0]
    mova                xm6,     [r3-32*2+16]
    vinserti128          m5, m5, [r3+32*1+ 0], 1
    vinserti128          m6, m6, [r3+32*2+16], 1
.fast4:
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
    RET
ALIGN function_align
%define o_base idct64_mul - 8
.main_part1:
    ; idct64 steps 1-5:
    ; in1/31/17/15/ 9/23/25/ 7 ->
    ;     t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
    ; in5/27/21/11/13/19/29/ 3 ->
    ;     t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
    vpbroadcastd        m11, [o(idct64_mul+4* 0)]
    vpbroadcastd        m13, [o(idct64_mul+4* 1)]
    vpbroadcastd        m10, [o(idct64_mul+4* 4)]
    vpbroadcastd        m12, [o(idct64_mul+4* 5)]
    pmulhrsw            m11, m0  ; t63a
    pmulhrsw             m0, m13 ; t32a
    pmulhrsw            m10, m1  ; t62a
    pmulhrsw             m1, m12 ; t33a
    vpbroadcastd         m9, [o(idct64_mul+4* 8)]
    vpbroadcastd        m13, [o(idct64_mul+4* 9)]
    vpbroadcastd         m8, [o(idct64_mul+4*12)]
    vpbroadcastd        m12, [o(idct64_mul+4*13)]
    pmulhrsw             m9, m2  ; t61a
    pmulhrsw             m2, m13 ; t34a
    pmulhrsw             m8, m3  ; t60a
    pmulhrsw             m3, m12 ; t35a
    psubsw              m12, m0, m1   ; t33
    paddsw               m0, m1       ; t32
    psubsw               m1, m3, m2   ; t34
    paddsw               m3, m2       ; t35
    psubsw               m2, m8, m9   ; t61
    paddsw               m8, m9       ; t60
    psubsw               m9, m11, m10 ; t62
    paddsw              m11, m10      ; t63
    ITX_MULSUB_2W         2,  1, 10, 13, 15, m4076, 401 ; t34a, t61a
    vpbroadcastd        m14, [o(pw_401_4076)]
    ITX_MULSUB_2W         9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
    psubsw              m10, m0, m3  ; t35a
    paddsw               m0, m3      ; t32a
    psubsw               m3, m11, m8 ; t60a
    paddsw              m11, m8      ; t63a
    psubsw               m8, m9, m2  ; t34
    paddsw               m9, m2      ; t33
    psubsw               m2, m12, m1 ; t61
    paddsw              m12, m1      ; t62
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m9
    mova       [tmp2q+32*2], m12
    mova       [tmp2q+32*3], m11
    vpbroadcastd        m13, [o(pw_m4017_799)]
    vpbroadcastd        m14, [o(pw_799_4017)]
    ITX_MULSUB_2W         2,  8,  0,  1, 15, 14, 13 ; t34a, t61a
    ITX_MULSUB_2W         3, 10,  0,  1, 15, 14, 13 ; t35,  t60
    mova       [tmp1q-32*2], m2
    mova       [tmp1q-32*1], m3
    mova       [tmp2q+32*0], m10
    mova       [tmp2q+32*1], m8
    vpbroadcastd         m3, [o(idct64_mul+4*16)]
    vpbroadcastd        m11, [o(idct64_mul+4*17)]
    vpbroadcastd         m2, [o(idct64_mul+4*20)]
    vpbroadcastd        m10, [o(idct64_mul+4*21)]
    vpbroadcastd         m1, [o(idct64_mul+4*24)]
    vpbroadcastd         m9, [o(idct64_mul+4*25)]
    vpbroadcastd         m0, [o(idct64_mul+4*28)]
    vpbroadcastd         m8, [o(idct64_mul+4*29)]
    pmulhrsw             m3, m4  ; t59a
    pmulhrsw             m4, m11 ; t36a
    pmulhrsw             m2, m5  ; t58a
    pmulhrsw             m5, m10 ; t37a
    pmulhrsw             m1, m6  ; t57a
    pmulhrsw             m6, m9  ; t38a
    pmulhrsw             m0, m7  ; t56a
    pmulhrsw             m7, m8  ; t39a
    psubsw               m8, m4, m5 ; t37
    paddsw               m4, m5     ; t36
    psubsw               m5, m7, m6 ; t38
    paddsw               m7, m6     ; t39
    psubsw               m6, m0, m1 ; t57
    paddsw               m0, m1     ; t56
    psubsw               m1, m3, m2 ; t58
    paddsw               m3, m2     ; t59
    ITX_MULSUB_2W         6,  5,  2,  9, 15, m2598, 3166 ; t38a, t57a
    vpbroadcastd        m10, [o(pw_3166_2598)]
    ITX_MULSUB_2W         1,  8,  2,  9, 15, 10,  9 ; t37a, t58a
    psubsw               m2, m7, m4 ; t36a
    paddsw               m7, m4     ; t39a
    psubsw               m4, m0, m3 ; t59a
    paddsw               m0, m3     ; t56a
    psubsw               m3, m6, m1 ; t37
    paddsw               m6, m1     ; t38
    psubsw               m1, m5, m8 ; t58
    paddsw               m5, m8     ; t57
    mova       [tmp1q+32*2], m6
    mova       [tmp1q+32*3], m7
    mova       [tmp2q-32*4], m0
    mova       [tmp2q-32*3], m5
    vpbroadcastd         m6, [o(pw_m799_m4017)]
    vpbroadcastd         m7, [o(pw_m4017_799)]
    ITX_MULSUB_2W         4,  2,  0,  5, 15,  7,  6 ; t36,  t59
    ITX_MULSUB_2W         1,  3,  0,  5, 15,  7,  6 ; t37a, t58a
    mova       [tmp1q+32*0], m4
    mova       [tmp1q+32*1], m1
    mova       [tmp2q-32*2], m3
    mova       [tmp2q-32*1], m2
    ret
%define o_base pw_5 + 128
.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
    sub                 rax, o_idct64_offset + 8
    vpbroadcastd        m11, [o(pw_1567_3784)]
    vpbroadcastd        m12, [o(pw_m3784_1567)]
    vpbroadcastd        m13, [o(pw_m1567_m3784)]
    vpbroadcastd        m14, [o(pw_2896x8)]
.main_part2_pass1_loop:
    call .main_part2_internal
    REPX  {pmulhrsw x, m14}, m1, m2, m4, m3
    IDCT64_PART2_END      0,  7,  0,  6,  9, 10
    IDCT64_PART2_END      7,  8,  5,  0,  6,  7
    IDCT64_PART2_END      8,  2,  1,  0,  6,  7
    IDCT64_PART2_END     15,  3,  4,  0,  6,  7
    cmp               tmp1q, tmp2q
    jne .main_part2_pass1_loop
    ret
.main_part2_internal:
    mova                 m0, [tmp1q-32*12] ; t32a
    mova                 m1, [tmp2q-32*13] ; t39a
    mova                 m2, [tmp1q-32* 4] ; t40a
    mova                 m5, [tmp2q+32* 3] ; t55a
    add               tmp1q, 32
    sub               tmp2q, 32
    mova                 m4, [tmp1q+32* 3] ; t48a
    mova                 m3, [tmp2q-32* 4] ; t47a
    mova                 m6, [tmp1q+32*11] ; t56a
    mova                 m7, [tmp2q+32*12] ; t63a
    psubsw               m8, m0, m1 ; t39
    paddsw               m0, m1     ; t32
    psubsw               m1, m3, m2 ; t40
    paddsw               m3, m2     ; t47
    psubsw               m2, m4, m5 ; t55
    paddsw               m4, m5     ; t48
    psubsw               m5, m7, m6 ; t56
    paddsw               m7, m6     ; t63
    ITX_MULSUB_2W         5,  8,  6,  9, 15, 11, 12 ; t39a, t56a
    ITX_MULSUB_2W         2,  1,  6,  9, 15, 12, 13 ; t40a, t55a
    psubsw               m6, m0, m3 ; t47a
    paddsw               m0, m3     ; t32a
    psubsw               m3, m7, m4 ; t48a
    paddsw               m7, m4     ; t63a
    psubsw               m4, m5, m2 ; t40
    paddsw               m5, m2     ; t39
    psubsw               m2, m8, m1 ; t55
    paddsw               m8, m1     ; t56
    psubw                m1, m2, m4 ; t40a
    paddw                m2, m4     ; t55a
    psubw                m4, m3, m6 ; t47
    paddw                m3, m6     ; t48
    ret
.main_part2_pass2:
    sub                 rax, o_idct64_offset + 8
    vpbroadcastd        m11, [o(pw_1567_3784)]
    vpbroadcastd        m12, [o(pw_m3784_1567)]
    vpbroadcastd        m13, [o(pw_m1567_m3784)]
    vpbroadcastd        m14, [o(pw_2048)]
    lea                  r9, [strideq*5]    ; stride*5
    lea                  r3, [r9+strideq*1] ; stride*6
    lea                  r7, [r9+strideq*2] ; stride*7
    lea                  r8, [r3+strideq*2] ; stride*8
    lea                  r2, [dstq+r7]
.main_part2_pass2_loop:
    call .main_part2_internal
    vpbroadcastd        m10, [o(pw_2896x8)]
    REPX  {pmulhrsw x, m10}, m1, m2, m4, m3
    IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*4, r7*8
    IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*4, r7*8
    IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
    IDCT64_PART2_END     15,  3,  4,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
    add                dstq, strideq
    sub                  r2, strideq
    cmp               tmp1q, tmp2q
    jne .main_part2_pass2_loop
    ret

cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jnz .normal
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_8192)]
    mov                [cq], eobd
    mov                 r2d, 16
.dconly:
    pmulhrsw            xm0, xm2
    movd                xm2, [o(pw_2048)]
    pmulhrsw            xm0, xm1
    pmulhrsw            xm0, xm2
    vpbroadcastw         m0, xm0
    pxor                 m1, m1
.dconly_loop:
    mova                 m2, [dstq+32*0]
    mova                 m3, [dstq+32*1]
    punpckhbw            m4, m2, m1
    punpcklbw            m2, m1
    punpckhbw            m5, m3, m1
    punpcklbw            m3, m1
    paddw                m4, m0
    paddw                m2, m0
    paddw                m5, m0
    paddw                m3, m0
    packuswb             m2, m4
    packuswb             m3, m5
    mova        [dstq+32*0], m2
    mova        [dstq+32*1], m3
    add                dstq, strideq
    dec                 r2d
    jg .dconly_loop
    RET
.normal:
    PROLOGUE              0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
    LOAD_8ROWS      cq+32*0, 32*4
    pxor                 m8, m8
    REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova              [rsp], m8
    lea               tmp1q, [rsp+32*7]
    call m(idct_16x16_internal).main
    mova                 m1, [rsp+32*1]
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m1
    mova       [tmp1q-32*2], m2
    mova       [tmp1q-32*1], m3
    mova       [tmp1q+32*0], m4
    mova       [tmp1q+32*1], m5
    mova       [tmp1q+32*2], m6
    mova       [tmp1q+32*3], m7
    add               tmp1q, 32*8
    mova       [tmp1q-32*4], m8
    mova       [tmp1q-32*3], m9
    mova       [tmp1q-32*2], m10
    mova       [tmp1q-32*1], m11
    mova       [tmp1q+32*0], m12
    mova       [tmp1q+32*1], m13
    mova       [tmp1q+32*2], m14
    mova       [tmp1q+32*3], m15
    LOAD_8ROWS      cq+32*2, 32*4
    pxor                 m8, m8
    REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
    add               tmp1q, 32*8
    lea               tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    vpbroadcastd        m15, [o(pd_2048)]
    add               tmp1q, 32*16
    add               tmp2q, 32*32
    mova                 m0, [cq+32* 1]
    mova                 m1, [cq+32*31]
    mova                 m2, [cq+32*17]
    mova                 m3, [cq+32*15]
    mova                 m4, [cq+32* 9]
    mova                 m5, [cq+32*23]
    mova                 m6, [cq+32*25]
    mova                 m7, [cq+32* 7]
    pxor                 m8, m8
    REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
    add                 rax, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    add                 rax, 8
    add               tmp1q, 32*8
    sub               tmp2q, 32*8
    mova                 m0, [cq+32* 5]
    mova                 m1, [cq+32*27]
    mova                 m2, [cq+32*21]
    mova                 m3, [cq+32*11]
    mova                 m4, [cq+32*13]
    mova                 m5, [cq+32*19]
    mova                 m6, [cq+32*29]
    mova                 m7, [cq+32* 3]
    pxor                 m8, m8
    REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
    sub               tmp1q, 32*36
    lea                  r2, [strideq*3]
    mov               tmp2d, 4
.pass2_loop:
    lea                  r3, [tmp1q-32*8]
    mova                xm0,      [r3   -32*4]
    mova                xm1,      [r3   -32*3]
    vinserti128          m0, m0,  [tmp1q-32*4], 1
    vinserti128          m1, m1,  [tmp1q-32*3], 1
    mova                xm2,      [r3   -32*2]
    mova                xm3,      [r3   -32*1]
    vinserti128          m2, m2,  [tmp1q-32*2], 1
    vinserti128          m3, m3,  [tmp1q-32*1], 1
    mova                xm4,      [r3   +32*0]
    mova                xm5,      [r3   +32*1]
    vinserti128          m4, m4,  [tmp1q+32*0], 1
    vinserti128          m5, m5,  [tmp1q+32*1], 1
    mova                xm6,      [r3   +32*2]
    mova                xm7,      [r3   +32*3]
    vinserti128          m6, m6,  [tmp1q+32*2], 1
    vinserti128          m7, m7,  [tmp1q+32*3], 1
    mova                xm8,      [r3   -32*4+16]
    mova                xm9,      [r3   -32*3+16]
    vinserti128          m8, m8,  [tmp1q-32*4+16], 1
    vinserti128          m9, m9,  [tmp1q-32*3+16], 1
    mova               xm10,      [r3   -32*2+16]
    mova               xm11,      [r3   -32*1+16]
    vinserti128         m10, m10, [tmp1q-32*2+16], 1
    vinserti128         m11, m11, [tmp1q-32*1+16], 1
    mova               xm12,      [r3   +32*0+16]
    mova               xm13,      [r3   +32*1+16]
    vinserti128         m12, m12, [tmp1q+32*0+16], 1
    vinserti128         m13, m13, [tmp1q+32*1+16], 1
    mova               xm14,      [r3   +32*2+16]
    mova               xm15,      [r3   +32*3+16]
    vinserti128         m14, m14, [tmp1q+32*2+16], 1
    vinserti128         m15, m15, [tmp1q+32*3+16], 1
    mova         [rsp+32*0], m6
    mova         [rsp+32*1], m7
    vpbroadcastd         m7, [o(pw_8192)]
    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
    call m(idct_16x16_internal).main
    mova         [rsp+32*0], m15
    vpbroadcastd        m15, [o(pw_2048)]
    REPX  {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
    pmulhrsw             m1, m15, [rsp+32*1]
    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
    lea                  r3, [dstq+strideq*4]
    %define dstq r3
    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
    lea                  r3, [r3+strideq*4]
    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
    pmulhrsw            m15, [rsp+32*0]
    lea                  r3, [r3+strideq*4]
    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
    add               tmp1q, 32*16
    add                  r0, 16
    dec               tmp2d
    jg .pass2_loop
    RET

cglobal inv_txfm_add_dct_dct_32x64, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jnz .normal
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    mov                 r2d, 64
    jmp m(inv_txfm_add_dct_dct_32x8).dconly
.normal:
    PROLOGUE              0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
    lea               tmp1q, [rsp+32*7]
    lea                r10d, [eobq-136]
    sar                r10d, 31
.pass1_loop:
    lea               tmp2q, [tmp1q+32*16]
    LOAD_8ROWS      cq+64*1, 64*2, 1
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
    test               r10b, r10b
    jnz .fast
    LOAD_8ROWS_H   cq+64*17, 64*2, 2
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
    LOAD_8ROWS_H   cq+64*16, 64*2, 1
    mova              [rsp], m15
    pxor                m15, m15
    REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
                                24, 25, 26, 27, 28, 29, 30, 31
    jmp .idct16
.fast:
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    pxor                 m8, m8
    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova              [rsp], m8
.idct16:
    LOAD_8ROWS      cq+64*0, 64*2, 1
    pxor                m15, m15
    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
    call m(idct_16x16_internal).main
    call m(inv_txfm_add_dct_dct_32x16).pass1_end
    vpbroadcastd         m7, [o(pw_16384)]
    call m(inv_txfm_add_dct_dct_16x32).transpose_2x8x8_round
    lea                  r3, [tmp1q+32*48]
    mova                m15, [rsp]
    mova          [r3-32*4], m0
    mova          [r3-32*3], m2
    mova          [r3-32*2], m4
    mova          [r3-32*1], m6
    mova          [r3+32*0], m8
    mova          [r3+32*1], m10
    mova          [r3+32*2], m12
    mova          [r3+32*3], m14
    add                  r3, 32*24
    mova          [r3-32*4], m1
    mova          [r3-32*3], m3
    mova          [r3-32*2], m5
    mova          [r3-32*1], m7
    mova          [r3+32*0], m9
    mova          [r3+32*1], m11
    mova          [r3+32*2], m13
    mova          [r3+32*3], m15
    vpbroadcastd         m9, [o(pw_16384)]
    pmulhrsw             m0, m9, [tmp1q-32*4]
    pmulhrsw             m1, m9, [tmp1q-32*3]
    pmulhrsw             m2, m9, [tmp1q-32*2]
    pmulhrsw             m3, m9, [tmp1q-32*1]
    pmulhrsw             m4, m9, [tmp1q+32*0]
    pmulhrsw             m5, m9, [tmp1q+32*1]
    pmulhrsw             m6, m9, [tmp1q+32*2]
    pmulhrsw             m7, m9, [tmp1q+32*3]
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    mova       [tmp1q-32*4], m0
    pmulhrsw             m0, m9, [tmp2q-32*4]
    mova       [tmp2q-32*4], m1
    pmulhrsw             m1, m9, [tmp2q-32*3]
    mova       [tmp1q-32*3], m2
    pmulhrsw             m2, m9, [tmp2q-32*2]
    mova       [tmp2q-32*3], m3
    pmulhrsw             m3, m9, [tmp2q-32*1]
    mova       [tmp1q-32*2], m4
    pmulhrsw             m4, m9, [tmp2q+32*0]
    mova       [tmp2q-32*2], m5
    pmulhrsw             m5, m9, [tmp2q+32*1]
    mova       [tmp1q-32*1], m6
    pmulhrsw             m6, m9, [tmp2q+32*2]
    mova       [tmp2q-32*1], m7
    pmulhrsw             m7, m9, [tmp2q+32*3]
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    mova       [tmp1q+32*0], m0
    mova       [tmp2q+32*0], m1
    mova       [tmp1q+32*1], m2
    mova       [tmp2q+32*1], m3
    mova       [tmp1q+32*2], m4
    mova       [tmp2q+32*2], m5
    mova       [tmp1q+32*3], m6
    mova       [tmp2q+32*3], m7
    add                  cq, 32
    add               tmp1q, 32*8
    add                r10d, 0x80000000
    jnc .pass1_loop
    lea                  r2, [rsp+32*55]
    lea                  r7, [r2+32*24]
.pass2_loop:
    lea                  r3, [r2+32*8]
    lea                  r8, [r7+32*8]
    mova                 m0, [r2-32*4]
    mova                 m1, [r2-32*2]
    mova                 m2, [r2+32*0]
    mova                 m3, [r2+32*2]
    pxor                 m4, m4
    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
    test               r10b, r10b
    jnz .fast2
    mova                 m4, [r3-32*4]
    mova                 m5, [r3-32*2]
    mova                 m6, [r3+32*0]
    mova                 m7, [r3+32*2]
.fast2:
    mova              [rsp], m8
    lea               tmp1q, [rsp+32*39]
    call m(idct_16x16_internal).main
    mova                 m1, [rsp+32*1]
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m1
    mova       [tmp1q-32*2], m2
    mova       [tmp1q-32*1], m3
    mova       [tmp1q+32*0], m4
    mova       [tmp1q+32*1], m5
    mova       [tmp1q+32*2], m6
    mova       [tmp1q+32*3], m7
    add               tmp1q, 32*8
    mova       [tmp1q-32*4], m8
    mova       [tmp1q-32*3], m9
    mova       [tmp1q-32*2], m10
    mova       [tmp1q-32*1], m11
    mova       [tmp1q+32*0], m12
    mova       [tmp1q+32*1], m13
    mova       [tmp1q+32*2], m14
    mova       [tmp1q+32*3], m15
    mova                 m0, [r2-32*3]
    mova                 m1, [r2-32*1]
    mova                 m2, [r2+32*1]
    mova                 m3, [r2+32*3]
    pxor                 m4, m4
    REPX       {mova x, m4}, m5, m6, m7
    test               r10b, r10b
    jnz .fast3
    mova                 m4, [r3-32*3]
    mova                 m5, [r3-32*1]
    mova                 m6, [r3+32*1]
    mova                 m7, [r3+32*3]
.fast3:
    add               tmp1q, 32*8
    lea               tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    vpbroadcastd        m15, [o(pd_2048)]
    add               tmp1q, 32*16
    add               tmp2q, 32*32
    mova                 m0, [r7-32*4]
    mova                 m3, [r7+32*3]
    mova                 m4, [r7+32*0]
    mova                 m7, [r7-32*1]
    pxor                 m1, m1
    REPX       {mova x, m1}, m2, m5, m6
    test               r10b, r10b
    jnz .fast4
    mova                 m1, [r8+32*3]
    mova                 m2, [r8-32*4]
    mova                 m5, [r8-32*1]
    mova                 m6, [r8+32*0]
.fast4:
    add                 rax, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    add                 rax, 8
    add               tmp1q, 32*8
    sub               tmp2q, 32*8
    mova                 m0, [r7-32*2]
    mova                 m3, [r7+32*1]
    mova                 m4, [r7+32*2]
    mova                 m7, [r7-32*3]
    pxor                 m1, m1
    REPX       {mova x, m1}, m2, m5, m6
    test               r10b, r10b
    jnz .fast5
    mova                 m1, [r8+32*1]
    mova                 m2, [r8-32*2]
    mova                 m5, [r8-32*3]
    mova                 m6, [r8+32*2]
.fast5:
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
    add                r10d, 0x80000000
    jc .ret
    lea                  r2, [rsp+32*7]
    lea                  r7, [r2+32*16]
    sub                dstq, r8
    lea                dstq, [dstq+strideq*4+16]
    jmp .pass2_loop
.ret:
    RET

cglobal inv_txfm_add_dct_dct_64x32, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jnz .normal
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_16384)]
    mov                [cq], eobd
    pmulhrsw            xm0, xm1
    mov                 r2d, 32
    jmp m(inv_txfm_add_dct_dct_64x16).dconly
.normal:
    PROLOGUE              0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
                                            base, tmp3, tmp4
    lea               tmp1q, [rsp+32*7]
    lea               tmp4d, [eobq-136]
.pass1_loop:
    LOAD_8ROWS      cq+64*0, 64*4, 1
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova              [rsp], m8
    call m(idct_16x16_internal).main
    mova                 m1, [rsp+32*1]
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m1
    mova       [tmp1q-32*2], m2
    mova       [tmp1q-32*1], m3
    mova       [tmp1q+32*0], m4
    mova       [tmp1q+32*1], m5
    mova       [tmp1q+32*2], m6
    mova       [tmp1q+32*3], m7
    add               tmp1q, 32*8
    mova       [tmp1q-32*4], m8
    mova       [tmp1q-32*3], m9
    mova       [tmp1q-32*2], m10
    mova       [tmp1q-32*1], m11
    mova       [tmp1q+32*0], m12
    mova       [tmp1q+32*1], m13
    mova       [tmp1q+32*2], m14
    mova       [tmp1q+32*3], m15
    LOAD_8ROWS      cq+64*2, 64*4, 1
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
    add               tmp1q, 32*8
    lea               tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    vpbroadcastd        m15, [o(pd_2048)]
    add               tmp1q, 32*16
    add               tmp2q, 32*32
    vpbroadcastd         m7, [o(pw_2896x8)]
    pmulhrsw             m0, m7, [cq+64* 1]
    pmulhrsw             m1, m7, [cq+64*31]
    pmulhrsw             m2, m7, [cq+64*17]
    pmulhrsw             m3, m7, [cq+64*15]
    pmulhrsw             m4, m7, [cq+64* 9]
    pmulhrsw             m5, m7, [cq+64*23]
    pmulhrsw             m6, m7, [cq+64*25]
    pmulhrsw             m7,     [cq+64* 7]
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
    add                 rax, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    vpbroadcastd         m7, [o(pw_2896x8-(o_idct64_offset))]
    add                 rax, 8
    add               tmp1q, 32*8
    sub               tmp2q, 32*8
    pmulhrsw             m0, m7, [cq+64* 5]
    pmulhrsw             m1, m7, [cq+64*27]
    pmulhrsw             m2, m7, [cq+64*21]
    pmulhrsw             m3, m7, [cq+64*11]
    pmulhrsw             m4, m7, [cq+64*13]
    pmulhrsw             m5, m7, [cq+64*19]
    pmulhrsw             m6, m7, [cq+64*29]
    pmulhrsw             m7,     [cq+64* 3]
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
    sub               tmp1q, 32*44
    vpbroadcastd        m10, [o(pw_16384)]
    call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave
    add                  cq, 32
    add               tmp4d, 0x80000000
    jnc .pass1_loop
    lea               tmp1q, [rsp+32*15]
    imul                 r2, strideq, 19
    lea                  r3, [strideq*3]
    add                  r2, dstq
    mov               tmp4b, 4
.pass2_loop:
    lea               tmp2q, [tmp1q+32*64]
    LOAD_8ROWS   tmp1q-32*4, 32
    test              tmp4d, 0x40000000
    jnz .fast
    LOAD_8ROWS_H tmp2q-32*4, 32
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf
    lea               tmp3q, [tmp2q-32*8]
    LOAD_8ROWS_H tmp3q-32*4, 32
    mova              [rsp], m15
    jmp .idct16
.fast:
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    pxor                 m8, m8
    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova              [rsp], m8
.idct16:
    lea               tmp3q, [tmp1q-32*8]
    LOAD_8ROWS   tmp3q-32*4, 32
    call m(idct_16x16_internal).main
    call m(inv_txfm_add_dct_dct_16x32).pass2_end
    add               tmp1q, 32*16
    sub                dstq, r3
    lea                  r2, [r2+r3+16]
    add                dstq, 16
    dec               tmp4b
    jg .pass2_loop
    RET
ALIGN function_align
.transpose_round_interleave:
    mov               tmp3d, 4
.loop:
    lea               tmp2q, [tmp1q+32*8]
    mova                xm0,      [tmp1q-32*4]
    mova                xm1,      [tmp1q-32*3]
    vinserti128          m0, m0,  [tmp2q-32*4], 1
    vinserti128          m1, m1,  [tmp2q-32*3], 1
    mova                xm2,      [tmp1q-32*2]
    mova                xm3,      [tmp1q-32*1]
    vinserti128          m2, m2,  [tmp2q-32*2], 1
    vinserti128          m3, m3,  [tmp2q-32*1], 1
    mova                xm4,      [tmp1q+32*0]
    mova                xm5,      [tmp1q+32*1]
    vinserti128          m4, m4,  [tmp2q+32*0], 1
    vinserti128          m5, m5,  [tmp2q+32*1], 1
    mova                xm6,      [tmp1q+32*2]
    mova                xm7,      [tmp1q+32*3]
    vinserti128          m6, m6,  [tmp2q+32*2], 1
    vinserti128          m7, m7,  [tmp2q+32*3], 1
    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    mova                xm8,      [tmp1q-32*4+16]
    mova                xm9,      [tmp1q-32*3+16]
    vinserti128          m8, m8,  [tmp2q-32*4+16], 1
    vinserti128          m9, m9,  [tmp2q-32*3+16], 1
    mova       [tmp1q-32*4], m0
    mova       [tmp2q-32*4], m1
    mova       [tmp1q-32*3], m2
    mova       [tmp2q-32*3], m3
    mova                xm2,     [tmp1q-32*2+16]
    mova                xm3,     [tmp1q-32*1+16]
    vinserti128          m2, m2, [tmp2q-32*2+16], 1
    vinserti128          m3, m3, [tmp2q-32*1+16], 1
    mova       [tmp1q-32*2], m4
    mova       [tmp2q-32*2], m5
    mova       [tmp1q-32*1], m6
    mova       [tmp2q-32*1], m7
    mova                xm4,     [tmp1q+32*0+16]
    mova                xm5,     [tmp1q+32*1+16]
    vinserti128          m4, m4, [tmp2q+32*0+16], 1
    vinserti128          m5, m5, [tmp2q+32*1+16], 1
    mova                xm6,     [tmp1q+32*2+16]
    mova                xm7,     [tmp1q+32*3+16]
    vinserti128          m6, m6, [tmp2q+32*2+16], 1
    vinserti128          m7, m7, [tmp2q+32*3+16], 1
    pmulhrsw             m0, m8, m10
    pmulhrsw             m1, m9, m10
    REPX  {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
    call m(inv_txfm_add_identity_identity_8x32).transpose8x8
    mova       [tmp1q+32*0], m0
    mova       [tmp2q+32*0], m1
    mova       [tmp1q+32*1], m2
    mova       [tmp2q+32*1], m3
    mova       [tmp1q+32*2], m4
    mova       [tmp2q+32*2], m5
    mova       [tmp1q+32*3], m6
    mova       [tmp2q+32*3], m7
    add               tmp1q, 32*16
    dec               tmp3d
    jg .loop
    ret

cglobal inv_txfm_add_dct_dct_64x64, 4, 4, 0, dst, stride, c, eob
    lea                 rax, [o_base]
    test               eobd, eobd
    jnz .normal
    movd                xm1, [o(pw_2896x8)]
    pmulhrsw            xm0, xm1, [cq]
    movd                xm2, [o(pw_8192)]
    mov                [cq], eobd
    mov                 r2d, 64
    jmp m(inv_txfm_add_dct_dct_64x16).dconly
.normal:
    PROLOGUE              0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
    lea               tmp1q, [rsp+32*71]
    lea                r10d, [eobq-136]
.pass1_loop:
    LOAD_8ROWS      cq+64*0, 64*4
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova              [rsp], m8
    call m(idct_16x16_internal).main
    mova                 m1, [rsp+32*1]
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m1
    mova       [tmp1q-32*2], m2
    mova       [tmp1q-32*1], m3
    mova       [tmp1q+32*0], m4
    mova       [tmp1q+32*1], m5
    mova       [tmp1q+32*2], m6
    mova       [tmp1q+32*3], m7
    add               tmp1q, 32*8
    mova       [tmp1q-32*4], m8
    mova       [tmp1q-32*3], m9
    mova       [tmp1q-32*2], m10
    mova       [tmp1q-32*1], m11
    mova       [tmp1q+32*0], m12
    mova       [tmp1q+32*1], m13
    mova       [tmp1q+32*2], m14
    mova       [tmp1q+32*3], m15
    LOAD_8ROWS      cq+64*2, 64*4
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
    add               tmp1q, 32*8
    lea               tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    vpbroadcastd        m15, [o(pd_2048)]
    add               tmp1q, 32*16
    add               tmp2q, 32*32
    mova                 m0, [cq+64* 1]
    mova                 m1, [cq+64*31]
    mova                 m2, [cq+64*17]
    mova                 m3, [cq+64*15]
    mova                 m4, [cq+64* 9]
    mova                 m5, [cq+64*23]
    mova                 m6, [cq+64*25]
    mova                 m7, [cq+64* 7]
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
    add                 rax, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    add                 rax, 8
    add               tmp1q, 32*8
    sub               tmp2q, 32*8
    mova                 m0, [cq+64* 5]
    mova                 m1, [cq+64*27]
    mova                 m2, [cq+64*21]
    mova                 m3, [cq+64*11]
    mova                 m4, [cq+64*13]
    mova                 m5, [cq+64*19]
    mova                 m6, [cq+64*29]
    mova                 m7, [cq+64* 3]
    pxor                 m8, m8
    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass1
    sub               tmp1q, 32*44
    vpbroadcastd        m10, [o(pw_8192)]
    call m(inv_txfm_add_dct_dct_64x32).transpose_round_interleave
    add                  cq, 32
    add                r10d, 0x80000000
    jnc .pass1_loop
    lea               tmp1q, [rsp+32*7]
    mov                r10b, 4
.pass2_loop:
    lea                  r2, [tmp1q+32*64]
    mova                 m0, [r2-32*4]
    mova                 m1, [r2-32*2]
    mova                 m2, [r2+32*0]
    mova                 m3, [r2+32*2]
    pxor                 m4, m4
    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
    mova              [rsp], m4
    test               r10d, 0x40000000
    jnz .fast
    lea                  r3, [r2+32*64]
    mova                 m4, [r3-32*4]
    mova                 m5, [r3-32*2]
    mova                 m6, [r3+32*0]
    mova                 m7, [r3+32*2]
.fast:
    call m(idct_16x16_internal).main
    mova                 m1, [rsp+32*1]
    mova       [tmp1q-32*4], m0
    mova       [tmp1q-32*3], m1
    mova       [tmp1q-32*2], m2
    mova       [tmp1q-32*1], m3
    mova       [tmp1q+32*0], m4
    mova       [tmp1q+32*1], m5
    mova       [tmp1q+32*2], m6
    mova       [tmp1q+32*3], m7
    add               tmp1q, 32*8
    mova       [tmp1q-32*4], m8
    mova       [tmp1q-32*3], m9
    mova       [tmp1q-32*2], m10
    mova       [tmp1q-32*1], m11
    mova       [tmp1q+32*0], m12
    mova       [tmp1q+32*1], m13
    mova       [tmp1q+32*2], m14
    mova       [tmp1q+32*3], m15
    mova                 m0, [r2-32*3]
    mova                 m1, [r2-32*1]
    mova                 m2, [r2+32*1]
    mova                 m3, [r2+32*3]
    pxor                 m4, m4
    REPX       {mova x, m4}, m5, m6, m7
    test               r10d, 0x40000000
    jnz .fast2
    mova                 m4, [r3-32*3]
    mova                 m5, [r3-32*1]
    mova                 m6, [r3+32*1]
    mova                 m7, [r3+32*3]
.fast2:
    add               tmp1q, 32*8
    lea               tmp2q, [tmp1q+32*8]
    call m(inv_txfm_add_dct_dct_16x32).main_oddhalf_fast
    vpbroadcastd        m15, [o(pd_2048)]
    add                  r2, 32*8
    add                  r3, 32*8
    add               tmp1q, 32*16
    add               tmp2q, 32*32
    mova                 m0, [r2-32*4] ;  1
    mova                 m3, [r2+32*3] ; 15
    mova                 m4, [r2+32*0] ;  9
    mova                 m7, [r2-32*1] ;  7
    pxor                 m1, m1
    REPX       {mova x, m1}, m2, m5, m6
    test               r10d, 0x40000000
    jnz .fast3
    mova                 m1, [r3+32*3] ; 31
    mova                 m2, [r3-32*4] ; 17
    mova                 m5, [r3-32*1] ; 23
    mova                 m6, [r3+32*0] ; 25
.fast3:
    add                 rax, o_idct64_offset
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    add                 rax, 8
    add               tmp1q, 32*8
    sub               tmp2q, 32*8
    mova                 m0, [r2-32*2] ;  5
    mova                 m3, [r2+32*1] ; 11
    mova                 m4, [r2+32*2] ; 13
    mova                 m7, [r2-32*3] ;  3
    pxor                 m1, m1
    REPX       {mova x, m1}, m2, m5, m6
    test               r10d, 0x40000000
    jnz .fast4
    mova                 m1, [r3+32*1] ; 27
    mova                 m2, [r3-32*2] ; 21
    mova                 m5, [r3-32*3] ; 19
    mova                 m6, [r3+32*2] ; 29
.fast4:
    call m(inv_txfm_add_dct_dct_16x64).main_part1
    call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
    sub               tmp1q, 32*28
    sub                dstq, r8
    lea                dstq, [dstq+strideq*4+16]
    dec                r10b
    jg .pass2_loop
    RET

%endif ; ARCH_X86_64