ref: a532e5aeb8498c6a4f4470363d59e5dc6cb6483f
parent: e811c4767d0c698fc674e9603e1e63d15acff16e
author: Liwei Wang <[email protected]>
date: Thu Feb 21 12:17:59 EST 2019
Add SSSE3 implementation for the 8x16 and 16x8 blocks in itx Cycle times: inv_txfm_add_8x16_adst_adst_0_8bpc_c: 5063.0 inv_txfm_add_8x16_adst_adst_0_8bpc_ssse3: 406.8 inv_txfm_add_8x16_adst_adst_1_8bpc_c: 5051.2 inv_txfm_add_8x16_adst_adst_1_8bpc_ssse3: 407.3 inv_txfm_add_8x16_adst_adst_2_8bpc_c: 5065.4 inv_txfm_add_8x16_adst_adst_2_8bpc_ssse3: 407.9 inv_txfm_add_8x16_adst_dct_0_8bpc_c: 5201.1 inv_txfm_add_8x16_adst_dct_0_8bpc_ssse3: 354.8 inv_txfm_add_8x16_adst_dct_1_8bpc_c: 5214.8 inv_txfm_add_8x16_adst_dct_1_8bpc_ssse3: 354.8 inv_txfm_add_8x16_adst_dct_2_8bpc_c: 5225.0 inv_txfm_add_8x16_adst_dct_2_8bpc_ssse3: 355.1 inv_txfm_add_8x16_adst_flipadst_0_8bpc_c: 7135.9 inv_txfm_add_8x16_adst_flipadst_0_8bpc_ssse3: 409.7 inv_txfm_add_8x16_adst_flipadst_1_8bpc_c: 8354.4 inv_txfm_add_8x16_adst_flipadst_1_8bpc_ssse3: 409.2 inv_txfm_add_8x16_adst_flipadst_2_8bpc_c: 7198.7 inv_txfm_add_8x16_adst_flipadst_2_8bpc_ssse3: 409.7 inv_txfm_add_8x16_adst_identity_0_8bpc_c: 3936.5 inv_txfm_add_8x16_adst_identity_0_8bpc_ssse3: 262.0 inv_txfm_add_8x16_adst_identity_1_8bpc_c: 4617.8 inv_txfm_add_8x16_adst_identity_1_8bpc_ssse3: 261.4 inv_txfm_add_8x16_adst_identity_2_8bpc_c: 3895.1 inv_txfm_add_8x16_adst_identity_2_8bpc_ssse3: 262.1 inv_txfm_add_8x16_dct_adst_0_8bpc_c: 5203.9 inv_txfm_add_8x16_dct_adst_0_8bpc_ssse3: 355.1 inv_txfm_add_8x16_dct_adst_1_8bpc_c: 5200.8 inv_txfm_add_8x16_dct_adst_1_8bpc_ssse3: 355.4 inv_txfm_add_8x16_dct_adst_2_8bpc_c: 5208.2 inv_txfm_add_8x16_dct_adst_2_8bpc_ssse3: 355.1 inv_txfm_add_8x16_dct_dct_0_8bpc_c: 5270.8 inv_txfm_add_8x16_dct_dct_0_8bpc_ssse3: 57.0 inv_txfm_add_8x16_dct_dct_1_8bpc_c: 5280.9 inv_txfm_add_8x16_dct_dct_1_8bpc_ssse3: 303.2 inv_txfm_add_8x16_dct_dct_2_8bpc_c: 5275.9 inv_txfm_add_8x16_dct_dct_2_8bpc_ssse3: 302.4 inv_txfm_add_8x16_dct_flipadst_0_8bpc_c: 5374.4 inv_txfm_add_8x16_dct_flipadst_0_8bpc_ssse3: 356.5 inv_txfm_add_8x16_dct_flipadst_1_8bpc_c: 5449.9 inv_txfm_add_8x16_dct_flipadst_1_8bpc_ssse3: 356.8 inv_txfm_add_8x16_dct_flipadst_2_8bpc_c: 5446.9 inv_txfm_add_8x16_dct_flipadst_2_8bpc_ssse3: 356.7 inv_txfm_add_8x16_dct_identity_0_8bpc_c: 3883.4 inv_txfm_add_8x16_dct_identity_0_8bpc_ssse3: 76.1 inv_txfm_add_8x16_dct_identity_1_8bpc_c: 3892.3 inv_txfm_add_8x16_dct_identity_1_8bpc_ssse3: 76.1 inv_txfm_add_8x16_dct_identity_2_8bpc_c: 4027.1 inv_txfm_add_8x16_dct_identity_2_8bpc_ssse3: 209.9 inv_txfm_add_8x16_flipadst_adst_0_8bpc_c: 7387.5 inv_txfm_add_8x16_flipadst_adst_0_8bpc_ssse3: 408.9 inv_txfm_add_8x16_flipadst_adst_1_8bpc_c: 7298.8 inv_txfm_add_8x16_flipadst_adst_1_8bpc_ssse3: 408.8 inv_txfm_add_8x16_flipadst_adst_2_8bpc_c: 7397.2 inv_txfm_add_8x16_flipadst_adst_2_8bpc_ssse3: 408.9 inv_txfm_add_8x16_flipadst_dct_0_8bpc_c: 5250.4 inv_txfm_add_8x16_flipadst_dct_0_8bpc_ssse3: 355.3 inv_txfm_add_8x16_flipadst_dct_1_8bpc_c: 5263.9 inv_txfm_add_8x16_flipadst_dct_1_8bpc_ssse3: 355.4 inv_txfm_add_8x16_flipadst_dct_2_8bpc_c: 5259.0 inv_txfm_add_8x16_flipadst_dct_2_8bpc_ssse3: 356.3 inv_txfm_add_8x16_flipadst_flipadst_0_8bpc_c: 5448.4 inv_txfm_add_8x16_flipadst_flipadst_0_8bpc_ssse3: 410.2 inv_txfm_add_8x16_flipadst_flipadst_1_8bpc_c: 5402.6 inv_txfm_add_8x16_flipadst_flipadst_1_8bpc_ssse3: 410.8 inv_txfm_add_8x16_flipadst_flipadst_2_8bpc_c: 6479.7 inv_txfm_add_8x16_flipadst_flipadst_2_8bpc_ssse3: 409.8 inv_txfm_add_8x16_flipadst_identity_0_8bpc_c: 3828.9 inv_txfm_add_8x16_flipadst_identity_0_8bpc_ssse3: 262.7 inv_txfm_add_8x16_flipadst_identity_1_8bpc_c: 3884.5 inv_txfm_add_8x16_flipadst_identity_1_8bpc_ssse3: 262.0 inv_txfm_add_8x16_flipadst_identity_2_8bpc_c: 3809.2 inv_txfm_add_8x16_flipadst_identity_2_8bpc_ssse3: 262.9 inv_txfm_add_8x16_identity_adst_0_8bpc_c: 4294.5 inv_txfm_add_8x16_identity_adst_0_8bpc_ssse3: 268.8 inv_txfm_add_8x16_identity_adst_1_8bpc_c: 4955.4 inv_txfm_add_8x16_identity_adst_1_8bpc_ssse3: 269.1 inv_txfm_add_8x16_identity_adst_2_8bpc_c: 4166.4 inv_txfm_add_8x16_identity_adst_2_8bpc_ssse3: 269.9 inv_txfm_add_8x16_identity_dct_0_8bpc_c: 4012.3 inv_txfm_add_8x16_identity_dct_0_8bpc_ssse3: 56.7 inv_txfm_add_8x16_identity_dct_1_8bpc_c: 4767.1 inv_txfm_add_8x16_identity_dct_1_8bpc_ssse3: 215.1 inv_txfm_add_8x16_identity_dct_2_8bpc_c: 4012.6 inv_txfm_add_8x16_identity_dct_2_8bpc_ssse3: 215.9 inv_txfm_add_8x16_identity_flipadst_0_8bpc_c: 4452.6 inv_txfm_add_8x16_identity_flipadst_0_8bpc_ssse3: 270.5 inv_txfm_add_8x16_identity_flipadst_1_8bpc_c: 4885.8 inv_txfm_add_8x16_identity_flipadst_1_8bpc_ssse3: 270.3 inv_txfm_add_8x16_identity_flipadst_2_8bpc_c: 4186.1 inv_txfm_add_8x16_identity_flipadst_2_8bpc_ssse3: 271.5 inv_txfm_add_8x16_identity_identity_0_8bpc_c: 2623.0 inv_txfm_add_8x16_identity_identity_0_8bpc_ssse3: 123.1 inv_txfm_add_8x16_identity_identity_1_8bpc_c: 2617.7 inv_txfm_add_8x16_identity_identity_1_8bpc_ssse3: 122.9 inv_txfm_add_8x16_identity_identity_2_8bpc_c: 2617.2 inv_txfm_add_8x16_identity_identity_2_8bpc_ssse3: 123.1 inv_txfm_add_16x8_adst_adst_0_8bpc_c: 5102.3 inv_txfm_add_16x8_adst_adst_0_8bpc_ssse3: 409.0 inv_txfm_add_16x8_adst_adst_1_8bpc_c: 5063.2 inv_txfm_add_16x8_adst_adst_1_8bpc_ssse3: 409.5 inv_txfm_add_16x8_adst_adst_2_8bpc_c: 5029.1 inv_txfm_add_16x8_adst_adst_2_8bpc_ssse3: 410.1 inv_txfm_add_16x8_adst_dct_0_8bpc_c: 5848.8 inv_txfm_add_16x8_adst_dct_0_8bpc_ssse3: 358.8 inv_txfm_add_16x8_adst_dct_1_8bpc_c: 5612.8 inv_txfm_add_16x8_adst_dct_1_8bpc_ssse3: 358.8 inv_txfm_add_16x8_adst_dct_2_8bpc_c: 5143.2 inv_txfm_add_16x8_adst_dct_2_8bpc_ssse3: 358.5 inv_txfm_add_16x8_adst_flipadst_0_8bpc_c: 5072.4 inv_txfm_add_16x8_adst_flipadst_0_8bpc_ssse3: 413.3 inv_txfm_add_16x8_adst_flipadst_1_8bpc_c: 5082.2 inv_txfm_add_16x8_adst_flipadst_1_8bpc_ssse3: 413.6 inv_txfm_add_16x8_adst_flipadst_2_8bpc_c: 5108.0 inv_txfm_add_16x8_adst_flipadst_2_8bpc_ssse3: 413.8 inv_txfm_add_16x8_adst_identity_0_8bpc_c: 3897.2 inv_txfm_add_16x8_adst_identity_0_8bpc_ssse3: 283.6 inv_txfm_add_16x8_adst_identity_1_8bpc_c: 3947.2 inv_txfm_add_16x8_adst_identity_1_8bpc_ssse3: 283.1 inv_txfm_add_16x8_adst_identity_2_8bpc_c: 3881.7 inv_txfm_add_16x8_adst_identity_2_8bpc_ssse3: 283.6 inv_txfm_add_16x8_dct_adst_0_8bpc_c: 5200.7 inv_txfm_add_16x8_dct_adst_0_8bpc_ssse3: 355.0 inv_txfm_add_16x8_dct_adst_1_8bpc_c: 5261.0 inv_txfm_add_16x8_dct_adst_1_8bpc_ssse3: 355.1 inv_txfm_add_16x8_dct_adst_2_8bpc_c: 5212.5 inv_txfm_add_16x8_dct_adst_2_8bpc_ssse3: 354.5 inv_txfm_add_16x8_dct_dct_0_8bpc_c: 5252.9 inv_txfm_add_16x8_dct_dct_0_8bpc_ssse3: 43.6 inv_txfm_add_16x8_dct_dct_1_8bpc_c: 5260.0 inv_txfm_add_16x8_dct_dct_1_8bpc_ssse3: 302.1 inv_txfm_add_16x8_dct_dct_2_8bpc_c: 5250.4 inv_txfm_add_16x8_dct_dct_2_8bpc_ssse3: 302.0 inv_txfm_add_16x8_dct_flipadst_0_8bpc_c: 5216.6 inv_txfm_add_16x8_dct_flipadst_0_8bpc_ssse3: 359.3 inv_txfm_add_16x8_dct_flipadst_1_8bpc_c: 5229.9 inv_txfm_add_16x8_dct_flipadst_1_8bpc_ssse3: 357.6 inv_txfm_add_16x8_dct_flipadst_2_8bpc_c: 5261.4 inv_txfm_add_16x8_dct_flipadst_2_8bpc_ssse3: 357.4 inv_txfm_add_16x8_dct_identity_0_8bpc_c: 3999.2 inv_txfm_add_16x8_dct_identity_0_8bpc_ssse3: 63.8 inv_txfm_add_16x8_dct_identity_1_8bpc_c: 4018.1 inv_txfm_add_16x8_dct_identity_1_8bpc_ssse3: 227.1 inv_txfm_add_16x8_dct_identity_2_8bpc_c: 3998.7 inv_txfm_add_16x8_dct_identity_2_8bpc_ssse3: 226.2 inv_txfm_add_16x8_flipadst_adst_0_8bpc_c: 5124.9 inv_txfm_add_16x8_flipadst_adst_0_8bpc_ssse3: 419.7 inv_txfm_add_16x8_flipadst_adst_1_8bpc_c: 5100.7 inv_txfm_add_16x8_flipadst_adst_1_8bpc_ssse3: 420.5 inv_txfm_add_16x8_flipadst_adst_2_8bpc_c: 5087.1 inv_txfm_add_16x8_flipadst_adst_2_8bpc_ssse3: 419.9 inv_txfm_add_16x8_flipadst_dct_0_8bpc_c: 5183.2 inv_txfm_add_16x8_flipadst_dct_0_8bpc_ssse3: 367.1 inv_txfm_add_16x8_flipadst_dct_1_8bpc_c: 5193.7 inv_txfm_add_16x8_flipadst_dct_1_8bpc_ssse3: 368.6 inv_txfm_add_16x8_flipadst_dct_2_8bpc_c: 5186.8 inv_txfm_add_16x8_flipadst_dct_2_8bpc_ssse3: 368.4 inv_txfm_add_16x8_flipadst_flipadst_0_8bpc_c: 5091.3 inv_txfm_add_16x8_flipadst_flipadst_0_8bpc_ssse3: 421.2 inv_txfm_add_16x8_flipadst_flipadst_1_8bpc_c: 5118.5 inv_txfm_add_16x8_flipadst_flipadst_1_8bpc_ssse3: 421.4 inv_txfm_add_16x8_flipadst_flipadst_2_8bpc_c: 5119.0 inv_txfm_add_16x8_flipadst_flipadst_2_8bpc_ssse3: 421.2 inv_txfm_add_16x8_flipadst_identity_0_8bpc_c: 3909.3 inv_txfm_add_16x8_flipadst_identity_0_8bpc_ssse3: 289.9 inv_txfm_add_16x8_flipadst_identity_1_8bpc_c: 3920.7 inv_txfm_add_16x8_flipadst_identity_1_8bpc_ssse3: 290.4 inv_txfm_add_16x8_flipadst_identity_2_8bpc_c: 3936.7 inv_txfm_add_16x8_flipadst_identity_2_8bpc_ssse3: 290.6 inv_txfm_add_16x8_identity_adst_0_8bpc_c: 3869.3 inv_txfm_add_16x8_identity_adst_0_8bpc_ssse3: 280.0 inv_txfm_add_16x8_identity_adst_1_8bpc_c: 3832.2 inv_txfm_add_16x8_identity_adst_1_8bpc_ssse3: 281.4 inv_txfm_add_16x8_identity_adst_2_8bpc_c: 3820.8 inv_txfm_add_16x8_identity_adst_2_8bpc_ssse3: 281.5 inv_txfm_add_16x8_identity_dct_0_8bpc_c: 3878.6 inv_txfm_add_16x8_identity_dct_0_8bpc_ssse3: 76.7 inv_txfm_add_16x8_identity_dct_1_8bpc_c: 3883.3 inv_txfm_add_16x8_identity_dct_1_8bpc_ssse3: 76.3 inv_txfm_add_16x8_identity_dct_2_8bpc_c: 3900.6 inv_txfm_add_16x8_identity_dct_2_8bpc_ssse3: 220.1 inv_txfm_add_16x8_identity_flipadst_0_8bpc_c: 3840.9 inv_txfm_add_16x8_identity_flipadst_0_8bpc_ssse3: 277.1 inv_txfm_add_16x8_identity_flipadst_1_8bpc_c: 3860.6 inv_txfm_add_16x8_identity_flipadst_1_8bpc_ssse3: 277.0 inv_txfm_add_16x8_identity_flipadst_2_8bpc_c: 3849.4 inv_txfm_add_16x8_identity_flipadst_2_8bpc_ssse3: 277.2 inv_txfm_add_16x8_identity_identity_0_8bpc_c: 2610.9 inv_txfm_add_16x8_identity_identity_0_8bpc_ssse3: 159.8 inv_txfm_add_16x8_identity_identity_1_8bpc_c: 2597.1 inv_txfm_add_16x8_identity_identity_1_8bpc_ssse3: 159.8 inv_txfm_add_16x8_identity_identity_2_8bpc_c: 2607.9 inv_txfm_add_16x8_identity_identity_2_8bpc_ssse3: 159.9
--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -83,6 +83,8 @@
decl_itx16_fns( 8, 8, ssse3);
decl_itx16_fns( 4, 16, ssse3);
decl_itx16_fns(16, 4, ssse3);
+decl_itx16_fns( 8, 16, ssse3);
+decl_itx16_fns(16, 8, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@@ -132,6 +134,8 @@
assign_itx16_fn(, 8, 8, ssse3);
assign_itx16_fn(R, 4, 16, ssse3);
assign_itx16_fn(R, 16, 4, ssse3);
+ assign_itx16_fn(R, 8, 16, ssse3);
+ assign_itx16_fn(R, 16, 8, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -1147,7 +1147,7 @@
jmp m(iadst_8x4_internal).end
%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x8, 8
+ INV_TXFM_FN %1, %2, %3, 8x8, 8, 16*4
%ifidn %1_%2, dct_identity
mova m0, [o(pw_2896x8)]
pmulhrsw m0, [coeffq]
@@ -1182,6 +1182,7 @@
pmulhrsw m0, m2
.end:
mov r2d, 2
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)]
.end2:
lea r3, [strideq*3]
.loop:
@@ -1189,6 +1190,8 @@
lea dstq, [dstq+strideq*2]
dec r2d
jg .loop
+ jmp tx2q
+.end3:
RET
%else ; identity
mova m0, [coeffq+16*0]
@@ -1219,6 +1222,7 @@
mova m4, [coeffq+16*4]
mova m5, [coeffq+16*5]
mova m6, [coeffq+16*6]
+ mova m7, [coeffq+16*7]
%endmacro
%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
@@ -1242,97 +1246,106 @@
cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_8X8_LOAD_COEFS
+
+.pass1:
call .main
.pass1_end:
- mova m7, [o(pw_16384)]
- REPX {pmulhrsw x, m7}, m0, m2, m4, m6
- mova [coeffq+16*6], m6
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
.pass1_end2:
- REPX {pmulhrsw x, m7}, m1, m3, m5
- pmulhrsw m7, [coeffq+16*7]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
.pass1_end3:
- punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
- punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
- punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
- punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
- punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
- punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
- punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
- punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
- punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
- punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
- mova [coeffq+16*5], m6
- mova m6, [coeffq+16*6]
- punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
- punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
- punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
- punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
- punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
- punpcklwd m0, m2 ;00 20 40 60 01 21 41 61
+ punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
+ punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
+ punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
+ punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
+ punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
+ punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
+ punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
+ punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
+ punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
+ punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
+ mova [rsp+gprsize+16*2], m6
+ mova m6, [rsp+gprsize+16*1]
+ punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
+ punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
+ punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
+ punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
+ punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
+ punpcklwd m0, m2 ;00 20 40 60 01 21 41 61
- punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
- punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
- mova [coeffq+16*7], m2
- punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
- punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
- punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
- punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
- mova m7, [coeffq+16*5]
- punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
- punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
- jmp tx2q
+ punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
+ punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
+ mova [rsp+gprsize+16*0], m2
+ punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
+ punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
+ punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
+ punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
+ mova m7, [rsp+gprsize+16*2]
+ punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
+ punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
+ mova m7, [rsp+gprsize+16*0]
+ jmp tx2q
.pass2:
+ lea tx2q, [o(m(idct_8x8_internal).end4)]
+
+.pass2_main:
call .main
.end:
- mova m7, [o(pw_2048)]
- REPX {pmulhrsw x, m7}, m0, m2, m4, m6
- mova [coeffq+16*6], m6
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
.end2:
- REPX {pmulhrsw x, m7}, m1, m3, m5
- pmulhrsw m7, [coeffq+16*7]
- mova [coeffq+16*5], m5
- mova [coeffq+16*7], m7
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*0], m7
.end3:
- WRITE_8X4 0, 1, 2, 3, 5, 6, 7
- lea dstq, [dstq+strideq*2]
- WRITE_8X4 4, [coeffq+16*5], [coeffq+16*6], [coeffq+16*7], 5, 6, 7
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
+ jmp tx2q
- pxor m7, m7
- REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.end4:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
ret
ALIGN function_align
.main:
- mova [coeffq+16*6], m3
- mova [coeffq+16*5], m1
- mova m7, [o(pd_2048)]
- IDCT4_1D 0, 2, 4, 6, 1, 3, 7
- mova m3, [coeffq+16*5]
- mova [coeffq+16*5], m2
- mova m2, [coeffq+16*6]
- mova [coeffq+16*6], m4
- mova m4, [coeffq+16*7]
- mova [coeffq+16*7], m6
- IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
- mova m6, [coeffq+16*7]
- psubsw m7, m0, m4 ;out7
- paddsw m0, m4 ;out0
- mova [coeffq+16*7], m7
- mova m1, [coeffq+16*5]
- psubsw m4, m6, m3 ;out4
- paddsw m3, m6 ;out3
- mova m7, [coeffq+16*6]
- psubsw m6, m1, m5 ;out6
- paddsw m1, m5 ;out1
- psubsw m5, m7, m2 ;out5
- paddsw m2, m7 ;out2
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m1
+ mova m7, [o(pd_2048)]
+ IDCT4_1D 0, 2, 4, 6, 1, 3, 7
+ mova m3, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m2
+ mova m2, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*1], m4
+ mova m4, [rsp+gprsize*2+16*0]
+ mova [rsp+gprsize*2+16*0], m6
+ IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
+ mova m6, [rsp+gprsize*2+16*0]
+ psubsw m7, m0, m4 ;out7
+ paddsw m0, m4 ;out0
+ mova [rsp+gprsize*2+16*0], m7
+ mova m1, [rsp+gprsize*2+16*2]
+ psubsw m4, m6, m3 ;out4
+ paddsw m3, m6 ;out3
+ mova m7, [rsp+gprsize*2+16*1]
+ psubsw m6, m1, m5 ;out6
+ paddsw m1, m5 ;out1
+ psubsw m5, m7, m2 ;out5
+ paddsw m2, m7 ;out2
ret
@@ -1343,75 +1356,85 @@
cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_8X8_LOAD_COEFS
+
+.pass1:
call .main
- mova m7, [o(pw_16384)]
- REPX {pmulhrsw x, m7}, m0, m2, m4, m6
- mova [coeffq+16*6], m6
- pxor m6, m6
- psubw m6, m7
- mova m7, m6
+
+.pass1_end:
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
jmp m(idct_8x8_internal).pass1_end2
ALIGN function_align
.pass2:
+ lea tx2q, [o(m(idct_8x8_internal).end4)]
+
+.pass2_main:
call .main
- mova m7, [o(pw_2048)]
- REPX {pmulhrsw x, m7}, m0, m2, m4, m6
- mova [coeffq+16*6], m6
- pxor m6, m6
- psubw m6, m7
- mova m7, m6
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6
jmp m(idct_8x8_internal).end2
ALIGN function_align
.main:
- mova [coeffq+16*6], m3
- mova [coeffq+16*5], m4
- mova m7, [o(pd_2048)]
- ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
- ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
- paddsw m3, m2, m6 ;t2
- psubsw m2, m6 ;t6
- paddsw m4, m5, m1 ;t3
- psubsw m5, m1 ;t7
- ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m4
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
+ ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
+ paddsw m3, m2, m6 ;t2
+ psubsw m2, m6 ;t6
+ paddsw m4, m5, m1 ;t3
+ psubsw m5, m1 ;t7
+ ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a
- mova m6, [coeffq+16*5]
- mova [coeffq+16*5], m5
- mova m1, [coeffq+16*6]
- mova [coeffq+16*6], m2
- mova m5, [coeffq+16*7]
- mova [coeffq+16*7], m3
- ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
- ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
- psubsw m2, m0, m6 ;t4
- paddsw m0, m6 ;t0
- paddsw m3, m5, m1 ;t1
- psubsw m5, m1 ;t5
- ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a
-
- mova m7, [coeffq+16*7]
- paddsw m1, m3, m4 ;-out7
- psubsw m3, m4 ;t3
- mova [coeffq+16*7], m1
- psubsw m4, m0, m7 ;t2
- paddsw m0, m7 ;out0
- mova m6, [coeffq+16*5]
- mova m7, [coeffq+16*6]
- paddsw m1, m5, m6 ;-out1
- psubsw m5, m6 ;t6
- paddsw m6, m2, m7 ;out6
- psubsw m2, m7 ;t7
- paddw m7, m4, m3 ;t2 + t3
- psubw m4, m3 ;t2 - t3
- paddw m3, m5, m2 ;t6 + t7
- psubw m5, m2 ;t6 - t7
- mova m2, [o(pw_2896x8)]
- pmulhrsw m4, m2 ;out4
- pmulhrsw m5, m2 ;-out5
- pmulhrsw m7, m2 ;-out3
- pmulhrsw m2, m3 ;out2
- mova m3, m7
+ mova m6, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m5
+ mova m1, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*1], m2
+ mova m5, [rsp+gprsize*2+16*0]
+ mova [rsp+gprsize*2+16*0], m3
+ ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
+ ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
+ psubsw m2, m0, m6 ;t4
+ paddsw m0, m6 ;t0
+ paddsw m3, m5, m1 ;t1
+ psubsw m5, m1 ;t5
+ ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a
+
+ mova m7, [rsp+gprsize*2+16*0]
+ paddsw m1, m3, m4 ;-out7
+ psubsw m3, m4 ;t3
+ mova [rsp+gprsize*2+16*0], m1
+ psubsw m4, m0, m7 ;t2
+ paddsw m0, m7 ;out0
+ mova m6, [rsp+gprsize*2+16*2]
+ mova m7, [rsp+gprsize*2+16*1]
+ paddsw m1, m5, m6 ;-out1
+ psubsw m5, m6 ;t6
+ paddsw m6, m2, m7 ;out6
+ psubsw m2, m7 ;t7
+ paddw m7, m4, m3 ;t2 + t3
+ psubw m4, m3 ;t2 - t3
+ paddw m3, m5, m2 ;t6 + t7
+ psubw m5, m2 ;t6 - t7
+ mova m2, [o(pw_2896x8)]
+ pmulhrsw m4, m2 ;out4
+ pmulhrsw m5, m2 ;-out5
+ pmulhrsw m7, m2 ;-out3
+ pmulhrsw m2, m3 ;out2
+ mova m3, m7
ret
INV_TXFM_8X8_FN flipadst, dct
@@ -1421,45 +1444,54 @@
cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_8X8_LOAD_COEFS
+
+.pass1:
call m(iadst_8x8_internal).main
- mova m7, [o(pw_m16384)]
- pmulhrsw m1, m7
- mova [coeffq+16*6], m1
- mova m1, m6
- mova m6, m2
- pmulhrsw m2, m5, m7
- mova m5, m6
- mova m6, m4
- pmulhrsw m4, m3, m7
- mova m3, m6
- mova m6, m0
- mova m0, m7
- pxor m7, m7
- psubw m7, m0
- pmulhrsw m0, [coeffq+16*7]
- REPX {pmulhrsw x, m7}, m1, m3, m5
- pmulhrsw m7, m6
+
+.pass1_end:
+ mova m7, [o(pw_m16384)]
+ pmulhrsw m1, m7
+ mova [rsp+gprsize+16*1], m1
+ mova m1, m6
+ mova m6, m2
+ pmulhrsw m2, m5, m7
+ mova m5, m6
+ mova m6, m4
+ pmulhrsw m4, m3, m7
+ mova m3, m6
+ mova m6, m0
+ mova m0, m7
+ pxor m7, m7
+ psubw m7, m0
+ pmulhrsw m0, [rsp+gprsize+16*0]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, m6
jmp m(idct_8x8_internal).pass1_end3
ALIGN function_align
.pass2:
+ lea tx2q, [o(m(idct_8x8_internal).end4)]
+
+.pass2_main:
call m(iadst_8x8_internal).main
- mova m7, [o(pw_2048)]
- REPX {pmulhrsw x, m7}, m0, m2, m4, m6
- mova [coeffq+16*5], m2
- mova m2, m0
- pxor m0, m0
- psubw m0, m7
- mova m7, m2
- pmulhrsw m1, m0
- pmulhrsw m2, m5, m0
- mova [coeffq+16*6], m1
- mova m5, m4
- mova m1, m6
- pmulhrsw m4, m3, m0
- pmulhrsw m0, [coeffq+16*7]
- mova m3, m5
- mova [coeffq+16*7], m7
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*2], m2
+ mova m2, m0
+ pxor m0, m0
+ psubw m0, m7
+ mova m7, m2
+ pmulhrsw m1, m0
+ pmulhrsw m2, m5, m0
+ mova [rsp+gprsize+16*1], m1
+ mova m5, m4
+ mova m1, m6
+ pmulhrsw m4, m3, m0
+ pmulhrsw m0, [rsp+gprsize+16*0]
+ mova m3, m5
+ mova [rsp+gprsize+16*0], m7
jmp m(idct_8x8_internal).end3
INV_TXFM_8X8_FN identity, dct, 7
@@ -1468,23 +1500,21 @@
INV_TXFM_8X8_FN identity, identity
cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
- mova m0, [coeffq+16*0]
- mova m1, [coeffq+16*1]
- mova m2, [coeffq+16*2]
- mova m3, [coeffq+16*3]
- mova m4, [coeffq+16*4]
- mova m5, [coeffq+16*5]
- mova m7, [coeffq+16*7]
- jmp m(idct_8x8_internal).pass1_end3
+ ITX_8X8_LOAD_COEFS
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal).pass1_end3
ALIGN function_align
.pass2:
- mova m7, [o(pw_4096)]
- REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
- pmulhrsw m7, [coeffq+16*7]
- mova [coeffq+16*5], m5
- mova [coeffq+16*6], m6
- mova [coeffq+16*7], m7
+ lea tx2q, [o(m(idct_8x8_internal).end4)]
+
+.end:
+ pmulhrsw m7, [o(pw_4096)]
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_4096)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*1], m6
jmp m(idct_8x8_internal).end3
@@ -1829,6 +1859,7 @@
movd m2, [o(pw_16384)]
mov [coeffq], eobd
mov r2d, 2
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)]
.dconly:
pmulhrsw m0, m2
movd m2, [o(pw_2048)] ;intentionally rip-relative
@@ -1855,6 +1886,8 @@
lea dstq, [dstq+strideq*2]
dec r2d
jg .dconly_loop
+ jmp tx2q
+.end:
RET
%else ; adst / flipadst
movd m2, [o(pw_16384)]
@@ -1889,7 +1922,13 @@
%endmacro
%macro ITX_16X4_LOAD_COEFS 0
- ITX_8X8_LOAD_COEFS
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mova m4, [coeffq+16*4]
+ mova m5, [coeffq+16*5]
+ mova m6, [coeffq+16*6]
%endmacro
%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
@@ -2229,3 +2268,916 @@
.pass2:
lea tx2q, [o(m(iidentity_8x4_internal).pass2)]
jmp m(idct_16x4_internal).pass2_end
+
+
+%macro ITX_8X16_LOAD_EVEN_COEFS 0
+ mova m0, [coeffq+32*0]
+ mova m1, [coeffq+32*1]
+ mova m2, [coeffq+32*2]
+ mova m3, [coeffq+32*3]
+ mova m4, [coeffq+32*4]
+ mova m5, [coeffq+32*5]
+ mova m6, [coeffq+32*6]
+ mova m7, [coeffq+32*7]
+%endmacro
+
+%macro ITX_8X16_RECT2_LOAD_EVEN_COEFS 0
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+32*0]
+ pmulhrsw m1, m7, [coeffq+32*1]
+ pmulhrsw m2, m7, [coeffq+32*2]
+ pmulhrsw m3, m7, [coeffq+32*3]
+ pmulhrsw m4, m7, [coeffq+32*4]
+ pmulhrsw m5, m7, [coeffq+32*5]
+ pmulhrsw m6, m7, [coeffq+32*6]
+ pmulhrsw m7, [coeffq+32*7]
+%endmacro
+
+%macro ITX_8X16_LOAD_ODD_COEFS 0
+ mova m0, [coeffq+16*1 ]
+ mova m1, [coeffq+16*3 ]
+ mova m2, [coeffq+16*5 ]
+ mova m3, [coeffq+16*7 ]
+ mova m4, [coeffq+16*9 ]
+ mova m5, [coeffq+16*11]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+%endmacro
+
+%macro ITX_8X16_RECT2_LOAD_ODD_COEFS 0
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*1 ]
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*5 ]
+ pmulhrsw m3, m7, [coeffq+16*7 ]
+ pmulhrsw m4, m7, [coeffq+16*9 ]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*13]
+ pmulhrsw m7, [coeffq+16*15]
+%endmacro
+
+%macro ITX_8X16_SAVE_EVEN_COEFS 0
+ mova [coeffq+32*0], m0
+ mova [coeffq+32*1], m1
+ mova [coeffq+32*2], m2
+ mova [coeffq+32*3], m3
+ mova [coeffq+32*4], m4
+ mova [coeffq+32*5], m5
+ mova [coeffq+32*6], m6
+ mova [coeffq+32*7], m7
+%endmacro
+
+%macro ITX_8X16_SAVE_ODD_COEFS 0
+ mova [coeffq+16*1 ], m0
+ mova [coeffq+16*3 ], m1
+ mova [coeffq+16*5 ], m2
+ mova [coeffq+16*7 ], m3
+ mova [coeffq+16*9 ], m4
+ mova [coeffq+16*11], m5
+ mova [coeffq+16*13], m6
+ mova [coeffq+16*15], m7
+%endmacro
+
+%macro ITX_8X16_LOAD_STACK_COEFS 0
+ mova m0, [rsp+gprsize+16*3]
+ mova m1, [rsp+gprsize+16*4]
+ mova m2, [rsp+gprsize+16*5]
+ mova m3, [rsp+gprsize+16*6]
+ mova m4, [rsp+gprsize+16*7]
+ mova m5, [rsp+gprsize+16*8]
+ mova m6, [rsp+gprsize+16*9]
+ mova m7, [rsp+gprsize+32*5]
+%endmacro
+
+%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
+ INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000
+ punpcklwd m0, m0
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ psrlw m2, 3 ; pw_2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ mov r2d, 4
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)]
+ jmp m(inv_txfm_add_dct_dct_8x8).end2
+.end:
+ RET
+%elifidn %1_%2, dct_identity
+ mov r3d, 2
+.loop:
+ mova m0, [o(pw_2896x8)]
+ pmulhrsw m7, m0, [coeffq]
+ mova m1, [o(pw_16384)]
+ pxor m2, m2
+ mova [coeffq], m2
+ pmulhrsw m7, m0
+ pmulhrsw m7, m1
+ psrlw m1, 3 ; pw_2048
+ psllw m7, 2
+ pmulhrsw m7, [o(pw_5793x4)]
+ pmulhrsw m7, m1
+ punpcklwd m0, m7, m7
+ punpckhwd m7, m7
+ pshufd m3, m0, q3333
+ pshufd m2, m0, q2222
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ call m(iadst_8x4_internal).end3
+ pshufd m3, m7, q3333
+ pshufd m2, m7, q2222
+ pshufd m1, m7, q1111
+ pshufd m0, m7, q0000
+ lea dstq, [dstq+strideq*2]
+ call m(iadst_8x4_internal).end3
+
+ add coeffq, 16
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .loop
+ RET
+%elifidn %1_%2, identity_dct
+ movd m0, [coeffq+32*0]
+ punpcklwd m0, [coeffq+32*1]
+ movd m2, [coeffq+32*2]
+ punpcklwd m2, [coeffq+32*3]
+ add coeffq, 32*4
+ movd m1, [coeffq+32*0]
+ punpcklwd m1, [coeffq+32*1]
+ movd m3, [coeffq+32*2]
+ punpcklwd m3, [coeffq+32*3]
+ mova m4, [o(pw_2896x8)]
+ xor eobd, eobd
+ mov [coeffq-32*4], eobd
+ mov [coeffq-32*3], eobd
+ mov [coeffq-32*2], eobd
+ mov [coeffq-32*1], eobd
+ punpckldq m0, m2
+ punpckldq m1, m3
+ punpcklqdq m0, m1
+ pmulhrsw m0, m4
+ pmulhrsw m0, m4
+ pmulhrsw m0, [o(pw_2048)]
+ mov [coeffq+32*0], eobd
+ mov [coeffq+32*1], eobd
+ mov [coeffq+32*2], eobd
+ mov [coeffq+32*3], eobd
+ mov r2d, 4
+ lea tx2q, [o(m(inv_txfm_add_identity_dct_8x16).end)]
+ jmp m(inv_txfm_add_dct_dct_8x8).end2
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct, 0
+INV_TXFM_8X16_FN dct, identity, 15
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(idct_8x8_internal).pass1)]
+
+.pass1:
+ ITX_8X16_RECT2_LOAD_ODD_COEFS
+ mov [rsp+gprsize+16*11], tx2q
+ lea tx2q, [o(m(idct_8x16_internal).pass1_end)]
+ jmp r3
+
+.pass1_end:
+ ITX_8X16_SAVE_ODD_COEFS
+ ITX_8X16_RECT2_LOAD_EVEN_COEFS
+ mov tx2q, [rsp+gprsize+16*11]
+ jmp r3
+
+.pass2:
+ mova [coeffq+16*0], m1
+ mova [coeffq+16*2], m3
+ mova [coeffq+16*4], m5
+ mova [coeffq+16*6], m7
+ mova m1, m2
+ mova m2, m4
+ mova m3, m6
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*5 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*13]
+ call m(idct_8x8_internal).main
+
+ mova [rsp+gprsize+16*3], m0
+ mova [rsp+gprsize+16*4], m1
+ mova [rsp+gprsize+16*5], m2
+ mova [rsp+gprsize+16*6], m3
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*9], m6
+
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*2 ]
+ mova m2, [coeffq+16*4 ]
+ mova m3, [coeffq+16*6 ]
+ mova m4, [coeffq+16*3 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*15]
+ call m(idct_16x8_internal).main
+
+ lea tx2q, [o(m(idct_8x16_internal).end)]
+ mov r3, dstq
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal).end
+
+.end:
+ ITX_8X16_LOAD_STACK_COEFS
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iadst_8x8_internal).pass1)]
+ jmp m(idct_8x16_internal).pass1
+
+.pass2:
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal).main
+
+ lea tx2q, [o(m(iadst_8x16_internal).end)]
+ mov r3, dstq
+ lea dstq, [dstq+strideq*8]
+ jmp m(iadst_8x8_internal).end
+
+.end:
+ ITX_8X16_LOAD_STACK_COEFS
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal).end1)]
+ mov dstq, r3
+ jmp m(iadst_8x8_internal).end
+
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ lea r3, [o(m(iflipadst_8x8_internal).pass1)]
+ jmp m(idct_8x16_internal).pass1
+
+.pass2:
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal).main
+
+ lea tx2q, [o(m(iflipadst_8x16_internal).end)]
+ lea r3, [dstq+strideq*8]
+ jmp m(iflipadst_8x8_internal).end
+
+.end:
+ ITX_8X16_LOAD_STACK_COEFS
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal).end1)]
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal).end
+
+
+INV_TXFM_8X16_FN identity, dct, 7
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_8X16_RECT2_LOAD_ODD_COEFS
+ mov r3, tx2q
+ lea tx2q, [o(m(iidentity_8x16_internal).pass1_end)]
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal).pass1_end3
+
+.pass1_end:
+ ITX_8X16_SAVE_ODD_COEFS
+ ITX_8X16_RECT2_LOAD_EVEN_COEFS
+ mov tx2q, r3
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal).pass1_end3
+
+.pass2:
+ lea tx2q, [o(m(iidentity_8x16_internal).end1)]
+
+.end:
+ REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ pmulhrsw m7, [o(pw_5793x4)]
+ pmulhrsw m7, [o(pw_2048)]
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_5793x4)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova [rsp+gprsize+16*1], m6
+ mova [rsp+gprsize+16*2], m5
+ jmp m(idct_8x8_internal).end3
+
+.end1:
+ ITX_8X16_LOAD_ODD_COEFS
+ lea tx2q, [o(m(idct_8x16_internal).end1)]
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+
+%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
+ INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*12
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd
+ pmulhrsw m0, m1
+ mov r2d, 4
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8).end)]
+ jmp m(inv_txfm_add_dct_dct_16x4).dconly
+.end:
+ RET
+%elifidn %1_%2, dct_identity
+ mova m7, [coeffq]
+ mova m0, [o(pw_2896x8)]
+ mova m1, [o(pw_16384)]
+ pxor m2, m2
+ mova [coeffq], m2
+ pmulhrsw m7, m0
+ pmulhrsw m7, m0
+ pmulhrsw m7, m1
+ psrlw m1, 2 ; pw_4096
+ pmulhrsw m7, m1
+ punpcklwd m3, m7, m7
+ punpckhwd m7, m7
+ pshufd m0, m3, q0000
+ pshufd m1, m3, q1111
+ pshufd m2, m3, q2222
+ pshufd m3, m3, q3333
+ lea r3, [dstq+strideq*4]
+ lea tx2q, [dstq+8]
+ call m(iadst_8x4_internal).end2
+ add coeffq, 16*4
+ mov dstq, tx2q
+ call m(iadst_8x4_internal).end2
+ mov dstq, r3
+ add coeffq, 16*4
+ pshufd m0, m7, q0000
+ pshufd m1, m7, q1111
+ pshufd m2, m7, q2222
+ pshufd m3, m7, q3333
+ lea tx2q, [dstq+8]
+ call m(iadst_8x4_internal).end2
+ add coeffq, 16*4
+ mov dstq, tx2q
+ TAIL_CALL m(iadst_8x4_internal).end2
+%elifidn %1_%2, identity_dct
+ mova m5, [o(pw_16384)]
+ mova m6, [o(pw_5793x4)]
+ mova m7, [o(pw_2896x8)]
+ pxor m4, m4
+ mov r3d, 2
+.main_loop:
+ mova m0, [coeffq+16*0]
+ punpcklwd m0, [coeffq+16*1]
+ mova m1, [coeffq+16*2]
+ punpcklwd m1, [coeffq+16*3]
+ mova m2, [coeffq+16*4]
+ punpcklwd m2, [coeffq+16*5]
+ mova m3, [coeffq+16*6]
+ punpcklwd m3, [coeffq+16*7]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ punpcklqdq m0, m2
+ pmulhrsw m0, m7
+ psllw m0, 2
+ pmulhrsw m0, m6
+ pmulhrsw m0, m5
+ psrlw m1, m5, 3 ; pw_2048
+ pmulhrsw m0, m7
+ pmulhrsw m0, m1
+.end:
+ REPX {mova [coeffq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+ add coeffq, 16*8
+ lea tx2q, [dstq+8]
+ WRITE_8X4 0, 0, 0, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 0, 0, 0, 0, 1, 2, 3
+ mov dstq, tx2q
+ dec r3d
+ jg .main_loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct, 0
+INV_TXFM_16X8_FN dct, identity, 7
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ ITX_8X16_RECT2_LOAD_EVEN_COEFS
+ call m(idct_8x8_internal).main
+ mova [rsp+gprsize+16*3], m0
+ mova [rsp+gprsize+16*4], m1
+ mova [rsp+gprsize+16*5], m2
+ mova [rsp+gprsize+16*6], m3
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*9], m6
+
+ ITX_8X16_RECT2_LOAD_ODD_COEFS
+ call .main
+ mov r3, tx2q
+ lea tx2q, [o(m(idct_16x8_internal).pass1_end)]
+ jmp m(idct_8x8_internal).pass1_end
+
+.pass1_end:
+ ITX_8X16_SAVE_ODD_COEFS
+ ITX_8X16_LOAD_STACK_COEFS
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(idct_8x8_internal).pass1_end
+
+.pass2:
+ lea tx2q, [o(m(idct_16x8_internal).end)]
+ lea r3, [dstq+8]
+ jmp m(idct_8x8_internal).pass2_main
+
+.end:
+ ITX_8X16_LOAD_ODD_COEFS
+ lea tx2q, [o(m(idct_8x16_internal).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal).pass2_main
+
+
+ALIGN function_align
+.main:
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+16*2], m6
+ mova [rsp+gprsize*2+32*5], m5
+
+ mova m6, [o(pd_2048)]
+ ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a
+ ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a
+ psubsw m2, m0, m4 ;t9
+ paddsw m0, m4 ;t8
+ psubsw m4, m7, m3 ;t14
+ paddsw m7, m3 ;t15
+ ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a
+ mova m3, [rsp+gprsize*2+16*1]
+ mova m5, [rsp+gprsize*2+32*5]
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+32*5], m4
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m7
+ ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a
+ ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a
+ pxor m4, m4
+ psubsw m7, m2, m3 ;t10
+ paddsw m2, m3 ;t11
+ psubsw m3, m1, m5 ;t13
+ paddsw m1, m5 ;t12
+ psubw m4, m7
+ ITX_MULSUB_2W 4, 3, 7, 5, 6, 1567, 3784 ;t10a, t13a
+ mova m7, [rsp+gprsize*2+32*5]
+ psubsw m6, m0, m2 ;t11a
+ paddsw m0, m2 ;t8a
+ paddsw m2, m7, m4 ;t9
+ psubsw m7, m4 ;t10
+ mova m5, [rsp+gprsize*2+16*0]
+ psubsw m4, m5, m0 ;out8
+ paddsw m0, m5 ;out7
+ mova [rsp+gprsize*2+32*5], m0
+ mova m5, [rsp+gprsize*2+16*9]
+ psubsw m0, m5, m2 ;out9
+ paddsw m2, m5 ;out6
+ mova [rsp+gprsize*2+16*0], m0
+ mova [rsp+gprsize*2+16*9], m2
+ mova m0, [rsp+gprsize*2+16*1]
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*1], m4
+ psubsw m4, m0, m3 ;t13
+ paddsw m0, m3 ;t14
+ psubsw m3, m2, m1 ;t12a
+ paddsw m1, m2 ;t15a
+ mova m5, [o(pw_2896x8)]
+ psubw m2, m4, m7 ;t13-t10
+ paddw m7, m4 ;t13+t10
+ psubw m4, m3, m6 ;t12a-t11a
+ paddw m6, m3 ;t12a+t11a
+ pmulhrsw m7, m5 ;t13a
+ pmulhrsw m4, m5 ;t11
+ pmulhrsw m6, m5 ;t12
+ pmulhrsw m5, m2 ;t10a
+ mova m3, [rsp+gprsize*2+16*8]
+ psubsw m2, m3, m5 ;out10
+ paddsw m3, m5 ;out5
+ mova m5, [rsp+gprsize*2+16*7]
+ mova [rsp+gprsize*2+16*8], m3
+ psubsw m3, m5, m4 ;out11
+ paddsw m5, m4 ;out4
+ mova m4, [rsp+gprsize*2+16*6]
+ mova [rsp+gprsize*2+16*7], m5
+ paddsw m5, m4, m6 ;out3
+ psubsw m4, m6 ;out12
+ mova m6, [rsp+gprsize*2+16*5]
+ mova [rsp+gprsize*2+16*6], m5
+ psubsw m5, m6, m7 ;out13
+ paddsw m6, m7 ;out2
+ mova m7, [rsp+gprsize*2+16*4]
+ mova [rsp+gprsize*2+16*5], m6
+ psubsw m6, m7, m0 ;out14
+ paddsw m7, m0 ;out1
+ mova m0, [rsp+gprsize*2+16*3]
+ mova [rsp+gprsize*2+16*4], m7
+ psubsw m7, m0, m1 ;out15
+ paddsw m0, m1 ;out0
+ mova [rsp+gprsize*2+16*3], m0
+ mova m1, [rsp+gprsize*2+16*0]
+ mova m0, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*0], m7
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ]
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13]
+
+ call .main
+ mov r3, tx2q
+ lea tx2q, [o(m(iadst_16x8_internal).pass1_end)]
+ jmp m(iadst_8x8_internal).pass1_end
+
+.pass1_end:
+ ITX_8X16_SAVE_ODD_COEFS
+ ITX_8X16_LOAD_STACK_COEFS
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(iadst_8x8_internal).pass1_end
+
+.pass2:
+ lea tx2q, [o(m(iadst_16x8_internal).end)]
+ lea r3, [dstq+8]
+ jmp m(iadst_8x8_internal).pass2_main
+
+.end:
+ ITX_8X16_LOAD_ODD_COEFS
+ lea tx2q, [o(m(idct_8x16_internal).end1)]
+ mov dstq, r3
+ jmp m(iadst_8x8_internal).pass2_main
+
+ALIGN function_align
+.main:
+ mova [rsp+gprsize*2+16*0], m1
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+16*2], m6
+
+ mova m6, [o(pd_2048)]
+ ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2
+ ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10
+ psubsw m1, m0, m4 ;t10a
+ paddsw m0, m4 ;t2a
+ psubsw m4, m7, m3 ;t11a
+ paddsw m3, m7 ;t3a
+ ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10
+ mova m2, [rsp+gprsize*2+16*0] ;in3
+ mova m7, [rsp+gprsize*2+16*1] ;in4
+ mova [rsp+gprsize*2+16*0], m1 ;t11
+ mova [rsp+gprsize*2+16*1], m4 ;t10
+ mova m1, [rsp+gprsize*2+16*2] ;in12
+ mova [rsp+gprsize*2+16*2], m0 ;t2a
+ ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4
+ ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12
+ psubsw m0, m7, m1 ;t12a
+ paddsw m1, m7 ;t4a
+ psubsw m4, m5, m2 ;t13a
+ paddsw m5, m2 ;t5a
+ ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13
+ mova m2, [rsp+gprsize*2+16*8] ;in1
+ mova m7, [rsp+gprsize*2+16*9] ;in14
+ mova [rsp+gprsize*2+16*8], m4 ;t12
+ mova [rsp+gprsize*2+16*9], m0 ;t13
+ mova m4, [rsp+gprsize*2+16*4] ;in9
+ mova m0, [rsp+gprsize*2+16*5] ;in6
+ mova [rsp+gprsize*2+16*4], m1 ;t4a
+ mova [rsp+gprsize*2+16*5], m5 ;t5a
+ ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14
+ ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6
+ psubsw m1, m0, m7 ;t14a
+ paddsw m0, m7 ;t6a
+ psubsw m5, m4, m2 ;t15a
+ paddsw m4, m2 ;t7a
+ ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*2] ;t2a
+ mova [rsp+gprsize*2+16*2], m5 ;t14
+ psubsw m7, m2, m0 ;t6
+ paddsw m2, m0 ;t2
+ psubsw m0, m3, m4 ;t7
+ paddsw m3, m4 ;t3
+ ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a
+ mova m4, [rsp+gprsize*2+16*7] ;in0
+ mova m5, [rsp+gprsize*2+32*5] ;in15
+ mova [rsp+gprsize*2+16*7], m3 ;t3
+ mova [rsp+gprsize*2+32*5], m1 ;t15
+ mova m1, [rsp+gprsize*2+16*6] ;in7
+ mova m3, [rsp+gprsize*2+16*3] ;in8
+ mova [rsp+gprsize*2+16*6], m7 ;t7a
+ mova [rsp+gprsize*2+16*3], m0 ;t6a
+ ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0
+ ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8
+ psubsw m0, m4, m3 ;t8a
+ paddsw m4, m3 ;t0a
+ psubsw m3, m5, m1 ;t9a
+ paddsw m5, m1 ;t1a
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8
+ mova m1, [rsp+gprsize*2+16*4] ;t4a
+ mova m7, [rsp+gprsize*2+16*5] ;t5a
+ mova [rsp+gprsize*2+16*4], m3 ;t8
+ mova [rsp+gprsize*2+16*5], m0 ;t9
+ psubsw m0, m4, m1 ;t4
+ paddsw m4, m1 ;t0
+ psubsw m3, m5, m7 ;t5
+ paddsw m5, m7 ;t1
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a
+ mova m7, [rsp+gprsize*2+16*3] ;t6a
+ psubsw m1, m4, m2 ;t2a
+ paddsw m4, m2 ;out0
+ mova [rsp+gprsize*2+16*3], m4 ;out0
+ mova m4, [rsp+gprsize*2+16*6] ;t7a
+ psubsw m2, m3, m7 ;t6
+ paddsw m3, m7 ;-out3
+ mova [rsp+gprsize*2+16*6], m3 ;-out3
+ psubsw m3, m0, m4 ;t7
+ paddsw m0, m4 ;out12
+ mova m7, [o(pw_2896x8)]
+ psubw m4, m2, m3
+ paddw m2, m3
+ mova m3, [rsp+gprsize*2+16*7] ;t3
+ pmulhrsw m4, m7 ;-out11
+ pmulhrsw m2, m7 ;out4
+ mova [rsp+gprsize*2+16*7], m2 ;out4
+ psubsw m2, m5, m3 ;t3a
+ paddsw m5, m3 ;-out15
+ psubw m3, m1, m2
+ paddw m1, m2
+ mova m2, [rsp+gprsize*2+32*5] ;t15
+ pmulhrsw m3, m7 ;out8
+ pmulhrsw m1, m7 ;-out7
+ mova [rsp+gprsize*2+32*5 ], m1 ;-out7
+ mova m1, [rsp+gprsize*2+16*0] ;t11
+ mova [rsp+gprsize*2+16*11], m3 ;out8
+ mova [rsp+gprsize*2+16*0 ], m5 ;-out15
+ mova m3, [rsp+gprsize*2+16*1] ;t10
+ mova [rsp+gprsize*2+16*1 ], m4 ;-out11
+ mova m4, [rsp+gprsize*2+16*2] ;t14
+ mova [rsp+gprsize*2+16*2 ], m0 ;out12
+ psubsw m0, m3, m4 ;t14a
+ paddsw m3, m4 ;t10a
+ psubsw m5, m1, m2 ;t15a
+ paddsw m1, m2 ;t11a
+ ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*4] ;t8
+ mova m4, [rsp+gprsize*2+16*5] ;t9
+ mova [rsp+gprsize*2+16*4], m3 ;t10a
+ mova [rsp+gprsize*2+16*5], m1 ;t11a
+ mova m3, [rsp+gprsize*2+16*8] ;t12
+ mova m1, [rsp+gprsize*2+16*9] ;t13
+ mova [rsp+gprsize*2+16*8], m5 ;t14
+ mova [rsp+gprsize*2+16*9], m0 ;t15
+ psubsw m5, m2, m3 ;t12a
+ paddsw m2, m3 ;t8a
+ psubsw m0, m4, m1 ;t13a
+ paddsw m4, m1 ;t9a
+ ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12
+ mova m6, [rsp+gprsize*2+16*4] ;t10a
+ mova m1, [rsp+gprsize*2+16*5] ;t11a
+ psubsw m3, m2, m6 ;t10
+ paddsw m2, m6 ;-out1
+ paddsw m6, m4, m1 ;out14
+ psubsw m4, m1 ;t11
+ psubw m1, m3, m4
+ paddw m3, m4
+ pmulhrsw m1, m7 ;-out9
+ pmulhrsw m3, m7 ;out6
+ mova [rsp+gprsize*2+16*4], m2 ;-out1
+ mova m4, [rsp+gprsize*2+16*8] ;t14
+ mova m2, [rsp+gprsize*2+16*9] ;t15
+ mova [rsp+gprsize*2+16*9], m3 ;out6
+ psubsw m3, m0, m4 ;t14a
+ paddsw m0, m4 ;out2
+ psubsw m4, m5, m2 ;t15a
+ paddsw m5, m2 ;-out13
+ psubw m2, m3, m4
+ paddw m3, m4
+ mova [rsp+gprsize*2+16*5], m0 ;out2
+ pmulhrsw m3, m7 ;-out5
+ pmulhrsw m2, m7 ;out10
+ mova [rsp+gprsize*2+16*8], m3 ;-out5
+ mova m0, [rsp+gprsize*2+16*11] ;out8
+ mova m3, [rsp+gprsize*2+16*1 ] ;-out11
+ mova m4, [rsp+gprsize*2+16*2 ] ;out12
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ]
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13]
+
+ call m(iadst_16x8_internal).main
+
+ mova m7, [rsp+gprsize+16*0]
+ ITX_8X16_SAVE_EVEN_COEFS
+ ITX_8X16_LOAD_STACK_COEFS
+ mova [rsp+gprsize+16*0], m7
+ mov r3, tx2q
+ lea tx2q, [o(m(iflipadst_16x8_internal).pass1_end)]
+ jmp m(iflipadst_8x8_internal).pass1_end
+
+.pass1_end:
+ ITX_8X16_SAVE_ODD_COEFS
+ ITX_8X16_LOAD_EVEN_COEFS
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(iflipadst_8x8_internal).pass1_end
+
+.pass2:
+ lea tx2q, [o(m(iflipadst_16x8_internal).end)]
+ lea r3, [dstq+8]
+ jmp m(iflipadst_8x8_internal).pass2_main
+
+.end:
+ ITX_8X16_LOAD_ODD_COEFS
+ lea tx2q, [o(m(idct_8x16_internal).end1)]
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal).pass2_main
+
+
+INV_TXFM_16X8_FN identity, dct, 15
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [coeffq+16*8 ]
+ pmulhrsw m1, m7, [coeffq+16*9 ]
+ pmulhrsw m2, m7, [coeffq+16*10]
+ pmulhrsw m3, m7, [coeffq+16*11]
+ pmulhrsw m4, m7, [coeffq+16*12]
+ pmulhrsw m5, m7, [coeffq+16*13]
+ pmulhrsw m6, m7, [coeffq+16*14]
+ pmulhrsw m7, [coeffq+16*15]
+
+ mov r3, tx2q
+ lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)]
+
+.pass1:
+ REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ pmulhrsw m7, [o(pw_5793x4)]
+ mova [rsp+gprsize+16*0], m7
+
+ mova m7, [o(pw_5793x4)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+
+ jmp m(idct_8x8_internal).pass1_end
+
+.pass1_end:
+ mova [coeffq+16*9 ], m4
+ mova [coeffq+16*11], m5
+ mova [coeffq+16*13], m6
+ mova [coeffq+16*15], m7
+ mova m4, [o(pw_2896x8)]
+ pmulhrsw m5, m4, [coeffq+16*5]
+ pmulhrsw m6, m4, [coeffq+16*6]
+ pmulhrsw m7, m4, [coeffq+16*7]
+ mova [coeffq+16*5 ], m2
+ mova [coeffq+16*7 ], m3
+ pmulhrsw m2, m4, [coeffq+16*2]
+ pmulhrsw m3, m4, [coeffq+16*3]
+ mova [coeffq+16*3 ], m1
+ pmulhrsw m1, m4, [coeffq+16*1]
+ mova [coeffq+16*1 ], m0
+ pmulhrsw m0, m4, [coeffq+16*0]
+ pmulhrsw m4, [coeffq+16*4]
+
+ mov tx2q, r3
+ jmp .pass1
+
+.pass2:
+ lea tx2q, [o(m(iidentity_16x8_internal).end)]
+ lea r3, [dstq+8]
+ jmp m(iidentity_8x8_internal).end
+
+.end:
+ ITX_8X16_LOAD_ODD_COEFS
+ lea tx2q, [o(m(idct_8x16_internal).end1)]
+ mov dstq, r3
+ jmp m(iidentity_8x8_internal).end