ref: e0b88bd2b2c97a2695edcc498485e1cb3003e7f1
parent: 98ed9be69b08f5438cce7e696b2c8eadfb3ce905
author: Henrik Gramner <[email protected]>
date: Mon Apr 20 19:35:30 EDT 2020
x86: Remove identity/adst itx fast paths. Testing shows that those code paths are essentially never executed with real-world bitstreams, so they just add redundant branches, increase code size, and add complexity for no actual benefit.
--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -27,15 +27,10 @@
%if ARCH_X86_64
-SECTION_RODATA 32
+SECTION_RODATA 16
; Note: The order of (at least some of) those constants matter!
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
-iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
-iadst4_dconly1a: dw 10568, 19856, 26752, 30424
-iadst4_dconly1b: dw 30424, 26752, 19856, 10568
-
deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
%macro COEF_PAIR 2
@@ -132,7 +127,7 @@
; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
; single rip-relative lea and then address things relative from that with
; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
-%define o_base iadst4_dconly2a + 128
+%define o_base deint_shuf + 128
%define o(x) (rax - (o_base) + (x))
%macro REPX 2-*
@@ -363,18 +358,14 @@
vpblendd m0, m0, m2, 0x03
ITX4_END 3, 0, 2, 1, 0
-%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
-cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, c, eob, tx2
- %undef cmp
- %define %%p1 m(i%1_%4_internal)
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%3_internal)
lea rax, [o_base]
; Jump to the 1st txfm function if we're not taking the fast path, which
; in turn performs an indirect jump to the 2nd txfm function.
- lea tx2q, [m(i%2_%4_internal).pass2]
-%if %3 > 0
- cmp eobd, %3
- jg %%p1
-%elif %3 == 0
+ lea tx2q, [m(i%2_%3_internal).pass2]
+%ifidn %1_%2, dct_dct
test eobd, eobd
jnz %%p1
%else
@@ -385,55 +376,17 @@
%endif
%endmacro
-%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 4x4
-%ifidn %1_%2, dct_identity
- vpbroadcastd m0, [o(pw_2896x8)]
- pmulhrsw m0, [cq]
- vpbroadcastd m1, [o(pw_1697x8)]
- pmulhrsw m1, m0
- paddsw m0, m1
- punpcklwd m0, m0
- punpckhdq m1, m0, m0
- punpckldq m0, m0
- jmp m(iadst_4x4_internal).end
-%elifidn %1_%2, identity_dct
- mova m0, [cq+16*0]
- packusdw m0, [cq+16*1]
- vpbroadcastd m1, [o(pw_1697x8)]
- vpbroadcastd m2, [o(pw_2896x8)]
- packusdw m0, m0
- pmulhrsw m1, m0
- paddsw m0, m1
- pmulhrsw m0, m2
- mova m1, m0
- jmp m(iadst_4x4_internal).end
-%elif %3 >= 0
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4
+%ifidn %1_%2, dct_dct
vpbroadcastw m0, [cq]
-%ifidn %1, dct
vpbroadcastd m1, [o(pw_2896x8)]
pmulhrsw m0, m1
-%elifidn %1, adst
- movddup m1, [o(iadst4_dconly1a)]
- pmulhrsw m0, m1
-%elifidn %1, flipadst
- movddup m1, [o(iadst4_dconly1b)]
- pmulhrsw m0, m1
-%endif
mov [cq], eobd ; 0
-%ifidn %2, dct
-%ifnidn %1, dct
- vpbroadcastd m1, [o(pw_2896x8)]
-%endif
pmulhrsw m0, m1
mova m1, m0
jmp m(iadst_4x4_internal).end2
-%else ; adst / flipadst
- pmulhrsw m1, m0, [o(iadst4_dconly2b)]
- pmulhrsw m0, [o(iadst4_dconly2a)]
- jmp m(i%2_4x4_internal).end2
%endif
-%endif
%endmacro
%macro IDCT4_1D_PACKED 0
@@ -477,10 +430,10 @@
packssdw m1, m2 ; out2 out3
%endmacro
-INV_TXFM_4X4_FN dct, dct, 0
-INV_TXFM_4X4_FN dct, adst, 0
-INV_TXFM_4X4_FN dct, flipadst, 0
-INV_TXFM_4X4_FN dct, identity, 3
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
@@ -499,9 +452,9 @@
mova [cq+16*1], m2
ITX4_END 0, 1, 3, 2
-INV_TXFM_4X4_FN adst, dct, 0
-INV_TXFM_4X4_FN adst, adst, 0
-INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity
cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
@@ -526,9 +479,9 @@
IADST4_1D_PACKED
ret
-INV_TXFM_4X4_FN flipadst, dct, 0
-INV_TXFM_4X4_FN flipadst, adst, 0
-INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity
cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
@@ -549,7 +502,7 @@
.end2:
ITX4_END 3, 2, 1, 0
-INV_TXFM_4X4_FN identity, dct, 3
+INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity
@@ -600,38 +553,9 @@
pextrd [r2 +r3 ], xm5, 3
%endmacro
-%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 4x8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
- vpbroadcastd xm0, [o(pw_2896x8)]
- pmulhrsw xm1, xm0, [cq]
- vpbroadcastd xm2, [o(pw_4096)]
- pmulhrsw xm1, xm0
- pmulhrsw xm1, xm2
- vpermq m1, m1, q1100
- punpcklwd m1, m1
- punpckldq m0, m1, m1
- punpckhdq m1, m1
- jmp m(iadst_4x8_internal).end3
-%elifidn %1_%2, identity_dct
- movd xm0, [cq+16*0]
- punpcklwd xm0, [cq+16*1]
- movd xm1, [cq+16*2]
- punpcklwd xm1, [cq+16*3]
- vpbroadcastd xm2, [o(pw_2896x8)]
- vpbroadcastd xm3, [o(pw_1697x8)]
- vpbroadcastd xm4, [o(pw_2048)]
- punpckldq xm0, xm1
- pmulhrsw xm0, xm2
- pmulhrsw xm3, xm0
- paddsw xm0, xm3
- pmulhrsw xm0, xm2
- pmulhrsw xm0, xm4
- vpbroadcastq m0, xm0
- mova m1, m0
- jmp m(iadst_4x8_internal).end3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8
+%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_2048)]
@@ -641,25 +565,8 @@
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
mova m1, m0
- jmp m(iadst_4x8_internal).end4
-%else ; adst_dct / flipadst_dct
- vpbroadcastw xm0, [cq]
- vpbroadcastd xm1, [o(pw_2896x8)]
- pmulhrsw xm0, xm1
- pmulhrsw xm0, [o(iadst4_dconly1a)]
- vpbroadcastd xm2, [o(pw_2048)]
- mov [cq], eobd
- pmulhrsw xm0, xm1
- pmulhrsw xm0, xm2
-%ifidn %1, adst
- vpbroadcastq m0, xm0
-%else ; flipadst
- vpermq m0, m0, q1111
+ jmp m(iadst_4x8_internal).end3
%endif
- mova m1, m0
- jmp m(iadst_4x8_internal).end4
-%endif
-%endif
%endmacro
%macro IDCT8_1D_PACKED 0
@@ -772,10 +679,10 @@
%endmacro
INIT_YMM avx2
-INV_TXFM_4X8_FN dct, dct, 0
-INV_TXFM_4X8_FN dct, identity, 7
+INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120
@@ -804,7 +711,7 @@
WRAP_XMM IDCT8_1D_PACKED
ret
-INV_TXFM_4X8_FN adst, dct, 0
+INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity
@@ -838,11 +745,10 @@
pmulhrsw m0, m4
pmulhrsw m1, m4
WIN64_RESTORE_XMM
-.end3:
pxor m2, m2
mova [cq+32*0], m2
mova [cq+32*1], m2
-.end4:
+.end3:
lea r2, [dstq+strideq*4]
lea r3, [strideq*3]
WRITE_4X8 0, 1
@@ -856,7 +762,7 @@
WRAP_XMM IADST8_1D_PACKED 2
ret
-INV_TXFM_4X8_FN flipadst, dct, 0
+INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity
@@ -888,7 +794,7 @@
pshufd m1, m2, q1032
jmp m(iadst_4x8_internal).end
-INV_TXFM_4X8_FN identity, dct, 3
+INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity
@@ -913,49 +819,9 @@
vpbroadcastd m4, [o(pw_4096)]
jmp m(iadst_4x8_internal).end2
-%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 4x16
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
- vpbroadcastd m0, [o(pw_2896x8)]
- pmulhrsw m0, [cq]
- vpbroadcastd m1, [o(pw_16384)]
- vpbroadcastd m2, [o(pw_1697x16)]
- vpbroadcastd m3, [o(pw_2048)]
- pmulhrsw m0, m1
- pmulhrsw m2, m0
- paddsw m0, m0
- paddsw m0, m2
- pmulhrsw m3, m0
- punpcklwd m1, m3, m3
- punpckhwd m3, m3
- punpckldq m0, m1, m1
- punpckhdq m1, m1
- punpckldq m2, m3, m3
- punpckhdq m3, m3
- jmp m(iadst_4x16_internal).end3
-%elifidn %1_%2, identity_dct
- movd xm0, [cq+32*0]
- punpcklwd xm0, [cq+32*1]
- movd xm1, [cq+32*2]
- punpcklwd xm1, [cq+32*3]
- vpbroadcastd xm2, [o(pw_1697x8)]
- vpbroadcastd xm3, [o(pw_2896x8)]
- vpbroadcastd xm4, [o(pw_2048)]
- punpckldq xm0, xm1
- pcmpeqw xm1, xm1
- pmulhrsw xm2, xm0
- pcmpeqw xm1, xm0
- pxor xm0, xm1
- pavgw xm0, xm2
- pmulhrsw xm0, xm3
- pmulhrsw xm0, xm4
- vpbroadcastq m0, xm0
- mova m1, m0
- mova m2, m0
- mova m3, m0
- jmp m(iadst_4x16_internal).end3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16
+%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
@@ -968,28 +834,8 @@
mova m1, m0
mova m2, m0
mova m3, m0
- jmp m(iadst_4x16_internal).end4
-%else ; adst_dct / flipadst_dct
- vpbroadcastw xm0, [cq]
- pmulhrsw xm0, [o(iadst4_dconly1a)]
- vpbroadcastd xm1, [o(pw_16384)]
- vpbroadcastd xm2, [o(pw_2896x8)]
- mov [cq], eobd
- pmulhrsw xm0, xm1
- psrlw xm1, 3 ; pw_2048
- pmulhrsw xm0, xm2
- pmulhrsw xm0, xm1
-%ifidn %1, adst
- vpbroadcastq m0, xm0
-%else ; flipadst
- vpermq m0, m0, q1111
+ jmp m(iadst_4x16_internal).end3
%endif
- mova m1, m0
- mova m2, m0
- mova m3, m0
- jmp m(iadst_4x16_internal).end4
-%endif
-%endif
%endmacro
%macro IDCT16_1D_PACKED 0
@@ -1061,10 +907,10 @@
paddsw m3, m8 ; out7 out6
%endmacro
-INV_TXFM_4X16_FN dct, dct, 0
-INV_TXFM_4X16_FN dct, identity, 15
+INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m0, [cq+32*0]
@@ -1102,7 +948,7 @@
WRAP_XMM IDCT16_1D_PACKED
ret
-INV_TXFM_4X16_FN adst, dct, 0
+INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity
@@ -1147,13 +993,12 @@
.end2:
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
WIN64_RESTORE_XMM
-.end3:
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
mova [cq+32*2], m4
mova [cq+32*3], m4
-.end4:
+.end3:
lea r2, [dstq+strideq*8]
lea r3, [strideq*3]
WRITE_4X8 0, 1
@@ -1232,7 +1077,7 @@
packssdw m1, m4 ; -out7 out4 out6 -out5
ret
-INV_TXFM_4X16_FN flipadst, dct, 0
+INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity
@@ -1274,7 +1119,7 @@
psubw m5, m7, m6
jmp m(iadst_4x16_internal).end
-INV_TXFM_4X16_FN identity, dct, 3
+INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
@@ -1350,69 +1195,25 @@
movhps [dstq+%7], xm%4
%endmacro
-%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x4
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
- vpbroadcastd xm0, [o(pw_2896x8)]
- pmulhrsw xm1, xm0, [cq]
- vpbroadcastd xm2, [o(pw_1697x8)]
- vpbroadcastd xm3, [o(pw_2048)]
- pmulhrsw xm1, xm0
- pmulhrsw xm2, xm1
- paddsw xm1, xm2
- pmulhrsw xm1, xm3
- punpcklwd xm1, xm1
- punpckldq xm0, xm1, xm1
- punpckhdq xm1, xm1
- vpermq m0, m0, q1100
- vpermq m1, m1, q1100
-%elifidn %1_%2, identity_dct
- mova xm0, [cq+16*0]
- packusdw xm0, [cq+16*1]
- mova xm1, [cq+16*2]
- packusdw xm1, [cq+16*3]
- vpbroadcastd xm2, [o(pw_2896x8)]
- vpbroadcastd xm3, [o(pw_2048)]
- packusdw xm0, xm1
- pmulhrsw xm0, xm2
- paddsw xm0, xm0
- pmulhrsw xm0, xm2
- pmulhrsw xm0, xm3
- vinserti128 m0, m0, xm0, 1
- mova m1, m0
-%else
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4
+%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
pmulhrsw xm0, xm1
-%ifidn %2, dct
movd xm2, [o(pw_2048)]
pmulhrsw xm0, xm1
pmulhrsw xm0, xm2
vpbroadcastw m0, xm0
mova m1, m0
-%else ; adst / flipadst
- vpbroadcastw m0, xm0
- pmulhrsw m0, [o(iadst4_dconly2a)]
- vpbroadcastd m1, [o(pw_2048)]
- pmulhrsw m1, m0
-%ifidn %2, adst
- vpermq m0, m1, q1100
- vpermq m1, m1, q3322
-%else ; flipadst
- vpermq m0, m1, q2233
- vpermq m1, m1, q0011
-%endif
-%endif
-%endif
jmp m(iadst_8x4_internal).end3
%endif
%endmacro
-INV_TXFM_8X4_FN dct, dct, 0
-INV_TXFM_8X4_FN dct, adst, 0
-INV_TXFM_8X4_FN dct, flipadst, 0
-INV_TXFM_8X4_FN dct, identity, 3
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2896x8)]
@@ -1510,7 +1311,7 @@
vpermq m0, m2, q2031
jmp m(iadst_8x4_internal).end2
-INV_TXFM_8X4_FN identity, dct, 7
+INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity
@@ -1538,25 +1339,9 @@
paddsw m1, m3
jmp m(iadst_8x4_internal).end
-%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x8
-%ifidn %1_%2, dct_identity
- vpbroadcastd xm0, [o(pw_2896x8)]
- pmulhrsw xm0, [cq]
- vpbroadcastd xm1, [o(pw_16384)]
- pmulhrsw xm0, xm1
- psrlw xm1, 2 ; pw_4096
- pmulhrsw xm0, xm1
- pshufb xm0, [o(deint_shuf)]
- vpermq m3, m0, q1100
- punpcklwd m3, m3
- pshufd m0, m3, q0000
- pshufd m1, m3, q1111
- pshufd m2, m3, q2222
- pshufd m3, m3, q3333
- jmp m(iadst_8x8_internal).end4
-%elif %3 >= 0
-%ifidn %1, dct
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
movd xm2, [o(pw_16384)]
@@ -1576,33 +1361,13 @@
dec r2d
jg .loop
RET
-%else ; identity
- mova m0, [cq+32*0]
- punpcklwd m0, [cq+32*1]
- mova m1, [cq+32*2]
- punpcklwd m1, [cq+32*3]
- vpbroadcastd m2, [o(pw_2896x8)]
- vpbroadcastd m3, [o(pw_2048)]
- pxor m4, m4
- mova [cq+32*0], m4
- mova [cq+32*1], m4
- mova [cq+32*2], m4
- mova [cq+32*3], m4
- punpckldq m0, m1
- vpermq m1, m0, q3232
- vpermq m0, m0, q1010
- punpcklwd m0, m1
- pmulhrsw m0, m2
- pmulhrsw m0, m3
- jmp m(inv_txfm_add_dct_dct_8x8).end
%endif
-%endif
%endmacro
-INV_TXFM_8X8_FN dct, dct, 0
-INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120 ; 0 1
@@ -1749,7 +1514,7 @@
pmulhrsw m0, m5, m4
jmp m(iadst_8x8_internal).end3
-INV_TXFM_8X8_FN identity, dct, 7
+INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity
@@ -1776,8 +1541,8 @@
vpbroadcastd m4, [o(pw_4096)]
jmp m(iadst_8x8_internal).end
-%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x16
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
@@ -1791,66 +1556,6 @@
vpbroadcastw m0, xm0
mov r2d, 4
jmp m(inv_txfm_add_dct_dct_8x8).end2
-%elifidn %1_%2, dct_identity
- WIN64_SPILL_XMM 13
- vpbroadcastd m0, [o(pw_2896x8)]
- pmulhrsw m7, m0, [cq]
- vpbroadcastd m1, [o(pw_16384)]
- vpbroadcastd m2, [o(pw_1697x16)]
- pxor m3, m3
- mova [cq], m3
- pmulhrsw m7, m0
- pmulhrsw m7, m1
- psrlw m1, 3 ; pw_2048
- pmulhrsw m2, m7
- paddsw m7, m7
- paddsw m7, m2
- pmulhrsw m7, m1
- punpcklwd m5, m7, m7
- punpckhwd m7, m7
- punpcklwd m4, m5, m5
- punpckhwd m5, m5
- punpcklwd m6, m7, m7
- punpckhwd m7, m7
- vpermq m0, m4, q1100
- vpermq m1, m5, q1100
- vpermq m2, m6, q1100
- vpermq m3, m7, q1100
- vpermq m4, m4, q3322
- vpermq m5, m5, q3322
- vpermq m6, m6, q3322
- vpermq m7, m7, q3322
- jmp m(idct_8x16_internal).end4
-%elifidn %1_%2, identity_dct
- movd xm0, [cq+32*0]
- punpcklwd xm0, [cq+32*1]
- movd xm2, [cq+32*2]
- punpcklwd xm2, [cq+32*3]
- add cq, 32*4
- movd xm1, [cq+32*0]
- punpcklwd xm1, [cq+32*1]
- movd xm3, [cq+32*2]
- punpcklwd xm3, [cq+32*3]
- vpbroadcastd xm4, [o(pw_2896x8)]
- vpbroadcastd xm5, [o(pw_2048)]
- xor eax, eax
- mov [cq-32*4], eax
- mov [cq-32*3], eax
- mov [cq-32*2], eax
- mov [cq-32*1], eax
- punpckldq xm0, xm2
- punpckldq xm1, xm3
- punpcklqdq xm0, xm1
- pmulhrsw xm0, xm4
- pmulhrsw xm0, xm4
- pmulhrsw xm0, xm5
- mov [cq+32*0], eax
- mov [cq+32*1], eax
- mov [cq+32*2], eax
- mov [cq+32*3], eax
- vinserti128 m0, m0, xm0, 1
- mov r2d, 4
- jmp m(inv_txfm_add_dct_dct_8x8).end2
%endif
%endmacro
@@ -1867,10 +1572,10 @@
pmulhrsw m4, [cq+32*0]
%endmacro
-INV_TXFM_8X16_FN dct, dct, 0
-INV_TXFM_8X16_FN dct, identity, 15
+INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
@@ -1915,7 +1620,6 @@
.end3:
pxor m8, m8
REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
-.end4:
lea r3, [strideq*3]
WRITE_8X4 0, 1, 8, 9
lea dstq, [dstq+strideq*4]
@@ -2120,7 +1824,7 @@
pmulhrsw m7, m9, m8
jmp m(idct_8x16_internal).end3
-INV_TXFM_8X16_FN identity, dct, 7
+INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity
@@ -2197,64 +1901,11 @@
vextracti128 [dstq+%6], m%3, 1
%endmacro
-%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x4
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
- vpbroadcastd xm3, [o(pw_2896x8)]
- pmulhrsw xm3, [cq]
- vpbroadcastd xm0, [o(pw_16384)]
- vpbroadcastd xm1, [o(pw_1697x8)]
- pmulhrsw xm3, xm0
- psrlw xm0, 3 ; pw_2048
- pmulhrsw xm1, xm3
- paddsw xm3, xm1
- pmulhrsw xm3, xm0
- punpcklwd xm3, xm3
- punpckldq xm1, xm3, xm3
- punpckhdq xm3, xm3
- vpbroadcastq m0, xm1
- vpermq m1, m1, q1111
- vpbroadcastq m2, xm3
- vpermq m3, m3, q1111
- jmp m(iadst_16x4_internal).end2
-%elifidn %1_%2, identity_dct
- mova xm0, [cq+16*0]
- mova xm2, [cq+16*1]
- vinserti128 m0, m0, [cq+16*4], 1
- vinserti128 m2, m2, [cq+16*5], 1
- mova xm1, [cq+16*2]
- mova xm3, [cq+16*3]
- vinserti128 m1, m1, [cq+16*6], 1
- vinserti128 m3, m3, [cq+16*7], 1
- vpbroadcastd m4, [o(pw_1697x16)]
- vpbroadcastd m5, [o(pw_16384)]
- packusdw m0, m2
- packusdw m1, m3
- packusdw m0, m1
- vpbroadcastd m1, [o(pw_2896x8)]
- pmulhrsw m4, m0
- pmulhrsw m4, m5
- paddsw m0, m4
- psrlw m5, 3 ; pw_2048
- pmulhrsw m0, m1
- pmulhrsw m0, m5
- mov r3d, 2
-.end:
- pxor m3, m3
-.end_loop:
- mova [cq+32*0], m3
- mova [cq+32*1], m3
- add cq, 32*2
- WRITE_16X2 0, 0, 1, 2, strideq*0, strideq*1
- lea dstq, [dstq+strideq*2]
- dec r3d
- jg .end_loop
- RET
-%else
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4
+%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
-%ifidn %2, dct
movd xm2, [o(pw_16384)]
mov [cq], eobd
mov r2d, 2
@@ -2279,35 +1930,13 @@
dec r2d
jg .dconly_loop
RET
-%else ; adst / flipadst
- movd xm2, [o(pw_16384)]
- pmulhrsw xm0, xm2
- vpbroadcastw m0, xm0
- pmulhrsw m0, [o(iadst4_dconly2a)]
- vpbroadcastd m3, [o(pw_2048)]
- mov [cq], eobd
- pmulhrsw m3, m0
-%ifidn %2, adst
- vpbroadcastq m0, xm3
- vpermq m1, m3, q1111
- vpermq m2, m3, q2222
- vpermq m3, m3, q3333
-%else ; flipadst
- vpermq m0, m3, q3333
- vpermq m1, m3, q2222
- vpermq m2, m3, q1111
- vpbroadcastq m3, xm3
%endif
- jmp m(iadst_16x4_internal).end3
-%endif
-%endif
-%endif
%endmacro
-INV_TXFM_16X4_FN dct, dct, 0
-INV_TXFM_16X4_FN dct, adst, 0
-INV_TXFM_16X4_FN dct, flipadst, 0
-INV_TXFM_16X4_FN dct, identity, 3
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova xm0, [cq+16*0]
@@ -2481,7 +2110,7 @@
WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1
RET
-INV_TXFM_16X4_FN identity, dct, 15
+INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
@@ -2531,8 +2160,8 @@
paddsw m3, m7
jmp m(iadst_16x4_internal).end
-%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x8
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
@@ -2541,59 +2170,6 @@
pmulhrsw xm0, xm1
mov r2d, 4
jmp m(inv_txfm_add_dct_dct_16x4).dconly
-%elifidn %1_%2, dct_identity
- WIN64_SPILL_XMM 13
- vbroadcasti128 m7, [cq]
- vpbroadcastd m0, [o(pw_2896x8)]
- vpbroadcastd m1, [o(pw_16384)]
- pxor xm2, xm2
- mova [cq], xm2
- pmulhrsw m7, m0
- pmulhrsw m7, m0
- pmulhrsw m7, m1
- psrlw m1, 2 ; pw_4096
- pmulhrsw m7, m1
- punpcklwd m3, m7, m7
- punpckhwd m7, m7
- pshufd m0, m3, q0000
- pshufd m1, m3, q1111
- pshufd m2, m3, q2222
- pshufd m3, m3, q3333
- pshufd m4, m7, q0000
- pshufd m5, m7, q1111
- pshufd m6, m7, q2222
- pshufd m7, m7, q3333
- lea r3, [strideq*3]
- WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
- WRITE_16X2 2, 3, 0, 1, strideq*2, r3
- jmp m(idct_16x8_internal).end4
-%elifidn %1_%2, identity_dct
- mova m0, [cq+32*0]
- packusdw m0, [cq+32*1]
- mova m2, [cq+32*2]
- packusdw m2, [cq+32*3]
- mova m1, [cq+32*4]
- packusdw m1, [cq+32*5]
- mova m3, [cq+32*6]
- packusdw m3, [cq+32*7]
- vpbroadcastd m4, [o(pw_2896x8)]
- vpbroadcastd m5, [o(pw_1697x16)]
- packusdw m0, m2
- packusdw m1, m3
- vpbroadcastd m2, [o(pw_16384)]
- packusdw m0, m1
- vpermq m1, m0, q3322
- vpermq m0, m0, q1100
- punpcklwd m0, m1
- pmulhrsw m0, m4
- pmulhrsw m5, m0
- pmulhrsw m5, m2
- paddsw m0, m5
- psrlw m2, 3 ; pw_2048
- pmulhrsw m0, m4
- pmulhrsw m0, m2
- mov r3d, 4
- jmp m(inv_txfm_add_identity_dct_16x4).end
%endif
%endmacro
@@ -2611,10 +2187,10 @@
REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
%endmacro
-INV_TXFM_16X8_FN dct, dct, 0
-INV_TXFM_16X8_FN dct, identity, 7
+INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 3120
@@ -2837,7 +2413,7 @@
WRITE_16X2 1, 2, 0, 1, strideq*2, r3
jmp m(idct_16x8_internal).end3
-INV_TXFM_16X8_FN identity, dct, 15
+INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity
@@ -2896,8 +2472,8 @@
%define o_base pw_5 + 128
-%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x16
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
@@ -2905,72 +2481,6 @@
mov [cq], eobd
mov r2d, 8
jmp m(inv_txfm_add_dct_dct_16x4).dconly
-%elifidn %1_%2, dct_identity
- WIN64_SPILL_XMM 7
- vpbroadcastd m3, [o(pw_2896x8)]
- pmulhrsw m3, [cq]
- vpbroadcastd m0, [o(pw_8192)]
- vpbroadcastd m1, [o(pw_1697x16)]
- vpbroadcastw m4, [o(deint_shuf)] ; pb_0_1
- pcmpeqb m5, m5
- pxor m6, m6
- mova [cq], m6
- paddb m5, m5 ; pb_m2
- pmulhrsw m3, m0
- psrlw m0, 2 ; pw_2048
- IDTX16 3, 1, 1
- pmulhrsw m3, m0
- mov r3d, 8
-.loop:
- mova xm1, [dstq]
- vinserti128 m1, m1, [dstq+strideq*8], 1
- pshufb m0, m3, m4
- psubb m4, m5 ; += 2
- punpckhbw m2, m1, m6
- punpcklbw m1, m6
- paddw m2, m0
- paddw m1, m0
- packuswb m1, m2
- mova [dstq], xm1
- vextracti128 [dstq+strideq*8], m1, 1
- add dstq, strideq
- dec r3d
- jg .loop
- RET
-%elifidn %1_%2, identity_dct
- movd xm0, [cq+32*0 ]
- movd xm2, [cq+32*1 ]
- movd xm1, [cq+32*2 ]
- movd xm3, [cq+32*3 ]
- vinserti128 m0, m0, [cq+32*8 ], 1
- vinserti128 m2, m2, [cq+32*9 ], 1
- vinserti128 m1, m1, [cq+32*10], 1
- vinserti128 m3, m3, [cq+32*11], 1
- punpcklwd m0, m2
- punpcklwd m1, m3
- punpckldq m0, m1
- movd xm1, [cq+32*4 ]
- movd xm3, [cq+32*5 ]
- movd xm2, [cq+32*6 ]
- movd xm4, [cq+32*7 ]
- vinserti128 m1, m1, [cq+32*12], 1
- vinserti128 m3, m3, [cq+32*13], 1
- vinserti128 m2, m2, [cq+32*14], 1
- vinserti128 m4, m4, [cq+32*15], 1
- punpcklwd m1, m3
- vpbroadcastd m3, [o(pw_1697x16)]
- punpcklwd m2, m4
- vpbroadcastd m4, [o(pw_2896x8)]
- punpckldq m1, m2
- vpbroadcastd m2, [o(pw_2048)]
- punpcklqdq m0, m1
- pmulhrsw m3, m0
- psraw m3, 1
- pavgw m0, m3
- pmulhrsw m0, m4
- pmulhrsw m0, m2
- mov r3d, 8
- jmp m(inv_txfm_add_identity_dct_16x4).end
%endif
%endmacro
@@ -2995,10 +2505,10 @@
mova [rsp], m15
%endmacro
-INV_TXFM_16X16_FN dct, dct, 0
-INV_TXFM_16X16_FN dct, identity, 15
+INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
@@ -3395,7 +2905,7 @@
pavgw m%1, m%2 ; signs are guaranteed to be equal
%endmacro
-INV_TXFM_16X16_FN identity, dct, 15
+INV_TXFM_16X16_FN identity, dct
INV_TXFM_16X16_FN identity, identity
cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
@@ -3456,7 +2966,7 @@
paddsw m15, m1
jmp m(idct_16x16_internal).end
-%define o_base iadst4_dconly2a + 128
+%define o_base deint_shuf + 128
%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
%if %3
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -139,11 +139,6 @@
pw_4085x8: times 8 dw 4085*8
pw_m301x8: times 8 dw -301*8
-iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
-iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
-iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
-
SECTION .text
%macro REPX 2-*
@@ -243,31 +238,24 @@
paddsw m0, m2 ;high: out1 ;low: out0
%endmacro
-%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
-cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
- %undef cmp
- %define %%p1 m(i%1_%4_internal)
+%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
+cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2
+ %define %%p1 m(i%1_%3_internal)
%if ARCH_X86_32
LEA r5, $$
%endif
%if has_epilogue
-%if %3 > 0
- cmp eobd, %3
- jle %%end
-%elif %3 == 0
+%ifidn %1_%2, dct_dct
test eobd, eobd
jz %%end
%endif
- lea tx2q, [o(m(i%2_%4_internal).pass2)]
+ lea tx2q, [o(m(i%2_%3_internal).pass2)]
call %%p1
RET
%%end:
%else
- lea tx2q, [o(m(i%2_%4_internal).pass2)]
-%if %3 > 0
- cmp eobd, %3
- jg %%p1
-%elif %3 == 0
+ lea tx2q, [o(m(i%2_%3_internal).pass2)]
+%ifidn %1_%2, dct_dct
test eobd, eobd
jnz %%p1
%else
@@ -278,63 +266,26 @@
%endif
%endmacro
-%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 4x4, 6
-%ifidn %1_%2, dct_identity
- mova m0, [o(pw_2896x8)]
- pmulhrsw m0, [coeffq]
- pmulhrsw m1, m0, [o(pw_1697x8)]
- paddsw m0, m1
- punpcklwd m0, m0
- punpckhdq m1, m0, m0
- punpckldq m0, m0
- TAIL_CALL m(iadst_4x4_internal).end
-%elifidn %1_%2, identity_dct
- mova m1, [coeffq+16*0]
- mova m2, [coeffq+16*1]
- punpcklwd m0, m1, m2
- punpckhwd m1, m2
- punpcklwd m0, m1
- punpcklqdq m0, m0
- pmulhrsw m1, m0, [o(pw_1697x8)]
- paddsw m0, m1
- pmulhrsw m0, [o(pw_2896x8)]
- mova m1, m0
- TAIL_CALL m(iadst_4x4_internal).end
-%elif %3 >= 0
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4, 6
+%ifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklqdq m0, m0
-%ifidn %1, dct
mova m1, [o(pw_2896x8)]
pmulhrsw m0, m1
-%elifidn %1, adst
- pmulhrsw m0, [o(iadst4_dconly1a)]
-%elifidn %1, flipadst
- pmulhrsw m0, [o(iadst4_dconly1b)]
-%endif
mov [coeffq], eobd ;0
-%ifidn %2, dct
-%ifnidn %1, dct
- pmulhrsw m0, [o(pw_2896x8)]
-%else
pmulhrsw m0, m1
-%endif
mova m1, m0
TAIL_CALL m(iadst_4x4_internal).end2
-%else ; adst / flipadst
- pmulhrsw m1, m0, [o(iadst4_dconly2b)]
- pmulhrsw m0, [o(iadst4_dconly2a)]
- TAIL_CALL m(i%2_4x4_internal).end2
%endif
-%endif
%endmacro
INIT_XMM ssse3
-INV_TXFM_4X4_FN dct, dct, 0
-INV_TXFM_4X4_FN dct, adst, 0
-INV_TXFM_4X4_FN dct, flipadst, 0
-INV_TXFM_4X4_FN dct, identity, 3
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0] ;high: in1 ;low: in0
@@ -358,9 +309,9 @@
ITX4_END 0, 1, 3, 2
-INV_TXFM_4X4_FN adst, dct, 0
-INV_TXFM_4X4_FN adst, adst, 0
-INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity
cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
@@ -410,9 +361,9 @@
packssdw m1, m2 ;high: out3 ;low: out3
ret
-INV_TXFM_4X4_FN flipadst, dct, 0
-INV_TXFM_4X4_FN flipadst, adst, 0
-INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity
cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
@@ -436,7 +387,7 @@
.end2:
ITX4_END 3, 2, 1, 0
-INV_TXFM_4X4_FN identity, dct, 3
+INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity
@@ -595,39 +546,9 @@
punpckhdq m3, m4 ;low: in6 high: in7
%endmacro
-%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 4x8, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
- mova m1, [o(pw_2896x8)]
- pmulhrsw m0, m1, [coeffq]
- pmulhrsw m0, m1
- pmulhrsw m0, [o(pw_4096)]
- punpckhwd m2, m0, m0
- punpcklwd m0, m0
- punpckhdq m1, m0, m0
- punpckldq m0, m0
- punpckhdq m3, m2, m2
- punpckldq m2, m2
- TAIL_CALL m(iadst_4x8_internal).end3
-%elifidn %1_%2, identity_dct
- movd m0, [coeffq+16*0]
- punpcklwd m0, [coeffq+16*1]
- movd m1, [coeffq+16*2]
- punpcklwd m1, [coeffq+16*3]
- mova m2, [o(pw_2896x8)]
- punpckldq m0, m1
- pmulhrsw m0, m2
- pmulhrsw m1, m0, [o(pw_1697x8)]
- paddsw m0, m1
- pmulhrsw m0, m2
- pmulhrsw m0, [o(pw_2048)]
- punpcklqdq m0, m0
- mova m1, m0
- mova m2, m0
- mova m3, m0
- TAIL_CALL m(iadst_4x8_internal).end3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8, 8
+%ifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklqdq m0, m0
mova m1, [o(pw_2896x8)]
@@ -639,32 +560,14 @@
mova m1, m0
mova m2, m0
mova m3, m0
- TAIL_CALL m(iadst_4x8_internal).end4
-%else ; adst_dct / flipadst_dct
- pshuflw m0, [coeffq], q0000
- punpcklqdq m0, m0
- mova m1, [o(pw_2896x8)]
- pmulhrsw m0, m1
-%ifidn %1, adst
- pmulhrsw m0, [o(iadst4_dconly1a)]
-%else ; flipadst
- pmulhrsw m0, [o(iadst4_dconly1b)]
+ TAIL_CALL m(iadst_4x8_internal).end3
%endif
- mov [coeffq], eobd
- pmulhrsw m0, m1
- pmulhrsw m0, [o(pw_2048)]
- mova m1, m0
- mova m2, m0
- mova m3, m0
- TAIL_CALL m(iadst_4x8_internal).end4
-%endif
-%endif
%endmacro
-INV_TXFM_4X8_FN dct, dct, 0
-INV_TXFM_4X8_FN dct, identity, 7
+INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m3, [o(pw_2896x8)]
@@ -690,7 +593,7 @@
ret
-INV_TXFM_4X8_FN adst, dct, 0
+INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity
@@ -725,8 +628,6 @@
pmulhrsw m1, m4
pmulhrsw m2, m4
pmulhrsw m3, m4
-
-.end3:
pxor m5, m5
mova [coeffq+16*0], m5
mova [coeffq+16*1], m5
@@ -733,7 +634,7 @@
mova [coeffq+16*2], m5
mova [coeffq+16*3], m5
-.end4:
+.end3:
WRITE_4X8 0, 1, 2, 3
RET
@@ -783,7 +684,7 @@
packssdw m2, m4 ;low: out4 high: -out5
ret
-INV_TXFM_4X8_FN flipadst, dct, 0
+INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity
@@ -824,7 +725,7 @@
psubw m4, m5
jmp m(iadst_4x8_internal).end
-INV_TXFM_4X8_FN identity, dct, 3
+INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity
@@ -881,50 +782,14 @@
WRITE_8X2 %3, %4, %5, %6, %7
%endmacro
-%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x4, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
- mova m0, [o(pw_2896x8)]
- pmulhrsw m1, m0, [coeffq]
- pmulhrsw m1, m0
- pmulhrsw m0, m1, [o(pw_1697x8)]
- paddsw m1, m0
- pmulhrsw m1, [o(pw_2048)]
- punpcklwd m1, m1
- punpckhdq m2, m1, m1
- punpckldq m1, m1
- punpckhdq m3, m2, m2
- punpckldq m2, m2
- punpckldq m0, m1, m1
- punpckhdq m1, m1
-%elifidn %1_%2, identity_dct
- mova m0, [coeffq+16*0]
- mova m1, [coeffq+16*1]
- mova m2, [coeffq+16*2]
- mova m3, [coeffq+16*3]
- punpckhwd m4, m0, m1
- punpcklwd m0, m1
- punpckhwd m5, m2, m3
- punpcklwd m2, m3
- punpcklwd m0, m4
- punpcklwd m2, m5
- punpcklqdq m0, m2
- mova m4, [o(pw_2896x8)]
- pmulhrsw m0, m4
- paddsw m0, m0
- pmulhrsw m0, m4
- pmulhrsw m0, [o(pw_2048)]
- mova m1, m0
- mova m2, m0
- mova m3, m0
-%else
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4, 8
+%ifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklqdq m0, m0
mova m1, [o(pw_2896x8)]
pmulhrsw m0, m1
pmulhrsw m0, m1
-%ifidn %2, dct
mova m2, [o(pw_2048)]
pmulhrsw m0, m1
pmulhrsw m0, m2
@@ -931,34 +796,14 @@
mova m1, m0
mova m2, m0
mova m3, m0
-%else ; adst / flipadst
- pmulhrsw m2, m0, [o(iadst4_dconly2b)]
- pmulhrsw m0, [o(iadst4_dconly2a)]
- mova m1, [o(pw_2048)]
- pmulhrsw m0, m1
- pmulhrsw m2, m1
-%ifidn %2, adst
- punpckhqdq m1, m0, m0
- punpcklqdq m0, m0
- punpckhqdq m3, m2, m2
- punpcklqdq m2, m2
-%else ; flipadst
- mova m3, m0
- punpckhqdq m0, m2, m2
- punpcklqdq m1, m2, m2
- punpckhqdq m2, m3, m3
- punpcklqdq m3, m3
-%endif
-%endif
-%endif
TAIL_CALL m(iadst_8x4_internal).end2
%endif
%endmacro
-INV_TXFM_8X4_FN dct, dct, 0
-INV_TXFM_8X4_FN dct, adst, 0
-INV_TXFM_8X4_FN dct, flipadst, 0
-INV_TXFM_8X4_FN dct, identity, 3
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m3, [o(pw_2896x8)]
@@ -1157,7 +1002,7 @@
mova m3, m4
jmp m(iadst_8x4_internal).end
-INV_TXFM_8X4_FN identity, dct, 7
+INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity
@@ -1199,30 +1044,9 @@
paddsw m3, m7
jmp m(iadst_8x4_internal).end
-%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x8, 8, 16*4
-%ifidn %1_%2, dct_identity
- mova m0, [o(pw_2896x8)]
- pmulhrsw m0, [coeffq]
- mova m1, [o(pw_16384)]
- pmulhrsw m0, m1
- psrlw m1, 2
- pmulhrsw m0, m1
- punpckhwd m7, m0, m0
- punpcklwd m0, m0
- pshufd m3, m0, q3333
- pshufd m2, m0, q2222
- pshufd m1, m0, q1111
- pshufd m0, m0, q0000
- call m(iadst_8x4_internal).end2
- pshufd m3, m7, q3333
- pshufd m2, m7, q2222
- pshufd m1, m7, q1111
- pshufd m0, m7, q0000
- lea dstq, [dstq+strideq*2]
- TAIL_CALL m(iadst_8x4_internal).end3
-%elif %3 >= 0
-%ifidn %1, dct
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8, 8, 16*4
+%ifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
mova m1, [o(pw_2896x8)]
@@ -1244,25 +1068,7 @@
jmp tx2q
.end3:
RET
-%else ; identity
- mova m0, [coeffq+16*0]
- mova m1, [coeffq+16*1]
- mova m2, [coeffq+16*2]
- mova m3, [coeffq+16*3]
- punpcklwd m0, [coeffq+16*4]
- punpcklwd m1, [coeffq+16*5]
- punpcklwd m2, [coeffq+16*6]
- punpcklwd m3, [coeffq+16*7]
- punpcklwd m0, m2
- punpcklwd m1, m3
- punpcklwd m0, m1
- pmulhrsw m0, [o(pw_2896x8)]
- pmulhrsw m0, [o(pw_2048)]
- pxor m4, m4
- REPX {mova [coeffq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
- jmp m(inv_txfm_add_dct_dct_8x8).end
%endif
-%endif
%endmacro
%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
@@ -1298,10 +1104,10 @@
ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
%endmacro
-INV_TXFM_8X8_FN dct, dct, 0
-INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq, 16
@@ -1610,7 +1416,7 @@
mova [rsp+gprsize+16*0], m7
jmp m(idct_8x8_internal).end3
-INV_TXFM_8X8_FN identity, dct, 7
+INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity
@@ -1634,58 +1440,9 @@
jmp m(idct_8x8_internal).end3
-%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 4x16, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
- mova m0, [o(pw_2896x8)]
- mova m1, m0
- pmulhrsw m0, [coeffq+16*0]
- pmulhrsw m1, [coeffq+16*1]
- mova m2, [o(pw_16384)]
- mova m3, [o(pw_1697x16)]
- mova m4, [o(pw_2048)]
- pmulhrsw m0, m2
- pmulhrsw m1, m2
- pmulhrsw m2, m3, m0
- pmulhrsw m3, m1
- paddsw m0, m0
- paddsw m1, m1
- paddsw m0, m2
- paddsw m1, m3
- pmulhrsw m0, m4
- pmulhrsw m4, m1
- punpckhwd m2, m0, m0
- punpcklwd m0, m0
- punpckhwd m6, m4, m4
- punpcklwd m4, m4
- punpckhdq m1, m0, m0
- punpckldq m0, m0
- punpckhdq m3, m2, m2
- punpckldq m2, m2
- punpckhdq m5, m4, m4
- punpckldq m4, m4
- punpckhdq m7, m6, m6
- punpckldq m6, m6
- mova [coeffq+16*4], m4
- TAIL_CALL m(iadst_4x16_internal).end2
-%elifidn %1_%2, identity_dct
- movd m0, [coeffq+32*0]
- punpcklwd m0, [coeffq+32*1]
- movd m1, [coeffq+32*2]
- punpcklwd m1, [coeffq+32*3]
- punpckldq m0, m1
- pmulhrsw m1, m0, [o(pw_1697x8)]
- pcmpeqw m2, m2
- pcmpeqw m2, m0
- pxor m0, m2
- pavgw m0, m1
- pmulhrsw m0, [o(pw_2896x8)]
- pmulhrsw m0, [o(pw_2048)]
- punpcklqdq m0, m0
- pxor m1, m1
- REPX {mova [coeffq+32*x], m1}, 0, 1, 2, 3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16, 8
+%ifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
mova m1, [o(pw_2896x8)]
@@ -1694,21 +1451,6 @@
pmulhrsw m0, [o(pw_16384)]
pmulhrsw m0, m1
pmulhrsw m0, [o(pw_2048)]
-%else ; adst_dct / flipadst_dct
- pshuflw m0, [coeffq], q0000
- punpcklwd m0, m0
-%ifidn %1, adst
- pmulhrsw m0, [o(iadst4_dconly1a)]
-%else ; flipadst
- pmulhrsw m0, [o(iadst4_dconly1b)]
-%endif
- mova m1, [o(pw_16384)]
- mov [coeffq], eobd
- pmulhrsw m0, m1
- psrlw m1, 3 ; pw_2048
- pmulhrsw m0, [o(pw_2896x8)]
- pmulhrsw m0, m1
-%endif
.end:
WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
lea dstq, [dstq+strideq*4]
@@ -1721,10 +1463,10 @@
%endif
%endmacro
-INV_TXFM_4X16_FN dct, dct, 0
-INV_TXFM_4X16_FN dct, identity, 15
+INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(idct_4x8_internal).pass1)]
@@ -1790,7 +1532,7 @@
REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
ret
-INV_TXFM_4X16_FN adst, dct, 0
+INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity
@@ -1858,7 +1600,7 @@
ret
-INV_TXFM_4X16_FN flipadst, dct, 0
+INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity
@@ -1888,7 +1630,7 @@
jmp m(iadst_4x16_internal).end1
-INV_TXFM_4X16_FN identity, dct, 3
+INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
@@ -1964,68 +1706,11 @@
jmp m(iadst_4x16_internal).end2
-%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x4, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
- mova m3, [o(pw_2896x8)]
- pmulhrsw m3, [coeffq]
- mova m0, [o(pw_16384)]
- pmulhrsw m3, m0
- psrlw m0, 3 ; pw_2048
- pmulhrsw m1, m3, [o(pw_1697x8)]
- paddsw m3, m1
- pmulhrsw m3, m0
- punpcklwd m3, m3
- pshufd m0, m3, q0000
- pshufd m1, m3, q1111
- pshufd m2, m3, q2222
- pshufd m3, m3, q3333
- lea tx2q, [dstq+8]
- call m(iadst_8x4_internal).end2
- add coeffq, 16*4
- mov dstq, tx2q
- TAIL_CALL m(iadst_8x4_internal).end2
-%elifidn %1_%2, identity_dct
- mova m4, [o(pw_1697x16)]
- mova m5, [o(pw_16384)]
- mova m6, [o(pw_2896x8)]
- mov r3d, 2
- psrlw m7, m5, 3 ; pw_2048
-.main_loop:
- mova m0, [coeffq+16*0]
- mova m1, [coeffq+16*1]
- punpckhwd m2, m0, m1
- punpcklwd m0, m1
- punpcklwd m0, m2
- mova m1, [coeffq+16*2]
- mova m2, [coeffq+16*3]
- punpckhwd m3, m1, m2
- punpcklwd m1, m2
- punpcklwd m1, m3
- punpcklqdq m0, m1
- pmulhrsw m1, m4, m0
- pmulhrsw m1, m5
- paddsw m0, m1
- pmulhrsw m0, m6
- pmulhrsw m0, m7
-.end:
- pxor m3, m3
- mova [coeffq+16*0], m3
- mova [coeffq+16*1], m3
- mova [coeffq+16*2], m3
- mova [coeffq+16*3], m3
- add coeffq, 16*4
- lea tx2q, [dstq+8]
- WRITE_8X4 0, 0, 0, 0, 1, 2, 3
- mov dstq, tx2q
- dec r3d
- jg .main_loop
- RET
-%else
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4, 8
+%ifidn %1_%2, dct_dct
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
-%ifidn %2, dct
movd m2, [o(pw_16384)]
mov [coeffq], eobd
mov r2d, 2
@@ -2059,36 +1744,7 @@
jmp tx2q
.end:
RET
-%else ; adst / flipadst
- movd m2, [o(pw_16384)]
- pmulhrsw m0, m2
- pshuflw m0, m0, q0000
- punpcklwd m0, m0
- mov [coeffq], eobd
- pmulhrsw m2, m0, [o(iadst4_dconly2b)]
- pmulhrsw m0, [o(iadst4_dconly2a)]
- mova m1, [o(pw_2048)]
- pmulhrsw m0, m1
- pmulhrsw m2, m1
-%ifidn %2, adst
- punpckhqdq m1, m0, m0
- punpcklqdq m0, m0
- punpckhqdq m3, m2, m2
- punpcklqdq m2, m2
-%else ; flipadst
- mova m3, m0
- punpckhqdq m0, m2, m2
- punpcklqdq m1, m2, m2
- punpckhqdq m2, m3, m3
- punpcklqdq m3, m3
%endif
- lea tx2q, [dstq+8]
- call m(iadst_8x4_internal).end3
- mov dstq, tx2q
- TAIL_CALL m(iadst_8x4_internal).end3
-%endif
-%endif
-%endif
%endmacro
%macro LOAD_7ROWS 2 ;src, stride
@@ -2144,10 +1800,10 @@
punpcklqdq m%1, m%6 ;low: t8a high: t9
%endmacro
-INV_TXFM_16X4_FN dct, dct, 0
-INV_TXFM_16X4_FN dct, adst, 0
-INV_TXFM_16X4_FN dct, flipadst, 0
-INV_TXFM_16X4_FN dct, identity, 3
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16
@@ -2464,7 +2120,7 @@
jmp m(idct_16x4_internal).pass2_end
-INV_TXFM_16X4_FN identity, dct, 15
+INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
@@ -2537,8 +2193,8 @@
mova [%1+%2*7], m7
%endmacro
-%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*16
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16, 8, 16*16
%ifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
@@ -2556,78 +2212,13 @@
jmp m(inv_txfm_add_dct_dct_8x8).loop
.end:
RET
-%elifidn %1_%2, dct_identity
- mov r3d, 2
-.loop:
- mova m0, [o(pw_2896x8)]
- pmulhrsw m7, m0, [coeffq]
- mova m1, [o(pw_16384)]
- pxor m2, m2
- mova [coeffq], m2
- pmulhrsw m7, m0
- pmulhrsw m7, m1
- psrlw m1, 3 ; pw_2048
- pmulhrsw m0, m7, [o(pw_1697x16)]
- paddsw m7, m7
- paddsw m7, m0
- pmulhrsw m7, m1
- punpcklwd m0, m7, m7
- punpckhwd m7, m7
- pshufd m3, m0, q3333
- pshufd m2, m0, q2222
- pshufd m1, m0, q1111
- pshufd m0, m0, q0000
- call m(iadst_8x4_internal).end3
- pshufd m3, m7, q3333
- pshufd m2, m7, q2222
- pshufd m1, m7, q1111
- pshufd m0, m7, q0000
- lea dstq, [dstq+strideq*2]
- call m(iadst_8x4_internal).end3
-
- add coeffq, 16
- lea dstq, [dstq+strideq*2]
- dec r3d
- jg .loop
- RET
-%elifidn %1_%2, identity_dct
- movd m0, [coeffq+32*0]
- punpcklwd m0, [coeffq+32*1]
- movd m2, [coeffq+32*2]
- punpcklwd m2, [coeffq+32*3]
- add coeffq, 32*4
- movd m1, [coeffq+32*0]
- punpcklwd m1, [coeffq+32*1]
- movd m3, [coeffq+32*2]
- punpcklwd m3, [coeffq+32*3]
- mova m4, [o(pw_2896x8)]
- xor eobd, eobd
- mov [coeffq-32*4], eobd
- mov [coeffq-32*3], eobd
- mov [coeffq-32*2], eobd
- mov [coeffq-32*1], eobd
- punpckldq m0, m2
- punpckldq m1, m3
- punpcklqdq m0, m1
- pmulhrsw m0, m4
- pmulhrsw m0, m4
- pmulhrsw m0, [o(pw_2048)]
- mov [coeffq+32*0], eobd
- mov [coeffq+32*1], eobd
- mov [coeffq+32*2], eobd
- mov [coeffq+32*3], eobd
- mov r3d, 4
- lea tx2q, [o(m(inv_txfm_add_identity_dct_8x16).end)]
- jmp m(inv_txfm_add_dct_dct_8x8).loop
-.end:
- RET
%endif
%endmacro
-INV_TXFM_8X16_FN dct, dct, 0
-INV_TXFM_8X16_FN dct, identity, 15
+INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(idct_8x8_internal).pass1)]
@@ -2790,7 +2381,7 @@
jmp m(iflipadst_8x8_internal).end
-INV_TXFM_8X16_FN identity, dct, 7
+INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity
@@ -2837,8 +2428,8 @@
jmp .end
-%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*16
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8, 8, 16*16
%ifidn %1_%2, dct_dct
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
@@ -2850,83 +2441,13 @@
jmp m(inv_txfm_add_dct_dct_16x4).dconly
.end:
RET
-%elifidn %1_%2, dct_identity
- mova m7, [coeffq]
- mova m0, [o(pw_2896x8)]
- mova m1, [o(pw_16384)]
- pxor m2, m2
- mova [coeffq], m2
- pmulhrsw m7, m0
- pmulhrsw m7, m0
- pmulhrsw m7, m1
- psrlw m1, 2 ; pw_4096
- pmulhrsw m7, m1
- punpcklwd m3, m7, m7
- punpckhwd m7, m7
- pshufd m0, m3, q0000
- pshufd m1, m3, q1111
- pshufd m2, m3, q2222
- pshufd m3, m3, q3333
- lea r3, [dstq+strideq*4]
- lea tx2q, [dstq+8]
- call m(iadst_8x4_internal).end2
- add coeffq, 16*4
- mov dstq, tx2q
- call m(iadst_8x4_internal).end2
- mov dstq, r3
- add coeffq, 16*4
- pshufd m0, m7, q0000
- pshufd m1, m7, q1111
- pshufd m2, m7, q2222
- pshufd m3, m7, q3333
- lea tx2q, [dstq+8]
- call m(iadst_8x4_internal).end2
- add coeffq, 16*4
- mov dstq, tx2q
- TAIL_CALL m(iadst_8x4_internal).end2
-%elifidn %1_%2, identity_dct
- mova m4, [o(pw_2896x8)]
- mova m5, [o(pw_1697x16)]
- mova m6, [o(pw_16384)]
- psrlw m7, m6, 3 ; pw_2048
- mov r3d, 2
-.main_loop:
- mova m0, [coeffq+16*0]
- punpcklwd m0, [coeffq+16*1]
- mova m1, [coeffq+16*2]
- punpcklwd m1, [coeffq+16*3]
- punpckldq m0, m1
- mova m1, [coeffq+16*4]
- punpcklwd m1, [coeffq+16*5]
- mova m2, [coeffq+16*6]
- punpcklwd m2, [coeffq+16*7]
- punpckldq m1, m2
- punpcklqdq m0, m1
- pmulhrsw m0, m4
- pmulhrsw m1, m5, m0
- pmulhrsw m1, m6
- paddsw m0, m1
- pmulhrsw m0, m4
- pmulhrsw m0, m7
-.end:
- pxor m1, m1
- REPX {mova [coeffq+16*x], m1}, 0, 1, 2, 3, 4, 5, 6, 7
- add coeffq, 16*8
- lea tx2q, [dstq+8]
- WRITE_8X4 0, 0, 0, 0, 1, 2, 3
- lea dstq, [dstq+strideq*2]
- WRITE_8X4 0, 0, 0, 0, 1, 2, 3
- mov dstq, tx2q
- dec r3d
- jg .main_loop
- RET
%endif
%endmacro
-INV_TXFM_16X8_FN dct, dct, 0
-INV_TXFM_16X8_FN dct, identity, 7
+INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*0, 32, 1
@@ -3382,7 +2903,7 @@
jmp m(iflipadst_8x8_internal).pass2_main
-INV_TXFM_16X8_FN identity, dct, 15
+INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity
@@ -3463,8 +2984,8 @@
jmp m(iidentity_8x8_internal).end
-%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*16
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16, 8, 16*16
%ifidn %1_%2, dct_dct
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
@@ -3475,104 +2996,13 @@
jmp m(inv_txfm_add_dct_dct_16x4).dconly
.end:
RET
-%elifidn %1_%2, dct_identity
- mova m3, [o(pw_2896x8)]
- pmulhrsw m2, m3, [coeffq+16*0]
- pmulhrsw m3, [coeffq+16*1]
- mova m0, [o(pw_8192)]
- mova m1, [o(pw_1697x16)]
- pshuflw m4, [o(deint_shuf)], q0000 ;pb_0_1
- punpcklwd m4, m4
- pcmpeqb m5, m5
- pxor m6, m6
- mova [coeffq+16*0], m6
- mova [coeffq+16*1], m6
- paddb m5, m5 ;pb_m2
- pmulhrsw m2, m0
- pmulhrsw m3, m0
- psrlw m0, 2 ;pw_2048
- pmulhrsw m7, m1, m2
- pmulhrsw m1, m3
- paddsw m2, m2
- paddsw m3, m3
- paddsw m2, m7
- paddsw m3, m1
- pmulhrsw m2, m0
- pmulhrsw m3, m0
- mov r3d, 8
-.loop:
- mova m1, [dstq]
- pshufb m0, m2, m4
- punpckhbw m7, m1, m6
- punpcklbw m1, m6
- paddw m7, m0
- paddw m1, m0
- packuswb m1, m7
- mova [dstq], m1
- mova m1, [dstq+strideq*8]
- pshufb m0, m3, m4
- psubb m4, m5 ; += 2
- punpckhbw m7, m1, m6
- punpcklbw m1, m6
- paddw m7, m0
- paddw m1, m0
- packuswb m1, m7
- mova [dstq+strideq*8], m1
- add dstq, strideq
- dec r3d
- jg .loop
- RET
-%elifidn %1_%2, identity_dct
- mova m4, [o(pw_1697x16)]
- mova m5, [o(pw_2896x8)]
- mova m6, [o(pw_2048)]
- xor eobd, eobd
- lea tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end)]
- lea r3, [dstq+8]
- mov [rsp+16*0], r3
-.main:
- movd m0, [coeffq+32*0]
- punpcklwd m0, [coeffq+32*1]
- movd m1, [coeffq+32*2]
- punpcklwd m1, [coeffq+32*3]
- add coeffq, 32*4
- punpckldq m0, m1
- movd m1, [coeffq+32*0]
- punpcklwd m1, [coeffq+32*1]
- movd m2, [coeffq+32*2]
- punpcklwd m2, [coeffq+32*3]
- xor eobd, eobd
- mov [coeffq-32*4], eobd
- mov [coeffq-32*3], eobd
- mov [coeffq-32*2], eobd
- mov [coeffq-32*1], eobd
- punpckldq m1, m2
- punpcklqdq m0, m1
- pmulhrsw m1, m4, m0
- psraw m1, 1
- pavgw m0, m1
- pmulhrsw m0, m5
- pmulhrsw m0, m6
- mov [coeffq+32*0], eobd
- mov [coeffq+32*1], eobd
- mov [coeffq+32*2], eobd
- mov [coeffq+32*3], eobd
- mov r3d, 4
- jmp m(inv_txfm_add_dct_dct_8x8).loop
-.end:
- lea tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end1)]
- add coeffq, 32*4
- mov dstq, [rsp+16*0]
- jmp .main
-.end1:
- RET
%endif
%endmacro
-INV_TXFM_16X16_FN dct, dct, 0
-INV_TXFM_16X16_FN dct, identity, 15
+INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*1, 64
@@ -3865,7 +3295,7 @@
pavgw m%1, m%2
%endmacro
-INV_TXFM_16X16_FN identity, dct, 15
+INV_TXFM_16X16_FN identity, dct
INV_TXFM_16X16_FN identity, identity
cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2