shithub: dav1d

Download patch

ref: 8c5d34c85613aa8c6ba2f6da4cf8ba6b6e41ef8f
parent: 1703f21fb708f3e75ec8889c2a7592652d1ecfbc
author: Henrik Gramner <[email protected]>
date: Fri Dec 21 21:45:00 EST 2018

Add tail call optimizations in SSSE3 itx

--- a/src/ext/x86/x86inc.asm
+++ b/src/ext/x86/x86inc.asm
@@ -682,7 +682,7 @@
 
 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
 
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
     %if has_epilogue
         call %1
         RET
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -198,9 +198,11 @@
 %macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
 cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
     %undef cmp
+    %define %%p1 m(i%1_%4_internal)
 %if ARCH_X86_32
     LEA                    r5, $$
 %endif
+%if has_epilogue
 %if %3 > 0
     cmp                  eobd, %3
     jle %%end
@@ -209,10 +211,23 @@
     jz %%end
 %endif
     lea                  tx2q, [o(m(i%2_%4_internal).pass2)]
-    call m(i%1_%4_internal)
+    call %%p1
     RET
+%%end:
+%else
+    lea                  tx2q, [o(m(i%2_%4_internal).pass2)]
+%if %3 > 0
+    cmp                  eobd, %3
+    jg %%p1
+%elif %3 == 0
+    test                 eobd, eobd
+    jnz %%p1
+%else
+    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
 ALIGN function_align
 %%end:
+%endif
+%endif
 %endmacro
 
 %macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
@@ -225,8 +240,7 @@
     punpcklwd            m0, m0
     punpckhdq            m1, m0, m0
     punpckldq            m0, m0
-    call m(iadst_4x4_internal).end
-    RET
+    TAIL_CALL m(iadst_4x4_internal).end
 %elifidn %1_%2, identity_dct
     mova                 m1, [coeffq+16*0]
     mova                 m2, [coeffq+16*1]
@@ -238,8 +252,7 @@
     pmulhrsw             m0, [o(pw_5793x4)]
     pmulhrsw             m0, [o(pw_2896x8)]
     mova                 m1, m0
-    call m(iadst_4x4_internal).end
-    RET
+    TAIL_CALL m(iadst_4x4_internal).end
 %elif %3 >= 0
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
@@ -259,13 +272,11 @@
     pmulhrsw             m0, m1
 %endif
     mova                 m1, m0
-    call m(iadst_4x4_internal).end2
-    RET
+    TAIL_CALL m(iadst_4x4_internal).end2
 %else ; adst / flipadst
     pmulhrsw             m1, m0, [o(iadst4_dconly2b)]
     pmulhrsw             m0, [o(iadst4_dconly2a)]
-    call m(i%2_4x4_internal).end2
-    RET
+    TAIL_CALL m(i%2_4x4_internal).end2
 %endif
 %endif
 %endmacro
@@ -624,8 +635,7 @@
     punpckldq            m0, m0
     punpckhdq            m3, m2, m2
     punpckldq            m2, m2
-    call m(iadst_4x8_internal).end3
-    RET
+    TAIL_CALL m(iadst_4x8_internal).end3
 %elifidn %1_%2, identity_dct
     movd                 m0, [coeffq+16*0]
     punpcklwd            m0, [coeffq+16*1]
@@ -642,8 +652,7 @@
     mova                 m1, m0
     mova                 m2, m0
     mova                 m3, m0
-    call m(iadst_4x8_internal).end3
-    RET
+    TAIL_CALL m(iadst_4x8_internal).end3
 %elifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
@@ -656,8 +665,7 @@
     mova                 m1, m0
     mova                 m2, m0
     mova                 m3, m0
-    call m(iadst_4x8_internal).end4
-    RET
+    TAIL_CALL m(iadst_4x8_internal).end4
 %else ; adst_dct / flipadst_dct
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
@@ -674,8 +682,7 @@
     mova                 m1, m0
     mova                 m2, m0
     mova                 m3, m0
-    call m(iadst_4x8_internal).end4
-    RET
+    TAIL_CALL m(iadst_4x8_internal).end4
 %endif
 %endif
 %endmacro
@@ -923,8 +930,7 @@
 %endif
 %endif
 %endif
-    call m(iadst_8x4_internal).end2
-    RET
+    TAIL_CALL m(iadst_8x4_internal).end2
 %endif
 %endmacro