ref: 6f2f0188f1efb12614164e356cf1f1027e4cbaaa
parent: c3980e394d32ed832dfd65decde5f210c03b2f27
author: Xuefeng Jiang <[email protected]>
date: Tue Dec 4 10:28:24 EST 2018
Add SSSE3 implementation for dav1d_ipred_h

Cycle times:
intra_pred_h_w4_8bpc_c: 146.6
intra_pred_h_w4_8bpc_ssse3: 30.6
intra_pred_h_w8_8bpc_c: 236.3
intra_pred_h_w8_8bpc_ssse3: 42.2
intra_pred_h_w16_8bpc_c: 446.6
intra_pred_h_w16_8bpc_ssse3: 55.8
intra_pred_h_w32_8bpc_c: 688.2
intra_pred_h_w32_8bpc_ssse3: 85.9
intra_pred_h_w64_8bpc_c: 634.2
intra_pred_h_w64_8bpc_ssse3: 169.2
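
For context: horizontal intra prediction (HOR_PRED) fills each row of the
block with the reconstructed pixel immediately to its left, i.e. row y is
set to topleft[-(1 + y)]. A minimal C sketch of that behavior (hypothetical
illustration code mirroring what the C fallback computes, not copied from
dav1d):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Row y of a w x h block becomes w copies of the pixel directly to
     * its left. The SSSE3 code below replaces this per-row memset with
     * punpcklbw/pshuflw byte broadcasts and wide stores. */
    static void ipred_h_sketch(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *topleft, int w, int h)
    {
        for (int y = 0; y < h; y++) {
            memset(dst, topleft[-(1 + y)], w);
            dst += stride;
        }
    }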
--- a/src/meson.build
+++ b/src/meson.build
@@ -123,6 +123,7 @@
'x86/mc.asm',
'x86/mc_ssse3.asm',
'x86/itx_ssse3.asm',
+ 'x86/ipred_ssse3.asm',
)
# Compile the ASM sources with NASM
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -50,8 +50,16 @@
decl_pal_pred_fn(dav1d_pal_pred_avx2);
+decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
+
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+#if BITDEPTH == 8
+ c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
+#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
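
Note on ordering: the SSSE3 assignment is made before the AVX2 flag check,
so on AVX2-capable CPUs the pointer is installed here and then overwritten
by the faster AVX2 version further down, while pre-SSSE3 CPUs take the
early return and keep the C fallback.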
--- /dev/null
+++ b/src/x86/ipred_ssse3.asm
@@ -0,0 +1,96 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA 16
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
+
+SECTION .text
+
+
+%macro IPRED_SET 4 ; width, store_type, offset, pshuflw_imm8
+ pshuflw m1, m0, %4 ; broadcast one duplicated pixel across the low 8 bytes
+ punpcklqdq m1, m1 ; extend it into all 16 bytes
+ mov%2 [dstq + %3], m1
+%if %1 > 16
+ mov%2 [dstq + 16 + %3], m1
+%endif
+%if %1 > 32
+ mov%2 [dstq + 32 + %3], m1
+ mov%2 [dstq + 48 + %3], m1
+%endif
+%endmacro
+
+%macro IPRED_H 3 ; width, loop label, store_type
+ sub tlq, 4
+ movd m0, [tlq] ; load the 4 left-edge pixels for the next 4 rows
+ punpcklbw m0, m0 ; duplicate each byte into a word
+ IPRED_SET %1, %3, 0, q3333 ; word 3 = tl[-1] -> row 0
+ IPRED_SET %1, %3, strideq, q2222 ; word 2 = tl[-2] -> row 1
+ IPRED_SET %1, %3, strideq*2, q1111 ; word 1 = tl[-3] -> row 2
+ IPRED_SET %1, %3, stride3q, q0000 ; word 0 = tl[-4] -> row 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg %2
+ RET
+%endmacro
+
+INIT_XMM ssse3
+cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
+ lea r5, [ipred_h_ssse3_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+%if ARCH_X86_64
+ movsxd wq, [r5+wq*4]
+%else
+ mov wq, [r5+wq*4]
+%endif
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ IPRED_H 4, .w4, d
+.w8:
+ IPRED_H 8, .w8, q
+.w16:
+ IPRED_H 16, .w16, u
+.w32:
+ IPRED_H 32, .w32, u
+.w64:
+ IPRED_H 64, .w64, u
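
A note on the dispatch: block widths are powers of two, so tzcnt maps
w = 4, 8, 16, 32, 64 to indices 2..6. The table symbol is defined as
(%%table - 2*4) so that index 2 lands on the first entry, and each entry
stores its target label relative to that same biased base, which is why a
single "add wq, r5" after the load yields an absolute jump address. A
hedged C model of the same arithmetic (hypothetical names, GCC/Clang
builtin standing in for tzcnt):

    #include <stdint.h>

    /* base plays the role of r5 = ipred_h_ssse3_table = %%table - 2*4;
     * the first 32-bit entry therefore sits at byte offset 2*4. */
    static const void *dispatch(const char *base, int w)
    {
        int idx = __builtin_ctz(w);   /* tzcnt: 4->2, 8->3, ..., 64->6 */
        /* each entry holds (label - base); adding base back reconstructs
         * the absolute address, matching "movsxd wq, [r5+wq*4]; add wq, r5" */
        return base + ((const int32_t *)base)[idx];
    }

The store side leans on the same broadcast for every width: punpcklbw
duplicates each left-edge pixel into a word, pshuflw fans the chosen word
across the low 8 bytes, and punpcklqdq mirrors them into all 16, so one
register feeds everything from a single movd store (w4) up to four 16-byte
stores per row (w64).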