ref: 9ea56386dee2706d94f3c2dac1720bcf4961aaba
parent: 5fa6c44a61fbf946646899d9db24d92cdce478ac
author: Xuefeng Jiang <[email protected]>
date: Thu Dec 27 04:13:07 EST 2018
Add SSSE3 implementations for dav1d_ipred_dc_top, dav1d_ipred_dc_left and dav1d_ipred_dc_128. Cycle times: intra_pred_dc_128_w4_8bpc_c: 905.2 intra_pred_dc_128_w4_8bpc_ssse3: 61.6 intra_pred_dc_128_w8_8bpc_c: 1393.1 intra_pred_dc_128_w8_8bpc_ssse3: 82.3 intra_pred_dc_128_w16_8bpc_c: 2227.4 intra_pred_dc_128_w16_8bpc_ssse3: 119.6 intra_pred_dc_128_w32_8bpc_c: 2696.0 intra_pred_dc_128_w32_8bpc_ssse3: 195.5 intra_pred_dc_128_w64_8bpc_c: 4298.6 intra_pred_dc_128_w64_8bpc_ssse3: 465.1 intra_pred_dc_left_w4_8bpc_c: 974.2 intra_pred_dc_left_w4_8bpc_ssse3: 80.2 intra_pred_dc_left_w8_8bpc_c: 1478.4 intra_pred_dc_left_w8_8bpc_ssse3: 103.7 intra_pred_dc_left_w16_8bpc_c: 2313.0 intra_pred_dc_left_w16_8bpc_ssse3: 159.1 intra_pred_dc_left_w32_8bpc_c: 2835.1 intra_pred_dc_left_w32_8bpc_ssse3: 305.3 intra_pred_dc_left_w64_8bpc_c: 4462.2 intra_pred_dc_left_w64_8bpc_ssse3: 525.5 intra_pred_dc_top_w4_8bpc_c: 949.5 intra_pred_dc_top_w4_8bpc_ssse3: 95.5 intra_pred_dc_top_w8_8bpc_c: 1462.2 intra_pred_dc_top_w8_8bpc_ssse3: 103.1 intra_pred_dc_top_w16_8bpc_c: 2312.5 intra_pred_dc_top_w16_8bpc_ssse3: 146.4 intra_pred_dc_top_w32_8bpc_c: 2895.9 intra_pred_dc_top_w32_8bpc_ssse3: 250.4 intra_pred_dc_top_w64_8bpc_c: 4617.9 intra_pred_dc_top_w64_8bpc_ssse3: 493.3
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -52,6 +52,9 @@
decl_pal_pred_fn(dav1d_pal_pred_avx2);
decl_angular_ipred_fn(dav1d_ipred_dc_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
@@ -61,9 +64,12 @@
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
- c->intra_pred[DC_PRED] = dav1d_ipred_dc_ssse3;
- c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
- c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3;
+ c->intra_pred[DC_PRED] = dav1d_ipred_dc_ssse3;
+ c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_ssse3;
+ c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_ssse3;
+ c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_ssse3;
+ c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
+ c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/ipred_ssse3.asm
+++ b/src/x86/ipred_ssse3.asm
@@ -29,6 +29,9 @@
SECTION_RODATA 16
+pb_128 : times 8 db 128 ; 8 bytes of 128; ipred_dc_128 duplicates them into a full register with movddup
+pd_32768 : times 1 dd 32768 ; 1 shl 15; dc_left/dc_top shift it right by log2(count) to get a pmulhrsw factor
+
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1_%2)
@@ -44,6 +47,7 @@
JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 ; indexed by tzcnt(h) in ipred_dc_left and by tzcnt(w) in ipred_dc_top
SECTION .text
@@ -376,3 +380,93 @@
sub hd, 2
jg .s64
RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 ; DC = rounded mean of the h bytes preceding tl
+ LEA r5, ipred_dc_left_ssse3_table ; r5 = jump table base, also the anchor for the constant load below
+ mov hd, hm ; zero upper half
+ tzcnt r6d, hd ; r6 = log2(h); h is assumed to be a power of two (table covers 4..64)
+ sub tlq, hq ; tl -= h: the h source bytes end right before the original tl
+ tzcnt wd, wm ; w = log2(width), used to pick the splat entry further down
+ movu m0, [tlq] ; first 16 source bytes
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] ; m3 = 32768, addressed relative to r5
+ movd m2, r6d
+ psrld m3, m2 ; m3 = 32768 shifted right by log2(h): the pmulhrsw factor for a divide by h
+ movsxd r6, [r5+r6*4] ; sign-extended 32-bit table entry for this height
+ pcmpeqd m2, m2 ; m2 = all ones, i.e. -1 in every byte/word lane
+ pmaddubsw m0, m2 ; each word = -(sum of one unsigned byte pair)
+ add r6, r5 ; entry + table base = jump target (.h4 .. .h64)
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table ; r5 = base of the splat table
+ movsxd wq, [r5+wq*4]
+ add wq, r5 ; wq = width-specific splat entry; that code is outside this chunk
+ jmp r6
+.h64:
+ movu m1, [tlq+48] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1 ; accumulate negated pair sums of bytes 48..63
+ movu m1, [tlq+32] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1 ; ... and of bytes 32..47
+.h32:
+ movu m1, [tlq+16] ; unaligned when jumping here from dc_top
+ pmaddubsw m1, m2
+ paddw m0, m1 ; ... and of bytes 16..31
+.h16:
+ pshufd m1, m0, q3232 ; bring the upper 8 bytes (words 4-7) down to words 0-3
+ paddw m0, m1 ; words 0-3 now cover all 16 byte positions
+.h8:
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 -- words 2/3 move down to words 0/1
+ paddw m0, m1 ; words 0-1 now hold the whole (negated) sum
+.h4:
+ pmaddwd m0, m2 ; dword 0 = -(word 0 + word 1) = positive sum of all source bytes
+ pmulhrsw m0, m3 ; word 0 = sum scaled by m3 with rounding = sum / count, rounded
+ lea stride3q, [strideq*3] ; r5 (stride3) is no longer needed as table base
+ pxor m1, m1
+ pshufb m0, m1 ; all-zero shuffle mask: broadcast byte 0 (the DC value) to 16 bytes
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0 ; m0-m3 all carry the DC value for the splat code
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 ; DC is the constant 128; tl is never read
+ LEA r5, ipred_dc_splat_ssse3_table ; r5 = splat table base, also the anchor for the constant load
+ tzcnt wd, wm ; w = log2(width), index into the splat table
+ movifnidn hd, hm ; x86inc helper: load h unless it already lives in hd
+ movsxd wq, [r5+wq*4] ; sign-extended 32-bit table entry for this width
+ movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] ; 8 bytes of 128 duplicated into both register halves
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0 ; m0-m3 all hold 128 in every byte
+ add wq, r5 ; entry + table base = jump target
+ lea stride3q, [strideq*3] ; overwrites r5, which is no longer needed as table base
+ jmp wq ; continue in the width-specific splat code (outside this chunk)
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h ; DC = rounded mean of the w bytes following tl
+ LEA r5, ipred_dc_left_ssse3_table ; the summation labels of ipred_dc_left are reused
+ tzcnt wd, wm ; w = log2(width); width is assumed to be a power of two
+ inc tlq ; source bytes start at tl+1
+ movu m0, [tlq] ; first 16 source bytes
+ movifnidn hd, hm ; x86inc helper: load h unless it already lives in hd
+ movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] ; m3 = 32768, addressed relative to r5
+ movd m2, wd
+ psrld m3, m2 ; m3 = 32768 shifted right by log2(w): the pmulhrsw factor for a divide by w
+ movsxd r6, [r5+wq*4] ; table is indexed by log2(w) here rather than log2(h)
+ pcmpeqd m2, m2 ; m2 = all ones, i.e. -1 in every byte/word lane
+ pmaddubsw m0, m2 ; each word = -(sum of one unsigned byte pair)
+ add r6, r5 ; entry + table base = jump target
+ add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table ; r5 = base of the splat table
+ movsxd wq, [r5+wq*4]
+ add wq, r5 ; wq = width-specific splat entry, taken by the shared code's final jmp
+ jmp r6 ; enter ipred_dc_left at the .hN label matching this width
+