ref: d67e3476c91d408cab8e5d7fb3f29120dadb15ed
parent: 18ef9556b71e3b6b839c35ae614ef0bb5b6a2179
author: Victorien Le Couviour--Tuffet <[email protected]>
date: Mon Mar 4 10:26:52 EST 2019
x86: add SSSE3 cdef dir implementation --------------------- x86_64: ------------------------------------------ cdef_dir_8bpc_c: 1023.1 cdef_dir_8bpc_ssse3: 110.3 cdef_dir_8bpc_avx2: 71.1 ------------------------------------------ --------------------- x86_32: ------------------------------------------ cdef_dir_8bpc_c: 1074.8 cdef_dir_8bpc_ssse3: 120.6 ------------------------------------------ Thanks to Ronald for the AVX2 XMM version which was a very good starting point.
--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -38,6 +38,7 @@
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
+decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -45,6 +46,7 @@
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH ==8
+ c->dir = dav1d_cdef_dir_ssse3;
c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
--- a/src/x86/cdef_ssse3.asm
+++ b/src/x86/cdef_ssse3.asm
@@ -29,10 +29,17 @@
SECTION_RODATA 16
+%if ARCH_X86_32
pb_0: times 16 db 0
+%endif
+pw_128: times 8 dw 128 ; per-pixel bias subtracted by cdef_dir after widening bytes to words
pw_256: times 8 dw 256
pw_2048: times 8 dw 2048
pw_0x7FFF: times 8 dw 0x7FFF
+pd_0to7: dd 0, 4, 2, 6, 1, 5, 3, 7 ; direction ids in the lane order of cdef_dir's two cost vectors: [0,4,2,6] then [1,5,3,7]
+div_table: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105 ; +0/+16: multipliers for the two diagonal costs; each value is stored twice so both 16-bit halves of a dword see the same factor in MULLD
+ dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105 ; +32: multipliers for the paired entries of the odd-direction costs, +48: constant 105 (also used for the h/v costs)
+shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 ; pshufb mask: reverses words 0-6 and leaves word 7 in place
tap_table: dw 4, 2, 3, 3, 2, 1
db -1 * 16 + 1, -2 * 16 + 2
db 0 * 16 + 1, -1 * 16 + 2
@@ -711,3 +718,589 @@
cdef_filter_fn 8, 8, 32
cdef_filter_fn 4, 8, 32
cdef_filter_fn 4, 4, 32
+
+%macro MULLD 2-3 0 ; 32-bit lane multiply built from 16-bit multiplies: %1 = packed dwords (in/out), %2 = multipliers with the same 16-bit factor in both halves of each dword, %3 = is_constant (not referenced in the body)
+ %if ARCH_X86_32 ; only m0-m7 exist on x86-32: the scratch register becomes m1, so every expansion clobbers m1 there
+ %define m15 m1
+ %endif
+ pmulhuw m15, %1, %2 ; upper 16 bits of each unsigned 16x16 product (scratch register m15 is clobbered)
+ pmullw %1, %2 ; lower 16 bits of each 16x16 product
+ pslld m15, 16 ; carry out of the low-half product moves up to bits 16-31
+ paddd %1, m15 ; low 32 bits of dword * factor
+%endmacro
+
+%if ARCH_X86_64
+cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3 ; reads 8 rows of 8 bytes from src, leaves the index of the highest-cost direction in eax and stores (that cost - cost at idx^4) >> 10 to [var]
+ lea stride3q, [strideq*3]
+ movq m1, [srcq+strideq*0] ; two 8-byte rows per register: m1 = rows 0/1, m3 = rows 2/3
+ movhps m1, [srcq+strideq*1]
+ movq m3, [srcq+strideq*2]
+ movhps m3, [srcq+stride3q]
+ lea srcq, [srcq+strideq*4]
+ movq m5, [srcq+strideq*0] ; m5 = rows 4/5, m7 = rows 6/7
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+stride3q]
+
+ pxor m8, m8 ; zero: psadbw against it yields plain byte sums, and it is the high byte for the unpacks below
+ psadbw m0, m1, m8 ; one byte sum per 64-bit half, i.e. one per row
+ psadbw m2, m3, m8
+ psadbw m4, m5, m8
+ psadbw m6, m7, m8
+ packssdw m0, m2
+ packssdw m4, m6
+ packssdw m0, m4 ; the eight row sums as words, rows 0-7 in order
+ SWAP m0, m9 ; the row sums are referred to as m9 from here on; m0 is reused below
+
+ punpcklbw m0, m1, m8 ; zero-extend to words: m0-m7 = rows 0-7
+ punpckhbw m1, m8
+ punpcklbw m2, m3, m8
+ punpckhbw m3, m8
+ punpcklbw m4, m5, m8
+ punpckhbw m5, m8
+ punpcklbw m6, m7, m8
+ punpckhbw m7, m8
+
+ mova m8, [pw_128]
+ psubw m0, m8 ; remove the 128 bias from every pixel
+ psubw m1, m8
+ psubw m2, m8
+ psubw m3, m8
+ psubw m4, m8
+ psubw m5, m8
+ psubw m6, m8
+ psubw m7, m8
+ psllw m8, 3 ; 8 * 128 = bias contained in a sum of eight pixels
+ psubw m9, m8 ; partial_sum_hv[0]
+
+ paddw m8, m0, m1 ; add the eight rows lane-wise: one sum per column
+ paddw m10, m2, m3
+ paddw m8, m4
+ paddw m10, m5
+ paddw m8, m6
+ paddw m10, m7
+ paddw m8, m10 ; partial_sum_hv[1]
+
+ pmaddwd m8, m8 ; squares, adjacent pairs summed into dwords
+ pmaddwd m9, m9
+ phaddd m9, m8 ; low two dwords from hv[0], high two dwords from hv[1]
+ SWAP m8, m9 ; result is now named m8, m9 is free for the diagonal sums
+ MULLD m8, [div_table+48], 1 ; * 105
+
+ pslldq m9, m1, 2 ; row y is shifted left by y words; words pushed past lane 7 are collected in m10
+ psrldq m10, m1, 14
+ pslldq m11, m2, 4
+ psrldq m12, m2, 12
+ pslldq m13, m3, 6
+ psrldq m14, m3, 10
+ paddw m9, m0
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14 ; partial_sum_diag[0] top/right half
+ paddw m9, m11 ; partial_sum_diag[0] top/left half
+ pslldq m11, m4, 8
+ psrldq m12, m4, 8
+ pslldq m13, m5, 10
+ psrldq m14, m5, 6
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m6, 12
+ psrldq m12, m6, 4
+ pslldq m13, m7, 14
+ psrldq m14, m7, 2
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13 ; partial_sum_diag[0][0-7]
+ paddw m10, m14 ; partial_sum_diag[0][8-14,zero]
+ pshufb m10, [shufw_6543210x] ; reverse words 0-6 so that entry n lines up with entry 14-n
+ punpckhwd m11, m9, m10 ; (n, 14-n) word pairs for n = 4-7 (entry 7 is paired with the zero word)
+ punpcklwd m9, m10 ; (n, 14-n) word pairs for n = 0-3
+ pmaddwd m11, m11 ; sum of the two squares of each pair
+ pmaddwd m9, m9
+ MULLD m11, [div_table+16] ; * 168, 140, 120, 105
+ MULLD m9, [div_table+0] ; * 840, 420, 280, 210
+ paddd m9, m11 ; cost[0a-d]
+
+ pslldq m10, m0, 14 ; other diagonal: row y is shifted left by 7-y words, overflow collected in m11
+ psrldq m11, m0, 2
+ pslldq m12, m1, 12
+ psrldq m13, m1, 4
+ pslldq m14, m2, 10
+ psrldq m15, m2, 6
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15
+ pslldq m12, m3, 8
+ psrldq m13, m3, 8
+ pslldq m14, m4, 6
+ psrldq m15, m4, 10
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15
+ pslldq m12, m5, 4
+ psrldq m13, m5, 12
+ pslldq m14, m6, 2
+ psrldq m15, m6, 14
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m15 ; partial_sum_diag[1][8-14,zero]
+ paddw m10, m7 ; partial_sum_diag[1][0-7]
+ pshufb m11, [shufw_6543210x] ; same pairing and weights as for cost[0]
+ punpckhwd m12, m10, m11
+ punpcklwd m10, m11
+ pmaddwd m12, m12
+ pmaddwd m10, m10
+ MULLD m12, [div_table+16]
+ MULLD m10, [div_table+0]
+ paddd m10, m12 ; cost[4a-d]
+ phaddd m9, m10 ; cost[0a/b,4a/b]
+
+ paddw m10, m0, m1 ; sums of vertically adjacent rows: 0+1, 2+3, 4+5, 6+7
+ paddw m11, m2, m3
+ paddw m12, m4, m5
+ paddw m13, m6, m7
+ phaddw m0, m4 ; sums of horizontally adjacent pixels: low half from row y, high half from row y+4
+ phaddw m1, m5
+ phaddw m2, m6
+ phaddw m3, m7
+
+ ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
+ pslldq m4, m11, 2 ; row pair k is shifted left by k words: index (y >> 1) + x
+ psrldq m5, m11, 14
+ pslldq m6, m12, 4
+ psrldq m7, m12, 12
+ pslldq m14, m13, 6
+ psrldq m15, m13, 10
+ paddw m4, m10
+ paddw m5, m7
+ paddw m4, m6
+ paddw m5, m15 ; partial_sum_alt[3] right
+ paddw m4, m14 ; partial_sum_alt[3] left
+ pshuflw m5, m5, q3012 ; reverse the three valid words (entries 8-10) so that entry n lines up with entry 10-n
+ punpckhwd m6, m4, m5 ; entries 4-7, each paired with a zero word
+ punpcklwd m4, m5 ; (n, 10-n) pairs for n = 0-2, entry 3 paired with zero
+ pmaddwd m6, m6
+ pmaddwd m4, m4
+ MULLD m6, [div_table+48], 1 ; * 105
+ MULLD m4, [div_table+32] ; * 420, 210, 140, 105
+ paddd m4, m6 ; cost[7a-d]
+
+ pslldq m5, m10, 6 ; row pair k is shifted left by 3-k words
+ psrldq m6, m10, 10
+ pslldq m7, m11, 4
+ psrldq m10, m11, 12 ; m10-m12 are overwritten from here on: this is the last block that needs the vertical sums
+ pslldq m11, m12, 2
+ psrldq m12, 14
+ paddw m5, m7
+ paddw m6, m10
+ paddw m5, m11
+ paddw m6, m12 ; entries 8-10
+ paddw m5, m13 ; entries 0-7
+ pshuflw m6, m6, q3012
+ punpckhwd m7, m5, m6
+ punpcklwd m5, m6
+ pmaddwd m7, m7
+ pmaddwd m5, m5
+ MULLD m7, [div_table+48], 1
+ MULLD m5, [div_table+32]
+ paddd m5, m7 ; cost[5a-d]
+
+ pslldq m6, m1, 2 ; horizontal pair sums of row y shifted left by y words: index y + (x >> 1)
+ psrldq m7, m1, 14
+ pslldq m10, m2, 4
+ psrldq m11, m2, 12
+ pslldq m12, m3, 6
+ psrldq m13, m3, 10
+ paddw m6, m0
+ paddw m7, m11
+ paddw m6, m10
+ paddw m7, m13 ; partial_sum_alt[0] right
+ paddw m6, m12 ; partial_sum_alt[0] left
+ pshuflw m7, m7, q3012
+ punpckhwd m10, m6, m7
+ punpcklwd m6, m7
+ pmaddwd m10, m10
+ pmaddwd m6, m6
+ MULLD m10, [div_table+48], 1
+ MULLD m6, [div_table+32]
+ paddd m6, m10 ; cost[1a-d]
+
+ pshufd m0, m0, q1032 ; swap the 64-bit halves: the sums of rows 4-7 move to the low half
+ pshufd m1, m1, q1032
+ pshufd m2, m2, q1032
+ pshufd m3, m3, q1032
+
+ pslldq m10, m0, 6 ; register k is shifted left by 3-k words
+ psrldq m11, m0, 10
+ pslldq m12, m1, 4
+ psrldq m13, m1, 12
+ pslldq m14, m2, 2
+ psrldq m2, 14
+ paddw m10, m12
+ paddw m11, m13
+ paddw m10, m14
+ paddw m11, m2 ; entries 8-10
+ paddw m10, m3 ; entries 0-7
+ pshuflw m11, m11, q3012
+ punpckhwd m12, m10, m11
+ punpcklwd m10, m11
+ pmaddwd m12, m12
+ pmaddwd m10, m10
+ MULLD m12, [div_table+48], 1
+ MULLD m10, [div_table+32]
+ paddd m10, m12 ; cost[3a-d]
+
+ phaddd m0, m9, m8 ; cost[0,4,2,6]
+ phaddd m6, m5 ; cost[1a/b,5a/b]
+ phaddd m10, m4 ; cost[3a/b,7a/b]
+ phaddd m1, m6, m10 ; cost[1,5,3,7]
+
+ pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
+ pand m3, m2, m1
+ pandn m4, m2, m0
+ por m3, m4 ; higher 4 values
+ pshufd m1, m1, q2301 ; swap neighbouring lanes so each lane holds the cost of its direction ^ 4
+ pshufd m0, m0, q2301
+ pand m1, m2, m1
+ pandn m4, m2, m0
+ por m0, m4, m1 ; 4 values at idx^4 offset
+ pand m14, m2, [pd_0to7+16] ; direction id of each winner: odd ids where the mask is set ...
+ pandn m15, m2, [pd_0to7] ; ... even ids elsewhere
+ por m15, m14
+
+ punpckhqdq m4, m3, m0 ; m4 = winners of lanes 2-3 with their complements in the upper half
+ punpcklqdq m3, m0 ; m3 = winners of lanes 0-1 with their complements in the upper half
+ pcmpgtd m5, m4, m3 ; [2or3-6or7] > [0or1/4or5]
+ punpcklqdq m5, m5 ; apply the winner comparison to the complement lanes as well
+ pand m6, m5, m4
+ pandn m7, m5, m3
+ por m6, m7 ; { highest 2 values, complements at idx^4 }
+ movhlps m14, m15 ; ids of lanes 2-3 moved down next to the ids of lanes 0-1
+ pand m14, m5, m14
+ pandn m13, m5, m15
+ por m15, m13, m14 ; ids of the two remaining candidates in lanes 0-1
+
+ pshufd m7, m6, q3311 ; lane 0 gets candidate 1, lane 2 gets the complement of candidate 1
+ pcmpgtd m8, m7, m6 ; [4or5or6or7] > [0or1or2or3]
+ punpcklqdq m8, m8 ; reuse the lane 0 decision for the complement in lane 2
+ pand m9, m8, m7
+ pandn m10, m8, m6
+ por m9, m10 ; max
+ movhlps m10, m9 ; complement at idx^4
+ psubd m9, m10
+ psrld m9, 10
+ movd [varq], m9 ; *var = (highest cost - cost at idx^4) >> 10
+ pshufd m14, m15, q1111
+ pand m14, m8, m14
+ pandn m13, m8, m15
+ por m15, m13, m14
+ movd eax, m15 ; direction id belonging to the highest cost
+%else
+cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3 ; same algorithm with m0-m7 only; the 96 bytes of stack are six 16-byte spill slots at [esp+0x00..0x50]
+ %define PIC_reg r4
+ LEA PIC_reg, PIC_base_offset
+
+ pxor m0, m0
+ mova m1, [PIC_sym(pw_128)]
+
+ lea stride3q, [strideq*3]
+ movq m5, [srcq+strideq*0] ; m5 = rows 0/1, m7 = rows 2/3
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+stride3q]
+ psadbw m2, m5, m0 ; byte sums of rows 0-3
+ psadbw m3, m7, m0
+ packssdw m2, m3
+ punpcklbw m4, m5, m0 ; zero-extend to words: m4-m7 = rows 0-3
+ punpckhbw m5, m0
+ punpcklbw m6, m7, m0
+ punpckhbw m7, m0
+ psubw m4, m1 ; remove the 128 bias
+ psubw m5, m1
+ psubw m6, m1
+ psubw m7, m1
+
+ mova [esp+0x00], m4 ; spill row 0
+ mova [esp+0x10], m5 ; spill row 1
+ mova [esp+0x20], m6 ; spill row 2
+ mova [esp+0x50], m7 ; spill row 3
+
+ lea srcq, [srcq+strideq*4]
+ movq m5, [srcq+strideq*0] ; m5 = rows 4/5, m7 = rows 6/7
+ movhps m5, [srcq+strideq*1]
+ movq m7, [srcq+strideq*2]
+ movhps m7, [srcq+stride3q]
+ psadbw m3, m5, m0 ; byte sums of rows 4-7
+ psadbw m0, m7, m0 ; consumes the zero register, which is recreated two lines below
+ packssdw m3, m0
+ pxor m0, m0
+ packssdw m2, m3 ; the eight row sums as words
+ punpcklbw m4, m5, m0 ; m4-m7 = rows 4-7 as words
+ punpckhbw m5, m0
+ punpcklbw m6, m7, m0
+ punpckhbw m7, m0
+ psubw m4, m1
+ psubw m5, m1
+ psubw m6, m1
+ psubw m7, m1
+
+ psllw m1, 3 ; 8 * 128 = bias contained in a sum of eight pixels
+ psubw m2, m1 ; partial_sum_hv[0]
+ pmaddwd m2, m2
+
+ mova m3, [esp+0x50] ; row 3
+ mova m0, [esp+0x00]
+ paddw m0, [esp+0x10]
+ paddw m1, m3, [esp+0x20]
+ paddw m0, m4
+ paddw m1, m5
+ paddw m0, m6
+ paddw m1, m7
+ paddw m0, m1 ; partial_sum_hv[1]
+ pmaddwd m0, m0
+
+ phaddd m2, m0 ; low two dwords from hv[0], high two dwords from hv[1]
+ MULLD m2, [PIC_sym(div_table)+48], 1 ; * 105 (MULLD uses m1 as scratch in this build)
+ mova [esp+0x30], m2 ; spill the h/v costs
+
+ mova m1, [esp+0x10] ; row 1
+ pslldq m0, m1, 2 ; row y is shifted left by y words; overflow is collected in m1
+ psrldq m1, 14
+ paddw m0, [esp+0x00]
+ pslldq m2, m3, 6 ; row 3
+ psrldq m3, 10
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x20] ; row 2
+ pslldq m2, m3, 4
+ psrldq m3, 12
+ paddw m0, m2 ; partial_sum_diag[0] top/left half
+ paddw m1, m3 ; partial_sum_diag[0] top/right half
+ pslldq m2, m4, 8
+ psrldq m3, m4, 8
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m5, 10
+ psrldq m3, m5, 6
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m6, 12
+ psrldq m3, m6, 4
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m7, 14
+ psrldq m3, m7, 2
+ paddw m0, m2 ; partial_sum_diag[0][0-7]
+ paddw m1, m3 ; partial_sum_diag[0][8-14,zero]
+ mova m3, [esp+0x50] ; reload row 3 for the next block
+ pshufb m1, [PIC_sym(shufw_6543210x)] ; reverse words 0-6 so that entry n lines up with entry 14-n
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [PIC_sym(div_table)+16]
+ MULLD m0, [PIC_sym(div_table)+0]
+ paddd m0, m2 ; cost[0a-d]
+ mova [esp+0x40], m0 ; spill cost[0a-d]
+
+ mova m1, [esp+0x00] ; row 0
+ pslldq m0, m1, 14 ; other diagonal: row y is shifted left by 7-y words
+ psrldq m1, 2
+ paddw m0, m7
+ pslldq m2, m3, 8 ; row 3
+ psrldq m3, 8
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x20] ; row 2
+ pslldq m2, m3, 10
+ psrldq m3, 6
+ paddw m0, m2
+ paddw m1, m3
+ mova m3, [esp+0x10] ; row 1
+ pslldq m2, m3, 12
+ psrldq m3, 4
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m4, 6
+ psrldq m3, m4, 10
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m5, 4
+ psrldq m3, m5, 12
+ paddw m0, m2
+ paddw m1, m3
+ pslldq m2, m6, 2
+ psrldq m3, m6, 14
+ paddw m0, m2 ; partial_sum_diag[1][0-7]
+ paddw m1, m3 ; partial_sum_diag[1][8-14,zero]
+ mova m3, [esp+0x50] ; reload row 3
+ pshufb m1, [PIC_sym(shufw_6543210x)]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [PIC_sym(div_table)+16]
+ MULLD m0, [PIC_sym(div_table)+0]
+ paddd m0, m2 ; cost[4a-d]
+ phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
+ phaddd m1, [esp+0x30] ; cost[0,4,2,6]
+ mova [esp+0x30], m1 ; slot 0x30 now holds cost[0,4,2,6]
+
+ phaddw m0, [esp+0x00], m4 ; horizontal pair sums of rows 0 and 4
+ phaddw m1, [esp+0x10], m5 ; horizontal pair sums of rows 1 and 5
+ paddw m4, m5 ; rows 4+5
+ mova m2, [esp+0x20]
+ paddw m5, m2, m3 ; rows 2+3
+ phaddw m2, m6 ; horizontal pair sums of rows 2 and 6
+ paddw m6, m7 ; rows 6+7
+ phaddw m3, m7 ; horizontal pair sums of rows 3 and 7 (m3 stays live until the cost[3] block)
+ mova m7, [esp+0x00]
+ paddw m7, [esp+0x10] ; rows 0+1
+ mova [esp+0x00], m0 ; slots 0x00-0x20 now hold the horizontal pair sums instead of rows 0-2
+ mova [esp+0x10], m1
+ mova [esp+0x20], m2
+
+ pslldq m1, m4, 4 ; row pair k is shifted left by k words: index (y >> 1) + x
+ pslldq m2, m6, 6
+ pslldq m0, m5, 2
+ paddw m1, m2
+ paddw m0, m7
+ psrldq m2, m5, 14
+ paddw m0, m1 ; partial_sum_alt[3] left
+ psrldq m1, m4, 12
+ paddw m1, m2
+ psrldq m2, m6, 10
+ paddw m1, m2 ; partial_sum_alt[3] right
+ pshuflw m1, m1, q3012 ; reverse the three valid words (entries 8-10) so that entry n lines up with entry 10-n
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [PIC_sym(div_table)+48], 1
+ MULLD m0, [PIC_sym(div_table)+32]
+ paddd m0, m2 ; cost[7a-d]
+ mova [esp+0x40], m0 ; spill cost[7a-d]
+
+ pslldq m0, m7, 6 ; row pair k is shifted left by 3-k words; the vertical sums are consumed in place
+ psrldq m7, 10
+ pslldq m1, m5, 4
+ psrldq m5, 12
+ pslldq m2, m4, 2
+ psrldq m4, 14
+ paddw m0, m6
+ paddw m7, m5
+ paddw m0, m1
+ paddw m7, m4 ; entries 8-10
+ paddw m0, m2 ; entries 0-7
+ pshuflw m7, m7, q3012
+ punpckhwd m2, m0, m7
+ punpcklwd m0, m7
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [PIC_sym(div_table)+48], 1
+ MULLD m0, [PIC_sym(div_table)+32]
+ paddd m0, m2 ; cost[5a-d]
+ mova [esp+0x50], m0 ; spill cost[5a-d]
+
+ mova m1, [esp+0x10]
+ mova m2, [esp+0x20]
+ pslldq m0, m1, 2 ; horizontal pair sums of row y shifted left by y words: index y + (x >> 1)
+ psrldq m1, 14
+ pslldq m4, m2, 4
+ psrldq m2, 12
+ pslldq m5, m3, 6
+ psrldq m6, m3, 10
+ paddw m0, [esp+0x00]
+ paddw m1, m2
+ paddw m4, m5
+ paddw m1, m6 ; partial_sum_alt[0] right
+ paddw m0, m4 ; partial_sum_alt[0] left
+ pshuflw m1, m1, q3012
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmaddwd m2, m2
+ pmaddwd m0, m0
+ MULLD m2, [PIC_sym(div_table)+48], 1
+ MULLD m0, [PIC_sym(div_table)+32]
+ paddd m0, m2 ; cost[1a-d]
+ phaddd m0, [esp+0x50] ; cost[1a/b,5a/b]
+ mova [esp+0x50], m0
+
+ pshufd m0, [esp+0x00], q1032 ; swap the 64-bit halves: the sums of rows 4-7 move to the low half
+ pshufd m1, [esp+0x10], q1032
+ pshufd m2, [esp+0x20], q1032
+ pshufd m3, m3, q1032
+
+ pslldq m4, m0, 6 ; register k is shifted left by 3-k words
+ psrldq m0, 10
+ pslldq m5, m1, 4
+ psrldq m1, 12
+ pslldq m6, m2, 2
+ psrldq m2, 14
+ paddw m4, m3
+ paddw m0, m1
+ paddw m5, m6
+ paddw m0, m2 ; entries 8-10
+ paddw m4, m5 ; entries 0-7
+ pshuflw m0, m0, q3012
+ punpckhwd m2, m4, m0
+ punpcklwd m4, m0
+ pmaddwd m2, m2
+ pmaddwd m4, m4
+ MULLD m2, [PIC_sym(div_table)+48], 1
+ MULLD m4, [PIC_sym(div_table)+32]
+ paddd m4, m2 ; cost[3a-d]
+ phaddd m4, [esp+0x40] ; cost[3a/b,7a/b]
+
+ mova m1, [esp+0x50]
+ mova m0, [esp+0x30] ; cost[0,4,2,6]
+ phaddd m1, m4 ; cost[1,5,3,7]
+
+ pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
+ pand m3, m2, m1
+ pandn m4, m2, m0
+ por m3, m4 ; higher 4 values
+ pshufd m1, m1, q2301 ; swap neighbouring lanes so each lane holds the cost of its direction ^ 4
+ pshufd m0, m0, q2301
+ pand m1, m2, m1
+ pandn m4, m2, m0
+ por m0, m4, m1 ; 4 values at idx^4 offset
+ pand m5, m2, [PIC_sym(pd_0to7)+16] ; direction id of each winner: odd ids where the mask is set ...
+ pandn m6, m2, [PIC_sym(pd_0to7)] ; ... even ids elsewhere
+ por m6, m5
+
+ punpckhqdq m4, m3, m0 ; m4 = winners of lanes 2-3 with their complements in the upper half
+ punpcklqdq m3, m0 ; m3 = winners of lanes 0-1 with their complements in the upper half
+ pcmpgtd m0, m4, m3 ; [2or3-6or7] > [0or1/4or5]
+ punpcklqdq m0, m0 ; apply the winner comparison to the complement lanes as well
+ pand m1, m0, m4
+ pandn m7, m0, m3
+ por m1, m7 ; { highest 2 values, complements at idx^4 }
+ movhlps m5, m6 ; ids of lanes 2-3 moved down next to the ids of lanes 0-1
+ pand m5, m0, m5
+ pandn m3, m0, m6
+ por m6, m3, m5 ; ids of the two remaining candidates in lanes 0-1
+
+ pshufd m7, m1, q3311 ; lane 0 gets candidate 1, lane 2 gets the complement of candidate 1
+ pcmpgtd m2, m7, m1 ; [4or5or6or7] > [0or1or2or3]
+ punpcklqdq m2, m2 ; reuse the lane 0 decision for the complement in lane 2
+ pand m0, m2, m7
+ pandn m7, m2, m1
+ por m0, m7 ; max
+ movhlps m7, m0 ; complement at idx^4
+ psubd m0, m7
+ psrld m0, 10
+ movd [varq], m0 ; *var = (highest cost - cost at idx^4) >> 10
+ pshufd m5, m6, q1111
+ pand m5, m2, m5
+ pandn m3, m2, m6
+ por m6, m3, m5
+ movd eax, m6 ; direction id belonging to the highest cost
+%endif
+
+ RET