ref: 91568b2a57de65aad9f9f73fcc8cb71fdc507e9b
parent: 75e88fab368c21cc0089222e5e08c9a15b369885
author: Victorien Le Couviour--Tuffet <[email protected]>
date: Thu Mar 21 13:11:00 EDT 2019
x86: cdef_dir: optimize best cost finding for SSE

Port Kyle Siefring's AVX2 optimization (65ee1233cf86f03e029d0520f7cc5a3e152d3bbd)
to SSE4.1, and optimize the SSSE3 path as well.

---------------------
x86_64:
------------------------------------------
before: cdef_dir_8bpc_ssse3: 110.3
after:  cdef_dir_8bpc_ssse3: 105.9
new:    cdef_dir_8bpc_sse4:   96.4
------------------------------------------

---------------------
x86_32:
------------------------------------------
before: cdef_dir_8bpc_ssse3: 120.6
after:  cdef_dir_8bpc_ssse3: 110.7
new:    cdef_dir_8bpc_sse4:  106.5
------------------------------------------
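For reference, a minimal C sketch of the new cost reduction (SSE2/SSE4.1
intrinsics; the helper names are hypothetical and not part of this patch):
the SSE4.1 path broadcasts the maximum of the eight 32-bit costs with
pmaxsd, while the SSSE3 path keeps a pcmpgtd/pand/pandn/por select but now
reduces within registers instead of blending pd_0to7 indices.

    #include <emmintrin.h>  /* SSE2 */
    #include <smmintrin.h>  /* SSE4.1: _mm_max_epi32 (pmaxsd) */

    /* Broadcast the maximum of 8 signed 32-bit costs, held as cost[0,4,2,6]
     * and cost[1,3,5,7], to every lane, as the sse4 path does with pmaxsd. */
    static __m128i best_cost_sse4(__m128i c0462, __m128i c1357)
    {
        __m128i m = _mm_max_epi32(c0462, c1357);
        m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2))); /* q1032 */
        m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(2, 3, 0, 1))); /* q2301 */
        return m;
    }

    /* SSSE3 path: emulate pmaxsd with a signed compare and a bitwise select
     * (pcmpgtd/pand/pandn/por), applied in the same three reduction steps. */
    static __m128i max_epi32_ssse3(__m128i a, __m128i b)
    {
        const __m128i gt = _mm_cmpgt_epi32(a, b);
        return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
    }

The direction is then recovered from the lane index of the best cost
(pcmpeqd + packssdw + pmovmskb + tzcnt in the asm), and the idx^4 complement
for the variance is read back from the two cost vectors spilled to the stack.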
--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -41,6 +41,7 @@
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
+decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
@@ -58,6 +59,7 @@
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if BITDEPTH == 8
+ c->dir = dav1d_cdef_dir_sse4;
c->fb[0] = dav1d_cdef_filter_8x8_sse4;
c->fb[1] = dav1d_cdef_filter_4x8_sse4;
c->fb[2] = dav1d_cdef_filter_4x4_sse4;
--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -40,9 +40,10 @@
pw_0x7FFF: times 8 dw 0x7FFF
pw_0x8000: times 8 dw 0x8000
%endif
-pd_0to7: dd 0, 4, 2, 6, 1, 5, 3, 7
-div_table: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
- dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
+div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
+ dd 420, 210, 140, 105, 105, 105, 105, 105
+div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
+ dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
tap_table: ; masks for 8-bit shift emulation
@@ -746,18 +747,22 @@
%endmacro
%macro MULLD 2
- %if ARCH_X86_32
- %define m15 m1
- %endif
+ %if cpuflag(sse4)
+ pmulld %1, %2
+ %else
+ %if ARCH_X86_32
+ %define m15 m1
+ %endif
pmulhuw m15, %1, %2
pmullw %1, %2
pslld m15, 16
paddd %1, m15
+ %endif
%endmacro
-INIT_XMM ssse3
-%if ARCH_X86_64
-cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
+%macro CDEF_DIR 0
+ %if ARCH_X86_64
+cglobal cdef_dir, 3, 5, 16, 32, src, stride, var, stride3
lea stride3q, [strideq*3]
movq m1, [srcq+strideq*0]
movhps m1, [srcq+strideq*1]
@@ -812,7 +817,7 @@
pmaddwd m9, m9
phaddd m9, m8
SWAP m8, m9
- MULLD m8, [div_table+48]
+ MULLD m8, [div_table%+SUFFIX+48]
pslldq m9, m1, 2
psrldq m10, m1, 14
@@ -846,8 +851,8 @@
punpcklwd m9, m10
pmaddwd m11, m11
pmaddwd m9, m9
- MULLD m11, [div_table+16]
- MULLD m9, [div_table+0]
+ MULLD m11, [div_table%+SUFFIX+16]
+ MULLD m9, [div_table%+SUFFIX+0]
paddd m9, m11 ; cost[0a-d]
pslldq m10, m0, 14
@@ -882,8 +887,8 @@
punpcklwd m10, m11
pmaddwd m12, m12
pmaddwd m10, m10
- MULLD m12, [div_table+16]
- MULLD m10, [div_table+0]
+ MULLD m12, [div_table%+SUFFIX+16]
+ MULLD m10, [div_table%+SUFFIX+0]
paddd m10, m12 ; cost[4a-d]
phaddd m9, m10 ; cost[0a/b,4a/b]
@@ -908,14 +913,14 @@
paddw m4, m6
paddw m5, m15 ; partial_sum_alt[3] right
paddw m4, m14 ; partial_sum_alt[3] left
- pshuflw m5, m5, q3012
- punpckhwd m6, m4, m5
- punpcklwd m4, m5
- pmaddwd m6, m6
+ pshuflw m6, m5, q3012
+ punpckhwd m5, m4
+ punpcklwd m4, m6
+ pmaddwd m5, m5
pmaddwd m4, m4
- MULLD m6, [div_table+48]
- MULLD m4, [div_table+32]
- paddd m4, m6 ; cost[7a-d]
+ MULLD m5, [div_table%+SUFFIX+48]
+ MULLD m4, [div_table%+SUFFIX+32]
+ paddd m4, m5 ; cost[7a-d]
pslldq m5, m10, 6
psrldq m6, m10, 10
@@ -928,14 +933,14 @@
paddw m5, m11
paddw m6, m12
paddw m5, m13
- pshuflw m6, m6, q3012
- punpckhwd m7, m5, m6
- punpcklwd m5, m6
- pmaddwd m7, m7
+ pshuflw m7, m6, q3012
+ punpckhwd m6, m5
+ punpcklwd m5, m7
+ pmaddwd m6, m6
pmaddwd m5, m5
- MULLD m7, [div_table+48]
- MULLD m5, [div_table+32]
- paddd m5, m7 ; cost[5a-d]
+ MULLD m6, [div_table%+SUFFIX+48]
+ MULLD m5, [div_table%+SUFFIX+32]
+ paddd m5, m6 ; cost[5a-d]
pslldq m6, m1, 2
psrldq m7, m1, 14
@@ -948,14 +953,14 @@
paddw m6, m10
paddw m7, m13 ; partial_sum_alt[3] right
paddw m6, m12 ; partial_sum_alt[3] left
- pshuflw m7, m7, q3012
- punpckhwd m10, m6, m7
- punpcklwd m6, m7
- pmaddwd m10, m10
+ pshuflw m10, m7, q3012
+ punpckhwd m7, m6
+ punpcklwd m6, m10
+ pmaddwd m7, m7
pmaddwd m6, m6
- MULLD m10, [div_table+48]
- MULLD m6, [div_table+32]
- paddd m6, m10 ; cost[1a-d]
+ MULLD m7, [div_table%+SUFFIX+48]
+ MULLD m6, [div_table%+SUFFIX+32]
+ paddd m6, m7 ; cost[1a-d]
pshufd m0, m0, q1032
pshufd m1, m1, q1032
@@ -973,63 +978,64 @@
paddw m10, m14
paddw m11, m2
paddw m10, m3
- pshuflw m11, m11, q3012
- punpckhwd m12, m10, m11
- punpcklwd m10, m11
- pmaddwd m12, m12
+ pshuflw m12, m11, q3012
+ punpckhwd m11, m10
+ punpcklwd m10, m12
+ pmaddwd m11, m11
pmaddwd m10, m10
- MULLD m12, [div_table+48]
- MULLD m10, [div_table+32]
- paddd m10, m12 ; cost[3a-d]
+ MULLD m11, [div_table%+SUFFIX+48]
+ MULLD m10, [div_table%+SUFFIX+32]
+ paddd m10, m11 ; cost[3a-d]
- phaddd m0, m9, m8 ; cost[0,4,2,6]
- phaddd m6, m5
- phaddd m10, m4
- phaddd m1, m6, m10 ; cost[1,5,3,7]
+ phaddd m9, m8 ; cost[0,4,2,6]
+ phaddd m6, m10
+ phaddd m5, m4
+ phaddd m6, m5 ; cost[1,3,5,7]
+ pshufd m4, m9, q3120
- pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
- pand m3, m2, m1
- pandn m4, m2, m0
- por m3, m4 ; higher 4 values
- pshufd m1, m1, q2301
- pshufd m0, m0, q2301
- pand m1, m2, m1
- pandn m4, m2, m0
- por m0, m4, m1 ; 4 values at idx^4 offset
- pand m14, m2, [pd_0to7+16]
- pandn m15, m2, [pd_0to7]
- por m15, m14
+ ; now find the best cost
+ %if cpuflag(sse4)
+ pmaxsd m9, m6
+ pshufd m0, m9, q1032
+ pmaxsd m0, m9
+ pshufd m1, m0, q2301
+ pmaxsd m0, m1 ; best cost
+ %else
+ pcmpgtd m0, m9, m6
+ pand m9, m0
+ pandn m0, m6
+ por m9, m0
+ pshufd m1, m9, q1032
+ pcmpgtd m0, m9, m1
+ pand m9, m0
+ pandn m0, m1
+ por m9, m0
+ pshufd m1, m9, q2301
+ pcmpgtd m0, m9, m1
+ pand m9, m0
+ pandn m0, m1
+ por m0, m9
+ %endif
- punpckhqdq m4, m3, m0
- punpcklqdq m3, m0
- pcmpgtd m5, m4, m3 ; [2or3-6or7] > [0or1/4or5]
- punpcklqdq m5, m5
- pand m6, m5, m4
- pandn m7, m5, m3
- por m6, m7 ; { highest 2 values, complements at idx^4 }
- movhlps m14, m15
- pand m14, m5, m14
- pandn m13, m5, m15
- por m15, m13, m14
-
- pshufd m7, m6, q3311
- pcmpgtd m8, m7, m6 ; [4or5or6or7] > [0or1or2or3]
- punpcklqdq m8, m8
- pand m9, m8, m7
- pandn m10, m8, m6
- por m9, m10 ; max
- movhlps m10, m9 ; complement at idx^4
- psubd m9, m10
- psrld m9, 10
- movd [varq], m9
- pshufd m14, m15, q1111
- pand m14, m8, m14
- pandn m13, m8, m15
- por m15, m13, m14
- movd eax, m15
-%else
+ ; get direction and variance
+ punpckhdq m1, m4, m6
+ punpckldq m4, m6
+ psubd m2, m0, m1
+ psubd m3, m0, m4
+ mova [rsp+0x00], m2 ; emulate ymm in stack
+ mova [rsp+0x10], m3
+ pcmpeqd m1, m0 ; compute best cost mask
+ pcmpeqd m4, m0
+ packssdw m4, m1
+ pmovmskb eax, m4 ; get byte-idx from mask
+ tzcnt eax, eax
+ mov r1d, [rsp+rax*2] ; get idx^4 complement from emulated ymm
+ shr eax, 1 ; get direction by converting byte-idx to word-idx
+ shr r1d, 10
+ mov [varq], r1d
+ %else
cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
- %define PIC_reg r4
+ %define PIC_reg r4
LEA PIC_reg, PIC_base_offset
pxor m0, m0
@@ -1092,7 +1098,7 @@
pmaddwd m0, m0
phaddd m2, m0
- MULLD m2, [PIC_sym(div_table)+48]
+ MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
mova [esp+0x30], m2
mova m1, [esp+0x10]
@@ -1130,8 +1136,8 @@
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table)+16]
- MULLD m0, [PIC_sym(div_table)+0]
+ MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
+ MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
paddd m0, m2 ; cost[0a-d]
mova [esp+0x40], m0
@@ -1171,8 +1177,8 @@
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table)+16]
- MULLD m0, [PIC_sym(div_table)+0]
+ MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
+ MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
paddd m0, m2 ; cost[4a-d]
phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
phaddd m1, [esp+0x30] ; cost[0,4,2,6]
@@ -1208,8 +1214,8 @@
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table)+48]
- MULLD m0, [PIC_sym(div_table)+32]
+ MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
+ MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
paddd m0, m2 ; cost[7a-d]
mova [esp+0x40], m0
@@ -1224,44 +1230,44 @@
paddw m0, m1
paddw m7, m4
paddw m0, m2
- pshuflw m7, m7, q3012
- punpckhwd m2, m0, m7
- punpcklwd m0, m7
- pmaddwd m2, m2
+ pshuflw m2, m7, q3012
+ punpckhwd m7, m0
+ punpcklwd m0, m2
+ pmaddwd m7, m7
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table)+48]
- MULLD m0, [PIC_sym(div_table)+32]
- paddd m0, m2 ; cost[5a-d]
+ MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
+ MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
+ paddd m0, m7 ; cost[5a-d]
mova [esp+0x50], m0
- mova m1, [esp+0x10]
+ mova m7, [esp+0x10]
mova m2, [esp+0x20]
- pslldq m0, m1, 2
- psrldq m1, 14
+ pslldq m0, m7, 2
+ psrldq m7, 14
pslldq m4, m2, 4
psrldq m2, 12
pslldq m5, m3, 6
psrldq m6, m3, 10
paddw m0, [esp+0x00]
- paddw m1, m2
+ paddw m7, m2
paddw m4, m5
- paddw m1, m6 ; partial_sum_alt[3] right
+ paddw m7, m6 ; partial_sum_alt[3] right
paddw m0, m4 ; partial_sum_alt[3] left
- pshuflw m1, m1, q3012
- punpckhwd m2, m0, m1
- punpcklwd m0, m1
- pmaddwd m2, m2
+ pshuflw m2, m7, q3012
+ punpckhwd m7, m0
+ punpcklwd m0, m2
+ pmaddwd m7, m7
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table)+48]
- MULLD m0, [PIC_sym(div_table)+32]
- paddd m0, m2 ; cost[1a-d]
- phaddd m0, [esp+0x50]
- mova [esp+0x50], m0
+ MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
+ MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
+ paddd m0, m7 ; cost[1a-d]
+ SWAP m0, m4
pshufd m0, [esp+0x00], q1032
pshufd m1, [esp+0x10], q1032
pshufd m2, [esp+0x20], q1032
pshufd m3, m3, q1032
+ mova [esp+0x00], m4
pslldq m4, m0, 6
psrldq m0, 10
@@ -1274,70 +1280,76 @@
paddw m5, m6
paddw m0, m2
paddw m4, m5
- pshuflw m0, m0, q3012
- punpckhwd m2, m4, m0
- punpcklwd m4, m0
- pmaddwd m2, m2
- pmaddwd m4, m4
- MULLD m2, [PIC_sym(div_table)+48]
- MULLD m4, [PIC_sym(div_table)+32]
- paddd m4, m2 ; cost[3a-d]
- phaddd m4, [esp+0x40]
+ pshuflw m2, m0, q3012
+ punpckhwd m0, m4
+ punpcklwd m4, m2
+ pmaddwd m0, m0
+ pmaddwd m4, m4
+ MULLD m0, [PIC_sym(div_table%+SUFFIX)+48]
+ MULLD m4, [PIC_sym(div_table%+SUFFIX)+32]
+ paddd m4, m0 ; cost[3a-d]
- mova m1, [esp+0x50]
+ mova m1, [esp+0x00]
+ mova m2, [esp+0x50]
mova m0, [esp+0x30] ; cost[0,4,2,6]
- phaddd m1, m4 ; cost[1,5,3,7]
+ phaddd m1, m4
+ phaddd m2, [esp+0x40] ; cost[1,3,5,7]
+ phaddd m1, m2
+ pshufd m2, m0, q3120
- pcmpgtd m2, m1, m0 ; [1/5/3/7] > [0/4/2/6]
- pand m3, m2, m1
- pandn m4, m2, m0
- por m3, m4 ; higher 4 values
- pshufd m1, m1, q2301
- pshufd m0, m0, q2301
- pand m1, m2, m1
- pandn m4, m2, m0
- por m0, m4, m1 ; 4 values at idx^4 offset
- pand m5, m2, [PIC_sym(pd_0to7)+16]
- pandn m6, m2, [PIC_sym(pd_0to7)]
- por m6, m5
+ ; now find the best cost
+ %if cpuflag(sse4)
+ pmaxsd m0, m1
+ pshufd m3, m0, q1032
+ pmaxsd m3, m0
+ pshufd m0, m3, q2301
+ pmaxsd m0, m3
+ %else
+ pcmpgtd m3, m0, m1
+ pand m0, m3
+ pandn m3, m1
+ por m0, m3
+ pshufd m4, m0, q1032
+ pcmpgtd m3, m0, m4
+ pand m0, m3
+ pandn m3, m4
+ por m0, m3
+ pshufd m4, m0, q2301
+ pcmpgtd m3, m0, m4
+ pand m0, m3
+ pandn m3, m4
+ por m0, m3
+ %endif
- punpckhqdq m4, m3, m0
- punpcklqdq m3, m0
- pcmpgtd m0, m4, m3 ; [2or3-6or7] > [0or1/4or5]
- punpcklqdq m0, m0
- pand m1, m0, m4
- pandn m7, m0, m3
- por m1, m7 ; { highest 2 values, complements at idx^4 }
- movhlps m5, m6
- pand m5, m0, m5
- pandn m3, m0, m6
- por m6, m3, m5
+ ; get direction and variance
+ punpckhdq m3, m2, m1
+ punpckldq m2, m1
+ psubd m1, m0, m3
+ psubd m4, m0, m2
+ mova [esp+0x00], m1 ; emulate ymm in stack
+ mova [esp+0x10], m4
+ pcmpeqd m3, m0 ; compute best cost mask
+ pcmpeqd m2, m0
+ packssdw m2, m3
+ pmovmskb eax, m2 ; get byte-idx from mask
+ tzcnt eax, eax
+ mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm
+ shr eax, 1 ; get direction by converting byte-idx to word-idx
+ shr r1d, 10
+ mov [vard], r1d
+ %endif
- pshufd m7, m1, q3311
- pcmpgtd m2, m7, m1 ; [4or5or6or7] > [0or1or2or3]
- punpcklqdq m2, m2
- pand m0, m2, m7
- pandn m7, m2, m1
- por m0, m7 ; max
- movhlps m7, m0 ; complement at idx^4
- psubd m0, m7
- psrld m0, 10
- movd [varq], m0
- pshufd m5, m6, q1111
- pand m5, m2, m5
- pandn m3, m2, m6
- por m6, m3, m5
- movd eax, m6
-%endif
-
RET
+%endmacro
INIT_XMM sse4
CDEF_FILTER 8, 8, 32
CDEF_FILTER 4, 8, 32
CDEF_FILTER 4, 4, 32
+CDEF_DIR
INIT_XMM ssse3
CDEF_FILTER 8, 8, 32
CDEF_FILTER 4, 8, 32
CDEF_FILTER 4, 4, 32
+CDEF_DIR