ref: 65ee1233cf86f03e029d0520f7cc5a3e152d3bbd
parent: 4380bc331d3bbc61110f6279076ee5861c7efaa9
author: Kyle Siefring <[email protected]>
date: Sun Mar 3 10:47:01 EST 2019
Speed up finding the best cost in avx2 cdef
--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -29,10 +29,10 @@
%if ARCH_X86_64
SECTION_RODATA 32
-pd_04512763: dd 0, 4, 5, 1, 2, 7, 6, 3
+pd_02564713: dd 0, 2, 5, 6, 4, 7, 1, 3
+pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
div_table: dd 840, 420, 280, 210, 168, 140, 120, 105
dd 420, 210, 140, 105
-pd_04261537: dd 0, 4, 2, 6, 1, 5, 3, 7
shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pw_128: times 2 dw 128
@@ -640,54 +640,38 @@
pmulld m5, m13
paddd m5, m6 ; cost1[a-d] | cost3[a-d]
- mova xm0, [pd_04512763+ 0]
- mova xm1, [pd_04512763+ 16]
+ mova xm0, [pd_47130256+ 16]
+ mova m1, [pd_47130256]
phaddd m9, m8
phaddd m5, m4
phaddd m9, m5
- vpermd m0, m9 ; cost[0/4/2/6]
- vpermd m1, m9 ; cost[1/5/3/7]
+ vpermd m0, m9 ; cost[0-3]
+ vpermd m1, m9 ; cost[4-7] | cost[0-3]
- ; now find the best cost, its idx^4 complement, and its idx
- pcmpgtd xm2, xm1, xm0 ; [1/5/3/7] > [0/4/2/6]
- pand xm3, xm2, xm1
- pandn xm4, xm2, xm0
- por xm3, xm4 ; higher 4 values
- pshufd xm1, xm1, q2301
- pshufd xm0, xm0, q2301
- pand xm1, xm2, xm1
- pandn xm0, xm2, xm0
- por xm0, xm1 ; complementary 4 values at idx^4 offset
- pand xm13, xm2, [pd_04261537+16]
- pandn xm14, xm2, [pd_04261537+ 0]
- por xm14, xm13 ; indices
+ ; now find the best cost
+ pmaxsd xm2, xm0, xm1
+ pshufd xm3, xm2, q3232
+ pmaxsd xm2, xm3
+ pshufd xm3, xm2, q1111
+ pmaxsd xm2, xm3
+ pshufd xm2, xm2, q0000 ; best cost
- punpckhqdq xm4, xm3, xm0
- punpcklqdq xm3, xm0
- pcmpgtd xm5, xm4, xm3 ; [2or3-6or7] > [0or1/4or5]
- punpcklqdq xm5, xm5
- pand xm6, xm5, xm4
- pandn xm7, xm5, xm3
- por xm6, xm7 ; { highest 2 values, complements at idx^4 }
- movhlps xm13, xm14
- pand xm13, xm5, xm13
- pandn xm14, xm5, xm14
- por xm14, xm13
+ ; find the idx using minpos
+ ; make everything other than the best cost negative via subtraction
+ ; find the min of unsigned 16-bit ints to sort out the negative values
+ psubd xm4, xm1, xm2
+ psubd xm3, xm0, xm2
+ packssdw xm3, xm4
+ phminposuw xm3, xm3
- pshufd xm7, xm6, q3311
- pcmpgtd xm8, xm7, xm6 ; [4or5or6or7] > [0or1or2or3]
- punpcklqdq xm8, xm8
- pand xm9, xm8, xm7
- pandn xm10, xm8, xm6
- por xm9, xm10 ; max
- movhlps xm10, xm9 ; complement at idx^4
- psubd xm9, xm10
- psrld xm9, 10
- movd [varq], xm9
- pshufd xm13, xm14, q1111
- pand xm13, xm8, xm13
- pandn xm14, xm8, xm14
- por xm14, xm13
- movd eax, xm14
+ ; convert idx to 32-bits
+ psrldq xm3, 2
+ movd eax, xm3
+
+ ; get idx^4 complement
+ vpermd m3, m1
+ psubd xm2, xm3
+ psrld xm2, 10
+ movd [varq], xm2
RET
%endif ; ARCH_X86_64