ref: 22080aa30cfed267f8c13c293db1dcc34012ecef
parent: 1bd078c2e5592fde8ba045a585398a5a2c1fb603
author: Victorien Le Couviour--Tuffet <[email protected]>
date: Fri Feb 14 10:46:20 EST 2020
x86: optimize cdef_filter_{4x{4,8},8x8}_avx2 Add 2 separate code paths for pri/sec strengths equal to 0. Having both strengths not equal to 0 is uncommon, branching to skip unnecessary computations is therefore beneficial. ------------------------------------------ before: cdef_filter_4x4_8bpc_avx2: 93.8 after: cdef_filter_4x4_8bpc_avx2: 71.7 --------------------- before: cdef_filter_4x8_8bpc_avx2: 161.5 after: cdef_filter_4x8_8bpc_avx2: 116.3 --------------------- before: cdef_filter_8x8_8bpc_avx2: 221.8 after: cdef_filter_8x8_8bpc_avx2: 156.4 ------------------------------------------
--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -128,16 +128,82 @@
SECTION .text
-%macro ACCUMULATE_TAP_BYTE 7 ; tap_offset, shift, mask, strength, mul_tap, w, h
+%macro PREP_REGS 2 ; w, h -- set up dir/jump-table and row-pointer registers for the non-border filter loops
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r6m ; dir argument, read from x86inc arg slot r6m
+ lea tableq, [cdef_filter_%1x%2_jmptable] ; base of the jump table for this block size
+ lea dirq, [tableq+dirq*2*4] ; dirq = table + dir*8; ACCUMULATE_TAP_BYTE reads 4-byte entries relative to it
+%if %1 == 4
+ %if %2 == 4
+ DEFINE_ARGS dst, stride, left, top, pri, sec, \
+ table, dir, dirjmp, dst4, stride3, k
+ %else
+ DEFINE_ARGS dst, stride, left, top, pri, sec, \
+ table, dir, dirjmp, dst4, dst8, stride3, k
+ lea dst8q, [dstq+strideq*8] ; dst + 8 rows
+ %endif
+%else
+ DEFINE_ARGS dst, stride, h, top1, pri, sec, \
+ table, dir, dirjmp, top2, dst4, stride3, k
+ mov hq, -8 ; negative row counter; the v-loops add 4 per pass and exit at 0 (h reuses the slot named left above)
+ lea top1q, [top1q+strideq*0] ; + 0 rows: value is left unchanged (top1 is the slot named top above)
+ lea top2q, [top1q+strideq*1] ; one row below top1
+%endif
+ lea dst4q, [dstq+strideq*4] ; dst + 4 rows
+%if %1 == 4
+ lea stride3q, [strideq*3] ; NOTE(review): only set here for w == 4; w == 8 presumably has stride3q set before this macro -- confirm
+%endif
+%endmacro
+
+%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max -- load 8-bit px rows into m4, zero the sum(s), optionally seed max/min
+ mov kd, 1 ; tap index k; the k-loops count it down 1, 0
+ pxor m15, m15 ; sum
+%if %2 == 8
+ pxor m12, m12 ; second sum accumulator, consumed with m15 by ADJUST_PIXEL when h == 8
+ %if %1 == 4
+ movd xm4, [dstq +strideq*0] ; row 0
+ movd xm6, [dstq +strideq*1] ; row 1
+ movd xm5, [dstq +strideq*2] ; row 2
+ movd xm7, [dstq +stride3q ] ; row 3
+ vinserti128 m4, [dst4q+strideq*0], 1 ; row 4 into the upper lane
+ vinserti128 m6, [dst4q+strideq*1], 1 ; row 5 into the upper lane
+ vinserti128 m5, [dst4q+strideq*2], 1 ; row 6 into the upper lane
+ vinserti128 m7, [dst4q+stride3q ], 1 ; row 7 into the upper lane
+ punpckldq m4, m6 ; lanes: rows 0,1 | rows 4,5
+ punpckldq m5, m7 ; lanes: rows 2,3 | rows 6,7
+ %else
+ movq xm4, [dstq+strideq*0] ; row 0
+ movq xm5, [dstq+strideq*1] ; row 1
+ vinserti128 m4, [dstq+strideq*2], 1 ; row 2 into the upper lane
+ vinserti128 m5, [dstq+stride3q ], 1 ; row 3 into the upper lane
+ %endif
+ punpcklqdq m4, m5 ; w == 4: rows 0-3 | rows 4-7; w == 8: rows 0,1 | rows 2,3
+%else
+ movd xm4, [dstq+strideq*0] ; row 0
+ movd xm5, [dstq+strideq*1] ; row 1
+ vinserti128 m4, [dstq+strideq*2], 1 ; row 2 into the upper lane
+ vinserti128 m5, [dstq+stride3q ], 1 ; row 3 into the upper lane
+ punpckldq m4, m5 ; lanes: rows 0,1 | rows 2,3
+%endif
+%if %3 == 1
+ mova m7, m4 ; max (updated with pmaxub in ACCUMULATE_TAP_BYTE)
+ mova m8, m4 ; min (updated with pminub in ACCUMULATE_TAP_BYTE)
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, h, clip
; load p0/p1
movsxd dirjmpq, [dirq+kq*4+%1*2*4]
add dirjmpq, tableq
call dirjmpq
+%if %8 == 1
pmaxub m7, m5
pminub m8, m5
pmaxub m7, m6
pminub m8, m6
+%endif
; accumulate sum[m15] over p0/p1
%if %7 == 4
@@ -147,7 +213,7 @@
psubusb m5, m6, m5
por m9, m5 ; abs_diff_p01(p01 - px)
pcmpeqb m5, m9
- por m5, m3
+ por m5, %5
psignb m6, %5, m5
psrlw m5, m9, %2 ; emulate 8-bit shift
pand m5, %3
@@ -166,8 +232,8 @@
pcmpeqb m6, m11
punpckhbw m10, m9, m11
punpcklbw m9, m11
- por m5, m3
- por m11, m6, m3
+ por m5, %5
+ por m11, m6, %5
punpckhbw m6, m5, m11
punpcklbw m5, m11
psignb m11, %5, m6
@@ -187,7 +253,107 @@
%endif
%endmacro
-%macro ACCUMULATE_TAP_WORD 6 ; tap_offset, shift, mask, strength, mul_tap, w
+%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip -- add the rounded sum (m15, plus m12 when h == 8) to px in m4 and store to dst
+%if %2 == 4
+ %if %5 == 1
+ punpcklbw m4, %3 ; px bytes -> words by interleaving with the zero register
+ %endif
+ pcmpgtw %3, m15 ; %3 = -1 where sum < 0; this overwrites the zero register
+ paddw m15, %3 ; sum -= (sum < 0)
+ pmulhrsw m15, %4 ; a rounded >> 4 if %4 is 2048 per word (presumed from the pw_2048 name)
+ %if %5 == 0
+ packsswb m15, m15 ; delta words -> signed bytes
+ paddb m4, m15 ; px += delta directly on bytes, no min/max clamp
+ %else
+ paddw m4, m15
+ packuswb m4, m4 ; clip px in [0x0,0xff]
+ pminub m4, m7 ; clamp to the running max held in m7
+ pmaxub m4, m8 ; clamp to the running min held in m8
+ %endif
+ vextracti128 xm5, m4, 1 ; upper lane holds rows 2 and 3
+ movd [dstq+strideq*0], xm4
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+strideq*1], xm4, 1
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ pcmpgtw m6, %3, m12 ; m6 = -1 where sum in m12 < 0 (%3 is preserved on this path)
+ pcmpgtw m5, %3, m15 ; m5 = -1 where sum in m15 < 0
+ paddw m12, m6
+ paddw m15, m5
+ %if %5 == 1
+ punpckhbw m5, m4, %3 ; high 8 px bytes of each lane -> words, paired with m12
+ punpcklbw m4, %3 ; low 8 px bytes of each lane -> words, paired with m15
+ %endif
+ pmulhrsw m12, %4 ; same rounding as the h == 4 path
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m12 ; per lane: m15 deltas then m12 deltas, as signed bytes
+ paddb m4, m15 ; px += delta directly on bytes, no min/max clamp
+ %else
+ paddw m5, m12
+ paddw m4, m15
+ packuswb m4, m5 ; clip px in [0x0,0xff]
+ pminub m4, m7 ; clamp to the running max held in m7
+ pmaxub m4, m8 ; clamp to the running min held in m8
+ %endif
+ vextracti128 xm5, m4, 1 ; upper lane: rows 4-7 (w == 4) or rows 2,3 (w == 8)
+ %if %1 == 4
+ movd [dstq +strideq*0], xm4
+ movd [dst4q+strideq*0], xm5
+ pextrd [dstq +strideq*1], xm4, 1
+ pextrd [dst4q+strideq*1], xm5, 1
+ pextrd [dstq +strideq*2], xm4, 2
+ pextrd [dst4q+strideq*2], xm5, 2
+ pextrd [dstq +stride3q ], xm4, 3
+ pextrd [dst4q+stride3q ], xm5, 3
+ %else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm5
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+stride3q ], xm5
+ %endif
+%endif
+%endmacro
+
+%macro BORDER_PREP_REGS 2 ; w, h -- set up dir, stack pointer, loop count and zero register for the border filter loops
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r6m ; dir argument, read from x86inc arg slot r6m
+ lea dirq, [tableq+dirq*2+14] ; dirq = table + 14 + dir*2; tableq is expected to hold the tap_table address already
+%if %1*%2*2/mmsize > 1
+ %if %1 == 4
+ DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
+ %else
+ DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+ %endif
+ mov hd, %1*%2*2/mmsize ; v-loop pass count: block size at 2 bytes per px divided by the register size
+%else
+ DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
+%endif
+ lea stkq, [px] ; stk reuses the slot named table in the callers; px is defined outside this view (presumably the padded 16-bit pixel buffer -- confirm)
+ pxor m11, m11 ; zero register, compared against the sum in BORDER_ADJUST_PIXEL
+%endmacro
+
+%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max -- load px rows from stk (32-byte row pitch) into m4, zero the sum, optionally seed max/min
+ mov kd, 1 ; tap index k; the k-loops count it down 1, 0
+%if %1 == 4
+ movq xm4, [stkq+32*0] ; row 0 (8 bytes)
+ movhps xm4, [stkq+32*1] ; row 1 into the high half
+ movq xm5, [stkq+32*2] ; row 2
+ movhps xm5, [stkq+32*3] ; row 3 into the high half
+ vinserti128 m4, xm5, 1 ; lanes: rows 0,1 | rows 2,3
+%else
+ mova xm4, [stkq+32*0] ; px
+ vinserti128 m4, [stkq+32*1], 1 ; lanes: row 0 | row 1
+%endif
+ pxor m15, m15 ; sum
+%if %3 == 1
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, clip
; load p0/p1
movsx offq, byte [dirq+kq+%1] ; off1
%if %6 == 4
@@ -211,6 +377,7 @@
movu xm6, [stkq+offq*2+32*0] ; p1
vinserti128 m6, [stkq+offq*2+32*1], 1
%endif
+%if %7 == 1
; out of bounds values are set to a value that is a both a large unsigned
; value and a negative signed value.
; use signed max and unsigned min to remove them
@@ -218,6 +385,7 @@
pminuw m8, m5 ; min after p0
pmaxsw m7, m6 ; max after p1
pminuw m8, m6 ; min after p1
+%endif
; accumulate sum[m15] over p0/p1
; calculate difference before converting
@@ -242,6 +410,28 @@
paddw m15, m5
%endmacro
+%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip -- add the rounded sum in m15 to the word px in m4, pack to bytes and store to dst
+ pcmpgtw m9, m11, m15 ; m9 = -1 where sum < 0 (m11 is zeroed by BORDER_PREP_REGS)
+ paddw m15, m9 ; sum -= (sum < 0)
+ pmulhrsw m15, %2 ; a rounded >> 4 if %2 is 2048 per word (presumed from the pw_2048 name)
+ paddw m4, m15 ; px += delta, still as words
+%if %3 == 1
+ pminsw m4, m7 ; clamp to the running max held in m7
+ pmaxsw m4, m8 ; clamp to the running min held in m8
+%endif
+ packuswb m4, m4 ; words -> unsigned-saturated bytes, per lane
+ vextracti128 xm5, m4, 1 ; upper lane: rows 2,3 (w == 4) or row 1 (w == 8)
+%if %1 == 4
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+stride3q], xm5, 1
+%else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+%endif
+%endmacro
+
%macro CDEF_FILTER 2 ; w, h
INIT_YMM avx2
cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
@@ -303,21 +493,24 @@
%endif
DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping
+ mov dampingd, r7m
+ xor zerod, zerod
movifnidn prid, prim
-%if UNIX64
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ or prid, 0
+ jz .sec_only
movd xm0, prid
- movd xm1, secdmpd
-%endif
- mov dampingd, r7m
lzcnt pridmpd, prid
- lzcnt secdmpd, secdmpm
- sub dampingd, 31
- xor zerod, zerod
add pridmpd, dampingd
cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ or secdmpd, 0
+ jz .pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
- mov [rsp+0], pridmpq ; pri_shift
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
@@ -327,132 +520,29 @@
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir
-%if UNIX64
vpbroadcastb m0, xm0 ; pri_strength
vpbroadcastb m1, xm1 ; sec_strength
-%else
- vpbroadcastb m0, prim
- vpbroadcastb m1, secm
-%endif
and prid, 1
lea priq, [tableq+priq*2+8] ; pri_taps
lea secq, [tableq+12] ; sec_taps
- ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
- mov dird, r6m
- lea tableq, [cdef_filter_%1x%2_jmptable]
- lea dirq, [tableq+dirq*2*4]
-%if %1 == 4
- %if %2 == 4
- DEFINE_ARGS dst, stride, left, top, pri, sec, \
- table, dir, dirjmp, dst4, stride3, k
- %else
- DEFINE_ARGS dst, stride, left, top, pri, sec, \
- table, dir, dirjmp, dst4, dst8, stride3, k
- lea dst8q, [dstq+strideq*8]
- %endif
-%else
- DEFINE_ARGS dst, stride, h, top1, pri, sec, \
- table, dir, dirjmp, top2, dst4, stride3, k
- mov hq, -8
- lea top1q, [top1q+strideq*0]
- lea top2q, [top1q+strideq*1]
-%endif
- lea dst4q, [dstq+strideq*4]
-%if %1 == 4
- lea stride3q, [strideq*3]
-%endif
+ PREP_REGS %1, %2
%if %1*%2 > mmsize
.v_loop:
%endif
- mov kd, 1
- pxor m15, m15 ; sum
-%if %2 == 8
- pxor m12, m12
- %if %1 == 4
- movd xm4, [dstq +strideq*0]
- movd xm6, [dstq +strideq*1]
- movd xm5, [dstq +strideq*2]
- movd xm7, [dstq +stride3q ]
- vinserti128 m4, [dst4q+strideq*0], 1
- vinserti128 m6, [dst4q+strideq*1], 1
- vinserti128 m5, [dst4q+strideq*2], 1
- vinserti128 m7, [dst4q+stride3q ], 1
- punpckldq m4, m6
- punpckldq m5, m7
- %else
- movq xm4, [dstq+strideq*0]
- movq xm5, [dstq+strideq*1]
- vinserti128 m4, [dstq+strideq*2], 1
- vinserti128 m5, [dstq+stride3q ], 1
- %endif
- punpcklqdq m4, m5
-%else
- movd xm4, [dstq+strideq*0]
- movd xm5, [dstq+strideq*1]
- vinserti128 m4, [dstq+strideq*2], 1
- vinserti128 m5, [dstq+stride3q ], 1
- punpckldq m4, m5
-%endif
- mova m7, m4 ; min
- mova m8, m4 ; max
+ LOAD_BLOCK %1, %2, 1
.k_loop:
- vpbroadcastb m2, [priq+kq] ; pri_taps
- vpbroadcastb m3, [secq+kq] ; sec_taps
-
- ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
- ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
- ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
dec kq
jge .k_loop
vpbroadcastd m10, [pw_2048]
pxor m9, m9
-%if %2 == 4
- punpcklbw m4, m9
- pcmpgtw m9, m15
- paddw m15, m9
- pmulhrsw m15, m10
- paddw m4, m15
- packuswb m4, m4 ; clip px in [0x0,0xff]
- pminub m4, m7
- pmaxub m4, m8
- vextracti128 xm5, m4, 1
- movd [dstq+strideq*0], xm4
- movd [dstq+strideq*2], xm5
- pextrd [dstq+strideq*1], xm4, 1
- pextrd [dstq+stride3q ], xm5, 1
-%else
- pcmpgtw m6, m9, m12
- pcmpgtw m5, m9, m15
- paddw m12, m6
- paddw m15, m5
- punpckhbw m5, m4, m9
- punpcklbw m4, m9
- pmulhrsw m12, m10
- pmulhrsw m15, m10
- paddw m5, m12
- paddw m4, m15
- packuswb m4, m5 ; clip px in [0x0,0xff]
- pminub m4, m7
- pmaxub m4, m8
- vextracti128 xm5, m4, 1
- %if %1 == 4
- movd [dstq +strideq*0], xm4
- movd [dst4q+strideq*0], xm5
- pextrd [dstq +strideq*1], xm4, 1
- pextrd [dst4q+strideq*1], xm5, 1
- pextrd [dstq +strideq*2], xm4, 2
- pextrd [dst4q+strideq*2], xm5, 2
- pextrd [dstq +stride3q ], xm4, 3
- pextrd [dst4q+stride3q ], xm5, 3
- %else
- movq [dstq+strideq*0], xm4
- movq [dstq+strideq*2], xm5
- movhps [dstq+strideq*1], xm4
- movhps [dstq+stride3q ], xm5
- %endif
-%endif
+ ADJUST_PIXEL %1, %2, m9, m10, 1
%if %1*%2 > mmsize
mov dstq, dst4q
lea top1q, [rsp+0x90]
@@ -463,6 +553,76 @@
%endif
RET
+.pri_only:
+ DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, pri, _, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m3, [pw_2048]
+ pxor m1, m1
+%if %1*%2 > mmsize
+.pri_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
+ dec kq
+ jge .pri_k_loop
+ ADJUST_PIXEL %1, %2, m1, m3
+%if %1*%2 > mmsize
+ mov dstq, dst4q
+ lea top1q, [rsp+0x90]
+ lea top2q, [rsp+0xA0]
+ lea dst4q, [dst4q+strideq*4]
+ add hq, 4
+ jl .pri_v_loop
+%endif
+ RET
+
+.sec_only:
+ DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ cmovs secdmpd, zerod
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, left, top, _, secdmp, table
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, _, sec, table, dir
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m2, [pw_2048]
+ pxor m0, m0
+%if %1*%2 > mmsize
+.sec_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
+ dec kq
+ jge .sec_k_loop
+ ADJUST_PIXEL %1, %2, m0, m2
+%if %1*%2 > mmsize
+ mov dstq, dst4q
+ lea top1q, [rsp+0x90]
+ lea top2q, [rsp+0xA0]
+ lea dst4q, [dst4q+strideq*4]
+ add hq, 4
+ jl .sec_v_loop
+%endif
+ RET
+
.d0k0:
%if %1 == 4
%if %2 == 4
@@ -1342,21 +1502,24 @@
; register to shuffle values into after packing
vbroadcasti128 m12, [shufb_lohi]
- movifnidn prid, prim
mov dampingd, r7m
- lzcnt pridmpd, prid
-%if UNIX64
- movd xm0, prid
- movd xm1, secdmpd
-%endif
- lzcnt secdmpd, secdmpm
- sub dampingd, 31
xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ or prid, 0
+ jz .border_sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
add pridmpd, dampingd
cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ or secdmpd, 0
+ jz .border_pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
- mov [rsp+0], pridmpq ; pri_shift
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
@@ -1366,87 +1529,99 @@
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
-%if UNIX64
vpbroadcastb m0, xm0 ; pri_strength
vpbroadcastb m1, xm1 ; sec_strength
-%else
- vpbroadcastb m0, prim
- vpbroadcastb m1, secm
-%endif
and prid, 1
lea priq, [tableq+priq*2+8] ; pri_taps
lea secq, [tableq+12] ; sec_taps
- ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
- mov dird, r6m
- lea dirq, [tableq+dirq*2+14]
+ BORDER_PREP_REGS %1, %2
%if %1*%2*2/mmsize > 1
- %if %1 == 4
- DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
- %else
- DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
- %endif
- mov hd, %1*%2*2/mmsize
-%else
- DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
-%endif
- lea stkq, [px]
- pxor m11, m11
-%if %1*%2*2/mmsize > 1
.border_v_loop:
%endif
- mov kd, 1
-%if %1 == 4
- movq xm4, [stkq+32*0]
- movhps xm4, [stkq+32*1]
- movq xm5, [stkq+32*2]
- movhps xm5, [stkq+32*3]
- vinserti128 m4, xm5, 1
-%else
- mova xm4, [stkq+32*0] ; px
- vinserti128 m4, [stkq+32*1], 1
-%endif
- pxor m15, m15 ; sum
- mova m7, m4 ; max
- mova m8, m4 ; min
+ BORDER_LOAD_BLOCK %1, %2, 1
.border_k_loop:
vpbroadcastb m2, [priq+kq] ; pri_taps
vpbroadcastb m3, [secq+kq] ; sec_taps
-
- ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
- ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
- ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
-
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
dec kq
jge .border_k_loop
vpbroadcastd m10, [pw_2048]
- pcmpgtw m9, m11, m15
- paddw m15, m9
- pmulhrsw m15, m10
- paddw m4, m15
- pminsw m4, m7
- pmaxsw m4, m8
- packuswb m4, m4
- vextracti128 xm5, m4, 1
-%if %1 == 4
- movd [dstq+strideq*0], xm4
- pextrd [dstq+strideq*1], xm4, 1
- movd [dstq+strideq*2], xm5
- pextrd [dstq+stride3q], xm5, 1
-%else
- movq [dstq+strideq*0], xm4
- movq [dstq+strideq*1], xm5
+ BORDER_ADJUST_PIXEL %1, m10, 1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_v_loop
%endif
+ RET
+.border_pri_only:
+ DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ DEFINE_ARGS dst, stride, dir, table, pri, _, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m1, [pw_2048]
%if %1*%2*2/mmsize > 1
+.border_pri_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
+ dec kq
+ jge .border_pri_k_loop
+ BORDER_ADJUST_PIXEL %1, m1
+%if %1*%2*2/mmsize > 1
%define vloop_lines (mmsize/(%1*2))
lea dstq, [dstq+strideq*vloop_lines]
add stkq, 32*vloop_lines
dec hd
- jg .border_v_loop
+ jg .border_pri_v_loop
%endif
+ RET
+.border_sec_only:
+ DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ cmovs secdmpd, zerod
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ DEFINE_ARGS dst, stride, dir, table, _, sec, stride3
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m0, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_sec_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
+ dec kq
+ jge .border_sec_k_loop
+ BORDER_ADJUST_PIXEL %1, m0
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_sec_v_loop
+%endif
RET
%endmacro