ref: 8f8dc9281629cf187f542a64b023782b50ed1881
parent: 4f5261a0ed399dcec88c87f34d1095b0152b9ae1
author: Martin Storsjö <[email protected]>
date: Tue Mar 5 06:43:25 EST 2019
arm64: cdef: Use a smarter padding constant

Pad with a value which works both as a large unsigned value and a
negative signed value. This allows doing the max operation using
signed max, avoiding the conditional altogether.

Based on the same idea for x86 by Kyle Siefring.

Before:                     Cortex A53     A72     A73
cdef_filter_4x4_8bpc_neon:       645.5   401.9   422.5
cdef_filter_4x8_8bpc_neon:      1193.7   756.6   782.4
cdef_filter_8x8_8bpc_neon:      2162.4  1361.9  1375.6
After:
cdef_filter_4x4_8bpc_neon:       596.3   377.8   384.8
cdef_filter_4x8_8bpc_neon:      1097.4   705.5   707.1
cdef_filter_8x8_8bpc_neon:      1967.4  1232.3  1239.9
--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -136,8 +136,7 @@
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_neon, export=1
- movi v30.16b, #255
- ushr v30.8h, v30.8h, #1 // INT16_MAX
+ movi v30.8h, #0x80, lsl #8
mov v31.16b, v30.16b
sub x0, x0, #2*(2*\stride+2)
tst w6, #4 // CDEF_HAVE_TOP
@@ -290,14 +289,10 @@
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
- cmeq v16.8h, \s1\().8h, v31.8h
- cmeq v17.8h, \s2\().8h, v31.8h
- bic v16.16b, \s1\().16b, v16.16b
- bic v17.16b, \s2\().16b, v17.16b
umin v2.8h, v2.8h, \s1\().8h
- umax v3.8h, v3.8h, v16.8h
+ smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
- umax v3.8h, v3.8h, v17.8h
+ smax v3.8h, v3.8h, \s2\().8h
cbz \threshold, 3f
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
@@ -308,8 +303,8 @@
uqsub v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
cmhi v18.8h, v0.8h, \s1\().8h // px > p0
cmhi v22.8h, v0.8h, \s2\().8h // px > p1
- smin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
- smin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax())
+ umin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
+ umin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax())
dup v19.8h, \tap // taps[k]/taps[k]
neg v16.8h, v17.8h // -imin()
neg v20.8h, v21.8h // -imin()
@@ -330,10 +325,8 @@
add x8, x8, w9, uxtw #1
movrel x9, directions\w
add x5, x9, w5, uxtw #1
- movi v31.16b, #255
movi v30.8h, #15
dup v28.8h, w6 // damping
- ushr v31.8h, v31.8h, #1 // INT16_MAX
dup v25.8h, w3 // threshold
dup v27.8h, w4 // threshold