ref: bc26e300d1ef47040df247923c40491c0e31863d
parent: d2c94ee1d8f4bc436d4cbb2632fc0b27d6966c62
author: Martin Storsjö <[email protected]>
date: Tue Oct 1 19:05:17 EDT 2019
arm64: cdef: Rewrite an expression slightly. Instead of apply_sign(imin(abs(diff), clip), diff), do imax(imin(diff, clip), -clip). Before (Cortex A53 / A72 / A73): cdef_filter_4x4_8bpc_neon: 592.7 / 374.5 / 384.5; cdef_filter_4x8_8bpc_neon: 1093.0 / 704.4 / 706.6; cdef_filter_8x8_8bpc_neon: 1962.6 / 1239.4 / 1252.1. After (Cortex A53 / A72 / A73): cdef_filter_4x4_8bpc_neon: 593.7 / 355.5 / 373.2; cdef_filter_4x8_8bpc_neon: 1091.6 / 663.2 / 685.3; cdef_filter_8x8_8bpc_neon: 1964.2 / 1182.5 / 1210.8.
--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -299,17 +299,17 @@
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
- uqsub v17.8h, \thresh_vec, v17.8h // imax(0, threshold - (abs(diff) >> shift))
- uqsub v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
- cmhi v18.8h, v0.8h, \s1\().8h // px > p0
- cmhi v22.8h, v0.8h, \s2\().8h // px > p1
- umin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
- umin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax())
+ uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+ uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
+ sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
+ sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
+ neg v16.8h, v17.8h // -clip
+ neg v20.8h, v21.8h // -clip
+ smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
+ smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
dup v19.8h, \tap // taps[k]
- neg v16.8h, v17.8h // -imin()
- neg v20.8h, v21.8h // -imin()
- bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
- bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
+ smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
+ smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
3: