ref: 361a3c8ee2d03f87f42a76213ee0f93e49fa9ec3
parent: 6ad9bd5f92621d81a227b6d271c29dfaa578000a
author: Martin Storsjö <[email protected]>
date: Tue Jan 28 06:07:14 EST 2020
arm: cdef: Add special cased versions for pri_strength/sec_strength being zero

Before:
ARM32:                     Cortex A7      A8      A9     A53     A72     A73
cdef_filter_4x4_8bpc_neon:     964.6   599.5   707.9   601.2   465.1   405.2
cdef_filter_4x8_8bpc_neon:    1726.0  1066.2  1238.7  1041.7   798.6   725.3
cdef_filter_8x8_8bpc_neon:    2974.4  1671.8  1943.9  1806.1  1229.8  1242.1
ARM64:
cdef_filter_4x4_8bpc_neon:                             569.2   337.8   348.7
cdef_filter_4x8_8bpc_neon:                            1031.1   623.3   633.6
cdef_filter_8x8_8bpc_neon:                            1847.5  1097.7  1117.5

After:
ARM32:                     Cortex A7      A8      A9     A53     A72     A73
cdef_filter_4x4_8bpc_neon:     798.4   524.2   617.3   506.8   432.4   361.1
cdef_filter_4x8_8bpc_neon:    1394.7   910.4  1054.0   863.6   730.2   632.2
cdef_filter_8x8_8bpc_neon:    2364.6  1453.8  1675.1  1466.0  1086.4  1107.7
ARM64:
cdef_filter_4x4_8bpc_neon:                             461.7   303.1   308.6
cdef_filter_4x8_8bpc_neon:                             833.0   547.5   556.0
cdef_filter_8x8_8bpc_neon:                            1459.3   934.1   967.9
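The patch moves the strength checks out of the per-pixel loop: instead of a
cmp/beq on the threshold inside handle_pixel, the exported wrapper picks one of
three specialized bodies (pri-only, sec-only, or combined) once per call. A
minimal C sketch of that dispatch, for orientation only; the function and type
names below are illustrative stand-ins, not the actual assembly symbols, and the
signature follows the dav1d_cdef_filterX_neon prototype quoted in the diff:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint8_t pixel;

    /* Same shape as the prototype in the diff comments. */
    typedef void (cdef_fn)(pixel *dst, ptrdiff_t dst_stride, const uint16_t *tmp,
                           int pri_strength, int sec_strength, int dir,
                           int damping, int h);

    /* Hypothetical stand-ins for the _pri / _sec / _pri_sec entry points. */
    cdef_fn cdef_filter_pri, cdef_filter_sec, cdef_filter_pri_sec;

    static void cdef_filter_dispatch(pixel *dst, ptrdiff_t dst_stride,
                                     const uint16_t *tmp, int pri_strength,
                                     int sec_strength, int dir, int damping,
                                     int h)
    {
        if (!pri_strength)        /* pri_strength == 0: only the secondary filter */
            cdef_filter_sec(dst, dst_stride, tmp, pri_strength, sec_strength,
                            dir, damping, h);
        else if (!sec_strength)   /* sec_strength == 0: only the primary filter */
            cdef_filter_pri(dst, dst_stride, tmp, pri_strength, sec_strength,
                            dir, damping, h);
        else                      /* both nonzero: combined path with clipping */
            cdef_filter_pri_sec(dst, dst_stride, tmp, pri_strength, sec_strength,
                                dir, damping, h);
    }

The pri-only and sec-only variants are built with min=0, so they also skip the
running min/max tracking and the final iclip, which only matter when both
filters contribute.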
--- a/src/arm/32/cdef.S
+++ b/src/arm/32/cdef.S
@@ -311,14 +311,13 @@
vld1.16 {\d22}, [r9] // p1
.endif
.endm
-.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
- cmp \threshold, #0
+.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap, min
+.if \min
vmin.u16 q2, q2, \s1
vmax.s16 q3, q3, \s1
vmin.u16 q2, q2, \s2
vmax.s16 q3, q3, \s2
-
- beq 3f
+.endif
vabd.u16 q8, q0, \s1 // abs(diff)
vabd.u16 q11, q0, \s2 // abs(diff)
vshl.u16 q9, q8, \shift // abs(diff) >> shift
@@ -342,22 +341,24 @@
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
-.macro filter w
-function cdef_filter\w\()_neon, export=1
- push {r4-r9,lr}
- vpush {q4-q7}
- ldrd r4, r5, [sp, #92]
- ldrd r6, r7, [sp, #100]
+.macro filter_func w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_neon
+.if \pri
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
+.endif
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u16 d17, #15
vdup.16 d16, r6 // damping
+.if \pri
vdup.16 q5, r3 // threshold
+.endif
+.if \sec
vdup.16 q7, r4 // threshold
+.endif
vmov.16 d8[0], r3
vmov.16 d8[1], r4
vclz.i16 d8, d8 // clz(threshold)
@@ -364,8 +365,12 @@
vsub.i16 d8, d17, d8 // ulog2(threshold)
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s16 d8, d8 // -shift
+.if \sec
vdup.16 q6, d8[1]
+.endif
+.if \pri
vdup.16 q4, d8[0]
+.endif
1:
.if \w == 8
@@ -377,39 +382,54 @@
.endif
vmov.u16 q1, #0 // sum
+.if \min
vmov.u16 q2, q0 // min
vmov.u16 q3, q0 // max
+.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
mov lr, #2 // sec_taps[0]
2:
+.if \pri
ldrsb r9, [r5] // off1
load_px d28, d29, d30, d31, \w
+.endif
+.if \sec
add r5, r5, #4 // +2*2
ldrsb r9, [r5] // off2
+.endif
+.if \pri
ldrb r12, [r8] // *pri_taps
- handle_pixel q14, q15, r3, q5, q4, r12
+ handle_pixel q14, q15, r3, q5, q4, r12, \min
+.endif
+.if \sec
load_px d28, d29, d30, d31, \w
add r5, r5, #8 // +2*4
ldrsb r9, [r5] // off3
- handle_pixel q14, q15, r4, q7, q6, lr
+ handle_pixel q14, q15, r4, q7, q6, lr, \min
load_px d28, d29, d30, d31, \w
- handle_pixel q14, q15, r4, q7, q6, lr
+ handle_pixel q14, q15, r4, q7, q6, lr, \min
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
+.else
+ add r5, r5, #1 // r5 += 1
+.endif
subs lr, lr, #1 // sec_tap-- (value)
+.if \pri
add r8, r8, #1 // pri_taps++ (pointer)
+.endif
bne 2b
vshr.s16 q14, q1, #15 // -(sum < 0)
@@ -416,8 +436,10 @@
vadd.i16 q1, q1, q14 // sum - (sum < 0)
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
+.if \min
vmin.s16 q0, q0, q3
vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
+.endif
vmovn.u16 d0, q0
.if \w == 8
add r2, r2, #2*16 // tmp += tmp_stride
@@ -432,11 +454,35 @@
// Reset pri_taps and directions back to the original point
sub r5, r5, #2
+.if \pri
sub r8, r8, #2
+.endif
bgt 1b
vpop {q4-q7}
pop {r4-r9,pc}
+endfunc
+.endm
+
+.macro filter w
+filter_func \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_neon, export=1
+ push {r4-r9,lr}
+ vpush {q4-q7}
+ ldrd r4, r5, [sp, #92]
+ ldrd r6, r7, [sp, #100]
+ cmp r3, #0 // pri_strength
+ bne 1f
+ b cdef_filter\w\()_sec_neon // only sec
+1:
+ cmp r4, #0 // sec_strength
+ bne 1f
+ b cdef_filter\w\()_pri_neon // only pri
+1:
+ b cdef_filter\w\()_pri_sec_neon // both pri and sec
endfunc
.endm
--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -286,13 +286,13 @@
ld1 {\d2\().d}[1], [x9] // p1
.endif
.endm
-.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
+.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap, min
+.if \min
umin v2.8h, v2.8h, \s1\().8h
smax v3.8h, v3.8h, \s1\().8h
umin v2.8h, v2.8h, \s2\().8h
smax v3.8h, v3.8h, \s2\().8h
-
- cbz \threshold, 3f
+.endif
uabd v16.8h, v0.8h, \s1\().8h // abs(diff)
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
@@ -316,25 +316,35 @@
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
-.macro filter w
-function cdef_filter\w\()_neon, export=1
+.macro filter_func w, pri, sec, min, suffix
+function cdef_filter\w\suffix\()_neon
+.if \pri
movrel x8, pri_taps
and w9, w3, #1
add x8, x8, w9, uxtw #1
+.endif
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.4h, #15
dup v28.4h, w6 // damping
+.if \pri
dup v25.8h, w3 // threshold
+.endif
+.if \sec
dup v27.8h, w4 // threshold
+.endif
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
+.if \sec
dup v26.8h, v24.h[1]
+.endif
+.if \pri
dup v24.8h, v24.h[0]
+.endif
1:
.if \w == 8
@@ -346,37 +356,52 @@
.endif
movi v1.8h, #0 // sum
+.if \min
mov v2.16b, v0.16b // min
mov v3.16b, v0.16b // max
+.endif
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
+ // This is also used as loop counter.
mov w11, #2 // sec_taps[0]
2:
+.if \pri
ldrb w9, [x5] // off1
load_px v4, v5, \w
+.endif
+.if \sec
add x5, x5, #4 // +2*2
ldrb w9, [x5] // off2
load_px v6, v7, \w
+.endif
+.if \pri
ldrb w10, [x8] // *pri_taps
- handle_pixel v4, v5, w3, v25.8h, v24.8h, w10
+ handle_pixel v4, v5, w3, v25.8h, v24.8h, w10, \min
+.endif
+.if \sec
add x5, x5, #8 // +2*4
ldrb w9, [x5] // off3
load_px v4, v5, \w
- handle_pixel v6, v7, w4, v27.8h, v26.8h, w11
+ handle_pixel v6, v7, w4, v27.8h, v26.8h, w11, \min
- handle_pixel v4, v5, w4, v27.8h, v26.8h, w11
+ handle_pixel v4, v5, w4, v27.8h, v26.8h, w11, \min
sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1;
+.else
+ add x5, x5, #1 // x5 += 1
+.endif
subs w11, w11, #1 // sec_tap-- (value)
+.if \pri
add x8, x8, #1 // pri_taps++ (pointer)
+.endif
b.ne 2b
sshr v4.8h, v1.8h, #15 // -(sum < 0)
@@ -383,8 +408,10 @@
add v1.8h, v1.8h, v4.8h // sum - (sum < 0)
srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4
add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) >> 4
+.if \min
smin v0.8h, v0.8h, v3.8h
smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max)
+.endif
xtn v0.8b, v0.8h
.if \w == 8
add x2, x2, #2*16 // tmp += tmp_stride
@@ -399,10 +426,28 @@
// Reset pri_taps and directions back to the original point
sub x5, x5, #2
+.if \pri
sub x8, x8, #2
+.endif
b.gt 1b
ret
+endfunc
+.endm
+
+.macro filter w
+filter_func \w, pri=1, sec=0, min=0, suffix=_pri
+filter_func \w, pri=0, sec=1, min=0, suffix=_sec
+filter_func \w, pri=1, sec=1, min=1, suffix=_pri_sec
+
+function cdef_filter\w\()_neon, export=1
+ cbnz w3, 1f // pri_strength
+ b cdef_filter\w\()_sec_neon // only sec
+1:
+ cbnz w4, 1f // sec_strength
+ b cdef_filter\w\()_pri_neon // only pri
+1:
+ b cdef_filter\w\()_pri_sec_neon // both pri and sec
endfunc
.endm
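For reference, a scalar model of the per-tap work that handle_pixel vectorizes,
assuming the standard CDEF constrain() form; the shift derivation matches the
"shift = imax(0, damping - ulog2(threshold))" and "abs(diff) >> shift" comments
in the assembly, while the helper names below are not taken from the patch:

    #include <stdlib.h>

    /* floor(log2(v)) for v > 0; the assembly computes this as 15 - clz(v). */
    static int ulog2(unsigned v)
    {
        int n = -1;
        while (v) {
            n++;
            v >>= 1;
        }
        return n;
    }

    /* Per-tap contribution before weighting by *pri_taps / sec_taps. */
    static int constrain(int diff, int threshold, int damping)
    {
        if (!threshold)                          /* zero strength adds nothing */
            return 0;
        int shift = damping - ulog2(threshold);  /* imax(0, damping - ulog2(threshold)) */
        if (shift < 0)
            shift = 0;
        const int adiff = abs(diff);             /* abs(diff) */
        int lim = threshold - (adiff >> shift);  /* threshold - (abs(diff) >> shift) */
        if (lim < 0)
            lim = 0;
        const int clipped = adiff < lim ? adiff : lim;
        return diff < 0 ? -clipped : clipped;    /* re-apply the sign of diff */
    }

Since the pri-only and sec-only variants sum taps from a single filter, the
min/max registers (q2/q3 on ARM32, v2/v3 on ARM64) and the closing
iclip(px + .., min, max) are guarded by the new \min macro argument and dropped
entirely from those paths.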