ref: b1167ce169f004f90bcc4a9e8841ffb90fe4abf1
parent: 0bad117eb0f97594a938f17ba05d3ca89ba81a9f
author: Martin Storsjö <[email protected]>
date: Sat Feb 1 09:33:58 EST 2020
arm64: mc: Use two regs for alternating output rows for w4/8 in avg/w_avg/mask. It was already done this way for w32/64. Not doing it for w16 as it didn't help there (and instead gave a small slowdown due to the two setup instructions). This gives a small speedup on in-order cores like A53. Benchmark columns are Cortex A53 / A72 / A73. Before: avg_w4_8bpc_neon: 60.9 / 25.6 / 29.0; avg_w8_8bpc_neon: 143.6 / 52.8 / 64.0. After: avg_w4_8bpc_neon: 56.7 / 26.7 / 28.5; avg_w8_8bpc_neon: 137.2 / 54.5 / 64.4.
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -85,38 +85,44 @@
\type v4, v0, v1, v2, v3
sub x7, x7, w4, uxtw
br x7
+40:
+ add x7, x0, x1
+ lsl x1, x1, #1
4:
cmp w5, #4
st1 {v4.s}[0], [x0], x1
- st1 {v4.s}[1], [x0], x1
+ st1 {v4.s}[1], [x7], x1
st1 {v4.s}[2], [x0], x1
- st1 {v4.s}[3], [x0], x1
+ st1 {v4.s}[3], [x7], x1
b.eq 0f
\type v5, v0, v1, v2, v3
cmp w5, #8
st1 {v5.s}[0], [x0], x1
- st1 {v5.s}[1], [x0], x1
+ st1 {v5.s}[1], [x7], x1
st1 {v5.s}[2], [x0], x1
- st1 {v5.s}[3], [x0], x1
+ st1 {v5.s}[3], [x7], x1
b.eq 0f
\type v4, v0, v1, v2, v3
st1 {v4.s}[0], [x0], x1
- st1 {v4.s}[1], [x0], x1
+ st1 {v4.s}[1], [x7], x1
\type v5, v0, v1, v2, v3
st1 {v4.s}[2], [x0], x1
- st1 {v4.s}[3], [x0], x1
+ st1 {v4.s}[3], [x7], x1
st1 {v5.s}[0], [x0], x1
- st1 {v5.s}[1], [x0], x1
+ st1 {v5.s}[1], [x7], x1
st1 {v5.s}[2], [x0], x1
- st1 {v5.s}[3], [x0], x1
+ st1 {v5.s}[3], [x7], x1
ret
+80:
+ add x7, x0, x1
+ lsl x1, x1, #1
8:
st1 {v4.d}[0], [x0], x1
\type v5, v0, v1, v2, v3
- st1 {v4.d}[1], [x0], x1
+ st1 {v4.d}[1], [x7], x1
st1 {v5.d}[0], [x0], x1
subs w5, w5, #4
- st1 {v5.d}[1], [x0], x1
+ st1 {v5.d}[1], [x7], x1
b.le 0f
\type v4, v0, v1, v2, v3
b 8b
@@ -185,8 +191,8 @@
.hword L(\type\()_tbl) - 640b
.hword L(\type\()_tbl) - 320b
.hword L(\type\()_tbl) - 16b
- .hword L(\type\()_tbl) - 8b
- .hword L(\type\()_tbl) - 4b
+ .hword L(\type\()_tbl) - 80b
+ .hword L(\type\()_tbl) - 40b
endfunc
.endm