ref: 7ebcb777b99b954c8570141c469418371b3c2e75
parent: 911942ca3a9ef052b787ceffac9fb55149d6fd33
author: Martin Storsjö <[email protected]>
date: Thu Sep 24 08:09:07 EDT 2020
arm64: looprestoration16: Reorder instructions to avoid close data dependencies

Before:                    Cortex A53       A72       A73
wiener_chroma_10bpc_neon:    177063.6  129197.3  127987.9
wiener_chroma_12bpc_neon:    177034.4  129206.8  128409.5
wiener_luma_10bpc_neon:      177072.6  129198.1  127931.8
wiener_luma_12bpc_neon:      177052.4  129196.0  127955.2
After:
wiener_chroma_10bpc_neon:    176319.7  125992.1  128162.4
wiener_chroma_12bpc_neon:    176386.2  125986.4  128343.8
wiener_luma_10bpc_neon:      176174.0  126001.7  128227.8
wiener_luma_12bpc_neon:      176176.5  125992.1  128204.8

This gives a small speedup on A53, a bit larger one on A72 and
little change (mostly noise?) on A73.
--- a/src/arm/64/looprestoration16.S
+++ b/src/arm/64/looprestoration16.S
@@ -172,13 +172,13 @@
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
+ ext v18.16b, v2.16b, v3.16b, #6
ext v16.16b, v2.16b, v3.16b, #2
ext v17.16b, v2.16b, v3.16b, #4
- ext v18.16b, v2.16b, v3.16b, #6
ext v19.16b, v2.16b, v3.16b, #8
ext v20.16b, v2.16b, v3.16b, #10
- ext v21.16b, v2.16b, v3.16b, #12
ushll_sz v6, v7, v18, #7, \wd
+ ext v21.16b, v2.16b, v3.16b, #12
smlal v6.4s, v2.4h, v0.h[0]
smlal v6.4s, v16.4h, v0.h[1]
smlal v6.4s, v17.4h, v0.h[2]
@@ -195,13 +195,13 @@
smlal2 v7.4s, v20.8h, v0.h[5]
smlal2 v7.4s, v21.8h, v0.h[6]
.endif
+ ext v21.16b, v4.16b, v5.16b, #6
ext v19.16b, v4.16b, v5.16b, #2
ext v20.16b, v4.16b, v5.16b, #4
- ext v21.16b, v4.16b, v5.16b, #6
ext v22.16b, v4.16b, v5.16b, #8
ext v23.16b, v4.16b, v5.16b, #10
- ext v24.16b, v4.16b, v5.16b, #12
ushll_sz v16, v17, v21, #7, \wd
+ ext v24.16b, v4.16b, v5.16b, #12
smlal v16.4s, v4.4h, v0.h[0]
smlal v16.4s, v19.4h, v0.h[1]
smlal v16.4s, v20.4h, v0.h[2]