shithub: dav1d

Download patch

ref: 7ebcb777b99b954c8570141c469418371b3c2e75
parent: 911942ca3a9ef052b787ceffac9fb55149d6fd33
author: Martin Storsjö <[email protected]>
date: Thu Sep 24 08:09:07 EDT 2020

arm64: looprestoration16: Reorder instructions to avoid close data dependencies

Before:                  Cortex A53       A72       A73
wiener_chroma_10bpc_neon:  177063.6  129197.3  127987.9
wiener_chroma_12bpc_neon:  177034.4  129206.8  128409.5
wiener_luma_10bpc_neon:    177072.6  129198.1  127931.8
wiener_luma_12bpc_neon:    177052.4  129196.0  127955.2
After:
wiener_chroma_10bpc_neon:  176319.7  125992.1  128162.4
wiener_chroma_12bpc_neon:  176386.2  125986.4  128343.8
wiener_luma_10bpc_neon:    176174.0  126001.7  128227.8
wiener_luma_12bpc_neon:    176176.5  125992.1  128204.8

This gives a small speedup on A53, a somewhat larger one on A72, and
little change (probably just measurement noise) on A73.

--- a/src/arm/64/looprestoration16.S
+++ b/src/arm/64/looprestoration16.S
@@ -172,13 +172,13 @@
         // Interleaving the mul/mla chains actually hurts performance
         // significantly on Cortex A53, thus keeping mul/mla tightly
         // chained like this.
+        ext             v18.16b, v2.16b,  v3.16b, #6
         ext             v16.16b, v2.16b,  v3.16b, #2
         ext             v17.16b, v2.16b,  v3.16b, #4
-        ext             v18.16b, v2.16b,  v3.16b, #6
         ext             v19.16b, v2.16b,  v3.16b, #8
         ext             v20.16b, v2.16b,  v3.16b, #10
-        ext             v21.16b, v2.16b,  v3.16b, #12
         ushll_sz        v6,  v7,  v18, #7, \wd
+        ext             v21.16b, v2.16b,  v3.16b, #12
         smlal           v6.4s,   v2.4h,   v0.h[0]
         smlal           v6.4s,   v16.4h,  v0.h[1]
         smlal           v6.4s,   v17.4h,  v0.h[2]
@@ -195,13 +195,13 @@
         smlal2          v7.4s,   v20.8h,  v0.h[5]
         smlal2          v7.4s,   v21.8h,  v0.h[6]
 .endif
+        ext             v21.16b, v4.16b,  v5.16b, #6
         ext             v19.16b, v4.16b,  v5.16b, #2
         ext             v20.16b, v4.16b,  v5.16b, #4
-        ext             v21.16b, v4.16b,  v5.16b, #6
         ext             v22.16b, v4.16b,  v5.16b, #8
         ext             v23.16b, v4.16b,  v5.16b, #10
-        ext             v24.16b, v4.16b,  v5.16b, #12
         ushll_sz        v16, v17, v21, #7, \wd
+        ext             v24.16b, v4.16b,  v5.16b, #12
         smlal           v16.4s,  v4.4h,   v0.h[0]
         smlal           v16.4s,  v19.4h,  v0.h[1]
         smlal           v16.4s,  v20.4h,  v0.h[2]