shithub: dav1d

Download patch

ref: fa6a0924d7aef7fbbdb02c7a8df0714d00e40408
parent: 1f83575018b39d12410407dc08bdc9c445504406
author: Martin Storsjö <[email protected]>
date: Fri Oct 4 09:53:49 EDT 2019

arm64: cdef: Calculate two initial parameters in the same vector

As there are only two individual parameters, we can insert them into
the same vector, reducing the number of actual calculation instructions,
at the cost of a few more instructions to dup the results into the final
vectors.

--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -323,19 +323,18 @@
         add             x8,  x8,  w9, uxtw #1
         movrel          x9,  directions\w
         add             x5,  x9,  w5, uxtw #1
-        movi            v30.8h,   #15
-        dup             v28.8h,   w6                // damping
+        movi            v30.4h,   #15
+        dup             v28.4h,   w6                // damping
 
         dup             v25.8h, w3                  // threshold
         dup             v27.8h, w4                  // threshold
-        clz             v24.8h, v25.8h              // clz(threshold)
-        clz             v26.8h, v27.8h              // clz(threshold)
-        sub             v24.8h, v30.8h, v24.8h      // ulog2(threshold)
-        sub             v26.8h, v30.8h, v26.8h      // ulog2(threshold)
-        uqsub           v24.8h, v28.8h, v24.8h      // shift = imax(0, damping - ulog2(threshold))
-        uqsub           v26.8h, v28.8h, v26.8h      // shift = imax(0, damping - ulog2(threshold))
-        neg             v24.8h, v24.8h              // -shift
-        neg             v26.8h, v26.8h              // -shift
+        trn1            v24.4h, v25.4h, v27.4h
+        clz             v24.4h, v24.4h              // clz(threshold)
+        sub             v24.4h, v30.4h, v24.4h      // ulog2(threshold)
+        uqsub           v24.4h, v28.4h, v24.4h      // shift = imax(0, damping - ulog2(threshold))
+        neg             v24.4h, v24.4h              // -shift
+        dup             v26.8h, v24.h[1]
+        dup             v24.8h, v24.h[0]
 
 1:
 .if \w == 8