shithub: dav1d


ref: cbd4827fb7cfc7931902ed9e73dab8369250a32b
parent: d6beb0a055d46d238c7c4ad73afda8bb2f7a2c26
author: Martin Storsjö <[email protected]>
date: Mon Nov 30 06:35:53 EST 2020

arm64: looprestoration16: Don't keep precalculated squares in box3/5_h

Instead of calculating the squares of the pixels once and then
shifting and adding those precalculated squares, just multiply-accumulate
the pixels that are shifted anyway for the non-squared sum. This
results in more multiplications in total, but fewer instructions, and
multiplications aren't that much more expensive than regular
arithmetic operations anyway.
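
As a rough scalar illustration of the idea (the name and interface below
are made up for the sketch; this is not the actual implementation), the
box3 horizontal pass produces, per output position x, a 3-tap sum and a
3-tap sum of squares, and the change amounts to folding the squaring into
the summation itself instead of keeping a separate set of precalculated
squares around:

    #include <stdint.h>

    /* Scalar reference sketch of what a box3 horizontal pass computes. */
    static void box3_h_ref(uint16_t *sum, uint32_t *sumsq,
                           const uint16_t *src, int w)
    {
        for (int x = 0; x < w; x++) {
            uint32_t s = 0, ss = 0;
            for (int i = 0; i < 3; i++) {
                uint32_t p = src[x + i];
                s  += p;
                ss += p * p; /* square recomputed in the accumulation */
            }
            sum[x]   = (uint16_t)s;
            sumsq[x] = ss;
        }
    }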

On Cortex A53 and A72 this is a fairly substantial gain; on A73
it's a very marginal gain.

The runtimes for the box3/5_h functions themselves are reduced
by around 16-20%, and the overall runtime for SGR is reduced
by around 2-8%.

Before:                   Cortex A53       A72       A73
selfguided_3x3_10bpc_neon:  513086.5  385767.7  348774.3
selfguided_5x5_10bpc_neon:  378108.6  291133.5  253251.4
selfguided_mix_10bpc_neon:  876833.1  662801.0  586387.4

After:                    Cortex A53       A72       A73
selfguided_3x3_10bpc_neon:  502734.0  363754.5  343199.8
selfguided_5x5_10bpc_neon:  361696.4  265848.2  249476.8
selfguided_mix_10bpc_neon:  852683.8  615848.6  577615.0

--- a/src/arm/64/looprestoration16.S
+++ b/src/arm/64/looprestoration16.S
@@ -770,13 +770,6 @@
         ext             v16.16b, v18.16b, v16.16b, #12
 
 2:
-        umull           v2.4s,   v0.4h,   v0.4h
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        umull           v18.4s,  v16.4h,  v16.4h
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
-
         tst             w7,  #2 // LR_HAVE_RIGHT
         b.ne            4f
         // If we'll need to pad the right edge, load that byte to pad with
@@ -796,41 +789,33 @@
         b               6f
 
 4:      // Loop horizontally
-.macro ext_n            dst1, dst2, src1, src2, src3, n, w
-        ext             \dst1,  \src1,  \src2,  \n
+.macro add3 w, wd
+        ext             v26.16b, v0.16b,  v1.16b,  #2
+        ext             v28.16b, v16.16b, v17.16b, #2
+        ext             v27.16b, v0.16b,  v1.16b,  #4
+        ext             v29.16b, v16.16b, v17.16b, #4
+
+        add             v6\wd,   v0\wd,   v26\wd
+        umull           v22.4s,  v0.4h,   v0.4h
+        umlal           v22.4s,  v26.4h,  v26.4h
+        umlal           v22.4s,  v27.4h,  v27.4h
+        add             v7\wd,   v16\wd,  v28\wd
+        umull           v24.4s,  v16.4h,  v16.4h
+        umlal           v24.4s,  v28.4h,  v28.4h
+        umlal           v24.4s,  v29.4h,  v29.4h
+        add             v6\wd,   v6\wd,   v27\wd
 .if \w > 4
-        ext             \dst2,  \src2,  \src3,  \n
+        umull2          v23.4s,  v0.8h,   v0.8h
+        umlal2          v23.4s,  v26.8h,  v26.8h
+        umlal2          v23.4s,  v27.8h,  v27.8h
 .endif
-.endm
-.macro add_n            dst1, dst2, src1, src2, src3, src4, w
-        add             \dst1,  \src1,  \src3
+        add             v7\wd,   v7\wd,   v29\wd
 .if \w > 4
-        add             \dst2,  \src2,  \src4
+        umull2          v25.4s,  v16.8h,  v16.8h
+        umlal2          v25.4s,  v28.8h,  v28.8h
+        umlal2          v25.4s,  v29.8h,  v29.8h
 .endif
 .endm
-
-.macro add3 w, wd
-        ext             v24.16b, v0.16b,  v1.16b,  #2
-        ext             v25.16b, v0.16b,  v1.16b,  #4
-        ext             v26.16b, v16.16b, v17.16b, #2
-        ext             v27.16b, v16.16b, v17.16b, #4
-        add             v6\wd,   v0\wd,   v24\wd
-        add             v7\wd,   v16\wd,  v26\wd
-        add             v6\wd,   v6\wd,   v25\wd
-        add             v7\wd,   v7\wd,   v27\wd
-
-        ext_n           v24.16b, v25.16b, v2.16b,  v3.16b,  v4.16b,  #4, \w
-        ext_n           v26.16b, v27.16b, v2.16b,  v3.16b,  v4.16b,  #8, \w
-
-        add_n           v22.4s,  v23.4s,  v2.4s,   v3.4s,   v24.4s,  v25.4s,  \w
-        add_n           v22.4s,  v23.4s,  v22.4s,  v23.4s,  v26.4s,  v27.4s,  \w
-
-        ext_n           v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w
-        ext_n           v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w
-
-        add_n           v24.4s,  v25.4s,  v18.4s,  v19.4s,  v24.4s,  v25.4s,  \w
-        add_n           v24.4s,  v25.4s,  v24.4s,  v25.4s,  v26.4s,  v27.4s,  \w
-.endm
         add3            8, .8h
         st1             {v6.8h},         [x1],  #16
         st1             {v7.8h},         [x11], #16
@@ -844,12 +829,6 @@
         mov             v16.16b, v17.16b
         ld1             {v1.8h},  [x3],  #16
         ld1             {v17.8h}, [x12], #16
-        mov             v2.16b,  v4.16b
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        mov             v18.16b, v20.16b
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
 
         b.ne            4b // If we don't need to pad, just keep summing.
         b               3b // If we need to pad, check how many pixels we have left.
@@ -907,11 +886,6 @@
         .hword L(box3_variable_shift_tbl) - 55b
 
 88:
-        umull           v2.4s,   v0.4h,   v0.4h
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v18.4s,  v16.4h,  v16.4h
-        umull2          v19.4s,  v16.8h,  v16.8h
-
         add3            4, .4h
         subs            w5,  w5,  #4
         st1             {v6.4h},  [x1],  #8
@@ -921,10 +895,6 @@
         b.le            9f
         ext             v0.16b,  v0.16b,  v0.16b,  #8
         ext             v16.16b, v16.16b, v16.16b, #8
-        mov             v2.16b,  v3.16b
-        mov             v3.16b,  v4.16b
-        mov             v18.16b, v19.16b
-        mov             v19.16b, v20.16b
         // Only one needed pixel left, but do a normal 4 pixel
         // addition anyway
         add3            4, .4h
@@ -1036,13 +1006,6 @@
         ext             v16.16b, v18.16b, v16.16b, #10
 
 2:
-        umull           v2.4s,   v0.4h,   v0.4h
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        umull           v18.4s,  v16.4h,  v16.4h
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
-
         tst             w7,  #2 // LR_HAVE_RIGHT
         b.ne            4f
         // If we'll need to pad the right edge, load that byte to pad with
@@ -1063,43 +1026,53 @@
 
 4:      // Loop horizontally
 .macro add5 w, wd
-        ext             v24.16b, v0.16b,  v1.16b,  #2
-        ext             v25.16b, v0.16b,  v1.16b,  #4
-        ext             v26.16b, v0.16b,  v1.16b,  #6
-        ext             v27.16b, v0.16b,  v1.16b,  #8
+        ext             v26.16b, v0.16b,  v1.16b,  #2
+        ext             v28.16b, v16.16b, v17.16b, #2
+        ext             v27.16b, v0.16b,  v1.16b,  #4
+        ext             v29.16b, v16.16b, v17.16b, #4
 
-        add             v6\wd,   v0\wd,   v24\wd
-        add             v25\wd,  v25\wd,  v26\wd
+        add             v6\wd,   v0\wd,   v26\wd
+        umull           v22.4s,  v0.4h,   v0.4h
+        umlal           v22.4s,  v26.4h,  v26.4h
+        umlal           v22.4s,  v27.4h,  v27.4h
+        add             v7\wd,   v16\wd,  v28\wd
+        umull           v24.4s,  v16.4h,  v16.4h
+        umlal           v24.4s,  v28.4h,  v28.4h
+        umlal           v24.4s,  v29.4h,  v29.4h
         add             v6\wd,   v6\wd,   v27\wd
+.if \w > 4
+        umull2          v23.4s,  v0.8h,   v0.8h
+        umlal2          v23.4s,  v26.8h,  v26.8h
+        umlal2          v23.4s,  v27.8h,  v27.8h
+.endif
+        add             v7\wd,   v7\wd,   v29\wd
+.if \w > 4
+        umull2          v25.4s,  v16.8h,  v16.8h
+        umlal2          v25.4s,  v28.8h,  v28.8h
+        umlal2          v25.4s,  v29.8h,  v29.8h
+.endif
 
-        ext             v26.16b, v16.16b, v17.16b, #2
-        ext             v27.16b, v16.16b, v17.16b, #4
+        ext             v26.16b, v0.16b,  v1.16b,  #6
         ext             v28.16b, v16.16b, v17.16b, #6
+        ext             v27.16b, v0.16b,  v1.16b,  #8
         ext             v29.16b, v16.16b, v17.16b, #8
 
-        add             v7\wd,   v16\wd,  v26\wd
-        add             v27\wd,  v27\wd,  v28\wd
+        add             v6\wd,   v6\wd,   v26\wd
+        umlal           v22.4s,  v26.4h,  v26.4h
+        umlal           v22.4s,  v27.4h,  v27.4h
+        add             v7\wd,   v7\wd,   v28\wd
+        umlal           v24.4s,  v28.4h,  v28.4h
+        umlal           v24.4s,  v29.4h,  v29.4h
+        add             v6\wd,   v6\wd,   v27\wd
+.if \w > 4
+        umlal2          v23.4s,  v26.8h,  v26.8h
+        umlal2          v23.4s,  v27.8h,  v27.8h
+.endif
         add             v7\wd,   v7\wd,   v29\wd
-        add             v6\wd,   v6\wd,   v25\wd
-        add             v7\wd,   v7\wd,   v27\wd
-
-        ext_n           v24.16b, v25.16b, v2.16b,  v3.16b,  v4.16b,  #4,  \w
-        ext_n           v26.16b, v27.16b, v2.16b,  v3.16b,  v4.16b,  #8,  \w
-        ext_n           v28.16b, v29.16b, v2.16b,  v3.16b,  v4.16b,  #12, \w
-
-        add_n           v22.4s,  v23.4s,  v2.4s,   v3.4s,   v24.4s,  v25.4s,  \w
-        add_n           v26.4s,  v27.4s,  v26.4s,  v27.4s,  v28.4s,  v29.4s,  \w
-        add_n           v22.4s,  v23.4s,  v22.4s,  v23.4s,  v3.4s,   v4.4s,   \w
-        add_n           v22.4s,  v23.4s,  v22.4s,  v23.4s,  v26.4s,  v27.4s,  \w
-
-        ext_n           v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4,  \w
-        ext_n           v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8,  \w
-        ext_n           v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w
-
-        add_n           v24.4s,  v25.4s,  v18.4s,  v19.4s,  v24.4s,  v25.4s,  \w
-        add_n           v26.4s,  v27.4s,  v26.4s,  v27.4s,  v28.4s,  v29.4s,  \w
-        add_n           v24.4s,  v25.4s,  v24.4s,  v25.4s,  v19.4s,  v20.4s,  \w
-        add_n           v24.4s,  v25.4s,  v24.4s,  v25.4s,  v26.4s,  v27.4s,  \w
+.if \w > 4
+        umlal2          v25.4s,  v28.8h,  v28.8h
+        umlal2          v25.4s,  v29.8h,  v29.8h
+.endif
 .endm
         add5            8, .8h
         st1             {v6.8h},         [x1],  #16
@@ -1114,12 +1087,6 @@
         mov             v16.16b, v17.16b
         ld1             {v1.8h},  [x3],  #16
         ld1             {v17.8h}, [x12], #16
-        mov             v2.16b,  v4.16b
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        mov             v18.16b, v20.16b
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
 
         b.ne            4b // If we don't need to pad, just keep summing.
         b               3b // If we need to pad, check how many pixels we have left.
@@ -1193,13 +1160,6 @@
         .hword L(box5_variable_shift_tbl) - 77b
 
 88:
-        umull           v2.4s,   v0.4h,   v0.4h
-        umull2          v3.4s,   v0.8h,   v0.8h
-        umull           v4.4s,   v1.4h,   v1.4h
-        umull           v18.4s,  v16.4h,  v16.4h
-        umull2          v19.4s,  v16.8h,  v16.8h
-        umull           v20.4s,  v17.4h,  v17.4h
-
         add5            4, .4h
         subs            w5,  w5,  #4
         st1             {v6.4h},  [x1],  #8
@@ -1209,10 +1169,6 @@
         b.le            9f
         ext             v0.16b,  v0.16b,  v1.16b,  #8
         ext             v16.16b, v16.16b, v17.16b, #8
-        mov             v2.16b,  v3.16b
-        mov             v3.16b,  v4.16b
-        mov             v18.16b, v19.16b
-        mov             v19.16b, v20.16b
         add5            4, .4h
         st1             {v6.4h},  [x1],  #8
         st1             {v7.4h},  [x11], #8
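
For reference, the rewritten add3/add5 inner step corresponds roughly to
the following arm_neon.h intrinsics sketch (illustrative only; the
function name and interface are made up, and the real code also handles
a second row, the 4-pixel tail, and the 5-tap variant's extra shifts):

    #include <arm_neon.h>
    #include <stdint.h>

    /* One 8-pixel step of a box3 horizontal pass: 3-tap sum plus 3-tap
     * sum of squares, with the squares recomputed via multiply-accumulate
     * (umull/umlal/umull2/umlal2) instead of being carried between
     * iterations. Reads 16 pixels from src, writes 8 sums and 8 squared
     * sums. */
    static void box3_h_step(uint16_t *sum, uint32_t *sumsq,
                            const uint16_t *src)
    {
        uint16x8_t s0 = vld1q_u16(src);       /* pixels 0..7  */
        uint16x8_t s1 = vld1q_u16(src + 8);   /* pixels 8..15 */
        uint16x8_t a1 = vextq_u16(s0, s1, 1); /* pixels 1..8 (ext #2) */
        uint16x8_t a2 = vextq_u16(s0, s1, 2); /* pixels 2..9 (ext #4) */

        /* Plain 3-tap sum: src[x] + src[x+1] + src[x+2] */
        uint16x8_t s = vaddq_u16(vaddq_u16(s0, a1), a2);

        /* 3-tap sum of squares, widened to 32 bit */
        uint32x4_t sq_lo = vmull_u16(vget_low_u16(s0), vget_low_u16(s0));
        sq_lo = vmlal_u16(sq_lo, vget_low_u16(a1), vget_low_u16(a1));
        sq_lo = vmlal_u16(sq_lo, vget_low_u16(a2), vget_low_u16(a2));
        uint32x4_t sq_hi = vmull_high_u16(s0, s0);
        sq_hi = vmlal_high_u16(sq_hi, a1, a1);
        sq_hi = vmlal_high_u16(sq_hi, a2, a2);

        vst1q_u16(sum, s);
        vst1q_u32(sumsq, sq_lo);
        vst1q_u32(sumsq + 4, sq_hi);
    }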