ref: cbd4827fb7cfc7931902ed9e73dab8369250a32b
parent: d6beb0a055d46d238c7c4ad73afda8bb2f7a2c26
author: Martin Storsjö <[email protected]>
date: Mon Nov 30 06:35:53 EST 2020
arm64: looprestoration16: Don't keep precalculated squares in box3/5_h

Instead of calculating squares of pixels once, and shifting and adding
the precalculated squares, just do multiply-accumulate of the pixels
that are shifted anyway for the non-squared sum.

This results in more multiplications in total, but fewer instructions,
and multiplications aren't that much more expensive than regular
arithmetic operations anyway.

On Cortex A53 and A72, this is a fairly substantial gain, on A73 it's
a very marginal gain. The runtimes for the box3/5_h functions themselves
are reduced by around 16-20%, and the overall runtime for SGR is reduced
by around 2-8%.

Before:                    Cortex A53      A72      A73
selfguided_3x3_10bpc_neon:   513086.5 385767.7 348774.3
selfguided_5x5_10bpc_neon:   378108.6 291133.5 253251.4
selfguided_mix_10bpc_neon:   876833.1 662801.0 586387.4
After:                     Cortex A53      A72      A73
selfguided_3x3_10bpc_neon:   502734.0 363754.5 343199.8
selfguided_5x5_10bpc_neon:   361696.4 265848.2 249476.8
selfguided_mix_10bpc_neon:   852683.8 615848.6 577615.0
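For illustration only (not part of the patch or of dav1d's code), below is a
minimal C sketch of what the box3 horizontal pass computes and how the two
schemes differ; the function names, the fixed row width W and the scalar
layout are made up for the example. The first variant mirrors the old
approach (square each pixel once, then add shifted copies of the
precalculated squares), the second mirrors the new one (multiply-accumulate
the pixels that are already shifted for the plain sum, as the umull/umlal
sequences in the patch do per vector half).

    /* Illustrative sketch only; not dav1d's interfaces. For every output x
     * we need the plain 3-tap sum and the 3-tap sum of squares of 16-bit
     * (10/12 bpc) pixels, so the 32-bit accumulators cannot overflow. */
    #include <stdint.h>
    #include <stdio.h>

    #define W 8  /* illustrative row width */

    /* Old scheme: square every pixel once, then add shifted copies of the
     * precalculated squares (fewer multiplies, but the squared values have
     * to be kept and shuffled across loop iterations). */
    static void box3_h_presquared(uint16_t sum1[W], uint32_t sum2[W],
                                  const uint16_t p[W + 2])
    {
        uint32_t sq[W + 2];
        for (int x = 0; x < W + 2; x++)
            sq[x] = (uint32_t)p[x] * p[x];
        for (int x = 0; x < W; x++) {
            sum1[x] = (uint16_t)(p[x] + p[x + 1] + p[x + 2]);
            sum2[x] = sq[x] + sq[x + 1] + sq[x + 2];
        }
    }

    /* New scheme: multiply-accumulate the same pixels that are already
     * shifted for the plain sum. More multiplies in total, but no
     * precalculated squares to carry around, so fewer instructions. */
    static void box3_h_mac(uint16_t sum1[W], uint32_t sum2[W],
                           const uint16_t p[W + 2])
    {
        for (int x = 0; x < W; x++) {
            const uint32_t a = p[x], b = p[x + 1], c = p[x + 2];
            sum1[x] = (uint16_t)(a + b + c);
            sum2[x] = a * a + b * b + c * c;
        }
    }

    int main(void)
    {
        const uint16_t p[W + 2] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
        uint16_t s1a[W], s1b[W];
        uint32_t s2a[W], s2b[W];
        box3_h_presquared(s1a, s2a, p);
        box3_h_mac(s1b, s2b, p);
        /* Both schemes produce identical sums; only the instruction count
         * in the vectorized loop differs. */
        for (int x = 0; x < W; x++)
            printf("x=%d  sum=%u/%u  sumsq=%u/%u\n", x,
                   (unsigned)s1a[x], (unsigned)s1b[x],
                   (unsigned)s2a[x], (unsigned)s2b[x]);
        return 0;
    }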
--- a/src/arm/64/looprestoration16.S
+++ b/src/arm/64/looprestoration16.S
@@ -770,13 +770,6 @@
ext v16.16b, v18.16b, v16.16b, #12
2:
- umull v2.4s, v0.4h, v0.4h
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- umull v18.4s, v16.4h, v16.4h
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
-
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
@@ -796,41 +789,33 @@
b 6f
4: // Loop horizontally
-.macro ext_n dst1, dst2, src1, src2, src3, n, w
- ext \dst1, \src1, \src2, \n
+.macro add3 w, wd
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+ ext v29.16b, v16.16b, v17.16b, #4
+
+ add v6\wd, v0\wd, v26\wd
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7\wd, v16\wd, v28\wd
+ umull v24.4s, v16.4h, v16.4h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6\wd, v6\wd, v27\wd
.if \w > 4
- ext \dst2, \src2, \src3, \n
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
.endif
-.endm
-.macro add_n dst1, dst2, src1, src2, src3, src4, w
- add \dst1, \src1, \src3
+ add v7\wd, v7\wd, v29\wd
.if \w > 4
- add \dst2, \src2, \src4
+ umull2 v25.4s, v16.8h, v16.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
.endif
.endm
-
-.macro add3 w, wd
- ext v24.16b, v0.16b, v1.16b, #2
- ext v25.16b, v0.16b, v1.16b, #4
- ext v26.16b, v16.16b, v17.16b, #2
- ext v27.16b, v16.16b, v17.16b, #4
- add v6\wd, v0\wd, v24\wd
- add v7\wd, v16\wd, v26\wd
- add v6\wd, v6\wd, v25\wd
- add v7\wd, v7\wd, v27\wd
-
- ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w
- ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w
-
- add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w
- add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w
-
- ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w
- ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w
-
- add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w
- add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w
-.endm
add3 8, .8h
st1 {v6.8h}, [x1], #16
st1 {v7.8h}, [x11], #16
@@ -844,12 +829,6 @@
mov v16.16b, v17.16b
ld1 {v1.8h}, [x3], #16
ld1 {v17.8h}, [x12], #16
- mov v2.16b, v4.16b
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- mov v18.16b, v20.16b
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
@@ -907,11 +886,6 @@
.hword L(box3_variable_shift_tbl) - 55b
88:
- umull v2.4s, v0.4h, v0.4h
- umull2 v3.4s, v0.8h, v0.8h
- umull v18.4s, v16.4h, v16.4h
- umull2 v19.4s, v16.8h, v16.8h
-
add3 4, .4h
subs w5, w5, #4
st1 {v6.4h}, [x1], #8
@@ -921,10 +895,6 @@
b.le 9f
ext v0.16b, v0.16b, v0.16b, #8
ext v16.16b, v16.16b, v16.16b, #8
- mov v2.16b, v3.16b
- mov v3.16b, v4.16b
- mov v18.16b, v19.16b
- mov v19.16b, v20.16b
// Only one needed pixel left, but do a normal 4 pixel
// addition anyway
add3 4, .4h
@@ -1036,13 +1006,6 @@
ext v16.16b, v18.16b, v16.16b, #10
2:
- umull v2.4s, v0.4h, v0.4h
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- umull v18.4s, v16.4h, v16.4h
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
-
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
@@ -1063,43 +1026,53 @@
4: // Loop horizontally
.macro add5 w, wd
- ext v24.16b, v0.16b, v1.16b, #2
- ext v25.16b, v0.16b, v1.16b, #4
- ext v26.16b, v0.16b, v1.16b, #6
- ext v27.16b, v0.16b, v1.16b, #8
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v28.16b, v16.16b, v17.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #4
+ ext v29.16b, v16.16b, v17.16b, #4
- add v6\wd, v0\wd, v24\wd
- add v25\wd, v25\wd, v26\wd
+ add v6\wd, v0\wd, v26\wd
+ umull v22.4s, v0.4h, v0.4h
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7\wd, v16\wd, v28\wd
+ umull v24.4s, v16.4h, v16.4h
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
add v6\wd, v6\wd, v27\wd
+.if \w > 4
+ umull2 v23.4s, v0.8h, v0.8h
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+.endif
+ add v7\wd, v7\wd, v29\wd
+.if \w > 4
+ umull2 v25.4s, v16.8h, v16.8h
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+.endif
- ext v26.16b, v16.16b, v17.16b, #2
- ext v27.16b, v16.16b, v17.16b, #4
+ ext v26.16b, v0.16b, v1.16b, #6
ext v28.16b, v16.16b, v17.16b, #6
+ ext v27.16b, v0.16b, v1.16b, #8
ext v29.16b, v16.16b, v17.16b, #8
- add v7\wd, v16\wd, v26\wd
- add v27\wd, v27\wd, v28\wd
+ add v6\wd, v6\wd, v26\wd
+ umlal v22.4s, v26.4h, v26.4h
+ umlal v22.4s, v27.4h, v27.4h
+ add v7\wd, v7\wd, v28\wd
+ umlal v24.4s, v28.4h, v28.4h
+ umlal v24.4s, v29.4h, v29.4h
+ add v6\wd, v6\wd, v27\wd
+.if \w > 4
+ umlal2 v23.4s, v26.8h, v26.8h
+ umlal2 v23.4s, v27.8h, v27.8h
+.endif
add v7\wd, v7\wd, v29\wd
- add v6\wd, v6\wd, v25\wd
- add v7\wd, v7\wd, v27\wd
-
- ext_n v24.16b, v25.16b, v2.16b, v3.16b, v4.16b, #4, \w
- ext_n v26.16b, v27.16b, v2.16b, v3.16b, v4.16b, #8, \w
- ext_n v28.16b, v29.16b, v2.16b, v3.16b, v4.16b, #12, \w
-
- add_n v22.4s, v23.4s, v2.4s, v3.4s, v24.4s, v25.4s, \w
- add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w
- add_n v22.4s, v23.4s, v22.4s, v23.4s, v3.4s, v4.4s, \w
- add_n v22.4s, v23.4s, v22.4s, v23.4s, v26.4s, v27.4s, \w
-
- ext_n v24.16b, v25.16b, v18.16b, v19.16b, v20.16b, #4, \w
- ext_n v26.16b, v27.16b, v18.16b, v19.16b, v20.16b, #8, \w
- ext_n v28.16b, v29.16b, v18.16b, v19.16b, v20.16b, #12, \w
-
- add_n v24.4s, v25.4s, v18.4s, v19.4s, v24.4s, v25.4s, \w
- add_n v26.4s, v27.4s, v26.4s, v27.4s, v28.4s, v29.4s, \w
- add_n v24.4s, v25.4s, v24.4s, v25.4s, v19.4s, v20.4s, \w
- add_n v24.4s, v25.4s, v24.4s, v25.4s, v26.4s, v27.4s, \w
+.if \w > 4
+ umlal2 v25.4s, v28.8h, v28.8h
+ umlal2 v25.4s, v29.8h, v29.8h
+.endif
.endm
add5 8, .8h
st1 {v6.8h}, [x1], #16
@@ -1114,12 +1087,6 @@
mov v16.16b, v17.16b
ld1 {v1.8h}, [x3], #16
ld1 {v17.8h}, [x12], #16
- mov v2.16b, v4.16b
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- mov v18.16b, v20.16b
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
b.ne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
@@ -1193,13 +1160,6 @@
.hword L(box5_variable_shift_tbl) - 77b
88:
- umull v2.4s, v0.4h, v0.4h
- umull2 v3.4s, v0.8h, v0.8h
- umull v4.4s, v1.4h, v1.4h
- umull v18.4s, v16.4h, v16.4h
- umull2 v19.4s, v16.8h, v16.8h
- umull v20.4s, v17.4h, v17.4h
-
add5 4, .4h
subs w5, w5, #4
st1 {v6.4h}, [x1], #8
@@ -1209,10 +1169,6 @@
b.le 9f
ext v0.16b, v0.16b, v1.16b, #8
ext v16.16b, v16.16b, v17.16b, #8
- mov v2.16b, v3.16b
- mov v3.16b, v4.16b
- mov v18.16b, v19.16b
- mov v19.16b, v20.16b
add5 4, .4h
st1 {v6.4h}, [x1], #8
st1 {v7.4h}, [x11], #8