ref: 38629906c2bbd417c061a5bc7072924fbb6ca13c
parent: 801966ca946661881755a8078661e1c880995e46
author: Martin Storsjö <[email protected]>
date: Tue Mar 24 07:58:41 EDT 2020
arm64: ipred: Integrate aggregation into the first pass of cfl_ac

Before:                    Cortex A53     A72     A73
cfl_ac_420_w4_8bpc_neon:        131.8    75.6    70.8
cfl_ac_420_w8_8bpc_neon:        199.4   106.4   117.8
cfl_ac_420_w16_8bpc_neon:       370.6   194.6   213.3
cfl_ac_422_w4_8bpc_neon:         98.4    61.4    56.6
cfl_ac_422_w8_8bpc_neon:        237.7   134.2   141.0
cfl_ac_422_w16_8bpc_neon:       456.5   256.2   279.5
After:
cfl_ac_420_w4_8bpc_neon:        121.1    76.3    67.2
cfl_ac_420_w8_8bpc_neon:        188.7   106.6   115.3
cfl_ac_420_w16_8bpc_neon:       331.7   177.4   199.8
cfl_ac_422_w4_8bpc_neon:         88.7    57.3    51.6
cfl_ac_422_w8_8bpc_neon:        208.2   121.2   130.7
cfl_ac_422_w16_8bpc_neon:       393.8   226.3   239.3
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -1955,6 +1955,10 @@
adr x7, L(ipred_cfl_ac_420_tbl)
sub w8, w8, #27
ldrh w8, [x7, w8, uxtw #1]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
sub x7, x7, w8, uxtw
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
@@ -1963,9 +1967,8 @@
clz w10, w10 // ctz(height)
add w9, w9, w10 // log2sz
add x10, x1, x2
- lsl x2, x2, #1
dup v31.4s, w9
- mov w9, w6
+ lsl x2, x2, #1
neg v31.4s, v31.4s // -log2sz
br x7
@@ -1981,6 +1984,7 @@
shl v0.8h, v0.8h, #1
subs w8, w8, #2
st1 {v0.8h}, [x0], #16
+ add v16.8h, v16.8h, v0.8h
b.gt 1b
trn2 v1.2d, v0.2d, v0.2d
trn2 v0.2d, v0.2d, v0.2d
@@ -1989,28 +1993,19 @@
2: // Vertical padding (h_pad > 0)
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
b.gt 2b
3:
- sub x0, x0, w6, uxtw #3
- // Sum the produced ac values
- subs w6, w6, #4
- ld1 {v0.8h, v1.8h}, [x0], #32
- b.le 5f
-4:
- ld1 {v2.8h, v3.8h}, [x0], #32
- subs w6, w6, #4
- add v0.8h, v0.8h, v2.8h
- add v1.8h, v1.8h, v3.8h
- b.gt 4b
-5:
- add v0.8h, v0.8h, v1.8h
+ // Aggregate the sums
+ add v0.8h, v16.8h, v17.8h
uaddlv s0, v0.8h // sum
- sub x0, x0, w9, uxtw #3
+ sub x0, x0, w6, uxtw #3
urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
dup v4.8h, v4.h[0]
6: // Subtract dc from ac
ld1 {v0.8h, v1.8h}, [x0]
- subs w9, w9, #4
+ subs w6, w6, #4
sub v0.8h, v0.8h, v4.8h
sub v1.8h, v1.8h, v4.8h
st1 {v0.8h, v1.8h}, [x0], #32
@@ -2034,6 +2029,8 @@
shl v1.8h, v2.8h, #1
subs w8, w8, #2
st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
b.gt 1b
mov v0.16b, v1.16b
b L(ipred_cfl_ac_420_w8_hpad)
@@ -2053,6 +2050,10 @@
trn2 v2.2d, v0.2d, v0.2d
subs w8, w8, #2
st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+ add v16.4h, v16.4h, v0.4h
+ add v17.4h, v17.4h, v1.4h
+ add v18.4h, v18.4h, v2.4h
+ add v19.4h, v19.4h, v3.4h
b.gt 1b
trn1 v0.2d, v2.2d, v3.2d
trn1 v1.2d, v2.2d, v3.2d
@@ -2062,37 +2063,28 @@
2: // Vertical padding (h_pad > 0)
subs w4, w4, #4
st1 {v0.8h, v1.8h}, [x0], #32
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
st1 {v0.8h, v1.8h}, [x0], #32
+ add v18.8h, v18.8h, v0.8h
+ add v19.8h, v19.8h, v1.8h
b.gt 2b
3:
L(ipred_cfl_ac_420_w8_calc_subtract_dc):
- sub x0, x0, w6, uxtw #4
- // Sum the produced ac values
- subs w6, w6, #4
- ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
- b.le 5f
-4:
- ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
- subs w6, w6, #4
- add v0.8h, v0.8h, v4.8h
- add v1.8h, v1.8h, v5.8h
- add v2.8h, v2.8h, v6.8h
- add v3.8h, v3.8h, v7.8h
- b.gt 4b
-5:
- add v0.8h, v0.8h, v1.8h
- add v2.8h, v2.8h, v3.8h
+ // Aggregate the sums
+ add v0.8h, v16.8h, v17.8h
+ add v2.8h, v18.8h, v19.8h
uaddlp v0.4s, v0.8h
uaddlp v2.4s, v2.8h
add v0.4s, v0.4s, v2.4s
addv s0, v0.4s // sum
- sub x0, x0, w9, uxtw #4
+ sub x0, x0, w6, uxtw #4
urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz
dup v4.8h, v4.h[0]
6: // Subtract dc from ac
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
- subs w9, w9, #4
+ subs w6, w6, #4
sub v0.8h, v0.8h, v4.8h
sub v1.8h, v1.8h, v4.8h
sub v2.8h, v2.8h, v4.8h
@@ -2131,6 +2123,10 @@
shl v3.8h, v5.8h, #1
subs w8, w8, #2
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b
@@ -2168,6 +2164,10 @@
trn1 v3.2d, v3.2d, v5.2d
subs w8, w8, #2
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b
@@ -2191,6 +2191,10 @@
dup v3.8h, v2.h[7]
subs w8, w8, #2
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b
@@ -2216,6 +2220,10 @@
trn1 v2.2d, v2.2d, v3.2d
subs w8, w8, #2
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b
@@ -2226,7 +2234,15 @@
2: // Vertical padding (h_pad > 0)
subs w4, w4, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 2b
3:
@@ -2257,6 +2273,10 @@
adr x7, L(ipred_cfl_ac_422_tbl)
sub w8, w8, #27
ldrh w8, [x7, w8, uxtw #1]
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
sub x7, x7, w8, uxtw
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
@@ -2265,9 +2285,8 @@
clz w10, w10 // ctz(height)
add w9, w9, w10 // log2sz
add x10, x1, x2
- lsl x2, x2, #1
dup v31.4s, w9
- mov w9, w6
+ lsl x2, x2, #1
neg v31.4s, v31.4s // -log2sz
br x7
@@ -2282,6 +2301,8 @@
shl v0.8h, v0.8h, #2
shl v1.8h, v1.8h, #2
subs w8, w8, #4
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
st1 {v0.8h, v1.8h}, [x0], #32
b.gt 1b
trn2 v0.2d, v1.2d, v1.2d
@@ -2305,6 +2326,10 @@
shl v3.8h, v3.8h, #2
subs w8, w8, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v3.16b
mov v1.16b, v3.16b
@@ -2330,6 +2355,10 @@
trn1 v2.2d, v2.2d, v6.2d
subs w8, w8, #4
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v3.16b
mov v1.16b, v3.16b
@@ -2355,6 +2384,10 @@
shl v3.8h, v3.8h, #2
subs w8, w8, #2
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b
@@ -2380,6 +2413,10 @@
trn1 v3.2d, v3.2d, v5.2d
subs w8, w8, #2
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b
@@ -2397,6 +2434,10 @@
dup v3.8h, v2.h[7]
subs w8, w8, #2
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b
@@ -2416,6 +2457,10 @@
trn1 v2.2d, v2.2d, v3.2d
subs w8, w8, #2
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ add v16.8h, v16.8h, v0.8h
+ add v17.8h, v17.8h, v1.8h
+ add v18.8h, v18.8h, v2.8h
+ add v19.8h, v19.8h, v3.8h
b.gt 1b
mov v0.16b, v2.16b
mov v1.16b, v3.16b