ref: bf920fba5782a8b272b44792df0942c211ec5886
parent: f64fdae55128ff1c2204f578ee26b6d577862b26
author: Martin Storsjö <[email protected]>
date: Sun May 19 17:10:55 EDT 2019
arm: mc: Fix 8tap_v w8 with OBMC 3/4 heights Also make sure that the w4 case can exit after processing 12 pixels, where it is convenient. This gives a small slowdown for in-order cores like A7, A8, A53, but actually seems to give a small speedup for out-of-order cores like A9, A72 and A73. AArch64: Before: Cortex A53 A72 A73 mc_8tap_regular_w8_v_8bpc_neon: 223.8 247.3 228.5 After: mc_8tap_regular_w8_v_8bpc_neon: 232.5 243.9 223.4 AArch32: Before: Cortex A7 A8 A9 A53 A72 A73 mc_8tap_regular_w8_v_8bpc_neon: 550.2 470.7 520.5 257.0 256.4 248.2 After: mc_8tap_regular_w8_v_8bpc_neon: 554.3 474.2 511.6 267.5 252.6 246.8
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -1112,7 +1112,7 @@
vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18
mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9
shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27
- b 48b
+ bgt 48b
0:
vpop {q4}
pop {r4-r11,pc}
@@ -1145,7 +1145,7 @@
0:
pop {r4-r11,pc}
-880: // 8x8, 8x16, 8x32 v
+880: // 8x6, 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
640:
@@ -1178,12 +1178,17 @@
mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14
shift_store_8 \type, \d_strd, q3, d6, q4, d8
ble 9f
- subs \h, \h, #4
- load_reg \sr2, \src, \s_strd, d30, d2, d4, d6
- vmovl_u8 q15, d30, q1, d2, q2, d4, q3, d6
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d30, d2
+ vmovl_u8 q15, d30, q1, d2
mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1
+ shift_store_8 \type, \d_strd, q8, d16, q9, d18
+ ble 9f
+ subs \h, \h, #2
+ load_reg \sr2, \src, \s_strd, d4, d6
+ vmovl_u8 q2, d4, q3, d6
mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3
- shift_store_8 \type, \d_strd, q8, d16, q9, d18, q10, d20, q11, d22
+ shift_store_8 \type, \d_strd, q10, d20, q11, d22
ble 9f
subs \h, \h, #4
load_reg \sr2, \src, \s_strd, d8, d16, d18, d20
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1119,7 +1119,7 @@
uxtl_b v18, v19, v20, v21
mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
shift_store_4 \type, \d_strd, v1, v2
- b 48b
+ b.gt 48b
0:
ret
@@ -1151,7 +1151,7 @@
0:
ret
-880: // 8x8, 8x16, 8x32 v
+880: // 8x6, 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
640:
@@ -1183,12 +1183,17 @@
mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
- subs \h, \h, #4
- load_8b \sr2, \src, \s_strd, v27, v16, v17, v18
- uxtl_b v27, v16, v17, v18
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v27, v16
+ uxtl_b v27, v16
mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
+ shift_store_8 \type, \d_strd, v1, v2
+ b.le 9f
+ subs \h, \h, #2
+ load_8b \sr2, \src, \s_strd, v17, v18
+ uxtl_b v17, v18
mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
- shift_store_8 \type, \d_strd, v1, v2, v3, v4
+ shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #4
load_8b \sr2, \src, \s_strd, v19, v20, v21, v22