ref: e80955cc94e78e1de28d8ef9462cd2df026f6fad
parent: 72af9329c0c003f68639301be33d4632147245b6
author: Martin Storsjö <[email protected]>
date: Thu Jan 10 05:48:50 EST 2019
arm64: mc: Optimize mc_8tap_regular_w4_hv_8bpc for A53

Before:                            Cortex A53   Snapdragon 835
mc_8tap_regular_w4_hv_8bpc_neon:        543.6            359.1
After:
mc_8tap_regular_w4_hv_8bpc_neon:        466.7            355.5

The same kind of change doesn't seem to give any benefits on the 8 pixel wide hv filtering though, potentially related to the fact that it uses not only smull/smlal but also smull2/smlal2.
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1447,14 +1447,17 @@
mov v18.8b, v29.8b
4:
- smull v2.4s, v16.4h, v1.h[0]
bl L(\type\()_8tap_filter_4)
- smull v3.4s, v17.4h, v1.h[0]
+ // Interleaving the mul/mla chains actually hurts performance
+ // significantly on Cortex A53, thus keeping mul/mla tightly
+ // chained like this.
+ smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
- smlal v3.4s, v18.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
- smlal v3.4s, v28.4h, v1.h[2]
smlal v2.4s, v28.4h, v1.h[3]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v28.4h, v1.h[2]
smlal v3.4s, v29.4h, v1.h[3]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
@@ -1508,22 +1511,22 @@
mov v22.8b, v29.8b
48:
- smull v2.4s, v16.4h, v1.h[0]
bl L(\type\()_8tap_filter_4)
- smull v3.4s, v17.4h, v1.h[0]
+ smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
- smlal v3.4s, v18.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
- smlal v3.4s, v19.4h, v1.h[2]
smlal v2.4s, v19.4h, v1.h[3]
- smlal v3.4s, v20.4h, v1.h[3]
smlal v2.4s, v20.4h, v1.h[4]
- smlal v3.4s, v21.4h, v1.h[4]
smlal v2.4s, v21.4h, v1.h[5]
- smlal v3.4s, v22.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
- smlal v3.4s, v28.4h, v1.h[6]
smlal v2.4s, v28.4h, v1.h[7]
+ smull v3.4s, v17.4h, v1.h[0]
+ smlal v3.4s, v18.4h, v1.h[1]
+ smlal v3.4s, v19.4h, v1.h[2]
+ smlal v3.4s, v20.4h, v1.h[3]
+ smlal v3.4s, v21.4h, v1.h[4]
+ smlal v3.4s, v22.4h, v1.h[5]
+ smlal v3.4s, v28.4h, v1.h[6]
smlal v3.4s, v29.4h, v1.h[7]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv