ref: 5647a57eabc454e2e2360429aba494452af00cb3
parent: 3489a9c116ae2b2e258d41509fe35c9acf7cf5f5
author: Martin Storsjö <[email protected]>
date: Mon Oct 7 08:24:04 EDT 2019
arm64: mc: Use addp instead of addv+trn1 in warp

Before:                Cortex A53      A72      A73
warp_8x8_8bpc_neon:        1952.8   1161.3   1151.1
warp_8x8t_8bpc_neon:       1937.1   1147.5   1139.0
After:
warp_8x8_8bpc_neon:        1860.8   1068.6   1105.8
warp_8x8t_8bpc_neon:       1846.9   1056.4   1099.8
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -3007,28 +3007,20 @@
saddlp v19.4s, v19.8h
mul v22.8h, v22.8h, v5.8h
saddlp v20.4s, v20.8h
- addv s23, v23.4s
saddlp v21.4s, v21.8h
- addv s18, v18.4s
saddlp v22.4s, v22.8h
- addv s19, v19.4s
- trn1 v18.2s, v23.2s, v18.2s
- addv s20, v20.4s
+ addp v18.4s, v23.4s, v18.4s
ext v23.16b, v16.16b, v17.16b, #2*6
- trn1 v19.2s, v19.2s, v20.2s
- addv s21, v21.4s
+ addp v19.4s, v19.4s, v20.4s
mul v23.8h, v23.8h, v6.8h
ext v20.16b, v16.16b, v17.16b, #2*7
- addv s22, v22.4s
mul v20.8h, v20.8h, v7.8h
saddlp v23.4s, v23.8h
- trn1 v21.2s, v21.2s, v22.2s
+ addp v21.4s, v21.4s, v22.4s
saddlp v20.4s, v20.8h
- addv s23, v23.4s
- addv s20, v20.4s
- trn1 v20.2s, v23.2s, v20.2s
- trn1 v18.2d, v18.2d, v19.2d
- trn1 v20.2d, v21.2d, v20.2d
+ addp v20.4s, v23.4s, v20.4s
+ addp v18.4s, v18.4s, v19.4s
+ addp v20.4s, v21.4s, v20.4s
add w5, w5, w8