ref: a6711a5c2b12b74e4bc887c525c25a6981158930
parent: 39d6c599352bff68038500756488e80f9cd31295
author: Martin Storsjö <[email protected]>
date: Mon May 4 04:58:12 EDT 2020
arm64: itx: Fix the eob checking for dct_dct_64x16

Before this, we never did the early exit from the first pass.

Before:                                 Cortex A53     A72     A73
inv_txfm_add_64x16_dct_dct_1_8bpc_neon:     7275.7  5198.3  5250.9
inv_txfm_add_64x16_dct_dct_2_8bpc_neon:     7276.1  5197.0  5251.3
inv_txfm_add_64x16_dct_dct_3_8bpc_neon:     7275.8  5196.2  5254.5
inv_txfm_add_64x16_dct_dct_4_8bpc_neon:     7273.6  5198.8  5254.2
After:
inv_txfm_add_64x16_dct_dct_1_8bpc_neon:     5187.8  3763.8  3735.0
inv_txfm_add_64x16_dct_dct_2_8bpc_neon:     7280.6  5185.6  5256.3
inv_txfm_add_64x16_dct_dct_3_8bpc_neon:     7270.7  5179.8  5250.3
inv_txfm_add_64x16_dct_dct_4_8bpc_neon:     7271.7  5212.4  5256.4

The other related variants didn't have this bug and properly exited
early when possible.
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -3218,7 +3218,6 @@
mov w8, #(16 - \i)
cmp w3, w12
b.lt 1f
- ldrh w12, [x13], #2
.endif
add x7, x2, #(\i*2)
mov x8, #16*2
@@ -3226,6 +3225,9 @@
bl inv_txfm_dct_clear_8x64_neon
add x6, x4, #(\i*64*2)
bl inv_txfm_horz_dct_64x8_neon
+.if \i < 8
+ ldrh w12, [x13], #2
+.endif
.endr
b 3f