ref: 801966ca946661881755a8078661e1c880995e46
parent: f481d69b0ffac087504036375d505f4323d7ef5e
author: Martin Storsjö <[email protected]>
date: Mon Mar 23 19:59:41 EDT 2020
arm64: ipred: Use rounded shifts instead of a separate addition
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -1962,14 +1962,11 @@
clz w9, w9 // ctz(width)
clz w10, w10 // ctz(height)
add w9, w9, w10 // log2sz
- movi v16.4s, #1
add x10, x1, x2
lsl x2, x2, #1
- dup v17.4s, w9
- sshl v16.4s, v16.4s, v17.4s // 1 << log2sz
- neg v17.4s, v17.4s // -log2sz
- ushr v16.4s, v16.4s, #1 // 1 << (log2sz - 1)
+ dup v31.4s, w9
mov w9, w6
+ neg v31.4s, v31.4s // -log2sz
br x7
L(ipred_cfl_ac_420_w4):
@@ -2009,8 +2006,7 @@
add v0.8h, v0.8h, v1.8h
uaddlv s0, v0.8h // sum
sub x0, x0, w9, uxtw #3
- add v0.2s, v0.2s, v16.2s // sum += 1 << (log2sz - 1)
- ushl v4.2s, v0.2s, v17.2s // sum >>= log2sz
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >> log2sz
dup v4.8h, v4.h[0]
6: // Subtract dc from ac
ld1 {v0.8h, v1.8h}, [x0]
@@ -2092,8 +2088,7 @@
add v0.4s, v0.4s, v2.4s
addv s0, v0.4s // sum
sub x0, x0, w9, uxtw #4
- add v0.2s, v0.2s, v16.2s // sum += 1 << (log2sz - 1)
- ushl v4.2s, v0.2s, v17.2s // sum >>= log2sz
+ urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >> log2sz
dup v4.8h, v4.h[0]
6: // Subtract dc from ac
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
@@ -2269,14 +2264,11 @@
clz w9, w9 // ctz(width)
clz w10, w10 // ctz(height)
add w9, w9, w10 // log2sz
- movi v16.4s, #1
add x10, x1, x2
lsl x2, x2, #1
- dup v17.4s, w9
- sshl v16.4s, v16.4s, v17.4s // 1 << log2sz
- neg v17.4s, v17.4s // -log2sz
- ushr v16.4s, v16.4s, #1 // 1 << (log2sz - 1)
+ dup v31.4s, w9
mov w9, w6
+ neg v31.4s, v31.4s // -log2sz
br x7
L(ipred_cfl_ac_422_w4):