ref: 77b3b25c636f5e874bdba362bfa8e028e9620931
parent: f90ada0d08e99ccfb676e59b8e3c497e77879915
author: Martin Storsjö <[email protected]>
date: Thu Sep 24 07:07:02 EDT 2020
arm32: looprestoration: Fix missed vertical alignment
--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -40,8 +40,8 @@
mov r8, r5
vld1.16 {q0}, [r4]
movw r9, #(1 << 14) - (1 << 2)
- vdup.16 q14, r9
- vmov.s16 q15, #2048
+        vdup.16         q14, r9
+        vmov.s16        q15, #2048
// Calculate mid_stride
add r10, r5, #7
bic r10, r10, #7
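
The two constants set up here drive the rounding of the 8 bpc horizontal Wiener
pass, and the add/bic pair rounds the unit width up to a whole number of
8-pixel vectors. A minimal C sketch of the same arithmetic, assuming the 8 bpc
case (function and variable names are mine, not dav1d's):

    #include <stdint.h>

    /* Rough model of the constant setup above for 8 bpc, where the horizontal
     * rounding shift is 3 and its rounding offset is 1 << 2.  Returns the
     * width rounded up to a multiple of 8, the starting point of the
     * mid_stride calculation. */
    static int wiener_h_setup(int w, int16_t *bias, int16_t *post_add) {
        *bias     = (1 << 14) - (1 << 2); /* q14: subtracted later to re-center
                                             the 16-bit sum and fold in the
                                             rounding offset */
        *post_add = 2048;                 /* q15: (1 << 14) >> 3, added back
                                             after the >> 3 */
        return (w + 7) & ~7;              /* the add/bic pair above */
    }
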
@@ -108,8 +108,8 @@
0:
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q2 to have 3x the first byte at the front.
- vdup.8 q1, d4[0]
- vdup.8 q8, d18[0]
+        vdup.8          q1,  d4[0]
+        vdup.8          q8,  d18[0]
// Move r2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r2, r2, #3
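
The !LR_HAVE_LEFT branch synthesizes the three pixels of left context by
replicating the first pixel of the row, and r2 is then rewound so the main
loop sees a consistent amount of input. A hedged C equivalent of that edge
rule (helper name is mine):

    #include <stdint.h>
    #include <string.h>

    /* Build a row with 3 pixels of left context: with no block to the left,
     * the leftmost byte is simply repeated, as the vdup.8 above does. */
    static void pad_left_edge(uint8_t *dst, const uint8_t *src,
                              const uint8_t *left, int w, int have_left) {
        if (have_left)
            memcpy(dst, left, 3);     /* real pixels from the left neighbour */
        else
            memset(dst, src[0], 3);   /* 3x the first byte at the front */
        memcpy(dst + 3, src, w);
    }
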
@@ -127,7 +127,7 @@
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
- sub r9, r5, #14
+        sub             r9,  r5,  #14
ldrb r11, [r2, r9]
ldrb r9, [lr, r9]
// Fill q12/q13 with the right padding pixel
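
The byte used for right-edge padding is fetched ahead of time here, while the
offset of the last valid pixel is still easy to compute from the current
pointers, and then broadcast into q12/q13. In plain C the padding rule itself
is just last-pixel replication (a sketch, names are mine):

    #include <stdint.h>

    /* Replicate the last valid pixel across the right-hand padding, which is
     * what the broadcast prepared above is later used for. */
    static void pad_right_edge(uint8_t *row, int w, int padded_w) {
        const uint8_t pix = row[w - 1];   /* the byte loaded via ldrb above */
        for (int i = w; i < padded_w; i++)
            row[i] = pix;
    }
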
@@ -338,11 +338,11 @@
vdup.16 d25, d16[3]
vpadd.s16 d6, d6, d6
vtrn.16 d24, d25
- vshl.s16 d24, d24, #7
- vsub.s16 d24, d24, d28
- vqadd.s16 d6, d6, d24
- vshr.s16 d6, d6, #3
- vadd.s16 d6, d6, d30
+        vshl.s16        d24, d24, #7
+        vsub.s16        d24, d24, d28
+        vqadd.s16       d6,  d6,  d24
+        vshr.s16        d6,  d6,  #3
+        vadd.s16        d6,  d6,  d30
vst1.s16 {d6[0]}, [r0, :16]!
vst1.s16 {d6[1]}, [r12, :16]!
subs r5, r5, #1
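
This tail sequence finishes one pair of horizontal outputs while staying inside
signed 16 bits: the centre pixel is scaled by 128 (the << 7), the q14 bias from
the first hunk is subtracted, the tap sum is added with saturation, and after
the >> 3 the q15 constant restores the offset. Roughly, per output, assuming
8 bpc:

    #include <stdint.h>

    /* One output of the 8 bpc horizontal Wiener pass, mirroring the saturating
     * 16-bit sequence above.  tap_sum is the 7-tap dot product, mid the centre
     * pixel.  A sketch, not the dav1d reference code. */
    static int16_t wiener_h_finish(int16_t tap_sum, uint8_t mid) {
        int32_t v = mid << 7;            /* vshl.s16 #7: centre pixel * 128 */
        v -= (1 << 14) - (1 << 2);       /* vsub with q14: re-center + round */
        v += tap_sum;                    /* vqadd: saturating in the asm */
        if (v < -32768) v = -32768;      /* model the s16 saturation */
        if (v >  32767) v =  32767;
        v >>= 3;                         /* vshr.s16 #3 */
        return (int16_t)(v + 2048);      /* vadd with q15 */
    }
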
@@ -422,22 +422,22 @@
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
- vmull.s16 q2, d16, d0[0]
- vmlal.s16 q2, d18, d0[1]
- vmlal.s16 q2, d20, d0[2]
- vmlal.s16 q2, d22, d0[3]
- vmlal.s16 q2, d24, d1[0]
- vmlal.s16 q2, d26, d1[1]
- vmlal.s16 q2, d28, d1[2]
- vmull.s16 q3, d17, d0[0]
- vmlal.s16 q3, d19, d0[1]
- vmlal.s16 q3, d21, d0[2]
- vmlal.s16 q3, d23, d0[3]
- vmlal.s16 q3, d25, d1[0]
- vmlal.s16 q3, d27, d1[1]
- vmlal.s16 q3, d29, d1[2]
- vqrshrun.s32 d4, q2, #11
- vqrshrun.s32 d5, q3, #11
+        vmull.s16       q2,  d16, d0[0]
+        vmlal.s16       q2,  d18, d0[1]
+        vmlal.s16       q2,  d20, d0[2]
+        vmlal.s16       q2,  d22, d0[3]
+        vmlal.s16       q2,  d24, d1[0]
+        vmlal.s16       q2,  d26, d1[1]
+        vmlal.s16       q2,  d28, d1[2]
+        vmull.s16       q3,  d17, d0[0]
+        vmlal.s16       q3,  d19, d0[1]
+        vmlal.s16       q3,  d21, d0[2]
+        vmlal.s16       q3,  d23, d0[3]
+        vmlal.s16       q3,  d25, d1[0]
+        vmlal.s16       q3,  d27, d1[1]
+        vmlal.s16       q3,  d29, d1[2]
+        vqrshrun.s32    d4,  q2,  #11
+        vqrshrun.s32    d5,  q3,  #11
vqmovun.s16 d4, q2
vst1.8 {d4}, [r0], r1
.if \compare
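
The chain above is the 7-tap vertical pass: each vmull/vmlal accumulates one
filter tap times one row of the 16-bit intermediate data into 32-bit
accumulators, and vqrshrun/vqmovun round-shift by 11 and narrow back to
pixels. What one output pixel computes, as a C sketch (the Cortex-A53
scheduling concern naturally has no C counterpart):

    #include <stdint.h>

    /* One output pixel of the vertical Wiener pass: 7 taps over 7 rows of the
     * 16-bit intermediate buffer.  rows[] stands in for d16..d29, filter[]
     * for d0/d1; sketch only. */
    static uint8_t wiener_v_pixel(const int16_t rows[7], const int16_t filter[7]) {
        int32_t sum = 0;
        for (int k = 0; k < 7; k++)
            sum += (int32_t)rows[k] * filter[k];   /* vmull/vmlal chain */
        sum = (sum + (1 << 10)) >> 11;             /* vqrshrun.s32 #11 */
        if (sum < 0)   sum = 0;                    /* models the unsigned      */
        if (sum > 255) sum = 255;                  /* saturation of the narrows */
        return (uint8_t)sum;
    }
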
@@ -473,7 +473,7 @@
52: // 2 rows in total, q11 already loaded, load q12 with content data
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
- vmov q15, q14
+        vmov            q15, q14
b 8f
53:
// 3 rows in total, q11 already loaded, load q12 and q13 with content
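
The 52:/53: paths handle stripes shorter than the filter support: the rows
that do exist are loaded, and the last one is duplicated (the vmov) to stand
in for the missing rows. In C terms, with a hypothetical row-pointer setup:

    #include <stddef.h>
    #include <stdint.h>

    /* Point a `taps`-row window at a stripe that only has `h` valid rows:
     * missing rows reuse the last valid one, like the vmov q15, q14 above. */
    static void setup_rows(const uint16_t *rows[], const uint16_t *src,
                           ptrdiff_t stride, int h, int taps) {
        for (int k = 0; k < taps; k++)
            rows[k] = src + (k < h ? k : h - 1) * stride;
    }
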
@@ -785,7 +785,7 @@
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
- sub lr, r5, #(2 + 16 - 2 + 1)
+        sub             lr,  r5,  #(2 + 16 - 2 + 1)
ldrb r11, [r3, lr]
ldrb lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
@@ -1058,7 +1058,7 @@
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
- sub lr, r5, #(2 + 16 - 3 + 1)
+        sub             lr,  r5,  #(2 + 16 - 3 + 1)
ldrb r11, [r3, lr]
ldrb lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
@@ -1100,7 +1100,7 @@
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
vaddl_u16_n q8, q9, d18, d19, d20, d21, \w
vaddw_u16_n q12, q13, d22, d23, \w
- vadd_i32_n q12, q13, q8, q9, \w
+        vadd_i32_n      q12, q13, q8,  q9,  \w
vext.8 q8, q5, q6, #2
vext.8 q9, q5, q6, #4
vext.8 q10, q5, q6, #6
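
The vaddl/vaddw/vadd chain sums five shifted 16-bit windows into 32-bit
accumulators, i.e. the 5-wide horizontal box sums of the self-guided filter,
while the vext instructions already build the next set of shifted windows.
The same sum in plain C, assuming the inputs are the (squared) pixel values:

    #include <stdint.h>

    /* 5-wide horizontal box sums over a row of 16-bit values, widening to
     * 32 bits as the vaddl/vaddw chain does.  src must provide w + 4
     * readable values. */
    static void box5_row_sums(uint32_t *sum, const uint16_t *src, int w) {
        for (int i = 0; i < w; i++)
            sum[i] = (uint32_t)src[i] + src[i + 1] + src[i + 2]
                   + src[i + 3] + src[i + 4];
    }
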
@@ -1152,7 +1152,7 @@
6: // Pad the right edge and produce the last few pixels.
// w < 7, w+1 pixels valid in q0/q4
- sub lr, r5, #1
+        sub             lr,  r5,  #1
// lr = pixels valid - 2
adr r11, L(box5_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]
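
The variable shift table is an offset table indexed by the number of still
valid pixels; the loaded entry is used as a computed branch into per-count
code that shifts the padding pixel into place. The nearest plain-C analogue
is a switch on the remaining width, e.g.:

    #include <stdint.h>

    /* Fill the invalid tail lanes with the right-edge padding pixel, selected
     * by how many pixels are valid (2..7 here, matching "w < 7, w+1 valid").
     * A sketch of the dispatch idea, not the actual shift sequence. */
    static void pad_tail(uint16_t buf[8], int valid, uint16_t pad) {
        switch (valid) {   /* the asm indexes the table with (valid - 2) */
        case 2: buf[2] = pad; /* fall through */
        case 3: buf[3] = pad; /* fall through */
        case 4: buf[4] = pad; /* fall through */
        case 5: buf[5] = pad; /* fall through */
        case 6: buf[6] = pad; /* fall through */
        case 7: buf[7] = pad; break;
        }
    }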