shithub: dav1d

ref: b7c66fa6555ae5d56f38150e4cac0b3a81ac0673
parent: cbd4827fb7cfc7931902ed9e73dab8369250a32b
author: Martin Storsjö <[email protected]>
date: Fri Nov 27 11:26:54 EST 2020

arm32: looprestoration: Specify alignment in loads/stores in SGR where possible
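
For context, not part of the patch itself: the :64/:128/:32 suffixes added below are NEON address-alignment hints. They assert that the base register is aligned to that many bits, which lets the load/store unit take the faster aligned path; an incorrect assertion faults, so the hints are only added where the buffers are known to be suitably aligned. A minimal sketch of the syntax, using made-up registers and element sizes rather than lines from looprestoration.S:

        vld1.16         {q0},     [r0]            // no alignment assumption
        vld1.16         {q0},     [r0, :128]!     // asserts 16-byte alignment, post-increments r0
        vld1.16         {d1},     [r2, :64]!      // 64-bit load, asserts 8-byte alignment
        vld1.32         {d0[0]},  [r1, :32]!      // single 32-bit lane, asserts 4-byte alignment
        vst1.16         {q4},     [r3, :128]!     // stores take the same hints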

--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -1675,12 +1675,12 @@
         vmov.i16        q14, #3
         vmov.i32        q15, #3
 1:
-        vld1.16         {q0},       [r9]!
-        vld1.16         {q1},       [r4]!
-        vld1.16         {q2},       [r10]!
-        vld1.32         {q8,  q9},  [r7]!
-        vld1.32         {q10, q11}, [r3]!
-        vld1.32         {q12, q13}, [r8]!
+        vld1.16         {q0},       [r9,  :128]!
+        vld1.16         {q1},       [r4,  :128]!
+        vld1.16         {q2},       [r10, :128]!
+        vld1.32         {q8,  q9},  [r7,  :128]!
+        vld1.32         {q10, q11}, [r3,  :128]!
+        vld1.32         {q12, q13}, [r8,  :128]!
 
 2:
         subs            r5,  r5,  #4
@@ -1711,7 +1711,7 @@
         vadd.i32        q3,  q3,  q5
         vext.8          q7,  q12, q13, #4  // +stride
         vext.8          q10, q12, q13, #8  // +1+stride
-        vld1.32         {d24[0]}, [r1]!    // src
+        vld1.32         {d24[0]}, [r1, :32]! // src
         vadd.i32        q3,  q3,  q7       // +stride
         vadd.i32        q8,  q8,  q10      // +1+stride
         vshl.i32        q3,  q3,  #2
@@ -1728,12 +1728,12 @@
         vmov            q8,  q9
         vmov            q10, q11
         vmov            q12, q13
-        vld1.16         {d1},  [r9]!
-        vld1.16         {d3},  [r4]!
-        vld1.16         {d5},  [r10]!
-        vld1.32         {q9},  [r7]!
-        vld1.32         {q11}, [r3]!
-        vld1.32         {q13}, [r8]!
+        vld1.16         {d1},  [r9,  :64]!
+        vld1.16         {d3},  [r4,  :64]!
+        vld1.16         {d5},  [r10, :64]!
+        vld1.32         {q9},  [r7,  :128]!
+        vld1.32         {q11}, [r3,  :128]!
+        vld1.32         {q13}, [r8,  :128]!
         b               2b
 
 3:
@@ -1779,12 +1779,12 @@
         mov             lr,  r5
 
 1:
-        vld1.16         {q0,  q1},  [r4]!
-        vld1.16         {q2,  q3},  [r8]!
-        vld1.32         {q8,  q9},  [r3]!
-        vld1.32         {q11, q12}, [r7]!
-        vld1.32         {q10},      [r3]!
-        vld1.32         {q13},      [r7]!
+        vld1.16         {q0,  q1},  [r4, :128]!
+        vld1.16         {q2,  q3},  [r8, :128]!
+        vld1.32         {q8,  q9},  [r3, :128]!
+        vld1.32         {q11, q12}, [r7, :128]!
+        vld1.32         {q10},      [r3, :128]!
+        vld1.32         {q13},      [r7, :128]!
 
 2:
         vmov.i16        q14, #5
@@ -1816,7 +1816,7 @@
         vext.8          q8,  q11, q12, #4  // +stride
         vext.8          q11, q12, q13, #4
 
-        vld1.8          {d4}, [r1]!
+        vld1.8          {d4}, [r1, :64]!
 
         vmov.i32        q14, #5
         vmov.i32        q15, #6
@@ -1835,15 +1835,15 @@
         vrshrn.i32      d8,  q4,  #9
         vrshrn.i32      d9,  q5,  #9
         vmov            q2,  q3
-        vst1.16         {q4}, [r0]!
+        vst1.16         {q4}, [r0, :128]!
 
         ble             3f
         vmov            q8,  q10
         vmov            q11, q13
-        vld1.16         {q1},       [r4]!
-        vld1.16         {q3},       [r8]!
-        vld1.32         {q9,  q10}, [r3]!
-        vld1.32         {q12, q13}, [r7]!
+        vld1.16         {q1},       [r4, :128]!
+        vld1.16         {q3},       [r8, :128]!
+        vld1.32         {q9,  q10}, [r3, :128]!
+        vld1.32         {q12, q13}, [r7, :128]!
         b               2b
 
 3:
@@ -1857,9 +1857,9 @@
         add             r4,  r4,  r12, lsl #1
         add             r8,  r8,  r12, lsl #1
 
-        vld1.32         {q8, q9}, [r3]!
-        vld1.16         {q0, q1}, [r4]!
-        vld1.32         {q10},    [r3]!
+        vld1.32         {q8, q9}, [r3, :128]!
+        vld1.16         {q0, q1}, [r4, :128]!
+        vld1.32         {q10},    [r3, :128]!
 
         vmov.i16        q12, #5
         vmov.i16        q13, #6
@@ -1876,7 +1876,7 @@
         vext.8          q7,  q9,  q10, #8
         vmul.i16        q2,  q2,  q13      // * 6
         vmla.i16        q2,  q0,  q12      // * 5 -> a
-        vld1.8          {d22}, [r1]!
+        vld1.8          {d22}, [r1, :64]!
         vadd.i32        q8,  q8,  q6       // -1, +1
         vadd.i32        q9,  q9,  q7
         vmovl.u8        q11, d22
@@ -1891,11 +1891,11 @@
         vrshrn.i32      d8,  q4,  #8
         vrshrn.i32      d9,  q5,  #8
         vmov            q8,  q10
-        vst1.16         {q4}, [r0]!
+        vst1.16         {q4}, [r0, :128]!
 
         ble             5f
-        vld1.16         {q1},      [r4]!
-        vld1.32         {q9, q10}, [r3]!
+        vld1.16         {q1},      [r4, :128]!
+        vld1.32         {q9, q10}, [r3, :128]!
         b               4b
 
 5:
@@ -1939,10 +1939,10 @@
         mov             r8,  r5
         blt             2f
 1:
-        vld1.8          {d0},  [r2]!
-        vld1.8          {d16}, [r12]!
-        vld1.16         {q1},  [r4]!
-        vld1.16         {q9},  [lr]!
+        vld1.8          {d0},  [r2,  :64]!
+        vld1.8          {d16}, [r12, :64]!
+        vld1.16         {q1},  [r4,  :128]!
+        vld1.16         {q9},  [lr,  :128]!
         subs            r5,  r5,  #8
         vshll.u8        q0,  d0,  #4     // u
         vshll.u8        q8,  d16, #4     // u
@@ -1980,8 +1980,8 @@
         b               1b
 
 2:
-        vld1.8          {d0}, [r2]!
-        vld1.16         {q1}, [r4]!
+        vld1.8          {d0}, [r2, :64]!
+        vld1.16         {q1}, [r4, :128]!
         subs            r5,  r5,  #8
         vshll.u8        q0,  d0,  #4     // u
         vsub.i16        q1,  q1,  q0     // t1 - u
@@ -2025,12 +2025,12 @@
         mov             r9,  r6
         blt             2f
 1:
-        vld1.8          {d0},  [r2]!
-        vld1.8          {d16}, [r11]!
-        vld1.16         {q1},  [r4]!
-        vld1.16         {q9},  [r12]!
-        vld1.16         {q2},  [r5]!
-        vld1.16         {q10}, [lr]!
+        vld1.8          {d0},  [r2,  :64]!
+        vld1.8          {d16}, [r11, :64]!
+        vld1.16         {q1},  [r4,  :128]!
+        vld1.16         {q9},  [r12, :128]!
+        vld1.16         {q2},  [r5,  :128]!
+        vld1.16         {q10}, [lr,  :128]!
         subs            r6,  r6,  #8
         vshll.u8        q0,  d0,  #4     // u
         vshll.u8        q8,  d16, #4     // u
@@ -2076,9 +2076,9 @@
         b               1b
 
 2:
-        vld1.8          {d0}, [r2]!
-        vld1.16         {q1}, [r4]!
-        vld1.16         {q2}, [r5]!
+        vld1.8          {d0}, [r2, :64]!
+        vld1.16         {q1}, [r4, :128]!
+        vld1.16         {q2}, [r5, :128]!
         subs            r6,  r6,  #8
         vshll.u8        q0,  d0,  #4     // u
         vsub.i16        q1,  q1,  q0     // t1 - u