shithub: dav1d

Download patch

ref: b0d00020e06a3528977b977c61a252e91969b1a0
parent: 2ef970a885990ff462c30b6573bea5044bb4b0f5
author: B Krishnan Iyer <[email protected]>
date: Tue Aug 6 16:30:30 EDT 2019

arm: mc: Speed up by adding memory alignment hints to NEON load/store (vld1/vld2/vst1) instructions

blend/blend_h/blend_v:

Before:               Cortex A7      A8      A9     A53     A72     A73
blend_h_w2_8bpc_neon:     169.5   194.2   153.1   134.0    63.0    72.6
blend_h_w4_8bpc_neon:     164.4   171.8   142.2   137.8    60.5    60.2
blend_h_w8_8bpc_neon:     184.8   121.0   146.5   123.4    55.9    63.1
blend_h_w16_8bpc_neon:    291.0   178.6   237.3   181.0    88.6    83.9
blend_h_w32_8bpc_neon:    531.9   321.5   432.2   358.3   155.6   156.2
blend_h_w64_8bpc_neon:    957.6   600.3   827.4   631.2   279.7   268.4
blend_h_w128_8bpc_neon:  2161.5  1398.4  1931.8  1403.4   607.0   597.9
blend_v_w2_8bpc_neon:     249.3   373.4   269.2   195.6   107.9   117.6
blend_v_w4_8bpc_neon:     451.7   676.1   555.3   376.1   198.6   266.9
blend_v_w8_8bpc_neon:     561.0   475.2   607.6   357.0   213.9   204.1
blend_v_w16_8bpc_neon:    928.4   626.8   823.8   592.3   269.9   245.3
blend_v_w32_8bpc_neon:   1477.6  1024.8  1186.6   994.5   346.6   370.0
blend_w4_8bpc_neon:       103.3   113.0    86.2    91.5    38.6    35.2
blend_w8_8bpc_neon:       174.9   116.6   137.1   123.1    50.8    55.0
blend_w16_8bpc_neon:      533.0   334.3   446.6   348.6   150.7   155.4
blend_w32_8bpc_neon:     1299.2   836.8  1170.7   909.9   370.5   386.3

After:
blend_h_w2_8bpc_neon:     169.6   169.8   140.9   134.0    62.3    72.5
blend_h_w4_8bpc_neon:     164.5   149.1   127.6   137.7    59.1    60.1
blend_h_w8_8bpc_neon:     184.9   102.7   126.3   123.4    54.9    63.2
blend_h_w16_8bpc_neon:    291.0   163.8   232.1   180.9    88.4    83.9
blend_h_w32_8bpc_neon:    531.2   285.6   422.6   358.4   155.5   155.9
blend_h_w64_8bpc_neon:    956.0   541.9   809.9   631.6   280.0   270.6
blend_h_w128_8bpc_neon:  2159.0  1253.6  1889.0  1404.8   606.2   600.5
blend_v_w2_8bpc_neon:     249.9   362.0   269.4   195.6   107.8   117.6
blend_v_w4_8bpc_neon:     452.6   541.6   538.2   376.1   199.5   266.9
blend_v_w8_8bpc_neon:     561.0   348.9   551.3   357.7   214.3   204.4
blend_v_w16_8bpc_neon:    926.8   510.9   785.0   592.1   270.7   245.8
blend_v_w32_8bpc_neon:   1474.4   913.3  1151.4   995.7   347.5   371.2
blend_w4_8bpc_neon:       103.3    96.6    76.9    91.5    33.7    35.3
blend_w8_8bpc_neon:       174.9    88.2   114.8   123.1    51.5    55.0
blend_w16_8bpc_neon:      532.8   282.2   445.3   348.5   149.8   155.7
blend_w32_8bpc_neon:     1295.1   735.2  1122.8   908.4   372.0   386.5

w_mask_444/422/420:

Before:                    Cortex A7        A8        A9       A53       A72      A73
w_mask_420_w4_8bpc_neon:       218.1     144.4     187.3     152.7      86.9     89.0
w_mask_420_w8_8bpc_neon:       544.0     393.7     437.0     372.5     211.1    230.9
w_mask_420_w16_8bpc_neon:     1537.2    1063.5    1182.3    1024.3     566.4    667.7
w_mask_420_w32_8bpc_neon:     5734.7    4207.2    4716.8    3822.8    2340.5   2521.3
w_mask_420_w64_8bpc_neon:    14317.6   10165.0   13220.2    9578.5    5578.9   5989.9
w_mask_420_w128_8bpc_neon:   37932.8   25299.1   39562.9   25203.8   14916.4  15465.1
w_mask_422_w4_8bpc_neon:       206.8     141.4     177.9     143.4      82.1     84.8
w_mask_422_w8_8bpc_neon:       511.8     380.8     416.7     342.5     198.5    221.7
w_mask_422_w16_8bpc_neon:     1632.8    1154.4    1282.9    1061.2     595.3    684.9
w_mask_422_w32_8bpc_neon:     6087.8    4560.3    5173.3    3945.8    2319.1   2608.7
w_mask_422_w64_8bpc_neon:    15183.7   11013.9   14435.6    9904.6    5449.9   6100.9
w_mask_422_w128_8bpc_neon:   39951.2   27441.0   42398.2   25995.1   14624.9  15529.2
w_mask_444_w4_8bpc_neon:       193.4     127.0     170.0     135.4      76.8     81.4
w_mask_444_w8_8bpc_neon:       477.8     340.0     427.9     319.3     187.2    214.7
w_mask_444_w16_8bpc_neon:     1529.0    1058.8    1209.4     987.0     571.7    677.3
w_mask_444_w32_8bpc_neon:     5687.9    4166.9    4882.4    3667.0    2286.8   2518.7
w_mask_444_w64_8bpc_neon:    14394.7   10055.1   14057.9    9372.0    5369.3   5898.7
w_mask_444_w128_8bpc_neon:   37952.0   25008.8   42169.9   24988.8   22973.7  15241.1

After:
w_mask_420_w4_8bpc_neon:       219.7     120.7     178.0     152.7      87.2     89.0
w_mask_420_w8_8bpc_neon:       547.5     355.2     404.4     372.4     211.4    231.0
w_mask_420_w16_8bpc_neon:     1540.9     987.1    1113.0    1024.9     567.4    669.5
w_mask_420_w32_8bpc_neon:     5915.4    3905.8    4516.8    3929.3    2363.7   2523.6
w_mask_420_w64_8bpc_neon:    14860.9    9437.1   12609.7    9586.4    5627.3   6005.8
w_mask_420_w128_8bpc_neon:   38799.1   23536.1   38598.3   24787.7   14595.7  15474.9
w_mask_422_w4_8bpc_neon:       208.3     115.4     168.6     143.4      82.4     84.8
w_mask_422_w8_8bpc_neon:       515.2     335.7     383.2     342.5     198.9    221.8
w_mask_422_w16_8bpc_neon:     1643.2    1053.6    1199.3    1062.2     595.6    685.7
w_mask_422_w32_8bpc_neon:     6335.1    4161.0    4959.3    4088.5    2353.0   2606.4
w_mask_422_w64_8bpc_neon:    15689.4   10039.8   13806.1    9937.7    5535.3   6099.8
w_mask_422_w128_8bpc_neon:   40754.4   25033.3   41390.5   25683.7   14668.8  15537.1
w_mask_444_w4_8bpc_neon:       194.9     107.4     162.0     135.4      77.1     81.4
w_mask_444_w8_8bpc_neon:       481.1     300.2     422.0     319.1     187.6    214.6
w_mask_444_w16_8bpc_neon:     1542.6     956.1    1137.7     988.4     572.4    677.5
w_mask_444_w32_8bpc_neon:     5896.1    3766.1    4731.9    3801.2    2322.9   2521.8
w_mask_444_w64_8bpc_neon:    14814.0    9084.7   13515.4    9311.0    5497.3   5896.3
w_mask_444_w128_8bpc_neon:   38587.7   22615.2   41389.9   24639.4   17705.8  15244.3

--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -252,8 +252,8 @@
         .word 8f     - L(w_mask_\type\()_tbl) + CONFIG_THUMB
         .word 4f     - L(w_mask_\type\()_tbl) + CONFIG_THUMB
 4:
-        vld1.16         {d0,  d1,  d2,  d3},  [r2]! // tmp1 (four rows at once)
-        vld1.16         {d4,  d5,  d6,  d7},  [r3]! // tmp2 (four rows at once)
+        vld1.16         {d0,  d1,  d2,  d3},  [r2,  :128]! // tmp1 (four rows at once)
+        vld1.16         {d4,  d5,  d6,  d7},  [r3,  :128]! // tmp2 (four rows at once)
         subs            r5,  r5,  #4
         vsub.i16        q8,  q2,  q0    // tmp2-tmp1
         vsub.i16        q9,  q3,  q1
@@ -275,13 +275,13 @@
         vmovn.u16       d20, q10        // 64 - m
         vmovn.u16       d21, q11
         vsub.i8         q10, q15, q10   // m
-        vst1.8          {d20, d21}, [r6]!
+        vst1.8          {d20, d21}, [r6,  :128]!
 .elseif \type == 422
         vpadd.s16       d20, d20, d21   // (64 - m) + (64 - n) (column wise addition)
         vpadd.s16       d21, d22, d23
         vmovn.s16       d6,  q10
         vhsub.u8        d6,  d30, d6    // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
-        vst1.8          {d6},  [r6]!
+        vst1.8          {d6},  [r6,  :64]!
 .elseif \type == 420
         vadd.s16        d20, d20, d21   // (64 - my1) + (64 - my2) (row wise addition)
         vadd.s16        d21, d22, d23
@@ -288,17 +288,17 @@
         vpadd.s16       d20, d20, d21   // (128 - m) + (128 - n) (column wise addition)
         vsub.s16        d20, d30, d20   // (256 - sign) - ((128 - m) + (128 - n))
         vrshrn.u16      d20, q10,  #2   // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
-        vst1.32         {d20[0]},  [r6]!
+        vst1.32         {d20[0]}, [r6,  :32]!
 .endif
-        vst1.32         {d24[0]}, [r0],  r1
-        vst1.32         {d24[1]}, [r12], r1
-        vst1.32         {d25[0]}, [r0],  r1
-        vst1.32         {d25[1]}, [r12], r1
+        vst1.32         {d24[0]}, [r0,  :32], r1
+        vst1.32         {d24[1]}, [r12, :32], r1
+        vst1.32         {d25[0]}, [r0,  :32], r1
+        vst1.32         {d25[1]}, [r12, :32], r1
         bgt             4b
         pop             {r4-r10,pc}
 8:
-        vld1.16         {d0,  d1,  d2,  d3},  [r2]! // tmp1y1, tmp1y2
-        vld1.16         {d4,  d5,  d6,  d7},  [r3]! // tmp2y1, tmp2y2
+        vld1.16         {d0,  d1,  d2,  d3},  [r2,  :128]! // tmp1y1, tmp1y2
+        vld1.16         {d4,  d5,  d6,  d7},  [r3,  :128]! // tmp2y1, tmp2y2
         subs            r5,  r5,  #2
         vsub.i16        q8,  q2,  q0    // tmp2y1 - tmp1y1
         vsub.i16        q9,  q3,  q1    // tmp2y2 - tmp1y2
@@ -320,22 +320,22 @@
         vmovn.u16       d20, q10        // 64 - m
         vmovn.u16       d21, q11
         vsub.i8         q10, q15, q10   // m
-        vst1.8          {d20, d21}, [r6]!
+        vst1.8          {d20, d21}, [r6,  :128]!
 .elseif \type == 422
         vpadd.s16       d20, d20, d21   // (64 - my1) + (64 - ny1) (column wise addition)
         vpadd.s16       d21, d22, d23   // (64 - my2) + (64 - ny2)
         vmovn.s16       d20, q10
         vhsub.u8        d20, d30, d20   // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
-        vst1.8          {d20}, [r6]!
+        vst1.8          {d20}, [r6,  :64]!
 .elseif \type == 420
         vadd.s16        q10, q10, q11   // (64 - my1) + (64 - my2) (row wise addition)
         vpadd.s16       d20, d20, d21   // (128 - m) + (128 - n) (column wise addition)
         vsub.s16        d20, d30, d20   // (256 - sign) - ((128 - m) + (128 - n))
         vrshrn.u16      d20, q10, #2    // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
-        vst1.32         {d20[0]}, [r6]!
+        vst1.32         {d20[0]}, [r6,  :32]!
 .endif
-        vst1.16         {d24}, [r0],  r1
-        vst1.16         {d25}, [r12], r1
+        vst1.16         {d24}, [r0,  :64], r1
+        vst1.16         {d25}, [r12, :64], r1
         bgt             8b
         pop             {r4-r10,pc}
 1280:
@@ -354,9 +354,9 @@
 161:
         mov             r8,  r4
 16:
-        vld1.16         {d0,  d1,  d2,  d3},  [r2]! // tmp1y1
-        vld1.16         {d4,  d5,  d6,  d7},  [r3]! // tmp2y1
-        vld1.16         {d16, d17, d18, d19}, [r7]! // tmp1y2
+        vld1.16         {d0,  d1,  d2,  d3},  [r2,  :128]! // tmp1y1
+        vld1.16         {d4,  d5,  d6,  d7},  [r3,  :128]! // tmp2y1
+        vld1.16         {d16, d17, d18, d19}, [r7,  :128]! // tmp1y2
         subs            r8,  r8,  #16
         vsub.i16        q2,  q2,  q0    // tmp2y1 - tmp1y1
         vsub.i16        q3,  q3,  q1
@@ -372,24 +372,24 @@
         vqdmulh.s16     q13, q13, q3
         vadd.i16        q12, q12, q0    // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
         vadd.i16        q13, q13, q1
-        vld1.16         {d0,  d1,  d2,  d3},  [r9]! // tmp2h2
+        vld1.16         {d0,  d1,  d2,  d3},  [r9,  :128]! // tmp2y2
 .if \type == 444
         vmovn.u16       d20, q10        // 64 - my1
         vmovn.u16       d21, q11
         vsub.i8         q10, q15, q10   // my1
-        vst1.8          {d20, d21}, [r6]!
+        vst1.8          {d20, d21}, [r6,  :128]!
 .elseif \type == 422
         vpadd.s16       d20, d20, d21   // (64 - my1) + (64 - ny1) (column wise addition)
         vpadd.s16       d21, d22, d23
         vmovn.s16       d20, q10
         vhsub.u8        d20, d30, d20   // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
-        vst1.8          {d20}, [r6]!
+        vst1.8          {d20}, [r6,  :64]!
 .endif
         vqrshrun.s16    d24, q12, #4    // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
         vqrshrun.s16    d25, q13, #4
         vsub.i16        q0,  q0,  q8    // tmp2y2 - tmp1y2
         vsub.i16        q1,  q1,  q9
-        vst1.16         {d24, d25}, [r0]!    // store dsty1
+        vst1.16         {d24, d25}, [r0,  :128]!    // store dsty1
         vabs.s16        q2,  q0         // abs(tmp2y2 - tmp1y2)
         vabs.s16        q3,  q1
         vqsub.u16       q2,  q14, q2    // 6903 - abs(tmp2y2 - tmp1y2)
@@ -402,13 +402,13 @@
         vmovn.u16       d4,  q2         // 64 - my2
         vmovn.u16       d5,  q3
         vsub.i8         q2,  q15, q2    // my2
-        vst1.8          {d4,  d5},  [r10]!
+        vst1.8          {d4,  d5},  [r10, :128]!
 .elseif \type == 422
         vpadd.s16       d4,  d4,  d5    // (64 - my2) + (64 - ny2) (column wise addition)
         vpadd.s16       d5,  d6,  d7
         vmovn.s16       d4,  q2
         vhsub.u8        d4,  d30, d4    // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
-        vst1.8          {d4},  [r10]!
+        vst1.8          {d4},  [r10, :64]!
 .elseif \type == 420
         vadd.s16        q10, q10, q2    // (64 - my1) + (64 - my2) (row wise addition)
         vadd.s16        q11, q11, q3
@@ -416,7 +416,7 @@
         vpadd.s16       d21, d22, d23
         vsub.s16        q10, q15, q10   // (256 - sign) - ((128 - m) + (128 - n))
         vrshrn.u16      d20, q10, #2    // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
-        vst1.8          {d20}, [r6]!
+        vst1.8          {d20}, [r6,  :64]!
 .endif
         vqdmulh.s16     q12, q12, q0    // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
         vqdmulh.s16     q13, q13, q1
@@ -424,7 +424,7 @@
         vadd.i16        q13, q13, q9
         vqrshrun.s16    d24, q12, #4    // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
         vqrshrun.s16    d25, q13, #4
-        vst1.16         {d24, d25}, [r12]!   // store dsty2
+        vst1.16         {d24, d25}, [r12, :128]!   // store dsty2
         bgt             16b
         subs            r5,  r5,  #2
         add             r2,  r2,  r4,  lsl #1
@@ -472,17 +472,17 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
 4:
-        vld1.u8         {d2},     [r5]!
-        vld1.u8         {d1},     [r2]!
-        vld1.32         {d0[]},   [r0]
+        vld1.u8         {d2},     [r5,  :64]!
+        vld1.u8         {d1},     [r2,  :64]!
+        vld1.32         {d0[]},   [r0,  :32]
         subs            r4,  r4,  #2
-        vld1.32         {d0[1]},  [r12]
+        vld1.32         {d0[1]},  [r12, :32]
         vsub.i8         d3,  d22, d2
         vmull.u8        q8,  d1,  d2
         vmlal.u8        q8,  d0,  d3
         vrshrn.i16      d20, q8,  #6
-        vst1.32         {d20[0]}, [r0],  r1
-        vst1.32         {d20[1]}, [r12], r1
+        vst1.32         {d20[0]}, [r0,  :32], r1
+        vst1.32         {d20[1]}, [r12, :32], r1
         bgt             4b
         pop             {r4-r5,pc}
 80:
@@ -490,11 +490,11 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
 8:
-        vld1.u8         {q1},  [r5]!
-        vld1.u8         {q2},  [r2]!
-        vld1.u8         {d0},  [r0]
+        vld1.u8         {q1},  [r5,  :128]!
+        vld1.u8         {q2},  [r2,  :128]!
+        vld1.u8         {d0},  [r0,  :64]
         vsub.i8         d17, d16, d2
-        vld1.u8         {d1},  [r12]
+        vld1.u8         {d1},  [r12, :64]
         subs            r4,  r4,  #2
         vsub.i8         d18, d16, d3
         vmull.u8        q3,  d2,  d4
@@ -503,8 +503,8 @@
         vmlal.u8        q10, d1,  d18
         vrshrn.i16      d22, q3,  #6
         vrshrn.i16      d23, q10, #6
-        vst1.u8         {d22}, [r0],  r1
-        vst1.u8         {d23}, [r12], r1
+        vst1.u8         {d22}, [r0,  :64], r1
+        vst1.u8         {d23}, [r12, :64], r1
         bgt             8b
         pop             {r4-r5,pc}
 160:
@@ -512,12 +512,12 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
 16:
-        vld1.u8         {q1,  q2},  [r5]!
-        vld1.u8         {q8,  q9},  [r2]!
-        vld1.u8         {q0},  [r0]
+        vld1.u8         {q1,  q2},  [r5,  :128]!
+        vld1.u8         {q8,  q9},  [r2,  :128]!
+        vld1.u8         {q0},  [r0,  :128]
         subs            r4,  r4,  #2
         vsub.i8         q15, q12, q1
-        vld1.u8         {q13}, [r12]
+        vld1.u8         {q13}, [r12, :128]
         vmull.u8        q3,  d16, d2
         vmlal.u8        q3,  d0,  d30
         vmull.u8        q14, d17, d3
@@ -531,16 +531,16 @@
         vmlal.u8        q14, d27, d31
         vrshrn.i16      d22, q3,  #6
         vrshrn.i16      d23, q14, #6
-        vst1.u8         {q10}, [r0],  r1
-        vst1.u8         {q11}, [r12], r1
+        vst1.u8         {q10}, [r0,  :128], r1
+        vst1.u8         {q11}, [r12, :128], r1
         bgt             16b
         pop             {r4-r5,pc}
 320:
         vmov.i8         q10, #64
 32:
-        vld1.u8         {q2,  q3},  [r5]!
-        vld1.u8         {q8,  q9},  [r2]!
-        vld1.u8         {q0,  q1},  [r0]
+        vld1.u8         {q2,  q3},  [r5,  :128]!
+        vld1.u8         {q8,  q9},  [r2,  :128]!
+        vld1.u8         {q0,  q1},  [r0,  :128]
         subs            r4,  r4,  #1
         vsub.i8         q11, q10, q2
         vmull.u8        q15, d16, d4
@@ -556,7 +556,7 @@
         vmlal.u8        q14, d3,  d23
         vrshrn.i16      d26, q15, #6
         vrshrn.i16      d27, q14, #6
-        vst1.u8         {q12, q13}, [r0],  r1
+        vst1.u8         {q12, q13}, [r0,  :128],  r1
         bgt             32b
         pop             {r4-r5,pc}
 endfunc
@@ -588,18 +588,18 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
 2:
-        vld1.16         {d2[], d3[]},  [r5]!
-        vld1.32         {d1[0]},  [r2]!
+        vld1.16         {d2[], d3[]},  [r5,  :16]!
+        vld1.32         {d1[0]},  [r2,  :32]!
         subs            r4,  r4,  #2
-        vld1.16         {d0[]},   [r0]
+        vld1.16         {d0[]},   [r0,  :16]
         vzip.8          d2,  d3
         vsub.i8         d4,  d22, d2
-        vld1.16         {d0[1]},  [r12]
+        vld1.16         {d0[1]},  [r12, :16]
         vmull.u8        q8,  d1,  d2
         vmlal.u8        q8,  d0,  d4
         vrshrn.i16      d20, q8,  #6
-        vst1.16         {d20[0]}, [r0],  r1
-        vst1.16         {d20[1]}, [r12], r1
+        vst1.16         {d20[0]}, [r0,  :16], r1
+        vst1.16         {d20[1]}, [r12, :16], r1
         bgt             2b
         pop             {r4-r8,pc}
 40:
@@ -607,18 +607,18 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
 4:
-        vld2.u8         {d2[],  d3[]},   [r5]!
-        vld1.u8         {d1},     [r2]!
+        vld2.u8         {d2[],  d3[]},   [r5,  :16]!
+        vld1.u8         {d1},     [r2,  :64]!
         subs            r4,  r4,  #2
         vext.u8         d2,  d2,  d3,   #4
-        vld1.32         {d0[]},   [r0]
+        vld1.32         {d0[]},   [r0,  :32]
         vsub.i8         d6,  d22, d2
-        vld1.32         {d0[1]},  [r12]
+        vld1.32         {d0[1]},  [r12, :32]
         vmull.u8        q8,  d1,  d2
         vmlal.u8        q8,  d0,  d6
         vrshrn.i16      d20, q8,  #6
-        vst1.32         {d20[0]}, [r0],  r1
-        vst1.32         {d20[1]}, [r12], r1
+        vst1.32         {d20[0]}, [r0,  :32], r1
+        vst1.32         {d20[1]}, [r12, :32], r1
         bgt             4b
         pop             {r4-r8,pc}
 80:
@@ -626,11 +626,11 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
 8:
-        vld2.u8         {d2[],  d3[]},  [r5]!
-        vld1.u8         {d4,  d5},  [r2]!
-        vld1.u8         {d0},   [r0]
+        vld2.u8         {d2[],  d3[]},  [r5,  :16]!
+        vld1.u8         {d4,  d5},  [r2,  :128]!
+        vld1.u8         {d0},   [r0,  :64]
         vsub.i8         q9,  q8,  q1
-        vld1.u8         {d1},   [r12]
+        vld1.u8         {d1},   [r12, :64]
         subs            r4,  r4,  #2
         vmull.u8        q3,  d2,  d4
         vmlal.u8        q3,  d0,  d18
@@ -638,8 +638,8 @@
         vmlal.u8        q10, d1,  d19
         vrshrn.i16      d22, q3,  #6
         vrshrn.i16      d23, q10, #6
-        vst1.u8         {d22}, [r0],  r1
-        vst1.u8         {d23}, [r12], r1
+        vst1.u8         {d22}, [r0,  :64], r1
+        vst1.u8         {d23}, [r12, :64], r1
         bgt             8b
         pop             {r4-r8,pc}
 160:
@@ -647,12 +647,12 @@
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
 16:
-        vld2.u8         {d28[], d29[]}, [r5]!
-        vld1.u8         {d2,  d3,  d4,  d5},  [r2]!
+        vld2.u8         {d28[], d29[]}, [r5,  :16]!
+        vld1.u8         {d2,  d3,  d4,  d5},  [r2,  :128]!
         vsub.i8         q15, q12, q14
-        vld1.u8         {q0},  [r0]
+        vld1.u8         {q0},  [r0,  :128]
         subs            r4,  r4,  #2
-        vld1.u8         {q13}, [r12]
+        vld1.u8         {q13}, [r12, :128]
         vmull.u8        q3,  d2,  d28
         vmlal.u8        q3,  d0,  d30
         vmull.u8        q8,  d3,  d28
@@ -665,8 +665,8 @@
         vmlal.u8        q8,  d27, d31
         vrshrn.i16      d20, q3,  #6
         vrshrn.i16      d21, q8,  #6
-        vst1.u8         {q9},  [r0],  r1
-        vst1.u8         {q10}, [r12], r1
+        vst1.u8         {q9},  [r0,  :128], r1
+        vst1.u8         {q10}, [r12, :128], r1
         bgt             16b
         pop             {r4-r8,pc}
 320:
@@ -679,8 +679,8 @@
         vsub.i8         d7,  d20, d6
         mov             r8,  r3
 32:
-        vld1.u8         {q8,  q9},  [r2]!
-        vld1.u8         {q0,  q1},  [r0]
+        vld1.u8         {q8,  q9},  [r2,  :128]!
+        vld1.u8         {q0,  q1},  [r0,  :128]
         vmull.u8        q15, d16, d6
         vmlal.u8        q15, d0,  d7
         vmull.u8        q14, d17, d6
@@ -693,7 +693,7 @@
         vmlal.u8        q14, d3,  d7
         vrshrn.i16      d2,  q15, #6
         vrshrn.i16      d3,  q14, #6
-        vst1.u8         {q0,  q1},  [r0]!
+        vst1.u8         {q0,  q1},  [r0,  :128]!
         subs            r8,  r8,  #32
         bgt             32b
         add             r0,  r0,  r1
@@ -728,7 +728,7 @@
         lsl             r1,  r1,  #1
         vsub.i8         d3,  d22, d2
 2:
-        vld1.16         {d1[0]},  [r2]!
+        vld1.16         {d1[0]},  [r2,  :16]!
         vld1.8          {d0[]},   [r0]
         subs            r4,  r4,  #2
         vld1.8          {d1[1]},  [r2]
@@ -743,21 +743,21 @@
         pop             {r4-r5,pc}
 40:
         vmov.i8         d22, #64
-        vld1.32         {d4[]},   [r5]
+        vld1.32         {d4[]},   [r5,  :32]
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
         vsub.i8         d5,  d22, d4
         sub             r1,  r1,  #3
 4:
-        vld1.u8         {d2},     [r2]!
-        vld1.32         {d0[]},   [r0]
-        vld1.32         {d0[1]},  [r12]
+        vld1.u8         {d2},     [r2,  :64]!
+        vld1.32         {d0[]},   [r0,  :32]
+        vld1.32         {d0[1]},  [r12, :32]
         subs            r4,  r4,  #2
         vmull.u8        q3,  d2,  d4
         vmlal.u8        q3,  d0,  d5
         vrshrn.i16      d20, q3,  #6
-        vst1.16         {d20[0]}, [r0]!
-        vst1.16         {d20[2]}, [r12]!
+        vst1.16         {d20[0]}, [r0,  :16]!
+        vst1.16         {d20[2]}, [r12, :16]!
         vst1.8          {d20[2]}, [r0]!
         vst1.8          {d20[6]}, [r12]!
         add             r0,  r0,  r1
@@ -766,15 +766,15 @@
         pop             {r4-r5,pc}
 80:
         vmov.i8         d16, #64
-        vld1.u8         {d2},  [r5]
+        vld1.u8         {d2},  [r5,  :64]
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
         vsub.i8         d17, d16, d2
         sub             r1,  r1,  #6
 8:
-        vld1.u8         {d4, d5},  [r2]!
-        vld1.u8         {d0},  [r0]
-        vld1.u8         {d1},  [r12]
+        vld1.u8         {d4,  d5},  [r2,  :128]!
+        vld1.u8         {d0},  [r0,  :64]
+        vld1.u8         {d1},  [r12, :64]
         subs            r4,  r4,  #2
         vmull.u8        q3,  d2,  d4
         vmlal.u8        q3,  d0,  d17
@@ -782,10 +782,10 @@
         vmlal.u8        q10, d1,  d17
         vrshrn.i16      d22, q3,  #6
         vrshrn.i16      d23, q10, #6
-        vst1.32         {d22[0]}, [r0]!
-        vst1.32         {d23[0]}, [r12]!
-        vst1.16         {d22[2]}, [r0]!
-        vst1.16         {d23[2]}, [r12]!
+        vst1.32         {d22[0]}, [r0,  :32]!
+        vst1.32         {d23[0]}, [r12, :32]!
+        vst1.16         {d22[2]}, [r0,  :16]!
+        vst1.16         {d23[2]}, [r12, :16]!
         add             r0,  r0,  r1
         add             r12, r12, r1
         bgt             8b
@@ -792,16 +792,16 @@
         pop             {r4-r5,pc}
 160:
         vmov.i8         q12, #64
-        vld1.u8         {q14}, [r5]
+        vld1.u8         {q14}, [r5,  :128]
         add             r12, r0,  r1
         lsl             r1,  r1,  #1
         vsub.i8         q11, q12, q14
         sub             r1,  r1,  #12
 16:
-        vld1.u8         {q1,  q2},  [r2]!
-        vld1.u8         {q0},  [r0]
+        vld1.u8         {q1,  q2},  [r2,  :128]!
+        vld1.u8         {q0},  [r0,  :128]
         subs            r4,  r4,  #2
-        vld1.u8         {q13}, [r12]
+        vld1.u8         {q13}, [r12, :128]
         vmull.u8        q3,  d2,  d28
         vmlal.u8        q3,  d0,  d22
         vmull.u8        q8,  d3,  d29
@@ -814,10 +814,10 @@
         vmlal.u8        q8,  d27, d23
         vrshrn.i16      d20, q3,  #6
         vrshrn.i16      d21, q8,  #6
-        vst1.u8         {d18},    [r0]!
-        vst1.u8         {d20},    [r12]!
-        vst1.32         {d19[0]}, [r0]!
-        vst1.32         {d21[0]}, [r12]!
+        vst1.u8         {d18},    [r0,  :64]!
+        vst1.u8         {d20},    [r12, :64]!
+        vst1.32         {d19[0]}, [r0,  :32]!
+        vst1.32         {d21[0]}, [r12, :32]!
         add             r0,  r0,  r1
         add             r12, r12, r1
         bgt             16b
@@ -824,12 +824,12 @@
         pop             {r4-r5,pc}
 320:
         vmov.i8         q10, #64
-        vld1.u8         {q2, q3},  [r5]
+        vld1.u8         {q2,  q3},  [r5,  :128]
         vsub.i8         q11, q10, q2
         vsub.i8         q12, q10, q3
 32:
-        vld1.u8         {q8,  q9},  [r2]!
-        vld1.u8         {q0,  q1},  [r0]
+        vld1.u8         {q8,  q9},  [r2,  :128]!
+        vld1.u8         {q0,  q1},  [r0,  :128]
         subs            r4,  r4,  #1
         vmull.u8        q15, d16, d4
         vmlal.u8        q15, d0,  d22
@@ -840,7 +840,7 @@
         vmull.u8        q15, d18, d6
         vmlal.u8        q15, d2,  d24
         vrshrn.i16      d2,  q15, #6
-        vst1.u8         {d0,  d1,  d2},  [r0],  r1
+        vst1.u8         {d0,  d1,  d2},  [r0,  :64],  r1
         bgt             32b
         pop             {r4-r5,pc}
 endfunc
--- a/src/tables.c
+++ b/src/tables.c
@@ -861,7 +861,7 @@
     }
 };
 
-const uint8_t dav1d_obmc_masks[64] = {
+const uint8_t ALIGN(dav1d_obmc_masks[64], 16) = {
     /* Unused */
      0,  0,
     /* 2 */