shithub: dav1d

Download patch

ref: 14d4edcd4891c9bf6c7b573dbd4b97757a2fd3e9
parent: 3b33c52d739da0e8a1e7214c84c49f3298a68ad0
author: Martin Storsjö <[email protected]>
date: Mon Oct 7 19:26:40 EDT 2019

arm: looprestoration: Port the ARM64 SGR NEON assembly to 32 bit arm

The code is mostly a 1:1 port of the ARM64 code, with slightly worse
scheduling due to fewer temporary registers available. The
sgr_finish_filter1_neon function (used in the 3x3 and mix cases)
processes 4 pixels at a time while the ARM64 version processes 8,
due to not having enough registers available.

Relative speedup over C code:
                       Cortex A7     A8     A9    A53    A72    A73
selfguided_3x3_8bpc_neon:   2.12   2.89   1.79   2.61   2.03   3.87
selfguided_5x5_8bpc_neon:   2.50   3.41   2.16   3.14   2.74   4.64
selfguided_mix_8bpc_neon:   2.24   2.98   1.94   2.82   2.28   4.14

Comparison to the original ARM64 assembly:
ARM64:                    Cortex A53        A72        A73
selfguided_3x3_8bpc_neon:   486215.5   359445.6   341317.7
selfguided_5x5_8bpc_neon:   351210.8   267427.2   243399.3
selfguided_mix_8bpc_neon:   820489.1   610909.8   569946.6
ARM32:
selfguided_3x3_8bpc_neon:   542958.8   379448.8   353229.1
selfguided_5x5_8bpc_neon:   351299.6   263685.2   242415.9
selfguided_mix_8bpc_neon:   881587.6   629934.0   580121.2

--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -26,6 +26,7 @@
  */
 
 #include "src/arm/asm.S"
+#include "util.S"
 
 // void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
 //                                 const pixel *src, ptrdiff_t stride,
@@ -682,4 +683,1428 @@
         add             r0,  r0,  r1
         bgt             70b
         pop             {r4,pc}
+endfunc
+
+#define SUM_STRIDE (384+16) // Row stride, in elements, of the sumsq (int32_t) and sum (int16_t) buffers
+
+// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
+//                            const pixel (*left)[4],
+//                            const pixel *src, const ptrdiff_t stride,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box3_h_neon, export=1 // Horizontal pass: sums (sum) and sums of squares (sumsq) of 3 adjacent pixels, two rows per iteration
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100] // r4 = stride, r5 = w (stack args, above the 36+64 bytes pushed above)
+        ldrd            r6,  r7,  [sp, #108] // r6 = h, r7 = edges
+        add             r5,  r5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows (r0/r1/r3 handle the first row of each pair)
+        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq, second row
+        add             r11, r1,  #(2*SUM_STRIDE)   // sum, second row
+        add             r12, r3,  r4                // src, second row
+        lsl             r4,  r4,  #1 // r4 = 2*stride, since two rows are consumed per iteration
+        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride, in bytes
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             0f
+        // !LR_HAVE_RIGHT
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3
+        b               1f
+0:
+        add             lr,  r5,  #7
+        bic             lr,  lr,  #7
+1:
+        sub             r9,  r9,  lr, lsl #1 // r9 = per-iteration pointer adjustment for sum; applied doubled to sumsq
+
+        // Store the width for the vertical loop
+        mov             r8,  r5
+
+        // Subtract the number of pixels read from the input from the stride
+        add             lr,  r5,  #14
+        bic             lr,  lr,  #7
+        sub             r4,  r4,  lr
+
+        // With LR_HAVE_LEFT and left == NULL, the left edge pixels are read from src itself, so start 2 pixels earlier
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r2,  #0
+        bne             0f
+        // left == NULL
+        sub             r3,  r3,  #2
+        sub             r12, r12, #2
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL (shares the stride adjustment below)
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 2 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r4,  r4,  #2
+
+
+1:      // Loop vertically
+        vld1.8          {q0}, [r3]! // q0 = 16 pixels of the first row
+        vld1.8          {q4}, [r12]! // q4 = 16 pixels of the second row
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r2,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.32         {d3[]}, [r2]! // 4 left pixels for the first row, replicated into d3 (upper half of q1)
+        // Move r3/r12 back to account for the last 2 bytes we loaded earlier,
+        // which we'll shift out.
+        sub             r3,  r3,  #2
+        sub             r12, r12, #2
+        vld1.32         {d11[]}, [r2]! // 4 left pixels for the second row, replicated into d11 (upper half of q5)
+        vext.8          q0,  q1,  q0,  #14 // q0 = last 2 left pixels followed by the first 14 src pixels
+        vext.8          q4,  q5,  q4,  #14
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+        // and shift q0 to have 2x the first byte at the front.
+        vdup.8          q1,  d0[0]
+        vdup.8          q5,  d8[0]
+        // Move r3/r12 back to account for the last 2 bytes we loaded before,
+        // which we shifted out.
+        sub             r3,  r3,  #2
+        sub             r12, r12, #2
+        vext.8          q0,  q1,  q0,  #14
+        vext.8          q4,  q5,  q4,  #14
+
+2:
+        vmull.u8        q1,  d0,  d0 // q1/q2 = squares of the 16 first-row pixels in q0
+        vmull.u8        q2,  d1,  d1
+        vmull.u8        q5,  d8,  d8 // q5/q6 = squares of the 16 second-row pixels in q4
+        vmull.u8        q6,  d9,  d9
+
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that byte to pad with
+        // here since we can find it pretty easily from here.
+        sub             lr,  r5, #(2 + 16 - 2 + 1)
+        ldrb            r11, [r3,  lr] // r11 is borrowed as a temporary here
+        ldrb            lr,  [r12, lr]
+        // Fill q14/q15 with the right padding pixel
+        vdup.8          q14, r11
+        vdup.8          q15, lr
+        // Restore r11 after using it for a temporary value
+        add             r11, r1,  #(2*SUM_STRIDE)
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             r5,  #10
+        bge             4f   // If w >= 10, all used input pixels are valid
+        cmp             r5,  #6
+        bge             5f   // If w >= 6, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro vaddl_u16_n      dst1, dst2, src1, src2, src3, src4, w
+        vaddl.u16       \dst1,  \src1,  \src3 // widening add for the first 4 elements
+.if \w > 4
+        vaddl.u16       \dst2,  \src2,  \src4 // second 4 elements, only when producing 8
+.endif
+.endm
+.macro vaddw_u16_n      dst1, dst2, src1, src2, w
+        vaddw.u16       \dst1,  \dst1,  \src1 // accumulate 16 bit elements into the 32 bit sums
+.if \w > 4
+        vaddw.u16       \dst2,  \dst2,  \src2
+.endif
+.endm
+.macro vadd_i32_n       dst1, dst2, src1, src2, w
+        vadd.i32        \dst1,  \dst1,  \src1
+.if \w > 4
+        vadd.i32        \dst2,  \dst2,  \src2
+.endif
+.endm
+
+.macro add3 w
+        vext.8          d16, d0,  d1,  #1 // first row pixels, offset by 1 and 2
+        vext.8          d17, d0,  d1,  #2
+        vext.8          d18, d8,  d9,  #1 // second row pixels, offset by 1 and 2
+        vext.8          d19, d8,  d9,  #2
+        vaddl.u8        q3,  d0,  d16
+        vaddw.u8        q3,  q3,  d17 // q3 = 16 bit sums of 3 adjacent pixels, first row
+        vaddl.u8        q7,  d8,  d18
+        vaddw.u8        q7,  q7,  d19 // q7 = same for the second row
+
+        vext.8          q8,  q1,  q2,  #2 // first row squares, offset by 1 and 2 elements
+        vext.8          q9,  q1,  q2,  #4
+        vext.8          q10, q5,  q6,  #2 // second row squares, offset by 1 and 2 elements
+        vext.8          q11, q5,  q6,  #4
+
+        vaddl_u16_n     q12, q13, d2,  d3,  d16, d17, \w
+        vaddw_u16_n     q12, q13, d18, d19, \w // q12 (and q13 if w > 4) = 32 bit sums of squares, first row
+
+        vaddl_u16_n     q8,  q9,  d10, d11, d20, d21, \w
+        vaddw_u16_n     q8,  q9,  d22, d23, \w // q8 (and q9 if w > 4) = 32 bit sums of squares, second row
+.endm
+        add3            8
+        vst1.16         {q3},       [r1,  :128]!
+        vst1.16         {q7},       [r11, :128]!
+        vst1.32         {q12, q13}, [r0,  :128]!
+        vst1.32         {q8,  q9},  [r10, :128]!
+
+        subs            r5,  r5,  #8
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT; these flags are consumed by the bne after the NEON instructions below
+        vld1.8          {d6},  [r3]! // next 8 pixels of each row, into the low halves of q3/q7
+        vld1.8          {d14}, [r12]!
+        vmov            q1,  q2 // squares of the upper 8 pixels become the lower ones
+        vmov            q5,  q6
+        vext.8          q0,  q0,  q3,  #8 // q0 = previous upper 8 pixels followed by the 8 new ones
+        vext.8          q4,  q4,  q7,  #8
+        vmull.u8        q2,  d6,  d6 // squares of the newly loaded pixels
+        vmull.u8        q6,  d14, d14
+
+        bne             4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 6 <= w < 10
+        add3            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q8},  [r10, :128]!
+
+        subs            r5,  r5,  #4 // 2 <= w < 6
+        vext.8          q0,  q0,  q0,  #4 // rotate the pixels by 4 so the remaining ones start at byte 0
+        vext.8          q4,  q4,  q4,  #4
+
+6:      // Pad the right edge and produce the last few pixels.
+        // 2 <= w < 6, 2-5 pixels valid in q0
+        sub             lr,  r5,  #2
+        // lr = (pixels valid - 2), used as index into the table of relative offsets below
+        adr             r11, L(box3_variable_shift_tbl)
+        ldr             lr,  [r11, lr, lsl #2]
+        add             r11, r11, lr
+        bx              r11
+
+        .align 2
+L(box3_variable_shift_tbl):
+        .word 22f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+        .word 33f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+        .word 44f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+        .word 55f - L(box3_variable_shift_tbl) + CONFIG_THUMB
+
+        // Shift q0 right, shifting out invalid pixels,
+        // shift q0 left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        vext.8          q0,  q0,  q0,  #2
+        vext.8          q4,  q4,  q4,  #2
+        vext.8          q0,  q0,  q14, #14
+        vext.8          q4,  q4,  q15, #14
+        b               88f
+33:     // 3 pixels valid
+        vext.8          q0,  q0,  q0,  #3
+        vext.8          q4,  q4,  q4,  #3
+        vext.8          q0,  q0,  q14, #13
+        vext.8          q4,  q4,  q15, #13
+        b               88f
+44:     // 4 pixels valid
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q0,  q0,  q14, #12
+        vext.8          q4,  q4,  q15, #12
+        b               88f
+55:     // 5 pixels valid
+        vext.8          q0,  q0,  q0,  #5
+        vext.8          q4,  q4,  q4,  #5
+        vext.8          q0,  q0,  q14, #11
+        vext.8          q4,  q4,  q15, #11
+
+88:
+        // Restore r11 after using it for a temporary value above
+        add             r11, r1,  #(2*SUM_STRIDE)
+        vmull.u8        q1,  d0,  d0 // recompute the squares from the padded pixels
+        vmull.u8        q2,  d1,  d1
+        vmull.u8        q5,  d8,  d8
+        vmull.u8        q6,  d9,  d9
+
+        add3            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q8},  [r10, :128]!
+        subs            r5,  r5,  #4
+        ble             9f
+        vext.8          q0,  q0,  q0,  #4 // advance pixels and squares by 4 elements
+        vext.8          q1,  q1,  q2,  #8
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q5,  q5,  q6,  #8
+        // Only one needed pixel left, but do a normal 4 pixel
+        // addition anyway
+        add3            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q8},  [r10, :128]!
+
+9:
+        subs            r6,  r6,  #2 // two rows were processed
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r9, lsl #1 // sumsq pointers advance by twice the sum adjustment
+        add             r10, r10, r9, lsl #1
+        add             r1,  r1,  r9
+        add             r11, r11, r9
+        add             r3,  r3,  r4
+        add             r12, r12, r4
+        mov             r5,  r8 // restore the width for the next pair of rows
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
+//                            const pixel (*left)[4],
+//                            const pixel *src, const ptrdiff_t stride,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box5_h_neon, export=1 // Horizontal pass: sums (sum) and sums of squares (sumsq) of 5 adjacent pixels, two rows per iteration
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100] // r4 = stride, r5 = w (stack args, above the 36+64 bytes pushed above)
+        ldrd            r6,  r7,  [sp, #108] // r6 = h, r7 = edges
+        add             r5,  r5,  #2 // w += 2
+
+        // Set up pointers for reading/writing alternate rows (r0/r1/r3 handle the first row of each pair)
+        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq, second row
+        add             r11, r1,  #(2*SUM_STRIDE)   // sum, second row
+        add             r12, r3,  r4                // src, second row
+        lsl             r4,  r4,  #1 // r4 = 2*stride, since two rows are consumed per iteration
+        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride, in bytes
+
+        // Subtract the aligned width from the output stride.
+        // With LR_HAVE_RIGHT, align to 8, without it, align to 4.
+        // Subtract the number of pixels read from the input from the stride.
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             0f
+        // !LR_HAVE_RIGHT
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3
+        add             r8,  r5,  #13
+        b               1f
+0:
+        add             lr,  r5,  #7
+        bic             lr,  lr,  #7
+        add             r8,  r5,  #15
+1:
+        sub             r9,  r9,  lr, lsl #1 // r9 = per-iteration pointer adjustment for sum; applied doubled to sumsq
+        bic             r8,  r8,  #7 // r8 is a temporary here; it is overwritten with the width below
+        sub             r4,  r4,  r8
+
+        // Store the width for the vertical loop
+        mov             r8,  r5
+
+        // With LR_HAVE_LEFT and left == NULL, the left edge pixels are read from src itself, so start 3 pixels earlier
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r2,  #0
+        bne             0f
+        // left == NULL
+        sub             r3,  r3,  #3
+        sub             r12, r12, #3
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL (shares the stride adjustment below)
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r4,  r4,  #3
+
+1:      // Loop vertically
+        vld1.8          {q0}, [r3]! // q0 = 16 pixels of the first row
+        vld1.8          {q4}, [r12]! // q4 = 16 pixels of the second row
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r2,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.32         {d3[]}, [r2]! // 4 left pixels for the first row, replicated into d3 (upper half of q1)
+        // Move r3/r12 back to account for the last 3 bytes we loaded earlier,
+        // which we'll shift out.
+        sub             r3,  r3,  #3
+        sub             r12, r12, #3
+        vld1.32         {d11[]}, [r2]! // 4 left pixels for the second row, replicated into d11 (upper half of q5)
+        vext.8          q0,  q1,  q0,  #13 // q0 = last 3 left pixels followed by the first 13 src pixels
+        vext.8          q4,  q5,  q4,  #13
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+        // and shift q0 to have 3x the first byte at the front.
+        vdup.8          q1,  d0[0]
+        vdup.8          q5,  d8[0]
+        // Move r3/r12 back to account for the last 3 bytes we loaded before,
+        // which we shifted out.
+        sub             r3,  r3,  #3
+        sub             r12, r12, #3
+        vext.8          q0,  q1,  q0,  #13
+        vext.8          q4,  q5,  q4,  #13
+
+2:
+        vmull.u8        q1,  d0,  d0 // q1/q2 = squares of the 16 first-row pixels in q0
+        vmull.u8        q2,  d1,  d1
+        vmull.u8        q5,  d8,  d8 // q5/q6 = squares of the 16 second-row pixels in q4
+        vmull.u8        q6,  d9,  d9
+
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that byte to pad with
+        // here since we can find it pretty easily from here.
+        sub             lr,  r5, #(2 + 16 - 3 + 1)
+        ldrb            r11, [r3,  lr] // r11 is borrowed as a temporary here
+        ldrb            lr,  [r12, lr]
+        // Fill q14/q15 with the right padding pixel
+        vdup.8          q14, r11
+        vdup.8          q15, lr
+        // Restore r11 after using it for a temporary value
+        add             r11, r1,  #(2*SUM_STRIDE)
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             r5,  #11
+        bge             4f   // If w >= 11, all used input pixels are valid
+        cmp             r5,  #7
+        bge             5f   // If w >= 7, we can produce 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro add5 w
+        vext.8          d16, d0,  d1,  #1 // first row pixels, offset by 1 to 4
+        vext.8          d17, d0,  d1,  #2
+        vext.8          d18, d0,  d1,  #3
+        vext.8          d19, d0,  d1,  #4
+        vext.8          d20, d8,  d9,  #1 // second row pixels, offset by 1 to 4
+        vext.8          d21, d8,  d9,  #2
+        vext.8          d22, d8,  d9,  #3
+        vext.8          d23, d8,  d9,  #4
+        vaddl.u8        q3,  d0,  d16
+        vaddl.u8        q12, d17, d18
+        vaddl.u8        q7,  d8,  d20
+        vaddl.u8        q13, d21, d22
+        vaddw.u8        q3,  q3,  d19
+        vaddw.u8        q7,  q7,  d23
+        vadd.u16        q3,  q3,  q12 // q3 = 16 bit sums of 5 adjacent pixels, first row
+        vadd.u16        q7,  q7,  q13 // q7 = same for the second row
+
+        vext.8          q8,  q1,  q2,  #2 // first row squares, offset by 1 to 4 elements
+        vext.8          q9,  q1,  q2,  #4
+        vext.8          q10, q1,  q2,  #6
+        vext.8          q11, q1,  q2,  #8
+        vaddl_u16_n     q12, q13, d2,  d3,  d16, d17, \w
+        vaddl_u16_n     q8,  q9,  d18, d19, d20, d21, \w
+        vaddw_u16_n     q12, q13, d22, d23, \w
+        vadd_i32_n      q12, q13, q8,  q9, \w // q12 (and q13 if w > 4) = 32 bit sums of squares, first row
+        vext.8          q8,  q5,  q6,  #2 // second row squares, offset by 1 to 4 elements
+        vext.8          q9,  q5,  q6,  #4
+        vext.8          q10, q5,  q6,  #6
+        vext.8          q11, q5,  q6,  #8
+.if \w > 4
+        vaddl_u16_n     q1,  q5,  d10, d11, d16, d17, 8
+        vaddl_u16_n     q8,  q9,  d18, d19, d20, d21, 8
+        vaddw_u16_n     q1,  q5,  d22, d23, 8
+        vadd.i32        q10, q1,  q8 // q10/q11 = 32 bit sums of squares, second row
+        vadd.i32        q11, q5,  q9
+.else
+        // Can't clobber q1/q5 if only doing 4 pixels
+        vaddl.u16       q8,  d10, d16
+        vaddl.u16       q9,  d18, d20
+        vaddw.u16       q8,  q8,  d22
+        vadd.i32        q10, q8,  q9 // q10 = 32 bit sums of squares, second row
+.endif
+.endm
+        add5            8
+        vst1.16         {q3},       [r1,  :128]!
+        vst1.16         {q7},       [r11, :128]!
+        vst1.32         {q12, q13}, [r0,  :128]!
+        vst1.32         {q10, q11}, [r10, :128]!
+
+        subs            r5,  r5,  #8
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT; these flags are consumed by the bne after the NEON instructions below
+        vld1.8          {d6},  [r3]! // next 8 pixels of each row, into the low halves of q3/q7
+        vld1.8          {d14}, [r12]!
+        vmov            q1,  q2 // squares of the upper 8 pixels become the lower ones (q1/q5 were clobbered by add5 8)
+        vmov            q5,  q6
+        vext.8          q0,  q0,  q3,  #8 // q0 = previous upper 8 pixels followed by the 8 new ones
+        vext.8          q4,  q4,  q7,  #8
+        vmull.u8        q2,  d6,  d6 // squares of the newly loaded pixels
+        vmull.u8        q6,  d14, d14
+        bne             4b // If we don't need to pad, just keep summing.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Produce 4 pixels, 7 <= w < 11
+        add5            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q10}, [r10, :128]!
+
+        subs            r5,  r5,  #4 // 3 <= w < 7
+        vext.8          q0,  q0,  q0,  #4 // rotate the pixels by 4 so the remaining ones start at byte 0
+        vext.8          q4,  q4,  q4,  #4
+
+6:      // Pad the right edge and produce the last few pixels.
+        // w < 7, w+1 pixels valid in q0/q4
+        sub             lr,   r5,  #1
+        // lr = pixels valid - 2, used as index into the table of relative offsets below
+        adr             r11, L(box5_variable_shift_tbl)
+        ldr             lr,  [r11, lr, lsl #2]
+        add             r11, r11, lr
+        bx              r11
+
+        .align 2
+L(box5_variable_shift_tbl):
+        .word 22f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 33f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 44f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 55f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 66f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+        .word 77f - L(box5_variable_shift_tbl) + CONFIG_THUMB
+
+        // Shift q0 right, shifting out invalid pixels,
+        // shift q0 left to the original offset, shifting in padding pixels.
+22:     // 2 pixels valid
+        vext.8          q0,  q0,  q0,  #2
+        vext.8          q4,  q4,  q4,  #2
+        vext.8          q0,  q0,  q14, #14
+        vext.8          q4,  q4,  q15, #14
+        b               88f
+33:     // 3 pixels valid
+        vext.8          q0,  q0,  q0,  #3
+        vext.8          q4,  q4,  q4,  #3
+        vext.8          q0,  q0,  q14, #13
+        vext.8          q4,  q4,  q15, #13
+        b               88f
+44:     // 4 pixels valid
+        vext.8          q0,  q0,  q0,  #4
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q0,  q0,  q14, #12
+        vext.8          q4,  q4,  q15, #12
+        b               88f
+55:     // 5 pixels valid
+        vext.8          q0,  q0,  q0,  #5
+        vext.8          q4,  q4,  q4,  #5
+        vext.8          q0,  q0,  q14, #11
+        vext.8          q4,  q4,  q15, #11
+        b               88f
+66:     // 6 pixels valid
+        vext.8          q0,  q0,  q0,  #6
+        vext.8          q4,  q4,  q4,  #6
+        vext.8          q0,  q0,  q14, #10
+        vext.8          q4,  q4,  q15, #10
+        b               88f
+77:     // 7 pixels valid
+        vext.8          q0,  q0,  q0,  #7
+        vext.8          q4,  q4,  q4,  #7
+        vext.8          q0,  q0,  q14, #9
+        vext.8          q4,  q4,  q15, #9
+
+88:
+        // Restore r11 after using it for a temporary value above
+        add             r11, r1,  #(2*SUM_STRIDE)
+        vmull.u8        q1,  d0,  d0 // recompute the squares from the padded pixels
+        vmull.u8        q2,  d1,  d1
+        vmull.u8        q5,  d8,  d8
+        vmull.u8        q6,  d9,  d9
+
+        add5            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q10}, [r10, :128]!
+        subs            r5,  r5,  #4
+        ble             9f
+        vext.8          q0,  q0,  q0,  #4 // advance pixels and squares by 4 elements
+        vext.8          q1,  q1,  q2,  #8
+        vext.8          q4,  q4,  q4,  #4
+        vext.8          q5,  q5,  q6,  #8
+        add5            4
+        vst1.16         {d6},  [r1,  :64]!
+        vst1.16         {d14}, [r11, :64]!
+        vst1.32         {q12}, [r0,  :128]!
+        vst1.32         {q10}, [r10, :128]!
+
+9:
+        subs            r6,  r6,  #2 // two rows were processed
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r9, lsl #1 // sumsq pointers advance by twice the sum adjustment
+        add             r10, r10, r9, lsl #1
+        add             r1,  r1,  r9
+        add             r11, r11, r9
+        add             r3,  r3,  r4
+        add             r12, r12, r4
+        mov             r5,  r8 // restore the width for the next pair of rows
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.purgem add5
+endfunc
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box3_v_neon, export=1 // Vertical pass: replaces each sumsq/sum row with the sum of 3 vertically adjacent rows, 8 columns per slice
+        push            {r4-r9,lr}
+        ldr             r4,  [sp, #28] // r4 = edges (stack arg, above the 7 registers pushed above)
+        add             r12, r3,  #2 // Number of output rows to move back
+        mov             lr,  r3      // Number of input rows to move back
+        add             r2,  r2,  #2 // Actual summed width
+        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             r8,       #(2*SUM_STRIDE) // sum stride
+        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             r4,  #4 // LR_HAVE_TOP
+        beq             0f
+        // If have top, read from row -2.
+        sub             r5,  r0,  #(4*SUM_STRIDE) // r5 = sumsq read pointer
+        sub             r6,  r1,  #(2*SUM_STRIDE) // r6 = sum read pointer
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             r5,  r0,  #(4*SUM_STRIDE) // r5 = sumsq read pointer
+        add             r6,  r1,  #(2*SUM_STRIDE) // r6 = sum read pointer
+1:
+
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        beq             1f
+        // LR_HAVE_BOTTOM
+        add             r3,  r3,  #2  // Sum all h+2 lines with the main loop
+        add             lr,  lr,  #2
+1:
+        mov             r9,  r3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into q8-q13 and q0-q2 taking top
+        // padding into consideration.
+        tst             r4,  #4 // LR_HAVE_TOP; these flags are consumed by the beq after the two loads below
+        vld1.32         {q8,  q9},  [r5, :128], r7
+        vld1.16         {q0},       [r6, :128], r8
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.32         {q10, q11}, [r5, :128], r7
+        vld1.16         {q1},       [r6, :128], r8
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q10, q8 // replicate the first loaded row into the other two row slots
+        vmov            q11, q9
+        vmov            q1,  q0
+        vmov            q12, q8
+        vmov            q13, q9
+        vmov            q2,  q0
+
+3:
+        subs            r3,  r3,  #1 // these flags are consumed by the ble after the vmovs below
+.macro add3
+        vadd.i32        q8,  q8,  q10
+        vadd.i32        q9,  q9,  q11
+        vadd.i16        q0,  q0,  q1
+        vadd.i32        q8,  q8,  q12
+        vadd.i32        q9,  q9,  q13
+        vadd.i16        q0,  q0,  q2
+        vst1.32         {q8, q9}, [r0, :128], r7 // store the 3-row sums and advance the output pointers by one row
+        vst1.16         {q0},     [r1, :128], r8
+.endm
+        add3
+        vmov            q8,  q10 // rotate the row registers: middle row becomes the top one
+        vmov            q9,  q11
+        vmov            q0,  q1
+        vmov            q10, q12 // bottom row becomes the middle one
+        vmov            q11, q13
+        vmov            q1,  q2
+        ble             4f
+        vld1.32         {q12, q13}, [r5, :128], r7 // load the next bottom row
+        vld1.16         {q2},       [r6, :128], r8
+        b               3b
+
+4:
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        bne             5f
+        // !LR_HAVE_BOTTOM
+        // Produce two more rows, extending the already loaded rows.
+        add3
+        vmov            q8,  q10
+        vmov            q9,  q11
+        vmov            q0,  q1
+        add3
+
+5:      // End of one vertical slice.
+        subs            r2,  r2,  #8 // 8 columns were processed
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        mls             r5,  r7,  lr,  r5 // r5 -= sumsq stride * input rows
+        mls             r6,  r8,  lr,  r6 // r6 -= sum stride * input rows
+        // Output pointers
+        mls             r0,  r7,  r12, r0 // r0 -= sumsq stride * output rows
+        mls             r1,  r8,  r12, r1 // r1 -= sum stride * output rows
+        add             r0,  r0,  #32 // step 8 columns to the right: 32 bytes of sumsq, 16 bytes of sum
+        add             r1,  r1,  #16
+        add             r5,  r5,  #32
+        add             r6,  r6,  #16
+        mov             r3,  r9 // restore the row count for the next slice
+        b               1b
+
+0:
+        pop             {r4-r9,pc}
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box5_v_neon, export=1
+        push            {r4-r9,lr}
+        vpush           {q5-q7}
+        ldr             r4,  [sp, #76]
+        add             r12, r3,  #2 // Number of output rows to move back
+        mov             lr,  r3      // Number of input rows to move back
+        add             r2,  r2,  #8 // Actual summed width
+        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             r8,       #(2*SUM_STRIDE) // sum stride
+        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             r4,  #4 // LR_HAVE_TOP
+        beq             0f
+        // If have top, read from row -2.
+        sub             r5,  r0,  #(4*SUM_STRIDE)
+        sub             r6,  r1,  #(2*SUM_STRIDE)
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             r5,  r0,  #(4*SUM_STRIDE)
+        add             r6,  r1,  #(2*SUM_STRIDE)
+1:
+
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        beq             0f
+        // LR_HAVE_BOTTOM
+        add             r3,  r3,  #2  // Handle h+2 lines with the main loop
+        add             lr,  lr,  #2
+        b               1f
+0:
+        // !LR_HAVE_BOTTOM
+        sub             r3,  r3,  #1  // Handle h-1 lines with the main loop
+1:
+        mov             r9,  r3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into q6-q15 and q0-q3,q5 taking top
+        // padding into consideration.
+        tst             r4,  #4 // LR_HAVE_TOP
+        vld1.32         {q6,  q7},  [r5, :128], r7
+        vld1.16         {q0},       [r6, :128], r8
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.32         {q10, q11}, [r5, :128], r7
+        vld1.16         {q2},       [r6, :128], r8
+        vmov            q8,  q6
+        vmov            q9,  q7
+        vmov            q1,  q0
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q8,  q6
+        vmov            q9,  q7
+        vmov            q1,  q0
+        vmov            q10, q6
+        vmov            q11, q7
+        vmov            q2,  q0
+        vmov            q12, q6
+        vmov            q13, q7
+        vmov            q3,  q0
+
+3:
+        cmp             r3,  #0
+        beq             4f
+        vld1.32         {q14, q15}, [r5, :128], r7
+        vld1.16         {q5},       [r6, :128], r8
+
+3:
+        // Start of vertical loop
+        subs            r3,  r3,  #2
+.macro add5
+        vadd.i32        q6,  q6,  q8
+        vadd.i32        q7,  q7,  q9
+        vadd.i16        q0,  q0,  q1
+        vadd.i32        q6,  q6,  q10
+        vadd.i32        q7,  q7,  q11
+        vadd.i16        q0,  q0,  q2
+        vadd.i32        q6,  q6,  q12
+        vadd.i32        q7,  q7,  q13
+        vadd.i16        q0,  q0,  q3
+        vadd.i32        q6,  q6,  q14
+        vadd.i32        q7,  q7,  q15
+        vadd.i16        q0,  q0,  q5
+        vst1.32         {q6, q7}, [r0, :128], r7
+        vst1.16         {q0},     [r1, :128], r8
+.endm
+        add5
+.macro shift2
+        vmov            q6,  q10
+        vmov            q7,  q11
+        vmov            q0,  q2
+        vmov            q8,  q12
+        vmov            q9,  q13
+        vmov            q1,  q3
+        vmov            q10, q14
+        vmov            q11, q15
+        vmov            q2,  q5
+.endm
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        ble             5f
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        vld1.32         {q14, q15}, [r5, :128], r7
+        vld1.16         {q5},       [r6, :128], r8
+        b               3b
+
+4:
+        // h == 1, !LR_HAVE_BOTTOM.
+        // Pad the last row with the only content row, and add.
+        vmov            q14, q12
+        vmov            q15, q13
+        vmov            q5,  q3
+        add5
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        add5
+        b               6f
+
+5:
+        tst             r4,  #8 // LR_HAVE_BOTTOM
+        bne             6f
+        // !LR_HAVE_BOTTOM
+        cmp             r3,  #0
+        bne             5f
+        // The intended three edge rows left; output the one at h-2 and
+        // the past edge one at h.
+        vld1.32         {q12, q13}, [r5, :128], r7
+        vld1.16         {q3},       [r6, :128], r8
+        // Pad the past-edge row from the last content row.
+        vmov            q14, q12
+        vmov            q15, q13
+        vmov            q5,  q3
+        add5
+        shift2
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        // The last two rows are already padded properly here.
+        add5
+        b               6f
+
+5:
+        // r3 == -1, two rows left, output one.
+        // Pad the last two rows from the mid one.
+        vmov            q12, q10
+        vmov            q13, q11
+        vmov            q3,  q2
+        vmov            q14, q10
+        vmov            q15, q11
+        vmov            q5,  q2
+        add5
+        add             r0,  r0,  r7
+        add             r1,  r1,  r8
+        b               6f
+
+6:      // End of one vertical slice.
+        subs            r2,  r2,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        mls             r5,  r7,  lr,  r5
+        mls             r6,  r8,  lr,  r6
+        // Output pointers
+        mls             r0,  r7,  r12, r0
+        mls             r1,  r8,  r12, r1
+        add             r0,  r0,  #32
+        add             r1,  r1,  #16
+        add             r5,  r5,  #32
+        add             r6,  r6,  #16
+        mov             r3,  r9
+        b               1b
+
+0:
+        vpop            {q5-q7}
+        pop             {r4-r9,pc}
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength);
+function sgr_calc_ab1_neon, export=1
+        push            {r4-r5,lr}
+        vpush           {q4-q7}
+        mov             lr,  #SUM_STRIDE // row step handed to sgr_calc_ab_neon, in elements
+        movw            r5,  #455      // one_by_x multiplier for the shared body
+        vmov.i32        q15, #9        // n
+        ldr             r4,  [sp, #76] // strength: fifth argument, above the 12 + 64 pushed bytes
+        add             r3,  r3,  #2   // two more rows than h are processed
+        b               sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+        push            {r4-r5,lr}
+        vpush           {q4-q7}
+        vmov.i32        q15, #25       // n
+        mov             r5,  #164      // one_by_x multiplier for the shared body
+        mov             lr,  #(2*SUM_STRIDE) // row step in elements: two SUM_STRIDE rows at a time
+        ldr             r4,  [sp, #76] // strength: fifth argument, above the 12 + 64 pushed bytes
+        add             r3,  r3,  #3
+        asr             r3,  r3,  #1   // row count = (h + 3) >> 1; no branch, runs on into sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab_neon
+        movrel          r12, X(sgr_x_by_x)
+        vld1.8          {q8, q9}, [r12, :128]! // first 32 table bytes, r12 advances past them
+        vmov.i8         q11, #5        // bias taken off the table copy below; d22 adds it back
+        vmov.i8         d10, #55       // idx of last 5
+        vld1.8          {q10},    [r12, :128] // table bytes 32..47
+        vmov.i8         d11, #72       // idx of last 4
+        vmov.i8         d12, #101      // idx of last 3
+        vmov.i8         d13, #169      // idx of last 2
+        vmov.i8         d14, #254      // idx of last 1
+        vmov.i8         d15, #32       // elements consumed in first vtbl
+        add             r2,  r2,  #2   // w += 2
+        add             r12, r2,  #7
+        bic             r12, r12, #7   // aligned w
+        sub             r12, lr,  r12  // increment between rows
+        vmov.i16        q13, #256      // minuend for 256 - x
+        vdup.32         q12, r4        // s, the strength loaded by the entry points
+        vdup.32         q14, r5        // one_by_x
+        sub             r0,  r0,  #(4*(SUM_STRIDE)) // back up by SUM_STRIDE 32-bit elements
+        sub             r1,  r1,  #(2*(SUM_STRIDE)) // back up by SUM_STRIDE 16-bit elements
+        mov             r4,  r2        // backup of w
+        vsub.i8         q8,  q8,  q11  // registers now hold table[i] - 5
+        vsub.i8         q9,  q9,  q11
+        vsub.i8         q10, q10, q11
+1:      // 8 elements per iteration, a and b are rewritten in place
+        subs            r2,  r2,  #8   // sets the flags tested by bgt after the stores
+        vld1.32         {q0, q1}, [r0, :128] // a
+        vld1.16         {q2},     [r1, :128] // b
+        vmul.i32        q0,  q0,  q15  // a * n
+        vmul.i32        q1,  q1,  q15  // a * n
+        vmull.u16       q3,  d4,  d4   // b * b
+        vmull.u16       q4,  d5,  d5   // b * b
+        vqsub.u32       q0,  q0,  q3   // imax(a * n - b * b, 0)
+        vqsub.u32       q1,  q1,  q4   // imax(a * n - b * b, 0)
+        vmul.i32        q0,  q0,  q12  // p * s
+        vmul.i32        q1,  q1,  q12  // p * s
+        vqshrn.u32      d0,  q0,  #16
+        vqshrn.u32      d1,  q1,  #16
+        vqrshrn.u16     d0,  q0,  #4   // imin(z, 255)
+        // vtbl/vtbx yield table[z] - 5 for z < 48 and 0 above; the compares step 5 down to 0
+        vcgt.u8         d2,  d0,  d10  // = -1 if sgr_x_by_x[d0] < 5
+        vcgt.u8         d3,  d0,  d11  // = -1 if sgr_x_by_x[d0] < 4
+        vtbl.8          d1,  {q8, q9}, d0 // z in 0..31; lanes with larger z become 0
+        vcgt.u8         d6,  d0,  d12  // = -1 if sgr_x_by_x[d0] < 3
+        vsub.i8         d9,  d0,  d15  // indices for vtbx
+        vcgt.u8         d7,  d0,  d13  // = -1 if sgr_x_by_x[d0] < 2
+        vadd.i8         d2,  d2,  d3
+        vtbx.8          d1,  {q10}, d9 // z in 32..47; other lanes are left untouched
+        vcgt.u8         d8,  d0,  d14  // = -1 if sgr_x_by_x[d0] < 1
+        vadd.i8         d6,  d6,  d7
+        vadd.i8         d8,  d8,  d22  // + 5 (d22 is the low half of q11)
+        vadd.i8         d2,  d2,  d6
+        vadd.i8         d1,  d1,  d8
+        vadd.i8         d1,  d1,  d2
+        vmovl.u8        q0,  d1        // x
+
+        vmull.u16       q1,  d0,  d4   // x * BB[i]
+        vmull.u16       q2,  d1,  d5   // x * BB[i]
+        vmul.i32        q1,  q1,  q14  // x * BB[i] * sgr_one_by_x
+        vmul.i32        q2,  q2,  q14  // x * BB[i] * sgr_one_by_x
+        vrshr.s32       q1,  q1,  #12  // AA[i]
+        vrshr.s32       q2,  q2,  #12  // AA[i]
+        vsub.i16        q0,  q13, q0   // 256 - x
+
+        vst1.32         {q1, q2}, [r0, :128]! // AA replaces the 32-bit input
+        vst1.16         {q0},     [r1, :128]! // 256 - x replaces the 16-bit input
+        bgt             1b             // more columns left in this row
+
+        subs            r3,  r3,  #1   // one row done
+        ble             0f
+        add             r0,  r0,  r12, lsl #2 // row increment scaled to 32-bit elements
+        add             r1,  r1,  r12, lsl #1 // row increment scaled to 16-bit elements
+        mov             r2,  r4        // restore w for the next row
+        b               1b
+0:
+        vpop            {q4-q7}        // undo the pushes done by sgr_calc_ab1/ab2_neon
+        pop             {r4-r5,pc}
+endfunc
+
+#define FILTER_OUT_STRIDE 384
+
+// void dav1d_sgr_finish_filter1_neon(coef *tmp,
+//                                    const pixel *src, const ptrdiff_t stride,
+//                                    const int32_t *a, const int16_t *b,
+//                                    const int w, const int h);
+function sgr_finish_filter1_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100] // r4 = b, r5 = w (36 + 64 bytes pushed above)
+        ldr             r6,  [sp, #108]      // r6 = h
+        sub             r7,  r3,  #(4*SUM_STRIDE) // 32-bit input, SUM_STRIDE elements back (-stride row)
+        add             r8,  r3,  #(4*SUM_STRIDE) // 32-bit input, SUM_STRIDE elements ahead (+stride row)
+        sub             r9,  r4,  #(2*SUM_STRIDE) // 16-bit input, -stride row
+        add             r10, r4,  #(2*SUM_STRIDE) // 16-bit input, +stride row
+        mov             r11, #SUM_STRIDE
+        mov             r12, #FILTER_OUT_STRIDE
+        add             lr,  r5,  #3
+        bic             lr,  lr,  #3 // Aligned width
+        sub             r2,  r2,  lr   // src: stride minus the bytes read per row
+        sub             r12, r12, lr   // tmp: elements to skip at the end of a row
+        sub             r11, r11, lr   // 16/32-bit inputs: elements to skip at the end of a row
+        sub             r11, r11, #4 // We read 4 extra elements from both a and b
+        mov             lr,  r5        // backup of w
+        vmov.i16        q14, #3
+        vmov.i32        q15, #3
+1:      // Row start: preload 8 elements from each of the three 16-bit and 32-bit rows
+        vld1.16         {q0},       [r9]!
+        vld1.16         {q1},       [r4]!
+        vld1.16         {q2},       [r10]!
+        vld1.32         {q8,  q9},  [r7]!
+        vld1.32         {q10, q11}, [r3]!
+        vld1.32         {q12, q13}, [r8]!
+
+2:      // 4 output elements per iteration
+        subs            r5,  r5,  #4       // flags are tested by the ble after the store
+        vext.8          d6,  d0,  d1,  #2  // -stride
+        vext.8          d7,  d2,  d3,  #2  // 0
+        vext.8          d8,  d4,  d5,  #2  // +stride
+        vext.8          d9,  d0,  d1,  #4  // +1-stride
+        vext.8          d10, d2,  d3,  #4  // +1
+        vext.8          d11, d4,  d5,  #4  // +1+stride
+        vadd.i16        d2,  d2,  d6       // -1, -stride
+        vadd.i16        d7,  d7,  d8       // 0, +stride
+        vadd.i16        d0,  d0,  d9       // -1-stride, +1-stride
+        vadd.i16        d2,  d2,  d7       // -1, -stride, 0, +stride
+        vadd.i16        d4,  d4,  d11      // -1+stride, +1+stride
+        vadd.i16        d2,  d2,  d10      // +1
+        vadd.i16        d0,  d0,  d4       // all four diagonal neighbours
+
+        vext.8          q3,  q8,  q9,  #4  // -stride
+        vshl.i16        d2,  d2,  #2       // * 4
+        vext.8          q4,  q8,  q9,  #8  // +1-stride
+        vext.8          q5,  q10, q11, #4  // 0
+        vext.8          q6,  q10, q11, #8  // +1
+        vmla.i16        d2,  d0,  d28      // * 3 -> a
+        vadd.i32        q3,  q3,  q10      // -stride, -1
+        vadd.i32        q8,  q8,  q4       // -1-stride, +1-stride
+        vadd.i32        q5,  q5,  q6       // 0, +1
+        vadd.i32        q8,  q8,  q12      // -1+stride
+        vadd.i32        q3,  q3,  q5
+        vext.8          q7,  q12, q13, #4  // +stride
+        vext.8          q10, q12, q13, #8  // +1+stride
+        vld1.32         {d24[0]}, [r1]!    // src
+        vadd.i32        q3,  q3,  q7       // +stride
+        vadd.i32        q8,  q8,  q10      // +1+stride
+        vshl.i32        q3,  q3,  #2       // * 4
+        vmla.i32        q3,  q8,  q15      // * 3 -> b
+        vmovl.u8        q12, d24           // src
+        vmov            d0,  d1            // slide the 16-bit -stride window on by 4 elements
+        vmlal.u16       q3,  d2,  d24      // b + a * src
+        vmov            d2,  d3            // slide the 16-bit centre window
+        vrshrn.i32      d6,  q3,  #9       // rounding shift, narrowed to 16 bit
+        vmov            d4,  d5            // slide the 16-bit +stride window
+        vst1.16         {d6}, [r0]!        // 4 results to tmp
+
+        ble             3f                 // row finished
+        vmov            q8,  q9            // slide the 32-bit windows on by 4 elements
+        vmov            q10, q11
+        vmov            q12, q13
+        vld1.16         {d1},  [r9]!       // refill the upper halves with 4 new elements each
+        vld1.16         {d3},  [r4]!
+        vld1.16         {d5},  [r10]!
+        vld1.32         {q9},  [r7]!
+        vld1.32         {q11}, [r3]!
+        vld1.32         {q13}, [r8]!
+        b               2b
+
+3:      // End of a row
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr            // restore w
+        add             r0,  r0,  r12, lsl #1 // tmp is 16 bit per element
+        add             r1,  r1,  r2
+        add             r3,  r3,  r11, lsl #2 // 32-bit rows
+        add             r7,  r7,  r11, lsl #2
+        add             r8,  r8,  r11, lsl #2
+        add             r4,  r4,  r11, lsl #1 // 16-bit rows
+        add             r9,  r9,  r11, lsl #1
+        add             r10, r10, r11, lsl #1
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_finish_filter2_neon(coef *tmp,
+//                                    const pixel *src, const ptrdiff_t stride,
+//                                    const int32_t *a, const int16_t *b,
+//                                    const int w, const int h);
+function sgr_finish_filter2_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        ldrd            r4,  r5,  [sp, #100] // r4 = b, r5 = w (36 + 64 bytes pushed above)
+        ldr             r6,  [sp, #108]      // r6 = h
+        add             r7,  r3,  #(4*(SUM_STRIDE)) // 32-bit input, +stride row
+        sub             r3,  r3,  #(4*(SUM_STRIDE)) // 32-bit input, -stride row
+        add             r8,  r4,  #(2*(SUM_STRIDE)) // 16-bit input, +stride row
+        sub             r4,  r4,  #(2*(SUM_STRIDE)) // 16-bit input, -stride row
+        mov             r9,  #(2*SUM_STRIDE)
+        mov             r10, #FILTER_OUT_STRIDE
+        add             r11, r5,  #7
+        bic             r11, r11, #7 // Aligned width
+        sub             r2,  r2,  r11  // src: stride minus the bytes read per row
+        sub             r10, r10, r11  // tmp: elements to skip at the end of a row
+        sub             r9,  r9,  r11  // inputs: elements to skip to land two rows further down
+        sub             r9,  r9,  #4 // We read 4 extra elements from a
+        sub             r12, r9,  #4 // We read 8 extra elements from b
+        mov             lr,  r5        // backup of w
+
+1:      // First row of a pair: uses the input rows at -stride and +stride
+        vld1.16         {q0,  q1},  [r4]!
+        vld1.16         {q2,  q3},  [r8]!
+        vld1.32         {q8,  q9},  [r3]!
+        vld1.32         {q11, q12}, [r7]!
+        vld1.32         {q10},      [r3]!
+        vld1.32         {q13},      [r7]!
+
+2:      // 8 output elements per iteration
+        vmov.i16        q14, #5            // reloaded each pass: q14/q15 become 32-bit constants below
+        vmov.i16        q15, #6
+        subs            r5,  r5,  #8       // flags are tested by the ble after the store
+        vext.8          q4,  q0,  q1,  #4  // +1-stride
+        vext.8          q5,  q2,  q3,  #4  // +1+stride
+        vext.8          q6,  q0,  q1,  #2  // -stride
+        vext.8          q7,  q2,  q3,  #2  // +stride
+        vadd.i16        q0,  q0,  q4       // -1-stride, +1-stride
+        vadd.i16        q5,  q2,  q5       // -1+stride, +1+stride
+        vadd.i16        q2,  q6,  q7       // -stride, +stride
+        vadd.i16        q0,  q0,  q5
+
+        vext.8          q4,  q8,  q9,  #8  // +1-stride
+        vext.8          q5,  q9,  q10, #8
+        vext.8          q6,  q11, q12, #8  // +1+stride
+        vext.8          q7,  q12, q13, #8
+        vmul.i16        q0,  q0,  q14      // * 5
+        vmla.i16        q0,  q2,  q15      // * 6
+        vadd.i32        q4,  q4,  q8       // -1-stride, +1-stride
+        vadd.i32        q5,  q5,  q9
+        vadd.i32        q6,  q6,  q11      // -1+stride, +1+stride
+        vadd.i32        q7,  q7,  q12
+        vadd.i32        q4,  q4,  q6
+        vadd.i32        q5,  q5,  q7
+        vext.8          q6,  q8,  q9,  #4  // -stride
+        vext.8          q7,  q9,  q10, #4
+        vext.8          q8,  q11, q12, #4  // +stride
+        vext.8          q11, q12, q13, #4
+
+        vld1.8          {d4}, [r1]!        // 8 src pixels
+
+        vmov.i32        q14, #5            // 32-bit versions of the weights
+        vmov.i32        q15, #6
+
+        vadd.i32        q6,  q6,  q8       // -stride, +stride
+        vadd.i32        q7,  q7,  q11
+        vmul.i32        q4,  q4,  q14      // * 5
+        vmla.i32        q4,  q6,  q15      // * 6
+        vmul.i32        q5,  q5,  q14      // * 5
+        vmla.i32        q5,  q7,  q15      // * 6
+
+        vmovl.u8        q2,  d4
+        vmlal.u16       q4,  d0,  d4       // b + a * src
+        vmlal.u16       q5,  d1,  d5       // b + a * src
+        vmov            q0,  q1            // slide the 16-bit -stride window on by 8 elements
+        vrshrn.i32      d8,  q4,  #9       // weights add up to 32 here (4 * 5 + 2 * 6)
+        vrshrn.i32      d9,  q5,  #9
+        vmov            q2,  q3            // slide the 16-bit +stride window
+        vst1.16         {q4}, [r0]!
+
+        ble             3f                 // row finished
+        vmov            q8,  q10           // slide the 32-bit windows on by 8 elements
+        vmov            q11, q13
+        vld1.16         {q1},       [r4]!
+        vld1.16         {q3},       [r8]!
+        vld1.32         {q9,  q10}, [r3]!
+        vld1.32         {q12, q13}, [r7]!
+        b               2b
+
+3:      // Second row of a pair: filters horizontally within a single input row
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr            // restore w
+        add             r0,  r0,  r10, lsl #1
+        add             r1,  r1,  r2
+        add             r3,  r3,  r9,  lsl #2 // both 32-bit pointers move two input rows down
+        add             r7,  r7,  r9,  lsl #2
+        add             r4,  r4,  r12, lsl #1 // likewise for the 16-bit pointers
+        add             r8,  r8,  r12, lsl #1
+
+        vld1.32         {q8, q9}, [r3]!
+        vld1.16         {q0, q1}, [r4]!
+        vld1.32         {q10},    [r3]!
+
+        vmov.i16        q12, #5            // 16-bit weights; q14/q15 still hold the 32-bit 5 and 6
+        vmov.i16        q13, #6
+
+4:      // 8 output elements per iteration
+        subs            r5,  r5,  #8
+        vext.8          q3,  q0,  q1,  #4  // +1
+        vext.8          q2,  q0,  q1,  #2  // 0
+        vadd.i16        q0,  q0,  q3       // -1, +1
+
+        vext.8          q4,  q8,  q9,  #4  // 0
+        vext.8          q5,  q9,  q10, #4
+        vext.8          q6,  q8,  q9,  #8  // +1
+        vext.8          q7,  q9,  q10, #8
+        vmul.i16        q2,  q2,  q13      // * 6
+        vmla.i16        q2,  q0,  q12      // * 5 -> a
+        vld1.8          {d22}, [r1]!
+        vadd.i32        q8,  q8,  q6       // -1, +1
+        vadd.i32        q9,  q9,  q7
+        vmovl.u8        q11, d22
+        vmul.i32        q4,  q4,  q15      // * 6
+        vmla.i32        q4,  q8,  q14      // * 5 -> b
+        vmul.i32        q5,  q5,  q15      // * 6
+        vmla.i32        q5,  q9,  q14      // * 5 -> b
+
+        vmlal.u16       q4,  d4,  d22      // b + a * src
+        vmlal.u16       q5,  d5,  d23
+        vmov            q0,  q1            // slide the 16-bit window on by 8 elements
+        vrshrn.i32      d8,  q4,  #8       // weights add up to 16 here (2 * 5 + 6)
+        vrshrn.i32      d9,  q5,  #8
+        vmov            q8,  q10           // slide the 32-bit window on by 8 elements
+        vst1.16         {q4}, [r0]!
+
+        ble             5f                 // row finished
+        vld1.16         {q1},      [r4]!
+        vld1.32         {q9, q10}, [r3]!
+        b               4b
+
+5:      // End of a row pair
+        subs            r6,  r6,  #1
+        ble             0f
+        mov             r5,  lr            // restore w
+        sub             r3,  r3,  r11, lsl #2 // Rewind r3/r4 to where they started
+        sub             r4,  r4,  r11, lsl #1
+        add             r0,  r0,  r10, lsl #1
+        add             r1,  r1,  r2
+        sub             r3,  r3,  #16      // the 4 extra 32-bit elements read past the aligned width
+        sub             r4,  r4,  #16      // the 8 extra 16-bit elements read past the aligned width
+        b               1b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
+//                               const pixel *src, const ptrdiff_t src_stride,
+//                               const coef *t1, const int w, const int h,
+//                               const int wt);
+function sgr_weighted1_neon, export=1
+        push            {r4-r9,lr}
+        ldrd            r4,  r5,  [sp, #28] // r4 = t1, r5 = w (28 bytes pushed above)
+        ldrd            r6,  r7,  [sp, #36] // r6 = h, r7 = wt
+        ldr             r8,  [sp, #44] // NOTE(review): r8 is rewritten below before any read and no argument is declared at this slot; looks like a dead load
+        vdup.16         d31, r7        // wt in every 16-bit lane
+        cmp             r6,  #2        // flags are consumed by the blt before label 1
+        add             r9,  r0,  r1   // dst, second row of the pair
+        add             r12, r2,  r3   // src, second row of the pair
+        add             lr,  r4,  #2*FILTER_OUT_STRIDE // t1, second row of the pair
+        mov             r7,  #(4*FILTER_OUT_STRIDE)    // two t1 rows in bytes
+        lsl             r1,  r1,  #1   // pointers advance two rows at a time
+        lsl             r3,  r3,  #1
+        add             r8,  r5,  #7
+        bic             r8,  r8,  #7 // Aligned width
+        sub             r1,  r1,  r8   // minus the bytes written per row
+        sub             r3,  r3,  r8   // minus the bytes read per row
+        sub             r7,  r7,  r8, lsl #1 // minus the t1 bytes read per row
+        mov             r8,  r5        // backup of w
+        blt             2f             // h < 2: only the single-row loop runs
+1:      // Two rows at a time, 8 pixels per row and iteration
+        vld1.8          {d0},  [r2]!
+        vld1.8          {d16}, [r12]!
+        vld1.16         {q1},  [r4]!
+        vld1.16         {q9},  [lr]!
+        subs            r5,  r5,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vshll.u8        q8,  d16, #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q9,  q9,  q8     // t1 - u
+        vshll.u16       q2,  d0,  #7     // u << 7
+        vshll.u16       q3,  d1,  #7     // u << 7
+        vshll.u16       q10, d16, #7     // u << 7
+        vshll.u16       q11, d17, #7     // u << 7
+        vmlal.s16       q2,  d2,  d31    // v
+        vmlal.s16       q3,  d3,  d31    // v
+        vmlal.s16       q10, d18, d31    // v
+        vmlal.s16       q11, d19, d31    // v
+        vrshrn.i32      d4,  q2,  #11    // rounding shift, narrowed to 16 bit
+        vrshrn.i32      d5,  q3,  #11
+        vrshrn.i32      d20, q10, #11
+        vrshrn.i32      d21, q11, #11
+        vqmovun.s16     d4,  q2          // saturate to unsigned 8 bit
+        vqmovun.s16     d20, q10
+        vst1.8          {d4},  [r0]!
+        vst1.8          {d20}, [r9]!
+        bgt             1b               // more columns in this row pair
+
+        sub             r6,  r6,  #2     // two rows done
+        cmp             r6,  #1
+        blt             0f               // nothing left
+        mov             r5,  r8          // restore w
+        add             r0,  r0,  r1
+        add             r9,  r9,  r1
+        add             r2,  r2,  r3
+        add             r12, r12, r3
+        add             r4,  r4,  r7
+        add             lr,  lr,  r7
+        beq             2f               // exactly one row left (flags still from the cmp above)
+        b               1b
+
+2:      // Single remaining row, 8 pixels per iteration
+        vld1.8          {d0}, [r2]!
+        vld1.16         {q1}, [r4]!
+        subs            r5,  r5,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vshll.u16       q2,  d0,  #7     // u << 7
+        vshll.u16       q3,  d1,  #7     // u << 7
+        vmlal.s16       q2,  d2,  d31    // v
+        vmlal.s16       q3,  d3,  d31    // v
+        vrshrn.i32      d4,  q2,  #11
+        vrshrn.i32      d5,  q3,  #11
+        vqmovun.s16     d2,  q2
+        vst1.8          {d2}, [r0]!
+        bgt             2b               // stay in the single-row loop
+0:
+        pop             {r4-r9,pc}
+endfunc
+
+// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *src, const ptrdiff_t src_stride,
+//                               const coef *t1, const coef *t2,
+//                               const int w, const int h,
+//                               const int16_t wt[2]);
+function sgr_weighted2_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36] // r4 = t1, r5 = t2 (36 bytes pushed above)
+        ldrd            r6,  r7,  [sp, #44] // r6 = w, r7 = h
+        ldr             r8,  [sp, #52]      // r8 = wt
+        cmp             r7,  #2        // flags are consumed by the blt before label 1
+        add             r10, r0,  r1   // dst, second row of the pair
+        add             r11, r2,  r3   // src, second row of the pair
+        add             r12, r4,  #2*FILTER_OUT_STRIDE // t1, second row of the pair
+        add             lr,  r5,  #2*FILTER_OUT_STRIDE // t2, second row of the pair
+        vld2.16         {d30[], d31[]}, [r8] // wt[0], wt[1]
+        mov             r8,  #4*FILTER_OUT_STRIDE // two t1/t2 rows in bytes
+        lsl             r1,  r1,  #1   // pointers advance two rows at a time
+        lsl             r3,  r3,  #1
+        add             r9,  r6,  #7
+        bic             r9,  r9,  #7 // Aligned width
+        sub             r1,  r1,  r9   // minus the bytes written per row
+        sub             r3,  r3,  r9   // minus the bytes read per row
+        sub             r8,  r8,  r9, lsl #1 // minus the t1/t2 bytes read per row
+        mov             r9,  r6        // backup of w
+        blt             2f             // h < 2: only the single-row loop runs
+1:      // Two rows at a time, 8 pixels per row and iteration
+        vld1.8          {d0},  [r2]!
+        vld1.8          {d16}, [r11]!
+        vld1.16         {q1},  [r4]!
+        vld1.16         {q9},  [r12]!
+        vld1.16         {q2},  [r5]!
+        vld1.16         {q10}, [lr]!
+        subs            r6,  r6,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vshll.u8        q8,  d16, #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q2,  q2,  q0     // t2 - u
+        vsub.i16        q9,  q9,  q8     // t1 - u
+        vsub.i16        q10, q10, q8     // t2 - u
+        vshll.u16       q3,  d0,  #7     // u << 7
+        vshll.u16       q0,  d1,  #7     // u << 7
+        vshll.u16       q11, d16, #7     // u << 7
+        vshll.u16       q8,  d17, #7     // u << 7
+        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q11, d18, d30    // wt[0] * (t1 - u)
+        vmlal.s16       q11, d20, d31    // wt[1] * (t2 - u)
+        vmlal.s16       q8,  d19, d30    // wt[0] * (t1 - u)
+        vmlal.s16       q8,  d21, d31    // wt[1] * (t2 - u)
+        vrshrn.i32      d6,  q3,  #11    // rounding shift, narrowed to 16 bit
+        vrshrn.i32      d7,  q0,  #11
+        vrshrn.i32      d22, q11, #11
+        vrshrn.i32      d23, q8,  #11
+        vqmovun.s16     d6,  q3          // saturate to unsigned 8 bit
+        vqmovun.s16     d22, q11
+        vst1.8          {d6},  [r0]!
+        vst1.8          {d22}, [r10]!
+        bgt             1b               // more columns in this row pair
+
+        subs            r7,  r7,  #2     // two rows done
+        cmp             r7,  #1
+        blt             0f               // nothing left
+        mov             r6,  r9          // restore w
+        add             r0,  r0,  r1
+        add             r10, r10, r1
+        add             r2,  r2,  r3
+        add             r11, r11, r3
+        add             r4,  r4,  r8
+        add             r12, r12, r8
+        add             r5,  r5,  r8
+        add             lr,  lr,  r8
+        beq             2f               // exactly one row left (flags still from the cmp above)
+        b               1b
+
+2:      // Single remaining row, 8 pixels per iteration
+        vld1.8          {d0}, [r2]!
+        vld1.16         {q1}, [r4]!
+        vld1.16         {q2}, [r5]!
+        subs            r6,  r6,  #8
+        vshll.u8        q0,  d0,  #4     // u
+        vsub.i16        q1,  q1,  q0     // t1 - u
+        vsub.i16        q2,  q2,  q0     // t2 - u
+        vshll.u16       q3,  d0,  #7     // u << 7
+        vshll.u16       q0,  d1,  #7     // u << 7
+        vmlal.s16       q3,  d2,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q3,  d4,  d31    // wt[1] * (t2 - u)
+        vmlal.s16       q0,  d3,  d30    // wt[0] * (t1 - u)
+        vmlal.s16       q0,  d5,  d31    // wt[1] * (t2 - u)
+        vrshrn.i32      d6,  q3,  #11
+        vrshrn.i32      d7,  q0,  #11
+        vqmovun.s16     d6,  q3
+        vst1.8          {d6}, [r0]!
+        bgt             2b               // stay in the single-row loop; label 1 would also store a second row via r10
+0:
+        pop             {r4-r11,pc}
 endfunc
--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -91,7 +91,6 @@
     }
 }
 
-#if ARCH_AARCH64
 void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
                            const pixel (*left)[4],
                            const pixel *src, const ptrdiff_t stride,
@@ -253,7 +252,6 @@
         }
     }
 }
-#endif // ARCH_AARCH64
 #endif // BITDEPTH == 8
 
 COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
@@ -263,8 +261,6 @@
 
 #if BITDEPTH == 8
     c->wiener = wiener_filter_neon;
-#if ARCH_AARCH64
     c->selfguided = sgr_filter_neon;
-#endif
 #endif
 }
--- a/src/tables.c
+++ b/src/tables.c
@@ -406,7 +406,7 @@
     { 2, 0,  22,   -1 },
 };
 
-const uint8_t dav1d_sgr_x_by_x[256] = {
+const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {
     255, 128,  85,  64,  51,  43,  37,  32,  28,  26,  23,  21,  20,  18,  17,
      16,  15,  14,  13,  13,  12,  12,  11,  11,  10,  10,   9,   9,   9,   9,
       8,   8,   8,   8,   7,   7,   7,   7,   7,   6,   6,   6,   6,   6,   6,