shithub: dav1d

--- a/src/arm/64/looprestoration.S

+++ b/src/arm/64/looprestoration.S

@@ -26,6 +26,7 @@

*/

 #include "src/arm/asm.S"

+#include "util.S"

 // void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],

 //                                 const pixel *src, ptrdiff_t stride,

@@ -612,4 +613,1375 @@

         .hword L(copy_narrow_tbl) - 50b

         .hword L(copy_narrow_tbl) - 60b

         .hword L(copy_narrow_tbl) - 70b

+endfunc

+// Row stride, in elements, of the sumsq/sum buffers used by the sgr_box*

+// functions below: rows are 4*SUM_STRIDE bytes apart in sumsq and

+// 2*SUM_STRIDE bytes apart in sum.

+#define SUM_STRIDE (384+16)

+// void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,

+//                            const pixel (*left)[4],

+//                            const pixel *src, const ptrdiff_t stride,

+//                            const int w, const int h,

+//                            const enum LrEdgeFlags edges);

+//

+// Horizontal pass of the 3-tap box sums. Two source rows are processed per

+// iteration of the vertical loop. For each output position, the sum of three

+// horizontally adjacent pixels is stored as 16 bit to sum, and the sum of

+// their squares is stored as 32 bit to sumsq.

+//

+// Arguments, in prototype order:

+//   x0 = sumsq, x1 = sum, x2 = left, x3 = src, x4 = stride,

+//   w5 = w, w6 = h, w7 = edges

+// Register use:

+//   x10/x11/x12 = sumsq/sum/src pointers for the second row of the pair

+//   x9          = remaining byte step to the next row pair in sum

+//                 (applied shifted left by one for sumsq)

+//   x4          = remaining byte step to the next row pair in src

+//   w8          = backup of w+2, reloaded into w5 for each row pair

+//   v0/v4       = pixels of the first/second row

+//   v1:v2/v5:v6 = squares of the pixels in v0/v4

+//   v30/v31     = right edge padding pixel of the first/second row

+function sgr_box3_h_neon, export=1

+        add             w5,  w5,  #2 // w += 2

+        // Set up pointers for reading/writing alternate rows

+        add             x10, x0,  #(4*SUM_STRIDE)   // sumsq

+        add             x11, x1,  #(2*SUM_STRIDE)   // sum

+        add             x12, x3,  x4                // src

+        lsl             x4,  x4,  #1

+        mov             x9,       #(2*2*SUM_STRIDE) // double sum stride

+        // Subtract the aligned width from the output stride.

+        // With LR_HAVE_RIGHT, align to 8, without it, align to 4.

+        tst             w7,  #2 // LR_HAVE_RIGHT

+        b.ne            0f

+        // !LR_HAVE_RIGHT

+        add             w13, w5,  #3

+        bic             w13, w13, #3

+        b               1f

+0:

+        add             w13, w5,  #7

+        bic             w13, w13, #7

+1:

+        sub             x9,  x9,  w13, uxtw #1

+        // Store the width for the vertical loop

+        mov             w8,  w5

+        // Subtract the number of pixels read from the input from the stride

+        add             w13, w5,  #14

+        bic             w13, w13, #7

+        sub             x4,  x4,  w13, uxtw

+        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL

+        tst             w7,  #1 // LR_HAVE_LEFT

+        b.eq            2f

+        // LR_HAVE_LEFT

+        cbnz            x2,  0f

+        // left == NULL

+        sub             x3,  x3,  #2

+        sub             x12, x12, #2

+        b               1f

+0:      // LR_HAVE_LEFT, left != NULL

+2:      // !LR_HAVE_LEFT, increase the stride.

+        // For this case we don't read the left 2 pixels from the src pointer,

+        // but shift it as if we had done that.

+        add             x4,  x4,  #2

+1:      // Loop vertically

+        ld1             {v0.16b},  [x3],  #16

+        ld1             {v4.16b},  [x12], #16

+        tst             w7,  #1 // LR_HAVE_LEFT

+        b.eq            0f

+        cbz             x2,  2f

+        // LR_HAVE_LEFT, left != NULL

+        ld1             {v1.s}[3],  [x2], #4

+        // Move x3/x12 back to account for the last 2 bytes we loaded earlier,

+        // which we'll shift out.

+        sub             x3,  x3,  #2

+        sub             x12, x12, #2

+        ld1             {v5.s}[3],  [x2], #4

+        ext             v0.16b, v1.16b, v0.16b, #14

+        ext             v4.16b, v5.16b, v4.16b, #14

+        b               2f

+0:

+        // !LR_HAVE_LEFT, fill v1 with the leftmost byte

+        // and shift v0 to have 2x the first byte at the front.

+        dup             v1.16b, v0.b[0]

+        dup             v5.16b, v4.b[0]

+        // Move x3 back to account for the last 2 bytes we loaded before,

+        // which we shifted out.

+        sub             x3,  x3,  #2

+        sub             x12, x12, #2

+        ext             v0.16b, v1.16b, v0.16b, #14

+        ext             v4.16b, v5.16b, v4.16b, #14

+2:

+        // Square the 16 pixels of each row: v1:v2 = v0^2, v5:v6 = v4^2.

+        umull           v1.8h,   v0.8b,   v0.8b

+        umull2          v2.8h,   v0.16b,  v0.16b

+        umull           v5.8h,   v4.8b,   v4.8b

+        umull2          v6.8h,   v4.16b,  v4.16b

+        tst             w7,  #2 // LR_HAVE_RIGHT

+        b.ne            4f

+        // If we'll need to pad the right edge, load that byte to pad with

+        // here since we can find it pretty easily from here.

+        sub             w13, w5, #(2 + 16 - 2 + 1)

+        ldr             b30, [x3,  w13, sxtw]

+        ldr             b31, [x12, w13, sxtw]

+        // Fill v30/v31 with the right padding pixel

+        dup             v30.8b,  v30.b[0]

+        dup             v31.8b,  v31.b[0]

+3:      // !LR_HAVE_RIGHT

+        // If we'll have to pad the right edge we need to quit early here.

+        cmp             w5,  #10

+        b.ge            4f   // If w >= 10, all used input pixels are valid

+        cmp             w5,  #6

+        b.ge            5f   // If w >= 6, we can filter 4 pixels

+        b               6f

+4:      // Loop horizontally

+// Widening add of two vectors of 16 bit elements into 32 bit elements.

+// The upper half (dst2) is only produced when w > 4.

+.macro uaddl_nh         dst1, dst2, src1, src2, w

+        uaddl           \dst1,  \src1\().4h,  \src2\().4h

+.if \w > 4

+        uaddl2          \dst2,  \src1\().8h,  \src2\().8h

+.endif

+.endm

+// Widening accumulate of 16 bit elements onto 32 bit elements.

+// The upper half (dst2) is only updated when w > 4.

+.macro uaddw_nh         dst1, dst2, src, w

+        uaddw           \dst1,  \dst1,  \src\().4h

+.if \w > 4

+        uaddw2          \dst2,  \dst2,  \src\().8h

+.endif

+.endm

+// Plain accumulate; the second pair is only added when w > 4.

+.macro add_nh           dst1, dst2, src1, src2, w

+        add             \dst1,  \dst1,  \src1

+.if \w > 4

+        add             \dst2,  \dst2,  \src2

+.endif

+.endm

+// Sum three horizontally adjacent elements for both rows:

+// v3/v7 = 16 bit pixel sums, v26:v27/v28:v29 = 32 bit sums of squares

+// (v27/v29 are only produced when w > 4).

+.macro add3 w

+        ext             v16.16b, v0.16b,  v0.16b, #1

+        ext             v17.16b, v0.16b,  v0.16b, #2

+        ext             v18.16b, v4.16b,  v4.16b, #1

+        ext             v19.16b, v4.16b,  v4.16b, #2

+        uaddl           v3.8h,   v0.8b,   v16.8b

+        uaddw           v3.8h,   v3.8h,   v17.8b

+        uaddl           v7.8h,   v4.8b,   v18.8b

+        uaddw           v7.8h,   v7.8h,   v19.8b

+        ext             v20.16b, v1.16b,  v2.16b, #2

+        ext             v21.16b, v1.16b,  v2.16b, #4

+        ext             v22.16b, v5.16b,  v6.16b, #2

+        ext             v23.16b, v5.16b,  v6.16b, #4

+        uaddl_nh        v26.4s,  v27.4s,  v1,   v20,  \w

+        uaddw_nh        v26.4s,  v27.4s,  v21,        \w

+        uaddl_nh        v28.4s,  v29.4s,  v5,   v22,  \w

+        uaddw_nh        v28.4s,  v29.4s,  v23,        \w

+.endm

+        add3            8

+        st1             {v3.8h},         [x1],  #16

+        st1             {v7.8h},         [x11], #16

+        st1             {v26.4s,v27.4s}, [x0],  #32

+        st1             {v28.4s,v29.4s}, [x10], #32

+        subs            w5,  w5,  #8

+        b.le            9f

+        // The flags set by this tst are consumed by the b.ne below; the

+        // loads and NEON instructions in between don't modify them.

+        tst             w7,  #2 // LR_HAVE_RIGHT

+        ld1             {v3.8b},  [x3],  #8

+        ld1             {v7.8b},  [x12], #8

+        // Slide the window by 8 pixels: keep the upper 8 pixels and their

+        // squares, and append the 8 new pixels and their squares.

+        mov             v1.16b,  v2.16b

+        mov             v5.16b,  v6.16b

+        ext             v0.16b,  v0.16b,  v3.16b, #8

+        ext             v4.16b,  v4.16b,  v7.16b, #8

+        umull           v2.8h,   v3.8b,   v3.8b

+        umull           v6.8h,   v7.8b,   v7.8b

+        b.ne            4b // If we don't need to pad, just keep summing.

+        b               3b // If we need to pad, check how many pixels we have left.

+5:      // Produce 4 pixels, 6 <= w < 10

+        add3            4

+        st1             {v3.4h},  [x1],  #8

+        st1             {v7.4h},  [x11], #8

+        st1             {v26.4s}, [x0],  #16

+        st1             {v28.4s}, [x10], #16

+        subs            w5,  w5,  #4 // 2 <= w < 6

+        ext             v0.16b,  v0.16b,  v0.16b, #4

+        ext             v4.16b,  v4.16b,  v4.16b, #4

+6:      // Pad the right edge and produce the last few pixels.

+        // 2 <= w < 6, 2-5 pixels valid in v0

+        sub             w13,  w5,  #2

+        // w13 = (pixels valid - 2)

+        // Load the 16 bit table entry for that count and branch to the

+        // table address minus the entry.

+        adr             x14, L(box3_variable_shift_tbl)

+        ldrh            w13, [x14, w13, uxtw #1]

+        sub             x13, x14, w13, uxth

+        br              x13

+        // Shift v0 right, shifting out invalid pixels,

+        // shift v0 left to the original offset, shifting in padding pixels.

+22:     // 2 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #2

+        ext             v4.16b,  v4.16b,  v4.16b,  #2

+        ext             v0.16b,  v0.16b,  v30.16b, #14

+        ext             v4.16b,  v4.16b,  v31.16b, #14

+        b               88f

+33:     // 3 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #3

+        ext             v4.16b,  v4.16b,  v4.16b,  #3

+        ext             v0.16b,  v0.16b,  v30.16b, #13

+        ext             v4.16b,  v4.16b,  v31.16b, #13

+        b               88f

+44:     // 4 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #4

+        ext             v4.16b,  v4.16b,  v4.16b,  #4

+        ext             v0.16b,  v0.16b,  v30.16b, #12

+        ext             v4.16b,  v4.16b,  v31.16b, #12

+        b               88f

+55:     // 5 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #5

+        ext             v4.16b,  v4.16b,  v4.16b,  #5

+        ext             v0.16b,  v0.16b,  v30.16b, #11

+        ext             v4.16b,  v4.16b,  v31.16b, #11

+        b               88f

+L(box3_variable_shift_tbl):

+        .hword L(box3_variable_shift_tbl) - 22b

+        .hword L(box3_variable_shift_tbl) - 33b

+        .hword L(box3_variable_shift_tbl) - 44b

+        .hword L(box3_variable_shift_tbl) - 55b

+88:

+        // Recompute the squares from the padded pixels.

+        umull           v1.8h,   v0.8b,   v0.8b

+        umull2          v2.8h,   v0.16b,  v0.16b

+        umull           v5.8h,   v4.8b,   v4.8b

+        umull2          v6.8h,   v4.16b,  v4.16b

+        add3            4

+        st1             {v3.4h},  [x1],  #8

+        st1             {v7.4h},  [x11], #8

+        st1             {v26.4s}, [x0],  #16

+        st1             {v28.4s}, [x10], #16

+        subs            w5,  w5,  #4

+        b.le            9f

+        ext             v0.16b,  v0.16b,  v0.16b, #4

+        ext             v4.16b,  v4.16b,  v4.16b, #4

+        ext             v1.16b,  v1.16b,  v2.16b, #8

+        ext             v5.16b,  v5.16b,  v6.16b, #8

+        // Only one needed pixel left, but do a normal 4 pixel

+        // addition anyway

+        add3            4

+        st1             {v3.4h},  [x1],  #8

+        st1             {v7.4h},  [x11], #8

+        st1             {v26.4s}, [x0],  #16

+        st1             {v28.4s}, [x10], #16

+9:

+        // Two rows were handled in this iteration.

+        subs            w6,  w6,  #2

+        b.le            0f

+        // Jump to the next row and loop horizontally

+        add             x0,  x0,  x9, lsl #1

+        add             x10, x10, x9, lsl #1

+        add             x1,  x1,  x9

+        add             x11, x11, x9

+        add             x3,  x3,  x4

+        add             x12, x12, x4

+        mov             w5,  w8

+        b               1b

+0:

+        ret

+.purgem add3

+endfunc

+// void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,

+//                            const pixel (*left)[4],

+//                            const pixel *src, const ptrdiff_t stride,

+//                            const int w, const int h,

+//                            const enum LrEdgeFlags edges);

+//

+// Horizontal pass of the 5-tap box sums. Two source rows are processed per

+// iteration of the vertical loop. For each output position, the sum of five

+// horizontally adjacent pixels is stored as 16 bit to sum, and the sum of

+// their squares is stored as 32 bit to sumsq.

+//

+// Arguments, in prototype order:

+//   x0 = sumsq, x1 = sum, x2 = left, x3 = src, x4 = stride,

+//   w5 = w, w6 = h, w7 = edges

+// Register use:

+//   x10/x11/x12 = sumsq/sum/src pointers for the second row of the pair

+//   x9          = remaining byte step to the next row pair in sum

+//                 (applied shifted left by one for sumsq)

+//   x4          = remaining byte step to the next row pair in src

+//   w8          = backup of w+2, reloaded into w5 for each row pair

+//   v0/v4       = pixels of the first/second row

+//   v1:v2/v5:v6 = squares of the pixels in v0/v4

+//   v30/v31     = right edge padding pixel of the first/second row

+// Uses the uaddl_nh/uaddw_nh/add_nh macros defined in sgr_box3_h_neon.

+function sgr_box5_h_neon, export=1

+        add             w5,  w5,  #2 // w += 2

+        // Set up pointers for reading/writing alternate rows

+        add             x10, x0,  #(4*SUM_STRIDE)   // sumsq

+        add             x11, x1,  #(2*SUM_STRIDE)   // sum

+        add             x12, x3,  x4                // src

+        lsl             x4,  x4,  #1

+        mov             x9,       #(2*2*SUM_STRIDE) // double sum stride

+        // Subtract the aligned width from the output stride.

+        // With LR_HAVE_RIGHT, align to 8, without it, align to 4.

+        // Subtract the number of pixels read from the input from the stride.

+        tst             w7,  #2 // LR_HAVE_RIGHT

+        b.ne            0f

+        // !LR_HAVE_RIGHT

+        add             w13, w5,  #3

+        bic             w13, w13, #3

+        add             w14, w5,  #13

+        b               1f

+0:

+        add             w13, w5,  #7

+        bic             w13, w13, #7

+        add             w14, w5,  #15

+1:

+        sub             x9,  x9,  w13, uxtw #1

+        bic             w14, w14, #7

+        sub             x4,  x4,  w14, uxtw

+        // Store the width for the vertical loop

+        mov             w8,  w5

+        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL

+        tst             w7,  #1 // LR_HAVE_LEFT

+        b.eq            2f

+        // LR_HAVE_LEFT

+        cbnz            x2,  0f

+        // left == NULL

+        sub             x3,  x3,  #3

+        sub             x12, x12, #3

+        b               1f

+0:      // LR_HAVE_LEFT, left != NULL

+2:      // !LR_HAVE_LEFT, increase the stride.

+        // For this case we don't read the left 3 pixels from the src pointer,

+        // but shift it as if we had done that.

+        add             x4,  x4,  #3

+1:      // Loop vertically

+        ld1             {v0.16b},  [x3],  #16

+        ld1             {v4.16b},  [x12], #16

+        tst             w7,  #1 // LR_HAVE_LEFT

+        b.eq            0f

+        cbz             x2,  2f

+        // LR_HAVE_LEFT, left != NULL

+        ld1             {v1.s}[3],  [x2], #4

+        // Move x3/x12 back to account for the last 3 bytes we loaded earlier,

+        // which we'll shift out.

+        sub             x3,  x3,  #3

+        sub             x12, x12, #3

+        ld1             {v5.s}[3],  [x2], #4

+        ext             v0.16b, v1.16b, v0.16b, #13

+        ext             v4.16b, v5.16b, v4.16b, #13

+        b               2f

+0:

+        // !LR_HAVE_LEFT, fill v1/v5 with the leftmost byte

+        // and shift v0/v4 to have 3x the first byte at the front.

+        dup             v1.16b, v0.b[0]

+        dup             v5.16b, v4.b[0]

+        // Move x3/x12 back to account for the last 3 bytes we loaded before,

+        // which we shifted out.

+        sub             x3,  x3,  #3

+        sub             x12, x12, #3

+        ext             v0.16b, v1.16b, v0.16b, #13

+        ext             v4.16b, v5.16b, v4.16b, #13

+2:

+        // Square the 16 pixels of each row: v1:v2 = v0^2, v5:v6 = v4^2.

+        umull           v1.8h,   v0.8b,   v0.8b

+        umull2          v2.8h,   v0.16b,  v0.16b

+        umull           v5.8h,   v4.8b,   v4.8b

+        umull2          v6.8h,   v4.16b,  v4.16b

+        tst             w7,  #2 // LR_HAVE_RIGHT

+        b.ne            4f

+        // If we'll need to pad the right edge, load that byte to pad with

+        // here since we can find it pretty easily from here.

+        sub             w13, w5, #(2 + 16 - 3 + 1)

+        ldr             b30, [x3,  w13, sxtw]

+        ldr             b31, [x12, w13, sxtw]

+        // Fill v30/v31 with the right padding pixel

+        dup             v30.8b,  v30.b[0]

+        dup             v31.8b,  v31.b[0]

+3:      // !LR_HAVE_RIGHT

+        // If we'll have to pad the right edge we need to quit early here.

+        cmp             w5,  #11

+        b.ge            4f   // If w >= 11, all used input pixels are valid

+        cmp             w5,  #7

+        b.ge            5f   // If w >= 7, we can produce 4 pixels

+        b               6f

+4:      // Loop horizontally

+// Sum five horizontally adjacent elements for both rows:

+// v3/v7 = 16 bit pixel sums, v26:v27/v28:v29 = 32 bit sums of squares

+// (v27/v29 are only produced when w > 4).

+// v16-v25 are used as temporaries.

+.macro add5 w

+        ext             v16.16b, v0.16b,  v0.16b, #1

+        ext             v17.16b, v0.16b,  v0.16b, #2

+        ext             v18.16b, v0.16b,  v0.16b, #3

+        ext             v19.16b, v0.16b,  v0.16b, #4

+        ext             v20.16b, v4.16b,  v4.16b, #1

+        ext             v21.16b, v4.16b,  v4.16b, #2

+        ext             v22.16b, v4.16b,  v4.16b, #3

+        // Both operands are the pixel vector v4, like the other shifts here;

+        // only the low 8 bytes of v23 are consumed below.

+        ext             v23.16b, v4.16b,  v4.16b, #4

+        uaddl           v3.8h,   v0.8b,   v16.8b

+        uaddl           v24.8h,  v17.8b,  v18.8b

+        uaddl           v7.8h,   v4.8b,   v20.8b

+        uaddw           v3.8h,   v3.8h,   v19.8b

+        uaddl           v25.8h,  v21.8b,  v22.8b

+        uaddw           v7.8h,   v7.8h,   v23.8b

+        add             v3.8h,   v3.8h,   v24.8h

+        add             v7.8h,   v7.8h,   v25.8h

+        ext             v16.16b, v1.16b,  v2.16b, #2

+        ext             v17.16b, v1.16b,  v2.16b, #4

+        ext             v18.16b, v1.16b,  v2.16b, #6

+        ext             v19.16b, v1.16b,  v2.16b, #8

+        ext             v20.16b, v5.16b,  v6.16b, #2

+        ext             v21.16b, v5.16b,  v6.16b, #4

+        ext             v22.16b, v5.16b,  v6.16b, #6

+        ext             v23.16b, v5.16b,  v6.16b, #8

+        uaddl_nh        v26.4s,  v27.4s,  v1,   v16,  \w

+        uaddl_nh        v16.4s,  v17.4s,  v17,  v18,  \w

+        uaddl_nh        v28.4s,  v29.4s,  v5,   v20,  \w

+        uaddw_nh        v26.4s,  v27.4s,  v19,        \w

+        uaddl_nh        v20.4s,  v21.4s,  v21,  v22,  \w

+        uaddw_nh        v28.4s,  v29.4s,  v23,        \w

+        add_nh          v26.4s,  v27.4s,  v16.4s, v17.4s, \w

+        add_nh          v28.4s,  v29.4s,  v20.4s, v21.4s, \w

+.endm

+        add5            8

+        st1             {v3.8h},         [x1],  #16

+        st1             {v7.8h},         [x11], #16

+        st1             {v26.4s,v27.4s}, [x0],  #32

+        st1             {v28.4s,v29.4s}, [x10], #32

+        subs            w5,  w5,  #8

+        b.le            9f

+        // The flags set by this tst are consumed by the b.ne below; the

+        // loads and NEON instructions in between don't modify them.

+        tst             w7,  #2 // LR_HAVE_RIGHT

+        ld1             {v3.8b},  [x3],  #8

+        ld1             {v7.8b},  [x12], #8

+        // Slide the window by 8 pixels: keep the upper 8 pixels and their

+        // squares, and append the 8 new pixels and their squares.

+        mov             v1.16b,  v2.16b

+        mov             v5.16b,  v6.16b

+        ext             v0.16b,  v0.16b,  v3.16b, #8

+        ext             v4.16b,  v4.16b,  v7.16b, #8

+        umull           v2.8h,   v3.8b,   v3.8b

+        umull           v6.8h,   v7.8b,   v7.8b

+        b.ne            4b // If we don't need to pad, just keep summing.

+        b               3b // If we need to pad, check how many pixels we have left.

+5:      // Produce 4 pixels, 7 <= w < 11

+        add5            4

+        st1             {v3.4h},  [x1],  #8

+        st1             {v7.4h},  [x11], #8

+        st1             {v26.4s}, [x0],  #16

+        st1             {v28.4s}, [x10], #16

+        subs            w5,  w5,  #4 // 3 <= w < 7

+        ext             v0.16b,  v0.16b,  v0.16b, #4

+        ext             v4.16b,  v4.16b,  v4.16b, #4

+6:      // Pad the right edge and produce the last few pixels.

+        // w < 7, w+1 pixels valid in v0/v4

+        sub             w13,  w5,  #1

+        // w13 = pixels valid - 2

+        // Load the 16 bit table entry for that count and branch to the

+        // table address minus the entry.

+        adr             x14, L(box5_variable_shift_tbl)

+        ldrh            w13, [x14, w13, uxtw #1]

+        sub             x13, x14, w13, uxth

+        br              x13

+        // Shift v0/v4 right, shifting out invalid pixels,

+        // shift v0/v4 left to the original offset, shifting in padding pixels.

+22:     // 2 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #2

+        ext             v4.16b,  v4.16b,  v4.16b,  #2

+        ext             v0.16b,  v0.16b,  v30.16b, #14

+        ext             v4.16b,  v4.16b,  v31.16b, #14

+        b               88f

+33:     // 3 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #3

+        ext             v4.16b,  v4.16b,  v4.16b,  #3

+        ext             v0.16b,  v0.16b,  v30.16b, #13

+        ext             v4.16b,  v4.16b,  v31.16b, #13

+        b               88f

+44:     // 4 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #4

+        ext             v4.16b,  v4.16b,  v4.16b,  #4

+        ext             v0.16b,  v0.16b,  v30.16b, #12

+        ext             v4.16b,  v4.16b,  v31.16b, #12

+        b               88f

+55:     // 5 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #5

+        ext             v4.16b,  v4.16b,  v4.16b,  #5

+        ext             v0.16b,  v0.16b,  v30.16b, #11

+        ext             v4.16b,  v4.16b,  v31.16b, #11

+        b               88f

+66:     // 6 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #6

+        ext             v4.16b,  v4.16b,  v4.16b,  #6

+        ext             v0.16b,  v0.16b,  v30.16b, #10

+        ext             v4.16b,  v4.16b,  v31.16b, #10

+        b               88f

+77:     // 7 pixels valid

+        ext             v0.16b,  v0.16b,  v0.16b,  #7

+        ext             v4.16b,  v4.16b,  v4.16b,  #7

+        ext             v0.16b,  v0.16b,  v30.16b, #9

+        ext             v4.16b,  v4.16b,  v31.16b, #9

+        b               88f

+L(box5_variable_shift_tbl):

+        .hword L(box5_variable_shift_tbl) - 22b

+        .hword L(box5_variable_shift_tbl) - 33b

+        .hword L(box5_variable_shift_tbl) - 44b

+        .hword L(box5_variable_shift_tbl) - 55b

+        .hword L(box5_variable_shift_tbl) - 66b

+        .hword L(box5_variable_shift_tbl) - 77b

+88:

+        // Recompute the squares from the padded pixels.

+        umull           v1.8h,   v0.8b,   v0.8b

+        umull2          v2.8h,   v0.16b,  v0.16b

+        umull           v5.8h,   v4.8b,   v4.8b

+        umull2          v6.8h,   v4.16b,  v4.16b

+        add5            4

+        st1             {v3.4h},  [x1],  #8

+        st1             {v7.4h},  [x11], #8

+        st1             {v26.4s}, [x0],  #16

+        st1             {v28.4s}, [x10], #16

+        subs            w5,  w5,  #4

+        b.le            9f

+        // Advance pixels and squares by 4 elements and produce 4 more sums.

+        ext             v0.16b,  v0.16b,  v0.16b, #4

+        ext             v1.16b,  v1.16b,  v2.16b, #8

+        ext             v4.16b,  v4.16b,  v4.16b, #4

+        ext             v5.16b,  v5.16b,  v6.16b, #8

+        add5            4

+        st1             {v3.4h},  [x1],  #8

+        st1             {v7.4h},  [x11], #8

+        st1             {v26.4s}, [x0],  #16

+        st1             {v28.4s}, [x10], #16

+9:

+        // Two rows were handled in this iteration.

+        subs            w6,  w6,  #2

+        b.le            0f

+        // Jump to the next row and loop horizontally

+        add             x0,  x0,  x9, lsl #1

+        add             x10, x10, x9, lsl #1

+        add             x1,  x1,  x9

+        add             x11, x11, x9

+        add             x3,  x3,  x4

+        add             x12, x12, x4

+        mov             w5,  w8

+        b               1b

+0:

+        ret

+.purgem add5

+endfunc

+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,

+//                            const int w, const int h,

+//                            const enum LrEdgeFlags edges);

+//

+// Vertical pass of the 3-tap box sums. Each output row is the sum of three

+// vertically adjacent rows of sumsq (32 bit elements) and sum (16 bit

+// elements). Output is written into the same buffers, starting one row

+// above the passed-in pointers. Columns are handled in slices of 8 elements.

+//

+// Arguments, in prototype order:

+//   x0 = sumsq, x1 = sum, w2 = w, w3 = h, w4 = edges

+// Register use:

+//   x5/x6   = sumsq/sum read pointers, x0/x1 = sumsq/sum write pointers

+//   x7/x8   = sumsq/sum row stride in bytes

+//   w9      = backup of the main loop row count

+//   w10/w11 = number of output/input rows to rewind after each slice

+//   v16-v21 = three rows of sumsq, v24-v26 = three rows of sum

+function sgr_box3_v_neon, export=1

+        add             w10, w3,  #2 // Number of output rows to move back

+        mov             w11, w3      // Number of input rows to move back

+        add             w2,  w2,  #2 // Actual summed width

+        mov             x7,       #(4*SUM_STRIDE) // sumsq stride

+        mov             x8,       #(2*SUM_STRIDE) // sum stride

+        sub             x0,  x0,  #(4*SUM_STRIDE) // sumsq -= stride

+        sub             x1,  x1,  #(2*SUM_STRIDE) // sum   -= stride

+        tst             w4,  #4 // LR_HAVE_TOP

+        b.eq            0f

+        // If have top, read from row -2.

+        sub             x5,  x0,  #(4*SUM_STRIDE)

+        sub             x6,  x1,  #(2*SUM_STRIDE)

+        add             w11, w11, #2

+        b               1f

+0:

+        // !LR_HAVE_TOP

+        // If we don't have top, read from row 0 even if

+        // we start writing to row -1.

+        add             x5,  x0,  #(4*SUM_STRIDE)

+        add             x6,  x1,  #(2*SUM_STRIDE)

+1:

+        tst             w4,  #8 // LR_HAVE_BOTTOM

+        b.eq            1f

+        // LR_HAVE_BOTTOM

+        add             w3,  w3,  #2  // Sum all h+2 lines with the main loop

+        add             w11, w11, #2

+1:

+        mov             w9,  w3       // Backup of h for next loops

+1:

+        // Start of horizontal loop; start one vertical filter slice.

+        // Start loading rows into v16-v21 and v24-v26 taking top

+        // padding into consideration.

+        // The flags from this tst are consumed by the b.eq after the loads.

+        tst             w4,  #4 // LR_HAVE_TOP

+        ld1             {v16.4s, v17.4s}, [x5], x7

+        ld1             {v24.8h},         [x6], x8

+        b.eq            2f

+        // LR_HAVE_TOP

+        ld1             {v18.4s, v19.4s}, [x5], x7

+        ld1             {v25.8h},         [x6], x8

+        ld1             {v20.4s, v21.4s}, [x5], x7

+        ld1             {v26.8h},         [x6], x8

+        b               3f

+2:      // !LR_HAVE_TOP

+        // Replicate the first loaded row into the other two row slots.

+        mov             v18.16b, v16.16b

+        mov             v19.16b, v17.16b

+        mov             v25.16b, v24.16b

+        mov             v20.16b, v16.16b

+        mov             v21.16b, v17.16b

+        mov             v26.16b, v24.16b

+3:

+        // The flags from this subs are consumed by the b.le after the

+        // register rotation below.

+        subs            w3,  w3,  #1

+// Add the three loaded rows and store one output row of sumsq and sum.

+// Overwrites v16, v17 and v24 with the sums.

+.macro add3

+        add             v16.4s,  v16.4s,  v18.4s

+        add             v17.4s,  v17.4s,  v19.4s

+        add             v24.8h,  v24.8h,  v25.8h

+        add             v16.4s,  v16.4s,  v20.4s

+        add             v17.4s,  v17.4s,  v21.4s

+        add             v24.8h,  v24.8h,  v26.8h

+        st1             {v16.4s, v17.4s}, [x0], x7

+        st1             {v24.8h},         [x1], x8

+.endm

+        add3

+        // Rotate the rows: the second and third rows become the first and

+        // second ones.

+        mov             v16.16b, v18.16b

+        mov             v17.16b, v19.16b

+        mov             v24.16b, v25.16b

+        mov             v18.16b, v20.16b

+        mov             v19.16b, v21.16b

+        mov             v25.16b, v26.16b

+        b.le            4f

+        ld1             {v20.4s, v21.4s}, [x5], x7

+        ld1             {v26.8h},         [x6], x8

+        b               3b

+4:

+        tst             w4,  #8 // LR_HAVE_BOTTOM

+        b.ne            5f

+        // !LR_HAVE_BOTTOM

+        // Produce two more rows, extending the already loaded rows.

+        add3

+        mov             v16.16b, v18.16b

+        mov             v17.16b, v19.16b

+        mov             v24.16b, v25.16b

+        add3

+5:      // End of one vertical slice.

+        subs            w2,  w2,  #8

+        b.le            0f

+        // Move pointers back up to the top and loop horizontally.

+        // Input pointers

+        msub            x5,  x7,  x11, x5

+        msub            x6,  x8,  x11, x6

+        // Output pointers

+        msub            x0,  x7,  x10, x0

+        msub            x1,  x8,  x10, x1

+        // Step to the next slice of 8 elements.

+        add             x0,  x0,  #32

+        add             x1,  x1,  #16

+        add             x5,  x5,  #32

+        add             x6,  x6,  #16

+        mov             w3,  w9

+        b               1b

+0:

+        ret

+.purgem add3

+endfunc

+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,

+//                            const int w, const int h,

+//                            const enum LrEdgeFlags edges);

+function sgr_box5_v_neon, export=1

+        add             w10, w3,  #2 // Number of output rows to move back

+        mov             w11, w3      // Number of input rows to move back

+        add             w2,  w2,  #8 // Actual summed width

+        mov             x7,       #(4*SUM_STRIDE) // sumsq stride

+        mov             x8,       #(2*SUM_STRIDE) // sum stride

+        sub             x0,  x0,  #(4*SUM_STRIDE) // sumsq -= stride

+        sub             x1,  x1,  #(2*SUM_STRIDE) // sum   -= stride

+        tst             w4,  #4 // LR_HAVE_TOP

+        b.eq            0f

+        // If have top, read from row -2.

+        sub             x5,  x0,  #(4*SUM_STRIDE)

+        sub             x6,  x1,  #(2*SUM_STRIDE)

+        add             w11, w11, #2

+        b               1f

+0:

+        // !LR_HAVE_TOP

+        // If we don't have top, read from row 0 even if

+        // we start writing to row -1.

+        add             x5,  x0,  #(4*SUM_STRIDE)

+        add             x6,  x1,  #(2*SUM_STRIDE)

+1:

+        tst             w4,  #8 // LR_HAVE_BOTTOM

+        b.eq            0f

+        // LR_HAVE_BOTTOM

+        add             w3,  w3,  #2  // Handle h+2 lines with the main loop

+        add             w11, w11, #2

+        b               1f

+0:

+        // !LR_HAVE_BOTTOM

+        sub             w3,  w3,  #1  // Handle h-1 lines with the main loop

+1:

+        mov             w9,  w3       // Backup of h for next loops

+1:

+        // Start of horizontal loop; start one vertical filter slice.

+        // Start loading rows into v16-v25 and v26-v30 taking top

+        // padding into consideration.

+        tst             w4,  #4 // LR_HAVE_TOP

+        ld1             {v16.4s, v17.4s}, [x5], x7

+        ld1             {v26.8h},         [x6], x8

+        b.eq            2f

+        // LR_HAVE_TOP

+        ld1             {v20.4s, v21.4s}, [x5], x7

+        ld1             {v28.8h},         [x6], x8

+        mov             v18.16b, v16.16b

+        mov             v19.16b, v17.16b

+        mov             v27.16b, v26.16b

+        ld1             {v22.4s, v23.4s}, [x5], x7

+        ld1             {v29.8h},         [x6], x8

+        b               3f

+2:      // !LR_HAVE_TOP

+        mov             v18.16b, v16.16b

+        mov             v19.16b, v17.16b

+        mov             v27.16b, v26.16b

+        mov             v20.16b, v16.16b

+        mov             v21.16b, v17.16b

+        mov             v28.16b, v26.16b

+        mov             v22.16b, v16.16b

+        mov             v23.16b, v17.16b

+        mov             v29.16b, v26.16b

+3:

+        cbz             w3,  4f

+        ld1             {v24.4s, v25.4s}, [x5], x7

+        ld1             {v30.8h},         [x6], x8

+3:

+        // Start of vertical loop

+        subs            w3,  w3,  #2

+.macro add5

+        add             v16.4s,  v16.4s,  v18.4s

+        add             v17.4s,  v17.4s,  v19.4s

+        add             v26.8h,  v26.8h,  v27.8h

+        add             v0.4s,   v20.4s,  v22.4s

+        add             v1.4s,   v21.4s,  v23.4s

+        add             v2.8h,   v28.8h,  v29.8h

+        add             v16.4s,  v16.4s,  v24.4s

+        add             v17.4s,  v17.4s,  v25.4s

+        add             v26.8h,  v26.8h,  v30.8h

+        add             v16.4s,  v16.4s,  v0.4s

+        add             v17.4s,  v17.4s,  v1.4s

+        add             v26.8h,  v26.8h,  v2.8h

+        st1             {v16.4s, v17.4s}, [x0], x7

+        st1             {v26.8h},         [x1], x8

+.endm

+        add5

+.macro shift2

+        mov             v16.16b, v20.16b

+        mov             v17.16b, v21.16b

+        mov             v26.16b, v28.16b

+        mov             v18.16b, v22.16b

+        mov             v19.16b, v23.16b

+        mov             v27.16b, v29.16b

+        mov             v20.16b, v24.16b

+        mov             v21.16b, v25.16b

+        mov             v28.16b, v30.16b

+.endm

+        shift2

+        add             x0,  x0,  x7

+        add             x1,  x1,  x8

+        b.le            5f

+        ld1             {v22.4s, v23.4s}, [x5], x7

+        ld1             {v29.8h},         [x6], x8

+        ld1             {v24.4s, v25.4s}, [x5], x7

+        ld1             {v30.8h},         [x6], x8

+        b               3b

+4:

+        // h == 1, !LR_HAVE_BOTTOM.

+        // Pad the last row with the only content row, and add.

+        mov             v24.16b, v22.16b

+        mov             v25.16b, v23.16b

+        mov             v30.16b, v29.16b

+        add5

+        shift2

+        add             x0,  x0,  x7

+        add             x1,  x1,  x8

+        add5

+        b               6f

+5:

+        tst             w4,  #8 // LR_HAVE_BOTTOM

+        b.ne            6f

+        // !LR_HAVE_BOTTOM

+        cbnz            w3,  5f

+        // The intended three edge rows left; output the one at h-2 and

+        // the past edge one at h.

+        ld1             {v22.4s, v23.4s}, [x5], x7

+        ld1             {v29.8h},         [x6], x8

+        // Pad the past-edge row from the last content row.

+        mov             v24.16b, v22.16b

+        mov             v25.16b, v23.16b

+        mov             v30.16b, v29.16b

+        add5

+        shift2

+        add             x0,  x0,  x7

+        add             x1,  x1,  x8

+        // The last two rows are already padded properly here.

+        add5

+        b               6f

+5:

+        // w3 == -1, two rows left, output one.

+        // Pad the last two rows from the mid one.

+        mov             v22.16b, v20.16b

+        mov             v23.16b, v21.16b

+        mov             v29.16b, v28.16b

+        mov             v24.16b, v20.16b

+        mov             v25.16b, v21.16b

+        mov             v30.16b, v28.16b

+        add5

+        add             x0,  x0,  x7

+        add             x1,  x1,  x8

+        b               6f

+6:      // End of one vertical slice.

+        subs            w2,  w2,  #8

+        b.le            0f

+        // Move pointers back up to the top and loop horizontally.

+        // Input pointers

+        msub            x5,  x7,  x11, x5

+        msub            x6,  x8,  x11, x6

+        // Output pointers

+        msub            x0,  x7,  x10, x0

+        msub            x1,  x8,  x10, x1

+        add             x0,  x0,  #32

+        add             x1,  x1,  #16

+        add             x5,  x5,  #32

+        add             x6,  x6,  #16

+        mov             w3,  w9

+        b               1b

+0:

+        ret

+.purgem add5

+endfunc

+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,

+//                              const int w, const int h, const int strength);

+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,

+//                              const int w, const int h, const int strength);

+function sgr_calc_ab1_neon, export=1

+        mov             x8,  #SUM_STRIDE // row increment, in elements

+        mov             x5,  #455        // broadcast as one_by_x by the shared loop

+        movi            v31.4s,   #9     // n = 9

+        add             x3,  x3,  #2     // process h + 2 rows

+        b               sgr_calc_ab_neon // tail-jump into the shared loop

+endfunc

+function sgr_calc_ab2_neon, export=1

+        mov             x8,  #(2*SUM_STRIDE) // row increment, in elements: every other row

+        mov             x5,  #164     // broadcast as one_by_x by the shared loop

+        movi            v31.4s,   #25 // n = 25

+        add             x3,  x3,  #3  // h + 3 ...

+        asr             x3,  x3,  #1  // ... halved; no branch here: falls through into sgr_calc_ab_neon below

+endfunc

+function sgr_calc_ab_neon // Shared loop of sgr_calc_ab1/ab2: rewrites a (x0) and b (x1) in place; x5, x8 and v31 are preset by the entry stubs

+        movrel          x12, X(sgr_x_by_x) // x12 = address of the sgr_x_by_x table

+        ld1             {v16.16b, v17.16b, v18.16b}, [x12] // first 48 table bytes, looked up with tbl below

+        movi            v19.16b,  #5   // bias removed from the table here and added back after tbl

+        movi            v20.8b,   #55  // idx of last 5

+        movi            v21.8b,   #72  // idx of last 4

+        movi            v22.8b,   #101 // idx of last 3

+        movi            v23.8b,   #169 // idx of last 2

+        movi            v24.8b,   #254 // idx of last 1

+        add             x2,  x2,  #2 // w += 2

+        add             x7,  x2,  #7

+        bic             x7,  x7,  #7 // aligned w

+        sub             x7,  x8,  x7 // increment between rows

+        movi            v29.8h,   #1, lsl #8 // 256 in every lane, for 256 - x

+        dup             v28.4s,   w4 // strength (s)

+        dup             v30.4s,   w5 // one_by_x

+        sub             x0,  x0,  #(4*(SUM_STRIDE)) // a -= SUM_STRIDE int32 elements

+        sub             x1,  x1,  #(2*(SUM_STRIDE)) // b -= SUM_STRIDE int16 elements

+        mov             x6,  x2   // backup of w

+        sub             v16.16b, v16.16b, v19.16b // table bytes - 5, so that an out-of-range tbl result of 0 stands for 5

+        sub             v17.16b, v17.16b, v19.16b

+        sub             v18.16b, v18.16b, v19.16b

+1:      // Process 8 elements per iteration

+        subs            x2,  x2,  #8 // flags are consumed by the b.gt at the end of the iteration

+        ld1             {v0.4s, v1.4s}, [x0]   // a

+        ld1             {v2.8h}, [x1]          // b

+        mul             v0.4s,  v0.4s,  v31.4s // a * n

+        mul             v1.4s,  v1.4s,  v31.4s // a * n

+        umull           v3.4s,  v2.4h,  v2.4h  // b * b

+        umull2          v4.4s,  v2.8h,  v2.8h  // b * b

+        uqsub           v0.4s,  v0.4s,  v3.4s  // imax(a * n - b * b, 0)

+        uqsub           v1.4s,  v1.4s,  v4.4s  // imax(a * n - b * b, 0)

+        mul             v0.4s,  v0.4s,  v28.4s // p * s

+        mul             v1.4s,  v1.4s,  v28.4s // p * s

+        uqshrn          v0.4h,  v0.4s,  #16    // saturating narrow, first 16 of the 20 shift bits

+        uqshrn2         v0.8h,  v1.4s,  #16

+        uqrshrn         v0.8b,  v0.8h,  #4     // imin(z, 255)

+        cmhi            v25.8b, v0.8b,  v20.8b // = -1 if sgr_x_by_x[v0] < 5

+        cmhi            v26.8b, v0.8b,  v21.8b // = -1 if sgr_x_by_x[v0] < 4

+        tbl             v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b // table value - 5 for idx < 48, 0 otherwise

+        cmhi            v27.8b, v0.8b,  v22.8b // = -1 if sgr_x_by_x[v0] < 3

+        cmhi            v5.8b,  v0.8b,  v23.8b // = -1 if sgr_x_by_x[v0] < 2

+        add             v25.8b, v25.8b, v26.8b // sum up the -1 masks ...

+        cmhi            v6.8b,  v0.8b,  v24.8b // = -1 if sgr_x_by_x[v0] < 1

+        add             v27.8b, v27.8b, v5.8b

+        add             v6.8b,  v6.8b,  v19.8b // ... together with the +5 bias

+        add             v25.8b, v25.8b, v27.8b

+        add             v1.8b,  v1.8b,  v6.8b

+        add             v1.8b,  v1.8b,  v25.8b // tbl result + 5 - number of thresholds exceeded

+        uxtl            v1.8h,  v1.8b          // x

+        umull           v3.4s,  v1.4h,  v2.4h  // x * BB[i]

+        umull2          v4.4s,  v1.8h,  v2.8h  // x * BB[i]

+        mul             v3.4s,  v3.4s,  v30.4s // x * BB[i] * sgr_one_by_x

+        mul             v4.4s,  v4.4s,  v30.4s // x * BB[i] * sgr_one_by_x

+        srshr           v3.4s,  v3.4s,  #12    // AA[i]

+        srshr           v4.4s,  v4.4s,  #12    // AA[i]

+        sub             v2.8h,  v29.8h, v1.8h  // 256 - x

+        st1             {v3.4s, v4.4s}, [x0], #32 // overwrite a in place and advance

+        st1             {v2.8h}, [x1], #16        // overwrite b in place and advance

+        b.gt            1b // more columns left in this row (flags from subs x2)

+        subs            x3,  x3,  #1 // one row done

+        b.le            0f

+        add             x0,  x0,  x7, lsl #2 // skip to the next row of a (4-byte elements)

+        add             x1,  x1,  x7, lsl #1 // skip to the next row of b (2-byte elements)

+        mov             x2,  x6 // restore w

+        b               1b

+0:

+        ret

+endfunc

+#define FILTER_OUT_STRIDE 384

+// void dav1d_sgr_finish_filter1_neon(coef *tmp,

+//                                    const pixel *src, const ptrdiff_t stride,

+//                                    const int32_t *a, const int16_t *b,

+//                                    const int w, const int h);

+function sgr_finish_filter1_neon, export=1 // Weights a 3x3 neighbourhood of both coefficient planes (4 for the cross, 3 for the corners) and combines it with src into tmp

+        sub             x7,  x3,  #(4*SUM_STRIDE) // x7 = a, one row up

+        add             x8,  x3,  #(4*SUM_STRIDE) // x8 = a, one row down

+        sub             x9,  x4,  #(2*SUM_STRIDE) // x9 = b, one row up

+        add             x10, x4,  #(2*SUM_STRIDE) // x10 = b, one row down

+        mov             x11, #SUM_STRIDE // a/b row length in elements

+        mov             x12, #FILTER_OUT_STRIDE // tmp row length in elements

+        add             x13, x5,  #7

+        bic             x13, x13, #7 // Aligned width

+        sub             x2,  x2,  x13 // src increment after each row

+        sub             x12, x12, x13 // tmp increment after each row, in elements

+        sub             x11, x11, x13

+        sub             x11, x11, #4 // We read 4 extra elements from a

+        sub             x14, x11, #4 // We read 8 extra elements from b

+        mov             x13, x5 // backup of w

+        movi            v6.8h,  #3 // corner weight, 16-bit lanes

+        movi            v7.4s,  #3 // corner weight, 32-bit lanes

+1:      // Start of a row: preload 16 b and 12 a elements from each of the three rows

+        ld1             {v0.8h, v1.8h}, [x9], #32

+        ld1             {v2.8h, v3.8h}, [x4], #32

+        ld1             {v4.8h, v5.8h}, [x10], #32

+        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48

+        ld1             {v19.4s, v20.4s, v21.4s}, [x3], #48

+        ld1             {v22.4s, v23.4s, v24.4s}, [x8], #48

+2:      // Produce 8 outputs per iteration

+        subs            x5,  x5,  #8 // flags are consumed by the b.le after the store

+        ext             v25.16b, v0.16b,  v1.16b, #2  // -stride

+        ext             v26.16b, v2.16b,  v3.16b, #2  // 0

+        ext             v27.16b, v4.16b,  v5.16b, #2  // +stride

+        ext             v28.16b, v0.16b,  v1.16b, #4  // +1-stride

+        ext             v29.16b, v2.16b,  v3.16b, #4  // +1

+        ext             v30.16b, v4.16b,  v5.16b, #4  // +1+stride

+        add             v2.8h,   v2.8h,   v25.8h      // -1, -stride

+        add             v26.8h,  v26.8h,  v27.8h      // 0, +stride

+        add             v0.8h,   v0.8h,   v28.8h      // -1-stride, +1-stride

+        add             v2.8h,   v2.8h,   v26.8h      // -1, -stride, 0, +stride

+        add             v4.8h,   v4.8h,   v30.8h      // -1+stride, +1+stride

+        add             v2.8h,   v2.8h,   v29.8h      // +1

+        add             v0.8h,   v0.8h,   v4.8h       // all four corners

+        ext             v25.16b, v16.16b, v17.16b, #4 // -stride

+        ext             v26.16b, v17.16b, v18.16b, #4

+        shl             v2.8h,   v2.8h,   #2          // cross sum * 4

+        ext             v27.16b, v16.16b, v17.16b, #8 // +1-stride

+        ext             v28.16b, v17.16b, v18.16b, #8

+        ext             v29.16b, v19.16b, v20.16b, #4 // 0

+        ext             v30.16b, v20.16b, v21.16b, #4

+        mla             v2.8h,   v0.8h,   v6.8h       // * 3 -> a

+        add             v25.4s,  v25.4s,  v19.4s      // -stride, -1

+        add             v26.4s,  v26.4s,  v20.4s

+        add             v16.4s,  v16.4s,  v27.4s      // -1-stride, +1-stride

+        add             v17.4s,  v17.4s,  v28.4s

+        ext             v27.16b, v19.16b, v20.16b, #8 // +1

+        ext             v28.16b, v20.16b, v21.16b, #8

+        add             v16.4s,  v16.4s,  v22.4s      // -1+stride

+        add             v17.4s,  v17.4s,  v23.4s

+        add             v29.4s,  v29.4s,  v27.4s      // 0, +1

+        add             v30.4s,  v30.4s,  v28.4s

+        add             v25.4s,  v25.4s,  v29.4s      // -stride, -1, 0, +1

+        add             v26.4s,  v26.4s,  v30.4s

+        ext             v27.16b, v22.16b, v23.16b, #4 // +stride

+        ext             v28.16b, v23.16b, v24.16b, #4

+        ext             v29.16b, v22.16b, v23.16b, #8 // +1+stride

+        ext             v30.16b, v23.16b, v24.16b, #8

+        ld1             {v19.8b}, [x1], #8            // src

+        add             v25.4s,  v25.4s,  v27.4s      // +stride

+        add             v26.4s,  v26.4s,  v28.4s

+        add             v16.4s,  v16.4s,  v29.4s      // +1+stride

+        add             v17.4s,  v17.4s,  v30.4s

+        shl             v25.4s,  v25.4s,  #2          // cross sum * 4

+        shl             v26.4s,  v26.4s,  #2

+        mla             v25.4s,  v16.4s,  v7.4s       // * 3 -> b

+        mla             v26.4s,  v17.4s,  v7.4s

+        uxtl            v19.8h,  v19.8b               // src

+        mov             v0.16b,  v1.16b               // slide the -stride b window by 8 elements

+        umlal           v25.4s,  v2.4h,   v19.4h      // b + a * src

+        umlal2          v26.4s,  v2.8h,   v19.8h

+        mov             v2.16b,  v3.16b               // slide the centre-row b window

+        rshrn           v25.4h,  v25.4s,  #9          // rounding shift right by 9, narrow to 16 bits

+        rshrn2          v25.8h,  v26.4s,  #9

+        mov             v4.16b,  v5.16b               // slide the +stride b window

+        st1             {v25.8h}, [x0], #16

+        b.le            3f // row finished (flags from subs x5)

+        mov             v16.16b, v18.16b // slide the a windows by 8 elements and refill

+        mov             v19.16b, v21.16b

+        mov             v22.16b, v24.16b

+        ld1             {v1.8h}, [x9], #16

+        ld1             {v3.8h}, [x4], #16

+        ld1             {v5.8h}, [x10], #16

+        ld1             {v17.4s, v18.4s}, [x7], #32

+        ld1             {v20.4s, v21.4s}, [x3], #32

+        ld1             {v23.4s, v24.4s}, [x8], #32

+        b               2b

+3:      // End of a row

+        subs            x6,  x6,  #1 // one row done

+        b.le            0f

+        mov             x5,  x13 // restore w

+        add             x0,  x0,  x12, lsl #1 // tmp: skip to the next row (2-byte elements)

+        add             x1,  x1,  x2 // src: skip to the next row

+        add             x3,  x3,  x11, lsl #2 // a pointers: skip to the next row (4-byte elements)

+        add             x7,  x7,  x11, lsl #2

+        add             x8,  x8,  x11, lsl #2

+        add             x4,  x4,  x14, lsl #1 // b pointers: skip to the next row (2-byte elements)

+        add             x9,  x9,  x14, lsl #1

+        add             x10, x10, x14, lsl #1

+        b               1b

+0:

+        ret

+endfunc

+// void dav1d_sgr_finish_filter2_neon(coef *tmp,

+//                                    const pixel *src, const ptrdiff_t stride,

+//                                    const int32_t *a, const int16_t *b,

+//                                    const int w, const int h);

+function sgr_finish_filter2_neon, export=1 // Alternates two row kinds: one filtered from the rows above and below (weights 5/6, shift 9), one from a single row (weights 6/5, shift 8)

+        add             x7,  x3,  #(4*(SUM_STRIDE)) // x7 = a, one row down

+        sub             x3,  x3,  #(4*(SUM_STRIDE)) // x3 = a, one row up

+        add             x8,  x4,  #(2*(SUM_STRIDE)) // x8 = b, one row down

+        sub             x4,  x4,  #(2*(SUM_STRIDE)) // x4 = b, one row up

+        mov             x9,  #(2*SUM_STRIDE) // a/b pointers advance two rows at a time

+        mov             x10, #FILTER_OUT_STRIDE // tmp row length in elements

+        add             x11, x5,  #7

+        bic             x11, x11, #7 // Aligned width

+        sub             x2,  x2,  x11 // src increment after each row

+        sub             x10, x10, x11 // tmp increment after each row, in elements

+        sub             x9,  x9,  x11

+        sub             x9,  x9,  #4 // We read 4 extra elements from a

+        sub             x12, x9,  #4 // We read 8 extra elements from b

+        mov             x11, x5 // backup of w

+        movi            v4.8h,  #5 // weight 5, 16-bit lanes

+        movi            v5.4s,  #5 // weight 5, 32-bit lanes

+        movi            v6.8h,  #6 // weight 6, 16-bit lanes

+        movi            v7.4s,  #6 // weight 6, 32-bit lanes

+1:      // Row filtered from the rows above and below: preload both

+        ld1             {v0.8h, v1.8h}, [x4], #32

+        ld1             {v2.8h, v3.8h}, [x8], #32

+        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48

+        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

+2:      // Produce 8 outputs per iteration

+        subs            x5,  x5,  #8 // flags are consumed by the b.le after the store

+        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride

+        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride

+        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride

+        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride

+        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride

+        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride

+        add             v0.8h,   v0.8h,   v25.8h      // all four corners

+        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride

+        ext             v22.16b, v16.16b, v17.16b, #4 // -stride

+        ext             v23.16b, v17.16b, v18.16b, #4

+        ext             v24.16b, v19.16b, v20.16b, #4 // +stride

+        ext             v25.16b, v20.16b, v21.16b, #4

+        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride

+        ext             v27.16b, v17.16b, v18.16b, #8

+        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride

+        ext             v29.16b, v20.16b, v21.16b, #8

+        mul             v0.8h,   v0.8h,   v4.8h       // * 5

+        mla             v0.8h,   v2.8h,   v6.8h       // * 6

+        ld1             {v31.8b}, [x1], #8            // src

+        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride

+        add             v17.4s,  v17.4s,  v27.4s

+        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride

+        add             v20.4s,  v20.4s,  v29.4s

+        add             v16.4s,  v16.4s,  v19.4s      // all four corners

+        add             v17.4s,  v17.4s,  v20.4s

+        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride

+        add             v23.4s,  v23.4s,  v25.4s

+        // This is, surprisingly, faster than other variants where the

+        // mul+mla pairs are further apart, on Cortex A53.

+        mul             v16.4s,  v16.4s,  v5.4s       // * 5

+        mla             v16.4s,  v22.4s,  v7.4s       // * 6

+        mul             v17.4s,  v17.4s,  v5.4s       // * 5

+        mla             v17.4s,  v23.4s,  v7.4s       // * 6

+        uxtl            v31.8h,  v31.8b               // src widened to 16 bits

+        umlal           v16.4s,  v0.4h,   v31.4h      // b + a * src

+        umlal2          v17.4s,  v0.8h,   v31.8h

+        mov             v0.16b,  v1.16b               // slide the -stride b window by 8 elements

+        rshrn           v16.4h,  v16.4s,  #9          // rounding shift right by 9, narrow to 16 bits

+        rshrn2          v16.8h,  v17.4s,  #9

+        mov             v2.16b,  v3.16b               // slide the +stride b window

+        st1             {v16.8h}, [x0], #16

+        b.le            3f // row finished (flags from subs x5)

+        mov             v16.16b, v18.16b // slide the a windows by 8 elements and refill

+        mov             v19.16b, v21.16b

+        ld1             {v1.8h}, [x4], #16

+        ld1             {v3.8h}, [x8], #16

+        ld1             {v17.4s, v18.4s}, [x3], #32

+        ld1             {v20.4s, v21.4s}, [x7], #32

+        b               2b

+3:      // End of a two-source row; the next row is filtered from a single a/b row

+        subs            x6,  x6,  #1 // one row done

+        b.le            0f

+        mov             x5,  x11 // restore w

+        add             x0,  x0,  x10, lsl #1 // tmp: skip to the next row (2-byte elements)

+        add             x1,  x1,  x2 // src: skip to the next row

+        add             x3,  x3,  x9, lsl #2 // a pointers: two rows further down

+        add             x7,  x7,  x9, lsl #2

+        add             x4,  x4,  x12, lsl #1 // b pointers: two rows further down

+        add             x8,  x8,  x12, lsl #1

+        mov             x13, x3 // remember the row start; restored at label 5

+        mov             x14, x4

+        ld1             {v0.8h, v1.8h}, [x4], #32

+        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48

+4:      // Produce 8 outputs per iteration from the single row

+        subs            x5,  x5,  #8 // flags are consumed by the b.le after the store

+        ext             v22.16b, v0.16b,  v1.16b, #2  // 0

+        ext             v23.16b, v0.16b,  v1.16b, #4  // +1

+        add             v0.8h,   v0.8h,   v23.8h      // -1, +1

+        ext             v24.16b, v16.16b, v17.16b, #4 // 0

+        ext             v25.16b, v17.16b, v18.16b, #4

+        ext             v26.16b, v16.16b, v17.16b, #8 // +1

+        ext             v27.16b, v17.16b, v18.16b, #8

+        mul             v2.8h,   v22.8h,  v6.8h       // * 6

+        mla             v2.8h,   v0.8h,   v4.8h       // * 5 -> a

+        ld1             {v31.8b}, [x1], #8            // src

+        add             v16.4s,  v16.4s,  v26.4s      // -1, +1

+        add             v17.4s,  v17.4s,  v27.4s

+        uxtl            v31.8h,  v31.8b               // src widened to 16 bits

+        // This is, surprisingly, faster than other variants where the

+        // mul+mla pairs are further apart, on Cortex A53.

+        mul             v24.4s,  v24.4s,  v7.4s       // * 6

+        mla             v24.4s,  v16.4s,  v5.4s       // * 5 -> b

+        mul             v25.4s,  v25.4s,  v7.4s       // * 6

+        mla             v25.4s,  v17.4s,  v5.4s       // * 5 -> b

+        umlal           v24.4s,  v2.4h,   v31.4h      // b + a * src

+        umlal2          v25.4s,  v2.8h,   v31.8h

+        mov             v0.16b,  v1.16b               // slide the b window by 8 elements

+        rshrn           v24.4h,  v24.4s,  #8          // rounding shift right by 8 (not 9) for this row kind

+        rshrn2          v24.8h,  v25.4s,  #8

+        mov             v16.16b, v18.16b              // slide the a window by 8 elements

+        st1             {v24.8h}, [x0], #16

+        b.le            5f // row finished (flags from subs x5)

+        ld1             {v1.8h}, [x4], #16

+        ld1             {v17.4s, v18.4s}, [x3], #32

+        b               4b

+5:      // End of a single-source row

+        subs            x6,  x6,  #1 // one row done

+        b.le            0f

+        mov             x5,  x11 // restore w

+        add             x0,  x0,  x10, lsl #1 // tmp: skip to the next row (2-byte elements)

+        add             x1,  x1,  x2 // src: skip to the next row

+        mov             x3,  x13 // Rewind x3/x4 to where they started

+        mov             x4,  x14

+        b               1b

+0:

+        ret

+endfunc

+// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,

+//                               const pixel *src, const ptrdiff_t src_stride,

+//                               const coef *t1, const int w, const int h,

+//                               const int wt);

+function sgr_weighted1_neon, export=1 // dst = sat_u8(round(((u << 7) + wt * (t1 - u)) >> 11)) with u = src << 4; two rows per pass, then a single-row tail

+        dup             v31.8h, w7 // wt in every 16-bit lane

+        cmp             x6,  #2 // flags are consumed by the b.lt before label 1

+        add             x9,  x0,  x1 // x9 = second dst row

+        add             x10, x2,  x3 // x10 = second src row

+        add             x11, x4,  #2*FILTER_OUT_STRIDE // x11 = second t1 row

+        mov             x7,  #(4*FILTER_OUT_STRIDE) // two t1 rows, in bytes

+        lsl             x1,  x1,  #1 // dst advances two rows at a time

+        lsl             x3,  x3,  #1 // src advances two rows at a time

+        add             x8,  x5,  #7

+        bic             x8,  x8,  #7 // Aligned width

+        sub             x1,  x1,  x8 // dst increment after a row pair

+        sub             x3,  x3,  x8 // src increment after a row pair

+        sub             x7,  x7,  x8, lsl #1 // t1 increment after a row pair, in bytes

+        mov             x8,  x5 // backup of w

+        b.lt            2f // h < 2: only the single-row loop runs

+1:      // Two rows at once, 8 pixels of each per iteration

+        ld1             {v0.8b}, [x2],  #8

+        ld1             {v4.8b}, [x10], #8

+        ld1             {v1.8h}, [x4],  #16

+        ld1             {v5.8h}, [x11], #16

+        subs            x5,  x5,  #8 // flags are consumed by the b.gt after the stores

+        ushll           v0.8h,  v0.8b,  #4     // u

+        ushll           v4.8h,  v4.8b,  #4     // u

+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u

+        sub             v5.8h,  v5.8h,  v4.8h  // t1 - u

+        ushll           v2.4s,  v0.4h,  #7     // u << 7

+        ushll2          v3.4s,  v0.8h,  #7     // u << 7

+        ushll           v6.4s,  v4.4h,  #7     // u << 7

+        ushll2          v7.4s,  v4.8h,  #7     // u << 7

+        smlal           v2.4s,  v1.4h,  v31.4h // v

+        smlal2          v3.4s,  v1.8h,  v31.8h // v

+        smlal           v6.4s,  v5.4h,  v31.4h // v

+        smlal2          v7.4s,  v5.8h,  v31.8h // v

+        rshrn           v2.4h,  v2.4s,  #11    // rounding shift right by 11, narrow to 16 bits

+        rshrn2          v2.8h,  v3.4s,  #11

+        rshrn           v6.4h,  v6.4s,  #11

+        rshrn2          v6.8h,  v7.4s,  #11

+        sqxtun          v2.8b,  v2.8h          // saturate to unsigned 8 bits

+        sqxtun          v6.8b,  v6.8h

+        st1             {v2.8b}, [x0], #8

+        st1             {v6.8b}, [x9], #8

+        b.gt            1b // more columns left in this row pair

+        sub             x6,  x6,  #2 // two rows done

+        cmp             x6,  #1 // flags are consumed by b.lt and, after the adds, by b.eq

+        b.lt            0f // no rows left

+        mov             x5,  x8 // restore w

+        add             x0,  x0,  x1 // step every pointer to the next row pair

+        add             x9,  x9,  x1

+        add             x2,  x2,  x3

+        add             x10, x10, x3

+        add             x4,  x4,  x7

+        add             x11, x11, x7

+        b.eq            2f // exactly one row left

+        b               1b

+2:      // Single last row, 8 pixels per iteration

+        ld1             {v0.8b}, [x2], #8

+        ld1             {v1.8h}, [x4], #16

+        subs            x5,  x5,  #8 // flags are consumed by the b.gt after the store

+        ushll           v0.8h,  v0.8b,  #4     // u

+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u

+        ushll           v2.4s,  v0.4h,  #7     // u << 7

+        ushll2          v3.4s,  v0.8h,  #7     // u << 7

+        smlal           v2.4s,  v1.4h,  v31.4h // v

+        smlal2          v3.4s,  v1.8h,  v31.8h // v

+        rshrn           v2.4h,  v2.4s,  #11    // rounding shift right by 11, narrow to 16 bits

+        rshrn2          v2.8h,  v3.4s,  #11

+        sqxtun          v2.8b,  v2.8h          // saturate to unsigned 8 bits

+        st1             {v2.8b}, [x0], #8

+        b.gt            2b // more columns left in this row

+0:

+        ret

+endfunc

+// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,

+//                               const pixel *src, const ptrdiff_t src_stride,

+//                               const coef *t1, const coef *t2,

+//                               const int w, const int h,

+//                               const int16_t wt[2]);

+function sgr_weighted2_neon, export=1 // dst = sat_u8(round(((u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u)) >> 11)) with u = src << 4; two rows per pass, then a single-row tail

+        ldr             x8,  [sp] // wt, the ninth argument, is passed on the stack

+        ld1             {v31.s}[0], [x8] // load wt[0] and wt[1] together

+        cmp             x7,  #2 // flags are consumed by the b.lt before label 1

+        add             x10, x0,  x1 // x10 = second dst row

+        add             x11, x2,  x3 // x11 = second src row

+        add             x12, x4,  #2*FILTER_OUT_STRIDE // x12 = second t1 row

+        add             x13, x5,  #2*FILTER_OUT_STRIDE // x13 = second t2 row

+        mov             x8,  #4*FILTER_OUT_STRIDE // two t1/t2 rows, in bytes

+        lsl             x1,  x1,  #1 // dst advances two rows at a time

+        lsl             x3,  x3,  #1 // src advances two rows at a time

+        add             x9,  x6,  #7

+        bic             x9,  x9,  #7 // Aligned width

+        sub             x1,  x1,  x9 // dst increment after a row pair

+        sub             x3,  x3,  x9 // src increment after a row pair

+        sub             x8,  x8,  x9, lsl #1 // t1/t2 increment after a row pair, in bytes

+        dup             v30.8h, v31.h[0] // wt[0]

+        dup             v31.8h, v31.h[1] // wt[1]

+        mov             x9,  x6 // backup of w

+        b.lt            2f // h < 2: only the single-row loop runs

+1:      // Two rows at once, 8 pixels of each per iteration

+        ld1             {v0.8b},  [x2],  #8

+        ld1             {v16.8b}, [x11], #8

+        ld1             {v1.8h},  [x4],  #16

+        ld1             {v17.8h}, [x12], #16

+        ld1             {v2.8h},  [x5],  #16

+        ld1             {v18.8h}, [x13], #16

+        subs            x6,  x6,  #8 // flags are consumed by the b.gt after the stores

+        ushll           v0.8h,  v0.8b,  #4     // u

+        ushll           v16.8h, v16.8b, #4     // u

+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u

+        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u

+        sub             v17.8h, v17.8h, v16.8h // t1 - u

+        sub             v18.8h, v18.8h, v16.8h // t2 - u

+        ushll           v3.4s,  v0.4h,  #7     // u << 7

+        ushll2          v4.4s,  v0.8h,  #7     // u << 7

+        ushll           v19.4s, v16.4h, #7     // u << 7

+        ushll2          v20.4s, v16.8h, #7     // u << 7

+        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)

+        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)

+        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)

+        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)

+        smlal           v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)

+        smlal           v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)

+        smlal2          v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)

+        smlal2          v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)

+        rshrn           v3.4h,  v3.4s,  #11    // rounding shift right by 11, narrow to 16 bits

+        rshrn2          v3.8h,  v4.4s,  #11

+        rshrn           v19.4h, v19.4s, #11

+        rshrn2          v19.8h, v20.4s, #11

+        sqxtun          v3.8b,  v3.8h          // saturate to unsigned 8 bits

+        sqxtun          v19.8b, v19.8h

+        st1             {v3.8b},  [x0],  #8

+        st1             {v19.8b}, [x10], #8

+        b.gt            1b // more columns left in this row pair

+        sub             x7,  x7,  #2 // two rows done; the cmp below sets the flags

+        cmp             x7,  #1 // flags are consumed by b.lt and, after the adds, by b.eq

+        b.lt            0f // no rows left

+        mov             x6,  x9 // restore w

+        add             x0,  x0,  x1 // step every pointer to the next row pair

+        add             x10, x10, x1

+        add             x2,  x2,  x3

+        add             x11, x11, x3

+        add             x4,  x4,  x8

+        add             x12, x12, x8

+        add             x5,  x5,  x8

+        add             x13, x13, x8

+        b.eq            2f // exactly one row left

+        b               1b

+2:      // Single last row, 8 pixels per iteration

+        ld1             {v0.8b}, [x2], #8

+        ld1             {v1.8h}, [x4], #16

+        ld1             {v2.8h}, [x5], #16

+        subs            x6,  x6,  #8 // flags are consumed by the b.gt after the store

+        ushll           v0.8h,  v0.8b,  #4     // u

+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u

+        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u

+        ushll           v3.4s,  v0.4h,  #7     // u << 7

+        ushll2          v4.4s,  v0.8h,  #7     // u << 7

+        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)

+        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)

+        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)

+        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)

+        rshrn           v3.4h,  v3.4s,  #11    // rounding shift right by 11, narrow to 16 bits

+        rshrn2          v3.8h,  v4.4s,  #11

+        sqxtun          v3.8b,  v3.8h          // saturate to unsigned 8 bits

+        st1             {v3.8b}, [x0], #8

+        b.gt            2b // stay in the single-row loop; 1b would also process a row past the last one

+0:

+        ret

 endfunc

--- a/src/arm/looprestoration_init_tmpl.c

+++ b/src/arm/looprestoration_init_tmpl.c

@@ -29,6 +29,7 @@

 #include "src/looprestoration.h"

 #include "common/attributes.h"

+#include "src/tables.h"

 #if BITDEPTH == 8

 // This calculates things slightly differently than the reference C version.

@@ -91,8 +92,172 @@

         dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);

-#endif

+#if ARCH_AARCH64

+// NEON assembly helpers for the 3x3 box variant of the self-guided filter.
+// Only their prototypes are declared here; dav1d_sgr_filter1_neon() below
+// chains them in the order box3_h -> box3_v -> calc_ab1 -> finish_filter1.
+void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
+                           const pixel (*left)[4],
+                           const pixel *src, const ptrdiff_t stride,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+                             const int w, const int h, const int strength);
+void dav1d_sgr_finish_filter1_neon(coef *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const int32_t *a, const int16_t *b,
+                                   const int w, const int h);
+
+/* filter with a 3x3 box (radius=1)
+ *
+ * tmp:             output buffer handed to dav1d_sgr_finish_filter1_neon()
+ * src, stride:     the w x h block of pixels to filter
+ * left:            passed through to the horizontal pass for the main rows
+ * lpf, lpf_stride: source of the rows above the block (at lpf) and below it
+ *                  (at lpf + 6 rows); only read when the matching
+ *                  LR_HAVE_TOP / LR_HAVE_BOTTOM bit is set in edges
+ * strength:        forwarded unchanged to dav1d_sgr_calc_ab1_neon()
+ */
+static void dav1d_sgr_filter1_neon(coef *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const pixel (*left)[4],
+                                   const pixel *lpf, const ptrdiff_t lpf_stride,
+                                   const int w, const int h, const int strength,
+                                   const enum LrEdgeFlags edges)
+{
+    // Row pitch (in elements) of both intermediate buffers; same value as
+    // SUM_STRIDE in the assembly.
+    enum { BOX3_ROW_STRIDE = 384 + 16 };
+
+    // The working pointers start two rows (plus a small pad) into the backing
+    // storage, so that rows -2 and -1 can be addressed for the top edge.
+    ALIGN_STK_16(int32_t, sumsq_mem, BOX3_ROW_STRIDE * 68 + 8,);
+    int32_t *const sumsq = &sumsq_mem[BOX3_ROW_STRIDE * 2 + 8], *const a = sumsq;
+    ALIGN_STK_16(int16_t, sum_mem, BOX3_ROW_STRIDE * 68 + 16,);
+    int16_t *const sum = &sum_mem[BOX3_ROW_STRIDE * 2 + 16], *const b = sum;
+
+    // Horizontal sums for the h rows of the block itself.
+    dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
+
+    // Two rows above the block. Both rows -2 and -1 must be produced: asking
+    // for a single row would fill only row -2 and leave row -1 (the one
+    // directly above the block) as uninitialized stack memory.
+    if (edges & LR_HAVE_TOP)
+        dav1d_sgr_box3_h_neon(&sumsq[-2 * BOX3_ROW_STRIDE],
+                              &sum[-2 * BOX3_ROW_STRIDE],
+                              NULL, lpf, lpf_stride, w, 2, edges);
+
+    // Two rows below the block, stored right after row h - 1.
+    if (edges & LR_HAVE_BOTTOM)
+        dav1d_sgr_box3_h_neon(&sumsq[h * BOX3_ROW_STRIDE],
+                              &sum[h * BOX3_ROW_STRIDE],
+                              NULL, lpf + 6 * PXSTRIDE(lpf_stride),
+                              lpf_stride, w, 2, edges);
+
+    // The remaining passes work in place: a/b alias sumsq/sum.
+    dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
+    dav1d_sgr_calc_ab1_neon(a, b, w, h, strength);
+    dav1d_sgr_finish_filter1_neon(tmp, src, stride, a, b, w, h);
+}

+// NEON assembly helpers for the 5x5 box variant of the self-guided filter.
+// Only their prototypes are declared here; dav1d_sgr_filter2_neon() below
+// chains them in the order box5_h -> box5_v -> calc_ab2 -> finish_filter2.
+void dav1d_sgr_box5_h_neon(int32_t *sumsq, int16_t *sum,
+                           const pixel (*left)[4],
+                           const pixel *src, const ptrdiff_t stride,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+                           const int w, const int h,
+                           const enum LrEdgeFlags edges);
+void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+                             const int w, const int h, const int strength);
+void dav1d_sgr_finish_filter2_neon(coef *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const int32_t *a, const int16_t *b,
+                                   const int w, const int h);
+
+/* filter with a 5x5 box (radius=2)
+ *
+ * Runs the horizontal pass over the block and, where the edge flags allow,
+ * over two extra rows above and below it (both taken from lpf); then runs the
+ * vertical pass, the a/b computation and the final filter in place on the
+ * same two intermediate buffers. The result is written to tmp.
+ */
+static void dav1d_sgr_filter2_neon(coef *tmp,
+                                   const pixel *src, const ptrdiff_t stride,
+                                   const pixel (*left)[4],
+                                   const pixel *lpf, const ptrdiff_t lpf_stride,
+                                   const int w, const int h, const int strength,
+                                   const enum LrEdgeFlags edges)
+{
+    // Row pitch (in elements) of both intermediate buffers.
+    enum { BOX5_ROW_STRIDE = 384 + 16 };
+
+    ALIGN_STK_16(int32_t, sumsq_mem, BOX5_ROW_STRIDE * 68 + 8,);
+    ALIGN_STK_16(int16_t, sum_mem, BOX5_ROW_STRIDE * 68 + 16,);
+    // Row 0 of each buffer sits two rows (plus padding) into its storage,
+    // leaving rows -2 and -1 available for the top edge.
+    int32_t *const sumsq = sumsq_mem + 2 * BOX5_ROW_STRIDE + 8;
+    int16_t *const sum = sum_mem + 2 * BOX5_ROW_STRIDE + 16;
+
+    // Main rows of the block.
+    dav1d_sgr_box5_h_neon(sumsq, sum, left, src, stride, w, h, edges);
+
+    // Two rows above the block, read from the start of lpf.
+    if (edges & LR_HAVE_TOP) {
+        dav1d_sgr_box5_h_neon(sumsq - 2 * BOX5_ROW_STRIDE,
+                              sum - 2 * BOX5_ROW_STRIDE,
+                              NULL, lpf, lpf_stride, w, 2, edges);
+    }
+
+    // Two rows below the block, read from six rows into lpf.
+    if (edges & LR_HAVE_BOTTOM) {
+        const pixel *const below = lpf + 6 * PXSTRIDE(lpf_stride);
+        dav1d_sgr_box5_h_neon(sumsq + h * BOX5_ROW_STRIDE,
+                              sum + h * BOX5_ROW_STRIDE,
+                              NULL, below, lpf_stride, w, 2, edges);
+    }
+
+    // Everything from here on reuses sumsq/sum in place as the a/b buffers.
+    dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
+    dav1d_sgr_calc_ab2_neon(sumsq, sum, w, h, strength);
+    dav1d_sgr_finish_filter2_neon(tmp, src, stride, sumsq, sum, w, h);
+}

+// NEON assembly routines that blend the source pixels with one (weighted1)
+// or two (weighted2) intermediate filter outputs; prototypes only.
+void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
+                              const pixel *src, const ptrdiff_t src_stride,
+                              const coef *t1, const int w, const int h,
+                              const int wt);
+void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
+                              const pixel *src, const ptrdiff_t src_stride,
+                              const coef *t1, const coef *t2,
+                              const int w, const int h,
+                              const int16_t wt[2]);
+
+/* Self-guided filter entry point.
+ *
+ * dav1d_sgr_params[sgr_idx] selects which box filters run:
+ *   - entry [0] zero:  only the 3x3 filter, strength from entry [3]
+ *   - entry [1] zero:  only the 5x5 filter, strength from entry [2]
+ *   - otherwise:       both filters, blended with two weights
+ * The blend is done in place on dst. Whole groups of 8 columns are written
+ * directly; a leftover narrow tail is filtered into a temporary stripe and
+ * copied out with dav1d_copy_narrow_neon().
+ */
+static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
+                            const pixel (*const left)[4],
+                            const pixel *lpf, const ptrdiff_t lpf_stride,
+                            const int w, const int h, const int sgr_idx,
+                            const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
+{
+    const int full_w = w & ~7; // columns covered by complete groups of 8
+    const int tail_w = w & 7;  // leftover columns past the last full group
+    const int box5_on = dav1d_sgr_params[sgr_idx][0] != 0;
+    const int box3_on = dav1d_sgr_params[sgr_idx][1] != 0;
+
+    if (box5_on && box3_on) {
+        // Both filters enabled: blend two intermediate outputs.
+        ALIGN_STK_16(coef, tmp1, 64 * 384,);
+        ALIGN_STK_16(coef, tmp2, 64 * 384,);
+
+        dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][2], edges);
+        dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
+                               w, h, dav1d_sgr_params[sgr_idx][3], edges);
+        const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] };
+
+        if (w >= 8)
+            dav1d_sgr_weighted2_neon(dst, dst_stride, dst, dst_stride,
+                                     tmp1, tmp2, full_w, h, wt);
+        if (tail_w) {
+            // For uneven widths, do a full 8 pixel wide filtering into a temp
+            // buffer and copy out the narrow slice of pixels separately into
+            // dest.
+            ALIGN_STK_16(pixel, stripe, 64 * 8,);
+            pixel *const tail_dst = dst + full_w;
+            dav1d_sgr_weighted2_neon(stripe, tail_w, tail_dst, dst_stride,
+                                     tmp1 + full_w, tmp2 + full_w,
+                                     tail_w, h, wt);
+            dav1d_copy_narrow_neon(tail_dst, dst_stride, stripe, tail_w, h);
+        }
+    } else {
+        // Exactly one filter runs; pick it together with its blend weight.
+        ALIGN_STK_16(coef, tmp, 64 * 384,);
+        int wt;
+
+        if (!box5_on) {
+            dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
+                                   w, h, dav1d_sgr_params[sgr_idx][3], edges);
+            wt = 128 - sgr_wt[1];
+        } else {
+            dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
+                                   w, h, dav1d_sgr_params[sgr_idx][2], edges);
+            wt = sgr_wt[0];
+        }
+
+        if (w >= 8)
+            dav1d_sgr_weighted1_neon(dst, dst_stride, dst, dst_stride,
+                                     tmp, full_w, h, wt);
+        if (tail_w) {
+            // For uneven widths, do a full 8 pixel wide filtering into a temp
+            // buffer and copy out the narrow slice of pixels separately into
+            // dest.
+            ALIGN_STK_16(pixel, stripe, 64 * 8,);
+            pixel *const tail_dst = dst + full_w;
+            dav1d_sgr_weighted1_neon(stripe, tail_w, tail_dst, dst_stride,
+                                     tmp + full_w, tail_w, h, wt);
+            dav1d_copy_narrow_neon(tail_dst, dst_stride, stripe, tail_w, h);
+        }
+    }
+}

+#endif // ARCH_AARCH64

+#endif // BITDEPTH == 8

 void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {

     const unsigned flags = dav1d_get_cpu_flags();

@@ -100,5 +265,8 @@

 #if BITDEPTH == 8

     c->wiener = wiener_filter_neon;

+#if ARCH_AARCH64

+    c->selfguided = sgr_filter_neon;

+#endif

 #endif