shithub: dav1d

Download patch

ref: 7cf5d7535f44d7c2d00e368575d0d26b66c73121
parent: 32e265a86e535b5fad47bcac9b54f83e1e5eab33
author: Martin Storsjö <[email protected]>
date: Sun Feb 9 18:39:11 EST 2020

arm64: looprestoration: Prepare for 16 bpc by splitting code to separate files

looprestoration_common.S contains functions that can be used as is
with one single instantiation of the functions for both 8 and 16 bpc.
This file will be built once, regardless of which bitdepths are enabled.

looprestoration_tmpl.S contains functions where the source can be shared
and templated between 8 and 16 bpc. This will be included by the separate
8/16bpc implementation files.

--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -617,6 +617,8 @@
 
 #define SUM_STRIDE (384+16)
 
+#include "looprestoration_tmpl.S"
+
 // void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
 //                                 const pixel (*left)[4],
 //                                 const pixel *src, const ptrdiff_t stride,
@@ -1145,841 +1147,4 @@
 0:
         ret
 .purgem add5
-endfunc
-
-// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
-//                            const int w, const int h,
-//                            const enum LrEdgeFlags edges);
-function sgr_box3_v_neon, export=1
-        add             w10, w3,  #2 // Number of output rows to move back
-        mov             w11, w3      // Number of input rows to move back
-        add             w2,  w2,  #2 // Actual summed width
-        mov             x7,       #(4*SUM_STRIDE) // sumsq stride
-        mov             x8,       #(2*SUM_STRIDE) // sum stride
-        sub             x0,  x0,  #(4*SUM_STRIDE) // sumsq -= stride
-        sub             x1,  x1,  #(2*SUM_STRIDE) // sum   -= stride
-
-        tst             w4,  #4 // LR_HAVE_TOP
-        b.eq            0f
-        // If have top, read from row -2.
-        sub             x5,  x0,  #(4*SUM_STRIDE)
-        sub             x6,  x1,  #(2*SUM_STRIDE)
-        add             w11, w11, #2
-        b               1f
-0:
-        // !LR_HAVE_TOP
-        // If we don't have top, read from row 0 even if
-        // we start writing to row -1.
-        add             x5,  x0,  #(4*SUM_STRIDE)
-        add             x6,  x1,  #(2*SUM_STRIDE)
-1:
-
-        tst             w4,  #8 // LR_HAVE_BOTTOM
-        b.eq            1f
-        // LR_HAVE_BOTTOM
-        add             w3,  w3,  #2  // Sum all h+2 lines with the main loop
-        add             w11, w11, #2
-1:
-        mov             w9,  w3       // Backup of h for next loops
-
-1:
-        // Start of horizontal loop; start one vertical filter slice.
-        // Start loading rows into v16-v21 and v24-v26 taking top
-        // padding into consideration.
-        tst             w4,  #4 // LR_HAVE_TOP
-        ld1             {v16.4s, v17.4s}, [x5], x7
-        ld1             {v24.8h},         [x6], x8
-        b.eq            2f
-        // LR_HAVE_TOP
-        ld1             {v18.4s, v19.4s}, [x5], x7
-        ld1             {v25.8h},         [x6], x8
-        ld1             {v20.4s, v21.4s}, [x5], x7
-        ld1             {v26.8h},         [x6], x8
-        b               3f
-2:      // !LR_HAVE_TOP
-        mov             v18.16b, v16.16b
-        mov             v19.16b, v17.16b
-        mov             v25.16b, v24.16b
-        mov             v20.16b, v16.16b
-        mov             v21.16b, v17.16b
-        mov             v26.16b, v24.16b
-
-3:
-        subs            w3,  w3,  #1
-.macro add3
-        add             v16.4s,  v16.4s,  v18.4s
-        add             v17.4s,  v17.4s,  v19.4s
-        add             v24.8h,  v24.8h,  v25.8h
-        add             v16.4s,  v16.4s,  v20.4s
-        add             v17.4s,  v17.4s,  v21.4s
-        add             v24.8h,  v24.8h,  v26.8h
-        st1             {v16.4s, v17.4s}, [x0], x7
-        st1             {v24.8h},         [x1], x8
-.endm
-        add3
-        mov             v16.16b, v18.16b
-        mov             v17.16b, v19.16b
-        mov             v24.16b, v25.16b
-        mov             v18.16b, v20.16b
-        mov             v19.16b, v21.16b
-        mov             v25.16b, v26.16b
-        b.le            4f
-        ld1             {v20.4s, v21.4s}, [x5], x7
-        ld1             {v26.8h},         [x6], x8
-        b               3b
-
-4:
-        tst             w4,  #8 // LR_HAVE_BOTTOM
-        b.ne            5f
-        // !LR_HAVE_BOTTOM
-        // Produce two more rows, extending the already loaded rows.
-        add3
-        mov             v16.16b, v18.16b
-        mov             v17.16b, v19.16b
-        mov             v24.16b, v25.16b
-        add3
-
-5:      // End of one vertical slice.
-        subs            w2,  w2,  #8
-        b.le            0f
-        // Move pointers back up to the top and loop horizontally.
-        // Input pointers
-        msub            x5,  x7,  x11, x5
-        msub            x6,  x8,  x11, x6
-        // Output pointers
-        msub            x0,  x7,  x10, x0
-        msub            x1,  x8,  x10, x1
-        add             x0,  x0,  #32
-        add             x1,  x1,  #16
-        add             x5,  x5,  #32
-        add             x6,  x6,  #16
-        mov             w3,  w9
-        b               1b
-
-0:
-        ret
-.purgem add3
-endfunc
-
-// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
-//                            const int w, const int h,
-//                            const enum LrEdgeFlags edges);
-function sgr_box5_v_neon, export=1
-        add             w10, w3,  #2 // Number of output rows to move back
-        mov             w11, w3      // Number of input rows to move back
-        add             w2,  w2,  #8 // Actual summed width
-        mov             x7,       #(4*SUM_STRIDE) // sumsq stride
-        mov             x8,       #(2*SUM_STRIDE) // sum stride
-        sub             x0,  x0,  #(4*SUM_STRIDE) // sumsq -= stride
-        sub             x1,  x1,  #(2*SUM_STRIDE) // sum   -= stride
-
-        tst             w4,  #4 // LR_HAVE_TOP
-        b.eq            0f
-        // If have top, read from row -2.
-        sub             x5,  x0,  #(4*SUM_STRIDE)
-        sub             x6,  x1,  #(2*SUM_STRIDE)
-        add             w11, w11, #2
-        b               1f
-0:
-        // !LR_HAVE_TOP
-        // If we don't have top, read from row 0 even if
-        // we start writing to row -1.
-        add             x5,  x0,  #(4*SUM_STRIDE)
-        add             x6,  x1,  #(2*SUM_STRIDE)
-1:
-
-        tst             w4,  #8 // LR_HAVE_BOTTOM
-        b.eq            0f
-        // LR_HAVE_BOTTOM
-        add             w3,  w3,  #2  // Handle h+2 lines with the main loop
-        add             w11, w11, #2
-        b               1f
-0:
-        // !LR_HAVE_BOTTOM
-        sub             w3,  w3,  #1  // Handle h-1 lines with the main loop
-1:
-        mov             w9,  w3       // Backup of h for next loops
-
-1:
-        // Start of horizontal loop; start one vertical filter slice.
-        // Start loading rows into v16-v25 and v26-v30 taking top
-        // padding into consideration.
-        tst             w4,  #4 // LR_HAVE_TOP
-        ld1             {v16.4s, v17.4s}, [x5], x7
-        ld1             {v26.8h},         [x6], x8
-        b.eq            2f
-        // LR_HAVE_TOP
-        ld1             {v20.4s, v21.4s}, [x5], x7
-        ld1             {v28.8h},         [x6], x8
-        mov             v18.16b, v16.16b
-        mov             v19.16b, v17.16b
-        mov             v27.16b, v26.16b
-        ld1             {v22.4s, v23.4s}, [x5], x7
-        ld1             {v29.8h},         [x6], x8
-        b               3f
-2:      // !LR_HAVE_TOP
-        mov             v18.16b, v16.16b
-        mov             v19.16b, v17.16b
-        mov             v27.16b, v26.16b
-        mov             v20.16b, v16.16b
-        mov             v21.16b, v17.16b
-        mov             v28.16b, v26.16b
-        mov             v22.16b, v16.16b
-        mov             v23.16b, v17.16b
-        mov             v29.16b, v26.16b
-
-3:
-        cbz             w3,  4f
-        ld1             {v24.4s, v25.4s}, [x5], x7
-        ld1             {v30.8h},         [x6], x8
-
-3:
-        // Start of vertical loop
-        subs            w3,  w3,  #2
-.macro add5
-        add             v16.4s,  v16.4s,  v18.4s
-        add             v17.4s,  v17.4s,  v19.4s
-        add             v26.8h,  v26.8h,  v27.8h
-        add             v0.4s,   v20.4s,  v22.4s
-        add             v1.4s,   v21.4s,  v23.4s
-        add             v2.8h,   v28.8h,  v29.8h
-        add             v16.4s,  v16.4s,  v24.4s
-        add             v17.4s,  v17.4s,  v25.4s
-        add             v26.8h,  v26.8h,  v30.8h
-        add             v16.4s,  v16.4s,  v0.4s
-        add             v17.4s,  v17.4s,  v1.4s
-        add             v26.8h,  v26.8h,  v2.8h
-        st1             {v16.4s, v17.4s}, [x0], x7
-        st1             {v26.8h},         [x1], x8
-.endm
-        add5
-.macro shift2
-        mov             v16.16b, v20.16b
-        mov             v17.16b, v21.16b
-        mov             v26.16b, v28.16b
-        mov             v18.16b, v22.16b
-        mov             v19.16b, v23.16b
-        mov             v27.16b, v29.16b
-        mov             v20.16b, v24.16b
-        mov             v21.16b, v25.16b
-        mov             v28.16b, v30.16b
-.endm
-        shift2
-        add             x0,  x0,  x7
-        add             x1,  x1,  x8
-        b.le            5f
-        ld1             {v22.4s, v23.4s}, [x5], x7
-        ld1             {v29.8h},         [x6], x8
-        ld1             {v24.4s, v25.4s}, [x5], x7
-        ld1             {v30.8h},         [x6], x8
-        b               3b
-
-4:
-        // h == 1, !LR_HAVE_BOTTOM.
-        // Pad the last row with the only content row, and add.
-        mov             v24.16b, v22.16b
-        mov             v25.16b, v23.16b
-        mov             v30.16b, v29.16b
-        add5
-        shift2
-        add             x0,  x0,  x7
-        add             x1,  x1,  x8
-        add5
-        b               6f
-
-5:
-        tst             w4,  #8 // LR_HAVE_BOTTOM
-        b.ne            6f
-        // !LR_HAVE_BOTTOM
-        cbnz            w3,  5f
-        // The intended three edge rows left; output the one at h-2 and
-        // the past edge one at h.
-        ld1             {v22.4s, v23.4s}, [x5], x7
-        ld1             {v29.8h},         [x6], x8
-        // Pad the past-edge row from the last content row.
-        mov             v24.16b, v22.16b
-        mov             v25.16b, v23.16b
-        mov             v30.16b, v29.16b
-        add5
-        shift2
-        add             x0,  x0,  x7
-        add             x1,  x1,  x8
-        // The last two rows are already padded properly here.
-        add5
-        b               6f
-
-5:
-        // w3 == -1, two rows left, output one.
-        // Pad the last two rows from the mid one.
-        mov             v22.16b, v20.16b
-        mov             v23.16b, v21.16b
-        mov             v29.16b, v28.16b
-        mov             v24.16b, v20.16b
-        mov             v25.16b, v21.16b
-        mov             v30.16b, v28.16b
-        add5
-        add             x0,  x0,  x7
-        add             x1,  x1,  x8
-        b               6f
-
-6:      // End of one vertical slice.
-        subs            w2,  w2,  #8
-        b.le            0f
-        // Move pointers back up to the top and loop horizontally.
-        // Input pointers
-        msub            x5,  x7,  x11, x5
-        msub            x6,  x8,  x11, x6
-        // Output pointers
-        msub            x0,  x7,  x10, x0
-        msub            x1,  x8,  x10, x1
-        add             x0,  x0,  #32
-        add             x1,  x1,  #16
-        add             x5,  x5,  #32
-        add             x6,  x6,  #16
-        mov             w3,  w9
-        b               1b
-
-0:
-        ret
-.purgem add5
-endfunc
-
-// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
-//                              const int w, const int h, const int strength);
-// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
-//                              const int w, const int h, const int strength);
-function sgr_calc_ab1_neon, export=1
-        add             x3,  x3,  #2 // h += 2
-        movi            v31.4s,   #9 // n
-        mov             x5,  #455
-        mov             x8,  #SUM_STRIDE
-        b               sgr_calc_ab_neon
-endfunc
-
-function sgr_calc_ab2_neon, export=1
-        add             x3,  x3,  #3  // h += 3
-        asr             x3,  x3,  #1  // h /= 2
-        movi            v31.4s,   #25 // n
-        mov             x5,  #164
-        mov             x8,  #(2*SUM_STRIDE)
-endfunc
-
-function sgr_calc_ab_neon
-        movrel          x12, X(sgr_x_by_x)
-        ld1             {v16.16b, v17.16b, v18.16b}, [x12]
-        movi            v19.16b,  #5
-        movi            v20.8b,   #55  // idx of last 5
-        movi            v21.8b,   #72  // idx of last 4
-        movi            v22.8b,   #101 // idx of last 3
-        movi            v23.8b,   #169 // idx of last 2
-        movi            v24.8b,   #254 // idx of last 1
-        add             x2,  x2,  #2 // w += 2
-        add             x7,  x2,  #7
-        bic             x7,  x7,  #7 // aligned w
-        sub             x7,  x8,  x7 // increment between rows
-        movi            v29.8h,   #1, lsl #8
-        dup             v28.4s,   w4
-        dup             v30.4s,   w5 // one_by_x
-        sub             x0,  x0,  #(4*(SUM_STRIDE))
-        sub             x1,  x1,  #(2*(SUM_STRIDE))
-        mov             x6,  x2   // backup of w
-        sub             v16.16b, v16.16b, v19.16b
-        sub             v17.16b, v17.16b, v19.16b
-        sub             v18.16b, v18.16b, v19.16b
-1:
-        subs            x2,  x2,  #8
-        ld1             {v0.4s, v1.4s}, [x0]   // a
-        ld1             {v2.8h}, [x1]          // b
-        mul             v0.4s,  v0.4s,  v31.4s // a * n
-        mul             v1.4s,  v1.4s,  v31.4s // a * n
-        umull           v3.4s,  v2.4h,  v2.4h  // b * b
-        umull2          v4.4s,  v2.8h,  v2.8h  // b * b
-        uqsub           v0.4s,  v0.4s,  v3.4s  // imax(a * n - b * b, 0)
-        uqsub           v1.4s,  v1.4s,  v4.4s  // imax(a * n - b * b, 0)
-        mul             v0.4s,  v0.4s,  v28.4s // p * s
-        mul             v1.4s,  v1.4s,  v28.4s // p * s
-        uqshrn          v0.4h,  v0.4s,  #16
-        uqshrn2         v0.8h,  v1.4s,  #16
-        uqrshrn         v0.8b,  v0.8h,  #4     // imin(z, 255)
-
-        cmhi            v25.8b, v0.8b,  v20.8b // = -1 if sgr_x_by_x[v0] < 5
-        cmhi            v26.8b, v0.8b,  v21.8b // = -1 if sgr_x_by_x[v0] < 4
-        tbl             v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
-        cmhi            v27.8b, v0.8b,  v22.8b // = -1 if sgr_x_by_x[v0] < 3
-        cmhi            v5.8b,  v0.8b,  v23.8b // = -1 if sgr_x_by_x[v0] < 2
-        add             v25.8b, v25.8b, v26.8b
-        cmhi            v6.8b,  v0.8b,  v24.8b // = -1 if sgr_x_by_x[v0] < 1
-        add             v27.8b, v27.8b, v5.8b
-        add             v6.8b,  v6.8b,  v19.8b
-        add             v25.8b, v25.8b, v27.8b
-        add             v1.8b,  v1.8b,  v6.8b
-        add             v1.8b,  v1.8b,  v25.8b
-        uxtl            v1.8h,  v1.8b          // x
-
-        umull           v3.4s,  v1.4h,  v2.4h  // x * BB[i]
-        umull2          v4.4s,  v1.8h,  v2.8h  // x * BB[i]
-        mul             v3.4s,  v3.4s,  v30.4s // x * BB[i] * sgr_one_by_x
-        mul             v4.4s,  v4.4s,  v30.4s // x * BB[i] * sgr_one_by_x
-        srshr           v3.4s,  v3.4s,  #12    // AA[i]
-        srshr           v4.4s,  v4.4s,  #12    // AA[i]
-        sub             v2.8h,  v29.8h, v1.8h  // 256 - x
-
-        st1             {v3.4s, v4.4s}, [x0], #32
-        st1             {v2.8h}, [x1], #16
-        b.gt            1b
-
-        subs            x3,  x3,  #1
-        b.le            0f
-        add             x0,  x0,  x7, lsl #2
-        add             x1,  x1,  x7, lsl #1
-        mov             x2,  x6
-        b               1b
-0:
-        ret
-endfunc
-
-#define FILTER_OUT_STRIDE 384
-
-// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
-//                                         const pixel *src, const ptrdiff_t stride,
-//                                         const int32_t *a, const int16_t *b,
-//                                         const int w, const int h);
-function sgr_finish_filter1_8bpc_neon, export=1
-        sub             x7,  x3,  #(4*SUM_STRIDE)
-        add             x8,  x3,  #(4*SUM_STRIDE)
-        sub             x9,  x4,  #(2*SUM_STRIDE)
-        add             x10, x4,  #(2*SUM_STRIDE)
-        mov             x11, #SUM_STRIDE
-        mov             x12, #FILTER_OUT_STRIDE
-        add             x13, x5,  #7
-        bic             x13, x13, #7 // Aligned width
-        sub             x2,  x2,  x13
-        sub             x12, x12, x13
-        sub             x11, x11, x13
-        sub             x11, x11, #4 // We read 4 extra elements from a
-        sub             x14, x11, #4 // We read 8 extra elements from b
-        mov             x13, x5
-        movi            v6.8h,  #3
-        movi            v7.4s,  #3
-1:
-        ld1             {v0.8h, v1.8h}, [x9], #32
-        ld1             {v2.8h, v3.8h}, [x4], #32
-        ld1             {v4.8h, v5.8h}, [x10], #32
-        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
-        ld1             {v19.4s, v20.4s, v21.4s}, [x3], #48
-        ld1             {v22.4s, v23.4s, v24.4s}, [x8], #48
-
-2:
-        subs            x5,  x5,  #8
-        ext             v25.16b, v0.16b,  v1.16b, #2  // -stride
-        ext             v26.16b, v2.16b,  v3.16b, #2  // 0
-        ext             v27.16b, v4.16b,  v5.16b, #2  // +stride
-        ext             v28.16b, v0.16b,  v1.16b, #4  // +1-stride
-        ext             v29.16b, v2.16b,  v3.16b, #4  // +1
-        ext             v30.16b, v4.16b,  v5.16b, #4  // +1+stride
-        add             v2.8h,   v2.8h,   v25.8h      // -1, -stride
-        add             v26.8h,  v26.8h,  v27.8h      // 0, +stride
-        add             v0.8h,   v0.8h,   v28.8h      // -1-stride, +1-stride
-        add             v2.8h,   v2.8h,   v26.8h
-        add             v4.8h,   v4.8h,   v30.8h      // -1+stride, +1+stride
-        add             v2.8h,   v2.8h,   v29.8h      // +1
-        add             v0.8h,   v0.8h,   v4.8h
-
-        ext             v25.16b, v16.16b, v17.16b, #4 // -stride
-        ext             v26.16b, v17.16b, v18.16b, #4
-        shl             v2.8h,   v2.8h,   #2
-        ext             v27.16b, v16.16b, v17.16b, #8 // +1-stride
-        ext             v28.16b, v17.16b, v18.16b, #8
-        ext             v29.16b, v19.16b, v20.16b, #4 // 0
-        ext             v30.16b, v20.16b, v21.16b, #4
-        mla             v2.8h,   v0.8h,   v6.8h       // * 3 -> a
-        add             v25.4s,  v25.4s,  v19.4s      // -stride, -1
-        add             v26.4s,  v26.4s,  v20.4s
-        add             v16.4s,  v16.4s,  v27.4s      // -1-stride, +1-stride
-        add             v17.4s,  v17.4s,  v28.4s
-        ext             v27.16b, v19.16b, v20.16b, #8 // +1
-        ext             v28.16b, v20.16b, v21.16b, #8
-        add             v16.4s,  v16.4s,  v22.4s      // -1+stride
-        add             v17.4s,  v17.4s,  v23.4s
-        add             v29.4s,  v29.4s,  v27.4s      // 0, +1
-        add             v30.4s,  v30.4s,  v28.4s
-        add             v25.4s,  v25.4s,  v29.4s
-        add             v26.4s,  v26.4s,  v30.4s
-        ext             v27.16b, v22.16b, v23.16b, #4 // +stride
-        ext             v28.16b, v23.16b, v24.16b, #4
-        ext             v29.16b, v22.16b, v23.16b, #8 // +1+stride
-        ext             v30.16b, v23.16b, v24.16b, #8
-        ld1             {v19.8b}, [x1], #8            // src
-        add             v25.4s,  v25.4s,  v27.4s      // +stride
-        add             v26.4s,  v26.4s,  v28.4s
-        add             v16.4s,  v16.4s,  v29.4s      // +1+stride
-        add             v17.4s,  v17.4s,  v30.4s
-        shl             v25.4s,  v25.4s,  #2
-        shl             v26.4s,  v26.4s,  #2
-        mla             v25.4s,  v16.4s,  v7.4s       // * 3 -> b
-        mla             v26.4s,  v17.4s,  v7.4s
-        uxtl            v19.8h,  v19.8b               // src
-        mov             v0.16b,  v1.16b
-        umlal           v25.4s,  v2.4h,   v19.4h      // b + a * src
-        umlal2          v26.4s,  v2.8h,   v19.8h
-        mov             v2.16b,  v3.16b
-        rshrn           v25.4h,  v25.4s,  #9
-        rshrn2          v25.8h,  v26.4s,  #9
-        mov             v4.16b,  v5.16b
-        st1             {v25.8h}, [x0], #16
-
-        b.le            3f
-        mov             v16.16b, v18.16b
-        mov             v19.16b, v21.16b
-        mov             v22.16b, v24.16b
-        ld1             {v1.8h}, [x9], #16
-        ld1             {v3.8h}, [x4], #16
-        ld1             {v5.8h}, [x10], #16
-        ld1             {v17.4s, v18.4s}, [x7], #32
-        ld1             {v20.4s, v21.4s}, [x3], #32
-        ld1             {v23.4s, v24.4s}, [x8], #32
-        b               2b
-
-3:
-        subs            x6,  x6,  #1
-        b.le            0f
-        mov             x5,  x13
-        add             x0,  x0,  x12, lsl #1
-        add             x1,  x1,  x2
-        add             x3,  x3,  x11, lsl #2
-        add             x7,  x7,  x11, lsl #2
-        add             x8,  x8,  x11, lsl #2
-        add             x4,  x4,  x14, lsl #1
-        add             x9,  x9,  x14, lsl #1
-        add             x10, x10, x14, lsl #1
-        b               1b
-0:
-        ret
-endfunc
-
-// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
-//                                         const pixel *src, const ptrdiff_t stride,
-//                                         const int32_t *a, const int16_t *b,
-//                                         const int w, const int h);
-function sgr_finish_filter2_8bpc_neon, export=1
-        add             x7,  x3,  #(4*(SUM_STRIDE))
-        sub             x3,  x3,  #(4*(SUM_STRIDE))
-        add             x8,  x4,  #(2*(SUM_STRIDE))
-        sub             x4,  x4,  #(2*(SUM_STRIDE))
-        mov             x9,  #(2*SUM_STRIDE)
-        mov             x10, #FILTER_OUT_STRIDE
-        add             x11, x5,  #7
-        bic             x11, x11, #7 // Aligned width
-        sub             x2,  x2,  x11
-        sub             x10, x10, x11
-        sub             x9,  x9,  x11
-        sub             x9,  x9,  #4 // We read 4 extra elements from a
-        sub             x12, x9,  #4 // We read 8 extra elements from b
-        mov             x11, x5
-        movi            v4.8h,  #5
-        movi            v5.4s,  #5
-        movi            v6.8h,  #6
-        movi            v7.4s,  #6
-1:
-        ld1             {v0.8h, v1.8h}, [x4], #32
-        ld1             {v2.8h, v3.8h}, [x8], #32
-        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
-        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48
-
-2:
-        subs            x5,  x5,  #8
-        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride
-        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride
-        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride
-        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride
-        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride
-        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride
-        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride
-        add             v0.8h,   v0.8h,   v25.8h
-
-        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
-        ext             v23.16b, v17.16b, v18.16b, #4
-        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
-        ext             v25.16b, v20.16b, v21.16b, #4
-        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
-        ext             v27.16b, v17.16b, v18.16b, #8
-        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
-        ext             v29.16b, v20.16b, v21.16b, #8
-        mul             v0.8h,   v0.8h,   v4.8h       // * 5
-        mla             v0.8h,   v2.8h,   v6.8h       // * 6
-        ld1             {v31.8b}, [x1], #8
-        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride
-        add             v17.4s,  v17.4s,  v27.4s
-        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride
-        add             v20.4s,  v20.4s,  v29.4s
-        add             v16.4s,  v16.4s,  v19.4s
-        add             v17.4s,  v17.4s,  v20.4s
-
-        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride
-        add             v23.4s,  v23.4s,  v25.4s
-        // This is, surprisingly, faster than other variants where the
-        // mul+mla pairs are further apart, on Cortex A53.
-        mul             v16.4s,  v16.4s,  v5.4s       // * 5
-        mla             v16.4s,  v22.4s,  v7.4s       // * 6
-        mul             v17.4s,  v17.4s,  v5.4s       // * 5
-        mla             v17.4s,  v23.4s,  v7.4s       // * 6
-
-        uxtl            v31.8h,  v31.8b
-        umlal           v16.4s,  v0.4h,   v31.4h      // b + a * src
-        umlal2          v17.4s,  v0.8h,   v31.8h
-        mov             v0.16b,  v1.16b
-        rshrn           v16.4h,  v16.4s,  #9
-        rshrn2          v16.8h,  v17.4s,  #9
-        mov             v2.16b,  v3.16b
-        st1             {v16.8h}, [x0], #16
-
-        b.le            3f
-        mov             v16.16b, v18.16b
-        mov             v19.16b, v21.16b
-        ld1             {v1.8h}, [x4], #16
-        ld1             {v3.8h}, [x8], #16
-        ld1             {v17.4s, v18.4s}, [x3], #32
-        ld1             {v20.4s, v21.4s}, [x7], #32
-        b               2b
-
-3:
-        subs            x6,  x6,  #1
-        b.le            0f
-        mov             x5,  x11
-        add             x0,  x0,  x10, lsl #1
-        add             x1,  x1,  x2
-        add             x3,  x3,  x9, lsl #2
-        add             x7,  x7,  x9, lsl #2
-        add             x4,  x4,  x12, lsl #1
-        add             x8,  x8,  x12, lsl #1
-        mov             x13, x3
-        mov             x14, x4
-
-        ld1             {v0.8h, v1.8h}, [x4], #32
-        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
-
-4:
-        subs            x5,  x5,  #8
-        ext             v23.16b, v0.16b,  v1.16b, #4  // +1
-        ext             v22.16b, v0.16b,  v1.16b, #2  // 0
-        add             v0.8h,   v0.8h,   v23.8h      // -1, +1
-
-        ext             v24.16b, v16.16b, v17.16b, #4 // 0
-        ext             v25.16b, v17.16b, v18.16b, #4
-        ext             v26.16b, v16.16b, v17.16b, #8 // +1
-        ext             v27.16b, v17.16b, v18.16b, #8
-        mul             v2.8h,   v22.8h,  v6.8h       // * 6
-        mla             v2.8h,   v0.8h,   v4.8h       // * 5 -> a
-        ld1             {v31.8b}, [x1], #8
-        add             v16.4s,  v16.4s,  v26.4s      // -1, +1
-        add             v17.4s,  v17.4s,  v27.4s
-        uxtl            v31.8h,  v31.8b
-        // This is, surprisingly, faster than other variants where the
-        // mul+mla pairs are further apart, on Cortex A53.
-        mul             v24.4s,  v24.4s,  v7.4s       // * 6
-        mla             v24.4s,  v16.4s,  v5.4s       // * 5 -> b
-        mul             v25.4s,  v25.4s,  v7.4s       // * 6
-        mla             v25.4s,  v17.4s,  v5.4s       // * 5 -> b
-
-        umlal           v24.4s,  v2.4h,   v31.4h      // b + a * src
-        umlal2          v25.4s,  v2.8h,   v31.8h
-        mov             v0.16b,  v1.16b
-        rshrn           v24.4h,  v24.4s,  #8
-        rshrn2          v24.8h,  v25.4s,  #8
-        mov             v16.16b, v18.16b
-        st1             {v24.8h}, [x0], #16
-
-        b.le            5f
-        ld1             {v1.8h}, [x4], #16
-        ld1             {v17.4s, v18.4s}, [x3], #32
-        b               4b
-
-5:
-        subs            x6,  x6,  #1
-        b.le            0f
-        mov             x5,  x11
-        add             x0,  x0,  x10, lsl #1
-        add             x1,  x1,  x2
-        mov             x3,  x13 // Rewind x3/x4 to where they started
-        mov             x4,  x14
-        b               1b
-0:
-        ret
-endfunc
-
-// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
-//                                    const pixel *src, const ptrdiff_t src_stride,
-//                                    const int16_t *t1, const int w, const int h,
-//                                    const int wt);
-function sgr_weighted1_8bpc_neon, export=1
-        dup             v31.8h, w7
-        cmp             x6,  #2
-        add             x9,  x0,  x1
-        add             x10, x2,  x3
-        add             x11, x4,  #2*FILTER_OUT_STRIDE
-        mov             x7,  #(4*FILTER_OUT_STRIDE)
-        lsl             x1,  x1,  #1
-        lsl             x3,  x3,  #1
-        add             x8,  x5,  #7
-        bic             x8,  x8,  #7 // Aligned width
-        sub             x1,  x1,  x8
-        sub             x3,  x3,  x8
-        sub             x7,  x7,  x8, lsl #1
-        mov             x8,  x5
-        b.lt            2f
-1:
-        ld1             {v0.8b}, [x2],  #8
-        ld1             {v4.8b}, [x10], #8
-        ld1             {v1.8h}, [x4],  #16
-        ld1             {v5.8h}, [x11], #16
-        subs            x5,  x5,  #8
-        ushll           v0.8h,  v0.8b,  #4     // u
-        ushll           v4.8h,  v4.8b,  #4     // u
-        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
-        sub             v5.8h,  v5.8h,  v4.8h  // t1 - u
-        ushll           v2.4s,  v0.4h,  #7     // u << 7
-        ushll2          v3.4s,  v0.8h,  #7     // u << 7
-        ushll           v6.4s,  v4.4h,  #7     // u << 7
-        ushll2          v7.4s,  v4.8h,  #7     // u << 7
-        smlal           v2.4s,  v1.4h,  v31.4h // v
-        smlal2          v3.4s,  v1.8h,  v31.8h // v
-        smlal           v6.4s,  v5.4h,  v31.4h // v
-        smlal2          v7.4s,  v5.8h,  v31.8h // v
-        rshrn           v2.4h,  v2.4s,  #11
-        rshrn2          v2.8h,  v3.4s,  #11
-        rshrn           v6.4h,  v6.4s,  #11
-        rshrn2          v6.8h,  v7.4s,  #11
-        sqxtun          v2.8b,  v2.8h
-        sqxtun          v6.8b,  v6.8h
-        st1             {v2.8b}, [x0], #8
-        st1             {v6.8b}, [x9], #8
-        b.gt            1b
-
-        sub             x6,  x6,  #2
-        cmp             x6,  #1
-        b.lt            0f
-        mov             x5,  x8
-        add             x0,  x0,  x1
-        add             x9,  x9,  x1
-        add             x2,  x2,  x3
-        add             x10, x10, x3
-        add             x4,  x4,  x7
-        add             x11, x11, x7
-        b.eq            2f
-        b               1b
-
-2:
-        ld1             {v0.8b}, [x2], #8
-        ld1             {v1.8h}, [x4], #16
-        subs            x5,  x5,  #8
-        ushll           v0.8h,  v0.8b,  #4     // u
-        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
-        ushll           v2.4s,  v0.4h,  #7     // u << 7
-        ushll2          v3.4s,  v0.8h,  #7     // u << 7
-        smlal           v2.4s,  v1.4h,  v31.4h // v
-        smlal2          v3.4s,  v1.8h,  v31.8h // v
-        rshrn           v2.4h,  v2.4s,  #11
-        rshrn2          v2.8h,  v3.4s,  #11
-        sqxtun          v2.8b,  v2.8h
-        st1             {v2.8b}, [x0], #8
-        b.gt            2b
-0:
-        ret
-endfunc
-
-// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-//                                    const pixel *src, const ptrdiff_t src_stride,
-//                                    const int16_t *t1, const int16_t *t2,
-//                                    const int w, const int h,
-//                                    const int16_t wt[2]);
-function sgr_weighted2_8bpc_neon, export=1
-        ldr             x8,  [sp]
-        cmp             x7,  #2
-        add             x10, x0,  x1
-        add             x11, x2,  x3
-        add             x12, x4,  #2*FILTER_OUT_STRIDE
-        add             x13, x5,  #2*FILTER_OUT_STRIDE
-        ld2r            {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
-        mov             x8,  #4*FILTER_OUT_STRIDE
-        lsl             x1,  x1,  #1
-        lsl             x3,  x3,  #1
-        add             x9,  x6,  #7
-        bic             x9,  x9,  #7 // Aligned width
-        sub             x1,  x1,  x9
-        sub             x3,  x3,  x9
-        sub             x8,  x8,  x9, lsl #1
-        mov             x9,  x6
-        b.lt            2f
-1:
-        ld1             {v0.8b},  [x2],  #8
-        ld1             {v16.8b}, [x11], #8
-        ld1             {v1.8h},  [x4],  #16
-        ld1             {v17.8h}, [x12], #16
-        ld1             {v2.8h},  [x5],  #16
-        ld1             {v18.8h}, [x13], #16
-        subs            x6,  x6,  #8
-        ushll           v0.8h,  v0.8b,  #4     // u
-        ushll           v16.8h, v16.8b, #4     // u
-        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
-        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u
-        sub             v17.8h, v17.8h, v16.8h // t1 - u
-        sub             v18.8h, v18.8h, v16.8h // t2 - u
-        ushll           v3.4s,  v0.4h,  #7     // u << 7
-        ushll2          v4.4s,  v0.8h,  #7     // u << 7
-        ushll           v19.4s, v16.4h, #7     // u << 7
-        ushll2          v20.4s, v16.8h, #7     // u << 7
-        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)
-        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)
-        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)
-        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)
-        smlal           v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
-        smlal           v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
-        smlal2          v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
-        smlal2          v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
-        rshrn           v3.4h,  v3.4s,  #11
-        rshrn2          v3.8h,  v4.4s,  #11
-        rshrn           v19.4h, v19.4s, #11
-        rshrn2          v19.8h, v20.4s, #11
-        sqxtun          v3.8b,  v3.8h
-        sqxtun          v19.8b, v19.8h
-        st1             {v3.8b},  [x0],  #8
-        st1             {v19.8b}, [x10], #8
-        b.gt            1b
-
-        subs            x7,  x7,  #2
-        cmp             x7,  #1
-        b.lt            0f
-        mov             x6,  x9
-        add             x0,  x0,  x1
-        add             x10, x10, x1
-        add             x2,  x2,  x3
-        add             x11, x11, x3
-        add             x4,  x4,  x8
-        add             x12, x12, x8
-        add             x5,  x5,  x8
-        add             x13, x13, x8
-        b.eq            2f
-        b               1b
-
-2:
-        ld1             {v0.8b}, [x2], #8
-        ld1             {v1.8h}, [x4], #16
-        ld1             {v2.8h}, [x5], #16
-        subs            x6,  x6,  #8
-        ushll           v0.8h,  v0.8b,  #4     // u
-        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
-        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u
-        ushll           v3.4s,  v0.4h,  #7     // u << 7
-        ushll2          v4.4s,  v0.8h,  #7     // u << 7
-        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)
-        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)
-        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)
-        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)
-        rshrn           v3.4h,  v3.4s,  #11
-        rshrn2          v3.8h,  v4.4s,  #11
-        sqxtun          v3.8b,  v3.8h
-        st1             {v3.8b}, [x0], #8
-        b.gt            1b
-0:
-        ret
 endfunc
--- /dev/null
+++ b/src/arm/64/looprestoration_common.S
@@ -1,0 +1,422 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define SUM_STRIDE (384+16)
+
+// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box3_v_neon, export=1
+        add             w10, w3,  #2 // Number of output rows to move back
+        mov             w11, w3      // Number of input rows to move back
+        add             w2,  w2,  #2 // Actual summed width
+        mov             x7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             x8,       #(2*SUM_STRIDE) // sum stride
+        sub             x0,  x0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             x1,  x1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             w4,  #4 // LR_HAVE_TOP
+        b.eq            0f
+        // If have top, read from row -2.
+        sub             x5,  x0,  #(4*SUM_STRIDE)
+        sub             x6,  x1,  #(2*SUM_STRIDE)
+        add             w11, w11, #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             x5,  x0,  #(4*SUM_STRIDE)
+        add             x6,  x1,  #(2*SUM_STRIDE)
+1:
+
+        tst             w4,  #8 // LR_HAVE_BOTTOM
+        b.eq            1f
+        // LR_HAVE_BOTTOM
+        add             w3,  w3,  #2  // Sum all h+2 lines with the main loop
+        add             w11, w11, #2
+1:
+        mov             w9,  w3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into v16-v21 and v24-v26 taking top
+        // padding into consideration.
+        tst             w4,  #4 // LR_HAVE_TOP
+        ld1             {v16.4s, v17.4s}, [x5], x7
+        ld1             {v24.8h},         [x6], x8
+        b.eq            2f
+        // LR_HAVE_TOP
+        ld1             {v18.4s, v19.4s}, [x5], x7
+        ld1             {v25.8h},         [x6], x8
+        ld1             {v20.4s, v21.4s}, [x5], x7
+        ld1             {v26.8h},         [x6], x8
+        b               3f
+2:      // !LR_HAVE_TOP
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v17.16b
+        mov             v25.16b, v24.16b
+        mov             v20.16b, v16.16b
+        mov             v21.16b, v17.16b
+        mov             v26.16b, v24.16b
+
+3:
+        subs            w3,  w3,  #1
+.macro add3
+        add             v16.4s,  v16.4s,  v18.4s
+        add             v17.4s,  v17.4s,  v19.4s
+        add             v24.8h,  v24.8h,  v25.8h
+        add             v16.4s,  v16.4s,  v20.4s
+        add             v17.4s,  v17.4s,  v21.4s
+        add             v24.8h,  v24.8h,  v26.8h
+        st1             {v16.4s, v17.4s}, [x0], x7
+        st1             {v24.8h},         [x1], x8
+.endm
+        add3
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        mov             v24.16b, v25.16b
+        mov             v18.16b, v20.16b
+        mov             v19.16b, v21.16b
+        mov             v25.16b, v26.16b
+        b.le            4f
+        ld1             {v20.4s, v21.4s}, [x5], x7
+        ld1             {v26.8h},         [x6], x8
+        b               3b
+
+4:
+        tst             w4,  #8 // LR_HAVE_BOTTOM
+        b.ne            5f
+        // !LR_HAVE_BOTTOM
+        // Produce two more rows, extending the already loaded rows.
+        add3
+        mov             v16.16b, v18.16b
+        mov             v17.16b, v19.16b
+        mov             v24.16b, v25.16b
+        add3
+
+5:      // End of one vertical slice.
+        subs            w2,  w2,  #8
+        b.le            0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        msub            x5,  x7,  x11, x5
+        msub            x6,  x8,  x11, x6
+        // Output pointers
+        msub            x0,  x7,  x10, x0
+        msub            x1,  x8,  x10, x1
+        add             x0,  x0,  #32
+        add             x1,  x1,  #16
+        add             x5,  x5,  #32
+        add             x6,  x6,  #16
+        mov             w3,  w9
+        b               1b
+
+0:
+        ret
+.purgem add3
+endfunc
+
+// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
+//                            const int w, const int h,
+//                            const enum LrEdgeFlags edges);
+function sgr_box5_v_neon, export=1
+        add             w10, w3,  #2 // Number of output rows to move back
+        mov             w11, w3      // Number of input rows to move back
+        add             w2,  w2,  #8 // Actual summed width
+        mov             x7,       #(4*SUM_STRIDE) // sumsq stride
+        mov             x8,       #(2*SUM_STRIDE) // sum stride
+        sub             x0,  x0,  #(4*SUM_STRIDE) // sumsq -= stride
+        sub             x1,  x1,  #(2*SUM_STRIDE) // sum   -= stride
+
+        tst             w4,  #4 // LR_HAVE_TOP
+        b.eq            0f
+        // If have top, read from row -2.
+        sub             x5,  x0,  #(4*SUM_STRIDE)
+        sub             x6,  x1,  #(2*SUM_STRIDE)
+        add             w11, w11, #2
+        b               1f
+0:
+        // !LR_HAVE_TOP
+        // If we don't have top, read from row 0 even if
+        // we start writing to row -1.
+        add             x5,  x0,  #(4*SUM_STRIDE)
+        add             x6,  x1,  #(2*SUM_STRIDE)
+1:
+
+        tst             w4,  #8 // LR_HAVE_BOTTOM
+        b.eq            0f
+        // LR_HAVE_BOTTOM
+        add             w3,  w3,  #2  // Handle h+2 lines with the main loop
+        add             w11, w11, #2
+        b               1f
+0:
+        // !LR_HAVE_BOTTOM
+        sub             w3,  w3,  #1  // Handle h-1 lines with the main loop
+1:
+        mov             w9,  w3       // Backup of h for next loops
+
+1:
+        // Start of horizontal loop; start one vertical filter slice.
+        // Start loading rows into v16-v25 and v26-v30 taking top
+        // padding into consideration.
+        tst             w4,  #4 // LR_HAVE_TOP
+        ld1             {v16.4s, v17.4s}, [x5], x7
+        ld1             {v26.8h},         [x6], x8
+        b.eq            2f
+        // LR_HAVE_TOP
+        ld1             {v20.4s, v21.4s}, [x5], x7
+        ld1             {v28.8h},         [x6], x8
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v17.16b
+        mov             v27.16b, v26.16b
+        ld1             {v22.4s, v23.4s}, [x5], x7
+        ld1             {v29.8h},         [x6], x8
+        b               3f
+2:      // !LR_HAVE_TOP
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v17.16b
+        mov             v27.16b, v26.16b
+        mov             v20.16b, v16.16b
+        mov             v21.16b, v17.16b
+        mov             v28.16b, v26.16b
+        mov             v22.16b, v16.16b
+        mov             v23.16b, v17.16b
+        mov             v29.16b, v26.16b
+
+3:
+        cbz             w3,  4f
+        ld1             {v24.4s, v25.4s}, [x5], x7
+        ld1             {v30.8h},         [x6], x8
+
+3:
+        // Start of vertical loop
+        subs            w3,  w3,  #2
+.macro add5
+        add             v16.4s,  v16.4s,  v18.4s
+        add             v17.4s,  v17.4s,  v19.4s
+        add             v26.8h,  v26.8h,  v27.8h
+        add             v0.4s,   v20.4s,  v22.4s
+        add             v1.4s,   v21.4s,  v23.4s
+        add             v2.8h,   v28.8h,  v29.8h
+        add             v16.4s,  v16.4s,  v24.4s
+        add             v17.4s,  v17.4s,  v25.4s
+        add             v26.8h,  v26.8h,  v30.8h
+        add             v16.4s,  v16.4s,  v0.4s
+        add             v17.4s,  v17.4s,  v1.4s
+        add             v26.8h,  v26.8h,  v2.8h
+        st1             {v16.4s, v17.4s}, [x0], x7
+        st1             {v26.8h},         [x1], x8
+.endm
+        add5
+.macro shift2
+        mov             v16.16b, v20.16b
+        mov             v17.16b, v21.16b
+        mov             v26.16b, v28.16b
+        mov             v18.16b, v22.16b
+        mov             v19.16b, v23.16b
+        mov             v27.16b, v29.16b
+        mov             v20.16b, v24.16b
+        mov             v21.16b, v25.16b
+        mov             v28.16b, v30.16b
+.endm
+        shift2
+        add             x0,  x0,  x7
+        add             x1,  x1,  x8
+        b.le            5f
+        ld1             {v22.4s, v23.4s}, [x5], x7
+        ld1             {v29.8h},         [x6], x8
+        ld1             {v24.4s, v25.4s}, [x5], x7
+        ld1             {v30.8h},         [x6], x8
+        b               3b
+
+4:
+        // h == 1, !LR_HAVE_BOTTOM.
+        // Pad the last row with the only content row, and add.
+        mov             v24.16b, v22.16b
+        mov             v25.16b, v23.16b
+        mov             v30.16b, v29.16b
+        add5
+        shift2
+        add             x0,  x0,  x7
+        add             x1,  x1,  x8
+        add5
+        b               6f
+
+5:
+        tst             w4,  #8 // LR_HAVE_BOTTOM
+        b.ne            6f
+        // !LR_HAVE_BOTTOM
+        cbnz            w3,  5f
+        // The intended three edge rows left; output the one at h-2 and
+        // the past edge one at h.
+        ld1             {v22.4s, v23.4s}, [x5], x7
+        ld1             {v29.8h},         [x6], x8
+        // Pad the past-edge row from the last content row.
+        mov             v24.16b, v22.16b
+        mov             v25.16b, v23.16b
+        mov             v30.16b, v29.16b
+        add5
+        shift2
+        add             x0,  x0,  x7
+        add             x1,  x1,  x8
+        // The last two rows are already padded properly here.
+        add5
+        b               6f
+
+5:
+        // w3 == -1, two rows left, output one.
+        // Pad the last two rows from the mid one.
+        mov             v22.16b, v20.16b
+        mov             v23.16b, v21.16b
+        mov             v29.16b, v28.16b
+        mov             v24.16b, v20.16b
+        mov             v25.16b, v21.16b
+        mov             v30.16b, v28.16b
+        add5
+        add             x0,  x0,  x7
+        add             x1,  x1,  x8
+        b               6f
+
+6:      // End of one vertical slice.
+        subs            w2,  w2,  #8
+        b.le            0f
+        // Move pointers back up to the top and loop horizontally.
+        // Input pointers
+        msub            x5,  x7,  x11, x5
+        msub            x6,  x8,  x11, x6
+        // Output pointers
+        msub            x0,  x7,  x10, x0
+        msub            x1,  x8,  x10, x1
+        add             x0,  x0,  #32
+        add             x1,  x1,  #16
+        add             x5,  x5,  #32
+        add             x6,  x6,  #16
+        mov             w3,  w9
+        b               1b
+
+0:
+        ret
+.purgem add5
+endfunc
+
+// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength);
+// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
+//                              const int w, const int h, const int strength);
+function sgr_calc_ab1_neon, export=1
+        add             x3,  x3,  #2 // h += 2
+        movi            v31.4s,   #9 // n
+        mov             x5,  #455
+        mov             x8,  #SUM_STRIDE
+        b               sgr_calc_ab_neon
+endfunc
+
+function sgr_calc_ab2_neon, export=1
+        add             x3,  x3,  #3  // h += 3
+        asr             x3,  x3,  #1  // h /= 2
+        movi            v31.4s,   #25 // n
+        mov             x5,  #164
+        mov             x8,  #(2*SUM_STRIDE)
+endfunc
+
+function sgr_calc_ab_neon
+        movrel          x12, X(sgr_x_by_x)
+        ld1             {v16.16b, v17.16b, v18.16b}, [x12]
+        movi            v19.16b,  #5
+        movi            v20.8b,   #55  // idx of last 5
+        movi            v21.8b,   #72  // idx of last 4
+        movi            v22.8b,   #101 // idx of last 3
+        movi            v23.8b,   #169 // idx of last 2
+        movi            v24.8b,   #254 // idx of last 1
+        add             x2,  x2,  #2 // w += 2
+        add             x7,  x2,  #7
+        bic             x7,  x7,  #7 // aligned w
+        sub             x7,  x8,  x7 // increment between rows
+        movi            v29.8h,   #1, lsl #8
+        dup             v28.4s,   w4
+        dup             v30.4s,   w5 // one_by_x
+        sub             x0,  x0,  #(4*(SUM_STRIDE))
+        sub             x1,  x1,  #(2*(SUM_STRIDE))
+        mov             x6,  x2   // backup of w
+        sub             v16.16b, v16.16b, v19.16b
+        sub             v17.16b, v17.16b, v19.16b
+        sub             v18.16b, v18.16b, v19.16b
+1:
+        subs            x2,  x2,  #8
+        ld1             {v0.4s, v1.4s}, [x0]   // a
+        ld1             {v2.8h}, [x1]          // b
+        mul             v0.4s,  v0.4s,  v31.4s // a * n
+        mul             v1.4s,  v1.4s,  v31.4s // a * n
+        umull           v3.4s,  v2.4h,  v2.4h  // b * b
+        umull2          v4.4s,  v2.8h,  v2.8h  // b * b
+        uqsub           v0.4s,  v0.4s,  v3.4s  // imax(a * n - b * b, 0)
+        uqsub           v1.4s,  v1.4s,  v4.4s  // imax(a * n - b * b, 0)
+        mul             v0.4s,  v0.4s,  v28.4s // p * s
+        mul             v1.4s,  v1.4s,  v28.4s // p * s
+        uqshrn          v0.4h,  v0.4s,  #16
+        uqshrn2         v0.8h,  v1.4s,  #16
+        uqrshrn         v0.8b,  v0.8h,  #4     // imin(z, 255)
+
+        cmhi            v25.8b, v0.8b,  v20.8b // = -1 if sgr_x_by_x[v0] < 5
+        cmhi            v26.8b, v0.8b,  v21.8b // = -1 if sgr_x_by_x[v0] < 4
+        tbl             v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
+        cmhi            v27.8b, v0.8b,  v22.8b // = -1 if sgr_x_by_x[v0] < 3
+        cmhi            v5.8b,  v0.8b,  v23.8b // = -1 if sgr_x_by_x[v0] < 2
+        add             v25.8b, v25.8b, v26.8b
+        cmhi            v6.8b,  v0.8b,  v24.8b // = -1 if sgr_x_by_x[v0] < 1
+        add             v27.8b, v27.8b, v5.8b
+        add             v6.8b,  v6.8b,  v19.8b
+        add             v25.8b, v25.8b, v27.8b
+        add             v1.8b,  v1.8b,  v6.8b
+        add             v1.8b,  v1.8b,  v25.8b
+        uxtl            v1.8h,  v1.8b          // x
+
+        umull           v3.4s,  v1.4h,  v2.4h  // x * BB[i]
+        umull2          v4.4s,  v1.8h,  v2.8h  // x * BB[i]
+        mul             v3.4s,  v3.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        mul             v4.4s,  v4.4s,  v30.4s // x * BB[i] * sgr_one_by_x
+        srshr           v3.4s,  v3.4s,  #12    // AA[i]
+        srshr           v4.4s,  v4.4s,  #12    // AA[i]
+        sub             v2.8h,  v29.8h, v1.8h  // 256 - x
+
+        st1             {v3.4s, v4.4s}, [x0], #32
+        st1             {v2.8h}, [x1], #16
+        b.gt            1b
+
+        subs            x3,  x3,  #1
+        b.le            0f
+        add             x0,  x0,  x7, lsl #2
+        add             x1,  x1,  x7, lsl #1
+        mov             x2,  x6
+        b               1b
+0:
+        ret
+endfunc
--- /dev/null
+++ b/src/arm/64/looprestoration_tmpl.S
@@ -1,0 +1,474 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+#define FILTER_OUT_STRIDE 384
+
+// void dav1d_sgr_finish_filter1_8bpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
+function sgr_finish_filter1_8bpc_neon, export=1
+        sub             x7,  x3,  #(4*SUM_STRIDE)
+        add             x8,  x3,  #(4*SUM_STRIDE)
+        sub             x9,  x4,  #(2*SUM_STRIDE)
+        add             x10, x4,  #(2*SUM_STRIDE)
+        mov             x11, #SUM_STRIDE
+        mov             x12, #FILTER_OUT_STRIDE
+        add             x13, x5,  #7
+        bic             x13, x13, #7 // Aligned width
+        sub             x2,  x2,  x13
+        sub             x12, x12, x13
+        sub             x11, x11, x13
+        sub             x11, x11, #4 // We read 4 extra elements from a
+        sub             x14, x11, #4 // We read 8 extra elements from b
+        mov             x13, x5
+        movi            v6.8h,  #3
+        movi            v7.4s,  #3
+1:
+        ld1             {v0.8h, v1.8h}, [x9], #32
+        ld1             {v2.8h, v3.8h}, [x4], #32
+        ld1             {v4.8h, v5.8h}, [x10], #32
+        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
+        ld1             {v19.4s, v20.4s, v21.4s}, [x3], #48
+        ld1             {v22.4s, v23.4s, v24.4s}, [x8], #48
+
+2:
+        subs            x5,  x5,  #8
+        ext             v25.16b, v0.16b,  v1.16b, #2  // -stride
+        ext             v26.16b, v2.16b,  v3.16b, #2  // 0
+        ext             v27.16b, v4.16b,  v5.16b, #2  // +stride
+        ext             v28.16b, v0.16b,  v1.16b, #4  // +1-stride
+        ext             v29.16b, v2.16b,  v3.16b, #4  // +1
+        ext             v30.16b, v4.16b,  v5.16b, #4  // +1+stride
+        add             v2.8h,   v2.8h,   v25.8h      // -1, -stride
+        add             v26.8h,  v26.8h,  v27.8h      // 0, +stride
+        add             v0.8h,   v0.8h,   v28.8h      // -1-stride, +1-stride
+        add             v2.8h,   v2.8h,   v26.8h
+        add             v4.8h,   v4.8h,   v30.8h      // -1+stride, +1+stride
+        add             v2.8h,   v2.8h,   v29.8h      // +1
+        add             v0.8h,   v0.8h,   v4.8h
+
+        ext             v25.16b, v16.16b, v17.16b, #4 // -stride
+        ext             v26.16b, v17.16b, v18.16b, #4
+        shl             v2.8h,   v2.8h,   #2
+        ext             v27.16b, v16.16b, v17.16b, #8 // +1-stride
+        ext             v28.16b, v17.16b, v18.16b, #8
+        ext             v29.16b, v19.16b, v20.16b, #4 // 0
+        ext             v30.16b, v20.16b, v21.16b, #4
+        mla             v2.8h,   v0.8h,   v6.8h       // * 3 -> a
+        add             v25.4s,  v25.4s,  v19.4s      // -stride, -1
+        add             v26.4s,  v26.4s,  v20.4s
+        add             v16.4s,  v16.4s,  v27.4s      // -1-stride, +1-stride
+        add             v17.4s,  v17.4s,  v28.4s
+        ext             v27.16b, v19.16b, v20.16b, #8 // +1
+        ext             v28.16b, v20.16b, v21.16b, #8
+        add             v16.4s,  v16.4s,  v22.4s      // -1+stride
+        add             v17.4s,  v17.4s,  v23.4s
+        add             v29.4s,  v29.4s,  v27.4s      // 0, +1
+        add             v30.4s,  v30.4s,  v28.4s
+        add             v25.4s,  v25.4s,  v29.4s
+        add             v26.4s,  v26.4s,  v30.4s
+        ext             v27.16b, v22.16b, v23.16b, #4 // +stride
+        ext             v28.16b, v23.16b, v24.16b, #4
+        ext             v29.16b, v22.16b, v23.16b, #8 // +1+stride
+        ext             v30.16b, v23.16b, v24.16b, #8
+        ld1             {v19.8b}, [x1], #8            // src
+        add             v25.4s,  v25.4s,  v27.4s      // +stride
+        add             v26.4s,  v26.4s,  v28.4s
+        add             v16.4s,  v16.4s,  v29.4s      // +1+stride
+        add             v17.4s,  v17.4s,  v30.4s
+        shl             v25.4s,  v25.4s,  #2
+        shl             v26.4s,  v26.4s,  #2
+        mla             v25.4s,  v16.4s,  v7.4s       // * 3 -> b
+        mla             v26.4s,  v17.4s,  v7.4s
+        uxtl            v19.8h,  v19.8b               // src
+        mov             v0.16b,  v1.16b
+        umlal           v25.4s,  v2.4h,   v19.4h      // b + a * src
+        umlal2          v26.4s,  v2.8h,   v19.8h
+        mov             v2.16b,  v3.16b
+        rshrn           v25.4h,  v25.4s,  #9
+        rshrn2          v25.8h,  v26.4s,  #9
+        mov             v4.16b,  v5.16b
+        st1             {v25.8h}, [x0], #16
+
+        b.le            3f
+        mov             v16.16b, v18.16b
+        mov             v19.16b, v21.16b
+        mov             v22.16b, v24.16b
+        ld1             {v1.8h}, [x9], #16
+        ld1             {v3.8h}, [x4], #16
+        ld1             {v5.8h}, [x10], #16
+        ld1             {v17.4s, v18.4s}, [x7], #32
+        ld1             {v20.4s, v21.4s}, [x3], #32
+        ld1             {v23.4s, v24.4s}, [x8], #32
+        b               2b
+
+3:
+        subs            x6,  x6,  #1
+        b.le            0f
+        mov             x5,  x13
+        add             x0,  x0,  x12, lsl #1
+        add             x1,  x1,  x2
+        add             x3,  x3,  x11, lsl #2
+        add             x7,  x7,  x11, lsl #2
+        add             x8,  x8,  x11, lsl #2
+        add             x4,  x4,  x14, lsl #1
+        add             x9,  x9,  x14, lsl #1
+        add             x10, x10, x14, lsl #1
+        b               1b
+0:
+        ret
+endfunc
+
+// void dav1d_sgr_finish_filter2_8bpc_neon(int16_t *tmp,
+//                                         const pixel *src, const ptrdiff_t stride,
+//                                         const int32_t *a, const int16_t *b,
+//                                         const int w, const int h);
+function sgr_finish_filter2_8bpc_neon, export=1
+        add             x7,  x3,  #(4*(SUM_STRIDE))
+        sub             x3,  x3,  #(4*(SUM_STRIDE))
+        add             x8,  x4,  #(2*(SUM_STRIDE))
+        sub             x4,  x4,  #(2*(SUM_STRIDE))
+        mov             x9,  #(2*SUM_STRIDE)
+        mov             x10, #FILTER_OUT_STRIDE
+        add             x11, x5,  #7
+        bic             x11, x11, #7 // Aligned width
+        sub             x2,  x2,  x11
+        sub             x10, x10, x11
+        sub             x9,  x9,  x11
+        sub             x9,  x9,  #4 // We read 4 extra elements from a
+        sub             x12, x9,  #4 // We read 8 extra elements from b
+        mov             x11, x5
+        movi            v4.8h,  #5
+        movi            v5.4s,  #5
+        movi            v6.8h,  #6
+        movi            v7.4s,  #6
+1:
+        ld1             {v0.8h, v1.8h}, [x4], #32
+        ld1             {v2.8h, v3.8h}, [x8], #32
+        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
+        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48
+
+2:
+        subs            x5,  x5,  #8
+        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride
+        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride
+        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride
+        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride
+        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride
+        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride
+        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride
+        add             v0.8h,   v0.8h,   v25.8h
+
+        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
+        ext             v23.16b, v17.16b, v18.16b, #4
+        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
+        ext             v25.16b, v20.16b, v21.16b, #4
+        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
+        ext             v27.16b, v17.16b, v18.16b, #8
+        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
+        ext             v29.16b, v20.16b, v21.16b, #8
+        mul             v0.8h,   v0.8h,   v4.8h       // * 5
+        mla             v0.8h,   v2.8h,   v6.8h       // * 6
+        ld1             {v31.8b}, [x1], #8
+        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride
+        add             v17.4s,  v17.4s,  v27.4s
+        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride
+        add             v20.4s,  v20.4s,  v29.4s
+        add             v16.4s,  v16.4s,  v19.4s
+        add             v17.4s,  v17.4s,  v20.4s
+
+        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride
+        add             v23.4s,  v23.4s,  v25.4s
+        // This is, surprisingly, faster than other variants where the
+        // mul+mla pairs are further apart, on Cortex A53.
+        mul             v16.4s,  v16.4s,  v5.4s       // * 5
+        mla             v16.4s,  v22.4s,  v7.4s       // * 6
+        mul             v17.4s,  v17.4s,  v5.4s       // * 5
+        mla             v17.4s,  v23.4s,  v7.4s       // * 6
+
+        uxtl            v31.8h,  v31.8b
+        umlal           v16.4s,  v0.4h,   v31.4h      // b + a * src
+        umlal2          v17.4s,  v0.8h,   v31.8h
+        mov             v0.16b,  v1.16b
+        rshrn           v16.4h,  v16.4s,  #9
+        rshrn2          v16.8h,  v17.4s,  #9
+        mov             v2.16b,  v3.16b
+        st1             {v16.8h}, [x0], #16
+
+        b.le            3f
+        mov             v16.16b, v18.16b
+        mov             v19.16b, v21.16b
+        ld1             {v1.8h}, [x4], #16
+        ld1             {v3.8h}, [x8], #16
+        ld1             {v17.4s, v18.4s}, [x3], #32
+        ld1             {v20.4s, v21.4s}, [x7], #32
+        b               2b
+
+3:
+        subs            x6,  x6,  #1
+        b.le            0f
+        mov             x5,  x11
+        add             x0,  x0,  x10, lsl #1
+        add             x1,  x1,  x2
+        add             x3,  x3,  x9, lsl #2
+        add             x7,  x7,  x9, lsl #2
+        add             x4,  x4,  x12, lsl #1
+        add             x8,  x8,  x12, lsl #1
+        mov             x13, x3
+        mov             x14, x4
+
+        ld1             {v0.8h, v1.8h}, [x4], #32
+        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
+
+4:
+        subs            x5,  x5,  #8
+        ext             v23.16b, v0.16b,  v1.16b, #4  // +1
+        ext             v22.16b, v0.16b,  v1.16b, #2  // 0
+        add             v0.8h,   v0.8h,   v23.8h      // -1, +1
+
+        ext             v24.16b, v16.16b, v17.16b, #4 // 0
+        ext             v25.16b, v17.16b, v18.16b, #4
+        ext             v26.16b, v16.16b, v17.16b, #8 // +1
+        ext             v27.16b, v17.16b, v18.16b, #8
+        mul             v2.8h,   v22.8h,  v6.8h       // * 6
+        mla             v2.8h,   v0.8h,   v4.8h       // * 5 -> a
+        ld1             {v31.8b}, [x1], #8
+        add             v16.4s,  v16.4s,  v26.4s      // -1, +1
+        add             v17.4s,  v17.4s,  v27.4s
+        uxtl            v31.8h,  v31.8b
+        // This is, surprisingly, faster than other variants where the
+        // mul+mla pairs are further apart, on Cortex A53.
+        mul             v24.4s,  v24.4s,  v7.4s       // * 6
+        mla             v24.4s,  v16.4s,  v5.4s       // * 5 -> b
+        mul             v25.4s,  v25.4s,  v7.4s       // * 6
+        mla             v25.4s,  v17.4s,  v5.4s       // * 5 -> b
+
+        umlal           v24.4s,  v2.4h,   v31.4h      // b + a * src
+        umlal2          v25.4s,  v2.8h,   v31.8h
+        mov             v0.16b,  v1.16b
+        rshrn           v24.4h,  v24.4s,  #8
+        rshrn2          v24.8h,  v25.4s,  #8
+        mov             v16.16b, v18.16b
+        st1             {v24.8h}, [x0], #16
+
+        b.le            5f
+        ld1             {v1.8h}, [x4], #16
+        ld1             {v17.4s, v18.4s}, [x3], #32
+        b               4b
+
+5:
+        subs            x6,  x6,  #1
+        b.le            0f
+        mov             x5,  x11
+        add             x0,  x0,  x10, lsl #1
+        add             x1,  x1,  x2
+        mov             x3,  x13 // Rewind x3/x4 to where they started
+        mov             x4,  x14
+        b               1b
+0:
+        ret
+endfunc
+
+// void dav1d_sgr_weighted1_8bpc_neon(pixel *dst, const ptrdiff_t dst_stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int w, const int h,
+//                                    const int wt);
+function sgr_weighted1_8bpc_neon, export=1
+        dup             v31.8h, w7                 // v31 = wt, broadcast
+        cmp             x6,  #2                    // h < 2? only the single-row tail
+        add             x9,  x0,  x1               // x9  = dst + dst_stride (2nd row)
+        add             x10, x2,  x3               // x10 = src + src_stride (2nd row)
+        add             x11, x4,  #2*FILTER_OUT_STRIDE // x11 = t1 + 1 row (2nd row)
+        mov             x7,  #(4*FILTER_OUT_STRIDE) // t1 advance for two rows, in bytes
+        lsl             x1,  x1,  #1               // strides now cover two rows/iteration
+        lsl             x3,  x3,  #1
+        add             x8,  x5,  #7
+        bic             x8,  x8,  #7 // Aligned width
+        sub             x1,  x1,  x8               // compensate the strides for the
+        sub             x3,  x3,  x8               // pointer increments done in the loop
+        sub             x7,  x7,  x8, lsl #1
+        mov             x8,  x5                    // x8 = saved w, to rewind per row pair
+        b.lt            2f
+1:      // Main loop: two rows per iteration, 8 pixels per step.
+        ld1             {v0.8b}, [x2],  #8
+        ld1             {v4.8b}, [x10], #8
+        ld1             {v1.8h}, [x4],  #16
+        ld1             {v5.8h}, [x11], #16
+        subs            x5,  x5,  #8               // width countdown
+        ushll           v0.8h,  v0.8b,  #4     // u = src << 4
+        ushll           v4.8h,  v4.8b,  #4     // u
+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
+        sub             v5.8h,  v5.8h,  v4.8h  // t1 - u
+        ushll           v2.4s,  v0.4h,  #7     // u << 7
+        ushll2          v3.4s,  v0.8h,  #7     // u << 7
+        ushll           v6.4s,  v4.4h,  #7     // u << 7
+        ushll2          v7.4s,  v4.8h,  #7     // u << 7
+        smlal           v2.4s,  v1.4h,  v31.4h // v = (u << 7) + wt * (t1 - u)
+        smlal2          v3.4s,  v1.8h,  v31.8h // v
+        smlal           v6.4s,  v5.4h,  v31.4h // v
+        smlal2          v7.4s,  v5.8h,  v31.8h // v
+        rshrn           v2.4h,  v2.4s,  #11    // (v + (1 << 10)) >> 11
+        rshrn2          v2.8h,  v3.4s,  #11
+        rshrn           v6.4h,  v6.4s,  #11
+        rshrn2          v6.8h,  v7.4s,  #11
+        sqxtun          v2.8b,  v2.8h          // saturate to u8
+        sqxtun          v6.8b,  v6.8h
+        st1             {v2.8b}, [x0], #8
+        st1             {v6.8b}, [x9], #8
+        b.gt            1b
+
+        sub             x6,  x6,  #2               // two rows done
+        cmp             x6,  #1
+        b.lt            0f                         // all rows done
+        mov             x5,  x8                    // rewind width counter
+        add             x0,  x0,  x1               // advance all pointers to the
+        add             x9,  x9,  x1               // next pair of rows
+        add             x2,  x2,  x3
+        add             x10, x10, x3
+        add             x4,  x4,  x7
+        add             x11, x11, x7
+        b.eq            2f                         // exactly one (odd) row left
+        b               1b
+
+2:      // Tail: process the last row, if h was odd (or h == 1).
+        ld1             {v0.8b}, [x2], #8
+        ld1             {v1.8h}, [x4], #16
+        subs            x5,  x5,  #8
+        ushll           v0.8h,  v0.8b,  #4     // u = src << 4
+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
+        ushll           v2.4s,  v0.4h,  #7     // u << 7
+        ushll2          v3.4s,  v0.8h,  #7     // u << 7
+        smlal           v2.4s,  v1.4h,  v31.4h // v = (u << 7) + wt * (t1 - u)
+        smlal2          v3.4s,  v1.8h,  v31.8h // v
+        rshrn           v2.4h,  v2.4s,  #11    // (v + (1 << 10)) >> 11
+        rshrn2          v2.8h,  v3.4s,  #11
+        sqxtun          v2.8b,  v2.8h          // saturate to u8
+        st1             {v2.8b}, [x0], #8
+        b.gt            2b
+0:
+        ret
+endfunc
+
+// void dav1d_sgr_weighted2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                    const pixel *src, const ptrdiff_t src_stride,
+//                                    const int16_t *t1, const int16_t *t2,
+//                                    const int w, const int h,
+//                                    const int16_t wt[2]);
+function sgr_weighted2_8bpc_neon, export=1
+        ldr             x8,  [sp]                  // x8 = wt (9th arg, on the stack)
+        cmp             x7,  #2                    // h < 2? only the single-row tail
+        add             x10, x0,  x1               // x10 = dst + dst_stride (2nd row)
+        add             x11, x2,  x3               // x11 = src + src_stride (2nd row)
+        add             x12, x4,  #2*FILTER_OUT_STRIDE // x12 = t1 + 1 row (2nd row)
+        add             x13, x5,  #2*FILTER_OUT_STRIDE // x13 = t2 + 1 row (2nd row)
+        ld2r            {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
+        mov             x8,  #4*FILTER_OUT_STRIDE  // t1/t2 advance for two rows, in bytes
+        lsl             x1,  x1,  #1               // strides now cover two rows/iteration
+        lsl             x3,  x3,  #1
+        add             x9,  x6,  #7
+        bic             x9,  x9,  #7 // Aligned width
+        sub             x1,  x1,  x9               // compensate the strides for the
+        sub             x3,  x3,  x9               // pointer increments done in the loop
+        sub             x8,  x8,  x9, lsl #1
+        mov             x9,  x6                    // x9 = saved w, to rewind per row pair
+        b.lt            2f
+1:      // Main loop: two rows per iteration, 8 pixels per step.
+        ld1             {v0.8b},  [x2],  #8
+        ld1             {v16.8b}, [x11], #8
+        ld1             {v1.8h},  [x4],  #16
+        ld1             {v17.8h}, [x12], #16
+        ld1             {v2.8h},  [x5],  #16
+        ld1             {v18.8h}, [x13], #16
+        subs            x6,  x6,  #8               // width countdown
+        ushll           v0.8h,  v0.8b,  #4     // u = src << 4
+        ushll           v16.8h, v16.8b, #4     // u
+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
+        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u
+        sub             v17.8h, v17.8h, v16.8h // t1 - u
+        sub             v18.8h, v18.8h, v16.8h // t2 - u
+        ushll           v3.4s,  v0.4h,  #7     // u << 7
+        ushll2          v4.4s,  v0.8h,  #7     // u << 7
+        ushll           v19.4s, v16.4h, #7     // u << 7
+        ushll2          v20.4s, v16.8h, #7     // u << 7
+        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)
+        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)
+        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)
+        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)
+        smlal           v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u)
+        smlal           v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u)
+        smlal2          v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u)
+        smlal2          v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u)
+        rshrn           v3.4h,  v3.4s,  #11    // (v + (1 << 10)) >> 11
+        rshrn2          v3.8h,  v4.4s,  #11
+        rshrn           v19.4h, v19.4s, #11
+        rshrn2          v19.8h, v20.4s, #11
+        sqxtun          v3.8b,  v3.8h          // saturate to u8
+        sqxtun          v19.8b, v19.8h
+        st1             {v3.8b},  [x0],  #8
+        st1             {v19.8b}, [x10], #8
+        b.gt            1b
+
+        subs            x7,  x7,  #2               // two rows done
+        cmp             x7,  #1
+        b.lt            0f                         // all rows done
+        mov             x6,  x9                    // rewind width counter
+        add             x0,  x0,  x1               // advance all pointers to the
+        add             x10, x10, x1               // next pair of rows
+        add             x2,  x2,  x3
+        add             x11, x11, x3
+        add             x4,  x4,  x8
+        add             x12, x12, x8
+        add             x5,  x5,  x8
+        add             x13, x13, x8
+        b.eq            2f                         // exactly one (odd) row left
+        b               1b
+
+2:      // Tail: process the last row, if h was odd (or h == 1).
+        ld1             {v0.8b}, [x2], #8
+        ld1             {v1.8h}, [x4], #16
+        ld1             {v2.8h}, [x5], #16
+        subs            x6,  x6,  #8
+        ushll           v0.8h,  v0.8b,  #4     // u = src << 4
+        sub             v1.8h,  v1.8h,  v0.8h  // t1 - u
+        sub             v2.8h,  v2.8h,  v0.8h  // t2 - u
+        ushll           v3.4s,  v0.4h,  #7     // u << 7
+        ushll2          v4.4s,  v0.8h,  #7     // u << 7
+        smlal           v3.4s,  v1.4h,  v30.4h // wt[0] * (t1 - u)
+        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * (t2 - u)
+        smlal2          v4.4s,  v1.8h,  v30.8h // wt[0] * (t1 - u)
+        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * (t2 - u)
+        rshrn           v3.4h,  v3.4s,  #11    // (v + (1 << 10)) >> 11
+        rshrn2          v3.8h,  v4.4s,  #11
+        sqxtun          v3.8b,  v3.8h          // saturate to u8
+        st1             {v3.8b}, [x0], #8
+        b.gt            2b                     // fix: was 1b, which restarted the two-row loop
+0:
+        ret
+endfunc
--- a/src/meson.build
+++ b/src/meson.build
@@ -102,6 +102,7 @@
         )
         if host_machine.cpu_family() == 'aarch64'
             libdav1d_sources += files(
+                'arm/64/looprestoration_common.S',
                 'arm/64/msac.S',
             )