shithub: dav1d

--- a/src/arm/64/mc16.S

+++ b/src/arm/64/mc16.S

@@ -3407,3 +3407,163 @@

 warp

 warp t

+// void dav1d_emu_edge_16bpc_neon(

+//         const intptr_t bw, const intptr_t bh,

+//         const intptr_t iw, const intptr_t ih,

+//         const intptr_t x, const intptr_t y,

+//         pixel *dst, const ptrdiff_t dst_stride,

+//         const pixel *ref, const ptrdiff_t ref_stride)

+function emu_edge_16bpc_neon, export=1

+        ldp             x8,  x9,  [sp]         // x8 = ref, x9 = ref_stride (9th/10th args, on the stack)

+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)

+        // ref += iclip(x, 0, iw - 1)

+        sub             x12, x3,  #1           // ih - 1

+        cmp             x5,  x3                // y >= ih ?

+        sub             x13, x2,  #1           // iw - 1

+        csel            x12, x12, x5,  ge      // min(y, ih - 1)

+        cmp             x4,  x2                // x >= iw ?

+        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)

+        csel            x13, x13, x4,  ge      // min(x, iw - 1)

+        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)

+        madd            x8,  x12, x9,  x8      // ref += iclip() * stride

+        add             x8,  x8,  x13, lsl #1  // ref += iclip()

+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)

+        // top_ext = iclip(-y, 0, bh - 1)

+        add             x10, x5,  x1           // y + bh

+        neg             x5,  x5                // -y

+        sub             x10, x10, x3           // y + bh - ih

+        sub             x12, x1,  #1           // bh - 1

+        cmp             x10, x1                // y + bh - ih < bh ?

+        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)

+        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)

+        cmp             x5,  x1                // max(-y, 0) < bh ?

+        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)

+        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)

+        // right_ext = iclip(x + bw - iw, 0, bw - 1)

+        // left_ext = iclip(-x, 0, bw - 1)

+        add             x11, x4,  x0           // x + bw

+        neg             x4,  x4                // -x

+        sub             x11, x11, x2           // x + bw - iw

+        sub             x13, x0,  #1           // bw - 1

+        cmp             x11, x0                // x + bw - iw < bw ?

+        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)

+        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)

+        cmp             x4,  x0                // max(-x, 0) < bw ?

+        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)

+        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)

+        // center_h = bh - top_ext - bottom_ext

+        // dst += top_ext * PXSTRIDE(dst_stride)

+        // center_w = bw - left_ext - right_ext

+        sub             x1,  x1,  x5           // bh - top_ext

+        madd            x6,  x5,  x7,  x6      // dst += top_ext * dst_stride

+        sub             x2,  x0,  x4           // bw - left_ext

+        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext

+        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext

+        mov             x14, x6                // backup of dst

+.macro v_loop need_left, need_right

+0:

+.if \need_left

+        ld1r            {v0.8h}, [x8]          // replicate the first input pixel of this row

+        mov             x12, x6                // out = dst

+        mov             x3,  x4                // x3 = left_ext

+        mov             v1.16b,  v0.16b        // second copy, so 16 pixels are stored per iteration

+1:

+        subs            x3,  x3,  #16          // 16 pixels per iteration

+        st1             {v0.8h, v1.8h}, [x12], #32

+        b.gt            1b

+.endif

+        mov             x13, x8                // in = ref

+        add             x12, x6,  x4, lsl #1   // out = dst + left_ext

+        mov             x3,  x2                // x3 = center_w

+1:

+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64

+        subs            x3,  x3,  #32          // 32 pixels per iteration

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64

+        b.gt            1b

+.if \need_right

+        add             x3,  x8,  x2, lsl #1   // in + center_w

+        sub             x3,  x3,  #2           // in + center_w - 1

+        add             x12, x6,  x4, lsl #1   // dst + left_ext

+        ld1r            {v0.8h}, [x3]          // replicate the last center pixel of this row

+        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w

+        mov             x3,  x11               // x3 = right_ext

+        mov             v1.16b,  v0.16b        // second copy, so 16 pixels are stored per iteration

+1:

+        subs            x3,  x3,  #16          // 16 pixels per iteration

+        st1             {v0.8h, v1.8h}, [x12], #32

+        b.gt            1b

+.endif

+        subs            x1,  x1,  #1           // center_h--

+        add             x6,  x6,  x7           // dst += dst_stride

+        add             x8,  x8,  x9           // ref += ref_stride

+        b.gt            0b

+.endm

+        cbz             x4,  2f                // left_ext == 0 ?

+        // need_left

+        cbz             x11, 3f                // right_ext == 0 ?

+        // need_left + need_right

+        v_loop          1,   1

+        b               5f

+2:

+        // !need_left

+        cbz             x11, 4f                // right_ext == 0 ?

+        // !need_left + need_right

+        v_loop          0,   1

+        b               5f

+3:

+        // need_left + !need_right

+        v_loop          1,   0

+        b               5f

+4:

+        // !need_left + !need_right

+        v_loop          0,   0

+5:

+        cbz             x10, 3f                // bottom_ext == 0 ?

+        // need_bottom

+        sub             x8,  x6,  x7           // ref = dst - stride

+        mov             x4,  x0                // x4 = bw

+1:

+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64 // 32 pixels of the last center row

+        mov             x3,  x10               // x3 = bottom_ext

+2:

+        subs            x3,  x3,  #1           // one row per iteration

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7

+        b.gt            2b

+        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride

+        subs            x4,  x4,  #32          // bw -= 32

+        add             x6,  x6,  #64          // dst += 32

+        b.gt            1b

+3:

+        cbz             x5,  3f                // top_ext == 0 ?

+        // need_top

+        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride

+1:

+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 // 32 pixels of the first center row

+        mov             x3,  x5                // x3 = top_ext

+2:

+        subs            x3,  x3,  #1           // one row per iteration

+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7

+        b.gt            2b

+        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride

+        subs            x0,  x0,  #32          // bw -= 32

+        add             x6,  x6,  #64          // dst += 32

+        b.gt            1b

+3:

+        ret

+endfunc

--- a/src/arm/mc_init_tmpl.c

+++ b/src/arm/mc_init_tmpl.c

@@ -112,7 +112,7 @@

     c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);

     c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);

 #endif

-#if BITDEPTH == 8 && ARCH_AARCH64

+#if ARCH_AARCH64

     c->emu_edge = BF(dav1d_emu_edge, neon);

 #endif