shithub: dav1d

--- a/src/x86/mc.asm

+++ b/src/x86/mc.asm

@@ -3281,4 +3281,198 @@

     jg .w128_loop

RET

+INIT_YMM avx2

+; emu_edge(bw, bh, iw, ih, x, y, dst, dstride, src, sstride)

+;

+; Builds a bw x bh block of bytes at dst from an iw x ih source image whose

+; top-left sample is at src. (x, y) is the block position inside the image;

+; block samples that fall outside the image are replicated from the nearest

+; image edge:

+;   1. src is advanced to the clamped position (iclip(x), iclip(y)).

+;   2. top/bottom/left/right extension sizes are computed and clamped to

+;      [0, bh - 1] resp. [0, bw - 1].

+;   3. center_h rows are written: broadcast of the first source byte on the

+;      left, a copy of center_w source bytes, broadcast of the last source

+;      byte on the right.

+;   4. the last written row is replicated downwards (bottom extension) and

+;      the first written row upwards (top extension).

+;

+; All arguments are handled as full 64-bit signed values (q registers, signed

+; condition codes). Samples are single bytes (vpbroadcastb, unscaled x).

+; Every row is stored in whole 32-byte vectors, so up to 31 bytes past the

+; requested width may be overwritten (see the stride assumption below).

+; NOTE(review): the mova stores/loads on dst presumably require dst and

+; dstride to be 32-byte aligned -- confirm against the callers.

+; NOTE(review): "10, 13, 1" is presumably x86inc's <args loaded>, <GPRs used>,

+; <vector regs used>; r12 is used below as a zero constant and later as a

+; scratch destination pointer inside v_loop.

+cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \

+                             bottomext, rightext

+    ; we assume that the buffer (stride) is larger than width, so we can

+    ; safely overwrite by a few bytes

+    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)

+    xor                r12d, r12d       ; r12 = 0, the lower clamp bound

+    lea                 r10, [ihq-1]    ; start from the upper bound ih - 1

+    cmp                  yq, ihq

+    cmovl               r10, yq         ; y < ih: take y itself

+    test                 yq, yq

+    cmovl               r10, r12        ; y < 0: clamp to 0

+    imul                r10, sstrideq

+    add                srcq, r10

+    ; ref += iclip(x, 0, iw - 1)

+    lea                 r10, [iwq-1]

+    cmp                  xq, iwq

+    cmovl               r10, xq

+    test                 xq, xq

+    cmovl               r10, r12

+    add                srcq, r10

+    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)

+    lea          bottomextq, [yq+bhq]

+    sub          bottomextq, ihq

+    lea                  r3, [bhq-1]    ; r3 (ih, now dead) = bh - 1; lea keeps the flags of sub

+    cmovl        bottomextq, r12        ; negative -> 0

+    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \

+                bottomext, rightext

+    ; top_ext = iclip(-y, 0, bh - 1)

+    neg             topextq             ; the y register is negated in place and becomes top_ext

+    cmovl           topextq, r12

+    cmp          bottomextq, bhq

+    cmovge       bottomextq, r3

+    cmp             topextq, bhq

+    ; cmovge, not cmovg: top_ext == bh must be clamped to bh - 1 as well,

+    ; otherwise center_h can drop to 0 while the row loop below still runs

+    ; once and writes one row past the block

+    cmovge          topextq, r3

+    ; right_ext = iclip(x + bw - iw, 0, bw - 1)

+    lea           rightextq, [xq+bwq]

+    sub           rightextq, iwq

+    lea                  r2, [bwq-1]    ; r2 (iw, now dead) = bw - 1; lea keeps the flags of sub

+    cmovl         rightextq, r12

+    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \

+                bottomext, rightext

+    ; left_ext = iclip(-x, 0, bw - 1)

+    neg            leftextq             ; the x register is negated in place and becomes left_ext

+    cmovl          leftextq, r12

+    cmp           rightextq, bwq

+    cmovge        rightextq, r2

+    cmp            leftextq, bwq

+    cmovge         leftextq, r2

+    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \

+                dst, dstride, src, sstride, bottomext, rightext

+    ; center_h = bh - top_ext - bottom_ext

+    lea                  r3, [bottomextq+topextq]

+    sub            centerhq, r3         ; the bh register becomes center_h

+    ; blk += top_ext * PXSTRIDE(dst_stride)

+    mov                  r2, topextq    ; r2 is still scratch here; it becomes center_w below

+    imul                 r2, dstrideq

+    add                dstq, r2

+    mov                 r9m, dstq       ; remember the first center row in the memory slot of arg 9 for the top extension

+    ; center_w = bw - left_ext - right_ext

+    mov            centerwq, bwq

+    lea                  r3, [rightextq+leftextq]

+    sub            centerwq, r3

+; v_loop writes centerh rows (at least one: the counter is tested after the

+; row has been written), advancing dst and src by one row each iteration.

+; %1/%2 select at assembly time whether code for the left/right extension is

+; emitted, %3 makes the local labels of each instantiation unique.

+; Clobbers r3 (byte offset inside the row), r12 (destination pointer) and m0.

+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix

+.v_loop_%3:

+%if %1

+    test           leftextq, leftextq

+    jz .body_%3

+    ; left extension

+    xor                  r3, r3

+    vpbroadcastb         m0, [srcq]     ; replicate the first source byte of this row

+.left_loop_%3:

+    mova          [dstq+r3], m0         ; may run past leftext; the body copy overwrites the excess

+    add                  r3, 32

+    cmp                  r3, leftextq

+    jl .left_loop_%3

+    ; body

+.body_%3:

+    lea                 r12, [dstq+leftextq]

+%endif

+    xor                  r3, r3

+.body_loop_%3:

+    movu                 m0, [srcq+r3]

+%if %1

+    movu           [r12+r3], m0

+%else

+    movu          [dstq+r3], m0

+%endif

+    add                  r3, 32

+    cmp                  r3, centerwq

+    jl .body_loop_%3

+%if %2

+    ; right extension

+    test          rightextq, rightextq

+    jz .body_loop_end_%3

+%if %1

+    add                 r12, centerwq

+%else

+    lea                 r12, [dstq+centerwq]

+%endif

+    xor                  r3, r3

+    vpbroadcastb         m0, [srcq+centerwq-1] ; replicate the last center byte of this row

+.right_loop_%3:

+    movu           [r12+r3], m0

+    add                  r3, 32

+    cmp                  r3, rightextq

+    jl .right_loop_%3

+.body_loop_end_%3:

+%endif

+    add                dstq, dstrideq

+    add                srcq, sstrideq

+    dec            centerhq

+    jg .v_loop_%3

+%endmacro

+    ; pick the row loop variant that only contains the extensions needed

+    test           leftextq, leftextq

+    jnz .need_left_ext

+    test          rightextq, rightextq

+    jnz .need_right_ext

+    v_loop                0, 0, 0

+    jmp .body_done

+.need_left_ext:

+    test          rightextq, rightextq

+    jnz .need_left_right_ext

+    v_loop                1, 0, 1

+    jmp .body_done

+.need_left_right_ext:

+    v_loop                1, 1, 2

+    jmp .body_done

+.need_right_ext:

+    v_loop                0, 1, 3

+.body_done:

+    ; bottom edge extension

+    ; dst now points at the row right after the last center row

+    test         bottomextq, bottomextq

+    jz .top

+    mov                srcq, dstq

+    sub                srcq, dstrideq   ; src = last center row that was written

+    xor                  r1, r1         ; r1 (center_h, now dead) = byte offset inside the row

+.bottom_x_loop:

+    mova                 m0, [srcq+r1]

+    lea                  r3, [dstq+r1]

+    mov                  r4, bottomextq ; r4 (left_ext, now dead) = rows left to fill

+.bottom_y_loop:

+    mova               [r3], m0

+    add                  r3, dstrideq

+    dec                  r4

+    jg .bottom_y_loop

+    add                  r1, 32

+    cmp                  r1, bwq

+    jl .bottom_x_loop

+.top:

+    ; top edge extension

+    test            topextq, topextq

+    jz .end

+    mov                srcq, r9m        ; first center row, saved before the row loop

+    mov                dstq, dstm       ; reload the dst argument as passed by the caller

+    xor                  r1, r1

+.top_x_loop:

+    mova                 m0, [srcq+r1]

+    lea                  r3, [dstq+r1]

+    mov                  r4, topextq

+.top_y_loop:

+    mova               [r3], m0

+    add                  r3, dstrideq

+    dec                  r4

+    jg .top_y_loop

+    add                  r1, 32

+    cmp                  r1, bwq

+    jl .top_x_loop

+.end:

+    RET

 %endif ; ARCH_X86_64

--- a/src/x86/mc_init_tmpl.c

+++ b/src/x86/mc_init_tmpl.c

@@ -58,6 +58,8 @@

 decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);

 decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);

+decl_emu_edge_fn(dav1d_emu_edge_avx2); /* presumably the `cglobal emu_edge` routine assembled under INIT_YMM avx2 in src/x86/mc.asm -- confirm the symbol mangling */

 void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {

 #define init_mc_fn(type, name, suffix) \

     c->mc[type] = dav1d_put_##name##_##suffix

@@ -97,5 +99,7 @@

     c->warp8x8  = dav1d_warp_affine_8x8_avx2;

     c->warp8x8t = dav1d_warp_affine_8x8t_avx2;

+    c->emu_edge = dav1d_emu_edge_avx2;

 #endif