shithub: dav1d

ref: e4fbbbce672937dfa53a9783803776b6d8c76a44
parent: 8676fda34cfd0c0cb92b39b0a8ad203b906e062c
author: Henrik Gramner <[email protected]>
date: Wed Nov 7 16:42:38 EST 2018

Add blend AVX2 asm

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -43,6 +43,9 @@
 bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
 deint_shuf4:    db 0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
 
+blend_shuf: ; bits 0-3: 0, 0, 0, 0, 1, 1, 1, 1
+pb_64:   times 4 db 64
+         times 4 db 1
 pw_8:    times 2 dw 8
 pw_26:   times 2 dw 26
 pw_34:   times 2 dw 34
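
Note on the new data: blend_shuf is not a separate constant, it deliberately aliases the first eight bytes of the pool that follows (four 0x40 bytes from pb_64, then four 1 bytes), so that when used as a pshufb control it yields the indices listed in the comment. A minimal C model of that effect, with illustrative names (bit 7 is clear in every control byte, so only the low nibble matters):

    /* sketch only, not dav1d code: what pshufb does with blend_shuf */
    static void blend_shuf_model(unsigned char out[8], const unsigned char mask[2]) {
        const unsigned char shuf[8] = { 64, 64, 64, 64, 1, 1, 1, 1 }; /* pb_64 bytes reused */
        for (int i = 0; i < 8; i++)
            out[i] = mask[shuf[i] & 15];  /* mask[0] x4, then mask[1] x4 */
    }

The .w4_s1, .w8_s1 and .w16_s1 paths below use this to replicate one mask byte per row across a whole row of weights.
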
@@ -58,12 +61,13 @@
 cextern mc_subpel_filters
 %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
 
-%macro BIDIR_JMP_TABLE 1-7 4, 8, 16, 32, 64, 128
-    %xdefine %1_table (%%table - 2*4)
+%macro BIDIR_JMP_TABLE 1-* 4, 8, 16, 32, 64, 128
+    %xdefine %1_table (%%table - 2*%2)
+    %xdefine %%base %1_table
     %xdefine %%prefix mangle(private_prefix %+ _%1)
     %%table:
-    %rep 6
-        dd %%prefix %+ .w%2 - (%%table - 2*4)
+    %rep %0 - 1
+        dd %%prefix %+ .w%2 - %%base
         %rotate 1
     %endrep
 %endmacro
@@ -72,6 +76,7 @@
 BIDIR_JMP_TABLE w_avg_avx2
 BIDIR_JMP_TABLE mask_avx2
 BIDIR_JMP_TABLE w_mask_420_avx2
+BIDIR_JMP_TABLE blend_avx2, 2, 4, 8, 16, 32, 64, 128
 
 %macro BASE_JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - %3)
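
BIDIR_JMP_TABLE previously hard-coded the width list 4..128 and a fixed bias of 2*4 bytes; it is now fully variadic so blend can register a table whose smallest width is 2. The exported label is the table start minus 2*first_width bytes, which for a first width of 2 or 4 is exactly tzcnt(first_width) dword slots, so the prologue below can index it with tzcnt(w)*4 directly. A standalone sketch of that arithmetic (illustrative only, __builtin_ctz assumed as a tzcnt stand-in):

    /* prints the entry index selected for each width of the blend table
       (expected output: 0..6) */
    #include <stdio.h>
    int main(void) {
        const int widths[7] = { 2, 4, 8, 16, 32, 64, 128 };
        const int bias = 2 * widths[0];          /* %xdefine %1_table (%%table - 2*%2) */
        for (int i = 0; i < 7; i++) {
            const int byte_off = 4 * __builtin_ctz(widths[i]) - bias;
            printf("w=%3d -> table entry %d\n", widths[i], byte_off / 4);
        }
        return 0;
    }
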
@@ -3281,7 +3286,410 @@
     jg .w128_loop
     RET
 
-INIT_YMM avx2
+cglobal blend, 3, 7, 6, dst, ds, tmp, w, h, mask, ms
+%define base r6-blend_avx2_table
+    lea                  r6, [blend_avx2_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movifnidn         maskq, maskmp
+    movsxd               wq, dword [r6+wq*4]
+    vpbroadcastd         m4, [base+pb_64]
+    vpbroadcastd         m5, [base+pw_512]
+    add                  wq, r6
+    mov                 msq, msmp
+    jmp                  wq
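+    ; Every width below has three code paths, selected on the mask stride
+    ; in msq: 0 reuses a single row of w mask bytes for every line, 1 uses
+    ; one mask byte per line broadcast across the row, and anything else is
+    ; treated as a full per-pixel mask laid out with stride w (the cmp/jb/je
+    ; pairs only distinguish those three cases). The blend itself interleaves
+    ; (dst, tmp) pixel pairs with (64-m, m) weight pairs, so pmaddubsw yields
+    ; dst*(64-m) + tmp*m and pmulhrsw with pw_512 is a rounded shift right
+    ; by 6.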
+.w2:
+    cmp                 msq, 1
+    jb .w2_s0
+    je .w2_s1
+.w2_s2:
+    movd                xm1, [maskq]
+    movd                xm0, [dstq+dsq*0]
+    pinsrw              xm0, [dstq+dsq*1], 1
+    psubb               xm2, xm4, xm1
+    punpcklbw           xm2, xm1
+    movd                xm1, [tmpq]
+    add               maskq, 2*2
+    add                tmpq, 2*2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w2_s2
+    RET
+.w2_s1:
+    movd                xm1, [maskq]
+    movd                xm0, [dstq+dsq*0]
+    psubb               xm2, xm4, xm1
+    punpcklbw           xm2, xm1
+    pinsrw              xm0, [dstq+dsq*1], 1
+    movd                xm1, [tmpq]
+    punpcklwd           xm2, xm2
+    add               maskq, 2
+    add                tmpq, 2*2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w2_s1
+    RET
+.w2_s0:
+    vpbroadcastw        xm0, [maskq]
+    psubb               xm4, xm0
+    punpcklbw           xm4, xm0
+.w2_s0_loop:
+    movd                xm0, [dstq+dsq*0]
+    pinsrw              xm0, [dstq+dsq*1], 1
+    movd                xm1, [tmpq]
+    add                tmpq, 2*2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm4
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    pextrw     [dstq+dsq*0], xm0, 0
+    pextrw     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w2_s0_loop
+    RET
+ALIGN function_align
+.w4:
+    cmp                 msq, 1
+    jb .w4_s0
+    je .w4_s1
+.w4_s4:
+    movq                xm1, [maskq]
+    movd                xm0, [dstq+dsq*0]
+    pinsrd              xm0, [dstq+dsq*1], 1
+    psubb               xm2, xm4, xm1
+    punpcklbw           xm2, xm1
+    movq                xm1, [tmpq]
+    add               maskq, 4*2
+    add                tmpq, 4*2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w4_s4
+    RET
+.w4_s1:
+    movq                xm3, [blend_shuf]
+.w4_s1_loop:
+    movd                xm1, [maskq]
+    movd                xm0, [dstq+dsq*0]
+    pshufb              xm1, xm3
+    psubb               xm2, xm4, xm1
+    pinsrd              xm0, [dstq+dsq*1], 1
+    punpcklbw           xm2, xm1
+    movq                xm1, [tmpq]
+    add               maskq, 2
+    add                tmpq, 4*2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm2
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w4_s1_loop
+    RET
+.w4_s0:
+    vpbroadcastd        xm0, [maskq]
+    psubb               xm4, xm0
+    punpcklbw           xm4, xm0
+.w4_s0_loop:
+    movd                xm0, [dstq+dsq*0]
+    pinsrd              xm0, [dstq+dsq*1], 1
+    movq                xm1, [tmpq]
+    add                tmpq, 4*2
+    punpcklbw           xm0, xm1
+    pmaddubsw           xm0, xm4
+    pmulhrsw            xm0, xm5
+    packuswb            xm0, xm0
+    movd       [dstq+dsq*0], xm0
+    pextrd     [dstq+dsq*1], xm0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w4_s0_loop
+    RET
+ALIGN function_align
+.w8:
+    cmp                 msq, 1
+    jb .w8_s0
+    je .w8_s1
+.w8_s8:
+    movq                xm1, [maskq+8*1]
+    vinserti128          m1, [maskq+8*0], 1
+    vpbroadcastq         m2, [dstq+dsq*0]
+    movq                xm0, [dstq+dsq*1]
+    vpblendd             m0, m2, 0x30
+    psubb                m2, m4, m1
+    punpcklbw            m2, m1
+    movq                xm1, [tmpq+8*1]
+    vinserti128          m1, [tmpq+8*0], 1
+    add               maskq, 8*2
+    add                tmpq, 8*2
+    punpcklbw            m0, m1
+    pmaddubsw            m0, m2
+    pmulhrsw             m0, m5
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movhps     [dstq+dsq*0], xm0
+    movq       [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w8_s8
+    RET
+.w8_s1:
+    vpbroadcastd         m0, [blend_shuf+0]
+    vpbroadcastd        xm3, [blend_shuf+4]
+    vpblendd             m3, m0, 0xf0
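+    ; m3 now selects mask byte 1 in the low lane and mask byte 0 in the
+    ; high lane, matching how the two dst rows are packed in the loop below
+    ; (row 1 in the low lane, row 0 in the high lane).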
+.w8_s1_loop:
+    vpbroadcastd         m0, [maskq]
+    vpbroadcastq         m1, [dstq+dsq*0]
+    pshufb               m0, m3
+    psubb                m2, m4, m0
+    punpcklbw            m2, m0
+    movq                xm0, [dstq+dsq*1]
+    vpblendd             m0, m1, 0x30
+    movq                xm1, [tmpq+8*1]
+    vinserti128          m1, [tmpq+8*0], 1
+    add               maskq, 2
+    add                tmpq, 8*2
+    punpcklbw            m0, m1
+    pmaddubsw            m0, m2
+    pmulhrsw             m0, m5
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movhps     [dstq+dsq*0], xm0
+    movq       [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w8_s1_loop
+    RET
+.w8_s0:
+    vpbroadcastq         m0, [maskq]
+    psubb                m4, m0
+    punpcklbw            m4, m0
+.w8_s0_loop:
+    vpbroadcastq         m2, [dstq+dsq*0]
+    movq                xm0, [dstq+dsq*1]
+    vpblendd             m0, m2, 0x30
+    movq                xm1, [tmpq+8*1]
+    vinserti128          m1, [tmpq+8*0], 1
+    add                tmpq, 8*2
+    punpcklbw            m0, m1
+    pmaddubsw            m0, m4
+    pmulhrsw             m0, m5
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movhps     [dstq+dsq*0], xm0
+    movq       [dstq+dsq*1], xm0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w8_s0_loop
+    RET
+ALIGN function_align
+.w16:
+    cmp                 msq, 1
+    jb .w16_s0
+    WIN64_SPILL_XMM       7
+    je .w16_s1
+.w16_s16:
+    mova                 m0, [maskq]
+    mova                xm1, [dstq+dsq*0]
+    vinserti128          m1, [dstq+dsq*1], 1
+    psubb                m3, m4, m0
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
+    mova                 m6, [tmpq]
+    add               maskq, 16*2
+    add                tmpq, 16*2
+    punpcklbw            m0, m1, m6
+    punpckhbw            m1, m6
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w16_s16
+    RET
+.w16_s1:
+    vpbroadcastd        xm6, [blend_shuf]
+    vpbroadcastd         m0, [blend_shuf+4]
+    vpblendd             m6, m0, 0xf0
+.w16_s1_loop:
+    vpbroadcastd         m2, [maskq]
+    mova                xm1, [dstq+dsq*0]
+    pshufb               m2, m6
+    psubb                m3, m4, m2
+    vinserti128          m1, [dstq+dsq*1], 1
+    punpcklbw            m3, m2
+    mova                 m2, [tmpq]
+    add               maskq, 2
+    add                tmpq, 16*2
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w16_s1_loop
+    RET
+.w16_s0:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       6
+    vbroadcasti128       m0, [maskq]
+    psubb                m4, m0
+    punpcklbw            m3, m4, m0
+    punpckhbw            m4, m0
+.w16_s0_loop:
+    mova                xm1, [dstq+dsq*0]
+    vinserti128          m1, [dstq+dsq*1], 1
+    mova                 m2, [tmpq]
+    add                tmpq, 16*2
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m4
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova         [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w16_s0_loop
+    RET
+ALIGN function_align
+.w32:
+    mov                  wd, 32
+    jmp .w32_start
+.w64:
+    mov                  wd, 64
+    jmp .w32_start
+.w128:
+    mov                  wd, 128
+.w32_start:
+    WIN64_SPILL_XMM       7
+    cmp                 msq, 1
+    jb .w32_s0
+    je .w32_s1
+    sub                 dsq, wq
+.w32_s32:
+    mov                 r6d, wd
+.w32_s32_loop:
+    mova                 m0, [maskq]
+    mova                 m1, [dstq]
+    psubb                m3, m4, m0
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
+    mova                 m6, [tmpq]
+    add               maskq, 32
+    add                tmpq, 32
+    punpcklbw            m0, m1, m6
+    punpckhbw            m1, m6
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, 32
+    sub                 r6d, 32
+    jg .w32_s32_loop
+    add                dstq, dsq
+    dec                  hd
+    jg .w32_s32
+    RET
+.w32_s1:
+    sub                 dsq, wq
+.w32_s1_loop0:
+    vpbroadcastb         m0, [maskq]
+    mov                 r6d, wd
+    inc               maskq
+    psubb                m3, m4, m0
+    punpcklbw            m3, m0
+.w32_s1_loop:
+    mova                 m1, [dstq]
+    mova                 m2, [tmpq]
+    add                tmpq, 32
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    pmaddubsw            m0, m3
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, 32
+    sub                 r6d, 32
+    jg .w32_s1_loop
+    add                dstq, dsq
+    dec                  hd
+    jg .w32_s1_loop0
+    RET
+.w32_s0:
+%if WIN64
+    PUSH                 r7
+    PUSH                 r8
+    %define regs_used 9
+%endif
+    lea                 r6d, [hq+wq*8-256]
+    mov                  r7, dstq
+    mov                  r8, tmpq
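+    ; r6d packs two counters: the low byte holds h, and bits 8 and up hold
+    ; 256 times the number of 32-pixel column strips still to come
+    ; (w*8 - 256 at entry). Each pass below blends one strip over the full
+    ; height, with tmpq advancing by the packed row stride w and dstq by
+    ; dsq; h is then reloaded from r6b, dst/tmp restart from r7/r8 moved
+    ; 32 bytes right, and the strip counter is decremented by 256.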
+.w32_s0_loop0:
+    mova                 m0, [maskq]
+    add               maskq, 32
+    psubb                m3, m4, m0
+    punpcklbw            m2, m3, m0
+    punpckhbw            m3, m0
+.w32_s0_loop:
+    mova                 m1, [dstq]
+    mova                 m6, [tmpq]
+    add                tmpq, wq
+    punpcklbw            m0, m1, m6
+    punpckhbw            m1, m6
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .w32_s0_loop
+    add                  r7, 32
+    add                  r8, 32
+    mov                dstq, r7
+    mov                tmpq, r8
+    mov                  hb, r6b
+    sub                 r6d, 256
+    jg .w32_s0_loop0
+    RET
+
 cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                              bottomext, rightext
     ; we assume that the buffer (stride) is larger than width, so we can
@@ -3475,4 +3883,5 @@
 
 .end:
     RET
+
 %endif ; ARCH_X86_64
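
For reference, every path above computes the same per-pixel result: a 6-bit weighted average of the existing dst pixel and the packed temporary block in tmp, with the weight taken from the mask according to the stride convention noted in the comments. A minimal 8bpc C sketch of that behaviour, with illustrative names and a prototype assumed to match this revision's blend signature (dst, dst stride, tmp, w, h, mask, mask stride); it is not copied from dav1d's reference code:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint8_t pixel;  /* 8bpc assumed for this sketch */

    /* dst*(64 - m) + tmp*m with 6-bit rounding: the scalar equivalent of
       pmaddubsw on (dst,tmp)/(64-m,m) pairs followed by pmulhrsw with 512 */
    static inline pixel blend_px_sketch(const int a, const int b, const int m) {
        return (pixel)((a * (64 - m) + b * m + 32) >> 6);
    }

    static void blend_sketch(pixel *dst, const ptrdiff_t dst_stride,
                             const pixel *tmp, const int w, const int h,
                             const uint8_t *mask, const ptrdiff_t mask_stride)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                /* mask_stride == 1: one mask value per row;
                   mask_stride == 0: the same mask row for every line */
                const int m = mask_stride == 1 ? mask[0] : mask[x];
                dst[x] = blend_px_sketch(dst[x], tmp[x], m);
            }
            dst += dst_stride;
            tmp += w;              /* tmp is packed with stride w */
            mask += mask_stride;
        }
    }

The pmaddubsw result never exceeds 255*64 = 16320, so it fits a signed word, and pmulhrsw with 512 computes ((x << 9 >> 14) + 1) >> 1, which equals (x + 32) >> 6 for that range.
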
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -54,6 +54,7 @@
 decl_w_avg_fn(dav1d_w_avg_avx2);
 decl_mask_fn(dav1d_mask_avx2);
 decl_w_mask_fn(dav1d_w_mask_420_avx2);
+decl_blend_fn(dav1d_blend_avx2);
 
 decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
 decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
@@ -96,6 +97,7 @@
     c->w_avg = dav1d_w_avg_avx2;
     c->mask = dav1d_mask_avx2;
     c->w_mask[2] = dav1d_w_mask_420_avx2;
+    c->blend = dav1d_blend_avx2;
 
     c->warp8x8  = dav1d_warp_affine_8x8_avx2;
     c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
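
Only the declaration and the pointer assignment appear in this hunk; in mc_init_tmpl.c they sit inside the existing AVX2 init path, so the new pointer is only installed when the AVX2 CPU flag is set and the 8bpc x86-64 build is active. A rough sketch of that surrounding, pre-existing context, with the flag and function names given from memory rather than from this patch:

    /* hedged sketch of the mc_init_tmpl.c context around the new assignment */
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

    #if BITDEPTH == 8 && ARCH_X86_64
        /* ...existing avg/w_avg/mask/w_mask assignments... */
        c->blend = dav1d_blend_avx2;
    #endif
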
--- a/tests/checkasm/mc.c
+++ b/tests/checkasm/mc.c
@@ -237,12 +237,12 @@
 }
 
 static void check_blend(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp, 128 * 128,);
-    ALIGN_STK_32(pixel, c_dst, 128 * 128,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
-    ALIGN_STK_32(uint8_t, mask, 128 * 128,);
+    ALIGN_STK_32(pixel, tmp, 128 * 32,);
+    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
+    ALIGN_STK_32(uint8_t, mask, 128 * 32,);
 
-    for (int i = 0; i < 128 * 128; i++) {
+    for (int i = 0; i < 128 * 32; i++) {
         tmp[i] = rand() & ((1 << BITDEPTH) - 1);
         mask[i] = rand() % 65;
     }
@@ -252,9 +252,11 @@
 
     for (int w = 2; w <= 128; w <<= 1) {
         const ptrdiff_t dst_stride = w * sizeof(pixel);
+        const int h_min = (w == 128) ? 4 : 2;
+        const int h_max = (w > 32) ? 32 : (w == 2) ? 64 : 128;
         for (int ms = 0; ms <= w; ms += ms ? w - 1 : 1)
             if (check_func(c->blend, "blend_w%d_ms%d_%dbpc", w, ms, BITDEPTH))
-                for (int h = imax(w / 8, 2); h <= imin(w * 8, 128); h <<= 1) {
+                for (int h = h_min; h <= h_max; h <<= 1) {
                     for (int i = 0; i < w * h; i++)
                         c_dst[i] = a_dst[i] = rand() & ((1 << BITDEPTH) - 1);
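
The checkasm buffers shrink from 128*128 to 128*32 entries because the new h_min/h_max bounds keep every tested block at w*h <= 128*32 = 4096 pixels, and the unchanged mask-stride loop still covers the three conventions handled by the asm (0, 1 and w). The combinations exercised after this change can be enumerated directly from the new expressions (standalone sketch mirroring the diff above):

    /* enumerate the (w, h, ms) cases the updated test covers */
    #include <stdio.h>
    int main(void) {
        for (int w = 2; w <= 128; w <<= 1) {
            const int h_min = (w == 128) ? 4 : 2;
            const int h_max = (w > 32) ? 32 : (w == 2) ? 64 : 128;
            for (int ms = 0; ms <= w; ms += ms ? w - 1 : 1)
                for (int h = h_min; h <= h_max; h <<= 1)
                    printf("blend w%d h%d ms%d\n", w, h, ms);
        }
        return 0;
    }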