shithub: dav1d

Download patch

ref: 973d11bfab8296ed172790ddbb79538c3daa5566
parent: 22e144fb4c5dae7580c149d8f98f888e3d688b26
author: Henrik Gramner <[email protected]>
date: Mon Nov 5 09:13:02 EST 2018

x86: Add warp8x8 and warp8x8t AVX2 asm

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -30,6 +30,10 @@
 
 SECTION_RODATA 32
 
+warp_8x8_shufA: db 0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8 ; pshufb indices used in .h: source bytes 0-7 / 1-8 (output pixels 0 1), even bytes first, then odd
+                db 4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12 ; upper lane: source bytes 4-11 / 5-12 (output pixels 4 5)
+warp_8x8_shufB: db 2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10 ; same layout for output pixels 2 3 (source bytes 2-9 / 3-10)
+                db 6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14 ; upper lane: output pixels 6 7 (source bytes 6-13 / 7-14)
 subpel_h_shuf4: db 0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                 db 2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
 subpel_h_shufA: db 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
@@ -47,8 +51,9 @@
 pw_1024: times 2 dw 1024
 pw_2048: times 2 dw 2048
 pw_8192: times 2 dw 8192
-pd_32:   dd 32
-pd_512:  dd 512
+pd_32:    dd 32
+pd_512:   dd 512
+pd_32768: dd 32768 ; 1 << 15, broadcast into m15 and added to every dword result of the warp .h pass
 
 cextern mc_subpel_filters
 %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
@@ -123,6 +128,8 @@
 
 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
 
+cextern mc_warp_filter
+
 SECTION .text
 
 INIT_XMM avx2
@@ -2626,6 +2633,211 @@
     sub                 r6d, 1<<8
     jg .hv_w8_loop0
     RET
+
+%macro WARP_V 5 ; dst, 01, 23, 45, 67 (registers holding interleaved row pairs; the first three are replaced by the next pair on exit)
+    ; Fetches eight filters a-h at my + k*delta (k = 0..7) with scalar loads. Can be done using gathers, but that's terribly slow on many CPUs
+    lea               tmp1d, [myq+deltaq*1] ; position of b
+    lea               tmp2d, [myq+deltaq*2] ; position of c
+    shr                 myd, 10 ; position >> 10 = index into the filter table
+    shr               tmp1d, 10
+    movq                xm8, [filterq+myq  *8] ; a (table entries are 8 bytes apart)
+    movq               xm10, [filterq+tmp1q*8] ; b
+    lea               tmp1d, [tmp2q+deltaq*1] ; position of d
+    lea                 myd, [tmp2q+deltaq*2] ; position of e
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movq                xm0, [filterq+tmp2q*8] ; c
+    movq                xm9, [filterq+tmp1q*8] ; d
+    lea               tmp1d, [myq+deltaq*1] ; position of f
+    lea               tmp2d, [myq+deltaq*2] ; position of g
+    shr                 myd, 10
+    shr               tmp1d, 10
+    vinserti128          m8, [filterq+myq  *8], 1 ; a e (16-byte load; only the low 8 bytes are consumed by punpcklbw below)
+    vinserti128         m10, [filterq+tmp1q*8], 1 ; b f
+    lea               tmp1d, [tmp2q+deltaq*1] ; position of h
+    lea                 myd, [tmp2q+gammaq]       ; my += gamma
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    punpcklbw            m8, m10 ; byte-interleave a with b (low lane) and e with f (high lane)
+    vpbroadcastq        m10, [filterq+tmp2q*8]    ; c g
+    vpblendd             m0, m10, 0x30 ; g goes into the low qword of the upper lane
+    vpbroadcastq        m10, [filterq+tmp1q*8]    ; d h
+    vpblendd             m9, m10, 0x30
+    punpcklbw            m0, m9 ; byte-interleave c with d and g with h
+    punpcklwd            m9, m8, m0 ; word-interleave the ab and cd byte pairs
+    punpckhwd            m8, m0
+    pxor                m10, m10 ; zero: supplies the low byte of each word in the << 8 unpacks below
+    punpcklbw            m0, m9, m8
+    punpckhbw            m9, m8
+    punpcklbw            m8, m10, m0 ; a0 a4 b0 b4 c0 c4 d0 d4 << 8
+    punpckhbw            m0, m10, m0 ; a1 a5 b1 b5 c1 c5 d1 d5 << 8
+    pmaddwd             m%2, m8 ; multiplies the first row-pair register in place; it is overwritten below anyway
+    pmaddwd              m0, m%3
+    punpcklbw            m8, m10, m9 ; a2 a6 b2 b6 c2 c6 d2 d6 << 8
+    punpckhbw            m9, m10, m9 ; a3 a7 b3 b7 c3 c7 d3 d7 << 8
+    pmaddwd              m8, m%4
+    pmaddwd              m9, m%5
+    paddd                m0, m%2
+    mova                m%2, m%3 ; rotate the row-pair history down by one slot for the next call
+    paddd                m0, m8
+    mova                m%3, m%4
+    paddd               m%1, m0, m9 ; dst = sum of the four pmaddwd products
+    mova                m%4, m%5
+%endmacro
+
+cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts ; 8x8 warp that stores rows of eight 16-bit values to tmp; ts is scaled by 2 below, i.e. used as a word stride
+%if WIN64
+    sub                 rsp, 0xa0 ; stack area in which .main saves xmm8-15
+%endif
+    call mangle(private_prefix %+ _warp_affine_8x8_avx2).main ; shared setup; returns the first two output rows in m11 and m0
+.loop:
+    psrad               m11, 13
+    psrad                m0, 13
+    packssdw            m11, m0 ; dwords -> words, both rows in one register
+    pmulhrsw            m11, m14 ; (x + (1 << 6)) >> 7
+    vpermq               m0, m11, q3120 ; regroup qwords so each 128-bit lane holds one complete row
+    mova         [tmpq+tsq*0], xm0 ; first row of the pair
+    vextracti128 [tmpq+tsq*2], m0, 1 ; second row of the pair
+    dec                 r4d ; row pairs left (set to 4 in .main)
+    jz   mangle(private_prefix %+ _warp_affine_8x8_avx2).end ; finished: share the RET of warp_affine_8x8
+    call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2 ; compute the next two rows
+    lea                tmpq, [tmpq+tsq*4] ; advance the output pointer by two rows
+    jmp .loop
+
+cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
+                                   beta, filter, tmp1, delta, my, gamma ; 8x8 warp that stores rows of eight 8-bit pixels to dst
+%if WIN64
+    sub                 rsp, 0xa0 ; stack area in which .main saves xmm8-15
+    %assign xmm_regs_used 16
+    %assign stack_size_padded 0xa0
+    %assign stack_offset stack_offset+stack_size_padded
+%endif
+    call .main ; setup; returns the first two output rows in m11 and m0
+    jmp .start ; dst already points at the first row pair
+.loop:
+    call .main2 ; compute the next two rows
+    lea                dstq, [dstq+dsq*2] ; advance the output pointer by two rows
+.start:
+    psrad               m11, 17
+    psrad                m0, 17
+    packssdw            m11, m0 ; dwords -> words, both rows in one register
+    pmulhrsw            m11, m14 ; (x + (1 << 10)) >> 11
+    vextracti128        xm0, m11, 1 ; upper lane = pixels 4-7 of both rows
+    packuswb           xm11, xm0 ; words -> bytes with unsigned saturation
+    pshufd              xm0, xm11, q3120 ; reorder dwords so the 8 pixels of each row are contiguous
+    movq       [dstq+dsq*0], xm0 ; first row of the pair
+    movhps     [dstq+dsq*1], xm0 ; second row of the pair
+    dec                 r4d ; row pairs left (set to 4 in .main)
+    jg .loop
+.end:
+    RET ; also reached from warp_affine_8x8t
+ALIGN function_align
+.main: ; one-time setup plus seven priming rows from .h, then falls through into .main2
+    ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+    mov               abcdq, r5m
+    mov                 mxd, r6m
+    movaps [rsp+stack_offset+0x10], xmm6 ; save xmm6-15 (callee-saved in the Microsoft x64 ABI)
+    movaps [rsp+stack_offset+0x20], xmm7
+    movaps       [rsp+0x28], xmm8 ; xmm8-15 go into the 0xa0-byte area reserved by the entry points
+    movaps       [rsp+0x38], xmm9
+    movaps       [rsp+0x48], xmm10
+    movaps       [rsp+0x58], xmm11
+    movaps       [rsp+0x68], xmm12
+    movaps       [rsp+0x78], xmm13
+    movaps       [rsp+0x88], xmm14
+    movaps       [rsp+0x98], xmm15
+%endif
+    movsx            alphad, word [abcdq+2*0] ; alpha = abcd[0], sign-extended 16-bit
+    movsx             betad, word [abcdq+2*1] ; beta  = abcd[1]
+    mova                m12, [warp_8x8_shufA] ; m12/m13: pshufb masks for .h
+    mova                m13, [warp_8x8_shufB]
+    vpbroadcastd        m14, [pw_8192] ; m14: words of 8192, used by pmaddwd in .h and pmulhrsw in the callers
+    vpbroadcastd        m15, [pd_32768] ; m15: dword rounding constant for .h
+    lea             filterq, [mc_warp_filter] ; base of the filter table
+    lea               tmp1q, [ssq*3+3]
+    add                 mxd, 512+(64<<10) ; 512 = half of 1 << 10 (rounds the later >> 10); 64 << 10 offsets the table index by 64
+    lea               tmp2d, [alphaq*3]
+    add               tmp2d, tmp2d ; alpha*6
+    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
+    sub               betad, tmp2d    ; beta -= alpha*6
+    mov                 myd, r7m ; my is a stack argument
+    call .h ; source row 0
+    psrld                m1, m0, 16 ; move the row from the upper to the lower word of each dword
+    call .h
+    pblendw              m1, m0, 0xaa ; 01
+    psrld                m2, m0, 16
+    call .h
+    pblendw              m2, m0, 0xaa ; 12
+    psrld                m3, m0, 16
+    call .h
+    pblendw              m3, m0, 0xaa ; 23
+    psrld                m4, m0, 16
+    call .h
+    pblendw              m4, m0, 0xaa ; 34
+    psrld                m5, m0, 16
+    call .h
+    pblendw              m5, m0, 0xaa ; 45
+    psrld                m6, m0, 16
+    call .h
+    pblendw              m6, m0, 0xaa ; 56
+    movsx            deltad, word [abcdq+2*2] ; delta = abcd[2], sign-extended 16-bit
+    movsx            gammad, word [abcdq+2*3] ; gamma = abcd[3]
+    add                 myd, 512+(64<<10) ; same rounding and index offset as applied to mx
+    mov                 r4d, 4 ; loop counter for the callers: 4 iterations of two rows each
+    lea               tmp1d, [deltaq*3]
+    add               tmp1d, tmp1d ; delta*6
+    sub              gammad, tmp1d    ; gamma -= delta*6
+.main2: ; produces two output rows: the first in m11, the second in m0
+    call .h ; next source row
+    psrld                m7, m6, 16
+    pblendw              m7, m0, 0xaa ; 67
+    WARP_V               11, 1, 3, 5, 7 ; first output row -> m11
+    call .h ; next source row
+    psrld                m7, 16
+    pblendw              m7, m0, 0xaa ; 78
+    WARP_V                0, 2, 4, 6, 7 ; second output row -> m0
+    ret
+ALIGN function_align
+.h: ; filters one source row horizontally; returns 8 dwords in m0, advances srcq by ssq and steps mx; clobbers m8-m10, tmp1, tmp2
+    lea               tmp1d, [mxq+alphaq*1] ; filter position for pixel 1
+    lea               tmp2d, [mxq+alphaq*2] ; filter position for pixel 2
+    shr                 mxd, 10 ; position >> 10 = index into the filter table
+    shr               tmp1d, 10
+    vbroadcasti128      m10, [srcq] ; 16 source bytes, duplicated in both lanes
+    movq                xm8, [filterq+mxq  *8] ; filter for pixel 0
+    movhps              xm8, [filterq+tmp1q*8] ; filter for pixel 1
+    lea               tmp1d, [tmp2q+alphaq*1] ; pixel 3
+    lea                 mxd, [tmp2q+alphaq*2] ; pixel 4
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    movq                xm9, [filterq+tmp2q*8] ; filter for pixel 2
+    movhps              xm9, [filterq+tmp1q*8] ; filter for pixel 3
+    lea               tmp1d, [mxq+alphaq*1] ; pixel 5
+    lea               tmp2d, [mxq+alphaq*2] ; pixel 6
+    shr                 mxd, 10
+    shr               tmp1d, 10
+    vpbroadcastq         m0, [filterq+mxq  *8]
+    vpblendd             m8, m0, 0x30 ; filter for pixel 4 into the upper lane
+    vpbroadcastq         m0, [filterq+tmp1q*8]
+    vpblendd             m8, m0, 0xc0      ; 0 1   4 5
+    pshufb               m0, m10, m12 ; arrange source bytes for pixels 0 1 4 5 (warp_8x8_shufA)
+    pmaddubsw            m0, m8 ; unsigned pixels * signed coefficients, adjacent pairs summed
+    lea               tmp1d, [tmp2q+alphaq*1] ; pixel 7
+    lea                 mxd, [tmp2q+betaq] ; mx += beta
+    shr               tmp2d, 10
+    shr               tmp1d, 10
+    vpbroadcastq         m8, [filterq+tmp2q*8]
+    vpblendd             m9, m8, 0x30 ; filter for pixel 6 into the upper lane
+    vpbroadcastq         m8, [filterq+tmp1q*8]
+    vpblendd             m9, m8, 0xc0      ; 2 3   6 7
+    pshufb              m10, m13 ; arrange source bytes for pixels 2 3 6 7 (warp_8x8_shufB)
+    pmaddubsw           m10, m9
+    add                srcq, ssq ; next source row
+    phaddw               m0, m10 ; reduce to two partial sums per pixel; pixels 0-3 in the low lane, 4-7 in the high lane
+    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
+    paddd                m0, m15 ; rounded 14-bit result in upper 16 bits of dword
+    ret
 
 %macro BIDIR_FN 1 ; op
     %1                    0
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -55,6 +55,9 @@
 decl_mask_fn(dav1d_mask_avx2);
 decl_w_mask_fn(dav1d_w_mask_420_avx2);
 
+decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2); // AVX2 asm, cglobal warp_affine_8x8 in src/x86/mc.asm
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2); // AVX2 asm, cglobal warp_affine_8x8t in src/x86/mc.asm
+
 void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
 #define init_mc_fn(type, name, suffix) \
     c->mc[type] = dav1d_put_##name##_##suffix
@@ -91,5 +94,8 @@
     c->w_avg = dav1d_w_avg_avx2;
     c->mask = dav1d_mask_avx2;
     c->w_mask[2] = dav1d_w_mask_420_avx2;
+
+    c->warp8x8  = dav1d_warp_affine_8x8_avx2;
+    c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
 #endif
 }