shithub: dav1d

ref: 5bc43169057a4b2386ffd828ae1c2bba8f6ddab2
parent: 785f00feccb5ed7c5739fc72bdcb9b422d8386ca
author: Henrik Gramner <[email protected]>
date: Fri May 31 18:01:44 EDT 2019

x86: Optimize warp8x8 AVX2 asm
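
Interleave the vertical filter rows as 02/46/13/57 instead of
01/23/45/67, which lets the transpose in WARP_V be done with plain
word/dword unpacks. Build coefficient pairs with vinserti128 and
punpcklqdq instead of movhps/vpbroadcastq + vpblendd, hoist the
zeroed register out of the macro, and round the 8x8 output with
packusdw + pavgw instead of packssdw + pmulhrsw.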

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -2639,55 +2639,47 @@
     jg .hv_w8_loop0
     RET
 
-%macro WARP_V 5 ; dst, 01, 23, 45, 67
+%macro WARP_V 5 ; dst, 02, 46, 13, 57
     ; Can be done using gathers, but that's terribly slow on many CPUs
-    lea               tmp1d, [myq+deltaq*1]
-    lea               tmp2d, [myq+deltaq*2]
+    lea               tmp1d, [myq+deltaq*4]
+    lea               tmp2d, [myq+deltaq*1]
     shr                 myd, 10
     shr               tmp1d, 10
     movq                xm8, [filterq+myq  *8]
-    movq               xm10, [filterq+tmp1q*8]
-    lea               tmp1d, [tmp2q+deltaq*1]
-    lea                 myd, [tmp2q+deltaq*2]
+    vinserti128          m8, [filterq+tmp1q*8], 1 ; a e
+    lea               tmp1d, [tmp2q+deltaq*4]
+    lea                 myd, [tmp2q+deltaq*1]
     shr               tmp2d, 10
     shr               tmp1d, 10
     movq                xm0, [filterq+tmp2q*8]
-    movq                xm9, [filterq+tmp1q*8]
-    lea               tmp1d, [myq+deltaq*1]
-    lea               tmp2d, [myq+deltaq*2]
+    vinserti128          m0, [filterq+tmp1q*8], 1 ; b f
+    lea               tmp1d, [myq+deltaq*4]
+    lea               tmp2d, [myq+deltaq*1]
     shr                 myd, 10
     shr               tmp1d, 10
-    vinserti128          m8, [filterq+myq  *8], 1 ; a e
-    vinserti128         m10, [filterq+tmp1q*8], 1 ; b f
-    lea               tmp1d, [tmp2q+deltaq*1]
+    movq                xm9, [filterq+myq  *8]
+    vinserti128          m9, [filterq+tmp1q*8], 1 ; c g
+    lea               tmp1d, [tmp2q+deltaq*4]
     lea                 myd, [tmp2q+gammaq]       ; my += gamma
     shr               tmp2d, 10
     shr               tmp1d, 10
-    punpcklbw            m8, m10
-    vpbroadcastq        m10, [filterq+tmp2q*8]    ; c g
-    vpblendd             m0, m10, 0x30
-    vpbroadcastq        m10, [filterq+tmp1q*8]    ; d h
-    vpblendd             m9, m10, 0x30
-    punpcklbw            m0, m9
-    punpcklwd            m9, m8, m0
-    punpckhwd            m8, m0
-    pxor                m10, m10
-    punpcklbw            m0, m9, m8
-    punpckhbw            m9, m8
-    punpcklbw            m8, m10, m0 ; a0 a4 b0 b4 c0 c4 d0 d4 << 8
-    punpckhbw            m0, m10, m0 ; a1 a5 b1 b5 c1 c5 d1 d5 << 8
+    punpcklwd            m8, m0
+    movq                xm0, [filterq+tmp2q*8]
+    vinserti128          m0, [filterq+tmp1q*8], 1 ; d h
+    punpcklwd            m0, m9, m0
+    punpckldq            m9, m8, m0
+    punpckhdq            m0, m8, m0
+    punpcklbw            m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+    punpckhbw            m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
     pmaddwd             m%2, m8
-    pmaddwd              m0, m%3
-    punpcklbw            m8, m10, m9 ; a2 a6 b2 b6 c2 c6 d2 d6 << 8
-    punpckhbw            m9, m10, m9 ; a3 a7 b3 b7 c3 c7 d3 d7 << 8
+    pmaddwd              m9, m%3
+    punpcklbw            m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+    punpckhbw            m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
     pmaddwd              m8, m%4
-    pmaddwd              m9, m%5
-    paddd                m0, m%2
-    mova                m%2, m%3
+    pmaddwd              m0, m%5
+    paddd               m%2, m9
     paddd                m0, m8
-    mova                m%3, m%4
-    paddd               m%1, m0, m9
-    mova                m%4, m%5
+    paddd               m%1, m0, m%2
 %endmacro
 
 cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
@@ -2696,13 +2688,13 @@
 %endif
     call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
 .loop:
-    psrad               m11, 13
+    psrad                m7, 13
     psrad                m0, 13
-    packssdw            m11, m0
-    pmulhrsw            m11, m14 ; (x + (1 << 6)) >> 7
-    vpermq               m0, m11, q3120
-    mova         [tmpq+tsq*0], xm0
-    vextracti128 [tmpq+tsq*2], m0, 1
+    packssdw             m7, m0
+    pmulhrsw             m7, m14 ; (x + (1 << 6)) >> 7
+    vpermq               m7, m7, q3120
+    mova         [tmpq+tsq*0], xm7
+    vextracti128 [tmpq+tsq*2], m7, 1
     dec                 r4d
     jz   mangle(private_prefix %+ _warp_affine_8x8_avx2).end
     call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
@@ -2723,15 +2715,15 @@
     call .main2
     lea                dstq, [dstq+dsq*2]
 .start:
-    psrad               m11, 17
-    psrad                m0, 17
-    packssdw            m11, m0
-    pmulhrsw            m11, m14 ; (x + (1 << 10)) >> 11
-    vextracti128        xm0, m11, 1
-    packuswb           xm11, xm0
-    pshufd              xm0, xm11, q3120
-    movq       [dstq+dsq*0], xm0
-    movhps     [dstq+dsq*1], xm0
+    psrad                m7, 18
+    psrad                m0, 18
+    packusdw             m7, m0
+    pavgw                m7, m11 ; (x + (1 << 10)) >> 11
+    vextracti128        xm0, m7, 1
+    packuswb            xm7, xm0
+    pshufd              xm7, xm7, q3120
+    movq       [dstq+dsq*0], xm7
+    movhps     [dstq+dsq*1], xm7
     dec                 r4d
     jg .loop
 .end:
@@ -2759,83 +2751,82 @@
     mova                m13, [warp_8x8_shufB]
     vpbroadcastd        m14, [pw_8192]
     vpbroadcastd        m15, [pd_32768]
+    pxor                m11, m11
     lea             filterq, [mc_warp_filter]
     lea               tmp1q, [ssq*3+3]
     add                 mxd, 512+(64<<10)
     lea               tmp2d, [alphaq*3]
-    add               tmp2d, tmp2d
     sub                srcq, tmp1q    ; src -= src_stride*3 + 3
-    sub               betad, tmp2d    ; beta -= alpha*6
+    sub               betad, tmp2d    ; beta -= alpha*3
     mov                 myd, r7m
     call .h
     psrld                m1, m0, 16
     call .h
-    pblendw              m1, m0, 0xaa ; 01
-    psrld                m2, m0, 16
+    psrld                m4, m0, 16
     call .h
-    pblendw              m2, m0, 0xaa ; 12
-    psrld                m3, m0, 16
+    pblendw              m1, m0, 0xaa ; 02
     call .h
-    pblendw              m3, m0, 0xaa ; 23
-    psrld                m4, m0, 16
+    pblendw              m4, m0, 0xaa ; 13
     call .h
-    pblendw              m4, m0, 0xaa ; 34
-    psrld                m5, m0, 16
+    psrld                m2, m1, 16
+    pblendw              m2, m0, 0xaa ; 24
     call .h
-    pblendw              m5, m0, 0xaa ; 45
-    psrld                m6, m0, 16
+    psrld                m5, m4, 16
+    pblendw              m5, m0, 0xaa ; 35
     call .h
-    pblendw              m6, m0, 0xaa ; 56
+    psrld                m3, m2, 16
+    pblendw              m3, m0, 0xaa ; 46
     movsx            deltad, word [abcdq+2*2]
     movsx            gammad, word [abcdq+2*3]
     add                 myd, 512+(64<<10)
     mov                 r4d, 4
     lea               tmp1d, [deltaq*3]
-    add               tmp1d, tmp1d
-    sub              gammad, tmp1d    ; gamma -= delta*6
+    sub              gammad, tmp1d    ; gamma -= delta*3
 .main2:
     call .h
-    psrld                m7, m6, 16
-    pblendw              m7, m0, 0xaa ; 67
-    WARP_V               11, 1, 3, 5, 7
+    psrld                m6, m5, 16
+    pblendw              m6, m0, 0xaa ; 57
+    WARP_V                7, 1, 3, 4, 6
     call .h
-    psrld                m7, 16
-    pblendw              m7, m0, 0xaa ; 78
-    WARP_V                0, 2, 4, 6, 7
+    mova                 m1, m2
+    mova                 m2, m3
+    psrld                m3, 16
+    pblendw              m3, m0, 0xaa ; 68
+    WARP_V                0, 4, 6, 1, 3
+    mova                 m4, m5
+    mova                 m5, m6
     ret
 ALIGN function_align
 .h:
-    lea               tmp1d, [mxq+alphaq*1]
-    lea               tmp2d, [mxq+alphaq*2]
+    lea               tmp1d, [mxq+alphaq*4]
+    lea               tmp2d, [mxq+alphaq*1]
+    vbroadcasti128      m10, [srcq]
     shr                 mxd, 10
     shr               tmp1d, 10
-    vbroadcasti128      m10, [srcq]
     movq                xm8, [filterq+mxq  *8]
-    movhps              xm8, [filterq+tmp1q*8]
-    lea               tmp1d, [tmp2q+alphaq*1]
-    lea                 mxd, [tmp2q+alphaq*2]
+    vinserti128          m8, [filterq+tmp1q*8], 1
+    lea               tmp1d, [tmp2q+alphaq*4]
+    lea                 mxd, [tmp2q+alphaq*1]
     shr               tmp2d, 10
     shr               tmp1d, 10
-    movq                xm9, [filterq+tmp2q*8]
-    movhps              xm9, [filterq+tmp1q*8]
-    lea               tmp1d, [mxq+alphaq*1]
-    lea               tmp2d, [mxq+alphaq*2]
+    movq                xm0, [filterq+tmp2q*8]
+    vinserti128          m0, [filterq+tmp1q*8], 1
+    lea               tmp1d, [mxq+alphaq*4]
+    lea               tmp2d, [mxq+alphaq*1]
     shr                 mxd, 10
     shr               tmp1d, 10
-    vpbroadcastq         m0, [filterq+mxq  *8]
-    vpblendd             m8, m0, 0x30
-    vpbroadcastq         m0, [filterq+tmp1q*8]
-    vpblendd             m8, m0, 0xc0      ; 0 1   4 5
-    pshufb               m0, m10, m12
-    pmaddubsw            m0, m8
-    lea               tmp1d, [tmp2q+alphaq*1]
+    movq                xm9, [filterq+mxq  *8]
+    vinserti128          m9, [filterq+tmp1q*8], 1
+    lea               tmp1d, [tmp2q+alphaq*4]
     lea                 mxd, [tmp2q+betaq] ; mx += beta
     shr               tmp2d, 10
     shr               tmp1d, 10
-    vpbroadcastq         m8, [filterq+tmp2q*8]
-    vpblendd             m9, m8, 0x30
-    vpbroadcastq         m8, [filterq+tmp1q*8]
-    vpblendd             m9, m8, 0xc0      ; 2 3   6 7
+    punpcklqdq           m8, m0  ; 0 1   4 5
+    movq                xm0, [filterq+tmp2q*8]
+    vinserti128          m0, [filterq+tmp1q*8], 1
+    punpcklqdq           m9, m0  ; 2 3   6 7
+    pshufb               m0, m10, m12
+    pmaddubsw            m0, m8
     pshufb              m10, m13
     pmaddubsw           m10, m9
     add                srcq, ssq
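
As an aside on the rounding change in .start above: pmulhrsw against
pw_8192 (m14) computes (x*8192 + (1 << 14)) >> 15, i.e. (x + 2) >> 2,
while pavgw against the zeroed m11 computes (x + 1) >> 1, so
"psrad 18; pavgw" produces the same rounded result as the old
"psrad 17; pmulhrsw". A minimal scalar C sketch of that equivalence
(not part of the patch; the pmulhrsw/pavgw functions below are my own
scalar models of the instructions, and the check only covers in-range
values, ignoring the saturation done by packssdw/packusdw):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* scalar model of pmulhrsw: (a*b + (1 << 14)) >> 15 */
    static int16_t pmulhrsw(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
    }

    /* scalar model of pavgw: (a + b + 1) >> 1 on unsigned words */
    static uint16_t pavgw(uint16_t a, uint16_t b)
    {
        return (uint16_t)(((uint32_t)a + b + 1) >> 1);
    }

    int main(void)
    {
        for (int32_t x = 0; x < (1 << 24); x += 97) {
            int32_t old_path = pmulhrsw((int16_t)(x >> 17), 8192);
            int32_t new_path = pavgw((uint16_t)(x >> 18), 0);
            /* both reduce to (x + (1 << 18)) >> 19 */
            assert(old_path == new_path);
            assert(new_path == (x + (1 << 18)) >> 19);
        }
        puts("rounding paths match");
        return 0;
    }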