shithub: dav1d

Download patch

ref: 652e5b38b07fddb7f000a0d001a7c36fcbb599a7
parent: 6cf58c8e7deb54e287afeee6710b2a3774eded9c
author: Victorien Le Couviour--Tuffet <[email protected]>
date: Tue Jun 30 12:32:42 EDT 2020

x86: Minor changes to MC scaled AVX2 asm

--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -2766,9 +2766,9 @@
 %ifidn %1, put
  %assign isprep 0
  %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
+cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy ; 96 -> 112: .dy1_w_start now spills 16 bytes of filter coefficients at [rsp+96] (NOTE(review): assumes the 5th cglobal argument is the stack size - confirm in x86inc)
  %else
-cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy ; 112 -> 128: dsm moves from [rsp+96] to [rsp+112] in this patch, presumably to leave [rsp+96..111] to that spill - confirm
  %endif
  %xdefine base_reg r12
  %define rndshift 10
@@ -2775,11 +2775,11 @@
 %else
  %assign isprep 1
  %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy ; 112 -> 128, the same +16 as put (NOTE(review): presumably so hm, now [rsp+112], stays clear of the dy1 coefficient spill at [rsp+96] - confirm)
   %xdefine tmp_stridem r14q
  %else
-cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
-  %define tmp_stridem qword [rsp+104]
+cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy ; 112 -> 128: tmp_stridem moves from [rsp+104] to [rsp+120], out of the [rsp+96..111] range that .dy1_w_start now writes
+  %define tmp_stridem qword [rsp+120]
  %endif
  %xdefine base_reg r11
  %define rndshift 6
@@ -2808,7 +2808,7 @@
   %define hm r6m
  %endif
  %if required_stack_alignment > STACK_ALIGNMENT
-  %define dsm [rsp+96]
+  %define dsm [rsp+112]
   %define rX r1
   %define rXd r1d
  %else
@@ -2824,7 +2824,7 @@
   %define dxm r7m
  %else
   DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
-  %define hm [rsp+96]
+  %define hm [rsp+112]
  %endif
  MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
  %define rX r14
@@ -3104,181 +3104,9 @@
     lea                srcq, [srcq+ssq*2]
     jmp .w4_loop
 .w8:
-%ifidn %1, put
-    movifnidn           dsm, dsq
-%endif
-    shr                 t0d, 16
-    sub                srcq, 3
-    movd               xm15, t0d
-    pmaddwd              m8, [base+rescale_mul]
-    vpbroadcastq        m11, [base+pq_0x40000000]
-    vpbroadcastd        m15, xm15
-    paddd               m14, m8 ; mx+dx*[0-7]
-    pand                 m6, m14, m10
-    psrld                m6, 6
-    paddd               m15, m6
-    pcmpeqd              m6, m9
-    vextracti128        xm7, m15, 1
-    movd                r4d, xm15
-    pextrd              r6d, xm15, 2
-    pextrd              r7d, xm15, 1
-    pextrd              r9d, xm15, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    movq               xm15, [base+subpel_filters+r4*8]
-    movq               xm10, [base+subpel_filters+r6*8]
-    movhps             xm15, [base+subpel_filters+r7*8]
-    movhps             xm10, [base+subpel_filters+r9*8]
-    vinserti128         m15, [base+subpel_filters+r10*8], 1
-    vinserti128         m10, [base+subpel_filters+r11*8], 1
-    vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+rX*8]
-    psrld               m14, 10
-    mova              [rsp], xm14
-    vextracti128        xm7, m14, 1
-    movd                r4d, xm14
-    pextrd              r6d, xm14, 2
-    pextrd              r7d, xm14, 1
-    pextrd              r9d, xm14, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    pshufd               m5, m6, q1100
-    pshufd               m6, m6, q3322
-    vpblendd            m15, m9, 0xc0
-    vpblendd            m10, m8, 0xc0
-    pblendvb            m15, m11, m5
-    pblendvb            m10, m11, m6
-    vbroadcasti128      m14, [base+subpel_s_shuf8]
-    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
-    mov                 dyd, dym
-    pshufb               m0, m14    ; 01a 01b
-    pshufb               m1, m14    ; 23a 23b
-    pshufb               m2, m14    ; 45a 45b
-    pshufb               m3, m14    ; 67a 67b
-    vbroadcasti128      m14, [base+wswap]
-.w8_loop:
-    and                 myd, 0x3ff
-    mov                 r6d, 64 << 24
-    mov                 r4d, myd
-    shr                 r4d, 6
-    lea                 r4d, [t1+r4]
-    cmovnz              r6q, [base+subpel_filters+r4*8]
-    movq               xm11, r6q
-    punpcklbw          xm11, xm11
-    psraw              xm11, 8
-    vinserti128         m11, xm11, 1
-    pshufd               m8, m11, q0000
-    pshufd               m9, m11, q1111
-    pmaddwd              m4, m0, m8
-    pmaddwd              m5, m1, m9
-    pshufd               m8, m11, q2222
-    pshufd              m11, m11, q3333
-    pmaddwd              m6, m2, m8
-    pmaddwd              m7, m3, m11
-    paddd                m4, m5
-    paddd                m6, m7
-    paddd                m4, m13
-    paddd                m4, m6
-    psrad                m4, rndshift
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-%ifidn %1, put
-    packuswb            xm4, xm4
-    movq             [dstq], xm4
-    add                dstq, dsm
-%else
-    mova             [tmpq], xm4
-    add                tmpq, 16
-%endif
-    dec                  hd
-    jz .ret
-    add                 myd, dyd
-    test                myd, ~0x3ff
-    jz .w8_loop
-    test                myd, 0x400
-    mov            [rsp+16], myd
-    mov                 r4d, [rsp+ 0]
-    mov                 r6d, [rsp+ 8]
-    mov                 r7d, [rsp+ 4]
-    mov                 r9d, [rsp+12]
-    jz .w8_skip_line
-    vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+ rX]
-    movq                xm4, [srcq+ r4]
-    movq                xm5, [srcq+ r6]
-    movhps              xm4, [srcq+ r7]
-    movhps              xm5, [srcq+ r9]
-    vinserti128          m4, [srcq+r10], 1
-    vinserti128          m5, [srcq+r11], 1
-    add                srcq, ssq
-    mov                 myd, [rsp+16]
-    mov                 dyd, dym
-    pshufb               m0, m14
-    pshufb               m1, m14
-    pshufb               m2, m14
-    pshufb               m3, m14
-    vpblendd             m4, m6, 0xc0
-    vpblendd             m5, m7, 0xc0
-    pmaddubsw            m4, m15
-    pmaddubsw            m5, m10
-    phaddw               m4, m5
-    pslld                m5, m4, 16
-    paddw                m4, m5
-    pmulhrsw             m4, m12
-    pblendw              m0, m1, 0xaa
-    pblendw              m1, m2, 0xaa
-    pblendw              m2, m3, 0xaa
-    pblendw              m3, m4, 0xaa
-    jmp .w8_loop
-.w8_skip_line:
-    mova                 m0, m1
-    mova                 m1, m2
-    mova                 m2, m3
-    vpbroadcastq         m7, [srcq+r13]
-    vpbroadcastq         m8, [srcq+ rX]
-    movq                xm3, [srcq+ r4]
-    movq                xm4, [srcq+ r6]
-    movhps              xm3, [srcq+ r7]
-    movhps              xm4, [srcq+ r9]
-    vinserti128          m3, [srcq+r10], 1
-    vinserti128          m4, [srcq+r11], 1
-    add                srcq, ssq
-    movq                xm5, [srcq+ r4]
-    movq                xm6, [srcq+ r6]
-    movhps              xm5, [srcq+ r7]
-    movhps              xm6, [srcq+ r9]
-    vinserti128          m5, [srcq+r10], 1
-    vinserti128          m6, [srcq+r11], 1
-    vpbroadcastq         m9, [srcq+r13]
-    vpbroadcastq        m11, [srcq+ rX]
-    add                srcq, ssq
-    mov                 myd, [rsp+16]
-    mov                 dyd, dym
-    vpblendd             m3, m7, 0xc0
-    vpblendd             m4, m8, 0xc0
-    vpblendd             m5, m9, 0xc0
-    vpblendd             m6, m11, 0xc0
-    pmaddubsw            m3, m15
-    pmaddubsw            m4, m10
-    pmaddubsw            m5, m15
-    pmaddubsw            m6, m10
-    phaddw               m3, m4
-    phaddw               m5, m6
-    psrld                m4, m3, 16
-    pslld                m6, m5, 16
-    paddw                m3, m4
-    paddw                m5, m6
-    pblendw              m3, m5, 0xaa
-    pmulhrsw             m3, m12
-    jmp .w8_loop
+    mov      dword [rsp+48], 1 ; .w16 stores 2 in this slot (NOTE(review): presumably the number of 8-column strips to process - confirm at the consumer)
+    movifprep   tmp_stridem, 16 ; same 16-byte row step the removed dedicated w8 loop applied with "add tmpq, 16"
+    jmp .w_start ; replaces the dedicated w8 code removed above (NOTE(review): .w_start is outside this hunk - presumably the entry shared with the wider widths)
 .w16:
     mov      dword [rsp+48], 2
     movifprep   tmp_stridem, 32
@@ -3698,127 +3526,9 @@
     jg .dy1_w4_loop
     MC_8TAP_SCALED_RET
 .dy1_w8:
-%ifidn %1, put
-    movifnidn           dsm, dsq
-%endif
-    shr                 t0d, 16
-    sub                srcq, 3
-    movd               xm15, t0d
-    pmaddwd              m8, [base+rescale_mul]
-    vpbroadcastq        m11, [base+pq_0x40000000]
-    vpbroadcastd        m15, xm15
-    paddd               m14, m8 ; mx+dx*[0-7]
-    pand                 m6, m14, m10
-    psrld                m6, 6
-    paddd               m15, m6
-    pcmpeqd              m6, m9
-    vextracti128        xm7, m15, 1
-    movd                r4d, xm15
-    pextrd              r6d, xm15, 2
-    pextrd              r7d, xm15, 1
-    pextrd              r9d, xm15, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    movq               xm15, [base+subpel_filters+ r4*8]
-    movq               xm10, [base+subpel_filters+ r6*8]
-    movhps             xm15, [base+subpel_filters+ r7*8]
-    movhps             xm10, [base+subpel_filters+ r9*8]
-    vinserti128         m15, [base+subpel_filters+r10*8], 1
-    vinserti128         m10, [base+subpel_filters+r11*8], 1
-    vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
-    psrld               m14, 10
-    vextracti128        xm7, m14, 1
-    movd                r4d, xm14
-    pextrd              r6d, xm14, 2
-    pextrd              r7d, xm14, 1
-    pextrd              r9d, xm14, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    mov            [rsp+32], r7d
-    pshufd               m5, m6, q1100
-    pshufd               m6, m6, q3322
-    vpblendd            m15, m9, 0xc0
-    vpblendd            m10, m8, 0xc0
-    pblendvb            m15, m11, m5
-    pblendvb            m10, m11, m6
-    vbroadcasti128      m14, [base+subpel_s_shuf8]
-    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
-    movu              [rsp], m10
-    pshufb               m0, m14    ; 01a 01b
-    pshufb               m1, m14    ; 23a 23b
-    pshufb               m2, m14    ; 45a 45b
-    pshufb               m3, m14    ; 67a 67b
-    shr                 myd, 6
-    lea                 myd, [t1+myq]
-    mov                 t1d, 64 << 24
-    cmovnz              t1q, [base+subpel_filters+myq*8]
-    vbroadcasti128      m14, [base+wswap]
-    movq               xm11, t1q
-    punpcklbw          xm11, xm11
-    psraw              xm11, 8
-    vinserti128         m11, xm11, 1
-    mov                 r7d, [rsp+32]
-    pshufd               m8, m11, q0000
-    pshufd               m9, m11, q1111
-    pshufd              m10, m11, q2222
-    pshufd              m11, m11, q3333
-.dy1_w8_loop:
-    pmaddwd              m4, m0, m8
-    pmaddwd              m5, m1, m9
-    pmaddwd              m6, m2, m10
-    pmaddwd              m7, m3, m11
-    paddd                m4, m5
-    paddd                m6, m7
-    paddd                m4, m13
-    paddd                m4, m6
-    psrad                m4, rndshift
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-%ifidn %1, put
-    packuswb            xm4, xm4
-    movq             [dstq], xm4
-    add                dstq, dsm
-%else
-    mova             [tmpq], xm4
-    add                tmpq, 16
-%endif
-    dec                  hd
-    jz .ret
-    movq                xm4, [srcq+ r4]
-    movq                xm5, [srcq+ r6]
-    movhps              xm4, [srcq+ r7]
-    movhps              xm5, [srcq+ r9]
-    vinserti128          m4, [srcq+r10], 1
-    vinserti128          m5, [srcq+r11], 1
-    vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+ rX]
-    add                srcq, ssq
-    pshufb               m0, m14
-    pshufb               m1, m14
-    pshufb               m2, m14
-    pshufb               m3, m14
-    vpblendd             m4, m6, 0xc0
-    vpblendd             m5, m7, 0xc0
-    pmaddubsw            m4, m15
-    pmaddubsw            m5, [rsp]
-    phaddw               m4, m5
-    pslld                m5, m4, 16
-    paddw                m4, m5
-    pmulhrsw             m4, m12
-    pblendw              m0, m1, 0xaa
-    pblendw              m1, m2, 0xaa
-    pblendw              m2, m3, 0xaa
-    pblendw              m3, m4, 0xaa
-    jmp .dy1_w8_loop
+    mov      dword [rsp+72], 1 ; [rsp+72] is the counter decremented at .dy1_hloop_prep; .dy1_w16 stores 2 (NOTE(review): presumably 1 means a single strip - confirm)
+    movifprep   tmp_stridem, 16 ; same 16-byte row step the removed .dy1_w8_loop applied with "add tmpq, 16"
+    jmp .dy1_w_start ; w8 now reuses the setup shared with the wider widths instead of its own copy
 .dy1_w16:
     mov      dword [rsp+72], 2
     movifprep   tmp_stridem, 32
@@ -3835,11 +3545,16 @@
     mov      dword [rsp+72], 16
     movifprep   tmp_stridem, 256
 .dy1_w_start:
+    mov                 myd, mym ; load my here: the vertical filter is now chosen once, before the jump to .dy1_hloop
 %ifidn %1, put
     movifnidn           dsm, dsq
 %endif
     shr                 t0d, 16
     sub                srcq, 3
+    shr                 myd, 6 ; sets ZF for the cmovnz below; the mov and lea in between leave flags untouched
+    mov                 r4d, 64 << 24 ; value kept when (my >> 6) == 0: 64 in byte 3, upper dword zeroed by the 32-bit write
+    lea                 myd, [t1+myq] ; table index = t1 + (my >> 6)
+    cmovnz              r4q, [base+subpel_filters+myq*8] ; (my >> 6) != 0: take the 8-byte entry from subpel_filters instead
     pmaddwd              m8, [base+rescale_mul]
     pmaddwd              m8, [base+rescale_mul]
     movd               xm15, t0d
     mov            [rsp+76], t0d
@@ -3851,6 +3566,10 @@
     shl           dword dxm, 3 ; dx*8
     vpbroadcastd        m15, xm15
     paddd               m14, m8 ; mx+dx*[0-7]
+    movq                xm0, r4q ; filter qword chosen at the top of .dy1_w_start (table entry or the 64 << 24 default)
+    punpcklbw           xm0, xm0 ; each byte duplicated into both halves of a word...
+    psraw               xm0, 8 ; ...so the arithmetic shift leaves 8 sign-extended 16-bit coefficients
+    mova           [rsp+96], xm0 ; spill (96 == 0x60): reloaded ahead of .dy1_vloop by vpbroadcastd from [rsp+0x60..0x6c]
     jmp .dy1_hloop
 .dy1_hloop_prep:
     dec      dword [rsp+72]
@@ -3910,27 +3629,16 @@
     MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
     MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
     MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
     movu              [rsp], m10
+    vpbroadcastd         m8, [rsp+0x60] ; coefficient words 0,1 (stored by mova [rsp+96] in .dy1_w_start) in every dword; pmaddwd operand in .dy1_vloop
+    vpbroadcastd         m9, [rsp+0x64] ; coefficient words 2,3
+    vpbroadcastd        m10, [rsp+0x68] ; coefficient words 4,5; m10's previous contents were saved to [rsp] just above
+    vpbroadcastd        m11, [rsp+0x6c] ; coefficient words 6,7
     pshufb               m0, m14    ; 01a 01b
     pshufb               m1, m14    ; 23a 23b
     pshufb               m2, m14    ; 45a 45b
     pshufb               m3, m14    ; 67a 67b
-    shr                 myd, 6
-    mov                 r4d, 64 << 24
-    lea                 myd, [t1+myq]
-    cmovnz              r4q, [base+subpel_filters+myq*8]
     vbroadcasti128      m14, [base+wswap]
-    movq               xm11, r4q
-    punpcklbw          xm11, xm11
-    psraw              xm11, 8
-    vinserti128         m11, xm11, 1
-    mov                 r4d, [rsp+64]
-    mov                 r7d, [rsp+68]
-    pshufd               m8, m11, q0000
-    pshufd               m9, m11, q1111
-    pshufd              m10, m11, q2222
-    pshufd              m11, m11, q3333
 .dy1_vloop:
     pmaddwd              m4, m0, m8
     pmaddwd              m5, m1, m9
@@ -4182,137 +3890,9 @@
     jg .dy2_w4_loop
     MC_8TAP_SCALED_RET
 .dy2_w8:
-%ifidn %1, put
-    movifnidn           dsm, dsq
-%endif
-    shr                 t0d, 16
-    sub                srcq, 3
-    movd               xm15, t0d
-    pmaddwd              m8, [base+rescale_mul]
-    vpbroadcastq        m11, [base+pq_0x40000000]
-    vpbroadcastd        m15, xm15
-    paddd               m14, m8 ; mx+dx*[0-7]
-    pand                 m6, m14, m10
-    psrld                m6, 6
-    paddd               m15, m6
-    pcmpeqd              m6, m9
-    vextracti128        xm7, m15, 1
-    movd                r4d, xm15
-    pextrd              r6d, xm15, 2
-    pextrd              r7d, xm15, 1
-    pextrd              r9d, xm15, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    movq               xm15, [base+subpel_filters+ r4*8]
-    movq               xm10, [base+subpel_filters+ r6*8]
-    movhps             xm15, [base+subpel_filters+ r7*8]
-    movhps             xm10, [base+subpel_filters+ r9*8]
-    vinserti128         m15, [base+subpel_filters+r10*8], 1
-    vinserti128         m10, [base+subpel_filters+r11*8], 1
-    vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
-    psrld               m14, 10
-    vextracti128        xm7, m14, 1
-    movd                r4d, xm14
-    pextrd              r6d, xm14, 2
-    pextrd              r7d, xm14, 1
-    pextrd              r9d, xm14, 3
-    movd               r10d, xm7
-    pextrd             r11d, xm7, 2
-    pextrd             r13d, xm7, 1
-    pextrd              rXd, xm7, 3
-    mov               [rsp], r7d
-    pshufd               m5, m6, q1100
-    pshufd               m6, m6, q3322
-    vpblendd            m15, m9, 0xc0
-    vpblendd            m10, m8, 0xc0
-    pblendvb            m15, m11, m5
-    pblendvb            m10, m11, m6
-    vbroadcasti128      m14, [base+subpel_s_shuf8]
-    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
-    pshufb               m0, m14    ; 01a 01b
-    pshufb               m1, m14    ; 23a 23b
-    pshufb               m2, m14    ; 45a 45b
-    pshufb               m3, m14    ; 67a 67b
-    shr                 myd, 6
-    lea                 myd, [t1+myq]
-    mov                 t1d, 64 << 24
-    cmovnz              t1q, [base+subpel_filters+myq*8]
-    movq               xm11, t1q
-    punpcklbw          xm11, xm11
-    psraw              xm11, 8
-    vinserti128         m11, xm11, 1
-    mov                 r7d, [rsp]
-    pshufd               m8, m11, q0000
-    pshufd               m9, m11, q1111
-    pshufd              m14, m11, q2222
-    pshufd              m11, m11, q3333
-.dy2_w8_loop:
-    pmaddwd              m4, m0, m8
-    pmaddwd              m5, m1, m9
-    pmaddwd              m6, m2, m14
-    pmaddwd              m7, m3, m11
-    paddd                m4, m5
-    paddd                m6, m7
-    paddd                m4, m13
-    paddd                m4, m6
-    psrad                m4, rndshift
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-%ifidn %1, put
-    packuswb            xm4, xm4
-    movq             [dstq], xm4
-    add                dstq, dsm
-%else
-    mova             [tmpq], xm4
-    add                tmpq, 16
-%endif
-    dec                  hd
-    jz .ret
-    mova                 m0, m1
-    mova                 m1, m2
-    mova                 m2, m3
-    movq                xm3, [srcq+ r4]
-    movq                xm4, [srcq+ r6]
-    movhps              xm3, [srcq+ r7]
-    movhps              xm4, [srcq+ r9]
-    vinserti128          m3, [srcq+r10], 1
-    vinserti128          m4, [srcq+r11], 1
-    vpbroadcastq         m5, [srcq+r13]
-    vpbroadcastq         m6, [srcq+ rX]
-    add                srcq, ssq
-    vpblendd             m3, m5, 0xc0
-    vpblendd             m4, m6, 0xc0
-    pmaddubsw            m3, m15
-    pmaddubsw            m4, m10
-    phaddw               m3, m4
-    movq                xm4, [srcq+ r4]
-    movq                xm5, [srcq+ r6]
-    movhps              xm4, [srcq+ r7]
-    movhps              xm5, [srcq+ r9]
-    vinserti128          m4, [srcq+r10], 1
-    vinserti128          m5, [srcq+r11], 1
-    vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+ rX]
-    add                srcq, ssq
-    vpblendd             m4, m6, 0xc0
-    vpblendd             m5, m7, 0xc0
-    pmaddubsw            m4, m15
-    pmaddubsw            m5, m10
-    phaddw               m4, m5
-    psrld                m5, m3, 16
-    pslld                m6, m4, 16
-    paddw                m3, m5
-    paddw                m4, m6
-    pblendw              m3, m4, 0xaa
-    pmulhrsw             m3, m12
-    jmp .dy2_w8_loop
+    mov      dword [rsp+40], 1 ; [rsp+40] is the counter decremented at .dy2_hloop_prep; .dy2_w16 stores 2 (NOTE(review): presumably 1 means a single strip - confirm)
+    movifprep   tmp_stridem, 16 ; same 16-byte row step the removed .dy2_w8_loop applied with "add tmpq, 16"
+    jmp .dy2_w_start ; w8 now reuses the setup shared with the wider widths instead of its own copy
 .dy2_w16:
     mov      dword [rsp+40], 2
     movifprep   tmp_stridem, 32
@@ -4329,11 +3909,16 @@
     mov      dword [rsp+40], 16
     movifprep   tmp_stridem, 256
 .dy2_w_start:
+    mov                 myd, mym ; load my here: the vertical filter is now chosen once, before the jump to .dy2_hloop
 %ifidn %1, put
     movifnidn           dsm, dsq
 %endif
     shr                 t0d, 16
     sub                srcq, 3
+    shr                 myd, 6 ; sets ZF for the cmovnz below; the mov and lea in between leave flags untouched
+    mov                 r4d, 64 << 24 ; value kept when (my >> 6) == 0: 64 in byte 3, upper dword zeroed by the 32-bit write
+    lea                 myd, [t1+myq] ; table index = t1 + (my >> 6)
+    cmovnz              r4q, [base+subpel_filters+myq*8] ; (my >> 6) != 0: take the 8-byte entry from subpel_filters instead
     pmaddwd              m8, [base+rescale_mul]
     movd               xm15, t0d
     mov            [rsp+64], t0d
@@ -4345,6 +3930,10 @@
     shl           dword dxm, 3 ; dx*8
     vpbroadcastd        m15, xm15
     paddd               m14, m8 ; mx+dx*[0-7]
+    movq                xm0, r4q ; filter qword chosen at the top of .dy2_w_start (table entry or the 64 << 24 default)
+    punpcklbw           xm0, xm0 ; each byte duplicated into both halves of a word...
+    psraw               xm0, 8 ; ...so the arithmetic shift leaves 8 sign-extended 16-bit coefficients
+    mova         [rsp+0x50], xm0 ; spill: reloaded ahead of .dy2_vloop by vpbroadcastd from [rsp+0x50..0x5c]
     jmp .dy2_hloop
 .dy2_hloop_prep:
     dec      dword [rsp+40]
@@ -4384,7 +3973,6 @@
     vpbroadcastq         m8, [base+subpel_filters+ rX*8]
     psrld               m14, 10
     vextracti128        xm7, m14, 1
-    movq           [rsp+32], xm14
     movd                r4d, xm14
     pextrd              r6d, xm14, 2
     pextrd              r7d, xm14, 1
@@ -4404,25 +3992,15 @@
     MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
     MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
     MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-    mov                 myd, mym
+    vpbroadcastd         m8, [rsp+0x50] ; coefficient words 0,1 (stored in .dy2_w_start) in every dword; pmaddwd operand in .dy2_vloop
+    vpbroadcastd         m9, [rsp+0x54] ; coefficient words 2,3
+    vpbroadcastd        m11, [rsp+0x58] ; coefficient words 4,5
+    vpbroadcastd         m4, [rsp+0x5c] ; coefficient words 6,7, held in m4 for now: m14 is still the pshufb mask for the four shuffles below
     pshufb               m0, m14    ; 01a 01b
     pshufb               m1, m14    ; 23a 23b
     pshufb               m2, m14    ; 45a 45b
     pshufb               m3, m14    ; 67a 67b
-    shr                 myd, 6
-    mov                 r4d, 64 << 24
-    lea                 myd, [t1+myq]
-    cmovnz              r4q, [base+subpel_filters+myq*8]
-    movq               xm14, r4q
-    punpcklbw          xm14, xm14
-    psraw              xm14, 8
-    vinserti128         m14, xm14, 1
-    mov                 r4d, [rsp+32]
-    mov                 r7d, [rsp+36]
-    pshufd               m8, m14, q0000
-    pshufd               m9, m14, q1111
-    pshufd              m11, m14, q2222
-    pshufd              m14, m14, q3333
+    SWAP                m14, m4 ; coefficient words 6,7 are now named m14, as the replaced pshufd sequence left them; the mask register becomes m4, which .dy2_vloop overwrites first (NOTE(review): assumes x86inc SWAP is an assemble-time rename that emits no instruction - confirm)
 .dy2_vloop:
     pmaddwd              m4, m0, m8
     pmaddwd              m5, m1, m9