shithub: dav1d

ref: f753caeac09bcf7ebe5d3fe1c0903deb277e9433
parent: f813285c1d1a5421e0180efbb7cbdd377cd31c69
author: Henrik Gramner <[email protected]>
date: Mon Jan 14 10:47:13 EST 2019

Add minor x86 bilin mc optimizations
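
For reference, the bilinear filters these kernels compute are the ones spelled out in the inline comments: a 4-bit subpel fraction (0..15) blends two neighboring pixels, and the hv path keeps the horizontal pass at 4 extra bits of precision before the vertical blend. In the SIMD code the weight pair (16-mx, mx) is packed into bytes for pmaddubsw, and the final rounding shift is done with pmulhrsw against pw_2048 (for a 16-bit value x, (x*2048*2 + 0x8000) >> 16 == (x + 8) >> 4). Below is a minimal scalar sketch in C of the three variants; the function names and signatures are illustrative only, not dav1d's actual C code.

    /* Scalar sketch of the bilin put filters, following the formulas in the
     * asm comments. mx/my are 4-bit subpel fractions (0..15). */
    #include <stddef.h>
    #include <stdint.h>

    static uint8_t bilin_h(const uint8_t *src, int x, int mx)
    {
        /* ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 */
        return ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4;
    }

    static uint8_t bilin_v(const uint8_t *src, int x, ptrdiff_t stride, int my)
    {
        /* same weighting, applied vertically */
        return ((16 - my) * src[x] + my * src[x + stride] + 8) >> 4;
    }

    static uint8_t bilin_hv(const uint8_t *src, int x, ptrdiff_t stride,
                            int mx, int my)
    {
        /* a and b are the horizontally filtered rows kept at 4 extra bits;
         * they play the role of src[x] / src[x + src_stride] in the asm
         * comment (16 * a + my * (b - a) + 128) >> 8. */
        const int a = (16 - mx) * src[x]          + mx * src[x + 1];
        const int b = (16 - mx) * src[x + stride] + mx * src[x + stride + 1];
        return (16 * a + my * (b - a) + 128) >> 8;
    }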

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -265,7 +265,6 @@
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
     imul               mxyd, 0xff01
     vbroadcasti128       m4, [bilin_h_shuf8]
-    WIN64_SPILL_XMM       7
     add                mxyd, 16 << 8
     movd                xm5, mxyd
     mov                mxyd, r7m ; my
@@ -273,7 +272,7 @@
     test               mxyd, mxyd
     jnz .hv
     movzx                wd, word [t2+wq*2+table_offset(put, _bilin_h)]
-    vpbroadcastd         m6, [pw_2048]
+    vpbroadcastd         m3, [pw_2048]
     add                  wq, t2
     jmp                  wq
 .h_w2:
@@ -282,7 +281,7 @@
     lea                srcq, [srcq+ssq*2]
     pshufb              xm0, xm4
     pmaddubsw           xm0, xm5
-    pmulhrsw            xm0, xm6
+    pmulhrsw            xm0, xm3
     packuswb            xm0, xm0
     pextrw     [dstq+dsq*0], xm0, 0
     pextrw     [dstq+dsq*1], xm0, 2
@@ -298,7 +297,7 @@
     lea                srcq, [srcq+ssq*2]
     pshufb              xm0, xm4
     pmaddubsw           xm0, xm5
-    pmulhrsw            xm0, xm6
+    pmulhrsw            xm0, xm3
     packuswb            xm0, xm0
     movd       [dstq+dsq*0], xm0
     pextrd     [dstq+dsq*1], xm0, 1
@@ -314,8 +313,8 @@
     pshufb              xm1, xm4
     pmaddubsw           xm0, xm5
     pmaddubsw           xm1, xm5
-    pmulhrsw            xm0, xm6
-    pmulhrsw            xm1, xm6
+    pmulhrsw            xm0, xm3
+    pmulhrsw            xm1, xm3
     packuswb            xm0, xm1
     movq       [dstq+dsq*0], xm0
     movhps     [dstq+dsq*1], xm0
@@ -333,8 +332,8 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-    pmulhrsw             m0, m6
-    pmulhrsw             m1, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     packuswb             m0, m1
     mova         [dstq+dsq*0], xm0
     vextracti128 [dstq+dsq*1], m0, 1
@@ -350,8 +349,8 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-    pmulhrsw             m0, m6
-    pmulhrsw             m1, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     packuswb             m0, m1
     mova             [dstq], m0
     add                dstq, dsq
@@ -361,25 +360,25 @@
 .h_w64:
     movu                 m0, [srcq+8*0]
     movu                 m1, [srcq+8*1]
-    movu                 m2, [srcq+8*4]
-    movu                 m3, [srcq+8*5]
-    add                srcq, ssq
     pshufb               m0, m4
     pshufb               m1, m4
-    pshufb               m2, m4
-    pshufb               m3, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-    pmaddubsw            m2, m5
-    pmaddubsw            m3, m5
-    pmulhrsw             m0, m6
-    pmulhrsw             m1, m6
-    pmulhrsw             m2, m6
-    pmulhrsw             m3, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     packuswb             m0, m1
-    packuswb             m2, m3
+    movu                 m1, [srcq+8*4]
+    movu                 m2, [srcq+8*5]
+    add                srcq, ssq
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmulhrsw             m1, m3
+    pmulhrsw             m2, m3
+    packuswb             m1, m2
     mova        [dstq+32*0], m0
-    mova        [dstq+32*1], m2
+    mova        [dstq+32*1], m1
     add                dstq, dsq
     dec                  hd
     jg .h_w64
@@ -393,8 +392,8 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-    pmulhrsw             m0, m6
-    pmulhrsw             m1, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     packuswb             m0, m1
     mova     [dstq+t1+32*3], m0
     add                  t1, 32
@@ -406,14 +405,12 @@
     RET
 .v:
     movzx                wd, word [t2+wq*2+table_offset(put, _bilin_v)]
-    %assign stack_offset stack_offset - stack_size_padded
-    WIN64_SPILL_XMM       8
     imul               mxyd, 0xff01
-    vpbroadcastd         m7, [pw_2048]
+    vpbroadcastd         m5, [pw_2048]
     add                mxyd, 16 << 8
     add                  wq, t2
-    movd                xm6, mxyd
-    vpbroadcastw         m6, xm6
+    movd                xm4, mxyd
+    vpbroadcastw         m4, xm4
     jmp                  wq
 .v_w2:
     movd                xm0,      [srcq+ssq*0]
@@ -423,8 +420,8 @@
     pinsrw              xm0, xm1, [srcq+ssq*0], 0 ; 2 1
     pshuflw             xm1, xm1, q2301           ; 1 0
     punpcklbw           xm1, xm0, xm1
-    pmaddubsw           xm1, xm6
-    pmulhrsw            xm1, xm7
+    pmaddubsw           xm1, xm4
+    pmulhrsw            xm1, xm5
     packuswb            xm1, xm1
     pextrw     [dstq+dsq*0], xm1, 1
     pextrw     [dstq+dsq*1], xm1, 0
@@ -441,8 +438,8 @@
     vpbroadcastd        xm0, [srcq+ssq*0]
     vpblendd            xm1, xm1, xm0, 0x02 ; 1 2
     punpcklbw           xm1, xm2
-    pmaddubsw           xm1, xm6
-    pmulhrsw            xm1, xm7
+    pmaddubsw           xm1, xm4
+    pmulhrsw            xm1, xm5
     packuswb            xm1, xm1
     movd       [dstq+dsq*0], xm1
     pextrd     [dstq+dsq*1], xm1, 1
@@ -453,20 +450,18 @@
 .v_w8:
     movq                xm0, [srcq+ssq*0]
 .v_w8_loop:
-    vpbroadcastq        xm1, [srcq+ssq*1]
+    movq                xm3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    vpblendd            xm2, xm1, xm0, 0x03 ; 0 1
-    vpbroadcastq        xm0, [srcq+ssq*0]
-    vpblendd            xm1, xm1, xm0, 0x0c ; 1 2
-    punpcklbw           xm3, xm1, xm2
-    punpckhbw           xm1, xm2
-    pmaddubsw           xm3, xm6
-    pmaddubsw           xm1, xm6
-    pmulhrsw            xm3, xm7
-    pmulhrsw            xm1, xm7
-    packuswb            xm3, xm1
-    movq       [dstq+dsq*0], xm3
-    movhps     [dstq+dsq*1], xm3
+    punpcklbw           xm1, xm3, xm0
+    movq                xm0, [srcq+ssq*0]
+    punpcklbw           xm2, xm0, xm3
+    pmaddubsw           xm1, xm4
+    pmaddubsw           xm2, xm4
+    pmulhrsw            xm1, xm5
+    pmulhrsw            xm2, xm5
+    packuswb            xm1, xm2
+    movq       [dstq+dsq*0], xm1
+    movhps     [dstq+dsq*1], xm1
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w8_loop
@@ -481,10 +476,10 @@
     vpblendd             m2, m2, m0, 0xf0 ; 1 2
     punpcklbw            m1, m2, m3
     punpckhbw            m2, m3
-    pmaddubsw            m1, m6
-    pmaddubsw            m2, m6
-    pmulhrsw             m1, m7
-    pmulhrsw             m2, m7
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
     packuswb             m1, m2
     mova         [dstq+dsq*0], xm1
     vextracti128 [dstq+dsq*1], m1, 1
@@ -496,25 +491,25 @@
 %macro PUT_BILIN_V_W32 0
     movu                 m0, [srcq+ssq*0]
 %%loop:
-    movu                 m4, [srcq+ssq*1]
+    movu                 m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw            m1, m4, m0
-    punpckhbw            m3, m4, m0
+    punpcklbw            m1, m3, m0
+    punpckhbw            m2, m3, m0
     movu                 m0, [srcq+ssq*0]
-    punpcklbw            m2, m0, m4
-    punpckhbw            m4, m0, m4
-    pmaddubsw            m1, m6
-    pmaddubsw            m3, m6
-    pmaddubsw            m2, m6
-    pmaddubsw            m4, m6
-    pmulhrsw             m1, m7
-    pmulhrsw             m3, m7
-    pmulhrsw             m2, m7
-    pmulhrsw             m4, m7
-    packuswb             m1, m3
-    packuswb             m2, m4
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
     mova       [dstq+dsq*0], m1
-    mova       [dstq+dsq*1], m2
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
+    mova       [dstq+dsq*1], m1
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg %%loop
@@ -527,25 +522,25 @@
 .v_w64_loop:
     add                srcq, ssq
     movu                 m3, [srcq+32*0]
-    movu                 m4, [srcq+32*1]
     punpcklbw            m2, m3, m0
-    punpckhbw            m5, m3, m0
-    pmaddubsw            m2, m6
-    pmaddubsw            m5, m6
+    punpckhbw            m0, m3, m0
+    pmaddubsw            m2, m4
+    pmaddubsw            m0, m4
+    pmulhrsw             m2, m5
+    pmulhrsw             m0, m5
+    packuswb             m2, m0
     mova                 m0, m3
-    pmulhrsw             m2, m7
-    pmulhrsw             m5, m7
-    packuswb             m2, m5
-    punpcklbw            m3, m4, m1
-    punpckhbw            m5, m4, m1
-    pmaddubsw            m3, m6
-    pmaddubsw            m5, m6
-    mova                 m1, m4
-    pmulhrsw             m3, m7
-    pmulhrsw             m5, m7
-    packuswb             m3, m5
+    movu                 m3, [srcq+32*1]
     mova        [dstq+32*0], m2
-    mova        [dstq+32*1], m3
+    punpcklbw            m2, m3, m1
+    punpckhbw            m1, m3, m1
+    pmaddubsw            m2, m4
+    pmaddubsw            m1, m4
+    pmulhrsw             m2, m5
+    pmulhrsw             m1, m5
+    packuswb             m2, m1
+    mova                 m1, m3
+    mova        [dstq+32*1], m2
     add                dstq, dsq
     dec                  hd
     jg .v_w64_loop
@@ -568,7 +563,6 @@
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
     movzx                wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
-    %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM       8
     shl                mxyd, 11 ; can't shift by 12 due to signed overflow
     vpbroadcastd         m7, [pw_2048]
@@ -684,7 +678,14 @@
     jg .hv_w16_loop
     RET
 .hv_w32:
-%macro PUT_BILIN_HV_W32 0
+    xor                 t2d, t2d
+.hv_w32gt:
+    mov                  t0, dstq
+    mov                  t1, srcq
+%if WIN64
+    movaps              r4m, xmm8
+%endif
+.hv_w32_loop0:
     movu                 m0,     [srcq+8*0]
     vinserti128          m0, m0, [srcq+8*2], 1
     movu                 m1,     [srcq+8*1]
@@ -693,10 +694,7 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-%if WIN64
-    movaps              r4m, xmm8
-%endif
-%%loop:
+.hv_w32_loop:
     add                srcq, ssq
     movu                xm2,     [srcq+8*1]
     vinserti128          m2, m2, [srcq+8*3], 1
@@ -722,41 +720,24 @@
     mova             [dstq], m3
     add                dstq, dsq
     dec                  hd
-    jg %%loop
-%if WIN64
-    movaps             xmm8, r4m
-%endif
-%endmacro
-    PUT_BILIN_HV_W32
-    RET
-.hv_w64:
-    mov                  t0, dstq
-    mov                  t1, srcq
-    lea                 t2d, [hq+(1<<8)]
-.hv_w64_loop:
-    PUT_BILIN_HV_W32
-    mov                  hb, t2b
+    jg .hv_w32_loop
+    movzx                hd, t2b
     add                  t0, 32
     add                  t1, 32
     mov                dstq, t0
     mov                srcq, t1
     sub                 t2d, 1<<8
-    jg .hv_w64_loop
+    jg .hv_w32_loop0
+%if WIN64
+    movaps             xmm8, r4m
+%endif
     RET
+.hv_w64:
+    lea                 t2d, [hq+(1<<8)]
+    jmp .hv_w32gt
 .hv_w128:
-    mov                  t0, dstq
-    mov                  t1, srcq
     lea                 t2d, [hq+(3<<8)]
-.hv_w128_loop:
-    PUT_BILIN_HV_W32
-    mov                  hb, t2b
-    add                  t0, 32
-    add                  t1, 32
-    mov                dstq, t0
-    mov                srcq, t1
-    sub                 t2d, 1<<8
-    jg .hv_w128_loop
-    RET
+    jmp .hv_w32gt
 
 DECLARE_REG_TMP 3, 5, 6
 cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -266,7 +266,6 @@
     imul               mxyd, 0xff01
     mova                 m4, [base+bilin_h_shuf8]
     mova                 m0, [base+bilin_h_shuf4]
-    WIN64_SPILL_XMM       7
     add                mxyd, 16 << 8
     movd                 m5, mxyd
     mov                mxyd, r7m ; my
@@ -275,7 +274,7 @@
     test               mxyd, mxyd
     jnz .hv
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)]
-    mova                 m6, [base+pw_2048]
+    mova                 m3, [base+pw_2048]
     add                  wq, t0
     RESTORE_DSQ_32       t0
     jmp                  wq
@@ -288,7 +287,7 @@
     punpckldq            m0, m1
     pshufb               m0, m4
     pmaddubsw            m0, m5
-    pmulhrsw             m0, m6
+    pmulhrsw             m0, m3
     packuswb             m0, m0
     movd                r6d, m0
     mov        [dstq+dsq*0], r6w
@@ -304,10 +303,10 @@
     lea                srcq, [srcq+ssq*2]
     pshufb               m4, m0
     pmaddubsw            m4, m5
-    pmulhrsw             m4, m6
+    pmulhrsw             m4, m3
     packuswb             m4, m4
     movd       [dstq+dsq*0], m4
-    pshufd               m4, m4, q0101
+    psrlq                m4, 32
     movd       [dstq+dsq*1], m4
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
@@ -321,8 +320,8 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-    pmulhrsw             m0, m6
-    pmulhrsw             m1, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     packuswb             m0, m1
     movq       [dstq+dsq*0], m0
     movhps     [dstq+dsq*1], m0
@@ -338,8 +337,8 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-    pmulhrsw             m0, m6
-    pmulhrsw             m1, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     packuswb             m0, m1
     mova             [dstq], m0
     add                dstq, dsq
@@ -349,25 +348,25 @@
 .h_w32:
     movu                 m0, [srcq+mmsize*0+8*0]
     movu                 m1, [srcq+mmsize*0+8*1]
-    movu                 m2, [srcq+mmsize*1+8*0]
-    movu                 m3, [srcq+mmsize*1+8*1]
-    add                srcq, ssq
     pshufb               m0, m4
     pshufb               m1, m4
-    pshufb               m2, m4
-    pshufb               m3, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-    pmaddubsw            m2, m5
-    pmaddubsw            m3, m5
-    pmulhrsw             m0, m6
-    pmulhrsw             m1, m6
-    pmulhrsw             m2, m6
-    pmulhrsw             m3, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     packuswb             m0, m1
-    packuswb             m2, m3
+    movu                 m1, [srcq+mmsize*1+8*0]
+    movu                 m2, [srcq+mmsize*1+8*1]
+    add                srcq, ssq
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmulhrsw             m1, m3
+    pmulhrsw             m2, m3
+    packuswb             m1, m2
     mova        [dstq+16*0], m0
-    mova        [dstq+16*1], m2
+    mova        [dstq+16*1], m1
     add                dstq, dsq
     dec                  hd
     jg .h_w32
@@ -381,8 +380,8 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-    pmulhrsw             m0, m6
-    pmulhrsw             m1, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     packuswb             m0, m1
     mova     [dstq+r6+16*3], m0
     add                  r6, 16
@@ -401,8 +400,8 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
-    pmulhrsw             m0, m6
-    pmulhrsw             m1, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     packuswb             m0, m1
     mova     [dstq+r6+16*7], m0
     add                  r6, 16
@@ -414,15 +413,13 @@
     RET
 .v:
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
-    %assign stack_offset stack_offset - stack_size_padded
-    WIN64_SPILL_XMM       8
     imul               mxyd, 0xff01
-    mova                 m7, [base+pw_2048]
+    mova                 m5, [base+pw_2048]
     add                mxyd, 16 << 8
     add                  wq, t0
-    movd                 m6, mxyd
-    pshuflw              m6, m6, q0000
-    punpcklqdq           m6, m6
+    movd                 m4, mxyd
+    pshuflw              m4, m4, q0000
+    punpcklqdq           m4, m4
     RESTORE_DSQ_32       t0
     jmp                  wq
 .v_w2:
@@ -433,8 +430,8 @@
     pshuflw              m2, m0, q2301
     pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
     punpcklbw            m1, m0, m2
-    pmaddubsw            m1, m6
-    pmulhrsw             m1, m7
+    pmaddubsw            m1, m4
+    pmulhrsw             m1, m5
     packuswb             m1, m1
     movd                r6d, m1
     mov        [dstq+dsq*1], r6w
@@ -453,8 +450,8 @@
     movd                 m0, [srcq+ssq*0]
     punpckldq            m1, m0  ; 1 2
     punpcklbw            m1, m2
-    pmaddubsw            m1, m6
-    pmulhrsw             m1, m7
+    pmaddubsw            m1, m4
+    pmulhrsw             m1, m5
     packuswb             m1, m1
     movd       [dstq+dsq*0], m1
     psrlq                m1, 32
@@ -467,20 +464,18 @@
 .v_w8:
     movq                 m0, [srcq+ssq*0]
 .v_w8_loop:
-    movddup              m2, [srcq+ssq*1]
+    movq                 m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklqdq           m3, m0, m2 ; 0 1 m2qh:m0ql
-    movddup              m0, [srcq+ssq*0]
-    punpcklqdq           m4, m2, m0 ; 1 2 m0qh:m2ql
-    punpcklbw            m1, m4, m3
-    punpckhbw            m4, m3
-    pmaddubsw            m1, m6
-    pmaddubsw            m4, m6
-    pmulhrsw             m1, m7
-    pmulhrsw             m4, m7
-    packuswb             m1, m4
-    movq         [dstq+dsq*0], m1
-    movhps       [dstq+dsq*1], m1
+    punpcklbw            m1, m3, m0
+    movq                 m0, [srcq+ssq*0]
+    punpcklbw            m2, m0, m3
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
+    movq       [dstq+dsq*0], m1
+    movhps     [dstq+dsq*1], m1
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .v_w8_loop
@@ -489,25 +484,25 @@
 %macro PUT_BILIN_V_W16 0
     movu                 m0, [srcq+ssq*0]
 %%loop:
-    movu                 m4, [srcq+ssq*1]
+    movu                 m3, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
-    punpcklbw            m1, m4, m0
-    punpckhbw            m3, m4, m0
+    punpcklbw            m1, m3, m0
+    punpckhbw            m2, m3, m0
     movu                 m0, [srcq+ssq*0]
-    punpcklbw            m2, m0, m4
-    pmaddubsw            m1, m6
-    pmaddubsw            m3, m6
-    pmulhrsw             m1, m7
-    pmulhrsw             m3, m7
-    packuswb             m1, m3
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
     mova       [dstq+dsq*0], m1
-    punpckhbw            m3, m0, m4
-    pmaddubsw            m2, m6
-    pmaddubsw            m3, m6
-    pmulhrsw             m2, m7
-    pmulhrsw             m3, m7
-    packuswb             m2, m3
-    mova       [dstq+dsq*1], m2
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    packuswb             m1, m2
+    mova       [dstq+dsq*1], m1
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg %%loop
@@ -549,7 +544,6 @@
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
-    %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM       8
     shl                mxyd, 11 ; can't shift by 12 due to signed overflow
     mova                 m7, [base+pw_2048]
@@ -579,10 +573,14 @@
     paddw                m1, m2   ; src[x] + (my * (src[x + src_stride] - src[x])
     pmulhrsw             m1, m7
     packuswb             m1, m1
+%if ARCH_X86_64
+    movq                 r6, m1
+%else
     pshuflw              m1, m1, q2020
     movd                r6d, m1
+%endif
     mov        [dstq+dsq*0], r6w
-    shr                 r6d, 16
+    shr                  r6, gprsize*4
     mov        [dstq+dsq*1], r6w
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
@@ -595,9 +593,9 @@
     pshufb               m0, m4
     pmaddubsw            m0, m5
 .hv_w4_loop:
-    movq                 m1,     [srcq+ssq*1]
-    lea                srcq,     [srcq+ssq*2]
-    movhps               m1,     [srcq+ssq*0]
+    movq                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movhps               m1, [srcq+ssq*0]
     pshufb               m1, m4
     pmaddubsw            m1, m5           ; 1 2
     shufps               m2, m0, m1, q1032 ; 0 1
@@ -617,21 +615,21 @@
     RET
 .hv_w8:
     RESTORE_DSQ_32       t0
-    movu                 m0,     [srcq+ssq*0+8*0]
+    movu                 m0, [srcq+ssq*0+8*0]
     pshufb               m0, m4
     pmaddubsw            m0, m5
 .hv_w8_loop:
-    movu                 m2,     [srcq+ssq*1+8*0]
-    lea                srcq,     [srcq+ssq*2]
-    movu                 m3,     [srcq+ssq*0+8*0]
+    movu                 m2, [srcq+ssq*1+8*0]
+    lea                srcq, [srcq+ssq*2]
     pshufb               m2, m4
-    pshufb               m3, m4
     pmaddubsw            m2, m5
     psubw                m1, m2, m0
     paddw                m1, m1
     pmulhw               m1, m6
     paddw                m1, m0
-    pmaddubsw            m0, m3, m5
+    movu                 m0, [srcq+ssq*0+8*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
     psubw                m3, m0, m2
     paddw                m3, m3
     pmulhw               m3, m6
@@ -639,15 +637,21 @@
     pmulhrsw             m1, m7
     pmulhrsw             m3, m7
     packuswb             m1, m3
-    movq         [dstq+dsq*0], m1
-    movhps       [dstq+dsq*1], m1
+    movq       [dstq+dsq*0], m1
+    movhps     [dstq+dsq*1], m1
     lea                dstq, [dstq+dsq*2]
     sub                  hd, 2
     jg .hv_w8_loop
     RET
-    ;
-    ; 32bit has ssq, dsq free
-%macro PUT_BILIN_HV_W16 0
+.hv_w16:
+    xor                 t0d, t0d
+.hv_w16gt:
+    mov                  r4, dstq
+    mov                  r6, srcq
+ %if WIN64
+    movaps              r4m, xmm8
+ %endif
+.hv_w16_loop0:
     movu                 m0,     [srcq+8*0]
     movu                 m1,     [srcq+8*1]
     pshufb               m0, m4
@@ -654,64 +658,48 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
- %if WIN64
-    movaps              r4m, xmm8
- %endif
-%%loop:
+.hv_w16_loop:
 %if ARCH_X86_32
- %define m3back [dstq]
- %define dsqval dsm
+ %define m0tmp [dstq]
 %else
- %define m3back m8
- %define dsqval dsq
+ %define m0tmp m8
 %endif
     add                srcq, ssq
-    movu                 m2,     [srcq+8*1]
+    movu                 m2, [srcq+8*0]
+    movu                 m3, [srcq+8*1]
     pshufb               m2, m4
+    pshufb               m3, m4
     pmaddubsw            m2, m5
-    psubw                m3, m2, m1
+    pmaddubsw            m3, m5
+    mova              m0tmp, m2
+    psubw                m2, m0
+    paddw                m2, m2
+    pmulhw               m2, m6
+    paddw                m2, m0
+    mova                 m0, m3
+    psubw                m3, m1
     paddw                m3, m3
     pmulhw               m3, m6
     paddw                m3, m1
-    mova                 m1, m2
+    mova                 m1, m0
+    mova                 m0, m0tmp
+    pmulhrsw             m2, m7
     pmulhrsw             m3, m7
-    mova             m3back, m3
-    movu                 m2,     [srcq+8*0]
-    pshufb               m2, m4
-    pmaddubsw            m2, m5
-    psubw                m3, m2, m0
-    paddw                m3, m3
-    pmulhw               m3, m6
-    paddw                m3, m0
-    mova                 m0, m2
-    pmulhrsw             m3, m7
-    packuswb             m3, m3back
-    mova             [dstq], m3
-    add                dstq, dsqval
+    packuswb             m2, m3
+    mova             [dstq], m2
+    add                dstq, dsmp
     dec                  hd
-    jg %%loop
- %if WIN64
-    movaps             xmm8, r4m
- %endif
- %undef m3back
- %undef dsqval
-%endmacro
-    ;
-.hv_w16:
-    PUT_BILIN_HV_W16
-    RET
-.hv_w16gt:
-    mov                  r4, dstq
-    mov                  r6, srcq
-.hv_w16gt_loop:
-    PUT_BILIN_HV_W16
-    mov                  hw, t0w
+    jg .hv_w16_loop
+    movzx                hd, t0w
     add                  r4, mmsize
     add                  r6, mmsize
     mov                dstq, r4
     mov                srcq, r6
     sub                 t0d, 1<<16
-    jg .hv_w16gt_loop
+    jg .hv_w16_loop0
+ %if WIN64
+    movaps             xmm8, r4m
+ %endif
     RET
 .hv_w32:
     lea                 t0d, [hq+(1<<16)]