shithub: dav1d

ref: ee58d65dd59d424f840c04079ef3c3153f9d9576
parent: aaba9f8e770eefd4daede06bf01bf80c04a4f9b3
author: Francois Cartegnie <[email protected]>
date: Mon Dec 24 10:54:49 EST 2018

Add SSSE3 put_bilin
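
For reference, the operation this patch vectorizes is dav1d's bilinear put:
each output pixel blends neighbouring source pixels with 4-bit fractional
weights mx/my, as spelled out in the formula comments inside the assembly
below. The scalar sketch that follows is only illustrative; the function name
put_bilin_ref and the assumption of 8-bit pixels are mine, not taken from the
patch:

#include <stdint.h>
#include <stddef.h>

/* Scalar model of put_bilin (8 bpc). mx/my are 1/16th-pel fractions (0..15);
 * mx == my == 0 degenerates to a plain copy. */
static void put_bilin_ref(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int w, int h, int mx, int my)
{
    for (; h--; dst += dst_stride, src += src_stride)
        for (int x = 0; x < w; x++) {
            if (!mx && !my) {           /* .put: straight copy */
                dst[x] = src[x];
            } else if (!my) {           /* .h: horizontal only */
                dst[x] = (16 * src[x] + mx * (src[x + 1] - src[x]) + 8) >> 4;
            } else if (!mx) {           /* .v: vertical only */
                dst[x] = (16 * src[x] + my * (src[x + src_stride] - src[x]) + 8) >> 4;
            } else {                    /* .hv: horizontal pass kept at 4 extra
                                           bits of precision, then vertical */
                const int a = 16 * src[x] + mx * (src[x + 1] - src[x]);
                const int b = 16 * src[x + src_stride] +
                              mx * (src[x + src_stride + 1] - src[x + src_stride]);
                dst[x] = (16 * a + my * (b - a) + 128) >> 8;
            }
        }
}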

--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -38,6 +38,7 @@
 decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2);
 decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2);
 decl_mc_fn(dav1d_put_bilin_avx2);
+decl_mc_fn(dav1d_put_bilin_ssse3);
 
 decl_mct_fn(dav1d_prep_8tap_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
@@ -83,6 +84,8 @@
         return;
 
 #if BITDEPTH == 8
+    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               ssse3);
+
     c->avg = dav1d_avg_ssse3;
     c->w_avg = dav1d_w_avg_ssse3;
     c->mask = dav1d_mask_ssse3;
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -45,8 +45,11 @@
             db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
             db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
             db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
-blend_shuf: db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
 
+bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
+bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
+
 pb_64:   times 16 db 64
 pw_8:    times 8 dw 8
 pw_26:   times 8 dw 26
@@ -76,9 +79,649 @@
 BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32
 BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
 
+%macro BASE_JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - %3)
+    %xdefine %%base %1_%2
+    %%table:
+    %rep %0 - 2
+        dw %%base %+ _w%3 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
+
+BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
+    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+    %xdefine %%base %1_%3
+    %assign %%types %4
+    %if %%types & 1
+        %xdefine %1_%2_h_%3_table  (%%h  - %5)
+        %%h:
+        %rep %0 - 4
+            dw %%prefix %+ .h_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 2
+        %xdefine %1_%2_v_%3_table  (%%v  - %5)
+        %%v:
+        %rep %0 - 4
+            dw %%prefix %+ .v_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 4
+        %xdefine %1_%2_hv_%3_table (%%hv - %5)
+        %%hv:
+        %rep %0 - 4
+            dw %%prefix %+ .hv_w%5 - %%base
+            %rotate 1
+        %endrep
+    %endif
+%endmacro
+
+HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
 SECTION .text
 
 INIT_XMM ssse3
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
+%define base t0-put_ssse3
+%else
+DECLARE_REG_TMP 7
+%define base 0
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+%endif
+;
+%macro RESTORE_DSQ_32 1
+ %if ARCH_X86_32
+   mov                  %1, dsm ; restore dsq
+ %endif
+%endmacro
+;
+    movifnidn          mxyd, r6m ; mx
+    LEA                  t0, put_ssse3
+    tzcnt                wd, wm
+    mov                  hd, hm
+    test               mxyd, mxyd
+    jnz .h
+    mov                mxyd, r7m ; my
+    test               mxyd, mxyd
+    jnz .v
+.put:
+    movzx                wd, word [t0+wq*2+table_offset(put,)]
+    add                  wq, t0
+    lea                  r6, [ssq*3]
+    RESTORE_DSQ_32       t0
+    jmp                  wq
+.put_w2:
+    movzx               r4d, word [srcq+ssq*0]
+    movzx               r6d, word [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], r4w
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w2
+    RET
+.put_w4:
+    mov                 r4d, [srcq+ssq*0]
+    mov                 r6d, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], r4d
+    mov        [dstq+dsq*1], r6d
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w4
+    RET
+.put_w8:
+    movq                 m0, [srcq+ssq*0]
+    movq                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movq       [dstq+dsq*0], m0
+    movq       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w8
+    RET
+.put_w16:
+    lea                  r4, [dsq*3]
+.put_w16_in:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    movu                 m2, [srcq+ssq*2]
+    movu                 m3, [srcq+r6   ]
+    lea                srcq, [srcq+ssq*4]
+    mova       [dstq+dsq*0], m0
+    mova       [dstq+dsq*1], m1
+    mova       [dstq+dsq*2], m2
+    mova       [dstq+r4   ], m3
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .put_w16_in
+    RET
+.put_w32:
+    movu                 m0, [srcq+ssq*0+16*0]
+    movu                 m1, [srcq+ssq*0+16*1]
+    movu                 m2, [srcq+ssq*1+16*0]
+    movu                 m3, [srcq+ssq*1+16*1]
+    lea                srcq, [srcq+ssq*2]
+    mova  [dstq+dsq*0+16*0], m0
+    mova  [dstq+dsq*0+16*1], m1
+    mova  [dstq+dsq*1+16*0], m2
+    mova  [dstq+dsq*1+16*1], m3
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w32
+    RET
+.put_w64:
+    movu                 m0, [srcq+16*0]
+    movu                 m1, [srcq+16*1]
+    movu                 m2, [srcq+16*2]
+    movu                 m3, [srcq+16*3]
+    add                srcq, ssq
+    mova        [dstq+16*0], m0
+    mova        [dstq+16*1], m1
+    mova        [dstq+16*2], m2
+    mova        [dstq+16*3], m3
+    add                dstq, dsq
+    dec                  hd
+    jg .put_w64
+    RET
+.put_w128:
+    movu                 m0, [srcq+16*0]
+    movu                 m1, [srcq+16*1]
+    movu                 m2, [srcq+16*2]
+    movu                 m3, [srcq+16*3]
+    mova        [dstq+16*0], m0
+    mova        [dstq+16*1], m1
+    mova        [dstq+16*2], m2
+    mova        [dstq+16*3], m3
+    movu                 m0, [srcq+16*4]
+    movu                 m1, [srcq+16*5]
+    movu                 m2, [srcq+16*6]
+    movu                 m3, [srcq+16*7]
+    mova        [dstq+16*4], m0
+    mova        [dstq+16*5], m1
+    mova        [dstq+16*6], m2
+    mova        [dstq+16*7], m3
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .put_w128
+    RET
+.h:
+    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
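+    ; the imul by 0xff01 plus (16 << 8) below packs ((16 - mx) << 8) | mx into
+    ; the low word, so pmaddubsw on the shuffled {src[x+1], src[x]} byte pairs
+    ; computes the weighted sum and pmulhrsw with pw_2048 rounds by (x + 8) >> 4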
+    imul               mxyd, 0xff01
+    mova                 m4, [base+bilin_h_shuf8]
+    mova                 m0, [base+bilin_h_shuf4]
+    WIN64_SPILL_XMM       7
+    add                mxyd, 16 << 8
+    movd                 m5, mxyd
+    mov                mxyd, r7m ; my
+    pshuflw              m5, m5, q0000
+    punpcklqdq           m5, m5
+    test               mxyd, mxyd
+    jnz .hv
+    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)]
+    mova                 m6, [base+pw_2048]
+    add                  wq, t0
+    RESTORE_DSQ_32       t0
+    jmp                  wq
+.h_w2:
+    pshufd               m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
+.h_w2_loop:
+    movd                 m0, [srcq+ssq*0]
+    movd                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpckldq            m0, m1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+    pmulhrsw             m0, m6
+    packuswb             m0, m0
+    movd                r6d, m0
+    mov        [dstq+dsq*0], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w2_loop
+    RET
+.h_w4:
+    movq                 m4, [srcq+ssq*0]
+    movhps               m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m4, m0
+    pmaddubsw            m4, m5
+    pmulhrsw             m4, m6
+    packuswb             m4, m4
+    movd       [dstq+dsq*0], m4
+    pshufd               m4, m4, q0101
+    movd       [dstq+dsq*1], m4
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w4
+    RET
+.h_w8:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    movq       [dstq+dsq*0], m0
+    movhps     [dstq+dsq*1], m0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
+    add                srcq, ssq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w16
+    RET
+.h_w32:
+    movu                 m0, [srcq+mmsize*0+8*0]
+    movu                 m1, [srcq+mmsize*0+8*1]
+    movu                 m2, [srcq+mmsize*1+8*0]
+    movu                 m3, [srcq+mmsize*1+8*1]
+    add                srcq, ssq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    pmulhrsw             m2, m6
+    pmulhrsw             m3, m6
+    packuswb             m0, m1
+    packuswb             m2, m3
+    mova        [dstq+16*0], m0
+    mova        [dstq+16*1], m2
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w32
+    RET
+.h_w64:
+    mov                  r6, -16*3
+.h_w64_loop:
+    movu                 m0, [srcq+r6+16*3+8*0]
+    movu                 m1, [srcq+r6+16*3+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    mova     [dstq+r6+16*3], m0
+    add                  r6, 16
+    jle .h_w64_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w64
+    RET
+.h_w128:
+    mov                  r6, -16*7
+.h_w128_loop:
+    movu                 m0, [srcq+r6+16*7+8*0]
+    movu                 m1, [srcq+r6+16*7+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    mova     [dstq+r6+16*7], m0
+    add                  r6, 16
+    jle .h_w128_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w128
+    RET
+.v:
+    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       8
+    imul               mxyd, 0xff01
+    mova                 m7, [base+pw_2048]
+    add                mxyd, 16 << 8
+    add                  wq, t0
+    movd                 m6, mxyd
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
+    RESTORE_DSQ_32       t0
+    jmp                  wq
+.v_w2:
+    movd                 m0, [srcq+ssq*0]
+.v_w2_loop:
+    pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
+    lea                srcq, [srcq+ssq*2]
+    pshuflw              m2, m0, q2301
+    pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
+    punpcklbw            m1, m0, m2
+    pmaddubsw            m1, m6
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+    movd                r6d, m1
+    mov        [dstq+dsq*1], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*0], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+    movd                 m0, [srcq+ssq*0]
+.v_w4_loop:
+    movd                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpckldq            m2, m0, m1 ; 0 1
+    movd                 m0, [srcq+ssq*0]
+    punpckldq            m1, m0  ; 1 2
+    punpcklbw            m1, m2
+    pmaddubsw            m1, m6
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+    movd       [dstq+dsq*0], m1
+    psrlq                m1, 32
+    movd       [dstq+dsq*1], m1
+    ;
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                 m0, [srcq+ssq*0]
+.v_w8_loop:
+    movddup              m2, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklqdq           m3, m0, m2 ; 0 1 m2qh:m0ql
+    movddup              m0, [srcq+ssq*0]
+    punpcklqdq           m4, m2, m0 ; 1 2 m0qh:m2ql
+    punpcklbw            m1, m4, m3
+    punpckhbw            m4, m3
+    pmaddubsw            m1, m6
+    pmaddubsw            m4, m6
+    pmulhrsw             m1, m7
+    pmulhrsw             m4, m7
+    packuswb             m1, m4
+    movq         [dstq+dsq*0], m1
+    movhps       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+    RET
+    ;
+%macro PUT_BILIN_V_W16 0
+    movu                 m0, [srcq+ssq*0]
+%%loop:
+    movu                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklbw            m1, m4, m0
+    punpckhbw            m3, m4, m0
+    movu                 m0, [srcq+ssq*0]
+    punpcklbw            m2, m0, m4
+    pmaddubsw            m1, m6
+    pmaddubsw            m3, m6
+    pmulhrsw             m1, m7
+    pmulhrsw             m3, m7
+    packuswb             m1, m3
+    mova       [dstq+dsq*0], m1
+    punpckhbw            m3, m0, m4
+    pmaddubsw            m2, m6
+    pmaddubsw            m3, m6
+    pmulhrsw             m2, m7
+    pmulhrsw             m3, m7
+    packuswb             m2, m3
+    mova       [dstq+dsq*1], m2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg %%loop
+%endmacro
+    ;
+.v_w16:
+    PUT_BILIN_V_W16
+    RET
+.v_w16gt:
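+    ; t0d packs the height in its low word and (width/16 - 1) in its high word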
+    mov                  r4, dstq
+    mov                  r6, srcq
+.v_w16gt_loop:
+%if ARCH_X86_32
+    mov                bakm, t0q
+    RESTORE_DSQ_32       t0
+    PUT_BILIN_V_W16
+    mov                 t0q, bakm
+%else
+    PUT_BILIN_V_W16
+%endif
+    mov                  hw, t0w
+    add                  r4, mmsize
+    add                  r6, mmsize
+    mov                dstq, r4
+    mov                srcq, r6
+    sub                 t0d, 1<<16
+    jg .v_w16gt
+    RET
+.v_w32:
+    lea                 t0d, [hq+(1<<16)]
+    jmp .v_w16gt
+.v_w64:
+    lea                 t0d, [hq+(3<<16)]
+    jmp .v_w16gt
+.v_w128:
+    lea                 t0d, [hq+(7<<16)]
+    jmp .v_w16gt
+.hv:
+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
+    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       8
+    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
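+    ; doubling the difference and using pmulhw with (my << 11) later yields
+    ; (2*diff * (my << 11)) >> 16 = (my * diff) >> 4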
+    mova                 m7, [base+pw_2048]
+    movd                 m6, mxyd
+    add                  wq, t0
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
+    jmp                  wq
+.hv_w2:
+    RESTORE_DSQ_32       t0
+    movd                 m0, [srcq+ssq*0]
+    pshufd               m0, m0, q0000      ; src[x - src_stride]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w2_loop:
+    movd                 m1, [srcq+ssq*1]   ; src[x]
+    lea                srcq, [srcq+ssq*2]
+    movhps               m1, [srcq+ssq*0]   ; src[x + src_stride]
+    pshufd               m1, m1, q3120
+    pshufb               m1, m4
+    pmaddubsw            m1, m5             ; 1 _ 2 _
+    shufps               m2, m0, m1, q1032  ; 0 _ 1 _
+    mova                 m0, m1
+    psubw                m1, m2   ; src[x + src_stride] - src[x]
+    paddw                m1, m1
+    pmulhw               m1, m6   ; my * (src[x + src_stride] - src[x])
+    paddw                m1, m2   ; src[x] + (my * (src[x + src_stride] - src[x]))
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+    pshuflw              m1, m1, q2020
+    movd                r6d, m1
+    mov        [dstq+dsq*0], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w4:
+    mova                 m4, [base+bilin_h_shuf4]
+    RESTORE_DSQ_32       t0
+    movddup             xm0, [srcq+ssq*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w4_loop:
+    movq                 m1,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    movhps               m1,     [srcq+ssq*0]
+    pshufb               m1, m4
+    pmaddubsw            m1, m5           ; 1 2
+    shufps               m2, m0, m1, q1032 ; 0 1
+    mova                 m0, m1
+    psubw                m1, m2
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m2
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+    movd       [dstq+dsq*0], m1
+    psrlq                m1, 32
+    movd       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    RESTORE_DSQ_32       t0
+    movu                 m0,     [srcq+ssq*0+8*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w8_loop:
+    movu                 m2,     [srcq+ssq*1+8*0]
+    lea                srcq,     [srcq+ssq*2]
+    movu                 m3,     [srcq+ssq*0+8*0]
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m2, m5
+    psubw                m1, m2, m0
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m0
+    pmaddubsw            m0, m3, m5
+    psubw                m3, m0, m2
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m2
+    pmulhrsw             m1, m7
+    pmulhrsw             m3, m7
+    packuswb             m1, m3
+    movq         [dstq+dsq*0], m1
+    movhps       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    RET
+    ;
+    ; 32bit has ssq, dsq free
+%macro PUT_BILIN_HV_W16 0
+    movu                 m0,     [srcq+8*0]
+    movu                 m1,     [srcq+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+ %if WIN64
+    movaps              r4m, xmm8
+ %endif
+%%loop:
+%if ARCH_X86_32
+ %define m3back [dstq]
+ %define dsqval dsm
+%else
+ %define m3back m8
+ %define dsqval dsq
+%endif
+    add                srcq, ssq
+    movu                 m2,     [srcq+8*1]
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m1
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m1
+    mova                 m1, m2
+    pmulhrsw             m3, m7
+    mova             m3back, m3
+    movu                 m2,     [srcq+8*0]
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m0
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m0
+    mova                 m0, m2
+    pmulhrsw             m3, m7
+    packuswb             m3, m3back
+    mova             [dstq], m3
+    add                dstq, dsqval
+    dec                  hd
+    jg %%loop
+ %if WIN64
+    movaps             xmm8, r4m
+ %endif
+ %undef m3back
+ %undef dsqval
+%endmacro
+    ;
+.hv_w16:
+    PUT_BILIN_HV_W16
+    RET
+.hv_w16gt:
+    mov                  r4, dstq
+    mov                  r6, srcq
+.hv_w16gt_loop:
+    PUT_BILIN_HV_W16
+    mov                  hw, t0w
+    add                  r4, mmsize
+    add                  r6, mmsize
+    mov                dstq, r4
+    mov                srcq, r6
+    sub                 t0d, 1<<16
+    jg .hv_w16gt_loop
+    RET
+.hv_w32:
+    lea                 t0d, [hq+(1<<16)]
+    jmp .hv_w16gt
+.hv_w64:
+    lea                 t0d, [hq+(3<<16)]
+    jmp .hv_w16gt
+.hv_w128:
+    lea                 t0d, [hq+(7<<16)]
+    jmp .hv_w16gt
 
 %if WIN64
 DECLARE_REG_TMP 6, 4