shithub: dav1d

Download patch

ref: 0afec6b13fbacdc4fb25117c17ce4472945f901d
parent: 65ee1233cf86f03e029d0520f7cc5a3e152d3bbd
author: Francois Cartegnie <[email protected]>
date: Fri Feb 15 07:17:54 EST 2019

x86: add SSSE3 mc prep_8tap implementation

---------------------
x86_64:
------------------------------------------
mct_8tap_regular_w4_0_8bpc_c: 115.6
mct_8tap_regular_w4_0_8bpc_ssse3: 13.1
mct_8tap_regular_w4_0_8bpc_avx2: 13.3
------------------------------------------
mct_8tap_regular_w4_h_8bpc_c: 363.0
mct_8tap_regular_w4_h_8bpc_ssse3: 19.1
mct_8tap_regular_w4_h_8bpc_avx2: 16.5
------------------------------------------
mct_8tap_regular_w4_hv_8bpc_c: 832.2
mct_8tap_regular_w4_hv_8bpc_ssse3: 113.4
mct_8tap_regular_w4_hv_8bpc_avx2: 53.1
------------------------------------------
mct_8tap_regular_w4_v_8bpc_c: 488.5
mct_8tap_regular_w4_v_8bpc_ssse3: 38.9
mct_8tap_regular_w4_v_8bpc_avx2: 26.0
------------------------------------------
mct_8tap_regular_w8_0_8bpc_c: 259.3
mct_8tap_regular_w8_0_8bpc_ssse3: 20.4
mct_8tap_regular_w8_0_8bpc_avx2: 18.0
------------------------------------------
mct_8tap_regular_w8_h_8bpc_c: 1124.3
mct_8tap_regular_w8_h_8bpc_ssse3: 67.7
mct_8tap_regular_w8_h_8bpc_avx2: 43.3
------------------------------------------
mct_8tap_regular_w8_hv_8bpc_c: 2155.0
mct_8tap_regular_w8_hv_8bpc_ssse3: 340.8
mct_8tap_regular_w8_hv_8bpc_avx2: 151.3
------------------------------------------
mct_8tap_regular_w8_v_8bpc_c: 1195.4
mct_8tap_regular_w8_v_8bpc_ssse3: 72.4
mct_8tap_regular_w8_v_8bpc_avx2: 39.8
------------------------------------------
mct_8tap_regular_w16_0_8bpc_c: 158.3
mct_8tap_regular_w16_0_8bpc_ssse3: 52.9
mct_8tap_regular_w16_0_8bpc_avx2: 30.2
------------------------------------------
mct_8tap_regular_w16_h_8bpc_c: 4267.4
mct_8tap_regular_w16_h_8bpc_ssse3: 211.9
mct_8tap_regular_w16_h_8bpc_avx2: 121.4
------------------------------------------
mct_8tap_regular_w16_hv_8bpc_c: 5430.9
mct_8tap_regular_w16_hv_8bpc_ssse3: 986.8
mct_8tap_regular_w16_hv_8bpc_avx2: 428.4
------------------------------------------
mct_8tap_regular_w16_v_8bpc_c: 4604.2
mct_8tap_regular_w16_v_8bpc_ssse3: 199.1
mct_8tap_regular_w16_v_8bpc_avx2: 100.7
------------------------------------------
mct_8tap_regular_w32_0_8bpc_c: 372.9
mct_8tap_regular_w32_0_8bpc_ssse3: 231.9
mct_8tap_regular_w32_0_8bpc_avx2: 99.7
------------------------------------------
mct_8tap_regular_w32_h_8bpc_c: 15975.0
mct_8tap_regular_w32_h_8bpc_ssse3: 802.9
mct_8tap_regular_w32_h_8bpc_avx2: 468.5
------------------------------------------
mct_8tap_regular_w32_hv_8bpc_c: 18555.5
mct_8tap_regular_w32_hv_8bpc_ssse3: 3673.5
mct_8tap_regular_w32_hv_8bpc_avx2: 1587.6
------------------------------------------
mct_8tap_regular_w32_v_8bpc_c: 16632.4
mct_8tap_regular_w32_v_8bpc_ssse3: 743.5
mct_8tap_regular_w32_v_8bpc_avx2: 337.8
------------------------------------------
mct_8tap_regular_w64_0_8bpc_c: 675.9
mct_8tap_regular_w64_0_8bpc_ssse3: 513.6
mct_8tap_regular_w64_0_8bpc_avx2: 285.4
------------------------------------------
mct_8tap_regular_w64_h_8bpc_c: 37161.3
mct_8tap_regular_w64_h_8bpc_ssse3: 1929.7
mct_8tap_regular_w64_h_8bpc_avx2: 1138.1
------------------------------------------
mct_8tap_regular_w64_hv_8bpc_c: 42434.0
mct_8tap_regular_w64_hv_8bpc_ssse3: 8822.1
mct_8tap_regular_w64_hv_8bpc_avx2: 3853.5
------------------------------------------
mct_8tap_regular_w64_v_8bpc_c: 37969.1
mct_8tap_regular_w64_v_8bpc_ssse3: 1805.6
mct_8tap_regular_w64_v_8bpc_avx2: 826.1
------------------------------------------
mct_8tap_regular_w128_0_8bpc_c: 1532.7
mct_8tap_regular_w128_0_8bpc_ssse3: 1397.7
mct_8tap_regular_w128_0_8bpc_avx2: 813.8
------------------------------------------
mct_8tap_regular_w128_h_8bpc_c: 91204.3
mct_8tap_regular_w128_h_8bpc_ssse3: 4783.0
mct_8tap_regular_w128_h_8bpc_avx2: 2767.2
------------------------------------------
mct_8tap_regular_w128_hv_8bpc_c: 102396.0
mct_8tap_regular_w128_hv_8bpc_ssse3: 22202.3
mct_8tap_regular_w128_hv_8bpc_avx2: 9637.2
------------------------------------------
mct_8tap_regular_w128_v_8bpc_c: 92294.3
mct_8tap_regular_w128_v_8bpc_ssse3: 4952.8
mct_8tap_regular_w128_v_8bpc_avx2: 2370.1
------------------------------------------

---------------------
x86_32:
------------------------------------------
mct_8tap_regular_w4_0_8bpc_c: 131.3
mct_8tap_regular_w4_0_8bpc_ssse3: 18.7
------------------------------------------
mct_8tap_regular_w4_h_8bpc_c: 422.0
mct_8tap_regular_w4_h_8bpc_ssse3: 27.3
------------------------------------------
mct_8tap_regular_w4_hv_8bpc_c: 1012.6
mct_8tap_regular_w4_hv_8bpc_ssse3: 123.6
------------------------------------------
mct_8tap_regular_w4_v_8bpc_c: 589.6
mct_8tap_regular_w4_v_8bpc_ssse3: 48.9
------------------------------------------
mct_8tap_regular_w8_0_8bpc_c: 278.5
mct_8tap_regular_w8_0_8bpc_ssse3: 26.3
------------------------------------------
mct_8tap_regular_w8_h_8bpc_c: 1129.3
mct_8tap_regular_w8_h_8bpc_ssse3: 80.6
------------------------------------------
mct_8tap_regular_w8_hv_8bpc_c: 2556.4
mct_8tap_regular_w8_hv_8bpc_ssse3: 354.6
------------------------------------------
mct_8tap_regular_w8_v_8bpc_c: 1460.2
mct_8tap_regular_w8_v_8bpc_ssse3: 103.8
------------------------------------------
mct_8tap_regular_w16_0_8bpc_c: 218.9
mct_8tap_regular_w16_0_8bpc_ssse3: 58.4
------------------------------------------
mct_8tap_regular_w16_h_8bpc_c: 4471.8
mct_8tap_regular_w16_h_8bpc_ssse3: 237.2
------------------------------------------
mct_8tap_regular_w16_hv_8bpc_c: 5570.5
mct_8tap_regular_w16_hv_8bpc_ssse3: 1044.1
------------------------------------------
mct_8tap_regular_w16_v_8bpc_c: 4885.5
mct_8tap_regular_w16_v_8bpc_ssse3: 268.3
------------------------------------------
mct_8tap_regular_w32_0_8bpc_c: 495.6
mct_8tap_regular_w32_0_8bpc_ssse3: 236.6
------------------------------------------
mct_8tap_regular_w32_h_8bpc_c: 15903.5
mct_8tap_regular_w32_h_8bpc_ssse3: 872.5
------------------------------------------
mct_8tap_regular_w32_hv_8bpc_c: 19402.2
mct_8tap_regular_w32_hv_8bpc_ssse3: 3832.8
------------------------------------------
mct_8tap_regular_w32_v_8bpc_c: 17119.5
mct_8tap_regular_w32_v_8bpc_ssse3: 935.2
------------------------------------------
mct_8tap_regular_w64_0_8bpc_c: 877.0
mct_8tap_regular_w64_0_8bpc_ssse3: 515.7
------------------------------------------
mct_8tap_regular_w64_h_8bpc_c: 36832.1
mct_8tap_regular_w64_h_8bpc_ssse3: 2094.1
------------------------------------------
mct_8tap_regular_w64_hv_8bpc_c: 43965.3
mct_8tap_regular_w64_hv_8bpc_ssse3: 9423.0
------------------------------------------
mct_8tap_regular_w64_v_8bpc_c: 37041.2
mct_8tap_regular_w64_v_8bpc_ssse3: 2348.9
------------------------------------------
mct_8tap_regular_w128_0_8bpc_c: 1929.9
mct_8tap_regular_w128_0_8bpc_ssse3: 1392.3
------------------------------------------
mct_8tap_regular_w128_h_8bpc_c: 86022.5
mct_8tap_regular_w128_h_8bpc_ssse3: 5110.8
------------------------------------------
mct_8tap_regular_w128_hv_8bpc_c: 105793.5
mct_8tap_regular_w128_hv_8bpc_ssse3: 23278.8
------------------------------------------
mct_8tap_regular_w128_v_8bpc_c: 88223.5
mct_8tap_regular_w128_v_8bpc_ssse3: 7442.7
------------------------------------------

--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -50,14 +50,23 @@
 decl_mc_fn(dav1d_put_bilin_ssse3);
 
 decl_mct_fn(dav1d_prep_8tap_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
 decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
 decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
 decl_mct_fn(dav1d_prep_bilin_avx2);
 decl_mct_fn(dav1d_prep_bilin_ssse3);
 
@@ -108,6 +117,15 @@
     init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
 
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        ssse3);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   ssse3);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
 
     c->avg = dav1d_avg_ssse3;
     c->w_avg = dav1d_w_avg_ssse3;
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -64,6 +64,7 @@
 pw_2048: times 8 dw 2048
 pw_6903: times 8 dw 6903
 pw_8192: times 8 dw 8192
+pd_32:   times 4 dd 32
 pd_512:  times 4 dd 512
 
 pw_258:  times 2 dw 258
@@ -141,6 +142,7 @@
 %endmacro
 
 HV_JMP_TABLE put,   8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep,  8tap, ssse3, 1,    4, 8, 16, 32, 64, 128
 HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128
 
@@ -2421,6 +2423,891 @@
     mov                srcq, r7
 %endif
     sub                 r6d, 1<<16
+    jg .hv_w8_loop0
+    RET
+
+%if ARCH_X86_32 ; NOTE(review): DECLARE_REG_TMP is defined outside this hunk; presumably it binds t0/t1 (written by PREP_8TAP_FN) to the listed register numbers - confirm
+DECLARE_REG_TMP 1, 2
+%elif WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+%macro PREP_8TAP_FN 3 ; type, type_h, type_v -- emits one prep_8tap_<type> entry point
+cglobal prep_8tap_%1
+    mov                 t0d, FILTER_%2 ; horizontal filter constant; prep_8tap adds it to mxd
+    mov                 t1d, FILTER_%3 ; vertical filter constant; prep_8tap adds it to myd
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter (prep_8tap is declared right after the instantiations)
+    jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX) ; continue in the shared prep_8tap body
+%endif
+%endmacro
+
+PREP_8TAP_FN regular,        REGULAR, REGULAR
+PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_FN sharp,          SHARP,   SHARP
+PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH ; keep last: PREP_8TAP_FN emits this one without the jmp to prep_8tap
+
+%if ARCH_X86_32 ; x86-32: [base+sym] is addressed relative to base_reg, which prep_8tap loads with prep_ssse3; W32_RESTORE_SSQ reloads strideq from stridem
+ %define base_reg r2
+ %define base base_reg-prep_ssse3
+ %define W32_RESTORE_SSQ mov strideq, stridem
+%else ; x86-64: base is 0, so [base+sym] is a plain symbol reference; W32_RESTORE_SSQ expands to nothing
+ %define base_reg r7
+ %define base 0
+ %define W32_RESTORE_SSQ
+%endif
+
+cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+    imul                mxd, mxm, 0x010101
+    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
+    imul                myd, mym, 0x010101
+    add                 myd, t1d ; 8tap_v, my, 4tap_v
+    movsxd               wq, wm
+    movifnidn          srcd, srcm
+    movifnidn            hd, hm
+    LEA            base_reg, prep_ssse3
+    test                mxd, 0xf00
+    jnz .h
+    test                myd, 0xf00
+    jnz .v
+    tzcnt                wd, wd
+    movzx                wd, word [base_reg+wq*2+table_offset(prep,)]
+    add                  wq, base_reg
+    movifnidn       strided, stridem
+    lea                  r6, [strideq*3]
+    %assign stack_offset org_stack_offset
+%if WIN64
+    pop                  r8
+    pop                  r7
+%endif
+    jmp                  wq
+.h:
+    test                myd, 0xf00
+    jnz .hv
+    WIN64_SPILL_XMM      12
+    cmp                  wd, 4
+    je .h_w4
+    tzcnt                wd, wd
+%if ARCH_X86_64
+    mova                m10, [base+subpel_h_shufA]
+    mova                m11, [base+subpel_h_shufB]
+    mova                 m9, [base+subpel_h_shufC]
+%endif
+    shr                 mxd, 16
+    sub                srcq, 3
+    movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
+    movd                 m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0]
+    pshufd               m5, m5, q0000
+    movd                 m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4]
+    pshufd               m6, m6, q0000
+    mova                 m7, [base+pw_8192]
+    add                  wq, base_reg
+    jmp                  wq
+.h_w4:
+%if ARCH_X86_32
+    and                 mxd, 0xff
+%else
+    movzx               mxd, mxb
+%endif
+    dec                srcq
+    movd                 m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+    pshufd               m4, m4, q0000
+    mova                 m6, [base+pw_8192]
+    mova                 m5, [base+subpel_h_shufA]
+    W32_RESTORE_SSQ
+%if ARCH_X86_64
+    lea            stride3q, [strideq*3]
+%endif
+.h_w4_loop:
+    movq                 m0, [srcq+strideq*0] ; 0
+    movq                 m1, [srcq+strideq*1] ; 1
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    movq                 m2, [srcq+strideq*0] ; 2
+    movq                 m3, [srcq+strideq*1] ; 3
+    lea                srcq, [srcq+strideq*2]
+%else
+    movq                 m2, [srcq+strideq*2] ; 2
+    movq                 m3, [srcq+stride3q ] ; 3
+    lea                srcq, [srcq+strideq*4]
+%endif
+    pshufb               m0, m5 ; subpel_h_shufA
+    pshufb               m1, m5
+    pshufb               m2, m5
+    pshufb               m3, m5
+    pmaddubsw            m0, m4 ; subpel_filters + 2
+    pmaddubsw            m1, m4
+    pmaddubsw            m2, m4
+    pmaddubsw            m3, m4
+    phaddw               m0, m1
+    phaddw               m2, m3
+    pmulhrsw             m0, m6 ; pw_8192
+    pmulhrsw             m2, m6 ; pw_8192
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m2
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .h_w4_loop
+    RET
+    ;
+%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3]; relies on m5/m6 (filter bytes +0/+4, broadcast) and m7 (pw_8192) set up at .h
+%if ARCH_X86_32 ; shuffle masks are read from memory
+    pshufb               %2, %1, [base+subpel_h_shufB]
+    pshufb               %3, %1, [base+subpel_h_shufC]
+    pshufb               %1,     [base+subpel_h_shufA]
+%else ; shuffle masks were loaded into m9-m11 at .h
+    pshufb               %2, %1, m11; subpel_h_shufB
+    pshufb               %3, %1, m9 ; subpel_h_shufC
+    pshufb               %1, m10    ; subpel_h_shufA
+%endif
+    pmaddubsw            %4, %2, m5  ; subpel +0 B0
+    pmaddubsw            %2, m6      ; subpel +4 B4
+    pmaddubsw            %3, m6      ; subpel +4 C4
+    pmaddubsw            %1, m5      ; subpel +0 A0
+    paddw                %3, %4      ; C4 + B0
+    paddw                %1, %2      ; A0 + B4
+    phaddw               %1, %3      ; sum adjacent word pairs of both partial results into dst
+    pmulhrsw             %1, m7      ; 8192 -> (x + 2) >> 2
+%endmacro
+    ;
+.h_w8:
+%if ARCH_X86_32
+    mov                  r3, r2
+    %define        base_reg  r3
+    W32_RESTORE_SSQ
+%endif
+.h_w8_loop:
+    movu                 m0,     [srcq+strideq*0]
+    movu                 m1,     [srcq+strideq*1]
+    lea                srcq,     [srcq+strideq*2]
+    PREP_8TAP_H          m0, m2, m3, m4
+    PREP_8TAP_H          m1, m2, m3, m4
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    add                tmpq, 32
+    sub                  hd, 2
+    jg .h_w8_loop
+    RET
+.h_w16:
+    xor                 r6d, r6d
+    jmp .h_start
+.h_w32:
+    mov                  r6, -16*1
+    jmp .h_start
+.h_w64:
+    mov                  r6, -16*3
+    jmp .h_start
+.h_w128:
+    mov                  r6, -16*7
+.h_start:
+%if ARCH_X86_32
+    mov                  r3, r2
+ %define           base_reg  r3
+%endif
+    sub                srcq, r6
+    mov                  r5, r6
+    W32_RESTORE_SSQ
+.h_loop:
+    movu                 m0,     [srcq+r6+8*0]
+    movu                 m1,     [srcq+r6+8*1]
+    PREP_8TAP_H          m0, m2, m3, m4
+    PREP_8TAP_H          m1, m2, m3, m4
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    add                tmpq, 32
+    add                  r6, 16
+    jle .h_loop
+    add                srcq, strideq
+    mov                  r6, r5
+    dec                  hd
+    jg .h_loop
+    RET
+%if ARCH_X86_32
+ %define            base_reg r2
+%endif
+
+.v:
+%if ARCH_X86_32
+    mov                 mxd, myd
+    and                 mxd, 0xff
+%else
+ %assign stack_offset org_stack_offset
+    WIN64_SPILL_XMM      16
+    movzx               mxd, myb
+%endif
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmovle              myd, mxd
+    lea                 myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    mova                 m2, [base+pw_512]
+    psrlw                m2, m2, 1 ; 0x0100
+    mova                 m7, [base+pw_8192]
+%if ARCH_X86_32
+ %define            subpel0  [rsp+mmsize*0]
+ %define            subpel1  [rsp+mmsize*1]
+ %define            subpel2  [rsp+mmsize*2]
+ %define            subpel3  [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+    ALLOC_STACK   -mmsize*4
+%assign regs_used 7
+    movd                 m0, [myq+0]
+    pshufb               m0, m2
+    mova            subpel0, m0
+    movd                 m0, [myq+2]
+    pshufb               m0, m2
+    mova            subpel1, m0
+    movd                 m0, [myq+4]
+    pshufb               m0, m2
+    mova            subpel2, m0
+    movd                 m0, [myq+6]
+    pshufb               m0, m2
+    mova            subpel3, m0
+    mov             strideq, [rstk+stack_offset+gprsize*3]
+    lea             strideq, [strideq*3]
+    sub [rstk+stack_offset+gprsize*2], strideq
+    mov             strideq, [rstk+stack_offset+gprsize*3]
+    mov                srcq, [rstk+stack_offset+gprsize*2]
+%else
+ %define            subpel0  m8
+ %define            subpel1  m9
+ %define            subpel2  m10
+ %define            subpel3  m11
+    movd            subpel0, [myq+0]
+    pshufb          subpel0, m2
+    movd            subpel1, [myq+2]
+    pshufb          subpel1, m2
+    movd            subpel2, [myq+4]
+    pshufb          subpel2, m2
+    movd            subpel3, [myq+6]
+    pshufb          subpel3, m2
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    cmp                  wd, 8
+    jg .v_w16
+    je .v_w8
+%endif
+.v_w4:
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+ %define               srcm [rsp+mmsize*4+gprsize*1]
+ %define               tmpm [rsp+mmsize*4+gprsize*2]
+%endif
+    mov                tmpm, tmpq
+    mov                srcm, srcq
+    lea                 r5d, [wq - 4] ; horizontal loop
+    shl                 r5d, (16 - 2)  ; (wq / 4) << 16
+    mov                 r5w, hw
+.v_w4_loop0:
+%endif
+    movd                 m2, [srcq+strideq*0] ; 0
+    movhps               m2, [srcq+strideq*2] ; 0 _ 2
+    movd                 m3, [srcq+strideq*1] ; 1
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    movhps               m3, [srcq+strideq*1] ; 1 _ 3
+    lea                srcq, [srcq+strideq*2]
+%else
+    movhps               m3, [srcq+stride3q ] ; 1 _ 3
+    lea                srcq, [srcq+strideq*4]
+%endif
+    pshufd               m2, m2, q2020    ; 0 2 0 2
+    pshufd               m3, m3, q2020    ; 1 3 1 3
+    punpckldq            m2, m3           ; 0 1 2 3
+    movd                 m3, [srcq+strideq*0] ; 4
+    movd                 m1, [srcq+strideq*1] ; 5
+    movd                 m0, [srcq+strideq*2] ; 6
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    add                srcq, strideq
+%else
+    add                srcq, stride3q
+%endif
+    punpckldq            m3, m1           ; 4 5 _ _
+    punpckldq            m1, m0           ; 5 6 _ _
+    palignr              m4, m3, m2, 4    ; 1 2 3 4
+    punpcklbw            m3, m1           ; 45 56
+    punpcklbw            m1, m2, m4       ; 01 12
+    punpckhbw            m2, m4           ; 23 34
+.v_w4_loop:
+    pmaddubsw            m5, m1, subpel0  ; a0 b0
+    mova                 m1, m2
+    pmaddubsw            m2, subpel1      ; a1 b1
+    paddw                m5, m2
+    mova                 m2, m3
+    pmaddubsw            m3, subpel2      ; a2 b2
+    paddw                m5, m3
+    movd                 m4, [srcq+strideq*0]
+    punpckldq            m3, m0, m4       ; 6 7 _ _
+    movd                 m0, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    punpckldq            m4, m0           ; 7 8 _ _
+    punpcklbw            m3, m4           ; 67 78
+    pmaddubsw            m4, m3, subpel3  ; a3 b3
+    paddw                m5, m4
+    pmulhrsw             m5, m7
+    movq        [tmpq+wq*0], m5
+    movhps      [tmpq+wq*2], m5
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .v_w4_loop
+%if ARCH_X86_32
+    mov                  hw, r5w ; reset vertical loop
+    mov                tmpq, tmpm
+    mov                srcq, srcm
+    add                tmpq, 8
+    add                srcq, 4
+    mov                tmpm, tmpq
+    mov                srcm, srcq
+    sub                 r5d, 1<<16 ; horizontal--
+    jg .v_w4_loop0
+%endif
+    RET
+
+%if ARCH_X86_64
+.v_w8:
+.v_w16:
+    lea                 r5d, [wq - 8] ; horizontal loop
+    mov                  r8, tmpq
+    mov                  r6, srcq
+    shl                 r5d, 8 - 3; (wq / 8) << 8
+    mov                 r5b, hb
+.v_w8_loop0:
+    movq                 m4, [srcq+strideq*0]   ; 0
+    movq                 m5, [srcq+strideq*1]   ; 1
+    lea                srcq, [srcq+strideq*2]
+    movq                 m6, [srcq+strideq*0]   ; 2
+    movq                 m0, [srcq+strideq*1]   ; 3
+    lea                srcq, [srcq+strideq*2]
+    movq                 m1, [srcq+strideq*0]   ; 4
+    movq                 m2, [srcq+strideq*1]   ; 5
+    lea                srcq, [srcq+strideq*2]   ;
+    movq                 m3, [srcq+strideq*0]   ; 6
+    shufpd               m4, m0, 0x0c
+    shufpd               m5, m1, 0x0c
+    punpcklbw            m1, m4, m5 ; 01
+    punpckhbw            m4, m5     ; 34
+    shufpd               m6, m2, 0x0c
+    punpcklbw            m2, m5, m6 ; 12
+    punpckhbw            m5, m6     ; 45
+    shufpd               m0, m3, 0x0c
+    punpcklbw            m3, m6, m0 ; 23
+    punpckhbw            m6, m0     ; 56
+.v_w8_loop:
+    movq                m12, [srcq+strideq*1]   ; 8
+    lea                srcq, [srcq+strideq*2]
+    movq                m13, [srcq+strideq*0]   ; 9
+    pmaddubsw           m14, m1, subpel0 ; a0
+    pmaddubsw           m15, m2, subpel0 ; b0
+    mova                 m1, m3
+    mova                 m2, m4
+    pmaddubsw            m3, subpel1 ; a1
+    pmaddubsw            m4, subpel1 ; b1
+    paddw               m14, m3
+    paddw               m15, m4
+    mova                 m3, m5
+    mova                 m4, m6
+    pmaddubsw            m5, subpel2 ; a2
+    pmaddubsw            m6, subpel2 ; b2
+    paddw               m14, m5
+    paddw               m15, m6
+    shufpd               m6, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m6, m0  ; 67
+    punpckhbw            m6, m0      ; 78
+    pmaddubsw           m12, m5, subpel3 ; a3
+    pmaddubsw           m13, m6, subpel3 ; b3
+    paddw               m14, m12
+    paddw               m15, m13
+    pmulhrsw            m14, m7
+    pmulhrsw            m15, m7
+    movu        [tmpq+wq*0], xm14
+    movu        [tmpq+wq*2], xm15
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .v_w8_loop
+    movzx                hd, r5b ; reset vertical loop
+    add                  r8, 16
+    add                  r6, 8
+    mov                tmpq, r8
+    mov                srcq, r6
+    sub                 r5d, 1<<8 ; horizontal--
+    jg .v_w8_loop0
+    RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+
+.hv:
+    %assign stack_offset org_stack_offset
+    cmp                  wd, 4
+    jg .hv_w8
+    and                 mxd, 0xff
+    movd                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+%if ARCH_X86_32
+    mov                 mxd, myd
+    and                 mxd, 0xff
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmovle              myd, mxd
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    mov                  r5, r2; use as new base
+ %define           base_reg  r5
+ %assign regs_used 2
+    ALLOC_STACK  -mmsize*14
+ %assign regs_used 7
+    mov             strideq, [rstk+stack_offset+gprsize*3]
+    lea             strideq, [strideq*3 + 1]
+    sub [rstk+stack_offset+gprsize*2], strideq
+    mov             strideq, [rstk+stack_offset+gprsize*3]
+    mov                srcq, [rstk+stack_offset+gprsize*2]
+ %define           subpelv0  [rsp+mmsize*0]
+ %define           subpelv1  [rsp+mmsize*1]
+ %define           subpelv2  [rsp+mmsize*2]
+ %define           subpelv3  [rsp+mmsize*3]
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    pshufd               m6, m0, q0000
+    mova           subpelv0, m6
+    pshufd               m6, m0, q1111
+    mova           subpelv1, m6
+    pshufd               m6, m0, q2222
+    mova           subpelv2, m6
+    pshufd               m6, m0, q3333
+    mova           subpelv3, m6
+%else
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmovle              myd, mxd
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    ALLOC_STACK   mmsize*14, 14
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    dec                srcq
+ %define           subpelv0  m10
+ %define           subpelv1  m11
+ %define           subpelv2  m12
+ %define           subpelv3  m13
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    mova                 m8, [base+pw_8192]
+    mova                 m9, [base+pd_32]
+    pshufd              m10, m0, q0000
+    pshufd              m11, m0, q1111
+    pshufd              m12, m0, q2222
+    pshufd              m13, m0, q3333
+%endif
+    pshufd               m7, m1, q0000
+.hv_w4:
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+    ;
+    ;
+%if ARCH_X86_32
+ %define           w8192reg  [base+pw_8192]
+ %define             d32reg  [base+pd_32]
+%else
+ %define           w8192reg  m8
+ %define             d32reg  m9
+%endif
+    ; lower shuffle 0 1 2 3 4
+    mova                 m6, [base+subpel_h_shuf4]
+    movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
+    movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
+    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    add                srcq, strideq
+    movhps               m4, [srcq+strideq*0]   ; 2 _ 3 _
+    add                srcq, strideq
+%else
+    movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
+    lea                srcq, [srcq+strideq*4]
+%endif
+    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+    pmaddubsw            m2, m7 ;H subpel_filters
+    pmaddubsw            m0, m7 ;H subpel_filters
+    phaddw               m2, m0 ;H 0 1 2 3
+    pmulhrsw             m2, w8192reg ;H pw_8192
+    SAVELINE_W4          m2, 2, 0
+    ; upper shuffle 2 3 4 5 6
+    mova                 m6, [base+subpel_h_shuf4+16]
+    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+    pmaddubsw            m2, m7 ;H subpel_filters
+    pmaddubsw            m0, m7 ;H subpel_filters
+    phaddw               m2, m0 ;H 0 1 2 3
+    pmulhrsw             m2, w8192reg ;H pw_8192
+    ;
+    ; lower shuffle
+    mova                 m6, [base+subpel_h_shuf4]
+    movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
+    movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
+    movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
+    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+    pmaddubsw            m3, m7 ;H subpel_filters
+    pmaddubsw            m0, m7 ;H subpel_filters
+    phaddw               m3, m0 ;H 4 5 6 7
+    pmulhrsw             m3, w8192reg ;H pw_8192
+    SAVELINE_W4          m3, 3, 0
+    ; upper shuffle
+    mova                 m6, [base+subpel_h_shuf4+16]
+    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+    pmaddubsw            m3, m7 ;H subpel_filters
+    pmaddubsw            m0, m7 ;H subpel_filters
+    phaddw               m3, m0 ;H 4 5 6 7
+    pmulhrsw             m3, w8192reg ;H pw_8192
+    ;
+%if ARCH_X86_32
+    lea                srcq, [srcq+strideq*2]
+    add                srcq, strideq
+%else
+    add                srcq, stride3q
+%endif
+    ;process high
+    palignr              m4, m3, m2, 4;V 1 2 3 4
+    punpcklwd            m1, m2, m4  ; V 01 12
+    punpckhwd            m2, m4      ; V 23 34
+    pshufd               m0, m3, q2121;V 5 6 5 6
+    punpcklwd            m3, m0      ; V 45 56
+    SAVELINE_W4          m0, 0, 1
+    SAVELINE_W4          m1, 1, 1
+    SAVELINE_W4          m2, 2, 1
+    SAVELINE_W4          m3, 3, 1
+    ;process low
+    RESTORELINE_W4       m2, 2, 0
+    RESTORELINE_W4       m3, 3, 0
+    palignr              m4, m3, m2, 4;V 1 2 3 4
+    punpcklwd            m1, m2, m4  ; V 01 12
+    punpckhwd            m2, m4      ; V 23 34
+    pshufd               m0, m3, q2121;V 5 6 5 6
+    punpcklwd            m3, m0      ; V 45 56
+.hv_w4_loop:
+    ;process low
+    pmaddwd              m5, m1, subpelv0 ; V a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, subpelv1; V a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, subpelv2; V a2 b2
+    paddd                m5, m3
+    ;
+    mova                 m6, [base+subpel_h_shuf4]
+    movq                 m4, [srcq+strideq*0] ; 7
+    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
+    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+    pmaddubsw            m4, m7 ;H subpel_filters
+    phaddw               m4, m4 ;H                7 8 7 8
+    pmulhrsw             m4, w8192reg ;H pw_8192
+    palignr              m3, m4, m0, 12         ; 6 7 8 7
+    mova                 m0, m4
+    punpcklwd            m3, m4      ; 67 78
+    pmaddwd              m4, m3, subpelv3; a3 b3
+    paddd                m5, d32reg ; pd_32
+    paddd                m5, m4
+    psrad                m5, 6
+    SAVELINE_W4          m0, 0, 0
+    SAVELINE_W4          m1, 1, 0
+    SAVELINE_W4          m2, 2, 0
+    SAVELINE_W4          m3, 3, 0
+    SAVELINE_W4          m5, 5, 0
+    ;process high
+    RESTORELINE_W4       m0, 0, 1
+    RESTORELINE_W4       m1, 1, 1
+    RESTORELINE_W4       m2, 2, 1
+    RESTORELINE_W4       m3, 3, 1
+    pmaddwd              m5, m1, subpelv0; V a0 b0
+    mova                 m1, m2
+    pmaddwd              m2, subpelv1; V a1 b1
+    paddd                m5, m2
+    mova                 m2, m3
+    pmaddwd              m3, subpelv2; V a2 b2
+    paddd                m5, m3
+    ;
+    mova                 m6, [base+subpel_h_shuf4+16]
+    movq                 m4, [srcq+strideq*0] ; 7
+    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
+    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+    pmaddubsw            m4, m7 ;H subpel_filters
+    phaddw               m4, m4 ;H                7 8 7 8
+    pmulhrsw             m4, w8192reg ;H pw_8192
+    palignr              m3, m4, m0, 12         ; 6 7 8 7
+    mova                 m0, m4
+    punpcklwd            m3, m4      ; 67 78
+    pmaddwd              m4, m3, subpelv3; a3 b3
+    paddd                m5, d32reg ; pd_32
+    paddd                m5, m4
+    psrad                m4, m5, 6
+    ;
+    RESTORELINE_W4       m5, 5, 0
+    packssdw             m5, m4
+    pshufd               m5, m5, q3120
+    movu             [tmpq], m5
+    lea                srcq, [srcq+strideq*2]
+    add                tmpq, 16
+    sub                  hd, 2
+    SAVELINE_W4          m0, 0, 1
+    SAVELINE_W4          m1, 1, 1
+    SAVELINE_W4          m2, 2, 1
+    SAVELINE_W4          m3, 3, 1
+    RESTORELINE_W4       m0, 0, 0
+    RESTORELINE_W4       m1, 1, 0
+    RESTORELINE_W4       m2, 2, 0
+    RESTORELINE_W4       m3, 3, 0
+    jg .hv_w4_loop
+    RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+    ;
+
+
+.hv_w8: ; combined H+V filter path: works in 4-column strips (see .hv_w8_outer), two output rows per .hv_w8_loop pass
+    %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+    shr                 mxd, 16 ; mxd >>= 16: index used for the horizontal filter load below
+%if ARCH_X86_32
+ %define           base_reg  r2
+ %define           subpelh0  [rsp+mmsize*5]
+ %define           subpelh1  [rsp+mmsize*6]
+ %define           subpelv0  [rsp+mmsize*7]
+ %define           subpelv1  [rsp+mmsize*8]
+ %define           subpelv2  [rsp+mmsize*9]
+ %define           subpelv3  [rsp+mmsize*10]
+ %define             accuv0  [rsp+mmsize*11]
+ %define             accuv1  [rsp+mmsize*12]
+    movq                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3] ; 8 bytes of horizontal filter taps
+    movzx               mxd, myw
+    and                 mxd, 0xff ; mxd = low byte of my
+    shr                 myd, 16   ; myd = my >> 16
+    cmp                  hd, 4
+    cmovle              myd, mxd  ; h <= 4: use the low-byte filter index instead
+    movq                 m5, [base_reg+myq*8+subpel_filters-prep_ssse3] ; 8 bytes of vertical filter taps
+    ALLOC_STACK  -mmsize*13 ; 13 xmm-sized slots: 5..12 are the defines above, 0..4 presumably the hv8_line_* spill slots
+%if STACK_ALIGNMENT < mmsize
+    mov                rstk, r2m ; rstk serves as a scratch GPR here (NOTE(review): relies on x86inc negative-size ALLOC_STACK semantics - confirm)
+ %define               tmpm  [rsp+mmsize*13+gprsize*1]
+ %define               srcm  [rsp+mmsize*13+gprsize*2]
+ %define            stridem  [rsp+mmsize*13+gprsize*3]
+    mov             stridem, rstk ; copy of argument r2m kept just above the xmm slots
+%endif
+    mov                  r6, r2 ; keep the table base in r6; base_reg is redefined to r6 on the next line
+%define base_reg r6
+    pshufd               m0, m1, q0000 ; broadcast horizontal taps 0-3
+    pshufd               m1, m1, q1111 ; broadcast horizontal taps 4-7
+    punpcklbw            m5, m5
+    psraw                m5, 8 ; sign-extend
+    pshufd               m2, m5, q0000 ; vertical taps 0,1 as words, broadcast
+    pshufd               m3, m5, q1111 ; vertical taps 2,3
+    pshufd               m4, m5, q2222 ; vertical taps 4,5
+    pshufd               m5, m5, q3333 ; vertical taps 6,7
+    mova           subpelh0, m0
+    mova           subpelh1, m1
+    mova           subpelv0, m2
+    mova           subpelv1, m3
+    mova           subpelv2, m4
+    mova           subpelv3, m5
+    W32_RESTORE_SSQ ; macro defined elsewhere; presumably reloads the stride register - confirm
+    lea             strided, [strided*3] ; stride * 3
+    sub                srcd, strided ; src -= 3 rows
+    sub                srcd, 3       ; src -= 3 bytes
+    mov                srcm, srcd    ; remember the strip start for .hv_w8_outer
+    W32_RESTORE_SSQ ; strided was overwritten with stride*3 above
+%else
+    ALLOC_STACK    mmsize*5, 16 ; 5 xmm-sized slots (hv8_line_* indices 0..4); m0-m15 are used below
+ %define           subpelh0  m10
+ %define           subpelh1  m11
+ %define           subpelv0  m12
+ %define           subpelv1  m13
+ %define           subpelv2  m14
+ %define           subpelv3  m15
+ %define             accuv0  m8
+ %define             accuv1  m9
+    movq                 m0, [base_reg+mxq*8+subpel_filters-prep_ssse3] ; 8 bytes of horizontal filter taps
+    movzx               mxd, myb ; mxd = low byte of my
+    shr                 myd, 16  ; myd = my >> 16
+    cmp                  hd, 4
+    cmovle              myd, mxd ; h <= 4: use the low-byte filter index instead
+    movq                 m1, [base_reg+myq*8+subpel_filters-prep_ssse3] ; 8 bytes of vertical filter taps
+    pshufd         subpelh0, m0, q0000 ; broadcast horizontal taps 0-3
+    pshufd         subpelh1, m0, q1111 ; broadcast horizontal taps 4-7
+    punpcklbw            m1, m1
+    psraw                m1, 8 ; sign-extend
+    pshufd         subpelv0, m1, q0000 ; vertical taps 0,1 as words, broadcast
+    pshufd         subpelv1, m1, q1111 ; vertical taps 2,3
+    pshufd         subpelv2, m1, q2222 ; vertical taps 4,5
+    pshufd         subpelv3, m1, q3333 ; vertical taps 6,7
+    lea                stride3q, [strideq*3]
+    sub                srcq, 3        ; src -= 3 bytes
+    sub                srcq, stride3q ; src -= 3 rows
+    mov                  r6, srcq     ; r6 = strip start, advanced in .hv_w8_outer
+%endif
+    lea                 r5d, [wq-4]
+%if ARCH_X86_64
+    mov                  r8, tmpq ; r8 = tmp start of the current strip
+%else
+    mov                tmpm, tmpq ; tmpm = tmp start of the current strip
+%endif
+    shl                 r5d, (16 - 2) ; r5d[31:16] = (w-4)/4 = strips left after the current one
+    mov                 r5w, hw       ; r5d[15:0]  = h, reloaded for every strip
+.hv_w8_loop0: ; per-strip prologue: horizontally filter rows 0-6
+    movu                 m4, [srcq+strideq*0] ; 0 = _ _
+    movu                 m5, [srcq+strideq*1] ; 1 = _ _
+    lea                srcq, [srcq+strideq*2]
+%if ARCH_X86_64
+    mova                 m7, [base+subpel_h_shufA]
+    mova                 m8, [base+subpel_h_shufB]
+    mova                 m9, [base+subpel_h_shufC]
+%endif
+    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+    movu                 m6, [srcq+strideq*0] ; 2 = _ _
+    movu                 m0, [srcq+strideq*1] ; 3 = _ _
+    lea                srcq, [srcq+strideq*2]
+    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+    HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+    ; scale rows 0-3 by pw_8192, then interleave adjacent rows into word pairs for pmaddwd
+    mova                 m7, [base+pw_8192] ; overwrites the shufA copy in m7
+    pmulhrsw             m4, m7 ; H pw_8192
+    pmulhrsw             m5, m7 ; H pw_8192
+    pmulhrsw             m6, m7 ; H pw_8192
+    pmulhrsw             m0, m7 ; H pw_8192
+    punpcklwd            m1, m4, m5  ; 0 1 ~
+    punpcklwd            m2, m5, m6  ; 1 2 ~
+    punpcklwd            m3, m6, m0  ; 2 3 ~
+    SAVELINE_W8           1, m1 ; spill pair 01 (m1-m3 are reused by HV_H_W8 below)
+    SAVELINE_W8           2, m2 ; spill pair 12
+    SAVELINE_W8           3, m3 ; spill pair 23
+    ; rows 4-6; m7 held pw_8192, so shufA is loaded again
+    mova                 m7, [base+subpel_h_shufA]
+    movu                 m4, [srcq+strideq*0]       ; 4 = _ _
+    movu                 m5, [srcq+strideq*1]       ; 5 = _ _
+    lea                srcq, [srcq+strideq*2]
+    movu                 m6, [srcq+strideq*0]       ; 6 = _ _
+    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+    mova                 m7, [base+pw_8192]
+    pmulhrsw             m1, m4, m7 ; H pw_8192 4 ~
+    pmulhrsw             m2, m5, m7 ; H pw_8192 5 ~
+    pmulhrsw             m3, m6, m7 ; H pw_8192 6 ~
+    punpcklwd            m4, m0, m1  ; 3 4 ~
+    punpcklwd            m5, m1, m2  ; 4 5 ~
+    punpcklwd            m6, m2, m3  ; 5 6 ~
+    ; spill row 6 and bring pairs 01/12/23 back: loop entry state is m1..m6 = 01 12 23 34 45 56
+    SAVELINE_W8           6, m3
+    RESTORELINE_W8        1, m1
+    RESTORELINE_W8        2, m2
+    RESTORELINE_W8        3, m3
+.hv_w8_loop:
+    ; m8 accu for V a (x86-64; on x86-32 it is m0, spilled to accuv0)
+    ; m9 accu for V b (x86-64; on x86-32 it is m7, spilled to accuv1)
+    SAVELINE_W8           1, m3 ; reloaded into m1 for the next pass (window slides by two rows)
+    SAVELINE_W8           2, m4 ; reloaded into m2 for the next pass
+    SAVELINE_W8           3, m5 ; reloaded into m3 for the next pass
+    SAVELINE_W8           4, m6 ; reloaded into m4 for the next pass
+%if ARCH_X86_32
+    pmaddwd              m0, m1, subpelv0 ; a0
+    pmaddwd              m7, m2, subpelv0 ; b0
+    pmaddwd              m3, subpelv1     ; a1
+    pmaddwd              m4, subpelv1     ; b1
+    paddd                m0, m3
+    paddd                m7, m4
+    pmaddwd              m5, subpelv2     ; a2
+    pmaddwd              m6, subpelv2     ; b2
+    paddd                m0, m5
+    paddd                m7, m6
+    mova                 m5, [base+pd_32]
+    paddd                m0, m5 ;   pd_32 (added before the psrad by 6 below)
+    paddd                m7, m5 ;   pd_32
+    mova             accuv0, m0
+    mova             accuv1, m7
+%else
+    pmaddwd              m8, m1, subpelv0 ; a0
+    pmaddwd              m9, m2, subpelv0 ; b0
+    pmaddwd              m3, subpelv1     ; a1
+    pmaddwd              m4, subpelv1     ; b1
+    paddd                m8, m3
+    paddd                m9, m4
+    pmaddwd              m5, subpelv2     ; a2
+    pmaddwd              m6, subpelv2     ; b2
+    paddd                m8, m5
+    paddd                m9, m6
+    mova                 m7, [base+pd_32]
+    paddd                m8, m7 ;   pd_32 (added before the psrad by 6 below)
+    paddd                m9, m7 ;   pd_32
+    mova                 m7, [base+subpel_h_shufB] ; m8/m9 now hold the accumulators, so the shuffles go to m7/m6/m5
+    mova                 m6, [base+subpel_h_shufC]
+    mova                 m5, [base+subpel_h_shufA]
+%endif
+    movu                 m0, [srcq+strideq*1] ; 7
+    movu                 m4, [srcq+strideq*2] ; 8
+    lea                srcq, [srcq+strideq*2] ; advance two source rows
+    HV_H_W8              m0, m1, m2, m3, m5, m7, m6
+    HV_H_W8              m4, m1, m2, m3, m5, m7, m6
+    mova                 m5, [base+pw_8192]
+    pmulhrsw             m0, m5 ; H pw_8192
+    pmulhrsw             m4, m5 ; H pw_8192
+    RESTORELINE_W8        6, m6 ; row 6 saved before the loop / at the end of the previous pass
+    punpcklwd            m5, m6, m0  ; 6 7  ~
+    punpcklwd            m6, m0, m4  ; 7 8 ~
+    pmaddwd              m1, m5, subpelv3 ; a3
+    paddd                m2, m1, accuv0 ; row a: a3 + (a0 + a1 + a2 + pd_32)
+    pmaddwd              m1, m6, subpelv3 ; b3
+    paddd                m1, m1, accuv1 ; row b: b3 + (b0 + b1 + b2 + pd_32)
+    psrad                m2, 6
+    psrad                m1, 6
+    packssdw             m2, m1      ; d -> w
+    movq        [tmpq+wq*0], m2 ; row a: 4 words
+    movhps      [tmpq+wq*2], m2 ; row b: 4 words, w*2 bytes after row a
+    lea                tmpq, [tmpq+wq*4] ; tmp += 2 rows of w words
+    sub                  hd, 2
+    jle .hv_w8_outer
+    SAVELINE_W8           6, m4 ; row 8 takes the "row 6" slot for the next pass
+    RESTORELINE_W8        1, m1
+    RESTORELINE_W8        2, m2
+    RESTORELINE_W8        3, m3
+    RESTORELINE_W8        4, m4 ; m5/m6 still hold pairs 67/78 from this pass
+    jmp .hv_w8_loop
+.hv_w8_outer: ; strip finished: step to the next 4 columns
+    movzx                hd, r5w ; reload h from the low half of r5d
+%if ARCH_X86_32
+    add          dword tmpm, 8 ; tmp start += 4 words
+    mov                tmpq, tmpm
+    mov                srcq, srcm
+    add                srcq, 4 ; src start += 4 bytes
+    mov                srcm, srcq
+%else
+    add                  r8, 8 ; tmp start += 4 words
+    mov                tmpq, r8
+    add                  r6, 4 ; src start += 4 bytes
+    mov                srcq, r6
+%endif
+    sub                 r5d, 1<<16 ; one strip consumed from the upper half of r5d; sets the flags for jg
     jg .hv_w8_loop0
     RET