shithub: dav1d

Download patch

ref: 22fb8a42a1cb5e86f4b14070cd57430b37e57e2c
parent: 83956bf10e7cb4af3660cb7be2754657d9ecf1cd
author: Victorien Le Couviour--Tuffet <[email protected]>
date: Sun Jun 7 15:49:51 EDT 2020

x86: Adapt SSSE3 prep_8tap to SSE2

---------------------
x86_64:
------------------------------------------
mct_8tap_regular_w4_h_8bpc_c: 302.3
mct_8tap_regular_w4_h_8bpc_sse2: 47.3
mct_8tap_regular_w4_h_8bpc_ssse3: 19.5
---------------------
mct_8tap_regular_w8_h_8bpc_c: 745.5
mct_8tap_regular_w8_h_8bpc_sse2: 235.2
mct_8tap_regular_w8_h_8bpc_ssse3: 70.4
---------------------
mct_8tap_regular_w16_h_8bpc_c: 1844.3
mct_8tap_regular_w16_h_8bpc_sse2: 755.6
mct_8tap_regular_w16_h_8bpc_ssse3: 225.9
---------------------
mct_8tap_regular_w32_h_8bpc_c: 6685.5
mct_8tap_regular_w32_h_8bpc_sse2: 2954.4
mct_8tap_regular_w32_h_8bpc_ssse3: 795.8
---------------------
mct_8tap_regular_w64_h_8bpc_c: 15633.5
mct_8tap_regular_w64_h_8bpc_sse2: 7120.4
mct_8tap_regular_w64_h_8bpc_ssse3: 1900.4
---------------------
mct_8tap_regular_w128_h_8bpc_c: 37772.1
mct_8tap_regular_w128_h_8bpc_sse2: 17698.1
mct_8tap_regular_w128_h_8bpc_ssse3: 4665.5
------------------------------------------
mct_8tap_regular_w4_v_8bpc_c: 306.5
mct_8tap_regular_w4_v_8bpc_sse2: 71.7
mct_8tap_regular_w4_v_8bpc_ssse3: 37.9
---------------------
mct_8tap_regular_w8_v_8bpc_c: 923.3
mct_8tap_regular_w8_v_8bpc_sse2: 168.7
mct_8tap_regular_w8_v_8bpc_ssse3: 71.3
---------------------
mct_8tap_regular_w16_v_8bpc_c: 3040.1
mct_8tap_regular_w16_v_8bpc_sse2: 505.1
mct_8tap_regular_w16_v_8bpc_ssse3: 199.7
---------------------
mct_8tap_regular_w32_v_8bpc_c: 12354.8
mct_8tap_regular_w32_v_8bpc_sse2: 1942.0
mct_8tap_regular_w32_v_8bpc_ssse3: 714.2
---------------------
mct_8tap_regular_w64_v_8bpc_c: 29427.9
mct_8tap_regular_w64_v_8bpc_sse2: 4637.4
mct_8tap_regular_w64_v_8bpc_ssse3: 1829.2
---------------------
mct_8tap_regular_w128_v_8bpc_c: 72756.9
mct_8tap_regular_w128_v_8bpc_sse2: 11301.0
mct_8tap_regular_w128_v_8bpc_ssse3: 5020.6
------------------------------------------
mct_8tap_regular_w4_hv_8bpc_c: 876.9
mct_8tap_regular_w4_hv_8bpc_sse2: 171.7
mct_8tap_regular_w4_hv_8bpc_ssse3: 112.2
---------------------
mct_8tap_regular_w8_hv_8bpc_c: 2215.1
mct_8tap_regular_w8_hv_8bpc_sse2: 730.2
mct_8tap_regular_w8_hv_8bpc_ssse3: 330.9
---------------------
mct_8tap_regular_w16_hv_8bpc_c: 6075.5
mct_8tap_regular_w16_hv_8bpc_sse2: 2252.1
mct_8tap_regular_w16_hv_8bpc_ssse3: 973.4
---------------------
mct_8tap_regular_w32_hv_8bpc_c: 22182.7
mct_8tap_regular_w32_hv_8bpc_sse2: 7692.6
mct_8tap_regular_w32_hv_8bpc_ssse3: 3599.8
---------------------
mct_8tap_regular_w64_hv_8bpc_c: 50876.8
mct_8tap_regular_w64_hv_8bpc_sse2: 18499.6
mct_8tap_regular_w64_hv_8bpc_ssse3: 8815.6
---------------------
mct_8tap_regular_w128_hv_8bpc_c: 122926.3
mct_8tap_regular_w128_hv_8bpc_sse2: 45120.0
mct_8tap_regular_w128_hv_8bpc_ssse3: 22085.7
------------------------------------------

--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -62,30 +62,39 @@
 decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_sse2);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2);
 decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_sse2);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2);
 decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_sse2);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2);
 decl_mct_fn(dav1d_prep_bilin_avx512icl);
 decl_mct_fn(dav1d_prep_bilin_avx2);
 decl_mct_fn(dav1d_prep_bilin_ssse3);
@@ -144,6 +153,15 @@
 
 #if BITDEPTH == 8
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               sse2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        sse2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  sse2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         sse2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   sse2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  sse2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   sse2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          sse2);
 
     c->warp8x8  = dav1d_warp_affine_8x8_sse2;
     c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -66,9 +66,8 @@
 
 pb_64:    times 16 db 64
 pw_m256:  times 8 dw -256
-%if ARCH_X86_32
 pw_1:     times 8 dw 1
-%endif
+pw_2:     times 8 dw 2
 pw_8:     times 8 dw 8
 pw_26:    times 8 dw 26
 pw_34:    times 8 dw 34
@@ -159,6 +158,7 @@
     %endif
 %endmacro
 
+HV_JMP_TABLE prep,  8tap,  sse2, 1,    4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep, bilin,  sse2, 7,    4, 8, 16, 32, 64, 128
 HV_JMP_TABLE put,   8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep,  8tap, ssse3, 1,    4, 8, 16, 32, 64, 128
@@ -2573,13 +2573,198 @@
     jg .hv_w8_loop0
     RET
 
-%if ARCH_X86_32
-DECLARE_REG_TMP 1, 2
-%elif WIN64
-DECLARE_REG_TMP 6, 4
-%else
-DECLARE_REG_TMP 6, 7
-%endif
+%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask
+ %if cpuflag(ssse3)
+    pshufb               %1, %2
+ %else
+  %if %5 == 1
+    pcmpeqd              %2, %2
+    psrlq                %2, 32
+  %endif
+    psrldq               %3, %1, 1
+    pshufd               %3, %3, q2301
+    pand                 %1, %2
+    pandn                %4, %2, %3
+    por                  %1, %4
+ %endif
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %ifnidn %1, %2
+    mova                 %1, %2
+ %endif
+    PSHUFB_SUBPEL_H_4    %1, %3, %4, %5, %6
+%endmacro
+
+%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask
+ %if notcpuflag(ssse3)
+    psrlq                %1, %2, 16
+ %elifnidn %1, %2
+    mova                 %1, %2
+ %endif
+    PSHUFB_SUBPEL_H_4    %1, %3, %4, %5, %6
+%endmacro
+
+%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]
+ %if cpuflag(ssse3)
+    palignr              %1, %2, %3, %4
+ %else
+  %if %0 == 4
+   %assign %%i regnumof%+%1 + 1
+   %define %%tmp m %+ %%i
+  %else
+   %define %%tmp %5
+  %endif
+    psrldq               %1, %3, %4
+    pslldq            %%tmp, %2, 16-%4
+    por                  %1, %%tmp
+ %endif
+%endmacro
+
+%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
+ %if cpuflag(ssse3)
+    phaddw               %1, %2
+ %else
+  %ifnidn %1, %2
+   %if %4 == 1
+    mova                 %3, [pw_1]
+   %endif
+    pmaddwd              %1, %3
+    pmaddwd              %2, %3
+    packssdw             %1, %2
+  %else
+   %if %4 == 1
+    pmaddwd              %1, [pw_1]
+   %else
+    pmaddwd              %1, %3
+   %endif
+    packssdw             %1, %1
+  %endif
+ %endif
+%endmacro
+
+%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift
+ %if cpuflag(ssse3)
+    pmulhrsw             %1, %2, %3
+ %else
+    paddw                %1, %2, %3
+    psraw                %1, %4
+ %endif
+%endmacro
+
+%macro PMULHRSW_8192 3 ; dst, src1, src2
+    PMULHRSW_POW2        %1, %2, %3, 2
+%endmacro
+
+%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2]
+   movd                  %1, [%2+0]
+   movd                  %3, [%2+1]
+   movd                  %4, [%2+2]
+   movd                  %5, [%2+3]
+   punpckldq             %1, %3
+   punpckldq             %4, %5
+   punpcklqdq            %1, %4
+%endmacro
+
+%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc
+ %if cpuflag(ssse3)
+    movu                m%1, [%2]
+    pshufb               m2, m%1, m11 ; subpel_h_shufB
+    pshufb               m3, m%1, m9  ; subpel_h_shufC
+    pshufb              m%1, m10      ; subpel_h_shufA
+ %else
+  %if ARCH_X86_64
+    SWAP                m12, m5
+    SWAP                m13, m6
+    SWAP                m14, m7
+   %define %%mx0 m%+%%i
+   %define %%mx1 m%+%%j
+   %assign %%i 0
+   %rep 12
+    movd              %%mx0, [%2+%%i]
+    %assign %%i %%i+1
+   %endrep
+   %assign %%i 0
+   %rep 6
+    %assign %%j %%i+1
+    punpckldq         %%mx0, %%mx1
+    %assign %%i %%i+2
+   %endrep
+   %assign %%i 0
+   %rep 3
+    %assign %%j %%i+2
+    punpcklqdq        %%mx0, %%mx1
+    %assign %%i %%i+4
+   %endrep
+    SWAP                m%1, m0
+    SWAP                 m2, m4
+    SWAP                 m3, m8
+    SWAP                 m5, m12
+    SWAP                 m6, m13
+    SWAP                 m7, m14
+  %else
+    PREP_8TAP_H_LOAD4    m0, %2+0, m1, m4, m7
+    PREP_8TAP_H_LOAD4    m2, %2+4, m1, m4, m7
+    PREP_8TAP_H_LOAD4    m3, %2+8, m1, m4, m7
+    SWAP                m%1, m0
+  %endif
+ %endif
+%endmacro
+
+%macro PREP_8TAP_H 2 ; dst, src_memloc
+    PREP_8TAP_H_LOAD     %1, %2
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+    SWAP                 m8, m1
+    SWAP                 m9, m7
+ %endif
+ %xdefine mX m%+%1
+ %assign %%i regnumof%+mX
+ %define mX m%+%%i
+    mova                 m4, m2
+    PMADDUBSW            m4, m5, m1, m7, 1  ; subpel +0 B0
+    PMADDUBSW            m2, m6, m1, m7, 0  ; subpel +4 B4
+    PMADDUBSW            m3, m6, m1, m7, 0  ; subpel +4 C4
+    PMADDUBSW            mX, m5, m1, m7, 0  ; subpel +0 A0
+ %undef mX
+ %if ARCH_X86_64 && notcpuflag(ssse3)
+    SWAP                 m1, m8
+    SWAP                 m7, m9
+ %endif
+    paddw                m3, m4
+    paddw               m%1, m2
+    PHADDW              m%1, m3, m15, ARCH_X86_32
+ %if ARCH_X86_64 || cpuflag(ssse3)
+    PMULHRSW_8192       m%1, m%1, m7
+ %else
+    PMULHRSW_8192       m%1, m%1, [base+pw_2]
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV_LOAD 4 ; dst0, src_memloc, tmp[1-2]
+ %if cpuflag(ssse3)
+    movu                 %1, [%2]
+    pshufb               m2, %1, shufB
+    pshufb               m3, %1, shufC
+    pshufb               %1, shufA
+ %else
+    PREP_8TAP_H_LOAD4    %1, %2+0, m1, %3, %4
+    PREP_8TAP_H_LOAD4    m2, %2+4, m1, %3, %4
+    PREP_8TAP_H_LOAD4    m3, %2+8, m1, %3, %4
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]
+    PREP_8TAP_HV_LOAD %{1:4}
+    mova                 m1, m2
+    PMADDUBSW            m1, subpelh0, %3, %4, 1 ; subpel +0 C0
+    PMADDUBSW            m3, subpelh1, %3, %4, 0 ; subpel +4 B4
+    PMADDUBSW            m2, subpelh1, %3, %4, 0 ; C4
+    PMADDUBSW            %1, subpelh0, %3, %4, 0 ; A0
+    paddw                m1, m3           ; C0+B4
+    paddw                %1, m2           ; A0+C4
+    PHADDW               %1, m1, %3, 1
+%endmacro
+
 %macro PREP_8TAP_FN 3 ; type, type_h, type_v
 cglobal prep_8tap_%1
     mov                 t0d, FILTER_%2
@@ -2589,6 +2774,14 @@
 %endif
 %endmacro
 
+%macro PREP_8TAP 0
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1, 2
+%elif WIN64
+ DECLARE_REG_TMP 6, 4
+%else
+ DECLARE_REG_TMP 6, 7
+%endif
 PREP_8TAP_FN regular,        REGULAR, REGULAR
 PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
 PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
@@ -2601,7 +2794,7 @@
 
 %if ARCH_X86_32
  %define base_reg r2
- %define base base_reg-prep_ssse3
+ %define base base_reg-prep%+SUFFIX
  %define W32_RESTORE_SSQ mov strideq, stridem
 %else
  %define base_reg r7
@@ -2608,7 +2801,6 @@
  %define base 0
  %define W32_RESTORE_SSQ
 %endif
-
 cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %assign org_stack_offset stack_offset
     imul                mxd, mxm, 0x010101
@@ -2618,13 +2810,13 @@
     movsxd               wq, wm
     movifnidn          srcd, srcm
     movifnidn            hd, hm
-    LEA            base_reg, prep_ssse3
     test                mxd, 0xf00
     jnz .h
     test                myd, 0xf00
     jnz .v
+    LEA            base_reg, prep_ssse3
     tzcnt                wd, wd
-    movzx                wd, word [base_reg+wq*2+table_offset(prep,)]
+    movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
     add                  wq, base_reg
     movifnidn       strided, stridem
     lea                  r6, [strideq*3]
@@ -2635,25 +2827,49 @@
 %endif
     jmp                  wq
 .h:
+    LEA            base_reg, prep%+SUFFIX
     test                myd, 0xf00
     jnz .hv
+%if cpuflag(ssse3)
     WIN64_SPILL_XMM      12
+%else
+    WIN64_SPILL_XMM      16
+%endif
     cmp                  wd, 4
     je .h_w4
     tzcnt                wd, wd
-%if ARCH_X86_64
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
     mova                m10, [base+subpel_h_shufA]
     mova                m11, [base+subpel_h_shufB]
     mova                 m9, [base+subpel_h_shufC]
+ %else
+  %define m10 [base+subpel_h_shufA]
+  %define m11 [base+subpel_h_shufB]
+  %define m9  [base+subpel_h_shufC]
+ %endif
 %endif
     shr                 mxd, 16
     sub                srcq, 3
     movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
-    movd                 m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0]
+    movd                 m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0]
     pshufd               m5, m5, q0000
-    movd                 m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4]
+    movd                 m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4]
     pshufd               m6, m6, q0000
+%if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
+%else
+    punpcklbw            m5, m5
+    punpcklbw            m6, m6
+    psraw                m5, 8
+    psraw                m6, 8
+ %if ARCH_X86_64
+    mova                 m7, [pw_2]
+    mova                m15, [pw_1]
+ %else
+  %define m15 m4
+ %endif
+%endif
     add                  wq, base_reg
     jmp                  wq
 .h_w4:
@@ -2663,39 +2879,115 @@
     movzx               mxd, mxb
 %endif
     dec                srcq
-    movd                 m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+    movd                 m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
     pshufd               m4, m4, q0000
+%if cpuflag(ssse3)
     mova                 m6, [base+pw_8192]
     mova                 m5, [base+subpel_h_shufA]
+%else
+    mova                 m6, [base+pw_2]
+ %if ARCH_X86_64
+    mova                m14, [pw_1]
+ %else
+  %define m14 m7
+ %endif
+    punpcklbw            m4, m4
+    psraw                m4, 8
+%endif
     W32_RESTORE_SSQ
 %if ARCH_X86_64
     lea            stride3q, [strideq*3]
 %endif
 .h_w4_loop:
+%if cpuflag(ssse3)
     movq                 m0, [srcq+strideq*0] ; 0
     movq                 m1, [srcq+strideq*1] ; 1
-%if ARCH_X86_32
+ %if ARCH_X86_32
     lea                srcq, [srcq+strideq*2]
     movq                 m2, [srcq+strideq*0] ; 2
     movq                 m3, [srcq+strideq*1] ; 3
     lea                srcq, [srcq+strideq*2]
-%else
+ %else
     movq                 m2, [srcq+strideq*2] ; 2
     movq                 m3, [srcq+stride3q ] ; 3
     lea                srcq, [srcq+strideq*4]
-%endif
-    pshufb               m0, m5 ; subpel_h_shufA
+ %endif
+    pshufb               m0, m5
     pshufb               m1, m5
     pshufb               m2, m5
     pshufb               m3, m5
-    pmaddubsw            m0, m4 ; subpel_filters + 2
-    pmaddubsw            m1, m4
-    pmaddubsw            m2, m4
-    pmaddubsw            m3, m4
-    phaddw               m0, m1
-    phaddw               m2, m3
-    pmulhrsw             m0, m6 ; pw_8192
-    pmulhrsw             m2, m6 ; pw_8192
+%else
+ %if ARCH_X86_64
+    movd                 m0, [srcq+strideq*0+0]
+    movd                m12, [srcq+strideq*0+1]
+    movd                 m1, [srcq+strideq*1+0]
+    movd                 m5, [srcq+strideq*1+1]
+    movd                 m2, [srcq+strideq*2+0]
+    movd                m13, [srcq+strideq*2+1]
+    movd                 m3, [srcq+stride3q +0]
+    movd                 m7, [srcq+stride3q +1]
+    punpckldq            m0, m12
+    punpckldq            m1, m5
+    punpckldq            m2, m13
+    punpckldq            m3, m7
+    movd                m12, [srcq+strideq*0+2]
+    movd                 m8, [srcq+strideq*0+3]
+    movd                 m5, [srcq+strideq*1+2]
+    movd                 m9, [srcq+strideq*1+3]
+    movd                m13, [srcq+strideq*2+2]
+    movd                m10, [srcq+strideq*2+3]
+    movd                 m7, [srcq+stride3q +2]
+    movd                m11, [srcq+stride3q +3]
+    lea                srcq, [srcq+strideq*4]
+    punpckldq           m12, m8
+    punpckldq            m5, m9
+    punpckldq           m13, m10
+    punpckldq            m7, m11
+    punpcklqdq           m0, m12 ; 0
+    punpcklqdq           m1, m5  ; 1
+    punpcklqdq           m2, m13 ; 2
+    punpcklqdq           m3, m7  ; 3
+ %else
+    movd                 m0, [srcq+strideq*0+0]
+    movd                 m1, [srcq+strideq*0+1]
+    movd                 m2, [srcq+strideq*0+2]
+    movd                 m3, [srcq+strideq*0+3]
+    punpckldq            m0, m1
+    punpckldq            m2, m3
+    punpcklqdq           m0, m2 ; 0
+    movd                 m1, [srcq+strideq*1+0]
+    movd                 m2, [srcq+strideq*1+1]
+    movd                 m3, [srcq+strideq*1+2]
+    movd                 m7, [srcq+strideq*1+3]
+    lea                srcq, [srcq+strideq*2]
+    punpckldq            m1, m2
+    punpckldq            m3, m7
+    punpcklqdq           m1, m3 ; 1
+    movd                 m2, [srcq+strideq*0+0]
+    movd                 m3, [srcq+strideq*0+1]
+    movd                 m7, [srcq+strideq*0+2]
+    movd                 m5, [srcq+strideq*0+3]
+    punpckldq            m2, m3
+    punpckldq            m7, m5
+    punpcklqdq           m2, m7 ; 2
+    movd                 m3, [srcq+strideq*1+0]
+    movd                 m7, [srcq+strideq*1+1]
+    punpckldq            m3, m7
+    movd                 m7, [srcq+strideq*1+2]
+    movd                 m5, [srcq+strideq*1+3]
+    lea                srcq, [srcq+strideq*2]
+    punpckldq            m7, m5
+    punpcklqdq           m3, m7 ; 3
+ %endif
+%endif
+    PMADDUBSW            m0, m4, m5, m7, 1 ; subpel_filters + 2
+    PMADDUBSW            m1, m4, m5, m7, 0
+    PMADDUBSW            m2, m4, m5, m7, 0
+    PMADDUBSW            m3, m4, m5, m7, 0
+    PHADDW               m0, m1, m14, ARCH_X86_32
+    PHADDW               m2, m3, m14, 0
+    PMULHRSW_8192        m0, m0, m6
+    PMULHRSW_8192        m2, m2, m6
     mova        [tmpq+16*0], m0
     mova        [tmpq+16*1], m2
     add                tmpq, 32
@@ -2703,55 +2995,41 @@
     jg .h_w4_loop
     RET
     ;
-%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3]
-%if ARCH_X86_32
-    pshufb               %2, %1, [base+subpel_h_shufB]
-    pshufb               %3, %1, [base+subpel_h_shufC]
-    pshufb               %1,     [base+subpel_h_shufA]
-%else
-    pshufb               %2, %1, m11; subpel_h_shufB
-    pshufb               %3, %1, m9 ; subpel_h_shufC
-    pshufb               %1, m10    ; subpel_h_shufA
-%endif
-    pmaddubsw            %4, %2, m5  ; subpel +0 B0
-    pmaddubsw            %2, m6      ; subpel +4 B4
-    pmaddubsw            %3, m6      ; subpel +4 C4
-    pmaddubsw            %1, m5      ; subpel +0 A0
-    paddw                %3, %4
-    paddw                %1, %2
-    phaddw               %1, %3
-    pmulhrsw             %1, m7      ; 8192
-%endmacro
-    ;
 .h_w8:
 %if ARCH_X86_32
     mov                  r3, r2
-    %define        base_reg  r3
+ %define           base_reg  r3
     W32_RESTORE_SSQ
 %endif
 .h_w8_loop:
-    movu                 m0,     [srcq+strideq*0]
-    movu                 m1,     [srcq+strideq*1]
-    lea                srcq,     [srcq+strideq*2]
-    PREP_8TAP_H          m0, m2, m3, m4
-    PREP_8TAP_H          m1, m2, m3, m4
+%if cpuflag(ssse3)
+    PREP_8TAP_H           0, srcq+strideq*0
+    PREP_8TAP_H           1, srcq+strideq*1
     mova        [tmpq+16*0], m0
     mova        [tmpq+16*1], m1
+    lea                srcq, [srcq+strideq*2]
     add                tmpq, 32
     sub                  hd, 2
+%else
+    PREP_8TAP_H           0, srcq
+    mova             [tmpq], m0
+    add                srcq, strideq
+    add                tmpq, 16
+    dec                  hd
+%endif
     jg .h_w8_loop
     RET
 .h_w16:
-    xor                 r6d, r6d
+    mov                  r6, -16*1
     jmp .h_start
 .h_w32:
-    mov                  r6, -16*1
+    mov                  r6, -16*2
     jmp .h_start
 .h_w64:
-    mov                  r6, -16*3
+    mov                  r6, -16*4
     jmp .h_start
 .h_w128:
-    mov                  r6, -16*7
+    mov                  r6, -16*8
 .h_start:
 %if ARCH_X86_32
     mov                  r3, r2
@@ -2761,15 +3039,20 @@
     mov                  r5, r6
     W32_RESTORE_SSQ
 .h_loop:
-    movu                 m0,     [srcq+r6+8*0]
-    movu                 m1,     [srcq+r6+8*1]
-    PREP_8TAP_H          m0, m2, m3, m4
-    PREP_8TAP_H          m1, m2, m3, m4
+%if cpuflag(ssse3)
+    PREP_8TAP_H           0, srcq+r6+8*0
+    PREP_8TAP_H           1, srcq+r6+8*1
     mova        [tmpq+16*0], m0
     mova        [tmpq+16*1], m1
     add                tmpq, 32
     add                  r6, 16
-    jle .h_loop
+%else
+    PREP_8TAP_H           0, srcq+r6
+    mova             [tmpq], m0
+    add                tmpq, 16
+    add                  r6, 8
+%endif
+    jl .h_loop
     add                srcq, strideq
     mov                  r6, r5
     dec                  hd
@@ -2778,8 +3061,9 @@
 %if ARCH_X86_32
  %define            base_reg r2
 %endif
-
+    ;
 .v:
+    LEA            base_reg, prep%+SUFFIX
 %if ARCH_X86_32
     mov                 mxd, myd
     and                 mxd, 0x7f
@@ -2791,10 +3075,12 @@
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    lea                 myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    lea                 myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
     mova                 m2, [base+pw_512]
     psrlw                m2, m2, 1 ; 0x0100
     mova                 m7, [base+pw_8192]
+%endif
 %if ARCH_X86_32
  %define            subpel0  [rsp+mmsize*0]
  %define            subpel1  [rsp+mmsize*1]
@@ -2801,20 +3087,28 @@
  %define            subpel2  [rsp+mmsize*2]
  %define            subpel3  [rsp+mmsize*3]
 %assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+ %if cpuflag(ssse3)
     ALLOC_STACK   -mmsize*4
+ %else
+    ALLOC_STACK   -mmsize*5
+ %endif
 %assign regs_used 7
     movd                 m0, [myq+0]
-    pshufb               m0, m2
+    PSHUFB_0X1X          m0, m2
     mova            subpel0, m0
     movd                 m0, [myq+2]
-    pshufb               m0, m2
+    PSHUFB_0X1X          m0, m2
     mova            subpel1, m0
     movd                 m0, [myq+4]
-    pshufb               m0, m2
+    PSHUFB_0X1X          m0, m2
     mova            subpel2, m0
     movd                 m0, [myq+6]
-    pshufb               m0, m2
+    PSHUFB_0X1X          m0, m2
     mova            subpel3, m0
+ %if notcpuflag(ssse3)
+    mov                  r6, base_reg
+  %define base_reg r6
+ %endif
     mov             strideq, [rstk+stack_offset+gprsize*3]
     lea             strideq, [strideq*3]
     sub [rstk+stack_offset+gprsize*2], strideq
@@ -2826,25 +3120,30 @@
  %define            subpel2  m10
  %define            subpel3  m11
     movd            subpel0, [myq+0]
-    pshufb          subpel0, m2
+    PSHUFB_0X1X     subpel0, m2
     movd            subpel1, [myq+2]
-    pshufb          subpel1, m2
+    PSHUFB_0X1X     subpel1, m2
     movd            subpel2, [myq+4]
-    pshufb          subpel2, m2
+    PSHUFB_0X1X     subpel2, m2
     movd            subpel3, [myq+6]
-    pshufb          subpel3, m2
+    PSHUFB_0X1X     subpel3, m2
     lea            stride3q, [strideq*3]
     sub                srcq, stride3q
     cmp                  wd, 8
-    jg .v_w16
-    je .v_w8
+    jns .v_w8
 %endif
 .v_w4:
-%if ARCH_X86_32
-%if STACK_ALIGNMENT < mmsize
- %define               srcm [rsp+mmsize*4+gprsize*1]
- %define               tmpm [rsp+mmsize*4+gprsize*2]
+%if notcpuflag(ssse3)
+    pxor                 m6, m6
+ %if ARCH_X86_64
+    mova                 m7, [base+pw_2]
+ %endif
 %endif
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < mmsize
+  %define srcm [esp+stack_size+gprsize*1]
+  %define tmpm [esp+stack_size+gprsize*2]
+ %endif
     mov                tmpm, tmpq
     mov                srcm, srcq
     lea                 r5d, [wq - 4] ; horizontal loop
@@ -2877,17 +3176,30 @@
 %endif
     punpckldq            m3, m1           ; 4 5 _ _
     punpckldq            m1, m0           ; 5 6 _ _
-    palignr              m4, m3, m2, 4    ; 1 2 3 4
+    PALIGNR              m4, m3, m2, 4    ; 1 2 3 4
     punpcklbw            m3, m1           ; 45 56
     punpcklbw            m1, m2, m4       ; 01 12
     punpckhbw            m2, m4           ; 23 34
 .v_w4_loop:
-    pmaddubsw            m5, m1, subpel0  ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+    mova                 m7, subpel0
+ %define subpel0 m7
+%endif
+    mova                 m5, m1
+    PMADDUBSW            m5, subpel0, m6, m4, 0  ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+    mova                 m7, subpel1
+ %define subpel1 m7
+%endif
     mova                 m1, m2
-    pmaddubsw            m2, subpel1      ; a1 b1
+    PMADDUBSW            m2, subpel1, m6, m4, 0  ; a1 b1
     paddw                m5, m2
+%if ARCH_X86_32 && notcpuflag(ssse3)
+    mova                 m7, subpel2
+ %define subpel2 m7
+%endif
     mova                 m2, m3
-    pmaddubsw            m3, subpel2      ; a2 b2
+    PMADDUBSW            m3, subpel2, m6, m4, 0  ; a2 b2
     paddw                m5, m3
     movd                 m4, [srcq+strideq*0]
     punpckldq            m3, m0, m4       ; 6 7 _ _
@@ -2895,9 +3207,27 @@
     lea                srcq, [srcq+strideq*2]
     punpckldq            m4, m0           ; 7 8 _ _
     punpcklbw            m3, m4           ; 67 78
-    pmaddubsw            m4, m3, subpel3  ; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m12, m0
+ %else
+    mova     [esp+mmsize*4], m0
+    mova                 m7, subpel3
+  %define subpel3 m7
+ %endif
+%endif
+    mova                 m4, m3
+    PMADDUBSW            m4, subpel3, m6, m0, 0  ; a3 b3
     paddw                m5, m4
-    pmulhrsw             m5, m7
+%if ARCH_X86_64 || cpuflag(ssse3)
+ %if notcpuflag(ssse3)
+    SWAP                 m0, m12
+ %endif
+    PMULHRSW_8192        m5, m5, m7
+%else
+    mova                 m0, [esp+mmsize*4]
+    PMULHRSW_8192        m5, m5, [base+pw_2]
+%endif
     movq        [tmpq+wq*0], m5
     movhps      [tmpq+wq*2], m5
     lea                tmpq, [tmpq+wq*4]
@@ -2915,10 +3245,12 @@
     jg .v_w4_loop0
 %endif
     RET
-
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ %define base_reg r2
+%endif
+    ;
 %if ARCH_X86_64
 .v_w8:
-.v_w16:
     lea                 r5d, [wq - 8] ; horizontal loop
     mov                  r8, tmpq
     mov                  r6, srcq
@@ -2925,16 +3257,16 @@
     shl                 r5d, 8 - 3; (wq / 8) << 8
     mov                 r5b, hb
 .v_w8_loop0:
-    movq                 m4, [srcq+strideq*0]   ; 0
-    movq                 m5, [srcq+strideq*1]   ; 1
+    movq                 m4, [srcq+strideq*0]
+    movq                 m5, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    movq                 m6, [srcq+strideq*0]   ; 2
-    movq                 m0, [srcq+strideq*1]   ; 3
+    movq                 m6, [srcq+strideq*0]
+    movq                 m0, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    movq                 m1, [srcq+strideq*0]   ; 4
-    movq                 m2, [srcq+strideq*1]   ; 5
-    lea                srcq, [srcq+strideq*2]   ;
-    movq                 m3, [srcq+strideq*0]   ; 6
+    movq                 m1, [srcq+strideq*0]
+    movq                 m2, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movq                 m3, [srcq+strideq*0]
     shufpd               m4, m0, 0x0c
     shufpd               m5, m1, 0x0c
     punpcklbw            m1, m4, m5 ; 01
@@ -2946,9 +3278,10 @@
     punpcklbw            m3, m6, m0 ; 23
     punpckhbw            m6, m0     ; 56
 .v_w8_loop:
-    movq                m12, [srcq+strideq*1]   ; 8
+%if cpuflag(ssse3)
+    movq                m12, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    movq                m13, [srcq+strideq*0]   ; 9
+    movq                m13, [srcq+strideq*0]
     pmaddubsw           m14, m1, subpel0 ; a0
     pmaddubsw           m15, m2, subpel0 ; b0
     mova                 m1, m3
@@ -2973,8 +3306,43 @@
     paddw               m15, m13
     pmulhrsw            m14, m7
     pmulhrsw            m15, m7
-    movu        [tmpq+wq*0], xm14
-    movu        [tmpq+wq*2], xm15
+    movu        [tmpq+wq*0], m14
+    movu        [tmpq+wq*2], m15
+%else
+    mova                m14, m1
+    PMADDUBSW           m14, subpel0, m7, m12, 1 ; a0
+    mova                 m1, m3
+    PMADDUBSW            m3, subpel1, m7, m12, 0 ; a1
+    paddw               m14, m3
+    mova                 m3, m5
+    PMADDUBSW            m5, subpel2, m7, m12, 0 ; a2
+    paddw               m14, m5
+    movq                m12, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movq                m13, [srcq+strideq*0]
+    shufpd              m15, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m15, m0  ; 67
+    punpckhbw           m15, m0       ; 78
+    mova                m13, m5
+    PMADDUBSW           m13, subpel3, m7, m12, 0 ; a3
+    paddw               m14, m13
+    PMULHRSW_8192       m14, m14, [base+pw_2]
+    movu        [tmpq+wq*0], m14
+    mova                m14, m2
+    PMADDUBSW           m14, subpel0, m7, m12, 0 ; b0
+    mova                 m2, m4
+    PMADDUBSW            m4, subpel1, m7, m12, 0 ; b1
+    paddw               m14, m4
+    mova                 m4, m6
+    PMADDUBSW            m6, subpel2, m7, m12, 0 ; b2
+    paddw               m14, m6
+    mova                 m6, m15
+    PMADDUBSW           m15, subpel3, m7, m12, 0 ; b3
+    paddw               m14, m15
+    PMULHRSW_8192       m14, m14, [base+pw_2]
+    movu        [tmpq+wq*2], m14
+%endif
     lea                tmpq, [tmpq+wq*4]
     sub                  hd, 2
     jg .v_w8_loop
@@ -2991,13 +3359,13 @@
 %undef subpel1
 %undef subpel2
 %undef subpel3
-
+    ;
 .hv:
     %assign stack_offset org_stack_offset
     cmp                  wd, 4
     jg .hv_w8
     and                 mxd, 0x7f
-    movd                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+    movd                 m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
 %if ARCH_X86_32
     mov                 mxd, myd
     shr                 myd, 16
@@ -3004,7 +3372,7 @@
     and                 mxd, 0x7f
     cmp                  hd, 6
     cmovs               myd, mxd
-    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
     mov                  r5, r2; use as new base
  %define           base_reg  r5
  %assign regs_used 2
@@ -3020,7 +3388,7 @@
  %define           subpelv2  [rsp+mmsize*2]
  %define           subpelv3  [rsp+mmsize*3]
     punpcklbw            m0, m0
-    psraw                m0, 8 ; sign-extend
+    psraw                m0, 8
     pshufd               m6, m0, q0000
     mova           subpelv0, m6
     pshufd               m6, m0, q1111
@@ -3034,8 +3402,12 @@
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
     ALLOC_STACK   mmsize*14, 14
+ %else
+    ALLOC_STACK   mmsize*14, 16
+ %endif
     lea            stride3q, [strideq*3]
     sub                srcq, stride3q
     dec                srcq
@@ -3044,8 +3416,12 @@
  %define           subpelv2  m12
  %define           subpelv3  m13
     punpcklbw            m0, m0
-    psraw                m0, 8 ; sign-extend
+    psraw                m0, 8
+ %if cpuflag(ssse3)
     mova                 m8, [base+pw_8192]
+ %else
+    mova                 m8, [base+pw_2]
+ %endif
     mova                 m9, [base+pd_32]
     pshufd              m10, m0, q0000
     pshufd              m11, m0, q1111
@@ -3053,7 +3429,10 @@
     pshufd              m13, m0, q3333
 %endif
     pshufd               m7, m1, q0000
-.hv_w4:
+%if notcpuflag(ssse3)
+    punpcklbw            m7, m7
+    psraw                m7, 8
+%endif
 %define hv4_line_0_0 4
 %define hv4_line_0_1 5
 %define hv4_line_0_2 6
@@ -3064,10 +3443,12 @@
 %define hv4_line_1_1 11
 %define hv4_line_1_2 12
 %define hv4_line_1_3 13
-    ;
-    ;
 %if ARCH_X86_32
- %define           w8192reg  [base+pw_8192]
+ %if cpuflag(ssse3)
+  %define           w8192reg  [base+pw_8192]
+ %else
+  %define           w8192reg  [base+pw_2]
+ %endif
  %define             d32reg  [base+pd_32]
 %else
  %define           w8192reg  m8
@@ -3074,7 +3455,15 @@
  %define             d32reg  m9
 %endif
     ; lower shuffle 0 1 2 3 4
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4]
+%else
+ %if ARCH_X86_64
+    mova                m15, [pw_1]
+ %else
+  %define               m15 m1
+ %endif
+%endif
     movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
     movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
     movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
@@ -3087,43 +3476,61 @@
     movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
     lea                srcq, [srcq+strideq*4]
 %endif
-    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
-    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
-    pmaddubsw            m2, m7 ;H subpel_filters
-    pmaddubsw            m0, m7 ;H subpel_filters
-    phaddw               m2, m0 ;H 0 1 2 3
-    pmulhrsw             m2, w8192reg ;H pw_8192
+    PSHUFB_SUBPEL_H_4a   m2, m5, m6, m1, m3, 1    ;H subpel_h_shuf4 0~1~
+    PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m3, 0    ;H subpel_h_shuf4 2~3~
+    PMADDUBSW            m2, m7, m1, m3, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m3, 0        ;H subpel_filters
+    PHADDW               m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+    PMULHRSW_8192        m2, m2, w8192reg
     SAVELINE_W4          m2, 2, 0
     ; upper shuffle 2 3 4 5 6
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4+16]
-    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
-    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
-    pmaddubsw            m2, m7 ;H subpel_filters
-    pmaddubsw            m0, m7 ;H subpel_filters
-    phaddw               m2, m0 ;H 0 1 2 3
-    pmulhrsw             m2, w8192reg ;H pw_8192
-    ;
+%endif
+    PSHUFB_SUBPEL_H_4b   m2, m5, m6, m1, m3, 0    ;H subpel_h_shuf4 0~1~
+    PSHUFB_SUBPEL_H_4b   m0, m4, m6, m1, m3, 0    ;H subpel_h_shuf4 2~3~
+    PMADDUBSW            m2, m7, m1, m3, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m3, 0        ;H subpel_filters
+    PHADDW               m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+    PMULHRSW_8192        m2, m2, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m14, m2
+ %else
+    mova     [esp+mmsize*4], m2
+ %endif
+%endif
     ; lower shuffle
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4]
+%endif
     movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
     movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
     movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
-    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
-    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
-    pmaddubsw            m3, m7 ;H subpel_filters
-    pmaddubsw            m0, m7 ;H subpel_filters
-    phaddw               m3, m0 ;H 4 5 6 7
-    pmulhrsw             m3, w8192reg ;H pw_8192
+    PSHUFB_SUBPEL_H_4a   m3, m5, m6, m1, m2, 0    ;H subpel_h_shuf4 4~5~
+    PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m2, 0    ;H subpel_h_shuf4 6~6~
+    PMADDUBSW            m3, m7, m1, m2, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m2, 0        ;H subpel_filters
+    PHADDW               m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+    PMULHRSW_8192        m3, m3, w8192reg
     SAVELINE_W4          m3, 3, 0
     ; upper shuffle
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4+16]
-    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
-    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
-    pmaddubsw            m3, m7 ;H subpel_filters
-    pmaddubsw            m0, m7 ;H subpel_filters
-    phaddw               m3, m0 ;H 4 5 6 7
-    pmulhrsw             m3, w8192reg ;H pw_8192
-    ;
+%endif
+    PSHUFB_SUBPEL_H_4b   m3, m5, m6, m1, m2, 0    ;H subpel_h_shuf4 4~5~
+    PSHUFB_SUBPEL_H_4b   m0, m4, m6, m1, m2, 0    ;H subpel_h_shuf4 6~6~
+    PMADDUBSW            m3, m7, m1, m2, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m2, 0        ;H subpel_filters
+    PHADDW               m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+    PMULHRSW_8192        m3, m3, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m2, m14
+ %else
+    mova                 m2, [esp+mmsize*4]
+ %endif
+%endif
 %if ARCH_X86_32
     lea                srcq, [srcq+strideq*2]
     add                srcq, strideq
@@ -3131,7 +3538,7 @@
     add                srcq, stride3q
 %endif
     ;process high
-    palignr              m4, m3, m2, 4;V 1 2 3 4
+    PALIGNR              m4, m3, m2, 4;V 1 2 3 4
     punpcklwd            m1, m2, m4  ; V 01 12
     punpckhwd            m2, m4      ; V 23 34
     pshufd               m0, m3, q2121;V 5 6 5 6
@@ -3143,7 +3550,7 @@
     ;process low
     RESTORELINE_W4       m2, 2, 0
     RESTORELINE_W4       m3, 3, 0
-    palignr              m4, m3, m2, 4;V 1 2 3 4
+    PALIGNR              m4, m3, m2, 4;V 1 2 3 4
     punpcklwd            m1, m2, m4  ; V 01 12
     punpckhwd            m2, m4      ; V 23 34
     pshufd               m0, m3, q2121;V 5 6 5 6
@@ -3157,18 +3564,35 @@
     mova                 m2, m3
     pmaddwd              m3, subpelv2; V a2 b2
     paddd                m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m14, m5
+ %else
+    mova     [esp+mmsize*4], m5
+  %define m15 m3
+ %endif
+%endif
     ;
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4]
+%endif
     movq                 m4, [srcq+strideq*0] ; 7
     movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
-    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
-    pmaddubsw            m4, m7 ;H subpel_filters
-    phaddw               m4, m4 ;H                7 8 7 8
-    pmulhrsw             m4, w8192reg ;H pw_8192
-    palignr              m3, m4, m0, 12         ; 6 7 8 7
+    PSHUFB_SUBPEL_H_4a   m4, m4, m6, m3, m5, 0    ; H subpel_h_shuf4 7~8~
+    PMADDUBSW            m4, m7, m3, m5, 1        ; H subpel_filters
+    PHADDW               m4, m4, m15, ARCH_X86_32 ; H                7878
+    PMULHRSW_8192        m4, m4, w8192reg
+    PALIGNR              m3, m4, m0, 12, m5       ;                  6787
     mova                 m0, m4
     punpcklwd            m3, m4      ; 67 78
     pmaddwd              m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m5, m14
+ %else
+    mova                 m5, [esp+mmsize*4]
+ %endif
+%endif
     paddd                m5, d32reg ; pd_32
     paddd                m5, m4
     psrad                m5, 6
@@ -3189,18 +3613,34 @@
     mova                 m2, m3
     pmaddwd              m3, subpelv2; V a2 b2
     paddd                m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m14, m5
+ %else
+    mova         [esp+0xA0], m5
+ %endif
+%endif
     ;
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4+16]
+%endif
     movq                 m4, [srcq+strideq*0] ; 7
     movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
-    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
-    pmaddubsw            m4, m7 ;H subpel_filters
-    phaddw               m4, m4 ;H                7 8 7 8
-    pmulhrsw             m4, w8192reg ;H pw_8192
-    palignr              m3, m4, m0, 12         ; 6 7 8 7
+    PSHUFB_SUBPEL_H_4b   m4, m4, m6, m3, m5, 0    ; H subpel_h_shuf4 7~8~
+    PMADDUBSW            m4, m7, m3, m5, 1        ; H subpel_filters
+    PHADDW               m4, m4, m15, ARCH_X86_32 ; H                7878
+    PMULHRSW_8192        m4, m4, w8192reg
+    PALIGNR              m3, m4, m0, 12, m5       ;                  6787
     mova                 m0, m4
     punpcklwd            m3, m4      ; 67 78
     pmaddwd              m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m5, m14
+ %else
+    mova                 m5, [esp+0xA0]
+ %endif
+%endif
     paddd                m5, d32reg ; pd_32
     paddd                m5, m4
     psrad                m4, m5, 6
@@ -3227,8 +3667,6 @@
 %undef subpelv2
 %undef subpelv3
     ;
-
-
 .hv_w8:
     %assign stack_offset org_stack_offset
 %define hv8_line_1 0
@@ -3247,27 +3685,35 @@
  %define           subpelv3  [rsp+mmsize*10]
  %define             accuv0  [rsp+mmsize*11]
  %define             accuv1  [rsp+mmsize*12]
-    movq                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+    movq                 m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
     mov                 mxd, myd
     shr                 myd, 16
     and                 mxd, 0x7f
     cmp                  hd, 6
     cmovs               myd, mxd
-    movq                 m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    movq                 m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
     ALLOC_STACK  -mmsize*13
-%if STACK_ALIGNMENT < mmsize
+ %if STACK_ALIGNMENT < mmsize
     mov                rstk, r2m
- %define               tmpm  [rsp+mmsize*13+gprsize*1]
- %define               srcm  [rsp+mmsize*13+gprsize*2]
- %define            stridem  [rsp+mmsize*13+gprsize*3]
+  %define               tmpm  [rsp+mmsize*13+gprsize*1]
+  %define               srcm  [rsp+mmsize*13+gprsize*2]
+  %define            stridem  [rsp+mmsize*13+gprsize*3]
     mov             stridem, rstk
-%endif
+ %endif
     mov                  r6, r2
-%define base_reg r6
+ %define base_reg r6
     pshufd               m0, m1, q0000
     pshufd               m1, m1, q1111
     punpcklbw            m5, m5
-    psraw                m5, 8 ; sign-extend
+ %if notcpuflag(ssse3)
+    punpcklbw            m0, m0
+    punpcklbw            m1, m1
+ %endif
+    psraw                m5, 8
+ %if notcpuflag(ssse3)
+    psraw                m0, 8
+    psraw                m1, 8
+ %endif
     pshufd               m2, m5, q0000
     pshufd               m3, m5, q1111
     pshufd               m4, m5, q2222
@@ -3294,20 +3740,31 @@
  %define           subpelv3  m15
  %define             accuv0  m8
  %define             accuv1  m9
-    movq                 m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+    movq                 m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    movq                 m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    movq                 m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
     pshufd         subpelh0, m0, q0000
     pshufd         subpelh1, m0, q1111
     punpcklbw            m1, m1
-    psraw                m1, 8 ; sign-extend
+ %if notcpuflag(ssse3)
+    punpcklbw      subpelh0, subpelh0
+    punpcklbw      subpelh1, subpelh1
+ %endif
+    psraw                m1, 8
+ %if notcpuflag(ssse3)
+    psraw          subpelh0, 8
+    psraw          subpelh1, 8
+ %endif
     pshufd         subpelv0, m1, q0000
     pshufd         subpelv1, m1, q1111
     pshufd         subpelv2, m1, q2222
     pshufd         subpelv3, m1, q3333
+ %if notcpuflag(ssse3)
+    mova                 m7, [base+pw_2]
+ %endif
     lea                stride3q, [strideq*3]
     sub                srcq, 3
     sub                srcq, stride3q
@@ -3322,57 +3779,89 @@
     shl                 r5d, (16 - 2)
     mov                 r5w, hw
 .hv_w8_loop0:
-    movu                 m4, [srcq+strideq*0] ; 0 = _ _
-    movu                 m5, [srcq+strideq*1] ; 1 = _ _
-    lea                srcq, [srcq+strideq*2]
-%if ARCH_X86_64
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
     mova                 m7, [base+subpel_h_shufA]
     mova                 m8, [base+subpel_h_shufB]
     mova                 m9, [base+subpel_h_shufC]
+  %define shufA m7
+  %define shufB m8
+  %define shufC m9
+ %else
+  %define shufA [base+subpel_h_shufA]
+  %define shufB [base+subpel_h_shufB]
+  %define shufC [base+subpel_h_shufC]
+ %endif
 %endif
-    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
-    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
-    movu                 m6, [srcq+strideq*0] ; 2 = _ _
-    movu                 m0, [srcq+strideq*1] ; 3 = _ _
+    PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
+    PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
     lea                srcq, [srcq+strideq*2]
-    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
-    HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
-    ;
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m9, m4
+ %else
+    mova              [esp], m4
+ %endif
+%endif
+    PREP_8TAP_HV         m6, srcq+strideq*0, m7, m4
+    PREP_8TAP_HV         m0, srcq+strideq*1, m7, m4
+    lea                srcq, [srcq+strideq*2]
+%if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
-    pmulhrsw             m4, m7 ; H pw_8192
-    pmulhrsw             m5, m7 ; H pw_8192
-    pmulhrsw             m6, m7 ; H pw_8192
-    pmulhrsw             m0, m7 ; H pw_8192
-    punpcklwd            m1, m4, m5  ; 0 1 ~
-    punpcklwd            m2, m5, m6  ; 1 2 ~
-    punpcklwd            m3, m6, m0  ; 2 3 ~
+%else
+    mova                 m7, [base+pw_2]
+ %if ARCH_X86_64
+    SWAP                 m4, m9
+ %else
+    mova                 m4, [esp]
+ %endif
+%endif
+    PMULHRSW_8192        m4, m4, m7
+    PMULHRSW_8192        m5, m5, m7
+    PMULHRSW_8192        m6, m6, m7
+    PMULHRSW_8192        m0, m0, m7
+    punpcklwd            m1, m4, m5 ; 01
+    punpcklwd            m2, m5, m6 ; 12
+    punpcklwd            m3, m6, m0 ; 23
     SAVELINE_W8           1, m1
     SAVELINE_W8           2, m2
     SAVELINE_W8           3, m3
-    ;
+%if cpuflag(ssse3)
     mova                 m7, [base+subpel_h_shufA]
-    movu                 m4, [srcq+strideq*0]       ; 4 = _ _
-    movu                 m5, [srcq+strideq*1]       ; 5 = _ _
+%else
+ %if ARCH_X86_64
+    SWAP                 m8, m7
+    SWAP                 m9, m0
+ %else
+    mova         [esp+0x30], m0
+ %endif
+%endif
+    PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
+    PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
+    PREP_8TAP_HV         m6, srcq+strideq*2, m7, m0
     lea                srcq, [srcq+strideq*2]
-    movu                 m6, [srcq+strideq*0]       ; 6 = _ _
-    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
-    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
-    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+%if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
-    pmulhrsw             m1, m4, m7 ; H pw_8192 4 ~
-    pmulhrsw             m2, m5, m7 ; H pw_8192 5 ~
-    pmulhrsw             m3, m6, m7 ; H pw_8192 6 ~
-    punpcklwd            m4, m0, m1  ; 3 4 ~
-    punpcklwd            m5, m1, m2  ; 4 5 ~
-    punpcklwd            m6, m2, m3  ; 5 6 ~
-    ;
+%else
+ %if ARCH_X86_64
+    SWAP                 m0, m9
+    SWAP                 m7, m8
+ %else
+    mova                 m0, [esp+0x30]
+    mova                 m7, [base+pw_2]
+ %endif
+%endif
+    PMULHRSW_8192        m1, m4, m7
+    PMULHRSW_8192        m2, m5, m7
+    PMULHRSW_8192        m3, m6, m7
+    punpcklwd            m4, m0, m1 ; 34
+    punpcklwd            m5, m1, m2 ; 45
+    punpcklwd            m6, m2, m3 ; 56
     SAVELINE_W8           6, m3
     RESTORELINE_W8        1, m1
     RESTORELINE_W8        2, m2
     RESTORELINE_W8        3, m3
 .hv_w8_loop:
-    ; m8 accu for V a
-    ; m9 accu for V b
     SAVELINE_W8           1, m3
     SAVELINE_W8           2, m4
     SAVELINE_W8           3, m5
@@ -3389,46 +3878,53 @@
     paddd                m0, m5
     paddd                m7, m6
     mova                 m5, [base+pd_32]
-    paddd                m0, m5 ;   pd_512
-    paddd                m7, m5 ;   pd_512
+    paddd                m0, m5
+    paddd                m7, m5
     mova             accuv0, m0
     mova             accuv1, m7
 %else
-    pmaddwd              m8, m1, subpelv0 ; a0
-    pmaddwd              m9, m2, subpelv0 ; b0
+    pmaddwd          accuv0, m1, subpelv0 ; a0
+    pmaddwd          accuv1, m2, subpelv0 ; b0
     pmaddwd              m3, subpelv1     ; a1
     pmaddwd              m4, subpelv1     ; b1
-    paddd                m8, m3
-    paddd                m9, m4
+    paddd            accuv0, m3
+    paddd            accuv1, m4
     pmaddwd              m5, subpelv2     ; a2
     pmaddwd              m6, subpelv2     ; b2
-    paddd                m8, m5
-    paddd                m9, m6
+    paddd            accuv0, m5
+    paddd            accuv1, m6
     mova                 m7, [base+pd_32]
-    paddd                m8, m7 ;   pd_512
-    paddd                m9, m7 ;   pd_512
+    paddd            accuv0, m7
+    paddd            accuv1, m7
+ %if cpuflag(ssse3)
     mova                 m7, [base+subpel_h_shufB]
     mova                 m6, [base+subpel_h_shufC]
     mova                 m5, [base+subpel_h_shufA]
+  %define shufA m5
+  %define shufB m7
+  %define shufC m6
+ %endif
 %endif
-    movu                 m0, [srcq+strideq*1] ; 7
-    movu                 m4, [srcq+strideq*2] ; 8
+    PREP_8TAP_HV         m0, srcq+strideq*1, m5, m6
+    PREP_8TAP_HV         m4, srcq+strideq*2, m5, m6
     lea                srcq, [srcq+strideq*2]
-    HV_H_W8              m0, m1, m2, m3, m5, m7, m6
-    HV_H_W8              m4, m1, m2, m3, m5, m7, m6
+%if cpuflag(ssse3)
     mova                 m5, [base+pw_8192]
-    pmulhrsw             m0, m5 ; H pw_8192
-    pmulhrsw             m4, m5 ; H pw_8192
+%else
+    mova                 m5, [base+pw_2]
+%endif
+    PMULHRSW_8192        m0, m0, m5
+    PMULHRSW_8192        m4, m4, m5
     RESTORELINE_W8        6, m6
-    punpcklwd            m5, m6, m0  ; 6 7  ~
-    punpcklwd            m6, m0, m4  ; 7 8 ~
+    punpcklwd            m5, m6, m0 ; 67
+    punpcklwd            m6, m0, m4 ; 78
     pmaddwd              m1, m5, subpelv3 ; a3
     paddd                m2, m1, accuv0
     pmaddwd              m1, m6, subpelv3 ; b3
-    paddd                m1, m1, accuv1 ; H + V
+    paddd                m1, m1, accuv1
     psrad                m2, 6
     psrad                m1, 6
-    packssdw             m2, m1      ; d -> w
+    packssdw             m2, m1
     movq        [tmpq+wq*0], m2
     movhps      [tmpq+wq*2], m2
     lea                tmpq, [tmpq+wq*4]
@@ -3457,6 +3953,7 @@
     sub                 r5d, 1<<16
     jg .hv_w8_loop0
     RET
+%endmacro
 
 %if ARCH_X86_32
  %macro SAVE_ALPHA_BETA 0
@@ -5423,6 +5920,7 @@
 
 INIT_XMM ssse3
 PREP_BILIN
+PREP_8TAP
 WARP_AFFINE_8X8
 WARP_AFFINE_8X8T
 
@@ -5432,5 +5930,6 @@
 
 INIT_XMM sse2
 PREP_BILIN
+PREP_8TAP
 WARP_AFFINE_8X8
 WARP_AFFINE_8X8T