ref: a12ba9c94d9eddc0f67dea83810e0775db346e27
parent: e94dafeaf7c82fb1109909a7b4dd0a9219f5a126
author: Francois Cartegnie <[email protected]>
date: Fri Jan 4 13:01:13 EST 2019
add SSSE3 put_8tap
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -29,14 +29,23 @@
#include "src/mc.h"
decl_mc_fn(dav1d_put_8tap_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_ssse3); /* the nine *_ssse3 put_8tap symbols declared in this list are presumably the stubs emitted by PUT_8TAP_FN in mc_ssse3.asm */
decl_mc_fn(dav1d_put_8tap_regular_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_smooth_ssse3);
decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_regular_sharp_ssse3);
decl_mc_fn(dav1d_put_8tap_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_ssse3);
decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_regular_ssse3);
decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_smooth_sharp_ssse3);
decl_mc_fn(dav1d_put_8tap_sharp_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_ssse3);
decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_regular_ssse3);
decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2);
+decl_mc_fn(dav1d_put_8tap_sharp_smooth_ssse3);
decl_mc_fn(dav1d_put_bilin_avx2);
decl_mc_fn(dav1d_put_bilin_ssse3);
@@ -88,6 +97,15 @@
#if BITDEPTH == 8
init_mc_fn (FILTER_2D_BILINEAR, bilin, ssse3);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); /* register the nine SSSE3 put_8tap variants; these lines sit inside the BITDEPTH == 8 guard */
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -46,18 +46,31 @@
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 ; pshufb mask for the 4-tap paths: 4-byte windows for output pixels 0-1 of the row in bytes 0-7 and of the row in bytes 8-15
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 ; second mask (subpel_h_shuf4+16): same layout for output pixels 2-3
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 ; source bytes x+0..x+3 for output pixels x = 0..3
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 ; source bytes x+4..x+7 for x = 0..3, equally x+0..x+3 for x = 4..7
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 ; source bytes x+4..x+7 for output pixels x = 4..7
bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_64: times 16 db 64
+pw_8: times 8 dw 8 ; not referenced in the visible part of this change
+pw_26: times 8 dw 26 ; not referenced in the visible part of this change
+pw_34: times 8 dw 34 ; bias added before psraw 6 in the horizontal-only paths (2 + (8 << 2))
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
pw_6903: times 8 dw 6903
+pw_8192: times 8 dw 8192 ; pmulhrsw multiplier applied to the horizontal intermediates in the hv paths, i.e. (x + 2) >> 2
+pd_512: times 4 dd 512 ; bias added before psrad 10 in the hv paths
pw_258: times 2 dw 258
+cextern mc_subpel_filters ; table defined outside this file; the subpel_filters alias below is biased by -8 bytes (one 8-byte filter row), presumably because the fraction used as index is never 0 on the filtered paths
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
%macro BIDIR_JMP_TABLE 1-*
;evaluated at definition time (in loop below)
%xdefine %1_table (%%table - 2*%2)
@@ -127,6 +140,7 @@
%endif
%endmacro
+HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128 ; jump tables for put_8tap, widths 2..128; put_8tap reads them via table_offset(put, _8tap_h) and table_offset(put, _8tap_v)
HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
@@ -137,13 +151,11 @@
INIT_XMM ssse3
%if ARCH_X86_32
-DECLARE_REG_TMP 1
-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
-%define base t0-put_ssse3
+ DECLARE_REG_TMP 1
+ %define base t0-put_ssse3
%else
-DECLARE_REG_TMP 7
-%define base 0
-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ DECLARE_REG_TMP 7
+ %define base 0
%endif
;
%macro RESTORE_DSQ_32 1
@@ -152,6 +164,7 @@
%endif
%endmacro
;
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
movifnidn mxyd, r6m ; mx
LEA t0, put_ssse3
tzcnt wd, wm
@@ -1299,6 +1312,1117 @@
lea t2d, [hq+(7<<16)]
mov t0d, 256
jmp .hv_w16_start
+
+; int8_t subpel_filters[5][15][8]; each FILTER_* packs two row bases: bits 16+ = set used by the 8-tap paths, low byte = set used by the 4-tap paths (w <= 4 horizontally, h <= 4 vertically)
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2 ; t0/t1 carry FILTER_h/FILTER_v from the put_8tap_* stubs into put_8tap
+%elif WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%macro PUT_8TAP_FN 3 ; type, type_h, type_v
+cglobal put_8tap_%1 ; entry stub: only selects the filter pair, the work is done in put_8tap
+ mov t0d, FILTER_%2 ; horizontal filter selector
+ mov t1d, FILTER_%3 ; vertical filter selector
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
+%endif
+%endmacro
+
+PUT_8TAP_FN regular, REGULAR, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH ; must stay last: this stub has no jmp and falls through into put_8tap
+
+%if ARCH_X86_32 ; constants are addressed relative to put_ssse3 through base_reg; ds/ss are reloaded from dsm/ssm where needed
+ %define base_reg r1
+ %define base base_reg-put_ssse3
+ %define W32_RESTORE_DSQ mov dsq, dsm
+ %define W32_RESTORE_SSQ mov ssq, ssm
+%else ; 64-bit: base is 0 and the W32_RESTORE_* helpers expand to nothing
+ %define base_reg r8
+ %define base 0
+ %define W32_RESTORE_DSQ
+ %define W32_RESTORE_SSQ
+%endif
+
+cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 ; shared body of all put_8tap_* stubs; expects FILTER_h in t0d and FILTER_v in t1d
+%assign org_stack_offset stack_offset ; saved so that each path can rewind stack_offset before its own allocation
+ imul mxd, mxm, 0x010101 ; copy mx into bytes 0, 1 and 2
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+%if ARCH_X86_64
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+%else
+ imul ssd, mym, 0x010101 ; x86-32: the packed my value is kept in the ss register
+ add ssd, t1d ; 8tap_v, my, 4tap_v
+ mov srcq, srcm
+%endif
+ mov wd, wm
+ movifnidn hd, hm
+ LEA base_reg, put_ssse3
+ test mxd, 0xf00 ; byte 1 holds the bare mx: non-zero low 4 bits mean horizontal filtering
+ jnz .h
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00 ; same test for my
+%endif
+ jnz .v
+ tzcnt wd, wd ; neither fraction set: index the plain put table by log2(w)
+ movzx wd, word [base_reg+wq*2+table_offset(put,)]
+ add wq, base_reg
+; put_bilin mangling jump
+%assign stack_offset org_stack_offset
+%if ARCH_X86_32
+ mov dsq, dsm
+ mov ssq, ssm
+%elif WIN64
+ pop r8 ; NOTE(review): presumably undoes a push made by the cglobal prologue - confirm against x86inc
+%endif
+ lea r6, [ssq*3] ; NOTE(review): 3*stride, presumably expected by the jump target - confirm
+ jmp wq ; tail-jump through the put table; the target code is outside this function
+.h: ; horizontal fraction present
+%if ARCH_X86_32
+ test ssd, 0xf00
+%else
+ test myd, 0xf00
+%endif
+ jnz .hv ; vertical fraction as well: 2-D path
+ W32_RESTORE_SSQ
+ WIN64_SPILL_XMM 12
+ cmp wd, 4
+ jl .h_w2
+ je .h_w4
+ tzcnt wd, wd
+%if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ shr mxd, 16 ; row index in the 8-tap set
+ sub srcq, 3 ; the 8-tap window starts 3 pixels to the left
+ movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
+ movd m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0] ; taps 0-3
+ pshufd m5, m5, q0000
+ movd m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4] ; taps 4-7
+ pshufd m6, m6, q0000
+ mova m7, [base+pw_34] ; 2 + (8 << 2)
+ add wq, base_reg
+ jmp wq ; continue at the width-specific entry taken from the _8tap_h table
+.h_w2: ; w == 2: 4-tap horizontal filter, two rows per iteration
+%if ARCH_X86_32
+ and mxd, 0xff ; low byte = row index in the 4-tap set
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq ; the 4-tap window starts 1 pixel to the left
+ mova m4, [base+subpel_h_shuf4]
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] ; bytes 2..5 of the 8-byte row: the four taps used here
+ pshufd m3, m3, q0000
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ W32_RESTORE_DSQ
+.h_w2_loop:
+ movq m0, [srcq+ssq*0] ; row a in the low half
+ movhps m0, [srcq+ssq*1] ; row b in the high half
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4 ; two 4-byte windows per row
+ pmaddubsw m0, m3
+ phaddw m0, m0 ; words: a0 a1 b0 b1
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd r4d, m0 ; r4d = a0 a1 b0 b1 as bytes
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4: ; w == 4: 4-tap horizontal filter, two rows per iteration
+%if ARCH_X86_32
+ and mxd, 0xff ; low byte = row index in the 4-tap set
+%else
+ movzx mxd, mxb
+%endif
+ dec srcq ; the 4-tap window starts 1 pixel to the left
+ movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] ; bytes 2..5 of the 8-byte row: the four taps used here
+ pshufd m3, m3, q0000
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ mova m6, [base+subpel_h_shufA]
+ W32_RESTORE_DSQ
+.h_w4_loop:
+ movq m0, [srcq+ssq*0] ; 1
+ movq m1, [srcq+ssq*1] ; 2
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m6 ; subpel_h_shufA
+ pshufb m1, m6 ; subpel_h_shufA
+ pmaddubsw m0, m3 ; subpel_filters
+ pmaddubsw m1, m3 ; subpel_filters
+ phaddw m0, m1 ; low 4 words: first row, high 4 words: second row
+ paddw m0, m5 ; pw34
+ psraw m0, 6
+ packuswb m0, m0
+ movd [dstq+dsq*0], m0
+ psrlq m0, 32 ; bring the second row down to the low dword
+ movd [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+ ;
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]; in: 16 source bytes, out: 8 filtered pixels as words (biased, >> 6); reads m5/m6 (taps 0-3 / 4-7) and m7 (pw_34)
+ %if ARCH_X86_32
+ pshufb %2, %1, [base+subpel_h_shufB]
+ pshufb %3, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %2, %1, m11; subpel_h_shufB
+ pshufb %3, %1, m9 ; subpel_h_shufC
+ pshufb %1, m10 ; subpel_h_shufA
+ %endif
+ pmaddubsw %4, %2, m5 ; subpel +0 B0
+ pmaddubsw %2, m6 ; subpel +4 B4
+ pmaddubsw %3, m6 ; C4
+ pmaddubsw %1, m5 ; A0
+ paddw %3, %4 ; C4+B0 (pixels 4-7, two partial sums each)
+ paddw %1, %2 ; A0+B4 (pixels 0-3, two partial sums each)
+ phaddw %1, %3 ; fold the partial sums: pixels 0-3 | 4-7
+ paddw %1, m7 ; pw34
+ psraw %1, 6
+%endmacro
+ ;
+.h_w8: ; w == 8: 8-tap horizontal filter, two rows per iteration
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ PUT_8TAP_H m0, m2, m3, m4
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1 ; low 8 bytes: first row, high 8 bytes: second row
+%if ARCH_X86_32
+ movq [dstq ], m0
+ add dstq, dsm ; the destination stride is taken from memory (dsm) on x86-32
+ movhps [dstq ], m0
+ add dstq, dsm
+%else
+ movq [dstq+dsq*0], m0
+ movhps [dstq+dsq*1], m0
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16: ; w >= 16: one row at a time, 16 pixels per inner iteration; r6 = -(w - 16)
+ xor r6d, r6d
+ jmp .h_start
+.h_w32:
+ mov r6, -16*1
+ jmp .h_start
+.h_w64:
+ mov r6, -16*3
+ jmp .h_start
+.h_w128:
+ mov r6, -16*7
+.h_start:
+ sub srcq, r6 ; point src/dst at the last 16-pixel column so that r6 can count up to 0
+ sub dstq, r6
+ mov r4, r6 ; keep the start offset for the following rows
+.h_loop:
+ movu m0, [srcq+r6+8*0] ; source for output pixels 0-7
+ movu m1, [srcq+r6+8*1] ; source for output pixels 8-15
+ PUT_8TAP_H m0, m2, m3, m4
+ PUT_8TAP_H m1, m2, m3, m4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, mmsize
+ jle .h_loop ; more columns in this row while r6 <= 0
+ add srcq, ssq
+%if ARCH_X86_32
+ add dstq, dsm
+%else
+ add dstq, dsq
+%endif
+ mov r6, r4 ; restart at the first column
+ dec hd
+ jg .h_loop
+ RET
+.v: ; vertical fraction only
+%if ARCH_X86_32
+ movzx mxd, ssb ; low byte = row index in the 4-tap set
+ shr ssd, 16 ; row index in the 8-tap set
+ cmp hd, 4
+ cmovle ssd, mxd ; h <= 4 selects the low-byte index
+ lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3] ; ssq now holds the address of the 8 coefficients
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd
+ lea myq, [base_reg+myq*8+subpel_filters-put_ssse3]
+%endif
+ tzcnt r6d, wd
+ movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+ mova m7, [base+pw_512] ; pmulhrsw by 512 == (x + 32) >> 6
+ psrlw m2, m7, 1 ; 0x0100 per word: pshufb mask that broadcasts bytes 0-1 to every word
+ add r6, base_reg
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
+ ALLOC_STACK -mmsize*4
+%assign regs_used 7
+ movd m0, [ssq+0]
+ pshufb m0, m2 ; coefficient pair 0/1 in every word
+ mova subpel0, m0
+ movd m0, [ssq+2]
+ pshufb m0, m2 ; pair 2/3
+ mova subpel1, m0
+ movd m0, [ssq+4]
+ pshufb m0, m2 ; pair 4/5
+ mova subpel2, m0
+ movd m0, [ssq+6]
+ pshufb m0, m2 ; pair 6/7
+ mova subpel3, m0
+ mov ssq, [rstk+stack_offset+gprsize*4] ; ssq held the coefficient pointer; reload it (presumably from the ss argument slot - confirm)
+ lea ssq, [ssq*3]
+ sub srcq, ssq ; src -= 3*stride
+ mov ssq, [rstk+stack_offset+gprsize*4]
+ mov dsq, [rstk+stack_offset+gprsize*2] ; r1 (ds) may have been used as scratch by ALLOC_STACK, reload it
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ movd subpel0, [myq+0]
+ pshufb subpel0, m2 ; coefficient pair 0/1 in every word
+ movd subpel1, [myq+2]
+ pshufb subpel1, m2 ; pair 2/3
+ movd subpel2, [myq+4]
+ pshufb subpel2, m2 ; pair 4/5
+ movd subpel3, [myq+6]
+ pshufb subpel3, m2 ; pair 6/7
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q ; src -= 3*stride
+%endif
+ jmp r6 ; continue at the width-specific entry taken from the _8tap_v table
+.v_w2: ; w == 2: vertical filter, two output rows (a, b) per iteration; numbers in comments are source rows relative to src - 3*stride
+ movd m2, [srcq+ssq*0] ; 0
+ pinsrw m2, [srcq+ssq*1], 2 ; 0 1
+ pinsrw m2, [srcq+ssq*2], 4 ; 0 1 2
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+ pinsrw m2, [srcq+ssq*0], 6 ; 0 1 2 3
+ add srcq, ssq
+%else
+ pinsrw m2, [srcq+ss3q ], 6 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+%endif
+ movd m3, [srcq+ssq*0] ; 4
+ movd m1, [srcq+ssq*1] ; 5
+ movd m0, [srcq+ssq*2] ; 6
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+%else
+ add srcq, ss3q
+%endif
+ punpckldq m3, m1 ; 4 5 _ _
+ punpckldq m1, m0 ; 5 6 _ _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ punpcklbw m3, m1 ; 45 56
+ punpcklbw m1, m2, m4 ; 01 12
+ punpckhbw m2, m4 ; 23 34
+.v_w2_loop:
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2 ; slide the row-pair window down by two rows
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ movd m4, [srcq+ssq*0] ; 7
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+ssq*1] ; 8 (becomes row 6 of the next iteration)
+ lea srcq, [srcq+ssq*2]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7 ; (x + 32) >> 6
+ packuswb m5, m5
+ pshuflw m5, m5, q2020 ; gather the a and b pixel pairs into the low dword
+ movd r6d, m5
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4: ; vertical filter in 4-pixel-wide columns; on x86-32 every width >= 4 is handled here
+%if ARCH_X86_32
+.v_w8:
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+%endif ; ARCH_X86_32
+ lea r6d, [wq - 4] ; horizontal loop
+ mov r4, dstq ; r4 = dst of the current column
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+ %define srcm [rsp+mmsize*4+gprsize]
+%endif
+ mov srcm, srcq ; remember src of the current column
+%else
+ mov r7, srcq ; r7 = src of the current column
+%endif
+ shl r6d, (16 - 2) ; ((w - 4) / 4) << 16 = number of columns still to do after this one
+ mov r6w, hw ; low word keeps h for the per-column reset
+.v_w4_loop0:
+ movd m2, [srcq+ssq*0] ; 0
+ movhps m2, [srcq+ssq*2] ; 0 _ 2
+ movd m3, [srcq+ssq*1] ; 1
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+ movhps m3, [srcq+ssq*0] ; 1 _ 3
+ lea srcq, [srcq+ssq*1]
+%else
+ movhps m3, [srcq+ss3q ] ; 1 _ 3
+ lea srcq, [srcq+ssq*4]
+%endif
+ pshufd m2, m2, q2020 ; 0 2 0 2
+ pshufd m3, m3, q2020 ; 1 3 1 3
+ punpckldq m2, m3 ; 0 1 2 3
+ movd m3, [srcq+ssq*0] ; 4
+ movd m1, [srcq+ssq*1] ; 5
+ movd m0, [srcq+ssq*2] ; 6
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+%else
+ add srcq, ss3q
+%endif
+ punpckldq m3, m1 ; 4 5 _ _
+ punpckldq m1, m0 ; 5 6 _ _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ punpcklbw m3, m1 ; 45 56
+ punpcklbw m1, m2, m4 ; 01 12
+ punpckhbw m2, m4 ; 23 34
+.v_w4_loop:
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2 ; slide the row-pair window down by two rows
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ movd m4, [srcq+ssq*0] ; 7
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+ssq*1] ; 8 (becomes row 6 of the next iteration)
+ lea srcq, [srcq+ssq*2]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7 ; (x + 32) >> 6
+ packuswb m5, m5
+ movd [dstq+dsq*0], m5
+ pshufd m5, m5, q0101 ; bring row b down to the low dword
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ mov hw, r6w ; reset vertical loop
+ add r4, 4 ; next 4-pixel column
+ mov dstq, r4
+%if ARCH_X86_32
+ mov srcq, srcm
+ add srcq, 4
+ mov srcm, srcq
+%else
+ add r7, 4
+ mov srcq, r7
+%endif
+ sub r6d, 1<<16 ; horizontal--
+ jg .v_w4_loop0
+ RET
+%if ARCH_X86_64
+.v_w8: ; x86-64 only: vertical filter in 8-pixel-wide columns, two output rows (a, b) per iteration
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq - 8] ; horizontal loop
+ mov r4, dstq ; r4 = dst of the current column
+ mov r7, srcq ; r7 = src of the current column
+ shl r6d, 8 - 3; ((w - 8) / 8) << 8 = number of columns still to do after this one
+ mov r6b, hb ; low byte keeps h for the per-column reset
+.v_w8_loop0:
+ movq m4, [srcq+ssq*0] ; 0
+ movq m5, [srcq+ssq*1] ; 1
+ lea srcq, [srcq+ssq*2]
+ movq m6, [srcq+ssq*0] ; 2
+ movq m0, [srcq+ssq*1] ; 3
+ lea srcq, [srcq+ssq*2]
+ movq m1, [srcq+ssq*0] ; 4
+ movq m2, [srcq+ssq*1] ; 5
+ lea srcq, [srcq+ssq*2] ;
+ movq m3, [srcq+ssq*0] ; 6
+ shufpd m4, m0, 0x0c ; 0 | 3
+ shufpd m5, m1, 0x0c ; 1 | 4
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m2, 0x0c ; 2 | 5
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m3, 0x0c ; 3 | 6
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w8_loop:
+ movq m12, [srcq+ssq*1] ; 7 (srcq points at row 6 on the first iteration)
+ lea srcq, [srcq+ssq*2]
+ movq m13, [srcq+ssq*0] ; 8
+ pmaddubsw m14, m1, subpel0 ; a0
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ pmaddubsw m4, subpel1 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, subpel2 ; a2
+ pmaddubsw m6, subpel2 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d ; 6 | 7 (row 6 is the high half of m0)
+ shufpd m0, m12, m13, 0x0c ; 7 | 8 (its high half is row 6 of the next iteration)
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, subpel3 ; a3
+ pmaddubsw m13, m6, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7 ; (x + 32) >> 6
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ movq [dstq+dsq*0], xm14
+ movhps [dstq+dsq*1], xm14
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ movzx hd, r6b ; reset vertical loop
+ add r4, 8 ; next 8-pixel column
+ add r7, 8
+ mov dstq, r4
+ mov srcq, r7
+ sub r6d, 1<<8 ; horizontal--
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+.hv: ; horizontal and vertical fraction both present
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8 ; w > 4 uses the 8-tap horizontal filter
+ and mxd, 0xff ; low byte = row index in the 4-tap set
+ dec srcq ; the 4-tap window starts 1 pixel to the left
+ movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2] ; bytes 2..5 of the row: the four horizontal taps
+%if ARCH_X86_32
+ movzx mxd, ssb ; vertical: low-byte index ...
+ shr ssd, 16 ; ... or 8-tap-set index
+ cmp hd, 4
+ cmovle ssd, mxd ; h <= 4 selects the low-byte index
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ W32_RESTORE_SSQ
+ lea r6, [ssq*3]
+ sub srcq, r6 ; src -= 3*stride
+ %define base_reg r6
+ mov r6, r1; use as new base
+ %assign regs_used 2
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ mov dsq, [rstk+stack_offset+gprsize*2] ; reload ds (presumably its argument slot - confirm)
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklqdq m0, m0
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m6, m0, q0000 ; vertical coefficient pair 0/1 as words, in every dword
+ mova subpelv0, m6
+ pshufd m6, m0, q1111 ; pair 2/3
+ mova subpelv1, m6
+ pshufd m6, m0, q2222 ; pair 4/5
+ mova subpelv2, m6
+ pshufd m6, m0, q3333 ; pair 6/7
+ mova subpelv3, m6
+%else
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4 selects the low-byte index
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
+ ALLOC_STACK mmsize*14, 14
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q ; src -= 3*stride
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklqdq m0, m0
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ mova m8, [base+pw_8192]
+ mova m9, [base+pd_512]
+ pshufd m10, m0, q0000 ; vertical coefficient pair 0/1 as words, in every dword
+ pshufd m11, m0, q1111 ; pair 2/3
+ pshufd m12, m0, q2222 ; pair 4/5
+ pshufd m13, m0, q3333 ; pair 6/7
+%endif
+ pshufd m7, m1, q0000 ; the four horizontal taps in every dword
+ cmp wd, 4
+ je .hv_w4
+.hv_w2: ; w == 2: 4-tap horizontal pass into 16-bit intermediates, then vertical pass with pmaddwd
+ mova m6, [base+subpel_h_shuf4]
+ ; rows 0-3: horizontal filter
+ movq m2, [srcq+ssq*0] ; 0
+ movhps m2, [srcq+ssq*1] ; 0 _ 1
+ movq m0, [srcq+ssq*2] ; 2
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+ movhps m0, [srcq+ssq*0] ; 2 _ 3
+ lea srcq, [srcq+ssq*1]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+ movhps m0, [srcq+ss3q ] ; 2 _ 3
+ lea srcq, [srcq+ssq*4]
+%endif
+ pshufb m2, m6 ; 0 ~ 1 ~
+ pshufb m0, m6 ; 2 ~ 3 ~
+ pmaddubsw m2, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m2, m0 ; 0 1 2 3
+ pmulhrsw m2, w8192reg ; (x + 2) >> 2
+ ; rows 4-6: horizontal filter
+ movq m3, [srcq+ssq*0] ; 4
+ movhps m3, [srcq+ssq*1] ; 4 _ 5
+ movq m0, [srcq+ssq*2] ; 6
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+%else
+ add srcq, ss3q
+%endif
+ pshufb m3, m6 ; 4 ~ 5 ~
+ pshufb m0, m6 ; 6 ~
+ pmaddubsw m3, m7 ; subpel_filters
+ pmaddubsw m0, m7 ; subpel_filters
+ phaddw m3, m0 ; 4 5 6 _
+ pmulhrsw m3, w8192reg
+ ; interleave neighbouring rows for the vertical pmaddwd
+ palignr m4, m3, m2, 4; V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2
+ punpckhwd m2, m4 ; V 23 34 2 3 3 4
+ pshufd m0, m3, q2121; V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56 4 5 5 6
+.hv_w2_loop:
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2 ; V
+ pmaddwd m2, subpelv1 ; V a1 b1
+ paddd m5, m2 ; V
+ mova m2, m3 ; V
+ pmaddwd m3, subpelv2 ; a2 b2
+ paddd m5, m3 ; V
+ movq m4, [srcq+ssq*0] ; V 7
+ movhps m4, [srcq+ssq*1] ; V 7 8
+ lea srcq, [srcq+ssq*2] ; V
+ pshufb m4, m6 ; H 7 ~ 8 ~
+ pmaddubsw m4, m7 ; H subpel_filters
+ phaddw m4, m4 ; H 7 8 7 8
+ pmulhrsw m4, w8192reg ; H (x + 2) >> 2
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4 ; keep rows 7 8 for the next iteration
+ punpcklwd m3, m0 ; V 67 78
+ pmaddwd m4, m3, subpelv3 ; V a3 b3
+ paddd m5, d512reg ; pd_512 bias
+ paddd m5, m4
+ psrad m5, 10
+ packssdw m5, m5
+ packuswb m5, m5
+ movd r4d, m5 ; r4d = a0 a1 b0 b1 as bytes
+ mov [dstq+dsq*0], r4w
+ shr r4d, 16
+ mov [dstq+dsq*1], r4w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+%undef w8192reg
+%undef d512reg
+ ;
+.hv_w4: ; w == 4: done as two 2-pixel halves ("low" = subpel_h_shuf4, pixels 0-1; "high" = subpel_h_shuf4+16, pixels 2-3) whose register state is swapped through stack slots
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+ ; SAVELINE_W4 / RESTORELINE_W4 reg, slot, half: spill / reload one register of the low (0) or high (1) half
+%macro SAVELINE_W4 3
+ mova [rsp+mmsize*hv4_line_%3_%2], %1
+%endmacro
+%macro RESTORELINE_W4 3
+ mova %1, [rsp+mmsize*hv4_line_%3_%2]
+%endmacro
+ ;
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d512reg [base+pd_512]
+%else
+ %define w8192reg m8
+ %define d512reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 0 _ _ _
+ movhps m5, [srcq+ssq*1] ; 0 _ 1 _
+ movq m4, [srcq+ssq*2] ; 2 _ _ _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+ movhps m4, [srcq+ssq*0] ; 2 _ 3 _
+ add srcq, ssq
+%else
+ movhps m4, [srcq+ss3q ] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*4]
+%endif
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ ;
+ ; lower shuffle
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+ssq*0] ; 4 _ _ _
+ movhps m5, [srcq+ssq*1] ; 4 _ 5 _
+ movq m4, [srcq+ssq*2] ; 6 _ _ _
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ _ ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 _ (only rows 4-6 are loaded here)
+ pmulhrsw m3, w8192reg ;H pw_8192
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ _ ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 _ (only rows 4-6 are loaded here)
+ pmulhrsw m3, w8192reg ;H pw_8192
+ ;
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ add srcq, ssq
+%else
+ add srcq, ss3q
+%endif
+ ;process high
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ;process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop:
+ ;process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ ;
+ mova m6, [base+subpel_h_shuf4]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m5, 10
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ;process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ ;
+ mova m6, [base+subpel_h_shuf4+16]
+ movq m4, [srcq+ssq*0] ; 7
+ movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d512reg ; pd_512
+ paddd m5, m4
+ psrad m4, m5, 10 ; high-half result goes to m4 so that m5 can take the saved low half
+ ;
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4 ; d -> w
+ packuswb m5, m5 ; w -> b
+ pshuflw m5, m5, q3120 ; reorder the byte pairs to a0-3 | b0-3
+ lea srcq, [srcq+ssq*2]
+ movd [dstq+dsq*0], m5
+ psrlq m5, 32
+ movd [dstq+dsq*1], m5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2 ; the flags set here are consumed by the jg below (the mova-based macros in between do not modify flags)
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+ ;
+.hv_w8: ; w > 4: 8-tap horizontal + vertical filtering, processed in 4-pixel-wide columns
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+%macro SAVELINE_W8 2
+ mova [rsp+hv8_line_%1*mmsize], %2
+%endmacro
+%macro RESTORELINE_W8 2
+ mova %2, [rsp+hv8_line_%1*mmsize]
+%endmacro
+ shr mxd, 16 ; row index in the 8-tap set
+ sub srcq, 3 ; the 8-tap window starts 3 pixels to the left
+%if ARCH_X86_32
+ %define base_reg r1
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3] ; 8 horizontal taps
+ movzx mxd, ssb
+ shr ssd, 16
+ cmp hd, 4
+ cmovle ssd, mxd ; h <= 4 selects the low-byte index
+ movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3] ; 8 vertical taps
+ mov ssq, ssmp ; ss held the packed my value until here
+ ALLOC_STACK -mmsize*13
+%if STACK_ALIGNMENT < 16
+ %define srcm [rsp+mmsize*13+gprsize*1]
+ %define dsm [rsp+mmsize*13+gprsize*2]
+ mov r6, [rstk+stack_offset+gprsize*2] ; copy ds into the redefined dsm slot (source presumably the ds argument - confirm)
+ mov dsm, r6
+%endif
+ pshufd m0, m1, q0000 ; horizontal taps 0-3
+ pshufd m1, m1, q1111 ; horizontal taps 4-7
+ punpcklbw m5, m5
+ psraw m5, 8 ; sign-extend
+ pshufd m2, m5, q0000 ; vertical coefficient pair 0/1
+ pshufd m3, m5, q1111 ; pair 2/3
+ pshufd m4, m5, q2222 ; pair 4/5
+ pshufd m5, m5, q3333 ; pair 6/7
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ lea r6, [ssq*3]
+ sub srcq, r6 ; src -= 3*stride
+ mov srcm, srcq ; remember src of the current column
+%else
+ ALLOC_STACK mmsize*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3] ; 8 horizontal taps
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4 selects the low-byte index
+ movq m1, [base_reg+myq*8+subpel_filters-put_ssse3] ; 8 vertical taps
+ pshufd subpelh0, m0, q0000 ; horizontal taps 0-3
+ pshufd subpelh1, m0, q1111 ; horizontal taps 4-7
+ punpcklqdq m1, m1
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ pshufd subpelv0, m1, q0000 ; vertical coefficient pair 0/1
+ pshufd subpelv1, m1, q1111 ; pair 2/3
+ pshufd subpelv2, m1, q2222 ; pair 4/5
+ pshufd subpelv3, m1, q3333 ; pair 6/7
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q ; src -= 3*stride
+ mov r7, srcq ; r7 = src of the current column
+%endif
+ lea r6d, [wq-4]
+ mov r4, dstq ; r4 = dst of the current column
+ shl r6d, (16 - 2) ; ((w - 4) / 4) << 16 = number of columns still to do after this one
+ mov r6w, hw ; low word keeps h for the per-column reset
+.hv_w8_loop0: ; per-column prologue: horizontally filter rows 0-6 and build the interleaved row pairs
+ movu m4, [srcq+ssq*0] ; 0 = _ _
+ movu m5, [srcq+ssq*1] ; 1 = _ _
+ lea srcq, [srcq+ssq*2]
+ ;
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] (shufA/B/C registers, used on x86-64 only); reads subpelh0/subpelh1; the result is not yet rounded
+ %if ARCH_X86_32
+ pshufb %3, %1, [base+subpel_h_shufB]
+ pshufb %4, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+ %else
+ pshufb %3, %1, %6 ; subpel_h_shufB
+ pshufb %4, %1, %7 ; subpel_h_shufC
+ pshufb %1, %5 ; subpel_h_shufA
+ %endif
+ pmaddubsw %2, %3, subpelh0 ; subpel +0 B0
+ pmaddubsw %4, subpelh1; subpel +4 C4
+ pmaddubsw %3, subpelh1; B4
+ pmaddubsw %1, subpelh0; A0
+ paddw %2, %4 ; B0+C4
+ paddw %1, %3 ; A0+B4
+ phaddw %1, %2
+%endmacro
+ ;
+%if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+ movu m6, [srcq+ssq*0] ; 2 = _ _
+ movu m0, [srcq+ssq*1] ; 3 = _ _
+ lea srcq, [srcq+ssq*2]
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+ ;
+ mova m7, [base+pw_8192] ; m7 is reused as the rounding multiplier, shufA is reloaded further down
+ pmulhrsw m4, m7 ; H pw_8192
+ pmulhrsw m5, m7 ; H pw_8192
+ pmulhrsw m6, m7 ; H pw_8192
+ pmulhrsw m0, m7 ; H pw_8192
+ punpcklwd m1, m4, m5 ; 0 1 ~
+ punpcklwd m2, m5, m6 ; 1 2 ~
+ punpcklwd m3, m6, m0 ; 2 3 ~
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+ ;
+ mova m7, [base+subpel_h_shufA]
+ movu m4, [srcq+ssq*0] ; 4 = _ _
+ movu m5, [srcq+ssq*1] ; 5 = _ _
+ lea srcq, [srcq+ssq*2]
+ movu m6, [srcq+ssq*0] ; 6 = _ _
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
+ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
+ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
+ punpcklwd m4, m0, m1 ; 3 4 ~
+ punpcklwd m5, m1, m2 ; 4 5 ~
+ punpcklwd m6, m2, m3 ; 5 6 ~
+ ;
+ SAVELINE_W8 6, m3 ; keep row 6 for the first loop iteration
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop: ; on entry m1..m6 = row pairs 01 12 23 34 45 56; produces two output rows (a, b) of 4 pixels
+ ; m8 accu for V a
+ ; m9 accu for V b
+ SAVELINE_W8 1, m3 ; pairs 23 34 45 56 become 01 12 23 34 of the next iteration
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_512]
+ paddd m0, m5 ; pd_512
+ paddd m7, m5 ; pd_512
+ mova accuv0, m0 ; park the partial sums on the stack
+ mova accuv1, m7
+%else
+ pmaddwd m8, m1, subpelv0 ; a0
+ pmaddwd m9, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ mova m7, [base+pd_512]
+ paddd m8, m7 ; pd_512
+ paddd m9, m7 ; pd_512
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+%endif
+ movu m0, [srcq+ssq*1] ; 7
+ movu m4, [srcq+ssq*2] ; 8
+ lea srcq, [srcq+ssq*2]
+ HV_H_W8 m0, m1, m2, m3, m5, m7, m6
+ HV_H_W8 m4, m1, m2, m3, m5, m7, m6
+ mova m5, [base+pw_8192]
+ pmulhrsw m0, m5 ; H pw_8192
+ pmulhrsw m4, m5 ; H pw_8192
+ RESTORELINE_W8 6, m6 ; newest row kept from the previous step
+ punpcklwd m5, m6, m0 ; 6 7 ~
+ punpcklwd m6, m0, m4 ; 7 8 ~
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1 ; H + V
+ psrad m2, 10
+ psrad m1, 10
+ packssdw m2, m1 ; d -> w
+ packuswb m2, m1 ; w -> b; only the low 8 bytes (a0-3 b0-3) are stored
+ movd [dstq+dsq*0], m2
+ psrlq m2, 32
+%if ARCH_X86_32
+ add dstq, dsm
+ movd [dstq+dsq*0], m2
+ add dstq, dsm
+%else
+ movd [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+%endif
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4 ; row 8 is the newest row for the next iteration
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+ movzx hd, r6w ; restore h for the next column
+ add r4, 4 ; next 4-pixel column
+ mov dstq, r4
+%if ARCH_X86_32
+ mov srcq, srcm
+ add srcq, 4
+ mov srcm, srcq
+%else
+ add r7, 4
+ mov srcq, r7
+%endif
+ sub r6d, 1<<16 ; one column done
+ jg .hv_w8_loop0
+ RET
%if WIN64
DECLARE_REG_TMP 6, 4