ref: f753caeac09bcf7ebe5d3fe1c0903deb277e9433
parent: f813285c1d1a5421e0180efbb7cbdd377cd31c69
author: Henrik Gramner <[email protected]>
date: Mon Jan 14 10:47:13 EST 2019
Add minor x86 bilin mc optimizations
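
Drop WIN64_SPILL_XMM calls that are no longer needed, use fewer vector
registers in the h and v paths, and replace the per-width macro
expansions in the wide hv paths with shared loops.

For reviewing the register reshuffling it may help to keep the scalar
form of the filter in mind. Below is a minimal C sketch of the
horizontal pass (illustrative names only, not dav1d's actual API); the
SIMD code in this patch computes the same result with
pshufb + pmaddubsw + pmulhrsw against pw_2048, and the vertical pass
applies the same formula along the source stride:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar reference for the horizontal bilinear pass:
     * dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 */
    static void put_bilin_h_ref(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                int w, int h, int mx)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4;
            dst += dst_stride;
            src += src_stride;
        }
    }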
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -265,7 +265,6 @@
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
imul mxyd, 0xff01
vbroadcasti128 m4, [bilin_h_shuf8]
- WIN64_SPILL_XMM 7
add mxyd, 16 << 8
movd xm5, mxyd
mov mxyd, r7m ; my
@@ -273,7 +272,7 @@
test mxyd, mxyd
jnz .hv
movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)]
- vpbroadcastd m6, [pw_2048]
+ vpbroadcastd m3, [pw_2048]
add wq, t2
jmp wq
.h_w2:
@@ -282,7 +281,7 @@
lea srcq, [srcq+ssq*2]
pshufb xm0, xm4
pmaddubsw xm0, xm5
- pmulhrsw xm0, xm6
+ pmulhrsw xm0, xm3
packuswb xm0, xm0
pextrw [dstq+dsq*0], xm0, 0
pextrw [dstq+dsq*1], xm0, 2
@@ -298,7 +297,7 @@
lea srcq, [srcq+ssq*2]
pshufb xm0, xm4
pmaddubsw xm0, xm5
- pmulhrsw xm0, xm6
+ pmulhrsw xm0, xm3
packuswb xm0, xm0
movd [dstq+dsq*0], xm0
pextrd [dstq+dsq*1], xm0, 1
@@ -314,8 +313,8 @@
pshufb xm1, xm4
pmaddubsw xm0, xm5
pmaddubsw xm1, xm5
- pmulhrsw xm0, xm6
- pmulhrsw xm1, xm6
+ pmulhrsw xm0, xm3
+ pmulhrsw xm1, xm3
packuswb xm0, xm1
movq [dstq+dsq*0], xm0
movhps [dstq+dsq*1], xm0
@@ -333,8 +332,8 @@
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- pmulhrsw m0, m6
- pmulhrsw m1, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
@@ -350,8 +349,8 @@
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- pmulhrsw m0, m6
- pmulhrsw m1, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
packuswb m0, m1
mova [dstq], m0
add dstq, dsq
@@ -361,25 +360,25 @@
.h_w64:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
- movu m2, [srcq+8*4]
- movu m3, [srcq+8*5]
- add srcq, ssq
pshufb m0, m4
pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- pmulhrsw m0, m6
- pmulhrsw m1, m6
- pmulhrsw m2, m6
- pmulhrsw m3, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
packuswb m0, m1
- packuswb m2, m3
+ movu m1, [srcq+8*4]
+ movu m2, [srcq+8*5]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
mova [dstq+32*0], m0
- mova [dstq+32*1], m2
+ mova [dstq+32*1], m1
add dstq, dsq
dec hd
jg .h_w64
@@ -393,8 +392,8 @@
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- pmulhrsw m0, m6
- pmulhrsw m1, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+t1+32*3], m0
add t1, 32
@@ -406,14 +405,12 @@
RET
.v:
movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)]
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 8
imul mxyd, 0xff01
- vpbroadcastd m7, [pw_2048]
+ vpbroadcastd m5, [pw_2048]
add mxyd, 16 << 8
add wq, t2
- movd xm6, mxyd
- vpbroadcastw m6, xm6
+ movd xm4, mxyd
+ vpbroadcastw m4, xm4
jmp wq
.v_w2:
movd xm0, [srcq+ssq*0]
@@ -423,8 +420,8 @@
pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
pshuflw xm1, xm1, q2301 ; 1 0
punpcklbw xm1, xm0, xm1
- pmaddubsw xm1, xm6
- pmulhrsw xm1, xm7
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
packuswb xm1, xm1
pextrw [dstq+dsq*0], xm1, 1
pextrw [dstq+dsq*1], xm1, 0
@@ -441,8 +438,8 @@
vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm1, xm1, xm0, 0x02 ; 1 2
punpcklbw xm1, xm2
- pmaddubsw xm1, xm6
- pmulhrsw xm1, xm7
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
packuswb xm1, xm1
movd [dstq+dsq*0], xm1
pextrd [dstq+dsq*1], xm1, 1
@@ -453,20 +450,18 @@
.v_w8:
movq xm0, [srcq+ssq*0]
.v_w8_loop:
- vpbroadcastq xm1, [srcq+ssq*1]
+ movq xm3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendd xm2, xm1, xm0, 0x03 ; 0 1
- vpbroadcastq xm0, [srcq+ssq*0]
- vpblendd xm1, xm1, xm0, 0x0c ; 1 2
- punpcklbw xm3, xm1, xm2
- punpckhbw xm1, xm2
- pmaddubsw xm3, xm6
- pmaddubsw xm1, xm6
- pmulhrsw xm3, xm7
- pmulhrsw xm1, xm7
- packuswb xm3, xm1
- movq [dstq+dsq*0], xm3
- movhps [dstq+dsq*1], xm3
+ punpcklbw xm1, xm3, xm0
+ movq xm0, [srcq+ssq*0]
+ punpcklbw xm2, xm0, xm3
+ pmaddubsw xm1, xm4
+ pmaddubsw xm2, xm4
+ pmulhrsw xm1, xm5
+ pmulhrsw xm2, xm5
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
@@ -481,10 +476,10 @@
vpblendd m2, m2, m0, 0xf0 ; 1 2
punpcklbw m1, m2, m3
punpckhbw m2, m3
- pmaddubsw m1, m6
- pmaddubsw m2, m6
- pmulhrsw m1, m7
- pmulhrsw m2, m7
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
packuswb m1, m2
mova [dstq+dsq*0], xm1
vextracti128 [dstq+dsq*1], m1, 1
@@ -496,25 +491,25 @@
%macro PUT_BILIN_V_W32 0
movu m0, [srcq+ssq*0]
%%loop:
- movu m4, [srcq+ssq*1]
+ movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m4, m0
- punpckhbw m3, m4, m0
+ punpcklbw m1, m3, m0
+ punpckhbw m2, m3, m0
movu m0, [srcq+ssq*0]
- punpcklbw m2, m0, m4
- punpckhbw m4, m0, m4
- pmaddubsw m1, m6
- pmaddubsw m3, m6
- pmaddubsw m2, m6
- pmaddubsw m4, m6
- pmulhrsw m1, m7
- pmulhrsw m3, m7
- pmulhrsw m2, m7
- pmulhrsw m4, m7
- packuswb m1, m3
- packuswb m2, m4
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
mova [dstq+dsq*0], m1
- mova [dstq+dsq*1], m2
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
@@ -527,25 +522,25 @@
.v_w64_loop:
add srcq, ssq
movu m3, [srcq+32*0]
- movu m4, [srcq+32*1]
punpcklbw m2, m3, m0
- punpckhbw m5, m3, m0
- pmaddubsw m2, m6
- pmaddubsw m5, m6
+ punpckhbw m0, m3, m0
+ pmaddubsw m2, m4
+ pmaddubsw m0, m4
+ pmulhrsw m2, m5
+ pmulhrsw m0, m5
+ packuswb m2, m0
mova m0, m3
- pmulhrsw m2, m7
- pmulhrsw m5, m7
- packuswb m2, m5
- punpcklbw m3, m4, m1
- punpckhbw m5, m4, m1
- pmaddubsw m3, m6
- pmaddubsw m5, m6
- mova m1, m4
- pmulhrsw m3, m7
- pmulhrsw m5, m7
- packuswb m3, m5
+ movu m3, [srcq+32*1]
mova [dstq+32*0], m2
- mova [dstq+32*1], m3
+ punpcklbw m2, m3, m1
+ punpckhbw m1, m3, m1
+ pmaddubsw m2, m4
+ pmaddubsw m1, m4
+ pmulhrsw m2, m5
+ pmulhrsw m1, m5
+ packuswb m2, m1
+ mova m1, m3
+ mova [dstq+32*1], m2
add dstq, dsq
dec hd
jg .v_w64_loop
@@ -568,7 +563,6 @@
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
vpbroadcastd m7, [pw_2048]
@@ -684,7 +678,14 @@
jg .hv_w16_loop
RET
.hv_w32:
-%macro PUT_BILIN_HV_W32 0
+ xor t2d, t2d
+.hv_w32gt:
+ mov t0, dstq
+ mov t1, srcq
+%if WIN64
+ movaps r4m, xmm8
+%endif
+.hv_w32_loop0:
movu m0, [srcq+8*0]
vinserti128 m0, m0, [srcq+8*2], 1
movu m1, [srcq+8*1]
@@ -693,10 +694,7 @@
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
-%if WIN64
- movaps r4m, xmm8
-%endif
-%%loop:
+.hv_w32_loop:
add srcq, ssq
movu xm2, [srcq+8*1]
vinserti128 m2, m2, [srcq+8*3], 1
@@ -722,41 +720,24 @@
mova [dstq], m3
add dstq, dsq
dec hd
- jg %%loop
-%if WIN64
- movaps xmm8, r4m
-%endif
-%endmacro
- PUT_BILIN_HV_W32
- RET
-.hv_w64:
- mov t0, dstq
- mov t1, srcq
- lea t2d, [hq+(1<<8)]
-.hv_w64_loop:
- PUT_BILIN_HV_W32
- mov hb, t2b
+ jg .hv_w32_loop
+ movzx hd, t2b
add t0, 32
add t1, 32
mov dstq, t0
mov srcq, t1
sub t2d, 1<<8
- jg .hv_w64_loop
+ jg .hv_w32_loop0
+%if WIN64
+ movaps xmm8, r4m
+%endif
RET
+.hv_w64:
+ lea t2d, [hq+(1<<8)]
+ jmp .hv_w32gt
.hv_w128:
- mov t0, dstq
- mov t1, srcq
lea t2d, [hq+(3<<8)]
-.hv_w128_loop:
- PUT_BILIN_HV_W32
- mov hb, t2b
- add t0, 32
- add t1, 32
- mov dstq, t0
- mov srcq, t1
- sub t2d, 1<<8
- jg .hv_w128_loop
- RET
+ jmp .hv_w32gt
DECLARE_REG_TMP 3, 5, 6
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -266,7 +266,6 @@
imul mxyd, 0xff01
mova m4, [base+bilin_h_shuf8]
mova m0, [base+bilin_h_shuf4]
- WIN64_SPILL_XMM 7
add mxyd, 16 << 8
movd m5, mxyd
mov mxyd, r7m ; my
@@ -275,7 +274,7 @@
test mxyd, mxyd
jnz .hv
movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
- mova m6, [base+pw_2048]
+ mova m3, [base+pw_2048]
add wq, t0
RESTORE_DSQ_32 t0
jmp wq
@@ -288,7 +287,7 @@
punpckldq m0, m1
pshufb m0, m4
pmaddubsw m0, m5
- pmulhrsw m0, m6
+ pmulhrsw m0, m3
packuswb m0, m0
movd r6d, m0
mov [dstq+dsq*0], r6w
@@ -304,10 +303,10 @@
lea srcq, [srcq+ssq*2]
pshufb m4, m0
pmaddubsw m4, m5
- pmulhrsw m4, m6
+ pmulhrsw m4, m3
packuswb m4, m4
movd [dstq+dsq*0], m4
- pshufd m4, m4, q0101
+ psrlq m4, 32
movd [dstq+dsq*1], m4
lea dstq, [dstq+dsq*2]
sub hd, 2
@@ -321,8 +320,8 @@
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- pmulhrsw m0, m6
- pmulhrsw m1, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
packuswb m0, m1
movq [dstq+dsq*0], m0
movhps [dstq+dsq*1], m0
@@ -338,8 +337,8 @@
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- pmulhrsw m0, m6
- pmulhrsw m1, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
packuswb m0, m1
mova [dstq], m0
add dstq, dsq
@@ -349,25 +348,25 @@
.h_w32:
movu m0, [srcq+mmsize*0+8*0]
movu m1, [srcq+mmsize*0+8*1]
- movu m2, [srcq+mmsize*1+8*0]
- movu m3, [srcq+mmsize*1+8*1]
- add srcq, ssq
pshufb m0, m4
pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- pmulhrsw m0, m6
- pmulhrsw m1, m6
- pmulhrsw m2, m6
- pmulhrsw m3, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
packuswb m0, m1
- packuswb m2, m3
+ movu m1, [srcq+mmsize*1+8*0]
+ movu m2, [srcq+mmsize*1+8*1]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
mova [dstq+16*0], m0
- mova [dstq+16*1], m2
+ mova [dstq+16*1], m1
add dstq, dsq
dec hd
jg .h_w32
@@ -381,8 +380,8 @@
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- pmulhrsw m0, m6
- pmulhrsw m1, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+r6+16*3], m0
add r6, 16
@@ -401,8 +400,8 @@
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- pmulhrsw m0, m6
- pmulhrsw m1, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+r6+16*7], m0
add r6, 16
@@ -414,15 +413,13 @@
RET
.v:
movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 8
imul mxyd, 0xff01
- mova m7, [base+pw_2048]
+ mova m5, [base+pw_2048]
add mxyd, 16 << 8
add wq, t0
- movd m6, mxyd
- pshuflw m6, m6, q0000
- punpcklqdq m6, m6
+ movd m4, mxyd
+ pshuflw m4, m4, q0000
+ punpcklqdq m4, m4
RESTORE_DSQ_32 t0
jmp wq
.v_w2:
@@ -433,8 +430,8 @@
pshuflw m2, m0, q2301
pinsrw m0, [srcq+ssq*0], 0 ; 2 1
punpcklbw m1, m0, m2
- pmaddubsw m1, m6
- pmulhrsw m1, m7
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
packuswb m1, m1
movd r6d, m1
mov [dstq+dsq*1], r6w
@@ -453,8 +450,8 @@
movd m0, [srcq+ssq*0]
punpckldq m1, m0 ; 1 2
punpcklbw m1, m2
- pmaddubsw m1, m6
- pmulhrsw m1, m7
+ pmaddubsw m1, m4
+ pmulhrsw m1, m5
packuswb m1, m1
movd [dstq+dsq*0], m1
psrlq m1, 32
@@ -467,20 +464,18 @@
.v_w8:
movq m0, [srcq+ssq*0]
.v_w8_loop:
- movddup m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklqdq m3, m0, m2 ; 0 1 m2qh:m0ql
- movddup m0, [srcq+ssq*0]
- punpcklqdq m4, m2, m0 ; 1 2 m0qh:m2ql
- punpcklbw m1, m4, m3
- punpckhbw m4, m3
- pmaddubsw m1, m6
- pmaddubsw m4, m6
- pmulhrsw m1, m7
- pmulhrsw m4, m7
- packuswb m1, m4
- movq [dstq+dsq*0], m1
- movhps [dstq+dsq*1], m1
+ punpcklbw m1, m3, m0
+ movq m0, [srcq+ssq*0]
+ punpcklbw m2, m0, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
@@ -489,25 +484,25 @@
%macro PUT_BILIN_V_W16 0
movu m0, [srcq+ssq*0]
%%loop:
- movu m4, [srcq+ssq*1]
+ movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m4, m0
- punpckhbw m3, m4, m0
+ punpcklbw m1, m3, m0
+ punpckhbw m2, m3, m0
movu m0, [srcq+ssq*0]
- punpcklbw m2, m0, m4
- pmaddubsw m1, m6
- pmaddubsw m3, m6
- pmulhrsw m1, m7
- pmulhrsw m3, m7
- packuswb m1, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
mova [dstq+dsq*0], m1
- punpckhbw m3, m0, m4
- pmaddubsw m2, m6
- pmaddubsw m3, m6
- pmulhrsw m2, m7
- pmulhrsw m3, m7
- packuswb m2, m3
- mova [dstq+dsq*1], m2
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
@@ -549,7 +544,6 @@
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
- %assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
mova m7, [base+pw_2048]
@@ -579,10 +573,14 @@
paddw m1, m2 ; src[x] + (my * (src[x + src_stride] - src[x])
pmulhrsw m1, m7
packuswb m1, m1
+%if ARCH_X86_64
+ movq r6, m1
+%else
pshuflw m1, m1, q2020
movd r6d, m1
+%endif
mov [dstq+dsq*0], r6w
- shr r6d, 16
+ shr r6, gprsize*4
mov [dstq+dsq*1], r6w
lea dstq, [dstq+dsq*2]
sub hd, 2
@@ -595,9 +593,9 @@
pshufb m0, m4
pmaddubsw m0, m5
.hv_w4_loop:
- movq m1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- movhps m1, [srcq+ssq*0]
+ movq m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps m1, [srcq+ssq*0]
pshufb m1, m4
pmaddubsw m1, m5 ; 1 2
shufps m2, m0, m1, q1032 ; 0 1
@@ -617,21 +615,21 @@
RET
.hv_w8:
RESTORE_DSQ_32 t0
- movu m0, [srcq+ssq*0+8*0]
+ movu m0, [srcq+ssq*0+8*0]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w8_loop:
- movu m2, [srcq+ssq*1+8*0]
- lea srcq, [srcq+ssq*2]
- movu m3, [srcq+ssq*0+8*0]
+ movu m2, [srcq+ssq*1+8*0]
+ lea srcq, [srcq+ssq*2]
pshufb m2, m4
- pshufb m3, m4
pmaddubsw m2, m5
psubw m1, m2, m0
paddw m1, m1
pmulhw m1, m6
paddw m1, m0
- pmaddubsw m0, m3, m5
+ movu m0, [srcq+ssq*0+8*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
psubw m3, m0, m2
paddw m3, m3
pmulhw m3, m6
@@ -639,15 +637,21 @@
pmulhrsw m1, m7
pmulhrsw m3, m7
packuswb m1, m3
- movq [dstq+dsq*0], m1
- movhps [dstq+dsq*1], m1
+ movq [dstq+dsq*0], m1
+ movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
RET
- ;
- ; 32bit has ssq, dsq free
-%macro PUT_BILIN_HV_W16 0
+.hv_w16:
+ xor t0d, t0d
+.hv_w16gt:
+ mov r4, dstq
+ mov r6, srcq
+ %if WIN64
+ movaps r4m, xmm8
+ %endif
+.hv_w16_loop0:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
@@ -654,64 +658,48 @@
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
- %if WIN64
- movaps r4m, xmm8
- %endif
-%%loop:
+.hv_w16_loop:
%if ARCH_X86_32
- %define m3back [dstq]
- %define dsqval dsm
+ %define m0tmp [dstq]
%else
- %define m3back m8
- %define dsqval dsq
+ %define m0tmp m8
%endif
add srcq, ssq
- movu m2, [srcq+8*1]
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
pshufb m2, m4
+ pshufb m3, m4
pmaddubsw m2, m5
- psubw m3, m2, m1
+ pmaddubsw m3, m5
+ mova m0tmp, m2
+ psubw m2, m0
+ paddw m2, m2
+ pmulhw m2, m6
+ paddw m2, m0
+ mova m0, m3
+ psubw m3, m1
paddw m3, m3
pmulhw m3, m6
paddw m3, m1
- mova m1, m2
+ mova m1, m0
+ mova m0, m0tmp
+ pmulhrsw m2, m7
pmulhrsw m3, m7
- mova m3back, m3
- movu m2, [srcq+8*0]
- pshufb m2, m4
- pmaddubsw m2, m5
- psubw m3, m2, m0
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m0
- mova m0, m2
- pmulhrsw m3, m7
- packuswb m3, m3back
- mova [dstq], m3
- add dstq, dsqval
+ packuswb m2, m3
+ mova [dstq], m2
+ add dstq, dsmp
dec hd
- jg %%loop
- %if WIN64
- movaps xmm8, r4m
- %endif
- %undef m3back
- %undef dsqval
-%endmacro
- ;
-.hv_w16:
- PUT_BILIN_HV_W16
- RET
-.hv_w16gt:
- mov r4, dstq
- mov r6, srcq
-.hv_w16gt_loop:
- PUT_BILIN_HV_W16
- mov hw, t0w
+ jg .hv_w16_loop
+ movzx hd, t0w
add r4, mmsize
add r6, mmsize
mov dstq, r4
mov srcq, r6
sub t0d, 1<<16
- jg .hv_w16gt_loop
+ jg .hv_w16_loop0
+ %if WIN64
+ movaps xmm8, r4m
+ %endif
RET
.hv_w32:
lea t0d, [hq+(1<<16)]