ref: 5173de3032572e7aabdfacf0c0fac2a9a65fab8c
parent: 50e876c60efd251ab7a5af36b0d39572dcce627c
author: Henrik Gramner <[email protected]>
date: Sun Sep 13 14:10:07 EDT 2020
x86: Add misc mc asm tweaks
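
The put_bilin .hv paths in both files trade the final pmulhrsw rounding by pw_2048 for a pavgw against pw_15 followed by psrlw 4, with the horizontal filter coefficients doubled up front (paddb m5, m5) so the paddw-by-itself inside the loop goes away. Below is a minimal standalone C sketch (illustration only, not part of the patch; the helpers just model the instructions involved) that exhaustively checks the old and new rounding sequences agree over the value range the 8-bit horizontal bilin stage produces, assuming 16-bit intermediates:

    #include <assert.h>
    #include <stdint.h>

    /* minimal models of the instructions involved */
    static int16_t  mulhw(int16_t a, int16_t b)   { return (int16_t)(((int32_t)a * b) >> 16); }
    static int16_t  mulhrsw(int16_t a, int16_t b) { return (int16_t)(((((int32_t)a * b) >> 14) + 1) >> 1); }
    static uint16_t avgw(uint16_t a, uint16_t b)  { return (uint16_t)((a + b + 1) >> 1); }

    /* old sequence: psubw, paddw x,x, pmulhw, paddw, pmulhrsw by pw_2048 */
    static int old_path(int a, int b, int16_t my_coef) {
        int16_t d = (int16_t)(b - a);
        d = (int16_t)(d + d);
        d = mulhw(d, my_coef);                          /* (my * (b - a)) >> 4 */
        return mulhrsw((int16_t)(a + d), 2048);         /* (a + d + 8) >> 4    */
    }

    /* new sequence: h coefficients doubled (paddb m5, m5), so inputs arrive as 2*a, 2*b */
    static int new_path(int a, int b, int16_t my_coef) {
        uint16_t a2 = (uint16_t)(2 * a), b2 = (uint16_t)(2 * b);
        int16_t d = mulhw((int16_t)(b2 - a2), my_coef); /* (my * (b - a)) >> 4 */
        return (uint16_t)(avgw(a2, 15) + d) >> 4;       /* pavgw adds the +8   */
    }

    int main(void) {
        for (int my = 1; my < 16; my++) {
            const int16_t coef = (int16_t)(my << 11);   /* can't shift by 12: signed overflow */
            for (int a = 0; a <= 16 * 255; a++)         /* h stage output range: 0..4080 */
                for (int b = 0; b <= 16 * 255; b++)
                    assert(old_path(a, b, coef) == new_path(a, b, coef));
        }
        return 0;
    }

The flipped byte order of bilin_h_shuf4/8 goes hand in hand with the new multiplier setup: mx*255 + 16 packs the coefficient bytes as {16-mx, mx} (low, high) where mx*0xff01 + (16<<8) packed {mx, 16-mx}, so pmaddubsw still computes (16-mx)*src[x] + mx*src[x+1] per output pixel.
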
--- a/src/x86/mc_avx2.asm
+++ b/src/x86/mc_avx2.asm
@@ -59,8 +59,8 @@
subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
-bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
@@ -76,6 +76,7 @@
pb_64: times 4 db 64
pw_m256: times 2 dw -256
+pw_15: times 2 dw 15
pw_32: times 2 dw 32
pw_34: times 2 dw 34
pw_258: times 2 dw 258
@@ -201,10 +202,9 @@
SECTION .text
INIT_XMM avx2
-DECLARE_REG_TMP 4, 6, 7
cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
movifnidn mxyd, r6m ; mx
- lea t2, [put_avx2]
+ lea r7, [put_avx2]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
@@ -213,35 +213,35 @@
test mxyd, mxyd
jnz .v
.put:
- movzx wd, word [t2+wq*2+table_offset(put,)]
- add wq, t2
+ movzx wd, word [r7+wq*2+table_offset(put,)]
+ add wq, r7
jmp wq
.put_w2:
- movzx t0d, word [srcq+ssq*0]
- movzx t1d, word [srcq+ssq*1]
+ movzx r6d, word [srcq+ssq*0]
+ movzx r7d, word [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], t0w
- mov [dstq+dsq*1], t1w
+ mov [dstq+dsq*0], r6w
+ mov [dstq+dsq*1], r7w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
- mov t0d, [srcq+ssq*0]
- mov t1d, [srcq+ssq*1]
+ mov r6d, [srcq+ssq*0]
+ mov r7d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], t0d
- mov [dstq+dsq*1], t1d
+ mov [dstq+dsq*0], r6d
+ mov [dstq+dsq*1], r7d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
- mov t0, [srcq+ssq*0]
- mov t1, [srcq+ssq*1]
+ mov r6, [srcq+ssq*0]
+ mov r7, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], t0
- mov [dstq+dsq*1], t1
+ mov [dstq+dsq*0], r6
+ mov [dstq+dsq*1], r7
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
@@ -298,17 +298,17 @@
.h:
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
- imul mxyd, 0xff01
+ imul mxyd, 255
vbroadcasti128 m4, [bilin_h_shuf8]
- add mxyd, 16 << 8
+ add mxyd, 16
movd xm5, mxyd
mov mxyd, r7m ; my
vpbroadcastw m5, xm5
test mxyd, mxyd
jnz .hv
- movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)]
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)]
vpbroadcastd m3, [pw_2048]
- add wq, t2
+ add wq, r7
jmp wq
.h_w2:
movd xm0, [srcq+ssq*0]
@@ -419,10 +419,10 @@
jg .h_w64
RET
.h_w128:
- mov t1, -32*3
+ mov r6, -32*3
.h_w128_loop:
- movu m0, [srcq+t1+32*3+8*0]
- movu m1, [srcq+t1+32*3+8*1]
+ movu m0, [srcq+r6+32*3+8*0]
+ movu m1, [srcq+r6+32*3+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
@@ -430,8 +430,8 @@
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
- mova [dstq+t1+32*3], m0
- add t1, 32
+ mova [dstq+r6+32*3], m0
+ add r6, 32
jle .h_w128_loop
add srcq, ssq
add dstq, dsq
@@ -439,11 +439,11 @@
jg .h_w128
RET
.v:
- movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)]
- imul mxyd, 0xff01
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 255
vpbroadcastd m5, [pw_2048]
- add mxyd, 16 << 8
- add wq, t2
+ add mxyd, 16
+ add wq, r7
movd xm4, mxyd
vpbroadcastw m4, xm4
jmp wq
@@ -454,7 +454,7 @@
lea srcq, [srcq+ssq*2]
pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
pshuflw xm1, xm1, q2301 ; 1 0
- punpcklbw xm1, xm0, xm1
+ punpcklbw xm1, xm0
pmaddubsw xm1, xm4
pmulhrsw xm1, xm5
packuswb xm1, xm1
@@ -467,11 +467,11 @@
.v_w4:
movd xm0, [srcq+ssq*0]
.v_w4_loop:
- vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendd xm2, xm1, xm0, 0x01 ; 0 1
+ vpblendd xm1, xm2, xm0, 0x01 ; 0 1
vpbroadcastd xm0, [srcq+ssq*0]
- vpblendd xm1, xm0, 0x02 ; 1 2
+ vpblendd xm2, xm0, 0x02 ; 1 2
punpcklbw xm1, xm2
pmaddubsw xm1, xm4
pmulhrsw xm1, xm5
@@ -485,11 +485,11 @@
.v_w8:
movq xm0, [srcq+ssq*0]
.v_w8_loop:
- movq xm3, [srcq+ssq*1]
+ movq xm2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw xm1, xm3, xm0
+ punpcklbw xm1, xm0, xm2
movq xm0, [srcq+ssq*0]
- punpcklbw xm2, xm0, xm3
+ punpcklbw xm2, xm0
pmaddubsw xm1, xm4
pmaddubsw xm2, xm4
pmulhrsw xm1, xm5
@@ -504,11 +504,11 @@
.v_w16:
movu xm0, [srcq+ssq*0]
.v_w16_loop:
- vbroadcasti128 m2, [srcq+ssq*1]
+ vbroadcasti128 m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- vpblendd m3, m2, m0, 0x0f ; 0 1
+ vpblendd m2, m3, m0, 0x0f ; 0 1
vbroadcasti128 m0, [srcq+ssq*0]
- vpblendd m2, m0, 0xf0 ; 1 2
+ vpblendd m3, m0, 0xf0 ; 1 2
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m1, m4
@@ -528,8 +528,8 @@
%%loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
- punpckhbw m2, m3, m0
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
movu m0, [srcq+ssq*0]
pmaddubsw m1, m4
pmaddubsw m2, m4
@@ -536,15 +536,15 @@
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
- mova [dstq+dsq*0], m1
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m1, m4
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
pmaddubsw m2, m4
- pmulhrsw m1, m5
+ pmaddubsw m3, m4
pmulhrsw m2, m5
- packuswb m1, m2
- mova [dstq+dsq*1], m1
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
@@ -557,8 +557,8 @@
.v_w64_loop:
add srcq, ssq
movu m3, [srcq+32*0]
- punpcklbw m2, m3, m0
- punpckhbw m0, m3, m0
+ punpcklbw m2, m0, m3
+ punpckhbw m0, m3
pmaddubsw m2, m4
pmaddubsw m0, m4
pmulhrsw m2, m5
@@ -567,8 +567,8 @@
mova m0, m3
movu m3, [srcq+32*1]
mova [dstq+32*0], m2
- punpcklbw m2, m3, m1
- punpckhbw m1, m3, m1
+ punpcklbw m2, m1, m3
+ punpckhbw m1, m3
pmaddubsw m2, m4
pmaddubsw m1, m4
pmulhrsw m2, m5
@@ -581,28 +581,29 @@
jg .v_w64_loop
RET
.v_w128:
- mov t0, dstq
- mov t1, srcq
- lea t2d, [hq+(3<<8)]
+ lea r6d, [hq+(3<<8)]
+ mov r4, srcq
+ mov r7, dstq
.v_w128_loop:
PUT_BILIN_V_W32
- movzx hd, t2b
- add t0, 32
- add t1, 32
- mov dstq, t0
- mov srcq, t1
- sub t2d, 1<<8
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
jg .v_w128_loop
RET
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
- movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
+ movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
- vpbroadcastd m7, [pw_2048]
+ vpbroadcastd m7, [pw_15]
movd xm6, mxyd
- add wq, t2
+ add wq, r7
+ paddb m5, m5
vpbroadcastw m6, xm6
jmp wq
.hv_w2:
@@ -618,10 +619,10 @@
shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _
mova xm0, xm1
psubw xm1, xm2
- paddw xm1, xm1
pmulhw xm1, xm6
+ pavgw xm2, xm7
paddw xm1, xm2
- pmulhrsw xm1, xm7
+ psrlw xm1, 4
packuswb xm1, xm1
pextrw [dstq+dsq*0], xm1, 0
pextrw [dstq+dsq*1], xm1, 2
@@ -643,10 +644,10 @@
shufps xm2, xm0, xm1, q1032 ; 0 1
mova xm0, xm1
psubw xm1, xm2
- paddw xm1, xm1
pmulhw xm1, xm6
+ pavgw xm2, xm7
paddw xm1, xm2
- pmulhrsw xm1, xm7
+ psrlw xm1, 4
packuswb xm1, xm1
movd [dstq+dsq*0], xm1
pextrd [dstq+dsq*1], xm1, 1
@@ -667,10 +668,10 @@
vperm2i128 m2, m0, m1, 0x21 ; 0 1
mova m0, m1
psubw m1, m2
- paddw m1, m1
pmulhw m1, m6
+ pavgw m2, m7
paddw m1, m2
- pmulhrsw m1, m7
+ psrlw m1, 4
vextracti128 xm2, m1, 1
packuswb xm1, xm2
movq [dstq+dsq*0], xm1
@@ -694,16 +695,16 @@
pshufb m3, m4
pmaddubsw m2, m5
psubw m1, m2, m0
- paddw m1, m1
pmulhw m1, m6
+ pavgw m0, m7
paddw m1, m0
pmaddubsw m0, m3, m5
psubw m3, m0, m2
- paddw m3, m3
pmulhw m3, m6
+ pavgw m2, m7
paddw m3, m2
- pmulhrsw m1, m7
- pmulhrsw m3, m7
+ psrlw m1, 4
+ psrlw m3, 4
packuswb m1, m3
vpermq m1, m1, q3120
mova [dstq+dsq*0], xm1
@@ -712,19 +713,21 @@
sub hd, 2
jg .hv_w16_loop
RET
+.hv_w128:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w32_start
+.hv_w64:
+ lea r6d, [hq+(1<<16)]
+.hv_w32_start:
+ mov r4, srcq
+ mov r7, dstq
.hv_w32:
- xor t2d, t2d
-.hv_w32gt:
- mov t0, dstq
- mov t1, srcq
%if WIN64
movaps r4m, xmm8
%endif
.hv_w32_loop0:
movu m0, [srcq+8*0]
- vinserti128 m0, [srcq+8*2], 1
movu m1, [srcq+8*1]
- vinserti128 m1, [srcq+8*3], 1
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
@@ -731,53 +734,44 @@
pmaddubsw m1, m5
.hv_w32_loop:
add srcq, ssq
- movu xm2, [srcq+8*1]
- vinserti128 m2, [srcq+8*3], 1
+ movu m2, [srcq+8*0]
+ movu m3, [srcq+8*1]
pshufb m2, m4
+ pshufb m3, m4
pmaddubsw m2, m5
- psubw m3, m2, m1
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m1
- mova m1, m2
- pmulhrsw m8, m3, m7
- movu xm2, [srcq+8*0]
- vinserti128 m2, [srcq+8*2], 1
- pshufb m2, m4
- pmaddubsw m2, m5
- psubw m3, m2, m0
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m0
+ pmaddubsw m3, m5
+ psubw m8, m2, m0
+ pmulhw m8, m6
+ pavgw m0, m7
+ paddw m8, m0
mova m0, m2
- pmulhrsw m3, m7
- packuswb m3, m8
- mova [dstq], m3
+ psubw m2, m3, m1
+ pmulhw m2, m6
+ pavgw m1, m7
+ paddw m2, m1
+ mova m1, m3
+ psrlw m8, 4
+ psrlw m2, 4
+ packuswb m8, m2
+ mova [dstq], m8
add dstq, dsq
dec hd
jg .hv_w32_loop
- movzx hd, t2b
- add t0, 32
- add t1, 32
- mov dstq, t0
- mov srcq, t1
- sub t2d, 1<<8
+ add r4, 32
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<16
jg .hv_w32_loop0
%if WIN64
movaps xmm8, r4m
%endif
RET
-.hv_w64:
- lea t2d, [hq+(1<<8)]
- jmp .hv_w32gt
-.hv_w128:
- lea t2d, [hq+(3<<8)]
- jmp .hv_w32gt
-DECLARE_REG_TMP 3, 5, 6
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
- lea t2, [prep%+SUFFIX]
+ lea r6, [prep%+SUFFIX]
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
@@ -786,8 +780,8 @@
test mxyd, mxyd
jnz .v
.prep:
- movzx wd, word [t2+wq*2+table_offset(prep,)]
- add wq, t2
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ add wq, r6
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
@@ -906,16 +900,16 @@
.h:
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
- imul mxyd, 0xff01
+ imul mxyd, 255
vbroadcasti128 m4, [bilin_h_shuf8]
- add mxyd, 16 << 8
+ add mxyd, 16
movd xm5, mxyd
mov mxyd, r6m ; my
vpbroadcastw m5, xm5
test mxyd, mxyd
jnz .hv
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
- add wq, t2
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
lea stride3q, [strideq*3]
jmp wq
.h_w4:
@@ -1079,10 +1073,10 @@
RET
.v:
WIN64_SPILL_XMM 7
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
- add wq, t2
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 255
+ add mxyd, 16
+ add wq, r6
lea stride3q, [strideq*3]
movd xm6, mxyd
vpbroadcastw m6, xm6
@@ -1100,9 +1094,9 @@
vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
vpblendd m1, m3, 0xaa ; 0 1 2 3
vpblendd m2, m3, 0x55 ; 1 2 3 4
- punpcklbw m2, m1
- pmaddubsw m2, m6
- mova [tmpq], m2
+ punpcklbw m1, m2
+ pmaddubsw m1, m6
+ mova [tmpq], m1
add tmpq, 32
sub hd, 4
jg .v_w4_loop
@@ -1116,15 +1110,15 @@
lea srcq, [srcq+strideq*4]
vpblendd m1, m0, 0x03 ; 0 2 2 2
vpbroadcastq m0, [srcq+strideq*0]
- vpblendd m3, m2, 0x33 ; 1 3 1 3
- vpblendd m2, m1, m3, 0x0f ; 1 3 2 2
- vpblendd m1, m3, 0xf0 ; 0 2 1 3
- vpblendd m2, m0, 0xc0 ; 1 3 2 4
- punpcklbw m3, m2, m1
- punpckhbw m2, m1
- pmaddubsw m3, m6
+ vpblendd m2, m3, 0xcc ; 1 3 1 3
+ vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2
+ vpblendd m2, m1, 0x0f ; 0 2 1 3
+ vpblendd m3, m0, 0xc0 ; 1 3 2 4
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m6
pmaddubsw m2, m6
- mova [tmpq+32*0], m3
+ mova [tmpq+32*0], m1
mova [tmpq+32*1], m2
add tmpq, 32*2
sub hd, 4
@@ -1133,25 +1127,25 @@
.v_w16:
vbroadcasti128 m0, [srcq+strideq*0]
.v_w16_loop:
- vbroadcasti128 m1, [srcq+strideq*2]
- vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
vbroadcasti128 m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- shufpd m4, m0, m1, 0x0c ; 0 2 ; 0l2l 0h2h
+ shufpd m4, m0, m2, 0x0c ; 0 2
vbroadcasti128 m0, [srcq+strideq*0]
- shufpd m2, m2, m3, 0x0c ; 1 3 ; 1l3l 1h3h
- shufpd m1, m1, m0, 0x0c ; 2 4 ; 2l4l 2h4h
- punpcklbw m3, m2, m4
+ shufpd m1, m3, 0x0c ; 1 3
+ shufpd m2, m0, 0x0c ; 2 4
+ punpcklbw m3, m4, m1
punpcklbw m5, m1, m2
+ punpckhbw m4, m1
punpckhbw m1, m2
- punpckhbw m2, m4
pmaddubsw m3, m6
pmaddubsw m5, m6
- pmaddubsw m2, m6
+ pmaddubsw m4, m6
pmaddubsw m1, m6
mova [tmpq+32*0], m3
mova [tmpq+32*1], m5
- mova [tmpq+32*2], m2
+ mova [tmpq+32*2], m4
mova [tmpq+32*3], m1
add tmpq, 32*4
sub hd, 4
@@ -1164,32 +1158,32 @@
vpermq m2, [srcq+strideq*2], q3120
vpermq m3, [srcq+stride3q ], q3120
lea srcq, [srcq+strideq*4]
- punpcklbw m4, m1, m0
- punpckhbw m5, m1, m0
+ punpcklbw m4, m0, m1
+ punpckhbw m5, m0, m1
vpermq m0, [srcq+strideq*0], q3120
pmaddubsw m4, m6
pmaddubsw m5, m6
mova [tmpq+32*0], m4
mova [tmpq+32*1], m5
- punpcklbw m4, m2, m1
- punpckhbw m5, m2, m1
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ punpcklbw m5, m2, m3
+ punpckhbw m2, m3
pmaddubsw m5, m6
+ pmaddubsw m2, m6
mova [tmpq+32*2], m4
- mova [tmpq+32*3], m5
+ mova [tmpq+32*3], m1
add tmpq, 32*8
- punpcklbw m4, m3, m2
- punpckhbw m5, m3, m2
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m4, m6
- pmaddubsw m5, m6
+ punpcklbw m1, m3, m0
+ punpckhbw m3, m0
pmaddubsw m1, m6
- pmaddubsw m2, m6
- mova [tmpq-32*4], m4
- mova [tmpq-32*3], m5
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m5
+ mova [tmpq-32*3], m2
mova [tmpq-32*2], m1
- mova [tmpq-32*1], m2
+ mova [tmpq-32*1], m3
sub hd, 4
jg .v_w32_loop
RET
@@ -1200,14 +1194,14 @@
vpermq m2, [srcq+strideq*1+32*0], q3120
vpermq m3, [srcq+strideq*1+32*1], q3120
lea srcq, [srcq+strideq*2]
- punpcklbw m4, m2, m0
- punpckhbw m5, m2, m0
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
pmaddubsw m4, m6
- pmaddubsw m5, m6
+ pmaddubsw m0, m6
mova [tmpq+32*0], m4
- mova [tmpq+32*1], m5
- punpcklbw m4, m3, m1
- punpckhbw m5, m3, m1
+ mova [tmpq+32*1], m0
+ punpcklbw m4, m1, m3
+ punpckhbw m5, m1, m3
vpermq m0, [srcq+strideq*0+32*0], q3120
vpermq m1, [srcq+strideq*0+32*1], q3120
pmaddubsw m4, m6
@@ -1215,52 +1209,52 @@
mova [tmpq+32*2], m4
mova [tmpq+32*3], m5
add tmpq, 32*8
- punpcklbw m4, m0, m2
- punpckhbw m5, m0, m2
- punpcklbw m2, m1, m3
- punpckhbw m3, m1, m3
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
+ punpcklbw m5, m3, m1
+ punpckhbw m3, m1
pmaddubsw m4, m6
- pmaddubsw m5, m6
pmaddubsw m2, m6
+ pmaddubsw m5, m6
pmaddubsw m3, m6
mova [tmpq-32*4], m4
- mova [tmpq-32*3], m5
- mova [tmpq-32*2], m2
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m5
mova [tmpq-32*1], m3
sub hd, 2
jg .v_w64_loop
RET
.v_w128:
- mov t0, tmpq
- mov t1, srcq
- lea t2d, [hq+(3<<8)]
+ lea r6d, [hq+(3<<8)]
+ mov r3, srcq
+ mov r5, tmpq
.v_w128_loop0:
vpermq m0, [srcq+strideq*0], q3120
.v_w128_loop:
vpermq m1, [srcq+strideq*1], q3120
lea srcq, [srcq+strideq*2]
- punpcklbw m2, m1, m0
- punpckhbw m3, m1, m0
+ punpcklbw m2, m0, m1
+ punpckhbw m3, m0, m1
vpermq m0, [srcq+strideq*0], q3120
- punpcklbw m4, m0, m1
- punpckhbw m5, m0, m1
pmaddubsw m2, m6
pmaddubsw m3, m6
+ punpcklbw m4, m1, m0
+ punpckhbw m1, m0
pmaddubsw m4, m6
- pmaddubsw m5, m6
+ pmaddubsw m1, m6
mova [tmpq+32*0], m2
mova [tmpq+32*1], m3
mova [tmpq+32*8], m4
- mova [tmpq+32*9], m5
+ mova [tmpq+32*9], m1
add tmpq, 32*16
sub hd, 2
jg .v_w128_loop
- movzx hd, t2b
- add t0, 64
- add t1, 32
- mov tmpq, t0
- mov srcq, t1
- sub t2d, 1<<8
+ add r3, 32
+ add r5, 64
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
jg .v_w128_loop0
RET
.hv:
@@ -1268,11 +1262,11 @@
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 7
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
shl mxyd, 11
movd xm6, mxyd
vpbroadcastw m6, xm6
- add wq, t2
+ add wq, r6
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
@@ -1388,10 +1382,19 @@
dec hd
jg .hv_w32_loop
RET
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r6d, 256
+ jmp .hv_w64_start
.hv_w64:
- mov t0, tmpq
- mov t1, srcq
- lea t2d, [hq+(3<<8)]
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 128
+.hv_w64_start:
+%if WIN64
+ PUSH r7
+%endif
+ mov r5, srcq
+ mov r7, tmpq
.hv_w64_loop0:
movu xm0, [srcq+strideq*0+8*0]
vinserti128 m0, [srcq+strideq*0+8*1], 1
@@ -1413,57 +1416,22 @@
psubw m2, m0, m1
pmulhrsw m2, m6
paddw m2, m1
- mova [tmpq+32*0], m3
- add tmpq, 32*8
- mova [tmpq-32*4], m2
+ mova [tmpq+r6*0], m3
+ mova [tmpq+r6*1], m2
+ lea tmpq, [tmpq+r6*2]
sub hd, 2
jg .hv_w64_loop
- movzx hd, t2b
- add t0, 32
- add t1, 16
- mov tmpq, t0
- mov srcq, t1
- sub t2d, 1<<8
+ add r5, 16
+ add r7, 32
+ movzx hd, r3b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r3d, 1<<8
jg .hv_w64_loop0
+%if WIN64
+ POP r7
+%endif
RET
-.hv_w128:
- mov t0, tmpq
- mov t1, srcq
- lea t2d, [hq+(7<<8)]
-.hv_w128_loop0:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, [srcq+strideq*0+8*1], 1
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w128_loop:
- movu xm1, [srcq+strideq*1+8*0]
- vinserti128 m1, [srcq+strideq*1+8*1], 1
- lea srcq, [srcq+strideq*2]
- movu xm2, [srcq+strideq*0+8*0]
- vinserti128 m2, [srcq+strideq*0+8*1], 1
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5
- psubw m3, m1, m0
- pmulhrsw m3, m6
- paddw m3, m0
- pmaddubsw m0, m2, m5
- psubw m2, m0, m1
- pmulhrsw m2, m6
- paddw m2, m1
- mova [tmpq+32*0], m3
- mova [tmpq+32*8], m2
- add tmpq, 32*16
- sub hd, 2
- jg .hv_w128_loop
- movzx hd, t2b
- add t0, 32
- add t1, 16
- mov tmpq, t0
- mov srcq, t1
- sub t2d, 1<<8
- jg .hv_w128_loop0
- RET
; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
@@ -1676,12 +1644,12 @@
movd xm2, [srcq+ssq*0]
pinsrw xm2, [srcq+ssq*1], 2
pinsrw xm2, [srcq+ssq*2], 4
- pinsrw xm2, [srcq+ss3q ], 6 ; 0 1 2 3
- lea srcq, [srcq+ssq*4]
- movd xm3, [srcq+ssq*0]
- vpbroadcastd xm1, [srcq+ssq*1]
- vpbroadcastd xm0, [srcq+ssq*2]
add srcq, ss3q
+ pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm3, xm1, 0x02 ; 4 5
vpblendd xm1, xm0, 0x02 ; 5 6
palignr xm4, xm3, xm2, 4 ; 1 2 3 4
@@ -1696,10 +1664,10 @@
mova xm2, xm3
pmaddubsw xm3, xm10 ; a2 b2
paddw xm5, xm3
- vpbroadcastd xm4, [srcq+ssq*0]
- vpblendd xm3, xm0, xm4, 0x02 ; 6 7
- vpbroadcastd xm0, [srcq+ssq*1]
+ vpbroadcastd xm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm4, xm0, 0x02 ; 7 8
punpcklbw xm3, xm4 ; 67 78
pmaddubsw xm4, xm3, xm11 ; a3 b3
@@ -1716,12 +1684,12 @@
movd xm2, [srcq+ssq*0]
pinsrd xm2, [srcq+ssq*1], 1
pinsrd xm2, [srcq+ssq*2], 2
- pinsrd xm2, [srcq+ss3q ], 3 ; 0 1 2 3
- lea srcq, [srcq+ssq*4]
- movd xm3, [srcq+ssq*0]
- vpbroadcastd xm1, [srcq+ssq*1]
- vpbroadcastd xm0, [srcq+ssq*2]
add srcq, ss3q
+ pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm3, xm1, 0x02 ; 4 5
vpblendd xm1, xm0, 0x02 ; 5 6
palignr xm4, xm3, xm2, 4 ; 1 2 3 4
@@ -1736,10 +1704,10 @@
mova xm2, xm3
pmaddubsw xm3, xm10 ; a2 b2
paddw xm5, xm3
- vpbroadcastd xm4, [srcq+ssq*0]
- vpblendd xm3, xm0, xm4, 0x02 ; 6 7
- vpbroadcastd xm0, [srcq+ssq*1]
+ vpbroadcastd xm4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm4, xm0, 0x02 ; 7 8
punpcklbw xm3, xm4 ; 67 78
pmaddubsw xm4, xm3, xm11 ; a3 b3
@@ -1756,12 +1724,12 @@
movq xm1, [srcq+ssq*0]
vpbroadcastq m4, [srcq+ssq*1]
vpbroadcastq m2, [srcq+ssq*2]
- vpbroadcastq m5, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpbroadcastq m3, [srcq+ssq*0]
- vpbroadcastq m6, [srcq+ssq*1]
- vpbroadcastq m0, [srcq+ssq*2]
add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m0, [srcq+ssq*0]
vpblendd m1, m4, 0x30
vpblendd m4, m2, 0x30
punpcklbw m1, m4 ; 01 12
@@ -1772,6 +1740,8 @@
vpblendd m6, m0, 0x30
punpcklbw m3, m6 ; 45 56
.v_w8_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
pmaddubsw m5, m1, m8 ; a0 b0
mova m1, m2
pmaddubsw m2, m9 ; a1 b1
@@ -1779,10 +1749,8 @@
mova m2, m3
pmaddubsw m3, m10 ; a2 b2
paddw m5, m3
- vpbroadcastq m4, [srcq+ssq*0]
vpblendd m3, m0, m4, 0x30
- vpbroadcastq m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ vpbroadcastq m0, [srcq+ssq*0]
vpblendd m4, m0, 0x30
punpcklbw m3, m4 ; 67 78
pmaddubsw m4, m3, m11 ; a3 b3
@@ -1800,30 +1768,28 @@
.v_w32:
.v_w64:
.v_w128:
- lea r6d, [wq-16]
- mov r4, dstq
- mov r7, srcq
- shl r6d, 4
- mov r6b, hb
+ lea r6d, [wq*8-128]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*2]
.v_w16_loop0:
vbroadcasti128 m4, [srcq+ssq*0]
vbroadcasti128 m5, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m0, [srcq+ssq*1]
- vbroadcasti128 m6, [srcq+ssq*0]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m1, [srcq+ssq*0]
- vbroadcasti128 m2, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vbroadcasti128 m1, [srcq+ssq*1]
+ vbroadcasti128 m2, [srcq+ssq*2]
+ add srcq, ss3q
vbroadcasti128 m3, [srcq+ssq*0]
- shufpd m4, m4, m0, 0x0c
- shufpd m5, m5, m1, 0x0c
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
punpcklbw m1, m4, m5 ; 01
punpckhbw m4, m5 ; 34
- shufpd m6, m6, m2, 0x0c
+ shufpd m6, m2, 0x0c
punpcklbw m2, m5, m6 ; 12
punpckhbw m5, m6 ; 45
- shufpd m0, m0, m3, 0x0c
+ shufpd m0, m3, 0x0c
punpcklbw m3, m6, m0 ; 23
punpckhbw m6, m0 ; 56
.v_w16_loop:
@@ -1861,11 +1827,11 @@
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w16_loop
- movzx hd, r6b
add r4, 16
add r7, 16
- mov dstq, r4
- mov srcq, r7
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
sub r6d, 1<<8
jg .v_w16_loop0
RET
@@ -1898,12 +1864,12 @@
movq xm2, [srcq+ssq*0]
movhps xm2, [srcq+ssq*1]
movq xm0, [srcq+ssq*2]
- movhps xm0, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpbroadcastq m3, [srcq+ssq*0]
- vpbroadcastq m4, [srcq+ssq*1]
- vpbroadcastq m1, [srcq+ssq*2]
add srcq, ss3q
+ movhps xm0, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
vpblendd m2, m3, 0x30
vpblendd m0, m1, 0x30
vpblendd m2, m4, 0xc0
@@ -1920,6 +1886,11 @@
pshufd xm0, xm3, q2121
punpcklwd xm3, xm0 ; 45 56
.hv_w2_loop:
+ movq xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm4, [srcq+ssq*0]
+ pshufb xm4, xm6
+ pmaddubsw xm4, xm7
pmaddwd xm5, xm1, xm10 ; a0 b0
mova xm1, xm2
pmaddwd xm2, xm11 ; a1 b1
@@ -1926,14 +1897,9 @@
paddd xm5, xm2
mova xm2, xm3
pmaddwd xm3, xm12 ; a2 b2
- paddd xm5, xm3
- movq xm4, [srcq+ssq*0]
- movhps xm4, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb xm4, xm6
- pmaddubsw xm4, xm7
phaddw xm4, xm4
pmulhrsw xm4, xm8
+ paddd xm5, xm3
palignr xm3, xm4, xm0, 12
mova xm0, xm4
punpcklwd xm3, xm0 ; 67 78
@@ -1954,13 +1920,13 @@
vpbroadcastq m2, [srcq+ssq*0]
vpbroadcastq m4, [srcq+ssq*1]
vpbroadcastq m0, [srcq+ssq*2]
- vpbroadcastq m5, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpbroadcastq m3, [srcq+ssq*0]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
vpblendd m2, m4, 0xcc ; 0 1
- vpbroadcastq m4, [srcq+ssq*1]
- vpbroadcastq m1, [srcq+ssq*2]
+ vpbroadcastq m4, [srcq+ssq*2]
add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
vpblendd m0, m5, 0xcc ; 2 3
vpblendd m3, m4, 0xcc ; 4 5
pshufb m2, m6
@@ -1981,6 +1947,8 @@
pshufd m0, m3, q2121
punpcklwd m3, m0 ; 45 56
.hv_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
pmaddwd m5, m1, m10 ; a0 b0
mova m1, m2
pmaddwd m2, m11 ; a1 b1
@@ -1988,9 +1956,7 @@
mova m2, m3
pmaddwd m3, m12 ; a2 b2
paddd m5, m3
- vpbroadcastq m4, [srcq+ssq*0]
- vpbroadcastq m3, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ vpbroadcastq m3, [srcq+ssq*0]
vpblendd m4, m3, 0xcc ; 7 8
pshufb m4, m6
pmaddubsw m4, m7
@@ -2031,25 +1997,23 @@
pshufd m13, m0, q1111
pshufd m14, m0, q2222
pshufd m15, m0, q3333
- lea r6d, [wq-8]
- mov r4, dstq
- mov r7, srcq
- shl r6d, 5
- mov r6b, hb
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
.hv_w8_loop0:
vbroadcasti128 m7, [subpel_h_shufA]
- vbroadcasti128 m8, [subpel_h_shufB]
- vbroadcasti128 m9, [subpel_h_shufC]
movu xm4, [srcq+ssq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
movu xm5, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- movu xm6, [srcq+ssq*0]
- vbroadcasti128 m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
vpblendd m4, m0, 0xf0 ; 0 3
- vinserti128 m5, [srcq+ssq*0], 1 ; 1 4
- vinserti128 m6, [srcq+ssq*1], 1 ; 2 5
- lea srcq, [srcq+ssq*2]
+ vinserti128 m5, [srcq+ssq*1], 1 ; 1 4
+ vinserti128 m6, [srcq+ssq*2], 1 ; 2 5
+ add srcq, ss3q
vinserti128 m0, [srcq+ssq*0], 1 ; 3 6
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
pshufb %3, %1, %6
@@ -2130,11 +2094,11 @@
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
- movzx hd, r6b
add r4, 8
add r7, 8
- mov dstq, r4
- mov srcq, r7
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
sub r6d, 1<<8
jg .hv_w8_loop0
RET
@@ -2153,48 +2117,6 @@
pmulhrsw m0, m4
%endmacro
-%macro PREP_8TAP_V_W4 5 ; round, weights
- movd xm0, [srcq+strideq*0]
- vpbroadcastd m1, [srcq+strideq*2]
- vpbroadcastd xm2, [srcq+strideq*1]
- vpbroadcastd m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
- vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
- vpbroadcastd m0, [srcq+strideq*0]
- vpbroadcastd m2, [srcq+strideq*1]
- vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
- vpbroadcastd m0, [srcq+strideq*2]
- vbroadcasti128 m5, [deint_shuf4]
- vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
- vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
- vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
- punpcklbw m1, m2, m3 ; 01 12 23 34
- vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
- punpckhbw m2, m3 ; 23 34 45 56
-.v_w4_loop:
- pinsrd xm0, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- vpbroadcastd m3, [srcq+strideq*0]
- vpbroadcastd m4, [srcq+strideq*1]
- vpblendd m3, m4, 0x20 ; _ _ 8 _ 8 9 _ _
- vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 9 _ _
- vpbroadcastd m0, [srcq+strideq*2]
- vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
- pshufb m3, m5 ; 67 78 89 9a
- pmaddubsw m4, m1, m%2
- vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
- pmaddubsw m2, m%3
- paddw m4, m2
- mova m2, m3
- pmaddubsw m3, m%5
- paddw m3, m4
- pmaddubsw m4, m1, m%4
- paddw m3, m4
- pmulhrsw m3, m%1
- mova [tmpq], m3
-%endmacro
-
%if WIN64
DECLARE_REG_TMP 6, 4
%else
@@ -2347,7 +2269,45 @@
jg .v_w16
je .v_w8
.v_w4:
- PREP_8TAP_V_W4 7, 8, 9, 10, 11
+ movd xm0, [srcq+strideq*0]
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ add srcq, stride3q
+ vpbroadcastd m3, [srcq+strideq*0]
+ vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd m0, [srcq+strideq*1]
+ vpbroadcastd m2, [srcq+strideq*2]
+ vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vbroadcasti128 m5, [deint_shuf4]
+ vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw m1, m2, m3 ; 01 12 23 34
+ vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw m2, m3 ; 23 34 45 56
+.v_w4_loop:
+ lea srcq, [srcq+strideq*4]
+ pinsrd xm0, [srcq+strideq*0], 1
+ vpbroadcastd m3, [srcq+strideq*1]
+ vpbroadcastd m4, [srcq+strideq*2]
+ vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _
+ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb m3, m5 ; 67 78 89 9a
+ pmaddubsw m4, m1, m8
+ vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
+ pmaddubsw m2, m9
+ paddw m4, m2
+ mova m2, m3
+ pmaddubsw m3, m11
+ paddw m3, m4
+ pmaddubsw m4, m1, m10
+ paddw m3, m4
+ pmulhrsw m3, m7
+ mova [tmpq], m3
add tmpq, 32
sub hd, 4
jg .v_w4_loop
@@ -2406,11 +2366,10 @@
jg .v_w8_loop
RET
.v_w16:
- lea r6d, [wq-16]
- mov r5, tmpq
- mov r7, srcq
- shl r6d, 4
- mov r6b, hb
+ add wd, wd
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+wq*8-256]
.v_w16_loop0:
vbroadcasti128 m4, [srcq+strideq*0]
vbroadcasti128 m5, [srcq+strideq*1]
@@ -2461,15 +2420,15 @@
pmulhrsw m14, m7
pmulhrsw m15, m7
mova [tmpq+wq*0], m14
- mova [tmpq+wq*2], m15
- lea tmpq, [tmpq+wq*4]
+ mova [tmpq+wq*1], m15
+ lea tmpq, [tmpq+wq*2]
sub hd, 2
jg .v_w16_loop
+ add r5, 16
+ add r7, 32
movzx hd, r6b
- add r5, 32
- add r7, 16
- mov tmpq, r5
- mov srcq, r7
+ mov srcq, r5
+ mov tmpq, r7
sub r6d, 1<<8
jg .v_w16_loop0
RET
@@ -2557,8 +2516,8 @@
vpbroadcastq m2, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
paddd m6, m4
- paddd m5, m3
vpbroadcastq m4, [srcq+strideq*0]
+ paddd m5, m3
vpbroadcastq m3, [srcq+strideq*1]
vpblendd m2, m4, 0xcc
vpbroadcastq m4, [srcq+strideq*2]
@@ -2591,18 +2550,17 @@
jg .hv_w4_loop
RET
.hv_w8:
- lea r6d, [wq-8]
- mov r5, tmpq
- mov r7, srcq
- shl r6d, 5
- mov r6b, hb
+ lea r6d, [wq*8-64]
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+r6*4]
.hv_w8_loop0:
vbroadcasti128 m7, [subpel_h_shufA]
- vbroadcasti128 m8, [subpel_h_shufB]
- vbroadcasti128 m9, [subpel_h_shufC]
movu xm4, [srcq+strideq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
movu xm5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m9, [subpel_h_shufC]
movu xm6, [srcq+strideq*0]
vbroadcasti128 m0, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
@@ -2676,11 +2634,11 @@
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .hv_w8_loop
+ add r5, 8
+ add r7, 16
movzx hd, r6b
- add r5, 16
- add r7, 8
- mov tmpq, r5
- mov srcq, r7
+ mov srcq, r5
+ mov tmpq, r7
sub r6d, 1<<8
jg .hv_w8_loop0
RET
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -57,8 +57,8 @@
subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
-bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
pb_8x0_8x8: times 8 db 0
@@ -77,6 +77,7 @@
pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_8: times 8 dw 8
+pw_15: times 8 dw 15
pw_26: times 8 dw 26
pw_34: times 8 dw 34
pw_512: times 8 dw 512
@@ -220,16 +221,18 @@
DECLARE_REG_TMP 7
%define base 0
%endif
-;
+
%macro RESTORE_DSQ_32 1
%if ARCH_X86_32
mov %1, dsm ; restore dsq
%endif
%endmacro
-;
-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
+
+cglobal put_bilin, 1, 8, 0, dst, ds, src, ss, w, h, mxy
movifnidn mxyd, r6m ; mx
LEA t0, put_ssse3
+ movifnidn srcq, srcmp
+ movifnidn ssq, ssmp
tzcnt wd, wm
mov hd, hm
test mxyd, mxyd
@@ -335,20 +338,19 @@
.h:
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
- imul mxyd, 0xff01
+ imul mxyd, 0x00ff00ff
mova m4, [base+bilin_h_shuf8]
mova m0, [base+bilin_h_shuf4]
- add mxyd, 16 << 8
+ add mxyd, 0x00100010
movd m5, mxyd
mov mxyd, r7m ; my
- pshuflw m5, m5, q0000
- punpcklqdq m5, m5
+ pshufd m5, m5, q0000
test mxyd, mxyd
jnz .hv
movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
mova m3, [base+pw_2048]
add wq, t0
- RESTORE_DSQ_32 t0
+ movifnidn dsq, dsmp
jmp wq
.h_w2:
pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
@@ -485,14 +487,13 @@
RET
.v:
movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
- imul mxyd, 0xff01
+ imul mxyd, 0x00ff00ff
mova m5, [base+pw_2048]
- add mxyd, 16 << 8
+ add mxyd, 0x00100010
add wq, t0
movd m4, mxyd
- pshuflw m4, m4, q0000
- punpcklqdq m4, m4
- RESTORE_DSQ_32 t0
+ pshufd m4, m4, q0000
+ movifnidn dsq, dsmp
jmp wq
.v_w2:
movd m0, [srcq+ssq*0]
@@ -499,9 +500,9 @@
.v_w2_loop:
pinsrw m0, [srcq+ssq*1], 1 ; 0 1
lea srcq, [srcq+ssq*2]
- pshuflw m2, m0, q2301
+ pshuflw m1, m0, q2301
pinsrw m0, [srcq+ssq*0], 0 ; 2 1
- punpcklbw m1, m0, m2
+ punpcklbw m1, m0
pmaddubsw m1, m4
pmulhrsw m1, m5
packuswb m1, m1
@@ -516,11 +517,12 @@
.v_w4:
movd m0, [srcq+ssq*0]
.v_w4_loop:
- movd m1, [srcq+ssq*1]
+ movd m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpckldq m2, m0, m1 ; 0 1
+ mova m1, m0
movd m0, [srcq+ssq*0]
- punpckldq m1, m0 ; 1 2
+ punpckldq m1, m2 ; 0 1
+ punpckldq m2, m0 ; 1 2
punpcklbw m1, m2
pmaddubsw m1, m4
pmulhrsw m1, m5
@@ -536,11 +538,12 @@
.v_w8:
movq m0, [srcq+ssq*0]
.v_w8_loop:
- movq m3, [srcq+ssq*1]
+ movq m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
+ mova m1, m0
movq m0, [srcq+ssq*0]
- punpcklbw m2, m0, m3
+ punpcklbw m1, m2
+ punpcklbw m2, m0
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
@@ -552,66 +555,69 @@
sub hd, 2
jg .v_w8_loop
RET
- ;
%macro PUT_BILIN_V_W16 0
movu m0, [srcq+ssq*0]
%%loop:
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
- punpckhbw m2, m3, m0
+ mova m1, m0
+ mova m2, m0
movu m0, [srcq+ssq*0]
+ punpcklbw m1, m3
+ punpckhbw m2, m3
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
- mova [dstq+dsq*0], m1
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m1, m4
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
pmaddubsw m2, m4
- pmulhrsw m1, m5
+ pmaddubsw m3, m4
pmulhrsw m2, m5
- packuswb m1, m2
- mova [dstq+dsq*1], m1
+ pmulhrsw m3, m5
+ packuswb m2, m3
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
%endmacro
- ;
.v_w16:
PUT_BILIN_V_W16
RET
+.v_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .v_w16gt
+.v_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .v_w16gt
+.v_w32:
+ lea r6d, [hq+(1<<16)]
.v_w16gt:
- mov r4, dstq
- mov r6, srcq
+ mov r4, srcq
+%if ARCH_X86_64
+ mov r7, dstq
+%endif
.v_w16gt_loop:
-%if ARCH_X86_32
- mov bakm, t0q
- RESTORE_DSQ_32 t0
PUT_BILIN_V_W16
- mov t0q, bakm
+%if ARCH_X86_64
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
%else
- PUT_BILIN_V_W16
+ mov dstq, dstmp
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstmp, dstq
%endif
- mov hw, t0w
- add r4, mmsize
- add r6, mmsize
- mov dstq, r4
- mov srcq, r6
- sub t0d, 1<<16
+ sub r6d, 1<<16
jg .v_w16gt
RET
-.v_w32:
- lea t0d, [hq+(1<<16)]
- jmp .v_w16gt
-.v_w64:
- lea t0d, [hq+(3<<16)]
- jmp .v_w16gt
-.v_w128:
- lea t0d, [hq+(7<<16)]
- jmp .v_w16gt
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
@@ -618,32 +624,33 @@
movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
- mova m7, [base+pw_2048]
+ mova m7, [base+pw_15]
movd m6, mxyd
add wq, t0
pshuflw m6, m6, q0000
+ paddb m5, m5
punpcklqdq m6, m6
jmp wq
.hv_w2:
RESTORE_DSQ_32 t0
movd m0, [srcq+ssq*0]
- pshufd m0, m0, q0000 ; src[x - src_stride]
+ punpckldq m0, m0
pshufb m0, m4
pmaddubsw m0, m5
.hv_w2_loop:
- movd m1, [srcq+ssq*1] ; src[x]
+ movd m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- movhps m1, [srcq+ssq*0] ; src[x + src_stride]
- pshufd m1, m1, q3120
+ movd m2, [srcq+ssq*0]
+ punpckldq m1, m2
pshufb m1, m4
pmaddubsw m1, m5 ; 1 _ 2 _
shufps m2, m0, m1, q1032 ; 0 _ 1 _
mova m0, m1
- psubw m1, m2 ; src[x + src_stride] - src[x]
- paddw m1, m1
- pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])
- paddw m1, m2 ; src[x] + (my * (src[x + src_stride] - src[x])
- pmulhrsw m1, m7
+ psubw m1, m2 ; 2 * (src[x + src_stride] - src[x])
+ pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])) >> 4
+ pavgw m2, m7 ; src[x] + 8
+ paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
+ psrlw m1, 4
packuswb m1, m1
%if ARCH_X86_64
movq r6, m1
@@ -660,8 +667,8 @@
RET
.hv_w4:
mova m4, [base+bilin_h_shuf4]
- RESTORE_DSQ_32 t0
movddup xm0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
pshufb m0, m4
pmaddubsw m0, m5
.hv_w4_loop:
@@ -669,14 +676,14 @@
lea srcq, [srcq+ssq*2]
movhps m1, [srcq+ssq*0]
pshufb m1, m4
- pmaddubsw m1, m5 ; 1 2
+ pmaddubsw m1, m5 ; 1 2
shufps m2, m0, m1, q1032 ; 0 1
mova m0, m1
psubw m1, m2
- paddw m1, m1
pmulhw m1, m6
+ pavgw m2, m7
paddw m1, m2
- pmulhrsw m1, m7
+ psrlw m1, 4
packuswb m1, m1
movd [dstq+dsq*0], m1
psrlq m1, 32
@@ -686,28 +693,28 @@
jg .hv_w4_loop
RET
.hv_w8:
- RESTORE_DSQ_32 t0
- movu m0, [srcq+ssq*0+8*0]
+ movu m0, [srcq+ssq*0]
+ movifnidn dsq, dsmp
pshufb m0, m4
pmaddubsw m0, m5
.hv_w8_loop:
- movu m2, [srcq+ssq*1+8*0]
+ movu m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb m2, m4
pmaddubsw m2, m5
psubw m1, m2, m0
- paddw m1, m1
pmulhw m1, m6
+ pavgw m0, m7
paddw m1, m0
- movu m0, [srcq+ssq*0+8*0]
+ movu m0, [srcq+ssq*0]
pshufb m0, m4
pmaddubsw m0, m5
psubw m3, m0, m2
- paddw m3, m3
pmulhw m3, m6
+ pavgw m2, m7
paddw m3, m2
- pmulhrsw m1, m7
- pmulhrsw m3, m7
+ psrlw m1, 4
+ psrlw m3, 4
packuswb m1, m3
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
@@ -715,27 +722,34 @@
sub hd, 2
jg .hv_w8_loop
RET
+.hv_w128:
+ lea r6d, [hq+(7<<16)]
+ jmp .hv_w16_start
+.hv_w64:
+ lea r6d, [hq+(3<<16)]
+ jmp .hv_w16_start
+.hv_w32:
+ lea r6d, [hq+(1<<16)]
+.hv_w16_start:
+ mov r4, srcq
+%if ARCH_X86_32
+ %define m8 [dstq]
+%else
+ mov r7, dstq
+%endif
.hv_w16:
- xor t0d, t0d
-.hv_w16gt:
- mov r4, dstq
- mov r6, srcq
- %if WIN64
- movaps r4m, xmm8
- %endif
+ movifnidn dsq, dsmp
+%if WIN64
+ movaps r4m, m8
+%endif
.hv_w16_loop0:
- movu m0, [srcq+8*0]
- movu m1, [srcq+8*1]
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
.hv_w16_loop:
-%if ARCH_X86_32
- %define m0tmp [dstq]
-%else
- %define m0tmp m8
-%endif
add srcq, ssq
movu m2, [srcq+8*0]
movu m3, [srcq+8*1]
@@ -743,62 +757,51 @@
pshufb m3, m4
pmaddubsw m2, m5
pmaddubsw m3, m5
- mova m0tmp, m2
+ mova m8, m2
psubw m2, m0
- paddw m2, m2
pmulhw m2, m6
+ pavgw m0, m7
paddw m2, m0
mova m0, m3
psubw m3, m1
- paddw m3, m3
pmulhw m3, m6
+ pavgw m1, m7
paddw m3, m1
mova m1, m0
- mova m0, m0tmp
- pmulhrsw m2, m7
- pmulhrsw m3, m7
+ mova m0, m8
+ psrlw m2, 4
+ psrlw m3, 4
packuswb m2, m3
mova [dstq], m2
add dstq, dsmp
dec hd
jg .hv_w16_loop
- movzx hd, t0w
- add r4, mmsize
- add r6, mmsize
- mov dstq, r4
- mov srcq, r6
- sub t0d, 1<<16
- jg .hv_w16_loop0
- %if WIN64
- movaps xmm8, r4m
- %endif
+%if ARCH_X86_32
+ mov dstq, dstm
+ add r4, 16
+ movzx hd, r6w
+ add dstq, 16
+ mov srcq, r4
+ mov dstm, dstq
+%else
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+%endif
+ sub r6d, 1<<16
+ jg .hv_w16_loop0
+%if WIN64
+ movaps m8, r4m
+%endif
RET
-.hv_w32:
- lea t0d, [hq+(1<<16)]
- jmp .hv_w16gt
-.hv_w64:
- lea t0d, [hq+(3<<16)]
- jmp .hv_w16gt
-.hv_w128:
- lea t0d, [hq+(7<<16)]
- jmp .hv_w16gt
-%macro PSHUFB_0X1X 1-2 ; dst[, src]
- %if cpuflag(ssse3)
- pshufb %1, %2
- %else
- punpcklbw %1, %1
- psraw %1, 8
- pshufd %1, %1, q0000
- %endif
-%endmacro
-
%macro PSHUFB_BILIN_H8 2 ; dst, src
%if cpuflag(ssse3)
pshufb %1, %2
%else
- mova %2, %1
- psrldq %1, 1
+ psrldq %2, %1, 1
punpcklbw %1, %2
%endif
%endmacro
@@ -807,8 +810,7 @@
%if cpuflag(ssse3)
pshufb %1, %2
%else
- mova %2, %1
- psrldq %1, 1
+ psrldq %2, %1, 1
punpckhbw %3, %1, %2
punpcklbw %1, %2
punpcklqdq %1, %3
@@ -845,17 +847,15 @@
%endmacro
%macro PREP_BILIN 0
-
-DECLARE_REG_TMP 3, 5, 6
%if ARCH_X86_32
- %define base t2-prep%+SUFFIX
+ %define base r6-prep%+SUFFIX
%else
- %define base 0
+ %define base 0
%endif
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
movifnidn mxyd, r5m ; mx
- LEA t2, prep%+SUFFIX
+ LEA r6, prep%+SUFFIX
tzcnt wd, wm
movifnidn hd, hm
test mxyd, mxyd
@@ -865,11 +865,12 @@
jnz .v
.prep:
%if notcpuflag(ssse3)
- add t2, prep_ssse3 - prep_sse2
+ add r6, prep_ssse3 - prep_sse2
jmp prep_ssse3
%else
- movzx wd, word [t2+wq*2+table_offset(prep,)]
- add wq, t2
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ pxor m4, m4
+ add wq, r6
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
@@ -877,17 +878,16 @@
movd m1, [srcq+strideq*1]
movd m2, [srcq+strideq*2]
movd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
punpckldq m0, m1
punpckldq m2, m3
- lea srcq, [srcq+strideq*4]
- pxor m1, m1
- punpcklbw m0, m1
- punpcklbw m2, m1
+ punpcklbw m0, m4
+ punpcklbw m2, m4
psllw m0, 4
psllw m2, 4
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m2
- add tmpq, 32
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
sub hd, 4
jg .prep_w4
RET
@@ -897,7 +897,6 @@
movq m2, [srcq+strideq*2]
movq m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- pxor m4, m4
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
@@ -915,16 +914,13 @@
jg .prep_w8
RET
.prep_w16:
- movq m0, [srcq+strideq*0+8*0]
- movq m1, [srcq+strideq*0+8*1]
- movq m2, [srcq+strideq*1+8*0]
- movq m3, [srcq+strideq*1+8*1]
+ movu m1, [srcq+strideq*0]
+ movu m3, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- pxor m4, m4
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
psllw m0, 4
psllw m1, 4
psllw m2, 4
@@ -937,27 +933,25 @@
sub hd, 2
jg .prep_w16
RET
-.prep_w32:
- mov t2d, 1
- jmp .prep_w32_vloop
-.prep_w64:
- mov t2d, 2
- jmp .prep_w32_vloop
.prep_w128:
- mov t2d, 4
+ mov r3, -128
+ jmp .prep_w32_start
+.prep_w64:
+ mov r3, -64
+ jmp .prep_w32_start
+.prep_w32:
+ mov r3, -32
+.prep_w32_start:
+ sub srcq, r3
.prep_w32_vloop:
- mov t1q, srcq
- mov r3d, t2d
+ mov r6, r3
.prep_w32_hloop:
- movq m0, [t1q+8*0]
- movq m1, [t1q+8*1]
- movq m2, [t1q+8*2]
- movq m3, [t1q+8*3]
- pxor m4, m4
- punpcklbw m0, m4
- punpcklbw m1, m4
- punpcklbw m2, m4
- punpcklbw m3, m4
+ movu m1, [srcq+r6+16*0]
+ movu m3, [srcq+r6+16*1]
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
psllw m0, 4
psllw m1, 4
psllw m2, 4
@@ -967,10 +961,9 @@
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
- add t1q, 32
- dec r3d
- jg .prep_w32_hloop
- lea srcq, [srcq+strideq]
+ add r6, 32
+ jl .prep_w32_hloop
+ add srcq, strideq
dec hd
jg .prep_w32_vloop
RET
@@ -978,40 +971,31 @@
.h:
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
- imul mxyd, 0xff01
%if cpuflag(ssse3)
+ imul mxyd, 0x00ff00ff
mova m4, [base+bilin_h_shuf8]
+ add mxyd, 0x00100010
+%else
+ imul mxyd, 0xffff
+ add mxyd, 16
%endif
- add mxyd, 16 << 8
movd m5, mxyd
mov mxyd, r6m ; my
-%if cpuflag(ssse3)
- pshuflw m5, m5, q0000
- punpcklqdq m5, m5
-%else
- PSHUFB_0X1X m5
-%endif
+ pshufd m5, m5, q0000
test mxyd, mxyd
jnz .hv
-%if ARCH_X86_32
- mov t1, t2 ; save base reg for w4
-%endif
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
%if notcpuflag(ssse3)
WIN64_SPILL_XMM 8
pxor m6, m6
%endif
- add wq, t2
- lea stride3q, [strideq*3]
+ add wq, r6
jmp wq
.h_w4:
%if cpuflag(ssse3)
- %if ARCH_X86_32
- mova m4, [t1-prep_ssse3+bilin_h_shuf4]
- %else
- mova m4, [bilin_h_shuf4]
- %endif
+ mova m4, [base+bilin_h_shuf4]
%endif
+ lea stride3q, [strideq*3]
.h_w4_loop:
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
@@ -1029,6 +1013,8 @@
jg .h_w4_loop
RET
.h_w8:
+ lea stride3q, [strideq*3]
+.h_w8_loop:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*2]
@@ -1048,7 +1034,7 @@
mova [tmpq+16*3], m3
add tmpq, 16*4
sub hd, 4
- jg .h_w8
+ jg .h_w8_loop
RET
.h_w16:
movu m0, [srcq+strideq*0+8*0]
@@ -1072,22 +1058,23 @@
sub hd, 2
jg .h_w16
RET
-.h_w32:
- mov t2d, 1 << 0
- jmp .h_w32_vloop
-.h_w64:
- mov t2d, 1 << 1
- jmp .h_w32_vloop
.h_w128:
- mov t2d, 1 << 3
+ mov r3, -128
+ jmp .h_w32_start
+.h_w64:
+ mov r3, -64
+ jmp .h_w32_start
+.h_w32:
+ mov r3, -32
+.h_w32_start:
+ sub srcq, r3
.h_w32_vloop:
- mov t1q, srcq
- mov r3d, t2d
+ mov r6, r3
.h_w32_hloop:
- movu m0, [t1q+8*0]
- movu m1, [t1q+8*1]
- movu m2, [t1q+8*2]
- movu m3, [t1q+8*3]
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ movu m2, [srcq+r6+8*2]
+ movu m3, [srcq+r6+8*3]
PSHUFB_BILIN_H8 m0, m4
PSHUFB_BILIN_H8 m1, m4
PSHUFB_BILIN_H8 m2, m4
@@ -1101,11 +1088,10 @@
mova [tmpq+16*2], m2
mova [tmpq+16*3], m3
add tmpq, 16*4
- add t1q, 32
- shr r3d, 1
- jnz .h_w32_hloop
- lea srcq, [srcq+strideq]
- sub hd, 1
+ add r6, 32
+ jl .h_w32_hloop
+ add srcq, strideq
+ dec hd
jg .h_w32_vloop
RET
.v:
@@ -1113,19 +1099,19 @@
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
%endif
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
- add wq, t2
- lea stride3q, [strideq*3]
- movd m5, mxyd
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
%if cpuflag(ssse3)
- pshuflw m5, m5, q0000
- punpcklqdq m5, m5
+ imul mxyd, 0x00ff00ff
+ add mxyd, 0x00100010
%else
- PSHUFB_0X1X m5
+ imul mxyd, 0xffff
pxor m6, m6
+ add mxyd, 16
%endif
+ add wq, r6
+ lea stride3q, [strideq*3]
+ movd m5, mxyd
+ pshufd m5, m5, q0000
jmp wq
.v_w4:
movd m0, [srcq+strideq*0]
@@ -1134,20 +1120,18 @@
movd m2, [srcq+strideq*2]
movd m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- punpcklwd m0, m1 ; 0 1 _ _
- punpcklwd m1, m2 ; 1 2 _ _
- punpcklbw m1, m0
- PMADDUBSW m1, m5, m6, m7, 0
- pshufd m1, m1, q3120
- mova [tmpq+16*0], m1
+ punpckldq m0, m1
+ punpckldq m1, m2
+ punpcklbw m0, m1 ; 01 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
movd m0, [srcq+strideq*0]
- punpcklwd m2, m3 ; 2 3 _ _
- punpcklwd m3, m0 ; 3 4 _ _
- punpcklbw m3, m2
- PMADDUBSW m3, m5, m6, m7, 0
- pshufd m3, m3, q3120
- mova [tmpq+16*1], m3
- add tmpq, 32
+ punpckldq m2, m3
+ punpckldq m3, m0
+ punpcklbw m2, m3 ; 23 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
sub hd, 4
jg .v_w4_loop
RET
@@ -1154,26 +1138,23 @@
.v_w8:
movq m0, [srcq+strideq*0]
.v_w8_loop:
- movq m1, [srcq+strideq*2]
- movq m2, [srcq+strideq*1]
+ movq m1, [srcq+strideq*1]
+ movq m2, [srcq+strideq*2]
movq m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
- shufpd m4, m0, m1, 0x0c ; 0 2
+ punpcklbw m0, m1 ; 01
+ punpcklbw m1, m2 ; 12
+ PMADDUBSW m0, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*0], m0
movq m0, [srcq+strideq*0]
- shufpd m2, m3, 0x0c ; 1 3
- shufpd m1, m0, 0x0c ; 2 4
- punpcklbw m3, m2, m4
+ punpcklbw m2, m3 ; 23
+ punpcklbw m3, m0 ; 34
+ PMADDUBSW m2, m5, m6, m7, 0
+ mova [tmpq+16*1], m1
PMADDUBSW m3, m5, m6, m7, 0
- mova [tmpq+16*0], m3
- punpckhbw m3, m2, m4
- PMADDUBSW m3, m5, m6, m7, 0
- mova [tmpq+16*2], m3
- punpcklbw m3, m1, m2
- punpckhbw m1, m2
- PMADDUBSW m3, m5, m6, m7, 0
- PMADDUBSW m1, m5, m6, m7, 0
- mova [tmpq+16*1], m3
- mova [tmpq+16*3], m1
+ mova [tmpq+16*2], m2
+ mova [tmpq+16*3], m3
add tmpq, 16*4
sub hd, 4
jg .v_w8_loop
@@ -1183,48 +1164,48 @@
.v_w16_loop:
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*2]
- punpcklbw m3, m1, m0
- punpckhbw m4, m1, m0
- PMADDUBSW m3, m5, m6, m7, 0
- PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*0], m3
- mova [tmpq+16*1], m4
- punpcklbw m3, m2, m1
- punpckhbw m4, m2, m1
- PMADDUBSW m3, m5, m6, m7, 0
- PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*2], m3
- mova [tmpq+16*3], m4
movu m3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m0, m1
+ punpckhbw m0, m1
+ PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
+ mova [tmpq+16*0], m4
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*1], m0
movu m0, [srcq+strideq*0]
- add tmpq, 16*8
- punpcklbw m1, m3, m2
- punpckhbw m4, m3, m2
PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*2], m4
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq-16*4], m1
- mova [tmpq-16*3], m4
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- PMADDUBSW m1, m5, m6, m7, 0
+ mova [tmpq+16*3], m1
PMADDUBSW m2, m5, m6, m7, 0
- mova [tmpq-16*2], m1
- mova [tmpq-16*1], m2
+ mova [tmpq+16*4], m4
+ punpcklbw m4, m3, m0
+ punpckhbw m3, m0
+ PMADDUBSW m4, m5, m6, m7, 0
+ mova [tmpq+16*5], m2
+ PMADDUBSW m3, m5, m6, m7, 0
+ mova [tmpq+16*6], m4
+ mova [tmpq+16*7], m3
+ add tmpq, 16*8
sub hd, 4
jg .v_w16_loop
RET
-.v_w32:
- lea t2d, [hq+(0<<16)]
- mov t0d, 64
+.v_w128:
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 256
jmp .v_w32_start
.v_w64:
- lea t2d, [hq+(1<<16)]
- mov t0d, 128
+ lea r3d, [hq+(1<<8)]
+ mov r6d, 128
jmp .v_w32_start
-.v_w128:
- lea t2d, [hq+(3<<16)]
- mov t0d, 256
+.v_w32:
+ xor r3d, r3d
+ mov r6d, 64
.v_w32_start:
%if ARCH_X86_64
%if WIN64
@@ -1232,7 +1213,7 @@
%endif
mov r7, tmpq
%endif
- mov t1, srcq
+ mov r5, srcq
.v_w32_hloop:
movu m0, [srcq+strideq*0+16*0]
movu m1, [srcq+strideq*0+16*1]
@@ -1240,48 +1221,48 @@
movu m2, [srcq+strideq*1+16*0]
movu m3, [srcq+strideq*1+16*1]
lea srcq, [srcq+strideq*2]
- punpcklbw m4, m2, m0
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m0, m5, m6, m7, 0
mova [tmpq+16*0], m4
- punpckhbw m4, m2, m0
+ mova [tmpq+16*1], m0
+ movu m0, [srcq+strideq*0+16*0]
+ punpcklbw m4, m1, m3
+ punpckhbw m1, m3
PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*1], m4
- punpcklbw m4, m3, m1
- PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m1, m5, m6, m7, 0
mova [tmpq+16*2], m4
- punpckhbw m4, m3, m1
- PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*3], m4
- add tmpq, t0q
- movu m0, [srcq+strideq*0+16*0]
+ mova [tmpq+16*3], m1
movu m1, [srcq+strideq*0+16*1]
- punpcklbw m4, m0, m2
+ add tmpq, r6
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m2, m5, m6, m7, 0
mova [tmpq+16*0], m4
- punpckhbw m4, m0, m2
+ mova [tmpq+16*1], m2
+ punpcklbw m4, m3, m1
+ punpckhbw m3, m1
PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*1], m4
- punpcklbw m4, m1, m3
- PMADDUBSW m4, m5, m6, m7, 0
+ PMADDUBSW m3, m5, m6, m7, 0
mova [tmpq+16*2], m4
- punpckhbw m4, m1, m3
- PMADDUBSW m4, m5, m6, m7, 0
- mova [tmpq+16*3], m4
- add tmpq, t0q
+ mova [tmpq+16*3], m3
+ add tmpq, r6
sub hd, 2
jg .v_w32_vloop
- movzx hd, t2w
- add t1, 32
- mov srcq, t1
+ add r5, 32
+ movzx hd, r3b
+ mov srcq, r5
%if ARCH_X86_64
- add r7, 2*16*2
+ add r7, 16*4
mov tmpq, r7
%else
mov tmpq, tmpmp
- add tmpq, 2*16*2
+ add tmpq, 16*4
mov tmpmp, tmpq
%endif
- sub t2d, 1<<16
+ sub r3d, 1<<8
jg .v_w32_hloop
%if WIN64
POP r7
@@ -1290,71 +1271,56 @@
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
%assign stack_offset stack_offset - stack_size_padded
%if cpuflag(ssse3)
+ imul mxyd, 0x08000800
WIN64_SPILL_XMM 8
%else
- WIN64_SPILL_XMM 10
-%endif
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
-%if cpuflag(ssse3)
- shl mxyd, 11
-%else
+ or mxyd, 1<<16
+ WIN64_SPILL_XMM 9
%if ARCH_X86_64
- mova m8, [pw_8]
+ mova m8, [base+pw_8]
%else
- %define m8 [t1-prep_sse2+pw_8]
+ %define m8 [base+pw_8]
%endif
pxor m7, m7
%endif
movd m6, mxyd
- add wq, t2
- pshuflw m6, m6, q0000
-%if cpuflag(ssse3)
- punpcklqdq m6, m6
-%elif ARCH_X86_64
- psrlw m0, m8, 3
- punpcklwd m6, m0
-%else
- punpcklwd m6, [base+pw_1]
-%endif
-%if ARCH_X86_32
- mov t1, t2 ; save base reg for w4
-%endif
- lea stride3q, [strideq*3]
+ add wq, r6
+ pshufd m6, m6, q0000
jmp wq
.hv_w4:
%if cpuflag(ssse3)
- %if ARCH_X86_32
- mova m4, [t1-prep_ssse3+bilin_h_shuf4]
- %else
- mova m4, [bilin_h_shuf4]
- %endif
-%endif
+ mova m4, [base+bilin_h_shuf4]
+ movddup m0, [srcq+strideq*0]
+%else
movhps m0, [srcq+strideq*0]
+%endif
+ lea r3, [strideq*3]
PSHUFB_BILIN_H4 m0, m4, m3
PMADDUBSW m0, m5, m7, m4, 0 ; _ 0
.hv_w4_loop:
movq m1, [srcq+strideq*1]
movhps m1, [srcq+strideq*2]
- movq m2, [srcq+stride3q ]
+ movq m2, [srcq+r3 ]
lea srcq, [srcq+strideq*4]
movhps m2, [srcq+strideq*0]
PSHUFB_BILIN_H4 m1, m4, m3
PSHUFB_BILIN_H4 m2, m4, m3
PMADDUBSW m1, m5, m7, m4, 0 ; 1 2
- shufpd m3, m0, m1, 0x01 ; 0 1
- mova m0, m2
- PMADDUBSW m0, m5, m7, m4, 0 ; 3 4
- shufpd m2, m1, m0, 0x01 ; 2 3
- psubw m1, m3
+ PMADDUBSW m2, m5, m7, m4, 0 ; 3 4
+ shufpd m0, m1, 0x01 ; 0 1
+ shufpd m3, m1, m2, 0x01 ; 2 3
+ psubw m1, m0
PMULHRSW m1, m6, m4, m8, 4
- paddw m1, m3
- psubw m3, m0, m2
- PMULHRSW m3, m6, m4, m8, 4
- paddw m3, m2
+ paddw m1, m0
+ mova m0, m2
+ psubw m2, m3
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m3
mova [tmpq+16*0], m1
- mova [tmpq+16*1], m3
+ mova [tmpq+16*1], m2
add tmpq, 32
sub hd, 4
jg .hv_w4_loop
@@ -1365,7 +1331,8 @@
PMADDUBSW m0, m5, m7, m4, 0 ; 0
.hv_w8_loop:
movu m1, [srcq+strideq*1]
- movu m2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*2]
+ movu m2, [srcq+strideq*0]
PSHUFB_BILIN_H8 m1, m4
PSHUFB_BILIN_H8 m2, m4
PMADDUBSW m1, m5, m7, m4, 0 ; 1
@@ -1373,68 +1340,40 @@
psubw m3, m1, m0
PMULHRSW m3, m6, m4, m8, 4
paddw m3, m0
-%if notcpuflag(ssse3) && ARCH_X86_64
- SWAP m9, m7
-%endif
- psubw m7, m2, m1
- PMULHRSW m7, m6, m4, m8, 4
- paddw m7, m1
+ mova m0, m2
+ psubw m2, m1
+ PMULHRSW m2, m6, m4, m8, 4
+ paddw m2, m1
mova [tmpq+16*0], m3
- mova [tmpq+16*1], m7
-%if notcpuflag(ssse3) && ARCH_X86_64
- SWAP m7, m9
-%endif
- movu m1, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- movu m0, [srcq+strideq*0]
- PSHUFB_BILIN_H8 m1, m4
- PSHUFB_BILIN_H8 m0, m4
- PMADDUBSW m1, m5, m7, m4, ARCH_X86_32 ; 3
- PMADDUBSW m0, m5, m7, m4, 0 ; 4
- psubw m3, m1, m2
- PMULHRSW m3, m6, m4, m8, 4
- paddw m3, m2
-%if notcpuflag(ssse3) && ARCH_X86_64
- SWAP m9, m7
-%endif
- psubw m7, m0, m1
- PMULHRSW m7, m6, m4, m8, 4
- paddw m7, m1
- mova [tmpq+16*2], m3
- mova [tmpq+16*3], m7
-%if notcpuflag(ssse3)
- %if ARCH_X86_64
- SWAP m7, m9
- %else
- pxor m7, m7
- %endif
-%endif
- add tmpq, 16*4
- sub hd, 4
+ mova [tmpq+16*1], m2
+ add tmpq, 16*2
+ sub hd, 2
jg .hv_w8_loop
RET
-.hv_w16:
- mov t2d, hd
- mov t0d, 32
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r5d, 256
jmp .hv_w16_start
-.hv_w32:
- lea t2d, [hq+(1<<16)]
- mov t0d, 64
- jmp .hv_w16_start
.hv_w64:
- lea t2d, [hq+(3<<16)]
- mov t0d, 128
+ lea r3d, [hq+(3<<8)]
+ mov r5d, 128
jmp .hv_w16_start
-.hv_w128:
- lea t2d, [hq+(7<<16)]
- mov t0d, 256
+.hv_w32:
+ lea r3d, [hq+(1<<8)]
+ mov r5d, 64
+ jmp .hv_w16_start
+.hv_w16:
+ xor r3d, r3d
+ mov r5d, 32
.hv_w16_start:
+%if ARCH_X86_64 || cpuflag(ssse3)
+ mov r6, srcq
+%endif
%if ARCH_X86_64
%if WIN64
PUSH r7
%endif
mov r7, tmpq
- mov r5, srcq
%endif
.hv_w16_hloop:
movu m0, [srcq+strideq*0+8*0]
@@ -1459,7 +1398,7 @@
PMULHRSW m0, m6, m4, m8, 4
paddw m0, m1
mova [tmpq+16*1], m0
- add tmpq, t0q
+ add tmpq, r5
movu m0, [srcq+strideq*0+8*0]
PSHUFB_BILIN_H8 m0, m4
PMADDUBSW m0, m5, m7, m4, 0 ; 2a
@@ -1474,24 +1413,30 @@
PMULHRSW m2, m6, m4, m8, 4
paddw m2, m3
mova [tmpq+16*1], m2
- add tmpq, t0q
+ add tmpq, r5
sub hd, 2
jg .hv_w16_vloop
- movzx hd, t2w
+ movzx hd, r3b
%if ARCH_X86_64
- add r5, 16
+ add r6, 16
add r7, 2*16
- mov srcq, r5
+ mov srcq, r6
mov tmpq, r7
+%elif cpuflag(ssse3)
+ mov tmpq, tmpm
+ add r6, 16
+ add tmpq, 2*16
+ mov srcq, r6
+ mov tmpm, tmpq
%else
- mov srcq, srcmp
- mov tmpq, tmpmp
+ mov srcq, srcm
+ mov tmpq, tmpm
add srcq, 16
add tmpq, 2*16
- mov srcmp, srcq
- mov tmpmp, tmpq
+ mov srcm, srcq
+ mov tmpm, tmpq
%endif
- sub t2d, 1<<16
+ sub r3d, 1<<8
jg .hv_w16_hloop
%if WIN64
POP r7
@@ -1538,13 +1483,9 @@
%if ARCH_X86_32
%define base_reg r1
%define base base_reg-put_ssse3
- %define W32_RESTORE_DSQ mov dsq, dsm
- %define W32_RESTORE_SSQ mov ssq, ssm
%else
%define base_reg r8
%define base 0
- %define W32_RESTORE_DSQ
- %define W32_RESTORE_SSQ
%endif
cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
@@ -1575,10 +1516,9 @@
add wq, base_reg
; put_bilin mangling jump
%assign stack_offset org_stack_offset
-%if ARCH_X86_32
- mov dsq, dsm
- mov ssq, ssm
-%elif WIN64
+ movifnidn dsq, dsmp
+ movifnidn ssq, ssmp
+%if WIN64
pop r8
%endif
lea r6, [ssq*3]
@@ -1590,7 +1530,7 @@
test myd, 0xf00
%endif
jnz .hv
- W32_RESTORE_SSQ
+ movifnidn ssq, ssmp
WIN64_SPILL_XMM 12
cmp wd, 4
jl .h_w2
@@ -1604,11 +1544,10 @@
shr mxd, 16
sub srcq, 3
movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)]
- movd m5, [base_reg+mxq*8+subpel_filters-put_ssse3+0]
- pshufd m5, m5, q0000
- movd m6, [base_reg+mxq*8+subpel_filters-put_ssse3+4]
- pshufd m6, m6, q0000
+ movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3]
mova m7, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
add wq, base_reg
jmp wq
.h_w2:
@@ -1620,9 +1559,9 @@
dec srcq
mova m4, [base+subpel_h_shuf4]
movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
- pshufd m3, m3, q0000
mova m5, [base+pw_34] ; 2 + (8 << 2)
- W32_RESTORE_DSQ
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
.h_w2_loop:
movq m0, [srcq+ssq*0]
movhps m0, [srcq+ssq*1]
@@ -1633,10 +1572,10 @@
paddw m0, m5 ; pw34
psraw m0, 6
packuswb m0, m0
- movd r4d, m0
- mov [dstq+dsq*0], r4w
- shr r4d, 16
- mov [dstq+dsq*1], r4w
+ movd r6d, m0
+ mov [dstq+dsq*0], r6w
+ shr r6d, 16
+ mov [dstq+dsq*1], r6w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2_loop
@@ -1649,10 +1588,10 @@
%endif
dec srcq
movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
- pshufd m3, m3, q0000
- mova m5, [base+pw_34] ; 2 + (8 << 2)
mova m6, [base+subpel_h_shufA]
- W32_RESTORE_DSQ
+ mova m5, [base+pw_34] ; 2 + (8 << 2)
+ pshufd m3, m3, q0000
+ movifnidn dsq, dsmp
.h_w4_loop:
movq m0, [srcq+ssq*0] ; 1
movq m1, [srcq+ssq*1] ; 2
@@ -1672,7 +1611,6 @@
sub hd, 2
jg .h_w4_loop
RET
- ;
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
%if ARCH_X86_32
pshufb %2, %1, [base+subpel_h_shufB]
@@ -1693,18 +1631,17 @@
paddw %1, m7 ; pw34
psraw %1, 6
%endmacro
- ;
.h_w8:
- movu m0, [srcq+ssq*0]
- movu m1, [srcq+ssq*1]
- PUT_8TAP_H m0, m2, m3, m4
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H m0, m2, m3, m4
PUT_8TAP_H m1, m2, m3, m4
packuswb m0, m1
%if ARCH_X86_32
- movq [dstq ], m0
+ movq [dstq], m0
add dstq, dsm
- movhps [dstq ], m0
+ movhps [dstq], m0
add dstq, dsm
%else
movq [dstq+dsq*0], m0
@@ -1714,22 +1651,23 @@
sub hd, 2
jg .h_w8
RET
-.h_w16:
- xor r6d, r6d
- jmp .h_start
-.h_w32:
- mov r6, -16*1
- jmp .h_start
-.h_w64:
- mov r6, -16*3
- jmp .h_start
.h_w128:
- mov r6, -16*7
-.h_start:
- sub srcq, r6
- sub dstq, r6
- mov r4, r6
-.h_loop:
+ mov r4, -16*7
+ jmp .h_w16_start
+.h_w64:
+ mov r4, -16*3
+ jmp .h_w16_start
+.h_w32:
+ mov r4, -16*1
+ jmp .h_w16_start
+.h_w16:
+ xor r4d, r4d
+.h_w16_start:
+ sub srcq, r4
+ sub dstq, r4
+.h_w16_loop_v:
+ mov r6, r4
+.h_w16_loop_h:
movu m0, [srcq+r6+8*0]
movu m1, [srcq+r6+8*1]
PUT_8TAP_H m0, m2, m3, m4
@@ -1736,17 +1674,12 @@
PUT_8TAP_H m1, m2, m3, m4
packuswb m0, m1
mova [dstq+r6], m0
- add r6, mmsize
- jle .h_loop
+ add r6, 16
+ jle .h_w16_loop_h
add srcq, ssq
-%if ARCH_X86_32
- add dstq, dsm
-%else
- add dstq, dsq
-%endif
- mov r6, r4
+ add dstq, dsmp
dec hd
- jg .h_loop
+ jg .h_w16_loop_v
RET
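
Note: the restructured .h_w16/.h_w32/.h_w64/.h_w128 entry points share one loop by biasing the pointers with a negative offset that counts up to zero, so a single add/jle both advances within the row and terminates it. A minimal C sketch of the same idea (illustrative only; process_16_pixels is a made-up stand-in for one PUT_8TAP_H step):

#include <stdint.h>
#include <string.h>

/* Made-up stand-in for filtering one 16-pixel block. */
static void process_16_pixels(uint8_t *dst, const uint8_t *src) { memcpy(dst, src, 16); }

/* Sketch of the .h_w16_start pointer-bias trick (illustrative only). */
static void filter_row(uint8_t *dst, const uint8_t *src, int w)
{
    intptr_t r = -16 * (w / 16 - 1);   /* 0 for w=16 ... -16*7 for w=128, cf. "mov r4, -16*7" */
    src -= r;                          /* cf. "sub srcq, r4" */
    dst -= r;                          /* cf. "sub dstq, r4" */
    do {
        process_16_pixels(dst + r, src + r);
        r += 16;                       /* cf. "add r6, 16" */
    } while (r <= 0);                  /* cf. "jle .h_w16_loop_h" */
}
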
.v:
%if ARCH_X86_32
@@ -1754,7 +1687,7 @@
shr ssd, 16
cmp hd, 6
cmovs ssd, mxd
- lea ssq, [base_reg+ssq*8+subpel_filters-put_ssse3]
+ movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
%else
%assign stack_offset org_stack_offset
WIN64_SPILL_XMM 16
@@ -1762,12 +1695,12 @@
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- lea myq, [base_reg+myq*8+subpel_filters-put_ssse3]
+ movq m0, [base_reg+myq*8+subpel_filters-put_ssse3]
%endif
tzcnt r6d, wd
movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)]
+ punpcklwd m0, m0
mova m7, [base+pw_512]
- psrlw m2, m7, 1 ; 0x0100
add r6, base_reg
%if ARCH_X86_32
%define subpel0 [rsp+mmsize*0]
@@ -1775,20 +1708,16 @@
%define subpel2 [rsp+mmsize*2]
%define subpel3 [rsp+mmsize*3]
%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
- ALLOC_STACK -mmsize*4
+ ALLOC_STACK -16*4
%assign regs_used 7
- movd m0, [ssq+0]
- pshufb m0, m2
- mova subpel0, m0
- movd m0, [ssq+2]
- pshufb m0, m2
- mova subpel1, m0
- movd m0, [ssq+4]
- pshufb m0, m2
- mova subpel2, m0
- movd m0, [ssq+6]
- pshufb m0, m2
- mova subpel3, m0
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
mov ssq, [rstk+stack_offset+gprsize*4]
lea ssq, [ssq*3]
sub srcq, ssq
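
Note: the vertical filter setup now fetches all eight taps with one movq, duplicates each adjacent pair with punpcklwd, and splats the pairs with pshufd, instead of four separate movd/pshufb round trips. A rough SSE2-intrinsics model of that splat (illustrative only; the function name is mine):

#include <stdint.h>
#include <emmintrin.h>  /* SSE2 */

/* Broadcast adjacent 8-bit tap pairs for use with pmaddubsw (illustrative only). */
static void broadcast_tap_pairs(const int8_t f[8], __m128i v[4])
{
    __m128i c = _mm_loadl_epi64((const __m128i *)f);       /* f0..f7, cf. "movq m0, [...]" */
    c = _mm_unpacklo_epi16(c, c);                          /* cf. "punpcklwd m0, m0" */
    v[0] = _mm_shuffle_epi32(c, _MM_SHUFFLE(0, 0, 0, 0));  /* {f0,f1} repeated, cf. q0000 */
    v[1] = _mm_shuffle_epi32(c, _MM_SHUFFLE(1, 1, 1, 1));  /* {f2,f3}, cf. q1111 */
    v[2] = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 2, 2, 2));  /* {f4,f5}, cf. q2222 */
    v[3] = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 3, 3, 3));  /* {f6,f7}, cf. q3333 */
}

Each v[k] then feeds one pmaddubsw against a pair of interleaved source rows.
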
@@ -1799,47 +1728,46 @@
%define subpel1 m9
%define subpel2 m10
%define subpel3 m11
- movd subpel0, [myq+0]
- pshufb subpel0, m2
- movd subpel1, [myq+2]
- pshufb subpel1, m2
- movd subpel2, [myq+4]
- pshufb subpel2, m2
- movd subpel3, [myq+6]
- pshufb subpel3, m2
lea ss3q, [ssq*3]
+ pshufd m8, m0, q0000
sub srcq, ss3q
+ pshufd m9, m0, q1111
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
%endif
jmp r6
.v_w2:
- movd m2, [srcq+ssq*0] ; 0
- pinsrw m2, [srcq+ssq*1], 2 ; 0 1
- pinsrw m2, [srcq+ssq*2], 4 ; 0 1 2
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- add srcq, ssq
- pinsrw m2, [srcq+ssq*0], 6 ; 0 1 2 3
- add srcq, ssq
-%else
- pinsrw m2, [srcq+ss3q ], 6 ; 0 1 2 3
- lea srcq, [srcq+ssq*4]
-%endif
- movd m3, [srcq+ssq*0] ; 4
- movd m1, [srcq+ssq*1] ; 5
- movd m0, [srcq+ssq*2] ; 6
-%if ARCH_X86_32
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- add srcq, ssq
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
%else
+ movd m2, [srcq+ssq*2]
add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
%endif
- punpckldq m3, m1 ; 4 5 _ _
- punpckldq m1, m0 ; 5 6 _ _
- palignr m4, m3, m2, 4 ; 1 2 3 4
- punpcklbw m3, m1 ; 45 56
- punpcklbw m1, m2, m4 ; 01 12
- punpckhbw m2, m4 ; 23 34
+ punpcklwd m1, m0 ; 0 1
+ punpcklwd m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpcklwd m2, m5 ; 2 3
+ punpcklwd m5, m3 ; 3 4
+ punpcklwd m3, m4 ; 4 5
+ punpcklwd m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
.v_w2_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
pmaddubsw m5, m1, subpel0 ; a0 b0
mova m1, m2
pmaddubsw m2, subpel1 ; a1 b1
@@ -1847,17 +1775,14 @@
mova m2, m3
pmaddubsw m3, subpel2 ; a2 b2
paddw m5, m3
- movd m4, [srcq+ssq*0] ; 7
- punpckldq m3, m0, m4 ; 6 7 _ _
- movd m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpckldq m4, m0 ; 7 8 _ _
+ punpcklwd m3, m0, m4 ; 6 7
+ movd m0, [srcq+ssq*0]
+ punpcklwd m4, m0 ; 7 8
punpcklbw m3, m4 ; 67 78
pmaddubsw m4, m3, subpel3 ; a3 b3
paddw m5, m4
pmulhrsw m5, m7
packuswb m5, m5
- pshuflw m5, m5, q2020
movd r6d, m5
mov [dstq+dsq*0], r6w
shr r6d, 16
@@ -1873,51 +1798,46 @@
.v_w32:
.v_w64:
.v_w128:
-%endif ; ARCH_X86_32
- lea r6d, [wq - 4] ; horizontal loop
- mov r4, dstq
-%if ARCH_X86_32
-%if STACK_ALIGNMENT < mmsize
- %define srcm [rsp+mmsize*4+gprsize]
+ shl wd, 14
+%if STACK_ALIGNMENT < 16
+ %define dstm [rsp+mmsize*4+gprsize]
+ mov dstm, dstq
%endif
- mov srcm, srcq
-%else
- mov r7, srcq
-%endif
- shl r6d, (16 - 2) ; (wq / 4) << 16
- mov r6w, hw
+ lea r6d, [hq+wq-(1<<16)]
+ mov r4, srcq
.v_w4_loop0:
- movd m2, [srcq+ssq*0] ; 0
- movhps m2, [srcq+ssq*2] ; 0 _ 2
- movd m3, [srcq+ssq*1] ; 1
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
- movhps m3, [srcq+ssq*0] ; 1 _ 3
- lea srcq, [srcq+ssq*1]
-%else
- movhps m3, [srcq+ss3q ] ; 1 _ 3
- lea srcq, [srcq+ssq*4]
%endif
- pshufd m2, m2, q2020 ; 0 2 0 2
- pshufd m3, m3, q2020 ; 1 3 1 3
- punpckldq m2, m3 ; 0 1 2 3
- movd m3, [srcq+ssq*0] ; 4
- movd m1, [srcq+ssq*1] ; 5
- movd m0, [srcq+ssq*2] ; 6
+ movd m1, [srcq+ssq*0]
+ movd m0, [srcq+ssq*1]
%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- add srcq, ssq
+ movd m2, [srcq+ssq*0]
+ movd m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movd m3, [srcq+ssq*0]
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
%else
+ movd m2, [srcq+ssq*2]
add srcq, ss3q
+ movd m5, [srcq+ssq*0]
+ movd m3, [srcq+ssq*1]
+ movd m4, [srcq+ssq*2]
+ add srcq, ss3q
%endif
- punpckldq m3, m1 ; 4 5 _ _
- punpckldq m1, m0 ; 5 6 _ _
- palignr m4, m3, m2, 4 ; 1 2 3 4
- punpcklbw m3, m1 ; 45 56
- punpcklbw m1, m2, m4 ; 01 12
- punpckhbw m2, m4 ; 23 34
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+ssq*0]
+ punpckldq m2, m5 ; 2 3
+ punpckldq m5, m3 ; 3 4
+ punpckldq m3, m4 ; 4 5
+ punpckldq m4, m0 ; 5 6
+ punpcklbw m2, m5 ; 23 34
+ punpcklbw m3, m4 ; 45 56
.v_w4_loop:
+ movd m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
pmaddubsw m5, m1, subpel0 ; a0 b0
mova m1, m2
pmaddubsw m2, subpel1 ; a1 b1
@@ -1925,10 +1845,8 @@
mova m2, m3
pmaddubsw m3, subpel2 ; a2 b2
paddw m5, m3
- movd m4, [srcq+ssq*0]
punpckldq m3, m0, m4 ; 6 7 _ _
- movd m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
+ movd m0, [srcq+ssq*0]
punpckldq m4, m0 ; 7 8 _ _
punpcklbw m3, m4 ; 67 78
pmaddubsw m4, m3, subpel3 ; a3 b3
@@ -1936,24 +1854,21 @@
pmulhrsw m5, m7
packuswb m5, m5
movd [dstq+dsq*0], m5
- pshufd m5, m5, q0101
+ psrlq m5, 32
movd [dstq+dsq*1], m5
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
- mov hw, r6w ; reset vertical loop
- add r4, 4
- mov dstq, r4
%if ARCH_X86_32
- mov srcq, srcm
- add srcq, 4
- mov srcm, srcq
-%else
- add r7, 4
- mov srcq, r7
-%endif
- sub r6d, 1<<16 ; horizontal--
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
+ sub r6d, 1<<16
jg .v_w4_loop0
+%endif
RET
%if ARCH_X86_64
.v_w8:
@@ -1961,56 +1876,51 @@
.v_w32:
.v_w64:
.v_w128:
- lea r6d, [wq - 8] ; horizontal loop
- mov r4, dstq
- mov r7, srcq
- shl r6d, 8 - 3; (wq / 8) << 8
- mov r6b, hb
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
.v_w8_loop0:
- movq m4, [srcq+ssq*0] ; 0
- movq m5, [srcq+ssq*1] ; 1
- lea srcq, [srcq+ssq*2]
- movq m6, [srcq+ssq*0] ; 2
- movq m0, [srcq+ssq*1] ; 3
- lea srcq, [srcq+ssq*2]
- movq m1, [srcq+ssq*0] ; 4
- movq m2, [srcq+ssq*1] ; 5
- lea srcq, [srcq+ssq*2] ;
- movq m3, [srcq+ssq*0] ; 6
- shufpd m4, m0, 0x0c
- shufpd m5, m1, 0x0c
- punpcklbw m1, m4, m5 ; 01
- punpckhbw m4, m5 ; 34
- shufpd m6, m2, 0x0c
- punpcklbw m2, m5, m6 ; 12
- punpckhbw m5, m6 ; 45
- shufpd m0, m3, 0x0c
- punpcklbw m3, m6, m0 ; 23
- punpckhbw m6, m0 ; 56
+ movq m1, [srcq+ssq*0]
+ movq m2, [srcq+ssq*1]
+ movq m3, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m4, [srcq+ssq*0]
+ movq m5, [srcq+ssq*1]
+ movq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ movq m0, [srcq+ssq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
.v_w8_loop:
- movq m12, [srcq+ssq*1] ; 8
+ movq m13, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
- movq m13, [srcq+ssq*0] ; 9
pmaddubsw m14, m1, subpel0 ; a0
- pmaddubsw m15, m2, subpel0 ; b0
mova m1, m3
+ pmaddubsw m15, m2, subpel0 ; b0
mova m2, m4
pmaddubsw m3, subpel1 ; a1
+ mova m12, m0
pmaddubsw m4, subpel1 ; b1
+ movq m0, [srcq+ssq*0]
paddw m14, m3
paddw m15, m4
mova m3, m5
- mova m4, m6
pmaddubsw m5, subpel2 ; a2
+ mova m4, m6
pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m13 ; 67
+ punpcklbw m13, m0 ; 78
paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
paddw m15, m6
- shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m6, m0 ; 67
- punpckhbw m6, m0 ; 78
- pmaddubsw m12, m5, subpel3 ; a3
- pmaddubsw m13, m6, subpel3 ; b3
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
paddw m14, m12
paddw m15, m13
pmulhrsw m14, m7
@@ -2021,12 +1931,12 @@
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
- movzx hd, r6b ; reset vertical loop
add r4, 8
add r7, 8
- mov dstq, r4
- mov srcq, r7
- sub r6d, 1<<8 ; horizontal--
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
jg .v_w8_loop0
RET
%endif ;ARCH_X86_64
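
Note: both outer loops now pack the column counter and the row count into a single register: the .v_w4 variant keeps h in the low 16 bits (shl wd, 14 leaves w/4 - 1 columns above it), while the .v_w8 variant keeps h in the low byte. A small C sketch of the byte-packed variant (illustrative only; the printf stands in for one 8x2 filtering step):

#include <stdint.h>
#include <stdio.h>

/* Sketch of the .v_w8_loop0 packed counter (illustrative only). */
static void loop_8wide_columns(int w, int h)
{
    int32_t r6 = h + (w * 8 - 64) * 4;   /* h | (w/8 - 1) << 8, cf. the two lea instructions */
    int x = 0;
    for (;;) {
        for (int rows = h; rows > 0; rows -= 2)            /* inner .v_w8_loop */
            printf("filter 8x2 block at column %d\n", x);
        x += 8;                                            /* cf. "add r4, 8" / "add r7, 8" */
        h = (uint8_t)r6;                                   /* cf. "movzx hd, r6b" */
        r6 -= 1 << 8;                                      /* cf. "sub r6d, 1<<8" */
        if (r6 <= 0)                                       /* cf. "jg .v_w8_loop0" */
            break;
    }
}

int main(void) { loop_8wide_columns(16, 4); return 0; }
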
@@ -2051,7 +1961,7 @@
cmp hd, 6
cmovs ssd, mxd
movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
- W32_RESTORE_SSQ
+ mov ssq, ssmp
lea r6, [ssq*3]
sub srcq, r6
%define base_reg r6
@@ -2064,7 +1974,6 @@
%define subpelv1 [rsp+mmsize*1]
%define subpelv2 [rsp+mmsize*2]
%define subpelv3 [rsp+mmsize*3]
- punpcklqdq m0, m0
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
pshufd m6, m0, q0000
@@ -2088,7 +1997,6 @@
%define subpelv1 m11
%define subpelv2 m12
%define subpelv3 m13
- punpcklqdq m0, m0
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
mova m8, [base+pw_8192]
@@ -2103,22 +2011,21 @@
je .hv_w4
.hv_w2:
mova m6, [base+subpel_h_shuf4]
- ;
movq m2, [srcq+ssq*0] ; 0
movhps m2, [srcq+ssq*1] ; 0 _ 1
- movq m0, [srcq+ssq*2] ; 2
%if ARCH_X86_32
%define w8192reg [base+pw_8192]
%define d512reg [base+pd_512]
lea srcq, [srcq+ssq*2]
- add srcq, ssq
- movhps m0, [srcq+ssq*0] ; 2 _ 3
- lea srcq, [srcq+ssq*1]
+ movq m0, [srcq+ssq*0] ; 2
+ movhps m0, [srcq+ssq*1] ; 2 _ 3
+ lea srcq, [srcq+ssq*2]
%else
%define w8192reg m8
%define d512reg m9
- movhps m0, [srcq+ss3q ] ; 2 _ 3
- lea srcq, [srcq+ssq*4]
+ movq m0, [srcq+ssq*2] ; 2
+ add srcq, ss3q
+ movhps m0, [srcq+ssq*0] ; 2 _ 3
%endif
pshufb m2, m6 ; 0 ~ 1 ~
pshufb m0, m6 ; 2 ~ 3 ~
@@ -2126,16 +2033,16 @@
pmaddubsw m0, m7 ; subpel_filters
phaddw m2, m0 ; 0 1 2 3
pmulhrsw m2, w8192reg
- ;
+%if ARCH_X86_32
movq m3, [srcq+ssq*0] ; 4
movhps m3, [srcq+ssq*1] ; 4 _ 5
- movq m0, [srcq+ssq*2] ; 6
-%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- add srcq, ssq
%else
+ movq m3, [srcq+ssq*1] ; 4
+ movhps m3, [srcq+ssq*2] ; 4 _ 5
add srcq, ss3q
%endif
+ movq m0, [srcq+ssq*0] ; 6
pshufb m3, m6 ; 4 ~ 5 ~
pshufb m0, m6 ; 6 ~
pmaddubsw m3, m7 ; subpel_filters
@@ -2142,7 +2049,6 @@
pmaddubsw m0, m7 ; subpel_filters
phaddw m3, m0 ; 4 5 6 _
pmulhrsw m3, w8192reg
- ;
palignr m4, m3, m2, 4; V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2
punpckhwd m2, m4 ; V 23 34 2 3 3 4
@@ -2149,6 +2055,11 @@
pshufd m0, m3, q2121; V 5 6 5 6
punpcklwd m3, m0 ; V 45 56 4 5 5 6
.hv_w2_loop:
+ movq m4, [srcq+ssq*1] ; V 7
+ lea srcq, [srcq+ssq*2] ; V
+ movhps m4, [srcq+ssq*0] ; V 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
pmaddwd m5, m1, subpelv0; V a0 b0
mova m1, m2 ; V
pmaddwd m2, subpelv1 ; V a1 b1
@@ -2155,14 +2066,9 @@
paddd m5, m2 ; V
mova m2, m3 ; V
pmaddwd m3, subpelv2 ; a2 b2
- paddd m5, m3 ; V
- movq m4, [srcq+ssq*0] ; V 7
- movhps m4, [srcq+ssq*1] ; V 7 8
- lea srcq, [srcq+ssq*2] ; V
- pshufb m4, m6
- pmaddubsw m4, m7
phaddw m4, m4
pmulhrsw m4, w8192reg
+ paddd m5, m3 ; V
palignr m3, m4, m0, 12
mova m0, m4
punpcklwd m3, m0 ; V 67 78
@@ -2182,7 +2088,6 @@
RET
%undef w8192reg
%undef d512reg
- ;
.hv_w4:
%define hv4_line_0_0 4
%define hv4_line_0_1 5
@@ -2194,7 +2099,6 @@
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
- ;
%macro SAVELINE_W4 3
mova [rsp+mmsize*hv4_line_%3_%2], %1
%endmacro
@@ -2201,7 +2105,6 @@
%macro RESTORELINE_W4 3
mova %1, [rsp+mmsize*hv4_line_%3_%2]
%endmacro
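
Note: SAVELINE_W4/RESTORELINE_W4 (and the W8 counterparts further down) simply park intermediate xmm values in fixed stack slots, which matters on x86_32 where only xmm0-xmm7 are available. A trivial C analogue (illustrative only):

#include <stdint.h>
#include <string.h>

typedef struct { uint8_t b[16]; } xmm_t;   /* one 128-bit register's worth of data */
static xmm_t spill[16];                    /* cf. the rsp+mmsize*hv4_line_* slots */

static void saveline(int slot, const xmm_t *v) { memcpy(&spill[slot], v, sizeof(*v)); }
static void restoreline(xmm_t *v, int slot)    { memcpy(v, &spill[slot], sizeof(*v)); }
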
- ;
%if ARCH_X86_32
%define w8192reg [base+pw_8192]
%define d512reg [base+pd_512]
@@ -2213,13 +2116,13 @@
mova m6, [base+subpel_h_shuf4]
movq m5, [srcq+ssq*0] ; 0 _ _ _
movhps m5, [srcq+ssq*1] ; 0 _ 1 _
- movq m4, [srcq+ssq*2] ; 2 _ _ _
%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- add srcq, ssq
- movhps m4, [srcq+ssq*0] ; 2 _ 3 _
- add srcq, ssq
+ movq m4, [srcq+ssq*0] ; 2 _ _ _
+ movhps m4, [srcq+ssq*1] ; 2 _ 3 _
+ lea srcq, [srcq+ssq*2]
%else
+ movq m4, [srcq+ssq*2] ; 2 _ _ _
movhps m4, [srcq+ss3q ] ; 2 _ 3 _
lea srcq, [srcq+ssq*4]
%endif
@@ -2243,7 +2146,14 @@
mova m6, [base+subpel_h_shuf4]
movq m5, [srcq+ssq*0] ; 4 _ _ _
movhps m5, [srcq+ssq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+ssq*2]
+ movq m4, [srcq+ssq*0] ; 6 _ _ _
+ add srcq, ssq
+%else
movq m4, [srcq+ssq*2] ; 6 _ _ _
+ add srcq, ss3q
+%endif
pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
pmaddubsw m3, m7 ;H subpel_filters
@@ -2259,13 +2169,6 @@
pmaddubsw m0, m7 ;H subpel_filters
phaddw m3, m0 ;H 4 5 6 7
pmulhrsw m3, w8192reg ;H pw_8192
- ;
-%if ARCH_X86_32
- lea srcq, [srcq+ssq*2]
- add srcq, ssq
-%else
- add srcq, ss3q
-%endif
;process high
palignr m4, m3, m2, 4;V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12
@@ -2293,7 +2196,6 @@
mova m2, m3
pmaddwd m3, subpelv2; V a2 b2
paddd m5, m3
- ;
mova m6, [base+subpel_h_shuf4]
movq m4, [srcq+ssq*0] ; 7
movhps m4, [srcq+ssq*1] ; 7 _ 8 _
@@ -2325,10 +2227,10 @@
mova m2, m3
pmaddwd m3, subpelv2; V a2 b2
paddd m5, m3
- ;
mova m6, [base+subpel_h_shuf4+16]
movq m4, [srcq+ssq*0] ; 7
movhps m4, [srcq+ssq*1] ; 7 _ 8 _
+ lea srcq, [srcq+ssq*2]
pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
pmaddubsw m4, m7 ;H subpel_filters
phaddw m4, m4 ;H 7 8 7 8
@@ -2340,12 +2242,10 @@
paddd m5, d512reg ; pd_512
paddd m5, m4
psrad m4, m5, 10
- ;
RESTORELINE_W4 m5, 5, 0
packssdw m5, m4 ; d -> w
packuswb m5, m5 ; w -> b
pshuflw m5, m5, q3120
- lea srcq, [srcq+ssq*2]
movd [dstq+dsq*0], m5
psrlq m5, 32
movd [dstq+dsq*1], m5
@@ -2365,7 +2265,6 @@
%undef subpelv1
%undef subpelv2
%undef subpelv3
- ;
.hv_w8:
%assign stack_offset org_stack_offset
%define hv8_line_1 0
@@ -2400,7 +2299,7 @@
mov ssq, ssmp
ALLOC_STACK -mmsize*13
%if STACK_ALIGNMENT < 16
- %define srcm [rsp+mmsize*13+gprsize*1]
+ %define dstm [rsp+mmsize*13+gprsize*1]
%define dsm [rsp+mmsize*13+gprsize*2]
mov r6, [rstk+stack_offset+gprsize*2]
mov dsm, r6
@@ -2420,10 +2319,10 @@
mova subpelv2, m4
mova subpelv3, m5
lea r6, [ssq*3]
+ mov dstm, dstq
sub srcq, r6
- mov srcm, srcq
%else
- ALLOC_STACK mmsize*5, 16
+ ALLOC_STACK 16*5, 16
%define subpelh0 m10
%define subpelh1 m11
%define subpelv0 m12
@@ -2440,7 +2339,6 @@
movq m1, [base_reg+myq*8+subpel_filters-put_ssse3]
pshufd subpelh0, m0, q0000
pshufd subpelh1, m0, q1111
- punpcklqdq m1, m1
punpcklbw m1, m1
psraw m1, 8 ; sign-extend
pshufd subpelv0, m1, q0000
@@ -2448,18 +2346,18 @@
pshufd subpelv2, m1, q2222
pshufd subpelv3, m1, q3333
lea ss3q, [ssq*3]
+ mov r7, dstq
sub srcq, ss3q
- mov r7, srcq
%endif
- lea r6d, [wq-4]
- mov r4, dstq
- shl r6d, (16 - 2)
- mov r6w, hw
+ shl wd, 14
+ lea r6d, [hq+wq-(1<<16)]
+ mov r4, srcq
.hv_w8_loop0:
movu m4, [srcq+ssq*0] ; 0 = _ _
movu m5, [srcq+ssq*1] ; 1 = _ _
+%if ARCH_X86_32
lea srcq, [srcq+ssq*2]
- ;
+%endif
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
%if ARCH_X86_32
pshufb %3, %1, [base+subpel_h_shufB]
@@ -2478,7 +2376,6 @@
paddw %1, %3 ; A0+C4
phaddw %1, %2
%endmacro
- ;
%if ARCH_X86_64
mova m7, [base+subpel_h_shufA]
mova m8, [base+subpel_h_shufB]
@@ -2486,12 +2383,17 @@
%endif
HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+%if ARCH_X86_32
movu m6, [srcq+ssq*0] ; 2 = _ _
movu m0, [srcq+ssq*1] ; 3 = _ _
lea srcq, [srcq+ssq*2]
+%else
+ movu m6, [srcq+ssq*2] ; 2 = _ _
+ add srcq, ss3q
+ movu m0, [srcq+ssq*0] ; 3 = _ _
+%endif
HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
- ;
mova m7, [base+pw_8192]
pmulhrsw m4, m7 ; H pw_8192
pmulhrsw m5, m7 ; H pw_8192
@@ -2503,11 +2405,16 @@
SAVELINE_W8 1, m1
SAVELINE_W8 2, m2
SAVELINE_W8 3, m3
- ;
mova m7, [base+subpel_h_shufA]
+%if ARCH_X86_32
movu m4, [srcq+ssq*0] ; 4 = _ _
movu m5, [srcq+ssq*1] ; 5 = _ _
lea srcq, [srcq+ssq*2]
+%else
+ movu m4, [srcq+ssq*1] ; 4 = _ _
+ movu m5, [srcq+ssq*2] ; 5 = _ _
+ add srcq, ss3q
+%endif
movu m6, [srcq+ssq*0] ; 6 = _ _
HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
@@ -2519,7 +2426,6 @@
punpcklwd m4, m0, m1 ; 3 4 ~
punpcklwd m5, m1, m2 ; 4 5 ~
punpcklwd m6, m2, m3 ; 5 6 ~
- ;
SAVELINE_W8 6, m3
RESTORELINE_W8 1, m1
RESTORELINE_W8 2, m2
@@ -2603,16 +2509,19 @@
RESTORELINE_W8 4, m4
jmp .hv_w8_loop
.hv_w8_outer:
- movzx hd, r6w
- add r4, 4
- mov dstq, r4
%if ARCH_X86_32
- mov srcq, srcm
- add srcq, 4
- mov srcm, srcq
+ mov dstq, dstm
+ add r4, 4
+ movzx hd, r6w
+ add dstq, 4
+ mov srcq, r4
+ mov dstm, dstq
%else
+ add r4, 4
add r7, 4
- mov srcq, r7
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
%endif
sub r6d, 1<<16
jg .hv_w8_loop0
@@ -2836,7 +2745,7 @@
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
- movsxd wq, wm
+ mov wd, wm
movifnidn srcd, srcm
movifnidn hd, hm
test mxd, 0xf00
@@ -2846,6 +2755,7 @@
LEA base_reg, prep_ssse3
tzcnt wd, wd
movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
+ pxor m4, m4
add wq, base_reg
movifnidn strided, stridem
lea r6, [strideq*3]
@@ -2885,16 +2795,13 @@
shr mxd, 16
sub srcq, 3
movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
- movd m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0]
- pshufd m5, m5, q0000
- movd m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4]
- pshufd m6, m6, q0000
+ movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
%if cpuflag(ssse3)
mova m7, [base+pw_8192]
+ pshufd m5, m6, q0000
+ pshufd m6, m6, q1111
%else
- punpcklbw m5, m5
punpcklbw m6, m6
- psraw m5, 8
psraw m6, 8
%if ARCH_X86_64
mova m7, [pw_2]
@@ -2902,6 +2809,8 @@
%else
%define m15 m4
%endif
+ pshufd m5, m6, q1010
+ punpckhqdq m6, m6
%endif
add wq, base_reg
jmp wq
@@ -2913,10 +2822,10 @@
%endif
dec srcq
movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
- pshufd m4, m4, q0000
%if cpuflag(ssse3)
mova m6, [base+pw_8192]
mova m5, [base+subpel_h_shufA]
+ pshufd m4, m4, q0000
%else
mova m6, [base+pw_2]
%if ARCH_X86_64
@@ -2926,6 +2835,7 @@
%endif
punpcklbw m4, m4
psraw m4, 8
+ punpcklqdq m4, m4
%endif
%if ARCH_X86_64
lea stride3q, [strideq*3]
@@ -3089,11 +2999,14 @@
shr myd, 16
cmp hd, 6
cmovs myd, mxd
- lea myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
%if cpuflag(ssse3)
mova m2, [base+pw_512]
- psrlw m2, m2, 1 ; 0x0100
mova m7, [base+pw_8192]
+ punpcklwd m0, m0
+%else
+ punpcklbw m0, m0
+ psraw m0, 8
%endif
%if ARCH_X86_32
%define subpel0 [rsp+mmsize*0]
@@ -3107,20 +3020,16 @@
ALLOC_STACK -mmsize*5
%endif
%assign regs_used 7
- movd m0, [myq+0]
- PSHUFB_0X1X m0, m2
- mova subpel0, m0
- movd m0, [myq+2]
- PSHUFB_0X1X m0, m2
- mova subpel1, m0
- movd m0, [myq+4]
- PSHUFB_0X1X m0, m2
- mova subpel2, m0
- movd m0, [myq+6]
- PSHUFB_0X1X m0, m2
- mova subpel3, m0
mov strideq, [rstk+stack_offset+gprsize*3]
+ pshufd m1, m0, q0000
+ mova subpel0, m1
+ pshufd m1, m0, q1111
+ mova subpel1, m1
lea r5, [strideq*3]
+ pshufd m1, m0, q2222
+ mova subpel2, m1
+ pshufd m1, m0, q3333
+ mova subpel3, m1
sub srcq, r5
%else
%define subpel0 m8
@@ -3127,15 +3036,11 @@
%define subpel1 m9
%define subpel2 m10
%define subpel3 m11
- movd subpel0, [myq+0]
- PSHUFB_0X1X subpel0, m2
- movd subpel1, [myq+2]
- PSHUFB_0X1X subpel1, m2
- movd subpel2, [myq+4]
- PSHUFB_0X1X subpel2, m2
- movd subpel3, [myq+6]
- PSHUFB_0X1X subpel3, m2
+ pshufd m8, m0, q0000
+ pshufd m9, m0, q1111
lea stride3q, [strideq*3]
+ pshufd m10, m0, q2222
+ pshufd m11, m0, q3333
sub srcq, stride3q
cmp wd, 8
jns .v_w8
@@ -3159,35 +3064,34 @@
mov r5w, hw
.v_w4_loop0:
%endif
- movd m2, [srcq+strideq*0] ; 0
- movhps m2, [srcq+strideq*2] ; 0 _ 2
- movd m3, [srcq+strideq*1] ; 1
+ movd m1, [srcq+strideq*0]
+ movd m0, [srcq+strideq*1]
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
- movhps m3, [srcq+strideq*1] ; 1 _ 3
+ movd m2, [srcq+strideq*0]
+ movd m4, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
-%else
- movhps m3, [srcq+stride3q ] ; 1 _ 3
- lea srcq, [srcq+strideq*4]
-%endif
- pshufd m2, m2, q2020 ; 0 2 0 2
- pshufd m3, m3, q2020 ; 1 3 1 3
- punpckldq m2, m3 ; 0 1 2 3
- movd m3, [srcq+strideq*0] ; 4
- movd m1, [srcq+strideq*1] ; 5
- movd m0, [srcq+strideq*2] ; 6
-%if ARCH_X86_32
+ movd m3, [srcq+strideq*0]
+ movd m5, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- add srcq, strideq
%else
+ movd m2, [srcq+strideq*2]
add srcq, stride3q
+ movd m4, [srcq+strideq*0]
+ movd m3, [srcq+strideq*1]
+ movd m5, [srcq+strideq*2]
+ add srcq, stride3q
%endif
- punpckldq m3, m1 ; 4 5 _ _
- punpckldq m1, m0 ; 5 6 _ _
- PALIGNR m4, m3, m2, 4 ; 1 2 3 4
- punpcklbw m3, m1 ; 45 56
- punpcklbw m1, m2, m4 ; 01 12
- punpckhbw m2, m4 ; 23 34
+ punpckldq m1, m0 ; 0 1
+ punpckldq m0, m2 ; 1 2
+ punpcklbw m1, m0 ; 01 12
+ movd m0, [srcq+strideq*0]
+ punpckldq m2, m4 ; 2 3
+ punpckldq m4, m3 ; 3 4
+ punpckldq m3, m5 ; 4 5
+ punpckldq m5, m0 ; 5 6
+ punpcklbw m2, m4 ; 23 34
+ punpcklbw m3, m5 ; 45 56
.v_w4_loop:
%if ARCH_X86_32 && notcpuflag(ssse3)
mova m7, subpel0
@@ -3208,11 +3112,11 @@
%endif
mova m2, m3
PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2
+ movd m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
paddw m5, m3
- movd m4, [srcq+strideq*0]
punpckldq m3, m0, m4 ; 6 7 _ _
- movd m0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
+ movd m0, [srcq+strideq*0]
punpckldq m4, m0 ; 7 8 _ _
punpcklbw m3, m4 ; 67 78
%if notcpuflag(ssse3)
@@ -3242,13 +3146,13 @@
sub hd, 2
jg .v_w4_loop
%if ARCH_X86_32
- mov hw, r5w ; reset vertical loop
- mov tmpq, tmpm
mov srcq, srcm
- add tmpq, 8
+ mov tmpq, tmpm
+ movzx hd, r5w
add srcq, 4
- mov tmpm, tmpq
+ add tmpq, 8
mov srcm, srcq
+ mov tmpm, tmpq
sub r5d, 1<<16 ; horizontal--
jg .v_w4_loop0
%endif
@@ -3255,37 +3159,30 @@
RET
%if ARCH_X86_64
.v_w8:
- lea r5d, [wq - 8] ; horizontal loop
+ lea r6d, [wq*8-64]
+ mov r5, srcq
mov r8, tmpq
- mov r6, srcq
- shl r5d, 8 - 3; (wq / 8) << 8
- mov r5b, hb
+ lea r6d, [hq+r6*4]
.v_w8_loop0:
- movq m4, [srcq+strideq*0]
- movq m5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movq m6, [srcq+strideq*0]
- movq m0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
movq m1, [srcq+strideq*0]
movq m2, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movq m3, [srcq+strideq*0]
- shufpd m4, m0, 0x0c
- shufpd m5, m1, 0x0c
- punpcklbw m1, m4, m5 ; 01
- punpckhbw m4, m5 ; 34
- shufpd m6, m2, 0x0c
- punpcklbw m2, m5, m6 ; 12
- punpckhbw m5, m6 ; 45
- shufpd m0, m3, 0x0c
- punpcklbw m3, m6, m0 ; 23
- punpckhbw m6, m0 ; 56
+ movq m3, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m4, [srcq+strideq*0]
+ movq m5, [srcq+strideq*1]
+ movq m6, [srcq+strideq*2]
+ add srcq, stride3q
+ movq m0, [srcq+strideq*0]
+ punpcklbw m1, m2 ; 01
+ punpcklbw m2, m3 ; 12
+ punpcklbw m3, m4 ; 23
+ punpcklbw m4, m5 ; 34
+ punpcklbw m5, m6 ; 45
+ punpcklbw m6, m0 ; 56
.v_w8_loop:
-%if cpuflag(ssse3)
- movq m12, [srcq+strideq*1]
+ movq m13, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
- movq m13, [srcq+strideq*0]
+%if cpuflag(ssse3)
pmaddubsw m14, m1, subpel0 ; a0
pmaddubsw m15, m2, subpel0 ; b0
mova m1, m3
@@ -3298,64 +3195,59 @@
mova m4, m6
pmaddubsw m5, subpel2 ; a2
pmaddubsw m6, subpel2 ; b2
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
paddw m14, m5
+ mova m5, m12
+ pmaddubsw m12, subpel3 ; a3
paddw m15, m6
- shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m6, m0 ; 67
- punpckhbw m6, m0 ; 78
- pmaddubsw m12, m5, subpel3 ; a3
- pmaddubsw m13, m6, subpel3 ; b3
+ mova m6, m13
+ pmaddubsw m13, subpel3 ; b3
paddw m14, m12
paddw m15, m13
pmulhrsw m14, m7
pmulhrsw m15, m7
- movu [tmpq+wq*0], m14
- movu [tmpq+wq*2], m15
%else
mova m14, m1
PMADDUBSW m14, subpel0, m7, m12, 1 ; a0
+ mova m15, m2
+ PMADDUBSW m15, subpel0, m7, m12, 0 ; b0
mova m1, m3
PMADDUBSW m3, subpel1, m7, m12, 0 ; a1
+ mova m2, m4
+ PMADDUBSW m4, subpel1, m7, m12, 0 ; b1
paddw m14, m3
mova m3, m5
PMADDUBSW m5, subpel2, m7, m12, 0 ; a2
- paddw m14, m5
- movq m12, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movq m13, [srcq+strideq*0]
- shufpd m15, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m15, m0 ; 67
- punpckhbw m15, m0 ; 78
- mova m13, m5
- PMADDUBSW m13, subpel3, m7, m12, 0 ; a3
- paddw m14, m13
- PMULHRSW_8192 m14, m14, [base+pw_2]
- movu [tmpq+wq*0], m14
- mova m14, m2
- PMADDUBSW m14, subpel0, m7, m12, 0 ; b0
- mova m2, m4
- PMADDUBSW m4, subpel1, m7, m12, 0 ; b1
- paddw m14, m4
+ paddw m15, m4
mova m4, m6
PMADDUBSW m6, subpel2, m7, m12, 0 ; b2
- paddw m14, m6
- mova m6, m15
- PMADDUBSW m15, subpel3, m7, m12, 0 ; b3
- paddw m14, m15
+ paddw m15, m6
+ punpcklbw m12, m0, m13 ; 67
+ movq m0, [srcq+strideq*0]
+ punpcklbw m13, m0 ; 78
+ paddw m14, m5
+ mova m5, m12
+ PMADDUBSW m12, subpel3, m7, m6, 0 ; a3
+ paddw m14, m12
+ mova m6, m13
+ PMADDUBSW m13, subpel3, m7, m12, 0 ; b3
+ paddw m15, m13
PMULHRSW_8192 m14, m14, [base+pw_2]
- movu [tmpq+wq*2], m14
+ PMULHRSW_8192 m15, m15, [base+pw_2]
%endif
+ movu [tmpq+wq*0], m14
+ movu [tmpq+wq*2], m15
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .v_w8_loop
- movzx hd, r5b ; reset vertical loop
+ add r5, 8
add r8, 16
- add r6, 8
+ movzx hd, r6b
+ mov srcq, r5
mov tmpq, r8
- mov srcq, r6
- sub r5d, 1<<8 ; horizontal--
+ sub r6d, 1<<8
jg .v_w8_loop0
RET
%endif ;ARCH_X86_64
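
Note: the reworked prep .v_w8 inner loop keeps six interleaved row pairs (01 12 23 34 45 56) live and only forms 67/78 each iteration, producing two output rows per step. For reference, a scalar model of what one such step computes (illustrative only; the final rounding/shift is folded into PMULHRSW_8192 / pmulhrsw with pw_8192 in the asm):

#include <stdint.h>
#include <stddef.h>

/* Two vertically filtered rows per step (illustrative only; no rounding/shift). */
static void v8tap_2rows(int *a, int *b, const uint8_t *src,
                        ptrdiff_t stride, const int8_t f[8])
{
    int sa = 0, sb = 0;
    for (int k = 0; k < 8; k++) {
        sa += f[k] * src[(k + 0) * stride];   /* row pairs 01..67 feed output row a */
        sb += f[k] * src[(k + 1) * stride];   /* row pairs 12..78 feed output row b */
    }
    *a = sa;
    *b = sb;
}
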
@@ -3363,7 +3255,6 @@
%undef subpel1
%undef subpel2
%undef subpel3
- ;
.hv:
%assign stack_offset org_stack_offset
cmp wd, 4
@@ -3466,13 +3357,13 @@
%endif
movq m5, [srcq+strideq*0] ; 0 _ _ _
movhps m5, [srcq+strideq*1] ; 0 _ 1 _
- movq m4, [srcq+strideq*2] ; 2 _ _ _
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
- add srcq, strideq
- movhps m4, [srcq+strideq*0] ; 2 _ 3 _
- add srcq, strideq
+ movq m4, [srcq+strideq*0] ; 2 _ _ _
+ movhps m4, [srcq+strideq*1] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*2]
%else
+ movq m4, [srcq+strideq*2] ; 2 _ _ _
movhps m4, [srcq+stride3q ] ; 2 _ 3 _
lea srcq, [srcq+strideq*4]
%endif
@@ -3506,7 +3397,14 @@
%endif
movq m5, [srcq+strideq*0] ; 4 _ _ _
movhps m5, [srcq+strideq*1] ; 4 _ 5 _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m4, [srcq+strideq*0] ; 6 _ _ _
+ add srcq, strideq
+%else
movq m4, [srcq+strideq*2] ; 6 _ _ _
+ add srcq, stride3q
+%endif
PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~
PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~
PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters
@@ -3531,12 +3429,6 @@
mova m2, [esp+mmsize*4]
%endif
%endif
-%if ARCH_X86_32
- lea srcq, [srcq+strideq*2]
- add srcq, strideq
-%else
- add srcq, stride3q
-%endif
;process high
PALIGNR m4, m3, m2, 4;V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12
@@ -3572,7 +3464,6 @@
%define m15 m3
%endif
%endif
- ;
%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4]
%endif
@@ -3620,7 +3511,6 @@
mova [esp+0xA0], m5
%endif
%endif
- ;
%if cpuflag(ssse3)
mova m6, [base+subpel_h_shuf4+16]
%endif
@@ -3644,7 +3534,6 @@
paddd m5, d32reg ; pd_32
paddd m5, m4
psrad m4, m5, 6
- ;
RESTORELINE_W4 m5, 5, 0
packssdw m5, m4
pshufd m5, m5, q3120
@@ -3666,7 +3555,6 @@
%undef subpelv1
%undef subpelv2
%undef subpelv3
- ;
.hv_w8:
%assign stack_offset org_stack_offset
%define hv8_line_1 0
@@ -3699,20 +3587,20 @@
%define tmpm [rsp+mmsize*13+gprsize*1]
%define srcm [rsp+mmsize*13+gprsize*2]
%define stridem [rsp+mmsize*13+gprsize*3]
+ mov tmpm, tmpq
mov stridem, strideq
%endif
+ %if cpuflag(ssse3)
pshufd m0, m1, q0000
pshufd m1, m1, q1111
- punpcklbw m5, m5
- %if notcpuflag(ssse3)
- punpcklbw m0, m0
+ %else
punpcklbw m1, m1
- %endif
- psraw m5, 8
- %if notcpuflag(ssse3)
- psraw m0, 8
psraw m1, 8
+ pshufd m0, m1, q1010
+ punpckhqdq m1, m1
%endif
+ punpcklbw m5, m5
+ psraw m5, 8
pshufd m2, m5, q0000
pshufd m3, m5, q1111
pshufd m4, m5, q2222
@@ -3742,38 +3630,31 @@
cmp hd, 6
cmovs myd, mxd
movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
pshufd subpelh0, m0, q0000
pshufd subpelh1, m0, q1111
- punpcklbw m1, m1
- %if notcpuflag(ssse3)
- punpcklbw subpelh0, subpelh0
- punpcklbw subpelh1, subpelh1
+ %else
+ punpcklbw m0, m0
+ psraw m0, 8
+ pshufd subpelh0, m0, q1010
+ pshufd subpelh1, m0, q3232
+ mova m7, [base+pw_2]
%endif
+ punpcklbw m1, m1
psraw m1, 8
- %if notcpuflag(ssse3)
- psraw subpelh0, 8
- psraw subpelh1, 8
- %endif
pshufd subpelv0, m1, q0000
pshufd subpelv1, m1, q1111
pshufd subpelv2, m1, q2222
pshufd subpelv3, m1, q3333
- %if notcpuflag(ssse3)
- mova m7, [base+pw_2]
- %endif
lea stride3q, [strideq*3]
sub srcq, 3
sub srcq, stride3q
mov r6, srcq
-%endif
- lea r5d, [wq-4]
-%if ARCH_X86_64
mov r8, tmpq
-%else
- mov tmpm, tmpq
%endif
- shl r5d, (16 - 2)
- mov r5w, hw
+ lea r5d, [wq-4]
+ shl r5d, 14
+ add r5d, hd
.hv_w8_loop0:
%if cpuflag(ssse3)
%if ARCH_X86_64
@@ -3791,24 +3672,24 @@
%endif
PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
+%if ARCH_X86_64
+ PREP_8TAP_HV m6, srcq+strideq*2, m7, m0
+ add srcq, stride3q
+ PREP_8TAP_HV m0, srcq+strideq*0, m7, m9
+%else
lea srcq, [srcq+strideq*2]
-%if notcpuflag(ssse3)
- %if ARCH_X86_64
- SWAP m9, m4
- %else
+ %if notcpuflag(ssse3)
mova [esp], m4
%endif
-%endif
PREP_8TAP_HV m6, srcq+strideq*0, m7, m4
PREP_8TAP_HV m0, srcq+strideq*1, m7, m4
lea srcq, [srcq+strideq*2]
+%endif
%if cpuflag(ssse3)
mova m7, [base+pw_8192]
%else
mova m7, [base+pw_2]
- %if ARCH_X86_64
- SWAP m4, m9
- %else
+ %if ARCH_X86_32
mova m4, [esp]
%endif
%endif
@@ -3824,28 +3705,26 @@
SAVELINE_W8 3, m3
%if cpuflag(ssse3)
mova m7, [base+subpel_h_shufA]
+%endif
+%if ARCH_X86_64
+ PREP_8TAP_HV m4, srcq+strideq*1, m8, m9
+ PREP_8TAP_HV m5, srcq+strideq*2, m8, m9
+ add srcq, stride3q
+ PREP_8TAP_HV m6, srcq+strideq*0, m8, m9
%else
- %if ARCH_X86_64
- SWAP m8, m7
- SWAP m9, m0
- %else
+ %if notcpuflag(ssse3)
mova [esp+0x30], m0
%endif
-%endif
PREP_8TAP_HV m4, srcq+strideq*0, m7, m0
PREP_8TAP_HV m5, srcq+strideq*1, m7, m0
- PREP_8TAP_HV m6, srcq+strideq*2, m7, m0
lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m6, srcq+strideq*0, m7, m0
+%endif
%if cpuflag(ssse3)
mova m7, [base+pw_8192]
-%else
- %if ARCH_X86_64
- SWAP m0, m9
- SWAP m7, m8
- %else
+%elif ARCH_X86_32
mova m0, [esp+0x30]
mova m7, [base+pw_2]
- %endif
%endif
PMULHRSW_8192 m1, m4, m7
PMULHRSW_8192 m2, m5, m7
@@ -3902,8 +3781,8 @@
%endif
%endif
PREP_8TAP_HV m0, srcq+strideq*1, m5, m6
- PREP_8TAP_HV m4, srcq+strideq*2, m5, m6
lea srcq, [srcq+strideq*2]
+ PREP_8TAP_HV m4, srcq+strideq*0, m5, m6
%if cpuflag(ssse3)
mova m5, [base+pw_8192]
%else
@@ -3933,19 +3812,20 @@
RESTORELINE_W8 4, m4
jmp .hv_w8_loop
.hv_w8_outer:
- movzx hd, r5w
%if ARCH_X86_32
mov srcq, srcm
mov tmpq, tmpm
+ movzx hd, r5w
add srcq, 4
add tmpq, 8
mov srcm, srcq
mov tmpm, tmpq
%else
- add r8, 8
- mov tmpq, r8
add r6, 4
+ add r8, 8
+ movzx hd, r5b
mov srcq, r6
+ mov tmpq, r8
%endif
sub r5d, 1<<16
jg .hv_w8_loop0