ref: 114e8f0ee53ba34d22452dc4bdc0e9ec263189c8
parent: e0b88bd2b2c97a2695edcc498485e1cb3003e7f1
author: Henrik Gramner <[email protected]>
date: Mon Apr 20 19:54:47 EDT 2020
x86: Eliminate redundant 3-operand register syntax in itx

Purely a cosmetic change.
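
AVX instructions whose first source register is the same as the
destination can be written with that operand omitted: the x86inc macro
layer used throughout dav1d's x86 code re-inserts the destination as
the first source at assembly time, so both spellings produce identical
machine code. A minimal illustration, using the registers from the
4x4 WHT change below:

    vpblendd m0, m0, m2, 0x03 ; explicit 3-operand register syntax
    vpblendd m0, m2, 0x03     ; shorthand, expands to the line above

The one change that is not a pure operand removal swaps the sources of
a vpblendd and inverts its immediate (0xcc becomes 0x33); since the
mask bits select the opposite source after the swap, the result stays
bit-exact.
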
--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -175,16 +175,16 @@
vpbroadcastd m%3, [o(pw_%8_%9)]
vpbroadcastd m%4, [o(pw_m%9_%8)]
vpbroadcastd xm%2, [o(pw_%6_%7)]
- vpblendd m%2, m%2, m%3, 0xf0
+ vpblendd m%2, m%3, 0xf0
vpbroadcastd xm%3, [o(pw_m%7_%6)]
%else
vpbroadcastd m%3, [o(pw_m%9_%8)]
vpbroadcastd m%4, [o(pw_%8_%9)]
vpbroadcastd xm%2, [o(pw_m%7_%6)]
- vpblendd m%2, m%2, m%3, 0xf0
+ vpblendd m%2, m%3, 0xf0
vpbroadcastd xm%3, [o(pw_%6_%7)]
%endif
- vpblendd m%3, m%3, m%4, 0xf0
+ vpblendd m%3, m%4, 0xf0
ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10)
%endmacro
@@ -355,7 +355,7 @@
punpckhdq m1, m0, m3
punpckldq m0, m3
IWHT4_1D_PACKED
- vpblendd m0, m0, m2, 0x03
+ vpblendd m0, m2, 0x03
ITX4_END 3, 0, 2, 1, 0
%macro INV_TXFM_FN 3 ; type1, type2, size
@@ -441,7 +441,7 @@
IDCT4_1D_PACKED
mova m2, [o(deint_shuf)]
shufps m3, m0, m1, q1331
- shufps m0, m0, m1, q0220
+ shufps m0, m1, q0220
pshufb m0, m2
pshufb m1, m3, m2
jmp tx2q
@@ -667,9 +667,9 @@
paddsw m4, m5 ; out6 -out1
vpbroadcastd m5, [o(pw_2896x8)]
vpblendd m3, m0, m4, 0x33 ; out6 -out7
- vpblendd m0, m0, m4, 0xcc ; out0 -out1
+ vpblendd m0, m4, 0xcc ; out0 -out1
shufps m4, m2, m1, q1032 ; t3 t7
- vpblendd m1, m2, m1, 0xcc ; t2 t6
+ vpblendd m1, m2, 0x33 ; t2 t6
psubsw m2, m1, m4 ; t2-t3 t6-t7
paddsw m1, m4 ; t2+t3 t6+t7
pmulhrsw m2, m5 ; out4 -out5
@@ -693,7 +693,7 @@
IDCT4_1D_PACKED
vbroadcasti128 m2, [o(deint_shuf)]
shufps m3, m0, m1, q1331
- shufps m0, m0, m1, q0220
+ shufps m0, m1, q0220
pshufb m0, m2
pshufb m1, m3, m2
jmp tx2q
@@ -702,8 +702,8 @@
vextracti128 xm3, m1, 1
call .main
vpbroadcastd m4, [o(pw_2048)]
- vinserti128 m0, m0, xm2, 1
- vinserti128 m1, m1, xm3, 1
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
pshufd m1, m1, q1032
jmp m(iadst_4x8_internal).end2
ALIGN function_align
@@ -735,12 +735,12 @@
pshufd xm5, xm1, q1032
call .main_pass2
vpbroadcastd m4, [o(pw_2048)]
- vinserti128 m0, m0, xm2, 1
- vinserti128 m1, m1, xm3, 1
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
pxor m5, m5
psubw m5, m4
.end:
- vpblendd m4, m4, m5, 0xcc
+ vpblendd m4, m5, 0xcc
.end2:
pmulhrsw m0, m4
pmulhrsw m1, m4
@@ -786,8 +786,8 @@
pshufd xm5, xm1, q1032
call m(iadst_4x8_internal).main_pass2
vpbroadcastd m5, [o(pw_2048)]
- vinserti128 m3, m3, xm1, 1
- vinserti128 m2, m2, xm0, 1
+ vinserti128 m3, xm1, 1
+ vinserti128 m2, xm0, 1
pxor m4, m4
psubw m4, m5
pshufd m0, m3, q1032
@@ -935,11 +935,11 @@
vextracti128 xm6, m2, 1
vextracti128 xm7, m3, 1
call .main
- vinserti128 m0, m0, xm4, 1
- vinserti128 m1, m1, xm5, 1
+ vinserti128 m0, xm4, 1
+ vinserti128 m1, xm5, 1
vpbroadcastd m5, [o(pw_2048)]
- vinserti128 m2, m2, xm6, 1
- vinserti128 m3, m3, xm7, 1
+ vinserti128 m2, xm6, 1
+ vinserti128 m3, xm7, 1
pshufd m1, m1, q1032
pshufd m3, m3, q1032
jmp m(iadst_4x16_internal).end2
@@ -980,9 +980,9 @@
vpbroadcastd m5, [o(pw_2048)]
pshufd m1, m1, q1032
vpblendd m4, m1, m0, 0x33
- vpblendd m0, m0, m2, 0x33
- vpblendd m2, m2, m3, 0x33
- vpblendd m3, m3, m1, 0x33
+ vpblendd m0, m2, 0x33
+ vpblendd m2, m3, 0x33
+ vpblendd m3, m1, 0x33
vpermq m0, m0, q2031
vpermq m1, m2, q1302
vpermq m2, m3, q3120
@@ -989,7 +989,7 @@
vpermq m3, m4, q0213
psubw m6, m7, m5
.end:
- vpblendd m5, m5, m6, 0xcc
+ vpblendd m5, m6, 0xcc
.end2:
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
WIN64_RESTORE_XMM
@@ -1009,9 +1009,9 @@
ALIGN function_align
.main:
vpblendd m4, m1, m0, 0xcc
- vpblendd m1, m1, m0, 0x33
+ vpblendd m1, m0, 0x33
vpblendd m5, m2, m3, 0xcc
- vpblendd m2, m2, m3, 0x33
+ vpblendd m2, m3, 0x33
vperm2i128 m3, m5, m2, 0x31
vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1
vperm2i128 m4, m1, m4, 0x31
@@ -1043,7 +1043,7 @@
psubsw m1, m2, m3 ; t13a t12a t15a t14a
paddsw m2, m3 ; t9a t8a t11a t10a
psubw m3, m7, m6 ; pw_3784_m1567
- vpblendd m6, m6, m3, 0xf0
+ vpblendd m6, m3, 0xf0
ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
vbroadcasti128 m5, [o(deint_shuf)]
@@ -1050,9 +1050,9 @@
pshufb m0, m5
pshufb m2, m5
vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a
- vinserti128 m0, m0, xm2, 1 ; t1 t0 t9a t8a
+ vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a
vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14
- vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13
+ vinserti128 m4, xm1, 1 ; t4a t5a t12 t13
pshufd m2, m2, q1032 ; t6a t7a t14 t15
psubsw m1, m0, m3 ; t3a t2a t11 t10
paddsw m0, m3 ; -out15 out0 out14 -out1
@@ -1059,7 +1059,7 @@
paddsw m3, m4, m2 ; -out3 out12 out2 -out13
psubsw m4, m2 ; t6 t7 t14a t15a
shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
- vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a
+ vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a
ret
ALIGN function_align
.main_pass1_end:
@@ -1109,9 +1109,9 @@
vpbroadcastd m6, [o(pw_2048)]
pshufd m1, m1, q1032
vpblendd m4, m0, m2, 0x33
- vpblendd m0, m0, m1, 0xcc
- vpblendd m1, m1, m3, 0xcc
- vpblendd m2, m2, m3, 0x33
+ vpblendd m0, m1, 0xcc
+ vpblendd m1, m3, 0xcc
+ vpblendd m2, m3, 0x33
vpermq m0, m0, q3120
vpermq m1, m1, q0213
vpermq m2, m2, q2031
@@ -1226,7 +1226,7 @@
vinserti128 m3, m1, xm3, 1
vinserti128 m1, m0, xm2, 1
shufps m0, m1, m3, q0220
- shufps m1, m1, m3, q1331
+ shufps m1, m3, q1331
pshufb m0, m4
pshufb m1, m4
jmp tx2q
@@ -1250,8 +1250,8 @@
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
call m(iadst_4x8_internal).main_pass1
- vinserti128 m0, m0, xm2, 1
- vinserti128 m1, m1, xm3, 1
+ vinserti128 m0, xm2, 1
+ vinserti128 m1, xm3, 1
punpckhwd m2, m0, m1
punpcklwd m0, m1
pxor m3, m3
@@ -1295,8 +1295,8 @@
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
call m(iadst_4x8_internal).main_pass1
- vinserti128 m3, m3, xm1, 1
- vinserti128 m2, m2, xm0, 1
+ vinserti128 m3, xm1, 1
+ vinserti128 m2, xm0, 1
punpckhwd m1, m3, m2
punpcklwd m3, m2
pxor m0, m0
@@ -1317,10 +1317,10 @@
INV_TXFM_8X4_FN identity, identity
cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
- mova xm2, [cq+16*0]
- mova xm0, [cq+16*1]
- vinserti128 m2, m2, [cq+16*2], 1
- vinserti128 m0, m0, [cq+16*3], 1
+ mova xm2, [cq+16*0]
+ mova xm0, [cq+16*1]
+ vinserti128 m2, [cq+16*2], 1
+ vinserti128 m0, [cq+16*3], 1
vpbroadcastd m3, [o(pw_2896x8)]
punpcklwd m1, m2, m0
punpckhwd m2, m0
@@ -1520,14 +1520,14 @@
INV_TXFM_8X8_FN identity, identity
cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
- mova xm3, [cq+16*0]
- mova xm2, [cq+16*1]
- vinserti128 m3, m3, [cq+16*4], 1
- vinserti128 m2, m2, [cq+16*5], 1
- mova xm4, [cq+16*2]
- mova xm0, [cq+16*3]
- vinserti128 m4, m4, [cq+16*6], 1
- vinserti128 m0, m0, [cq+16*7], 1
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*1]
+ vinserti128 m3, [cq+16*4], 1
+ vinserti128 m2, [cq+16*5], 1
+ mova xm4, [cq+16*2]
+ mova xm0, [cq+16*3]
+ vinserti128 m4, [cq+16*6], 1
+ vinserti128 m0, [cq+16*7], 1
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m0
@@ -1583,13 +1583,13 @@
vpbroadcastd m10, [o(pw_16384)]
.pass1_end:
vperm2i128 m9, m3, m7, 0x31
- vinserti128 m3, m3, xm7, 1
+ vinserti128 m3, xm7, 1
vperm2i128 m8, m2, m6, 0x31
- vinserti128 m2, m2, xm6, 1
+ vinserti128 m2, xm6, 1
vperm2i128 m6, m1, m5, 0x31
- vinserti128 m1, m1, xm5, 1
+ vinserti128 m1, xm5, 1
vperm2i128 m5, m0, m4, 0x31
- vinserti128 m0, m0, xm4, 1
+ vinserti128 m0, xm4, 1
punpckhwd m4, m2, m3
punpcklwd m2, m3
punpckhwd m3, m0, m1
@@ -1840,24 +1840,24 @@
%endmacro
cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
- mova xm3, [cq+16*0]
- mova xm2, [cq+16*2]
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*2]
add cq, 16*8
- vinserti128 m3, m3, [cq+16*0], 1
- vinserti128 m2, m2, [cq+16*2], 1
+ vinserti128 m3, [cq+16*0], 1
+ vinserti128 m2, [cq+16*2], 1
vpbroadcastd m9, [o(pw_2896x8)]
- mova xm4, [cq-16*4]
- mova xm5, [cq-16*2]
- vinserti128 m4, m4, [cq+16*4], 1
- vinserti128 m5, m5, [cq+16*6], 1
- mova xm7, [cq-16*7]
- mova xm6, [cq-16*5]
- vinserti128 m7, m7, [cq+16*1], 1
- vinserti128 m6, m6, [cq+16*3], 1
- mova xm8, [cq-16*3]
- mova xm0, [cq-16*1]
- vinserti128 m8, m8, [cq+16*5], 1
- vinserti128 m0, m0, [cq+16*7], 1
+ mova xm4, [cq-16*4]
+ mova xm5, [cq-16*2]
+ vinserti128 m4, [cq+16*4], 1
+ vinserti128 m5, [cq+16*6], 1
+ mova xm7, [cq-16*7]
+ mova xm6, [cq-16*5]
+ vinserti128 m7, [cq+16*1], 1
+ vinserti128 m6, [cq+16*3], 1
+ mova xm8, [cq-16*3]
+ mova xm0, [cq-16*1]
+ vinserti128 m8, [cq+16*5], 1
+ vinserti128 m0, [cq+16*7], 1
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m5
@@ -1918,7 +1918,7 @@
pxor m3, m3
.dconly_loop:
mova xm1, [dstq]
- vinserti128 m1, m1, [dstq+strideq], 1
+ vinserti128 m1, [dstq+strideq], 1
punpckhbw m2, m1, m3
punpcklbw m1, m3
paddw m2, m0
@@ -2116,14 +2116,14 @@
INV_TXFM_16X4_FN identity, identity
cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
- mova xm2, [cq+16*0]
- mova xm4, [cq+16*1]
- vinserti128 m2, m2, [cq+16*4], 1
- vinserti128 m4, m4, [cq+16*5], 1
- mova xm0, [cq+16*2]
- mova xm1, [cq+16*3]
- vinserti128 m0, m0, [cq+16*6], 1
- vinserti128 m1, m1, [cq+16*7], 1
+ mova xm2, [cq+16*0]
+ mova xm4, [cq+16*1]
+ vinserti128 m2, [cq+16*4], 1
+ vinserti128 m4, [cq+16*5], 1
+ mova xm0, [cq+16*2]
+ mova xm1, [cq+16*3]
+ vinserti128 m0, [cq+16*6], 1
+ vinserti128 m1, [cq+16*7], 1
vpbroadcastd m7, [o(pw_1697x16)]
vpbroadcastd m8, [o(pw_16384)]
punpcklwd m3, m2, m4
@@ -2224,13 +2224,13 @@
punpckldq m8, m9, m5
punpckhdq m9, m5
vperm2i128 m4, m0, m6, 0x31
- vinserti128 m0, m0, xm6, 1
+ vinserti128 m0, xm6, 1
vperm2i128 m5, m1, m7, 0x31
- vinserti128 m1, m1, xm7, 1
+ vinserti128 m1, xm7, 1
vperm2i128 m6, m2, m8, 0x31
- vinserti128 m2, m2, xm8, 1
+ vinserti128 m2, xm8, 1
vperm2i128 m7, m3, m9, 0x31
- vinserti128 m3, m3, xm9, 1
+ vinserti128 m3, xm9, 1
jmp tx2q
.pass2:
call .main
@@ -2387,13 +2387,13 @@
punpckldq m5, m8, m2
punpckhdq m8, m2
vinserti128 m2, m6, xm5, 1
- vperm2i128 m6, m6, m5, 0x31
+ vperm2i128 m6, m5, 0x31
vperm2i128 m5, m1, m4, 0x31
- vinserti128 m1, m1, xm4, 1
+ vinserti128 m1, xm4, 1
vperm2i128 m4, m0, m3, 0x31
- vinserti128 m0, m0, xm3, 1
+ vinserti128 m0, xm3, 1
vinserti128 m3, m7, xm8, 1
- vperm2i128 m7, m7, m8, 0x31
+ vperm2i128 m7, m8, 0x31
jmp tx2q
.pass2:
call m(iadst_16x8_internal).main
@@ -2419,24 +2419,24 @@
INV_TXFM_16X8_FN identity, identity
cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
- mova xm7, [cq+16*0]
- mova xm2, [cq+16*1]
+ mova xm7, [cq+16*0]
+ mova xm2, [cq+16*1]
add cq, 16*8
vpbroadcastd m3, [o(pw_2896x8)]
- vinserti128 m7, m7, [cq+16*0], 1
- vinserti128 m2, m2, [cq+16*1], 1
- mova xm6, [cq-16*6]
- mova xm4, [cq-16*5]
- vinserti128 m6, m6, [cq+16*2], 1
- vinserti128 m4, m4, [cq+16*3], 1
- mova xm8, [cq-16*4]
- mova xm5, [cq-16*3]
- vinserti128 m8, m8, [cq+16*4], 1
- vinserti128 m5, m5, [cq+16*5], 1
- mova xm0, [cq-16*2]
- mova xm1, [cq-16*1]
- vinserti128 m0, m0, [cq+16*6], 1
- vinserti128 m1, m1, [cq+16*7], 1
+ vinserti128 m7, [cq+16*0], 1
+ vinserti128 m2, [cq+16*1], 1
+ mova xm6, [cq-16*6]
+ mova xm4, [cq-16*5]
+ vinserti128 m6, [cq+16*2], 1
+ vinserti128 m4, [cq+16*3], 1
+ mova xm8, [cq-16*4]
+ mova xm5, [cq-16*3]
+ vinserti128 m8, [cq+16*4], 1
+ vinserti128 m5, [cq+16*5], 1
+ mova xm0, [cq-16*2]
+ mova xm1, [cq-16*1]
+ vinserti128 m0, [cq+16*6], 1
+ vinserti128 m1, [cq+16*7], 1
vpbroadcastd m10, [o(pw_1697x16)]
vpbroadcastd m11, [o(pw_16384)]
REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
@@ -2524,19 +2524,19 @@
REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
pmulhrsw m1, [rsp+32*1]
vperm2i128 m8, m1, m9, 0x31
- vinserti128 m1, m1, xm9, 1
+ vinserti128 m1, xm9, 1
vperm2i128 m9, m2, m10, 0x31
- vinserti128 m2, m2, xm10, 1
+ vinserti128 m2, xm10, 1
vperm2i128 m10, m3, m11, 0x31
- vinserti128 m3, m3, xm11, 1
+ vinserti128 m3, xm11, 1
vperm2i128 m11, m4, m12, 0x31
- vinserti128 m4, m4, xm12, 1
+ vinserti128 m4, xm12, 1
vperm2i128 m12, m5, m13, 0x31
- vinserti128 m5, m5, xm13, 1
+ vinserti128 m5, xm13, 1
vperm2i128 m13, m6, m14, 0x31
- vinserti128 m6, m6, xm14, 1
+ vinserti128 m6, xm14, 1
vperm2i128 m14, m7, m15, 0x31
- vinserti128 m7, m7, xm15, 1
+ vinserti128 m7, xm15, 1
mova m15, [rsp+32*2]
.pass1_end3:
punpcklwd m0, m9, m10
@@ -3036,13 +3036,13 @@
LOAD_8ROWS cq+32*1, 32*2
call m(idct_16x8_internal).main
vperm2i128 m11, m0, m4, 0x31
- vinserti128 m0, m0, xm4, 1
+ vinserti128 m0, xm4, 1
vperm2i128 m4, m1, m5, 0x31
- vinserti128 m1, m1, xm5, 1
+ vinserti128 m1, xm5, 1
vperm2i128 m5, m2, m6, 0x31
- vinserti128 m2, m2, xm6, 1
+ vinserti128 m2, xm6, 1
vperm2i128 m6, m3, m7, 0x31
- vinserti128 m3, m3, xm7, 1
+ vinserti128 m3, xm7, 1
pxor m7, m7
REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
punpckhwd m7, m0, m1
@@ -3076,13 +3076,13 @@
LOAD_8ROWS cq+32*0, 32*2
call m(idct_16x8_internal).main
vperm2i128 m8, m0, m4, 0x31
- vinserti128 m0, m0, xm4, 1
+ vinserti128 m0, xm4, 1
vperm2i128 m4, m1, m5, 0x31
- vinserti128 m1, m1, xm5, 1
+ vinserti128 m1, xm5, 1
vperm2i128 m5, m2, m6, 0x31
- vinserti128 m2, m2, xm6, 1
+ vinserti128 m2, xm6, 1
vperm2i128 m6, m3, m7, 0x31
- vinserti128 m3, m3, xm7, 1
+ vinserti128 m3, xm7, 1
vpbroadcastd m9, [o(pw_8192)]
pxor m7, m7
REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
@@ -3285,7 +3285,7 @@
%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
vbroadcasti128 m%1, [cq+16*%3]
vbroadcasti128 m%2, [cq+16*%4]
- shufpd m%1, m%1, m%2, 0x0c
+ shufpd m%1, m%2, 0x0c
%endmacro
cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob
@@ -3387,13 +3387,13 @@
pmulhrsw m12, [rsp+32*0]
mova [rsp+32*0], m8
vperm2i128 m4, m0, m6, 0x31
- vinserti128 m0, m0, xm6, 1
+ vinserti128 m0, xm6, 1
vperm2i128 m5, m1, m7, 0x31
- vinserti128 m1, m1, xm7, 1
+ vinserti128 m1, xm7, 1
vperm2i128 m6, m2, m9, 0x31
- vinserti128 m2, m2, xm9, 1
+ vinserti128 m2, xm9, 1
vperm2i128 m7, m3, m10, 0x31
- vinserti128 m3, m3, xm10, 1
+ vinserti128 m3, xm10, 1
call m(idct_16x8_internal).main
vpbroadcastd m8, [o(pw_2048)]
REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
@@ -3432,13 +3432,13 @@
punpckldq m9, m12, m5
punpckhdq m12, m5
vperm2i128 m4, m0, m6, 0x31
- vinserti128 m0, m0, xm6, 1
+ vinserti128 m0, xm6, 1
vperm2i128 m5, m1, m7, 0x31
- vinserti128 m1, m1, xm7, 1
+ vinserti128 m1, xm7, 1
vperm2i128 m6, m2, m9, 0x31
- vinserti128 m2, m2, xm9, 1
+ vinserti128 m2, xm9, 1
vperm2i128 m7, m3, m12, 0x31
- vinserti128 m3, m3, xm12, 1
+ vinserti128 m3, xm12, 1
call m(idct_16x8_internal).main2
vpbroadcastd m8, [o(pw_2048)]
REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
@@ -3457,26 +3457,26 @@
lea r4, [strideq*3]
sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
.loop:
- mova xm0, [cq+16* 0]
- mova xm1, [cq+16* 4]
- vinserti128 m0, m0, [cq+16* 1], 1
- vinserti128 m1, m1, [cq+16* 5], 1
+ mova xm0, [cq+16* 0]
+ mova xm1, [cq+16* 4]
+ vinserti128 m0, [cq+16* 1], 1
+ vinserti128 m1, [cq+16* 5], 1
pxor m8, m8
mova [cq+32*0], m8
mova [cq+32*2], m8
add cq, 16*16
- mova xm2, [cq-16* 8]
- mova xm3, [cq-16* 4]
- vinserti128 m2, m2, [cq-16* 7], 1
- vinserti128 m3, m3, [cq-16* 3], 1
- mova xm4, [cq+16* 0]
- mova xm5, [cq+16* 4]
- vinserti128 m4, m4, [cq+16* 1], 1
- vinserti128 m5, m5, [cq+16* 5], 1
- mova xm6, [cq+16* 8]
- mova xm7, [cq+16*12]
- vinserti128 m6, m6, [cq+16* 9], 1
- vinserti128 m7, m7, [cq+16*13], 1
+ mova xm2, [cq-16* 8]
+ mova xm3, [cq-16* 4]
+ vinserti128 m2, [cq-16* 7], 1
+ vinserti128 m3, [cq-16* 3], 1
+ mova xm4, [cq+16* 0]
+ mova xm5, [cq+16* 4]
+ vinserti128 m4, [cq+16* 1], 1
+ vinserti128 m5, [cq+16* 5], 1
+ mova xm6, [cq+16* 8]
+ mova xm7, [cq+16*12]
+ vinserti128 m6, [cq+16* 9], 1
+ vinserti128 m7, [cq+16*13], 1
REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
call .transpose8x8
@@ -3529,22 +3529,22 @@
lea r5, [dstq+strideq*4]
sub eobd, 107
.loop:
- mova xm0, [cq-16*8]
- mova xm1, [cq-16*7]
- vinserti128 m0, m0, [cq+16*0], 1
- vinserti128 m1, m1, [cq+16*1], 1
- mova xm2, [cq-16*6]
- mova xm3, [cq-16*5]
- vinserti128 m2, m2, [cq+16*2], 1
- vinserti128 m3, m3, [cq+16*3], 1
- mova xm4, [cq-16*4]
- mova xm5, [cq-16*3]
- vinserti128 m4, m4, [cq+16*4], 1
- vinserti128 m5, m5, [cq+16*5], 1
- mova xm6, [cq-16*2]
- mova xm7, [cq-16*1]
- vinserti128 m6, m6, [cq+16*6], 1
- vinserti128 m7, m7, [cq+16*7], 1
+ mova xm0, [cq-16*8]
+ mova xm1, [cq-16*7]
+ vinserti128 m0, [cq+16*0], 1
+ vinserti128 m1, [cq+16*1], 1
+ mova xm2, [cq-16*6]
+ mova xm3, [cq-16*5]
+ vinserti128 m2, [cq+16*2], 1
+ vinserti128 m3, [cq+16*3], 1
+ mova xm4, [cq-16*4]
+ mova xm5, [cq-16*3]
+ vinserti128 m4, [cq+16*4], 1
+ vinserti128 m5, [cq+16*5], 1
+ mova xm6, [cq-16*2]
+ mova xm7, [cq-16*1]
+ vinserti128 m6, [cq+16*6], 1
+ vinserti128 m7, [cq+16*7], 1
pxor m8, m8
REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
@@ -3716,28 +3716,28 @@
vextracti128 [r2+32*3+16], m14, 1
vinserti128 m8, m1, xm9, 1
vperm2i128 m12, m1, m9, 0x31
- mova xm0, [tmp1q-32*4]
- mova xm1, [tmp1q-32*3]
- vinserti128 m0, m0, [tmp1q+32*0], 1
- vinserti128 m1, m1, [tmp1q+32*1], 1
+ mova xm0, [tmp1q-32*4]
+ mova xm1, [tmp1q-32*3]
+ vinserti128 m0, [tmp1q+32*0], 1
+ vinserti128 m1, [tmp1q+32*1], 1
vinserti128 m10, m5, xm13, 1
vperm2i128 m14, m5, m13, 0x31
- mova xm4, [tmp1q-32*4+16]
- mova xm5, [tmp1q-32*3+16]
- vinserti128 m4, m4, [tmp1q+32*0+16], 1
- vinserti128 m5, m5, [tmp1q+32*1+16], 1
+ mova xm4, [tmp1q-32*4+16]
+ mova xm5, [tmp1q-32*3+16]
+ vinserti128 m4, [tmp1q+32*0+16], 1
+ vinserti128 m5, [tmp1q+32*1+16], 1
vinserti128 m9, m3, xm11, 1
vperm2i128 m13, m3, m11, 0x31
- mova xm2, [tmp1q-32*2]
- mova xm3, [tmp1q-32*1]
- vinserti128 m2, m2, [tmp1q+32*2], 1
- vinserti128 m3, m3, [tmp1q+32*3], 1
+ mova xm2, [tmp1q-32*2]
+ mova xm3, [tmp1q-32*1]
+ vinserti128 m2, [tmp1q+32*2], 1
+ vinserti128 m3, [tmp1q+32*3], 1
vinserti128 m11, m7, xm15, 1
vperm2i128 m15, m7, m15, 0x31
- mova xm6, [tmp1q-32*2+16]
- mova xm7, [tmp1q-32*1+16]
- vinserti128 m6, m6, [tmp1q+32*2+16], 1
- vinserti128 m7, m7, [tmp1q+32*3+16], 1
+ mova xm6, [tmp1q-32*2+16]
+ mova xm7, [tmp1q-32*1+16]
+ vinserti128 m6, [tmp1q+32*2+16], 1
+ vinserti128 m7, [tmp1q+32*3+16], 1
call .main_oddhalf
LOAD_8ROWS_H r2-32*4, 32
.idct16:
@@ -3985,7 +3985,7 @@
mova [tmp1q+32*(11-%2)], xm%2
vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
vperm2i128 m%2, m%1, m%4, 0x31
- vinserti128 m%1, m%1, xm%4, 1
+ vinserti128 m%1, xm%4, 1
%endmacro
cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
@@ -4103,22 +4103,22 @@
mov rax, cq
paddw m11, m12, m12 ; pw_16384
.loop:
- mova xm0, [cq+64* 0]
- mova xm1, [cq+64* 1]
- vinserti128 m0, m0, [cq+64* 8], 1
- vinserti128 m1, m1, [cq+64* 9], 1
- mova xm2, [cq+64* 2]
- mova xm3, [cq+64* 3]
- vinserti128 m2, m2, [cq+64*10], 1
- vinserti128 m3, m3, [cq+64*11], 1
- mova xm4, [cq+64* 4]
- mova xm5, [cq+64* 5]
- vinserti128 m4, m4, [cq+64*12], 1
- vinserti128 m5, m5, [cq+64*13], 1
- mova xm6, [cq+64* 6]
- mova xm7, [cq+64* 7]
- vinserti128 m6, m6, [cq+64*14], 1
- vinserti128 m7, m7, [cq+64*15], 1
+ mova xm0, [cq+64* 0]
+ mova xm1, [cq+64* 1]
+ vinserti128 m0, [cq+64* 8], 1
+ vinserti128 m1, [cq+64* 9], 1
+ mova xm2, [cq+64* 2]
+ mova xm3, [cq+64* 3]
+ vinserti128 m2, [cq+64*10], 1
+ vinserti128 m3, [cq+64*11], 1
+ mova xm4, [cq+64* 4]
+ mova xm5, [cq+64* 5]
+ vinserti128 m4, [cq+64*12], 1
+ vinserti128 m5, [cq+64*13], 1
+ mova xm6, [cq+64* 6]
+ mova xm7, [cq+64* 7]
+ vinserti128 m6, [cq+64*14], 1
+ vinserti128 m7, [cq+64*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
@@ -4171,22 +4171,22 @@
mov r5, dstq
mov rax, cq
.loop:
- mova xm0, [cq+32* 0]
- mova xm1, [cq+32* 1]
- vinserti128 m0, m0, [cq+32* 8], 1
- vinserti128 m1, m1, [cq+32* 9], 1
- mova xm2, [cq+32* 2]
- mova xm3, [cq+32* 3]
- vinserti128 m2, m2, [cq+32*10], 1
- vinserti128 m3, m3, [cq+32*11], 1
- mova xm4, [cq+32* 4]
- mova xm5, [cq+32* 5]
- vinserti128 m4, m4, [cq+32*12], 1
- vinserti128 m5, m5, [cq+32*13], 1
- mova xm6, [cq+32* 6]
- mova xm7, [cq+32* 7]
- vinserti128 m6, m6, [cq+32*14], 1
- vinserti128 m7, m7, [cq+32*15], 1
+ mova xm0, [cq+32* 0]
+ mova xm1, [cq+32* 1]
+ vinserti128 m0, [cq+32* 8], 1
+ vinserti128 m1, [cq+32* 9], 1
+ mova xm2, [cq+32* 2]
+ mova xm3, [cq+32* 3]
+ vinserti128 m2, [cq+32*10], 1
+ vinserti128 m3, [cq+32*11], 1
+ mova xm4, [cq+32* 4]
+ mova xm5, [cq+32* 5]
+ vinserti128 m4, [cq+32*12], 1
+ vinserti128 m5, [cq+32*13], 1
+ mova xm6, [cq+32* 6]
+ mova xm7, [cq+32* 7]
+ vinserti128 m6, [cq+32*14], 1
+ vinserti128 m7, [cq+32*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
@@ -4374,22 +4374,22 @@
mov r5, dstq
lea rax, [cq+32]
.loop:
- mova xm0, [cq+64* 0]
- mova xm1, [cq+64* 1]
- vinserti128 m0, m0, [cq+64* 8], 1
- vinserti128 m1, m1, [cq+64* 9], 1
- mova xm2, [cq+64* 2]
- mova xm3, [cq+64* 3]
- vinserti128 m2, m2, [cq+64*10], 1
- vinserti128 m3, m3, [cq+64*11], 1
- mova xm4, [cq+64* 4]
- mova xm5, [cq+64* 5]
- vinserti128 m4, m4, [cq+64*12], 1
- vinserti128 m5, m5, [cq+64*13], 1
- mova xm6, [cq+64* 6]
- mova xm7, [cq+64* 7]
- vinserti128 m6, m6, [cq+64*14], 1
- vinserti128 m7, m7, [cq+64*15], 1
+ mova xm0, [cq+64* 0]
+ mova xm1, [cq+64* 1]
+ vinserti128 m0, [cq+64* 8], 1
+ vinserti128 m1, [cq+64* 9], 1
+ mova xm2, [cq+64* 2]
+ mova xm3, [cq+64* 3]
+ vinserti128 m2, [cq+64*10], 1
+ vinserti128 m3, [cq+64*11], 1
+ mova xm4, [cq+64* 4]
+ mova xm5, [cq+64* 5]
+ vinserti128 m4, [cq+64*12], 1
+ vinserti128 m5, [cq+64*13], 1
+ mova xm6, [cq+64* 6]
+ mova xm7, [cq+64* 7]
+ vinserti128 m6, [cq+64*14], 1
+ vinserti128 m7, [cq+64*15], 1
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
@@ -4532,27 +4532,27 @@
add eobd, 0x80000000
jnc .pass1_loop
lea r2, [rsp+32*23]
- mova xm0, [r2-32*4+ 0]
- mova xm1, [r2-32*2+ 0]
- vinserti128 m0, m0, [r2+32*0+ 0], 1
- vinserti128 m1, m1, [r2+32*2+ 0], 1
- mova xm2, [r2-32*4+16]
- mova xm3, [r2-32*2+16]
- vinserti128 m2, m2, [r2+32*0+16], 1
- vinserti128 m3, m3, [r2+32*2+16], 1
+ mova xm0, [r2-32*4+ 0]
+ mova xm1, [r2-32*2+ 0]
+ vinserti128 m0, [r2+32*0+ 0], 1
+ vinserti128 m1, [r2+32*2+ 0], 1
+ mova xm2, [r2-32*4+16]
+ mova xm3, [r2-32*2+16]
+ vinserti128 m2, [r2+32*0+16], 1
+ vinserti128 m3, [r2+32*2+16], 1
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
test r7d, r7d
jl .fast
lea r3, [r2+32*8]
- mova xm4, [r3-32*4+ 0]
- mova xm5, [r3-32*2+ 0]
- vinserti128 m4, m4, [r3+32*0+ 0], 1
- vinserti128 m5, m5, [r3+32*2+ 0], 1
- mova xm6, [r3-32*4+16]
- mova xm7, [r3-32*2+16]
- vinserti128 m6, m6, [r3+32*0+16], 1
- vinserti128 m7, m7, [r3+32*2+16], 1
+ mova xm4, [r3-32*4+ 0]
+ mova xm5, [r3-32*2+ 0]
+ vinserti128 m4, [r3+32*0+ 0], 1
+ vinserti128 m5, [r3+32*2+ 0], 1
+ mova xm6, [r3-32*4+16]
+ mova xm7, [r3-32*2+16]
+ vinserti128 m6, [r3+32*0+16], 1
+ vinserti128 m7, [r3+32*2+16], 1
.fast:
mova [rsp], m8
lea tmp1q, [rsp+32*7]
@@ -4575,26 +4575,26 @@
mova [tmp1q+32*1], m13
mova [tmp1q+32*2], m14
mova [tmp1q+32*3], m15
- mova xm0, [r2-32*3+ 0]
- mova xm1, [r2-32*1+ 0]
- vinserti128 m0, m0, [r2+32*1+ 0], 1
- vinserti128 m1, m1, [r2+32*3+ 0], 1
- mova xm2, [r2-32*3+16]
- mova xm3, [r2-32*1+16]
- vinserti128 m2, m2, [r2+32*1+16], 1
- vinserti128 m3, m3, [r2+32*3+16], 1
+ mova xm0, [r2-32*3+ 0]
+ mova xm1, [r2-32*1+ 0]
+ vinserti128 m0, [r2+32*1+ 0], 1
+ vinserti128 m1, [r2+32*3+ 0], 1
+ mova xm2, [r2-32*3+16]
+ mova xm3, [r2-32*1+16]
+ vinserti128 m2, [r2+32*1+16], 1
+ vinserti128 m3, [r2+32*3+16], 1
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
test r7d, r7d
jl .fast2
- mova xm4, [r3-32*3+ 0]
- mova xm5, [r3-32*1+ 0]
- vinserti128 m4, m4, [r3+32*1+ 0], 1
- vinserti128 m5, m5, [r3+32*3+ 0], 1
- mova xm6, [r3-32*3+16]
- mova xm7, [r3-32*1+16]
- vinserti128 m6, m6, [r3+32*1+16], 1
- vinserti128 m7, m7, [r3+32*3+16], 1
+ mova xm4, [r3-32*3+ 0]
+ mova xm5, [r3-32*1+ 0]
+ vinserti128 m4, [r3+32*1+ 0], 1
+ vinserti128 m5, [r3+32*3+ 0], 1
+ mova xm6, [r3-32*3+16]
+ mova xm7, [r3-32*1+16]
+ vinserti128 m6, [r3+32*1+16], 1
+ vinserti128 m7, [r3+32*3+16], 1
.fast2:
add tmp1q, 32*8
lea tmp2q, [tmp1q+32*8]
@@ -4603,27 +4603,27 @@
vpbroadcastd m15, [o(pd_2048)]
add tmp1q, 32*16
add tmp2q, 32*32
- mova xm0, [r2-32*4+ 0]
- mova xm3, [r2-32*1+16]
- vinserti128 m0, m0, [r2+32*0+ 0], 1
- vinserti128 m3, m3, [r2+32*3+16], 1
- mova xm4, [r2-32*4+16]
- mova xm7, [r2-32*1+ 0]
- vinserti128 m4, m4, [r2+32*0+16], 1
- vinserti128 m7, m7, [r2+32*3+ 0], 1
+ mova xm0, [r2-32*4+ 0]
+ mova xm3, [r2-32*1+16]
+ vinserti128 m0, [r2+32*0+ 0], 1
+ vinserti128 m3, [r2+32*3+16], 1
+ mova xm4, [r2-32*4+16]
+ mova xm7, [r2-32*1+ 0]
+ vinserti128 m4, [r2+32*0+16], 1
+ vinserti128 m7, [r2+32*3+ 0], 1
pxor m1, m1
REPX {mova x, m1}, m2, m5, m6
test r7d, r7d
jl .fast3
add r3, 32*24
- mova xm1, [r3-32*1+16]
- mova xm2, [r3-32*4+ 0]
- vinserti128 m1, m1, [r3+32*3+16], 1
- vinserti128 m2, m2, [r3+32*0+ 0], 1
- mova xm5, [r3-32*1+ 0]
- mova xm6, [r3-32*4+16]
- vinserti128 m5, m5, [r3+32*3+ 0], 1
- vinserti128 m6, m6, [r3+32*0+16], 1
+ mova xm1, [r3-32*1+16]
+ mova xm2, [r3-32*4+ 0]
+ vinserti128 m1, [r3+32*3+16], 1
+ vinserti128 m2, [r3+32*0+ 0], 1
+ mova xm5, [r3-32*1+ 0]
+ mova xm6, [r3-32*4+16]
+ vinserti128 m5, [r3+32*3+ 0], 1
+ vinserti128 m6, [r3+32*0+16], 1
.fast3:
add rax, o_idct64_offset
call m(inv_txfm_add_dct_dct_16x64).main_part1
@@ -4630,26 +4630,26 @@
add rax, 8
add tmp1q, 32*8
sub tmp2q, 32*8
- mova xm0, [r2-32*2+ 0]
- mova xm3, [r2-32*3+16]
- vinserti128 m0, m0, [r2+32*2+ 0], 1
- vinserti128 m3, m3, [r2+32*1+16], 1
- mova xm4, [r2-32*2+16]
- mova xm7, [r2-32*3+ 0]
- vinserti128 m4, m4, [r2+32*2+16], 1
- vinserti128 m7, m7, [r2+32*1+ 0], 1
+ mova xm0, [r2-32*2+ 0]
+ mova xm3, [r2-32*3+16]
+ vinserti128 m0, [r2+32*2+ 0], 1
+ vinserti128 m3, [r2+32*1+16], 1
+ mova xm4, [r2-32*2+16]
+ mova xm7, [r2-32*3+ 0]
+ vinserti128 m4, [r2+32*2+16], 1
+ vinserti128 m7, [r2+32*1+ 0], 1
pxor m1, m1
REPX {mova x, m1}, m2, m5, m6
test r7d, r7d
jl .fast4
- mova xm1, [r3-32*3+16]
- mova xm2, [r3-32*2+ 0]
- vinserti128 m1, m1, [r3+32*1+16], 1
- vinserti128 m2, m2, [r3+32*2+ 0], 1
- mova xm5, [r3-32*3+ 0]
- mova xm6, [r3-32*2+16]
- vinserti128 m5, m5, [r3+32*1+ 0], 1
- vinserti128 m6, m6, [r3+32*2+16], 1
+ mova xm1, [r3-32*3+16]
+ mova xm2, [r3-32*2+ 0]
+ vinserti128 m1, [r3+32*1+16], 1
+ vinserti128 m2, [r3+32*2+ 0], 1
+ mova xm5, [r3-32*3+ 0]
+ mova xm6, [r3-32*2+16]
+ vinserti128 m5, [r3+32*1+ 0], 1
+ vinserti128 m6, [r3+32*2+16], 1
.fast4:
call m(inv_txfm_add_dct_dct_16x64).main_part1
call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
@@ -4933,38 +4933,38 @@
mov tmp2d, 4
.pass2_loop:
lea r3, [tmp1q-32*8]
- mova xm0, [r3 -32*4]
- mova xm1, [r3 -32*3]
- vinserti128 m0, m0, [tmp1q-32*4], 1
- vinserti128 m1, m1, [tmp1q-32*3], 1
- mova xm2, [r3 -32*2]
- mova xm3, [r3 -32*1]
- vinserti128 m2, m2, [tmp1q-32*2], 1
- vinserti128 m3, m3, [tmp1q-32*1], 1
- mova xm4, [r3 +32*0]
- mova xm5, [r3 +32*1]
- vinserti128 m4, m4, [tmp1q+32*0], 1
- vinserti128 m5, m5, [tmp1q+32*1], 1
- mova xm6, [r3 +32*2]
- mova xm7, [r3 +32*3]
- vinserti128 m6, m6, [tmp1q+32*2], 1
- vinserti128 m7, m7, [tmp1q+32*3], 1
- mova xm8, [r3 -32*4+16]
- mova xm9, [r3 -32*3+16]
- vinserti128 m8, m8, [tmp1q-32*4+16], 1
- vinserti128 m9, m9, [tmp1q-32*3+16], 1
- mova xm10, [r3 -32*2+16]
- mova xm11, [r3 -32*1+16]
- vinserti128 m10, m10, [tmp1q-32*2+16], 1
- vinserti128 m11, m11, [tmp1q-32*1+16], 1
- mova xm12, [r3 +32*0+16]
- mova xm13, [r3 +32*1+16]
- vinserti128 m12, m12, [tmp1q+32*0+16], 1
- vinserti128 m13, m13, [tmp1q+32*1+16], 1
- mova xm14, [r3 +32*2+16]
- mova xm15, [r3 +32*3+16]
- vinserti128 m14, m14, [tmp1q+32*2+16], 1
- vinserti128 m15, m15, [tmp1q+32*3+16], 1
+ mova xm0, [r3 -32*4]
+ mova xm1, [r3 -32*3]
+ vinserti128 m0, [tmp1q-32*4], 1
+ vinserti128 m1, [tmp1q-32*3], 1
+ mova xm2, [r3 -32*2]
+ mova xm3, [r3 -32*1]
+ vinserti128 m2, [tmp1q-32*2], 1
+ vinserti128 m3, [tmp1q-32*1], 1
+ mova xm4, [r3 +32*0]
+ mova xm5, [r3 +32*1]
+ vinserti128 m4, [tmp1q+32*0], 1
+ vinserti128 m5, [tmp1q+32*1], 1
+ mova xm6, [r3 +32*2]
+ mova xm7, [r3 +32*3]
+ vinserti128 m6, [tmp1q+32*2], 1
+ vinserti128 m7, [tmp1q+32*3], 1
+ mova xm8, [r3 -32*4+16]
+ mova xm9, [r3 -32*3+16]
+ vinserti128 m8, [tmp1q-32*4+16], 1
+ vinserti128 m9, [tmp1q-32*3+16], 1
+ mova xm10, [r3 -32*2+16]
+ mova xm11, [r3 -32*1+16]
+ vinserti128 m10, [tmp1q-32*2+16], 1
+ vinserti128 m11, [tmp1q-32*1+16], 1
+ mova xm12, [r3 +32*0+16]
+ mova xm13, [r3 +32*1+16]
+ vinserti128 m12, [tmp1q+32*0+16], 1
+ vinserti128 m13, [tmp1q+32*1+16], 1
+ mova xm14, [r3 +32*2+16]
+ mova xm15, [r3 +32*3+16]
+ vinserti128 m14, [tmp1q+32*2+16], 1
+ vinserti128 m15, [tmp1q+32*3+16], 1
mova [rsp+32*0], m6
mova [rsp+32*1], m7
vpbroadcastd m7, [o(pw_8192)]
@@ -5320,48 +5320,48 @@
mov tmp3d, 4
.loop:
lea tmp2q, [tmp1q+32*8]
- mova xm0, [tmp1q-32*4]
- mova xm1, [tmp1q-32*3]
- vinserti128 m0, m0, [tmp2q-32*4], 1
- vinserti128 m1, m1, [tmp2q-32*3], 1
- mova xm2, [tmp1q-32*2]
- mova xm3, [tmp1q-32*1]
- vinserti128 m2, m2, [tmp2q-32*2], 1
- vinserti128 m3, m3, [tmp2q-32*1], 1
- mova xm4, [tmp1q+32*0]
- mova xm5, [tmp1q+32*1]
- vinserti128 m4, m4, [tmp2q+32*0], 1
- vinserti128 m5, m5, [tmp2q+32*1], 1
- mova xm6, [tmp1q+32*2]
- mova xm7, [tmp1q+32*3]
- vinserti128 m6, m6, [tmp2q+32*2], 1
- vinserti128 m7, m7, [tmp2q+32*3], 1
+ mova xm0, [tmp1q-32*4]
+ mova xm1, [tmp1q-32*3]
+ vinserti128 m0, [tmp2q-32*4], 1
+ vinserti128 m1, [tmp2q-32*3], 1
+ mova xm2, [tmp1q-32*2]
+ mova xm3, [tmp1q-32*1]
+ vinserti128 m2, [tmp2q-32*2], 1
+ vinserti128 m3, [tmp2q-32*1], 1
+ mova xm4, [tmp1q+32*0]
+ mova xm5, [tmp1q+32*1]
+ vinserti128 m4, [tmp2q+32*0], 1
+ vinserti128 m5, [tmp2q+32*1], 1
+ mova xm6, [tmp1q+32*2]
+ mova xm7, [tmp1q+32*3]
+ vinserti128 m6, [tmp2q+32*2], 1
+ vinserti128 m7, [tmp2q+32*3], 1
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
- mova xm8, [tmp1q-32*4+16]
- mova xm9, [tmp1q-32*3+16]
- vinserti128 m8, m8, [tmp2q-32*4+16], 1
- vinserti128 m9, m9, [tmp2q-32*3+16], 1
+ mova xm8, [tmp1q-32*4+16]
+ mova xm9, [tmp1q-32*3+16]
+ vinserti128 m8, [tmp2q-32*4+16], 1
+ vinserti128 m9, [tmp2q-32*3+16], 1
mova [tmp1q-32*4], m0
mova [tmp2q-32*4], m1
mova [tmp1q-32*3], m2
mova [tmp2q-32*3], m3
- mova xm2, [tmp1q-32*2+16]
- mova xm3, [tmp1q-32*1+16]
- vinserti128 m2, m2, [tmp2q-32*2+16], 1
- vinserti128 m3, m3, [tmp2q-32*1+16], 1
+ mova xm2, [tmp1q-32*2+16]
+ mova xm3, [tmp1q-32*1+16]
+ vinserti128 m2, [tmp2q-32*2+16], 1
+ vinserti128 m3, [tmp2q-32*1+16], 1
mova [tmp1q-32*2], m4
mova [tmp2q-32*2], m5
mova [tmp1q-32*1], m6
mova [tmp2q-32*1], m7
- mova xm4, [tmp1q+32*0+16]
- mova xm5, [tmp1q+32*1+16]
- vinserti128 m4, m4, [tmp2q+32*0+16], 1
- vinserti128 m5, m5, [tmp2q+32*1+16], 1
- mova xm6, [tmp1q+32*2+16]
- mova xm7, [tmp1q+32*3+16]
- vinserti128 m6, m6, [tmp2q+32*2+16], 1
- vinserti128 m7, m7, [tmp2q+32*3+16], 1
+ mova xm4, [tmp1q+32*0+16]
+ mova xm5, [tmp1q+32*1+16]
+ vinserti128 m4, [tmp2q+32*0+16], 1
+ vinserti128 m5, [tmp2q+32*1+16], 1
+ mova xm6, [tmp1q+32*2+16]
+ mova xm7, [tmp1q+32*3+16]
+ vinserti128 m6, [tmp2q+32*2+16], 1
+ vinserti128 m7, [tmp2q+32*3+16], 1
pmulhrsw m0, m8, m10
pmulhrsw m1, m9, m10
REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7