ref: aa0fc4718bca67f521ac5ef21b29719764182cf1
parent: 2df874896225540b91bb31838845c80e2f854845
author: Henrik Gramner <[email protected]>
date: Wed Oct 3 16:08:28 EDT 2018
x86: Enable ITX AVX2 asm on 64-bit Windows
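
These functions jump between each other and share tail code, so the RET
macro in x86inc.asm cannot automatically restore Win64's callee-saved
XMM registers (xmm6 and up), which is why the code was previously
limited to 64-bit Unix. Handle the spills explicitly instead: declare
the XMM register count on each *_internal function's cglobal line
rather than in the shared INV_TXFM_FN wrapper (which now declares
none), add WIN64_SPILL_XMM to the wrapper fast paths that use more than
six XMM registers, and insert WIN64_RESTORE_XMM before shared .end
labels so the spilling path restores its registers before falling into
tail code that non-spilling paths jump into. The !defined(_WIN32) guard
in itx_init.c can then be dropped.

A minimal sketch of the x86inc.asm pattern relied on here; the function
name, label and register counts are illustrative, not taken from this
patch:

    cglobal itx_internal_example, 0, 5, 13, dst, stride, c, eob, tx2
        ; declaring 13 XMM registers makes cglobal's prologue spill
        ; the callee-saved xmm6-xmm12 on Win64
        ...
        WIN64_RESTORE_XMM  ; restore before the shared tail ...
    .end:                  ; ... so the RET below is also safe when other
        ...                ; code jumps straight to .end without spilling
        RET
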
--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -26,7 +26,7 @@
%include "config.asm"
%include "ext/x86/x86inc.asm"
-%if ARCH_X86_64 && UNIX64 ; Fixme: Windows
+%if ARCH_X86_64
SECTION_RODATA 32
@@ -117,10 +117,6 @@
SECTION .text
-; Callee-saved registers has to be explicitly handled when jumping around
-; different functions since RET can't automatically deal with it.
-ASSERT ARCH_X86_64 && WIN64 == 0
-
; Code size reduction trickery: Instead of using rip-relative loads with
; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
; single rip-relative lea and then address things relative from that with
@@ -373,8 +369,8 @@
vpblendd m0, m0, m2, 0x03
ITX4_END 3, 0, 2, 1, 0
-%macro INV_TXFM_FN 5 ; type1, type2, fast_thresh, size, num_mmregs
-cglobal inv_txfm_add_%1_%2_%4, 4, 5, %5, dst, stride, c, eob, tx2
+%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
+cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, c, eob, tx2
%undef cmp
%define %%p1 m(i%1_%4_internal)
lea rax, [o_base]
@@ -396,7 +392,7 @@
%endmacro
%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 4x4, 6
+ INV_TXFM_FN %1, %2, %3, 4x4
%ifidn %1_%2, dct_identity
vpbroadcastd m0, [o(pw_2896x8)]
pmulhrsw m0, [cq]
@@ -500,7 +496,7 @@
INV_TXFM_4X4_FN dct, flipadst, 0
INV_TXFM_4X4_FN dct, identity, 3
-cglobal idct_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
IDCT4_1D_PACKED
@@ -522,7 +518,7 @@
INV_TXFM_4X4_FN adst, flipadst, 0
INV_TXFM_4X4_FN adst, identity
-cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call .main
@@ -550,7 +546,7 @@
INV_TXFM_4X4_FN flipadst, flipadst, 0
INV_TXFM_4X4_FN flipadst, identity
-cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call m(iadst_4x4_internal).main
@@ -574,7 +570,7 @@
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity
-cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
vpbroadcastd m2, [o(pw_5793x4)]
@@ -621,7 +617,7 @@
%endmacro
%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 4x8, 7
+ INV_TXFM_FN %1, %2, %3, 4x8
%if %3 >= 0
%ifidn %1_%2, dct_identity
vpbroadcastd xm0, [o(pw_2896x8)]
@@ -753,7 +749,7 @@
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
-cglobal idct_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m5, [o(pw_2896x8)]
@@ -785,7 +781,7 @@
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity
-cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m2, [o(pw_2896x8)]
@@ -813,6 +809,7 @@
.end2:
pmulhrsw m0, m4
pmulhrsw m1, m4
+ WIN64_RESTORE_XMM
.end3:
pxor m2, m2
mova [cq+32*0], m2
@@ -832,7 +829,7 @@
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity
-cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m2, [o(pw_2896x8)]
@@ -864,7 +861,7 @@
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity
-cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m2, [cq+32*0], q3120
vpermq m0, [cq+32*1], q3120
vpbroadcastd m3, [o(pw_2896x8)]
@@ -885,7 +882,7 @@
jmp m(iadst_4x8_internal).end2
%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 4x16, 11
+ INV_TXFM_FN %1, %2, %3, 4x16
%if %3 >= 0
%ifidn %1_%2, dct_identity
vpbroadcastd m0, [o(pw_2896x8)]
@@ -1040,7 +1037,7 @@
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
-cglobal idct_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
@@ -1081,7 +1078,7 @@
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity
-cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
@@ -1115,6 +1112,7 @@
vpblendd m5, m5, m6, 0xcc
.end2:
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ WIN64_RESTORE_XMM
.end3:
pxor m4, m4
mova [cq+32*0], m4
@@ -1195,7 +1193,7 @@
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity
-cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
@@ -1232,7 +1230,7 @@
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
-cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m3, [cq+32*0]
mova m2, [cq+32*1]
mova m4, [cq+32*2]
@@ -1284,7 +1282,7 @@
%endmacro
%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x4, 7
+ INV_TXFM_FN %1, %2, %3, 8x4
%if %3 >= 0
%ifidn %1_%2, dct_identity
vpbroadcastd xm0, [o(pw_2896x8)]
@@ -1347,7 +1345,7 @@
INV_TXFM_8X4_FN dct, flipadst, 0
INV_TXFM_8X4_FN dct, identity, 3
-cglobal idct_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2896x8)]
pmulhrsw xm0, xm3, [cq+16*0]
pmulhrsw xm1, xm3, [cq+16*1]
@@ -1373,7 +1371,7 @@
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity
-cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd xm0, [o(pw_2896x8)]
pshufd xm4, [cq+16*0], q1032
pmulhrsw xm3, xm0, [cq+16*3]
@@ -1401,6 +1399,7 @@
vpbroadcastd m2, [o(pw_2048)]
pmulhrsw m0, m2
pmulhrsw m1, m2
+ WIN64_RESTORE_XMM
.end3:
pxor m2, m2
mova [cq+32*0], m2
@@ -1418,7 +1417,7 @@
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity
-cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd xm0, [o(pw_2896x8)]
pshufd xm4, [cq+16*0], q1032
pmulhrsw xm3, xm0, [cq+16*3]
@@ -1448,7 +1447,7 @@
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity
-cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
mova xm2, [cq+16*0]
mova xm0, [cq+16*1]
vinserti128 m2, m2, [cq+16*2], 1
@@ -1472,7 +1471,7 @@
jmp m(iadst_8x4_internal).end
%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x8, 7
+ INV_TXFM_FN %1, %2, %3, 8x8
%ifidn %1_%2, dct_identity
vpbroadcastd xm0, [o(pw_2896x8)]
pmulhrsw xm0, [cq]
@@ -1537,7 +1536,7 @@
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
-cglobal idct_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120 ; 0 1
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m2, [cq+32*2], q3120 ; 4 5
@@ -1574,7 +1573,7 @@
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity
-cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m4, [cq+32*0], q1302 ; 1 0
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
@@ -1615,6 +1614,7 @@
.end3:
pmulhrsw m2, m4
pmulhrsw m3, m4
+ WIN64_RESTORE_XMM
.end4:
pxor m4, m4
mova [cq+32*0], m4
@@ -1636,7 +1636,7 @@
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity
-cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m4, [cq+32*0], q1302 ; 1 0
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
@@ -1682,7 +1682,7 @@
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity
-cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
mova xm3, [cq+16*0]
mova xm2, [cq+16*1]
vinserti128 m3, m3, [cq+16*4], 1
@@ -1705,7 +1705,7 @@
jmp m(iadst_8x8_internal).end
%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 8x16, 13
+ INV_TXFM_FN %1, %2, %3, 8x16
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
@@ -1720,6 +1720,7 @@
mov r2d, 4
jmp m(inv_txfm_add_dct_dct_8x8).end2
%elifidn %1_%2, dct_identity
+ WIN64_SPILL_XMM 13
vpbroadcastd m0, [o(pw_2896x8)]
pmulhrsw m7, m0, [cq]
vpbroadcastd m1, [o(pw_16384)]
@@ -1798,7 +1799,7 @@
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst
-cglobal idct_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(idct_16x8_internal).main
vpbroadcastd m10, [o(pw_16384)]
@@ -1861,7 +1862,7 @@
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity
-cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
vpbroadcastd m10, [o(pw_16384)]
@@ -1966,7 +1967,7 @@
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity
-cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
vpbroadcastd m9, [o(pw_16384)]
@@ -2013,7 +2014,7 @@
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity
-cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
mova xm3, [cq+16*0]
mova xm2, [cq+16*2]
add cq, 16*8
@@ -2077,7 +2078,7 @@
%endmacro
%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x4, 11
+ INV_TXFM_FN %1, %2, %3, 16x4
%if %3 >= 0
%ifidn %1_%2, dct_identity
vpbroadcastd xm3, [o(pw_2896x8)]
@@ -2188,7 +2189,7 @@
INV_TXFM_16X4_FN dct, flipadst, 0
INV_TXFM_16X4_FN dct, identity, 3
-cglobal idct_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova xm0, [cq+16*0]
mova xm1, [cq+16*1]
mova xm2, [cq+16*2]
@@ -2223,7 +2224,7 @@
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity
-cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q1230
vpermq m3, [cq+32*3], q2103
vpermq m1, [cq+32*1], q1230
@@ -2259,6 +2260,7 @@
.end:
vpbroadcastd m4, [o(pw_2048)]
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ WIN64_RESTORE_XMM
.end2:
pxor m4, m4
mova [cq+32*0], m4
@@ -2319,7 +2321,7 @@
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity
-cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q1230
vpermq m3, [cq+32*3], q2103
vpermq m1, [cq+32*1], q1230
@@ -2357,7 +2359,7 @@
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
-cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova xm2, [cq+16*0]
mova xm4, [cq+16*1]
vinserti128 m2, m2, [cq+16*4], 1
@@ -2391,7 +2393,7 @@
jmp m(iadst_16x4_internal).end
%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x8, 13
+ INV_TXFM_FN %1, %2, %3, 16x8
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
@@ -2401,6 +2403,7 @@
mov r2d, 4
jmp m(inv_txfm_add_dct_dct_16x4).dconly
%elifidn %1_%2, dct_identity
+ WIN64_SPILL_XMM 13
vbroadcasti128 m7, [cq]
vpbroadcastd m0, [o(pw_2896x8)]
vpbroadcastd m1, [o(pw_16384)]
@@ -2474,7 +2477,7 @@
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
-cglobal idct_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 3120
call m(idct_8x16_internal).main
vpbroadcastd m10, [o(pw_16384)]
@@ -2544,7 +2547,7 @@
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity
-cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
vpbroadcastd m10, [o(pw_16384)]
@@ -2608,7 +2611,7 @@
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity
-cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
vpbroadcastd m10, [o(pw_16384)]
@@ -2671,7 +2674,7 @@
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity
-cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
mova xm7, [cq+16*0]
mova xm2, [cq+16*1]
add cq, 16*8
@@ -2728,7 +2731,7 @@
%define o_base pw_5 + 128
%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
- INV_TXFM_FN %1, %2, %3, 16x16, 16
+ INV_TXFM_FN %1, %2, %3, 16x16
%ifidn %1_%2, dct_dct
movd xm1, [o(pw_2896x8)]
pmulhrsw xm0, xm1, [cq]
@@ -2737,6 +2740,7 @@
mov r2d, 8
jmp m(inv_txfm_add_dct_dct_16x4).dconly
%elifidn %1_%2, dct_identity
+ WIN64_SPILL_XMM 7
vpbroadcastd m3, [o(pw_2896x8)]
pmulhrsw m3, [cq]
vpbroadcastd m0, [o(pw_8192)]
@@ -2832,7 +2836,7 @@
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
-cglobal idct_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call .main
.pass1_end:
@@ -2977,7 +2981,7 @@
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst
-cglobal iadst_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call .main
vpbroadcastd m1, [o(pw_8192)]
@@ -3091,7 +3095,7 @@
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst
-cglobal iflipadst_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call m(iadst_16x16_internal).main
vpbroadcastd m1, [o(pw_8192)]
@@ -3163,7 +3167,7 @@
INV_TXFM_16X16_FN identity, dct, 15
INV_TXFM_16X16_FN identity, identity
-cglobal iidentity_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
mova xm0, [cq+16*0]
mova xm15, [cq+16*1]
mova xm1, [cq+16*2]
@@ -3277,7 +3281,7 @@
lea rax, [o_base]
test eobd, eobd
jz .dconly
- PROLOGUE 0, 0, 16, 32*3, dst, stride, c, eob
+ PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
%undef cmp
cmp eobd, 106
jle .fast
@@ -3575,7 +3579,7 @@
jg .dconly_loop
RET
.normal:
- PROLOGUE 0, 0, 16, 32*3, dst, stride, c, eob
+ PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
%undef cmp
LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2
LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3
@@ -3882,11 +3886,11 @@
vextracti128 [r2+%7], m%3, 1
%endmacro
-cglobal inv_txfm_add_dct_dct_16x32, 4, 8, 0, dst, stride, c, eob
+cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
jz .dconly
- PROLOGUE 0, 0, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
+ PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
base, tmp3
%undef cmp
LOAD_16ROWS cq, 64, 1
@@ -4250,7 +4254,7 @@
vinserti128 m%1, m%1, xm%4, 1
%endmacro
-cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 0, dst, stride, c, eob
+cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
lea rax, [o_base]
test eobd, eobd
jnz .normal
@@ -4262,7 +4266,7 @@
mov r2d, 16
jmp m(inv_txfm_add_dct_dct_32x8).dconly
.normal:
- PROLOGUE 0, 0, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
+ PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
vpbroadcastd m15, [o(pw_2896x8)]
pmulhrsw m0, m15, [cq+32* 1]
pmulhrsw m1, m15, [cq+32* 3]
--- a/src/x86/itx_init.c
+++ b/src/x86/itx_init.c
@@ -117,7 +117,7 @@
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
-#if BITDEPTH == 8 && ARCH_X86_64 && !defined(_WIN32) // FIXME: Windows
+#if BITDEPTH == 8 && ARCH_X86_64
assign_itx17_fn( , 4, 4, avx2);
assign_itx16_fn(R, 4, 8, avx2);
assign_itx16_fn(R, 4, 16, avx2);