shithub: dav1d

Download patch

ref: aa0fc4718bca67f521ac5ef21b29719764182cf1
parent: 2df874896225540b91bb31838845c80e2f854845
author: Henrik Gramner <[email protected]>
date: Wed Oct 3 16:08:28 EDT 2018

x86: Enable ITX AVX2 asm on 64-bit Windows

--- a/src/x86/itx.asm
+++ b/src/x86/itx.asm
@@ -26,7 +26,7 @@
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
-%if ARCH_X86_64 && UNIX64 ; Fixme: Windows
+%if ARCH_X86_64
 
 SECTION_RODATA 32
 
@@ -117,10 +117,6 @@
 
 SECTION .text
 
-; Callee-saved registers has to be explicitly handled when jumping around
-; different functions since RET can't automatically deal with it.
-ASSERT ARCH_X86_64 && WIN64 == 0
-
 ; Code size reduction trickery: Intead of using rip-relative loads with
 ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
 ; single rip-relative lea and then address things relative from that with
@@ -373,8 +369,8 @@
     vpblendd             m0, m0, m2, 0x03
     ITX4_END              3, 0, 2, 1, 0
 
-%macro INV_TXFM_FN 5 ; type1, type2, fast_thresh, size, num_mmregs
-cglobal inv_txfm_add_%1_%2_%4, 4, 5, %5, dst, stride, c, eob, tx2
+%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
+cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, c, eob, tx2
     %undef cmp
     %define %%p1 m(i%1_%4_internal)
     lea                 rax, [o_base]
@@ -396,7 +392,7 @@
 %endmacro
 
 %macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x4, 6
+    INV_TXFM_FN          %1, %2, %3, 4x4
 %ifidn %1_%2, dct_identity
     vpbroadcastd         m0, [o(pw_2896x8)]
     pmulhrsw             m0, [cq]
@@ -500,7 +496,7 @@
 INV_TXFM_4X4_FN dct, flipadst, 0
 INV_TXFM_4X4_FN dct, identity, 3
 
-cglobal idct_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     mova                 m0, [cq+16*0]
     mova                 m1, [cq+16*1]
     IDCT4_1D_PACKED
@@ -522,7 +518,7 @@
 INV_TXFM_4X4_FN adst, flipadst, 0
 INV_TXFM_4X4_FN adst, identity
 
-cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     mova                 m0, [cq+16*0]
     mova                 m1, [cq+16*1]
     call .main
@@ -550,7 +546,7 @@
 INV_TXFM_4X4_FN flipadst, flipadst, 0
 INV_TXFM_4X4_FN flipadst, identity
 
-cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     mova                 m0, [cq+16*0]
     mova                 m1, [cq+16*1]
     call m(iadst_4x4_internal).main
@@ -574,7 +570,7 @@
 INV_TXFM_4X4_FN identity, flipadst
 INV_TXFM_4X4_FN identity, identity
 
-cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     mova                 m0, [cq+16*0]
     mova                 m1, [cq+16*1]
     vpbroadcastd         m2, [o(pw_5793x4)]
@@ -621,7 +617,7 @@
 %endmacro
 
 %macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x8, 7
+    INV_TXFM_FN          %1, %2, %3, 4x8
 %if %3 >= 0
 %ifidn %1_%2, dct_identity
     vpbroadcastd        xm0, [o(pw_2896x8)]
@@ -753,7 +749,7 @@
 INV_TXFM_4X8_FN dct, adst
 INV_TXFM_4X8_FN dct, flipadst
 
-cglobal idct_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q3120
     vpermq               m1, [cq+32*1], q3120
     vpbroadcastd         m5, [o(pw_2896x8)]
@@ -785,7 +781,7 @@
 INV_TXFM_4X8_FN adst, flipadst
 INV_TXFM_4X8_FN adst, identity
 
-cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q3120
     vpermq               m1, [cq+32*1], q3120
     vpbroadcastd         m2, [o(pw_2896x8)]
@@ -813,6 +809,7 @@
 .end2:
     pmulhrsw             m0, m4
     pmulhrsw             m1, m4
+    WIN64_RESTORE_XMM
 .end3:
     pxor                 m2, m2
     mova          [cq+32*0], m2
@@ -832,7 +829,7 @@
 INV_TXFM_4X8_FN flipadst, flipadst
 INV_TXFM_4X8_FN flipadst, identity
 
-cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q3120
     vpermq               m1, [cq+32*1], q3120
     vpbroadcastd         m2, [o(pw_2896x8)]
@@ -864,7 +861,7 @@
 INV_TXFM_4X8_FN identity, flipadst
 INV_TXFM_4X8_FN identity, identity
 
-cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m2, [cq+32*0], q3120
     vpermq               m0, [cq+32*1], q3120
     vpbroadcastd         m3, [o(pw_2896x8)]
@@ -885,7 +882,7 @@
     jmp m(iadst_4x8_internal).end2
 
 %macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x16, 11
+    INV_TXFM_FN          %1, %2, %3, 4x16
 %if %3 >= 0
 %ifidn %1_%2, dct_identity
     vpbroadcastd         m0, [o(pw_2896x8)]
@@ -1040,7 +1037,7 @@
 INV_TXFM_4X16_FN dct, adst
 INV_TXFM_4X16_FN dct, flipadst
 
-cglobal idct_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                 m0, [cq+32*0]
     mova                 m1, [cq+32*1]
     mova                 m2, [cq+32*2]
@@ -1081,7 +1078,7 @@
 INV_TXFM_4X16_FN adst, flipadst
 INV_TXFM_4X16_FN adst, identity
 
-cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                 m0, [cq+32*0]
     mova                 m1, [cq+32*1]
     mova                 m2, [cq+32*2]
@@ -1115,6 +1112,7 @@
     vpblendd             m5, m5, m6, 0xcc
 .end2:
     REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
+    WIN64_RESTORE_XMM
 .end3:
     pxor                 m4, m4
     mova          [cq+32*0], m4
@@ -1195,7 +1193,7 @@
 INV_TXFM_4X16_FN flipadst, flipadst
 INV_TXFM_4X16_FN flipadst, identity
 
-cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                 m0, [cq+32*0]
     mova                 m1, [cq+32*1]
     mova                 m2, [cq+32*2]
@@ -1232,7 +1230,7 @@
 INV_TXFM_4X16_FN identity, flipadst
 INV_TXFM_4X16_FN identity, identity
 
-cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                 m3, [cq+32*0]
     mova                 m2, [cq+32*1]
     mova                 m4, [cq+32*2]
@@ -1284,7 +1282,7 @@
 %endmacro
 
 %macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x4, 7
+    INV_TXFM_FN          %1, %2, %3, 8x4
 %if %3 >= 0
 %ifidn %1_%2, dct_identity
     vpbroadcastd        xm0, [o(pw_2896x8)]
@@ -1347,7 +1345,7 @@
 INV_TXFM_8X4_FN dct, flipadst, 0
 INV_TXFM_8X4_FN dct, identity, 3
 
-cglobal idct_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpbroadcastd        xm3, [o(pw_2896x8)]
     pmulhrsw            xm0, xm3, [cq+16*0]
     pmulhrsw            xm1, xm3, [cq+16*1]
@@ -1373,7 +1371,7 @@
 INV_TXFM_8X4_FN adst, flipadst
 INV_TXFM_8X4_FN adst, identity
 
-cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpbroadcastd        xm0, [o(pw_2896x8)]
     pshufd              xm4,      [cq+16*0], q1032
     pmulhrsw            xm3, xm0, [cq+16*3]
@@ -1401,6 +1399,7 @@
     vpbroadcastd         m2, [o(pw_2048)]
     pmulhrsw             m0, m2
     pmulhrsw             m1, m2
+    WIN64_RESTORE_XMM
 .end3:
     pxor                 m2, m2
     mova          [cq+32*0], m2
@@ -1418,7 +1417,7 @@
 INV_TXFM_8X4_FN flipadst, flipadst
 INV_TXFM_8X4_FN flipadst, identity
 
-cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpbroadcastd        xm0, [o(pw_2896x8)]
     pshufd              xm4,      [cq+16*0], q1032
     pmulhrsw            xm3, xm0, [cq+16*3]
@@ -1448,7 +1447,7 @@
 INV_TXFM_8X4_FN identity, flipadst
 INV_TXFM_8X4_FN identity, identity
 
-cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     mova                xm2,     [cq+16*0]
     mova                xm0,     [cq+16*1]
     vinserti128          m2, m2, [cq+16*2], 1
@@ -1472,7 +1471,7 @@
     jmp m(iadst_8x4_internal).end
 
 %macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x8, 7
+    INV_TXFM_FN          %1, %2, %3, 8x8
 %ifidn %1_%2, dct_identity
     vpbroadcastd        xm0, [o(pw_2896x8)]
     pmulhrsw            xm0, [cq]
@@ -1537,7 +1536,7 @@
 INV_TXFM_8X8_FN dct, adst
 INV_TXFM_8X8_FN dct, flipadst
 
-cglobal idct_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q3120 ; 0 1
     vpermq               m3, [cq+32*3], q3120 ; 6 7
     vpermq               m2, [cq+32*2], q3120 ; 4 5
@@ -1574,7 +1573,7 @@
 INV_TXFM_8X8_FN adst, flipadst
 INV_TXFM_8X8_FN adst, identity
 
-cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m4, [cq+32*0], q1302 ; 1 0
     vpermq               m3, [cq+32*3], q3120 ; 6 7
     vpermq               m5, [cq+32*1], q1302 ; 3 2
@@ -1615,6 +1614,7 @@
 .end3:
     pmulhrsw             m2, m4
     pmulhrsw             m3, m4
+    WIN64_RESTORE_XMM
 .end4:
     pxor                 m4, m4
     mova          [cq+32*0], m4
@@ -1636,7 +1636,7 @@
 INV_TXFM_8X8_FN flipadst, flipadst
 INV_TXFM_8X8_FN flipadst, identity
 
-cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m4, [cq+32*0], q1302 ; 1 0
     vpermq               m3, [cq+32*3], q3120 ; 6 7
     vpermq               m5, [cq+32*1], q1302 ; 3 2
@@ -1682,7 +1682,7 @@
 INV_TXFM_8X8_FN identity, flipadst
 INV_TXFM_8X8_FN identity, identity
 
-cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     mova                xm3,     [cq+16*0]
     mova                xm2,     [cq+16*1]
     vinserti128          m3, m3, [cq+16*4], 1
@@ -1705,7 +1705,7 @@
     jmp m(iadst_8x8_internal).end
 
 %macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x16, 13
+    INV_TXFM_FN          %1, %2, %3, 8x16
 %ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
@@ -1720,6 +1720,7 @@
     mov                 r2d, 4
     jmp m(inv_txfm_add_dct_dct_8x8).end2
 %elifidn %1_%2, dct_identity
+    WIN64_SPILL_XMM      13
     vpbroadcastd         m0, [o(pw_2896x8)]
     pmulhrsw             m7, m0, [cq]
     vpbroadcastd         m1, [o(pw_16384)]
@@ -1798,7 +1799,7 @@
 INV_TXFM_8X16_FN dct, adst
 INV_TXFM_8X16_FN dct, flipadst
 
-cglobal idct_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_8X16_LOAD_COEFS
     call m(idct_16x8_internal).main
     vpbroadcastd        m10, [o(pw_16384)]
@@ -1861,7 +1862,7 @@
 INV_TXFM_8X16_FN adst, flipadst
 INV_TXFM_8X16_FN adst, identity
 
-cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_8X16_LOAD_COEFS
     call m(iadst_16x8_internal).main
     vpbroadcastd        m10, [o(pw_16384)]
@@ -1966,7 +1967,7 @@
 INV_TXFM_8X16_FN flipadst, flipadst
 INV_TXFM_8X16_FN flipadst, identity
 
-cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_8X16_LOAD_COEFS
     call m(iadst_16x8_internal).main
     vpbroadcastd         m9, [o(pw_16384)]
@@ -2013,7 +2014,7 @@
 INV_TXFM_8X16_FN identity, flipadst
 INV_TXFM_8X16_FN identity, identity
 
-cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     mova                xm3,     [cq+16*0]
     mova                xm2,     [cq+16*2]
     add                  cq, 16*8
@@ -2077,7 +2078,7 @@
 %endmacro
 
 %macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x4, 11
+    INV_TXFM_FN          %1, %2, %3, 16x4
 %if %3 >= 0
 %ifidn %1_%2, dct_identity
     vpbroadcastd        xm3, [o(pw_2896x8)]
@@ -2188,7 +2189,7 @@
 INV_TXFM_16X4_FN dct, flipadst, 0
 INV_TXFM_16X4_FN dct, identity, 3
 
-cglobal idct_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                xm0, [cq+16*0]
     mova                xm1, [cq+16*1]
     mova                xm2, [cq+16*2]
@@ -2223,7 +2224,7 @@
 INV_TXFM_16X4_FN adst, flipadst
 INV_TXFM_16X4_FN adst, identity
 
-cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q1230
     vpermq               m3, [cq+32*3], q2103
     vpermq               m1, [cq+32*1], q1230
@@ -2259,6 +2260,7 @@
 .end:
     vpbroadcastd         m4, [o(pw_2048)]
     REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
+    WIN64_RESTORE_XMM
 .end2:
     pxor                 m4, m4
     mova          [cq+32*0], m4
@@ -2319,7 +2321,7 @@
 INV_TXFM_16X4_FN flipadst, flipadst
 INV_TXFM_16X4_FN flipadst, identity
 
-cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q1230
     vpermq               m3, [cq+32*3], q2103
     vpermq               m1, [cq+32*1], q1230
@@ -2357,7 +2359,7 @@
 INV_TXFM_16X4_FN identity, flipadst
 INV_TXFM_16X4_FN identity, identity
 
-cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                xm2,     [cq+16*0]
     mova                xm4,     [cq+16*1]
     vinserti128          m2, m2, [cq+16*4], 1
@@ -2391,7 +2393,7 @@
     jmp m(iadst_16x4_internal).end
 
 %macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x8, 13
+    INV_TXFM_FN          %1, %2, %3, 16x8
 %ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
@@ -2401,6 +2403,7 @@
     mov                 r2d, 4
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
 %elifidn %1_%2, dct_identity
+    WIN64_SPILL_XMM      13
     vbroadcasti128       m7, [cq]
     vpbroadcastd         m0, [o(pw_2896x8)]
     vpbroadcastd         m1, [o(pw_16384)]
@@ -2474,7 +2477,7 @@
 INV_TXFM_16X8_FN dct, adst
 INV_TXFM_16X8_FN dct, flipadst
 
-cglobal idct_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_16X8_LOAD_COEFS 3120
     call m(idct_8x16_internal).main
     vpbroadcastd        m10, [o(pw_16384)]
@@ -2544,7 +2547,7 @@
 INV_TXFM_16X8_FN adst, flipadst
 INV_TXFM_16X8_FN adst, identity
 
-cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_16X8_LOAD_COEFS 1302
     call m(iadst_8x16_internal).main2
     vpbroadcastd        m10, [o(pw_16384)]
@@ -2608,7 +2611,7 @@
 INV_TXFM_16X8_FN flipadst, flipadst
 INV_TXFM_16X8_FN flipadst, identity
 
-cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_16X8_LOAD_COEFS 1302
     call m(iadst_8x16_internal).main2
     vpbroadcastd        m10, [o(pw_16384)]
@@ -2671,7 +2674,7 @@
 INV_TXFM_16X8_FN identity, flipadst
 INV_TXFM_16X8_FN identity, identity
 
-cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, c, eob, tx2
+cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     mova                xm7,     [cq+16*0]
     mova                xm2,     [cq+16*1]
     add                  cq, 16*8
@@ -2728,7 +2731,7 @@
 %define o_base pw_5 + 128
 
 %macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x16, 16
+    INV_TXFM_FN          %1, %2, %3, 16x16
 %ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
@@ -2737,6 +2740,7 @@
     mov                 r2d, 8
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
 %elifidn %1_%2, dct_identity
+    WIN64_SPILL_XMM       7
     vpbroadcastd         m3, [o(pw_2896x8)]
     pmulhrsw             m3, [cq]
     vpbroadcastd         m0, [o(pw_8192)]
@@ -2832,7 +2836,7 @@
 INV_TXFM_16X16_FN dct, adst
 INV_TXFM_16X16_FN dct, flipadst
 
-cglobal idct_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     ITX_16X16_LOAD_COEFS
     call .main
 .pass1_end:
@@ -2977,7 +2981,7 @@
 INV_TXFM_16X16_FN adst, adst
 INV_TXFM_16X16_FN adst, flipadst
 
-cglobal iadst_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     ITX_16X16_LOAD_COEFS
     call .main
     vpbroadcastd         m1, [o(pw_8192)]
@@ -3091,7 +3095,7 @@
 INV_TXFM_16X16_FN flipadst, adst
 INV_TXFM_16X16_FN flipadst, flipadst
 
-cglobal iflipadst_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     ITX_16X16_LOAD_COEFS
     call m(iadst_16x16_internal).main
     vpbroadcastd         m1, [o(pw_8192)]
@@ -3163,7 +3167,7 @@
 INV_TXFM_16X16_FN identity, dct,      15
 INV_TXFM_16X16_FN identity, identity
 
-cglobal iidentity_16x16_internal, 0, 0, 0, 32*3, dst, stride, c, eob, tx2
+cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     mova                xm0,      [cq+16*0]
     mova               xm15,      [cq+16*1]
     mova                xm1,      [cq+16*2]
@@ -3277,7 +3281,7 @@
     lea                 rax, [o_base]
     test               eobd, eobd
     jz .dconly
-    PROLOGUE              0, 0, 16, 32*3, dst, stride, c, eob
+    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
     %undef cmp
     cmp                eobd, 106
     jle .fast
@@ -3575,7 +3579,7 @@
     jg .dconly_loop
     RET
 .normal:
-    PROLOGUE              0, 0, 16, 32*3, dst, stride, c, eob
+    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
     %undef cmp
     LOAD_PACKED_16X2      0,  7,  0,  2 ; in0  in2
     LOAD_PACKED_16X2      4,  7,  1,  3 ; in1  in3
@@ -3882,11 +3886,11 @@
     vextracti128    [r2+%7], m%3, 1
 %endmacro
 
-cglobal inv_txfm_add_dct_dct_16x32, 4, 8, 0, dst, stride, c, eob
+cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob
     lea                 rax, [o_base]
     test               eobd, eobd
     jz .dconly
-    PROLOGUE              0, 0, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
+    PROLOGUE              0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
                                            base, tmp3
     %undef cmp
     LOAD_16ROWS          cq, 64, 1
@@ -4250,7 +4254,7 @@
     vinserti128         m%1, m%1, xm%4, 1
 %endmacro
 
-cglobal inv_txfm_add_dct_dct_32x16, 4, 6, 0, dst, stride, c, eob
+cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
     lea                 rax, [o_base]
     test               eobd, eobd
     jnz .normal
@@ -4262,7 +4266,7 @@
     mov                 r2d, 16
     jmp m(inv_txfm_add_dct_dct_32x8).dconly
 .normal:
-    PROLOGUE              0, 0, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
+    PROLOGUE              0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
     vpbroadcastd        m15, [o(pw_2896x8)]
     pmulhrsw             m0, m15, [cq+32* 1]
     pmulhrsw             m1, m15, [cq+32* 3]
--- a/src/x86/itx_init.c
+++ b/src/x86/itx_init.c
@@ -117,7 +117,7 @@
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
-#if BITDEPTH == 8 && ARCH_X86_64 && !defined(_WIN32) // FIXME: Windows
+#if BITDEPTH == 8 && ARCH_X86_64
     assign_itx17_fn( ,  4,  4, avx2);
     assign_itx16_fn(R,  4,  8, avx2);
     assign_itx16_fn(R,  4, 16, avx2);