shithub: dav1d

--- a/src/x86/mc.asm

+++ b/src/x86/mc.asm

@@ -26,7 +26,7 @@

 %include "config.asm"

 %include "ext/x86/x86inc.asm"

-%if ARCH_X86_64 && UNIX64 ; FIXME: Windows

+%if ARCH_X86_64

 SECTION_RODATA 32

@@ -127,7 +127,7 @@

 INIT_XMM avx2

 DECLARE_REG_TMP 4, 6, 7

-cglobal put_bilin, 4, 8, 8, dst, ds, src, ss, w, h, mxy

+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy

     movifnidn          mxyd, r6m ; mx

     lea                  t2, [put_avx2]

     tzcnt                wd, wm

@@ -235,6 +235,7 @@

     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4

     imul               mxyd, 0xff01

     vbroadcasti128       m4, [bilin_h_shuf8]

+    WIN64_SPILL_XMM       7

     add                mxyd, 16 << 8

     movd                xm5, mxyd

     mov                mxyd, r7m ; my

@@ -375,6 +376,8 @@

RET

.v:

     movzx                wd, word [t2+wq*2+table_offset(put, _bilin_v)]

+    %assign stack_offset stack_offset - stack_size_padded

+    WIN64_SPILL_XMM       8

     imul               mxyd, 0xff01

     vpbroadcastd         m7, [pw_2048]

     add                mxyd, 16 << 8

@@ -535,6 +538,8 @@

     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8

     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4

     movzx                wd, word [t2+wq*2+table_offset(put, _bilin_hv)]

+    %assign stack_offset stack_offset - stack_size_padded

+    WIN64_SPILL_XMM       8

     shl                mxyd, 11 ; can't shift by 12 due to signed overflow

     vpbroadcastd         m7, [pw_2048]

     movd                xm6, mxyd

@@ -658,6 +663,9 @@

     pshufb               m1, m4

     pmaddubsw            m0, m5

     pmaddubsw            m1, m5

+%if WIN64

+    movaps              r4m, xmm8

+%endif

 %%loop:

     add                srcq, ssq

     movu                xm2,     [srcq+8*1]

@@ -670,7 +678,6 @@

     paddw                m3, m1

     mova                 m1, m2

     pmulhrsw             m8, m3, m7

-ASSERT UNIX64 ; using an additional vector register here

     movu                xm2,     [srcq+8*0]

     vinserti128          m2, m2, [srcq+8*2], 1

     pshufb               m2, m4

@@ -686,6 +693,9 @@

     add                dstq, dsq

     dec                  hd

     jg %%loop

+%if WIN64

+    movaps             xmm8, r4m

+%endif

 %endmacro

     PUT_BILIN_HV_W32

RET

@@ -719,7 +729,7 @@

RET

 DECLARE_REG_TMP 3, 5, 6

-cglobal prep_bilin, 3, 7, 7, tmp, src, stride, w, h, mxy, stride3

+cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3

     movifnidn          mxyd, r5m ; mx

     lea                  t2, [prep_avx2]

     tzcnt                wd, wm

@@ -1019,6 +1029,7 @@

     jg .h_w128

RET

.v:

+    WIN64_SPILL_XMM       7

     movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]

     imul               mxyd, 0xff01

     add                mxyd, 16 << 8

@@ -1206,6 +1217,8 @@

 .hv:

     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4

     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)

+    %assign stack_offset stack_offset - stack_size_padded

+    WIN64_SPILL_XMM       7

     movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]

     shl                mxyd, 11

     movd                xm6, mxyd

@@ -1408,7 +1421,11 @@

 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15

 %assign FILTER_SHARP   (2*15 << 16) | 3*15

+%if WIN64

+DECLARE_REG_TMP 4, 5

+%else

 DECLARE_REG_TMP 7, 8

+%endif

 %macro PUT_8TAP_FN 3 ; type, type_h, type_v

 cglobal put_8tap_%1

     mov                 t0d, FILTER_%2

@@ -1428,7 +1445,7 @@

 PUT_8TAP_FN sharp,          SHARP,   SHARP

 PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH

-cglobal put_8tap, 4, 9, 16, dst, ds, src, ss, w, h, mx, my, ss3

+cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3

     imul                mxd, mxm, 0x010101

     add                 mxd, t0d ; 8tap_h, mx, 4tap_h

     imul                myd, mym, 0x010101

@@ -1445,11 +1462,15 @@

     add                  wq, r8

     lea                  r6, [ssq*3]

     lea                  r7, [dsq*3]

+%if WIN64

+    pop                  r8

+%endif

     jmp                  wq

.h:

     test                myd, 0xf00

     jnz .hv

     vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)

+    WIN64_SPILL_XMM      11

     cmp                  wd, 4

     jl .h_w2

     vbroadcasti128       m6, [subpel_h_shufA]

@@ -1577,6 +1598,8 @@

     jg .h_loop

RET

.v:

+    %assign stack_offset stack_offset - stack_size_padded

+    WIN64_SPILL_XMM      16

     movzx               mxd, myb

     shr                 myd, 16

     cmp                  hd, 4

@@ -1791,6 +1814,8 @@

     jg .v_w16_loop0

RET

 .hv:

+    %assign stack_offset stack_offset - stack_size_padded

+    WIN64_SPILL_XMM      16

     cmp                  wd, 4

     jg .hv_w8

     movzx               mxd, mxb

@@ -2058,7 +2083,11 @@

     jg .hv_w8_loop0

RET

+%if WIN64

+DECLARE_REG_TMP 6, 4

+%else

 DECLARE_REG_TMP 6, 7

+%endif

 %macro PREP_8TAP_FN 3 ; type, type_h, type_v

 cglobal prep_8tap_%1

     mov                 t0d, FILTER_%2

@@ -2078,7 +2107,7 @@

 PREP_8TAP_FN sharp,          SHARP,   SHARP

 PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH

-cglobal prep_8tap, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3

+cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3

     imul                mxd, mxm, 0x010101

     add                 mxd, t0d ; 8tap_h, mx, 4tap_h

     imul                myd, mym, 0x010101

@@ -2094,6 +2123,9 @@

     movzx                wd, word [r7+wq*2+table_offset(prep,)]

     add                  wq, r7

     lea                  r6, [strideq*3]

+%if WIN64

+    pop                  r7

+%endif

     jmp                  wq

.h:

     test                myd, 0xf00

@@ -2100,6 +2132,7 @@

     jnz .hv

     vbroadcasti128       m5, [subpel_h_shufA]

     vpbroadcastd         m4, [pw_8192]

+    WIN64_SPILL_XMM      10

     cmp                  wd, 4

     je .h_w4

     tzcnt                wd, wd

@@ -2202,6 +2235,8 @@

     jg .h_loop

RET

.v:

+    %assign stack_offset stack_offset - stack_size_padded

+    WIN64_SPILL_XMM      16

     movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.

     shr                 myd, 16  ; Note that the code is 8-tap only, having

     cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4

@@ -2384,6 +2419,8 @@

     jg .v_w16_loop0

RET

 .hv:

+    %assign stack_offset stack_offset - stack_size_padded

+    WIN64_SPILL_XMM      16

     cmp                  wd, 4

     jg .hv_w8

     movzx               mxd, mxb

--- a/src/x86/mc_init.c

+++ b/src/x86/mc_init.c

@@ -64,7 +64,7 @@

     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

-#if BITDEPTH == 8 && ARCH_X86_64 && !defined(_WIN32) // FIXME: Windows

+#if BITDEPTH == 8 && ARCH_X86_64

     init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);

     init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);

     init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);