shithub: dav1d

Download patch

ref: deab25342b5361df0d5aa2f8f9d38301c0146ab3
parent: 7ddd76f77e6b016f374fc3284fd4468a27969a23
author: Henrik Gramner <[email protected]>
date: Fri Sep 28 15:40:03 EDT 2018

x86: Enable MC AVX2 asm on 64-bit Windows

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -26,7 +26,7 @@
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
-%if ARCH_X86_64 && UNIX64 ; FIXME: Windows
+%if ARCH_X86_64
 
 SECTION_RODATA 32
 
@@ -127,7 +127,7 @@
 
 INIT_XMM avx2
 DECLARE_REG_TMP 4, 6, 7
-cglobal put_bilin, 4, 8, 8, dst, ds, src, ss, w, h, mxy
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
     movifnidn          mxyd, r6m ; mx
     lea                  t2, [put_avx2]
     tzcnt                wd, wm
@@ -235,6 +235,7 @@
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
     imul               mxyd, 0xff01
     vbroadcasti128       m4, [bilin_h_shuf8]
+    WIN64_SPILL_XMM       7
     add                mxyd, 16 << 8
     movd                xm5, mxyd
     mov                mxyd, r7m ; my
@@ -375,6 +376,8 @@
     RET
 .v:
     movzx                wd, word [t2+wq*2+table_offset(put, _bilin_v)]
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       8
     imul               mxyd, 0xff01
     vpbroadcastd         m7, [pw_2048]
     add                mxyd, 16 << 8
@@ -535,6 +538,8 @@
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
     movzx                wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       8
     shl                mxyd, 11 ; can't shift by 12 due to signed overflow
     vpbroadcastd         m7, [pw_2048]
     movd                xm6, mxyd
@@ -658,6 +663,9 @@
     pshufb               m1, m4
     pmaddubsw            m0, m5
     pmaddubsw            m1, m5
+%if WIN64
+    movaps              r4m, xmm8
+%endif
 %%loop:
     add                srcq, ssq
     movu                xm2,     [srcq+8*1]
@@ -670,7 +678,6 @@
     paddw                m3, m1
     mova                 m1, m2
     pmulhrsw             m8, m3, m7
-ASSERT UNIX64 ; using an additional vector register here
     movu                xm2,     [srcq+8*0]
     vinserti128          m2, m2, [srcq+8*2], 1
     pshufb               m2, m4
@@ -686,6 +693,9 @@
     add                dstq, dsq
     dec                  hd
     jg %%loop
+%if WIN64
+    movaps             xmm8, r4m
+%endif
 %endmacro
     PUT_BILIN_HV_W32
     RET
@@ -719,7 +729,7 @@
     RET
 
 DECLARE_REG_TMP 3, 5, 6
-cglobal prep_bilin, 3, 7, 7, tmp, src, stride, w, h, mxy, stride3
+cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movifnidn          mxyd, r5m ; mx
     lea                  t2, [prep_avx2]
     tzcnt                wd, wm
@@ -1019,6 +1029,7 @@
     jg .h_w128
     RET
 .v:
+    WIN64_SPILL_XMM       7
     movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
     imul               mxyd, 0xff01
     add                mxyd, 16 << 8
@@ -1206,6 +1217,8 @@
 .hv:
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       7
     movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
     shl                mxyd, 11
     movd                xm6, mxyd
@@ -1408,7 +1421,11 @@
 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
 %assign FILTER_SHARP   (2*15 << 16) | 3*15
 
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
 DECLARE_REG_TMP 7, 8
+%endif
 %macro PUT_8TAP_FN 3 ; type, type_h, type_v
 cglobal put_8tap_%1
     mov                 t0d, FILTER_%2
@@ -1428,7 +1445,7 @@
 PUT_8TAP_FN sharp,          SHARP,   SHARP
 PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
 
-cglobal put_8tap, 4, 9, 16, dst, ds, src, ss, w, h, mx, my, ss3
+cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
     imul                mxd, mxm, 0x010101
     add                 mxd, t0d ; 8tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
@@ -1445,11 +1462,15 @@
     add                  wq, r8
     lea                  r6, [ssq*3]
     lea                  r7, [dsq*3]
+%if WIN64
+    pop                  r8
+%endif
     jmp                  wq
 .h:
     test                myd, 0xf00
     jnz .hv
     vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
+    WIN64_SPILL_XMM      11
     cmp                  wd, 4
     jl .h_w2
     vbroadcasti128       m6, [subpel_h_shufA]
@@ -1577,6 +1598,8 @@
     jg .h_loop
     RET
 .v:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM      16
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 4
@@ -1791,6 +1814,8 @@
     jg .v_w16_loop0
     RET
 .hv:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM      16
     cmp                  wd, 4
     jg .hv_w8
     movzx               mxd, mxb
@@ -2058,7 +2083,11 @@
     jg .hv_w8_loop0
     RET
 
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
 DECLARE_REG_TMP 6, 7
+%endif
 %macro PREP_8TAP_FN 3 ; type, type_h, type_v
 cglobal prep_8tap_%1
     mov                 t0d, FILTER_%2
@@ -2078,7 +2107,7 @@
 PREP_8TAP_FN sharp,          SHARP,   SHARP
 PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
 
-cglobal prep_8tap, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
+cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
     imul                mxd, mxm, 0x010101
     add                 mxd, t0d ; 8tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
@@ -2094,6 +2123,9 @@
     movzx                wd, word [r7+wq*2+table_offset(prep,)]
     add                  wq, r7
     lea                  r6, [strideq*3]
+%if WIN64
+    pop                  r7
+%endif
     jmp                  wq
 .h:
     test                myd, 0xf00
@@ -2100,6 +2132,7 @@
     jnz .hv
     vbroadcasti128       m5, [subpel_h_shufA]
     vpbroadcastd         m4, [pw_8192]
+    WIN64_SPILL_XMM      10
     cmp                  wd, 4
     je .h_w4
     tzcnt                wd, wd
@@ -2202,6 +2235,8 @@
     jg .h_loop
     RET
 .v:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM      16
     movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
     shr                 myd, 16  ; Note that the code is 8-tap only, having
     cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
@@ -2384,6 +2419,8 @@
     jg .v_w16_loop0
     RET
 .hv:
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM      16
     cmp                  wd, 4
     jg .hv_w8
     movzx               mxd, mxb
--- a/src/x86/mc_init.c
+++ b/src/x86/mc_init.c
@@ -64,7 +64,7 @@
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
-#if BITDEPTH == 8 && ARCH_X86_64 && !defined(_WIN32) // FIXME: Windows
+#if BITDEPTH == 8 && ARCH_X86_64
     init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
     init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
     init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);