shithub: dav1d

Download patch

ref: 9c29f229c5aa7d2d9564d44e8932011f23ac4e77
parent: 361a3c8ee2d03f87f42a76213ee0f93e49fa9ec3
author: Henrik Gramner <[email protected]>
date: Fri Jan 24 15:34:18 EST 2020

checkasm: Increase buffer alignment to 64-byte on x86-64

Required for AVX-512.

--- a/include/common/attributes.h
+++ b/include/common/attributes.h
@@ -43,15 +43,18 @@
 #endif
 
 #if ARCH_X86_64
-/* x86-64 needs 32-byte alignment for AVX2. */
+/* x86-64 needs 32-byte alignment for AVX2 and 64-byte for AVX-512. */
+#define ALIGN_64_VAL 64
 #define ALIGN_32_VAL 32
 #define ALIGN_16_VAL 16
 #elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
 /* ARM doesn't benefit from anything more than 16-byte alignment. */
+#define ALIGN_64_VAL 16
 #define ALIGN_32_VAL 16
 #define ALIGN_16_VAL 16
 #else
 /* No need for extra alignment on platforms without assembly. */
+#define ALIGN_64_VAL 8
 #define ALIGN_32_VAL 8
 #define ALIGN_16_VAL 8
 #endif
@@ -76,9 +79,10 @@
  * becomes:
  * ALIGN_STK_$align(uint8_t, var, 1, [2][3][4])
  */
+#define ALIGN_STK_64(type, var, sz1d, sznd) \
+    ALIGN(type var[sz1d]sznd, ALIGN_64_VAL)
 #define ALIGN_STK_32(type, var, sz1d, sznd) \
     ALIGN(type var[sz1d]sznd, ALIGN_32_VAL)
-// as long as stack is itself 16-byte aligned, this works (win64, gcc)
 #define ALIGN_STK_16(type, var, sz1d, sznd) \
     ALIGN(type var[sz1d]sznd, ALIGN_16_VAL)
 
--- a/tests/checkasm/cdef.c
+++ b/tests/checkasm/cdef.c
@@ -45,9 +45,9 @@
 }
 
 static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
-    ALIGN_STK_32(pixel, c_src, 10 * 16 + 8, ), *const c_dst = c_src + 8;
-    ALIGN_STK_32(pixel, a_src, 10 * 16 + 8, ), *const a_dst = a_src + 8;
-    ALIGN_STK_32(pixel, top, 16 * 2 + 8, );
+    ALIGN_STK_64(pixel, c_src, 10 * 16 + 8, ), *const c_dst = c_src + 8;
+    ALIGN_STK_64(pixel, a_src, 10 * 16 + 8, ), *const a_dst = a_src + 8;
+    ALIGN_STK_64(pixel, top, 16 * 2 + 8, );
     pixel left[8][2];
     pixel *const top_ptrs[2] = { top + 8, top + 24 };
     const ptrdiff_t stride = 16 * sizeof(pixel);
@@ -103,7 +103,7 @@
 }
 
 static void check_cdef_direction(const cdef_dir_fn fn) {
-    ALIGN_STK_32(pixel, src, 8 * 8,);
+    ALIGN_STK_64(pixel, src, 8 * 8,);
 
     declare_func(int, pixel *src, ptrdiff_t dst_stride, unsigned *var
                  HIGHBD_DECL_SUFFIX);
--- a/tests/checkasm/filmgrain.c
+++ b/tests/checkasm/filmgrain.c
@@ -137,9 +137,9 @@
 }
 
 static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
-    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
-    ALIGN_STK_32(pixel, src, 128 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, src, 128 * 32,);
     const ptrdiff_t stride = 128 * sizeof(pixel);
 
     declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
@@ -207,10 +207,10 @@
 }
 
 static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
-    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
-    ALIGN_STK_32(pixel, src, 128 * 32,);
-    ALIGN_STK_32(pixel, luma_src, 128 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, src, 128 * 32,);
+    ALIGN_STK_64(pixel, luma_src, 128 * 32,);
     const ptrdiff_t lstride = 128 * sizeof(pixel);
 
     declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
--- a/tests/checkasm/ipred.c
+++ b/tests/checkasm/ipred.c
@@ -66,9 +66,9 @@
 };
 
 static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 64 * 64,);
-    ALIGN_STK_32(pixel, a_dst, 64 * 64,);
-    ALIGN_STK_32(pixel, topleft_buf, 257,);
+    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+    ALIGN_STK_64(pixel, topleft_buf, 257,);
     pixel *const topleft = topleft_buf + 128;
 
     declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
@@ -132,9 +132,9 @@
 }
 
 static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
-    ALIGN_STK_32(int16_t, c_dst, 32 * 32,);
-    ALIGN_STK_32(int16_t, a_dst, 32 * 32,);
-    ALIGN_STK_32(pixel, luma, 32 * 32,);
+    ALIGN_STK_64(int16_t, c_dst, 32 * 32,);
+    ALIGN_STK_64(int16_t, a_dst, 32 * 32,);
+    ALIGN_STK_64(pixel, luma, 32 * 32,);
 
     declare_func(void, int16_t *ac, const pixel *y, ptrdiff_t stride,
                  int w_pad, int h_pad, int cw, int ch);
@@ -175,10 +175,10 @@
 }
 
 static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 32 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 32 * 32,);
-    ALIGN_STK_32(int16_t, ac, 32 * 32,);
-    ALIGN_STK_32(pixel, topleft_buf, 257,);
+    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+    ALIGN_STK_64(int16_t, ac, 32 * 32,);
+    ALIGN_STK_64(pixel, topleft_buf, 257,);
     pixel *const topleft = topleft_buf + 128;
 
     declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *topleft,
@@ -227,9 +227,9 @@
 }
 
 static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 64 * 64,);
-    ALIGN_STK_32(pixel, a_dst, 64 * 64,);
-    ALIGN_STK_32(uint8_t, idx, 64 * 64,);
+    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+    ALIGN_STK_64(uint8_t, idx, 64 * 64,);
     ALIGN_STK_16(uint16_t, pal, 8,);
 
     declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal,
--- a/tests/checkasm/itx.c
+++ b/tests/checkasm/itx.c
@@ -226,9 +226,9 @@
     Dav1dInvTxfmDSPContext c;
     bitfn(dav1d_itx_dsp_init)(&c);
 
-    ALIGN_STK_32(coef, coeff, 2, [32 * 32]);
-    ALIGN_STK_32(pixel, c_dst, 64 * 64,);
-    ALIGN_STK_32(pixel, a_dst, 64 * 64,);
+    ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
+    ALIGN_STK_64(pixel, c_dst, 64 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 64 * 64,);
 
     static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
         TX_4X4,   RTX_4X8,  RTX_4X16,
--- a/tests/checkasm/loopfilter.c
+++ b/tests/checkasm/loopfilter.c
@@ -95,8 +95,8 @@
                          const int n_blks, const int lf_idx,
                          const int is_chroma, const int dir)
 {
-    ALIGN_STK_32(pixel, c_dst_mem, 128 * 16,);
-    ALIGN_STK_32(pixel, a_dst_mem, 128 * 16,);
+    ALIGN_STK_64(pixel, c_dst_mem, 128 * 16,);
+    ALIGN_STK_64(pixel, a_dst_mem, 128 * 16,);
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const uint32_t *mask,
                  const uint8_t (*l)[4], ptrdiff_t b4_stride,
--- a/tests/checkasm/looprestoration.c
+++ b/tests/checkasm/looprestoration.c
@@ -44,9 +44,9 @@
 }
 
 static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, a_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+    ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, h_edge, 448 * 8,);
     pixel left[64][4];
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride,
@@ -116,9 +116,9 @@
 }
 
 static void check_sgr(Dav1dLoopRestorationDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, a_dst, 448 * 64,);
-    ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+    ALIGN_STK_64(pixel, c_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, a_dst, 448 * 64,);
+    ALIGN_STK_64(pixel, h_edge, 448 * 8,);
     pixel left[64][4];
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride,
--- a/tests/checkasm/mc.c
+++ b/tests/checkasm/mc.c
@@ -55,9 +55,9 @@
 }
 
 static void check_mc(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 135 * 135,);
-    ALIGN_STK_32(pixel, c_dst,   128 * 128,);
-    ALIGN_STK_32(pixel, a_dst,   128 * 128,);
+    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+    ALIGN_STK_64(pixel, c_dst,   128 * 128,);
+    ALIGN_STK_64(pixel, a_dst,   128 * 128,);
     const pixel *src = src_buf + 135 * 3 + 3;
     const ptrdiff_t src_stride = 135 * sizeof(pixel);
 
@@ -118,9 +118,9 @@
 }
 
 static void check_mct(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 135 * 135,);
-    ALIGN_STK_32(int16_t, c_tmp, 128 * 128,);
-    ALIGN_STK_32(int16_t, a_tmp, 128 * 128,);
+    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+    ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
+    ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
     const pixel *src = src_buf + 135 * 3 + 3;
     const ptrdiff_t src_stride = 135 * sizeof(pixel);
 
@@ -173,9 +173,9 @@
 }
 
 static void check_avg(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                  const int16_t *tmp2, int w, int h HIGHBD_DECL_SUFFIX);
@@ -204,9 +204,9 @@
 }
 
 static void check_w_avg(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel, c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel, c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 128,);
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                  const int16_t *tmp2, int w, int h, int weight HIGHBD_DECL_SUFFIX);
@@ -236,10 +236,10 @@
 }
 
 static void check_mask(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel,   c_dst, 135 * 135,);
-    ALIGN_STK_32(pixel,   a_dst, 128 * 128,);
-    ALIGN_STK_32(uint8_t, mask,  128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel,   c_dst, 135 * 135,);
+    ALIGN_STK_64(pixel,   a_dst, 128 * 128,);
+    ALIGN_STK_64(uint8_t, mask,  128 * 128,);
 
     for (int i = 0; i < 128 * 128; i++)
         mask[i] = rnd() % 65;
@@ -271,11 +271,11 @@
 }
 
 static void check_w_mask(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(int16_t, tmp, 2, [128 * 128]);
-    ALIGN_STK_32(pixel,   c_dst,  135 * 135,);
-    ALIGN_STK_32(pixel,   a_dst,  128 * 128,);
-    ALIGN_STK_32(uint8_t, c_mask, 128 * 128,);
-    ALIGN_STK_32(uint8_t, a_mask, 128 * 128,);
+    ALIGN_STK_64(int16_t, tmp, 2, [128 * 128]);
+    ALIGN_STK_64(pixel,   c_dst,  135 * 135,);
+    ALIGN_STK_64(pixel,   a_dst,  128 * 128,);
+    ALIGN_STK_64(uint8_t, c_mask, 128 * 128,);
+    ALIGN_STK_64(uint8_t, a_mask, 128 * 128,);
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp1,
                  const int16_t *tmp2, int w, int h, uint8_t *mask, int sign
@@ -321,10 +321,10 @@
 }
 
 static void check_blend(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp, 32 * 32,);
-    ALIGN_STK_32(pixel, c_dst, 32 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 32 * 32,);
-    ALIGN_STK_32(uint8_t, mask, 32 * 32,);
+    ALIGN_STK_64(pixel, tmp, 32 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 32 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 32,);
+    ALIGN_STK_64(uint8_t, mask, 32 * 32,);
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                  int w, int h, const uint8_t *mask);
@@ -357,9 +357,9 @@
 }
 
 static void check_blend_v(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp,   32 * 128,);
-    ALIGN_STK_32(pixel, c_dst, 32 * 128,);
-    ALIGN_STK_32(pixel, a_dst, 32 * 128,);
+    ALIGN_STK_64(pixel, tmp,   32 * 128,);
+    ALIGN_STK_64(pixel, c_dst, 32 * 128,);
+    ALIGN_STK_64(pixel, a_dst, 32 * 128,);
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                  int w, int h);
@@ -391,9 +391,9 @@
 }
 
 static void check_blend_h(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, tmp,   128 * 32,);
-    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
-    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, tmp,   128 * 32,);
+    ALIGN_STK_64(pixel, c_dst, 128 * 32,);
+    ALIGN_STK_64(pixel, a_dst, 128 * 32,);
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *tmp,
                  int w, int h);
@@ -424,9 +424,9 @@
 }
 
 static void check_warp8x8(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 15 * 15,);
-    ALIGN_STK_32(pixel, c_dst,    8 *  8,);
-    ALIGN_STK_32(pixel, a_dst,    8 *  8,);
+    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+    ALIGN_STK_64(pixel, c_dst,    8 *  8,);
+    ALIGN_STK_64(pixel, a_dst,    8 *  8,);
     int16_t abcd[4];
     const pixel *src = src_buf + 15 * 3 + 3;
     const ptrdiff_t dst_stride =  8 * sizeof(pixel);
@@ -462,9 +462,9 @@
 }
 
 static void check_warp8x8t(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, src_buf, 15 * 15,);
-    ALIGN_STK_32(int16_t, c_tmp,  8 *  8,);
-    ALIGN_STK_32(int16_t, a_tmp,  8 *  8,);
+    ALIGN_STK_64(pixel, src_buf, 15 * 15,);
+    ALIGN_STK_64(int16_t, c_tmp,  8 *  8,);
+    ALIGN_STK_64(int16_t, a_tmp,  8 *  8,);
     int16_t abcd[4];
     const pixel *src = src_buf + 15 * 3 + 3;
     const ptrdiff_t src_stride = 15 * sizeof(pixel);
@@ -534,9 +534,9 @@
 }
 
 static void check_emuedge(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_32(pixel, c_dst, 135 * 192,);
-    ALIGN_STK_32(pixel, a_dst, 135 * 192,);
-    ALIGN_STK_32(pixel, src,   160 * 160,);
+    ALIGN_STK_64(pixel, c_dst, 135 * 192,);
+    ALIGN_STK_64(pixel, a_dst, 135 * 192,);
+    ALIGN_STK_64(pixel, src,   160 * 160,);
 
     for (int i = 0; i < 160 * 160; i++)
         src[i] = rnd() & ((1U << BITDEPTH) - 1);