shithub: dav1d

Download patch

ref: bb178db019da21d62fae8c255675efbd84327195
parent: dccc21b742f328060c124463c5d05f76472d758c
author: Henrik Gramner <[email protected]>
date: Wed Jan 29 09:17:11 EST 2020

Rework the CDEF top edge handling

Avoids some pointer chasing and simplifies the DSP code, at the cost
of making the initialization a little bit more complicated.

Also reduces memory usage by a small amount due to properly sizing
the buffers instead of always allocating enough space for 4:4:4.

--- a/src/arm/32/cdef.S
+++ b/src/arm/32/cdef.S
@@ -150,7 +150,7 @@
 
 // void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
 //                               ptrdiff_t src_stride, const pixel (*left)[2],
-//                               /*const*/ pixel *const top[2], int h,
+//                               const pixel *const top, int h,
 //                               enum CdefEdgeFlags edges);
 
 // n1 = s0/d0
@@ -175,10 +175,9 @@
         b               3f
 1:
         // CDEF_HAVE_TOP
-        ldr             r7,  [r4]
-        ldr             lr,  [r4, #4]
+        add             r7,  r4,  r2
         sub             r0,  r0,  #2*(2*\stride)
-        pad_top_bottom  r7,  lr,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0
+        pad_top_bottom  r4,  r7,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0
 
         // Middle section
 3:
--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -139,7 +139,7 @@
 
 // void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
 //                               ptrdiff_t src_stride, const pixel (*left)[2],
-//                               /*const*/ pixel *const top[2], int h,
+//                               const pixel *const top, int h,
 //                               enum CdefEdgeFlags edges);
 
 .macro padding_func w, stride, rn, rw
@@ -157,9 +157,8 @@
         b               3f
 1:
         // CDEF_HAVE_TOP
-        ldr             x8,  [x4]
-        ldr             x9,  [x4, #8]
-        pad_top_bottom  x8,  x9, \w, \stride, \rn, \rw, 0
+        add             x9,  x4,  x2
+        pad_top_bottom  x4,  x9, \w, \stride, \rn, \rw, 0
 
         // Middle section
 3:
--- a/src/arm/cdef_init_tmpl.c
+++ b/src/arm/cdef_init_tmpl.c
@@ -32,11 +32,11 @@
 
 void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src,
                               ptrdiff_t src_stride, const pixel (*left)[2],
-                              /*const*/ pixel *const top[2], int h,
+                              const pixel *const top, int h,
                               enum CdefEdgeFlags edges);
 void dav1d_cdef_padding8_neon(uint16_t *tmp, const pixel *src,
                               ptrdiff_t src_stride, const pixel (*left)[2],
-                              /*const*/ pixel *const top[2], int h,
+                              const pixel *const top, int h,
                               enum CdefEdgeFlags edges);
 
 void dav1d_cdef_filter4_neon(pixel *dst, ptrdiff_t dst_stride,
@@ -48,17 +48,13 @@
 
 #define DEFINE_FILTER(w, h, tmp_stride)                                      \
 static void                                                                  \
-cdef_filter_##w##x##h##_neon(pixel *dst,                                     \
-                             const ptrdiff_t stride,                         \
-                             const pixel (*left)[2],                         \
-                             /*const*/ pixel *const top[2],                  \
-                             const int pri_strength,                         \
-                             const int sec_strength,                         \
-                             const int dir,                                  \
-                             const int damping,                              \
+cdef_filter_##w##x##h##_neon(pixel *const dst, const ptrdiff_t stride,       \
+                             const pixel (*left)[2], const pixel *const top, \
+                             const int pri_strength, const int sec_strength, \
+                             const int dir, const int damping,               \
                              const enum CdefEdgeFlags edges)                 \
 {                                                                            \
-    ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride + 8,);                     \
+    ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,);                   \
     uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8;                            \
     dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges);     \
     dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength,              \
--- a/src/cdef.h
+++ b/src/cdef.h
@@ -52,7 +52,7 @@
 // order to get access to pre-filter top pixels, use $top.
 #define decl_cdef_fn(name) \
 void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
-            /*const*/ pixel *const top[2], int pri_strength, int sec_strength, \
+            const pixel *top, int pri_strength, int sec_strength, \
             int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
 typedef decl_cdef_fn(*cdef_fn);
 
--- a/src/cdef_apply_tmpl.c
+++ b/src/cdef_apply_tmpl.c
@@ -39,24 +39,28 @@
     BACKUP_2X8_UV = 1 << 1,
 };
 
-static void backup2lines(pixel *const dst[3][2],
-                         /*const*/ pixel *const src[3],
-                         const ptrdiff_t src_stride[2], int y_off, int w,
+static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3],
+                         const ptrdiff_t stride[2], // [0] is used for plane 0, [1] for planes 1 and 2
                          const enum Dav1dPixelLayout layout)
 {
-    pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
-    pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
+    const ptrdiff_t y_stride = PXSTRIDE(stride[0]);
+    if (y_stride < 0) // negative stride: row 7 has the lower address, so copy from it and offset dst by one row too
+        pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride);
+    else // positive stride: rows 6 and 7 are adjacent in memory, so one copy of two full strides covers both
+        pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride);
 
-    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
-    w >>= ss_hor;
-    y_off >>= ss_ver;
-    pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
-    pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
-    pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
-    pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
+    if (layout != DAV1D_PIXEL_LAYOUT_I400) { // planes 1 and 2 are only copied for non-I400 layouts
+        const ptrdiff_t uv_stride = PXSTRIDE(stride[1]);
+        if (uv_stride < 0) {
+            const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7; // second of the two saved rows (lowest address here)
+            pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride);
+            pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride);
+        } else {
+            const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 2 : 6; // first of the two saved rows
+            pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
+            pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
+        }
+    }
 }
 
 static void backup2x8(pixel dst[3][8][2],
@@ -113,11 +117,8 @@
         const int by_idx = by & 30;
         if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
 
-        if (edges & CDEF_HAVE_BOTTOM) {
-            // backup pre-filter data for next iteration
-            backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride,
-                         8, f->bw * 4, layout);
-        }
+        if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
+            backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride, layout);
 
         pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
         pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
@@ -125,7 +126,7 @@
         edges |= CDEF_HAVE_RIGHT;
         enum Backup2x8Flags prev_flag = 0;
         for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
-            const int sb128x = sbx >>1;
+            const int sb128x = sbx >> 1;
             const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
             const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
             if (cdef_idx == -1 ||
@@ -188,18 +189,12 @@
                     const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
                     if (adj_y_pri_lvl || y_sec_lvl)
                         dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
-                                        (pixel *const [2]) {
-                                            &f->lf.cdef_line[tf][0][0][bx * 4],
-                                            &f->lf.cdef_line[tf][0][1][bx * 4],
-                                        },
+                                        &f->lf.cdef_line[tf][0][bx * 4],
                                         adj_y_pri_lvl, y_sec_lvl, dir,
                                         damping, edges HIGHBD_CALL_SUFFIX);
                 } else if (y_sec_lvl)
                     dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
-                                    (pixel *const [2]) {
-                                        &f->lf.cdef_line[tf][0][0][bx * 4],
-                                        &f->lf.cdef_line[tf][0][1][bx * 4],
-                                    },
+                                    &f->lf.cdef_line[tf][0][bx * 4],
                                     0, y_sec_lvl, 0,
                                     damping, edges HIGHBD_CALL_SUFFIX);
                 if (uv_lvl) {
@@ -207,12 +202,8 @@
                     const int uvdir = uv_pri_lvl ? layout == DAV1D_PIXEL_LAYOUT_I422 ?
                         ((const uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir] : dir : 0;
                     for (int pl = 1; pl <= 2; pl++) {
-                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
-                                             lr_bak[bit][pl],
-                                             (pixel *const [2]) {
-                                                 &f->lf.cdef_line[tf][pl][0][bx * 4 >> ss_hor],
-                                                 &f->lf.cdef_line[tf][pl][1][bx * 4 >> ss_hor],
-                                             },
+                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl],
+                                             &f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor],
                                              uv_pri_lvl, uv_sec_lvl, uvdir,
                                              damping - 1, edges HIGHBD_CALL_SUFFIX);
                     }
--- a/src/cdef_tmpl.c
+++ b/src/cdef_tmpl.c
@@ -55,7 +55,7 @@
 
 static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
                     const pixel *src, const ptrdiff_t src_stride,
-                    const pixel (*left)[2], pixel *const top[2],
+                    const pixel (*left)[2], const pixel *top,
                     const int w, const int h,
                     const enum CdefEdgeFlags edges)
 {
@@ -78,9 +78,11 @@
         x_end -= 2;
     }
 
-    for (int y = y_start; y < 0; y++)
+    for (int y = y_start; y < 0; y++) {
         for (int x = x_start; x < x_end; x++)
-            tmp[x + y * tmp_stride] = top[y & 1][x];
+            tmp[x + y * tmp_stride] = top[x];
+        top += PXSTRIDE(src_stride);
+    }
     for (int y = 0; y < h; y++)
         for (int x = x_start; x < 0; x++)
             tmp[x + y * tmp_stride] = left[y][2 + x];
@@ -94,7 +96,7 @@
 
 static NOINLINE void
 cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
-                    const pixel (*left)[2], /*const*/ pixel *const top[2],
+                    const pixel (*left)[2], const pixel *const top,
                     const int pri_strength, const int sec_strength,
                     const int dir, const int damping, const int w, int h,
                     const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
@@ -208,7 +210,7 @@
 static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
                                             const ptrdiff_t stride, \
                                             const pixel (*left)[2], \
-                                            /*const*/ pixel *const top[2], \
+                                            const pixel *const top, \
                                             const int pri_strength, \
                                             const int sec_strength, \
                                             const int dir, \
--- a/src/decode.c
+++ b/src/decode.c
@@ -2762,24 +2762,42 @@
     }
 
     // update allocation of block contexts for above
-    const int line_sz = (int)f->b4_stride << hbd;
-    if (line_sz != f->lf.line_sz) {
-        dav1d_freep_aligned(&f->lf.cdef_line[0][0][0]);
-        uint8_t *ptr = dav1d_alloc_aligned(line_sz * 4 * 12, 32);
+    const ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
+    if (y_stride != f->lf.cdef_line_sz[0] || uv_stride != f->lf.cdef_line_sz[1]) {
+        dav1d_free_aligned(f->lf.cdef_line_buf);
+        size_t alloc_sz = 64;
+        alloc_sz += (y_stride  < 0 ? -y_stride  : y_stride ) * 4;
+        alloc_sz += (uv_stride < 0 ? -uv_stride : uv_stride) * 8;
+        uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
         if (!ptr) {
-            f->lf.line_sz = 0;
+            f->lf.cdef_line_sz[0] = f->lf.cdef_line_sz[1] = 0;
             goto error;
         }
 
-        for (int pl = 0; pl <= 2; pl++) {
-            f->lf.cdef_line[0][pl][0] = ptr + line_sz * 4 * 0;
-            f->lf.cdef_line[0][pl][1] = ptr + line_sz * 4 * 1;
-            f->lf.cdef_line[1][pl][0] = ptr + line_sz * 4 * 2;
-            f->lf.cdef_line[1][pl][1] = ptr + line_sz * 4 * 3;
-            ptr += line_sz * 4 * 4;
+        ptr += 32;
+        if (y_stride < 0) {
+            f->lf.cdef_line[0][0] = ptr - y_stride * 1;
+            f->lf.cdef_line[1][0] = ptr - y_stride * 3;
+            ptr -= y_stride * 4;
+        } else {
+            f->lf.cdef_line[0][0] = ptr + y_stride * 0;
+            f->lf.cdef_line[1][0] = ptr + y_stride * 2;
+            ptr += y_stride * 4;
         }
+        if (uv_stride < 0) {
+            f->lf.cdef_line[0][1] = ptr - uv_stride * 1;
+            f->lf.cdef_line[0][2] = ptr - uv_stride * 3;
+            f->lf.cdef_line[1][1] = ptr - uv_stride * 5;
+            f->lf.cdef_line[1][2] = ptr - uv_stride * 7;
+        } else {
+            f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
+            f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
+            f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
+            f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
+        }
 
-        f->lf.line_sz = line_sz;
+        f->lf.cdef_line_sz[0] = (int) y_stride;
+        f->lf.cdef_line_sz[1] = (int) uv_stride;
     }
 
     const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
--- a/src/internal.h
+++ b/src/internal.h
@@ -216,12 +216,14 @@
         Av1Filter *mask;
         Av1Restoration *lr_mask;
         int top_pre_cdef_toggle;
-        int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
+        int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */;
+        int lr_line_sz, re_sz /* h */;
         ALIGN(Av1FilterLUT lim_lut, 16);
         int last_sharpness;
         uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
         uint8_t *tx_lpf_right_edge[2];
-        pixel *cdef_line[2 /* pre, post */][3 /* plane */][2 /* y */];
+        uint8_t *cdef_line_buf;
+        pixel *cdef_line[2 /* pre, post */][3 /* plane */];
         pixel *lr_lpf_line[3 /* plane */];
 
         // in-loop filter per-frame state keeping
--- a/src/lib.c
+++ b/src/lib.c
@@ -536,7 +536,7 @@
         free(f->lf.level);
         free(f->lf.tx_lpf_right_edge[0]);
         if (f->libaom_cm) dav1d_free_ref_mv_common(f->libaom_cm);
-        dav1d_free_aligned(f->lf.cdef_line[0][0][0]);
+        dav1d_free_aligned(f->lf.cdef_line_buf);
         dav1d_free_aligned(f->lf.lr_lpf_line[0]);
     }
     dav1d_free_aligned(c->fc);
--- a/src/ppc/cdef_init_tmpl.c
+++ b/src/ppc/cdef_init_tmpl.c
@@ -53,7 +53,7 @@
 
 static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
                            const uint8_t *src, const ptrdiff_t src_stride,
-                           const uint8_t (*left)[2], uint8_t *const top[2],
+                           const uint8_t (*left)[2], const uint8_t *const top,
                            const int w, const int h,
                            const enum CdefEdgeFlags edges)
 {
@@ -70,8 +70,8 @@
         l1 = fill;
         y_start = 0;
     } else {
-        l0 = u8h_to_u16(vec_vsx_ld(0, top[0] - 2));
-        l1 = u8h_to_u16(vec_vsx_ld(0, top[1] - 2));
+        l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
+        l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
     }
 
     vec_st(l0, 0, tmp - 2 * 8);
@@ -115,7 +115,7 @@
 
 static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
                            const uint8_t *src, const ptrdiff_t src_stride,
-                           const uint8_t (*left)[2], uint8_t *const top[2],
+                           const uint8_t (*left)[2], const uint8_t *const top,
                            const int w, const int h,
                            const enum CdefEdgeFlags edges)
 {
@@ -134,8 +134,8 @@
         l1l = fill;
         y_start = 0;
     } else {
-        u8x16 l0 = vec_vsx_ld(0, top[0] - 2);
-        u8x16 l1 = vec_vsx_ld(0, top[1] - 2);
+        u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
+        u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
         l0h = u8h_to_u16(l0);
         l0l = u8l_to_u16(l0);
         l1h = u8h_to_u16(l1);
@@ -275,7 +275,7 @@
 
 static inline void
 filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
-           const pixel (*left)[2], /*const*/ pixel *const top[2],
+           const pixel (*left)[2], const pixel *const top,
            const int w, const int h, const int pri_strength,
            const int sec_strength, const int dir,
            const int damping, const enum CdefEdgeFlags edges,
@@ -364,7 +364,7 @@
 
 static inline void
 filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
-           const pixel (*left)[2], /*const*/ pixel *const top[2],
+           const pixel (*left)[2], const pixel *const top,
            const int w, const int h, const int pri_strength,
            const int sec_strength, const int dir,
            const int damping, const enum CdefEdgeFlags edges,
@@ -456,7 +456,7 @@
 static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
                                         const ptrdiff_t dst_stride, \
                                         const pixel (*left)[2], \
-                                        /*const*/ pixel *const top[2], \
+                                        const pixel *const top, \
                                         const int pri_strength, \
                                         const int sec_strength, \
                                         const int dir, \
--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -215,23 +215,20 @@
 .body_done:
 
     ; top
-    DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
     test         edgeb, 4                    ; have_top
     jz .no_top
-    mov          top1q, [top2q+0*gprsize]
-    mov          top2q, [top2q+1*gprsize]
     test         edgeb, 1                    ; have_left
     jz .top_no_left
     test         edgeb, 2                    ; have_right
     jz .top_no_right
-    pmovzxbw        m1, [top1q-(%1/2)]
-    pmovzxbw        m2, [top2q-(%1/2)]
+    pmovzxbw        m1, [topq+strideq*0-(%1/2)]
+    pmovzxbw        m2, [topq+strideq*1-(%1/2)]
     movu  [px-2*%3-%1], m1
     movu  [px-1*%3-%1], m2
     jmp .top_done
 .top_no_right:
-    pmovzxbw        m1, [top1q-%1]
-    pmovzxbw        m2, [top2q-%1]
+    pmovzxbw        m1, [topq+strideq*0-%1]
+    pmovzxbw        m2, [topq+strideq*1-%1]
     movu [px-2*%3-%1*2], m1
     movu [px-1*%3-%1*2], m2
     movd [px-2*%3+%1*2], xm14
@@ -240,8 +237,8 @@
 .top_no_left:
     test         edgeb, 2                   ; have_right
     jz .top_no_left_right
-    pmovzxbw        m1, [top1q]
-    pmovzxbw        m2, [top2q]
+    pmovzxbw        m1, [topq+strideq*0]
+    pmovzxbw        m2, [topq+strideq*1]
     mova   [px-2*%3+0], m1
     mova   [px-1*%3+0], m2
     movd   [px-2*%3-4], xm14
@@ -249,14 +246,14 @@
     jmp .top_done
 .top_no_left_right:
 %if %1 == 4
-    movd           xm1, [top1q]
-    pinsrd         xm1, [top2q], 1
+    movd           xm1, [topq+strideq*0]
+    pinsrd         xm1, [topq+strideq*1], 1
     pmovzxbw       xm1, xm1
     movq   [px-2*%3+0], xm1
     movhps [px-1*%3+0], xm1
 %else
-    pmovzxbw       xm1, [top1q]
-    pmovzxbw       xm2, [top2q]
+    pmovzxbw       xm1, [topq+strideq*0]
+    pmovzxbw       xm2, [topq+strideq*1]
     mova   [px-2*%3+0], xm1
     mova   [px-1*%3+0], xm2
 %endif
--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -364,26 +364,19 @@
 .body_done:
 
     ; top
- %if ARCH_X86_64
-    DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
- %else
-    DEFINE_ARGS dst, stride, left, top2, stride3, top1, edge
- %endif
     LOAD_ARG32     top
     test         edged, 4                    ; have_top
     jz .no_top
-    mov          top1q, [top2q+0*gprsize]
-    mov          top2q, [top2q+1*gprsize]
     test         edged, 1                    ; have_left
     jz .top_no_left
     test         edged, 2                    ; have_right
     jz .top_no_right
  %if %1 == 4
-    PMOVZXBW        m0, [top1q-2]
-    PMOVZXBW        m1, [top2q-2]
+    PMOVZXBW        m0, [topq+strideq*0-2]
+    PMOVZXBW        m1, [topq+strideq*1-2]
  %else
-    movu            m0, [top1q-4]
-    movu            m1, [top2q-4]
+    movu            m0, [topq+strideq*0-4]
+    movu            m1, [topq+strideq*1-4]
     punpckhbw       m2, m0, m15
     punpcklbw       m0, m15
     punpckhbw       m3, m1, m15
@@ -396,13 +389,13 @@
     jmp .top_done
 .top_no_right:
  %if %1 == 4
-    PMOVZXBW        m0, [top1q-%1]
-    PMOVZXBW        m1, [top2q-%1]
+    PMOVZXBW        m0, [topq+strideq*0-%1]
+    PMOVZXBW        m1, [topq+strideq*1-%1]
     movu [px-2*%3-4*2], m0
     movu [px-1*%3-4*2], m1
  %else
-    movu            m0, [top1q-%1]
-    movu            m1, [top2q-%2]
+    movu            m0, [topq+strideq*0-%1]
+    movu            m1, [topq+strideq*1-%2]
     punpckhbw       m2, m0, m15
     punpcklbw       m0, m15
     punpckhbw       m3, m1, m15
@@ -419,11 +412,11 @@
     test         edged, 2                   ; have_right
     jz .top_no_left_right
  %if %1 == 4
-    PMOVZXBW        m0, [top1q]
-    PMOVZXBW        m1, [top2q]
+    PMOVZXBW        m0, [topq+strideq*0]
+    PMOVZXBW        m1, [topq+strideq*1]
  %else
-    movu            m0, [top1q]
-    movu            m1, [top2q]
+    movu            m0, [topq+strideq*0]
+    movu            m1, [topq+strideq*1]
     punpckhbw       m2, m0, m15
     punpcklbw       m0, m15
     punpckhbw       m3, m1, m15
@@ -437,8 +430,8 @@
     mov dword [px-1*%3-4], OUT_OF_BOUNDS
     jmp .top_done
 .top_no_left_right:
-    PMOVZXBW        m0, [top1q], %1 == 4
-    PMOVZXBW        m1, [top2q], %1 == 4
+    PMOVZXBW        m0, [topq+strideq*0], %1 == 4
+    PMOVZXBW        m1, [topq+strideq*1], %1 == 4
     mova     [px-2*%3], m0
     mova     [px-1*%3], m1
     mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
--- a/tests/checkasm/cdef.c
+++ b/tests/checkasm/cdef.c
@@ -45,15 +45,14 @@
 }
 
 static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
-    ALIGN_STK_64(pixel, c_src, 10 * 16 + 8, ), *const c_dst = c_src + 8;
-    ALIGN_STK_64(pixel, a_src, 10 * 16 + 8, ), *const a_dst = a_src + 8;
-    ALIGN_STK_64(pixel, top, 16 * 2 + 8, );
+    ALIGN_STK_64(pixel, c_src,   16 * 10 + 16, ), *const c_dst = c_src + 8;
+    ALIGN_STK_64(pixel, a_src,   16 * 10 + 16, ), *const a_dst = a_src + 8;
+    ALIGN_STK_64(pixel, top_buf, 16 *  2 + 16, ), *const top = top_buf + 8;
     pixel left[8][2];
-    pixel *const top_ptrs[2] = { top + 8, top + 24 };
     const ptrdiff_t stride = 16 * sizeof(pixel);
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2],
-                 pixel *const top[2], int pri_strength, int sec_strength,
+                 const pixel *top, int pri_strength, int sec_strength,
                  int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);
 
     if (check_func(fn, "cdef_filter_%dx%d_%dbpc", w, h, BITDEPTH)) {
@@ -66,10 +65,10 @@
 #endif
                 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
 
-                init_tmp(c_src, 10 * 16 + 8, bitdepth_max);
-                init_tmp(top, 16 * 2 + 8, bitdepth_max);
+                init_tmp(c_src, 16 * 10 + 16, bitdepth_max);
+                init_tmp(top_buf, 16 * 2 + 16, bitdepth_max);
                 init_tmp((pixel *) left, 8 * 2, bitdepth_max);
-                memcpy(a_src, c_src, (10 * 16 + 8) * sizeof(pixel));
+                memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel));
 
                 const int lvl = 1 + (rnd() % 62);
                 const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1));
@@ -77,9 +76,9 @@
                 int sec_strength = lvl & 3;
                 sec_strength += sec_strength == 3;
                 sec_strength <<= bitdepth_min_8;
-                call_ref(c_dst, stride, left, top_ptrs, pri_strength, sec_strength,
+                call_ref(c_dst, stride, left, top, pri_strength, sec_strength,
                          dir, damping, edges HIGHBD_TAIL_SUFFIX);
-                call_new(a_dst, stride, left, top_ptrs, pri_strength, sec_strength,
+                call_new(a_dst, stride, left, top, pri_strength, sec_strength,
                          dir, damping, edges HIGHBD_TAIL_SUFFIX);
                 if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) {
                     fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n",
@@ -94,7 +93,7 @@
                      */
                     pri_strength = (edges & 1) << bitdepth_min_8;
                     sec_strength = (edges & 2) << bitdepth_min_8;
-                    bench_new(a_dst, stride, left, top_ptrs, pri_strength, sec_strength,
+                    bench_new(a_dst, stride, left, top, pri_strength, sec_strength,
                               dir, damping, edges HIGHBD_TAIL_SUFFIX);
                 }
             }