ref: bb178db019da21d62fae8c255675efbd84327195
parent: dccc21b742f328060c124463c5d05f76472d758c
author: Henrik Gramner <[email protected]>
date: Wed Jan 29 09:17:11 EST 2020
Rework the CDEF top edge handling. This avoids some pointer chasing and simplifies the DSP code, at the cost of making the initialization a little bit more complicated. It also reduces memory usage by a small amount by properly sizing the buffers instead of always allocating enough space for 4:4:4.
--- a/src/arm/32/cdef.S
+++ b/src/arm/32/cdef.S
@@ -150,7 +150,7 @@
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
-// /*const*/ pixel *const top[2], int h,
+// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
// n1 = s0/d0
@@ -175,10 +175,9 @@
b 3f
1:
// CDEF_HAVE_TOP
- ldr r7, [r4]
- ldr lr, [r4, #4]
+ add r7, r4, r2
sub r0, r0, #2*(2*\stride)
- pad_top_bottom r7, lr, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
+ pad_top_bottom r4, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
// Middle section
3:
--- a/src/arm/64/cdef.S
+++ b/src/arm/64/cdef.S
@@ -139,7 +139,7 @@
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
-// /*const*/ pixel *const top[2], int h,
+// const pixel *const top, int h,
// enum CdefEdgeFlags edges);
.macro padding_func w, stride, rn, rw
@@ -157,9 +157,8 @@
b 3f
1:
// CDEF_HAVE_TOP
- ldr x8, [x4]
- ldr x9, [x4, #8]
- pad_top_bottom x8, x9, \w, \stride, \rn, \rw, 0
+ add x9, x4, x2
+ pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0
// Middle section
3:
--- a/src/arm/cdef_init_tmpl.c
+++ b/src/arm/cdef_init_tmpl.c
@@ -32,11 +32,11 @@
void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
- /*const*/ pixel *const top[2], int h,
+ const pixel *const top, int h,
enum CdefEdgeFlags edges);
void dav1d_cdef_padding8_neon(uint16_t *tmp, const pixel *src,
ptrdiff_t src_stride, const pixel (*left)[2],
- /*const*/ pixel *const top[2], int h,
+ const pixel *const top, int h,
enum CdefEdgeFlags edges);
void dav1d_cdef_filter4_neon(pixel *dst, ptrdiff_t dst_stride,
@@ -48,17 +48,13 @@
#define DEFINE_FILTER(w, h, tmp_stride) \
static void \
-cdef_filter_##w##x##h##_neon(pixel *dst, \
- const ptrdiff_t stride, \
- const pixel (*left)[2], \
- /*const*/ pixel *const top[2], \
- const int pri_strength, \
- const int sec_strength, \
- const int dir, \
- const int damping, \
+cdef_filter_##w##x##h##_neon(pixel *const dst, const ptrdiff_t stride, \
+ const pixel (*left)[2], const pixel *const top, \
+ const int pri_strength, const int sec_strength, \
+ const int dir, const int damping, \
const enum CdefEdgeFlags edges) \
{ \
- ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride + 8,); \
+ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges); \
dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength, \
--- a/src/cdef.h
+++ b/src/cdef.h
@@ -52,7 +52,7 @@
// order to get access to pre-filter top pixels, use $top.
#define decl_cdef_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
- /*const*/ pixel *const top[2], int pri_strength, int sec_strength, \
+ const pixel *top, int pri_strength, int sec_strength, \
int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
typedef decl_cdef_fn(*cdef_fn);
--- a/src/cdef_apply_tmpl.c
+++ b/src/cdef_apply_tmpl.c
@@ -39,24 +39,28 @@
BACKUP_2X8_UV = 1 << 1,
};
-static void backup2lines(pixel *const dst[3][2],
- /*const*/ pixel *const src[3],
- const ptrdiff_t src_stride[2], int y_off, int w,
+static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3],
+ const ptrdiff_t stride[2],
const enum Dav1dPixelLayout layout)
{
- pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
- pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
+ const ptrdiff_t y_stride = PXSTRIDE(stride[0]);
+ if (y_stride < 0)
+ pixel_copy(dst[0] + y_stride, src[0] + 7 * y_stride, -2 * y_stride);
+ else
+ pixel_copy(dst[0], src[0] + 6 * y_stride, 2 * y_stride);
- if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
- const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
- w >>= ss_hor;
- y_off >>= ss_ver;
- pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
- pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
- pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
- pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
+ if (layout != DAV1D_PIXEL_LAYOUT_I400) {
+ const ptrdiff_t uv_stride = PXSTRIDE(stride[1]);
+ if (uv_stride < 0) {
+ const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 3 : 7;
+ pixel_copy(dst[1] + uv_stride, src[1] + uv_off * uv_stride, -2 * uv_stride);
+ pixel_copy(dst[2] + uv_stride, src[2] + uv_off * uv_stride, -2 * uv_stride);
+ } else {
+ const int uv_off = layout == DAV1D_PIXEL_LAYOUT_I420 ? 2 : 6;
+ pixel_copy(dst[1], src[1] + uv_off * uv_stride, 2 * uv_stride);
+ pixel_copy(dst[2], src[2] + uv_off * uv_stride, 2 * uv_stride);
+ }
+ }
}
static void backup2x8(pixel dst[3][8][2],
@@ -113,11 +117,8 @@
const int by_idx = by & 30;
if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
- if (edges & CDEF_HAVE_BOTTOM) {
- // backup pre-filter data for next iteration
- backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride,
- 8, f->bw * 4, layout);
- }
+ if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
+ backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride, layout);
pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
@@ -125,7 +126,7 @@
edges |= CDEF_HAVE_RIGHT;
enum Backup2x8Flags prev_flag = 0;
for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
- const int sb128x = sbx >>1;
+ const int sb128x = sbx >> 1;
const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
if (cdef_idx == -1 ||
@@ -188,18 +189,12 @@
const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
if (adj_y_pri_lvl || y_sec_lvl)
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
- (pixel *const [2]) {
- &f->lf.cdef_line[tf][0][0][bx * 4],
- &f->lf.cdef_line[tf][0][1][bx * 4],
- },
+ &f->lf.cdef_line[tf][0][bx * 4],
adj_y_pri_lvl, y_sec_lvl, dir,
damping, edges HIGHBD_CALL_SUFFIX);
} else if (y_sec_lvl)
dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
- (pixel *const [2]) {
- &f->lf.cdef_line[tf][0][0][bx * 4],
- &f->lf.cdef_line[tf][0][1][bx * 4],
- },
+ &f->lf.cdef_line[tf][0][bx * 4],
0, y_sec_lvl, 0,
damping, edges HIGHBD_CALL_SUFFIX);
if (uv_lvl) {
@@ -207,12 +202,8 @@
const int uvdir = uv_pri_lvl ? layout == DAV1D_PIXEL_LAYOUT_I422 ?
((const uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir] : dir : 0;
for (int pl = 1; pl <= 2; pl++) {
- dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
- lr_bak[bit][pl],
- (pixel *const [2]) {
- &f->lf.cdef_line[tf][pl][0][bx * 4 >> ss_hor],
- &f->lf.cdef_line[tf][pl][1][bx * 4 >> ss_hor],
- },
+ dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl],
+ &f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor],
uv_pri_lvl, uv_sec_lvl, uvdir,
damping - 1, edges HIGHBD_CALL_SUFFIX);
}
--- a/src/cdef_tmpl.c
+++ b/src/cdef_tmpl.c
@@ -55,7 +55,7 @@
static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
const pixel *src, const ptrdiff_t src_stride,
- const pixel (*left)[2], pixel *const top[2],
+ const pixel (*left)[2], const pixel *top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{
@@ -78,9 +78,11 @@
x_end -= 2;
}
- for (int y = y_start; y < 0; y++)
+ for (int y = y_start; y < 0; y++) {
for (int x = x_start; x < x_end; x++)
- tmp[x + y * tmp_stride] = top[y & 1][x];
+ tmp[x + y * tmp_stride] = top[x];
+ top += PXSTRIDE(src_stride);
+ }
for (int y = 0; y < h; y++)
for (int x = x_start; x < 0; x++)
tmp[x + y * tmp_stride] = left[y][2 + x];
@@ -94,7 +96,7 @@
static NOINLINE void
cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
- const pixel (*left)[2], /*const*/ pixel *const top[2],
+ const pixel (*left)[2], const pixel *const top,
const int pri_strength, const int sec_strength,
const int dir, const int damping, const int w, int h,
const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
@@ -208,7 +210,7 @@
static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
const ptrdiff_t stride, \
const pixel (*left)[2], \
- /*const*/ pixel *const top[2], \
+ const pixel *const top, \
const int pri_strength, \
const int sec_strength, \
const int dir, \
--- a/src/decode.c
+++ b/src/decode.c
@@ -2762,24 +2762,42 @@
}
// update allocation of block contexts for above
- const int line_sz = (int)f->b4_stride << hbd;
- if (line_sz != f->lf.line_sz) {
- dav1d_freep_aligned(&f->lf.cdef_line[0][0][0]);
- uint8_t *ptr = dav1d_alloc_aligned(line_sz * 4 * 12, 32);
+ const ptrdiff_t y_stride = f->cur.stride[0], uv_stride = f->cur.stride[1];
+ if (y_stride != f->lf.cdef_line_sz[0] || uv_stride != f->lf.cdef_line_sz[1]) {
+ dav1d_free_aligned(f->lf.cdef_line_buf);
+ size_t alloc_sz = 64;
+ alloc_sz += (y_stride < 0 ? -y_stride : y_stride ) * 4;
+ alloc_sz += (uv_stride < 0 ? -uv_stride : uv_stride) * 8;
+ uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32);
if (!ptr) {
- f->lf.line_sz = 0;
+ f->lf.cdef_line_sz[0] = f->lf.cdef_line_sz[1] = 0;
goto error;
}
- for (int pl = 0; pl <= 2; pl++) {
- f->lf.cdef_line[0][pl][0] = ptr + line_sz * 4 * 0;
- f->lf.cdef_line[0][pl][1] = ptr + line_sz * 4 * 1;
- f->lf.cdef_line[1][pl][0] = ptr + line_sz * 4 * 2;
- f->lf.cdef_line[1][pl][1] = ptr + line_sz * 4 * 3;
- ptr += line_sz * 4 * 4;
+ ptr += 32;
+ if (y_stride < 0) {
+ f->lf.cdef_line[0][0] = ptr - y_stride * 1;
+ f->lf.cdef_line[1][0] = ptr - y_stride * 3;
+ ptr -= y_stride * 4;
+ } else {
+ f->lf.cdef_line[0][0] = ptr + y_stride * 0;
+ f->lf.cdef_line[1][0] = ptr + y_stride * 2;
+ ptr += y_stride * 4;
}
+ if (uv_stride < 0) {
+ f->lf.cdef_line[0][1] = ptr - uv_stride * 1;
+ f->lf.cdef_line[0][2] = ptr - uv_stride * 3;
+ f->lf.cdef_line[1][1] = ptr - uv_stride * 5;
+ f->lf.cdef_line[1][2] = ptr - uv_stride * 7;
+ } else {
+ f->lf.cdef_line[0][1] = ptr + uv_stride * 0;
+ f->lf.cdef_line[0][2] = ptr + uv_stride * 2;
+ f->lf.cdef_line[1][1] = ptr + uv_stride * 4;
+ f->lf.cdef_line[1][2] = ptr + uv_stride * 6;
+ }
- f->lf.line_sz = line_sz;
+ f->lf.cdef_line_sz[0] = (int) y_stride;
+ f->lf.cdef_line_sz[1] = (int) uv_stride;
}
const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
--- a/src/internal.h
+++ b/src/internal.h
@@ -216,12 +216,14 @@
Av1Filter *mask;
Av1Restoration *lr_mask;
int top_pre_cdef_toggle;
- int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
+ int mask_sz /* w*h */, lr_mask_sz, cdef_line_sz[2] /* stride */;
+ int lr_line_sz, re_sz /* h */;
ALIGN(Av1FilterLUT lim_lut, 16);
int last_sharpness;
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
uint8_t *tx_lpf_right_edge[2];
- pixel *cdef_line[2 /* pre, post */][3 /* plane */][2 /* y */];
+ uint8_t *cdef_line_buf;
+ pixel *cdef_line[2 /* pre, post */][3 /* plane */];
pixel *lr_lpf_line[3 /* plane */];
// in-loop filter per-frame state keeping
--- a/src/lib.c
+++ b/src/lib.c
@@ -536,7 +536,7 @@
free(f->lf.level);
free(f->lf.tx_lpf_right_edge[0]);
if (f->libaom_cm) dav1d_free_ref_mv_common(f->libaom_cm);
- dav1d_free_aligned(f->lf.cdef_line[0][0][0]);
+ dav1d_free_aligned(f->lf.cdef_line_buf);
dav1d_free_aligned(f->lf.lr_lpf_line[0]);
}
dav1d_free_aligned(c->fc);
--- a/src/ppc/cdef_init_tmpl.c
+++ b/src/ppc/cdef_init_tmpl.c
@@ -53,7 +53,7 @@
static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
- const uint8_t (*left)[2], uint8_t *const top[2],
+ const uint8_t (*left)[2], const uint8_t *const top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{
@@ -70,8 +70,8 @@
l1 = fill;
y_start = 0;
} else {
- l0 = u8h_to_u16(vec_vsx_ld(0, top[0] - 2));
- l1 = u8h_to_u16(vec_vsx_ld(0, top[1] - 2));
+ l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
+ l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
}
vec_st(l0, 0, tmp - 2 * 8);
@@ -115,7 +115,7 @@
static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
- const uint8_t (*left)[2], uint8_t *const top[2],
+ const uint8_t (*left)[2], const uint8_t *const top,
const int w, const int h,
const enum CdefEdgeFlags edges)
{
@@ -134,8 +134,8 @@
l1l = fill;
y_start = 0;
} else {
- u8x16 l0 = vec_vsx_ld(0, top[0] - 2);
- u8x16 l1 = vec_vsx_ld(0, top[1] - 2);
+ u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
+ u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
l0h = u8h_to_u16(l0);
l0l = u8l_to_u16(l0);
l1h = u8h_to_u16(l1);
@@ -275,7 +275,7 @@
static inline void
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
- const pixel (*left)[2], /*const*/ pixel *const top[2],
+ const pixel (*left)[2], const pixel *const top,
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges,
@@ -364,7 +364,7 @@
static inline void
filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
- const pixel (*left)[2], /*const*/ pixel *const top[2],
+ const pixel (*left)[2], const pixel *const top,
const int w, const int h, const int pri_strength,
const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges,
@@ -456,7 +456,7 @@
static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
const ptrdiff_t dst_stride, \
const pixel (*left)[2], \
- /*const*/ pixel *const top[2], \
+ const pixel *const top, \
const int pri_strength, \
const int sec_strength, \
const int dir, \
--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -215,23 +215,20 @@
.body_done:
; top
- DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
test edgeb, 4 ; have_top
jz .no_top
- mov top1q, [top2q+0*gprsize]
- mov top2q, [top2q+1*gprsize]
test edgeb, 1 ; have_left
jz .top_no_left
test edgeb, 2 ; have_right
jz .top_no_right
- pmovzxbw m1, [top1q-(%1/2)]
- pmovzxbw m2, [top2q-(%1/2)]
+ pmovzxbw m1, [topq+strideq*0-(%1/2)]
+ pmovzxbw m2, [topq+strideq*1-(%1/2)]
movu [px-2*%3-%1], m1
movu [px-1*%3-%1], m2
jmp .top_done
.top_no_right:
- pmovzxbw m1, [top1q-%1]
- pmovzxbw m2, [top2q-%1]
+ pmovzxbw m1, [topq+strideq*0-%1]
+ pmovzxbw m2, [topq+strideq*1-%1]
movu [px-2*%3-%1*2], m1
movu [px-1*%3-%1*2], m2
movd [px-2*%3+%1*2], xm14
@@ -240,8 +237,8 @@
.top_no_left:
test edgeb, 2 ; have_right
jz .top_no_left_right
- pmovzxbw m1, [top1q]
- pmovzxbw m2, [top2q]
+ pmovzxbw m1, [topq+strideq*0]
+ pmovzxbw m2, [topq+strideq*1]
mova [px-2*%3+0], m1
mova [px-1*%3+0], m2
movd [px-2*%3-4], xm14
@@ -249,14 +246,14 @@
jmp .top_done
.top_no_left_right:
%if %1 == 4
- movd xm1, [top1q]
- pinsrd xm1, [top2q], 1
+ movd xm1, [topq+strideq*0]
+ pinsrd xm1, [topq+strideq*1], 1
pmovzxbw xm1, xm1
movq [px-2*%3+0], xm1
movhps [px-1*%3+0], xm1
%else
- pmovzxbw xm1, [top1q]
- pmovzxbw xm2, [top2q]
+ pmovzxbw xm1, [topq+strideq*0]
+ pmovzxbw xm2, [topq+strideq*1]
mova [px-2*%3+0], xm1
mova [px-1*%3+0], xm2
%endif
--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -364,26 +364,19 @@
.body_done:
; top
- %if ARCH_X86_64
- DEFINE_ARGS dst, stride, left, top2, pri, sec, stride3, top1, edge
- %else
- DEFINE_ARGS dst, stride, left, top2, stride3, top1, edge
- %endif
LOAD_ARG32 top
test edged, 4 ; have_top
jz .no_top
- mov top1q, [top2q+0*gprsize]
- mov top2q, [top2q+1*gprsize]
test edged, 1 ; have_left
jz .top_no_left
test edged, 2 ; have_right
jz .top_no_right
%if %1 == 4
- PMOVZXBW m0, [top1q-2]
- PMOVZXBW m1, [top2q-2]
+ PMOVZXBW m0, [topq+strideq*0-2]
+ PMOVZXBW m1, [topq+strideq*1-2]
%else
- movu m0, [top1q-4]
- movu m1, [top2q-4]
+ movu m0, [topq+strideq*0-4]
+ movu m1, [topq+strideq*1-4]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15
@@ -396,13 +389,13 @@
jmp .top_done
.top_no_right:
%if %1 == 4
- PMOVZXBW m0, [top1q-%1]
- PMOVZXBW m1, [top2q-%1]
+ PMOVZXBW m0, [topq+strideq*0-%1]
+ PMOVZXBW m1, [topq+strideq*1-%1]
movu [px-2*%3-4*2], m0
movu [px-1*%3-4*2], m1
%else
- movu m0, [top1q-%1]
- movu m1, [top2q-%2]
+ movu m0, [topq+strideq*0-%1]
+ movu m1, [topq+strideq*1-%2]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15
@@ -419,11 +412,11 @@
test edged, 2 ; have_right
jz .top_no_left_right
%if %1 == 4
- PMOVZXBW m0, [top1q]
- PMOVZXBW m1, [top2q]
+ PMOVZXBW m0, [topq+strideq*0]
+ PMOVZXBW m1, [topq+strideq*1]
%else
- movu m0, [top1q]
- movu m1, [top2q]
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
punpckhbw m2, m0, m15
punpcklbw m0, m15
punpckhbw m3, m1, m15
@@ -437,8 +430,8 @@
mov dword [px-1*%3-4], OUT_OF_BOUNDS
jmp .top_done
.top_no_left_right:
- PMOVZXBW m0, [top1q], %1 == 4
- PMOVZXBW m1, [top2q], %1 == 4
+ PMOVZXBW m0, [topq+strideq*0], %1 == 4
+ PMOVZXBW m1, [topq+strideq*1], %1 == 4
mova [px-2*%3], m0
mova [px-1*%3], m1
mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
--- a/tests/checkasm/cdef.c
+++ b/tests/checkasm/cdef.c
@@ -45,15 +45,14 @@
}
static void check_cdef_filter(const cdef_fn fn, const int w, const int h) {
- ALIGN_STK_64(pixel, c_src, 10 * 16 + 8, ), *const c_dst = c_src + 8;
- ALIGN_STK_64(pixel, a_src, 10 * 16 + 8, ), *const a_dst = a_src + 8;
- ALIGN_STK_64(pixel, top, 16 * 2 + 8, );
+ ALIGN_STK_64(pixel, c_src, 16 * 10 + 16, ), *const c_dst = c_src + 8;
+ ALIGN_STK_64(pixel, a_src, 16 * 10 + 16, ), *const a_dst = a_src + 8;
+ ALIGN_STK_64(pixel, top_buf, 16 * 2 + 16, ), *const top = top_buf + 8;
pixel left[8][2];
- pixel *const top_ptrs[2] = { top + 8, top + 24 };
const ptrdiff_t stride = 16 * sizeof(pixel);
declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel (*left)[2],
- pixel *const top[2], int pri_strength, int sec_strength,
+ const pixel *top, int pri_strength, int sec_strength,
int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);
if (check_func(fn, "cdef_filter_%dx%d_%dbpc", w, h, BITDEPTH)) {
@@ -66,10 +65,10 @@
#endif
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
- init_tmp(c_src, 10 * 16 + 8, bitdepth_max);
- init_tmp(top, 16 * 2 + 8, bitdepth_max);
+ init_tmp(c_src, 16 * 10 + 16, bitdepth_max);
+ init_tmp(top_buf, 16 * 2 + 16, bitdepth_max);
init_tmp((pixel *) left, 8 * 2, bitdepth_max);
- memcpy(a_src, c_src, (10 * 16 + 8) * sizeof(pixel));
+ memcpy(a_src, c_src, (16 * 10 + 16) * sizeof(pixel));
const int lvl = 1 + (rnd() % 62);
const int damping = 3 + (rnd() & 3) + bitdepth_min_8 - (w == 4 || (rnd() & 1));
@@ -77,9 +76,9 @@
int sec_strength = lvl & 3;
sec_strength += sec_strength == 3;
sec_strength <<= bitdepth_min_8;
- call_ref(c_dst, stride, left, top_ptrs, pri_strength, sec_strength,
+ call_ref(c_dst, stride, left, top, pri_strength, sec_strength,
dir, damping, edges HIGHBD_TAIL_SUFFIX);
- call_new(a_dst, stride, left, top_ptrs, pri_strength, sec_strength,
+ call_new(a_dst, stride, left, top, pri_strength, sec_strength,
dir, damping, edges HIGHBD_TAIL_SUFFIX);
if (checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst")) {
fprintf(stderr, "strength = %d:%d, dir = %d, damping = %d, edges = %04d\n",
@@ -94,7 +93,7 @@
*/
pri_strength = (edges & 1) << bitdepth_min_8;
sec_strength = (edges & 2) << bitdepth_min_8;
- bench_new(a_dst, stride, left, top_ptrs, pri_strength, sec_strength,
+ bench_new(a_dst, stride, left, top, pri_strength, sec_strength,
dir, damping, edges HIGHBD_TAIL_SUFFIX);
}
}