ref: 1f32abd286557fc2fb1bee3dbf31c2dcce337c17
parent: 1d7754830ec78b9124c4c8be198aa802669675db
author: Ronald S. Bultje <[email protected]>
date: Tue Oct 16 04:46:04 EDT 2018
Add infrastructure for LR SIMD and unit tests.

wiener_luma_8bpc_c: 326272.1
wiener_luma_8bpc_avx2: 19841.5

Decoding time of first 1000 frames of Chimera-8bit-1920x1080.ivf
goes from 27.471 to 23.558 seconds.
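(Roughly a 16x speedup of the wiener_luma_8bpc kernel in checkasm, and about 14% less overall decoding time, going by the numbers above.)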
--- a/src/looprestoration.c
+++ b/src/looprestoration.c
@@ -65,6 +65,11 @@
pixel_copy(dst_l, p, unit_w);
pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
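+ // the 3 pixels immediately to the left of the unit come from the saved
+ // left[] columns rather than from the frame row p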
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
}
pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
@@ -81,6 +86,11 @@
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
}
// Inner UNIT_WxSTRIPE_H
@@ -560,4 +570,8 @@
void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
c->wiener = wiener_c;
c->selfguided = selfguided_c;
+
+#if ARCH_X86 && BITDEPTH == 8
+ bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
+#endif
}
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -40,21 +40,31 @@
LR_HAVE_BOTTOM = 1 << 3,
};
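+// per-row copy of the 4 pixels just left of the restoration unit; falls back
+// to an opaque pointer when this header is included without a BITDEPTH (and
+// thus without a pixel type) defined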
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row)[4];
+#else
+typedef const void *const_left_pixel_row;
+#endif
+
// Although the spec applies restoration filters over 4x4 blocks, the wiener
// filter can be applied to a bigger surface.
// * w is constrained by the restoration unit size (w <= 256)
// * h is constrained by the stripe height (h <= 64)
-typedef void (*wienerfilter_fn)(pixel *dst, ptrdiff_t dst_stride,
- const void *left /*const pixel (*left)[4]*/,
- const pixel *lpf, ptrdiff_t lpf_stride,
- int w, int h, const int16_t filterh[7],
- const int16_t filterv[7], enum LrEdgeFlags edges);
+#define decl_wiener_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const_left_pixel_row left, \
+ const pixel *lpf, ptrdiff_t lpf_stride, \
+ int w, int h, const int16_t filterh[7], \
+ const int16_t filterv[7], enum LrEdgeFlags edges)
+typedef decl_wiener_filter_fn(*wienerfilter_fn);
-typedef void (*selfguided_fn)(pixel *dst, ptrdiff_t dst_stride,
- const void *left /*const pixel (*left)[4]*/,
- const pixel *lpf, ptrdiff_t lpf_stride,
- int w, int h, int sgr_idx, const int16_t sgr_w[2],
- const enum LrEdgeFlags edges);
+#define decl_selfguided_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const_left_pixel_row left, \
+ const pixel *lpf, ptrdiff_t lpf_stride, \
+ int w, int h, int sgr_idx, const int16_t sgr_w[2], \
+ const enum LrEdgeFlags edges)
+typedef decl_selfguided_filter_fn(*selfguided_fn);
typedef struct Dav1dLoopRestorationDSPContext {
wienerfilter_fn wiener;
@@ -63,5 +73,8 @@
void dav1d_loop_restoration_dsp_init_8bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_10bpc(Dav1dLoopRestorationDSPContext *c);
+
+void dav1d_loop_restoration_dsp_init_x86_8bpc(Dav1dLoopRestorationDSPContext *c);
+void dav1d_loop_restoration_dsp_init_x86_10bpc(Dav1dLoopRestorationDSPContext *c);
#endif /* __DAV1D_SRC_LOOPRESTORATION_H__ */
--- a/src/meson.build
+++ b/src/meson.build
@@ -101,6 +101,7 @@
'x86/ipred_init.c',
'x86/itx_init.c',
'x86/loopfilter_init.c',
+ 'x86/looprestoration_init.c',
'x86/mc_init.c',
)
@@ -110,6 +111,7 @@
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
+ 'x86/looprestoration.asm',
'x86/mc.asm',
)
--- /dev/null
+++ b/src/x86/looprestoration.asm
@@ -1,0 +1,303 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_right_ext_mask: times 32 db 0xff
+ times 32 db 0
+pb_14x0_1_2: times 14 db 0
+ db 1, 2
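+; shuffles used for w=1/2: clamp source byte indices at 13/14, the position
+; of the last valid pixel in xm0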
+pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
+pb_15: times 16 db 15
+pw_128: times 2 dw 128
+pw_2048: times 2 dw 2048
+pw_16380: times 2 dw 16380
+pw_0_128: dw 0, 128
+pd_1024: dd 1024
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
+ vpbroadcastb m15, [fhq+0]
+ vpbroadcastb m14, [fhq+2]
+ vpbroadcastb m13, [fhq+4]
+ vpbroadcastw m12, [fhq+6]
+ vpbroadcastd m9, [pw_128]
+ paddw m12, m9
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m10, [pw_16380]
+ lea r11, [pb_right_ext_mask]
+
+ DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+
+ ; if (edge & has_right) align_w_to_32
+ ; else w -= 3 (edge-extend the last 3 columns), and use that as limit in x loop
+ test edged, 2 ; has_right
+ jnz .align
+ mov xlimq, -3
+ jmp .loop
+.align:
+ add wd, 31
+ and wd, ~31
+ xor xlimd, xlimd
+
+ ; main y loop for horizontal filter
+.loop:
+ mov srcptrq, srcq
+ mov dstptrq, dstq
+ lea xq, [wq+xlimq]
+
+ ; load left edge pixels
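+ ; all three paths below leave xm0 with bytes 10-12 = the 3 pixels to the
+ ; left of the unit and bytes 13-15 = the first 3 unit pixels, as expected
+ ; by the palignr chain in the main loop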
+ test edged, 1 ; have_left
+ jz .emu_left
+ test leftq, leftq ; left == NULL for the edge-extended bottom/top
+ jz .load_left_combined
+ movd xm0, [leftq]
+ pinsrd xm0, [srcq], 1
+ pslldq xm0, 9
+ jmp .left_load_done
+.load_left_combined:
+ movq xm0, [srcq-5]
+ pslldq xm0, 8 ; place the 3 left pixels and src[0..2] in bytes 10-15
+ jmp .left_load_done
+.emu_left:
+ movd xm0, [srcq]
+ pshufb xm0, [pb_14x0_1_2]
+
+ ; load right edge pixels
+.left_load_done:
+ cmp xd, 32
+ jg .main_load
+ test xd, xd
+ jg .load_and_splat
+ je .splat_right
+
+ ; for very small images (w=[1-2]), edge-extend the pixels already loaded
+ ; in xm0; ugly, but this only runs in very odd cases
+ add wd, wd
+ pshufb xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
+ shr wd, 1
+
+ ; main x loop, mostly this starts in .main_load
+.splat_right:
+ ; no need to load new pixels; just broadcast the last pixel of the
+ ; (possibly already edge-extended) previous load left in xm0
+ pshufb xm1, xm0, [pb_15]
+ jmp .main_loop
+.load_and_splat:
+ ; load new pixels and extend edge for right-most
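+ ; reading pb_right_ext_mask at byte offset 32-x yields x bytes of 0xff
+ ; followed by zeroes; this keeps the x remaining valid pixels and fills the
+ ; rest of the vector with a broadcast of the right-most valid pixel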
+ movu m1, [srcptrq+3]
+ sub r11, xq
+ movu m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
+ add r11, xq
+ vpbroadcastb m3, [srcptrq+2+xq]
+ pand m1, m2
+ pandn m3, m2, m3
+ por m1, m3
+ jmp .main_loop
+.main_load:
+ ; load subsequent line
+ movu m1, [srcptrq+3]
+.main_loop:
+ vinserti128 m0, xm1, 1
+
+ palignr m2, m1, m0, 10
+ palignr m3, m1, m0, 11
+ palignr m4, m1, m0, 12
+ palignr m5, m1, m0, 13
+ palignr m6, m1, m0, 14
+ palignr m7, m1, m0, 15
+
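+ ; the 7-tap filter is symmetric, so interleave the pixel pairs equidistant
+ ; from the center and apply one coefficient per pair with pmaddubsw:
+ ; fh[0]*(p[x-3]+p[x+3]), fh[1]*(p[x-2]+p[x+2]), fh[2]*(p[x-1]+p[x+1]);
+ ; the center pixel is multiplied by the fh[3]+128 word coefficient (pmullw)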
+ punpcklbw m0, m2, m1
+ punpckhbw m2, m1
+ punpcklbw m8, m3, m7
+ punpckhbw m3, m7
+ punpcklbw m7, m4, m6
+ punpckhbw m4, m6
+ pxor m9, m9
+ punpcklbw m6, m5, m9
+ punpckhbw m5, m9
+
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m8, m14
+ pmaddubsw m3, m14
+ pmaddubsw m7, m13
+ pmaddubsw m4, m13
+ pmullw m6, m12
+ pmullw m5, m12
+ ; note that m6/5 are unsigned here, whereas the others are signed
+ psubw m0, m10
+ psubw m2, m10
+ paddw m0, m8
+ paddw m2, m3
+ paddw m0, m7
+ paddw m2, m4
+ paddw m0, m6
+ paddw m2, m5
+ psraw m0, 3
+ psraw m2, 3
+ paddw m0, m11
+ paddw m2, m11
+ mova [dstptrq], xm0
+ mova [dstptrq+16], xm2
+ vextracti128 [dstptrq+32], m0, 1
+ vextracti128 [dstptrq+48], m2, 1
+ vextracti128 xm0, m1, 1
+ add srcptrq, 32
+ add dstptrq, 64
+ sub xq, 32
+ cmp xd, 32
+ jg .main_load
+ test xd, xd
+ jg .load_and_splat
+ cmp xd, xlimd
+ jg .splat_right
+
+ add srcq, strideq
+ add dstq, 384*2
+ dec hd
+ jg .loop
+ RET
+
+cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
+ vpbroadcastd m14, [fvq+4]
+ vpbroadcastd m15, [fvq]
+ vpbroadcastd m13, [pw_0_128]
+ paddw m14, m13
+ vpbroadcastd m12, [pd_1024]
+
+ DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
+ mov ylimd, edged
+ and ylimd, 8 ; have_bottom
+ shr ylimd, 2
+ sub ylimd, 3
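+ ; ylim is -1 if two extra rows below the unit are available (have_bottom),
+ ; -3 otherwise; it bounds the bottom edge extension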
+
+ ; main x loop for vertical filter, does one column of 16 pixels
+.loop_x:
+ mova m3, [midq] ; middle line
+
+ ; load top pixels
+ test edged, 4 ; have_top
+ jz .emu_top
+ mova m0, [midq-384*4]
+ mova m2, [midq-384*2]
+ mova m1, m0
+ jmp .load_bottom_pixels
+.emu_top:
+ mova m0, m3
+ mova m1, m3
+ mova m2, m3
+
+ ; load bottom pixels
+.load_bottom_pixels:
+ mov yd, hd
+ mov mptrq, midq
+ mov dstptrq, dstq
+ add yd, ylimd
+ jg .load_threelines
+
+ ; the remainder here is somewhat messy but only runs in very weird
+ ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
+ ; so performance is not terribly important here...
+ je .load_twolines
+ cmp yd, -1
+ je .load_oneline
+ ; h == 1 case
+ mova m5, m3
+ mova m4, m3
+ mova m6, m3
+ jmp .loop
+.load_oneline:
+ ; h == 2 case
+ mova m4, [midq+384*2]
+ mova m5, m4
+ mova m6, m4
+ jmp .loop
+.load_twolines:
+ ; h == 3 case
+ mova m4, [midq+384*2]
+ mova m5, [midq+384*4]
+ mova m6, m5
+ jmp .loop
+.load_threelines:
+ ; h > 3 case
+ mova m4, [midq+384*2]
+ mova m5, [midq+384*4]
+ ; third line loaded in main loop below
+
+ ; main y loop for vertical filter
+.loop_load:
+ ; load one line into m6. If that row is no longer available, this load is
+ ; skipped (we jump straight to .loop) and m6 keeps the data from the
+ ; previous row. The loop is structured so the common case runs fastest
+ mova m6, [mptrq+384*6]
+.loop:
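+ ; the vertical filter is symmetric as well: add the row pairs equidistant
+ ; from the center, apply fv[0]/fv[1] and fv[2]/(fv[3]+128) with pmaddwd,
+ ; then round with +1024 and shift right by 11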
+ paddw m7, m0, m6
+ paddw m8, m1, m5
+ paddw m9, m2, m4
+ punpcklwd m10, m7, m8
+ punpckhwd m7, m8
+ punpcklwd m11, m9, m3
+ punpckhwd m9, m3
+ pmaddwd m10, m15
+ pmaddwd m7, m15
+ pmaddwd m11, m14
+ pmaddwd m9, m14
+ paddd m10, m11
+ paddd m7, m9
+ paddd m10, m12
+ paddd m7, m12
+ psrad m10, 11
+ psrad m7, 11
+ packssdw m10, m7
+ packuswb m10, m10
+ vpermq m10, m10, q3120
+ mova [dstptrq], xm10
+ ; shift pixels one position
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ mova m4, m5
+ mova m5, m6
+ add dstptrq, strideq
+ add mptrq, 384*2
+ dec yd
+ jg .loop_load
+ ; for the bottom pixels, continue using m6 (as extended edge)
+ cmp yd, ylimd
+ jg .loop
+
+ add dstq, 16
+ add midq, 32
+ sub wd, 16
+ jg .loop_x
+ RET
+%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/looprestoration_init.c
@@ -1,0 +1,88 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if BITDEPTH == 8 && ARCH_X86_64
+void dav1d_wiener_filter_h_avx2(int16_t *dst, const pixel (*left)[4],
+ const pixel *src, ptrdiff_t stride,
+ const int16_t fh[7], const intptr_t w,
+ int h, enum LrEdgeFlags edges);
+void dav1d_wiener_filter_v_avx2(pixel *dst, ptrdiff_t stride,
+ const int16_t *mid, int w, int h,
+ const int16_t fv[7], enum LrEdgeFlags edges);
+
+// Future potential optimizations:
+// - special chroma versions which don't filter [0]/[6];
+// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top
+// to bottom) instead of scanline-ordered should be faster since then the
+// if (have_left) and similar conditions run only once instead of per line;
+// - filter_v_avx2 currently runs 16 pixels per iteration; it should be possible
+// to run 32 (like filter_h_avx2), and then all the vpermqs can go;
+// - maybe split the top/bottom filter_h_avx2 calls out from the main-body one,
+// since then the have_left condition can be inlined;
+// - consider having the wrapper (wiener_filter_avx2) also in hand-written
+// assembly, so the setup overhead is minimized.
+
+static void wiener_filter_avx2(pixel *const dst, const ptrdiff_t dst_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf, const ptrdiff_t lpf_stride,
+ const int w, const int h, const int16_t fh[7],
+ const int16_t fv[7], const enum LrEdgeFlags edges)
+{
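+ // mid holds the 16-bit output of the horizontal pass: one 384-entry row
+ // per line, with 2 rows of context above and below the up-to-64 body rows
+ // (68 rows total)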
+ ALIGN_STK_32(int16_t, mid, 68 * 384,);
+
+ // horizontal filter
+ dav1d_wiener_filter_h_avx2(&mid[2 * 384], left, dst, dst_stride,
+ fh, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ dav1d_wiener_filter_h_avx2(mid, NULL, lpf, lpf_stride,
+ fh, w, 2, edges);
+ if (edges & LR_HAVE_BOTTOM)
+ dav1d_wiener_filter_h_avx2(&mid[(2 + h) * 384], NULL,
+ lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
+ fh, w, 2, edges);
+
+ dav1d_wiener_filter_v_avx2(dst, dst_stride, &mid[2*384], w, h, fv, edges);
+}
+#endif
+
+void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+ c->wiener = wiener_filter_avx2;
+#endif
+}
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -67,6 +67,8 @@
{ "itx_10bpc", checkasm_check_itx_10bpc },
{ "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
{ "loopfilter_10bpc", checkasm_check_loopfilter_10bpc },
+ { "looprestoration_8bpc", checkasm_check_looprestoration_8bpc },
+ { "looprestoration_10bpc", checkasm_check_looprestoration_10bpc },
{ "mc_8bpc", checkasm_check_mc_8bpc },
{ "mc_10bpc", checkasm_check_mc_10bpc },
{ 0 }
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -45,6 +45,9 @@
void checkasm_check_loopfilter_8bpc(void);
void checkasm_check_loopfilter_10bpc(void);
+void checkasm_check_looprestoration_8bpc(void);
+void checkasm_check_looprestoration_10bpc(void);
+
void checkasm_check_mc_8bpc(void);
void checkasm_check_mc_10bpc(void);
--- /dev/null
+++ b/tests/checkasm/looprestoration.c
@@ -1,0 +1,127 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include <string.h>
+
+#include "src/levels.h"
+#include "src/looprestoration.h"
+
+static void init_tmp(pixel *buf, const ptrdiff_t stride,
+ const int w, const int h)
+{
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++)
+ buf[x] = rand() & ((1 << BITDEPTH) - 1);
+ buf += PXSTRIDE(stride);
+ }
+}
+
+static int cmp2d(const pixel *a, const pixel *b, const ptrdiff_t stride,
+ const int w, const int h)
+{
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++)
+ if (a[x] != b[x]) return (y << 16) | x;
+ a += PXSTRIDE(stride);
+ b += PXSTRIDE(stride);
+ }
+ return -1;
+}
+
+static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
+ ALIGN_STK_32(pixel, c_dst, 448 * 64,);
+ ALIGN_STK_32(pixel, a_dst, 448 * 64,);
+ ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+ pixel left[64][4];
+
+ declare_func(void, pixel *dst, ptrdiff_t dst_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf, ptrdiff_t lpf_stride,
+ int w, int h, const int16_t filterh[7],
+ const int16_t filterv[7], enum LrEdgeFlags edges);
+
+ init_tmp(c_dst, 448 * sizeof(pixel), 448, 64);
+ init_tmp(h_edge, 448 * sizeof(pixel), 448, 8);
+ init_tmp(left[0], 4 * sizeof(pixel), 4, 64);
+
+ for (int pl = 0; pl < 2; pl++) {
+ if (check_func(c->wiener, "wiener_%s_%dbpc",
+ pl ? "chroma" : "luma", BITDEPTH))
+ {
+ int16_t filter[2][3], filter_v[7], filter_h[7];
+
+ filter[0][0] = pl ? 0 : (rand() & 15) - 5;
+ filter[0][1] = (rand() & 31) - 23;
+ filter[0][2] = (rand() & 63) - 17;
+ filter[1][0] = pl ? 0 : (rand() & 15) - 5;
+ filter[1][1] = (rand() & 31) - 23;
+ filter[1][2] = (rand() & 63) - 17;
+
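+ // build the symmetric 7-tap filters; the chosen center tap makes the
+ // coefficients sum to 0 (the filter implementations add the implicit
+ // +128 DC term to the center tap themselves)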
+ filter_h[0] = filter_h[6] = filter[0][0];
+ filter_h[1] = filter_h[5] = filter[0][1];
+ filter_h[2] = filter_h[4] = filter[0][2];
+ filter_h[3] = -((filter_h[0] + filter_h[1] + filter_h[2]) * 2);
+
+ filter_v[0] = filter_v[6] = filter[1][0];
+ filter_v[1] = filter_v[5] = filter[1][1];
+ filter_v[2] = filter_v[4] = filter[1][2];
+ filter_v[3] = -((filter_v[0] + filter_v[1] + filter_v[2]) * 2);
+
+ const int base_w = 1 + (rand() % 384);
+ const int base_h = 1 + (rand() & 63);
+ for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
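+ // when the right/bottom edge is present the unit is not truncated by the
+ // frame border, so test the maximum size; otherwise use a random size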
+ const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
+ const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
+
+ memcpy(a_dst, c_dst, sizeof(c_dst));
+
+ call_ref(c_dst + 32, 448 * sizeof(pixel), left,
+ h_edge + 32, 448 * sizeof(pixel),
+ w, h, filter_h, filter_v, edges);
+ call_new(a_dst + 32, 448 * sizeof(pixel), left,
+ h_edge + 32, 448 * sizeof(pixel),
+ w, h, filter_h, filter_v, edges);
+ const int res = cmp2d(c_dst + 32, a_dst + 32, 448 * sizeof(pixel), w, h);
+ if (res != -1) fail();
+ }
+ bench_new(a_dst + 32, 448 * sizeof(pixel), left,
+ h_edge + 32, 448 * sizeof(pixel),
+ 256, 64, filter_h, filter_v, 0xf);
+ }
+ }
+ report("wiener");
+}
+
+void bitfn(checkasm_check_looprestoration)(void) {
+ Dav1dLoopRestorationDSPContext c;
+
+ bitfn(dav1d_loop_restoration_dsp_init)(&c);
+
+ check_wiener(&c);
+}
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -38,6 +38,7 @@
'checkasm/ipred.c',
'checkasm/itx.c',
'checkasm/loopfilter.c',
+ 'checkasm/looprestoration.c',
'checkasm/mc.c',
)