ref: 1f32abd286557fc2fb1bee3dbf31c2dcce337c17
parent: 1d7754830ec78b9124c4c8be198aa802669675db
author: Ronald S. Bultje <[email protected]>
date: Tue Oct 16 04:46:04 EDT 2018
Add infrastructure for LR SIMD and unit tests.

wiener_luma_8bpc_c: 326272.1
wiener_luma_8bpc_avx2: 19841.5

Decoding time of first 1000 frames of Chimera-8bit-1920x1080.ivf
goes from 27.471 to 23.558 seconds.
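(Roughly a 16x speedup of the wiener_luma_8bpc kernel in checkasm, and about 14% less overall decoding time, going by the numbers above.)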
--- a/src/looprestoration.c
+++ b/src/looprestoration.c
@@ -65,6 +65,11 @@
pixel_copy(dst_l, p, unit_w);
pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
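+ // the 3 pixels immediately to the left of the unit come from the saved
+ // left[] columns rather than from the frame row p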
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
}
pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
@@ -81,6 +86,11 @@
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
}
// Inner UNIT_WxSTRIPE_H
@@ -560,4 +570,8 @@
void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
c->wiener = wiener_c;
c->selfguided = selfguided_c;
+
+#if ARCH_X86 && BITDEPTH == 8
+ bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
+#endif
}
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -40,21 +40,31 @@
LR_HAVE_BOTTOM = 1 << 3,
};
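+// per-row copy of the 4 pixels just left of the restoration unit; falls back
+// to an opaque pointer when this header is included without a BITDEPTH (and
+// thus without a pixel type) defined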
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row)[4];
+#else
+typedef const void *const_left_pixel_row;
+#endif
+
// Although the spec applies restoration filters over 4x4 blocks, the wiener
// filter can be applied to a bigger surface.
// * w is constrained by the restoration unit size (w <= 256)
// * h is constrained by the stripe height (h <= 64)
-typedef void (*wienerfilter_fn)(pixel *dst, ptrdiff_t dst_stride,
- const void *left /*const pixel (*left)[4]*/,
- const pixel *lpf, ptrdiff_t lpf_stride,
- int w, int h, const int16_t filterh[7],
- const int16_t filterv[7], enum LrEdgeFlags edges);
+#define decl_wiener_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const_left_pixel_row left, \
+ const pixel *lpf, ptrdiff_t lpf_stride, \
+ int w, int h, const int16_t filterh[7], \
+ const int16_t filterv[7], enum LrEdgeFlags edges)
+typedef decl_wiener_filter_fn(*wienerfilter_fn);
-typedef void (*selfguided_fn)(pixel *dst, ptrdiff_t dst_stride,
- const void *left /*const pixel (*left)[4]*/,
- const pixel *lpf, ptrdiff_t lpf_stride,
- int w, int h, int sgr_idx, const int16_t sgr_w[2],
- const enum LrEdgeFlags edges);
+#define decl_selfguided_filter_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+ const_left_pixel_row left, \
+ const pixel *lpf, ptrdiff_t lpf_stride, \
+ int w, int h, int sgr_idx, const int16_t sgr_w[2], \
+ const enum LrEdgeFlags edges)
+typedef decl_selfguided_filter_fn(*selfguided_fn);
typedef struct Dav1dLoopRestorationDSPContext {
wienerfilter_fn wiener;
@@ -63,5 +73,8 @@
void dav1d_loop_restoration_dsp_init_8bpc(Dav1dLoopRestorationDSPContext *c);
void dav1d_loop_restoration_dsp_init_10bpc(Dav1dLoopRestorationDSPContext *c);
+
+void dav1d_loop_restoration_dsp_init_x86_8bpc(Dav1dLoopRestorationDSPContext *c);
+void dav1d_loop_restoration_dsp_init_x86_10bpc(Dav1dLoopRestorationDSPContext *c);
#endif /* __DAV1D_SRC_LOOPRESTORATION_H__ */
--- a/src/meson.build
+++ b/src/meson.build
@@ -101,6 +101,7 @@
'x86/ipred_init.c',
'x86/itx_init.c',
'x86/loopfilter_init.c',
+ 'x86/looprestoration_init.c',
'x86/mc_init.c',
)
@@ -110,6 +111,7 @@
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
+ 'x86/looprestoration.asm',
'x86/mc.asm',
)
--- /dev/null
+++ b/src/x86/looprestoration.asm
@@ -1,0 +1,303 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+pb_right_ext_mask: times 32 db 0xff
+ times 32 db 0
+pb_14x0_1_2: times 14 db 0
+ db 1, 2
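+; shuffles used for w=1/2: clamp source byte indices at 13/14, the position
+; of the last valid pixel in xm0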
+pb_0_to_15_min_n: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13
+ db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14
+pb_15: times 16 db 15
+pw_128: times 2 dw 128
+pw_2048: times 2 dw 2048
+pw_16380: times 2 dw 16380
+pw_0_128: dw 0, 128
+pd_1024: dd 1024
+
+SECTION .text
+
+INIT_YMM avx2
+cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
+ vpbroadcastb m15, [fhq+0]
+ vpbroadcastb m14, [fhq+2]
+ vpbroadcastb m13, [fhq+4]
+ vpbroadcastw m12, [fhq+6]
+ vpbroadcastd m9, [pw_128]
+ paddw m12, m9
+ vpbroadcastd m11, [pw_2048]
+ vpbroadcastd m10, [pw_16380]
+ lea r11, [pb_right_ext_mask]
+
+ DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
+
+ ; if (edge & has_right) align_w_to_32
+ ; else w -= 3 (edge-extend the last 3 columns), and use that as limit in x loop
+ test edged, 2 ; has_right
+ jnz .align
+ mov xlimq, -3
+ jmp .loop
+.align:
+ add wd, 31
+ and wd, ~31
+ xor xlimd, xlimd
+
+ ; main y loop for horizontal filter
+.loop:
+ mov srcptrq, srcq
+ mov dstptrq, dstq
+ lea xq, [wq+xlimq]
+
+ ; load left edge pixels
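+ ; all three paths below leave xm0 with bytes 10-12 = the 3 pixels to the
+ ; left of the unit and bytes 13-15 = the first 3 unit pixels, as expected
+ ; by the palignr chain in the main loop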
+ test edged, 1 ; have_left
+ jz .emu_left
+ test leftq, leftq ; left == NULL for the edge-extended bottom/top
+ jz .load_left_combined
+ movd xm0, [leftq]
+ pinsrd xm0, [srcq], 1
+ pslldq xm0, 9
+ jmp .left_load_done
+.load_left_combined:
+ movq xm0, [srcq-5]
+ pslldq xm0, 8 ; place the 3 left pixels and src[0..2] in bytes 10-15
+ jmp .left_load_done
+.emu_left:
+ movd xm0, [srcq]
+ pshufb xm0, [pb_14x0_1_2]
+
+ ; load right edge pixels
+.left_load_done:
+ cmp xd, 32
+ jg .main_load
+ test xd, xd
+ jg .load_and_splat
+ je .splat_right
+
+ ; for very small images (w=[1-2]), edge-extend the pixels already loaded
+ ; in xm0; ugly, but this only runs in very odd cases
+ add wd, wd
+ pshufb xm0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
+ shr wd, 1
+
+ ; main x loop, mostly this starts in .main_load
+.splat_right:
+ ; no need to load new pixels; just broadcast the last pixel of the
+ ; (possibly already edge-extended) previous load left in xm0
+ pshufb xm1, xm0, [pb_15]
+ jmp .main_loop
+.load_and_splat:
+ ; load new pixels and extend edge for right-most
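+ ; reading pb_right_ext_mask at byte offset 32-x yields x bytes of 0xff
+ ; followed by zeroes; this keeps the x remaining valid pixels and fills the
+ ; rest of the vector with a broadcast of the right-most valid pixel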
+ movu m1, [srcptrq+3]
+ sub r11, xq
+ movu m2, [r11-pb_right_ext_mask+pb_right_ext_mask+32]
+ add r11, xq
+ vpbroadcastb m3, [srcptrq+2+xq]
+ pand m1, m2
+ pandn m3, m2, m3
+ por m1, m3
+ jmp .main_loop
+.main_load:
+ ; load subsequent line
+ movu m1, [srcptrq+3]
+.main_loop:
+ vinserti128 m0, xm1, 1
+
+ palignr m2, m1, m0, 10
+ palignr m3, m1, m0, 11
+ palignr m4, m1, m0, 12
+ palignr m5, m1, m0, 13
+ palignr m6, m1, m0, 14
+ palignr m7, m1, m0, 15
+
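+ ; the 7-tap filter is symmetric, so interleave the pixel pairs equidistant
+ ; from the center and apply one coefficient per pair with pmaddubsw:
+ ; fh[0]*(p[x-3]+p[x+3]), fh[1]*(p[x-2]+p[x+2]), fh[2]*(p[x-1]+p[x+1]);
+ ; the center pixel is multiplied by the fh[3]+128 word coefficient (pmullw)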
+ punpcklbw m0, m2, m1
+ punpckhbw m2, m1
+ punpcklbw m8, m3, m7
+ punpckhbw m3, m7
+ punpcklbw m7, m4, m6
+ punpckhbw m4, m6
+ pxor m9, m9
+ punpcklbw m6, m5, m9
+ punpckhbw m5, m9
+
+ pmaddubsw m0, m15
+ pmaddubsw m2, m15
+ pmaddubsw m8, m14
+ pmaddubsw m3, m14
+ pmaddubsw m7, m13
+ pmaddubsw m4, m13
+ pmullw m6, m12
+ pmullw m5, m12
+ ; note that m6/5 are unsigned here, whereas the others are signed
+ psubw m0, m10
+ psubw m2, m10
+ paddw m0, m8
+ paddw m2, m3
+ paddw m0, m7
+ paddw m2, m4
+ paddw m0, m6
+ paddw m2, m5
+ psraw m0, 3
+ psraw m2, 3
+ paddw m0, m11
+ paddw m2, m11
+ mova [dstptrq], xm0
+ mova [dstptrq+16], xm2
+ vextracti128 [dstptrq+32], m0, 1
+ vextracti128 [dstptrq+48], m2, 1
+ vextracti128 xm0, m1, 1
+ add srcptrq, 32
+ add dstptrq, 64
+ sub xq, 32
+ cmp xd, 32
+ jg .main_load
+ test xd, xd
+ jg .load_and_splat
+ cmp xd, xlimd
+ jg .splat_right
+
+ add srcq, strideq
+ add dstq, 384*2
+ dec hd
+ jg .loop
+ RET
+
+cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
+ vpbroadcastd m14, [fvq+4]
+ vpbroadcastd m15, [fvq]
+ vpbroadcastd m13, [pw_0_128]
+ paddw m14, m13
+ vpbroadcastd m12, [pd_1024]
+
+ DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
+ mov ylimd, edged
+ and ylimd, 8 ; have_bottom
+ shr ylimd, 2
+ sub ylimd, 3
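+ ; ylim is -1 if two extra rows below the unit are available (have_bottom),
+ ; -3 otherwise; it bounds the bottom edge extension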
+
+ ; main x loop for vertical filter, does one column of 16 pixels
+.loop_x:
+ mova m3, [midq] ; middle line
+
+ ; load top pixels
+ test edged, 4 ; have_top
+ jz .emu_top
+ mova m0, [midq-384*4]
+ mova m2, [midq-384*2]
+ mova m1, m0
+ jmp .load_bottom_pixels
+.emu_top:
+ mova m0, m3
+ mova m1, m3
+ mova m2, m3
+
+ ; load bottom pixels
+.load_bottom_pixels:
+ mov yd, hd
+ mov mptrq, midq
+ mov dstptrq, dstq
+ add yd, ylimd
+ jg .load_threelines
+
+ ; the remainder here is somewhat messy but only runs in very weird
+ ; circumstances at the bottom of the image in very small blocks (h=[1-3]),
+ ; so performance is not terribly important here...
+ je .load_twolines
+ cmp yd, -1
+ je .load_oneline
+ ; h == 1 case
+ mova m5, m3
+ mova m4, m3
+ mova m6, m3
+ jmp .loop
+.load_oneline:
+ ; h == 2 case
+ mova m4, [midq+384*2]
+ mova m5, m4
+ mova m6, m4
+ jmp .loop
+.load_twolines:
+ ; h == 3 case
+ mova m4, [midq+384*2]
+ mova m5, [midq+384*4]
+ mova m6, m5
+ jmp .loop
+.load_threelines:
+ ; h > 3 case
+ mova m4, [midq+384*2]
+ mova m5, [midq+384*4]
+ ; third line loaded in main loop below
+
+ ; main y loop for vertical filter
+.loop_load:
+ ; load one line into m6. If that row is no longer available, this load is
+ ; skipped (we jump straight to .loop) and m6 keeps the data from the
+ ; previous row. The loop is structured so the common case runs fastest
+ mova m6, [mptrq+384*6]
+.loop:
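+ ; the vertical filter is symmetric as well: add the row pairs equidistant
+ ; from the center, apply fv[0]/fv[1] and fv[2]/(fv[3]+128) with pmaddwd,
+ ; then round with +1024 and shift right by 11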
+ paddw m7, m0, m6
+ paddw m8, m1, m5
+ paddw m9, m2, m4
+ punpcklwd m10, m7, m8
+ punpckhwd m7, m8
+ punpcklwd m11, m9, m3
+ punpckhwd m9, m3
+ pmaddwd m10, m15
+ pmaddwd m7, m15
+ pmaddwd m11, m14
+ pmaddwd m9, m14
+ paddd m10, m11
+ paddd m7, m9
+ paddd m10, m12
+ paddd m7, m12
+ psrad m10, 11
+ psrad m7, 11
+ packssdw m10, m7
+ packuswb m10, m10
+ vpermq m10, m10, q3120
+ mova [dstptrq], xm10
+ ; shift pixels one position
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ mova m4, m5
+ mova m5, m6
+ add dstptrq, strideq
+ add mptrq, 384*2
+ dec yd
+ jg .loop_load
+ ; for the bottom pixels, continue using m6 (as extended edge)
+ cmp yd, ylimd
+ jg .loop
+
+ add dstq, 16
+ add midq, 32
+ sub wd, 16
+ jg .loop_x
+ RET
+%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/looprestoration_init.c
@@ -1,0 +1,88 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/cpu.h"
+#include "src/looprestoration.h"
+
+#if BITDEPTH == 8 && ARCH_X86_64
+void dav1d_wiener_filter_h_avx2(int16_t *dst, const pixel (*left)[4],
+ const pixel *src, ptrdiff_t stride,
+ const int16_t fh[7], const intptr_t w,
+ int h, enum LrEdgeFlags edges);
+void dav1d_wiener_filter_v_avx2(pixel *dst, ptrdiff_t stride,
+ const int16_t *mid, int w, int h,
+ const int16_t fv[7], enum LrEdgeFlags edges);
+
+// Future potential optimizations:
+// - special chroma versions which don't filter [0]/[6];
+// - running filter_h_avx2 transposed (one col of 32 pixels per iteration, top
+// to bottom) instead of scanline-ordered should be faster since then the
+// if (have_left) and similar conditions run only once instead of per line;
+// - filter_v_avx2 currently runs 16 pixels per iteration; it should be possible
+// to run 32 (like filter_h_avx2), and then all the vpermqs can go;
+// - maybe split the top/bottom filter_h_avx2 calls out from the main-body one,
+// since then the have_left condition can be inlined;
+// - consider having the wrapper (wiener_filter_avx2) also in hand-written
+// assembly, so the setup overhead is minimized.
+
+static void wiener_filter_avx2(pixel *const dst, const ptrdiff_t dst_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf, const ptrdiff_t lpf_stride,
+ const int w, const int h, const int16_t fh[7],
+ const int16_t fv[7], const enum LrEdgeFlags edges)
+{
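+ // mid holds the 16-bit output of the horizontal pass: one 384-entry row
+ // per line, with 2 rows of context above and below the up-to-64 body rows
+ // (68 rows total)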
+ ALIGN_STK_32(int16_t, mid, 68 * 384,);
+
+ // horizontal filter
+ dav1d_wiener_filter_h_avx2(&mid[2 * 384], left, dst, dst_stride,
+ fh, w, h, edges);
+ if (edges & LR_HAVE_TOP)
+ dav1d_wiener_filter_h_avx2(mid, NULL, lpf, lpf_stride,
+ fh, w, 2, edges);
+ if (edges & LR_HAVE_BOTTOM)
+ dav1d_wiener_filter_h_avx2(&mid[(2 + h) * 384], NULL,
+ lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
+ fh, w, 2, edges);
+
+ dav1d_wiener_filter_v_avx2(dst, dst_stride, &mid[2*384], w, h, fv, edges);
+}
+#endif
+
+void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+ c->wiener = wiener_filter_avx2;
+#endif
+}
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -67,6 +67,8 @@
{ "itx_10bpc", checkasm_check_itx_10bpc },
{ "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
{ "loopfilter_10bpc", checkasm_check_loopfilter_10bpc },
+ { "looprestoration_8bpc", checkasm_check_looprestoration_8bpc },
+ { "looprestoration_10bpc", checkasm_check_looprestoration_10bpc },
{ "mc_8bpc", checkasm_check_mc_8bpc },
{ "mc_10bpc", checkasm_check_mc_10bpc },
{ 0 }
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -45,6 +45,9 @@
void checkasm_check_loopfilter_8bpc(void);
void checkasm_check_loopfilter_10bpc(void);
+void checkasm_check_looprestoration_8bpc(void);
+void checkasm_check_looprestoration_10bpc(void);
+
void checkasm_check_mc_8bpc(void);
void checkasm_check_mc_10bpc(void);
--- /dev/null
+++ b/tests/checkasm/looprestoration.c
@@ -1,0 +1,127 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "tests/checkasm/checkasm.h"
+
+#include <string.h>
+
+#include "src/levels.h"
+#include "src/looprestoration.h"
+
+static void init_tmp(pixel *buf, const ptrdiff_t stride,
+ const int w, const int h)
+{
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++)
+ buf[x] = rand() & ((1 << BITDEPTH) - 1);
+ buf += PXSTRIDE(stride);
+ }
+}
+
+static int cmp2d(const pixel *a, const pixel *b, const ptrdiff_t stride,
+ const int w, const int h)
+{
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++)
+ if (a[x] != b[x]) return (y << 16) | x;
+ a += PXSTRIDE(stride);
+ b += PXSTRIDE(stride);
+ }
+ return -1;
+}
+
+static void check_wiener(Dav1dLoopRestorationDSPContext *const c) {
+ ALIGN_STK_32(pixel, c_dst, 448 * 64,);
+ ALIGN_STK_32(pixel, a_dst, 448 * 64,);
+ ALIGN_STK_32(pixel, h_edge, 448 * 8,);
+ pixel left[64][4];
+
+ declare_func(void, pixel *dst, ptrdiff_t dst_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf, ptrdiff_t lpf_stride,
+ int w, int h, const int16_t filterh[7],
+ const int16_t filterv[7], enum LrEdgeFlags edges);
+
+ init_tmp(c_dst, 448 * sizeof(pixel), 448, 64);
+ init_tmp(h_edge, 448 * sizeof(pixel), 448, 8);
+ init_tmp(left[0], 4 * sizeof(pixel), 4, 64);
+
+ for (int pl = 0; pl < 2; pl++) {
+ if (check_func(c->wiener, "wiener_%s_%dbpc",
+ pl ? "chroma" : "luma", BITDEPTH))
+ {
+ int16_t filter[2][3], filter_v[7], filter_h[7];
+
+ filter[0][0] = pl ? 0 : (rand() & 15) - 5;
+ filter[0][1] = (rand() & 31) - 23;
+ filter[0][2] = (rand() & 63) - 17;
+ filter[1][0] = pl ? 0 : (rand() & 15) - 5;
+ filter[1][1] = (rand() & 31) - 23;
+ filter[1][2] = (rand() & 63) - 17;
+
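+ // build the symmetric 7-tap filters; the chosen center tap makes the
+ // coefficients sum to 0 (the filter implementations add the implicit
+ // +128 DC term to the center tap themselves)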
+ filter_h[0] = filter_h[6] = filter[0][0];
+ filter_h[1] = filter_h[5] = filter[0][1];
+ filter_h[2] = filter_h[4] = filter[0][2];
+ filter_h[3] = -((filter_h[0] + filter_h[1] + filter_h[2]) * 2);
+
+ filter_v[0] = filter_v[6] = filter[1][0];
+ filter_v[1] = filter_v[5] = filter[1][1];
+ filter_v[2] = filter_v[4] = filter[1][2];
+ filter_v[3] = -((filter_v[0] + filter_v[1] + filter_v[2]) * 2);
+
+ const int base_w = 1 + (rand() % 384);
+ const int base_h = 1 + (rand() & 63);
+ for (enum LrEdgeFlags edges = 0; edges <= 0xf; edges++) {
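+ // when the right/bottom edge is present the unit is not truncated by the
+ // frame border, so test the maximum size; otherwise use a random size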
+ const int w = edges & LR_HAVE_RIGHT ? 256 : base_w;
+ const int h = edges & LR_HAVE_BOTTOM ? 64 : base_h;
+
+ memcpy(a_dst, c_dst, sizeof(c_dst));
+
+ call_ref(c_dst + 32, 448 * sizeof(pixel), left,
+ h_edge + 32, 448 * sizeof(pixel),
+ w, h, filter_h, filter_v, edges);
+ call_new(a_dst + 32, 448 * sizeof(pixel), left,
+ h_edge + 32, 448 * sizeof(pixel),
+ w, h, filter_h, filter_v, edges);
+ const int res = cmp2d(c_dst + 32, a_dst + 32, 448 * sizeof(pixel), w, h);
+ if (res != -1) fail();
+ }
+ bench_new(a_dst + 32, 448 * sizeof(pixel), left,
+ h_edge + 32, 448 * sizeof(pixel),
+ 256, 64, filter_h, filter_v, 0xf);
+ }
+ }
+ report("wiener");
+}
+
+void bitfn(checkasm_check_looprestoration)(void) {
+ Dav1dLoopRestorationDSPContext c;
+
+ bitfn(dav1d_loop_restoration_dsp_init)(&c);
+
+ check_wiener(&c);
+}
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -38,6 +38,7 @@
'checkasm/ipred.c',
'checkasm/itx.c',
'checkasm/loopfilter.c',
+ 'checkasm/looprestoration.c',
'checkasm/mc.c',
)