shithub: dav1d

--- a/include/common/intops.h

+++ b/include/common/intops.h

@@ -31,54 +31,19 @@

 #include <stdint.h>

 #include "common/attributes.h"

+#define imax(a, b) (int)((int)(a) > (int)(b) ? (a) : (b))

+#define imin(a, b) (int)((int)(a) < (int)(b) ? (a) : (b))

+#define umax(a, b) (unsigned)((unsigned)(a) > (unsigned)(b) ? (a) : (b))

+#define umin(a, b) (unsigned)((unsigned)(a) < (unsigned)(b) ? (a) : (b))

+#define iclip_u8(v) iclip((v), 0, 255)

+#define apply_sign(v, s) ((int)(s) < 0 ? -(int)(v) : (int)(v))

+#define apply_sign64(v, s) ((int64_t)(s) < 0 ? -(int)(v) : (int)(v))

+#define ulog2(v) (int)(31 - clz((unsigned)(v)))

+#define u64log2(v) (int)(63 - clzll((uint64_t)(v)))

+#define inv_recenter(r, v) (unsigned)((unsigned)(v) > ((unsigned)(r)<<1) ? (v) : (((v)&1) == 0) ? (((unsigned)(v)>>1) + (unsigned)(r)) : ((unsigned)(r) - (((unsigned)(v)+1)>>1)))

-static inline int imax(const int a, const int b) {

-    return a > b ? a : b;

-}

-static inline int imin(const int a, const int b) {

-    return a < b ? a : b;

-}

-static inline unsigned umax(const unsigned a, const unsigned b) {

-    return a > b ? a : b;

-}

-static inline unsigned umin(const unsigned a, const unsigned b) {

-    return a < b ? a : b;

-}

 static inline int iclip(const int v, const int min, const int max) {

     return v < min ? min : v > max ? max : v;

-}

-static inline int iclip_u8(const int v) {

-    return iclip(v, 0, 255);

-}

-static inline int apply_sign(const int v, const int s) {

-    return s < 0 ? -v : v;

-}

-static inline int apply_sign64(const int v, const int64_t s) {

-    return s < 0 ? -v : v;

-}

-static inline int ulog2(const unsigned v) {

-    return 31 - clz(v);

-}

-static inline int u64log2(const uint64_t v) {

-    return 63 - clzll(v);

-}

-static inline unsigned inv_recenter(const unsigned r, const unsigned v) {

-    if (v > (r << 1))

-        return v;

-    else if ((v & 1) == 0)

-        return (v >> 1) + r;

-    else

-        return r - ((v + 1) >> 1);

 }

 #endif /* DAV1D_COMMON_INTOPS_H */

--- a/src/looprestoration_tmpl.c

+++ b/src/looprestoration_tmpl.c

@@ -156,15 +156,12 @@

     const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);

     for (int j = 0; j < h + 6; j++) {

         for (int i = 0; i < w; i++) {

-            int sum = (1 << (bitdepth + 6));

+            int sum = (1 << (bitdepth + 6)) +

 #if BITDEPTH == 8

-            sum += tmp_ptr[i + 3] * 128;

+            tmp_ptr[i + 3] * 128 +

 #endif

+			(((tmp_ptr[i+0]*filter[0][0] + tmp_ptr[i+1]*filter[0][1]) + (tmp_ptr[i+2]*filter[0][2] + tmp_ptr[i+3]*filter[0][3])) + (tmp_ptr[i+4]*filter[0][4] + tmp_ptr[i+5]*filter[0][5])) + tmp_ptr[i+6]*filter[0][6];

-            for (int k = 0; k < 7; k++) {

-                sum += tmp_ptr[i + k] * filter[0][k];

-            }

             hor_ptr[i] =

                 iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);

@@ -177,11 +174,7 @@

     const int round_offset = 1 << (bitdepth + (round_bits_v - 1));

     for (int j = 0; j < h; j++) {

         for (int i = 0; i < w; i++) {

-            int sum = -round_offset;

-            for (int k = 0; k < 7; k++) {

-                sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k];

-            }

+            int sum = -round_offset + hor[(j+0)*REST_UNIT_STRIDE+i]*filter[1][0] + hor[(j+1)*REST_UNIT_STRIDE+i]*filter[1][1] + hor[(j+2)*REST_UNIT_STRIDE+i]*filter[1][2] + hor[(j+3)*REST_UNIT_STRIDE+i]*filter[1][3] + hor[(j+4)*REST_UNIT_STRIDE+i]*filter[1][4] + hor[(j+5)*REST_UNIT_STRIDE+i]*filter[1][5] + hor[(j+6)*REST_UNIT_STRIDE+i]*filter[1][6];

             p[j * PXSTRIDE(p_stride) + i] =

                 iclip_pixel((sum + rounding_off_v) >> round_bits_v);

--- a/src/mc_tmpl.c

+++ b/src/mc_tmpl.c

@@ -809,13 +809,17 @@

     src -= 3 * PXSTRIDE(src_stride);

     for (int y = 0; y < 15; y++, mx += abcd[1]) {

-        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {

-            const int8_t *const filter =

-                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];

+        int tmx = mx;

-            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,

-                                         7 - intermediate_bits);

-        }

+        mid_ptr[0] = FILTER_WARP_RND(src, 0, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];

+        mid_ptr[1] = FILTER_WARP_RND(src, 1, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];

+        mid_ptr[2] = FILTER_WARP_RND(src, 2, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];

+        mid_ptr[3] = FILTER_WARP_RND(src, 3, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];

+        mid_ptr[4] = FILTER_WARP_RND(src, 4, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];

+        mid_ptr[5] = FILTER_WARP_RND(src, 5, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];

+        mid_ptr[6] = FILTER_WARP_RND(src, 6, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];

+        mid_ptr[7] = FILTER_WARP_RND(src, 7, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];

         src += PXSTRIDE(src_stride);

         mid_ptr += 8;

@@ -822,12 +826,17 @@

     mid_ptr = &mid[3 * 8];

     for (int y = 0; y < 8; y++, my += abcd[3]) {

-        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {

-            const int8_t *const filter =

-                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];

+        int tmy = my;

-            tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS;

-        }

+        tmp[0] = FILTER_WARP_RND(mid_ptr, 0, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];

+        tmp[1] = FILTER_WARP_RND(mid_ptr, 1, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];

+        tmp[2] = FILTER_WARP_RND(mid_ptr, 2, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];

+        tmp[3] = FILTER_WARP_RND(mid_ptr, 3, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];

+        tmp[4] = FILTER_WARP_RND(mid_ptr, 4, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];

+        tmp[5] = FILTER_WARP_RND(mid_ptr, 5, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];

+        tmp[6] = FILTER_WARP_RND(mid_ptr, 6, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];

+        tmp[7] = FILTER_WARP_RND(mid_ptr, 7, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];

         mid_ptr += 8;

         tmp += tmp_stride;