ref: 6b5e8cc587d544ab399488fa032640268856da9c
parent: 3b14f94924987fc66302bb8b803ad9065816f93e
author: Sigrid Solveig Haflínudóttir <[email protected]>
date: Wed Dec 31 20:50:10 EST 1969
inline and unroll more
--- a/include/common/intops.h
+++ b/include/common/intops.h
@@ -31,54 +31,19 @@
#include <stdint.h>
#include "common/attributes.h"
+#define imax(a, b) ((int)(a) > (int)(b) ? (int)(a) : (int)(b)) /* larger of a and b, both converted to int first. NOTE: the macros in this group may evaluate an argument more than once; pass only side-effect-free expressions. */
+#define imin(a, b) ((int)(a) < (int)(b) ? (int)(a) : (int)(b)) /* smaller of a and b, both converted to int first */
+#define umax(a, b) ((unsigned)(a) > (unsigned)(b) ? (unsigned)(a) : (unsigned)(b)) /* larger of a and b, both converted to unsigned first */
+#define umin(a, b) ((unsigned)(a) < (unsigned)(b) ? (unsigned)(a) : (unsigned)(b)) /* smaller of a and b, both converted to unsigned first */
+#define iclip_u8(v) iclip((v), 0, 255) /* clamp v to [0, 255] through iclip() */
+#define apply_sign(v, s) ((int)(s) < 0 ? -(int)(v) : (int)(v)) /* -v when s (as int) is negative, otherwise v */
+#define apply_sign64(v, s) ((int64_t)(s) < 0 ? -(int)(v) : (int)(v)) /* as apply_sign, but s is tested as int64_t */
+#define ulog2(v) ((int)(31 - clz((unsigned)(v)))) /* 31 minus clz() of v taken as unsigned */
+#define u64log2(v) ((int)(63 - clzll((uint64_t)(v)))) /* 63 minus clzll() of v taken as uint64_t */
+#define inv_recenter(r, v) ((unsigned)(v) > ((unsigned)(r) << 1) ? (unsigned)(v) : ((unsigned)(v) & 1) == 0 ? ((unsigned)(v) >> 1) + (unsigned)(r) : (unsigned)(r) - (((unsigned)(v) + 1) >> 1)) /* v when v > 2*r; otherwise r + (v >> 1) for even v, r - ((v + 1) >> 1) for odd v; all arithmetic in unsigned */
-static inline int imax(const int a, const int b) {
- return a > b ? a : b;
-}
-
-static inline int imin(const int a, const int b) {
- return a < b ? a : b;
-}
-
-static inline unsigned umax(const unsigned a, const unsigned b) {
- return a > b ? a : b;
-}
-
-static inline unsigned umin(const unsigned a, const unsigned b) {
- return a < b ? a : b;
-}
-
static inline int iclip(const int v, const int min, const int max) {
return v < min ? min : v > max ? max : v;
-}
-
-static inline int iclip_u8(const int v) {
- return iclip(v, 0, 255);
-}
-
-static inline int apply_sign(const int v, const int s) {
- return s < 0 ? -v : v;
-}
-
-static inline int apply_sign64(const int v, const int64_t s) {
- return s < 0 ? -v : v;
-}
-
-static inline int ulog2(const unsigned v) {
- return 31 - clz(v);
-}
-
-static inline int u64log2(const uint64_t v) {
- return 63 - clzll(v);
-}
-
-static inline unsigned inv_recenter(const unsigned r, const unsigned v) {
- if (v > (r << 1))
- return v;
- else if ((v & 1) == 0)
- return (v >> 1) + r;
- else
- return r - ((v + 1) >> 1);
}
#endif /* DAV1D_COMMON_INTOPS_H */
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -156,15 +156,12 @@
const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
for (int j = 0; j < h + 6; j++) {
for (int i = 0; i < w; i++) {
- int sum = (1 << (bitdepth + 6));
+ int sum = (1 << (bitdepth + 6)) +
#if BITDEPTH == 8
- sum += tmp_ptr[i + 3] * 128;
+ tmp_ptr[i + 3] * 128 +
#endif
+ (((tmp_ptr[i+0]*filter[0][0] + tmp_ptr[i+1]*filter[0][1]) + (tmp_ptr[i+2]*filter[0][2] + tmp_ptr[i+3]*filter[0][3])) + (tmp_ptr[i+4]*filter[0][4] + tmp_ptr[i+5]*filter[0][5])) + tmp_ptr[i+6]*filter[0][6];
- for (int k = 0; k < 7; k++) {
- sum += tmp_ptr[i + k] * filter[0][k];
- }
-
hor_ptr[i] =
iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
}
@@ -177,11 +174,7 @@
const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
- int sum = -round_offset;
-
- for (int k = 0; k < 7; k++) {
- sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filter[1][k];
- }
+ int sum = -round_offset + hor[(j+0)*REST_UNIT_STRIDE+i]*filter[1][0] + hor[(j+1)*REST_UNIT_STRIDE+i]*filter[1][1] + hor[(j+2)*REST_UNIT_STRIDE+i]*filter[1][2] + hor[(j+3)*REST_UNIT_STRIDE+i]*filter[1][3] + hor[(j+4)*REST_UNIT_STRIDE+i]*filter[1][4] + hor[(j+5)*REST_UNIT_STRIDE+i]*filter[1][5] + hor[(j+6)*REST_UNIT_STRIDE+i]*filter[1][6];
p[j * PXSTRIDE(p_stride) + i] =
iclip_pixel((sum + rounding_off_v) >> round_bits_v);
--- a/src/mc_tmpl.c
+++ b/src/mc_tmpl.c
@@ -809,13 +809,17 @@
src -= 3 * PXSTRIDE(src_stride);
for (int y = 0; y < 15; y++, mx += abcd[1]) {
- for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
- const int8_t *const filter =
- dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
+ int tmx = mx;
- mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
- 7 - intermediate_bits);
- }
+ mid_ptr[0] = FILTER_WARP_RND(src, 0, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+ mid_ptr[1] = FILTER_WARP_RND(src, 1, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+ mid_ptr[2] = FILTER_WARP_RND(src, 2, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+ mid_ptr[3] = FILTER_WARP_RND(src, 3, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+ mid_ptr[4] = FILTER_WARP_RND(src, 4, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+ mid_ptr[5] = FILTER_WARP_RND(src, 5, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+ mid_ptr[6] = FILTER_WARP_RND(src, 6, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+ mid_ptr[7] = FILTER_WARP_RND(src, 7, dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)], 1, 7 - intermediate_bits); tmx += abcd[0];
+
src += PXSTRIDE(src_stride);
mid_ptr += 8;
}
@@ -822,12 +826,17 @@
mid_ptr = &mid[3 * 8];
for (int y = 0; y < 8; y++, my += abcd[3]) {
- for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
- const int8_t *const filter =
- dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
+ int tmy = my;
- tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS;
- }
+ tmp[0] = FILTER_WARP_RND(mid_ptr, 0, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+ tmp[1] = FILTER_WARP_RND(mid_ptr, 1, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+ tmp[2] = FILTER_WARP_RND(mid_ptr, 2, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+ tmp[3] = FILTER_WARP_RND(mid_ptr, 3, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+ tmp[4] = FILTER_WARP_RND(mid_ptr, 4, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+ tmp[5] = FILTER_WARP_RND(mid_ptr, 5, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+ tmp[6] = FILTER_WARP_RND(mid_ptr, 6, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+ tmp[7] = FILTER_WARP_RND(mid_ptr, 7, dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)], 8, 7) - PREP_BIAS; tmy += abcd[2];
+
mid_ptr += 8;
tmp += tmp_stride;
}