ref: 8e8fb84dcda63e83671a41235f2d71e726a2e716
parent: feeaf785340e9aa910f65602e0f42e9958bd9e21
author: Martin Storsjö <[email protected]>
date: Wed Feb 5 05:17:59 EST 2020
arm: Use int16_t for the tmp intermediate buffer. For 8bpc and 10bpc, int16_t is enough here, and for 12bpc, other intermediate int16_t buffers also need to be made of size coef anyway.
--- a/src/arm/32/looprestoration.S
+++ b/src/arm/32/looprestoration.S
@@ -1661,7 +1661,7 @@
#define FILTER_OUT_STRIDE 384
-// void dav1d_sgr_finish_filter1_neon(coef *tmp,
+// void dav1d_sgr_finish_filter1_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
@@ -1765,7 +1765,7 @@
pop {r4-r11,pc}
endfunc
-// void dav1d_sgr_finish_filter2_neon(coef *tmp,
+// void dav1d_sgr_finish_filter2_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
@@ -1927,7 +1927,7 @@
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
-// const coef *t1, const int w, const int h,
+// const int16_t *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_neon, export=1
push {r4-r9,lr}
@@ -2011,7 +2011,7 @@
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
-// const coef *t1, const coef *t2,
+// const int16_t *t1, const int16_t *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
--- a/src/arm/64/looprestoration.S
+++ b/src/arm/64/looprestoration.S
@@ -1540,7 +1540,7 @@
#define FILTER_OUT_STRIDE 384
-// void dav1d_sgr_finish_filter1_neon(coef *tmp,
+// void dav1d_sgr_finish_filter1_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
@@ -1657,7 +1657,7 @@
ret
endfunc
-// void dav1d_sgr_finish_filter2_neon(coef *tmp,
+// void dav1d_sgr_finish_filter2_neon(int16_t *tmp,
// const pixel *src, const ptrdiff_t stride,
// const int32_t *a, const int16_t *b,
// const int w, const int h);
@@ -1809,7 +1809,7 @@
// void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
-// const coef *t1, const int w, const int h,
+// const int16_t *t1, const int w, const int h,
// const int wt);
function sgr_weighted1_neon, export=1
dup v31.8h, w7
@@ -1889,7 +1889,7 @@
// void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *src, const ptrdiff_t src_stride,
-// const coef *t1, const coef *t2,
+// const int16_t *t1, const int16_t *t2,
// const int w, const int h,
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration_init_tmpl.c
@@ -117,13 +117,13 @@
const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
const int w, const int h, const int strength);
-void dav1d_sgr_finish_filter1_neon(coef *tmp,
+void dav1d_sgr_finish_filter1_neon(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
/* filter with a 3x3 box (radius=1) */
-static void dav1d_sgr_filter1_neon(coef *tmp,
+static void dav1d_sgr_filter1_neon(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
@@ -160,13 +160,13 @@
const enum LrEdgeFlags edges);
void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
const int w, const int h, const int strength);
-void dav1d_sgr_finish_filter2_neon(coef *tmp,
+void dav1d_sgr_finish_filter2_neon(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
const int32_t *a, const int16_t *b,
const int w, const int h);
/* filter with a 5x5 box (radius=2) */
-static void dav1d_sgr_filter2_neon(coef *tmp,
+static void dav1d_sgr_filter2_neon(int16_t *tmp,
const pixel *src, const ptrdiff_t stride,
const pixel (*left)[4],
const pixel *lpf, const ptrdiff_t lpf_stride,
@@ -195,11 +195,11 @@
void dav1d_sgr_weighted1_neon(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
- const coef *t1, const int w, const int h,
+ const int16_t *t1, const int w, const int h,
const int wt);
void dav1d_sgr_weighted2_neon(pixel *dst, const ptrdiff_t dst_stride,
const pixel *src, const ptrdiff_t src_stride,
- const coef *t1, const coef *t2,
+ const int16_t *t1, const int16_t *t2,
const int w, const int h,
const int16_t wt[2]);
@@ -210,7 +210,7 @@
const int16_t sgr_wt[7], const enum LrEdgeFlags edges)
{
if (!dav1d_sgr_params[sgr_idx][0]) {
- ALIGN_STK_16(coef, tmp, 64 * 384,);
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][3], edges);
if (w >= 8)
@@ -228,7 +228,7 @@
w & 7, h);
}
} else if (!dav1d_sgr_params[sgr_idx][1]) {
- ALIGN_STK_16(coef, tmp, 64 * 384,);
+ ALIGN_STK_16(int16_t, tmp, 64 * 384,);
dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges);
if (w >= 8)
@@ -245,8 +245,8 @@
w & 7, h);
}
} else {
- ALIGN_STK_16(coef, tmp1, 64 * 384,);
- ALIGN_STK_16(coef, tmp2, 64 * 384,);
+ ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
+ ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
w, h, dav1d_sgr_params[sgr_idx][2], edges);
dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,