ref: 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0
parent: 367d785a4e70b3e43eee234b3c745b047e3fbd40
author: Marvin Scholz <[email protected]>
date: Thu Oct 25 12:45:12 EDT 2018
Build: Add suffix to templated BITDEPTH files Fix #96
--- a/src/cdef.c
+++ /dev/null
@@ -1,298 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdlib.h>
-
-#include "common/intops.h"
-
-#include "src/cdef.h"
-
-static const int8_t cdef_directions4[8 /* dir */][2 /* pass */] = {
- { -1 * 8 + 1, -2 * 8 + 2 },
- { 0 * 8 + 1, -1 * 8 + 2 },
- { 0 * 8 + 1, 0 * 8 + 2 },
- { 0 * 8 + 1, 1 * 8 + 2 },
- { 1 * 8 + 1, 2 * 8 + 2 },
- { 1 * 8 + 0, 2 * 8 + 1 },
- { 1 * 8 + 0, 2 * 8 + 0 },
- { 1 * 8 + 0, 2 * 8 - 1 }
-};
-
-static const int8_t cdef_directions8[8 /* dir */][2 /* pass */] = {
- { -1 * 16 + 1, -2 * 16 + 2 },
- { 0 * 16 + 1, -1 * 16 + 2 },
- { 0 * 16 + 1, 0 * 16 + 2 },
- { 0 * 16 + 1, 1 * 16 + 2 },
- { 1 * 16 + 1, 2 * 16 + 2 },
- { 1 * 16 + 0, 2 * 16 + 1 },
- { 1 * 16 + 0, 2 * 16 + 0 },
- { 1 * 16 + 0, 2 * 16 - 1 }
-};
-static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
-static const uint8_t cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
-
-static inline int constrain(const int diff, const int threshold,
- const int damping)
-{
- if (!threshold) return 0;
- const int shift = imax(0, damping - ulog2(threshold));
- return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
- diff);
-}
-
-/*
- * <code partially copied from libaom>
- */
-
-#define CDEF_VERY_LARGE (30000)
-
-static void fill(uint16_t *tmp, const ptrdiff_t stride,
- const int w, const int h)
-{
- for (int y = 0; y < h; y++) {
- for (int x = 0; x < w; x++)
- tmp[x] = CDEF_VERY_LARGE;
- tmp += stride;
- }
-}
-
-/* Smooth in the direction detected. */
-static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,
- /*const*/ pixel *const top[2],
- const int w, const int h, const int pri_strength,
- const int sec_strength, const int dir,
- const int damping, const enum CdefEdgeFlags edges)
-{
- const ptrdiff_t tmp_stride = 16 >> (w == 4);
- assert((w == 4 || w == 8) && (h == 4 || h == 8));
- uint16_t tmp[192]; // 16*12 is the maximum value of tmp_stride * (h + 4)
- uint16_t *tmp2 = tmp + 2 * tmp_stride + 2;
- const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
- const uint8_t *const sec_taps = cdef_sec_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
- const int8_t (*cdef_directions)[2];
-
- assert(w == 4 || w == 8);
- cdef_directions = w == 4 ? cdef_directions4 : cdef_directions8;
-
- // fill extended input buffer
- int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
- if (!(edges & HAVE_TOP)) {
- fill(tmp, tmp_stride, w + 4, 2);
- y_start = 0;
- }
- if (!(edges & HAVE_BOTTOM)) {
- fill(tmp + (h + 2) * tmp_stride, tmp_stride, w + 4, 2);
- y_end -= 2;
- }
- if (!(edges & HAVE_LEFT)) {
- fill(tmp + (2 + y_start) * tmp_stride, tmp_stride, 2, y_end - y_start);
- x_start = 0;
- }
- if (!(edges & HAVE_RIGHT)) {
- fill(tmp + (2 + y_start) * tmp_stride + w + 2, tmp_stride,
- 2, y_end - y_start);
- x_end -= 2;
- }
- for (int y = y_start; y < 0; y++)
- for (int x = x_start; x < x_end; x++)
- tmp2[y * tmp_stride + x] = top[y & 1][x];
- for (int y = 0; y < y_end; y++)
- for (int x = x_start; x < x_end; x++)
- tmp2[y * tmp_stride + x] = dst[y * PXSTRIDE(dst_stride) + x];
-
- // run actual filter
- for (int y = 0; y < h; y++) {
- for (int x = 0; x < w; x++) {
- int sum = 0;
- const int px = dst[y * PXSTRIDE(dst_stride) + x];
- int max = px, min = px;
- for (int k = 0; k < 2; k++) {
- const int8_t off1 = cdef_directions[dir][k];
- const int p0 = tmp2[y * tmp_stride + x + off1];
- const int p1 = tmp2[y * tmp_stride + x - off1];
- sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping);
- sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping);
- if (p0 != CDEF_VERY_LARGE) max = imax(p0, max);
- if (p1 != CDEF_VERY_LARGE) max = imax(p1, max);
- min = imin(p0, min);
- min = imin(p1, min);
- const int8_t off2 = cdef_directions[(dir + 2) & 7][k];
- const int s0 = tmp2[y * tmp_stride + x + off2];
- const int s1 = tmp2[y * tmp_stride + x - off2];
- const int8_t off3 = cdef_directions[(dir + 6) & 7][k];
- const int s2 = tmp2[y * tmp_stride + x + off3];
- const int s3 = tmp2[y * tmp_stride + x - off3];
- if (s0 != CDEF_VERY_LARGE) max = imax(s0, max);
- if (s1 != CDEF_VERY_LARGE) max = imax(s1, max);
- if (s2 != CDEF_VERY_LARGE) max = imax(s2, max);
- if (s3 != CDEF_VERY_LARGE) max = imax(s3, max);
- min = imin(s0, min);
- min = imin(s1, min);
- min = imin(s2, min);
- min = imin(s3, min);
- sum += sec_taps[k] * constrain(s0 - px, sec_strength, damping);
- sum += sec_taps[k] * constrain(s1 - px, sec_strength, damping);
- sum += sec_taps[k] * constrain(s2 - px, sec_strength, damping);
- sum += sec_taps[k] * constrain(s3 - px, sec_strength, damping);
- }
- dst[y * PXSTRIDE(dst_stride) + x] =
- iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
- }
- }
-}
-
-/*
- * </code partially copied from libaom>
- */
-
-#define cdef_fn(w, h) \
-static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
- const ptrdiff_t stride, \
- /*const*/ pixel *const top[2], \
- const int pri_strength, \
- const int sec_strength, \
- const int dir, \
- const int damping, \
- const enum CdefEdgeFlags edges) \
-{ \
- cdef_filter_block_c(dst, stride, top, w, h, pri_strength, sec_strength, \
- dir, damping, edges); \
-}
-
-cdef_fn(4, 4);
-cdef_fn(4, 8);
-cdef_fn(8, 8);
-
-/*
- * <code copied from libaom>
- */
-
-/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
- The search minimizes the weighted variance along all the lines in a
- particular direction, i.e. the squared error between the input and a
- "predicted" block where each pixel is replaced by the average along a line
- in a particular direction. Since each direction have the same sum(x^2) term,
- that term is never computed. See Section 2, step 2, of:
- http://jmvalin.ca/notes/intra_paint.pdf */
-static const uint16_t div_table[] = {
- 0, 840, 420, 280, 210, 168, 140, 120, 105
-};
-static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
- unsigned *const var)
-{
- int i;
- int32_t cost[8] = { 0 };
- int partial[8][15] = { { 0 } };
- int32_t best_cost = 0;
- int best_dir = 0;
- /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
- The output is then 840 times larger, but we don't care for finding
- the max. */
- for (i = 0; i < 8; i++) {
- int j;
- for (j = 0; j < 8; j++) {
- int x;
- /* We subtract 128 here to reduce the maximum range of the squared
- partial sums. */
- x = (img[i * PXSTRIDE(stride) + j] >> (BITDEPTH - 8)) - 128;
- partial[0][i + j] += x;
- partial[1][i + j / 2] += x;
- partial[2][i] += x;
- partial[3][3 + i - j / 2] += x;
- partial[4][7 + i - j] += x;
- partial[5][3 - i / 2 + j] += x;
- partial[6][j] += x;
- partial[7][i / 2 + j] += x;
- }
- }
- for (i = 0; i < 8; i++) {
- cost[2] += partial[2][i] * partial[2][i];
- cost[6] += partial[6][i] * partial[6][i];
- }
- cost[2] *= div_table[8];
- cost[6] *= div_table[8];
- for (i = 0; i < 7; i++) {
- cost[0] += (partial[0][i] * partial[0][i] +
- partial[0][14 - i] * partial[0][14 - i]) *
- div_table[i + 1];
- cost[4] += (partial[4][i] * partial[4][i] +
- partial[4][14 - i] * partial[4][14 - i]) *
- div_table[i + 1];
- }
- cost[0] += partial[0][7] * partial[0][7] * div_table[8];
- cost[4] += partial[4][7] * partial[4][7] * div_table[8];
- for (i = 1; i < 8; i += 2) {
- int j;
- for (j = 0; j < 4 + 1; j++) {
- cost[i] += partial[i][3 + j] * partial[i][3 + j];
- }
- cost[i] *= div_table[8];
- for (j = 0; j < 4 - 1; j++) {
- cost[i] += (partial[i][j] * partial[i][j] +
- partial[i][10 - j] * partial[i][10 - j]) *
- div_table[2 * j + 2];
- }
- }
- for (i = 0; i < 8; i++) {
- if (cost[i] > best_cost) {
- best_cost = cost[i];
- best_dir = i;
- }
- }
- /* Difference between the optimal variance and the variance along the
- orthogonal direction. Again, the sum(x^2) terms cancel out. */
- *var = best_cost - cost[(best_dir + 4) & 7];
- /* We'd normally divide by 840, but dividing by 1024 is close enough
- for what we're going to do with this. */
- *var >>= 10;
- return best_dir;
-}
-
-/*
- * </code copied from libaom>
- */
-
-void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
- c->dir = cdef_find_dir_c;
- c->fb[0] = cdef_filter_block_8x8_c;
- c->fb[1] = cdef_filter_block_4x8_c;
- c->fb[2] = cdef_filter_block_4x4_c;
-}
--- a/src/cdef_apply.c
+++ /dev/null
@@ -1,237 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <string.h>
-
-#include "common/intops.h"
-
-#include "src/cdef_apply.h"
-
-static void backup2lines(pixel *const dst[3][2],
- /*const*/ pixel *const src[3],
- const ptrdiff_t src_stride[2], int y_off, int w,
- const enum Dav1dPixelLayout layout)
-{
- pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w);
- pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
-
- if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
- const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
- w >>= ss_hor;
- y_off >>= ss_ver;
- pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
- pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
- pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
- pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
-}
-
-static void backup2x8(pixel dst[3][8][2],
- /*const*/ pixel *const src[3],
- const ptrdiff_t src_stride[2], int x_off,
- const enum Dav1dPixelLayout layout)
-{
- for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
- pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
-
- if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
- const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
- x_off >>= ss_hor;
- for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
- pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
- pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
- }
-}
-
-static void restore2x8(pixel *const dst[3],
- const ptrdiff_t dst_stride[2],
- const pixel src[3][8][2], const enum Dav1dPixelLayout layout)
-{
- for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(dst_stride[0]))
- pixel_copy(&dst[0][y_off - 2], src[0][y], 2);
-
- if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
- const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-
- for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(dst_stride[1])) {
- pixel_copy(&dst[1][y_off - 2], src[1][y], 2);
- pixel_copy(&dst[2][y_off - 2], src[2][y], 2);
- }
-}
-
-static int adjust_strength(const int strength, const unsigned var) {
- if (!var) return 0;
- const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
- return (strength * (4 + i) + 8) >> 4;
-}
-
-void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
- pixel *const p[3],
- const Av1Filter *const lflvl,
- const int by_start, const int by_end)
-{
- const Dav1dDSPContext *const dsp = f->dsp;
- enum CdefEdgeFlags edges = HAVE_BOTTOM | (by_start > 0 ? HAVE_TOP : 0);
- pixel *ptrs[3] = { p[0], p[1], p[2] };
- const int sbsz = 16;
- const int sb64w = f->sb128w << 1;
- const int damping = f->frame_hdr.cdef.damping + BITDEPTH - 8;
- const enum Dav1dPixelLayout layout = f->cur.p.p.layout;
- const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
- const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
- const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
-
- // FIXME a design improvement that could be made here is to keep a set of
- // flags for each block position on whether the block was filtered; if not,
- // the backup of pre-filter data is empty, and the restore is therefore
- // unnecessary as well.
-
- for (int by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) {
- const int tf = f->lf.top_pre_cdef_toggle;
- if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM;
-
- if (edges & HAVE_BOTTOM) {
- // backup pre-filter data for next iteration
- backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.p.stride,
- 8, f->bw * 4, layout);
- }
-
- pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
- pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
- edges &= ~HAVE_LEFT;
- edges |= HAVE_RIGHT;
- for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= HAVE_LEFT) {
- const int sb128x = sbx >>1;
- const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
- const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
- if (cdef_idx == -1 ||
- (!f->frame_hdr.cdef.y_strength[cdef_idx] &&
- !f->frame_hdr.cdef.uv_strength[cdef_idx]))
- {
- last_skip = 1;
- goto next_sb;
- }
-
- const int y_lvl = f->frame_hdr.cdef.y_strength[cdef_idx];
- const int uv_lvl = f->frame_hdr.cdef.uv_strength[cdef_idx];
- pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
- for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
- bx += 2, edges |= HAVE_LEFT)
- {
- if (bx + 2 >= f->bw) edges &= ~HAVE_RIGHT;
-
- // check if this 8x8 block had any coded coefficients; if not,
- // go to the next block
- const unsigned bx_mask = 3U << (bx & 14);
- const int by_idx = by & 30, bx_idx = (bx & 16) >> 4;
- if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
- lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
- {
- last_skip = 1;
- goto next_b;
- }
-
- if (!last_skip) {
- // backup post-filter data (will be restored at the end)
- backup2x8(lr_bak[1], bptrs, f->cur.p.stride, 0, layout);
-
- // restore pre-filter data from last iteration
- restore2x8(bptrs, f->cur.p.stride, lr_bak[0], layout);
- }
- if (edges & HAVE_RIGHT) {
- // backup pre-filter data for next iteration
- backup2x8(lr_bak[0], bptrs, f->cur.p.stride, 8, layout);
- }
-
- // the actual filter
- const int y_pri_lvl = (y_lvl >> 2) << (BITDEPTH - 8);
- int y_sec_lvl = y_lvl & 3;
- y_sec_lvl += y_sec_lvl == 3;
- y_sec_lvl <<= BITDEPTH - 8;
- const int uv_pri_lvl = (uv_lvl >> 2) << (BITDEPTH - 8);
- int uv_sec_lvl = uv_lvl & 3;
- uv_sec_lvl += uv_sec_lvl == 3;
- uv_sec_lvl <<= BITDEPTH - 8;
- unsigned variance;
- const int dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0],
- &variance);
- if (y_lvl) {
- dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0],
- (pixel *const [2]) {
- &f->lf.cdef_line_ptr[tf][0][0][bx * 4],
- &f->lf.cdef_line_ptr[tf][0][1][bx * 4],
- },
- adjust_strength(y_pri_lvl, variance),
- y_sec_lvl, y_pri_lvl ? dir : 0,
- damping, edges);
- }
- if (uv_lvl && has_chroma) {
- const int uvdir =
- f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
- ((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
- for (int pl = 1; pl <= 2; pl++) {
- dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
- (pixel *const [2]) {
- &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
- &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
- },
- uv_pri_lvl, uv_sec_lvl,
- uv_pri_lvl ? uvdir : 0,
- damping - 1, edges);
- }
- }
-
- if (!last_skip) {
- // restore post-filter data from the beginning of this loop
- restore2x8(bptrs, f->cur.p.stride, lr_bak[1], layout);
- }
- last_skip = 0;
-
- next_b:
- bptrs[0] += 8;
- bptrs[1] += 8 >> ss_hor;
- bptrs[2] += 8 >> ss_hor;
- }
-
- next_sb:
- iptrs[0] += sbsz * 4;
- iptrs[1] += sbsz * 4 >> ss_hor;
- iptrs[2] += sbsz * 4 >> ss_hor;
- }
-
- ptrs[0] += 8 * PXSTRIDE(f->cur.p.stride[0]);
- ptrs[1] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
- ptrs[2] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
- f->lf.top_pre_cdef_toggle ^= 1;
- }
-}
--- /dev/null
+++ b/src/cdef_apply_tmpl.c
@@ -1,0 +1,237 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/cdef_apply.h"
+
+static void backup2lines(pixel *const dst[3][2],
+ /*const*/ pixel *const src[3],
+ const ptrdiff_t src_stride[2], int y_off, int w,
+ const enum Dav1dPixelLayout layout)
+{ // Save rows (y_off - 2) and (y_off - 1) of every plane in src into dst[plane][0] and dst[plane][1].
+ pixel_copy(dst[0][0], src[0] + (y_off - 2) * PXSTRIDE(src_stride[0]), w); // pixel_copy is defined elsewhere; presumably copies w pixels
+ pixel_copy(dst[0][1], src[0] + (y_off - 1) * PXSTRIDE(src_stride[0]), w);
+ // Plane 0 is done. For I400 the function stops here, leaving dst[1] and dst[2] untouched.
+ if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; // 1 only for I420
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; // 1 for every remaining layout except I444
+ // w and y_off are given in plane-0 units; shift them down for planes 1 and 2.
+ w >>= ss_hor;
+ y_off >>= ss_ver;
+ pixel_copy(dst[1][0], src[1] + (y_off - 2) * PXSTRIDE(src_stride[1]), w); // planes 1 and 2 both use src_stride[1]
+ pixel_copy(dst[1][1], src[1] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
+ pixel_copy(dst[2][0], src[2] + (y_off - 2) * PXSTRIDE(src_stride[1]), w);
+ pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
+}
+
+static void backup2x8(pixel dst[3][8][2],
+ /*const*/ pixel *const src[3],
+ const ptrdiff_t src_stride[2], int x_off,
+ const enum Dav1dPixelLayout layout)
+{ // Save the two pixels at columns (x_off - 2) and (x_off - 1) of each row into dst[plane][row].
+ for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0])) // plane 0: always 8 rows
+ pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
+ // For I400 only plane 0 is saved.
+ if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+ // x_off is given in plane-0 pixels; shift it down for planes 1 and 2.
+ x_off >>= ss_hor;
+ for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) { // 4 rows for I420, otherwise 8
+ pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
+ pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
+ }
+}
+
+static void restore2x8(pixel *const dst[3],
+ const ptrdiff_t dst_stride[2],
+ const pixel src[3][8][2], const enum Dav1dPixelLayout layout)
+{ // Write src[plane][row] back to the two pixels immediately left of dst[plane] (columns -2 and -1) in each row.
+ for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(dst_stride[0])) // plane 0: always 8 rows
+ pixel_copy(&dst[0][y_off - 2], src[0][y], 2);
+ // For I400 only plane 0 is written.
+ if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ // Unlike backup2x8 there is no x offset to scale, so only the row count depends on the layout.
+ for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(dst_stride[1])) { // 4 rows for I420, otherwise 8
+ pixel_copy(&dst[1][y_off - 2], src[1][y], 2);
+ pixel_copy(&dst[2][y_off - 2], src[2][y], 2);
+ }
+}
+
+static int adjust_strength(const int strength, const unsigned var) { // Scale strength by a factor derived from var; returns an int.
+ if (!var) return 0; // zero variance: strength becomes 0 regardless of the input
+ const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0; // i = min(ulog2(var >> 6), 12), or 0 when var < 64 (ulog2 is defined elsewhere)
+ return (strength * (4 + i) + 8) >> 4; // strength * (4 + i) / 16, with +8 added before the shift for rounding
+}
+
+void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
+ pixel *const p[3],
+ const Av1Filter *const lflvl,
+ const int by_start, const int by_end)
+{ // Run the CDEF direction search and filters in place on planes p[] for block rows [by_start, by_end).
+ const Dav1dDSPContext *const dsp = f->dsp;
+ enum CdefEdgeFlags edges = HAVE_BOTTOM | (by_start > 0 ? HAVE_TOP : 0); // no top neighbour when starting at row 0
+ pixel *ptrs[3] = { p[0], p[1], p[2] }; // per-plane pointer to the start of the current block row
+ const int sbsz = 16; // 64 pixels in 4-pixel units: by/bx advance by 2 per 8 pixels, and iptrs advance by sbsz * 4
+ const int sb64w = f->sb128w << 1; // two 64-pixel columns per sb128 column
+ const int damping = f->frame_hdr.cdef.damping + BITDEPTH - 8; // header damping plus the number of bits above 8
+ const enum Dav1dPixelLayout layout = f->cur.p.p.layout;
+ const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout; // index into dsp->cdef.fb[] for planes 1/2; depends on the enum order (declared elsewhere)
+ const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
+ const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+
+ // FIXME a design improvement that could be made here is to keep a set of
+ // flags for each block position on whether the block was filtered; if not,
+ // the backup of pre-filter data is empty, and the restore is therefore
+ // unnecessary as well.
+
+ for (int by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) { // each iteration covers 8 plane-0 pixel rows
+ const int tf = f->lf.top_pre_cdef_toggle; // selects the cdef_line_ptr set read as top rows here; flipped at the end of this loop body
+ if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM; // last block row: nothing below
+
+ if (edges & HAVE_BOTTOM) {
+ // save this block row's last two pre-filter rows (plane-0 rows 6 and 7) into the other line set, for the next iteration
+ backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.p.stride,
+ 8, f->bw * 4, layout);
+ }
+ // lr_bak[0]: pre-filter copy of this block's last two columns (the next block's left edge); lr_bak[1]: post-filter copy of the current left edge
+ pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
+ pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] }; // per-plane pointer to the current 64-pixel column
+ edges &= ~HAVE_LEFT; // the first block of a row has no left neighbour
+ edges |= HAVE_RIGHT; // cleared again once the last block column is reached
+ for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= HAVE_LEFT) { // last_skip: the block to the left was not filtered
+ const int sb128x = sbx >>1; // index into lflvl[]
+ const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1); // cdef_idx[] slot: +2 when (by & 16) is set, +1 for odd sbx
+ const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
+ if (cdef_idx == -1 || // skip the whole 64-pixel column when the index is -1 or both strengths are zero
+ (!f->frame_hdr.cdef.y_strength[cdef_idx] &&
+ !f->frame_hdr.cdef.uv_strength[cdef_idx]))
+ {
+ last_skip = 1;
+ goto next_sb;
+ }
+
+ const int y_lvl = f->frame_hdr.cdef.y_strength[cdef_idx];
+ const int uv_lvl = f->frame_hdr.cdef.uv_strength[cdef_idx];
+ pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] }; // per-plane pointer to the current block
+ for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw); // stop at the end of this column or at f->bw
+ bx += 2, edges |= HAVE_LEFT)
+ {
+ if (bx + 2 >= f->bw) edges &= ~HAVE_RIGHT; // last block column: nothing to the right
+
+ // check if this 8x8 block had any coded coefficients; if not,
+ // go to the next block
+ const unsigned bx_mask = 3U << (bx & 14); // two adjacent bits, one per 4-pixel column of this block
+ const int by_idx = by & 30, bx_idx = (bx & 16) >> 4; // first noskip_mask row, and which of its two entries
+ if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
+ lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
+ {
+ last_skip = 1;
+ goto next_b;
+ }
+
+ if (!last_skip) {
+ // backup post-filter data (will be restored at the end)
+ backup2x8(lr_bak[1], bptrs, f->cur.p.stride, 0, layout);
+
+ // restore pre-filter data from last iteration
+ restore2x8(bptrs, f->cur.p.stride, lr_bak[0], layout);
+ }
+ if (edges & HAVE_RIGHT) {
+ // backup pre-filter data for next iteration
+ backup2x8(lr_bak[0], bptrs, f->cur.p.stride, 8, layout);
+ }
+
+ // the actual filter: split each level into a primary (level >> 2) and secondary (level & 3) strength, scaled by BITDEPTH - 8
+ const int y_pri_lvl = (y_lvl >> 2) << (BITDEPTH - 8);
+ int y_sec_lvl = y_lvl & 3;
+ y_sec_lvl += y_sec_lvl == 3; // a secondary value of 3 is bumped to 4
+ y_sec_lvl <<= BITDEPTH - 8;
+ const int uv_pri_lvl = (uv_lvl >> 2) << (BITDEPTH - 8);
+ int uv_sec_lvl = uv_lvl & 3;
+ uv_sec_lvl += uv_sec_lvl == 3; // same 3 -> 4 bump as above
+ uv_sec_lvl <<= BITDEPTH - 8;
+ unsigned variance; // written by the direction search below
+ const int dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0], // direction is always searched on plane 0
+ &variance);
+ if (y_lvl) {
+ dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0],
+ (pixel *const [2]) { // the two saved pre-filter rows above this block
+ &f->lf.cdef_line_ptr[tf][0][0][bx * 4],
+ &f->lf.cdef_line_ptr[tf][0][1][bx * 4],
+ },
+ adjust_strength(y_pri_lvl, variance), // only the plane-0 primary strength is scaled by variance
+ y_sec_lvl, y_pri_lvl ? dir : 0, // direction 0 is passed when there is no primary strength
+ damping, edges);
+ }
+ if (uv_lvl && has_chroma) {
+ const int uvdir = // I422 maps dir through the table below; other layouts use dir unchanged
+ f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
+ ((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
+ for (int pl = 1; pl <= 2; pl++) {
+ dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
+ (pixel *const [2]) { // saved top rows, with the x offset scaled by ss_hor
+ &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
+ &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
+ },
+ uv_pri_lvl, uv_sec_lvl, // no variance adjustment for planes 1 and 2
+ uv_pri_lvl ? uvdir : 0,
+ damping - 1, edges); // planes 1 and 2 use one less damping than plane 0
+ }
+ }
+
+ if (!last_skip) {
+ // restore post-filter data from the beginning of this loop
+ restore2x8(bptrs, f->cur.p.stride, lr_bak[1], layout);
+ }
+ last_skip = 0; // this block was filtered, so the next one must do the backup/restore above
+
+ next_b:
+ bptrs[0] += 8; // advance one block: 8 plane-0 pixels
+ bptrs[1] += 8 >> ss_hor;
+ bptrs[2] += 8 >> ss_hor;
+ }
+
+ next_sb:
+ iptrs[0] += sbsz * 4; // advance one 64-pixel column
+ iptrs[1] += sbsz * 4 >> ss_hor;
+ iptrs[2] += sbsz * 4 >> ss_hor;
+ }
+
+ ptrs[0] += 8 * PXSTRIDE(f->cur.p.stride[0]); // advance one block row: 8 plane-0 rows
+ ptrs[1] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+ ptrs[2] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+ f->lf.top_pre_cdef_toggle ^= 1; // the line set written by backup2lines above becomes the top rows of the next iteration
+ }
+}
--- /dev/null
+++ b/src/cdef_tmpl.c
@@ -1,0 +1,298 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/cdef.h"
+
+static const int8_t cdef_directions4[8 /* dir */][2 /* pass */] = { // tap offsets written as dy * 8 + dx, i.e. for a buffer with 8 samples per row (the table picked when w == 4)
+ { -1 * 8 + 1, -2 * 8 + 2 },
+ { 0 * 8 + 1, -1 * 8 + 2 },
+ { 0 * 8 + 1, 0 * 8 + 2 },
+ { 0 * 8 + 1, 1 * 8 + 2 },
+ { 1 * 8 + 1, 2 * 8 + 2 },
+ { 1 * 8 + 0, 2 * 8 + 1 },
+ { 1 * 8 + 0, 2 * 8 + 0 },
+ { 1 * 8 + 0, 2 * 8 - 1 }
+};
+
+static const int8_t cdef_directions8[8 /* dir */][2 /* pass */] = { // same taps as cdef_directions4, written as dy * 16 + dx for a buffer with 16 samples per row (the table picked when w == 8)
+ { -1 * 16 + 1, -2 * 16 + 2 },
+ { 0 * 16 + 1, -1 * 16 + 2 },
+ { 0 * 16 + 1, 0 * 16 + 2 },
+ { 0 * 16 + 1, 1 * 16 + 2 },
+ { 1 * 16 + 1, 2 * 16 + 2 },
+ { 1 * 16 + 0, 2 * 16 + 1 },
+ { 1 * 16 + 0, 2 * 16 + 0 },
+ { 1 * 16 + 0, 2 * 16 - 1 }
+};
+static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; // [row][k]: primary tap weights; the filter picks the row from bit 0 of (pri_strength >> (BITDEPTH - 8))
+static const uint8_t cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } }; // [row][k]: secondary tap weights; both rows are identical, so the row index has no effect
+
+static inline int constrain(const int diff, const int threshold, // limits one tap difference: magnitude capped at
+                            const int damping)                   // max(0, threshold - (|diff| >> shift)), then passed to apply_sign with diff
+{
+    if (threshold == 0) return 0; // zero strength: this tap contributes nothing
+    const int mag = abs(diff);
+    const int shift = imax(0, damping - ulog2(threshold)); // never a negative shift count
+    return apply_sign(imin(mag, imax(0, threshold - (mag >> shift))), diff);
+}
+
+/*
+ * <code partially copied from libaom>
+ */
+
+#define CDEF_VERY_LARGE (30000) /* sentinel that fill() stores in border samples; the filter skips taps equal to it when tracking max */
+
+static void fill(uint16_t *tmp, const ptrdiff_t stride,
+                 const int w, const int h)
+{
+    // Paint a w x h region (rows `stride` elements apart) with the sentinel.
+    for (int row = 0; row < h; row++, tmp += stride) {
+        for (int col = w - 1; col >= 0; col--)
+            tmp[col] = CDEF_VERY_LARGE;
+    }
+}
+
+/* Smooth a w x h block of dst in place along direction dir: stage the block plus a 2-sample border in tmp (rows above come from top[0]/top[1]; borders missing per edges get a sentinel), then rewrite each pixel from primary (dir) and secondary (dir + 2, dir + 6) taps. */
+static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,
+ /*const*/ pixel *const top[2],
+ const int w, const int h, const int pri_strength,
+ const int sec_strength, const int dir,
+ const int damping, const enum CdefEdgeFlags edges)
+{
+ const ptrdiff_t tmp_stride = 16 >> (w == 4); // 8 samples per tmp row when w == 4, otherwise 16
+ assert((w == 4 || w == 8) && (h == 4 || h == 8));
+ uint16_t tmp[192]; // 16*12 is the maximum value of tmp_stride * (h + 4)
+ uint16_t *tmp2 = tmp + 2 * tmp_stride + 2; // tmp2[0] is block pixel (0, 0): skips 2 border rows and 2 border columns
+ const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
+ const uint8_t *const sec_taps = cdef_sec_taps[(pri_strength >> (BITDEPTH - 8)) & 1]; // NOTE(review): indexed by pri_strength, not sec_strength; harmless because both cdef_sec_taps rows are equal
+ const int8_t (*cdef_directions)[2];
+
+ assert(w == 4 || w == 8); // NOTE(review): repeats the width check already asserted above
+ cdef_directions = w == 4 ? cdef_directions4 : cdef_directions8; // offset table must match tmp_stride (8 vs 16)
+
+ // fill extended input buffer: sides missing per `edges` get the sentinel, the rest is copied from top[] and dst
+ int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
+ if (!(edges & HAVE_TOP)) {
+ fill(tmp, tmp_stride, w + 4, 2);
+ y_start = 0;
+ }
+ if (!(edges & HAVE_BOTTOM)) {
+ fill(tmp + (h + 2) * tmp_stride, tmp_stride, w + 4, 2);
+ y_end -= 2;
+ }
+ if (!(edges & HAVE_LEFT)) {
+ fill(tmp + (2 + y_start) * tmp_stride, tmp_stride, 2, y_end - y_start);
+ x_start = 0;
+ }
+ if (!(edges & HAVE_RIGHT)) {
+ fill(tmp + (2 + y_start) * tmp_stride + w + 2, tmp_stride,
+ 2, y_end - y_start);
+ x_end -= 2;
+ }
+ for (int y = y_start; y < 0; y++)
+ for (int x = x_start; x < x_end; x++)
+ tmp2[y * tmp_stride + x] = top[y & 1][x]; // y == -2 reads top[0], y == -1 reads top[1]
+ for (int y = 0; y < y_end; y++)
+ for (int x = x_start; x < x_end; x++)
+ tmp2[y * tmp_stride + x] = dst[y * PXSTRIDE(dst_stride) + x]; // block rows, plus the 2 rows below when HAVE_BOTTOM
+
+ // run actual filter
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ int sum = 0;
+ const int px = dst[y * PXSTRIDE(dst_stride) + x];
+ int max = px, min = px;
+ for (int k = 0; k < 2; k++) {
+ const int8_t off1 = cdef_directions[dir][k];
+ const int p0 = tmp2[y * tmp_stride + x + off1];
+ const int p1 = tmp2[y * tmp_stride + x - off1];
+ sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping);
+ sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping);
+ if (p0 != CDEF_VERY_LARGE) max = imax(p0, max); // sentinel taps must not raise max
+ if (p1 != CDEF_VERY_LARGE) max = imax(p1, max);
+ min = imin(p0, min); // no sentinel test here; presumably the sentinel exceeds every pixel value so it cannot lower min - TODO confirm for the pixel type
+ min = imin(p1, min);
+ const int8_t off2 = cdef_directions[(dir + 2) & 7][k];
+ const int s0 = tmp2[y * tmp_stride + x + off2];
+ const int s1 = tmp2[y * tmp_stride + x - off2];
+ const int8_t off3 = cdef_directions[(dir + 6) & 7][k];
+ const int s2 = tmp2[y * tmp_stride + x + off3];
+ const int s3 = tmp2[y * tmp_stride + x - off3];
+ if (s0 != CDEF_VERY_LARGE) max = imax(s0, max);
+ if (s1 != CDEF_VERY_LARGE) max = imax(s1, max);
+ if (s2 != CDEF_VERY_LARGE) max = imax(s2, max);
+ if (s3 != CDEF_VERY_LARGE) max = imax(s3, max);
+ min = imin(s0, min);
+ min = imin(s1, min);
+ min = imin(s2, min);
+ min = imin(s3, min);
+ sum += sec_taps[k] * constrain(s0 - px, sec_strength, damping);
+ sum += sec_taps[k] * constrain(s1 - px, sec_strength, damping);
+ sum += sec_taps[k] * constrain(s2 - px, sec_strength, damping);
+ sum += sec_taps[k] * constrain(s3 - px, sec_strength, damping);
+ }
+ dst[y * PXSTRIDE(dst_stride) + x] =
+ iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max); // sum / 16 rounded to nearest, ties away from zero (assumes arithmetic >> on negatives), then limited by min/max
+ }
+ }
+}
+
+/*
+ * </code partially copied from libaom>
+ */
+
+#define cdef_fn(w, h) \
+static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
+                                            const ptrdiff_t stride, \
+                                            /*const*/ pixel *const top[2], \
+                                            const int pri_strength, \
+                                            const int sec_strength, \
+                                            const int dir, \
+                                            const int damping, \
+                                            const enum CdefEdgeFlags edges) \
+{ /* fixed-size entry point: forwards to cdef_filter_block_c with w and h baked in */ \
+    cdef_filter_block_c(dst, stride, top, w, h, pri_strength, sec_strength, \
+                        dir, damping, edges); \
+}
+
+cdef_fn(4, 4) // each use expands to a complete function definition, so no ';' follows:
+cdef_fn(4, 8) // a ';' here would be a stray file-scope empty declaration (warned about by -pedantic)
+cdef_fn(8, 8)
+
+/*
+ * <code copied from libaom>
+ */
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+ The search minimizes the weighted variance along all the lines in a
+ particular direction, i.e. the squared error between the input and a
+ "predicted" block where each pixel is replaced by the average along a line
+ in a particular direction. Since each direction has the same sum(x^2) term,
+ that term is never computed. See Section 2, step 2, of:
+ http://jmvalin.ca/notes/intra_paint.pdf */
+static const uint16_t div_table[] = { // div_table[n] == 840 / n for n = 1..8; entry 0 is a placeholder that cdef_find_dir_c never indexes
+ 0, 840, 420, 280, 210, 168, 140, 120, 105
+};
+static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride, // scans the 8x8 block at img; returns the best direction index (0..7)
+ unsigned *const var) // out: (best cost - cost of the direction 4 steps away) >> 10
+{
+ int i;
+ int32_t cost[8] = { 0 };
+ int partial[8][15] = { { 0 } }; // partial[d][k]: sum of (sample - 128) over line k of direction d
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+ The output is then 840 times larger, but we don't care for finding
+ the max. */
+ for (i = 0; i < 8; i++) {
+ int j;
+ for (j = 0; j < 8; j++) {
+ int x;
+ /* We subtract 128 here to reduce the maximum range of the squared
+ partial sums. */
+ x = (img[i * PXSTRIDE(stride) + j] >> (BITDEPTH - 8)) - 128; // samples are scaled down to 8 bits first
+ partial[0][i + j] += x;
+ partial[1][i + j / 2] += x;
+ partial[2][i] += x; // row sums
+ partial[3][3 + i - j / 2] += x;
+ partial[4][7 + i - j] += x;
+ partial[5][3 - i / 2 + j] += x;
+ partial[6][j] += x; // column sums
+ partial[7][i / 2 + j] += x;
+ }
+ }
+ for (i = 0; i < 8; i++) { // directions 2 and 6: eight lines of eight samples each
+ cost[2] += partial[2][i] * partial[2][i];
+ cost[6] += partial[6][i] * partial[6][i];
+ }
+ cost[2] *= div_table[8];
+ cost[6] *= div_table[8];
+ for (i = 0; i < 7; i++) { // directions 0 and 4: lines i and 14 - i both hold i + 1 samples
+ cost[0] += (partial[0][i] * partial[0][i] +
+ partial[0][14 - i] * partial[0][14 - i]) *
+ div_table[i + 1];
+ cost[4] += (partial[4][i] * partial[4][i] +
+ partial[4][14 - i] * partial[4][14 - i]) *
+ div_table[i + 1];
+ }
+ cost[0] += partial[0][7] * partial[0][7] * div_table[8]; // the middle line holds all 8 samples
+ cost[4] += partial[4][7] * partial[4][7] * div_table[8];
+ for (i = 1; i < 8; i += 2) { // odd directions: lines 3..7 use the weight for 8 samples, the outer lines use 2 * j + 2
+ int j;
+ for (j = 0; j < 4 + 1; j++) {
+ cost[i] += partial[i][3 + j] * partial[i][3 + j];
+ }
+ cost[i] *= div_table[8];
+ for (j = 0; j < 4 - 1; j++) {
+ cost[i] += (partial[i][j] * partial[i][j] +
+ partial[i][10 - j] * partial[i][10 - j]) *
+ div_table[2 * j + 2];
+ }
+ }
+ for (i = 0; i < 8; i++) {
+ if (cost[i] > best_cost) { // strict '>': ties keep the lowest index, and direction 0 wins when every cost is 0
+ best_cost = cost[i];
+ best_dir = i;
+ }
+ }
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
+
+/*
+ * </code copied from libaom>
+ */
+
+void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
+    c->fb[2] = cdef_filter_block_4x4_c; // block filters are stored largest first:
+    c->fb[1] = cdef_filter_block_4x8_c; // fb[0] is 8x8, fb[1] is 4x8, fb[2] is 4x4
+    c->fb[0] = cdef_filter_block_8x8_c;
+    c->dir = cdef_find_dir_c;           // direction search, plain C implementation
+}
--- a/src/ipred.c
+++ /dev/null
@@ -1,757 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/ipred.h"
-#include "src/tables.h"
-
-static NOINLINE void
-splat_dc(pixel *dst, const ptrdiff_t stride,
- const int width, const int height, const unsigned dc)
-{
- assert(dc <= (1 << BITDEPTH) - 1);
-#if BITDEPTH == 8
- if (width > 4) {
- const uint64_t dcN = dc * 0x0101010101010101ULL;
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x += sizeof(dcN))
- *((uint64_t *) &dst[x]) = dcN;
- dst += PXSTRIDE(stride);
- }
- } else {
- const unsigned dcN = dc * 0x01010101U;
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x += sizeof(dcN))
- *((unsigned *) &dst[x]) = dcN;
- dst += PXSTRIDE(stride);
- }
- }
-#else
- const uint64_t dcN = dc * 0x0001000100010001ULL;
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x += sizeof(dcN) >> 1)
- *((uint64_t *) &dst[x]) = dcN;
- dst += PXSTRIDE(stride);
- }
-#endif
-}
-
-static NOINLINE void
-cfl_pred(pixel *dst, const ptrdiff_t stride,
- const int width, const int height, const unsigned dc,
- const int16_t *ac, const int alpha)
-{
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- const int diff = alpha * ac[x];
- dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
- }
- ac += width;
- dst += PXSTRIDE(stride);
- }
-}
-
-static unsigned dc_gen_top(const pixel *const topleft, const int width)
-{
- unsigned dc = width >> 1;
- for (int i = 0; i < width; i++)
- dc += topleft[1 + i];
- return dc >> ctz(width);
-}
-
-static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height, const int a)
-{
- splat_dc(dst, stride, width, height, dc_gen_top(topleft, width));
-}
-
-static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height,
- const int16_t *ac, const int alpha)
-{
- cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha);
-}
-
-static unsigned dc_gen_left(const pixel *const topleft, const int height)
-{
- unsigned dc = height >> 1;
- for (int i = 0; i < height; i++)
- dc += topleft[-(1 + i)];
- return dc >> ctz(height);
-}
-
-static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height, const int a)
-{
- splat_dc(dst, stride, width, height, dc_gen_left(topleft, height));
-}
-
-static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height,
- const int16_t *ac, const int alpha)
-{
- unsigned dc = dc_gen_left(topleft, height);
- cfl_pred(dst, stride, width, height, dc, ac, alpha);
-}
-
-#if BITDEPTH == 8
-#define MULTIPLIER_1x2 0x5556
-#define MULTIPLIER_1x4 0x3334
-#define BASE_SHIFT 16
-#else
-#define MULTIPLIER_1x2 0xAAAB
-#define MULTIPLIER_1x4 0x6667
-#define BASE_SHIFT 17
-#endif
-
-static unsigned
-dc_gen(const pixel *const topleft, const int width, const int height)
-{
- unsigned dc = (width + height) >> 1;
- for (int i = 0; i < width; i++)
- dc += topleft[i + 1];
- for (int i = 0; i < height; i++)
- dc += topleft[-(i + 1)];
- dc >>= ctz(width + height);
-
- if (width != height) {
- dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
- MULTIPLIER_1x2;
- dc >>= BASE_SHIFT;
- }
- return dc;
-}
-
-static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height, const int a)
-{
- splat_dc(dst, stride, width, height, dc_gen(topleft, width, height));
-}
-
-static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height,
- const int16_t *ac, const int alpha)
-{
- unsigned dc = dc_gen(topleft, width, height);
- cfl_pred(dst, stride, width, height, dc, ac, alpha);
-}
-
-#undef MULTIPLIER_1x2
-#undef MULTIPLIER_1x4
-#undef BASE_SHIFT
-
-static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height, const int a)
-{
- splat_dc(dst, stride, width, height, 1 << (BITDEPTH - 1));
-}
-
-static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height,
- const int16_t *ac, const int alpha)
-{
- cfl_pred(dst, stride, width, height, 1 << (BITDEPTH - 1), ac, alpha);
-}
-
-static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height, const int a)
-{
- for (int y = 0; y < height; y++) {
- pixel_copy(dst, topleft + 1, width);
- dst += PXSTRIDE(stride);
- }
-}
-
-static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height, const int a)
-{
- for (int y = 0; y < height; y++) {
- pixel_set(dst, topleft[-(1 + y)], width);
- dst += PXSTRIDE(stride);
- }
-}
-
-static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const tl_ptr,
- const int width, const int height, const int a)
-{
- const int topleft = tl_ptr[0];
- for (int y = 0; y < height; y++) {
- const int left = tl_ptr[-(y + 1)];
- for (int x = 0; x < width; x++) {
- const int top = tl_ptr[1 + x];
- const int base = left + top - topleft;
- const int ldiff = abs(left - base);
- const int tdiff = abs(top - base);
- const int tldiff = abs(topleft - base);
-
- dst[x] = ldiff <= tdiff && ldiff <= tldiff ? left :
- tdiff <= tldiff ? top : topleft;
- }
- dst += PXSTRIDE(stride);
- }
-}
-
-static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height, const int a)
-{
- const uint8_t *const weights_hor = &dav1d_sm_weights[width];
- const uint8_t *const weights_ver = &dav1d_sm_weights[height];
- const int right = topleft[width], bottom = topleft[-height];
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- const int pred = weights_ver[y] * topleft[1 + x] +
- (256 - weights_ver[y]) * bottom +
- weights_hor[x] * topleft[-(1 + y)] +
- (256 - weights_hor[x]) * right;
- dst[x] = (pred + 256) >> 9;
- }
- dst += PXSTRIDE(stride);
- }
-}
-
-static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height, const int a)
-{
- const uint8_t *const weights_ver = &dav1d_sm_weights[height];
- const int bottom = topleft[-height];
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- const int pred = weights_ver[y] * topleft[1 + x] +
- (256 - weights_ver[y]) * bottom;
- dst[x] = (pred + 128) >> 8;
- }
- dst += PXSTRIDE(stride);
- }
-}
-
-static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft,
- const int width, const int height, const int a)
-{
- const uint8_t *const weights_hor = &dav1d_sm_weights[width];
- const int right = topleft[width];
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- const int pred = weights_hor[x] * topleft[-(y + 1)] +
- (256 - weights_hor[x]) * right;
- dst[x] = (pred + 128) >> 8;
- }
- dst += PXSTRIDE(stride);
- }
-}
-
-static int get_filter_strength(const unsigned blk_wh, const unsigned d,
- const int type)
-{
- int strength = 0;
-
- if (type == 0) {
- if (blk_wh <= 8) {
- if (d >= 56) strength = 1;
- } else if (blk_wh <= 12) {
- if (d >= 40) strength = 1;
- } else if (blk_wh <= 16) {
- if (d >= 40) strength = 1;
- } else if (blk_wh <= 24) {
- if (d >= 8) strength = 1;
- if (d >= 16) strength = 2;
- if (d >= 32) strength = 3;
- } else if (blk_wh <= 32) {
- if (d >= 1) strength = 1;
- if (d >= 4) strength = 2;
- if (d >= 32) strength = 3;
- } else {
- if (d >= 1) strength = 3;
- }
- } else {
- if (blk_wh <= 8) {
- if (d >= 40) strength = 1;
- if (d >= 64) strength = 2;
- } else if (blk_wh <= 16) {
- if (d >= 20) strength = 1;
- if (d >= 48) strength = 2;
- } else if (blk_wh <= 24) {
- if (d >= 4) strength = 3;
- } else {
- if (d >= 1) strength = 3;
- }
- }
-
- return strength;
-}
-
-static void filter_edge(pixel *const out, const int sz, const pixel *const in,
- const int from, const int to, const unsigned strength)
-{
- static const uint8_t kernel[3][5] = {
- { 0, 4, 8, 4, 0 },
- { 0, 5, 6, 5, 0 },
- { 2, 4, 4, 4, 2 }
- };
-
- assert(strength > 0);
- for (int i = 0; i < sz; i++) {
- int s = 0;
- for (int j = 0; j < 5; j++)
- s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
- out[i] = (s + 8) >> 4;
- }
-}
-
-static int get_upsample(const int blk_wh, const unsigned d, const int type) {
- if (d >= 40) return 0;
- return type ? (blk_wh <= 8) : (blk_wh <= 16);
-}
-
-static void upsample_edge(pixel *const out, const int hsz,
- const pixel *const in, const int from, const int to)
-{
- static const int8_t kernel[4] = { -1, 9, 9, -1 };
- int i;
- for (i = 0; i < hsz - 1; i++) {
- out[i * 2] = in[iclip(i, from, to - 1)];
-
- int s = 0;
- for (int j = 0; j < 4; j++)
- s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
- out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
- }
- out[i * 2] = in[iclip(i, from, to - 1)];
-}
-
-static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft_in,
- const int width, const int height, int angle)
-{
- const int is_sm = angle >> 9;
- angle &= 511;
- assert(angle < 90);
- const int dx = dav1d_dr_intra_derivative[angle];
- pixel top_out[(64 + 64) * 2];
- const pixel *top;
- int max_base_x;
- const int upsample_above = get_upsample(width + height, 90 - angle, is_sm);
- if (upsample_above) {
- upsample_edge(top_out, width + height,
- &topleft_in[1], -1, width + imin(width, height));
- top = top_out;
- max_base_x = 2 * (width + height) - 2;
- } else {
- const int filter_strength =
- get_filter_strength(width + height, 90 - angle, is_sm);
-
- if (filter_strength) {
- filter_edge(top_out, width + height,
- &topleft_in[1], -1, width + imin(width, height),
- filter_strength);
- top = top_out;
- max_base_x = width + height - 1;
- } else {
- top = &topleft_in[1];
- max_base_x = width + imin(width, height) - 1;
- }
- }
- const int frac_bits = 6 - upsample_above;
- const int base_inc = 1 << upsample_above;
- for (int y = 0, xpos = dx; y < height;
- y++, dst += PXSTRIDE(stride), xpos += dx)
- {
- int base = xpos >> frac_bits;
- const int frac = ((xpos << upsample_above) & 0x3F) >> 1;
-
- for (int x = 0; x < width; x++, base += base_inc) {
- if (base < max_base_x) {
- const int v = top[base] * (32 - frac) + top[base + 1] * frac;
- dst[x] = iclip_pixel((v + 16) >> 5);
- } else {
- pixel_set(&dst[x], top[max_base_x], width - x);
- break;
- }
- }
- }
-}
-
-static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft_in,
- const int width, const int height, int angle)
-{
- const int is_sm = angle >> 9;
- angle &= 511;
- assert(angle > 90 && angle < 180);
- const int dy = dav1d_dr_intra_derivative[angle - 90];
- const int dx = dav1d_dr_intra_derivative[180 - angle];
- const int upsample_left = get_upsample(width + height, 180 - angle, is_sm);
- const int upsample_above = get_upsample(width + height, angle - 90, is_sm);
- pixel edge[64 * 2 + 64 * 2 + 1];
- pixel *const topleft = &edge[height * 2];
-
- if (upsample_above) {
- upsample_edge(topleft, width + 1, topleft_in, 0, width + 1);
- } else {
- const int filter_strength =
- get_filter_strength(width + height, angle - 90, is_sm);
-
- if (filter_strength) {
- filter_edge(&topleft[1], width, &topleft_in[1], -1, width,
- filter_strength);
- } else {
- pixel_copy(&topleft[1], &topleft_in[1], width);
- }
- }
- if (upsample_left) {
- upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1);
- } else {
- const int filter_strength =
- get_filter_strength(width + height, 180 - angle, is_sm);
-
- if (filter_strength) {
- filter_edge(&topleft[-height], height, &topleft_in[-height],
- 0, height + 1, filter_strength);
- } else {
- pixel_copy(&topleft[-height], &topleft_in[-height], height);
- }
- }
- *topleft = *topleft_in;
-
- const int min_base_x = -(1 << upsample_above);
- const int frac_bits_y = 6 - upsample_left, frac_bits_x = 6 - upsample_above;
- const int base_inc_x = 1 << upsample_above;
- const pixel *const left = &topleft[-(1 << upsample_left)];
- const pixel *const top = &topleft[1 << upsample_above];
- for (int y = 0, xpos = -dx; y < height;
- y++, xpos -= dx, dst += PXSTRIDE(stride))
- {
- int base_x = xpos >> frac_bits_x;
- const int frac_x = ((xpos * (1 << upsample_above)) & 0x3F) >> 1;
-
- for (int x = 0, ypos = (y << 6) - dy; x < width;
- x++, base_x += base_inc_x, ypos -= dy)
- {
- int v;
-
- if (base_x >= min_base_x) {
- v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
- } else {
- const int base_y = ypos >> frac_bits_y;
- assert(base_y >= -(1 << upsample_left));
- const int frac_y = ((ypos * (1 << upsample_left)) & 0x3F) >> 1;
- v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
- }
- dst[x] = iclip_pixel((v + 16) >> 5);
- }
- }
-}
-
-static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft_in,
- const int width, const int height, int angle)
-{
- const int is_sm = angle >> 9;
- angle &= 511;
- assert(angle > 180);
- const int dy = dav1d_dr_intra_derivative[270 - angle];
- pixel left_out[(64 + 64) * 2];
- const pixel *left;
- int max_base_y;
- const int upsample_left = get_upsample(width + height, angle - 180, is_sm);
- if (upsample_left) {
- upsample_edge(left_out, width + height,
- &topleft_in[-(width + height)],
- imax(width - height, 0), width + height + 1);
- left = &left_out[2 * (width + height) - 2];
- max_base_y = 2 * (width + height) - 2;
- } else {
- const int filter_strength =
- get_filter_strength(width + height, angle - 180, is_sm);
-
- if (filter_strength) {
- filter_edge(left_out, width + height,
- &topleft_in[-(width + height)],
- imax(width - height, 0), width + height + 1,
- filter_strength);
- left = &left_out[width + height - 1];
- max_base_y = width + height - 1;
- } else {
- left = &topleft_in[-1];
- max_base_y = height + imin(width, height) - 1;
- }
- }
- const int frac_bits = 6 - upsample_left;
- const int base_inc = 1 << upsample_left;
- for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
- int base = ypos >> frac_bits;
- const int frac = ((ypos << upsample_left) & 0x3F) >> 1;
-
- for (int y = 0; y < height; y++, base += base_inc) {
- if (base < max_base_y) {
- const int v = left[-base] * (32 - frac) +
- left[-(base + 1)] * frac;
- dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
- } else {
- do {
- dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
- } while (++y < height);
- break;
- }
- }
- }
-}
-
-/* Up to 32x32 only */
-static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
- const pixel *const topleft_in,
- const int width, const int height, int filt_idx)
-{
- filt_idx &= 511;
- assert(filt_idx < 5);
-
- const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
- int x, y;
- ptrdiff_t left_stride;
- const pixel *left, *topleft, *top;
-
- top = &topleft_in[1];
- for (y = 0; y < height; y += 2) {
- topleft = &topleft_in[-y];
- left = &topleft[-1];
- left_stride = -1;
- for (x = 0; x < width; x += 4) {
- const int p0 = *topleft;
- const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
- const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
- pixel *ptr = &dst[x];
- const int8_t *flt_ptr = filter;
-
- for (int yy = 0; yy < 2; yy++) {
- for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
- int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
- flt_ptr[16] * p2 + flt_ptr[17] * p3 +
- flt_ptr[32] * p4 + flt_ptr[33] * p5 +
- flt_ptr[48] * p6;
- ptr[xx] = iclip_pixel((acc + 8) >> 4);
- }
- ptr += PXSTRIDE(stride);
- }
- left = &dst[x + 4 - 1];
- left_stride = PXSTRIDE(stride);
- top += 4;
- topleft = &top[-1];
- }
- top = &dst[PXSTRIDE(stride)];
- dst = &dst[PXSTRIDE(stride) * 2];
- }
-}
-
-static NOINLINE void
-cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
- const int w_pad, const int h_pad, const int width, const int height,
- const int ss_hor, const int ss_ver, const int log2sz)
-{
- int y, x;
- int16_t *const ac_orig = ac;
-
- assert(w_pad >= 0 && w_pad * 4 < width);
- assert(h_pad >= 0 && h_pad * 4 < height);
-
- for (y = 0; y < height - 4 * h_pad; y++) {
- for (x = 0; x < width - 4 * w_pad; x++) {
- int ac_sum = ypx[x << ss_hor];
- if (ss_hor) ac_sum += ypx[x * 2 + 1];
- if (ss_ver) {
- ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
- if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
- }
- ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
- }
- for (; x < width; x++)
- ac[x] = ac[x - 1];
- ac += width;
- ypx += PXSTRIDE(stride) << ss_ver;
- }
- for (; y < height; y++) {
- memcpy(ac, &ac[-width], width * sizeof(*ac));
- ac += width;
- }
-
- int sum = (1 << log2sz) >> 1;
- for (ac = ac_orig, y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- sum += ac[x];
- ac += width;
- }
- sum >>= log2sz;
-
- // subtract DC
- for (ac = ac_orig, y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- ac[x] -= sum;
- ac += width;
- }
-}
-
-#define cfl_ac_fn(lw, lh, cw, ch, ss_hor, ss_ver, log2sz) \
-static void cfl_ac_##lw##x##lh##_to_##cw##x##ch##_c(int16_t *const ac, \
- const pixel *const ypx, \
- const ptrdiff_t stride, \
- const int w_pad, \
- const int h_pad) \
-{ \
- cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver, log2sz); \
-}
-
-cfl_ac_fn( 8, 8, 4, 4, 1, 1, 4)
-cfl_ac_fn( 8, 16, 4, 8, 1, 1, 5)
-cfl_ac_fn( 8, 32, 4, 16, 1, 1, 6)
-cfl_ac_fn(16, 8, 8, 4, 1, 1, 5)
-cfl_ac_fn(16, 16, 8, 8, 1, 1, 6)
-cfl_ac_fn(16, 32, 8, 16, 1, 1, 7)
-cfl_ac_fn(32, 8, 16, 4, 1, 1, 6)
-cfl_ac_fn(32, 16, 16, 8, 1, 1, 7)
-cfl_ac_fn(32, 32, 16, 16, 1, 1, 8)
-
-cfl_ac_fn( 8, 4, 4, 4, 1, 0, 4)
-cfl_ac_fn( 8, 8, 4, 8, 1, 0, 5)
-cfl_ac_fn(16, 4, 8, 4, 1, 0, 5)
-cfl_ac_fn(16, 8, 8, 8, 1, 0, 6)
-cfl_ac_fn(16, 16, 8, 16, 1, 0, 7)
-cfl_ac_fn(32, 8, 16, 8, 1, 0, 7)
-cfl_ac_fn(32, 16, 16, 16, 1, 0, 8)
-cfl_ac_fn(32, 32, 16, 32, 1, 0, 9)
-
-cfl_ac_fn( 4, 4, 4, 4, 0, 0, 4)
-cfl_ac_fn( 4, 8, 4, 8, 0, 0, 5)
-cfl_ac_fn( 4, 16, 4, 16, 0, 0, 6)
-cfl_ac_fn( 8, 4, 8, 4, 0, 0, 5)
-cfl_ac_fn( 8, 8, 8, 8, 0, 0, 6)
-cfl_ac_fn( 8, 16, 8, 16, 0, 0, 7)
-cfl_ac_fn( 8, 32, 8, 32, 0, 0, 8)
-cfl_ac_fn(16, 4, 16, 4, 0, 0, 6)
-cfl_ac_fn(16, 8, 16, 8, 0, 0, 7)
-cfl_ac_fn(16, 16, 16, 16, 0, 0, 8)
-cfl_ac_fn(16, 32, 16, 32, 0, 0, 9)
-cfl_ac_fn(32, 8, 32, 8, 0, 0, 8)
-cfl_ac_fn(32, 16, 32, 16, 0, 0, 9)
-cfl_ac_fn(32, 32, 32, 32, 0, 0, 10)
-
-static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
- const uint16_t *const pal, const uint8_t *idx,
- const int w, const int h)
-{
- for (int y = 0; y < h; y++) {
- for (int x = 0; x < w; x++)
- dst[x] = pal[idx[x]];
- idx += w;
- dst += PXSTRIDE(stride);
- }
-}
-
-void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
- c->intra_pred[DC_PRED ] = ipred_dc_c;
- c->intra_pred[DC_128_PRED ] = ipred_dc_128_c;
- c->intra_pred[TOP_DC_PRED ] = ipred_dc_top_c;
- c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
- c->intra_pred[HOR_PRED ] = ipred_h_c;
- c->intra_pred[VERT_PRED ] = ipred_v_c;
- c->intra_pred[PAETH_PRED ] = ipred_paeth_c;
- c->intra_pred[SMOOTH_PRED ] = ipred_smooth_c;
- c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
- c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
- c->intra_pred[Z1_PRED ] = ipred_z1_c;
- c->intra_pred[Z2_PRED ] = ipred_z2_c;
- c->intra_pred[Z3_PRED ] = ipred_z3_c;
- c->intra_pred[FILTER_PRED ] = ipred_filter_c;
-
- // cfl functions are split per chroma subsampling type
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_4X4 ] = cfl_ac_8x8_to_4x4_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X8 ] = cfl_ac_8x16_to_4x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X16 ] = cfl_ac_8x32_to_4x16_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X4 ] = cfl_ac_16x8_to_8x4_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_8X8 ] = cfl_ac_16x16_to_8x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X16 ] = cfl_ac_16x32_to_8x16_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X4 ] = cfl_ac_32x8_to_16x4_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X8 ] = cfl_ac_32x16_to_16x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_16X16] = cfl_ac_32x32_to_16x16_c;
-
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_4X4 ] = cfl_ac_8x4_to_4x4_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_4X8 ] = cfl_ac_8x8_to_4x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X4 ] = cfl_ac_16x4_to_8x4_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_8X8 ] = cfl_ac_16x8_to_8x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X16 ] = cfl_ac_16x16_to_8x16_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X8 ] = cfl_ac_32x8_to_16x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_16X16] = cfl_ac_32x16_to_16x16_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X32] = cfl_ac_32x32_to_16x32_c;
-
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_4X4 ] = cfl_ac_4x4_to_4x4_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X8 ] = cfl_ac_4x8_to_4x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X16 ] = cfl_ac_4x16_to_4x16_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X4 ] = cfl_ac_8x4_to_8x4_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_8X8 ] = cfl_ac_8x8_to_8x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X16 ] = cfl_ac_8x16_to_8x16_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X32 ] = cfl_ac_8x32_to_8x32_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X4 ] = cfl_ac_16x4_to_16x4_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X8 ] = cfl_ac_16x8_to_16x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_16X16] = cfl_ac_16x16_to_16x16_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X32] = cfl_ac_16x32_to_16x32_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X8 ] = cfl_ac_32x8_to_32x8_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X16] = cfl_ac_32x16_to_32x16_c;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_32X32] = cfl_ac_32x32_to_32x32_c;
-
- c->cfl_pred[DC_PRED ] = ipred_cfl_c;
- c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
- c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
- c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
-
- c->pal_pred = pal_pred_c;
-
-#if HAVE_ASM && ARCH_X86
- bitfn(dav1d_intra_pred_dsp_init_x86)(c);
-#endif
-}
--- a/src/ipred_prepare.c
+++ /dev/null
@@ -1,209 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "common/intops.h"
-
-#include "src/ipred_prepare.h"
-
-static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES]
- [2 /* have_left */][2 /* have_top */] =
-{
- [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
- { LEFT_DC_PRED, DC_PRED } },
- [PAETH_PRED] = { { DC_128_PRED, VERT_PRED },
- { HOR_PRED, PAETH_PRED } },
-};
-
-static const uint8_t av1_mode_to_angle_map[8] = {
- 90, 180, 45, 135, 113, 157, 203, 67
-};
-
-static const struct {
- uint8_t needs_left:1;
- uint8_t needs_top:1;
- uint8_t needs_topleft:1;
- uint8_t needs_topright:1;
- uint8_t needs_bottomleft:1;
-} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
- [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
- [VERT_PRED] = { .needs_top = 1 },
- [HOR_PRED] = { .needs_left = 1 },
- [LEFT_DC_PRED] = { .needs_left = 1 },
- [TOP_DC_PRED] = { .needs_top = 1 },
- [DC_128_PRED] = { 0 },
- [Z1_PRED] = { .needs_top = 1, .needs_topright = 1,
- .needs_topleft = 1 },
- [Z2_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
- [Z3_PRED] = { .needs_left = 1, .needs_bottomleft = 1,
- .needs_topleft = 1 },
- [SMOOTH_PRED] = { .needs_left = 1, .needs_top = 1 },
- [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
- [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
- [PAETH_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
- [FILTER_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
-};
-
-enum IntraPredMode
-bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
- const int y, const int have_top,
- const int w, const int h,
- const enum EdgeFlags edge_flags,
- const pixel *const dst,
- const ptrdiff_t stride,
- const pixel *prefilter_toplevel_sb_edge,
- enum IntraPredMode mode, int *const angle,
- const int tw, const int th,
- pixel *const topleft_out)
-{
- assert(y < h && x < w);
-
- switch (mode) {
- case VERT_PRED:
- case HOR_PRED:
- case DIAG_DOWN_LEFT_PRED:
- case DIAG_DOWN_RIGHT_PRED:
- case VERT_RIGHT_PRED:
- case HOR_DOWN_PRED:
- case HOR_UP_PRED:
- case VERT_LEFT_PRED: {
- *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle;
-
- if (*angle < 90) {
- mode = have_top ? Z1_PRED : VERT_PRED;
- } else if (*angle == 90) {
- mode = VERT_PRED;
- } else if (*angle < 180) {
- mode = Z2_PRED;
- } else if (*angle == 180) {
- mode = HOR_PRED;
- } else {
- mode = have_left ? Z3_PRED : HOR_PRED;
- }
- break;
- }
- case DC_PRED:
- case PAETH_PRED:
- mode = av1_mode_conv[mode][have_left][have_top];
- break;
- default:
- break;
- }
-
- const pixel *dst_top;
- if (have_top &&
- (av1_intra_prediction_edges[mode].needs_top ||
- av1_intra_prediction_edges[mode].needs_topleft ||
- (av1_intra_prediction_edges[mode].needs_left && !have_left)))
- {
- if (prefilter_toplevel_sb_edge) {
- dst_top = &prefilter_toplevel_sb_edge[x * 4];
- } else {
- dst_top = &dst[-PXSTRIDE(stride)];
- }
- }
-
- if (av1_intra_prediction_edges[mode].needs_left) {
- const int sz = th << 2;
- pixel *const left = &topleft_out[-sz];
-
- if (have_left) {
- const int px_have = imin(sz, (h - y) << 2);
-
- for (int i = 0; i < px_have; i++)
- left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
- if (px_have < sz)
- pixel_set(left, left[sz - px_have], sz - px_have);
- } else {
- pixel_set(left, have_top ? *dst_top : ((1 << BITDEPTH) >> 1) + 1, sz);
- }
-
- if (av1_intra_prediction_edges[mode].needs_bottomleft) {
- const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
- (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
-
- if (have_bottomleft) {
- const int px_have = imin(sz, (h - y - th) << 2);
-
- for (int i = 0; i < px_have; i++)
- left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1];
- if (px_have < sz)
- pixel_set(left - sz, left[-px_have], sz - px_have);
- } else {
- pixel_set(left - sz, left[0], sz);
- }
- }
- }
-
- if (av1_intra_prediction_edges[mode].needs_top) {
- const int sz = tw << 2;
- pixel *const top = &topleft_out[1];
-
- if (have_top) {
- const int px_have = imin(sz, (w - x) << 2);
- pixel_copy(top, dst_top, px_have);
- if (px_have < sz)
- pixel_set(top + px_have, top[px_have - 1], sz - px_have);
- } else {
- pixel_set(top, have_left ? dst[-1] : ((1 << BITDEPTH) >> 1) - 1, sz);
- }
-
- if (av1_intra_prediction_edges[mode].needs_topright) {
- const int have_topright = (!have_top || x + tw >= w) ? 0 :
- (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
-
- if (have_topright) {
- const int px_have = imin(sz, (w - x - tw) << 2);
-
- pixel_copy(top + sz, &dst_top[sz], px_have);
- if (px_have < sz)
- pixel_set(top + sz + px_have, top[sz + px_have - 1],
- sz - px_have);
- } else {
- pixel_set(top + sz, top[sz - 1], sz);
- }
- }
- }
-
- if (av1_intra_prediction_edges[mode].needs_topleft) {
- if (have_left) {
- *topleft_out = have_top ? dst_top[-1] : dst[-1];
- } else {
- *topleft_out = have_top ? *dst_top : (1 << BITDEPTH) >> 1;
- }
- if (mode == Z2_PRED && tw + th >= 6)
- *topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 +
- topleft_out[1] * 5 + 8) >> 4;
- }
-
- return mode;
-}
--- /dev/null
+++ b/src/ipred_prepare_tmpl.c
@@ -1,0 +1,209 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/ipred_prepare.h"
+
+/* Replacement mode for DC_PRED / PAETH_PRED when an edge is missing, indexed
+ * [mode][have_left][have_top]. Only these two rows are looked up below. */
+static const uint8_t av1_mode_conv[N_INTRA_PRED_MODES][2][2] = {
+    [DC_PRED]    = { [0] = { DC_128_PRED,  TOP_DC_PRED },
+                     [1] = { LEFT_DC_PRED, DC_PRED     } },
+    [PAETH_PRED] = { [0] = { DC_128_PRED,  VERT_PRED   },
+                     [1] = { HOR_PRED,     PAETH_PRED  } },
+};
+
+/* Base angle of each directional mode, indexed by (mode - VERT_PRED). */
+static const uint8_t av1_mode_to_angle_map[8] =
+    { 90, 180, 45, 135, 113, 157, 203, 67 };
+
+/* Which neighbouring edges each implemented predictor reads; the edge
+ * preparation code below only fills the edges flagged here. */
+static const struct {
+    uint8_t needs_left:1;
+    uint8_t needs_top:1;
+    uint8_t needs_topleft:1;
+    uint8_t needs_topright:1;
+    uint8_t needs_bottomleft:1;
+} av1_intra_prediction_edges[N_IMPL_INTRA_PRED_MODES] = {
+    [DC_128_PRED]   = { 0 },
+    [TOP_DC_PRED]   = { .needs_top = 1 },
+    [VERT_PRED]     = { .needs_top = 1 },
+    [LEFT_DC_PRED]  = { .needs_left = 1 },
+    [HOR_PRED]      = { .needs_left = 1 },
+    [DC_PRED]       = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_PRED]   = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_V_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [SMOOTH_H_PRED] = { .needs_left = 1, .needs_top = 1 },
+    [PAETH_PRED]    = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [FILTER_PRED]   = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [Z2_PRED]       = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
+    [Z1_PRED] = { .needs_top = 1, .needs_topleft = 1, .needs_topright = 1 },
+    [Z3_PRED] = { .needs_left = 1, .needs_topleft = 1, .needs_bottomleft = 1 },
+};
+/* Gathers the edge pixels the chosen predictor reads into the buffer around topleft_out; returns the mode to run. */
+enum IntraPredMode
+bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
+ const int y, const int have_top,
+ const int w, const int h, /* x, y, w, h are multiplied by 4 below to obtain pixel counts */
+ const enum EdgeFlags edge_flags,
+ const pixel *const dst,
+ const ptrdiff_t stride,
+ const pixel *prefilter_toplevel_sb_edge, /* when non-NULL, read instead of the row above dst */
+ enum IntraPredMode mode, int *const angle, /* *angle is rewritten for directional modes */
+ const int tw, const int th, /* block width/height, also multiplied by 4 below */
+ pixel *const topleft_out) /* left edge is stored below this pointer, top edge above it */
+{
+ assert(y < h && x < w);
+
+ switch (mode) {
+ case VERT_PRED:
+ case HOR_PRED:
+ case DIAG_DOWN_LEFT_PRED:
+ case DIAG_DOWN_RIGHT_PRED:
+ case VERT_RIGHT_PRED:
+ case HOR_DOWN_PRED:
+ case HOR_UP_PRED:
+ case VERT_LEFT_PRED: {
+ *angle = av1_mode_to_angle_map[mode - VERT_PRED] + 3 * *angle; /* base angle plus 3 per unit of the incoming *angle */
+
+ if (*angle < 90) {
+ mode = have_top ? Z1_PRED : VERT_PRED; /* Z1 reads the top edge; without it fall back to VERT */
+ } else if (*angle == 90) {
+ mode = VERT_PRED;
+ } else if (*angle < 180) {
+ mode = Z2_PRED;
+ } else if (*angle == 180) {
+ mode = HOR_PRED;
+ } else {
+ mode = have_left ? Z3_PRED : HOR_PRED; /* Z3 reads the left edge; without it fall back to HOR */
+ }
+ break;
+ }
+ case DC_PRED:
+ case PAETH_PRED:
+ mode = av1_mode_conv[mode][have_left][have_top]; /* substitute a mode that only needs the edges that exist */
+ break;
+ default:
+ break;
+ }
+
+ const pixel *dst_top; /* only assigned when have_top is set and one of the fills below reads it */
+ if (have_top &&
+ (av1_intra_prediction_edges[mode].needs_top ||
+ av1_intra_prediction_edges[mode].needs_topleft ||
+ (av1_intra_prediction_edges[mode].needs_left && !have_left)))
+ {
+ if (prefilter_toplevel_sb_edge) {
+ dst_top = &prefilter_toplevel_sb_edge[x * 4];
+ } else {
+ dst_top = &dst[-PXSTRIDE(stride)]; /* the row directly above the block */
+ }
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_left) {
+ const int sz = th << 2;
+ pixel *const left = &topleft_out[-sz]; /* left[sz - 1] holds the neighbour of row 0 */
+
+ if (have_left) {
+ const int px_have = imin(sz, (h - y) << 2); /* rows available before the h limit */
+
+ for (int i = 0; i < px_have; i++)
+ left[sz - 1 - i] = dst[PXSTRIDE(stride) * i - 1];
+ if (px_have < sz)
+ pixel_set(left, left[sz - px_have], sz - px_have); /* repeat the lowest pixel that was read */
+ } else {
+ pixel_set(left, have_top ? *dst_top : ((1 << BITDEPTH) >> 1) + 1, sz); /* no left column: pixel above, else half range + 1 */
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_bottomleft) {
+ const int have_bottomleft = (!have_left || y + th >= h) ? 0 :
+ (edge_flags & EDGE_I444_LEFT_HAS_BOTTOM);
+
+ if (have_bottomleft) {
+ const int px_have = imin(sz, (h - y - th) << 2);
+
+ for (int i = 0; i < px_have; i++)
+ left[-(i + 1)] = dst[(sz + i) * PXSTRIDE(stride) - 1]; /* continues the column below the block */
+ if (px_have < sz)
+ pixel_set(left - sz, left[-px_have], sz - px_have);
+ } else {
+ pixel_set(left - sz, left[0], sz); /* extend with the lowest left pixel */
+ }
+ }
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_top) {
+ const int sz = tw << 2;
+ pixel *const top = &topleft_out[1];
+
+ if (have_top) {
+ const int px_have = imin(sz, (w - x) << 2); /* columns available before the w limit */
+ pixel_copy(top, dst_top, px_have);
+ if (px_have < sz)
+ pixel_set(top + px_have, top[px_have - 1], sz - px_have); /* repeat the last pixel that was copied */
+ } else {
+ pixel_set(top, have_left ? dst[-1] : ((1 << BITDEPTH) >> 1) - 1, sz); /* no top row: pixel to the left, else half range - 1 */
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_topright) {
+ const int have_topright = (!have_top || x + tw >= w) ? 0 :
+ (edge_flags & EDGE_I444_TOP_HAS_RIGHT);
+
+ if (have_topright) {
+ const int px_have = imin(sz, (w - x - tw) << 2);
+
+ pixel_copy(top + sz, &dst_top[sz], px_have);
+ if (px_have < sz)
+ pixel_set(top + sz + px_have, top[sz + px_have - 1],
+ sz - px_have);
+ } else {
+ pixel_set(top + sz, top[sz - 1], sz); /* extend with the last top pixel */
+ }
+ }
+ }
+
+ if (av1_intra_prediction_edges[mode].needs_topleft) {
+ if (have_left) {
+ *topleft_out = have_top ? dst_top[-1] : dst[-1];
+ } else {
+ *topleft_out = have_top ? *dst_top : (1 << BITDEPTH) >> 1;
+ }
+ if (mode == Z2_PRED && tw + th >= 6)
+ *topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 + /* 5/6/5 smoothing of the corner with its two neighbours */
+ topleft_out[1] * 5 + 8) >> 4;
+ }
+
+ return mode;
+}
--- /dev/null
+++ b/src/ipred_tmpl.c
@@ -1,0 +1,757 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/ipred.h"
+#include "src/tables.h"
+
+/*
+ * Fills the width x height block at dst with dc, several pixels per store.
+ * The replicated pattern is written with memcpy() rather than through a
+ * cast integer pointer, so the store is well defined whatever the alignment
+ * or effective type of dst. No tail handling: width must be a multiple of
+ * the number of pixels written per store. */
+static NOINLINE void
+splat_dc(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const unsigned dc)
+{
+    assert(dc <= (1 << BITDEPTH) - 1);
+#if BITDEPTH == 8
+    if (width > 4) {
+        const uint64_t dcN = dc * 0x0101010101010101ULL; /* 8 pixels */
+        for (int y = 0; y < height; y++, dst += PXSTRIDE(stride))
+            for (int x = 0; x < width; x += sizeof(dcN))
+                memcpy(&dst[x], &dcN, sizeof(dcN));
+    } else {
+        const uint32_t dcN = dc * 0x01010101U; /* 4 pixels */
+        for (int y = 0; y < height; y++, dst += PXSTRIDE(stride))
+            for (int x = 0; x < width; x += sizeof(dcN))
+                memcpy(&dst[x], &dcN, sizeof(dcN));
+    }
+#else
+    const uint64_t dcN = dc * 0x0001000100010001ULL; /* 4 pixels */
+    for (int y = 0; y < height; y++, dst += PXSTRIDE(stride))
+        for (int x = 0; x < width; x += sizeof(dcN) >> 1)
+            memcpy(&dst[x], &dcN, sizeof(dcN));
+#endif
+}
+
+/* CfL prediction: dst = clip(dc + alpha * ac / 64), where the scaled AC
+ * term is rounded to nearest on its magnitude and then given its sign. */
+static NOINLINE void
+cfl_pred(pixel *dst, const ptrdiff_t stride,
+         const int width, const int height, const unsigned dc,
+         const int16_t *ac, const int alpha)
+{
+    for (int r = 0; r < height; r++, ac += width, dst += PXSTRIDE(stride))
+        for (int c = 0; c < width; c++) {
+            const int scaled = alpha * ac[c];
+            const int rounded = (abs(scaled) + 32) >> 6;
+            dst[c] = iclip_pixel(dc + apply_sign(rounded, scaled));
+        }
+}
+
+/* Rounded mean of the width pixels stored just after topleft. */
+static unsigned dc_gen_top(const pixel *const topleft, const int width) {
+    unsigned sum = width >> 1; /* rounding bias */
+    for (int n = width; n > 0; n--)
+        sum += topleft[n];
+    return sum >> ctz(width); /* exact mean only for power-of-two width */
+}
+
+/* TOP_DC_PRED: flat fill with the rounded mean of the top edge (a unused). */
+static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
+    const pixel *const topleft, const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, dc_gen_top(topleft, width));
+}
+
+/* cfl_pred[TOP_DC_PRED]: CfL prediction around the mean of the top edge. */
+static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
+    const pixel *const topleft, const int width, const int height,
+    const int16_t *ac, const int alpha)
+{
+    cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha);
+}
+
+/* Rounded mean of the height pixels stored just before topleft. */
+static unsigned dc_gen_left(const pixel *const topleft, const int height) {
+    unsigned sum = height >> 1; /* rounding bias */
+    for (int n = height; n > 0; n--)
+        sum += topleft[-n];
+    return sum >> ctz(height); /* exact mean only for power-of-two height */
+}
+
+/* LEFT_DC_PRED: flat fill with the rounded mean of the left edge (a unused). */
+static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
+    const pixel *const topleft, const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, dc_gen_left(topleft, height));
+}
+
+/* cfl_pred[LEFT_DC_PRED]: CfL prediction around the mean of the left edge. */
+static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
+    const pixel *const topleft, const int width, const int height,
+    const int16_t *ac, const int alpha)
+{
+    cfl_pred(dst, stride, width, height,
+             dc_gen_left(topleft, height), ac, alpha);
+}
+
+/* Fixed-point reciprocals used by dc_gen() for rectangular blocks:
+ * MULTIPLIER_1x2 / 2^BASE_SHIFT ~= 1/3 and MULTIPLIER_1x4 / 2^BASE_SHIFT
+ * ~= 1/5, the odd factors of width + height for 1:2 and 1:4 blocks. */
+#if BITDEPTH == 8
+#define MULTIPLIER_1x2 0x5556
+#define MULTIPLIER_1x4 0x3334
+#define BASE_SHIFT 16
+#else
+#define MULTIPLIER_1x2 0xAAAB
+#define MULTIPLIER_1x4 0x6667
+#define BASE_SHIFT 17
+#endif
+
+/* Rounded mean of the width pixels after topleft and the height pixels
+ * before it; non-square blocks finish the division with a multiplier. */
+static unsigned
+dc_gen(const pixel *const topleft, const int width, const int height)
+{
+    const int total = width + height;
+    unsigned sum = total >> 1;
+    for (int n = width; n > 0; n--) sum += topleft[n];
+    for (int n = height; n > 0; n--) sum += topleft[-n];
+    sum >>= ctz(total); /* divide by the power-of-two factor of total */
+    if (width == height) return sum;
+    const int is_1x4 = width > height * 2 || height > width * 2;
+    return (sum * (is_1x4 ? MULTIPLIER_1x4 : MULTIPLIER_1x2)) >> BASE_SHIFT;
+}
+
+/* DC_PRED: flat fill with the rounded mean of both edges (a is unused). */
+static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
+    const pixel *const topleft, const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, dc_gen(topleft, width, height));
+}
+
+/* cfl_pred[DC_PRED]: CfL prediction around the mean of both edges. */
+static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
+    const pixel *const topleft, const int width, const int height,
+    const int16_t *ac, const int alpha)
+{
+    cfl_pred(dst, stride, width, height,
+             dc_gen(topleft, width, height), ac, alpha);
+}
+
+#undef MULTIPLIER_1x2 /* the dc_gen() reciprocal constants are not used past this point */
+#undef MULTIPLIER_1x4
+#undef BASE_SHIFT
+
+/* DC_128_PRED: flat fill with the half-range value; topleft, a unused. */
+static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
+    const pixel *const topleft, const int width, const int height, const int a)
+{
+    splat_dc(dst, stride, width, height, (1 << BITDEPTH) >> 1);
+}
+
+/* cfl_pred[DC_128_PRED]: CfL prediction around the half-range value. */
+static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
+    const pixel *const topleft, const int width, const int height,
+    const int16_t *ac, const int alpha)
+{
+    cfl_pred(dst, stride, width, height, (1 << BITDEPTH) >> 1, ac, alpha);
+}
+
+/* VERT_PRED: every row is a copy of the width pixels after topleft. */
+static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a)
+{
+    const pixel *const top = &topleft[1];
+    for (int rows = height; rows > 0; rows--, dst += PXSTRIDE(stride))
+        pixel_copy(dst, top, width);
+}
+
+/* HOR_PRED: row y is filled with its left neighbour topleft[-(1 + y)]. */
+static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
+                      const pixel *const topleft,
+                      const int width, const int height, const int a)
+{
+    const pixel *const left = &topleft[-1];
+    for (int y = 0; y < height; y++, dst += PXSTRIDE(stride))
+        pixel_set(dst, left[-y], width);
+}
+
+/* PAETH_PRED: each pixel takes whichever of left, top and top-left is
+ * closest to left + top - topleft; ties prefer left, then top. */
+static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
+                          const pixel *const tl_ptr,
+                          const int width, const int height, const int a)
+{
+    const int tl = tl_ptr[0];
+    for (int y = 0; y < height; y++, dst += PXSTRIDE(stride)) {
+        const int l = tl_ptr[-(y + 1)];
+        for (int x = 0; x < width; x++) {
+            const int t = tl_ptr[1 + x];
+            const int base = l + t - tl;
+            const int dl = abs(l - base), dt = abs(t - base);
+            const int dtl = abs(tl - base);
+            if (dl <= dt && dl <= dtl) dst[x] = l;
+            else if (dt <= dtl)        dst[x] = t;
+            else                       dst[x] = tl;
+        }
+    }
+}
+
+/* SMOOTH_PRED: blends top with topleft[-height] vertically and left with
+ * topleft[width] horizontally, using dav1d_sm_weights (out of 256). */
+static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
+                           const pixel *const topleft,
+                           const int width, const int height, const int a)
+{
+    const uint8_t *const w_x = &dav1d_sm_weights[width];
+    const uint8_t *const w_y = &dav1d_sm_weights[height];
+    const int right = topleft[width], bottom = topleft[-height];
+    for (int y = 0; y < height; y++, dst += PXSTRIDE(stride)) {
+        const int left = topleft[-(1 + y)];
+        const int vert_tail = (256 - w_y[y]) * bottom;
+        for (int x = 0; x < width; x++) {
+            const int vert = w_y[y] * topleft[1 + x] + vert_tail;
+            const int hor = w_x[x] * left + (256 - w_x[x]) * right;
+            dst[x] = (vert + hor + 256) >> 9;
+        }
+    }
+}
+
+/* SMOOTH_V_PRED: blends each top pixel towards topleft[-height] with the
+ * per-row weight dav1d_sm_weights[height + y] out of 256. */
+static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a)
+{
+    const uint8_t *const w_y = &dav1d_sm_weights[height];
+    const int bottom = topleft[-height];
+
+    for (int y = 0; y < height; y++, dst += PXSTRIDE(stride)) {
+        const int w_top = w_y[y], w_bottom = 256 - w_y[y];
+        for (int x = 0; x < width; x++) {
+            dst[x] = (w_top * topleft[1 + x] + w_bottom * bottom + 128) >> 8;
+        }
+    }
+}
+
+/* SMOOTH_H_PRED: blends each left pixel towards topleft[width] with the
+ * per-column weight dav1d_sm_weights[width + x] out of 256. */
+static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
+                             const pixel *const topleft,
+                             const int width, const int height, const int a)
+{
+    const uint8_t *const w_x = &dav1d_sm_weights[width];
+    const int right = topleft[width];
+
+    for (int y = 0; y < height; y++, dst += PXSTRIDE(stride)) {
+        const int left = topleft[-(y + 1)];
+        for (int x = 0; x < width; x++) {
+            dst[x] = (w_x[x] * left + (256 - w_x[x]) * right + 128) >> 8;
+        }
+    }
+}
+
+/*
+ * Edge filter strength for a directional predictor: 0 means no filtering,
+ * 1-3 select a filter_edge() kernel. Callers in this file pass
+ * width + height as blk_wh and an angle offset as d; type picks between
+ * the two threshold sets.
+ */
+static int get_filter_strength(const unsigned blk_wh, const unsigned d,
+                               const int type)
+{
+    if (type == 0) {
+        if (blk_wh <= 8)
+            return d >= 56;
+        if (blk_wh <= 16) /* covers both the <= 12 and <= 16 classes */
+            return d >= 40;
+        if (blk_wh <= 24) {
+            if (d >= 32) return 3;
+            if (d >= 16) return 2;
+            return d >= 8;
+        }
+        if (blk_wh <= 32) {
+            if (d >= 32) return 3;
+            if (d >= 4) return 2;
+            return d >= 1;
+        }
+        return d >= 1 ? 3 : 0;
+    }
+
+    if (blk_wh <= 8) {
+        if (d >= 64) return 2;
+        return d >= 40;
+    }
+    if (blk_wh <= 16) {
+        if (d >= 48) return 2;
+        return d >= 20;
+    }
+    if (blk_wh <= 24)
+        return d >= 4 ? 3 : 0;
+    return d >= 1 ? 3 : 0;
+}
+
+/* Smooths sz edge samples with the 5-tap kernel selected by strength (1-3);
+ * taps falling outside [from, to) reuse the nearest in-range input. */
+static void filter_edge(pixel *const out, const int sz, const pixel *const in,
+                        const int from, const int to, const unsigned strength)
+{
+    static const uint8_t kernel[3][5] = {
+        { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
+    };
+
+    assert(strength > 0);
+    for (int i = 0; i < sz; i++) {
+        int acc = 8; /* rounding bias for the >> 4 below */
+        for (int k = -2; k <= 2; k++)
+            acc += in[iclip(i + k, from, to - 1)] * kernel[strength - 1][k + 2];
+        out[i] = acc >> 4;
+    }
+}
+
+/* Non-zero when the edge gets upsampled: d < 40 and a small enough block. */
+static int get_upsample(const int blk_wh, const unsigned d, const int type) {
+    return d < 40 && blk_wh <= (type ? 8 : 16);
+}
+
+/* 2x upsampling; odd outputs use the (-1, 9, 9, -1) / 16 interpolation. */
+static void upsample_edge(pixel *const out, const int hsz,
+                          const pixel *const in, const int from, const int to)
+{
+    static const int8_t kernel[4] = { -1, 9, 9, -1 };
+    int i = 0;
+    for (; i < hsz - 1; i++) {
+        int acc = 8; /* rounding bias */
+        out[2 * i] = in[iclip(i, from, to - 1)];
+        for (int k = -1; k < 3; k++)
+            acc += in[iclip(i + k, from, to - 1)] * kernel[k + 1];
+        out[2 * i + 1] = iclip_pixel(acc >> 4);
+    }
+    out[2 * i] = in[iclip(i, from, to - 1)];
+}
+/* Z1_PRED: directional prediction for angle < 90, reading only the top edge. */
+static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle)
+{
+ const int is_sm = angle >> 9; /* bits above the 9 angle bits; passed as "type" to the selectors below */
+ angle &= 511; /* keep only the angle itself */
+ assert(angle < 90);
+ const int dx = dav1d_dr_intra_derivative[angle]; /* x step per row, in 1/64 of an edge sample */
+ pixel top_out[(64 + 64) * 2]; /* scratch for the upsampled or filtered edge */
+ const pixel *top;
+ int max_base_x; /* highest index of top[] the loop below reads */
+ const int upsample_above = get_upsample(width + height, 90 - angle, is_sm);
+ if (upsample_above) {
+ upsample_edge(top_out, width + height,
+ &topleft_in[1], -1, width + imin(width, height));
+ top = top_out;
+ max_base_x = 2 * (width + height) - 2; /* last sample written by upsample_edge() */
+ } else {
+ const int filter_strength =
+ get_filter_strength(width + height, 90 - angle, is_sm);
+
+ if (filter_strength) {
+ filter_edge(top_out, width + height,
+ &topleft_in[1], -1, width + imin(width, height),
+ filter_strength);
+ top = top_out;
+ max_base_x = width + height - 1;
+ } else {
+ top = &topleft_in[1]; /* no filtering: read the input edge in place */
+ max_base_x = width + imin(width, height) - 1;
+ }
+ }
+ const int frac_bits = 6 - upsample_above; /* upsampling doubles the edge resolution */
+ const int base_inc = 1 << upsample_above;
+ for (int y = 0, xpos = dx; y < height;
+ y++, dst += PXSTRIDE(stride), xpos += dx)
+ {
+ int base = xpos >> frac_bits;
+ const int frac = ((xpos << upsample_above) & 0x3F) >> 1; /* blend weight in 1/32 steps, 0..31 */
+
+ for (int x = 0; x < width; x++, base += base_inc) {
+ if (base < max_base_x) {
+ const int v = top[base] * (32 - frac) + top[base + 1] * frac; /* linear blend of two edge samples */
+ dst[x] = iclip_pixel((v + 16) >> 5);
+ } else {
+ pixel_set(&dst[x], top[max_base_x], width - x); /* past the edge: pad the rest of the row */
+ break;
+ }
+ }
+ }
+}
+/* Z2_PRED: directional prediction for 90 < angle < 180, reading top and left edges. */
+static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle)
+{
+ const int is_sm = angle >> 9; /* bits above the 9 angle bits; passed as "type" to the selectors below */
+ angle &= 511; /* keep only the angle itself */
+ assert(angle > 90 && angle < 180);
+ const int dy = dav1d_dr_intra_derivative[angle - 90]; /* y step per column, in 1/64 of an edge sample */
+ const int dx = dav1d_dr_intra_derivative[180 - angle]; /* x step per row, same units */
+ const int upsample_left = get_upsample(width + height, 180 - angle, is_sm);
+ const int upsample_above = get_upsample(width + height, angle - 90, is_sm);
+ pixel edge[64 * 2 + 64 * 2 + 1]; /* local copy of both edges, processed independently */
+ pixel *const topleft = &edge[height * 2]; /* leaves room below for an upsampled left edge */
+
+ if (upsample_above) {
+ upsample_edge(topleft, width + 1, topleft_in, 0, width + 1); /* starts at the corner sample */
+ } else {
+ const int filter_strength =
+ get_filter_strength(width + height, angle - 90, is_sm);
+
+ if (filter_strength) {
+ filter_edge(&topleft[1], width, &topleft_in[1], -1, width,
+ filter_strength);
+ } else {
+ pixel_copy(&topleft[1], &topleft_in[1], width);
+ }
+ }
+ if (upsample_left) {
+ upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1); /* last output lands on edge[height * 2], i.e. topleft */
+ } else {
+ const int filter_strength =
+ get_filter_strength(width + height, 180 - angle, is_sm);
+
+ if (filter_strength) {
+ filter_edge(&topleft[-height], height, &topleft_in[-height],
+ 0, height + 1, filter_strength);
+ } else {
+ pixel_copy(&topleft[-height], &topleft_in[-height], height);
+ }
+ }
+ *topleft = *topleft_in; /* the corner is taken from the input unchanged */
+
+ const int min_base_x = -(1 << upsample_above); /* lowest top index before switching to the left edge */
+ const int frac_bits_y = 6 - upsample_left, frac_bits_x = 6 - upsample_above;
+ const int base_inc_x = 1 << upsample_above;
+ const pixel *const left = &topleft[-(1 << upsample_left)]; /* first left sample; indexed with negated offsets */
+ const pixel *const top = &topleft[1 << upsample_above]; /* first top sample */
+ for (int y = 0, xpos = -dx; y < height;
+ y++, xpos -= dx, dst += PXSTRIDE(stride))
+ {
+ int base_x = xpos >> frac_bits_x;
+ const int frac_x = ((xpos * (1 << upsample_above)) & 0x3F) >> 1; /* multiply, not shift: xpos is negative */
+
+ for (int x = 0, ypos = (y << 6) - dy; x < width;
+ x++, base_x += base_inc_x, ypos -= dy)
+ {
+ int v;
+
+ if (base_x >= min_base_x) {
+ v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x; /* projection still hits the top edge */
+ } else {
+ const int base_y = ypos >> frac_bits_y; /* otherwise project onto the left edge */
+ assert(base_y >= -(1 << upsample_left));
+ const int frac_y = ((ypos * (1 << upsample_left)) & 0x3F) >> 1;
+ v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
+ }
+ dst[x] = iclip_pixel((v + 16) >> 5);
+ }
+ }
+}
+/* Z3_PRED: directional prediction for angle > 180, reading only the left edge. */
+static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int angle)
+{
+ const int is_sm = angle >> 9; /* bits above the 9 angle bits; passed as "type" to the selectors below */
+ angle &= 511; /* keep only the angle itself */
+ assert(angle > 180);
+ const int dy = dav1d_dr_intra_derivative[270 - angle]; /* y step per column, in 1/64 of an edge sample */
+ pixel left_out[(64 + 64) * 2]; /* scratch for the upsampled or filtered edge */
+ const pixel *left; /* sample next to the corner; the edge is read at negative offsets from it */
+ int max_base_y; /* largest offset from left the loop below reads */
+ const int upsample_left = get_upsample(width + height, angle - 180, is_sm);
+ if (upsample_left) {
+ upsample_edge(left_out, width + height,
+ &topleft_in[-(width + height)],
+ imax(width - height, 0), width + height + 1);
+ left = &left_out[2 * (width + height) - 2]; /* last sample written by upsample_edge() */
+ max_base_y = 2 * (width + height) - 2;
+ } else {
+ const int filter_strength =
+ get_filter_strength(width + height, angle - 180, is_sm);
+
+ if (filter_strength) {
+ filter_edge(left_out, width + height,
+ &topleft_in[-(width + height)],
+ imax(width - height, 0), width + height + 1,
+ filter_strength);
+ left = &left_out[width + height - 1];
+ max_base_y = width + height - 1;
+ } else {
+ left = &topleft_in[-1]; /* no filtering: read the input edge in place */
+ max_base_y = height + imin(width, height) - 1;
+ }
+ }
+ const int frac_bits = 6 - upsample_left; /* upsampling doubles the edge resolution */
+ const int base_inc = 1 << upsample_left;
+ for (int x = 0, ypos = dy; x < width; x++, ypos += dy) { /* column by column */
+ int base = ypos >> frac_bits;
+ const int frac = ((ypos << upsample_left) & 0x3F) >> 1; /* blend weight in 1/32 steps, 0..31 */
+
+ for (int y = 0; y < height; y++, base += base_inc) {
+ if (base < max_base_y) {
+ const int v = left[-base] * (32 - frac) +
+ left[-(base + 1)] * frac;
+ dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
+ } else {
+ do { /* past the edge: pad the rest of the column */
+ dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
+ } while (++y < height);
+ break;
+ }
+ }
+ }
+}
+
+/* FILTER_PRED: predicts 4x2 sub-blocks from 7 neighbours each, feeding output back in. Up to 32x32 only */
+static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
+ const pixel *const topleft_in,
+ const int width, const int height, int filt_idx)
+{
+ filt_idx &= 511; /* keep the low 9 bits */
+ assert(filt_idx < 5);
+
+ const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
+ int x, y;
+ ptrdiff_t left_stride;
+ const pixel *left, *topleft, *top;
+
+ top = &topleft_in[1];
+ for (y = 0; y < height; y += 2) { /* two output rows per pass */
+ topleft = &topleft_in[-y];
+ left = &topleft[-1];
+ left_stride = -1; /* the first sub-block reads the edge buffer, which runs backwards */
+ for (x = 0; x < width; x += 4) {
+ const int p0 = *topleft; /* inputs: corner, four above, two to the left */
+ const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
+ const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
+ pixel *ptr = &dst[x];
+ const int8_t *flt_ptr = filter;
+
+ for (int yy = 0; yy < 2; yy++) {
+ for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
+ int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
+ flt_ptr[16] * p2 + flt_ptr[17] * p3 +
+ flt_ptr[32] * p4 + flt_ptr[33] * p5 +
+ flt_ptr[48] * p6;
+ ptr[xx] = iclip_pixel((acc + 8) >> 4);
+ }
+ ptr += PXSTRIDE(stride);
+ }
+ left = &dst[x + 4 - 1]; /* next sub-block's left inputs are this one's last output column */
+ left_stride = PXSTRIDE(stride);
+ top += 4;
+ topleft = &top[-1];
+ }
+ top = &dst[PXSTRIDE(stride)]; /* the lower row just written is the top edge of the next pass */
+ dst = &dst[PXSTRIDE(stride) * 2];
+ }
+}
+
+/* Builds the CfL AC buffer of width x height entries: each entry is the sum
+ * of the luma samples it covers, scaled to 8x their average. The rightmost
+ * 4 * w_pad columns and bottom 4 * h_pad rows are replicated, not read, and
+ * the rounded total >> log2sz is finally subtracted from every entry. */
+static NOINLINE void
+cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
+         const int w_pad, const int h_pad, const int width, const int height,
+         const int ss_hor, const int ss_ver, const int log2sz)
+{
+    assert(w_pad >= 0 && w_pad * 4 < width);
+    assert(h_pad >= 0 && h_pad * 4 < height);
+
+    const int vis_w = width - 4 * w_pad, vis_h = height - 4 * h_pad;
+    const int scale = 1 + !ss_ver + !ss_hor;
+    int16_t *row = ac;
+    int x, y;
+
+    for (y = 0; y < vis_h; y++, row += width) {
+        for (x = 0; x < vis_w; x++) {
+            const pixel *const src = &ypx[x << ss_hor];
+            int luma = src[0];
+            if (ss_hor) luma += src[1];
+            if (ss_ver) luma += src[PXSTRIDE(stride)];
+            if (ss_ver && ss_hor) luma += src[1 + PXSTRIDE(stride)];
+            row[x] = luma << scale;
+        }
+        /* right padding repeats the last computed column */
+        for (; x < width; x++)
+            row[x] = row[x - 1];
+        ypx += PXSTRIDE(stride) << ss_ver;
+    }
+
+    /* bottom padding repeats the previous row */
+    for (; y < height; y++, row += width)
+        memcpy(row, row - width, width * sizeof(*row));
+
+    /* remove the rounded mean so the buffer only holds the AC part */
+    const int count = width * height;
+    int sum = (1 << log2sz) >> 1;
+    for (int i = 0; i < count; i++)
+        sum += ac[i];
+    sum >>= log2sz;
+
+    for (int i = 0; i < count; i++)
+        ac[i] -= sum;
+}
+/* cfl_ac_fn: defines cfl_ac_<lw>x<lh>_to_<cw>x<ch>_c as a cfl_ac_c() call; lw/lh only appear in the name. */
+#define cfl_ac_fn(lw, lh, cw, ch, ss_hor, ss_ver, log2sz) \
+static void cfl_ac_##lw##x##lh##_to_##cw##x##ch##_c(int16_t *const ac, \
+ const pixel *const ypx, \
+ const ptrdiff_t stride, \
+ const int w_pad, \
+ const int h_pad) \
+{ \
+ cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver, log2sz); \
+}
+/* luma subsampled in both directions (ss_hor = 1, ss_ver = 1) */
+cfl_ac_fn( 8, 8, 4, 4, 1, 1, 4)
+cfl_ac_fn( 8, 16, 4, 8, 1, 1, 5)
+cfl_ac_fn( 8, 32, 4, 16, 1, 1, 6)
+cfl_ac_fn(16, 8, 8, 4, 1, 1, 5)
+cfl_ac_fn(16, 16, 8, 8, 1, 1, 6)
+cfl_ac_fn(16, 32, 8, 16, 1, 1, 7)
+cfl_ac_fn(32, 8, 16, 4, 1, 1, 6)
+cfl_ac_fn(32, 16, 16, 8, 1, 1, 7)
+cfl_ac_fn(32, 32, 16, 16, 1, 1, 8)
+/* luma subsampled horizontally only (ss_hor = 1, ss_ver = 0) */
+cfl_ac_fn( 8, 4, 4, 4, 1, 0, 4)
+cfl_ac_fn( 8, 8, 4, 8, 1, 0, 5)
+cfl_ac_fn(16, 4, 8, 4, 1, 0, 5)
+cfl_ac_fn(16, 8, 8, 8, 1, 0, 6)
+cfl_ac_fn(16, 16, 8, 16, 1, 0, 7)
+cfl_ac_fn(32, 8, 16, 8, 1, 0, 7)
+cfl_ac_fn(32, 16, 16, 16, 1, 0, 8)
+cfl_ac_fn(32, 32, 16, 32, 1, 0, 9)
+/* no subsampling (ss_hor = 0, ss_ver = 0) */
+cfl_ac_fn( 4, 4, 4, 4, 0, 0, 4)
+cfl_ac_fn( 4, 8, 4, 8, 0, 0, 5)
+cfl_ac_fn( 4, 16, 4, 16, 0, 0, 6)
+cfl_ac_fn( 8, 4, 8, 4, 0, 0, 5)
+cfl_ac_fn( 8, 8, 8, 8, 0, 0, 6)
+cfl_ac_fn( 8, 16, 8, 16, 0, 0, 7)
+cfl_ac_fn( 8, 32, 8, 32, 0, 0, 8)
+cfl_ac_fn(16, 4, 16, 4, 0, 0, 6)
+cfl_ac_fn(16, 8, 16, 8, 0, 0, 7)
+cfl_ac_fn(16, 16, 16, 16, 0, 0, 8)
+cfl_ac_fn(16, 32, 16, 32, 0, 0, 9)
+cfl_ac_fn(32, 8, 32, 8, 0, 0, 8)
+cfl_ac_fn(32, 16, 32, 16, 0, 0, 9)
+cfl_ac_fn(32, 32, 32, 32, 0, 0, 10)
+
+/* Palette prediction: paint a w x h block where every pixel takes the pal[]
+ * entry selected by its index byte. idx is packed (w bytes per row); dst
+ * advances by PXSTRIDE(stride) per row. */
+static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
+                       const uint16_t *const pal, const uint8_t *idx,
+                       const int w, const int h)
+{
+    for (int row = 0; row < h; row++, idx += w, dst += PXSTRIDE(stride))
+        for (int col = 0; col < w; col++)
+            dst[col] = pal[idx[col]];
+}
+
+void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) { // install the C (_c) functions into the intra-prediction tables
+ c->intra_pred[DC_PRED ] = ipred_dc_c;
+ c->intra_pred[DC_128_PRED ] = ipred_dc_128_c;
+ c->intra_pred[TOP_DC_PRED ] = ipred_dc_top_c;
+ c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
+ c->intra_pred[HOR_PRED ] = ipred_h_c;
+ c->intra_pred[VERT_PRED ] = ipred_v_c;
+ c->intra_pred[PAETH_PRED ] = ipred_paeth_c;
+ c->intra_pred[SMOOTH_PRED ] = ipred_smooth_c;
+ c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
+ c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
+ c->intra_pred[Z1_PRED ] = ipred_z1_c;
+ c->intra_pred[Z2_PRED ] = ipred_z2_c;
+ c->intra_pred[Z3_PRED ] = ipred_z3_c;
+ c->intra_pred[FILTER_PRED ] = ipred_filter_c;
+
+ // cfl functions are split per chroma subsampling type: the first index is the pixel layout minus one, the second names the output (cw x ch) size of the cfl_ac_<lw>x<lh>_to_<cw>x<ch>_c function
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_4X4 ] = cfl_ac_8x8_to_4x4_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X8 ] = cfl_ac_8x16_to_4x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X16 ] = cfl_ac_8x32_to_4x16_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X4 ] = cfl_ac_16x8_to_8x4_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_8X8 ] = cfl_ac_16x16_to_8x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X16 ] = cfl_ac_16x32_to_8x16_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X4 ] = cfl_ac_32x8_to_16x4_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X8 ] = cfl_ac_32x16_to_16x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_16X16] = cfl_ac_32x32_to_16x16_c;
+
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_4X4 ] = cfl_ac_8x4_to_4x4_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_4X8 ] = cfl_ac_8x8_to_4x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X4 ] = cfl_ac_16x4_to_8x4_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_8X8 ] = cfl_ac_16x8_to_8x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X16 ] = cfl_ac_16x16_to_8x16_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X8 ] = cfl_ac_32x8_to_16x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_16X16] = cfl_ac_32x16_to_16x16_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X32] = cfl_ac_32x32_to_16x32_c;
+
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_4X4 ] = cfl_ac_4x4_to_4x4_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X8 ] = cfl_ac_4x8_to_4x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X16 ] = cfl_ac_4x16_to_4x16_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X4 ] = cfl_ac_8x4_to_8x4_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_8X8 ] = cfl_ac_8x8_to_8x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X16 ] = cfl_ac_8x16_to_8x16_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X32 ] = cfl_ac_8x32_to_8x32_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X4 ] = cfl_ac_16x4_to_16x4_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X8 ] = cfl_ac_16x8_to_16x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_16X16] = cfl_ac_16x16_to_16x16_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X32] = cfl_ac_16x32_to_16x32_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X8 ] = cfl_ac_32x8_to_32x8_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X16] = cfl_ac_32x16_to_32x16_c;
+ c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_32X32] = cfl_ac_32x32_to_32x32_c;
+ // cfl_pred holds one predictor per DC variant (DC, DC_128, TOP_DC, LEFT_DC)
+ c->cfl_pred[DC_PRED ] = ipred_cfl_c;
+ c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
+ c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
+ c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
+
+ c->pal_pred = pal_pred_c;
+ // the x86 init runs after every C assignment above; presumably it replaces entries with asm versions (TODO confirm)
+#if HAVE_ASM && ARCH_X86
+ bitfn(dav1d_intra_pred_dsp_init_x86)(c);
+#endif
+}
--- a/src/itx.c
+++ /dev/null
@@ -1,233 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/itx.h"
-
-#include "src/itx_1d.c"
-
-typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
- coef *out, ptrdiff_t out_s);
-
-static void NOINLINE
-inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
- coef *const coeff, const int eob,
- const int w, const int h, const int shift1, const int shift2,
- const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn)
-{
- int i, j;
- const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
- assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
- // Maximum value for h and w is 64
- coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
- const int is_rect2 = w * 2 == h || h * 2 == w;
-
- if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
- const int rnd1 = (1 << shift1) >> 1;
- for (i = 0; i < sh; i++) {
- if (w != sw || is_rect2) {
- for (j = 0; j < sw; j++) {
- in_mem[j] = coeff[i + j * sh];
- if (is_rect2)
- in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
- }
- first_1d_fn(in_mem, 1, &tmp[i * w], 1);
- } else {
- first_1d_fn(&coeff[i], sh, &tmp[i * w], 1);
- }
- for (j = 0; j < w; j++)
- tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
- }
-
- if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
- const int rnd2 = (1 << shift2) >> 1;
- for (i = 0; i < w; i++) {
- second_1d_fn(&tmp[i], w, out, 1);
- for (j = 0; j < h; j++)
- dst[i + j * PXSTRIDE(stride)] =
- iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
- ((out[j] + (rnd2)) >> shift2));
- }
- memset(coeff, 0, sizeof(*coeff) * sh * sw);
-}
-
-#define inv_txfm_fn(type1, type2, w, h, shift1, shift2) \
-static void \
-inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
- const ptrdiff_t stride, \
- coef *const coeff, \
- const int eob) \
-{ \
- inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
- inv_##type1##w##_1d, inv_##type2##h##_1d); \
-}
-
-#define inv_txfm_fn64(w, h, shift1, shift2) \
-inv_txfm_fn(dct, dct, w, h, shift1, shift2)
-
-#define inv_txfm_fn32(w, h, shift1, shift2) \
-inv_txfm_fn64(w, h, shift1, shift2) \
-inv_txfm_fn(identity, identity, w, h, shift1, shift2)
-
-#define inv_txfm_fn16(w, h, shift1, shift2) \
-inv_txfm_fn32(w, h, shift1, shift2) \
-inv_txfm_fn(adst, dct, w, h, shift1, shift2) \
-inv_txfm_fn(dct, adst, w, h, shift1, shift2) \
-inv_txfm_fn(adst, adst, w, h, shift1, shift2) \
-inv_txfm_fn(dct, flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, dct, w, h, shift1, shift2) \
-inv_txfm_fn(adst, flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, adst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(identity, dct, w, h, shift1, shift2) \
-inv_txfm_fn(dct, identity, w, h, shift1, shift2) \
-
-#define inv_txfm_fn84(w, h, shift1, shift2) \
-inv_txfm_fn16(w, h, shift1, shift2) \
-inv_txfm_fn(identity, flipadst, w, h, shift1, shift2) \
-inv_txfm_fn(flipadst, identity, w, h, shift1, shift2) \
-inv_txfm_fn(identity, adst, w, h, shift1, shift2) \
-inv_txfm_fn(adst, identity, w, h, shift1, shift2) \
-
-inv_txfm_fn84( 4, 4, 0, 4)
-inv_txfm_fn84( 4, 8, 0, 4)
-inv_txfm_fn84( 4, 16, 1, 4)
-inv_txfm_fn84( 8, 4, 0, 4)
-inv_txfm_fn84( 8, 8, 1, 4)
-inv_txfm_fn84( 8, 16, 1, 4)
-inv_txfm_fn32( 8, 32, 2, 4)
-inv_txfm_fn84(16, 4, 1, 4)
-inv_txfm_fn84(16, 8, 1, 4)
-inv_txfm_fn16(16, 16, 2, 4)
-inv_txfm_fn32(16, 32, 1, 4)
-inv_txfm_fn64(16, 64, 2, 4)
-inv_txfm_fn32(32, 8, 2, 4)
-inv_txfm_fn32(32, 16, 1, 4)
-inv_txfm_fn32(32, 32, 2, 4)
-inv_txfm_fn64(32, 64, 1, 4)
-inv_txfm_fn64(64, 16, 2, 4)
-inv_txfm_fn64(64, 32, 1, 4)
-inv_txfm_fn64(64, 64, 2, 4)
-
-static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
- coef *const coeff, const int eob)
-{
- int i, j;
- coef tmp[4 * 4], out[4];
-
- for (i = 0; i < 4; i++)
- inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
-
- for (i = 0; i < 4; i++) {
- inv_wht4_1d(&tmp[i], 4, out, 1, 1);
- for (j = 0; j < 4; j++)
- dst[i + j * PXSTRIDE(stride)] =
- iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
- }
- memset(coeff, 0, sizeof(*coeff) * 4 * 4);
-}
-
-void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
-#define assign_itx_all_fn64(w, h, pfx) \
- c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \
- inv_txfm_add_dct_dct_##w##x##h##_c
-
-#define assign_itx_all_fn32(w, h, pfx) \
- assign_itx_all_fn64(w, h, pfx); \
- c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
- inv_txfm_add_identity_identity_##w##x##h##_c
-
-#define assign_itx_all_fn16(w, h, pfx) \
- assign_itx_all_fn32(w, h, pfx); \
- c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
- inv_txfm_add_adst_dct_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
- inv_txfm_add_dct_adst_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
- inv_txfm_add_adst_adst_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
- inv_txfm_add_flipadst_adst_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
- inv_txfm_add_adst_flipadst_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
- inv_txfm_add_flipadst_dct_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
- inv_txfm_add_dct_flipadst_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
- inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
- inv_txfm_add_dct_identity_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
- inv_txfm_add_identity_dct_##w##x##h##_c
-
-#define assign_itx_all_fn84(w, h, pfx) \
- assign_itx_all_fn16(w, h, pfx); \
- c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
- inv_txfm_add_flipadst_identity_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
- inv_txfm_add_identity_flipadst_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
- inv_txfm_add_adst_identity_##w##x##h##_c; \
- c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
- inv_txfm_add_identity_adst_##w##x##h##_c; \
-
- memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */
-
- c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
- assign_itx_all_fn84( 4, 4, );
- assign_itx_all_fn84( 4, 8, R);
- assign_itx_all_fn84( 4, 16, R);
- assign_itx_all_fn84( 8, 4, R);
- assign_itx_all_fn84( 8, 8, );
- assign_itx_all_fn84( 8, 16, R);
- assign_itx_all_fn32( 8, 32, R);
- assign_itx_all_fn84(16, 4, R);
- assign_itx_all_fn84(16, 8, R);
- assign_itx_all_fn16(16, 16, );
- assign_itx_all_fn32(16, 32, R);
- assign_itx_all_fn64(16, 64, R);
- assign_itx_all_fn32(32, 8, R);
- assign_itx_all_fn32(32, 16, R);
- assign_itx_all_fn32(32, 32, );
- assign_itx_all_fn64(32, 64, R);
- assign_itx_all_fn64(64, 16, R);
- assign_itx_all_fn64(64, 32, R);
- assign_itx_all_fn64(64, 64, );
-
-#if HAVE_ASM && ARCH_X86
- bitfn(dav1d_itx_dsp_init_x86)(c);
-#endif
-}
--- /dev/null
+++ b/src/itx_tmpl.c
@@ -1,0 +1,233 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/itx.h"
+
+#include "src/itx_1d.c"
+
+typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
+ coef *out, ptrdiff_t out_s); // 1-D inverse transform kernel; in_s/out_s are presumably element strides for in/out (callers pass sh, w or 1)
+
+/* Generic 2-D inverse transform + add: first_1d_fn over the stored lines, round by shift1, second_1d_fn,
+ * round by shift2, add to dst with clipping, then zero the sh * sw coefficients. eob is unused here. */
+static void NOINLINE
+inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
+               coef *const coeff, const int eob,
+               const int w, const int h, const int shift1, const int shift2,
+               const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn)
+{
+    const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
+    // w and h must each lie in [4, 64]; the former "||" form was always true
+    assert(h >= 4 && h <= 64 && w >= 4 && w <= 64);
+    coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
+    const int is_rect2 = w * 2 == h || h * 2 == w;
+    const int rnd1 = (1 << shift1) >> 1, rnd2 = (1 << shift2) >> 1;
+    // in_mem[sw..w) stays zero: only the first sw inputs per line come from coeff
+    if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
+    for (int i = 0; i < sh; i++) {
+        if (w != sw || is_rect2) {
+            for (int j = 0; j < sw; j++) {
+                in_mem[j] = coeff[i + j * sh];
+                if (is_rect2) // 2:1 blocks: scale by 2896/4096 (~1/sqrt(2))
+                    in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
+            }
+            first_1d_fn(in_mem, 1, &tmp[i * w], 1);
+        } else {
+            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1);
+        }
+        for (int j = 0; j < w; j++)
+            tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
+    }
+    // lines sh..h of tmp get no first-pass output, so zero them for the second pass
+    if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
+    for (int i = 0; i < w; i++) {
+        second_1d_fn(&tmp[i], w, out, 1);
+        for (int j = 0; j < h; j++)
+            dst[i + j * PXSTRIDE(stride)] =
+                iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
+                            ((out[j] + (rnd2)) >> shift2));
+    }
+    memset(coeff, 0, sizeof(*coeff) * sh * sw);
+}
+
+#define inv_txfm_fn(type1, type2, w, h, shift1, shift2) \
+static void \
+inv_txfm_add_##type1##_##type2##_##w##x##h##_c(pixel *dst, \
+ const ptrdiff_t stride, \
+ coef *const coeff, \
+ const int eob) \
+{ \
+ inv_txfm_add_c(dst, stride, coeff, eob, w, h, shift1, shift2, \
+ inv_##type1##w##_1d, inv_##type2##h##_1d); \
+}
+// inv_txfm_fn() defines inv_txfm_add_<type1>_<type2>_<w>x<h>_c(), which runs inv_<type1><w>_1d first and inv_<type2><h>_1d second. fn64 instantiates dct_dct only.
+#define inv_txfm_fn64(w, h, shift1, shift2) \
+inv_txfm_fn(dct, dct, w, h, shift1, shift2)
+// fn32: everything in fn64 plus identity_identity.
+#define inv_txfm_fn32(w, h, shift1, shift2) \
+inv_txfm_fn64(w, h, shift1, shift2) \
+inv_txfm_fn(identity, identity, w, h, shift1, shift2)
+// fn16: everything in fn32 plus the dct/adst/flipadst mixes and identity paired with dct; fn84 then adds identity paired with adst/flipadst.
+#define inv_txfm_fn16(w, h, shift1, shift2) \
+inv_txfm_fn32(w, h, shift1, shift2) \
+inv_txfm_fn(adst, dct, w, h, shift1, shift2) \
+inv_txfm_fn(dct, adst, w, h, shift1, shift2) \
+inv_txfm_fn(adst, adst, w, h, shift1, shift2) \
+inv_txfm_fn(dct, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, dct, w, h, shift1, shift2) \
+inv_txfm_fn(adst, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, adst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(identity, dct, w, h, shift1, shift2) \
+inv_txfm_fn(dct, identity, w, h, shift1, shift2) \
+
+#define inv_txfm_fn84(w, h, shift1, shift2) \
+inv_txfm_fn16(w, h, shift1, shift2) \
+inv_txfm_fn(identity, flipadst, w, h, shift1, shift2) \
+inv_txfm_fn(flipadst, identity, w, h, shift1, shift2) \
+inv_txfm_fn(identity, adst, w, h, shift1, shift2) \
+inv_txfm_fn(adst, identity, w, h, shift1, shift2) \
+
+inv_txfm_fn84( 4, 4, 0, 4)
+inv_txfm_fn84( 4, 8, 0, 4)
+inv_txfm_fn84( 4, 16, 1, 4)
+inv_txfm_fn84( 8, 4, 0, 4)
+inv_txfm_fn84( 8, 8, 1, 4)
+inv_txfm_fn84( 8, 16, 1, 4)
+inv_txfm_fn32( 8, 32, 2, 4)
+inv_txfm_fn84(16, 4, 1, 4)
+inv_txfm_fn84(16, 8, 1, 4)
+inv_txfm_fn16(16, 16, 2, 4)
+inv_txfm_fn32(16, 32, 1, 4)
+inv_txfm_fn64(16, 64, 2, 4)
+inv_txfm_fn32(32, 8, 2, 4)
+inv_txfm_fn32(32, 16, 1, 4)
+inv_txfm_fn32(32, 32, 2, 4)
+inv_txfm_fn64(32, 64, 1, 4)
+inv_txfm_fn64(64, 16, 2, 4)
+inv_txfm_fn64(64, 32, 1, 4)
+inv_txfm_fn64(64, 64, 2, 4)
+
+// 4x4 inverse WHT + add: two inv_wht4_1d passes (last argument 0, then 1),
+// result added to dst with clipping; the 16 coefficients are cleared after.
+static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
+                                       coef *const coeff, const int eob)
+{
+    coef col[4], pass1[4 * 4];
+    for (int n = 0; n < 4; n++)
+        inv_wht4_1d(&coeff[n], 4, &pass1[n * 4], 1, 0);
+    for (int x = 0; x < 4; x++) {
+        inv_wht4_1d(&pass1[x], 4, col, 1, 1);
+        for (int y = 0; y < 4; y++) {
+            pixel *const px = &dst[x + y * PXSTRIDE(stride)];
+            *px = iclip_pixel(*px + col[y]);
+        }
+    }
+    memset(coeff, 0, sizeof(*coeff) * 4 * 4);
+}
+
+void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) { // fill c->itxfm_add[tx size][tx type] with the C (_c) functions
+#define assign_itx_all_fn64(w, h, pfx) \
+ c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \
+ inv_txfm_add_dct_dct_##w##x##h##_c
+ // fn64 sets DCT_DCT only; fn32 adds IDTX; fn16 adds the DCT/ADST/FLIPADST mixes plus H_DCT/V_DCT; fn84 adds the remaining H_/V_ entries
+#define assign_itx_all_fn32(w, h, pfx) \
+ assign_itx_all_fn64(w, h, pfx); \
+ c->itxfm_add[pfx##TX_##w##X##h][IDTX] = \
+ inv_txfm_add_identity_identity_##w##x##h##_c
+ // NOTE: table names and function names list the two 1-D types in opposite order (e.g. [DCT_ADST] gets ..._adst_dct_..., [H_DCT] gets ..._dct_identity_...)
+#define assign_itx_all_fn16(w, h, pfx) \
+ assign_itx_all_fn32(w, h, pfx); \
+ c->itxfm_add[pfx##TX_##w##X##h][DCT_ADST ] = \
+ inv_txfm_add_adst_dct_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][ADST_DCT ] = \
+ inv_txfm_add_dct_adst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][ADST_ADST] = \
+ inv_txfm_add_adst_adst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][ADST_FLIPADST] = \
+ inv_txfm_add_flipadst_adst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_ADST] = \
+ inv_txfm_add_adst_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][DCT_FLIPADST] = \
+ inv_txfm_add_flipadst_dct_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_DCT] = \
+ inv_txfm_add_dct_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][FLIPADST_FLIPADST] = \
+ inv_txfm_add_flipadst_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][H_DCT] = \
+ inv_txfm_add_dct_identity_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][V_DCT] = \
+ inv_txfm_add_identity_dct_##w##x##h##_c
+
+#define assign_itx_all_fn84(w, h, pfx) \
+ assign_itx_all_fn16(w, h, pfx); \
+ c->itxfm_add[pfx##TX_##w##X##h][H_FLIPADST] = \
+ inv_txfm_add_flipadst_identity_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][V_FLIPADST] = \
+ inv_txfm_add_identity_flipadst_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][H_ADST] = \
+ inv_txfm_add_adst_identity_##w##x##h##_c; \
+ c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
+ inv_txfm_add_identity_adst_##w##x##h##_c; \
+
+ memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */
+ // pfx is empty for square sizes (TX_NXN) and R for rectangular ones (RTX_WXH); the macro variant used per size matches the inv_txfm_fnNN instantiation of that size
+ c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+ assign_itx_all_fn84( 4, 4, );
+ assign_itx_all_fn84( 4, 8, R);
+ assign_itx_all_fn84( 4, 16, R);
+ assign_itx_all_fn84( 8, 4, R);
+ assign_itx_all_fn84( 8, 8, );
+ assign_itx_all_fn84( 8, 16, R);
+ assign_itx_all_fn32( 8, 32, R);
+ assign_itx_all_fn84(16, 4, R);
+ assign_itx_all_fn84(16, 8, R);
+ assign_itx_all_fn16(16, 16, );
+ assign_itx_all_fn32(16, 32, R);
+ assign_itx_all_fn64(16, 64, R);
+ assign_itx_all_fn32(32, 8, R);
+ assign_itx_all_fn32(32, 16, R);
+ assign_itx_all_fn32(32, 32, );
+ assign_itx_all_fn64(32, 64, R);
+ assign_itx_all_fn64(64, 16, R);
+ assign_itx_all_fn64(64, 32, R);
+ assign_itx_all_fn64(64, 64, );
+ // the x86 init runs after every C assignment above; presumably it replaces entries with asm versions (TODO confirm)
+#if HAVE_ASM && ARCH_X86
+ bitfn(dav1d_itx_dsp_init_x86)(c);
+#endif
+}
--- a/src/lf_apply.c
+++ /dev/null
@@ -1,306 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "common/intops.h"
-
-#include "src/lf_apply.h"
-
-static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
- const int have_left,
- const uint8_t (*lvl)[4],
- const ptrdiff_t b4_stride,
- const uint16_t (*const mask)[3][2],
- pixel *dst, const ptrdiff_t ls,
- const int w,
- const int starty4, const int endy4)
-{
- const Dav1dDSPContext *const dsp = f->dsp;
-
- // filter edges between columns (e.g. block1 | block2)
- for (int x = 0; x < w; x++) {
- if (!have_left && !x) continue;
- uint32_t hmask[4];
- if (!starty4) {
- hmask[0] = mask[x][0][0];
- hmask[1] = mask[x][1][0];
- hmask[2] = mask[x][2][0];
- if (endy4 > 16) {
- hmask[0] |= mask[x][0][1] << 16;
- hmask[1] |= mask[x][1][1] << 16;
- hmask[2] |= mask[x][2][1] << 16;
- }
- } else {
- hmask[0] = mask[x][0][1];
- hmask[1] = mask[x][1][1];
- hmask[2] = mask[x][2][1];
- }
- hmask[3] = 0;
- dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
- (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
- &f->lf.lim_lut, endy4 - starty4);
- }
-}
-
-static inline void filter_plane_rows_y(const Dav1dFrameContext *const f,
- const int have_top,
- const uint8_t (*lvl)[4],
- const ptrdiff_t b4_stride,
- const uint16_t (*const mask)[3][2],
- pixel *dst, const ptrdiff_t ls,
- const int w,
- const int starty4, const int endy4)
-{
- const Dav1dDSPContext *const dsp = f->dsp;
-
- // block1
- // filter edges between rows (e.g. ------)
- // block2
- for (int y = starty4; y < endy4;
- y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
- {
- if (!have_top && !y) continue;
- const uint32_t vmask[4] = {
- mask[y][0][0] | (mask[y][0][1] << 16),
- mask[y][1][0] | (mask[y][1][1] << 16),
- mask[y][2][0] | (mask[y][2][1] << 16),
- 0,
- };
- dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
- (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
- &f->lf.lim_lut, w);
- }
-}
-
-static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
- const int have_left,
- const uint8_t (*lvl)[4],
- const ptrdiff_t b4_stride,
- const uint16_t (*const mask)[2][2],
- pixel *const u, pixel *const v,
- const ptrdiff_t ls, const int w,
- const int starty4, const int endy4,
- const int ss_ver)
-{
- const Dav1dDSPContext *const dsp = f->dsp;
-
- // filter edges between columns (e.g. block1 | block2)
- for (int x = 0; x < w; x++) {
- if (!have_left && !x) continue;
- uint32_t hmask[3];
- if (!starty4) {
- hmask[0] = mask[x][0][0];
- hmask[1] = mask[x][1][0];
- if (endy4 > (16 >> ss_ver)) {
- hmask[0] |= mask[x][0][1] << (16 >> ss_ver);
- hmask[1] |= mask[x][1][1] << (16 >> ss_ver);
- }
- } else {
- hmask[0] = mask[x][0][1];
- hmask[1] = mask[x][1][1];
- }
- hmask[2] = 0;
- dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
- (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
- &f->lf.lim_lut, endy4 - starty4);
- dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
- (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
- &f->lf.lim_lut, endy4 - starty4);
- }
-}
-
-static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
- const int have_top,
- const uint8_t (*lvl)[4],
- const ptrdiff_t b4_stride,
- const uint16_t (*const mask)[2][2],
- pixel *const u, pixel *const v,
- const ptrdiff_t ls, const int w,
- const int starty4, const int endy4,
- const int ss_hor)
-{
- const Dav1dDSPContext *const dsp = f->dsp;
- ptrdiff_t off_l = 0;
-
- // block1
- // filter edges between rows (e.g. ------)
- // block2
- for (int y = starty4; y < endy4;
- y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
- {
- if (!have_top && !y) continue;
- const uint32_t vmask[3] = {
- mask[y][0][0] | (mask[y][0][1] << (16 >> ss_hor)),
- mask[y][1][0] | (mask[y][1][1] << (16 >> ss_hor)),
- 0,
- };
- dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
- (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
- &f->lf.lim_lut, w);
- dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
- (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
- &f->lf.lim_lut, w);
- }
-}
-
-void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
- pixel *const p[3], Av1Filter *const lflvl,
- int sby, const int start_of_tile_row)
-{
- int x, have_left;
- // Don't filter outside the frame
- const int hy4 = (f->cur.p.p.h + 3) >> 2;
- const int have_top = sby > 0;
- const int is_sb64 = !f->seq_hdr.sb128;
- const int starty4 = (sby & is_sb64) << 4;
- const int sbsz = 32 >> is_sb64;
- const int sbl2 = 5 - is_sb64;
- const int halign = (f->bh + 31) & ~31;
- const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
- const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
- const unsigned endy4 = starty4 + imin(hy4 - sby * sbsz, sbsz);
- const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
-
- // fix lpf strength at tile col boundaries
- const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
- const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
- for (int tile_col = 1;; tile_col++) {
- x = f->frame_hdr.tiling.col_start_sb[tile_col];
- if ((x << sbl2) >= f->bw) break;
- const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
- x >>= is_sb64;
-
- uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
- for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
- const int sidx = mask >= 0x10000;
- const unsigned smask = mask >> (sidx << 4);
- const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
- !!(y_hmask[1][sidx] & smask);
- y_hmask[2][sidx] &= ~smask;
- y_hmask[1][sidx] &= ~smask;
- y_hmask[0][sidx] &= ~smask;
- y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
- }
-
- if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
- uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
- for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
- y++, uv_mask <<= 1)
- {
- const int sidx = uv_mask >= vmax;
- const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
- const int idx = !!(uv_hmask[1][sidx] & smask);
- uv_hmask[1][sidx] &= ~smask;
- uv_hmask[0][sidx] &= ~smask;
- uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
- }
- }
- lpf_y += halign;
- lpf_uv += halign >> ss_ver;
- }
-
- // fix lpf strength at tile row boundaries
- if (start_of_tile_row) {
- const BlockContext *a;
- for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
- x < f->sb128w; x++, a++)
- {
- uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
- for (unsigned mask = 1, i = 0; i < 32; mask <<= 1, i++) {
- const int sidx = mask >= 0x10000;
- const unsigned smask = mask >> (sidx << 4);
- const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
- !!(y_vmask[1][sidx] & smask);
- y_vmask[2][sidx] &= ~smask;
- y_vmask[1][sidx] &= ~smask;
- y_vmask[0][sidx] &= ~smask;
- y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
- }
-
- if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
- uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
- for (unsigned uv_mask = 1, i = 0; i < (32U >> ss_hor); uv_mask <<= 1, i++) {
- const int sidx = uv_mask >= hmax;
- const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
- const int idx = !!(uv_vmask[1][sidx] & smask);
- uv_vmask[1][sidx] &= ~smask;
- uv_vmask[0][sidx] &= ~smask;
- uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
- }
- }
- }
- }
-
- pixel *ptr;
- uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
- for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
- x++, have_left = 1, ptr += 128, level_ptr += 32)
- {
- filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
- lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
- imin(32, f->bw - x * 32), starty4, endy4);
- }
-
- level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
- for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
- filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
- lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],
- imin(32, f->bw - x * 32), starty4, endy4);
- }
-
- if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v)
- return;
-
- ptrdiff_t uv_off;
- level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
- for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
- x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
- {
- filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
- lflvl[x].filter_uv[0],
- &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
- (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
- starty4 >> ss_ver, uv_endy4, ss_ver);
- }
-
- level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
- for (uv_off = 0, x = 0; x < f->sb128w;
- x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
- {
- filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
- lflvl[x].filter_uv[1],
- &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
- (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
- starty4 >> ss_ver, uv_endy4, ss_hor);
- }
-}
--- /dev/null
+++ b/src/lf_apply_tmpl.c
@@ -1,0 +1,306 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "common/intops.h"
+
+#include "src/lf_apply.h"
+
+static inline void filter_plane_cols_y(const Dav1dFrameContext *const f,
+                                       const int have_left,
+                                       const uint8_t (*lvl)[4],
+                                       const ptrdiff_t b4_stride,
+                                       const uint16_t (*const mask)[3][2],
+                                       pixel *dst, const ptrdiff_t ls,
+                                       const int w,
+                                       const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    // Luma: for each of the w 4-pixel columns, pack mask[x]'s 16-bit halves into 32-bit words and
+    // filter edges between columns (e.g. block1 | block2) via dsp->lf.loop_filter_sb[0][0]
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue; // no left neighbour: skip the first column's edge
+        uint32_t hmask[4];
+        if (!starty4) { // starting at row 0: half [0] supplies the low 16 bits
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            hmask[2] = mask[x][2][0];
+            if (endy4 > 16) { // rows continue past 16: half [1] supplies bits 16..31
+                hmask[0] |= (unsigned) mask[x][0][1] << 16; // unsigned: uint16_t promotes to int, and
+                hmask[1] |= (unsigned) mask[x][1][1] << 16; // shifting 0x8000 or more left by 16 would
+                hmask[2] |= (unsigned) mask[x][2][1] << 16; // overflow a signed int (undefined)
+            }
+        } else { // otherwise only half [1] is used, in the low 16 bits
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+            hmask[2] = mask[x][2][1];
+        }
+        hmask[3] = 0;
+        dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][0], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4);
+    }
+}
+
+static inline void filter_plane_rows_y(const Dav1dFrameContext *const f,
+                                       const int have_top,
+                                       const uint8_t (*lvl)[4],
+                                       const ptrdiff_t b4_stride,
+                                       const uint16_t (*const mask)[3][2],
+                                       pixel *dst, const ptrdiff_t ls,
+                                       const int w,
+                                       const int starty4, const int endy4)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    // Luma: for rows starty4..endy4-1 (4 pixel rows each), join mask[y]'s two halves and call loop_filter_sb[0][1]
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue; // no row above: skip the very first edge
+        const uint32_t vmask[4] = { // cast before << 16: the promoted int would overflow for values >= 0x8000
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << 16),
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << 16),
+            mask[y][2][0] | ((unsigned) mask[y][2][1] << 16),
+            0,
+        };
+        dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
+                                     &f->lf.lim_lut, w);
+    }
+}
+
+static inline void filter_plane_cols_uv(const Dav1dFrameContext *const f,
+                                        const int have_left,
+                                        const uint8_t (*lvl)[4],
+                                        const ptrdiff_t b4_stride,
+                                        const uint16_t (*const mask)[2][2],
+                                        pixel *const u, pixel *const v,
+                                        const ptrdiff_t ls, const int w,
+                                        const int starty4, const int endy4,
+                                        const int ss_ver)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    // Chroma: one mask per column is built and used for both u and v; only the level entry differs ([2] vs [3]).
+    // filter edges between columns (e.g. block1 | block2)
+    for (int x = 0; x < w; x++) {
+        if (!have_left && !x) continue; // no left neighbour: skip the first column's edge
+        uint32_t hmask[3];
+        if (!starty4) { // starting at row 0: half [0] supplies the low bits
+            hmask[0] = mask[x][0][0];
+            hmask[1] = mask[x][1][0];
+            if (endy4 > (16 >> ss_ver)) { // rows continue into the second half: append half [1] above the first
+                hmask[0] |= (unsigned) mask[x][0][1] << (16 >> ss_ver); // unsigned: with ss_ver == 0 the promoted
+                hmask[1] |= (unsigned) mask[x][1][1] << (16 >> ss_ver); // int would overflow on << 16 (undefined)
+            }
+        } else { // otherwise only half [1] is used, in the low bits
+            hmask[0] = mask[x][0][1];
+            hmask[1] = mask[x][1][1];
+        }
+        hmask[2] = 0;
+        dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4);
+        dsp->lf.loop_filter_sb[1][0](&v[x * 4], ls, hmask,
+                                     (const uint8_t(*)[4]) &lvl[x][3], b4_stride,
+                                     &f->lf.lim_lut, endy4 - starty4);
+    }
+}
+
+static inline void filter_plane_rows_uv(const Dav1dFrameContext *const f,
+                                        const int have_top,
+                                        const uint8_t (*lvl)[4],
+                                        const ptrdiff_t b4_stride,
+                                        const uint16_t (*const mask)[2][2],
+                                        pixel *const u, pixel *const v,
+                                        const ptrdiff_t ls, const int w,
+                                        const int starty4, const int endy4,
+                                        const int ss_hor)
+{
+    const Dav1dDSPContext *const dsp = f->dsp;
+    ptrdiff_t off_l = 0; // pixel offset of the current row, shared by the u and v planes
+    // Chroma: one mask per row is built and used for both u and v; only the level entry differs ([2] vs [3]).
+    //                                 block1
+    // filter edges between rows (e.g. ------)
+    //                                 block2
+    for (int y = starty4; y < endy4;
+         y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
+    {
+        if (!have_top && !y) continue; // no row above: skip the very first edge
+        const uint32_t vmask[3] = { // cast before the shift: with ss_hor == 0 the promoted int would overflow on << 16
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << (16 >> ss_hor)),
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << (16 >> ss_hor)),
+            0,
+        };
+        dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
+                                     &f->lf.lim_lut, w);
+        dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
+                                     (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
+                                     &f->lf.lim_lut, w);
+    }
+}
+
+void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
+                                    pixel *const p[3], Av1Filter *const lflvl,
+                                    int sby, const int start_of_tile_row)
+{
+    int x, have_left;
+    // Loop-filter superblock row sby: adjust the masks at tile edges, filter p[0], then p[1]/p[2]. Don't filter outside the frame
+    const int hy4 = (f->cur.p.p.h + 3) >> 2; // h rounded up to whole 4-pixel units
+    const int have_top = sby > 0;
+    const int is_sb64 = !f->seq_hdr.sb128;
+    const int starty4 = (sby & is_sb64) << 4; // 16 for odd sby when !sb128, otherwise 0
+    const int sbsz = 32 >> is_sb64; // 32 with sb128, else 16 (same units as hy4)
+    const int sbl2 = 5 - is_sb64; // log2 of sbsz
+    const int halign = (f->bh + 31) & ~31; // f->bh rounded up to a multiple of 32
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor; // bits per chroma mask half (vertical / horizontal)
+    const unsigned vmax = 1 << vmask, hmax = 1 << hmask; // first bit value that belongs to the second half
+    const unsigned endy4 = starty4 + imin(hy4 - sby * sbsz, sbsz); // rows left in the frame, capped at sbsz
+    const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver; // endy4 in chroma rows, rounded up
+
+    // fix lpf strength at tile col boundaries: cap each edge's mask index at the stored tx_lpf_right_edge value
+    const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
+    const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
+    for (int tile_col = 1;; tile_col++) {
+        x = f->frame_hdr.tiling.col_start_sb[tile_col];
+        if ((x << sbl2) >= f->bw) break; // this start is at or beyond f->bw: no more boundaries
+        const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor; // column inside the lflvl entry
+        x >>= is_sb64; // x now indexes lflvl
+
+        uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
+        for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
+            const int sidx = mask >= 0x10000; // 1 once the bit lies in the second 16-bit half
+            const unsigned smask = mask >> (sidx << 4); // the same bit, relative to its half
+            const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
+                                !!(y_hmask[1][sidx] & smask);
+            y_hmask[2][sidx] &= ~smask; // clear the bit everywhere ...
+            y_hmask[1][sidx] &= ~smask;
+            y_hmask[0][sidx] &= ~smask;
+            y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask; // ... and set it at the capped index
+        }
+
+        if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) { // same fix-up for the chroma masks (two indices only)
+            uint16_t (*const uv_hmask)[2] = lflvl[x].filter_uv[0][cbx4];
+            for (unsigned y = starty4 >> ss_ver, uv_mask = 1 << y; y < uv_endy4;
+                 y++, uv_mask <<= 1)
+            {
+                const int sidx = uv_mask >= vmax;
+                const unsigned smask = uv_mask >> (sidx << (4 - ss_ver));
+                const int idx = !!(uv_hmask[1][sidx] & smask);
+                uv_hmask[1][sidx] &= ~smask;
+                uv_hmask[0][sidx] &= ~smask;
+                uv_hmask[imin(idx, lpf_uv[y - (starty4 >> ss_ver)])][sidx] |= smask;
+            }
+        }
+        lpf_y += halign; // step to the next tile column's run of entries
+        lpf_uv += halign >> ss_ver;
+    }
+
+    // fix lpf strength at tile row boundaries: same capping, against tx_lpf_y / tx_lpf_uv of the contexts in f->a
+    if (start_of_tile_row) {
+        const BlockContext *a;
+        for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
+             x < f->sb128w; x++, a++)
+        {
+            uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
+            for (unsigned mask = 1, i = 0; i < 32; mask <<= 1, i++) { // all 32 column bits of this entry
+                const int sidx = mask >= 0x10000;
+                const unsigned smask = mask >> (sidx << 4);
+                const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
+                                    !!(y_vmask[1][sidx] & smask);
+                y_vmask[2][sidx] &= ~smask;
+                y_vmask[1][sidx] &= ~smask;
+                y_vmask[0][sidx] &= ~smask;
+                y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
+            }
+
+            if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+                uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
+                for (unsigned uv_mask = 1, i = 0; i < (32U >> ss_hor); uv_mask <<= 1, i++) {
+                    const int sidx = uv_mask >= hmax;
+                    const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
+                    const int idx = !!(uv_vmask[1][sidx] & smask);
+                    uv_vmask[1][sidx] &= ~smask;
+                    uv_vmask[0][sidx] &= ~smask;
+                    uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
+                }
+            }
+        }
+    }
+
+    pixel *ptr;
+    uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
+    for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w; // p[0]: edges between columns, over the whole row first
+         x++, have_left = 1, ptr += 128, level_ptr += 32)
+    {
+        filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
+                            imin(32, f->bw - x * 32), starty4, endy4);
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * sby * sbsz; // rewind for the second pass
+    for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) { // p[0]: edges between rows
+        filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
+                            lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],
+                            imin(32, f->bw - x * 32), starty4, endy4);
+    }
+
+    if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v)
+        return; // both chroma levels are zero: p[1] and p[2] are left untouched
+
+    ptrdiff_t uv_off;
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+    for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w; // p[1]/p[2]: edges between columns
+         x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+    {
+        filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[0],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
+                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
+                             starty4 >> ss_ver, uv_endy4, ss_ver);
+    }
+
+    level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
+    for (uv_off = 0, x = 0; x < f->sb128w; // p[1]/p[2]: edges between rows
+         x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
+    {
+        filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
+                             lflvl[x].filter_uv[1],
+                             &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
+                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
+                             starty4 >> ss_ver, uv_endy4, ss_hor);
+    }
+}
--- a/src/loopfilter.c
+++ /dev/null
@@ -1,246 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdlib.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/loopfilter.h"
-
-static NOINLINE void
-loop_filter(pixel *dst, int E, int I, int H,
- const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd)
-{
- const int F = 1 << (BITDEPTH - 8);
- E <<= BITDEPTH - 8;
- I <<= BITDEPTH - 8;
- H <<= BITDEPTH - 8;
-
- for (int i = 0; i < 4; i++, dst += stridea) {
- int p6, p5, p4, p3, p2;
- int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
- int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
- int q2, q3, q4, q5, q6;
- int fm, flat8out, flat8in;
-
- fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
- abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
-
- if (wd > 4) {
- p2 = dst[strideb * -3];
- q2 = dst[strideb * +2];
-
- fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
-
- if (wd > 6) {
- p3 = dst[strideb * -4];
- q3 = dst[strideb * +3];
-
- fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
- }
- }
- if (!fm) continue;
-
- if (wd >= 16) {
- p6 = dst[strideb * -7];
- p5 = dst[strideb * -6];
- p4 = dst[strideb * -5];
- q4 = dst[strideb * +4];
- q5 = dst[strideb * +5];
- q6 = dst[strideb * +6];
-
- flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
- abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
- abs(q5 - q0) <= F && abs(q6 - q0) <= F;
- }
-
- if (wd >= 6)
- flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
- abs(q1 - q0) <= F && abs(q2 - q0) <= F;
-
- if (wd >= 8)
- flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
-
- if (wd >= 16 && (flat8out & flat8in)) {
- dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
- p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
- dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
- p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
- dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
- p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
- dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
- p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
- dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
- p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
- dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
- q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
- dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
- q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
- dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
- q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
- dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
- q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
- dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
- q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
- dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
- q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
- dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
- q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
- } else if (wd >= 8 && flat8in) {
- dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
- dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
- dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
- dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
- dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
- dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
- } else if (wd == 6 && flat8in) {
- dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
- dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
- dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
- dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
- } else {
- const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
-
-#define iclip_diff(v) iclip(v, -128 * (1 << (BITDEPTH - 8)), \
- 128 * (1 << (BITDEPTH - 8)) - 1)
-
- if (hev) {
- int f = iclip_diff(p1 - q1), f1, f2;
- f = iclip_diff(3 * (q0 - p0) + f);
-
- f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
- f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
-
- dst[strideb * -1] = iclip_pixel(p0 + f2);
- dst[strideb * +0] = iclip_pixel(q0 - f1);
- } else {
- int f = iclip_diff(3 * (q0 - p0)), f1, f2;
-
- f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
- f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
-
- dst[strideb * -1] = iclip_pixel(p0 + f2);
- dst[strideb * +0] = iclip_pixel(q0 - f1);
-
- f = (f1 + 1) >> 1;
- dst[strideb * -2] = iclip_pixel(p1 + f);
- dst[strideb * +1] = iclip_pixel(q1 - f);
- }
-#undef iclip_diff
- }
- }
-}
-
-static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
- const uint32_t *const vmask,
- const uint8_t (*l)[4], ptrdiff_t b4_stride,
- const Av1FilterLUT *lut, const int h)
-{
- const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1);
- for (unsigned y = 1; vm & ~(y - 1);
- y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
- {
- if (vm & y) {
- const int L = l[0][0] ? l[0][0] : l[-1][0];
- if (!L) continue;
- const int H = L >> 4;
- const int E = lut->e[L], I = lut->i[L];
- const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
- loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx);
- }
- }
-}
-
-static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
- const uint32_t *const vmask,
- const uint8_t (*l)[4], ptrdiff_t b4_stride,
- const Av1FilterLUT *lut, const int w)
-{
- const unsigned vm = vmask[0] | vmask[1] | vmask[2];
- for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
- if (vm & x) {
- const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
- if (!L) continue;
- const int H = L >> 4;
- const int E = lut->e[L], I = lut->i[L];
- const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
- loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx);
- }
- }
-}
-
-static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
- const uint32_t *const vmask,
- const uint8_t (*l)[4], ptrdiff_t b4_stride,
- const Av1FilterLUT *lut, const int h)
-{
- const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1);
- for (unsigned y = 1; vm & ~(y - 1);
- y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
- {
- if (vm & y) {
- const int L = l[0][0] ? l[0][0] : l[-1][0];
- if (!L) continue;
- const int H = L >> 4;
- const int E = lut->e[L], I = lut->i[L];
- const int idx = !!(vmask[1] & y);
- loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx);
- }
- }
-}
-
-static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
- const uint32_t *const vmask,
- const uint8_t (*l)[4], ptrdiff_t b4_stride,
- const Av1FilterLUT *lut, const int w)
-{
- const unsigned vm = vmask[0] | vmask[1];
- for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
- if (vm & x) {
- const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
- if (!L) continue;
- const int H = L >> 4;
- const int E = lut->e[L], I = lut->i[L];
- const int idx = !!(vmask[1] & x);
- loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx);
- }
- }
-}
-
-void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
- c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
- c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
- c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
- c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
-
-#if HAVE_ASM && ARCH_X86
- bitfn(dav1d_loop_filter_dsp_init_x86)(c);
-#endif
-}
--- /dev/null
+++ b/src/loopfilter_tmpl.c
@@ -1,0 +1,246 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/loopfilter.h"
+
+static NOINLINE void
+loop_filter(pixel *dst, int E, int I, int H,
+            const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd)
+{
+    const int F = 1 << (BITDEPTH - 8); // flatness threshold, scaled with the bit depth
+    E <<= BITDEPTH - 8; // E, I and H are likewise shifted up by BITDEPTH - 8
+    I <<= BITDEPTH - 8;
+    H <<= BITDEPTH - 8;
+    // Filter 4 positions, stepping by stridea; the p* taps sit at negative and the q* taps at non-negative multiples of strideb.
+    for (int i = 0; i < 4; i++, dst += stridea) {
+        int p6, p5, p4, p3, p2; // loaded below only for the widths that use them
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+        int q2, q3, q4, q5, q6;
+        int fm, flat8out, flat8in;
+
+        fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
+             abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
+
+        if (wd > 4) { // wider filters also test p2/q2 against I
+            p2 = dst[strideb * -3];
+            q2 = dst[strideb * +2];
+
+            fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
+
+            if (wd > 6) { // ... and p3/q3
+                p3 = dst[strideb * -4];
+                q3 = dst[strideb * +3];
+
+                fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
+            }
+        }
+        if (!fm) continue; // a threshold test failed: this position is left untouched
+
+        if (wd >= 16) { // outer taps: all within F of p0 / q0?
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+
+            flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
+                       abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
+                       abs(q5 - q0) <= F && abs(q6 - q0) <= F;
+        }
+
+        if (wd >= 6) // inner taps: all within F of p0 / q0?
+            flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
+                      abs(q1 - q0) <= F && abs(q2 - q0) <= F;
+
+        if (wd >= 8)
+            flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
+
+        if (wd >= 16 && (flat8out & flat8in)) { // widest filter: rewrites 6 pixels on each side
+            dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
+                                 p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
+                                 p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
+                                 p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
+                                 p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+                                 p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                 q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                 q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                 q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
+            dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+                                 q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+                                 q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+                                 q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
+                                 q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) { // rewrites 3 pixels on each side
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else if (wd == 6 && flat8in) { // rewrites 2 pixels on each side
+            dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
+            dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
+            dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
+            dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
+        } else { // fallback: clipped adjustment of the pixels next to the edge
+            const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
+
+#define iclip_diff(v) iclip(v, -128 * (1 << (BITDEPTH - 8)), \
+                            128 * (1 << (BITDEPTH - 8)) - 1)
+
+            if (hev) { // a step next to the edge exceeds H: only p0 and q0 are changed
+                int f = iclip_diff(p1 - q1), f1, f2;
+                f = iclip_diff(3 * (q0 - p0) + f);
+
+                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
+                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+            } else { // otherwise p1 and q1 are changed too, by half of f1 (rounded)
+                int f = iclip_diff(3 * (q0 - p0)), f1, f2;
+
+                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
+                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+
+                f = (f1 + 1) >> 1;
+                dst[strideb * -2] = iclip_pixel(p1 + f);
+                dst[strideb * +1] = iclip_pixel(q1 - f);
+            }
+#undef iclip_diff
+        }
+    }
+}
+
+static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
+                                   const uint32_t *const vmask,
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                   const Av1FilterLUT *lut, const int h)
+{   // Walks down one column edge: every set bit in vmask[0..2] selects a group of 4 pixel rows to filter at dst.
+    const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1); // any width, first h bits only
+    for (unsigned y = 1; vm & ~(y - 1); // y is a one-hot bit; stop once vm has no bit at or above it
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0]; // this entry's level, else the preceding entry's
+            if (!L) continue; // both are zero: nothing is filtered here
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y); // 0, 1 or 2 -> wd 4, 8 or 16
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx); // taps run along the row (strideb = 1)
+        }
+    }
+}
+
+static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
+                                   const uint32_t *const vmask,
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                   const Av1FilterLUT *lut, const int w)
+{   // Walks along one row edge: every set bit in vmask[0..2] selects 4 adjacent pixels to filter. w is not read here.
+    const unsigned vm = vmask[0] | vmask[1] | vmask[2]; // edges of any width
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) { // x is a one-hot bit; stop past vm's top bit
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0]; // this entry's level, else the one b4_stride before
+            if (!L) continue; // both are zero: nothing is filtered here
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x); // 0, 1 or 2 -> wd 4, 8 or 16
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx); // taps run down the column
+        }
+    }
+}
+
+static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+                                    const uint32_t *const vmask,
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                    const Av1FilterLUT *lut, const int h)
+{   // Same walk as loop_filter_h_sb128y_c, but with two mask words and filter widths 4 or 6.
+    const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1); // either width, first h bits only
+    for (unsigned y = 1; vm & ~(y - 1); // y is a one-hot bit; stop once vm has no bit at or above it
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0]; // this entry's level, else the preceding entry's
+            if (!L) continue; // both are zero: nothing is filtered here
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & y); // 0 or 1 -> wd 4 or 6
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx); // taps run along the row (strideb = 1)
+        }
+    }
+}
+
+static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+                                    const uint32_t *const vmask,
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                    const Av1FilterLUT *lut, const int w)
+{   // Same walk as loop_filter_v_sb128y_c, but with two mask words and filter widths 4 or 6. w is not read here.
+    const unsigned vm = vmask[0] | vmask[1]; // edges of either width
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) { // x is a one-hot bit; stop past vm's top bit
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0]; // this entry's level, else the one b4_stride before
+            if (!L) continue; // both are zero: nothing is filtered here
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & x); // 0 or 1 -> wd 4 or 6
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx); // taps run down the column
+        }
+    }
+}
+
+void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
+    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c; // install the C implementations first;
+    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c; // first index: 0 = the *y_c pair, 1 = the *uv_c pair,
+    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c; // second index: 0 = the h variant, 1 = the v variant
+    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_loop_filter_dsp_init_x86)(c); // x86 init runs afterwards on the same table (presumably to override entries)
+#endif
+}
--- a/src/looprestoration.c
+++ /dev/null
@@ -1,577 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdlib.h>
-
-#include "common/intops.h"
-
-#include "src/looprestoration.h"
-#include "src/tables.h"
-
-// 256 * 1.5 + 3 + 3 = 390
-#define REST_UNIT_STRIDE (390)
-
-// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
-// TODO Chroma only requires 2 rows of padding.
-static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
- const pixel (*left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
- int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
-{
- const int have_left = !!(edges & LR_HAVE_LEFT);
- const int have_right = !!(edges & LR_HAVE_RIGHT);
-
- // Copy more pixels if we don't have to pad them
- unit_w += 3 * have_left + 3 * have_right;
- pixel *dst_l = dst + 3 * !have_left;
- p -= 3 * have_left;
- lpf -= 3 * have_left;
-
- if (edges & LR_HAVE_TOP) {
- // Copy previous loop filtered rows
- const pixel *const above_1 = lpf;
- const pixel *const above_2 = above_1 + PXSTRIDE(lpf_stride);
- pixel_copy(dst_l, above_1, unit_w);
- pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
- pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
- } else {
- // Pad with first row
- pixel_copy(dst_l, p, unit_w);
- pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
- pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
- if (have_left) {
- pixel_copy(dst_l, &left[0][1], 3);
- pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
- pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
- }
- }
-
- pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
- if (edges & LR_HAVE_BOTTOM) {
- // Copy next loop filtered rows
- const pixel *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);
- const pixel *const below_2 = below_1 + PXSTRIDE(lpf_stride);
- pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
- pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
- pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
- } else {
- // Pad with last row
- const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
- pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
- pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
- pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
- if (have_left) {
- pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
- pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
- pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
- }
- }
-
- // Inner UNIT_WxSTRIPE_H
- for (int j = 0; j < stripe_h; j++) {
- pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
- dst_tl += REST_UNIT_STRIDE;
- p += PXSTRIDE(p_stride);
- }
-
- if (!have_right) {
- pixel *pad = dst_l + unit_w;
- pixel *row_last = &dst_l[unit_w - 1];
- // Pad 3x(STRIPE_H+6) with last column
- for (int j = 0; j < stripe_h + 6; j++) {
- pixel_set(pad, *row_last, 3);
- pad += REST_UNIT_STRIDE;
- row_last += REST_UNIT_STRIDE;
- }
- }
-
- if (!have_left) {
- // Pad 3x(STRIPE_H+6) with first column
- for (int j = 0; j < stripe_h + 6; j++) {
- pixel_set(dst, *dst_l, 3);
- dst += REST_UNIT_STRIDE;
- dst_l += REST_UNIT_STRIDE;
- }
- } else {
- dst += 3 * REST_UNIT_STRIDE;
- for (int j = 0; j < stripe_h; j++) {
- pixel_copy(dst, &left[j][1], 3);
- dst += REST_UNIT_STRIDE;
- }
- }
-}
-
-// FIXME Could split into luma and chroma specific functions,
-// (since first and last tops are always 0 for chroma)
-// FIXME Could implement a version that requires less temporary memory
-// (should be possible to implement with only 6 rows of temp storage)
-static void wiener_c(pixel *p, const ptrdiff_t p_stride,
- const pixel (*const left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
- const int w, const int h,
- const int16_t filterh[7], const int16_t filterv[7],
- const enum LrEdgeFlags edges)
-{
- // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
- // of padding above and below
- pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
- pixel *tmp_ptr = tmp;
-
- padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
-
- // Values stored between horizontal and vertical filtering don't
- // fit in a uint8_t.
- uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
- uint16_t *hor_ptr = hor;
-
- const int round_bits_h = 3 + (BITDEPTH == 12) * 2;
- const int rounding_off_h = 1 << (round_bits_h - 1);
- const int clip_limit = 1 << ((BITDEPTH) + 1 + 7 - round_bits_h);
- for (int j = 0; j < h + 6; j++) {
- for (int i = 0; i < w; i++) {
- int sum = (tmp_ptr[i + 3] << 7) + (1 << (BITDEPTH + 6));
-
- for (int k = 0; k < 7; k++) {
- sum += tmp_ptr[i + k] * filterh[k];
- }
-
- hor_ptr[i] =
- iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
- }
- tmp_ptr += REST_UNIT_STRIDE;
- hor_ptr += REST_UNIT_STRIDE;
- }
-
- const int round_bits_v = 11 - (BITDEPTH == 12) * 2;
- const int rounding_off_v = 1 << (round_bits_v - 1);
- const int round_offset = 1 << (BITDEPTH + (round_bits_v - 1));
- for (int i = 0; i < w; i++) {
- for (int j = 0; j < h; j++) {
- int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
-
- for (int k = 0; k < 7; k++) {
- sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
- }
-
- p[j * PXSTRIDE(p_stride) + i] =
- iclip_pixel((sum + rounding_off_v) >> round_bits_v);
- }
- }
-}
-
-// Sum over a 3x3 area
-// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
-// left of the top left corner. However, the self guided filter only needs 1
-// pixel above and one pixel to the left. As for the pixels below and to the
-// right they must be computed in the sums, but don't need to be stored.
-//
-// Example for a 4x4 block:
-// x x x x x x x x x x
-// x c c c c c c c c x
-// x i s s s s s s i x
-// x i s s s s s s i x
-// x i s s s s s s i x
-// x i s s s s s s i x
-// x i s s s s s s i x
-// x i s s s s s s i x
-// x c c c c c c c c x
-// x x x x x x x x x x
-//
-// s: Pixel summed and stored
-// i: Pixel summed and stored (between loops)
-// c: Pixel summed not stored
-// x: Pixel not summed not stored
-static void boxsum3(coef *dst, const pixel *src, const int w, const int h) {
- // We skip the first row, as it is never used
- src += REST_UNIT_STRIDE;
- dst += REST_UNIT_STRIDE;
-
- // We skip the first and last columns, as they are never used
- for (int x = 1; x < w - 1; x++) {
- coef *ds = dst + x;
- const pixel *s = src + x;
- int a = s[0], b = s[REST_UNIT_STRIDE];
-
- // We skip the first 2 rows, as they are skipped in the next loop and
- // we don't need the last 2 row as it is skipped in the next loop
- for (int y = 2; y < h - 2; y++) {
- s += REST_UNIT_STRIDE;
- const int c = s[REST_UNIT_STRIDE];
- ds += REST_UNIT_STRIDE;
- *ds = a + b + c;
- a = b;
- b = c;
- }
- }
-
- // We skip the first 2 rows as they are never read
- dst += REST_UNIT_STRIDE;
- // We skip the last 2 rows as it is never read
- for (int y = 2; y < h - 2; y++) {
- int a = dst[1], b = dst[2];
-
- // We don't store the first column as it is never read and
- // we don't store the last 2 columns as they are never read
- for (int x = 2; x < w - 2; x++) {
- const int c = dst[x + 1];
- dst[x] = a + b + c;
- a = b;
- b = c;
- }
- dst += REST_UNIT_STRIDE;
- }
-}
-
-// Sum over a 5x5 area
-// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
-// left of the top left corner. However, the self guided filter only needs 1
-// pixel above and one pixel to the left. As for the pixels below and to the
-// right they must be computed in the sums, but don't need to be stored.
-//
-// Example for a 4x4 block:
-// c c c c c c c c c c
-// c c c c c c c c c c
-// i i s s s s s s i i
-// i i s s s s s s i i
-// i i s s s s s s i i
-// i i s s s s s s i i
-// i i s s s s s s i i
-// i i s s s s s s i i
-// c c c c c c c c c c
-// c c c c c c c c c c
-//
-// s: Pixel summed and stored
-// i: Pixel summed and stored (between loops)
-// c: Pixel summed not stored
-// x: Pixel not summed not stored
-static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) {
- // We skip the first row, as it is never used
- dst += REST_UNIT_STRIDE;
-
- for (int x = 0; x < w; x++) {
- coef *ds = dst + x;
- const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
- int a = s[-3 * REST_UNIT_STRIDE];
- int b = s[-2 * REST_UNIT_STRIDE];
- int c = s[-1 * REST_UNIT_STRIDE];
- int d = s[0];
-
- // We skip the first 2 rows, as they are skipped in the next loop and
- // we don't need the last 2 row as it is skipped in the next loop
- for (int y = 2; y < h - 2; y++) {
- s += REST_UNIT_STRIDE;
- const int e = *s;
- ds += REST_UNIT_STRIDE;
- *ds = a + b + c + d + e;
- a = b;
- b = c;
- c = d;
- d = e;
- }
- }
-
- // We skip the first 2 rows as they are never read
- dst += REST_UNIT_STRIDE;
- for (int y = 2; y < h - 2; y++) {
- int a = dst[0];
- int b = dst[1];
- int c = dst[2];
- int d = dst[3];
-
- for (int x = 2; x < w - 2; x++) {
- const int e = dst[x + 2];
- dst[x] = a + b + c + d + e;
- a = b;
- b = c;
- c = d;
- d = e;
- }
- dst += REST_UNIT_STRIDE;
- }
-}
-
-// See boxsum3 function comments for details on row and column skipping
-static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
- // We skip the first row, as it is never used
- src += REST_UNIT_STRIDE;
- dst += REST_UNIT_STRIDE;
-
- // We skip the first and last columns, as they are never used
- for (int x = 1; x < w - 1; x++) {
- int *ds = dst + x;
- const pixel *s = src + x;
- int a = s[0] * s[0];
- int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
-
- // We skip the first row, as it is skipped in the next loop and
- // we don't need the last row as it is skipped in the next loop
- for (int y = 2; y < h - 2; y++) {
- s += REST_UNIT_STRIDE;
- const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
- ds += REST_UNIT_STRIDE;
- *ds = a + b + c;
- a = b;
- b = c;
- }
- }
-
- // We skip the first row as it is never read
- dst += REST_UNIT_STRIDE;
- // We skip the last row as it is never read
- for (int y = 2; y < h - 2; y++) {
- int a = dst[1], b = dst[2];
-
- // We don't store the first column as it is never read and
- // we don't store the last 2 columns as they are never read
- for (int x = 2; x < w - 2; x++) {
- const int c = dst[x + 1];
- dst[x] = a + b + c;
- a = b;
- b = c;
- }
- dst += REST_UNIT_STRIDE;
- }
-}
-
-// See boxsum5 function comments for details on row and column skipping
-static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
- const int h)
-{
- // We skip the first row, as it is never used
- dst += REST_UNIT_STRIDE;
-
- for (int x = 0; x < w; x++) {
- int *ds = dst + x;
- const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
- int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
- int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
- int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
- int d = s[0] * s[0];
-
- // We skip the first 2 rows, as they are skipped in the next loop and
- // we don't need the last 2 row as it is skipped in the next loop
- for (int y = 2; y < h - 2; y++) {
- s += REST_UNIT_STRIDE;
- const int e = s[0] * s[0];
- ds += REST_UNIT_STRIDE;
- *ds = a + b + c + d + e;
- a = b;
- b = c;
- c = d;
- d = e;
- }
- }
-
- // We skip the first 2 rows as they are never read
- dst += REST_UNIT_STRIDE;
- for (int y = 2; y < h - 2; y++) {
- int a = dst[0];
- int b = dst[1];
- int c = dst[2];
- int d = dst[3];
-
- for (int x = 2; x < w - 2; x++) {
- const int e = dst[x + 2];
- dst[x] = a + b + c + d + e;
- a = b;
- b = c;
- c = d;
- d = e;
- }
- dst += REST_UNIT_STRIDE;
- }
-}
-
-static void selfguided_filter(int32_t *dst, const pixel *src,
- const ptrdiff_t src_stride, const int w,
- const int h, const int n, const int s)
-{
- // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
- // of padding above and below
- int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
- int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3;
- // By inverting A and B after the boxsums, B can be of size coef instead
- // of int32_t
- coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
- coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
-
- const int step = (n == 25) + 1;
- if (n == 25) {
- boxsum5(B_, src, w + 6, h + 6);
- boxsum5sqr(A_, src, w + 6, h + 6);
- } else {
- boxsum3(B_, src, w + 6, h + 6);
- boxsum3sqr(A_, src, w + 6, h + 6);
- }
-
- int32_t *AA = A - REST_UNIT_STRIDE;
- coef *BB = B - REST_UNIT_STRIDE;
- for (int j = -1; j < h + 1; j+= step) {
- for (int i = -1; i < w + 1; i++) {
- const int a =
- (AA[i] + (1 << (2 * (BITDEPTH - 8)) >> 1)) >> (2 * (BITDEPTH - 8));
- const int b =
- (BB[i] + (1 << (BITDEPTH - 8) >> 1)) >> (BITDEPTH - 8);
-
- const uint32_t p = (a * n >= b * b) * (a * n - b * b);
- const uint32_t z = (p * s + (1 << 19)) >> 20;
-
- const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
- // This is where we invert A and B, so that B is of size coef.
- AA[i] = (((1 << 8) - x) * BB[i] * dav1d_sgr_one_by_x[n - 1] + (1 << 11)) >> 12;
- BB[i] = x;
- }
- AA += step * REST_UNIT_STRIDE;
- BB += step * REST_UNIT_STRIDE;
- }
-
- src += 3 * REST_UNIT_STRIDE + 3;
- if (n == 25) {
- int j = 0;
-#define SIX_NEIGHBORS(P, i)\
- ((P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 6 + \
- (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
- P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
- for (; j < h - 1; j+=2) {
- for (int i = 0; i < w; i++) {
- const int32_t a = SIX_NEIGHBORS(B, i);
- const int32_t b = SIX_NEIGHBORS(A, i);
- dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
- }
- dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
- src += REST_UNIT_STRIDE;
- B += REST_UNIT_STRIDE;
- A += REST_UNIT_STRIDE;
- for (int i = 0; i < w; i++) {
- const int32_t a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
- const int32_t b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
- dst[i] = (a * src[i] + b + (1 << 7)) >> 8;
- }
- dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
- src += REST_UNIT_STRIDE;
- B += REST_UNIT_STRIDE;
- A += REST_UNIT_STRIDE;
- }
- if (j + 1 == h) { // Last row, when number of rows is odd
- for (int i = 0; i < w; i++) {
- const int32_t a = SIX_NEIGHBORS(B, i);
- const int32_t b = SIX_NEIGHBORS(A, i);
- dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
- }
- }
-#undef SIX_NEIGHBORS
- } else {
-#define EIGHT_NEIGHBORS(P, i)\
- ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
- (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
- P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
- for (int j = 0; j < h; j++) {
- for (int i = 0; i < w; i++) {
- const int32_t a = EIGHT_NEIGHBORS(B, i);
- const int32_t b = EIGHT_NEIGHBORS(A, i);
- dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
- }
- dst += 384;
- src += REST_UNIT_STRIDE;
- B += REST_UNIT_STRIDE;
- A += REST_UNIT_STRIDE;
- }
- }
-#undef NINE_NEIGHBORS
-}
-
-static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
- const pixel (*const left)[4],
- const pixel *lpf, const ptrdiff_t lpf_stride,
- const int w, const int h, const int sgr_idx,
- const int16_t sgr_w[2], const enum LrEdgeFlags edges)
-{
- // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
- // of padding above and below
- pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
-
- padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
-
- // Selfguided filter outputs to a maximum stripe height of 64 and a
- // maximum restoration width of 384 (256 * 1.5)
- int32_t dst[64 * 384];
-
- // both r1 and r0 can't be zero
- if (!dav1d_sgr_params[sgr_idx][0]) {
- const int s1 = dav1d_sgr_params[sgr_idx][3];
- selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
- const int w1 = (1 << 7) - sgr_w[1];
- for (int j = 0; j < h; j++) {
- for (int i = 0; i < w; i++) {
- const int32_t u = (p[i] << 4);
- const int32_t v = (u << 7) + w1 * (dst[j * 384 + i] - u);
- p[i] = iclip_pixel((v + (1 << 10)) >> 11);
- }
- p += PXSTRIDE(p_stride);
- }
- } else if (!dav1d_sgr_params[sgr_idx][1]) {
- const int s0 = dav1d_sgr_params[sgr_idx][2];
- selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
- const int w0 = sgr_w[0];
- for (int j = 0; j < h; j++) {
- for (int i = 0; i < w; i++) {
- const int32_t u = (p[i] << 4);
- const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u);
- p[i] = iclip_pixel((v + (1 << 10)) >> 11);
- }
- p += PXSTRIDE(p_stride);
- }
- } else {
- int32_t dst1[64 * 384];
- const int s0 = dav1d_sgr_params[sgr_idx][2];
- const int s1 = dav1d_sgr_params[sgr_idx][3];
- const int w0 = sgr_w[0];
- const int w1 = (1 << 7) - w0 - sgr_w[1];
- selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
- selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
- for (int j = 0; j < h; j++) {
- for (int i = 0; i < w; i++) {
- const int32_t u = (p[i] << 4);
- const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
- w1 * (dst1[j * 384 + i] - u);
- p[i] = iclip_pixel((v + (1 << 10)) >> 11);
- }
- p += PXSTRIDE(p_stride);
- }
- }
-}
-
-void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
- c->wiener = wiener_c;
- c->selfguided = selfguided_c;
-
-#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
- bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
-#endif
-}
--- /dev/null
+++ b/src/looprestoration_tmpl.c
@@ -1,0 +1,577 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/intops.h"
+
+#include "src/looprestoration.h"
+#include "src/tables.h"
+
+// 256 * 1.5 + 3 + 3 = 390
+#define REST_UNIT_STRIDE (390)
+
+// TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
+// TODO Chroma only requires 2 rows of padding.
+static void padding(pixel *dst, const pixel *p, const ptrdiff_t p_stride,
+ const pixel (*left)[4],
+ const pixel *lpf, const ptrdiff_t lpf_stride,
+ int unit_w, const int stripe_h, const enum LrEdgeFlags edges)
+{ // Fills dst (row pitch REST_UNIT_STRIDE) with the unit_w x stripe_h block plus a 3-pixel border on every side.
+ const int have_left = !!(edges & LR_HAVE_LEFT);
+ const int have_right = !!(edges & LR_HAVE_RIGHT);
+
+ // Copy more pixels if we don't have to pad them
+ unit_w += 3 * have_left + 3 * have_right;
+ pixel *dst_l = dst + 3 * !have_left; // without a left neighbour, columns 0-2 are filled by replication at the end
+ p -= 3 * have_left;
+ lpf -= 3 * have_left;
+
+ if (edges & LR_HAVE_TOP) {
+ // Copy previous loop filtered rows
+ const pixel *const above_1 = lpf;
+ const pixel *const above_2 = above_1 + PXSTRIDE(lpf_stride);
+ pixel_copy(dst_l, above_1, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w); // above_1 supplies both of the first two border rows
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
+ } else {
+ // Pad with first row
+ pixel_copy(dst_l, p, unit_w);
+ pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
+ if (have_left) {
+ pixel_copy(dst_l, &left[0][1], 3); // overwrite the first 3 columns with left[0][1..3]
+ pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
+ pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
+ }
+ }
+
+ pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE; // first row below the 3 top border rows
+ if (edges & LR_HAVE_BOTTOM) {
+ // Copy next loop filtered rows
+ const pixel *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride); // 6 rows past the start of lpf
+ const pixel *const below_2 = below_1 + PXSTRIDE(lpf_stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w); // below_2 supplies both of the last two border rows
+ } else {
+ // Pad with last row
+ const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
+ if (have_left) {
+ pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3); // first 3 columns come from the last left[] row
+ pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
+ }
+ }
+
+ // Inner UNIT_WxSTRIPE_H
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left); // with have_left the first 3 columns are skipped; left[] fills them below
+ dst_tl += REST_UNIT_STRIDE;
+ p += PXSTRIDE(p_stride);
+ }
+
+ if (!have_right) {
+ pixel *pad = dst_l + unit_w;
+ pixel *row_last = &dst_l[unit_w - 1];
+ // Pad 3x(STRIPE_H+6) with last column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(pad, *row_last, 3);
+ pad += REST_UNIT_STRIDE;
+ row_last += REST_UNIT_STRIDE;
+ }
+ }
+
+ if (!have_left) {
+ // Pad 3x(STRIPE_H+6) with first column
+ for (int j = 0; j < stripe_h + 6; j++) {
+ pixel_set(dst, *dst_l, 3);
+ dst += REST_UNIT_STRIDE;
+ dst_l += REST_UNIT_STRIDE;
+ }
+ } else {
+ dst += 3 * REST_UNIT_STRIDE; // skip the 3 top border rows
+ for (int j = 0; j < stripe_h; j++) {
+ pixel_copy(dst, &left[j][1], 3); // left columns of each stripe row come from left[j][1..3]
+ dst += REST_UNIT_STRIDE;
+ }
+ }
+}
+
+// FIXME Could split into luma and chroma specific functions,
+// (since first and last tops are always 0 for chroma)
+// FIXME Could implement a version that requires less temporary memory
+// (should be possible to implement with only 6 rows of temp storage)
+static void wiener_c(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf, const ptrdiff_t lpf_stride,
+ const int w, const int h,
+ const int16_t filterh[7], const int16_t filterv[7],
+ const enum LrEdgeFlags edges)
+{ // Pads the block into tmp, runs a 7-tap horizontal pass into hor, then a 7-tap vertical pass written back to p.
+ // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
+ // of padding above and below
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ pixel *tmp_ptr = tmp;
+
+ padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
+
+ // Values stored between horizontal and vertical filtering don't
+ // fit in a uint8_t, so the intermediate buffer is uint16_t.
+ uint16_t hor[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ uint16_t *hor_ptr = hor;
+
+ const int round_bits_h = 3 + (BITDEPTH == 12) * 2; // 5 when BITDEPTH is 12, otherwise 3
+ const int rounding_off_h = 1 << (round_bits_h - 1);
+ const int clip_limit = 1 << ((BITDEPTH) + 1 + 7 - round_bits_h); // hor values are clipped to [0, clip_limit - 1]
+ for (int j = 0; j < h + 6; j++) { // all h + 6 padded rows get the horizontal pass
+ for (int i = 0; i < w; i++) {
+ int sum = (tmp_ptr[i + 3] << 7) + (1 << (BITDEPTH + 6)); // centre sample (of taps i..i+6) times 128, plus a constant offset
+
+ for (int k = 0; k < 7; k++) {
+ sum += tmp_ptr[i + k] * filterh[k];
+ }
+
+ hor_ptr[i] =
+ iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
+ }
+ tmp_ptr += REST_UNIT_STRIDE;
+ hor_ptr += REST_UNIT_STRIDE;
+ }
+
+ const int round_bits_v = 11 - (BITDEPTH == 12) * 2; // 9 when BITDEPTH is 12, otherwise 11
+ const int rounding_off_v = 1 << (round_bits_v - 1);
+ const int round_offset = 1 << (BITDEPTH + (round_bits_v - 1));
+ for (int i = 0; i < w; i++) { // the vertical pass iterates column by column
+ for (int j = 0; j < h; j++) {
+ int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset; // centre row (of rows j..j+6) times 128, minus round_offset
+
+ for (int k = 0; k < 7; k++) {
+ sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
+ }
+
+ p[j * PXSTRIDE(p_stride) + i] =
+ iclip_pixel((sum + rounding_off_v) >> round_bits_v);
+ }
+ }
+}
+
+// Sum over a 3x3 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self guided filter only needs 1
+// pixel above and one pixel to the left. As for the pixels below and to the
+// right they must be computed in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+// x x x x x x x x x x
+// x c c c c c c c c x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x i s s s s s s i x
+// x c c c c c c c c x
+// x x x x x x x x x x
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum3(coef *dst, const pixel *src, const int w, const int h) { // vertical 3-sums from src into dst, then horizontal 3-sums in place on dst
+ // We skip the first row, as it is never used
+ src += REST_UNIT_STRIDE;
+ dst += REST_UNIT_STRIDE;
+
+ // We skip the first and last columns, as they are never used
+ for (int x = 1; x < w - 1; x++) {
+ coef *ds = dst + x;
+ const pixel *s = src + x;
+ int a = s[0], b = s[REST_UNIT_STRIDE]; // a and b hold the two rows above the next one read
+
+ // We skip the first 2 rows, as they are skipped in the next loop, and
+ // we don't need the last 2 rows, as they are skipped in the next loop
+ for (int y = 2; y < h - 2; y++) {
+ s += REST_UNIT_STRIDE;
+ const int c = s[REST_UNIT_STRIDE]; // sample one row below the row being written
+ ds += REST_UNIT_STRIDE;
+ *ds = a + b + c; // sum of rows y - 1, y and y + 1 in column x
+ a = b;
+ b = c;
+ }
+ }
+
+ // We skip the first 2 rows as they are never read
+ dst += REST_UNIT_STRIDE;
+ // We skip the last 2 rows as they are never read
+ for (int y = 2; y < h - 2; y++) {
+ int a = dst[1], b = dst[2]; // column sums are kept in a and b so dst can be overwritten in place
+
+ // We don't store the first 2 columns as they are never read and
+ // we don't store the last 2 columns as they are never read
+ for (int x = 2; x < w - 2; x++) {
+ const int c = dst[x + 1];
+ dst[x] = a + b + c; // sum of columns x - 1, x and x + 1
+ a = b;
+ b = c;
+ }
+ dst += REST_UNIT_STRIDE;
+ }
+}
+
+// Sum over a 5x5 area
+// The dst and src pointers are positioned 3 pixels above and 3 pixels to the
+// left of the top left corner. However, the self guided filter only needs 1
+// pixel above and one pixel to the left. As for the pixels below and to the
+// right they must be computed in the sums, but don't need to be stored.
+//
+// Example for a 4x4 block:
+// c c c c c c c c c c
+// c c c c c c c c c c
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// i i s s s s s s i i
+// c c c c c c c c c c
+// c c c c c c c c c c
+//
+// s: Pixel summed and stored
+// i: Pixel summed and stored (between loops)
+// c: Pixel summed not stored
+// x: Pixel not summed not stored
+static void boxsum5(coef *dst, const pixel *const src, const int w, const int h) { // vertical 5-sums from src into dst, then horizontal 5-sums in place on dst
+ // We skip the first row, as it is never used
+ dst += REST_UNIT_STRIDE;
+
+ for (int x = 0; x < w; x++) { // unlike boxsum3, every column gets a vertical sum
+ coef *ds = dst + x;
+ const pixel *s = src + 3 * REST_UNIT_STRIDE + x; // s starts on src row 3
+ int a = s[-3 * REST_UNIT_STRIDE];
+ int b = s[-2 * REST_UNIT_STRIDE];
+ int c = s[-1 * REST_UNIT_STRIDE];
+ int d = s[0];
+
+ // We skip the first 2 rows, as they are skipped in the next loop, and
+ // we don't need the last 2 rows, as they are skipped in the next loop
+ for (int y = 2; y < h - 2; y++) {
+ s += REST_UNIT_STRIDE;
+ const int e = *s; // sample two rows below the row being written
+ ds += REST_UNIT_STRIDE;
+ *ds = a + b + c + d + e; // sum of rows y - 2 .. y + 2 in column x
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ }
+ }
+
+ // We skip the first 2 rows as they are never read
+ dst += REST_UNIT_STRIDE;
+ for (int y = 2; y < h - 2; y++) {
+ int a = dst[0]; // column sums are kept in a..d so dst can be overwritten in place
+ int b = dst[1];
+ int c = dst[2];
+ int d = dst[3];
+
+ for (int x = 2; x < w - 2; x++) {
+ const int e = dst[x + 2];
+ dst[x] = a + b + c + d + e; // sum of columns x - 2 .. x + 2
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ }
+ dst += REST_UNIT_STRIDE;
+ }
+}
+
+// See boxsum3 function comments for details on row and column skipping
+static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) { // same traversal as boxsum3, but each src sample is squared before summing
+ // We skip the first row, as it is never used
+ src += REST_UNIT_STRIDE;
+ dst += REST_UNIT_STRIDE;
+
+ // We skip the first and last columns, as they are never used
+ for (int x = 1; x < w - 1; x++) {
+ int *ds = dst + x;
+ const pixel *s = src + x;
+ int a = s[0] * s[0];
+ int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
+
+ // We skip the first 2 rows, as they are skipped in the next loop, and
+ // we don't need the last 2 rows, as they are skipped in the next loop
+ for (int y = 2; y < h - 2; y++) {
+ s += REST_UNIT_STRIDE;
+ const int c = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE]; // squared sample one row below the row being written
+ ds += REST_UNIT_STRIDE;
+ *ds = a + b + c; // sum of squares of rows y - 1, y and y + 1 in column x
+ a = b;
+ b = c;
+ }
+ }
+
+ // We skip the first 2 rows as they are never read
+ dst += REST_UNIT_STRIDE;
+ // We skip the last 2 rows as they are never read
+ for (int y = 2; y < h - 2; y++) {
+ int a = dst[1], b = dst[2]; // column sums are kept in a and b so dst can be overwritten in place
+
+ // We don't store the first 2 columns as they are never read and
+ // we don't store the last 2 columns as they are never read
+ for (int x = 2; x < w - 2; x++) {
+ const int c = dst[x + 1];
+ dst[x] = a + b + c; // sum of columns x - 1, x and x + 1
+ a = b;
+ b = c;
+ }
+ dst += REST_UNIT_STRIDE;
+ }
+}
+
+// See boxsum5 function comments for details on row and column skipping
+static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
+ const int h)
+{ // Same traversal as boxsum5, but each src sample is squared before summing.
+ // We skip the first row, as it is never used
+ dst += REST_UNIT_STRIDE;
+
+ for (int x = 0; x < w; x++) {
+ int *ds = dst + x;
+ const pixel *s = src + 3 * REST_UNIT_STRIDE + x; // s starts on src row 3
+ int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
+ int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
+ int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
+ int d = s[0] * s[0];
+
+ // We skip the first 2 rows, as they are skipped in the next loop, and
+ // we don't need the last 2 rows, as they are skipped in the next loop
+ for (int y = 2; y < h - 2; y++) {
+ s += REST_UNIT_STRIDE;
+ const int e = s[0] * s[0]; // squared sample two rows below the row being written
+ ds += REST_UNIT_STRIDE;
+ *ds = a + b + c + d + e; // sum of squares of rows y - 2 .. y + 2 in column x
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ }
+ }
+
+ // We skip the first 2 rows as they are never read
+ dst += REST_UNIT_STRIDE;
+ for (int y = 2; y < h - 2; y++) {
+ int a = dst[0]; // column sums are kept in a..d so dst can be overwritten in place
+ int b = dst[1];
+ int c = dst[2];
+ int d = dst[3];
+
+ for (int x = 2; x < w - 2; x++) {
+ const int e = dst[x + 2];
+ dst[x] = a + b + c + d + e; // sum of columns x - 2 .. x + 2
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ }
+ dst += REST_UNIT_STRIDE;
+ }
+}
+
+static void selfguided_filter(int32_t *dst, const pixel *src,
+ const ptrdiff_t src_stride, const int w, // src_stride is not referenced; rows use REST_UNIT_STRIDE
+ const int h, const int n, const int s)
+{
+ // Filters the padded block src and writes w x h values to dst (row stride 384). n == 25 selects the 5x5 box sums, any other n the 3x3 ones (callers in this file pass 25 or 9).
+ // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels of padding above and below
+ int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3; // A/B point 3 rows and 3 columns into the padded buffers
+ // By inverting A and B after the boxsums, B can be of size coef instead
+ // of int32_t
+ coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+ coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
+
+ const int step = (n == 25) + 1; // the n == 25 filter only computes every other row of A and B
+ if (n == 25) {
+ boxsum5(B_, src, w + 6, h + 6); // B_ gets the box sums of pixels
+ boxsum5sqr(A_, src, w + 6, h + 6); // A_ gets the box sums of squared pixels
+ } else {
+ boxsum3(B_, src, w + 6, h + 6);
+ boxsum3sqr(A_, src, w + 6, h + 6);
+ }
+ // Convert the box sums in place, covering one extra row and column on every side of the block
+ int32_t *AA = A - REST_UNIT_STRIDE; // start at row j = -1
+ coef *BB = B - REST_UNIT_STRIDE;
+ for (int j = -1; j < h + 1; j+= step) {
+ for (int i = -1; i < w + 1; i++) {
+ const int a = // squared sum with the extra 2 * (BITDEPTH - 8) bits rounded away
+ (AA[i] + (1 << (2 * (BITDEPTH - 8)) >> 1)) >> (2 * (BITDEPTH - 8));
+ const int b = // plain sum with the extra (BITDEPTH - 8) bits rounded away
+ (BB[i] + (1 << (BITDEPTH - 8) >> 1)) >> (BITDEPTH - 8);
+
+ const uint32_t p = (a * n >= b * b) * (a * n - b * b); // a * n - b * b, clamped to 0 when negative
+ const uint32_t z = (p * s + (1 << 19)) >> 20; // scale by s, rounded right shift by 20
+
+ const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)]; // table index is capped at 255
+ // This is where we invert A and B, so that B is of size coef.
+ AA[i] = (((1 << 8) - x) * BB[i] * dav1d_sgr_one_by_x[n - 1] + (1 << 11)) >> 12;
+ BB[i] = x;
+ }
+ AA += step * REST_UNIT_STRIDE;
+ BB += step * REST_UNIT_STRIDE;
+ }
+
+ src += 3 * REST_UNIT_STRIDE + 3; // same 3-row, 3-column offset as A and B
+ if (n == 25) {
+ int j = 0;
+#define SIX_NEIGHBORS(P, i)\
+ ((P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 6 + \
+ (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
+ P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
+ for (; j < h - 1; j+=2) { // two output rows per iteration
+ for (int i = 0; i < w; i++) { // even row: A/B exist only on the rows above and below
+ const int32_t a = SIX_NEIGHBORS(B, i);
+ const int32_t b = SIX_NEIGHBORS(A, i);
+ dst[i] = (a * src[i] + b + (1 << 8)) >> 9; // weights sum to 32, hence the shift by 9
+ }
+ dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+ src += REST_UNIT_STRIDE;
+ B += REST_UNIT_STRIDE;
+ A += REST_UNIT_STRIDE;
+ for (int i = 0; i < w; i++) { // odd row: A/B exist on this row, use its 3 horizontal neighbours
+ const int32_t a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
+ const int32_t b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
+ dst[i] = (a * src[i] + b + (1 << 7)) >> 8; // weights sum to 16, hence the shift by 8
+ }
+ dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
+ src += REST_UNIT_STRIDE;
+ B += REST_UNIT_STRIDE;
+ A += REST_UNIT_STRIDE;
+ }
+ if (j + 1 == h) { // Last row, when number of rows is odd
+ for (int i = 0; i < w; i++) {
+ const int32_t a = SIX_NEIGHBORS(B, i);
+ const int32_t b = SIX_NEIGHBORS(A, i);
+ dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
+ }
+ }
+#undef SIX_NEIGHBORS
+ } else {
+#define EIGHT_NEIGHBORS(P, i)\
+ ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
+ (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
+ P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
+ for (int j = 0; j < h; j++) { // A/B exist on every row here (step == 1)
+ for (int i = 0; i < w; i++) {
+ const int32_t a = EIGHT_NEIGHBORS(B, i);
+ const int32_t b = EIGHT_NEIGHBORS(A, i);
+ dst[i] = (a * src[i] + b + (1 << 8)) >> 9; // weights sum to 32, hence the shift by 9
+ }
+ dst += 384; // same 384-entry dst row stride as the n == 25 path
+ src += REST_UNIT_STRIDE;
+ B += REST_UNIT_STRIDE;
+ A += REST_UNIT_STRIDE;
+ }
+ }
+#undef EIGHT_NEIGHBORS
+}
+
+static void selfguided_c(pixel *p, const ptrdiff_t p_stride,
+ const pixel (*const left)[4],
+ const pixel *lpf, const ptrdiff_t lpf_stride,
+ const int w, const int h, const int sgr_idx,
+ const int16_t sgr_w[2], const enum LrEdgeFlags edges)
+{
+ // C self-guided restoration of a w x h block at p, in place. Selfguided filter is applied to a
+ // maximum stripe height of 64 + 3 pixels of padding above and below
+ pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
+
+ padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges); // fills tmp; presumably the padded copy of the block - see padding()
+
+ // Selfguided filter outputs to a maximum stripe height of 64 and a
+ // maximum restoration width of 384 (256 * 1.5)
+ int32_t dst[64 * 384];
+
+ // both r1 and r0 can't be zero
+ if (!dav1d_sgr_params[sgr_idx][0]) { // entry [0] is zero: run only the n = 9 filter
+ const int s1 = dav1d_sgr_params[sgr_idx][3];
+ selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
+ const int w1 = (1 << 7) - sgr_w[1];
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int32_t u = (p[i] << 4); // source pixel scaled up by 4 bits
+ const int32_t v = (u << 7) + w1 * (dst[j * 384 + i] - u); // blend towards the filter output with weight w1
+ p[i] = iclip_pixel((v + (1 << 10)) >> 11); // rounded shift back by 11 (4 + 7) bits, then clip
+ }
+ p += PXSTRIDE(p_stride);
+ }
+ } else if (!dav1d_sgr_params[sgr_idx][1]) { // entry [1] is zero: run only the n = 25 filter
+ const int s0 = dav1d_sgr_params[sgr_idx][2];
+ selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
+ const int w0 = sgr_w[0];
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int32_t u = (p[i] << 4);
+ const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u);
+ p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+ }
+ p += PXSTRIDE(p_stride);
+ }
+ } else { // both entries non-zero: run both filters and blend the two outputs
+ int32_t dst1[64 * 384]; // output of the n = 9 filter; dst holds the n = 25 output
+ const int s0 = dav1d_sgr_params[sgr_idx][2];
+ const int s1 = dav1d_sgr_params[sgr_idx][3];
+ const int w0 = sgr_w[0];
+ const int w1 = (1 << 7) - w0 - sgr_w[1];
+ selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
+ selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
+ for (int j = 0; j < h; j++) {
+ for (int i = 0; i < w; i++) {
+ const int32_t u = (p[i] << 4);
+ const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
+ w1 * (dst1[j * 384 + i] - u);
+ p[i] = iclip_pixel((v + (1 << 10)) >> 11);
+ }
+ p += PXSTRIDE(p_stride);
+ }
+ }
+}
+
+void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
+ c->wiener = wiener_c; // install the C implementations defined in this file
+ c->selfguided = selfguided_c;
+ // The x86 init hook below is only compiled for 8-bit builds with asm enabled; it receives the same context
+#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
+ bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
+#endif
+}
--- a/src/lr_apply.c
+++ /dev/null
@@ -1,296 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdio.h>
-
-#include "common/intops.h"
-
-#include "src/lr_apply.h"
-
-
-enum LrRestorePlanes {
- LR_RESTORE_Y = 1 << 0,
- LR_RESTORE_U = 1 << 1,
- LR_RESTORE_V = 1 << 2,
-};
-
-// The loop filter buffer stores 12 rows of pixels. A superblock block will
-// contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above
-// and 2 below) the final 4 rows are used to swap the bottom of the last
-// stripe with the top of the next super block row.
-static void backup_lpf(pixel *dst, ptrdiff_t dst_stride,
- const pixel *src, ptrdiff_t src_stride,
- const int ss_ver, const int sb128,
- int row, const int row_h, const int w)
-{
- src_stride = PXSTRIDE(src_stride);
- dst_stride = PXSTRIDE(dst_stride);
-
- // The first stripe of the frame is shorter by 8 luma pixel rows.
- int stripe_h = (64 - 8 * !row) >> ss_ver;
-
- if (row) {
- const int top = 4 << sb128;
- // Copy the top part of the stored loop filtered pixels from the
- // previous sb row needed above the first stripe of this sb row.
- pixel_copy(&dst[dst_stride * 0], &dst[dst_stride * top], w);
- pixel_copy(&dst[dst_stride * 1], &dst[dst_stride * (top + 1)], w);
- pixel_copy(&dst[dst_stride * 2], &dst[dst_stride * (top + 2)], w);
- pixel_copy(&dst[dst_stride * 3], &dst[dst_stride * (top + 3)], w);
- }
-
- dst += 4 * dst_stride;
- src += (stripe_h - 2) * src_stride;
-
- for (; row + stripe_h <= row_h; row += stripe_h) {
- for (int i = 0; i < 4; i++) {
- pixel_copy(dst, src, w);
- dst += dst_stride;
- src += src_stride;
- }
- stripe_h = 64 >> ss_ver;
- src += (stripe_h - 4) * src_stride;
- }
-}
-
-void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
- /*const*/ pixel *const src[3], const int sby)
-{
- const ptrdiff_t offset = 8 * !!sby;
- const ptrdiff_t *const src_stride = f->cur.p.stride;
-
- // TODO Also check block level restore type to reduce copying.
- const int restore_planes =
- ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
- ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
- ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
-
- if (restore_planes & LR_RESTORE_Y) {
- const int h = f->bh << 2;
- const int w = f->bw << 2;
- const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
- const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset;
- backup_lpf(f->lf.lr_lpf_line_ptr[0], sizeof(pixel) * f->b4_stride * 4,
- src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
- 0, f->seq_hdr.sb128, y_stripe, row_h, w);
- }
- if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
- const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int h = f->bh << (2 - ss_ver);
- const int w = f->bw << (2 - ss_hor);
- const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
- const ptrdiff_t offset_uv = offset >> ss_ver;
- const int y_stripe =
- (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
-
- if (restore_planes & LR_RESTORE_U) {
- backup_lpf(f->lf.lr_lpf_line_ptr[1], sizeof(pixel) * f->b4_stride * 4,
- src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
- ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
- }
- if (restore_planes & LR_RESTORE_V) {
- backup_lpf(f->lf.lr_lpf_line_ptr[2], sizeof(pixel) * f->b4_stride * 4,
- src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
- ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
- }
- }
-}
-
-
-static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
- const pixel (*left)[4], int x, int y,
- const int plane, const int unit_w, const int row_h,
- const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
-{
- const Dav1dDSPContext *const dsp = f->dsp;
- const int chroma = !!plane;
- const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
- const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
- const pixel *lpf = f->lf.lr_lpf_line_ptr[plane] + x;
- const ptrdiff_t p_stride = f->cur.p.stride[chroma];
- const ptrdiff_t lpf_stride = sizeof(pixel) * f->b4_stride * 4;
-
- // The first stripe of the frame is shorter by 8 luma pixel rows.
- int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
-
- // FIXME [8] might be easier for SIMD
- int16_t filterh[7], filterv[7];
- if (lr->type == RESTORATION_WIENER) {
- filterh[0] = filterh[6] = lr->filter_h[0];
- filterh[1] = filterh[5] = lr->filter_h[1];
- filterh[2] = filterh[4] = lr->filter_h[2];
- filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2);
-
- filterv[0] = filterv[6] = lr->filter_v[0];
- filterv[1] = filterv[5] = lr->filter_v[1];
- filterv[2] = filterv[4] = lr->filter_v[2];
- filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2);
- }
-
- while (y + stripe_h <= row_h) {
- // TODO Look into getting rid of the this if
- if (y + stripe_h == row_h) {
- edges &= ~LR_HAVE_BOTTOM;
- } else {
- edges |= LR_HAVE_BOTTOM;
- }
- if (lr->type == RESTORATION_WIENER) {
- dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
- filterh, filterv, edges);
- } else {
- assert(lr->type == RESTORATION_SGRPROJ);
- dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
- lr->sgr_idx, lr->sgr_weights, edges);
- }
-
- left += stripe_h;
- y += stripe_h;
- if (y + stripe_h > row_h && sbrow_has_bottom) break;
- p += stripe_h * PXSTRIDE(p_stride);
- edges |= LR_HAVE_TOP;
- stripe_h = imin(64 >> ss_ver, row_h - y);
- if (stripe_h == 0) break;
- lpf += 4 * PXSTRIDE(lpf_stride);
- }
-}
-
-static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
- int u)
-{
- for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
- pixel_copy(dst, src, 4);
-}
-
-static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
- const int w, const int h, const int row_h, const int plane)
-{
- const int chroma = !!plane;
- const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
- const int ss_hor = chroma & (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
- const ptrdiff_t p_stride = f->cur.p.stride[chroma];
-
- const int unit_size_log2 = f->frame_hdr.restoration.unit_size[!!plane];
- const int unit_size = 1 << unit_size_log2;
- const int half_unit_size = unit_size >> 1;
- const int max_unit_size = unit_size + half_unit_size;
-
- // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
- const int row_y = y + ((8 >> ss_ver) * !!y);
-
- // FIXME This is an ugly hack to lookup the proper AV1Filter unit for
- // chroma planes. Question: For Multithreaded decoding, is it better
- // to store the chroma LR information with collocated Luma information?
- // In other words. For a chroma restoration unit locate at 128,128 and
- // with a 4:2:0 chroma subsampling, do we store the filter information at
- // the AV1Filter unit located at (128,128) or (256,256)
- // TODO Support chroma subsampling.
- const int shift_ver = 7 - ss_ver;
- const int shift_hor = 7 - ss_hor;
-
- int ruy = (row_y >> unit_size_log2);
- // Merge last restoration unit if its height is < half_unit_size
- if (ruy > 0) ruy -= (ruy << unit_size_log2) + half_unit_size > h;
-
- // The first stripe of the frame is shorter by 8 luma pixel rows.
- const int filter_h =
- imin(((1 << (6 + f->seq_hdr.sb128)) - 8 * !y) >> ss_ver, h - y);
-
- pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
-
- int unit_w = unit_size, bit = 0;
-
- enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
- (row_h < h ? LR_HAVE_BOTTOM : 0);
-
- for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
- // TODO Clean up this if statement.
- if (x + max_unit_size > w) {
- unit_w = w - x;
- edges &= ~LR_HAVE_RIGHT;
- } else {
- edges |= LR_HAVE_RIGHT;
- }
-
- // Based on the position of the restoration unit, find the corresponding
- // AV1Filter unit.
- const int unit_idx = ((ruy & 16) >> 3) + ((rux & 16) >> 4);
- const Av1RestorationUnit *const lr =
- &f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
- (x >> shift_hor)].lr[plane][unit_idx];
-
- // FIXME Don't backup if the next restoration unit is RESTORE_NONE
- // This also requires not restoring in the same conditions.
- if (edges & LR_HAVE_RIGHT) {
- backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, filter_h);
- }
- if (lr->type != RESTORATION_NONE) {
- lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
- }
- p += unit_w;
- }
-}
-
-void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
- const int sby)
-{
- const ptrdiff_t offset_y = 8 * !!sby;
- const ptrdiff_t *const dst_stride = f->cur.p.stride;
-
- const int restore_planes =
- ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
- ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
- ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
-
- if (restore_planes & LR_RESTORE_Y) {
- const int h = f->cur.p.p.h;
- const int w = f->cur.p.p.w;
- const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h);
- const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset_y;
- lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
- h, row_h, 0);
- }
- if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
- const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int h = (f->cur.p.p.h + ss_ver) >> ss_ver;
- const int w = (f->cur.p.p.w + ss_hor) >> ss_hor;
- const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
- const ptrdiff_t offset_uv = offset_y >> ss_ver;
- const int y_stripe =
- (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
- if (restore_planes & LR_RESTORE_U)
- lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
- w, h, row_h, 1);
-
- if (restore_planes & LR_RESTORE_V)
- lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
- w, h, row_h, 2);
- }
-}
--- /dev/null
+++ b/src/lr_apply_tmpl.c
@@ -1,0 +1,296 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdio.h>
+
+#include "common/intops.h"
+
+#include "src/lr_apply.h"
+
+
+enum LrRestorePlanes { // bit flags tested against the restore_planes masks built in this file
+ LR_RESTORE_Y = 1 << 0, // set when restoration.type[0] != RESTORATION_NONE
+ LR_RESTORE_U = 1 << 1, // set when restoration.type[1] != RESTORATION_NONE
+ LR_RESTORE_V = 1 << 2, // set when restoration.type[2] != RESTORATION_NONE
+};
+
+// The loop filter buffer stores 12 rows of pixels. A superblock will
+// contain at most 2 stripes. Each stripe requires 4 rows of pixels (2 above
+// and 2 below); the final 4 rows are used to swap the bottom of the last
+// stripe with the top of the next super block row.
+static void backup_lpf(pixel *dst, ptrdiff_t dst_stride,
+ const pixel *src, ptrdiff_t src_stride,
+ const int ss_ver, const int sb128,
+ int row, const int row_h, const int w)
+{
+ src_stride = PXSTRIDE(src_stride); // presumably converts the strides to pixel units - see PXSTRIDE
+ dst_stride = PXSTRIDE(dst_stride);
+
+ // The first stripe of the frame is shorter by 8 luma pixel rows.
+ int stripe_h = (64 - 8 * !row) >> ss_ver;
+
+ if (row) {
+ const int top = 4 << sb128; // buffer row 4 or 8, depending on sb128
+ // Copy the top part of the stored loop filtered pixels from the
+ // previous sb row needed above the first stripe of this sb row.
+ pixel_copy(&dst[dst_stride * 0], &dst[dst_stride * top], w);
+ pixel_copy(&dst[dst_stride * 1], &dst[dst_stride * (top + 1)], w);
+ pixel_copy(&dst[dst_stride * 2], &dst[dst_stride * (top + 2)], w);
+ pixel_copy(&dst[dst_stride * 3], &dst[dst_stride * (top + 3)], w);
+ }
+
+ dst += 4 * dst_stride; // the first 4 buffer rows are the ones handled above
+ src += (stripe_h - 2) * src_stride; // 2 rows before the end of the first stripe
+
+ for (; row + stripe_h <= row_h; row += stripe_h) { // one iteration per stripe that fits before row_h
+ for (int i = 0; i < 4; i++) { // last 2 rows of this stripe and the 2 rows that follow it
+ pixel_copy(dst, src, w);
+ dst += dst_stride;
+ src += src_stride;
+ }
+ stripe_h = 64 >> ss_ver; // every stripe after the first has full height
+ src += (stripe_h - 4) * src_stride; // 4 rows were just consumed: lands 2 rows before the next stripe end
+ }
+}
+
+void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
+ /*const*/ pixel *const src[3], const int sby)
+{ // Calls backup_lpf() for superblock row sby on every plane whose frame-level restoration type is not NONE
+ const ptrdiff_t offset = 8 * !!sby; // 8 luma rows for every sb row except the first
+ const ptrdiff_t *const src_stride = f->cur.p.stride;
+
+ // TODO Also check block level restore type to reduce copying.
+ const int restore_planes = // bitmask of LR_RESTORE_* flags
+ ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
+ ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
+ ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
+
+ if (restore_planes & LR_RESTORE_Y) {
+ const int h = f->bh << 2; // presumably bh/bw are in 4-pixel units - TODO confirm
+ const int w = f->bw << 2;
+ const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h); // end row of this sb row, capped at h
+ const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset; // start row, shifted up by offset
+ backup_lpf(f->lf.lr_lpf_line_ptr[0], sizeof(pixel) * f->b4_stride * 4,
+ src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
+ 0, f->seq_hdr.sb128, y_stripe, row_h, w);
+ }
+ if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+ const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; // 1 only for the I420 layout
+ const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; // 1 for every layout except I444
+ const int h = f->bh << (2 - ss_ver);
+ const int w = f->bw << (2 - ss_hor);
+ const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
+ const ptrdiff_t offset_uv = offset >> ss_ver; // the luma offset scaled by ss_ver
+ const int y_stripe =
+ (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
+
+ if (restore_planes & LR_RESTORE_U) {
+ backup_lpf(f->lf.lr_lpf_line_ptr[1], sizeof(pixel) * f->b4_stride * 4,
+ src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
+ ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
+ }
+ if (restore_planes & LR_RESTORE_V) { // stride index 1 is used for both chroma planes
+ backup_lpf(f->lf.lr_lpf_line_ptr[2], sizeof(pixel) * f->b4_stride * 4,
+ src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
+ ss_ver, f->seq_hdr.sb128, y_stripe, row_h, w);
+ }
+ }
+}
+
+
+static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
+ const pixel (*left)[4], int x, int y,
+ const int plane, const int unit_w, const int row_h,
+ const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
+{ // Runs the wiener or self-guided DSP function of restoration unit lr on one unit, stripe by stripe, from row y up to row_h
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int chroma = !!plane;
+ const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+ const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM); // captured before the loop below rewrites this flag in edges
+ const pixel *lpf = f->lf.lr_lpf_line_ptr[plane] + x;
+ const ptrdiff_t p_stride = f->cur.p.stride[chroma];
+ const ptrdiff_t lpf_stride = sizeof(pixel) * f->b4_stride * 4;
+
+ // The first stripe of the frame is shorter by 8 luma pixel rows.
+ int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
+
+ // FIXME [8] might be easier for SIMD
+ int16_t filterh[7], filterv[7]; // only filled in, and only passed on, for RESTORATION_WIENER
+ if (lr->type == RESTORATION_WIENER) {
+ filterh[0] = filterh[6] = lr->filter_h[0]; // the 3 stored taps are mirrored around tap 3
+ filterh[1] = filterh[5] = lr->filter_h[1];
+ filterh[2] = filterh[4] = lr->filter_h[2];
+ filterh[3] = -((filterh[0] + filterh[1] + filterh[2]) * 2); // makes the 7 taps sum to zero
+
+ filterv[0] = filterv[6] = lr->filter_v[0];
+ filterv[1] = filterv[5] = lr->filter_v[1];
+ filterv[2] = filterv[4] = lr->filter_v[2];
+ filterv[3] = -((filterv[0] + filterv[1] + filterv[2]) * 2);
+ }
+
+ while (y + stripe_h <= row_h) {
+ // TODO Look into getting rid of this if
+ if (y + stripe_h == row_h) { // this stripe ends exactly at row_h
+ edges &= ~LR_HAVE_BOTTOM;
+ } else {
+ edges |= LR_HAVE_BOTTOM;
+ }
+ if (lr->type == RESTORATION_WIENER) {
+ dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+ filterh, filterv, edges);
+ } else {
+ assert(lr->type == RESTORATION_SGRPROJ);
+ dsp->lr.selfguided(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
+ lr->sgr_idx, lr->sgr_weights, edges);
+ }
+
+ left += stripe_h; // one left[] entry per row
+ y += stripe_h;
+ if (y + stripe_h > row_h && sbrow_has_bottom) break; // another stripe of this height does not fit and the caller passed LR_HAVE_BOTTOM
+ p += stripe_h * PXSTRIDE(p_stride);
+ edges |= LR_HAVE_TOP; // every stripe after the first has rows above it
+ stripe_h = imin(64 >> ss_ver, row_h - y);
+ if (stripe_h == 0) break;
+ lpf += 4 * PXSTRIDE(lpf_stride); // advance 4 rows in the lpf buffer
+ }
+}
+
+static void backup4xU(pixel (*dst)[4], const pixel *src, const ptrdiff_t src_stride,
+ int u)
+{ // Copies 4 pixels from each of u consecutive src rows into dst[0 .. u - 1]; does nothing when u <= 0
+ for (; u > 0; u--, dst++, src += PXSTRIDE(src_stride))
+ pixel_copy(dst, src, 4);
+}
+
+static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
+ const int w, const int h, const int row_h, const int plane)
+{ // Walks the restoration units of one plane left to right across a superblock row and calls lr_stripe() on those that are not RESTORATION_NONE
+ const int chroma = !!plane;
+ const int ss_ver = chroma & (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
+ const int ss_hor = chroma & (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444);
+ const ptrdiff_t p_stride = f->cur.p.stride[chroma];
+
+ const int unit_size_log2 = f->frame_hdr.restoration.unit_size[!!plane];
+ const int unit_size = 1 << unit_size_log2;
+ const int half_unit_size = unit_size >> 1;
+ const int max_unit_size = unit_size + half_unit_size; // 1.5 * unit_size
+
+ // Y coordinate of the sbrow (y is 8 luma pixel rows above row_y)
+ const int row_y = y + ((8 >> ss_ver) * !!y);
+
+ // FIXME This is an ugly hack to lookup the proper AV1Filter unit for
+ // chroma planes. Question: For Multithreaded decoding, is it better
+ // to store the chroma LR information with collocated Luma information?
+ // In other words, for a chroma restoration unit located at 128,128 and
+ // with a 4:2:0 chroma subsampling, do we store the filter information at
+ // the AV1Filter unit located at (128,128) or (256,256)?
+ // TODO Support chroma subsampling.
+ const int shift_ver = 7 - ss_ver;
+ const int shift_hor = 7 - ss_hor;
+
+ int ruy = (row_y >> unit_size_log2); // vertical restoration unit index of this sb row
+ // Merge last restoration unit if its height is < half_unit_size
+ if (ruy > 0) ruy -= (ruy << unit_size_log2) + half_unit_size > h;
+
+ // The first stripe of the frame is shorter by 8 luma pixel rows.
+ const int filter_h =
+ imin(((1 << (6 + f->seq_hdr.sb128)) - 8 * !y) >> ss_ver, h - y);
+
+ pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
+
+ int unit_w = unit_size, bit = 0; // bit selects which pre_lr_border half the current unit writes
+
+ enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
+ (row_h < h ? LR_HAVE_BOTTOM : 0);
+
+ for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
+ // TODO Clean up this if statement.
+ if (x + max_unit_size > w) { // less than 1.5 units remain: this unit takes all the remaining width
+ unit_w = w - x;
+ edges &= ~LR_HAVE_RIGHT;
+ } else {
+ edges |= LR_HAVE_RIGHT;
+ }
+
+ // Based on the position of the restoration unit, find the corresponding
+ // AV1Filter unit.
+ const int unit_idx = ((ruy & 16) >> 3) + ((rux & 16) >> 4);
+ const Av1RestorationUnit *const lr =
+ &f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
+ (x >> shift_hor)].lr[plane][unit_idx];
+
+ // FIXME Don't backup if the next restoration unit is RESTORE_NONE
+ // This also requires not restoring in the same conditions.
+ if (edges & LR_HAVE_RIGHT) { // save this unit's last 4 columns before it is filtered
+ backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, filter_h);
+ }
+ if (lr->type != RESTORATION_NONE) { // left = the half written by the previous unit (bit was toggled since)
+ lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
+ }
+ p += unit_w;
+ }
+}
+
+void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
+ const int sby)
+{ // Calls lr_sbrow() for superblock row sby on every plane whose frame-level restoration type is not NONE
+ const ptrdiff_t offset_y = 8 * !!sby; // 8 luma rows for every sb row except the first
+ const ptrdiff_t *const dst_stride = f->cur.p.stride;
+
+ const int restore_planes = // bitmask of LR_RESTORE_* flags
+ ((f->frame_hdr.restoration.type[0] != RESTORATION_NONE) << 0) +
+ ((f->frame_hdr.restoration.type[1] != RESTORATION_NONE) << 1) +
+ ((f->frame_hdr.restoration.type[2] != RESTORATION_NONE) << 2);
+
+ if (restore_planes & LR_RESTORE_Y) {
+ const int h = f->cur.p.p.h;
+ const int w = f->cur.p.p.w;
+ const int row_h = imin((sby + 1) << (6 + f->seq_hdr.sb128), h); // end row of this sb row, capped at h
+ const int y_stripe = (sby << (6 + f->seq_hdr.sb128)) - offset_y; // start row, shifted up by offset_y
+ lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
+ h, row_h, 0);
+ }
+ if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
+ const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; // 1 only for the I420 layout
+ const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; // 1 for every layout except I444
+ const int h = (f->cur.p.p.h + ss_ver) >> ss_ver; // subsampled size, rounded up
+ const int w = (f->cur.p.p.w + ss_hor) >> ss_hor;
+ const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr.sb128), h);
+ const ptrdiff_t offset_uv = offset_y >> ss_ver; // the luma offset scaled by ss_ver
+ const int y_stripe =
+ (sby << ((6 - ss_ver) + f->seq_hdr.sb128)) - offset_uv;
+ if (restore_planes & LR_RESTORE_U)
+ lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+ w, h, row_h, 1);
+ // stride index 1 is used for both chroma planes
+ if (restore_planes & LR_RESTORE_V)
+ lr_sbrow(f, dst[2] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
+ w, h, row_h, 2);
+ }
+}
--- a/src/mc.c
+++ /dev/null
@@ -1,542 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "common/attributes.h"
-#include "common/intops.h"
-
-#include "src/mc.h"
-#include "src/tables.h"
-
-static NOINLINE void
-put_c(pixel *dst, const ptrdiff_t dst_stride,
- const pixel *src, const ptrdiff_t src_stride, const int w, int h)
-{
- do {
- pixel_copy(dst, src, w);
-
- dst += dst_stride;
- src += src_stride;
- } while (--h);
-}
-
-static NOINLINE void
-prep_c(coef *tmp, const pixel *src, const ptrdiff_t src_stride,
- const int w, int h)
-{
- do {
- for (int x = 0; x < w; x++)
- tmp[x] = src[x] << 4;
-
- tmp += w;
- src += src_stride;
- } while (--h);
-}
-
-#define FILTER_8TAP(src, x, F, stride) \
- (F[0] * src[x + -3 * stride] + \
- F[1] * src[x + -2 * stride] + \
- F[2] * src[x + -1 * stride] + \
- F[3] * src[x + +0 * stride] + \
- F[4] * src[x + +1 * stride] + \
- F[5] * src[x + +2 * stride] + \
- F[6] * src[x + +3 * stride] + \
- F[7] * src[x + +4 * stride])
-
-#define FILTER_8TAP_RND(src, x, F, stride, sh) \
- ((FILTER_8TAP(src, x, F, stride) + ((1 << sh) >> 1)) >> sh)
-
-#define FILTER_8TAP_CLIP(src, x, F, stride, sh) \
- iclip_pixel(FILTER_8TAP_RND(src, x, F, stride, sh))
-
-#define GET_FILTERS() \
- const int8_t *const fh = !mx ? NULL : w > 4 ? \
- dav1d_mc_subpel_filters[filter_type & 3][mx - 1] : \
- dav1d_mc_subpel_filters[3 + (filter_type & 1)][mx - 1]; \
- const int8_t *const fv = !my ? NULL : h > 4 ? \
- dav1d_mc_subpel_filters[filter_type >> 2][my - 1] : \
- dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][my - 1]; \
-
-static NOINLINE void
-put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
- const pixel *src, ptrdiff_t src_stride,
- const int w, int h, const int mx, const int my,
- const int filter_type)
-{
- GET_FILTERS();
- dst_stride = PXSTRIDE(dst_stride);
- src_stride = PXSTRIDE(src_stride);
-
- if (fh) {
- if (fv) {
- int tmp_h = h + 7;
- coef mid[128 * 135], *mid_ptr = mid;
-
- src -= src_stride * 3;
- do {
- for (int x = 0; x < w; x++)
- mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
-
- mid_ptr += 128;
- src += src_stride;
- } while (--tmp_h);
-
- mid_ptr = mid + 128 * 3;
- do {
- for (int x = 0; x < w; x++)
- dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10);
-
- mid_ptr += 128;
- dst += dst_stride;
- } while (--h);
- } else {
- do {
- for (int x = 0; x < w; x++) {
- const int px = FILTER_8TAP_RND(src, x, fh, 1, 2);
- dst[x] = iclip_pixel((px + 8) >> 4);
- }
-
- dst += dst_stride;
- src += src_stride;
- } while (--h);
- }
- } else if (fv) {
- do {
- for (int x = 0; x < w; x++)
- dst[x] = FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
-
- dst += dst_stride;
- src += src_stride;
- } while (--h);
- } else
- put_c(dst, dst_stride, src, src_stride, w, h);
-}
-
-static NOINLINE void
-prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
- const int w, int h, const int mx, const int my,
- const int filter_type)
-{
- GET_FILTERS();
- src_stride = PXSTRIDE(src_stride);
-
- if (fh) {
- if (fv) {
- int tmp_h = h + 7;
- coef mid[128 * 135], *mid_ptr = mid;
-
- src -= src_stride * 3;
- do {
- for (int x = 0; x < w; x++)
- mid_ptr[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
-
- mid_ptr += 128;
- src += src_stride;
- } while (--tmp_h);
-
- mid_ptr = mid + 128 * 3;
- do {
- for (int x = 0; x < w; x++)
- tmp[x] = FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6);
-
- mid_ptr += 128;
- tmp += w;
- } while (--h);
- } else {
- do {
- for (int x = 0; x < w; x++)
- tmp[x] = FILTER_8TAP_RND(src, x, fh, 1, 2);
-
- tmp += w;
- src += src_stride;
- } while (--h);
- }
- } else if (fv) {
- do {
- for (int x = 0; x < w; x++)
- tmp[x] = FILTER_8TAP_RND(src, x, fv, src_stride, 2);
-
- tmp += w;
- src += src_stride;
- } while (--h);
- } else
- prep_c(tmp, src, src_stride, w, h);
-}
-
-#define filter_fns(type, type_h, type_v) \
-static void put_8tap_##type##_c(pixel *const dst, \
- const ptrdiff_t dst_stride, \
- const pixel *const src, \
- const ptrdiff_t src_stride, \
- const int w, const int h, \
- const int mx, const int my) \
-{ \
- put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
- type_h | (type_v << 2)); \
-} \
-static void prep_8tap_##type##_c(coef *const tmp, \
- const pixel *const src, \
- const ptrdiff_t src_stride, \
- const int w, const int h, \
- const int mx, const int my) \
-{ \
- prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
- type_h | (type_v << 2)); \
-}
-
-filter_fns(regular, FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR)
-filter_fns(regular_sharp, FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP)
-filter_fns(regular_smooth, FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH)
-filter_fns(smooth, FILTER_8TAP_SMOOTH, FILTER_8TAP_SMOOTH)
-filter_fns(smooth_regular, FILTER_8TAP_SMOOTH, FILTER_8TAP_REGULAR)
-filter_fns(smooth_sharp, FILTER_8TAP_SMOOTH, FILTER_8TAP_SHARP)
-filter_fns(sharp, FILTER_8TAP_SHARP, FILTER_8TAP_SHARP)
-filter_fns(sharp_regular, FILTER_8TAP_SHARP, FILTER_8TAP_REGULAR)
-filter_fns(sharp_smooth, FILTER_8TAP_SHARP, FILTER_8TAP_SMOOTH)
-
-#define FILTER_BILIN(src, x, mxy, stride) \
- (16 * src[x] + (mxy * (src[x + stride] - src[x])))
-
-#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
- ((FILTER_BILIN(src, x, mxy, stride) + ((1 << sh) >> 1)) >> sh)
-
-#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
- iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
-
-static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
- const pixel *src, ptrdiff_t src_stride,
- const int w, int h, const int mx, const int my)
-{
- dst_stride = PXSTRIDE(dst_stride);
- src_stride = PXSTRIDE(src_stride);
-
- if (mx) {
- if (my) {
- coef mid[128 * 129], *mid_ptr = mid;
- int tmp_h = h + 1;
-
- do {
- for (int x = 0; x < w; x++)
- mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
-
- mid_ptr += 128;
- src += src_stride;
- } while (--tmp_h);
-
- mid_ptr = mid;
- do {
- for (int x = 0; x < w; x++)
- dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128, 8);
-
- mid_ptr += 128;
- dst += dst_stride;
- } while (--h);
- } else {
- do {
- for (int x = 0; x < w; x++)
- dst[x] = FILTER_BILIN_CLIP(src, x, mx, 1, 4);
-
- dst += dst_stride;
- src += src_stride;
- } while (--h);
- }
- } else if (my) {
- do {
- for (int x = 0; x < w; x++)
- dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
-
- dst += dst_stride;
- src += src_stride;
- } while (--h);
- } else
- put_c(dst, dst_stride, src, src_stride, w, h);
-}
-
-static void prep_bilin_c(coef *tmp,
- const pixel *src, ptrdiff_t src_stride,
- const int w, int h, const int mx, const int my)
-{
- src_stride = PXSTRIDE(src_stride);
-
- if (mx) {
- if (my) {
- coef mid[128 * 129], *mid_ptr = mid;
- int tmp_h = h + 1;
-
- do {
- for (int x = 0; x < w; x++)
- mid_ptr[x] = FILTER_BILIN(src, x, mx, 1);
-
- mid_ptr += 128;
- src += src_stride;
- } while (--tmp_h);
-
- mid_ptr = mid;
- do {
- for (int x = 0; x < w; x++)
- tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4);
-
- mid_ptr += 128;
- tmp += w;
- } while (--h);
- } else {
- do {
- for (int x = 0; x < w; x++)
- tmp[x] = FILTER_BILIN(src, x, mx, 1);
-
- tmp += w;
- src += src_stride;
- } while (--h);
- }
- } else if (my) {
- do {
- for (int x = 0; x < w; x++)
- tmp[x] = FILTER_BILIN(src, x, my, src_stride);
-
- tmp += w;
- src += src_stride;
- } while (--h);
- } else
- prep_c(tmp, src, src_stride, w, h);
-}
-
-static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
- const coef *tmp1, const coef *tmp2, const int w, int h)
-{
- do {
- for (int x = 0; x < w; x++)
- dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + 16) >> 5);
-
- tmp1 += w;
- tmp2 += w;
- dst += PXSTRIDE(dst_stride);
- } while (--h);
-}
-
-static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
- const coef *tmp1, const coef *tmp2, const int w, int h,
- const int weight)
-{
- do {
- for (int x = 0; x < w; x++)
- dst[x] = iclip_pixel((tmp1[x] * weight +
- tmp2[x] * (16 - weight) + 128) >> 8);
-
- tmp1 += w;
- tmp2 += w;
- dst += PXSTRIDE(dst_stride);
- } while (--h);
-}
-
-static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
- const coef *tmp1, const coef *tmp2, const int w, int h,
- const uint8_t *mask)
-{
- do {
- for (int x = 0; x < w; x++)
- dst[x] = iclip_pixel((tmp1[x] * mask[x] +
- tmp2[x] * (64 - mask[x]) + 512) >> 10);
-
- tmp1 += w;
- tmp2 += w;
- mask += w;
- dst += PXSTRIDE(dst_stride);
- } while (--h);
-}
-
-static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
- const pixel *tmp, const ptrdiff_t tmp_stride,
- const int w, const int h,
- const uint8_t *mask, const ptrdiff_t m_stride)
-{
- for (int y = 0; y < h; y++) {
- for (int x = 0; x < w; x++) {
-#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
- dst[x] = blend_px(dst[x], tmp[x], mask[m_stride == 1 ? 0 : x]);
- }
- dst += PXSTRIDE(dst_stride);
- tmp += PXSTRIDE(tmp_stride);
- mask += m_stride;
- }
-}
-
-static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
- const coef *tmp1, const coef *tmp2, const int w, int h,
- uint8_t *mask, const int sign,
- const int ss_hor, const int ss_ver)
-{
- // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows,
- // and then load this intermediate to calculate final value for odd rows
- const int rnd = 8 << (BITDEPTH - 8);
- do {
- for (int x = 0; x < w; x++) {
- const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
- dst[x] = iclip_pixel((tmp1[x] * m +
- tmp2[x] * (64 - m) + 512) >> 10);
-
- if (ss_hor) {
- x++;
-
- const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
- dst[x] = iclip_pixel((tmp1[x] * n +
- tmp2[x] * (64 - n) + 512) >> 10);
-
- if (h & ss_ver) {
- mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
- } else if (ss_ver) {
- mask[x >> 1] = m + n;
- } else {
- mask[x >> 1] = (m + n + 1 - sign) >> 1;
- }
- } else {
- mask[x] = m;
- }
- }
-
- tmp1 += w;
- tmp2 += w;
- dst += PXSTRIDE(dst_stride);
- if (!ss_ver || (h & 1)) mask += w >> ss_hor;
- } while (--h);
-}
-
-#define w_mask_fns(ssn, ss_hor, ss_ver) \
-static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
- const coef *const tmp1, const coef *const tmp2, \
- const int w, const int h, uint8_t *mask, \
- const int sign) \
-{ \
- w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver); \
-}
-
-w_mask_fns(444, 0, 0);
-w_mask_fns(422, 1, 0);
-w_mask_fns(420, 1, 1);
-
-#undef w_mask_fns
-
-static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
- const pixel *src, const ptrdiff_t src_stride,
- const int16_t *const abcd, int mx, int my)
-{
- coef mid[15 * 8], *mid_ptr = mid;
-
- src -= 3 * PXSTRIDE(src_stride);
- for (int y = 0; y < 15; y++, mx += abcd[1]) {
- for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
- const int8_t *const filter =
- dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
-
- mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
- }
- src += PXSTRIDE(src_stride);
- mid_ptr += 8;
- }
-
- mid_ptr = &mid[3 * 8];
- for (int y = 0; y < 8; y++, my += abcd[3]) {
- for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
- const int8_t *const filter =
- dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
-
- dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, filter, 8, 11);
- }
- mid_ptr += 8;
- dst += PXSTRIDE(dst_stride);
- }
-}
-
-static void warp_affine_8x8t_c(coef *tmp, const ptrdiff_t tmp_stride,
- const pixel *src, const ptrdiff_t src_stride,
- const int16_t *const abcd, int mx, int my)
-{
- coef mid[15 * 8], *mid_ptr = mid;
-
- src -= 3 * PXSTRIDE(src_stride);
- for (int y = 0; y < 15; y++, mx += abcd[1]) {
- for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
- const int8_t *const filter =
- dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
-
- mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
- }
- src += PXSTRIDE(src_stride);
- mid_ptr += 8;
- }
-
- mid_ptr = &mid[3 * 8];
- for (int y = 0; y < 8; y++, my += abcd[3]) {
- for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
- const int8_t *const filter =
- dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
-
- tmp[x] = FILTER_8TAP_RND(mid_ptr, x, filter, 8, 7);
- }
- mid_ptr += 8;
- tmp += tmp_stride;
- }
-}
-
-void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
-#define init_mc_fns(type, name) do { \
- c->mc [type] = put_##name##_c; \
- c->mct[type] = prep_##name##_c; \
-} while (0)
-
- init_mc_fns(FILTER_2D_8TAP_REGULAR, 8tap_regular);
- init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
- init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp);
- init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular);
- init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth);
- init_mc_fns(FILTER_2D_8TAP_SHARP, 8tap_sharp);
- init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
- init_mc_fns(FILTER_2D_8TAP_SMOOTH, 8tap_smooth);
- init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp);
- init_mc_fns(FILTER_2D_BILINEAR, bilin);
-
- c->avg = avg_c;
- c->w_avg = w_avg_c;
- c->mask = mask_c;
- c->blend = blend_c;
- c->w_mask[0] = w_mask_444_c;
- c->w_mask[1] = w_mask_422_c;
- c->w_mask[2] = w_mask_420_c;
- c->warp8x8 = warp_affine_8x8_c;
- c->warp8x8t = warp_affine_8x8t_c;
-
-#if HAVE_ASM
-#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_mc_dsp_init_arm)(c);
-#elif ARCH_X86
- bitfn(dav1d_mc_dsp_init_x86)(c);
-#endif
-#endif
-}
--- /dev/null
+++ b/src/mc_tmpl.c
@@ -1,0 +1,542 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/mc.h"
+#include "src/tables.h"
+
+// Copies h rows of w pixels from src to dst. Both strides are added to the
+// pixel pointers as-is; at least one row is always copied.
+static NOINLINE void
+put_c(pixel *dst, const ptrdiff_t dst_stride,
+      const pixel *src, const ptrdiff_t src_stride, const int w, int h)
+{
+    for (;; dst += dst_stride, src += src_stride) {
+        pixel_copy(dst, src, w);
+        if (!--h) return;
+    }
+}
+
+// Stores every source pixel shifted left by 4 into tmp, whose rows are packed
+// exactly w coefficients apart; at least one row is always processed.
+static NOINLINE void
+prep_c(coef *tmp, const pixel *src, const ptrdiff_t src_stride,
+       const int w, int h)
+{
+    for (;; tmp += w, src += src_stride) {
+        for (int col = 0; col < w; col++)
+            tmp[col] = src[col] << 4;
+        if (!--h) return;
+    }
+}
+// Sum of F[k] * src[x + (k - 3) * stride] for k = 0..7 (no rounding).
+#define FILTER_8TAP(src, x, F, stride) \
+    ((F)[0] * (src)[(x) + -3 * (stride)] + \
+     (F)[1] * (src)[(x) + -2 * (stride)] + \
+     (F)[2] * (src)[(x) + -1 * (stride)] + \
+     (F)[3] * (src)[(x) + +0 * (stride)] + \
+     (F)[4] * (src)[(x) + +1 * (stride)] + \
+     (F)[5] * (src)[(x) + +2 * (stride)] + \
+     (F)[6] * (src)[(x) + +3 * (stride)] + \
+     (F)[7] * (src)[(x) + +4 * (stride)])
+// FILTER_8TAP() plus half of (1 << sh), shifted right by sh.
+#define FILTER_8TAP_RND(src, x, F, stride, sh) \
+    ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))
+// FILTER_8TAP_RND() passed through iclip_pixel().
+#define FILTER_8TAP_CLIP(src, x, F, stride, sh) \
+    iclip_pixel(FILTER_8TAP_RND(src, x, F, stride, sh))
+// Declares fh/fv from the mx, my, w, h and filter_type in scope (NULL if mx/my is 0).
+#define GET_FILTERS() \
+    const int8_t *const fh = !mx ? NULL : w > 4 ? \
+        dav1d_mc_subpel_filters[filter_type & 3][mx - 1] : \
+        dav1d_mc_subpel_filters[3 + (filter_type & 1)][mx - 1]; \
+    const int8_t *const fv = !my ? NULL : h > 4 ? \
+        dav1d_mc_subpel_filters[filter_type >> 2][my - 1] : \
+        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][my - 1];
+
+// 8-tap subpel put: filters along x if mx != 0, along y if my != 0, else copies.
+static NOINLINE void
+put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
+           const pixel *src, ptrdiff_t src_stride,
+           const int w, int h, const int mx, const int my,
+           const int filter_type)
+{
+    GET_FILTERS();
+    dst_stride = PXSTRIDE(dst_stride);
+    src_stride = PXSTRIDE(src_stride);
+
+    if (!fh && !fv) {
+        put_c(dst, dst_stride, src, src_stride, w, h);
+        return;
+    }
+    if (!fh) { // vertical only: the taps step through src by whole rows
+        do {
+            for (int col = 0; col < w; col++)
+                dst[col] = FILTER_8TAP_CLIP(src, col, fv, src_stride, 6);
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+        return;
+    }
+    if (!fv) { // horizontal only: rounded by 2 bits, then by another 4
+        do {
+            for (int col = 0; col < w; col++) {
+                const int sum = FILTER_8TAP_RND(src, col, fh, 1, 2);
+                dst[col] = iclip_pixel((sum + 8) >> 4);
+            }
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+        return;
+    }
+    // 2-D: the horizontal pass fills h + 7 rows of mid (3 above and 4 below the
+    // block); the vertical pass then reads mid with a fixed pitch of 128.
+    coef mid[128 * 135], *mid_row = mid;
+    int rows_left = h + 7;
+    src -= src_stride * 3;
+    do {
+        for (int col = 0; col < w; col++)
+            mid_row[col] = FILTER_8TAP_RND(src, col, fh, 1, 2);
+        mid_row += 128;
+        src += src_stride;
+    } while (--rows_left);
+    mid_row = mid + 128 * 3;
+    do {
+        for (int col = 0; col < w; col++)
+            dst[col] = FILTER_8TAP_CLIP(mid_row, col, fv, 128, 10);
+        mid_row += 128;
+        dst += dst_stride;
+    } while (--h);
+}
+
+// 8-tap subpel prep: writes rounded, unclipped filter output to tmp, whose
+// rows are packed w coefficients apart; with mx == my == 0 it uses prep_c().
+static NOINLINE void
+prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
+            const int w, int h, const int mx, const int my,
+            const int filter_type)
+{
+    GET_FILTERS();
+    src_stride = PXSTRIDE(src_stride);
+
+    if (!fh && !fv) {
+        prep_c(tmp, src, src_stride, w, h);
+        return;
+    }
+    if (!fh) { // vertical only: the taps step through src by whole rows
+        do {
+            for (int col = 0; col < w; col++)
+                tmp[col] = FILTER_8TAP_RND(src, col, fv, src_stride, 2);
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+        return;
+    }
+    if (!fv) { // horizontal only
+        do {
+            for (int col = 0; col < w; col++)
+                tmp[col] = FILTER_8TAP_RND(src, col, fh, 1, 2);
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+        return;
+    }
+    // 2-D: horizontal pass into mid (h + 7 rows, pitch 128), then vertical.
+    coef mid[128 * 135], *mid_row = mid;
+    int rows_left = h + 7;
+    src -= src_stride * 3;
+    do {
+        for (int col = 0; col < w; col++)
+            mid_row[col] = FILTER_8TAP_RND(src, col, fh, 1, 2);
+        mid_row += 128;
+        src += src_stride;
+    } while (--rows_left);
+    mid_row = mid + 128 * 3;
+    do {
+        for (int col = 0; col < w; col++)
+            tmp[col] = FILTER_8TAP_RND(mid_row, col, fv, 128, 6);
+        mid_row += 128;
+        tmp += w;
+    } while (--h);
+}
+
+// Generates put_8tap_<name>_c / prep_8tap_<name>_c: thin wrappers that pass
+// hfilt | (vfilt << 2) as the filter_type of put_8tap_c() / prep_8tap_c().
+#define filter_fns(name, hfilt, vfilt) \
+static void put_8tap_##name##_c(pixel *const dst, const ptrdiff_t dst_stride, \
+                                const pixel *const src, \
+                                const ptrdiff_t src_stride, \
+                                const int w, const int h, \
+                                const int mx, const int my) \
+{ \
+    put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
+               hfilt | (vfilt << 2)); \
+} \
+static void prep_8tap_##name##_c(coef *const tmp, const pixel *const src, \
+                                 const ptrdiff_t src_stride, \
+                                 const int w, const int h, \
+                                 const int mx, const int my) \
+{ \
+    prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
+                hfilt | (vfilt << 2)); \
+}
+
+filter_fns(regular, FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR)
+filter_fns(regular_sharp, FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP)
+filter_fns(regular_smooth, FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH)
+filter_fns(smooth, FILTER_8TAP_SMOOTH, FILTER_8TAP_SMOOTH)
+filter_fns(smooth_regular, FILTER_8TAP_SMOOTH, FILTER_8TAP_REGULAR)
+filter_fns(smooth_sharp, FILTER_8TAP_SMOOTH, FILTER_8TAP_SHARP)
+filter_fns(sharp, FILTER_8TAP_SHARP, FILTER_8TAP_SHARP)
+filter_fns(sharp_regular, FILTER_8TAP_SHARP, FILTER_8TAP_REGULAR)
+filter_fns(sharp_smooth, FILTER_8TAP_SHARP, FILTER_8TAP_SMOOTH)
+// 16 * src[x] + mxy * (src[x + stride] - src[x]): a 16x-scaled linear blend.
+#define FILTER_BILIN(src, x, mxy, stride) \
+    (16 * (src)[x] + ((mxy) * ((src)[(x) + (stride)] - (src)[x])))
+// FILTER_BILIN() plus half of (1 << sh), shifted right by sh.
+#define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
+    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh))
+// FILTER_BILIN_RND() passed through iclip_pixel().
+#define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
+    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
+
+// Bilinear put: blends along x when mx != 0 and along y when my != 0; with
+// both zero the block is copied unchanged by put_c().
+static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
+                        const pixel *src, ptrdiff_t src_stride,
+                        const int w, int h, const int mx, const int my)
+{
+    dst_stride = PXSTRIDE(dst_stride);
+    src_stride = PXSTRIDE(src_stride);
+
+    if (!mx && !my) {
+        put_c(dst, dst_stride, src, src_stride, w, h);
+        return;
+    }
+    if (!mx) { // vertical only
+        do {
+            for (int col = 0; col < w; col++)
+                dst[col] = FILTER_BILIN_CLIP(src, col, my, src_stride, 4);
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+        return;
+    }
+    if (!my) { // horizontal only
+        do {
+            for (int col = 0; col < w; col++)
+                dst[col] = FILTER_BILIN_CLIP(src, col, mx, 1, 4);
+            dst += dst_stride;
+            src += src_stride;
+        } while (--h);
+        return;
+    }
+    // 2-D: h + 1 unrounded horizontal rows go into mid (pitch 128), then y.
+    coef mid[128 * 129], *mid_row = mid;
+    int rows_left = h + 1;
+    do {
+        for (int col = 0; col < w; col++)
+            mid_row[col] = FILTER_BILIN(src, col, mx, 1);
+        mid_row += 128;
+        src += src_stride;
+    } while (--rows_left);
+    mid_row = mid;
+    do {
+        for (int col = 0; col < w; col++)
+            dst[col] = FILTER_BILIN_CLIP(mid_row, col, my, 128, 8);
+        mid_row += 128;
+        dst += dst_stride;
+    } while (--h);
+}
+
+// Bilinear prep: writes 16x-scaled blends to tmp, whose rows are packed w
+// coefficients apart; with mx == my == 0 it defers to prep_c().
+static void prep_bilin_c(coef *tmp,
+                         const pixel *src, ptrdiff_t src_stride,
+                         const int w, int h, const int mx, const int my)
+{
+    src_stride = PXSTRIDE(src_stride);
+
+    if (!mx && !my) {
+        prep_c(tmp, src, src_stride, w, h);
+        return;
+    }
+    if (!mx) { // vertical only
+        do {
+            for (int col = 0; col < w; col++)
+                tmp[col] = FILTER_BILIN(src, col, my, src_stride);
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+        return;
+    }
+    if (!my) { // horizontal only
+        do {
+            for (int col = 0; col < w; col++)
+                tmp[col] = FILTER_BILIN(src, col, mx, 1);
+            tmp += w;
+            src += src_stride;
+        } while (--h);
+        return;
+    }
+    // 2-D: h + 1 horizontal rows into mid (pitch 128), then y rounded by 4 bits.
+    coef mid[128 * 129], *mid_row = mid;
+    int rows_left = h + 1;
+    do {
+        for (int col = 0; col < w; col++)
+            mid_row[col] = FILTER_BILIN(src, col, mx, 1);
+        mid_row += 128;
+        src += src_stride;
+    } while (--rows_left);
+    mid_row = mid;
+    do {
+        for (int col = 0; col < w; col++)
+            tmp[col] = FILTER_BILIN_RND(mid_row, col, my, 128, 4);
+        mid_row += 128;
+        tmp += w;
+    } while (--h);
+}
+
+// dst = iclip_pixel((tmp1 + tmp2 + 16) >> 5) per pixel; tmp rows are w apart.
+static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
+                  const coef *tmp1, const coef *tmp2, const int w, int h)
+{
+    const ptrdiff_t dst_pitch = PXSTRIDE(dst_stride);
+
+    for (;; tmp1 += w, tmp2 += w, dst += dst_pitch) {
+        for (int col = 0; col < w; col++)
+            dst[col] = iclip_pixel((tmp1[col] + tmp2[col] + 16) >> 5);
+        if (!--h) return;
+    }
+}
+
+// dst = iclip_pixel((tmp1 * weight + tmp2 * (16 - weight) + 128) >> 8) per pixel.
+static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
+                    const coef *tmp1, const coef *tmp2, const int w, int h,
+                    const int weight)
+{
+    const ptrdiff_t dst_pitch = PXSTRIDE(dst_stride);
+    const int weight2 = 16 - weight;
+    for (;; tmp1 += w, tmp2 += w, dst += dst_pitch) {
+        for (int col = 0; col < w; col++)
+            dst[col] = iclip_pixel((tmp1[col] * weight +
+                                    tmp2[col] * weight2 + 128) >> 8);
+        if (!--h) return;
+    }
+}
+
+// dst = iclip_pixel((tmp1 * m + tmp2 * (64 - m) + 512) >> 10) with m = mask[x].
+static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
+                   const coef *tmp1, const coef *tmp2, const int w, int h,
+                   const uint8_t *mask)
+{
+    const ptrdiff_t dst_pitch = PXSTRIDE(dst_stride);
+    for (;; tmp1 += w, tmp2 += w, mask += w, dst += dst_pitch) {
+        for (int col = 0; col < w; col++) {
+            const int m = mask[col];
+            dst[col] = iclip_pixel((tmp1[col] * m +
+                                    tmp2[col] * (64 - m) + 512) >> 10);
+        }
+        if (!--h) return;
+    }
+}
+
+// blend_px: (a * (64 - m) + b * m + 32) >> 6; blend_c applies it to each pixel.
+#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
+                    const pixel *tmp, const ptrdiff_t tmp_stride,
+                    const int w, const int h,
+                    const uint8_t *mask, const ptrdiff_t m_stride)
+{
+    // With m_stride == 1 every column of a row uses that row's mask[0].
+    for (int row = 0; row < h; row++, mask += m_stride) {
+        for (int col = 0; col < w; col++)
+            dst[col] = blend_px(dst[col], tmp[col], mask[m_stride == 1 ? 0 : col]);
+        dst += PXSTRIDE(dst_stride);
+        tmp += PXSTRIDE(tmp_stride);
+    }
+}
+// Blends tmp1/tmp2 with a weight computed from |tmp1 - tmp2| and writes the weights to mask.
+static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
+ const coef *tmp1, const coef *tmp2, const int w, int h,
+ uint8_t *mask, const int sign,
+ const int ss_hor, const int ss_ver)
+{
+ // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows,
+ // and then load this intermediate to calculate final value for odd rows
+ const int rnd = 8 << (BITDEPTH - 8);
+ do {
+ for (int x = 0; x < w; x++) {
+ const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
+ dst[x] = iclip_pixel((tmp1[x] * m +
+ tmp2[x] * (64 - m) + 512) >> 10);
+ // with ss_hor, two horizontally adjacent pixels share mask[x >> 1]
+ if (ss_hor) {
+ x++; // second pixel of the pair; the for loop then moves on to the next pair
+
+ const int n = imin(38 + ((abs(tmp1[x] - tmp2[x]) + rnd) >> BITDEPTH), 64);
+ dst[x] = iclip_pixel((tmp1[x] * n +
+ tmp2[x] * (64 - n) + 512) >> 10);
+ // h counts down; with ss_ver == 1, (h & ss_ver) is non-zero on rows where h is odd
+ if (h & ss_ver) {
+ mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2; // expects the m + n sum left by the preceding row
+ } else if (ss_ver) {
+ mask[x >> 1] = m + n; // intermediate 2x1 sum; mask is not advanced after this row
+ } else {
+ mask[x >> 1] = (m + n + 1 - sign) >> 1;
+ }
+ } else {
+ mask[x] = m;
+ }
+ }
+
+ tmp1 += w;
+ tmp2 += w;
+ dst += PXSTRIDE(dst_stride);
+ if (!ss_ver || (h & 1)) mask += w >> ss_hor; // with ss_ver, advance only after rows where h is odd
+ } while (--h);
+}
+// Generates w_mask_<ssn>_c: w_mask_c() with the two subsampling flags fixed.
+#define w_mask_fns(ssn, ss_hor, ss_ver) \
+static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
+                             const coef *const tmp1, const coef *const tmp2, \
+                             const int w, const int h, uint8_t *mask, \
+                             const int sign) \
+{ \
+    w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver); \
+}
+// No ';' after the invocations: each one is already a full function definition.
+w_mask_fns(444, 0, 0)
+w_mask_fns(422, 1, 0)
+w_mask_fns(420, 1, 1)
+
+#undef w_mask_fns
+
+// Affine warp of one 8x8 block: a horizontal 8-tap pass over 15 source rows
+// fills mid, then a vertical pass over mid writes clipped pixels to dst.
+static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
+                              const pixel *src, const ptrdiff_t src_stride,
+                              const int16_t *const abcd, int mx, int my)
+{
+    coef mid[15 * 8], *mid_row = mid;
+    // Start 3 rows up; the position moves by abcd[0] per column, abcd[1] per row.
+    src -= 3 * PXSTRIDE(src_stride);
+    for (int row = 0; row < 15; row++, mx += abcd[1]) {
+        for (int col = 0, pos = mx; col < 8; col++, pos += abcd[0]) {
+            const int8_t *const taps =
+                dav1d_mc_warp_filter[64 + ((pos + 512) >> 10)];
+            mid_row[col] = FILTER_8TAP_RND(src, col, taps, 1, 3);
+        }
+        src += PXSTRIDE(src_stride);
+        mid_row += 8;
+    }
+    // Vertical pass: the position moves by abcd[2] per column, abcd[3] per row.
+    mid_row = &mid[3 * 8];
+    for (int row = 0; row < 8; row++, my += abcd[3]) {
+        for (int col = 0, pos = my; col < 8; col++, pos += abcd[2]) {
+            const int8_t *const taps =
+                dav1d_mc_warp_filter[64 + ((pos + 512) >> 10)];
+            dst[col] = FILTER_8TAP_CLIP(mid_row, col, taps, 8, 11);
+        }
+        mid_row += 8;
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
+// Affine warp of one 8x8 block into coefficients: same two passes through mid,
+// but the vertical pass rounds by 7 bits, does not clip, and steps tmp by tmp_stride.
+static void warp_affine_8x8t_c(coef *tmp, const ptrdiff_t tmp_stride,
+                               const pixel *src, const ptrdiff_t src_stride,
+                               const int16_t *const abcd, int mx, int my)
+{
+    coef mid[15 * 8], *mid_row = mid;
+    // Start 3 rows up; the position moves by abcd[0] per column, abcd[1] per row.
+    src -= 3 * PXSTRIDE(src_stride);
+    for (int row = 0; row < 15; row++, mx += abcd[1]) {
+        for (int col = 0, pos = mx; col < 8; col++, pos += abcd[0]) {
+            const int8_t *const taps =
+                dav1d_mc_warp_filter[64 + ((pos + 512) >> 10)];
+            mid_row[col] = FILTER_8TAP_RND(src, col, taps, 1, 3);
+        }
+        src += PXSTRIDE(src_stride);
+        mid_row += 8;
+    }
+    // Vertical pass: the position moves by abcd[2] per column, abcd[3] per row.
+    mid_row = &mid[3 * 8];
+    for (int row = 0; row < 8; row++, my += abcd[3]) {
+        for (int col = 0, pos = my; col < 8; col++, pos += abcd[2]) {
+            const int8_t *const taps =
+                dav1d_mc_warp_filter[64 + ((pos + 512) >> 10)];
+            tmp[col] = FILTER_8TAP_RND(mid_row, col, taps, 8, 7);
+        }
+        mid_row += 8;
+        tmp += tmp_stride;
+    }
+}
+
+// Points every entry of c at the C implementations in this file, then calls
+// the ARM or x86 init when the build has HAVE_ASM set for that architecture.
+void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
+#define init_mc_fns(idx, fn) do { \
+    c->mc [idx] = put_##fn##_c; \
+    c->mct[idx] = prep_##fn##_c; \
+} while (0)
+
+    init_mc_fns(FILTER_2D_8TAP_REGULAR, 8tap_regular);
+    init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
+    init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp);
+    init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular);
+    init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth);
+    init_mc_fns(FILTER_2D_8TAP_SHARP, 8tap_sharp);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH, 8tap_smooth);
+    init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp);
+    init_mc_fns(FILTER_2D_BILINEAR, bilin);
+
+    c->avg = avg_c;
+    c->w_avg = w_avg_c;
+    c->mask = mask_c;
+    c->blend = blend_c;
+    c->w_mask[0] = w_mask_444_c;
+    c->w_mask[1] = w_mask_422_c;
+    c->w_mask[2] = w_mask_420_c;
+    c->warp8x8 = warp_affine_8x8_c;
+    c->warp8x8t = warp_affine_8x8t_c;
+
+#if HAVE_ASM && (ARCH_AARCH64 || ARCH_ARM)
+    bitfn(dav1d_mc_dsp_init_arm)(c);
+#elif HAVE_ASM && ARCH_X86
+    bitfn(dav1d_mc_dsp_init_x86)(c);
+#endif
+}
--- a/src/meson.build
+++ b/src/meson.build
@@ -52,17 +52,17 @@
# These files are compiled for each bitdepth with
# `BITDEPTH` defined to the currently built bitdepth.
libdav1d_tmpl_sources = files(
- 'ipred.c',
- 'itx.c',
- 'ipred_prepare.c',
- 'lf_apply.c',
- 'loopfilter.c',
- 'mc.c',
- 'cdef_apply.c',
- 'cdef.c',
- 'lr_apply.c',
- 'looprestoration.c',
- 'recon.c'
+ 'ipred_tmpl.c',
+ 'itx_tmpl.c',
+ 'ipred_prepare_tmpl.c',
+ 'lf_apply_tmpl.c',
+ 'loopfilter_tmpl.c',
+ 'mc_tmpl.c',
+ 'cdef_apply_tmpl.c',
+ 'cdef_tmpl.c',
+ 'lr_apply_tmpl.c',
+ 'looprestoration_tmpl.c',
+ 'recon_tmpl.c'
)
# libdav1d entrypoint source files
--- a/src/recon.c
+++ /dev/null
@@ -1,1518 +1,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <string.h>
-#include <stdio.h>
-
-#include "common/attributes.h"
-#include "common/bitdepth.h"
-#include "common/dump.h"
-#include "common/intops.h"
-#include "common/mem.h"
-
-#include "src/cdef_apply.h"
-#include "src/ipred_prepare.h"
-#include "src/lf_apply.h"
-#include "src/lr_apply.h"
-#include "src/recon.h"
-#include "src/scan.h"
-#include "src/tables.h"
-#include "src/wedge.h"
-
-static unsigned read_golomb(MsacContext *const msac) { // Read a prefix-coded unsigned value: a run of zero bools gives the bit count, then that many bits follow; returns the assembled value minus 1.
- int len = 0; // number of suffix bits to read
- unsigned val = 1; // implicit leading 1 bit; removed again by the final "- 1"
-
- while (!msac_decode_bool(msac, 128 << 7) && len < 32) len++; // count zero bools; ends on the first non-zero bool, or on a zero bool once len has reached 32
- while (len--) val = (val << 1) | msac_decode_bool(msac, 128 << 7); // append len bits, most significant first
-
- return val - 1;
-}
-
-static int decode_coefs(Dav1dTileContext *const t, // Decode one transform block's coefficients into cf[]; returns eob, or -1 when the block is coded as all-skip. Also writes *txtp and *res_ctx.
- uint8_t *const a, uint8_t *const l, // context bytes forwarded to get_coef_skip_ctx()/get_dc_sign_ctx() (presumably above/left; callers pass t->a->... and t->l....)
- const enum RectTxfmSize tx, const enum BlockSize bs,
- const Av1Block *const b, const int intra,
- const int plane, coef *cf,
- enum TxfmType *const txtp, uint8_t *res_ctx) // *txtp is in/out: for inter chroma the incoming value is read as the luma type
-{
- Dav1dTileState *const ts = t->ts;
- const int chroma = !!plane;
- const Dav1dFrameContext *const f = t->f;
- const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
- const int dbg = DEBUG_BLOCK_INFO && plane && 0; // always 0 because of the trailing "&& 0", so the dbg printfs below never run
-
- if (dbg) printf("Start: r=%d\n", ts->msac.rng);
-
- // does this block have any non-zero coefficients
- const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.p.layout);
- const int all_skip =
- msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]);
- if (dbg)
- printf("Post-non-zero[%d][%d][%d]: r=%d\n",
- t_dim->ctx, sctx, all_skip, ts->msac.rng);
- if (all_skip) {
- *res_ctx = 0x40; // same encoding as the final *res_ctx below: level 0, dc_sign code 1
- *txtp = f->frame_hdr.segmentation.lossless[b->seg_id] ? WHT_WHT :
- DCT_DCT;
- return -1;
- }
-
- // transform type (chroma: derived, luma: explicitly coded)
- if (chroma) {
- if (intra) {
- *txtp = get_uv_intra_txtp(b->uv_mode, tx, &f->frame_hdr, b->seg_id);
- } else {
- const enum TxfmType y_txtp = *txtp;
- *txtp = get_uv_inter_txtp(t_dim, y_txtp, &f->frame_hdr, b->seg_id);
- }
- } else {
- const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
- &f->frame_hdr, b->seg_id);
- const unsigned set_cnt = dav1d_tx_type_count[set];
- unsigned idx;
- if (set_cnt == 1) {
- idx = 0; // single-entry set: nothing is read from the bitstream
- } else {
- const int set_idx = dav1d_tx_type_set_index[!intra][set];
- const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
- dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
- uint16_t *const txtp_cdf = intra ?
- ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
- ts->cdf.m.txtp_inter[set_idx][t_dim->min];
- idx = msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
- if (dbg)
- printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
- set, set_idx, tx, t_dim->min, b->intra ? (int)y_mode_nofilt : -1,
- idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
- }
- *txtp = dav1d_tx_types_per_set[set][idx];
- }
-
- // find end-of-block (eob)
- int eob_bin;
- const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
- const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
- const int is_1d = tx_class != TX_CLASS_2D;
- switch (tx2dszctx) { // no default case: eob_bin is only assigned for tx2dszctx in 0..6
-#define case_sz(sz, bin) \
- case sz: { \
- uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
- eob_bin = msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
- break; \
- }
- case_sz(0, 16);
- case_sz(1, 32);
- case_sz(2, 64);
- case_sz(3, 128);
- case_sz(4, 256);
- case_sz(5, 512);
- case_sz(6, 1024);
-#undef case_sz
- }
- if (dbg)
- printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
- 16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
- int eob;
- if (eob_bin > 1) {
- eob = 1 << (eob_bin - 1); // top bit of eob is implied by the bin
- uint16_t *const eob_hi_bit_cdf =
- ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
- const int eob_hi_bit = msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
- if (dbg)
- printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
- t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
- unsigned mask = eob >> 1;
- if (eob_hi_bit) eob |= mask; // second-highest bit uses an adaptive cdf
- for (mask >>= 1; mask; mask >>= 1) { // remaining lower bits are read with the fixed 128 << 7 probability
- const int eob_bit = msac_decode_bool(&ts->msac, 128 << 7);
- if (eob_bit) eob |= mask;
- }
- if (dbg)
- printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
- } else {
- eob = eob_bin; // bins 0 and 1 map directly to eob 0 and 1
- }
-
- // base tokens
- uint16_t (*const br_cdf)[5] =
- ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
- const int16_t *const scan = dav1d_scans[tx][tx_class];
- uint8_t levels[36 * 36]; // scratch map of decoded token values, read back by get_coef_nz_ctx()/get_br_ctx()
- ptrdiff_t stride = 4 * (imin(t_dim->h, 8) + 1);
- memset(levels, 0, stride * 4 * (imin(t_dim->w, 8) + 1)); // only the portion used for this transform size is cleared
- const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * imin(t_dim->h, 8) - 1;
- unsigned cul_level = 0;
- for (int i = eob, is_last = 1; i >= 0; i--, is_last = 0) { // walk scan positions from eob down to 0; only i == eob has is_last set
- const int rc = scan[i], x = rc >> shift, y = rc & mask; // rc packs x in the high bits and y in the low bits
-
- // lo tok
- const int ctx = get_coef_nz_ctx(levels, i, rc, is_last, tx, tx_class);
- uint16_t *const lo_cdf = is_last ?
- ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
- ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
- int tok = msac_decode_symbol_adapt(&ts->msac, lo_cdf,
- 4 - is_last) + is_last; // at the eob position the decoded symbol is offset by 1
- if (dbg)
- printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
- t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
- if (!tok) continue; // zero coefficient: cf[rc] and levels[] are left untouched
-
- // hi tok
- if (tok == 3) {
- const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
- do { // keep adding symbols while each one is 3 and tok is still below 15
- const int tok_br =
- msac_decode_symbol_adapt(&ts->msac, br_cdf[br_ctx], 4);
- if (dbg)
- printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
- imin(t_dim->ctx, 3), chroma, br_ctx,
- i, rc, tok_br, tok, ts->msac.rng);
- tok += tok_br;
- if (tok_br < 3) break;
- } while (tok < 15);
- }
-
- levels[x * stride + y] = cf[rc] = tok; // cf[] temporarily holds the token; the next loop replaces it with the dequantized value
- }
-
- // residual and sign
- int dc_sign = 1; // stays 1 when the DC token is zero; otherwise becomes 0 (sign bit set) or 2 (sign bit clear)
- const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
- const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
- const int dq_shift = imax(0, t_dim->ctx - 2);
- for (int i = 0; i <= eob; i++) { // second pass runs forward in scan order over the non-zero tokens
- const int rc = scan[i];
- int tok = cf[rc];
- if (!tok) continue;
- int dq;
-
- // sign
- int sign;
- if (i == 0) {
- const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
- uint16_t *const dc_sign_cdf =
- ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
- sign = msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
- if (dbg)
- printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
- chroma, dc_sign_ctx, sign, ts->msac.rng);
- dc_sign = sign ? 0 : 2;
- dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5; // scan position 0 uses dq_tbl[0]
- } else {
- sign = msac_decode_bool(&ts->msac, 128 << 7);
- if (dbg)
- printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
- dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5; // all other positions use dq_tbl[1]
- }
-
- // residual
- if (tok == 15) { // 15 is the escape value: the remainder is read via read_golomb()
- tok += read_golomb(&ts->msac);
- if (dbg)
- printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
- i, rc, tok - 15, tok, ts->msac.rng);
- }
-
- // dequant
- cul_level += tok; // accumulated before dequantization
- tok *= dq;
- tok >>= dq_shift;
- cf[rc] = sign ? -tok : tok;
- }
-
- // context
- *res_ctx = imin(cul_level, 63) | (dc_sign << 6); // low 6 bits: summed level capped at 63; bits 6 and up: dc_sign code
-
- return eob;
-}
-
-static void read_coef_tree(Dav1dTileContext *const t, // Recursively walk the luma transform split tree; at each leaf decode coefficients (unless pass 2) and add the inverse transform to dst (unless pass 1).
- const enum BlockSize bs, const Av1Block *const b,
- const enum RectTxfmSize ytx, const int depth,
- const uint16_t *const tx_split,
- const int x_off, const int y_off, pixel *dst) // dst may be NULL; it is only dereferenced on the reconstruction path
-{
- const Dav1dFrameContext *const f = t->f;
- Dav1dTileState *const ts = t->ts;
- const Dav1dDSPContext *const dsp = f->dsp;
- const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
- const int txw = t_dim->w, txh = t_dim->h;
-
- if (depth < 2 && tx_split[depth] & (1 << (y_off * 4 + x_off))) { // split flag for this node: bit (y_off * 4 + x_off) of tx_split[depth]
- const enum RectTxfmSize sub = t_dim->sub;
- const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
- const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
-
- read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
- x_off * 2 + 0, y_off * 2 + 0, dst);
- t->bx += txsw; // t->bx / t->by are moved temporarily for each child and restored below
- if (txw >= txh && t->bx < f->bw) // right child only when the parent is at least as wide as tall and the position is inside the frame
- read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
- y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
- t->bx -= txsw;
- t->by += txsh;
- if (txh >= txw && t->by < f->bh) { // bottom children only when the parent is at least as tall as wide and inside the frame
- if (dst)
- dst += 4 * txsh * PXSTRIDE(f->cur.p.stride[0]);
- read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
- x_off * 2 + 0, y_off * 2 + 1, dst);
- t->bx += txsw;
- if (txw >= txh && t->bx < f->bw)
- read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
- y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
- t->bx -= txsw;
- }
- t->by -= txsh;
- } else {
- const int bx4 = t->bx & 31, by4 = t->by & 31;
- enum TxfmType txtp;
- uint8_t cf_ctx;
- int eob;
- coef *cf;
- struct CodedBlockInfo *cbi; // only assigned, and only used, when f->frame_thread.pass is non-zero
-
- if (f->frame_thread.pass) {
- cf = ts->frame_thread.cf;
- ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; // advance the shared coefficient buffer past this block
- cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
- } else {
- cf = t->cf;
- }
- if (f->frame_thread.pass != 2) { // pass 0 or 1: read from the bitstream
- eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
- ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
- if (DEBUG_BLOCK_INFO)
- printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
- ytx, txtp, eob, ts->msac.rng);
- memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx)); // context update is clipped to the frame edge
- memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
- for (int y = 0; y < txh; y++)
- memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw); // record the luma txtp per 4x4 unit; read back for inter chroma in dav1d_read_coef_blocks()
- if (f->frame_thread.pass == 1) {
- cbi->eob[0] = eob;
- cbi->txtp[0] = txtp;
- }
- } else { // pass 2: reuse what pass 1 stored
- eob = cbi->eob[0];
- txtp = cbi->txtp[0];
- }
- if (!(f->frame_thread.pass & 1)) { // pass 0 or 2: reconstruct
- assert(dst);
- if (eob >= 0) {
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
- coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
- dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.p.stride[0], cf, eob);
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
- hex_dump(dst, f->cur.p.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
- }
- }
- }
-}
-
-void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t, // Frame-threading pass 1: decode all luma and chroma coefficients of block b into ts->frame_thread.cf and record eob/txtp in f->frame_thread.cbi; no reconstruction is done here.
- const enum BlockSize bs, const Av1Block *const b)
-{
- const Dav1dFrameContext *const f = t->f;
- const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int bx4 = t->bx & 31, by4 = t->by & 31;
- const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
- const uint8_t *const b_dim = dav1d_block_dimensions[bs];
- const int bw4 = b_dim[0], bh4 = b_dim[1];
- const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver; // round up only when the axis is subsampled; a fixed "+ 1" would give bw4 + 1 / bh4 + 1 when the shift is 0
- const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
- (bw4 > ss_hor || t->bx & 1) &&
- (bh4 > ss_ver || t->by & 1);
-
- if (b->skip) { // skipped block: nothing to decode, only reset the coefficient contexts to 0x40
- memset(&t->a->lcoef[bx4], 0x40, bw4);
- memset(&t->l.lcoef[by4], 0x40, bh4);
- if (has_chroma) for (int pl = 0; pl < 2; pl++) {
- memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
- memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
- }
- return;
- }
-
- Dav1dTileState *const ts = t->ts;
- const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by); // block size clipped to the frame, in 4-pixel units
- const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
- assert(f->frame_thread.pass == 1);
- assert(!b->skip);
- const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
- const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
-
- for (int init_y = 0; init_y < h4; init_y += 16) { // the block is processed in sub-areas of at most 16x16 4-pixel units
- for (int init_x = 0; init_x < w4; init_x += 16) {
- const int sub_h4 = imin(h4, 16 + init_y);
- const int sub_w4 = imin(w4, init_x + 16);
- int y_off = !!init_y, y, x;
- for (y = init_y, t->by += init_y; y < sub_h4; // t->by / t->bx are advanced in step with y / x and restored after each loop
- y += t_dim->h, t->by += t_dim->h, y_off++)
- {
- struct CodedBlockInfo *const cbi =
- &f->frame_thread.cbi[t->by * f->b4_stride];
- int x_off = !!init_x;
- for (x = init_x, t->bx += init_x; x < sub_w4;
- x += t_dim->w, t->bx += t_dim->w, x_off++)
- {
- if (!b->intra) {
- read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
- x_off, y_off, NULL); // NULL dst: decode only
- } else {
- uint8_t cf_ctx = 0x40;
- enum TxfmType txtp;
- const int eob = cbi[t->bx].eob[0] =
- decode_coefs(t, &t->a->lcoef[bx4 + x],
- &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
- 0, ts->frame_thread.cf, &txtp, &cf_ctx);
- if (DEBUG_BLOCK_INFO)
- printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
- b->tx, txtp, eob, ts->msac.rng);
- cbi[t->bx].txtp[0] = txtp;
- ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; // advance the shared coefficient buffer past this block
- memset(&t->a->lcoef[bx4 + x], cf_ctx,
- imin(t_dim->w, f->bw - t->bx));
- memset(&t->l.lcoef[by4 + y], cf_ctx,
- imin(t_dim->h, f->bh - t->by));
- }
- }
- t->bx -= x;
- }
- t->by -= y;
-
- if (!has_chroma) continue;
-
- const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
- const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
- for (int pl = 0; pl < 2; pl++) { // the two chroma planes; decode_coefs() receives plane index 1 + pl
- for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
- y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) // y counts chroma units while t->by stays in luma units
- {
- struct CodedBlockInfo *const cbi =
- &f->frame_thread.cbi[t->by * f->b4_stride];
- for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
- x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
- {
- uint8_t cf_ctx = 0x40;
- enum TxfmType txtp;
- if (!b->intra) // inter chroma derives its type from the luma txtp stored in t->txtp_map
- txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
- bx4 + (x << ss_hor)];
- const int eob = cbi[t->bx].eob[1 + pl] =
- decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
- &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
- b, b->intra, 1 + pl, ts->frame_thread.cf,
- &txtp, &cf_ctx);
- if (DEBUG_BLOCK_INFO)
- printf("Post-uv-cf-blk[pl=%d,tx=%d,"
- "txtp=%d,eob=%d]: r=%d\n",
- pl, b->uvtx, txtp, eob, ts->msac.rng);
- cbi[t->bx].txtp[1 + pl] = txtp;
- ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
- memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
- imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
- memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
- imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
- }
- t->bx -= x << ss_hor;
- }
- t->by -= y << ss_ver;
- }
- }
- }
-}
-
-static void emu_edge(pixel *dst, const ptrdiff_t dst_stride, // Build a bw x bh block in dst from ref at position (x, y) of an iw x ih image, replicating edge pixels for the parts that fall outside the image.
- const pixel *ref, const ptrdiff_t ref_stride,
- const int bw, const int bh,
- const int iw, const int ih,
- const int x, const int y) // x / y may be negative or reach past iw / ih
-{
- // find offset in reference of visible block to copy
- ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw - 1);
-
- // number of pixels to extend (left, right, top, bottom)
- const int left_ext = iclip(-x, 0, bw - 1);
- const int right_ext = iclip(x + bw - iw, 0, bw - 1);
- assert(left_ext + right_ext < bw); // at least one source column must remain
- const int top_ext = iclip(-y, 0, bh - 1);
- const int bottom_ext = iclip(y + bh - ih, 0, bh - 1);
- assert(top_ext + bottom_ext < bh); // at least one source row must remain
-
- // copy visible portion first
- pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
- const int center_w = bw - left_ext - right_ext;
- const int center_h = bh - top_ext - bottom_ext;
- for (int y = 0; y < center_h; y++) { // note: this loop-local y shadows the y parameter
- pixel_copy(blk + left_ext, ref, center_w);
- // extend left edge for this line
- if (left_ext)
- pixel_set(blk, blk[left_ext], left_ext);
- // extend right edge for this line
- if (right_ext)
- pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
- right_ext);
- ref += PXSTRIDE(ref_stride);
- blk += PXSTRIDE(dst_stride);
- }
-
- // copy top
- blk = dst + top_ext * PXSTRIDE(dst_stride); // first centre row, already extended left/right
- for (int y = 0; y < top_ext; y++) {
- pixel_copy(dst, blk, bw);
- dst += PXSTRIDE(dst_stride);
- }
-
- // copy bottom
- dst += center_h * PXSTRIDE(dst_stride); // dst now points at the first row below the centre rows
- for (int y = 0; y < bottom_ext; y++) {
- pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw); // each bottom row duplicates the row directly above it
- dst += PXSTRIDE(dst_stride);
- }
-}
-
-static void mc(Dav1dTileContext *const t, // Motion-compensate one block of plane pl from refp using vector mv; output goes to dst8 via mc.mc[] or to dst16 via mc.mct[].
- pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,
- const int bw4, const int bh4,
- const int bx, const int by, const int pl,
- const mv mv, const Dav1dThreadPicture *const refp,
- const enum Filter2d filter_2d)
-{
- assert((dst8 != NULL) ^ (dst16 != NULL)); // exactly one of the two destinations must be given
- const Dav1dFrameContext *const f = t->f;
- const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; // pixels per 4x4 unit in this plane
- const int mvx = mv.x, mvy = mv.y;
- const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver); // fractional part of the vector: mask 7 without subsampling, 15 with
- const int dx = bx * h_mul + (mvx >> (3 + ss_hor)); // integer source position in this plane
- const int dy = by * v_mul + (mvy >> (3 + ss_ver));
- ptrdiff_t ref_stride = refp->p.stride[!!pl];
- const pixel *ref;
- int w, h;
-
- if (refp != &f->cur) { // i.e. not for intrabc
- dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4, // row argument is the last source row needed, including the 4 extra rows used when my != 0
- PLANE_TYPE_Y + !!pl);
- w = (f->cur.p.p.w + ss_hor) >> ss_hor;
- h = (f->cur.p.p.h + ss_ver) >> ss_ver;
- } else {
- w = f->bw * 4 >> ss_hor;
- h = f->bh * 4 >> ss_ver;
- }
- if (dx < !!mx * 3 || dy < !!my * 3 || // a fractional offset needs 3 extra pixels before and 4 after the block
- dx + bw4 * h_mul + !!mx * 4 > w ||
- dy + bh4 * v_mul + !!my * 4 > h)
- {
- emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl], ref_stride,
- bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, w, h,
- dx - !!mx * 3, dy - !!my * 3);
- ref = &t->emu_edge[160 * !!my * 3 + !!mx * 3]; // skip the 3-pixel lead-in so ref points at the block origin
- ref_stride = 160 * sizeof(pixel);
- } else {
- ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
- }
-
- if (dst8 != NULL) {
- f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
- bh4 * v_mul, mx << !ss_hor, my << !ss_ver); // fraction is shifted left by 1 when the axis is not subsampled, so both cases pass a value in 0..15
- } else {
- f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
- bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
- }
-}
-
-static void obmc(Dav1dTileContext *const t, // Overlapped block MC: predict from the top and left neighbours' motion vectors into t->scratch.lap and blend the result into dst.
- pixel *const dst, const ptrdiff_t dst_stride,
- const uint8_t *const b_dim, const int pl,
- const int bx4, const int by4, const int w4, const int h4)
-{
- assert(!(t->bx & 1) && !(t->by & 1));
- const Dav1dFrameContext *const f = t->f;
- const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
- pixel *const lap = t->scratch.lap;
- static const uint8_t obmc_mask_2[2] = { 19, 0 }; // blend weight tables, one per overlap length (2, 4, 8, 16, 32)
- static const uint8_t obmc_mask_4[4] = { 25, 14, 5, 0 };
- static const uint8_t obmc_mask_8[8] = { 28, 22, 16, 11, 7, 3, 0, 0 };
- static const uint8_t obmc_mask_16[16] = { 30, 27, 24, 21, 18, 15, 12, 10,
- 8, 6, 4, 3, 0, 0, 0, 0 };
- static const uint8_t obmc_mask_32[32] = { 31, 29, 28, 26, 24, 23, 21, 20,
- 19, 17, 16, 14, 13, 12, 11, 9,
- 8, 7, 6, 5, 4, 4, 3, 2,
- 0, 0, 0, 0, 0, 0, 0, 0 };
- static const uint8_t *const obmc_masks[] = {
- obmc_mask_2, obmc_mask_4, obmc_mask_8, obmc_mask_16, obmc_mask_32
- };
- const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-
- if (t->by > t->ts->tiling.row_start && // top neighbours: skipped on the tile's first row and, for chroma, when the block is too small
- (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
- {
- for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) { // i counts neighbours actually used, capped at imin(b_dim[2], 4)
- // only odd blocks are considered for overlap handling, hence +1
- const refmvs *const a_r = &r[x - f->b4_stride + 1];
- const uint8_t *const a_b_dim =
- dav1d_block_dimensions[sbtype_to_bs[a_r->sb_type]];
-
- if (a_r->ref[0] > 0) { // neighbour must have ref[0] > 0; ref[0] - 1 then indexes f->refp
- mc(t, lap, NULL, 128 * sizeof(pixel),
- iclip(a_b_dim[0], 2, b_dim[0]), imin(b_dim[1], 16) >> 1, // width follows the neighbour, height is half the block's (capped at 16 units)
- t->bx + x, t->by, pl, a_r->mv[0],
- &f->refp[a_r->ref[0] - 1],
- dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
- f->dsp->mc.blend(&dst[x * h_mul], dst_stride,
- lap, 128 * sizeof(pixel),
- h_mul * iclip(a_b_dim[0], 2, b_dim[0]),
- v_mul * imin(b_dim[1], 16) >> 1,
- obmc_masks[imin(b_dim[3], 4) - ss_ver], 1); // last argument is 1 here and 0 for the left pass (presumably the blend direction)
- i++;
- }
- x += imax(a_b_dim[0], 2); // step by the neighbour's width, at least 2 units
- }
- }
-
- if (t->bx > t->ts->tiling.col_start) // left neighbours: skipped on the tile's first column
- for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
- // only odd blocks are considered for overlap handling, hence +1
- const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
- const uint8_t *const l_b_dim =
- dav1d_block_dimensions[sbtype_to_bs[l_r->sb_type]];
-
- if (l_r->ref[0] > 0) {
- mc(t, lap, NULL, 32 * sizeof(pixel), // the left pass uses a narrower lap stride (32) than the top pass (128)
- imin(b_dim[0], 16) >> 1,
- iclip(l_b_dim[1], 2, b_dim[1]),
- t->bx, t->by + y, pl, l_r->mv[0],
- &f->refp[l_r->ref[0] - 1],
- dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
- f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)], dst_stride,
- lap, 32 * sizeof(pixel),
- h_mul * imin(b_dim[0], 16) >> 1,
- v_mul * iclip(l_b_dim[1], 2, b_dim[1]),
- obmc_masks[imin(b_dim[2], 4) - ss_hor], 0);
- i++;
- }
- y += imax(l_b_dim[1], 2); // step by the neighbour's height, at least 2 units
- }
-}
-
-static void warp_affine(Dav1dTileContext *const t, // Warped motion prediction for one block of plane pl, done in 8x8 pixel units; writes to dst8 via warp8x8 or to dst16 via warp8x8t.
- pixel *dst8, coef *dst16, const ptrdiff_t dstride,
- const uint8_t *const b_dim, const int pl,
- const Dav1dThreadPicture *const refp,
- const WarpedMotionParams *const wmp)
-{
- assert((dst8 != NULL) ^ (dst16 != NULL)); // exactly one of the two destinations must be given
- const Dav1dFrameContext *const f = t->f;
- const Dav1dDSPContext *const dsp = f->dsp;
- const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
- assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7)); // block size in this plane must be a multiple of 8 pixels
- const int32_t *const mat = wmp->matrix;
- const int width = (f->cur.p.p.w + ss_hor) >> ss_hor;
- const int height = (f->cur.p.p.h + ss_ver) >> ss_ver;
-
- for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
- for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
- // calculate transformation relative to center of 8x8 block in
- // luma pixel units
- const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
- const int src_y = t->by * 4 + ((y + 4) << ss_ver);
- const int mvx = (mat[2] * src_x + mat[3] * src_y + mat[0]) >> ss_hor;
- const int mvy = (mat[4] * src_x + mat[5] * src_y + mat[1]) >> ss_ver;
-
- const int dx = (mvx >> 16) - 4; // integer part (upper bits) minus 4 gives the source position
- const int mx = ((mvx & 0xffff) - wmp->alpha * 4 - // fractional part (low 16 bits), offset by the warp parameters, low 6 bits cleared
- wmp->beta * 7) & ~0x3f;
- const int dy = (mvy >> 16) - 4;
- const int my = ((mvy & 0xffff) - wmp->gamma * 4 -
- wmp->delta * 4) & ~0x3f;
-
- const pixel *ref_ptr;
- ptrdiff_t ref_stride = refp->p.stride[!!pl];
-
- dav1d_thread_picture_wait(refp, dy + 4 + 8, // called before any read from refp; row argument is dy + 12
- PLANE_TYPE_Y + !!pl);
- if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) { // 8x8 block plus 3 pixels before and 4 after must fit inside the plane
- emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl],
- ref_stride, 15, 15, width, height, dx - 3, dy - 3); // 15 = 3 + 8 + 4
- ref_ptr = &t->emu_edge[160 * 3 + 3];
- ref_stride = 160 * sizeof(pixel);
- } else {
- ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
- }
- if (dst16 != NULL)
- dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
- wmp->abcd, mx, my);
- else
- dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
- wmp->abcd, mx, my);
- }
- if (dst8) dst8 += 8 * PXSTRIDE(dstride); // advance 8 rows: dstride goes through PXSTRIDE for dst8 but is used as an element count for dst16
- else dst16 += 8 * dstride;
- }
-}
-
-void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
- const enum EdgeFlags intra_edge_flags,
- const Av1Block *const b)
-{
- Dav1dTileState *const ts = t->ts;
- const Dav1dFrameContext *const f = t->f;
- const Dav1dDSPContext *const dsp = f->dsp;
- const int bx4 = t->bx & 31, by4 = t->by & 31;
- const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
- const uint8_t *const b_dim = dav1d_block_dimensions[bs];
- const int bw4 = b_dim[0], bh4 = b_dim[1];
- const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
- const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
- const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
- (bw4 > ss_hor || t->bx & 1) &&
- (bh4 > ss_ver || t->by & 1);
- const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
- const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
-
- // coefficient coding
- ALIGN_STK_32(pixel, edge_buf, 257,);
- pixel *const edge = edge_buf + 128;
- const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
-
- for (int init_y = 0; init_y < h4; init_y += 16) {
- for (int init_x = 0; init_x < w4; init_x += 16) {
- if (b->pal_sz[0]) {
- pixel *dst = ((pixel *) f->cur.p.data[0]) +
- 4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
- const uint8_t *pal_idx;
- if (f->frame_thread.pass) {
- pal_idx = ts->frame_thread.pal_idx;
- ts->frame_thread.pal_idx += bw4 * bh4 * 16;
- } else {
- pal_idx = t->scratch.pal_idx;
- }
- const uint16_t *const pal = f->frame_thread.pass ?
- f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
- ((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
- f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal,
- pal_idx, bw4 * 4, bh4 * 4);
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
- hex_dump(dst, PXSTRIDE(f->cur.p.stride[0]),
- bw4 * 4, bh4 * 4, "y-pal-pred");
- }
-
- const int sm_fl = sm_flag(t->a, bx4) | sm_flag(&t->l, by4);
- const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
- intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
- const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
- intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
- int y, x;
- const int sub_h4 = imin(h4, 16 + init_y);
- const int sub_w4 = imin(w4, init_x + 16);
- for (y = init_y, t->by += init_y; y < sub_h4;
- y += t_dim->h, t->by += t_dim->h)
- {
- pixel *dst = ((pixel *) f->cur.p.data[0]) +
- 4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) +
- t->bx + init_x);
- for (x = init_x, t->bx += init_x; x < sub_w4;
- x += t_dim->w, t->bx += t_dim->w)
- {
- if (b->pal_sz[0]) goto skip_y_pred;
-
- int angle = b->y_angle;
- const enum EdgeFlags edge_flags =
- (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
- 0 : EDGE_I444_TOP_HAS_RIGHT) |
- ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
- 0 : EDGE_I444_LEFT_HAS_BOTTOM);
- const pixel *top_sb_edge = NULL;
- if (!(t->by & (f->sb_step - 1))) {
- top_sb_edge = f->ipred_edge[0];
- const int sby = t->by >> f->sb_shift;
- top_sb_edge += f->sb128w * 128 * (sby - 1);
- }
- const enum IntraPredMode m =
- bytefn(dav1d_prepare_intra_edges)(t->bx,
- t->bx > ts->tiling.col_start,
- t->by,
- t->by > ts->tiling.row_start,
- ts->tiling.col_end,
- ts->tiling.row_end,
- edge_flags, dst,
- f->cur.p.stride[0], top_sb_edge,
- b->y_mode, &angle,
- t_dim->w, t_dim->h, edge);
- dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
- t_dim->w * 4, t_dim->h * 4,
- angle | sm_fl);
-
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
- hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
- t_dim->h * 4, 2, "l");
- hex_dump(edge, 0, 1, 1, "tl");
- hex_dump(edge + 1, t_dim->w * 4,
- t_dim->w * 4, 2, "t");
- hex_dump(dst, f->cur.p.stride[0],
- t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
- }
-
- skip_y_pred: {}
- if (!b->skip) {
- coef *cf;
- int eob;
- enum TxfmType txtp;
- if (f->frame_thread.pass) {
- cf = ts->frame_thread.cf;
- ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
- const struct CodedBlockInfo *const cbi =
- &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
- eob = cbi->eob[0];
- txtp = cbi->txtp[0];
- } else {
- uint8_t cf_ctx;
- cf = t->cf;
- eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
- &t->l.lcoef[by4 + y], b->tx, bs,
- b, 1, 0, cf, &txtp, &cf_ctx);
- if (DEBUG_BLOCK_INFO)
- printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
- b->tx, txtp, eob, ts->msac.rng);
- memset(&t->a->lcoef[bx4 + x], cf_ctx,
- imin(t_dim->w, f->bw - t->bx));
- memset(&t->l.lcoef[by4 + y], cf_ctx,
- imin(t_dim->h, f->bh - t->by));
- }
- if (eob >= 0) {
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
- coef_dump(cf, imin(t_dim->h, 8) * 4,
- imin(t_dim->w, 8) * 4, 3, "dq");
- dsp->itx.itxfm_add[b->tx]
- [txtp](dst,
- f->cur.p.stride[0],
- cf, eob);
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
- hex_dump(dst, f->cur.p.stride[0],
- t_dim->w * 4, t_dim->h * 4, "recon");
- }
- } else if (!f->frame_thread.pass) {
- memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
- memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
- }
- dst += 4 * t_dim->w;
- }
- t->bx -= x;
- }
- t->by -= y;
-
- if (!has_chroma) continue;
-
- const ptrdiff_t stride = f->cur.p.stride[1];
-
- if (b->uv_mode == CFL_PRED) {
- assert(!init_x && !init_y);
-
- int16_t *const ac = t->scratch.ac;
- pixel *y_src = ((pixel *) f->cur.p.data[0]) + 4 * (t->bx & ~ss_hor) +
- 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.p.stride[0]);
- const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
- (t->by >> ss_ver) * PXSTRIDE(stride));
- pixel *const uv_dst[2] = { ((pixel *) f->cur.p.data[1]) + uv_off,
- ((pixel *) f->cur.p.data[2]) + uv_off };
-
- const int furthest_r =
- ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
- const int furthest_b =
- ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
- dsp->ipred.cfl_ac[f->cur.p.p.layout - 1]
- [b->uvtx](ac, y_src, f->cur.p.stride[0],
- cbw4 - (furthest_r >> ss_hor),
- cbh4 - (furthest_b >> ss_ver));
- for (int pl = 0; pl < 2; pl++) {
- if (!b->cfl_alpha[pl]) continue;
- int angle = 0;
- const pixel *top_sb_edge = NULL;
- if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
- top_sb_edge = f->ipred_edge[pl + 1];
- const int sby = t->by >> f->sb_shift;
- top_sb_edge += f->sb128w * 128 * (sby - 1);
- }
- const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
- const int xstart = ts->tiling.col_start >> ss_hor;
- const int ystart = ts->tiling.row_start >> ss_ver;
- const enum IntraPredMode m =
- bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
- ypos, ypos > ystart,
- ts->tiling.col_end >> ss_hor,
- ts->tiling.row_end >> ss_ver,
- 0, uv_dst[pl], stride,
- top_sb_edge, DC_PRED, &angle,
- uv_t_dim->w,
- uv_t_dim->h, edge);
- dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
- uv_t_dim->w * 4,
- uv_t_dim->h * 4,
- ac, b->cfl_alpha[pl]);
- }
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
- ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
- hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
- hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
- }
- } else if (b->pal_sz[1]) {
- ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
- (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
- const uint8_t *pal_idx;
- if (f->frame_thread.pass) {
- pal_idx = ts->frame_thread.pal_idx;
- ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
- } else {
- pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
- }
- const uint16_t *const pal_u = f->frame_thread.pass ?
- f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
- ((t->bx >> 1) + (t->by & 1))][1] : t->pal[1];
- f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[1]) + uv_dstoff,
- f->cur.p.stride[1], pal_u,
- pal_idx, cbw4 * 4, cbh4 * 4);
- const uint16_t *const pal_v = f->frame_thread.pass ?
- f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
- ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
- f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[2]) + uv_dstoff,
- f->cur.p.stride[1], pal_v,
- pal_idx, cbw4 * 4, cbh4 * 4);
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
- hex_dump(((pixel *) f->cur.p.data[1]) + uv_dstoff,
- PXSTRIDE(f->cur.p.stride[1]),
- cbw4 * 4, cbh4 * 4, "u-pal-pred");
- hex_dump(((pixel *) f->cur.p.data[2]) + uv_dstoff,
- PXSTRIDE(f->cur.p.stride[1]),
- cbw4 * 4, cbh4 * 4, "v-pal-pred");
- }
- }
-
- const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
- sm_uv_flag(&t->l, cby4);
- const int uv_sb_has_tr =
- ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
- intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.p.layout - 1));
- const int uv_sb_has_bl =
- init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
- intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.p.layout - 1));
- const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
- const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
- for (int pl = 0; pl < 2; pl++) {
- for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
- y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
- {
- pixel *dst = ((pixel *) f->cur.p.data[1 + pl]) +
- 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
- ((t->bx + init_x) >> ss_hor));
- for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
- x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
- {
- if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
- b->pal_sz[1])
- {
- goto skip_uv_pred;
- }
-
- int angle = b->uv_angle;
- // this probably looks weird because we're using
- // luma flags in a chroma loop, but that's because
- // prepare_intra_edges() expects luma flags as input
- const enum EdgeFlags edge_flags =
- (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
- (x + uv_t_dim->w >= sub_cw4)) ?
- 0 : EDGE_I444_TOP_HAS_RIGHT) |
- ((x > (init_x >> ss_hor) ||
- (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
- 0 : EDGE_I444_LEFT_HAS_BOTTOM);
- const pixel *top_sb_edge = NULL;
- if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
- top_sb_edge = f->ipred_edge[1 + pl];
- const int sby = t->by >> f->sb_shift;
- top_sb_edge += f->sb128w * 128 * (sby - 1);
- }
- const enum IntraPredMode uv_mode =
- b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
- const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
- const int xstart = ts->tiling.col_start >> ss_hor;
- const int ystart = ts->tiling.row_start >> ss_ver;
- const enum IntraPredMode m =
- bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
- ypos, ypos > ystart,
- ts->tiling.col_end >> ss_hor,
- ts->tiling.row_end >> ss_ver,
- edge_flags, dst, stride,
- top_sb_edge, uv_mode,
- &angle, uv_t_dim->w,
- uv_t_dim->h, edge);
- dsp->ipred.intra_pred[m](dst, stride, edge,
- uv_t_dim->w * 4,
- uv_t_dim->h * 4,
- angle | sm_uv_fl);
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
- hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
- uv_t_dim->h * 4, 2, "l");
- hex_dump(edge, 0, 1, 1, "tl");
- hex_dump(edge + 1, uv_t_dim->w * 4,
- uv_t_dim->w * 4, 2, "t");
- hex_dump(dst, stride, uv_t_dim->w * 4,
- uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
- }
-
- skip_uv_pred: {}
- if (!b->skip) {
- enum TxfmType txtp;
- int eob;
- coef *cf;
- if (f->frame_thread.pass) {
- cf = ts->frame_thread.cf;
- ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
- const struct CodedBlockInfo *const cbi =
- &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
- eob = cbi->eob[pl + 1];
- txtp = cbi->txtp[pl + 1];
- } else {
- uint8_t cf_ctx;
- cf = t->cf;
- eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
- &t->l.ccoef[pl][cby4 + y],
- b->uvtx, bs, b, 1, 1 + pl, cf,
- &txtp, &cf_ctx);
- if (DEBUG_BLOCK_INFO)
- printf("Post-uv-cf-blk[pl=%d,tx=%d,"
- "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
- pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
- memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
- imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
- memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
- imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
- }
- if (eob >= 0) {
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
- coef_dump(cf, uv_t_dim->h * 4,
- uv_t_dim->w * 4, 3, "dq");
- dsp->itx.itxfm_add[b->uvtx]
- [txtp](dst, stride,
- cf, eob);
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
- hex_dump(dst, stride, uv_t_dim->w * 4,
- uv_t_dim->h * 4, "recon");
- }
- } else if (!f->frame_thread.pass) {
- memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
- memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
- }
- dst += uv_t_dim->w * 4;
- }
- t->bx -= x << ss_hor;
- }
- t->by -= y << ss_ver;
- }
- }
- }
-}
-
-void bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
- const Av1Block *const b)
-{
- Dav1dTileState *const ts = t->ts;
- const Dav1dFrameContext *const f = t->f;
- const Dav1dDSPContext *const dsp = f->dsp;
- const int bx4 = t->bx & 31, by4 = t->by & 31;
- const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
- const uint8_t *const b_dim = dav1d_block_dimensions[bs];
- const int bw4 = b_dim[0], bh4 = b_dim[1];
- const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
- const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
- (bw4 > ss_hor || t->bx & 1) &&
- (bh4 > ss_ver || t->by & 1);
- const int chr_layout_idx = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
- DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout;
-
- // prediction
- const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
- pixel *dst = ((pixel *) f->cur.p.data[0]) +
- 4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
- const ptrdiff_t uvdstoff =
- 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
- if (!(f->frame_hdr.frame_type & 1)) {
- // intrabc
- mc(t, dst, NULL, f->cur.p.stride[0],
- bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
- if (has_chroma) for (int pl = 1; pl < 3; pl++)
- mc(t, ((pixel *) f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
- bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
- t->bx & ~ss_hor, t->by & ~ss_ver,
- pl, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
- } else if (b->comp_type == COMP_INTER_NONE) {
- const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
- const enum Filter2d filter_2d = b->filter2d;
-
- if (imin(bw4, bh4) > 1 && !f->frame_hdr.force_integer_mv &&
- ((b->inter_mode == GLOBALMV &&
- f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
- (b->motion_mode == MM_WARP &&
- t->warpmv.type > WM_TYPE_TRANSLATION)))
- {
- warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
- b->motion_mode == MM_WARP ? &t->warpmv :
- &f->frame_hdr.gmv[b->ref[0]]);
- } else {
- mc(t, dst, NULL, f->cur.p.stride[0],
- bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, filter_2d);
- if (b->motion_mode == MM_OBMC)
- obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
- }
- if (b->interintra_type) {
- ALIGN_STK_32(pixel, tl_edge_buf, 65,);
- pixel *const tl_edge = tl_edge_buf + 32;
- enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
- SMOOTH_PRED : b->interintra_mode;
- pixel *const tmp = t->scratch.interintra;
- int angle = 0;
- const pixel *top_sb_edge = NULL;
- if (!(t->by & (f->sb_step - 1))) {
- top_sb_edge = f->ipred_edge[0];
- const int sby = t->by >> f->sb_shift;
- top_sb_edge += f->sb128w * 128 * (sby - 1);
- }
- m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
- t->by, t->by > ts->tiling.row_start,
- ts->tiling.col_end, ts->tiling.row_end,
- 0, dst, f->cur.p.stride[0], top_sb_edge,
- m, &angle, bw4, bh4, tl_edge);
- dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
- tl_edge, bw4 * 4, bh4 * 4, 0);
- const uint8_t *const ii_mask =
- b->interintra_type == INTER_INTRA_BLEND ?
- dav1d_ii_masks[bs][0][b->interintra_mode] :
- dav1d_wedge_masks[bs][0][0][b->wedge_idx];
- dsp->mc.blend(dst, f->cur.p.stride[0], tmp, bw4 * 4 * sizeof(pixel),
- bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
- }
-
- if (!has_chroma) goto skip_inter_chroma_pred;
-
- // sub8x8 derivation
- int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
- refmvs *r;
- if (is_sub8x8) {
- assert(ss_hor == 1);
- r = &f->mvs[t->by * f->b4_stride + t->bx];
- if (bw4 == 1) is_sub8x8 &= r[-1].ref[0] > 0;
- if (bh4 == ss_ver) is_sub8x8 &= r[-f->b4_stride].ref[0] > 0;
- if (bw4 == 1 && bh4 == ss_ver)
- is_sub8x8 &= r[-(1 + f->b4_stride)].ref[0] > 0;
- }
-
- // chroma prediction
- if (is_sub8x8) {
- assert(ss_hor == 1);
- int h_off = 0, v_off = 0;
- if (bw4 == 1 && bh4 == ss_ver) {
- for (int pl = 0; pl < 2; pl++)
- mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
- NULL, f->cur.p.stride[1],
- bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
- r[-(f->b4_stride + 1)].mv[0],
- &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
- f->frame_thread.pass != 2 ? t->tl_4x4_filter :
- f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
- v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
- h_off = 2;
- }
- if (bw4 == 1) {
- const enum Filter2d left_filter_2d =
- dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
- for (int pl = 0; pl < 2; pl++)
- mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
- f->cur.p.stride[1], bw4, bh4, t->bx - 1,
- t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
- f->frame_thread.pass != 2 ? left_filter_2d :
- f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
- h_off = 2;
- }
- if (bh4 == ss_ver) {
- const enum Filter2d top_filter_2d =
- dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
- for (int pl = 0; pl < 2; pl++)
- mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
- f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
- 1 + pl, r[-f->b4_stride].mv[0],
- &f->refp[r[-f->b4_stride].ref[0] - 1],
- f->frame_thread.pass != 2 ? top_filter_2d :
- f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
- v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
- }
- for (int pl = 0; pl < 2; pl++)
- mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
- bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], refp, filter_2d);
- } else {
- if (imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
- ((b->inter_mode == GLOBALMV &&
- f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
- (b->motion_mode == MM_WARP &&
- t->warpmv.type > WM_TYPE_TRANSLATION)))
- {
- for (int pl = 0; pl < 2; pl++)
- warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
- f->cur.p.stride[1], b_dim, 1 + pl, refp,
- b->motion_mode == MM_WARP ? &t->warpmv :
- &f->frame_hdr.gmv[b->ref[0]]);
- } else {
- for (int pl = 0; pl < 2; pl++) {
- mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
- NULL, f->cur.p.stride[1],
- bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
- t->bx & ~ss_hor, t->by & ~ss_ver,
- 1 + pl, b->mv[0], refp, filter_2d);
- if (b->motion_mode == MM_OBMC)
- obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
- f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
- }
- }
- if (b->interintra_type) {
- // FIXME for 8x32 with 4:2:2 subsampling, this probably does
- // the wrong thing since it will select 4x16, not 4x32, as a
- // transform size...
- const uint8_t *const ii_mask =
- b->interintra_type == INTER_INTRA_BLEND ?
- dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
- dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
-
- for (int pl = 0; pl < 2; pl++) {
- pixel *const tmp = t->scratch.interintra;
- pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
- enum IntraPredMode m =
- b->interintra_mode == II_SMOOTH_PRED ?
- SMOOTH_PRED : b->interintra_mode;
- int angle = 0;
- pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
- const pixel *top_sb_edge = NULL;
- if (!(t->by & (f->sb_step - 1))) {
- top_sb_edge = f->ipred_edge[pl + 1];
- const int sby = t->by >> f->sb_shift;
- top_sb_edge += f->sb128w * 128 * (sby - 1);
- }
- m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
- (t->bx >> ss_hor) >
- (ts->tiling.col_start >> ss_hor),
- t->by >> ss_ver,
- (t->by >> ss_ver) >
- (ts->tiling.row_start >> ss_ver),
- ts->tiling.col_end >> ss_hor,
- ts->tiling.row_end >> ss_ver,
- 0, uvdst, f->cur.p.stride[1],
- top_sb_edge, m,
- &angle, cbw4, cbh4, tl_edge);
- dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
- tl_edge, cbw4 * 4, cbh4 * 4, 0);
- dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp, cbw4 * 4 * sizeof(pixel),
- cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
- }
- }
- }
-
- skip_inter_chroma_pred: {}
- t->tl_4x4_filter = filter_2d;
- } else {
- const enum Filter2d filter_2d = b->filter2d;
- // Maximum super block size is 128x128
- coef (*tmp)[128 * 128] = (coef (*)[128 * 128]) t->scratch.compinter;
- int jnt_weight;
- uint8_t *const seg_mask = t->scratch_seg_mask;
- const uint8_t *mask;
-
- for (int i = 0; i < 2; i++) {
- const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
-
- if (b->inter_mode == GLOBALMV_GLOBALMV && !f->frame_hdr.force_integer_mv &&
- f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
- {
- warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
- &f->frame_hdr.gmv[b->ref[i]]);
- } else {
- mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
- b->mv[i], refp, filter_2d);
- }
- }
- switch (b->comp_type) {
- case COMP_INTER_AVG:
- dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
- bw4 * 4, bh4 * 4);
- break;
- case COMP_INTER_WEIGHTED_AVG:
- jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
- dsp->mc.w_avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
- bw4 * 4, bh4 * 4, jnt_weight);
- break;
- case COMP_INTER_SEG:
- dsp->mc.w_mask[chr_layout_idx](dst, f->cur.p.stride[0],
- tmp[b->mask_sign], tmp[!b->mask_sign],
- bw4 * 4, bh4 * 4, seg_mask, b->mask_sign);
- mask = seg_mask;
- break;
- case COMP_INTER_WEDGE:
- mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
- dsp->mc.mask(dst, f->cur.p.stride[0],
- tmp[b->mask_sign], tmp[!b->mask_sign],
- bw4 * 4, bh4 * 4, mask);
- if (has_chroma)
- mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
- break;
- }
-
- // chroma
- if (has_chroma) for (int pl = 0; pl < 2; pl++) {
- for (int i = 0; i < 2; i++) {
- const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
- if (b->inter_mode == GLOBALMV_GLOBALMV &&
- imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
- f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
- {
- warp_affine(t, NULL, tmp[i], bw4 * 2, b_dim, 1 + pl,
- refp, &f->frame_hdr.gmv[b->ref[i]]);
- } else {
- mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
- 1 + pl, b->mv[i], refp, filter_2d);
- }
- }
- pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
- switch (b->comp_type) {
- case COMP_INTER_AVG:
- dsp->mc.avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
- bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver);
- break;
- case COMP_INTER_WEIGHTED_AVG:
- dsp->mc.w_avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
- bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight);
- break;
- case COMP_INTER_WEDGE:
- case COMP_INTER_SEG:
- dsp->mc.mask(uvdst, f->cur.p.stride[1],
- tmp[b->mask_sign], tmp[!b->mask_sign],
- bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask);
- break;
- }
- }
- }
-
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
- hex_dump(dst, f->cur.p.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
- if (has_chroma) {
- hex_dump(&((pixel *) f->cur.p.data[1])[uvdstoff], f->cur.p.stride[1],
- cbw4 * 4, cbh4 * 4, "u-pred");
- hex_dump(&((pixel *) f->cur.p.data[2])[uvdstoff], f->cur.p.stride[1],
- cbw4 * 4, cbh4 * 4, "v-pred");
- }
- }
-
- const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-
- if (b->skip) {
- // reset coef contexts
- memset(&t->a->lcoef[bx4], 0x40, w4);
- memset(&t->l.lcoef[by4], 0x40, h4);
- if (has_chroma) {
- memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
- memset(&t->l.ccoef[0][cby4], 0x40, ch4);
- memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
- memset(&t->l.ccoef[1][cby4], 0x40, ch4);
- }
- return;
- }
-
- const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
- const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
-
- for (int init_y = 0; init_y < bh4; init_y += 16) {
- for (int init_x = 0; init_x < bw4; init_x += 16) {
- // coefficient coding & inverse transforms
- int y_off = !!init_y, y;
- dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * init_y;
- for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
- y += ytx->h, y_off++)
- {
- int x, x_off = !!init_x;
- for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
- x += ytx->w, x_off++)
- {
- read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
- x_off, y_off, &dst[x * 4]);
- t->bx += ytx->w;
- }
- dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * ytx->h;
- t->bx -= x;
- t->by += ytx->h;
- }
- dst -= PXSTRIDE(f->cur.p.stride[0]) * 4 * y;
- t->by -= y;
-
- // chroma coefs and inverse transform
- if (has_chroma) for (int pl = 0; pl < 2; pl++) {
- pixel *uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff +
- (PXSTRIDE(f->cur.p.stride[1]) * init_y * 4 >> ss_ver);
- for (y = init_y >> ss_ver, t->by += init_y;
- y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
- {
- int x;
- for (x = init_x >> ss_hor, t->bx += init_x;
- x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
- {
- coef *cf;
- int eob;
- enum TxfmType txtp;
- if (f->frame_thread.pass) {
- cf = ts->frame_thread.cf;
- ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
- const struct CodedBlockInfo *const cbi =
- &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
- eob = cbi->eob[1 + pl];
- txtp = cbi->txtp[1 + pl];
- } else {
- uint8_t cf_ctx;
- cf = t->cf;
- txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
- bx4 + (x << ss_hor)];
- eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
- &t->l.ccoef[pl][cby4 + y],
- b->uvtx, bs, b, 0, 1 + pl,
- cf, &txtp, &cf_ctx);
- if (DEBUG_BLOCK_INFO)
- printf("Post-uv-cf-blk[pl=%d,tx=%d,"
- "txtp=%d,eob=%d]: r=%d\n",
- pl, b->uvtx, txtp, eob, ts->msac.rng);
- memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
- imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
- memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
- imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
- }
- if (eob >= 0) {
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
- coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
- dsp->itx.itxfm_add[b->uvtx]
- [txtp](&uvdst[4 * x],
- f->cur.p.stride[1],
- cf, eob);
- if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
- hex_dump(&uvdst[4 * x], f->cur.p.stride[1],
- uvtx->w * 4, uvtx->h * 4, "recon");
- }
- t->bx += uvtx->w << ss_hor;
- }
- uvdst += PXSTRIDE(f->cur.p.stride[1]) * 4 * uvtx->h;
- t->bx -= x << ss_hor;
- t->by += uvtx->h << ss_ver;
- }
- t->by -= y << ss_ver;
- }
- }
- }
-}
-
-void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
- const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int sbsz = f->sb_step, sbh = f->sbh;
-
- if (f->frame_hdr.loopfilter.level_y[0] ||
- f->frame_hdr.loopfilter.level_y[1])
- {
- int start_of_tile_row = 0;
- if (f->frame_hdr.tiling.row_start_sb[f->lf.tile_row] == sby)
- start_of_tile_row = f->lf.tile_row++;
- bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
- start_of_tile_row);
- }
-
- if (f->seq_hdr.restoration) {
- // Store loop filtered pixels required by loop restoration
- bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
- }
- if (f->seq_hdr.cdef) {
- if (sby) {
- pixel *p_up[3] = {
- f->lf.p[0] - 8 * PXSTRIDE(f->cur.p.stride[0]),
- f->lf.p[1] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
- f->lf.p[2] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
- };
- bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
- sby * sbsz - 2, sby * sbsz);
- }
- const int n_blks = sbsz - 2 * (sby + 1 < sbh);
- bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
- imin(sby * sbsz + n_blks, f->bh));
- }
- if (f->seq_hdr.restoration) {
- bytefn(dav1d_lr_sbrow)(f, f->lf.p, sby);
- }
-
- f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[0]);
- f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
- f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
- f->lf.prev_mask_ptr = f->lf.mask_ptr;
- if ((sby & 1) || f->seq_hdr.sb128) {
- f->lf.mask_ptr += f->sb128w;
- }
-}
-
-void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
- const Dav1dFrameContext *const f = t->f;
- Dav1dTileState *const ts = t->ts;
- const int sby = t->by >> f->sb_shift;
- const int sby_off = f->sb128w * 128 * sby;
- const int x_off = ts->tiling.col_start;
-
- const pixel *const y =
- ((const pixel *) f->cur.p.data[0]) + x_off * 4 +
- ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.p.stride[0]);
- pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
- 4 * (ts->tiling.col_end - x_off));
-
- if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
- const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-
- const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
- (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.p.stride[1]);
- for (int pl = 1; pl <= 2; pl++)
- pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
- &((const pixel *) f->cur.p.data[pl])[uv_off],
- 4 * (ts->tiling.col_end - x_off) >> ss_hor);
- }
-}
--- /dev/null
+++ b/src/recon_tmpl.c
@@ -1,0 +1,1518 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+#include <stdio.h>
+
+#include "common/attributes.h"
+#include "common/bitdepth.h"
+#include "common/dump.h"
+#include "common/intops.h"
+#include "common/mem.h"
+
+#include "src/cdef_apply.h"
+#include "src/ipred_prepare.h"
+#include "src/lf_apply.h"
+#include "src/lr_apply.h"
+#include "src/recon.h"
+#include "src/scan.h"
+#include "src/tables.h"
+#include "src/wedge.h"
+
+static unsigned read_golomb(MsacContext *const msac) { // decodes one value: a run of zero bools (the length), then that many payload bools
+ int len = 0; // number of zero bools seen before the first non-zero one
+ unsigned val = 1; // starts at 1 so the shifts below build (1 << len) | payload
+ // NOTE(review): 128 << 7 is presumably the "equal probability" argument of msac_decode_bool() - confirm in msac.h
+ while (!msac_decode_bool(msac, 128 << 7) && len < 32) len++; // the bool is read before the len < 32 test; len never exceeds 32
+ while (len--) val = (val << 1) | msac_decode_bool(msac, 128 << 7); // the first payload bool read ends up most significant
+ // val is unsigned, so the shifts wrap instead of overflowing when len is 32
+ return val - 1;
+}
+
+static int decode_coefs(Dav1dTileContext *const t, // tile context; t->ts (entropy decoder, CDFs, dq tables) and t->f are used
+ uint8_t *const a, uint8_t *const l, // context bytes forwarded to get_coef_skip_ctx()/get_dc_sign_ctx(); read_coef_tree passes &t->a->lcoef[..] and &t->l.lcoef[..]
+ const enum RectTxfmSize tx, const enum BlockSize bs, // tx indexes dav1d_txfm_dimensions/dav1d_scans; bs is only forwarded to get_coef_skip_ctx()
+ const Av1Block *const b, const int intra, // b supplies seg_id and the prediction modes; intra selects the intra tx-type derivation
+ const int plane, coef *cf, // plane 0 is luma, anything else chroma; cf[scan[i]] receives the dequantized coefficients
+ enum TxfmType *const txtp, uint8_t *res_ctx) // *txtp: read for inter chroma, always written; *res_ctx: out, context byte for the caller
+{ // Decodes one transform block's coefficients from ts->msac; returns the eob scan index, or -1 when the block is coded as all-zero.
+ Dav1dTileState *const ts = t->ts;
+ const int chroma = !!plane;
+ const Dav1dFrameContext *const f = t->f;
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
+ const int dbg = DEBUG_BLOCK_INFO && plane && 0; // the trailing "&& 0" keeps every dbg printf below disabled
+
+ if (dbg) printf("Start: r=%d\n", ts->msac.rng);
+
+ // does this block have any non-zero coefficients (first symbol; context from get_coef_skip_ctx)
+ const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.p.layout);
+ const int all_skip =
+ msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]);
+ if (dbg)
+ printf("Post-non-zero[%d][%d][%d]: r=%d\n",
+ t_dim->ctx, sctx, all_skip, ts->msac.rng);
+ if (all_skip) {
+ *res_ctx = 0x40; // same value the normal path yields for cul_level 0 and dc_sign 1 (1 << 6)
+ *txtp = f->frame_hdr.segmentation.lossless[b->seg_id] ? WHT_WHT :
+ DCT_DCT;
+ return -1; // read_coef_tree only runs the inverse transform when the result is >= 0
+ }
+
+ // transform type (chroma: derived, luma: explicitly coded)
+ if (chroma) {
+ if (intra) {
+ *txtp = get_uv_intra_txtp(b->uv_mode, tx, &f->frame_hdr, b->seg_id);
+ } else {
+ const enum TxfmType y_txtp = *txtp; // caller-supplied value on entry, treated as the luma tx type
+ *txtp = get_uv_inter_txtp(t_dim, y_txtp, &f->frame_hdr, b->seg_id);
+ }
+ } else {
+ const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
+ &f->frame_hdr, b->seg_id);
+ const unsigned set_cnt = dav1d_tx_type_count[set];
+ unsigned idx;
+ if (set_cnt == 1) {
+ idx = 0; // single candidate: nothing is read from the bitstream
+ } else {
+ const int set_idx = dav1d_tx_type_set_index[!intra][set];
+ const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ? // FILTER_PRED is mapped through dav1d_filter_mode_to_y_mode
+ dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
+ uint16_t *const txtp_cdf = intra ?
+ ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
+ ts->cdf.m.txtp_inter[set_idx][t_dim->min];
+ idx = msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
+ if (dbg)
+ printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
+ set, set_idx, tx, t_dim->min, b->intra ? (int)y_mode_nofilt : -1,
+ idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
+ }
+ *txtp = dav1d_tx_types_per_set[set][idx];
+ }
+
+ // find end-of-block (eob): a size-class symbol (eob_bin), then the offset bits within that class
+ int eob_bin;
+ const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
+ const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
+ const int is_1d = tx_class != TX_CLASS_2D;
+ switch (tx2dszctx) { // no default case: eob_bin is only assigned when tx2dszctx is in 0..6
+#define case_sz(sz, bin) \
+ case sz: { \
+ uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
+ eob_bin = msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
+ break; \
+ }
+ case_sz(0, 16);
+ case_sz(1, 32);
+ case_sz(2, 64);
+ case_sz(3, 128);
+ case_sz(4, 256);
+ case_sz(5, 512);
+ case_sz(6, 1024);
+#undef case_sz
+ }
+ if (dbg)
+ printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
+ 16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
+ int eob;
+ if (eob_bin > 1) {
+ eob = 1 << (eob_bin - 1); // smallest eob in this bin; the bits read below are OR-ed in
+ uint16_t *const eob_hi_bit_cdf =
+ ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
+ const int eob_hi_bit = msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
+ if (dbg)
+ printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
+ t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
+ unsigned mask = eob >> 1; // bit just below the leading one; only this bit uses an adaptive CDF
+ if (eob_hi_bit) eob |= mask;
+ for (mask >>= 1; mask; mask >>= 1) { // remaining bits, high to low, via msac_decode_bool(.., 128 << 7)
+ const int eob_bit = msac_decode_bool(&ts->msac, 128 << 7);
+ if (eob_bit) eob |= mask;
+ }
+ if (dbg)
+ printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
+ } else {
+ eob = eob_bin; // bins 0 and 1 are the eob value itself
+ }
+
+ // base tokens: walk the scan backwards from eob, decoding magnitudes that stop at 15
+ uint16_t (*const br_cdf)[5] =
+ ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
+ const int16_t *const scan = dav1d_scans[tx][tx_class];
+ uint8_t levels[36 * 36]; // magnitudes decoded so far; input to get_coef_nz_ctx()/get_br_ctx()
+ ptrdiff_t stride = 4 * (imin(t_dim->h, 8) + 1);
+ memset(levels, 0, stride * 4 * (imin(t_dim->w, 8) + 1)); // clears stride * 4 * (min(w, 8) + 1) bytes, not the whole array
+ const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * imin(t_dim->h, 8) - 1;
+ unsigned cul_level = 0; // running sum of magnitudes; clamped to 63 when *res_ctx is built
+ for (int i = eob, is_last = 1; i >= 0; i--, is_last = 0) {
+ const int rc = scan[i], x = rc >> shift, y = rc & mask; // rc split into the two indices that address levels[]
+
+ // lo tok
+ const int ctx = get_coef_nz_ctx(levels, i, rc, is_last, tx, tx_class);
+ uint16_t *const lo_cdf = is_last ?
+ ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
+ ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
+ int tok = msac_decode_symbol_adapt(&ts->msac, lo_cdf,
+ 4 - is_last) + is_last; // at i == eob: one symbol fewer, result offset by 1
+ if (dbg)
+ printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
+ t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
+ if (!tok) continue; // zero token: neither cf[rc] nor levels[] is written
+
+ // hi tok: a base token of 3 is extended until a symbol below 3 is read or tok reaches 15
+ if (tok == 3) {
+ const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
+ do {
+ const int tok_br =
+ msac_decode_symbol_adapt(&ts->msac, br_cdf[br_ctx], 4);
+ if (dbg)
+ printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
+ imin(t_dim->ctx, 3), chroma, br_ctx,
+ i, rc, tok_br, tok, ts->msac.rng);
+ tok += tok_br;
+ if (tok_br < 3) break;
+ } while (tok < 15);
+ }
+
+ levels[x * stride + y] = cf[rc] = tok; // cf[] holds the raw magnitude until the pass below rewrites it
+ }
+
+ // residual and sign: second pass, forward over the scan, turning magnitudes into signed dequantized values
+ int dc_sign = 1; // stays 1 unless scan position 0 is non-zero; then 0 or 2 (set below)
+ const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
+ const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
+ const int dq_shift = imax(0, t_dim->ctx - 2);
+ for (int i = 0; i <= eob; i++) {
+ const int rc = scan[i];
+ int tok = cf[rc]; // NOTE(review): positions skipped by the token pass are read here, so cf is presumably zeroed by the caller - confirm
+ if (!tok) continue;
+ int dq;
+
+ // sign
+ int sign;
+ if (i == 0) { // first scan position: adaptively coded sign, and dq_tbl[0] instead of dq_tbl[1]
+ const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
+ uint16_t *const dc_sign_cdf =
+ ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
+ sign = msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
+ if (dbg)
+ printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
+ chroma, dc_sign_ctx, sign, ts->msac.rng);
+ dc_sign = sign ? 0 : 2;
+ dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
+ } else {
+ sign = msac_decode_bool(&ts->msac, 128 << 7);
+ if (dbg)
+ printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
+ dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
+ }
+
+ // residual: a magnitude of exactly 15 is followed by a read_golomb() extension
+ if (tok == 15) {
+ tok += read_golomb(&ts->msac);
+ if (dbg)
+ printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
+ i, rc, tok - 15, tok, ts->msac.rng);
+ }
+
+ // dequant
+ cul_level += tok; // accumulated before the magnitude is scaled
+ tok *= dq;
+ tok >>= dq_shift;
+ cf[rc] = sign ? -tok : tok;
+ }
+
+ // context: low 6 bits = min(cul_level, 63), bits 6 and up = dc_sign
+ *res_ctx = imin(cul_level, 63) | (dc_sign << 6);
+
+ return eob;
+}
+
+static void read_coef_tree(Dav1dTileContext *const t, // tile context; t->bx/t->by are moved while recursing and restored before returning
+ const enum BlockSize bs, const Av1Block *const b, // both forwarded unchanged to decode_coefs()
+ const enum RectTxfmSize ytx, const int depth, // transform size at this node; no further split is tested once depth reaches 2
+ const uint16_t *const tx_split, // tx_split[depth] holds this node's split flag at bit y_off * 4 + x_off
+ const int x_off, const int y_off, pixel *dst) // dst: luma destination of this node, or NULL (dav1d_read_coef_blocks passes NULL)
+{ // Walks the luma transform split tree; each leaf decodes and/or reconstructs one transform block depending on f->frame_thread.pass.
+ const Dav1dFrameContext *const f = t->f;
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
+ const int txw = t_dim->w, txh = t_dim->h;
+ // split node: recurse into the sub-size t_dim->sub; children at or beyond f->bw / f->bh are not visited
+ if (depth < 2 && tx_split[depth] & (1 << (y_off * 4 + x_off))) {
+ const enum RectTxfmSize sub = t_dim->sub;
+ const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
+ const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
+ // first child is always visited
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+ x_off * 2 + 0, y_off * 2 + 0, dst);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw) // right child exists only when the parent is at least as wide as tall
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+ y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
+ t->bx -= txsw;
+ t->by += txsh;
+ if (txh >= txw && t->by < f->bh) { // lower row exists only when the parent is at least as tall as wide
+ if (dst) // dst stays NULL when the caller passed NULL
+ dst += 4 * txsh * PXSTRIDE(f->cur.p.stride[0]);
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
+ x_off * 2 + 0, y_off * 2 + 1, dst);
+ t->bx += txsw;
+ if (txw >= txh && t->bx < f->bw)
+ read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
+ y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
+ t->bx -= txsw;
+ }
+ t->by -= txsh; // t->bx and t->by are back at their entry values here
+ } else {
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ enum TxfmType txtp;
+ uint8_t cf_ctx;
+ int eob;
+ coef *cf;
+ struct CodedBlockInfo *cbi; // only assigned when pass != 0; every use below is on a pass == 1 or pass == 2 path
+ // non-zero pass: take coefficient storage from ts->frame_thread.cf and advance it; pass 0 uses t->cf
+ if (f->frame_thread.pass) {
+ cf = ts->frame_thread.cf;
+ ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+ cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+ } else {
+ cf = t->cf;
+ }
+ if (f->frame_thread.pass != 2) { // pass != 2: parse the coefficients from the bitstream
+ eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
+ ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+ ytx, txtp, eob, ts->msac.rng);
+ memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx)); // length clipped to what remains before f->bw
+ memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by)); // length clipped to what remains before f->bh
+ for (int y = 0; y < txh; y++) // record txtp for the whole txw x txh area of the 32-wide t->txtp_map
+ memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
+ if (f->frame_thread.pass == 1) { // stored in cbi; the pass == 2 branch below reads the same fields
+ cbi->eob[0] = eob;
+ cbi->txtp[0] = txtp;
+ }
+ } else {
+ eob = cbi->eob[0];
+ txtp = cbi->txtp[0];
+ }
+ if (!(f->frame_thread.pass & 1)) { // even pass values reconstruct pixels; an odd pass stops after parsing
+ assert(dst);
+ if (eob >= 0) { // decode_coefs() returns -1 for an all-zero block: nothing to add
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
+ dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.p.stride[0], cf, eob);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, f->cur.p.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
+ }
+ }
+ }
+}
+
+/*
+ * Parse (but do not reconstruct) the luma and chroma coefficients of block
+ * `b` of size `bs` at the tile context's current position (t->bx, t->by).
+ *
+ * For skipped blocks only the above/left coefficient contexts are reset to
+ * 0x40 and the function returns. Otherwise (asserted to be frame-threading
+ * pass 1) every transform block is decoded with decode_coefs() into
+ * ts->frame_thread.cf, its eob/txtp are stored in f->frame_thread.cbi, and
+ * the above/left coefficient contexts are updated with the returned cf_ctx.
+ *
+ * t->bx / t->by are temporarily advanced while iterating and are restored to
+ * their entry values before returning.
+ */
+void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
+                                    const enum BlockSize bs, const Av1Block *const b)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int bx4 = t->bx & 31, by4 = t->by & 31;
+    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+    const int bw4 = b_dim[0], bh4 = b_dim[1];
+    // Chroma block size: round up only when the direction is actually
+    // subsampled. Using "+ 1" unconditionally would yield bw4 + 1 / bh4 + 1
+    // for non-subsampled directions and make the memsets below write one
+    // context entry too many.
+    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                           (bw4 > ss_hor || t->bx & 1) &&
+                           (bh4 > ss_ver || t->by & 1);
+
+    if (b->skip) {
+        // nothing is coded: just reset the coefficient contexts
+        memset(&t->a->lcoef[bx4], 0x40, bw4);
+        memset(&t->l.lcoef[by4], 0x40, bh4);
+        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+            memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
+            memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
+        }
+        return;
+    }
+
+    Dav1dTileState *const ts = t->ts;
+    // block size clipped to the frame boundary
+    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+    assert(f->frame_thread.pass == 1);
+    assert(!b->skip);
+    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
+
+    // the block is processed in sub-regions of at most 16x16 4-pixel units
+    for (int init_y = 0; init_y < h4; init_y += 16) {
+        for (int init_x = 0; init_x < w4; init_x += 16) {
+            const int sub_h4 = imin(h4, 16 + init_y);
+            const int sub_w4 = imin(w4, init_x + 16);
+            int y_off = !!init_y, y, x;
+
+            // luma
+            for (y = init_y, t->by += init_y; y < sub_h4;
+                 y += t_dim->h, t->by += t_dim->h, y_off++)
+            {
+                struct CodedBlockInfo *const cbi =
+                    &f->frame_thread.cbi[t->by * f->b4_stride];
+                int x_off = !!init_x;
+                for (x = init_x, t->bx += init_x; x < sub_w4;
+                     x += t_dim->w, t->bx += t_dim->w, x_off++)
+                {
+                    if (!b->intra) {
+                        // inter blocks go through the transform split tree;
+                        // dst is NULL since nothing is reconstructed here
+                        read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
+                                       x_off, y_off, NULL);
+                    } else {
+                        uint8_t cf_ctx = 0x40;
+                        enum TxfmType txtp;
+                        const int eob = cbi[t->bx].eob[0] =
+                            decode_coefs(t, &t->a->lcoef[bx4 + x],
+                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
+                                         0, ts->frame_thread.cf, &txtp, &cf_ctx);
+                        if (DEBUG_BLOCK_INFO)
+                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+                                   b->tx, txtp, eob, ts->msac.rng);
+                        cbi[t->bx].txtp[0] = txtp;
+                        // advance the coefficient cursor past this block
+                        ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+                        memset(&t->a->lcoef[bx4 + x], cf_ctx,
+                               imin(t_dim->w, f->bw - t->bx));
+                        memset(&t->l.lcoef[by4 + y], cf_ctx,
+                               imin(t_dim->h, f->bh - t->by));
+                    }
+                }
+                t->bx -= x; // undo the horizontal walk
+            }
+            t->by -= y; // undo the vertical walk
+
+            if (!has_chroma) continue;
+
+            // chroma
+            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+            for (int pl = 0; pl < 2; pl++) {
+                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+                {
+                    struct CodedBlockInfo *const cbi =
+                        &f->frame_thread.cbi[t->by * f->b4_stride];
+                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+                    {
+                        uint8_t cf_ctx = 0x40;
+                        enum TxfmType txtp;
+                        // for inter blocks, seed txtp from the luma
+                        // transform type recorded at the co-located position
+                        if (!b->intra)
+                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+                                               bx4 + (x << ss_hor)];
+                        const int eob = cbi[t->bx].eob[1 + pl] =
+                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
+                                         b, b->intra, 1 + pl, ts->frame_thread.cf,
+                                         &txtp, &cf_ctx);
+                        if (DEBUG_BLOCK_INFO)
+                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+                                   "txtp=%d,eob=%d]: r=%d\n",
+                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
+                        cbi[t->bx].txtp[1 + pl] = txtp;
+                        ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
+                        memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
+                               imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
+                        memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
+                               imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+                    }
+                    t->bx -= x << ss_hor;
+                }
+                t->by -= y << ss_ver;
+            }
+        }
+    }
+}
+
+/*
+ * Build a bw x bh block in `dst` from the iw x ih reference plane `ref`,
+ * where the block's top-left corner sits at (x, y) in the reference and may
+ * lie partly outside of it. The part inside the reference is copied; the
+ * remainder is filled by replicating the nearest copied column/row.
+ */
+static void emu_edge(pixel *dst, const ptrdiff_t dst_stride,
+                     const pixel *ref, const ptrdiff_t ref_stride,
+                     const int bw, const int bh,
+                     const int iw, const int ih,
+                     const int x, const int y)
+{
+    const ptrdiff_t dst_px_stride = PXSTRIDE(dst_stride);
+    const ptrdiff_t ref_px_stride = PXSTRIDE(ref_stride);
+
+    // first reference pixel that is actually read (position clamped
+    // into the reference plane)
+    const pixel *src_row =
+        ref + (iclip(y, 0, ih - 1) * ref_px_stride + iclip(x, 0, iw - 1));
+
+    // how many columns/rows have to be synthesized on each side
+    const int n_left = iclip(-x, 0, bw - 1);
+    const int n_right = iclip(x + bw - iw, 0, bw - 1);
+    assert(n_left + n_right < bw);
+    const int n_top = iclip(-y, 0, bh - 1);
+    const int n_bottom = iclip(y + bh - ih, 0, bh - 1);
+    assert(n_top + n_bottom < bh);
+
+    const int mid_w = bw - n_left - n_right;
+    const int mid_h = bh - n_top - n_bottom;
+
+    // rows that overlap the reference: copy the middle part, then pad
+    // both sides of the row with its outermost copied pixels
+    pixel *const first_mid_row = dst + n_top * dst_px_stride;
+    pixel *row = first_mid_row;
+    for (int i = 0; i < mid_h; i++) {
+        pixel_copy(row + n_left, src_row, mid_w);
+        if (n_left)
+            pixel_set(row, row[n_left], n_left);
+        if (n_right)
+            pixel_set(row + n_left + mid_w, row[n_left + mid_w - 1],
+                      n_right);
+        src_row += ref_px_stride;
+        row += dst_px_stride;
+    }
+
+    // rows above the reference: replicate the first complete row
+    for (int i = 0; i < n_top; i++) {
+        pixel_copy(dst, first_mid_row, bw);
+        dst += dst_px_stride;
+    }
+
+    // rows below the reference: each one repeats the row just above it
+    dst += mid_h * dst_px_stride;
+    for (int i = 0; i < n_bottom; i++) {
+        pixel_copy(dst, &dst[-dst_px_stride], bw);
+        dst += dst_px_stride;
+    }
+}
+
+/*
+ * Motion-compensate one block of plane `pl` from reference picture `refp`.
+ *
+ * Exactly one of dst8 (pixel output, via dsp->mc.mc) and dst16 (intermediate
+ * coefficient output, via dsp->mc.mct) must be non-NULL. bw4/bh4 and bx/by
+ * are given in units of 4 luma pixels. When the filter footprint reaches
+ * outside of the reference, a padded copy is built in t->emu_edge first.
+ */
+static void mc(Dav1dTileContext *const t,
+               pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,
+               const int bw4, const int bh4,
+               const int bx, const int by, const int pl,
+               const mv mv, const Dav1dThreadPicture *const refp,
+               const enum Filter2d filter_2d)
+{
+    assert((dst8 != NULL) ^ (dst16 != NULL));
+    const Dav1dFrameContext *const f = t->f;
+    const int is_chroma = !!pl;
+    const int ss_ver = is_chroma && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = is_chroma && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+
+    // block size in pixels of this plane
+    const int blk_w = bw4 * h_mul;
+    const int blk_h = bh4 * v_mul;
+
+    // split the motion vector into a fractional and an integer part
+    const int mvx = mv.x, mvy = mv.y;
+    const int frac_x = mvx & (15 >> !ss_hor);
+    const int frac_y = mvy & (15 >> !ss_ver);
+    const int has_frac_x = !!frac_x, has_frac_y = !!frac_y;
+    const int pos_x = bx * h_mul + (mvx >> (3 + ss_hor));
+    const int pos_y = by * v_mul + (mvy >> (3 + ss_ver));
+
+    ptrdiff_t src_stride = refp->p.stride[is_chroma];
+    const pixel *src;
+    int plane_w, plane_h;
+
+    if (refp == &f->cur) {
+        // intrabc: the reference is the current picture
+        plane_w = f->bw * 4 >> ss_hor;
+        plane_h = f->bh * 4 >> ss_ver;
+    } else {
+        // regular inter: wait for the needed rows of the reference first
+        dav1d_thread_picture_wait(refp, pos_y + blk_h + has_frac_y * 4,
+                                  PLANE_TYPE_Y + is_chroma);
+        plane_w = (f->cur.p.p.w + ss_hor) >> ss_hor;
+        plane_h = (f->cur.p.p.h + ss_ver) >> ss_ver;
+    }
+
+    const int crosses_edge =
+        pos_x < has_frac_x * 3 || pos_y < has_frac_y * 3 ||
+        pos_x + blk_w + has_frac_x * 4 > plane_w ||
+        pos_y + blk_h + has_frac_y * 4 > plane_h;
+
+    if (!crosses_edge) {
+        src = ((pixel *) refp->p.data[pl]) + PXSTRIDE(src_stride) * pos_y + pos_x;
+    } else {
+        // the padded copy includes the extra filter taps (3 before, 4 after)
+        // when the respective direction has a fractional component
+        emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl], src_stride,
+                 blk_w + has_frac_x * 7, blk_h + has_frac_y * 7,
+                 plane_w, plane_h,
+                 pos_x - has_frac_x * 3, pos_y - has_frac_y * 3);
+        src = &t->emu_edge[160 * has_frac_y * 3 + has_frac_x * 3];
+        src_stride = 160 * sizeof(pixel);
+    }
+
+    if (dst8 == NULL) {
+        f->dsp->mc.mct[filter_2d](dst16, src, src_stride, blk_w,
+                                  blk_h, frac_x << !ss_hor, frac_y << !ss_ver);
+    } else {
+        f->dsp->mc.mc[filter_2d](dst8, dst_stride, src, src_stride, blk_w,
+                                 blk_h, frac_x << !ss_hor, frac_y << !ss_ver);
+    }
+}
+
+/*
+ * Overlapped block motion compensation for plane `pl` of the current block.
+ *
+ * Using the motion of inter-coded neighbours above and to the left, extra
+ * predictions are rendered into t->scratch.lap with mc() and blended into
+ * `dst` with the obmc_mask_* weights below. At most imin(b_dim[2], 4) above
+ * neighbours and imin(b_dim[3], 4) left neighbours are used.
+ */
+static void obmc(Dav1dTileContext *const t,
+                 pixel *const dst, const ptrdiff_t dst_stride,
+                 const uint8_t *const b_dim, const int pl,
+                 const int bx4, const int by4, const int w4, const int h4)
+{
+    assert(!(t->bx & 1) && !(t->by & 1));
+    const Dav1dFrameContext *const f = t->f;
+    const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
+    pixel *const lap = t->scratch.lap;
+    static const uint8_t obmc_mask_2[2] = { 19, 0 };
+    static const uint8_t obmc_mask_4[4] = { 25, 14, 5, 0 };
+    static const uint8_t obmc_mask_8[8] = { 28, 22, 16, 11, 7, 3, 0, 0 };
+    static const uint8_t obmc_mask_16[16] = { 30, 27, 24, 21, 18, 15, 12, 10,
+                                               8,  6,  4,  3,  0,  0,  0,  0 };
+    static const uint8_t obmc_mask_32[32] = { 31, 29, 28, 26, 24, 23, 21, 20,
+                                              19, 17, 16, 14, 13, 12, 11,  9,
+                                               8,  7,  6,  5,  4,  4,  3,  2,
+                                               0,  0,  0,  0,  0,  0,  0,  0 };
+    static const uint8_t *const obmc_masks[] = {
+        obmc_mask_2, obmc_mask_4, obmc_mask_8, obmc_mask_16, obmc_mask_32
+    };
+    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+
+    // blend with the predictions of the neighbours above
+    if (t->by > t->ts->tiling.row_start &&
+        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+    {
+        const int max_above = imin(b_dim[2], 4);
+        const int ov_h4 = imin(b_dim[1], 16) >> 1;
+        const int blend_h = (v_mul * imin(b_dim[1], 16)) >> 1;
+        const uint8_t *const mask = obmc_masks[imin(b_dim[3], 4) - ss_ver];
+        int used = 0, x = 0;
+
+        while (x < w4 && used < max_above) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs *const a_r = &r[x - f->b4_stride + 1];
+            const uint8_t *const a_b_dim =
+                dav1d_block_dimensions[sbtype_to_bs[a_r->sb_type]];
+
+            if (a_r->ref[0] > 0) {
+                const int ov_w4 = iclip(a_b_dim[0], 2, b_dim[0]);
+
+                mc(t, lap, NULL, 128 * sizeof(pixel),
+                   ov_w4, ov_h4,
+                   t->bx + x, t->by, pl, a_r->mv[0],
+                   &f->refp[a_r->ref[0] - 1],
+                   dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
+                f->dsp->mc.blend(&dst[x * h_mul], dst_stride,
+                                 lap, 128 * sizeof(pixel),
+                                 h_mul * ov_w4, blend_h,
+                                 mask, 1);
+                used++;
+            }
+            // step over the neighbour (at least two 4-pixel units)
+            x += imax(a_b_dim[0], 2);
+        }
+    }
+
+    // blend with the predictions of the neighbours to the left
+    if (t->bx > t->ts->tiling.col_start) {
+        const int max_left = imin(b_dim[3], 4);
+        const int ov_w4 = imin(b_dim[0], 16) >> 1;
+        const int blend_w = (h_mul * imin(b_dim[0], 16)) >> 1;
+        const uint8_t *const mask = obmc_masks[imin(b_dim[2], 4) - ss_hor];
+        int used = 0, y = 0;
+
+        while (y < h4 && used < max_left) {
+            // only odd blocks are considered for overlap handling, hence +1
+            const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
+            const uint8_t *const l_b_dim =
+                dav1d_block_dimensions[sbtype_to_bs[l_r->sb_type]];
+
+            if (l_r->ref[0] > 0) {
+                const int ov_h4 = iclip(l_b_dim[1], 2, b_dim[1]);
+
+                mc(t, lap, NULL, 32 * sizeof(pixel),
+                   ov_w4, ov_h4,
+                   t->bx, t->by + y, pl, l_r->mv[0],
+                   &f->refp[l_r->ref[0] - 1],
+                   dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
+                f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)], dst_stride,
+                                 lap, 32 * sizeof(pixel),
+                                 blend_w, v_mul * ov_h4,
+                                 mask, 0);
+                used++;
+            }
+            // step over the neighbour (at least two 4-pixel units)
+            y += imax(l_b_dim[1], 2);
+        }
+    }
+}
+
+/*
+ * Warped (affine) motion compensation of plane `pl` for a block of size
+ * b_dim, processed in 8x8 pixel units.
+ *
+ * Exactly one of dst8 (pixel output, dsp->mc.warp8x8) and dst16
+ * (intermediate output, dsp->mc.warp8x8t) must be non-NULL; `dstride` is
+ * passed through to the respective DSP function. `refp` is the reference
+ * picture and `wmp` supplies the warp matrix and the alpha/beta/gamma/delta
+ * and abcd parameters. The block dimensions in pixels of this plane must be
+ * multiples of 8 (asserted).
+ */
+static void warp_affine(Dav1dTileContext *const t,
+                        pixel *dst8, coef *dst16, const ptrdiff_t dstride,
+                        const uint8_t *const b_dim, const int pl,
+                        const Dav1dThreadPicture *const refp,
+                        const WarpedMotionParams *const wmp)
+{
+    assert((dst8 != NULL) ^ (dst16 != NULL));
+    const Dav1dFrameContext *const f = t->f;
+    const Dav1dDSPContext *const dsp = f->dsp;
+    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
+    const int32_t *const mat = wmp->matrix;
+    const int width  = (f->cur.p.p.w + ss_hor) >> ss_hor;
+    const int height = (f->cur.p.p.h + ss_ver) >> ss_ver;
+
+    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
+        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
+            // calculate transformation relative to center of 8x8 block in
+            // luma pixel units
+            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
+            const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+            // The products of matrix entries and pixel coordinates can
+            // exceed the 32-bit range for large frames, so evaluate the
+            // transformation in 64 bits to avoid signed overflow.
+            const int64_t mvx = ((int64_t) mat[2] * src_x +
+                                 (int64_t) mat[3] * src_y + mat[0]) >> ss_hor;
+            const int64_t mvy = ((int64_t) mat[4] * src_x +
+                                 (int64_t) mat[5] * src_y + mat[1]) >> ss_ver;
+
+            // integer position (upper bits) and fractional part (low 16 bits)
+            const int dx = (int) (mvx >> 16) - 4;
+            const int mx = ((int) (mvx & 0xffff) - wmp->alpha * 4 -
+                                                   wmp->beta  * 7) & ~0x3f;
+            const int dy = (int) (mvy >> 16) - 4;
+            const int my = ((int) (mvy & 0xffff) - wmp->gamma * 4 -
+                                                   wmp->delta * 4) & ~0x3f;
+
+            const pixel *ref_ptr;
+            ptrdiff_t ref_stride = refp->p.stride[!!pl];
+
+            // make sure the needed reference rows are available
+            dav1d_thread_picture_wait(refp, dy + 4 + 8,
+                                      PLANE_TYPE_Y + !!pl);
+            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
+                // 15x15 source area reaches outside of the reference:
+                // build a padded copy in the scratch buffer
+                emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl],
+                         ref_stride, 15, 15, width, height, dx - 3, dy - 3);
+                ref_ptr = &t->emu_edge[160 * 3 + 3];
+                ref_stride = 160 * sizeof(pixel);
+            } else {
+                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+            }
+            if (dst16 != NULL)
+                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
+                                 wmp->abcd, mx, my);
+            else
+                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
+                                wmp->abcd, mx, my);
+        }
+        // advance the active destination by 8 rows
+        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
+        else      dst16 += 8 * dstride;
+    }
+}
+
+/*
+ * Reconstruct the intra-coded block `b` of size `bs` at the tile context's
+ * current position (t->bx, t->by, in units of 4 pixels).
+ *
+ * The block is walked in sub-regions of at most 16x16 4-pixel units. For
+ * each sub-region, luma is handled first: palette or intra prediction per
+ * transform block, followed by the residual (inverse transform added onto
+ * the prediction when eob >= 0). Chroma follows, using CfL, palette or
+ * regular intra prediction, again followed by the residual.
+ *
+ * When f->frame_thread.pass is non-zero, coefficients, eob and txtp are
+ * taken from the frame-thread storage (ts->frame_thread.cf,
+ * f->frame_thread.cbi); otherwise they are decoded here with decode_coefs()
+ * and the above/left coefficient contexts are updated.
+ *
+ * intra_edge_flags carries the top-right / bottom-left availability flags
+ * of the whole block. t->bx / t->by are modified while iterating and are
+ * restored before returning.
+ */
+void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
+ const enum EdgeFlags intra_edge_flags,
+ const Av1Block *const b)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ // block size clipped to the frame boundary, for luma and chroma
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+ const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+ const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
+ const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
+
+ // coefficient coding
+ // edge buffer for intra prediction; `edge` points at its middle element
+ // (the debug dumps below read left pixels before it and top pixels after it)
+ ALIGN_STK_32(pixel, edge_buf, 257,);
+ pixel *const edge = edge_buf + 128;
+ const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
+
+ for (int init_y = 0; init_y < h4; init_y += 16) {
+ for (int init_x = 0; init_x < w4; init_x += 16) {
+ // luma palette prediction covers the whole block at once
+ if (b->pal_sz[0]) {
+ pixel *dst = ((pixel *) f->cur.p.data[0]) +
+ 4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
+ const uint8_t *pal_idx;
+ if (f->frame_thread.pass) {
+ pal_idx = ts->frame_thread.pal_idx;
+ ts->frame_thread.pal_idx += bw4 * bh4 * 16;
+ } else {
+ pal_idx = t->scratch.pal_idx;
+ }
+ const uint16_t *const pal = f->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
+ f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal,
+ pal_idx, bw4 * 4, bh4 * 4);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, PXSTRIDE(f->cur.p.stride[0]),
+ bw4 * 4, bh4 * 4, "y-pal-pred");
+ }
+
+ const int sm_fl = sm_flag(t->a, bx4) | sm_flag(&t->l, by4);
+ // top-right / bottom-left availability for this sub-region: taken
+ // from the neighbouring sub-region inside the block where there is
+ // one, otherwise from the block-level intra_edge_flags
+ const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
+ intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
+ const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
+ intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
+ int y, x;
+ const int sub_h4 = imin(h4, 16 + init_y);
+ const int sub_w4 = imin(w4, init_x + 16);
+ // luma: iterate over the transform blocks of this sub-region
+ for (y = init_y, t->by += init_y; y < sub_h4;
+ y += t_dim->h, t->by += t_dim->h)
+ {
+ pixel *dst = ((pixel *) f->cur.p.data[0]) +
+ 4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) +
+ t->bx + init_x);
+ for (x = init_x, t->bx += init_x; x < sub_w4;
+ x += t_dim->w, t->bx += t_dim->w)
+ {
+ // palette blocks were already predicted above
+ if (b->pal_sz[0]) goto skip_y_pred;
+
+ int angle = b->y_angle;
+ const enum EdgeFlags edge_flags =
+ (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
+ 0 : EDGE_I444_TOP_HAS_RIGHT) |
+ ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
+ 0 : EDGE_I444_LEFT_HAS_BOTTOM);
+ // on the first row of a superblock, the top edge is taken
+ // from the f->ipred_edge buffer instead
+ const pixel *top_sb_edge = NULL;
+ if (!(t->by & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[0];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ const enum IntraPredMode m =
+ bytefn(dav1d_prepare_intra_edges)(t->bx,
+ t->bx > ts->tiling.col_start,
+ t->by,
+ t->by > ts->tiling.row_start,
+ ts->tiling.col_end,
+ ts->tiling.row_end,
+ edge_flags, dst,
+ f->cur.p.stride[0], top_sb_edge,
+ b->y_mode, &angle,
+ t_dim->w, t_dim->h, edge);
+ dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
+ t_dim->w * 4, t_dim->h * 4,
+ angle | sm_fl);
+
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
+ t_dim->h * 4, 2, "l");
+ hex_dump(edge, 0, 1, 1, "tl");
+ hex_dump(edge + 1, t_dim->w * 4,
+ t_dim->w * 4, 2, "t");
+ hex_dump(dst, f->cur.p.stride[0],
+ t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
+ }
+
+ skip_y_pred: {}
+ // luma residual
+ if (!b->skip) {
+ coef *cf;
+ int eob;
+ enum TxfmType txtp;
+ if (f->frame_thread.pass) {
+ // coefficients were parsed earlier: fetch them and
+ // advance the coefficient cursor
+ cf = ts->frame_thread.cf;
+ ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
+ const struct CodedBlockInfo *const cbi =
+ &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+ eob = cbi->eob[0];
+ txtp = cbi->txtp[0];
+ } else {
+ // decode now and update the above/left contexts
+ uint8_t cf_ctx;
+ cf = t->cf;
+ eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
+ &t->l.lcoef[by4 + y], b->tx, bs,
+ b, 1, 0, cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
+ b->tx, txtp, eob, ts->msac.rng);
+ memset(&t->a->lcoef[bx4 + x], cf_ctx,
+ imin(t_dim->w, f->bw - t->bx));
+ memset(&t->l.lcoef[by4 + y], cf_ctx,
+ imin(t_dim->h, f->bh - t->by));
+ }
+ // the inverse transform is only applied when eob >= 0
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, imin(t_dim->h, 8) * 4,
+ imin(t_dim->w, 8) * 4, 3, "dq");
+ dsp->itx.itxfm_add[b->tx]
+ [txtp](dst,
+ f->cur.p.stride[0],
+ cf, eob);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, f->cur.p.stride[0],
+ t_dim->w * 4, t_dim->h * 4, "recon");
+ }
+ } else if (!f->frame_thread.pass) {
+ // skipped block: reset the coefficient contexts
+ memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
+ memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
+ }
+ dst += 4 * t_dim->w;
+ }
+ // undo the horizontal walk
+ t->bx -= x;
+ }
+ // undo the vertical walk
+ t->by -= y;
+
+ if (!has_chroma) continue;
+
+ const ptrdiff_t stride = f->cur.p.stride[1];
+
+ // chroma-from-luma: derive the AC buffer from reconstructed luma,
+ // then predict each chroma plane that has a non-zero alpha
+ if (b->uv_mode == CFL_PRED) {
+ assert(!init_x && !init_y);
+
+ int16_t *const ac = t->scratch.ac;
+ pixel *y_src = ((pixel *) f->cur.p.data[0]) + 4 * (t->bx & ~ss_hor) +
+ 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.p.stride[0]);
+ const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
+ (t->by >> ss_ver) * PXSTRIDE(stride));
+ pixel *const uv_dst[2] = { ((pixel *) f->cur.p.data[1]) + uv_off,
+ ((pixel *) f->cur.p.data[2]) + uv_off };
+
+ // visible size rounded up to a multiple of the luma transform size
+ const int furthest_r =
+ ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
+ const int furthest_b =
+ ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
+ dsp->ipred.cfl_ac[f->cur.p.p.layout - 1]
+ [b->uvtx](ac, y_src, f->cur.p.stride[0],
+ cbw4 - (furthest_r >> ss_hor),
+ cbh4 - (furthest_b >> ss_ver));
+ for (int pl = 0; pl < 2; pl++) {
+ // planes with a zero alpha are predicted in the regular
+ // chroma loop further below
+ if (!b->cfl_alpha[pl]) continue;
+ int angle = 0;
+ const pixel *top_sb_edge = NULL;
+ if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[pl + 1];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+ const int xstart = ts->tiling.col_start >> ss_hor;
+ const int ystart = ts->tiling.row_start >> ss_ver;
+ const enum IntraPredMode m =
+ bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+ ypos, ypos > ystart,
+ ts->tiling.col_end >> ss_hor,
+ ts->tiling.row_end >> ss_ver,
+ 0, uv_dst[pl], stride,
+ top_sb_edge, DC_PRED, &angle,
+ uv_t_dim->w,
+ uv_t_dim->h, edge);
+ dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
+ uv_t_dim->w * 4,
+ uv_t_dim->h * 4,
+ ac, b->cfl_alpha[pl]);
+ }
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
+ hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
+ hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
+ }
+ } else if (b->pal_sz[1]) {
+ // chroma palette prediction: both planes share the same index
+ // map but use their own palette
+ ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
+ (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
+ const uint8_t *pal_idx;
+ if (f->frame_thread.pass) {
+ pal_idx = ts->frame_thread.pal_idx;
+ ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
+ } else {
+ pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
+ }
+ const uint16_t *const pal_u = f->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][1] : t->pal[1];
+ f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[1]) + uv_dstoff,
+ f->cur.p.stride[1], pal_u,
+ pal_idx, cbw4 * 4, cbh4 * 4);
+ const uint16_t *const pal_v = f->frame_thread.pass ?
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
+ ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
+ f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[2]) + uv_dstoff,
+ f->cur.p.stride[1], pal_v,
+ pal_idx, cbw4 * 4, cbh4 * 4);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(((pixel *) f->cur.p.data[1]) + uv_dstoff,
+ PXSTRIDE(f->cur.p.stride[1]),
+ cbw4 * 4, cbh4 * 4, "u-pal-pred");
+ hex_dump(((pixel *) f->cur.p.data[2]) + uv_dstoff,
+ PXSTRIDE(f->cur.p.stride[1]),
+ cbw4 * 4, cbh4 * 4, "v-pal-pred");
+ }
+ }
+
+ const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
+ sm_uv_flag(&t->l, cby4);
+ // chroma counterparts of sb_has_tr / sb_has_bl; the flag bit is
+ // selected according to the pixel layout
+ const int uv_sb_has_tr =
+ ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
+ intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.p.layout - 1));
+ const int uv_sb_has_bl =
+ init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
+ intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.p.layout - 1));
+ const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
+ const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
+ // chroma: iterate over the transform blocks of both planes; x/y are
+ // in chroma units while t->bx/t->by keep advancing in luma units
+ for (int pl = 0; pl < 2; pl++) {
+ for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
+ y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
+ {
+ pixel *dst = ((pixel *) f->cur.p.data[1 + pl]) +
+ 4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
+ ((t->bx + init_x) >> ss_hor));
+ for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
+ x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
+ {
+ // already predicted above by CfL or palette
+ if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
+ b->pal_sz[1])
+ {
+ goto skip_uv_pred;
+ }
+
+ int angle = b->uv_angle;
+ // this probably looks weird because we're using
+ // luma flags in a chroma loop, but that's because
+ // prepare_intra_edges() expects luma flags as input
+ const enum EdgeFlags edge_flags =
+ (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
+ (x + uv_t_dim->w >= sub_cw4)) ?
+ 0 : EDGE_I444_TOP_HAS_RIGHT) |
+ ((x > (init_x >> ss_hor) ||
+ (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
+ 0 : EDGE_I444_LEFT_HAS_BOTTOM);
+ const pixel *top_sb_edge = NULL;
+ if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[1 + pl];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ // CfL with a zero alpha is predicted as DC_PRED
+ const enum IntraPredMode uv_mode =
+ b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
+ const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
+ const int xstart = ts->tiling.col_start >> ss_hor;
+ const int ystart = ts->tiling.row_start >> ss_ver;
+ const enum IntraPredMode m =
+ bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
+ ypos, ypos > ystart,
+ ts->tiling.col_end >> ss_hor,
+ ts->tiling.row_end >> ss_ver,
+ edge_flags, dst, stride,
+ top_sb_edge, uv_mode,
+ &angle, uv_t_dim->w,
+ uv_t_dim->h, edge);
+ dsp->ipred.intra_pred[m](dst, stride, edge,
+ uv_t_dim->w * 4,
+ uv_t_dim->h * 4,
+ angle | sm_uv_fl);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
+ uv_t_dim->h * 4, 2, "l");
+ hex_dump(edge, 0, 1, 1, "tl");
+ hex_dump(edge + 1, uv_t_dim->w * 4,
+ uv_t_dim->w * 4, 2, "t");
+ hex_dump(dst, stride, uv_t_dim->w * 4,
+ uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
+ }
+
+ skip_uv_pred: {}
+ // chroma residual, same scheme as for luma
+ if (!b->skip) {
+ enum TxfmType txtp;
+ int eob;
+ coef *cf;
+ if (f->frame_thread.pass) {
+ cf = ts->frame_thread.cf;
+ ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
+ const struct CodedBlockInfo *const cbi =
+ &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+ eob = cbi->eob[pl + 1];
+ txtp = cbi->txtp[pl + 1];
+ } else {
+ uint8_t cf_ctx;
+ cf = t->cf;
+ eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+ &t->l.ccoef[pl][cby4 + y],
+ b->uvtx, bs, b, 1, 1 + pl, cf,
+ &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+ "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
+ pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
+ memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
+ imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
+ memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
+ imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+ }
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, uv_t_dim->h * 4,
+ uv_t_dim->w * 4, 3, "dq");
+ dsp->itx.itxfm_add[b->uvtx]
+ [txtp](dst, stride,
+ cf, eob);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(dst, stride, uv_t_dim->w * 4,
+ uv_t_dim->h * 4, "recon");
+ }
+ } else if (!f->frame_thread.pass) {
+ // skipped block: reset the coefficient contexts
+ memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
+ memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
+ }
+ dst += uv_t_dim->w * 4;
+ }
+ // undo the horizontal walk (x is in chroma units)
+ t->bx -= x << ss_hor;
+ }
+ // undo the vertical walk (y is in chroma units)
+ t->by -= y << ss_ver;
+ }
+ }
+ }
+}
+
+void bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
+ const Av1Block *const b)
+{
+ Dav1dTileState *const ts = t->ts;
+ const Dav1dFrameContext *const f = t->f;
+ const Dav1dDSPContext *const dsp = f->dsp;
+ const int bx4 = t->bx & 31, by4 = t->by & 31;
+ const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
+ const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+ const int bw4 = b_dim[0], bh4 = b_dim[1];
+ const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+ const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+ (bw4 > ss_hor || t->bx & 1) &&
+ (bh4 > ss_ver || t->by & 1);
+ const int chr_layout_idx = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
+ DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout;
+
+ // prediction
+ const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
+ pixel *dst = ((pixel *) f->cur.p.data[0]) +
+ 4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
+ const ptrdiff_t uvdstoff =
+ 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
+ if (!(f->frame_hdr.frame_type & 1)) {
+ // intrabc
+ mc(t, dst, NULL, f->cur.p.stride[0],
+ bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
+ if (has_chroma) for (int pl = 1; pl < 3; pl++)
+ mc(t, ((pixel *) f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
+ bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+ t->bx & ~ss_hor, t->by & ~ss_ver,
+ pl, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
+ } else if (b->comp_type == COMP_INTER_NONE) {
+ const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
+ const enum Filter2d filter_2d = b->filter2d;
+
+ if (imin(bw4, bh4) > 1 && !f->frame_hdr.force_integer_mv &&
+ ((b->inter_mode == GLOBALMV &&
+ f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
+ (b->motion_mode == MM_WARP &&
+ t->warpmv.type > WM_TYPE_TRANSLATION)))
+ {
+ warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr.gmv[b->ref[0]]);
+ } else {
+ mc(t, dst, NULL, f->cur.p.stride[0],
+ bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, filter_2d);
+ if (b->motion_mode == MM_OBMC)
+ obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
+ }
+ if (b->interintra_type) {
+ ALIGN_STK_32(pixel, tl_edge_buf, 65,);
+ pixel *const tl_edge = tl_edge_buf + 32;
+ enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
+ SMOOTH_PRED : b->interintra_mode;
+ pixel *const tmp = t->scratch.interintra;
+ int angle = 0;
+ const pixel *top_sb_edge = NULL;
+ if (!(t->by & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[0];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
+ t->by, t->by > ts->tiling.row_start,
+ ts->tiling.col_end, ts->tiling.row_end,
+ 0, dst, f->cur.p.stride[0], top_sb_edge,
+ m, &angle, bw4, bh4, tl_edge);
+ dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
+ tl_edge, bw4 * 4, bh4 * 4, 0);
+ const uint8_t *const ii_mask =
+ b->interintra_type == INTER_INTRA_BLEND ?
+ dav1d_ii_masks[bs][0][b->interintra_mode] :
+ dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+ dsp->mc.blend(dst, f->cur.p.stride[0], tmp, bw4 * 4 * sizeof(pixel),
+ bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
+ }
+
+ if (!has_chroma) goto skip_inter_chroma_pred;
+
+ // sub8x8 derivation
+ int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
+ refmvs *r;
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ r = &f->mvs[t->by * f->b4_stride + t->bx];
+ if (bw4 == 1) is_sub8x8 &= r[-1].ref[0] > 0;
+ if (bh4 == ss_ver) is_sub8x8 &= r[-f->b4_stride].ref[0] > 0;
+ if (bw4 == 1 && bh4 == ss_ver)
+ is_sub8x8 &= r[-(1 + f->b4_stride)].ref[0] > 0;
+ }
+
+ // chroma prediction
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ int h_off = 0, v_off = 0;
+ if (bw4 == 1 && bh4 == ss_ver) {
+ for (int pl = 0; pl < 2; pl++)
+ mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+ NULL, f->cur.p.stride[1],
+ bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
+ r[-(f->b4_stride + 1)].mv[0],
+ &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
+ f->frame_thread.pass != 2 ? t->tl_4x4_filter :
+ f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
+ v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
+ h_off = 2;
+ }
+ if (bw4 == 1) {
+ const enum Filter2d left_filter_2d =
+ dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
+ for (int pl = 0; pl < 2; pl++)
+ mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
+ f->cur.p.stride[1], bw4, bh4, t->bx - 1,
+ t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
+ f->frame_thread.pass != 2 ? left_filter_2d :
+ f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
+ h_off = 2;
+ }
+ if (bh4 == ss_ver) {
+ const enum Filter2d top_filter_2d =
+ dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
+ for (int pl = 0; pl < 2; pl++)
+ mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
+ f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
+ 1 + pl, r[-f->b4_stride].mv[0],
+ &f->refp[r[-f->b4_stride].ref[0] - 1],
+ f->frame_thread.pass != 2 ? top_filter_2d :
+ f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
+ v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
+ }
+ for (int pl = 0; pl < 2; pl++)
+ mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
+ bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], refp, filter_2d);
+ } else {
+ if (imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
+ ((b->inter_mode == GLOBALMV &&
+ f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
+ (b->motion_mode == MM_WARP &&
+ t->warpmv.type > WM_TYPE_TRANSLATION)))
+ {
+ for (int pl = 0; pl < 2; pl++)
+ warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
+ f->cur.p.stride[1], b_dim, 1 + pl, refp,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr.gmv[b->ref[0]]);
+ } else {
+ for (int pl = 0; pl < 2; pl++) {
+ mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+ NULL, f->cur.p.stride[1],
+ bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+ t->bx & ~ss_hor, t->by & ~ss_ver,
+ 1 + pl, b->mv[0], refp, filter_2d);
+ if (b->motion_mode == MM_OBMC)
+ obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+ f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
+ }
+ }
+ if (b->interintra_type) {
+ // FIXME for 8x32 with 4:2:2 subsampling, this probably does
+ // the wrong thing since it will select 4x16, not 4x32, as a
+ // transform size...
+ const uint8_t *const ii_mask =
+ b->interintra_type == INTER_INTRA_BLEND ?
+ dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
+ dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
+
+ for (int pl = 0; pl < 2; pl++) {
+ pixel *const tmp = t->scratch.interintra;
+ pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
+ enum IntraPredMode m =
+ b->interintra_mode == II_SMOOTH_PRED ?
+ SMOOTH_PRED : b->interintra_mode;
+ int angle = 0;
+ pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
+ const pixel *top_sb_edge = NULL;
+ if (!(t->by & (f->sb_step - 1))) {
+ top_sb_edge = f->ipred_edge[pl + 1];
+ const int sby = t->by >> f->sb_shift;
+ top_sb_edge += f->sb128w * 128 * (sby - 1);
+ }
+ m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
+ (t->bx >> ss_hor) >
+ (ts->tiling.col_start >> ss_hor),
+ t->by >> ss_ver,
+ (t->by >> ss_ver) >
+ (ts->tiling.row_start >> ss_ver),
+ ts->tiling.col_end >> ss_hor,
+ ts->tiling.row_end >> ss_ver,
+ 0, uvdst, f->cur.p.stride[1],
+ top_sb_edge, m,
+ &angle, cbw4, cbh4, tl_edge);
+ dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
+ tl_edge, cbw4 * 4, cbh4 * 4, 0);
+ dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp, cbw4 * 4 * sizeof(pixel),
+ cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
+ }
+ }
+ }
+
+ skip_inter_chroma_pred: {}
+ t->tl_4x4_filter = filter_2d;
+ } else {
+ const enum Filter2d filter_2d = b->filter2d;
+ // Maximum super block size is 128x128
+ coef (*tmp)[128 * 128] = (coef (*)[128 * 128]) t->scratch.compinter;
+ int jnt_weight;
+ uint8_t *const seg_mask = t->scratch_seg_mask;
+ const uint8_t *mask;
+
+ for (int i = 0; i < 2; i++) {
+ const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+
+ if (b->inter_mode == GLOBALMV_GLOBALMV && !f->frame_hdr.force_integer_mv &&
+ f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
+ {
+ warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
+ &f->frame_hdr.gmv[b->ref[i]]);
+ } else {
+ mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
+ b->mv[i], refp, filter_2d);
+ }
+ }
+ switch (b->comp_type) {
+ case COMP_INTER_AVG:
+ dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
+ bw4 * 4, bh4 * 4);
+ break;
+ case COMP_INTER_WEIGHTED_AVG:
+ jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
+ dsp->mc.w_avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
+ bw4 * 4, bh4 * 4, jnt_weight);
+ break;
+ case COMP_INTER_SEG:
+ dsp->mc.w_mask[chr_layout_idx](dst, f->cur.p.stride[0],
+ tmp[b->mask_sign], tmp[!b->mask_sign],
+ bw4 * 4, bh4 * 4, seg_mask, b->mask_sign);
+ mask = seg_mask;
+ break;
+ case COMP_INTER_WEDGE:
+ mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
+ dsp->mc.mask(dst, f->cur.p.stride[0],
+ tmp[b->mask_sign], tmp[!b->mask_sign],
+ bw4 * 4, bh4 * 4, mask);
+ if (has_chroma)
+ mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
+ break;
+ }
+
+ // chroma
+ if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+ for (int i = 0; i < 2; i++) {
+ const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
+ if (b->inter_mode == GLOBALMV_GLOBALMV &&
+ imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
+ f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
+ {
+ warp_affine(t, NULL, tmp[i], bw4 * 2, b_dim, 1 + pl,
+ refp, &f->frame_hdr.gmv[b->ref[i]]);
+ } else {
+ mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
+ 1 + pl, b->mv[i], refp, filter_2d);
+ }
+ }
+ pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
+ switch (b->comp_type) {
+ case COMP_INTER_AVG:
+ dsp->mc.avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
+ bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver);
+ break;
+ case COMP_INTER_WEIGHTED_AVG:
+ dsp->mc.w_avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
+ bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight);
+ break;
+ case COMP_INTER_WEDGE:
+ case COMP_INTER_SEG:
+ dsp->mc.mask(uvdst, f->cur.p.stride[1],
+ tmp[b->mask_sign], tmp[!b->mask_sign],
+ bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask);
+ break;
+ }
+ }
+ }
+
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
+ hex_dump(dst, f->cur.p.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
+ if (has_chroma) {
+ hex_dump(&((pixel *) f->cur.p.data[1])[uvdstoff], f->cur.p.stride[1],
+ cbw4 * 4, cbh4 * 4, "u-pred");
+ hex_dump(&((pixel *) f->cur.p.data[2])[uvdstoff], f->cur.p.stride[1],
+ cbw4 * 4, cbh4 * 4, "v-pred");
+ }
+ }
+
+ const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
+
+ if (b->skip) {
+ // reset coef contexts
+ memset(&t->a->lcoef[bx4], 0x40, w4);
+ memset(&t->l.lcoef[by4], 0x40, h4);
+ if (has_chroma) {
+ memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
+ memset(&t->l.ccoef[0][cby4], 0x40, ch4);
+ memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
+ memset(&t->l.ccoef[1][cby4], 0x40, ch4);
+ }
+ return;
+ }
+
+ const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
+ const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
+
+ for (int init_y = 0; init_y < bh4; init_y += 16) {
+ for (int init_x = 0; init_x < bw4; init_x += 16) {
+ // coefficient coding & inverse transforms
+ int y_off = !!init_y, y;
+ dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * init_y;
+ for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
+ y += ytx->h, y_off++)
+ {
+ int x, x_off = !!init_x;
+ for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
+ x += ytx->w, x_off++)
+ {
+ read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
+ x_off, y_off, &dst[x * 4]);
+ t->bx += ytx->w;
+ }
+ dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * ytx->h;
+ t->bx -= x;
+ t->by += ytx->h;
+ }
+ dst -= PXSTRIDE(f->cur.p.stride[0]) * 4 * y;
+ t->by -= y;
+
+ // chroma coefs and inverse transform
+ if (has_chroma) for (int pl = 0; pl < 2; pl++) {
+ pixel *uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff +
+ (PXSTRIDE(f->cur.p.stride[1]) * init_y * 4 >> ss_ver);
+ for (y = init_y >> ss_ver, t->by += init_y;
+ y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
+ {
+ int x;
+ for (x = init_x >> ss_hor, t->bx += init_x;
+ x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
+ {
+ coef *cf;
+ int eob;
+ enum TxfmType txtp;
+ if (f->frame_thread.pass) {
+ cf = ts->frame_thread.cf;
+ ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
+ const struct CodedBlockInfo *const cbi =
+ &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
+ eob = cbi->eob[1 + pl];
+ txtp = cbi->txtp[1 + pl];
+ } else {
+ uint8_t cf_ctx;
+ cf = t->cf;
+ txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
+ bx4 + (x << ss_hor)];
+ eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
+ &t->l.ccoef[pl][cby4 + y],
+ b->uvtx, bs, b, 0, 1 + pl,
+ cf, &txtp, &cf_ctx);
+ if (DEBUG_BLOCK_INFO)
+ printf("Post-uv-cf-blk[pl=%d,tx=%d,"
+ "txtp=%d,eob=%d]: r=%d\n",
+ pl, b->uvtx, txtp, eob, ts->msac.rng);
+ memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
+ imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
+ memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
+ imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
+ }
+ if (eob >= 0) {
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
+ dsp->itx.itxfm_add[b->uvtx]
+ [txtp](&uvdst[4 * x],
+ f->cur.p.stride[1],
+ cf, eob);
+ if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
+ hex_dump(&uvdst[4 * x], f->cur.p.stride[1],
+ uvtx->w * 4, uvtx->h * 4, "recon");
+ }
+ t->bx += uvtx->w << ss_hor;
+ }
+ uvdst += PXSTRIDE(f->cur.p.stride[1]) * 4 * uvtx->h;
+ t->bx -= x << ss_hor;
+ t->by += uvtx->h << ss_ver;
+ }
+ t->by -= y << ss_ver;
+ }
+ }
+ }
+}
+
+ // Applies the enabled in-loop filters to superblock row `sby`, then advances
+ // the f->lf plane and mask pointers to the next superblock row.
+ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
+     const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+     const int sbsz = f->sb_step, sbh = f->sbh;
+     const int row0 = sby * sbsz; // first block row of this superblock row
+
+     // the loop filter only runs if at least one luma level is non-zero
+     if (f->frame_hdr.loopfilter.level_y[0] || f->frame_hdr.loopfilter.level_y[1]) {
+         // forwards the pre-increment tile row index, or 0 away from a row start
+         const int start_of_tile_row =
+             f->frame_hdr.tiling.row_start_sb[f->lf.tile_row] == sby ? f->lf.tile_row++ : 0;
+         bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
+                                        start_of_tile_row);
+     }
+     // Store loop filtered pixels required by loop restoration
+     if (f->seq_hdr.restoration)
+         bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
+
+     if (f->seq_hdr.cdef) {
+         if (sby) {
+             // covers the 2 block rows (8 luma rows) above row0 that the call below omits
+             pixel *p_up[3];
+             p_up[0] = f->lf.p[0] - 8 * PXSTRIDE(f->cur.p.stride[0]);
+             for (int pl = 1; pl <= 2; pl++)
+                 p_up[pl] = f->lf.p[pl] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver);
+             bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr, row0 - 2, row0);
+         }
+         // all but the last superblock row hold back their final two block rows
+         const int n_blks = sby + 1 < sbh ? sbsz - 2 : sbsz;
+         bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, row0,
+                                 imin(row0 + n_blks, f->bh));
+     }
+     if (f->seq_hdr.restoration)
+         bytefn(dav1d_lr_sbrow)(f, f->lf.p, sby);
+
+     // move down one superblock row: sb_step * 4 rows, shifted by ss_ver for chroma
+     f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[0]);
+     for (int pl = 1; pl <= 2; pl++)
+         f->lf.p[pl] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+     f->lf.prev_mask_ptr = f->lf.mask_ptr;
+     if (f->seq_hdr.sb128 || (sby & 1)) // every row with sb128, else after odd rows
+         f->lf.mask_ptr += f->sb128w;
+ }
+
+ // Copies the last pixel row above block row t->by + sb_step (presumably the bottom
+ // of the current superblock row), across this tile's columns, into f->ipred_edge[].
+ void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
+     const Dav1dFrameContext *const f = t->f;
+     Dav1dTileState *const ts = t->ts;
+     const int col0 = ts->tiling.col_start; // in 4-pixel units, as is col_end
+     const int n_cols = ts->tiling.col_end - col0;
+     const int sb_bottom = (t->by + f->sb_step) * 4; // luma row sb_step blocks below t->by
+     // each superblock row index selects a slice of f->sb128w * 128 pixels
+     const int edge_off = f->sb128w * 128 * (t->by >> f->sb_shift);
+
+     const pixel *const luma = (const pixel *) f->cur.p.data[0];
+     pixel_copy(&f->ipred_edge[0][edge_off + col0 * 4],
+                luma + col0 * 4 + (sb_bottom - 1) * PXSTRIDE(f->cur.p.stride[0]),
+                4 * n_cols);
+     if (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400) return; // planes 1 and 2 skipped
+
+     const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+     const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+     const int cx = col0 * 4 >> ss_hor; // tile's left edge after horizontal subsampling
+     const ptrdiff_t uv_off = cx + ((sb_bottom >> ss_ver) - 1) * PXSTRIDE(f->cur.p.stride[1]);
+     for (int pl = 1; pl <= 2; pl++)
+         pixel_copy(&f->ipred_edge[pl][edge_off + cx],
+                    &((const pixel *) f->cur.p.data[pl])[uv_off], 4 * n_cols >> ss_hor);
+ }