shithub: dav1d

--- a/include/common/intops.h

+++ b/include/common/intops.h

@@ -52,6 +52,10 @@

     return s < 0 ? -v : v;

+// Gives v the sign of the 64-bit value s: returns -v when s is negative,

+// and v unchanged when s is zero or positive.

+static inline int apply_sign64(const int v, const int64_t s) {

+    if (s < 0)

+        return -v;

+    return v;

+}

 static inline int ulog2(const unsigned v) {

     return 31 - clz(v);

--- a/src/decode.c

+++ b/src/decode.c

@@ -1720,7 +1720,8 @@

                 uint64_t mask[2] = { 0, 0 };

                 find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,

                                   have_left, have_top, b->ref[0], mask);

-                const int allow_warp = !f->frame_hdr.force_integer_mv &&

+                const int allow_warp = !f->svc[b->ref[0]][0].scale &&

+                    !f->frame_hdr.force_integer_mv &&

                     f->frame_hdr.warp_motion && (mask[0] | mask[1]);

                 b->motion_mode = allow_warp ?

@@ -2938,8 +2939,10 @@

         for (int i = 0; i < 7; i++) {

             const int refidx = f->frame_hdr.refidx[i];

             if (!c->refs[refidx].p.p.data[0] ||

-                f->frame_hdr.width  != c->refs[refidx].p.p.p.w ||

-                f->frame_hdr.height != c->refs[refidx].p.p.p.h ||

+                f->frame_hdr.width * 2 < c->refs[refidx].p.p.p.w ||

+                f->frame_hdr.height * 2 < c->refs[refidx].p.p.p.h ||

+                f->frame_hdr.width > c->refs[refidx].p.p.p.w * 16 ||

+                f->frame_hdr.height > c->refs[refidx].p.p.p.h * 16 ||

                 f->seq_hdr.layout != c->refs[refidx].p.p.p.layout ||

                 f->seq_hdr.bpc != c->refs[refidx].p.p.p.bpc)

@@ -2949,6 +2952,21 @@

                 goto error;

             dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p);

+            if (f->frame_hdr.width  != c->refs[refidx].p.p.p.w ||

+                f->frame_hdr.height != c->refs[refidx].p.p.p.h)

+            {

+#define scale_fac(ref_sz, this_sz) \

+    (((ref_sz << 14) + (this_sz >> 1)) / this_sz)

+                f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w,

+                                               f->frame_hdr.width);

+                f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h,

+                                               f->frame_hdr.height);

+#undef scale_fac

+                f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4;

+                f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4;

+            } else {

+                f->svc[i][0].scale = 0;

+            }

--- a/src/internal.h

+++ b/src/internal.h

@@ -132,6 +132,12 @@

     } tile[256];

     int n_tile_data;

+    // for scalable references

+    struct ScalableMotionParams {

+        int scale; // if no scaling, this is 0

+        int step;

+    } svc[7][2 /* x, y */];

     const Dav1dContext *c;

     Dav1dTileContext *tc;

     int n_tc;

@@ -244,7 +250,7 @@

     int bx, by;

     BlockContext l, *a;

     coef *cf;

-    pixel *emu_edge; // stride=160

+    pixel *emu_edge; // stride=192 for non-SVC, or 320 for SVC

     // FIXME types can be changed to pixel (and dynamically allocated)

     // which would make copy/assign operations slightly faster?

     uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];

--- a/src/lib.c

+++ b/src/lib.c

@@ -117,7 +117,7 @@

             if (!t->scratch.mem) goto error;

             memset(t->cf, 0, 32 * 32 * sizeof(int32_t));

             t->emu_edge =

-                dav1d_alloc_aligned(192 * (128 + 7) * sizeof(uint16_t), 32);

+                dav1d_alloc_aligned(320 * (256 + 7) * sizeof(uint16_t), 32);

             if (!t->emu_edge) goto error;

             if (f->n_tc > 1) {

                 pthread_mutex_init(&t->tile_thread.td.lock, NULL);

--- a/src/mc.h

+++ b/src/mc.h

@@ -41,6 +41,12 @@

             int w, int h, int mx, int my)

 typedef decl_mc_fn(*mc_fn);

+// Prototype for a "put" motion-compensation function that reads from a

+// scaled reference: writes pixels to dst and takes dx/dy in addition to

+// w, h, mx and my. The mc() hunk in recon_tmpl.c passes the low 10 bits of

+// the scaled position as mx/my and the per-axis svc step as dx/dy.

+#define decl_mc_scaled_fn(name) \

+void (name)(pixel *dst, ptrdiff_t dst_stride, \

+            const pixel *src, ptrdiff_t src_stride, \

+            int w, int h, int mx, int my, int dx, int dy)

+typedef decl_mc_scaled_fn(*mc_scaled_fn);

 #define decl_warp8x8_fn(name) \

 void (name)(pixel *dst, ptrdiff_t dst_stride, \

             const pixel *src, ptrdiff_t src_stride, \

@@ -52,6 +58,11 @@

             int w, int h, int mx, int my)

 typedef decl_mct_fn(*mct_fn);

+// Prototype for a "prep" motion-compensation function that reads from a

+// scaled reference: writes coef values to tmp (no destination stride

+// argument) and takes dx/dy in addition to w, h, mx and my.

+#define decl_mct_scaled_fn(name) \

+void (name)(coef *tmp, const pixel *src, ptrdiff_t src_stride, \

+            int w, int h, int mx, int my, int dx, int dy)

+typedef decl_mct_scaled_fn(*mct_scaled_fn);

 #define decl_warp8x8t_fn(name) \

 void (name)(coef *tmp, const ptrdiff_t tmp_stride, \

             const pixel *src, ptrdiff_t src_stride, \

@@ -96,7 +107,9 @@

 typedef struct Dav1dMCDSPContext {

     mc_fn mc[N_2D_FILTERS];

+    mc_scaled_fn mc_scaled[N_2D_FILTERS];

     mct_fn mct[N_2D_FILTERS];

+    mct_scaled_fn mct_scaled[N_2D_FILTERS];

     avg_fn avg;

     w_avg_fn w_avg;

     mask_fn mask;

--- a/src/mc_tmpl.c

+++ b/src/mc_tmpl.c

@@ -78,13 +78,19 @@

 #define FILTER_8TAP_CLIP(src, x, F, stride, sh) \

     iclip_pixel(FILTER_8TAP_RND(src, x, F, stride, sh))

+// Declares a local `fh`: NULL when mx is 0, otherwise row (mx - 1) of a

+// dav1d_mc_subpel_filters table. Uses `w` and `filter_type` from the

+// expansion site: w > 4 picks table (filter_type & 3), otherwise table

+// 3 + (filter_type & 1). The expansion carries no trailing semicolon.

+#define GET_H_FILTER(mx) \

+    const int8_t *const fh = !(mx) ? NULL : w > 4 ? \

+        dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \

+        dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1]

+// Vertical counterpart: declares a local `fv` from `h` and the bits of

+// `filter_type` above the low two (filter_type >> 2); NULL when my is 0.

+#define GET_V_FILTER(my) \

+    const int8_t *const fv = !(my) ? NULL : h > 4 ? \

+        dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \

+        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1]

 #define GET_FILTERS() \

-    const int8_t *const fh = !mx ? NULL : w > 4 ? \

-        dav1d_mc_subpel_filters[filter_type & 3][mx - 1] : \

-        dav1d_mc_subpel_filters[3 + (filter_type & 1)][mx - 1]; \

-    const int8_t *const fv = !my ? NULL : h > 4 ? \

-        dav1d_mc_subpel_filters[filter_type >> 2][my - 1] : \

-        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][my - 1]; \

+    GET_H_FILTER(mx); \

+    GET_V_FILTER(my)

 static NOINLINE void

 put_8tap_c(pixel *dst, ptrdiff_t dst_stride,

@@ -142,6 +148,48 @@

 static NOINLINE void

+put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,

+                  const pixel *src, const ptrdiff_t src_stride,

+                  const int w, int h, const int mx, int my,

+                  const int dx, const int dy, const int filter_type)

+{

+    int tmp_h = (((h - 1) * dy + my) >> 10) + 8;

+    coef mid[128 * (256 + 7)], *mid_ptr = mid;

+    src -= src_stride * 3;

+    do {

+        int x;

+        int imx = mx, ioff = 0;

+        for (x = 0; x < w; x++) {

+            GET_H_FILTER(imx >> 6);

+            mid_ptr[x] = fh ? FILTER_8TAP_RND(src, ioff, fh, 1, 2) : src[ioff] << 4;

+            imx += dx;

+            ioff += imx >> 10;

+            imx &= 0x3ff;

+        }

+        mid_ptr += 128;

+        src += PXSTRIDE(src_stride);

+    } while (--tmp_h);

+    mid_ptr = mid + 128 * 3;

+    for (int y = 0; y < h; y++) {

+        int x;

+        GET_V_FILTER(my >> 6);

+        for (x = 0; x < w; x++)

+            dst[x] = fv ? FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10) :

+                          (mid_ptr[x] + 8) >> 4;

+        my += dy;

+        mid_ptr += (my >> 10) * 128;

+        my &= 0x3ff;

+        dst += PXSTRIDE(dst_stride);

+    }

+}

+static NOINLINE void

 prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,

             const int w, int h, const int mx, const int my,

             const int filter_type)

@@ -192,6 +240,46 @@

         prep_c(tmp, src, src_stride, w, h);

+// Scaled 8-tap "prep": resamples a w x h block from src with a separable

+// filter and writes intermediate-precision coef values to tmp (row stride

+// w). Unlike the "put" variant, the output is not reduced to pixel range.

+//

+// mx/my are the starting sub-sample positions and dx/dy the per-output-sample

+// steps, all in 1/1024 sample units (>> 10 yields the whole-sample advance,

+// & 0x3ff keeps the fraction, >> 6 yields the filter index). src_stride is

+// passed through PXSTRIDE() before being used as a pixel offset. src is

+// rewound by 3 rows here, so the caller must make 3 rows above src readable.

+// h must be at least 1 (the first pass is a do-while over tmp_h).

+static NOINLINE void

+prep_8tap_scaled_c(coef *tmp, const pixel *src, const ptrdiff_t src_stride,

+                   const int w, int h, const int mx, int my,

+                   const int dx, const int dy, const int filter_type)

+{

+    // Source rows consumed: whole-sample distance from the first to the last

+    // output row, plus 8 for the vertical filter footprint.

+    int tmp_h = (((h - 1) * dy + my) >> 10) + 8;

+    // Intermediate rows use a fixed stride of 128 entries.

+    coef mid[128 * (256 + 7)], *mid_ptr = mid;

+    // Rewind by 3 rows using the same PXSTRIDE() conversion as the row

+    // advance below; the raw stride would rewind by the wrong amount

+    // whenever PXSTRIDE() is not the identity.

+    src -= PXSTRIDE(src_stride) * 3;

+    // Pass 1: horizontal resampling of every needed source row into mid.

+    do {

+        int x;

+        int imx = mx, ioff = 0;

+        for (x = 0; x < w; x++) {

+            GET_H_FILTER(imx >> 6);

+            mid_ptr[x] = fh ? FILTER_8TAP_RND(src, ioff, fh, 1, 2) : src[ioff] << 4;

+            imx += dx;

+            ioff += imx >> 10;

+            imx &= 0x3ff;

+        }

+        mid_ptr += 128;

+        src += PXSTRIDE(src_stride);

+    } while (--tmp_h);

+    // Pass 2: vertical resampling, starting at the row that corresponds to

+    // the original src pointer (3 rows into mid).

+    mid_ptr = mid + 128 * 3;

+    for (int y = 0; y < h; y++) {

+        int x;

+        GET_V_FILTER(my >> 6);

+        // Round only: the output keeps the same scaled precision as the

+        // unfiltered branch (mid_ptr[x]), so it must not be clipped to the

+        // pixel range the way FILTER_8TAP_CLIP would.

+        for (x = 0; x < w; x++)

+            tmp[x] = fv ? FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) : mid_ptr[x];

+        my += dy;

+        mid_ptr += (my >> 10) * 128;

+        my &= 0x3ff;

+        tmp += w;

+    }

+}

 #define filter_fns(type, type_h, type_v) \

 static void put_8tap_##type##_c(pixel *const dst, \

                                 const ptrdiff_t dst_stride, \

@@ -203,6 +291,17 @@

     put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \

                type_h | (type_v << 2)); \

} \

+static void put_8tap_##type##_scaled_c(pixel *const dst, \

+                                       const ptrdiff_t dst_stride, \

+                                       const pixel *const src, \

+                                       const ptrdiff_t src_stride, \

+                                       const int w, const int h, \

+                                       const int mx, const int my, \

+                                       const int dx, const int dy) \

+{ \

+    put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \

+                      type_h | (type_v << 2)); \

+} \

 static void prep_8tap_##type##_c(coef *const tmp, \

                                  const pixel *const src, \

                                  const ptrdiff_t src_stride, \

@@ -211,6 +310,16 @@

{ \

     prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \

                 type_h | (type_v << 2)); \

+} \

+static void prep_8tap_##type##_scaled_c(coef *const tmp, \

+                                        const pixel *const src, \

+                                        const ptrdiff_t src_stride, \

+                                        const int w, const int h, \

+                                        const int mx, const int my, \

+                                        const int dx, const int dy) \

+{ \

+    prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \

+                       type_h | (type_v << 2)); \

 filter_fns(regular,        FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR)

@@ -281,6 +390,43 @@

         put_c(dst, dst_stride, src, src_stride, w, h);

+// Scaled bilinear "put": resamples a w x h block from src in two passes

+// (horizontal into `mid`, then vertical) and writes the result to dst.

+// mx/my are starting sub-sample positions and dx/dy per-output-sample steps

+// in 1/1024 sample units (>> 10 is the whole-sample advance, & 0x3ff keeps

+// the fraction, >> 6 is the weight handed to the FILTER_BILIN* macros).

+// Both loops are do-while, so h must be at least 1.

+// NOTE(review): FILTER_BILIN / FILTER_BILIN_CLIP are defined outside this

+// hunk; presumably a 2-tap blend of a sample and its next neighbour.

+static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,

+                               const pixel *src, ptrdiff_t src_stride,

+                               const int w, int h, const int mx, int my,

+                               const int dx, const int dy)

+{

+    // Source rows consumed: whole-sample distance to the last output row,

+    // plus 2.

+    int tmp_h = (((h - 1) * dy + my) >> 10) + 2;

+    // Intermediate rows use a fixed stride of 128 entries.

+    coef mid[128 * (256 + 1)], *mid_ptr = mid;

+    // Pass 1: horizontal resampling of each needed source row.

+    do {

+        int x;

+        int imx = mx, ioff = 0;

+        for (x = 0; x < w; x++) {

+            mid_ptr[x] = FILTER_BILIN(src, ioff, imx >> 6, 1);

+            imx += dx;

+            ioff += imx >> 10;

+            imx &= 0x3ff;

+        }

+        mid_ptr += 128;

+        src += PXSTRIDE(src_stride);

+    } while (--tmp_h);

+    // Pass 2: vertical resampling from mid, stepping whole rows by my >> 10.

+    mid_ptr = mid;

+    do {

+        int x;

+        for (x = 0; x < w; x++)

+            dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my >> 6, 128, 8);

+        my += dy;

+        mid_ptr += (my >> 10) * 128;

+        my &= 0x3ff;

+        dst += PXSTRIDE(dst_stride);

+    } while (--h);

+}

 static void prep_bilin_c(coef *tmp,

                          const pixel *src, ptrdiff_t src_stride,

                          const int w, int h, const int mx, const int my)

@@ -329,6 +475,43 @@

         prep_c(tmp, src, src_stride, w, h);

+// Scaled bilinear "prep": resamples a w x h block from src in two passes

+// (horizontal into `mid`, then vertical) and writes coef values to tmp with

+// a row stride of w. mx/my are starting sub-sample positions and dx/dy

+// per-output-sample steps in 1/1024 sample units (>> 10 is the whole-sample

+// advance, & 0x3ff keeps the fraction, >> 6 is the weight handed to the

+// FILTER_BILIN* macros). Both loops are do-while, so h must be at least 1.

+// NOTE(review): FILTER_BILIN / FILTER_BILIN_RND are defined outside this

+// hunk; presumably a 2-tap blend of a sample and its next neighbour.

+static void prep_bilin_scaled_c(coef *tmp,

+                                const pixel *src, ptrdiff_t src_stride,

+                                const int w, int h, const int mx, int my,

+                                const int dx, const int dy)

+{

+    // Source rows consumed: whole-sample distance to the last output row,

+    // plus 2.

+    int tmp_h = (((h - 1) * dy + my) >> 10) + 2;

+    // Intermediate rows use a fixed stride of 128 entries.

+    coef mid[128 * (256 + 1)], *mid_ptr = mid;

+    // Pass 1: horizontal resampling of each needed source row.

+    do {

+        int x;

+        int imx = mx, ioff = 0;

+        for (x = 0; x < w; x++) {

+            mid_ptr[x] = FILTER_BILIN(src, ioff, imx >> 6, 1);

+            imx += dx;

+            ioff += imx >> 10;

+            imx &= 0x3ff;

+        }

+        mid_ptr += 128;

+        src += PXSTRIDE(src_stride);

+    } while (--tmp_h);

+    // Pass 2: vertical resampling from mid; rounded, not clipped.

+    mid_ptr = mid;

+    do {

+        int x;

+        for (x = 0; x < w; x++)

+            tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4);

+        my += dy;

+        mid_ptr += (my >> 10) * 128;

+        my &= 0x3ff;

+        tmp += w;

+    } while (--h);

+}

 static void avg_c(pixel *dst, const ptrdiff_t dst_stride,

                   const coef *tmp1, const coef *tmp2, const int w, int h)

@@ -599,8 +782,10 @@

 void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {

 #define init_mc_fns(type, name) do { \

-    c->mc [type] = put_##name##_c; \

-    c->mct[type] = prep_##name##_c; \

+    c->mc        [type] = put_##name##_c; \

+    c->mc_scaled [type] = put_##name##_scaled_c; \

+    c->mct       [type] = prep_##name##_c; \

+    c->mct_scaled[type] = prep_##name##_scaled_c; \

 } while (0)

     init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);

--- a/src/recon_tmpl.c

+++ b/src/recon_tmpl.c

@@ -496,7 +496,7 @@

               pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,

               const int bw4, const int bh4,

               const int bx, const int by, const int pl,

-              const mv mv, const Dav1dThreadPicture *const refp,

+              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,

               const enum Filter2d filter_2d)

     assert((dst8 != NULL) ^ (dst16 != NULL));

@@ -506,45 +506,98 @@

     const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;

     const int mvx = mv.x, mvy = mv.y;

     const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);

-    const int dx = bx * h_mul + (mvx >> (3 + ss_hor));

-    const int dy = by * v_mul + (mvy >> (3 + ss_ver));

     ptrdiff_t ref_stride = refp->p.stride[!!pl];

     const pixel *ref;

-    int w, h;

-    if (refp != &f->cur) { // i.e. not for intrabc

-        if (dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,

-                                      PLANE_TYPE_Y + !!pl))

+    if (refp->p.p.w == f->cur.p.p.w && refp->p.p.h == f->cur.p.p.h) {

+        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));

+        const int dy = by * v_mul + (mvy >> (3 + ss_ver));

+        int w, h;

+        if (refp != &f->cur) { // i.e. not for intrabc

+            if (dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,

+                                          PLANE_TYPE_Y + !!pl))

+            {

+                return -1;

+            }

+            w = (f->cur.p.p.w + ss_hor) >> ss_hor;

+            h = (f->cur.p.p.h + ss_ver) >> ss_ver;

+        } else {

+            w = f->bw * 4 >> ss_hor;

+            h = f->bh * 4 >> ss_ver;

+        }

+        if (dx < !!mx * 3 || dy < !!my * 3 ||

+            dx + bw4 * h_mul + !!mx * 4 > w ||

+            dy + bh4 * v_mul + !!my * 4 > h)

-            return -1;

+            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,

+                                w, h, dx - !!mx * 3, dy - !!my * 3,

+                                t->emu_edge, 192 * sizeof(pixel),

+                                refp->p.data[pl], ref_stride);

+            ref = &t->emu_edge[192 * !!my * 3 + !!mx * 3];

+            ref_stride = 192 * sizeof(pixel);

+        } else {

+            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;

-        w = (f->cur.p.p.w + ss_hor) >> ss_hor;

-        h = (f->cur.p.p.h + ss_ver) >> ss_ver;

-    } else {

-        w = f->bw * 4 >> ss_hor;

-        h = f->bh * 4 >> ss_ver;

-    }

-    if (dx < !!mx * 3 || dy < !!my * 3 ||

-        dx + bw4 * h_mul + !!mx * 4 > w ||

-        dy + bh4 * v_mul + !!my * 4 > h)

-    {

-        f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7,

-                            w, h, dx - !!mx * 3, dy - !!my * 3,

-                            t->emu_edge, 192 * sizeof(pixel),

-                            refp->p.data[pl], ref_stride);

-        ref = &t->emu_edge[192 * !!my * 3 + !!mx * 3];

-        ref_stride = 192 * sizeof(pixel);

-    } else {

-        ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;

-    }

-    if (dst8 != NULL) {

-        f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,

-                                 bh4 * v_mul, mx << !ss_hor, my << !ss_ver);

+        if (dst8 != NULL) {

+            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,

+                                     bh4 * v_mul, mx << !ss_hor, my << !ss_ver);

+        } else {

+            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,

+                                      bh4 * v_mul, mx << !ss_hor, my << !ss_ver);

+        }

     } else {

-        f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,

-                                  bh4 * v_mul, mx << !ss_hor, my << !ss_ver);

+        assert(refp != &f->cur);

+        int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);

+        int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);

+#define scale_mv(res, val, scale) do { \

+            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \

+            res = (int)apply_sign64((llabs(tmp) + 128) >> 8, tmp) + 32; \

+        } while (0)

+        int pos_y, pos_x;

+        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);

+        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);

+#undef scale_mv

+        const int left = pos_x >> 10;

+        const int top = pos_y >> 10;

+        const int right =

+            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;

+        const int bottom =

+            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;

+        if (dav1d_thread_picture_wait(refp, bottom, PLANE_TYPE_Y + !!pl))

+            return -1;

+        const int w = (refp->p.p.w + ss_hor) >> ss_hor;

+        const int h = (refp->p.p.h + ss_ver) >> ss_ver;

+        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) {

+            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,

+                                w, h, left - 3, top - 3,

+                                t->emu_edge, 320 * sizeof(pixel),

+                                refp->p.data[pl], ref_stride);

+            ref = &t->emu_edge[320 * 3 + 3];

+            ref_stride = 320 * sizeof(pixel);

+        } else {

+            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;

+        }

+        if (dst8 != NULL) {

+            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,

+                                            bw4 * h_mul, bh4 * v_mul,

+                                            pos_x & 0x3ff, pos_y & 0x3ff,

+                                            f->svc[refidx][0].step,

+                                            f->svc[refidx][1].step);

+        } else {

+            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,

+                                             bw4 * h_mul, bh4 * v_mul,

+                                             pos_x & 0x3ff, pos_y & 0x3ff,

+                                             f->svc[refidx][0].step,

+                                             f->svc[refidx][1].step);

+        }

     return 0;

@@ -576,7 +629,7 @@

                 const int oh4 = imin(b_dim[1], 16) >> 1;

                 res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, oh4,

                          t->bx + x, t->by, pl, a_r->mv[0],

-                         &f->refp[a_r->ref[0] - 1],

+                         &f->refp[a_r->ref[0] - 1], a_r->ref[0] - 1,

                          dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);

                 if (res) return res;

                 f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,

@@ -599,7 +652,7 @@

                 const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);

                 res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,

                          t->bx, t->by + y, pl, l_r->mv[0],

-                         &f->refp[l_r->ref[0] - 1],

+                         &f->refp[l_r->ref[0] - 1], l_r->ref[0] - 1,

                          dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);

                 if (res) return res;

                 f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],

@@ -1091,13 +1144,13 @@

     if (!(f->frame_hdr.frame_type & 1)) {

         // intrabc

         res = mc(t, dst, NULL, f->cur.p.stride[0],

-                 bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, FILTER_2D_BILINEAR);

+                 bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, -1, FILTER_2D_BILINEAR);

         if (res) return res;

         if (has_chroma) for (int pl = 1; pl < 3; pl++) {

             res = mc(t, ((pixel *)f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],

                      bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),

                      t->bx & ~ss_hor, t->by & ~ss_ver,

-                     pl, b->mv[0], &f->cur, FILTER_2D_BILINEAR);

+                     pl, b->mv[0], &f->cur, -1, FILTER_2D_BILINEAR);

             if (res) return res;

     } else if (b->comp_type == COMP_INTER_NONE) {

@@ -1116,7 +1169,7 @@

             if (res) return res;

         } else {

             res = mc(t, dst, NULL, f->cur.p.stride[0],

-                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, filter_2d);

+                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);

             if (res) return res;

             if (b->motion_mode == MM_OBMC) {

                 res = obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);

@@ -1176,6 +1229,7 @@

                              bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,

                              r[-(f->b4_stride + 1)].mv[0],

                              &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],

+                             r[-(f->b4_stride + 1)].ref[0] - 1,

                              f->frame_thread.pass != 2 ? t->tl_4x4_filter :

                                  f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);

                     if (res) return res;

@@ -1190,6 +1244,7 @@

                     res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,

                              f->cur.p.stride[1], bw4, bh4, t->bx - 1,

                              t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],

+                             r[-1].ref[0] - 1,

                              f->frame_thread.pass != 2 ? left_filter_2d :

                                  f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);

                     if (res) return res;

@@ -1204,6 +1259,7 @@

                              f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,

                              1 + pl, r[-f->b4_stride].mv[0],

                              &f->refp[r[-f->b4_stride].ref[0] - 1],

+                             r[-f->b4_stride].ref[0] - 1,

                              f->frame_thread.pass != 2 ? top_filter_2d :

                                  f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);

                     if (res) return res;

@@ -1212,7 +1268,8 @@

             for (int pl = 0; pl < 2; pl++) {

                 res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],

-                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], refp, filter_2d);

+                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],

+                         refp, b->ref[0], filter_2d);

                 if (res) return res;

         } else {

@@ -1235,7 +1292,7 @@

                              NULL, f->cur.p.stride[1],

                              bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),

                              t->bx & ~ss_hor, t->by & ~ss_ver,

-                             1 + pl, b->mv[0], refp, filter_2d);

+                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);

                     if (res) return res;

                     if (b->motion_mode == MM_OBMC) {

                         res = obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,

@@ -1307,7 +1364,7 @@

                 if (res) return res;

             } else {

                 res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,

-                         b->mv[i], refp, filter_2d);

+                         b->mv[i], refp, b->ref[i], filter_2d);

                 if (res) return res;

@@ -1350,7 +1407,7 @@

                     if (res) return res;

                 } else {

                     res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,

-                             1 + pl, b->mv[i], refp, filter_2d);

+                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);

                     if (res) return res;

--- a/src/warpmv.c

+++ b/src/warpmv.c

@@ -67,10 +67,6 @@

     return apply_sign((abs(cv) + 32) >> 6, cv) * (1 << 6);

-static inline int apply_sign64(const int v, const int64_t s) {

-    return s < 0 ? -v : v;

-}

 static inline int resolve_divisor_32(const unsigned d, int *const shift) {

     *shift = ulog2(d);

     const int e = d - (1 << *shift);