shithub: dav1d

--- a/src/cdef_apply_tmpl.c

+++ b/src/cdef_apply_tmpl.c

@@ -109,7 +109,7 @@

         if (edges & CDEF_HAVE_BOTTOM) {

             // backup pre-filter data for next iteration

-            backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.stride,

+            backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride,

                          8, f->bw * 4, layout);

@@ -173,8 +173,8 @@

                 if (y_lvl) {

                     dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],

                                     (pixel *const [2]) {

-                                        &f->lf.cdef_line_ptr[tf][0][0][bx * 4],

-                                        &f->lf.cdef_line_ptr[tf][0][1][bx * 4],

+                                        &f->lf.cdef_line[tf][0][0][bx * 4],

+                                        &f->lf.cdef_line[tf][0][1][bx * 4],

},

                                     adjust_strength(y_pri_lvl, variance),

                                     y_sec_lvl, y_pri_lvl ? dir : 0,

@@ -188,8 +188,8 @@

                         dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],

                                              lr_bak[bit][pl],

                                              (pixel *const [2]) {

-                                                 &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],

-                                                 &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],

+                                                 &f->lf.cdef_line[tf][pl][0][bx * 4 >> ss_hor],

+                                                 &f->lf.cdef_line[tf][pl][1][bx * 4 >> ss_hor],

},

                                              uv_pri_lvl, uv_sec_lvl,

                                              uv_pri_lvl ? uvdir : 0,

--- a/src/decode.c

+++ b/src/decode.c

@@ -2289,6 +2289,14 @@

     memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz));

+// per-layout plane-size multipliers, scaled by 4 (luma alone == 4): { Y+U+V, Y+U }

+static const uint8_t ss_size_mul[4][2] = {

+    [DAV1D_PIXEL_LAYOUT_I400] = {  4, 4 },

+    [DAV1D_PIXEL_LAYOUT_I420] = {  6, 5 },

+    [DAV1D_PIXEL_LAYOUT_I422] = {  8, 6 },

+    [DAV1D_PIXEL_LAYOUT_I444] = { 12, 8 },

+};

 static void setup_tile(Dav1dTileState *const ts,

                        const Dav1dFrameContext *const f,

                        const uint8_t *const data, const size_t sz,

@@ -2302,8 +2310,11 @@

     const int row_sb_end = f->frame_hdr->tiling.row_start_sb[tile_row + 1];

     const int sb_shift = f->sb_shift;

-    ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * 2];

-    ts->frame_thread.cf = &((int32_t *) f->frame_thread.cf)[tile_start_off * 3];

+    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];

+    ts->frame_thread.pal_idx =

+        &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4];

+    ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +

+        (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);

     dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);

     ts->last_qidx = f->frame_hdr->quant.yac;

     memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));

@@ -2573,13 +2584,14 @@

     int retval = DAV1D_ERR(ENOMEM);

     if (f->n_tc > 1) {

-        if (f->frame_hdr->tiling.cols * f->sbh > f->tile_thread.titsati_sz) {

+        const int titsati_sz = f->frame_hdr->tiling.cols * f->sbh;

+        if (titsati_sz != f->tile_thread.titsati_sz) {

             freep(&f->tile_thread.task_idx_to_sby_and_tile_idx);

             f->tile_thread.task_idx_to_sby_and_tile_idx =

                 malloc(sizeof(*f->tile_thread.task_idx_to_sby_and_tile_idx) *

-                       f->frame_hdr->tiling.cols * f->sbh);

+                       titsati_sz);

             if (!f->tile_thread.task_idx_to_sby_and_tile_idx) goto error;

-            f->tile_thread.titsati_sz = f->frame_hdr->tiling.cols * f->sbh;

+            f->tile_thread.titsati_sz = titsati_sz;

         if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols ||

             f->tile_thread.titsati_init[1] != f->sbh ||

@@ -2606,31 +2618,51 @@

-    if (f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows > f->n_ts) {

-        f->ts = realloc(f->ts, f->frame_hdr->tiling.cols *

-                               f->frame_hdr->tiling.rows * sizeof(*f->ts));

-        if (!f->ts) goto error;

-        for (int n = f->n_ts;

-             n < f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; n++)

-        {

-            Dav1dTileState *const ts = &f->ts[n];

-            if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error;

-            if (pthread_cond_init(&ts->tile_thread.cond, NULL)) {

+    const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;

+    if (n_ts != f->n_ts) {

+        if (n_ts > f->n_ts) {

+            Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);

+            if (!ts_new) goto error;

+            f->ts = ts_new;

+            for (int n = f->n_ts; n < n_ts; n++) {

+                Dav1dTileState *const ts = &f->ts[n];

+                if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error;

+                if (pthread_cond_init(&ts->tile_thread.cond, NULL)) {

+                    pthread_mutex_destroy(&ts->tile_thread.lock);

+                    goto error;

+                }

+            }

+            f->n_ts = n_ts;

+        } else {

+            for (int n = n_ts; n < f->n_ts; n++) {

+                Dav1dTileState *const ts = &f->ts[n];

+                pthread_cond_destroy(&ts->tile_thread.cond);

                 pthread_mutex_destroy(&ts->tile_thread.lock);

-                goto error;

-            f->n_ts = n + 1;

+            f->n_ts = n_ts;

+            Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);

+            if (!ts_new) goto error;

+            f->ts = ts_new;

         if (c->n_fc > 1) {

             freep(&f->frame_thread.tile_start_off);

             f->frame_thread.tile_start_off =

-                malloc(sizeof(*f->frame_thread.tile_start_off) *

-                       f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows);

+                malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts);

             if (!f->frame_thread.tile_start_off) goto error;

-        f->n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;

+    const int a_sz = f->sb128w * f->frame_hdr->tiling.rows;

+    if (a_sz != f->a_sz) {

+        freep(&f->a);

+        f->a = malloc(sizeof(*f->a) * a_sz);

+        if (!f->a) goto error;

+        f->a_sz = a_sz;

+    }

+    const int num_sb128 = f->sb128w * f->sb128h;

+    const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];

+    const int hbd = !!f->seq_hdr->hbd;

     if (c->n_fc > 1) {

         int tile_idx = 0;

         for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {

@@ -2643,100 +2675,105 @@

                     f->frame_hdr->tiling.col_start_sb[tile_col] * f->sb_step * 4;

-    }

-    if (f->sb128w * f->frame_hdr->tiling.rows > f->a_sz) {

-        freep(&f->a);

-        f->a = malloc(f->sb128w * f->frame_hdr->tiling.rows * sizeof(*f->a));

-        if (!f->a) goto error;

-        f->a_sz = f->sb128w * f->frame_hdr->tiling.rows;

+        const int cf_sz = (num_sb128 * size_mul[0]) << hbd;

+        if (cf_sz != f->frame_thread.cf_sz) {

+            dav1d_freep_aligned(&f->frame_thread.cf);

+            f->frame_thread.cf =

+                dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 32);

+            if (!f->frame_thread.cf) goto error;

+            memset(f->frame_thread.cf, 0, (size_t)cf_sz * 128 * 128 / 2);

+            f->frame_thread.cf_sz = cf_sz;

+        }

+        if (f->frame_hdr->allow_screen_content_tools) {

+            if (num_sb128 != f->frame_thread.pal_sz) {

+                dav1d_freep_aligned(&f->frame_thread.pal);

+                f->frame_thread.pal =

+                    dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *

+                                        num_sb128 * 16 * 16, 32);

+                if (!f->frame_thread.pal)

+                    goto error;

+                f->frame_thread.pal_sz = num_sb128;

+            }

+            const int pal_idx_sz = num_sb128 * size_mul[1];

+            if (pal_idx_sz != f->frame_thread.pal_idx_sz) {

+                dav1d_freep_aligned(&f->frame_thread.pal_idx);

+                f->frame_thread.pal_idx =

+                    dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *

+                                        pal_idx_sz * 128 * 128 / 4, 32);

+                if (!f->frame_thread.pal_idx)

+                    goto error;

+                f->frame_thread.pal_idx_sz = pal_idx_sz;

+            }

+        } else if (f->frame_thread.pal) {

+            dav1d_freep_aligned(&f->frame_thread.pal);

+            dav1d_freep_aligned(&f->frame_thread.pal_idx);

+            f->frame_thread.pal_sz = f->frame_thread.pal_idx_sz = 0;

+        }

     // update allocation of block contexts for above

-    if (f->sb128w > f->lf.line_sz) {

-        dav1d_freep_aligned(&f->lf.cdef_line);

-        // note that we allocate all pixel arrays as if we were dealing with

-        // 10 bits/component data

-        uint16_t *ptr = f->lf.cdef_line =

-            dav1d_alloc_aligned(f->b4_stride * 4 * 12 * sizeof(uint16_t), 32);

+    const int line_sz = (int)f->b4_stride << hbd;

+    if (line_sz != f->lf.line_sz) {

+        dav1d_freep_aligned(&f->lf.cdef_line[0][0][0]);

+        uint8_t *ptr = dav1d_alloc_aligned(line_sz * 4 * 12, 32);

         if (!ptr) goto error;

         for (int pl = 0; pl <= 2; pl++) {

-            f->lf.cdef_line_ptr[0][pl][0] = ptr + f->b4_stride * 4 * 0;

-            f->lf.cdef_line_ptr[0][pl][1] = ptr + f->b4_stride * 4 * 1;

-            f->lf.cdef_line_ptr[1][pl][0] = ptr + f->b4_stride * 4 * 2;

-            f->lf.cdef_line_ptr[1][pl][1] = ptr + f->b4_stride * 4 * 3;

-            ptr += f->b4_stride * 4 * 4;

+            f->lf.cdef_line[0][pl][0] = ptr + line_sz * 4 * 0;

+            f->lf.cdef_line[0][pl][1] = ptr + line_sz * 4 * 1;

+            f->lf.cdef_line[1][pl][0] = ptr + line_sz * 4 * 2;

+            f->lf.cdef_line[1][pl][1] = ptr + line_sz * 4 * 3;

+            ptr += line_sz * 4 * 4;

-        f->lf.line_sz = f->sb128w;

+        f->lf.line_sz = line_sz;

-    const ptrdiff_t lr_stride = (f->sr_cur.p.p.w + 31) & ~31;

-    if (lr_stride > f->lf.lr_line_sz) {

-        dav1d_freep_aligned(&f->lf.lr_lpf_line);

-        uint16_t *lr_ptr = f->lf.lr_lpf_line =

-            dav1d_alloc_aligned(lr_stride * 3 * 12 * sizeof(uint16_t), 32);

+    const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;

+    if (lr_line_sz != f->lf.lr_line_sz) {

+        dav1d_freep_aligned(&f->lf.lr_lpf_line[0]);

+        uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32);

         if (!lr_ptr) goto error;

         for (int pl = 0; pl <= 2; pl++) {

-            f->lf.lr_lpf_line_ptr[pl] = lr_ptr;

-            lr_ptr += lr_stride * 12;

+            f->lf.lr_lpf_line[pl] = lr_ptr;

+            lr_ptr += lr_line_sz * 12;

-        f->lf.lr_line_sz = (int) lr_stride;

+        f->lf.lr_line_sz = lr_line_sz;

     // update allocation for loopfilter masks

-    if (f->sb128w * f->sb128h > f->lf.mask_sz) {

+    if (num_sb128 != f->lf.mask_sz) {

         freep(&f->lf.mask);

         freep(&f->lf.level);

-        freep(&f->frame_thread.b);

-        f->lf.mask = malloc(f->sb128w * f->sb128h * sizeof(*f->lf.mask));

+        f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128);

         // over-allocate by 3 bytes since some of the SIMD implementations

         // index this from the level type and can thus over-read by up to 3

-        f->lf.level = malloc(3 + f->sb128w * f->sb128h * 32 * 32 *

-                             sizeof(*f->lf.level));

+        f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3);

         if (!f->lf.mask || !f->lf.level) goto error;

         if (c->n_fc > 1) {

             freep(&f->frame_thread.b);

             freep(&f->frame_thread.cbi);

-            dav1d_freep_aligned(&f->frame_thread.cf);

-            dav1d_freep_aligned(&f->frame_thread.pal_idx);

-            dav1d_freep_aligned(&f->frame_thread.pal);

             f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *

-                                       f->sb128w * f->sb128h * 32 * 32);

-            f->frame_thread.pal =

-                dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *

-                                    f->sb128w * f->sb128h * 16 * 16, 32);

-            f->frame_thread.pal_idx =

-                dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *

-                                    f->sb128w * f->sb128h * 128 * 128 * 2, 32);

+                                       num_sb128 * 32 * 32);

             f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *

-                                         f->sb128w * f->sb128h * 32 * 32);

-            f->frame_thread.cf =

-                dav1d_alloc_aligned(sizeof(int32_t) * 3 *

-                                    f->sb128w * f->sb128h * 128 * 128, 32);

-            if (!f->frame_thread.b || !f->frame_thread.pal_idx ||

-                !f->frame_thread.pal || !f->frame_thread.cbi ||

-                !f->frame_thread.cf)

-            {

-                goto error;

-            }

-            memset(f->frame_thread.cf, 0,

-                   sizeof(int32_t) * 3 * f->sb128w * f->sb128h * 128 * 128);

+                                         num_sb128 * 32 * 32);

+            if (!f->frame_thread.b || !f->frame_thread.cbi) goto error;

-        f->lf.mask_sz = f->sb128w * f->sb128h;

+        f->lf.mask_sz = num_sb128;

     f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;

-    if (f->sr_sb128w * f->sb128h > f->lf.lr_mask_sz) {

+    const int lr_mask_sz = f->sr_sb128w * f->sb128h;

+    if (lr_mask_sz != f->lf.lr_mask_sz) {

         freep(&f->lf.lr_mask);

-        f->lf.lr_mask = malloc(f->sr_sb128w * f->sb128h * sizeof(*f->lf.lr_mask));

+        f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz);

         if (!f->lf.lr_mask) goto error;

-        f->lf.lr_mask_sz = f->sr_sb128w * f->sb128h;

+        f->lf.lr_mask_sz = lr_mask_sz;

     if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {

         dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);

@@ -2743,26 +2780,26 @@

         f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;

     dav1d_calc_lf_values(f->lf.lvl, f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 });

-    memset(f->lf.mask, 0, sizeof(*f->lf.mask) * f->sb128w * f->sb128h);

+    memset(f->lf.mask, 0, sizeof(*f->lf.mask) * num_sb128);

-    if (f->sbh * f->sb128w * 128 > f->ipred_edge_sz) {

+    const int ipred_edge_sz = f->sbh * f->sb128w << hbd;

+    if (ipred_edge_sz != f->ipred_edge_sz) {

         dav1d_freep_aligned(&f->ipred_edge[0]);

-        uint16_t *ptr = f->ipred_edge[0] =

-            dav1d_alloc_aligned(f->sb128w * 128 * f->sbh * 3 * sizeof(uint16_t), 32);

-        if (!f->ipred_edge[0]) goto error;

-        f->ipred_edge_sz = f->sbh * f->sb128w * 128;

-        f->ipred_edge[1] = &ptr[f->ipred_edge_sz];

-        f->ipred_edge[2] = &ptr[f->ipred_edge_sz * 2];

+        uint8_t *ptr = f->ipred_edge[0] =

+            dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 32);

+        if (!ptr) goto error;

+        f->ipred_edge[1] = ptr + ipred_edge_sz * 128 * 1;

+        f->ipred_edge[2] = ptr + ipred_edge_sz * 128 * 2;

+        f->ipred_edge_sz = ipred_edge_sz;

-    if (f->sb128h * f->frame_hdr->tiling.cols > f->lf.re_sz) {

+    const int re_sz = f->sb128h * f->frame_hdr->tiling.cols;

+    if (re_sz != f->lf.re_sz) {

         freep(&f->lf.tx_lpf_right_edge[0]);

-        f->lf.tx_lpf_right_edge[0] = malloc((f->sb128h * 32 * 2) *

-                                            f->frame_hdr->tiling.cols);

+        f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2);

         if (!f->lf.tx_lpf_right_edge[0]) goto error;

-        f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] +

-                                     f->sb128h * 32 * f->frame_hdr->tiling.cols;

-        f->lf.re_sz = f->sb128h * f->frame_hdr->tiling.cols;

+        f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] + re_sz * 32;

+        f->lf.re_sz = re_sz;

     // init ref mvs

@@ -3010,8 +3047,9 @@

                 Dav1dTileState *const ts = &f->ts[tile_idx];

                 const int tile_start_off = f->frame_thread.tile_start_off[tile_idx];

-                ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * 2];

-                ts->frame_thread.cf = &((int32_t *) f->frame_thread.cf)[tile_start_off * 3];

+                ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4];

+                ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +

+                    ((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);

                 if (f->n_tc > 0) {

                     unsigned row_sb_start = f->frame_hdr->tiling.row_start_sb[ts->tiling.row];

                     atomic_init(&ts->progress, row_sb_start);

--- a/src/internal.h

+++ b/src/internal.h

@@ -196,12 +196,12 @@

             int16_t eob[3 /* plane */];

             uint8_t txtp[3 /* plane */];

         } *cbi;

-        int8_t *txtp;

         // indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)

         uint16_t (*pal)[3 /* plane */][8 /* idx */];

         // iterated over inside tile state

         uint8_t *pal_idx;

         coef *cf;

+        int pal_sz, pal_idx_sz, cf_sz;

         // start offsets per tile

         int *tile_start_off;

     } frame_thread;

@@ -217,10 +217,8 @@

         int last_sharpness;

         uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];

         uint8_t *tx_lpf_right_edge[2];

-        pixel *cdef_line;

-        pixel *cdef_line_ptr[2 /* pre, post */][3 /* plane */][2 /* y */];

-        pixel *lr_lpf_line;

-        pixel *lr_lpf_line_ptr[3 /* plane */];

+        pixel *cdef_line[2 /* pre, post */][3 /* plane */][2 /* y */];

+        pixel *lr_lpf_line[3 /* plane */];

         // in-loop filter per-frame state keeping

         int tile_row; // for carry-over at tile row edges

--- a/src/lib.c

+++ b/src/lib.c

@@ -509,8 +509,8 @@

         free(f->lf.level);

         free(f->lf.tx_lpf_right_edge[0]);

         if (f->libaom_cm) dav1d_free_ref_mv_common(f->libaom_cm);

-        dav1d_free_aligned(f->lf.cdef_line);

-        dav1d_free_aligned(f->lf.lr_lpf_line);

+        dav1d_free_aligned(f->lf.cdef_line[0][0][0]);

+        dav1d_free_aligned(f->lf.lr_lpf_line[0]);

     dav1d_free_aligned(c->fc);

     dav1d_data_unref_internal(&c->in);

--- a/src/lr_apply_tmpl.c

+++ b/src/lr_apply_tmpl.c

@@ -122,7 +122,7 @@

         const int w = f->bw << 2;

         const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);

         const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;

-        backup_lpf(f, f->lf.lr_lpf_line_ptr[0], lr_stride,

+        backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride,

                    src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],

                    0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);

@@ -137,12 +137,12 @@

             (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;

         if (restore_planes & LR_RESTORE_U) {

-            backup_lpf(f, f->lf.lr_lpf_line_ptr[1], lr_stride,

+            backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride,

                        src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],

                        ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);

         if (restore_planes & LR_RESTORE_V) {

-            backup_lpf(f, f->lf.lr_lpf_line_ptr[2], lr_stride,

+            backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride,

                        src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],

                        ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);

@@ -158,7 +158,7 @@

     const int chroma = !!plane;

     const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);

     const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);

-    const pixel *lpf = f->lf.lr_lpf_line_ptr[plane] + x;

+    const pixel *lpf = f->lf.lr_lpf_line[plane] + x;

     const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];

     const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);

--- a/src/thread_task.c

+++ b/src/thread_task.c

@@ -44,7 +44,7 @@

         const int res = dav1d_decode_frame(f);

         if (res)

             memset(f->frame_thread.cf, 0,

-                   sizeof(int32_t) * 3 * f->lf.mask_sz * 128 * 128);

+                   (size_t)f->frame_thread.cf_sz * 128 * 128 / 2);

         pthread_mutex_lock(&f->frame_thread.td.lock);

         f->n_tile_data = 0;