shithub: dav1d

Download patch

ref: cfa986fe1b9fa783671ea66cf479a26b2a5aff19
parent: 20e9f4df68761e48d44d134ba942e4ecc11446b7
author: Niklas Haas <[email protected]>
date: Tue Nov 13 11:53:10 EST 2018

film_grain: implement film grain synthesis

This is using a slightly adapted version of my GPU-based algorithm. The
major difference to the algorithm suggested by the spec (and implemented
in libaom) is that instead of using a line buffer to hold the previous
row's film grain blocks, we compute each row/block fully independently.

This opens the door to exploiting parallelism in the future, since we
don't have any left->right or top->bottom dependency except for the PRNG
state (which we could pre-compute for a massively parallel / GPU
implementation).

That being said, it's probably somewhat slower than using a line buffer
for the serial / single CPU case, although most likely not by much
(since the areas with the most redundant work get progressively smaller,
down to a single 2x2 square for the worst case).

--- a/include/dav1d/dav1d.h
+++ b/include/dav1d/dav1d.h
@@ -45,6 +45,7 @@
     int n_frame_threads;
     int n_tile_threads;
     Dav1dPicAllocator allocator;
+    int apply_grain;
 } Dav1dSettings;
 
 /**
--- /dev/null
+++ b/src/film_grain.h
@@ -1,0 +1,39 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DAV1D_SRC_FILM_GRAIN_H__
+#define __DAV1D_SRC_FILM_GRAIN_H__
+
+#include "dav1d/dav1d.h"
+
+/**
+ * Synthesize film grain for an 8 bpc picture.
+ *
+ * The grain parameters are read from out->p.film_grain, source pixels are
+ * read from `in` and the result is written to `out`. Both pictures must use
+ * the same plane strides (the implementation asserts this).
+ * NOTE(review): presumably instantiated from film_grain_tmpl.c through the
+ * bitfn() macro -- confirm against the build setup.
+ */
+void dav1d_apply_grain_8bpc(Dav1dPicture *const out,
+                            const Dav1dPicture *const in);
+
+/**
+ * Synthesize film grain for a 10 bpc picture; same contract as the 8 bpc
+ * variant above.
+ */
+void dav1d_apply_grain_10bpc(Dav1dPicture *const out,
+                             const Dav1dPicture *const in);
+
+#endif /* __DAV1D_SRC_FILM_GRAIN_H__ */
--- /dev/null
+++ b/src/film_grain_tmpl.c
@@ -1,0 +1,530 @@
+/*
+ * Copyright © 2018, Niklas Haas
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "common.h"
+#include "common/intops.h"
+#include "common/bitdepth.h"
+#include "tables.h"
+
+#include "film_grain.h"
+
+// Storage type of a single grain sample: 8 bits when building for 8 bpc,
+// 16 bits for every other bit depth.
+#if BITDEPTH == 8
+typedef int8_t entry;
+#else
+typedef int16_t entry;
+#endif
+
+// Dimensions of the look-up tables used by the grain synthesis.
+enum {
+    GRAIN_WIDTH  = 82,             // columns of a grain LUT
+    GRAIN_HEIGHT = 73,             // rows of a grain LUT
+    SUB_GRAIN_WIDTH = 44,          // columns filled for horizontally subsampled chroma
+    SUB_GRAIN_HEIGHT = 38,         // rows filled for vertically subsampled chroma
+    SUB_GRAIN_OFFSET = 6,          // not referenced by the code visible in this file
+    BLOCK_SIZE = 32,               // side length (in luma pixels) of one grain block
+    SCALING_SIZE = 1 << BITDEPTH,  // scaling LUT entries: one per possible pixel value
+};
+
+// Advances the shift-register PRNG in *state by one step and returns the top
+// `bits` bits of the (16-bit) new state. The feedback bit is the XOR of
+// state bits 0, 1, 3 and 12 and is inserted at bit 15.
+static inline int get_random_number(const int bits, unsigned *state) {
+    const int prev = *state;
+    const unsigned feedback =
+        (prev ^ (prev >> 1) ^ (prev >> 3) ^ (prev >> 12)) & 1;
+
+    *state = (prev >> 1) | (feedback << 15);
+
+    const int mask = (1 << bits) - 1;
+    return (*state >> (16 - bits)) & mask;
+}
+
+// Shifts x right by `shift` bits after adding half of the divisor, i.e.
+// divides by 2^shift with rounding. No bias is added when shift is 0.
+static inline int round2(const int x, const int shift) {
+    const int half = (1 << shift) >> 1;
+    return (x + half) >> shift;
+}
+
+// Clamping range for grain samples: [-128, 127] at 8 bpc, scaled up by
+// 2^(BITDEPTH - 8) for higher bit depths.
+enum {
+    GRAIN_CENTER = 128 << (BITDEPTH - 8),
+    GRAIN_MIN = -GRAIN_CENTER,
+    GRAIN_MAX = (256 << (BITDEPTH - 8)) - 1 - GRAIN_CENTER,
+};
+
+// Fills `buf` with the luma grain template for the film grain parameters of
+// `in`: every entry is first drawn from the gaussian table using the PRNG
+// seeded with the frame seed, then an auto-regressive filter (lag and
+// coefficients taken from the parameters) is run in raster order over the
+// area that excludes a 3-sample border at the top, left and right.
+static void generate_grain_y(const Dav1dPicture *const in,
+                             entry buf[GRAIN_HEIGHT][GRAIN_WIDTH])
+{
+    const Dav1dFilmGrainData *const fg = &in->p.film_grain;
+    const int shift = 12 - BITDEPTH + fg->grain_scale_shift;
+    unsigned rng = fg->seed;
+
+    // Pass 1: white gaussian noise
+    for (int row = 0; row < GRAIN_HEIGHT; row++) {
+        for (int col = 0; col < GRAIN_WIDTH; col++) {
+            const int idx = get_random_number(11, &rng);
+            buf[row][col] = round2(dav1d_gaussian_sequence[idx], shift);
+        }
+    }
+
+    const int pad = 3;
+    const int lag = fg->ar_coeff_lag;
+
+    // Pass 2: auto-regressive filter over already-filtered neighbours
+    for (int row = pad; row < GRAIN_HEIGHT; row++) {
+        for (int col = pad; col < GRAIN_WIDTH - pad; col++) {
+            int acc = 0;
+            int k = 0; // index of the next coefficient, consumed in raster order
+            for (int dy = -lag; dy <= 0; dy++) {
+                for (int dx = -lag; dx <= lag; dx++) {
+                    // The current sample ends the neighbourhood
+                    if (dy == 0 && dx == 0)
+                        break;
+                    acc += fg->ar_coeffs_y[k++] * buf[row + dy][col + dx];
+                }
+            }
+
+            const int filtered = buf[row][col] + round2(acc, fg->ar_coeff_shift);
+            buf[row][col] = iclip(filtered, GRAIN_MIN, GRAIN_MAX);
+        }
+    }
+}
+
+// Fills `buf` with the grain template of one chroma plane (`uv` selects the
+// plane and its PRNG seed modifier). Only the top-left SUB_GRAIN_WIDTH /
+// SUB_GRAIN_HEIGHT part is filled along a subsampled axis. The
+// auto-regressive filter additionally mixes in the co-located luma grain
+// from `buf_y` (averaged over the subsampling footprint) through the last
+// coefficient, unless the luma plane has no scaling points.
+static void generate_grain_uv(const Dav1dPicture *const in, int uv,
+                              entry buf[GRAIN_HEIGHT][GRAIN_WIDTH],
+                              entry buf_y[GRAIN_HEIGHT][GRAIN_WIDTH])
+{
+    const Dav1dFilmGrainData *const fg = &in->p.film_grain;
+    const int shift = 12 - BITDEPTH + fg->grain_scale_shift;
+    unsigned rng = fg->seed ^ (uv ? 0x49d8 : 0xb524);
+
+    const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+
+    const int width  = ss_x ? SUB_GRAIN_WIDTH  : GRAIN_WIDTH;
+    const int height = ss_y ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
+
+    // Pass 1: white gaussian noise
+    for (int row = 0; row < height; row++) {
+        for (int col = 0; col < width; col++) {
+            const int idx = get_random_number(11, &rng);
+            buf[row][col] = round2(dav1d_gaussian_sequence[idx], shift);
+        }
+    }
+
+    const int pad = 3;
+    const int lag = fg->ar_coeff_lag;
+
+    // Pass 2: auto-regressive filter
+    for (int row = pad; row < height; row++) {
+        for (int col = pad; col < width - pad; col++) {
+            const int8_t *const coeffs = fg->ar_coeffs_uv[uv];
+            int acc = 0;
+            int k = 0; // index of the next coefficient, consumed in raster order
+            for (int dy = -lag; dy <= 0; dy++) {
+                for (int dx = -lag; dx <= lag; dx++) {
+                    if (dy || dx) {
+                        acc += coeffs[k++] * buf[row + dy][col + dx];
+                        continue;
+                    }
+
+                    // Current sample: the remaining coefficient weights the
+                    // luma grain covering this chroma position
+                    if (fg->num_y_points) {
+                        const int luma_x = ((col - pad) << ss_x) + pad;
+                        const int luma_y = ((row - pad) << ss_y) + pad;
+                        int luma = 0;
+                        for (int i = 0; i <= ss_y; i++)
+                            for (int j = 0; j <= ss_x; j++)
+                                luma += buf_y[luma_y + i][luma_x + j];
+                        luma = round2(luma, ss_x + ss_y);
+                        acc += luma * coeffs[k];
+                    }
+                    break;
+                }
+            }
+
+            const int filtered = buf[row][col] + round2(acc, fg->ar_coeff_shift);
+            buf[row][col] = iclip(filtered, GRAIN_MIN, GRAIN_MAX);
+        }
+    }
+}
+
+// Builds the scaling look-up table `scaling` (one entry per pixel value)
+// from `num` piecewise-linear control points. points[i][0] is the 8-bit x
+// position (shifted up to the working bit depth), points[i][1] the scaling
+// value. Entries before the first and after the last point repeat the
+// nearest point's value. With no points at all the table is zeroed, so that
+// it contributes no noise.
+static void generate_scaling(const uint8_t points[][2], int num,
+                             uint8_t scaling[SCALING_SIZE])
+{
+    const int shift_x = BITDEPTH - 8;
+
+    // Without control points there is nothing to index: points[num - 1]
+    // would read out of bounds
+    if (num <= 0) {
+        for (int i = 0; i < SCALING_SIZE; i++)
+            scaling[i] = 0;
+        return;
+    }
+
+    // Fill up the preceding entries with the initial value
+    for (int i = 0; i < points[0][0] << shift_x; i++)
+        scaling[i] = points[0][1];
+
+    // Linearly interpolate the values in the middle
+    for (int i = 0; i < num - 1; i++) {
+        const int bx = points[i][0] << shift_x;
+        const int by = points[i][1];
+        const int ex = points[i+1][0] << shift_x;
+        const int ey = points[i+1][1];
+        const int dx = ex - bx;
+        const int dy = ey - by;
+
+        // Degenerate segment: nothing to fill, and avoid dividing by zero
+        if (dx <= 0)
+            continue;
+
+        // 16.16 fixed-point slope. The reciprocal of dx is rounded first and
+        // then multiplied by dy, as in the AV1 scaling LUT initialization.
+        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
+        for (int x = 0; x < dx; x++) {
+            const int v = by + ((x * delta + 0x8000) >> 16);
+            scaling[bx + x] = v;
+        }
+    }
+
+    // Fill up the remaining entries with the final value
+    for (int i = points[num - 1][0] << shift_x; i < SCALING_SIZE; i++)
+        scaling[i] = points[num - 1][1];
+}
+
+// Fetches one grain sample for position (x, y) inside a block.
+//
+// `offsets[bx][by]` holds the random value of the block that is `bx` blocks
+// to the left and `by` blocks above the current one: its high nibble picks
+// the horizontal and its low nibble the vertical start inside the LUT. For a
+// neighbouring block the position is moved one (subsampled) block further
+// into its area, so that its overlap region is sampled.
+static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
+                               int offsets[2][2], int subx, int suby,
+                               int bx, int by, int x, int y)
+{
+    const int packed = offsets[bx][by];
+    const int rand_x = packed >> 4;
+    const int rand_y = packed & 0xF;
+
+    const int base_x = 3 + (2 >> subx) * (3 + rand_x);
+    const int base_y = 3 + (2 >> suby) * (3 + rand_y);
+
+    const int col = base_x + x + (BLOCK_SIZE >> subx) * bx;
+    const int row = base_y + y + (BLOCK_SIZE >> suby) * by;
+
+    return grain_lut[row][col];
+}
+
+// Applies luma film grain to one row of BLOCK_SIZE x BLOCK_SIZE blocks.
+//
+// Pixels are read from plane 0 of `in`; each one gets a grain sample from
+// `grain_lut`, scaled by `scaling` (indexed with the source pixel value), and
+// the clipped sum is written to plane 0 of `out`. `row_num` is the index of
+// the block row. When overlap_flag is set, the first two columns/rows of a
+// block are blended with the grain of the left/top neighbour block.
+//
+// The padding of `in` to the right of and below the visible picture area is
+// overwritten with zeros, although `in` is const-qualified.
+// NOTE(review): assumes both planes are allocated up to a multiple of
+// BLOCK_SIZE in both dimensions -- confirm against the picture allocator.
+static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in,
+                           entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
+                           uint8_t scaling[SCALING_SIZE], int row_num)
+{
+    const Dav1dFilmGrainData *const data = &out->p.film_grain;
+    const int rows = 1 + (data->overlap_flag && row_num > 0);
+
+    int min_value, max_value;
+    if (data->clip_to_restricted_range) {
+        min_value = 16 << (BITDEPTH - 8);
+        max_value = 235 << (BITDEPTH - 8);
+    } else {
+        min_value = 0;
+        max_value = (1 << BITDEPTH) - 1;
+    }
+
+    // seed[0] contains the current row, seed[1] contains the previous
+    unsigned seed[2];
+    for (int i = 0; i < rows; i++) {
+        seed[i] = data->seed;
+        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
+        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+    }
+
+    const ptrdiff_t stride = out->stride[0];
+    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+    assert(stride == in->stride[0]);
+
+    // Strides are in bytes, so all addressing goes through byte pointers
+    // (arithmetic on void * is a compiler extension, not ISO C)
+    uint8_t *const src_plane = (uint8_t *) in->data[0];
+    uint8_t *const src_row = src_plane + stride * row_num * BLOCK_SIZE;
+    uint8_t *const dst_row = (uint8_t *) out->data[0] + stride * row_num * BLOCK_SIZE;
+
+    // zero the source padding to the right of the picture
+    const int row_len = (out->p.w + BLOCK_SIZE - 1) & ~(BLOCK_SIZE - 1);
+    for (int x = out->p.w; x < row_len; x++) {
+        for (int y = 0; y < BLOCK_SIZE; y++) {
+            pixel *const src = (pixel *) (src_row + y * stride) + x;
+            *src = 0;
+        }
+    }
+
+    // zero the source padding below the picture (only hit by the last row)
+    const int row_h = (row_num + 1) * BLOCK_SIZE;
+    for (int y = out->p.h; y < row_h; y++)
+        memset(src_plane + stride * y, 0, row_len * sizeof(pixel));
+
+    int offsets[2 /* col offset */][2 /* row offset */];
+
+    // process this row in BLOCK_SIZE^2 blocks
+    for (int bx = 0; bx < out->p.w; bx += BLOCK_SIZE) {
+        if (data->overlap_flag && bx) {
+            // shift previous offsets left
+            for (int i = 0; i < rows; i++)
+                offsets[1][i] = offsets[0][i];
+        }
+
+        // update current offsets
+        for (int i = 0; i < rows; i++)
+            offsets[0][i] = get_random_number(8, &seed[i]);
+
+        // x/y block offsets to compensate for overlapped regions
+        const int ystart = data->overlap_flag && row_num ? 2 : 0;
+        const int xstart = data->overlap_flag && bx      ? 2 : 0;
+
+        // blend weights { old, new } for the two overlapped columns/rows
+        static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
+
+#define add_noise_y(x, y, grain)                                                \
+        do {                                                                    \
+            const pixel *const src =                                            \
+                (const pixel *) (src_row + (y) * stride) + bx + (x);            \
+            pixel *const dst =                                                  \
+                (pixel *) (dst_row + (y) * stride) + bx + (x);                  \
+            const int noise = round2(scaling[ *src ] * (grain),                 \
+                                     data->scaling_shift);                      \
+            *dst = iclip(*src + noise, min_value, max_value);                   \
+        } while (0)
+
+        for (int y = ystart; y < BLOCK_SIZE; y++) {
+            // Non-overlapped image region (straightforward)
+            for (int x = xstart; x < BLOCK_SIZE; x++) {
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                add_noise_y(x, y, grain);
+            }
+
+            // Special case for overlapped column
+            for (int x = 0; x < xstart; x++) {
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+                grain = round2(old * w[x][0] + grain * w[x][1], 5);
+                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
+                add_noise_y(x, y, grain);
+            }
+        }
+
+        for (int y = 0; y < ystart; y++) {
+            // Special case for overlapped row (sans corner)
+            for (int x = xstart; x < BLOCK_SIZE; x++) {
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+                grain = round2(old * w[y][0] + grain * w[y][1], 5);
+                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
+                add_noise_y(x, y, grain);
+            }
+
+            // Special case for doubly-overlapped corner
+            for (int x = 0; x < xstart; x++) {
+                // Blend the top pixel with the top left block
+                int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
+                int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y);
+                top = round2(old * w[x][0] + top * w[x][1], 5);
+                top = iclip(top, GRAIN_MIN, GRAIN_MAX);
+
+                // Blend the current pixel with the left block
+                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
+                old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
+                grain = round2(old * w[x][0] + grain * w[x][1], 5);
+                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
+
+                // Mix the two rows together and apply grain
+                grain = round2(top * w[y][0] + grain * w[y][1], 5);
+                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
+                add_noise_y(x, y, grain);
+            }
+        }
+    }
+#undef add_noise_y
+}
+
+// Applies chroma film grain to one row of blocks of plane 1 + `uv`.
+//
+// Chroma pixels are read from `in`, combined with grain from `grain_lut` and
+// written to `out`. The scaling LUT index is either the co-located luma
+// value (chroma_scaling_from_luma) or a weighted mix of luma and chroma.
+// Luma is always taken from `in`, i.e. from the picture before any grain was
+// added: the luma plane of `out` is either already grained or not written
+// yet at this point. `row_num` is the index of the (luma) block row.
+//
+// Chroma dimensions are rounded up, so the last column/row of a picture with
+// odd luma dimensions counts as picture content and not as padding. The
+// padding of the source chroma plane beyond that is overwritten with zeros,
+// although `in` is const-qualified.
+// NOTE(review): assumes all planes are allocated up to a multiple of the
+// block size in both dimensions -- confirm against the picture allocator.
+static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const in,
+                            entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
+                            uint8_t scaling[SCALING_SIZE], int uv, int row_num)
+{
+    const Dav1dFilmGrainData *const data = &out->p.film_grain;
+    const int rows = 1 + (data->overlap_flag && row_num > 0);
+
+    int min_value, max_value;
+    if (data->clip_to_restricted_range) {
+        min_value = 16 << (BITDEPTH - 8);
+        if (out->p.mtrx == DAV1D_MC_IDENTITY) {
+            max_value = 235 << (BITDEPTH - 8);
+        } else {
+            max_value = 240 << (BITDEPTH - 8);
+        }
+    } else {
+        min_value = 0;
+        max_value = (1 << BITDEPTH) - 1;
+    }
+
+    const int sx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int sy = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+
+    // seed[0] contains the current row, seed[1] contains the previous
+    unsigned seed[2];
+    for (int i = 0; i < rows; i++) {
+        seed[i] = data->seed;
+        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
+        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
+    }
+
+    const ptrdiff_t stride = out->stride[1];
+    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+    assert(stride == in->stride[1]);
+
+    // Strides are in bytes, so all addressing goes through byte pointers
+    // (arithmetic on void * is a compiler extension, not ISO C)
+    const ptrdiff_t luma_stride = in->stride[0];
+    const int by = row_num * (BLOCK_SIZE >> sy);
+    uint8_t *const src_plane = (uint8_t *) in->data[1 + uv];
+    uint8_t *const src_row = src_plane + stride * by;
+    uint8_t *const dst_row = (uint8_t *) out->data[1 + uv] + stride * by;
+    const uint8_t *const luma_row =
+        (const uint8_t *) in->data[0] + luma_stride * row_num * BLOCK_SIZE;
+
+    // chroma plane dimensions, rounded up for odd luma dimensions
+    const int cw = (out->p.w + sx) >> sx;
+    const int ch = (out->p.h + sy) >> sy;
+
+    // zero the source padding to the right of the picture
+    const int row_len = (cw + (BLOCK_SIZE >> sx) - 1)
+                        & ~((BLOCK_SIZE >> sx) - 1);
+    for (int x = cw; x < row_len; x++) {
+        for (int y = 0; y < BLOCK_SIZE >> sy; y++) {
+            pixel *const src = (pixel *) (src_row + y * stride) + x;
+            *src = 0;
+        }
+    }
+
+    // zero the source padding below the picture (only hit by the last row)
+    const int row_h = (row_num + 1) * (BLOCK_SIZE >> sy);
+    for (int y = ch; y < row_h; y++)
+        memset(src_plane + stride * y, 0, row_len * sizeof(pixel));
+
+    int offsets[2 /* col offset */][2 /* row offset */];
+
+    // process this row in BLOCK_SIZE^2 blocks (subsampled)
+    for (int bx = 0; bx < cw; bx += BLOCK_SIZE >> sx) {
+        if (data->overlap_flag && bx) {
+            // shift previous offsets left
+            for (int i = 0; i < rows; i++)
+                offsets[1][i] = offsets[0][i];
+        }
+
+        // update current offsets
+        for (int i = 0; i < rows; i++)
+            offsets[0][i] = get_random_number(8, &seed[i]);
+
+        // x/y block offsets to compensate for overlapped regions
+        const int ystart = data->overlap_flag && row_num ? (2 >> sy) : 0;
+        const int xstart = data->overlap_flag && bx      ? (2 >> sx) : 0;
+
+        // blend weights { old, new }: two overlapped samples without
+        // subsampling, a single one with subsampling
+        static const int w[2 /* sub */][2 /* off */][2] = {
+            { { 27, 17 }, { 17, 27 } },
+            { { 23, 22 } },
+        };
+
+#define add_noise_uv(x, y, grain)                                               \
+        do {                                                                    \
+            const int lx = (bx + (x)) << sx;                                    \
+            const int ly = (y) << sy;                                           \
+            const pixel *const luma =                                           \
+                (const pixel *) (luma_row + ly * luma_stride) + lx;             \
+            pixel avg = luma[0];                                                \
+            if (sx && lx + 1 < out->p.w)                                        \
+                avg = (avg + luma[1] + 1) >> 1;                                 \
+            const pixel *const src =                                            \
+                (const pixel *) (src_row + (y) * stride) + bx + (x);            \
+            pixel *const dst =                                                  \
+                (pixel *) (dst_row + (y) * stride) + bx + (x);                  \
+            int val = avg;                                                      \
+            if (!data->chroma_scaling_from_luma) {                              \
+                const int combined = avg * data->uv_luma_mult[uv] +             \
+                                     *src * data->uv_mult[uv];                  \
+                val = iclip_pixel( (combined >> 6) +                            \
+                                   (data->uv_offset[uv] << (BITDEPTH - 8)) );   \
+            }                                                                   \
+            const int noise = round2(scaling[ val ] * (grain),                  \
+                                     data->scaling_shift);                      \
+            *dst = iclip(*src + noise, min_value, max_value);                   \
+        } while (0)
+
+        for (int y = ystart; y < BLOCK_SIZE >> sy; y++) {
+            // Non-overlapped image region (straightforward)
+            for (int x = xstart; x < BLOCK_SIZE >> sx; x++) {
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                add_noise_uv(x, y, grain);
+            }
+
+            // Special case for overlapped column
+            for (int x = 0; x < xstart; x++) {
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+                grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5;
+                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
+                add_noise_uv(x, y, grain);
+            }
+        }
+
+        for (int y = 0; y < ystart; y++) {
+            // Special case for overlapped row (sans corner)
+            for (int x = xstart; x < BLOCK_SIZE >> sx; x++) {
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                int old   = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+                grain = (old * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5;
+                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
+                add_noise_uv(x, y, grain);
+            }
+
+            // Special case for doubly-overlapped corner
+            for (int x = 0; x < xstart; x++) {
+                // Blend the top pixel with the top left block
+                int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
+                int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y);
+                top = (old * w[sx][x][0] + top * w[sx][x][1] + 16) >> 5;
+                top = iclip(top, GRAIN_MIN, GRAIN_MAX);
+
+                // Blend the current pixel with the left block
+                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
+                old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
+                grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5;
+                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
+
+                // Mix the two rows together and apply to image
+                grain = (top * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5;
+                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
+                add_noise_uv(x, y, grain);
+            }
+        }
+    }
+#undef add_noise_uv
+}
+
+// Entry point of the film grain synthesis for one bit depth.
+//
+// Reads the grain parameters from out->p.film_grain, builds the grain and
+// scaling look-up tables that are needed, and then walks over the picture in
+// rows of BLOCK_SIZE luma lines, writing the grained planes to `out` from
+// the pixels of `in`. Planes that receive no grain are copied from `in`
+// unchanged. `out` and `in` must have identical plane strides.
+void bitfn(dav1d_apply_grain)(Dav1dPicture *const out,
+                              const Dav1dPicture *const in)
+{
+    const Dav1dFilmGrainData *const data = &out->p.film_grain;
+
+    entry grain_lut[3][GRAIN_HEIGHT][GRAIN_WIDTH];
+    uint8_t scaling[3][SCALING_SIZE];
+
+    // Generate grain LUTs as needed
+    generate_grain_y(out, grain_lut[0]); // always needed
+    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
+        generate_grain_uv(out, 0, grain_lut[1], grain_lut[0]);
+    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
+        generate_grain_uv(out, 1, grain_lut[2], grain_lut[0]);
+
+    // Generate scaling LUTs as needed
+    if (data->num_y_points)
+        generate_scaling(data->y_points, data->num_y_points, scaling[0]);
+    else if (data->chroma_scaling_from_luma)
+        // The chroma planes index the luma LUT below; without luma points it
+        // must be all zero (no noise) rather than uninitialized
+        memset(scaling[0], 0, sizeof(scaling[0]));
+    if (data->num_uv_points[0])
+        generate_scaling(data->uv_points[0], data->num_uv_points[0], scaling[1]);
+    if (data->num_uv_points[1])
+        generate_scaling(data->uv_points[1], data->num_uv_points[1], scaling[2]);
+
+    // Synthesize grain for the affected planes. The row count is rounded up
+    // so that a partial block row at the bottom is processed as well.
+    const int rows = (out->p.h + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    for (int row = 0; row < rows; row++) {
+        if (data->num_y_points)
+            apply_to_row_y(out, in, grain_lut[0], scaling[0], row);
+
+        if (data->chroma_scaling_from_luma) {
+            apply_to_row_uv(out, in, grain_lut[1], scaling[0], 0, row);
+            apply_to_row_uv(out, in, grain_lut[2], scaling[0], 1, row);
+        } else {
+            if (data->num_uv_points[0])
+                apply_to_row_uv(out, in, grain_lut[1], scaling[1], 0, row);
+            if (data->num_uv_points[1])
+                apply_to_row_uv(out, in, grain_lut[2], scaling[2], 1, row);
+        }
+    }
+
+    // Copy over the non-modified planes
+    // TODO: eliminate in favor of per-plane refs
+    if (!data->num_y_points) {
+        assert(out->stride[0] == in->stride[0]);
+        memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
+    }
+
+    for (int i = 0; i < 2; i++) {
+        if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
+            const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+            assert(out->stride[1] == in->stride[1]);
+            // Round the chroma height up so that the last row of a picture
+            // with odd height is copied too
+            memcpy(out->data[1+i], in->data[1+i],
+                   ((out->p.h + suby) >> suby) * out->stride[1]);
+        }
+    }
+}
--- a/src/internal.h
+++ b/src/internal.h
@@ -115,6 +115,7 @@
     } intra_edge;
 
     Dav1dPicAllocator allocator;
+    int apply_grain;
 };
 
 struct Dav1dFrameContext {
--- a/src/lib.c
+++ b/src/lib.c
@@ -43,6 +43,7 @@
 #include "src/ref.h"
 #include "src/thread_task.h"
 #include "src/wedge.h"
+#include "src/film_grain.h"
 
 static void init_internal(void) {
     dav1d_init_wedge_masks();
@@ -57,6 +58,7 @@
 void dav1d_default_settings(Dav1dSettings *const s) {
     s->n_frame_threads = 1;
     s->n_tile_threads = 1;
+    s->apply_grain = 1;
     s->allocator.cookie = NULL;
     s->allocator.alloc_picture_callback = default_picture_allocator;
     s->allocator.release_picture_callback = default_picture_release;
@@ -84,6 +86,7 @@
     memset(c, 0, sizeof(*c));
 
     c->allocator = s->allocator;
+    c->apply_grain = s->apply_grain;
     c->n_fc = s->n_frame_threads;
     c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
     if (!c->fc) goto error;
@@ -170,6 +173,39 @@
     return 0;
 }
 
+// Hands the decoded picture `in` over to the caller's `out`.
+//
+// Without film grain to apply (disabled in the context, or no scaling points
+// in the picture's grain parameters) the reference is simply moved. Otherwise
+// a new picture is allocated, the grain is synthesized into it and the
+// reference held by `in` is dropped. Returns 0 on success or the negative
+// error code of the allocation; in the error case `in` is left untouched.
+static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
+                        Dav1dPicture *const in)
+{
+    const Dav1dFilmGrainData *const fg = &in->p.film_grain;
+    const int grain_present = fg->num_y_points || fg->num_uv_points[0] ||
+                              fg->num_uv_points[1];
+
+    if (c->apply_grain && grain_present) {
+        // Apply film grain to a new copy of the image to avoid corrupting refs
+        const int err = dav1d_picture_alloc_copy(out, in);
+        if (err < 0)
+            return err;
+
+        if (out->p.bpc == 8)
+            dav1d_apply_grain_8bpc(out, in);
+        else if (out->p.bpc == 10)
+            dav1d_apply_grain_10bpc(out, in);
+        else
+            assert(!"apply_grain: missing bit depth");
+
+        dav1d_picture_unref(in);
+        return 0;
+    }
+
+    // Nothing to be done: skip the allocation/copy
+    dav1d_picture_move_ref(out, in);
+    return 0;
+}
+
 int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
 {
     int res;
@@ -220,16 +256,12 @@
         in->sz -= res;
         in->data += res;
         if (!in->sz) dav1d_data_unref(in);
-        if (c->out.data[0]) {
-            dav1d_picture_move_ref(out, &c->out);
-            return 0;
-        }
+        if (c->out.data[0])
+            break;
     }
 
-    if (c->out.data[0]) {
-        dav1d_picture_move_ref(out, &c->out);
-        return 0;
-    }
+    if (c->out.data[0])
+        return output_image(c, out, &c->out);
 
     return -EAGAIN;
 }
--- a/src/meson.build
+++ b/src/meson.build
@@ -62,7 +62,8 @@
     'cdef_tmpl.c',
     'lr_apply_tmpl.c',
     'looprestoration_tmpl.c',
-    'recon_tmpl.c'
+    'recon_tmpl.c',
+    'film_grain_tmpl.c',
 )
 
 # libdav1d entrypoint source files
--- a/src/picture.c
+++ b/src/picture.c
@@ -180,6 +180,21 @@
     return res;
 }
 
+// Allocates `dst` with the dimensions, layout and bit depth of `src`, using
+// the allocator stored in the user data of src's reference. On success the
+// picture order count and the full parameter set of `src` are copied to
+// `dst`; pixel data is not copied. Returns the result of
+// dav1d_picture_alloc().
+int dav1d_picture_alloc_copy(Dav1dPicture *const dst,
+                             const Dav1dPicture *const src)
+{
+    struct pic_ctx_context *const pic_ctx = src->ref->user_data;
+    const int res = dav1d_picture_alloc(dst, src->p.w, src->p.h,
+                                        src->p.layout, src->p.bpc,
+                                        &pic_ctx->allocator);
+    if (res)
+        return res;
+
+    dst->poc = src->poc;
+    dst->p = src->p;
+    return 0;
+}
+
 void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
     validate_input(dst != NULL);
     validate_input(dst->data[0] == NULL);
--- a/src/picture.h
+++ b/src/picture.h
@@ -64,6 +64,11 @@
                                Dav1dPicAllocator *);
 
 /**
+ * Allocate a picture with identical metadata to an existing picture.
+ */
+int dav1d_picture_alloc_copy(Dav1dPicture *dst, const Dav1dPicture *src);
+
+/**
  * Create a copy of a picture.
  */
 void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src);
--- a/src/tables.c
+++ b/src/tables.c
@@ -872,3 +872,194 @@
     31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11,  9,
      8,  7,  6,  5,  4,  4,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,
 };
+
+// Taken from the spec. Range is [-2048, 2047], mean is 0 and stddev is 512
+const int16_t dav1d_gaussian_sequence[2048] = {
+    56,    568,   -180,  172,   124,   -84,   172,   -64,   -900,  24,   820,
+    224,   1248,  996,   272,   -8,    -916,  -388,  -732,  -104,  -188, 800,
+    112,   -652,  -320,  -376,  140,   -252,  492,   -168,  44,    -788, 588,
+    -584,  500,   -228,  12,    680,   272,   -476,  972,   -100,  652,  368,
+    432,   -196,  -720,  -192,  1000,  -332,  652,   -136,  -552,  -604, -4,
+    192,   -220,  -136,  1000,  -52,   372,   -96,   -624,  124,   -24,  396,
+    540,   -12,   -104,  640,   464,   244,   -208,  -84,   368,   -528, -740,
+    248,   -968,  -848,  608,   376,   -60,   -292,  -40,   -156,  252,  -292,
+    248,   224,   -280,  400,   -244,  244,   -60,   76,    -80,   212,  532,
+    340,   128,   -36,   824,   -352,  -60,   -264,  -96,   -612,  416,  -704,
+    220,   -204,  640,   -160,  1220,  -408,  900,   336,   20,    -336, -96,
+    -792,  304,   48,    -28,   -1232, -1172, -448,  104,   -292,  -520, 244,
+    60,    -948,  0,     -708,  268,   108,   356,   -548,  488,   -344, -136,
+    488,   -196,  -224,  656,   -236,  -1128, 60,    4,     140,   276,  -676,
+    -376,  168,   -108,  464,   8,     564,   64,    240,   308,   -300, -400,
+    -456,  -136,  56,    120,   -408,  -116,  436,   504,   -232,  328,  844,
+    -164,  -84,   784,   -168,  232,   -224,  348,   -376,  128,   568,  96,
+    -1244, -288,  276,   848,   832,   -360,  656,   464,   -384,  -332, -356,
+    728,   -388,  160,   -192,  468,   296,   224,   140,   -776,  -100, 280,
+    4,     196,   44,    -36,   -648,  932,   16,    1428,  28,    528,  808,
+    772,   20,    268,   88,    -332,  -284,  124,   -384,  -448,  208,  -228,
+    -1044, -328,  660,   380,   -148,  -300,  588,   240,   540,   28,   136,
+    -88,   -436,  256,   296,   -1000, 1400,  0,     -48,   1056,  -136, 264,
+    -528,  -1108, 632,   -484,  -592,  -344,  796,   124,   -668,  -768, 388,
+    1296,  -232,  -188,  -200,  -288,  -4,    308,   100,   -168,  256,  -500,
+    204,   -508,  648,   -136,  372,   -272,  -120,  -1004, -552,  -548, -384,
+    548,   -296,  428,   -108,  -8,    -912,  -324,  -224,  -88,   -112, -220,
+    -100,  996,   -796,  548,   360,   -216,  180,   428,   -200,  -212, 148,
+    96,    148,   284,   216,   -412,  -320,  120,   -300,  -384,  -604, -572,
+    -332,  -8,    -180,  -176,  696,   116,   -88,   628,   76,    44,   -516,
+    240,   -208,  -40,   100,   -592,  344,   -308,  -452,  -228,  20,   916,
+    -1752, -136,  -340,  -804,  140,   40,    512,   340,   248,   184,  -492,
+    896,   -156,  932,   -628,  328,   -688,  -448,  -616,  -752,  -100, 560,
+    -1020, 180,   -800,  -64,   76,    576,   1068,  396,   660,   552,  -108,
+    -28,   320,   -628,  312,   -92,   -92,   -472,  268,   16,    560,  516,
+    -672,  -52,   492,   -100,  260,   384,   284,   292,   304,   -148, 88,
+    -152,  1012,  1064,  -228,  164,   -376,  -684,  592,   -392,  156,  196,
+    -524,  -64,   -884,  160,   -176,  636,   648,   404,   -396,  -436, 864,
+    424,   -728,  988,   -604,  904,   -592,  296,   -224,  536,   -176, -920,
+    436,   -48,   1176,  -884,  416,   -776,  -824,  -884,  524,   -548, -564,
+    -68,   -164,  -96,   692,   364,   -692,  -1012, -68,   260,   -480, 876,
+    -1116, 452,   -332,  -352,  892,   -1088, 1220,  -676,  12,    -292, 244,
+    496,   372,   -32,   280,   200,   112,   -440,  -96,   24,    -644, -184,
+    56,    -432,  224,   -980,  272,   -260,  144,   -436,  420,   356,  364,
+    -528,  76,    172,   -744,  -368,  404,   -752,  -416,  684,   -688, 72,
+    540,   416,   92,    444,   480,   -72,   -1416, 164,   -1172, -68,  24,
+    424,   264,   1040,  128,   -912,  -524,  -356,  64,    876,   -12,  4,
+    -88,   532,   272,   -524,  320,   276,   -508,  940,   24,    -400, -120,
+    756,   60,    236,   -412,  100,   376,   -484,  400,   -100,  -740, -108,
+    -260,  328,   -268,  224,   -200,  -416,  184,   -604,  -564,  -20,  296,
+    60,    892,   -888,  60,    164,   68,    -760,  216,   -296,  904,  -336,
+    -28,   404,   -356,  -568,  -208,  -1480, -512,  296,   328,   -360, -164,
+    -1560, -776,  1156,  -428,  164,   -504,  -112,  120,   -216,  -148, -264,
+    308,   32,    64,    -72,   72,    116,   176,   -64,   -272,  460,  -536,
+    -784,  -280,  348,   108,   -752,  -132,  524,   -540,  -776,  116,  -296,
+    -1196, -288,  -560,  1040,  -472,  116,   -848,  -1116, 116,   636,  696,
+    284,   -176,  1016,  204,   -864,  -648,  -248,  356,   972,   -584, -204,
+    264,   880,   528,   -24,   -184,  116,   448,   -144,  828,   524,  212,
+    -212,  52,    12,    200,   268,   -488,  -404,  -880,  824,   -672, -40,
+    908,   -248,  500,   716,   -576,  492,   -576,  16,    720,   -108, 384,
+    124,   344,   280,   576,   -500,  252,   104,   -308,  196,   -188, -8,
+    1268,  296,   1032,  -1196, 436,   316,   372,   -432,  -200,  -660, 704,
+    -224,  596,   -132,  268,   32,    -452,  884,   104,   -1008, 424,  -1348,
+    -280,  4,     -1168, 368,   476,   696,   300,   -8,    24,    180,  -592,
+    -196,  388,   304,   500,   724,   -160,  244,   -84,   272,   -256, -420,
+    320,   208,   -144,  -156,  156,   364,   452,   28,    540,   316,  220,
+    -644,  -248,  464,   72,    360,   32,    -388,  496,   -680,  -48,  208,
+    -116,  -408,  60,    -604,  -392,  548,   -840,  784,   -460,  656,  -544,
+    -388,  -264,  908,   -800,  -628,  -612,  -568,  572,   -220,  164,  288,
+    -16,   -308,  308,   -112,  -636,  -760,  280,   -668,  432,   364,  240,
+    -196,  604,   340,   384,   196,   592,   -44,   -500,  432,   -580, -132,
+    636,   -76,   392,   4,     -412,  540,   508,   328,   -356,  -36,  16,
+    -220,  -64,   -248,  -60,   24,    -192,  368,   1040,  92,    -24,  -1044,
+    -32,   40,    104,   148,   192,   -136,  -520,  56,    -816,  -224, 732,
+    392,   356,   212,   -80,   -424,  -1008, -324,  588,   -1496, 576,  460,
+    -816,  -848,  56,    -580,  -92,   -1372, -112,  -496,  200,   364,  52,
+    -140,  48,    -48,   -60,   84,    72,    40,    132,   -356,  -268, -104,
+    -284,  -404,  732,   -520,  164,   -304,  -540,  120,   328,   -76,  -460,
+    756,   388,   588,   236,   -436,  -72,   -176,  -404,  -316,  -148, 716,
+    -604,  404,   -72,   -88,   -888,  -68,   944,   88,    -220,  -344, 960,
+    472,   460,   -232,  704,   120,   832,   -228,  692,   -508,  132,  -476,
+    844,   -748,  -364,  -44,   1116,  -1104, -1056, 76,    428,   552,  -692,
+    60,    356,   96,    -384,  -188,  -612,  -576,  736,   508,   892,  352,
+    -1132, 504,   -24,   -352,  324,   332,   -600,  -312,  292,   508,  -144,
+    -8,    484,   48,    284,   -260,  -240,  256,   -100,  -292,  -204, -44,
+    472,   -204,  908,   -188,  -1000, -256,  92,    1164,  -392,  564,  356,
+    652,   -28,   -884,  256,   484,   -192,  760,   -176,  376,   -524, -452,
+    -436,  860,   -736,  212,   124,   504,   -476,  468,   76,    -472, 552,
+    -692,  -944,  -620,  740,   -240,  400,   132,   20,    192,   -196, 264,
+    -668,  -1012, -60,   296,   -316,  -828,  76,    -156,  284,   -768, -448,
+    -832,  148,   248,   652,   616,   1236,  288,   -328,  -400,  -124, 588,
+    220,   520,   -696,  1032,  768,   -740,  -92,   -272,  296,   448,  -464,
+    412,   -200,  392,   440,   -200,  264,   -152,  -260,  320,   1032, 216,
+    320,   -8,    -64,   156,   -1016, 1084,  1172,  536,   484,   -432, 132,
+    372,   -52,   -256,  84,    116,   -352,  48,    116,   304,   -384, 412,
+    924,   -300,  528,   628,   180,   648,   44,    -980,  -220,  1320, 48,
+    332,   748,   524,   -268,  -720,  540,   -276,  564,   -344,  -208, -196,
+    436,   896,   88,    -392,  132,   80,    -964,  -288,  568,   56,   -48,
+    -456,  888,   8,     552,   -156,  -292,  948,   288,   128,   -716, -292,
+    1192,  -152,  876,   352,   -600,  -260,  -812,  -468,  -28,   -120, -32,
+    -44,   1284,  496,   192,   464,   312,   -76,   -516,  -380,  -456, -1012,
+    -48,   308,   -156,  36,    492,   -156,  -808,  188,   1652,  68,   -120,
+    -116,  316,   160,   -140,  352,   808,   -416,  592,   316,   -480, 56,
+    528,   -204,  -568,  372,   -232,  752,   -344,  744,   -4,    324,  -416,
+    -600,  768,   268,   -248,  -88,   -132,  -420,  -432,  80,    -288, 404,
+    -316,  -1216, -588,  520,   -108,  92,    -320,  368,   -480,  -216, -92,
+    1688,  -300,  180,   1020,  -176,  820,   -68,   -228,  -260,  436,  -904,
+    20,    40,    -508,  440,   -736,  312,   332,   204,   760,   -372, 728,
+    96,    -20,   -632,  -520,  -560,  336,   1076,  -64,   -532,  776,  584,
+    192,   396,   -728,  -520,  276,   -188,  80,    -52,   -612,  -252, -48,
+    648,   212,   -688,  228,   -52,   -260,  428,   -412,  -272,  -404, 180,
+    816,   -796,  48,    152,   484,   -88,   -216,  988,   696,   188,  -528,
+    648,   -116,  -180,  316,   476,   12,    -564,  96,    476,   -252, -364,
+    -376,  -392,  556,   -256,  -576,  260,   -352,  120,   -16,   -136, -260,
+    -492,  72,    556,   660,   580,   616,   772,   436,   424,   -32,  -324,
+    -1268, 416,   -324,  -80,   920,   160,   228,   724,   32,    -516, 64,
+    384,   68,    -128,  136,   240,   248,   -204,  -68,   252,   -932, -120,
+    -480,  -628,  -84,   192,   852,   -404,  -288,  -132,  204,   100,  168,
+    -68,   -196,  -868,  460,   1080,  380,   -80,   244,   0,     484,  -888,
+    64,    184,   352,   600,   460,   164,   604,   -196,  320,   -64,  588,
+    -184,  228,   12,    372,   48,    -848,  -344,  224,   208,   -200, 484,
+    128,   -20,   272,   -468,  -840,  384,   256,   -720,  -520,  -464, -580,
+    112,   -120,  644,   -356,  -208,  -608,  -528,  704,   560,   -424, 392,
+    828,   40,    84,    200,   -152,  0,     -144,  584,   280,   -120, 80,
+    -556,  -972,  -196,  -472,  724,   80,    168,   -32,   88,    160,  -688,
+    0,     160,   356,   372,   -776,  740,   -128,  676,   -248,  -480, 4,
+    -364,  96,    544,   232,   -1032, 956,   236,   356,   20,    -40,  300,
+    24,    -676,  -596,  132,   1120,  -104,  532,   -1096, 568,   648,  444,
+    508,   380,   188,   -376,  -604,  1488,  424,   24,    756,   -220, -192,
+    716,   120,   920,   688,   168,   44,    -460,  568,   284,   1144, 1160,
+    600,   424,   888,   656,   -356,  -320,  220,   316,   -176,  -724, -188,
+    -816,  -628,  -348,  -228,  -380,  1012,  -452,  -660,  736,   928,  404,
+    -696,  -72,   -268,  -892,  128,   184,   -344,  -780,  360,   336,  400,
+    344,   428,   548,   -112,  136,   -228,  -216,  -820,  -516,  340,  92,
+    -136,  116,   -300,  376,   -244,  100,   -316,  -520,  -284,  -12,  824,
+    164,   -548,  -180,  -128,  116,   -924,  -828,  268,   -368,  -580, 620,
+    192,   160,   0,     -1676, 1068,  424,   -56,   -360,  468,   -156, 720,
+    288,   -528,  556,   -364,  548,   -148,  504,   316,   152,   -648, -620,
+    -684,  -24,   -376,  -384,  -108,  -920,  -1032, 768,   180,   -264, -508,
+    -1268, -260,  -60,   300,   -240,  988,   724,   -376,  -576,  -212, -736,
+    556,   192,   1092,  -620,  -880,  376,   -56,   -4,    -216,  -32,  836,
+    268,   396,   1332,  864,   -600,  100,   56,    -412,  -92,   356,  180,
+    884,   -468,  -436,  292,   -388,  -804,  -704,  -840,  368,   -348, 140,
+    -724,  1536,  940,   372,   112,   -372,  436,   -480,  1136,  296,  -32,
+    -228,  132,   -48,   -220,  868,   -1016, -60,   -1044, -464,  328,  916,
+    244,   12,    -736,  -296,  360,   468,   -376,  -108,  -92,   788,  368,
+    -56,   544,   400,   -672,  -420,  728,   16,    320,   44,    -284, -380,
+    -796,  488,   132,   204,   -596,  -372,  88,    -152,  -908,  -636, -572,
+    -624,  -116,  -692,  -200,  -56,   276,   -88,   484,   -324,  948,  864,
+    1000,  -456,  -184,  -276,  292,   -296,  156,   676,   320,   160,  908,
+    -84,   -1236, -288,  -116,  260,   -372,  -644,  732,   -756,  -96,  84,
+    344,   -520,  348,   -688,  240,   -84,   216,   -1044, -136,  -676, -396,
+    -1500, 960,   -40,   176,   168,   1516,  420,   -504,  -344,  -364, -360,
+    1216,  -940,  -380,  -212,  252,   -660,  -708,  484,   -444,  -152, 928,
+    -120,  1112,  476,   -260,  560,   -148,  -344,  108,   -196,  228,  -288,
+    504,   560,   -328,  -88,   288,   -1008, 460,   -228,  468,   -836, -196,
+    76,    388,   232,   412,   -1168, -716,  -644,  756,   -172,  -356, -504,
+    116,   432,   528,   48,    476,   -168,  -608,  448,   160,   -532, -272,
+    28,    -676,  -12,   828,   980,   456,   520,   104,   -104,  256,  -344,
+    -4,    -28,   -368,  -52,   -524,  -572,  -556,  -200,  768,   1124, -208,
+    -512,  176,   232,   248,   -148,  -888,  604,   -600,  -304,  804,  -156,
+    -212,  488,   -192,  -804,  -256,  368,   -360,  -916,  -328,  228,  -240,
+    -448,  -472,  856,   -556,  -364,  572,   -12,   -156,  -368,  -340, 432,
+    252,   -752,  -152,  288,   268,   -580,  -848,  -592,  108,   -76,  244,
+    312,   -716,  592,   -80,   436,   360,   4,     -248,  160,   516,  584,
+    732,   44,    -468,  -280,  -292,  -156,  -588,  28,    308,   912,  24,
+    124,   156,   180,   -252,  944,   -924,  -772,  -520,  -428,  -624, 300,
+    -212,  -1144, 32,    -724,  800,   -1128, -212,  -1288, -848,  180,  -416,
+    440,   192,   -576,  -792,  -76,   -1080, 80,    -532,  -352,  -132, 380,
+    -820,  148,   1112,  128,   164,   456,   700,   -924,  144,   -668, -384,
+    648,   -832,  508,   552,   -52,   -100,  -656,  208,   -568,  748,  -88,
+    680,   232,   300,   192,   -408,  -1012, -152,  -252,  -268,  272,  -876,
+    -664,  -648,  -332,  -136,  16,    12,    1152,  -28,   332,   -536, 320,
+    -672,  -460,  -316,  532,   -260,  228,   -40,   1052,  -816,  180,  88,
+    -496,  -556,  -672,  -368,  428,   92,    356,   404,   -408,  252,  196,
+    -176,  -556,  792,   268,   32,    372,   40,    96,    -332,  328,  120,
+    372,   -900,  -40,   472,   -264,  -592,  952,   128,   656,   112,  664,
+    -232,  420,   4,     -344,  -464,  556,   244,   -416,  -32,   252,  0,
+    -412,  188,   -696,  508,   -476,  324,   -1096, 656,   -312,  560,  264,
+    -136,  304,   160,   -64,   -580,  248,   336,   -720,  560,   -348, -288,
+    -276,  -196,  -500,  852,   -544,  -236,  -1128, -992,  -776,  116,  56,
+    52,    860,   884,   212,   -12,   168,   1020,  512,   -552,  924,  -148,
+    716,   188,   164,   -340,  -520,  -184,  880,   -152,  -680,  -208, -1156,
+    -300,  -528,  -472,  364,   100,   -744,  -1056, -32,   540,   280,  144,
+    -676,  -32,   -232,  -280,  -224,  96,    568,   -76,   172,   148,  148,
+    104,   32,    -296,  -32,   788,   -80,   32,    -16,   280,   288,  944,
+    428,   -484
+};
--- a/src/tables.h
+++ b/src/tables.h
@@ -119,4 +119,6 @@
 
 extern const uint8_t dav1d_obmc_masks[64];
 
+extern const int16_t dav1d_gaussian_sequence[2048]; // for film grain synthesis
+
 #endif /* __DAV1D_SRC_TABLES_H__ */
--- a/tools/dav1d_cli_parse.c
+++ b/tools/dav1d_cli_parse.c
@@ -48,6 +48,7 @@
     ARG_FRAME_THREADS,
     ARG_TILE_THREADS,
     ARG_VERIFY,
+    ARG_FILM_GRAIN,
 };
 
 static const struct option long_opts[] = {
@@ -62,6 +63,7 @@
     { "framethreads",   1, NULL, ARG_FRAME_THREADS },
     { "tilethreads",    1, NULL, ARG_TILE_THREADS },
     { "verify",         1, NULL, ARG_VERIFY },
+    { "filmgrain",      1, NULL, ARG_FILM_GRAIN },
     { NULL,             0, NULL, 0 },
 };
 
@@ -86,6 +88,7 @@
             " --version/-v:        print version and exit\n"
             " --framethreads $num: number of frame threads (default: 1)\n"
             " --tilethreads $num:  number of tile threads (default: 1)\n"
+            " --filmgrain $num:    enable film grain application (default: 1, except if muxer is md5)\n"
             " --verify $md5:       verify decoded md5. implies --muxer md5, no output\n");
     exit(1);
 }
@@ -124,6 +127,7 @@
 
     memset(cli_settings, 0, sizeof(*cli_settings));
     dav1d_default_settings(lib_settings);
+    int grain_specified = 0;
 
     while ((o = getopt_long(argc, argv, short_opts, long_opts, NULL)) >= 0) {
         switch (o) {
@@ -159,6 +163,11 @@
         case ARG_VERIFY:
             cli_settings->verify = optarg;
             break;
+        case ARG_FILM_GRAIN:
+            lib_settings->apply_grain =
+                !!parse_unsigned(optarg, ARG_FILM_GRAIN, argv[0]);
+            grain_specified = 1;
+            break;
         case 'v':
             fprintf(stderr, "%s\n", dav1d_version());
             exit(0);
@@ -176,6 +185,12 @@
         cli_settings->outputfile = "-";
         if (!cli_settings->muxer)
             cli_settings->muxer = "md5";
+    }
+
+    if (!grain_specified && cli_settings->muxer &&
+        !strcmp(cli_settings->muxer, "md5"))
+    {
+        lib_settings->apply_grain = 0;
     }
 
     if (!cli_settings->inputfile)