shithub: dav1d

Download patch

ref: 490a1420f34765f6b1aa9610e23aea247bec2dcc
parent: 3f35ef1f312dd9f8eaa91c4c4d94f5605bfebdc4
author: Niklas Haas <[email protected]>
date: Sun Sep 15 00:38:31 EDT 2019

dav1dplay: initial support for --zerocopy

Right now this just allocates a new buffer for every frame, uses it,
then discards it immediately. This is not optimal: either dav1d should
start reusing buffers internally, or we need to pool them in dav1dplay.

As it stands, this is not really a performance gain. I'll have to
investigate why, but my suspicion is that seeing any gains might require
reusing buffers somewhere.

Note: Thrashing buffers is not as bad as it initially seems. Not only
does libplacebo pool and reuse GPU memory and buffer state objects
internally, but this also absolves us from having to do any manual
polling to figure out when the buffer is reusable again. Creating, using
and immediately destroying buffers actually isn't as bad an approach as
it might otherwise seem.

It's entirely possible that this is only bad because of lock contention.
As said, I'll have to investigate further...

--- a/examples/dav1dplay.c
+++ b/examples/dav1dplay.c
@@ -51,6 +51,7 @@
     const char *inputfile;
     int highquality;
     int untimed;
+    int zerocopy;
 } Dav1dPlaySettings;
 
 #define WINDOW_WIDTH  910
@@ -161,7 +162,11 @@
     // Callback to the render function that renders a prevously sent frame
     void (*render)(void *cookie, const Dav1dPlaySettings *settings);
     // Callback to the send frame function
-    int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic);
+    int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
+                        const Dav1dPlaySettings *settings);
+    // Callback for alloc/release pictures (optional)
+    int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
+    void (*release_pic)(Dav1dPicture *pic, void *cookie);
 } Dav1dPlayRenderInfo;
 
 #ifdef HAVE_PLACEBO_VULKAN
@@ -389,7 +394,8 @@
     SDL_UnlockMutex(rd_priv_ctx->lock);
 }
 
-static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
+static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic,
+                                 const Dav1dPlaySettings *settings)
 {
     Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
     assert(rd_priv_ctx != NULL);
@@ -417,7 +423,6 @@
         .height         = height,
         .pixel_stride   = 1,
         .row_stride     = dav1d_pic->stride[0],
-        .pixels         = dav1d_pic->data[0],
         .component_size = {8},
         .component_map  = {0},
     };
@@ -428,7 +433,6 @@
         .height         = height/2,
         .pixel_stride   = 1,
         .row_stride     = dav1d_pic->stride[1],
-        .pixels         = dav1d_pic->data[1],
         .component_size = {8},
         .component_map  = {1},
     };
@@ -439,11 +443,23 @@
         .height         = height/2,
         .pixel_stride   = 1,
         .row_stride     = dav1d_pic->stride[1],
-        .pixels         = dav1d_pic->data[2],
         .component_size = {8},
         .component_map  = {2},
     };
 
+    if (settings->zerocopy) {
+        const struct pl_buf *buf = dav1d_pic->allocator_data;
+        assert(buf);
+        data_y.buf = data_u.buf = data_v.buf = buf;
+        data_y.buf_offset = (uintptr_t) dav1d_pic->data[0] - (uintptr_t) buf->data;
+        data_u.buf_offset = (uintptr_t) dav1d_pic->data[1] - (uintptr_t) buf->data;
+        data_v.buf_offset = (uintptr_t) dav1d_pic->data[2] - (uintptr_t) buf->data;
+    } else {
+        data_y.pixels = dav1d_pic->data[0];
+        data_u.pixels = dav1d_pic->data[1];
+        data_v.pixels = dav1d_pic->data[2];
+    }
+
     bool ok = true;
     ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_plane), &(rd_priv_ctx->y_tex), &data_y);
     ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_plane), &(rd_priv_ctx->u_tex), &data_u);
@@ -460,11 +476,106 @@
     return !ok;
 }
 
+// Round x up to the next multiple of align; align must be a power of 2
+#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
+
+static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
+{ // Picture-allocator callback: backs all planes of p with one pl_buf
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    SDL_LockMutex(rd_priv_ctx->lock); // released at the err: label
+
+    const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
+    int ret = DAV1D_ERR(ENOMEM); // returned by every `goto err` path
+
+    // Plane layout copied from dav1d_default_picture_alloc (w/h padded to 128)
+    const int hbd = p->p.bpc > 8; // 1 for >8-bit samples; doubles the strides
+    const int aligned_w = ALIGN2(p->p.w, 128);
+    const int aligned_h = ALIGN2(p->p.h, 128);
+    const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
+    const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    p->stride[0] = aligned_w << hbd;
+    p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
+
+    // Align strides up to multiples of the GPU performance hints
+    p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
+    p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
+
+    // Aligning offsets to 4 also implicitly aligns to the texel size (1 or 2)
+    size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
+    const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
+    const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
+
+    // The extra DAV1D_PICTURE_ALIGNMENTs leave slack to brute force plane
+    // alignment below, even if the driver gives us insane alignments
+    const size_t pic_size = y_sz + 2 * uv_sz;
+    const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
+
+    // Refuse allocations larger than the GPU's max_xfer_size limit
+    if (total_size > gpu->limits.max_xfer_size) {
+        printf("alloc of %zu bytes exceeds limits\n", total_size);
+        goto err;
+    }
+
+    const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+        .type = PL_BUF_TEX_TRANSFER,
+        .host_mapped = true,
+        .size = total_size,
+        .memory_type = PL_BUF_MEM_HOST,
+        .user_data = p,
+    });
+
+    if (!buf) {
+        printf("alloc of GPU mapped buffer failed\n");
+        goto err;
+    }
+
+    assert(buf->data); // presumably guaranteed by .host_mapped = true; TODO confirm
+    uintptr_t base = (uintptr_t) buf->data, data[3];
+    data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
+    data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
+    data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
+
+    // Warn (without failing) if a plane offset is not a multiple of off_align
+    if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
+        data[1] - base != ALIGN2(data[1] - base, off_align) ||
+        data[2] - base != ALIGN2(data[2] - base, off_align))
+    {
+        printf("GPU buffer horribly misaligned, expect slowdown!\n");
+    }
+
+    p->allocator_data = (void *) buf; // read back by the upload/release callbacks
+    p->data[0] = (void *) data[0];
+    p->data[1] = (void *) data[1];
+    p->data[2] = (void *) data[2];
+    ret = 0;
+
+    // fall through: the success path also unlocks and returns below
+err:
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+    return ret;
+}
+
+static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
+{ // Picture-release callback: destroys the pl_buf held in pic->allocator_data
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    assert(pic->allocator_data); // the pl_buf stored by placebo_alloc_pic
+
+    SDL_LockMutex(rd_priv_ctx->lock);
+    const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
+    pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data); // takes the slot's address, not the handle
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+}
+
 static const Dav1dPlayRenderInfo renderer_info = {
     .create_renderer = placebo_renderer_create,
     .destroy_renderer = placebo_renderer_destroy,
     .render = placebo_render,
-    .update_frame = placebo_upload_planes
+    .update_frame = placebo_upload_planes,
+    .alloc_pic = placebo_alloc_pic,
+    .release_pic = placebo_release_pic,
 };
 
 #else
@@ -540,7 +651,8 @@
     SDL_UnlockMutex(rd_priv_ctx->lock);
 }
 
-static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic)
+static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
+                              const Dav1dPlaySettings *settings)
 {
     Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
     assert(rd_priv_ctx != NULL);
@@ -655,6 +767,7 @@
             " --framethreads $num:  number of frame threads (default: 1)\n"
             " --tilethreads $num:   number of tile threads (default: 1)\n"
             " --highquality:        enable high quality rendering\n"
+            " --zerocopy/-z:        enable zero copy upload path\n"
             " --version/-v:         print version and exit\n");
     exit(1);
 }
@@ -678,7 +791,7 @@
     Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
 
     // Short options
-    static const char short_opts[] = "i:vu";
+    static const char short_opts[] = "i:vuz";
 
     enum {
         ARG_FRAME_THREADS = 256,
@@ -694,6 +807,7 @@
         { "framethreads",   1, NULL, ARG_FRAME_THREADS },
         { "tilethreads",    1, NULL, ARG_TILE_THREADS },
         { "highquality",    0, NULL, ARG_HIGH_QUALITY },
+        { "zerocopy",       0, NULL, 'z' },
         { NULL,             0, NULL, 0 },
     };
 
@@ -714,6 +828,12 @@
                 fprintf(stderr, "warning: --highquality requires libplacebo\n");
 #endif
                 break;
+            case 'z':
+                settings->zerocopy = true;
+#ifndef HAVE_PLACEBO_VULKAN
+                fprintf(stderr, "warning: --zerocopy requires libplacebo\n");
+#endif
+                break;
             case ARG_FRAME_THREADS:
                 lib_settings->n_frame_threads =
                     parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]);
@@ -829,7 +949,7 @@
 static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
     Dav1dPicture *dav1d_pic)
 {
-    renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic);
+    renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
     rd_ctx->current_pts = dav1d_pic->m.timestamp;
 }
 
@@ -1067,6 +1187,18 @@
 
     // Parse and validate arguments
     dp_rd_ctx_parse_args(rd_ctx, argc, argv);
+
+    if (rd_ctx->settings.zerocopy) {
+        if (renderer_info.alloc_pic) {
+            rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) {
+                .cookie = rd_ctx->rd_priv,
+                .alloc_picture_callback = renderer_info.alloc_pic,
+                .release_picture_callback = renderer_info.release_pic,
+            };
+        } else {
+            fprintf(stderr, "--zerocopy unsupported by compiled renderer\n");
+        }
+    }
 
     // Start decoder thread
     decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx);