From 65979986a923a8f08019b257c3fe72cd5e8ecf68 Mon Sep 17 00:00:00 2001
From: Niklas Haas
Date: Thu, 14 Sep 2017 08:04:55 +0200
Subject: vo_opengl: refactor into vo_gpu

This is done in several steps:

1. refactor MPGLContext -> struct ra_ctx
2. move GL-specific stuff in vo_opengl into opengl/context.c
3. generalize context creation to support other APIs, and add --gpu-api
4. rename all of the --opengl- options that are no longer opengl-specific
5. move all of the stuff from opengl/* that isn't GL-specific into gpu/
   (note: opengl/gl_utils.h became opengl/utils.h)
6. rename vo_opengl to vo_gpu
7. to handle window screenshots, the short-term approach was to just add
   it to ra_swapchain_fns. Long term (and for vulkan) this has to be moved
   to ra itself (and vo_gpu altered to compensate), but this was a stop-gap
   measure to prevent this commit from getting too big
8. move ra->fns->flush to ra_gl_ctx instead
9. some other minor changes that I've probably already forgotten

Note: This is one half of a major refactor, the other half of which is
provided by rossy's following commit. This commit enables support for all
linux platforms, while his version enables support for all non-linux
platforms.

Note 2: vo_opengl_cb.c also re-uses ra_gl_ctx so it benefits from the
--opengl- options like --opengl-early-flush, --opengl-finish etc. Should be
a strict superset of the old functionality.

Disclaimer: Since I have no way of compiling mpv on all platforms, some of
these ports were done blindly. Specifically, the blind ports included
context_mali_fbdev.c and context_rpi.c. Since they're both based on
egl_helpers, the port should have gone smoothly without any major changes
required. But if somebody complains about a compile error on those
platforms (assuming anybody actually uses them), you know where to
complain.
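For orientation before the diff itself: the core of the new abstraction is the
ra_ctx / ra_swapchain pair declared in video/out/gpu/context.h below. The
following sketch is not part of the patch; it only uses functions and fields
this commit introduces, calls the swapchain fns directly, assumes the backend's
init() has filled in ctx->ra and ctx->swapchain, and leaves out error handling
and the actual rendering through ctx->ra:

    #include "video/out/gpu/context.h"

    // Illustrative only: in a real VO these steps are spread across the
    // init/reconfig/draw/uninit callbacks rather than one function.
    static void frame_sketch(struct vo *vo, struct vo_frame *frame)
    {
        struct ra_ctx_opts opts = { .swapchain_depth = 3 };

        // "auto"/"auto" = probe; otherwise --gpu-api / --gpu-context names.
        struct ra_ctx *ctx = ra_ctx_create(vo, "auto", "auto", opts);
        if (!ctx)
            return;

        ctx->fns->reconfig(ctx);                // create or resize the window

        struct ra_swapchain *sw = ctx->swapchain;
        struct ra_tex *fbo = sw->fns->start_frame(sw);
        if (fbo) {
            // ... render to fbo using the ctx->ra functions ...
            sw->fns->submit_frame(sw, frame);   // lockstep with start_frame
            sw->fns->swap_buffers(sw);          // blocks to honor swapchain_depth
        }

        ra_ctx_destroy(&ctx);
    }

The context_type argument corresponds to --gpu-api and context_name to
--gpu-context; passing "auto" for both makes ra_ctx_create() probe the
contexts[] table in the order it is declared.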
---
 video/out/gpu/context.c | 186 ++
 video/out/gpu/context.h | 95 +
 video/out/gpu/hwdec.c | 239 +++
 video/out/gpu/hwdec.h | 130 ++
 video/out/gpu/lcms.c | 531 +++++
 video/out/gpu/lcms.h | 43 +
 video/out/gpu/osd.c | 367 ++++
 video/out/gpu/osd.h | 25 +
 video/out/gpu/ra.c | 327 +++
 video/out/gpu/ra.h | 488 +++++
 video/out/gpu/shader_cache.c | 954 +++++++++
 video/out/gpu/shader_cache.h | 56 +
 video/out/gpu/user_shaders.c | 452 ++++
 video/out/gpu/user_shaders.h | 98 +
 video/out/gpu/utils.c | 372 ++++
 video/out/gpu/utils.h | 120 ++
 video/out/gpu/video.c | 3809 ++++++++++++++++++++++++++++++++
 video/out/gpu/video.h | 194 ++
 video/out/gpu/video_shaders.c | 872 ++++++++
 video/out/gpu/video_shaders.h | 56 +
 video/out/opengl/common.h | 4 +-
 video/out/opengl/context.c | 446 ++--
 video/out/opengl/context.h | 152 +-
 video/out/opengl/context_cocoa.c | 2 +-
 video/out/opengl/context_drm_egl.c | 194 +-
 video/out/opengl/context_glx.c | 376 ++++
 video/out/opengl/context_mali_fbdev.c | 58 +-
 video/out/opengl/context_rpi.c | 84 +-
 video/out/opengl/context_vdpau.c | 202 +-
 video/out/opengl/context_wayland.c | 74 +-
 video/out/opengl/context_x11.c | 358 ----
 video/out/opengl/context_x11egl.c | 84 +-
 video/out/opengl/egl_helpers.c | 114 +-
 video/out/opengl/egl_helpers.h | 19 +-
 video/out/opengl/formats.h | 1 -
 video/out/opengl/gl_utils.c | 291 ---
 video/out/opengl/gl_utils.h | 56 -
 video/out/opengl/hwdec.c | 239 ---
 video/out/opengl/hwdec.h | 130 --
 video/out/opengl/hwdec_cuda.c | 3 +-
 video/out/opengl/hwdec_ios.m | 2 +-
 video/out/opengl/hwdec_osx.c | 2 +-
 video/out/opengl/hwdec_rpi.c | 2 +-
 video/out/opengl/hwdec_vaegl.c | 4 +-
 video/out/opengl/hwdec_vaglx.c | 5 +-
 video/out/opengl/hwdec_vdpau.c | 2 +-
 video/out/opengl/lcms.c | 531 -----
 video/out/opengl/lcms.h | 43 -
 video/out/opengl/osd.c | 367 ----
 video/out/opengl/osd.h | 25 -
 video/out/opengl/ra.c | 327 ---
 video/out/opengl/ra.h | 491 -----
 video/out/opengl/ra_gl.c | 7 -
 video/out/opengl/ra_gl.h | 3 +-
 video/out/opengl/shader_cache.c | 955 ---------
 video/out/opengl/shader_cache.h | 56 -
 video/out/opengl/user_shaders.c | 452 ----
 video/out/opengl/user_shaders.h | 98 -
 video/out/opengl/utils.c | 524 ++---
 video/out/opengl/utils.h | 151 +-
 video/out/opengl/video.c | 3813 ---------------------------------
 video/out/opengl/video.h | 195 --
 video/out/opengl/video_shaders.c | 872 --------
 video/out/opengl/video_shaders.h | 56 -
 video/out/vo.c | 6 +-
 video/out/vo_gpu.c | 385 ++++
 video/out/vo_opengl.c | 470 ----
 video/out/vo_opengl_cb.c | 53 +-
 video/out/vo_rpi.c | 2 +-
 69 files changed, 11238 insertions(+), 10962 deletions(-)
 create mode 100644 video/out/gpu/context.c
 create mode 100644 video/out/gpu/context.h
 create mode 100644 video/out/gpu/hwdec.c
 create mode 100644 video/out/gpu/hwdec.h
 create mode 100644 video/out/gpu/lcms.c
 create mode 100644 video/out/gpu/lcms.h
 create mode 100644 video/out/gpu/osd.c
 create mode 100644 video/out/gpu/osd.h
 create mode 100644 video/out/gpu/ra.c
 create mode 100644 video/out/gpu/ra.h
 create mode 100644 video/out/gpu/shader_cache.c
 create mode 100644 video/out/gpu/shader_cache.h
 create mode 100644 video/out/gpu/user_shaders.c
 create mode 100644 video/out/gpu/user_shaders.h
 create mode 100644 video/out/gpu/utils.c
 create mode 100644 video/out/gpu/utils.h
 create mode 100644 video/out/gpu/video.c
 create mode 100644 video/out/gpu/video.h
 create mode 100644 video/out/gpu/video_shaders.c
 create mode 100644 video/out/gpu/video_shaders.h
 create mode 100644 video/out/opengl/context_glx.c
 delete mode 100644 video/out/opengl/context_x11.c
delete mode 100644 video/out/opengl/gl_utils.c delete mode 100644 video/out/opengl/gl_utils.h delete mode 100644 video/out/opengl/hwdec.c delete mode 100644 video/out/opengl/hwdec.h delete mode 100644 video/out/opengl/lcms.c delete mode 100644 video/out/opengl/lcms.h delete mode 100644 video/out/opengl/osd.c delete mode 100644 video/out/opengl/osd.h delete mode 100644 video/out/opengl/ra.c delete mode 100644 video/out/opengl/ra.h delete mode 100644 video/out/opengl/shader_cache.c delete mode 100644 video/out/opengl/shader_cache.h delete mode 100644 video/out/opengl/user_shaders.c delete mode 100644 video/out/opengl/user_shaders.h delete mode 100644 video/out/opengl/video.c delete mode 100644 video/out/opengl/video.h delete mode 100644 video/out/opengl/video_shaders.c delete mode 100644 video/out/opengl/video_shaders.h create mode 100644 video/out/vo_gpu.c delete mode 100644 video/out/vo_opengl.c (limited to 'video') diff --git a/video/out/gpu/context.c b/video/out/gpu/context.c new file mode 100644 index 0000000000..dbabba8b3b --- /dev/null +++ b/video/out/gpu/context.c @@ -0,0 +1,186 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "common/common.h" +#include "common/msg.h" +#include "options/options.h" +#include "options/m_option.h" +#include "video/out/vo.h" + +#include "context.h" + +extern const struct ra_ctx_fns ra_ctx_glx; +extern const struct ra_ctx_fns ra_ctx_glx_probe; +extern const struct ra_ctx_fns ra_ctx_x11_egl; +extern const struct ra_ctx_fns ra_ctx_drm_egl; +extern const struct ra_ctx_fns ra_ctx_cocoa; +extern const struct ra_ctx_fns ra_ctx_wayland_egl; +extern const struct ra_ctx_fns ra_ctx_wgl; +extern const struct ra_ctx_fns ra_ctx_angle; +extern const struct ra_ctx_fns ra_ctx_dxinterop; +extern const struct ra_ctx_fns ra_ctx_rpi; +extern const struct ra_ctx_fns ra_ctx_mali; +extern const struct ra_ctx_fns ra_ctx_vdpauglx; + +static const struct ra_ctx_fns *contexts[] = { +// OpenGL contexts: +#if HAVE_RPI + &ra_ctx_rpi, +#endif +/* +#if HAVE_GL_COCOA + &ra_ctx_cocoa, +#endif +#if HAVE_EGL_ANGLE_WIN32 + &ra_ctx_angle, +#endif +#if HAVE_GL_WIN32 + &ra_ctx_wgl, +#endif +#if HAVE_GL_DXINTEROP + &ra_ctx_dxinterop, +#endif +*/ +#if HAVE_GL_X11 + &ra_ctx_glx_probe, +#endif +#if HAVE_EGL_X11 + &ra_ctx_x11_egl, +#endif +#if HAVE_GL_X11 + &ra_ctx_glx, +#endif +#if HAVE_GL_WAYLAND + &ra_ctx_wayland_egl, +#endif +#if HAVE_EGL_DRM + &ra_ctx_drm_egl, +#endif +#if HAVE_MALI_FBDEV + &ra_ctx_mali, +#endif +#if HAVE_VDPAU_GL_X11 + &ra_ctx_vdpauglx, +#endif +}; + +static bool get_help(struct mp_log *log, struct bstr param) +{ + if (bstr_equals0(param, "help")) { + mp_info(log, "GPU contexts / APIs:\n"); + mp_info(log, " auto (autodetect)\n"); + for (int n = 0; n < MP_ARRAY_SIZE(contexts); n++) + mp_info(log, " %s (%s)\n", contexts[n]->name, contexts[n]->type); + return true; + } 
+ + return false; +} + +int ra_ctx_validate_api(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + if (get_help(log, param)) + return M_OPT_EXIT; + if (bstr_equals0(param, "auto")) + return 1; + for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) { + if (bstr_equals0(param, contexts[i]->type)) + return 1; + } + return M_OPT_INVALID; +} + +int ra_ctx_validate_context(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + if (get_help(log, param)) + return M_OPT_EXIT; + if (bstr_equals0(param, "auto")) + return 1; + for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) { + if (bstr_equals0(param, contexts[i]->name)) + return 1; + } + return M_OPT_INVALID; +} + +// Create a VO window and create a RA context on it. +// vo_flags: passed to the backend's create window function +struct ra_ctx *ra_ctx_create(struct vo *vo, const char *context_type, + const char *context_name, struct ra_ctx_opts opts) +{ + bool api_auto = !context_type || strcmp(context_type, "auto") == 0; + bool ctx_auto = !context_name || strcmp(context_name, "auto") == 0; + + if (ctx_auto) { + MP_VERBOSE(vo, "Probing for best GPU context.\n"); + opts.probing = true; + } + + // Hack to silence backend (X11/Wayland/etc.) errors. Kill it once backends + // are separate from `struct vo` + bool old_probing = vo->probing; + vo->probing = opts.probing; + + for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) { + if (!opts.probing && strcmp(contexts[i]->name, context_name) != 0) + continue; + if (!api_auto && strcmp(contexts[i]->type, context_type) != 0) + continue; + + struct ra_ctx *ctx = talloc_ptrtype(NULL, ctx); + *ctx = (struct ra_ctx) { + .vo = vo, + .global = vo->global, + .log = mp_log_new(ctx, vo->log, contexts[i]->type), + .opts = opts, + .fns = contexts[i], + }; + + MP_VERBOSE(ctx, "Initializing GPU context '%s'\n", ctx->fns->name); + if (contexts[i]->init(ctx)) { + vo->probing = old_probing; + return ctx; + } + + talloc_free(ctx); + } + + // If we've reached this point, then none of the contexts matched the name + // requested, or the backend creation failed for all of them. + MP_ERR(vo, "Failed initializing any suitable GPU context!\n"); + vo->probing = old_probing; + return NULL; +} + +void ra_ctx_destroy(struct ra_ctx **ctx) +{ + if (*ctx) + (*ctx)->fns->uninit(*ctx); + talloc_free(*ctx); + *ctx = NULL; +} diff --git a/video/out/gpu/context.h b/video/out/gpu/context.h new file mode 100644 index 0000000000..42de59b75f --- /dev/null +++ b/video/out/gpu/context.h @@ -0,0 +1,95 @@ +#pragma once + +#include "video/out/vo.h" + +#include "config.h" +#include "ra.h" + +struct ra_ctx_opts { + int allow_sw; // allow software renderers + int want_alpha; // create an alpha framebuffer if possible + int debug; // enable debugging layers/callbacks etc. + bool probing; // the backend was auto-probed + int swapchain_depth; // max number of images to render ahead +}; + +struct ra_ctx { + struct vo *vo; + struct ra *ra; + struct mpv_global *global; + struct mp_log *log; + + struct ra_ctx_opts opts; + const struct ra_ctx_fns *fns; + struct ra_swapchain *swapchain; + + void *priv; +}; + +// The functions that make up a ra_ctx. +struct ra_ctx_fns { + const char *type; // API type (for --gpu-api) + const char *name; // name (for --gpu-context) + + // Resize the window, or create a new window if there isn't one yet. + // Currently, there is an unfortunate interaction with ctx->vo, and + // display size etc. are determined by it. 
+ bool (*reconfig)(struct ra_ctx *ctx); + + // This behaves exactly like vo_driver.control(). + int (*control)(struct ra_ctx *ctx, int *events, int request, void *arg); + + // These behave exactly like vo_driver.wakeup/wait_events. They are + // optional. + void (*wakeup)(struct ra_ctx *ctx); + void (*wait_events)(struct ra_ctx *ctx, int64_t until_time_us); + + // Initialize/destroy the 'struct ra' and possibly the underlying VO backend. + // Not normally called by the user of the ra_ctx. + bool (*init)(struct ra_ctx *ctx); + void (*uninit)(struct ra_ctx *ctx); +}; + +// Extra struct for the swapchain-related functions so they can be easily +// inherited from helpers. +struct ra_swapchain { + struct ra_ctx *ctx; + struct priv *priv; + const struct ra_swapchain_fns *fns; + + bool flip_v; // flip the rendered image vertically (set by the swapchain) +}; + +struct ra_swapchain_fns { + // Gets the current framebuffer depth in bits (0 if unknown). Optional. + int (*color_depth)(struct ra_swapchain *sw); + + // Retrieves a screenshot of the framebuffer. These are always the right + // side up, regardless of ra_swapchain->flip_v. Optional. + struct mp_image *(*screenshot)(struct ra_swapchain *sw); + + // Called when rendering starts. Returns NULL on failure. This must be + // followed by submit_frame, to submit the rendered frame. + struct ra_tex *(*start_frame)(struct ra_swapchain *sw); + + // Present the frame. Issued in lockstep with start_frame, with rendering + // commands in between. The `frame` is just there for timing data, for + // swapchains smart enough to do something with it. + bool (*submit_frame)(struct ra_swapchain *sw, const struct vo_frame *frame); + + // Performs a buffer swap. This blocks for as long as necessary to meet + // params.swapchain_depth, or until the next vblank (for vsynced contexts) + void (*swap_buffers)(struct ra_swapchain *sw); +}; + +// Create and destroy a ra_ctx. This also takes care of creating and destroying +// the underlying `struct ra`, and perhaps the underlying VO backend. +struct ra_ctx *ra_ctx_create(struct vo *vo, const char *context_type, + const char *context_name, struct ra_ctx_opts opts); +void ra_ctx_destroy(struct ra_ctx **ctx); + +struct m_option; +int ra_ctx_validate_api(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param); +int ra_ctx_validate_context(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param); diff --git a/video/out/gpu/hwdec.c b/video/out/gpu/hwdec.c new file mode 100644 index 0000000000..5fbc1aa4a9 --- /dev/null +++ b/video/out/gpu/hwdec.c @@ -0,0 +1,239 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . 
+ */ + +#include +#include + +#include "config.h" + +#include "common/common.h" +#include "common/msg.h" +#include "options/m_config.h" +#include "hwdec.h" + +extern const struct ra_hwdec_driver ra_hwdec_vaegl; +extern const struct ra_hwdec_driver ra_hwdec_vaglx; +extern const struct ra_hwdec_driver ra_hwdec_videotoolbox; +extern const struct ra_hwdec_driver ra_hwdec_vdpau; +extern const struct ra_hwdec_driver ra_hwdec_dxva2egl; +extern const struct ra_hwdec_driver ra_hwdec_d3d11egl; +extern const struct ra_hwdec_driver ra_hwdec_d3d11eglrgb; +extern const struct ra_hwdec_driver ra_hwdec_dxva2gldx; +extern const struct ra_hwdec_driver ra_hwdec_dxva2; +extern const struct ra_hwdec_driver ra_hwdec_cuda; +extern const struct ra_hwdec_driver ra_hwdec_rpi_overlay; + +static const struct ra_hwdec_driver *const mpgl_hwdec_drivers[] = { +#if HAVE_VAAPI_EGL + &ra_hwdec_vaegl, +#endif +#if HAVE_VAAPI_GLX + &ra_hwdec_vaglx, +#endif +#if HAVE_VDPAU_GL_X11 + &ra_hwdec_vdpau, +#endif +#if HAVE_VIDEOTOOLBOX_GL || HAVE_IOS_GL + &ra_hwdec_videotoolbox, +#endif +#if HAVE_D3D_HWACCEL + &ra_hwdec_d3d11egl, + &ra_hwdec_d3d11eglrgb, + #if HAVE_D3D9_HWACCEL + &ra_hwdec_dxva2egl, + #endif +#endif +#if HAVE_GL_DXINTEROP_D3D9 + &ra_hwdec_dxva2gldx, +#endif +#if HAVE_CUDA_HWACCEL + &ra_hwdec_cuda, +#endif +#if HAVE_RPI + &ra_hwdec_rpi_overlay, +#endif + NULL +}; + +static struct ra_hwdec *load_hwdec_driver(struct mp_log *log, struct ra *ra, + struct mpv_global *global, + struct mp_hwdec_devices *devs, + const struct ra_hwdec_driver *drv, + bool is_auto) +{ + struct ra_hwdec *hwdec = talloc(NULL, struct ra_hwdec); + *hwdec = (struct ra_hwdec) { + .driver = drv, + .log = mp_log_new(hwdec, log, drv->name), + .global = global, + .ra = ra, + .devs = devs, + .probing = is_auto, + .priv = talloc_zero_size(hwdec, drv->priv_size), + }; + mp_verbose(log, "Loading hwdec driver '%s'\n", drv->name); + if (hwdec->driver->init(hwdec) < 0) { + ra_hwdec_uninit(hwdec); + mp_verbose(log, "Loading failed.\n"); + return NULL; + } + return hwdec; +} + +struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, + struct mpv_global *g, + struct mp_hwdec_devices *devs, + enum hwdec_type api) +{ + bool is_auto = HWDEC_IS_AUTO(api); + for (int n = 0; mpgl_hwdec_drivers[n]; n++) { + const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; + if ((is_auto || api == drv->api) && !drv->testing_only) { + struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, is_auto); + if (r) + return r; + } + } + return NULL; +} + +// Load by option name. 
+struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, + struct mpv_global *g, + struct mp_hwdec_devices *devs, + const char *name) +{ + int g_hwdec_api; + mp_read_option_raw(g, "hwdec", &m_option_type_choice, &g_hwdec_api); + if (!name || !name[0]) + name = m_opt_choice_str(mp_hwdec_names, g_hwdec_api); + + int api_id = HWDEC_NONE; + for (int n = 0; mp_hwdec_names[n].name; n++) { + if (name && strcmp(mp_hwdec_names[n].name, name) == 0) + api_id = mp_hwdec_names[n].value; + } + + for (int n = 0; mpgl_hwdec_drivers[n]; n++) { + const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; + if (name && strcmp(drv->name, name) == 0) { + struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, false); + if (r) + return r; + } + } + + return ra_hwdec_load_api(log, ra, g, devs, api_id); +} + +int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + bool help = bstr_equals0(param, "help"); + if (help) + mp_info(log, "Available hwdecs:\n"); + for (int n = 0; mpgl_hwdec_drivers[n]; n++) { + const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; + const char *api_name = m_opt_choice_str(mp_hwdec_names, drv->api); + if (help) { + mp_info(log, " %s [%s]\n", drv->name, api_name); + } else if (bstr_equals0(param, drv->name) || + bstr_equals0(param, api_name)) + { + return 1; + } + } + if (help) { + mp_info(log, " auto (loads best)\n" + " (other --hwdec values)\n" + "Setting an empty string means use --hwdec.\n"); + return M_OPT_EXIT; + } + if (!param.len) + return 1; // "" is treated specially + for (int n = 0; mp_hwdec_names[n].name; n++) { + if (bstr_equals0(param, mp_hwdec_names[n].name)) + return 1; + } + mp_fatal(log, "No hwdec backend named '%.*s' found!\n", BSTR_P(param)); + return M_OPT_INVALID; +} + +void ra_hwdec_uninit(struct ra_hwdec *hwdec) +{ + if (hwdec) + hwdec->driver->uninit(hwdec); + talloc_free(hwdec); +} + +bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt) +{ + for (int n = 0; hwdec->driver->imgfmts[n]; n++) { + if (hwdec->driver->imgfmts[n] == imgfmt) + return true; + } + return false; +} + +struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec, + struct mp_image_params *params) +{ + assert(ra_hwdec_test_format(hwdec, params->imgfmt)); + + struct ra_hwdec_mapper *mapper = talloc_ptrtype(NULL, mapper); + *mapper = (struct ra_hwdec_mapper){ + .owner = hwdec, + .driver = hwdec->driver->mapper, + .log = hwdec->log, + .ra = hwdec->ra, + .priv = talloc_zero_size(mapper, hwdec->driver->mapper->priv_size), + .src_params = *params, + .dst_params = *params, + }; + if (mapper->driver->init(mapper) < 0) + ra_hwdec_mapper_free(&mapper); + return mapper; +} + +void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper) +{ + struct ra_hwdec_mapper *p = *mapper; + if (p) { + ra_hwdec_mapper_unmap(p); + p->driver->uninit(p); + talloc_free(p); + } + *mapper = NULL; +} + +void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper) +{ + if (mapper->driver->unmap) + mapper->driver->unmap(mapper); + mp_image_unrefp(&mapper->src); +} + +int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img) +{ + ra_hwdec_mapper_unmap(mapper); + mp_image_setrefp(&mapper->src, img); + if (mapper->driver->map(mapper) < 0) { + ra_hwdec_mapper_unmap(mapper); + return -1; + } + return 0; +} diff --git a/video/out/gpu/hwdec.h b/video/out/gpu/hwdec.h new file mode 100644 index 0000000000..20bbaae9eb --- /dev/null +++ b/video/out/gpu/hwdec.h @@ -0,0 +1,130 @@ +#ifndef MPGL_HWDEC_H_ +#define MPGL_HWDEC_H_ + 
+#include "video/mp_image.h" +#include "ra.h" +#include "video/hwdec.h" + +struct ra_hwdec { + const struct ra_hwdec_driver *driver; + struct mp_log *log; + struct mpv_global *global; + struct ra *ra; + struct mp_hwdec_devices *devs; + // GLSL extensions required to sample textures from this. + const char **glsl_extensions; + // For free use by hwdec driver + void *priv; + // For working around the vdpau vs. vaapi mess. + bool probing; + // Used in overlay mode only. + float overlay_colorkey[4]; +}; + +struct ra_hwdec_mapper { + const struct ra_hwdec_mapper_driver *driver; + struct mp_log *log; + struct ra *ra; + void *priv; + struct ra_hwdec *owner; + // Input frame parameters. (Set before init(), immutable.) + struct mp_image_params src_params; + // Output frame parameters (represents the format the textures return). Must + // be set by init(), immutable afterwards, + struct mp_image_params dst_params; + + // The currently mapped source image (or the image about to be mapped in + // ->map()). NULL if unmapped. The mapper can also clear this reference if + // the mapped textures contain a full copy. + struct mp_image *src; + + // The mapped textures and metadata about them. These fields change if a + // new frame is mapped (or unmapped), but otherwise remain constant. + // The common code won't mess with these, so you can e.g. set them in the + // .init() callback. + struct ra_tex *tex[4]; + bool vdpau_fields; +}; + +// This can be used to map frames of a specific hw format as GL textures. +struct ra_hwdec_mapper_driver { + // Used to create ra_hwdec_mapper.priv. + size_t priv_size; + + // Init the mapper implementation. At this point, the field src_params, + // fns, devs, priv are initialized. + int (*init)(struct ra_hwdec_mapper *mapper); + // Destroy the mapper. unmap is called before this. + void (*uninit)(struct ra_hwdec_mapper *mapper); + + // Map mapper->src as texture, and set mapper->frame to textures using it. + // It is expected that that the textures remain valid until the next unmap + // or uninit call. + // The function is allowed to unref mapper->src if it's not needed (i.e. + // this function creates a copy). + // The underlying format can change, so you might need to do some form + // of change detection. You also must reject unsupported formats with an + // error. + // On error, returns negative value on error and remains unmapped. + int (*map)(struct ra_hwdec_mapper *mapper); + // Unmap the frame. Does nothing if already unmapped. Optional. + void (*unmap)(struct ra_hwdec_mapper *mapper); +}; + +struct ra_hwdec_driver { + // Name of the interop backend. This is used for informational purposes only. + const char *name; + // Used to create ra_hwdec.priv. + size_t priv_size; + // Used to explicitly request a specific API. + enum hwdec_type api; + // One of the hardware surface IMGFMT_ that must be passed to map_image later. + // Terminated with a 0 entry. (Extend the array size as needed.) + const int imgfmts[3]; + // Dosn't load this unless requested by name. + bool testing_only; + + // Create the hwdec device. It must add it to hw->devs, if applicable. + int (*init)(struct ra_hwdec *hw); + void (*uninit)(struct ra_hwdec *hw); + + // This will be used to create a ra_hwdec_mapper from ra_hwdec. + const struct ra_hwdec_mapper_driver *mapper; + + // The following function provides an alternative API. Each ra_hwdec_driver + // must have either provide a mapper or overlay_frame (not both or none), and + // if overlay_frame is set, it operates in overlay mode. 
In this mode, + // OSD etc. is rendered via OpenGL, but the video is rendered as a separate + // layer below it. + // Non-overlay mode is strictly preferred, so try not to use overlay mode. + // Set the given frame as overlay, replacing the previous one. This can also + // just change the position of the overlay. + // hw_image==src==dst==NULL is passed to clear the overlay. + int (*overlay_frame)(struct ra_hwdec *hw, struct mp_image *hw_image, + struct mp_rect *src, struct mp_rect *dst, bool newframe); +}; + +struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, + struct mpv_global *g, + struct mp_hwdec_devices *devs, + enum hwdec_type api); + +struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, + struct mpv_global *g, + struct mp_hwdec_devices *devs, + const char *name); + +int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param); + +void ra_hwdec_uninit(struct ra_hwdec *hwdec); + +bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt); + +struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec, + struct mp_image_params *params); +void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper); +void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper); +int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img); + +#endif diff --git a/video/out/gpu/lcms.c b/video/out/gpu/lcms.c new file mode 100644 index 0000000000..8747ae6aa6 --- /dev/null +++ b/video/out/gpu/lcms.c @@ -0,0 +1,531 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . 
+ */ + +#include +#include + +#include "mpv_talloc.h" + +#include "config.h" + +#include "stream/stream.h" +#include "common/common.h" +#include "misc/bstr.h" +#include "common/msg.h" +#include "options/m_option.h" +#include "options/path.h" +#include "video/csputils.h" +#include "lcms.h" + +#include "osdep/io.h" + +#if HAVE_LCMS2 + +#include +#include +#include + +struct gl_lcms { + void *icc_data; + size_t icc_size; + struct AVBufferRef *vid_profile; + char *current_profile; + bool using_memory_profile; + bool changed; + enum mp_csp_prim current_prim; + enum mp_csp_trc current_trc; + + struct mp_log *log; + struct mpv_global *global; + struct mp_icc_opts *opts; +}; + +static bool parse_3dlut_size(const char *arg, int *p1, int *p2, int *p3) +{ + if (sscanf(arg, "%dx%dx%d", p1, p2, p3) != 3) + return false; + for (int n = 0; n < 3; n++) { + int s = ((int[]) { *p1, *p2, *p3 })[n]; + if (s < 2 || s > 512) + return false; + } + return true; +} + +static int validate_3dlut_size_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + int p1, p2, p3; + char s[20]; + snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); + return parse_3dlut_size(s, &p1, &p2, &p3); +} + +#define OPT_BASE_STRUCT struct mp_icc_opts +const struct m_sub_options mp_icc_conf = { + .opts = (const m_option_t[]) { + OPT_FLAG("use-embedded-icc-profile", use_embedded, 0), + OPT_STRING("icc-profile", profile, M_OPT_FILE), + OPT_FLAG("icc-profile-auto", profile_auto, 0), + OPT_STRING("icc-cache-dir", cache_dir, M_OPT_FILE), + OPT_INT("icc-intent", intent, 0), + OPT_INTRANGE("icc-contrast", contrast, 0, 0, 100000), + OPT_STRING_VALIDATE("icc-3dlut-size", size_str, 0, validate_3dlut_size_opt), + + OPT_REPLACED("3dlut-size", "icc-3dlut-size"), + OPT_REMOVED("icc-cache", "see icc-cache-dir"), + {0} + }, + .size = sizeof(struct mp_icc_opts), + .defaults = &(const struct mp_icc_opts) { + .size_str = "64x64x64", + .intent = INTENT_RELATIVE_COLORIMETRIC, + .use_embedded = true, + }, +}; + +static void lcms2_error_handler(cmsContext ctx, cmsUInt32Number code, + const char *msg) +{ + struct gl_lcms *p = cmsGetContextUserData(ctx); + MP_ERR(p, "lcms2: %s\n", msg); +} + +static void load_profile(struct gl_lcms *p) +{ + talloc_free(p->icc_data); + p->icc_data = NULL; + p->icc_size = 0; + p->using_memory_profile = false; + talloc_free(p->current_profile); + p->current_profile = NULL; + + if (!p->opts->profile || !p->opts->profile[0]) + return; + + char *fname = mp_get_user_path(NULL, p->global, p->opts->profile); + MP_VERBOSE(p, "Opening ICC profile '%s'\n", fname); + struct bstr iccdata = stream_read_file(fname, p, p->global, + 100000000); // 100 MB + talloc_free(fname); + if (!iccdata.len) + return; + + talloc_free(p->icc_data); + + p->icc_data = iccdata.start; + p->icc_size = iccdata.len; + p->current_profile = talloc_strdup(p, p->opts->profile); +} + +static void gl_lcms_destructor(void *ptr) +{ + struct gl_lcms *p = ptr; + av_buffer_unref(&p->vid_profile); +} + +struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, + struct mpv_global *global, + struct mp_icc_opts *opts) +{ + struct gl_lcms *p = talloc_ptrtype(talloc_ctx, p); + talloc_set_destructor(p, gl_lcms_destructor); + *p = (struct gl_lcms) { + .global = global, + .log = log, + .opts = opts, + }; + gl_lcms_update_options(p); + return p; +} + +void gl_lcms_update_options(struct gl_lcms *p) +{ + if ((p->using_memory_profile && !p->opts->profile_auto) || + !bstr_equals(bstr0(p->opts->profile), bstr0(p->current_profile))) + { + 
load_profile(p); + } + + p->changed = true; // probably +} + +// Warning: profile.start must point to a ta allocation, and the function +// takes over ownership. +// Returns whether the internal profile was changed. +bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) +{ + if (!p->opts->profile_auto || (p->opts->profile && p->opts->profile[0])) { + talloc_free(profile.start); + return false; + } + + if (p->using_memory_profile && + p->icc_data && profile.start && + profile.len == p->icc_size && + memcmp(profile.start, p->icc_data, p->icc_size) == 0) + { + talloc_free(profile.start); + return false; + } + + p->changed = true; + p->using_memory_profile = true; + + talloc_free(p->icc_data); + + p->icc_data = talloc_steal(p, profile.start); + p->icc_size = profile.len; + + return true; +} + +// Guards against NULL and uses bstr_equals to short-circuit some special cases +static bool vid_profile_eq(struct AVBufferRef *a, struct AVBufferRef *b) +{ + if (!a || !b) + return a == b; + + return bstr_equals((struct bstr){ a->data, a->size }, + (struct bstr){ b->data, b->size }); +} + +// Return whether the profile or config has changed since the last time it was +// retrieved. If it has changed, gl_lcms_get_lut3d() should be called. +bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, + enum mp_csp_trc trc, struct AVBufferRef *vid_profile) +{ + if (p->changed || p->current_prim != prim || p->current_trc != trc) + return true; + + return !vid_profile_eq(p->vid_profile, vid_profile); +} + +// Whether a profile is set. (gl_lcms_get_lut3d() is expected to return a lut, +// but it could still fail due to runtime errors, such as invalid icc data.) +bool gl_lcms_has_profile(struct gl_lcms *p) +{ + return p->icc_size > 0; +} + +static cmsHPROFILE get_vid_profile(struct gl_lcms *p, cmsContext cms, + cmsHPROFILE disp_profile, + enum mp_csp_prim prim, enum mp_csp_trc trc) +{ + if (p->opts->use_embedded && p->vid_profile) { + // Try using the embedded ICC profile + cmsHPROFILE prof = cmsOpenProfileFromMemTHR(cms, p->vid_profile->data, + p->vid_profile->size); + if (prof) { + MP_VERBOSE(p, "Successfully opened embedded ICC profile\n"); + return prof; + } + + // Otherwise, warn the user and generate the profile as usual + MP_WARN(p, "Video contained an invalid ICC profile! 
Ignoring..\n"); + } + + // The input profile for the transformation is dependent on the video + // primaries and transfer characteristics + struct mp_csp_primaries csp = mp_get_csp_primaries(prim); + cmsCIExyY wp_xyY = {csp.white.x, csp.white.y, 1.0}; + cmsCIExyYTRIPLE prim_xyY = { + .Red = {csp.red.x, csp.red.y, 1.0}, + .Green = {csp.green.x, csp.green.y, 1.0}, + .Blue = {csp.blue.x, csp.blue.y, 1.0}, + }; + + cmsToneCurve *tonecurve[3] = {0}; + switch (trc) { + case MP_CSP_TRC_LINEAR: tonecurve[0] = cmsBuildGamma(cms, 1.0); break; + case MP_CSP_TRC_GAMMA18: tonecurve[0] = cmsBuildGamma(cms, 1.8); break; + case MP_CSP_TRC_GAMMA22: tonecurve[0] = cmsBuildGamma(cms, 2.2); break; + case MP_CSP_TRC_GAMMA28: tonecurve[0] = cmsBuildGamma(cms, 2.8); break; + + case MP_CSP_TRC_SRGB: + // Values copied from Little-CMS + tonecurve[0] = cmsBuildParametricToneCurve(cms, 4, + (double[5]){2.40, 1/1.055, 0.055/1.055, 1/12.92, 0.04045}); + break; + + case MP_CSP_TRC_PRO_PHOTO: + tonecurve[0] = cmsBuildParametricToneCurve(cms, 4, + (double[5]){1.8, 1.0, 0.0, 1/16.0, 0.03125}); + break; + + case MP_CSP_TRC_BT_1886: { + // To build an appropriate BT.1886 transformation we need access to + // the display's black point, so we LittleCMS' detection function. + // Relative colorimetric is used since we want to approximate the + // BT.1886 to the target device's actual black point even in e.g. + // perceptual mode + const int intent = MP_INTENT_RELATIVE_COLORIMETRIC; + cmsCIEXYZ bp_XYZ; + if (!cmsDetectBlackPoint(&bp_XYZ, disp_profile, intent, 0)) + return false; + + // Map this XYZ value back into the (linear) source space + cmsToneCurve *linear = cmsBuildGamma(cms, 1.0); + cmsHPROFILE rev_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY, + (cmsToneCurve*[3]){linear, linear, linear}); + cmsHPROFILE xyz_profile = cmsCreateXYZProfile(); + cmsHTRANSFORM xyz2src = cmsCreateTransformTHR(cms, + xyz_profile, TYPE_XYZ_DBL, rev_profile, TYPE_RGB_DBL, + intent, 0); + cmsFreeToneCurve(linear); + cmsCloseProfile(rev_profile); + cmsCloseProfile(xyz_profile); + if (!xyz2src) + return false; + + double src_black[3]; + cmsDoTransform(xyz2src, &bp_XYZ, src_black, 1); + cmsDeleteTransform(xyz2src); + + // Contrast limiting + if (p->opts->contrast > 0) { + for (int i = 0; i < 3; i++) + src_black[i] = MPMAX(src_black[i], 1.0 / p->opts->contrast); + } + + // Built-in contrast failsafe + double contrast = 3.0 / (src_black[0] + src_black[1] + src_black[2]); + if (contrast > 100000) { + MP_WARN(p, "ICC profile detected contrast very high (>100000)," + " falling back to contrast 1000 for sanity. 
Set the" + " icc-contrast option to silence this warning.\n"); + src_black[0] = src_black[1] = src_black[2] = 1.0 / 1000; + } + + // Build the parametric BT.1886 transfer curve, one per channel + for (int i = 0; i < 3; i++) { + const double gamma = 2.40; + double binv = pow(src_black[i], 1.0/gamma); + tonecurve[i] = cmsBuildParametricToneCurve(cms, 6, + (double[4]){gamma, 1.0 - binv, binv, 0.0}); + } + break; + } + + default: + abort(); + } + + if (!tonecurve[0]) + return false; + + if (!tonecurve[1]) tonecurve[1] = tonecurve[0]; + if (!tonecurve[2]) tonecurve[2] = tonecurve[0]; + + cmsHPROFILE *vid_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY, + tonecurve); + + if (tonecurve[2] != tonecurve[0]) cmsFreeToneCurve(tonecurve[2]); + if (tonecurve[1] != tonecurve[0]) cmsFreeToneCurve(tonecurve[1]); + cmsFreeToneCurve(tonecurve[0]); + + return vid_profile; +} + +bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d, + enum mp_csp_prim prim, enum mp_csp_trc trc, + struct AVBufferRef *vid_profile) +{ + int s_r, s_g, s_b; + bool result = false; + + p->changed = false; + p->current_prim = prim; + p->current_trc = trc; + + // We need to hold on to a reference to the video's ICC profile for as long + // as we still need to perform equality checking, so generate a new + // reference here + av_buffer_unref(&p->vid_profile); + if (vid_profile) { + MP_VERBOSE(p, "Got an embedded ICC profile.\n"); + p->vid_profile = av_buffer_ref(vid_profile); + if (!p->vid_profile) + abort(); + } + + if (!parse_3dlut_size(p->opts->size_str, &s_r, &s_g, &s_b)) + return false; + + if (!gl_lcms_has_profile(p)) + return false; + + void *tmp = talloc_new(NULL); + uint16_t *output = talloc_array(tmp, uint16_t, s_r * s_g * s_b * 4); + struct lut3d *lut = NULL; + cmsContext cms = NULL; + + char *cache_file = NULL; + if (p->opts->cache_dir && p->opts->cache_dir[0]) { + // Gamma is included in the header to help uniquely identify it, + // because we may change the parameter in the future or make it + // customizable, same for the primaries. 
+ char *cache_info = talloc_asprintf(tmp, + "ver=1.4, intent=%d, size=%dx%dx%d, prim=%d, trc=%d, " + "contrast=%d\n", + p->opts->intent, s_r, s_g, s_b, prim, trc, p->opts->contrast); + + uint8_t hash[32]; + struct AVSHA *sha = av_sha_alloc(); + if (!sha) + abort(); + av_sha_init(sha, 256); + av_sha_update(sha, cache_info, strlen(cache_info)); + if (vid_profile) + av_sha_update(sha, vid_profile->data, vid_profile->size); + av_sha_update(sha, p->icc_data, p->icc_size); + av_sha_final(sha, hash); + av_free(sha); + + char *cache_dir = mp_get_user_path(tmp, p->global, p->opts->cache_dir); + cache_file = talloc_strdup(tmp, ""); + for (int i = 0; i < sizeof(hash); i++) + cache_file = talloc_asprintf_append(cache_file, "%02X", hash[i]); + cache_file = mp_path_join(tmp, cache_dir, cache_file); + + mp_mkdirp(cache_dir); + } + + // check cache + if (cache_file && stat(cache_file, &(struct stat){0}) == 0) { + MP_VERBOSE(p, "Opening 3D LUT cache in file '%s'.\n", cache_file); + struct bstr cachedata = stream_read_file(cache_file, tmp, p->global, + 1000000000); // 1 GB + if (cachedata.len == talloc_get_size(output)) { + memcpy(output, cachedata.start, cachedata.len); + goto done; + } else { + MP_WARN(p, "3D LUT cache invalid!\n"); + } + } + + cms = cmsCreateContext(NULL, p); + if (!cms) + goto error_exit; + cmsSetLogErrorHandlerTHR(cms, lcms2_error_handler); + + cmsHPROFILE profile = + cmsOpenProfileFromMemTHR(cms, p->icc_data, p->icc_size); + if (!profile) + goto error_exit; + + cmsHPROFILE vid_hprofile = get_vid_profile(p, cms, profile, prim, trc); + if (!vid_hprofile) { + cmsCloseProfile(profile); + goto error_exit; + } + + cmsHTRANSFORM trafo = cmsCreateTransformTHR(cms, vid_hprofile, TYPE_RGB_16, + profile, TYPE_RGBA_16, + p->opts->intent, + cmsFLAGS_HIGHRESPRECALC | + cmsFLAGS_BLACKPOINTCOMPENSATION); + cmsCloseProfile(profile); + cmsCloseProfile(vid_hprofile); + + if (!trafo) + goto error_exit; + + // transform a (s_r)x(s_g)x(s_b) cube, with 3 components per channel + uint16_t *input = talloc_array(tmp, uint16_t, s_r * 3); + for (int b = 0; b < s_b; b++) { + for (int g = 0; g < s_g; g++) { + for (int r = 0; r < s_r; r++) { + input[r * 3 + 0] = r * 65535 / (s_r - 1); + input[r * 3 + 1] = g * 65535 / (s_g - 1); + input[r * 3 + 2] = b * 65535 / (s_b - 1); + } + size_t base = (b * s_r * s_g + g * s_r) * 4; + cmsDoTransform(trafo, input, output + base, s_r); + } + } + + cmsDeleteTransform(trafo); + + if (cache_file) { + FILE *out = fopen(cache_file, "wb"); + if (out) { + fwrite(output, talloc_get_size(output), 1, out); + fclose(out); + } + } + +done: ; + + lut = talloc_ptrtype(NULL, lut); + *lut = (struct lut3d) { + .data = talloc_steal(lut, output), + .size = {s_r, s_g, s_b}, + }; + + *result_lut3d = lut; + result = true; + +error_exit: + + if (cms) + cmsDeleteContext(cms); + + if (!lut) + MP_FATAL(p, "Error loading ICC profile.\n"); + + talloc_free(tmp); + return result; +} + +#else /* HAVE_LCMS2 */ + +const struct m_sub_options mp_icc_conf = { + .opts = (const m_option_t[]) { {0} }, + .size = sizeof(struct mp_icc_opts), + .defaults = &(const struct mp_icc_opts) {0}, +}; + +struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, + struct mpv_global *global, + struct mp_icc_opts *opts) +{ + return (struct gl_lcms *) talloc_new(talloc_ctx); +} + +void gl_lcms_update_options(struct gl_lcms *p) { } +bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) {return false;} + +bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, + enum mp_csp_trc trc, struct AVBufferRef 
*vid_profile) +{ + return false; +} + +bool gl_lcms_has_profile(struct gl_lcms *p) +{ + return false; +} + +bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d, + enum mp_csp_prim prim, enum mp_csp_trc trc, + struct AVBufferRef *vid_profile) +{ + return false; +} + +#endif diff --git a/video/out/gpu/lcms.h b/video/out/gpu/lcms.h new file mode 100644 index 0000000000..35bbd61fe0 --- /dev/null +++ b/video/out/gpu/lcms.h @@ -0,0 +1,43 @@ +#ifndef MP_GL_LCMS_H +#define MP_GL_LCMS_H + +#include +#include +#include "misc/bstr.h" +#include "video/csputils.h" +#include + +extern const struct m_sub_options mp_icc_conf; + +struct mp_icc_opts { + int use_embedded; + char *profile; + int profile_auto; + char *cache_dir; + char *size_str; + int intent; + int contrast; +}; + +struct lut3d { + uint16_t *data; + int size[3]; +}; + +struct mp_log; +struct mpv_global; +struct gl_lcms; + +struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, + struct mpv_global *global, + struct mp_icc_opts *opts); +void gl_lcms_update_options(struct gl_lcms *p); +bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile); +bool gl_lcms_has_profile(struct gl_lcms *p); +bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **, + enum mp_csp_prim prim, enum mp_csp_trc trc, + struct AVBufferRef *vid_profile); +bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, + enum mp_csp_trc trc, struct AVBufferRef *vid_profile); + +#endif diff --git a/video/out/gpu/osd.c b/video/out/gpu/osd.c new file mode 100644 index 0000000000..f7c325d1db --- /dev/null +++ b/video/out/gpu/osd.c @@ -0,0 +1,367 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . 
+ */ + +#include +#include +#include + +#include + +#include "common/common.h" +#include "common/msg.h" +#include "video/csputils.h" +#include "video/mp_image.h" +#include "osd.h" + +#define GLSL(x) gl_sc_add(sc, #x "\n"); + +// glBlendFuncSeparate() arguments +static const int blend_factors[SUBBITMAP_COUNT][4] = { + [SUBBITMAP_LIBASS] = {RA_BLEND_SRC_ALPHA, RA_BLEND_ONE_MINUS_SRC_ALPHA, + RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA}, + [SUBBITMAP_RGBA] = {RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA, + RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA}, +}; + +struct vertex { + float position[2]; + float texcoord[2]; + uint8_t ass_color[4]; +}; + +static const struct ra_renderpass_input vertex_vao[] = { + {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, + {"texcoord" , RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord)}, + {"ass_color", RA_VARTYPE_BYTE_UNORM, 4, 1, offsetof(struct vertex, ass_color)}, + {0} +}; + +struct mpgl_osd_part { + enum sub_bitmap_format format; + int change_id; + struct ra_tex *texture; + int w, h; + int num_subparts; + int prev_num_subparts; + struct sub_bitmap *subparts; + int num_vertices; + struct vertex *vertices; +}; + +struct mpgl_osd { + struct mp_log *log; + struct osd_state *osd; + struct ra *ra; + struct mpgl_osd_part *parts[MAX_OSD_PARTS]; + const struct ra_format *fmt_table[SUBBITMAP_COUNT]; + bool formats[SUBBITMAP_COUNT]; + bool change_flag; // for reporting to API user only + // temporary + int stereo_mode; + struct mp_osd_res osd_res; + void *scratch; +}; + +struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, + struct osd_state *osd) +{ + struct mpgl_osd *ctx = talloc_ptrtype(NULL, ctx); + *ctx = (struct mpgl_osd) { + .log = log, + .osd = osd, + .ra = ra, + .change_flag = true, + .scratch = talloc_zero_size(ctx, 1), + }; + + ctx->fmt_table[SUBBITMAP_LIBASS] = ra_find_unorm_format(ra, 1, 1); + ctx->fmt_table[SUBBITMAP_RGBA] = ra_find_unorm_format(ra, 1, 4); + + for (int n = 0; n < MAX_OSD_PARTS; n++) + ctx->parts[n] = talloc_zero(ctx, struct mpgl_osd_part); + + for (int n = 0; n < SUBBITMAP_COUNT; n++) + ctx->formats[n] = !!ctx->fmt_table[n]; + + return ctx; +} + +void mpgl_osd_destroy(struct mpgl_osd *ctx) +{ + if (!ctx) + return; + + for (int n = 0; n < MAX_OSD_PARTS; n++) { + struct mpgl_osd_part *p = ctx->parts[n]; + ra_tex_free(ctx->ra, &p->texture); + } + talloc_free(ctx); +} + +static int next_pow2(int v) +{ + for (int x = 0; x < 30; x++) { + if ((1 << x) >= v) + return 1 << x; + } + return INT_MAX; +} + +static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd, + struct sub_bitmaps *imgs) +{ + struct ra *ra = ctx->ra; + bool ok = false; + + assert(imgs->packed); + + int req_w = next_pow2(imgs->packed_w); + int req_h = next_pow2(imgs->packed_h); + + const struct ra_format *fmt = ctx->fmt_table[imgs->format]; + assert(fmt); + + if (!osd->texture || req_w > osd->w || req_h > osd->h || + osd->format != imgs->format) + { + ra_tex_free(ra, &osd->texture); + + osd->format = imgs->format; + osd->w = FFMAX(32, req_w); + osd->h = FFMAX(32, req_h); + + MP_VERBOSE(ctx, "Reallocating OSD texture to %dx%d.\n", osd->w, osd->h); + + if (osd->w > ra->max_texture_wh || osd->h > ra->max_texture_wh) { + MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum " + "supported size %dx%d.\n", ra->max_texture_wh, + ra->max_texture_wh); + goto done; + } + + struct ra_tex_params params = { + .dimensions = 2, + .w = osd->w, + .h = osd->h, + .d = 1, + .format = fmt, + .render_src = true, + .src_linear = true, + 
.host_mutable = true, + }; + osd->texture = ra_tex_create(ra, ¶ms); + if (!osd->texture) + goto done; + } + + struct ra_tex_upload_params params = { + .tex = osd->texture, + .src = imgs->packed->planes[0], + .invalidate = true, + .rc = &(struct mp_rect){0, 0, imgs->packed_w, imgs->packed_h}, + .stride = imgs->packed->stride[0], + }; + + ok = ra->fns->tex_upload(ra, ¶ms); + +done: + return ok; +} + +static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs) +{ + struct mpgl_osd *ctx = pctx; + + if (imgs->num_parts == 0 || !ctx->formats[imgs->format]) + return; + + struct mpgl_osd_part *osd = ctx->parts[imgs->render_index]; + + bool ok = true; + if (imgs->change_id != osd->change_id) { + if (!upload_osd(ctx, osd, imgs)) + ok = false; + + osd->change_id = imgs->change_id; + ctx->change_flag = true; + } + osd->num_subparts = ok ? imgs->num_parts : 0; + + MP_TARRAY_GROW(osd, osd->subparts, osd->num_subparts); + memcpy(osd->subparts, imgs->parts, + osd->num_subparts * sizeof(osd->subparts[0])); +} + +bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, + struct gl_shader_cache *sc) +{ + assert(index >= 0 && index < MAX_OSD_PARTS); + struct mpgl_osd_part *part = ctx->parts[index]; + + enum sub_bitmap_format fmt = part->format; + if (!fmt || !part->num_subparts) + return false; + + gl_sc_uniform_texture(sc, "osdtex", part->texture); + switch (fmt) { + case SUBBITMAP_RGBA: { + GLSL(color = texture(osdtex, texcoord).bgra;) + break; + } + case SUBBITMAP_LIBASS: { + GLSL(color = + vec4(ass_color.rgb, ass_color.a * texture(osdtex, texcoord).r);) + break; + } + default: + abort(); + } + + gl_sc_set_vertex_format(sc, vertex_vao, sizeof(struct vertex)); + + return true; +} + +static void write_quad(struct vertex *va, struct gl_transform t, + float x0, float y0, float x1, float y1, + float tx0, float ty0, float tx1, float ty1, + float tex_w, float tex_h, const uint8_t color[4]) +{ + gl_transform_vec(t, &x0, &y0); + gl_transform_vec(t, &x1, &y1); + +#define COLOR_INIT {color[0], color[1], color[2], color[3]} + va[0] = (struct vertex){ {x0, y0}, {tx0 / tex_w, ty0 / tex_h}, COLOR_INIT }; + va[1] = (struct vertex){ {x0, y1}, {tx0 / tex_w, ty1 / tex_h}, COLOR_INIT }; + va[2] = (struct vertex){ {x1, y0}, {tx1 / tex_w, ty0 / tex_h}, COLOR_INIT }; + va[3] = (struct vertex){ {x1, y1}, {tx1 / tex_w, ty1 / tex_h}, COLOR_INIT }; + va[4] = va[2]; + va[5] = va[1]; +#undef COLOR_INIT +} + +static void generate_verts(struct mpgl_osd_part *part, struct gl_transform t) +{ + int num_vertices = part->num_subparts * 6; + MP_TARRAY_GROW(part, part->vertices, part->num_vertices + num_vertices); + + for (int n = 0; n < part->num_subparts; n++) { + struct sub_bitmap *b = &part->subparts[n]; + struct vertex *va = &part->vertices[part->num_vertices]; + + // NOTE: the blend color is used with SUBBITMAP_LIBASS only, so it + // doesn't matter that we upload garbage for the other formats + uint32_t c = b->libass.color; + uint8_t color[4] = { c >> 24, (c >> 16) & 0xff, + (c >> 8) & 0xff, 255 - (c & 0xff) }; + + write_quad(&va[n * 6], t, + b->x, b->y, b->x + b->dw, b->y + b->dh, + b->src_x, b->src_y, b->src_x + b->w, b->src_y + b->h, + part->w, part->h, color); + } + + part->num_vertices += num_vertices; +} + +// number of screen divisions per axis (x=0, y=1) for the current 3D mode +static void get_3d_side_by_side(int stereo_mode, int div[2]) +{ + div[0] = div[1] = 1; + switch (stereo_mode) { + case MP_STEREO3D_SBS2L: + case MP_STEREO3D_SBS2R: div[0] = 2; break; + case MP_STEREO3D_AB2R: + case MP_STEREO3D_AB2L: div[1] = 2; 
break; + } +} + +void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, + struct gl_shader_cache *sc, struct fbodst target) +{ + struct mpgl_osd_part *part = ctx->parts[index]; + + int div[2]; + get_3d_side_by_side(ctx->stereo_mode, div); + + part->num_vertices = 0; + + for (int x = 0; x < div[0]; x++) { + for (int y = 0; y < div[1]; y++) { + struct gl_transform t; + gl_transform_ortho_fbodst(&t, target); + + float a_x = ctx->osd_res.w * x; + float a_y = ctx->osd_res.h * y; + t.t[0] += a_x * t.m[0][0] + a_y * t.m[1][0]; + t.t[1] += a_x * t.m[0][1] + a_y * t.m[1][1]; + + generate_verts(part, t); + } + } + + const int *factors = &blend_factors[part->format][0]; + gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]); + + gl_sc_dispatch_draw(sc, target.tex, part->vertices, part->num_vertices); +} + +static void set_res(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode) +{ + int div[2]; + get_3d_side_by_side(stereo_mode, div); + + res.w /= div[0]; + res.h /= div[1]; + ctx->osd_res = res; +} + +void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts, + int stereo_mode, int draw_flags) +{ + for (int n = 0; n < MAX_OSD_PARTS; n++) + ctx->parts[n]->num_subparts = 0; + + set_res(ctx, res, stereo_mode); + + osd_draw(ctx->osd, ctx->osd_res, pts, draw_flags, ctx->formats, gen_osd_cb, ctx); + ctx->stereo_mode = stereo_mode; + + // Parts going away does not necessarily result in gen_osd_cb() being called + // (not even with num_parts==0), so check this separately. + for (int n = 0; n < MAX_OSD_PARTS; n++) { + struct mpgl_osd_part *part = ctx->parts[n]; + if (part->num_subparts != part->prev_num_subparts) + ctx->change_flag = true; + part->prev_num_subparts = part->num_subparts; + } +} + +// See osd_resize() for remarks. This function is an optional optimization too. 
+void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode) +{ + set_res(ctx, res, stereo_mode); + osd_resize(ctx->osd, ctx->osd_res); +} + +bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res, + double pts) +{ + ctx->change_flag = false; + mpgl_osd_generate(ctx, *res, pts, 0, 0); + return ctx->change_flag; +} diff --git a/video/out/gpu/osd.h b/video/out/gpu/osd.h new file mode 100644 index 0000000000..6c2b886de3 --- /dev/null +++ b/video/out/gpu/osd.h @@ -0,0 +1,25 @@ +#ifndef MPLAYER_GL_OSD_H +#define MPLAYER_GL_OSD_H + +#include +#include + +#include "utils.h" +#include "shader_cache.h" +#include "sub/osd.h" + +struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, + struct osd_state *osd); +void mpgl_osd_destroy(struct mpgl_osd *ctx); + +void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts, + int stereo_mode, int draw_flags); +void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode); +bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, + struct gl_shader_cache *sc); +void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, + struct gl_shader_cache *sc, struct fbodst target); +bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res, + double pts); + +#endif diff --git a/video/out/gpu/ra.c b/video/out/gpu/ra.c new file mode 100644 index 0000000000..ef1de54d1a --- /dev/null +++ b/video/out/gpu/ra.c @@ -0,0 +1,327 @@ +#include "common/common.h" +#include "common/msg.h" +#include "video/img_format.h" + +#include "ra.h" + +struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params) +{ + return ra->fns->tex_create(ra, params); +} + +void ra_tex_free(struct ra *ra, struct ra_tex **tex) +{ + if (*tex) + ra->fns->tex_destroy(ra, *tex); + *tex = NULL; +} + +struct ra_buf *ra_buf_create(struct ra *ra, const struct ra_buf_params *params) +{ + return ra->fns->buf_create(ra, params); +} + +void ra_buf_free(struct ra *ra, struct ra_buf **buf) +{ + if (*buf) + ra->fns->buf_destroy(ra, *buf); + *buf = NULL; +} + +void ra_free(struct ra **ra) +{ + if (*ra) + (*ra)->fns->destroy(*ra); + talloc_free(*ra); + *ra = NULL; +} + +size_t ra_vartype_size(enum ra_vartype type) +{ + switch (type) { + case RA_VARTYPE_INT: return sizeof(int); + case RA_VARTYPE_FLOAT: return sizeof(float); + case RA_VARTYPE_BYTE_UNORM: return 1; + default: return 0; + } +} + +struct ra_layout ra_renderpass_input_layout(struct ra_renderpass_input *input) +{ + size_t el_size = ra_vartype_size(input->type); + if (!el_size) + return (struct ra_layout){0}; + + // host data is always tightly packed + return (struct ra_layout) { + .align = 1, + .stride = el_size * input->dim_v, + .size = el_size * input->dim_v * input->dim_m, + }; +} + +static struct ra_renderpass_input *dup_inputs(void *ta_parent, + const struct ra_renderpass_input *inputs, int num_inputs) +{ + struct ra_renderpass_input *res = + talloc_memdup(ta_parent, (void *)inputs, num_inputs * sizeof(inputs[0])); + for (int n = 0; n < num_inputs; n++) + res[n].name = talloc_strdup(res, res[n].name); + return res; +} + +// Return a newly allocated deep-copy of params. 
+struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, + const struct ra_renderpass_params *params) +{ + struct ra_renderpass_params *res = talloc_ptrtype(ta_parent, res); + *res = *params; + res->inputs = dup_inputs(res, res->inputs, res->num_inputs); + res->vertex_attribs = + dup_inputs(res, res->vertex_attribs, res->num_vertex_attribs); + res->cached_program = bstrdup(res, res->cached_program); + res->vertex_shader = talloc_strdup(res, res->vertex_shader); + res->frag_shader = talloc_strdup(res, res->frag_shader); + res->compute_shader = talloc_strdup(res, res->compute_shader); + return res; +}; + + +// Return whether this is a tightly packed format with no external padding and +// with the same bit size/depth in all components, and the shader returns +// components in the same order as in memory. +static bool ra_format_is_regular(const struct ra_format *fmt) +{ + if (!fmt->pixel_size || !fmt->num_components || !fmt->ordered) + return false; + for (int n = 1; n < fmt->num_components; n++) { + if (fmt->component_size[n] != fmt->component_size[0] || + fmt->component_depth[n] != fmt->component_depth[0]) + return false; + } + if (fmt->component_size[0] * fmt->num_components != fmt->pixel_size * 8) + return false; + return true; +} + +// Return a regular filterable format using RA_CTYPE_UNORM. +const struct ra_format *ra_find_unorm_format(struct ra *ra, + int bytes_per_component, + int n_components) +{ + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (fmt->ctype == RA_CTYPE_UNORM && fmt->num_components == n_components && + fmt->pixel_size == bytes_per_component * n_components && + fmt->component_depth[0] == bytes_per_component * 8 && + fmt->linear_filter && ra_format_is_regular(fmt)) + return fmt; + } + return NULL; +} + +// Return a regular format using RA_CTYPE_UINT. +const struct ra_format *ra_find_uint_format(struct ra *ra, + int bytes_per_component, + int n_components) +{ + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (fmt->ctype == RA_CTYPE_UINT && fmt->num_components == n_components && + fmt->pixel_size == bytes_per_component * n_components && + fmt->component_depth[0] == bytes_per_component * 8 && + ra_format_is_regular(fmt)) + return fmt; + } + return NULL; +} + +// Find a float format of any precision that matches the C type of the same +// size for upload. +// May drop bits from the mantissa (such as selecting float16 even if +// bytes_per_component == 32); prefers possibly faster formats first. +static const struct ra_format *ra_find_float_format(struct ra *ra, + int bytes_per_component, + int n_components) +{ + // Assumes ra_format are ordered by performance. + // The >=16 check is to avoid catching fringe formats. + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (fmt->ctype == RA_CTYPE_FLOAT && fmt->num_components == n_components && + fmt->pixel_size == bytes_per_component * n_components && + fmt->component_depth[0] >= 16 && + fmt->linear_filter && ra_format_is_regular(fmt)) + return fmt; + } + return NULL; +} + +// Return a filterable regular format that uses at least float16 internally, and +// uses a normal C float for transfer on the CPU side. (This is just so we don't +// need 32->16 bit conversion on CPU, which would be messy.) 
+const struct ra_format *ra_find_float16_format(struct ra *ra, int n_components) +{ + return ra_find_float_format(ra, sizeof(float), n_components); +} + +const struct ra_format *ra_find_named_format(struct ra *ra, const char *name) +{ + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (strcmp(fmt->name, name) == 0) + return fmt; + } + return NULL; +} + +// Like ra_find_unorm_format(), but if no fixed point format is available, +// return an unsigned integer format. +static const struct ra_format *find_plane_format(struct ra *ra, int bytes, + int n_channels, + enum mp_component_type ctype) +{ + switch (ctype) { + case MP_COMPONENT_TYPE_UINT: { + const struct ra_format *f = ra_find_unorm_format(ra, bytes, n_channels); + if (f) + return f; + return ra_find_uint_format(ra, bytes, n_channels); + } + case MP_COMPONENT_TYPE_FLOAT: + return ra_find_float_format(ra, bytes, n_channels); + default: return NULL; + } +} + +// Put a mapping of imgfmt to texture formats into *out. Basically it selects +// the correct texture formats needed to represent an imgfmt in a shader, with +// textures using the same memory organization as on the CPU. +// Each plane is represented by a texture, and each texture has a RGBA +// component order. out->components describes the meaning of them. +// May return integer formats for >8 bit formats, if the driver has no +// normalized 16 bit formats. +// Returns false (and *out is not touched) if no format found. +bool ra_get_imgfmt_desc(struct ra *ra, int imgfmt, struct ra_imgfmt_desc *out) +{ + struct ra_imgfmt_desc res = {0}; + + struct mp_regular_imgfmt regfmt; + if (mp_get_regular_imgfmt(®fmt, imgfmt)) { + enum ra_ctype ctype = RA_CTYPE_UNKNOWN; + res.num_planes = regfmt.num_planes; + res.component_bits = regfmt.component_size * 8; + res.component_pad = regfmt.component_pad; + for (int n = 0; n < regfmt.num_planes; n++) { + struct mp_regular_imgfmt_plane *plane = ®fmt.planes[n]; + res.planes[n] = find_plane_format(ra, regfmt.component_size, + plane->num_components, + regfmt.component_type); + if (!res.planes[n]) + return false; + for (int i = 0; i < plane->num_components; i++) + res.components[n][i] = plane->components[i]; + // Dropping LSBs when shifting will lead to dropped MSBs. + if (res.component_bits > res.planes[n]->component_depth[0] && + res.component_pad < 0) + return false; + // Renderer restriction, but actually an unwanted corner case. + if (ctype != RA_CTYPE_UNKNOWN && ctype != res.planes[n]->ctype) + return false; + ctype = res.planes[n]->ctype; + } + res.chroma_w = regfmt.chroma_w; + res.chroma_h = regfmt.chroma_h; + goto supported; + } + + for (int n = 0; n < ra->num_formats; n++) { + if (imgfmt && ra->formats[n]->special_imgfmt == imgfmt) { + res = *ra->formats[n]->special_imgfmt_desc; + goto supported; + } + } + + // Unsupported format + return false; + +supported: + + *out = res; + return true; +} + +void ra_dump_tex_formats(struct ra *ra, int msgl) +{ + if (!mp_msg_test(ra->log, msgl)) + return; + MP_MSG(ra, msgl, "Texture formats:\n"); + MP_MSG(ra, msgl, " NAME COMP*TYPE SIZE DEPTH PER COMP.\n"); + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + const char *ctype = "unknown"; + switch (fmt->ctype) { + case RA_CTYPE_UNORM: ctype = "unorm"; break; + case RA_CTYPE_UINT: ctype = "uint "; break; + case RA_CTYPE_FLOAT: ctype = "float"; break; + } + char cl[40] = ""; + for (int i = 0; i < fmt->num_components; i++) { + mp_snprintf_cat(cl, sizeof(cl), "%s%d", i ? 
" " : "", + fmt->component_size[i]); + if (fmt->component_size[i] != fmt->component_depth[i]) + mp_snprintf_cat(cl, sizeof(cl), "/%d", fmt->component_depth[i]); + } + MP_MSG(ra, msgl, " %-10s %d*%s %3dB %s %s %s {%s}\n", fmt->name, + fmt->num_components, ctype, fmt->pixel_size, + fmt->luminance_alpha ? "LA" : " ", + fmt->linear_filter ? "LF" : " ", + fmt->renderable ? "CR" : " ", cl); + } + MP_MSG(ra, msgl, " LA = LUMINANCE_ALPHA hack format\n"); + MP_MSG(ra, msgl, " LF = linear filterable\n"); + MP_MSG(ra, msgl, " CR = can be used for render targets\n"); +} + +void ra_dump_imgfmt_desc(struct ra *ra, const struct ra_imgfmt_desc *desc, + int msgl) +{ + char pl[80] = ""; + char pf[80] = ""; + for (int n = 0; n < desc->num_planes; n++) { + if (n > 0) { + mp_snprintf_cat(pl, sizeof(pl), "/"); + mp_snprintf_cat(pf, sizeof(pf), "/"); + } + char t[5] = {0}; + for (int i = 0; i < 4; i++) + t[i] = "_rgba"[desc->components[n][i]]; + for (int i = 3; i > 0 && t[i] == '_'; i--) + t[i] = '\0'; + mp_snprintf_cat(pl, sizeof(pl), "%s", t); + mp_snprintf_cat(pf, sizeof(pf), "%s", desc->planes[n]->name); + } + MP_MSG(ra, msgl, "%d planes %dx%d %d/%d [%s] (%s)\n", + desc->num_planes, desc->chroma_w, desc->chroma_h, + desc->component_bits, desc->component_pad, pf, pl); +} + +void ra_dump_img_formats(struct ra *ra, int msgl) +{ + if (!mp_msg_test(ra->log, msgl)) + return; + MP_MSG(ra, msgl, "Image formats:\n"); + for (int imgfmt = IMGFMT_START; imgfmt < IMGFMT_END; imgfmt++) { + const char *name = mp_imgfmt_to_name(imgfmt); + if (strcmp(name, "unknown") == 0) + continue; + MP_MSG(ra, msgl, " %s", name); + struct ra_imgfmt_desc desc; + if (ra_get_imgfmt_desc(ra, imgfmt, &desc)) { + MP_MSG(ra, msgl, " => "); + ra_dump_imgfmt_desc(ra, &desc, msgl); + } else { + MP_MSG(ra, msgl, "\n"); + } + } +} diff --git a/video/out/gpu/ra.h b/video/out/gpu/ra.h new file mode 100644 index 0000000000..76f98397f8 --- /dev/null +++ b/video/out/gpu/ra.h @@ -0,0 +1,488 @@ +#pragma once + +#include "common/common.h" +#include "misc/bstr.h" + +// Handle for a rendering API backend. +struct ra { + struct ra_fns *fns; + void *priv; + + int glsl_version; // GLSL version (e.g. 300 => 3.0) + bool glsl_es; // use ES dialect + bool glsl_vulkan; // use vulkan dialect + + struct mp_log *log; + + // RA_CAP_* bit field. The RA backend must set supported features at init + // time. + uint64_t caps; + + // Maximum supported width and height of a 2D texture. Set by the RA backend + // at init time. + int max_texture_wh; + + // Maximum shared memory for compute shaders. Set by the RA backend at init + // time. + size_t max_shmem; + + // Set of supported texture formats. Must be added by RA backend at init time. + // If there are equivalent formats with different caveats, the preferred + // formats should have a lower index. (E.g. GLES3 should put rg8 before la.) + struct ra_format **formats; + int num_formats; + + // Accelerate texture uploads via an extra PBO even when + // RA_CAP_DIRECT_UPLOAD is supported. This is basically only relevant for + // OpenGL. Set by the RA user. 
+ bool use_pbo; +}; + +enum { + RA_CAP_TEX_1D = 1 << 0, // supports 1D textures (as shader inputs) + RA_CAP_TEX_3D = 1 << 1, // supports 3D textures (as shader inputs) + RA_CAP_BLIT = 1 << 2, // supports ra_fns.blit + RA_CAP_COMPUTE = 1 << 3, // supports compute shaders + RA_CAP_DIRECT_UPLOAD = 1 << 4, // supports tex_upload without ra_buf + RA_CAP_BUF_RO = 1 << 5, // supports RA_VARTYPE_BUF_RO + RA_CAP_BUF_RW = 1 << 6, // supports RA_VARTYPE_BUF_RW + RA_CAP_NESTED_ARRAY = 1 << 7, // supports nested arrays + RA_CAP_SHARED_BINDING = 1 << 8, // sampler/image/buffer namespaces are disjoint + RA_CAP_GLOBAL_UNIFORM = 1 << 9, // supports using "naked" uniforms (not UBO) +}; + +enum ra_ctype { + RA_CTYPE_UNKNOWN = 0, // also used for inconsistent multi-component formats + RA_CTYPE_UNORM, // unsigned normalized integer (fixed point) formats + RA_CTYPE_UINT, // full integer formats + RA_CTYPE_FLOAT, // float formats (signed, any bit size) +}; + +// All formats must be useable as texture formats. All formats must be byte +// aligned (all pixels start and end on a byte boundary), at least as far CPU +// transfers are concerned. +struct ra_format { + // All fields are read-only after creation. + const char *name; // symbolic name for user interaction/debugging + void *priv; + enum ra_ctype ctype; // data type of each component + bool ordered; // components are sequential in memory, and returned + // by the shader in memory order (the shader can + // return arbitrary values for unused components) + int num_components; // component count, 0 if not applicable, max. 4 + int component_size[4]; // in bits, all entries 0 if not applicable + int component_depth[4]; // bits in use for each component, 0 if not applicable + // (_must_ be set if component_size[] includes padding, + // and the real procession as seen by shader is lower) + int pixel_size; // in bytes, total pixel size (0 if opaque) + bool luminance_alpha; // pre-GL_ARB_texture_rg hack for 2 component textures + // if this is set, shader must use .ra instead of .rg + // only applies to 2-component textures + bool linear_filter; // linear filtering available from shader + bool renderable; // can be used for render targets + + // If not 0, the format represents some sort of packed fringe format, whose + // shader representation is given by the special_imgfmt_desc pointer. + int special_imgfmt; + const struct ra_imgfmt_desc *special_imgfmt_desc; +}; + +struct ra_tex_params { + int dimensions; // 1-3 for 1D-3D textures + // Size of the texture. 1D textures require h=d=1, 2D textures require d=1. + int w, h, d; + const struct ra_format *format; + bool render_src; // must be useable as source texture in a shader + bool render_dst; // must be useable as target texture in a shader + bool storage_dst; // must be usable as a storage image (RA_VARTYPE_IMG_W) + bool blit_src; // must be usable as a blit source + bool blit_dst; // must be usable as a blit destination + bool host_mutable; // texture may be updated with tex_upload + // When used as render source texture. + bool src_linear; // if false, use nearest sampling (whether this can + // be true depends on ra_format.linear_filter) + bool src_repeat; // if false, clamp texture coordinates to edge + // if true, repeat texture coordinates + bool non_normalized; // hack for GL_TEXTURE_RECTANGLE OSX idiocy + // always set to false, except in OSX code + bool external_oes; // hack for GL_TEXTURE_EXTERNAL_OES idiocy + // If non-NULL, the texture will be created with these contents. 
Using + // this does *not* require setting host_mutable. Otherwise, the initial + // data is undefined. + void *initial_data; +}; + +// Conflates the following typical GPU API concepts: +// - texture itself +// - sampler state +// - staging buffers for texture upload +// - framebuffer objects +// - wrappers for swapchain framebuffers +// - synchronization needed for upload/rendering/etc. +struct ra_tex { + // All fields are read-only after creation. + struct ra_tex_params params; + void *priv; +}; + +struct ra_tex_upload_params { + struct ra_tex *tex; // Texture to upload to + bool invalidate; // Discard pre-existing data not in the region uploaded + // Uploading from buffer: + struct ra_buf *buf; // Buffer to upload from (mutually exclusive with `src`) + size_t buf_offset; // Start of data within buffer (bytes) + // Uploading directly: (Note: If RA_CAP_DIRECT_UPLOAD is not set, then this + // will be internally translated to a tex_upload buffer by the RA) + const void *src; // Address of data + // For 2D textures only: + struct mp_rect *rc; // Region to upload. NULL means entire image + ptrdiff_t stride; // The size of a horizontal line in bytes (*not* texels!) +}; + +// Buffer type hint. Setting this may result in more or less efficient +// operation, although it shouldn't technically prohibit anything +enum ra_buf_type { + RA_BUF_TYPE_INVALID, + RA_BUF_TYPE_TEX_UPLOAD, // texture upload buffer (pixel buffer object) + RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW + RA_BUF_TYPE_UNIFORM, // uniform buffer (UBO), for RA_VARTYPE_BUF_RO +}; + +struct ra_buf_params { + enum ra_buf_type type; + size_t size; + bool host_mapped; // create a read-writable persistent mapping (ra_buf.data) + bool host_mutable; // contents may be updated via buf_update() + // If non-NULL, the buffer will be created with these contents. Otherwise, + // the initial data is undefined. + void *initial_data; +}; + +// A generic buffer, which can be used for many purposes (texture upload, +// storage buffer, uniform buffer, etc.) +struct ra_buf { + // All fields are read-only after creation. + struct ra_buf_params params; + void *data; // for persistently mapped buffers, points to the first byte + void *priv; +}; + +// Type of a shader uniform variable, or a vertex attribute. In all cases, +// vectors are matrices are done by having more than 1 value. +enum ra_vartype { + RA_VARTYPE_INVALID, + RA_VARTYPE_INT, // C: int, GLSL: int, ivec* + RA_VARTYPE_FLOAT, // C: float, GLSL: float, vec*, mat* + RA_VARTYPE_TEX, // C: ra_tex*, GLSL: various sampler types + // ra_tex.params.render_src must be true + RA_VARTYPE_IMG_W, // C: ra_tex*, GLSL: various image types + // write-only (W) image for compute shaders + // ra_tex.params.storage_dst must be true + RA_VARTYPE_BYTE_UNORM, // C: uint8_t, GLSL: int, vec* (vertex data only) + RA_VARTYPE_BUF_RO, // C: ra_buf*, GLSL: uniform buffer block + // buf type must be RA_BUF_TYPE_UNIFORM + RA_VARTYPE_BUF_RW, // C: ra_buf*, GLSL: shader storage buffer block + // buf type must be RA_BUF_TYPE_SHADER_STORAGE + RA_VARTYPE_COUNT +}; + +// Returns the host size of a ra_vartype, or 0 for abstract vartypes (e.g. tex) +size_t ra_vartype_size(enum ra_vartype type); + +// Represents a uniform, texture input parameter, and similar things. +struct ra_renderpass_input { + const char *name; // name as used in the shader + enum ra_vartype type; + // The total number of values is given by dim_v * dim_m. 
+ int dim_v; // vector dimension (1 for non-vector and non-matrix) + int dim_m; // additional matrix dimension (dim_v x dim_m) + // Vertex data: byte offset of the attribute into the vertex struct + size_t offset; + // RA_VARTYPE_TEX: texture unit + // RA_VARTYPE_IMG_W: image unit + // RA_VARTYPE_BUF_* buffer binding point + // Other uniforms: unused + // If RA_CAP_SHARED_BINDING is set, these may only be unique per input type. + // Otherwise, these must be unique for all input values. + int binding; +}; + +// Represents the layout requirements of an input value +struct ra_layout { + size_t align; // the alignment requirements (always a power of two) + size_t stride; // the delta between two rows of an array/matrix + size_t size; // the total size of the input +}; + +// Returns the host layout of a render pass input. Returns {0} for renderpass +// inputs without a corresponding host representation (e.g. textures/buffers) +struct ra_layout ra_renderpass_input_layout(struct ra_renderpass_input *input); + +enum ra_blend { + RA_BLEND_ZERO, + RA_BLEND_ONE, + RA_BLEND_SRC_ALPHA, + RA_BLEND_ONE_MINUS_SRC_ALPHA, +}; + +enum ra_renderpass_type { + RA_RENDERPASS_TYPE_INVALID, + RA_RENDERPASS_TYPE_RASTER, // vertex+fragment shader + RA_RENDERPASS_TYPE_COMPUTE, // compute shader +}; + +// Static part of a rendering pass. It conflates the following: +// - compiled shader and its list of uniforms +// - vertex attributes and its shader mappings +// - blending parameters +// (For Vulkan, this would be shader module + pipeline state.) +// Upon creation, the values of dynamic values such as uniform contents (whose +// initial values are not provided here) are required to be 0. +struct ra_renderpass_params { + enum ra_renderpass_type type; + + // Uniforms, including texture/sampler inputs. + struct ra_renderpass_input *inputs; + int num_inputs; + + // Highly implementation-specific byte array storing a compiled version + // of the program. Can be used to speed up shader compilation. A backend + // xan read this in renderpass_create, or set this on the newly created + // ra_renderpass params field. + bstr cached_program; + + // --- type==RA_RENDERPASS_TYPE_RASTER only + + // Describes the format of the vertex data. When using ra.glsl_vulkan, + // the order of this array must match the vertex attribute locations. + struct ra_renderpass_input *vertex_attribs; + int num_vertex_attribs; + int vertex_stride; + + // Format of the target texture + const struct ra_format *target_format; + + // Shader text, in GLSL. (Yes, you need a GLSL compiler.) + // These are complete shaders, including prelude and declarations. + const char *vertex_shader; + const char *frag_shader; + + // Target blending mode. If enable_blend is false, the blend_ fields can + // be ignored. + bool enable_blend; + enum ra_blend blend_src_rgb; + enum ra_blend blend_dst_rgb; + enum ra_blend blend_src_alpha; + enum ra_blend blend_dst_alpha; + + // --- type==RA_RENDERPASS_TYPE_COMPUTE only + + // Shader text, like vertex_shader/frag_shader. 
+ const char *compute_shader; +}; + +struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, + const struct ra_renderpass_params *params); + +// Conflates the following typical GPU API concepts: +// - various kinds of shaders +// - rendering pipelines +// - descriptor sets, uniforms, other bindings +// - all synchronization necessary +// - the current values of all uniforms (this one makes it relatively stateful +// from an API perspective) +struct ra_renderpass { + // All fields are read-only after creation. + struct ra_renderpass_params params; + void *priv; +}; + +// An input value (see ra_renderpass_input). +struct ra_renderpass_input_val { + int index; // index into ra_renderpass_params.inputs[] + void *data; // pointer to data according to ra_renderpass_input + // (e.g. type==RA_VARTYPE_FLOAT+dim_v=3,dim_m=3 => float[9]) +}; + +// Parameters for performing a rendering pass (basically the dynamic params). +// These change potentially every time. +struct ra_renderpass_run_params { + struct ra_renderpass *pass; + + // Generally this lists parameters only which changed since the last + // invocation and need to be updated. The ra_renderpass instance is + // supposed to keep unchanged values from the previous run. + // For non-primitive types like textures, these entries are always added, + // even if they do not change. + struct ra_renderpass_input_val *values; + int num_values; + + // --- pass->params.type==RA_RENDERPASS_TYPE_RASTER only + + // target->params.render_dst must be true, and target->params.format must + // match pass->params.target_format. + struct ra_tex *target; + struct mp_rect viewport; + struct mp_rect scissors; + + // (The primitive type is always a triangle list.) + void *vertex_data; + int vertex_count; // number of vertex elements, not bytes + + // --- pass->params.type==RA_RENDERPASS_TYPE_COMPUTE only + + // Number of work groups to be run in X/Y/Z dimensions. + int compute_groups[3]; +}; + +// This is an opaque type provided by the implementation, but we want to at +// least give it a saner name than void* for code readability purposes. +typedef void ra_timer; + +// Rendering API entrypoints. (Note: there are some additional hidden features +// you need to take care of. For example, hwdec mapping will be provided +// separately from ra, but might need to call into ra private code.) +struct ra_fns { + void (*destroy)(struct ra *ra); + + // Create a texture (with undefined contents). Return NULL on failure. + // This is a rare operation, and normally textures and even FBOs for + // temporary rendering intermediate data are cached. + struct ra_tex *(*tex_create)(struct ra *ra, + const struct ra_tex_params *params); + + void (*tex_destroy)(struct ra *ra, struct ra_tex *tex); + + // Upload data to a texture. This is an extremely common operation. When + // using a buffer, the contants of the buffer must exactly match the image + // - conversions between bit depth etc. are not supported. The buffer *may* + // be marked as "in use" while this operation is going on, and the contents + // must not be touched again by the API user until buf_poll returns true. + // Returns whether successful. + bool (*tex_upload)(struct ra *ra, const struct ra_tex_upload_params *params); + + // Create a buffer. This can be used as a persistently mapped buffer, + // a uniform buffer, a shader storage buffer or possibly others. + // Not all usage types must be supported; may return NULL if unavailable. 
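    // For instance, a staging buffer for texture uploads might be requested
    // with something like (sketch; size and flags are caller-chosen):
    //
    //   struct ra_buf_params params = {
    //       .type = RA_BUF_TYPE_TEX_UPLOAD,
    //       .size = upload_size,        // hypothetical size in bytes
    //       .host_mutable = true,       // so buf_update() can fill it later
    //   };
    //   struct ra_buf *buf = ra_buf_create(ra, &params);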
+ struct ra_buf *(*buf_create)(struct ra *ra, + const struct ra_buf_params *params); + + void (*buf_destroy)(struct ra *ra, struct ra_buf *buf); + + // Update the contents of a buffer, starting at a given offset and up to a + // given size, with the contents of *data. This is an extremely common + // operation. Calling this while the buffer is considered "in use" is an + // error. (See: buf_poll) + void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, + const void *data, size_t size); + + // Returns if a buffer is currently "in use" or not. Updating the contents + // of a buffer (via buf_update or writing to buf->data) while it is still + // in use is an error and may result in graphical corruption. Optional, if + // NULL then all buffers are always usable. + bool (*buf_poll)(struct ra *ra, struct ra_buf *buf); + + // Returns the layout requirements of a uniform buffer element. Optional, + // but must be implemented if RA_CAP_BUF_RO is supported. + struct ra_layout (*uniform_layout)(struct ra_renderpass_input *inp); + + // Clear the dst with the given color (rgba) and within the given scissor. + // dst must have dst->params.render_dst==true. Content outside of the + // scissor is preserved. + void (*clear)(struct ra *ra, struct ra_tex *dst, float color[4], + struct mp_rect *scissor); + + // Copy a sub-rectangle from one texture to another. The source/dest region + // is always within the texture bounds. Areas outside the dest region are + // preserved. The formats of the textures must be losely compatible. The + // dst texture can be a swapchain framebuffer, but src can not. Only 2D + // textures are supported. + // The textures must have blit_src and blit_dst set, respectively. + // Rectangles with negative width/height lead to flipping, different src/dst + // sizes lead to point scaling. Coordinates are always in pixels. + // Optional. Only available if RA_CAP_BLIT is set (if it's not set, it must + // not be called, even if it's non-NULL). + void (*blit)(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, + struct mp_rect *dst_rc, struct mp_rect *src_rc); + + // Compile a shader and create a pipeline. This is a rare operation. + // The params pointer and anything it points to must stay valid until + // renderpass_destroy. + struct ra_renderpass *(*renderpass_create)(struct ra *ra, + const struct ra_renderpass_params *params); + + void (*renderpass_destroy)(struct ra *ra, struct ra_renderpass *pass); + + // Perform a render pass, basically drawing a list of triangles to a FBO. + // This is an extremely common operation. + void (*renderpass_run)(struct ra *ra, + const struct ra_renderpass_run_params *params); + + // Create a timer object. Returns NULL on failure, or if timers are + // unavailable for some reason. Optional. + ra_timer *(*timer_create)(struct ra *ra); + + void (*timer_destroy)(struct ra *ra, ra_timer *timer); + + // Start recording a timer. Note that valid usage requires you to pair + // every start with a stop. Trying to start a timer twice, or trying to + // stop a timer before having started it, consistutes invalid usage. + void (*timer_start)(struct ra *ra, ra_timer *timer); + + // Stop recording a timer. This also returns any results that have been + // measured since the last usage of this ra_timer. It's important to note + // that GPU timer measurement are asynchronous, so this function does not + // always produce a value - and the values it does produce are typically + // delayed by a few frames. When no value is available, this returns 0. 
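    // A typical (hypothetical) usage pattern, assuming the backend provides
    // timers at all:
    //
    //   ra_timer *t = ra->fns->timer_create(ra);
    //   if (t) {
    //       ra->fns->timer_start(ra, t);
    //       /* ...issue rendering work... */
    //       uint64_t res = ra->fns->timer_stop(ra, t); // 0 until results arrive
    //   }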
+ uint64_t (*timer_stop)(struct ra *ra, ra_timer *timer); + + // Associates a marker with any past error messages, for debugging + // purposes. Optional. + void (*debug_marker)(struct ra *ra, const char *msg); +}; + +struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params); +void ra_tex_free(struct ra *ra, struct ra_tex **tex); + +struct ra_buf *ra_buf_create(struct ra *ra, const struct ra_buf_params *params); +void ra_buf_free(struct ra *ra, struct ra_buf **buf); + +void ra_free(struct ra **ra); + +const struct ra_format *ra_find_unorm_format(struct ra *ra, + int bytes_per_component, + int n_components); +const struct ra_format *ra_find_uint_format(struct ra *ra, + int bytes_per_component, + int n_components); +const struct ra_format *ra_find_float16_format(struct ra *ra, int n_components); +const struct ra_format *ra_find_named_format(struct ra *ra, const char *name); + +struct ra_imgfmt_desc { + int num_planes; + const struct ra_format *planes[4]; + // Chroma pixel size (1x1 is 4:4:4) + uint8_t chroma_w, chroma_h; + // Component storage size in bits (possibly padded). For formats with + // different sizes per component, this is arbitrary. For padded formats + // like P010 or YUV420P10, padding is included. + int component_bits; + // Like mp_regular_imgfmt.component_pad. + int component_pad; + // For each texture and each texture output (rgba order) describe what + // component it returns. + // The values are like the values in mp_regular_imgfmt_plane.components[]. + // Access as components[plane_nr][component_index]. Set unused items to 0. + // For ra_format.luminance_alpha, this returns 1/2 ("rg") instead of 1/4 + // ("ra"). the logic is that the texture format has 2 channels, thus the + // data must be returned in the first two components. The renderer fixes + // this later. + uint8_t components[4][4]; +}; + +bool ra_get_imgfmt_desc(struct ra *ra, int imgfmt, struct ra_imgfmt_desc *out); + +void ra_dump_tex_formats(struct ra *ra, int msgl); +void ra_dump_imgfmt_desc(struct ra *ra, const struct ra_imgfmt_desc *desc, + int msgl); +void ra_dump_img_formats(struct ra *ra, int msgl); diff --git a/video/out/gpu/shader_cache.c b/video/out/gpu/shader_cache.c new file mode 100644 index 0000000000..afda9cc036 --- /dev/null +++ b/video/out/gpu/shader_cache.c @@ -0,0 +1,954 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "osdep/io.h" + +#include "common/common.h" +#include "options/path.h" +#include "stream/stream.h" +#include "shader_cache.h" +#include "utils.h" + +// Force cache flush if more than this number of shaders is created. 
+#define SC_MAX_ENTRIES 48 + +union uniform_val { + float f[9]; // RA_VARTYPE_FLOAT + int i[4]; // RA_VARTYPE_INT + struct ra_tex *tex; // RA_VARTYPE_TEX, RA_VARTYPE_IMG_* + struct ra_buf *buf; // RA_VARTYPE_BUF_* +}; + +enum sc_uniform_type { + SC_UNIFORM_TYPE_GLOBAL = 0, // global uniform (RA_CAP_GLOBAL_UNIFORM) + SC_UNIFORM_TYPE_UBO = 1, // uniform buffer (RA_CAP_BUF_RO) +}; + +struct sc_uniform { + enum sc_uniform_type type; + struct ra_renderpass_input input; + const char *glsl_type; + union uniform_val v; + char *buffer_format; + // for SC_UNIFORM_TYPE_UBO: + struct ra_layout layout; + size_t offset; // byte offset within the buffer +}; + +struct sc_cached_uniform { + union uniform_val v; + int index; // for ra_renderpass_input_val + bool set; // whether the uniform has ever been set +}; + +struct sc_entry { + struct ra_renderpass *pass; + struct sc_cached_uniform *cached_uniforms; + int num_cached_uniforms; + bstr total; + struct timer_pool *timer; + struct ra_buf *ubo; + int ubo_index; // for ra_renderpass_input_val.index +}; + +struct gl_shader_cache { + struct ra *ra; + struct mp_log *log; + + // permanent + char **exts; + int num_exts; + + // this is modified during use (gl_sc_add() etc.) and reset for each shader + bstr prelude_text; + bstr header_text; + bstr text; + + // Next binding point (texture unit, image unit, buffer binding, etc.) + // In OpenGL these are separate for each input type + int next_binding[RA_VARTYPE_COUNT]; + + struct ra_renderpass_params params; + + struct sc_entry **entries; + int num_entries; + + struct sc_entry *current_shader; // set by gl_sc_generate() + + struct sc_uniform *uniforms; + int num_uniforms; + + int ubo_binding; + size_t ubo_size; + + struct ra_renderpass_input_val *values; + int num_values; + + // For checking that the user is calling gl_sc_reset() properly. + bool needs_reset; + + bool error_state; // true if an error occurred + + // temporary buffers (avoids frequent reallocations) + bstr tmp[6]; + + // For the disk-cache. + char *cache_dir; + struct mpv_global *global; // can be NULL +}; + +static void gl_sc_reset(struct gl_shader_cache *sc); + +struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, + struct mp_log *log) +{ + struct gl_shader_cache *sc = talloc_ptrtype(NULL, sc); + *sc = (struct gl_shader_cache){ + .ra = ra, + .global = global, + .log = log, + }; + gl_sc_reset(sc); + return sc; +} + +// Reset the previous pass. This must be called after gl_sc_generate and before +// starting a new shader. 
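/*
 * Hypothetical caller-side sketch of one pass through the cache: uniforms and
 * fragment shader text are added, then gl_sc_dispatch_draw() generates (or
 * reuses) the renderpass, runs it, and resets the cache state for the next
 * shader. Assumes a vertex format including a "texcoord" attribute was
 * registered earlier via gl_sc_set_vertex_format(); names are illustrative.
 */
static void example_draw_textured(struct gl_shader_cache *sc,
                                  struct ra_tex *target, struct ra_tex *src,
                                  void *vertices, size_t num_vertices)
{
    gl_sc_uniform_texture(sc, "tex", src);
    gl_sc_uniform_f(sc, "alpha", 1.0f);
    gl_sc_add(sc, "color = texture(tex, texcoord);\n");
    gl_sc_add(sc, "color.a *= alpha;\n");
    gl_sc_dispatch_draw(sc, target, vertices, num_vertices);
}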
+static void gl_sc_reset(struct gl_shader_cache *sc) +{ + sc->prelude_text.len = 0; + sc->header_text.len = 0; + sc->text.len = 0; + for (int n = 0; n < sc->num_uniforms; n++) + talloc_free((void *)sc->uniforms[n].input.name); + sc->num_uniforms = 0; + sc->ubo_binding = 0; + sc->ubo_size = 0; + for (int i = 0; i < RA_VARTYPE_COUNT; i++) + sc->next_binding[i] = 0; + sc->current_shader = NULL; + sc->params = (struct ra_renderpass_params){0}; + sc->needs_reset = false; +} + +static void sc_flush_cache(struct gl_shader_cache *sc) +{ + MP_VERBOSE(sc, "flushing shader cache\n"); + + for (int n = 0; n < sc->num_entries; n++) { + struct sc_entry *e = sc->entries[n]; + ra_buf_free(sc->ra, &e->ubo); + if (e->pass) + sc->ra->fns->renderpass_destroy(sc->ra, e->pass); + timer_pool_destroy(e->timer); + talloc_free(e); + } + sc->num_entries = 0; +} + +void gl_sc_destroy(struct gl_shader_cache *sc) +{ + if (!sc) + return; + gl_sc_reset(sc); + sc_flush_cache(sc); + talloc_free(sc); +} + +bool gl_sc_error_state(struct gl_shader_cache *sc) +{ + return sc->error_state; +} + +void gl_sc_reset_error(struct gl_shader_cache *sc) +{ + sc->error_state = false; +} + +void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name) +{ + for (int n = 0; n < sc->num_exts; n++) { + if (strcmp(sc->exts[n], name) == 0) + return; + } + MP_TARRAY_APPEND(sc, sc->exts, sc->num_exts, talloc_strdup(sc, name)); +} + +#define bstr_xappend0(sc, b, s) bstr_xappend(sc, b, bstr0(s)) + +void gl_sc_add(struct gl_shader_cache *sc, const char *text) +{ + bstr_xappend0(sc, &sc->text, text); +} + +void gl_sc_addf(struct gl_shader_cache *sc, const char *textf, ...) +{ + va_list ap; + va_start(ap, textf); + bstr_xappend_vasprintf(sc, &sc->text, textf, ap); + va_end(ap); +} + +void gl_sc_hadd(struct gl_shader_cache *sc, const char *text) +{ + bstr_xappend0(sc, &sc->header_text, text); +} + +void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) +{ + va_list ap; + va_start(ap, textf); + bstr_xappend_vasprintf(sc, &sc->header_text, textf, ap); + va_end(ap); +} + +void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text) +{ + bstr_xappend(sc, &sc->header_text, text); +} + +void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) +{ + va_list ap; + va_start(ap, textf); + bstr_xappend_vasprintf(sc, &sc->prelude_text, textf, ap); + va_end(ap); +} + +static struct sc_uniform *find_uniform(struct gl_shader_cache *sc, + const char *name) +{ + struct sc_uniform new = { + .input = { + .dim_v = 1, + .dim_m = 1, + }, + }; + + for (int n = 0; n < sc->num_uniforms; n++) { + struct sc_uniform *u = &sc->uniforms[n]; + if (strcmp(u->input.name, name) == 0) { + const char *allocname = u->input.name; + *u = new; + u->input.name = allocname; + return u; + } + } + + // not found -> add it + new.input.name = talloc_strdup(NULL, name); + MP_TARRAY_APPEND(sc, sc->uniforms, sc->num_uniforms, new); + return &sc->uniforms[sc->num_uniforms - 1]; +} + +static int gl_sc_next_binding(struct gl_shader_cache *sc, enum ra_vartype type) +{ + if (sc->ra->caps & RA_CAP_SHARED_BINDING) { + return sc->next_binding[type]++; + } else { + return sc->next_binding[0]++; + } +} + +// Updates the UBO metadata for the given sc_uniform. Assumes sc_uniform->input +// is already set. Also updates sc_uniform->type. +static void update_ubo_params(struct gl_shader_cache *sc, struct sc_uniform *u) +{ + if (!(sc->ra->caps & RA_CAP_BUF_RO)) + return; + + // Using UBOs with explicit layout(offset) like we do requires GLSL version + // 440 or higher. 
In theory the UBO code can also use older versions, but + // just try and avoid potential headaches. This also ensures they're only + // used on drivers that are probably modern enough to actually support them + // correctly. + if (sc->ra->glsl_version < 440) + return; + + u->type = SC_UNIFORM_TYPE_UBO; + u->layout = sc->ra->fns->uniform_layout(&u->input); + u->offset = MP_ALIGN_UP(sc->ubo_size, u->layout.align); + sc->ubo_size = u->offset + u->layout.size; +} + +void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, + struct ra_tex *tex) +{ + const char *glsl_type = "sampler2D"; + if (tex->params.dimensions == 1) { + glsl_type = "sampler1D"; + } else if (tex->params.dimensions == 3) { + glsl_type = "sampler3D"; + } else if (tex->params.non_normalized) { + glsl_type = "sampler2DRect"; + } else if (tex->params.external_oes) { + glsl_type = "samplerExternalOES"; + } else if (tex->params.format->ctype == RA_CTYPE_UINT) { + glsl_type = sc->ra->glsl_es ? "highp usampler2D" : "usampler2D"; + } + + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_TEX; + u->glsl_type = glsl_type; + u->input.binding = gl_sc_next_binding(sc, u->input.type); + u->v.tex = tex; +} + +void gl_sc_uniform_image2D_wo(struct gl_shader_cache *sc, const char *name, + struct ra_tex *tex) +{ + gl_sc_enable_extension(sc, "GL_ARB_shader_image_load_store"); + + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_IMG_W; + u->glsl_type = "writeonly image2D"; + u->input.binding = gl_sc_next_binding(sc, u->input.type); + u->v.tex = tex; +} + +void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, struct ra_buf *buf, + char *format, ...) +{ + assert(sc->ra->caps & RA_CAP_BUF_RW); + gl_sc_enable_extension(sc, "GL_ARB_shader_storage_buffer_object"); + + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_BUF_RW; + u->glsl_type = ""; + u->input.binding = gl_sc_next_binding(sc, u->input.type); + u->v.buf = buf; + + va_list ap; + va_start(ap, format); + u->buffer_format = ta_vasprintf(sc, format, ap); + va_end(ap); +} + +void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->glsl_type = "float"; + update_ubo_params(sc, u); + u->v.f[0] = f; +} + +void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int i) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_INT; + u->glsl_type = "int"; + update_ubo_params(sc, u); + u->v.i[0] = i; +} + +void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2]) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->input.dim_v = 2; + u->glsl_type = "vec2"; + update_ubo_params(sc, u); + u->v.f[0] = f[0]; + u->v.f[1] = f[1]; +} + +void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, float f[3]) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->input.dim_v = 3; + u->glsl_type = "vec3"; + update_ubo_params(sc, u); + u->v.f[0] = f[0]; + u->v.f[1] = f[1]; + u->v.f[2] = f[2]; +} + +static void transpose2x2(float r[2 * 2]) +{ + MPSWAP(float, r[0+2*1], r[1+2*0]); +} + +void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, + bool transpose, float *v) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->input.dim_v = 2; + u->input.dim_m = 2; + u->glsl_type = "mat2"; + update_ubo_params(sc, u); + for (int n = 0; n < 4; n++) + u->v.f[n] = 
v[n]; + if (transpose) + transpose2x2(&u->v.f[0]); +} + +static void transpose3x3(float r[3 * 3]) +{ + MPSWAP(float, r[0+3*1], r[1+3*0]); + MPSWAP(float, r[0+3*2], r[2+3*0]); + MPSWAP(float, r[1+3*2], r[2+3*1]); +} + +void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, + bool transpose, float *v) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->input.dim_v = 3; + u->input.dim_m = 3; + u->glsl_type = "mat3"; + update_ubo_params(sc, u); + for (int n = 0; n < 9; n++) + u->v.f[n] = v[n]; + if (transpose) + transpose3x3(&u->v.f[0]); +} + +// Tell the shader generator (and later gl_sc_draw_data()) about the vertex +// data layout and attribute names. The entries array is terminated with a {0} +// entry. The array memory must remain valid indefinitely (for now). +void gl_sc_set_vertex_format(struct gl_shader_cache *sc, + const struct ra_renderpass_input *entries, + int vertex_stride) +{ + sc->params.vertex_attribs = (struct ra_renderpass_input *)entries; + sc->params.num_vertex_attribs = 0; + while (entries[sc->params.num_vertex_attribs].name) + sc->params.num_vertex_attribs++; + sc->params.vertex_stride = vertex_stride; +} + +void gl_sc_blend(struct gl_shader_cache *sc, + enum ra_blend blend_src_rgb, + enum ra_blend blend_dst_rgb, + enum ra_blend blend_src_alpha, + enum ra_blend blend_dst_alpha) +{ + sc->params.enable_blend = true; + sc->params.blend_src_rgb = blend_src_rgb; + sc->params.blend_dst_rgb = blend_dst_rgb; + sc->params.blend_src_alpha = blend_src_alpha; + sc->params.blend_dst_alpha = blend_dst_alpha; +} + +static const char *vao_glsl_type(const struct ra_renderpass_input *e) +{ + // pretty dumb... too dumb, but works for us + switch (e->dim_v) { + case 1: return "float"; + case 2: return "vec2"; + case 3: return "vec3"; + case 4: return "vec4"; + default: abort(); + } +} + +static void update_ubo(struct ra *ra, struct ra_buf *ubo, struct sc_uniform *u) +{ + uintptr_t src = (uintptr_t) &u->v; + size_t dst = u->offset; + struct ra_layout src_layout = ra_renderpass_input_layout(&u->input); + struct ra_layout dst_layout = u->layout; + + for (int i = 0; i < u->input.dim_m; i++) { + ra->fns->buf_update(ra, ubo, dst, (void *)src, src_layout.stride); + src += src_layout.stride; + dst += dst_layout.stride; + } +} + +static void update_uniform(struct gl_shader_cache *sc, struct sc_entry *e, + struct sc_uniform *u, int n) +{ + struct sc_cached_uniform *un = &e->cached_uniforms[n]; + struct ra_layout layout = ra_renderpass_input_layout(&u->input); + if (layout.size > 0 && un->set && memcmp(&un->v, &u->v, layout.size) == 0) + return; + + un->v = u->v; + un->set = true; + + switch (u->type) { + case SC_UNIFORM_TYPE_GLOBAL: { + struct ra_renderpass_input_val value = { + .index = un->index, + .data = &un->v, + }; + MP_TARRAY_APPEND(sc, sc->values, sc->num_values, value); + break; + } + case SC_UNIFORM_TYPE_UBO: + assert(e->ubo); + update_ubo(sc->ra, e->ubo, u); + break; + default: abort(); + } +} + +void gl_sc_set_cache_dir(struct gl_shader_cache *sc, const char *dir) +{ + talloc_free(sc->cache_dir); + sc->cache_dir = talloc_strdup(sc, dir); +} + +static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry) +{ + bool ret = false; + + void *tmp = talloc_new(NULL); + struct ra_renderpass_params params = sc->params; + + MP_VERBOSE(sc, "new shader program:\n"); + if (sc->header_text.len) { + MP_VERBOSE(sc, "header:\n"); + mp_log_source(sc->log, MSGL_V, sc->header_text.start); + MP_VERBOSE(sc, "body:\n"); + } + if (sc->text.len) + 
mp_log_source(sc->log, MSGL_V, sc->text.start); + + // The vertex shader uses mangled names for the vertex attributes, so that + // the fragment shader can use the "real" names. But the shader is expecting + // the vertex attribute names (at least with older GLSL targets for GL). + params.vertex_attribs = talloc_memdup(tmp, params.vertex_attribs, + params.num_vertex_attribs * sizeof(params.vertex_attribs[0])); + for (int n = 0; n < params.num_vertex_attribs; n++) { + struct ra_renderpass_input *attrib = ¶ms.vertex_attribs[n]; + attrib->name = talloc_asprintf(tmp, "vertex_%s", attrib->name); + } + + const char *cache_header = "mpv shader cache v1\n"; + char *cache_filename = NULL; + char *cache_dir = NULL; + + if (sc->cache_dir && sc->cache_dir[0]) { + // Try to load it from a disk cache. + cache_dir = mp_get_user_path(tmp, sc->global, sc->cache_dir); + + struct AVSHA *sha = av_sha_alloc(); + if (!sha) + abort(); + av_sha_init(sha, 256); + av_sha_update(sha, entry->total.start, entry->total.len); + + uint8_t hash[256 / 8]; + av_sha_final(sha, hash); + av_free(sha); + + char hashstr[256 / 8 * 2 + 1]; + for (int n = 0; n < 256 / 8; n++) + snprintf(hashstr + n * 2, sizeof(hashstr) - n * 2, "%02X", hash[n]); + + cache_filename = mp_path_join(tmp, cache_dir, hashstr); + if (stat(cache_filename, &(struct stat){0}) == 0) { + MP_VERBOSE(sc, "Trying to load shader from disk...\n"); + struct bstr cachedata = + stream_read_file(cache_filename, tmp, sc->global, 1000000000); + if (bstr_eatstart0(&cachedata, cache_header)) + params.cached_program = cachedata; + } + } + + // If using a UBO, also make sure to add it as an input value so the RA + // can see it + if (sc->ubo_size) { + entry->ubo_index = sc->params.num_inputs; + struct ra_renderpass_input ubo_input = { + .name = "UBO", + .type = RA_VARTYPE_BUF_RO, + .dim_v = 1, + .dim_m = 1, + .binding = sc->ubo_binding, + }; + MP_TARRAY_APPEND(sc, params.inputs, params.num_inputs, ubo_input); + } + + entry->pass = sc->ra->fns->renderpass_create(sc->ra, ¶ms); + if (!entry->pass) + goto error; + + if (sc->ubo_size) { + struct ra_buf_params ubo_params = { + .type = RA_BUF_TYPE_UNIFORM, + .size = sc->ubo_size, + .host_mutable = true, + }; + + entry->ubo = ra_buf_create(sc->ra, &ubo_params); + if (!entry->ubo) { + MP_ERR(sc, "Failed creating uniform buffer!\n"); + goto error; + } + } + + if (entry->pass && cache_filename) { + bstr nc = entry->pass->params.cached_program; + if (nc.len && !bstr_equals(params.cached_program, nc)) { + mp_mkdirp(cache_dir); + + MP_VERBOSE(sc, "Writing shader cache file: %s\n", cache_filename); + FILE *out = fopen(cache_filename, "wb"); + if (out) { + fwrite(cache_header, strlen(cache_header), 1, out); + fwrite(nc.start, nc.len, 1, out); + fclose(out); + } + } + } + + ret = true; + +error: + talloc_free(tmp); + return ret; +} + +#define ADD(x, ...) 
bstr_xappend_asprintf(sc, (x), __VA_ARGS__) +#define ADD_BSTR(x, s) bstr_xappend(sc, (x), (s)) + +static void add_uniforms(struct gl_shader_cache *sc, bstr *dst) +{ + // Add all of the UBO entries separately as members of their own buffer + if (sc->ubo_size > 0) { + ADD(dst, "layout(std140, binding=%d) uniform UBO {\n", sc->ubo_binding); + for (int n = 0; n < sc->num_uniforms; n++) { + struct sc_uniform *u = &sc->uniforms[n]; + if (u->type != SC_UNIFORM_TYPE_UBO) + continue; + ADD(dst, "layout(offset=%zu) %s %s;\n", u->offset, + u->glsl_type, u->input.name); + } + ADD(dst, "};\n"); + } + + for (int n = 0; n < sc->num_uniforms; n++) { + struct sc_uniform *u = &sc->uniforms[n]; + if (u->type != SC_UNIFORM_TYPE_GLOBAL) + continue; + switch (u->input.type) { + case RA_VARTYPE_INT: + case RA_VARTYPE_FLOAT: + assert(sc->ra->caps & RA_CAP_GLOBAL_UNIFORM); + // fall through + case RA_VARTYPE_TEX: + case RA_VARTYPE_IMG_W: + // Vulkan requires explicitly assigning the bindings in the shader + // source. For OpenGL it's optional, but requires higher GL version + // so we don't do it (and instead have ra_gl update the bindings + // after program creation). + if (sc->ra->glsl_vulkan) + ADD(dst, "layout(binding=%d) ", u->input.binding); + ADD(dst, "uniform %s %s;\n", u->glsl_type, u->input.name); + break; + case RA_VARTYPE_BUF_RO: + ADD(dst, "layout(std140, binding=%d) uniform %s { %s };\n", + u->input.binding, u->input.name, u->buffer_format); + break; + case RA_VARTYPE_BUF_RW: + ADD(dst, "layout(std430, binding=%d) buffer %s { %s };\n", + u->input.binding, u->input.name, u->buffer_format); + break; + } + } +} + +// 1. Generate vertex and fragment shaders from the fragment shader text added +// with gl_sc_add(). The generated shader program is cached (based on the +// text), so actual compilation happens only the first time. +// 2. Update the uniforms and textures set with gl_sc_uniform_*. +// 3. Make the new shader program current (glUseProgram()). +// After that, you render, and then you call gc_sc_reset(), which does: +// 1. Unbind the program and all textures. +// 2. Reset the sc state and prepare for a new shader program. (All uniforms +// and fragment operations needed for the next program have to be re-added.) +static void gl_sc_generate(struct gl_shader_cache *sc, + enum ra_renderpass_type type, + const struct ra_format *target_format) +{ + int glsl_version = sc->ra->glsl_version; + int glsl_es = sc->ra->glsl_es ? glsl_version : 0; + + sc->params.type = type; + + // gl_sc_reset() must be called after ending the previous render process, + // and before starting a new one. + assert(!sc->needs_reset); + sc->needs_reset = true; + + // gl_sc_set_vertex_format() must always be called + assert(sc->params.vertex_attribs); + + // If using a UBO, pick a binding (needed for shader generation) + if (sc->ubo_size) + sc->ubo_binding = gl_sc_next_binding(sc, RA_VARTYPE_BUF_RO); + + for (int n = 0; n < MP_ARRAY_SIZE(sc->tmp); n++) + sc->tmp[n].len = 0; + + // set up shader text (header + uniforms + body) + bstr *header = &sc->tmp[0]; + ADD(header, "#version %d%s\n", glsl_version, glsl_es >= 300 ? " es" : ""); + if (type == RA_RENDERPASS_TYPE_COMPUTE) { + // This extension cannot be enabled in fragment shader. Enable it as + // an exception for compute shader. 
+ ADD(header, "#extension GL_ARB_compute_shader : enable\n"); + } + for (int n = 0; n < sc->num_exts; n++) + ADD(header, "#extension %s : enable\n", sc->exts[n]); + if (glsl_es) { + ADD(header, "precision mediump float;\n"); + ADD(header, "precision mediump sampler2D;\n"); + if (sc->ra->caps & RA_CAP_TEX_3D) + ADD(header, "precision mediump sampler3D;\n"); + } + + if (glsl_version >= 130) { + ADD(header, "#define tex1D texture\n"); + ADD(header, "#define tex3D texture\n"); + } else { + ADD(header, "#define tex1D texture1D\n"); + ADD(header, "#define tex3D texture3D\n"); + ADD(header, "#define texture texture2D\n"); + } + + if (sc->ra->glsl_vulkan && type == RA_RENDERPASS_TYPE_COMPUTE) { + ADD(header, "#define gl_GlobalInvocationIndex " + "(gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID)\n"); + } + + // Additional helpers. + ADD(header, "#define LUT_POS(x, lut_size)" + " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n"); + + char *vert_in = glsl_version >= 130 ? "in" : "attribute"; + char *vert_out = glsl_version >= 130 ? "out" : "varying"; + char *frag_in = glsl_version >= 130 ? "in" : "varying"; + + struct bstr *vert = NULL, *frag = NULL, *comp = NULL; + + if (type == RA_RENDERPASS_TYPE_RASTER) { + // vertex shader: we don't use the vertex shader, so just setup a + // dummy, which passes through the vertex array attributes. + bstr *vert_head = &sc->tmp[1]; + ADD_BSTR(vert_head, *header); + bstr *vert_body = &sc->tmp[2]; + ADD(vert_body, "void main() {\n"); + bstr *frag_vaos = &sc->tmp[3]; + for (int n = 0; n < sc->params.num_vertex_attribs; n++) { + const struct ra_renderpass_input *e = &sc->params.vertex_attribs[n]; + const char *glsl_type = vao_glsl_type(e); + char loc[32] = {0}; + if (sc->ra->glsl_vulkan) + snprintf(loc, sizeof(loc), "layout(location=%d) ", n); + if (strcmp(e->name, "position") == 0) { + // setting raster pos. requires setting gl_Position magic variable + assert(e->dim_v == 2 && e->type == RA_VARTYPE_FLOAT); + ADD(vert_head, "%s%s vec2 vertex_position;\n", loc, vert_in); + ADD(vert_body, "gl_Position = vec4(vertex_position, 1.0, 1.0);\n"); + } else { + ADD(vert_head, "%s%s %s vertex_%s;\n", loc, vert_in, glsl_type, e->name); + ADD(vert_head, "%s%s %s %s;\n", loc, vert_out, glsl_type, e->name); + ADD(vert_body, "%s = vertex_%s;\n", e->name, e->name); + ADD(frag_vaos, "%s%s %s %s;\n", loc, frag_in, glsl_type, e->name); + } + } + ADD(vert_body, "}\n"); + vert = vert_head; + ADD_BSTR(vert, *vert_body); + + // fragment shader; still requires adding used uniforms and VAO elements + frag = &sc->tmp[4]; + ADD_BSTR(frag, *header); + if (glsl_version >= 130) { + ADD(frag, "%sout vec4 out_color;\n", + sc->ra->glsl_vulkan ? 
"layout(location=0) " : ""); + } + ADD_BSTR(frag, *frag_vaos); + add_uniforms(sc, frag); + + ADD_BSTR(frag, sc->prelude_text); + ADD_BSTR(frag, sc->header_text); + + ADD(frag, "void main() {\n"); + // we require _all_ frag shaders to write to a "vec4 color" + ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); + ADD_BSTR(frag, sc->text); + if (glsl_version >= 130) { + ADD(frag, "out_color = color;\n"); + } else { + ADD(frag, "gl_FragColor = color;\n"); + } + ADD(frag, "}\n"); + + // We need to fix the format of the render dst at renderpass creation + // time + assert(target_format); + sc->params.target_format = target_format; + } + + if (type == RA_RENDERPASS_TYPE_COMPUTE) { + comp = &sc->tmp[4]; + ADD_BSTR(comp, *header); + + add_uniforms(sc, comp); + + ADD_BSTR(comp, sc->prelude_text); + ADD_BSTR(comp, sc->header_text); + + ADD(comp, "void main() {\n"); + ADD(comp, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); // convenience + ADD_BSTR(comp, sc->text); + ADD(comp, "}\n"); + } + + bstr *hash_total = &sc->tmp[5]; + + ADD(hash_total, "type %d\n", sc->params.type); + + if (frag) { + ADD_BSTR(hash_total, *frag); + sc->params.frag_shader = frag->start; + } + ADD(hash_total, "\n"); + if (vert) { + ADD_BSTR(hash_total, *vert); + sc->params.vertex_shader = vert->start; + } + ADD(hash_total, "\n"); + if (comp) { + ADD_BSTR(hash_total, *comp); + sc->params.compute_shader = comp->start; + } + ADD(hash_total, "\n"); + + if (sc->params.enable_blend) { + ADD(hash_total, "blend %d %d %d %d\n", + sc->params.blend_src_rgb, sc->params.blend_dst_rgb, + sc->params.blend_src_alpha, sc->params.blend_dst_alpha); + } + + if (sc->params.target_format) + ADD(hash_total, "format %s\n", sc->params.target_format->name); + + struct sc_entry *entry = NULL; + for (int n = 0; n < sc->num_entries; n++) { + struct sc_entry *cur = sc->entries[n]; + if (bstr_equals(cur->total, *hash_total)) { + entry = cur; + break; + } + } + if (!entry) { + if (sc->num_entries == SC_MAX_ENTRIES) + sc_flush_cache(sc); + entry = talloc_ptrtype(NULL, entry); + *entry = (struct sc_entry){ + .total = bstrdup(entry, *hash_total), + .timer = timer_pool_create(sc->ra), + }; + for (int n = 0; n < sc->num_uniforms; n++) { + struct sc_cached_uniform u = {0}; + if (sc->uniforms[n].type == SC_UNIFORM_TYPE_GLOBAL) { + // global uniforms need to be made visible to the ra_renderpass + u.index = sc->params.num_inputs; + MP_TARRAY_APPEND(sc, sc->params.inputs, sc->params.num_inputs, + sc->uniforms[n].input); + } + MP_TARRAY_APPEND(entry, entry->cached_uniforms, + entry->num_cached_uniforms, u); + } + if (!create_pass(sc, entry)) + sc->error_state = true; + MP_TARRAY_APPEND(sc, sc->entries, sc->num_entries, entry); + } + if (sc->error_state) + return; + + assert(sc->num_uniforms == entry->num_cached_uniforms); + + sc->num_values = 0; + for (int n = 0; n < sc->num_uniforms; n++) + update_uniform(sc, entry, &sc->uniforms[n], n); + + // If we're using a UBO, make sure to bind it as well + if (sc->ubo_size) { + struct ra_renderpass_input_val ubo_val = { + .index = entry->ubo_index, + .data = &entry->ubo, + }; + MP_TARRAY_APPEND(sc, sc->values, sc->num_values, ubo_val); + } + + sc->current_shader = entry; +} + +struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, + struct ra_tex *target, + void *ptr, size_t num) +{ + struct timer_pool *timer = NULL; + + gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format); + if (!sc->current_shader) + goto error; + + timer = sc->current_shader->timer; + + struct mp_rect full_rc = {0, 0, 
target->params.w, target->params.h}; + + struct ra_renderpass_run_params run = { + .pass = sc->current_shader->pass, + .values = sc->values, + .num_values = sc->num_values, + .target = target, + .vertex_data = ptr, + .vertex_count = num, + .viewport = full_rc, + .scissors = full_rc, + }; + + timer_pool_start(timer); + sc->ra->fns->renderpass_run(sc->ra, &run); + timer_pool_stop(timer); + +error: + gl_sc_reset(sc); + return timer_pool_measure(timer); +} + +struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, + int w, int h, int d) +{ + struct timer_pool *timer = NULL; + + gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL); + if (!sc->current_shader) + goto error; + + timer = sc->current_shader->timer; + + struct ra_renderpass_run_params run = { + .pass = sc->current_shader->pass, + .values = sc->values, + .num_values = sc->num_values, + .compute_groups = {w, h, d}, + }; + + timer_pool_start(timer); + sc->ra->fns->renderpass_run(sc->ra, &run); + timer_pool_stop(timer); + +error: + gl_sc_reset(sc); + return timer_pool_measure(timer); +} diff --git a/video/out/gpu/shader_cache.h b/video/out/gpu/shader_cache.h new file mode 100644 index 0000000000..82a078079b --- /dev/null +++ b/video/out/gpu/shader_cache.h @@ -0,0 +1,56 @@ +#pragma once + +#include "common/common.h" +#include "misc/bstr.h" +#include "ra.h" + +// For mp_pass_perf +#include "video/out/vo.h" + +struct mp_log; +struct mpv_global; +struct gl_shader_cache; + +struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, + struct mp_log *log); +void gl_sc_destroy(struct gl_shader_cache *sc); +bool gl_sc_error_state(struct gl_shader_cache *sc); +void gl_sc_reset_error(struct gl_shader_cache *sc); +void gl_sc_add(struct gl_shader_cache *sc, const char *text); +void gl_sc_addf(struct gl_shader_cache *sc, const char *textf, ...) + PRINTF_ATTRIBUTE(2, 3); +void gl_sc_hadd(struct gl_shader_cache *sc, const char *text); +void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) + PRINTF_ATTRIBUTE(2, 3); +void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text); +void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) + PRINTF_ATTRIBUTE(2, 3); +void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, + struct ra_tex *tex); +void gl_sc_uniform_image2D_wo(struct gl_shader_cache *sc, const char *name, + struct ra_tex *tex); +void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, struct ra_buf *buf, + char *format, ...) 
PRINTF_ATTRIBUTE(4, 5); +void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f); +void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int f); +void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2]); +void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, float f[3]); +void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, + bool transpose, float *v); +void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, + bool transpose, float *v); +void gl_sc_set_vertex_format(struct gl_shader_cache *sc, + const struct ra_renderpass_input *vertex_attribs, + int vertex_stride); +void gl_sc_blend(struct gl_shader_cache *sc, + enum ra_blend blend_src_rgb, + enum ra_blend blend_dst_rgb, + enum ra_blend blend_src_alpha, + enum ra_blend blend_dst_alpha); +void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name); +struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, + struct ra_tex *target, + void *ptr, size_t num); +struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, + int w, int h, int d); +void gl_sc_set_cache_dir(struct gl_shader_cache *sc, const char *dir); diff --git a/video/out/gpu/user_shaders.c b/video/out/gpu/user_shaders.c new file mode 100644 index 0000000000..446941b03f --- /dev/null +++ b/video/out/gpu/user_shaders.c @@ -0,0 +1,452 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include + +#include "common/msg.h" +#include "misc/ctype.h" +#include "user_shaders.h" + +static bool parse_rpn_szexpr(struct bstr line, struct szexp out[MAX_SZEXP_SIZE]) +{ + int pos = 0; + + while (line.len > 0) { + struct bstr word = bstr_strip(bstr_splitchar(line, &line, ' ')); + if (word.len == 0) + continue; + + if (pos >= MAX_SZEXP_SIZE) + return false; + + struct szexp *exp = &out[pos++]; + + if (bstr_eatend0(&word, ".w") || bstr_eatend0(&word, ".width")) { + exp->tag = SZEXP_VAR_W; + exp->val.varname = word; + continue; + } + + if (bstr_eatend0(&word, ".h") || bstr_eatend0(&word, ".height")) { + exp->tag = SZEXP_VAR_H; + exp->val.varname = word; + continue; + } + + switch (word.start[0]) { + case '+': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_ADD; continue; + case '-': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_SUB; continue; + case '*': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_MUL; continue; + case '/': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_DIV; continue; + case '!': exp->tag = SZEXP_OP1; exp->val.op = SZEXP_OP_NOT; continue; + case '>': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_GT; continue; + case '<': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_LT; continue; + } + + if (mp_isdigit(word.start[0])) { + exp->tag = SZEXP_CONST; + if (bstr_sscanf(word, "%f", &exp->val.cval) != 1) + return false; + continue; + } + + // Some sort of illegal expression + return false; + } + + return true; +} + +// Returns whether successful. 
'result' is left untouched on failure +bool eval_szexpr(struct mp_log *log, void *priv, + bool (*lookup)(void *priv, struct bstr var, float size[2]), + struct szexp expr[MAX_SZEXP_SIZE], float *result) +{ + float stack[MAX_SZEXP_SIZE] = {0}; + int idx = 0; // points to next element to push + + for (int i = 0; i < MAX_SZEXP_SIZE; i++) { + switch (expr[i].tag) { + case SZEXP_END: + goto done; + + case SZEXP_CONST: + // Since our SZEXPs are bound by MAX_SZEXP_SIZE, it should be + // impossible to overflow the stack + assert(idx < MAX_SZEXP_SIZE); + stack[idx++] = expr[i].val.cval; + continue; + + case SZEXP_OP1: + if (idx < 1) { + mp_warn(log, "Stack underflow in RPN expression!\n"); + return false; + } + + switch (expr[i].val.op) { + case SZEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break; + default: abort(); + } + continue; + + case SZEXP_OP2: + if (idx < 2) { + mp_warn(log, "Stack underflow in RPN expression!\n"); + return false; + } + + // Pop the operands in reverse order + float op2 = stack[--idx]; + float op1 = stack[--idx]; + float res = 0.0; + switch (expr[i].val.op) { + case SZEXP_OP_ADD: res = op1 + op2; break; + case SZEXP_OP_SUB: res = op1 - op2; break; + case SZEXP_OP_MUL: res = op1 * op2; break; + case SZEXP_OP_DIV: res = op1 / op2; break; + case SZEXP_OP_GT: res = op1 > op2; break; + case SZEXP_OP_LT: res = op1 < op2; break; + default: abort(); + } + + if (!isfinite(res)) { + mp_warn(log, "Illegal operation in RPN expression!\n"); + return false; + } + + stack[idx++] = res; + continue; + + case SZEXP_VAR_W: + case SZEXP_VAR_H: { + struct bstr name = expr[i].val.varname; + float size[2]; + + if (!lookup(priv, name, size)) { + mp_warn(log, "Variable %.*s not found in RPN expression!\n", + BSTR_P(name)); + return false; + } + + stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? 
size[0] : size[1]; + continue; + } + } + } + +done: + // Return the single stack element + if (idx != 1) { + mp_warn(log, "Malformed stack after RPN expression!\n"); + return false; + } + + *result = stack[0]; + return true; +} + +static bool parse_hook(struct mp_log *log, struct bstr *body, + struct gl_user_shader_hook *out) +{ + *out = (struct gl_user_shader_hook){ + .pass_desc = bstr0("(unknown)"), + .offset = identity_trans, + .width = {{ SZEXP_VAR_W, { .varname = bstr0("HOOKED") }}}, + .height = {{ SZEXP_VAR_H, { .varname = bstr0("HOOKED") }}}, + .cond = {{ SZEXP_CONST, { .cval = 1.0 }}}, + }; + + int hook_idx = 0; + int bind_idx = 0; + + // Parse all headers + while (true) { + struct bstr rest; + struct bstr line = bstr_strip(bstr_getline(*body, &rest)); + + // Check for the presence of the magic line beginning + if (!bstr_eatstart0(&line, "//!")) + break; + + *body = rest; + + // Parse the supported commands + if (bstr_eatstart0(&line, "HOOK")) { + if (hook_idx == SHADER_MAX_HOOKS) { + mp_err(log, "Passes may only hook up to %d textures!\n", + SHADER_MAX_HOOKS); + return false; + } + out->hook_tex[hook_idx++] = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "BIND")) { + if (bind_idx == SHADER_MAX_BINDS) { + mp_err(log, "Passes may only bind up to %d textures!\n", + SHADER_MAX_BINDS); + return false; + } + out->bind_tex[bind_idx++] = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "SAVE")) { + out->save_tex = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "DESC")) { + out->pass_desc = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "OFFSET")) { + float ox, oy; + if (bstr_sscanf(line, "%f %f", &ox, &oy) != 2) { + mp_err(log, "Error while parsing OFFSET!\n"); + return false; + } + out->offset.t[0] = ox; + out->offset.t[1] = oy; + continue; + } + + if (bstr_eatstart0(&line, "WIDTH")) { + if (!parse_rpn_szexpr(line, out->width)) { + mp_err(log, "Error while parsing WIDTH!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "HEIGHT")) { + if (!parse_rpn_szexpr(line, out->height)) { + mp_err(log, "Error while parsing HEIGHT!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "WHEN")) { + if (!parse_rpn_szexpr(line, out->cond)) { + mp_err(log, "Error while parsing WHEN!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "COMPONENTS")) { + if (bstr_sscanf(line, "%d", &out->components) != 1) { + mp_err(log, "Error while parsing COMPONENTS!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "COMPUTE")) { + struct compute_info *ci = &out->compute; + int num = bstr_sscanf(line, "%d %d %d %d", &ci->block_w, &ci->block_h, + &ci->threads_w, &ci->threads_h); + + if (num == 2 || num == 4) { + ci->active = true; + ci->directly_writes = true; + } else { + mp_err(log, "Error while parsing COMPUTE!\n"); + return false; + } + continue; + } + + // Unknown command type + mp_err(log, "Unrecognized command '%.*s'!\n", BSTR_P(line)); + return false; + } + + // The rest of the file up until the next magic line beginning (if any) + // shall be the shader body + if (bstr_split_tok(*body, "//!", &out->pass_body, body)) { + // Make sure the magic line is part of the rest + body->start -= 3; + body->len += 3; + } + + // Sanity checking + if (hook_idx == 0) + mp_warn(log, "Pass has no hooked textures (will be ignored)!\n"); + + return true; +} + +static bool parse_tex(struct mp_log *log, struct ra *ra, struct bstr *body, + struct gl_user_shader_tex *out) +{ + *out = (struct 
gl_user_shader_tex){ + .name = bstr0("USER_TEX"), + .params = { + .dimensions = 2, + .w = 1, .h = 1, .d = 1, + .render_src = true, + .src_linear = true, + }, + }; + struct ra_tex_params *p = &out->params; + + while (true) { + struct bstr rest; + struct bstr line = bstr_strip(bstr_getline(*body, &rest)); + + if (!bstr_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (bstr_eatstart0(&line, "TEXTURE")) { + out->name = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "SIZE")) { + p->dimensions = bstr_sscanf(line, "%d %d %d", &p->w, &p->h, &p->d); + if (p->dimensions < 1 || p->dimensions > 3 || + p->w < 1 || p->h < 1 || p->d < 1) + { + mp_err(log, "Error while parsing SIZE!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "FORMAT ")) { + p->format = NULL; + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (bstr_equals0(line, fmt->name)) { + p->format = fmt; + break; + } + } + // (pixel_size==0 is for opaque formats) + if (!p->format || !p->format->pixel_size) { + mp_err(log, "Unrecognized/unavailable FORMAT name: '%.*s'!\n", + BSTR_P(line)); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "FILTER")) { + line = bstr_strip(line); + if (bstr_equals0(line, "LINEAR")) { + p->src_linear = true; + } else if (bstr_equals0(line, "NEAREST")) { + p->src_linear = false; + } else { + mp_err(log, "Unrecognized FILTER: '%.*s'!\n", BSTR_P(line)); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "BORDER")) { + line = bstr_strip(line); + if (bstr_equals0(line, "CLAMP")) { + p->src_repeat = false; + } else if (bstr_equals0(line, "REPEAT")) { + p->src_repeat = true; + } else { + mp_err(log, "Unrecognized BORDER: '%.*s'!\n", BSTR_P(line)); + return false; + } + continue; + } + + mp_err(log, "Unrecognized command '%.*s'!\n", BSTR_P(line)); + return false; + } + + if (!p->format) { + mp_err(log, "No FORMAT specified.\n"); + return false; + } + + if (p->src_linear && !p->format->linear_filter) { + mp_err(log, "The specified texture format cannot be filtered!\n"); + return false; + } + + // Decode the rest of the section (up to the next //! marker) as raw hex + // data for the texture + struct bstr hexdata; + if (bstr_split_tok(*body, "//!", &hexdata, body)) { + // Make sure the magic line is part of the rest + body->start -= 3; + body->len += 3; + } + + struct bstr tex; + if (!bstr_decode_hex(NULL, bstr_strip(hexdata), &tex)) { + mp_err(log, "Error while parsing TEXTURE body: must be a valid " + "hexadecimal sequence, on a single line!\n"); + return false; + } + + int expected_len = p->w * p->h * p->d * p->format->pixel_size; + if (tex.len != expected_len) { + mp_err(log, "Shader TEXTURE size mismatch: got %zd bytes, expected %d!\n", + tex.len, expected_len); + talloc_free(tex.start); + return false; + } + + p->initial_data = tex.start; + return true; +} + +void parse_user_shader(struct mp_log *log, struct ra *ra, struct bstr shader, + void *priv, + bool (*dohook)(void *p, struct gl_user_shader_hook hook), + bool (*dotex)(void *p, struct gl_user_shader_tex tex)) +{ + if (!dohook || !dotex || !shader.len) + return; + + // Skip all garbage (e.g. 
comments) before the first header + int pos = bstr_find(shader, bstr0("//!")); + if (pos < 0) { + mp_warn(log, "Shader appears to contain no headers!\n"); + return; + } + shader = bstr_cut(shader, pos); + + // Loop over the file + while (shader.len > 0) + { + // Peek at the first header to dispatch the right type + if (bstr_startswith0(shader, "//!TEXTURE")) { + struct gl_user_shader_tex t; + if (!parse_tex(log, ra, &shader, &t) || !dotex(priv, t)) + return; + continue; + } + + struct gl_user_shader_hook h; + if (!parse_hook(log, &shader, &h) || !dohook(priv, h)) + return; + } +} diff --git a/video/out/gpu/user_shaders.h b/video/out/gpu/user_shaders.h new file mode 100644 index 0000000000..94a070c8e2 --- /dev/null +++ b/video/out/gpu/user_shaders.h @@ -0,0 +1,98 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#ifndef MP_GL_USER_SHADERS_H +#define MP_GL_USER_SHADERS_H + +#include "utils.h" +#include "ra.h" + +#define SHADER_MAX_PASSES 32 +#define SHADER_MAX_HOOKS 16 +#define SHADER_MAX_BINDS 6 +#define SHADER_MAX_SAVED 64 +#define MAX_SZEXP_SIZE 32 + +enum szexp_op { + SZEXP_OP_ADD, + SZEXP_OP_SUB, + SZEXP_OP_MUL, + SZEXP_OP_DIV, + SZEXP_OP_NOT, + SZEXP_OP_GT, + SZEXP_OP_LT, +}; + +enum szexp_tag { + SZEXP_END = 0, // End of an RPN expression + SZEXP_CONST, // Push a constant value onto the stack + SZEXP_VAR_W, // Get the width/height of a named texture (variable) + SZEXP_VAR_H, + SZEXP_OP2, // Pop two elements and push the result of a dyadic operation + SZEXP_OP1, // Pop one element and push the result of a monadic operation +}; + +struct szexp { + enum szexp_tag tag; + union { + float cval; + struct bstr varname; + enum szexp_op op; + } val; +}; + +struct compute_info { + bool active; + int block_w, block_h; // Block size (each block corresponds to one WG) + int threads_w, threads_h; // How many threads form a working group + bool directly_writes; // If true, shader is assumed to imageStore(out_image) +}; + +struct gl_user_shader_hook { + struct bstr pass_desc; + struct bstr hook_tex[SHADER_MAX_HOOKS]; + struct bstr bind_tex[SHADER_MAX_BINDS]; + struct bstr save_tex; + struct bstr pass_body; + struct gl_transform offset; + struct szexp width[MAX_SZEXP_SIZE]; + struct szexp height[MAX_SZEXP_SIZE]; + struct szexp cond[MAX_SZEXP_SIZE]; + int components; + struct compute_info compute; +}; + +struct gl_user_shader_tex { + struct bstr name; + struct ra_tex_params params; + // for video.c + struct ra_tex *tex; +}; + +// Parse the next shader block from `body`. The callbacks are invoked on every +// valid shader block parsed. 
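// For illustration, a minimal user shader block of the kind parse_user_shader()
// accepts (an illustrative sketch only: the hook()/HOOKED_tex/HOOKED_pos GLSL
// entry-point conventions, and names like HOOKED/OUTPUT in size expressions,
// are assumed to be provided by the renderer in video.c, not by this header):
//
//   //!HOOK LUMA
//   //!BIND HOOKED
//   //!DESC illustrative 2x luma doubler
//   //!WIDTH HOOKED.w 2 *
//   //!HEIGHT HOOKED.h 2 *
//   //!WHEN OUTPUT.w HOOKED.w >
//   vec4 hook() {
//       return HOOKED_tex(HOOKED_pos);
//   }
//
// WIDTH/HEIGHT/WHEN take RPN size expressions (parsed by parse_rpn_szexpr,
// evaluated by eval_szexpr below): "HOOKED.w 2 *" pushes HOOKED's width and
// the constant 2, then multiplies; the WHEN line makes the pass conditional
// on the output being larger than the hooked texture.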
+void parse_user_shader(struct mp_log *log, struct ra *ra, struct bstr shader, + void *priv, + bool (*dohook)(void *p, struct gl_user_shader_hook hook), + bool (*dotex)(void *p, struct gl_user_shader_tex tex)); + +// Evaluate a szexp, given a lookup function for named textures +bool eval_szexpr(struct mp_log *log, void *priv, + bool (*lookup)(void *priv, struct bstr var, float size[2]), + struct szexp expr[MAX_SZEXP_SIZE], float *result); + +#endif diff --git a/video/out/gpu/utils.c b/video/out/gpu/utils.c new file mode 100644 index 0000000000..f8dcbaac60 --- /dev/null +++ b/video/out/gpu/utils.c @@ -0,0 +1,372 @@ +#include "common/msg.h" +#include "video/out/vo.h" +#include "utils.h" + +// Standard parallel 2D projection, except y1 < y0 means that the coordinate +// system is flipped, not the projection. +void gl_transform_ortho(struct gl_transform *t, float x0, float x1, + float y0, float y1) +{ + if (y1 < y0) { + float tmp = y0; + y0 = tmp - y1; + y1 = tmp; + } + + t->m[0][0] = 2.0f / (x1 - x0); + t->m[0][1] = 0.0f; + t->m[1][0] = 0.0f; + t->m[1][1] = 2.0f / (y1 - y0); + t->t[0] = -(x1 + x0) / (x1 - x0); + t->t[1] = -(y1 + y0) / (y1 - y0); +} + +// Apply the effects of one transformation to another, transforming it in the +// process. In other words: post-composes t onto x +void gl_transform_trans(struct gl_transform t, struct gl_transform *x) +{ + struct gl_transform xt = *x; + x->m[0][0] = t.m[0][0] * xt.m[0][0] + t.m[0][1] * xt.m[1][0]; + x->m[1][0] = t.m[1][0] * xt.m[0][0] + t.m[1][1] * xt.m[1][0]; + x->m[0][1] = t.m[0][0] * xt.m[0][1] + t.m[0][1] * xt.m[1][1]; + x->m[1][1] = t.m[1][0] * xt.m[0][1] + t.m[1][1] * xt.m[1][1]; + gl_transform_vec(t, &x->t[0], &x->t[1]); +} + +void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo) +{ + int y_dir = fbo.flip ? 
-1 : 1; + gl_transform_ortho(t, 0, fbo.tex->params.w, 0, fbo.tex->params.h * y_dir); +} + +void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool) +{ + for (int i = 0; i < pool->num_buffers; i++) + ra_buf_free(ra, &pool->buffers[i]); + + talloc_free(pool->buffers); + *pool = (struct ra_buf_pool){0}; +} + +static bool ra_buf_params_compatible(const struct ra_buf_params *new, + const struct ra_buf_params *old) +{ + return new->type == old->type && + new->size <= old->size && + new->host_mapped == old->host_mapped && + new->host_mutable == old->host_mutable; +} + +static bool ra_buf_pool_grow(struct ra *ra, struct ra_buf_pool *pool) +{ + struct ra_buf *buf = ra_buf_create(ra, &pool->current_params); + if (!buf) + return false; + + MP_TARRAY_INSERT_AT(NULL, pool->buffers, pool->num_buffers, pool->index, buf); + MP_VERBOSE(ra, "Resized buffer pool of type %u to size %d\n", + pool->current_params.type, pool->num_buffers); + return true; +} + +struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, + const struct ra_buf_params *params) +{ + assert(!params->initial_data); + + if (!ra_buf_params_compatible(params, &pool->current_params)) { + ra_buf_pool_uninit(ra, pool); + pool->current_params = *params; + } + + // Make sure we have at least one buffer available + if (!pool->buffers && !ra_buf_pool_grow(ra, pool)) + return NULL; + + // Make sure the next buffer is available for use + if (!ra->fns->buf_poll(ra, pool->buffers[pool->index]) && + !ra_buf_pool_grow(ra, pool)) + { + return NULL; + } + + struct ra_buf *buf = pool->buffers[pool->index++]; + pool->index %= pool->num_buffers; + + return buf; +} + +bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, + const struct ra_tex_upload_params *params) +{ + if (params->buf) + return ra->fns->tex_upload(ra, params); + + struct ra_tex *tex = params->tex; + size_t row_size = tex->params.dimensions == 2 ? params->stride : + tex->params.w * tex->params.format->pixel_size; + + struct ra_buf_params bufparams = { + .type = RA_BUF_TYPE_TEX_UPLOAD, + .size = row_size * tex->params.h * tex->params.d, + .host_mutable = true, + }; + + struct ra_buf *buf = ra_buf_pool_get(ra, pbo, &bufparams); + if (!buf) + return false; + + ra->fns->buf_update(ra, buf, 0, params->src, bufparams.size); + + struct ra_tex_upload_params newparams = *params; + newparams.buf = buf; + newparams.src = NULL; + + return ra->fns->tex_upload(ra, &newparams); +} + +struct ra_layout std140_layout(struct ra_renderpass_input *inp) +{ + size_t el_size = ra_vartype_size(inp->type); + + // std140 packing rules: + // 1. The alignment of generic values is their size in bytes + // 2. The alignment of vectors is the vector length * the base count, with + // the exception of vec3 which is always aligned like vec4 + // 3. The alignment of arrays is that of the element size rounded up to + // the nearest multiple of vec4 + // 4. Matrices are treated like arrays of vectors + // 5. 
Arrays/matrices are laid out with a stride equal to the alignment + size_t size = el_size * inp->dim_v; + if (inp->dim_v == 3) + size += el_size; + if (inp->dim_m > 1) + size = MP_ALIGN_UP(size, sizeof(float[4])); + + return (struct ra_layout) { + .align = size, + .stride = size, + .size = size * inp->dim_m, + }; +} + +struct ra_layout std430_layout(struct ra_renderpass_input *inp) +{ + size_t el_size = ra_vartype_size(inp->type); + + // std430 packing rules: like std140, except arrays/matrices are always + // "tightly" packed, even arrays/matrices of vec3s + size_t size = el_size * inp->dim_v; + if (inp->dim_v == 3 && inp->dim_m == 1) + size += el_size; + + return (struct ra_layout) { + .align = size, + .stride = size, + .size = size * inp->dim_m, + }; +} + +// Create a texture and a FBO using the texture as color attachments. +// fmt: texture internal format +// If the parameters are the same as the previous call, do not touch it. +// flags can be 0, or a combination of FBOTEX_FUZZY_W and FBOTEX_FUZZY_H. +// Enabling FUZZY for W or H means the w or h does not need to be exact. +bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, + int w, int h, const struct ra_format *fmt, int flags) +{ + int lw = w, lh = h; + + if (fbo->tex) { + int cw = w, ch = h; + int rw = fbo->tex->params.w, rh = fbo->tex->params.h; + + if ((flags & FBOTEX_FUZZY_W) && cw < rw) + cw = rw; + if ((flags & FBOTEX_FUZZY_H) && ch < rh) + ch = rh; + + if (rw == cw && rh == ch && fbo->tex->params.format == fmt) + goto done; + } + + if (flags & FBOTEX_FUZZY_W) + w = MP_ALIGN_UP(w, 256); + if (flags & FBOTEX_FUZZY_H) + h = MP_ALIGN_UP(h, 256); + + mp_verbose(log, "Create FBO: %dx%d (%dx%d)\n", lw, lh, w, h); + + if (!fmt || !fmt->renderable || !fmt->linear_filter) { + mp_err(log, "Format %s not supported.\n", fmt ? 
fmt->name : "(unset)"); + return false; + } + + fbotex_uninit(fbo); + + *fbo = (struct fbotex) { + .ra = ra, + }; + + struct ra_tex_params params = { + .dimensions = 2, + .w = w, + .h = h, + .d = 1, + .format = fmt, + .src_linear = true, + .render_src = true, + .render_dst = true, + .storage_dst = true, + .blit_src = true, + }; + + fbo->tex = ra_tex_create(fbo->ra, ¶ms); + + if (!fbo->tex) { + mp_err(log, "Error: framebuffer could not be created.\n"); + fbotex_uninit(fbo); + return false; + } + +done: + + fbo->lw = lw; + fbo->lh = lh; + + fbo->fbo = (struct fbodst){ + .tex = fbo->tex, + }; + + return true; +} + +void fbotex_uninit(struct fbotex *fbo) +{ + if (fbo->ra) { + ra_tex_free(fbo->ra, &fbo->tex); + *fbo = (struct fbotex) {0}; + } +} + +struct timer_pool { + struct ra *ra; + ra_timer *timer; + bool running; // detect invalid usage + + uint64_t samples[VO_PERF_SAMPLE_COUNT]; + int sample_idx; + int sample_count; + + uint64_t sum; + uint64_t peak; +}; + +struct timer_pool *timer_pool_create(struct ra *ra) +{ + if (!ra->fns->timer_create) + return NULL; + + ra_timer *timer = ra->fns->timer_create(ra); + if (!timer) + return NULL; + + struct timer_pool *pool = talloc(NULL, struct timer_pool); + if (!pool) { + ra->fns->timer_destroy(ra, timer); + return NULL; + } + + *pool = (struct timer_pool){ .ra = ra, .timer = timer }; + return pool; +} + +void timer_pool_destroy(struct timer_pool *pool) +{ + if (!pool) + return; + + pool->ra->fns->timer_destroy(pool->ra, pool->timer); + talloc_free(pool); +} + +void timer_pool_start(struct timer_pool *pool) +{ + if (!pool) + return; + + assert(!pool->running); + pool->ra->fns->timer_start(pool->ra, pool->timer); + pool->running = true; +} + +void timer_pool_stop(struct timer_pool *pool) +{ + if (!pool) + return; + + assert(pool->running); + uint64_t res = pool->ra->fns->timer_stop(pool->ra, pool->timer); + pool->running = false; + + if (res) { + // Input res into the buffer and grab the previous value + uint64_t old = pool->samples[pool->sample_idx]; + pool->sample_count = MPMIN(pool->sample_count + 1, VO_PERF_SAMPLE_COUNT); + pool->samples[pool->sample_idx++] = res; + pool->sample_idx %= VO_PERF_SAMPLE_COUNT; + pool->sum = pool->sum + res - old; + + // Update peak if necessary + if (res >= pool->peak) { + pool->peak = res; + } else if (pool->peak == old) { + // It's possible that the last peak was the value we just removed, + // if so we need to scan for the new peak + uint64_t peak = res; + for (int i = 0; i < VO_PERF_SAMPLE_COUNT; i++) + peak = MPMAX(peak, pool->samples[i]); + pool->peak = peak; + } + } +} + +struct mp_pass_perf timer_pool_measure(struct timer_pool *pool) +{ + if (!pool) + return (struct mp_pass_perf){0}; + + struct mp_pass_perf res = { + .peak = pool->peak, + .count = pool->sample_count, + }; + + int idx = pool->sample_idx - pool->sample_count + VO_PERF_SAMPLE_COUNT; + for (int i = 0; i < res.count; i++) { + idx %= VO_PERF_SAMPLE_COUNT; + res.samples[i] = pool->samples[idx++]; + } + + if (res.count > 0) { + res.last = res.samples[res.count - 1]; + res.avg = pool->sum / res.count; + } + + return res; +} + +void mp_log_source(struct mp_log *log, int lev, const char *src) +{ + int line = 1; + if (!src) + return; + while (*src) { + const char *end = strchr(src, '\n'); + const char *next = end + 1; + if (!end) + next = end = src + strlen(src); + mp_msg(log, lev, "[%3d] %.*s\n", line, (int)(end - src), src); + line++; + src = next; + } +} diff --git a/video/out/gpu/utils.h b/video/out/gpu/utils.h new file mode 100644 index 
0000000000..04695f8085 --- /dev/null +++ b/video/out/gpu/utils.h @@ -0,0 +1,120 @@ +#pragma once + +#include +#include + +#include "ra.h" + +// A 3x2 matrix, with the translation part separate. +struct gl_transform { + // row-major, e.g. in mathematical notation: + // | m[0][0] m[0][1] | + // | m[1][0] m[1][1] | + float m[2][2]; + float t[2]; +}; + +static const struct gl_transform identity_trans = { + .m = {{1.0, 0.0}, {0.0, 1.0}}, + .t = {0.0, 0.0}, +}; + +void gl_transform_ortho(struct gl_transform *t, float x0, float x1, + float y0, float y1); + +// This treats m as an affine transformation, in other words m[2][n] gets +// added to the output. +static inline void gl_transform_vec(struct gl_transform t, float *x, float *y) +{ + float vx = *x, vy = *y; + *x = vx * t.m[0][0] + vy * t.m[0][1] + t.t[0]; + *y = vx * t.m[1][0] + vy * t.m[1][1] + t.t[1]; +} + +struct mp_rect_f { + float x0, y0, x1, y1; +}; + +// Semantic equality (fuzzy comparison) +static inline bool mp_rect_f_seq(struct mp_rect_f a, struct mp_rect_f b) +{ + return fabs(a.x0 - b.x0) < 1e-6 && fabs(a.x1 - b.x1) < 1e-6 && + fabs(a.y0 - b.y0) < 1e-6 && fabs(a.y1 - b.y1) < 1e-6; +} + +static inline void gl_transform_rect(struct gl_transform t, struct mp_rect_f *r) +{ + gl_transform_vec(t, &r->x0, &r->y0); + gl_transform_vec(t, &r->x1, &r->y1); +} + +static inline bool gl_transform_eq(struct gl_transform a, struct gl_transform b) +{ + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + if (a.m[x][y] != b.m[x][y]) + return false; + } + } + + return a.t[0] == b.t[0] && a.t[1] == b.t[1]; +} + +void gl_transform_trans(struct gl_transform t, struct gl_transform *x); + +struct fbodst { + struct ra_tex *tex; + bool flip; // mirror vertically +}; + +void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo); + +// A pool of buffers, which can grow as needed +struct ra_buf_pool { + struct ra_buf_params current_params; + struct ra_buf **buffers; + int num_buffers; + int index; +}; + +void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool); + +// Note: params->initial_data is *not* supported +struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, + const struct ra_buf_params *params); + +// Helper that wraps ra_tex_upload using texture upload buffers to ensure that +// params->buf is always set. This is intended for RA-internal usage. +bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, + const struct ra_tex_upload_params *params); + +// Layout rules for GLSL's packing modes +struct ra_layout std140_layout(struct ra_renderpass_input *inp); +struct ra_layout std430_layout(struct ra_renderpass_input *inp); + +struct fbotex { + struct ra *ra; + struct ra_tex *tex; + int lw, lh; // logical (configured) size, <= than texture size + struct fbodst fbo; +}; + +void fbotex_uninit(struct fbotex *fbo); +bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, + int w, int h, const struct ra_format *fmt, int flags); +#define FBOTEX_FUZZY_W 1 +#define FBOTEX_FUZZY_H 2 +#define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H) + +// A wrapper around ra_timer that does result pooling, averaging etc. +struct timer_pool; + +struct timer_pool *timer_pool_create(struct ra *ra); +void timer_pool_destroy(struct timer_pool *pool); +void timer_pool_start(struct timer_pool *pool); +void timer_pool_stop(struct timer_pool *pool); +struct mp_pass_perf timer_pool_measure(struct timer_pool *pool); + +// print a multi line string with line numbers (e.g. 
for shader sources) +// log, lev: module and log level, as in mp_msg() +void mp_log_source(struct mp_log *log, int lev, const char *src); diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c new file mode 100644 index 0000000000..e36fde60e8 --- /dev/null +++ b/video/out/gpu/video.c @@ -0,0 +1,3809 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "video.h" + +#include "misc/bstr.h" +#include "options/m_config.h" +#include "common/global.h" +#include "options/options.h" +#include "utils.h" +#include "hwdec.h" +#include "osd.h" +#include "ra.h" +#include "stream/stream.h" +#include "video_shaders.h" +#include "user_shaders.h" +#include "video/out/filter_kernels.h" +#include "video/out/aspect.h" +#include "video/out/dither.h" +#include "video/out/vo.h" + +// scale/cscale arguments that map directly to shader filter routines. +// Note that the convolution filters are not included in this list. +static const char *const fixed_scale_filters[] = { + "bilinear", + "bicubic_fast", + "oversample", + NULL +}; +static const char *const fixed_tscale_filters[] = { + "oversample", + "linear", + NULL +}; + +// must be sorted, and terminated with 0 +int filter_sizes[] = + {2, 4, 6, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 0}; +int tscale_sizes[] = {2, 4, 6, 0}; // limited by TEXUNIT_VIDEO_NUM + +struct vertex_pt { + float x, y; +}; + +struct vertex { + struct vertex_pt position; + struct vertex_pt texcoord[TEXUNIT_VIDEO_NUM]; +}; + +static const struct ra_renderpass_input vertex_vao[] = { + {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, + {"texcoord0", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[0])}, + {"texcoord1", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[1])}, + {"texcoord2", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[2])}, + {"texcoord3", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[3])}, + {"texcoord4", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[4])}, + {"texcoord5", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[5])}, + {0} +}; + +struct texplane { + struct ra_tex *tex; + int w, h; + bool flipped; +}; + +struct video_image { + struct texplane planes[4]; + struct mp_image *mpi; // original input image + uint64_t id; // unique ID identifying mpi contents + bool hwdec_mapped; +}; + +enum plane_type { + PLANE_NONE = 0, + PLANE_RGB, + PLANE_LUMA, + PLANE_CHROMA, + PLANE_ALPHA, + PLANE_XYZ, +}; + +static const char *plane_names[] = { + [PLANE_NONE] = "unknown", + [PLANE_RGB] = "rgb", + [PLANE_LUMA] = "luma", + [PLANE_CHROMA] = "chroma", + [PLANE_ALPHA] = "alpha", + [PLANE_XYZ] = "xyz", +}; + +// A self-contained description of a source image which can be bound to a +// texture unit and sampled from. 
Contains metadata about how it's to be used +struct img_tex { + enum plane_type type; // must be set to something non-zero + int components; // number of relevant coordinates + float multiplier; // multiplier to be used when sampling + struct ra_tex *tex; + int w, h; // logical size (after transformation) + struct gl_transform transform; // rendering transformation +}; + +// A named img_tex, for user scripting purposes +struct saved_tex { + const char *name; + struct img_tex tex; +}; + +// A texture hook. This is some operation that transforms a named texture as +// soon as it's generated +struct tex_hook { + const char *save_tex; + const char *hook_tex[SHADER_MAX_HOOKS]; + const char *bind_tex[TEXUNIT_VIDEO_NUM]; + int components; // how many components are relevant (0 = same as input) + void *priv; // this gets talloc_freed when the tex_hook is removed + void (*hook)(struct gl_video *p, struct img_tex tex, // generates GLSL + struct gl_transform *trans, void *priv); + bool (*cond)(struct gl_video *p, struct img_tex tex, void *priv); +}; + +struct fbosurface { + struct fbotex fbotex; + uint64_t id; + double pts; +}; + +#define FBOSURFACES_MAX 10 + +struct cached_file { + char *path; + struct bstr body; +}; + +struct pass_info { + struct bstr desc; + struct mp_pass_perf perf; +}; + +#define PASS_INFO_MAX (SHADER_MAX_PASSES + 32) + +struct dr_buffer { + struct ra_buf *buf; + // The mpi reference will keep the data from being recycled (or from other + // references gaining write access) while the GPU is accessing the buffer. + struct mp_image *mpi; +}; + +struct gl_video { + struct ra *ra; + + struct mpv_global *global; + struct mp_log *log; + struct gl_video_opts opts; + struct m_config_cache *opts_cache; + struct gl_lcms *cms; + + int fb_depth; // actual bits available in GL main framebuffer + struct m_color clear_color; + bool force_clear_color; + + struct gl_shader_cache *sc; + + struct osd_state *osd_state; + struct mpgl_osd *osd; + double osd_pts; + + struct ra_tex *lut_3d_texture; + bool use_lut_3d; + int lut_3d_size[3]; + + struct ra_tex *dither_texture; + + struct mp_image_params real_image_params; // configured format + struct mp_image_params image_params; // texture format (mind hwdec case) + struct ra_imgfmt_desc ra_format; // texture format + int plane_count; + + bool is_gray; + bool has_alpha; + char color_swizzle[5]; + bool use_integer_conversion; + + struct video_image image; + + struct dr_buffer *dr_buffers; + int num_dr_buffers; + + bool using_dr_path; + + bool dumb_mode; + bool forced_dumb_mode; + + const struct ra_format *fbo_format; + struct fbotex merge_fbo[4]; + struct fbotex scale_fbo[4]; + struct fbotex integer_fbo[4]; + struct fbotex indirect_fbo; + struct fbotex blend_subs_fbo; + struct fbotex screen_fbo; + struct fbotex output_fbo; + struct fbosurface surfaces[FBOSURFACES_MAX]; + struct fbotex vdpau_deinterleave_fbo[2]; + struct ra_buf *hdr_peak_ssbo; + + // user pass descriptions and textures + struct tex_hook tex_hooks[SHADER_MAX_PASSES]; + int tex_hook_num; + struct gl_user_shader_tex user_textures[SHADER_MAX_PASSES]; + int user_tex_num; + + int surface_idx; + int surface_now; + int frames_drawn; + bool is_interpolated; + bool output_fbo_valid; + + // state for configured scalers + struct scaler scaler[SCALER_COUNT]; + + struct mp_csp_equalizer_state *video_eq; + + struct mp_rect src_rect; // displayed part of the source video + struct mp_rect dst_rect; // video rectangle on output window + struct mp_osd_res osd_rect; // OSD size/margins + + // temporary during 
rendering + struct img_tex pass_tex[TEXUNIT_VIDEO_NUM]; + struct compute_info pass_compute; // compute shader metadata for this pass + int pass_tex_num; + int texture_w, texture_h; + struct gl_transform texture_offset; // texture transform without rotation + int components; + bool use_linear; + float user_gamma; + + // pass info / metrics + struct pass_info pass_fresh[PASS_INFO_MAX]; + struct pass_info pass_redraw[PASS_INFO_MAX]; + struct pass_info *pass; + int pass_idx; + struct timer_pool *upload_timer; + struct timer_pool *blit_timer; + struct timer_pool *osd_timer; + + // intermediate textures + struct saved_tex saved_tex[SHADER_MAX_SAVED]; + int saved_tex_num; + struct fbotex hook_fbos[SHADER_MAX_SAVED]; + int hook_fbo_num; + + int frames_uploaded; + int frames_rendered; + AVLFG lfg; + + // Cached because computing it can take relatively long + int last_dither_matrix_size; + float *last_dither_matrix; + + struct cached_file *files; + int num_files; + + struct ra_hwdec *hwdec; + struct ra_hwdec_mapper *hwdec_mapper; + bool hwdec_active; + + bool dsi_warned; + bool broken_frame; // temporary error state +}; + +static const struct gl_video_opts gl_video_opts_def = { + .dither_algo = DITHER_FRUIT, + .dither_depth = -1, + .dither_size = 6, + .temporal_dither_period = 1, + .fbo_format = "auto", + .sigmoid_center = 0.75, + .sigmoid_slope = 6.5, + .scaler = { + {{"bilinear", .params={NAN, NAN}}, {.params = {NAN, NAN}}, + .cutoff = 0.001}, // scale + {{NULL, .params={NAN, NAN}}, {.params = {NAN, NAN}}, + .cutoff = 0.001}, // dscale + {{"bilinear", .params={NAN, NAN}}, {.params = {NAN, NAN}}, + .cutoff = 0.001}, // cscale + {{"mitchell", .params={NAN, NAN}}, {.params = {NAN, NAN}}, + .clamp = 1, }, // tscale + }, + .scaler_resizes_only = 1, + .scaler_lut_size = 6, + .interpolation_threshold = 0.0001, + .alpha_mode = ALPHA_BLEND_TILES, + .background = {0, 0, 0, 255}, + .gamma = 1.0f, + .tone_mapping = TONE_MAPPING_MOBIUS, + .tone_mapping_param = NAN, + .tone_mapping_desat = 2.0, + .early_flush = -1, +}; + +static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param); + +static int validate_window_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param); + +#define OPT_BASE_STRUCT struct gl_video_opts + +#define SCALER_OPTS(n, i) \ + OPT_STRING_VALIDATE(n, scaler[i].kernel.name, 0, validate_scaler_opt), \ + OPT_FLOAT(n"-param1", scaler[i].kernel.params[0], 0), \ + OPT_FLOAT(n"-param2", scaler[i].kernel.params[1], 0), \ + OPT_FLOAT(n"-blur", scaler[i].kernel.blur, 0), \ + OPT_FLOATRANGE(n"-cutoff", scaler[i].cutoff, 0, 0.0, 1.0), \ + OPT_FLOATRANGE(n"-taper", scaler[i].kernel.taper, 0, 0.0, 1.0), \ + OPT_FLOAT(n"-wparam", scaler[i].window.params[0], 0), \ + OPT_FLOAT(n"-wblur", scaler[i].window.blur, 0), \ + OPT_FLOATRANGE(n"-wtaper", scaler[i].window.taper, 0, 0.0, 1.0), \ + OPT_FLOATRANGE(n"-clamp", scaler[i].clamp, 0, 0.0, 1.0), \ + OPT_FLOATRANGE(n"-radius", scaler[i].radius, 0, 0.5, 16.0), \ + OPT_FLOATRANGE(n"-antiring", scaler[i].antiring, 0, 0.0, 1.0), \ + OPT_STRING_VALIDATE(n"-window", scaler[i].window.name, 0, validate_window_opt) + +const struct m_sub_options gl_video_conf = { + .opts = (const m_option_t[]) { + OPT_CHOICE("gpu-dumb-mode", dumb_mode, 0, + ({"auto", 0}, {"yes", 1}, {"no", -1})), + OPT_FLOATRANGE("gamma-factor", gamma, 0, 0.1, 2.0), + OPT_FLAG("gamma-auto", gamma_auto, 0), + OPT_CHOICE_C("target-prim", target_prim, 0, mp_csp_prim_names), + OPT_CHOICE_C("target-trc", target_trc, 0, 
mp_csp_trc_names), + OPT_CHOICE("tone-mapping", tone_mapping, 0, + ({"clip", TONE_MAPPING_CLIP}, + {"mobius", TONE_MAPPING_MOBIUS}, + {"reinhard", TONE_MAPPING_REINHARD}, + {"hable", TONE_MAPPING_HABLE}, + {"gamma", TONE_MAPPING_GAMMA}, + {"linear", TONE_MAPPING_LINEAR})), + OPT_FLAG("hdr-compute-peak", compute_hdr_peak, 0), + OPT_FLOAT("tone-mapping-param", tone_mapping_param, 0), + OPT_FLOAT("tone-mapping-desaturate", tone_mapping_desat, 0), + OPT_FLAG("gamut-warning", gamut_warning, 0), + OPT_FLAG("opengl-pbo", pbo, 0), + SCALER_OPTS("scale", SCALER_SCALE), + SCALER_OPTS("dscale", SCALER_DSCALE), + SCALER_OPTS("cscale", SCALER_CSCALE), + SCALER_OPTS("tscale", SCALER_TSCALE), + OPT_INTRANGE("scaler-lut-size", scaler_lut_size, 0, 4, 10), + OPT_FLAG("scaler-resizes-only", scaler_resizes_only, 0), + OPT_FLAG("linear-scaling", linear_scaling, 0), + OPT_FLAG("correct-downscaling", correct_downscaling, 0), + OPT_FLAG("sigmoid-upscaling", sigmoid_upscaling, 0), + OPT_FLOATRANGE("sigmoid-center", sigmoid_center, 0, 0.0, 1.0), + OPT_FLOATRANGE("sigmoid-slope", sigmoid_slope, 0, 1.0, 20.0), + OPT_STRING("fbo-format", fbo_format, 0), + OPT_CHOICE_OR_INT("dither-depth", dither_depth, 0, -1, 16, + ({"no", -1}, {"auto", 0})), + OPT_CHOICE("dither", dither_algo, 0, + ({"fruit", DITHER_FRUIT}, + {"ordered", DITHER_ORDERED}, + {"no", DITHER_NONE})), + OPT_INTRANGE("dither-size-fruit", dither_size, 0, 2, 8), + OPT_FLAG("temporal-dither", temporal_dither, 0), + OPT_INTRANGE("temporal-dither-period", temporal_dither_period, 0, 1, 128), + OPT_CHOICE("alpha", alpha_mode, 0, + ({"no", ALPHA_NO}, + {"yes", ALPHA_YES}, + {"blend", ALPHA_BLEND}, + {"blend-tiles", ALPHA_BLEND_TILES})), + OPT_FLAG("opengl-rectangle-textures", use_rectangle, 0), + OPT_COLOR("background", background, 0), + OPT_FLAG("interpolation", interpolation, 0), + OPT_FLOAT("interpolation-threshold", interpolation_threshold, 0), + OPT_CHOICE("blend-subtitles", blend_subs, 0, + ({"no", BLEND_SUBS_NO}, + {"yes", BLEND_SUBS_YES}, + {"video", BLEND_SUBS_VIDEO})), + OPT_PATHLIST("glsl-shaders", user_shaders, 0), + OPT_CLI_ALIAS("glsl-shader", "glsl-shaders-append"), + OPT_FLAG("deband", deband, 0), + OPT_SUBSTRUCT("deband", deband_opts, deband_conf, 0), + OPT_FLOAT("sharpen", unsharp, 0), + OPT_INTRANGE("gpu-tex-pad-x", tex_pad_x, 0, 0, 4096), + OPT_INTRANGE("gpu-tex-pad-y", tex_pad_y, 0, 0, 4096), + OPT_SUBSTRUCT("", icc_opts, mp_icc_conf, 0), + OPT_STRING("gpu-shader-cache-dir", shader_cache_dir, 0), + OPT_REPLACED("hdr-tone-mapping", "tone-mapping"), + OPT_REPLACED("opengl-shaders", "glsl-shaders"), + OPT_CLI_ALIAS("opengl-shader", "glsl-shaders-append"), + OPT_REPLACED("opengl-shader-cache-dir", "gpu-shader-cache-dir"), + OPT_REPLACED("opengl-tex-pad-x", "gpu-tex-pad-x"), + OPT_REPLACED("opengl-tex-pad-y", "gpu-tex-pad-y"), + OPT_REPLACED("opengl-fbo-format", "fbo-format"), + OPT_REPLACED("opengl-dumb-mode", "gpu-dumb-mode"), + OPT_REPLACED("opengl-gamma", "gpu-gamma"), + {0} + }, + .size = sizeof(struct gl_video_opts), + .defaults = &gl_video_opts_def, +}; + +static void uninit_rendering(struct gl_video *p); +static void uninit_scaler(struct gl_video *p, struct scaler *scaler); +static void check_gl_features(struct gl_video *p); +static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t id); +static const char *handle_scaler_opt(const char *name, bool tscale); +static void reinit_from_options(struct gl_video *p); +static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]); +static void 
gl_video_setup_hooks(struct gl_video *p); + +#define GLSL(x) gl_sc_add(p->sc, #x "\n"); +#define GLSLF(...) gl_sc_addf(p->sc, __VA_ARGS__) +#define GLSLHF(...) gl_sc_haddf(p->sc, __VA_ARGS__) +#define PRELUDE(...) gl_sc_paddf(p->sc, __VA_ARGS__) + +static struct bstr load_cached_file(struct gl_video *p, const char *path) +{ + if (!path || !path[0]) + return (struct bstr){0}; + for (int n = 0; n < p->num_files; n++) { + if (strcmp(p->files[n].path, path) == 0) + return p->files[n].body; + } + // not found -> load it + struct bstr s = stream_read_file(path, p, p->global, 1024000); // 1024 kB + if (s.len) { + struct cached_file new = { + .path = talloc_strdup(p, path), + .body = s, + }; + MP_TARRAY_APPEND(p, p->files, p->num_files, new); + return new.body; + } + return (struct bstr){0}; +} + +static void debug_check_gl(struct gl_video *p, const char *msg) +{ + if (p->ra->fns->debug_marker) + p->ra->fns->debug_marker(p->ra, msg); +} + +static void gl_video_reset_surfaces(struct gl_video *p) +{ + for (int i = 0; i < FBOSURFACES_MAX; i++) { + p->surfaces[i].id = 0; + p->surfaces[i].pts = MP_NOPTS_VALUE; + } + p->surface_idx = 0; + p->surface_now = 0; + p->frames_drawn = 0; + p->output_fbo_valid = false; +} + +static void gl_video_reset_hooks(struct gl_video *p) +{ + for (int i = 0; i < p->tex_hook_num; i++) + talloc_free(p->tex_hooks[i].priv); + + for (int i = 0; i < p->user_tex_num; i++) + ra_tex_free(p->ra, &p->user_textures[i].tex); + + p->tex_hook_num = 0; + p->user_tex_num = 0; +} + +static inline int fbosurface_wrap(int id) +{ + id = id % FBOSURFACES_MAX; + return id < 0 ? id + FBOSURFACES_MAX : id; +} + +static void reinit_osd(struct gl_video *p) +{ + mpgl_osd_destroy(p->osd); + p->osd = NULL; + if (p->osd_state) + p->osd = mpgl_osd_init(p->ra, p->log, p->osd_state); +} + +static void uninit_rendering(struct gl_video *p) +{ + for (int n = 0; n < SCALER_COUNT; n++) + uninit_scaler(p, &p->scaler[n]); + + ra_tex_free(p->ra, &p->dither_texture); + + for (int n = 0; n < 4; n++) { + fbotex_uninit(&p->merge_fbo[n]); + fbotex_uninit(&p->scale_fbo[n]); + fbotex_uninit(&p->integer_fbo[n]); + } + + fbotex_uninit(&p->indirect_fbo); + fbotex_uninit(&p->blend_subs_fbo); + fbotex_uninit(&p->screen_fbo); + fbotex_uninit(&p->output_fbo); + + for (int n = 0; n < FBOSURFACES_MAX; n++) + fbotex_uninit(&p->surfaces[n].fbotex); + + for (int n = 0; n < SHADER_MAX_SAVED; n++) + fbotex_uninit(&p->hook_fbos[n]); + + for (int n = 0; n < 2; n++) + fbotex_uninit(&p->vdpau_deinterleave_fbo[n]); + + gl_video_reset_surfaces(p); + gl_video_reset_hooks(p); + + gl_sc_reset_error(p->sc); +} + +bool gl_video_gamma_auto_enabled(struct gl_video *p) +{ + return p->opts.gamma_auto; +} + +struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p) +{ + return (struct mp_colorspace) { + .primaries = p->opts.target_prim, + .gamma = p->opts.target_trc, + }; +} + +// Warning: profile.start must point to a ta allocation, and the function +// takes over ownership. +void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data) +{ + if (gl_lcms_set_memory_profile(p->cms, icc_data)) + reinit_from_options(p); +} + +bool gl_video_icc_auto_enabled(struct gl_video *p) +{ + return p->opts.icc_opts ? 
p->opts.icc_opts->profile_auto : false; +} + +static bool gl_video_get_lut3d(struct gl_video *p, enum mp_csp_prim prim, + enum mp_csp_trc trc) +{ + if (!p->use_lut_3d) + return false; + + struct AVBufferRef *icc = NULL; + if (p->image.mpi) + icc = p->image.mpi->icc_profile; + + if (p->lut_3d_texture && !gl_lcms_has_changed(p->cms, prim, trc, icc)) + return true; + + // GLES3 doesn't provide filtered 16 bit integer textures + // GLES2 doesn't even provide 3D textures + const struct ra_format *fmt = ra_find_unorm_format(p->ra, 2, 4); + if (!fmt || !(p->ra->caps & RA_CAP_TEX_3D)) { + p->use_lut_3d = false; + MP_WARN(p, "Disabling color management (no RGBA16 3D textures).\n"); + return false; + } + + struct lut3d *lut3d = NULL; + if (!fmt || !gl_lcms_get_lut3d(p->cms, &lut3d, prim, trc, icc) || !lut3d) { + p->use_lut_3d = false; + return false; + } + + ra_tex_free(p->ra, &p->lut_3d_texture); + + struct ra_tex_params params = { + .dimensions = 3, + .w = lut3d->size[0], + .h = lut3d->size[1], + .d = lut3d->size[2], + .format = fmt, + .render_src = true, + .src_linear = true, + .initial_data = lut3d->data, + }; + p->lut_3d_texture = ra_tex_create(p->ra, ¶ms); + + debug_check_gl(p, "after 3d lut creation"); + + for (int i = 0; i < 3; i++) + p->lut_3d_size[i] = lut3d->size[i]; + + talloc_free(lut3d); + + return true; +} + +// Fill an img_tex struct from an FBO + some metadata +static struct img_tex img_tex_fbo(struct fbotex *fbo, enum plane_type type, + int components) +{ + assert(type != PLANE_NONE); + return (struct img_tex){ + .type = type, + .tex = fbo->tex, + .multiplier = 1.0, + .w = fbo->lw, + .h = fbo->lh, + .transform = identity_trans, + .components = components, + }; +} + +// Bind an img_tex to a free texture unit and return its ID. At most +// TEXUNIT_VIDEO_NUM texture units can be bound at once +static int pass_bind(struct gl_video *p, struct img_tex tex) +{ + assert(p->pass_tex_num < TEXUNIT_VIDEO_NUM); + p->pass_tex[p->pass_tex_num] = tex; + return p->pass_tex_num++; +} + +// Rotation by 90° and flipping. +// w/h is used for recentering. +static void get_transform(float w, float h, int rotate, bool flip, + struct gl_transform *out_tr) +{ + int a = rotate % 90 ? 0 : rotate / 90; + int sin90[4] = {0, 1, 0, -1}; // just to avoid rounding issues etc. + int cos90[4] = {1, 0, -1, 0}; + struct gl_transform tr = {{{ cos90[a], sin90[a]}, + {-sin90[a], cos90[a]}}}; + + // basically, recenter to keep the whole image in view + float b[2] = {1, 1}; + gl_transform_vec(tr, &b[0], &b[1]); + tr.t[0] += b[0] < 0 ? w : 0; + tr.t[1] += b[1] < 0 ? h : 0; + + if (flip) { + struct gl_transform fliptr = {{{1, 0}, {0, -1}}, {0, h}}; + gl_transform_trans(fliptr, &tr); + } + + *out_tr = tr; +} + +// Return the chroma plane upscaled to luma size, but with additional padding +// for image sizes not aligned to subsampling. +static int chroma_upsize(int size, int pixel) +{ + return (size + pixel - 1) / pixel * pixel; +} + +// If a and b are on the same plane, return what plane type should be used. +// If a or b are none, the other type always wins. +// Usually: LUMA/RGB/XYZ > CHROMA > ALPHA +static enum plane_type merge_plane_types(enum plane_type a, enum plane_type b) +{ + if (a == PLANE_NONE) + return b; + if (b == PLANE_LUMA || b == PLANE_RGB || b == PLANE_XYZ) + return b; + if (b != PLANE_NONE && a == PLANE_ALPHA) + return b; + return a; +} + +// Places a video_image's image textures + associated metadata into tex[]. The +// number of textures is equal to p->plane_count. 
Any necessary plane offsets +// are stored in off. (e.g. chroma position) +static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, + struct img_tex tex[4], struct gl_transform off[4]) +{ + assert(vimg->mpi); + + int w = p->image_params.w; + int h = p->image_params.h; + + // Determine the chroma offset + float ls_w = 1.0 / p->ra_format.chroma_w; + float ls_h = 1.0 / p->ra_format.chroma_h; + + struct gl_transform chroma = {{{ls_w, 0.0}, {0.0, ls_h}}}; + + if (p->image_params.chroma_location != MP_CHROMA_CENTER) { + int cx, cy; + mp_get_chroma_location(p->image_params.chroma_location, &cx, &cy); + // By default texture coordinates are such that chroma is centered with + // any chroma subsampling. If a specific direction is given, make it + // so that the luma and chroma sample line up exactly. + // For 4:4:4, setting chroma location should have no effect at all. + // luma sample size (in chroma coord. space) + chroma.t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0; + chroma.t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0; + } + + int msb_valid_bits = + p->ra_format.component_bits + MPMIN(p->ra_format.component_pad, 0); + // The existing code assumes we just have a single tex multiplier for + // all of the planes. This may change in the future + float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.color.space, + msb_valid_bits, + p->ra_format.component_bits); + + memset(tex, 0, 4 * sizeof(tex[0])); + for (int n = 0; n < p->plane_count; n++) { + struct texplane *t = &vimg->planes[n]; + + enum plane_type type = PLANE_NONE; + for (int i = 0; i < 4; i++) { + int c = p->ra_format.components[n][i]; + enum plane_type ctype; + if (c == 0) { + ctype = PLANE_NONE; + } else if (c == 4) { + ctype = PLANE_ALPHA; + } else if (p->image_params.color.space == MP_CSP_RGB) { + ctype = PLANE_RGB; + } else if (p->image_params.color.space == MP_CSP_XYZ) { + ctype = PLANE_XYZ; + } else { + ctype = c == 1 ? PLANE_LUMA : PLANE_CHROMA; + } + type = merge_plane_types(type, ctype); + } + + tex[n] = (struct img_tex){ + .type = type, + .tex = t->tex, + .multiplier = tex_mul, + .w = t->w, + .h = t->h, + }; + + for (int i = 0; i < 4; i++) + tex[n].components += !!p->ra_format.components[n][i]; + + get_transform(t->w, t->h, p->image_params.rotate, t->flipped, + &tex[n].transform); + if (p->image_params.rotate % 180 == 90) + MPSWAP(int, tex[n].w, tex[n].h); + + off[n] = identity_trans; + + if (type == PLANE_CHROMA) { + struct gl_transform rot; + get_transform(0, 0, p->image_params.rotate, true, &rot); + + struct gl_transform tr = chroma; + gl_transform_vec(rot, &tr.t[0], &tr.t[1]); + + float dx = (chroma_upsize(w, p->ra_format.chroma_w) - w) * ls_w; + float dy = (chroma_upsize(h, p->ra_format.chroma_h) - h) * ls_h; + + // Adjust the chroma offset if the real chroma size is fractional + // due image sizes not aligned to chroma subsampling. + struct gl_transform rot2; + get_transform(0, 0, p->image_params.rotate, t->flipped, &rot2); + if (rot2.m[0][0] < 0) + tr.t[0] += dx; + if (rot2.m[1][0] < 0) + tr.t[0] += dy; + if (rot2.m[0][1] < 0) + tr.t[1] += dx; + if (rot2.m[1][1] < 0) + tr.t[1] += dy; + + off[n] = tr; + } + } +} + +// Return the index of the given component (assuming all non-padding components +// of all planes are concatenated into a linear list). 
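//
// As a worked example (assuming an NV12-style layout, where plane 0 carries
// component 1 = luma and plane 1 carries components 2 and 3 = chroma):
//   find_comp(desc, 1) == 0   // Y is the first non-padding component
//   find_comp(desc, 2) == 1   // U
//   find_comp(desc, 3) == 2   // V
//   find_comp(desc, 4) == -1  // no alpha component in this format
// init_video() below uses these indices to build p->color_swizzle, i.e. to
// map each logical component onto "rgba" in the order the planes are sampled.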
+static int find_comp(struct ra_imgfmt_desc *desc, int component) +{ + int cur = 0; + for (int n = 0; n < desc->num_planes; n++) { + for (int i = 0; i < 4; i++) { + if (desc->components[n][i]) { + if (desc->components[n][i] == component) + return cur; + cur++; + } + } + } + return -1; +} + +static void init_video(struct gl_video *p) +{ + p->use_integer_conversion = false; + + if (p->hwdec && ra_hwdec_test_format(p->hwdec, p->image_params.imgfmt)) { + if (p->hwdec->driver->overlay_frame) { + MP_WARN(p, "Using HW-overlay mode. No GL filtering is performed " + "on the video!\n"); + } else { + p->hwdec_mapper = ra_hwdec_mapper_create(p->hwdec, &p->image_params); + if (!p->hwdec_mapper) + MP_ERR(p, "Initializing texture for hardware decoding failed.\n"); + } + if (p->hwdec_mapper) + p->image_params = p->hwdec_mapper->dst_params; + const char **exts = p->hwdec->glsl_extensions; + for (int n = 0; exts && exts[n]; n++) + gl_sc_enable_extension(p->sc, (char *)exts[n]); + p->hwdec_active = true; + } + + p->ra_format = (struct ra_imgfmt_desc){0}; + ra_get_imgfmt_desc(p->ra, p->image_params.imgfmt, &p->ra_format); + + p->plane_count = p->ra_format.num_planes; + + p->has_alpha = false; + p->is_gray = true; + + for (int n = 0; n < p->ra_format.num_planes; n++) { + for (int i = 0; i < 4; i++) { + if (p->ra_format.components[n][i]) { + p->has_alpha |= p->ra_format.components[n][i] == 4; + p->is_gray &= p->ra_format.components[n][i] == 1 || + p->ra_format.components[n][i] == 4; + } + } + } + + for (int c = 0; c < 4; c++) { + int loc = find_comp(&p->ra_format, c + 1); + p->color_swizzle[c] = "rgba"[loc >= 0 && loc < 4 ? loc : 0]; + } + p->color_swizzle[4] = '\0'; + + // Format-dependent checks. + check_gl_features(p); + + mp_image_params_guess_csp(&p->image_params); + + av_lfg_init(&p->lfg, 1); + + debug_check_gl(p, "before video texture creation"); + + if (!p->hwdec_active) { + struct video_image *vimg = &p->image; + + struct mp_image layout = {0}; + mp_image_set_params(&layout, &p->image_params); + + for (int n = 0; n < p->plane_count; n++) { + struct texplane *plane = &vimg->planes[n]; + const struct ra_format *format = p->ra_format.planes[n]; + + plane->w = mp_image_plane_w(&layout, n); + plane->h = mp_image_plane_h(&layout, n); + + struct ra_tex_params params = { + .dimensions = 2, + .w = plane->w + p->opts.tex_pad_x, + .h = plane->h + p->opts.tex_pad_y, + .d = 1, + .format = format, + .render_src = true, + .src_linear = format->linear_filter, + .non_normalized = p->opts.use_rectangle, + .host_mutable = true, + }; + + MP_VERBOSE(p, "Texture for plane %d: %dx%d\n", n, + params.w, params.h); + + plane->tex = ra_tex_create(p->ra, ¶ms); + if (!plane->tex) + abort(); // shit happens + + p->use_integer_conversion |= format->ctype == RA_CTYPE_UINT; + } + } + + debug_check_gl(p, "after video texture creation"); + + gl_video_setup_hooks(p); +} + +// Release any texture mappings associated with the current frame. 
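// (A sketch of the expected map/unmap pairing for hardware decoding, under
// the assumption that mapping happens during frame upload via
// pass_upload_image() declared above; the plane textures are borrowed from
// the mapper rather than owned by gl_video:)
//
//   ra_hwdec_mapper_map(p->hwdec_mapper, vimg->mpi);   // on upload
//   ... render passes sample vimg->planes[n].tex ...
//   ra_hwdec_mapper_unmap(p->hwdec_mapper);            // in the function below
//
// Resetting vimg->id afterwards forces the next draw to map the frame again
// instead of sampling stale plane textures.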
+static void unmap_current_image(struct gl_video *p) +{ + struct video_image *vimg = &p->image; + + if (vimg->hwdec_mapped) { + assert(p->hwdec_active && p->hwdec_mapper); + ra_hwdec_mapper_unmap(p->hwdec_mapper); + memset(vimg->planes, 0, sizeof(vimg->planes)); + vimg->hwdec_mapped = false; + vimg->id = 0; // needs to be mapped again + } +} + +static struct dr_buffer *gl_find_dr_buffer(struct gl_video *p, uint8_t *ptr) +{ + for (int i = 0; i < p->num_dr_buffers; i++) { + struct dr_buffer *buffer = &p->dr_buffers[i]; + uint8_t *bufptr = buffer->buf->data; + size_t size = buffer->buf->params.size; + if (ptr >= bufptr && ptr < bufptr + size) + return buffer; + } + + return NULL; +} + +static void gc_pending_dr_fences(struct gl_video *p, bool force) +{ +again:; + for (int n = 0; n < p->num_dr_buffers; n++) { + struct dr_buffer *buffer = &p->dr_buffers[n]; + if (!buffer->mpi) + continue; + + bool res = p->ra->fns->buf_poll(p->ra, buffer->buf); + if (res || force) { + // Unreferencing the image could cause gl_video_dr_free_buffer() + // to be called by the talloc destructor (if it was the last + // reference). This will implicitly invalidate the buffer pointer + // and change the p->dr_buffers array. To make it worse, it could + // free multiple dr_buffers due to weird theoretical corner cases. + // This is also why we use the goto to iterate again from the + // start, because everything gets fucked up. Hail satan! + struct mp_image *ref = buffer->mpi; + buffer->mpi = NULL; + talloc_free(ref); + goto again; + } + } +} + +static void unref_current_image(struct gl_video *p) +{ + unmap_current_image(p); + p->image.id = 0; + + mp_image_unrefp(&p->image.mpi); + + // While we're at it, also garbage collect pending fences in here to + // get it out of the way. + gc_pending_dr_fences(p, false); +} + +// If overlay mode is used, make sure to remove the overlay. +// Be careful with this. Removing the overlay and adding another one will +// lead to flickering artifacts. +static void unmap_overlay(struct gl_video *p) +{ + if (p->hwdec_active && p->hwdec->driver->overlay_frame) + p->hwdec->driver->overlay_frame(p->hwdec, NULL, NULL, NULL, true); +} + +static void uninit_video(struct gl_video *p) +{ + uninit_rendering(p); + + struct video_image *vimg = &p->image; + + unmap_overlay(p); + unref_current_image(p); + + for (int n = 0; n < p->plane_count; n++) { + struct texplane *plane = &vimg->planes[n]; + ra_tex_free(p->ra, &plane->tex); + } + *vimg = (struct video_image){0}; + + // Invalidate image_params to ensure that gl_video_config() will call + // init_video() on uninitialized gl_video. + p->real_image_params = (struct mp_image_params){0}; + p->image_params = p->real_image_params; + p->hwdec_active = false; + ra_hwdec_mapper_free(&p->hwdec_mapper); +} + +static void pass_record(struct gl_video *p, struct mp_pass_perf perf) +{ + if (!p->pass || p->pass_idx == PASS_INFO_MAX) + return; + + struct pass_info *pass = &p->pass[p->pass_idx]; + pass->perf = perf; + + if (pass->desc.len == 0) + bstr_xappend(p, &pass->desc, bstr0("(unknown)")); + + p->pass_idx++; +} + +PRINTF_ATTRIBUTE(2, 3) +static void pass_describe(struct gl_video *p, const char *textf, ...) 
+{ + if (!p->pass || p->pass_idx == PASS_INFO_MAX) + return; + + struct pass_info *pass = &p->pass[p->pass_idx]; + + if (pass->desc.len > 0) + bstr_xappend(p, &pass->desc, bstr0(" + ")); + + va_list ap; + va_start(ap, textf); + bstr_xappend_vasprintf(p, &pass->desc, textf, ap); + va_end(ap); +} + +static void pass_info_reset(struct gl_video *p, bool is_redraw) +{ + p->pass = is_redraw ? p->pass_redraw : p->pass_fresh; + p->pass_idx = 0; + + for (int i = 0; i < PASS_INFO_MAX; i++) { + p->pass[i].desc.len = 0; + p->pass[i].perf = (struct mp_pass_perf){0}; + } +} + +static void pass_report_performance(struct gl_video *p) +{ + if (!p->pass) + return; + + for (int i = 0; i < PASS_INFO_MAX; i++) { + struct pass_info *pass = &p->pass[i]; + if (pass->desc.len) { + MP_DBG(p, "pass '%.*s': last %dus avg %dus peak %dus\n", + BSTR_P(pass->desc), + (int)pass->perf.last/1000, + (int)pass->perf.avg/1000, + (int)pass->perf.peak/1000); + } + } +} + +static void pass_prepare_src_tex(struct gl_video *p) +{ + struct gl_shader_cache *sc = p->sc; + + for (int n = 0; n < p->pass_tex_num; n++) { + struct img_tex *s = &p->pass_tex[n]; + if (!s->tex) + continue; + + char *texture_name = mp_tprintf(32, "texture%d", n); + char *texture_size = mp_tprintf(32, "texture_size%d", n); + char *texture_rot = mp_tprintf(32, "texture_rot%d", n); + char *texture_off = mp_tprintf(32, "texture_off%d", n); + char *pixel_size = mp_tprintf(32, "pixel_size%d", n); + + gl_sc_uniform_texture(sc, texture_name, s->tex); + float f[2] = {1, 1}; + if (!s->tex->params.non_normalized) { + f[0] = s->tex->params.w; + f[1] = s->tex->params.h; + } + gl_sc_uniform_vec2(sc, texture_size, f); + gl_sc_uniform_mat2(sc, texture_rot, true, (float *)s->transform.m); + gl_sc_uniform_vec2(sc, texture_off, (float *)s->transform.t); + gl_sc_uniform_vec2(sc, pixel_size, (float[]){1.0f / f[0], + 1.0f / f[1]}); + } +} + +// Sets the appropriate compute shader metadata for an implicit compute pass +// bw/bh: block size +static void pass_is_compute(struct gl_video *p, int bw, int bh) +{ + p->pass_compute = (struct compute_info){ + .active = true, + .block_w = bw, + .block_h = bh, + }; +} + +// w/h: the width/height of the compute shader's operating domain (e.g. the +// target target that needs to be written, or the source texture that needs to +// be reduced) +static void dispatch_compute(struct gl_video *p, int w, int h, + struct compute_info info) +{ + PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", + info.threads_w > 0 ? info.threads_w : info.block_w, + info.threads_h > 0 ? 
info.threads_h : info.block_h); + + pass_prepare_src_tex(p); + gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); + + // Since we don't actually have vertices, we pretend for convenience + // reasons that we do and calculate the right texture coordinates based on + // the output sample ID + gl_sc_uniform_vec2(p->sc, "out_scale", (float[2]){ 1.0 / w, 1.0 / h }); + PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n"); + + for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { + struct img_tex *s = &p->pass_tex[n]; + if (!s->tex) + continue; + + // We need to rescale the coordinates to the true texture size + char tex_scale[32]; + snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n); + gl_sc_uniform_vec2(p->sc, tex_scale, (float[2]){ + (float)s->w / s->tex->params.w, + (float)s->h / s->tex->params.h, + }); + + PRELUDE("#define texcoord%d_raw(id) (tex_scale%d * outcoord(id))\n", n, n); + PRELUDE("#define texcoord%d_rot(id) (texture_rot%d * texcoord%d_raw(id) + " + "pixel_size%d * texture_off%d)\n", n, n, n, n, n); + // Clamp the texture coordinates to prevent sampling out-of-bounds in + // threads that exceed the requested width/height + PRELUDE("#define texmap%d(id) min(texcoord%d_rot(id), vec2(1.0))\n", n, n); + PRELUDE("#define texcoord%d texmap%d(gl_GlobalInvocationID)\n", n, n); + } + + // always round up when dividing to make sure we don't leave off a part of + // the image + int num_x = info.block_w > 0 ? (w + info.block_w - 1) / info.block_w : 1, + num_y = info.block_h > 0 ? (h + info.block_h - 1) / info.block_h : 1; + + pass_record(p, gl_sc_dispatch_compute(p->sc, num_x, num_y, 1)); + + memset(&p->pass_tex, 0, sizeof(p->pass_tex)); + p->pass_tex_num = 0; +} + +static struct mp_pass_perf render_pass_quad(struct gl_video *p, + struct fbodst target, + const struct mp_rect *dst) +{ + struct vertex va[6] = {0}; + + struct gl_transform t; + gl_transform_ortho_fbodst(&t, target); + + float x[2] = {dst->x0, dst->x1}; + float y[2] = {dst->y0, dst->y1}; + gl_transform_vec(t, &x[0], &y[0]); + gl_transform_vec(t, &x[1], &y[1]); + + for (int n = 0; n < 4; n++) { + struct vertex *v = &va[n]; + v->position.x = x[n / 2]; + v->position.y = y[n % 2]; + for (int i = 0; i < p->pass_tex_num; i++) { + struct img_tex *s = &p->pass_tex[i]; + if (!s->tex) + continue; + struct gl_transform tr = s->transform; + float tx = (n / 2) * s->w; + float ty = (n % 2) * s->h; + gl_transform_vec(tr, &tx, &ty); + bool rect = s->tex->params.non_normalized; + v->texcoord[i].x = tx / (rect ? 1 : s->tex->params.w); + v->texcoord[i].y = ty / (rect ? 
1 : s->tex->params.h); + } + } + + va[4] = va[2]; + va[5] = va[1]; + + return gl_sc_dispatch_draw(p->sc, target.tex, va, 6); +} + +static void finish_pass_direct(struct gl_video *p, struct fbodst target, + const struct mp_rect *dst) +{ + pass_prepare_src_tex(p); + gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); + pass_record(p, render_pass_quad(p, target, dst)); + debug_check_gl(p, "after rendering"); + memset(&p->pass_tex, 0, sizeof(p->pass_tex)); + p->pass_tex_num = 0; +} + +// dst_fbo: this will be used for rendering; possibly reallocating the whole +// FBO, if the required parameters have changed +// w, h: required FBO target dimension, and also defines the target rectangle +// used for rasterization +// flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy +// flags allows the FBO to be larger than the w/h parameters) +static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, + int w, int h, int flags) +{ + fbotex_change(dst_fbo, p->ra, p->log, w, h, p->fbo_format, flags); + + if (p->pass_compute.active) { + if (!dst_fbo->tex) + return; + gl_sc_uniform_image2D_wo(p->sc, "out_image", dst_fbo->tex); + if (!p->pass_compute.directly_writes) + GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);) + + dispatch_compute(p, w, h, p->pass_compute); + p->pass_compute = (struct compute_info){0}; + + debug_check_gl(p, "after dispatching compute shader"); + } else { + finish_pass_direct(p, dst_fbo->fbo, &(struct mp_rect){0, 0, w, h}); + } +} + +static const char *get_tex_swizzle(struct img_tex *img) +{ + if (!img->tex) + return "rgba"; + return img->tex->params.format->luminance_alpha ? "raaa" : "rgba"; +} + +// Copy a texture to the vec4 color, while increasing offset. Also applies +// the texture multiplier to the sampled color +static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img) +{ + int count = img.components; + assert(*offset + count <= 4); + + int id = pass_bind(p, img); + char src[5] = {0}; + char dst[5] = {0}; + const char *tex_fmt = get_tex_swizzle(&img); + const char *dst_fmt = "rgba"; + for (int i = 0; i < count; i++) { + src[i] = tex_fmt[i]; + dst[i] = dst_fmt[*offset + i]; + } + + if (img.tex && img.tex->params.format->ctype == RA_CTYPE_UINT) { + uint64_t tex_max = 1ull << p->ra_format.component_bits; + img.multiplier *= 1.0 / (tex_max - 1); + } + + GLSLF("color.%s = %f * vec4(texture(texture%d, texcoord%d)).%s;\n", + dst, img.multiplier, id, id, src); + + *offset += count; +} + +static void skip_unused(struct gl_video *p, int num_components) +{ + for (int i = num_components; i < 4; i++) + GLSLF("color.%c = %f;\n", "rgba"[i], i < 3 ? 
0.0 : 1.0); +} + +static void uninit_scaler(struct gl_video *p, struct scaler *scaler) +{ + fbotex_uninit(&scaler->sep_fbo); + ra_tex_free(p->ra, &scaler->lut); + scaler->kernel = NULL; + scaler->initialized = false; +} + +static void hook_prelude(struct gl_video *p, const char *name, int id, + struct img_tex tex) +{ + GLSLHF("#define %s_raw texture%d\n", name, id); + GLSLHF("#define %s_pos texcoord%d\n", name, id); + GLSLHF("#define %s_size texture_size%d\n", name, id); + GLSLHF("#define %s_rot texture_rot%d\n", name, id); + GLSLHF("#define %s_pt pixel_size%d\n", name, id); + GLSLHF("#define %s_map texmap%d\n", name, id); + GLSLHF("#define %s_mul %f\n", name, tex.multiplier); + + // Set up the sampling functions + GLSLHF("#define %s_tex(pos) (%s_mul * vec4(texture(%s_raw, pos)).%s)\n", + name, name, name, get_tex_swizzle(&tex)); + + // Since the extra matrix multiplication impacts performance, + // skip it unless the texture was actually rotated + if (gl_transform_eq(tex.transform, identity_trans)) { + GLSLHF("#define %s_texOff(off) %s_tex(%s_pos + %s_pt * vec2(off))\n", + name, name, name, name); + } else { + GLSLHF("#define %s_texOff(off) " + "%s_tex(%s_pos + %s_rot * vec2(off)/%s_size)\n", + name, name, name, name, name); + } +} + +static bool saved_tex_find(struct gl_video *p, const char *name, + struct img_tex *out) +{ + if (!name || !out) + return false; + + for (int i = 0; i < p->saved_tex_num; i++) { + if (strcmp(p->saved_tex[i].name, name) == 0) { + *out = p->saved_tex[i].tex; + return true; + } + } + + return false; +} + +static void saved_tex_store(struct gl_video *p, const char *name, + struct img_tex tex) +{ + assert(name); + + for (int i = 0; i < p->saved_tex_num; i++) { + if (strcmp(p->saved_tex[i].name, name) == 0) { + p->saved_tex[i].tex = tex; + return; + } + } + + assert(p->saved_tex_num < SHADER_MAX_SAVED); + p->saved_tex[p->saved_tex_num++] = (struct saved_tex) { + .name = name, + .tex = tex + }; +} + +static bool pass_hook_setup_binds(struct gl_video *p, const char *name, + struct img_tex tex, struct tex_hook *hook) +{ + for (int t = 0; t < TEXUNIT_VIDEO_NUM; t++) { + char *bind_name = (char *)hook->bind_tex[t]; + + if (!bind_name) + continue; + + // This is a special name that means "currently hooked texture" + if (strcmp(bind_name, "HOOKED") == 0) { + int id = pass_bind(p, tex); + hook_prelude(p, "HOOKED", id, tex); + hook_prelude(p, name, id, tex); + continue; + } + + // BIND can also be used to load user-defined textures, in which + // case we will directly load them as a uniform instead of + // generating the hook_prelude boilerplate + for (int u = 0; u < p->user_tex_num; u++) { + struct gl_user_shader_tex *utex = &p->user_textures[u]; + if (bstr_equals0(utex->name, bind_name)) { + gl_sc_uniform_texture(p->sc, bind_name, utex->tex); + goto next_bind; + } + } + + struct img_tex bind_tex; + if (!saved_tex_find(p, bind_name, &bind_tex)) { + // Clean up texture bindings and move on to the next hook + MP_DBG(p, "Skipping hook on %s due to no texture named %s.\n", + name, bind_name); + p->pass_tex_num -= t; + return false; + } + + hook_prelude(p, bind_name, pass_bind(p, bind_tex), bind_tex); + +next_bind: ; + } + + return true; +} + +// Process hooks for a plane, saving the result and returning a new img_tex +// If 'trans' is NULL, the shader is forbidden from transforming tex +static struct img_tex pass_hook(struct gl_video *p, const char *name, + struct img_tex tex, struct gl_transform *trans) +{ + if (!name) + return tex; + + saved_tex_store(p, name, tex); + + 
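+    // The entries in p->tex_hooks come either from the built-in hooks set up
+    // by gl_video_setup_hooks() below (deband, unsharp) or from user shader
+    // passes loaded via parse_user_shader(). Purely for illustration, a
+    // minimal user pass hooking LUMA might look roughly like this (the
+    // //!-directive names come from user_shaders.c; the GLSL side uses the
+    // HOOKED_* macros defined by hook_prelude() above):
+    //
+    //   //!HOOK LUMA
+    //   //!BIND HOOKED
+    //   //!DESC identity example
+    //   vec4 hook() { return HOOKED_tex(HOOKED_pos); }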
MP_DBG(p, "Running hooks for %s\n", name); + for (int i = 0; i < p->tex_hook_num; i++) { + struct tex_hook *hook = &p->tex_hooks[i]; + + // Figure out if this pass hooks this texture + for (int h = 0; h < SHADER_MAX_HOOKS; h++) { + if (hook->hook_tex[h] && strcmp(hook->hook_tex[h], name) == 0) + goto found; + } + + continue; + +found: + // Check the hook's condition + if (hook->cond && !hook->cond(p, tex, hook->priv)) { + MP_DBG(p, "Skipping hook on %s due to condition.\n", name); + continue; + } + + if (!pass_hook_setup_binds(p, name, tex, hook)) + continue; + + // Run the actual hook. This generates a series of GLSL shader + // instructions sufficient for drawing the hook's output + struct gl_transform hook_off = identity_trans; + hook->hook(p, tex, &hook_off, hook->priv); + + int comps = hook->components ? hook->components : tex.components; + skip_unused(p, comps); + + // Compute the updated FBO dimensions and store the result + struct mp_rect_f sz = {0, 0, tex.w, tex.h}; + gl_transform_rect(hook_off, &sz); + int w = lroundf(fabs(sz.x1 - sz.x0)); + int h = lroundf(fabs(sz.y1 - sz.y0)); + + assert(p->hook_fbo_num < SHADER_MAX_SAVED); + struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; + finish_pass_fbo(p, fbo, w, h, 0); + + const char *store_name = hook->save_tex ? hook->save_tex : name; + struct img_tex saved_tex = img_tex_fbo(fbo, tex.type, comps); + + // If the texture we're saving overwrites the "current" texture, also + // update the tex parameter so that the future loop cycles will use the + // updated values, and export the offset + if (strcmp(store_name, name) == 0) { + if (!trans && !gl_transform_eq(hook_off, identity_trans)) { + MP_ERR(p, "Hook tried changing size of unscalable texture %s!\n", + name); + return tex; + } + + tex = saved_tex; + if (trans) + gl_transform_trans(hook_off, trans); + } + + saved_tex_store(p, store_name, saved_tex); + } + + return tex; +} + +// This can be used at any time in the middle of rendering to specify an +// optional hook point, which if triggered will render out to a new FBO and +// load the result back into vec4 color. 
Offsets applied by the hooks are +// accumulated in tex_trans, and the FBO is dimensioned according +// to p->texture_w/h +static void pass_opt_hook_point(struct gl_video *p, const char *name, + struct gl_transform *tex_trans) +{ + if (!name) + return; + + for (int i = 0; i < p->tex_hook_num; i++) { + struct tex_hook *hook = &p->tex_hooks[i]; + + for (int h = 0; h < SHADER_MAX_HOOKS; h++) { + if (hook->hook_tex[h] && strcmp(hook->hook_tex[h], name) == 0) + goto found; + } + + for (int b = 0; b < TEXUNIT_VIDEO_NUM; b++) { + if (hook->bind_tex[b] && strcmp(hook->bind_tex[b], name) == 0) + goto found; + } + } + + // Nothing uses this texture, don't bother storing it + return; + +found: + assert(p->hook_fbo_num < SHADER_MAX_SAVED); + struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; + finish_pass_fbo(p, fbo, p->texture_w, p->texture_h, 0); + + struct img_tex img = img_tex_fbo(fbo, PLANE_RGB, p->components); + img = pass_hook(p, name, img, tex_trans); + copy_img_tex(p, &(int){0}, img); + p->texture_w = img.w; + p->texture_h = img.h; + p->components = img.components; + pass_describe(p, "(remainder pass)"); +} + +static void load_shader(struct gl_video *p, struct bstr body) +{ + gl_sc_hadd_bstr(p->sc, body); + gl_sc_uniform_f(p->sc, "random", (double)av_lfg_get(&p->lfg) / UINT32_MAX); + gl_sc_uniform_i(p->sc, "frame", p->frames_uploaded); + gl_sc_uniform_vec2(p->sc, "input_size", + (float[]){(p->src_rect.x1 - p->src_rect.x0) * + p->texture_offset.m[0][0], + (p->src_rect.y1 - p->src_rect.y0) * + p->texture_offset.m[1][1]}); + gl_sc_uniform_vec2(p->sc, "target_size", + (float[]){p->dst_rect.x1 - p->dst_rect.x0, + p->dst_rect.y1 - p->dst_rect.y0}); + gl_sc_uniform_vec2(p->sc, "tex_offset", + (float[]){p->src_rect.x0 * p->texture_offset.m[0][0] + + p->texture_offset.t[0], + p->src_rect.y0 * p->texture_offset.m[1][1] + + p->texture_offset.t[1]}); +} + +// Semantic equality +static bool double_seq(double a, double b) +{ + return (isnan(a) && isnan(b)) || a == b; +} + +static bool scaler_fun_eq(struct scaler_fun a, struct scaler_fun b) +{ + if ((a.name && !b.name) || (b.name && !a.name)) + return false; + + return ((!a.name && !b.name) || strcmp(a.name, b.name) == 0) && + double_seq(a.params[0], b.params[0]) && + double_seq(a.params[1], b.params[1]) && + a.blur == b.blur && + a.taper == b.taper; +} + +static bool scaler_conf_eq(struct scaler_config a, struct scaler_config b) +{ + // Note: antiring isn't compared because it doesn't affect LUT + // generation + return scaler_fun_eq(a.kernel, b.kernel) && + scaler_fun_eq(a.window, b.window) && + a.radius == b.radius && + a.clamp == b.clamp; +} + +static void reinit_scaler(struct gl_video *p, struct scaler *scaler, + const struct scaler_config *conf, + double scale_factor, + int sizes[]) +{ + if (scaler_conf_eq(scaler->conf, *conf) && + scaler->scale_factor == scale_factor && + scaler->initialized) + return; + + uninit_scaler(p, scaler); + + scaler->conf = *conf; + bool is_tscale = scaler->index == SCALER_TSCALE; + scaler->conf.kernel.name = (char *)handle_scaler_opt(conf->kernel.name, is_tscale); + scaler->conf.window.name = (char *)handle_scaler_opt(conf->window.name, is_tscale); + scaler->scale_factor = scale_factor; + scaler->insufficient = false; + scaler->initialized = true; + + const struct filter_kernel *t_kernel = mp_find_filter_kernel(conf->kernel.name); + if (!t_kernel) + return; + + scaler->kernel_storage = *t_kernel; + scaler->kernel = &scaler->kernel_storage; + + const char *win = conf->window.name; + if (!win || !win[0]) + win = 
t_kernel->window; // fall back to the scaler's default window + const struct filter_window *t_window = mp_find_filter_window(win); + if (t_window) + scaler->kernel->w = *t_window; + + for (int n = 0; n < 2; n++) { + if (!isnan(conf->kernel.params[n])) + scaler->kernel->f.params[n] = conf->kernel.params[n]; + if (!isnan(conf->window.params[n])) + scaler->kernel->w.params[n] = conf->window.params[n]; + } + + if (conf->kernel.blur > 0.0) + scaler->kernel->f.blur = conf->kernel.blur; + if (conf->window.blur > 0.0) + scaler->kernel->w.blur = conf->window.blur; + + if (conf->kernel.taper > 0.0) + scaler->kernel->f.taper = conf->kernel.taper; + if (conf->window.taper > 0.0) + scaler->kernel->w.taper = conf->window.taper; + + if (scaler->kernel->f.resizable && conf->radius > 0.0) + scaler->kernel->f.radius = conf->radius; + + scaler->kernel->clamp = conf->clamp; + scaler->kernel->value_cutoff = conf->cutoff; + + scaler->insufficient = !mp_init_filter(scaler->kernel, sizes, scale_factor); + + int size = scaler->kernel->size; + int num_components = size > 2 ? 4 : size; + const struct ra_format *fmt = ra_find_float16_format(p->ra, num_components); + assert(fmt); + + int width = (size + num_components - 1) / num_components; // round up + int stride = width * num_components; + assert(size <= stride); + + scaler->lut_size = 1 << p->opts.scaler_lut_size; + + float *weights = talloc_array(NULL, float, scaler->lut_size * stride); + mp_compute_lut(scaler->kernel, scaler->lut_size, stride, weights); + + bool use_1d = scaler->kernel->polar && (p->ra->caps & RA_CAP_TEX_1D); + + struct ra_tex_params lut_params = { + .dimensions = use_1d ? 1 : 2, + .w = use_1d ? scaler->lut_size : width, + .h = use_1d ? 1 : scaler->lut_size, + .d = 1, + .format = fmt, + .render_src = true, + .src_linear = true, + .initial_data = weights, + }; + scaler->lut = ra_tex_create(p->ra, &lut_params); + + talloc_free(weights); + + debug_check_gl(p, "after initializing scaler"); +} + +// Special helper for sampling from two separated stages +static void pass_sample_separated(struct gl_video *p, struct img_tex src, + struct scaler *scaler, int w, int h) +{ + // Separate the transformation into x and y components, per pass + struct gl_transform t_x = { + .m = {{src.transform.m[0][0], 0.0}, {src.transform.m[1][0], 1.0}}, + .t = {src.transform.t[0], 0.0}, + }; + struct gl_transform t_y = { + .m = {{1.0, src.transform.m[0][1]}, {0.0, src.transform.m[1][1]}}, + .t = {0.0, src.transform.t[1]}, + }; + + // First pass (scale only in the y dir) + src.transform = t_y; + sampler_prelude(p->sc, pass_bind(p, src)); + GLSLF("// first pass\n"); + pass_sample_separated_gen(p->sc, scaler, 0, 1); + GLSLF("color *= %f;\n", src.multiplier); + finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H); + + // Second pass (scale only in the x dir) + src = img_tex_fbo(&scaler->sep_fbo, src.type, src.components); + src.transform = t_x; + pass_describe(p, "%s second pass", scaler->conf.kernel.name); + sampler_prelude(p->sc, pass_bind(p, src)); + pass_sample_separated_gen(p->sc, scaler, 1, 0); +} + +// Picks either the compute shader version or the regular sampler version +// depending on hardware support +static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler, + struct img_tex tex, int w, int h) +{ + uint64_t reqs = RA_CAP_COMPUTE | RA_CAP_NESTED_ARRAY; + if ((p->ra->caps & reqs) != reqs) + goto fallback; + + int bound = ceil(scaler->kernel->radius_cutoff); + int offset = bound - 1; // padding top/left + int padding = offset + bound; 
// total padding + + float ratiox = (float)w / tex.w, + ratioy = (float)h / tex.h; + + // For performance we want to load at least as many pixels + // horizontally as there are threads in a warp (32 for nvidia), as + // well as enough to take advantage of shmem parallelism + const int warp_size = 32, threads = 256; + int bw = warp_size; + int bh = threads / bw; + + // We need to sample everything from base_min to base_max, so make sure + // we have enough room in shmem + int iw = (int)ceil(bw / ratiox) + padding + 1, + ih = (int)ceil(bh / ratioy) + padding + 1; + + int shmem_req = iw * ih * tex.components * sizeof(float); + if (shmem_req > p->ra->max_shmem) + goto fallback; + + pass_is_compute(p, bw, bh); + pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih); + return; + +fallback: + // Fall back to regular polar shader when compute shaders are unsupported + // or the kernel is too big for shmem + pass_sample_polar(p->sc, scaler, tex.components, p->ra->glsl_version); +} + +// Sample from img_tex, with the src rectangle given by it. +// The dst rectangle is implicit by what the caller will do next, but w and h +// must still be what is going to be used (to dimension FBOs correctly). +// This will write the scaled contents to the vec4 "color". +// The scaler unit is initialized by this function; in order to avoid cache +// thrashing, the scaler unit should usually use the same parameters. +static void pass_sample(struct gl_video *p, struct img_tex tex, + struct scaler *scaler, const struct scaler_config *conf, + double scale_factor, int w, int h) +{ + reinit_scaler(p, scaler, conf, scale_factor, filter_sizes); + + // Describe scaler + const char *scaler_opt[] = { + [SCALER_SCALE] = "scale", + [SCALER_DSCALE] = "dscale", + [SCALER_CSCALE] = "cscale", + [SCALER_TSCALE] = "tscale", + }; + + pass_describe(p, "%s=%s (%s)", scaler_opt[scaler->index], + scaler->conf.kernel.name, plane_names[tex.type]); + + bool is_separated = scaler->kernel && !scaler->kernel->polar; + + // Set up the transformation+prelude and bind the texture, for everything + // other than separated scaling (which does this in the subfunction) + if (!is_separated) + sampler_prelude(p->sc, pass_bind(p, tex)); + + // Dispatch the scaler. They're all wildly different. + const char *name = scaler->conf.kernel.name; + if (strcmp(name, "bilinear") == 0) { + GLSL(color = texture(tex, pos);) + } else if (strcmp(name, "bicubic_fast") == 0) { + pass_sample_bicubic_fast(p->sc); + } else if (strcmp(name, "oversample") == 0) { + pass_sample_oversample(p->sc, scaler, w, h); + } else if (scaler->kernel && scaler->kernel->polar) { + pass_dispatch_sample_polar(p, scaler, tex, w, h); + } else if (scaler->kernel) { + pass_sample_separated(p, tex, scaler, w, h); + } else { + // Should never happen + abort(); + } + + // Apply any required multipliers. 
Separated scaling already does this in + // its first stage + if (!is_separated) + GLSLF("color *= %f;\n", tex.multiplier); + + // Micro-optimization: Avoid scaling unneeded channels + skip_unused(p, tex.components); +} + +// Returns true if two img_texs are semantically equivalent (same metadata) +static bool img_tex_equiv(struct img_tex a, struct img_tex b) +{ + return a.type == b.type && + a.components == b.components && + a.multiplier == b.multiplier && + a.tex->params.format == b.tex->params.format && + a.tex->params.w == b.tex->params.w && + a.tex->params.h == b.tex->params.h && + a.w == b.w && + a.h == b.h && + gl_transform_eq(a.transform, b.transform); +} + +static bool add_hook(struct gl_video *p, struct tex_hook hook) +{ + if (p->tex_hook_num < SHADER_MAX_PASSES) { + p->tex_hooks[p->tex_hook_num++] = hook; + return true; + } else { + MP_ERR(p, "Too many passes! Limit is %d.\n", SHADER_MAX_PASSES); + talloc_free(hook.priv); + return false; + } +} + +static void deband_hook(struct gl_video *p, struct img_tex tex, + struct gl_transform *trans, void *priv) +{ + pass_describe(p, "debanding (%s)", plane_names[tex.type]); + pass_sample_deband(p->sc, p->opts.deband_opts, &p->lfg, + p->image_params.color.gamma); +} + +static void unsharp_hook(struct gl_video *p, struct img_tex tex, + struct gl_transform *trans, void *priv) +{ + pass_describe(p, "unsharp masking"); + pass_sample_unsharp(p->sc, p->opts.unsharp); +} + +struct szexp_ctx { + struct gl_video *p; + struct img_tex tex; +}; + +static bool szexp_lookup(void *priv, struct bstr var, float size[2]) +{ + struct szexp_ctx *ctx = priv; + struct gl_video *p = ctx->p; + + if (bstr_equals0(var, "NATIVE_CROPPED")) { + size[0] = (p->src_rect.x1 - p->src_rect.x0) * p->texture_offset.m[0][0]; + size[1] = (p->src_rect.y1 - p->src_rect.y0) * p->texture_offset.m[1][1]; + return true; + } + + // The size of OUTPUT is determined. It could be useful for certain + // user shaders to skip passes. 
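+    // (For illustration: together with user_hook_cond() below, this is what
+    // lets a user pass guard itself with a size condition, e.g. an expression
+    // comparing OUTPUT against HOOKED so an upscaling pass only runs when the
+    // output really is larger; the exact expression syntax is handled in
+    // user_shaders.c.)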
+ if (bstr_equals0(var, "OUTPUT")) { + size[0] = p->dst_rect.x1 - p->dst_rect.x0; + size[1] = p->dst_rect.y1 - p->dst_rect.y0; + return true; + } + + // HOOKED is a special case + if (bstr_equals0(var, "HOOKED")) { + size[0] = ctx->tex.w; + size[1] = ctx->tex.h; + return true; + } + + for (int o = 0; o < p->saved_tex_num; o++) { + if (bstr_equals0(var, p->saved_tex[o].name)) { + size[0] = p->saved_tex[o].tex.w; + size[1] = p->saved_tex[o].tex.h; + return true; + } + } + + return false; +} + +static bool user_hook_cond(struct gl_video *p, struct img_tex tex, void *priv) +{ + struct gl_user_shader_hook *shader = priv; + assert(shader); + + float res = false; + eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->cond, &res); + return res; +} + +static void user_hook(struct gl_video *p, struct img_tex tex, + struct gl_transform *trans, void *priv) +{ + struct gl_user_shader_hook *shader = priv; + assert(shader); + load_shader(p, shader->pass_body); + + pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc), + plane_names[tex.type]); + + if (shader->compute.active) { + p->pass_compute = shader->compute; + GLSLF("hook();\n"); + } else { + GLSLF("color = hook();\n"); + } + + // Make sure we at least create a legal FBO on failure, since it's better + // to do this and display an error message than just crash OpenGL + float w = 1.0, h = 1.0; + + eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->width, &w); + eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->height, &h); + + *trans = (struct gl_transform){{{w / tex.w, 0}, {0, h / tex.h}}}; + gl_transform_trans(shader->offset, trans); +} + +static bool add_user_hook(void *priv, struct gl_user_shader_hook hook) +{ + struct gl_video *p = priv; + struct gl_user_shader_hook *copy = talloc_ptrtype(p, copy); + *copy = hook; + + struct tex_hook texhook = { + .save_tex = bstrdup0(copy, hook.save_tex), + .components = hook.components, + .hook = user_hook, + .cond = user_hook_cond, + .priv = copy, + }; + + for (int h = 0; h < SHADER_MAX_HOOKS; h++) + texhook.hook_tex[h] = bstrdup0(copy, hook.hook_tex[h]); + for (int h = 0; h < SHADER_MAX_BINDS; h++) + texhook.bind_tex[h] = bstrdup0(copy, hook.bind_tex[h]); + + return add_hook(p, texhook); +} + +static bool add_user_tex(void *priv, struct gl_user_shader_tex tex) +{ + struct gl_video *p = priv; + + if (p->user_tex_num == SHADER_MAX_PASSES) { + MP_ERR(p, "Too many textures! 
Limit is %d.\n", SHADER_MAX_PASSES); + goto err; + } + + tex.tex = ra_tex_create(p->ra, &tex.params); + TA_FREEP(&tex.params.initial_data); + + p->user_textures[p->user_tex_num++] = tex; + return true; + +err: + talloc_free(tex.params.initial_data); + return false; +} + +static void load_user_shaders(struct gl_video *p, char **shaders) +{ + if (!shaders) + return; + + for (int n = 0; shaders[n] != NULL; n++) { + struct bstr file = load_cached_file(p, shaders[n]); + parse_user_shader(p->log, p->ra, file, p, add_user_hook, add_user_tex); + } +} + +static void gl_video_setup_hooks(struct gl_video *p) +{ + gl_video_reset_hooks(p); + + if (p->opts.deband) { + add_hook(p, (struct tex_hook) { + .hook_tex = {"LUMA", "CHROMA", "RGB", "XYZ"}, + .bind_tex = {"HOOKED"}, + .hook = deband_hook, + }); + } + + if (p->opts.unsharp != 0.0) { + add_hook(p, (struct tex_hook) { + .hook_tex = {"MAIN"}, + .bind_tex = {"HOOKED"}, + .hook = unsharp_hook, + }); + } + + load_user_shaders(p, p->opts.user_shaders); +} + +// sample from video textures, set "color" variable to yuv value +static void pass_read_video(struct gl_video *p) +{ + struct img_tex tex[4]; + struct gl_transform offsets[4]; + pass_get_img_tex(p, &p->image, tex, offsets); + + // To keep the code as simple as possibly, we currently run all shader + // stages even if they would be unnecessary (e.g. no hooks for a texture). + // In the future, deferred img_tex should optimize this away. + + // Merge semantically identical textures. This loop is done from back + // to front so that merged textures end up in the right order while + // simultaneously allowing us to skip unnecessary merges + for (int n = 3; n >= 0; n--) { + if (tex[n].type == PLANE_NONE) + continue; + + int first = n; + int num = 0; + + for (int i = 0; i < n; i++) { + if (img_tex_equiv(tex[n], tex[i]) && + gl_transform_eq(offsets[n], offsets[i])) + { + GLSLF("// merging plane %d ...\n", i); + copy_img_tex(p, &num, tex[i]); + first = MPMIN(first, i); + tex[i] = (struct img_tex){0}; + } + } + + if (num > 0) { + GLSLF("// merging plane %d ... into %d\n", n, first); + copy_img_tex(p, &num, tex[n]); + pass_describe(p, "merging planes"); + finish_pass_fbo(p, &p->merge_fbo[n], tex[n].w, tex[n].h, 0); + tex[first] = img_tex_fbo(&p->merge_fbo[n], tex[n].type, num); + tex[n] = (struct img_tex){0}; + } + } + + // If any textures are still in integer format by this point, we need + // to introduce an explicit conversion pass to avoid breaking hooks/scaling + for (int n = 0; n < 4; n++) { + if (tex[n].tex && tex[n].tex->params.format->ctype == RA_CTYPE_UINT) { + GLSLF("// use_integer fix for plane %d\n", n); + copy_img_tex(p, &(int){0}, tex[n]); + pass_describe(p, "use_integer fix"); + finish_pass_fbo(p, &p->integer_fbo[n], tex[n].w, tex[n].h, 0); + tex[n] = img_tex_fbo(&p->integer_fbo[n], tex[n].type, + tex[n].components); + } + } + + // Dispatch the hooks for all of these textures, saving and perhaps + // modifying them in the process + for (int n = 0; n < 4; n++) { + const char *name; + switch (tex[n].type) { + case PLANE_RGB: name = "RGB"; break; + case PLANE_LUMA: name = "LUMA"; break; + case PLANE_CHROMA: name = "CHROMA"; break; + case PLANE_ALPHA: name = "ALPHA"; break; + case PLANE_XYZ: name = "XYZ"; break; + default: continue; + } + + tex[n] = pass_hook(p, name, tex[n], &offsets[n]); + } + + // At this point all planes are finalized but they may not be at the + // required size yet. Furthermore, they may have texture offsets that + // require realignment. 
For lack of something better to do, we assume + // the rgb/luma texture is the "reference" and scale everything else + // to match. + for (int n = 0; n < 4; n++) { + switch (tex[n].type) { + case PLANE_RGB: + case PLANE_XYZ: + case PLANE_LUMA: break; + default: continue; + } + + p->texture_w = tex[n].w; + p->texture_h = tex[n].h; + p->texture_offset = offsets[n]; + break; + } + + // Compute the reference rect + struct mp_rect_f src = {0.0, 0.0, p->image_params.w, p->image_params.h}; + struct mp_rect_f ref = src; + gl_transform_rect(p->texture_offset, &ref); + MP_DBG(p, "ref rect: {%f %f} {%f %f}\n", ref.x0, ref.y0, ref.x1, ref.y1); + + // Explicitly scale all of the textures that don't match + for (int n = 0; n < 4; n++) { + if (tex[n].type == PLANE_NONE) + continue; + + // If the planes are aligned identically, we will end up with the + // exact same source rectangle. + struct mp_rect_f rect = src; + gl_transform_rect(offsets[n], &rect); + MP_DBG(p, "rect[%d]: {%f %f} {%f %f}\n", n, + rect.x0, rect.y0, rect.x1, rect.y1); + + if (mp_rect_f_seq(ref, rect)) + continue; + + // If the rectangles differ, then our planes have a different + // alignment and/or size. First of all, we have to compute the + // corrections required to meet the target rectangle + struct gl_transform fix = { + .m = {{(ref.x1 - ref.x0) / (rect.x1 - rect.x0), 0.0}, + {0.0, (ref.y1 - ref.y0) / (rect.y1 - rect.y0)}}, + .t = {ref.x0, ref.y0}, + }; + MP_DBG(p, "-> fix[%d] = {%f %f} + off {%f %f}\n", n, + fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); + + // Since the scale in texture space is different from the scale in + // absolute terms, we have to scale the coefficients down to be + // relative to the texture's physical dimensions and local offset + struct gl_transform scale = { + .m = {{(float)tex[n].w / p->texture_w, 0.0}, + {0.0, (float)tex[n].h / p->texture_h}}, + .t = {-rect.x0, -rect.y0}, + }; + if (p->image_params.rotate % 180 == 90) + MPSWAP(double, scale.m[0][0], scale.m[1][1]); + + gl_transform_trans(scale, &fix); + MP_DBG(p, "-> scaled[%d] = {%f %f} + off {%f %f}\n", n, + fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); + + // Since the texture transform is a function of the texture coordinates + // to texture space, rather than the other way around, we have to + // actually apply the *inverse* of this. Fortunately, calculating + // the inverse is relatively easy here. 
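+        // (Since "fix" is a purely diagonal affine map y = m*x + t, its
+        // inverse is x = (1/m)*y - t/m, which is exactly what the next four
+        // assignments compute; e.g. m = 2, t = 4 inverts to m = 0.5, t = -2.)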
+ fix.m[0][0] = 1.0 / fix.m[0][0]; + fix.m[1][1] = 1.0 / fix.m[1][1]; + fix.t[0] = fix.m[0][0] * -fix.t[0]; + fix.t[1] = fix.m[1][1] * -fix.t[1]; + gl_transform_trans(fix, &tex[n].transform); + + int scaler_id = -1; + const char *name = NULL; + switch (tex[n].type) { + case PLANE_RGB: + case PLANE_LUMA: + case PLANE_XYZ: + scaler_id = SCALER_SCALE; + // these aren't worth hooking, fringe hypothetical cases only + break; + case PLANE_CHROMA: + scaler_id = SCALER_CSCALE; + name = "CHROMA_SCALED"; + break; + case PLANE_ALPHA: + // alpha always uses bilinear + name = "ALPHA_SCALED"; + } + + if (scaler_id < 0) + continue; + + const struct scaler_config *conf = &p->opts.scaler[scaler_id]; + struct scaler *scaler = &p->scaler[scaler_id]; + + // bilinear scaling is a free no-op thanks to GPU sampling + if (strcmp(conf->kernel.name, "bilinear") != 0) { + GLSLF("// upscaling plane %d\n", n); + pass_sample(p, tex[n], scaler, conf, 1.0, p->texture_w, p->texture_h); + finish_pass_fbo(p, &p->scale_fbo[n], p->texture_w, p->texture_h, 0); + tex[n] = img_tex_fbo(&p->scale_fbo[n], tex[n].type, tex[n].components); + } + + // Run any post-scaling hooks + tex[n] = pass_hook(p, name, tex[n], NULL); + } + + // All planes are of the same size and properly aligned at this point + GLSLF("// combining planes\n"); + int coord = 0; + for (int i = 0; i < 4; i++) { + if (tex[i].type != PLANE_NONE) + copy_img_tex(p, &coord, tex[i]); + } + p->components = coord; +} + +// Utility function that simply binds an FBO and reads from it, without any +// transformations. +static void pass_read_fbo(struct gl_video *p, struct fbotex *fbo) +{ + struct img_tex tex = img_tex_fbo(fbo, PLANE_RGB, p->components); + copy_img_tex(p, &(int){0}, tex); +} + +// yuv conversion, and any other conversions before main up/down-scaling +static void pass_convert_yuv(struct gl_video *p) +{ + struct gl_shader_cache *sc = p->sc; + + struct mp_csp_params cparams = MP_CSP_PARAMS_DEFAULTS; + cparams.gray = p->is_gray; + mp_csp_set_image_params(&cparams, &p->image_params); + mp_csp_equalizer_state_get(p->video_eq, &cparams); + p->user_gamma = 1.0 / (cparams.gamma * p->opts.gamma); + + pass_describe(p, "color conversion"); + + if (p->color_swizzle[0]) + GLSLF("color = color.%s;\n", p->color_swizzle); + + // Pre-colormatrix input gamma correction + if (cparams.color.space == MP_CSP_XYZ) + GLSL(color.rgb = pow(color.rgb, vec3(2.6));) // linear light + + // We always explicitly normalize the range in pass_read_video + cparams.input_bits = cparams.texture_bits = 0; + + // Conversion to RGB. For RGB itself, this still applies e.g. brightness + // and contrast controls, or expansion of e.g. LSB-packed 10 bit data. + struct mp_cmat m = {{{0}}}; + mp_get_csp_matrix(&cparams, &m); + gl_sc_uniform_mat3(sc, "colormatrix", true, &m.m[0][0]); + gl_sc_uniform_vec3(sc, "colormatrix_c", m.c); + + GLSL(color.rgb = mat3(colormatrix) * color.rgb + colormatrix_c;) + + if (p->image_params.color.space == MP_CSP_BT_2020_C) { + // Conversion for C'rcY'cC'bc via the BT.2020 CL system: + // C'bc = (B'-Y'c) / 1.9404 | C'bc <= 0 + // = (B'-Y'c) / 1.5816 | C'bc > 0 + // + // C'rc = (R'-Y'c) / 1.7184 | C'rc <= 0 + // = (R'-Y'c) / 0.9936 | C'rc > 0 + // + // as per the BT.2020 specification, table 4. This is a non-linear + // transformation because (constant) luminance receives non-equal + // contributions from the three different channels. 
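+        // (The divisors above are simply twice the negative/positive extents
+        // that table 4 permits for B'-Y'c and R'-Y'c, e.g. 1.9404 = 2 * 0.9702,
+        // so the resulting C'bc/C'rc always land in [-0.5, 0.5].)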
+ GLSLF("// constant luminance conversion\n"); + GLSL(color.br = color.br * mix(vec2(1.5816, 0.9936), + vec2(1.9404, 1.7184), + lessThanEqual(color.br, vec2(0))) + + color.gg;) + // Expand channels to camera-linear light. This shader currently just + // assumes everything uses the BT.2020 12-bit gamma function, since the + // difference between 10 and 12-bit is negligible for anything other + // than 12-bit content. + GLSL(color.rgb = mix(color.rgb * vec3(1.0/4.5), + pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), + vec3(1.0/0.45)), + lessThanEqual(vec3(0.08145), color.rgb));) + // Calculate the green channel from the expanded RYcB + // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B + GLSL(color.g = (color.g - 0.2627*color.r - 0.0593*color.b)*1.0/0.6780;) + // Recompress to receive the R'G'B' result, same as other systems + GLSL(color.rgb = mix(color.rgb * vec3(4.5), + vec3(1.0993) * pow(color.rgb, vec3(0.45)) - vec3(0.0993), + lessThanEqual(vec3(0.0181), color.rgb));) + } + + p->components = 3; + if (!p->has_alpha || p->opts.alpha_mode == ALPHA_NO) { + GLSL(color.a = 1.0;) + } else { // alpha present in image + p->components = 4; + GLSL(color = vec4(color.rgb * color.a, color.a);) + } +} + +static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]) +{ + double target_w = p->src_rect.x1 - p->src_rect.x0; + double target_h = p->src_rect.y1 - p->src_rect.y0; + if (transpose_rot && p->image_params.rotate % 180 == 90) + MPSWAP(double, target_w, target_h); + xy[0] = (p->dst_rect.x1 - p->dst_rect.x0) / target_w; + xy[1] = (p->dst_rect.y1 - p->dst_rect.y0) / target_h; +} + +// Cropping. +static void compute_src_transform(struct gl_video *p, struct gl_transform *tr) +{ + float sx = (p->src_rect.x1 - p->src_rect.x0) / (float)p->texture_w, + sy = (p->src_rect.y1 - p->src_rect.y0) / (float)p->texture_h, + ox = p->src_rect.x0, + oy = p->src_rect.y0; + struct gl_transform transform = {{{sx, 0}, {0, sy}}, {ox, oy}}; + + gl_transform_trans(p->texture_offset, &transform); + + *tr = transform; +} + +// Takes care of the main scaling and pre/post-conversions +static void pass_scale_main(struct gl_video *p) +{ + // Figure out the main scaler. + double xy[2]; + get_scale_factors(p, true, xy); + + // actual scale factor should be divided by the scale factor of prescaling. 
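+    // (For illustration: upscaling a 1280x720 source to a 2560x1440 target
+    // rect gives xy = {2.0, 2.0}; if a prescaling hook has already doubled
+    // the image, p->texture_offset.m[0][0]/m[1][1] are 2.0 and the main
+    // scaler is left with an effective factor of 1.0.)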
+ xy[0] /= p->texture_offset.m[0][0]; + xy[1] /= p->texture_offset.m[1][1]; + + bool downscaling = xy[0] < 1.0 || xy[1] < 1.0; + bool upscaling = !downscaling && (xy[0] > 1.0 || xy[1] > 1.0); + double scale_factor = 1.0; + + struct scaler *scaler = &p->scaler[SCALER_SCALE]; + struct scaler_config scaler_conf = p->opts.scaler[SCALER_SCALE]; + if (p->opts.scaler_resizes_only && !downscaling && !upscaling) { + scaler_conf.kernel.name = "bilinear"; + // For scaler-resizes-only, we round the texture offset to + // the nearest round value in order to prevent ugly blurriness + // (in exchange for slightly shifting the image by up to half a + // subpixel) + p->texture_offset.t[0] = roundf(p->texture_offset.t[0]); + p->texture_offset.t[1] = roundf(p->texture_offset.t[1]); + } + if (downscaling && p->opts.scaler[SCALER_DSCALE].kernel.name) { + scaler_conf = p->opts.scaler[SCALER_DSCALE]; + scaler = &p->scaler[SCALER_DSCALE]; + } + + // When requesting correct-downscaling and the clip is anamorphic, and + // because only a single scale factor is used for both axes, enable it only + // when both axes are downscaled, and use the milder of the factors to not + // end up with too much blur on one axis (even if we end up with sub-optimal + // scale factor on the other axis). This is better than not respecting + // correct scaling at all for anamorphic clips. + double f = MPMAX(xy[0], xy[1]); + if (p->opts.correct_downscaling && f < 1.0) + scale_factor = 1.0 / f; + + // Pre-conversion, like linear light/sigmoidization + GLSLF("// scaler pre-conversion\n"); + bool use_linear = p->opts.linear_scaling || p->opts.sigmoid_upscaling; + + // Linear light downscaling results in nasty artifacts for HDR curves due + // to the potentially extreme brightness differences severely compounding + // any ringing. So just scale in gamma light instead. + if (mp_trc_is_hdr(p->image_params.color.gamma) && downscaling) + use_linear = false; + + if (use_linear) { + p->use_linear = true; + pass_linearize(p->sc, p->image_params.color.gamma); + pass_opt_hook_point(p, "LINEAR", NULL); + } + + bool use_sigmoid = use_linear && p->opts.sigmoid_upscaling && upscaling; + float sig_center, sig_slope, sig_offset, sig_scale; + if (use_sigmoid) { + // Coefficients for the sigmoidal transform are taken from the + // formula here: http://www.imagemagick.org/Usage/color_mods/#sigmoidal + sig_center = p->opts.sigmoid_center; + sig_slope = p->opts.sigmoid_slope; + // This function needs to go through (0,0) and (1,1) so we compute the + // values at 1 and 0, and then scale/shift them, respectively. + sig_offset = 1.0/(1+expf(sig_slope * sig_center)); + sig_scale = 1.0/(1+expf(sig_slope * (sig_center-1))) - sig_offset; + GLSLF("color.rgb = %f - log(1.0/(color.rgb * %f + %f) - 1.0) * 1.0/%f;\n", + sig_center, sig_scale, sig_offset, sig_slope); + pass_opt_hook_point(p, "SIGMOID", NULL); + } + + pass_opt_hook_point(p, "PREKERNEL", NULL); + + int vp_w = p->dst_rect.x1 - p->dst_rect.x0; + int vp_h = p->dst_rect.y1 - p->dst_rect.y0; + struct gl_transform transform; + compute_src_transform(p, &transform); + + GLSLF("// main scaling\n"); + finish_pass_fbo(p, &p->indirect_fbo, p->texture_w, p->texture_h, 0); + struct img_tex src = img_tex_fbo(&p->indirect_fbo, PLANE_RGB, p->components); + gl_transform_trans(transform, &src.transform); + pass_sample(p, src, scaler, &scaler_conf, scale_factor, vp_w, vp_h); + + // Changes the texture size to display size after main scaler. 
+    p->texture_w = vp_w;
+    p->texture_h = vp_h;
+
+    pass_opt_hook_point(p, "POSTKERNEL", NULL);
+
+    GLSLF("// scaler post-conversion\n");
+    if (use_sigmoid) {
+        // Inverse of the transformation above
+        GLSLF("color.rgb = (1.0/(1.0 + exp(%f * (%f - color.rgb))) - %f) * 1.0/%f;\n",
+              sig_slope, sig_center, sig_offset, sig_scale);
+    }
+}
+
+// Adapts the colors to the right output color space. (Final pass during
+// rendering)
+// If OSD is true, ignore any changes that may have been made to the video
+// by previous passes (i.e. linear scaling)
+static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool osd)
+{
+    struct ra *ra = p->ra;
+
+    // Figure out the target color space from the options, or auto-guess if
+    // none were set
+    struct mp_colorspace dst = {
+        .gamma = p->opts.target_trc,
+        .primaries = p->opts.target_prim,
+        .light = MP_CSP_LIGHT_DISPLAY,
+    };
+
+    if (p->use_lut_3d) {
+        // The 3DLUT is always generated against the video's original source
+        // space, *not* the reference space. (To avoid having to regenerate
+        // the 3DLUT for the OSD on every frame)
+        enum mp_csp_prim prim_orig = p->image_params.color.primaries;
+        enum mp_csp_trc trc_orig = p->image_params.color.gamma;
+
+        // One exception: HDR is not implemented by LittleCMS for technical
+        // limitation reasons, so we use a gamma 2.2 input curve here instead.
+        // We could pick any value we want here, the difference is just coding
+        // efficiency.
+        if (mp_trc_is_hdr(trc_orig))
+            trc_orig = MP_CSP_TRC_GAMMA22;
+
+        if (gl_video_get_lut3d(p, prim_orig, trc_orig)) {
+            dst.primaries = prim_orig;
+            dst.gamma = trc_orig;
+        }
+    }
+
+    if (dst.primaries == MP_CSP_PRIM_AUTO) {
+        // The vast majority of people are on sRGB or BT.709 displays, so pick
+        // this as the default output color space.
+        dst.primaries = MP_CSP_PRIM_BT_709;
+
+        if (src.primaries == MP_CSP_PRIM_BT_601_525 ||
+            src.primaries == MP_CSP_PRIM_BT_601_625)
+        {
+            // Since we auto-pick BT.601 and BT.709 based on the dimensions,
+            // combined with the fact that they're very similar to begin with,
+            // and to avoid confusing the average user, just don't adapt BT.601
+            // content automatically at all.
+            dst.primaries = src.primaries;
+        }
+    }
+
+    if (dst.gamma == MP_CSP_TRC_AUTO) {
+        // Most people seem to complain when the image is darker or brighter
+        // than what they're "used to", so just avoid changing the gamma
+        // altogether by default. The only exceptions to this rule apply to
+        // very unusual TRCs, which even hardcore technoluddites would probably
+        // not enjoy viewing unaltered.
+        dst.gamma = src.gamma;
+
+        // Avoid outputting linear light or HDR content "by default".
For these + // just pick gamma 2.2 as a default, since it's a good estimate for + // the response of typical displays + if (dst.gamma == MP_CSP_TRC_LINEAR || mp_trc_is_hdr(dst.gamma)) + dst.gamma = MP_CSP_TRC_GAMMA22; + } + + bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma); + if (detect_peak && !p->hdr_peak_ssbo) { + struct { + unsigned int sig_peak_raw; + unsigned int index; + unsigned int frame_max[PEAK_DETECT_FRAMES+1]; + } peak_ssbo = {0}; + + // Prefill with safe values + int safe = MP_REF_WHITE * mp_trc_nom_peak(p->image_params.color.gamma); + peak_ssbo.sig_peak_raw = PEAK_DETECT_FRAMES * safe; + for (int i = 0; i < PEAK_DETECT_FRAMES+1; i++) + peak_ssbo.frame_max[i] = safe; + + struct ra_buf_params params = { + .type = RA_BUF_TYPE_SHADER_STORAGE, + .size = sizeof(peak_ssbo), + .initial_data = &peak_ssbo, + }; + + p->hdr_peak_ssbo = ra_buf_create(ra, ¶ms); + if (!p->hdr_peak_ssbo) { + MP_WARN(p, "Failed to create HDR peak detection SSBO, disabling.\n"); + detect_peak = (p->opts.compute_hdr_peak = false); + } + } + + if (detect_peak) { + pass_describe(p, "detect HDR peak"); + pass_is_compute(p, 8, 8); // 8x8 is good for performance + gl_sc_ssbo(p->sc, "PeakDetect", p->hdr_peak_ssbo, + "uint sig_peak_raw;" + "uint index;" + "uint frame_max[%d];", PEAK_DETECT_FRAMES + 1 + ); + } + + // Adapt from src to dst as necessary + pass_color_map(p->sc, src, dst, p->opts.tone_mapping, + p->opts.tone_mapping_param, p->opts.tone_mapping_desat, + detect_peak, p->opts.gamut_warning, p->use_linear && !osd); + + if (p->use_lut_3d) { + gl_sc_uniform_texture(p->sc, "lut_3d", p->lut_3d_texture); + GLSL(vec3 cpos;) + for (int i = 0; i < 3; i++) + GLSLF("cpos[%d] = LUT_POS(color[%d], %d.0);\n", i, i, p->lut_3d_size[i]); + GLSL(color.rgb = tex3D(lut_3d, cpos).rgb;) + } +} + +void gl_video_set_fb_depth(struct gl_video *p, int fb_depth) +{ + p->fb_depth = fb_depth; +} + +static void pass_dither(struct gl_video *p) +{ + // Assume 8 bits per component if unknown. + int dst_depth = p->fb_depth > 0 ? p->fb_depth : 8; + if (p->opts.dither_depth > 0) + dst_depth = p->opts.dither_depth; + + if (p->opts.dither_depth < 0 || p->opts.dither_algo == DITHER_NONE) + return; + + if (!p->dither_texture) { + MP_VERBOSE(p, "Dither to %d.\n", dst_depth); + + int tex_size = 0; + void *tex_data = NULL; + const struct ra_format *fmt = NULL; + void *temp = NULL; + + if (p->opts.dither_algo == DITHER_FRUIT) { + int sizeb = p->opts.dither_size; + int size = 1 << sizeb; + + if (p->last_dither_matrix_size != size) { + p->last_dither_matrix = talloc_realloc(p, p->last_dither_matrix, + float, size * size); + mp_make_fruit_dither_matrix(p->last_dither_matrix, sizeb); + p->last_dither_matrix_size = size; + } + + // Prefer R16 texture since they provide higher precision. + fmt = ra_find_unorm_format(p->ra, 2, 1); + if (!fmt) + fmt = ra_find_float16_format(p->ra, 1); + if (fmt) { + tex_size = size; + tex_data = p->last_dither_matrix; + if (fmt->ctype == RA_CTYPE_UNORM) { + uint16_t *t = temp = talloc_array(NULL, uint16_t, size * size); + for (int n = 0; n < size * size; n++) + t[n] = p->last_dither_matrix[n] * UINT16_MAX; + tex_data = t; + } + } else { + MP_VERBOSE(p, "GL too old. 
Falling back to ordered dither.\n"); + p->opts.dither_algo = DITHER_ORDERED; + } + } + + if (p->opts.dither_algo == DITHER_ORDERED) { + temp = talloc_array(NULL, char, 8 * 8); + mp_make_ordered_dither_matrix(temp, 8); + + fmt = ra_find_unorm_format(p->ra, 1, 1); + tex_size = 8; + tex_data = temp; + } + + struct ra_tex_params params = { + .dimensions = 2, + .w = tex_size, + .h = tex_size, + .d = 1, + .format = fmt, + .render_src = true, + .src_repeat = true, + .initial_data = tex_data, + }; + p->dither_texture = ra_tex_create(p->ra, ¶ms); + + debug_check_gl(p, "dither setup"); + + talloc_free(temp); + } + + GLSLF("// dithering\n"); + + // This defines how many bits are considered significant for output on + // screen. The superfluous bits will be used for rounding according to the + // dither matrix. The precision of the source implicitly decides how many + // dither patterns can be visible. + int dither_quantization = (1 << dst_depth) - 1; + int dither_size = p->dither_texture->params.w; + + gl_sc_uniform_texture(p->sc, "dither", p->dither_texture); + + GLSLF("vec2 dither_pos = gl_FragCoord.xy * 1.0/%d.0;\n", dither_size); + + if (p->opts.temporal_dither) { + int phase = (p->frames_rendered / p->opts.temporal_dither_period) % 8u; + float r = phase * (M_PI / 2); // rotate + float m = phase < 4 ? 1 : -1; // mirror + + float matrix[2][2] = {{cos(r), -sin(r) }, + {sin(r) * m, cos(r) * m}}; + gl_sc_uniform_mat2(p->sc, "dither_trafo", true, &matrix[0][0]); + + GLSL(dither_pos = dither_trafo * dither_pos;) + } + + GLSL(float dither_value = texture(dither, dither_pos).r;) + GLSLF("color = floor(color * %d.0 + dither_value + 0.5 / %d.0) * 1.0/%d.0;\n", + dither_quantization, dither_size * dither_size, dither_quantization); +} + +// Draws the OSD, in scene-referred colors.. If cms is true, subtitles are +// instead adapted to the display's gamut. +static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts, + struct mp_osd_res rect, struct fbodst target, bool cms) +{ + mpgl_osd_generate(p->osd, rect, pts, p->image_params.stereo_out, draw_flags); + + timer_pool_start(p->osd_timer); + for (int n = 0; n < MAX_OSD_PARTS; n++) { + // (This returns false if this part is empty with nothing to draw.) + if (!mpgl_osd_draw_prepare(p->osd, n, p->sc)) + continue; + // When subtitles need to be color managed, assume they're in sRGB + // (for lack of anything saner to do) + if (cms) { + static const struct mp_colorspace csp_srgb = { + .primaries = MP_CSP_PRIM_BT_709, + .gamma = MP_CSP_TRC_SRGB, + .light = MP_CSP_LIGHT_DISPLAY, + }; + + pass_colormanage(p, csp_srgb, true); + } + mpgl_osd_draw_finish(p->osd, n, p->sc, target); + } + + timer_pool_stop(p->osd_timer); + pass_describe(p, "drawing osd"); + pass_record(p, timer_pool_measure(p->osd_timer)); +} + +static float chroma_realign(int size, int pixel) +{ + return size / (float)chroma_upsize(size, pixel); +} + +// Minimal rendering code path, for GLES or OpenGL 2.1 without proper FBOs. +static void pass_render_frame_dumb(struct gl_video *p) +{ + struct img_tex tex[4]; + struct gl_transform off[4]; + pass_get_img_tex(p, &p->image, tex, off); + + struct gl_transform transform; + compute_src_transform(p, &transform); + + int index = 0; + for (int i = 0; i < p->plane_count; i++) { + int cw = tex[i].type == PLANE_CHROMA ? p->ra_format.chroma_w : 1; + int ch = tex[i].type == PLANE_CHROMA ? 
p->ra_format.chroma_h : 1; + if (p->image_params.rotate % 180 == 90) + MPSWAP(int, cw, ch); + + struct gl_transform t = transform; + t.m[0][0] *= chroma_realign(p->texture_w, cw); + t.m[1][1] *= chroma_realign(p->texture_h, ch); + + t.t[0] /= cw; + t.t[1] /= ch; + + t.t[0] += off[i].t[0]; + t.t[1] += off[i].t[1]; + + gl_transform_trans(tex[i].transform, &t); + tex[i].transform = t; + + copy_img_tex(p, &index, tex[i]); + } + + pass_convert_yuv(p); +} + +// The main rendering function, takes care of everything up to and including +// upscaling. p->image is rendered. +static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t id) +{ + // initialize the texture parameters and temporary variables + p->texture_w = p->image_params.w; + p->texture_h = p->image_params.h; + p->texture_offset = identity_trans; + p->components = 0; + p->saved_tex_num = 0; + p->hook_fbo_num = 0; + p->use_linear = false; + + // try uploading the frame + if (!pass_upload_image(p, mpi, id)) + return false; + + if (p->image_params.rotate % 180 == 90) + MPSWAP(int, p->texture_w, p->texture_h); + + if (p->dumb_mode) + return true; + + pass_read_video(p); + pass_opt_hook_point(p, "NATIVE", &p->texture_offset); + pass_convert_yuv(p); + pass_opt_hook_point(p, "MAINPRESUB", &p->texture_offset); + + // For subtitles + double vpts = p->image.mpi->pts; + if (vpts == MP_NOPTS_VALUE) + vpts = p->osd_pts; + + if (p->osd && p->opts.blend_subs == BLEND_SUBS_VIDEO) { + double scale[2]; + get_scale_factors(p, false, scale); + struct mp_osd_res rect = { + .w = p->texture_w, .h = p->texture_h, + .display_par = scale[1] / scale[0], // counter compensate scaling + }; + finish_pass_fbo(p, &p->blend_subs_fbo, rect.w, rect.h, 0); + pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, + p->blend_subs_fbo.fbo, false); + pass_read_fbo(p, &p->blend_subs_fbo); + pass_describe(p, "blend subs video"); + } + pass_opt_hook_point(p, "MAIN", &p->texture_offset); + + pass_scale_main(p); + + int vp_w = p->dst_rect.x1 - p->dst_rect.x0, + vp_h = p->dst_rect.y1 - p->dst_rect.y0; + if (p->osd && p->opts.blend_subs == BLEND_SUBS_YES) { + // Recreate the real video size from the src/dst rects + struct mp_osd_res rect = { + .w = vp_w, .h = vp_h, + .ml = -p->src_rect.x0, .mr = p->src_rect.x1 - p->image_params.w, + .mt = -p->src_rect.y0, .mb = p->src_rect.y1 - p->image_params.h, + .display_par = 1.0, + }; + // Adjust margins for scale + double scale[2]; + get_scale_factors(p, true, scale); + rect.ml *= scale[0]; rect.mr *= scale[0]; + rect.mt *= scale[1]; rect.mb *= scale[1]; + // We should always blend subtitles in non-linear light + if (p->use_linear) { + pass_delinearize(p->sc, p->image_params.color.gamma); + p->use_linear = false; + } + finish_pass_fbo(p, &p->blend_subs_fbo, p->texture_w, p->texture_h, 0); + pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, + p->blend_subs_fbo.fbo, false); + pass_read_fbo(p, &p->blend_subs_fbo); + pass_describe(p, "blend subs"); + } + + pass_opt_hook_point(p, "SCALED", NULL); + + return true; +} + +static void pass_draw_to_screen(struct gl_video *p, struct fbodst fbo) +{ + if (p->dumb_mode) + pass_render_frame_dumb(p); + + // Adjust the overall gamma before drawing to screen + if (p->user_gamma != 1) { + gl_sc_uniform_f(p->sc, "user_gamma", p->user_gamma); + GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) + GLSL(color.rgb = pow(color.rgb, vec3(user_gamma));) + } + + pass_colormanage(p, p->image_params.color, false); + + // Since finish_pass_direct doesn't work with compute shaders, and neither + // does the 
checkerboard/dither code, we may need an indirection via + // p->screen_fbo here. + if (p->pass_compute.active) { + int o_w = p->dst_rect.x1 - p->dst_rect.x0, + o_h = p->dst_rect.y1 - p->dst_rect.y0; + finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY); + struct img_tex tmp = img_tex_fbo(&p->screen_fbo, PLANE_RGB, p->components); + copy_img_tex(p, &(int){0}, tmp); + } + + if (p->has_alpha){ + if (p->opts.alpha_mode == ALPHA_BLEND_TILES) { + // Draw checkerboard pattern to indicate transparency + GLSLF("// transparency checkerboard\n"); + GLSL(bvec2 tile = lessThan(fract(gl_FragCoord.xy * 1.0/32.0), vec2(0.5));) + GLSL(vec3 background = vec3(tile.x == tile.y ? 1.0 : 0.75);) + GLSL(color.rgb = mix(background, color.rgb, color.a);) + } else if (p->opts.alpha_mode == ALPHA_BLEND) { + // Blend into background color (usually black) + struct m_color c = p->opts.background; + GLSLF("vec4 background = vec4(%f, %f, %f, %f);\n", + c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0); + GLSL(color = mix(background, vec4(color.rgb, 1.0), color.a);) + } + } + + pass_opt_hook_point(p, "OUTPUT", NULL); + + pass_dither(p); + pass_describe(p, "output to screen"); + finish_pass_direct(p, fbo, &p->dst_rect); +} + +static bool update_fbosurface(struct gl_video *p, struct mp_image *mpi, + uint64_t id, struct fbosurface *surf) +{ + int vp_w = p->dst_rect.x1 - p->dst_rect.x0, + vp_h = p->dst_rect.y1 - p->dst_rect.y0; + + pass_info_reset(p, false); + if (!pass_render_frame(p, mpi, id)) + return false; + + // Frame blending should always be done in linear light to preserve the + // overall brightness, otherwise this will result in flashing dark frames + // because mixing in compressed light artificially darkens the results + if (!p->use_linear) { + p->use_linear = true; + pass_linearize(p->sc, p->image_params.color.gamma); + } + + finish_pass_fbo(p, &surf->fbotex, vp_w, vp_h, FBOTEX_FUZZY); + surf->id = id; + surf->pts = mpi->pts; + return true; +} + +// Draws an interpolate frame to fbo, based on the frame timing in t +static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, + struct fbodst fbo) +{ + bool is_new = false; + + // Reset the queue completely if this is a still image, to avoid any + // interpolation artifacts from surrounding frames when unpausing or + // framestepping + if (t->still) + gl_video_reset_surfaces(p); + + // First of all, figure out if we have a frame available at all, and draw + // it manually + reset the queue if not + if (p->surfaces[p->surface_now].id == 0) { + struct fbosurface *now = &p->surfaces[p->surface_now]; + if (!update_fbosurface(p, t->current, t->frame_id, now)) + return; + p->surface_idx = p->surface_now; + is_new = true; + } + + // Find the right frame for this instant + if (t->current) { + int next = fbosurface_wrap(p->surface_now + 1); + while (p->surfaces[next].id && + p->surfaces[next].id > p->surfaces[p->surface_now].id && + p->surfaces[p->surface_now].id < t->frame_id) + { + p->surface_now = next; + next = fbosurface_wrap(next + 1); + } + } + + // Figure out the queue size. For illustration, a filter radius of 2 would + // look like this: _ A [B] C D _ + // A is surface_bse, B is surface_now, C is surface_now+1 and D is + // surface_end. 
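+    // (Continuing that radius-2 illustration with the code below: size = 4,
+    // radius = 2, surface_bse = wrap(surface_now - 1) is A, and
+    // surface_end = wrap(surface_now + 2) is D, i.e. four consecutive
+    // fbosurface entries are bound and blended together.)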
+ struct scaler *tscale = &p->scaler[SCALER_TSCALE]; + reinit_scaler(p, tscale, &p->opts.scaler[SCALER_TSCALE], 1, tscale_sizes); + bool oversample = strcmp(tscale->conf.kernel.name, "oversample") == 0; + bool linear = strcmp(tscale->conf.kernel.name, "linear") == 0; + int size; + + if (oversample || linear) { + size = 2; + } else { + assert(tscale->kernel && !tscale->kernel->polar); + size = ceil(tscale->kernel->size); + assert(size <= TEXUNIT_VIDEO_NUM); + } + + int radius = size/2; + int surface_now = p->surface_now; + int surface_bse = fbosurface_wrap(surface_now - (radius-1)); + int surface_end = fbosurface_wrap(surface_now + radius); + assert(fbosurface_wrap(surface_bse + size-1) == surface_end); + + // Render new frames while there's room in the queue. Note that technically, + // this should be done before the step where we find the right frame, but + // it only barely matters at the very beginning of playback, and this way + // makes the code much more linear. + int surface_dst = fbosurface_wrap(p->surface_idx + 1); + for (int i = 0; i < t->num_frames; i++) { + // Avoid overwriting data we might still need + if (surface_dst == surface_bse - 1) + break; + + struct mp_image *f = t->frames[i]; + uint64_t f_id = t->frame_id + i; + if (!mp_image_params_equal(&f->params, &p->real_image_params)) + continue; + + if (f_id > p->surfaces[p->surface_idx].id) { + struct fbosurface *dst = &p->surfaces[surface_dst]; + if (!update_fbosurface(p, f, f_id, dst)) + return; + p->surface_idx = surface_dst; + surface_dst = fbosurface_wrap(surface_dst + 1); + is_new = true; + } + } + + // Figure out whether the queue is "valid". A queue is invalid if the + // frames' PTS is not monotonically increasing. Anything else is invalid, + // so avoid blending incorrect data and just draw the latest frame as-is. + // Possible causes for failure of this condition include seeks, pausing, + // end of playback or start of playback. + bool valid = true; + for (int i = surface_bse, ii; valid && i != surface_end; i = ii) { + ii = fbosurface_wrap(i + 1); + if (p->surfaces[i].id == 0 || p->surfaces[ii].id == 0) { + valid = false; + } else if (p->surfaces[ii].id < p->surfaces[i].id) { + valid = false; + MP_DBG(p, "interpolation queue underrun\n"); + } + } + + // Update OSD PTS to synchronize subtitles with the displayed frame + p->osd_pts = p->surfaces[surface_now].pts; + + // Finally, draw the right mix of frames to the screen. + if (!is_new) + pass_info_reset(p, true); + pass_describe(p, "interpolation"); + if (!valid || t->still) { + // surface_now is guaranteed to be valid, so we can safely use it. + pass_read_fbo(p, &p->surfaces[surface_now].fbotex); + p->is_interpolated = false; + } else { + double mix = t->vsync_offset / t->ideal_frame_duration; + // The scaler code always wants the fcoord to be between 0 and 1, + // so we try to adjust by using the previous set of N frames instead + // (which requires some extra checking to make sure it's valid) + if (mix < 0.0) { + int prev = fbosurface_wrap(surface_bse - 1); + if (p->surfaces[prev].id != 0 && + p->surfaces[prev].id < p->surfaces[surface_bse].id) + { + mix += 1.0; + surface_bse = prev; + } else { + mix = 0.0; // at least don't blow up, this should only + // ever happen at the start of playback + } + } + + if (oversample) { + // Oversample uses the frame area as mix ratio, not the the vsync + // position itself + double vsync_dist = t->vsync_interval / t->ideal_frame_duration, + threshold = tscale->conf.kernel.params[0]; + threshold = isnan(threshold) ? 
0.0 : threshold; + mix = (1 - mix) / vsync_dist; + mix = mix <= 0 + threshold ? 0 : mix; + mix = mix >= 1 - threshold ? 1 : mix; + mix = 1 - mix; + } + + // Blend the frames together + if (oversample || linear) { + gl_sc_uniform_f(p->sc, "inter_coeff", mix); + GLSL(color = mix(texture(texture0, texcoord0), + texture(texture1, texcoord1), + inter_coeff);) + } else { + gl_sc_uniform_f(p->sc, "fcoord", mix); + pass_sample_separated_gen(p->sc, tscale, 0, 0); + } + + // Load all the required frames + for (int i = 0; i < size; i++) { + struct img_tex img = + img_tex_fbo(&p->surfaces[fbosurface_wrap(surface_bse+i)].fbotex, + PLANE_RGB, p->components); + // Since the code in pass_sample_separated currently assumes + // the textures are bound in-order and starting at 0, we just + // assert to make sure this is the case (which it should always be) + int id = pass_bind(p, img); + assert(id == i); + } + + MP_DBG(p, "inter frame dur: %f vsync: %f, mix: %f\n", + t->ideal_frame_duration, t->vsync_interval, mix); + p->is_interpolated = true; + } + pass_draw_to_screen(p, fbo); + + p->frames_drawn += 1; +} + +void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, + struct fbodst target) +{ + struct mp_rect target_rc = {0, 0, target.tex->params.w, target.tex->params.h}; + + p->broken_frame = false; + + bool has_frame = !!frame->current; + + if (!has_frame || !mp_rect_equals(&p->dst_rect, &target_rc)) { + struct m_color c = p->clear_color; + float color[4] = {c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0}; + p->ra->fns->clear(p->ra, target.tex, color, &target_rc); + } + + if (p->hwdec_active && p->hwdec->driver->overlay_frame) { + if (has_frame) { + float *color = p->hwdec->overlay_colorkey; + p->ra->fns->clear(p->ra, target.tex, color, &p->dst_rect); + } + + p->hwdec->driver->overlay_frame(p->hwdec, frame->current, + &p->src_rect, &p->dst_rect, + frame->frame_id != p->image.id); + + if (frame->current) + p->osd_pts = frame->current->pts; + + // Disable GL rendering + has_frame = false; + } + + if (has_frame) { + bool interpolate = p->opts.interpolation && frame->display_synced && + (p->frames_drawn || !frame->still); + if (interpolate) { + double ratio = frame->ideal_frame_duration / frame->vsync_interval; + if (fabs(ratio - 1.0) < p->opts.interpolation_threshold) + interpolate = false; + } + + if (interpolate) { + gl_video_interpolate_frame(p, frame, target); + } else { + bool is_new = frame->frame_id != p->image.id; + + // Redrawing a frame might update subtitles. 
+ if (frame->still && p->opts.blend_subs) + is_new = true; + + if (is_new || !p->output_fbo_valid) { + p->output_fbo_valid = false; + + pass_info_reset(p, !is_new); + if (!pass_render_frame(p, frame->current, frame->frame_id)) + goto done; + + // For the non-interpolation case, we draw to a single "cache" + // FBO to speed up subsequent re-draws (if any exist) + struct fbodst dest_fbo = target; + if (frame->num_vsyncs > 1 && frame->display_synced && + !p->dumb_mode && (p->ra->caps & RA_CAP_BLIT)) + { + fbotex_change(&p->output_fbo, p->ra, p->log, + target.tex->params.w, target.tex->params.h, + p->fbo_format, FBOTEX_FUZZY); + dest_fbo = p->output_fbo.fbo; + p->output_fbo_valid = true; + } + pass_draw_to_screen(p, dest_fbo); + } + + // "output fbo valid" and "output fbo needed" are equivalent + if (p->output_fbo_valid) { + pass_info_reset(p, true); + pass_describe(p, "redraw cached frame"); + struct mp_rect src = p->dst_rect; + struct mp_rect dst = src; + if (target.flip) { + dst.y0 = target.tex->params.h - src.y0; + dst.y1 = target.tex->params.h - src.y1; + } + timer_pool_start(p->blit_timer); + p->ra->fns->blit(p->ra, target.tex, p->output_fbo.tex, + &dst, &src); + timer_pool_stop(p->blit_timer); + pass_record(p, timer_pool_measure(p->blit_timer)); + } + } + } + +done: + + unmap_current_image(p); + + debug_check_gl(p, "after video rendering"); + + if (p->osd) { + // If we haven't actually drawn anything so far, then we technically + // need to consider this the start of a new pass. Let's call it a + // redraw just because, since it's basically a blank frame anyway + if (!has_frame) + pass_info_reset(p, true); + + pass_draw_osd(p, p->opts.blend_subs ? OSD_DRAW_OSD_ONLY : 0, + p->osd_pts, p->osd_rect, target, true); + debug_check_gl(p, "after OSD rendering"); + } + + if (gl_sc_error_state(p->sc) || p->broken_frame) { + // Make the screen solid blue to make it visually clear that an + // error has occurred + float color[4] = {0.0, 0.05, 0.5, 1.0}; + p->ra->fns->clear(p->ra, target.tex, color, &target_rc); + } + + p->frames_rendered++; + pass_report_performance(p); +} + +// Use this color instead of the global option. +void gl_video_set_clear_color(struct gl_video *p, struct m_color c) +{ + p->force_clear_color = true; + p->clear_color = c; +} + +void gl_video_set_osd_pts(struct gl_video *p, double pts) +{ + p->osd_pts = pts; +} + +bool gl_video_check_osd_change(struct gl_video *p, struct mp_osd_res *res, + double pts) +{ + return p->osd ? mpgl_osd_check_change(p->osd, res, pts) : false; +} + +void gl_video_resize(struct gl_video *p, + struct mp_rect *src, struct mp_rect *dst, + struct mp_osd_res *osd) +{ + if (mp_rect_equals(&p->src_rect, src) && + mp_rect_equals(&p->dst_rect, dst) && + osd_res_equals(p->osd_rect, *osd)) + return; + + p->src_rect = *src; + p->dst_rect = *dst; + p->osd_rect = *osd; + + gl_video_reset_surfaces(p); + + if (p->osd) + mpgl_osd_resize(p->osd, p->osd_rect, p->image_params.stereo_out); +} + +static void frame_perf_data(struct pass_info pass[], struct mp_frame_perf *out) +{ + for (int i = 0; i < PASS_INFO_MAX; i++) { + if (!pass[i].desc.len) + break; + out->perf[out->count] = pass[i].perf; + out->desc[out->count] = pass[i].desc.start; + out->count++; + } +} + +void gl_video_perfdata(struct gl_video *p, struct voctrl_performance_data *out) +{ + *out = (struct voctrl_performance_data){0}; + frame_perf_data(p->pass_fresh, &out->fresh); + frame_perf_data(p->pass_redraw, &out->redraw); +} + +// This assumes nv12, with textures set to GL_NEAREST filtering. 
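/*
 * Illustrative sketch (editorial aside, not part of this patch): the row
 * mapping implemented by the reinterleave_vdpau() shader below. For an
 * output plane of height 2*h, even rows come from the first field texture
 * and odd rows from the second, each at source row y/2 (GL_NEAREST lookup).
 * Plain C, no GL; h is an arbitrary example height.
 */
#include <stdio.h>

int main(void)
{
    int h = 4; /* hypothetical field height; merged output height is 2*h */
    for (int y = 0; y < 2 * h; y++) {
        /* fract(gl_FragCoord.y * 0.5) < 0.5  <=>  y is even */
        int field   = (y % 2 == 0) ? 0 : 1;
        int src_row = y / 2;
        printf("out row %d <- field %d, row %d\n", y, field, src_row);
    }
    return 0;
}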
+static void reinterleave_vdpau(struct gl_video *p, + struct ra_tex *input[4], struct ra_tex *output[2]) +{ + for (int n = 0; n < 2; n++) { + struct fbotex *fbo = &p->vdpau_deinterleave_fbo[n]; + // This is an array of the 2 to-merge planes. + struct ra_tex **src = &input[n * 2]; + int w = src[0]->params.w; + int h = src[0]->params.h; + int ids[2]; + for (int t = 0; t < 2; t++) { + ids[t] = pass_bind(p, (struct img_tex){ + .tex = src[t], + .multiplier = 1.0, + .transform = identity_trans, + .w = w, + .h = h, + }); + } + + GLSLF("color = fract(gl_FragCoord.y * 0.5) < 0.5\n"); + GLSLF(" ? texture(texture%d, texcoord%d)\n", ids[0], ids[0]); + GLSLF(" : texture(texture%d, texcoord%d);", ids[1], ids[1]); + + const struct ra_format *fmt = + ra_find_unorm_format(p->ra, 1, n == 0 ? 1 : 2); + fbotex_change(fbo, p->ra, p->log, w, h * 2, fmt, 0); + + pass_describe(p, "vdpau reinterleaving"); + finish_pass_direct(p, fbo->fbo, &(struct mp_rect){0, 0, w, h * 2}); + + output[n] = fbo->tex; + } +} + +// Returns false on failure. +static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t id) +{ + struct video_image *vimg = &p->image; + + if (vimg->id == id) + return true; + + unref_current_image(p); + + mpi = mp_image_new_ref(mpi); + if (!mpi) + goto error; + + vimg->mpi = mpi; + vimg->id = id; + p->osd_pts = mpi->pts; + p->frames_uploaded++; + + if (p->hwdec_active) { + // Hardware decoding + + if (!p->hwdec_mapper) + goto error; + + pass_describe(p, "map frame (hwdec)"); + timer_pool_start(p->upload_timer); + bool ok = ra_hwdec_mapper_map(p->hwdec_mapper, vimg->mpi) >= 0; + timer_pool_stop(p->upload_timer); + pass_record(p, timer_pool_measure(p->upload_timer)); + + vimg->hwdec_mapped = true; + if (ok) { + struct mp_image layout = {0}; + mp_image_set_params(&layout, &p->image_params); + struct ra_tex **tex = p->hwdec_mapper->tex; + struct ra_tex *tmp[4] = {0}; + if (p->hwdec_mapper->vdpau_fields) { + reinterleave_vdpau(p, tex, tmp); + tex = tmp; + } + for (int n = 0; n < p->plane_count; n++) { + vimg->planes[n] = (struct texplane){ + .w = mp_image_plane_w(&layout, n), + .h = mp_image_plane_h(&layout, n), + .tex = tex[n], + }; + } + } else { + MP_FATAL(p, "Mapping hardware decoded surface failed.\n"); + goto error; + } + return true; + } + + // Software decoding + assert(mpi->num_planes == p->plane_count); + + timer_pool_start(p->upload_timer); + for (int n = 0; n < p->plane_count; n++) { + struct texplane *plane = &vimg->planes[n]; + + plane->flipped = mpi->stride[0] < 0; + + struct ra_tex_upload_params params = { + .tex = plane->tex, + .src = mpi->planes[n], + .invalidate = true, + .stride = mpi->stride[n], + }; + + struct dr_buffer *mapped = gl_find_dr_buffer(p, mpi->planes[n]); + if (mapped) { + params.buf = mapped->buf; + params.buf_offset = (uintptr_t)params.src - + (uintptr_t)mapped->buf->data; + params.src = NULL; + } + + if (p->using_dr_path != !!mapped) { + p->using_dr_path = !!mapped; + MP_VERBOSE(p, "DR enabled: %s\n", p->using_dr_path ? "yes" : "no"); + } + + if (!p->ra->fns->tex_upload(p->ra, ¶ms)) { + timer_pool_stop(p->upload_timer); + goto error; + } + + if (mapped && !mapped->mpi) + mapped->mpi = mp_image_new_ref(mpi); + } + timer_pool_stop(p->upload_timer); + + bool using_pbo = p->ra->use_pbo || !(p->ra->caps & RA_CAP_DIRECT_UPLOAD); + const char *mode = p->using_dr_path ? "DR" : using_pbo ? 
"PBO" : "naive"; + pass_describe(p, "upload frame (%s)", mode); + pass_record(p, timer_pool_measure(p->upload_timer)); + + return true; + +error: + unref_current_image(p); + p->broken_frame = true; + return false; +} + +static bool test_fbo(struct gl_video *p, const struct ra_format *fmt) +{ + MP_VERBOSE(p, "Testing FBO format %s\n", fmt->name); + struct fbotex fbo = {0}; + bool success = fbotex_change(&fbo, p->ra, p->log, 16, 16, fmt, 0); + fbotex_uninit(&fbo); + return success; +} + +// Return whether dumb-mode can be used without disabling any features. +// Essentially, vo_opengl with mostly default settings will return true. +static bool check_dumb_mode(struct gl_video *p) +{ + struct gl_video_opts *o = &p->opts; + if (p->use_integer_conversion) + return false; + if (o->dumb_mode > 0) // requested by user + return true; + if (o->dumb_mode < 0) // disabled by user + return false; + + // otherwise, use auto-detection + if (o->target_prim || o->target_trc || o->linear_scaling || + o->correct_downscaling || o->sigmoid_upscaling || o->interpolation || + o->blend_subs || o->deband || o->unsharp) + return false; + // check remaining scalers (tscale is already implicitly excluded above) + for (int i = 0; i < SCALER_COUNT; i++) { + if (i != SCALER_TSCALE) { + const char *name = o->scaler[i].kernel.name; + if (name && strcmp(name, "bilinear") != 0) + return false; + } + } + if (o->user_shaders && o->user_shaders[0]) + return false; + if (p->use_lut_3d) + return false; + return true; +} + +// Disable features that are not supported with the current OpenGL version. +static void check_gl_features(struct gl_video *p) +{ + struct ra *ra = p->ra; + bool have_float_tex = !!ra_find_float16_format(ra, 1); + bool have_mglsl = ra->glsl_version >= 130; // modern GLSL + const struct ra_format *rg_tex = ra_find_unorm_format(p->ra, 1, 2); + bool have_texrg = rg_tex && !rg_tex->luminance_alpha; + bool have_compute = ra->caps & RA_CAP_COMPUTE; + bool have_ssbo = ra->caps & RA_CAP_BUF_RW; + + const char *auto_fbo_fmts[] = {"rgba16", "rgba16f", "rgb10_a2", "rgba8", 0}; + const char *user_fbo_fmts[] = {p->opts.fbo_format, 0}; + const char **fbo_fmts = user_fbo_fmts[0] && strcmp(user_fbo_fmts[0], "auto") + ? user_fbo_fmts : auto_fbo_fmts; + bool have_fbo = false; + p->fbo_format = NULL; + for (int n = 0; fbo_fmts[n]; n++) { + const char *fmt = fbo_fmts[n]; + const struct ra_format *f = ra_find_named_format(p->ra, fmt); + if (!f && fbo_fmts == user_fbo_fmts) + MP_WARN(p, "FBO format '%s' not found!\n", fmt); + if (f && f->renderable && f->linear_filter && test_fbo(p, f)) { + MP_VERBOSE(p, "Using FBO format %s.\n", f->name); + have_fbo = true; + p->fbo_format = f; + break; + } + } + + p->forced_dumb_mode = p->opts.dumb_mode > 0 || !have_fbo || !have_texrg; + bool voluntarily_dumb = check_dumb_mode(p); + if (p->forced_dumb_mode || voluntarily_dumb) { + if (voluntarily_dumb) { + MP_VERBOSE(p, "No advanced processing required. Enabling dumb mode.\n"); + } else if (p->opts.dumb_mode <= 0) { + MP_WARN(p, "High bit depth FBOs unsupported. Enabling dumb mode.\n" + "Most extended features will be disabled.\n"); + } + p->dumb_mode = true; + p->use_lut_3d = false; + // Most things don't work, so whitelist all options that still work. 
+ p->opts = (struct gl_video_opts){ + .gamma = p->opts.gamma, + .gamma_auto = p->opts.gamma_auto, + .pbo = p->opts.pbo, + .fbo_format = p->opts.fbo_format, + .alpha_mode = p->opts.alpha_mode, + .use_rectangle = p->opts.use_rectangle, + .background = p->opts.background, + .dither_algo = p->opts.dither_algo, + .dither_depth = p->opts.dither_depth, + .dither_size = p->opts.dither_size, + .temporal_dither = p->opts.temporal_dither, + .temporal_dither_period = p->opts.temporal_dither_period, + .tex_pad_x = p->opts.tex_pad_x, + .tex_pad_y = p->opts.tex_pad_y, + .tone_mapping = p->opts.tone_mapping, + .tone_mapping_param = p->opts.tone_mapping_param, + .tone_mapping_desat = p->opts.tone_mapping_desat, + .early_flush = p->opts.early_flush, + }; + for (int n = 0; n < SCALER_COUNT; n++) + p->opts.scaler[n] = gl_video_opts_def.scaler[n]; + return; + } + p->dumb_mode = false; + + // Normally, we want to disable them by default if FBOs are unavailable, + // because they will be slow (not critically slow, but still slower). + // Without FP textures, we must always disable them. + // I don't know if luminance alpha float textures exist, so disregard them. + for (int n = 0; n < SCALER_COUNT; n++) { + const struct filter_kernel *kernel = + mp_find_filter_kernel(p->opts.scaler[n].kernel.name); + if (kernel) { + char *reason = NULL; + if (!have_float_tex) + reason = "(float tex. missing)"; + if (!have_mglsl) + reason = "(GLSL version too old)"; + if (reason) { + MP_WARN(p, "Disabling scaler #%d %s %s.\n", n, + p->opts.scaler[n].kernel.name, reason); + // p->opts is a copy => we can just mess with it. + p->opts.scaler[n].kernel.name = "bilinear"; + if (n == SCALER_TSCALE) + p->opts.interpolation = 0; + } + } + } + + int use_cms = p->opts.target_prim != MP_CSP_PRIM_AUTO || + p->opts.target_trc != MP_CSP_TRC_AUTO || p->use_lut_3d; + + // mix() is needed for some gamma functions + if (!have_mglsl && (p->opts.linear_scaling || p->opts.sigmoid_upscaling)) { + p->opts.linear_scaling = false; + p->opts.sigmoid_upscaling = false; + MP_WARN(p, "Disabling linear/sigmoid scaling (GLSL version too old).\n"); + } + if (!have_mglsl && use_cms) { + p->opts.target_prim = MP_CSP_PRIM_AUTO; + p->opts.target_trc = MP_CSP_TRC_AUTO; + p->use_lut_3d = false; + MP_WARN(p, "Disabling color management (GLSL version too old).\n"); + } + if (!have_mglsl && p->opts.deband) { + p->opts.deband = 0; + MP_WARN(p, "Disabling debanding (GLSL version too old).\n"); + } + if ((!have_compute || !have_ssbo) && p->opts.compute_hdr_peak) { + p->opts.compute_hdr_peak = 0; + MP_WARN(p, "Disabling HDR peak computation (no compute shaders).\n"); + } +} + +static void init_gl(struct gl_video *p) +{ + debug_check_gl(p, "before init_gl"); + + p->upload_timer = timer_pool_create(p->ra); + p->blit_timer = timer_pool_create(p->ra); + p->osd_timer = timer_pool_create(p->ra); + + debug_check_gl(p, "after init_gl"); + + ra_dump_tex_formats(p->ra, MSGL_DEBUG); + ra_dump_img_formats(p->ra, MSGL_DEBUG); +} + +void gl_video_uninit(struct gl_video *p) +{ + if (!p) + return; + + uninit_video(p); + + gl_sc_destroy(p->sc); + + ra_tex_free(p->ra, &p->lut_3d_texture); + ra_buf_free(p->ra, &p->hdr_peak_ssbo); + + timer_pool_destroy(p->upload_timer); + timer_pool_destroy(p->blit_timer); + timer_pool_destroy(p->osd_timer); + + for (int i = 0; i < PASS_INFO_MAX; i++) { + talloc_free(p->pass_fresh[i].desc.start); + talloc_free(p->pass_redraw[i].desc.start); + } + + mpgl_osd_destroy(p->osd); + + // Forcibly destroy possibly remaining image references. 
This should also + // cause gl_video_dr_free_buffer() to be called for the remaining buffers. + gc_pending_dr_fences(p, true); + + // Should all have been unreffed already. + assert(!p->num_dr_buffers); + + talloc_free(p); +} + +void gl_video_reset(struct gl_video *p) +{ + gl_video_reset_surfaces(p); +} + +bool gl_video_showing_interpolated_frame(struct gl_video *p) +{ + return p->is_interpolated; +} + +static bool is_imgfmt_desc_supported(struct gl_video *p, + const struct ra_imgfmt_desc *desc) +{ + if (!desc->num_planes) + return false; + + if (desc->planes[0]->ctype == RA_CTYPE_UINT && p->forced_dumb_mode) + return false; + + return true; +} + +bool gl_video_check_format(struct gl_video *p, int mp_format) +{ + struct ra_imgfmt_desc desc; + if (ra_get_imgfmt_desc(p->ra, mp_format, &desc) && + is_imgfmt_desc_supported(p, &desc)) + return true; + if (p->hwdec && ra_hwdec_test_format(p->hwdec, mp_format)) + return true; + return false; +} + +void gl_video_config(struct gl_video *p, struct mp_image_params *params) +{ + unmap_overlay(p); + unref_current_image(p); + + if (!mp_image_params_equal(&p->real_image_params, params)) { + uninit_video(p); + p->real_image_params = *params; + p->image_params = *params; + if (params->imgfmt) + init_video(p); + } + + gl_video_reset_surfaces(p); +} + +void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd) +{ + mpgl_osd_destroy(p->osd); + p->osd = NULL; + p->osd_state = osd; + reinit_osd(p); +} + +struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, + struct mpv_global *g) +{ + struct gl_video *p = talloc_ptrtype(NULL, p); + *p = (struct gl_video) { + .ra = ra, + .global = g, + .log = log, + .sc = gl_sc_create(ra, g, log), + .video_eq = mp_csp_equalizer_create(p, g), + .opts_cache = m_config_cache_alloc(p, g, &gl_video_conf), + }; + // make sure this variable is initialized to *something* + p->pass = p->pass_fresh; + struct gl_video_opts *opts = p->opts_cache->opts; + p->cms = gl_lcms_init(p, log, g, opts->icc_opts), + p->opts = *opts; + for (int n = 0; n < SCALER_COUNT; n++) + p->scaler[n] = (struct scaler){.index = n}; + init_gl(p); + reinit_from_options(p); + return p; +} + +// Get static string for scaler shader. If "tscale" is set to true, the +// scaler must be a separable convolution filter. +static const char *handle_scaler_opt(const char *name, bool tscale) +{ + if (name && name[0]) { + const struct filter_kernel *kernel = mp_find_filter_kernel(name); + if (kernel && (!tscale || !kernel->polar)) + return kernel->f.name; + + for (const char *const *filter = tscale ? fixed_tscale_filters + : fixed_scale_filters; + *filter; filter++) { + if (strcmp(*filter, name) == 0) + return *filter; + } + } + return NULL; +} + +void gl_video_update_options(struct gl_video *p) +{ + if (m_config_cache_update(p->opts_cache)) { + gl_lcms_update_options(p->cms); + reinit_from_options(p); + } +} + +static void reinit_from_options(struct gl_video *p) +{ + p->use_lut_3d = gl_lcms_has_profile(p->cms); + + // Copy the option fields, so that check_gl_features() can mutate them. + // This works only for the fields themselves of course, not for any memory + // referenced by them. 
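/*
 * Illustrative sketch (editorial aside, not part of this patch): why the
 * copy mentioned in the comment above is only "field deep". Assigning a
 * struct duplicates scalar members and pointer values, but not the memory
 * those pointers reference. The struct here is a hypothetical stand-in,
 * not the real gl_video_opts.
 */
#include <stdio.h>

struct opts {
    int deband;              /* scalar: private to each copy */
    const char *fbo_format;  /* pointer: the string storage stays shared */
};

int main(void)
{
    struct opts cached = { .deband = 1, .fbo_format = "auto" };
    struct opts copy = cached;   /* same idea as p->opts = *opts_cache->opts */

    copy.deband = 0;             /* safe: does not touch the cached options */
    printf("cached.deband=%d copy.deband=%d\n", cached.deband, copy.deband);
    printf("string storage shared: %d\n", cached.fbo_format == copy.fbo_format);
    return 0;
}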
+ p->opts = *(struct gl_video_opts *)p->opts_cache->opts; + + if (!p->force_clear_color) + p->clear_color = p->opts.background; + + check_gl_features(p); + uninit_rendering(p); + gl_sc_set_cache_dir(p->sc, p->opts.shader_cache_dir); + p->ra->use_pbo = p->opts.pbo; + gl_video_setup_hooks(p); + reinit_osd(p); + + if (p->opts.interpolation && !p->global->opts->video_sync && !p->dsi_warned) { + MP_WARN(p, "Interpolation now requires enabling display-sync mode.\n" + "E.g.: --video-sync=display-resample\n"); + p->dsi_warned = true; + } +} + +void gl_video_configure_queue(struct gl_video *p, struct vo *vo) +{ + int queue_size = 1; + + // Figure out an adequate size for the interpolation queue. The larger + // the radius, the earlier we need to queue frames. + if (p->opts.interpolation) { + const struct filter_kernel *kernel = + mp_find_filter_kernel(p->opts.scaler[SCALER_TSCALE].kernel.name); + if (kernel) { + // filter_scale wouldn't be correctly initialized were we to use it here. + // This is fine since we're always upsampling, but beware if downsampling + // is added! + double radius = kernel->f.radius; + radius = radius > 0 ? radius : p->opts.scaler[SCALER_TSCALE].radius; + queue_size += 1 + ceil(radius); + } else { + // Oversample/linear case + queue_size += 2; + } + } + + vo_set_queue_params(vo, 0, queue_size); +} + +static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + char s[20] = {0}; + int r = 1; + bool tscale = bstr_equals0(name, "tscale"); + if (bstr_equals0(param, "help")) { + r = M_OPT_EXIT; + } else { + snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); + if (!handle_scaler_opt(s, tscale)) + r = M_OPT_INVALID; + } + if (r < 1) { + mp_info(log, "Available scalers:\n"); + for (const char *const *filter = tscale ? 
fixed_tscale_filters + : fixed_scale_filters; + *filter; filter++) { + mp_info(log, " %s\n", *filter); + } + for (int n = 0; mp_filter_kernels[n].f.name; n++) { + if (!tscale || !mp_filter_kernels[n].polar) + mp_info(log, " %s\n", mp_filter_kernels[n].f.name); + } + if (s[0]) + mp_fatal(log, "No scaler named '%s' found!\n", s); + } + return r; +} + +static int validate_window_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + char s[20] = {0}; + int r = 1; + if (bstr_equals0(param, "help")) { + r = M_OPT_EXIT; + } else { + snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); + const struct filter_window *window = mp_find_filter_window(s); + if (!window) + r = M_OPT_INVALID; + } + if (r < 1) { + mp_info(log, "Available windows:\n"); + for (int n = 0; mp_filter_windows[n].name; n++) + mp_info(log, " %s\n", mp_filter_windows[n].name); + if (s[0]) + mp_fatal(log, "No window named '%s' found!\n", s); + } + return r; +} + +float gl_video_scale_ambient_lux(float lmin, float lmax, + float rmin, float rmax, float lux) +{ + assert(lmax > lmin); + + float num = (rmax - rmin) * (log10(lux) - log10(lmin)); + float den = log10(lmax) - log10(lmin); + float result = num / den + rmin; + + // clamp the result + float max = MPMAX(rmax, rmin); + float min = MPMIN(rmax, rmin); + return MPMAX(MPMIN(result, max), min); +} + +void gl_video_set_ambient_lux(struct gl_video *p, int lux) +{ + if (p->opts.gamma_auto) { + float gamma = gl_video_scale_ambient_lux(16.0, 64.0, 2.40, 1.961, lux); + MP_VERBOSE(p, "ambient light changed: %dlux (gamma: %f)\n", lux, gamma); + p->opts.gamma = MPMIN(1.0, 1.961 / gamma); + } +} + +void gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec) +{ + unref_current_image(p); + ra_hwdec_mapper_free(&p->hwdec_mapper); + p->hwdec = hwdec; +} + +static void *gl_video_dr_alloc_buffer(struct gl_video *p, size_t size) +{ + struct ra_buf_params params = { + .type = RA_BUF_TYPE_TEX_UPLOAD, + .host_mapped = true, + .size = size, + }; + + struct ra_buf *buf = ra_buf_create(p->ra, ¶ms); + if (!buf) + return NULL; + + MP_TARRAY_GROW(p, p->dr_buffers, p->num_dr_buffers); + p->dr_buffers[p->num_dr_buffers++] = (struct dr_buffer){ .buf = buf }; + + return buf->data; +}; + +static void gl_video_dr_free_buffer(void *opaque, uint8_t *data) +{ + struct gl_video *p = opaque; + + for (int n = 0; n < p->num_dr_buffers; n++) { + struct dr_buffer *buffer = &p->dr_buffers[n]; + if (buffer->buf->data == data) { + assert(!buffer->mpi); // can't be freed while it has a ref + ra_buf_free(p->ra, &buffer->buf); + MP_TARRAY_REMOVE_AT(p->dr_buffers, p->num_dr_buffers, n); + return; + } + } + // not found - must not happen + assert(0); +} + +struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h, + int stride_align) +{ + int size = mp_image_get_alloc_size(imgfmt, w, h, stride_align); + if (size < 0) + return NULL; + + int alloc_size = size + stride_align; + void *ptr = gl_video_dr_alloc_buffer(p, alloc_size); + if (!ptr) + return NULL; + + // (we expect vo.c to proxy the free callback, so it happens in the same + // thread it was allocated in, removing the need for synchronization) + struct mp_image *res = mp_image_from_buffer(imgfmt, w, h, stride_align, + ptr, alloc_size, p, + gl_video_dr_free_buffer); + if (!res) + gl_video_dr_free_buffer(p, ptr); + return res; +} diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h new file mode 100644 index 0000000000..884f5914fd --- /dev/null +++ b/video/out/gpu/video.h @@ -0,0 +1,194 @@ +/* + * This file is 
part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#ifndef MP_GL_VIDEO_H +#define MP_GL_VIDEO_H + +#include + +#include "options/m_option.h" +#include "sub/osd.h" +#include "utils.h" +#include "lcms.h" +#include "shader_cache.h" +#include "video/csputils.h" +#include "video/out/filter_kernels.h" + +// Assume we have this many texture units for sourcing additional passes. +// The actual texture unit assignment is dynamic. +#define TEXUNIT_VIDEO_NUM 6 + +struct scaler_fun { + char *name; + float params[2]; + float blur; + float taper; +}; + +struct scaler_config { + struct scaler_fun kernel; + struct scaler_fun window; + float radius; + float antiring; + float cutoff; + float clamp; +}; + +struct scaler { + int index; + struct scaler_config conf; + double scale_factor; + bool initialized; + struct filter_kernel *kernel; + struct ra_tex *lut; + struct fbotex sep_fbo; + bool insufficient; + int lut_size; + + // kernel points here + struct filter_kernel kernel_storage; +}; + +enum scaler_unit { + SCALER_SCALE, // luma/video + SCALER_DSCALE, // luma-video downscaling + SCALER_CSCALE, // chroma upscaling + SCALER_TSCALE, // temporal scaling (interpolation) + SCALER_COUNT +}; + +enum dither_algo { + DITHER_NONE = 0, + DITHER_FRUIT, + DITHER_ORDERED, +}; + +enum alpha_mode { + ALPHA_NO = 0, + ALPHA_YES, + ALPHA_BLEND, + ALPHA_BLEND_TILES, +}; + +enum blend_subs_mode { + BLEND_SUBS_NO = 0, + BLEND_SUBS_YES, + BLEND_SUBS_VIDEO, +}; + +enum tone_mapping { + TONE_MAPPING_CLIP, + TONE_MAPPING_MOBIUS, + TONE_MAPPING_REINHARD, + TONE_MAPPING_HABLE, + TONE_MAPPING_GAMMA, + TONE_MAPPING_LINEAR, +}; + +// How many frames to average over for HDR peak detection +#define PEAK_DETECT_FRAMES 100 + +struct gl_video_opts { + int dumb_mode; + struct scaler_config scaler[4]; + int scaler_lut_size; + float gamma; + int gamma_auto; + int target_prim; + int target_trc; + int target_brightness; + int tone_mapping; + int compute_hdr_peak; + float tone_mapping_param; + float tone_mapping_desat; + int gamut_warning; + int linear_scaling; + int correct_downscaling; + int sigmoid_upscaling; + float sigmoid_center; + float sigmoid_slope; + int scaler_resizes_only; + int pbo; + int dither_depth; + int dither_algo; + int dither_size; + int temporal_dither; + int temporal_dither_period; + char *fbo_format; + int alpha_mode; + int use_rectangle; + struct m_color background; + int interpolation; + float interpolation_threshold; + int blend_subs; + char **user_shaders; + int deband; + struct deband_opts *deband_opts; + float unsharp; + int tex_pad_x, tex_pad_y; + struct mp_icc_opts *icc_opts; + int early_flush; + char *shader_cache_dir; +}; + +extern const struct m_sub_options gl_video_conf; + +struct gl_video; +struct vo_frame; + +struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, + struct mpv_global *g); +void gl_video_uninit(struct gl_video *p); +void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd); +void 
gl_video_update_options(struct gl_video *p); +bool gl_video_check_format(struct gl_video *p, int mp_format); +void gl_video_config(struct gl_video *p, struct mp_image_params *params); +void gl_video_set_output_depth(struct gl_video *p, int r, int g, int b); +void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, + struct fbodst target); +void gl_video_resize(struct gl_video *p, + struct mp_rect *src, struct mp_rect *dst, + struct mp_osd_res *osd); +void gl_video_set_fb_depth(struct gl_video *p, int fb_depth); +void gl_video_perfdata(struct gl_video *p, struct voctrl_performance_data *out); +void gl_video_set_clear_color(struct gl_video *p, struct m_color color); +void gl_video_set_osd_pts(struct gl_video *p, double pts); +bool gl_video_check_osd_change(struct gl_video *p, struct mp_osd_res *osd, + double pts); + +float gl_video_scale_ambient_lux(float lmin, float lmax, + float rmin, float rmax, float lux); +void gl_video_set_ambient_lux(struct gl_video *p, int lux); +void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data); +bool gl_video_icc_auto_enabled(struct gl_video *p); +bool gl_video_gamma_auto_enabled(struct gl_video *p); +struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p); + +void gl_video_reset(struct gl_video *p); +bool gl_video_showing_interpolated_frame(struct gl_video *p); + +struct ra_hwdec; +void gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec); + +struct vo; +void gl_video_configure_queue(struct gl_video *p, struct vo *vo); + +struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h, + int stride_align); + + +#endif diff --git a/video/out/gpu/video_shaders.c b/video/out/gpu/video_shaders.c new file mode 100644 index 0000000000..60c5ce82ac --- /dev/null +++ b/video/out/gpu/video_shaders.c @@ -0,0 +1,872 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include + +#include "video_shaders.h" +#include "video.h" + +#define GLSL(x) gl_sc_add(sc, #x "\n"); +#define GLSLF(...) gl_sc_addf(sc, __VA_ARGS__) +#define GLSLH(x) gl_sc_hadd(sc, #x "\n"); +#define GLSLHF(...) 
gl_sc_haddf(sc, __VA_ARGS__) + +// Set up shared/commonly used variables and macros +void sampler_prelude(struct gl_shader_cache *sc, int tex_num) +{ + GLSLF("#undef tex\n"); + GLSLF("#undef texmap\n"); + GLSLF("#define tex texture%d\n", tex_num); + GLSLF("#define texmap texmap%d\n", tex_num); + GLSLF("vec2 pos = texcoord%d;\n", tex_num); + GLSLF("vec2 size = texture_size%d;\n", tex_num); + GLSLF("vec2 pt = pixel_size%d;\n", tex_num); +} + +static void pass_sample_separated_get_weights(struct gl_shader_cache *sc, + struct scaler *scaler) +{ + gl_sc_uniform_texture(sc, "lut", scaler->lut); + GLSLF("float ypos = LUT_POS(fcoord, %d.0);\n", scaler->lut_size); + + int N = scaler->kernel->size; + int width = (N + 3) / 4; // round up + + GLSLF("float weights[%d];\n", N); + for (int i = 0; i < N; i++) { + if (i % 4 == 0) + GLSLF("c = texture(lut, vec2(%f, ypos));\n", (i / 4 + 0.5) / width); + GLSLF("weights[%d] = c[%d];\n", i, i % 4); + } +} + +// Handle a single pass (either vertical or horizontal). The direction is given +// by the vector (d_x, d_y). If the vector is 0, then planar interpolation is +// used instead (samples from texture0 through textureN) +void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler, + int d_x, int d_y) +{ + int N = scaler->kernel->size; + bool use_ar = scaler->conf.antiring > 0; + bool planar = d_x == 0 && d_y == 0; + GLSL(color = vec4(0.0);) + GLSLF("{\n"); + if (!planar) { + GLSLF("vec2 dir = vec2(%d.0, %d.0);\n", d_x, d_y); + GLSL(pt *= dir;) + GLSL(float fcoord = dot(fract(pos * size - vec2(0.5)), dir);) + GLSLF("vec2 base = pos - fcoord * pt - pt * vec2(%d.0);\n", N / 2 - 1); + } + GLSL(vec4 c;) + if (use_ar) { + GLSL(vec4 hi = vec4(0.0);) + GLSL(vec4 lo = vec4(1.0);) + } + pass_sample_separated_get_weights(sc, scaler); + GLSLF("// scaler samples\n"); + for (int n = 0; n < N; n++) { + if (planar) { + GLSLF("c = texture(texture%d, texcoord%d);\n", n, n); + } else { + GLSLF("c = texture(tex, base + pt * vec2(%d.0));\n", n); + } + GLSLF("color += vec4(weights[%d]) * c;\n", n); + if (use_ar && (n == N/2-1 || n == N/2)) { + GLSL(lo = min(lo, c);) + GLSL(hi = max(hi, c);) + } + } + if (use_ar) + GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n", + scaler->conf.antiring); + GLSLF("}\n"); +} + +// Subroutine for computing and adding an individual texel contribution +// If subtexel < 0 and offset < 0, samples directly. +// If subtexel >= 0, takes the texel from cN[subtexel] +// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset] +static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, + int x, int y, int subtexel, int offset, int components) +{ + double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale; + double radius_cutoff = scaler->kernel->radius_cutoff; + + // Since we can't know the subpixel position in advance, assume a + // worst case scenario + int yy = y > 0 ? y-1 : y; + int xx = x > 0 ? 
x-1 : x; + double dmax = sqrt(xx*xx + yy*yy); + // Skip samples definitely outside the radius + if (dmax >= radius_cutoff) + return; + GLSLF("d = length(vec2(%d.0, %d.0) - fcoord);\n", x, y); + // Check for samples that might be skippable + bool maybe_skippable = dmax >= radius_cutoff - M_SQRT2; + if (maybe_skippable) + GLSLF("if (d < %f) {\n", radius_cutoff); + + // get the weight for this pixel + if (scaler->lut->params.dimensions == 1) { + GLSLF("w = tex1D(lut, LUT_POS(d * 1.0/%f, %d.0)).r;\n", + radius, scaler->lut_size); + } else { + GLSLF("w = texture(lut, vec2(0.5, LUT_POS(d * 1.0/%f, %d.0))).r;\n", + radius, scaler->lut_size); + } + GLSL(wsum += w;) + + if (subtexel < 0 && offset < 0) { + GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y); + GLSL(color += vec4(w) * c0;) + } else if (subtexel >= 0) { + for (int n = 0; n < components; n++) + GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel); + } else if (offset >= 0) { + for (int n = 0; n lut); + + GLSLF("// scaler samples\n"); + int bound = ceil(scaler->kernel->radius_cutoff); + for (int y = 1-bound; y <= bound; y += 2) { + for (int x = 1-bound; x <= bound; x += 2) { + // First we figure out whether it's more efficient to use direct + // sampling or gathering. The problem is that gathering 4 texels + // only to discard some of them is very wasteful, so only do it if + // we suspect it will be a win rather than a loss. This is the case + // exactly when all four texels are within bounds + bool use_gather = sqrt(x*x + y*y) < scaler->kernel->radius_cutoff; + + // textureGather is only supported in GLSL 400+ + if (glsl_version < 400) + use_gather = false; + + if (use_gather) { + // Gather the four surrounding texels simultaneously + for (int n = 0; n < components; n++) { + GLSLF("c%d = textureGatherOffset(tex, base, ivec2(%d, %d), %d);\n", + n, x, y, n); + } + + // Mix in all of the points with their weights + for (int p = 0; p < 4; p++) { + // The four texels are gathered counterclockwise starting + // from the bottom left + static const int xo[4] = {0, 1, 1, 0}; + static const int yo[4] = {1, 1, 0, 0}; + if (x+xo[p] > bound || y+yo[p] > bound) + continue; + polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components); + } + } else { + // switch to direct sampling instead, for efficiency/compatibility + for (int yy = y; yy <= bound && yy <= y+1; yy++) { + for (int xx = x; xx <= bound && xx <= x+1; xx++) + polar_sample(sc, scaler, xx, yy, -1, -1, components); + } + } + } + } + + GLSL(color = color / vec4(wsum);) + GLSLF("}\n"); +} + +// bw/bh: block size +// iw/ih: input size (pre-calculated to fit all required texels) +void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, + int components, int bw, int bh, int iw, int ih) +{ + int bound = ceil(scaler->kernel->radius_cutoff); + int offset = bound - 1; // padding top/left + + GLSL(color = vec4(0.0);) + GLSLF("{\n"); + GLSL(vec2 wpos = texmap(gl_WorkGroupID * gl_WorkGroupSize);) + GLSL(vec2 wbase = wpos - pt * fract(wpos * size - vec2(0.5));) + GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) + GLSL(vec2 base = pos - pt * fcoord;) + GLSL(ivec2 rel = ivec2(round((base - wbase) * size));) + GLSLF("float w, d, wsum = 0.0;\n"); + gl_sc_uniform_texture(sc, "lut", scaler->lut); + + // Load all relevant texels into shmem + gl_sc_enable_extension(sc, "GL_ARB_arrays_of_arrays"); + for (int c = 0; c < components; c++) + GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw); + + GLSL(vec4 c;) + GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) 
{\n", ih, bh); + GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw); + GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset); + for (int c = 0; c < components; c++) + GLSLF("in%d[y][x] = c[%d];\n", c, c); + GLSLF("}}\n"); + GLSL(groupMemoryBarrier();) + GLSL(barrier();) + + // Dispatch the actual samples + GLSLF("// scaler samples\n"); + for (int y = 1-bound; y <= bound; y++) { + for (int x = 1-bound; x <= bound; x++) + polar_sample(sc, scaler, x, y, -1, offset, components); + } + + GLSL(color = color / vec4(wsum);) + GLSLF("}\n"); +} + +static void bicubic_calcweights(struct gl_shader_cache *sc, const char *t, const char *s) +{ + // Explanation of how bicubic scaling with only 4 texel fetches is done: + // http://www.mate.tue.nl/mate/pdfs/10318.pdf + // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines' + // Explanation why this algorithm normally always blurs, even with unit + // scaling: + // http://bigwww.epfl.ch/preprints/ruijters1001p.pdf + // 'GPU Prefilter for Accurate Cubic B-spline Interpolation' + GLSLF("vec4 %s = vec4(-0.5, 0.1666, 0.3333, -0.3333) * %s" + " + vec4(1, 0, -0.5, 0.5);\n", t, s); + GLSLF("%s = %s * %s + vec4(0, 0, -0.5, 0.5);\n", t, t, s); + GLSLF("%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666);\n", t, t, s); + GLSLF("%s.xy *= vec2(1, 1) / vec2(%s.z, %s.w);\n", t, t, t); + GLSLF("%s.xy += vec2(1.0 + %s, 1.0 - %s);\n", t, s, s); +} + +void pass_sample_bicubic_fast(struct gl_shader_cache *sc) +{ + GLSLF("{\n"); + GLSL(vec2 fcoord = fract(pos * size + vec2(0.5, 0.5));) + bicubic_calcweights(sc, "parmx", "fcoord.x"); + bicubic_calcweights(sc, "parmy", "fcoord.y"); + GLSL(vec4 cdelta;) + GLSL(cdelta.xz = parmx.rg * vec2(-pt.x, pt.x);) + GLSL(cdelta.yw = parmy.rg * vec2(-pt.y, pt.y);) + // first y-interpolation + GLSL(vec4 ar = texture(tex, pos + cdelta.xy);) + GLSL(vec4 ag = texture(tex, pos + cdelta.xw);) + GLSL(vec4 ab = mix(ag, ar, parmy.b);) + // second y-interpolation + GLSL(vec4 br = texture(tex, pos + cdelta.zy);) + GLSL(vec4 bg = texture(tex, pos + cdelta.zw);) + GLSL(vec4 aa = mix(bg, br, parmy.b);) + // x-interpolation + GLSL(color = mix(aa, ab, parmx.b);) + GLSLF("}\n"); +} + +void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler, + int w, int h) +{ + GLSLF("{\n"); + GLSL(vec2 pos = pos - vec2(0.5) * pt;) // round to nearest + GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) + // Determine the mixing coefficient vector + gl_sc_uniform_vec2(sc, "output_size", (float[2]){w, h}); + GLSL(vec2 coeff = fcoord * output_size/size;) + float threshold = scaler->conf.kernel.params[0]; + threshold = isnan(threshold) ? 
0.0 : threshold; + GLSLF("coeff = (coeff - %f) * 1.0/%f;\n", threshold, 1.0 - 2 * threshold); + GLSL(coeff = clamp(coeff, 0.0, 1.0);) + // Compute the right blend of colors + GLSL(color = texture(tex, pos + pt * (coeff - fcoord));) + GLSLF("}\n"); +} + +// Common constants for SMPTE ST.2084 (HDR) +static const float PQ_M1 = 2610./4096 * 1./4, + PQ_M2 = 2523./4096 * 128, + PQ_C1 = 3424./4096, + PQ_C2 = 2413./4096 * 32, + PQ_C3 = 2392./4096 * 32; + +// Common constants for ARIB STD-B67 (HLG) +static const float HLG_A = 0.17883277, + HLG_B = 0.28466892, + HLG_C = 0.55991073; + +// Common constants for Panasonic V-Log +static const float VLOG_B = 0.00873, + VLOG_C = 0.241514, + VLOG_D = 0.598206; + +// Common constants for Sony S-Log +static const float SLOG_A = 0.432699, + SLOG_B = 0.037584, + SLOG_C = 0.616596 + 0.03, + SLOG_P = 3.538813, + SLOG_Q = 0.030001, + SLOG_K2 = 155.0 / 219.0; + +// Linearize (expand), given a TRC as input. In essence, this is the ITU-R +// EOTF, calculated on an idealized (reference) monitor with a white point of +// MP_REF_WHITE and infinite contrast. +void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc) +{ + if (trc == MP_CSP_TRC_LINEAR) + return; + + GLSLF("// linearize\n"); + + // Note that this clamp may technically violate the definition of + // ITU-R BT.2100, which allows for sub-blacks and super-whites to be + // displayed on the display where such would be possible. That said, the + // problem is that not all gamma curves are well-defined on the values + // outside this range, so we ignore it and just clip anyway for sanity. + GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) + + switch (trc) { + case MP_CSP_TRC_SRGB: + GLSL(color.rgb = mix(color.rgb * vec3(1.0/12.92), + pow((color.rgb + vec3(0.055))/vec3(1.055), vec3(2.4)), + lessThan(vec3(0.04045), color.rgb));) + break; + case MP_CSP_TRC_BT_1886: + GLSL(color.rgb = pow(color.rgb, vec3(2.4));) + break; + case MP_CSP_TRC_GAMMA18: + GLSL(color.rgb = pow(color.rgb, vec3(1.8));) + break; + case MP_CSP_TRC_GAMMA22: + GLSL(color.rgb = pow(color.rgb, vec3(2.2));) + break; + case MP_CSP_TRC_GAMMA28: + GLSL(color.rgb = pow(color.rgb, vec3(2.8));) + break; + case MP_CSP_TRC_PRO_PHOTO: + GLSL(color.rgb = mix(color.rgb * vec3(1.0/16.0), + pow(color.rgb, vec3(1.8)), + lessThan(vec3(0.03125), color.rgb));) + break; + case MP_CSP_TRC_PQ: + GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", PQ_M2); + GLSLF("color.rgb = max(color.rgb - vec3(%f), vec3(0.0)) \n" + " / (vec3(%f) - vec3(%f) * color.rgb);\n", + PQ_C1, PQ_C2, PQ_C3); + GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", PQ_M1); + // PQ's output range is 0-10000, but we need it to be relative to to + // MP_REF_WHITE instead, so rescale + GLSLF("color.rgb *= vec3(%f);\n", 10000 / MP_REF_WHITE); + break; + case MP_CSP_TRC_HLG: + GLSLF("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb,\n" + " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) + vec3(%f),\n" + " lessThan(vec3(0.5), color.rgb));\n", + HLG_C, HLG_A, HLG_B); + break; + case MP_CSP_TRC_V_LOG: + GLSLF("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n" + " pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f), \n" + " lessThanEqual(vec3(0.181), color.rgb)); \n", + VLOG_D, VLOG_C, VLOG_B); + break; + case MP_CSP_TRC_S_LOG1: + GLSLF("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f))\n" + " - vec3(%f);\n", + SLOG_C, SLOG_A, SLOG_B); + break; + case MP_CSP_TRC_S_LOG2: + GLSLF("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f), \n" + " 
(pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f)) * vec3(1.0/%f), \n" + " lessThanEqual(vec3(%f), color.rgb)); \n", + SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q); + break; + default: + abort(); + } + + // Rescale to prevent clipping on non-float textures + GLSLF("color.rgb *= vec3(1.0/%f);\n", mp_trc_nom_peak(trc)); +} + +// Delinearize (compress), given a TRC as output. This corresponds to the +// inverse EOTF (not the OETF) in ITU-R terminology, again assuming a +// reference monitor. +void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc) +{ + if (trc == MP_CSP_TRC_LINEAR) + return; + + GLSLF("// delinearize\n"); + GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) + GLSLF("color.rgb *= vec3(%f);\n", mp_trc_nom_peak(trc)); + + switch (trc) { + case MP_CSP_TRC_SRGB: + GLSL(color.rgb = mix(color.rgb * vec3(12.92), + vec3(1.055) * pow(color.rgb, vec3(1.0/2.4)) + - vec3(0.055), + lessThanEqual(vec3(0.0031308), color.rgb));) + break; + case MP_CSP_TRC_BT_1886: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.4));) + break; + case MP_CSP_TRC_GAMMA18: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.8));) + break; + case MP_CSP_TRC_GAMMA22: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.2));) + break; + case MP_CSP_TRC_GAMMA28: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.8));) + break; + case MP_CSP_TRC_PRO_PHOTO: + GLSL(color.rgb = mix(color.rgb * vec3(16.0), + pow(color.rgb, vec3(1.0/1.8)), + lessThanEqual(vec3(0.001953), color.rgb));) + break; + case MP_CSP_TRC_PQ: + GLSLF("color.rgb *= vec3(1.0/%f);\n", 10000 / MP_REF_WHITE); + GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", PQ_M1); + GLSLF("color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb);\n", + PQ_C1, PQ_C2, PQ_C3); + GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", PQ_M2); + break; + case MP_CSP_TRC_HLG: + GLSLF("color.rgb = mix(vec3(0.5) * sqrt(color.rgb),\n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f),\n" + " lessThan(vec3(1.0), color.rgb));\n", + HLG_A, HLG_B, HLG_C); + break; + case MP_CSP_TRC_V_LOG: + GLSLF("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125), \n" + " vec3(%f) * log(color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.01), color.rgb)); \n", + VLOG_C / M_LN10, VLOG_B, VLOG_D); + break; + case MP_CSP_TRC_S_LOG1: + GLSLF("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n", + SLOG_A / M_LN10, SLOG_B, SLOG_C); + break; + case MP_CSP_TRC_S_LOG2: + GLSLF("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f), \n" + " vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.0), color.rgb)); \n", + SLOG_P, SLOG_Q, SLOG_A / M_LN10, SLOG_K2, SLOG_B, SLOG_C); + break; + default: + abort(); + } +} + +// Apply the OOTF mapping from a given light type to display-referred light. 
+// The extra peak parameter is used to scale the values before and after +// the OOTF, and can be inferred using mp_trc_nom_peak +void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak) +{ + if (light == MP_CSP_LIGHT_DISPLAY) + return; + + GLSLF("// apply ootf\n"); + GLSLF("color.rgb *= vec3(%f);\n", peak); + + switch (light) + { + case MP_CSP_LIGHT_SCENE_HLG: + // HLG OOTF from BT.2100, assuming a reference display with a + // peak of 1000 cd/m² -> gamma = 1.2 + GLSLF("color.rgb *= vec3(%f * pow(dot(src_luma, color.rgb), 0.2));\n", + (1000 / MP_REF_WHITE) / pow(12, 1.2)); + break; + case MP_CSP_LIGHT_SCENE_709_1886: + // This OOTF is defined by encoding the result as 709 and then decoding + // it as 1886; although this is called 709_1886 we actually use the + // more precise (by one decimal) values from BT.2020 instead + GLSL(color.rgb = mix(color.rgb * vec3(4.5), + vec3(1.0993) * pow(color.rgb, vec3(0.45)) - vec3(0.0993), + lessThan(vec3(0.0181), color.rgb));) + GLSL(color.rgb = pow(color.rgb, vec3(2.4));) + break; + case MP_CSP_LIGHT_SCENE_1_2: + GLSL(color.rgb = pow(color.rgb, vec3(1.2));) + break; + default: + abort(); + } + + GLSLF("color.rgb *= vec3(1.0/%f);\n", peak); +} + +// Inverse of the function pass_ootf, for completeness' sake. +void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak) +{ + if (light == MP_CSP_LIGHT_DISPLAY) + return; + + GLSLF("// apply inverse ootf\n"); + GLSLF("color.rgb *= vec3(%f);\n", peak); + + switch (light) + { + case MP_CSP_LIGHT_SCENE_HLG: + GLSLF("color.rgb *= vec3(1.0/%f);\n", (1000 / MP_REF_WHITE) / pow(12, 1.2)); + GLSL(color.rgb /= vec3(max(1e-6, pow(dot(src_luma, color.rgb), 0.2/1.2)));) + break; + case MP_CSP_LIGHT_SCENE_709_1886: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.4));) + GLSL(color.rgb = mix(color.rgb * vec3(1.0/4.5), + pow((color.rgb + vec3(0.0993)) * vec3(1.0/1.0993), + vec3(1/0.45)), + lessThan(vec3(0.08145), color.rgb));) + break; + case MP_CSP_LIGHT_SCENE_1_2: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.2));) + break; + default: + abort(); + } + + GLSLF("color.rgb *= vec3(1.0/%f);\n", peak); +} + +// Tone map from a known peak brightness to the range [0,1]. If ref_peak +// is 0, we will use peak detection instead +static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak, + enum tone_mapping algo, float param, float desat) +{ + GLSLF("// HDR tone mapping\n"); + + // Desaturate the color using a coefficient dependent on the luminance + GLSL(float luma = dot(dst_luma, color.rgb);) + if (desat > 0) { + GLSLF("float overbright = max(luma - %f, 1e-6) / max(luma, 1e-6);\n", desat); + GLSL(color.rgb = mix(color.rgb, vec3(luma), overbright);) + } + + // To prevent discoloration due to out-of-bounds clipping, we need to make + // sure to reduce the value range as far as necessary to keep the entire + // signal in range, so tone map based on the brightest component. + GLSL(float sig = max(max(color.r, color.g), color.b);) + GLSL(float sig_orig = sig;) + + if (!ref_peak) { + // For performance, we want to do as few atomic operations on global + // memory as possible, so use an atomic in shmem for the work group. 
+ // We also want slightly more stable values, so use the group average + // instead of the group max + GLSLHF("shared uint group_sum = 0;\n"); + GLSLF("atomicAdd(group_sum, uint(sig * %f));\n", MP_REF_WHITE); + + // Have one thread in each work group update the frame maximum + GLSL(memoryBarrierBuffer();) + GLSL(barrier();) + GLSL(if (gl_LocalInvocationIndex == 0)) + GLSL(atomicMax(frame_max[index], group_sum / + (gl_WorkGroupSize.x * gl_WorkGroupSize.y));) + + // Finally, have one thread per invocation update the total maximum + // and advance the index + GLSL(memoryBarrierBuffer();) + GLSL(barrier();) + GLSL(if (gl_GlobalInvocationID == ivec3(0)) {) // do this once per invocation + GLSLF("uint next = (index + 1) %% %d;\n", PEAK_DETECT_FRAMES+1); + GLSLF("sig_peak_raw = sig_peak_raw + frame_max[index] - frame_max[next];\n"); + GLSLF("frame_max[next] = %d;\n", (int)MP_REF_WHITE); + GLSL(index = next;) + GLSL(}) + + GLSL(memoryBarrierBuffer();) + GLSL(barrier();) + GLSLF("float sig_peak = 1.0/%f * float(sig_peak_raw);\n", + MP_REF_WHITE * PEAK_DETECT_FRAMES); + } else { + GLSLHF("const float sig_peak = %f;\n", ref_peak); + } + + switch (algo) { + case TONE_MAPPING_CLIP: + GLSLF("sig = %f * sig;\n", isnan(param) ? 1.0 : param); + break; + + case TONE_MAPPING_MOBIUS: + GLSLF("const float j = %f;\n", isnan(param) ? 0.3 : param); + // solve for M(j) = j; M(sig_peak) = 1.0; M'(j) = 1.0 + // where M(x) = scale * (x+a)/(x+b) + GLSLF("float a = -j*j * (sig_peak - 1.0) / (j*j - 2.0*j + sig_peak);\n"); + GLSLF("float b = (j*j - 2.0*j*sig_peak + sig_peak) / " + "max(1e-6, sig_peak - 1.0);\n"); + GLSLF("float scale = (b*b + 2.0*b*j + j*j) / (b-a);\n"); + GLSL(sig = mix(sig, scale * (sig + a) / (sig + b), sig > j);) + break; + + case TONE_MAPPING_REINHARD: { + float contrast = isnan(param) ? 0.5 : param, + offset = (1.0 - contrast) / contrast; + GLSLF("sig = sig / (sig + %f);\n", offset); + GLSLF("float scale = (sig_peak + %f) / sig_peak;\n", offset); + GLSL(sig *= scale;) + break; + } + + case TONE_MAPPING_HABLE: { + float A = 0.15, B = 0.50, C = 0.10, D = 0.20, E = 0.02, F = 0.30; + GLSLHF("float hable(float x) {\n"); + GLSLHF("return ((x * (%f*x + %f)+%f)/(x * (%f*x + %f) + %f)) - %f;\n", + A, C*B, D*E, A, B, D*F, E/F); + GLSLHF("}\n"); + GLSL(sig = hable(sig) / hable(sig_peak);) + break; + } + + case TONE_MAPPING_GAMMA: { + float gamma = isnan(param) ? 1.8 : param; + GLSLF("const float cutoff = 0.05, gamma = %f;\n", 1.0/gamma); + GLSL(float scale = pow(cutoff / sig_peak, gamma) / cutoff;) + GLSL(sig = sig > cutoff ? pow(sig / sig_peak, gamma) : scale * sig;) + break; + } + + case TONE_MAPPING_LINEAR: { + float coeff = isnan(param) ? 1.0 : param; + GLSLF("sig = %f / sig_peak * sig;\n", coeff); + break; + } + + default: + abort(); + } + + // Apply the computed scale factor to the color, linearly to prevent + // discoloration + GLSL(color.rgb *= sig / sig_orig;) +} + +// Map colors from one source space to another. These source spaces must be +// known (i.e. not MP_CSP_*_AUTO), as this function won't perform any +// auto-guessing. If is_linear is true, we assume the input has already been +// linearized (e.g. for linear-scaling). If `detect_peak` is true, we will +// detect the peak instead of relying on metadata. 
Note that this requires +// the caller to have already bound the appropriate SSBO and set up the +// compute shader metadata +void pass_color_map(struct gl_shader_cache *sc, + struct mp_colorspace src, struct mp_colorspace dst, + enum tone_mapping algo, float tone_mapping_param, + float tone_mapping_desat, bool detect_peak, + bool gamut_warning, bool is_linear) +{ + GLSLF("// color mapping\n"); + + // Compute the highest encodable level + float src_range = mp_trc_nom_peak(src.gamma), + dst_range = mp_trc_nom_peak(dst.gamma); + float ref_peak = src.sig_peak / dst_range; + + // Some operations need access to the video's luma coefficients, so make + // them available + float rgb2xyz[3][3]; + mp_get_rgb2xyz_matrix(mp_get_csp_primaries(src.primaries), rgb2xyz); + gl_sc_uniform_vec3(sc, "src_luma", rgb2xyz[1]); + mp_get_rgb2xyz_matrix(mp_get_csp_primaries(dst.primaries), rgb2xyz); + gl_sc_uniform_vec3(sc, "dst_luma", rgb2xyz[1]); + + // All operations from here on require linear light as a starting point, + // so we linearize even if src.gamma == dst.gamma when one of the other + // operations needs it + bool need_gamma = src.gamma != dst.gamma || + src.primaries != dst.primaries || + src_range != dst_range || + src.sig_peak > dst_range || + src.light != dst.light; + + if (need_gamma && !is_linear) { + pass_linearize(sc, src.gamma); + is_linear= true; + } + + if (src.light != dst.light) + pass_ootf(sc, src.light, mp_trc_nom_peak(src.gamma)); + + // Rescale the signal to compensate for differences in the encoding range + // and reference white level. This is necessary because of how mpv encodes + // brightness in textures. + if (src_range != dst_range) { + GLSLF("// rescale value range;\n"); + GLSLF("color.rgb *= vec3(%f);\n", src_range / dst_range); + } + + // Adapt to the right colorspace if necessary + if (src.primaries != dst.primaries) { + struct mp_csp_primaries csp_src = mp_get_csp_primaries(src.primaries), + csp_dst = mp_get_csp_primaries(dst.primaries); + float m[3][3] = {{0}}; + mp_get_cms_matrix(csp_src, csp_dst, MP_INTENT_RELATIVE_COLORIMETRIC, m); + gl_sc_uniform_mat3(sc, "cms_matrix", true, &m[0][0]); + GLSL(color.rgb = cms_matrix * color.rgb;) + // Since this can reduce the gamut, figure out by how much + for (int c = 0; c < 3; c++) + ref_peak = MPMAX(ref_peak, m[c][c]); + } + + // Tone map to prevent clipping when the source signal peak exceeds the + // encodable range or we've reduced the gamut + if (ref_peak > 1) { + pass_tone_map(sc, detect_peak ? 0 : ref_peak, algo, + tone_mapping_param, tone_mapping_desat); + } + + if (src.light != dst.light) + pass_inverse_ootf(sc, dst.light, mp_trc_nom_peak(dst.gamma)); + + // Warn for remaining out-of-gamut colors is enabled + if (gamut_warning) { + GLSL(if (any(greaterThan(color.rgb, vec3(1.01))))) + GLSL(color.rgb = vec3(1.0) - color.rgb;) // invert + } + + if (is_linear) + pass_delinearize(sc, dst.gamma); +} + +// Wide usage friendly PRNG, shamelessly stolen from a GLSL tricks forum post. +// Obtain random numbers by calling rand(h), followed by h = permute(h) to +// update the state. Assumes the texture was hooked. 
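/*
 * Illustrative sketch (editorial aside, not part of this patch): the same
 * permute/rand scheme as prng_init() below, evaluated on the CPU so the
 * state update ("call rand(h), then h = permute(h)") is easy to follow.
 * The seed values are arbitrary.
 */
#include <math.h>
#include <stdio.h>

static float mod289(float x)  { return x - floorf(x * (1.0f / 289.0f)) * 289.0f; }
static float permute(float x) { return mod289((34.0f * x + 1.0f) * x); }
static float prng_rand(float x)
{
    float v = x * (1.0f / 41.0f);
    return v - floorf(v); /* fract() */
}

int main(void)
{
    /* hash an arbitrary "position" plus a per-frame uniform, as in prng_init */
    float m[3] = {0.25f + 1.0f, 0.75f + 1.0f, 0.4321f + 1.0f};
    float h = permute(permute(permute(m[0]) + m[1]) + m[2]);

    for (int i = 0; i < 4; i++) {
        printf("sample %d: %f\n", i, prng_rand(h));
        h = permute(h); /* advance the state */
    }
    return 0;
}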
+static void prng_init(struct gl_shader_cache *sc, AVLFG *lfg) +{ + GLSLH(float mod289(float x) { return x - floor(x * 1.0/289.0) * 289.0; }) + GLSLH(float permute(float x) { return mod289((34.0*x + 1.0) * x); }) + GLSLH(float rand(float x) { return fract(x * 1.0/41.0); }) + + // Initialize the PRNG by hashing the position + a random uniform + GLSL(vec3 _m = vec3(HOOKED_pos, random) + vec3(1.0);) + GLSL(float h = permute(permute(permute(_m.x)+_m.y)+_m.z);) + gl_sc_uniform_f(sc, "random", (double)av_lfg_get(lfg) / UINT32_MAX); +} + +struct deband_opts { + int enabled; + int iterations; + float threshold; + float range; + float grain; +}; + +const struct deband_opts deband_opts_def = { + .iterations = 1, + .threshold = 64.0, + .range = 16.0, + .grain = 48.0, +}; + +#define OPT_BASE_STRUCT struct deband_opts +const struct m_sub_options deband_conf = { + .opts = (const m_option_t[]) { + OPT_INTRANGE("iterations", iterations, 0, 1, 16), + OPT_FLOATRANGE("threshold", threshold, 0, 0.0, 4096.0), + OPT_FLOATRANGE("range", range, 0, 1.0, 64.0), + OPT_FLOATRANGE("grain", grain, 0, 0.0, 4096.0), + {0} + }, + .size = sizeof(struct deband_opts), + .defaults = &deband_opts_def, +}; + +// Stochastically sample a debanded result from a hooked texture. +void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts, + AVLFG *lfg, enum mp_csp_trc trc) +{ + // Initialize the PRNG + GLSLF("{\n"); + prng_init(sc, lfg); + + // Helper: Compute a stochastic approximation of the avg color around a + // pixel + GLSLHF("vec4 average(float range, inout float h) {\n"); + // Compute a random rangle and distance + GLSLH(float dist = rand(h) * range; h = permute(h);) + GLSLH(float dir = rand(h) * 6.2831853; h = permute(h);) + GLSLH(vec2 o = dist * vec2(cos(dir), sin(dir));) + + // Sample at quarter-turn intervals around the source pixel + GLSLH(vec4 ref[4];) + GLSLH(ref[0] = HOOKED_texOff(vec2( o.x, o.y));) + GLSLH(ref[1] = HOOKED_texOff(vec2(-o.y, o.x));) + GLSLH(ref[2] = HOOKED_texOff(vec2(-o.x, -o.y));) + GLSLH(ref[3] = HOOKED_texOff(vec2( o.y, -o.x));) + + // Return the (normalized) average + GLSLH(return (ref[0] + ref[1] + ref[2] + ref[3])*0.25;) + GLSLHF("}\n"); + + // Sample the source pixel + GLSL(color = HOOKED_tex(HOOKED_pos);) + GLSLF("vec4 avg, diff;\n"); + for (int i = 1; i <= opts->iterations; i++) { + // Sample the average pixel and use it instead of the original if + // the difference is below the given threshold + GLSLF("avg = average(%f, h);\n", i * opts->range); + GLSL(diff = abs(color - avg);) + GLSLF("color = mix(avg, color, greaterThan(diff, vec4(%f)));\n", + opts->threshold / (i * 16384.0)); + } + + // Add some random noise to smooth out residual differences + GLSL(vec3 noise;) + GLSL(noise.x = rand(h); h = permute(h);) + GLSL(noise.y = rand(h); h = permute(h);) + GLSL(noise.z = rand(h); h = permute(h);) + + // Noise is scaled to the signal level to prevent extreme noise for HDR + float gain = opts->grain/8192.0 / mp_trc_nom_peak(trc); + GLSLF("color.xyz += %f * (noise - vec3(0.5));\n", gain); + GLSLF("}\n"); +} + +// Assumes the texture was hooked +void pass_sample_unsharp(struct gl_shader_cache *sc, float param) { + GLSLF("{\n"); + GLSL(float st1 = 1.2;) + GLSL(vec4 p = HOOKED_tex(HOOKED_pos);) + GLSL(vec4 sum1 = HOOKED_texOff(st1 * vec2(+1, +1)) + + HOOKED_texOff(st1 * vec2(+1, -1)) + + HOOKED_texOff(st1 * vec2(-1, +1)) + + HOOKED_texOff(st1 * vec2(-1, -1));) + GLSL(float st2 = 1.5;) + GLSL(vec4 sum2 = HOOKED_texOff(st2 * vec2(+1, 0)) + + HOOKED_texOff(st2 * vec2( 0, +1)) + + 
+                     HOOKED_texOff(st2 * vec2(-1, 0)) +
+                     HOOKED_texOff(st2 * vec2( 0, -1));)
+    GLSL(vec4 t = p * 0.859375 + sum2 * -0.1171875 + sum1 * -0.09765625;)
+    GLSLF("color = p + t * %f;\n", param);
+    GLSLF("}\n");
+}
diff --git a/video/out/gpu/video_shaders.h b/video/out/gpu/video_shaders.h
new file mode 100644
index 0000000000..8345e4c598
--- /dev/null
+++ b/video/out/gpu/video_shaders.h
@@ -0,0 +1,56 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MP_GL_VIDEO_SHADERS_H
+#define MP_GL_VIDEO_SHADERS_H
+
+#include <libavutil/lfg.h>
+
+#include "utils.h"
+#include "video.h"
+
+extern const struct deband_opts deband_opts_def;
+extern const struct m_sub_options deband_conf;
+
+void sampler_prelude(struct gl_shader_cache *sc, int tex_num);
+void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler,
+                               int d_x, int d_y);
+void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
+                       int components, int glsl_version);
+void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
+                        int components, int bw, int bh, int iw, int ih);
+void pass_sample_bicubic_fast(struct gl_shader_cache *sc);
+void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
+                            int w, int h);
+
+void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
+void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
+void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak);
+void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak);
+
+void pass_color_map(struct gl_shader_cache *sc,
+                    struct mp_colorspace src, struct mp_colorspace dst,
+                    enum tone_mapping algo, float tone_mapping_param,
+                    float tone_mapping_desat, bool use_detected_peak,
+                    bool gamut_warning, bool is_linear);
+
+void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts,
+                        AVLFG *lfg, enum mp_csp_trc trc);
+
+void pass_sample_unsharp(struct gl_shader_cache *sc, float param);
+
+#endif
diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h
index 7b2e3ed497..b9f582b79f 100644
--- a/video/out/opengl/common.h
+++ b/video/out/opengl/common.h
@@ -26,10 +26,10 @@
 #include "common/msg.h"
 #include "misc/bstr.h"
-#include "video/out/vo.h"
 #include "video/csputils.h"
-
 #include "video/mp_image.h"
+#include "video/out/vo.h"
+#include "video/out/gpu/ra.h"
 
 #include "gl_headers.h"
diff --git a/video/out/opengl/context.c b/video/out/opengl/context.c
index fe454e9741..d3cdcac3b7 100644
--- a/video/out/opengl/context.c
+++ b/video/out/opengl/context.c
@@ -1,10 +1,4 @@
 /*
- * common OpenGL routines
- *
- * copyleft (C) 2005-2010 Reimar Döffinger
- * Special thanks go to the xine team and Matthias Hopf, whose video_out_opengl.c
- * gave me lots of good ideas.
- *
  * This file is part of mpv.
  *
  * mpv is free software; you can redistribute it and/or
@@ -21,73 +15,10 @@
  * License along with mpv. If not, see <http://www.gnu.org/licenses/>.
*/ -#include -#include -#include -#include -#include -#include -#include - +#include "options/m_config.h" #include "context.h" -#include "common/common.h" -#include "options/options.h" -#include "options/m_option.h" - -extern const struct mpgl_driver mpgl_driver_x11; -extern const struct mpgl_driver mpgl_driver_x11egl; -extern const struct mpgl_driver mpgl_driver_x11_probe; -extern const struct mpgl_driver mpgl_driver_drm_egl; -extern const struct mpgl_driver mpgl_driver_drm; -extern const struct mpgl_driver mpgl_driver_cocoa; -extern const struct mpgl_driver mpgl_driver_wayland; -extern const struct mpgl_driver mpgl_driver_w32; -extern const struct mpgl_driver mpgl_driver_angle; -extern const struct mpgl_driver mpgl_driver_angle_es2; -extern const struct mpgl_driver mpgl_driver_dxinterop; -extern const struct mpgl_driver mpgl_driver_rpi; -extern const struct mpgl_driver mpgl_driver_mali; -extern const struct mpgl_driver mpgl_driver_vdpauglx; - -static const struct mpgl_driver *const backends[] = { -#if HAVE_RPI - &mpgl_driver_rpi, -#endif -#if HAVE_GL_COCOA - &mpgl_driver_cocoa, -#endif -#if HAVE_EGL_ANGLE_WIN32 - &mpgl_driver_angle, -#endif -#if HAVE_GL_WIN32 - &mpgl_driver_w32, -#endif -#if HAVE_GL_DXINTEROP - &mpgl_driver_dxinterop, -#endif -#if HAVE_GL_X11 - &mpgl_driver_x11_probe, -#endif -#if HAVE_EGL_X11 - &mpgl_driver_x11egl, -#endif -#if HAVE_GL_X11 - &mpgl_driver_x11, -#endif -#if HAVE_GL_WAYLAND - &mpgl_driver_wayland, -#endif -#if HAVE_EGL_DRM - &mpgl_driver_drm, - &mpgl_driver_drm_egl, -#endif -#if HAVE_MALI_FBDEV - &mpgl_driver_mali, -#endif -#if HAVE_VDPAU_GL_X11 - &mpgl_driver_vdpauglx, -#endif -}; +#include "ra_gl.h" +#include "utils.h" // 0-terminated list of desktop GL versions a backend should try to // initialize. The first entry is the most preferred version. 
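[Editor's sketch, not part of the patch: the 0-terminated version list described above is meant to be walked by a context backend, filtering each candidate through ra_gl_ctx_test_version() before attempting creation; context_glx.c further down in this commit does exactly this. try_create_context() below is a hypothetical placeholder for the platform-specific creation code.]

    static bool probe_gl_versions(struct ra_ctx *ctx, GL *gl)
    {
        for (int n = 0; mpgl_preferred_gl_versions[n]; n++) {
            int ver = mpgl_preferred_gl_versions[n];
            // Skip versions rejected by --opengl-es / --opengl-restrict
            if (!ra_gl_ctx_test_version(ctx, ver, false))
                continue;
            if (try_create_context(ctx, gl, ver)) // hypothetical helper
                return true;
        }
        return false; // the caller may still retry with a GLES version
    }
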
@@ -103,140 +34,319 @@ const int mpgl_preferred_gl_versions[] = { 0 }; -int mpgl_find_backend(const char *name) +enum { + FLUSH_NO = 0, + FLUSH_YES, + FLUSH_AUTO, +}; + +enum { + GLES_AUTO = 0, + GLES_YES, + GLES_NO, +}; + +struct opengl_opts { + int use_glfinish; + int waitvsync; + int vsync_pattern[2]; + int swapinterval; + int early_flush; + int restrict_version; + int gles_mode; +}; + +#define OPT_BASE_STRUCT struct opengl_opts +const struct m_sub_options opengl_conf = { + .opts = (const struct m_option[]) { + OPT_FLAG("opengl-glfinish", use_glfinish, 0), + OPT_FLAG("opengl-waitvsync", waitvsync, 0), + OPT_INT("opengl-swapinterval", swapinterval, 0), + OPT_INTPAIR("opengl-check-pattern", vsync_pattern, 0), + OPT_INT("opengl-restrict", restrict_version, 0), + OPT_CHOICE("opengl-es", gles_mode, 0, + ({"auto", GLES_AUTO}, {"yes", GLES_YES}, {"no", GLES_NO})), + OPT_CHOICE("opengl-early-flush", early_flush, 0, + ({"no", FLUSH_NO}, {"yes", FLUSH_YES}, {"auto", FLUSH_AUTO})), + + OPT_REPLACED("opengl-debug", "gpu-debug"), + OPT_REPLACED("opengl-sw", "gpu-sw"), + OPT_REPLACED("opengl-vsync-fences", "swapchain-depth"), + OPT_REPLACED("opengl-backend", "gpu-context"), + {0}, + }, + .defaults = &(const struct opengl_opts) { + .swapinterval = 1, + }, + .size = sizeof(struct opengl_opts), +}; + +struct priv { + GL *gl; + struct mp_log *log; + struct ra_gl_ctx_params params; + struct opengl_opts *opts; + struct ra_swapchain_fns fns; + GLuint main_fb; + struct ra_tex *wrapped_fb; // corresponds to main_fb + // for debugging: + int frames_rendered; + unsigned int prev_sgi_sync_count; + // for gl_vsync_pattern + int last_pattern; + int matches, mismatches; + // for swapchain_depth simulation + GLsync *vsync_fences; + int num_vsync_fences; +}; + +bool ra_gl_ctx_test_version(struct ra_ctx *ctx, int version, bool es) { - if (name == NULL || strcmp(name, "auto") == 0) - return -1; - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) { - if (strcmp(backends[n]->name, name) == 0) - return n; + bool ret; + struct opengl_opts *opts; + void *tmp = talloc_new(NULL); + opts = mp_get_config_group(tmp, ctx->global, &opengl_conf); + + // Version too high + if (opts->restrict_version && version >= opts->restrict_version) { + ret = false; + goto done; } - return -2; -} -int mpgl_validate_backend_opt(struct mp_log *log, const struct m_option *opt, - struct bstr name, struct bstr param) -{ - if (bstr_equals0(param, "help")) { - mp_info(log, "OpenGL windowing backends:\n"); - mp_info(log, " auto (autodetect)\n"); - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) - mp_info(log, " %s\n", backends[n]->name); - return M_OPT_EXIT; + switch (opts->gles_mode) { + case GLES_YES: ret = es; goto done; + case GLES_NO: ret = !es; goto done; + case GLES_AUTO: ret = true; goto done; + default: abort(); } - char s[20]; - snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); - return mpgl_find_backend(s) >= -1 ? 1 : M_OPT_INVALID; + +done: + talloc_free(tmp); + return ret; } -static void *get_native_display(void *pctx, const char *name) +static void *get_native_display(void *priv, const char *name) { - MPGLContext *ctx = pctx; - if (!ctx->native_display_type || !name) + struct priv *p = priv; + if (!p->params.native_display_type || !name) + return NULL; + if (strcmp(p->params.native_display_type, name) != 0) return NULL; - return strcmp(ctx->native_display_type, name) == 0 ? 
ctx->native_display : NULL; + + return p->params.native_display; } -static MPGLContext *init_backend(struct vo *vo, const struct mpgl_driver *driver, - bool probing, int vo_flags) +void ra_gl_ctx_uninit(struct ra_ctx *ctx) { - MPGLContext *ctx = talloc_ptrtype(NULL, ctx); - *ctx = (MPGLContext) { - .gl = talloc_zero(ctx, GL), - .vo = vo, - .global = vo->global, - .driver = driver, - .log = vo->log, + if (ctx->ra) + ctx->ra->fns->destroy(ctx->ra); + if (ctx->swapchain) { + talloc_free(ctx->swapchain); + ctx->swapchain = NULL; + } +} + +static const struct ra_swapchain_fns ra_gl_swapchain_fns; + +bool ra_gl_ctx_init(struct ra_ctx *ctx, GL *gl, struct ra_gl_ctx_params params) +{ + struct ra_swapchain *sw = ctx->swapchain = talloc_ptrtype(NULL, sw); + *sw = (struct ra_swapchain) { + .ctx = ctx, + .flip_v = !params.flipped, // OpenGL framebuffers are normally inverted }; - if (probing) - vo_flags |= VOFLAG_PROBING; - bool old_probing = vo->probing; - vo->probing = probing; // hack; kill it once backends are separate - MP_VERBOSE(vo, "Initializing OpenGL backend '%s'\n", ctx->driver->name); - ctx->priv = talloc_zero_size(ctx, ctx->driver->priv_size); - if (ctx->driver->init(ctx, vo_flags) < 0) { - vo->probing = old_probing; - talloc_free(ctx); - return NULL; + + struct priv *p = sw->priv = talloc_ptrtype(sw, p); + *p = (struct priv) { + .gl = gl, + .log = ctx->log, + .params = params, + .opts = mp_get_config_group(p, ctx->global, &opengl_conf), + .fns = ra_gl_swapchain_fns, + }; + + sw->fns = &p->fns; + + const struct ra_swapchain_fns *ext = p->params.external_swapchain; + if (ext) { + if (ext->color_depth) + p->fns.color_depth = ext->color_depth; + if (ext->screenshot) + p->fns.screenshot = ext->screenshot; + if (ext->start_frame) + p->fns.start_frame = ext->start_frame; + if (ext->submit_frame) + p->fns.submit_frame = ext->submit_frame; + if (ext->swap_buffers) + p->fns.swap_buffers = ext->swap_buffers; } - vo->probing = old_probing; - if (!ctx->gl->version && !ctx->gl->es) - goto cleanup; + if (!gl->version && !gl->es) + return false; - if (probing && ctx->gl->es && (vo_flags & VOFLAG_NO_GLES)) { - MP_VERBOSE(ctx->vo, "Skipping GLES backend.\n"); - goto cleanup; + if (gl->mpgl_caps & MPGL_CAP_SW) { + MP_WARN(p, "Suspected software renderer or indirect context.\n"); + if (ctx->opts.probing && !ctx->opts.allow_sw) + return false; } - if (ctx->gl->mpgl_caps & MPGL_CAP_SW) { - MP_WARN(ctx->vo, "Suspected software renderer or indirect context.\n"); - if (vo->probing && !(vo_flags & VOFLAG_SW)) - goto cleanup; + gl->debug_context = ctx->opts.debug; + gl->get_native_display_ctx = p; + gl->get_native_display = get_native_display; + + if (gl->SwapInterval) { + gl->SwapInterval(p->opts->swapinterval); + } else { + MP_VERBOSE(p, "GL_*_swap_control extension missing.\n"); } - ctx->gl->debug_context = !!(vo_flags & VOFLAG_GL_DEBUG); + ctx->ra = ra_create_gl(p->gl, ctx->log); + return !!ctx->ra; +} - ctx->gl->get_native_display_ctx = ctx; - ctx->gl->get_native_display = get_native_display; +void ra_gl_ctx_resize(struct ra_swapchain *sw, int w, int h, int fbo) +{ + struct priv *p = sw->priv; + if (p->main_fb == fbo && p->wrapped_fb && p->wrapped_fb->params.w == w + && p->wrapped_fb->params.h == h) + return; - return ctx; + if (p->wrapped_fb) + ra_tex_free(sw->ctx->ra, &p->wrapped_fb); -cleanup: - mpgl_uninit(ctx); - return NULL; + p->main_fb = fbo; + p->wrapped_fb = ra_create_wrapped_fb(sw->ctx->ra, fbo, w, h); } -// Create a VO window and create a GL context on it. 
-// vo_flags: passed to the backend's create window function -MPGLContext *mpgl_init(struct vo *vo, const char *backend_name, int vo_flags) +int ra_gl_ctx_color_depth(struct ra_swapchain *sw) { - MPGLContext *ctx = NULL; - int index = mpgl_find_backend(backend_name); - if (index == -1) { - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) { - ctx = init_backend(vo, backends[n], true, vo_flags); - if (ctx) - break; - } - // VO forced, but no backend is ok => force the first that works at all - if (!ctx && !vo->probing) { - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) { - ctx = init_backend(vo, backends[n], false, vo_flags); - if (ctx) - break; - } - } - } else if (index >= 0) { - ctx = init_backend(vo, backends[index], false, vo_flags); - } - return ctx; + struct priv *p = sw->priv; + GL *gl = p->gl; + + if (!p->wrapped_fb) + return 0; + + if ((gl->es < 300 && !gl->version) || !(gl->mpgl_caps & MPGL_CAP_FB)) + return 0; + + gl->BindFramebuffer(GL_FRAMEBUFFER, p->main_fb); + + GLenum obj = gl->version ? GL_BACK_LEFT : GL_BACK; + if (p->main_fb) + obj = GL_COLOR_ATTACHMENT0; + + GLint depth_g = 0; + + gl->GetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &depth_g); + + gl->BindFramebuffer(GL_FRAMEBUFFER, 0); + + return depth_g; } -int mpgl_reconfig_window(struct MPGLContext *ctx) +struct mp_image *ra_gl_ctx_screenshot(struct ra_swapchain *sw) { - return ctx->driver->reconfig(ctx); + struct priv *p = sw->priv; + + assert(p->wrapped_fb); + struct mp_image *screen = gl_read_fbo_contents(p->gl, p->main_fb, + p->wrapped_fb->params.w, + p->wrapped_fb->params.h); + + // OpenGL FB is also read in flipped order, so we need to flip when the + // rendering is *not* flipped, which in our case is whenever + // p->params.flipped is true. 
I hope that made sense + if (p->params.flipped) + mp_image_vflip(screen); + + return screen; } -int mpgl_control(struct MPGLContext *ctx, int *events, int request, void *arg) +struct ra_tex *ra_gl_ctx_start_frame(struct ra_swapchain *sw) { - return ctx->driver->control(ctx, events, request, arg); + struct priv *p = sw->priv; + + return p->wrapped_fb; } -void mpgl_start_frame(struct MPGLContext *ctx) +bool ra_gl_ctx_submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame) { - if (ctx->driver->start_frame) - ctx->driver->start_frame(ctx); + struct priv *p = sw->priv; + GL *gl = p->gl; + + if (p->opts->use_glfinish) + gl->Finish(); + + if (gl->FenceSync && !p->params.external_swapchain) { + GLsync fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + if (fence) + MP_TARRAY_APPEND(p, p->vsync_fences, p->num_vsync_fences, fence); + } + + switch (p->opts->early_flush) { + case FLUSH_AUTO: + if (frame->display_synced) + break; + // fall through + case FLUSH_YES: + gl->Flush(); + } + + return true; } -void mpgl_swap_buffers(struct MPGLContext *ctx) +static void check_pattern(struct priv *p, int item) { - ctx->driver->swap_buffers(ctx); + int expected = p->opts->vsync_pattern[p->last_pattern]; + if (item == expected) { + p->last_pattern++; + if (p->last_pattern >= 2) + p->last_pattern = 0; + p->matches++; + } else { + p->mismatches++; + MP_WARN(p, "wrong pattern, expected %d got %d (hit: %d, mis: %d)\n", + expected, item, p->matches, p->mismatches); + } } -void mpgl_uninit(MPGLContext *ctx) +void ra_gl_ctx_swap_buffers(struct ra_swapchain *sw) { - if (ctx) - ctx->driver->uninit(ctx); - talloc_free(ctx); + struct priv *p = sw->priv; + GL *gl = p->gl; + + p->params.swap_buffers(sw->ctx); + p->frames_rendered++; + + if (p->frames_rendered > 5 && !sw->ctx->opts.debug) + ra_gl_set_debug(sw->ctx->ra, false); + + if ((p->opts->waitvsync || p->opts->vsync_pattern[0]) + && gl->GetVideoSync) + { + unsigned int n1 = 0, n2 = 0; + gl->GetVideoSync(&n1); + if (p->opts->waitvsync) + gl->WaitVideoSync(2, (n1 + 1) % 2, &n2); + int step = n1 - p->prev_sgi_sync_count; + p->prev_sgi_sync_count = n1; + MP_DBG(p, "Flip counts: %u->%u, step=%d\n", n1, n2, step); + if (p->opts->vsync_pattern[0]) + check_pattern(p, step); + } + + while (p->num_vsync_fences >= sw->ctx->opts.swapchain_depth) { + gl->ClientWaitSync(p->vsync_fences[0], GL_SYNC_FLUSH_COMMANDS_BIT, 1e9); + gl->DeleteSync(p->vsync_fences[0]); + MP_TARRAY_REMOVE_AT(p->vsync_fences, p->num_vsync_fences, 0); + } } + +static const struct ra_swapchain_fns ra_gl_swapchain_fns = { + .color_depth = ra_gl_ctx_color_depth, + .screenshot = ra_gl_ctx_screenshot, + .start_frame = ra_gl_ctx_start_frame, + .submit_frame = ra_gl_ctx_submit_frame, + .swap_buffers = ra_gl_ctx_swap_buffers, +}; diff --git a/video/out/opengl/context.h b/video/out/opengl/context.h index 229c5ef54f..bdf426b9b4 100644 --- a/video/out/opengl/context.h +++ b/video/out/opengl/context.h @@ -1,116 +1,56 @@ -/* - * common OpenGL routines - * - * copyleft (C) 2005-2010 Reimar Döffinger - * Special thanks go to the xine team and Matthias Hopf, whose video_out_opengl.c - * gave me lots of good ideas. - * - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#ifndef MP_GL_CONTEXT_H_ -#define MP_GL_CONTEXT_H_ +#pragma once +#include "common/global.h" +#include "video/out/gpu/context.h" #include "common.h" -enum { - VOFLAG_GLES = 1 << 0, // Hint to create a GLES context - VOFLAG_NO_GLES = 1 << 1, // Hint to create a desktop GL context - VOFLAG_GL_DEBUG = 1 << 2, // Hint to request debug OpenGL context - VOFLAG_ALPHA = 1 << 3, // Hint to request alpha framebuffer - VOFLAG_SW = 1 << 4, // Hint to accept a software GL renderer - VOFLAG_PROBING = 1 << 6, // The backend is being auto-probed. - VOFLAG_GLES2 = 1 << 7, // Hint for GLESv2 (needs VOFLAG_GLES) -}; - extern const int mpgl_preferred_gl_versions[]; -struct MPGLContext; - -// A windowing backend (like X11, win32, ...), which provides OpenGL rendering. -struct mpgl_driver { - const char *name; - - // Size of the struct allocated for MPGLContext.priv - int priv_size; - - // Init the GL context and possibly the underlying VO backend. - // The created context should be compatible to GL 3.2 core profile, but - // some other GL versions are supported as well (e.g. GL 2.1 or GLES 2). - // Return 0 on success, negative value (-1) on error. - int (*init)(struct MPGLContext *ctx, int vo_flags); - - // Resize the window, or create a new window if there isn't one yet. - // Currently, there is an unfortunate interaction with ctx->vo, and - // display size etc. are determined by it. - // Return 0 on success, negative value (-1) on error. - int (*reconfig)(struct MPGLContext *ctx); - - // Called when rendering starts. The backend can map or resize the - // framebuffer, or update GL.main_fb. swap_buffers() ends the frame. - // Optional. - void (*start_frame)(struct MPGLContext *ctx); - - // Present the frame. - void (*swap_buffers)(struct MPGLContext *ctx); - - // This behaves exactly like vo_driver.control(). - int (*control)(struct MPGLContext *ctx, int *events, int request, void *arg); - - // These behave exactly like vo_driver.wakeup/wait_events. They are - // optional. - void (*wakeup)(struct MPGLContext *ctx); - void (*wait_events)(struct MPGLContext *ctx, int64_t until_time_us); - - // Destroy the GL context and possibly the underlying VO backend. - void (*uninit)(struct MPGLContext *ctx); -}; - -typedef struct MPGLContext { - GL *gl; - struct vo *vo; - const struct mpgl_driver *driver; - struct mpv_global *global; - struct mp_log *log; - - // For hwdec_vaegl.c. +// Returns whether or not a candidate GL version should be accepted or not +// (based on the --opengl opts). Implementations may call this before +// ra_gl_ctx_init if they wish to probe for multiple possible GL versions. +bool ra_gl_ctx_test_version(struct ra_ctx *ctx, int version, bool es); + +// These are a set of helpers for ra_ctx providers based on ra_gl. +// The init function also initializes ctx->ra and ctx->swapchain, so the user +// doesn't have to do this manually. (Similarly, the uninit function will +// clean them up) + +struct ra_gl_ctx_params { + // Set to the platform-specific function to swap buffers, like + // glXSwapBuffers, eglSwapBuffers etc. This will be called by + // ra_gl_ctx_swap_buffers. 
Required unless you either never call that
+    // function or override it yourself.
+    void (*swap_buffers)(struct ra_ctx *ctx);
+
+    // Set to false if the implementation follows normal GL semantics, which is
+    // upside down. Set to true if it does *not*, i.e. if rendering is right
+    // side up.
+    bool flipped;
+
+    // If this is set to non-NULL, then the ra_gl_ctx will consider the GL
+    // implementation to be using an external swapchain, which disables the
+    // software simulation of --swapchain-depth. Any functions defined by this
+    // ra_swapchain_fns struct will entirely replace the equivalent ra_gl_ctx
+    // functions in the resulting ra_swapchain.
+    const struct ra_swapchain_fns *external_swapchain;
+
+    // For hwdec_vaegl.c:
     const char *native_display_type;
     void *native_display;
+};
-    // Flip the rendered image vertically. This is useful for dxinterop.
-    bool flip_v;
-
-    // framebuffer to render to (normally 0)
-    GLuint main_fb;
-
-    // For free use by the mpgl_driver.
-    void *priv;
-} MPGLContext;
-MPGLContext *mpgl_init(struct vo *vo, const char *backend_name, int vo_flags);
-void mpgl_uninit(MPGLContext *ctx);
-int mpgl_reconfig_window(struct MPGLContext *ctx);
-int mpgl_control(struct MPGLContext *ctx, int *events, int request, void *arg);
-void mpgl_start_frame(struct MPGLContext *ctx);
-void mpgl_swap_buffers(struct MPGLContext *ctx);
-
-int mpgl_find_backend(const char *name);
+void ra_gl_ctx_uninit(struct ra_ctx *ctx);
+bool ra_gl_ctx_init(struct ra_ctx *ctx, GL *gl, struct ra_gl_ctx_params params);
-struct m_option;
-int mpgl_validate_backend_opt(struct mp_log *log, const struct m_option *opt,
-                              struct bstr name, struct bstr param);
+// Call this any time the window size or main framebuffer changes
+void ra_gl_ctx_resize(struct ra_swapchain *sw, int w, int h, int fbo);
-#endif
+// These functions are normally set in the ra_swapchain->fns, but if an
+// implementation has a need to override this fns struct with custom functions
+// for whatever reason, these can be used to inherit the original behavior.
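[Editor's sketch, not part of the patch: a stripped-down backend using the helpers declared above. The my_* names are hypothetical placeholders; the real ports below (context_mali_fbdev.c, context_wayland.c, context_rpi.c) follow the same shape.]

    static void my_swap_buffers(struct ra_ctx *ctx)
    {
        // e.g. eglSwapBuffers()/glXSwapBuffers() for the platform in question
        my_platform_present(ctx);
    }

    static bool my_init(struct ra_ctx *ctx)
    {
        GL *gl = my_platform_create_gl(ctx); // create GL context + make current
        if (!gl)
            return false;

        struct ra_gl_ctx_params params = {
            .swap_buffers = my_swap_buffers,
        };
        // On success this fills in ctx->ra and ctx->swapchain
        return ra_gl_ctx_init(ctx, gl, params);
    }

    static bool my_reconfig(struct ra_ctx *ctx)
    {
        my_platform_resize_window(ctx);
        ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0);
        return true;
    }

    static void my_uninit(struct ra_ctx *ctx)
    {
        ra_gl_ctx_uninit(ctx); // tears down ctx->ra and ctx->swapchain again
        my_platform_destroy(ctx);
    }
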
+int ra_gl_ctx_color_depth(struct ra_swapchain *sw); +struct mp_image *ra_gl_ctx_screenshot(struct ra_swapchain *sw); +struct ra_tex *ra_gl_ctx_start_frame(struct ra_swapchain *sw); +bool ra_gl_ctx_submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame); +void ra_gl_ctx_swap_buffers(struct ra_swapchain *sw); diff --git a/video/out/opengl/context_cocoa.c b/video/out/opengl/context_cocoa.c index 1d9a10cf38..cdf6faffcd 100644 --- a/video/out/opengl/context_cocoa.c +++ b/video/out/opengl/context_cocoa.c @@ -188,4 +188,4 @@ const struct mpgl_driver mpgl_driver_cocoa = { .swap_buffers = cocoa_swap_buffers, .control = cocoa_control, .uninit = cocoa_uninit, -}; \ No newline at end of file +}; diff --git a/video/out/opengl/context_drm_egl.c b/video/out/opengl/context_drm_egl.c index e52fec451b..21b16a52d5 100644 --- a/video/out/opengl/context_drm_egl.c +++ b/video/out/opengl/context_drm_egl.c @@ -28,10 +28,12 @@ #include #include -#include "context.h" -#include "egl_helpers.h" -#include "common/common.h" #include "video/out/drm_common.h" +#include "common/common.h" + +#include "egl_helpers.h" +#include "common.h" +#include "context.h" #define USE_MASTER 0 @@ -59,6 +61,7 @@ struct egl }; struct priv { + GL gl; struct kms *kms; drmEventContext ev; @@ -75,34 +78,33 @@ struct priv { struct vt_switcher vt_switcher; }; -static bool init_egl(struct MPGLContext *ctx, int flags) +static bool init_egl(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - MP_VERBOSE(ctx->vo, "Initializing EGL\n"); + MP_VERBOSE(ctx, "Initializing EGL\n"); p->egl.display = eglGetDisplay(p->gbm.device); if (p->egl.display == EGL_NO_DISPLAY) { - MP_ERR(ctx->vo, "Failed to get EGL display.\n"); + MP_ERR(ctx, "Failed to get EGL display.\n"); return false; } if (!eglInitialize(p->egl.display, NULL, NULL)) { - MP_ERR(ctx->vo, "Failed to initialize EGL.\n"); + MP_ERR(ctx, "Failed to initialize EGL.\n"); return false; } EGLConfig config; - if (!mpegl_create_context(p->egl.display, ctx->vo->log, flags, - &p->egl.context, &config)) - return -1; - MP_VERBOSE(ctx->vo, "Initializing EGL surface\n"); + if (!mpegl_create_context(ctx, p->egl.display, &p->egl.context, &config)) + return false; + MP_VERBOSE(ctx, "Initializing EGL surface\n"); p->egl.surface = eglCreateWindowSurface(p->egl.display, config, p->gbm.surface, NULL); if (p->egl.surface == EGL_NO_SURFACE) { - MP_ERR(ctx->vo, "Failed to create EGL surface.\n"); + MP_ERR(ctx, "Failed to create EGL surface.\n"); return false; } return true; } -static bool init_gbm(struct MPGLContext *ctx) +static bool init_gbm(struct ra_ctx *ctx) { struct priv *p = ctx->priv; MP_VERBOSE(ctx->vo, "Creating GBM device\n"); @@ -136,7 +138,7 @@ static void framebuffer_destroy_callback(struct gbm_bo *bo, void *data) } static void update_framebuffer_from_bo( - const struct MPGLContext *ctx, struct gbm_bo *bo) + const struct ra_ctx *ctx, struct gbm_bo *bo) { struct priv *p = ctx->priv; p->fb.bo = bo; @@ -161,7 +163,7 @@ static void page_flipped(int fd, unsigned int frame, unsigned int sec, p->waiting_for_flip = false; } -static bool crtc_setup(struct MPGLContext *ctx) +static bool crtc_setup(struct ra_ctx *ctx) { struct priv *p = ctx->priv; if (p->active) @@ -174,7 +176,7 @@ static bool crtc_setup(struct MPGLContext *ctx) return ret == 0; } -static void crtc_release(struct MPGLContext *ctx) +static void crtc_release(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -204,7 +206,7 @@ static void crtc_release(struct MPGLContext *ctx) static void release_vt(void *data) { - struct MPGLContext *ctx = 
data; + struct ra_ctx *ctx = data; MP_VERBOSE(ctx->vo, "Releasing VT"); crtc_release(ctx); if (USE_MASTER) { @@ -221,7 +223,7 @@ static void release_vt(void *data) static void acquire_vt(void *data) { - struct MPGLContext *ctx = data; + struct ra_ctx *ctx = data; MP_VERBOSE(ctx->vo, "Acquiring VT"); if (USE_MASTER) { struct priv *p = ctx->priv; @@ -234,11 +236,41 @@ static void acquire_vt(void *data) crtc_setup(ctx); } -static void drm_egl_uninit(MPGLContext *ctx) +static void drm_egl_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - crtc_release(ctx); + eglSwapBuffers(p->egl.display, p->egl.surface); + p->gbm.next_bo = gbm_surface_lock_front_buffer(p->gbm.surface); + p->waiting_for_flip = true; + update_framebuffer_from_bo(ctx, p->gbm.next_bo); + int ret = drmModePageFlip(p->kms->fd, p->kms->crtc_id, p->fb.id, + DRM_MODE_PAGE_FLIP_EVENT, p); + if (ret) { + MP_WARN(ctx->vo, "Failed to queue page flip: %s\n", mp_strerror(errno)); + } + + // poll page flip finish event + const int timeout_ms = 3000; + struct pollfd fds[1] = { { .events = POLLIN, .fd = p->kms->fd } }; + poll(fds, 1, timeout_ms); + if (fds[0].revents & POLLIN) { + ret = drmHandleEvent(p->kms->fd, &p->ev); + if (ret != 0) { + MP_ERR(ctx->vo, "drmHandleEvent failed: %i\n", ret); + return; + } + } + + gbm_surface_release_buffer(p->gbm.surface, p->gbm.bo); + p->gbm.bo = p->gbm.next_bo; +} +static void drm_egl_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + + crtc_release(ctx); if (p->vt_switcher_active) vt_switcher_destroy(&p->vt_switcher); @@ -258,19 +290,14 @@ static void drm_egl_uninit(MPGLContext *ctx) } } -static int drm_egl_init(struct MPGLContext *ctx, int flags) +static bool drm_egl_init(struct ra_ctx *ctx) { - if (ctx->vo->probing) { - MP_VERBOSE(ctx->vo, "DRM EGL backend can be activated only manually.\n"); - return -1; + if (ctx->opts.probing) { + MP_VERBOSE(ctx, "DRM EGL backend can be activated only manually.\n"); + return false; } - struct priv *p = ctx->priv; - p->kms = NULL; - p->old_crtc = NULL; - p->gbm.surface = NULL; - p->gbm.device = NULL; - p->active = false; - p->waiting_for_flip = false; + + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); p->ev.version = DRM_EVENT_CONTEXT_VERSION; p->ev.page_flip_handler = page_flipped; @@ -279,79 +306,76 @@ static int drm_egl_init(struct MPGLContext *ctx, int flags) vt_switcher_acquire(&p->vt_switcher, acquire_vt, ctx); vt_switcher_release(&p->vt_switcher, release_vt, ctx); } else { - MP_WARN(ctx->vo, "Failed to set up VT switcher. Terminal switching will be unavailable.\n"); + MP_WARN(ctx, "Failed to set up VT switcher. 
Terminal switching will be unavailable.\n"); } - MP_VERBOSE(ctx->vo, "Initializing KMS\n"); - p->kms = kms_create(ctx->vo->log, ctx->vo->opts->drm_connector_spec, + MP_VERBOSE(ctx, "Initializing KMS\n"); + p->kms = kms_create(ctx->log, ctx->vo->opts->drm_connector_spec, ctx->vo->opts->drm_mode_id); if (!p->kms) { MP_ERR(ctx->vo, "Failed to create KMS.\n"); - return -1; + return false; } if (!init_gbm(ctx)) { MP_ERR(ctx->vo, "Failed to setup GBM.\n"); - return -1; + return false; } - if (!init_egl(ctx, flags)) { + if (!init_egl(ctx)) { MP_ERR(ctx->vo, "Failed to setup EGL.\n"); - return -1; + return false; } if (!eglMakeCurrent(p->egl.display, p->egl.surface, p->egl.surface, p->egl.context)) { MP_ERR(ctx->vo, "Failed to make context current.\n"); - return -1; + return false; } - mpegl_load_functions(ctx->gl, ctx->vo->log); - - ctx->native_display_type = "drm"; - ctx->native_display = (void *)(intptr_t)p->kms->fd; - + mpegl_load_functions(&p->gl, ctx->vo->log); // required by gbm_surface_lock_front_buffer eglSwapBuffers(p->egl.display, p->egl.surface); - MP_VERBOSE(ctx->vo, "Preparing framebuffer\n"); + MP_VERBOSE(ctx, "Preparing framebuffer\n"); p->gbm.bo = gbm_surface_lock_front_buffer(p->gbm.surface); if (!p->gbm.bo) { - MP_ERR(ctx->vo, "Failed to lock GBM surface.\n"); - return -1; + MP_ERR(ctx, "Failed to lock GBM surface.\n"); + return false; } update_framebuffer_from_bo(ctx, p->gbm.bo); if (!p->fb.id) { - MP_ERR(ctx->vo, "Failed to create framebuffer.\n"); - return -1; + MP_ERR(ctx, "Failed to create framebuffer.\n"); + return false; } if (!crtc_setup(ctx)) { - MP_ERR(ctx->vo, "Failed to set CRTC for connector %u: %s\n", + MP_ERR(ctx, "Failed to set CRTC for connector %u: %s\n", p->kms->connector->connector_id, mp_strerror(errno)); - return -1; + return false; } - return 0; -} + struct ra_gl_ctx_params params = { + .swap_buffers = drm_egl_swap_buffers, + .native_display_type = "drm", + .native_display = (void *)(intptr_t)p->kms->fd, + }; + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + return false; -static int drm_egl_init_deprecated(struct MPGLContext *ctx, int flags) -{ - if (ctx->vo->probing) - return -1; - MP_WARN(ctx->vo, "'drm-egl' is deprecated, use 'drm' instead.\n"); - return drm_egl_init(ctx, flags); + return true; } -static int drm_egl_reconfig(struct MPGLContext *ctx) +static bool drm_egl_reconfig(struct ra_ctx *ctx) { struct priv *p = ctx->priv; ctx->vo->dwidth = p->fb.width; ctx->vo->dheight = p->fb.height; - return 0; + ra_gl_ctx_resize(ctx->swapchain, p->fb.width, p->fb.height, 0); + return true; } -static int drm_egl_control(struct MPGLContext *ctx, int *events, int request, +static int drm_egl_control(struct ra_ctx *ctx, int *events, int request, void *arg) { struct priv *p = ctx->priv; @@ -367,51 +391,11 @@ static int drm_egl_control(struct MPGLContext *ctx, int *events, int request, return VO_NOTIMPL; } -static void drm_egl_swap_buffers(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl.display, p->egl.surface); - p->gbm.next_bo = gbm_surface_lock_front_buffer(p->gbm.surface); - p->waiting_for_flip = true; - update_framebuffer_from_bo(ctx, p->gbm.next_bo); - int ret = drmModePageFlip(p->kms->fd, p->kms->crtc_id, p->fb.id, - DRM_MODE_PAGE_FLIP_EVENT, p); - if (ret) { - MP_WARN(ctx->vo, "Failed to queue page flip: %s\n", mp_strerror(errno)); - } - - // poll page flip finish event - const int timeout_ms = 3000; - struct pollfd fds[1] = { { .events = POLLIN, .fd = p->kms->fd } }; - poll(fds, 1, timeout_ms); - if (fds[0].revents & POLLIN) { - ret = 
drmHandleEvent(p->kms->fd, &p->ev); - if (ret != 0) { - MP_ERR(ctx->vo, "drmHandleEvent failed: %i\n", ret); - return; - } - } - - gbm_surface_release_buffer(p->gbm.surface, p->gbm.bo); - p->gbm.bo = p->gbm.next_bo; -} - -const struct mpgl_driver mpgl_driver_drm = { +const struct ra_ctx_fns ra_ctx_drm_egl = { + .type = "opengl", .name = "drm", - .priv_size = sizeof(struct priv), - .init = drm_egl_init, .reconfig = drm_egl_reconfig, - .swap_buffers = drm_egl_swap_buffers, - .control = drm_egl_control, - .uninit = drm_egl_uninit, -}; - -const struct mpgl_driver mpgl_driver_drm_egl = { - .name = "drm-egl", - .priv_size = sizeof(struct priv), - .init = drm_egl_init_deprecated, - .reconfig = drm_egl_reconfig, - .swap_buffers = drm_egl_swap_buffers, .control = drm_egl_control, + .init = drm_egl_init, .uninit = drm_egl_uninit, }; diff --git a/video/out/opengl/context_glx.c b/video/out/opengl/context_glx.c new file mode 100644 index 0000000000..462f2cf592 --- /dev/null +++ b/video/out/opengl/context_glx.c @@ -0,0 +1,376 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include + +// FreeBSD 10.0-CURRENT lacks the GLX_ARB_create_context extension completely +#ifndef GLX_CONTEXT_MAJOR_VERSION_ARB +#define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091 +#define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092 +#define GLX_CONTEXT_FLAGS_ARB 0x2094 +#define GLX_CONTEXT_PROFILE_MASK_ARB 0x9126 +#ifndef __APPLE__ +// These are respectively 0x00000001 and 0x00000002 on OSX +#define GLX_CONTEXT_DEBUG_BIT_ARB 0x0001 +#define GLX_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB 0x0002 +#endif +#define GLX_CONTEXT_CORE_PROFILE_BIT_ARB 0x00000001 +#define GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB 0x00000002 +#endif +// GLX_EXT_create_context_es2_profile +#ifndef GLX_CONTEXT_ES2_PROFILE_BIT_EXT +#define GLX_CONTEXT_ES2_PROFILE_BIT_EXT 0x00000004 +#endif + +#include "video/out/x11_common.h" +#include "context.h" +#include "utils.h" + +struct priv { + GL gl; + XVisualInfo *vinfo; + GLXContext context; + GLXFBConfig fbc; +}; + +static void glx_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + + if (p->vinfo) + XFree(p->vinfo); + if (p->context) { + Display *display = ctx->vo->x11->display; + glXMakeCurrent(display, None, NULL); + glXDestroyContext(display, p->context); + } + + vo_x11_uninit(ctx->vo); +} + +static bool create_context_x11_old(struct ra_ctx *ctx, GL *gl) +{ + struct priv *p = ctx->priv; + Display *display = ctx->vo->x11->display; + struct vo *vo = ctx->vo; + + if (p->context) + return true; + + if (!p->vinfo) { + MP_FATAL(vo, "Can't create a legacy GLX context without X visual\n"); + return false; + } + + GLXContext new_context = glXCreateContext(display, p->vinfo, NULL, True); + if (!new_context) { + MP_FATAL(vo, "Could not create GLX context!\n"); + return false; + } + + if (!glXMakeCurrent(display, ctx->vo->x11->window, new_context)) { + MP_FATAL(vo, "Could not set GLX 
context!\n"); + glXDestroyContext(display, new_context); + return false; + } + + const char *glxstr = glXQueryExtensionsString(display, ctx->vo->x11->screen); + + mpgl_load_functions(gl, (void *)glXGetProcAddressARB, glxstr, vo->log); + + p->context = new_context; + + return true; +} + +typedef GLXContext (*glXCreateContextAttribsARBProc) + (Display*, GLXFBConfig, GLXContext, Bool, const int*); + +static bool create_context_x11_gl3(struct ra_ctx *ctx, GL *gl, int gl_version, + bool es) +{ + struct priv *p = ctx->priv; + struct vo *vo = ctx->vo; + + if (p->context) + return true; + + if (!ra_gl_ctx_test_version(ctx, gl_version, es)) + return false; + + glXCreateContextAttribsARBProc glXCreateContextAttribsARB = + (glXCreateContextAttribsARBProc) + glXGetProcAddressARB((const GLubyte *)"glXCreateContextAttribsARB"); + + const char *glxstr = + glXQueryExtensionsString(vo->x11->display, vo->x11->screen); + bool have_ctx_ext = glxstr && !!strstr(glxstr, "GLX_ARB_create_context"); + + if (!(have_ctx_ext && glXCreateContextAttribsARB)) { + return false; + } + + int ctx_flags = ctx->opts.debug ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; + int profile_mask = GLX_CONTEXT_CORE_PROFILE_BIT_ARB; + + if (es) { + profile_mask = GLX_CONTEXT_ES2_PROFILE_BIT_EXT; + if (!(glxstr && strstr(glxstr, "GLX_EXT_create_context_es2_profile"))) + return false; + } + + int context_attribs[] = { + GLX_CONTEXT_MAJOR_VERSION_ARB, MPGL_VER_GET_MAJOR(gl_version), + GLX_CONTEXT_MINOR_VERSION_ARB, MPGL_VER_GET_MINOR(gl_version), + GLX_CONTEXT_PROFILE_MASK_ARB, profile_mask, + GLX_CONTEXT_FLAGS_ARB, ctx_flags, + None + }; + vo_x11_silence_xlib(1); + GLXContext context = glXCreateContextAttribsARB(vo->x11->display, + p->fbc, 0, True, + context_attribs); + vo_x11_silence_xlib(-1); + if (!context) + return false; + + // set context + if (!glXMakeCurrent(vo->x11->display, vo->x11->window, context)) { + MP_FATAL(vo, "Could not set GLX context!\n"); + glXDestroyContext(vo->x11->display, context); + return false; + } + + p->context = context; + + mpgl_load_functions(gl, (void *)glXGetProcAddressARB, glxstr, vo->log); + + return true; +} + +// The GL3/FBC initialization code roughly follows/copies from: +// http://www.opengl.org/wiki/Tutorial:_OpenGL_3.0_Context_Creation_(GLX) +// but also uses some of the old code. + +static GLXFBConfig select_fb_config(struct vo *vo, const int *attribs, bool alpha) +{ + int fbcount; + GLXFBConfig *fbc = glXChooseFBConfig(vo->x11->display, vo->x11->screen, + attribs, &fbcount); + if (!fbc) + return NULL; + + // The list in fbc is sorted (so that the first element is the best). + GLXFBConfig fbconfig = fbcount > 0 ? 
fbc[0] : NULL; + + if (alpha) { + for (int n = 0; n < fbcount; n++) { + XVisualInfo *v = glXGetVisualFromFBConfig(vo->x11->display, fbc[n]); + if (v) { + bool is_rgba = vo_x11_is_rgba_visual(v); + XFree(v); + if (is_rgba) { + fbconfig = fbc[n]; + break; + } + } + } + } + + XFree(fbc); + + return fbconfig; +} + +static void set_glx_attrib(int *attribs, int name, int value) +{ + for (int n = 0; attribs[n * 2 + 0] != None; n++) { + if (attribs[n * 2 + 0] == name) { + attribs[n * 2 + 1] = value; + break; + } + } +} + +static void glx_swap_buffers(struct ra_ctx *ctx) +{ + glXSwapBuffers(ctx->vo->x11->display, ctx->vo->x11->window); +} + +static bool glx_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + struct vo *vo = ctx->vo; + GL *gl = &p->gl; + + if (!vo_x11_init(ctx->vo)) + goto uninit; + + int glx_major, glx_minor; + + if (!glXQueryVersion(vo->x11->display, &glx_major, &glx_minor)) { + MP_ERR(ctx, "GLX not found.\n"); + goto uninit; + } + // FBConfigs were added in GLX version 1.3. + if (MPGL_VER(glx_major, glx_minor) < MPGL_VER(1, 3)) { + MP_ERR(ctx, "GLX version older than 1.3.\n"); + goto uninit; + } + + int glx_attribs[] = { + GLX_X_RENDERABLE, True, + GLX_X_VISUAL_TYPE, GLX_TRUE_COLOR, + GLX_RED_SIZE, 1, + GLX_GREEN_SIZE, 1, + GLX_BLUE_SIZE, 1, + GLX_ALPHA_SIZE, 0, + GLX_DOUBLEBUFFER, True, + None + }; + GLXFBConfig fbc = NULL; + if (ctx->opts.want_alpha) { + set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 1); + fbc = select_fb_config(vo, glx_attribs, true); + if (!fbc) + set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 0); + } + if (!fbc) + fbc = select_fb_config(vo, glx_attribs, false); + if (!fbc) { + MP_ERR(ctx, "no GLX support present\n"); + goto uninit; + } + + int fbid = -1; + if (!glXGetFBConfigAttrib(vo->x11->display, fbc, GLX_FBCONFIG_ID, &fbid)) + MP_VERBOSE(ctx, "GLX chose FB config with ID 0x%x\n", fbid); + + p->fbc = fbc; + p->vinfo = glXGetVisualFromFBConfig(vo->x11->display, fbc); + if (p->vinfo) { + MP_VERBOSE(ctx, "GLX chose visual with ID 0x%x\n", + (int)p->vinfo->visualid); + } else { + MP_WARN(ctx, "Selected GLX FB config has no associated X visual\n"); + } + + if (!vo_x11_create_vo_window(vo, p->vinfo, "gl")) + goto uninit; + + bool success = false; + for (int n = 0; mpgl_preferred_gl_versions[n]; n++) { + int version = mpgl_preferred_gl_versions[n]; + MP_VERBOSE(ctx, "Creating OpenGL %d.%d context...\n", + MPGL_VER_P(version)); + if (version >= 300) { + success = create_context_x11_gl3(ctx, gl, version, false); + } else { + success = create_context_x11_old(ctx, gl); + } + if (success) + break; + } + if (!success) // try again for GLES + success = create_context_x11_gl3(ctx, gl, 200, true); + if (success && !glXIsDirect(vo->x11->display, p->context)) + gl->mpgl_caps |= MPGL_CAP_SW; + if (!success) + goto uninit; + + struct ra_gl_ctx_params params = { + .swap_buffers = glx_swap_buffers, + }; + + if (!ra_gl_ctx_init(ctx, gl, params)) + goto uninit; + + return true; + +uninit: + glx_uninit(ctx); + return false; +} + +static bool glx_init_probe(struct ra_ctx *ctx) +{ + if (!glx_init(ctx)) + return false; + + struct priv *p = ctx->priv; + if (!(p->gl.mpgl_caps & MPGL_CAP_VDPAU)) { + MP_VERBOSE(ctx, "No vdpau support found - probing more things.\n"); + glx_uninit(ctx); + return false; + } + + return true; +} + +static void resize(struct ra_ctx *ctx) +{ + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0); +} + +static bool glx_reconfig(struct ra_ctx *ctx) +{ + vo_x11_config_vo_window(ctx->vo); + resize(ctx); + return 
true; +} + +static int glx_control(struct ra_ctx *ctx, int *events, int request, void *arg) +{ + int ret = vo_x11_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) + resize(ctx); + return ret; +} + +static void glx_wakeup(struct ra_ctx *ctx) +{ + vo_x11_wakeup(ctx->vo); +} + +static void glx_wait_events(struct ra_ctx *ctx, int64_t until_time_us) +{ + vo_x11_wait_events(ctx->vo, until_time_us); +} + +const struct ra_ctx_fns ra_ctx_glx = { + .type = "opengl", + .name = "x11", + .reconfig = glx_reconfig, + .control = glx_control, + .wakeup = glx_wakeup, + .wait_events = glx_wait_events, + .init = glx_init, + .uninit = glx_uninit, +}; + +const struct ra_ctx_fns ra_ctx_glx_probe = { + .type = "opengl", + .name = "x11probe", + .reconfig = glx_reconfig, + .control = glx_control, + .wakeup = glx_wakeup, + .wait_events = glx_wait_events, + .init = glx_init_probe, + .uninit = glx_uninit, +}; diff --git a/video/out/opengl/context_mali_fbdev.c b/video/out/opengl/context_mali_fbdev.c index 66daa7f9ee..8576e536d3 100644 --- a/video/out/opengl/context_mali_fbdev.c +++ b/video/out/opengl/context_mali_fbdev.c @@ -50,8 +50,7 @@ static bool get_fbdev_size(int *w, int *h) } struct priv { - struct mp_log *log; - struct GL *gl; + struct GL gl; EGLDisplay egl_display; EGLConfig egl_config; EGLContext egl_context; @@ -60,9 +59,10 @@ struct priv { int w, h; }; -static void mali_uninit(struct MPGLContext *ctx) +static void mali_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); if (p->egl_surface) { eglMakeCurrent(p->egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, @@ -74,25 +74,29 @@ static void mali_uninit(struct MPGLContext *ctx) eglReleaseThread(); } -static int mali_init(struct MPGLContext *ctx, int flags) +static void mali_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - p->log = ctx->vo->log; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static bool mali_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); if (!get_fbdev_size(&p->w, &p->h)) { - MP_FATAL(p, "Could not get fbdev size.\n"); + MP_FATAL(ctx, "Could not get fbdev size.\n"); goto fail; } p->egl_display = eglGetDisplay(EGL_DEFAULT_DISPLAY); if (!eglInitialize(p->egl_display, NULL, NULL)) { - MP_FATAL(p, "EGL failed to initialize.\n"); + MP_FATAL(ctx, "EGL failed to initialize.\n"); goto fail; } EGLConfig config; - if (!mpegl_create_context(p->egl_display, p->log, flags, &p->egl_context, - &config)) + if (!mpegl_create_context(ctx, p->egl_display, &p->egl_context, &config)) goto fail; p->egl_window = (struct fbdev_window){ @@ -104,53 +108,51 @@ static int mali_init(struct MPGLContext *ctx, int flags) (EGLNativeWindowType)&p->egl_window, NULL); if (p->egl_surface == EGL_NO_SURFACE) { - MP_FATAL(p, "Could not create EGL surface!\n"); + MP_FATAL(ctx, "Could not create EGL surface!\n"); goto fail; } if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context)) { - MP_FATAL(p, "Failed to set context!\n"); + MP_FATAL(ctx, "Failed to set context!\n"); goto fail; } - ctx->gl = talloc_zero(ctx, GL); + mpegl_load_functions(&p->gl, ctx->log); - mpegl_load_functions(ctx->gl, p->log); + struct ra_gl_ctx_params params = { + .swap_buffers = mali_swap_buffers, + }; + + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto fail; - return 0; + return true; fail: mali_uninit(ctx); - return -1; + return false; } -static int mali_reconfig(struct MPGLContext *ctx) +static bool mali_reconfig(struct ra_ctx *ctx) { struct priv *p = ctx->priv; 
ctx->vo->dwidth = p->w; ctx->vo->dheight = p->h; - return 0; + ra_gl_ctx_resize(ctx->swapchain, p->w, p->h, 0); } -static void mali_swap_buffers(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_surface); -} - -static int mali_control(MPGLContext *ctx, int *events, int request, void *arg) +static int mali_control(struct ra_ctx *ctx, int *events, int request, void *arg) { return VO_NOTIMPL; } -const struct mpgl_driver mpgl_driver_mali = { +const struct ra_ctx_fns ra_ctx_mali_fbdev = { + .type = "opengl", .name = "mali-fbdev", - .priv_size = sizeof(struct priv), - .init = mali_init, .reconfig = mali_reconfig, - .swap_buffers = mali_swap_buffers, .control = mali_control, + .init = mali_init, .uninit = mali_uninit, }; diff --git a/video/out/opengl/context_rpi.c b/video/out/opengl/context_rpi.c index e79622be5d..8b447d0bfc 100644 --- a/video/out/opengl/context_rpi.c +++ b/video/out/opengl/context_rpi.c @@ -30,7 +30,7 @@ #include "egl_helpers.h" struct priv { - struct mp_log *log; + struct GL gl; DISPMANX_DISPLAY_HANDLE_T display; DISPMANX_ELEMENT_HANDLE_T window; DISPMANX_UPDATE_HANDLE_T update; @@ -49,13 +49,13 @@ struct priv { static void tv_callback(void *callback_data, uint32_t reason, uint32_t param1, uint32_t param2) { - struct MPGLContext *ctx = callback_data; + struct ra_ctx *ctx = callback_data; struct priv *p = ctx->priv; atomic_store(&p->reload_display, true); vo_wakeup(ctx->vo); } -static void destroy_dispmanx(struct MPGLContext *ctx) +static void destroy_dispmanx(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -77,9 +77,10 @@ static void destroy_dispmanx(struct MPGLContext *ctx) p->update = 0; } -static void rpi_uninit(MPGLContext *ctx) +static void rpi_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); vc_tv_unregister_callback_full(tv_callback, ctx); @@ -92,26 +93,26 @@ static void rpi_uninit(MPGLContext *ctx) p->egl_display = EGL_NO_DISPLAY; } -static int recreate_dispmanx(struct MPGLContext *ctx) +static bool recreate_dispmanx(struct ra_ctx *ctx) { struct priv *p = ctx->priv; int display_nr = 0; int layer = 0; - MP_VERBOSE(ctx->vo, "Recreating DISPMANX state...\n"); + MP_VERBOSE(ctx, "Recreating DISPMANX state...\n"); destroy_dispmanx(ctx); p->display = vc_dispmanx_display_open(display_nr); p->update = vc_dispmanx_update_start(0); if (!p->display || !p->update) { - MP_FATAL(ctx->vo, "Could not get DISPMANX objects.\n"); + MP_FATAL(ctx, "Could not get DISPMANX objects.\n"); goto fail; } uint32_t dispw, disph; if (graphics_get_display_size(0, &dispw, &disph) < 0) { - MP_FATAL(ctx->vo, "Could not get display size.\n"); + MP_FATAL(ctx, "Could not get display size.\n"); goto fail; } p->w = dispw; @@ -145,7 +146,7 @@ static int recreate_dispmanx(struct MPGLContext *ctx) &src, DISPMANX_PROTECTION_NONE, &alpha, 0, 0); if (!p->window) { - MP_FATAL(ctx->vo, "Could not add DISPMANX element.\n"); + MP_FATAL(ctx, "Could not add DISPMANX element.\n"); goto fail; } @@ -161,14 +162,14 @@ static int recreate_dispmanx(struct MPGLContext *ctx) &p->egl_window, NULL); if (p->egl_surface == EGL_NO_SURFACE) { - MP_FATAL(p, "Could not create EGL surface!\n"); + MP_FATAL(ctx, "Could not create EGL surface!\n"); goto fail; } if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context)) { - MP_FATAL(p, "Failed to set context!\n"); + MP_FATAL(ctx, "Failed to set context!\n"); goto fail; } @@ -197,21 +198,27 @@ static int recreate_dispmanx(struct MPGLContext *ctx) ctx->vo->dwidth = p->w; ctx->vo->dheight = 
p->h; + ra_gl_ctx_resize(ctx->swapchain, p->w, p->h, 0); ctx->vo->want_redraw = true; vo_event(ctx->vo, VO_EVENT_WIN_STATE); - return 0; + return true; fail: destroy_dispmanx(ctx); - return -1; + return false; } -static int rpi_init(struct MPGLContext *ctx, int flags) +static void rpi_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - p->log = ctx->vo->log; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static bool rpi_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); bcm_host_init(); @@ -219,43 +226,40 @@ static int rpi_init(struct MPGLContext *ctx, int flags) p->egl_display = eglGetDisplay(EGL_DEFAULT_DISPLAY); if (!eglInitialize(p->egl_display, NULL, NULL)) { - MP_FATAL(p, "EGL failed to initialize.\n"); + MP_FATAL(ctx, "EGL failed to initialize.\n"); goto fail; } - if (!mpegl_create_context(p->egl_display, p->log, 0, &p->egl_context, - &p->egl_config)) + if (!mpegl_create_context(ctx, p->egl_display, &p->egl_context, &p->egl_config)) goto fail; if (recreate_dispmanx(ctx) < 0) goto fail; - ctx->gl = talloc_zero(ctx, GL); + mpegl_load_functions(&p->gl, ctx->log); - mpegl_load_functions(ctx->gl, p->log); + struct ra_gl_ctx_params params = { + .swap_buffers = rpi_swap_buffers, + .native_display_type = "MPV_RPI_WINDOW", + .native_display = p->win_params, + }; - ctx->native_display_type = "MPV_RPI_WINDOW"; - ctx->native_display = p->win_params; + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto fail; - return 0; + return true; fail: rpi_uninit(ctx); - return -1; + return false; } -static int rpi_reconfig(struct MPGLContext *ctx) +static bool rpi_reconfig(struct ra_ctx *ctx) { return recreate_dispmanx(ctx); } -static void rpi_swap_buffers(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_surface); -} - -static struct mp_image *take_screenshot(struct MPGLContext *ctx) +static struct mp_image *take_screenshot(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -289,21 +293,20 @@ fail: return NULL; } - -static int rpi_control(MPGLContext *ctx, int *events, int request, void *arg) +static int rpi_control(struct ra_ctx *ctx, int *events, int request, void *arg) { struct priv *p = ctx->priv; switch (request) { case VOCTRL_SCREENSHOT_WIN: *(struct mp_image **)arg = take_screenshot(ctx); - return true; + return VO_TRUE; case VOCTRL_FULLSCREEN: recreate_dispmanx(ctx); return VO_TRUE; case VOCTRL_CHECK_EVENTS: if (atomic_fetch_and(&p->reload_display, 0)) { - MP_WARN(ctx->vo, "Recovering from display mode switch...\n"); + MP_WARN(ctx, "Recovering from display mode switch...\n"); recreate_dispmanx(ctx); } return VO_TRUE; @@ -315,12 +318,11 @@ static int rpi_control(MPGLContext *ctx, int *events, int request, void *arg) return VO_NOTIMPL; } -const struct mpgl_driver mpgl_driver_rpi = { +const struct ra_ctx_fns ra_ctx_rpi = { + .type = "opengl", .name = "rpi", - .priv_size = sizeof(struct priv), - .init = rpi_init, .reconfig = rpi_reconfig, - .swap_buffers = rpi_swap_buffers, .control = rpi_control, + .init = rpi_init, .uninit = rpi_uninit, -}; \ No newline at end of file +}; diff --git a/video/out/opengl/context_vdpau.c b/video/out/opengl/context_vdpau.c index 40d21ab65c..a2321f78dd 100644 --- a/video/out/opengl/context_vdpau.c +++ b/video/out/opengl/context_vdpau.c @@ -26,8 +26,6 @@ // follow it. I'm not sure about the original nvidia headers. 
#define BRAINDEATH(x) ((void *)(uintptr_t)(x)) -#define NUM_SURFACES 4 - struct surface { int w, h; VdpOutputSurface surface; @@ -39,21 +37,22 @@ struct surface { }; struct priv { + GL gl; GLXContext context; struct mp_vdpau_ctx *vdp; VdpPresentationQueueTarget vdp_target; VdpPresentationQueue vdp_queue; + struct surface *surfaces; int num_surfaces; - struct surface surfaces[NUM_SURFACES]; - int current_surface; + int idx_surfaces; }; typedef GLXContext (*glXCreateContextAttribsARBProc) (Display*, GLXFBConfig, GLXContext, Bool, const int*); -static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) +static bool create_context_x11(struct ra_ctx *ctx) { - struct priv *glx_ctx = ctx->priv; + struct priv *p = ctx->priv; struct vo *vo = ctx->vo; int glx_major, glx_minor; @@ -62,6 +61,9 @@ static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) return false; } + if (!ra_gl_ctx_test_version(ctx, MPGL_VER(glx_major, glx_minor), false)) + return false; + int glx_attribs[] = { GLX_X_RENDERABLE, True, GLX_X_VISUAL_TYPE, GLX_TRUE_COLOR, @@ -96,7 +98,7 @@ static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) return false; } - int ctx_flags = vo_flags & VOFLAG_GL_DEBUG ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; + int ctx_flags = ctx->opts.debug ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; int context_attribs[] = { GLX_CONTEXT_MAJOR_VERSION_ARB, 4, GLX_CONTEXT_MINOR_VERSION_ARB, 0, @@ -117,19 +119,20 @@ static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) return false; } - glx_ctx->context = context; - mpgl_load_functions(ctx->gl, (void *)glXGetProcAddressARB, glxstr, vo->log); + p->context = context; + mpgl_load_functions(&p->gl, (void *)glXGetProcAddressARB, glxstr, vo->log); return true; } -static int create_vdpau_objects(struct MPGLContext *ctx) +static int create_vdpau_objects(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + struct GL *gl = &p->gl; VdpDevice dev = p->vdp->vdp_device; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - ctx->gl->VDPAUInitNV(BRAINDEATH(dev), p->vdp->get_proc_address); + gl->VDPAUInitNV(BRAINDEATH(dev), p->vdp->get_proc_address); vdp_st = vdp->presentation_queue_target_create_x11(dev, ctx->vo->x11->window, &p->vdp_target); @@ -141,13 +144,13 @@ static int create_vdpau_objects(struct MPGLContext *ctx) return 0; } -static void destroy_vdpau_surface(struct MPGLContext *ctx, +static void destroy_vdpau_surface(struct ra_ctx *ctx, struct surface *surface) { struct priv *p = ctx->priv; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - GL *gl = ctx->gl; + GL *gl = &p->gl; if (surface->mapped) gl->VDPAUUnmapSurfacesNV(1, &surface->registered); @@ -168,14 +171,14 @@ static void destroy_vdpau_surface(struct MPGLContext *ctx, }; } -static int recreate_vdpau_surface(struct MPGLContext *ctx, - struct surface *surface) +static bool recreate_vdpau_surface(struct ra_ctx *ctx, + struct surface *surface) { struct priv *p = ctx->priv; VdpDevice dev = p->vdp->vdp_device; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - GL *gl = ctx->gl; + GL *gl = &p->gl; destroy_vdpau_surface(ctx, surface); @@ -219,16 +222,37 @@ static int recreate_vdpau_surface(struct MPGLContext *ctx, gl->VDPAUUnmapSurfacesNV(1, &surface->registered); surface->mapped = false; - return 0; + return true; error: destroy_vdpau_surface(ctx, surface); - return -1; + return false; +} + +static void vdpau_swap_buffers(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + struct vdp_functions *vdp = &p->vdp->vdp; + VdpStatus vdp_st; + + // This is the 
*next* surface we will be rendering to. By delaying the + // block_until_idle, we're essentially allowing p->num_surfaces - 1 + // in-flight surfaces, plus the one currently visible surface. + struct surface *surf = &p->surfaces[p->idx_surfaces]; + if (surf->surface == VDP_INVALID_HANDLE) + return; + + VdpTime prev_vsync_time; + vdp_st = vdp->presentation_queue_block_until_surface_idle(p->vdp_queue, + surf->surface, + &prev_vsync_time); + CHECK_VDP_WARNING(ctx, "waiting for surface failed"); } -static void glx_uninit(MPGLContext *ctx) +static void vdpau_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); if (p->vdp) { struct vdp_functions *vdp = &p->vdp->vdp; @@ -259,10 +283,12 @@ static void glx_uninit(MPGLContext *ctx) vo_x11_uninit(ctx->vo); } -static int glx_init(struct MPGLContext *ctx, int flags) +static const struct ra_swapchain_fns vdpau_swapchain; + +static bool vdpau_init(struct ra_ctx *ctx) { struct vo *vo = ctx->vo; - struct priv *p = ctx->priv; + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); p->vdp_queue = VDP_INVALID_HANDLE; p->vdp_target = VDP_INVALID_HANDLE; @@ -280,110 +306,112 @@ static int glx_init(struct MPGLContext *ctx, int flags) if (!vo_x11_create_vo_window(vo, NULL, "vdpauglx")) goto uninit; - if (!create_context_x11(ctx, flags)) + if (!create_context_x11(ctx)) goto uninit; - if (!(ctx->gl->mpgl_caps & MPGL_CAP_VDPAU)) + if (!(p->gl.mpgl_caps & MPGL_CAP_VDPAU)) goto uninit; if (create_vdpau_objects(ctx) < 0) goto uninit; - p->num_surfaces = NUM_SURFACES; + p->num_surfaces = ctx->opts.swapchain_depth + 1; // +1 for the visible image + p->surfaces = talloc_zero_array(p, struct surface, p->num_surfaces); for (int n = 0; n < p->num_surfaces; n++) p->surfaces[n].surface = VDP_INVALID_HANDLE; - ctx->flip_v = true; + struct ra_gl_ctx_params params = { + .swap_buffers = vdpau_swap_buffers, + .external_swapchain = &vdpau_swapchain, + .flipped = true, + }; - return 0; + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto uninit; + + return true; uninit: - glx_uninit(ctx); - return -1; + vdpau_uninit(ctx); + return false; } -static int glx_reconfig(struct MPGLContext *ctx) +static struct ra_tex *vdpau_start_frame(struct ra_swapchain *sw) { - vo_x11_config_vo_window(ctx->vo); - return 0; -} + struct priv *p = sw->ctx->priv; + struct vo *vo = sw->ctx->vo; + GL *gl = &p->gl; + + struct surface *surf = &p->surfaces[p->idx_surfaces]; + if (surf->w != vo->dwidth || surf->h != vo->dheight || + surf->surface == VDP_INVALID_HANDLE) + { + if (!recreate_vdpau_surface(sw->ctx, surf)) + return NULL; + } -static int glx_control(struct MPGLContext *ctx, int *events, int request, - void *arg) -{ - return vo_x11_control(ctx->vo, events, request, arg); + assert(!surf->mapped); + gl->VDPAUMapSurfacesNV(1, &surf->registered); + surf->mapped = true; + + ra_gl_ctx_resize(sw, surf->w, surf->h, surf->fbo); + return ra_gl_ctx_start_frame(sw); } -static void glx_start_frame(struct MPGLContext *ctx) +static bool vdpau_submit_frame(struct ra_swapchain *sw, + const struct vo_frame *frame) { - struct priv *p = ctx->priv; + struct priv *p = sw->ctx->priv; + GL *gl = &p->gl; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - GL *gl = ctx->gl; - - struct surface *surface = &p->surfaces[p->current_surface]; - - if (surface->surface != VDP_INVALID_HANDLE) { - VdpTime prev_vsync_time; - vdp_st = vdp->presentation_queue_block_until_surface_idle(p->vdp_queue, - surface->surface, - &prev_vsync_time); - CHECK_VDP_WARNING(ctx, "waiting for surface failed"); - } - 
if (surface->w != ctx->vo->dwidth || surface->h != ctx->vo->dheight) - recreate_vdpau_surface(ctx, surface); + struct surface *surf = &p->surfaces[p->idx_surfaces]; + assert(surf->surface != VDP_INVALID_HANDLE); + assert(surf->mapped); + gl->VDPAUUnmapSurfacesNV(1, &surf->registered); + surf->mapped = false; + vdp_st = vdp->presentation_queue_display(p->vdp_queue, surf->surface, 0, 0, 0); + CHECK_VDP_WARNING(sw->ctx, "trying to present vdp surface"); - ctx->main_fb = surface->fbo; // 0 if creating the surface failed - - if (surface->surface != VDP_INVALID_HANDLE) { - gl->VDPAUMapSurfacesNV(1, &surface->registered); - surface->mapped = true; - } + p->idx_surfaces = (p->idx_surfaces + 1) % p->num_surfaces; + return ra_gl_ctx_submit_frame(sw, frame) && vdp_st == VDP_STATUS_OK; } -static void glx_swap_buffers(struct MPGLContext *ctx) +static bool vdpau_reconfig(struct ra_ctx *ctx) { - struct priv *p = ctx->priv; - struct vdp_functions *vdp = &p->vdp->vdp; - VdpStatus vdp_st; - GL *gl = ctx->gl; - - struct surface *surface = &p->surfaces[p->current_surface]; - if (surface->surface == VDP_INVALID_HANDLE) - return; // surface alloc probably failed before - - if (surface->mapped) - gl->VDPAUUnmapSurfacesNV(1, &surface->registered); - surface->mapped = false; - - vdp_st = vdp->presentation_queue_display(p->vdp_queue, surface->surface, - 0, 0, 0); - CHECK_VDP_WARNING(ctx, "trying to present vdp surface"); + vo_x11_config_vo_window(ctx->vo); + return true; +} - p->current_surface = (p->current_surface + 1) % p->num_surfaces; +static int vdpau_control(struct ra_ctx *ctx, int *events, int request, void *arg) +{ + return vo_x11_control(ctx->vo, events, request, arg); } -static void glx_wakeup(struct MPGLContext *ctx) +static void vdpau_wakeup(struct ra_ctx *ctx) { vo_x11_wakeup(ctx->vo); } -static void glx_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void vdpau_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_x11_wait_events(ctx->vo, until_time_us); } -const struct mpgl_driver mpgl_driver_vdpauglx = { +static const struct ra_swapchain_fns vdpau_swapchain = { + .start_frame = vdpau_start_frame, + .submit_frame = vdpau_submit_frame, +}; + +const struct ra_ctx_fns ra_ctx_vdpauglx = { + .type = "opengl", .name = "vdpauglx", - .priv_size = sizeof(struct priv), - .init = glx_init, - .reconfig = glx_reconfig, - .start_frame = glx_start_frame, - .swap_buffers = glx_swap_buffers, - .control = glx_control, - .wakeup = glx_wakeup, - .wait_events = glx_wait_events, - .uninit = glx_uninit, + .reconfig = vdpau_reconfig, + .control = vdpau_control, + .wakeup = vdpau_wakeup, + .wait_events = vdpau_wait_events, + .init = vdpau_init, + .uninit = vdpau_uninit, }; diff --git a/video/out/opengl/context_wayland.c b/video/out/opengl/context_wayland.c index 87e98cd64f..6ddc550306 100644 --- a/video/out/opengl/context_wayland.c +++ b/video/out/opengl/context_wayland.c @@ -19,6 +19,7 @@ #include "video/out/wayland_common.h" #include "context.h" #include "egl_helpers.h" +#include "utils.h" static void egl_resize(struct vo_wayland_state *wl) { @@ -63,30 +64,42 @@ static void egl_resize(struct vo_wayland_state *wl) wl->vo->want_redraw = true; } -static int egl_create_context(struct vo_wayland_state *wl, MPGLContext *ctx, - int flags) +static void waylandgl_swap_buffers(struct ra_ctx *ctx) { - GL *gl = ctx->gl; + struct vo_wayland_state *wl = ctx->vo->wayland; + vo_wayland_wait_events(ctx->vo, 0); + eglSwapBuffers(wl->egl_context.egl.dpy, wl->egl_context.egl_surface); +} + +static bool 
egl_create_context(struct ra_ctx *ctx, struct vo_wayland_state *wl) +{ + GL *gl = ctx->priv = talloc_zero(ctx, GL); if (!(wl->egl_context.egl.dpy = eglGetDisplay(wl->display.display))) - return -1; + return false; if (eglInitialize(wl->egl_context.egl.dpy, NULL, NULL) != EGL_TRUE) - return -1; + return false; - if (!mpegl_create_context(wl->egl_context.egl.dpy, wl->log, flags, + if (!mpegl_create_context(ctx, wl->egl_context.egl.dpy, &wl->egl_context.egl.ctx, &wl->egl_context.egl.conf)) - return -1; + return false; eglMakeCurrent(wl->egl_context.egl.dpy, NULL, NULL, wl->egl_context.egl.ctx); mpegl_load_functions(gl, wl->log); - ctx->native_display_type = "wl"; - ctx->native_display = wl->display.display; + struct ra_gl_ctx_params params = { + .swap_buffers = waylandgl_swap_buffers, + .native_display_type = "wl", + .native_display = wl->display.display, + }; + + if (!ra_gl_ctx_init(ctx, gl, params)) + return false; - return 0; + return true; } static void egl_create_window(struct vo_wayland_state *wl) @@ -122,23 +135,25 @@ static void egl_create_window(struct vo_wayland_state *wl) eglSwapInterval(wl->egl_context.egl.dpy, 0); } -static int waylandgl_reconfig(struct MPGLContext *ctx) +static bool waylandgl_reconfig(struct ra_ctx *ctx) { struct vo_wayland_state * wl = ctx->vo->wayland; if (!vo_wayland_config(ctx->vo)) - return -1; + return false; if (!wl->egl_context.egl_window) egl_create_window(wl); - return 0; + return true; } -static void waylandgl_uninit(MPGLContext *ctx) +static void waylandgl_uninit(struct ra_ctx *ctx) { struct vo_wayland_state *wl = ctx->vo->wayland; + ra_gl_ctx_uninit(ctx); + if (wl->egl_context.egl.ctx) { eglReleaseThread(); if (wl->egl_context.egl_window) @@ -153,52 +168,45 @@ static void waylandgl_uninit(MPGLContext *ctx) vo_wayland_uninit(ctx->vo); } -static void waylandgl_swap_buffers(MPGLContext *ctx) -{ - struct vo_wayland_state *wl = ctx->vo->wayland; - - vo_wayland_wait_events(ctx->vo, 0); - - eglSwapBuffers(wl->egl_context.egl.dpy, wl->egl_context.egl_surface); -} - -static int waylandgl_control(MPGLContext *ctx, int *events, int request, +static int waylandgl_control(struct ra_ctx *ctx, int *events, int request, void *data) { struct vo_wayland_state *wl = ctx->vo->wayland; int r = vo_wayland_control(ctx->vo, events, request, data); - if (*events & VO_EVENT_RESIZE) + if (*events & VO_EVENT_RESIZE) { egl_resize(wl); + ra_gl_ctx_resize(ctx->swapchain, wl->vo->dwidth, wl->vo->dheight, 0); + } return r; } -static void wayland_wakeup(struct MPGLContext *ctx) +static void wayland_wakeup(struct ra_ctx *ctx) { vo_wayland_wakeup(ctx->vo); } -static void wayland_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void wayland_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_wayland_wait_events(ctx->vo, until_time_us); } -static int waylandgl_init(struct MPGLContext *ctx, int flags) +static bool waylandgl_init(struct ra_ctx *ctx) { if (!vo_wayland_init(ctx->vo)) - return -1; + return false; - return egl_create_context(ctx->vo->wayland, ctx, flags); + return egl_create_context(ctx, ctx->vo->wayland); } -const struct mpgl_driver mpgl_driver_wayland = { +const struct ra_ctx_fns ra_ctx_wayland_egl = { + .type = "opengl", .name = "wayland", - .init = waylandgl_init, .reconfig = waylandgl_reconfig, - .swap_buffers = waylandgl_swap_buffers, .control = waylandgl_control, .wakeup = wayland_wakeup, .wait_events = wayland_wait_events, + .init = waylandgl_init, .uninit = waylandgl_uninit, }; diff --git a/video/out/opengl/context_x11.c 
b/video/out/opengl/context_x11.c deleted file mode 100644 index 4d8dac1ea5..0000000000 --- a/video/out/opengl/context_x11.c +++ /dev/null @@ -1,358 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include -#include - -// FreeBSD 10.0-CURRENT lacks the GLX_ARB_create_context extension completely -#ifndef GLX_CONTEXT_MAJOR_VERSION_ARB -#define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091 -#define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092 -#define GLX_CONTEXT_FLAGS_ARB 0x2094 -#define GLX_CONTEXT_PROFILE_MASK_ARB 0x9126 -#ifndef __APPLE__ -// These are respectively 0x00000001 and 0x00000002 on OSX -#define GLX_CONTEXT_DEBUG_BIT_ARB 0x0001 -#define GLX_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB 0x0002 -#endif -#define GLX_CONTEXT_CORE_PROFILE_BIT_ARB 0x00000001 -#define GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB 0x00000002 -#endif -// GLX_EXT_create_context_es2_profile -#ifndef GLX_CONTEXT_ES2_PROFILE_BIT_EXT -#define GLX_CONTEXT_ES2_PROFILE_BIT_EXT 0x00000004 -#endif - -#include "video/out/x11_common.h" -#include "context.h" - -struct glx_context { - XVisualInfo *vinfo; - GLXContext context; - GLXFBConfig fbc; -}; - -static void glx_uninit(MPGLContext *ctx) -{ - struct glx_context *glx_ctx = ctx->priv; - if (glx_ctx->vinfo) - XFree(glx_ctx->vinfo); - if (glx_ctx->context) { - Display *display = ctx->vo->x11->display; - glXMakeCurrent(display, None, NULL); - glXDestroyContext(display, glx_ctx->context); - } - vo_x11_uninit(ctx->vo); -} - -static bool create_context_x11_old(struct MPGLContext *ctx) -{ - struct glx_context *glx_ctx = ctx->priv; - Display *display = ctx->vo->x11->display; - struct vo *vo = ctx->vo; - GL *gl = ctx->gl; - - if (glx_ctx->context) - return true; - - if (!glx_ctx->vinfo) { - MP_FATAL(vo, "Can't create a legacy GLX context without X visual\n"); - return false; - } - - GLXContext new_context = glXCreateContext(display, glx_ctx->vinfo, NULL, - True); - if (!new_context) { - MP_FATAL(vo, "Could not create GLX context!\n"); - return false; - } - - if (!glXMakeCurrent(display, ctx->vo->x11->window, new_context)) { - MP_FATAL(vo, "Could not set GLX context!\n"); - glXDestroyContext(display, new_context); - return false; - } - - const char *glxstr = glXQueryExtensionsString(display, ctx->vo->x11->screen); - - mpgl_load_functions(gl, (void *)glXGetProcAddressARB, glxstr, vo->log); - - glx_ctx->context = new_context; - - return true; -} - -typedef GLXContext (*glXCreateContextAttribsARBProc) - (Display*, GLXFBConfig, GLXContext, Bool, const int*); - -static bool create_context_x11_gl3(struct MPGLContext *ctx, int vo_flags, - int gl_version, bool es) -{ - struct glx_context *glx_ctx = ctx->priv; - struct vo *vo = ctx->vo; - - if (glx_ctx->context) - return true; - - glXCreateContextAttribsARBProc glXCreateContextAttribsARB = - (glXCreateContextAttribsARBProc) - glXGetProcAddressARB((const GLubyte *)"glXCreateContextAttribsARB"); - - const char *glxstr = - 
glXQueryExtensionsString(vo->x11->display, vo->x11->screen); - bool have_ctx_ext = glxstr && !!strstr(glxstr, "GLX_ARB_create_context"); - - if (!(have_ctx_ext && glXCreateContextAttribsARB)) { - return false; - } - - int ctx_flags = vo_flags & VOFLAG_GL_DEBUG ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; - int profile_mask = GLX_CONTEXT_CORE_PROFILE_BIT_ARB; - - if (es) { - profile_mask = GLX_CONTEXT_ES2_PROFILE_BIT_EXT; - if (!(glxstr && strstr(glxstr, "GLX_EXT_create_context_es2_profile"))) - return false; - } - - int context_attribs[] = { - GLX_CONTEXT_MAJOR_VERSION_ARB, MPGL_VER_GET_MAJOR(gl_version), - GLX_CONTEXT_MINOR_VERSION_ARB, MPGL_VER_GET_MINOR(gl_version), - GLX_CONTEXT_PROFILE_MASK_ARB, profile_mask, - GLX_CONTEXT_FLAGS_ARB, ctx_flags, - None - }; - vo_x11_silence_xlib(1); - GLXContext context = glXCreateContextAttribsARB(vo->x11->display, - glx_ctx->fbc, 0, True, - context_attribs); - vo_x11_silence_xlib(-1); - if (!context) - return false; - - // set context - if (!glXMakeCurrent(vo->x11->display, vo->x11->window, context)) { - MP_FATAL(vo, "Could not set GLX context!\n"); - glXDestroyContext(vo->x11->display, context); - return false; - } - - glx_ctx->context = context; - - mpgl_load_functions(ctx->gl, (void *)glXGetProcAddressARB, glxstr, vo->log); - - return true; -} - -// The GL3/FBC initialization code roughly follows/copies from: -// http://www.opengl.org/wiki/Tutorial:_OpenGL_3.0_Context_Creation_(GLX) -// but also uses some of the old code. - -static GLXFBConfig select_fb_config(struct vo *vo, const int *attribs, int flags) -{ - int fbcount; - GLXFBConfig *fbc = glXChooseFBConfig(vo->x11->display, vo->x11->screen, - attribs, &fbcount); - if (!fbc) - return NULL; - - // The list in fbc is sorted (so that the first element is the best). - GLXFBConfig fbconfig = fbcount > 0 ? fbc[0] : NULL; - - if (flags & VOFLAG_ALPHA) { - for (int n = 0; n < fbcount; n++) { - XVisualInfo *v = glXGetVisualFromFBConfig(vo->x11->display, fbc[n]); - if (v) { - bool is_rgba = vo_x11_is_rgba_visual(v); - XFree(v); - if (is_rgba) { - fbconfig = fbc[n]; - break; - } - } - } - } - - XFree(fbc); - - return fbconfig; -} - -static void set_glx_attrib(int *attribs, int name, int value) -{ - for (int n = 0; attribs[n * 2 + 0] != None; n++) { - if (attribs[n * 2 + 0] == name) { - attribs[n * 2 + 1] = value; - break; - } - } -} - -static int glx_init(struct MPGLContext *ctx, int flags) -{ - struct vo *vo = ctx->vo; - struct glx_context *glx_ctx = ctx->priv; - - if (!vo_x11_init(ctx->vo)) - goto uninit; - - int glx_major, glx_minor; - - if (!glXQueryVersion(vo->x11->display, &glx_major, &glx_minor)) { - MP_ERR(vo, "GLX not found.\n"); - goto uninit; - } - // FBConfigs were added in GLX version 1.3. 
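/* MPGL_VER() (defined in opengl/common.h, not shown in this patch) appears to
 * pack a version as major * 100 + minor * 10 -- which is why bare integers
 * such as 140 (1.4), 200 (2.0) and 300 (3.0) are used interchangeably with it
 * elsewhere in these files, and why
 * MPGL_VER(glx_major, glx_minor) < MPGL_VER(1, 3) is a plain integer
 * comparison. */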
- if (MPGL_VER(glx_major, glx_minor) < MPGL_VER(1, 3)) { - MP_ERR(vo, "GLX version older than 1.3.\n"); - goto uninit; - } - - int glx_attribs[] = { - GLX_X_RENDERABLE, True, - GLX_X_VISUAL_TYPE, GLX_TRUE_COLOR, - GLX_RED_SIZE, 1, - GLX_GREEN_SIZE, 1, - GLX_BLUE_SIZE, 1, - GLX_ALPHA_SIZE, 0, - GLX_DOUBLEBUFFER, True, - None - }; - GLXFBConfig fbc = NULL; - if (flags & VOFLAG_ALPHA) { - set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 1); - fbc = select_fb_config(vo, glx_attribs, flags); - if (!fbc) { - set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 0); - flags &= ~VOFLAG_ALPHA; - } - } - if (!fbc) - fbc = select_fb_config(vo, glx_attribs, flags); - if (!fbc) { - MP_ERR(vo, "no GLX support present\n"); - goto uninit; - } - - int fbid = -1; - if (!glXGetFBConfigAttrib(vo->x11->display, fbc, GLX_FBCONFIG_ID, &fbid)) - MP_VERBOSE(vo, "GLX chose FB config with ID 0x%x\n", fbid); - - glx_ctx->fbc = fbc; - glx_ctx->vinfo = glXGetVisualFromFBConfig(vo->x11->display, fbc); - if (glx_ctx->vinfo) { - MP_VERBOSE(vo, "GLX chose visual with ID 0x%x\n", - (int)glx_ctx->vinfo->visualid); - } else { - MP_WARN(vo, "Selected GLX FB config has no associated X visual\n"); - } - - if (!vo_x11_create_vo_window(vo, glx_ctx->vinfo, "gl")) - goto uninit; - - bool success = false; - if (!(flags & VOFLAG_GLES)) { - for (int n = 0; mpgl_preferred_gl_versions[n]; n++) { - int version = mpgl_preferred_gl_versions[n]; - MP_VERBOSE(vo, "Creating OpenGL %d.%d context...\n", - MPGL_VER_P(version)); - if (version >= 300) { - success = create_context_x11_gl3(ctx, flags, version, false); - } else { - success = create_context_x11_old(ctx); - } - if (success) - break; - } - } - if (!success) // try ES - success = create_context_x11_gl3(ctx, flags, 200, true); - if (success && !glXIsDirect(vo->x11->display, glx_ctx->context)) - ctx->gl->mpgl_caps |= MPGL_CAP_SW; - if (!success) - goto uninit; - - return 0; - -uninit: - glx_uninit(ctx); - return -1; -} - -static int glx_init_probe(struct MPGLContext *ctx, int flags) -{ - int r = glx_init(ctx, flags); - if (r >= 0) { - if (!(ctx->gl->mpgl_caps & MPGL_CAP_VDPAU)) { - MP_VERBOSE(ctx->vo, "No vdpau support found - probing more things.\n"); - glx_uninit(ctx); - r = -1; - } - } - return r; -} - -static int glx_reconfig(struct MPGLContext *ctx) -{ - vo_x11_config_vo_window(ctx->vo); - return 0; -} - -static int glx_control(struct MPGLContext *ctx, int *events, int request, - void *arg) -{ - return vo_x11_control(ctx->vo, events, request, arg); -} - -static void glx_swap_buffers(struct MPGLContext *ctx) -{ - glXSwapBuffers(ctx->vo->x11->display, ctx->vo->x11->window); -} - -static void glx_wakeup(struct MPGLContext *ctx) -{ - vo_x11_wakeup(ctx->vo); -} - -static void glx_wait_events(struct MPGLContext *ctx, int64_t until_time_us) -{ - vo_x11_wait_events(ctx->vo, until_time_us); -} - -const struct mpgl_driver mpgl_driver_x11 = { - .name = "x11", - .priv_size = sizeof(struct glx_context), - .init = glx_init, - .reconfig = glx_reconfig, - .swap_buffers = glx_swap_buffers, - .control = glx_control, - .wakeup = glx_wakeup, - .wait_events = glx_wait_events, - .uninit = glx_uninit, -}; - -const struct mpgl_driver mpgl_driver_x11_probe = { - .name = "x11probe", - .priv_size = sizeof(struct glx_context), - .init = glx_init_probe, - .reconfig = glx_reconfig, - .swap_buffers = glx_swap_buffers, - .control = glx_control, - .wakeup = glx_wakeup, - .wait_events = glx_wait_events, - .uninit = glx_uninit, -}; diff --git a/video/out/opengl/context_x11egl.c b/video/out/opengl/context_x11egl.c index 
2b68007a33..7ab4fe0579 100644 --- a/video/out/opengl/context_x11egl.c +++ b/video/out/opengl/context_x11egl.c @@ -32,14 +32,17 @@ #include "egl_helpers.h" struct priv { + GL gl; EGLDisplay egl_display; EGLContext egl_context; EGLSurface egl_surface; }; -static void mpegl_uninit(MPGLContext *ctx) +static void mpegl_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + if (p->egl_context) { eglMakeCurrent(p->egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); @@ -51,7 +54,7 @@ static void mpegl_uninit(MPGLContext *ctx) static int pick_xrgba_config(void *user_data, EGLConfig *configs, int num_configs) { - struct MPGLContext *ctx = user_data; + struct ra_ctx *ctx = user_data; struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -72,40 +75,44 @@ static int pick_xrgba_config(void *user_data, EGLConfig *configs, int num_config return 0; } -static int mpegl_init(struct MPGLContext *ctx, int flags) +static void mpegl_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static bool mpegl_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); struct vo *vo = ctx->vo; - int msgl = vo->probing ? MSGL_V : MSGL_FATAL; + int msgl = ctx->opts.probing ? MSGL_V : MSGL_FATAL; if (!vo_x11_init(vo)) goto uninit; p->egl_display = eglGetDisplay(vo->x11->display); if (!eglInitialize(p->egl_display, NULL, NULL)) { - mp_msg(vo->log, msgl, "Could not initialize EGL.\n"); + MP_MSG(ctx, msgl, "Could not initialize EGL.\n"); goto uninit; } - struct mpegl_opts opts = { - .vo_flags = flags, + struct mpegl_cb cb = { .user_data = ctx, - .refine_config = (flags & VOFLAG_ALPHA) ? pick_xrgba_config : NULL, + .refine_config = ctx->opts.want_alpha ? 
pick_xrgba_config : NULL, }; EGLConfig config; - if (!mpegl_create_context_opts(p->egl_display, vo->log, &opts, - &p->egl_context, &config)) + if (!mpegl_create_context_cb(ctx, p->egl_display, cb, &p->egl_context, &config)) goto uninit; int vID, n; eglGetConfigAttrib(p->egl_display, config, EGL_NATIVE_VISUAL_ID, &vID); - MP_VERBOSE(vo, "chose visual 0x%x\n", vID); + MP_VERBOSE(ctx, "chose visual 0x%x\n", vID); XVisualInfo template = {.visualid = vID}; XVisualInfo *vi = XGetVisualInfo(vo->x11->display, VisualIDMask, &template, &n); if (!vi) { - MP_FATAL(vo, "Getting X visual failed!\n"); + MP_FATAL(ctx, "Getting X visual failed!\n"); goto uninit; } @@ -120,64 +127,73 @@ static int mpegl_init(struct MPGLContext *ctx, int flags) (EGLNativeWindowType)vo->x11->window, NULL); if (p->egl_surface == EGL_NO_SURFACE) { - MP_FATAL(ctx->vo, "Could not create EGL surface!\n"); + MP_FATAL(ctx, "Could not create EGL surface!\n"); goto uninit; } if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context)) { - MP_FATAL(ctx->vo, "Could not make context current!\n"); + MP_FATAL(ctx, "Could not make context current!\n"); goto uninit; } - mpegl_load_functions(ctx->gl, vo->log); + mpegl_load_functions(&p->gl, ctx->log); - ctx->native_display_type = "x11"; - ctx->native_display = vo->x11->display; - return 0; + struct ra_gl_ctx_params params = { + .swap_buffers = mpegl_swap_buffers, + .native_display_type = "x11", + .native_display = vo->x11->display, + }; + + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto uninit; + + return true; uninit: mpegl_uninit(ctx); - return -1; + return false; } -static int mpegl_reconfig(struct MPGLContext *ctx) +static void resize(struct ra_ctx *ctx) { - vo_x11_config_vo_window(ctx->vo); - return 0; + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0); } -static int mpegl_control(struct MPGLContext *ctx, int *events, int request, - void *arg) +static bool mpegl_reconfig(struct ra_ctx *ctx) { - return vo_x11_control(ctx->vo, events, request, arg); + vo_x11_config_vo_window(ctx->vo); + resize(ctx); + return true; } -static void mpegl_swap_buffers(MPGLContext *ctx) +static int mpegl_control(struct ra_ctx *ctx, int *events, int request, + void *arg) { - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_surface); + int ret = vo_x11_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) + resize(ctx); + return ret; } -static void mpegl_wakeup(struct MPGLContext *ctx) +static void mpegl_wakeup(struct ra_ctx *ctx) { vo_x11_wakeup(ctx->vo); } -static void mpegl_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void mpegl_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_x11_wait_events(ctx->vo, until_time_us); } -const struct mpgl_driver mpgl_driver_x11egl = { +const struct ra_ctx_fns ra_ctx_x11_egl = { + .type = "opengl", .name = "x11egl", - .priv_size = sizeof(struct priv), - .init = mpegl_init, .reconfig = mpegl_reconfig, - .swap_buffers = mpegl_swap_buffers, .control = mpegl_control, .wakeup = mpegl_wakeup, .wait_events = mpegl_wait_events, + .init = mpegl_init, .uninit = mpegl_uninit, }; diff --git a/video/out/opengl/egl_helpers.c b/video/out/opengl/egl_helpers.c index ac152df06a..0033bf1e33 100644 --- a/video/out/opengl/egl_helpers.c +++ b/video/out/opengl/egl_helpers.c @@ -25,6 +25,7 @@ #include "egl_helpers.h" #include "common.h" +#include "utils.h" #include "context.h" #if HAVE_EGL_ANGLE @@ -43,41 +44,49 @@ #define EGL_OPENGL_ES3_BIT 0x00000040 #endif -// es_version = 0 
(desktop), 2/3 (ES major version) -static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, - int es_version, struct mpegl_opts *opts, +// es_version: 0 (core), 2 or 3 +static bool create_context(struct ra_ctx *ctx, EGLDisplay display, + int es_version, struct mpegl_cb cb, EGLContext *out_context, EGLConfig *out_config) { - int msgl = probing ? MSGL_V : MSGL_FATAL; - - EGLenum api = EGL_OPENGL_API; - EGLint rend = EGL_OPENGL_BIT; - const char *name = "Desktop OpenGL"; - if (es_version == 2) { + int msgl = ctx->opts.probing ? MSGL_V : MSGL_FATAL; + + EGLenum api; + EGLint rend; + const char *name; + + switch (es_version) { + case 0: + api = EGL_OPENGL_API; + rend = EGL_OPENGL_BIT; + name = "Desktop OpenGL"; + break; + case 2: api = EGL_OPENGL_ES_API; rend = EGL_OPENGL_ES2_BIT; - name = "GLES 2.0"; - } - if (es_version == 3) { + name = "GLES 2.x"; + break; + case 3: api = EGL_OPENGL_ES_API; rend = EGL_OPENGL_ES3_BIT; name = "GLES 3.x"; + break; + default: abort(); } - mp_msg(log, MSGL_V, "Trying to create %s context.\n", name); + MP_VERBOSE(ctx, "Trying to create %s context.\n", name); if (!eglBindAPI(api)) { - mp_msg(log, MSGL_V, "Could not bind API!\n"); + MP_VERBOSE(ctx, "Could not bind API!\n"); return false; } - EGLint attributes[] = { EGL_SURFACE_TYPE, EGL_WINDOW_BIT, EGL_RED_SIZE, 1, EGL_GREEN_SIZE, 1, EGL_BLUE_SIZE, 1, - EGL_ALPHA_SIZE, (opts->vo_flags & VOFLAG_ALPHA ) ? 1 : 0, + EGL_ALPHA_SIZE, ctx->opts.want_alpha ? 1 : 0, EGL_RENDERABLE_TYPE, rend, EGL_NONE }; @@ -92,29 +101,34 @@ static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, if (!num_configs) { talloc_free(configs); - mp_msg(log, msgl, "Could not choose EGLConfig!\n"); + MP_MSG(ctx, msgl, "Could not choose EGLConfig!\n"); return false; } int chosen = 0; - if (opts->refine_config) - chosen = opts->refine_config(opts->user_data, configs, num_configs); + if (cb.refine_config) + chosen = cb.refine_config(cb.user_data, configs, num_configs); EGLConfig config = configs[chosen]; talloc_free(configs); - EGLContext *ctx = NULL; + EGLContext *egl_ctx = NULL; if (es_version) { + if (!ra_gl_ctx_test_version(ctx, MPGL_VER(es_version, 0), true)) + return false; + EGLint attrs[] = { EGL_CONTEXT_CLIENT_VERSION, es_version, EGL_NONE }; - ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); + egl_ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); } else { for (int n = 0; mpgl_preferred_gl_versions[n]; n++) { int ver = mpgl_preferred_gl_versions[n]; + if (!ra_gl_ctx_test_version(ctx, ver, false)) + continue; EGLint attrs[] = { EGL_CONTEXT_MAJOR_VERSION, MPGL_VER_GET_MAJOR(ver), @@ -124,25 +138,25 @@ static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, EGL_NONE }; - ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); - if (ctx) + egl_ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); + if (egl_ctx) break; } - if (!ctx) { + if (!egl_ctx && ra_gl_ctx_test_version(ctx, 140, false)) { // Fallback for EGL 1.4 without EGL_KHR_create_context. 
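/* (Plain EGL 1.4 offers no way to request a specific desktop GL version --
 * the version attributes for EGL_OPENGL_API come from EGL_KHR_create_context
 * -- so the attribute list below is left empty and the implementation returns
 * whatever default desktop context it provides.) */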
EGLint attrs[] = { EGL_NONE }; - ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); + egl_ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); } } - if (!ctx) { - mp_msg(log, msgl, "Could not create EGL context!\n"); + if (!egl_ctx) { + MP_MSG(ctx, msgl, "Could not create EGL context!\n"); return false; } - *out_context = ctx; + *out_context = egl_ctx; *out_config = config; return true; } @@ -152,56 +166,36 @@ static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, // Create a context and return it and the config it was created with. If it // returns false, the out_* pointers are set to NULL. // vo_flags is a combination of VOFLAG_* values. -bool mpegl_create_context(EGLDisplay display, struct mp_log *log, int vo_flags, +bool mpegl_create_context(struct ra_ctx *ctx, EGLDisplay display, EGLContext *out_context, EGLConfig *out_config) { - return mpegl_create_context_opts(display, log, - &(struct mpegl_opts){.vo_flags = vo_flags}, out_context, out_config); + return mpegl_create_context_cb(ctx, display, (struct mpegl_cb){0}, + out_context, out_config); } // Create a context and return it and the config it was created with. If it // returns false, the out_* pointers are set to NULL. -bool mpegl_create_context_opts(EGLDisplay display, struct mp_log *log, - struct mpegl_opts *opts, - EGLContext *out_context, EGLConfig *out_config) +bool mpegl_create_context_cb(struct ra_ctx *ctx, EGLDisplay display, + struct mpegl_cb cb, EGLContext *out_context, + EGLConfig *out_config) { - assert(opts); - *out_context = NULL; *out_config = NULL; const char *version = eglQueryString(display, EGL_VERSION); const char *vendor = eglQueryString(display, EGL_VENDOR); const char *apis = eglQueryString(display, EGL_CLIENT_APIS); - mp_verbose(log, "EGL_VERSION=%s\nEGL_VENDOR=%s\nEGL_CLIENT_APIS=%s\n", + MP_VERBOSE(ctx, "EGL_VERSION=%s\nEGL_VENDOR=%s\nEGL_CLIENT_APIS=%s\n", STR_OR_ERR(version), STR_OR_ERR(vendor), STR_OR_ERR(apis)); - bool probing = opts->vo_flags & VOFLAG_PROBING; - int msgl = probing ? MSGL_V : MSGL_FATAL; - bool try_gles = !(opts->vo_flags & VOFLAG_NO_GLES); - - if (!(opts->vo_flags & VOFLAG_GLES)) { - // Desktop OpenGL - if (create_context(display, log, try_gles | probing, 0, opts, - out_context, out_config)) - return true; - } - - if (try_gles && !(opts->vo_flags & VOFLAG_GLES2)) { - // ES 3.x - if (create_context(display, log, true, 3, opts, - out_context, out_config)) - return true; - } - - if (try_gles) { - // ES 2.0 - if (create_context(display, log, probing, 2, opts, - out_context, out_config)) + int es[] = {0, 3, 2}; // preference order + for (int i = 0; i < MP_ARRAY_SIZE(es); i++) { + if (create_context(ctx, display, es[i], cb, out_context, out_config)) return true; } - mp_msg(log, msgl, "Could not create a GL context.\n"); + int msgl = ctx->opts.probing ? MSGL_V : MSGL_ERR; + MP_MSG(ctx, msgl, "Could not create a GL context.\n"); return false; } diff --git a/video/out/opengl/egl_helpers.h b/video/out/opengl/egl_helpers.h index 05f9dccb70..eaaf9d7a48 100644 --- a/video/out/opengl/egl_helpers.h +++ b/video/out/opengl/egl_helpers.h @@ -6,26 +6,23 @@ #include #include +#include "video/out/gpu/context.h" + struct mp_log; -bool mpegl_create_context(EGLDisplay display, struct mp_log *log, int vo_flags, +bool mpegl_create_context(struct ra_ctx *ctx, EGLDisplay display, EGLContext *out_context, EGLConfig *out_config); -struct mpegl_opts { - // combination of VOFLAG_* values. 
- int vo_flags; - - // for callbacks - void *user_data; - +struct mpegl_cb { // if set, pick the desired config from the given list and return its index // defaults to 0 (they are sorted by eglChooseConfig) int (*refine_config)(void *user_data, EGLConfig *configs, int num_configs); + void *user_data; }; -bool mpegl_create_context_opts(EGLDisplay display, struct mp_log *log, - struct mpegl_opts *opts, - EGLContext *out_context, EGLConfig *out_config); +bool mpegl_create_context_cb(struct ra_ctx *ctx, EGLDisplay display, + struct mpegl_cb cb, EGLContext *out_context, + EGLConfig *out_config); struct GL; void mpegl_load_functions(struct GL *gl, struct mp_log *log); diff --git a/video/out/opengl/formats.h b/video/out/opengl/formats.h index 3da6ede82a..f727a3b6ef 100644 --- a/video/out/opengl/formats.h +++ b/video/out/opengl/formats.h @@ -2,7 +2,6 @@ #define MPGL_FORMATS_H_ #include "common.h" -#include "ra.h" struct gl_format { const char *name; // symbolic name for user interaction/debugging diff --git a/video/out/opengl/gl_utils.c b/video/out/opengl/gl_utils.c deleted file mode 100644 index bce2dabe5d..0000000000 --- a/video/out/opengl/gl_utils.c +++ /dev/null @@ -1,291 +0,0 @@ -/* - * This file is part of mpv. - * Parts based on MPlayer code by Reimar Döffinger. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "osdep/io.h" - -#include "common/common.h" -#include "options/path.h" -#include "stream/stream.h" -#include "formats.h" -#include "ra_gl.h" -#include "gl_utils.h" - -// GLU has this as gluErrorString (we don't use GLU, as it is legacy-OpenGL) -static const char *gl_error_to_string(GLenum error) -{ - switch (error) { - case GL_INVALID_ENUM: return "INVALID_ENUM"; - case GL_INVALID_VALUE: return "INVALID_VALUE"; - case GL_INVALID_OPERATION: return "INVALID_OPERATION"; - case GL_INVALID_FRAMEBUFFER_OPERATION: return "INVALID_FRAMEBUFFER_OPERATION"; - case GL_OUT_OF_MEMORY: return "OUT_OF_MEMORY"; - default: return "unknown"; - } -} - -void gl_check_error(GL *gl, struct mp_log *log, const char *info) -{ - for (;;) { - GLenum error = gl->GetError(); - if (error == GL_NO_ERROR) - break; - mp_msg(log, MSGL_ERR, "%s: OpenGL error %s.\n", info, - gl_error_to_string(error)); - } -} - -static int get_alignment(int stride) -{ - if (stride % 8 == 0) - return 8; - if (stride % 4 == 0) - return 4; - if (stride % 2 == 0) - return 2; - return 1; -} - -// upload a texture, handling things like stride and slices -// target: texture target, usually GL_TEXTURE_2D -// format, type: texture parameters -// dataptr, stride: image data -// x, y, width, height: part of the image to upload -void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, - const void *dataptr, int stride, - int x, int y, int w, int h) -{ - int bpp = gl_bytes_per_pixel(format, type); - const uint8_t *data = dataptr; - int y_max = y + h; - if (w <= 0 || h <= 0 || !bpp) - return; - if (stride < 0) { - data += (h - 1) * stride; - stride = -stride; - } - gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(stride)); - int slice = h; - if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) { - // this is not always correct, but should work for MPlayer - gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride / bpp); - } else { - if (stride != bpp * w) - slice = 1; // very inefficient, but at least it works - } - for (; y + slice <= y_max; y += slice) { - gl->TexSubImage2D(target, 0, x, y, w, slice, format, type, data); - data += stride * slice; - } - if (y < y_max) - gl->TexSubImage2D(target, 0, x, y, w, y_max - y, format, type, data); - if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) - gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0); - gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); -} - -mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h) -{ - if (gl->es) - return NULL; // ES can't read from front buffer - mp_image_t *image = mp_image_alloc(IMGFMT_RGB24, w, h); - if (!image) - return NULL; - gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); - GLenum obj = fbo ? 
GL_COLOR_ATTACHMENT0 : GL_FRONT; - gl->PixelStorei(GL_PACK_ALIGNMENT, 1); - gl->ReadBuffer(obj); - //flip image while reading (and also avoid stride-related trouble) - for (int y = 0; y < h; y++) { - gl->ReadPixels(0, h - y - 1, w, 1, GL_RGB, GL_UNSIGNED_BYTE, - image->planes[0] + y * image->stride[0]); - } - gl->PixelStorei(GL_PACK_ALIGNMENT, 4); - gl->BindFramebuffer(GL_FRAMEBUFFER, 0); - return image; -} - -static void gl_vao_enable_attribs(struct gl_vao *vao) -{ - GL *gl = vao->gl; - - for (int n = 0; n < vao->num_entries; n++) { - const struct ra_renderpass_input *e = &vao->entries[n]; - GLenum type = 0; - bool normalized = false; - switch (e->type) { - case RA_VARTYPE_INT: - type = GL_INT; - break; - case RA_VARTYPE_FLOAT: - type = GL_FLOAT; - break; - case RA_VARTYPE_BYTE_UNORM: - type = GL_UNSIGNED_BYTE; - normalized = true; - break; - default: - abort(); - } - assert(e->dim_m == 1); - - gl->EnableVertexAttribArray(n); - gl->VertexAttribPointer(n, e->dim_v, type, normalized, - vao->stride, (void *)(intptr_t)e->offset); - } -} - -void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, - const struct ra_renderpass_input *entries, - int num_entries) -{ - assert(!vao->vao); - assert(!vao->buffer); - - *vao = (struct gl_vao){ - .gl = gl, - .stride = stride, - .entries = entries, - .num_entries = num_entries, - }; - - gl->GenBuffers(1, &vao->buffer); - - if (gl->BindVertexArray) { - gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - - gl->GenVertexArrays(1, &vao->vao); - gl->BindVertexArray(vao->vao); - gl_vao_enable_attribs(vao); - gl->BindVertexArray(0); - - gl->BindBuffer(GL_ARRAY_BUFFER, 0); - } -} - -void gl_vao_uninit(struct gl_vao *vao) -{ - GL *gl = vao->gl; - if (!gl) - return; - - if (gl->DeleteVertexArrays) - gl->DeleteVertexArrays(1, &vao->vao); - gl->DeleteBuffers(1, &vao->buffer); - - *vao = (struct gl_vao){0}; -} - -static void gl_vao_bind(struct gl_vao *vao) -{ - GL *gl = vao->gl; - - if (gl->BindVertexArray) { - gl->BindVertexArray(vao->vao); - } else { - gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - gl_vao_enable_attribs(vao); - gl->BindBuffer(GL_ARRAY_BUFFER, 0); - } -} - -static void gl_vao_unbind(struct gl_vao *vao) -{ - GL *gl = vao->gl; - - if (gl->BindVertexArray) { - gl->BindVertexArray(0); - } else { - for (int n = 0; n < vao->num_entries; n++) - gl->DisableVertexAttribArray(n); - } -} - -// Draw the vertex data (as described by the gl_vao_entry entries) in ptr -// to the screen. num is the number of vertexes. prim is usually GL_TRIANGLES. -// If ptr is NULL, then skip the upload, and use the data uploaded with the -// previous call. 
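/* Illustrative call sequence for these VAO helpers ("struct vertex", entries
 * and verts are placeholders; the signatures are the ones declared in
 * gl_utils.h):
 *
 *     struct gl_vao vao = {0};
 *     gl_vao_init(&vao, gl, sizeof(struct vertex), entries, num_entries);
 *     gl_vao_draw_data(&vao, GL_TRIANGLES, verts, num_verts); // upload + draw
 *     gl_vao_draw_data(&vao, GL_TRIANGLES, NULL, num_verts);  // redraw last upload
 *     gl_vao_uninit(&vao);
 */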
-void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num) -{ - GL *gl = vao->gl; - - if (ptr) { - gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - gl->BufferData(GL_ARRAY_BUFFER, num * vao->stride, ptr, GL_STREAM_DRAW); - gl->BindBuffer(GL_ARRAY_BUFFER, 0); - } - - gl_vao_bind(vao); - - gl->DrawArrays(prim, 0, num); - - gl_vao_unbind(vao); -} - -static void GLAPIENTRY gl_debug_cb(GLenum source, GLenum type, GLuint id, - GLenum severity, GLsizei length, - const GLchar *message, const void *userParam) -{ - // keep in mind that the debug callback can be asynchronous - struct mp_log *log = (void *)userParam; - int level = MSGL_ERR; - switch (severity) { - case GL_DEBUG_SEVERITY_NOTIFICATION:level = MSGL_V; break; - case GL_DEBUG_SEVERITY_LOW: level = MSGL_INFO; break; - case GL_DEBUG_SEVERITY_MEDIUM: level = MSGL_WARN; break; - case GL_DEBUG_SEVERITY_HIGH: level = MSGL_ERR; break; - } - mp_msg(log, level, "GL: %s\n", message); -} - -void gl_set_debug_logger(GL *gl, struct mp_log *log) -{ - if (gl->DebugMessageCallback) - gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log); -} - -int gl_get_fb_depth(GL *gl, int fbo) -{ - if ((gl->es < 300 && !gl->version) || !(gl->mpgl_caps & MPGL_CAP_FB)) - return -1; - - gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); - - GLenum obj = gl->version ? GL_BACK_LEFT : GL_BACK; - if (fbo) - obj = GL_COLOR_ATTACHMENT0; - - GLint depth_g = -1; - - gl->GetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, obj, - GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &depth_g); - - gl->BindFramebuffer(GL_FRAMEBUFFER, 0); - - return depth_g > 0 ? depth_g : -1; -} diff --git a/video/out/opengl/gl_utils.h b/video/out/opengl/gl_utils.h deleted file mode 100644 index 306ee23f65..0000000000 --- a/video/out/opengl/gl_utils.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * This file is part of mpv. - * Parts based on MPlayer code by Reimar Döffinger. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#ifndef MP_GL_UTILS_ -#define MP_GL_UTILS_ - -#include - -#include "common.h" -#include "ra.h" - -struct mp_log; - -void gl_check_error(GL *gl, struct mp_log *log, const char *info); - -void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, - const void *dataptr, int stride, - int x, int y, int w, int h); - -mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h); - -struct gl_vao { - GL *gl; - GLuint vao; // the VAO object, or 0 if unsupported by driver - GLuint buffer; // GL_ARRAY_BUFFER used for the data - int stride; // size of each element (interleaved elements are assumed) - const struct ra_renderpass_input *entries; - int num_entries; -}; - -void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, - const struct ra_renderpass_input *entries, - int num_entries); -void gl_vao_uninit(struct gl_vao *vao); -void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num); - -void gl_set_debug_logger(GL *gl, struct mp_log *log); - -int gl_get_fb_depth(GL *gl, int fbo); - -#endif diff --git a/video/out/opengl/hwdec.c b/video/out/opengl/hwdec.c deleted file mode 100644 index 5fbc1aa4a9..0000000000 --- a/video/out/opengl/hwdec.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#include -#include - -#include "config.h" - -#include "common/common.h" -#include "common/msg.h" -#include "options/m_config.h" -#include "hwdec.h" - -extern const struct ra_hwdec_driver ra_hwdec_vaegl; -extern const struct ra_hwdec_driver ra_hwdec_vaglx; -extern const struct ra_hwdec_driver ra_hwdec_videotoolbox; -extern const struct ra_hwdec_driver ra_hwdec_vdpau; -extern const struct ra_hwdec_driver ra_hwdec_dxva2egl; -extern const struct ra_hwdec_driver ra_hwdec_d3d11egl; -extern const struct ra_hwdec_driver ra_hwdec_d3d11eglrgb; -extern const struct ra_hwdec_driver ra_hwdec_dxva2gldx; -extern const struct ra_hwdec_driver ra_hwdec_dxva2; -extern const struct ra_hwdec_driver ra_hwdec_cuda; -extern const struct ra_hwdec_driver ra_hwdec_rpi_overlay; - -static const struct ra_hwdec_driver *const mpgl_hwdec_drivers[] = { -#if HAVE_VAAPI_EGL - &ra_hwdec_vaegl, -#endif -#if HAVE_VAAPI_GLX - &ra_hwdec_vaglx, -#endif -#if HAVE_VDPAU_GL_X11 - &ra_hwdec_vdpau, -#endif -#if HAVE_VIDEOTOOLBOX_GL || HAVE_IOS_GL - &ra_hwdec_videotoolbox, -#endif -#if HAVE_D3D_HWACCEL - &ra_hwdec_d3d11egl, - &ra_hwdec_d3d11eglrgb, - #if HAVE_D3D9_HWACCEL - &ra_hwdec_dxva2egl, - #endif -#endif -#if HAVE_GL_DXINTEROP_D3D9 - &ra_hwdec_dxva2gldx, -#endif -#if HAVE_CUDA_HWACCEL - &ra_hwdec_cuda, -#endif -#if HAVE_RPI - &ra_hwdec_rpi_overlay, -#endif - NULL -}; - -static struct ra_hwdec *load_hwdec_driver(struct mp_log *log, struct ra *ra, - struct mpv_global *global, - struct mp_hwdec_devices *devs, - const struct ra_hwdec_driver *drv, - bool is_auto) -{ - struct ra_hwdec *hwdec = talloc(NULL, struct ra_hwdec); - *hwdec = (struct ra_hwdec) { - .driver = drv, - .log = mp_log_new(hwdec, log, drv->name), - .global = global, - .ra = ra, - .devs = devs, - .probing = is_auto, - .priv = talloc_zero_size(hwdec, drv->priv_size), - }; - mp_verbose(log, "Loading hwdec driver '%s'\n", drv->name); - if (hwdec->driver->init(hwdec) < 0) { - ra_hwdec_uninit(hwdec); - mp_verbose(log, "Loading failed.\n"); - return NULL; - } - return hwdec; -} - -struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - enum hwdec_type api) -{ - bool is_auto = HWDEC_IS_AUTO(api); - for (int n = 0; mpgl_hwdec_drivers[n]; n++) { - const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; - if ((is_auto || api == drv->api) && !drv->testing_only) { - struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, is_auto); - if (r) - return r; - } - } - return NULL; -} - -// Load by option name. 
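/* Illustrative caller (variable names are placeholders; the entry points are
 * the ones declared in hwdec.h): a renderer asking for the user-selected
 * interop would do roughly
 *
 *     struct ra_hwdec *hw = ra_hwdec_load(log, ra, global, devs, interop_name);
 *     if (hw) {
 *         ...
 *         ra_hwdec_uninit(hw);
 *     }
 *
 * where a NULL or empty interop_name falls back to whatever --hwdec selects. */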
-struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - const char *name) -{ - int g_hwdec_api; - mp_read_option_raw(g, "hwdec", &m_option_type_choice, &g_hwdec_api); - if (!name || !name[0]) - name = m_opt_choice_str(mp_hwdec_names, g_hwdec_api); - - int api_id = HWDEC_NONE; - for (int n = 0; mp_hwdec_names[n].name; n++) { - if (name && strcmp(mp_hwdec_names[n].name, name) == 0) - api_id = mp_hwdec_names[n].value; - } - - for (int n = 0; mpgl_hwdec_drivers[n]; n++) { - const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; - if (name && strcmp(drv->name, name) == 0) { - struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, false); - if (r) - return r; - } - } - - return ra_hwdec_load_api(log, ra, g, devs, api_id); -} - -int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param) -{ - bool help = bstr_equals0(param, "help"); - if (help) - mp_info(log, "Available hwdecs:\n"); - for (int n = 0; mpgl_hwdec_drivers[n]; n++) { - const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; - const char *api_name = m_opt_choice_str(mp_hwdec_names, drv->api); - if (help) { - mp_info(log, " %s [%s]\n", drv->name, api_name); - } else if (bstr_equals0(param, drv->name) || - bstr_equals0(param, api_name)) - { - return 1; - } - } - if (help) { - mp_info(log, " auto (loads best)\n" - " (other --hwdec values)\n" - "Setting an empty string means use --hwdec.\n"); - return M_OPT_EXIT; - } - if (!param.len) - return 1; // "" is treated specially - for (int n = 0; mp_hwdec_names[n].name; n++) { - if (bstr_equals0(param, mp_hwdec_names[n].name)) - return 1; - } - mp_fatal(log, "No hwdec backend named '%.*s' found!\n", BSTR_P(param)); - return M_OPT_INVALID; -} - -void ra_hwdec_uninit(struct ra_hwdec *hwdec) -{ - if (hwdec) - hwdec->driver->uninit(hwdec); - talloc_free(hwdec); -} - -bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt) -{ - for (int n = 0; hwdec->driver->imgfmts[n]; n++) { - if (hwdec->driver->imgfmts[n] == imgfmt) - return true; - } - return false; -} - -struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec, - struct mp_image_params *params) -{ - assert(ra_hwdec_test_format(hwdec, params->imgfmt)); - - struct ra_hwdec_mapper *mapper = talloc_ptrtype(NULL, mapper); - *mapper = (struct ra_hwdec_mapper){ - .owner = hwdec, - .driver = hwdec->driver->mapper, - .log = hwdec->log, - .ra = hwdec->ra, - .priv = talloc_zero_size(mapper, hwdec->driver->mapper->priv_size), - .src_params = *params, - .dst_params = *params, - }; - if (mapper->driver->init(mapper) < 0) - ra_hwdec_mapper_free(&mapper); - return mapper; -} - -void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper) -{ - struct ra_hwdec_mapper *p = *mapper; - if (p) { - ra_hwdec_mapper_unmap(p); - p->driver->uninit(p); - talloc_free(p); - } - *mapper = NULL; -} - -void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper) -{ - if (mapper->driver->unmap) - mapper->driver->unmap(mapper); - mp_image_unrefp(&mapper->src); -} - -int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img) -{ - ra_hwdec_mapper_unmap(mapper); - mp_image_setrefp(&mapper->src, img); - if (mapper->driver->map(mapper) < 0) { - ra_hwdec_mapper_unmap(mapper); - return -1; - } - return 0; -} diff --git a/video/out/opengl/hwdec.h b/video/out/opengl/hwdec.h deleted file mode 100644 index 20bbaae9eb..0000000000 --- a/video/out/opengl/hwdec.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef MPGL_HWDEC_H_ -#define 
MPGL_HWDEC_H_ - -#include "video/mp_image.h" -#include "ra.h" -#include "video/hwdec.h" - -struct ra_hwdec { - const struct ra_hwdec_driver *driver; - struct mp_log *log; - struct mpv_global *global; - struct ra *ra; - struct mp_hwdec_devices *devs; - // GLSL extensions required to sample textures from this. - const char **glsl_extensions; - // For free use by hwdec driver - void *priv; - // For working around the vdpau vs. vaapi mess. - bool probing; - // Used in overlay mode only. - float overlay_colorkey[4]; -}; - -struct ra_hwdec_mapper { - const struct ra_hwdec_mapper_driver *driver; - struct mp_log *log; - struct ra *ra; - void *priv; - struct ra_hwdec *owner; - // Input frame parameters. (Set before init(), immutable.) - struct mp_image_params src_params; - // Output frame parameters (represents the format the textures return). Must - // be set by init(), immutable afterwards, - struct mp_image_params dst_params; - - // The currently mapped source image (or the image about to be mapped in - // ->map()). NULL if unmapped. The mapper can also clear this reference if - // the mapped textures contain a full copy. - struct mp_image *src; - - // The mapped textures and metadata about them. These fields change if a - // new frame is mapped (or unmapped), but otherwise remain constant. - // The common code won't mess with these, so you can e.g. set them in the - // .init() callback. - struct ra_tex *tex[4]; - bool vdpau_fields; -}; - -// This can be used to map frames of a specific hw format as GL textures. -struct ra_hwdec_mapper_driver { - // Used to create ra_hwdec_mapper.priv. - size_t priv_size; - - // Init the mapper implementation. At this point, the field src_params, - // fns, devs, priv are initialized. - int (*init)(struct ra_hwdec_mapper *mapper); - // Destroy the mapper. unmap is called before this. - void (*uninit)(struct ra_hwdec_mapper *mapper); - - // Map mapper->src as texture, and set mapper->frame to textures using it. - // It is expected that that the textures remain valid until the next unmap - // or uninit call. - // The function is allowed to unref mapper->src if it's not needed (i.e. - // this function creates a copy). - // The underlying format can change, so you might need to do some form - // of change detection. You also must reject unsupported formats with an - // error. - // On error, returns negative value on error and remains unmapped. - int (*map)(struct ra_hwdec_mapper *mapper); - // Unmap the frame. Does nothing if already unmapped. Optional. - void (*unmap)(struct ra_hwdec_mapper *mapper); -}; - -struct ra_hwdec_driver { - // Name of the interop backend. This is used for informational purposes only. - const char *name; - // Used to create ra_hwdec.priv. - size_t priv_size; - // Used to explicitly request a specific API. - enum hwdec_type api; - // One of the hardware surface IMGFMT_ that must be passed to map_image later. - // Terminated with a 0 entry. (Extend the array size as needed.) - const int imgfmts[3]; - // Dosn't load this unless requested by name. - bool testing_only; - - // Create the hwdec device. It must add it to hw->devs, if applicable. - int (*init)(struct ra_hwdec *hw); - void (*uninit)(struct ra_hwdec *hw); - - // This will be used to create a ra_hwdec_mapper from ra_hwdec. - const struct ra_hwdec_mapper_driver *mapper; - - // The following function provides an alternative API. Each ra_hwdec_driver - // must have either provide a mapper or overlay_frame (not both or none), and - // if overlay_frame is set, it operates in overlay mode. 
In this mode, - // OSD etc. is rendered via OpenGL, but the video is rendered as a separate - // layer below it. - // Non-overlay mode is strictly preferred, so try not to use overlay mode. - // Set the given frame as overlay, replacing the previous one. This can also - // just change the position of the overlay. - // hw_image==src==dst==NULL is passed to clear the overlay. - int (*overlay_frame)(struct ra_hwdec *hw, struct mp_image *hw_image, - struct mp_rect *src, struct mp_rect *dst, bool newframe); -}; - -struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - enum hwdec_type api); - -struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - const char *name); - -int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param); - -void ra_hwdec_uninit(struct ra_hwdec *hwdec); - -bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt); - -struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec, - struct mp_image_params *params); -void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper); -void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper); -int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img); - -#endif diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c index d40bafee24..d9c4c199f1 100644 --- a/video/out/opengl/hwdec_cuda.c +++ b/video/out/opengl/hwdec_cuda.c @@ -32,11 +32,10 @@ #include #include +#include "video/out/gpu/hwdec.h" #include "formats.h" -#include "hwdec.h" #include "options/m_config.h" #include "ra_gl.h" -#include "video.h" struct priv_owner { struct mp_hwdec_ctx hwctx; diff --git a/video/out/opengl/hwdec_ios.m b/video/out/opengl/hwdec_ios.m index 8e020ded63..71b205b583 100644 --- a/video/out/opengl/hwdec_ios.m +++ b/video/out/opengl/hwdec_ios.m @@ -27,10 +27,10 @@ #include "config.h" +#include "video/out/gpu/hwdec.h" #include "video/mp_image_pool.h" #include "video/vt.h" #include "ra_gl.h" -#include "hwdec.h" struct priv_owner { struct mp_hwdec_ctx hwctx; diff --git a/video/out/opengl/hwdec_osx.c b/video/out/opengl/hwdec_osx.c index 348a5e19c5..cfd5f52e7b 100644 --- a/video/out/opengl/hwdec_osx.c +++ b/video/out/opengl/hwdec_osx.c @@ -29,9 +29,9 @@ #include "config.h" #include "video/mp_image_pool.h" +#include "video/out/gpu/hwdec.h" #include "video/vt.h" #include "ra_gl.h" -#include "hwdec.h" struct priv_owner { struct mp_hwdec_ctx hwctx; diff --git a/video/out/opengl/hwdec_rpi.c b/video/out/opengl/hwdec_rpi.c index 6f39c3e330..ea8312a179 100644 --- a/video/out/opengl/hwdec_rpi.c +++ b/video/out/opengl/hwdec_rpi.c @@ -33,8 +33,8 @@ #include "common/common.h" #include "common/msg.h" #include "video/mp_image.h" +#include "video/out/gpu/hwdec.h" -#include "hwdec.h" #include "common.h" #include "ra_gl.h" diff --git a/video/out/opengl/hwdec_vaegl.c b/video/out/opengl/hwdec_vaegl.c index a0e3222cfc..6078222bd5 100644 --- a/video/out/opengl/hwdec_vaegl.c +++ b/video/out/opengl/hwdec_vaegl.c @@ -30,9 +30,9 @@ #include "config.h" -#include "hwdec.h" -#include "video/vaapi.h" +#include "video/out/gpu/hwdec.h" #include "video/mp_image_pool.h" +#include "video/vaapi.h" #include "common.h" #include "ra_gl.h" diff --git a/video/out/opengl/hwdec_vaglx.c b/video/out/opengl/hwdec_vaglx.c index 8db15c4468..d5bc0b6ee7 100644 --- a/video/out/opengl/hwdec_vaglx.c +++ b/video/out/opengl/hwdec_vaglx.c @@ -25,10 +25,11 @@ #include 
#include "video/out/x11_common.h" -#include "ra_gl.h" -#include "hwdec.h" +#include "video/out/gpu/hwdec.h" #include "video/vaapi.h" +#include "ra_gl.h" + struct priv_owner { struct mp_vaapi_ctx *ctx; VADisplay *display; diff --git a/video/out/opengl/hwdec_vdpau.c b/video/out/opengl/hwdec_vdpau.c index d733650328..e0618e425e 100644 --- a/video/out/opengl/hwdec_vdpau.c +++ b/video/out/opengl/hwdec_vdpau.c @@ -20,7 +20,7 @@ #include -#include "hwdec.h" +#include "video/out/gpu/hwdec.h" #include "ra_gl.h" #include "video/vdpau.h" #include "video/vdpau_mixer.h" diff --git a/video/out/opengl/lcms.c b/video/out/opengl/lcms.c deleted file mode 100644 index 8747ae6aa6..0000000000 --- a/video/out/opengl/lcms.c +++ /dev/null @@ -1,531 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include -#include - -#include "mpv_talloc.h" - -#include "config.h" - -#include "stream/stream.h" -#include "common/common.h" -#include "misc/bstr.h" -#include "common/msg.h" -#include "options/m_option.h" -#include "options/path.h" -#include "video/csputils.h" -#include "lcms.h" - -#include "osdep/io.h" - -#if HAVE_LCMS2 - -#include -#include -#include - -struct gl_lcms { - void *icc_data; - size_t icc_size; - struct AVBufferRef *vid_profile; - char *current_profile; - bool using_memory_profile; - bool changed; - enum mp_csp_prim current_prim; - enum mp_csp_trc current_trc; - - struct mp_log *log; - struct mpv_global *global; - struct mp_icc_opts *opts; -}; - -static bool parse_3dlut_size(const char *arg, int *p1, int *p2, int *p3) -{ - if (sscanf(arg, "%dx%dx%d", p1, p2, p3) != 3) - return false; - for (int n = 0; n < 3; n++) { - int s = ((int[]) { *p1, *p2, *p3 })[n]; - if (s < 2 || s > 512) - return false; - } - return true; -} - -static int validate_3dlut_size_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param) -{ - int p1, p2, p3; - char s[20]; - snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); - return parse_3dlut_size(s, &p1, &p2, &p3); -} - -#define OPT_BASE_STRUCT struct mp_icc_opts -const struct m_sub_options mp_icc_conf = { - .opts = (const m_option_t[]) { - OPT_FLAG("use-embedded-icc-profile", use_embedded, 0), - OPT_STRING("icc-profile", profile, M_OPT_FILE), - OPT_FLAG("icc-profile-auto", profile_auto, 0), - OPT_STRING("icc-cache-dir", cache_dir, M_OPT_FILE), - OPT_INT("icc-intent", intent, 0), - OPT_INTRANGE("icc-contrast", contrast, 0, 0, 100000), - OPT_STRING_VALIDATE("icc-3dlut-size", size_str, 0, validate_3dlut_size_opt), - - OPT_REPLACED("3dlut-size", "icc-3dlut-size"), - OPT_REMOVED("icc-cache", "see icc-cache-dir"), - {0} - }, - .size = sizeof(struct mp_icc_opts), - .defaults = &(const struct mp_icc_opts) { - .size_str = "64x64x64", - .intent = INTENT_RELATIVE_COLORIMETRIC, - .use_embedded = true, - }, -}; - -static void lcms2_error_handler(cmsContext ctx, cmsUInt32Number code, - const char *msg) -{ - struct gl_lcms *p = cmsGetContextUserData(ctx); - 
MP_ERR(p, "lcms2: %s\n", msg); -} - -static void load_profile(struct gl_lcms *p) -{ - talloc_free(p->icc_data); - p->icc_data = NULL; - p->icc_size = 0; - p->using_memory_profile = false; - talloc_free(p->current_profile); - p->current_profile = NULL; - - if (!p->opts->profile || !p->opts->profile[0]) - return; - - char *fname = mp_get_user_path(NULL, p->global, p->opts->profile); - MP_VERBOSE(p, "Opening ICC profile '%s'\n", fname); - struct bstr iccdata = stream_read_file(fname, p, p->global, - 100000000); // 100 MB - talloc_free(fname); - if (!iccdata.len) - return; - - talloc_free(p->icc_data); - - p->icc_data = iccdata.start; - p->icc_size = iccdata.len; - p->current_profile = talloc_strdup(p, p->opts->profile); -} - -static void gl_lcms_destructor(void *ptr) -{ - struct gl_lcms *p = ptr; - av_buffer_unref(&p->vid_profile); -} - -struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, - struct mpv_global *global, - struct mp_icc_opts *opts) -{ - struct gl_lcms *p = talloc_ptrtype(talloc_ctx, p); - talloc_set_destructor(p, gl_lcms_destructor); - *p = (struct gl_lcms) { - .global = global, - .log = log, - .opts = opts, - }; - gl_lcms_update_options(p); - return p; -} - -void gl_lcms_update_options(struct gl_lcms *p) -{ - if ((p->using_memory_profile && !p->opts->profile_auto) || - !bstr_equals(bstr0(p->opts->profile), bstr0(p->current_profile))) - { - load_profile(p); - } - - p->changed = true; // probably -} - -// Warning: profile.start must point to a ta allocation, and the function -// takes over ownership. -// Returns whether the internal profile was changed. -bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) -{ - if (!p->opts->profile_auto || (p->opts->profile && p->opts->profile[0])) { - talloc_free(profile.start); - return false; - } - - if (p->using_memory_profile && - p->icc_data && profile.start && - profile.len == p->icc_size && - memcmp(profile.start, p->icc_data, p->icc_size) == 0) - { - talloc_free(profile.start); - return false; - } - - p->changed = true; - p->using_memory_profile = true; - - talloc_free(p->icc_data); - - p->icc_data = talloc_steal(p, profile.start); - p->icc_size = profile.len; - - return true; -} - -// Guards against NULL and uses bstr_equals to short-circuit some special cases -static bool vid_profile_eq(struct AVBufferRef *a, struct AVBufferRef *b) -{ - if (!a || !b) - return a == b; - - return bstr_equals((struct bstr){ a->data, a->size }, - (struct bstr){ b->data, b->size }); -} - -// Return whether the profile or config has changed since the last time it was -// retrieved. If it has changed, gl_lcms_get_lut3d() should be called. -bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, - enum mp_csp_trc trc, struct AVBufferRef *vid_profile) -{ - if (p->changed || p->current_prim != prim || p->current_trc != trc) - return true; - - return !vid_profile_eq(p->vid_profile, vid_profile); -} - -// Whether a profile is set. (gl_lcms_get_lut3d() is expected to return a lut, -// but it could still fail due to runtime errors, such as invalid icc data.) 
-bool gl_lcms_has_profile(struct gl_lcms *p) -{ - return p->icc_size > 0; -} - -static cmsHPROFILE get_vid_profile(struct gl_lcms *p, cmsContext cms, - cmsHPROFILE disp_profile, - enum mp_csp_prim prim, enum mp_csp_trc trc) -{ - if (p->opts->use_embedded && p->vid_profile) { - // Try using the embedded ICC profile - cmsHPROFILE prof = cmsOpenProfileFromMemTHR(cms, p->vid_profile->data, - p->vid_profile->size); - if (prof) { - MP_VERBOSE(p, "Successfully opened embedded ICC profile\n"); - return prof; - } - - // Otherwise, warn the user and generate the profile as usual - MP_WARN(p, "Video contained an invalid ICC profile! Ignoring..\n"); - } - - // The input profile for the transformation is dependent on the video - // primaries and transfer characteristics - struct mp_csp_primaries csp = mp_get_csp_primaries(prim); - cmsCIExyY wp_xyY = {csp.white.x, csp.white.y, 1.0}; - cmsCIExyYTRIPLE prim_xyY = { - .Red = {csp.red.x, csp.red.y, 1.0}, - .Green = {csp.green.x, csp.green.y, 1.0}, - .Blue = {csp.blue.x, csp.blue.y, 1.0}, - }; - - cmsToneCurve *tonecurve[3] = {0}; - switch (trc) { - case MP_CSP_TRC_LINEAR: tonecurve[0] = cmsBuildGamma(cms, 1.0); break; - case MP_CSP_TRC_GAMMA18: tonecurve[0] = cmsBuildGamma(cms, 1.8); break; - case MP_CSP_TRC_GAMMA22: tonecurve[0] = cmsBuildGamma(cms, 2.2); break; - case MP_CSP_TRC_GAMMA28: tonecurve[0] = cmsBuildGamma(cms, 2.8); break; - - case MP_CSP_TRC_SRGB: - // Values copied from Little-CMS - tonecurve[0] = cmsBuildParametricToneCurve(cms, 4, - (double[5]){2.40, 1/1.055, 0.055/1.055, 1/12.92, 0.04045}); - break; - - case MP_CSP_TRC_PRO_PHOTO: - tonecurve[0] = cmsBuildParametricToneCurve(cms, 4, - (double[5]){1.8, 1.0, 0.0, 1/16.0, 0.03125}); - break; - - case MP_CSP_TRC_BT_1886: { - // To build an appropriate BT.1886 transformation we need access to - // the display's black point, so we LittleCMS' detection function. - // Relative colorimetric is used since we want to approximate the - // BT.1886 to the target device's actual black point even in e.g. - // perceptual mode - const int intent = MP_INTENT_RELATIVE_COLORIMETRIC; - cmsCIEXYZ bp_XYZ; - if (!cmsDetectBlackPoint(&bp_XYZ, disp_profile, intent, 0)) - return false; - - // Map this XYZ value back into the (linear) source space - cmsToneCurve *linear = cmsBuildGamma(cms, 1.0); - cmsHPROFILE rev_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY, - (cmsToneCurve*[3]){linear, linear, linear}); - cmsHPROFILE xyz_profile = cmsCreateXYZProfile(); - cmsHTRANSFORM xyz2src = cmsCreateTransformTHR(cms, - xyz_profile, TYPE_XYZ_DBL, rev_profile, TYPE_RGB_DBL, - intent, 0); - cmsFreeToneCurve(linear); - cmsCloseProfile(rev_profile); - cmsCloseProfile(xyz_profile); - if (!xyz2src) - return false; - - double src_black[3]; - cmsDoTransform(xyz2src, &bp_XYZ, src_black, 1); - cmsDeleteTransform(xyz2src); - - // Contrast limiting - if (p->opts->contrast > 0) { - for (int i = 0; i < 3; i++) - src_black[i] = MPMAX(src_black[i], 1.0 / p->opts->contrast); - } - - // Built-in contrast failsafe - double contrast = 3.0 / (src_black[0] + src_black[1] + src_black[2]); - if (contrast > 100000) { - MP_WARN(p, "ICC profile detected contrast very high (>100000)," - " falling back to contrast 1000 for sanity. 
Set the" - " icc-contrast option to silence this warning.\n"); - src_black[0] = src_black[1] = src_black[2] = 1.0 / 1000; - } - - // Build the parametric BT.1886 transfer curve, one per channel - for (int i = 0; i < 3; i++) { - const double gamma = 2.40; - double binv = pow(src_black[i], 1.0/gamma); - tonecurve[i] = cmsBuildParametricToneCurve(cms, 6, - (double[4]){gamma, 1.0 - binv, binv, 0.0}); - } - break; - } - - default: - abort(); - } - - if (!tonecurve[0]) - return false; - - if (!tonecurve[1]) tonecurve[1] = tonecurve[0]; - if (!tonecurve[2]) tonecurve[2] = tonecurve[0]; - - cmsHPROFILE *vid_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY, - tonecurve); - - if (tonecurve[2] != tonecurve[0]) cmsFreeToneCurve(tonecurve[2]); - if (tonecurve[1] != tonecurve[0]) cmsFreeToneCurve(tonecurve[1]); - cmsFreeToneCurve(tonecurve[0]); - - return vid_profile; -} - -bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d, - enum mp_csp_prim prim, enum mp_csp_trc trc, - struct AVBufferRef *vid_profile) -{ - int s_r, s_g, s_b; - bool result = false; - - p->changed = false; - p->current_prim = prim; - p->current_trc = trc; - - // We need to hold on to a reference to the video's ICC profile for as long - // as we still need to perform equality checking, so generate a new - // reference here - av_buffer_unref(&p->vid_profile); - if (vid_profile) { - MP_VERBOSE(p, "Got an embedded ICC profile.\n"); - p->vid_profile = av_buffer_ref(vid_profile); - if (!p->vid_profile) - abort(); - } - - if (!parse_3dlut_size(p->opts->size_str, &s_r, &s_g, &s_b)) - return false; - - if (!gl_lcms_has_profile(p)) - return false; - - void *tmp = talloc_new(NULL); - uint16_t *output = talloc_array(tmp, uint16_t, s_r * s_g * s_b * 4); - struct lut3d *lut = NULL; - cmsContext cms = NULL; - - char *cache_file = NULL; - if (p->opts->cache_dir && p->opts->cache_dir[0]) { - // Gamma is included in the header to help uniquely identify it, - // because we may change the parameter in the future or make it - // customizable, same for the primaries. 
- char *cache_info = talloc_asprintf(tmp, - "ver=1.4, intent=%d, size=%dx%dx%d, prim=%d, trc=%d, " - "contrast=%d\n", - p->opts->intent, s_r, s_g, s_b, prim, trc, p->opts->contrast); - - uint8_t hash[32]; - struct AVSHA *sha = av_sha_alloc(); - if (!sha) - abort(); - av_sha_init(sha, 256); - av_sha_update(sha, cache_info, strlen(cache_info)); - if (vid_profile) - av_sha_update(sha, vid_profile->data, vid_profile->size); - av_sha_update(sha, p->icc_data, p->icc_size); - av_sha_final(sha, hash); - av_free(sha); - - char *cache_dir = mp_get_user_path(tmp, p->global, p->opts->cache_dir); - cache_file = talloc_strdup(tmp, ""); - for (int i = 0; i < sizeof(hash); i++) - cache_file = talloc_asprintf_append(cache_file, "%02X", hash[i]); - cache_file = mp_path_join(tmp, cache_dir, cache_file); - - mp_mkdirp(cache_dir); - } - - // check cache - if (cache_file && stat(cache_file, &(struct stat){0}) == 0) { - MP_VERBOSE(p, "Opening 3D LUT cache in file '%s'.\n", cache_file); - struct bstr cachedata = stream_read_file(cache_file, tmp, p->global, - 1000000000); // 1 GB - if (cachedata.len == talloc_get_size(output)) { - memcpy(output, cachedata.start, cachedata.len); - goto done; - } else { - MP_WARN(p, "3D LUT cache invalid!\n"); - } - } - - cms = cmsCreateContext(NULL, p); - if (!cms) - goto error_exit; - cmsSetLogErrorHandlerTHR(cms, lcms2_error_handler); - - cmsHPROFILE profile = - cmsOpenProfileFromMemTHR(cms, p->icc_data, p->icc_size); - if (!profile) - goto error_exit; - - cmsHPROFILE vid_hprofile = get_vid_profile(p, cms, profile, prim, trc); - if (!vid_hprofile) { - cmsCloseProfile(profile); - goto error_exit; - } - - cmsHTRANSFORM trafo = cmsCreateTransformTHR(cms, vid_hprofile, TYPE_RGB_16, - profile, TYPE_RGBA_16, - p->opts->intent, - cmsFLAGS_HIGHRESPRECALC | - cmsFLAGS_BLACKPOINTCOMPENSATION); - cmsCloseProfile(profile); - cmsCloseProfile(vid_hprofile); - - if (!trafo) - goto error_exit; - - // transform a (s_r)x(s_g)x(s_b) cube, with 3 components per channel - uint16_t *input = talloc_array(tmp, uint16_t, s_r * 3); - for (int b = 0; b < s_b; b++) { - for (int g = 0; g < s_g; g++) { - for (int r = 0; r < s_r; r++) { - input[r * 3 + 0] = r * 65535 / (s_r - 1); - input[r * 3 + 1] = g * 65535 / (s_g - 1); - input[r * 3 + 2] = b * 65535 / (s_b - 1); - } - size_t base = (b * s_r * s_g + g * s_r) * 4; - cmsDoTransform(trafo, input, output + base, s_r); - } - } - - cmsDeleteTransform(trafo); - - if (cache_file) { - FILE *out = fopen(cache_file, "wb"); - if (out) { - fwrite(output, talloc_get_size(output), 1, out); - fclose(out); - } - } - -done: ; - - lut = talloc_ptrtype(NULL, lut); - *lut = (struct lut3d) { - .data = talloc_steal(lut, output), - .size = {s_r, s_g, s_b}, - }; - - *result_lut3d = lut; - result = true; - -error_exit: - - if (cms) - cmsDeleteContext(cms); - - if (!lut) - MP_FATAL(p, "Error loading ICC profile.\n"); - - talloc_free(tmp); - return result; -} - -#else /* HAVE_LCMS2 */ - -const struct m_sub_options mp_icc_conf = { - .opts = (const m_option_t[]) { {0} }, - .size = sizeof(struct mp_icc_opts), - .defaults = &(const struct mp_icc_opts) {0}, -}; - -struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, - struct mpv_global *global, - struct mp_icc_opts *opts) -{ - return (struct gl_lcms *) talloc_new(talloc_ctx); -} - -void gl_lcms_update_options(struct gl_lcms *p) { } -bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) {return false;} - -bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, - enum mp_csp_trc trc, struct AVBufferRef 
*vid_profile) -{ - return false; -} - -bool gl_lcms_has_profile(struct gl_lcms *p) -{ - return false; -} - -bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d, - enum mp_csp_prim prim, enum mp_csp_trc trc, - struct AVBufferRef *vid_profile) -{ - return false; -} - -#endif diff --git a/video/out/opengl/lcms.h b/video/out/opengl/lcms.h deleted file mode 100644 index 35bbd61fe0..0000000000 --- a/video/out/opengl/lcms.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef MP_GL_LCMS_H -#define MP_GL_LCMS_H - -#include -#include -#include "misc/bstr.h" -#include "video/csputils.h" -#include - -extern const struct m_sub_options mp_icc_conf; - -struct mp_icc_opts { - int use_embedded; - char *profile; - int profile_auto; - char *cache_dir; - char *size_str; - int intent; - int contrast; -}; - -struct lut3d { - uint16_t *data; - int size[3]; -}; - -struct mp_log; -struct mpv_global; -struct gl_lcms; - -struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, - struct mpv_global *global, - struct mp_icc_opts *opts); -void gl_lcms_update_options(struct gl_lcms *p); -bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile); -bool gl_lcms_has_profile(struct gl_lcms *p); -bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **, - enum mp_csp_prim prim, enum mp_csp_trc trc, - struct AVBufferRef *vid_profile); -bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, - enum mp_csp_trc trc, struct AVBufferRef *vid_profile); - -#endif diff --git a/video/out/opengl/osd.c b/video/out/opengl/osd.c deleted file mode 100644 index f7c325d1db..0000000000 --- a/video/out/opengl/osd.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#include -#include -#include - -#include - -#include "common/common.h" -#include "common/msg.h" -#include "video/csputils.h" -#include "video/mp_image.h" -#include "osd.h" - -#define GLSL(x) gl_sc_add(sc, #x "\n"); - -// glBlendFuncSeparate() arguments -static const int blend_factors[SUBBITMAP_COUNT][4] = { - [SUBBITMAP_LIBASS] = {RA_BLEND_SRC_ALPHA, RA_BLEND_ONE_MINUS_SRC_ALPHA, - RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA}, - [SUBBITMAP_RGBA] = {RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA, - RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA}, -}; - -struct vertex { - float position[2]; - float texcoord[2]; - uint8_t ass_color[4]; -}; - -static const struct ra_renderpass_input vertex_vao[] = { - {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, - {"texcoord" , RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord)}, - {"ass_color", RA_VARTYPE_BYTE_UNORM, 4, 1, offsetof(struct vertex, ass_color)}, - {0} -}; - -struct mpgl_osd_part { - enum sub_bitmap_format format; - int change_id; - struct ra_tex *texture; - int w, h; - int num_subparts; - int prev_num_subparts; - struct sub_bitmap *subparts; - int num_vertices; - struct vertex *vertices; -}; - -struct mpgl_osd { - struct mp_log *log; - struct osd_state *osd; - struct ra *ra; - struct mpgl_osd_part *parts[MAX_OSD_PARTS]; - const struct ra_format *fmt_table[SUBBITMAP_COUNT]; - bool formats[SUBBITMAP_COUNT]; - bool change_flag; // for reporting to API user only - // temporary - int stereo_mode; - struct mp_osd_res osd_res; - void *scratch; -}; - -struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, - struct osd_state *osd) -{ - struct mpgl_osd *ctx = talloc_ptrtype(NULL, ctx); - *ctx = (struct mpgl_osd) { - .log = log, - .osd = osd, - .ra = ra, - .change_flag = true, - .scratch = talloc_zero_size(ctx, 1), - }; - - ctx->fmt_table[SUBBITMAP_LIBASS] = ra_find_unorm_format(ra, 1, 1); - ctx->fmt_table[SUBBITMAP_RGBA] = ra_find_unorm_format(ra, 1, 4); - - for (int n = 0; n < MAX_OSD_PARTS; n++) - ctx->parts[n] = talloc_zero(ctx, struct mpgl_osd_part); - - for (int n = 0; n < SUBBITMAP_COUNT; n++) - ctx->formats[n] = !!ctx->fmt_table[n]; - - return ctx; -} - -void mpgl_osd_destroy(struct mpgl_osd *ctx) -{ - if (!ctx) - return; - - for (int n = 0; n < MAX_OSD_PARTS; n++) { - struct mpgl_osd_part *p = ctx->parts[n]; - ra_tex_free(ctx->ra, &p->texture); - } - talloc_free(ctx); -} - -static int next_pow2(int v) -{ - for (int x = 0; x < 30; x++) { - if ((1 << x) >= v) - return 1 << x; - } - return INT_MAX; -} - -static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd, - struct sub_bitmaps *imgs) -{ - struct ra *ra = ctx->ra; - bool ok = false; - - assert(imgs->packed); - - int req_w = next_pow2(imgs->packed_w); - int req_h = next_pow2(imgs->packed_h); - - const struct ra_format *fmt = ctx->fmt_table[imgs->format]; - assert(fmt); - - if (!osd->texture || req_w > osd->w || req_h > osd->h || - osd->format != imgs->format) - { - ra_tex_free(ra, &osd->texture); - - osd->format = imgs->format; - osd->w = FFMAX(32, req_w); - osd->h = FFMAX(32, req_h); - - MP_VERBOSE(ctx, "Reallocating OSD texture to %dx%d.\n", osd->w, osd->h); - - if (osd->w > ra->max_texture_wh || osd->h > ra->max_texture_wh) { - MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum " - "supported size %dx%d.\n", ra->max_texture_wh, - ra->max_texture_wh); - goto done; - } - - struct ra_tex_params params = { - .dimensions = 2, - .w = osd->w, - .h = osd->h, - .d = 1, - .format = fmt, - .render_src = true, - .src_linear = true, - 
.host_mutable = true, - }; - osd->texture = ra_tex_create(ra, ¶ms); - if (!osd->texture) - goto done; - } - - struct ra_tex_upload_params params = { - .tex = osd->texture, - .src = imgs->packed->planes[0], - .invalidate = true, - .rc = &(struct mp_rect){0, 0, imgs->packed_w, imgs->packed_h}, - .stride = imgs->packed->stride[0], - }; - - ok = ra->fns->tex_upload(ra, ¶ms); - -done: - return ok; -} - -static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs) -{ - struct mpgl_osd *ctx = pctx; - - if (imgs->num_parts == 0 || !ctx->formats[imgs->format]) - return; - - struct mpgl_osd_part *osd = ctx->parts[imgs->render_index]; - - bool ok = true; - if (imgs->change_id != osd->change_id) { - if (!upload_osd(ctx, osd, imgs)) - ok = false; - - osd->change_id = imgs->change_id; - ctx->change_flag = true; - } - osd->num_subparts = ok ? imgs->num_parts : 0; - - MP_TARRAY_GROW(osd, osd->subparts, osd->num_subparts); - memcpy(osd->subparts, imgs->parts, - osd->num_subparts * sizeof(osd->subparts[0])); -} - -bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, - struct gl_shader_cache *sc) -{ - assert(index >= 0 && index < MAX_OSD_PARTS); - struct mpgl_osd_part *part = ctx->parts[index]; - - enum sub_bitmap_format fmt = part->format; - if (!fmt || !part->num_subparts) - return false; - - gl_sc_uniform_texture(sc, "osdtex", part->texture); - switch (fmt) { - case SUBBITMAP_RGBA: { - GLSL(color = texture(osdtex, texcoord).bgra;) - break; - } - case SUBBITMAP_LIBASS: { - GLSL(color = - vec4(ass_color.rgb, ass_color.a * texture(osdtex, texcoord).r);) - break; - } - default: - abort(); - } - - gl_sc_set_vertex_format(sc, vertex_vao, sizeof(struct vertex)); - - return true; -} - -static void write_quad(struct vertex *va, struct gl_transform t, - float x0, float y0, float x1, float y1, - float tx0, float ty0, float tx1, float ty1, - float tex_w, float tex_h, const uint8_t color[4]) -{ - gl_transform_vec(t, &x0, &y0); - gl_transform_vec(t, &x1, &y1); - -#define COLOR_INIT {color[0], color[1], color[2], color[3]} - va[0] = (struct vertex){ {x0, y0}, {tx0 / tex_w, ty0 / tex_h}, COLOR_INIT }; - va[1] = (struct vertex){ {x0, y1}, {tx0 / tex_w, ty1 / tex_h}, COLOR_INIT }; - va[2] = (struct vertex){ {x1, y0}, {tx1 / tex_w, ty0 / tex_h}, COLOR_INIT }; - va[3] = (struct vertex){ {x1, y1}, {tx1 / tex_w, ty1 / tex_h}, COLOR_INIT }; - va[4] = va[2]; - va[5] = va[1]; -#undef COLOR_INIT -} - -static void generate_verts(struct mpgl_osd_part *part, struct gl_transform t) -{ - int num_vertices = part->num_subparts * 6; - MP_TARRAY_GROW(part, part->vertices, part->num_vertices + num_vertices); - - for (int n = 0; n < part->num_subparts; n++) { - struct sub_bitmap *b = &part->subparts[n]; - struct vertex *va = &part->vertices[part->num_vertices]; - - // NOTE: the blend color is used with SUBBITMAP_LIBASS only, so it - // doesn't matter that we upload garbage for the other formats - uint32_t c = b->libass.color; - uint8_t color[4] = { c >> 24, (c >> 16) & 0xff, - (c >> 8) & 0xff, 255 - (c & 0xff) }; - - write_quad(&va[n * 6], t, - b->x, b->y, b->x + b->dw, b->y + b->dh, - b->src_x, b->src_y, b->src_x + b->w, b->src_y + b->h, - part->w, part->h, color); - } - - part->num_vertices += num_vertices; -} - -// number of screen divisions per axis (x=0, y=1) for the current 3D mode -static void get_3d_side_by_side(int stereo_mode, int div[2]) -{ - div[0] = div[1] = 1; - switch (stereo_mode) { - case MP_STEREO3D_SBS2L: - case MP_STEREO3D_SBS2R: div[0] = 2; break; - case MP_STEREO3D_AB2R: - case MP_STEREO3D_AB2L: div[1] = 2; 
break; - } -} - -void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, - struct gl_shader_cache *sc, struct fbodst target) -{ - struct mpgl_osd_part *part = ctx->parts[index]; - - int div[2]; - get_3d_side_by_side(ctx->stereo_mode, div); - - part->num_vertices = 0; - - for (int x = 0; x < div[0]; x++) { - for (int y = 0; y < div[1]; y++) { - struct gl_transform t; - gl_transform_ortho_fbodst(&t, target); - - float a_x = ctx->osd_res.w * x; - float a_y = ctx->osd_res.h * y; - t.t[0] += a_x * t.m[0][0] + a_y * t.m[1][0]; - t.t[1] += a_x * t.m[0][1] + a_y * t.m[1][1]; - - generate_verts(part, t); - } - } - - const int *factors = &blend_factors[part->format][0]; - gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]); - - gl_sc_dispatch_draw(sc, target.tex, part->vertices, part->num_vertices); -} - -static void set_res(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode) -{ - int div[2]; - get_3d_side_by_side(stereo_mode, div); - - res.w /= div[0]; - res.h /= div[1]; - ctx->osd_res = res; -} - -void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts, - int stereo_mode, int draw_flags) -{ - for (int n = 0; n < MAX_OSD_PARTS; n++) - ctx->parts[n]->num_subparts = 0; - - set_res(ctx, res, stereo_mode); - - osd_draw(ctx->osd, ctx->osd_res, pts, draw_flags, ctx->formats, gen_osd_cb, ctx); - ctx->stereo_mode = stereo_mode; - - // Parts going away does not necessarily result in gen_osd_cb() being called - // (not even with num_parts==0), so check this separately. - for (int n = 0; n < MAX_OSD_PARTS; n++) { - struct mpgl_osd_part *part = ctx->parts[n]; - if (part->num_subparts != part->prev_num_subparts) - ctx->change_flag = true; - part->prev_num_subparts = part->num_subparts; - } -} - -// See osd_resize() for remarks. This function is an optional optimization too. 
-void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode) -{ - set_res(ctx, res, stereo_mode); - osd_resize(ctx->osd, ctx->osd_res); -} - -bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res, - double pts) -{ - ctx->change_flag = false; - mpgl_osd_generate(ctx, *res, pts, 0, 0); - return ctx->change_flag; -} diff --git a/video/out/opengl/osd.h b/video/out/opengl/osd.h deleted file mode 100644 index 6c2b886de3..0000000000 --- a/video/out/opengl/osd.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef MPLAYER_GL_OSD_H -#define MPLAYER_GL_OSD_H - -#include -#include - -#include "utils.h" -#include "shader_cache.h" -#include "sub/osd.h" - -struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, - struct osd_state *osd); -void mpgl_osd_destroy(struct mpgl_osd *ctx); - -void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts, - int stereo_mode, int draw_flags); -void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode); -bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, - struct gl_shader_cache *sc); -void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, - struct gl_shader_cache *sc, struct fbodst target); -bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res, - double pts); - -#endif diff --git a/video/out/opengl/ra.c b/video/out/opengl/ra.c deleted file mode 100644 index ef1de54d1a..0000000000 --- a/video/out/opengl/ra.c +++ /dev/null @@ -1,327 +0,0 @@ -#include "common/common.h" -#include "common/msg.h" -#include "video/img_format.h" - -#include "ra.h" - -struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params) -{ - return ra->fns->tex_create(ra, params); -} - -void ra_tex_free(struct ra *ra, struct ra_tex **tex) -{ - if (*tex) - ra->fns->tex_destroy(ra, *tex); - *tex = NULL; -} - -struct ra_buf *ra_buf_create(struct ra *ra, const struct ra_buf_params *params) -{ - return ra->fns->buf_create(ra, params); -} - -void ra_buf_free(struct ra *ra, struct ra_buf **buf) -{ - if (*buf) - ra->fns->buf_destroy(ra, *buf); - *buf = NULL; -} - -void ra_free(struct ra **ra) -{ - if (*ra) - (*ra)->fns->destroy(*ra); - talloc_free(*ra); - *ra = NULL; -} - -size_t ra_vartype_size(enum ra_vartype type) -{ - switch (type) { - case RA_VARTYPE_INT: return sizeof(int); - case RA_VARTYPE_FLOAT: return sizeof(float); - case RA_VARTYPE_BYTE_UNORM: return 1; - default: return 0; - } -} - -struct ra_layout ra_renderpass_input_layout(struct ra_renderpass_input *input) -{ - size_t el_size = ra_vartype_size(input->type); - if (!el_size) - return (struct ra_layout){0}; - - // host data is always tightly packed - return (struct ra_layout) { - .align = 1, - .stride = el_size * input->dim_v, - .size = el_size * input->dim_v * input->dim_m, - }; -} - -static struct ra_renderpass_input *dup_inputs(void *ta_parent, - const struct ra_renderpass_input *inputs, int num_inputs) -{ - struct ra_renderpass_input *res = - talloc_memdup(ta_parent, (void *)inputs, num_inputs * sizeof(inputs[0])); - for (int n = 0; n < num_inputs; n++) - res[n].name = talloc_strdup(res, res[n].name); - return res; -} - -// Return a newly allocated deep-copy of params. 
-struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, - const struct ra_renderpass_params *params) -{ - struct ra_renderpass_params *res = talloc_ptrtype(ta_parent, res); - *res = *params; - res->inputs = dup_inputs(res, res->inputs, res->num_inputs); - res->vertex_attribs = - dup_inputs(res, res->vertex_attribs, res->num_vertex_attribs); - res->cached_program = bstrdup(res, res->cached_program); - res->vertex_shader = talloc_strdup(res, res->vertex_shader); - res->frag_shader = talloc_strdup(res, res->frag_shader); - res->compute_shader = talloc_strdup(res, res->compute_shader); - return res; -}; - - -// Return whether this is a tightly packed format with no external padding and -// with the same bit size/depth in all components, and the shader returns -// components in the same order as in memory. -static bool ra_format_is_regular(const struct ra_format *fmt) -{ - if (!fmt->pixel_size || !fmt->num_components || !fmt->ordered) - return false; - for (int n = 1; n < fmt->num_components; n++) { - if (fmt->component_size[n] != fmt->component_size[0] || - fmt->component_depth[n] != fmt->component_depth[0]) - return false; - } - if (fmt->component_size[0] * fmt->num_components != fmt->pixel_size * 8) - return false; - return true; -} - -// Return a regular filterable format using RA_CTYPE_UNORM. -const struct ra_format *ra_find_unorm_format(struct ra *ra, - int bytes_per_component, - int n_components) -{ - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (fmt->ctype == RA_CTYPE_UNORM && fmt->num_components == n_components && - fmt->pixel_size == bytes_per_component * n_components && - fmt->component_depth[0] == bytes_per_component * 8 && - fmt->linear_filter && ra_format_is_regular(fmt)) - return fmt; - } - return NULL; -} - -// Return a regular format using RA_CTYPE_UINT. -const struct ra_format *ra_find_uint_format(struct ra *ra, - int bytes_per_component, - int n_components) -{ - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (fmt->ctype == RA_CTYPE_UINT && fmt->num_components == n_components && - fmt->pixel_size == bytes_per_component * n_components && - fmt->component_depth[0] == bytes_per_component * 8 && - ra_format_is_regular(fmt)) - return fmt; - } - return NULL; -} - -// Find a float format of any precision that matches the C type of the same -// size for upload. -// May drop bits from the mantissa (such as selecting float16 even if -// bytes_per_component == 32); prefers possibly faster formats first. -static const struct ra_format *ra_find_float_format(struct ra *ra, - int bytes_per_component, - int n_components) -{ - // Assumes ra_format are ordered by performance. - // The >=16 check is to avoid catching fringe formats. - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (fmt->ctype == RA_CTYPE_FLOAT && fmt->num_components == n_components && - fmt->pixel_size == bytes_per_component * n_components && - fmt->component_depth[0] >= 16 && - fmt->linear_filter && ra_format_is_regular(fmt)) - return fmt; - } - return NULL; -} - -// Return a filterable regular format that uses at least float16 internally, and -// uses a normal C float for transfer on the CPU side. (This is just so we don't -// need 32->16 bit conversion on CPU, which would be messy.) 
-const struct ra_format *ra_find_float16_format(struct ra *ra, int n_components) -{ - return ra_find_float_format(ra, sizeof(float), n_components); -} - -const struct ra_format *ra_find_named_format(struct ra *ra, const char *name) -{ - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (strcmp(fmt->name, name) == 0) - return fmt; - } - return NULL; -} - -// Like ra_find_unorm_format(), but if no fixed point format is available, -// return an unsigned integer format. -static const struct ra_format *find_plane_format(struct ra *ra, int bytes, - int n_channels, - enum mp_component_type ctype) -{ - switch (ctype) { - case MP_COMPONENT_TYPE_UINT: { - const struct ra_format *f = ra_find_unorm_format(ra, bytes, n_channels); - if (f) - return f; - return ra_find_uint_format(ra, bytes, n_channels); - } - case MP_COMPONENT_TYPE_FLOAT: - return ra_find_float_format(ra, bytes, n_channels); - default: return NULL; - } -} - -// Put a mapping of imgfmt to texture formats into *out. Basically it selects -// the correct texture formats needed to represent an imgfmt in a shader, with -// textures using the same memory organization as on the CPU. -// Each plane is represented by a texture, and each texture has a RGBA -// component order. out->components describes the meaning of them. -// May return integer formats for >8 bit formats, if the driver has no -// normalized 16 bit formats. -// Returns false (and *out is not touched) if no format found. -bool ra_get_imgfmt_desc(struct ra *ra, int imgfmt, struct ra_imgfmt_desc *out) -{ - struct ra_imgfmt_desc res = {0}; - - struct mp_regular_imgfmt regfmt; - if (mp_get_regular_imgfmt(®fmt, imgfmt)) { - enum ra_ctype ctype = RA_CTYPE_UNKNOWN; - res.num_planes = regfmt.num_planes; - res.component_bits = regfmt.component_size * 8; - res.component_pad = regfmt.component_pad; - for (int n = 0; n < regfmt.num_planes; n++) { - struct mp_regular_imgfmt_plane *plane = ®fmt.planes[n]; - res.planes[n] = find_plane_format(ra, regfmt.component_size, - plane->num_components, - regfmt.component_type); - if (!res.planes[n]) - return false; - for (int i = 0; i < plane->num_components; i++) - res.components[n][i] = plane->components[i]; - // Dropping LSBs when shifting will lead to dropped MSBs. - if (res.component_bits > res.planes[n]->component_depth[0] && - res.component_pad < 0) - return false; - // Renderer restriction, but actually an unwanted corner case. - if (ctype != RA_CTYPE_UNKNOWN && ctype != res.planes[n]->ctype) - return false; - ctype = res.planes[n]->ctype; - } - res.chroma_w = regfmt.chroma_w; - res.chroma_h = regfmt.chroma_h; - goto supported; - } - - for (int n = 0; n < ra->num_formats; n++) { - if (imgfmt && ra->formats[n]->special_imgfmt == imgfmt) { - res = *ra->formats[n]->special_imgfmt_desc; - goto supported; - } - } - - // Unsupported format - return false; - -supported: - - *out = res; - return true; -} - -void ra_dump_tex_formats(struct ra *ra, int msgl) -{ - if (!mp_msg_test(ra->log, msgl)) - return; - MP_MSG(ra, msgl, "Texture formats:\n"); - MP_MSG(ra, msgl, " NAME COMP*TYPE SIZE DEPTH PER COMP.\n"); - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - const char *ctype = "unknown"; - switch (fmt->ctype) { - case RA_CTYPE_UNORM: ctype = "unorm"; break; - case RA_CTYPE_UINT: ctype = "uint "; break; - case RA_CTYPE_FLOAT: ctype = "float"; break; - } - char cl[40] = ""; - for (int i = 0; i < fmt->num_components; i++) { - mp_snprintf_cat(cl, sizeof(cl), "%s%d", i ? 
" " : "", - fmt->component_size[i]); - if (fmt->component_size[i] != fmt->component_depth[i]) - mp_snprintf_cat(cl, sizeof(cl), "/%d", fmt->component_depth[i]); - } - MP_MSG(ra, msgl, " %-10s %d*%s %3dB %s %s %s {%s}\n", fmt->name, - fmt->num_components, ctype, fmt->pixel_size, - fmt->luminance_alpha ? "LA" : " ", - fmt->linear_filter ? "LF" : " ", - fmt->renderable ? "CR" : " ", cl); - } - MP_MSG(ra, msgl, " LA = LUMINANCE_ALPHA hack format\n"); - MP_MSG(ra, msgl, " LF = linear filterable\n"); - MP_MSG(ra, msgl, " CR = can be used for render targets\n"); -} - -void ra_dump_imgfmt_desc(struct ra *ra, const struct ra_imgfmt_desc *desc, - int msgl) -{ - char pl[80] = ""; - char pf[80] = ""; - for (int n = 0; n < desc->num_planes; n++) { - if (n > 0) { - mp_snprintf_cat(pl, sizeof(pl), "/"); - mp_snprintf_cat(pf, sizeof(pf), "/"); - } - char t[5] = {0}; - for (int i = 0; i < 4; i++) - t[i] = "_rgba"[desc->components[n][i]]; - for (int i = 3; i > 0 && t[i] == '_'; i--) - t[i] = '\0'; - mp_snprintf_cat(pl, sizeof(pl), "%s", t); - mp_snprintf_cat(pf, sizeof(pf), "%s", desc->planes[n]->name); - } - MP_MSG(ra, msgl, "%d planes %dx%d %d/%d [%s] (%s)\n", - desc->num_planes, desc->chroma_w, desc->chroma_h, - desc->component_bits, desc->component_pad, pf, pl); -} - -void ra_dump_img_formats(struct ra *ra, int msgl) -{ - if (!mp_msg_test(ra->log, msgl)) - return; - MP_MSG(ra, msgl, "Image formats:\n"); - for (int imgfmt = IMGFMT_START; imgfmt < IMGFMT_END; imgfmt++) { - const char *name = mp_imgfmt_to_name(imgfmt); - if (strcmp(name, "unknown") == 0) - continue; - MP_MSG(ra, msgl, " %s", name); - struct ra_imgfmt_desc desc; - if (ra_get_imgfmt_desc(ra, imgfmt, &desc)) { - MP_MSG(ra, msgl, " => "); - ra_dump_imgfmt_desc(ra, &desc, msgl); - } else { - MP_MSG(ra, msgl, "\n"); - } - } -} diff --git a/video/out/opengl/ra.h b/video/out/opengl/ra.h deleted file mode 100644 index ae7fb9aea7..0000000000 --- a/video/out/opengl/ra.h +++ /dev/null @@ -1,491 +0,0 @@ -#pragma once - -#include "common/common.h" -#include "misc/bstr.h" - -// Handle for a rendering API backend. -struct ra { - struct ra_fns *fns; - void *priv; - - int glsl_version; // GLSL version (e.g. 300 => 3.0) - bool glsl_es; // use ES dialect - bool glsl_vulkan; // use vulkan dialect - - struct mp_log *log; - - // RA_CAP_* bit field. The RA backend must set supported features at init - // time. - uint64_t caps; - - // Maximum supported width and height of a 2D texture. Set by the RA backend - // at init time. - int max_texture_wh; - - // Maximum shared memory for compute shaders. Set by the RA backend at init - // time. - size_t max_shmem; - - // Set of supported texture formats. Must be added by RA backend at init time. - // If there are equivalent formats with different caveats, the preferred - // formats should have a lower index. (E.g. GLES3 should put rg8 before la.) - struct ra_format **formats; - int num_formats; - - // Accelerate texture uploads via an extra PBO even when - // RA_CAP_DIRECT_UPLOAD is supported. This is basically only relevant for - // OpenGL. Set by the RA user. 
- bool use_pbo; -}; - -enum { - RA_CAP_TEX_1D = 1 << 0, // supports 1D textures (as shader inputs) - RA_CAP_TEX_3D = 1 << 1, // supports 3D textures (as shader inputs) - RA_CAP_BLIT = 1 << 2, // supports ra_fns.blit - RA_CAP_COMPUTE = 1 << 3, // supports compute shaders - RA_CAP_DIRECT_UPLOAD = 1 << 4, // supports tex_upload without ra_buf - RA_CAP_BUF_RO = 1 << 5, // supports RA_VARTYPE_BUF_RO - RA_CAP_BUF_RW = 1 << 6, // supports RA_VARTYPE_BUF_RW - RA_CAP_NESTED_ARRAY = 1 << 7, // supports nested arrays - RA_CAP_SHARED_BINDING = 1 << 8, // sampler/image/buffer namespaces are disjoint - RA_CAP_GLOBAL_UNIFORM = 1 << 9, // supports using "naked" uniforms (not UBO) -}; - -enum ra_ctype { - RA_CTYPE_UNKNOWN = 0, // also used for inconsistent multi-component formats - RA_CTYPE_UNORM, // unsigned normalized integer (fixed point) formats - RA_CTYPE_UINT, // full integer formats - RA_CTYPE_FLOAT, // float formats (signed, any bit size) -}; - -// All formats must be useable as texture formats. All formats must be byte -// aligned (all pixels start and end on a byte boundary), at least as far CPU -// transfers are concerned. -struct ra_format { - // All fields are read-only after creation. - const char *name; // symbolic name for user interaction/debugging - void *priv; - enum ra_ctype ctype; // data type of each component - bool ordered; // components are sequential in memory, and returned - // by the shader in memory order (the shader can - // return arbitrary values for unused components) - int num_components; // component count, 0 if not applicable, max. 4 - int component_size[4]; // in bits, all entries 0 if not applicable - int component_depth[4]; // bits in use for each component, 0 if not applicable - // (_must_ be set if component_size[] includes padding, - // and the real procession as seen by shader is lower) - int pixel_size; // in bytes, total pixel size (0 if opaque) - bool luminance_alpha; // pre-GL_ARB_texture_rg hack for 2 component textures - // if this is set, shader must use .ra instead of .rg - // only applies to 2-component textures - bool linear_filter; // linear filtering available from shader - bool renderable; // can be used for render targets - - // If not 0, the format represents some sort of packed fringe format, whose - // shader representation is given by the special_imgfmt_desc pointer. - int special_imgfmt; - const struct ra_imgfmt_desc *special_imgfmt_desc; -}; - -struct ra_tex_params { - int dimensions; // 1-3 for 1D-3D textures - // Size of the texture. 1D textures require h=d=1, 2D textures require d=1. - int w, h, d; - const struct ra_format *format; - bool render_src; // must be useable as source texture in a shader - bool render_dst; // must be useable as target texture in a shader - bool storage_dst; // must be usable as a storage image (RA_VARTYPE_IMG_W) - bool blit_src; // must be usable as a blit source - bool blit_dst; // must be usable as a blit destination - bool host_mutable; // texture may be updated with tex_upload - // When used as render source texture. - bool src_linear; // if false, use nearest sampling (whether this can - // be true depends on ra_format.linear_filter) - bool src_repeat; // if false, clamp texture coordinates to edge - // if true, repeat texture coordinates - bool non_normalized; // hack for GL_TEXTURE_RECTANGLE OSX idiocy - // always set to false, except in OSX code - bool external_oes; // hack for GL_TEXTURE_EXTERNAL_OES idiocy - // If non-NULL, the texture will be created with these contents. 
Using - // this does *not* require setting host_mutable. Otherwise, the initial - // data is undefined. - void *initial_data; -}; - -// Conflates the following typical GPU API concepts: -// - texture itself -// - sampler state -// - staging buffers for texture upload -// - framebuffer objects -// - wrappers for swapchain framebuffers -// - synchronization needed for upload/rendering/etc. -struct ra_tex { - // All fields are read-only after creation. - struct ra_tex_params params; - void *priv; -}; - -struct ra_tex_upload_params { - struct ra_tex *tex; // Texture to upload to - bool invalidate; // Discard pre-existing data not in the region uploaded - // Uploading from buffer: - struct ra_buf *buf; // Buffer to upload from (mutually exclusive with `src`) - size_t buf_offset; // Start of data within buffer (bytes) - // Uploading directly: (Note: If RA_CAP_DIRECT_UPLOAD is not set, then this - // will be internally translated to a tex_upload buffer by the RA) - const void *src; // Address of data - // For 2D textures only: - struct mp_rect *rc; // Region to upload. NULL means entire image - ptrdiff_t stride; // The size of a horizontal line in bytes (*not* texels!) -}; - -// Buffer type hint. Setting this may result in more or less efficient -// operation, although it shouldn't technically prohibit anything -enum ra_buf_type { - RA_BUF_TYPE_INVALID, - RA_BUF_TYPE_TEX_UPLOAD, // texture upload buffer (pixel buffer object) - RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW - RA_BUF_TYPE_UNIFORM, // uniform buffer (UBO), for RA_VARTYPE_BUF_RO -}; - -struct ra_buf_params { - enum ra_buf_type type; - size_t size; - bool host_mapped; // create a read-writable persistent mapping (ra_buf.data) - bool host_mutable; // contents may be updated via buf_update() - // If non-NULL, the buffer will be created with these contents. Otherwise, - // the initial data is undefined. - void *initial_data; -}; - -// A generic buffer, which can be used for many purposes (texture upload, -// storage buffer, uniform buffer, etc.) -struct ra_buf { - // All fields are read-only after creation. - struct ra_buf_params params; - void *data; // for persistently mapped buffers, points to the first byte - void *priv; -}; - -// Type of a shader uniform variable, or a vertex attribute. In all cases, -// vectors are matrices are done by having more than 1 value. -enum ra_vartype { - RA_VARTYPE_INVALID, - RA_VARTYPE_INT, // C: int, GLSL: int, ivec* - RA_VARTYPE_FLOAT, // C: float, GLSL: float, vec*, mat* - RA_VARTYPE_TEX, // C: ra_tex*, GLSL: various sampler types - // ra_tex.params.render_src must be true - RA_VARTYPE_IMG_W, // C: ra_tex*, GLSL: various image types - // write-only (W) image for compute shaders - // ra_tex.params.storage_dst must be true - RA_VARTYPE_BYTE_UNORM, // C: uint8_t, GLSL: int, vec* (vertex data only) - RA_VARTYPE_BUF_RO, // C: ra_buf*, GLSL: uniform buffer block - // buf type must be RA_BUF_TYPE_UNIFORM - RA_VARTYPE_BUF_RW, // C: ra_buf*, GLSL: shader storage buffer block - // buf type must be RA_BUF_TYPE_SHADER_STORAGE - RA_VARTYPE_COUNT -}; - -// Returns the host size of a ra_vartype, or 0 for abstract vartypes (e.g. tex) -size_t ra_vartype_size(enum ra_vartype type); - -// Represents a uniform, texture input parameter, and similar things. -struct ra_renderpass_input { - const char *name; // name as used in the shader - enum ra_vartype type; - // The total number of values is given by dim_v * dim_m. 
- int dim_v; // vector dimension (1 for non-vector and non-matrix) - int dim_m; // additional matrix dimension (dim_v x dim_m) - // Vertex data: byte offset of the attribute into the vertex struct - size_t offset; - // RA_VARTYPE_TEX: texture unit - // RA_VARTYPE_IMG_W: image unit - // RA_VARTYPE_BUF_* buffer binding point - // Other uniforms: unused - // If RA_CAP_SHARED_BINDING is set, these may only be unique per input type. - // Otherwise, these must be unique for all input values. - int binding; -}; - -// Represents the layout requirements of an input value -struct ra_layout { - size_t align; // the alignment requirements (always a power of two) - size_t stride; // the delta between two rows of an array/matrix - size_t size; // the total size of the input -}; - -// Returns the host layout of a render pass input. Returns {0} for renderpass -// inputs without a corresponding host representation (e.g. textures/buffers) -struct ra_layout ra_renderpass_input_layout(struct ra_renderpass_input *input); - -enum ra_blend { - RA_BLEND_ZERO, - RA_BLEND_ONE, - RA_BLEND_SRC_ALPHA, - RA_BLEND_ONE_MINUS_SRC_ALPHA, -}; - -enum ra_renderpass_type { - RA_RENDERPASS_TYPE_INVALID, - RA_RENDERPASS_TYPE_RASTER, // vertex+fragment shader - RA_RENDERPASS_TYPE_COMPUTE, // compute shader -}; - -// Static part of a rendering pass. It conflates the following: -// - compiled shader and its list of uniforms -// - vertex attributes and its shader mappings -// - blending parameters -// (For Vulkan, this would be shader module + pipeline state.) -// Upon creation, the values of dynamic values such as uniform contents (whose -// initial values are not provided here) are required to be 0. -struct ra_renderpass_params { - enum ra_renderpass_type type; - - // Uniforms, including texture/sampler inputs. - struct ra_renderpass_input *inputs; - int num_inputs; - - // Highly implementation-specific byte array storing a compiled version - // of the program. Can be used to speed up shader compilation. A backend - // xan read this in renderpass_create, or set this on the newly created - // ra_renderpass params field. - bstr cached_program; - - // --- type==RA_RENDERPASS_TYPE_RASTER only - - // Describes the format of the vertex data. When using ra.glsl_vulkan, - // the order of this array must match the vertex attribute locations. - struct ra_renderpass_input *vertex_attribs; - int num_vertex_attribs; - int vertex_stride; - - // Format of the target texture - const struct ra_format *target_format; - - // Shader text, in GLSL. (Yes, you need a GLSL compiler.) - // These are complete shaders, including prelude and declarations. - const char *vertex_shader; - const char *frag_shader; - - // Target blending mode. If enable_blend is false, the blend_ fields can - // be ignored. - bool enable_blend; - enum ra_blend blend_src_rgb; - enum ra_blend blend_dst_rgb; - enum ra_blend blend_src_alpha; - enum ra_blend blend_dst_alpha; - - // --- type==RA_RENDERPASS_TYPE_COMPUTE only - - // Shader text, like vertex_shader/frag_shader. 
- const char *compute_shader; -}; - -struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, - const struct ra_renderpass_params *params); - -// Conflates the following typical GPU API concepts: -// - various kinds of shaders -// - rendering pipelines -// - descriptor sets, uniforms, other bindings -// - all synchronization necessary -// - the current values of all uniforms (this one makes it relatively stateful -// from an API perspective) -struct ra_renderpass { - // All fields are read-only after creation. - struct ra_renderpass_params params; - void *priv; -}; - -// An input value (see ra_renderpass_input). -struct ra_renderpass_input_val { - int index; // index into ra_renderpass_params.inputs[] - void *data; // pointer to data according to ra_renderpass_input - // (e.g. type==RA_VARTYPE_FLOAT+dim_v=3,dim_m=3 => float[9]) -}; - -// Parameters for performing a rendering pass (basically the dynamic params). -// These change potentially every time. -struct ra_renderpass_run_params { - struct ra_renderpass *pass; - - // Generally this lists parameters only which changed since the last - // invocation and need to be updated. The ra_renderpass instance is - // supposed to keep unchanged values from the previous run. - // For non-primitive types like textures, these entries are always added, - // even if they do not change. - struct ra_renderpass_input_val *values; - int num_values; - - // --- pass->params.type==RA_RENDERPASS_TYPE_RASTER only - - // target->params.render_dst must be true, and target->params.format must - // match pass->params.target_format. - struct ra_tex *target; - struct mp_rect viewport; - struct mp_rect scissors; - - // (The primitive type is always a triangle list.) - void *vertex_data; - int vertex_count; // number of vertex elements, not bytes - - // --- pass->params.type==RA_RENDERPASS_TYPE_COMPUTE only - - // Number of work groups to be run in X/Y/Z dimensions. - int compute_groups[3]; -}; - -// This is an opaque type provided by the implementation, but we want to at -// least give it a saner name than void* for code readability purposes. -typedef void ra_timer; - -// Rendering API entrypoints. (Note: there are some additional hidden features -// you need to take care of. For example, hwdec mapping will be provided -// separately from ra, but might need to call into ra private code.) -struct ra_fns { - void (*destroy)(struct ra *ra); - - // Create a texture (with undefined contents). Return NULL on failure. - // This is a rare operation, and normally textures and even FBOs for - // temporary rendering intermediate data are cached. - struct ra_tex *(*tex_create)(struct ra *ra, - const struct ra_tex_params *params); - - void (*tex_destroy)(struct ra *ra, struct ra_tex *tex); - - // Upload data to a texture. This is an extremely common operation. When - // using a buffer, the contants of the buffer must exactly match the image - // - conversions between bit depth etc. are not supported. The buffer *may* - // be marked as "in use" while this operation is going on, and the contents - // must not be touched again by the API user until buf_poll returns true. - // Returns whether successful. - bool (*tex_upload)(struct ra *ra, const struct ra_tex_upload_params *params); - - // Create a buffer. This can be used as a persistently mapped buffer, - // a uniform buffer, a shader storage buffer or possibly others. - // Not all usage types must be supported; may return NULL if unavailable. 
- struct ra_buf *(*buf_create)(struct ra *ra, - const struct ra_buf_params *params); - - void (*buf_destroy)(struct ra *ra, struct ra_buf *buf); - - // Update the contents of a buffer, starting at a given offset and up to a - // given size, with the contents of *data. This is an extremely common - // operation. Calling this while the buffer is considered "in use" is an - // error. (See: buf_poll) - void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, - const void *data, size_t size); - - // Returns if a buffer is currently "in use" or not. Updating the contents - // of a buffer (via buf_update or writing to buf->data) while it is still - // in use is an error and may result in graphical corruption. Optional, if - // NULL then all buffers are always usable. - bool (*buf_poll)(struct ra *ra, struct ra_buf *buf); - - // Returns the layout requirements of a uniform buffer element. Optional, - // but must be implemented if RA_CAP_BUF_RO is supported. - struct ra_layout (*uniform_layout)(struct ra_renderpass_input *inp); - - // Clear the dst with the given color (rgba) and within the given scissor. - // dst must have dst->params.render_dst==true. Content outside of the - // scissor is preserved. - void (*clear)(struct ra *ra, struct ra_tex *dst, float color[4], - struct mp_rect *scissor); - - // Copy a sub-rectangle from one texture to another. The source/dest region - // is always within the texture bounds. Areas outside the dest region are - // preserved. The formats of the textures must be losely compatible. The - // dst texture can be a swapchain framebuffer, but src can not. Only 2D - // textures are supported. - // The textures must have blit_src and blit_dst set, respectively. - // Rectangles with negative width/height lead to flipping, different src/dst - // sizes lead to point scaling. Coordinates are always in pixels. - // Optional. Only available if RA_CAP_BLIT is set (if it's not set, it must - // not be called, even if it's non-NULL). - void (*blit)(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, - struct mp_rect *dst_rc, struct mp_rect *src_rc); - - // Compile a shader and create a pipeline. This is a rare operation. - // The params pointer and anything it points to must stay valid until - // renderpass_destroy. - struct ra_renderpass *(*renderpass_create)(struct ra *ra, - const struct ra_renderpass_params *params); - - void (*renderpass_destroy)(struct ra *ra, struct ra_renderpass *pass); - - // Perform a render pass, basically drawing a list of triangles to a FBO. - // This is an extremely common operation. - void (*renderpass_run)(struct ra *ra, - const struct ra_renderpass_run_params *params); - - // Create a timer object. Returns NULL on failure, or if timers are - // unavailable for some reason. Optional. - ra_timer *(*timer_create)(struct ra *ra); - - void (*timer_destroy)(struct ra *ra, ra_timer *timer); - - // Start recording a timer. Note that valid usage requires you to pair - // every start with a stop. Trying to start a timer twice, or trying to - // stop a timer before having started it, consistutes invalid usage. - void (*timer_start)(struct ra *ra, ra_timer *timer); - - // Stop recording a timer. This also returns any results that have been - // measured since the last usage of this ra_timer. It's important to note - // that GPU timer measurement are asynchronous, so this function does not - // always produce a value - and the values it does produce are typically - // delayed by a few frames. When no value is available, this returns 0. 
- uint64_t (*timer_stop)(struct ra *ra, ra_timer *timer); - - // Hint that possibly queued up commands should be sent to the GPU. Optional. - void (*flush)(struct ra *ra); - - // Associates a marker with any past error messages, for debugging - // purposes. Optional. - void (*debug_marker)(struct ra *ra, const char *msg); -}; - -struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params); -void ra_tex_free(struct ra *ra, struct ra_tex **tex); - -struct ra_buf *ra_buf_create(struct ra *ra, const struct ra_buf_params *params); -void ra_buf_free(struct ra *ra, struct ra_buf **buf); - -void ra_free(struct ra **ra); - -const struct ra_format *ra_find_unorm_format(struct ra *ra, - int bytes_per_component, - int n_components); -const struct ra_format *ra_find_uint_format(struct ra *ra, - int bytes_per_component, - int n_components); -const struct ra_format *ra_find_float16_format(struct ra *ra, int n_components); -const struct ra_format *ra_find_named_format(struct ra *ra, const char *name); - -struct ra_imgfmt_desc { - int num_planes; - const struct ra_format *planes[4]; - // Chroma pixel size (1x1 is 4:4:4) - uint8_t chroma_w, chroma_h; - // Component storage size in bits (possibly padded). For formats with - // different sizes per component, this is arbitrary. For padded formats - // like P010 or YUV420P10, padding is included. - int component_bits; - // Like mp_regular_imgfmt.component_pad. - int component_pad; - // For each texture and each texture output (rgba order) describe what - // component it returns. - // The values are like the values in mp_regular_imgfmt_plane.components[]. - // Access as components[plane_nr][component_index]. Set unused items to 0. - // For ra_format.luminance_alpha, this returns 1/2 ("rg") instead of 1/4 - // ("ra"). the logic is that the texture format has 2 channels, thus the - // data must be returned in the first two components. The renderer fixes - // this later. 
- uint8_t components[4][4]; -}; - -bool ra_get_imgfmt_desc(struct ra *ra, int imgfmt, struct ra_imgfmt_desc *out); - -void ra_dump_tex_formats(struct ra *ra, int msgl); -void ra_dump_imgfmt_desc(struct ra *ra, const struct ra_imgfmt_desc *desc, - int msgl); -void ra_dump_img_formats(struct ra *ra, int msgl); diff --git a/video/out/opengl/ra_gl.c b/video/out/opengl/ra_gl.c index 0d99877a9e..ccb8755ba6 100644 --- a/video/out/opengl/ra_gl.c +++ b/video/out/opengl/ra_gl.c @@ -1097,12 +1097,6 @@ static uint64_t gl_timer_stop(struct ra *ra, ra_timer *ratimer) return timer->result; } -static void gl_flush(struct ra *ra) -{ - GL *gl = ra_gl_get(ra); - gl->Flush(); -} - static void gl_debug_marker(struct ra *ra, const char *msg) { struct ra_gl *p = ra->priv; @@ -1130,6 +1124,5 @@ static struct ra_fns ra_fns_gl = { .timer_destroy = gl_timer_destroy, .timer_start = gl_timer_start, .timer_stop = gl_timer_stop, - .flush = gl_flush, .debug_marker = gl_debug_marker, }; diff --git a/video/out/opengl/ra_gl.h b/video/out/opengl/ra_gl.h index e5e09a0197..9844977801 100644 --- a/video/out/opengl/ra_gl.h +++ b/video/out/opengl/ra_gl.h @@ -1,8 +1,7 @@ #pragma once #include "common.h" -#include "ra.h" -#include "gl_utils.h" +#include "utils.h" struct ra *ra_create_gl(GL *gl, struct mp_log *log); struct ra_tex *ra_create_wrapped_tex(struct ra *ra, diff --git a/video/out/opengl/shader_cache.c b/video/out/opengl/shader_cache.c deleted file mode 100644 index 90a757617b..0000000000 --- a/video/out/opengl/shader_cache.c +++ /dev/null @@ -1,955 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "osdep/io.h" - -#include "common/common.h" -#include "options/path.h" -#include "stream/stream.h" -#include "shader_cache.h" -#include "formats.h" -#include "utils.h" - -// Force cache flush if more than this number of shaders is created. -#define SC_MAX_ENTRIES 48 - -union uniform_val { - float f[9]; // RA_VARTYPE_FLOAT - int i[4]; // RA_VARTYPE_INT - struct ra_tex *tex; // RA_VARTYPE_TEX, RA_VARTYPE_IMG_* - struct ra_buf *buf; // RA_VARTYPE_BUF_* -}; - -enum sc_uniform_type { - SC_UNIFORM_TYPE_GLOBAL = 0, // global uniform (RA_CAP_GLOBAL_UNIFORM) - SC_UNIFORM_TYPE_UBO = 1, // uniform buffer (RA_CAP_BUF_RO) -}; - -struct sc_uniform { - enum sc_uniform_type type; - struct ra_renderpass_input input; - const char *glsl_type; - union uniform_val v; - char *buffer_format; - // for SC_UNIFORM_TYPE_UBO: - struct ra_layout layout; - size_t offset; // byte offset within the buffer -}; - -struct sc_cached_uniform { - union uniform_val v; - int index; // for ra_renderpass_input_val - bool set; // whether the uniform has ever been set -}; - -struct sc_entry { - struct ra_renderpass *pass; - struct sc_cached_uniform *cached_uniforms; - int num_cached_uniforms; - bstr total; - struct timer_pool *timer; - struct ra_buf *ubo; - int ubo_index; // for ra_renderpass_input_val.index -}; - -struct gl_shader_cache { - struct ra *ra; - struct mp_log *log; - - // permanent - char **exts; - int num_exts; - - // this is modified during use (gl_sc_add() etc.) and reset for each shader - bstr prelude_text; - bstr header_text; - bstr text; - - // Next binding point (texture unit, image unit, buffer binding, etc.) 
- // In OpenGL these are separate for each input type - int next_binding[RA_VARTYPE_COUNT]; - - struct ra_renderpass_params params; - - struct sc_entry **entries; - int num_entries; - - struct sc_entry *current_shader; // set by gl_sc_generate() - - struct sc_uniform *uniforms; - int num_uniforms; - - int ubo_binding; - size_t ubo_size; - - struct ra_renderpass_input_val *values; - int num_values; - - // For checking that the user is calling gl_sc_reset() properly. - bool needs_reset; - - bool error_state; // true if an error occurred - - // temporary buffers (avoids frequent reallocations) - bstr tmp[6]; - - // For the disk-cache. - char *cache_dir; - struct mpv_global *global; // can be NULL -}; - -static void gl_sc_reset(struct gl_shader_cache *sc); - -struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, - struct mp_log *log) -{ - struct gl_shader_cache *sc = talloc_ptrtype(NULL, sc); - *sc = (struct gl_shader_cache){ - .ra = ra, - .global = global, - .log = log, - }; - gl_sc_reset(sc); - return sc; -} - -// Reset the previous pass. This must be called after gl_sc_generate and before -// starting a new shader. -static void gl_sc_reset(struct gl_shader_cache *sc) -{ - sc->prelude_text.len = 0; - sc->header_text.len = 0; - sc->text.len = 0; - for (int n = 0; n < sc->num_uniforms; n++) - talloc_free((void *)sc->uniforms[n].input.name); - sc->num_uniforms = 0; - sc->ubo_binding = 0; - sc->ubo_size = 0; - for (int i = 0; i < RA_VARTYPE_COUNT; i++) - sc->next_binding[i] = 0; - sc->current_shader = NULL; - sc->params = (struct ra_renderpass_params){0}; - sc->needs_reset = false; -} - -static void sc_flush_cache(struct gl_shader_cache *sc) -{ - MP_VERBOSE(sc, "flushing shader cache\n"); - - for (int n = 0; n < sc->num_entries; n++) { - struct sc_entry *e = sc->entries[n]; - ra_buf_free(sc->ra, &e->ubo); - if (e->pass) - sc->ra->fns->renderpass_destroy(sc->ra, e->pass); - timer_pool_destroy(e->timer); - talloc_free(e); - } - sc->num_entries = 0; -} - -void gl_sc_destroy(struct gl_shader_cache *sc) -{ - if (!sc) - return; - gl_sc_reset(sc); - sc_flush_cache(sc); - talloc_free(sc); -} - -bool gl_sc_error_state(struct gl_shader_cache *sc) -{ - return sc->error_state; -} - -void gl_sc_reset_error(struct gl_shader_cache *sc) -{ - sc->error_state = false; -} - -void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name) -{ - for (int n = 0; n < sc->num_exts; n++) { - if (strcmp(sc->exts[n], name) == 0) - return; - } - MP_TARRAY_APPEND(sc, sc->exts, sc->num_exts, talloc_strdup(sc, name)); -} - -#define bstr_xappend0(sc, b, s) bstr_xappend(sc, b, bstr0(s)) - -void gl_sc_add(struct gl_shader_cache *sc, const char *text) -{ - bstr_xappend0(sc, &sc->text, text); -} - -void gl_sc_addf(struct gl_shader_cache *sc, const char *textf, ...) -{ - va_list ap; - va_start(ap, textf); - bstr_xappend_vasprintf(sc, &sc->text, textf, ap); - va_end(ap); -} - -void gl_sc_hadd(struct gl_shader_cache *sc, const char *text) -{ - bstr_xappend0(sc, &sc->header_text, text); -} - -void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) -{ - va_list ap; - va_start(ap, textf); - bstr_xappend_vasprintf(sc, &sc->header_text, textf, ap); - va_end(ap); -} - -void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text) -{ - bstr_xappend(sc, &sc->header_text, text); -} - -void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) 
-{ - va_list ap; - va_start(ap, textf); - bstr_xappend_vasprintf(sc, &sc->prelude_text, textf, ap); - va_end(ap); -} - -static struct sc_uniform *find_uniform(struct gl_shader_cache *sc, - const char *name) -{ - struct sc_uniform new = { - .input = { - .dim_v = 1, - .dim_m = 1, - }, - }; - - for (int n = 0; n < sc->num_uniforms; n++) { - struct sc_uniform *u = &sc->uniforms[n]; - if (strcmp(u->input.name, name) == 0) { - const char *allocname = u->input.name; - *u = new; - u->input.name = allocname; - return u; - } - } - - // not found -> add it - new.input.name = talloc_strdup(NULL, name); - MP_TARRAY_APPEND(sc, sc->uniforms, sc->num_uniforms, new); - return &sc->uniforms[sc->num_uniforms - 1]; -} - -static int gl_sc_next_binding(struct gl_shader_cache *sc, enum ra_vartype type) -{ - if (sc->ra->caps & RA_CAP_SHARED_BINDING) { - return sc->next_binding[type]++; - } else { - return sc->next_binding[0]++; - } -} - -// Updates the UBO metadata for the given sc_uniform. Assumes sc_uniform->input -// is already set. Also updates sc_uniform->type. -static void update_ubo_params(struct gl_shader_cache *sc, struct sc_uniform *u) -{ - if (!(sc->ra->caps & RA_CAP_BUF_RO)) - return; - - // Using UBOs with explicit layout(offset) like we do requires GLSL version - // 440 or higher. In theory the UBO code can also use older versions, but - // just try and avoid potential headaches. This also ensures they're only - // used on drivers that are probably modern enough to actually support them - // correctly. - if (sc->ra->glsl_version < 440) - return; - - u->type = SC_UNIFORM_TYPE_UBO; - u->layout = sc->ra->fns->uniform_layout(&u->input); - u->offset = MP_ALIGN_UP(sc->ubo_size, u->layout.align); - sc->ubo_size = u->offset + u->layout.size; -} - -void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, - struct ra_tex *tex) -{ - const char *glsl_type = "sampler2D"; - if (tex->params.dimensions == 1) { - glsl_type = "sampler1D"; - } else if (tex->params.dimensions == 3) { - glsl_type = "sampler3D"; - } else if (tex->params.non_normalized) { - glsl_type = "sampler2DRect"; - } else if (tex->params.external_oes) { - glsl_type = "samplerExternalOES"; - } else if (tex->params.format->ctype == RA_CTYPE_UINT) { - glsl_type = sc->ra->glsl_es ? "highp usampler2D" : "usampler2D"; - } - - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_TEX; - u->glsl_type = glsl_type; - u->input.binding = gl_sc_next_binding(sc, u->input.type); - u->v.tex = tex; -} - -void gl_sc_uniform_image2D_wo(struct gl_shader_cache *sc, const char *name, - struct ra_tex *tex) -{ - gl_sc_enable_extension(sc, "GL_ARB_shader_image_load_store"); - - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_IMG_W; - u->glsl_type = "writeonly image2D"; - u->input.binding = gl_sc_next_binding(sc, u->input.type); - u->v.tex = tex; -} - -void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, struct ra_buf *buf, - char *format, ...) 
-{ - assert(sc->ra->caps & RA_CAP_BUF_RW); - gl_sc_enable_extension(sc, "GL_ARB_shader_storage_buffer_object"); - - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_BUF_RW; - u->glsl_type = ""; - u->input.binding = gl_sc_next_binding(sc, u->input.type); - u->v.buf = buf; - - va_list ap; - va_start(ap, format); - u->buffer_format = ta_vasprintf(sc, format, ap); - va_end(ap); -} - -void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->glsl_type = "float"; - update_ubo_params(sc, u); - u->v.f[0] = f; -} - -void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int i) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_INT; - u->glsl_type = "int"; - update_ubo_params(sc, u); - u->v.i[0] = i; -} - -void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2]) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->input.dim_v = 2; - u->glsl_type = "vec2"; - update_ubo_params(sc, u); - u->v.f[0] = f[0]; - u->v.f[1] = f[1]; -} - -void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, GLfloat f[3]) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->input.dim_v = 3; - u->glsl_type = "vec3"; - update_ubo_params(sc, u); - u->v.f[0] = f[0]; - u->v.f[1] = f[1]; - u->v.f[2] = f[2]; -} - -static void transpose2x2(float r[2 * 2]) -{ - MPSWAP(float, r[0+2*1], r[1+2*0]); -} - -void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, - bool transpose, GLfloat *v) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->input.dim_v = 2; - u->input.dim_m = 2; - u->glsl_type = "mat2"; - update_ubo_params(sc, u); - for (int n = 0; n < 4; n++) - u->v.f[n] = v[n]; - if (transpose) - transpose2x2(&u->v.f[0]); -} - -static void transpose3x3(float r[3 * 3]) -{ - MPSWAP(float, r[0+3*1], r[1+3*0]); - MPSWAP(float, r[0+3*2], r[2+3*0]); - MPSWAP(float, r[1+3*2], r[2+3*1]); -} - -void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, - bool transpose, GLfloat *v) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->input.dim_v = 3; - u->input.dim_m = 3; - u->glsl_type = "mat3"; - update_ubo_params(sc, u); - for (int n = 0; n < 9; n++) - u->v.f[n] = v[n]; - if (transpose) - transpose3x3(&u->v.f[0]); -} - -// Tell the shader generator (and later gl_sc_draw_data()) about the vertex -// data layout and attribute names. The entries array is terminated with a {0} -// entry. The array memory must remain valid indefinitely (for now). 
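Outside the patch, a minimal sketch of the shader-cache workflow around gl_sc_set_vertex_format(): declare the vertex layout once, set uniforms, append the fragment body, and dispatch. The vertex struct, the "tint" uniform and draw_tinted_quad are hypothetical, and the gpu/shader_cache.h path assumes the post-refactor layout.

#include <stddef.h>
#include "video/out/gpu/shader_cache.h"

struct vertex {
    float position[2]; // clip-space position, becomes gl_Position
    float texcoord[2]; // passed through to the fragment shader by name
};

// Static storage: the entries array must stay valid for the shader cache.
static const struct ra_renderpass_input vertex_attribs[] = {
    {.name = "position", .type = RA_VARTYPE_FLOAT, .dim_v = 2, .dim_m = 1,
     .offset = offsetof(struct vertex, position)},
    {.name = "texcoord", .type = RA_VARTYPE_FLOAT, .dim_v = 2, .dim_m = 1,
     .offset = offsetof(struct vertex, texcoord)},
    {0}
};

static void draw_tinted_quad(struct gl_shader_cache *sc, struct ra_tex *src,
                             struct ra_tex *target, float tint)
{
    gl_sc_set_vertex_format(sc, vertex_attribs, sizeof(struct vertex));
    gl_sc_uniform_texture(sc, "src", src);
    gl_sc_uniform_f(sc, "tint", tint);
    // The body only has to write to "color"; #version, uniform declarations
    // and main() are generated (and the compiled pass cached) for us.
    gl_sc_add(sc, "color = vec4(tint) * texture(src, texcoord);\n");

    struct vertex quad[6] = {
        {{-1, -1}, {0, 0}}, {{ 1, -1}, {1, 0}}, {{-1,  1}, {0, 1}},
        {{ 1, -1}, {1, 0}}, {{ 1,  1}, {1, 1}}, {{-1,  1}, {0, 1}},
    };
    gl_sc_dispatch_draw(sc, target, quad, 6);
}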
-void gl_sc_set_vertex_format(struct gl_shader_cache *sc, - const struct ra_renderpass_input *entries, - int vertex_stride) -{ - sc->params.vertex_attribs = (struct ra_renderpass_input *)entries; - sc->params.num_vertex_attribs = 0; - while (entries[sc->params.num_vertex_attribs].name) - sc->params.num_vertex_attribs++; - sc->params.vertex_stride = vertex_stride; -} - -void gl_sc_blend(struct gl_shader_cache *sc, - enum ra_blend blend_src_rgb, - enum ra_blend blend_dst_rgb, - enum ra_blend blend_src_alpha, - enum ra_blend blend_dst_alpha) -{ - sc->params.enable_blend = true; - sc->params.blend_src_rgb = blend_src_rgb; - sc->params.blend_dst_rgb = blend_dst_rgb; - sc->params.blend_src_alpha = blend_src_alpha; - sc->params.blend_dst_alpha = blend_dst_alpha; -} - -static const char *vao_glsl_type(const struct ra_renderpass_input *e) -{ - // pretty dumb... too dumb, but works for us - switch (e->dim_v) { - case 1: return "float"; - case 2: return "vec2"; - case 3: return "vec3"; - case 4: return "vec4"; - default: abort(); - } -} - -static void update_ubo(struct ra *ra, struct ra_buf *ubo, struct sc_uniform *u) -{ - uintptr_t src = (uintptr_t) &u->v; - size_t dst = u->offset; - struct ra_layout src_layout = ra_renderpass_input_layout(&u->input); - struct ra_layout dst_layout = u->layout; - - for (int i = 0; i < u->input.dim_m; i++) { - ra->fns->buf_update(ra, ubo, dst, (void *)src, src_layout.stride); - src += src_layout.stride; - dst += dst_layout.stride; - } -} - -static void update_uniform(struct gl_shader_cache *sc, struct sc_entry *e, - struct sc_uniform *u, int n) -{ - struct sc_cached_uniform *un = &e->cached_uniforms[n]; - struct ra_layout layout = ra_renderpass_input_layout(&u->input); - if (layout.size > 0 && un->set && memcmp(&un->v, &u->v, layout.size) == 0) - return; - - un->v = u->v; - un->set = true; - - switch (u->type) { - case SC_UNIFORM_TYPE_GLOBAL: { - struct ra_renderpass_input_val value = { - .index = un->index, - .data = &un->v, - }; - MP_TARRAY_APPEND(sc, sc->values, sc->num_values, value); - break; - } - case SC_UNIFORM_TYPE_UBO: - assert(e->ubo); - update_ubo(sc->ra, e->ubo, u); - break; - default: abort(); - } -} - -void gl_sc_set_cache_dir(struct gl_shader_cache *sc, const char *dir) -{ - talloc_free(sc->cache_dir); - sc->cache_dir = talloc_strdup(sc, dir); -} - -static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry) -{ - bool ret = false; - - void *tmp = talloc_new(NULL); - struct ra_renderpass_params params = sc->params; - - MP_VERBOSE(sc, "new shader program:\n"); - if (sc->header_text.len) { - MP_VERBOSE(sc, "header:\n"); - mp_log_source(sc->log, MSGL_V, sc->header_text.start); - MP_VERBOSE(sc, "body:\n"); - } - if (sc->text.len) - mp_log_source(sc->log, MSGL_V, sc->text.start); - - // The vertex shader uses mangled names for the vertex attributes, so that - // the fragment shader can use the "real" names. But the shader is expecting - // the vertex attribute names (at least with older GLSL targets for GL). - params.vertex_attribs = talloc_memdup(tmp, params.vertex_attribs, - params.num_vertex_attribs * sizeof(params.vertex_attribs[0])); - for (int n = 0; n < params.num_vertex_attribs; n++) { - struct ra_renderpass_input *attrib = ¶ms.vertex_attribs[n]; - attrib->name = talloc_asprintf(tmp, "vertex_%s", attrib->name); - } - - const char *cache_header = "mpv shader cache v1\n"; - char *cache_filename = NULL; - char *cache_dir = NULL; - - if (sc->cache_dir && sc->cache_dir[0]) { - // Try to load it from a disk cache. 
- cache_dir = mp_get_user_path(tmp, sc->global, sc->cache_dir); - - struct AVSHA *sha = av_sha_alloc(); - if (!sha) - abort(); - av_sha_init(sha, 256); - av_sha_update(sha, entry->total.start, entry->total.len); - - uint8_t hash[256 / 8]; - av_sha_final(sha, hash); - av_free(sha); - - char hashstr[256 / 8 * 2 + 1]; - for (int n = 0; n < 256 / 8; n++) - snprintf(hashstr + n * 2, sizeof(hashstr) - n * 2, "%02X", hash[n]); - - cache_filename = mp_path_join(tmp, cache_dir, hashstr); - if (stat(cache_filename, &(struct stat){0}) == 0) { - MP_VERBOSE(sc, "Trying to load shader from disk...\n"); - struct bstr cachedata = - stream_read_file(cache_filename, tmp, sc->global, 1000000000); - if (bstr_eatstart0(&cachedata, cache_header)) - params.cached_program = cachedata; - } - } - - // If using a UBO, also make sure to add it as an input value so the RA - // can see it - if (sc->ubo_size) { - entry->ubo_index = sc->params.num_inputs; - struct ra_renderpass_input ubo_input = { - .name = "UBO", - .type = RA_VARTYPE_BUF_RO, - .dim_v = 1, - .dim_m = 1, - .binding = sc->ubo_binding, - }; - MP_TARRAY_APPEND(sc, params.inputs, params.num_inputs, ubo_input); - } - - entry->pass = sc->ra->fns->renderpass_create(sc->ra, ¶ms); - if (!entry->pass) - goto error; - - if (sc->ubo_size) { - struct ra_buf_params ubo_params = { - .type = RA_BUF_TYPE_UNIFORM, - .size = sc->ubo_size, - .host_mutable = true, - }; - - entry->ubo = ra_buf_create(sc->ra, &ubo_params); - if (!entry->ubo) { - MP_ERR(sc, "Failed creating uniform buffer!\n"); - goto error; - } - } - - if (entry->pass && cache_filename) { - bstr nc = entry->pass->params.cached_program; - if (nc.len && !bstr_equals(params.cached_program, nc)) { - mp_mkdirp(cache_dir); - - MP_VERBOSE(sc, "Writing shader cache file: %s\n", cache_filename); - FILE *out = fopen(cache_filename, "wb"); - if (out) { - fwrite(cache_header, strlen(cache_header), 1, out); - fwrite(nc.start, nc.len, 1, out); - fclose(out); - } - } - } - - ret = true; - -error: - talloc_free(tmp); - return ret; -} - -#define ADD(x, ...) bstr_xappend_asprintf(sc, (x), __VA_ARGS__) -#define ADD_BSTR(x, s) bstr_xappend(sc, (x), (s)) - -static void add_uniforms(struct gl_shader_cache *sc, bstr *dst) -{ - // Add all of the UBO entries separately as members of their own buffer - if (sc->ubo_size > 0) { - ADD(dst, "layout(std140, binding=%d) uniform UBO {\n", sc->ubo_binding); - for (int n = 0; n < sc->num_uniforms; n++) { - struct sc_uniform *u = &sc->uniforms[n]; - if (u->type != SC_UNIFORM_TYPE_UBO) - continue; - ADD(dst, "layout(offset=%zu) %s %s;\n", u->offset, - u->glsl_type, u->input.name); - } - ADD(dst, "};\n"); - } - - for (int n = 0; n < sc->num_uniforms; n++) { - struct sc_uniform *u = &sc->uniforms[n]; - if (u->type != SC_UNIFORM_TYPE_GLOBAL) - continue; - switch (u->input.type) { - case RA_VARTYPE_INT: - case RA_VARTYPE_FLOAT: - assert(sc->ra->caps & RA_CAP_GLOBAL_UNIFORM); - // fall through - case RA_VARTYPE_TEX: - case RA_VARTYPE_IMG_W: - // Vulkan requires explicitly assigning the bindings in the shader - // source. For OpenGL it's optional, but requires higher GL version - // so we don't do it (and instead have ra_gl update the bindings - // after program creation). 
- if (sc->ra->glsl_vulkan) - ADD(dst, "layout(binding=%d) ", u->input.binding); - ADD(dst, "uniform %s %s;\n", u->glsl_type, u->input.name); - break; - case RA_VARTYPE_BUF_RO: - ADD(dst, "layout(std140, binding=%d) uniform %s { %s };\n", - u->input.binding, u->input.name, u->buffer_format); - break; - case RA_VARTYPE_BUF_RW: - ADD(dst, "layout(std430, binding=%d) buffer %s { %s };\n", - u->input.binding, u->input.name, u->buffer_format); - break; - } - } -} - -// 1. Generate vertex and fragment shaders from the fragment shader text added -// with gl_sc_add(). The generated shader program is cached (based on the -// text), so actual compilation happens only the first time. -// 2. Update the uniforms and textures set with gl_sc_uniform_*. -// 3. Make the new shader program current (glUseProgram()). -// After that, you render, and then you call gc_sc_reset(), which does: -// 1. Unbind the program and all textures. -// 2. Reset the sc state and prepare for a new shader program. (All uniforms -// and fragment operations needed for the next program have to be re-added.) -static void gl_sc_generate(struct gl_shader_cache *sc, - enum ra_renderpass_type type, - const struct ra_format *target_format) -{ - int glsl_version = sc->ra->glsl_version; - int glsl_es = sc->ra->glsl_es ? glsl_version : 0; - - sc->params.type = type; - - // gl_sc_reset() must be called after ending the previous render process, - // and before starting a new one. - assert(!sc->needs_reset); - sc->needs_reset = true; - - // gl_sc_set_vertex_format() must always be called - assert(sc->params.vertex_attribs); - - // If using a UBO, pick a binding (needed for shader generation) - if (sc->ubo_size) - sc->ubo_binding = gl_sc_next_binding(sc, RA_VARTYPE_BUF_RO); - - for (int n = 0; n < MP_ARRAY_SIZE(sc->tmp); n++) - sc->tmp[n].len = 0; - - // set up shader text (header + uniforms + body) - bstr *header = &sc->tmp[0]; - ADD(header, "#version %d%s\n", glsl_version, glsl_es >= 300 ? " es" : ""); - if (type == RA_RENDERPASS_TYPE_COMPUTE) { - // This extension cannot be enabled in fragment shader. Enable it as - // an exception for compute shader. - ADD(header, "#extension GL_ARB_compute_shader : enable\n"); - } - for (int n = 0; n < sc->num_exts; n++) - ADD(header, "#extension %s : enable\n", sc->exts[n]); - if (glsl_es) { - ADD(header, "precision mediump float;\n"); - ADD(header, "precision mediump sampler2D;\n"); - if (sc->ra->caps & RA_CAP_TEX_3D) - ADD(header, "precision mediump sampler3D;\n"); - } - - if (glsl_version >= 130) { - ADD(header, "#define tex1D texture\n"); - ADD(header, "#define tex3D texture\n"); - } else { - ADD(header, "#define tex1D texture1D\n"); - ADD(header, "#define tex3D texture3D\n"); - ADD(header, "#define texture texture2D\n"); - } - - if (sc->ra->glsl_vulkan && type == RA_RENDERPASS_TYPE_COMPUTE) { - ADD(header, "#define gl_GlobalInvocationIndex " - "(gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID)\n"); - } - - // Additional helpers. - ADD(header, "#define LUT_POS(x, lut_size)" - " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n"); - - char *vert_in = glsl_version >= 130 ? "in" : "attribute"; - char *vert_out = glsl_version >= 130 ? "out" : "varying"; - char *frag_in = glsl_version >= 130 ? "in" : "varying"; - - struct bstr *vert = NULL, *frag = NULL, *comp = NULL; - - if (type == RA_RENDERPASS_TYPE_RASTER) { - // vertex shader: we don't use the vertex shader, so just setup a - // dummy, which passes through the vertex array attributes. 
- bstr *vert_head = &sc->tmp[1]; - ADD_BSTR(vert_head, *header); - bstr *vert_body = &sc->tmp[2]; - ADD(vert_body, "void main() {\n"); - bstr *frag_vaos = &sc->tmp[3]; - for (int n = 0; n < sc->params.num_vertex_attribs; n++) { - const struct ra_renderpass_input *e = &sc->params.vertex_attribs[n]; - const char *glsl_type = vao_glsl_type(e); - char loc[32] = {0}; - if (sc->ra->glsl_vulkan) - snprintf(loc, sizeof(loc), "layout(location=%d) ", n); - if (strcmp(e->name, "position") == 0) { - // setting raster pos. requires setting gl_Position magic variable - assert(e->dim_v == 2 && e->type == RA_VARTYPE_FLOAT); - ADD(vert_head, "%s%s vec2 vertex_position;\n", loc, vert_in); - ADD(vert_body, "gl_Position = vec4(vertex_position, 1.0, 1.0);\n"); - } else { - ADD(vert_head, "%s%s %s vertex_%s;\n", loc, vert_in, glsl_type, e->name); - ADD(vert_head, "%s%s %s %s;\n", loc, vert_out, glsl_type, e->name); - ADD(vert_body, "%s = vertex_%s;\n", e->name, e->name); - ADD(frag_vaos, "%s%s %s %s;\n", loc, frag_in, glsl_type, e->name); - } - } - ADD(vert_body, "}\n"); - vert = vert_head; - ADD_BSTR(vert, *vert_body); - - // fragment shader; still requires adding used uniforms and VAO elements - frag = &sc->tmp[4]; - ADD_BSTR(frag, *header); - if (glsl_version >= 130) { - ADD(frag, "%sout vec4 out_color;\n", - sc->ra->glsl_vulkan ? "layout(location=0) " : ""); - } - ADD_BSTR(frag, *frag_vaos); - add_uniforms(sc, frag); - - ADD_BSTR(frag, sc->prelude_text); - ADD_BSTR(frag, sc->header_text); - - ADD(frag, "void main() {\n"); - // we require _all_ frag shaders to write to a "vec4 color" - ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); - ADD_BSTR(frag, sc->text); - if (glsl_version >= 130) { - ADD(frag, "out_color = color;\n"); - } else { - ADD(frag, "gl_FragColor = color;\n"); - } - ADD(frag, "}\n"); - - // We need to fix the format of the render dst at renderpass creation - // time - assert(target_format); - sc->params.target_format = target_format; - } - - if (type == RA_RENDERPASS_TYPE_COMPUTE) { - comp = &sc->tmp[4]; - ADD_BSTR(comp, *header); - - add_uniforms(sc, comp); - - ADD_BSTR(comp, sc->prelude_text); - ADD_BSTR(comp, sc->header_text); - - ADD(comp, "void main() {\n"); - ADD(comp, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); // convenience - ADD_BSTR(comp, sc->text); - ADD(comp, "}\n"); - } - - bstr *hash_total = &sc->tmp[5]; - - ADD(hash_total, "type %d\n", sc->params.type); - - if (frag) { - ADD_BSTR(hash_total, *frag); - sc->params.frag_shader = frag->start; - } - ADD(hash_total, "\n"); - if (vert) { - ADD_BSTR(hash_total, *vert); - sc->params.vertex_shader = vert->start; - } - ADD(hash_total, "\n"); - if (comp) { - ADD_BSTR(hash_total, *comp); - sc->params.compute_shader = comp->start; - } - ADD(hash_total, "\n"); - - if (sc->params.enable_blend) { - ADD(hash_total, "blend %d %d %d %d\n", - sc->params.blend_src_rgb, sc->params.blend_dst_rgb, - sc->params.blend_src_alpha, sc->params.blend_dst_alpha); - } - - if (sc->params.target_format) - ADD(hash_total, "format %s\n", sc->params.target_format->name); - - struct sc_entry *entry = NULL; - for (int n = 0; n < sc->num_entries; n++) { - struct sc_entry *cur = sc->entries[n]; - if (bstr_equals(cur->total, *hash_total)) { - entry = cur; - break; - } - } - if (!entry) { - if (sc->num_entries == SC_MAX_ENTRIES) - sc_flush_cache(sc); - entry = talloc_ptrtype(NULL, entry); - *entry = (struct sc_entry){ - .total = bstrdup(entry, *hash_total), - .timer = timer_pool_create(sc->ra), - }; - for (int n = 0; n < sc->num_uniforms; n++) { - struct 
sc_cached_uniform u = {0}; - if (sc->uniforms[n].type == SC_UNIFORM_TYPE_GLOBAL) { - // global uniforms need to be made visible to the ra_renderpass - u.index = sc->params.num_inputs; - MP_TARRAY_APPEND(sc, sc->params.inputs, sc->params.num_inputs, - sc->uniforms[n].input); - } - MP_TARRAY_APPEND(entry, entry->cached_uniforms, - entry->num_cached_uniforms, u); - } - if (!create_pass(sc, entry)) - sc->error_state = true; - MP_TARRAY_APPEND(sc, sc->entries, sc->num_entries, entry); - } - if (sc->error_state) - return; - - assert(sc->num_uniforms == entry->num_cached_uniforms); - - sc->num_values = 0; - for (int n = 0; n < sc->num_uniforms; n++) - update_uniform(sc, entry, &sc->uniforms[n], n); - - // If we're using a UBO, make sure to bind it as well - if (sc->ubo_size) { - struct ra_renderpass_input_val ubo_val = { - .index = entry->ubo_index, - .data = &entry->ubo, - }; - MP_TARRAY_APPEND(sc, sc->values, sc->num_values, ubo_val); - } - - sc->current_shader = entry; -} - -struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, - struct ra_tex *target, - void *ptr, size_t num) -{ - struct timer_pool *timer = NULL; - - gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format); - if (!sc->current_shader) - goto error; - - timer = sc->current_shader->timer; - - struct mp_rect full_rc = {0, 0, target->params.w, target->params.h}; - - struct ra_renderpass_run_params run = { - .pass = sc->current_shader->pass, - .values = sc->values, - .num_values = sc->num_values, - .target = target, - .vertex_data = ptr, - .vertex_count = num, - .viewport = full_rc, - .scissors = full_rc, - }; - - timer_pool_start(timer); - sc->ra->fns->renderpass_run(sc->ra, &run); - timer_pool_stop(timer); - -error: - gl_sc_reset(sc); - return timer_pool_measure(timer); -} - -struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, - int w, int h, int d) -{ - struct timer_pool *timer = NULL; - - gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL); - if (!sc->current_shader) - goto error; - - timer = sc->current_shader->timer; - - struct ra_renderpass_run_params run = { - .pass = sc->current_shader->pass, - .values = sc->values, - .num_values = sc->num_values, - .compute_groups = {w, h, d}, - }; - - timer_pool_start(timer); - sc->ra->fns->renderpass_run(sc->ra, &run); - timer_pool_stop(timer); - -error: - gl_sc_reset(sc); - return timer_pool_measure(timer); -} diff --git a/video/out/opengl/shader_cache.h b/video/out/opengl/shader_cache.h deleted file mode 100644 index 82a078079b..0000000000 --- a/video/out/opengl/shader_cache.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include "common/common.h" -#include "misc/bstr.h" -#include "ra.h" - -// For mp_pass_perf -#include "video/out/vo.h" - -struct mp_log; -struct mpv_global; -struct gl_shader_cache; - -struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, - struct mp_log *log); -void gl_sc_destroy(struct gl_shader_cache *sc); -bool gl_sc_error_state(struct gl_shader_cache *sc); -void gl_sc_reset_error(struct gl_shader_cache *sc); -void gl_sc_add(struct gl_shader_cache *sc, const char *text); -void gl_sc_addf(struct gl_shader_cache *sc, const char *textf, ...) - PRINTF_ATTRIBUTE(2, 3); -void gl_sc_hadd(struct gl_shader_cache *sc, const char *text); -void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) - PRINTF_ATTRIBUTE(2, 3); -void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text); -void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) 
- PRINTF_ATTRIBUTE(2, 3); -void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, - struct ra_tex *tex); -void gl_sc_uniform_image2D_wo(struct gl_shader_cache *sc, const char *name, - struct ra_tex *tex); -void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, struct ra_buf *buf, - char *format, ...) PRINTF_ATTRIBUTE(4, 5); -void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f); -void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int f); -void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2]); -void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, float f[3]); -void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, - bool transpose, float *v); -void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, - bool transpose, float *v); -void gl_sc_set_vertex_format(struct gl_shader_cache *sc, - const struct ra_renderpass_input *vertex_attribs, - int vertex_stride); -void gl_sc_blend(struct gl_shader_cache *sc, - enum ra_blend blend_src_rgb, - enum ra_blend blend_dst_rgb, - enum ra_blend blend_src_alpha, - enum ra_blend blend_dst_alpha); -void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name); -struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, - struct ra_tex *target, - void *ptr, size_t num); -struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, - int w, int h, int d); -void gl_sc_set_cache_dir(struct gl_shader_cache *sc, const char *dir); diff --git a/video/out/opengl/user_shaders.c b/video/out/opengl/user_shaders.c deleted file mode 100644 index 58a1ac9e64..0000000000 --- a/video/out/opengl/user_shaders.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#include - -#include "misc/ctype.h" -#include "user_shaders.h" -#include "formats.h" - -static bool parse_rpn_szexpr(struct bstr line, struct szexp out[MAX_SZEXP_SIZE]) -{ - int pos = 0; - - while (line.len > 0) { - struct bstr word = bstr_strip(bstr_splitchar(line, &line, ' ')); - if (word.len == 0) - continue; - - if (pos >= MAX_SZEXP_SIZE) - return false; - - struct szexp *exp = &out[pos++]; - - if (bstr_eatend0(&word, ".w") || bstr_eatend0(&word, ".width")) { - exp->tag = SZEXP_VAR_W; - exp->val.varname = word; - continue; - } - - if (bstr_eatend0(&word, ".h") || bstr_eatend0(&word, ".height")) { - exp->tag = SZEXP_VAR_H; - exp->val.varname = word; - continue; - } - - switch (word.start[0]) { - case '+': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_ADD; continue; - case '-': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_SUB; continue; - case '*': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_MUL; continue; - case '/': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_DIV; continue; - case '!': exp->tag = SZEXP_OP1; exp->val.op = SZEXP_OP_NOT; continue; - case '>': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_GT; continue; - case '<': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_LT; continue; - } - - if (mp_isdigit(word.start[0])) { - exp->tag = SZEXP_CONST; - if (bstr_sscanf(word, "%f", &exp->val.cval) != 1) - return false; - continue; - } - - // Some sort of illegal expression - return false; - } - - return true; -} - -// Returns whether successful. 'result' is left untouched on failure -bool eval_szexpr(struct mp_log *log, void *priv, - bool (*lookup)(void *priv, struct bstr var, float size[2]), - struct szexp expr[MAX_SZEXP_SIZE], float *result) -{ - float stack[MAX_SZEXP_SIZE] = {0}; - int idx = 0; // points to next element to push - - for (int i = 0; i < MAX_SZEXP_SIZE; i++) { - switch (expr[i].tag) { - case SZEXP_END: - goto done; - - case SZEXP_CONST: - // Since our SZEXPs are bound by MAX_SZEXP_SIZE, it should be - // impossible to overflow the stack - assert(idx < MAX_SZEXP_SIZE); - stack[idx++] = expr[i].val.cval; - continue; - - case SZEXP_OP1: - if (idx < 1) { - mp_warn(log, "Stack underflow in RPN expression!\n"); - return false; - } - - switch (expr[i].val.op) { - case SZEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break; - default: abort(); - } - continue; - - case SZEXP_OP2: - if (idx < 2) { - mp_warn(log, "Stack underflow in RPN expression!\n"); - return false; - } - - // Pop the operands in reverse order - float op2 = stack[--idx]; - float op1 = stack[--idx]; - float res = 0.0; - switch (expr[i].val.op) { - case SZEXP_OP_ADD: res = op1 + op2; break; - case SZEXP_OP_SUB: res = op1 - op2; break; - case SZEXP_OP_MUL: res = op1 * op2; break; - case SZEXP_OP_DIV: res = op1 / op2; break; - case SZEXP_OP_GT: res = op1 > op2; break; - case SZEXP_OP_LT: res = op1 < op2; break; - default: abort(); - } - - if (!isfinite(res)) { - mp_warn(log, "Illegal operation in RPN expression!\n"); - return false; - } - - stack[idx++] = res; - continue; - - case SZEXP_VAR_W: - case SZEXP_VAR_H: { - struct bstr name = expr[i].val.varname; - float size[2]; - - if (!lookup(priv, name, size)) { - mp_warn(log, "Variable %.*s not found in RPN expression!\n", - BSTR_P(name)); - return false; - } - - stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? 
size[0] : size[1]; - continue; - } - } - } - -done: - // Return the single stack element - if (idx != 1) { - mp_warn(log, "Malformed stack after RPN expression!\n"); - return false; - } - - *result = stack[0]; - return true; -} - -static bool parse_hook(struct mp_log *log, struct bstr *body, - struct gl_user_shader_hook *out) -{ - *out = (struct gl_user_shader_hook){ - .pass_desc = bstr0("(unknown)"), - .offset = identity_trans, - .width = {{ SZEXP_VAR_W, { .varname = bstr0("HOOKED") }}}, - .height = {{ SZEXP_VAR_H, { .varname = bstr0("HOOKED") }}}, - .cond = {{ SZEXP_CONST, { .cval = 1.0 }}}, - }; - - int hook_idx = 0; - int bind_idx = 0; - - // Parse all headers - while (true) { - struct bstr rest; - struct bstr line = bstr_strip(bstr_getline(*body, &rest)); - - // Check for the presence of the magic line beginning - if (!bstr_eatstart0(&line, "//!")) - break; - - *body = rest; - - // Parse the supported commands - if (bstr_eatstart0(&line, "HOOK")) { - if (hook_idx == SHADER_MAX_HOOKS) { - mp_err(log, "Passes may only hook up to %d textures!\n", - SHADER_MAX_HOOKS); - return false; - } - out->hook_tex[hook_idx++] = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "BIND")) { - if (bind_idx == SHADER_MAX_BINDS) { - mp_err(log, "Passes may only bind up to %d textures!\n", - SHADER_MAX_BINDS); - return false; - } - out->bind_tex[bind_idx++] = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "SAVE")) { - out->save_tex = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "DESC")) { - out->pass_desc = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "OFFSET")) { - float ox, oy; - if (bstr_sscanf(line, "%f %f", &ox, &oy) != 2) { - mp_err(log, "Error while parsing OFFSET!\n"); - return false; - } - out->offset.t[0] = ox; - out->offset.t[1] = oy; - continue; - } - - if (bstr_eatstart0(&line, "WIDTH")) { - if (!parse_rpn_szexpr(line, out->width)) { - mp_err(log, "Error while parsing WIDTH!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "HEIGHT")) { - if (!parse_rpn_szexpr(line, out->height)) { - mp_err(log, "Error while parsing HEIGHT!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "WHEN")) { - if (!parse_rpn_szexpr(line, out->cond)) { - mp_err(log, "Error while parsing WHEN!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "COMPONENTS")) { - if (bstr_sscanf(line, "%d", &out->components) != 1) { - mp_err(log, "Error while parsing COMPONENTS!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "COMPUTE")) { - struct compute_info *ci = &out->compute; - int num = bstr_sscanf(line, "%d %d %d %d", &ci->block_w, &ci->block_h, - &ci->threads_w, &ci->threads_h); - - if (num == 2 || num == 4) { - ci->active = true; - ci->directly_writes = true; - } else { - mp_err(log, "Error while parsing COMPUTE!\n"); - return false; - } - continue; - } - - // Unknown command type - mp_err(log, "Unrecognized command '%.*s'!\n", BSTR_P(line)); - return false; - } - - // The rest of the file up until the next magic line beginning (if any) - // shall be the shader body - if (bstr_split_tok(*body, "//!", &out->pass_body, body)) { - // Make sure the magic line is part of the rest - body->start -= 3; - body->len += 3; - } - - // Sanity checking - if (hook_idx == 0) - mp_warn(log, "Pass has no hooked textures (will be ignored)!\n"); - - return true; -} - -static bool parse_tex(struct mp_log *log, struct ra *ra, struct bstr *body, - struct gl_user_shader_tex *out) -{ - *out = (struct 
gl_user_shader_tex){ - .name = bstr0("USER_TEX"), - .params = { - .dimensions = 2, - .w = 1, .h = 1, .d = 1, - .render_src = true, - .src_linear = true, - }, - }; - struct ra_tex_params *p = &out->params; - - while (true) { - struct bstr rest; - struct bstr line = bstr_strip(bstr_getline(*body, &rest)); - - if (!bstr_eatstart0(&line, "//!")) - break; - - *body = rest; - - if (bstr_eatstart0(&line, "TEXTURE")) { - out->name = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "SIZE")) { - p->dimensions = bstr_sscanf(line, "%d %d %d", &p->w, &p->h, &p->d); - if (p->dimensions < 1 || p->dimensions > 3 || - p->w < 1 || p->h < 1 || p->d < 1) - { - mp_err(log, "Error while parsing SIZE!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "FORMAT ")) { - p->format = NULL; - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (bstr_equals0(line, fmt->name)) { - p->format = fmt; - break; - } - } - // (pixel_size==0 is for opaque formats) - if (!p->format || !p->format->pixel_size) { - mp_err(log, "Unrecognized/unavailable FORMAT name: '%.*s'!\n", - BSTR_P(line)); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "FILTER")) { - line = bstr_strip(line); - if (bstr_equals0(line, "LINEAR")) { - p->src_linear = true; - } else if (bstr_equals0(line, "NEAREST")) { - p->src_linear = false; - } else { - mp_err(log, "Unrecognized FILTER: '%.*s'!\n", BSTR_P(line)); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "BORDER")) { - line = bstr_strip(line); - if (bstr_equals0(line, "CLAMP")) { - p->src_repeat = false; - } else if (bstr_equals0(line, "REPEAT")) { - p->src_repeat = true; - } else { - mp_err(log, "Unrecognized BORDER: '%.*s'!\n", BSTR_P(line)); - return false; - } - continue; - } - - mp_err(log, "Unrecognized command '%.*s'!\n", BSTR_P(line)); - return false; - } - - if (!p->format) { - mp_err(log, "No FORMAT specified.\n"); - return false; - } - - if (p->src_linear && !p->format->linear_filter) { - mp_err(log, "The specified texture format cannot be filtered!\n"); - return false; - } - - // Decode the rest of the section (up to the next //! marker) as raw hex - // data for the texture - struct bstr hexdata; - if (bstr_split_tok(*body, "//!", &hexdata, body)) { - // Make sure the magic line is part of the rest - body->start -= 3; - body->len += 3; - } - - struct bstr tex; - if (!bstr_decode_hex(NULL, bstr_strip(hexdata), &tex)) { - mp_err(log, "Error while parsing TEXTURE body: must be a valid " - "hexadecimal sequence, on a single line!\n"); - return false; - } - - int expected_len = p->w * p->h * p->d * p->format->pixel_size; - if (tex.len != expected_len) { - mp_err(log, "Shader TEXTURE size mismatch: got %zd bytes, expected %d!\n", - tex.len, expected_len); - talloc_free(tex.start); - return false; - } - - p->initial_data = tex.start; - return true; -} - -void parse_user_shader(struct mp_log *log, struct ra *ra, struct bstr shader, - void *priv, - bool (*dohook)(void *p, struct gl_user_shader_hook hook), - bool (*dotex)(void *p, struct gl_user_shader_tex tex)) -{ - if (!dohook || !dotex || !shader.len) - return; - - // Skip all garbage (e.g. 
comments) before the first header - int pos = bstr_find(shader, bstr0("//!")); - if (pos < 0) { - mp_warn(log, "Shader appears to contain no headers!\n"); - return; - } - shader = bstr_cut(shader, pos); - - // Loop over the file - while (shader.len > 0) - { - // Peek at the first header to dispatch the right type - if (bstr_startswith0(shader, "//!TEXTURE")) { - struct gl_user_shader_tex t; - if (!parse_tex(log, ra, &shader, &t) || !dotex(priv, t)) - return; - continue; - } - - struct gl_user_shader_hook h; - if (!parse_hook(log, &shader, &h) || !dohook(priv, h)) - return; - } -} diff --git a/video/out/opengl/user_shaders.h b/video/out/opengl/user_shaders.h deleted file mode 100644 index 94a070c8e2..0000000000 --- a/video/out/opengl/user_shaders.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#ifndef MP_GL_USER_SHADERS_H -#define MP_GL_USER_SHADERS_H - -#include "utils.h" -#include "ra.h" - -#define SHADER_MAX_PASSES 32 -#define SHADER_MAX_HOOKS 16 -#define SHADER_MAX_BINDS 6 -#define SHADER_MAX_SAVED 64 -#define MAX_SZEXP_SIZE 32 - -enum szexp_op { - SZEXP_OP_ADD, - SZEXP_OP_SUB, - SZEXP_OP_MUL, - SZEXP_OP_DIV, - SZEXP_OP_NOT, - SZEXP_OP_GT, - SZEXP_OP_LT, -}; - -enum szexp_tag { - SZEXP_END = 0, // End of an RPN expression - SZEXP_CONST, // Push a constant value onto the stack - SZEXP_VAR_W, // Get the width/height of a named texture (variable) - SZEXP_VAR_H, - SZEXP_OP2, // Pop two elements and push the result of a dyadic operation - SZEXP_OP1, // Pop one element and push the result of a monadic operation -}; - -struct szexp { - enum szexp_tag tag; - union { - float cval; - struct bstr varname; - enum szexp_op op; - } val; -}; - -struct compute_info { - bool active; - int block_w, block_h; // Block size (each block corresponds to one WG) - int threads_w, threads_h; // How many threads form a working group - bool directly_writes; // If true, shader is assumed to imageStore(out_image) -}; - -struct gl_user_shader_hook { - struct bstr pass_desc; - struct bstr hook_tex[SHADER_MAX_HOOKS]; - struct bstr bind_tex[SHADER_MAX_BINDS]; - struct bstr save_tex; - struct bstr pass_body; - struct gl_transform offset; - struct szexp width[MAX_SZEXP_SIZE]; - struct szexp height[MAX_SZEXP_SIZE]; - struct szexp cond[MAX_SZEXP_SIZE]; - int components; - struct compute_info compute; -}; - -struct gl_user_shader_tex { - struct bstr name; - struct ra_tex_params params; - // for video.c - struct ra_tex *tex; -}; - -// Parse the next shader block from `body`. The callbacks are invoked on every -// valid shader block parsed. 
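Outside the patch, a minimal sketch of driving the size-expression evaluator directly, mirroring what a //!WIDTH HOOKED.w 2 / header parses into. The gpu/user_shaders.h path assumes the post-refactor layout; lookup_size and half_hooked_width are hypothetical helpers.

#include "misc/bstr.h"
#include "video/out/gpu/user_shaders.h"

// Size lookup callback: only the "HOOKED" texture is known in this sketch.
static bool lookup_size(void *priv, struct bstr var, float size[2])
{
    if (!bstr_equals0(var, "HOOKED"))
        return false;
    size[0] = 1920.0f; // width
    size[1] = 1080.0f; // height
    return true;
}

static bool half_hooked_width(struct mp_log *log, float *result)
{
    // RPN: push HOOKED.w, push 2, divide. Unused entries stay SZEXP_END (0).
    struct szexp expr[MAX_SZEXP_SIZE] = {
        {SZEXP_VAR_W, { .varname = bstr0("HOOKED") }},
        {SZEXP_CONST, { .cval = 2.0 }},
        {SZEXP_OP2,   { .op = SZEXP_OP_DIV }},
    };
    return eval_szexpr(log, NULL, lookup_size, expr, result); // *result == 960
}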
-void parse_user_shader(struct mp_log *log, struct ra *ra, struct bstr shader, - void *priv, - bool (*dohook)(void *p, struct gl_user_shader_hook hook), - bool (*dotex)(void *p, struct gl_user_shader_tex tex)); - -// Evaluate a szexp, given a lookup function for named textures -bool eval_szexpr(struct mp_log *log, void *priv, - bool (*lookup)(void *priv, struct bstr var, float size[2]), - struct szexp expr[MAX_SZEXP_SIZE], float *result); - -#endif diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index b8fc24a52e..3b296d52de 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -1,371 +1,269 @@ -#include "common/msg.h" -#include "video/out/vo.h" +/* + * This file is part of mpv. + * Parts based on MPlayer code by Reimar Döffinger. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "osdep/io.h" + +#include "common/common.h" +#include "options/path.h" +#include "stream/stream.h" +#include "formats.h" #include "utils.h" -// Standard parallel 2D projection, except y1 < y0 means that the coordinate -// system is flipped, not the projection. -void gl_transform_ortho(struct gl_transform *t, float x0, float x1, - float y0, float y1) +// GLU has this as gluErrorString (we don't use GLU, as it is legacy-OpenGL) +static const char *gl_error_to_string(GLenum error) { - if (y1 < y0) { - float tmp = y0; - y0 = tmp - y1; - y1 = tmp; + switch (error) { + case GL_INVALID_ENUM: return "INVALID_ENUM"; + case GL_INVALID_VALUE: return "INVALID_VALUE"; + case GL_INVALID_OPERATION: return "INVALID_OPERATION"; + case GL_INVALID_FRAMEBUFFER_OPERATION: return "INVALID_FRAMEBUFFER_OPERATION"; + case GL_OUT_OF_MEMORY: return "OUT_OF_MEMORY"; + default: return "unknown"; } - - t->m[0][0] = 2.0f / (x1 - x0); - t->m[0][1] = 0.0f; - t->m[1][0] = 0.0f; - t->m[1][1] = 2.0f / (y1 - y0); - t->t[0] = -(x1 + x0) / (x1 - x0); - t->t[1] = -(y1 + y0) / (y1 - y0); -} - -// Apply the effects of one transformation to another, transforming it in the -// process. In other words: post-composes t onto x -void gl_transform_trans(struct gl_transform t, struct gl_transform *x) -{ - struct gl_transform xt = *x; - x->m[0][0] = t.m[0][0] * xt.m[0][0] + t.m[0][1] * xt.m[1][0]; - x->m[1][0] = t.m[1][0] * xt.m[0][0] + t.m[1][1] * xt.m[1][0]; - x->m[0][1] = t.m[0][0] * xt.m[0][1] + t.m[0][1] * xt.m[1][1]; - x->m[1][1] = t.m[1][0] * xt.m[0][1] + t.m[1][1] * xt.m[1][1]; - gl_transform_vec(t, &x->t[0], &x->t[1]); -} - -void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo) -{ - int y_dir = fbo.flip ? 
-1 : 1; - gl_transform_ortho(t, 0, fbo.tex->params.w, 0, fbo.tex->params.h * y_dir); } -void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool) +void gl_check_error(GL *gl, struct mp_log *log, const char *info) { - for (int i = 0; i < pool->num_buffers; i++) - ra_buf_free(ra, &pool->buffers[i]); - - talloc_free(pool->buffers); - *pool = (struct ra_buf_pool){0}; + for (;;) { + GLenum error = gl->GetError(); + if (error == GL_NO_ERROR) + break; + mp_msg(log, MSGL_ERR, "%s: OpenGL error %s.\n", info, + gl_error_to_string(error)); + } } -static bool ra_buf_params_compatible(const struct ra_buf_params *new, - const struct ra_buf_params *old) +static int get_alignment(int stride) { - return new->type == old->type && - new->size <= old->size && - new->host_mapped == old->host_mapped && - new->host_mutable == old->host_mutable; + if (stride % 8 == 0) + return 8; + if (stride % 4 == 0) + return 4; + if (stride % 2 == 0) + return 2; + return 1; } -static bool ra_buf_pool_grow(struct ra *ra, struct ra_buf_pool *pool) +// upload a texture, handling things like stride and slices +// target: texture target, usually GL_TEXTURE_2D +// format, type: texture parameters +// dataptr, stride: image data +// x, y, width, height: part of the image to upload +void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, + const void *dataptr, int stride, + int x, int y, int w, int h) { - struct ra_buf *buf = ra_buf_create(ra, &pool->current_params); - if (!buf) - return false; - - MP_TARRAY_INSERT_AT(NULL, pool->buffers, pool->num_buffers, pool->index, buf); - MP_VERBOSE(ra, "Resized buffer pool to size %d\n", pool->num_buffers); - return true; + int bpp = gl_bytes_per_pixel(format, type); + const uint8_t *data = dataptr; + int y_max = y + h; + if (w <= 0 || h <= 0 || !bpp) + return; + if (stride < 0) { + data += (h - 1) * stride; + stride = -stride; + } + gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(stride)); + int slice = h; + if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) { + // this is not always correct, but should work for MPlayer + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride / bpp); + } else { + if (stride != bpp * w) + slice = 1; // very inefficient, but at least it works + } + for (; y + slice <= y_max; y += slice) { + gl->TexSubImage2D(target, 0, x, y, w, slice, format, type, data); + data += stride * slice; + } + if (y < y_max) + gl->TexSubImage2D(target, 0, x, y, w, y_max - y, format, type, data); + if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0); + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); } -struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, - const struct ra_buf_params *params) +mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h) { - assert(!params->initial_data); - - if (!ra_buf_params_compatible(params, &pool->current_params)) { - ra_buf_pool_uninit(ra, pool); - pool->current_params = *params; - } - - // Make sure we have at least one buffer available - if (!pool->buffers && !ra_buf_pool_grow(ra, pool)) - return NULL; - - // Make sure the next buffer is available for use - if (!ra->fns->buf_poll(ra, pool->buffers[pool->index]) && - !ra_buf_pool_grow(ra, pool)) - { + if (gl->es) + return NULL; // ES can't read from front buffer + mp_image_t *image = mp_image_alloc(IMGFMT_RGB24, w, h); + if (!image) return NULL; + gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); + GLenum obj = fbo ? 
GL_COLOR_ATTACHMENT0 : GL_FRONT; + gl->PixelStorei(GL_PACK_ALIGNMENT, 1); + gl->ReadBuffer(obj); + //flip image while reading (and also avoid stride-related trouble) + for (int y = 0; y < h; y++) { + gl->ReadPixels(0, h - y - 1, w, 1, GL_RGB, GL_UNSIGNED_BYTE, + image->planes[0] + y * image->stride[0]); } - - struct ra_buf *buf = pool->buffers[pool->index++]; - pool->index %= pool->num_buffers; - - return buf; + gl->PixelStorei(GL_PACK_ALIGNMENT, 4); + gl->BindFramebuffer(GL_FRAMEBUFFER, 0); + return image; } -bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, - const struct ra_tex_upload_params *params) +static void gl_vao_enable_attribs(struct gl_vao *vao) { - if (params->buf) - return ra->fns->tex_upload(ra, params); - - struct ra_tex *tex = params->tex; - size_t row_size = tex->params.dimensions == 2 ? params->stride : - tex->params.w * tex->params.format->pixel_size; - - struct ra_buf_params bufparams = { - .type = RA_BUF_TYPE_TEX_UPLOAD, - .size = row_size * tex->params.h * tex->params.d, - .host_mutable = true, - }; - - struct ra_buf *buf = ra_buf_pool_get(ra, pbo, &bufparams); - if (!buf) - return false; - - ra->fns->buf_update(ra, buf, 0, params->src, bufparams.size); - - struct ra_tex_upload_params newparams = *params; - newparams.buf = buf; - newparams.src = NULL; - - return ra->fns->tex_upload(ra, &newparams); -} + GL *gl = vao->gl; + + for (int n = 0; n < vao->num_entries; n++) { + const struct ra_renderpass_input *e = &vao->entries[n]; + GLenum type = 0; + bool normalized = false; + switch (e->type) { + case RA_VARTYPE_INT: + type = GL_INT; + break; + case RA_VARTYPE_FLOAT: + type = GL_FLOAT; + break; + case RA_VARTYPE_BYTE_UNORM: + type = GL_UNSIGNED_BYTE; + normalized = true; + break; + default: + abort(); + } + assert(e->dim_m == 1); -struct ra_layout std140_layout(struct ra_renderpass_input *inp) -{ - size_t el_size = ra_vartype_size(inp->type); - - // std140 packing rules: - // 1. The alignment of generic values is their size in bytes - // 2. The alignment of vectors is the vector length * the base count, with - // the exception of vec3 which is always aligned like vec4 - // 3. The alignment of arrays is that of the element size rounded up to - // the nearest multiple of vec4 - // 4. Matrices are treated like arrays of vectors - // 5. 
Arrays/matrices are laid out with a stride equal to the alignment - size_t size = el_size * inp->dim_v; - if (inp->dim_v == 3) - size += el_size; - if (inp->dim_m > 1) - size = MP_ALIGN_UP(size, sizeof(float[4])); - - return (struct ra_layout) { - .align = size, - .stride = size, - .size = size * inp->dim_m, - }; + gl->EnableVertexAttribArray(n); + gl->VertexAttribPointer(n, e->dim_v, type, normalized, + vao->stride, (void *)(intptr_t)e->offset); + } } -struct ra_layout std430_layout(struct ra_renderpass_input *inp) +void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, + const struct ra_renderpass_input *entries, + int num_entries) { - size_t el_size = ra_vartype_size(inp->type); - - // std430 packing rules: like std140, except arrays/matrices are always - // "tightly" packed, even arrays/matrices of vec3s - size_t align = el_size * inp->dim_v; - if (inp->dim_v == 3 && inp->dim_m == 1) - align += el_size; - - return (struct ra_layout) { - .align = align, - .stride = align, - .size = align * inp->dim_m, + assert(!vao->vao); + assert(!vao->buffer); + + *vao = (struct gl_vao){ + .gl = gl, + .stride = stride, + .entries = entries, + .num_entries = num_entries, }; -} - -// Create a texture and a FBO using the texture as color attachments. -// fmt: texture internal format -// If the parameters are the same as the previous call, do not touch it. -// flags can be 0, or a combination of FBOTEX_FUZZY_W and FBOTEX_FUZZY_H. -// Enabling FUZZY for W or H means the w or h does not need to be exact. -bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, - int w, int h, const struct ra_format *fmt, int flags) -{ - int lw = w, lh = h; - - if (fbo->tex) { - int cw = w, ch = h; - int rw = fbo->tex->params.w, rh = fbo->tex->params.h; - if ((flags & FBOTEX_FUZZY_W) && cw < rw) - cw = rw; - if ((flags & FBOTEX_FUZZY_H) && ch < rh) - ch = rh; + gl->GenBuffers(1, &vao->buffer); - if (rw == cw && rh == ch && fbo->tex->params.format == fmt) - goto done; - } + if (gl->BindVertexArray) { + gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - if (flags & FBOTEX_FUZZY_W) - w = MP_ALIGN_UP(w, 256); - if (flags & FBOTEX_FUZZY_H) - h = MP_ALIGN_UP(h, 256); + gl->GenVertexArrays(1, &vao->vao); + gl->BindVertexArray(vao->vao); + gl_vao_enable_attribs(vao); + gl->BindVertexArray(0); - mp_verbose(log, "Create FBO: %dx%d (%dx%d)\n", lw, lh, w, h); - - if (!fmt || !fmt->renderable || !fmt->linear_filter) { - mp_err(log, "Format %s not supported.\n", fmt ? 
fmt->name : "(unset)"); - return false; + gl->BindBuffer(GL_ARRAY_BUFFER, 0); } +} - fbotex_uninit(fbo); - - *fbo = (struct fbotex) { - .ra = ra, - }; - - struct ra_tex_params params = { - .dimensions = 2, - .w = w, - .h = h, - .d = 1, - .format = fmt, - .src_linear = true, - .render_src = true, - .render_dst = true, - .storage_dst = true, - .blit_src = true, - }; - - fbo->tex = ra_tex_create(fbo->ra, ¶ms); - - if (!fbo->tex) { - mp_err(log, "Error: framebuffer could not be created.\n"); - fbotex_uninit(fbo); - return false; - } - -done: - - fbo->lw = lw; - fbo->lh = lh; +void gl_vao_uninit(struct gl_vao *vao) +{ + GL *gl = vao->gl; + if (!gl) + return; - fbo->fbo = (struct fbodst){ - .tex = fbo->tex, - }; + if (gl->DeleteVertexArrays) + gl->DeleteVertexArrays(1, &vao->vao); + gl->DeleteBuffers(1, &vao->buffer); - return true; + *vao = (struct gl_vao){0}; } -void fbotex_uninit(struct fbotex *fbo) +static void gl_vao_bind(struct gl_vao *vao) { - if (fbo->ra) { - ra_tex_free(fbo->ra, &fbo->tex); - *fbo = (struct fbotex) {0}; + GL *gl = vao->gl; + + if (gl->BindVertexArray) { + gl->BindVertexArray(vao->vao); + } else { + gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); + gl_vao_enable_attribs(vao); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); } } -struct timer_pool { - struct ra *ra; - ra_timer *timer; - bool running; // detect invalid usage - - uint64_t samples[VO_PERF_SAMPLE_COUNT]; - int sample_idx; - int sample_count; - - uint64_t sum; - uint64_t peak; -}; - -struct timer_pool *timer_pool_create(struct ra *ra) +static void gl_vao_unbind(struct gl_vao *vao) { - if (!ra->fns->timer_create) - return NULL; - - ra_timer *timer = ra->fns->timer_create(ra); - if (!timer) - return NULL; + GL *gl = vao->gl; - struct timer_pool *pool = talloc(NULL, struct timer_pool); - if (!pool) { - ra->fns->timer_destroy(ra, timer); - return NULL; + if (gl->BindVertexArray) { + gl->BindVertexArray(0); + } else { + for (int n = 0; n < vao->num_entries; n++) + gl->DisableVertexAttribArray(n); } - - *pool = (struct timer_pool){ .ra = ra, .timer = timer }; - return pool; } -void timer_pool_destroy(struct timer_pool *pool) +// Draw the vertex data (as described by the gl_vao_entry entries) in ptr +// to the screen. num is the number of vertexes. prim is usually GL_TRIANGLES. +// If ptr is NULL, then skip the upload, and use the data uploaded with the +// previous call. 
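Outside the patch, a minimal sketch of the gl_vao helper for plain GL code that draws without going through ra/shader_cache. The vertex layout and draw_colored_tris are hypothetical; a real user would keep the gl_vao alive across frames rather than recreating it per draw.

#include <stddef.h>
#include "common/common.h"
#include "video/out/opengl/utils.h"

struct simple_vertex {
    float position[2];
    float color[4];
};

static const struct ra_renderpass_input simple_vao[] = {
    {.name = "position", .type = RA_VARTYPE_FLOAT, .dim_v = 2, .dim_m = 1,
     .offset = offsetof(struct simple_vertex, position)},
    {.name = "color", .type = RA_VARTYPE_FLOAT, .dim_v = 4, .dim_m = 1,
     .offset = offsetof(struct simple_vertex, color)},
};

static void draw_colored_tris(GL *gl, struct simple_vertex *verts, size_t num)
{
    struct gl_vao vao = {0};
    gl_vao_init(&vao, gl, sizeof(struct simple_vertex), simple_vao,
                MP_ARRAY_SIZE(simple_vao));
    // Uploads the vertex data into the VAO's GL_ARRAY_BUFFER and issues the
    // draw; attribute locations follow the order of the entries (0, 1, ...).
    gl_vao_draw_data(&vao, GL_TRIANGLES, verts, num);
    gl_vao_uninit(&vao);
}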
+void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num) { - if (!pool) - return; - - pool->ra->fns->timer_destroy(pool->ra, pool->timer); - talloc_free(pool); -} + GL *gl = vao->gl; -void timer_pool_start(struct timer_pool *pool) -{ - if (!pool) - return; + if (ptr) { + gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); + gl->BufferData(GL_ARRAY_BUFFER, num * vao->stride, ptr, GL_STREAM_DRAW); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); + } - assert(!pool->running); - pool->ra->fns->timer_start(pool->ra, pool->timer); - pool->running = true; -} + gl_vao_bind(vao); -void timer_pool_stop(struct timer_pool *pool) -{ - if (!pool) - return; + gl->DrawArrays(prim, 0, num); - assert(pool->running); - uint64_t res = pool->ra->fns->timer_stop(pool->ra, pool->timer); - pool->running = false; - - if (res) { - // Input res into the buffer and grab the previous value - uint64_t old = pool->samples[pool->sample_idx]; - pool->sample_count = MPMIN(pool->sample_count + 1, VO_PERF_SAMPLE_COUNT); - pool->samples[pool->sample_idx++] = res; - pool->sample_idx %= VO_PERF_SAMPLE_COUNT; - pool->sum = pool->sum + res - old; - - // Update peak if necessary - if (res >= pool->peak) { - pool->peak = res; - } else if (pool->peak == old) { - // It's possible that the last peak was the value we just removed, - // if so we need to scan for the new peak - uint64_t peak = res; - for (int i = 0; i < VO_PERF_SAMPLE_COUNT; i++) - peak = MPMAX(peak, pool->samples[i]); - pool->peak = peak; - } - } + gl_vao_unbind(vao); } -struct mp_pass_perf timer_pool_measure(struct timer_pool *pool) +static void GLAPIENTRY gl_debug_cb(GLenum source, GLenum type, GLuint id, + GLenum severity, GLsizei length, + const GLchar *message, const void *userParam) { - if (!pool) - return (struct mp_pass_perf){0}; - - struct mp_pass_perf res = { - .peak = pool->peak, - .count = pool->sample_count, - }; - - int idx = pool->sample_idx - pool->sample_count + VO_PERF_SAMPLE_COUNT; - for (int i = 0; i < res.count; i++) { - idx %= VO_PERF_SAMPLE_COUNT; - res.samples[i] = pool->samples[idx++]; + // keep in mind that the debug callback can be asynchronous + struct mp_log *log = (void *)userParam; + int level = MSGL_ERR; + switch (severity) { + case GL_DEBUG_SEVERITY_NOTIFICATION:level = MSGL_V; break; + case GL_DEBUG_SEVERITY_LOW: level = MSGL_INFO; break; + case GL_DEBUG_SEVERITY_MEDIUM: level = MSGL_WARN; break; + case GL_DEBUG_SEVERITY_HIGH: level = MSGL_ERR; break; } - - if (res.count > 0) { - res.last = res.samples[res.count - 1]; - res.avg = pool->sum / res.count; - } - - return res; + mp_msg(log, level, "GL: %s\n", message); } -void mp_log_source(struct mp_log *log, int lev, const char *src) +void gl_set_debug_logger(GL *gl, struct mp_log *log) { - int line = 1; - if (!src) - return; - while (*src) { - const char *end = strchr(src, '\n'); - const char *next = end + 1; - if (!end) - next = end = src + strlen(src); - mp_msg(log, lev, "[%3d] %.*s\n", line, (int)(end - src), src); - line++; - src = next; - } + if (gl->DebugMessageCallback) + gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log); } diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h index 7d00d26cf5..18cab476ed 100644 --- a/video/out/opengl/utils.h +++ b/video/out/opengl/utils.h @@ -1,121 +1,54 @@ -#pragma once +/* + * This file is part of mpv. + * Parts based on MPlayer code by Reimar Döffinger. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#ifndef MP_GL_UTILS_ +#define MP_GL_UTILS_ -#include #include -#include "video/out/vo.h" -#include "ra.h" +#include "video/out/gpu/utils.h" +#include "common.h" -// A 3x2 matrix, with the translation part separate. -struct gl_transform { - // row-major, e.g. in mathematical notation: - // | m[0][0] m[0][1] | - // | m[1][0] m[1][1] | - float m[2][2]; - float t[2]; -}; - -static const struct gl_transform identity_trans = { - .m = {{1.0, 0.0}, {0.0, 1.0}}, - .t = {0.0, 0.0}, -}; - -void gl_transform_ortho(struct gl_transform *t, float x0, float x1, - float y0, float y1); - -// This treats m as an affine transformation, in other words m[2][n] gets -// added to the output. -static inline void gl_transform_vec(struct gl_transform t, float *x, float *y) -{ - float vx = *x, vy = *y; - *x = vx * t.m[0][0] + vy * t.m[0][1] + t.t[0]; - *y = vx * t.m[1][0] + vy * t.m[1][1] + t.t[1]; -} +struct mp_log; -struct mp_rect_f { - float x0, y0, x1, y1; -}; - -// Semantic equality (fuzzy comparison) -static inline bool mp_rect_f_seq(struct mp_rect_f a, struct mp_rect_f b) -{ - return fabs(a.x0 - b.x0) < 1e-6 && fabs(a.x1 - b.x1) < 1e-6 && - fabs(a.y0 - b.y0) < 1e-6 && fabs(a.y1 - b.y1) < 1e-6; -} - -static inline void gl_transform_rect(struct gl_transform t, struct mp_rect_f *r) -{ - gl_transform_vec(t, &r->x0, &r->y0); - gl_transform_vec(t, &r->x1, &r->y1); -} +void gl_check_error(GL *gl, struct mp_log *log, const char *info); -static inline bool gl_transform_eq(struct gl_transform a, struct gl_transform b) -{ - for (int x = 0; x < 2; x++) { - for (int y = 0; y < 2; y++) { - if (a.m[x][y] != b.m[x][y]) - return false; - } - } +void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, + const void *dataptr, int stride, + int x, int y, int w, int h); - return a.t[0] == b.t[0] && a.t[1] == b.t[1]; -} +mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h); -void gl_transform_trans(struct gl_transform t, struct gl_transform *x); - -struct fbodst { - struct ra_tex *tex; - bool flip; // mirror vertically +struct gl_vao { + GL *gl; + GLuint vao; // the VAO object, or 0 if unsupported by driver + GLuint buffer; // GL_ARRAY_BUFFER used for the data + int stride; // size of each element (interleaved elements are assumed) + const struct ra_renderpass_input *entries; + int num_entries; }; -void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo); - -// A pool of buffers, which can grow as needed -struct ra_buf_pool { - struct ra_buf_params current_params; - struct ra_buf **buffers; - int num_buffers; - int index; -}; - -void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool); - -// Note: params->initial_data is *not* supported -struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, - const struct ra_buf_params *params); - -// Helper that wraps ra_tex_upload using texture upload buffers to ensure that -// params->buf is always set. 
This is intended for RA-internal usage. -bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, - const struct ra_tex_upload_params *params); - -// Layout rules for GLSL's packing modes -struct ra_layout std140_layout(struct ra_renderpass_input *inp); -struct ra_layout std430_layout(struct ra_renderpass_input *inp); - -struct fbotex { - struct ra *ra; - struct ra_tex *tex; - int lw, lh; // logical (configured) size, <= than texture size - struct fbodst fbo; -}; - -void fbotex_uninit(struct fbotex *fbo); -bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, - int w, int h, const struct ra_format *fmt, int flags); -#define FBOTEX_FUZZY_W 1 -#define FBOTEX_FUZZY_H 2 -#define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H) - -// A wrapper around ra_timer that does result pooling, averaging etc. -struct timer_pool; +void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, + const struct ra_renderpass_input *entries, + int num_entries); +void gl_vao_uninit(struct gl_vao *vao); +void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num); -struct timer_pool *timer_pool_create(struct ra *ra); -void timer_pool_destroy(struct timer_pool *pool); -void timer_pool_start(struct timer_pool *pool); -void timer_pool_stop(struct timer_pool *pool); -struct mp_pass_perf timer_pool_measure(struct timer_pool *pool); +void gl_set_debug_logger(GL *gl, struct mp_log *log); -// print a multi line string with line numbers (e.g. for shader sources) -// log, lev: module and log level, as in mp_msg() -void mp_log_source(struct mp_log *log, int lev, const char *src); +#endif diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c deleted file mode 100644 index 3362381eff..0000000000 --- a/video/out/opengl/video.c +++ /dev/null @@ -1,3813 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "video.h" - -#include "misc/bstr.h" -#include "options/m_config.h" -#include "common/global.h" -#include "options/options.h" -#include "utils.h" -#include "hwdec.h" -#include "osd.h" -#include "ra.h" -#include "stream/stream.h" -#include "video_shaders.h" -#include "user_shaders.h" -#include "video/out/filter_kernels.h" -#include "video/out/aspect.h" -#include "video/out/dither.h" -#include "video/out/vo.h" - -// scale/cscale arguments that map directly to shader filter routines. -// Note that the convolution filters are not included in this list. 
-static const char *const fixed_scale_filters[] = { - "bilinear", - "bicubic_fast", - "oversample", - NULL -}; -static const char *const fixed_tscale_filters[] = { - "oversample", - "linear", - NULL -}; - -// must be sorted, and terminated with 0 -int filter_sizes[] = - {2, 4, 6, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 0}; -int tscale_sizes[] = {2, 4, 6, 0}; // limited by TEXUNIT_VIDEO_NUM - -struct vertex_pt { - float x, y; -}; - -struct vertex { - struct vertex_pt position; - struct vertex_pt texcoord[TEXUNIT_VIDEO_NUM]; -}; - -static const struct ra_renderpass_input vertex_vao[] = { - {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, - {"texcoord0", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[0])}, - {"texcoord1", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[1])}, - {"texcoord2", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[2])}, - {"texcoord3", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[3])}, - {"texcoord4", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[4])}, - {"texcoord5", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[5])}, - {0} -}; - -struct texplane { - struct ra_tex *tex; - int w, h; - bool flipped; -}; - -struct video_image { - struct texplane planes[4]; - struct mp_image *mpi; // original input image - uint64_t id; // unique ID identifying mpi contents - bool hwdec_mapped; -}; - -enum plane_type { - PLANE_NONE = 0, - PLANE_RGB, - PLANE_LUMA, - PLANE_CHROMA, - PLANE_ALPHA, - PLANE_XYZ, -}; - -static const char *plane_names[] = { - [PLANE_NONE] = "unknown", - [PLANE_RGB] = "rgb", - [PLANE_LUMA] = "luma", - [PLANE_CHROMA] = "chroma", - [PLANE_ALPHA] = "alpha", - [PLANE_XYZ] = "xyz", -}; - -// A self-contained description of a source image which can be bound to a -// texture unit and sampled from. Contains metadata about how it's to be used -struct img_tex { - enum plane_type type; // must be set to something non-zero - int components; // number of relevant coordinates - float multiplier; // multiplier to be used when sampling - struct ra_tex *tex; - int w, h; // logical size (after transformation) - struct gl_transform transform; // rendering transformation -}; - -// A named img_tex, for user scripting purposes -struct saved_tex { - const char *name; - struct img_tex tex; -}; - -// A texture hook. This is some operation that transforms a named texture as -// soon as it's generated -struct tex_hook { - const char *save_tex; - const char *hook_tex[SHADER_MAX_HOOKS]; - const char *bind_tex[TEXUNIT_VIDEO_NUM]; - int components; // how many components are relevant (0 = same as input) - void *priv; // this gets talloc_freed when the tex_hook is removed - void (*hook)(struct gl_video *p, struct img_tex tex, // generates GLSL - struct gl_transform *trans, void *priv); - bool (*cond)(struct gl_video *p, struct img_tex tex, void *priv); -}; - -struct fbosurface { - struct fbotex fbotex; - uint64_t id; - double pts; -}; - -#define FBOSURFACES_MAX 10 - -struct cached_file { - char *path; - struct bstr body; -}; - -struct pass_info { - struct bstr desc; - struct mp_pass_perf perf; -}; - -#define PASS_INFO_MAX (SHADER_MAX_PASSES + 32) - -struct dr_buffer { - struct ra_buf *buf; - // The mpi reference will keep the data from being recycled (or from other - // references gaining write access) while the GPU is accessing the buffer. 
- struct mp_image *mpi; -}; - -struct gl_video { - struct ra *ra; - - struct mpv_global *global; - struct mp_log *log; - struct gl_video_opts opts; - struct m_config_cache *opts_cache; - struct gl_lcms *cms; - - int fb_depth; // actual bits available in GL main framebuffer - struct m_color clear_color; - bool force_clear_color; - - struct gl_shader_cache *sc; - - struct osd_state *osd_state; - struct mpgl_osd *osd; - double osd_pts; - - struct ra_tex *lut_3d_texture; - bool use_lut_3d; - int lut_3d_size[3]; - - struct ra_tex *dither_texture; - - struct mp_image_params real_image_params; // configured format - struct mp_image_params image_params; // texture format (mind hwdec case) - struct ra_imgfmt_desc ra_format; // texture format - int plane_count; - - bool is_gray; - bool has_alpha; - char color_swizzle[5]; - bool use_integer_conversion; - - struct video_image image; - - struct dr_buffer *dr_buffers; - int num_dr_buffers; - - bool using_dr_path; - - bool dumb_mode; - bool forced_dumb_mode; - - const struct ra_format *fbo_format; - struct fbotex merge_fbo[4]; - struct fbotex scale_fbo[4]; - struct fbotex integer_fbo[4]; - struct fbotex indirect_fbo; - struct fbotex blend_subs_fbo; - struct fbotex screen_fbo; - struct fbotex output_fbo; - struct fbosurface surfaces[FBOSURFACES_MAX]; - struct fbotex vdpau_deinterleave_fbo[2]; - struct ra_buf *hdr_peak_ssbo; - - // user pass descriptions and textures - struct tex_hook tex_hooks[SHADER_MAX_PASSES]; - int tex_hook_num; - struct gl_user_shader_tex user_textures[SHADER_MAX_PASSES]; - int user_tex_num; - - int surface_idx; - int surface_now; - int frames_drawn; - bool is_interpolated; - bool output_fbo_valid; - - // state for configured scalers - struct scaler scaler[SCALER_COUNT]; - - struct mp_csp_equalizer_state *video_eq; - - struct mp_rect src_rect; // displayed part of the source video - struct mp_rect dst_rect; // video rectangle on output window - struct mp_osd_res osd_rect; // OSD size/margins - - // temporary during rendering - struct img_tex pass_tex[TEXUNIT_VIDEO_NUM]; - struct compute_info pass_compute; // compute shader metadata for this pass - int pass_tex_num; - int texture_w, texture_h; - struct gl_transform texture_offset; // texture transform without rotation - int components; - bool use_linear; - float user_gamma; - - // pass info / metrics - struct pass_info pass_fresh[PASS_INFO_MAX]; - struct pass_info pass_redraw[PASS_INFO_MAX]; - struct pass_info *pass; - int pass_idx; - struct timer_pool *upload_timer; - struct timer_pool *blit_timer; - struct timer_pool *osd_timer; - - // intermediate textures - struct saved_tex saved_tex[SHADER_MAX_SAVED]; - int saved_tex_num; - struct fbotex hook_fbos[SHADER_MAX_SAVED]; - int hook_fbo_num; - - int frames_uploaded; - int frames_rendered; - AVLFG lfg; - - // Cached because computing it can take relatively long - int last_dither_matrix_size; - float *last_dither_matrix; - - struct cached_file *files; - int num_files; - - struct ra_hwdec *hwdec; - struct ra_hwdec_mapper *hwdec_mapper; - bool hwdec_active; - - bool dsi_warned; - bool broken_frame; // temporary error state -}; - -static const struct gl_video_opts gl_video_opts_def = { - .dither_algo = DITHER_FRUIT, - .dither_depth = -1, - .dither_size = 6, - .temporal_dither_period = 1, - .fbo_format = "auto", - .sigmoid_center = 0.75, - .sigmoid_slope = 6.5, - .scaler = { - {{"bilinear", .params={NAN, NAN}}, {.params = {NAN, NAN}}, - .cutoff = 0.001}, // scale - {{NULL, .params={NAN, NAN}}, {.params = {NAN, NAN}}, - .cutoff = 0.001}, // 
dscale - {{"bilinear", .params={NAN, NAN}}, {.params = {NAN, NAN}}, - .cutoff = 0.001}, // cscale - {{"mitchell", .params={NAN, NAN}}, {.params = {NAN, NAN}}, - .clamp = 1, }, // tscale - }, - .scaler_resizes_only = 1, - .scaler_lut_size = 6, - .interpolation_threshold = 0.0001, - .alpha_mode = ALPHA_BLEND_TILES, - .background = {0, 0, 0, 255}, - .gamma = 1.0f, - .tone_mapping = TONE_MAPPING_MOBIUS, - .tone_mapping_param = NAN, - .tone_mapping_desat = 2.0, - .early_flush = -1, -}; - -static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param); - -static int validate_window_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param); - -#define OPT_BASE_STRUCT struct gl_video_opts - -#define SCALER_OPTS(n, i) \ - OPT_STRING_VALIDATE(n, scaler[i].kernel.name, 0, validate_scaler_opt), \ - OPT_FLOAT(n"-param1", scaler[i].kernel.params[0], 0), \ - OPT_FLOAT(n"-param2", scaler[i].kernel.params[1], 0), \ - OPT_FLOAT(n"-blur", scaler[i].kernel.blur, 0), \ - OPT_FLOATRANGE(n"-cutoff", scaler[i].cutoff, 0, 0.0, 1.0), \ - OPT_FLOATRANGE(n"-taper", scaler[i].kernel.taper, 0, 0.0, 1.0), \ - OPT_FLOAT(n"-wparam", scaler[i].window.params[0], 0), \ - OPT_FLOAT(n"-wblur", scaler[i].window.blur, 0), \ - OPT_FLOATRANGE(n"-wtaper", scaler[i].window.taper, 0, 0.0, 1.0), \ - OPT_FLOATRANGE(n"-clamp", scaler[i].clamp, 0, 0.0, 1.0), \ - OPT_FLOATRANGE(n"-radius", scaler[i].radius, 0, 0.5, 16.0), \ - OPT_FLOATRANGE(n"-antiring", scaler[i].antiring, 0, 0.0, 1.0), \ - OPT_STRING_VALIDATE(n"-window", scaler[i].window.name, 0, validate_window_opt) - -const struct m_sub_options gl_video_conf = { - .opts = (const m_option_t[]) { - OPT_CHOICE("opengl-dumb-mode", dumb_mode, 0, - ({"auto", 0}, {"yes", 1}, {"no", -1})), - OPT_FLOATRANGE("opengl-gamma", gamma, 0, 0.1, 2.0), - OPT_FLAG("gamma-auto", gamma_auto, 0), - OPT_CHOICE_C("target-prim", target_prim, 0, mp_csp_prim_names), - OPT_CHOICE_C("target-trc", target_trc, 0, mp_csp_trc_names), - OPT_CHOICE("tone-mapping", tone_mapping, 0, - ({"clip", TONE_MAPPING_CLIP}, - {"mobius", TONE_MAPPING_MOBIUS}, - {"reinhard", TONE_MAPPING_REINHARD}, - {"hable", TONE_MAPPING_HABLE}, - {"gamma", TONE_MAPPING_GAMMA}, - {"linear", TONE_MAPPING_LINEAR})), - OPT_FLAG("hdr-compute-peak", compute_hdr_peak, 0), - OPT_FLOAT("tone-mapping-param", tone_mapping_param, 0), - OPT_FLOAT("tone-mapping-desaturate", tone_mapping_desat, 0), - OPT_FLAG("gamut-warning", gamut_warning, 0), - OPT_FLAG("opengl-pbo", pbo, 0), - SCALER_OPTS("scale", SCALER_SCALE), - SCALER_OPTS("dscale", SCALER_DSCALE), - SCALER_OPTS("cscale", SCALER_CSCALE), - SCALER_OPTS("tscale", SCALER_TSCALE), - OPT_INTRANGE("scaler-lut-size", scaler_lut_size, 0, 4, 10), - OPT_FLAG("scaler-resizes-only", scaler_resizes_only, 0), - OPT_FLAG("linear-scaling", linear_scaling, 0), - OPT_FLAG("correct-downscaling", correct_downscaling, 0), - OPT_FLAG("sigmoid-upscaling", sigmoid_upscaling, 0), - OPT_FLOATRANGE("sigmoid-center", sigmoid_center, 0, 0.0, 1.0), - OPT_FLOATRANGE("sigmoid-slope", sigmoid_slope, 0, 1.0, 20.0), - OPT_STRING("opengl-fbo-format", fbo_format, 0), - OPT_CHOICE_OR_INT("dither-depth", dither_depth, 0, -1, 16, - ({"no", -1}, {"auto", 0})), - OPT_CHOICE("dither", dither_algo, 0, - ({"fruit", DITHER_FRUIT}, - {"ordered", DITHER_ORDERED}, - {"no", DITHER_NONE})), - OPT_INTRANGE("dither-size-fruit", dither_size, 0, 2, 8), - OPT_FLAG("temporal-dither", temporal_dither, 0), - OPT_INTRANGE("temporal-dither-period", temporal_dither_period, 0, 1, 128), - 
OPT_CHOICE("alpha", alpha_mode, 0, - ({"no", ALPHA_NO}, - {"yes", ALPHA_YES}, - {"blend", ALPHA_BLEND}, - {"blend-tiles", ALPHA_BLEND_TILES})), - OPT_FLAG("opengl-rectangle-textures", use_rectangle, 0), - OPT_COLOR("background", background, 0), - OPT_FLAG("interpolation", interpolation, 0), - OPT_FLOAT("interpolation-threshold", interpolation_threshold, 0), - OPT_CHOICE("blend-subtitles", blend_subs, 0, - ({"no", BLEND_SUBS_NO}, - {"yes", BLEND_SUBS_YES}, - {"video", BLEND_SUBS_VIDEO})), - OPT_PATHLIST("opengl-shaders", user_shaders, 0), - OPT_CLI_ALIAS("opengl-shader", "opengl-shaders-append"), - OPT_FLAG("deband", deband, 0), - OPT_SUBSTRUCT("deband", deband_opts, deband_conf, 0), - OPT_FLOAT("sharpen", unsharp, 0), - OPT_INTRANGE("opengl-tex-pad-x", tex_pad_x, 0, 0, 4096), - OPT_INTRANGE("opengl-tex-pad-y", tex_pad_y, 0, 0, 4096), - OPT_SUBSTRUCT("", icc_opts, mp_icc_conf, 0), - OPT_CHOICE("opengl-early-flush", early_flush, 0, - ({"no", 0}, {"yes", 1}, {"auto", -1})), - OPT_STRING("opengl-shader-cache-dir", shader_cache_dir, 0), - OPT_REPLACED("hdr-tone-mapping", "tone-mapping"), - {0} - }, - .size = sizeof(struct gl_video_opts), - .defaults = &gl_video_opts_def, -}; - -static void uninit_rendering(struct gl_video *p); -static void uninit_scaler(struct gl_video *p, struct scaler *scaler); -static void check_gl_features(struct gl_video *p); -static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t id); -static const char *handle_scaler_opt(const char *name, bool tscale); -static void reinit_from_options(struct gl_video *p); -static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]); -static void gl_video_setup_hooks(struct gl_video *p); - -#define GLSL(x) gl_sc_add(p->sc, #x "\n"); -#define GLSLF(...) gl_sc_addf(p->sc, __VA_ARGS__) -#define GLSLHF(...) gl_sc_haddf(p->sc, __VA_ARGS__) -#define PRELUDE(...) gl_sc_paddf(p->sc, __VA_ARGS__) - -static struct bstr load_cached_file(struct gl_video *p, const char *path) -{ - if (!path || !path[0]) - return (struct bstr){0}; - for (int n = 0; n < p->num_files; n++) { - if (strcmp(p->files[n].path, path) == 0) - return p->files[n].body; - } - // not found -> load it - struct bstr s = stream_read_file(path, p, p->global, 1024000); // 1024 kB - if (s.len) { - struct cached_file new = { - .path = talloc_strdup(p, path), - .body = s, - }; - MP_TARRAY_APPEND(p, p->files, p->num_files, new); - return new.body; - } - return (struct bstr){0}; -} - -static void debug_check_gl(struct gl_video *p, const char *msg) -{ - if (p->ra->fns->debug_marker) - p->ra->fns->debug_marker(p->ra, msg); -} - -static void gl_video_reset_surfaces(struct gl_video *p) -{ - for (int i = 0; i < FBOSURFACES_MAX; i++) { - p->surfaces[i].id = 0; - p->surfaces[i].pts = MP_NOPTS_VALUE; - } - p->surface_idx = 0; - p->surface_now = 0; - p->frames_drawn = 0; - p->output_fbo_valid = false; -} - -static void gl_video_reset_hooks(struct gl_video *p) -{ - for (int i = 0; i < p->tex_hook_num; i++) - talloc_free(p->tex_hooks[i].priv); - - for (int i = 0; i < p->user_tex_num; i++) - ra_tex_free(p->ra, &p->user_textures[i].tex); - - p->tex_hook_num = 0; - p->user_tex_num = 0; -} - -static inline int fbosurface_wrap(int id) -{ - id = id % FBOSURFACES_MAX; - return id < 0 ? 
id + FBOSURFACES_MAX : id; -} - -static void reinit_osd(struct gl_video *p) -{ - mpgl_osd_destroy(p->osd); - p->osd = NULL; - if (p->osd_state) - p->osd = mpgl_osd_init(p->ra, p->log, p->osd_state); -} - -static void uninit_rendering(struct gl_video *p) -{ - for (int n = 0; n < SCALER_COUNT; n++) - uninit_scaler(p, &p->scaler[n]); - - ra_tex_free(p->ra, &p->dither_texture); - - for (int n = 0; n < 4; n++) { - fbotex_uninit(&p->merge_fbo[n]); - fbotex_uninit(&p->scale_fbo[n]); - fbotex_uninit(&p->integer_fbo[n]); - } - - fbotex_uninit(&p->indirect_fbo); - fbotex_uninit(&p->blend_subs_fbo); - fbotex_uninit(&p->screen_fbo); - fbotex_uninit(&p->output_fbo); - - for (int n = 0; n < FBOSURFACES_MAX; n++) - fbotex_uninit(&p->surfaces[n].fbotex); - - for (int n = 0; n < SHADER_MAX_SAVED; n++) - fbotex_uninit(&p->hook_fbos[n]); - - for (int n = 0; n < 2; n++) - fbotex_uninit(&p->vdpau_deinterleave_fbo[n]); - - gl_video_reset_surfaces(p); - gl_video_reset_hooks(p); - - gl_sc_reset_error(p->sc); -} - -bool gl_video_gamma_auto_enabled(struct gl_video *p) -{ - return p->opts.gamma_auto; -} - -struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p) -{ - return (struct mp_colorspace) { - .primaries = p->opts.target_prim, - .gamma = p->opts.target_trc, - }; -} - -// Warning: profile.start must point to a ta allocation, and the function -// takes over ownership. -void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data) -{ - if (gl_lcms_set_memory_profile(p->cms, icc_data)) - reinit_from_options(p); -} - -bool gl_video_icc_auto_enabled(struct gl_video *p) -{ - return p->opts.icc_opts ? p->opts.icc_opts->profile_auto : false; -} - -static bool gl_video_get_lut3d(struct gl_video *p, enum mp_csp_prim prim, - enum mp_csp_trc trc) -{ - if (!p->use_lut_3d) - return false; - - struct AVBufferRef *icc = NULL; - if (p->image.mpi) - icc = p->image.mpi->icc_profile; - - if (p->lut_3d_texture && !gl_lcms_has_changed(p->cms, prim, trc, icc)) - return true; - - // GLES3 doesn't provide filtered 16 bit integer textures - // GLES2 doesn't even provide 3D textures - const struct ra_format *fmt = ra_find_unorm_format(p->ra, 2, 4); - if (!fmt || !(p->ra->caps & RA_CAP_TEX_3D)) { - p->use_lut_3d = false; - MP_WARN(p, "Disabling color management (no RGBA16 3D textures).\n"); - return false; - } - - struct lut3d *lut3d = NULL; - if (!fmt || !gl_lcms_get_lut3d(p->cms, &lut3d, prim, trc, icc) || !lut3d) { - p->use_lut_3d = false; - return false; - } - - ra_tex_free(p->ra, &p->lut_3d_texture); - - struct ra_tex_params params = { - .dimensions = 3, - .w = lut3d->size[0], - .h = lut3d->size[1], - .d = lut3d->size[2], - .format = fmt, - .render_src = true, - .src_linear = true, - .initial_data = lut3d->data, - }; - p->lut_3d_texture = ra_tex_create(p->ra, ¶ms); - - debug_check_gl(p, "after 3d lut creation"); - - for (int i = 0; i < 3; i++) - p->lut_3d_size[i] = lut3d->size[i]; - - talloc_free(lut3d); - - return true; -} - -// Fill an img_tex struct from an FBO + some metadata -static struct img_tex img_tex_fbo(struct fbotex *fbo, enum plane_type type, - int components) -{ - assert(type != PLANE_NONE); - return (struct img_tex){ - .type = type, - .tex = fbo->tex, - .multiplier = 1.0, - .w = fbo->lw, - .h = fbo->lh, - .transform = identity_trans, - .components = components, - }; -} - -// Bind an img_tex to a free texture unit and return its ID. 
At most -// TEXUNIT_VIDEO_NUM texture units can be bound at once -static int pass_bind(struct gl_video *p, struct img_tex tex) -{ - assert(p->pass_tex_num < TEXUNIT_VIDEO_NUM); - p->pass_tex[p->pass_tex_num] = tex; - return p->pass_tex_num++; -} - -// Rotation by 90° and flipping. -// w/h is used for recentering. -static void get_transform(float w, float h, int rotate, bool flip, - struct gl_transform *out_tr) -{ - int a = rotate % 90 ? 0 : rotate / 90; - int sin90[4] = {0, 1, 0, -1}; // just to avoid rounding issues etc. - int cos90[4] = {1, 0, -1, 0}; - struct gl_transform tr = {{{ cos90[a], sin90[a]}, - {-sin90[a], cos90[a]}}}; - - // basically, recenter to keep the whole image in view - float b[2] = {1, 1}; - gl_transform_vec(tr, &b[0], &b[1]); - tr.t[0] += b[0] < 0 ? w : 0; - tr.t[1] += b[1] < 0 ? h : 0; - - if (flip) { - struct gl_transform fliptr = {{{1, 0}, {0, -1}}, {0, h}}; - gl_transform_trans(fliptr, &tr); - } - - *out_tr = tr; -} - -// Return the chroma plane upscaled to luma size, but with additional padding -// for image sizes not aligned to subsampling. -static int chroma_upsize(int size, int pixel) -{ - return (size + pixel - 1) / pixel * pixel; -} - -// If a and b are on the same plane, return what plane type should be used. -// If a or b are none, the other type always wins. -// Usually: LUMA/RGB/XYZ > CHROMA > ALPHA -static enum plane_type merge_plane_types(enum plane_type a, enum plane_type b) -{ - if (a == PLANE_NONE) - return b; - if (b == PLANE_LUMA || b == PLANE_RGB || b == PLANE_XYZ) - return b; - if (b != PLANE_NONE && a == PLANE_ALPHA) - return b; - return a; -} - -// Places a video_image's image textures + associated metadata into tex[]. The -// number of textures is equal to p->plane_count. Any necessary plane offsets -// are stored in off. (e.g. chroma position) -static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, - struct img_tex tex[4], struct gl_transform off[4]) -{ - assert(vimg->mpi); - - int w = p->image_params.w; - int h = p->image_params.h; - - // Determine the chroma offset - float ls_w = 1.0 / p->ra_format.chroma_w; - float ls_h = 1.0 / p->ra_format.chroma_h; - - struct gl_transform chroma = {{{ls_w, 0.0}, {0.0, ls_h}}}; - - if (p->image_params.chroma_location != MP_CHROMA_CENTER) { - int cx, cy; - mp_get_chroma_location(p->image_params.chroma_location, &cx, &cy); - // By default texture coordinates are such that chroma is centered with - // any chroma subsampling. If a specific direction is given, make it - // so that the luma and chroma sample line up exactly. - // For 4:4:4, setting chroma location should have no effect at all. - // luma sample size (in chroma coord. space) - chroma.t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0; - chroma.t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0; - } - - int msb_valid_bits = - p->ra_format.component_bits + MPMIN(p->ra_format.component_pad, 0); - // The existing code assumes we just have a single tex multiplier for - // all of the planes. 
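/* A worked example of get_transform() above: for rotate == 90 we get a == 1,
 * so m = {{cos90[1], sin90[1]}, {-sin90[1], cos90[1]}} = {{0, 1}, {-1, 0}}.
 * Probing the corner b = (1, 1) yields (1, -1), so only t[1] gets the +h
 * recentering, and a point (x, y) ends up at (y, h - x). The extra flip
 * matrix {{1, 0}, {0, -1}} with t = {0, h} is simply a vertical mirror,
 * mapping (x, y) to (x, h - y). */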
This may change in the future - float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.color.space, - msb_valid_bits, - p->ra_format.component_bits); - - memset(tex, 0, 4 * sizeof(tex[0])); - for (int n = 0; n < p->plane_count; n++) { - struct texplane *t = &vimg->planes[n]; - - enum plane_type type = PLANE_NONE; - for (int i = 0; i < 4; i++) { - int c = p->ra_format.components[n][i]; - enum plane_type ctype; - if (c == 0) { - ctype = PLANE_NONE; - } else if (c == 4) { - ctype = PLANE_ALPHA; - } else if (p->image_params.color.space == MP_CSP_RGB) { - ctype = PLANE_RGB; - } else if (p->image_params.color.space == MP_CSP_XYZ) { - ctype = PLANE_XYZ; - } else { - ctype = c == 1 ? PLANE_LUMA : PLANE_CHROMA; - } - type = merge_plane_types(type, ctype); - } - - tex[n] = (struct img_tex){ - .type = type, - .tex = t->tex, - .multiplier = tex_mul, - .w = t->w, - .h = t->h, - }; - - for (int i = 0; i < 4; i++) - tex[n].components += !!p->ra_format.components[n][i]; - - get_transform(t->w, t->h, p->image_params.rotate, t->flipped, - &tex[n].transform); - if (p->image_params.rotate % 180 == 90) - MPSWAP(int, tex[n].w, tex[n].h); - - off[n] = identity_trans; - - if (type == PLANE_CHROMA) { - struct gl_transform rot; - get_transform(0, 0, p->image_params.rotate, true, &rot); - - struct gl_transform tr = chroma; - gl_transform_vec(rot, &tr.t[0], &tr.t[1]); - - float dx = (chroma_upsize(w, p->ra_format.chroma_w) - w) * ls_w; - float dy = (chroma_upsize(h, p->ra_format.chroma_h) - h) * ls_h; - - // Adjust the chroma offset if the real chroma size is fractional - // due image sizes not aligned to chroma subsampling. - struct gl_transform rot2; - get_transform(0, 0, p->image_params.rotate, t->flipped, &rot2); - if (rot2.m[0][0] < 0) - tr.t[0] += dx; - if (rot2.m[1][0] < 0) - tr.t[0] += dy; - if (rot2.m[0][1] < 0) - tr.t[1] += dx; - if (rot2.m[1][1] < 0) - tr.t[1] += dy; - - off[n] = tr; - } - } -} - -// Return the index of the given component (assuming all non-padding components -// of all planes are concatenated into a linear list). -static int find_comp(struct ra_imgfmt_desc *desc, int component) -{ - int cur = 0; - for (int n = 0; n < desc->num_planes; n++) { - for (int i = 0; i < 4; i++) { - if (desc->components[n][i]) { - if (desc->components[n][i] == component) - return cur; - cur++; - } - } - } - return -1; -} - -static void init_video(struct gl_video *p) -{ - p->use_integer_conversion = false; - - if (p->hwdec && ra_hwdec_test_format(p->hwdec, p->image_params.imgfmt)) { - if (p->hwdec->driver->overlay_frame) { - MP_WARN(p, "Using HW-overlay mode. 
No GL filtering is performed " - "on the video!\n"); - } else { - p->hwdec_mapper = ra_hwdec_mapper_create(p->hwdec, &p->image_params); - if (!p->hwdec_mapper) - MP_ERR(p, "Initializing texture for hardware decoding failed.\n"); - } - if (p->hwdec_mapper) - p->image_params = p->hwdec_mapper->dst_params; - const char **exts = p->hwdec->glsl_extensions; - for (int n = 0; exts && exts[n]; n++) - gl_sc_enable_extension(p->sc, (char *)exts[n]); - p->hwdec_active = true; - } - - p->ra_format = (struct ra_imgfmt_desc){0}; - ra_get_imgfmt_desc(p->ra, p->image_params.imgfmt, &p->ra_format); - - p->plane_count = p->ra_format.num_planes; - - p->has_alpha = false; - p->is_gray = true; - - for (int n = 0; n < p->ra_format.num_planes; n++) { - for (int i = 0; i < 4; i++) { - if (p->ra_format.components[n][i]) { - p->has_alpha |= p->ra_format.components[n][i] == 4; - p->is_gray &= p->ra_format.components[n][i] == 1 || - p->ra_format.components[n][i] == 4; - } - } - } - - for (int c = 0; c < 4; c++) { - int loc = find_comp(&p->ra_format, c + 1); - p->color_swizzle[c] = "rgba"[loc >= 0 && loc < 4 ? loc : 0]; - } - p->color_swizzle[4] = '\0'; - - // Format-dependent checks. - check_gl_features(p); - - mp_image_params_guess_csp(&p->image_params); - - av_lfg_init(&p->lfg, 1); - - debug_check_gl(p, "before video texture creation"); - - if (!p->hwdec_active) { - struct video_image *vimg = &p->image; - - struct mp_image layout = {0}; - mp_image_set_params(&layout, &p->image_params); - - for (int n = 0; n < p->plane_count; n++) { - struct texplane *plane = &vimg->planes[n]; - const struct ra_format *format = p->ra_format.planes[n]; - - plane->w = mp_image_plane_w(&layout, n); - plane->h = mp_image_plane_h(&layout, n); - - struct ra_tex_params params = { - .dimensions = 2, - .w = plane->w + p->opts.tex_pad_x, - .h = plane->h + p->opts.tex_pad_y, - .d = 1, - .format = format, - .render_src = true, - .src_linear = format->linear_filter, - .non_normalized = p->opts.use_rectangle, - .host_mutable = true, - }; - - MP_VERBOSE(p, "Texture for plane %d: %dx%d\n", n, - params.w, params.h); - - plane->tex = ra_tex_create(p->ra, ¶ms); - if (!plane->tex) - abort(); // shit happens - - p->use_integer_conversion |= format->ctype == RA_CTYPE_UINT; - } - } - - debug_check_gl(p, "after video texture creation"); - - gl_video_setup_hooks(p); -} - -// Release any texture mappings associated with the current frame. -static void unmap_current_image(struct gl_video *p) -{ - struct video_image *vimg = &p->image; - - if (vimg->hwdec_mapped) { - assert(p->hwdec_active && p->hwdec_mapper); - ra_hwdec_mapper_unmap(p->hwdec_mapper); - memset(vimg->planes, 0, sizeof(vimg->planes)); - vimg->hwdec_mapped = false; - vimg->id = 0; // needs to be mapped again - } -} - -static struct dr_buffer *gl_find_dr_buffer(struct gl_video *p, uint8_t *ptr) -{ - for (int i = 0; i < p->num_dr_buffers; i++) { - struct dr_buffer *buffer = &p->dr_buffers[i]; - uint8_t *bufptr = buffer->buf->data; - size_t size = buffer->buf->params.size; - if (ptr >= bufptr && ptr < bufptr + size) - return buffer; - } - - return NULL; -} - -static void gc_pending_dr_fences(struct gl_video *p, bool force) -{ -again:; - for (int n = 0; n < p->num_dr_buffers; n++) { - struct dr_buffer *buffer = &p->dr_buffers[n]; - if (!buffer->mpi) - continue; - - bool res = p->ra->fns->buf_poll(p->ra, buffer->buf); - if (res || force) { - // Unreferencing the image could cause gl_video_dr_free_buffer() - // to be called by the talloc destructor (if it was the last - // reference). 
This will implicitly invalidate the buffer pointer - // and change the p->dr_buffers array. To make it worse, it could - // free multiple dr_buffers due to weird theoretical corner cases. - // This is also why we use the goto to iterate again from the - // start, because everything gets fucked up. Hail satan! - struct mp_image *ref = buffer->mpi; - buffer->mpi = NULL; - talloc_free(ref); - goto again; - } - } -} - -static void unref_current_image(struct gl_video *p) -{ - unmap_current_image(p); - p->image.id = 0; - - mp_image_unrefp(&p->image.mpi); - - // While we're at it, also garbage collect pending fences in here to - // get it out of the way. - gc_pending_dr_fences(p, false); -} - -// If overlay mode is used, make sure to remove the overlay. -// Be careful with this. Removing the overlay and adding another one will -// lead to flickering artifacts. -static void unmap_overlay(struct gl_video *p) -{ - if (p->hwdec_active && p->hwdec->driver->overlay_frame) - p->hwdec->driver->overlay_frame(p->hwdec, NULL, NULL, NULL, true); -} - -static void uninit_video(struct gl_video *p) -{ - uninit_rendering(p); - - struct video_image *vimg = &p->image; - - unmap_overlay(p); - unref_current_image(p); - - for (int n = 0; n < p->plane_count; n++) { - struct texplane *plane = &vimg->planes[n]; - ra_tex_free(p->ra, &plane->tex); - } - *vimg = (struct video_image){0}; - - // Invalidate image_params to ensure that gl_video_config() will call - // init_video() on uninitialized gl_video. - p->real_image_params = (struct mp_image_params){0}; - p->image_params = p->real_image_params; - p->hwdec_active = false; - ra_hwdec_mapper_free(&p->hwdec_mapper); -} - -static void pass_record(struct gl_video *p, struct mp_pass_perf perf) -{ - if (!p->pass || p->pass_idx == PASS_INFO_MAX) - return; - - struct pass_info *pass = &p->pass[p->pass_idx]; - pass->perf = perf; - - if (pass->desc.len == 0) - bstr_xappend(p, &pass->desc, bstr0("(unknown)")); - - p->pass_idx++; -} - -PRINTF_ATTRIBUTE(2, 3) -static void pass_describe(struct gl_video *p, const char *textf, ...) -{ - if (!p->pass || p->pass_idx == PASS_INFO_MAX) - return; - - struct pass_info *pass = &p->pass[p->pass_idx]; - - if (pass->desc.len > 0) - bstr_xappend(p, &pass->desc, bstr0(" + ")); - - va_list ap; - va_start(ap, textf); - bstr_xappend_vasprintf(p, &pass->desc, textf, ap); - va_end(ap); -} - -static void pass_info_reset(struct gl_video *p, bool is_redraw) -{ - p->pass = is_redraw ? 
p->pass_redraw : p->pass_fresh; - p->pass_idx = 0; - - for (int i = 0; i < PASS_INFO_MAX; i++) { - p->pass[i].desc.len = 0; - p->pass[i].perf = (struct mp_pass_perf){0}; - } -} - -static void pass_report_performance(struct gl_video *p) -{ - if (!p->pass) - return; - - for (int i = 0; i < PASS_INFO_MAX; i++) { - struct pass_info *pass = &p->pass[i]; - if (pass->desc.len) { - MP_DBG(p, "pass '%.*s': last %dus avg %dus peak %dus\n", - BSTR_P(pass->desc), - (int)pass->perf.last/1000, - (int)pass->perf.avg/1000, - (int)pass->perf.peak/1000); - } - } -} - -static void pass_prepare_src_tex(struct gl_video *p) -{ - struct gl_shader_cache *sc = p->sc; - - for (int n = 0; n < p->pass_tex_num; n++) { - struct img_tex *s = &p->pass_tex[n]; - if (!s->tex) - continue; - - char *texture_name = mp_tprintf(32, "texture%d", n); - char *texture_size = mp_tprintf(32, "texture_size%d", n); - char *texture_rot = mp_tprintf(32, "texture_rot%d", n); - char *texture_off = mp_tprintf(32, "texture_off%d", n); - char *pixel_size = mp_tprintf(32, "pixel_size%d", n); - - gl_sc_uniform_texture(sc, texture_name, s->tex); - float f[2] = {1, 1}; - if (!s->tex->params.non_normalized) { - f[0] = s->tex->params.w; - f[1] = s->tex->params.h; - } - gl_sc_uniform_vec2(sc, texture_size, f); - gl_sc_uniform_mat2(sc, texture_rot, true, (float *)s->transform.m); - gl_sc_uniform_vec2(sc, texture_off, (float *)s->transform.t); - gl_sc_uniform_vec2(sc, pixel_size, (float[]){1.0f / f[0], - 1.0f / f[1]}); - } -} - -// Sets the appropriate compute shader metadata for an implicit compute pass -// bw/bh: block size -static void pass_is_compute(struct gl_video *p, int bw, int bh) -{ - p->pass_compute = (struct compute_info){ - .active = true, - .block_w = bw, - .block_h = bh, - }; -} - -// w/h: the width/height of the compute shader's operating domain (e.g. the -// target target that needs to be written, or the source texture that needs to -// be reduced) -static void dispatch_compute(struct gl_video *p, int w, int h, - struct compute_info info) -{ - PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", - info.threads_w > 0 ? info.threads_w : info.block_w, - info.threads_h > 0 ? 
info.threads_h : info.block_h); - - pass_prepare_src_tex(p); - gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); - - // Since we don't actually have vertices, we pretend for convenience - // reasons that we do and calculate the right texture coordinates based on - // the output sample ID - gl_sc_uniform_vec2(p->sc, "out_scale", (float[2]){ 1.0 / w, 1.0 / h }); - PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n"); - - for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { - struct img_tex *s = &p->pass_tex[n]; - if (!s->tex) - continue; - - // We need to rescale the coordinates to the true texture size - char tex_scale[32]; - snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n); - gl_sc_uniform_vec2(p->sc, tex_scale, (float[2]){ - (float)s->w / s->tex->params.w, - (float)s->h / s->tex->params.h, - }); - - PRELUDE("#define texcoord%d_raw(id) (tex_scale%d * outcoord(id))\n", n, n); - PRELUDE("#define texcoord%d_rot(id) (texture_rot%d * texcoord%d_raw(id) + " - "pixel_size%d * texture_off%d)\n", n, n, n, n, n); - // Clamp the texture coordinates to prevent sampling out-of-bounds in - // threads that exceed the requested width/height - PRELUDE("#define texmap%d(id) min(texcoord%d_rot(id), vec2(1.0))\n", n, n); - PRELUDE("#define texcoord%d texmap%d(gl_GlobalInvocationID)\n", n, n); - } - - // always round up when dividing to make sure we don't leave off a part of - // the image - int num_x = info.block_w > 0 ? (w + info.block_w - 1) / info.block_w : 1, - num_y = info.block_h > 0 ? (h + info.block_h - 1) / info.block_h : 1; - - pass_record(p, gl_sc_dispatch_compute(p->sc, num_x, num_y, 1)); - - memset(&p->pass_tex, 0, sizeof(p->pass_tex)); - p->pass_tex_num = 0; -} - -static struct mp_pass_perf render_pass_quad(struct gl_video *p, - struct fbodst target, - const struct mp_rect *dst) -{ - struct vertex va[6] = {0}; - - struct gl_transform t; - gl_transform_ortho_fbodst(&t, target); - - float x[2] = {dst->x0, dst->x1}; - float y[2] = {dst->y0, dst->y1}; - gl_transform_vec(t, &x[0], &y[0]); - gl_transform_vec(t, &x[1], &y[1]); - - for (int n = 0; n < 4; n++) { - struct vertex *v = &va[n]; - v->position.x = x[n / 2]; - v->position.y = y[n % 2]; - for (int i = 0; i < p->pass_tex_num; i++) { - struct img_tex *s = &p->pass_tex[i]; - if (!s->tex) - continue; - struct gl_transform tr = s->transform; - float tx = (n / 2) * s->w; - float ty = (n % 2) * s->h; - gl_transform_vec(tr, &tx, &ty); - bool rect = s->tex->params.non_normalized; - v->texcoord[i].x = tx / (rect ? 1 : s->tex->params.w); - v->texcoord[i].y = ty / (rect ? 
1 : s->tex->params.h); - } - } - - va[4] = va[2]; - va[5] = va[1]; - - return gl_sc_dispatch_draw(p->sc, target.tex, va, 6); -} - -static void finish_pass_direct(struct gl_video *p, struct fbodst target, - const struct mp_rect *dst) -{ - pass_prepare_src_tex(p); - gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); - pass_record(p, render_pass_quad(p, target, dst)); - debug_check_gl(p, "after rendering"); - memset(&p->pass_tex, 0, sizeof(p->pass_tex)); - p->pass_tex_num = 0; -} - -// dst_fbo: this will be used for rendering; possibly reallocating the whole -// FBO, if the required parameters have changed -// w, h: required FBO target dimension, and also defines the target rectangle -// used for rasterization -// flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy -// flags allows the FBO to be larger than the w/h parameters) -static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, - int w, int h, int flags) -{ - fbotex_change(dst_fbo, p->ra, p->log, w, h, p->fbo_format, flags); - - if (p->pass_compute.active) { - if (!dst_fbo->tex) - return; - gl_sc_uniform_image2D_wo(p->sc, "out_image", dst_fbo->tex); - if (!p->pass_compute.directly_writes) - GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);) - - dispatch_compute(p, w, h, p->pass_compute); - p->pass_compute = (struct compute_info){0}; - - debug_check_gl(p, "after dispatching compute shader"); - } else { - finish_pass_direct(p, dst_fbo->fbo, &(struct mp_rect){0, 0, w, h}); - } -} - -static const char *get_tex_swizzle(struct img_tex *img) -{ - if (!img->tex) - return "rgba"; - return img->tex->params.format->luminance_alpha ? "raaa" : "rgba"; -} - -// Copy a texture to the vec4 color, while increasing offset. Also applies -// the texture multiplier to the sampled color -static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img) -{ - int count = img.components; - assert(*offset + count <= 4); - - int id = pass_bind(p, img); - char src[5] = {0}; - char dst[5] = {0}; - const char *tex_fmt = get_tex_swizzle(&img); - const char *dst_fmt = "rgba"; - for (int i = 0; i < count; i++) { - src[i] = tex_fmt[i]; - dst[i] = dst_fmt[*offset + i]; - } - - if (img.tex && img.tex->params.format->ctype == RA_CTYPE_UINT) { - uint64_t tex_max = 1ull << p->ra_format.component_bits; - img.multiplier *= 1.0 / (tex_max - 1); - } - - GLSLF("color.%s = %f * vec4(texture(texture%d, texcoord%d)).%s;\n", - dst, img.multiplier, id, id, src); - - *offset += count; -} - -static void skip_unused(struct gl_video *p, int num_components) -{ - for (int i = num_components; i < 4; i++) - GLSLF("color.%c = %f;\n", "rgba"[i], i < 3 ? 
0.0 : 1.0); -} - -static void uninit_scaler(struct gl_video *p, struct scaler *scaler) -{ - fbotex_uninit(&scaler->sep_fbo); - ra_tex_free(p->ra, &scaler->lut); - scaler->kernel = NULL; - scaler->initialized = false; -} - -static void hook_prelude(struct gl_video *p, const char *name, int id, - struct img_tex tex) -{ - GLSLHF("#define %s_raw texture%d\n", name, id); - GLSLHF("#define %s_pos texcoord%d\n", name, id); - GLSLHF("#define %s_size texture_size%d\n", name, id); - GLSLHF("#define %s_rot texture_rot%d\n", name, id); - GLSLHF("#define %s_pt pixel_size%d\n", name, id); - GLSLHF("#define %s_map texmap%d\n", name, id); - GLSLHF("#define %s_mul %f\n", name, tex.multiplier); - - // Set up the sampling functions - GLSLHF("#define %s_tex(pos) (%s_mul * vec4(texture(%s_raw, pos)).%s)\n", - name, name, name, get_tex_swizzle(&tex)); - - // Since the extra matrix multiplication impacts performance, - // skip it unless the texture was actually rotated - if (gl_transform_eq(tex.transform, identity_trans)) { - GLSLHF("#define %s_texOff(off) %s_tex(%s_pos + %s_pt * vec2(off))\n", - name, name, name, name); - } else { - GLSLHF("#define %s_texOff(off) " - "%s_tex(%s_pos + %s_rot * vec2(off)/%s_size)\n", - name, name, name, name, name); - } -} - -static bool saved_tex_find(struct gl_video *p, const char *name, - struct img_tex *out) -{ - if (!name || !out) - return false; - - for (int i = 0; i < p->saved_tex_num; i++) { - if (strcmp(p->saved_tex[i].name, name) == 0) { - *out = p->saved_tex[i].tex; - return true; - } - } - - return false; -} - -static void saved_tex_store(struct gl_video *p, const char *name, - struct img_tex tex) -{ - assert(name); - - for (int i = 0; i < p->saved_tex_num; i++) { - if (strcmp(p->saved_tex[i].name, name) == 0) { - p->saved_tex[i].tex = tex; - return; - } - } - - assert(p->saved_tex_num < SHADER_MAX_SAVED); - p->saved_tex[p->saved_tex_num++] = (struct saved_tex) { - .name = name, - .tex = tex - }; -} - -static bool pass_hook_setup_binds(struct gl_video *p, const char *name, - struct img_tex tex, struct tex_hook *hook) -{ - for (int t = 0; t < TEXUNIT_VIDEO_NUM; t++) { - char *bind_name = (char *)hook->bind_tex[t]; - - if (!bind_name) - continue; - - // This is a special name that means "currently hooked texture" - if (strcmp(bind_name, "HOOKED") == 0) { - int id = pass_bind(p, tex); - hook_prelude(p, "HOOKED", id, tex); - hook_prelude(p, name, id, tex); - continue; - } - - // BIND can also be used to load user-defined textures, in which - // case we will directly load them as a uniform instead of - // generating the hook_prelude boilerplate - for (int u = 0; u < p->user_tex_num; u++) { - struct gl_user_shader_tex *utex = &p->user_textures[u]; - if (bstr_equals0(utex->name, bind_name)) { - gl_sc_uniform_texture(p->sc, bind_name, utex->tex); - goto next_bind; - } - } - - struct img_tex bind_tex; - if (!saved_tex_find(p, bind_name, &bind_tex)) { - // Clean up texture bindings and move on to the next hook - MP_DBG(p, "Skipping hook on %s due to no texture named %s.\n", - name, bind_name); - p->pass_tex_num -= t; - return false; - } - - hook_prelude(p, bind_name, pass_bind(p, bind_tex), bind_tex); - -next_bind: ; - } - - return true; -} - -// Process hooks for a plane, saving the result and returning a new img_tex -// If 'trans' is NULL, the shader is forbidden from transforming tex -static struct img_tex pass_hook(struct gl_video *p, const char *name, - struct img_tex tex, struct gl_transform *trans) -{ - if (!name) - return tex; - - saved_tex_store(p, name, tex); - - 
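/* To make the defines above concrete: for a hook texture named "LUMA" bound
 * to texture unit 3, with multiplier 1.0, an "rgba" swizzle and an unrotated
 * transform (hypothetical values), hook_prelude() emits roughly this GLSL:
 *
 *   #define LUMA_raw texture3
 *   #define LUMA_pos texcoord3
 *   #define LUMA_size texture_size3
 *   #define LUMA_rot texture_rot3
 *   #define LUMA_pt pixel_size3
 *   #define LUMA_map texmap3
 *   #define LUMA_mul 1.000000
 *   #define LUMA_tex(pos) (LUMA_mul * vec4(texture(LUMA_raw, pos)).rgba)
 *   #define LUMA_texOff(off) LUMA_tex(LUMA_pos + LUMA_pt * vec2(off))
 *
 * These LUMA_tex()/LUMA_texOff() macros are what hooked passes and user
 * shaders sample through. */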
MP_DBG(p, "Running hooks for %s\n", name); - for (int i = 0; i < p->tex_hook_num; i++) { - struct tex_hook *hook = &p->tex_hooks[i]; - - // Figure out if this pass hooks this texture - for (int h = 0; h < SHADER_MAX_HOOKS; h++) { - if (hook->hook_tex[h] && strcmp(hook->hook_tex[h], name) == 0) - goto found; - } - - continue; - -found: - // Check the hook's condition - if (hook->cond && !hook->cond(p, tex, hook->priv)) { - MP_DBG(p, "Skipping hook on %s due to condition.\n", name); - continue; - } - - if (!pass_hook_setup_binds(p, name, tex, hook)) - continue; - - // Run the actual hook. This generates a series of GLSL shader - // instructions sufficient for drawing the hook's output - struct gl_transform hook_off = identity_trans; - hook->hook(p, tex, &hook_off, hook->priv); - - int comps = hook->components ? hook->components : tex.components; - skip_unused(p, comps); - - // Compute the updated FBO dimensions and store the result - struct mp_rect_f sz = {0, 0, tex.w, tex.h}; - gl_transform_rect(hook_off, &sz); - int w = lroundf(fabs(sz.x1 - sz.x0)); - int h = lroundf(fabs(sz.y1 - sz.y0)); - - assert(p->hook_fbo_num < SHADER_MAX_SAVED); - struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; - finish_pass_fbo(p, fbo, w, h, 0); - - const char *store_name = hook->save_tex ? hook->save_tex : name; - struct img_tex saved_tex = img_tex_fbo(fbo, tex.type, comps); - - // If the texture we're saving overwrites the "current" texture, also - // update the tex parameter so that the future loop cycles will use the - // updated values, and export the offset - if (strcmp(store_name, name) == 0) { - if (!trans && !gl_transform_eq(hook_off, identity_trans)) { - MP_ERR(p, "Hook tried changing size of unscalable texture %s!\n", - name); - return tex; - } - - tex = saved_tex; - if (trans) - gl_transform_trans(hook_off, trans); - } - - saved_tex_store(p, store_name, saved_tex); - } - - return tex; -} - -// This can be used at any time in the middle of rendering to specify an -// optional hook point, which if triggered will render out to a new FBO and -// load the result back into vec4 color. 
Offsets applied by the hooks are -// accumulated in tex_trans, and the FBO is dimensioned according -// to p->texture_w/h -static void pass_opt_hook_point(struct gl_video *p, const char *name, - struct gl_transform *tex_trans) -{ - if (!name) - return; - - for (int i = 0; i < p->tex_hook_num; i++) { - struct tex_hook *hook = &p->tex_hooks[i]; - - for (int h = 0; h < SHADER_MAX_HOOKS; h++) { - if (hook->hook_tex[h] && strcmp(hook->hook_tex[h], name) == 0) - goto found; - } - - for (int b = 0; b < TEXUNIT_VIDEO_NUM; b++) { - if (hook->bind_tex[b] && strcmp(hook->bind_tex[b], name) == 0) - goto found; - } - } - - // Nothing uses this texture, don't bother storing it - return; - -found: - assert(p->hook_fbo_num < SHADER_MAX_SAVED); - struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; - finish_pass_fbo(p, fbo, p->texture_w, p->texture_h, 0); - - struct img_tex img = img_tex_fbo(fbo, PLANE_RGB, p->components); - img = pass_hook(p, name, img, tex_trans); - copy_img_tex(p, &(int){0}, img); - p->texture_w = img.w; - p->texture_h = img.h; - p->components = img.components; - pass_describe(p, "(remainder pass)"); -} - -static void load_shader(struct gl_video *p, struct bstr body) -{ - gl_sc_hadd_bstr(p->sc, body); - gl_sc_uniform_f(p->sc, "random", (double)av_lfg_get(&p->lfg) / UINT32_MAX); - gl_sc_uniform_i(p->sc, "frame", p->frames_uploaded); - gl_sc_uniform_vec2(p->sc, "input_size", - (float[]){(p->src_rect.x1 - p->src_rect.x0) * - p->texture_offset.m[0][0], - (p->src_rect.y1 - p->src_rect.y0) * - p->texture_offset.m[1][1]}); - gl_sc_uniform_vec2(p->sc, "target_size", - (float[]){p->dst_rect.x1 - p->dst_rect.x0, - p->dst_rect.y1 - p->dst_rect.y0}); - gl_sc_uniform_vec2(p->sc, "tex_offset", - (float[]){p->src_rect.x0 * p->texture_offset.m[0][0] + - p->texture_offset.t[0], - p->src_rect.y0 * p->texture_offset.m[1][1] + - p->texture_offset.t[1]}); -} - -// Semantic equality -static bool double_seq(double a, double b) -{ - return (isnan(a) && isnan(b)) || a == b; -} - -static bool scaler_fun_eq(struct scaler_fun a, struct scaler_fun b) -{ - if ((a.name && !b.name) || (b.name && !a.name)) - return false; - - return ((!a.name && !b.name) || strcmp(a.name, b.name) == 0) && - double_seq(a.params[0], b.params[0]) && - double_seq(a.params[1], b.params[1]) && - a.blur == b.blur && - a.taper == b.taper; -} - -static bool scaler_conf_eq(struct scaler_config a, struct scaler_config b) -{ - // Note: antiring isn't compared because it doesn't affect LUT - // generation - return scaler_fun_eq(a.kernel, b.kernel) && - scaler_fun_eq(a.window, b.window) && - a.radius == b.radius && - a.clamp == b.clamp; -} - -static void reinit_scaler(struct gl_video *p, struct scaler *scaler, - const struct scaler_config *conf, - double scale_factor, - int sizes[]) -{ - if (scaler_conf_eq(scaler->conf, *conf) && - scaler->scale_factor == scale_factor && - scaler->initialized) - return; - - uninit_scaler(p, scaler); - - scaler->conf = *conf; - bool is_tscale = scaler->index == SCALER_TSCALE; - scaler->conf.kernel.name = (char *)handle_scaler_opt(conf->kernel.name, is_tscale); - scaler->conf.window.name = (char *)handle_scaler_opt(conf->window.name, is_tscale); - scaler->scale_factor = scale_factor; - scaler->insufficient = false; - scaler->initialized = true; - - const struct filter_kernel *t_kernel = mp_find_filter_kernel(conf->kernel.name); - if (!t_kernel) - return; - - scaler->kernel_storage = *t_kernel; - scaler->kernel = &scaler->kernel_storage; - - const char *win = conf->window.name; - if (!win || !win[0]) - win = 
t_kernel->window; // fall back to the scaler's default window - const struct filter_window *t_window = mp_find_filter_window(win); - if (t_window) - scaler->kernel->w = *t_window; - - for (int n = 0; n < 2; n++) { - if (!isnan(conf->kernel.params[n])) - scaler->kernel->f.params[n] = conf->kernel.params[n]; - if (!isnan(conf->window.params[n])) - scaler->kernel->w.params[n] = conf->window.params[n]; - } - - if (conf->kernel.blur > 0.0) - scaler->kernel->f.blur = conf->kernel.blur; - if (conf->window.blur > 0.0) - scaler->kernel->w.blur = conf->window.blur; - - if (conf->kernel.taper > 0.0) - scaler->kernel->f.taper = conf->kernel.taper; - if (conf->window.taper > 0.0) - scaler->kernel->w.taper = conf->window.taper; - - if (scaler->kernel->f.resizable && conf->radius > 0.0) - scaler->kernel->f.radius = conf->radius; - - scaler->kernel->clamp = conf->clamp; - scaler->kernel->value_cutoff = conf->cutoff; - - scaler->insufficient = !mp_init_filter(scaler->kernel, sizes, scale_factor); - - int size = scaler->kernel->size; - int num_components = size > 2 ? 4 : size; - const struct ra_format *fmt = ra_find_float16_format(p->ra, num_components); - assert(fmt); - - int width = (size + num_components - 1) / num_components; // round up - int stride = width * num_components; - assert(size <= stride); - - scaler->lut_size = 1 << p->opts.scaler_lut_size; - - float *weights = talloc_array(NULL, float, scaler->lut_size * stride); - mp_compute_lut(scaler->kernel, scaler->lut_size, stride, weights); - - bool use_1d = scaler->kernel->polar && (p->ra->caps & RA_CAP_TEX_1D); - - struct ra_tex_params lut_params = { - .dimensions = use_1d ? 1 : 2, - .w = use_1d ? scaler->lut_size : width, - .h = use_1d ? 1 : scaler->lut_size, - .d = 1, - .format = fmt, - .render_src = true, - .src_linear = true, - .initial_data = weights, - }; - scaler->lut = ra_tex_create(p->ra, &lut_params); - - talloc_free(weights); - - debug_check_gl(p, "after initializing scaler"); -} - -// Special helper for sampling from two separated stages -static void pass_sample_separated(struct gl_video *p, struct img_tex src, - struct scaler *scaler, int w, int h) -{ - // Separate the transformation into x and y components, per pass - struct gl_transform t_x = { - .m = {{src.transform.m[0][0], 0.0}, {src.transform.m[1][0], 1.0}}, - .t = {src.transform.t[0], 0.0}, - }; - struct gl_transform t_y = { - .m = {{1.0, src.transform.m[0][1]}, {0.0, src.transform.m[1][1]}}, - .t = {0.0, src.transform.t[1]}, - }; - - // First pass (scale only in the y dir) - src.transform = t_y; - sampler_prelude(p->sc, pass_bind(p, src)); - GLSLF("// first pass\n"); - pass_sample_separated_gen(p->sc, scaler, 0, 1); - GLSLF("color *= %f;\n", src.multiplier); - finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H); - - // Second pass (scale only in the x dir) - src = img_tex_fbo(&scaler->sep_fbo, src.type, src.components); - src.transform = t_x; - pass_describe(p, "%s second pass", scaler->conf.kernel.name); - sampler_prelude(p->sc, pass_bind(p, src)); - pass_sample_separated_gen(p->sc, scaler, 1, 0); -} - -// Picks either the compute shader version or the regular sampler version -// depending on hardware support -static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler, - struct img_tex tex, int w, int h) -{ - uint64_t reqs = RA_CAP_COMPUTE | RA_CAP_NESTED_ARRAY; - if ((p->ra->caps & reqs) != reqs) - goto fallback; - - int bound = ceil(scaler->kernel->radius_cutoff); - int offset = bound - 1; // padding top/left - int padding = offset + bound; 
// total padding - - float ratiox = (float)w / tex.w, - ratioy = (float)h / tex.h; - - // For performance we want to load at least as many pixels - // horizontally as there are threads in a warp (32 for nvidia), as - // well as enough to take advantage of shmem parallelism - const int warp_size = 32, threads = 256; - int bw = warp_size; - int bh = threads / bw; - - // We need to sample everything from base_min to base_max, so make sure - // we have enough room in shmem - int iw = (int)ceil(bw / ratiox) + padding + 1, - ih = (int)ceil(bh / ratioy) + padding + 1; - - int shmem_req = iw * ih * tex.components * sizeof(float); - if (shmem_req > p->ra->max_shmem) - goto fallback; - - pass_is_compute(p, bw, bh); - pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih); - return; - -fallback: - // Fall back to regular polar shader when compute shaders are unsupported - // or the kernel is too big for shmem - pass_sample_polar(p->sc, scaler, tex.components, p->ra->glsl_version); -} - -// Sample from img_tex, with the src rectangle given by it. -// The dst rectangle is implicit by what the caller will do next, but w and h -// must still be what is going to be used (to dimension FBOs correctly). -// This will write the scaled contents to the vec4 "color". -// The scaler unit is initialized by this function; in order to avoid cache -// thrashing, the scaler unit should usually use the same parameters. -static void pass_sample(struct gl_video *p, struct img_tex tex, - struct scaler *scaler, const struct scaler_config *conf, - double scale_factor, int w, int h) -{ - reinit_scaler(p, scaler, conf, scale_factor, filter_sizes); - - // Describe scaler - const char *scaler_opt[] = { - [SCALER_SCALE] = "scale", - [SCALER_DSCALE] = "dscale", - [SCALER_CSCALE] = "cscale", - [SCALER_TSCALE] = "tscale", - }; - - pass_describe(p, "%s=%s (%s)", scaler_opt[scaler->index], - scaler->conf.kernel.name, plane_names[tex.type]); - - bool is_separated = scaler->kernel && !scaler->kernel->polar; - - // Set up the transformation+prelude and bind the texture, for everything - // other than separated scaling (which does this in the subfunction) - if (!is_separated) - sampler_prelude(p->sc, pass_bind(p, tex)); - - // Dispatch the scaler. They're all wildly different. - const char *name = scaler->conf.kernel.name; - if (strcmp(name, "bilinear") == 0) { - GLSL(color = texture(tex, pos);) - } else if (strcmp(name, "bicubic_fast") == 0) { - pass_sample_bicubic_fast(p->sc); - } else if (strcmp(name, "oversample") == 0) { - pass_sample_oversample(p->sc, scaler, w, h); - } else if (scaler->kernel && scaler->kernel->polar) { - pass_dispatch_sample_polar(p, scaler, tex, w, h); - } else if (scaler->kernel) { - pass_sample_separated(p, tex, scaler, w, h); - } else { - // Should never happen - abort(); - } - - // Apply any required multipliers. 
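/* Rough numbers for the shared-memory check in pass_dispatch_sample_polar()
 * above (illustrative, assuming a polar kernel with radius_cutoff == 3 and
 * no scaling, i.e. ratiox == ratioy == 1): bound = 3, padding = 5, bw = 32,
 * bh = 256 / 32 = 8, hence iw = 32 + 5 + 1 = 38 and ih = 8 + 5 + 1 = 14.
 * With 4 components that is 38 * 14 * 4 * 4 = 8512 bytes of shmem, well
 * below the 32 KiB minimum that GL 4.3 guarantees, so such kernels normally
 * take the compute path instead of the fallback. */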
Separated scaling already does this in - // its first stage - if (!is_separated) - GLSLF("color *= %f;\n", tex.multiplier); - - // Micro-optimization: Avoid scaling unneeded channels - skip_unused(p, tex.components); -} - -// Returns true if two img_texs are semantically equivalent (same metadata) -static bool img_tex_equiv(struct img_tex a, struct img_tex b) -{ - return a.type == b.type && - a.components == b.components && - a.multiplier == b.multiplier && - a.tex->params.format == b.tex->params.format && - a.tex->params.w == b.tex->params.w && - a.tex->params.h == b.tex->params.h && - a.w == b.w && - a.h == b.h && - gl_transform_eq(a.transform, b.transform); -} - -static bool add_hook(struct gl_video *p, struct tex_hook hook) -{ - if (p->tex_hook_num < SHADER_MAX_PASSES) { - p->tex_hooks[p->tex_hook_num++] = hook; - return true; - } else { - MP_ERR(p, "Too many passes! Limit is %d.\n", SHADER_MAX_PASSES); - talloc_free(hook.priv); - return false; - } -} - -static void deband_hook(struct gl_video *p, struct img_tex tex, - struct gl_transform *trans, void *priv) -{ - pass_describe(p, "debanding (%s)", plane_names[tex.type]); - pass_sample_deband(p->sc, p->opts.deband_opts, &p->lfg, - p->image_params.color.gamma); -} - -static void unsharp_hook(struct gl_video *p, struct img_tex tex, - struct gl_transform *trans, void *priv) -{ - pass_describe(p, "unsharp masking"); - pass_sample_unsharp(p->sc, p->opts.unsharp); -} - -struct szexp_ctx { - struct gl_video *p; - struct img_tex tex; -}; - -static bool szexp_lookup(void *priv, struct bstr var, float size[2]) -{ - struct szexp_ctx *ctx = priv; - struct gl_video *p = ctx->p; - - if (bstr_equals0(var, "NATIVE_CROPPED")) { - size[0] = (p->src_rect.x1 - p->src_rect.x0) * p->texture_offset.m[0][0]; - size[1] = (p->src_rect.y1 - p->src_rect.y0) * p->texture_offset.m[1][1]; - return true; - } - - // The size of OUTPUT is determined. It could be useful for certain - // user shaders to skip passes. 
- if (bstr_equals0(var, "OUTPUT")) { - size[0] = p->dst_rect.x1 - p->dst_rect.x0; - size[1] = p->dst_rect.y1 - p->dst_rect.y0; - return true; - } - - // HOOKED is a special case - if (bstr_equals0(var, "HOOKED")) { - size[0] = ctx->tex.w; - size[1] = ctx->tex.h; - return true; - } - - for (int o = 0; o < p->saved_tex_num; o++) { - if (bstr_equals0(var, p->saved_tex[o].name)) { - size[0] = p->saved_tex[o].tex.w; - size[1] = p->saved_tex[o].tex.h; - return true; - } - } - - return false; -} - -static bool user_hook_cond(struct gl_video *p, struct img_tex tex, void *priv) -{ - struct gl_user_shader_hook *shader = priv; - assert(shader); - - float res = false; - eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->cond, &res); - return res; -} - -static void user_hook(struct gl_video *p, struct img_tex tex, - struct gl_transform *trans, void *priv) -{ - struct gl_user_shader_hook *shader = priv; - assert(shader); - load_shader(p, shader->pass_body); - - pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc), - plane_names[tex.type]); - - if (shader->compute.active) { - p->pass_compute = shader->compute; - GLSLF("hook();\n"); - } else { - GLSLF("color = hook();\n"); - } - - // Make sure we at least create a legal FBO on failure, since it's better - // to do this and display an error message than just crash OpenGL - float w = 1.0, h = 1.0; - - eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->width, &w); - eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->height, &h); - - *trans = (struct gl_transform){{{w / tex.w, 0}, {0, h / tex.h}}}; - gl_transform_trans(shader->offset, trans); -} - -static bool add_user_hook(void *priv, struct gl_user_shader_hook hook) -{ - struct gl_video *p = priv; - struct gl_user_shader_hook *copy = talloc_ptrtype(p, copy); - *copy = hook; - - struct tex_hook texhook = { - .save_tex = bstrdup0(copy, hook.save_tex), - .components = hook.components, - .hook = user_hook, - .cond = user_hook_cond, - .priv = copy, - }; - - for (int h = 0; h < SHADER_MAX_HOOKS; h++) - texhook.hook_tex[h] = bstrdup0(copy, hook.hook_tex[h]); - for (int h = 0; h < SHADER_MAX_BINDS; h++) - texhook.bind_tex[h] = bstrdup0(copy, hook.bind_tex[h]); - - return add_hook(p, texhook); -} - -static bool add_user_tex(void *priv, struct gl_user_shader_tex tex) -{ - struct gl_video *p = priv; - - if (p->user_tex_num == SHADER_MAX_PASSES) { - MP_ERR(p, "Too many textures! 
Limit is %d.\n", SHADER_MAX_PASSES); - goto err; - } - - tex.tex = ra_tex_create(p->ra, &tex.params); - TA_FREEP(&tex.params.initial_data); - - p->user_textures[p->user_tex_num++] = tex; - return true; - -err: - talloc_free(tex.params.initial_data); - return false; -} - -static void load_user_shaders(struct gl_video *p, char **shaders) -{ - if (!shaders) - return; - - for (int n = 0; shaders[n] != NULL; n++) { - struct bstr file = load_cached_file(p, shaders[n]); - parse_user_shader(p->log, p->ra, file, p, add_user_hook, add_user_tex); - } -} - -static void gl_video_setup_hooks(struct gl_video *p) -{ - gl_video_reset_hooks(p); - - if (p->opts.deband) { - add_hook(p, (struct tex_hook) { - .hook_tex = {"LUMA", "CHROMA", "RGB", "XYZ"}, - .bind_tex = {"HOOKED"}, - .hook = deband_hook, - }); - } - - if (p->opts.unsharp != 0.0) { - add_hook(p, (struct tex_hook) { - .hook_tex = {"MAIN"}, - .bind_tex = {"HOOKED"}, - .hook = unsharp_hook, - }); - } - - load_user_shaders(p, p->opts.user_shaders); -} - -// sample from video textures, set "color" variable to yuv value -static void pass_read_video(struct gl_video *p) -{ - struct img_tex tex[4]; - struct gl_transform offsets[4]; - pass_get_img_tex(p, &p->image, tex, offsets); - - // To keep the code as simple as possibly, we currently run all shader - // stages even if they would be unnecessary (e.g. no hooks for a texture). - // In the future, deferred img_tex should optimize this away. - - // Merge semantically identical textures. This loop is done from back - // to front so that merged textures end up in the right order while - // simultaneously allowing us to skip unnecessary merges - for (int n = 3; n >= 0; n--) { - if (tex[n].type == PLANE_NONE) - continue; - - int first = n; - int num = 0; - - for (int i = 0; i < n; i++) { - if (img_tex_equiv(tex[n], tex[i]) && - gl_transform_eq(offsets[n], offsets[i])) - { - GLSLF("// merging plane %d ...\n", i); - copy_img_tex(p, &num, tex[i]); - first = MPMIN(first, i); - tex[i] = (struct img_tex){0}; - } - } - - if (num > 0) { - GLSLF("// merging plane %d ... into %d\n", n, first); - copy_img_tex(p, &num, tex[n]); - pass_describe(p, "merging planes"); - finish_pass_fbo(p, &p->merge_fbo[n], tex[n].w, tex[n].h, 0); - tex[first] = img_tex_fbo(&p->merge_fbo[n], tex[n].type, num); - tex[n] = (struct img_tex){0}; - } - } - - // If any textures are still in integer format by this point, we need - // to introduce an explicit conversion pass to avoid breaking hooks/scaling - for (int n = 0; n < 4; n++) { - if (tex[n].tex && tex[n].tex->params.format->ctype == RA_CTYPE_UINT) { - GLSLF("// use_integer fix for plane %d\n", n); - copy_img_tex(p, &(int){0}, tex[n]); - pass_describe(p, "use_integer fix"); - finish_pass_fbo(p, &p->integer_fbo[n], tex[n].w, tex[n].h, 0); - tex[n] = img_tex_fbo(&p->integer_fbo[n], tex[n].type, - tex[n].components); - } - } - - // Dispatch the hooks for all of these textures, saving and perhaps - // modifying them in the process - for (int n = 0; n < 4; n++) { - const char *name; - switch (tex[n].type) { - case PLANE_RGB: name = "RGB"; break; - case PLANE_LUMA: name = "LUMA"; break; - case PLANE_CHROMA: name = "CHROMA"; break; - case PLANE_ALPHA: name = "ALPHA"; break; - case PLANE_XYZ: name = "XYZ"; break; - default: continue; - } - - tex[n] = pass_hook(p, name, tex[n], &offsets[n]); - } - - // At this point all planes are finalized but they may not be at the - // required size yet. Furthermore, they may have texture offsets that - // require realignment. 
For lack of something better to do, we assume - // the rgb/luma texture is the "reference" and scale everything else - // to match. - for (int n = 0; n < 4; n++) { - switch (tex[n].type) { - case PLANE_RGB: - case PLANE_XYZ: - case PLANE_LUMA: break; - default: continue; - } - - p->texture_w = tex[n].w; - p->texture_h = tex[n].h; - p->texture_offset = offsets[n]; - break; - } - - // Compute the reference rect - struct mp_rect_f src = {0.0, 0.0, p->image_params.w, p->image_params.h}; - struct mp_rect_f ref = src; - gl_transform_rect(p->texture_offset, &ref); - MP_DBG(p, "ref rect: {%f %f} {%f %f}\n", ref.x0, ref.y0, ref.x1, ref.y1); - - // Explicitly scale all of the textures that don't match - for (int n = 0; n < 4; n++) { - if (tex[n].type == PLANE_NONE) - continue; - - // If the planes are aligned identically, we will end up with the - // exact same source rectangle. - struct mp_rect_f rect = src; - gl_transform_rect(offsets[n], &rect); - MP_DBG(p, "rect[%d]: {%f %f} {%f %f}\n", n, - rect.x0, rect.y0, rect.x1, rect.y1); - - if (mp_rect_f_seq(ref, rect)) - continue; - - // If the rectangles differ, then our planes have a different - // alignment and/or size. First of all, we have to compute the - // corrections required to meet the target rectangle - struct gl_transform fix = { - .m = {{(ref.x1 - ref.x0) / (rect.x1 - rect.x0), 0.0}, - {0.0, (ref.y1 - ref.y0) / (rect.y1 - rect.y0)}}, - .t = {ref.x0, ref.y0}, - }; - MP_DBG(p, "-> fix[%d] = {%f %f} + off {%f %f}\n", n, - fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); - - // Since the scale in texture space is different from the scale in - // absolute terms, we have to scale the coefficients down to be - // relative to the texture's physical dimensions and local offset - struct gl_transform scale = { - .m = {{(float)tex[n].w / p->texture_w, 0.0}, - {0.0, (float)tex[n].h / p->texture_h}}, - .t = {-rect.x0, -rect.y0}, - }; - if (p->image_params.rotate % 180 == 90) - MPSWAP(double, scale.m[0][0], scale.m[1][1]); - - gl_transform_trans(scale, &fix); - MP_DBG(p, "-> scaled[%d] = {%f %f} + off {%f %f}\n", n, - fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); - - // Since the texture transform is a function of the texture coordinates - // to texture space, rather than the other way around, we have to - // actually apply the *inverse* of this. Fortunately, calculating - // the inverse is relatively easy here. 
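A minimal editor sketch of that inversion, not code from this file (the struct and function names below are invented for illustration): per axis the transform is the diagonal affine map y = m*x + t, so its inverse is x = (1/m)*y + (1/m)*(-t), which is exactly what the next four assignments compute in place.

struct axis_map { float m, t; };                     /* y = m*x + t */

static struct axis_map axis_map_invert(struct axis_map f)
{
    float inv_m = 1.0f / f.m;                        /* diagonal entry of the inverse */
    return (struct axis_map){ inv_m, inv_m * -f.t }; /* x = y/m - t/m */
}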
- fix.m[0][0] = 1.0 / fix.m[0][0]; - fix.m[1][1] = 1.0 / fix.m[1][1]; - fix.t[0] = fix.m[0][0] * -fix.t[0]; - fix.t[1] = fix.m[1][1] * -fix.t[1]; - gl_transform_trans(fix, &tex[n].transform); - - int scaler_id = -1; - const char *name = NULL; - switch (tex[n].type) { - case PLANE_RGB: - case PLANE_LUMA: - case PLANE_XYZ: - scaler_id = SCALER_SCALE; - // these aren't worth hooking, fringe hypothetical cases only - break; - case PLANE_CHROMA: - scaler_id = SCALER_CSCALE; - name = "CHROMA_SCALED"; - break; - case PLANE_ALPHA: - // alpha always uses bilinear - name = "ALPHA_SCALED"; - } - - if (scaler_id < 0) - continue; - - const struct scaler_config *conf = &p->opts.scaler[scaler_id]; - struct scaler *scaler = &p->scaler[scaler_id]; - - // bilinear scaling is a free no-op thanks to GPU sampling - if (strcmp(conf->kernel.name, "bilinear") != 0) { - GLSLF("// upscaling plane %d\n", n); - pass_sample(p, tex[n], scaler, conf, 1.0, p->texture_w, p->texture_h); - finish_pass_fbo(p, &p->scale_fbo[n], p->texture_w, p->texture_h, 0); - tex[n] = img_tex_fbo(&p->scale_fbo[n], tex[n].type, tex[n].components); - } - - // Run any post-scaling hooks - tex[n] = pass_hook(p, name, tex[n], NULL); - } - - // All planes are of the same size and properly aligned at this point - GLSLF("// combining planes\n"); - int coord = 0; - for (int i = 0; i < 4; i++) { - if (tex[i].type != PLANE_NONE) - copy_img_tex(p, &coord, tex[i]); - } - p->components = coord; -} - -// Utility function that simply binds an FBO and reads from it, without any -// transformations. -static void pass_read_fbo(struct gl_video *p, struct fbotex *fbo) -{ - struct img_tex tex = img_tex_fbo(fbo, PLANE_RGB, p->components); - copy_img_tex(p, &(int){0}, tex); -} - -// yuv conversion, and any other conversions before main up/down-scaling -static void pass_convert_yuv(struct gl_video *p) -{ - struct gl_shader_cache *sc = p->sc; - - struct mp_csp_params cparams = MP_CSP_PARAMS_DEFAULTS; - cparams.gray = p->is_gray; - mp_csp_set_image_params(&cparams, &p->image_params); - mp_csp_equalizer_state_get(p->video_eq, &cparams); - p->user_gamma = 1.0 / (cparams.gamma * p->opts.gamma); - - pass_describe(p, "color conversion"); - - if (p->color_swizzle[0]) - GLSLF("color = color.%s;\n", p->color_swizzle); - - // Pre-colormatrix input gamma correction - if (cparams.color.space == MP_CSP_XYZ) - GLSL(color.rgb = pow(color.rgb, vec3(2.6));) // linear light - - // We always explicitly normalize the range in pass_read_video - cparams.input_bits = cparams.texture_bits = 0; - - // Conversion to RGB. For RGB itself, this still applies e.g. brightness - // and contrast controls, or expansion of e.g. LSB-packed 10 bit data. - struct mp_cmat m = {{{0}}}; - mp_get_csp_matrix(&cparams, &m); - gl_sc_uniform_mat3(sc, "colormatrix", true, &m.m[0][0]); - gl_sc_uniform_vec3(sc, "colormatrix_c", m.c); - - GLSL(color.rgb = mat3(colormatrix) * color.rgb + colormatrix_c;) - - if (p->image_params.color.space == MP_CSP_BT_2020_C) { - // Conversion for C'rcY'cC'bc via the BT.2020 CL system: - // C'bc = (B'-Y'c) / 1.9404 | C'bc <= 0 - // = (B'-Y'c) / 1.5816 | C'bc > 0 - // - // C'rc = (R'-Y'c) / 1.7184 | C'rc <= 0 - // = (R'-Y'c) / 0.9936 | C'rc > 0 - // - // as per the BT.2020 specification, table 4. This is a non-linear - // transformation because (constant) luminance receives non-equal - // contributions from the three different channels. 
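As a scalar cross-check of the GLSL that follows, here is an editor-added sketch (the helper name and signature are illustrative, not from this file): it inverts the C'bc/C'rc definitions quoted above by picking the multiplier from the sign of the transmitted chroma sample and adding Y'c back.

#include <stdbool.h>

/* Recover B' (is_b == true) or R' (is_b == false) from BT.2020 CL chroma + Y'c. */
static float bt2020cl_expand(float chroma, float luma_c, bool is_b)
{
    float scale = is_b ? (chroma <= 0 ? 1.9404f : 1.5816f)
                       : (chroma <= 0 ? 1.7184f : 0.9936f);
    return chroma * scale + luma_c;
}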
- GLSLF("// constant luminance conversion\n"); - GLSL(color.br = color.br * mix(vec2(1.5816, 0.9936), - vec2(1.9404, 1.7184), - lessThanEqual(color.br, vec2(0))) - + color.gg;) - // Expand channels to camera-linear light. This shader currently just - // assumes everything uses the BT.2020 12-bit gamma function, since the - // difference between 10 and 12-bit is negligible for anything other - // than 12-bit content. - GLSL(color.rgb = mix(color.rgb * vec3(1.0/4.5), - pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), - vec3(1.0/0.45)), - lessThanEqual(vec3(0.08145), color.rgb));) - // Calculate the green channel from the expanded RYcB - // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B - GLSL(color.g = (color.g - 0.2627*color.r - 0.0593*color.b)*1.0/0.6780;) - // Recompress to receive the R'G'B' result, same as other systems - GLSL(color.rgb = mix(color.rgb * vec3(4.5), - vec3(1.0993) * pow(color.rgb, vec3(0.45)) - vec3(0.0993), - lessThanEqual(vec3(0.0181), color.rgb));) - } - - p->components = 3; - if (!p->has_alpha || p->opts.alpha_mode == ALPHA_NO) { - GLSL(color.a = 1.0;) - } else { // alpha present in image - p->components = 4; - GLSL(color = vec4(color.rgb * color.a, color.a);) - } -} - -static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]) -{ - double target_w = p->src_rect.x1 - p->src_rect.x0; - double target_h = p->src_rect.y1 - p->src_rect.y0; - if (transpose_rot && p->image_params.rotate % 180 == 90) - MPSWAP(double, target_w, target_h); - xy[0] = (p->dst_rect.x1 - p->dst_rect.x0) / target_w; - xy[1] = (p->dst_rect.y1 - p->dst_rect.y0) / target_h; -} - -// Cropping. -static void compute_src_transform(struct gl_video *p, struct gl_transform *tr) -{ - float sx = (p->src_rect.x1 - p->src_rect.x0) / (float)p->texture_w, - sy = (p->src_rect.y1 - p->src_rect.y0) / (float)p->texture_h, - ox = p->src_rect.x0, - oy = p->src_rect.y0; - struct gl_transform transform = {{{sx, 0}, {0, sy}}, {ox, oy}}; - - gl_transform_trans(p->texture_offset, &transform); - - *tr = transform; -} - -// Takes care of the main scaling and pre/post-conversions -static void pass_scale_main(struct gl_video *p) -{ - // Figure out the main scaler. - double xy[2]; - get_scale_factors(p, true, xy); - - // actual scale factor should be divided by the scale factor of prescaling. 
- xy[0] /= p->texture_offset.m[0][0]; - xy[1] /= p->texture_offset.m[1][1]; - - bool downscaling = xy[0] < 1.0 || xy[1] < 1.0; - bool upscaling = !downscaling && (xy[0] > 1.0 || xy[1] > 1.0); - double scale_factor = 1.0; - - struct scaler *scaler = &p->scaler[SCALER_SCALE]; - struct scaler_config scaler_conf = p->opts.scaler[SCALER_SCALE]; - if (p->opts.scaler_resizes_only && !downscaling && !upscaling) { - scaler_conf.kernel.name = "bilinear"; - // For scaler-resizes-only, we round the texture offset to - // the nearest round value in order to prevent ugly blurriness - // (in exchange for slightly shifting the image by up to half a - // subpixel) - p->texture_offset.t[0] = roundf(p->texture_offset.t[0]); - p->texture_offset.t[1] = roundf(p->texture_offset.t[1]); - } - if (downscaling && p->opts.scaler[SCALER_DSCALE].kernel.name) { - scaler_conf = p->opts.scaler[SCALER_DSCALE]; - scaler = &p->scaler[SCALER_DSCALE]; - } - - // When requesting correct-downscaling and the clip is anamorphic, and - // because only a single scale factor is used for both axes, enable it only - // when both axes are downscaled, and use the milder of the factors to not - // end up with too much blur on one axis (even if we end up with sub-optimal - // scale factor on the other axis). This is better than not respecting - // correct scaling at all for anamorphic clips. - double f = MPMAX(xy[0], xy[1]); - if (p->opts.correct_downscaling && f < 1.0) - scale_factor = 1.0 / f; - - // Pre-conversion, like linear light/sigmoidization - GLSLF("// scaler pre-conversion\n"); - bool use_linear = p->opts.linear_scaling || p->opts.sigmoid_upscaling; - - // Linear light downscaling results in nasty artifacts for HDR curves due - // to the potentially extreme brightness differences severely compounding - // any ringing. So just scale in gamma light instead. - if (mp_trc_is_hdr(p->image_params.color.gamma) && downscaling) - use_linear = false; - - if (use_linear) { - p->use_linear = true; - pass_linearize(p->sc, p->image_params.color.gamma); - pass_opt_hook_point(p, "LINEAR", NULL); - } - - bool use_sigmoid = use_linear && p->opts.sigmoid_upscaling && upscaling; - float sig_center, sig_slope, sig_offset, sig_scale; - if (use_sigmoid) { - // Coefficients for the sigmoidal transform are taken from the - // formula here: http://www.imagemagick.org/Usage/color_mods/#sigmoidal - sig_center = p->opts.sigmoid_center; - sig_slope = p->opts.sigmoid_slope; - // This function needs to go through (0,0) and (1,1) so we compute the - // values at 1 and 0, and then scale/shift them, respectively. - sig_offset = 1.0/(1+expf(sig_slope * sig_center)); - sig_scale = 1.0/(1+expf(sig_slope * (sig_center-1))) - sig_offset; - GLSLF("color.rgb = %f - log(1.0/(color.rgb * %f + %f) - 1.0) * 1.0/%f;\n", - sig_center, sig_scale, sig_offset, sig_slope); - pass_opt_hook_point(p, "SIGMOID", NULL); - } - - pass_opt_hook_point(p, "PREKERNEL", NULL); - - int vp_w = p->dst_rect.x1 - p->dst_rect.x0; - int vp_h = p->dst_rect.y1 - p->dst_rect.y0; - struct gl_transform transform; - compute_src_transform(p, &transform); - - GLSLF("// main scaling\n"); - finish_pass_fbo(p, &p->indirect_fbo, p->texture_w, p->texture_h, 0); - struct img_tex src = img_tex_fbo(&p->indirect_fbo, PLANE_RGB, p->components); - gl_transform_trans(transform, &src.transform); - pass_sample(p, src, scaler, &scaler_conf, scale_factor, vp_w, vp_h); - - // Changes the texture size to display size after main scaler. 
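An editor aside on the sigmoidal pre/post-conversion used earlier in this function (the standalone helper below is illustrative, not from this file): sig_offset and sig_scale are chosen so that the forward curve passes exactly through (0,0) and (1,1), which is easy to verify numerically.

#include <math.h>

static float sigmoidize(float x, float center, float slope)
{
    float offset = 1.0f / (1.0f + expf(slope * center));
    float scale  = 1.0f / (1.0f + expf(slope * (center - 1.0f))) - offset;
    return center - logf(1.0f / (x * scale + offset) - 1.0f) / slope;
}
/* sigmoidize(0.0f, c, s) == 0 and sigmoidize(1.0f, c, s) == 1 up to rounding;
 * the "scaler post-conversion" a few lines below applies the exact inverse. */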
- p->texture_w = vp_w; - p->texture_h = vp_h; - - pass_opt_hook_point(p, "POSTKERNEL", NULL); - - GLSLF("// scaler post-conversion\n"); - if (use_sigmoid) { - // Inverse of the transformation above - GLSLF("color.rgb = (1.0/(1.0 + exp(%f * (%f - color.rgb))) - %f) * 1.0/%f;\n", - sig_slope, sig_center, sig_offset, sig_scale); - } -} - -// Adapts the colors to the right output color space. (Final pass during -// rendering) -// If OSD is true, ignore any changes that may have been made to the video -// by previous passes (i.e. linear scaling) -static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool osd) -{ - struct ra *ra = p->ra; - - // Figure out the target color space from the options, or auto-guess if - // none were set - struct mp_colorspace dst = { - .gamma = p->opts.target_trc, - .primaries = p->opts.target_prim, - .light = MP_CSP_LIGHT_DISPLAY, - }; - - if (p->use_lut_3d) { - // The 3DLUT is always generated against the video's original source - // space, *not* the reference space. (To avoid having to regenerate - // the 3DLUT for the OSD on every frame) - enum mp_csp_prim prim_orig = p->image_params.color.primaries; - enum mp_csp_trc trc_orig = p->image_params.color.gamma; - - // One exception: HDR is not implemented by LittleCMS for technical - // limitation reasons, so we use a gamma 2.2 input curve here instead. - // We could pick any value we want here, the difference is just coding - // efficiency. - if (mp_trc_is_hdr(trc_orig)) - trc_orig = MP_CSP_TRC_GAMMA22; - - if (gl_video_get_lut3d(p, prim_orig, trc_orig)) { - dst.primaries = prim_orig; - dst.gamma = trc_orig; - } - } - - if (dst.primaries == MP_CSP_PRIM_AUTO) { - // The vast majority of people are on sRGB or BT.709 displays, so pick - // this as the default output color space. - dst.primaries = MP_CSP_PRIM_BT_709; - - if (src.primaries == MP_CSP_PRIM_BT_601_525 || - src.primaries == MP_CSP_PRIM_BT_601_625) - { - // Since we auto-pick BT.601 and BT.709 based on the dimensions, - // combined with the fact that they're very similar to begin with, - // and to avoid confusing the average user, just don't adapt BT.601 - // content automatically at all. - dst.primaries = src.primaries; - } - } - - if (dst.gamma == MP_CSP_TRC_AUTO) { - // Most people seem to complain when the image is darker or brighter - // than what they're "used to", so just avoid changing the gamma - // altogether by default. The only exceptions to this rule apply to - // very unusual TRCs, which even hardcode technoluddites would probably - // not enjoy viewing unaltered. - dst.gamma = src.gamma; - - // Avoid outputting linear light or HDR content "by default". 
For these - // just pick gamma 2.2 as a default, since it's a good estimate for - // the response of typical displays - if (dst.gamma == MP_CSP_TRC_LINEAR || mp_trc_is_hdr(dst.gamma)) - dst.gamma = MP_CSP_TRC_GAMMA22; - } - - bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma); - if (detect_peak && !p->hdr_peak_ssbo) { - struct { - unsigned int sig_peak_raw; - unsigned int index; - unsigned int frame_max[PEAK_DETECT_FRAMES+1]; - } peak_ssbo = {0}; - - // Prefill with safe values - int safe = MP_REF_WHITE * mp_trc_nom_peak(p->image_params.color.gamma); - peak_ssbo.sig_peak_raw = PEAK_DETECT_FRAMES * safe; - for (int i = 0; i < PEAK_DETECT_FRAMES+1; i++) - peak_ssbo.frame_max[i] = safe; - - struct ra_buf_params params = { - .type = RA_BUF_TYPE_SHADER_STORAGE, - .size = sizeof(peak_ssbo), - .initial_data = &peak_ssbo, - }; - - p->hdr_peak_ssbo = ra_buf_create(ra, ¶ms); - if (!p->hdr_peak_ssbo) { - MP_WARN(p, "Failed to create HDR peak detection SSBO, disabling.\n"); - detect_peak = (p->opts.compute_hdr_peak = false); - } - } - - if (detect_peak) { - pass_describe(p, "detect HDR peak"); - pass_is_compute(p, 8, 8); // 8x8 is good for performance - gl_sc_ssbo(p->sc, "PeakDetect", p->hdr_peak_ssbo, - "uint sig_peak_raw;" - "uint index;" - "uint frame_max[%d];", PEAK_DETECT_FRAMES + 1 - ); - } - - // Adapt from src to dst as necessary - pass_color_map(p->sc, src, dst, p->opts.tone_mapping, - p->opts.tone_mapping_param, p->opts.tone_mapping_desat, - detect_peak, p->opts.gamut_warning, p->use_linear && !osd); - - if (p->use_lut_3d) { - gl_sc_uniform_texture(p->sc, "lut_3d", p->lut_3d_texture); - GLSL(vec3 cpos;) - for (int i = 0; i < 3; i++) - GLSLF("cpos[%d] = LUT_POS(color[%d], %d.0);\n", i, i, p->lut_3d_size[i]); - GLSL(color.rgb = tex3D(lut_3d, cpos).rgb;) - } -} - -void gl_video_set_fb_depth(struct gl_video *p, int fb_depth) -{ - p->fb_depth = fb_depth; -} - -static void pass_dither(struct gl_video *p) -{ - // Assume 8 bits per component if unknown. - int dst_depth = p->fb_depth > 0 ? p->fb_depth : 8; - if (p->opts.dither_depth > 0) - dst_depth = p->opts.dither_depth; - - if (p->opts.dither_depth < 0 || p->opts.dither_algo == DITHER_NONE) - return; - - if (!p->dither_texture) { - MP_VERBOSE(p, "Dither to %d.\n", dst_depth); - - int tex_size = 0; - void *tex_data = NULL; - const struct ra_format *fmt = NULL; - void *temp = NULL; - - if (p->opts.dither_algo == DITHER_FRUIT) { - int sizeb = p->opts.dither_size; - int size = 1 << sizeb; - - if (p->last_dither_matrix_size != size) { - p->last_dither_matrix = talloc_realloc(p, p->last_dither_matrix, - float, size * size); - mp_make_fruit_dither_matrix(p->last_dither_matrix, sizeb); - p->last_dither_matrix_size = size; - } - - // Prefer R16 texture since they provide higher precision. - fmt = ra_find_unorm_format(p->ra, 2, 1); - if (!fmt) - fmt = ra_find_float16_format(p->ra, 1); - if (fmt) { - tex_size = size; - tex_data = p->last_dither_matrix; - if (fmt->ctype == RA_CTYPE_UNORM) { - uint16_t *t = temp = talloc_array(NULL, uint16_t, size * size); - for (int n = 0; n < size * size; n++) - t[n] = p->last_dither_matrix[n] * UINT16_MAX; - tex_data = t; - } - } else { - MP_VERBOSE(p, "GL too old. 
Falling back to ordered dither.\n"); - p->opts.dither_algo = DITHER_ORDERED; - } - } - - if (p->opts.dither_algo == DITHER_ORDERED) { - temp = talloc_array(NULL, char, 8 * 8); - mp_make_ordered_dither_matrix(temp, 8); - - fmt = ra_find_unorm_format(p->ra, 1, 1); - tex_size = 8; - tex_data = temp; - } - - struct ra_tex_params params = { - .dimensions = 2, - .w = tex_size, - .h = tex_size, - .d = 1, - .format = fmt, - .render_src = true, - .src_repeat = true, - .initial_data = tex_data, - }; - p->dither_texture = ra_tex_create(p->ra, ¶ms); - - debug_check_gl(p, "dither setup"); - - talloc_free(temp); - } - - GLSLF("// dithering\n"); - - // This defines how many bits are considered significant for output on - // screen. The superfluous bits will be used for rounding according to the - // dither matrix. The precision of the source implicitly decides how many - // dither patterns can be visible. - int dither_quantization = (1 << dst_depth) - 1; - int dither_size = p->dither_texture->params.w; - - gl_sc_uniform_texture(p->sc, "dither", p->dither_texture); - - GLSLF("vec2 dither_pos = gl_FragCoord.xy * 1.0/%d.0;\n", dither_size); - - if (p->opts.temporal_dither) { - int phase = (p->frames_rendered / p->opts.temporal_dither_period) % 8u; - float r = phase * (M_PI / 2); // rotate - float m = phase < 4 ? 1 : -1; // mirror - - float matrix[2][2] = {{cos(r), -sin(r) }, - {sin(r) * m, cos(r) * m}}; - gl_sc_uniform_mat2(p->sc, "dither_trafo", true, &matrix[0][0]); - - GLSL(dither_pos = dither_trafo * dither_pos;) - } - - GLSL(float dither_value = texture(dither, dither_pos).r;) - GLSLF("color = floor(color * %d.0 + dither_value + 0.5 / %d.0) * 1.0/%d.0;\n", - dither_quantization, dither_size * dither_size, dither_quantization); -} - -// Draws the OSD, in scene-referred colors.. If cms is true, subtitles are -// instead adapted to the display's gamut. -static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts, - struct mp_osd_res rect, struct fbodst target, bool cms) -{ - mpgl_osd_generate(p->osd, rect, pts, p->image_params.stereo_out, draw_flags); - - timer_pool_start(p->osd_timer); - for (int n = 0; n < MAX_OSD_PARTS; n++) { - // (This returns false if this part is empty with nothing to draw.) - if (!mpgl_osd_draw_prepare(p->osd, n, p->sc)) - continue; - // When subtitles need to be color managed, assume they're in sRGB - // (for lack of anything saner to do) - if (cms) { - static const struct mp_colorspace csp_srgb = { - .primaries = MP_CSP_PRIM_BT_709, - .gamma = MP_CSP_TRC_SRGB, - .light = MP_CSP_LIGHT_DISPLAY, - }; - - pass_colormanage(p, csp_srgb, true); - } - mpgl_osd_draw_finish(p->osd, n, p->sc, target); - } - - timer_pool_stop(p->osd_timer); - pass_describe(p, "drawing osd"); - pass_record(p, timer_pool_measure(p->osd_timer)); -} - -static float chroma_realign(int size, int pixel) -{ - return size / (float)chroma_upsize(size, pixel); -} - -// Minimal rendering code path, for GLES or OpenGL 2.1 without proper FBOs. -static void pass_render_frame_dumb(struct gl_video *p) -{ - struct img_tex tex[4]; - struct gl_transform off[4]; - pass_get_img_tex(p, &p->image, tex, off); - - struct gl_transform transform; - compute_src_transform(p, &transform); - - int index = 0; - for (int i = 0; i < p->plane_count; i++) { - int cw = tex[i].type == PLANE_CHROMA ? p->ra_format.chroma_w : 1; - int ch = tex[i].type == PLANE_CHROMA ? 
p->ra_format.chroma_h : 1; - if (p->image_params.rotate % 180 == 90) - MPSWAP(int, cw, ch); - - struct gl_transform t = transform; - t.m[0][0] *= chroma_realign(p->texture_w, cw); - t.m[1][1] *= chroma_realign(p->texture_h, ch); - - t.t[0] /= cw; - t.t[1] /= ch; - - t.t[0] += off[i].t[0]; - t.t[1] += off[i].t[1]; - - gl_transform_trans(tex[i].transform, &t); - tex[i].transform = t; - - copy_img_tex(p, &index, tex[i]); - } - - pass_convert_yuv(p); -} - -// The main rendering function, takes care of everything up to and including -// upscaling. p->image is rendered. -static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t id) -{ - // initialize the texture parameters and temporary variables - p->texture_w = p->image_params.w; - p->texture_h = p->image_params.h; - p->texture_offset = identity_trans; - p->components = 0; - p->saved_tex_num = 0; - p->hook_fbo_num = 0; - p->use_linear = false; - - // try uploading the frame - if (!pass_upload_image(p, mpi, id)) - return false; - - if (p->image_params.rotate % 180 == 90) - MPSWAP(int, p->texture_w, p->texture_h); - - if (p->dumb_mode) - return true; - - pass_read_video(p); - pass_opt_hook_point(p, "NATIVE", &p->texture_offset); - pass_convert_yuv(p); - pass_opt_hook_point(p, "MAINPRESUB", &p->texture_offset); - - // For subtitles - double vpts = p->image.mpi->pts; - if (vpts == MP_NOPTS_VALUE) - vpts = p->osd_pts; - - if (p->osd && p->opts.blend_subs == BLEND_SUBS_VIDEO) { - double scale[2]; - get_scale_factors(p, false, scale); - struct mp_osd_res rect = { - .w = p->texture_w, .h = p->texture_h, - .display_par = scale[1] / scale[0], // counter compensate scaling - }; - finish_pass_fbo(p, &p->blend_subs_fbo, rect.w, rect.h, 0); - pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, - p->blend_subs_fbo.fbo, false); - pass_read_fbo(p, &p->blend_subs_fbo); - pass_describe(p, "blend subs video"); - } - pass_opt_hook_point(p, "MAIN", &p->texture_offset); - - pass_scale_main(p); - - int vp_w = p->dst_rect.x1 - p->dst_rect.x0, - vp_h = p->dst_rect.y1 - p->dst_rect.y0; - if (p->osd && p->opts.blend_subs == BLEND_SUBS_YES) { - // Recreate the real video size from the src/dst rects - struct mp_osd_res rect = { - .w = vp_w, .h = vp_h, - .ml = -p->src_rect.x0, .mr = p->src_rect.x1 - p->image_params.w, - .mt = -p->src_rect.y0, .mb = p->src_rect.y1 - p->image_params.h, - .display_par = 1.0, - }; - // Adjust margins for scale - double scale[2]; - get_scale_factors(p, true, scale); - rect.ml *= scale[0]; rect.mr *= scale[0]; - rect.mt *= scale[1]; rect.mb *= scale[1]; - // We should always blend subtitles in non-linear light - if (p->use_linear) { - pass_delinearize(p->sc, p->image_params.color.gamma); - p->use_linear = false; - } - finish_pass_fbo(p, &p->blend_subs_fbo, p->texture_w, p->texture_h, 0); - pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, - p->blend_subs_fbo.fbo, false); - pass_read_fbo(p, &p->blend_subs_fbo); - pass_describe(p, "blend subs"); - } - - pass_opt_hook_point(p, "SCALED", NULL); - - return true; -} - -static void pass_draw_to_screen(struct gl_video *p, struct fbodst fbo) -{ - if (p->dumb_mode) - pass_render_frame_dumb(p); - - // Adjust the overall gamma before drawing to screen - if (p->user_gamma != 1) { - gl_sc_uniform_f(p->sc, "user_gamma", p->user_gamma); - GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) - GLSL(color.rgb = pow(color.rgb, vec3(user_gamma));) - } - - pass_colormanage(p, p->image_params.color, false); - - // Since finish_pass_direct doesn't work with compute shaders, and neither - // does the 
checkerboard/dither code, we may need an indirection via - // p->screen_fbo here. - if (p->pass_compute.active) { - int o_w = p->dst_rect.x1 - p->dst_rect.x0, - o_h = p->dst_rect.y1 - p->dst_rect.y0; - finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY); - struct img_tex tmp = img_tex_fbo(&p->screen_fbo, PLANE_RGB, p->components); - copy_img_tex(p, &(int){0}, tmp); - } - - if (p->has_alpha){ - if (p->opts.alpha_mode == ALPHA_BLEND_TILES) { - // Draw checkerboard pattern to indicate transparency - GLSLF("// transparency checkerboard\n"); - GLSL(bvec2 tile = lessThan(fract(gl_FragCoord.xy * 1.0/32.0), vec2(0.5));) - GLSL(vec3 background = vec3(tile.x == tile.y ? 1.0 : 0.75);) - GLSL(color.rgb = mix(background, color.rgb, color.a);) - } else if (p->opts.alpha_mode == ALPHA_BLEND) { - // Blend into background color (usually black) - struct m_color c = p->opts.background; - GLSLF("vec4 background = vec4(%f, %f, %f, %f);\n", - c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0); - GLSL(color = mix(background, vec4(color.rgb, 1.0), color.a);) - } - } - - pass_opt_hook_point(p, "OUTPUT", NULL); - - pass_dither(p); - pass_describe(p, "output to screen"); - finish_pass_direct(p, fbo, &p->dst_rect); -} - -static bool update_fbosurface(struct gl_video *p, struct mp_image *mpi, - uint64_t id, struct fbosurface *surf) -{ - int vp_w = p->dst_rect.x1 - p->dst_rect.x0, - vp_h = p->dst_rect.y1 - p->dst_rect.y0; - - pass_info_reset(p, false); - if (!pass_render_frame(p, mpi, id)) - return false; - - // Frame blending should always be done in linear light to preserve the - // overall brightness, otherwise this will result in flashing dark frames - // because mixing in compressed light artificially darkens the results - if (!p->use_linear) { - p->use_linear = true; - pass_linearize(p->sc, p->image_params.color.gamma); - } - - finish_pass_fbo(p, &surf->fbotex, vp_w, vp_h, FBOTEX_FUZZY); - surf->id = id; - surf->pts = mpi->pts; - return true; -} - -// Draws an interpolate frame to fbo, based on the frame timing in t -static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, - struct fbodst fbo) -{ - bool is_new = false; - - // Reset the queue completely if this is a still image, to avoid any - // interpolation artifacts from surrounding frames when unpausing or - // framestepping - if (t->still) - gl_video_reset_surfaces(p); - - // First of all, figure out if we have a frame available at all, and draw - // it manually + reset the queue if not - if (p->surfaces[p->surface_now].id == 0) { - struct fbosurface *now = &p->surfaces[p->surface_now]; - if (!update_fbosurface(p, t->current, t->frame_id, now)) - return; - p->surface_idx = p->surface_now; - is_new = true; - } - - // Find the right frame for this instant - if (t->current) { - int next = fbosurface_wrap(p->surface_now + 1); - while (p->surfaces[next].id && - p->surfaces[next].id > p->surfaces[p->surface_now].id && - p->surfaces[p->surface_now].id < t->frame_id) - { - p->surface_now = next; - next = fbosurface_wrap(next + 1); - } - } - - // Figure out the queue size. For illustration, a filter radius of 2 would - // look like this: _ A [B] C D _ - // A is surface_bse, B is surface_now, C is surface_now+1 and D is - // surface_end. 
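To make the ring indexing concrete, an editor sketch follows (the pool size and helper name are invented, not quoted from this file): surface indices wrap modulo the pool size, and for size 4 (radius 2) the window spans A..D around surface_now exactly as in the picture above.

#define POOL_SIZE 10                     /* illustrative number of surfaces */

static int wrap_idx(int i)
{
    i %= POOL_SIZE;
    return i < 0 ? i + POOL_SIZE : i;    /* keep negative indices in range */
}
/* With size == 4 (radius 2) and surface_now == 3:
 *   surface_bse = wrap_idx(3 - 1) == 2      (A)
 *   surface_end = wrap_idx(3 + 2) == 5      (D)
 * and wrap_idx(surface_bse + 3) == surface_end, matching the assert below. */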
- struct scaler *tscale = &p->scaler[SCALER_TSCALE]; - reinit_scaler(p, tscale, &p->opts.scaler[SCALER_TSCALE], 1, tscale_sizes); - bool oversample = strcmp(tscale->conf.kernel.name, "oversample") == 0; - bool linear = strcmp(tscale->conf.kernel.name, "linear") == 0; - int size; - - if (oversample || linear) { - size = 2; - } else { - assert(tscale->kernel && !tscale->kernel->polar); - size = ceil(tscale->kernel->size); - assert(size <= TEXUNIT_VIDEO_NUM); - } - - int radius = size/2; - int surface_now = p->surface_now; - int surface_bse = fbosurface_wrap(surface_now - (radius-1)); - int surface_end = fbosurface_wrap(surface_now + radius); - assert(fbosurface_wrap(surface_bse + size-1) == surface_end); - - // Render new frames while there's room in the queue. Note that technically, - // this should be done before the step where we find the right frame, but - // it only barely matters at the very beginning of playback, and this way - // makes the code much more linear. - int surface_dst = fbosurface_wrap(p->surface_idx + 1); - for (int i = 0; i < t->num_frames; i++) { - // Avoid overwriting data we might still need - if (surface_dst == surface_bse - 1) - break; - - struct mp_image *f = t->frames[i]; - uint64_t f_id = t->frame_id + i; - if (!mp_image_params_equal(&f->params, &p->real_image_params)) - continue; - - if (f_id > p->surfaces[p->surface_idx].id) { - struct fbosurface *dst = &p->surfaces[surface_dst]; - if (!update_fbosurface(p, f, f_id, dst)) - return; - p->surface_idx = surface_dst; - surface_dst = fbosurface_wrap(surface_dst + 1); - is_new = true; - } - } - - // Figure out whether the queue is "valid". A queue is invalid if the - // frames' PTS is not monotonically increasing. Anything else is invalid, - // so avoid blending incorrect data and just draw the latest frame as-is. - // Possible causes for failure of this condition include seeks, pausing, - // end of playback or start of playback. - bool valid = true; - for (int i = surface_bse, ii; valid && i != surface_end; i = ii) { - ii = fbosurface_wrap(i + 1); - if (p->surfaces[i].id == 0 || p->surfaces[ii].id == 0) { - valid = false; - } else if (p->surfaces[ii].id < p->surfaces[i].id) { - valid = false; - MP_DBG(p, "interpolation queue underrun\n"); - } - } - - // Update OSD PTS to synchronize subtitles with the displayed frame - p->osd_pts = p->surfaces[surface_now].pts; - - // Finally, draw the right mix of frames to the screen. - if (!is_new) - pass_info_reset(p, true); - pass_describe(p, "interpolation"); - if (!valid || t->still) { - // surface_now is guaranteed to be valid, so we can safely use it. - pass_read_fbo(p, &p->surfaces[surface_now].fbotex); - p->is_interpolated = false; - } else { - double mix = t->vsync_offset / t->ideal_frame_duration; - // The scaler code always wants the fcoord to be between 0 and 1, - // so we try to adjust by using the previous set of N frames instead - // (which requires some extra checking to make sure it's valid) - if (mix < 0.0) { - int prev = fbosurface_wrap(surface_bse - 1); - if (p->surfaces[prev].id != 0 && - p->surfaces[prev].id < p->surfaces[surface_bse].id) - { - mix += 1.0; - surface_bse = prev; - } else { - mix = 0.0; // at least don't blow up, this should only - // ever happen at the start of playback - } - } - - if (oversample) { - // Oversample uses the frame area as mix ratio, not the the vsync - // position itself - double vsync_dist = t->vsync_interval / t->ideal_frame_duration, - threshold = tscale->conf.kernel.params[0]; - threshold = isnan(threshold) ? 
0.0 : threshold; - mix = (1 - mix) / vsync_dist; - mix = mix <= 0 + threshold ? 0 : mix; - mix = mix >= 1 - threshold ? 1 : mix; - mix = 1 - mix; - } - - // Blend the frames together - if (oversample || linear) { - gl_sc_uniform_f(p->sc, "inter_coeff", mix); - GLSL(color = mix(texture(texture0, texcoord0), - texture(texture1, texcoord1), - inter_coeff);) - } else { - gl_sc_uniform_f(p->sc, "fcoord", mix); - pass_sample_separated_gen(p->sc, tscale, 0, 0); - } - - // Load all the required frames - for (int i = 0; i < size; i++) { - struct img_tex img = - img_tex_fbo(&p->surfaces[fbosurface_wrap(surface_bse+i)].fbotex, - PLANE_RGB, p->components); - // Since the code in pass_sample_separated currently assumes - // the textures are bound in-order and starting at 0, we just - // assert to make sure this is the case (which it should always be) - int id = pass_bind(p, img); - assert(id == i); - } - - MP_DBG(p, "inter frame dur: %f vsync: %f, mix: %f\n", - t->ideal_frame_duration, t->vsync_interval, mix); - p->is_interpolated = true; - } - pass_draw_to_screen(p, fbo); - - p->frames_drawn += 1; -} - -void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, - struct fbodst target) -{ - struct mp_rect target_rc = {0, 0, target.tex->params.w, target.tex->params.h}; - - p->broken_frame = false; - - bool has_frame = !!frame->current; - - if (!has_frame || !mp_rect_equals(&p->dst_rect, &target_rc)) { - struct m_color c = p->clear_color; - float color[4] = {c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0}; - p->ra->fns->clear(p->ra, target.tex, color, &target_rc); - } - - if (p->hwdec_active && p->hwdec->driver->overlay_frame) { - if (has_frame) { - float *color = p->hwdec->overlay_colorkey; - p->ra->fns->clear(p->ra, target.tex, color, &p->dst_rect); - } - - p->hwdec->driver->overlay_frame(p->hwdec, frame->current, - &p->src_rect, &p->dst_rect, - frame->frame_id != p->image.id); - - if (frame->current) - p->osd_pts = frame->current->pts; - - // Disable GL rendering - has_frame = false; - } - - if (has_frame) { - bool interpolate = p->opts.interpolation && frame->display_synced && - (p->frames_drawn || !frame->still); - if (interpolate) { - double ratio = frame->ideal_frame_duration / frame->vsync_interval; - if (fabs(ratio - 1.0) < p->opts.interpolation_threshold) - interpolate = false; - } - - if (interpolate) { - gl_video_interpolate_frame(p, frame, target); - } else { - bool is_new = frame->frame_id != p->image.id; - - // Redrawing a frame might update subtitles. 
- if (frame->still && p->opts.blend_subs) - is_new = true; - - if (is_new || !p->output_fbo_valid) { - p->output_fbo_valid = false; - - pass_info_reset(p, !is_new); - if (!pass_render_frame(p, frame->current, frame->frame_id)) - goto done; - - // For the non-interpolation case, we draw to a single "cache" - // FBO to speed up subsequent re-draws (if any exist) - struct fbodst dest_fbo = target; - if (frame->num_vsyncs > 1 && frame->display_synced && - !p->dumb_mode && (p->ra->caps & RA_CAP_BLIT)) - { - fbotex_change(&p->output_fbo, p->ra, p->log, - target.tex->params.w, target.tex->params.h, - p->fbo_format, FBOTEX_FUZZY); - dest_fbo = p->output_fbo.fbo; - p->output_fbo_valid = true; - } - pass_draw_to_screen(p, dest_fbo); - } - - // "output fbo valid" and "output fbo needed" are equivalent - if (p->output_fbo_valid) { - pass_info_reset(p, true); - pass_describe(p, "redraw cached frame"); - struct mp_rect src = p->dst_rect; - struct mp_rect dst = src; - if (target.flip) { - dst.y0 = target.tex->params.h - src.y0; - dst.y1 = target.tex->params.h - src.y1; - } - timer_pool_start(p->blit_timer); - p->ra->fns->blit(p->ra, target.tex, p->output_fbo.tex, - &dst, &src); - timer_pool_stop(p->blit_timer); - pass_record(p, timer_pool_measure(p->blit_timer)); - } - } - } - -done: - - unmap_current_image(p); - - debug_check_gl(p, "after video rendering"); - - if (p->osd) { - // If we haven't actually drawn anything so far, then we technically - // need to consider this the start of a new pass. Let's call it a - // redraw just because, since it's basically a blank frame anyway - if (!has_frame) - pass_info_reset(p, true); - - pass_draw_osd(p, p->opts.blend_subs ? OSD_DRAW_OSD_ONLY : 0, - p->osd_pts, p->osd_rect, target, true); - debug_check_gl(p, "after OSD rendering"); - } - - if (gl_sc_error_state(p->sc) || p->broken_frame) { - // Make the screen solid blue to make it visually clear that an - // error has occurred - float color[4] = {0.0, 0.05, 0.5, 1.0}; - p->ra->fns->clear(p->ra, target.tex, color, &target_rc); - } - - // The playloop calls this last before waiting some time until it decides - // to call flip_page(). Tell OpenGL to start execution of the GPU commands - // while we sleep (this happens asynchronously). - if ((p->opts.early_flush == -1 && !frame->display_synced) || - p->opts.early_flush == 1) - { - if (p->ra->fns->flush) - p->ra->fns->flush(p->ra); - } - - p->frames_rendered++; - pass_report_performance(p); -} - -// Use this color instead of the global option. -void gl_video_set_clear_color(struct gl_video *p, struct m_color c) -{ - p->force_clear_color = true; - p->clear_color = c; -} - -void gl_video_set_osd_pts(struct gl_video *p, double pts) -{ - p->osd_pts = pts; -} - -bool gl_video_check_osd_change(struct gl_video *p, struct mp_osd_res *res, - double pts) -{ - return p->osd ? 
mpgl_osd_check_change(p->osd, res, pts) : false; -} - -void gl_video_resize(struct gl_video *p, - struct mp_rect *src, struct mp_rect *dst, - struct mp_osd_res *osd) -{ - if (mp_rect_equals(&p->src_rect, src) && - mp_rect_equals(&p->dst_rect, dst) && - osd_res_equals(p->osd_rect, *osd)) - return; - - p->src_rect = *src; - p->dst_rect = *dst; - p->osd_rect = *osd; - - gl_video_reset_surfaces(p); - - if (p->osd) - mpgl_osd_resize(p->osd, p->osd_rect, p->image_params.stereo_out); -} - -static void frame_perf_data(struct pass_info pass[], struct mp_frame_perf *out) -{ - for (int i = 0; i < PASS_INFO_MAX; i++) { - if (!pass[i].desc.len) - break; - out->perf[out->count] = pass[i].perf; - out->desc[out->count] = pass[i].desc.start; - out->count++; - } -} - -void gl_video_perfdata(struct gl_video *p, struct voctrl_performance_data *out) -{ - *out = (struct voctrl_performance_data){0}; - frame_perf_data(p->pass_fresh, &out->fresh); - frame_perf_data(p->pass_redraw, &out->redraw); -} - -// This assumes nv12, with textures set to GL_NEAREST filtering. -static void reinterleave_vdpau(struct gl_video *p, - struct ra_tex *input[4], struct ra_tex *output[2]) -{ - for (int n = 0; n < 2; n++) { - struct fbotex *fbo = &p->vdpau_deinterleave_fbo[n]; - // This is an array of the 2 to-merge planes. - struct ra_tex **src = &input[n * 2]; - int w = src[0]->params.w; - int h = src[0]->params.h; - int ids[2]; - for (int t = 0; t < 2; t++) { - ids[t] = pass_bind(p, (struct img_tex){ - .tex = src[t], - .multiplier = 1.0, - .transform = identity_trans, - .w = w, - .h = h, - }); - } - - GLSLF("color = fract(gl_FragCoord.y * 0.5) < 0.5\n"); - GLSLF(" ? texture(texture%d, texcoord%d)\n", ids[0], ids[0]); - GLSLF(" : texture(texture%d, texcoord%d);", ids[1], ids[1]); - - const struct ra_format *fmt = - ra_find_unorm_format(p->ra, 1, n == 0 ? 1 : 2); - fbotex_change(fbo, p->ra, p->log, w, h * 2, fmt, 0); - - pass_describe(p, "vdpau reinterleaving"); - finish_pass_direct(p, fbo->fbo, &(struct mp_rect){0, 0, w, h * 2}); - - output[n] = fbo->tex; - } -} - -// Returns false on failure. 
-static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t id) -{ - struct video_image *vimg = &p->image; - - if (vimg->id == id) - return true; - - unref_current_image(p); - - mpi = mp_image_new_ref(mpi); - if (!mpi) - goto error; - - vimg->mpi = mpi; - vimg->id = id; - p->osd_pts = mpi->pts; - p->frames_uploaded++; - - if (p->hwdec_active) { - // Hardware decoding - - if (!p->hwdec_mapper) - goto error; - - pass_describe(p, "map frame (hwdec)"); - timer_pool_start(p->upload_timer); - bool ok = ra_hwdec_mapper_map(p->hwdec_mapper, vimg->mpi) >= 0; - timer_pool_stop(p->upload_timer); - pass_record(p, timer_pool_measure(p->upload_timer)); - - vimg->hwdec_mapped = true; - if (ok) { - struct mp_image layout = {0}; - mp_image_set_params(&layout, &p->image_params); - struct ra_tex **tex = p->hwdec_mapper->tex; - struct ra_tex *tmp[4] = {0}; - if (p->hwdec_mapper->vdpau_fields) { - reinterleave_vdpau(p, tex, tmp); - tex = tmp; - } - for (int n = 0; n < p->plane_count; n++) { - vimg->planes[n] = (struct texplane){ - .w = mp_image_plane_w(&layout, n), - .h = mp_image_plane_h(&layout, n), - .tex = tex[n], - }; - } - } else { - MP_FATAL(p, "Mapping hardware decoded surface failed.\n"); - goto error; - } - return true; - } - - // Software decoding - assert(mpi->num_planes == p->plane_count); - - timer_pool_start(p->upload_timer); - for (int n = 0; n < p->plane_count; n++) { - struct texplane *plane = &vimg->planes[n]; - - plane->flipped = mpi->stride[0] < 0; - - struct ra_tex_upload_params params = { - .tex = plane->tex, - .src = mpi->planes[n], - .invalidate = true, - .stride = mpi->stride[n], - }; - - struct dr_buffer *mapped = gl_find_dr_buffer(p, mpi->planes[n]); - if (mapped) { - params.buf = mapped->buf; - params.buf_offset = (uintptr_t)params.src - - (uintptr_t)mapped->buf->data; - params.src = NULL; - } - - if (p->using_dr_path != !!mapped) { - p->using_dr_path = !!mapped; - MP_VERBOSE(p, "DR enabled: %s\n", p->using_dr_path ? "yes" : "no"); - } - - if (!p->ra->fns->tex_upload(p->ra, ¶ms)) { - timer_pool_stop(p->upload_timer); - goto error; - } - - if (mapped && !mapped->mpi) - mapped->mpi = mp_image_new_ref(mpi); - } - timer_pool_stop(p->upload_timer); - - bool using_pbo = p->ra->use_pbo || !(p->ra->caps & RA_CAP_DIRECT_UPLOAD); - const char *mode = p->using_dr_path ? "DR" : using_pbo ? "PBO" : "naive"; - pass_describe(p, "upload frame (%s)", mode); - pass_record(p, timer_pool_measure(p->upload_timer)); - - return true; - -error: - unref_current_image(p); - p->broken_frame = true; - return false; -} - -static bool test_fbo(struct gl_video *p, const struct ra_format *fmt) -{ - MP_VERBOSE(p, "Testing FBO format %s\n", fmt->name); - struct fbotex fbo = {0}; - bool success = fbotex_change(&fbo, p->ra, p->log, 16, 16, fmt, 0); - fbotex_uninit(&fbo); - return success; -} - -// Return whether dumb-mode can be used without disabling any features. -// Essentially, vo_opengl with mostly default settings will return true. 
-static bool check_dumb_mode(struct gl_video *p) -{ - struct gl_video_opts *o = &p->opts; - if (p->use_integer_conversion) - return false; - if (o->dumb_mode > 0) // requested by user - return true; - if (o->dumb_mode < 0) // disabled by user - return false; - - // otherwise, use auto-detection - if (o->target_prim || o->target_trc || o->linear_scaling || - o->correct_downscaling || o->sigmoid_upscaling || o->interpolation || - o->blend_subs || o->deband || o->unsharp) - return false; - // check remaining scalers (tscale is already implicitly excluded above) - for (int i = 0; i < SCALER_COUNT; i++) { - if (i != SCALER_TSCALE) { - const char *name = o->scaler[i].kernel.name; - if (name && strcmp(name, "bilinear") != 0) - return false; - } - } - if (o->user_shaders && o->user_shaders[0]) - return false; - if (p->use_lut_3d) - return false; - return true; -} - -// Disable features that are not supported with the current OpenGL version. -static void check_gl_features(struct gl_video *p) -{ - struct ra *ra = p->ra; - bool have_float_tex = !!ra_find_float16_format(ra, 1); - bool have_mglsl = ra->glsl_version >= 130; // modern GLSL - const struct ra_format *rg_tex = ra_find_unorm_format(p->ra, 1, 2); - bool have_texrg = rg_tex && !rg_tex->luminance_alpha; - bool have_compute = ra->caps & RA_CAP_COMPUTE; - bool have_ssbo = ra->caps & RA_CAP_BUF_RW; - - const char *auto_fbo_fmts[] = {"rgba16", "rgba16f", "rgb10_a2", "rgba8", 0}; - const char *user_fbo_fmts[] = {p->opts.fbo_format, 0}; - const char **fbo_fmts = user_fbo_fmts[0] && strcmp(user_fbo_fmts[0], "auto") - ? user_fbo_fmts : auto_fbo_fmts; - bool have_fbo = false; - p->fbo_format = NULL; - for (int n = 0; fbo_fmts[n]; n++) { - const char *fmt = fbo_fmts[n]; - const struct ra_format *f = ra_find_named_format(p->ra, fmt); - if (!f && fbo_fmts == user_fbo_fmts) - MP_WARN(p, "FBO format '%s' not found!\n", fmt); - if (f && f->renderable && f->linear_filter && test_fbo(p, f)) { - MP_VERBOSE(p, "Using FBO format %s.\n", f->name); - have_fbo = true; - p->fbo_format = f; - break; - } - } - - p->forced_dumb_mode = p->opts.dumb_mode > 0 || !have_fbo || !have_texrg; - bool voluntarily_dumb = check_dumb_mode(p); - if (p->forced_dumb_mode || voluntarily_dumb) { - if (voluntarily_dumb) { - MP_VERBOSE(p, "No advanced processing required. Enabling dumb mode.\n"); - } else if (p->opts.dumb_mode <= 0) { - MP_WARN(p, "High bit depth FBOs unsupported. Enabling dumb mode.\n" - "Most extended features will be disabled.\n"); - } - p->dumb_mode = true; - p->use_lut_3d = false; - // Most things don't work, so whitelist all options that still work. 
- p->opts = (struct gl_video_opts){ - .gamma = p->opts.gamma, - .gamma_auto = p->opts.gamma_auto, - .pbo = p->opts.pbo, - .fbo_format = p->opts.fbo_format, - .alpha_mode = p->opts.alpha_mode, - .use_rectangle = p->opts.use_rectangle, - .background = p->opts.background, - .dither_algo = p->opts.dither_algo, - .dither_depth = p->opts.dither_depth, - .dither_size = p->opts.dither_size, - .temporal_dither = p->opts.temporal_dither, - .temporal_dither_period = p->opts.temporal_dither_period, - .tex_pad_x = p->opts.tex_pad_x, - .tex_pad_y = p->opts.tex_pad_y, - .tone_mapping = p->opts.tone_mapping, - .tone_mapping_param = p->opts.tone_mapping_param, - .tone_mapping_desat = p->opts.tone_mapping_desat, - .early_flush = p->opts.early_flush, - }; - for (int n = 0; n < SCALER_COUNT; n++) - p->opts.scaler[n] = gl_video_opts_def.scaler[n]; - return; - } - p->dumb_mode = false; - - // Normally, we want to disable them by default if FBOs are unavailable, - // because they will be slow (not critically slow, but still slower). - // Without FP textures, we must always disable them. - // I don't know if luminance alpha float textures exist, so disregard them. - for (int n = 0; n < SCALER_COUNT; n++) { - const struct filter_kernel *kernel = - mp_find_filter_kernel(p->opts.scaler[n].kernel.name); - if (kernel) { - char *reason = NULL; - if (!have_float_tex) - reason = "(float tex. missing)"; - if (!have_mglsl) - reason = "(GLSL version too old)"; - if (reason) { - MP_WARN(p, "Disabling scaler #%d %s %s.\n", n, - p->opts.scaler[n].kernel.name, reason); - // p->opts is a copy => we can just mess with it. - p->opts.scaler[n].kernel.name = "bilinear"; - if (n == SCALER_TSCALE) - p->opts.interpolation = 0; - } - } - } - - int use_cms = p->opts.target_prim != MP_CSP_PRIM_AUTO || - p->opts.target_trc != MP_CSP_TRC_AUTO || p->use_lut_3d; - - // mix() is needed for some gamma functions - if (!have_mglsl && (p->opts.linear_scaling || p->opts.sigmoid_upscaling)) { - p->opts.linear_scaling = false; - p->opts.sigmoid_upscaling = false; - MP_WARN(p, "Disabling linear/sigmoid scaling (GLSL version too old).\n"); - } - if (!have_mglsl && use_cms) { - p->opts.target_prim = MP_CSP_PRIM_AUTO; - p->opts.target_trc = MP_CSP_TRC_AUTO; - p->use_lut_3d = false; - MP_WARN(p, "Disabling color management (GLSL version too old).\n"); - } - if (!have_mglsl && p->opts.deband) { - p->opts.deband = 0; - MP_WARN(p, "Disabling debanding (GLSL version too old).\n"); - } - if ((!have_compute || !have_ssbo) && p->opts.compute_hdr_peak) { - p->opts.compute_hdr_peak = 0; - MP_WARN(p, "Disabling HDR peak computation (no compute shaders).\n"); - } -} - -static void init_gl(struct gl_video *p) -{ - debug_check_gl(p, "before init_gl"); - - p->upload_timer = timer_pool_create(p->ra); - p->blit_timer = timer_pool_create(p->ra); - p->osd_timer = timer_pool_create(p->ra); - - debug_check_gl(p, "after init_gl"); - - ra_dump_tex_formats(p->ra, MSGL_DEBUG); - ra_dump_img_formats(p->ra, MSGL_DEBUG); -} - -void gl_video_uninit(struct gl_video *p) -{ - if (!p) - return; - - uninit_video(p); - - gl_sc_destroy(p->sc); - - ra_tex_free(p->ra, &p->lut_3d_texture); - ra_buf_free(p->ra, &p->hdr_peak_ssbo); - - timer_pool_destroy(p->upload_timer); - timer_pool_destroy(p->blit_timer); - timer_pool_destroy(p->osd_timer); - - for (int i = 0; i < PASS_INFO_MAX; i++) { - talloc_free(p->pass_fresh[i].desc.start); - talloc_free(p->pass_redraw[i].desc.start); - } - - mpgl_osd_destroy(p->osd); - - // Forcibly destroy possibly remaining image references. 
This should also - // cause gl_video_dr_free_buffer() to be called for the remaining buffers. - gc_pending_dr_fences(p, true); - - // Should all have been unreffed already. - assert(!p->num_dr_buffers); - - talloc_free(p); -} - -void gl_video_reset(struct gl_video *p) -{ - gl_video_reset_surfaces(p); -} - -bool gl_video_showing_interpolated_frame(struct gl_video *p) -{ - return p->is_interpolated; -} - -static bool is_imgfmt_desc_supported(struct gl_video *p, - const struct ra_imgfmt_desc *desc) -{ - if (!desc->num_planes) - return false; - - if (desc->planes[0]->ctype == RA_CTYPE_UINT && p->forced_dumb_mode) - return false; - - return true; -} - -bool gl_video_check_format(struct gl_video *p, int mp_format) -{ - struct ra_imgfmt_desc desc; - if (ra_get_imgfmt_desc(p->ra, mp_format, &desc) && - is_imgfmt_desc_supported(p, &desc)) - return true; - if (p->hwdec && ra_hwdec_test_format(p->hwdec, mp_format)) - return true; - return false; -} - -void gl_video_config(struct gl_video *p, struct mp_image_params *params) -{ - unmap_overlay(p); - unref_current_image(p); - - if (!mp_image_params_equal(&p->real_image_params, params)) { - uninit_video(p); - p->real_image_params = *params; - p->image_params = *params; - if (params->imgfmt) - init_video(p); - } - - gl_video_reset_surfaces(p); -} - -void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd) -{ - mpgl_osd_destroy(p->osd); - p->osd = NULL; - p->osd_state = osd; - reinit_osd(p); -} - -struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, - struct mpv_global *g) -{ - struct gl_video *p = talloc_ptrtype(NULL, p); - *p = (struct gl_video) { - .ra = ra, - .global = g, - .log = log, - .sc = gl_sc_create(ra, g, log), - .video_eq = mp_csp_equalizer_create(p, g), - .opts_cache = m_config_cache_alloc(p, g, &gl_video_conf), - }; - // make sure this variable is initialized to *something* - p->pass = p->pass_fresh; - struct gl_video_opts *opts = p->opts_cache->opts; - p->cms = gl_lcms_init(p, log, g, opts->icc_opts), - p->opts = *opts; - for (int n = 0; n < SCALER_COUNT; n++) - p->scaler[n] = (struct scaler){.index = n}; - init_gl(p); - reinit_from_options(p); - return p; -} - -// Get static string for scaler shader. If "tscale" is set to true, the -// scaler must be a separable convolution filter. -static const char *handle_scaler_opt(const char *name, bool tscale) -{ - if (name && name[0]) { - const struct filter_kernel *kernel = mp_find_filter_kernel(name); - if (kernel && (!tscale || !kernel->polar)) - return kernel->f.name; - - for (const char *const *filter = tscale ? fixed_tscale_filters - : fixed_scale_filters; - *filter; filter++) { - if (strcmp(*filter, name) == 0) - return *filter; - } - } - return NULL; -} - -void gl_video_update_options(struct gl_video *p) -{ - if (m_config_cache_update(p->opts_cache)) { - gl_lcms_update_options(p->cms); - reinit_from_options(p); - } -} - -static void reinit_from_options(struct gl_video *p) -{ - p->use_lut_3d = gl_lcms_has_profile(p->cms); - - // Copy the option fields, so that check_gl_features() can mutate them. - // This works only for the fields themselves of course, not for any memory - // referenced by them. 
- p->opts = *(struct gl_video_opts *)p->opts_cache->opts; - - if (!p->force_clear_color) - p->clear_color = p->opts.background; - - check_gl_features(p); - uninit_rendering(p); - gl_sc_set_cache_dir(p->sc, p->opts.shader_cache_dir); - p->ra->use_pbo = p->opts.pbo; - gl_video_setup_hooks(p); - reinit_osd(p); - - if (p->opts.interpolation && !p->global->opts->video_sync && !p->dsi_warned) { - MP_WARN(p, "Interpolation now requires enabling display-sync mode.\n" - "E.g.: --video-sync=display-resample\n"); - p->dsi_warned = true; - } -} - -void gl_video_configure_queue(struct gl_video *p, struct vo *vo) -{ - int queue_size = 1; - - // Figure out an adequate size for the interpolation queue. The larger - // the radius, the earlier we need to queue frames. - if (p->opts.interpolation) { - const struct filter_kernel *kernel = - mp_find_filter_kernel(p->opts.scaler[SCALER_TSCALE].kernel.name); - if (kernel) { - // filter_scale wouldn't be correctly initialized were we to use it here. - // This is fine since we're always upsampling, but beware if downsampling - // is added! - double radius = kernel->f.radius; - radius = radius > 0 ? radius : p->opts.scaler[SCALER_TSCALE].radius; - queue_size += 1 + ceil(radius); - } else { - // Oversample/linear case - queue_size += 2; - } - } - - vo_set_queue_params(vo, 0, queue_size); -} - -static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param) -{ - char s[20] = {0}; - int r = 1; - bool tscale = bstr_equals0(name, "tscale"); - if (bstr_equals0(param, "help")) { - r = M_OPT_EXIT; - } else { - snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); - if (!handle_scaler_opt(s, tscale)) - r = M_OPT_INVALID; - } - if (r < 1) { - mp_info(log, "Available scalers:\n"); - for (const char *const *filter = tscale ? 
fixed_tscale_filters - : fixed_scale_filters; - *filter; filter++) { - mp_info(log, " %s\n", *filter); - } - for (int n = 0; mp_filter_kernels[n].f.name; n++) { - if (!tscale || !mp_filter_kernels[n].polar) - mp_info(log, " %s\n", mp_filter_kernels[n].f.name); - } - if (s[0]) - mp_fatal(log, "No scaler named '%s' found!\n", s); - } - return r; -} - -static int validate_window_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param) -{ - char s[20] = {0}; - int r = 1; - if (bstr_equals0(param, "help")) { - r = M_OPT_EXIT; - } else { - snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); - const struct filter_window *window = mp_find_filter_window(s); - if (!window) - r = M_OPT_INVALID; - } - if (r < 1) { - mp_info(log, "Available windows:\n"); - for (int n = 0; mp_filter_windows[n].name; n++) - mp_info(log, " %s\n", mp_filter_windows[n].name); - if (s[0]) - mp_fatal(log, "No window named '%s' found!\n", s); - } - return r; -} - -float gl_video_scale_ambient_lux(float lmin, float lmax, - float rmin, float rmax, float lux) -{ - assert(lmax > lmin); - - float num = (rmax - rmin) * (log10(lux) - log10(lmin)); - float den = log10(lmax) - log10(lmin); - float result = num / den + rmin; - - // clamp the result - float max = MPMAX(rmax, rmin); - float min = MPMIN(rmax, rmin); - return MPMAX(MPMIN(result, max), min); -} - -void gl_video_set_ambient_lux(struct gl_video *p, int lux) -{ - if (p->opts.gamma_auto) { - float gamma = gl_video_scale_ambient_lux(16.0, 64.0, 2.40, 1.961, lux); - MP_VERBOSE(p, "ambient light changed: %dlux (gamma: %f)\n", lux, gamma); - p->opts.gamma = MPMIN(1.0, 1.961 / gamma); - } -} - -void gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec) -{ - unref_current_image(p); - ra_hwdec_mapper_free(&p->hwdec_mapper); - p->hwdec = hwdec; -} - -static void *gl_video_dr_alloc_buffer(struct gl_video *p, size_t size) -{ - struct ra_buf_params params = { - .type = RA_BUF_TYPE_TEX_UPLOAD, - .host_mapped = true, - .size = size, - }; - - struct ra_buf *buf = ra_buf_create(p->ra, ¶ms); - if (!buf) - return NULL; - - MP_TARRAY_GROW(p, p->dr_buffers, p->num_dr_buffers); - p->dr_buffers[p->num_dr_buffers++] = (struct dr_buffer){ .buf = buf }; - - return buf->data; -}; - -static void gl_video_dr_free_buffer(void *opaque, uint8_t *data) -{ - struct gl_video *p = opaque; - - for (int n = 0; n < p->num_dr_buffers; n++) { - struct dr_buffer *buffer = &p->dr_buffers[n]; - if (buffer->buf->data == data) { - assert(!buffer->mpi); // can't be freed while it has a ref - ra_buf_free(p->ra, &buffer->buf); - MP_TARRAY_REMOVE_AT(p->dr_buffers, p->num_dr_buffers, n); - return; - } - } - // not found - must not happen - assert(0); -} - -struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h, - int stride_align) -{ - int size = mp_image_get_alloc_size(imgfmt, w, h, stride_align); - if (size < 0) - return NULL; - - int alloc_size = size + stride_align; - void *ptr = gl_video_dr_alloc_buffer(p, alloc_size); - if (!ptr) - return NULL; - - // (we expect vo.c to proxy the free callback, so it happens in the same - // thread it was allocated in, removing the need for synchronization) - struct mp_image *res = mp_image_from_buffer(imgfmt, w, h, stride_align, - ptr, alloc_size, p, - gl_video_dr_free_buffer); - if (!res) - gl_video_dr_free_buffer(p, ptr); - return res; -} diff --git a/video/out/opengl/video.h b/video/out/opengl/video.h deleted file mode 100644 index d163bc8405..0000000000 --- a/video/out/opengl/video.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * 
This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#ifndef MP_GL_VIDEO_H -#define MP_GL_VIDEO_H - -#include - -#include "options/m_option.h" -#include "sub/osd.h" -#include "utils.h" -#include "lcms.h" -#include "shader_cache.h" -#include "video/csputils.h" -#include "video/out/filter_kernels.h" -#include "video/out/vo.h" - -// Assume we have this many texture units for sourcing additional passes. -// The actual texture unit assignment is dynamic. -#define TEXUNIT_VIDEO_NUM 6 - -struct scaler_fun { - char *name; - float params[2]; - float blur; - float taper; -}; - -struct scaler_config { - struct scaler_fun kernel; - struct scaler_fun window; - float radius; - float antiring; - float cutoff; - float clamp; -}; - -struct scaler { - int index; - struct scaler_config conf; - double scale_factor; - bool initialized; - struct filter_kernel *kernel; - struct ra_tex *lut; - struct fbotex sep_fbo; - bool insufficient; - int lut_size; - - // kernel points here - struct filter_kernel kernel_storage; -}; - -enum scaler_unit { - SCALER_SCALE, // luma/video - SCALER_DSCALE, // luma-video downscaling - SCALER_CSCALE, // chroma upscaling - SCALER_TSCALE, // temporal scaling (interpolation) - SCALER_COUNT -}; - -enum dither_algo { - DITHER_NONE = 0, - DITHER_FRUIT, - DITHER_ORDERED, -}; - -enum alpha_mode { - ALPHA_NO = 0, - ALPHA_YES, - ALPHA_BLEND, - ALPHA_BLEND_TILES, -}; - -enum blend_subs_mode { - BLEND_SUBS_NO = 0, - BLEND_SUBS_YES, - BLEND_SUBS_VIDEO, -}; - -enum tone_mapping { - TONE_MAPPING_CLIP, - TONE_MAPPING_MOBIUS, - TONE_MAPPING_REINHARD, - TONE_MAPPING_HABLE, - TONE_MAPPING_GAMMA, - TONE_MAPPING_LINEAR, -}; - -// How many frames to average over for HDR peak detection -#define PEAK_DETECT_FRAMES 100 - -struct gl_video_opts { - int dumb_mode; - struct scaler_config scaler[4]; - int scaler_lut_size; - float gamma; - int gamma_auto; - int target_prim; - int target_trc; - int target_brightness; - int tone_mapping; - int compute_hdr_peak; - float tone_mapping_param; - float tone_mapping_desat; - int gamut_warning; - int linear_scaling; - int correct_downscaling; - int sigmoid_upscaling; - float sigmoid_center; - float sigmoid_slope; - int scaler_resizes_only; - int pbo; - int dither_depth; - int dither_algo; - int dither_size; - int temporal_dither; - int temporal_dither_period; - char *fbo_format; - int alpha_mode; - int use_rectangle; - struct m_color background; - int interpolation; - float interpolation_threshold; - int blend_subs; - char **user_shaders; - int deband; - struct deband_opts *deband_opts; - float unsharp; - int tex_pad_x, tex_pad_y; - struct mp_icc_opts *icc_opts; - int early_flush; - char *shader_cache_dir; -}; - -extern const struct m_sub_options gl_video_conf; - -struct gl_video; -struct vo_frame; - -struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, - struct mpv_global *g); -void gl_video_uninit(struct gl_video *p); -void gl_video_set_osd_source(struct gl_video 
*p, struct osd_state *osd); -void gl_video_update_options(struct gl_video *p); -bool gl_video_check_format(struct gl_video *p, int mp_format); -void gl_video_config(struct gl_video *p, struct mp_image_params *params); -void gl_video_set_output_depth(struct gl_video *p, int r, int g, int b); -void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, - struct fbodst target); -void gl_video_resize(struct gl_video *p, - struct mp_rect *src, struct mp_rect *dst, - struct mp_osd_res *osd); -void gl_video_set_fb_depth(struct gl_video *p, int fb_depth); -void gl_video_perfdata(struct gl_video *p, struct voctrl_performance_data *out); -void gl_video_set_clear_color(struct gl_video *p, struct m_color color); -void gl_video_set_osd_pts(struct gl_video *p, double pts); -bool gl_video_check_osd_change(struct gl_video *p, struct mp_osd_res *osd, - double pts); - -float gl_video_scale_ambient_lux(float lmin, float lmax, - float rmin, float rmax, float lux); -void gl_video_set_ambient_lux(struct gl_video *p, int lux); -void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data); -bool gl_video_icc_auto_enabled(struct gl_video *p); -bool gl_video_gamma_auto_enabled(struct gl_video *p); -struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p); - -void gl_video_reset(struct gl_video *p); -bool gl_video_showing_interpolated_frame(struct gl_video *p); - -struct ra_hwdec; -void gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec); - -struct vo; -void gl_video_configure_queue(struct gl_video *p, struct vo *vo); - -struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h, - int stride_align); - - -#endif diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c deleted file mode 100644 index 60c5ce82ac..0000000000 --- a/video/out/opengl/video_shaders.c +++ /dev/null @@ -1,872 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include - -#include "video_shaders.h" -#include "video.h" - -#define GLSL(x) gl_sc_add(sc, #x "\n"); -#define GLSLF(...) gl_sc_addf(sc, __VA_ARGS__) -#define GLSLH(x) gl_sc_hadd(sc, #x "\n"); -#define GLSLHF(...) 
gl_sc_haddf(sc, __VA_ARGS__) - -// Set up shared/commonly used variables and macros -void sampler_prelude(struct gl_shader_cache *sc, int tex_num) -{ - GLSLF("#undef tex\n"); - GLSLF("#undef texmap\n"); - GLSLF("#define tex texture%d\n", tex_num); - GLSLF("#define texmap texmap%d\n", tex_num); - GLSLF("vec2 pos = texcoord%d;\n", tex_num); - GLSLF("vec2 size = texture_size%d;\n", tex_num); - GLSLF("vec2 pt = pixel_size%d;\n", tex_num); -} - -static void pass_sample_separated_get_weights(struct gl_shader_cache *sc, - struct scaler *scaler) -{ - gl_sc_uniform_texture(sc, "lut", scaler->lut); - GLSLF("float ypos = LUT_POS(fcoord, %d.0);\n", scaler->lut_size); - - int N = scaler->kernel->size; - int width = (N + 3) / 4; // round up - - GLSLF("float weights[%d];\n", N); - for (int i = 0; i < N; i++) { - if (i % 4 == 0) - GLSLF("c = texture(lut, vec2(%f, ypos));\n", (i / 4 + 0.5) / width); - GLSLF("weights[%d] = c[%d];\n", i, i % 4); - } -} - -// Handle a single pass (either vertical or horizontal). The direction is given -// by the vector (d_x, d_y). If the vector is 0, then planar interpolation is -// used instead (samples from texture0 through textureN) -void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler, - int d_x, int d_y) -{ - int N = scaler->kernel->size; - bool use_ar = scaler->conf.antiring > 0; - bool planar = d_x == 0 && d_y == 0; - GLSL(color = vec4(0.0);) - GLSLF("{\n"); - if (!planar) { - GLSLF("vec2 dir = vec2(%d.0, %d.0);\n", d_x, d_y); - GLSL(pt *= dir;) - GLSL(float fcoord = dot(fract(pos * size - vec2(0.5)), dir);) - GLSLF("vec2 base = pos - fcoord * pt - pt * vec2(%d.0);\n", N / 2 - 1); - } - GLSL(vec4 c;) - if (use_ar) { - GLSL(vec4 hi = vec4(0.0);) - GLSL(vec4 lo = vec4(1.0);) - } - pass_sample_separated_get_weights(sc, scaler); - GLSLF("// scaler samples\n"); - for (int n = 0; n < N; n++) { - if (planar) { - GLSLF("c = texture(texture%d, texcoord%d);\n", n, n); - } else { - GLSLF("c = texture(tex, base + pt * vec2(%d.0));\n", n); - } - GLSLF("color += vec4(weights[%d]) * c;\n", n); - if (use_ar && (n == N/2-1 || n == N/2)) { - GLSL(lo = min(lo, c);) - GLSL(hi = max(hi, c);) - } - } - if (use_ar) - GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n", - scaler->conf.antiring); - GLSLF("}\n"); -} - -// Subroutine for computing and adding an individual texel contribution -// If subtexel < 0 and offset < 0, samples directly. -// If subtexel >= 0, takes the texel from cN[subtexel] -// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset] -static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, - int x, int y, int subtexel, int offset, int components) -{ - double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale; - double radius_cutoff = scaler->kernel->radius_cutoff; - - // Since we can't know the subpixel position in advance, assume a - // worst case scenario - int yy = y > 0 ? y-1 : y; - int xx = x > 0 ? 
x-1 : x; - double dmax = sqrt(xx*xx + yy*yy); - // Skip samples definitely outside the radius - if (dmax >= radius_cutoff) - return; - GLSLF("d = length(vec2(%d.0, %d.0) - fcoord);\n", x, y); - // Check for samples that might be skippable - bool maybe_skippable = dmax >= radius_cutoff - M_SQRT2; - if (maybe_skippable) - GLSLF("if (d < %f) {\n", radius_cutoff); - - // get the weight for this pixel - if (scaler->lut->params.dimensions == 1) { - GLSLF("w = tex1D(lut, LUT_POS(d * 1.0/%f, %d.0)).r;\n", - radius, scaler->lut_size); - } else { - GLSLF("w = texture(lut, vec2(0.5, LUT_POS(d * 1.0/%f, %d.0))).r;\n", - radius, scaler->lut_size); - } - GLSL(wsum += w;) - - if (subtexel < 0 && offset < 0) { - GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y); - GLSL(color += vec4(w) * c0;) - } else if (subtexel >= 0) { - for (int n = 0; n < components; n++) - GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel); - } else if (offset >= 0) { - for (int n = 0; n lut); - - GLSLF("// scaler samples\n"); - int bound = ceil(scaler->kernel->radius_cutoff); - for (int y = 1-bound; y <= bound; y += 2) { - for (int x = 1-bound; x <= bound; x += 2) { - // First we figure out whether it's more efficient to use direct - // sampling or gathering. The problem is that gathering 4 texels - // only to discard some of them is very wasteful, so only do it if - // we suspect it will be a win rather than a loss. This is the case - // exactly when all four texels are within bounds - bool use_gather = sqrt(x*x + y*y) < scaler->kernel->radius_cutoff; - - // textureGather is only supported in GLSL 400+ - if (glsl_version < 400) - use_gather = false; - - if (use_gather) { - // Gather the four surrounding texels simultaneously - for (int n = 0; n < components; n++) { - GLSLF("c%d = textureGatherOffset(tex, base, ivec2(%d, %d), %d);\n", - n, x, y, n); - } - - // Mix in all of the points with their weights - for (int p = 0; p < 4; p++) { - // The four texels are gathered counterclockwise starting - // from the bottom left - static const int xo[4] = {0, 1, 1, 0}; - static const int yo[4] = {1, 1, 0, 0}; - if (x+xo[p] > bound || y+yo[p] > bound) - continue; - polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components); - } - } else { - // switch to direct sampling instead, for efficiency/compatibility - for (int yy = y; yy <= bound && yy <= y+1; yy++) { - for (int xx = x; xx <= bound && xx <= x+1; xx++) - polar_sample(sc, scaler, xx, yy, -1, -1, components); - } - } - } - } - - GLSL(color = color / vec4(wsum);) - GLSLF("}\n"); -} - -// bw/bh: block size -// iw/ih: input size (pre-calculated to fit all required texels) -void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, - int components, int bw, int bh, int iw, int ih) -{ - int bound = ceil(scaler->kernel->radius_cutoff); - int offset = bound - 1; // padding top/left - - GLSL(color = vec4(0.0);) - GLSLF("{\n"); - GLSL(vec2 wpos = texmap(gl_WorkGroupID * gl_WorkGroupSize);) - GLSL(vec2 wbase = wpos - pt * fract(wpos * size - vec2(0.5));) - GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) - GLSL(vec2 base = pos - pt * fcoord;) - GLSL(ivec2 rel = ivec2(round((base - wbase) * size));) - GLSLF("float w, d, wsum = 0.0;\n"); - gl_sc_uniform_texture(sc, "lut", scaler->lut); - - // Load all relevant texels into shmem - gl_sc_enable_extension(sc, "GL_ARB_arrays_of_arrays"); - for (int c = 0; c < components; c++) - GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw); - - GLSL(vec4 c;) - GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) 
{\n", ih, bh); - GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw); - GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset); - for (int c = 0; c < components; c++) - GLSLF("in%d[y][x] = c[%d];\n", c, c); - GLSLF("}}\n"); - GLSL(groupMemoryBarrier();) - GLSL(barrier();) - - // Dispatch the actual samples - GLSLF("// scaler samples\n"); - for (int y = 1-bound; y <= bound; y++) { - for (int x = 1-bound; x <= bound; x++) - polar_sample(sc, scaler, x, y, -1, offset, components); - } - - GLSL(color = color / vec4(wsum);) - GLSLF("}\n"); -} - -static void bicubic_calcweights(struct gl_shader_cache *sc, const char *t, const char *s) -{ - // Explanation of how bicubic scaling with only 4 texel fetches is done: - // http://www.mate.tue.nl/mate/pdfs/10318.pdf - // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines' - // Explanation why this algorithm normally always blurs, even with unit - // scaling: - // http://bigwww.epfl.ch/preprints/ruijters1001p.pdf - // 'GPU Prefilter for Accurate Cubic B-spline Interpolation' - GLSLF("vec4 %s = vec4(-0.5, 0.1666, 0.3333, -0.3333) * %s" - " + vec4(1, 0, -0.5, 0.5);\n", t, s); - GLSLF("%s = %s * %s + vec4(0, 0, -0.5, 0.5);\n", t, t, s); - GLSLF("%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666);\n", t, t, s); - GLSLF("%s.xy *= vec2(1, 1) / vec2(%s.z, %s.w);\n", t, t, t); - GLSLF("%s.xy += vec2(1.0 + %s, 1.0 - %s);\n", t, s, s); -} - -void pass_sample_bicubic_fast(struct gl_shader_cache *sc) -{ - GLSLF("{\n"); - GLSL(vec2 fcoord = fract(pos * size + vec2(0.5, 0.5));) - bicubic_calcweights(sc, "parmx", "fcoord.x"); - bicubic_calcweights(sc, "parmy", "fcoord.y"); - GLSL(vec4 cdelta;) - GLSL(cdelta.xz = parmx.rg * vec2(-pt.x, pt.x);) - GLSL(cdelta.yw = parmy.rg * vec2(-pt.y, pt.y);) - // first y-interpolation - GLSL(vec4 ar = texture(tex, pos + cdelta.xy);) - GLSL(vec4 ag = texture(tex, pos + cdelta.xw);) - GLSL(vec4 ab = mix(ag, ar, parmy.b);) - // second y-interpolation - GLSL(vec4 br = texture(tex, pos + cdelta.zy);) - GLSL(vec4 bg = texture(tex, pos + cdelta.zw);) - GLSL(vec4 aa = mix(bg, br, parmy.b);) - // x-interpolation - GLSL(color = mix(aa, ab, parmx.b);) - GLSLF("}\n"); -} - -void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler, - int w, int h) -{ - GLSLF("{\n"); - GLSL(vec2 pos = pos - vec2(0.5) * pt;) // round to nearest - GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) - // Determine the mixing coefficient vector - gl_sc_uniform_vec2(sc, "output_size", (float[2]){w, h}); - GLSL(vec2 coeff = fcoord * output_size/size;) - float threshold = scaler->conf.kernel.params[0]; - threshold = isnan(threshold) ? 
0.0 : threshold; - GLSLF("coeff = (coeff - %f) * 1.0/%f;\n", threshold, 1.0 - 2 * threshold); - GLSL(coeff = clamp(coeff, 0.0, 1.0);) - // Compute the right blend of colors - GLSL(color = texture(tex, pos + pt * (coeff - fcoord));) - GLSLF("}\n"); -} - -// Common constants for SMPTE ST.2084 (HDR) -static const float PQ_M1 = 2610./4096 * 1./4, - PQ_M2 = 2523./4096 * 128, - PQ_C1 = 3424./4096, - PQ_C2 = 2413./4096 * 32, - PQ_C3 = 2392./4096 * 32; - -// Common constants for ARIB STD-B67 (HLG) -static const float HLG_A = 0.17883277, - HLG_B = 0.28466892, - HLG_C = 0.55991073; - -// Common constants for Panasonic V-Log -static const float VLOG_B = 0.00873, - VLOG_C = 0.241514, - VLOG_D = 0.598206; - -// Common constants for Sony S-Log -static const float SLOG_A = 0.432699, - SLOG_B = 0.037584, - SLOG_C = 0.616596 + 0.03, - SLOG_P = 3.538813, - SLOG_Q = 0.030001, - SLOG_K2 = 155.0 / 219.0; - -// Linearize (expand), given a TRC as input. In essence, this is the ITU-R -// EOTF, calculated on an idealized (reference) monitor with a white point of -// MP_REF_WHITE and infinite contrast. -void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc) -{ - if (trc == MP_CSP_TRC_LINEAR) - return; - - GLSLF("// linearize\n"); - - // Note that this clamp may technically violate the definition of - // ITU-R BT.2100, which allows for sub-blacks and super-whites to be - // displayed on the display where such would be possible. That said, the - // problem is that not all gamma curves are well-defined on the values - // outside this range, so we ignore it and just clip anyway for sanity. - GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) - - switch (trc) { - case MP_CSP_TRC_SRGB: - GLSL(color.rgb = mix(color.rgb * vec3(1.0/12.92), - pow((color.rgb + vec3(0.055))/vec3(1.055), vec3(2.4)), - lessThan(vec3(0.04045), color.rgb));) - break; - case MP_CSP_TRC_BT_1886: - GLSL(color.rgb = pow(color.rgb, vec3(2.4));) - break; - case MP_CSP_TRC_GAMMA18: - GLSL(color.rgb = pow(color.rgb, vec3(1.8));) - break; - case MP_CSP_TRC_GAMMA22: - GLSL(color.rgb = pow(color.rgb, vec3(2.2));) - break; - case MP_CSP_TRC_GAMMA28: - GLSL(color.rgb = pow(color.rgb, vec3(2.8));) - break; - case MP_CSP_TRC_PRO_PHOTO: - GLSL(color.rgb = mix(color.rgb * vec3(1.0/16.0), - pow(color.rgb, vec3(1.8)), - lessThan(vec3(0.03125), color.rgb));) - break; - case MP_CSP_TRC_PQ: - GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", PQ_M2); - GLSLF("color.rgb = max(color.rgb - vec3(%f), vec3(0.0)) \n" - " / (vec3(%f) - vec3(%f) * color.rgb);\n", - PQ_C1, PQ_C2, PQ_C3); - GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", PQ_M1); - // PQ's output range is 0-10000, but we need it to be relative to to - // MP_REF_WHITE instead, so rescale - GLSLF("color.rgb *= vec3(%f);\n", 10000 / MP_REF_WHITE); - break; - case MP_CSP_TRC_HLG: - GLSLF("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb,\n" - " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) + vec3(%f),\n" - " lessThan(vec3(0.5), color.rgb));\n", - HLG_C, HLG_A, HLG_B); - break; - case MP_CSP_TRC_V_LOG: - GLSLF("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n" - " pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" - " - vec3(%f), \n" - " lessThanEqual(vec3(0.181), color.rgb)); \n", - VLOG_D, VLOG_C, VLOG_B); - break; - case MP_CSP_TRC_S_LOG1: - GLSLF("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f))\n" - " - vec3(%f);\n", - SLOG_C, SLOG_A, SLOG_B); - break; - case MP_CSP_TRC_S_LOG2: - GLSLF("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f), \n" - " 
(pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" - " - vec3(%f)) * vec3(1.0/%f), \n" - " lessThanEqual(vec3(%f), color.rgb)); \n", - SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q); - break; - default: - abort(); - } - - // Rescale to prevent clipping on non-float textures - GLSLF("color.rgb *= vec3(1.0/%f);\n", mp_trc_nom_peak(trc)); -} - -// Delinearize (compress), given a TRC as output. This corresponds to the -// inverse EOTF (not the OETF) in ITU-R terminology, again assuming a -// reference monitor. -void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc) -{ - if (trc == MP_CSP_TRC_LINEAR) - return; - - GLSLF("// delinearize\n"); - GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) - GLSLF("color.rgb *= vec3(%f);\n", mp_trc_nom_peak(trc)); - - switch (trc) { - case MP_CSP_TRC_SRGB: - GLSL(color.rgb = mix(color.rgb * vec3(12.92), - vec3(1.055) * pow(color.rgb, vec3(1.0/2.4)) - - vec3(0.055), - lessThanEqual(vec3(0.0031308), color.rgb));) - break; - case MP_CSP_TRC_BT_1886: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.4));) - break; - case MP_CSP_TRC_GAMMA18: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.8));) - break; - case MP_CSP_TRC_GAMMA22: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.2));) - break; - case MP_CSP_TRC_GAMMA28: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.8));) - break; - case MP_CSP_TRC_PRO_PHOTO: - GLSL(color.rgb = mix(color.rgb * vec3(16.0), - pow(color.rgb, vec3(1.0/1.8)), - lessThanEqual(vec3(0.001953), color.rgb));) - break; - case MP_CSP_TRC_PQ: - GLSLF("color.rgb *= vec3(1.0/%f);\n", 10000 / MP_REF_WHITE); - GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", PQ_M1); - GLSLF("color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" - " / (vec3(1.0) + vec3(%f) * color.rgb);\n", - PQ_C1, PQ_C2, PQ_C3); - GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", PQ_M2); - break; - case MP_CSP_TRC_HLG: - GLSLF("color.rgb = mix(vec3(0.5) * sqrt(color.rgb),\n" - " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f),\n" - " lessThan(vec3(1.0), color.rgb));\n", - HLG_A, HLG_B, HLG_C); - break; - case MP_CSP_TRC_V_LOG: - GLSLF("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125), \n" - " vec3(%f) * log(color.rgb + vec3(%f)) \n" - " + vec3(%f), \n" - " lessThanEqual(vec3(0.01), color.rgb)); \n", - VLOG_C / M_LN10, VLOG_B, VLOG_D); - break; - case MP_CSP_TRC_S_LOG1: - GLSLF("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n", - SLOG_A / M_LN10, SLOG_B, SLOG_C); - break; - case MP_CSP_TRC_S_LOG2: - GLSLF("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f), \n" - " vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n" - " + vec3(%f), \n" - " lessThanEqual(vec3(0.0), color.rgb)); \n", - SLOG_P, SLOG_Q, SLOG_A / M_LN10, SLOG_K2, SLOG_B, SLOG_C); - break; - default: - abort(); - } -} - -// Apply the OOTF mapping from a given light type to display-referred light. 
-// The extra peak parameter is used to scale the values before and after -// the OOTF, and can be inferred using mp_trc_nom_peak -void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak) -{ - if (light == MP_CSP_LIGHT_DISPLAY) - return; - - GLSLF("// apply ootf\n"); - GLSLF("color.rgb *= vec3(%f);\n", peak); - - switch (light) - { - case MP_CSP_LIGHT_SCENE_HLG: - // HLG OOTF from BT.2100, assuming a reference display with a - // peak of 1000 cd/m² -> gamma = 1.2 - GLSLF("color.rgb *= vec3(%f * pow(dot(src_luma, color.rgb), 0.2));\n", - (1000 / MP_REF_WHITE) / pow(12, 1.2)); - break; - case MP_CSP_LIGHT_SCENE_709_1886: - // This OOTF is defined by encoding the result as 709 and then decoding - // it as 1886; although this is called 709_1886 we actually use the - // more precise (by one decimal) values from BT.2020 instead - GLSL(color.rgb = mix(color.rgb * vec3(4.5), - vec3(1.0993) * pow(color.rgb, vec3(0.45)) - vec3(0.0993), - lessThan(vec3(0.0181), color.rgb));) - GLSL(color.rgb = pow(color.rgb, vec3(2.4));) - break; - case MP_CSP_LIGHT_SCENE_1_2: - GLSL(color.rgb = pow(color.rgb, vec3(1.2));) - break; - default: - abort(); - } - - GLSLF("color.rgb *= vec3(1.0/%f);\n", peak); -} - -// Inverse of the function pass_ootf, for completeness' sake. -void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak) -{ - if (light == MP_CSP_LIGHT_DISPLAY) - return; - - GLSLF("// apply inverse ootf\n"); - GLSLF("color.rgb *= vec3(%f);\n", peak); - - switch (light) - { - case MP_CSP_LIGHT_SCENE_HLG: - GLSLF("color.rgb *= vec3(1.0/%f);\n", (1000 / MP_REF_WHITE) / pow(12, 1.2)); - GLSL(color.rgb /= vec3(max(1e-6, pow(dot(src_luma, color.rgb), 0.2/1.2)));) - break; - case MP_CSP_LIGHT_SCENE_709_1886: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.4));) - GLSL(color.rgb = mix(color.rgb * vec3(1.0/4.5), - pow((color.rgb + vec3(0.0993)) * vec3(1.0/1.0993), - vec3(1/0.45)), - lessThan(vec3(0.08145), color.rgb));) - break; - case MP_CSP_LIGHT_SCENE_1_2: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.2));) - break; - default: - abort(); - } - - GLSLF("color.rgb *= vec3(1.0/%f);\n", peak); -} - -// Tone map from a known peak brightness to the range [0,1]. If ref_peak -// is 0, we will use peak detection instead -static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak, - enum tone_mapping algo, float param, float desat) -{ - GLSLF("// HDR tone mapping\n"); - - // Desaturate the color using a coefficient dependent on the luminance - GLSL(float luma = dot(dst_luma, color.rgb);) - if (desat > 0) { - GLSLF("float overbright = max(luma - %f, 1e-6) / max(luma, 1e-6);\n", desat); - GLSL(color.rgb = mix(color.rgb, vec3(luma), overbright);) - } - - // To prevent discoloration due to out-of-bounds clipping, we need to make - // sure to reduce the value range as far as necessary to keep the entire - // signal in range, so tone map based on the brightest component. - GLSL(float sig = max(max(color.r, color.g), color.b);) - GLSL(float sig_orig = sig;) - - if (!ref_peak) { - // For performance, we want to do as few atomic operations on global - // memory as possible, so use an atomic in shmem for the work group. 
- // We also want slightly more stable values, so use the group average - // instead of the group max - GLSLHF("shared uint group_sum = 0;\n"); - GLSLF("atomicAdd(group_sum, uint(sig * %f));\n", MP_REF_WHITE); - - // Have one thread in each work group update the frame maximum - GLSL(memoryBarrierBuffer();) - GLSL(barrier();) - GLSL(if (gl_LocalInvocationIndex == 0)) - GLSL(atomicMax(frame_max[index], group_sum / - (gl_WorkGroupSize.x * gl_WorkGroupSize.y));) - - // Finally, have one thread per invocation update the total maximum - // and advance the index - GLSL(memoryBarrierBuffer();) - GLSL(barrier();) - GLSL(if (gl_GlobalInvocationID == ivec3(0)) {) // do this once per invocation - GLSLF("uint next = (index + 1) %% %d;\n", PEAK_DETECT_FRAMES+1); - GLSLF("sig_peak_raw = sig_peak_raw + frame_max[index] - frame_max[next];\n"); - GLSLF("frame_max[next] = %d;\n", (int)MP_REF_WHITE); - GLSL(index = next;) - GLSL(}) - - GLSL(memoryBarrierBuffer();) - GLSL(barrier();) - GLSLF("float sig_peak = 1.0/%f * float(sig_peak_raw);\n", - MP_REF_WHITE * PEAK_DETECT_FRAMES); - } else { - GLSLHF("const float sig_peak = %f;\n", ref_peak); - } - - switch (algo) { - case TONE_MAPPING_CLIP: - GLSLF("sig = %f * sig;\n", isnan(param) ? 1.0 : param); - break; - - case TONE_MAPPING_MOBIUS: - GLSLF("const float j = %f;\n", isnan(param) ? 0.3 : param); - // solve for M(j) = j; M(sig_peak) = 1.0; M'(j) = 1.0 - // where M(x) = scale * (x+a)/(x+b) - GLSLF("float a = -j*j * (sig_peak - 1.0) / (j*j - 2.0*j + sig_peak);\n"); - GLSLF("float b = (j*j - 2.0*j*sig_peak + sig_peak) / " - "max(1e-6, sig_peak - 1.0);\n"); - GLSLF("float scale = (b*b + 2.0*b*j + j*j) / (b-a);\n"); - GLSL(sig = mix(sig, scale * (sig + a) / (sig + b), sig > j);) - break; - - case TONE_MAPPING_REINHARD: { - float contrast = isnan(param) ? 0.5 : param, - offset = (1.0 - contrast) / contrast; - GLSLF("sig = sig / (sig + %f);\n", offset); - GLSLF("float scale = (sig_peak + %f) / sig_peak;\n", offset); - GLSL(sig *= scale;) - break; - } - - case TONE_MAPPING_HABLE: { - float A = 0.15, B = 0.50, C = 0.10, D = 0.20, E = 0.02, F = 0.30; - GLSLHF("float hable(float x) {\n"); - GLSLHF("return ((x * (%f*x + %f)+%f)/(x * (%f*x + %f) + %f)) - %f;\n", - A, C*B, D*E, A, B, D*F, E/F); - GLSLHF("}\n"); - GLSL(sig = hable(sig) / hable(sig_peak);) - break; - } - - case TONE_MAPPING_GAMMA: { - float gamma = isnan(param) ? 1.8 : param; - GLSLF("const float cutoff = 0.05, gamma = %f;\n", 1.0/gamma); - GLSL(float scale = pow(cutoff / sig_peak, gamma) / cutoff;) - GLSL(sig = sig > cutoff ? pow(sig / sig_peak, gamma) : scale * sig;) - break; - } - - case TONE_MAPPING_LINEAR: { - float coeff = isnan(param) ? 1.0 : param; - GLSLF("sig = %f / sig_peak * sig;\n", coeff); - break; - } - - default: - abort(); - } - - // Apply the computed scale factor to the color, linearly to prevent - // discoloration - GLSL(color.rgb *= sig / sig_orig;) -} - -// Map colors from one source space to another. These source spaces must be -// known (i.e. not MP_CSP_*_AUTO), as this function won't perform any -// auto-guessing. If is_linear is true, we assume the input has already been -// linearized (e.g. for linear-scaling). If `detect_peak` is true, we will -// detect the peak instead of relying on metadata. 
Note that this requires -// the caller to have already bound the appropriate SSBO and set up the -// compute shader metadata -void pass_color_map(struct gl_shader_cache *sc, - struct mp_colorspace src, struct mp_colorspace dst, - enum tone_mapping algo, float tone_mapping_param, - float tone_mapping_desat, bool detect_peak, - bool gamut_warning, bool is_linear) -{ - GLSLF("// color mapping\n"); - - // Compute the highest encodable level - float src_range = mp_trc_nom_peak(src.gamma), - dst_range = mp_trc_nom_peak(dst.gamma); - float ref_peak = src.sig_peak / dst_range; - - // Some operations need access to the video's luma coefficients, so make - // them available - float rgb2xyz[3][3]; - mp_get_rgb2xyz_matrix(mp_get_csp_primaries(src.primaries), rgb2xyz); - gl_sc_uniform_vec3(sc, "src_luma", rgb2xyz[1]); - mp_get_rgb2xyz_matrix(mp_get_csp_primaries(dst.primaries), rgb2xyz); - gl_sc_uniform_vec3(sc, "dst_luma", rgb2xyz[1]); - - // All operations from here on require linear light as a starting point, - // so we linearize even if src.gamma == dst.gamma when one of the other - // operations needs it - bool need_gamma = src.gamma != dst.gamma || - src.primaries != dst.primaries || - src_range != dst_range || - src.sig_peak > dst_range || - src.light != dst.light; - - if (need_gamma && !is_linear) { - pass_linearize(sc, src.gamma); - is_linear= true; - } - - if (src.light != dst.light) - pass_ootf(sc, src.light, mp_trc_nom_peak(src.gamma)); - - // Rescale the signal to compensate for differences in the encoding range - // and reference white level. This is necessary because of how mpv encodes - // brightness in textures. - if (src_range != dst_range) { - GLSLF("// rescale value range;\n"); - GLSLF("color.rgb *= vec3(%f);\n", src_range / dst_range); - } - - // Adapt to the right colorspace if necessary - if (src.primaries != dst.primaries) { - struct mp_csp_primaries csp_src = mp_get_csp_primaries(src.primaries), - csp_dst = mp_get_csp_primaries(dst.primaries); - float m[3][3] = {{0}}; - mp_get_cms_matrix(csp_src, csp_dst, MP_INTENT_RELATIVE_COLORIMETRIC, m); - gl_sc_uniform_mat3(sc, "cms_matrix", true, &m[0][0]); - GLSL(color.rgb = cms_matrix * color.rgb;) - // Since this can reduce the gamut, figure out by how much - for (int c = 0; c < 3; c++) - ref_peak = MPMAX(ref_peak, m[c][c]); - } - - // Tone map to prevent clipping when the source signal peak exceeds the - // encodable range or we've reduced the gamut - if (ref_peak > 1) { - pass_tone_map(sc, detect_peak ? 0 : ref_peak, algo, - tone_mapping_param, tone_mapping_desat); - } - - if (src.light != dst.light) - pass_inverse_ootf(sc, dst.light, mp_trc_nom_peak(dst.gamma)); - - // Warn for remaining out-of-gamut colors is enabled - if (gamut_warning) { - GLSL(if (any(greaterThan(color.rgb, vec3(1.01))))) - GLSL(color.rgb = vec3(1.0) - color.rgb;) // invert - } - - if (is_linear) - pass_delinearize(sc, dst.gamma); -} - -// Wide usage friendly PRNG, shamelessly stolen from a GLSL tricks forum post. -// Obtain random numbers by calling rand(h), followed by h = permute(h) to -// update the state. Assumes the texture was hooked. 
-static void prng_init(struct gl_shader_cache *sc, AVLFG *lfg) -{ - GLSLH(float mod289(float x) { return x - floor(x * 1.0/289.0) * 289.0; }) - GLSLH(float permute(float x) { return mod289((34.0*x + 1.0) * x); }) - GLSLH(float rand(float x) { return fract(x * 1.0/41.0); }) - - // Initialize the PRNG by hashing the position + a random uniform - GLSL(vec3 _m = vec3(HOOKED_pos, random) + vec3(1.0);) - GLSL(float h = permute(permute(permute(_m.x)+_m.y)+_m.z);) - gl_sc_uniform_f(sc, "random", (double)av_lfg_get(lfg) / UINT32_MAX); -} - -struct deband_opts { - int enabled; - int iterations; - float threshold; - float range; - float grain; -}; - -const struct deband_opts deband_opts_def = { - .iterations = 1, - .threshold = 64.0, - .range = 16.0, - .grain = 48.0, -}; - -#define OPT_BASE_STRUCT struct deband_opts -const struct m_sub_options deband_conf = { - .opts = (const m_option_t[]) { - OPT_INTRANGE("iterations", iterations, 0, 1, 16), - OPT_FLOATRANGE("threshold", threshold, 0, 0.0, 4096.0), - OPT_FLOATRANGE("range", range, 0, 1.0, 64.0), - OPT_FLOATRANGE("grain", grain, 0, 0.0, 4096.0), - {0} - }, - .size = sizeof(struct deband_opts), - .defaults = &deband_opts_def, -}; - -// Stochastically sample a debanded result from a hooked texture. -void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts, - AVLFG *lfg, enum mp_csp_trc trc) -{ - // Initialize the PRNG - GLSLF("{\n"); - prng_init(sc, lfg); - - // Helper: Compute a stochastic approximation of the avg color around a - // pixel - GLSLHF("vec4 average(float range, inout float h) {\n"); - // Compute a random rangle and distance - GLSLH(float dist = rand(h) * range; h = permute(h);) - GLSLH(float dir = rand(h) * 6.2831853; h = permute(h);) - GLSLH(vec2 o = dist * vec2(cos(dir), sin(dir));) - - // Sample at quarter-turn intervals around the source pixel - GLSLH(vec4 ref[4];) - GLSLH(ref[0] = HOOKED_texOff(vec2( o.x, o.y));) - GLSLH(ref[1] = HOOKED_texOff(vec2(-o.y, o.x));) - GLSLH(ref[2] = HOOKED_texOff(vec2(-o.x, -o.y));) - GLSLH(ref[3] = HOOKED_texOff(vec2( o.y, -o.x));) - - // Return the (normalized) average - GLSLH(return (ref[0] + ref[1] + ref[2] + ref[3])*0.25;) - GLSLHF("}\n"); - - // Sample the source pixel - GLSL(color = HOOKED_tex(HOOKED_pos);) - GLSLF("vec4 avg, diff;\n"); - for (int i = 1; i <= opts->iterations; i++) { - // Sample the average pixel and use it instead of the original if - // the difference is below the given threshold - GLSLF("avg = average(%f, h);\n", i * opts->range); - GLSL(diff = abs(color - avg);) - GLSLF("color = mix(avg, color, greaterThan(diff, vec4(%f)));\n", - opts->threshold / (i * 16384.0)); - } - - // Add some random noise to smooth out residual differences - GLSL(vec3 noise;) - GLSL(noise.x = rand(h); h = permute(h);) - GLSL(noise.y = rand(h); h = permute(h);) - GLSL(noise.z = rand(h); h = permute(h);) - - // Noise is scaled to the signal level to prevent extreme noise for HDR - float gain = opts->grain/8192.0 / mp_trc_nom_peak(trc); - GLSLF("color.xyz += %f * (noise - vec3(0.5));\n", gain); - GLSLF("}\n"); -} - -// Assumes the texture was hooked -void pass_sample_unsharp(struct gl_shader_cache *sc, float param) { - GLSLF("{\n"); - GLSL(float st1 = 1.2;) - GLSL(vec4 p = HOOKED_tex(HOOKED_pos);) - GLSL(vec4 sum1 = HOOKED_texOff(st1 * vec2(+1, +1)) - + HOOKED_texOff(st1 * vec2(+1, -1)) - + HOOKED_texOff(st1 * vec2(-1, +1)) - + HOOKED_texOff(st1 * vec2(-1, -1));) - GLSL(float st2 = 1.5;) - GLSL(vec4 sum2 = HOOKED_texOff(st2 * vec2(+1, 0)) - + HOOKED_texOff(st2 * vec2( 0, +1)) - + 
HOOKED_texOff(st2 * vec2(-1, 0)) - + HOOKED_texOff(st2 * vec2( 0, -1));) - GLSL(vec4 t = p * 0.859375 + sum2 * -0.1171875 + sum1 * -0.09765625;) - GLSLF("color = p + t * %f;\n", param); - GLSLF("}\n"); -} diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h deleted file mode 100644 index 8345e4c598..0000000000 --- a/video/out/opengl/video_shaders.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#ifndef MP_GL_VIDEO_SHADERS_H -#define MP_GL_VIDEO_SHADERS_H - -#include - -#include "utils.h" -#include "video.h" - -extern const struct deband_opts deband_opts_def; -extern const struct m_sub_options deband_conf; - -void sampler_prelude(struct gl_shader_cache *sc, int tex_num); -void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler, - int d_x, int d_y); -void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, - int components, int glsl_version); -void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, - int components, int bw, int bh, int iw, int ih); -void pass_sample_bicubic_fast(struct gl_shader_cache *sc); -void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler, - int w, int h); - -void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc); -void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc); -void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak); -void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak); - -void pass_color_map(struct gl_shader_cache *sc, - struct mp_colorspace src, struct mp_colorspace dst, - enum tone_mapping algo, float tone_mapping_param, - float tone_mapping_desat, bool use_detected_peak, - bool gamut_warning, bool is_linear); - -void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts, - AVLFG *lfg, enum mp_csp_trc trc); - -void pass_sample_unsharp(struct gl_shader_cache *sc, float param); - -#endif diff --git a/video/out/vo.c b/video/out/vo.c index f9c5d04e24..a40360b188 100644 --- a/video/out/vo.c +++ b/video/out/vo.c @@ -50,6 +50,7 @@ extern const struct vo_driver video_out_x11; extern const struct vo_driver video_out_vdpau; extern const struct vo_driver video_out_xv; +extern const struct vo_driver video_out_gpu; extern const struct vo_driver video_out_opengl; extern const struct vo_driver video_out_opengl_cb; extern const struct vo_driver video_out_null; @@ -69,8 +70,8 @@ const struct vo_driver *const video_out_drivers[] = #if HAVE_RPI &video_out_rpi, #endif -#if HAVE_GL - &video_out_opengl, +#if HAVE_GPU + &video_out_gpu, #endif #if HAVE_VDPAU &video_out_vdpau, @@ -107,6 +108,7 @@ const struct vo_driver *const video_out_drivers[] = &video_out_lavc, #endif #if HAVE_GL + &video_out_opengl, &video_out_opengl_cb, #endif NULL diff --git a/video/out/vo_gpu.c b/video/out/vo_gpu.c new file mode 100644 index 
0000000000..5df9e06f47 --- /dev/null +++ b/video/out/vo_gpu.c @@ -0,0 +1,385 @@ +/* + * Based on vo_gl.c by Reimar Doeffinger. + * + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "config.h" + +#include "mpv_talloc.h" +#include "common/common.h" +#include "misc/bstr.h" +#include "common/msg.h" +#include "common/global.h" +#include "options/m_config.h" +#include "vo.h" +#include "video/mp_image.h" +#include "sub/osd.h" + +#include "gpu/context.h" +#include "gpu/hwdec.h" +#include "gpu/video.h" + +struct gpu_priv { + struct vo *vo; + struct mp_log *log; + struct ra_ctx *ctx; + + char *context_name; + char *context_type; + struct ra_ctx_opts opts; + struct gl_video *renderer; + struct ra_hwdec *hwdec; + + int events; +}; + +static void resize(struct gpu_priv *p) +{ + struct vo *vo = p->vo; + + MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); + + struct mp_rect src, dst; + struct mp_osd_res osd; + vo_get_src_dst_rects(vo, &src, &dst, &osd); + + gl_video_resize(p->renderer, &src, &dst, &osd); + + vo->want_redraw = true; +} + +static void draw_frame(struct vo *vo, struct vo_frame *frame) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + + struct ra_tex *tex = sw->fns->start_frame(sw); + if (!tex) { + MP_ERR(vo, "Failed starting frame!\n"); + return; + } + + struct fbodst dst = { + .tex = tex, + .flip = sw->flip_v, + }; + + gl_video_render_frame(p->renderer, frame, dst); + if (!sw->fns->submit_frame(sw, frame)) { + MP_ERR(vo, "Failed presenting frame!\n"); + return; + } +} + +static void flip_page(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + sw->fns->swap_buffers(sw); +} + +static int query_format(struct vo *vo, int format) +{ + struct gpu_priv *p = vo->priv; + if (!gl_video_check_format(p->renderer, format)) + return 0; + return 1; +} + +static int reconfig(struct vo *vo, struct mp_image_params *params) +{ + struct gpu_priv *p = vo->priv; + + if (!p->ctx->fns->reconfig(p->ctx)) + return -1; + + resize(p); + gl_video_config(p->renderer, params); + + return 0; +} + +static void request_hwdec_api(struct vo *vo, void *api) +{ + struct gpu_priv *p = vo->priv; + + if (p->hwdec) + return; + + p->hwdec = ra_hwdec_load_api(p->vo->log, p->ctx->ra, p->vo->global, + vo->hwdec_devs, (intptr_t)api); + gl_video_set_hwdec(p->renderer, p->hwdec); +} + +static void call_request_hwdec_api(void *ctx, enum hwdec_type type) +{ + // Roundabout way to run hwdec loading on the VO thread. + // Redirects to request_hwdec_api(). 
+ vo_control(ctx, VOCTRL_LOAD_HWDEC_API, (void *)(intptr_t)type); +} + +static void get_and_update_icc_profile(struct gpu_priv *p) +{ + if (gl_video_icc_auto_enabled(p->renderer)) { + MP_VERBOSE(p, "Querying ICC profile...\n"); + bstr icc = bstr0(NULL); + int r = p->ctx->fns->control(p->ctx, &p->events, VOCTRL_GET_ICC_PROFILE, &icc); + + if (r != VO_NOTAVAIL) { + if (r == VO_FALSE) { + MP_WARN(p, "Could not retrieve an ICC profile.\n"); + } else if (r == VO_NOTIMPL) { + MP_ERR(p, "icc-profile-auto not implemented on this platform.\n"); + } + + gl_video_set_icc_profile(p->renderer, icc); + } + } +} + +static void get_and_update_ambient_lighting(struct gpu_priv *p) +{ + int lux; + int r = p->ctx->fns->control(p->ctx, &p->events, VOCTRL_GET_AMBIENT_LUX, &lux); + if (r == VO_TRUE) { + gl_video_set_ambient_lux(p->renderer, lux); + } + if (r != VO_TRUE && gl_video_gamma_auto_enabled(p->renderer)) { + MP_ERR(p, "gamma_auto option provided, but querying for ambient" + " lighting is not supported on this platform\n"); + } +} + +static int control(struct vo *vo, uint32_t request, void *data) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + + switch (request) { + case VOCTRL_SET_PANSCAN: + resize(p); + return VO_TRUE; + case VOCTRL_SET_EQUALIZER: + vo->want_redraw = true; + return VO_TRUE; + case VOCTRL_SCREENSHOT_WIN: { + struct mp_image *screen = NULL; + if (sw->fns->screenshot) + screen = sw->fns->screenshot(sw); + if (!screen) + break; // redirect to backend + // set image parameters according to the display, if possible + screen->params.color = gl_video_get_output_colorspace(p->renderer); + *(struct mp_image **)data = screen; + return true; + } + case VOCTRL_LOAD_HWDEC_API: + request_hwdec_api(vo, data); + return true; + case VOCTRL_UPDATE_RENDER_OPTS: { + gl_video_update_options(p->renderer); + get_and_update_icc_profile(p); + gl_video_configure_queue(p->renderer, p->vo); + p->vo->want_redraw = true; + return true; + } + case VOCTRL_RESET: + gl_video_reset(p->renderer); + return true; + case VOCTRL_PAUSE: + if (gl_video_showing_interpolated_frame(p->renderer)) + vo->want_redraw = true; + return true; + case VOCTRL_PERFORMANCE_DATA: + gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data); + return true; + } + + int events = 0; + int r = p->ctx->fns->control(p->ctx, &events, request, data); + if (events & VO_EVENT_ICC_PROFILE_CHANGED) { + get_and_update_icc_profile(p); + vo->want_redraw = true; + } + if (events & VO_EVENT_AMBIENT_LIGHTING_CHANGED) { + get_and_update_ambient_lighting(p); + vo->want_redraw = true; + } + events |= p->events; + p->events = 0; + if (events & VO_EVENT_RESIZE) + resize(p); + if (events & VO_EVENT_EXPOSE) + vo->want_redraw = true; + vo_event(vo, events); + + return r; +} + +static void wakeup(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + if (p->ctx && p->ctx->fns->wakeup) + p->ctx->fns->wakeup(p->ctx); +} + +static void wait_events(struct vo *vo, int64_t until_time_us) +{ + struct gpu_priv *p = vo->priv; + if (p->ctx && p->ctx->fns->wait_events) { + p->ctx->fns->wait_events(p->ctx, until_time_us); + } else { + vo_wait_default(vo, until_time_us); + } +} + +static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h, + int stride_align) +{ + struct gpu_priv *p = vo->priv; + + return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align); +} + +static void uninit(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + + gl_video_uninit(p->renderer); + ra_hwdec_uninit(p->hwdec); + if (vo->hwdec_devs) 
{ + hwdec_devices_set_loader(vo->hwdec_devs, NULL, NULL); + hwdec_devices_destroy(vo->hwdec_devs); + } + ra_ctx_destroy(&p->ctx); +} + +static int preinit(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + p->vo = vo; + p->log = vo->log; + + int alpha_mode; + mp_read_option_raw(vo->global, "alpha", &m_option_type_choice, &alpha_mode); + + struct ra_ctx_opts opts = p->opts; + opts.want_alpha = alpha_mode == 1; + + p->ctx = ra_ctx_create(vo, p->context_type, p->context_name, opts); + if (!p->ctx) + goto err_out; + assert(p->ctx->ra); + assert(p->ctx->swapchain); + struct ra_swapchain *sw = p->ctx->swapchain; + + p->renderer = gl_video_init(p->ctx->ra, vo->log, vo->global); + gl_video_set_osd_source(p->renderer, vo->osd); + gl_video_configure_queue(p->renderer, vo); + + get_and_update_icc_profile(p); + + vo->hwdec_devs = hwdec_devices_create(); + + hwdec_devices_set_loader(vo->hwdec_devs, call_request_hwdec_api, vo); + + p->hwdec = ra_hwdec_load(p->vo->log, p->ctx->ra, vo->global, + vo->hwdec_devs, vo->opts->gl_hwdec_interop); + gl_video_set_hwdec(p->renderer, p->hwdec); + + int fb_depth = sw->fns->color_depth ? sw->fns->color_depth(sw) : 0; + if (fb_depth) + MP_VERBOSE(p, "Reported display depth: %d\n", fb_depth); + gl_video_set_fb_depth(p->renderer, fb_depth); + + return 0; + +err_out: + uninit(vo); + return -1; +} + +#define OPT_BASE_STRUCT struct gpu_priv +static const m_option_t options[] = { + OPT_STRING_VALIDATE("gpu-context", context_name, 0, ra_ctx_validate_context), + OPT_STRING_VALIDATE("gpu-api", context_type, 0, ra_ctx_validate_api), + OPT_FLAG("gpu-debug", opts.debug, 0), + OPT_FLAG("gpu-sw", opts.allow_sw, 0), + OPT_INTRANGE("swapchain-depth", opts.swapchain_depth, 0, 1, 8), + {0} +}; + +static const struct gpu_priv defaults = { .opts = { + .swapchain_depth = 3, +}}; + +const struct vo_driver video_out_gpu = { + .description = "Shader-based GPU Renderer", + .name = "gpu", + .caps = VO_CAP_ROTATE90, + .preinit = preinit, + .query_format = query_format, + .reconfig = reconfig, + .control = control, + .get_image = get_image, + .draw_frame = draw_frame, + .flip_page = flip_page, + .wait_events = wait_events, + .wakeup = wakeup, + .uninit = uninit, + .priv_size = sizeof(struct gpu_priv), + .priv_defaults = &defaults, + .options = options, +}; + +static int preinit_opengl(struct vo *vo) +{ + MP_WARN(vo, "--vo=opengl was replaced by --vo=gpu --gpu-api=opengl, and will" + " be removed in the future!\n"); + + struct gpu_priv *p = vo->priv; + p->context_type = "opengl"; + return preinit(vo); +} + +const struct vo_driver video_out_opengl = { + .description = "Shader-based GPU Renderer", + .name = "opengl", + .caps = VO_CAP_ROTATE90, + .preinit = preinit_opengl, + .query_format = query_format, + .reconfig = reconfig, + .control = control, + .get_image = get_image, + .draw_frame = draw_frame, + .flip_page = flip_page, + .wait_events = wait_events, + .wakeup = wakeup, + .uninit = uninit, + .priv_size = sizeof(struct gpu_priv), + .priv_defaults = &defaults, + .options = options, +}; diff --git a/video/out/vo_opengl.c b/video/out/vo_opengl.c deleted file mode 100644 index 72691e56c2..0000000000 --- a/video/out/vo_opengl.c +++ /dev/null @@ -1,470 +0,0 @@ -/* - * Based on vo_gl.c by Reimar Doeffinger. - * - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include "config.h" - -#include "mpv_talloc.h" -#include "common/common.h" -#include "misc/bstr.h" -#include "common/msg.h" -#include "common/global.h" -#include "options/m_config.h" -#include "vo.h" -#include "video/mp_image.h" -#include "sub/osd.h" - -#include "opengl/context.h" -#include "opengl/utils.h" -#include "opengl/hwdec.h" -#include "opengl/osd.h" -#include "filter_kernels.h" -#include "video/hwdec.h" -#include "opengl/video.h" -#include "opengl/ra_gl.h" - -#define NUM_VSYNC_FENCES 10 - -struct vo_opengl_opts { - int use_glFinish; - int waitvsync; - int use_gl_debug; - int allow_sw; - int swap_interval; - int vsync_fences; - char *backend; - int es; - int pattern[2]; -}; - -struct gl_priv { - struct vo *vo; - struct mp_log *log; - MPGLContext *glctx; - GL *gl; - struct ra *ra; - - struct vo_opengl_opts opts; - - struct gl_video *renderer; - - struct ra_hwdec *hwdec; - - int events; - - int frames_rendered; - unsigned int prev_sgi_sync_count; - - // check-pattern sub-option; for testing/debugging - int last_pattern; - int matches, mismatches; - - GLsync vsync_fences[NUM_VSYNC_FENCES]; - int num_vsync_fences; -}; - -static void resize(struct gl_priv *p) -{ - struct vo *vo = p->vo; - - MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); - - struct mp_rect src, dst; - struct mp_osd_res osd; - vo_get_src_dst_rects(vo, &src, &dst, &osd); - - gl_video_resize(p->renderer, &src, &dst, &osd); - - vo->want_redraw = true; -} - -static void check_pattern(struct vo *vo, int item) -{ - struct gl_priv *p = vo->priv; - int expected = p->opts.pattern[p->last_pattern]; - if (item == expected) { - p->last_pattern++; - if (p->last_pattern >= 2) - p->last_pattern = 0; - p->matches++; - } else { - p->mismatches++; - MP_WARN(vo, "wrong pattern, expected %d got %d (hit: %d, mis: %d)\n", - expected, item, p->matches, p->mismatches); - } -} - -static void draw_frame(struct vo *vo, struct vo_frame *frame) -{ - struct gl_priv *p = vo->priv; - GL *gl = p->gl; - - mpgl_start_frame(p->glctx); - - if (gl->FenceSync && p->num_vsync_fences < p->opts.vsync_fences) { - GLsync fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);; - if (fence) - p->vsync_fences[p->num_vsync_fences++] = fence; - } - - struct fbodst target = { - .tex = ra_create_wrapped_fb(p->ra, p->glctx->main_fb, - vo->dwidth, vo->dheight), - .flip = !p->glctx->flip_v, - }; - gl_video_render_frame(p->renderer, frame, target); - ra_tex_free(p->ra, &target.tex); - - if (p->opts.use_glFinish) - gl->Finish(); -} - -static void flip_page(struct vo *vo) -{ - struct gl_priv *p = vo->priv; - GL *gl = p->gl; - - mpgl_swap_buffers(p->glctx); - - p->frames_rendered++; - if (p->frames_rendered > 5 && !p->opts.use_gl_debug) - ra_gl_set_debug(p->ra, false); - - if (p->opts.use_glFinish) - gl->Finish(); - - if (p->opts.waitvsync || p->opts.pattern[0]) { - if (gl->GetVideoSync) { - unsigned int n1 = 0, n2 = 0; - gl->GetVideoSync(&n1); - if (p->opts.waitvsync) - gl->WaitVideoSync(2, (n1 + 1) % 2, &n2); - int step = n1 - p->prev_sgi_sync_count; - p->prev_sgi_sync_count = n1; - MP_DBG(vo, "Flip counts: %u->%u, 
step=%d\n", n1, n2, step); - if (p->opts.pattern[0]) - check_pattern(vo, step); - } else { - MP_WARN(vo, "GLX_SGI_video_sync not available, disabling.\n"); - p->opts.waitvsync = 0; - p->opts.pattern[0] = 0; - } - } - while (p->opts.vsync_fences > 0 && p->num_vsync_fences >= p->opts.vsync_fences) { - gl->ClientWaitSync(p->vsync_fences[0], GL_SYNC_FLUSH_COMMANDS_BIT, 1e9); - gl->DeleteSync(p->vsync_fences[0]); - MP_TARRAY_REMOVE_AT(p->vsync_fences, p->num_vsync_fences, 0); - } -} - -static int query_format(struct vo *vo, int format) -{ - struct gl_priv *p = vo->priv; - if (!gl_video_check_format(p->renderer, format)) - return 0; - return 1; -} - -static int reconfig(struct vo *vo, struct mp_image_params *params) -{ - struct gl_priv *p = vo->priv; - - if (mpgl_reconfig_window(p->glctx) < 0) - return -1; - - resize(p); - - gl_video_config(p->renderer, params); - - return 0; -} - -static void request_hwdec_api(struct vo *vo, void *api) -{ - struct gl_priv *p = vo->priv; - - if (p->hwdec) - return; - - p->hwdec = ra_hwdec_load_api(p->vo->log, p->ra, p->vo->global, - vo->hwdec_devs, (intptr_t)api); - gl_video_set_hwdec(p->renderer, p->hwdec); -} - -static void call_request_hwdec_api(void *ctx, enum hwdec_type type) -{ - // Roundabout way to run hwdec loading on the VO thread. - // Redirects to request_hwdec_api(). - vo_control(ctx, VOCTRL_LOAD_HWDEC_API, (void *)(intptr_t)type); -} - -static void get_and_update_icc_profile(struct gl_priv *p) -{ - if (gl_video_icc_auto_enabled(p->renderer)) { - MP_VERBOSE(p, "Querying ICC profile...\n"); - bstr icc = bstr0(NULL); - int r = mpgl_control(p->glctx, &p->events, VOCTRL_GET_ICC_PROFILE, &icc); - - if (r != VO_NOTAVAIL) { - if (r == VO_FALSE) { - MP_WARN(p, "Could not retrieve an ICC profile.\n"); - } else if (r == VO_NOTIMPL) { - MP_ERR(p, "icc-profile-auto not implemented on this platform.\n"); - } - - gl_video_set_icc_profile(p->renderer, icc); - } - } -} - -static void get_and_update_ambient_lighting(struct gl_priv *p) -{ - int lux; - int r = mpgl_control(p->glctx, &p->events, VOCTRL_GET_AMBIENT_LUX, &lux); - if (r == VO_TRUE) { - gl_video_set_ambient_lux(p->renderer, lux); - } - if (r != VO_TRUE && gl_video_gamma_auto_enabled(p->renderer)) { - MP_ERR(p, "gamma_auto option provided, but querying for ambient" - " lighting is not supported on this platform\n"); - } -} - -static int control(struct vo *vo, uint32_t request, void *data) -{ - struct gl_priv *p = vo->priv; - - switch (request) { - case VOCTRL_SET_PANSCAN: - resize(p); - return VO_TRUE; - case VOCTRL_SET_EQUALIZER: - vo->want_redraw = true; - return VO_TRUE; - case VOCTRL_SCREENSHOT_WIN: { - struct mp_image *screen = gl_read_fbo_contents(p->gl, p->glctx->main_fb, - vo->dwidth, vo->dheight); - if (!screen) - break; // redirect to backend - // set image parameters according to the display, if possible - screen->params.color = gl_video_get_output_colorspace(p->renderer); - if (p->glctx->flip_v) - mp_image_vflip(screen); - *(struct mp_image **)data = screen; - return true; - } - case VOCTRL_LOAD_HWDEC_API: - request_hwdec_api(vo, data); - return true; - case VOCTRL_UPDATE_RENDER_OPTS: { - gl_video_update_options(p->renderer); - get_and_update_icc_profile(p); - gl_video_configure_queue(p->renderer, p->vo); - p->vo->want_redraw = true; - return true; - } - case VOCTRL_RESET: - gl_video_reset(p->renderer); - return true; - case VOCTRL_PAUSE: - if (gl_video_showing_interpolated_frame(p->renderer)) - vo->want_redraw = true; - return true; - case VOCTRL_PERFORMANCE_DATA: - 
-        gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data);
-        return true;
-    }
-
-    int events = 0;
-    int r = mpgl_control(p->glctx, &events, request, data);
-    if (events & VO_EVENT_ICC_PROFILE_CHANGED) {
-        get_and_update_icc_profile(p);
-        vo->want_redraw = true;
-    }
-    if (events & VO_EVENT_AMBIENT_LIGHTING_CHANGED) {
-        get_and_update_ambient_lighting(p);
-        vo->want_redraw = true;
-    }
-    events |= p->events;
-    p->events = 0;
-    if (events & VO_EVENT_RESIZE)
-        resize(p);
-    if (events & VO_EVENT_EXPOSE)
-        vo->want_redraw = true;
-    vo_event(vo, events);
-
-    return r;
-}
-
-static void wakeup(struct vo *vo)
-{
-    struct gl_priv *p = vo->priv;
-    if (p->glctx && p->glctx->driver->wakeup)
-        p->glctx->driver->wakeup(p->glctx);
-}
-
-static void wait_events(struct vo *vo, int64_t until_time_us)
-{
-    struct gl_priv *p = vo->priv;
-    if (p->glctx->driver->wait_events) {
-        p->glctx->driver->wait_events(p->glctx, until_time_us);
-    } else {
-        vo_wait_default(vo, until_time_us);
-    }
-}
-
-static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h,
-                                  int stride_align)
-{
-    struct gl_priv *p = vo->priv;
-
-    return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align);
-}
-
-static void uninit(struct vo *vo)
-{
-    struct gl_priv *p = vo->priv;
-
-    gl_video_uninit(p->renderer);
-    ra_hwdec_uninit(p->hwdec);
-    if (vo->hwdec_devs) {
-        hwdec_devices_set_loader(vo->hwdec_devs, NULL, NULL);
-        hwdec_devices_destroy(vo->hwdec_devs);
-    }
-    ra_free(&p->ra);
-    mpgl_uninit(p->glctx);
-}
-
-static int preinit(struct vo *vo)
-{
-    struct gl_priv *p = vo->priv;
-    p->vo = vo;
-    p->log = vo->log;
-
-    int vo_flags = 0;
-
-    int alpha_mode;
-    mp_read_option_raw(vo->global, "alpha", &m_option_type_choice, &alpha_mode);
-
-    if (alpha_mode == 1)
-        vo_flags |= VOFLAG_ALPHA;
-
-    if (p->opts.use_gl_debug)
-        vo_flags |= VOFLAG_GL_DEBUG;
-
-    if (p->opts.es == 1)
-        vo_flags |= VOFLAG_GLES;
-    if (p->opts.es == 2)
-        vo_flags |= VOFLAG_GLES | VOFLAG_GLES2;
-    if (p->opts.es == -1)
-        vo_flags |= VOFLAG_NO_GLES;
-
-    if (p->opts.allow_sw)
-        vo_flags |= VOFLAG_SW;
-
-    p->glctx = mpgl_init(vo, p->opts.backend, vo_flags);
-    if (!p->glctx)
-        goto err_out;
-    p->gl = p->glctx->gl;
-
-    if (p->gl->SwapInterval) {
-        p->gl->SwapInterval(p->opts.swap_interval);
-    } else {
-        MP_VERBOSE(vo, "swap_control extension missing.\n");
-    }
-
-    p->ra = ra_create_gl(p->gl, vo->log);
-    if (!p->ra)
-        goto err_out;
-
-    p->renderer = gl_video_init(p->ra, vo->log, vo->global);
-    gl_video_set_osd_source(p->renderer, vo->osd);
-    gl_video_configure_queue(p->renderer, vo);
-
-    get_and_update_icc_profile(p);
-
-    vo->hwdec_devs = hwdec_devices_create();
-
-    hwdec_devices_set_loader(vo->hwdec_devs, call_request_hwdec_api, vo);
-
-    p->hwdec = ra_hwdec_load(p->vo->log, p->ra, vo->global,
-                             vo->hwdec_devs, vo->opts->gl_hwdec_interop);
-    gl_video_set_hwdec(p->renderer, p->hwdec);
-
-    gl_check_error(p->gl, p->log, "before retrieving framebuffer depth");
-    int fb_depth = gl_get_fb_depth(p->gl, p->glctx->main_fb);
-    gl_check_error(p->gl, p->log, "retrieving framebuffer depth");
-    if (fb_depth)
-        MP_VERBOSE(p, "Reported display depth: %d\n", fb_depth);
-    gl_video_set_fb_depth(p->renderer, fb_depth);
-
-    return 0;
-
-err_out:
-    uninit(vo);
-    return -1;
-}
-
-#define OPT_BASE_STRUCT struct gl_priv
-
-const struct vo_driver video_out_opengl = {
-    .description = "Extended OpenGL Renderer",
-    .name = "opengl",
-    .caps = VO_CAP_ROTATE90,
-    .preinit = preinit,
-    .query_format = query_format,
-    .reconfig = reconfig,
-    .control = control,
-    .get_image = get_image,
-    .draw_frame = draw_frame,
-    .flip_page = flip_page,
-    .wait_events = wait_events,
-    .wakeup = wakeup,
-    .uninit = uninit,
-    .priv_size = sizeof(struct gl_priv),
-    .options = (const m_option_t[]) {
-        OPT_FLAG("opengl-glfinish", opts.use_glFinish, 0),
-        OPT_FLAG("opengl-waitvsync", opts.waitvsync, 0),
-        OPT_INT("opengl-swapinterval", opts.swap_interval, 0),
-        OPT_FLAG("opengl-debug", opts.use_gl_debug, 0),
-        OPT_STRING_VALIDATE("opengl-backend", opts.backend, 0,
-                            mpgl_validate_backend_opt),
-        OPT_FLAG("opengl-sw", opts.allow_sw, 0),
-        OPT_CHOICE("opengl-es", opts.es, 0, ({"no", -1}, {"auto", 0},
-                                             {"yes", 1}, {"force2", 2})),
-        OPT_INTPAIR("opengl-check-pattern", opts.pattern, 0),
-        OPT_INTRANGE("opengl-vsync-fences", opts.vsync_fences, 0,
-                     0, NUM_VSYNC_FENCES),
-
-        {0}
-    },
-    .priv_defaults = &(const struct gl_priv){
-        .opts = {
-            .swap_interval = 1,
-        },
-    },
-};
diff --git a/video/out/vo_opengl_cb.c b/video/out/vo_opengl_cb.c
index ea6aaa9048..7e95e8bd31 100644
--- a/video/out/vo_opengl_cb.c
+++ b/video/out/vo_opengl_cb.c
@@ -24,9 +24,10 @@
 #include "common/global.h"
 #include "player/client.h"
 
+#include "gpu/video.h"
+#include "gpu/hwdec.h"
 #include "opengl/common.h"
-#include "opengl/video.h"
-#include "opengl/hwdec.h"
+#include "opengl/context.h"
 #include "opengl/ra_gl.h"
 
 #include "libmpv/opengl_cb.h"
@@ -86,7 +87,7 @@ struct mpv_opengl_cb_context {
     // application's OpenGL context is current - i.e. only while the
    // host application is calling certain mpv_opengl_cb_* APIs.
     GL *gl;
-    struct ra *ra;
+    struct ra_ctx *ra_ctx;
     struct gl_video *renderer;
     struct ra_hwdec *hwdec;
     struct m_config_cache *vo_opts_cache;
@@ -171,16 +172,36 @@ int mpv_opengl_cb_init_gl(struct mpv_opengl_cb_context *ctx, const char *exts,
         return MPV_ERROR_UNSUPPORTED;
     }
 
-    ctx->ra = ra_create_gl(ctx->gl, ctx->log);
-    if (!ctx->ra)
+    // initialize a blank ra_ctx to reuse ra_gl_ctx
+    ctx->ra_ctx = talloc_zero(ctx, struct ra_ctx);
+    ctx->ra_ctx->log = ctx->log;
+    ctx->ra_ctx->global = ctx->global;
+    ctx->ra_ctx->opts = (struct ra_ctx_opts) {
+        .probing = false,
+        .allow_sw = true,
+    };
+
+    static const struct ra_swapchain_fns empty_swapchain_fns = {0};
+    struct ra_gl_ctx_params gl_params = {
+        // vo_opengl_cb is essentially like a gigantic external swapchain where
+        // the user is in charge of presentation / swapping etc. But we don't
But we don't + // actually need to provide any of these functions, since we can just + // not call them to begin with - so just set it to an empty object to + // signal to ra_gl_ctx that we don't care about its latency emulation + // functionality + .external_swapchain = &empty_swapchain_fns + }; + + ctx->gl->SwapInterval = NULL; // we shouldn't randomly change this, so lock it + if (!ra_gl_ctx_init(ctx->ra_ctx, ctx->gl, gl_params)) return MPV_ERROR_UNSUPPORTED; - ctx->renderer = gl_video_init(ctx->ra, ctx->log, ctx->global); + ctx->renderer = gl_video_init(ctx->ra_ctx->ra, ctx->log, ctx->global); m_config_cache_update(ctx->vo_opts_cache); ctx->hwdec_devs = hwdec_devices_create(); - ctx->hwdec = ra_hwdec_load(ctx->log, ctx->ra, ctx->global, + ctx->hwdec = ra_hwdec_load(ctx->log, ctx->ra_ctx->ra, ctx->global, ctx->hwdec_devs, ctx->vo_opts->gl_hwdec_interop); gl_video_set_hwdec(ctx->renderer, ctx->hwdec); @@ -221,7 +242,7 @@ int mpv_opengl_cb_uninit_gl(struct mpv_opengl_cb_context *ctx) ctx->hwdec = NULL; hwdec_devices_destroy(ctx->hwdec_devs); ctx->hwdec_devs = NULL; - ra_free(&ctx->ra); + ra_ctx_destroy(&ctx->ra_ctx); talloc_free(ctx->gl); ctx->gl = NULL; return 0; @@ -236,11 +257,6 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) return MPV_ERROR_UNSUPPORTED; } - struct fbodst target = { - .tex = ra_create_wrapped_fb(ctx->ra, fbo, vp_w, abs(vp_h)), - .flip = vp_h < 0, - }; - reset_gl_state(ctx->gl); pthread_mutex_lock(&ctx->lock); @@ -280,7 +296,7 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) mp_read_option_raw(ctx->global, "opengl-debug", &m_option_type_flag, &debug); ctx->gl->debug_context = debug; - ra_gl_set_debug(ctx->ra, debug); + ra_gl_set_debug(ctx->ra_ctx->ra, debug); if (gl_video_icc_auto_enabled(ctx->renderer)) MP_ERR(ctx, "icc-profile-auto is not available with opengl-cb\n"); } @@ -316,7 +332,14 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) pthread_mutex_unlock(&ctx->lock); MP_STATS(ctx, "glcb-render"); + struct ra_swapchain *sw = ctx->ra_ctx->swapchain; + ra_gl_ctx_resize(sw, vp_w, abs(vp_h), fbo); + struct fbodst target = { + .tex = ra_gl_ctx_start_frame(sw), + .flip = vp_h < 0, + }; gl_video_render_frame(ctx->renderer, frame, target); + ra_gl_ctx_submit_frame(sw, frame); reset_gl_state(ctx->gl); @@ -328,8 +351,6 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) pthread_cond_wait(&ctx->wakeup, &ctx->lock); pthread_mutex_unlock(&ctx->lock); - ra_tex_free(ctx->ra, &target.tex); - return 0; } diff --git a/video/out/vo_rpi.c b/video/out/vo_rpi.c index 5b5d62c78f..8b819af163 100644 --- a/video/out/vo_rpi.c +++ b/video/out/vo_rpi.c @@ -44,7 +44,7 @@ #include "sub/osd.h" #include "opengl/ra_gl.h" -#include "opengl/video.h" +#include "gpu/video.h" struct mp_egl_rpi { struct mp_log *log; -- cgit v1.2.3
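
Side note (not part of the patch): the option tables above imply a simple command-line mapping; a rough usage sketch follows, with the file name and the context name purely illustrative:

    # before this commit:
    mpv --vo=opengl --opengl-debug --opengl-sw video.mkv
    # after this commit (--vo=opengl alone still works, but prints the deprecation warning):
    mpv --vo=gpu --gpu-api=opengl --gpu-debug --gpu-sw video.mkv
    # the old --opengl-backend window/context selection moves to --gpu-context, e.g.:
    mpv --vo=gpu --gpu-api=opengl --gpu-context=x11egl video.mkv

The new --swapchain-depth option accepts values 1-8 and defaults to 3, per the options[] table and defaults struct in vo_gpu.c above.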