From 65979986a923a8f08019b257c3fe72cd5e8ecf68 Mon Sep 17 00:00:00 2001
From: Niklas Haas
Date: Thu, 14 Sep 2017 08:04:55 +0200
Subject: vo_opengl: refactor into vo_gpu

This is done in several steps:

1. refactor MPGLContext -> struct ra_ctx
2. move GL-specific stuff in vo_opengl into opengl/context.c
3. generalize context creation to support other APIs, and add --gpu-api
4. rename all of the --opengl- options that are no longer opengl-specific
5. move all of the stuff from opengl/* that isn't GL-specific into gpu/
   (note: opengl/gl_utils.h became opengl/utils.h)
6. rename vo_opengl to vo_gpu
7. to handle window screenshots, the short-term approach was to just add
   it to ra_swapchain_fns. Long term (and for vulkan) this has to be
   moved to ra itself (and vo_gpu altered to compensate), but this was a
   stop-gap measure to prevent this commit from getting too big
8. move ra->fns->flush to ra_gl_ctx instead
9. some other minor changes that I've probably already forgotten

Note: This is one half of a major refactor, the other half of which is
provided by rossy's following commit. This commit enables support for
all Linux platforms, while his version enables support for all
non-Linux platforms.

Note 2: vo_opengl_cb.c also re-uses ra_gl_ctx, so it benefits from the
--opengl- options like --opengl-early-flush, --opengl-finish etc. This
should be a strict superset of the old functionality.

Disclaimer: Since I have no way of compiling mpv on all platforms, some
of these ports were done blindly. Specifically, the blind ports include
context_mali_fbdev.c and context_rpi.c. Since they're both based on
egl_helpers, the ports should have gone smoothly without any major
changes required. But if somebody complains about a compile error on
those platforms (assuming anybody actually uses them), you know where
to complain.
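To illustrate the new context abstraction introduced below, here is a
minimal, hypothetical sketch (not part of this commit; the "dummy" backend
name and its stub bodies are invented for illustration) of what a
--gpu-context backend looks like against the ra_ctx_fns API added in
video/out/gpu/context.h:

#include "video/out/gpu/context.h"

// init() is expected to create the window and set ctx->ra (and usually
// ctx->swapchain); returning false makes ra_ctx_create() move on to the
// next entry in contexts[] when probing.
static bool dummy_init(struct ra_ctx *ctx)
{
    return false; // stub: no real windowing system or GPU API behind it
}

static bool dummy_reconfig(struct ra_ctx *ctx)
{
    return true; // stub: resize the window, or create it if needed
}

static int dummy_control(struct ra_ctx *ctx, int *events, int request,
                         void *arg)
{
    return VO_NOTIMPL; // behaves exactly like vo_driver.control()
}

static void dummy_uninit(struct ra_ctx *ctx)
{
    // stub: tear down the ra and the window
}

const struct ra_ctx_fns ra_ctx_dummy = {
    .type     = "opengl", // matched against --gpu-api
    .name     = "dummy",  // matched against --gpu-context
    .init     = dummy_init,
    .reconfig = dummy_reconfig,
    .control  = dummy_control,
    .uninit   = dummy_uninit,
};

Real backends such as context_glx.c or context_x11egl.c fill in the same
fields and are listed in the contexts[] array in video/out/gpu/context.c,
in autoprobe order.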
--- DOCS/interface-changes.rst | 16 + DOCS/man/mpv.rst | 6 +- DOCS/man/options.rst | 170 +- DOCS/man/vo.rst | 33 +- DOCS/mplayer-changes.rst | 6 +- etc/builtin.conf | 6 +- etc/mpv.conf | 4 +- options/options.c | 11 +- options/options.h | 1 + player/main.c | 17 - video/out/gpu/context.c | 186 ++ video/out/gpu/context.h | 95 + video/out/gpu/hwdec.c | 239 +++ video/out/gpu/hwdec.h | 130 ++ video/out/gpu/lcms.c | 531 +++++ video/out/gpu/lcms.h | 43 + video/out/gpu/osd.c | 367 ++++ video/out/gpu/osd.h | 25 + video/out/gpu/ra.c | 327 +++ video/out/gpu/ra.h | 488 +++++ video/out/gpu/shader_cache.c | 954 +++++++++ video/out/gpu/shader_cache.h | 56 + video/out/gpu/user_shaders.c | 452 ++++ video/out/gpu/user_shaders.h | 98 + video/out/gpu/utils.c | 372 ++++ video/out/gpu/utils.h | 120 ++ video/out/gpu/video.c | 3809 ++++++++++++++++++++++++++++++++ video/out/gpu/video.h | 194 ++ video/out/gpu/video_shaders.c | 872 ++++++++ video/out/gpu/video_shaders.h | 56 + video/out/opengl/common.h | 4 +- video/out/opengl/context.c | 446 ++-- video/out/opengl/context.h | 152 +- video/out/opengl/context_cocoa.c | 2 +- video/out/opengl/context_drm_egl.c | 194 +- video/out/opengl/context_glx.c | 376 ++++ video/out/opengl/context_mali_fbdev.c | 58 +- video/out/opengl/context_rpi.c | 84 +- video/out/opengl/context_vdpau.c | 202 +- video/out/opengl/context_wayland.c | 74 +- video/out/opengl/context_x11.c | 358 ---- video/out/opengl/context_x11egl.c | 84 +- video/out/opengl/egl_helpers.c | 114 +- video/out/opengl/egl_helpers.h | 19 +- video/out/opengl/formats.h | 1 - video/out/opengl/gl_utils.c | 291 --- video/out/opengl/gl_utils.h | 56 - video/out/opengl/hwdec.c | 239 --- video/out/opengl/hwdec.h | 130 -- video/out/opengl/hwdec_cuda.c | 3 +- video/out/opengl/hwdec_ios.m | 2 +- video/out/opengl/hwdec_osx.c | 2 +- video/out/opengl/hwdec_rpi.c | 2 +- video/out/opengl/hwdec_vaegl.c | 4 +- video/out/opengl/hwdec_vaglx.c | 5 +- video/out/opengl/hwdec_vdpau.c | 2 +- video/out/opengl/lcms.c | 531 ----- video/out/opengl/lcms.h | 43 - video/out/opengl/osd.c | 367 ---- video/out/opengl/osd.h | 25 - video/out/opengl/ra.c | 327 --- video/out/opengl/ra.h | 491 ----- video/out/opengl/ra_gl.c | 7 - video/out/opengl/ra_gl.h | 3 +- video/out/opengl/shader_cache.c | 955 --------- video/out/opengl/shader_cache.h | 56 - video/out/opengl/user_shaders.c | 452 ---- video/out/opengl/user_shaders.h | 98 - video/out/opengl/utils.c | 524 ++--- video/out/opengl/utils.h | 151 +- video/out/opengl/video.c | 3813 --------------------------------- video/out/opengl/video.h | 195 -- video/out/opengl/video_shaders.c | 872 -------- video/out/opengl/video_shaders.h | 56 - video/out/vo.c | 6 +- video/out/vo_gpu.c | 385 ++++ video/out/vo_opengl.c | 470 ---- video/out/vo_opengl_cb.c | 53 +- video/out/vo_rpi.c | 2 +- wscript | 13 +- wscript_build.py | 47 +- 81 files changed, 11414 insertions(+), 11116 deletions(-) create mode 100644 video/out/gpu/context.c create mode 100644 video/out/gpu/context.h create mode 100644 video/out/gpu/hwdec.c create mode 100644 video/out/gpu/hwdec.h create mode 100644 video/out/gpu/lcms.c create mode 100644 video/out/gpu/lcms.h create mode 100644 video/out/gpu/osd.c create mode 100644 video/out/gpu/osd.h create mode 100644 video/out/gpu/ra.c create mode 100644 video/out/gpu/ra.h create mode 100644 video/out/gpu/shader_cache.c create mode 100644 video/out/gpu/shader_cache.h create mode 100644 video/out/gpu/user_shaders.c create mode 100644 video/out/gpu/user_shaders.h create mode 100644 video/out/gpu/utils.c create mode 100644 
video/out/gpu/utils.h create mode 100644 video/out/gpu/video.c create mode 100644 video/out/gpu/video.h create mode 100644 video/out/gpu/video_shaders.c create mode 100644 video/out/gpu/video_shaders.h create mode 100644 video/out/opengl/context_glx.c delete mode 100644 video/out/opengl/context_x11.c delete mode 100644 video/out/opengl/gl_utils.c delete mode 100644 video/out/opengl/gl_utils.h delete mode 100644 video/out/opengl/hwdec.c delete mode 100644 video/out/opengl/hwdec.h delete mode 100644 video/out/opengl/lcms.c delete mode 100644 video/out/opengl/lcms.h delete mode 100644 video/out/opengl/osd.c delete mode 100644 video/out/opengl/osd.h delete mode 100644 video/out/opengl/ra.c delete mode 100644 video/out/opengl/ra.h delete mode 100644 video/out/opengl/shader_cache.c delete mode 100644 video/out/opengl/shader_cache.h delete mode 100644 video/out/opengl/user_shaders.c delete mode 100644 video/out/opengl/user_shaders.h delete mode 100644 video/out/opengl/video.c delete mode 100644 video/out/opengl/video.h delete mode 100644 video/out/opengl/video_shaders.c delete mode 100644 video/out/opengl/video_shaders.h create mode 100644 video/out/vo_gpu.c delete mode 100644 video/out/vo_opengl.c diff --git a/DOCS/interface-changes.rst b/DOCS/interface-changes.rst index 656a5d204f..8d8870d81d 100644 --- a/DOCS/interface-changes.rst +++ b/DOCS/interface-changes.rst @@ -22,6 +22,22 @@ Interface changes --- mpv 0.28.0 --- - drop previously deprecated --heartbeat-cmd and --heartbeat--interval options + - rename --vo=opengl to --vo=gpu + - rename --opengl-backend to --gpu-context + - rename --opengl-shaders to --glsl-shaders + - rename --opengl-shader-cache-dir to --gpu-shader-cache-dir + - rename --opengl-tex-pad-x/y to --gpu-tex-pad-x/y + - rename --opengl-fbo-format to --fbo-format + - rename --opengl-gamma to --gamma-factor + - rename --opengl-debug to --gpu-debug + - rename --opengl-sw to --gpu-sw + - rename --opengl-vsync-fences to --swapchain-depth, and the interpretation + slightly changed. Now defaults to 3. + - rename the built-in profile `opengl-hq` to `gpu-hq` + - the semantics of --opengl-es=yes are slightly changed -> now requires GLES + - remove the (deprecated) alias --gpu-context=drm-egl + - remove the (deprecated) --vo=opengl-hq + - remove --opengl-es=force2 (use --opengl-es=yes --opengl-restrict=300) --- mpv 0.27.0 --- - drop previously deprecated --field-dominance option - drop previously deprecated "osd" command diff --git a/DOCS/man/mpv.rst b/DOCS/man/mpv.rst index 7202e326e7..a307cc2ff2 100644 --- a/DOCS/man/mpv.rst +++ b/DOCS/man/mpv.rst @@ -510,8 +510,8 @@ setting them to *no*. Even suboptions can be specified in this way. :: - # Use opengl video output by default. - vo=opengl + # Use GPU-accelerated video output by default. + vo=gpu # Use quotes for text that can contain spaces: status-msg="Time: ${time-pos}" @@ -582,7 +582,7 @@ profile name ``default`` to continue with normal options. 
[slow] profile-desc="some profile name" # reference a builtin profile - profile=opengl-hq + profile=gpu-hq [fast] vo=vdpau diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst index fa2f676190..4a880fb72c 100644 --- a/DOCS/man/options.rst +++ b/DOCS/man/options.rst @@ -667,29 +667,29 @@ Video :auto: enable best hw decoder (see below) :yes: exactly the same as ``auto`` :auto-copy: enable best hw decoder with copy-back (see below) - :vdpau: requires ``--vo=vdpau`` or ``--vo=opengl`` (Linux only) + :vdpau: requires ``--vo=gpu`` or ``--vo=vdpau`` (Linux only) :vdpau-copy: copies video back into system RAM (Linux with some GPUs only) - :vaapi: requires ``--vo=opengl`` or ``--vo=vaapi`` (Linux only) + :vaapi: requires ``--vo=gpu`` or ``--vo=vaapi`` (Linux only) :vaapi-copy: copies video back into system RAM (Linux with Intel GPUs only) - :videotoolbox: requires ``--vo=opengl`` (OS X 10.8 and up), + :videotoolbox: requires ``--vo=gpu`` (OS X 10.8 and up), or ``--vo=opengl-cb`` (iOS 9.0 and up) :videotoolbox-copy: copies video back into system RAM (OS X 10.8 or iOS 9.0 and up) - :dxva2: requires ``--vo=opengl`` with ``--opengl-backend=angle`` or - ``--opengl-backend=dxinterop`` (Windows only) + :dxva2: requires ``--vo=gpu`` with ``--gpu-context=angle`` or + ``--gpu-context=dxinterop`` (Windows only) :dxva2-copy: copies video back to system RAM (Windows only) - :d3d11va: requires ``--vo=opengl`` with ``--opengl-backend=angle`` + :d3d11va: requires ``--vo=gpu`` with ``--gpu-context=angle`` (Windows 8+ only) :d3d11va-copy: copies video back to system RAM (Windows 8+ only) :mediacodec: copies video back to system RAM (Android only) - :rpi: requires ``--vo=opengl`` (Raspberry Pi only - default if available) + :rpi: requires ``--vo=gpu`` (Raspberry Pi only - default if available) :rpi-copy: copies video back to system RAM (Raspberry Pi only) - :cuda: requires ``--vo=opengl`` (Any platform CUDA is available) + :cuda: requires ``--vo=gpu`` (Any platform CUDA is available) :cuda-copy: copies video back to system RAM (Any platform CUDA is available) :crystalhd: copies video back to system RAM (Any platform supported by hardware) ``auto`` tries to automatically enable hardware decoding using the first available method. This still depends what VO you are using. For example, - if you are not using ``--vo=vdpau`` or ``--vo=opengl``, vdpau decoding will + if you are not using ``--vo=gpu`` or ``--vo=vdpau``, vdpau decoding will never be enabled. Also note that if the first found method doesn't actually work, it will always fall back to software decoding, instead of trying the next method (might matter on some Linux systems). @@ -701,10 +701,10 @@ Video guaranteed to incur no additional loss compared to software decoding, and will allow CPU processing with video filters. - The ``vaapi`` mode, if used with ``--vo=opengl``, requires Mesa 11 and most + The ``vaapi`` mode, if used with ``--vo=gpu``, requires Mesa 11 and most likely works with Intel GPUs only. It also requires the opengl EGL backend (automatically used if available). You can also try the old GLX backend by - forcing it with ``--opengl-backend=x11``, but the vaapi/GLX interop is + forcing it with ``--gpu-context=x11``, but the vaapi/GLX interop is said to be slower than ``vaapi-copy``. The ``cuda`` and ``cuda-copy`` modes provides deinterlacing in the decoder @@ -712,7 +712,7 @@ Video output path. To use this deinterlacing you must pass the option: ``vd-lavc-o=deint=[weave|bob|adaptive]``. 
Pass ``weave`` (or leave the option unset) to not attempt any - deinterlacing. ``cuda`` should always be preferred unless the ``opengl`` + deinterlacing. ``cuda`` should always be preferred unless the ``gpu`` vo is not being used or filters are required. Most video filters will not work with hardware decoding as they are @@ -739,8 +739,8 @@ Video be some loss, or even blatantly incorrect results. In some cases, RGB conversion is forced, which means the RGB conversion - is performed by the hardware decoding API, instead of the OpenGL code - used by ``--vo=opengl``. This means certain colorspaces may not display + is performed by the hardware decoding API, instead of the shaders + used by ``--vo=gpu``. This means certain colorspaces may not display correctly, and certain filtering (such as debanding) cannot be applied in an ideal way. This will also usually force the use of low quality chroma scalers instead of the one specified by ``--cscale``. In other @@ -772,7 +772,7 @@ Video completely ordinary video sources. ``rpi`` always uses the hardware overlay renderer, even with - ``--vo=opengl``. + ``--vo=gpu``. ``cuda`` should be safe, but it has been reported to corrupt the timestamps causing glitched, flashing frames on some files. It can also @@ -800,13 +800,13 @@ Video the first thing you should try is disabling it. ``--opengl-hwdec-interop=`` - This is useful for the ``opengl`` and ``opengl-cb`` VOs for creating the + This is useful for the ``gpu`` and ``opengl-cb`` VOs for creating the hardware decoding OpenGL interop context, but without actually enabling hardware decoding itself (like ``--hwdec`` does). If set to an empty string (default), the ``--hwdec`` option is used. - For ``opengl``, if set, do not create the interop context on demand, but + For ``gpu``, if set, do not create the interop context on demand, but when the VO is created. For ``opengl-cb``, if set, load the interop context as soon as the OpenGL @@ -1049,7 +1049,7 @@ Video This can speed up video upload, and may help with large resolutions or slow hardware. This works only with the following VOs: - - ``opengl``: requires at least OpenGL 4.4. + - ``gpu``: requires at least OpenGL 4.4. (In particular, this can't be made work with ``opengl-cb``.) @@ -2402,8 +2402,8 @@ Window ``--force-rgba-osd-rendering`` Change how some video outputs render the OSD and text subtitles. This does not change appearance of the subtitles and only has performance - implications. For VOs which support native ASS rendering (like ``vdpau``, - ``opengl``, ``direct3d``), this can be slightly faster or slower, + implications. For VOs which support native ASS rendering (like ``gpu``, + ``vdpau``, ``direct3d``), this can be slightly faster or slower, depending on GPU drivers and hardware. For other VOs, this just makes rendering slower. @@ -3903,10 +3903,10 @@ ALSA audio output options ALSA device). -OpenGL renderer options +GPU renderer options ----------------------- -The following video options are currently all specific to ``--vo=opengl`` and +The following video options are currently all specific to ``--vo=gpu`` and ``--vo=opengl-cb`` only, which are the only VOs that implement them. ``--scale=`` @@ -3917,7 +3917,7 @@ The following video options are currently all specific to ``--vo=opengl`` and is the default for compatibility reasons. ``spline36`` - Mid quality and speed. This is the default when using ``opengl-hq``. + Mid quality and speed. This is the default when using ``gpu-hq``. ``lanczos`` Lanczos scaling. Provides mid quality and speed. 
Generally worse than @@ -4080,7 +4080,7 @@ The following video options are currently all specific to ``--vo=opengl`` and ``--linear-scaling`` Scale in linear light. It should only be used with a - ``--opengl-fbo-format`` that has at least 16 bit precision. This option + ``--fbo-format`` that has at least 16 bit precision. This option has no effect on HDR content. ``--correct-downscaling`` @@ -4104,7 +4104,7 @@ The following video options are currently all specific to ``--vo=opengl`` and the ``--tscale`` setting. Note that this relies on vsync to work, see ``--opengl-swapinterval`` for - more information. It should also only be used with an ``--opengl-fbo-format`` + more information. It should also only be used with an ``--fbo-format`` that has at least 16 bit precision. ``--interpolation-threshold=<0..1,-1>`` @@ -4168,10 +4168,10 @@ The following video options are currently all specific to ``--vo=opengl`` and ``--temporal-dither`` is in use. 1 (the default) will update on every video frame, 2 on every other frame, etc. -``--opengl-debug`` - Check for OpenGL errors, i.e. call ``glGetError()``. Also, request a - debug OpenGL context (which does nothing with current graphics drivers - as of this writing). +``--gpu-debug`` + Enables GPU debugging. What this means depends on the API type. For OpenGL, + it calls ``glGetError()``, and requests a debug context. For Vulkan, it + enables validation layers. ``--opengl-swapinterval=`` Interval in displayed frames between two buffer swaps. 1 is equivalent to @@ -4184,7 +4184,7 @@ The following video options are currently all specific to ``--vo=opengl`` and results, as can missing or incorrect display FPS information (see ``--display-fps``). -``--opengl-shaders=`` +``--glsl-shaders=`` Custom GLSL hooks. These are a flexible way to add custom fragment shaders, which can be injected at almost arbitrary points in the rendering pipeline, and access all previous intermediate textures. Each use of the option will @@ -4226,7 +4226,7 @@ The following video options are currently all specific to ``--vo=opengl`` and FORMAT (required) The texture format for the samples. Supported texture formats are listed - in debug logging when the ``opengl`` VO is initialized (look for + in debug logging when the ``gpu`` VO is initialized (look for ``Texture formats:``). Usually, this follows OpenGL naming conventions. For example, ``rgb16`` provides 3 channels with normalized 16 bit components. One oddity are float formats: for example, ``rgba16f`` has @@ -4369,8 +4369,8 @@ The following video options are currently all specific to ``--vo=opengl`` and vec2 tex_offset Texture offset introduced by user shaders or options like panscan, video-align-x/y, video-pan-x/y. - Internally, vo_opengl may generate any number of the following textures. - Whenever a texture is rendered and saved by vo_opengl, all of the passes + Internally, vo_gpu may generate any number of the following textures. + Whenever a texture is rendered and saved by vo_gpu, all of the passes that have hooked into it will run, in the order they were added by the user. This is a list of the legal hook points: @@ -4416,8 +4416,8 @@ The following video options are currently all specific to ``--vo=opengl`` and pass. When overwriting a texture marked ``fixed``, the WIDTH, HEIGHT and OFFSET must be left at their default values. -``--opengl-shader=`` - CLI/config file only alias for ``--opengl-shaders-append``. +``--glsl-shader=`` + CLI/config file only alias for ``--glsl-shaders-append``. 
``--deband`` Enable the debanding algorithm. This greatly reduces the amount of visible @@ -4470,9 +4470,9 @@ The following video options are currently all specific to ``--vo=opengl`` and ``--scale-blur`` option. ``--opengl-glfinish`` - Call ``glFinish()`` before and after swapping buffers (default: disabled). - Slower, but might improve results when doing framedropping. Can completely - ruin performance. The details depend entirely on the OpenGL driver. + Call ``glFinish()`` before swapping buffers (default: disabled). Slower, + but might improve results when doing framedropping. Can completely ruin + performance. The details depend entirely on the OpenGL driver. ``--opengl-waitvsync`` Call ``glXWaitVideoSyncSGI`` after each buffer swap (default: disabled). @@ -4481,15 +4481,6 @@ The following video options are currently all specific to ``--vo=opengl`` and X11/GLX only. -``--opengl-vsync-fences=`` - Synchronize the CPU to the Nth past frame using the ``GL_ARB_sync`` - extension. A value of 0 disables this behavior (default). A value of 1 - means it will synchronize to the current frame after rendering it. Like - ``--glfinish`` and ``--waitvsync``, this can lower or ruin performance. Its - advantage is that it can span multiple frames, and effectively limit the - number of frames the GPU queues ahead (which also has an influence on - vsync). - ``--opengl-dwmflush=`` Calls ``DwmFlush`` after swapping buffers on Windows (default: auto). It also sets ``SwapInterval(0)`` to ignore the OpenGL timing. Values are: no @@ -4510,7 +4501,7 @@ The following video options are currently all specific to ``--vo=opengl`` and used to select a lower feature level, which is mainly useful for debugging. Note that OpenGL ES 3.0 is only supported at feature level 10_1 or higher. Most extended OpenGL features will not work at lower feature levels - (similar to ``--opengl-dumb-mode``). + (similar to ``--gpu-dumb-mode``). Windows with ANGLE only. @@ -4566,7 +4557,7 @@ The following video options are currently all specific to ``--vo=opengl`` and renderer, though ``--angle-renderer=d3d9`` may give slightly better performance on old hardware. Note that the D3D9 renderer only supports OpenGL ES 2.0, so most extended OpenGL features will not work if this - renderer is selected (similar to ``--opengl-dumb-mode``). + renderer is selected (similar to ``--gpu-dumb-mode``). Windows with ANGLE only. @@ -4587,13 +4578,21 @@ The following video options are currently all specific to ``--vo=opengl`` and OS X only. -``--opengl-sw`` +``--swapchain-depth=`` + Allow up to N in-flight frames. This essentially controls the frame + latency. Increasing the swapchain depth can improve pipelining and prevent + missed vsyncs, but increases visible latency. This option only mandates an + upper limit, the implementation can use a lower latency than requested + internally. A setting of 1 means that the VO will wait for every frame to + become visible before starting to render the next frame. (Default: 3) + +``--gpu-sw`` Continue even if a software renderer is detected. -``--opengl-backend=`` - The value ``auto`` (the default) selects the windowing backend. You can - also pass ``help`` to get a complete list of compiled in backends (sorted - by autoprobe order). +``--gpu-context=`` + The value ``auto`` (the default) selects the GPU context. You can also pass + ``help`` to get a complete list of compiled in backends (sorted by + autoprobe order). 
auto auto-select (default) @@ -4617,7 +4616,7 @@ The following video options are currently all specific to ``--vo=opengl`` and wayland Wayland/EGL drm - DRM/EGL (``drm-egl`` is a deprecated alias) + DRM/EGL x11egl X11/EGL mali-fbdev @@ -4628,19 +4627,32 @@ The following video options are currently all specific to ``--vo=opengl`` and performance problems), and is for doing experiments only. Will not be used automatically. +``--gpu-api=`` + Controls which type of graphics APIs will be accepted: + + auto + Use any available API (default) + opengl + Allow only OpenGL (requires OpenGL 2.1+ or GLES 2.0+) + ``--opengl-es=`` - Select whether to use GLES: + Controls which type of OpenGL context will be accepted: + auto + Allow all types of OpenGL (default) yes - Try to prefer ES over Desktop GL - force2 - Try to request a ES 2.0 context (the driver might ignore this) + Only allow GLES no - Try to prefer desktop GL over ES - auto - Use the default for each backend (default) + Only allow desktop/core GL -``--opengl-fbo-format=`` +``--opengl-restrict=`` + Restricts all OpenGL versions above a certain version. Versions are encoded + in hundreds, i.e. OpenGL 4.5 -> 450. As an example, --opengl-restrict=300 + would restrict OpenGL 3.0 and higher, effectively only allowing 2.x + contexts. Note that this only imposes a limit on context creation APIs, the + actual OpenGL context may still have a higher OpenGL version. (Default: 0) + +``--fbo-format=`` Selects the internal format of textures used for FBOs. The format can influence performance and quality of the video output. ``fmt`` can be one of: rgb8, rgb10, rgb10_a2, rgb16, rgb16f, rgb32f, rgba12, rgba16, rgba16f, @@ -4648,10 +4660,10 @@ The following video options are currently all specific to ``--vo=opengl`` and or rgb10_a2 on GLES (e.g. ANGLE), unless GL_EXT_texture_norm16 is available. -``--opengl-gamma=<0.1..2.0>`` - Set a gamma value (default: 1.0). If gamma is adjusted in other ways (like - with the ``--gamma`` option or key bindings and the ``gamma`` property), - the value is multiplied with the other gamma value. +``--gamma-factor=<0.1..2.0>`` + Set an additional raw gamma factor (default: 1.0). If gamma is adjusted in + other ways (like with the ``--gamma`` option or key bindings and the + ``gamma`` property), the value is multiplied with the other gamma value. Recommended values based on the environmental brightness: @@ -4888,7 +4900,7 @@ The following video options are currently all specific to ``--vo=opengl`` and Blend subtitles directly onto upscaled video frames, before interpolation and/or color management (default: no). Enabling this causes subtitles to be affected by ``--icc-profile``, ``--target-prim``, ``--target-trc``, - ``--interpolation``, ``--opengl-gamma`` and ``--post-shader``. It also + ``--interpolation``, ``--gpu-gamma`` and ``--post-shader``. It also increases subtitle performance when using ``--interpolation``. The downside of enabling this is that it restricts subtitles to the visible @@ -4918,7 +4930,7 @@ The following video options are currently all specific to ``--vo=opengl`` and if the video contains alpha information (which is extremely rare). May not be supported on all platforms. If alpha framebuffers are unavailable, it silently falls back on a normal framebuffer. Note that - if you set the ``--opengl-fbo-format`` option to a non-default value, a + if you set the ``--fbo-format`` option to a non-default value, a format with alpha must be specified, or this won't work. 
This does not work on X11 with EGL and Mesa (freedesktop bug 67676). no @@ -4933,7 +4945,7 @@ The following video options are currently all specific to ``--vo=opengl`` and Color used to draw parts of the mpv window not covered by video. See ``--osd-color`` option how colors are defined. -``--opengl-tex-pad-x``, ``--opengl-tex-pad-y`` +``--gpu-tex-pad-x``, ``--gpu-tex-pad-y`` Enlarge the video source textures by this many pixels. For debugging only (normally textures are sized exactly, but due to hardware decoding interop we may have to deal with additional padding, which can be tested with these @@ -4947,8 +4959,8 @@ The following video options are currently all specific to ``--vo=opengl`` and flipping GL front and backbuffers immediately (i.e. it doesn't call it in display-sync mode). -``--opengl-dumb-mode=`` - This mode is extremely restricted, and will disable most extended OpenGL +``--gpu-dumb-mode=`` + This mode is extremely restricted, and will disable most extended features. That includes high quality scalers and custom shaders! It is intended for hardware that does not support FBOs (including GLES, @@ -4961,18 +4973,16 @@ The following video options are currently all specific to ``--vo=opengl`` and This option might be silently removed in the future. -``--opengl-shader-cache-dir=`` - Store and load compiled GL shaders in this directory. Normally, shader - compilation is very fast, so this is usually not needed. But some GL - implementations (notably ANGLE, the default on Windows) have relatively - slow shader compilation, and can cause startup delays. +``--gpu-shader-cache-dir=`` + Store and load compiled GLSL shaders in this directory. Normally, shader + compilation is very fast, so this is usually not needed. It mostly matters + for GPU APIs that require internally recompiling shaders to other languages, + for example anything based on ANGLE or Vulkan. Enabling this can improve + startup performance on these platforms. NOTE: This is not cleaned automatically, so old, unused cache files may stick around indefinitely. - This option might be silently removed in the future, if ANGLE fixes shader - compilation speed. - ``--cuda-decode-device=`` Choose the GPU device used for decoding when using the ``cuda`` hwdec. diff --git a/DOCS/man/vo.rst b/DOCS/man/vo.rst index 1552b217cb..84b3a6a9d9 100644 --- a/DOCS/man/vo.rst +++ b/DOCS/man/vo.rst @@ -14,7 +14,7 @@ in the list. See ``--vo=help`` for a list of compiled-in video output drivers. - The recommended output driver is ``--vo=opengl``, which is the default. All + The recommended output driver is ``--vo=gpu``, which is the default. All other drivers are for compatibility or special purposes. If the default does not work, it will fallback to other drivers (in the same order as listed by ``--vo=help``). @@ -273,37 +273,34 @@ Available video output drivers are: ``--vo-direct3d-exact-backbuffer`` Always resize the backbuffer to window size. -``opengl`` - OpenGL video output driver. It supports extended scaling methods, dithering - and color management. +``gpu`` + General purpose, customizable, GPU-accelerated video output driver. It + supports extended scaling methods, dithering, color management, custom + shaders, HDR, and more. - See `OpenGL renderer options`_ for options specific to this VO. + See `GPU renderer options`_ for options specific to this VO. By default, it tries to use fast and fail-safe settings. Use the - ``opengl-hq`` profile to use this driver with defaults set to high - quality rendering. 
(This profile is also the replacement for - ``--vo=opengl-hq``.) The profile can be applied with ``--profile=opengl-hq`` - and its contents can be viewed with ``--show-profile=opengl-hq``. + ``gpu-hq`` profile to use this driver with defaults set to high quality + rendering. The profile can be applied with ``--profile=gpu-hq`` and its + contents can be viewed with ``--show-profile=gpu-hq``. - Requires at least OpenGL 2.1. - - Some features are available with OpenGL 3 capable graphics drivers only - (or if the necessary extensions are available). - - OpenGL ES 2.0 and 3.0 are supported as well. + This VO abstracts over several possible graphics APIs and windowing + contexts, which can be influenced using the ``--gpu-api`` and + ``--gpu-context`` options. Hardware decoding over OpenGL-interop is supported to some degree. Note that in this mode, some corner case might not be gracefully handled, and color space conversion and chroma upsampling is generally in the hand of the hardware decoder APIs. - ``opengl`` makes use of FBOs by default. Sometimes you can achieve better - quality or performance by changing the ``--opengl-fbo-format`` option to + ``gpu`` makes use of FBOs by default. Sometimes you can achieve better + quality or performance by changing the ``--gpu-fbo-format`` option to ``rgb16f``, ``rgb32f`` or ``rgb``. Known problems include Mesa/Intel not accepting ``rgb16``, Mesa sometimes not being compiled with float texture support, and some OS X setups being very slow with ``rgb16`` but fast with ``rgb32f``. If you have problems, you can also try enabling the - ``--opengl-dumb-mode=yes`` option. + ``--gpu-dumb-mode=yes`` option. ``sdl`` SDL 2.0+ Render video output driver, depending on system with or without diff --git a/DOCS/mplayer-changes.rst b/DOCS/mplayer-changes.rst index 7c8ec50a90..66cacb3205 100644 --- a/DOCS/mplayer-changes.rst +++ b/DOCS/mplayer-changes.rst @@ -76,8 +76,8 @@ Video * Wayland support. * Native support for VAAPI and VDA. Improved VDPAU video output. -* Improved OpenGL output (see the ``opengl-hq`` video output). -* Make hardware decoding work with the ``opengl`` video output. +* Improved GPU-accelerated video output (see the ``gpu-hq`` preset). +* Make hardware decoding work with the ``gpu`` video output. * Support for libavfilter (for video->video and audio->audio). This allows using most of FFmpeg's filters, which improve greatly on the old MPlayer filters in features, performance, and correctness. @@ -85,7 +85,7 @@ Video for BT.2020 (Ultra HD). linear XYZ (Digital Cinema) and SMPTE ST2084 (HDR) inputs. * Support for color managed displays, via ICC profiles. -* High-quality image resamplers (see the ``opengl`` ``scale`` suboption). +* High-quality image resamplers (see the ``--scale`` suboption). * Support for scaling in (sigmoidized) linear light. * Better subtitle rendering using libass by default. 
* Improvements when playing multiple files (``-fixed-vo`` is default, do not diff --git a/etc/builtin.conf b/etc/builtin.conf index 1d93df9606..ee46f8cb00 100644 --- a/etc/builtin.conf +++ b/etc/builtin.conf @@ -36,7 +36,7 @@ load-scripts=no osc=no framedrop=no -[opengl-hq] +[gpu-hq] scale=spline36 cscale=spline36 dscale=mitchell @@ -44,3 +44,7 @@ dither-depth=auto correct-downscaling=yes sigmoid-upscaling=yes deband=yes + +# Compatibility alias (deprecated) +[opengl-hq] +profile=gpu-hq diff --git a/etc/mpv.conf b/etc/mpv.conf index d72c9ee6d7..2a8f8b9f8b 100644 --- a/etc/mpv.conf +++ b/etc/mpv.conf @@ -52,9 +52,9 @@ # Keep the player window on top of all other windows. #ontop=yes -# Specify high quality video rendering preset (for OpenGL VO only) +# Specify high quality video rendering preset (for --vo=gpu only) # Can cause performance problems with some drivers and GPUs. -#profile=opengl-hq +#profile=gpu-hq # Force video to lock on the display's refresh rate, and change video and audio # speed to some degree to ensure synchronous playback - can cause problems diff --git a/options/options.c b/options/options.c index 7dc3b0b160..18b6bb8dd8 100644 --- a/options/options.c +++ b/options/options.c @@ -57,8 +57,8 @@ #include "video/out/drm_common.h" #endif -#if HAVE_GL -#include "video/out/opengl/hwdec.h" +#if HAVE_GPU +#include "video/out/gpu/hwdec.h" #endif static void print_version(struct mp_log *log) @@ -90,6 +90,7 @@ extern const struct m_obj_list af_obj_list; extern const struct m_obj_list vo_obj_list; extern const struct m_obj_list ao_obj_list; +extern const struct m_sub_options opengl_conf; extern const struct m_sub_options angle_conf; extern const struct m_sub_options cocoa_conf; @@ -687,10 +688,14 @@ const m_option_t mp_opts[] = { OPT_SUBSTRUCT("", vo, vo_sub_opts, 0), OPT_SUBSTRUCT("", demux_opts, demux_conf, 0), -#if HAVE_GL +#if HAVE_GPU OPT_SUBSTRUCT("", gl_video_opts, gl_video_conf, 0), #endif +#if HAVE_GL + OPT_SUBSTRUCT("", opengl_opts, opengl_conf, 0), +#endif + #if HAVE_EGL_ANGLE_WIN32 OPT_SUBSTRUCT("", angle_opts, angle_conf, 0), #endif diff --git a/options/options.h b/options/options.h index 895c12182b..c02b7a34ca 100644 --- a/options/options.h +++ b/options/options.h @@ -328,6 +328,7 @@ typedef struct MPOpts { struct gl_video_opts *gl_video_opts; struct angle_opts *angle_opts; + struct opengl_opts *opengl_opts; struct cocoa_opts *cocoa_opts; struct dvd_opts *dvd_opts; diff --git a/player/main.c b/player/main.c index 56a4f1d4cf..cf267fb170 100644 --- a/player/main.c +++ b/player/main.c @@ -297,21 +297,6 @@ static bool handle_help_options(struct MPContext *mpctx) return false; } -static void handle_deprecated_options(struct MPContext *mpctx) -{ - struct MPOpts *opts = mpctx->opts; - struct m_obj_settings *vo = opts->vo->video_driver_list; - if (vo && vo->name && strcmp(vo->name, "opengl-hq") == 0) { - MP_WARN(mpctx, - "--vo=opengl-hq is deprecated! Use --profile=opengl-hq instead.\n"); - // Fudge it. This will replace the --vo option too, which is why we - // unset/safe it, and later restore it. 
- talloc_free(vo->name); - vo->name = talloc_strdup(NULL, "opengl"); - m_config_set_profile(mpctx->mconfig, "opengl-hq", 0); - } -} - static int cfg_include(void *ctx, char *filename, int flags) { struct MPContext *mpctx = ctx; @@ -445,8 +430,6 @@ int mp_initialize(struct MPContext *mpctx, char **options) if (handle_help_options(mpctx)) return -2; - handle_deprecated_options(mpctx); - if (!print_libav_versions(mp_null_log, 0)) { // Using mismatched libraries can be legitimate, but even then it's // a bad idea. We don't acknowledge its usefulness and stability. diff --git a/video/out/gpu/context.c b/video/out/gpu/context.c new file mode 100644 index 0000000000..dbabba8b3b --- /dev/null +++ b/video/out/gpu/context.c @@ -0,0 +1,186 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "config.h" +#include "common/common.h" +#include "common/msg.h" +#include "options/options.h" +#include "options/m_option.h" +#include "video/out/vo.h" + +#include "context.h" + +extern const struct ra_ctx_fns ra_ctx_glx; +extern const struct ra_ctx_fns ra_ctx_glx_probe; +extern const struct ra_ctx_fns ra_ctx_x11_egl; +extern const struct ra_ctx_fns ra_ctx_drm_egl; +extern const struct ra_ctx_fns ra_ctx_cocoa; +extern const struct ra_ctx_fns ra_ctx_wayland_egl; +extern const struct ra_ctx_fns ra_ctx_wgl; +extern const struct ra_ctx_fns ra_ctx_angle; +extern const struct ra_ctx_fns ra_ctx_dxinterop; +extern const struct ra_ctx_fns ra_ctx_rpi; +extern const struct ra_ctx_fns ra_ctx_mali; +extern const struct ra_ctx_fns ra_ctx_vdpauglx; + +static const struct ra_ctx_fns *contexts[] = { +// OpenGL contexts: +#if HAVE_RPI + &ra_ctx_rpi, +#endif +/* +#if HAVE_GL_COCOA + &ra_ctx_cocoa, +#endif +#if HAVE_EGL_ANGLE_WIN32 + &ra_ctx_angle, +#endif +#if HAVE_GL_WIN32 + &ra_ctx_wgl, +#endif +#if HAVE_GL_DXINTEROP + &ra_ctx_dxinterop, +#endif +*/ +#if HAVE_GL_X11 + &ra_ctx_glx_probe, +#endif +#if HAVE_EGL_X11 + &ra_ctx_x11_egl, +#endif +#if HAVE_GL_X11 + &ra_ctx_glx, +#endif +#if HAVE_GL_WAYLAND + &ra_ctx_wayland_egl, +#endif +#if HAVE_EGL_DRM + &ra_ctx_drm_egl, +#endif +#if HAVE_MALI_FBDEV + &ra_ctx_mali, +#endif +#if HAVE_VDPAU_GL_X11 + &ra_ctx_vdpauglx, +#endif +}; + +static bool get_help(struct mp_log *log, struct bstr param) +{ + if (bstr_equals0(param, "help")) { + mp_info(log, "GPU contexts / APIs:\n"); + mp_info(log, " auto (autodetect)\n"); + for (int n = 0; n < MP_ARRAY_SIZE(contexts); n++) + mp_info(log, " %s (%s)\n", contexts[n]->name, contexts[n]->type); + return true; + } + + return false; +} + +int ra_ctx_validate_api(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + if (get_help(log, param)) + return M_OPT_EXIT; + if (bstr_equals0(param, "auto")) + return 1; + for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) { + if (bstr_equals0(param, contexts[i]->type)) + return 1; + } + return 
M_OPT_INVALID; +} + +int ra_ctx_validate_context(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + if (get_help(log, param)) + return M_OPT_EXIT; + if (bstr_equals0(param, "auto")) + return 1; + for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) { + if (bstr_equals0(param, contexts[i]->name)) + return 1; + } + return M_OPT_INVALID; +} + +// Create a VO window and create a RA context on it. +// vo_flags: passed to the backend's create window function +struct ra_ctx *ra_ctx_create(struct vo *vo, const char *context_type, + const char *context_name, struct ra_ctx_opts opts) +{ + bool api_auto = !context_type || strcmp(context_type, "auto") == 0; + bool ctx_auto = !context_name || strcmp(context_name, "auto") == 0; + + if (ctx_auto) { + MP_VERBOSE(vo, "Probing for best GPU context.\n"); + opts.probing = true; + } + + // Hack to silence backend (X11/Wayland/etc.) errors. Kill it once backends + // are separate from `struct vo` + bool old_probing = vo->probing; + vo->probing = opts.probing; + + for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) { + if (!opts.probing && strcmp(contexts[i]->name, context_name) != 0) + continue; + if (!api_auto && strcmp(contexts[i]->type, context_type) != 0) + continue; + + struct ra_ctx *ctx = talloc_ptrtype(NULL, ctx); + *ctx = (struct ra_ctx) { + .vo = vo, + .global = vo->global, + .log = mp_log_new(ctx, vo->log, contexts[i]->type), + .opts = opts, + .fns = contexts[i], + }; + + MP_VERBOSE(ctx, "Initializing GPU context '%s'\n", ctx->fns->name); + if (contexts[i]->init(ctx)) { + vo->probing = old_probing; + return ctx; + } + + talloc_free(ctx); + } + + // If we've reached this point, then none of the contexts matched the name + // requested, or the backend creation failed for all of them. + MP_ERR(vo, "Failed initializing any suitable GPU context!\n"); + vo->probing = old_probing; + return NULL; +} + +void ra_ctx_destroy(struct ra_ctx **ctx) +{ + if (*ctx) + (*ctx)->fns->uninit(*ctx); + talloc_free(*ctx); + *ctx = NULL; +} diff --git a/video/out/gpu/context.h b/video/out/gpu/context.h new file mode 100644 index 0000000000..42de59b75f --- /dev/null +++ b/video/out/gpu/context.h @@ -0,0 +1,95 @@ +#pragma once + +#include "video/out/vo.h" + +#include "config.h" +#include "ra.h" + +struct ra_ctx_opts { + int allow_sw; // allow software renderers + int want_alpha; // create an alpha framebuffer if possible + int debug; // enable debugging layers/callbacks etc. + bool probing; // the backend was auto-probed + int swapchain_depth; // max number of images to render ahead +}; + +struct ra_ctx { + struct vo *vo; + struct ra *ra; + struct mpv_global *global; + struct mp_log *log; + + struct ra_ctx_opts opts; + const struct ra_ctx_fns *fns; + struct ra_swapchain *swapchain; + + void *priv; +}; + +// The functions that make up a ra_ctx. +struct ra_ctx_fns { + const char *type; // API type (for --gpu-api) + const char *name; // name (for --gpu-context) + + // Resize the window, or create a new window if there isn't one yet. + // Currently, there is an unfortunate interaction with ctx->vo, and + // display size etc. are determined by it. + bool (*reconfig)(struct ra_ctx *ctx); + + // This behaves exactly like vo_driver.control(). + int (*control)(struct ra_ctx *ctx, int *events, int request, void *arg); + + // These behave exactly like vo_driver.wakeup/wait_events. They are + // optional. 
+ void (*wakeup)(struct ra_ctx *ctx); + void (*wait_events)(struct ra_ctx *ctx, int64_t until_time_us); + + // Initialize/destroy the 'struct ra' and possibly the underlying VO backend. + // Not normally called by the user of the ra_ctx. + bool (*init)(struct ra_ctx *ctx); + void (*uninit)(struct ra_ctx *ctx); +}; + +// Extra struct for the swapchain-related functions so they can be easily +// inherited from helpers. +struct ra_swapchain { + struct ra_ctx *ctx; + struct priv *priv; + const struct ra_swapchain_fns *fns; + + bool flip_v; // flip the rendered image vertically (set by the swapchain) +}; + +struct ra_swapchain_fns { + // Gets the current framebuffer depth in bits (0 if unknown). Optional. + int (*color_depth)(struct ra_swapchain *sw); + + // Retrieves a screenshot of the framebuffer. These are always the right + // side up, regardless of ra_swapchain->flip_v. Optional. + struct mp_image *(*screenshot)(struct ra_swapchain *sw); + + // Called when rendering starts. Returns NULL on failure. This must be + // followed by submit_frame, to submit the rendered frame. + struct ra_tex *(*start_frame)(struct ra_swapchain *sw); + + // Present the frame. Issued in lockstep with start_frame, with rendering + // commands in between. The `frame` is just there for timing data, for + // swapchains smart enough to do something with it. + bool (*submit_frame)(struct ra_swapchain *sw, const struct vo_frame *frame); + + // Performs a buffer swap. This blocks for as long as necessary to meet + // params.swapchain_depth, or until the next vblank (for vsynced contexts) + void (*swap_buffers)(struct ra_swapchain *sw); +}; + +// Create and destroy a ra_ctx. This also takes care of creating and destroying +// the underlying `struct ra`, and perhaps the underlying VO backend. +struct ra_ctx *ra_ctx_create(struct vo *vo, const char *context_type, + const char *context_name, struct ra_ctx_opts opts); +void ra_ctx_destroy(struct ra_ctx **ctx); + +struct m_option; +int ra_ctx_validate_api(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param); +int ra_ctx_validate_context(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param); diff --git a/video/out/gpu/hwdec.c b/video/out/gpu/hwdec.c new file mode 100644 index 0000000000..5fbc1aa4a9 --- /dev/null +++ b/video/out/gpu/hwdec.c @@ -0,0 +1,239 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . 
+ */ + +#include +#include + +#include "config.h" + +#include "common/common.h" +#include "common/msg.h" +#include "options/m_config.h" +#include "hwdec.h" + +extern const struct ra_hwdec_driver ra_hwdec_vaegl; +extern const struct ra_hwdec_driver ra_hwdec_vaglx; +extern const struct ra_hwdec_driver ra_hwdec_videotoolbox; +extern const struct ra_hwdec_driver ra_hwdec_vdpau; +extern const struct ra_hwdec_driver ra_hwdec_dxva2egl; +extern const struct ra_hwdec_driver ra_hwdec_d3d11egl; +extern const struct ra_hwdec_driver ra_hwdec_d3d11eglrgb; +extern const struct ra_hwdec_driver ra_hwdec_dxva2gldx; +extern const struct ra_hwdec_driver ra_hwdec_dxva2; +extern const struct ra_hwdec_driver ra_hwdec_cuda; +extern const struct ra_hwdec_driver ra_hwdec_rpi_overlay; + +static const struct ra_hwdec_driver *const mpgl_hwdec_drivers[] = { +#if HAVE_VAAPI_EGL + &ra_hwdec_vaegl, +#endif +#if HAVE_VAAPI_GLX + &ra_hwdec_vaglx, +#endif +#if HAVE_VDPAU_GL_X11 + &ra_hwdec_vdpau, +#endif +#if HAVE_VIDEOTOOLBOX_GL || HAVE_IOS_GL + &ra_hwdec_videotoolbox, +#endif +#if HAVE_D3D_HWACCEL + &ra_hwdec_d3d11egl, + &ra_hwdec_d3d11eglrgb, + #if HAVE_D3D9_HWACCEL + &ra_hwdec_dxva2egl, + #endif +#endif +#if HAVE_GL_DXINTEROP_D3D9 + &ra_hwdec_dxva2gldx, +#endif +#if HAVE_CUDA_HWACCEL + &ra_hwdec_cuda, +#endif +#if HAVE_RPI + &ra_hwdec_rpi_overlay, +#endif + NULL +}; + +static struct ra_hwdec *load_hwdec_driver(struct mp_log *log, struct ra *ra, + struct mpv_global *global, + struct mp_hwdec_devices *devs, + const struct ra_hwdec_driver *drv, + bool is_auto) +{ + struct ra_hwdec *hwdec = talloc(NULL, struct ra_hwdec); + *hwdec = (struct ra_hwdec) { + .driver = drv, + .log = mp_log_new(hwdec, log, drv->name), + .global = global, + .ra = ra, + .devs = devs, + .probing = is_auto, + .priv = talloc_zero_size(hwdec, drv->priv_size), + }; + mp_verbose(log, "Loading hwdec driver '%s'\n", drv->name); + if (hwdec->driver->init(hwdec) < 0) { + ra_hwdec_uninit(hwdec); + mp_verbose(log, "Loading failed.\n"); + return NULL; + } + return hwdec; +} + +struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, + struct mpv_global *g, + struct mp_hwdec_devices *devs, + enum hwdec_type api) +{ + bool is_auto = HWDEC_IS_AUTO(api); + for (int n = 0; mpgl_hwdec_drivers[n]; n++) { + const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; + if ((is_auto || api == drv->api) && !drv->testing_only) { + struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, is_auto); + if (r) + return r; + } + } + return NULL; +} + +// Load by option name. 
+struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, + struct mpv_global *g, + struct mp_hwdec_devices *devs, + const char *name) +{ + int g_hwdec_api; + mp_read_option_raw(g, "hwdec", &m_option_type_choice, &g_hwdec_api); + if (!name || !name[0]) + name = m_opt_choice_str(mp_hwdec_names, g_hwdec_api); + + int api_id = HWDEC_NONE; + for (int n = 0; mp_hwdec_names[n].name; n++) { + if (name && strcmp(mp_hwdec_names[n].name, name) == 0) + api_id = mp_hwdec_names[n].value; + } + + for (int n = 0; mpgl_hwdec_drivers[n]; n++) { + const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; + if (name && strcmp(drv->name, name) == 0) { + struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, false); + if (r) + return r; + } + } + + return ra_hwdec_load_api(log, ra, g, devs, api_id); +} + +int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + bool help = bstr_equals0(param, "help"); + if (help) + mp_info(log, "Available hwdecs:\n"); + for (int n = 0; mpgl_hwdec_drivers[n]; n++) { + const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; + const char *api_name = m_opt_choice_str(mp_hwdec_names, drv->api); + if (help) { + mp_info(log, " %s [%s]\n", drv->name, api_name); + } else if (bstr_equals0(param, drv->name) || + bstr_equals0(param, api_name)) + { + return 1; + } + } + if (help) { + mp_info(log, " auto (loads best)\n" + " (other --hwdec values)\n" + "Setting an empty string means use --hwdec.\n"); + return M_OPT_EXIT; + } + if (!param.len) + return 1; // "" is treated specially + for (int n = 0; mp_hwdec_names[n].name; n++) { + if (bstr_equals0(param, mp_hwdec_names[n].name)) + return 1; + } + mp_fatal(log, "No hwdec backend named '%.*s' found!\n", BSTR_P(param)); + return M_OPT_INVALID; +} + +void ra_hwdec_uninit(struct ra_hwdec *hwdec) +{ + if (hwdec) + hwdec->driver->uninit(hwdec); + talloc_free(hwdec); +} + +bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt) +{ + for (int n = 0; hwdec->driver->imgfmts[n]; n++) { + if (hwdec->driver->imgfmts[n] == imgfmt) + return true; + } + return false; +} + +struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec, + struct mp_image_params *params) +{ + assert(ra_hwdec_test_format(hwdec, params->imgfmt)); + + struct ra_hwdec_mapper *mapper = talloc_ptrtype(NULL, mapper); + *mapper = (struct ra_hwdec_mapper){ + .owner = hwdec, + .driver = hwdec->driver->mapper, + .log = hwdec->log, + .ra = hwdec->ra, + .priv = talloc_zero_size(mapper, hwdec->driver->mapper->priv_size), + .src_params = *params, + .dst_params = *params, + }; + if (mapper->driver->init(mapper) < 0) + ra_hwdec_mapper_free(&mapper); + return mapper; +} + +void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper) +{ + struct ra_hwdec_mapper *p = *mapper; + if (p) { + ra_hwdec_mapper_unmap(p); + p->driver->uninit(p); + talloc_free(p); + } + *mapper = NULL; +} + +void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper) +{ + if (mapper->driver->unmap) + mapper->driver->unmap(mapper); + mp_image_unrefp(&mapper->src); +} + +int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img) +{ + ra_hwdec_mapper_unmap(mapper); + mp_image_setrefp(&mapper->src, img); + if (mapper->driver->map(mapper) < 0) { + ra_hwdec_mapper_unmap(mapper); + return -1; + } + return 0; +} diff --git a/video/out/gpu/hwdec.h b/video/out/gpu/hwdec.h new file mode 100644 index 0000000000..20bbaae9eb --- /dev/null +++ b/video/out/gpu/hwdec.h @@ -0,0 +1,130 @@ +#ifndef MPGL_HWDEC_H_ +#define MPGL_HWDEC_H_ + 
+#include "video/mp_image.h" +#include "ra.h" +#include "video/hwdec.h" + +struct ra_hwdec { + const struct ra_hwdec_driver *driver; + struct mp_log *log; + struct mpv_global *global; + struct ra *ra; + struct mp_hwdec_devices *devs; + // GLSL extensions required to sample textures from this. + const char **glsl_extensions; + // For free use by hwdec driver + void *priv; + // For working around the vdpau vs. vaapi mess. + bool probing; + // Used in overlay mode only. + float overlay_colorkey[4]; +}; + +struct ra_hwdec_mapper { + const struct ra_hwdec_mapper_driver *driver; + struct mp_log *log; + struct ra *ra; + void *priv; + struct ra_hwdec *owner; + // Input frame parameters. (Set before init(), immutable.) + struct mp_image_params src_params; + // Output frame parameters (represents the format the textures return). Must + // be set by init(), immutable afterwards, + struct mp_image_params dst_params; + + // The currently mapped source image (or the image about to be mapped in + // ->map()). NULL if unmapped. The mapper can also clear this reference if + // the mapped textures contain a full copy. + struct mp_image *src; + + // The mapped textures and metadata about them. These fields change if a + // new frame is mapped (or unmapped), but otherwise remain constant. + // The common code won't mess with these, so you can e.g. set them in the + // .init() callback. + struct ra_tex *tex[4]; + bool vdpau_fields; +}; + +// This can be used to map frames of a specific hw format as GL textures. +struct ra_hwdec_mapper_driver { + // Used to create ra_hwdec_mapper.priv. + size_t priv_size; + + // Init the mapper implementation. At this point, the field src_params, + // fns, devs, priv are initialized. + int (*init)(struct ra_hwdec_mapper *mapper); + // Destroy the mapper. unmap is called before this. + void (*uninit)(struct ra_hwdec_mapper *mapper); + + // Map mapper->src as texture, and set mapper->frame to textures using it. + // It is expected that that the textures remain valid until the next unmap + // or uninit call. + // The function is allowed to unref mapper->src if it's not needed (i.e. + // this function creates a copy). + // The underlying format can change, so you might need to do some form + // of change detection. You also must reject unsupported formats with an + // error. + // On error, returns negative value on error and remains unmapped. + int (*map)(struct ra_hwdec_mapper *mapper); + // Unmap the frame. Does nothing if already unmapped. Optional. + void (*unmap)(struct ra_hwdec_mapper *mapper); +}; + +struct ra_hwdec_driver { + // Name of the interop backend. This is used for informational purposes only. + const char *name; + // Used to create ra_hwdec.priv. + size_t priv_size; + // Used to explicitly request a specific API. + enum hwdec_type api; + // One of the hardware surface IMGFMT_ that must be passed to map_image later. + // Terminated with a 0 entry. (Extend the array size as needed.) + const int imgfmts[3]; + // Dosn't load this unless requested by name. + bool testing_only; + + // Create the hwdec device. It must add it to hw->devs, if applicable. + int (*init)(struct ra_hwdec *hw); + void (*uninit)(struct ra_hwdec *hw); + + // This will be used to create a ra_hwdec_mapper from ra_hwdec. + const struct ra_hwdec_mapper_driver *mapper; + + // The following function provides an alternative API. Each ra_hwdec_driver + // must have either provide a mapper or overlay_frame (not both or none), and + // if overlay_frame is set, it operates in overlay mode. 
In this mode, + // OSD etc. is rendered via OpenGL, but the video is rendered as a separate + // layer below it. + // Non-overlay mode is strictly preferred, so try not to use overlay mode. + // Set the given frame as overlay, replacing the previous one. This can also + // just change the position of the overlay. + // hw_image==src==dst==NULL is passed to clear the overlay. + int (*overlay_frame)(struct ra_hwdec *hw, struct mp_image *hw_image, + struct mp_rect *src, struct mp_rect *dst, bool newframe); +}; + +struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, + struct mpv_global *g, + struct mp_hwdec_devices *devs, + enum hwdec_type api); + +struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, + struct mpv_global *g, + struct mp_hwdec_devices *devs, + const char *name); + +int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param); + +void ra_hwdec_uninit(struct ra_hwdec *hwdec); + +bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt); + +struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec, + struct mp_image_params *params); +void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper); +void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper); +int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img); + +#endif diff --git a/video/out/gpu/lcms.c b/video/out/gpu/lcms.c new file mode 100644 index 0000000000..8747ae6aa6 --- /dev/null +++ b/video/out/gpu/lcms.c @@ -0,0 +1,531 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . 
+ */ + +#include +#include + +#include "mpv_talloc.h" + +#include "config.h" + +#include "stream/stream.h" +#include "common/common.h" +#include "misc/bstr.h" +#include "common/msg.h" +#include "options/m_option.h" +#include "options/path.h" +#include "video/csputils.h" +#include "lcms.h" + +#include "osdep/io.h" + +#if HAVE_LCMS2 + +#include +#include +#include + +struct gl_lcms { + void *icc_data; + size_t icc_size; + struct AVBufferRef *vid_profile; + char *current_profile; + bool using_memory_profile; + bool changed; + enum mp_csp_prim current_prim; + enum mp_csp_trc current_trc; + + struct mp_log *log; + struct mpv_global *global; + struct mp_icc_opts *opts; +}; + +static bool parse_3dlut_size(const char *arg, int *p1, int *p2, int *p3) +{ + if (sscanf(arg, "%dx%dx%d", p1, p2, p3) != 3) + return false; + for (int n = 0; n < 3; n++) { + int s = ((int[]) { *p1, *p2, *p3 })[n]; + if (s < 2 || s > 512) + return false; + } + return true; +} + +static int validate_3dlut_size_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + int p1, p2, p3; + char s[20]; + snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); + return parse_3dlut_size(s, &p1, &p2, &p3); +} + +#define OPT_BASE_STRUCT struct mp_icc_opts +const struct m_sub_options mp_icc_conf = { + .opts = (const m_option_t[]) { + OPT_FLAG("use-embedded-icc-profile", use_embedded, 0), + OPT_STRING("icc-profile", profile, M_OPT_FILE), + OPT_FLAG("icc-profile-auto", profile_auto, 0), + OPT_STRING("icc-cache-dir", cache_dir, M_OPT_FILE), + OPT_INT("icc-intent", intent, 0), + OPT_INTRANGE("icc-contrast", contrast, 0, 0, 100000), + OPT_STRING_VALIDATE("icc-3dlut-size", size_str, 0, validate_3dlut_size_opt), + + OPT_REPLACED("3dlut-size", "icc-3dlut-size"), + OPT_REMOVED("icc-cache", "see icc-cache-dir"), + {0} + }, + .size = sizeof(struct mp_icc_opts), + .defaults = &(const struct mp_icc_opts) { + .size_str = "64x64x64", + .intent = INTENT_RELATIVE_COLORIMETRIC, + .use_embedded = true, + }, +}; + +static void lcms2_error_handler(cmsContext ctx, cmsUInt32Number code, + const char *msg) +{ + struct gl_lcms *p = cmsGetContextUserData(ctx); + MP_ERR(p, "lcms2: %s\n", msg); +} + +static void load_profile(struct gl_lcms *p) +{ + talloc_free(p->icc_data); + p->icc_data = NULL; + p->icc_size = 0; + p->using_memory_profile = false; + talloc_free(p->current_profile); + p->current_profile = NULL; + + if (!p->opts->profile || !p->opts->profile[0]) + return; + + char *fname = mp_get_user_path(NULL, p->global, p->opts->profile); + MP_VERBOSE(p, "Opening ICC profile '%s'\n", fname); + struct bstr iccdata = stream_read_file(fname, p, p->global, + 100000000); // 100 MB + talloc_free(fname); + if (!iccdata.len) + return; + + talloc_free(p->icc_data); + + p->icc_data = iccdata.start; + p->icc_size = iccdata.len; + p->current_profile = talloc_strdup(p, p->opts->profile); +} + +static void gl_lcms_destructor(void *ptr) +{ + struct gl_lcms *p = ptr; + av_buffer_unref(&p->vid_profile); +} + +struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, + struct mpv_global *global, + struct mp_icc_opts *opts) +{ + struct gl_lcms *p = talloc_ptrtype(talloc_ctx, p); + talloc_set_destructor(p, gl_lcms_destructor); + *p = (struct gl_lcms) { + .global = global, + .log = log, + .opts = opts, + }; + gl_lcms_update_options(p); + return p; +} + +void gl_lcms_update_options(struct gl_lcms *p) +{ + if ((p->using_memory_profile && !p->opts->profile_auto) || + !bstr_equals(bstr0(p->opts->profile), bstr0(p->current_profile))) + { + 
load_profile(p); + } + + p->changed = true; // probably +} + +// Warning: profile.start must point to a ta allocation, and the function +// takes over ownership. +// Returns whether the internal profile was changed. +bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) +{ + if (!p->opts->profile_auto || (p->opts->profile && p->opts->profile[0])) { + talloc_free(profile.start); + return false; + } + + if (p->using_memory_profile && + p->icc_data && profile.start && + profile.len == p->icc_size && + memcmp(profile.start, p->icc_data, p->icc_size) == 0) + { + talloc_free(profile.start); + return false; + } + + p->changed = true; + p->using_memory_profile = true; + + talloc_free(p->icc_data); + + p->icc_data = talloc_steal(p, profile.start); + p->icc_size = profile.len; + + return true; +} + +// Guards against NULL and uses bstr_equals to short-circuit some special cases +static bool vid_profile_eq(struct AVBufferRef *a, struct AVBufferRef *b) +{ + if (!a || !b) + return a == b; + + return bstr_equals((struct bstr){ a->data, a->size }, + (struct bstr){ b->data, b->size }); +} + +// Return whether the profile or config has changed since the last time it was +// retrieved. If it has changed, gl_lcms_get_lut3d() should be called. +bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, + enum mp_csp_trc trc, struct AVBufferRef *vid_profile) +{ + if (p->changed || p->current_prim != prim || p->current_trc != trc) + return true; + + return !vid_profile_eq(p->vid_profile, vid_profile); +} + +// Whether a profile is set. (gl_lcms_get_lut3d() is expected to return a lut, +// but it could still fail due to runtime errors, such as invalid icc data.) +bool gl_lcms_has_profile(struct gl_lcms *p) +{ + return p->icc_size > 0; +} + +static cmsHPROFILE get_vid_profile(struct gl_lcms *p, cmsContext cms, + cmsHPROFILE disp_profile, + enum mp_csp_prim prim, enum mp_csp_trc trc) +{ + if (p->opts->use_embedded && p->vid_profile) { + // Try using the embedded ICC profile + cmsHPROFILE prof = cmsOpenProfileFromMemTHR(cms, p->vid_profile->data, + p->vid_profile->size); + if (prof) { + MP_VERBOSE(p, "Successfully opened embedded ICC profile\n"); + return prof; + } + + // Otherwise, warn the user and generate the profile as usual + MP_WARN(p, "Video contained an invalid ICC profile! 
Ignoring..\n"); + } + + // The input profile for the transformation is dependent on the video + // primaries and transfer characteristics + struct mp_csp_primaries csp = mp_get_csp_primaries(prim); + cmsCIExyY wp_xyY = {csp.white.x, csp.white.y, 1.0}; + cmsCIExyYTRIPLE prim_xyY = { + .Red = {csp.red.x, csp.red.y, 1.0}, + .Green = {csp.green.x, csp.green.y, 1.0}, + .Blue = {csp.blue.x, csp.blue.y, 1.0}, + }; + + cmsToneCurve *tonecurve[3] = {0}; + switch (trc) { + case MP_CSP_TRC_LINEAR: tonecurve[0] = cmsBuildGamma(cms, 1.0); break; + case MP_CSP_TRC_GAMMA18: tonecurve[0] = cmsBuildGamma(cms, 1.8); break; + case MP_CSP_TRC_GAMMA22: tonecurve[0] = cmsBuildGamma(cms, 2.2); break; + case MP_CSP_TRC_GAMMA28: tonecurve[0] = cmsBuildGamma(cms, 2.8); break; + + case MP_CSP_TRC_SRGB: + // Values copied from Little-CMS + tonecurve[0] = cmsBuildParametricToneCurve(cms, 4, + (double[5]){2.40, 1/1.055, 0.055/1.055, 1/12.92, 0.04045}); + break; + + case MP_CSP_TRC_PRO_PHOTO: + tonecurve[0] = cmsBuildParametricToneCurve(cms, 4, + (double[5]){1.8, 1.0, 0.0, 1/16.0, 0.03125}); + break; + + case MP_CSP_TRC_BT_1886: { + // To build an appropriate BT.1886 transformation we need access to + // the display's black point, so we LittleCMS' detection function. + // Relative colorimetric is used since we want to approximate the + // BT.1886 to the target device's actual black point even in e.g. + // perceptual mode + const int intent = MP_INTENT_RELATIVE_COLORIMETRIC; + cmsCIEXYZ bp_XYZ; + if (!cmsDetectBlackPoint(&bp_XYZ, disp_profile, intent, 0)) + return false; + + // Map this XYZ value back into the (linear) source space + cmsToneCurve *linear = cmsBuildGamma(cms, 1.0); + cmsHPROFILE rev_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY, + (cmsToneCurve*[3]){linear, linear, linear}); + cmsHPROFILE xyz_profile = cmsCreateXYZProfile(); + cmsHTRANSFORM xyz2src = cmsCreateTransformTHR(cms, + xyz_profile, TYPE_XYZ_DBL, rev_profile, TYPE_RGB_DBL, + intent, 0); + cmsFreeToneCurve(linear); + cmsCloseProfile(rev_profile); + cmsCloseProfile(xyz_profile); + if (!xyz2src) + return false; + + double src_black[3]; + cmsDoTransform(xyz2src, &bp_XYZ, src_black, 1); + cmsDeleteTransform(xyz2src); + + // Contrast limiting + if (p->opts->contrast > 0) { + for (int i = 0; i < 3; i++) + src_black[i] = MPMAX(src_black[i], 1.0 / p->opts->contrast); + } + + // Built-in contrast failsafe + double contrast = 3.0 / (src_black[0] + src_black[1] + src_black[2]); + if (contrast > 100000) { + MP_WARN(p, "ICC profile detected contrast very high (>100000)," + " falling back to contrast 1000 for sanity. 
Set the" + " icc-contrast option to silence this warning.\n"); + src_black[0] = src_black[1] = src_black[2] = 1.0 / 1000; + } + + // Build the parametric BT.1886 transfer curve, one per channel + for (int i = 0; i < 3; i++) { + const double gamma = 2.40; + double binv = pow(src_black[i], 1.0/gamma); + tonecurve[i] = cmsBuildParametricToneCurve(cms, 6, + (double[4]){gamma, 1.0 - binv, binv, 0.0}); + } + break; + } + + default: + abort(); + } + + if (!tonecurve[0]) + return false; + + if (!tonecurve[1]) tonecurve[1] = tonecurve[0]; + if (!tonecurve[2]) tonecurve[2] = tonecurve[0]; + + cmsHPROFILE *vid_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY, + tonecurve); + + if (tonecurve[2] != tonecurve[0]) cmsFreeToneCurve(tonecurve[2]); + if (tonecurve[1] != tonecurve[0]) cmsFreeToneCurve(tonecurve[1]); + cmsFreeToneCurve(tonecurve[0]); + + return vid_profile; +} + +bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d, + enum mp_csp_prim prim, enum mp_csp_trc trc, + struct AVBufferRef *vid_profile) +{ + int s_r, s_g, s_b; + bool result = false; + + p->changed = false; + p->current_prim = prim; + p->current_trc = trc; + + // We need to hold on to a reference to the video's ICC profile for as long + // as we still need to perform equality checking, so generate a new + // reference here + av_buffer_unref(&p->vid_profile); + if (vid_profile) { + MP_VERBOSE(p, "Got an embedded ICC profile.\n"); + p->vid_profile = av_buffer_ref(vid_profile); + if (!p->vid_profile) + abort(); + } + + if (!parse_3dlut_size(p->opts->size_str, &s_r, &s_g, &s_b)) + return false; + + if (!gl_lcms_has_profile(p)) + return false; + + void *tmp = talloc_new(NULL); + uint16_t *output = talloc_array(tmp, uint16_t, s_r * s_g * s_b * 4); + struct lut3d *lut = NULL; + cmsContext cms = NULL; + + char *cache_file = NULL; + if (p->opts->cache_dir && p->opts->cache_dir[0]) { + // Gamma is included in the header to help uniquely identify it, + // because we may change the parameter in the future or make it + // customizable, same for the primaries. 
+ char *cache_info = talloc_asprintf(tmp, + "ver=1.4, intent=%d, size=%dx%dx%d, prim=%d, trc=%d, " + "contrast=%d\n", + p->opts->intent, s_r, s_g, s_b, prim, trc, p->opts->contrast); + + uint8_t hash[32]; + struct AVSHA *sha = av_sha_alloc(); + if (!sha) + abort(); + av_sha_init(sha, 256); + av_sha_update(sha, cache_info, strlen(cache_info)); + if (vid_profile) + av_sha_update(sha, vid_profile->data, vid_profile->size); + av_sha_update(sha, p->icc_data, p->icc_size); + av_sha_final(sha, hash); + av_free(sha); + + char *cache_dir = mp_get_user_path(tmp, p->global, p->opts->cache_dir); + cache_file = talloc_strdup(tmp, ""); + for (int i = 0; i < sizeof(hash); i++) + cache_file = talloc_asprintf_append(cache_file, "%02X", hash[i]); + cache_file = mp_path_join(tmp, cache_dir, cache_file); + + mp_mkdirp(cache_dir); + } + + // check cache + if (cache_file && stat(cache_file, &(struct stat){0}) == 0) { + MP_VERBOSE(p, "Opening 3D LUT cache in file '%s'.\n", cache_file); + struct bstr cachedata = stream_read_file(cache_file, tmp, p->global, + 1000000000); // 1 GB + if (cachedata.len == talloc_get_size(output)) { + memcpy(output, cachedata.start, cachedata.len); + goto done; + } else { + MP_WARN(p, "3D LUT cache invalid!\n"); + } + } + + cms = cmsCreateContext(NULL, p); + if (!cms) + goto error_exit; + cmsSetLogErrorHandlerTHR(cms, lcms2_error_handler); + + cmsHPROFILE profile = + cmsOpenProfileFromMemTHR(cms, p->icc_data, p->icc_size); + if (!profile) + goto error_exit; + + cmsHPROFILE vid_hprofile = get_vid_profile(p, cms, profile, prim, trc); + if (!vid_hprofile) { + cmsCloseProfile(profile); + goto error_exit; + } + + cmsHTRANSFORM trafo = cmsCreateTransformTHR(cms, vid_hprofile, TYPE_RGB_16, + profile, TYPE_RGBA_16, + p->opts->intent, + cmsFLAGS_HIGHRESPRECALC | + cmsFLAGS_BLACKPOINTCOMPENSATION); + cmsCloseProfile(profile); + cmsCloseProfile(vid_hprofile); + + if (!trafo) + goto error_exit; + + // transform a (s_r)x(s_g)x(s_b) cube, with 3 components per channel + uint16_t *input = talloc_array(tmp, uint16_t, s_r * 3); + for (int b = 0; b < s_b; b++) { + for (int g = 0; g < s_g; g++) { + for (int r = 0; r < s_r; r++) { + input[r * 3 + 0] = r * 65535 / (s_r - 1); + input[r * 3 + 1] = g * 65535 / (s_g - 1); + input[r * 3 + 2] = b * 65535 / (s_b - 1); + } + size_t base = (b * s_r * s_g + g * s_r) * 4; + cmsDoTransform(trafo, input, output + base, s_r); + } + } + + cmsDeleteTransform(trafo); + + if (cache_file) { + FILE *out = fopen(cache_file, "wb"); + if (out) { + fwrite(output, talloc_get_size(output), 1, out); + fclose(out); + } + } + +done: ; + + lut = talloc_ptrtype(NULL, lut); + *lut = (struct lut3d) { + .data = talloc_steal(lut, output), + .size = {s_r, s_g, s_b}, + }; + + *result_lut3d = lut; + result = true; + +error_exit: + + if (cms) + cmsDeleteContext(cms); + + if (!lut) + MP_FATAL(p, "Error loading ICC profile.\n"); + + talloc_free(tmp); + return result; +} + +#else /* HAVE_LCMS2 */ + +const struct m_sub_options mp_icc_conf = { + .opts = (const m_option_t[]) { {0} }, + .size = sizeof(struct mp_icc_opts), + .defaults = &(const struct mp_icc_opts) {0}, +}; + +struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, + struct mpv_global *global, + struct mp_icc_opts *opts) +{ + return (struct gl_lcms *) talloc_new(talloc_ctx); +} + +void gl_lcms_update_options(struct gl_lcms *p) { } +bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) {return false;} + +bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, + enum mp_csp_trc trc, struct AVBufferRef 
*vid_profile) +{ + return false; +} + +bool gl_lcms_has_profile(struct gl_lcms *p) +{ + return false; +} + +bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d, + enum mp_csp_prim prim, enum mp_csp_trc trc, + struct AVBufferRef *vid_profile) +{ + return false; +} + +#endif diff --git a/video/out/gpu/lcms.h b/video/out/gpu/lcms.h new file mode 100644 index 0000000000..35bbd61fe0 --- /dev/null +++ b/video/out/gpu/lcms.h @@ -0,0 +1,43 @@ +#ifndef MP_GL_LCMS_H +#define MP_GL_LCMS_H + +#include +#include +#include "misc/bstr.h" +#include "video/csputils.h" +#include + +extern const struct m_sub_options mp_icc_conf; + +struct mp_icc_opts { + int use_embedded; + char *profile; + int profile_auto; + char *cache_dir; + char *size_str; + int intent; + int contrast; +}; + +struct lut3d { + uint16_t *data; + int size[3]; +}; + +struct mp_log; +struct mpv_global; +struct gl_lcms; + +struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, + struct mpv_global *global, + struct mp_icc_opts *opts); +void gl_lcms_update_options(struct gl_lcms *p); +bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile); +bool gl_lcms_has_profile(struct gl_lcms *p); +bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **, + enum mp_csp_prim prim, enum mp_csp_trc trc, + struct AVBufferRef *vid_profile); +bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, + enum mp_csp_trc trc, struct AVBufferRef *vid_profile); + +#endif diff --git a/video/out/gpu/osd.c b/video/out/gpu/osd.c new file mode 100644 index 0000000000..f7c325d1db --- /dev/null +++ b/video/out/gpu/osd.c @@ -0,0 +1,367 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . 
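
Before gpu/osd.c gets going, a note on the layout of the table produced by gl_lcms_get_lut3d() above (struct lut3d in lcms.h): the red index varies fastest, so the array maps directly onto an RGBA16 3D texture of size[0] x size[1] x size[2]. The accessor below is only an illustrative sketch, not part of the patch:

static const uint16_t *lut3d_texel(const struct lut3d *lut, int r, int g, int b)
{
    // Mirrors the fill loop in gl_lcms_get_lut3d():
    // base = (b * s_r * s_g + g * s_r) * 4, plus r texels within the row.
    size_t idx = ((size_t)b * lut->size[1] + g) * lut->size[0] + r;
    return &lut->data[idx * 4];   // 4 components (RGBA) per entry
}
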
+ */ + +#include +#include +#include + +#include + +#include "common/common.h" +#include "common/msg.h" +#include "video/csputils.h" +#include "video/mp_image.h" +#include "osd.h" + +#define GLSL(x) gl_sc_add(sc, #x "\n"); + +// glBlendFuncSeparate() arguments +static const int blend_factors[SUBBITMAP_COUNT][4] = { + [SUBBITMAP_LIBASS] = {RA_BLEND_SRC_ALPHA, RA_BLEND_ONE_MINUS_SRC_ALPHA, + RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA}, + [SUBBITMAP_RGBA] = {RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA, + RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA}, +}; + +struct vertex { + float position[2]; + float texcoord[2]; + uint8_t ass_color[4]; +}; + +static const struct ra_renderpass_input vertex_vao[] = { + {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, + {"texcoord" , RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord)}, + {"ass_color", RA_VARTYPE_BYTE_UNORM, 4, 1, offsetof(struct vertex, ass_color)}, + {0} +}; + +struct mpgl_osd_part { + enum sub_bitmap_format format; + int change_id; + struct ra_tex *texture; + int w, h; + int num_subparts; + int prev_num_subparts; + struct sub_bitmap *subparts; + int num_vertices; + struct vertex *vertices; +}; + +struct mpgl_osd { + struct mp_log *log; + struct osd_state *osd; + struct ra *ra; + struct mpgl_osd_part *parts[MAX_OSD_PARTS]; + const struct ra_format *fmt_table[SUBBITMAP_COUNT]; + bool formats[SUBBITMAP_COUNT]; + bool change_flag; // for reporting to API user only + // temporary + int stereo_mode; + struct mp_osd_res osd_res; + void *scratch; +}; + +struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, + struct osd_state *osd) +{ + struct mpgl_osd *ctx = talloc_ptrtype(NULL, ctx); + *ctx = (struct mpgl_osd) { + .log = log, + .osd = osd, + .ra = ra, + .change_flag = true, + .scratch = talloc_zero_size(ctx, 1), + }; + + ctx->fmt_table[SUBBITMAP_LIBASS] = ra_find_unorm_format(ra, 1, 1); + ctx->fmt_table[SUBBITMAP_RGBA] = ra_find_unorm_format(ra, 1, 4); + + for (int n = 0; n < MAX_OSD_PARTS; n++) + ctx->parts[n] = talloc_zero(ctx, struct mpgl_osd_part); + + for (int n = 0; n < SUBBITMAP_COUNT; n++) + ctx->formats[n] = !!ctx->fmt_table[n]; + + return ctx; +} + +void mpgl_osd_destroy(struct mpgl_osd *ctx) +{ + if (!ctx) + return; + + for (int n = 0; n < MAX_OSD_PARTS; n++) { + struct mpgl_osd_part *p = ctx->parts[n]; + ra_tex_free(ctx->ra, &p->texture); + } + talloc_free(ctx); +} + +static int next_pow2(int v) +{ + for (int x = 0; x < 30; x++) { + if ((1 << x) >= v) + return 1 << x; + } + return INT_MAX; +} + +static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd, + struct sub_bitmaps *imgs) +{ + struct ra *ra = ctx->ra; + bool ok = false; + + assert(imgs->packed); + + int req_w = next_pow2(imgs->packed_w); + int req_h = next_pow2(imgs->packed_h); + + const struct ra_format *fmt = ctx->fmt_table[imgs->format]; + assert(fmt); + + if (!osd->texture || req_w > osd->w || req_h > osd->h || + osd->format != imgs->format) + { + ra_tex_free(ra, &osd->texture); + + osd->format = imgs->format; + osd->w = FFMAX(32, req_w); + osd->h = FFMAX(32, req_h); + + MP_VERBOSE(ctx, "Reallocating OSD texture to %dx%d.\n", osd->w, osd->h); + + if (osd->w > ra->max_texture_wh || osd->h > ra->max_texture_wh) { + MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum " + "supported size %dx%d.\n", ra->max_texture_wh, + ra->max_texture_wh); + goto done; + } + + struct ra_tex_params params = { + .dimensions = 2, + .w = osd->w, + .h = osd->h, + .d = 1, + .format = fmt, + .render_src = true, + .src_linear = true, + 
.host_mutable = true, + }; + osd->texture = ra_tex_create(ra, ¶ms); + if (!osd->texture) + goto done; + } + + struct ra_tex_upload_params params = { + .tex = osd->texture, + .src = imgs->packed->planes[0], + .invalidate = true, + .rc = &(struct mp_rect){0, 0, imgs->packed_w, imgs->packed_h}, + .stride = imgs->packed->stride[0], + }; + + ok = ra->fns->tex_upload(ra, ¶ms); + +done: + return ok; +} + +static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs) +{ + struct mpgl_osd *ctx = pctx; + + if (imgs->num_parts == 0 || !ctx->formats[imgs->format]) + return; + + struct mpgl_osd_part *osd = ctx->parts[imgs->render_index]; + + bool ok = true; + if (imgs->change_id != osd->change_id) { + if (!upload_osd(ctx, osd, imgs)) + ok = false; + + osd->change_id = imgs->change_id; + ctx->change_flag = true; + } + osd->num_subparts = ok ? imgs->num_parts : 0; + + MP_TARRAY_GROW(osd, osd->subparts, osd->num_subparts); + memcpy(osd->subparts, imgs->parts, + osd->num_subparts * sizeof(osd->subparts[0])); +} + +bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, + struct gl_shader_cache *sc) +{ + assert(index >= 0 && index < MAX_OSD_PARTS); + struct mpgl_osd_part *part = ctx->parts[index]; + + enum sub_bitmap_format fmt = part->format; + if (!fmt || !part->num_subparts) + return false; + + gl_sc_uniform_texture(sc, "osdtex", part->texture); + switch (fmt) { + case SUBBITMAP_RGBA: { + GLSL(color = texture(osdtex, texcoord).bgra;) + break; + } + case SUBBITMAP_LIBASS: { + GLSL(color = + vec4(ass_color.rgb, ass_color.a * texture(osdtex, texcoord).r);) + break; + } + default: + abort(); + } + + gl_sc_set_vertex_format(sc, vertex_vao, sizeof(struct vertex)); + + return true; +} + +static void write_quad(struct vertex *va, struct gl_transform t, + float x0, float y0, float x1, float y1, + float tx0, float ty0, float tx1, float ty1, + float tex_w, float tex_h, const uint8_t color[4]) +{ + gl_transform_vec(t, &x0, &y0); + gl_transform_vec(t, &x1, &y1); + +#define COLOR_INIT {color[0], color[1], color[2], color[3]} + va[0] = (struct vertex){ {x0, y0}, {tx0 / tex_w, ty0 / tex_h}, COLOR_INIT }; + va[1] = (struct vertex){ {x0, y1}, {tx0 / tex_w, ty1 / tex_h}, COLOR_INIT }; + va[2] = (struct vertex){ {x1, y0}, {tx1 / tex_w, ty0 / tex_h}, COLOR_INIT }; + va[3] = (struct vertex){ {x1, y1}, {tx1 / tex_w, ty1 / tex_h}, COLOR_INIT }; + va[4] = va[2]; + va[5] = va[1]; +#undef COLOR_INIT +} + +static void generate_verts(struct mpgl_osd_part *part, struct gl_transform t) +{ + int num_vertices = part->num_subparts * 6; + MP_TARRAY_GROW(part, part->vertices, part->num_vertices + num_vertices); + + for (int n = 0; n < part->num_subparts; n++) { + struct sub_bitmap *b = &part->subparts[n]; + struct vertex *va = &part->vertices[part->num_vertices]; + + // NOTE: the blend color is used with SUBBITMAP_LIBASS only, so it + // doesn't matter that we upload garbage for the other formats + uint32_t c = b->libass.color; + uint8_t color[4] = { c >> 24, (c >> 16) & 0xff, + (c >> 8) & 0xff, 255 - (c & 0xff) }; + + write_quad(&va[n * 6], t, + b->x, b->y, b->x + b->dw, b->y + b->dh, + b->src_x, b->src_y, b->src_x + b->w, b->src_y + b->h, + part->w, part->h, color); + } + + part->num_vertices += num_vertices; +} + +// number of screen divisions per axis (x=0, y=1) for the current 3D mode +static void get_3d_side_by_side(int stereo_mode, int div[2]) +{ + div[0] = div[1] = 1; + switch (stereo_mode) { + case MP_STEREO3D_SBS2L: + case MP_STEREO3D_SBS2R: div[0] = 2; break; + case MP_STEREO3D_AB2R: + case MP_STEREO3D_AB2L: div[1] = 2; 
break; + } +} + +void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, + struct gl_shader_cache *sc, struct fbodst target) +{ + struct mpgl_osd_part *part = ctx->parts[index]; + + int div[2]; + get_3d_side_by_side(ctx->stereo_mode, div); + + part->num_vertices = 0; + + for (int x = 0; x < div[0]; x++) { + for (int y = 0; y < div[1]; y++) { + struct gl_transform t; + gl_transform_ortho_fbodst(&t, target); + + float a_x = ctx->osd_res.w * x; + float a_y = ctx->osd_res.h * y; + t.t[0] += a_x * t.m[0][0] + a_y * t.m[1][0]; + t.t[1] += a_x * t.m[0][1] + a_y * t.m[1][1]; + + generate_verts(part, t); + } + } + + const int *factors = &blend_factors[part->format][0]; + gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]); + + gl_sc_dispatch_draw(sc, target.tex, part->vertices, part->num_vertices); +} + +static void set_res(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode) +{ + int div[2]; + get_3d_side_by_side(stereo_mode, div); + + res.w /= div[0]; + res.h /= div[1]; + ctx->osd_res = res; +} + +void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts, + int stereo_mode, int draw_flags) +{ + for (int n = 0; n < MAX_OSD_PARTS; n++) + ctx->parts[n]->num_subparts = 0; + + set_res(ctx, res, stereo_mode); + + osd_draw(ctx->osd, ctx->osd_res, pts, draw_flags, ctx->formats, gen_osd_cb, ctx); + ctx->stereo_mode = stereo_mode; + + // Parts going away does not necessarily result in gen_osd_cb() being called + // (not even with num_parts==0), so check this separately. + for (int n = 0; n < MAX_OSD_PARTS; n++) { + struct mpgl_osd_part *part = ctx->parts[n]; + if (part->num_subparts != part->prev_num_subparts) + ctx->change_flag = true; + part->prev_num_subparts = part->num_subparts; + } +} + +// See osd_resize() for remarks. This function is an optional optimization too. 
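
Taken together with mpgl_osd_resize(), which follows next, the functions above are driven once per frame roughly as sketched below. This is illustrative only, not code from the commit; pts, draw flags and stereo mode handling are simplified, and "sc" and "target" are supplied by the caller.

static void draw_osd_sketch(struct mpgl_osd *osd, struct gl_shader_cache *sc,
                            struct mp_osd_res res, double pts,
                            struct fbodst target)
{
    // Rasterize/update the OSD parts for this pts (runs gen_osd_cb internally).
    mpgl_osd_generate(osd, res, pts, 0, 0);

    for (int n = 0; n < MAX_OSD_PARTS; n++) {
        // Emits the sampling GLSL and binds the part's texture, if there is one.
        if (!mpgl_osd_draw_prepare(osd, n, sc))
            continue;
        // (A caller could append further shader code here.)
        mpgl_osd_draw_finish(osd, n, sc, target);  // builds vertices and draws
    }
}
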
+void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode) +{ + set_res(ctx, res, stereo_mode); + osd_resize(ctx->osd, ctx->osd_res); +} + +bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res, + double pts) +{ + ctx->change_flag = false; + mpgl_osd_generate(ctx, *res, pts, 0, 0); + return ctx->change_flag; +} diff --git a/video/out/gpu/osd.h b/video/out/gpu/osd.h new file mode 100644 index 0000000000..6c2b886de3 --- /dev/null +++ b/video/out/gpu/osd.h @@ -0,0 +1,25 @@ +#ifndef MPLAYER_GL_OSD_H +#define MPLAYER_GL_OSD_H + +#include +#include + +#include "utils.h" +#include "shader_cache.h" +#include "sub/osd.h" + +struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, + struct osd_state *osd); +void mpgl_osd_destroy(struct mpgl_osd *ctx); + +void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts, + int stereo_mode, int draw_flags); +void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode); +bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, + struct gl_shader_cache *sc); +void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, + struct gl_shader_cache *sc, struct fbodst target); +bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res, + double pts); + +#endif diff --git a/video/out/gpu/ra.c b/video/out/gpu/ra.c new file mode 100644 index 0000000000..ef1de54d1a --- /dev/null +++ b/video/out/gpu/ra.c @@ -0,0 +1,327 @@ +#include "common/common.h" +#include "common/msg.h" +#include "video/img_format.h" + +#include "ra.h" + +struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params) +{ + return ra->fns->tex_create(ra, params); +} + +void ra_tex_free(struct ra *ra, struct ra_tex **tex) +{ + if (*tex) + ra->fns->tex_destroy(ra, *tex); + *tex = NULL; +} + +struct ra_buf *ra_buf_create(struct ra *ra, const struct ra_buf_params *params) +{ + return ra->fns->buf_create(ra, params); +} + +void ra_buf_free(struct ra *ra, struct ra_buf **buf) +{ + if (*buf) + ra->fns->buf_destroy(ra, *buf); + *buf = NULL; +} + +void ra_free(struct ra **ra) +{ + if (*ra) + (*ra)->fns->destroy(*ra); + talloc_free(*ra); + *ra = NULL; +} + +size_t ra_vartype_size(enum ra_vartype type) +{ + switch (type) { + case RA_VARTYPE_INT: return sizeof(int); + case RA_VARTYPE_FLOAT: return sizeof(float); + case RA_VARTYPE_BYTE_UNORM: return 1; + default: return 0; + } +} + +struct ra_layout ra_renderpass_input_layout(struct ra_renderpass_input *input) +{ + size_t el_size = ra_vartype_size(input->type); + if (!el_size) + return (struct ra_layout){0}; + + // host data is always tightly packed + return (struct ra_layout) { + .align = 1, + .stride = el_size * input->dim_v, + .size = el_size * input->dim_v * input->dim_m, + }; +} + +static struct ra_renderpass_input *dup_inputs(void *ta_parent, + const struct ra_renderpass_input *inputs, int num_inputs) +{ + struct ra_renderpass_input *res = + talloc_memdup(ta_parent, (void *)inputs, num_inputs * sizeof(inputs[0])); + for (int n = 0; n < num_inputs; n++) + res[n].name = talloc_strdup(res, res[n].name); + return res; +} + +// Return a newly allocated deep-copy of params. 
+struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, + const struct ra_renderpass_params *params) +{ + struct ra_renderpass_params *res = talloc_ptrtype(ta_parent, res); + *res = *params; + res->inputs = dup_inputs(res, res->inputs, res->num_inputs); + res->vertex_attribs = + dup_inputs(res, res->vertex_attribs, res->num_vertex_attribs); + res->cached_program = bstrdup(res, res->cached_program); + res->vertex_shader = talloc_strdup(res, res->vertex_shader); + res->frag_shader = talloc_strdup(res, res->frag_shader); + res->compute_shader = talloc_strdup(res, res->compute_shader); + return res; +}; + + +// Return whether this is a tightly packed format with no external padding and +// with the same bit size/depth in all components, and the shader returns +// components in the same order as in memory. +static bool ra_format_is_regular(const struct ra_format *fmt) +{ + if (!fmt->pixel_size || !fmt->num_components || !fmt->ordered) + return false; + for (int n = 1; n < fmt->num_components; n++) { + if (fmt->component_size[n] != fmt->component_size[0] || + fmt->component_depth[n] != fmt->component_depth[0]) + return false; + } + if (fmt->component_size[0] * fmt->num_components != fmt->pixel_size * 8) + return false; + return true; +} + +// Return a regular filterable format using RA_CTYPE_UNORM. +const struct ra_format *ra_find_unorm_format(struct ra *ra, + int bytes_per_component, + int n_components) +{ + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (fmt->ctype == RA_CTYPE_UNORM && fmt->num_components == n_components && + fmt->pixel_size == bytes_per_component * n_components && + fmt->component_depth[0] == bytes_per_component * 8 && + fmt->linear_filter && ra_format_is_regular(fmt)) + return fmt; + } + return NULL; +} + +// Return a regular format using RA_CTYPE_UINT. +const struct ra_format *ra_find_uint_format(struct ra *ra, + int bytes_per_component, + int n_components) +{ + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (fmt->ctype == RA_CTYPE_UINT && fmt->num_components == n_components && + fmt->pixel_size == bytes_per_component * n_components && + fmt->component_depth[0] == bytes_per_component * 8 && + ra_format_is_regular(fmt)) + return fmt; + } + return NULL; +} + +// Find a float format of any precision that matches the C type of the same +// size for upload. +// May drop bits from the mantissa (such as selecting float16 even if +// bytes_per_component == 32); prefers possibly faster formats first. +static const struct ra_format *ra_find_float_format(struct ra *ra, + int bytes_per_component, + int n_components) +{ + // Assumes ra_format are ordered by performance. + // The >=16 check is to avoid catching fringe formats. + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (fmt->ctype == RA_CTYPE_FLOAT && fmt->num_components == n_components && + fmt->pixel_size == bytes_per_component * n_components && + fmt->component_depth[0] >= 16 && + fmt->linear_filter && ra_format_is_regular(fmt)) + return fmt; + } + return NULL; +} + +// Return a filterable regular format that uses at least float16 internally, and +// uses a normal C float for transfer on the CPU side. (This is just so we don't +// need 32->16 bit conversion on CPU, which would be messy.) 
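
Before ra_find_float16_format(), which the comment above introduces, a short illustration of how these lookup helpers get used. It is a sketch rather than code from the commit; the first two calls mirror gpu/osd.c earlier in this patch, and ra_get_imgfmt_desc() is defined a little further down in this file.

static void format_lookup_example(struct ra *ra)
{
    // 8-bit single-channel (e.g. libass alpha) and 8-bit RGBA:
    const struct ra_format *alpha = ra_find_unorm_format(ra, 1, 1);
    const struct ra_format *rgba  = ra_find_unorm_format(ra, 1, 4);

    // Per-plane texture formats plus component mapping for a video format:
    struct ra_imgfmt_desc desc;
    if (ra_get_imgfmt_desc(ra, IMGFMT_420P, &desc)) {
        // desc.planes[n] is the texture format for plane n;
        // desc.components[n][i] says which logical channel it returns.
    }

    (void)alpha; (void)rgba;
}
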
+const struct ra_format *ra_find_float16_format(struct ra *ra, int n_components) +{ + return ra_find_float_format(ra, sizeof(float), n_components); +} + +const struct ra_format *ra_find_named_format(struct ra *ra, const char *name) +{ + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (strcmp(fmt->name, name) == 0) + return fmt; + } + return NULL; +} + +// Like ra_find_unorm_format(), but if no fixed point format is available, +// return an unsigned integer format. +static const struct ra_format *find_plane_format(struct ra *ra, int bytes, + int n_channels, + enum mp_component_type ctype) +{ + switch (ctype) { + case MP_COMPONENT_TYPE_UINT: { + const struct ra_format *f = ra_find_unorm_format(ra, bytes, n_channels); + if (f) + return f; + return ra_find_uint_format(ra, bytes, n_channels); + } + case MP_COMPONENT_TYPE_FLOAT: + return ra_find_float_format(ra, bytes, n_channels); + default: return NULL; + } +} + +// Put a mapping of imgfmt to texture formats into *out. Basically it selects +// the correct texture formats needed to represent an imgfmt in a shader, with +// textures using the same memory organization as on the CPU. +// Each plane is represented by a texture, and each texture has a RGBA +// component order. out->components describes the meaning of them. +// May return integer formats for >8 bit formats, if the driver has no +// normalized 16 bit formats. +// Returns false (and *out is not touched) if no format found. +bool ra_get_imgfmt_desc(struct ra *ra, int imgfmt, struct ra_imgfmt_desc *out) +{ + struct ra_imgfmt_desc res = {0}; + + struct mp_regular_imgfmt regfmt; + if (mp_get_regular_imgfmt(®fmt, imgfmt)) { + enum ra_ctype ctype = RA_CTYPE_UNKNOWN; + res.num_planes = regfmt.num_planes; + res.component_bits = regfmt.component_size * 8; + res.component_pad = regfmt.component_pad; + for (int n = 0; n < regfmt.num_planes; n++) { + struct mp_regular_imgfmt_plane *plane = ®fmt.planes[n]; + res.planes[n] = find_plane_format(ra, regfmt.component_size, + plane->num_components, + regfmt.component_type); + if (!res.planes[n]) + return false; + for (int i = 0; i < plane->num_components; i++) + res.components[n][i] = plane->components[i]; + // Dropping LSBs when shifting will lead to dropped MSBs. + if (res.component_bits > res.planes[n]->component_depth[0] && + res.component_pad < 0) + return false; + // Renderer restriction, but actually an unwanted corner case. + if (ctype != RA_CTYPE_UNKNOWN && ctype != res.planes[n]->ctype) + return false; + ctype = res.planes[n]->ctype; + } + res.chroma_w = regfmt.chroma_w; + res.chroma_h = regfmt.chroma_h; + goto supported; + } + + for (int n = 0; n < ra->num_formats; n++) { + if (imgfmt && ra->formats[n]->special_imgfmt == imgfmt) { + res = *ra->formats[n]->special_imgfmt_desc; + goto supported; + } + } + + // Unsupported format + return false; + +supported: + + *out = res; + return true; +} + +void ra_dump_tex_formats(struct ra *ra, int msgl) +{ + if (!mp_msg_test(ra->log, msgl)) + return; + MP_MSG(ra, msgl, "Texture formats:\n"); + MP_MSG(ra, msgl, " NAME COMP*TYPE SIZE DEPTH PER COMP.\n"); + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + const char *ctype = "unknown"; + switch (fmt->ctype) { + case RA_CTYPE_UNORM: ctype = "unorm"; break; + case RA_CTYPE_UINT: ctype = "uint "; break; + case RA_CTYPE_FLOAT: ctype = "float"; break; + } + char cl[40] = ""; + for (int i = 0; i < fmt->num_components; i++) { + mp_snprintf_cat(cl, sizeof(cl), "%s%d", i ? 
" " : "", + fmt->component_size[i]); + if (fmt->component_size[i] != fmt->component_depth[i]) + mp_snprintf_cat(cl, sizeof(cl), "/%d", fmt->component_depth[i]); + } + MP_MSG(ra, msgl, " %-10s %d*%s %3dB %s %s %s {%s}\n", fmt->name, + fmt->num_components, ctype, fmt->pixel_size, + fmt->luminance_alpha ? "LA" : " ", + fmt->linear_filter ? "LF" : " ", + fmt->renderable ? "CR" : " ", cl); + } + MP_MSG(ra, msgl, " LA = LUMINANCE_ALPHA hack format\n"); + MP_MSG(ra, msgl, " LF = linear filterable\n"); + MP_MSG(ra, msgl, " CR = can be used for render targets\n"); +} + +void ra_dump_imgfmt_desc(struct ra *ra, const struct ra_imgfmt_desc *desc, + int msgl) +{ + char pl[80] = ""; + char pf[80] = ""; + for (int n = 0; n < desc->num_planes; n++) { + if (n > 0) { + mp_snprintf_cat(pl, sizeof(pl), "/"); + mp_snprintf_cat(pf, sizeof(pf), "/"); + } + char t[5] = {0}; + for (int i = 0; i < 4; i++) + t[i] = "_rgba"[desc->components[n][i]]; + for (int i = 3; i > 0 && t[i] == '_'; i--) + t[i] = '\0'; + mp_snprintf_cat(pl, sizeof(pl), "%s", t); + mp_snprintf_cat(pf, sizeof(pf), "%s", desc->planes[n]->name); + } + MP_MSG(ra, msgl, "%d planes %dx%d %d/%d [%s] (%s)\n", + desc->num_planes, desc->chroma_w, desc->chroma_h, + desc->component_bits, desc->component_pad, pf, pl); +} + +void ra_dump_img_formats(struct ra *ra, int msgl) +{ + if (!mp_msg_test(ra->log, msgl)) + return; + MP_MSG(ra, msgl, "Image formats:\n"); + for (int imgfmt = IMGFMT_START; imgfmt < IMGFMT_END; imgfmt++) { + const char *name = mp_imgfmt_to_name(imgfmt); + if (strcmp(name, "unknown") == 0) + continue; + MP_MSG(ra, msgl, " %s", name); + struct ra_imgfmt_desc desc; + if (ra_get_imgfmt_desc(ra, imgfmt, &desc)) { + MP_MSG(ra, msgl, " => "); + ra_dump_imgfmt_desc(ra, &desc, msgl); + } else { + MP_MSG(ra, msgl, "\n"); + } + } +} diff --git a/video/out/gpu/ra.h b/video/out/gpu/ra.h new file mode 100644 index 0000000000..76f98397f8 --- /dev/null +++ b/video/out/gpu/ra.h @@ -0,0 +1,488 @@ +#pragma once + +#include "common/common.h" +#include "misc/bstr.h" + +// Handle for a rendering API backend. +struct ra { + struct ra_fns *fns; + void *priv; + + int glsl_version; // GLSL version (e.g. 300 => 3.0) + bool glsl_es; // use ES dialect + bool glsl_vulkan; // use vulkan dialect + + struct mp_log *log; + + // RA_CAP_* bit field. The RA backend must set supported features at init + // time. + uint64_t caps; + + // Maximum supported width and height of a 2D texture. Set by the RA backend + // at init time. + int max_texture_wh; + + // Maximum shared memory for compute shaders. Set by the RA backend at init + // time. + size_t max_shmem; + + // Set of supported texture formats. Must be added by RA backend at init time. + // If there are equivalent formats with different caveats, the preferred + // formats should have a lower index. (E.g. GLES3 should put rg8 before la.) + struct ra_format **formats; + int num_formats; + + // Accelerate texture uploads via an extra PBO even when + // RA_CAP_DIRECT_UPLOAD is supported. This is basically only relevant for + // OpenGL. Set by the RA user. 
+    bool use_pbo;
+};
+
+enum {
+    RA_CAP_TEX_1D = 1 << 0,         // supports 1D textures (as shader inputs)
+    RA_CAP_TEX_3D = 1 << 1,         // supports 3D textures (as shader inputs)
+    RA_CAP_BLIT = 1 << 2,           // supports ra_fns.blit
+    RA_CAP_COMPUTE = 1 << 3,        // supports compute shaders
+    RA_CAP_DIRECT_UPLOAD = 1 << 4,  // supports tex_upload without ra_buf
+    RA_CAP_BUF_RO = 1 << 5,         // supports RA_VARTYPE_BUF_RO
+    RA_CAP_BUF_RW = 1 << 6,         // supports RA_VARTYPE_BUF_RW
+    RA_CAP_NESTED_ARRAY = 1 << 7,   // supports nested arrays
+    RA_CAP_SHARED_BINDING = 1 << 8, // sampler/image/buffer namespaces are disjoint
+    RA_CAP_GLOBAL_UNIFORM = 1 << 9, // supports using "naked" uniforms (not UBO)
+};
+
+enum ra_ctype {
+    RA_CTYPE_UNKNOWN = 0,   // also used for inconsistent multi-component formats
+    RA_CTYPE_UNORM,         // unsigned normalized integer (fixed point) formats
+    RA_CTYPE_UINT,          // full integer formats
+    RA_CTYPE_FLOAT,         // float formats (signed, any bit size)
+};
+
+// All formats must be usable as texture formats. All formats must be byte
+// aligned (all pixels start and end on a byte boundary), at least as far as
+// CPU transfers are concerned.
+struct ra_format {
+    // All fields are read-only after creation.
+    const char *name;       // symbolic name for user interaction/debugging
+    void *priv;
+    enum ra_ctype ctype;    // data type of each component
+    bool ordered;           // components are sequential in memory, and returned
+                            // by the shader in memory order (the shader can
+                            // return arbitrary values for unused components)
+    int num_components;     // component count, 0 if not applicable, max. 4
+    int component_size[4];  // in bits, all entries 0 if not applicable
+    int component_depth[4]; // bits in use for each component, 0 if not applicable
+                            // (_must_ be set if component_size[] includes padding,
+                            // and the real precision as seen by the shader is lower)
+    int pixel_size;         // in bytes, total pixel size (0 if opaque)
+    bool luminance_alpha;   // pre-GL_ARB_texture_rg hack for 2 component textures
+                            // if this is set, shader must use .ra instead of .rg
+                            // only applies to 2-component textures
+    bool linear_filter;     // linear filtering available from shader
+    bool renderable;        // can be used for render targets
+
+    // If not 0, the format represents some sort of packed fringe format, whose
+    // shader representation is given by the special_imgfmt_desc pointer.
+    int special_imgfmt;
+    const struct ra_imgfmt_desc *special_imgfmt_desc;
+};
+
+struct ra_tex_params {
+    int dimensions;         // 1-3 for 1D-3D textures
+    // Size of the texture. 1D textures require h=d=1, 2D textures require d=1.
+    int w, h, d;
+    const struct ra_format *format;
+    bool render_src;        // must be usable as source texture in a shader
+    bool render_dst;        // must be usable as target texture in a shader
+    bool storage_dst;       // must be usable as a storage image (RA_VARTYPE_IMG_W)
+    bool blit_src;          // must be usable as a blit source
+    bool blit_dst;          // must be usable as a blit destination
+    bool host_mutable;      // texture may be updated with tex_upload
+    // When used as render source texture.
+    bool src_linear;        // if false, use nearest sampling (whether this can
+                            // be true depends on ra_format.linear_filter)
+    bool src_repeat;        // if false, clamp texture coordinates to edge
+                            // if true, repeat texture coordinates
+    bool non_normalized;    // hack for GL_TEXTURE_RECTANGLE OSX idiocy
+                            // always set to false, except in OSX code
+    bool external_oes;      // hack for GL_TEXTURE_EXTERNAL_OES idiocy
+    // If non-NULL, the texture will be created with these contents.
Using + // this does *not* require setting host_mutable. Otherwise, the initial + // data is undefined. + void *initial_data; +}; + +// Conflates the following typical GPU API concepts: +// - texture itself +// - sampler state +// - staging buffers for texture upload +// - framebuffer objects +// - wrappers for swapchain framebuffers +// - synchronization needed for upload/rendering/etc. +struct ra_tex { + // All fields are read-only after creation. + struct ra_tex_params params; + void *priv; +}; + +struct ra_tex_upload_params { + struct ra_tex *tex; // Texture to upload to + bool invalidate; // Discard pre-existing data not in the region uploaded + // Uploading from buffer: + struct ra_buf *buf; // Buffer to upload from (mutually exclusive with `src`) + size_t buf_offset; // Start of data within buffer (bytes) + // Uploading directly: (Note: If RA_CAP_DIRECT_UPLOAD is not set, then this + // will be internally translated to a tex_upload buffer by the RA) + const void *src; // Address of data + // For 2D textures only: + struct mp_rect *rc; // Region to upload. NULL means entire image + ptrdiff_t stride; // The size of a horizontal line in bytes (*not* texels!) +}; + +// Buffer type hint. Setting this may result in more or less efficient +// operation, although it shouldn't technically prohibit anything +enum ra_buf_type { + RA_BUF_TYPE_INVALID, + RA_BUF_TYPE_TEX_UPLOAD, // texture upload buffer (pixel buffer object) + RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW + RA_BUF_TYPE_UNIFORM, // uniform buffer (UBO), for RA_VARTYPE_BUF_RO +}; + +struct ra_buf_params { + enum ra_buf_type type; + size_t size; + bool host_mapped; // create a read-writable persistent mapping (ra_buf.data) + bool host_mutable; // contents may be updated via buf_update() + // If non-NULL, the buffer will be created with these contents. Otherwise, + // the initial data is undefined. + void *initial_data; +}; + +// A generic buffer, which can be used for many purposes (texture upload, +// storage buffer, uniform buffer, etc.) +struct ra_buf { + // All fields are read-only after creation. + struct ra_buf_params params; + void *data; // for persistently mapped buffers, points to the first byte + void *priv; +}; + +// Type of a shader uniform variable, or a vertex attribute. In all cases, +// vectors are matrices are done by having more than 1 value. +enum ra_vartype { + RA_VARTYPE_INVALID, + RA_VARTYPE_INT, // C: int, GLSL: int, ivec* + RA_VARTYPE_FLOAT, // C: float, GLSL: float, vec*, mat* + RA_VARTYPE_TEX, // C: ra_tex*, GLSL: various sampler types + // ra_tex.params.render_src must be true + RA_VARTYPE_IMG_W, // C: ra_tex*, GLSL: various image types + // write-only (W) image for compute shaders + // ra_tex.params.storage_dst must be true + RA_VARTYPE_BYTE_UNORM, // C: uint8_t, GLSL: int, vec* (vertex data only) + RA_VARTYPE_BUF_RO, // C: ra_buf*, GLSL: uniform buffer block + // buf type must be RA_BUF_TYPE_UNIFORM + RA_VARTYPE_BUF_RW, // C: ra_buf*, GLSL: shader storage buffer block + // buf type must be RA_BUF_TYPE_SHADER_STORAGE + RA_VARTYPE_COUNT +}; + +// Returns the host size of a ra_vartype, or 0 for abstract vartypes (e.g. tex) +size_t ra_vartype_size(enum ra_vartype type); + +// Represents a uniform, texture input parameter, and similar things. +struct ra_renderpass_input { + const char *name; // name as used in the shader + enum ra_vartype type; + // The total number of values is given by dim_v * dim_m. 
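+    // (For example, a vec2 vertex attribute has dim_v=2, dim_m=1, as in the
+    // vertex_vao table in gpu/osd.c above; a mat3 uniform has dim_v=3, dim_m=3.)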
+ int dim_v; // vector dimension (1 for non-vector and non-matrix) + int dim_m; // additional matrix dimension (dim_v x dim_m) + // Vertex data: byte offset of the attribute into the vertex struct + size_t offset; + // RA_VARTYPE_TEX: texture unit + // RA_VARTYPE_IMG_W: image unit + // RA_VARTYPE_BUF_* buffer binding point + // Other uniforms: unused + // If RA_CAP_SHARED_BINDING is set, these may only be unique per input type. + // Otherwise, these must be unique for all input values. + int binding; +}; + +// Represents the layout requirements of an input value +struct ra_layout { + size_t align; // the alignment requirements (always a power of two) + size_t stride; // the delta between two rows of an array/matrix + size_t size; // the total size of the input +}; + +// Returns the host layout of a render pass input. Returns {0} for renderpass +// inputs without a corresponding host representation (e.g. textures/buffers) +struct ra_layout ra_renderpass_input_layout(struct ra_renderpass_input *input); + +enum ra_blend { + RA_BLEND_ZERO, + RA_BLEND_ONE, + RA_BLEND_SRC_ALPHA, + RA_BLEND_ONE_MINUS_SRC_ALPHA, +}; + +enum ra_renderpass_type { + RA_RENDERPASS_TYPE_INVALID, + RA_RENDERPASS_TYPE_RASTER, // vertex+fragment shader + RA_RENDERPASS_TYPE_COMPUTE, // compute shader +}; + +// Static part of a rendering pass. It conflates the following: +// - compiled shader and its list of uniforms +// - vertex attributes and its shader mappings +// - blending parameters +// (For Vulkan, this would be shader module + pipeline state.) +// Upon creation, the values of dynamic values such as uniform contents (whose +// initial values are not provided here) are required to be 0. +struct ra_renderpass_params { + enum ra_renderpass_type type; + + // Uniforms, including texture/sampler inputs. + struct ra_renderpass_input *inputs; + int num_inputs; + + // Highly implementation-specific byte array storing a compiled version + // of the program. Can be used to speed up shader compilation. A backend + // xan read this in renderpass_create, or set this on the newly created + // ra_renderpass params field. + bstr cached_program; + + // --- type==RA_RENDERPASS_TYPE_RASTER only + + // Describes the format of the vertex data. When using ra.glsl_vulkan, + // the order of this array must match the vertex attribute locations. + struct ra_renderpass_input *vertex_attribs; + int num_vertex_attribs; + int vertex_stride; + + // Format of the target texture + const struct ra_format *target_format; + + // Shader text, in GLSL. (Yes, you need a GLSL compiler.) + // These are complete shaders, including prelude and declarations. + const char *vertex_shader; + const char *frag_shader; + + // Target blending mode. If enable_blend is false, the blend_ fields can + // be ignored. + bool enable_blend; + enum ra_blend blend_src_rgb; + enum ra_blend blend_dst_rgb; + enum ra_blend blend_src_alpha; + enum ra_blend blend_dst_alpha; + + // --- type==RA_RENDERPASS_TYPE_COMPUTE only + + // Shader text, like vertex_shader/frag_shader. 
+ const char *compute_shader; +}; + +struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, + const struct ra_renderpass_params *params); + +// Conflates the following typical GPU API concepts: +// - various kinds of shaders +// - rendering pipelines +// - descriptor sets, uniforms, other bindings +// - all synchronization necessary +// - the current values of all uniforms (this one makes it relatively stateful +// from an API perspective) +struct ra_renderpass { + // All fields are read-only after creation. + struct ra_renderpass_params params; + void *priv; +}; + +// An input value (see ra_renderpass_input). +struct ra_renderpass_input_val { + int index; // index into ra_renderpass_params.inputs[] + void *data; // pointer to data according to ra_renderpass_input + // (e.g. type==RA_VARTYPE_FLOAT+dim_v=3,dim_m=3 => float[9]) +}; + +// Parameters for performing a rendering pass (basically the dynamic params). +// These change potentially every time. +struct ra_renderpass_run_params { + struct ra_renderpass *pass; + + // Generally this lists parameters only which changed since the last + // invocation and need to be updated. The ra_renderpass instance is + // supposed to keep unchanged values from the previous run. + // For non-primitive types like textures, these entries are always added, + // even if they do not change. + struct ra_renderpass_input_val *values; + int num_values; + + // --- pass->params.type==RA_RENDERPASS_TYPE_RASTER only + + // target->params.render_dst must be true, and target->params.format must + // match pass->params.target_format. + struct ra_tex *target; + struct mp_rect viewport; + struct mp_rect scissors; + + // (The primitive type is always a triangle list.) + void *vertex_data; + int vertex_count; // number of vertex elements, not bytes + + // --- pass->params.type==RA_RENDERPASS_TYPE_COMPUTE only + + // Number of work groups to be run in X/Y/Z dimensions. + int compute_groups[3]; +}; + +// This is an opaque type provided by the implementation, but we want to at +// least give it a saner name than void* for code readability purposes. +typedef void ra_timer; + +// Rendering API entrypoints. (Note: there are some additional hidden features +// you need to take care of. For example, hwdec mapping will be provided +// separately from ra, but might need to call into ra private code.) +struct ra_fns { + void (*destroy)(struct ra *ra); + + // Create a texture (with undefined contents). Return NULL on failure. + // This is a rare operation, and normally textures and even FBOs for + // temporary rendering intermediate data are cached. + struct ra_tex *(*tex_create)(struct ra *ra, + const struct ra_tex_params *params); + + void (*tex_destroy)(struct ra *ra, struct ra_tex *tex); + + // Upload data to a texture. This is an extremely common operation. When + // using a buffer, the contants of the buffer must exactly match the image + // - conversions between bit depth etc. are not supported. The buffer *may* + // be marked as "in use" while this operation is going on, and the contents + // must not be touched again by the API user until buf_poll returns true. + // Returns whether successful. + bool (*tex_upload)(struct ra *ra, const struct ra_tex_upload_params *params); + + // Create a buffer. This can be used as a persistently mapped buffer, + // a uniform buffer, a shader storage buffer or possibly others. + // Not all usage types must be supported; may return NULL if unavailable. 
+ struct ra_buf *(*buf_create)(struct ra *ra, + const struct ra_buf_params *params); + + void (*buf_destroy)(struct ra *ra, struct ra_buf *buf); + + // Update the contents of a buffer, starting at a given offset and up to a + // given size, with the contents of *data. This is an extremely common + // operation. Calling this while the buffer is considered "in use" is an + // error. (See: buf_poll) + void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, + const void *data, size_t size); + + // Returns if a buffer is currently "in use" or not. Updating the contents + // of a buffer (via buf_update or writing to buf->data) while it is still + // in use is an error and may result in graphical corruption. Optional, if + // NULL then all buffers are always usable. + bool (*buf_poll)(struct ra *ra, struct ra_buf *buf); + + // Returns the layout requirements of a uniform buffer element. Optional, + // but must be implemented if RA_CAP_BUF_RO is supported. + struct ra_layout (*uniform_layout)(struct ra_renderpass_input *inp); + + // Clear the dst with the given color (rgba) and within the given scissor. + // dst must have dst->params.render_dst==true. Content outside of the + // scissor is preserved. + void (*clear)(struct ra *ra, struct ra_tex *dst, float color[4], + struct mp_rect *scissor); + + // Copy a sub-rectangle from one texture to another. The source/dest region + // is always within the texture bounds. Areas outside the dest region are + // preserved. The formats of the textures must be losely compatible. The + // dst texture can be a swapchain framebuffer, but src can not. Only 2D + // textures are supported. + // The textures must have blit_src and blit_dst set, respectively. + // Rectangles with negative width/height lead to flipping, different src/dst + // sizes lead to point scaling. Coordinates are always in pixels. + // Optional. Only available if RA_CAP_BLIT is set (if it's not set, it must + // not be called, even if it's non-NULL). + void (*blit)(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, + struct mp_rect *dst_rc, struct mp_rect *src_rc); + + // Compile a shader and create a pipeline. This is a rare operation. + // The params pointer and anything it points to must stay valid until + // renderpass_destroy. + struct ra_renderpass *(*renderpass_create)(struct ra *ra, + const struct ra_renderpass_params *params); + + void (*renderpass_destroy)(struct ra *ra, struct ra_renderpass *pass); + + // Perform a render pass, basically drawing a list of triangles to a FBO. + // This is an extremely common operation. + void (*renderpass_run)(struct ra *ra, + const struct ra_renderpass_run_params *params); + + // Create a timer object. Returns NULL on failure, or if timers are + // unavailable for some reason. Optional. + ra_timer *(*timer_create)(struct ra *ra); + + void (*timer_destroy)(struct ra *ra, ra_timer *timer); + + // Start recording a timer. Note that valid usage requires you to pair + // every start with a stop. Trying to start a timer twice, or trying to + // stop a timer before having started it, consistutes invalid usage. + void (*timer_start)(struct ra *ra, ra_timer *timer); + + // Stop recording a timer. This also returns any results that have been + // measured since the last usage of this ra_timer. It's important to note + // that GPU timer measurement are asynchronous, so this function does not + // always produce a value - and the values it does produce are typically + // delayed by a few frames. When no value is available, this returns 0. 
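+    // (So a return value of 0 means "no measurement available yet", not failure.)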
+ uint64_t (*timer_stop)(struct ra *ra, ra_timer *timer); + + // Associates a marker with any past error messages, for debugging + // purposes. Optional. + void (*debug_marker)(struct ra *ra, const char *msg); +}; + +struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params); +void ra_tex_free(struct ra *ra, struct ra_tex **tex); + +struct ra_buf *ra_buf_create(struct ra *ra, const struct ra_buf_params *params); +void ra_buf_free(struct ra *ra, struct ra_buf **buf); + +void ra_free(struct ra **ra); + +const struct ra_format *ra_find_unorm_format(struct ra *ra, + int bytes_per_component, + int n_components); +const struct ra_format *ra_find_uint_format(struct ra *ra, + int bytes_per_component, + int n_components); +const struct ra_format *ra_find_float16_format(struct ra *ra, int n_components); +const struct ra_format *ra_find_named_format(struct ra *ra, const char *name); + +struct ra_imgfmt_desc { + int num_planes; + const struct ra_format *planes[4]; + // Chroma pixel size (1x1 is 4:4:4) + uint8_t chroma_w, chroma_h; + // Component storage size in bits (possibly padded). For formats with + // different sizes per component, this is arbitrary. For padded formats + // like P010 or YUV420P10, padding is included. + int component_bits; + // Like mp_regular_imgfmt.component_pad. + int component_pad; + // For each texture and each texture output (rgba order) describe what + // component it returns. + // The values are like the values in mp_regular_imgfmt_plane.components[]. + // Access as components[plane_nr][component_index]. Set unused items to 0. + // For ra_format.luminance_alpha, this returns 1/2 ("rg") instead of 1/4 + // ("ra"). the logic is that the texture format has 2 channels, thus the + // data must be returned in the first two components. The renderer fixes + // this later. + uint8_t components[4][4]; +}; + +bool ra_get_imgfmt_desc(struct ra *ra, int imgfmt, struct ra_imgfmt_desc *out); + +void ra_dump_tex_formats(struct ra *ra, int msgl); +void ra_dump_imgfmt_desc(struct ra *ra, const struct ra_imgfmt_desc *desc, + int msgl); +void ra_dump_img_formats(struct ra *ra, int msgl); diff --git a/video/out/gpu/shader_cache.c b/video/out/gpu/shader_cache.c new file mode 100644 index 0000000000..afda9cc036 --- /dev/null +++ b/video/out/gpu/shader_cache.c @@ -0,0 +1,954 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "osdep/io.h" + +#include "common/common.h" +#include "options/path.h" +#include "stream/stream.h" +#include "shader_cache.h" +#include "utils.h" + +// Force cache flush if more than this number of shaders is created. 
+#define SC_MAX_ENTRIES 48 + +union uniform_val { + float f[9]; // RA_VARTYPE_FLOAT + int i[4]; // RA_VARTYPE_INT + struct ra_tex *tex; // RA_VARTYPE_TEX, RA_VARTYPE_IMG_* + struct ra_buf *buf; // RA_VARTYPE_BUF_* +}; + +enum sc_uniform_type { + SC_UNIFORM_TYPE_GLOBAL = 0, // global uniform (RA_CAP_GLOBAL_UNIFORM) + SC_UNIFORM_TYPE_UBO = 1, // uniform buffer (RA_CAP_BUF_RO) +}; + +struct sc_uniform { + enum sc_uniform_type type; + struct ra_renderpass_input input; + const char *glsl_type; + union uniform_val v; + char *buffer_format; + // for SC_UNIFORM_TYPE_UBO: + struct ra_layout layout; + size_t offset; // byte offset within the buffer +}; + +struct sc_cached_uniform { + union uniform_val v; + int index; // for ra_renderpass_input_val + bool set; // whether the uniform has ever been set +}; + +struct sc_entry { + struct ra_renderpass *pass; + struct sc_cached_uniform *cached_uniforms; + int num_cached_uniforms; + bstr total; + struct timer_pool *timer; + struct ra_buf *ubo; + int ubo_index; // for ra_renderpass_input_val.index +}; + +struct gl_shader_cache { + struct ra *ra; + struct mp_log *log; + + // permanent + char **exts; + int num_exts; + + // this is modified during use (gl_sc_add() etc.) and reset for each shader + bstr prelude_text; + bstr header_text; + bstr text; + + // Next binding point (texture unit, image unit, buffer binding, etc.) + // In OpenGL these are separate for each input type + int next_binding[RA_VARTYPE_COUNT]; + + struct ra_renderpass_params params; + + struct sc_entry **entries; + int num_entries; + + struct sc_entry *current_shader; // set by gl_sc_generate() + + struct sc_uniform *uniforms; + int num_uniforms; + + int ubo_binding; + size_t ubo_size; + + struct ra_renderpass_input_val *values; + int num_values; + + // For checking that the user is calling gl_sc_reset() properly. + bool needs_reset; + + bool error_state; // true if an error occurred + + // temporary buffers (avoids frequent reallocations) + bstr tmp[6]; + + // For the disk-cache. + char *cache_dir; + struct mpv_global *global; // can be NULL +}; + +static void gl_sc_reset(struct gl_shader_cache *sc); + +struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, + struct mp_log *log) +{ + struct gl_shader_cache *sc = talloc_ptrtype(NULL, sc); + *sc = (struct gl_shader_cache){ + .ra = ra, + .global = global, + .log = log, + }; + gl_sc_reset(sc); + return sc; +} + +// Reset the previous pass. This must be called after gl_sc_generate and before +// starting a new shader. 
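In practice neither gl_sc_generate() nor gl_sc_reset() is called directly by users of this file: gl_sc_dispatch_draw() and gl_sc_dispatch_compute() at the bottom wrap both, so a caller only declares the vertex format, sets uniforms, adds fragment text and dispatches. A minimal sketch of that flow follows; the attribute/uniform names and the caller-provided vao/verts/textures are illustrative placeholders, not taken from this patch:

// Sketch of typical shader cache usage for one fragment pass.
static void sketch_pass(struct gl_shader_cache *sc,
                        const struct ra_renderpass_input *vao, int stride,
                        struct ra_tex *src, struct ra_tex *target,
                        void *verts, size_t num_verts)
{
    gl_sc_set_vertex_format(sc, vao, stride);
    gl_sc_uniform_texture(sc, "texture0", src);
    // Every fragment pass must write the implicitly declared "vec4 color".
    gl_sc_add(sc, "color = texture(texture0, texcoord0);\n");
    // Compiles (or reuses) the pass, runs it, then resets the sc state.
    gl_sc_dispatch_draw(sc, target, verts, num_verts);
}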
+static void gl_sc_reset(struct gl_shader_cache *sc) +{ + sc->prelude_text.len = 0; + sc->header_text.len = 0; + sc->text.len = 0; + for (int n = 0; n < sc->num_uniforms; n++) + talloc_free((void *)sc->uniforms[n].input.name); + sc->num_uniforms = 0; + sc->ubo_binding = 0; + sc->ubo_size = 0; + for (int i = 0; i < RA_VARTYPE_COUNT; i++) + sc->next_binding[i] = 0; + sc->current_shader = NULL; + sc->params = (struct ra_renderpass_params){0}; + sc->needs_reset = false; +} + +static void sc_flush_cache(struct gl_shader_cache *sc) +{ + MP_VERBOSE(sc, "flushing shader cache\n"); + + for (int n = 0; n < sc->num_entries; n++) { + struct sc_entry *e = sc->entries[n]; + ra_buf_free(sc->ra, &e->ubo); + if (e->pass) + sc->ra->fns->renderpass_destroy(sc->ra, e->pass); + timer_pool_destroy(e->timer); + talloc_free(e); + } + sc->num_entries = 0; +} + +void gl_sc_destroy(struct gl_shader_cache *sc) +{ + if (!sc) + return; + gl_sc_reset(sc); + sc_flush_cache(sc); + talloc_free(sc); +} + +bool gl_sc_error_state(struct gl_shader_cache *sc) +{ + return sc->error_state; +} + +void gl_sc_reset_error(struct gl_shader_cache *sc) +{ + sc->error_state = false; +} + +void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name) +{ + for (int n = 0; n < sc->num_exts; n++) { + if (strcmp(sc->exts[n], name) == 0) + return; + } + MP_TARRAY_APPEND(sc, sc->exts, sc->num_exts, talloc_strdup(sc, name)); +} + +#define bstr_xappend0(sc, b, s) bstr_xappend(sc, b, bstr0(s)) + +void gl_sc_add(struct gl_shader_cache *sc, const char *text) +{ + bstr_xappend0(sc, &sc->text, text); +} + +void gl_sc_addf(struct gl_shader_cache *sc, const char *textf, ...) +{ + va_list ap; + va_start(ap, textf); + bstr_xappend_vasprintf(sc, &sc->text, textf, ap); + va_end(ap); +} + +void gl_sc_hadd(struct gl_shader_cache *sc, const char *text) +{ + bstr_xappend0(sc, &sc->header_text, text); +} + +void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) +{ + va_list ap; + va_start(ap, textf); + bstr_xappend_vasprintf(sc, &sc->header_text, textf, ap); + va_end(ap); +} + +void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text) +{ + bstr_xappend(sc, &sc->header_text, text); +} + +void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) +{ + va_list ap; + va_start(ap, textf); + bstr_xappend_vasprintf(sc, &sc->prelude_text, textf, ap); + va_end(ap); +} + +static struct sc_uniform *find_uniform(struct gl_shader_cache *sc, + const char *name) +{ + struct sc_uniform new = { + .input = { + .dim_v = 1, + .dim_m = 1, + }, + }; + + for (int n = 0; n < sc->num_uniforms; n++) { + struct sc_uniform *u = &sc->uniforms[n]; + if (strcmp(u->input.name, name) == 0) { + const char *allocname = u->input.name; + *u = new; + u->input.name = allocname; + return u; + } + } + + // not found -> add it + new.input.name = talloc_strdup(NULL, name); + MP_TARRAY_APPEND(sc, sc->uniforms, sc->num_uniforms, new); + return &sc->uniforms[sc->num_uniforms - 1]; +} + +static int gl_sc_next_binding(struct gl_shader_cache *sc, enum ra_vartype type) +{ + if (sc->ra->caps & RA_CAP_SHARED_BINDING) { + return sc->next_binding[type]++; + } else { + return sc->next_binding[0]++; + } +} + +// Updates the UBO metadata for the given sc_uniform. Assumes sc_uniform->input +// is already set. Also updates sc_uniform->type. +static void update_ubo_params(struct gl_shader_cache *sc, struct sc_uniform *u) +{ + if (!(sc->ra->caps & RA_CAP_BUF_RO)) + return; + + // Using UBOs with explicit layout(offset) like we do requires GLSL version + // 440 or higher. 
In theory the UBO code can also use older versions, but + // just try and avoid potential headaches. This also ensures they're only + // used on drivers that are probably modern enough to actually support them + // correctly. + if (sc->ra->glsl_version < 440) + return; + + u->type = SC_UNIFORM_TYPE_UBO; + u->layout = sc->ra->fns->uniform_layout(&u->input); + u->offset = MP_ALIGN_UP(sc->ubo_size, u->layout.align); + sc->ubo_size = u->offset + u->layout.size; +} + +void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, + struct ra_tex *tex) +{ + const char *glsl_type = "sampler2D"; + if (tex->params.dimensions == 1) { + glsl_type = "sampler1D"; + } else if (tex->params.dimensions == 3) { + glsl_type = "sampler3D"; + } else if (tex->params.non_normalized) { + glsl_type = "sampler2DRect"; + } else if (tex->params.external_oes) { + glsl_type = "samplerExternalOES"; + } else if (tex->params.format->ctype == RA_CTYPE_UINT) { + glsl_type = sc->ra->glsl_es ? "highp usampler2D" : "usampler2D"; + } + + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_TEX; + u->glsl_type = glsl_type; + u->input.binding = gl_sc_next_binding(sc, u->input.type); + u->v.tex = tex; +} + +void gl_sc_uniform_image2D_wo(struct gl_shader_cache *sc, const char *name, + struct ra_tex *tex) +{ + gl_sc_enable_extension(sc, "GL_ARB_shader_image_load_store"); + + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_IMG_W; + u->glsl_type = "writeonly image2D"; + u->input.binding = gl_sc_next_binding(sc, u->input.type); + u->v.tex = tex; +} + +void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, struct ra_buf *buf, + char *format, ...) +{ + assert(sc->ra->caps & RA_CAP_BUF_RW); + gl_sc_enable_extension(sc, "GL_ARB_shader_storage_buffer_object"); + + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_BUF_RW; + u->glsl_type = ""; + u->input.binding = gl_sc_next_binding(sc, u->input.type); + u->v.buf = buf; + + va_list ap; + va_start(ap, format); + u->buffer_format = ta_vasprintf(sc, format, ap); + va_end(ap); +} + +void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->glsl_type = "float"; + update_ubo_params(sc, u); + u->v.f[0] = f; +} + +void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int i) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_INT; + u->glsl_type = "int"; + update_ubo_params(sc, u); + u->v.i[0] = i; +} + +void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2]) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->input.dim_v = 2; + u->glsl_type = "vec2"; + update_ubo_params(sc, u); + u->v.f[0] = f[0]; + u->v.f[1] = f[1]; +} + +void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, float f[3]) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->input.dim_v = 3; + u->glsl_type = "vec3"; + update_ubo_params(sc, u); + u->v.f[0] = f[0]; + u->v.f[1] = f[1]; + u->v.f[2] = f[2]; +} + +static void transpose2x2(float r[2 * 2]) +{ + MPSWAP(float, r[0+2*1], r[1+2*0]); +} + +void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, + bool transpose, float *v) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->input.dim_v = 2; + u->input.dim_m = 2; + u->glsl_type = "mat2"; + update_ubo_params(sc, u); + for (int n = 0; n < 4; n++) + u->v.f[n] = 
v[n]; + if (transpose) + transpose2x2(&u->v.f[0]); +} + +static void transpose3x3(float r[3 * 3]) +{ + MPSWAP(float, r[0+3*1], r[1+3*0]); + MPSWAP(float, r[0+3*2], r[2+3*0]); + MPSWAP(float, r[1+3*2], r[2+3*1]); +} + +void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, + bool transpose, float *v) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->input.type = RA_VARTYPE_FLOAT; + u->input.dim_v = 3; + u->input.dim_m = 3; + u->glsl_type = "mat3"; + update_ubo_params(sc, u); + for (int n = 0; n < 9; n++) + u->v.f[n] = v[n]; + if (transpose) + transpose3x3(&u->v.f[0]); +} + +// Tell the shader generator (and later gl_sc_draw_data()) about the vertex +// data layout and attribute names. The entries array is terminated with a {0} +// entry. The array memory must remain valid indefinitely (for now). +void gl_sc_set_vertex_format(struct gl_shader_cache *sc, + const struct ra_renderpass_input *entries, + int vertex_stride) +{ + sc->params.vertex_attribs = (struct ra_renderpass_input *)entries; + sc->params.num_vertex_attribs = 0; + while (entries[sc->params.num_vertex_attribs].name) + sc->params.num_vertex_attribs++; + sc->params.vertex_stride = vertex_stride; +} + +void gl_sc_blend(struct gl_shader_cache *sc, + enum ra_blend blend_src_rgb, + enum ra_blend blend_dst_rgb, + enum ra_blend blend_src_alpha, + enum ra_blend blend_dst_alpha) +{ + sc->params.enable_blend = true; + sc->params.blend_src_rgb = blend_src_rgb; + sc->params.blend_dst_rgb = blend_dst_rgb; + sc->params.blend_src_alpha = blend_src_alpha; + sc->params.blend_dst_alpha = blend_dst_alpha; +} + +static const char *vao_glsl_type(const struct ra_renderpass_input *e) +{ + // pretty dumb... too dumb, but works for us + switch (e->dim_v) { + case 1: return "float"; + case 2: return "vec2"; + case 3: return "vec3"; + case 4: return "vec4"; + default: abort(); + } +} + +static void update_ubo(struct ra *ra, struct ra_buf *ubo, struct sc_uniform *u) +{ + uintptr_t src = (uintptr_t) &u->v; + size_t dst = u->offset; + struct ra_layout src_layout = ra_renderpass_input_layout(&u->input); + struct ra_layout dst_layout = u->layout; + + for (int i = 0; i < u->input.dim_m; i++) { + ra->fns->buf_update(ra, ubo, dst, (void *)src, src_layout.stride); + src += src_layout.stride; + dst += dst_layout.stride; + } +} + +static void update_uniform(struct gl_shader_cache *sc, struct sc_entry *e, + struct sc_uniform *u, int n) +{ + struct sc_cached_uniform *un = &e->cached_uniforms[n]; + struct ra_layout layout = ra_renderpass_input_layout(&u->input); + if (layout.size > 0 && un->set && memcmp(&un->v, &u->v, layout.size) == 0) + return; + + un->v = u->v; + un->set = true; + + switch (u->type) { + case SC_UNIFORM_TYPE_GLOBAL: { + struct ra_renderpass_input_val value = { + .index = un->index, + .data = &un->v, + }; + MP_TARRAY_APPEND(sc, sc->values, sc->num_values, value); + break; + } + case SC_UNIFORM_TYPE_UBO: + assert(e->ubo); + update_ubo(sc->ra, e->ubo, u); + break; + default: abort(); + } +} + +void gl_sc_set_cache_dir(struct gl_shader_cache *sc, const char *dir) +{ + talloc_free(sc->cache_dir); + sc->cache_dir = talloc_strdup(sc, dir); +} + +static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry) +{ + bool ret = false; + + void *tmp = talloc_new(NULL); + struct ra_renderpass_params params = sc->params; + + MP_VERBOSE(sc, "new shader program:\n"); + if (sc->header_text.len) { + MP_VERBOSE(sc, "header:\n"); + mp_log_source(sc->log, MSGL_V, sc->header_text.start); + MP_VERBOSE(sc, "body:\n"); + } + if (sc->text.len) + 
mp_log_source(sc->log, MSGL_V, sc->text.start); + + // The vertex shader uses mangled names for the vertex attributes, so that + // the fragment shader can use the "real" names. But the shader is expecting + // the vertex attribute names (at least with older GLSL targets for GL). + params.vertex_attribs = talloc_memdup(tmp, params.vertex_attribs, + params.num_vertex_attribs * sizeof(params.vertex_attribs[0])); + for (int n = 0; n < params.num_vertex_attribs; n++) { + struct ra_renderpass_input *attrib = ¶ms.vertex_attribs[n]; + attrib->name = talloc_asprintf(tmp, "vertex_%s", attrib->name); + } + + const char *cache_header = "mpv shader cache v1\n"; + char *cache_filename = NULL; + char *cache_dir = NULL; + + if (sc->cache_dir && sc->cache_dir[0]) { + // Try to load it from a disk cache. + cache_dir = mp_get_user_path(tmp, sc->global, sc->cache_dir); + + struct AVSHA *sha = av_sha_alloc(); + if (!sha) + abort(); + av_sha_init(sha, 256); + av_sha_update(sha, entry->total.start, entry->total.len); + + uint8_t hash[256 / 8]; + av_sha_final(sha, hash); + av_free(sha); + + char hashstr[256 / 8 * 2 + 1]; + for (int n = 0; n < 256 / 8; n++) + snprintf(hashstr + n * 2, sizeof(hashstr) - n * 2, "%02X", hash[n]); + + cache_filename = mp_path_join(tmp, cache_dir, hashstr); + if (stat(cache_filename, &(struct stat){0}) == 0) { + MP_VERBOSE(sc, "Trying to load shader from disk...\n"); + struct bstr cachedata = + stream_read_file(cache_filename, tmp, sc->global, 1000000000); + if (bstr_eatstart0(&cachedata, cache_header)) + params.cached_program = cachedata; + } + } + + // If using a UBO, also make sure to add it as an input value so the RA + // can see it + if (sc->ubo_size) { + entry->ubo_index = sc->params.num_inputs; + struct ra_renderpass_input ubo_input = { + .name = "UBO", + .type = RA_VARTYPE_BUF_RO, + .dim_v = 1, + .dim_m = 1, + .binding = sc->ubo_binding, + }; + MP_TARRAY_APPEND(sc, params.inputs, params.num_inputs, ubo_input); + } + + entry->pass = sc->ra->fns->renderpass_create(sc->ra, ¶ms); + if (!entry->pass) + goto error; + + if (sc->ubo_size) { + struct ra_buf_params ubo_params = { + .type = RA_BUF_TYPE_UNIFORM, + .size = sc->ubo_size, + .host_mutable = true, + }; + + entry->ubo = ra_buf_create(sc->ra, &ubo_params); + if (!entry->ubo) { + MP_ERR(sc, "Failed creating uniform buffer!\n"); + goto error; + } + } + + if (entry->pass && cache_filename) { + bstr nc = entry->pass->params.cached_program; + if (nc.len && !bstr_equals(params.cached_program, nc)) { + mp_mkdirp(cache_dir); + + MP_VERBOSE(sc, "Writing shader cache file: %s\n", cache_filename); + FILE *out = fopen(cache_filename, "wb"); + if (out) { + fwrite(cache_header, strlen(cache_header), 1, out); + fwrite(nc.start, nc.len, 1, out); + fclose(out); + } + } + } + + ret = true; + +error: + talloc_free(tmp); + return ret; +} + +#define ADD(x, ...) 
bstr_xappend_asprintf(sc, (x), __VA_ARGS__)
+#define ADD_BSTR(x, s) bstr_xappend(sc, (x), (s))
+
+static void add_uniforms(struct gl_shader_cache *sc, bstr *dst)
+{
+    // Add all of the UBO entries separately as members of their own buffer
+    if (sc->ubo_size > 0) {
+        ADD(dst, "layout(std140, binding=%d) uniform UBO {\n", sc->ubo_binding);
+        for (int n = 0; n < sc->num_uniforms; n++) {
+            struct sc_uniform *u = &sc->uniforms[n];
+            if (u->type != SC_UNIFORM_TYPE_UBO)
+                continue;
+            ADD(dst, "layout(offset=%zu) %s %s;\n", u->offset,
+                u->glsl_type, u->input.name);
+        }
+        ADD(dst, "};\n");
+    }
+
+    for (int n = 0; n < sc->num_uniforms; n++) {
+        struct sc_uniform *u = &sc->uniforms[n];
+        if (u->type != SC_UNIFORM_TYPE_GLOBAL)
+            continue;
+        switch (u->input.type) {
+        case RA_VARTYPE_INT:
+        case RA_VARTYPE_FLOAT:
+            assert(sc->ra->caps & RA_CAP_GLOBAL_UNIFORM);
+            // fall through
+        case RA_VARTYPE_TEX:
+        case RA_VARTYPE_IMG_W:
+            // Vulkan requires explicitly assigning the bindings in the shader
+            // source. For OpenGL it's optional, but requires a higher GL version,
+            // so we don't do it (and instead have ra_gl update the bindings
+            // after program creation).
+            if (sc->ra->glsl_vulkan)
+                ADD(dst, "layout(binding=%d) ", u->input.binding);
+            ADD(dst, "uniform %s %s;\n", u->glsl_type, u->input.name);
+            break;
+        case RA_VARTYPE_BUF_RO:
+            ADD(dst, "layout(std140, binding=%d) uniform %s { %s };\n",
+                u->input.binding, u->input.name, u->buffer_format);
+            break;
+        case RA_VARTYPE_BUF_RW:
+            ADD(dst, "layout(std430, binding=%d) buffer %s { %s };\n",
+                u->input.binding, u->input.name, u->buffer_format);
+            break;
+        }
+    }
+}
+
+// 1. Generate vertex and fragment shaders from the fragment shader text added
+//    with gl_sc_add(). The generated shader program is cached (based on the
+//    text), so actual compilation happens only the first time.
+// 2. Update the uniforms and textures set with gl_sc_uniform_*.
+// 3. Make the new shader program current (glUseProgram()).
+// After that, you render, and then you call gl_sc_reset(), which does:
+// 1. Unbind the program and all textures.
+// 2. Reset the sc state and prepare for a new shader program. (All uniforms
+//    and fragment operations needed for the next program have to be re-added.)
+static void gl_sc_generate(struct gl_shader_cache *sc,
+                           enum ra_renderpass_type type,
+                           const struct ra_format *target_format)
+{
+    int glsl_version = sc->ra->glsl_version;
+    int glsl_es = sc->ra->glsl_es ? glsl_version : 0;
+
+    sc->params.type = type;
+
+    // gl_sc_reset() must be called after ending the previous render process,
+    // and before starting a new one.
+    assert(!sc->needs_reset);
+    sc->needs_reset = true;
+
+    // gl_sc_set_vertex_format() must always be called
+    assert(sc->params.vertex_attribs);
+
+    // If using a UBO, pick a binding (needed for shader generation)
+    if (sc->ubo_size)
+        sc->ubo_binding = gl_sc_next_binding(sc, RA_VARTYPE_BUF_RO);
+
+    for (int n = 0; n < MP_ARRAY_SIZE(sc->tmp); n++)
+        sc->tmp[n].len = 0;
+
+    // set up shader text (header + uniforms + body)
+    bstr *header = &sc->tmp[0];
+    ADD(header, "#version %d%s\n", glsl_version, glsl_es >= 300 ? " es" : "");
+    if (type == RA_RENDERPASS_TYPE_COMPUTE) {
+        // This extension cannot be enabled in fragment shaders. Enable it as
+        // an exception for compute shaders.
+ ADD(header, "#extension GL_ARB_compute_shader : enable\n"); + } + for (int n = 0; n < sc->num_exts; n++) + ADD(header, "#extension %s : enable\n", sc->exts[n]); + if (glsl_es) { + ADD(header, "precision mediump float;\n"); + ADD(header, "precision mediump sampler2D;\n"); + if (sc->ra->caps & RA_CAP_TEX_3D) + ADD(header, "precision mediump sampler3D;\n"); + } + + if (glsl_version >= 130) { + ADD(header, "#define tex1D texture\n"); + ADD(header, "#define tex3D texture\n"); + } else { + ADD(header, "#define tex1D texture1D\n"); + ADD(header, "#define tex3D texture3D\n"); + ADD(header, "#define texture texture2D\n"); + } + + if (sc->ra->glsl_vulkan && type == RA_RENDERPASS_TYPE_COMPUTE) { + ADD(header, "#define gl_GlobalInvocationIndex " + "(gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID)\n"); + } + + // Additional helpers. + ADD(header, "#define LUT_POS(x, lut_size)" + " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n"); + + char *vert_in = glsl_version >= 130 ? "in" : "attribute"; + char *vert_out = glsl_version >= 130 ? "out" : "varying"; + char *frag_in = glsl_version >= 130 ? "in" : "varying"; + + struct bstr *vert = NULL, *frag = NULL, *comp = NULL; + + if (type == RA_RENDERPASS_TYPE_RASTER) { + // vertex shader: we don't use the vertex shader, so just setup a + // dummy, which passes through the vertex array attributes. + bstr *vert_head = &sc->tmp[1]; + ADD_BSTR(vert_head, *header); + bstr *vert_body = &sc->tmp[2]; + ADD(vert_body, "void main() {\n"); + bstr *frag_vaos = &sc->tmp[3]; + for (int n = 0; n < sc->params.num_vertex_attribs; n++) { + const struct ra_renderpass_input *e = &sc->params.vertex_attribs[n]; + const char *glsl_type = vao_glsl_type(e); + char loc[32] = {0}; + if (sc->ra->glsl_vulkan) + snprintf(loc, sizeof(loc), "layout(location=%d) ", n); + if (strcmp(e->name, "position") == 0) { + // setting raster pos. requires setting gl_Position magic variable + assert(e->dim_v == 2 && e->type == RA_VARTYPE_FLOAT); + ADD(vert_head, "%s%s vec2 vertex_position;\n", loc, vert_in); + ADD(vert_body, "gl_Position = vec4(vertex_position, 1.0, 1.0);\n"); + } else { + ADD(vert_head, "%s%s %s vertex_%s;\n", loc, vert_in, glsl_type, e->name); + ADD(vert_head, "%s%s %s %s;\n", loc, vert_out, glsl_type, e->name); + ADD(vert_body, "%s = vertex_%s;\n", e->name, e->name); + ADD(frag_vaos, "%s%s %s %s;\n", loc, frag_in, glsl_type, e->name); + } + } + ADD(vert_body, "}\n"); + vert = vert_head; + ADD_BSTR(vert, *vert_body); + + // fragment shader; still requires adding used uniforms and VAO elements + frag = &sc->tmp[4]; + ADD_BSTR(frag, *header); + if (glsl_version >= 130) { + ADD(frag, "%sout vec4 out_color;\n", + sc->ra->glsl_vulkan ? 
"layout(location=0) " : ""); + } + ADD_BSTR(frag, *frag_vaos); + add_uniforms(sc, frag); + + ADD_BSTR(frag, sc->prelude_text); + ADD_BSTR(frag, sc->header_text); + + ADD(frag, "void main() {\n"); + // we require _all_ frag shaders to write to a "vec4 color" + ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); + ADD_BSTR(frag, sc->text); + if (glsl_version >= 130) { + ADD(frag, "out_color = color;\n"); + } else { + ADD(frag, "gl_FragColor = color;\n"); + } + ADD(frag, "}\n"); + + // We need to fix the format of the render dst at renderpass creation + // time + assert(target_format); + sc->params.target_format = target_format; + } + + if (type == RA_RENDERPASS_TYPE_COMPUTE) { + comp = &sc->tmp[4]; + ADD_BSTR(comp, *header); + + add_uniforms(sc, comp); + + ADD_BSTR(comp, sc->prelude_text); + ADD_BSTR(comp, sc->header_text); + + ADD(comp, "void main() {\n"); + ADD(comp, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); // convenience + ADD_BSTR(comp, sc->text); + ADD(comp, "}\n"); + } + + bstr *hash_total = &sc->tmp[5]; + + ADD(hash_total, "type %d\n", sc->params.type); + + if (frag) { + ADD_BSTR(hash_total, *frag); + sc->params.frag_shader = frag->start; + } + ADD(hash_total, "\n"); + if (vert) { + ADD_BSTR(hash_total, *vert); + sc->params.vertex_shader = vert->start; + } + ADD(hash_total, "\n"); + if (comp) { + ADD_BSTR(hash_total, *comp); + sc->params.compute_shader = comp->start; + } + ADD(hash_total, "\n"); + + if (sc->params.enable_blend) { + ADD(hash_total, "blend %d %d %d %d\n", + sc->params.blend_src_rgb, sc->params.blend_dst_rgb, + sc->params.blend_src_alpha, sc->params.blend_dst_alpha); + } + + if (sc->params.target_format) + ADD(hash_total, "format %s\n", sc->params.target_format->name); + + struct sc_entry *entry = NULL; + for (int n = 0; n < sc->num_entries; n++) { + struct sc_entry *cur = sc->entries[n]; + if (bstr_equals(cur->total, *hash_total)) { + entry = cur; + break; + } + } + if (!entry) { + if (sc->num_entries == SC_MAX_ENTRIES) + sc_flush_cache(sc); + entry = talloc_ptrtype(NULL, entry); + *entry = (struct sc_entry){ + .total = bstrdup(entry, *hash_total), + .timer = timer_pool_create(sc->ra), + }; + for (int n = 0; n < sc->num_uniforms; n++) { + struct sc_cached_uniform u = {0}; + if (sc->uniforms[n].type == SC_UNIFORM_TYPE_GLOBAL) { + // global uniforms need to be made visible to the ra_renderpass + u.index = sc->params.num_inputs; + MP_TARRAY_APPEND(sc, sc->params.inputs, sc->params.num_inputs, + sc->uniforms[n].input); + } + MP_TARRAY_APPEND(entry, entry->cached_uniforms, + entry->num_cached_uniforms, u); + } + if (!create_pass(sc, entry)) + sc->error_state = true; + MP_TARRAY_APPEND(sc, sc->entries, sc->num_entries, entry); + } + if (sc->error_state) + return; + + assert(sc->num_uniforms == entry->num_cached_uniforms); + + sc->num_values = 0; + for (int n = 0; n < sc->num_uniforms; n++) + update_uniform(sc, entry, &sc->uniforms[n], n); + + // If we're using a UBO, make sure to bind it as well + if (sc->ubo_size) { + struct ra_renderpass_input_val ubo_val = { + .index = entry->ubo_index, + .data = &entry->ubo, + }; + MP_TARRAY_APPEND(sc, sc->values, sc->num_values, ubo_val); + } + + sc->current_shader = entry; +} + +struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, + struct ra_tex *target, + void *ptr, size_t num) +{ + struct timer_pool *timer = NULL; + + gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format); + if (!sc->current_shader) + goto error; + + timer = sc->current_shader->timer; + + struct mp_rect full_rc = {0, 0, 
target->params.w, target->params.h}; + + struct ra_renderpass_run_params run = { + .pass = sc->current_shader->pass, + .values = sc->values, + .num_values = sc->num_values, + .target = target, + .vertex_data = ptr, + .vertex_count = num, + .viewport = full_rc, + .scissors = full_rc, + }; + + timer_pool_start(timer); + sc->ra->fns->renderpass_run(sc->ra, &run); + timer_pool_stop(timer); + +error: + gl_sc_reset(sc); + return timer_pool_measure(timer); +} + +struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, + int w, int h, int d) +{ + struct timer_pool *timer = NULL; + + gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL); + if (!sc->current_shader) + goto error; + + timer = sc->current_shader->timer; + + struct ra_renderpass_run_params run = { + .pass = sc->current_shader->pass, + .values = sc->values, + .num_values = sc->num_values, + .compute_groups = {w, h, d}, + }; + + timer_pool_start(timer); + sc->ra->fns->renderpass_run(sc->ra, &run); + timer_pool_stop(timer); + +error: + gl_sc_reset(sc); + return timer_pool_measure(timer); +} diff --git a/video/out/gpu/shader_cache.h b/video/out/gpu/shader_cache.h new file mode 100644 index 0000000000..82a078079b --- /dev/null +++ b/video/out/gpu/shader_cache.h @@ -0,0 +1,56 @@ +#pragma once + +#include "common/common.h" +#include "misc/bstr.h" +#include "ra.h" + +// For mp_pass_perf +#include "video/out/vo.h" + +struct mp_log; +struct mpv_global; +struct gl_shader_cache; + +struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, + struct mp_log *log); +void gl_sc_destroy(struct gl_shader_cache *sc); +bool gl_sc_error_state(struct gl_shader_cache *sc); +void gl_sc_reset_error(struct gl_shader_cache *sc); +void gl_sc_add(struct gl_shader_cache *sc, const char *text); +void gl_sc_addf(struct gl_shader_cache *sc, const char *textf, ...) + PRINTF_ATTRIBUTE(2, 3); +void gl_sc_hadd(struct gl_shader_cache *sc, const char *text); +void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) + PRINTF_ATTRIBUTE(2, 3); +void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text); +void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) + PRINTF_ATTRIBUTE(2, 3); +void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, + struct ra_tex *tex); +void gl_sc_uniform_image2D_wo(struct gl_shader_cache *sc, const char *name, + struct ra_tex *tex); +void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, struct ra_buf *buf, + char *format, ...) 
PRINTF_ATTRIBUTE(4, 5); +void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f); +void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int f); +void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2]); +void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, float f[3]); +void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, + bool transpose, float *v); +void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, + bool transpose, float *v); +void gl_sc_set_vertex_format(struct gl_shader_cache *sc, + const struct ra_renderpass_input *vertex_attribs, + int vertex_stride); +void gl_sc_blend(struct gl_shader_cache *sc, + enum ra_blend blend_src_rgb, + enum ra_blend blend_dst_rgb, + enum ra_blend blend_src_alpha, + enum ra_blend blend_dst_alpha); +void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name); +struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, + struct ra_tex *target, + void *ptr, size_t num); +struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, + int w, int h, int d); +void gl_sc_set_cache_dir(struct gl_shader_cache *sc, const char *dir); diff --git a/video/out/gpu/user_shaders.c b/video/out/gpu/user_shaders.c new file mode 100644 index 0000000000..446941b03f --- /dev/null +++ b/video/out/gpu/user_shaders.c @@ -0,0 +1,452 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include + +#include "common/msg.h" +#include "misc/ctype.h" +#include "user_shaders.h" + +static bool parse_rpn_szexpr(struct bstr line, struct szexp out[MAX_SZEXP_SIZE]) +{ + int pos = 0; + + while (line.len > 0) { + struct bstr word = bstr_strip(bstr_splitchar(line, &line, ' ')); + if (word.len == 0) + continue; + + if (pos >= MAX_SZEXP_SIZE) + return false; + + struct szexp *exp = &out[pos++]; + + if (bstr_eatend0(&word, ".w") || bstr_eatend0(&word, ".width")) { + exp->tag = SZEXP_VAR_W; + exp->val.varname = word; + continue; + } + + if (bstr_eatend0(&word, ".h") || bstr_eatend0(&word, ".height")) { + exp->tag = SZEXP_VAR_H; + exp->val.varname = word; + continue; + } + + switch (word.start[0]) { + case '+': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_ADD; continue; + case '-': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_SUB; continue; + case '*': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_MUL; continue; + case '/': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_DIV; continue; + case '!': exp->tag = SZEXP_OP1; exp->val.op = SZEXP_OP_NOT; continue; + case '>': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_GT; continue; + case '<': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_LT; continue; + } + + if (mp_isdigit(word.start[0])) { + exp->tag = SZEXP_CONST; + if (bstr_sscanf(word, "%f", &exp->val.cval) != 1) + return false; + continue; + } + + // Some sort of illegal expression + return false; + } + + return true; +} + +// Returns whether successful. 
'result' is left untouched on failure +bool eval_szexpr(struct mp_log *log, void *priv, + bool (*lookup)(void *priv, struct bstr var, float size[2]), + struct szexp expr[MAX_SZEXP_SIZE], float *result) +{ + float stack[MAX_SZEXP_SIZE] = {0}; + int idx = 0; // points to next element to push + + for (int i = 0; i < MAX_SZEXP_SIZE; i++) { + switch (expr[i].tag) { + case SZEXP_END: + goto done; + + case SZEXP_CONST: + // Since our SZEXPs are bound by MAX_SZEXP_SIZE, it should be + // impossible to overflow the stack + assert(idx < MAX_SZEXP_SIZE); + stack[idx++] = expr[i].val.cval; + continue; + + case SZEXP_OP1: + if (idx < 1) { + mp_warn(log, "Stack underflow in RPN expression!\n"); + return false; + } + + switch (expr[i].val.op) { + case SZEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break; + default: abort(); + } + continue; + + case SZEXP_OP2: + if (idx < 2) { + mp_warn(log, "Stack underflow in RPN expression!\n"); + return false; + } + + // Pop the operands in reverse order + float op2 = stack[--idx]; + float op1 = stack[--idx]; + float res = 0.0; + switch (expr[i].val.op) { + case SZEXP_OP_ADD: res = op1 + op2; break; + case SZEXP_OP_SUB: res = op1 - op2; break; + case SZEXP_OP_MUL: res = op1 * op2; break; + case SZEXP_OP_DIV: res = op1 / op2; break; + case SZEXP_OP_GT: res = op1 > op2; break; + case SZEXP_OP_LT: res = op1 < op2; break; + default: abort(); + } + + if (!isfinite(res)) { + mp_warn(log, "Illegal operation in RPN expression!\n"); + return false; + } + + stack[idx++] = res; + continue; + + case SZEXP_VAR_W: + case SZEXP_VAR_H: { + struct bstr name = expr[i].val.varname; + float size[2]; + + if (!lookup(priv, name, size)) { + mp_warn(log, "Variable %.*s not found in RPN expression!\n", + BSTR_P(name)); + return false; + } + + stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? 
size[0] : size[1]; + continue; + } + } + } + +done: + // Return the single stack element + if (idx != 1) { + mp_warn(log, "Malformed stack after RPN expression!\n"); + return false; + } + + *result = stack[0]; + return true; +} + +static bool parse_hook(struct mp_log *log, struct bstr *body, + struct gl_user_shader_hook *out) +{ + *out = (struct gl_user_shader_hook){ + .pass_desc = bstr0("(unknown)"), + .offset = identity_trans, + .width = {{ SZEXP_VAR_W, { .varname = bstr0("HOOKED") }}}, + .height = {{ SZEXP_VAR_H, { .varname = bstr0("HOOKED") }}}, + .cond = {{ SZEXP_CONST, { .cval = 1.0 }}}, + }; + + int hook_idx = 0; + int bind_idx = 0; + + // Parse all headers + while (true) { + struct bstr rest; + struct bstr line = bstr_strip(bstr_getline(*body, &rest)); + + // Check for the presence of the magic line beginning + if (!bstr_eatstart0(&line, "//!")) + break; + + *body = rest; + + // Parse the supported commands + if (bstr_eatstart0(&line, "HOOK")) { + if (hook_idx == SHADER_MAX_HOOKS) { + mp_err(log, "Passes may only hook up to %d textures!\n", + SHADER_MAX_HOOKS); + return false; + } + out->hook_tex[hook_idx++] = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "BIND")) { + if (bind_idx == SHADER_MAX_BINDS) { + mp_err(log, "Passes may only bind up to %d textures!\n", + SHADER_MAX_BINDS); + return false; + } + out->bind_tex[bind_idx++] = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "SAVE")) { + out->save_tex = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "DESC")) { + out->pass_desc = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "OFFSET")) { + float ox, oy; + if (bstr_sscanf(line, "%f %f", &ox, &oy) != 2) { + mp_err(log, "Error while parsing OFFSET!\n"); + return false; + } + out->offset.t[0] = ox; + out->offset.t[1] = oy; + continue; + } + + if (bstr_eatstart0(&line, "WIDTH")) { + if (!parse_rpn_szexpr(line, out->width)) { + mp_err(log, "Error while parsing WIDTH!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "HEIGHT")) { + if (!parse_rpn_szexpr(line, out->height)) { + mp_err(log, "Error while parsing HEIGHT!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "WHEN")) { + if (!parse_rpn_szexpr(line, out->cond)) { + mp_err(log, "Error while parsing WHEN!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "COMPONENTS")) { + if (bstr_sscanf(line, "%d", &out->components) != 1) { + mp_err(log, "Error while parsing COMPONENTS!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "COMPUTE")) { + struct compute_info *ci = &out->compute; + int num = bstr_sscanf(line, "%d %d %d %d", &ci->block_w, &ci->block_h, + &ci->threads_w, &ci->threads_h); + + if (num == 2 || num == 4) { + ci->active = true; + ci->directly_writes = true; + } else { + mp_err(log, "Error while parsing COMPUTE!\n"); + return false; + } + continue; + } + + // Unknown command type + mp_err(log, "Unrecognized command '%.*s'!\n", BSTR_P(line)); + return false; + } + + // The rest of the file up until the next magic line beginning (if any) + // shall be the shader body + if (bstr_split_tok(*body, "//!", &out->pass_body, body)) { + // Make sure the magic line is part of the rest + body->start -= 3; + body->len += 3; + } + + // Sanity checking + if (hook_idx == 0) + mp_warn(log, "Pass has no hooked textures (will be ignored)!\n"); + + return true; +} + +static bool parse_tex(struct mp_log *log, struct ra *ra, struct bstr *body, + struct gl_user_shader_tex *out) +{ + *out = (struct 
gl_user_shader_tex){ + .name = bstr0("USER_TEX"), + .params = { + .dimensions = 2, + .w = 1, .h = 1, .d = 1, + .render_src = true, + .src_linear = true, + }, + }; + struct ra_tex_params *p = &out->params; + + while (true) { + struct bstr rest; + struct bstr line = bstr_strip(bstr_getline(*body, &rest)); + + if (!bstr_eatstart0(&line, "//!")) + break; + + *body = rest; + + if (bstr_eatstart0(&line, "TEXTURE")) { + out->name = bstr_strip(line); + continue; + } + + if (bstr_eatstart0(&line, "SIZE")) { + p->dimensions = bstr_sscanf(line, "%d %d %d", &p->w, &p->h, &p->d); + if (p->dimensions < 1 || p->dimensions > 3 || + p->w < 1 || p->h < 1 || p->d < 1) + { + mp_err(log, "Error while parsing SIZE!\n"); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "FORMAT ")) { + p->format = NULL; + for (int n = 0; n < ra->num_formats; n++) { + const struct ra_format *fmt = ra->formats[n]; + if (bstr_equals0(line, fmt->name)) { + p->format = fmt; + break; + } + } + // (pixel_size==0 is for opaque formats) + if (!p->format || !p->format->pixel_size) { + mp_err(log, "Unrecognized/unavailable FORMAT name: '%.*s'!\n", + BSTR_P(line)); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "FILTER")) { + line = bstr_strip(line); + if (bstr_equals0(line, "LINEAR")) { + p->src_linear = true; + } else if (bstr_equals0(line, "NEAREST")) { + p->src_linear = false; + } else { + mp_err(log, "Unrecognized FILTER: '%.*s'!\n", BSTR_P(line)); + return false; + } + continue; + } + + if (bstr_eatstart0(&line, "BORDER")) { + line = bstr_strip(line); + if (bstr_equals0(line, "CLAMP")) { + p->src_repeat = false; + } else if (bstr_equals0(line, "REPEAT")) { + p->src_repeat = true; + } else { + mp_err(log, "Unrecognized BORDER: '%.*s'!\n", BSTR_P(line)); + return false; + } + continue; + } + + mp_err(log, "Unrecognized command '%.*s'!\n", BSTR_P(line)); + return false; + } + + if (!p->format) { + mp_err(log, "No FORMAT specified.\n"); + return false; + } + + if (p->src_linear && !p->format->linear_filter) { + mp_err(log, "The specified texture format cannot be filtered!\n"); + return false; + } + + // Decode the rest of the section (up to the next //! marker) as raw hex + // data for the texture + struct bstr hexdata; + if (bstr_split_tok(*body, "//!", &hexdata, body)) { + // Make sure the magic line is part of the rest + body->start -= 3; + body->len += 3; + } + + struct bstr tex; + if (!bstr_decode_hex(NULL, bstr_strip(hexdata), &tex)) { + mp_err(log, "Error while parsing TEXTURE body: must be a valid " + "hexadecimal sequence, on a single line!\n"); + return false; + } + + int expected_len = p->w * p->h * p->d * p->format->pixel_size; + if (tex.len != expected_len) { + mp_err(log, "Shader TEXTURE size mismatch: got %zd bytes, expected %d!\n", + tex.len, expected_len); + talloc_free(tex.start); + return false; + } + + p->initial_data = tex.start; + return true; +} + +void parse_user_shader(struct mp_log *log, struct ra *ra, struct bstr shader, + void *priv, + bool (*dohook)(void *p, struct gl_user_shader_hook hook), + bool (*dotex)(void *p, struct gl_user_shader_tex tex)) +{ + if (!dohook || !dotex || !shader.len) + return; + + // Skip all garbage (e.g. 
comments) before the first header + int pos = bstr_find(shader, bstr0("//!")); + if (pos < 0) { + mp_warn(log, "Shader appears to contain no headers!\n"); + return; + } + shader = bstr_cut(shader, pos); + + // Loop over the file + while (shader.len > 0) + { + // Peek at the first header to dispatch the right type + if (bstr_startswith0(shader, "//!TEXTURE")) { + struct gl_user_shader_tex t; + if (!parse_tex(log, ra, &shader, &t) || !dotex(priv, t)) + return; + continue; + } + + struct gl_user_shader_hook h; + if (!parse_hook(log, &shader, &h) || !dohook(priv, h)) + return; + } +} diff --git a/video/out/gpu/user_shaders.h b/video/out/gpu/user_shaders.h new file mode 100644 index 0000000000..94a070c8e2 --- /dev/null +++ b/video/out/gpu/user_shaders.h @@ -0,0 +1,98 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#ifndef MP_GL_USER_SHADERS_H +#define MP_GL_USER_SHADERS_H + +#include "utils.h" +#include "ra.h" + +#define SHADER_MAX_PASSES 32 +#define SHADER_MAX_HOOKS 16 +#define SHADER_MAX_BINDS 6 +#define SHADER_MAX_SAVED 64 +#define MAX_SZEXP_SIZE 32 + +enum szexp_op { + SZEXP_OP_ADD, + SZEXP_OP_SUB, + SZEXP_OP_MUL, + SZEXP_OP_DIV, + SZEXP_OP_NOT, + SZEXP_OP_GT, + SZEXP_OP_LT, +}; + +enum szexp_tag { + SZEXP_END = 0, // End of an RPN expression + SZEXP_CONST, // Push a constant value onto the stack + SZEXP_VAR_W, // Get the width/height of a named texture (variable) + SZEXP_VAR_H, + SZEXP_OP2, // Pop two elements and push the result of a dyadic operation + SZEXP_OP1, // Pop one element and push the result of a monadic operation +}; + +struct szexp { + enum szexp_tag tag; + union { + float cval; + struct bstr varname; + enum szexp_op op; + } val; +}; + +struct compute_info { + bool active; + int block_w, block_h; // Block size (each block corresponds to one WG) + int threads_w, threads_h; // How many threads form a working group + bool directly_writes; // If true, shader is assumed to imageStore(out_image) +}; + +struct gl_user_shader_hook { + struct bstr pass_desc; + struct bstr hook_tex[SHADER_MAX_HOOKS]; + struct bstr bind_tex[SHADER_MAX_BINDS]; + struct bstr save_tex; + struct bstr pass_body; + struct gl_transform offset; + struct szexp width[MAX_SZEXP_SIZE]; + struct szexp height[MAX_SZEXP_SIZE]; + struct szexp cond[MAX_SZEXP_SIZE]; + int components; + struct compute_info compute; +}; + +struct gl_user_shader_tex { + struct bstr name; + struct ra_tex_params params; + // for video.c + struct ra_tex *tex; +}; + +// Parse the next shader block from `body`. The callbacks are invoked on every +// valid shader block parsed. 
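The headers it accepts are the //!HOOK, //!BIND, //!SAVE, //!DESC, //!OFFSET, //!WIDTH, //!HEIGHT, //!WHEN, //!COMPONENTS and //!COMPUTE commands for hook passes, plus //!TEXTURE blocks, all handled in user_shaders.c; everything up to the next //! header becomes the pass or texture body. As a rough illustration (the callbacks, hook names and GLSL body below are hypothetical placeholders), a caller might drive it like this:

// Sketch: run a single hard-coded user shader pass through the parser.
static bool sketch_hook_cb(void *p, struct gl_user_shader_hook hook)
{
    return true; // a real caller would record the hook for later use
}

static bool sketch_tex_cb(void *p, struct gl_user_shader_tex tex)
{
    return true;
}

static void sketch_load(struct mp_log *log, struct ra *ra)
{
    const char *text = "//!HOOK LUMA\n"
                       "//!BIND HOOKED\n"
                       "//!DESC do-nothing example\n"
                       "vec4 hook() { return HOOKED_tex(HOOKED_pos); }\n";
    parse_user_shader(log, ra, bstr0(text), NULL,
                      sketch_hook_cb, sketch_tex_cb);
}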
+void parse_user_shader(struct mp_log *log, struct ra *ra, struct bstr shader, + void *priv, + bool (*dohook)(void *p, struct gl_user_shader_hook hook), + bool (*dotex)(void *p, struct gl_user_shader_tex tex)); + +// Evaluate a szexp, given a lookup function for named textures +bool eval_szexpr(struct mp_log *log, void *priv, + bool (*lookup)(void *priv, struct bstr var, float size[2]), + struct szexp expr[MAX_SZEXP_SIZE], float *result); + +#endif diff --git a/video/out/gpu/utils.c b/video/out/gpu/utils.c new file mode 100644 index 0000000000..f8dcbaac60 --- /dev/null +++ b/video/out/gpu/utils.c @@ -0,0 +1,372 @@ +#include "common/msg.h" +#include "video/out/vo.h" +#include "utils.h" + +// Standard parallel 2D projection, except y1 < y0 means that the coordinate +// system is flipped, not the projection. +void gl_transform_ortho(struct gl_transform *t, float x0, float x1, + float y0, float y1) +{ + if (y1 < y0) { + float tmp = y0; + y0 = tmp - y1; + y1 = tmp; + } + + t->m[0][0] = 2.0f / (x1 - x0); + t->m[0][1] = 0.0f; + t->m[1][0] = 0.0f; + t->m[1][1] = 2.0f / (y1 - y0); + t->t[0] = -(x1 + x0) / (x1 - x0); + t->t[1] = -(y1 + y0) / (y1 - y0); +} + +// Apply the effects of one transformation to another, transforming it in the +// process. In other words: post-composes t onto x +void gl_transform_trans(struct gl_transform t, struct gl_transform *x) +{ + struct gl_transform xt = *x; + x->m[0][0] = t.m[0][0] * xt.m[0][0] + t.m[0][1] * xt.m[1][0]; + x->m[1][0] = t.m[1][0] * xt.m[0][0] + t.m[1][1] * xt.m[1][0]; + x->m[0][1] = t.m[0][0] * xt.m[0][1] + t.m[0][1] * xt.m[1][1]; + x->m[1][1] = t.m[1][0] * xt.m[0][1] + t.m[1][1] * xt.m[1][1]; + gl_transform_vec(t, &x->t[0], &x->t[1]); +} + +void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo) +{ + int y_dir = fbo.flip ? 
-1 : 1; + gl_transform_ortho(t, 0, fbo.tex->params.w, 0, fbo.tex->params.h * y_dir); +} + +void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool) +{ + for (int i = 0; i < pool->num_buffers; i++) + ra_buf_free(ra, &pool->buffers[i]); + + talloc_free(pool->buffers); + *pool = (struct ra_buf_pool){0}; +} + +static bool ra_buf_params_compatible(const struct ra_buf_params *new, + const struct ra_buf_params *old) +{ + return new->type == old->type && + new->size <= old->size && + new->host_mapped == old->host_mapped && + new->host_mutable == old->host_mutable; +} + +static bool ra_buf_pool_grow(struct ra *ra, struct ra_buf_pool *pool) +{ + struct ra_buf *buf = ra_buf_create(ra, &pool->current_params); + if (!buf) + return false; + + MP_TARRAY_INSERT_AT(NULL, pool->buffers, pool->num_buffers, pool->index, buf); + MP_VERBOSE(ra, "Resized buffer pool of type %u to size %d\n", + pool->current_params.type, pool->num_buffers); + return true; +} + +struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, + const struct ra_buf_params *params) +{ + assert(!params->initial_data); + + if (!ra_buf_params_compatible(params, &pool->current_params)) { + ra_buf_pool_uninit(ra, pool); + pool->current_params = *params; + } + + // Make sure we have at least one buffer available + if (!pool->buffers && !ra_buf_pool_grow(ra, pool)) + return NULL; + + // Make sure the next buffer is available for use + if (!ra->fns->buf_poll(ra, pool->buffers[pool->index]) && + !ra_buf_pool_grow(ra, pool)) + { + return NULL; + } + + struct ra_buf *buf = pool->buffers[pool->index++]; + pool->index %= pool->num_buffers; + + return buf; +} + +bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, + const struct ra_tex_upload_params *params) +{ + if (params->buf) + return ra->fns->tex_upload(ra, params); + + struct ra_tex *tex = params->tex; + size_t row_size = tex->params.dimensions == 2 ? params->stride : + tex->params.w * tex->params.format->pixel_size; + + struct ra_buf_params bufparams = { + .type = RA_BUF_TYPE_TEX_UPLOAD, + .size = row_size * tex->params.h * tex->params.d, + .host_mutable = true, + }; + + struct ra_buf *buf = ra_buf_pool_get(ra, pbo, &bufparams); + if (!buf) + return false; + + ra->fns->buf_update(ra, buf, 0, params->src, bufparams.size); + + struct ra_tex_upload_params newparams = *params; + newparams.buf = buf; + newparams.src = NULL; + + return ra->fns->tex_upload(ra, &newparams); +} + +struct ra_layout std140_layout(struct ra_renderpass_input *inp) +{ + size_t el_size = ra_vartype_size(inp->type); + + // std140 packing rules: + // 1. The alignment of generic values is their size in bytes + // 2. The alignment of vectors is the vector length * the base count, with + // the exception of vec3 which is always aligned like vec4 + // 3. The alignment of arrays is that of the element size rounded up to + // the nearest multiple of vec4 + // 4. Matrices are treated like arrays of vectors + // 5. 
Arrays/matrices are laid out with a stride equal to the alignment + size_t size = el_size * inp->dim_v; + if (inp->dim_v == 3) + size += el_size; + if (inp->dim_m > 1) + size = MP_ALIGN_UP(size, sizeof(float[4])); + + return (struct ra_layout) { + .align = size, + .stride = size, + .size = size * inp->dim_m, + }; +} + +struct ra_layout std430_layout(struct ra_renderpass_input *inp) +{ + size_t el_size = ra_vartype_size(inp->type); + + // std430 packing rules: like std140, except arrays/matrices are always + // "tightly" packed, even arrays/matrices of vec3s + size_t size = el_size * inp->dim_v; + if (inp->dim_v == 3 && inp->dim_m == 1) + size += el_size; + + return (struct ra_layout) { + .align = size, + .stride = size, + .size = size * inp->dim_m, + }; +} + +// Create a texture and a FBO using the texture as color attachments. +// fmt: texture internal format +// If the parameters are the same as the previous call, do not touch it. +// flags can be 0, or a combination of FBOTEX_FUZZY_W and FBOTEX_FUZZY_H. +// Enabling FUZZY for W or H means the w or h does not need to be exact. +bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, + int w, int h, const struct ra_format *fmt, int flags) +{ + int lw = w, lh = h; + + if (fbo->tex) { + int cw = w, ch = h; + int rw = fbo->tex->params.w, rh = fbo->tex->params.h; + + if ((flags & FBOTEX_FUZZY_W) && cw < rw) + cw = rw; + if ((flags & FBOTEX_FUZZY_H) && ch < rh) + ch = rh; + + if (rw == cw && rh == ch && fbo->tex->params.format == fmt) + goto done; + } + + if (flags & FBOTEX_FUZZY_W) + w = MP_ALIGN_UP(w, 256); + if (flags & FBOTEX_FUZZY_H) + h = MP_ALIGN_UP(h, 256); + + mp_verbose(log, "Create FBO: %dx%d (%dx%d)\n", lw, lh, w, h); + + if (!fmt || !fmt->renderable || !fmt->linear_filter) { + mp_err(log, "Format %s not supported.\n", fmt ? 
fmt->name : "(unset)"); + return false; + } + + fbotex_uninit(fbo); + + *fbo = (struct fbotex) { + .ra = ra, + }; + + struct ra_tex_params params = { + .dimensions = 2, + .w = w, + .h = h, + .d = 1, + .format = fmt, + .src_linear = true, + .render_src = true, + .render_dst = true, + .storage_dst = true, + .blit_src = true, + }; + + fbo->tex = ra_tex_create(fbo->ra, ¶ms); + + if (!fbo->tex) { + mp_err(log, "Error: framebuffer could not be created.\n"); + fbotex_uninit(fbo); + return false; + } + +done: + + fbo->lw = lw; + fbo->lh = lh; + + fbo->fbo = (struct fbodst){ + .tex = fbo->tex, + }; + + return true; +} + +void fbotex_uninit(struct fbotex *fbo) +{ + if (fbo->ra) { + ra_tex_free(fbo->ra, &fbo->tex); + *fbo = (struct fbotex) {0}; + } +} + +struct timer_pool { + struct ra *ra; + ra_timer *timer; + bool running; // detect invalid usage + + uint64_t samples[VO_PERF_SAMPLE_COUNT]; + int sample_idx; + int sample_count; + + uint64_t sum; + uint64_t peak; +}; + +struct timer_pool *timer_pool_create(struct ra *ra) +{ + if (!ra->fns->timer_create) + return NULL; + + ra_timer *timer = ra->fns->timer_create(ra); + if (!timer) + return NULL; + + struct timer_pool *pool = talloc(NULL, struct timer_pool); + if (!pool) { + ra->fns->timer_destroy(ra, timer); + return NULL; + } + + *pool = (struct timer_pool){ .ra = ra, .timer = timer }; + return pool; +} + +void timer_pool_destroy(struct timer_pool *pool) +{ + if (!pool) + return; + + pool->ra->fns->timer_destroy(pool->ra, pool->timer); + talloc_free(pool); +} + +void timer_pool_start(struct timer_pool *pool) +{ + if (!pool) + return; + + assert(!pool->running); + pool->ra->fns->timer_start(pool->ra, pool->timer); + pool->running = true; +} + +void timer_pool_stop(struct timer_pool *pool) +{ + if (!pool) + return; + + assert(pool->running); + uint64_t res = pool->ra->fns->timer_stop(pool->ra, pool->timer); + pool->running = false; + + if (res) { + // Input res into the buffer and grab the previous value + uint64_t old = pool->samples[pool->sample_idx]; + pool->sample_count = MPMIN(pool->sample_count + 1, VO_PERF_SAMPLE_COUNT); + pool->samples[pool->sample_idx++] = res; + pool->sample_idx %= VO_PERF_SAMPLE_COUNT; + pool->sum = pool->sum + res - old; + + // Update peak if necessary + if (res >= pool->peak) { + pool->peak = res; + } else if (pool->peak == old) { + // It's possible that the last peak was the value we just removed, + // if so we need to scan for the new peak + uint64_t peak = res; + for (int i = 0; i < VO_PERF_SAMPLE_COUNT; i++) + peak = MPMAX(peak, pool->samples[i]); + pool->peak = peak; + } + } +} + +struct mp_pass_perf timer_pool_measure(struct timer_pool *pool) +{ + if (!pool) + return (struct mp_pass_perf){0}; + + struct mp_pass_perf res = { + .peak = pool->peak, + .count = pool->sample_count, + }; + + int idx = pool->sample_idx - pool->sample_count + VO_PERF_SAMPLE_COUNT; + for (int i = 0; i < res.count; i++) { + idx %= VO_PERF_SAMPLE_COUNT; + res.samples[i] = pool->samples[idx++]; + } + + if (res.count > 0) { + res.last = res.samples[res.count - 1]; + res.avg = pool->sum / res.count; + } + + return res; +} + +void mp_log_source(struct mp_log *log, int lev, const char *src) +{ + int line = 1; + if (!src) + return; + while (*src) { + const char *end = strchr(src, '\n'); + const char *next = end + 1; + if (!end) + next = end = src + strlen(src); + mp_msg(log, lev, "[%3d] %.*s\n", line, (int)(end - src), src); + line++; + src = next; + } +} diff --git a/video/out/gpu/utils.h b/video/out/gpu/utils.h new file mode 100644 index 
0000000000..04695f8085 --- /dev/null +++ b/video/out/gpu/utils.h @@ -0,0 +1,120 @@ +#pragma once + +#include +#include + +#include "ra.h" + +// A 3x2 matrix, with the translation part separate. +struct gl_transform { + // row-major, e.g. in mathematical notation: + // | m[0][0] m[0][1] | + // | m[1][0] m[1][1] | + float m[2][2]; + float t[2]; +}; + +static const struct gl_transform identity_trans = { + .m = {{1.0, 0.0}, {0.0, 1.0}}, + .t = {0.0, 0.0}, +}; + +void gl_transform_ortho(struct gl_transform *t, float x0, float x1, + float y0, float y1); + +// This treats m as an affine transformation, in other words m[2][n] gets +// added to the output. +static inline void gl_transform_vec(struct gl_transform t, float *x, float *y) +{ + float vx = *x, vy = *y; + *x = vx * t.m[0][0] + vy * t.m[0][1] + t.t[0]; + *y = vx * t.m[1][0] + vy * t.m[1][1] + t.t[1]; +} + +struct mp_rect_f { + float x0, y0, x1, y1; +}; + +// Semantic equality (fuzzy comparison) +static inline bool mp_rect_f_seq(struct mp_rect_f a, struct mp_rect_f b) +{ + return fabs(a.x0 - b.x0) < 1e-6 && fabs(a.x1 - b.x1) < 1e-6 && + fabs(a.y0 - b.y0) < 1e-6 && fabs(a.y1 - b.y1) < 1e-6; +} + +static inline void gl_transform_rect(struct gl_transform t, struct mp_rect_f *r) +{ + gl_transform_vec(t, &r->x0, &r->y0); + gl_transform_vec(t, &r->x1, &r->y1); +} + +static inline bool gl_transform_eq(struct gl_transform a, struct gl_transform b) +{ + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + if (a.m[x][y] != b.m[x][y]) + return false; + } + } + + return a.t[0] == b.t[0] && a.t[1] == b.t[1]; +} + +void gl_transform_trans(struct gl_transform t, struct gl_transform *x); + +struct fbodst { + struct ra_tex *tex; + bool flip; // mirror vertically +}; + +void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo); + +// A pool of buffers, which can grow as needed +struct ra_buf_pool { + struct ra_buf_params current_params; + struct ra_buf **buffers; + int num_buffers; + int index; +}; + +void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool); + +// Note: params->initial_data is *not* supported +struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, + const struct ra_buf_params *params); + +// Helper that wraps ra_tex_upload using texture upload buffers to ensure that +// params->buf is always set. This is intended for RA-internal usage. +bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, + const struct ra_tex_upload_params *params); + +// Layout rules for GLSL's packing modes +struct ra_layout std140_layout(struct ra_renderpass_input *inp); +struct ra_layout std430_layout(struct ra_renderpass_input *inp); + +struct fbotex { + struct ra *ra; + struct ra_tex *tex; + int lw, lh; // logical (configured) size, <= than texture size + struct fbodst fbo; +}; + +void fbotex_uninit(struct fbotex *fbo); +bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, + int w, int h, const struct ra_format *fmt, int flags); +#define FBOTEX_FUZZY_W 1 +#define FBOTEX_FUZZY_H 2 +#define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H) + +// A wrapper around ra_timer that does result pooling, averaging etc. +struct timer_pool; + +struct timer_pool *timer_pool_create(struct ra *ra); +void timer_pool_destroy(struct timer_pool *pool); +void timer_pool_start(struct timer_pool *pool); +void timer_pool_stop(struct timer_pool *pool); +struct mp_pass_perf timer_pool_measure(struct timer_pool *pool); + +// print a multi line string with line numbers (e.g. 
for shader sources) +// log, lev: module and log level, as in mp_msg() +void mp_log_source(struct mp_log *log, int lev, const char *src); diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c new file mode 100644 index 0000000000..e36fde60e8 --- /dev/null +++ b/video/out/gpu/video.c @@ -0,0 +1,3809 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "video.h" + +#include "misc/bstr.h" +#include "options/m_config.h" +#include "common/global.h" +#include "options/options.h" +#include "utils.h" +#include "hwdec.h" +#include "osd.h" +#include "ra.h" +#include "stream/stream.h" +#include "video_shaders.h" +#include "user_shaders.h" +#include "video/out/filter_kernels.h" +#include "video/out/aspect.h" +#include "video/out/dither.h" +#include "video/out/vo.h" + +// scale/cscale arguments that map directly to shader filter routines. +// Note that the convolution filters are not included in this list. +static const char *const fixed_scale_filters[] = { + "bilinear", + "bicubic_fast", + "oversample", + NULL +}; +static const char *const fixed_tscale_filters[] = { + "oversample", + "linear", + NULL +}; + +// must be sorted, and terminated with 0 +int filter_sizes[] = + {2, 4, 6, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 0}; +int tscale_sizes[] = {2, 4, 6, 0}; // limited by TEXUNIT_VIDEO_NUM + +struct vertex_pt { + float x, y; +}; + +struct vertex { + struct vertex_pt position; + struct vertex_pt texcoord[TEXUNIT_VIDEO_NUM]; +}; + +static const struct ra_renderpass_input vertex_vao[] = { + {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, + {"texcoord0", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[0])}, + {"texcoord1", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[1])}, + {"texcoord2", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[2])}, + {"texcoord3", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[3])}, + {"texcoord4", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[4])}, + {"texcoord5", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[5])}, + {0} +}; + +struct texplane { + struct ra_tex *tex; + int w, h; + bool flipped; +}; + +struct video_image { + struct texplane planes[4]; + struct mp_image *mpi; // original input image + uint64_t id; // unique ID identifying mpi contents + bool hwdec_mapped; +}; + +enum plane_type { + PLANE_NONE = 0, + PLANE_RGB, + PLANE_LUMA, + PLANE_CHROMA, + PLANE_ALPHA, + PLANE_XYZ, +}; + +static const char *plane_names[] = { + [PLANE_NONE] = "unknown", + [PLANE_RGB] = "rgb", + [PLANE_LUMA] = "luma", + [PLANE_CHROMA] = "chroma", + [PLANE_ALPHA] = "alpha", + [PLANE_XYZ] = "xyz", +}; + +// A self-contained description of a source image which can be bound to a +// texture unit and sampled from. 
Contains metadata about how it's to be used +struct img_tex { + enum plane_type type; // must be set to something non-zero + int components; // number of relevant coordinates + float multiplier; // multiplier to be used when sampling + struct ra_tex *tex; + int w, h; // logical size (after transformation) + struct gl_transform transform; // rendering transformation +}; + +// A named img_tex, for user scripting purposes +struct saved_tex { + const char *name; + struct img_tex tex; +}; + +// A texture hook. This is some operation that transforms a named texture as +// soon as it's generated +struct tex_hook { + const char *save_tex; + const char *hook_tex[SHADER_MAX_HOOKS]; + const char *bind_tex[TEXUNIT_VIDEO_NUM]; + int components; // how many components are relevant (0 = same as input) + void *priv; // this gets talloc_freed when the tex_hook is removed + void (*hook)(struct gl_video *p, struct img_tex tex, // generates GLSL + struct gl_transform *trans, void *priv); + bool (*cond)(struct gl_video *p, struct img_tex tex, void *priv); +}; + +struct fbosurface { + struct fbotex fbotex; + uint64_t id; + double pts; +}; + +#define FBOSURFACES_MAX 10 + +struct cached_file { + char *path; + struct bstr body; +}; + +struct pass_info { + struct bstr desc; + struct mp_pass_perf perf; +}; + +#define PASS_INFO_MAX (SHADER_MAX_PASSES + 32) + +struct dr_buffer { + struct ra_buf *buf; + // The mpi reference will keep the data from being recycled (or from other + // references gaining write access) while the GPU is accessing the buffer. + struct mp_image *mpi; +}; + +struct gl_video { + struct ra *ra; + + struct mpv_global *global; + struct mp_log *log; + struct gl_video_opts opts; + struct m_config_cache *opts_cache; + struct gl_lcms *cms; + + int fb_depth; // actual bits available in GL main framebuffer + struct m_color clear_color; + bool force_clear_color; + + struct gl_shader_cache *sc; + + struct osd_state *osd_state; + struct mpgl_osd *osd; + double osd_pts; + + struct ra_tex *lut_3d_texture; + bool use_lut_3d; + int lut_3d_size[3]; + + struct ra_tex *dither_texture; + + struct mp_image_params real_image_params; // configured format + struct mp_image_params image_params; // texture format (mind hwdec case) + struct ra_imgfmt_desc ra_format; // texture format + int plane_count; + + bool is_gray; + bool has_alpha; + char color_swizzle[5]; + bool use_integer_conversion; + + struct video_image image; + + struct dr_buffer *dr_buffers; + int num_dr_buffers; + + bool using_dr_path; + + bool dumb_mode; + bool forced_dumb_mode; + + const struct ra_format *fbo_format; + struct fbotex merge_fbo[4]; + struct fbotex scale_fbo[4]; + struct fbotex integer_fbo[4]; + struct fbotex indirect_fbo; + struct fbotex blend_subs_fbo; + struct fbotex screen_fbo; + struct fbotex output_fbo; + struct fbosurface surfaces[FBOSURFACES_MAX]; + struct fbotex vdpau_deinterleave_fbo[2]; + struct ra_buf *hdr_peak_ssbo; + + // user pass descriptions and textures + struct tex_hook tex_hooks[SHADER_MAX_PASSES]; + int tex_hook_num; + struct gl_user_shader_tex user_textures[SHADER_MAX_PASSES]; + int user_tex_num; + + int surface_idx; + int surface_now; + int frames_drawn; + bool is_interpolated; + bool output_fbo_valid; + + // state for configured scalers + struct scaler scaler[SCALER_COUNT]; + + struct mp_csp_equalizer_state *video_eq; + + struct mp_rect src_rect; // displayed part of the source video + struct mp_rect dst_rect; // video rectangle on output window + struct mp_osd_res osd_rect; // OSD size/margins + + // temporary during 
rendering + struct img_tex pass_tex[TEXUNIT_VIDEO_NUM]; + struct compute_info pass_compute; // compute shader metadata for this pass + int pass_tex_num; + int texture_w, texture_h; + struct gl_transform texture_offset; // texture transform without rotation + int components; + bool use_linear; + float user_gamma; + + // pass info / metrics + struct pass_info pass_fresh[PASS_INFO_MAX]; + struct pass_info pass_redraw[PASS_INFO_MAX]; + struct pass_info *pass; + int pass_idx; + struct timer_pool *upload_timer; + struct timer_pool *blit_timer; + struct timer_pool *osd_timer; + + // intermediate textures + struct saved_tex saved_tex[SHADER_MAX_SAVED]; + int saved_tex_num; + struct fbotex hook_fbos[SHADER_MAX_SAVED]; + int hook_fbo_num; + + int frames_uploaded; + int frames_rendered; + AVLFG lfg; + + // Cached because computing it can take relatively long + int last_dither_matrix_size; + float *last_dither_matrix; + + struct cached_file *files; + int num_files; + + struct ra_hwdec *hwdec; + struct ra_hwdec_mapper *hwdec_mapper; + bool hwdec_active; + + bool dsi_warned; + bool broken_frame; // temporary error state +}; + +static const struct gl_video_opts gl_video_opts_def = { + .dither_algo = DITHER_FRUIT, + .dither_depth = -1, + .dither_size = 6, + .temporal_dither_period = 1, + .fbo_format = "auto", + .sigmoid_center = 0.75, + .sigmoid_slope = 6.5, + .scaler = { + {{"bilinear", .params={NAN, NAN}}, {.params = {NAN, NAN}}, + .cutoff = 0.001}, // scale + {{NULL, .params={NAN, NAN}}, {.params = {NAN, NAN}}, + .cutoff = 0.001}, // dscale + {{"bilinear", .params={NAN, NAN}}, {.params = {NAN, NAN}}, + .cutoff = 0.001}, // cscale + {{"mitchell", .params={NAN, NAN}}, {.params = {NAN, NAN}}, + .clamp = 1, }, // tscale + }, + .scaler_resizes_only = 1, + .scaler_lut_size = 6, + .interpolation_threshold = 0.0001, + .alpha_mode = ALPHA_BLEND_TILES, + .background = {0, 0, 0, 255}, + .gamma = 1.0f, + .tone_mapping = TONE_MAPPING_MOBIUS, + .tone_mapping_param = NAN, + .tone_mapping_desat = 2.0, + .early_flush = -1, +}; + +static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param); + +static int validate_window_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param); + +#define OPT_BASE_STRUCT struct gl_video_opts + +#define SCALER_OPTS(n, i) \ + OPT_STRING_VALIDATE(n, scaler[i].kernel.name, 0, validate_scaler_opt), \ + OPT_FLOAT(n"-param1", scaler[i].kernel.params[0], 0), \ + OPT_FLOAT(n"-param2", scaler[i].kernel.params[1], 0), \ + OPT_FLOAT(n"-blur", scaler[i].kernel.blur, 0), \ + OPT_FLOATRANGE(n"-cutoff", scaler[i].cutoff, 0, 0.0, 1.0), \ + OPT_FLOATRANGE(n"-taper", scaler[i].kernel.taper, 0, 0.0, 1.0), \ + OPT_FLOAT(n"-wparam", scaler[i].window.params[0], 0), \ + OPT_FLOAT(n"-wblur", scaler[i].window.blur, 0), \ + OPT_FLOATRANGE(n"-wtaper", scaler[i].window.taper, 0, 0.0, 1.0), \ + OPT_FLOATRANGE(n"-clamp", scaler[i].clamp, 0, 0.0, 1.0), \ + OPT_FLOATRANGE(n"-radius", scaler[i].radius, 0, 0.5, 16.0), \ + OPT_FLOATRANGE(n"-antiring", scaler[i].antiring, 0, 0.0, 1.0), \ + OPT_STRING_VALIDATE(n"-window", scaler[i].window.name, 0, validate_window_opt) + +const struct m_sub_options gl_video_conf = { + .opts = (const m_option_t[]) { + OPT_CHOICE("gpu-dumb-mode", dumb_mode, 0, + ({"auto", 0}, {"yes", 1}, {"no", -1})), + OPT_FLOATRANGE("gamma-factor", gamma, 0, 0.1, 2.0), + OPT_FLAG("gamma-auto", gamma_auto, 0), + OPT_CHOICE_C("target-prim", target_prim, 0, mp_csp_prim_names), + OPT_CHOICE_C("target-trc", target_trc, 0, 
mp_csp_trc_names), + OPT_CHOICE("tone-mapping", tone_mapping, 0, + ({"clip", TONE_MAPPING_CLIP}, + {"mobius", TONE_MAPPING_MOBIUS}, + {"reinhard", TONE_MAPPING_REINHARD}, + {"hable", TONE_MAPPING_HABLE}, + {"gamma", TONE_MAPPING_GAMMA}, + {"linear", TONE_MAPPING_LINEAR})), + OPT_FLAG("hdr-compute-peak", compute_hdr_peak, 0), + OPT_FLOAT("tone-mapping-param", tone_mapping_param, 0), + OPT_FLOAT("tone-mapping-desaturate", tone_mapping_desat, 0), + OPT_FLAG("gamut-warning", gamut_warning, 0), + OPT_FLAG("opengl-pbo", pbo, 0), + SCALER_OPTS("scale", SCALER_SCALE), + SCALER_OPTS("dscale", SCALER_DSCALE), + SCALER_OPTS("cscale", SCALER_CSCALE), + SCALER_OPTS("tscale", SCALER_TSCALE), + OPT_INTRANGE("scaler-lut-size", scaler_lut_size, 0, 4, 10), + OPT_FLAG("scaler-resizes-only", scaler_resizes_only, 0), + OPT_FLAG("linear-scaling", linear_scaling, 0), + OPT_FLAG("correct-downscaling", correct_downscaling, 0), + OPT_FLAG("sigmoid-upscaling", sigmoid_upscaling, 0), + OPT_FLOATRANGE("sigmoid-center", sigmoid_center, 0, 0.0, 1.0), + OPT_FLOATRANGE("sigmoid-slope", sigmoid_slope, 0, 1.0, 20.0), + OPT_STRING("fbo-format", fbo_format, 0), + OPT_CHOICE_OR_INT("dither-depth", dither_depth, 0, -1, 16, + ({"no", -1}, {"auto", 0})), + OPT_CHOICE("dither", dither_algo, 0, + ({"fruit", DITHER_FRUIT}, + {"ordered", DITHER_ORDERED}, + {"no", DITHER_NONE})), + OPT_INTRANGE("dither-size-fruit", dither_size, 0, 2, 8), + OPT_FLAG("temporal-dither", temporal_dither, 0), + OPT_INTRANGE("temporal-dither-period", temporal_dither_period, 0, 1, 128), + OPT_CHOICE("alpha", alpha_mode, 0, + ({"no", ALPHA_NO}, + {"yes", ALPHA_YES}, + {"blend", ALPHA_BLEND}, + {"blend-tiles", ALPHA_BLEND_TILES})), + OPT_FLAG("opengl-rectangle-textures", use_rectangle, 0), + OPT_COLOR("background", background, 0), + OPT_FLAG("interpolation", interpolation, 0), + OPT_FLOAT("interpolation-threshold", interpolation_threshold, 0), + OPT_CHOICE("blend-subtitles", blend_subs, 0, + ({"no", BLEND_SUBS_NO}, + {"yes", BLEND_SUBS_YES}, + {"video", BLEND_SUBS_VIDEO})), + OPT_PATHLIST("glsl-shaders", user_shaders, 0), + OPT_CLI_ALIAS("glsl-shader", "glsl-shaders-append"), + OPT_FLAG("deband", deband, 0), + OPT_SUBSTRUCT("deband", deband_opts, deband_conf, 0), + OPT_FLOAT("sharpen", unsharp, 0), + OPT_INTRANGE("gpu-tex-pad-x", tex_pad_x, 0, 0, 4096), + OPT_INTRANGE("gpu-tex-pad-y", tex_pad_y, 0, 0, 4096), + OPT_SUBSTRUCT("", icc_opts, mp_icc_conf, 0), + OPT_STRING("gpu-shader-cache-dir", shader_cache_dir, 0), + OPT_REPLACED("hdr-tone-mapping", "tone-mapping"), + OPT_REPLACED("opengl-shaders", "glsl-shaders"), + OPT_CLI_ALIAS("opengl-shader", "glsl-shaders-append"), + OPT_REPLACED("opengl-shader-cache-dir", "gpu-shader-cache-dir"), + OPT_REPLACED("opengl-tex-pad-x", "gpu-tex-pad-x"), + OPT_REPLACED("opengl-tex-pad-y", "gpu-tex-pad-y"), + OPT_REPLACED("opengl-fbo-format", "fbo-format"), + OPT_REPLACED("opengl-dumb-mode", "gpu-dumb-mode"), + OPT_REPLACED("opengl-gamma", "gpu-gamma"), + {0} + }, + .size = sizeof(struct gl_video_opts), + .defaults = &gl_video_opts_def, +}; + +static void uninit_rendering(struct gl_video *p); +static void uninit_scaler(struct gl_video *p, struct scaler *scaler); +static void check_gl_features(struct gl_video *p); +static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t id); +static const char *handle_scaler_opt(const char *name, bool tscale); +static void reinit_from_options(struct gl_video *p); +static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]); +static void 
gl_video_setup_hooks(struct gl_video *p); + +#define GLSL(x) gl_sc_add(p->sc, #x "\n"); +#define GLSLF(...) gl_sc_addf(p->sc, __VA_ARGS__) +#define GLSLHF(...) gl_sc_haddf(p->sc, __VA_ARGS__) +#define PRELUDE(...) gl_sc_paddf(p->sc, __VA_ARGS__) + +static struct bstr load_cached_file(struct gl_video *p, const char *path) +{ + if (!path || !path[0]) + return (struct bstr){0}; + for (int n = 0; n < p->num_files; n++) { + if (strcmp(p->files[n].path, path) == 0) + return p->files[n].body; + } + // not found -> load it + struct bstr s = stream_read_file(path, p, p->global, 1024000); // 1024 kB + if (s.len) { + struct cached_file new = { + .path = talloc_strdup(p, path), + .body = s, + }; + MP_TARRAY_APPEND(p, p->files, p->num_files, new); + return new.body; + } + return (struct bstr){0}; +} + +static void debug_check_gl(struct gl_video *p, const char *msg) +{ + if (p->ra->fns->debug_marker) + p->ra->fns->debug_marker(p->ra, msg); +} + +static void gl_video_reset_surfaces(struct gl_video *p) +{ + for (int i = 0; i < FBOSURFACES_MAX; i++) { + p->surfaces[i].id = 0; + p->surfaces[i].pts = MP_NOPTS_VALUE; + } + p->surface_idx = 0; + p->surface_now = 0; + p->frames_drawn = 0; + p->output_fbo_valid = false; +} + +static void gl_video_reset_hooks(struct gl_video *p) +{ + for (int i = 0; i < p->tex_hook_num; i++) + talloc_free(p->tex_hooks[i].priv); + + for (int i = 0; i < p->user_tex_num; i++) + ra_tex_free(p->ra, &p->user_textures[i].tex); + + p->tex_hook_num = 0; + p->user_tex_num = 0; +} + +static inline int fbosurface_wrap(int id) +{ + id = id % FBOSURFACES_MAX; + return id < 0 ? id + FBOSURFACES_MAX : id; +} + +static void reinit_osd(struct gl_video *p) +{ + mpgl_osd_destroy(p->osd); + p->osd = NULL; + if (p->osd_state) + p->osd = mpgl_osd_init(p->ra, p->log, p->osd_state); +} + +static void uninit_rendering(struct gl_video *p) +{ + for (int n = 0; n < SCALER_COUNT; n++) + uninit_scaler(p, &p->scaler[n]); + + ra_tex_free(p->ra, &p->dither_texture); + + for (int n = 0; n < 4; n++) { + fbotex_uninit(&p->merge_fbo[n]); + fbotex_uninit(&p->scale_fbo[n]); + fbotex_uninit(&p->integer_fbo[n]); + } + + fbotex_uninit(&p->indirect_fbo); + fbotex_uninit(&p->blend_subs_fbo); + fbotex_uninit(&p->screen_fbo); + fbotex_uninit(&p->output_fbo); + + for (int n = 0; n < FBOSURFACES_MAX; n++) + fbotex_uninit(&p->surfaces[n].fbotex); + + for (int n = 0; n < SHADER_MAX_SAVED; n++) + fbotex_uninit(&p->hook_fbos[n]); + + for (int n = 0; n < 2; n++) + fbotex_uninit(&p->vdpau_deinterleave_fbo[n]); + + gl_video_reset_surfaces(p); + gl_video_reset_hooks(p); + + gl_sc_reset_error(p->sc); +} + +bool gl_video_gamma_auto_enabled(struct gl_video *p) +{ + return p->opts.gamma_auto; +} + +struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p) +{ + return (struct mp_colorspace) { + .primaries = p->opts.target_prim, + .gamma = p->opts.target_trc, + }; +} + +// Warning: profile.start must point to a ta allocation, and the function +// takes over ownership. +void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data) +{ + if (gl_lcms_set_memory_profile(p->cms, icc_data)) + reinit_from_options(p); +} + +bool gl_video_icc_auto_enabled(struct gl_video *p) +{ + return p->opts.icc_opts ? 
p->opts.icc_opts->profile_auto : false; +} + +static bool gl_video_get_lut3d(struct gl_video *p, enum mp_csp_prim prim, + enum mp_csp_trc trc) +{ + if (!p->use_lut_3d) + return false; + + struct AVBufferRef *icc = NULL; + if (p->image.mpi) + icc = p->image.mpi->icc_profile; + + if (p->lut_3d_texture && !gl_lcms_has_changed(p->cms, prim, trc, icc)) + return true; + + // GLES3 doesn't provide filtered 16 bit integer textures + // GLES2 doesn't even provide 3D textures + const struct ra_format *fmt = ra_find_unorm_format(p->ra, 2, 4); + if (!fmt || !(p->ra->caps & RA_CAP_TEX_3D)) { + p->use_lut_3d = false; + MP_WARN(p, "Disabling color management (no RGBA16 3D textures).\n"); + return false; + } + + struct lut3d *lut3d = NULL; + if (!fmt || !gl_lcms_get_lut3d(p->cms, &lut3d, prim, trc, icc) || !lut3d) { + p->use_lut_3d = false; + return false; + } + + ra_tex_free(p->ra, &p->lut_3d_texture); + + struct ra_tex_params params = { + .dimensions = 3, + .w = lut3d->size[0], + .h = lut3d->size[1], + .d = lut3d->size[2], + .format = fmt, + .render_src = true, + .src_linear = true, + .initial_data = lut3d->data, + }; + p->lut_3d_texture = ra_tex_create(p->ra, ¶ms); + + debug_check_gl(p, "after 3d lut creation"); + + for (int i = 0; i < 3; i++) + p->lut_3d_size[i] = lut3d->size[i]; + + talloc_free(lut3d); + + return true; +} + +// Fill an img_tex struct from an FBO + some metadata +static struct img_tex img_tex_fbo(struct fbotex *fbo, enum plane_type type, + int components) +{ + assert(type != PLANE_NONE); + return (struct img_tex){ + .type = type, + .tex = fbo->tex, + .multiplier = 1.0, + .w = fbo->lw, + .h = fbo->lh, + .transform = identity_trans, + .components = components, + }; +} + +// Bind an img_tex to a free texture unit and return its ID. At most +// TEXUNIT_VIDEO_NUM texture units can be bound at once +static int pass_bind(struct gl_video *p, struct img_tex tex) +{ + assert(p->pass_tex_num < TEXUNIT_VIDEO_NUM); + p->pass_tex[p->pass_tex_num] = tex; + return p->pass_tex_num++; +} + +// Rotation by 90° and flipping. +// w/h is used for recentering. +static void get_transform(float w, float h, int rotate, bool flip, + struct gl_transform *out_tr) +{ + int a = rotate % 90 ? 0 : rotate / 90; + int sin90[4] = {0, 1, 0, -1}; // just to avoid rounding issues etc. + int cos90[4] = {1, 0, -1, 0}; + struct gl_transform tr = {{{ cos90[a], sin90[a]}, + {-sin90[a], cos90[a]}}}; + + // basically, recenter to keep the whole image in view + float b[2] = {1, 1}; + gl_transform_vec(tr, &b[0], &b[1]); + tr.t[0] += b[0] < 0 ? w : 0; + tr.t[1] += b[1] < 0 ? h : 0; + + if (flip) { + struct gl_transform fliptr = {{{1, 0}, {0, -1}}, {0, h}}; + gl_transform_trans(fliptr, &tr); + } + + *out_tr = tr; +} + +// Return the chroma plane upscaled to luma size, but with additional padding +// for image sizes not aligned to subsampling. +static int chroma_upsize(int size, int pixel) +{ + return (size + pixel - 1) / pixel * pixel; +} + +// If a and b are on the same plane, return what plane type should be used. +// If a or b are none, the other type always wins. +// Usually: LUMA/RGB/XYZ > CHROMA > ALPHA +static enum plane_type merge_plane_types(enum plane_type a, enum plane_type b) +{ + if (a == PLANE_NONE) + return b; + if (b == PLANE_LUMA || b == PLANE_RGB || b == PLANE_XYZ) + return b; + if (b != PLANE_NONE && a == PLANE_ALPHA) + return b; + return a; +} + +// Places a video_image's image textures + associated metadata into tex[]. The +// number of textures is equal to p->plane_count. 
Any necessary plane offsets +// are stored in off. (e.g. chroma position) +static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, + struct img_tex tex[4], struct gl_transform off[4]) +{ + assert(vimg->mpi); + + int w = p->image_params.w; + int h = p->image_params.h; + + // Determine the chroma offset + float ls_w = 1.0 / p->ra_format.chroma_w; + float ls_h = 1.0 / p->ra_format.chroma_h; + + struct gl_transform chroma = {{{ls_w, 0.0}, {0.0, ls_h}}}; + + if (p->image_params.chroma_location != MP_CHROMA_CENTER) { + int cx, cy; + mp_get_chroma_location(p->image_params.chroma_location, &cx, &cy); + // By default texture coordinates are such that chroma is centered with + // any chroma subsampling. If a specific direction is given, make it + // so that the luma and chroma sample line up exactly. + // For 4:4:4, setting chroma location should have no effect at all. + // luma sample size (in chroma coord. space) + chroma.t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0; + chroma.t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0; + } + + int msb_valid_bits = + p->ra_format.component_bits + MPMIN(p->ra_format.component_pad, 0); + // The existing code assumes we just have a single tex multiplier for + // all of the planes. This may change in the future + float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.color.space, + msb_valid_bits, + p->ra_format.component_bits); + + memset(tex, 0, 4 * sizeof(tex[0])); + for (int n = 0; n < p->plane_count; n++) { + struct texplane *t = &vimg->planes[n]; + + enum plane_type type = PLANE_NONE; + for (int i = 0; i < 4; i++) { + int c = p->ra_format.components[n][i]; + enum plane_type ctype; + if (c == 0) { + ctype = PLANE_NONE; + } else if (c == 4) { + ctype = PLANE_ALPHA; + } else if (p->image_params.color.space == MP_CSP_RGB) { + ctype = PLANE_RGB; + } else if (p->image_params.color.space == MP_CSP_XYZ) { + ctype = PLANE_XYZ; + } else { + ctype = c == 1 ? PLANE_LUMA : PLANE_CHROMA; + } + type = merge_plane_types(type, ctype); + } + + tex[n] = (struct img_tex){ + .type = type, + .tex = t->tex, + .multiplier = tex_mul, + .w = t->w, + .h = t->h, + }; + + for (int i = 0; i < 4; i++) + tex[n].components += !!p->ra_format.components[n][i]; + + get_transform(t->w, t->h, p->image_params.rotate, t->flipped, + &tex[n].transform); + if (p->image_params.rotate % 180 == 90) + MPSWAP(int, tex[n].w, tex[n].h); + + off[n] = identity_trans; + + if (type == PLANE_CHROMA) { + struct gl_transform rot; + get_transform(0, 0, p->image_params.rotate, true, &rot); + + struct gl_transform tr = chroma; + gl_transform_vec(rot, &tr.t[0], &tr.t[1]); + + float dx = (chroma_upsize(w, p->ra_format.chroma_w) - w) * ls_w; + float dy = (chroma_upsize(h, p->ra_format.chroma_h) - h) * ls_h; + + // Adjust the chroma offset if the real chroma size is fractional + // due image sizes not aligned to chroma subsampling. + struct gl_transform rot2; + get_transform(0, 0, p->image_params.rotate, t->flipped, &rot2); + if (rot2.m[0][0] < 0) + tr.t[0] += dx; + if (rot2.m[1][0] < 0) + tr.t[0] += dy; + if (rot2.m[0][1] < 0) + tr.t[1] += dx; + if (rot2.m[1][1] < 0) + tr.t[1] += dy; + + off[n] = tr; + } + } +} + +// Return the index of the given component (assuming all non-padding components +// of all planes are concatenated into a linear list). 
+static int find_comp(struct ra_imgfmt_desc *desc, int component) +{ + int cur = 0; + for (int n = 0; n < desc->num_planes; n++) { + for (int i = 0; i < 4; i++) { + if (desc->components[n][i]) { + if (desc->components[n][i] == component) + return cur; + cur++; + } + } + } + return -1; +} + +static void init_video(struct gl_video *p) +{ + p->use_integer_conversion = false; + + if (p->hwdec && ra_hwdec_test_format(p->hwdec, p->image_params.imgfmt)) { + if (p->hwdec->driver->overlay_frame) { + MP_WARN(p, "Using HW-overlay mode. No GL filtering is performed " + "on the video!\n"); + } else { + p->hwdec_mapper = ra_hwdec_mapper_create(p->hwdec, &p->image_params); + if (!p->hwdec_mapper) + MP_ERR(p, "Initializing texture for hardware decoding failed.\n"); + } + if (p->hwdec_mapper) + p->image_params = p->hwdec_mapper->dst_params; + const char **exts = p->hwdec->glsl_extensions; + for (int n = 0; exts && exts[n]; n++) + gl_sc_enable_extension(p->sc, (char *)exts[n]); + p->hwdec_active = true; + } + + p->ra_format = (struct ra_imgfmt_desc){0}; + ra_get_imgfmt_desc(p->ra, p->image_params.imgfmt, &p->ra_format); + + p->plane_count = p->ra_format.num_planes; + + p->has_alpha = false; + p->is_gray = true; + + for (int n = 0; n < p->ra_format.num_planes; n++) { + for (int i = 0; i < 4; i++) { + if (p->ra_format.components[n][i]) { + p->has_alpha |= p->ra_format.components[n][i] == 4; + p->is_gray &= p->ra_format.components[n][i] == 1 || + p->ra_format.components[n][i] == 4; + } + } + } + + for (int c = 0; c < 4; c++) { + int loc = find_comp(&p->ra_format, c + 1); + p->color_swizzle[c] = "rgba"[loc >= 0 && loc < 4 ? loc : 0]; + } + p->color_swizzle[4] = '\0'; + + // Format-dependent checks. + check_gl_features(p); + + mp_image_params_guess_csp(&p->image_params); + + av_lfg_init(&p->lfg, 1); + + debug_check_gl(p, "before video texture creation"); + + if (!p->hwdec_active) { + struct video_image *vimg = &p->image; + + struct mp_image layout = {0}; + mp_image_set_params(&layout, &p->image_params); + + for (int n = 0; n < p->plane_count; n++) { + struct texplane *plane = &vimg->planes[n]; + const struct ra_format *format = p->ra_format.planes[n]; + + plane->w = mp_image_plane_w(&layout, n); + plane->h = mp_image_plane_h(&layout, n); + + struct ra_tex_params params = { + .dimensions = 2, + .w = plane->w + p->opts.tex_pad_x, + .h = plane->h + p->opts.tex_pad_y, + .d = 1, + .format = format, + .render_src = true, + .src_linear = format->linear_filter, + .non_normalized = p->opts.use_rectangle, + .host_mutable = true, + }; + + MP_VERBOSE(p, "Texture for plane %d: %dx%d\n", n, + params.w, params.h); + + plane->tex = ra_tex_create(p->ra, ¶ms); + if (!plane->tex) + abort(); // shit happens + + p->use_integer_conversion |= format->ctype == RA_CTYPE_UINT; + } + } + + debug_check_gl(p, "after video texture creation"); + + gl_video_setup_hooks(p); +} + +// Release any texture mappings associated with the current frame. 
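An aside before unmap_current_image() below (the function the comment just above introduces): the color_swizzle derivation in init_video() can be shown in isolation. The sketch below is plain C and not part of the patch; it runs the same find_comp()-style lookup over an invented two-plane component table, which is a hypothetical example rather than mpv's real ra_imgfmt_desc:

#include <stdio.h>

#define NUM_PLANES 2

// Hypothetical layout: plane 0 carries component 1 (luma), plane 1 carries
// components 2 and 3 (chroma); 0 means "no component", alpha (4) is absent.
static const int components[NUM_PLANES][4] = {
    {1, 0, 0, 0},
    {2, 3, 0, 0},
};

// Same idea as find_comp() above: index of a component in the concatenated
// list of all non-padding components, or -1 if it does not exist.
static int find_comp_demo(int component)
{
    int cur = 0;
    for (int n = 0; n < NUM_PLANES; n++) {
        for (int i = 0; i < 4; i++) {
            if (components[n][i]) {
                if (components[n][i] == component)
                    return cur;
                cur++;
            }
        }
    }
    return -1;
}

int main(void)
{
    char swizzle[5];
    for (int c = 0; c < 4; c++) {
        int loc = find_comp_demo(c + 1);
        swizzle[c] = "rgba"[loc >= 0 && loc < 4 ? loc : 0];
    }
    swizzle[4] = '\0';
    // Prints "rgbr": the missing alpha component falls back to 'r', just as
    // in the init_video() loop above.
    printf("derived swizzle: %s\n", swizzle);
    return 0;
}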
+static void unmap_current_image(struct gl_video *p) +{ + struct video_image *vimg = &p->image; + + if (vimg->hwdec_mapped) { + assert(p->hwdec_active && p->hwdec_mapper); + ra_hwdec_mapper_unmap(p->hwdec_mapper); + memset(vimg->planes, 0, sizeof(vimg->planes)); + vimg->hwdec_mapped = false; + vimg->id = 0; // needs to be mapped again + } +} + +static struct dr_buffer *gl_find_dr_buffer(struct gl_video *p, uint8_t *ptr) +{ + for (int i = 0; i < p->num_dr_buffers; i++) { + struct dr_buffer *buffer = &p->dr_buffers[i]; + uint8_t *bufptr = buffer->buf->data; + size_t size = buffer->buf->params.size; + if (ptr >= bufptr && ptr < bufptr + size) + return buffer; + } + + return NULL; +} + +static void gc_pending_dr_fences(struct gl_video *p, bool force) +{ +again:; + for (int n = 0; n < p->num_dr_buffers; n++) { + struct dr_buffer *buffer = &p->dr_buffers[n]; + if (!buffer->mpi) + continue; + + bool res = p->ra->fns->buf_poll(p->ra, buffer->buf); + if (res || force) { + // Unreferencing the image could cause gl_video_dr_free_buffer() + // to be called by the talloc destructor (if it was the last + // reference). This will implicitly invalidate the buffer pointer + // and change the p->dr_buffers array. To make it worse, it could + // free multiple dr_buffers due to weird theoretical corner cases. + // This is also why we use the goto to iterate again from the + // start, because everything gets fucked up. Hail satan! + struct mp_image *ref = buffer->mpi; + buffer->mpi = NULL; + talloc_free(ref); + goto again; + } + } +} + +static void unref_current_image(struct gl_video *p) +{ + unmap_current_image(p); + p->image.id = 0; + + mp_image_unrefp(&p->image.mpi); + + // While we're at it, also garbage collect pending fences in here to + // get it out of the way. + gc_pending_dr_fences(p, false); +} + +// If overlay mode is used, make sure to remove the overlay. +// Be careful with this. Removing the overlay and adding another one will +// lead to flickering artifacts. +static void unmap_overlay(struct gl_video *p) +{ + if (p->hwdec_active && p->hwdec->driver->overlay_frame) + p->hwdec->driver->overlay_frame(p->hwdec, NULL, NULL, NULL, true); +} + +static void uninit_video(struct gl_video *p) +{ + uninit_rendering(p); + + struct video_image *vimg = &p->image; + + unmap_overlay(p); + unref_current_image(p); + + for (int n = 0; n < p->plane_count; n++) { + struct texplane *plane = &vimg->planes[n]; + ra_tex_free(p->ra, &plane->tex); + } + *vimg = (struct video_image){0}; + + // Invalidate image_params to ensure that gl_video_config() will call + // init_video() on uninitialized gl_video. + p->real_image_params = (struct mp_image_params){0}; + p->image_params = p->real_image_params; + p->hwdec_active = false; + ra_hwdec_mapper_free(&p->hwdec_mapper); +} + +static void pass_record(struct gl_video *p, struct mp_pass_perf perf) +{ + if (!p->pass || p->pass_idx == PASS_INFO_MAX) + return; + + struct pass_info *pass = &p->pass[p->pass_idx]; + pass->perf = perf; + + if (pass->desc.len == 0) + bstr_xappend(p, &pass->desc, bstr0("(unknown)")); + + p->pass_idx++; +} + +PRINTF_ATTRIBUTE(2, 3) +static void pass_describe(struct gl_video *p, const char *textf, ...) 
+{ + if (!p->pass || p->pass_idx == PASS_INFO_MAX) + return; + + struct pass_info *pass = &p->pass[p->pass_idx]; + + if (pass->desc.len > 0) + bstr_xappend(p, &pass->desc, bstr0(" + ")); + + va_list ap; + va_start(ap, textf); + bstr_xappend_vasprintf(p, &pass->desc, textf, ap); + va_end(ap); +} + +static void pass_info_reset(struct gl_video *p, bool is_redraw) +{ + p->pass = is_redraw ? p->pass_redraw : p->pass_fresh; + p->pass_idx = 0; + + for (int i = 0; i < PASS_INFO_MAX; i++) { + p->pass[i].desc.len = 0; + p->pass[i].perf = (struct mp_pass_perf){0}; + } +} + +static void pass_report_performance(struct gl_video *p) +{ + if (!p->pass) + return; + + for (int i = 0; i < PASS_INFO_MAX; i++) { + struct pass_info *pass = &p->pass[i]; + if (pass->desc.len) { + MP_DBG(p, "pass '%.*s': last %dus avg %dus peak %dus\n", + BSTR_P(pass->desc), + (int)pass->perf.last/1000, + (int)pass->perf.avg/1000, + (int)pass->perf.peak/1000); + } + } +} + +static void pass_prepare_src_tex(struct gl_video *p) +{ + struct gl_shader_cache *sc = p->sc; + + for (int n = 0; n < p->pass_tex_num; n++) { + struct img_tex *s = &p->pass_tex[n]; + if (!s->tex) + continue; + + char *texture_name = mp_tprintf(32, "texture%d", n); + char *texture_size = mp_tprintf(32, "texture_size%d", n); + char *texture_rot = mp_tprintf(32, "texture_rot%d", n); + char *texture_off = mp_tprintf(32, "texture_off%d", n); + char *pixel_size = mp_tprintf(32, "pixel_size%d", n); + + gl_sc_uniform_texture(sc, texture_name, s->tex); + float f[2] = {1, 1}; + if (!s->tex->params.non_normalized) { + f[0] = s->tex->params.w; + f[1] = s->tex->params.h; + } + gl_sc_uniform_vec2(sc, texture_size, f); + gl_sc_uniform_mat2(sc, texture_rot, true, (float *)s->transform.m); + gl_sc_uniform_vec2(sc, texture_off, (float *)s->transform.t); + gl_sc_uniform_vec2(sc, pixel_size, (float[]){1.0f / f[0], + 1.0f / f[1]}); + } +} + +// Sets the appropriate compute shader metadata for an implicit compute pass +// bw/bh: block size +static void pass_is_compute(struct gl_video *p, int bw, int bh) +{ + p->pass_compute = (struct compute_info){ + .active = true, + .block_w = bw, + .block_h = bh, + }; +} + +// w/h: the width/height of the compute shader's operating domain (e.g. the +// target target that needs to be written, or the source texture that needs to +// be reduced) +static void dispatch_compute(struct gl_video *p, int w, int h, + struct compute_info info) +{ + PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", + info.threads_w > 0 ? info.threads_w : info.block_w, + info.threads_h > 0 ? 
info.threads_h : info.block_h); + + pass_prepare_src_tex(p); + gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); + + // Since we don't actually have vertices, we pretend for convenience + // reasons that we do and calculate the right texture coordinates based on + // the output sample ID + gl_sc_uniform_vec2(p->sc, "out_scale", (float[2]){ 1.0 / w, 1.0 / h }); + PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n"); + + for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { + struct img_tex *s = &p->pass_tex[n]; + if (!s->tex) + continue; + + // We need to rescale the coordinates to the true texture size + char tex_scale[32]; + snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n); + gl_sc_uniform_vec2(p->sc, tex_scale, (float[2]){ + (float)s->w / s->tex->params.w, + (float)s->h / s->tex->params.h, + }); + + PRELUDE("#define texcoord%d_raw(id) (tex_scale%d * outcoord(id))\n", n, n); + PRELUDE("#define texcoord%d_rot(id) (texture_rot%d * texcoord%d_raw(id) + " + "pixel_size%d * texture_off%d)\n", n, n, n, n, n); + // Clamp the texture coordinates to prevent sampling out-of-bounds in + // threads that exceed the requested width/height + PRELUDE("#define texmap%d(id) min(texcoord%d_rot(id), vec2(1.0))\n", n, n); + PRELUDE("#define texcoord%d texmap%d(gl_GlobalInvocationID)\n", n, n); + } + + // always round up when dividing to make sure we don't leave off a part of + // the image + int num_x = info.block_w > 0 ? (w + info.block_w - 1) / info.block_w : 1, + num_y = info.block_h > 0 ? (h + info.block_h - 1) / info.block_h : 1; + + pass_record(p, gl_sc_dispatch_compute(p->sc, num_x, num_y, 1)); + + memset(&p->pass_tex, 0, sizeof(p->pass_tex)); + p->pass_tex_num = 0; +} + +static struct mp_pass_perf render_pass_quad(struct gl_video *p, + struct fbodst target, + const struct mp_rect *dst) +{ + struct vertex va[6] = {0}; + + struct gl_transform t; + gl_transform_ortho_fbodst(&t, target); + + float x[2] = {dst->x0, dst->x1}; + float y[2] = {dst->y0, dst->y1}; + gl_transform_vec(t, &x[0], &y[0]); + gl_transform_vec(t, &x[1], &y[1]); + + for (int n = 0; n < 4; n++) { + struct vertex *v = &va[n]; + v->position.x = x[n / 2]; + v->position.y = y[n % 2]; + for (int i = 0; i < p->pass_tex_num; i++) { + struct img_tex *s = &p->pass_tex[i]; + if (!s->tex) + continue; + struct gl_transform tr = s->transform; + float tx = (n / 2) * s->w; + float ty = (n % 2) * s->h; + gl_transform_vec(tr, &tx, &ty); + bool rect = s->tex->params.non_normalized; + v->texcoord[i].x = tx / (rect ? 1 : s->tex->params.w); + v->texcoord[i].y = ty / (rect ? 
1 : s->tex->params.h); + } + } + + va[4] = va[2]; + va[5] = va[1]; + + return gl_sc_dispatch_draw(p->sc, target.tex, va, 6); +} + +static void finish_pass_direct(struct gl_video *p, struct fbodst target, + const struct mp_rect *dst) +{ + pass_prepare_src_tex(p); + gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); + pass_record(p, render_pass_quad(p, target, dst)); + debug_check_gl(p, "after rendering"); + memset(&p->pass_tex, 0, sizeof(p->pass_tex)); + p->pass_tex_num = 0; +} + +// dst_fbo: this will be used for rendering; possibly reallocating the whole +// FBO, if the required parameters have changed +// w, h: required FBO target dimension, and also defines the target rectangle +// used for rasterization +// flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy +// flags allows the FBO to be larger than the w/h parameters) +static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, + int w, int h, int flags) +{ + fbotex_change(dst_fbo, p->ra, p->log, w, h, p->fbo_format, flags); + + if (p->pass_compute.active) { + if (!dst_fbo->tex) + return; + gl_sc_uniform_image2D_wo(p->sc, "out_image", dst_fbo->tex); + if (!p->pass_compute.directly_writes) + GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);) + + dispatch_compute(p, w, h, p->pass_compute); + p->pass_compute = (struct compute_info){0}; + + debug_check_gl(p, "after dispatching compute shader"); + } else { + finish_pass_direct(p, dst_fbo->fbo, &(struct mp_rect){0, 0, w, h}); + } +} + +static const char *get_tex_swizzle(struct img_tex *img) +{ + if (!img->tex) + return "rgba"; + return img->tex->params.format->luminance_alpha ? "raaa" : "rgba"; +} + +// Copy a texture to the vec4 color, while increasing offset. Also applies +// the texture multiplier to the sampled color +static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img) +{ + int count = img.components; + assert(*offset + count <= 4); + + int id = pass_bind(p, img); + char src[5] = {0}; + char dst[5] = {0}; + const char *tex_fmt = get_tex_swizzle(&img); + const char *dst_fmt = "rgba"; + for (int i = 0; i < count; i++) { + src[i] = tex_fmt[i]; + dst[i] = dst_fmt[*offset + i]; + } + + if (img.tex && img.tex->params.format->ctype == RA_CTYPE_UINT) { + uint64_t tex_max = 1ull << p->ra_format.component_bits; + img.multiplier *= 1.0 / (tex_max - 1); + } + + GLSLF("color.%s = %f * vec4(texture(texture%d, texcoord%d)).%s;\n", + dst, img.multiplier, id, id, src); + + *offset += count; +} + +static void skip_unused(struct gl_video *p, int num_components) +{ + for (int i = num_components; i < 4; i++) + GLSLF("color.%c = %f;\n", "rgba"[i], i < 3 ? 
0.0 : 1.0); +} + +static void uninit_scaler(struct gl_video *p, struct scaler *scaler) +{ + fbotex_uninit(&scaler->sep_fbo); + ra_tex_free(p->ra, &scaler->lut); + scaler->kernel = NULL; + scaler->initialized = false; +} + +static void hook_prelude(struct gl_video *p, const char *name, int id, + struct img_tex tex) +{ + GLSLHF("#define %s_raw texture%d\n", name, id); + GLSLHF("#define %s_pos texcoord%d\n", name, id); + GLSLHF("#define %s_size texture_size%d\n", name, id); + GLSLHF("#define %s_rot texture_rot%d\n", name, id); + GLSLHF("#define %s_pt pixel_size%d\n", name, id); + GLSLHF("#define %s_map texmap%d\n", name, id); + GLSLHF("#define %s_mul %f\n", name, tex.multiplier); + + // Set up the sampling functions + GLSLHF("#define %s_tex(pos) (%s_mul * vec4(texture(%s_raw, pos)).%s)\n", + name, name, name, get_tex_swizzle(&tex)); + + // Since the extra matrix multiplication impacts performance, + // skip it unless the texture was actually rotated + if (gl_transform_eq(tex.transform, identity_trans)) { + GLSLHF("#define %s_texOff(off) %s_tex(%s_pos + %s_pt * vec2(off))\n", + name, name, name, name); + } else { + GLSLHF("#define %s_texOff(off) " + "%s_tex(%s_pos + %s_rot * vec2(off)/%s_size)\n", + name, name, name, name, name); + } +} + +static bool saved_tex_find(struct gl_video *p, const char *name, + struct img_tex *out) +{ + if (!name || !out) + return false; + + for (int i = 0; i < p->saved_tex_num; i++) { + if (strcmp(p->saved_tex[i].name, name) == 0) { + *out = p->saved_tex[i].tex; + return true; + } + } + + return false; +} + +static void saved_tex_store(struct gl_video *p, const char *name, + struct img_tex tex) +{ + assert(name); + + for (int i = 0; i < p->saved_tex_num; i++) { + if (strcmp(p->saved_tex[i].name, name) == 0) { + p->saved_tex[i].tex = tex; + return; + } + } + + assert(p->saved_tex_num < SHADER_MAX_SAVED); + p->saved_tex[p->saved_tex_num++] = (struct saved_tex) { + .name = name, + .tex = tex + }; +} + +static bool pass_hook_setup_binds(struct gl_video *p, const char *name, + struct img_tex tex, struct tex_hook *hook) +{ + for (int t = 0; t < TEXUNIT_VIDEO_NUM; t++) { + char *bind_name = (char *)hook->bind_tex[t]; + + if (!bind_name) + continue; + + // This is a special name that means "currently hooked texture" + if (strcmp(bind_name, "HOOKED") == 0) { + int id = pass_bind(p, tex); + hook_prelude(p, "HOOKED", id, tex); + hook_prelude(p, name, id, tex); + continue; + } + + // BIND can also be used to load user-defined textures, in which + // case we will directly load them as a uniform instead of + // generating the hook_prelude boilerplate + for (int u = 0; u < p->user_tex_num; u++) { + struct gl_user_shader_tex *utex = &p->user_textures[u]; + if (bstr_equals0(utex->name, bind_name)) { + gl_sc_uniform_texture(p->sc, bind_name, utex->tex); + goto next_bind; + } + } + + struct img_tex bind_tex; + if (!saved_tex_find(p, bind_name, &bind_tex)) { + // Clean up texture bindings and move on to the next hook + MP_DBG(p, "Skipping hook on %s due to no texture named %s.\n", + name, bind_name); + p->pass_tex_num -= t; + return false; + } + + hook_prelude(p, bind_name, pass_bind(p, bind_tex), bind_tex); + +next_bind: ; + } + + return true; +} + +// Process hooks for a plane, saving the result and returning a new img_tex +// If 'trans' is NULL, the shader is forbidden from transforming tex +static struct img_tex pass_hook(struct gl_video *p, const char *name, + struct img_tex tex, struct gl_transform *trans) +{ + if (!name) + return tex; + + saved_tex_store(p, name, tex); + + 
MP_DBG(p, "Running hooks for %s\n", name); + for (int i = 0; i < p->tex_hook_num; i++) { + struct tex_hook *hook = &p->tex_hooks[i]; + + // Figure out if this pass hooks this texture + for (int h = 0; h < SHADER_MAX_HOOKS; h++) { + if (hook->hook_tex[h] && strcmp(hook->hook_tex[h], name) == 0) + goto found; + } + + continue; + +found: + // Check the hook's condition + if (hook->cond && !hook->cond(p, tex, hook->priv)) { + MP_DBG(p, "Skipping hook on %s due to condition.\n", name); + continue; + } + + if (!pass_hook_setup_binds(p, name, tex, hook)) + continue; + + // Run the actual hook. This generates a series of GLSL shader + // instructions sufficient for drawing the hook's output + struct gl_transform hook_off = identity_trans; + hook->hook(p, tex, &hook_off, hook->priv); + + int comps = hook->components ? hook->components : tex.components; + skip_unused(p, comps); + + // Compute the updated FBO dimensions and store the result + struct mp_rect_f sz = {0, 0, tex.w, tex.h}; + gl_transform_rect(hook_off, &sz); + int w = lroundf(fabs(sz.x1 - sz.x0)); + int h = lroundf(fabs(sz.y1 - sz.y0)); + + assert(p->hook_fbo_num < SHADER_MAX_SAVED); + struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; + finish_pass_fbo(p, fbo, w, h, 0); + + const char *store_name = hook->save_tex ? hook->save_tex : name; + struct img_tex saved_tex = img_tex_fbo(fbo, tex.type, comps); + + // If the texture we're saving overwrites the "current" texture, also + // update the tex parameter so that the future loop cycles will use the + // updated values, and export the offset + if (strcmp(store_name, name) == 0) { + if (!trans && !gl_transform_eq(hook_off, identity_trans)) { + MP_ERR(p, "Hook tried changing size of unscalable texture %s!\n", + name); + return tex; + } + + tex = saved_tex; + if (trans) + gl_transform_trans(hook_off, trans); + } + + saved_tex_store(p, store_name, saved_tex); + } + + return tex; +} + +// This can be used at any time in the middle of rendering to specify an +// optional hook point, which if triggered will render out to a new FBO and +// load the result back into vec4 color. 
Offsets applied by the hooks are +// accumulated in tex_trans, and the FBO is dimensioned according +// to p->texture_w/h +static void pass_opt_hook_point(struct gl_video *p, const char *name, + struct gl_transform *tex_trans) +{ + if (!name) + return; + + for (int i = 0; i < p->tex_hook_num; i++) { + struct tex_hook *hook = &p->tex_hooks[i]; + + for (int h = 0; h < SHADER_MAX_HOOKS; h++) { + if (hook->hook_tex[h] && strcmp(hook->hook_tex[h], name) == 0) + goto found; + } + + for (int b = 0; b < TEXUNIT_VIDEO_NUM; b++) { + if (hook->bind_tex[b] && strcmp(hook->bind_tex[b], name) == 0) + goto found; + } + } + + // Nothing uses this texture, don't bother storing it + return; + +found: + assert(p->hook_fbo_num < SHADER_MAX_SAVED); + struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; + finish_pass_fbo(p, fbo, p->texture_w, p->texture_h, 0); + + struct img_tex img = img_tex_fbo(fbo, PLANE_RGB, p->components); + img = pass_hook(p, name, img, tex_trans); + copy_img_tex(p, &(int){0}, img); + p->texture_w = img.w; + p->texture_h = img.h; + p->components = img.components; + pass_describe(p, "(remainder pass)"); +} + +static void load_shader(struct gl_video *p, struct bstr body) +{ + gl_sc_hadd_bstr(p->sc, body); + gl_sc_uniform_f(p->sc, "random", (double)av_lfg_get(&p->lfg) / UINT32_MAX); + gl_sc_uniform_i(p->sc, "frame", p->frames_uploaded); + gl_sc_uniform_vec2(p->sc, "input_size", + (float[]){(p->src_rect.x1 - p->src_rect.x0) * + p->texture_offset.m[0][0], + (p->src_rect.y1 - p->src_rect.y0) * + p->texture_offset.m[1][1]}); + gl_sc_uniform_vec2(p->sc, "target_size", + (float[]){p->dst_rect.x1 - p->dst_rect.x0, + p->dst_rect.y1 - p->dst_rect.y0}); + gl_sc_uniform_vec2(p->sc, "tex_offset", + (float[]){p->src_rect.x0 * p->texture_offset.m[0][0] + + p->texture_offset.t[0], + p->src_rect.y0 * p->texture_offset.m[1][1] + + p->texture_offset.t[1]}); +} + +// Semantic equality +static bool double_seq(double a, double b) +{ + return (isnan(a) && isnan(b)) || a == b; +} + +static bool scaler_fun_eq(struct scaler_fun a, struct scaler_fun b) +{ + if ((a.name && !b.name) || (b.name && !a.name)) + return false; + + return ((!a.name && !b.name) || strcmp(a.name, b.name) == 0) && + double_seq(a.params[0], b.params[0]) && + double_seq(a.params[1], b.params[1]) && + a.blur == b.blur && + a.taper == b.taper; +} + +static bool scaler_conf_eq(struct scaler_config a, struct scaler_config b) +{ + // Note: antiring isn't compared because it doesn't affect LUT + // generation + return scaler_fun_eq(a.kernel, b.kernel) && + scaler_fun_eq(a.window, b.window) && + a.radius == b.radius && + a.clamp == b.clamp; +} + +static void reinit_scaler(struct gl_video *p, struct scaler *scaler, + const struct scaler_config *conf, + double scale_factor, + int sizes[]) +{ + if (scaler_conf_eq(scaler->conf, *conf) && + scaler->scale_factor == scale_factor && + scaler->initialized) + return; + + uninit_scaler(p, scaler); + + scaler->conf = *conf; + bool is_tscale = scaler->index == SCALER_TSCALE; + scaler->conf.kernel.name = (char *)handle_scaler_opt(conf->kernel.name, is_tscale); + scaler->conf.window.name = (char *)handle_scaler_opt(conf->window.name, is_tscale); + scaler->scale_factor = scale_factor; + scaler->insufficient = false; + scaler->initialized = true; + + const struct filter_kernel *t_kernel = mp_find_filter_kernel(conf->kernel.name); + if (!t_kernel) + return; + + scaler->kernel_storage = *t_kernel; + scaler->kernel = &scaler->kernel_storage; + + const char *win = conf->window.name; + if (!win || !win[0]) + win = 
t_kernel->window; // fall back to the scaler's default window + const struct filter_window *t_window = mp_find_filter_window(win); + if (t_window) + scaler->kernel->w = *t_window; + + for (int n = 0; n < 2; n++) { + if (!isnan(conf->kernel.params[n])) + scaler->kernel->f.params[n] = conf->kernel.params[n]; + if (!isnan(conf->window.params[n])) + scaler->kernel->w.params[n] = conf->window.params[n]; + } + + if (conf->kernel.blur > 0.0) + scaler->kernel->f.blur = conf->kernel.blur; + if (conf->window.blur > 0.0) + scaler->kernel->w.blur = conf->window.blur; + + if (conf->kernel.taper > 0.0) + scaler->kernel->f.taper = conf->kernel.taper; + if (conf->window.taper > 0.0) + scaler->kernel->w.taper = conf->window.taper; + + if (scaler->kernel->f.resizable && conf->radius > 0.0) + scaler->kernel->f.radius = conf->radius; + + scaler->kernel->clamp = conf->clamp; + scaler->kernel->value_cutoff = conf->cutoff; + + scaler->insufficient = !mp_init_filter(scaler->kernel, sizes, scale_factor); + + int size = scaler->kernel->size; + int num_components = size > 2 ? 4 : size; + const struct ra_format *fmt = ra_find_float16_format(p->ra, num_components); + assert(fmt); + + int width = (size + num_components - 1) / num_components; // round up + int stride = width * num_components; + assert(size <= stride); + + scaler->lut_size = 1 << p->opts.scaler_lut_size; + + float *weights = talloc_array(NULL, float, scaler->lut_size * stride); + mp_compute_lut(scaler->kernel, scaler->lut_size, stride, weights); + + bool use_1d = scaler->kernel->polar && (p->ra->caps & RA_CAP_TEX_1D); + + struct ra_tex_params lut_params = { + .dimensions = use_1d ? 1 : 2, + .w = use_1d ? scaler->lut_size : width, + .h = use_1d ? 1 : scaler->lut_size, + .d = 1, + .format = fmt, + .render_src = true, + .src_linear = true, + .initial_data = weights, + }; + scaler->lut = ra_tex_create(p->ra, &lut_params); + + talloc_free(weights); + + debug_check_gl(p, "after initializing scaler"); +} + +// Special helper for sampling from two separated stages +static void pass_sample_separated(struct gl_video *p, struct img_tex src, + struct scaler *scaler, int w, int h) +{ + // Separate the transformation into x and y components, per pass + struct gl_transform t_x = { + .m = {{src.transform.m[0][0], 0.0}, {src.transform.m[1][0], 1.0}}, + .t = {src.transform.t[0], 0.0}, + }; + struct gl_transform t_y = { + .m = {{1.0, src.transform.m[0][1]}, {0.0, src.transform.m[1][1]}}, + .t = {0.0, src.transform.t[1]}, + }; + + // First pass (scale only in the y dir) + src.transform = t_y; + sampler_prelude(p->sc, pass_bind(p, src)); + GLSLF("// first pass\n"); + pass_sample_separated_gen(p->sc, scaler, 0, 1); + GLSLF("color *= %f;\n", src.multiplier); + finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H); + + // Second pass (scale only in the x dir) + src = img_tex_fbo(&scaler->sep_fbo, src.type, src.components); + src.transform = t_x; + pass_describe(p, "%s second pass", scaler->conf.kernel.name); + sampler_prelude(p->sc, pass_bind(p, src)); + pass_sample_separated_gen(p->sc, scaler, 1, 0); +} + +// Picks either the compute shader version or the regular sampler version +// depending on hardware support +static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler, + struct img_tex tex, int w, int h) +{ + uint64_t reqs = RA_CAP_COMPUTE | RA_CAP_NESTED_ARRAY; + if ((p->ra->caps & reqs) != reqs) + goto fallback; + + int bound = ceil(scaler->kernel->radius_cutoff); + int offset = bound - 1; // padding top/left + int padding = offset + bound; 
// total padding + + float ratiox = (float)w / tex.w, + ratioy = (float)h / tex.h; + + // For performance we want to load at least as many pixels + // horizontally as there are threads in a warp (32 for nvidia), as + // well as enough to take advantage of shmem parallelism + const int warp_size = 32, threads = 256; + int bw = warp_size; + int bh = threads / bw; + + // We need to sample everything from base_min to base_max, so make sure + // we have enough room in shmem + int iw = (int)ceil(bw / ratiox) + padding + 1, + ih = (int)ceil(bh / ratioy) + padding + 1; + + int shmem_req = iw * ih * tex.components * sizeof(float); + if (shmem_req > p->ra->max_shmem) + goto fallback; + + pass_is_compute(p, bw, bh); + pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih); + return; + +fallback: + // Fall back to regular polar shader when compute shaders are unsupported + // or the kernel is too big for shmem + pass_sample_polar(p->sc, scaler, tex.components, p->ra->glsl_version); +} + +// Sample from img_tex, with the src rectangle given by it. +// The dst rectangle is implicit by what the caller will do next, but w and h +// must still be what is going to be used (to dimension FBOs correctly). +// This will write the scaled contents to the vec4 "color". +// The scaler unit is initialized by this function; in order to avoid cache +// thrashing, the scaler unit should usually use the same parameters. +static void pass_sample(struct gl_video *p, struct img_tex tex, + struct scaler *scaler, const struct scaler_config *conf, + double scale_factor, int w, int h) +{ + reinit_scaler(p, scaler, conf, scale_factor, filter_sizes); + + // Describe scaler + const char *scaler_opt[] = { + [SCALER_SCALE] = "scale", + [SCALER_DSCALE] = "dscale", + [SCALER_CSCALE] = "cscale", + [SCALER_TSCALE] = "tscale", + }; + + pass_describe(p, "%s=%s (%s)", scaler_opt[scaler->index], + scaler->conf.kernel.name, plane_names[tex.type]); + + bool is_separated = scaler->kernel && !scaler->kernel->polar; + + // Set up the transformation+prelude and bind the texture, for everything + // other than separated scaling (which does this in the subfunction) + if (!is_separated) + sampler_prelude(p->sc, pass_bind(p, tex)); + + // Dispatch the scaler. They're all wildly different. + const char *name = scaler->conf.kernel.name; + if (strcmp(name, "bilinear") == 0) { + GLSL(color = texture(tex, pos);) + } else if (strcmp(name, "bicubic_fast") == 0) { + pass_sample_bicubic_fast(p->sc); + } else if (strcmp(name, "oversample") == 0) { + pass_sample_oversample(p->sc, scaler, w, h); + } else if (scaler->kernel && scaler->kernel->polar) { + pass_dispatch_sample_polar(p, scaler, tex, w, h); + } else if (scaler->kernel) { + pass_sample_separated(p, tex, scaler, w, h); + } else { + // Should never happen + abort(); + } + + // Apply any required multipliers. 
Separated scaling already does this in + // its first stage + if (!is_separated) + GLSLF("color *= %f;\n", tex.multiplier); + + // Micro-optimization: Avoid scaling unneeded channels + skip_unused(p, tex.components); +} + +// Returns true if two img_texs are semantically equivalent (same metadata) +static bool img_tex_equiv(struct img_tex a, struct img_tex b) +{ + return a.type == b.type && + a.components == b.components && + a.multiplier == b.multiplier && + a.tex->params.format == b.tex->params.format && + a.tex->params.w == b.tex->params.w && + a.tex->params.h == b.tex->params.h && + a.w == b.w && + a.h == b.h && + gl_transform_eq(a.transform, b.transform); +} + +static bool add_hook(struct gl_video *p, struct tex_hook hook) +{ + if (p->tex_hook_num < SHADER_MAX_PASSES) { + p->tex_hooks[p->tex_hook_num++] = hook; + return true; + } else { + MP_ERR(p, "Too many passes! Limit is %d.\n", SHADER_MAX_PASSES); + talloc_free(hook.priv); + return false; + } +} + +static void deband_hook(struct gl_video *p, struct img_tex tex, + struct gl_transform *trans, void *priv) +{ + pass_describe(p, "debanding (%s)", plane_names[tex.type]); + pass_sample_deband(p->sc, p->opts.deband_opts, &p->lfg, + p->image_params.color.gamma); +} + +static void unsharp_hook(struct gl_video *p, struct img_tex tex, + struct gl_transform *trans, void *priv) +{ + pass_describe(p, "unsharp masking"); + pass_sample_unsharp(p->sc, p->opts.unsharp); +} + +struct szexp_ctx { + struct gl_video *p; + struct img_tex tex; +}; + +static bool szexp_lookup(void *priv, struct bstr var, float size[2]) +{ + struct szexp_ctx *ctx = priv; + struct gl_video *p = ctx->p; + + if (bstr_equals0(var, "NATIVE_CROPPED")) { + size[0] = (p->src_rect.x1 - p->src_rect.x0) * p->texture_offset.m[0][0]; + size[1] = (p->src_rect.y1 - p->src_rect.y0) * p->texture_offset.m[1][1]; + return true; + } + + // The size of OUTPUT is determined. It could be useful for certain + // user shaders to skip passes. 
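    // Illustrative note, not part of the patch: these names are what the
    // size expressions of user shaders resolve against; e.g. a //!WIDTH
    // expression referencing OUTPUT.w is answered by the branch below,
    // HOOKED by the branch after it, and textures stored via //!SAVE by the
    // saved_tex search at the end of this function.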
+ if (bstr_equals0(var, "OUTPUT")) { + size[0] = p->dst_rect.x1 - p->dst_rect.x0; + size[1] = p->dst_rect.y1 - p->dst_rect.y0; + return true; + } + + // HOOKED is a special case + if (bstr_equals0(var, "HOOKED")) { + size[0] = ctx->tex.w; + size[1] = ctx->tex.h; + return true; + } + + for (int o = 0; o < p->saved_tex_num; o++) { + if (bstr_equals0(var, p->saved_tex[o].name)) { + size[0] = p->saved_tex[o].tex.w; + size[1] = p->saved_tex[o].tex.h; + return true; + } + } + + return false; +} + +static bool user_hook_cond(struct gl_video *p, struct img_tex tex, void *priv) +{ + struct gl_user_shader_hook *shader = priv; + assert(shader); + + float res = false; + eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->cond, &res); + return res; +} + +static void user_hook(struct gl_video *p, struct img_tex tex, + struct gl_transform *trans, void *priv) +{ + struct gl_user_shader_hook *shader = priv; + assert(shader); + load_shader(p, shader->pass_body); + + pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc), + plane_names[tex.type]); + + if (shader->compute.active) { + p->pass_compute = shader->compute; + GLSLF("hook();\n"); + } else { + GLSLF("color = hook();\n"); + } + + // Make sure we at least create a legal FBO on failure, since it's better + // to do this and display an error message than just crash OpenGL + float w = 1.0, h = 1.0; + + eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->width, &w); + eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->height, &h); + + *trans = (struct gl_transform){{{w / tex.w, 0}, {0, h / tex.h}}}; + gl_transform_trans(shader->offset, trans); +} + +static bool add_user_hook(void *priv, struct gl_user_shader_hook hook) +{ + struct gl_video *p = priv; + struct gl_user_shader_hook *copy = talloc_ptrtype(p, copy); + *copy = hook; + + struct tex_hook texhook = { + .save_tex = bstrdup0(copy, hook.save_tex), + .components = hook.components, + .hook = user_hook, + .cond = user_hook_cond, + .priv = copy, + }; + + for (int h = 0; h < SHADER_MAX_HOOKS; h++) + texhook.hook_tex[h] = bstrdup0(copy, hook.hook_tex[h]); + for (int h = 0; h < SHADER_MAX_BINDS; h++) + texhook.bind_tex[h] = bstrdup0(copy, hook.bind_tex[h]); + + return add_hook(p, texhook); +} + +static bool add_user_tex(void *priv, struct gl_user_shader_tex tex) +{ + struct gl_video *p = priv; + + if (p->user_tex_num == SHADER_MAX_PASSES) { + MP_ERR(p, "Too many textures! 
Limit is %d.\n", SHADER_MAX_PASSES); + goto err; + } + + tex.tex = ra_tex_create(p->ra, &tex.params); + TA_FREEP(&tex.params.initial_data); + + p->user_textures[p->user_tex_num++] = tex; + return true; + +err: + talloc_free(tex.params.initial_data); + return false; +} + +static void load_user_shaders(struct gl_video *p, char **shaders) +{ + if (!shaders) + return; + + for (int n = 0; shaders[n] != NULL; n++) { + struct bstr file = load_cached_file(p, shaders[n]); + parse_user_shader(p->log, p->ra, file, p, add_user_hook, add_user_tex); + } +} + +static void gl_video_setup_hooks(struct gl_video *p) +{ + gl_video_reset_hooks(p); + + if (p->opts.deband) { + add_hook(p, (struct tex_hook) { + .hook_tex = {"LUMA", "CHROMA", "RGB", "XYZ"}, + .bind_tex = {"HOOKED"}, + .hook = deband_hook, + }); + } + + if (p->opts.unsharp != 0.0) { + add_hook(p, (struct tex_hook) { + .hook_tex = {"MAIN"}, + .bind_tex = {"HOOKED"}, + .hook = unsharp_hook, + }); + } + + load_user_shaders(p, p->opts.user_shaders); +} + +// sample from video textures, set "color" variable to yuv value +static void pass_read_video(struct gl_video *p) +{ + struct img_tex tex[4]; + struct gl_transform offsets[4]; + pass_get_img_tex(p, &p->image, tex, offsets); + + // To keep the code as simple as possibly, we currently run all shader + // stages even if they would be unnecessary (e.g. no hooks for a texture). + // In the future, deferred img_tex should optimize this away. + + // Merge semantically identical textures. This loop is done from back + // to front so that merged textures end up in the right order while + // simultaneously allowing us to skip unnecessary merges + for (int n = 3; n >= 0; n--) { + if (tex[n].type == PLANE_NONE) + continue; + + int first = n; + int num = 0; + + for (int i = 0; i < n; i++) { + if (img_tex_equiv(tex[n], tex[i]) && + gl_transform_eq(offsets[n], offsets[i])) + { + GLSLF("// merging plane %d ...\n", i); + copy_img_tex(p, &num, tex[i]); + first = MPMIN(first, i); + tex[i] = (struct img_tex){0}; + } + } + + if (num > 0) { + GLSLF("// merging plane %d ... into %d\n", n, first); + copy_img_tex(p, &num, tex[n]); + pass_describe(p, "merging planes"); + finish_pass_fbo(p, &p->merge_fbo[n], tex[n].w, tex[n].h, 0); + tex[first] = img_tex_fbo(&p->merge_fbo[n], tex[n].type, num); + tex[n] = (struct img_tex){0}; + } + } + + // If any textures are still in integer format by this point, we need + // to introduce an explicit conversion pass to avoid breaking hooks/scaling + for (int n = 0; n < 4; n++) { + if (tex[n].tex && tex[n].tex->params.format->ctype == RA_CTYPE_UINT) { + GLSLF("// use_integer fix for plane %d\n", n); + copy_img_tex(p, &(int){0}, tex[n]); + pass_describe(p, "use_integer fix"); + finish_pass_fbo(p, &p->integer_fbo[n], tex[n].w, tex[n].h, 0); + tex[n] = img_tex_fbo(&p->integer_fbo[n], tex[n].type, + tex[n].components); + } + } + + // Dispatch the hooks for all of these textures, saving and perhaps + // modifying them in the process + for (int n = 0; n < 4; n++) { + const char *name; + switch (tex[n].type) { + case PLANE_RGB: name = "RGB"; break; + case PLANE_LUMA: name = "LUMA"; break; + case PLANE_CHROMA: name = "CHROMA"; break; + case PLANE_ALPHA: name = "ALPHA"; break; + case PLANE_XYZ: name = "XYZ"; break; + default: continue; + } + + tex[n] = pass_hook(p, name, tex[n], &offsets[n]); + } + + // At this point all planes are finalized but they may not be at the + // required size yet. Furthermore, they may have texture offsets that + // require realignment. 
For lack of something better to do, we assume + // the rgb/luma texture is the "reference" and scale everything else + // to match. + for (int n = 0; n < 4; n++) { + switch (tex[n].type) { + case PLANE_RGB: + case PLANE_XYZ: + case PLANE_LUMA: break; + default: continue; + } + + p->texture_w = tex[n].w; + p->texture_h = tex[n].h; + p->texture_offset = offsets[n]; + break; + } + + // Compute the reference rect + struct mp_rect_f src = {0.0, 0.0, p->image_params.w, p->image_params.h}; + struct mp_rect_f ref = src; + gl_transform_rect(p->texture_offset, &ref); + MP_DBG(p, "ref rect: {%f %f} {%f %f}\n", ref.x0, ref.y0, ref.x1, ref.y1); + + // Explicitly scale all of the textures that don't match + for (int n = 0; n < 4; n++) { + if (tex[n].type == PLANE_NONE) + continue; + + // If the planes are aligned identically, we will end up with the + // exact same source rectangle. + struct mp_rect_f rect = src; + gl_transform_rect(offsets[n], &rect); + MP_DBG(p, "rect[%d]: {%f %f} {%f %f}\n", n, + rect.x0, rect.y0, rect.x1, rect.y1); + + if (mp_rect_f_seq(ref, rect)) + continue; + + // If the rectangles differ, then our planes have a different + // alignment and/or size. First of all, we have to compute the + // corrections required to meet the target rectangle + struct gl_transform fix = { + .m = {{(ref.x1 - ref.x0) / (rect.x1 - rect.x0), 0.0}, + {0.0, (ref.y1 - ref.y0) / (rect.y1 - rect.y0)}}, + .t = {ref.x0, ref.y0}, + }; + MP_DBG(p, "-> fix[%d] = {%f %f} + off {%f %f}\n", n, + fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); + + // Since the scale in texture space is different from the scale in + // absolute terms, we have to scale the coefficients down to be + // relative to the texture's physical dimensions and local offset + struct gl_transform scale = { + .m = {{(float)tex[n].w / p->texture_w, 0.0}, + {0.0, (float)tex[n].h / p->texture_h}}, + .t = {-rect.x0, -rect.y0}, + }; + if (p->image_params.rotate % 180 == 90) + MPSWAP(double, scale.m[0][0], scale.m[1][1]); + + gl_transform_trans(scale, &fix); + MP_DBG(p, "-> scaled[%d] = {%f %f} + off {%f %f}\n", n, + fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); + + // Since the texture transform is a function of the texture coordinates + // to texture space, rather than the other way around, we have to + // actually apply the *inverse* of this. Fortunately, calculating + // the inverse is relatively easy here. 
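+ // For an affine transform T(x) = M*x + t with diagonal M this is easy:
+ // the inverse is T^-1(x) = x/M - t/M, i.e. take the per-axis reciprocals
+ // and rescale/negate the offsets with them.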
+ fix.m[0][0] = 1.0 / fix.m[0][0]; + fix.m[1][1] = 1.0 / fix.m[1][1]; + fix.t[0] = fix.m[0][0] * -fix.t[0]; + fix.t[1] = fix.m[1][1] * -fix.t[1]; + gl_transform_trans(fix, &tex[n].transform); + + int scaler_id = -1; + const char *name = NULL; + switch (tex[n].type) { + case PLANE_RGB: + case PLANE_LUMA: + case PLANE_XYZ: + scaler_id = SCALER_SCALE; + // these aren't worth hooking, fringe hypothetical cases only + break; + case PLANE_CHROMA: + scaler_id = SCALER_CSCALE; + name = "CHROMA_SCALED"; + break; + case PLANE_ALPHA: + // alpha always uses bilinear + name = "ALPHA_SCALED"; + } + + if (scaler_id < 0) + continue; + + const struct scaler_config *conf = &p->opts.scaler[scaler_id]; + struct scaler *scaler = &p->scaler[scaler_id]; + + // bilinear scaling is a free no-op thanks to GPU sampling + if (strcmp(conf->kernel.name, "bilinear") != 0) { + GLSLF("// upscaling plane %d\n", n); + pass_sample(p, tex[n], scaler, conf, 1.0, p->texture_w, p->texture_h); + finish_pass_fbo(p, &p->scale_fbo[n], p->texture_w, p->texture_h, 0); + tex[n] = img_tex_fbo(&p->scale_fbo[n], tex[n].type, tex[n].components); + } + + // Run any post-scaling hooks + tex[n] = pass_hook(p, name, tex[n], NULL); + } + + // All planes are of the same size and properly aligned at this point + GLSLF("// combining planes\n"); + int coord = 0; + for (int i = 0; i < 4; i++) { + if (tex[i].type != PLANE_NONE) + copy_img_tex(p, &coord, tex[i]); + } + p->components = coord; +} + +// Utility function that simply binds an FBO and reads from it, without any +// transformations. +static void pass_read_fbo(struct gl_video *p, struct fbotex *fbo) +{ + struct img_tex tex = img_tex_fbo(fbo, PLANE_RGB, p->components); + copy_img_tex(p, &(int){0}, tex); +} + +// yuv conversion, and any other conversions before main up/down-scaling +static void pass_convert_yuv(struct gl_video *p) +{ + struct gl_shader_cache *sc = p->sc; + + struct mp_csp_params cparams = MP_CSP_PARAMS_DEFAULTS; + cparams.gray = p->is_gray; + mp_csp_set_image_params(&cparams, &p->image_params); + mp_csp_equalizer_state_get(p->video_eq, &cparams); + p->user_gamma = 1.0 / (cparams.gamma * p->opts.gamma); + + pass_describe(p, "color conversion"); + + if (p->color_swizzle[0]) + GLSLF("color = color.%s;\n", p->color_swizzle); + + // Pre-colormatrix input gamma correction + if (cparams.color.space == MP_CSP_XYZ) + GLSL(color.rgb = pow(color.rgb, vec3(2.6));) // linear light + + // We always explicitly normalize the range in pass_read_video + cparams.input_bits = cparams.texture_bits = 0; + + // Conversion to RGB. For RGB itself, this still applies e.g. brightness + // and contrast controls, or expansion of e.g. LSB-packed 10 bit data. + struct mp_cmat m = {{{0}}}; + mp_get_csp_matrix(&cparams, &m); + gl_sc_uniform_mat3(sc, "colormatrix", true, &m.m[0][0]); + gl_sc_uniform_vec3(sc, "colormatrix_c", m.c); + + GLSL(color.rgb = mat3(colormatrix) * color.rgb + colormatrix_c;) + + if (p->image_params.color.space == MP_CSP_BT_2020_C) { + // Conversion for C'rcY'cC'bc via the BT.2020 CL system: + // C'bc = (B'-Y'c) / 1.9404 | C'bc <= 0 + // = (B'-Y'c) / 1.5816 | C'bc > 0 + // + // C'rc = (R'-Y'c) / 1.7184 | C'rc <= 0 + // = (R'-Y'c) / 0.9936 | C'rc > 0 + // + // as per the BT.2020 specification, table 4. This is a non-linear + // transformation because (constant) luminance receives non-equal + // contributions from the three different channels. 
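+ // Inverting the above: B' = C'bc * denom + Y'c and R' = C'rc * denom + Y'c,
+ // with the denominator picked per sign; the mix() below selects exactly
+ // these per-channel denominators.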
+ GLSLF("// constant luminance conversion\n"); + GLSL(color.br = color.br * mix(vec2(1.5816, 0.9936), + vec2(1.9404, 1.7184), + lessThanEqual(color.br, vec2(0))) + + color.gg;) + // Expand channels to camera-linear light. This shader currently just + // assumes everything uses the BT.2020 12-bit gamma function, since the + // difference between 10 and 12-bit is negligible for anything other + // than 12-bit content. + GLSL(color.rgb = mix(color.rgb * vec3(1.0/4.5), + pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), + vec3(1.0/0.45)), + lessThanEqual(vec3(0.08145), color.rgb));) + // Calculate the green channel from the expanded RYcB + // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B + GLSL(color.g = (color.g - 0.2627*color.r - 0.0593*color.b)*1.0/0.6780;) + // Recompress to receive the R'G'B' result, same as other systems + GLSL(color.rgb = mix(color.rgb * vec3(4.5), + vec3(1.0993) * pow(color.rgb, vec3(0.45)) - vec3(0.0993), + lessThanEqual(vec3(0.0181), color.rgb));) + } + + p->components = 3; + if (!p->has_alpha || p->opts.alpha_mode == ALPHA_NO) { + GLSL(color.a = 1.0;) + } else { // alpha present in image + p->components = 4; + GLSL(color = vec4(color.rgb * color.a, color.a);) + } +} + +static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]) +{ + double target_w = p->src_rect.x1 - p->src_rect.x0; + double target_h = p->src_rect.y1 - p->src_rect.y0; + if (transpose_rot && p->image_params.rotate % 180 == 90) + MPSWAP(double, target_w, target_h); + xy[0] = (p->dst_rect.x1 - p->dst_rect.x0) / target_w; + xy[1] = (p->dst_rect.y1 - p->dst_rect.y0) / target_h; +} + +// Cropping. +static void compute_src_transform(struct gl_video *p, struct gl_transform *tr) +{ + float sx = (p->src_rect.x1 - p->src_rect.x0) / (float)p->texture_w, + sy = (p->src_rect.y1 - p->src_rect.y0) / (float)p->texture_h, + ox = p->src_rect.x0, + oy = p->src_rect.y0; + struct gl_transform transform = {{{sx, 0}, {0, sy}}, {ox, oy}}; + + gl_transform_trans(p->texture_offset, &transform); + + *tr = transform; +} + +// Takes care of the main scaling and pre/post-conversions +static void pass_scale_main(struct gl_video *p) +{ + // Figure out the main scaler. + double xy[2]; + get_scale_factors(p, true, xy); + + // actual scale factor should be divided by the scale factor of prescaling. 
+ xy[0] /= p->texture_offset.m[0][0]; + xy[1] /= p->texture_offset.m[1][1]; + + bool downscaling = xy[0] < 1.0 || xy[1] < 1.0; + bool upscaling = !downscaling && (xy[0] > 1.0 || xy[1] > 1.0); + double scale_factor = 1.0; + + struct scaler *scaler = &p->scaler[SCALER_SCALE]; + struct scaler_config scaler_conf = p->opts.scaler[SCALER_SCALE]; + if (p->opts.scaler_resizes_only && !downscaling && !upscaling) { + scaler_conf.kernel.name = "bilinear"; + // For scaler-resizes-only, we round the texture offset to + // the nearest round value in order to prevent ugly blurriness + // (in exchange for slightly shifting the image by up to half a + // subpixel) + p->texture_offset.t[0] = roundf(p->texture_offset.t[0]); + p->texture_offset.t[1] = roundf(p->texture_offset.t[1]); + } + if (downscaling && p->opts.scaler[SCALER_DSCALE].kernel.name) { + scaler_conf = p->opts.scaler[SCALER_DSCALE]; + scaler = &p->scaler[SCALER_DSCALE]; + } + + // When requesting correct-downscaling and the clip is anamorphic, and + // because only a single scale factor is used for both axes, enable it only + // when both axes are downscaled, and use the milder of the factors to not + // end up with too much blur on one axis (even if we end up with sub-optimal + // scale factor on the other axis). This is better than not respecting + // correct scaling at all for anamorphic clips. + double f = MPMAX(xy[0], xy[1]); + if (p->opts.correct_downscaling && f < 1.0) + scale_factor = 1.0 / f; + + // Pre-conversion, like linear light/sigmoidization + GLSLF("// scaler pre-conversion\n"); + bool use_linear = p->opts.linear_scaling || p->opts.sigmoid_upscaling; + + // Linear light downscaling results in nasty artifacts for HDR curves due + // to the potentially extreme brightness differences severely compounding + // any ringing. So just scale in gamma light instead. + if (mp_trc_is_hdr(p->image_params.color.gamma) && downscaling) + use_linear = false; + + if (use_linear) { + p->use_linear = true; + pass_linearize(p->sc, p->image_params.color.gamma); + pass_opt_hook_point(p, "LINEAR", NULL); + } + + bool use_sigmoid = use_linear && p->opts.sigmoid_upscaling && upscaling; + float sig_center, sig_slope, sig_offset, sig_scale; + if (use_sigmoid) { + // Coefficients for the sigmoidal transform are taken from the + // formula here: http://www.imagemagick.org/Usage/color_mods/#sigmoidal + sig_center = p->opts.sigmoid_center; + sig_slope = p->opts.sigmoid_slope; + // This function needs to go through (0,0) and (1,1) so we compute the + // values at 1 and 0, and then scale/shift them, respectively. + sig_offset = 1.0/(1+expf(sig_slope * sig_center)); + sig_scale = 1.0/(1+expf(sig_slope * (sig_center-1))) - sig_offset; + GLSLF("color.rgb = %f - log(1.0/(color.rgb * %f + %f) - 1.0) * 1.0/%f;\n", + sig_center, sig_scale, sig_offset, sig_slope); + pass_opt_hook_point(p, "SIGMOID", NULL); + } + + pass_opt_hook_point(p, "PREKERNEL", NULL); + + int vp_w = p->dst_rect.x1 - p->dst_rect.x0; + int vp_h = p->dst_rect.y1 - p->dst_rect.y0; + struct gl_transform transform; + compute_src_transform(p, &transform); + + GLSLF("// main scaling\n"); + finish_pass_fbo(p, &p->indirect_fbo, p->texture_w, p->texture_h, 0); + struct img_tex src = img_tex_fbo(&p->indirect_fbo, PLANE_RGB, p->components); + gl_transform_trans(transform, &src.transform); + pass_sample(p, src, scaler, &scaler_conf, scale_factor, vp_w, vp_h); + + // Changes the texture size to display size after main scaler. 
+ p->texture_w = vp_w; + p->texture_h = vp_h; + + pass_opt_hook_point(p, "POSTKERNEL", NULL); + + GLSLF("// scaler post-conversion\n"); + if (use_sigmoid) { + // Inverse of the transformation above + GLSLF("color.rgb = (1.0/(1.0 + exp(%f * (%f - color.rgb))) - %f) * 1.0/%f;\n", + sig_slope, sig_center, sig_offset, sig_scale); + } +} + +// Adapts the colors to the right output color space. (Final pass during +// rendering) +// If OSD is true, ignore any changes that may have been made to the video +// by previous passes (i.e. linear scaling) +static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool osd) +{ + struct ra *ra = p->ra; + + // Figure out the target color space from the options, or auto-guess if + // none were set + struct mp_colorspace dst = { + .gamma = p->opts.target_trc, + .primaries = p->opts.target_prim, + .light = MP_CSP_LIGHT_DISPLAY, + }; + + if (p->use_lut_3d) { + // The 3DLUT is always generated against the video's original source + // space, *not* the reference space. (To avoid having to regenerate + // the 3DLUT for the OSD on every frame) + enum mp_csp_prim prim_orig = p->image_params.color.primaries; + enum mp_csp_trc trc_orig = p->image_params.color.gamma; + + // One exception: HDR is not implemented by LittleCMS for technical + // limitation reasons, so we use a gamma 2.2 input curve here instead. + // We could pick any value we want here, the difference is just coding + // efficiency. + if (mp_trc_is_hdr(trc_orig)) + trc_orig = MP_CSP_TRC_GAMMA22; + + if (gl_video_get_lut3d(p, prim_orig, trc_orig)) { + dst.primaries = prim_orig; + dst.gamma = trc_orig; + } + } + + if (dst.primaries == MP_CSP_PRIM_AUTO) { + // The vast majority of people are on sRGB or BT.709 displays, so pick + // this as the default output color space. + dst.primaries = MP_CSP_PRIM_BT_709; + + if (src.primaries == MP_CSP_PRIM_BT_601_525 || + src.primaries == MP_CSP_PRIM_BT_601_625) + { + // Since we auto-pick BT.601 and BT.709 based on the dimensions, + // combined with the fact that they're very similar to begin with, + // and to avoid confusing the average user, just don't adapt BT.601 + // content automatically at all. + dst.primaries = src.primaries; + } + } + + if (dst.gamma == MP_CSP_TRC_AUTO) { + // Most people seem to complain when the image is darker or brighter + // than what they're "used to", so just avoid changing the gamma + // altogether by default. The only exceptions to this rule apply to + // very unusual TRCs, which even hardcode technoluddites would probably + // not enjoy viewing unaltered. + dst.gamma = src.gamma; + + // Avoid outputting linear light or HDR content "by default". 
For these + // just pick gamma 2.2 as a default, since it's a good estimate for + // the response of typical displays + if (dst.gamma == MP_CSP_TRC_LINEAR || mp_trc_is_hdr(dst.gamma)) + dst.gamma = MP_CSP_TRC_GAMMA22; + } + + bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma); + if (detect_peak && !p->hdr_peak_ssbo) { + struct { + unsigned int sig_peak_raw; + unsigned int index; + unsigned int frame_max[PEAK_DETECT_FRAMES+1]; + } peak_ssbo = {0}; + + // Prefill with safe values + int safe = MP_REF_WHITE * mp_trc_nom_peak(p->image_params.color.gamma); + peak_ssbo.sig_peak_raw = PEAK_DETECT_FRAMES * safe; + for (int i = 0; i < PEAK_DETECT_FRAMES+1; i++) + peak_ssbo.frame_max[i] = safe; + + struct ra_buf_params params = { + .type = RA_BUF_TYPE_SHADER_STORAGE, + .size = sizeof(peak_ssbo), + .initial_data = &peak_ssbo, + }; + + p->hdr_peak_ssbo = ra_buf_create(ra, ¶ms); + if (!p->hdr_peak_ssbo) { + MP_WARN(p, "Failed to create HDR peak detection SSBO, disabling.\n"); + detect_peak = (p->opts.compute_hdr_peak = false); + } + } + + if (detect_peak) { + pass_describe(p, "detect HDR peak"); + pass_is_compute(p, 8, 8); // 8x8 is good for performance + gl_sc_ssbo(p->sc, "PeakDetect", p->hdr_peak_ssbo, + "uint sig_peak_raw;" + "uint index;" + "uint frame_max[%d];", PEAK_DETECT_FRAMES + 1 + ); + } + + // Adapt from src to dst as necessary + pass_color_map(p->sc, src, dst, p->opts.tone_mapping, + p->opts.tone_mapping_param, p->opts.tone_mapping_desat, + detect_peak, p->opts.gamut_warning, p->use_linear && !osd); + + if (p->use_lut_3d) { + gl_sc_uniform_texture(p->sc, "lut_3d", p->lut_3d_texture); + GLSL(vec3 cpos;) + for (int i = 0; i < 3; i++) + GLSLF("cpos[%d] = LUT_POS(color[%d], %d.0);\n", i, i, p->lut_3d_size[i]); + GLSL(color.rgb = tex3D(lut_3d, cpos).rgb;) + } +} + +void gl_video_set_fb_depth(struct gl_video *p, int fb_depth) +{ + p->fb_depth = fb_depth; +} + +static void pass_dither(struct gl_video *p) +{ + // Assume 8 bits per component if unknown. + int dst_depth = p->fb_depth > 0 ? p->fb_depth : 8; + if (p->opts.dither_depth > 0) + dst_depth = p->opts.dither_depth; + + if (p->opts.dither_depth < 0 || p->opts.dither_algo == DITHER_NONE) + return; + + if (!p->dither_texture) { + MP_VERBOSE(p, "Dither to %d.\n", dst_depth); + + int tex_size = 0; + void *tex_data = NULL; + const struct ra_format *fmt = NULL; + void *temp = NULL; + + if (p->opts.dither_algo == DITHER_FRUIT) { + int sizeb = p->opts.dither_size; + int size = 1 << sizeb; + + if (p->last_dither_matrix_size != size) { + p->last_dither_matrix = talloc_realloc(p, p->last_dither_matrix, + float, size * size); + mp_make_fruit_dither_matrix(p->last_dither_matrix, sizeb); + p->last_dither_matrix_size = size; + } + + // Prefer R16 texture since they provide higher precision. + fmt = ra_find_unorm_format(p->ra, 2, 1); + if (!fmt) + fmt = ra_find_float16_format(p->ra, 1); + if (fmt) { + tex_size = size; + tex_data = p->last_dither_matrix; + if (fmt->ctype == RA_CTYPE_UNORM) { + uint16_t *t = temp = talloc_array(NULL, uint16_t, size * size); + for (int n = 0; n < size * size; n++) + t[n] = p->last_dither_matrix[n] * UINT16_MAX; + tex_data = t; + } + } else { + MP_VERBOSE(p, "GL too old. 
Falling back to ordered dither.\n"); + p->opts.dither_algo = DITHER_ORDERED; + } + } + + if (p->opts.dither_algo == DITHER_ORDERED) { + temp = talloc_array(NULL, char, 8 * 8); + mp_make_ordered_dither_matrix(temp, 8); + + fmt = ra_find_unorm_format(p->ra, 1, 1); + tex_size = 8; + tex_data = temp; + } + + struct ra_tex_params params = { + .dimensions = 2, + .w = tex_size, + .h = tex_size, + .d = 1, + .format = fmt, + .render_src = true, + .src_repeat = true, + .initial_data = tex_data, + }; + p->dither_texture = ra_tex_create(p->ra, ¶ms); + + debug_check_gl(p, "dither setup"); + + talloc_free(temp); + } + + GLSLF("// dithering\n"); + + // This defines how many bits are considered significant for output on + // screen. The superfluous bits will be used for rounding according to the + // dither matrix. The precision of the source implicitly decides how many + // dither patterns can be visible. + int dither_quantization = (1 << dst_depth) - 1; + int dither_size = p->dither_texture->params.w; + + gl_sc_uniform_texture(p->sc, "dither", p->dither_texture); + + GLSLF("vec2 dither_pos = gl_FragCoord.xy * 1.0/%d.0;\n", dither_size); + + if (p->opts.temporal_dither) { + int phase = (p->frames_rendered / p->opts.temporal_dither_period) % 8u; + float r = phase * (M_PI / 2); // rotate + float m = phase < 4 ? 1 : -1; // mirror + + float matrix[2][2] = {{cos(r), -sin(r) }, + {sin(r) * m, cos(r) * m}}; + gl_sc_uniform_mat2(p->sc, "dither_trafo", true, &matrix[0][0]); + + GLSL(dither_pos = dither_trafo * dither_pos;) + } + + GLSL(float dither_value = texture(dither, dither_pos).r;) + GLSLF("color = floor(color * %d.0 + dither_value + 0.5 / %d.0) * 1.0/%d.0;\n", + dither_quantization, dither_size * dither_size, dither_quantization); +} + +// Draws the OSD, in scene-referred colors.. If cms is true, subtitles are +// instead adapted to the display's gamut. +static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts, + struct mp_osd_res rect, struct fbodst target, bool cms) +{ + mpgl_osd_generate(p->osd, rect, pts, p->image_params.stereo_out, draw_flags); + + timer_pool_start(p->osd_timer); + for (int n = 0; n < MAX_OSD_PARTS; n++) { + // (This returns false if this part is empty with nothing to draw.) + if (!mpgl_osd_draw_prepare(p->osd, n, p->sc)) + continue; + // When subtitles need to be color managed, assume they're in sRGB + // (for lack of anything saner to do) + if (cms) { + static const struct mp_colorspace csp_srgb = { + .primaries = MP_CSP_PRIM_BT_709, + .gamma = MP_CSP_TRC_SRGB, + .light = MP_CSP_LIGHT_DISPLAY, + }; + + pass_colormanage(p, csp_srgb, true); + } + mpgl_osd_draw_finish(p->osd, n, p->sc, target); + } + + timer_pool_stop(p->osd_timer); + pass_describe(p, "drawing osd"); + pass_record(p, timer_pool_measure(p->osd_timer)); +} + +static float chroma_realign(int size, int pixel) +{ + return size / (float)chroma_upsize(size, pixel); +} + +// Minimal rendering code path, for GLES or OpenGL 2.1 without proper FBOs. +static void pass_render_frame_dumb(struct gl_video *p) +{ + struct img_tex tex[4]; + struct gl_transform off[4]; + pass_get_img_tex(p, &p->image, tex, off); + + struct gl_transform transform; + compute_src_transform(p, &transform); + + int index = 0; + for (int i = 0; i < p->plane_count; i++) { + int cw = tex[i].type == PLANE_CHROMA ? p->ra_format.chroma_w : 1; + int ch = tex[i].type == PLANE_CHROMA ? 
p->ra_format.chroma_h : 1; + if (p->image_params.rotate % 180 == 90) + MPSWAP(int, cw, ch); + + struct gl_transform t = transform; + t.m[0][0] *= chroma_realign(p->texture_w, cw); + t.m[1][1] *= chroma_realign(p->texture_h, ch); + + t.t[0] /= cw; + t.t[1] /= ch; + + t.t[0] += off[i].t[0]; + t.t[1] += off[i].t[1]; + + gl_transform_trans(tex[i].transform, &t); + tex[i].transform = t; + + copy_img_tex(p, &index, tex[i]); + } + + pass_convert_yuv(p); +} + +// The main rendering function, takes care of everything up to and including +// upscaling. p->image is rendered. +static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t id) +{ + // initialize the texture parameters and temporary variables + p->texture_w = p->image_params.w; + p->texture_h = p->image_params.h; + p->texture_offset = identity_trans; + p->components = 0; + p->saved_tex_num = 0; + p->hook_fbo_num = 0; + p->use_linear = false; + + // try uploading the frame + if (!pass_upload_image(p, mpi, id)) + return false; + + if (p->image_params.rotate % 180 == 90) + MPSWAP(int, p->texture_w, p->texture_h); + + if (p->dumb_mode) + return true; + + pass_read_video(p); + pass_opt_hook_point(p, "NATIVE", &p->texture_offset); + pass_convert_yuv(p); + pass_opt_hook_point(p, "MAINPRESUB", &p->texture_offset); + + // For subtitles + double vpts = p->image.mpi->pts; + if (vpts == MP_NOPTS_VALUE) + vpts = p->osd_pts; + + if (p->osd && p->opts.blend_subs == BLEND_SUBS_VIDEO) { + double scale[2]; + get_scale_factors(p, false, scale); + struct mp_osd_res rect = { + .w = p->texture_w, .h = p->texture_h, + .display_par = scale[1] / scale[0], // counter compensate scaling + }; + finish_pass_fbo(p, &p->blend_subs_fbo, rect.w, rect.h, 0); + pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, + p->blend_subs_fbo.fbo, false); + pass_read_fbo(p, &p->blend_subs_fbo); + pass_describe(p, "blend subs video"); + } + pass_opt_hook_point(p, "MAIN", &p->texture_offset); + + pass_scale_main(p); + + int vp_w = p->dst_rect.x1 - p->dst_rect.x0, + vp_h = p->dst_rect.y1 - p->dst_rect.y0; + if (p->osd && p->opts.blend_subs == BLEND_SUBS_YES) { + // Recreate the real video size from the src/dst rects + struct mp_osd_res rect = { + .w = vp_w, .h = vp_h, + .ml = -p->src_rect.x0, .mr = p->src_rect.x1 - p->image_params.w, + .mt = -p->src_rect.y0, .mb = p->src_rect.y1 - p->image_params.h, + .display_par = 1.0, + }; + // Adjust margins for scale + double scale[2]; + get_scale_factors(p, true, scale); + rect.ml *= scale[0]; rect.mr *= scale[0]; + rect.mt *= scale[1]; rect.mb *= scale[1]; + // We should always blend subtitles in non-linear light + if (p->use_linear) { + pass_delinearize(p->sc, p->image_params.color.gamma); + p->use_linear = false; + } + finish_pass_fbo(p, &p->blend_subs_fbo, p->texture_w, p->texture_h, 0); + pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, + p->blend_subs_fbo.fbo, false); + pass_read_fbo(p, &p->blend_subs_fbo); + pass_describe(p, "blend subs"); + } + + pass_opt_hook_point(p, "SCALED", NULL); + + return true; +} + +static void pass_draw_to_screen(struct gl_video *p, struct fbodst fbo) +{ + if (p->dumb_mode) + pass_render_frame_dumb(p); + + // Adjust the overall gamma before drawing to screen + if (p->user_gamma != 1) { + gl_sc_uniform_f(p->sc, "user_gamma", p->user_gamma); + GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) + GLSL(color.rgb = pow(color.rgb, vec3(user_gamma));) + } + + pass_colormanage(p, p->image_params.color, false); + + // Since finish_pass_direct doesn't work with compute shaders, and neither + // does the 
checkerboard/dither code, we may need an indirection via + // p->screen_fbo here. + if (p->pass_compute.active) { + int o_w = p->dst_rect.x1 - p->dst_rect.x0, + o_h = p->dst_rect.y1 - p->dst_rect.y0; + finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY); + struct img_tex tmp = img_tex_fbo(&p->screen_fbo, PLANE_RGB, p->components); + copy_img_tex(p, &(int){0}, tmp); + } + + if (p->has_alpha){ + if (p->opts.alpha_mode == ALPHA_BLEND_TILES) { + // Draw checkerboard pattern to indicate transparency + GLSLF("// transparency checkerboard\n"); + GLSL(bvec2 tile = lessThan(fract(gl_FragCoord.xy * 1.0/32.0), vec2(0.5));) + GLSL(vec3 background = vec3(tile.x == tile.y ? 1.0 : 0.75);) + GLSL(color.rgb = mix(background, color.rgb, color.a);) + } else if (p->opts.alpha_mode == ALPHA_BLEND) { + // Blend into background color (usually black) + struct m_color c = p->opts.background; + GLSLF("vec4 background = vec4(%f, %f, %f, %f);\n", + c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0); + GLSL(color = mix(background, vec4(color.rgb, 1.0), color.a);) + } + } + + pass_opt_hook_point(p, "OUTPUT", NULL); + + pass_dither(p); + pass_describe(p, "output to screen"); + finish_pass_direct(p, fbo, &p->dst_rect); +} + +static bool update_fbosurface(struct gl_video *p, struct mp_image *mpi, + uint64_t id, struct fbosurface *surf) +{ + int vp_w = p->dst_rect.x1 - p->dst_rect.x0, + vp_h = p->dst_rect.y1 - p->dst_rect.y0; + + pass_info_reset(p, false); + if (!pass_render_frame(p, mpi, id)) + return false; + + // Frame blending should always be done in linear light to preserve the + // overall brightness, otherwise this will result in flashing dark frames + // because mixing in compressed light artificially darkens the results + if (!p->use_linear) { + p->use_linear = true; + pass_linearize(p->sc, p->image_params.color.gamma); + } + + finish_pass_fbo(p, &surf->fbotex, vp_w, vp_h, FBOTEX_FUZZY); + surf->id = id; + surf->pts = mpi->pts; + return true; +} + +// Draws an interpolate frame to fbo, based on the frame timing in t +static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, + struct fbodst fbo) +{ + bool is_new = false; + + // Reset the queue completely if this is a still image, to avoid any + // interpolation artifacts from surrounding frames when unpausing or + // framestepping + if (t->still) + gl_video_reset_surfaces(p); + + // First of all, figure out if we have a frame available at all, and draw + // it manually + reset the queue if not + if (p->surfaces[p->surface_now].id == 0) { + struct fbosurface *now = &p->surfaces[p->surface_now]; + if (!update_fbosurface(p, t->current, t->frame_id, now)) + return; + p->surface_idx = p->surface_now; + is_new = true; + } + + // Find the right frame for this instant + if (t->current) { + int next = fbosurface_wrap(p->surface_now + 1); + while (p->surfaces[next].id && + p->surfaces[next].id > p->surfaces[p->surface_now].id && + p->surfaces[p->surface_now].id < t->frame_id) + { + p->surface_now = next; + next = fbosurface_wrap(next + 1); + } + } + + // Figure out the queue size. For illustration, a filter radius of 2 would + // look like this: _ A [B] C D _ + // A is surface_bse, B is surface_now, C is surface_now+1 and D is + // surface_end. 
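+ // Concretely: with radius = size/2, the queue covers the 2*radius
+ // surfaces from surface_now - (radius-1) through surface_now + radius,
+ // which the assert below re-checks against size.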
+ struct scaler *tscale = &p->scaler[SCALER_TSCALE]; + reinit_scaler(p, tscale, &p->opts.scaler[SCALER_TSCALE], 1, tscale_sizes); + bool oversample = strcmp(tscale->conf.kernel.name, "oversample") == 0; + bool linear = strcmp(tscale->conf.kernel.name, "linear") == 0; + int size; + + if (oversample || linear) { + size = 2; + } else { + assert(tscale->kernel && !tscale->kernel->polar); + size = ceil(tscale->kernel->size); + assert(size <= TEXUNIT_VIDEO_NUM); + } + + int radius = size/2; + int surface_now = p->surface_now; + int surface_bse = fbosurface_wrap(surface_now - (radius-1)); + int surface_end = fbosurface_wrap(surface_now + radius); + assert(fbosurface_wrap(surface_bse + size-1) == surface_end); + + // Render new frames while there's room in the queue. Note that technically, + // this should be done before the step where we find the right frame, but + // it only barely matters at the very beginning of playback, and this way + // makes the code much more linear. + int surface_dst = fbosurface_wrap(p->surface_idx + 1); + for (int i = 0; i < t->num_frames; i++) { + // Avoid overwriting data we might still need + if (surface_dst == surface_bse - 1) + break; + + struct mp_image *f = t->frames[i]; + uint64_t f_id = t->frame_id + i; + if (!mp_image_params_equal(&f->params, &p->real_image_params)) + continue; + + if (f_id > p->surfaces[p->surface_idx].id) { + struct fbosurface *dst = &p->surfaces[surface_dst]; + if (!update_fbosurface(p, f, f_id, dst)) + return; + p->surface_idx = surface_dst; + surface_dst = fbosurface_wrap(surface_dst + 1); + is_new = true; + } + } + + // Figure out whether the queue is "valid". A queue is invalid if the + // frames' PTS is not monotonically increasing. Anything else is invalid, + // so avoid blending incorrect data and just draw the latest frame as-is. + // Possible causes for failure of this condition include seeks, pausing, + // end of playback or start of playback. + bool valid = true; + for (int i = surface_bse, ii; valid && i != surface_end; i = ii) { + ii = fbosurface_wrap(i + 1); + if (p->surfaces[i].id == 0 || p->surfaces[ii].id == 0) { + valid = false; + } else if (p->surfaces[ii].id < p->surfaces[i].id) { + valid = false; + MP_DBG(p, "interpolation queue underrun\n"); + } + } + + // Update OSD PTS to synchronize subtitles with the displayed frame + p->osd_pts = p->surfaces[surface_now].pts; + + // Finally, draw the right mix of frames to the screen. + if (!is_new) + pass_info_reset(p, true); + pass_describe(p, "interpolation"); + if (!valid || t->still) { + // surface_now is guaranteed to be valid, so we can safely use it. + pass_read_fbo(p, &p->surfaces[surface_now].fbotex); + p->is_interpolated = false; + } else { + double mix = t->vsync_offset / t->ideal_frame_duration; + // The scaler code always wants the fcoord to be between 0 and 1, + // so we try to adjust by using the previous set of N frames instead + // (which requires some extra checking to make sure it's valid) + if (mix < 0.0) { + int prev = fbosurface_wrap(surface_bse - 1); + if (p->surfaces[prev].id != 0 && + p->surfaces[prev].id < p->surfaces[surface_bse].id) + { + mix += 1.0; + surface_bse = prev; + } else { + mix = 0.0; // at least don't blow up, this should only + // ever happen at the start of playback + } + } + + if (oversample) { + // Oversample uses the frame area as mix ratio, not the the vsync + // position itself + double vsync_dist = t->vsync_interval / t->ideal_frame_duration, + threshold = tscale->conf.kernel.params[0]; + threshold = isnan(threshold) ? 
0.0 : threshold; + mix = (1 - mix) / vsync_dist; + mix = mix <= 0 + threshold ? 0 : mix; + mix = mix >= 1 - threshold ? 1 : mix; + mix = 1 - mix; + } + + // Blend the frames together + if (oversample || linear) { + gl_sc_uniform_f(p->sc, "inter_coeff", mix); + GLSL(color = mix(texture(texture0, texcoord0), + texture(texture1, texcoord1), + inter_coeff);) + } else { + gl_sc_uniform_f(p->sc, "fcoord", mix); + pass_sample_separated_gen(p->sc, tscale, 0, 0); + } + + // Load all the required frames + for (int i = 0; i < size; i++) { + struct img_tex img = + img_tex_fbo(&p->surfaces[fbosurface_wrap(surface_bse+i)].fbotex, + PLANE_RGB, p->components); + // Since the code in pass_sample_separated currently assumes + // the textures are bound in-order and starting at 0, we just + // assert to make sure this is the case (which it should always be) + int id = pass_bind(p, img); + assert(id == i); + } + + MP_DBG(p, "inter frame dur: %f vsync: %f, mix: %f\n", + t->ideal_frame_duration, t->vsync_interval, mix); + p->is_interpolated = true; + } + pass_draw_to_screen(p, fbo); + + p->frames_drawn += 1; +} + +void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, + struct fbodst target) +{ + struct mp_rect target_rc = {0, 0, target.tex->params.w, target.tex->params.h}; + + p->broken_frame = false; + + bool has_frame = !!frame->current; + + if (!has_frame || !mp_rect_equals(&p->dst_rect, &target_rc)) { + struct m_color c = p->clear_color; + float color[4] = {c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0}; + p->ra->fns->clear(p->ra, target.tex, color, &target_rc); + } + + if (p->hwdec_active && p->hwdec->driver->overlay_frame) { + if (has_frame) { + float *color = p->hwdec->overlay_colorkey; + p->ra->fns->clear(p->ra, target.tex, color, &p->dst_rect); + } + + p->hwdec->driver->overlay_frame(p->hwdec, frame->current, + &p->src_rect, &p->dst_rect, + frame->frame_id != p->image.id); + + if (frame->current) + p->osd_pts = frame->current->pts; + + // Disable GL rendering + has_frame = false; + } + + if (has_frame) { + bool interpolate = p->opts.interpolation && frame->display_synced && + (p->frames_drawn || !frame->still); + if (interpolate) { + double ratio = frame->ideal_frame_duration / frame->vsync_interval; + if (fabs(ratio - 1.0) < p->opts.interpolation_threshold) + interpolate = false; + } + + if (interpolate) { + gl_video_interpolate_frame(p, frame, target); + } else { + bool is_new = frame->frame_id != p->image.id; + + // Redrawing a frame might update subtitles. 
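+ // (so when subtitles are blended into the video, a still-frame redraw is
+ // treated as new and re-rendered instead of reusing the cached output)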
+ if (frame->still && p->opts.blend_subs) + is_new = true; + + if (is_new || !p->output_fbo_valid) { + p->output_fbo_valid = false; + + pass_info_reset(p, !is_new); + if (!pass_render_frame(p, frame->current, frame->frame_id)) + goto done; + + // For the non-interpolation case, we draw to a single "cache" + // FBO to speed up subsequent re-draws (if any exist) + struct fbodst dest_fbo = target; + if (frame->num_vsyncs > 1 && frame->display_synced && + !p->dumb_mode && (p->ra->caps & RA_CAP_BLIT)) + { + fbotex_change(&p->output_fbo, p->ra, p->log, + target.tex->params.w, target.tex->params.h, + p->fbo_format, FBOTEX_FUZZY); + dest_fbo = p->output_fbo.fbo; + p->output_fbo_valid = true; + } + pass_draw_to_screen(p, dest_fbo); + } + + // "output fbo valid" and "output fbo needed" are equivalent + if (p->output_fbo_valid) { + pass_info_reset(p, true); + pass_describe(p, "redraw cached frame"); + struct mp_rect src = p->dst_rect; + struct mp_rect dst = src; + if (target.flip) { + dst.y0 = target.tex->params.h - src.y0; + dst.y1 = target.tex->params.h - src.y1; + } + timer_pool_start(p->blit_timer); + p->ra->fns->blit(p->ra, target.tex, p->output_fbo.tex, + &dst, &src); + timer_pool_stop(p->blit_timer); + pass_record(p, timer_pool_measure(p->blit_timer)); + } + } + } + +done: + + unmap_current_image(p); + + debug_check_gl(p, "after video rendering"); + + if (p->osd) { + // If we haven't actually drawn anything so far, then we technically + // need to consider this the start of a new pass. Let's call it a + // redraw just because, since it's basically a blank frame anyway + if (!has_frame) + pass_info_reset(p, true); + + pass_draw_osd(p, p->opts.blend_subs ? OSD_DRAW_OSD_ONLY : 0, + p->osd_pts, p->osd_rect, target, true); + debug_check_gl(p, "after OSD rendering"); + } + + if (gl_sc_error_state(p->sc) || p->broken_frame) { + // Make the screen solid blue to make it visually clear that an + // error has occurred + float color[4] = {0.0, 0.05, 0.5, 1.0}; + p->ra->fns->clear(p->ra, target.tex, color, &target_rc); + } + + p->frames_rendered++; + pass_report_performance(p); +} + +// Use this color instead of the global option. +void gl_video_set_clear_color(struct gl_video *p, struct m_color c) +{ + p->force_clear_color = true; + p->clear_color = c; +} + +void gl_video_set_osd_pts(struct gl_video *p, double pts) +{ + p->osd_pts = pts; +} + +bool gl_video_check_osd_change(struct gl_video *p, struct mp_osd_res *res, + double pts) +{ + return p->osd ? mpgl_osd_check_change(p->osd, res, pts) : false; +} + +void gl_video_resize(struct gl_video *p, + struct mp_rect *src, struct mp_rect *dst, + struct mp_osd_res *osd) +{ + if (mp_rect_equals(&p->src_rect, src) && + mp_rect_equals(&p->dst_rect, dst) && + osd_res_equals(p->osd_rect, *osd)) + return; + + p->src_rect = *src; + p->dst_rect = *dst; + p->osd_rect = *osd; + + gl_video_reset_surfaces(p); + + if (p->osd) + mpgl_osd_resize(p->osd, p->osd_rect, p->image_params.stereo_out); +} + +static void frame_perf_data(struct pass_info pass[], struct mp_frame_perf *out) +{ + for (int i = 0; i < PASS_INFO_MAX; i++) { + if (!pass[i].desc.len) + break; + out->perf[out->count] = pass[i].perf; + out->desc[out->count] = pass[i].desc.start; + out->count++; + } +} + +void gl_video_perfdata(struct gl_video *p, struct voctrl_performance_data *out) +{ + *out = (struct voctrl_performance_data){0}; + frame_perf_data(p->pass_fresh, &out->fresh); + frame_perf_data(p->pass_redraw, &out->redraw); +} + +// This assumes nv12, with textures set to GL_NEAREST filtering. 
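+// Each pair of input field textures is interleaved row by row (even rows
+// from the first field, odd rows from the second) into one double-height
+// output texture: one for luma, one for chroma.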
+static void reinterleave_vdpau(struct gl_video *p, + struct ra_tex *input[4], struct ra_tex *output[2]) +{ + for (int n = 0; n < 2; n++) { + struct fbotex *fbo = &p->vdpau_deinterleave_fbo[n]; + // This is an array of the 2 to-merge planes. + struct ra_tex **src = &input[n * 2]; + int w = src[0]->params.w; + int h = src[0]->params.h; + int ids[2]; + for (int t = 0; t < 2; t++) { + ids[t] = pass_bind(p, (struct img_tex){ + .tex = src[t], + .multiplier = 1.0, + .transform = identity_trans, + .w = w, + .h = h, + }); + } + + GLSLF("color = fract(gl_FragCoord.y * 0.5) < 0.5\n"); + GLSLF(" ? texture(texture%d, texcoord%d)\n", ids[0], ids[0]); + GLSLF(" : texture(texture%d, texcoord%d);", ids[1], ids[1]); + + const struct ra_format *fmt = + ra_find_unorm_format(p->ra, 1, n == 0 ? 1 : 2); + fbotex_change(fbo, p->ra, p->log, w, h * 2, fmt, 0); + + pass_describe(p, "vdpau reinterleaving"); + finish_pass_direct(p, fbo->fbo, &(struct mp_rect){0, 0, w, h * 2}); + + output[n] = fbo->tex; + } +} + +// Returns false on failure. +static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t id) +{ + struct video_image *vimg = &p->image; + + if (vimg->id == id) + return true; + + unref_current_image(p); + + mpi = mp_image_new_ref(mpi); + if (!mpi) + goto error; + + vimg->mpi = mpi; + vimg->id = id; + p->osd_pts = mpi->pts; + p->frames_uploaded++; + + if (p->hwdec_active) { + // Hardware decoding + + if (!p->hwdec_mapper) + goto error; + + pass_describe(p, "map frame (hwdec)"); + timer_pool_start(p->upload_timer); + bool ok = ra_hwdec_mapper_map(p->hwdec_mapper, vimg->mpi) >= 0; + timer_pool_stop(p->upload_timer); + pass_record(p, timer_pool_measure(p->upload_timer)); + + vimg->hwdec_mapped = true; + if (ok) { + struct mp_image layout = {0}; + mp_image_set_params(&layout, &p->image_params); + struct ra_tex **tex = p->hwdec_mapper->tex; + struct ra_tex *tmp[4] = {0}; + if (p->hwdec_mapper->vdpau_fields) { + reinterleave_vdpau(p, tex, tmp); + tex = tmp; + } + for (int n = 0; n < p->plane_count; n++) { + vimg->planes[n] = (struct texplane){ + .w = mp_image_plane_w(&layout, n), + .h = mp_image_plane_h(&layout, n), + .tex = tex[n], + }; + } + } else { + MP_FATAL(p, "Mapping hardware decoded surface failed.\n"); + goto error; + } + return true; + } + + // Software decoding + assert(mpi->num_planes == p->plane_count); + + timer_pool_start(p->upload_timer); + for (int n = 0; n < p->plane_count; n++) { + struct texplane *plane = &vimg->planes[n]; + + plane->flipped = mpi->stride[0] < 0; + + struct ra_tex_upload_params params = { + .tex = plane->tex, + .src = mpi->planes[n], + .invalidate = true, + .stride = mpi->stride[n], + }; + + struct dr_buffer *mapped = gl_find_dr_buffer(p, mpi->planes[n]); + if (mapped) { + params.buf = mapped->buf; + params.buf_offset = (uintptr_t)params.src - + (uintptr_t)mapped->buf->data; + params.src = NULL; + } + + if (p->using_dr_path != !!mapped) { + p->using_dr_path = !!mapped; + MP_VERBOSE(p, "DR enabled: %s\n", p->using_dr_path ? "yes" : "no"); + } + + if (!p->ra->fns->tex_upload(p->ra, ¶ms)) { + timer_pool_stop(p->upload_timer); + goto error; + } + + if (mapped && !mapped->mpi) + mapped->mpi = mp_image_new_ref(mpi); + } + timer_pool_stop(p->upload_timer); + + bool using_pbo = p->ra->use_pbo || !(p->ra->caps & RA_CAP_DIRECT_UPLOAD); + const char *mode = p->using_dr_path ? "DR" : using_pbo ? 
"PBO" : "naive"; + pass_describe(p, "upload frame (%s)", mode); + pass_record(p, timer_pool_measure(p->upload_timer)); + + return true; + +error: + unref_current_image(p); + p->broken_frame = true; + return false; +} + +static bool test_fbo(struct gl_video *p, const struct ra_format *fmt) +{ + MP_VERBOSE(p, "Testing FBO format %s\n", fmt->name); + struct fbotex fbo = {0}; + bool success = fbotex_change(&fbo, p->ra, p->log, 16, 16, fmt, 0); + fbotex_uninit(&fbo); + return success; +} + +// Return whether dumb-mode can be used without disabling any features. +// Essentially, vo_opengl with mostly default settings will return true. +static bool check_dumb_mode(struct gl_video *p) +{ + struct gl_video_opts *o = &p->opts; + if (p->use_integer_conversion) + return false; + if (o->dumb_mode > 0) // requested by user + return true; + if (o->dumb_mode < 0) // disabled by user + return false; + + // otherwise, use auto-detection + if (o->target_prim || o->target_trc || o->linear_scaling || + o->correct_downscaling || o->sigmoid_upscaling || o->interpolation || + o->blend_subs || o->deband || o->unsharp) + return false; + // check remaining scalers (tscale is already implicitly excluded above) + for (int i = 0; i < SCALER_COUNT; i++) { + if (i != SCALER_TSCALE) { + const char *name = o->scaler[i].kernel.name; + if (name && strcmp(name, "bilinear") != 0) + return false; + } + } + if (o->user_shaders && o->user_shaders[0]) + return false; + if (p->use_lut_3d) + return false; + return true; +} + +// Disable features that are not supported with the current OpenGL version. +static void check_gl_features(struct gl_video *p) +{ + struct ra *ra = p->ra; + bool have_float_tex = !!ra_find_float16_format(ra, 1); + bool have_mglsl = ra->glsl_version >= 130; // modern GLSL + const struct ra_format *rg_tex = ra_find_unorm_format(p->ra, 1, 2); + bool have_texrg = rg_tex && !rg_tex->luminance_alpha; + bool have_compute = ra->caps & RA_CAP_COMPUTE; + bool have_ssbo = ra->caps & RA_CAP_BUF_RW; + + const char *auto_fbo_fmts[] = {"rgba16", "rgba16f", "rgb10_a2", "rgba8", 0}; + const char *user_fbo_fmts[] = {p->opts.fbo_format, 0}; + const char **fbo_fmts = user_fbo_fmts[0] && strcmp(user_fbo_fmts[0], "auto") + ? user_fbo_fmts : auto_fbo_fmts; + bool have_fbo = false; + p->fbo_format = NULL; + for (int n = 0; fbo_fmts[n]; n++) { + const char *fmt = fbo_fmts[n]; + const struct ra_format *f = ra_find_named_format(p->ra, fmt); + if (!f && fbo_fmts == user_fbo_fmts) + MP_WARN(p, "FBO format '%s' not found!\n", fmt); + if (f && f->renderable && f->linear_filter && test_fbo(p, f)) { + MP_VERBOSE(p, "Using FBO format %s.\n", f->name); + have_fbo = true; + p->fbo_format = f; + break; + } + } + + p->forced_dumb_mode = p->opts.dumb_mode > 0 || !have_fbo || !have_texrg; + bool voluntarily_dumb = check_dumb_mode(p); + if (p->forced_dumb_mode || voluntarily_dumb) { + if (voluntarily_dumb) { + MP_VERBOSE(p, "No advanced processing required. Enabling dumb mode.\n"); + } else if (p->opts.dumb_mode <= 0) { + MP_WARN(p, "High bit depth FBOs unsupported. Enabling dumb mode.\n" + "Most extended features will be disabled.\n"); + } + p->dumb_mode = true; + p->use_lut_3d = false; + // Most things don't work, so whitelist all options that still work. 
+ p->opts = (struct gl_video_opts){ + .gamma = p->opts.gamma, + .gamma_auto = p->opts.gamma_auto, + .pbo = p->opts.pbo, + .fbo_format = p->opts.fbo_format, + .alpha_mode = p->opts.alpha_mode, + .use_rectangle = p->opts.use_rectangle, + .background = p->opts.background, + .dither_algo = p->opts.dither_algo, + .dither_depth = p->opts.dither_depth, + .dither_size = p->opts.dither_size, + .temporal_dither = p->opts.temporal_dither, + .temporal_dither_period = p->opts.temporal_dither_period, + .tex_pad_x = p->opts.tex_pad_x, + .tex_pad_y = p->opts.tex_pad_y, + .tone_mapping = p->opts.tone_mapping, + .tone_mapping_param = p->opts.tone_mapping_param, + .tone_mapping_desat = p->opts.tone_mapping_desat, + .early_flush = p->opts.early_flush, + }; + for (int n = 0; n < SCALER_COUNT; n++) + p->opts.scaler[n] = gl_video_opts_def.scaler[n]; + return; + } + p->dumb_mode = false; + + // Normally, we want to disable them by default if FBOs are unavailable, + // because they will be slow (not critically slow, but still slower). + // Without FP textures, we must always disable them. + // I don't know if luminance alpha float textures exist, so disregard them. + for (int n = 0; n < SCALER_COUNT; n++) { + const struct filter_kernel *kernel = + mp_find_filter_kernel(p->opts.scaler[n].kernel.name); + if (kernel) { + char *reason = NULL; + if (!have_float_tex) + reason = "(float tex. missing)"; + if (!have_mglsl) + reason = "(GLSL version too old)"; + if (reason) { + MP_WARN(p, "Disabling scaler #%d %s %s.\n", n, + p->opts.scaler[n].kernel.name, reason); + // p->opts is a copy => we can just mess with it. + p->opts.scaler[n].kernel.name = "bilinear"; + if (n == SCALER_TSCALE) + p->opts.interpolation = 0; + } + } + } + + int use_cms = p->opts.target_prim != MP_CSP_PRIM_AUTO || + p->opts.target_trc != MP_CSP_TRC_AUTO || p->use_lut_3d; + + // mix() is needed for some gamma functions + if (!have_mglsl && (p->opts.linear_scaling || p->opts.sigmoid_upscaling)) { + p->opts.linear_scaling = false; + p->opts.sigmoid_upscaling = false; + MP_WARN(p, "Disabling linear/sigmoid scaling (GLSL version too old).\n"); + } + if (!have_mglsl && use_cms) { + p->opts.target_prim = MP_CSP_PRIM_AUTO; + p->opts.target_trc = MP_CSP_TRC_AUTO; + p->use_lut_3d = false; + MP_WARN(p, "Disabling color management (GLSL version too old).\n"); + } + if (!have_mglsl && p->opts.deband) { + p->opts.deband = 0; + MP_WARN(p, "Disabling debanding (GLSL version too old).\n"); + } + if ((!have_compute || !have_ssbo) && p->opts.compute_hdr_peak) { + p->opts.compute_hdr_peak = 0; + MP_WARN(p, "Disabling HDR peak computation (no compute shaders).\n"); + } +} + +static void init_gl(struct gl_video *p) +{ + debug_check_gl(p, "before init_gl"); + + p->upload_timer = timer_pool_create(p->ra); + p->blit_timer = timer_pool_create(p->ra); + p->osd_timer = timer_pool_create(p->ra); + + debug_check_gl(p, "after init_gl"); + + ra_dump_tex_formats(p->ra, MSGL_DEBUG); + ra_dump_img_formats(p->ra, MSGL_DEBUG); +} + +void gl_video_uninit(struct gl_video *p) +{ + if (!p) + return; + + uninit_video(p); + + gl_sc_destroy(p->sc); + + ra_tex_free(p->ra, &p->lut_3d_texture); + ra_buf_free(p->ra, &p->hdr_peak_ssbo); + + timer_pool_destroy(p->upload_timer); + timer_pool_destroy(p->blit_timer); + timer_pool_destroy(p->osd_timer); + + for (int i = 0; i < PASS_INFO_MAX; i++) { + talloc_free(p->pass_fresh[i].desc.start); + talloc_free(p->pass_redraw[i].desc.start); + } + + mpgl_osd_destroy(p->osd); + + // Forcibly destroy possibly remaining image references. 
This should also + // cause gl_video_dr_free_buffer() to be called for the remaining buffers. + gc_pending_dr_fences(p, true); + + // Should all have been unreffed already. + assert(!p->num_dr_buffers); + + talloc_free(p); +} + +void gl_video_reset(struct gl_video *p) +{ + gl_video_reset_surfaces(p); +} + +bool gl_video_showing_interpolated_frame(struct gl_video *p) +{ + return p->is_interpolated; +} + +static bool is_imgfmt_desc_supported(struct gl_video *p, + const struct ra_imgfmt_desc *desc) +{ + if (!desc->num_planes) + return false; + + if (desc->planes[0]->ctype == RA_CTYPE_UINT && p->forced_dumb_mode) + return false; + + return true; +} + +bool gl_video_check_format(struct gl_video *p, int mp_format) +{ + struct ra_imgfmt_desc desc; + if (ra_get_imgfmt_desc(p->ra, mp_format, &desc) && + is_imgfmt_desc_supported(p, &desc)) + return true; + if (p->hwdec && ra_hwdec_test_format(p->hwdec, mp_format)) + return true; + return false; +} + +void gl_video_config(struct gl_video *p, struct mp_image_params *params) +{ + unmap_overlay(p); + unref_current_image(p); + + if (!mp_image_params_equal(&p->real_image_params, params)) { + uninit_video(p); + p->real_image_params = *params; + p->image_params = *params; + if (params->imgfmt) + init_video(p); + } + + gl_video_reset_surfaces(p); +} + +void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd) +{ + mpgl_osd_destroy(p->osd); + p->osd = NULL; + p->osd_state = osd; + reinit_osd(p); +} + +struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, + struct mpv_global *g) +{ + struct gl_video *p = talloc_ptrtype(NULL, p); + *p = (struct gl_video) { + .ra = ra, + .global = g, + .log = log, + .sc = gl_sc_create(ra, g, log), + .video_eq = mp_csp_equalizer_create(p, g), + .opts_cache = m_config_cache_alloc(p, g, &gl_video_conf), + }; + // make sure this variable is initialized to *something* + p->pass = p->pass_fresh; + struct gl_video_opts *opts = p->opts_cache->opts; + p->cms = gl_lcms_init(p, log, g, opts->icc_opts), + p->opts = *opts; + for (int n = 0; n < SCALER_COUNT; n++) + p->scaler[n] = (struct scaler){.index = n}; + init_gl(p); + reinit_from_options(p); + return p; +} + +// Get static string for scaler shader. If "tscale" is set to true, the +// scaler must be a separable convolution filter. +static const char *handle_scaler_opt(const char *name, bool tscale) +{ + if (name && name[0]) { + const struct filter_kernel *kernel = mp_find_filter_kernel(name); + if (kernel && (!tscale || !kernel->polar)) + return kernel->f.name; + + for (const char *const *filter = tscale ? fixed_tscale_filters + : fixed_scale_filters; + *filter; filter++) { + if (strcmp(*filter, name) == 0) + return *filter; + } + } + return NULL; +} + +void gl_video_update_options(struct gl_video *p) +{ + if (m_config_cache_update(p->opts_cache)) { + gl_lcms_update_options(p->cms); + reinit_from_options(p); + } +} + +static void reinit_from_options(struct gl_video *p) +{ + p->use_lut_3d = gl_lcms_has_profile(p->cms); + + // Copy the option fields, so that check_gl_features() can mutate them. + // This works only for the fields themselves of course, not for any memory + // referenced by them. 
+ p->opts = *(struct gl_video_opts *)p->opts_cache->opts; + + if (!p->force_clear_color) + p->clear_color = p->opts.background; + + check_gl_features(p); + uninit_rendering(p); + gl_sc_set_cache_dir(p->sc, p->opts.shader_cache_dir); + p->ra->use_pbo = p->opts.pbo; + gl_video_setup_hooks(p); + reinit_osd(p); + + if (p->opts.interpolation && !p->global->opts->video_sync && !p->dsi_warned) { + MP_WARN(p, "Interpolation now requires enabling display-sync mode.\n" + "E.g.: --video-sync=display-resample\n"); + p->dsi_warned = true; + } +} + +void gl_video_configure_queue(struct gl_video *p, struct vo *vo) +{ + int queue_size = 1; + + // Figure out an adequate size for the interpolation queue. The larger + // the radius, the earlier we need to queue frames. + if (p->opts.interpolation) { + const struct filter_kernel *kernel = + mp_find_filter_kernel(p->opts.scaler[SCALER_TSCALE].kernel.name); + if (kernel) { + // filter_scale wouldn't be correctly initialized were we to use it here. + // This is fine since we're always upsampling, but beware if downsampling + // is added! + double radius = kernel->f.radius; + radius = radius > 0 ? radius : p->opts.scaler[SCALER_TSCALE].radius; + queue_size += 1 + ceil(radius); + } else { + // Oversample/linear case + queue_size += 2; + } + } + + vo_set_queue_params(vo, 0, queue_size); +} + +static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + char s[20] = {0}; + int r = 1; + bool tscale = bstr_equals0(name, "tscale"); + if (bstr_equals0(param, "help")) { + r = M_OPT_EXIT; + } else { + snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); + if (!handle_scaler_opt(s, tscale)) + r = M_OPT_INVALID; + } + if (r < 1) { + mp_info(log, "Available scalers:\n"); + for (const char *const *filter = tscale ? 
fixed_tscale_filters + : fixed_scale_filters; + *filter; filter++) { + mp_info(log, " %s\n", *filter); + } + for (int n = 0; mp_filter_kernels[n].f.name; n++) { + if (!tscale || !mp_filter_kernels[n].polar) + mp_info(log, " %s\n", mp_filter_kernels[n].f.name); + } + if (s[0]) + mp_fatal(log, "No scaler named '%s' found!\n", s); + } + return r; +} + +static int validate_window_opt(struct mp_log *log, const m_option_t *opt, + struct bstr name, struct bstr param) +{ + char s[20] = {0}; + int r = 1; + if (bstr_equals0(param, "help")) { + r = M_OPT_EXIT; + } else { + snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); + const struct filter_window *window = mp_find_filter_window(s); + if (!window) + r = M_OPT_INVALID; + } + if (r < 1) { + mp_info(log, "Available windows:\n"); + for (int n = 0; mp_filter_windows[n].name; n++) + mp_info(log, " %s\n", mp_filter_windows[n].name); + if (s[0]) + mp_fatal(log, "No window named '%s' found!\n", s); + } + return r; +} + +float gl_video_scale_ambient_lux(float lmin, float lmax, + float rmin, float rmax, float lux) +{ + assert(lmax > lmin); + + float num = (rmax - rmin) * (log10(lux) - log10(lmin)); + float den = log10(lmax) - log10(lmin); + float result = num / den + rmin; + + // clamp the result + float max = MPMAX(rmax, rmin); + float min = MPMIN(rmax, rmin); + return MPMAX(MPMIN(result, max), min); +} + +void gl_video_set_ambient_lux(struct gl_video *p, int lux) +{ + if (p->opts.gamma_auto) { + float gamma = gl_video_scale_ambient_lux(16.0, 64.0, 2.40, 1.961, lux); + MP_VERBOSE(p, "ambient light changed: %dlux (gamma: %f)\n", lux, gamma); + p->opts.gamma = MPMIN(1.0, 1.961 / gamma); + } +} + +void gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec) +{ + unref_current_image(p); + ra_hwdec_mapper_free(&p->hwdec_mapper); + p->hwdec = hwdec; +} + +static void *gl_video_dr_alloc_buffer(struct gl_video *p, size_t size) +{ + struct ra_buf_params params = { + .type = RA_BUF_TYPE_TEX_UPLOAD, + .host_mapped = true, + .size = size, + }; + + struct ra_buf *buf = ra_buf_create(p->ra, ¶ms); + if (!buf) + return NULL; + + MP_TARRAY_GROW(p, p->dr_buffers, p->num_dr_buffers); + p->dr_buffers[p->num_dr_buffers++] = (struct dr_buffer){ .buf = buf }; + + return buf->data; +}; + +static void gl_video_dr_free_buffer(void *opaque, uint8_t *data) +{ + struct gl_video *p = opaque; + + for (int n = 0; n < p->num_dr_buffers; n++) { + struct dr_buffer *buffer = &p->dr_buffers[n]; + if (buffer->buf->data == data) { + assert(!buffer->mpi); // can't be freed while it has a ref + ra_buf_free(p->ra, &buffer->buf); + MP_TARRAY_REMOVE_AT(p->dr_buffers, p->num_dr_buffers, n); + return; + } + } + // not found - must not happen + assert(0); +} + +struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h, + int stride_align) +{ + int size = mp_image_get_alloc_size(imgfmt, w, h, stride_align); + if (size < 0) + return NULL; + + int alloc_size = size + stride_align; + void *ptr = gl_video_dr_alloc_buffer(p, alloc_size); + if (!ptr) + return NULL; + + // (we expect vo.c to proxy the free callback, so it happens in the same + // thread it was allocated in, removing the need for synchronization) + struct mp_image *res = mp_image_from_buffer(imgfmt, w, h, stride_align, + ptr, alloc_size, p, + gl_video_dr_free_buffer); + if (!res) + gl_video_dr_free_buffer(p, ptr); + return res; +} diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h new file mode 100644 index 0000000000..884f5914fd --- /dev/null +++ b/video/out/gpu/video.h @@ -0,0 +1,194 @@ +/* + * This file is 
part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#ifndef MP_GL_VIDEO_H +#define MP_GL_VIDEO_H + +#include + +#include "options/m_option.h" +#include "sub/osd.h" +#include "utils.h" +#include "lcms.h" +#include "shader_cache.h" +#include "video/csputils.h" +#include "video/out/filter_kernels.h" + +// Assume we have this many texture units for sourcing additional passes. +// The actual texture unit assignment is dynamic. +#define TEXUNIT_VIDEO_NUM 6 + +struct scaler_fun { + char *name; + float params[2]; + float blur; + float taper; +}; + +struct scaler_config { + struct scaler_fun kernel; + struct scaler_fun window; + float radius; + float antiring; + float cutoff; + float clamp; +}; + +struct scaler { + int index; + struct scaler_config conf; + double scale_factor; + bool initialized; + struct filter_kernel *kernel; + struct ra_tex *lut; + struct fbotex sep_fbo; + bool insufficient; + int lut_size; + + // kernel points here + struct filter_kernel kernel_storage; +}; + +enum scaler_unit { + SCALER_SCALE, // luma/video + SCALER_DSCALE, // luma-video downscaling + SCALER_CSCALE, // chroma upscaling + SCALER_TSCALE, // temporal scaling (interpolation) + SCALER_COUNT +}; + +enum dither_algo { + DITHER_NONE = 0, + DITHER_FRUIT, + DITHER_ORDERED, +}; + +enum alpha_mode { + ALPHA_NO = 0, + ALPHA_YES, + ALPHA_BLEND, + ALPHA_BLEND_TILES, +}; + +enum blend_subs_mode { + BLEND_SUBS_NO = 0, + BLEND_SUBS_YES, + BLEND_SUBS_VIDEO, +}; + +enum tone_mapping { + TONE_MAPPING_CLIP, + TONE_MAPPING_MOBIUS, + TONE_MAPPING_REINHARD, + TONE_MAPPING_HABLE, + TONE_MAPPING_GAMMA, + TONE_MAPPING_LINEAR, +}; + +// How many frames to average over for HDR peak detection +#define PEAK_DETECT_FRAMES 100 + +struct gl_video_opts { + int dumb_mode; + struct scaler_config scaler[4]; + int scaler_lut_size; + float gamma; + int gamma_auto; + int target_prim; + int target_trc; + int target_brightness; + int tone_mapping; + int compute_hdr_peak; + float tone_mapping_param; + float tone_mapping_desat; + int gamut_warning; + int linear_scaling; + int correct_downscaling; + int sigmoid_upscaling; + float sigmoid_center; + float sigmoid_slope; + int scaler_resizes_only; + int pbo; + int dither_depth; + int dither_algo; + int dither_size; + int temporal_dither; + int temporal_dither_period; + char *fbo_format; + int alpha_mode; + int use_rectangle; + struct m_color background; + int interpolation; + float interpolation_threshold; + int blend_subs; + char **user_shaders; + int deband; + struct deband_opts *deband_opts; + float unsharp; + int tex_pad_x, tex_pad_y; + struct mp_icc_opts *icc_opts; + int early_flush; + char *shader_cache_dir; +}; + +extern const struct m_sub_options gl_video_conf; + +struct gl_video; +struct vo_frame; + +struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, + struct mpv_global *g); +void gl_video_uninit(struct gl_video *p); +void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd); +void 
gl_video_update_options(struct gl_video *p); +bool gl_video_check_format(struct gl_video *p, int mp_format); +void gl_video_config(struct gl_video *p, struct mp_image_params *params); +void gl_video_set_output_depth(struct gl_video *p, int r, int g, int b); +void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, + struct fbodst target); +void gl_video_resize(struct gl_video *p, + struct mp_rect *src, struct mp_rect *dst, + struct mp_osd_res *osd); +void gl_video_set_fb_depth(struct gl_video *p, int fb_depth); +void gl_video_perfdata(struct gl_video *p, struct voctrl_performance_data *out); +void gl_video_set_clear_color(struct gl_video *p, struct m_color color); +void gl_video_set_osd_pts(struct gl_video *p, double pts); +bool gl_video_check_osd_change(struct gl_video *p, struct mp_osd_res *osd, + double pts); + +float gl_video_scale_ambient_lux(float lmin, float lmax, + float rmin, float rmax, float lux); +void gl_video_set_ambient_lux(struct gl_video *p, int lux); +void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data); +bool gl_video_icc_auto_enabled(struct gl_video *p); +bool gl_video_gamma_auto_enabled(struct gl_video *p); +struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p); + +void gl_video_reset(struct gl_video *p); +bool gl_video_showing_interpolated_frame(struct gl_video *p); + +struct ra_hwdec; +void gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec); + +struct vo; +void gl_video_configure_queue(struct gl_video *p, struct vo *vo); + +struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h, + int stride_align); + + +#endif diff --git a/video/out/gpu/video_shaders.c b/video/out/gpu/video_shaders.c new file mode 100644 index 0000000000..60c5ce82ac --- /dev/null +++ b/video/out/gpu/video_shaders.c @@ -0,0 +1,872 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include + +#include "video_shaders.h" +#include "video.h" + +#define GLSL(x) gl_sc_add(sc, #x "\n"); +#define GLSLF(...) gl_sc_addf(sc, __VA_ARGS__) +#define GLSLH(x) gl_sc_hadd(sc, #x "\n"); +#define GLSLHF(...) 
gl_sc_haddf(sc, __VA_ARGS__) + +// Set up shared/commonly used variables and macros +void sampler_prelude(struct gl_shader_cache *sc, int tex_num) +{ + GLSLF("#undef tex\n"); + GLSLF("#undef texmap\n"); + GLSLF("#define tex texture%d\n", tex_num); + GLSLF("#define texmap texmap%d\n", tex_num); + GLSLF("vec2 pos = texcoord%d;\n", tex_num); + GLSLF("vec2 size = texture_size%d;\n", tex_num); + GLSLF("vec2 pt = pixel_size%d;\n", tex_num); +} + +static void pass_sample_separated_get_weights(struct gl_shader_cache *sc, + struct scaler *scaler) +{ + gl_sc_uniform_texture(sc, "lut", scaler->lut); + GLSLF("float ypos = LUT_POS(fcoord, %d.0);\n", scaler->lut_size); + + int N = scaler->kernel->size; + int width = (N + 3) / 4; // round up + + GLSLF("float weights[%d];\n", N); + for (int i = 0; i < N; i++) { + if (i % 4 == 0) + GLSLF("c = texture(lut, vec2(%f, ypos));\n", (i / 4 + 0.5) / width); + GLSLF("weights[%d] = c[%d];\n", i, i % 4); + } +} + +// Handle a single pass (either vertical or horizontal). The direction is given +// by the vector (d_x, d_y). If the vector is 0, then planar interpolation is +// used instead (samples from texture0 through textureN) +void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler, + int d_x, int d_y) +{ + int N = scaler->kernel->size; + bool use_ar = scaler->conf.antiring > 0; + bool planar = d_x == 0 && d_y == 0; + GLSL(color = vec4(0.0);) + GLSLF("{\n"); + if (!planar) { + GLSLF("vec2 dir = vec2(%d.0, %d.0);\n", d_x, d_y); + GLSL(pt *= dir;) + GLSL(float fcoord = dot(fract(pos * size - vec2(0.5)), dir);) + GLSLF("vec2 base = pos - fcoord * pt - pt * vec2(%d.0);\n", N / 2 - 1); + } + GLSL(vec4 c;) + if (use_ar) { + GLSL(vec4 hi = vec4(0.0);) + GLSL(vec4 lo = vec4(1.0);) + } + pass_sample_separated_get_weights(sc, scaler); + GLSLF("// scaler samples\n"); + for (int n = 0; n < N; n++) { + if (planar) { + GLSLF("c = texture(texture%d, texcoord%d);\n", n, n); + } else { + GLSLF("c = texture(tex, base + pt * vec2(%d.0));\n", n); + } + GLSLF("color += vec4(weights[%d]) * c;\n", n); + if (use_ar && (n == N/2-1 || n == N/2)) { + GLSL(lo = min(lo, c);) + GLSL(hi = max(hi, c);) + } + } + if (use_ar) + GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n", + scaler->conf.antiring); + GLSLF("}\n"); +} + +// Subroutine for computing and adding an individual texel contribution +// If subtexel < 0 and offset < 0, samples directly. +// If subtexel >= 0, takes the texel from cN[subtexel] +// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset] +static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, + int x, int y, int subtexel, int offset, int components) +{ + double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale; + double radius_cutoff = scaler->kernel->radius_cutoff; + + // Since we can't know the subpixel position in advance, assume a + // worst case scenario + int yy = y > 0 ? y-1 : y; + int xx = x > 0 ? 
x-1 : x; + double dmax = sqrt(xx*xx + yy*yy); + // Skip samples definitely outside the radius + if (dmax >= radius_cutoff) + return; + GLSLF("d = length(vec2(%d.0, %d.0) - fcoord);\n", x, y); + // Check for samples that might be skippable + bool maybe_skippable = dmax >= radius_cutoff - M_SQRT2; + if (maybe_skippable) + GLSLF("if (d < %f) {\n", radius_cutoff); + + // get the weight for this pixel + if (scaler->lut->params.dimensions == 1) { + GLSLF("w = tex1D(lut, LUT_POS(d * 1.0/%f, %d.0)).r;\n", + radius, scaler->lut_size); + } else { + GLSLF("w = texture(lut, vec2(0.5, LUT_POS(d * 1.0/%f, %d.0))).r;\n", + radius, scaler->lut_size); + } + GLSL(wsum += w;) + + if (subtexel < 0 && offset < 0) { + GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y); + GLSL(color += vec4(w) * c0;) + } else if (subtexel >= 0) { + for (int n = 0; n < components; n++) + GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel); + } else if (offset >= 0) { + for (int n = 0; n lut); + + GLSLF("// scaler samples\n"); + int bound = ceil(scaler->kernel->radius_cutoff); + for (int y = 1-bound; y <= bound; y += 2) { + for (int x = 1-bound; x <= bound; x += 2) { + // First we figure out whether it's more efficient to use direct + // sampling or gathering. The problem is that gathering 4 texels + // only to discard some of them is very wasteful, so only do it if + // we suspect it will be a win rather than a loss. This is the case + // exactly when all four texels are within bounds + bool use_gather = sqrt(x*x + y*y) < scaler->kernel->radius_cutoff; + + // textureGather is only supported in GLSL 400+ + if (glsl_version < 400) + use_gather = false; + + if (use_gather) { + // Gather the four surrounding texels simultaneously + for (int n = 0; n < components; n++) { + GLSLF("c%d = textureGatherOffset(tex, base, ivec2(%d, %d), %d);\n", + n, x, y, n); + } + + // Mix in all of the points with their weights + for (int p = 0; p < 4; p++) { + // The four texels are gathered counterclockwise starting + // from the bottom left + static const int xo[4] = {0, 1, 1, 0}; + static const int yo[4] = {1, 1, 0, 0}; + if (x+xo[p] > bound || y+yo[p] > bound) + continue; + polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components); + } + } else { + // switch to direct sampling instead, for efficiency/compatibility + for (int yy = y; yy <= bound && yy <= y+1; yy++) { + for (int xx = x; xx <= bound && xx <= x+1; xx++) + polar_sample(sc, scaler, xx, yy, -1, -1, components); + } + } + } + } + + GLSL(color = color / vec4(wsum);) + GLSLF("}\n"); +} + +// bw/bh: block size +// iw/ih: input size (pre-calculated to fit all required texels) +void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, + int components, int bw, int bh, int iw, int ih) +{ + int bound = ceil(scaler->kernel->radius_cutoff); + int offset = bound - 1; // padding top/left + + GLSL(color = vec4(0.0);) + GLSLF("{\n"); + GLSL(vec2 wpos = texmap(gl_WorkGroupID * gl_WorkGroupSize);) + GLSL(vec2 wbase = wpos - pt * fract(wpos * size - vec2(0.5));) + GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) + GLSL(vec2 base = pos - pt * fcoord;) + GLSL(ivec2 rel = ivec2(round((base - wbase) * size));) + GLSLF("float w, d, wsum = 0.0;\n"); + gl_sc_uniform_texture(sc, "lut", scaler->lut); + + // Load all relevant texels into shmem + gl_sc_enable_extension(sc, "GL_ARB_arrays_of_arrays"); + for (int c = 0; c < components; c++) + GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw); + + GLSL(vec4 c;) + GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) 
{\n", ih, bh); + GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw); + GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset); + for (int c = 0; c < components; c++) + GLSLF("in%d[y][x] = c[%d];\n", c, c); + GLSLF("}}\n"); + GLSL(groupMemoryBarrier();) + GLSL(barrier();) + + // Dispatch the actual samples + GLSLF("// scaler samples\n"); + for (int y = 1-bound; y <= bound; y++) { + for (int x = 1-bound; x <= bound; x++) + polar_sample(sc, scaler, x, y, -1, offset, components); + } + + GLSL(color = color / vec4(wsum);) + GLSLF("}\n"); +} + +static void bicubic_calcweights(struct gl_shader_cache *sc, const char *t, const char *s) +{ + // Explanation of how bicubic scaling with only 4 texel fetches is done: + // http://www.mate.tue.nl/mate/pdfs/10318.pdf + // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines' + // Explanation why this algorithm normally always blurs, even with unit + // scaling: + // http://bigwww.epfl.ch/preprints/ruijters1001p.pdf + // 'GPU Prefilter for Accurate Cubic B-spline Interpolation' + GLSLF("vec4 %s = vec4(-0.5, 0.1666, 0.3333, -0.3333) * %s" + " + vec4(1, 0, -0.5, 0.5);\n", t, s); + GLSLF("%s = %s * %s + vec4(0, 0, -0.5, 0.5);\n", t, t, s); + GLSLF("%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666);\n", t, t, s); + GLSLF("%s.xy *= vec2(1, 1) / vec2(%s.z, %s.w);\n", t, t, t); + GLSLF("%s.xy += vec2(1.0 + %s, 1.0 - %s);\n", t, s, s); +} + +void pass_sample_bicubic_fast(struct gl_shader_cache *sc) +{ + GLSLF("{\n"); + GLSL(vec2 fcoord = fract(pos * size + vec2(0.5, 0.5));) + bicubic_calcweights(sc, "parmx", "fcoord.x"); + bicubic_calcweights(sc, "parmy", "fcoord.y"); + GLSL(vec4 cdelta;) + GLSL(cdelta.xz = parmx.rg * vec2(-pt.x, pt.x);) + GLSL(cdelta.yw = parmy.rg * vec2(-pt.y, pt.y);) + // first y-interpolation + GLSL(vec4 ar = texture(tex, pos + cdelta.xy);) + GLSL(vec4 ag = texture(tex, pos + cdelta.xw);) + GLSL(vec4 ab = mix(ag, ar, parmy.b);) + // second y-interpolation + GLSL(vec4 br = texture(tex, pos + cdelta.zy);) + GLSL(vec4 bg = texture(tex, pos + cdelta.zw);) + GLSL(vec4 aa = mix(bg, br, parmy.b);) + // x-interpolation + GLSL(color = mix(aa, ab, parmx.b);) + GLSLF("}\n"); +} + +void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler, + int w, int h) +{ + GLSLF("{\n"); + GLSL(vec2 pos = pos - vec2(0.5) * pt;) // round to nearest + GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) + // Determine the mixing coefficient vector + gl_sc_uniform_vec2(sc, "output_size", (float[2]){w, h}); + GLSL(vec2 coeff = fcoord * output_size/size;) + float threshold = scaler->conf.kernel.params[0]; + threshold = isnan(threshold) ? 
0.0 : threshold; + GLSLF("coeff = (coeff - %f) * 1.0/%f;\n", threshold, 1.0 - 2 * threshold); + GLSL(coeff = clamp(coeff, 0.0, 1.0);) + // Compute the right blend of colors + GLSL(color = texture(tex, pos + pt * (coeff - fcoord));) + GLSLF("}\n"); +} + +// Common constants for SMPTE ST.2084 (HDR) +static const float PQ_M1 = 2610./4096 * 1./4, + PQ_M2 = 2523./4096 * 128, + PQ_C1 = 3424./4096, + PQ_C2 = 2413./4096 * 32, + PQ_C3 = 2392./4096 * 32; + +// Common constants for ARIB STD-B67 (HLG) +static const float HLG_A = 0.17883277, + HLG_B = 0.28466892, + HLG_C = 0.55991073; + +// Common constants for Panasonic V-Log +static const float VLOG_B = 0.00873, + VLOG_C = 0.241514, + VLOG_D = 0.598206; + +// Common constants for Sony S-Log +static const float SLOG_A = 0.432699, + SLOG_B = 0.037584, + SLOG_C = 0.616596 + 0.03, + SLOG_P = 3.538813, + SLOG_Q = 0.030001, + SLOG_K2 = 155.0 / 219.0; + +// Linearize (expand), given a TRC as input. In essence, this is the ITU-R +// EOTF, calculated on an idealized (reference) monitor with a white point of +// MP_REF_WHITE and infinite contrast. +void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc) +{ + if (trc == MP_CSP_TRC_LINEAR) + return; + + GLSLF("// linearize\n"); + + // Note that this clamp may technically violate the definition of + // ITU-R BT.2100, which allows for sub-blacks and super-whites to be + // displayed on the display where such would be possible. That said, the + // problem is that not all gamma curves are well-defined on the values + // outside this range, so we ignore it and just clip anyway for sanity. + GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) + + switch (trc) { + case MP_CSP_TRC_SRGB: + GLSL(color.rgb = mix(color.rgb * vec3(1.0/12.92), + pow((color.rgb + vec3(0.055))/vec3(1.055), vec3(2.4)), + lessThan(vec3(0.04045), color.rgb));) + break; + case MP_CSP_TRC_BT_1886: + GLSL(color.rgb = pow(color.rgb, vec3(2.4));) + break; + case MP_CSP_TRC_GAMMA18: + GLSL(color.rgb = pow(color.rgb, vec3(1.8));) + break; + case MP_CSP_TRC_GAMMA22: + GLSL(color.rgb = pow(color.rgb, vec3(2.2));) + break; + case MP_CSP_TRC_GAMMA28: + GLSL(color.rgb = pow(color.rgb, vec3(2.8));) + break; + case MP_CSP_TRC_PRO_PHOTO: + GLSL(color.rgb = mix(color.rgb * vec3(1.0/16.0), + pow(color.rgb, vec3(1.8)), + lessThan(vec3(0.03125), color.rgb));) + break; + case MP_CSP_TRC_PQ: + GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", PQ_M2); + GLSLF("color.rgb = max(color.rgb - vec3(%f), vec3(0.0)) \n" + " / (vec3(%f) - vec3(%f) * color.rgb);\n", + PQ_C1, PQ_C2, PQ_C3); + GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", PQ_M1); + // PQ's output range is 0-10000, but we need it to be relative to to + // MP_REF_WHITE instead, so rescale + GLSLF("color.rgb *= vec3(%f);\n", 10000 / MP_REF_WHITE); + break; + case MP_CSP_TRC_HLG: + GLSLF("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb,\n" + " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) + vec3(%f),\n" + " lessThan(vec3(0.5), color.rgb));\n", + HLG_C, HLG_A, HLG_B); + break; + case MP_CSP_TRC_V_LOG: + GLSLF("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n" + " pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f), \n" + " lessThanEqual(vec3(0.181), color.rgb)); \n", + VLOG_D, VLOG_C, VLOG_B); + break; + case MP_CSP_TRC_S_LOG1: + GLSLF("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f))\n" + " - vec3(%f);\n", + SLOG_C, SLOG_A, SLOG_B); + break; + case MP_CSP_TRC_S_LOG2: + GLSLF("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f), \n" + " 
(pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" + " - vec3(%f)) * vec3(1.0/%f), \n" + " lessThanEqual(vec3(%f), color.rgb)); \n", + SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q); + break; + default: + abort(); + } + + // Rescale to prevent clipping on non-float textures + GLSLF("color.rgb *= vec3(1.0/%f);\n", mp_trc_nom_peak(trc)); +} + +// Delinearize (compress), given a TRC as output. This corresponds to the +// inverse EOTF (not the OETF) in ITU-R terminology, again assuming a +// reference monitor. +void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc) +{ + if (trc == MP_CSP_TRC_LINEAR) + return; + + GLSLF("// delinearize\n"); + GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) + GLSLF("color.rgb *= vec3(%f);\n", mp_trc_nom_peak(trc)); + + switch (trc) { + case MP_CSP_TRC_SRGB: + GLSL(color.rgb = mix(color.rgb * vec3(12.92), + vec3(1.055) * pow(color.rgb, vec3(1.0/2.4)) + - vec3(0.055), + lessThanEqual(vec3(0.0031308), color.rgb));) + break; + case MP_CSP_TRC_BT_1886: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.4));) + break; + case MP_CSP_TRC_GAMMA18: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.8));) + break; + case MP_CSP_TRC_GAMMA22: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.2));) + break; + case MP_CSP_TRC_GAMMA28: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.8));) + break; + case MP_CSP_TRC_PRO_PHOTO: + GLSL(color.rgb = mix(color.rgb * vec3(16.0), + pow(color.rgb, vec3(1.0/1.8)), + lessThanEqual(vec3(0.001953), color.rgb));) + break; + case MP_CSP_TRC_PQ: + GLSLF("color.rgb *= vec3(1.0/%f);\n", 10000 / MP_REF_WHITE); + GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", PQ_M1); + GLSLF("color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" + " / (vec3(1.0) + vec3(%f) * color.rgb);\n", + PQ_C1, PQ_C2, PQ_C3); + GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", PQ_M2); + break; + case MP_CSP_TRC_HLG: + GLSLF("color.rgb = mix(vec3(0.5) * sqrt(color.rgb),\n" + " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f),\n" + " lessThan(vec3(1.0), color.rgb));\n", + HLG_A, HLG_B, HLG_C); + break; + case MP_CSP_TRC_V_LOG: + GLSLF("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125), \n" + " vec3(%f) * log(color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.01), color.rgb)); \n", + VLOG_C / M_LN10, VLOG_B, VLOG_D); + break; + case MP_CSP_TRC_S_LOG1: + GLSLF("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n", + SLOG_A / M_LN10, SLOG_B, SLOG_C); + break; + case MP_CSP_TRC_S_LOG2: + GLSLF("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f), \n" + " vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n" + " + vec3(%f), \n" + " lessThanEqual(vec3(0.0), color.rgb)); \n", + SLOG_P, SLOG_Q, SLOG_A / M_LN10, SLOG_K2, SLOG_B, SLOG_C); + break; + default: + abort(); + } +} + +// Apply the OOTF mapping from a given light type to display-referred light. 
+// The extra peak parameter is used to scale the values before and after +// the OOTF, and can be inferred using mp_trc_nom_peak +void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak) +{ + if (light == MP_CSP_LIGHT_DISPLAY) + return; + + GLSLF("// apply ootf\n"); + GLSLF("color.rgb *= vec3(%f);\n", peak); + + switch (light) + { + case MP_CSP_LIGHT_SCENE_HLG: + // HLG OOTF from BT.2100, assuming a reference display with a + // peak of 1000 cd/m² -> gamma = 1.2 + GLSLF("color.rgb *= vec3(%f * pow(dot(src_luma, color.rgb), 0.2));\n", + (1000 / MP_REF_WHITE) / pow(12, 1.2)); + break; + case MP_CSP_LIGHT_SCENE_709_1886: + // This OOTF is defined by encoding the result as 709 and then decoding + // it as 1886; although this is called 709_1886 we actually use the + // more precise (by one decimal) values from BT.2020 instead + GLSL(color.rgb = mix(color.rgb * vec3(4.5), + vec3(1.0993) * pow(color.rgb, vec3(0.45)) - vec3(0.0993), + lessThan(vec3(0.0181), color.rgb));) + GLSL(color.rgb = pow(color.rgb, vec3(2.4));) + break; + case MP_CSP_LIGHT_SCENE_1_2: + GLSL(color.rgb = pow(color.rgb, vec3(1.2));) + break; + default: + abort(); + } + + GLSLF("color.rgb *= vec3(1.0/%f);\n", peak); +} + +// Inverse of the function pass_ootf, for completeness' sake. +void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak) +{ + if (light == MP_CSP_LIGHT_DISPLAY) + return; + + GLSLF("// apply inverse ootf\n"); + GLSLF("color.rgb *= vec3(%f);\n", peak); + + switch (light) + { + case MP_CSP_LIGHT_SCENE_HLG: + GLSLF("color.rgb *= vec3(1.0/%f);\n", (1000 / MP_REF_WHITE) / pow(12, 1.2)); + GLSL(color.rgb /= vec3(max(1e-6, pow(dot(src_luma, color.rgb), 0.2/1.2)));) + break; + case MP_CSP_LIGHT_SCENE_709_1886: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.4));) + GLSL(color.rgb = mix(color.rgb * vec3(1.0/4.5), + pow((color.rgb + vec3(0.0993)) * vec3(1.0/1.0993), + vec3(1/0.45)), + lessThan(vec3(0.08145), color.rgb));) + break; + case MP_CSP_LIGHT_SCENE_1_2: + GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.2));) + break; + default: + abort(); + } + + GLSLF("color.rgb *= vec3(1.0/%f);\n", peak); +} + +// Tone map from a known peak brightness to the range [0,1]. If ref_peak +// is 0, we will use peak detection instead +static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak, + enum tone_mapping algo, float param, float desat) +{ + GLSLF("// HDR tone mapping\n"); + + // Desaturate the color using a coefficient dependent on the luminance + GLSL(float luma = dot(dst_luma, color.rgb);) + if (desat > 0) { + GLSLF("float overbright = max(luma - %f, 1e-6) / max(luma, 1e-6);\n", desat); + GLSL(color.rgb = mix(color.rgb, vec3(luma), overbright);) + } + + // To prevent discoloration due to out-of-bounds clipping, we need to make + // sure to reduce the value range as far as necessary to keep the entire + // signal in range, so tone map based on the brightest component. + GLSL(float sig = max(max(color.r, color.g), color.b);) + GLSL(float sig_orig = sig;) + + if (!ref_peak) { + // For performance, we want to do as few atomic operations on global + // memory as possible, so use an atomic in shmem for the work group. 
+ // We also want slightly more stable values, so use the group average + // instead of the group max + GLSLHF("shared uint group_sum = 0;\n"); + GLSLF("atomicAdd(group_sum, uint(sig * %f));\n", MP_REF_WHITE); + + // Have one thread in each work group update the frame maximum + GLSL(memoryBarrierBuffer();) + GLSL(barrier();) + GLSL(if (gl_LocalInvocationIndex == 0)) + GLSL(atomicMax(frame_max[index], group_sum / + (gl_WorkGroupSize.x * gl_WorkGroupSize.y));) + + // Finally, have one thread per invocation update the total maximum + // and advance the index + GLSL(memoryBarrierBuffer();) + GLSL(barrier();) + GLSL(if (gl_GlobalInvocationID == ivec3(0)) {) // do this once per invocation + GLSLF("uint next = (index + 1) %% %d;\n", PEAK_DETECT_FRAMES+1); + GLSLF("sig_peak_raw = sig_peak_raw + frame_max[index] - frame_max[next];\n"); + GLSLF("frame_max[next] = %d;\n", (int)MP_REF_WHITE); + GLSL(index = next;) + GLSL(}) + + GLSL(memoryBarrierBuffer();) + GLSL(barrier();) + GLSLF("float sig_peak = 1.0/%f * float(sig_peak_raw);\n", + MP_REF_WHITE * PEAK_DETECT_FRAMES); + } else { + GLSLHF("const float sig_peak = %f;\n", ref_peak); + } + + switch (algo) { + case TONE_MAPPING_CLIP: + GLSLF("sig = %f * sig;\n", isnan(param) ? 1.0 : param); + break; + + case TONE_MAPPING_MOBIUS: + GLSLF("const float j = %f;\n", isnan(param) ? 0.3 : param); + // solve for M(j) = j; M(sig_peak) = 1.0; M'(j) = 1.0 + // where M(x) = scale * (x+a)/(x+b) + GLSLF("float a = -j*j * (sig_peak - 1.0) / (j*j - 2.0*j + sig_peak);\n"); + GLSLF("float b = (j*j - 2.0*j*sig_peak + sig_peak) / " + "max(1e-6, sig_peak - 1.0);\n"); + GLSLF("float scale = (b*b + 2.0*b*j + j*j) / (b-a);\n"); + GLSL(sig = mix(sig, scale * (sig + a) / (sig + b), sig > j);) + break; + + case TONE_MAPPING_REINHARD: { + float contrast = isnan(param) ? 0.5 : param, + offset = (1.0 - contrast) / contrast; + GLSLF("sig = sig / (sig + %f);\n", offset); + GLSLF("float scale = (sig_peak + %f) / sig_peak;\n", offset); + GLSL(sig *= scale;) + break; + } + + case TONE_MAPPING_HABLE: { + float A = 0.15, B = 0.50, C = 0.10, D = 0.20, E = 0.02, F = 0.30; + GLSLHF("float hable(float x) {\n"); + GLSLHF("return ((x * (%f*x + %f)+%f)/(x * (%f*x + %f) + %f)) - %f;\n", + A, C*B, D*E, A, B, D*F, E/F); + GLSLHF("}\n"); + GLSL(sig = hable(sig) / hable(sig_peak);) + break; + } + + case TONE_MAPPING_GAMMA: { + float gamma = isnan(param) ? 1.8 : param; + GLSLF("const float cutoff = 0.05, gamma = %f;\n", 1.0/gamma); + GLSL(float scale = pow(cutoff / sig_peak, gamma) / cutoff;) + GLSL(sig = sig > cutoff ? pow(sig / sig_peak, gamma) : scale * sig;) + break; + } + + case TONE_MAPPING_LINEAR: { + float coeff = isnan(param) ? 1.0 : param; + GLSLF("sig = %f / sig_peak * sig;\n", coeff); + break; + } + + default: + abort(); + } + + // Apply the computed scale factor to the color, linearly to prevent + // discoloration + GLSL(color.rgb *= sig / sig_orig;) +} + +// Map colors from one source space to another. These source spaces must be +// known (i.e. not MP_CSP_*_AUTO), as this function won't perform any +// auto-guessing. If is_linear is true, we assume the input has already been +// linearized (e.g. for linear-scaling). If `detect_peak` is true, we will +// detect the peak instead of relying on metadata. 
Note that this requires +// the caller to have already bound the appropriate SSBO and set up the +// compute shader metadata +void pass_color_map(struct gl_shader_cache *sc, + struct mp_colorspace src, struct mp_colorspace dst, + enum tone_mapping algo, float tone_mapping_param, + float tone_mapping_desat, bool detect_peak, + bool gamut_warning, bool is_linear) +{ + GLSLF("// color mapping\n"); + + // Compute the highest encodable level + float src_range = mp_trc_nom_peak(src.gamma), + dst_range = mp_trc_nom_peak(dst.gamma); + float ref_peak = src.sig_peak / dst_range; + + // Some operations need access to the video's luma coefficients, so make + // them available + float rgb2xyz[3][3]; + mp_get_rgb2xyz_matrix(mp_get_csp_primaries(src.primaries), rgb2xyz); + gl_sc_uniform_vec3(sc, "src_luma", rgb2xyz[1]); + mp_get_rgb2xyz_matrix(mp_get_csp_primaries(dst.primaries), rgb2xyz); + gl_sc_uniform_vec3(sc, "dst_luma", rgb2xyz[1]); + + // All operations from here on require linear light as a starting point, + // so we linearize even if src.gamma == dst.gamma when one of the other + // operations needs it + bool need_gamma = src.gamma != dst.gamma || + src.primaries != dst.primaries || + src_range != dst_range || + src.sig_peak > dst_range || + src.light != dst.light; + + if (need_gamma && !is_linear) { + pass_linearize(sc, src.gamma); + is_linear= true; + } + + if (src.light != dst.light) + pass_ootf(sc, src.light, mp_trc_nom_peak(src.gamma)); + + // Rescale the signal to compensate for differences in the encoding range + // and reference white level. This is necessary because of how mpv encodes + // brightness in textures. + if (src_range != dst_range) { + GLSLF("// rescale value range;\n"); + GLSLF("color.rgb *= vec3(%f);\n", src_range / dst_range); + } + + // Adapt to the right colorspace if necessary + if (src.primaries != dst.primaries) { + struct mp_csp_primaries csp_src = mp_get_csp_primaries(src.primaries), + csp_dst = mp_get_csp_primaries(dst.primaries); + float m[3][3] = {{0}}; + mp_get_cms_matrix(csp_src, csp_dst, MP_INTENT_RELATIVE_COLORIMETRIC, m); + gl_sc_uniform_mat3(sc, "cms_matrix", true, &m[0][0]); + GLSL(color.rgb = cms_matrix * color.rgb;) + // Since this can reduce the gamut, figure out by how much + for (int c = 0; c < 3; c++) + ref_peak = MPMAX(ref_peak, m[c][c]); + } + + // Tone map to prevent clipping when the source signal peak exceeds the + // encodable range or we've reduced the gamut + if (ref_peak > 1) { + pass_tone_map(sc, detect_peak ? 0 : ref_peak, algo, + tone_mapping_param, tone_mapping_desat); + } + + if (src.light != dst.light) + pass_inverse_ootf(sc, dst.light, mp_trc_nom_peak(dst.gamma)); + + // Warn for remaining out-of-gamut colors is enabled + if (gamut_warning) { + GLSL(if (any(greaterThan(color.rgb, vec3(1.01))))) + GLSL(color.rgb = vec3(1.0) - color.rgb;) // invert + } + + if (is_linear) + pass_delinearize(sc, dst.gamma); +} + +// Wide usage friendly PRNG, shamelessly stolen from a GLSL tricks forum post. +// Obtain random numbers by calling rand(h), followed by h = permute(h) to +// update the state. Assumes the texture was hooked. 
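
Translated to plain C, the calling convention of these hash helpers looks roughly like the sketch below. The names (mod289f, permutef, randf) are illustrative only; the real helpers are emitted as GLSL by prng_init() just after this comment, and the seed values here are arbitrary stand-ins for HOOKED_pos and the per-frame uniform.

    #include <math.h>
    #include <stdio.h>

    /* C restatement of the GLSL PRNG helpers, to illustrate the
     * rand()/permute() calling convention. Names are illustrative only. */
    static float mod289f(float x)  { return x - floorf(x * (1.0f / 289.0f)) * 289.0f; }
    static float permutef(float x) { return mod289f((34.0f * x + 1.0f) * x); }
    static float randf(float x)    { float v = x * (1.0f / 41.0f); return v - floorf(v); }

    int main(void)
    {
        /* Seed: an arbitrary texture position plus a per-frame uniform,
         * mirroring vec3(HOOKED_pos, random) + vec3(1.0). */
        float mx = 0.25f + 1.0f, my = 0.75f + 1.0f, mz = 0.1337f + 1.0f;
        float h = permutef(permutef(permutef(mx) + my) + mz);

        for (int i = 0; i < 4; i++) {
            printf("%f\n", randf(h)); /* draw a pseudo-random value in [0,1) */
            h = permutef(h);          /* advance the PRNG state */
        }
        return 0;
    }
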
+static void prng_init(struct gl_shader_cache *sc, AVLFG *lfg) +{ + GLSLH(float mod289(float x) { return x - floor(x * 1.0/289.0) * 289.0; }) + GLSLH(float permute(float x) { return mod289((34.0*x + 1.0) * x); }) + GLSLH(float rand(float x) { return fract(x * 1.0/41.0); }) + + // Initialize the PRNG by hashing the position + a random uniform + GLSL(vec3 _m = vec3(HOOKED_pos, random) + vec3(1.0);) + GLSL(float h = permute(permute(permute(_m.x)+_m.y)+_m.z);) + gl_sc_uniform_f(sc, "random", (double)av_lfg_get(lfg) / UINT32_MAX); +} + +struct deband_opts { + int enabled; + int iterations; + float threshold; + float range; + float grain; +}; + +const struct deband_opts deband_opts_def = { + .iterations = 1, + .threshold = 64.0, + .range = 16.0, + .grain = 48.0, +}; + +#define OPT_BASE_STRUCT struct deband_opts +const struct m_sub_options deband_conf = { + .opts = (const m_option_t[]) { + OPT_INTRANGE("iterations", iterations, 0, 1, 16), + OPT_FLOATRANGE("threshold", threshold, 0, 0.0, 4096.0), + OPT_FLOATRANGE("range", range, 0, 1.0, 64.0), + OPT_FLOATRANGE("grain", grain, 0, 0.0, 4096.0), + {0} + }, + .size = sizeof(struct deband_opts), + .defaults = &deband_opts_def, +}; + +// Stochastically sample a debanded result from a hooked texture. +void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts, + AVLFG *lfg, enum mp_csp_trc trc) +{ + // Initialize the PRNG + GLSLF("{\n"); + prng_init(sc, lfg); + + // Helper: Compute a stochastic approximation of the avg color around a + // pixel + GLSLHF("vec4 average(float range, inout float h) {\n"); + // Compute a random rangle and distance + GLSLH(float dist = rand(h) * range; h = permute(h);) + GLSLH(float dir = rand(h) * 6.2831853; h = permute(h);) + GLSLH(vec2 o = dist * vec2(cos(dir), sin(dir));) + + // Sample at quarter-turn intervals around the source pixel + GLSLH(vec4 ref[4];) + GLSLH(ref[0] = HOOKED_texOff(vec2( o.x, o.y));) + GLSLH(ref[1] = HOOKED_texOff(vec2(-o.y, o.x));) + GLSLH(ref[2] = HOOKED_texOff(vec2(-o.x, -o.y));) + GLSLH(ref[3] = HOOKED_texOff(vec2( o.y, -o.x));) + + // Return the (normalized) average + GLSLH(return (ref[0] + ref[1] + ref[2] + ref[3])*0.25;) + GLSLHF("}\n"); + + // Sample the source pixel + GLSL(color = HOOKED_tex(HOOKED_pos);) + GLSLF("vec4 avg, diff;\n"); + for (int i = 1; i <= opts->iterations; i++) { + // Sample the average pixel and use it instead of the original if + // the difference is below the given threshold + GLSLF("avg = average(%f, h);\n", i * opts->range); + GLSL(diff = abs(color - avg);) + GLSLF("color = mix(avg, color, greaterThan(diff, vec4(%f)));\n", + opts->threshold / (i * 16384.0)); + } + + // Add some random noise to smooth out residual differences + GLSL(vec3 noise;) + GLSL(noise.x = rand(h); h = permute(h);) + GLSL(noise.y = rand(h); h = permute(h);) + GLSL(noise.z = rand(h); h = permute(h);) + + // Noise is scaled to the signal level to prevent extreme noise for HDR + float gain = opts->grain/8192.0 / mp_trc_nom_peak(trc); + GLSLF("color.xyz += %f * (noise - vec3(0.5));\n", gain); + GLSLF("}\n"); +} + +// Assumes the texture was hooked +void pass_sample_unsharp(struct gl_shader_cache *sc, float param) { + GLSLF("{\n"); + GLSL(float st1 = 1.2;) + GLSL(vec4 p = HOOKED_tex(HOOKED_pos);) + GLSL(vec4 sum1 = HOOKED_texOff(st1 * vec2(+1, +1)) + + HOOKED_texOff(st1 * vec2(+1, -1)) + + HOOKED_texOff(st1 * vec2(-1, +1)) + + HOOKED_texOff(st1 * vec2(-1, -1));) + GLSL(float st2 = 1.5;) + GLSL(vec4 sum2 = HOOKED_texOff(st2 * vec2(+1, 0)) + + HOOKED_texOff(st2 * vec2( 0, +1)) + + 
HOOKED_texOff(st2 * vec2(-1, 0)) + + HOOKED_texOff(st2 * vec2( 0, -1));) + GLSL(vec4 t = p * 0.859375 + sum2 * -0.1171875 + sum1 * -0.09765625;) + GLSLF("color = p + t * %f;\n", param); + GLSLF("}\n"); +} diff --git a/video/out/gpu/video_shaders.h b/video/out/gpu/video_shaders.h new file mode 100644 index 0000000000..8345e4c598 --- /dev/null +++ b/video/out/gpu/video_shaders.h @@ -0,0 +1,56 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#ifndef MP_GL_VIDEO_SHADERS_H +#define MP_GL_VIDEO_SHADERS_H + +#include + +#include "utils.h" +#include "video.h" + +extern const struct deband_opts deband_opts_def; +extern const struct m_sub_options deband_conf; + +void sampler_prelude(struct gl_shader_cache *sc, int tex_num); +void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler, + int d_x, int d_y); +void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, + int components, int glsl_version); +void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, + int components, int bw, int bh, int iw, int ih); +void pass_sample_bicubic_fast(struct gl_shader_cache *sc); +void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler, + int w, int h); + +void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc); +void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc); +void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak); +void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak); + +void pass_color_map(struct gl_shader_cache *sc, + struct mp_colorspace src, struct mp_colorspace dst, + enum tone_mapping algo, float tone_mapping_param, + float tone_mapping_desat, bool use_detected_peak, + bool gamut_warning, bool is_linear); + +void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts, + AVLFG *lfg, enum mp_csp_trc trc); + +void pass_sample_unsharp(struct gl_shader_cache *sc, float param); + +#endif diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h index 7b2e3ed497..b9f582b79f 100644 --- a/video/out/opengl/common.h +++ b/video/out/opengl/common.h @@ -26,10 +26,10 @@ #include "common/msg.h" #include "misc/bstr.h" -#include "video/out/vo.h" #include "video/csputils.h" - #include "video/mp_image.h" +#include "video/out/vo.h" +#include "video/out/gpu/ra.h" #include "gl_headers.h" diff --git a/video/out/opengl/context.c b/video/out/opengl/context.c index fe454e9741..d3cdcac3b7 100644 --- a/video/out/opengl/context.c +++ b/video/out/opengl/context.c @@ -1,10 +1,4 @@ /* - * common OpenGL routines - * - * copyleft (C) 2005-2010 Reimar Döffinger - * Special thanks go to the xine team and Matthias Hopf, whose video_out_opengl.c - * gave me lots of good ideas. - * * This file is part of mpv. * * mpv is free software; you can redistribute it and/or @@ -21,73 +15,10 @@ * License along with mpv. If not, see . 
*/ -#include -#include -#include -#include -#include -#include -#include - +#include "options/m_config.h" #include "context.h" -#include "common/common.h" -#include "options/options.h" -#include "options/m_option.h" - -extern const struct mpgl_driver mpgl_driver_x11; -extern const struct mpgl_driver mpgl_driver_x11egl; -extern const struct mpgl_driver mpgl_driver_x11_probe; -extern const struct mpgl_driver mpgl_driver_drm_egl; -extern const struct mpgl_driver mpgl_driver_drm; -extern const struct mpgl_driver mpgl_driver_cocoa; -extern const struct mpgl_driver mpgl_driver_wayland; -extern const struct mpgl_driver mpgl_driver_w32; -extern const struct mpgl_driver mpgl_driver_angle; -extern const struct mpgl_driver mpgl_driver_angle_es2; -extern const struct mpgl_driver mpgl_driver_dxinterop; -extern const struct mpgl_driver mpgl_driver_rpi; -extern const struct mpgl_driver mpgl_driver_mali; -extern const struct mpgl_driver mpgl_driver_vdpauglx; - -static const struct mpgl_driver *const backends[] = { -#if HAVE_RPI - &mpgl_driver_rpi, -#endif -#if HAVE_GL_COCOA - &mpgl_driver_cocoa, -#endif -#if HAVE_EGL_ANGLE_WIN32 - &mpgl_driver_angle, -#endif -#if HAVE_GL_WIN32 - &mpgl_driver_w32, -#endif -#if HAVE_GL_DXINTEROP - &mpgl_driver_dxinterop, -#endif -#if HAVE_GL_X11 - &mpgl_driver_x11_probe, -#endif -#if HAVE_EGL_X11 - &mpgl_driver_x11egl, -#endif -#if HAVE_GL_X11 - &mpgl_driver_x11, -#endif -#if HAVE_GL_WAYLAND - &mpgl_driver_wayland, -#endif -#if HAVE_EGL_DRM - &mpgl_driver_drm, - &mpgl_driver_drm_egl, -#endif -#if HAVE_MALI_FBDEV - &mpgl_driver_mali, -#endif -#if HAVE_VDPAU_GL_X11 - &mpgl_driver_vdpauglx, -#endif -}; +#include "ra_gl.h" +#include "utils.h" // 0-terminated list of desktop GL versions a backend should try to // initialize. The first entry is the most preferred version. 
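
In practice a context backend is expected to walk this list front to back, filtering each candidate through ra_gl_ctx_test_version() from the next hunk before it attempts context creation. A minimal sketch of that pattern follows, assuming a hypothetical backend-specific helper try_create_glcontext() that is not part of this patch:

    /* Hypothetical backend helper: tries to create a context of the given
     * desktop GL version and returns whether it succeeded. */
    static bool try_create_glcontext(struct ra_ctx *ctx, int gl_version);

    static bool probe_desktop_gl(struct ra_ctx *ctx)
    {
        for (int n = 0; mpgl_preferred_gl_versions[n]; n++) {
            int ver = mpgl_preferred_gl_versions[n];

            /* Respect --opengl-restrict and --opengl-es (see opengl_conf below). */
            if (!ra_gl_ctx_test_version(ctx, ver, false)) /* false = desktop GL, not GLES */
                continue;

            if (try_create_glcontext(ctx, ver))
                return true;
        }
        return false;
    }
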
@@ -103,140 +34,319 @@ const int mpgl_preferred_gl_versions[] = { 0 }; -int mpgl_find_backend(const char *name) +enum { + FLUSH_NO = 0, + FLUSH_YES, + FLUSH_AUTO, +}; + +enum { + GLES_AUTO = 0, + GLES_YES, + GLES_NO, +}; + +struct opengl_opts { + int use_glfinish; + int waitvsync; + int vsync_pattern[2]; + int swapinterval; + int early_flush; + int restrict_version; + int gles_mode; +}; + +#define OPT_BASE_STRUCT struct opengl_opts +const struct m_sub_options opengl_conf = { + .opts = (const struct m_option[]) { + OPT_FLAG("opengl-glfinish", use_glfinish, 0), + OPT_FLAG("opengl-waitvsync", waitvsync, 0), + OPT_INT("opengl-swapinterval", swapinterval, 0), + OPT_INTPAIR("opengl-check-pattern", vsync_pattern, 0), + OPT_INT("opengl-restrict", restrict_version, 0), + OPT_CHOICE("opengl-es", gles_mode, 0, + ({"auto", GLES_AUTO}, {"yes", GLES_YES}, {"no", GLES_NO})), + OPT_CHOICE("opengl-early-flush", early_flush, 0, + ({"no", FLUSH_NO}, {"yes", FLUSH_YES}, {"auto", FLUSH_AUTO})), + + OPT_REPLACED("opengl-debug", "gpu-debug"), + OPT_REPLACED("opengl-sw", "gpu-sw"), + OPT_REPLACED("opengl-vsync-fences", "swapchain-depth"), + OPT_REPLACED("opengl-backend", "gpu-context"), + {0}, + }, + .defaults = &(const struct opengl_opts) { + .swapinterval = 1, + }, + .size = sizeof(struct opengl_opts), +}; + +struct priv { + GL *gl; + struct mp_log *log; + struct ra_gl_ctx_params params; + struct opengl_opts *opts; + struct ra_swapchain_fns fns; + GLuint main_fb; + struct ra_tex *wrapped_fb; // corresponds to main_fb + // for debugging: + int frames_rendered; + unsigned int prev_sgi_sync_count; + // for gl_vsync_pattern + int last_pattern; + int matches, mismatches; + // for swapchain_depth simulation + GLsync *vsync_fences; + int num_vsync_fences; +}; + +bool ra_gl_ctx_test_version(struct ra_ctx *ctx, int version, bool es) { - if (name == NULL || strcmp(name, "auto") == 0) - return -1; - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) { - if (strcmp(backends[n]->name, name) == 0) - return n; + bool ret; + struct opengl_opts *opts; + void *tmp = talloc_new(NULL); + opts = mp_get_config_group(tmp, ctx->global, &opengl_conf); + + // Version too high + if (opts->restrict_version && version >= opts->restrict_version) { + ret = false; + goto done; } - return -2; -} -int mpgl_validate_backend_opt(struct mp_log *log, const struct m_option *opt, - struct bstr name, struct bstr param) -{ - if (bstr_equals0(param, "help")) { - mp_info(log, "OpenGL windowing backends:\n"); - mp_info(log, " auto (autodetect)\n"); - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) - mp_info(log, " %s\n", backends[n]->name); - return M_OPT_EXIT; + switch (opts->gles_mode) { + case GLES_YES: ret = es; goto done; + case GLES_NO: ret = !es; goto done; + case GLES_AUTO: ret = true; goto done; + default: abort(); } - char s[20]; - snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); - return mpgl_find_backend(s) >= -1 ? 1 : M_OPT_INVALID; + +done: + talloc_free(tmp); + return ret; } -static void *get_native_display(void *pctx, const char *name) +static void *get_native_display(void *priv, const char *name) { - MPGLContext *ctx = pctx; - if (!ctx->native_display_type || !name) + struct priv *p = priv; + if (!p->params.native_display_type || !name) + return NULL; + if (strcmp(p->params.native_display_type, name) != 0) return NULL; - return strcmp(ctx->native_display_type, name) == 0 ? 
ctx->native_display : NULL; + + return p->params.native_display; } -static MPGLContext *init_backend(struct vo *vo, const struct mpgl_driver *driver, - bool probing, int vo_flags) +void ra_gl_ctx_uninit(struct ra_ctx *ctx) { - MPGLContext *ctx = talloc_ptrtype(NULL, ctx); - *ctx = (MPGLContext) { - .gl = talloc_zero(ctx, GL), - .vo = vo, - .global = vo->global, - .driver = driver, - .log = vo->log, + if (ctx->ra) + ctx->ra->fns->destroy(ctx->ra); + if (ctx->swapchain) { + talloc_free(ctx->swapchain); + ctx->swapchain = NULL; + } +} + +static const struct ra_swapchain_fns ra_gl_swapchain_fns; + +bool ra_gl_ctx_init(struct ra_ctx *ctx, GL *gl, struct ra_gl_ctx_params params) +{ + struct ra_swapchain *sw = ctx->swapchain = talloc_ptrtype(NULL, sw); + *sw = (struct ra_swapchain) { + .ctx = ctx, + .flip_v = !params.flipped, // OpenGL framebuffers are normally inverted }; - if (probing) - vo_flags |= VOFLAG_PROBING; - bool old_probing = vo->probing; - vo->probing = probing; // hack; kill it once backends are separate - MP_VERBOSE(vo, "Initializing OpenGL backend '%s'\n", ctx->driver->name); - ctx->priv = talloc_zero_size(ctx, ctx->driver->priv_size); - if (ctx->driver->init(ctx, vo_flags) < 0) { - vo->probing = old_probing; - talloc_free(ctx); - return NULL; + + struct priv *p = sw->priv = talloc_ptrtype(sw, p); + *p = (struct priv) { + .gl = gl, + .log = ctx->log, + .params = params, + .opts = mp_get_config_group(p, ctx->global, &opengl_conf), + .fns = ra_gl_swapchain_fns, + }; + + sw->fns = &p->fns; + + const struct ra_swapchain_fns *ext = p->params.external_swapchain; + if (ext) { + if (ext->color_depth) + p->fns.color_depth = ext->color_depth; + if (ext->screenshot) + p->fns.screenshot = ext->screenshot; + if (ext->start_frame) + p->fns.start_frame = ext->start_frame; + if (ext->submit_frame) + p->fns.submit_frame = ext->submit_frame; + if (ext->swap_buffers) + p->fns.swap_buffers = ext->swap_buffers; } - vo->probing = old_probing; - if (!ctx->gl->version && !ctx->gl->es) - goto cleanup; + if (!gl->version && !gl->es) + return false; - if (probing && ctx->gl->es && (vo_flags & VOFLAG_NO_GLES)) { - MP_VERBOSE(ctx->vo, "Skipping GLES backend.\n"); - goto cleanup; + if (gl->mpgl_caps & MPGL_CAP_SW) { + MP_WARN(p, "Suspected software renderer or indirect context.\n"); + if (ctx->opts.probing && !ctx->opts.allow_sw) + return false; } - if (ctx->gl->mpgl_caps & MPGL_CAP_SW) { - MP_WARN(ctx->vo, "Suspected software renderer or indirect context.\n"); - if (vo->probing && !(vo_flags & VOFLAG_SW)) - goto cleanup; + gl->debug_context = ctx->opts.debug; + gl->get_native_display_ctx = p; + gl->get_native_display = get_native_display; + + if (gl->SwapInterval) { + gl->SwapInterval(p->opts->swapinterval); + } else { + MP_VERBOSE(p, "GL_*_swap_control extension missing.\n"); } - ctx->gl->debug_context = !!(vo_flags & VOFLAG_GL_DEBUG); + ctx->ra = ra_create_gl(p->gl, ctx->log); + return !!ctx->ra; +} - ctx->gl->get_native_display_ctx = ctx; - ctx->gl->get_native_display = get_native_display; +void ra_gl_ctx_resize(struct ra_swapchain *sw, int w, int h, int fbo) +{ + struct priv *p = sw->priv; + if (p->main_fb == fbo && p->wrapped_fb && p->wrapped_fb->params.w == w + && p->wrapped_fb->params.h == h) + return; - return ctx; + if (p->wrapped_fb) + ra_tex_free(sw->ctx->ra, &p->wrapped_fb); -cleanup: - mpgl_uninit(ctx); - return NULL; + p->main_fb = fbo; + p->wrapped_fb = ra_create_wrapped_fb(sw->ctx->ra, fbo, w, h); } -// Create a VO window and create a GL context on it. 
-// vo_flags: passed to the backend's create window function -MPGLContext *mpgl_init(struct vo *vo, const char *backend_name, int vo_flags) +int ra_gl_ctx_color_depth(struct ra_swapchain *sw) { - MPGLContext *ctx = NULL; - int index = mpgl_find_backend(backend_name); - if (index == -1) { - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) { - ctx = init_backend(vo, backends[n], true, vo_flags); - if (ctx) - break; - } - // VO forced, but no backend is ok => force the first that works at all - if (!ctx && !vo->probing) { - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) { - ctx = init_backend(vo, backends[n], false, vo_flags); - if (ctx) - break; - } - } - } else if (index >= 0) { - ctx = init_backend(vo, backends[index], false, vo_flags); - } - return ctx; + struct priv *p = sw->priv; + GL *gl = p->gl; + + if (!p->wrapped_fb) + return 0; + + if ((gl->es < 300 && !gl->version) || !(gl->mpgl_caps & MPGL_CAP_FB)) + return 0; + + gl->BindFramebuffer(GL_FRAMEBUFFER, p->main_fb); + + GLenum obj = gl->version ? GL_BACK_LEFT : GL_BACK; + if (p->main_fb) + obj = GL_COLOR_ATTACHMENT0; + + GLint depth_g = 0; + + gl->GetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &depth_g); + + gl->BindFramebuffer(GL_FRAMEBUFFER, 0); + + return depth_g; } -int mpgl_reconfig_window(struct MPGLContext *ctx) +struct mp_image *ra_gl_ctx_screenshot(struct ra_swapchain *sw) { - return ctx->driver->reconfig(ctx); + struct priv *p = sw->priv; + + assert(p->wrapped_fb); + struct mp_image *screen = gl_read_fbo_contents(p->gl, p->main_fb, + p->wrapped_fb->params.w, + p->wrapped_fb->params.h); + + // OpenGL FB is also read in flipped order, so we need to flip when the + // rendering is *not* flipped, which in our case is whenever + // p->params.flipped is true. 
I hope that made sense + if (p->params.flipped) + mp_image_vflip(screen); + + return screen; } -int mpgl_control(struct MPGLContext *ctx, int *events, int request, void *arg) +struct ra_tex *ra_gl_ctx_start_frame(struct ra_swapchain *sw) { - return ctx->driver->control(ctx, events, request, arg); + struct priv *p = sw->priv; + + return p->wrapped_fb; } -void mpgl_start_frame(struct MPGLContext *ctx) +bool ra_gl_ctx_submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame) { - if (ctx->driver->start_frame) - ctx->driver->start_frame(ctx); + struct priv *p = sw->priv; + GL *gl = p->gl; + + if (p->opts->use_glfinish) + gl->Finish(); + + if (gl->FenceSync && !p->params.external_swapchain) { + GLsync fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + if (fence) + MP_TARRAY_APPEND(p, p->vsync_fences, p->num_vsync_fences, fence); + } + + switch (p->opts->early_flush) { + case FLUSH_AUTO: + if (frame->display_synced) + break; + // fall through + case FLUSH_YES: + gl->Flush(); + } + + return true; } -void mpgl_swap_buffers(struct MPGLContext *ctx) +static void check_pattern(struct priv *p, int item) { - ctx->driver->swap_buffers(ctx); + int expected = p->opts->vsync_pattern[p->last_pattern]; + if (item == expected) { + p->last_pattern++; + if (p->last_pattern >= 2) + p->last_pattern = 0; + p->matches++; + } else { + p->mismatches++; + MP_WARN(p, "wrong pattern, expected %d got %d (hit: %d, mis: %d)\n", + expected, item, p->matches, p->mismatches); + } } -void mpgl_uninit(MPGLContext *ctx) +void ra_gl_ctx_swap_buffers(struct ra_swapchain *sw) { - if (ctx) - ctx->driver->uninit(ctx); - talloc_free(ctx); + struct priv *p = sw->priv; + GL *gl = p->gl; + + p->params.swap_buffers(sw->ctx); + p->frames_rendered++; + + if (p->frames_rendered > 5 && !sw->ctx->opts.debug) + ra_gl_set_debug(sw->ctx->ra, false); + + if ((p->opts->waitvsync || p->opts->vsync_pattern[0]) + && gl->GetVideoSync) + { + unsigned int n1 = 0, n2 = 0; + gl->GetVideoSync(&n1); + if (p->opts->waitvsync) + gl->WaitVideoSync(2, (n1 + 1) % 2, &n2); + int step = n1 - p->prev_sgi_sync_count; + p->prev_sgi_sync_count = n1; + MP_DBG(p, "Flip counts: %u->%u, step=%d\n", n1, n2, step); + if (p->opts->vsync_pattern[0]) + check_pattern(p, step); + } + + while (p->num_vsync_fences >= sw->ctx->opts.swapchain_depth) { + gl->ClientWaitSync(p->vsync_fences[0], GL_SYNC_FLUSH_COMMANDS_BIT, 1e9); + gl->DeleteSync(p->vsync_fences[0]); + MP_TARRAY_REMOVE_AT(p->vsync_fences, p->num_vsync_fences, 0); + } } + +static const struct ra_swapchain_fns ra_gl_swapchain_fns = { + .color_depth = ra_gl_ctx_color_depth, + .screenshot = ra_gl_ctx_screenshot, + .start_frame = ra_gl_ctx_start_frame, + .submit_frame = ra_gl_ctx_submit_frame, + .swap_buffers = ra_gl_ctx_swap_buffers, +}; diff --git a/video/out/opengl/context.h b/video/out/opengl/context.h index 229c5ef54f..bdf426b9b4 100644 --- a/video/out/opengl/context.h +++ b/video/out/opengl/context.h @@ -1,116 +1,56 @@ -/* - * common OpenGL routines - * - * copyleft (C) 2005-2010 Reimar Döffinger - * Special thanks go to the xine team and Matthias Hopf, whose video_out_opengl.c - * gave me lots of good ideas. - * - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#ifndef MP_GL_CONTEXT_H_ -#define MP_GL_CONTEXT_H_ +#pragma once +#include "common/global.h" +#include "video/out/gpu/context.h" #include "common.h" -enum { - VOFLAG_GLES = 1 << 0, // Hint to create a GLES context - VOFLAG_NO_GLES = 1 << 1, // Hint to create a desktop GL context - VOFLAG_GL_DEBUG = 1 << 2, // Hint to request debug OpenGL context - VOFLAG_ALPHA = 1 << 3, // Hint to request alpha framebuffer - VOFLAG_SW = 1 << 4, // Hint to accept a software GL renderer - VOFLAG_PROBING = 1 << 6, // The backend is being auto-probed. - VOFLAG_GLES2 = 1 << 7, // Hint for GLESv2 (needs VOFLAG_GLES) -}; - extern const int mpgl_preferred_gl_versions[]; -struct MPGLContext; - -// A windowing backend (like X11, win32, ...), which provides OpenGL rendering. -struct mpgl_driver { - const char *name; - - // Size of the struct allocated for MPGLContext.priv - int priv_size; - - // Init the GL context and possibly the underlying VO backend. - // The created context should be compatible to GL 3.2 core profile, but - // some other GL versions are supported as well (e.g. GL 2.1 or GLES 2). - // Return 0 on success, negative value (-1) on error. - int (*init)(struct MPGLContext *ctx, int vo_flags); - - // Resize the window, or create a new window if there isn't one yet. - // Currently, there is an unfortunate interaction with ctx->vo, and - // display size etc. are determined by it. - // Return 0 on success, negative value (-1) on error. - int (*reconfig)(struct MPGLContext *ctx); - - // Called when rendering starts. The backend can map or resize the - // framebuffer, or update GL.main_fb. swap_buffers() ends the frame. - // Optional. - void (*start_frame)(struct MPGLContext *ctx); - - // Present the frame. - void (*swap_buffers)(struct MPGLContext *ctx); - - // This behaves exactly like vo_driver.control(). - int (*control)(struct MPGLContext *ctx, int *events, int request, void *arg); - - // These behave exactly like vo_driver.wakeup/wait_events. They are - // optional. - void (*wakeup)(struct MPGLContext *ctx); - void (*wait_events)(struct MPGLContext *ctx, int64_t until_time_us); - - // Destroy the GL context and possibly the underlying VO backend. - void (*uninit)(struct MPGLContext *ctx); -}; - -typedef struct MPGLContext { - GL *gl; - struct vo *vo; - const struct mpgl_driver *driver; - struct mpv_global *global; - struct mp_log *log; - - // For hwdec_vaegl.c. +// Returns whether or not a candidate GL version should be accepted or not +// (based on the --opengl opts). Implementations may call this before +// ra_gl_ctx_init if they wish to probe for multiple possible GL versions. +bool ra_gl_ctx_test_version(struct ra_ctx *ctx, int version, bool es); + +// These are a set of helpers for ra_ctx providers based on ra_gl. +// The init function also initializes ctx->ra and ctx->swapchain, so the user +// doesn't have to do this manually. (Similarly, the uninit function will +// clean them up) + +struct ra_gl_ctx_params { + // Set to the platform-specific function to swap buffers, like + // glXSwapBuffers, eglSwapBuffers etc. This will be called by + // ra_gl_ctx_swap_buffers. 
Required unless you either never call that + // function or if you override it yourself. + void (*swap_buffers)(struct ra_ctx *ctx); + + // Set to false if the implementation follows normal GL semantics, which is + // upside down. Set to true if it does *not*, i.e. if rendering is right + // side up + bool flipped; + + // If this is set to non-NULL, then the ra_gl_ctx will consider the GL + // implementation to be using an external swapchain, which disables the + // software simulation of --swapchain-depth. Any functions defined by this + // ra_swapchain_fns structs will entirely replace the equivalent ra_gl_ctx + // functions in the resulting ra_swapchain. + const struct ra_swapchain_fns *external_swapchain; + + // For hwdec_vaegl.c: const char *native_display_type; void *native_display; +}; - // Flip the rendered image vertically. This is useful for dxinterop. - bool flip_v; - - // framebuffer to render to (normally 0) - GLuint main_fb; - - // For free use by the mpgl_driver. - void *priv; -} MPGLContext; - -MPGLContext *mpgl_init(struct vo *vo, const char *backend_name, int vo_flags); -void mpgl_uninit(MPGLContext *ctx); -int mpgl_reconfig_window(struct MPGLContext *ctx); -int mpgl_control(struct MPGLContext *ctx, int *events, int request, void *arg); -void mpgl_start_frame(struct MPGLContext *ctx); -void mpgl_swap_buffers(struct MPGLContext *ctx); - -int mpgl_find_backend(const char *name); +void ra_gl_ctx_uninit(struct ra_ctx *ctx); +bool ra_gl_ctx_init(struct ra_ctx *ctx, GL *gl, struct ra_gl_ctx_params params); -struct m_option; -int mpgl_validate_backend_opt(struct mp_log *log, const struct m_option *opt, - struct bstr name, struct bstr param); +// Call this any time the window size or main framebuffer changes +void ra_gl_ctx_resize(struct ra_swapchain *sw, int w, int h, int fbo); -#endif +// These functions are normally set in the ra_swapchain->fns, but if an +// implementation has a need to override this fns struct with custom functions +// for whatever reason, these can be used to inherit the original behavior. 
+int ra_gl_ctx_color_depth(struct ra_swapchain *sw); +struct mp_image *ra_gl_ctx_screenshot(struct ra_swapchain *sw); +struct ra_tex *ra_gl_ctx_start_frame(struct ra_swapchain *sw); +bool ra_gl_ctx_submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame); +void ra_gl_ctx_swap_buffers(struct ra_swapchain *sw); diff --git a/video/out/opengl/context_cocoa.c b/video/out/opengl/context_cocoa.c index 1d9a10cf38..cdf6faffcd 100644 --- a/video/out/opengl/context_cocoa.c +++ b/video/out/opengl/context_cocoa.c @@ -188,4 +188,4 @@ const struct mpgl_driver mpgl_driver_cocoa = { .swap_buffers = cocoa_swap_buffers, .control = cocoa_control, .uninit = cocoa_uninit, -}; \ No newline at end of file +}; diff --git a/video/out/opengl/context_drm_egl.c b/video/out/opengl/context_drm_egl.c index e52fec451b..21b16a52d5 100644 --- a/video/out/opengl/context_drm_egl.c +++ b/video/out/opengl/context_drm_egl.c @@ -28,10 +28,12 @@ #include #include -#include "context.h" -#include "egl_helpers.h" -#include "common/common.h" #include "video/out/drm_common.h" +#include "common/common.h" + +#include "egl_helpers.h" +#include "common.h" +#include "context.h" #define USE_MASTER 0 @@ -59,6 +61,7 @@ struct egl }; struct priv { + GL gl; struct kms *kms; drmEventContext ev; @@ -75,34 +78,33 @@ struct priv { struct vt_switcher vt_switcher; }; -static bool init_egl(struct MPGLContext *ctx, int flags) +static bool init_egl(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - MP_VERBOSE(ctx->vo, "Initializing EGL\n"); + MP_VERBOSE(ctx, "Initializing EGL\n"); p->egl.display = eglGetDisplay(p->gbm.device); if (p->egl.display == EGL_NO_DISPLAY) { - MP_ERR(ctx->vo, "Failed to get EGL display.\n"); + MP_ERR(ctx, "Failed to get EGL display.\n"); return false; } if (!eglInitialize(p->egl.display, NULL, NULL)) { - MP_ERR(ctx->vo, "Failed to initialize EGL.\n"); + MP_ERR(ctx, "Failed to initialize EGL.\n"); return false; } EGLConfig config; - if (!mpegl_create_context(p->egl.display, ctx->vo->log, flags, - &p->egl.context, &config)) - return -1; - MP_VERBOSE(ctx->vo, "Initializing EGL surface\n"); + if (!mpegl_create_context(ctx, p->egl.display, &p->egl.context, &config)) + return false; + MP_VERBOSE(ctx, "Initializing EGL surface\n"); p->egl.surface = eglCreateWindowSurface(p->egl.display, config, p->gbm.surface, NULL); if (p->egl.surface == EGL_NO_SURFACE) { - MP_ERR(ctx->vo, "Failed to create EGL surface.\n"); + MP_ERR(ctx, "Failed to create EGL surface.\n"); return false; } return true; } -static bool init_gbm(struct MPGLContext *ctx) +static bool init_gbm(struct ra_ctx *ctx) { struct priv *p = ctx->priv; MP_VERBOSE(ctx->vo, "Creating GBM device\n"); @@ -136,7 +138,7 @@ static void framebuffer_destroy_callback(struct gbm_bo *bo, void *data) } static void update_framebuffer_from_bo( - const struct MPGLContext *ctx, struct gbm_bo *bo) + const struct ra_ctx *ctx, struct gbm_bo *bo) { struct priv *p = ctx->priv; p->fb.bo = bo; @@ -161,7 +163,7 @@ static void page_flipped(int fd, unsigned int frame, unsigned int sec, p->waiting_for_flip = false; } -static bool crtc_setup(struct MPGLContext *ctx) +static bool crtc_setup(struct ra_ctx *ctx) { struct priv *p = ctx->priv; if (p->active) @@ -174,7 +176,7 @@ static bool crtc_setup(struct MPGLContext *ctx) return ret == 0; } -static void crtc_release(struct MPGLContext *ctx) +static void crtc_release(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -204,7 +206,7 @@ static void crtc_release(struct MPGLContext *ctx) static void release_vt(void *data) { - struct MPGLContext *ctx = 
data; + struct ra_ctx *ctx = data; MP_VERBOSE(ctx->vo, "Releasing VT"); crtc_release(ctx); if (USE_MASTER) { @@ -221,7 +223,7 @@ static void release_vt(void *data) static void acquire_vt(void *data) { - struct MPGLContext *ctx = data; + struct ra_ctx *ctx = data; MP_VERBOSE(ctx->vo, "Acquiring VT"); if (USE_MASTER) { struct priv *p = ctx->priv; @@ -234,11 +236,41 @@ static void acquire_vt(void *data) crtc_setup(ctx); } -static void drm_egl_uninit(MPGLContext *ctx) +static void drm_egl_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - crtc_release(ctx); + eglSwapBuffers(p->egl.display, p->egl.surface); + p->gbm.next_bo = gbm_surface_lock_front_buffer(p->gbm.surface); + p->waiting_for_flip = true; + update_framebuffer_from_bo(ctx, p->gbm.next_bo); + int ret = drmModePageFlip(p->kms->fd, p->kms->crtc_id, p->fb.id, + DRM_MODE_PAGE_FLIP_EVENT, p); + if (ret) { + MP_WARN(ctx->vo, "Failed to queue page flip: %s\n", mp_strerror(errno)); + } + + // poll page flip finish event + const int timeout_ms = 3000; + struct pollfd fds[1] = { { .events = POLLIN, .fd = p->kms->fd } }; + poll(fds, 1, timeout_ms); + if (fds[0].revents & POLLIN) { + ret = drmHandleEvent(p->kms->fd, &p->ev); + if (ret != 0) { + MP_ERR(ctx->vo, "drmHandleEvent failed: %i\n", ret); + return; + } + } + + gbm_surface_release_buffer(p->gbm.surface, p->gbm.bo); + p->gbm.bo = p->gbm.next_bo; +} +static void drm_egl_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + + crtc_release(ctx); if (p->vt_switcher_active) vt_switcher_destroy(&p->vt_switcher); @@ -258,19 +290,14 @@ static void drm_egl_uninit(MPGLContext *ctx) } } -static int drm_egl_init(struct MPGLContext *ctx, int flags) +static bool drm_egl_init(struct ra_ctx *ctx) { - if (ctx->vo->probing) { - MP_VERBOSE(ctx->vo, "DRM EGL backend can be activated only manually.\n"); - return -1; + if (ctx->opts.probing) { + MP_VERBOSE(ctx, "DRM EGL backend can be activated only manually.\n"); + return false; } - struct priv *p = ctx->priv; - p->kms = NULL; - p->old_crtc = NULL; - p->gbm.surface = NULL; - p->gbm.device = NULL; - p->active = false; - p->waiting_for_flip = false; + + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); p->ev.version = DRM_EVENT_CONTEXT_VERSION; p->ev.page_flip_handler = page_flipped; @@ -279,79 +306,76 @@ static int drm_egl_init(struct MPGLContext *ctx, int flags) vt_switcher_acquire(&p->vt_switcher, acquire_vt, ctx); vt_switcher_release(&p->vt_switcher, release_vt, ctx); } else { - MP_WARN(ctx->vo, "Failed to set up VT switcher. Terminal switching will be unavailable.\n"); + MP_WARN(ctx, "Failed to set up VT switcher. 
Terminal switching will be unavailable.\n"); } - MP_VERBOSE(ctx->vo, "Initializing KMS\n"); - p->kms = kms_create(ctx->vo->log, ctx->vo->opts->drm_connector_spec, + MP_VERBOSE(ctx, "Initializing KMS\n"); + p->kms = kms_create(ctx->log, ctx->vo->opts->drm_connector_spec, ctx->vo->opts->drm_mode_id); if (!p->kms) { MP_ERR(ctx->vo, "Failed to create KMS.\n"); - return -1; + return false; } if (!init_gbm(ctx)) { MP_ERR(ctx->vo, "Failed to setup GBM.\n"); - return -1; + return false; } - if (!init_egl(ctx, flags)) { + if (!init_egl(ctx)) { MP_ERR(ctx->vo, "Failed to setup EGL.\n"); - return -1; + return false; } if (!eglMakeCurrent(p->egl.display, p->egl.surface, p->egl.surface, p->egl.context)) { MP_ERR(ctx->vo, "Failed to make context current.\n"); - return -1; + return false; } - mpegl_load_functions(ctx->gl, ctx->vo->log); - - ctx->native_display_type = "drm"; - ctx->native_display = (void *)(intptr_t)p->kms->fd; - + mpegl_load_functions(&p->gl, ctx->vo->log); // required by gbm_surface_lock_front_buffer eglSwapBuffers(p->egl.display, p->egl.surface); - MP_VERBOSE(ctx->vo, "Preparing framebuffer\n"); + MP_VERBOSE(ctx, "Preparing framebuffer\n"); p->gbm.bo = gbm_surface_lock_front_buffer(p->gbm.surface); if (!p->gbm.bo) { - MP_ERR(ctx->vo, "Failed to lock GBM surface.\n"); - return -1; + MP_ERR(ctx, "Failed to lock GBM surface.\n"); + return false; } update_framebuffer_from_bo(ctx, p->gbm.bo); if (!p->fb.id) { - MP_ERR(ctx->vo, "Failed to create framebuffer.\n"); - return -1; + MP_ERR(ctx, "Failed to create framebuffer.\n"); + return false; } if (!crtc_setup(ctx)) { - MP_ERR(ctx->vo, "Failed to set CRTC for connector %u: %s\n", + MP_ERR(ctx, "Failed to set CRTC for connector %u: %s\n", p->kms->connector->connector_id, mp_strerror(errno)); - return -1; + return false; } - return 0; -} + struct ra_gl_ctx_params params = { + .swap_buffers = drm_egl_swap_buffers, + .native_display_type = "drm", + .native_display = (void *)(intptr_t)p->kms->fd, + }; + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + return false; -static int drm_egl_init_deprecated(struct MPGLContext *ctx, int flags) -{ - if (ctx->vo->probing) - return -1; - MP_WARN(ctx->vo, "'drm-egl' is deprecated, use 'drm' instead.\n"); - return drm_egl_init(ctx, flags); + return true; } -static int drm_egl_reconfig(struct MPGLContext *ctx) +static bool drm_egl_reconfig(struct ra_ctx *ctx) { struct priv *p = ctx->priv; ctx->vo->dwidth = p->fb.width; ctx->vo->dheight = p->fb.height; - return 0; + ra_gl_ctx_resize(ctx->swapchain, p->fb.width, p->fb.height, 0); + return true; } -static int drm_egl_control(struct MPGLContext *ctx, int *events, int request, +static int drm_egl_control(struct ra_ctx *ctx, int *events, int request, void *arg) { struct priv *p = ctx->priv; @@ -367,51 +391,11 @@ static int drm_egl_control(struct MPGLContext *ctx, int *events, int request, return VO_NOTIMPL; } -static void drm_egl_swap_buffers(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl.display, p->egl.surface); - p->gbm.next_bo = gbm_surface_lock_front_buffer(p->gbm.surface); - p->waiting_for_flip = true; - update_framebuffer_from_bo(ctx, p->gbm.next_bo); - int ret = drmModePageFlip(p->kms->fd, p->kms->crtc_id, p->fb.id, - DRM_MODE_PAGE_FLIP_EVENT, p); - if (ret) { - MP_WARN(ctx->vo, "Failed to queue page flip: %s\n", mp_strerror(errno)); - } - - // poll page flip finish event - const int timeout_ms = 3000; - struct pollfd fds[1] = { { .events = POLLIN, .fd = p->kms->fd } }; - poll(fds, 1, timeout_ms); - if (fds[0].revents & POLLIN) { - ret = 
drmHandleEvent(p->kms->fd, &p->ev); - if (ret != 0) { - MP_ERR(ctx->vo, "drmHandleEvent failed: %i\n", ret); - return; - } - } - - gbm_surface_release_buffer(p->gbm.surface, p->gbm.bo); - p->gbm.bo = p->gbm.next_bo; -} - -const struct mpgl_driver mpgl_driver_drm = { +const struct ra_ctx_fns ra_ctx_drm_egl = { + .type = "opengl", .name = "drm", - .priv_size = sizeof(struct priv), - .init = drm_egl_init, .reconfig = drm_egl_reconfig, - .swap_buffers = drm_egl_swap_buffers, - .control = drm_egl_control, - .uninit = drm_egl_uninit, -}; - -const struct mpgl_driver mpgl_driver_drm_egl = { - .name = "drm-egl", - .priv_size = sizeof(struct priv), - .init = drm_egl_init_deprecated, - .reconfig = drm_egl_reconfig, - .swap_buffers = drm_egl_swap_buffers, .control = drm_egl_control, + .init = drm_egl_init, .uninit = drm_egl_uninit, }; diff --git a/video/out/opengl/context_glx.c b/video/out/opengl/context_glx.c new file mode 100644 index 0000000000..462f2cf592 --- /dev/null +++ b/video/out/opengl/context_glx.c @@ -0,0 +1,376 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include + +// FreeBSD 10.0-CURRENT lacks the GLX_ARB_create_context extension completely +#ifndef GLX_CONTEXT_MAJOR_VERSION_ARB +#define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091 +#define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092 +#define GLX_CONTEXT_FLAGS_ARB 0x2094 +#define GLX_CONTEXT_PROFILE_MASK_ARB 0x9126 +#ifndef __APPLE__ +// These are respectively 0x00000001 and 0x00000002 on OSX +#define GLX_CONTEXT_DEBUG_BIT_ARB 0x0001 +#define GLX_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB 0x0002 +#endif +#define GLX_CONTEXT_CORE_PROFILE_BIT_ARB 0x00000001 +#define GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB 0x00000002 +#endif +// GLX_EXT_create_context_es2_profile +#ifndef GLX_CONTEXT_ES2_PROFILE_BIT_EXT +#define GLX_CONTEXT_ES2_PROFILE_BIT_EXT 0x00000004 +#endif + +#include "video/out/x11_common.h" +#include "context.h" +#include "utils.h" + +struct priv { + GL gl; + XVisualInfo *vinfo; + GLXContext context; + GLXFBConfig fbc; +}; + +static void glx_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + + if (p->vinfo) + XFree(p->vinfo); + if (p->context) { + Display *display = ctx->vo->x11->display; + glXMakeCurrent(display, None, NULL); + glXDestroyContext(display, p->context); + } + + vo_x11_uninit(ctx->vo); +} + +static bool create_context_x11_old(struct ra_ctx *ctx, GL *gl) +{ + struct priv *p = ctx->priv; + Display *display = ctx->vo->x11->display; + struct vo *vo = ctx->vo; + + if (p->context) + return true; + + if (!p->vinfo) { + MP_FATAL(vo, "Can't create a legacy GLX context without X visual\n"); + return false; + } + + GLXContext new_context = glXCreateContext(display, p->vinfo, NULL, True); + if (!new_context) { + MP_FATAL(vo, "Could not create GLX context!\n"); + return false; + } + + if (!glXMakeCurrent(display, ctx->vo->x11->window, new_context)) { + MP_FATAL(vo, "Could not set GLX 
context!\n"); + glXDestroyContext(display, new_context); + return false; + } + + const char *glxstr = glXQueryExtensionsString(display, ctx->vo->x11->screen); + + mpgl_load_functions(gl, (void *)glXGetProcAddressARB, glxstr, vo->log); + + p->context = new_context; + + return true; +} + +typedef GLXContext (*glXCreateContextAttribsARBProc) + (Display*, GLXFBConfig, GLXContext, Bool, const int*); + +static bool create_context_x11_gl3(struct ra_ctx *ctx, GL *gl, int gl_version, + bool es) +{ + struct priv *p = ctx->priv; + struct vo *vo = ctx->vo; + + if (p->context) + return true; + + if (!ra_gl_ctx_test_version(ctx, gl_version, es)) + return false; + + glXCreateContextAttribsARBProc glXCreateContextAttribsARB = + (glXCreateContextAttribsARBProc) + glXGetProcAddressARB((const GLubyte *)"glXCreateContextAttribsARB"); + + const char *glxstr = + glXQueryExtensionsString(vo->x11->display, vo->x11->screen); + bool have_ctx_ext = glxstr && !!strstr(glxstr, "GLX_ARB_create_context"); + + if (!(have_ctx_ext && glXCreateContextAttribsARB)) { + return false; + } + + int ctx_flags = ctx->opts.debug ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; + int profile_mask = GLX_CONTEXT_CORE_PROFILE_BIT_ARB; + + if (es) { + profile_mask = GLX_CONTEXT_ES2_PROFILE_BIT_EXT; + if (!(glxstr && strstr(glxstr, "GLX_EXT_create_context_es2_profile"))) + return false; + } + + int context_attribs[] = { + GLX_CONTEXT_MAJOR_VERSION_ARB, MPGL_VER_GET_MAJOR(gl_version), + GLX_CONTEXT_MINOR_VERSION_ARB, MPGL_VER_GET_MINOR(gl_version), + GLX_CONTEXT_PROFILE_MASK_ARB, profile_mask, + GLX_CONTEXT_FLAGS_ARB, ctx_flags, + None + }; + vo_x11_silence_xlib(1); + GLXContext context = glXCreateContextAttribsARB(vo->x11->display, + p->fbc, 0, True, + context_attribs); + vo_x11_silence_xlib(-1); + if (!context) + return false; + + // set context + if (!glXMakeCurrent(vo->x11->display, vo->x11->window, context)) { + MP_FATAL(vo, "Could not set GLX context!\n"); + glXDestroyContext(vo->x11->display, context); + return false; + } + + p->context = context; + + mpgl_load_functions(gl, (void *)glXGetProcAddressARB, glxstr, vo->log); + + return true; +} + +// The GL3/FBC initialization code roughly follows/copies from: +// http://www.opengl.org/wiki/Tutorial:_OpenGL_3.0_Context_Creation_(GLX) +// but also uses some of the old code. + +static GLXFBConfig select_fb_config(struct vo *vo, const int *attribs, bool alpha) +{ + int fbcount; + GLXFBConfig *fbc = glXChooseFBConfig(vo->x11->display, vo->x11->screen, + attribs, &fbcount); + if (!fbc) + return NULL; + + // The list in fbc is sorted (so that the first element is the best). + GLXFBConfig fbconfig = fbcount > 0 ? 
fbc[0] : NULL; + + if (alpha) { + for (int n = 0; n < fbcount; n++) { + XVisualInfo *v = glXGetVisualFromFBConfig(vo->x11->display, fbc[n]); + if (v) { + bool is_rgba = vo_x11_is_rgba_visual(v); + XFree(v); + if (is_rgba) { + fbconfig = fbc[n]; + break; + } + } + } + } + + XFree(fbc); + + return fbconfig; +} + +static void set_glx_attrib(int *attribs, int name, int value) +{ + for (int n = 0; attribs[n * 2 + 0] != None; n++) { + if (attribs[n * 2 + 0] == name) { + attribs[n * 2 + 1] = value; + break; + } + } +} + +static void glx_swap_buffers(struct ra_ctx *ctx) +{ + glXSwapBuffers(ctx->vo->x11->display, ctx->vo->x11->window); +} + +static bool glx_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + struct vo *vo = ctx->vo; + GL *gl = &p->gl; + + if (!vo_x11_init(ctx->vo)) + goto uninit; + + int glx_major, glx_minor; + + if (!glXQueryVersion(vo->x11->display, &glx_major, &glx_minor)) { + MP_ERR(ctx, "GLX not found.\n"); + goto uninit; + } + // FBConfigs were added in GLX version 1.3. + if (MPGL_VER(glx_major, glx_minor) < MPGL_VER(1, 3)) { + MP_ERR(ctx, "GLX version older than 1.3.\n"); + goto uninit; + } + + int glx_attribs[] = { + GLX_X_RENDERABLE, True, + GLX_X_VISUAL_TYPE, GLX_TRUE_COLOR, + GLX_RED_SIZE, 1, + GLX_GREEN_SIZE, 1, + GLX_BLUE_SIZE, 1, + GLX_ALPHA_SIZE, 0, + GLX_DOUBLEBUFFER, True, + None + }; + GLXFBConfig fbc = NULL; + if (ctx->opts.want_alpha) { + set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 1); + fbc = select_fb_config(vo, glx_attribs, true); + if (!fbc) + set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 0); + } + if (!fbc) + fbc = select_fb_config(vo, glx_attribs, false); + if (!fbc) { + MP_ERR(ctx, "no GLX support present\n"); + goto uninit; + } + + int fbid = -1; + if (!glXGetFBConfigAttrib(vo->x11->display, fbc, GLX_FBCONFIG_ID, &fbid)) + MP_VERBOSE(ctx, "GLX chose FB config with ID 0x%x\n", fbid); + + p->fbc = fbc; + p->vinfo = glXGetVisualFromFBConfig(vo->x11->display, fbc); + if (p->vinfo) { + MP_VERBOSE(ctx, "GLX chose visual with ID 0x%x\n", + (int)p->vinfo->visualid); + } else { + MP_WARN(ctx, "Selected GLX FB config has no associated X visual\n"); + } + + if (!vo_x11_create_vo_window(vo, p->vinfo, "gl")) + goto uninit; + + bool success = false; + for (int n = 0; mpgl_preferred_gl_versions[n]; n++) { + int version = mpgl_preferred_gl_versions[n]; + MP_VERBOSE(ctx, "Creating OpenGL %d.%d context...\n", + MPGL_VER_P(version)); + if (version >= 300) { + success = create_context_x11_gl3(ctx, gl, version, false); + } else { + success = create_context_x11_old(ctx, gl); + } + if (success) + break; + } + if (!success) // try again for GLES + success = create_context_x11_gl3(ctx, gl, 200, true); + if (success && !glXIsDirect(vo->x11->display, p->context)) + gl->mpgl_caps |= MPGL_CAP_SW; + if (!success) + goto uninit; + + struct ra_gl_ctx_params params = { + .swap_buffers = glx_swap_buffers, + }; + + if (!ra_gl_ctx_init(ctx, gl, params)) + goto uninit; + + return true; + +uninit: + glx_uninit(ctx); + return false; +} + +static bool glx_init_probe(struct ra_ctx *ctx) +{ + if (!glx_init(ctx)) + return false; + + struct priv *p = ctx->priv; + if (!(p->gl.mpgl_caps & MPGL_CAP_VDPAU)) { + MP_VERBOSE(ctx, "No vdpau support found - probing more things.\n"); + glx_uninit(ctx); + return false; + } + + return true; +} + +static void resize(struct ra_ctx *ctx) +{ + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0); +} + +static bool glx_reconfig(struct ra_ctx *ctx) +{ + vo_x11_config_vo_window(ctx->vo); + resize(ctx); + return 
true; +} + +static int glx_control(struct ra_ctx *ctx, int *events, int request, void *arg) +{ + int ret = vo_x11_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) + resize(ctx); + return ret; +} + +static void glx_wakeup(struct ra_ctx *ctx) +{ + vo_x11_wakeup(ctx->vo); +} + +static void glx_wait_events(struct ra_ctx *ctx, int64_t until_time_us) +{ + vo_x11_wait_events(ctx->vo, until_time_us); +} + +const struct ra_ctx_fns ra_ctx_glx = { + .type = "opengl", + .name = "x11", + .reconfig = glx_reconfig, + .control = glx_control, + .wakeup = glx_wakeup, + .wait_events = glx_wait_events, + .init = glx_init, + .uninit = glx_uninit, +}; + +const struct ra_ctx_fns ra_ctx_glx_probe = { + .type = "opengl", + .name = "x11probe", + .reconfig = glx_reconfig, + .control = glx_control, + .wakeup = glx_wakeup, + .wait_events = glx_wait_events, + .init = glx_init_probe, + .uninit = glx_uninit, +}; diff --git a/video/out/opengl/context_mali_fbdev.c b/video/out/opengl/context_mali_fbdev.c index 66daa7f9ee..8576e536d3 100644 --- a/video/out/opengl/context_mali_fbdev.c +++ b/video/out/opengl/context_mali_fbdev.c @@ -50,8 +50,7 @@ static bool get_fbdev_size(int *w, int *h) } struct priv { - struct mp_log *log; - struct GL *gl; + struct GL gl; EGLDisplay egl_display; EGLConfig egl_config; EGLContext egl_context; @@ -60,9 +59,10 @@ struct priv { int w, h; }; -static void mali_uninit(struct MPGLContext *ctx) +static void mali_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); if (p->egl_surface) { eglMakeCurrent(p->egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, @@ -74,25 +74,29 @@ static void mali_uninit(struct MPGLContext *ctx) eglReleaseThread(); } -static int mali_init(struct MPGLContext *ctx, int flags) +static void mali_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - p->log = ctx->vo->log; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static bool mali_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); if (!get_fbdev_size(&p->w, &p->h)) { - MP_FATAL(p, "Could not get fbdev size.\n"); + MP_FATAL(ctx, "Could not get fbdev size.\n"); goto fail; } p->egl_display = eglGetDisplay(EGL_DEFAULT_DISPLAY); if (!eglInitialize(p->egl_display, NULL, NULL)) { - MP_FATAL(p, "EGL failed to initialize.\n"); + MP_FATAL(ctx, "EGL failed to initialize.\n"); goto fail; } EGLConfig config; - if (!mpegl_create_context(p->egl_display, p->log, flags, &p->egl_context, - &config)) + if (!mpegl_create_context(ctx, p->egl_display, &p->egl_context, &config)) goto fail; p->egl_window = (struct fbdev_window){ @@ -104,53 +108,51 @@ static int mali_init(struct MPGLContext *ctx, int flags) (EGLNativeWindowType)&p->egl_window, NULL); if (p->egl_surface == EGL_NO_SURFACE) { - MP_FATAL(p, "Could not create EGL surface!\n"); + MP_FATAL(ctx, "Could not create EGL surface!\n"); goto fail; } if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context)) { - MP_FATAL(p, "Failed to set context!\n"); + MP_FATAL(ctx, "Failed to set context!\n"); goto fail; } - ctx->gl = talloc_zero(ctx, GL); + mpegl_load_functions(&p->gl, ctx->log); - mpegl_load_functions(ctx->gl, p->log); + struct ra_gl_ctx_params params = { + .swap_buffers = mali_swap_buffers, + }; + + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto fail; - return 0; + return true; fail: mali_uninit(ctx); - return -1; + return false; } -static int mali_reconfig(struct MPGLContext *ctx) +static bool mali_reconfig(struct ra_ctx *ctx) { struct priv *p = ctx->priv; 
ctx->vo->dwidth = p->w; ctx->vo->dheight = p->h; - return 0; + ra_gl_ctx_resize(ctx->swapchain, p->w, p->h, 0); } -static void mali_swap_buffers(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_surface); -} - -static int mali_control(MPGLContext *ctx, int *events, int request, void *arg) +static int mali_control(struct ra_ctx *ctx, int *events, int request, void *arg) { return VO_NOTIMPL; } -const struct mpgl_driver mpgl_driver_mali = { +const struct ra_ctx_fns ra_ctx_mali_fbdev = { + .type = "opengl", .name = "mali-fbdev", - .priv_size = sizeof(struct priv), - .init = mali_init, .reconfig = mali_reconfig, - .swap_buffers = mali_swap_buffers, .control = mali_control, + .init = mali_init, .uninit = mali_uninit, }; diff --git a/video/out/opengl/context_rpi.c b/video/out/opengl/context_rpi.c index e79622be5d..8b447d0bfc 100644 --- a/video/out/opengl/context_rpi.c +++ b/video/out/opengl/context_rpi.c @@ -30,7 +30,7 @@ #include "egl_helpers.h" struct priv { - struct mp_log *log; + struct GL gl; DISPMANX_DISPLAY_HANDLE_T display; DISPMANX_ELEMENT_HANDLE_T window; DISPMANX_UPDATE_HANDLE_T update; @@ -49,13 +49,13 @@ struct priv { static void tv_callback(void *callback_data, uint32_t reason, uint32_t param1, uint32_t param2) { - struct MPGLContext *ctx = callback_data; + struct ra_ctx *ctx = callback_data; struct priv *p = ctx->priv; atomic_store(&p->reload_display, true); vo_wakeup(ctx->vo); } -static void destroy_dispmanx(struct MPGLContext *ctx) +static void destroy_dispmanx(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -77,9 +77,10 @@ static void destroy_dispmanx(struct MPGLContext *ctx) p->update = 0; } -static void rpi_uninit(MPGLContext *ctx) +static void rpi_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); vc_tv_unregister_callback_full(tv_callback, ctx); @@ -92,26 +93,26 @@ static void rpi_uninit(MPGLContext *ctx) p->egl_display = EGL_NO_DISPLAY; } -static int recreate_dispmanx(struct MPGLContext *ctx) +static bool recreate_dispmanx(struct ra_ctx *ctx) { struct priv *p = ctx->priv; int display_nr = 0; int layer = 0; - MP_VERBOSE(ctx->vo, "Recreating DISPMANX state...\n"); + MP_VERBOSE(ctx, "Recreating DISPMANX state...\n"); destroy_dispmanx(ctx); p->display = vc_dispmanx_display_open(display_nr); p->update = vc_dispmanx_update_start(0); if (!p->display || !p->update) { - MP_FATAL(ctx->vo, "Could not get DISPMANX objects.\n"); + MP_FATAL(ctx, "Could not get DISPMANX objects.\n"); goto fail; } uint32_t dispw, disph; if (graphics_get_display_size(0, &dispw, &disph) < 0) { - MP_FATAL(ctx->vo, "Could not get display size.\n"); + MP_FATAL(ctx, "Could not get display size.\n"); goto fail; } p->w = dispw; @@ -145,7 +146,7 @@ static int recreate_dispmanx(struct MPGLContext *ctx) &src, DISPMANX_PROTECTION_NONE, &alpha, 0, 0); if (!p->window) { - MP_FATAL(ctx->vo, "Could not add DISPMANX element.\n"); + MP_FATAL(ctx, "Could not add DISPMANX element.\n"); goto fail; } @@ -161,14 +162,14 @@ static int recreate_dispmanx(struct MPGLContext *ctx) &p->egl_window, NULL); if (p->egl_surface == EGL_NO_SURFACE) { - MP_FATAL(p, "Could not create EGL surface!\n"); + MP_FATAL(ctx, "Could not create EGL surface!\n"); goto fail; } if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context)) { - MP_FATAL(p, "Failed to set context!\n"); + MP_FATAL(ctx, "Failed to set context!\n"); goto fail; } @@ -197,21 +198,27 @@ static int recreate_dispmanx(struct MPGLContext *ctx) ctx->vo->dwidth = p->w; ctx->vo->dheight = 
p->h; + ra_gl_ctx_resize(ctx->swapchain, p->w, p->h, 0); ctx->vo->want_redraw = true; vo_event(ctx->vo, VO_EVENT_WIN_STATE); - return 0; + return true; fail: destroy_dispmanx(ctx); - return -1; + return false; } -static int rpi_init(struct MPGLContext *ctx, int flags) +static void rpi_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - p->log = ctx->vo->log; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static bool rpi_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); bcm_host_init(); @@ -219,43 +226,40 @@ static int rpi_init(struct MPGLContext *ctx, int flags) p->egl_display = eglGetDisplay(EGL_DEFAULT_DISPLAY); if (!eglInitialize(p->egl_display, NULL, NULL)) { - MP_FATAL(p, "EGL failed to initialize.\n"); + MP_FATAL(ctx, "EGL failed to initialize.\n"); goto fail; } - if (!mpegl_create_context(p->egl_display, p->log, 0, &p->egl_context, - &p->egl_config)) + if (!mpegl_create_context(ctx, p->egl_display, &p->egl_context, &p->egl_config)) goto fail; if (recreate_dispmanx(ctx) < 0) goto fail; - ctx->gl = talloc_zero(ctx, GL); + mpegl_load_functions(&p->gl, ctx->log); - mpegl_load_functions(ctx->gl, p->log); + struct ra_gl_ctx_params params = { + .swap_buffers = rpi_swap_buffers, + .native_display_type = "MPV_RPI_WINDOW", + .native_display = p->win_params, + }; - ctx->native_display_type = "MPV_RPI_WINDOW"; - ctx->native_display = p->win_params; + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto fail; - return 0; + return true; fail: rpi_uninit(ctx); - return -1; + return false; } -static int rpi_reconfig(struct MPGLContext *ctx) +static bool rpi_reconfig(struct ra_ctx *ctx) { return recreate_dispmanx(ctx); } -static void rpi_swap_buffers(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_surface); -} - -static struct mp_image *take_screenshot(struct MPGLContext *ctx) +static struct mp_image *take_screenshot(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -289,21 +293,20 @@ fail: return NULL; } - -static int rpi_control(MPGLContext *ctx, int *events, int request, void *arg) +static int rpi_control(struct ra_ctx *ctx, int *events, int request, void *arg) { struct priv *p = ctx->priv; switch (request) { case VOCTRL_SCREENSHOT_WIN: *(struct mp_image **)arg = take_screenshot(ctx); - return true; + return VO_TRUE; case VOCTRL_FULLSCREEN: recreate_dispmanx(ctx); return VO_TRUE; case VOCTRL_CHECK_EVENTS: if (atomic_fetch_and(&p->reload_display, 0)) { - MP_WARN(ctx->vo, "Recovering from display mode switch...\n"); + MP_WARN(ctx, "Recovering from display mode switch...\n"); recreate_dispmanx(ctx); } return VO_TRUE; @@ -315,12 +318,11 @@ static int rpi_control(MPGLContext *ctx, int *events, int request, void *arg) return VO_NOTIMPL; } -const struct mpgl_driver mpgl_driver_rpi = { +const struct ra_ctx_fns ra_ctx_rpi = { + .type = "opengl", .name = "rpi", - .priv_size = sizeof(struct priv), - .init = rpi_init, .reconfig = rpi_reconfig, - .swap_buffers = rpi_swap_buffers, .control = rpi_control, + .init = rpi_init, .uninit = rpi_uninit, -}; \ No newline at end of file +}; diff --git a/video/out/opengl/context_vdpau.c b/video/out/opengl/context_vdpau.c index 40d21ab65c..a2321f78dd 100644 --- a/video/out/opengl/context_vdpau.c +++ b/video/out/opengl/context_vdpau.c @@ -26,8 +26,6 @@ // follow it. I'm not sure about the original nvidia headers. 
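The vdpau context below is the one port in this commit that uses the external_swapchain hook from ra_gl_ctx_params. Reduced to its essentials (again with hypothetical foo_* names, and surf_w/surf_h/surf_fbo standing in for whatever backs the current output surface), the pattern is to override only the callbacks that need special handling and to chain back to the inheritable ra_gl_ctx_* defaults:

struct priv {
    int surf_w, surf_h; // size of the current output surface (hypothetical)
    int surf_fbo;       // GL FBO wrapping that surface (hypothetical)
};

static struct ra_tex *foo_start_frame(struct ra_swapchain *sw)
{
    struct priv *p = sw->ctx->priv;

    // (Re)create the platform surface backing the next frame if needed,
    // then point the swapchain at the FBO wrapping it.
    ra_gl_ctx_resize(sw, p->surf_w, p->surf_h, p->surf_fbo);
    return ra_gl_ctx_start_frame(sw); // inherit the default behavior
}

static bool foo_submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame)
{
    // Hand the rendered surface to the platform's presentation queue here.
    return ra_gl_ctx_submit_frame(sw, frame);
}

static const struct ra_swapchain_fns foo_swapchain = {
    .start_frame  = foo_start_frame,
    .submit_frame = foo_submit_frame,
};

// Passed at init time as:
//     struct ra_gl_ctx_params params = {
//         .swap_buffers       = foo_swap_buffers,
//         .external_swapchain = &foo_swapchain,
//     };

Because an external swapchain disables the software simulation of --swapchain-depth, the vdpau code sizes its own surface ring as swapchain_depth + 1: up to swapchain_depth frames queued for presentation plus the one currently visible.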
#define BRAINDEATH(x) ((void *)(uintptr_t)(x)) -#define NUM_SURFACES 4 - struct surface { int w, h; VdpOutputSurface surface; @@ -39,21 +37,22 @@ struct surface { }; struct priv { + GL gl; GLXContext context; struct mp_vdpau_ctx *vdp; VdpPresentationQueueTarget vdp_target; VdpPresentationQueue vdp_queue; + struct surface *surfaces; int num_surfaces; - struct surface surfaces[NUM_SURFACES]; - int current_surface; + int idx_surfaces; }; typedef GLXContext (*glXCreateContextAttribsARBProc) (Display*, GLXFBConfig, GLXContext, Bool, const int*); -static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) +static bool create_context_x11(struct ra_ctx *ctx) { - struct priv *glx_ctx = ctx->priv; + struct priv *p = ctx->priv; struct vo *vo = ctx->vo; int glx_major, glx_minor; @@ -62,6 +61,9 @@ static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) return false; } + if (!ra_gl_ctx_test_version(ctx, MPGL_VER(glx_major, glx_minor), false)) + return false; + int glx_attribs[] = { GLX_X_RENDERABLE, True, GLX_X_VISUAL_TYPE, GLX_TRUE_COLOR, @@ -96,7 +98,7 @@ static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) return false; } - int ctx_flags = vo_flags & VOFLAG_GL_DEBUG ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; + int ctx_flags = ctx->opts.debug ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; int context_attribs[] = { GLX_CONTEXT_MAJOR_VERSION_ARB, 4, GLX_CONTEXT_MINOR_VERSION_ARB, 0, @@ -117,19 +119,20 @@ static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) return false; } - glx_ctx->context = context; - mpgl_load_functions(ctx->gl, (void *)glXGetProcAddressARB, glxstr, vo->log); + p->context = context; + mpgl_load_functions(&p->gl, (void *)glXGetProcAddressARB, glxstr, vo->log); return true; } -static int create_vdpau_objects(struct MPGLContext *ctx) +static int create_vdpau_objects(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + struct GL *gl = &p->gl; VdpDevice dev = p->vdp->vdp_device; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - ctx->gl->VDPAUInitNV(BRAINDEATH(dev), p->vdp->get_proc_address); + gl->VDPAUInitNV(BRAINDEATH(dev), p->vdp->get_proc_address); vdp_st = vdp->presentation_queue_target_create_x11(dev, ctx->vo->x11->window, &p->vdp_target); @@ -141,13 +144,13 @@ static int create_vdpau_objects(struct MPGLContext *ctx) return 0; } -static void destroy_vdpau_surface(struct MPGLContext *ctx, +static void destroy_vdpau_surface(struct ra_ctx *ctx, struct surface *surface) { struct priv *p = ctx->priv; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - GL *gl = ctx->gl; + GL *gl = &p->gl; if (surface->mapped) gl->VDPAUUnmapSurfacesNV(1, &surface->registered); @@ -168,14 +171,14 @@ static void destroy_vdpau_surface(struct MPGLContext *ctx, }; } -static int recreate_vdpau_surface(struct MPGLContext *ctx, - struct surface *surface) +static bool recreate_vdpau_surface(struct ra_ctx *ctx, + struct surface *surface) { struct priv *p = ctx->priv; VdpDevice dev = p->vdp->vdp_device; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - GL *gl = ctx->gl; + GL *gl = &p->gl; destroy_vdpau_surface(ctx, surface); @@ -219,16 +222,37 @@ static int recreate_vdpau_surface(struct MPGLContext *ctx, gl->VDPAUUnmapSurfacesNV(1, &surface->registered); surface->mapped = false; - return 0; + return true; error: destroy_vdpau_surface(ctx, surface); - return -1; + return false; +} + +static void vdpau_swap_buffers(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + struct vdp_functions *vdp = &p->vdp->vdp; + VdpStatus vdp_st; + + // This is the 
*next* surface we will be rendering to. By delaying the + // block_until_idle, we're essentially allowing p->num_surfaces - 1 + // in-flight surfaces, plus the one currently visible surface. + struct surface *surf = &p->surfaces[p->idx_surfaces]; + if (surf->surface == VDP_INVALID_HANDLE) + return; + + VdpTime prev_vsync_time; + vdp_st = vdp->presentation_queue_block_until_surface_idle(p->vdp_queue, + surf->surface, + &prev_vsync_time); + CHECK_VDP_WARNING(ctx, "waiting for surface failed"); } -static void glx_uninit(MPGLContext *ctx) +static void vdpau_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); if (p->vdp) { struct vdp_functions *vdp = &p->vdp->vdp; @@ -259,10 +283,12 @@ static void glx_uninit(MPGLContext *ctx) vo_x11_uninit(ctx->vo); } -static int glx_init(struct MPGLContext *ctx, int flags) +static const struct ra_swapchain_fns vdpau_swapchain; + +static bool vdpau_init(struct ra_ctx *ctx) { struct vo *vo = ctx->vo; - struct priv *p = ctx->priv; + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); p->vdp_queue = VDP_INVALID_HANDLE; p->vdp_target = VDP_INVALID_HANDLE; @@ -280,110 +306,112 @@ static int glx_init(struct MPGLContext *ctx, int flags) if (!vo_x11_create_vo_window(vo, NULL, "vdpauglx")) goto uninit; - if (!create_context_x11(ctx, flags)) + if (!create_context_x11(ctx)) goto uninit; - if (!(ctx->gl->mpgl_caps & MPGL_CAP_VDPAU)) + if (!(p->gl.mpgl_caps & MPGL_CAP_VDPAU)) goto uninit; if (create_vdpau_objects(ctx) < 0) goto uninit; - p->num_surfaces = NUM_SURFACES; + p->num_surfaces = ctx->opts.swapchain_depth + 1; // +1 for the visible image + p->surfaces = talloc_zero_array(p, struct surface, p->num_surfaces); for (int n = 0; n < p->num_surfaces; n++) p->surfaces[n].surface = VDP_INVALID_HANDLE; - ctx->flip_v = true; + struct ra_gl_ctx_params params = { + .swap_buffers = vdpau_swap_buffers, + .external_swapchain = &vdpau_swapchain, + .flipped = true, + }; - return 0; + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto uninit; + + return true; uninit: - glx_uninit(ctx); - return -1; + vdpau_uninit(ctx); + return false; } -static int glx_reconfig(struct MPGLContext *ctx) +static struct ra_tex *vdpau_start_frame(struct ra_swapchain *sw) { - vo_x11_config_vo_window(ctx->vo); - return 0; -} + struct priv *p = sw->ctx->priv; + struct vo *vo = sw->ctx->vo; + GL *gl = &p->gl; + + struct surface *surf = &p->surfaces[p->idx_surfaces]; + if (surf->w != vo->dwidth || surf->h != vo->dheight || + surf->surface == VDP_INVALID_HANDLE) + { + if (!recreate_vdpau_surface(sw->ctx, surf)) + return NULL; + } -static int glx_control(struct MPGLContext *ctx, int *events, int request, - void *arg) -{ - return vo_x11_control(ctx->vo, events, request, arg); + assert(!surf->mapped); + gl->VDPAUMapSurfacesNV(1, &surf->registered); + surf->mapped = true; + + ra_gl_ctx_resize(sw, surf->w, surf->h, surf->fbo); + return ra_gl_ctx_start_frame(sw); } -static void glx_start_frame(struct MPGLContext *ctx) +static bool vdpau_submit_frame(struct ra_swapchain *sw, + const struct vo_frame *frame) { - struct priv *p = ctx->priv; + struct priv *p = sw->ctx->priv; + GL *gl = &p->gl; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - GL *gl = ctx->gl; - - struct surface *surface = &p->surfaces[p->current_surface]; - - if (surface->surface != VDP_INVALID_HANDLE) { - VdpTime prev_vsync_time; - vdp_st = vdp->presentation_queue_block_until_surface_idle(p->vdp_queue, - surface->surface, - &prev_vsync_time); - CHECK_VDP_WARNING(ctx, "waiting for surface failed"); - } - 
if (surface->w != ctx->vo->dwidth || surface->h != ctx->vo->dheight) - recreate_vdpau_surface(ctx, surface); + struct surface *surf = &p->surfaces[p->idx_surfaces]; + assert(surf->surface != VDP_INVALID_HANDLE); + assert(surf->mapped); + gl->VDPAUUnmapSurfacesNV(1, &surf->registered); + surf->mapped = false; + vdp_st = vdp->presentation_queue_display(p->vdp_queue, surf->surface, 0, 0, 0); + CHECK_VDP_WARNING(sw->ctx, "trying to present vdp surface"); - ctx->main_fb = surface->fbo; // 0 if creating the surface failed - - if (surface->surface != VDP_INVALID_HANDLE) { - gl->VDPAUMapSurfacesNV(1, &surface->registered); - surface->mapped = true; - } + p->idx_surfaces = (p->idx_surfaces + 1) % p->num_surfaces; + return ra_gl_ctx_submit_frame(sw, frame) && vdp_st == VDP_STATUS_OK; } -static void glx_swap_buffers(struct MPGLContext *ctx) +static bool vdpau_reconfig(struct ra_ctx *ctx) { - struct priv *p = ctx->priv; - struct vdp_functions *vdp = &p->vdp->vdp; - VdpStatus vdp_st; - GL *gl = ctx->gl; - - struct surface *surface = &p->surfaces[p->current_surface]; - if (surface->surface == VDP_INVALID_HANDLE) - return; // surface alloc probably failed before - - if (surface->mapped) - gl->VDPAUUnmapSurfacesNV(1, &surface->registered); - surface->mapped = false; - - vdp_st = vdp->presentation_queue_display(p->vdp_queue, surface->surface, - 0, 0, 0); - CHECK_VDP_WARNING(ctx, "trying to present vdp surface"); + vo_x11_config_vo_window(ctx->vo); + return true; +} - p->current_surface = (p->current_surface + 1) % p->num_surfaces; +static int vdpau_control(struct ra_ctx *ctx, int *events, int request, void *arg) +{ + return vo_x11_control(ctx->vo, events, request, arg); } -static void glx_wakeup(struct MPGLContext *ctx) +static void vdpau_wakeup(struct ra_ctx *ctx) { vo_x11_wakeup(ctx->vo); } -static void glx_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void vdpau_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_x11_wait_events(ctx->vo, until_time_us); } -const struct mpgl_driver mpgl_driver_vdpauglx = { +static const struct ra_swapchain_fns vdpau_swapchain = { + .start_frame = vdpau_start_frame, + .submit_frame = vdpau_submit_frame, +}; + +const struct ra_ctx_fns ra_ctx_vdpauglx = { + .type = "opengl", .name = "vdpauglx", - .priv_size = sizeof(struct priv), - .init = glx_init, - .reconfig = glx_reconfig, - .start_frame = glx_start_frame, - .swap_buffers = glx_swap_buffers, - .control = glx_control, - .wakeup = glx_wakeup, - .wait_events = glx_wait_events, - .uninit = glx_uninit, + .reconfig = vdpau_reconfig, + .control = vdpau_control, + .wakeup = vdpau_wakeup, + .wait_events = vdpau_wait_events, + .init = vdpau_init, + .uninit = vdpau_uninit, }; diff --git a/video/out/opengl/context_wayland.c b/video/out/opengl/context_wayland.c index 87e98cd64f..6ddc550306 100644 --- a/video/out/opengl/context_wayland.c +++ b/video/out/opengl/context_wayland.c @@ -19,6 +19,7 @@ #include "video/out/wayland_common.h" #include "context.h" #include "egl_helpers.h" +#include "utils.h" static void egl_resize(struct vo_wayland_state *wl) { @@ -63,30 +64,42 @@ static void egl_resize(struct vo_wayland_state *wl) wl->vo->want_redraw = true; } -static int egl_create_context(struct vo_wayland_state *wl, MPGLContext *ctx, - int flags) +static void waylandgl_swap_buffers(struct ra_ctx *ctx) { - GL *gl = ctx->gl; + struct vo_wayland_state *wl = ctx->vo->wayland; + vo_wayland_wait_events(ctx->vo, 0); + eglSwapBuffers(wl->egl_context.egl.dpy, wl->egl_context.egl_surface); +} + +static bool 
egl_create_context(struct ra_ctx *ctx, struct vo_wayland_state *wl) +{ + GL *gl = ctx->priv = talloc_zero(ctx, GL); if (!(wl->egl_context.egl.dpy = eglGetDisplay(wl->display.display))) - return -1; + return false; if (eglInitialize(wl->egl_context.egl.dpy, NULL, NULL) != EGL_TRUE) - return -1; + return false; - if (!mpegl_create_context(wl->egl_context.egl.dpy, wl->log, flags, + if (!mpegl_create_context(ctx, wl->egl_context.egl.dpy, &wl->egl_context.egl.ctx, &wl->egl_context.egl.conf)) - return -1; + return false; eglMakeCurrent(wl->egl_context.egl.dpy, NULL, NULL, wl->egl_context.egl.ctx); mpegl_load_functions(gl, wl->log); - ctx->native_display_type = "wl"; - ctx->native_display = wl->display.display; + struct ra_gl_ctx_params params = { + .swap_buffers = waylandgl_swap_buffers, + .native_display_type = "wl", + .native_display = wl->display.display, + }; + + if (!ra_gl_ctx_init(ctx, gl, params)) + return false; - return 0; + return true; } static void egl_create_window(struct vo_wayland_state *wl) @@ -122,23 +135,25 @@ static void egl_create_window(struct vo_wayland_state *wl) eglSwapInterval(wl->egl_context.egl.dpy, 0); } -static int waylandgl_reconfig(struct MPGLContext *ctx) +static bool waylandgl_reconfig(struct ra_ctx *ctx) { struct vo_wayland_state * wl = ctx->vo->wayland; if (!vo_wayland_config(ctx->vo)) - return -1; + return false; if (!wl->egl_context.egl_window) egl_create_window(wl); - return 0; + return true; } -static void waylandgl_uninit(MPGLContext *ctx) +static void waylandgl_uninit(struct ra_ctx *ctx) { struct vo_wayland_state *wl = ctx->vo->wayland; + ra_gl_ctx_uninit(ctx); + if (wl->egl_context.egl.ctx) { eglReleaseThread(); if (wl->egl_context.egl_window) @@ -153,52 +168,45 @@ static void waylandgl_uninit(MPGLContext *ctx) vo_wayland_uninit(ctx->vo); } -static void waylandgl_swap_buffers(MPGLContext *ctx) -{ - struct vo_wayland_state *wl = ctx->vo->wayland; - - vo_wayland_wait_events(ctx->vo, 0); - - eglSwapBuffers(wl->egl_context.egl.dpy, wl->egl_context.egl_surface); -} - -static int waylandgl_control(MPGLContext *ctx, int *events, int request, +static int waylandgl_control(struct ra_ctx *ctx, int *events, int request, void *data) { struct vo_wayland_state *wl = ctx->vo->wayland; int r = vo_wayland_control(ctx->vo, events, request, data); - if (*events & VO_EVENT_RESIZE) + if (*events & VO_EVENT_RESIZE) { egl_resize(wl); + ra_gl_ctx_resize(ctx->swapchain, wl->vo->dwidth, wl->vo->dheight, 0); + } return r; } -static void wayland_wakeup(struct MPGLContext *ctx) +static void wayland_wakeup(struct ra_ctx *ctx) { vo_wayland_wakeup(ctx->vo); } -static void wayland_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void wayland_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_wayland_wait_events(ctx->vo, until_time_us); } -static int waylandgl_init(struct MPGLContext *ctx, int flags) +static bool waylandgl_init(struct ra_ctx *ctx) { if (!vo_wayland_init(ctx->vo)) - return -1; + return false; - return egl_create_context(ctx->vo->wayland, ctx, flags); + return egl_create_context(ctx, ctx->vo->wayland); } -const struct mpgl_driver mpgl_driver_wayland = { +const struct ra_ctx_fns ra_ctx_wayland_egl = { + .type = "opengl", .name = "wayland", - .init = waylandgl_init, .reconfig = waylandgl_reconfig, - .swap_buffers = waylandgl_swap_buffers, .control = waylandgl_control, .wakeup = wayland_wakeup, .wait_events = wayland_wait_events, + .init = waylandgl_init, .uninit = waylandgl_uninit, }; diff --git a/video/out/opengl/context_x11.c 
b/video/out/opengl/context_x11.c deleted file mode 100644 index 4d8dac1ea5..0000000000 --- a/video/out/opengl/context_x11.c +++ /dev/null @@ -1,358 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include -#include - -// FreeBSD 10.0-CURRENT lacks the GLX_ARB_create_context extension completely -#ifndef GLX_CONTEXT_MAJOR_VERSION_ARB -#define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091 -#define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092 -#define GLX_CONTEXT_FLAGS_ARB 0x2094 -#define GLX_CONTEXT_PROFILE_MASK_ARB 0x9126 -#ifndef __APPLE__ -// These are respectively 0x00000001 and 0x00000002 on OSX -#define GLX_CONTEXT_DEBUG_BIT_ARB 0x0001 -#define GLX_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB 0x0002 -#endif -#define GLX_CONTEXT_CORE_PROFILE_BIT_ARB 0x00000001 -#define GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB 0x00000002 -#endif -// GLX_EXT_create_context_es2_profile -#ifndef GLX_CONTEXT_ES2_PROFILE_BIT_EXT -#define GLX_CONTEXT_ES2_PROFILE_BIT_EXT 0x00000004 -#endif - -#include "video/out/x11_common.h" -#include "context.h" - -struct glx_context { - XVisualInfo *vinfo; - GLXContext context; - GLXFBConfig fbc; -}; - -static void glx_uninit(MPGLContext *ctx) -{ - struct glx_context *glx_ctx = ctx->priv; - if (glx_ctx->vinfo) - XFree(glx_ctx->vinfo); - if (glx_ctx->context) { - Display *display = ctx->vo->x11->display; - glXMakeCurrent(display, None, NULL); - glXDestroyContext(display, glx_ctx->context); - } - vo_x11_uninit(ctx->vo); -} - -static bool create_context_x11_old(struct MPGLContext *ctx) -{ - struct glx_context *glx_ctx = ctx->priv; - Display *display = ctx->vo->x11->display; - struct vo *vo = ctx->vo; - GL *gl = ctx->gl; - - if (glx_ctx->context) - return true; - - if (!glx_ctx->vinfo) { - MP_FATAL(vo, "Can't create a legacy GLX context without X visual\n"); - return false; - } - - GLXContext new_context = glXCreateContext(display, glx_ctx->vinfo, NULL, - True); - if (!new_context) { - MP_FATAL(vo, "Could not create GLX context!\n"); - return false; - } - - if (!glXMakeCurrent(display, ctx->vo->x11->window, new_context)) { - MP_FATAL(vo, "Could not set GLX context!\n"); - glXDestroyContext(display, new_context); - return false; - } - - const char *glxstr = glXQueryExtensionsString(display, ctx->vo->x11->screen); - - mpgl_load_functions(gl, (void *)glXGetProcAddressARB, glxstr, vo->log); - - glx_ctx->context = new_context; - - return true; -} - -typedef GLXContext (*glXCreateContextAttribsARBProc) - (Display*, GLXFBConfig, GLXContext, Bool, const int*); - -static bool create_context_x11_gl3(struct MPGLContext *ctx, int vo_flags, - int gl_version, bool es) -{ - struct glx_context *glx_ctx = ctx->priv; - struct vo *vo = ctx->vo; - - if (glx_ctx->context) - return true; - - glXCreateContextAttribsARBProc glXCreateContextAttribsARB = - (glXCreateContextAttribsARBProc) - glXGetProcAddressARB((const GLubyte *)"glXCreateContextAttribsARB"); - - const char *glxstr = - 
glXQueryExtensionsString(vo->x11->display, vo->x11->screen); - bool have_ctx_ext = glxstr && !!strstr(glxstr, "GLX_ARB_create_context"); - - if (!(have_ctx_ext && glXCreateContextAttribsARB)) { - return false; - } - - int ctx_flags = vo_flags & VOFLAG_GL_DEBUG ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; - int profile_mask = GLX_CONTEXT_CORE_PROFILE_BIT_ARB; - - if (es) { - profile_mask = GLX_CONTEXT_ES2_PROFILE_BIT_EXT; - if (!(glxstr && strstr(glxstr, "GLX_EXT_create_context_es2_profile"))) - return false; - } - - int context_attribs[] = { - GLX_CONTEXT_MAJOR_VERSION_ARB, MPGL_VER_GET_MAJOR(gl_version), - GLX_CONTEXT_MINOR_VERSION_ARB, MPGL_VER_GET_MINOR(gl_version), - GLX_CONTEXT_PROFILE_MASK_ARB, profile_mask, - GLX_CONTEXT_FLAGS_ARB, ctx_flags, - None - }; - vo_x11_silence_xlib(1); - GLXContext context = glXCreateContextAttribsARB(vo->x11->display, - glx_ctx->fbc, 0, True, - context_attribs); - vo_x11_silence_xlib(-1); - if (!context) - return false; - - // set context - if (!glXMakeCurrent(vo->x11->display, vo->x11->window, context)) { - MP_FATAL(vo, "Could not set GLX context!\n"); - glXDestroyContext(vo->x11->display, context); - return false; - } - - glx_ctx->context = context; - - mpgl_load_functions(ctx->gl, (void *)glXGetProcAddressARB, glxstr, vo->log); - - return true; -} - -// The GL3/FBC initialization code roughly follows/copies from: -// http://www.opengl.org/wiki/Tutorial:_OpenGL_3.0_Context_Creation_(GLX) -// but also uses some of the old code. - -static GLXFBConfig select_fb_config(struct vo *vo, const int *attribs, int flags) -{ - int fbcount; - GLXFBConfig *fbc = glXChooseFBConfig(vo->x11->display, vo->x11->screen, - attribs, &fbcount); - if (!fbc) - return NULL; - - // The list in fbc is sorted (so that the first element is the best). - GLXFBConfig fbconfig = fbcount > 0 ? fbc[0] : NULL; - - if (flags & VOFLAG_ALPHA) { - for (int n = 0; n < fbcount; n++) { - XVisualInfo *v = glXGetVisualFromFBConfig(vo->x11->display, fbc[n]); - if (v) { - bool is_rgba = vo_x11_is_rgba_visual(v); - XFree(v); - if (is_rgba) { - fbconfig = fbc[n]; - break; - } - } - } - } - - XFree(fbc); - - return fbconfig; -} - -static void set_glx_attrib(int *attribs, int name, int value) -{ - for (int n = 0; attribs[n * 2 + 0] != None; n++) { - if (attribs[n * 2 + 0] == name) { - attribs[n * 2 + 1] = value; - break; - } - } -} - -static int glx_init(struct MPGLContext *ctx, int flags) -{ - struct vo *vo = ctx->vo; - struct glx_context *glx_ctx = ctx->priv; - - if (!vo_x11_init(ctx->vo)) - goto uninit; - - int glx_major, glx_minor; - - if (!glXQueryVersion(vo->x11->display, &glx_major, &glx_minor)) { - MP_ERR(vo, "GLX not found.\n"); - goto uninit; - } - // FBConfigs were added in GLX version 1.3. 
- if (MPGL_VER(glx_major, glx_minor) < MPGL_VER(1, 3)) { - MP_ERR(vo, "GLX version older than 1.3.\n"); - goto uninit; - } - - int glx_attribs[] = { - GLX_X_RENDERABLE, True, - GLX_X_VISUAL_TYPE, GLX_TRUE_COLOR, - GLX_RED_SIZE, 1, - GLX_GREEN_SIZE, 1, - GLX_BLUE_SIZE, 1, - GLX_ALPHA_SIZE, 0, - GLX_DOUBLEBUFFER, True, - None - }; - GLXFBConfig fbc = NULL; - if (flags & VOFLAG_ALPHA) { - set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 1); - fbc = select_fb_config(vo, glx_attribs, flags); - if (!fbc) { - set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 0); - flags &= ~VOFLAG_ALPHA; - } - } - if (!fbc) - fbc = select_fb_config(vo, glx_attribs, flags); - if (!fbc) { - MP_ERR(vo, "no GLX support present\n"); - goto uninit; - } - - int fbid = -1; - if (!glXGetFBConfigAttrib(vo->x11->display, fbc, GLX_FBCONFIG_ID, &fbid)) - MP_VERBOSE(vo, "GLX chose FB config with ID 0x%x\n", fbid); - - glx_ctx->fbc = fbc; - glx_ctx->vinfo = glXGetVisualFromFBConfig(vo->x11->display, fbc); - if (glx_ctx->vinfo) { - MP_VERBOSE(vo, "GLX chose visual with ID 0x%x\n", - (int)glx_ctx->vinfo->visualid); - } else { - MP_WARN(vo, "Selected GLX FB config has no associated X visual\n"); - } - - if (!vo_x11_create_vo_window(vo, glx_ctx->vinfo, "gl")) - goto uninit; - - bool success = false; - if (!(flags & VOFLAG_GLES)) { - for (int n = 0; mpgl_preferred_gl_versions[n]; n++) { - int version = mpgl_preferred_gl_versions[n]; - MP_VERBOSE(vo, "Creating OpenGL %d.%d context...\n", - MPGL_VER_P(version)); - if (version >= 300) { - success = create_context_x11_gl3(ctx, flags, version, false); - } else { - success = create_context_x11_old(ctx); - } - if (success) - break; - } - } - if (!success) // try ES - success = create_context_x11_gl3(ctx, flags, 200, true); - if (success && !glXIsDirect(vo->x11->display, glx_ctx->context)) - ctx->gl->mpgl_caps |= MPGL_CAP_SW; - if (!success) - goto uninit; - - return 0; - -uninit: - glx_uninit(ctx); - return -1; -} - -static int glx_init_probe(struct MPGLContext *ctx, int flags) -{ - int r = glx_init(ctx, flags); - if (r >= 0) { - if (!(ctx->gl->mpgl_caps & MPGL_CAP_VDPAU)) { - MP_VERBOSE(ctx->vo, "No vdpau support found - probing more things.\n"); - glx_uninit(ctx); - r = -1; - } - } - return r; -} - -static int glx_reconfig(struct MPGLContext *ctx) -{ - vo_x11_config_vo_window(ctx->vo); - return 0; -} - -static int glx_control(struct MPGLContext *ctx, int *events, int request, - void *arg) -{ - return vo_x11_control(ctx->vo, events, request, arg); -} - -static void glx_swap_buffers(struct MPGLContext *ctx) -{ - glXSwapBuffers(ctx->vo->x11->display, ctx->vo->x11->window); -} - -static void glx_wakeup(struct MPGLContext *ctx) -{ - vo_x11_wakeup(ctx->vo); -} - -static void glx_wait_events(struct MPGLContext *ctx, int64_t until_time_us) -{ - vo_x11_wait_events(ctx->vo, until_time_us); -} - -const struct mpgl_driver mpgl_driver_x11 = { - .name = "x11", - .priv_size = sizeof(struct glx_context), - .init = glx_init, - .reconfig = glx_reconfig, - .swap_buffers = glx_swap_buffers, - .control = glx_control, - .wakeup = glx_wakeup, - .wait_events = glx_wait_events, - .uninit = glx_uninit, -}; - -const struct mpgl_driver mpgl_driver_x11_probe = { - .name = "x11probe", - .priv_size = sizeof(struct glx_context), - .init = glx_init_probe, - .reconfig = glx_reconfig, - .swap_buffers = glx_swap_buffers, - .control = glx_control, - .wakeup = glx_wakeup, - .wait_events = glx_wait_events, - .uninit = glx_uninit, -}; diff --git a/video/out/opengl/context_x11egl.c b/video/out/opengl/context_x11egl.c index 
2b68007a33..7ab4fe0579 100644 --- a/video/out/opengl/context_x11egl.c +++ b/video/out/opengl/context_x11egl.c @@ -32,14 +32,17 @@ #include "egl_helpers.h" struct priv { + GL gl; EGLDisplay egl_display; EGLContext egl_context; EGLSurface egl_surface; }; -static void mpegl_uninit(MPGLContext *ctx) +static void mpegl_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + if (p->egl_context) { eglMakeCurrent(p->egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); @@ -51,7 +54,7 @@ static void mpegl_uninit(MPGLContext *ctx) static int pick_xrgba_config(void *user_data, EGLConfig *configs, int num_configs) { - struct MPGLContext *ctx = user_data; + struct ra_ctx *ctx = user_data; struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -72,40 +75,44 @@ static int pick_xrgba_config(void *user_data, EGLConfig *configs, int num_config return 0; } -static int mpegl_init(struct MPGLContext *ctx, int flags) +static void mpegl_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static bool mpegl_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); struct vo *vo = ctx->vo; - int msgl = vo->probing ? MSGL_V : MSGL_FATAL; + int msgl = ctx->opts.probing ? MSGL_V : MSGL_FATAL; if (!vo_x11_init(vo)) goto uninit; p->egl_display = eglGetDisplay(vo->x11->display); if (!eglInitialize(p->egl_display, NULL, NULL)) { - mp_msg(vo->log, msgl, "Could not initialize EGL.\n"); + MP_MSG(ctx, msgl, "Could not initialize EGL.\n"); goto uninit; } - struct mpegl_opts opts = { - .vo_flags = flags, + struct mpegl_cb cb = { .user_data = ctx, - .refine_config = (flags & VOFLAG_ALPHA) ? pick_xrgba_config : NULL, + .refine_config = ctx->opts.want_alpha ? 
pick_xrgba_config : NULL, }; EGLConfig config; - if (!mpegl_create_context_opts(p->egl_display, vo->log, &opts, - &p->egl_context, &config)) + if (!mpegl_create_context_cb(ctx, p->egl_display, cb, &p->egl_context, &config)) goto uninit; int vID, n; eglGetConfigAttrib(p->egl_display, config, EGL_NATIVE_VISUAL_ID, &vID); - MP_VERBOSE(vo, "chose visual 0x%x\n", vID); + MP_VERBOSE(ctx, "chose visual 0x%x\n", vID); XVisualInfo template = {.visualid = vID}; XVisualInfo *vi = XGetVisualInfo(vo->x11->display, VisualIDMask, &template, &n); if (!vi) { - MP_FATAL(vo, "Getting X visual failed!\n"); + MP_FATAL(ctx, "Getting X visual failed!\n"); goto uninit; } @@ -120,64 +127,73 @@ static int mpegl_init(struct MPGLContext *ctx, int flags) (EGLNativeWindowType)vo->x11->window, NULL); if (p->egl_surface == EGL_NO_SURFACE) { - MP_FATAL(ctx->vo, "Could not create EGL surface!\n"); + MP_FATAL(ctx, "Could not create EGL surface!\n"); goto uninit; } if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context)) { - MP_FATAL(ctx->vo, "Could not make context current!\n"); + MP_FATAL(ctx, "Could not make context current!\n"); goto uninit; } - mpegl_load_functions(ctx->gl, vo->log); + mpegl_load_functions(&p->gl, ctx->log); - ctx->native_display_type = "x11"; - ctx->native_display = vo->x11->display; - return 0; + struct ra_gl_ctx_params params = { + .swap_buffers = mpegl_swap_buffers, + .native_display_type = "x11", + .native_display = vo->x11->display, + }; + + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto uninit; + + return true; uninit: mpegl_uninit(ctx); - return -1; + return false; } -static int mpegl_reconfig(struct MPGLContext *ctx) +static void resize(struct ra_ctx *ctx) { - vo_x11_config_vo_window(ctx->vo); - return 0; + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0); } -static int mpegl_control(struct MPGLContext *ctx, int *events, int request, - void *arg) +static bool mpegl_reconfig(struct ra_ctx *ctx) { - return vo_x11_control(ctx->vo, events, request, arg); + vo_x11_config_vo_window(ctx->vo); + resize(ctx); + return true; } -static void mpegl_swap_buffers(MPGLContext *ctx) +static int mpegl_control(struct ra_ctx *ctx, int *events, int request, + void *arg) { - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_surface); + int ret = vo_x11_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) + resize(ctx); + return ret; } -static void mpegl_wakeup(struct MPGLContext *ctx) +static void mpegl_wakeup(struct ra_ctx *ctx) { vo_x11_wakeup(ctx->vo); } -static void mpegl_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void mpegl_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_x11_wait_events(ctx->vo, until_time_us); } -const struct mpgl_driver mpgl_driver_x11egl = { +const struct ra_ctx_fns ra_ctx_x11_egl = { + .type = "opengl", .name = "x11egl", - .priv_size = sizeof(struct priv), - .init = mpegl_init, .reconfig = mpegl_reconfig, - .swap_buffers = mpegl_swap_buffers, .control = mpegl_control, .wakeup = mpegl_wakeup, .wait_events = mpegl_wait_events, + .init = mpegl_init, .uninit = mpegl_uninit, }; diff --git a/video/out/opengl/egl_helpers.c b/video/out/opengl/egl_helpers.c index ac152df06a..0033bf1e33 100644 --- a/video/out/opengl/egl_helpers.c +++ b/video/out/opengl/egl_helpers.c @@ -25,6 +25,7 @@ #include "egl_helpers.h" #include "common.h" +#include "utils.h" #include "context.h" #if HAVE_EGL_ANGLE @@ -43,41 +44,49 @@ #define EGL_OPENGL_ES3_BIT 0x00000040 #endif -// es_version = 0 
(desktop), 2/3 (ES major version) -static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, - int es_version, struct mpegl_opts *opts, +// es_version: 0 (core), 2 or 3 +static bool create_context(struct ra_ctx *ctx, EGLDisplay display, + int es_version, struct mpegl_cb cb, EGLContext *out_context, EGLConfig *out_config) { - int msgl = probing ? MSGL_V : MSGL_FATAL; - - EGLenum api = EGL_OPENGL_API; - EGLint rend = EGL_OPENGL_BIT; - const char *name = "Desktop OpenGL"; - if (es_version == 2) { + int msgl = ctx->opts.probing ? MSGL_V : MSGL_FATAL; + + EGLenum api; + EGLint rend; + const char *name; + + switch (es_version) { + case 0: + api = EGL_OPENGL_API; + rend = EGL_OPENGL_BIT; + name = "Desktop OpenGL"; + break; + case 2: api = EGL_OPENGL_ES_API; rend = EGL_OPENGL_ES2_BIT; - name = "GLES 2.0"; - } - if (es_version == 3) { + name = "GLES 2.x"; + break; + case 3: api = EGL_OPENGL_ES_API; rend = EGL_OPENGL_ES3_BIT; name = "GLES 3.x"; + break; + default: abort(); } - mp_msg(log, MSGL_V, "Trying to create %s context.\n", name); + MP_VERBOSE(ctx, "Trying to create %s context.\n", name); if (!eglBindAPI(api)) { - mp_msg(log, MSGL_V, "Could not bind API!\n"); + MP_VERBOSE(ctx, "Could not bind API!\n"); return false; } - EGLint attributes[] = { EGL_SURFACE_TYPE, EGL_WINDOW_BIT, EGL_RED_SIZE, 1, EGL_GREEN_SIZE, 1, EGL_BLUE_SIZE, 1, - EGL_ALPHA_SIZE, (opts->vo_flags & VOFLAG_ALPHA ) ? 1 : 0, + EGL_ALPHA_SIZE, ctx->opts.want_alpha ? 1 : 0, EGL_RENDERABLE_TYPE, rend, EGL_NONE }; @@ -92,29 +101,34 @@ static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, if (!num_configs) { talloc_free(configs); - mp_msg(log, msgl, "Could not choose EGLConfig!\n"); + MP_MSG(ctx, msgl, "Could not choose EGLConfig!\n"); return false; } int chosen = 0; - if (opts->refine_config) - chosen = opts->refine_config(opts->user_data, configs, num_configs); + if (cb.refine_config) + chosen = cb.refine_config(cb.user_data, configs, num_configs); EGLConfig config = configs[chosen]; talloc_free(configs); - EGLContext *ctx = NULL; + EGLContext *egl_ctx = NULL; if (es_version) { + if (!ra_gl_ctx_test_version(ctx, MPGL_VER(es_version, 0), true)) + return false; + EGLint attrs[] = { EGL_CONTEXT_CLIENT_VERSION, es_version, EGL_NONE }; - ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); + egl_ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); } else { for (int n = 0; mpgl_preferred_gl_versions[n]; n++) { int ver = mpgl_preferred_gl_versions[n]; + if (!ra_gl_ctx_test_version(ctx, ver, false)) + continue; EGLint attrs[] = { EGL_CONTEXT_MAJOR_VERSION, MPGL_VER_GET_MAJOR(ver), @@ -124,25 +138,25 @@ static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, EGL_NONE }; - ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); - if (ctx) + egl_ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); + if (egl_ctx) break; } - if (!ctx) { + if (!egl_ctx && ra_gl_ctx_test_version(ctx, 140, false)) { // Fallback for EGL 1.4 without EGL_KHR_create_context. 
EGLint attrs[] = { EGL_NONE }; - ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); + egl_ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); } } - if (!ctx) { - mp_msg(log, msgl, "Could not create EGL context!\n"); + if (!egl_ctx) { + MP_MSG(ctx, msgl, "Could not create EGL context!\n"); return false; } - *out_context = ctx; + *out_context = egl_ctx; *out_config = config; return true; } @@ -152,56 +166,36 @@ static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, // Create a context and return it and the config it was created with. If it // returns false, the out_* pointers are set to NULL. // vo_flags is a combination of VOFLAG_* values. -bool mpegl_create_context(EGLDisplay display, struct mp_log *log, int vo_flags, +bool mpegl_create_context(struct ra_ctx *ctx, EGLDisplay display, EGLContext *out_context, EGLConfig *out_config) { - return mpegl_create_context_opts(display, log, - &(struct mpegl_opts){.vo_flags = vo_flags}, out_context, out_config); + return mpegl_create_context_cb(ctx, display, (struct mpegl_cb){0}, + out_context, out_config); } // Create a context and return it and the config it was created with. If it // returns false, the out_* pointers are set to NULL. -bool mpegl_create_context_opts(EGLDisplay display, struct mp_log *log, - struct mpegl_opts *opts, - EGLContext *out_context, EGLConfig *out_config) +bool mpegl_create_context_cb(struct ra_ctx *ctx, EGLDisplay display, + struct mpegl_cb cb, EGLContext *out_context, + EGLConfig *out_config) { - assert(opts); - *out_context = NULL; *out_config = NULL; const char *version = eglQueryString(display, EGL_VERSION); const char *vendor = eglQueryString(display, EGL_VENDOR); const char *apis = eglQueryString(display, EGL_CLIENT_APIS); - mp_verbose(log, "EGL_VERSION=%s\nEGL_VENDOR=%s\nEGL_CLIENT_APIS=%s\n", + MP_VERBOSE(ctx, "EGL_VERSION=%s\nEGL_VENDOR=%s\nEGL_CLIENT_APIS=%s\n", STR_OR_ERR(version), STR_OR_ERR(vendor), STR_OR_ERR(apis)); - bool probing = opts->vo_flags & VOFLAG_PROBING; - int msgl = probing ? MSGL_V : MSGL_FATAL; - bool try_gles = !(opts->vo_flags & VOFLAG_NO_GLES); - - if (!(opts->vo_flags & VOFLAG_GLES)) { - // Desktop OpenGL - if (create_context(display, log, try_gles | probing, 0, opts, - out_context, out_config)) - return true; - } - - if (try_gles && !(opts->vo_flags & VOFLAG_GLES2)) { - // ES 3.x - if (create_context(display, log, true, 3, opts, - out_context, out_config)) - return true; - } - - if (try_gles) { - // ES 2.0 - if (create_context(display, log, probing, 2, opts, - out_context, out_config)) + int es[] = {0, 3, 2}; // preference order + for (int i = 0; i < MP_ARRAY_SIZE(es); i++) { + if (create_context(ctx, display, es[i], cb, out_context, out_config)) return true; } - mp_msg(log, msgl, "Could not create a GL context.\n"); + int msgl = ctx->opts.probing ? MSGL_V : MSGL_ERR; + MP_MSG(ctx, msgl, "Could not create a GL context.\n"); return false; } diff --git a/video/out/opengl/egl_helpers.h b/video/out/opengl/egl_helpers.h index 05f9dccb70..eaaf9d7a48 100644 --- a/video/out/opengl/egl_helpers.h +++ b/video/out/opengl/egl_helpers.h @@ -6,26 +6,23 @@ #include #include +#include "video/out/gpu/context.h" + struct mp_log; -bool mpegl_create_context(EGLDisplay display, struct mp_log *log, int vo_flags, +bool mpegl_create_context(struct ra_ctx *ctx, EGLDisplay display, EGLContext *out_context, EGLConfig *out_config); -struct mpegl_opts { - // combination of VOFLAG_* values. 
- int vo_flags; - - // for callbacks - void *user_data; - +struct mpegl_cb { // if set, pick the desired config from the given list and return its index // defaults to 0 (they are sorted by eglChooseConfig) int (*refine_config)(void *user_data, EGLConfig *configs, int num_configs); + void *user_data; }; -bool mpegl_create_context_opts(EGLDisplay display, struct mp_log *log, - struct mpegl_opts *opts, - EGLContext *out_context, EGLConfig *out_config); +bool mpegl_create_context_cb(struct ra_ctx *ctx, EGLDisplay display, + struct mpegl_cb cb, EGLContext *out_context, + EGLConfig *out_config); struct GL; void mpegl_load_functions(struct GL *gl, struct mp_log *log); diff --git a/video/out/opengl/formats.h b/video/out/opengl/formats.h index 3da6ede82a..f727a3b6ef 100644 --- a/video/out/opengl/formats.h +++ b/video/out/opengl/formats.h @@ -2,7 +2,6 @@ #define MPGL_FORMATS_H_ #include "common.h" -#include "ra.h" struct gl_format { const char *name; // symbolic name for user interaction/debugging diff --git a/video/out/opengl/gl_utils.c b/video/out/opengl/gl_utils.c deleted file mode 100644 index bce2dabe5d..0000000000 --- a/video/out/opengl/gl_utils.c +++ /dev/null @@ -1,291 +0,0 @@ -/* - * This file is part of mpv. - * Parts based on MPlayer code by Reimar Döffinger. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "osdep/io.h" - -#include "common/common.h" -#include "options/path.h" -#include "stream/stream.h" -#include "formats.h" -#include "ra_gl.h" -#include "gl_utils.h" - -// GLU has this as gluErrorString (we don't use GLU, as it is legacy-OpenGL) -static const char *gl_error_to_string(GLenum error) -{ - switch (error) { - case GL_INVALID_ENUM: return "INVALID_ENUM"; - case GL_INVALID_VALUE: return "INVALID_VALUE"; - case GL_INVALID_OPERATION: return "INVALID_OPERATION"; - case GL_INVALID_FRAMEBUFFER_OPERATION: return "INVALID_FRAMEBUFFER_OPERATION"; - case GL_OUT_OF_MEMORY: return "OUT_OF_MEMORY"; - default: return "unknown"; - } -} - -void gl_check_error(GL *gl, struct mp_log *log, const char *info) -{ - for (;;) { - GLenum error = gl->GetError(); - if (error == GL_NO_ERROR) - break; - mp_msg(log, MSGL_ERR, "%s: OpenGL error %s.\n", info, - gl_error_to_string(error)); - } -} - -static int get_alignment(int stride) -{ - if (stride % 8 == 0) - return 8; - if (stride % 4 == 0) - return 4; - if (stride % 2 == 0) - return 2; - return 1; -} - -// upload a texture, handling things like stride and slices -// target: texture target, usually GL_TEXTURE_2D -// format, type: texture parameters -// dataptr, stride: image data -// x, y, width, height: part of the image to upload -void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, - const void *dataptr, int stride, - int x, int y, int w, int h) -{ - int bpp = gl_bytes_per_pixel(format, type); - const uint8_t *data = dataptr; - int y_max = y + h; - if (w <= 0 || h <= 0 || !bpp) - return; - if (stride < 0) { - data += (h - 1) * stride; - stride = -stride; - } - gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(stride)); - int slice = h; - if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) { - // this is not always correct, but should work for MPlayer - gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride / bpp); - } else { - if (stride != bpp * w) - slice = 1; // very inefficient, but at least it works - } - for (; y + slice <= y_max; y += slice) { - gl->TexSubImage2D(target, 0, x, y, w, slice, format, type, data); - data += stride * slice; - } - if (y < y_max) - gl->TexSubImage2D(target, 0, x, y, w, y_max - y, format, type, data); - if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) - gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0); - gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); -} - -mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h) -{ - if (gl->es) - return NULL; // ES can't read from front buffer - mp_image_t *image = mp_image_alloc(IMGFMT_RGB24, w, h); - if (!image) - return NULL; - gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); - GLenum obj = fbo ? 
GL_COLOR_ATTACHMENT0 : GL_FRONT; - gl->PixelStorei(GL_PACK_ALIGNMENT, 1); - gl->ReadBuffer(obj); - //flip image while reading (and also avoid stride-related trouble) - for (int y = 0; y < h; y++) { - gl->ReadPixels(0, h - y - 1, w, 1, GL_RGB, GL_UNSIGNED_BYTE, - image->planes[0] + y * image->stride[0]); - } - gl->PixelStorei(GL_PACK_ALIGNMENT, 4); - gl->BindFramebuffer(GL_FRAMEBUFFER, 0); - return image; -} - -static void gl_vao_enable_attribs(struct gl_vao *vao) -{ - GL *gl = vao->gl; - - for (int n = 0; n < vao->num_entries; n++) { - const struct ra_renderpass_input *e = &vao->entries[n]; - GLenum type = 0; - bool normalized = false; - switch (e->type) { - case RA_VARTYPE_INT: - type = GL_INT; - break; - case RA_VARTYPE_FLOAT: - type = GL_FLOAT; - break; - case RA_VARTYPE_BYTE_UNORM: - type = GL_UNSIGNED_BYTE; - normalized = true; - break; - default: - abort(); - } - assert(e->dim_m == 1); - - gl->EnableVertexAttribArray(n); - gl->VertexAttribPointer(n, e->dim_v, type, normalized, - vao->stride, (void *)(intptr_t)e->offset); - } -} - -void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, - const struct ra_renderpass_input *entries, - int num_entries) -{ - assert(!vao->vao); - assert(!vao->buffer); - - *vao = (struct gl_vao){ - .gl = gl, - .stride = stride, - .entries = entries, - .num_entries = num_entries, - }; - - gl->GenBuffers(1, &vao->buffer); - - if (gl->BindVertexArray) { - gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - - gl->GenVertexArrays(1, &vao->vao); - gl->BindVertexArray(vao->vao); - gl_vao_enable_attribs(vao); - gl->BindVertexArray(0); - - gl->BindBuffer(GL_ARRAY_BUFFER, 0); - } -} - -void gl_vao_uninit(struct gl_vao *vao) -{ - GL *gl = vao->gl; - if (!gl) - return; - - if (gl->DeleteVertexArrays) - gl->DeleteVertexArrays(1, &vao->vao); - gl->DeleteBuffers(1, &vao->buffer); - - *vao = (struct gl_vao){0}; -} - -static void gl_vao_bind(struct gl_vao *vao) -{ - GL *gl = vao->gl; - - if (gl->BindVertexArray) { - gl->BindVertexArray(vao->vao); - } else { - gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - gl_vao_enable_attribs(vao); - gl->BindBuffer(GL_ARRAY_BUFFER, 0); - } -} - -static void gl_vao_unbind(struct gl_vao *vao) -{ - GL *gl = vao->gl; - - if (gl->BindVertexArray) { - gl->BindVertexArray(0); - } else { - for (int n = 0; n < vao->num_entries; n++) - gl->DisableVertexAttribArray(n); - } -} - -// Draw the vertex data (as described by the gl_vao_entry entries) in ptr -// to the screen. num is the number of vertexes. prim is usually GL_TRIANGLES. -// If ptr is NULL, then skip the upload, and use the data uploaded with the -// previous call. 
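/* A minimal caller-side sketch of the gl_vao helpers documented above; the
 * vertex layout ("my_vertex"), the entry table and the "verts" buffer are
 * assumptions made for illustration only (needs <stddef.h> for offsetof plus
 * the gl_utils.h/ra.h declarations). */

struct my_vertex {
    float position[2];
    float texcoord[2];
};

static const struct ra_renderpass_input my_vao_entries[] = {
    {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct my_vertex, position)},
    {"texcoord", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct my_vertex, texcoord)},
};

static void sketch_draw_quad(GL *gl, struct my_vertex verts[6])
{
    struct gl_vao vao = {0};
    gl_vao_init(&vao, gl, sizeof(struct my_vertex), my_vao_entries, 2);
    // Upload the 6 vertices and draw them as two triangles.
    gl_vao_draw_data(&vao, GL_TRIANGLES, verts, 6);
    gl_vao_uninit(&vao);
}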
-void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num) -{ - GL *gl = vao->gl; - - if (ptr) { - gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - gl->BufferData(GL_ARRAY_BUFFER, num * vao->stride, ptr, GL_STREAM_DRAW); - gl->BindBuffer(GL_ARRAY_BUFFER, 0); - } - - gl_vao_bind(vao); - - gl->DrawArrays(prim, 0, num); - - gl_vao_unbind(vao); -} - -static void GLAPIENTRY gl_debug_cb(GLenum source, GLenum type, GLuint id, - GLenum severity, GLsizei length, - const GLchar *message, const void *userParam) -{ - // keep in mind that the debug callback can be asynchronous - struct mp_log *log = (void *)userParam; - int level = MSGL_ERR; - switch (severity) { - case GL_DEBUG_SEVERITY_NOTIFICATION:level = MSGL_V; break; - case GL_DEBUG_SEVERITY_LOW: level = MSGL_INFO; break; - case GL_DEBUG_SEVERITY_MEDIUM: level = MSGL_WARN; break; - case GL_DEBUG_SEVERITY_HIGH: level = MSGL_ERR; break; - } - mp_msg(log, level, "GL: %s\n", message); -} - -void gl_set_debug_logger(GL *gl, struct mp_log *log) -{ - if (gl->DebugMessageCallback) - gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log); -} - -int gl_get_fb_depth(GL *gl, int fbo) -{ - if ((gl->es < 300 && !gl->version) || !(gl->mpgl_caps & MPGL_CAP_FB)) - return -1; - - gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); - - GLenum obj = gl->version ? GL_BACK_LEFT : GL_BACK; - if (fbo) - obj = GL_COLOR_ATTACHMENT0; - - GLint depth_g = -1; - - gl->GetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, obj, - GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &depth_g); - - gl->BindFramebuffer(GL_FRAMEBUFFER, 0); - - return depth_g > 0 ? depth_g : -1; -} diff --git a/video/out/opengl/gl_utils.h b/video/out/opengl/gl_utils.h deleted file mode 100644 index 306ee23f65..0000000000 --- a/video/out/opengl/gl_utils.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * This file is part of mpv. - * Parts based on MPlayer code by Reimar Döffinger. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#ifndef MP_GL_UTILS_ -#define MP_GL_UTILS_ - -#include - -#include "common.h" -#include "ra.h" - -struct mp_log; - -void gl_check_error(GL *gl, struct mp_log *log, const char *info); - -void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, - const void *dataptr, int stride, - int x, int y, int w, int h); - -mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h); - -struct gl_vao { - GL *gl; - GLuint vao; // the VAO object, or 0 if unsupported by driver - GLuint buffer; // GL_ARRAY_BUFFER used for the data - int stride; // size of each element (interleaved elements are assumed) - const struct ra_renderpass_input *entries; - int num_entries; -}; - -void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, - const struct ra_renderpass_input *entries, - int num_entries); -void gl_vao_uninit(struct gl_vao *vao); -void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num); - -void gl_set_debug_logger(GL *gl, struct mp_log *log); - -int gl_get_fb_depth(GL *gl, int fbo); - -#endif diff --git a/video/out/opengl/hwdec.c b/video/out/opengl/hwdec.c deleted file mode 100644 index 5fbc1aa4a9..0000000000 --- a/video/out/opengl/hwdec.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#include -#include - -#include "config.h" - -#include "common/common.h" -#include "common/msg.h" -#include "options/m_config.h" -#include "hwdec.h" - -extern const struct ra_hwdec_driver ra_hwdec_vaegl; -extern const struct ra_hwdec_driver ra_hwdec_vaglx; -extern const struct ra_hwdec_driver ra_hwdec_videotoolbox; -extern const struct ra_hwdec_driver ra_hwdec_vdpau; -extern const struct ra_hwdec_driver ra_hwdec_dxva2egl; -extern const struct ra_hwdec_driver ra_hwdec_d3d11egl; -extern const struct ra_hwdec_driver ra_hwdec_d3d11eglrgb; -extern const struct ra_hwdec_driver ra_hwdec_dxva2gldx; -extern const struct ra_hwdec_driver ra_hwdec_dxva2; -extern const struct ra_hwdec_driver ra_hwdec_cuda; -extern const struct ra_hwdec_driver ra_hwdec_rpi_overlay; - -static const struct ra_hwdec_driver *const mpgl_hwdec_drivers[] = { -#if HAVE_VAAPI_EGL - &ra_hwdec_vaegl, -#endif -#if HAVE_VAAPI_GLX - &ra_hwdec_vaglx, -#endif -#if HAVE_VDPAU_GL_X11 - &ra_hwdec_vdpau, -#endif -#if HAVE_VIDEOTOOLBOX_GL || HAVE_IOS_GL - &ra_hwdec_videotoolbox, -#endif -#if HAVE_D3D_HWACCEL - &ra_hwdec_d3d11egl, - &ra_hwdec_d3d11eglrgb, - #if HAVE_D3D9_HWACCEL - &ra_hwdec_dxva2egl, - #endif -#endif -#if HAVE_GL_DXINTEROP_D3D9 - &ra_hwdec_dxva2gldx, -#endif -#if HAVE_CUDA_HWACCEL - &ra_hwdec_cuda, -#endif -#if HAVE_RPI - &ra_hwdec_rpi_overlay, -#endif - NULL -}; - -static struct ra_hwdec *load_hwdec_driver(struct mp_log *log, struct ra *ra, - struct mpv_global *global, - struct mp_hwdec_devices *devs, - const struct ra_hwdec_driver *drv, - bool is_auto) -{ - struct ra_hwdec *hwdec = talloc(NULL, struct ra_hwdec); - *hwdec = (struct ra_hwdec) { - .driver = drv, - .log = mp_log_new(hwdec, log, drv->name), - .global = global, - .ra = ra, - .devs = devs, - .probing = is_auto, - .priv = talloc_zero_size(hwdec, drv->priv_size), - }; - mp_verbose(log, "Loading hwdec driver '%s'\n", drv->name); - if (hwdec->driver->init(hwdec) < 0) { - ra_hwdec_uninit(hwdec); - mp_verbose(log, "Loading failed.\n"); - return NULL; - } - return hwdec; -} - -struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - enum hwdec_type api) -{ - bool is_auto = HWDEC_IS_AUTO(api); - for (int n = 0; mpgl_hwdec_drivers[n]; n++) { - const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; - if ((is_auto || api == drv->api) && !drv->testing_only) { - struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, is_auto); - if (r) - return r; - } - } - return NULL; -} - -// Load by option name. 
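/* Illustrative caller-side sketch for the loader and mapper entry points in
 * this file; the wrapper name and its arguments are assumptions made for the
 * example, and cleanup via ra_hwdec_mapper_free()/ra_hwdec_uninit() is left
 * to the caller. */

static struct ra_tex *sketch_map_frame(struct mp_log *log, struct ra *ra,
                                       struct mpv_global *global,
                                       struct mp_hwdec_devices *devs,
                                       const char *name,
                                       struct mp_image *hw_image)
{
    struct ra_hwdec *hw = ra_hwdec_load(log, ra, global, devs, name);
    if (!hw || !ra_hwdec_test_format(hw, hw_image->imgfmt))
        return NULL;

    struct ra_hwdec_mapper *mapper = ra_hwdec_mapper_create(hw, &hw_image->params);
    if (!mapper || ra_hwdec_mapper_map(mapper, hw_image) < 0)
        return NULL;

    return mapper->tex[0]; // first plane; further planes are in tex[1..3]
}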
-struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - const char *name) -{ - int g_hwdec_api; - mp_read_option_raw(g, "hwdec", &m_option_type_choice, &g_hwdec_api); - if (!name || !name[0]) - name = m_opt_choice_str(mp_hwdec_names, g_hwdec_api); - - int api_id = HWDEC_NONE; - for (int n = 0; mp_hwdec_names[n].name; n++) { - if (name && strcmp(mp_hwdec_names[n].name, name) == 0) - api_id = mp_hwdec_names[n].value; - } - - for (int n = 0; mpgl_hwdec_drivers[n]; n++) { - const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; - if (name && strcmp(drv->name, name) == 0) { - struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, false); - if (r) - return r; - } - } - - return ra_hwdec_load_api(log, ra, g, devs, api_id); -} - -int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param) -{ - bool help = bstr_equals0(param, "help"); - if (help) - mp_info(log, "Available hwdecs:\n"); - for (int n = 0; mpgl_hwdec_drivers[n]; n++) { - const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; - const char *api_name = m_opt_choice_str(mp_hwdec_names, drv->api); - if (help) { - mp_info(log, " %s [%s]\n", drv->name, api_name); - } else if (bstr_equals0(param, drv->name) || - bstr_equals0(param, api_name)) - { - return 1; - } - } - if (help) { - mp_info(log, " auto (loads best)\n" - " (other --hwdec values)\n" - "Setting an empty string means use --hwdec.\n"); - return M_OPT_EXIT; - } - if (!param.len) - return 1; // "" is treated specially - for (int n = 0; mp_hwdec_names[n].name; n++) { - if (bstr_equals0(param, mp_hwdec_names[n].name)) - return 1; - } - mp_fatal(log, "No hwdec backend named '%.*s' found!\n", BSTR_P(param)); - return M_OPT_INVALID; -} - -void ra_hwdec_uninit(struct ra_hwdec *hwdec) -{ - if (hwdec) - hwdec->driver->uninit(hwdec); - talloc_free(hwdec); -} - -bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt) -{ - for (int n = 0; hwdec->driver->imgfmts[n]; n++) { - if (hwdec->driver->imgfmts[n] == imgfmt) - return true; - } - return false; -} - -struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec, - struct mp_image_params *params) -{ - assert(ra_hwdec_test_format(hwdec, params->imgfmt)); - - struct ra_hwdec_mapper *mapper = talloc_ptrtype(NULL, mapper); - *mapper = (struct ra_hwdec_mapper){ - .owner = hwdec, - .driver = hwdec->driver->mapper, - .log = hwdec->log, - .ra = hwdec->ra, - .priv = talloc_zero_size(mapper, hwdec->driver->mapper->priv_size), - .src_params = *params, - .dst_params = *params, - }; - if (mapper->driver->init(mapper) < 0) - ra_hwdec_mapper_free(&mapper); - return mapper; -} - -void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper) -{ - struct ra_hwdec_mapper *p = *mapper; - if (p) { - ra_hwdec_mapper_unmap(p); - p->driver->uninit(p); - talloc_free(p); - } - *mapper = NULL; -} - -void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper) -{ - if (mapper->driver->unmap) - mapper->driver->unmap(mapper); - mp_image_unrefp(&mapper->src); -} - -int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img) -{ - ra_hwdec_mapper_unmap(mapper); - mp_image_setrefp(&mapper->src, img); - if (mapper->driver->map(mapper) < 0) { - ra_hwdec_mapper_unmap(mapper); - return -1; - } - return 0; -} diff --git a/video/out/opengl/hwdec.h b/video/out/opengl/hwdec.h deleted file mode 100644 index 20bbaae9eb..0000000000 --- a/video/out/opengl/hwdec.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef MPGL_HWDEC_H_ -#define 
MPGL_HWDEC_H_ - -#include "video/mp_image.h" -#include "ra.h" -#include "video/hwdec.h" - -struct ra_hwdec { - const struct ra_hwdec_driver *driver; - struct mp_log *log; - struct mpv_global *global; - struct ra *ra; - struct mp_hwdec_devices *devs; - // GLSL extensions required to sample textures from this. - const char **glsl_extensions; - // For free use by hwdec driver - void *priv; - // For working around the vdpau vs. vaapi mess. - bool probing; - // Used in overlay mode only. - float overlay_colorkey[4]; -}; - -struct ra_hwdec_mapper { - const struct ra_hwdec_mapper_driver *driver; - struct mp_log *log; - struct ra *ra; - void *priv; - struct ra_hwdec *owner; - // Input frame parameters. (Set before init(), immutable.) - struct mp_image_params src_params; - // Output frame parameters (represents the format the textures return). Must - // be set by init(), immutable afterwards, - struct mp_image_params dst_params; - - // The currently mapped source image (or the image about to be mapped in - // ->map()). NULL if unmapped. The mapper can also clear this reference if - // the mapped textures contain a full copy. - struct mp_image *src; - - // The mapped textures and metadata about them. These fields change if a - // new frame is mapped (or unmapped), but otherwise remain constant. - // The common code won't mess with these, so you can e.g. set them in the - // .init() callback. - struct ra_tex *tex[4]; - bool vdpau_fields; -}; - -// This can be used to map frames of a specific hw format as GL textures. -struct ra_hwdec_mapper_driver { - // Used to create ra_hwdec_mapper.priv. - size_t priv_size; - - // Init the mapper implementation. At this point, the field src_params, - // fns, devs, priv are initialized. - int (*init)(struct ra_hwdec_mapper *mapper); - // Destroy the mapper. unmap is called before this. - void (*uninit)(struct ra_hwdec_mapper *mapper); - - // Map mapper->src as texture, and set mapper->frame to textures using it. - // It is expected that that the textures remain valid until the next unmap - // or uninit call. - // The function is allowed to unref mapper->src if it's not needed (i.e. - // this function creates a copy). - // The underlying format can change, so you might need to do some form - // of change detection. You also must reject unsupported formats with an - // error. - // On error, returns negative value on error and remains unmapped. - int (*map)(struct ra_hwdec_mapper *mapper); - // Unmap the frame. Does nothing if already unmapped. Optional. - void (*unmap)(struct ra_hwdec_mapper *mapper); -}; - -struct ra_hwdec_driver { - // Name of the interop backend. This is used for informational purposes only. - const char *name; - // Used to create ra_hwdec.priv. - size_t priv_size; - // Used to explicitly request a specific API. - enum hwdec_type api; - // One of the hardware surface IMGFMT_ that must be passed to map_image later. - // Terminated with a 0 entry. (Extend the array size as needed.) - const int imgfmts[3]; - // Dosn't load this unless requested by name. - bool testing_only; - - // Create the hwdec device. It must add it to hw->devs, if applicable. - int (*init)(struct ra_hwdec *hw); - void (*uninit)(struct ra_hwdec *hw); - - // This will be used to create a ra_hwdec_mapper from ra_hwdec. - const struct ra_hwdec_mapper_driver *mapper; - - // The following function provides an alternative API. Each ra_hwdec_driver - // must have either provide a mapper or overlay_frame (not both or none), and - // if overlay_frame is set, it operates in overlay mode. 
In this mode, - // OSD etc. is rendered via OpenGL, but the video is rendered as a separate - // layer below it. - // Non-overlay mode is strictly preferred, so try not to use overlay mode. - // Set the given frame as overlay, replacing the previous one. This can also - // just change the position of the overlay. - // hw_image==src==dst==NULL is passed to clear the overlay. - int (*overlay_frame)(struct ra_hwdec *hw, struct mp_image *hw_image, - struct mp_rect *src, struct mp_rect *dst, bool newframe); -}; - -struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - enum hwdec_type api); - -struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - const char *name); - -int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param); - -void ra_hwdec_uninit(struct ra_hwdec *hwdec); - -bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt); - -struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec, - struct mp_image_params *params); -void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper); -void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper); -int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img); - -#endif diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c index d40bafee24..d9c4c199f1 100644 --- a/video/out/opengl/hwdec_cuda.c +++ b/video/out/opengl/hwdec_cuda.c @@ -32,11 +32,10 @@ #include #include +#include "video/out/gpu/hwdec.h" #include "formats.h" -#include "hwdec.h" #include "options/m_config.h" #include "ra_gl.h" -#include "video.h" struct priv_owner { struct mp_hwdec_ctx hwctx; diff --git a/video/out/opengl/hwdec_ios.m b/video/out/opengl/hwdec_ios.m index 8e020ded63..71b205b583 100644 --- a/video/out/opengl/hwdec_ios.m +++ b/video/out/opengl/hwdec_ios.m @@ -27,10 +27,10 @@ #include "config.h" +#include "video/out/gpu/hwdec.h" #include "video/mp_image_pool.h" #include "video/vt.h" #include "ra_gl.h" -#include "hwdec.h" struct priv_owner { struct mp_hwdec_ctx hwctx; diff --git a/video/out/opengl/hwdec_osx.c b/video/out/opengl/hwdec_osx.c index 348a5e19c5..cfd5f52e7b 100644 --- a/video/out/opengl/hwdec_osx.c +++ b/video/out/opengl/hwdec_osx.c @@ -29,9 +29,9 @@ #include "config.h" #include "video/mp_image_pool.h" +#include "video/out/gpu/hwdec.h" #include "video/vt.h" #include "ra_gl.h" -#include "hwdec.h" struct priv_owner { struct mp_hwdec_ctx hwctx; diff --git a/video/out/opengl/hwdec_rpi.c b/video/out/opengl/hwdec_rpi.c index 6f39c3e330..ea8312a179 100644 --- a/video/out/opengl/hwdec_rpi.c +++ b/video/out/opengl/hwdec_rpi.c @@ -33,8 +33,8 @@ #include "common/common.h" #include "common/msg.h" #include "video/mp_image.h" +#include "video/out/gpu/hwdec.h" -#include "hwdec.h" #include "common.h" #include "ra_gl.h" diff --git a/video/out/opengl/hwdec_vaegl.c b/video/out/opengl/hwdec_vaegl.c index a0e3222cfc..6078222bd5 100644 --- a/video/out/opengl/hwdec_vaegl.c +++ b/video/out/opengl/hwdec_vaegl.c @@ -30,9 +30,9 @@ #include "config.h" -#include "hwdec.h" -#include "video/vaapi.h" +#include "video/out/gpu/hwdec.h" #include "video/mp_image_pool.h" +#include "video/vaapi.h" #include "common.h" #include "ra_gl.h" diff --git a/video/out/opengl/hwdec_vaglx.c b/video/out/opengl/hwdec_vaglx.c index 8db15c4468..d5bc0b6ee7 100644 --- a/video/out/opengl/hwdec_vaglx.c +++ b/video/out/opengl/hwdec_vaglx.c @@ -25,10 +25,11 @@ #include 
#include "video/out/x11_common.h" -#include "ra_gl.h" -#include "hwdec.h" +#include "video/out/gpu/hwdec.h" #include "video/vaapi.h" +#include "ra_gl.h" + struct priv_owner { struct mp_vaapi_ctx *ctx; VADisplay *display; diff --git a/video/out/opengl/hwdec_vdpau.c b/video/out/opengl/hwdec_vdpau.c index d733650328..e0618e425e 100644 --- a/video/out/opengl/hwdec_vdpau.c +++ b/video/out/opengl/hwdec_vdpau.c @@ -20,7 +20,7 @@ #include -#include "hwdec.h" +#include "video/out/gpu/hwdec.h" #include "ra_gl.h" #include "video/vdpau.h" #include "video/vdpau_mixer.h" diff --git a/video/out/opengl/lcms.c b/video/out/opengl/lcms.c deleted file mode 100644 index 8747ae6aa6..0000000000 --- a/video/out/opengl/lcms.c +++ /dev/null @@ -1,531 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include -#include - -#include "mpv_talloc.h" - -#include "config.h" - -#include "stream/stream.h" -#include "common/common.h" -#include "misc/bstr.h" -#include "common/msg.h" -#include "options/m_option.h" -#include "options/path.h" -#include "video/csputils.h" -#include "lcms.h" - -#include "osdep/io.h" - -#if HAVE_LCMS2 - -#include -#include -#include - -struct gl_lcms { - void *icc_data; - size_t icc_size; - struct AVBufferRef *vid_profile; - char *current_profile; - bool using_memory_profile; - bool changed; - enum mp_csp_prim current_prim; - enum mp_csp_trc current_trc; - - struct mp_log *log; - struct mpv_global *global; - struct mp_icc_opts *opts; -}; - -static bool parse_3dlut_size(const char *arg, int *p1, int *p2, int *p3) -{ - if (sscanf(arg, "%dx%dx%d", p1, p2, p3) != 3) - return false; - for (int n = 0; n < 3; n++) { - int s = ((int[]) { *p1, *p2, *p3 })[n]; - if (s < 2 || s > 512) - return false; - } - return true; -} - -static int validate_3dlut_size_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param) -{ - int p1, p2, p3; - char s[20]; - snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); - return parse_3dlut_size(s, &p1, &p2, &p3); -} - -#define OPT_BASE_STRUCT struct mp_icc_opts -const struct m_sub_options mp_icc_conf = { - .opts = (const m_option_t[]) { - OPT_FLAG("use-embedded-icc-profile", use_embedded, 0), - OPT_STRING("icc-profile", profile, M_OPT_FILE), - OPT_FLAG("icc-profile-auto", profile_auto, 0), - OPT_STRING("icc-cache-dir", cache_dir, M_OPT_FILE), - OPT_INT("icc-intent", intent, 0), - OPT_INTRANGE("icc-contrast", contrast, 0, 0, 100000), - OPT_STRING_VALIDATE("icc-3dlut-size", size_str, 0, validate_3dlut_size_opt), - - OPT_REPLACED("3dlut-size", "icc-3dlut-size"), - OPT_REMOVED("icc-cache", "see icc-cache-dir"), - {0} - }, - .size = sizeof(struct mp_icc_opts), - .defaults = &(const struct mp_icc_opts) { - .size_str = "64x64x64", - .intent = INTENT_RELATIVE_COLORIMETRIC, - .use_embedded = true, - }, -}; - -static void lcms2_error_handler(cmsContext ctx, cmsUInt32Number code, - const char *msg) -{ - struct gl_lcms *p = cmsGetContextUserData(ctx); - 
MP_ERR(p, "lcms2: %s\n", msg); -} - -static void load_profile(struct gl_lcms *p) -{ - talloc_free(p->icc_data); - p->icc_data = NULL; - p->icc_size = 0; - p->using_memory_profile = false; - talloc_free(p->current_profile); - p->current_profile = NULL; - - if (!p->opts->profile || !p->opts->profile[0]) - return; - - char *fname = mp_get_user_path(NULL, p->global, p->opts->profile); - MP_VERBOSE(p, "Opening ICC profile '%s'\n", fname); - struct bstr iccdata = stream_read_file(fname, p, p->global, - 100000000); // 100 MB - talloc_free(fname); - if (!iccdata.len) - return; - - talloc_free(p->icc_data); - - p->icc_data = iccdata.start; - p->icc_size = iccdata.len; - p->current_profile = talloc_strdup(p, p->opts->profile); -} - -static void gl_lcms_destructor(void *ptr) -{ - struct gl_lcms *p = ptr; - av_buffer_unref(&p->vid_profile); -} - -struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, - struct mpv_global *global, - struct mp_icc_opts *opts) -{ - struct gl_lcms *p = talloc_ptrtype(talloc_ctx, p); - talloc_set_destructor(p, gl_lcms_destructor); - *p = (struct gl_lcms) { - .global = global, - .log = log, - .opts = opts, - }; - gl_lcms_update_options(p); - return p; -} - -void gl_lcms_update_options(struct gl_lcms *p) -{ - if ((p->using_memory_profile && !p->opts->profile_auto) || - !bstr_equals(bstr0(p->opts->profile), bstr0(p->current_profile))) - { - load_profile(p); - } - - p->changed = true; // probably -} - -// Warning: profile.start must point to a ta allocation, and the function -// takes over ownership. -// Returns whether the internal profile was changed. -bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) -{ - if (!p->opts->profile_auto || (p->opts->profile && p->opts->profile[0])) { - talloc_free(profile.start); - return false; - } - - if (p->using_memory_profile && - p->icc_data && profile.start && - profile.len == p->icc_size && - memcmp(profile.start, p->icc_data, p->icc_size) == 0) - { - talloc_free(profile.start); - return false; - } - - p->changed = true; - p->using_memory_profile = true; - - talloc_free(p->icc_data); - - p->icc_data = talloc_steal(p, profile.start); - p->icc_size = profile.len; - - return true; -} - -// Guards against NULL and uses bstr_equals to short-circuit some special cases -static bool vid_profile_eq(struct AVBufferRef *a, struct AVBufferRef *b) -{ - if (!a || !b) - return a == b; - - return bstr_equals((struct bstr){ a->data, a->size }, - (struct bstr){ b->data, b->size }); -} - -// Return whether the profile or config has changed since the last time it was -// retrieved. If it has changed, gl_lcms_get_lut3d() should be called. -bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, - enum mp_csp_trc trc, struct AVBufferRef *vid_profile) -{ - if (p->changed || p->current_prim != prim || p->current_trc != trc) - return true; - - return !vid_profile_eq(p->vid_profile, vid_profile); -} - -// Whether a profile is set. (gl_lcms_get_lut3d() is expected to return a lut, -// but it could still fail due to runtime errors, such as invalid icc data.) 
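/* Caller-side sketch of the usual 3D LUT flow; "icc", "params" and
 * "vid_profile" stand for the renderer's state and the color fields of
 * mp_image_params are assumed here, not defined in this file. */

static struct lut3d *sketch_update_icc_lut(struct gl_lcms *icc,
                                           struct mp_image_params *params,
                                           struct AVBufferRef *vid_profile)
{
    if (!gl_lcms_has_profile(icc))
        return NULL;
    if (!gl_lcms_has_changed(icc, params->color.primaries,
                             params->color.gamma, vid_profile))
        return NULL; // previously generated LUT is still valid

    struct lut3d *lut = NULL;
    if (!gl_lcms_get_lut3d(icc, &lut, params->color.primaries,
                           params->color.gamma, vid_profile))
        return NULL;
    // Upload lut->data as a lut->size[0] x size[1] x size[2] 3D texture,
    // then talloc_free() it when no longer needed.
    return lut;
}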
-bool gl_lcms_has_profile(struct gl_lcms *p) -{ - return p->icc_size > 0; -} - -static cmsHPROFILE get_vid_profile(struct gl_lcms *p, cmsContext cms, - cmsHPROFILE disp_profile, - enum mp_csp_prim prim, enum mp_csp_trc trc) -{ - if (p->opts->use_embedded && p->vid_profile) { - // Try using the embedded ICC profile - cmsHPROFILE prof = cmsOpenProfileFromMemTHR(cms, p->vid_profile->data, - p->vid_profile->size); - if (prof) { - MP_VERBOSE(p, "Successfully opened embedded ICC profile\n"); - return prof; - } - - // Otherwise, warn the user and generate the profile as usual - MP_WARN(p, "Video contained an invalid ICC profile! Ignoring..\n"); - } - - // The input profile for the transformation is dependent on the video - // primaries and transfer characteristics - struct mp_csp_primaries csp = mp_get_csp_primaries(prim); - cmsCIExyY wp_xyY = {csp.white.x, csp.white.y, 1.0}; - cmsCIExyYTRIPLE prim_xyY = { - .Red = {csp.red.x, csp.red.y, 1.0}, - .Green = {csp.green.x, csp.green.y, 1.0}, - .Blue = {csp.blue.x, csp.blue.y, 1.0}, - }; - - cmsToneCurve *tonecurve[3] = {0}; - switch (trc) { - case MP_CSP_TRC_LINEAR: tonecurve[0] = cmsBuildGamma(cms, 1.0); break; - case MP_CSP_TRC_GAMMA18: tonecurve[0] = cmsBuildGamma(cms, 1.8); break; - case MP_CSP_TRC_GAMMA22: tonecurve[0] = cmsBuildGamma(cms, 2.2); break; - case MP_CSP_TRC_GAMMA28: tonecurve[0] = cmsBuildGamma(cms, 2.8); break; - - case MP_CSP_TRC_SRGB: - // Values copied from Little-CMS - tonecurve[0] = cmsBuildParametricToneCurve(cms, 4, - (double[5]){2.40, 1/1.055, 0.055/1.055, 1/12.92, 0.04045}); - break; - - case MP_CSP_TRC_PRO_PHOTO: - tonecurve[0] = cmsBuildParametricToneCurve(cms, 4, - (double[5]){1.8, 1.0, 0.0, 1/16.0, 0.03125}); - break; - - case MP_CSP_TRC_BT_1886: { - // To build an appropriate BT.1886 transformation we need access to - // the display's black point, so we LittleCMS' detection function. - // Relative colorimetric is used since we want to approximate the - // BT.1886 to the target device's actual black point even in e.g. - // perceptual mode - const int intent = MP_INTENT_RELATIVE_COLORIMETRIC; - cmsCIEXYZ bp_XYZ; - if (!cmsDetectBlackPoint(&bp_XYZ, disp_profile, intent, 0)) - return false; - - // Map this XYZ value back into the (linear) source space - cmsToneCurve *linear = cmsBuildGamma(cms, 1.0); - cmsHPROFILE rev_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY, - (cmsToneCurve*[3]){linear, linear, linear}); - cmsHPROFILE xyz_profile = cmsCreateXYZProfile(); - cmsHTRANSFORM xyz2src = cmsCreateTransformTHR(cms, - xyz_profile, TYPE_XYZ_DBL, rev_profile, TYPE_RGB_DBL, - intent, 0); - cmsFreeToneCurve(linear); - cmsCloseProfile(rev_profile); - cmsCloseProfile(xyz_profile); - if (!xyz2src) - return false; - - double src_black[3]; - cmsDoTransform(xyz2src, &bp_XYZ, src_black, 1); - cmsDeleteTransform(xyz2src); - - // Contrast limiting - if (p->opts->contrast > 0) { - for (int i = 0; i < 3; i++) - src_black[i] = MPMAX(src_black[i], 1.0 / p->opts->contrast); - } - - // Built-in contrast failsafe - double contrast = 3.0 / (src_black[0] + src_black[1] + src_black[2]); - if (contrast > 100000) { - MP_WARN(p, "ICC profile detected contrast very high (>100000)," - " falling back to contrast 1000 for sanity. 
Set the" - " icc-contrast option to silence this warning.\n"); - src_black[0] = src_black[1] = src_black[2] = 1.0 / 1000; - } - - // Build the parametric BT.1886 transfer curve, one per channel - for (int i = 0; i < 3; i++) { - const double gamma = 2.40; - double binv = pow(src_black[i], 1.0/gamma); - tonecurve[i] = cmsBuildParametricToneCurve(cms, 6, - (double[4]){gamma, 1.0 - binv, binv, 0.0}); - } - break; - } - - default: - abort(); - } - - if (!tonecurve[0]) - return false; - - if (!tonecurve[1]) tonecurve[1] = tonecurve[0]; - if (!tonecurve[2]) tonecurve[2] = tonecurve[0]; - - cmsHPROFILE *vid_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY, - tonecurve); - - if (tonecurve[2] != tonecurve[0]) cmsFreeToneCurve(tonecurve[2]); - if (tonecurve[1] != tonecurve[0]) cmsFreeToneCurve(tonecurve[1]); - cmsFreeToneCurve(tonecurve[0]); - - return vid_profile; -} - -bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d, - enum mp_csp_prim prim, enum mp_csp_trc trc, - struct AVBufferRef *vid_profile) -{ - int s_r, s_g, s_b; - bool result = false; - - p->changed = false; - p->current_prim = prim; - p->current_trc = trc; - - // We need to hold on to a reference to the video's ICC profile for as long - // as we still need to perform equality checking, so generate a new - // reference here - av_buffer_unref(&p->vid_profile); - if (vid_profile) { - MP_VERBOSE(p, "Got an embedded ICC profile.\n"); - p->vid_profile = av_buffer_ref(vid_profile); - if (!p->vid_profile) - abort(); - } - - if (!parse_3dlut_size(p->opts->size_str, &s_r, &s_g, &s_b)) - return false; - - if (!gl_lcms_has_profile(p)) - return false; - - void *tmp = talloc_new(NULL); - uint16_t *output = talloc_array(tmp, uint16_t, s_r * s_g * s_b * 4); - struct lut3d *lut = NULL; - cmsContext cms = NULL; - - char *cache_file = NULL; - if (p->opts->cache_dir && p->opts->cache_dir[0]) { - // Gamma is included in the header to help uniquely identify it, - // because we may change the parameter in the future or make it - // customizable, same for the primaries. 
- char *cache_info = talloc_asprintf(tmp, - "ver=1.4, intent=%d, size=%dx%dx%d, prim=%d, trc=%d, " - "contrast=%d\n", - p->opts->intent, s_r, s_g, s_b, prim, trc, p->opts->contrast); - - uint8_t hash[32]; - struct AVSHA *sha = av_sha_alloc(); - if (!sha) - abort(); - av_sha_init(sha, 256); - av_sha_update(sha, cache_info, strlen(cache_info)); - if (vid_profile) - av_sha_update(sha, vid_profile->data, vid_profile->size); - av_sha_update(sha, p->icc_data, p->icc_size); - av_sha_final(sha, hash); - av_free(sha); - - char *cache_dir = mp_get_user_path(tmp, p->global, p->opts->cache_dir); - cache_file = talloc_strdup(tmp, ""); - for (int i = 0; i < sizeof(hash); i++) - cache_file = talloc_asprintf_append(cache_file, "%02X", hash[i]); - cache_file = mp_path_join(tmp, cache_dir, cache_file); - - mp_mkdirp(cache_dir); - } - - // check cache - if (cache_file && stat(cache_file, &(struct stat){0}) == 0) { - MP_VERBOSE(p, "Opening 3D LUT cache in file '%s'.\n", cache_file); - struct bstr cachedata = stream_read_file(cache_file, tmp, p->global, - 1000000000); // 1 GB - if (cachedata.len == talloc_get_size(output)) { - memcpy(output, cachedata.start, cachedata.len); - goto done; - } else { - MP_WARN(p, "3D LUT cache invalid!\n"); - } - } - - cms = cmsCreateContext(NULL, p); - if (!cms) - goto error_exit; - cmsSetLogErrorHandlerTHR(cms, lcms2_error_handler); - - cmsHPROFILE profile = - cmsOpenProfileFromMemTHR(cms, p->icc_data, p->icc_size); - if (!profile) - goto error_exit; - - cmsHPROFILE vid_hprofile = get_vid_profile(p, cms, profile, prim, trc); - if (!vid_hprofile) { - cmsCloseProfile(profile); - goto error_exit; - } - - cmsHTRANSFORM trafo = cmsCreateTransformTHR(cms, vid_hprofile, TYPE_RGB_16, - profile, TYPE_RGBA_16, - p->opts->intent, - cmsFLAGS_HIGHRESPRECALC | - cmsFLAGS_BLACKPOINTCOMPENSATION); - cmsCloseProfile(profile); - cmsCloseProfile(vid_hprofile); - - if (!trafo) - goto error_exit; - - // transform a (s_r)x(s_g)x(s_b) cube, with 3 components per channel - uint16_t *input = talloc_array(tmp, uint16_t, s_r * 3); - for (int b = 0; b < s_b; b++) { - for (int g = 0; g < s_g; g++) { - for (int r = 0; r < s_r; r++) { - input[r * 3 + 0] = r * 65535 / (s_r - 1); - input[r * 3 + 1] = g * 65535 / (s_g - 1); - input[r * 3 + 2] = b * 65535 / (s_b - 1); - } - size_t base = (b * s_r * s_g + g * s_r) * 4; - cmsDoTransform(trafo, input, output + base, s_r); - } - } - - cmsDeleteTransform(trafo); - - if (cache_file) { - FILE *out = fopen(cache_file, "wb"); - if (out) { - fwrite(output, talloc_get_size(output), 1, out); - fclose(out); - } - } - -done: ; - - lut = talloc_ptrtype(NULL, lut); - *lut = (struct lut3d) { - .data = talloc_steal(lut, output), - .size = {s_r, s_g, s_b}, - }; - - *result_lut3d = lut; - result = true; - -error_exit: - - if (cms) - cmsDeleteContext(cms); - - if (!lut) - MP_FATAL(p, "Error loading ICC profile.\n"); - - talloc_free(tmp); - return result; -} - -#else /* HAVE_LCMS2 */ - -const struct m_sub_options mp_icc_conf = { - .opts = (const m_option_t[]) { {0} }, - .size = sizeof(struct mp_icc_opts), - .defaults = &(const struct mp_icc_opts) {0}, -}; - -struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, - struct mpv_global *global, - struct mp_icc_opts *opts) -{ - return (struct gl_lcms *) talloc_new(talloc_ctx); -} - -void gl_lcms_update_options(struct gl_lcms *p) { } -bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) {return false;} - -bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, - enum mp_csp_trc trc, struct AVBufferRef 
*vid_profile) -{ - return false; -} - -bool gl_lcms_has_profile(struct gl_lcms *p) -{ - return false; -} - -bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d, - enum mp_csp_prim prim, enum mp_csp_trc trc, - struct AVBufferRef *vid_profile) -{ - return false; -} - -#endif diff --git a/video/out/opengl/lcms.h b/video/out/opengl/lcms.h deleted file mode 100644 index 35bbd61fe0..0000000000 --- a/video/out/opengl/lcms.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef MP_GL_LCMS_H -#define MP_GL_LCMS_H - -#include -#include -#include "misc/bstr.h" -#include "video/csputils.h" -#include - -extern const struct m_sub_options mp_icc_conf; - -struct mp_icc_opts { - int use_embedded; - char *profile; - int profile_auto; - char *cache_dir; - char *size_str; - int intent; - int contrast; -}; - -struct lut3d { - uint16_t *data; - int size[3]; -}; - -struct mp_log; -struct mpv_global; -struct gl_lcms; - -struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log, - struct mpv_global *global, - struct mp_icc_opts *opts); -void gl_lcms_update_options(struct gl_lcms *p); -bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile); -bool gl_lcms_has_profile(struct gl_lcms *p); -bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **, - enum mp_csp_prim prim, enum mp_csp_trc trc, - struct AVBufferRef *vid_profile); -bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim, - enum mp_csp_trc trc, struct AVBufferRef *vid_profile); - -#endif diff --git a/video/out/opengl/osd.c b/video/out/opengl/osd.c deleted file mode 100644 index f7c325d1db..0000000000 --- a/video/out/opengl/osd.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#include -#include -#include - -#include - -#include "common/common.h" -#include "common/msg.h" -#include "video/csputils.h" -#include "video/mp_image.h" -#include "osd.h" - -#define GLSL(x) gl_sc_add(sc, #x "\n"); - -// glBlendFuncSeparate() arguments -static const int blend_factors[SUBBITMAP_COUNT][4] = { - [SUBBITMAP_LIBASS] = {RA_BLEND_SRC_ALPHA, RA_BLEND_ONE_MINUS_SRC_ALPHA, - RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA}, - [SUBBITMAP_RGBA] = {RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA, - RA_BLEND_ONE, RA_BLEND_ONE_MINUS_SRC_ALPHA}, -}; - -struct vertex { - float position[2]; - float texcoord[2]; - uint8_t ass_color[4]; -}; - -static const struct ra_renderpass_input vertex_vao[] = { - {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, - {"texcoord" , RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord)}, - {"ass_color", RA_VARTYPE_BYTE_UNORM, 4, 1, offsetof(struct vertex, ass_color)}, - {0} -}; - -struct mpgl_osd_part { - enum sub_bitmap_format format; - int change_id; - struct ra_tex *texture; - int w, h; - int num_subparts; - int prev_num_subparts; - struct sub_bitmap *subparts; - int num_vertices; - struct vertex *vertices; -}; - -struct mpgl_osd { - struct mp_log *log; - struct osd_state *osd; - struct ra *ra; - struct mpgl_osd_part *parts[MAX_OSD_PARTS]; - const struct ra_format *fmt_table[SUBBITMAP_COUNT]; - bool formats[SUBBITMAP_COUNT]; - bool change_flag; // for reporting to API user only - // temporary - int stereo_mode; - struct mp_osd_res osd_res; - void *scratch; -}; - -struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, - struct osd_state *osd) -{ - struct mpgl_osd *ctx = talloc_ptrtype(NULL, ctx); - *ctx = (struct mpgl_osd) { - .log = log, - .osd = osd, - .ra = ra, - .change_flag = true, - .scratch = talloc_zero_size(ctx, 1), - }; - - ctx->fmt_table[SUBBITMAP_LIBASS] = ra_find_unorm_format(ra, 1, 1); - ctx->fmt_table[SUBBITMAP_RGBA] = ra_find_unorm_format(ra, 1, 4); - - for (int n = 0; n < MAX_OSD_PARTS; n++) - ctx->parts[n] = talloc_zero(ctx, struct mpgl_osd_part); - - for (int n = 0; n < SUBBITMAP_COUNT; n++) - ctx->formats[n] = !!ctx->fmt_table[n]; - - return ctx; -} - -void mpgl_osd_destroy(struct mpgl_osd *ctx) -{ - if (!ctx) - return; - - for (int n = 0; n < MAX_OSD_PARTS; n++) { - struct mpgl_osd_part *p = ctx->parts[n]; - ra_tex_free(ctx->ra, &p->texture); - } - talloc_free(ctx); -} - -static int next_pow2(int v) -{ - for (int x = 0; x < 30; x++) { - if ((1 << x) >= v) - return 1 << x; - } - return INT_MAX; -} - -static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd, - struct sub_bitmaps *imgs) -{ - struct ra *ra = ctx->ra; - bool ok = false; - - assert(imgs->packed); - - int req_w = next_pow2(imgs->packed_w); - int req_h = next_pow2(imgs->packed_h); - - const struct ra_format *fmt = ctx->fmt_table[imgs->format]; - assert(fmt); - - if (!osd->texture || req_w > osd->w || req_h > osd->h || - osd->format != imgs->format) - { - ra_tex_free(ra, &osd->texture); - - osd->format = imgs->format; - osd->w = FFMAX(32, req_w); - osd->h = FFMAX(32, req_h); - - MP_VERBOSE(ctx, "Reallocating OSD texture to %dx%d.\n", osd->w, osd->h); - - if (osd->w > ra->max_texture_wh || osd->h > ra->max_texture_wh) { - MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum " - "supported size %dx%d.\n", ra->max_texture_wh, - ra->max_texture_wh); - goto done; - } - - struct ra_tex_params params = { - .dimensions = 2, - .w = osd->w, - .h = osd->h, - .d = 1, - .format = fmt, - .render_src = true, - .src_linear = true, - 
.host_mutable = true, - }; - osd->texture = ra_tex_create(ra, ¶ms); - if (!osd->texture) - goto done; - } - - struct ra_tex_upload_params params = { - .tex = osd->texture, - .src = imgs->packed->planes[0], - .invalidate = true, - .rc = &(struct mp_rect){0, 0, imgs->packed_w, imgs->packed_h}, - .stride = imgs->packed->stride[0], - }; - - ok = ra->fns->tex_upload(ra, ¶ms); - -done: - return ok; -} - -static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs) -{ - struct mpgl_osd *ctx = pctx; - - if (imgs->num_parts == 0 || !ctx->formats[imgs->format]) - return; - - struct mpgl_osd_part *osd = ctx->parts[imgs->render_index]; - - bool ok = true; - if (imgs->change_id != osd->change_id) { - if (!upload_osd(ctx, osd, imgs)) - ok = false; - - osd->change_id = imgs->change_id; - ctx->change_flag = true; - } - osd->num_subparts = ok ? imgs->num_parts : 0; - - MP_TARRAY_GROW(osd, osd->subparts, osd->num_subparts); - memcpy(osd->subparts, imgs->parts, - osd->num_subparts * sizeof(osd->subparts[0])); -} - -bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, - struct gl_shader_cache *sc) -{ - assert(index >= 0 && index < MAX_OSD_PARTS); - struct mpgl_osd_part *part = ctx->parts[index]; - - enum sub_bitmap_format fmt = part->format; - if (!fmt || !part->num_subparts) - return false; - - gl_sc_uniform_texture(sc, "osdtex", part->texture); - switch (fmt) { - case SUBBITMAP_RGBA: { - GLSL(color = texture(osdtex, texcoord).bgra;) - break; - } - case SUBBITMAP_LIBASS: { - GLSL(color = - vec4(ass_color.rgb, ass_color.a * texture(osdtex, texcoord).r);) - break; - } - default: - abort(); - } - - gl_sc_set_vertex_format(sc, vertex_vao, sizeof(struct vertex)); - - return true; -} - -static void write_quad(struct vertex *va, struct gl_transform t, - float x0, float y0, float x1, float y1, - float tx0, float ty0, float tx1, float ty1, - float tex_w, float tex_h, const uint8_t color[4]) -{ - gl_transform_vec(t, &x0, &y0); - gl_transform_vec(t, &x1, &y1); - -#define COLOR_INIT {color[0], color[1], color[2], color[3]} - va[0] = (struct vertex){ {x0, y0}, {tx0 / tex_w, ty0 / tex_h}, COLOR_INIT }; - va[1] = (struct vertex){ {x0, y1}, {tx0 / tex_w, ty1 / tex_h}, COLOR_INIT }; - va[2] = (struct vertex){ {x1, y0}, {tx1 / tex_w, ty0 / tex_h}, COLOR_INIT }; - va[3] = (struct vertex){ {x1, y1}, {tx1 / tex_w, ty1 / tex_h}, COLOR_INIT }; - va[4] = va[2]; - va[5] = va[1]; -#undef COLOR_INIT -} - -static void generate_verts(struct mpgl_osd_part *part, struct gl_transform t) -{ - int num_vertices = part->num_subparts * 6; - MP_TARRAY_GROW(part, part->vertices, part->num_vertices + num_vertices); - - for (int n = 0; n < part->num_subparts; n++) { - struct sub_bitmap *b = &part->subparts[n]; - struct vertex *va = &part->vertices[part->num_vertices]; - - // NOTE: the blend color is used with SUBBITMAP_LIBASS only, so it - // doesn't matter that we upload garbage for the other formats - uint32_t c = b->libass.color; - uint8_t color[4] = { c >> 24, (c >> 16) & 0xff, - (c >> 8) & 0xff, 255 - (c & 0xff) }; - - write_quad(&va[n * 6], t, - b->x, b->y, b->x + b->dw, b->y + b->dh, - b->src_x, b->src_y, b->src_x + b->w, b->src_y + b->h, - part->w, part->h, color); - } - - part->num_vertices += num_vertices; -} - -// number of screen divisions per axis (x=0, y=1) for the current 3D mode -static void get_3d_side_by_side(int stereo_mode, int div[2]) -{ - div[0] = div[1] = 1; - switch (stereo_mode) { - case MP_STEREO3D_SBS2L: - case MP_STEREO3D_SBS2R: div[0] = 2; break; - case MP_STEREO3D_AB2R: - case MP_STEREO3D_AB2L: div[1] = 2; 
break; - } -} - -void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, - struct gl_shader_cache *sc, struct fbodst target) -{ - struct mpgl_osd_part *part = ctx->parts[index]; - - int div[2]; - get_3d_side_by_side(ctx->stereo_mode, div); - - part->num_vertices = 0; - - for (int x = 0; x < div[0]; x++) { - for (int y = 0; y < div[1]; y++) { - struct gl_transform t; - gl_transform_ortho_fbodst(&t, target); - - float a_x = ctx->osd_res.w * x; - float a_y = ctx->osd_res.h * y; - t.t[0] += a_x * t.m[0][0] + a_y * t.m[1][0]; - t.t[1] += a_x * t.m[0][1] + a_y * t.m[1][1]; - - generate_verts(part, t); - } - } - - const int *factors = &blend_factors[part->format][0]; - gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]); - - gl_sc_dispatch_draw(sc, target.tex, part->vertices, part->num_vertices); -} - -static void set_res(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode) -{ - int div[2]; - get_3d_side_by_side(stereo_mode, div); - - res.w /= div[0]; - res.h /= div[1]; - ctx->osd_res = res; -} - -void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts, - int stereo_mode, int draw_flags) -{ - for (int n = 0; n < MAX_OSD_PARTS; n++) - ctx->parts[n]->num_subparts = 0; - - set_res(ctx, res, stereo_mode); - - osd_draw(ctx->osd, ctx->osd_res, pts, draw_flags, ctx->formats, gen_osd_cb, ctx); - ctx->stereo_mode = stereo_mode; - - // Parts going away does not necessarily result in gen_osd_cb() being called - // (not even with num_parts==0), so check this separately. - for (int n = 0; n < MAX_OSD_PARTS; n++) { - struct mpgl_osd_part *part = ctx->parts[n]; - if (part->num_subparts != part->prev_num_subparts) - ctx->change_flag = true; - part->prev_num_subparts = part->num_subparts; - } -} - -// See osd_resize() for remarks. This function is an optional optimization too. 
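/* Sketch of the usual per-frame call pattern around these entry points;
 * "osd", "sc", "screen", "res" and "pts" are assumed to come from the
 * renderer. */

static void sketch_draw_osd_pass(struct mpgl_osd *osd, struct gl_shader_cache *sc,
                                 struct fbodst screen, struct mp_osd_res res,
                                 double pts)
{
    // On window size changes mpgl_osd_resize(osd, res, 0) can be called as an
    // optimization; mpgl_osd_generate() re-applies the resolution anyway.
    mpgl_osd_generate(osd, res, pts, 0, 0);

    for (int n = 0; n < MAX_OSD_PARTS; n++) {
        if (!mpgl_osd_draw_prepare(osd, n, sc))
            continue; // nothing to draw for this OSD part
        mpgl_osd_draw_finish(osd, n, sc, screen);
    }
}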
-void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode) -{ - set_res(ctx, res, stereo_mode); - osd_resize(ctx->osd, ctx->osd_res); -} - -bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res, - double pts) -{ - ctx->change_flag = false; - mpgl_osd_generate(ctx, *res, pts, 0, 0); - return ctx->change_flag; -} diff --git a/video/out/opengl/osd.h b/video/out/opengl/osd.h deleted file mode 100644 index 6c2b886de3..0000000000 --- a/video/out/opengl/osd.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef MPLAYER_GL_OSD_H -#define MPLAYER_GL_OSD_H - -#include -#include - -#include "utils.h" -#include "shader_cache.h" -#include "sub/osd.h" - -struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, - struct osd_state *osd); -void mpgl_osd_destroy(struct mpgl_osd *ctx); - -void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts, - int stereo_mode, int draw_flags); -void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode); -bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, - struct gl_shader_cache *sc); -void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, - struct gl_shader_cache *sc, struct fbodst target); -bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res, - double pts); - -#endif diff --git a/video/out/opengl/ra.c b/video/out/opengl/ra.c deleted file mode 100644 index ef1de54d1a..0000000000 --- a/video/out/opengl/ra.c +++ /dev/null @@ -1,327 +0,0 @@ -#include "common/common.h" -#include "common/msg.h" -#include "video/img_format.h" - -#include "ra.h" - -struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params) -{ - return ra->fns->tex_create(ra, params); -} - -void ra_tex_free(struct ra *ra, struct ra_tex **tex) -{ - if (*tex) - ra->fns->tex_destroy(ra, *tex); - *tex = NULL; -} - -struct ra_buf *ra_buf_create(struct ra *ra, const struct ra_buf_params *params) -{ - return ra->fns->buf_create(ra, params); -} - -void ra_buf_free(struct ra *ra, struct ra_buf **buf) -{ - if (*buf) - ra->fns->buf_destroy(ra, *buf); - *buf = NULL; -} - -void ra_free(struct ra **ra) -{ - if (*ra) - (*ra)->fns->destroy(*ra); - talloc_free(*ra); - *ra = NULL; -} - -size_t ra_vartype_size(enum ra_vartype type) -{ - switch (type) { - case RA_VARTYPE_INT: return sizeof(int); - case RA_VARTYPE_FLOAT: return sizeof(float); - case RA_VARTYPE_BYTE_UNORM: return 1; - default: return 0; - } -} - -struct ra_layout ra_renderpass_input_layout(struct ra_renderpass_input *input) -{ - size_t el_size = ra_vartype_size(input->type); - if (!el_size) - return (struct ra_layout){0}; - - // host data is always tightly packed - return (struct ra_layout) { - .align = 1, - .stride = el_size * input->dim_v, - .size = el_size * input->dim_v * input->dim_m, - }; -} - -static struct ra_renderpass_input *dup_inputs(void *ta_parent, - const struct ra_renderpass_input *inputs, int num_inputs) -{ - struct ra_renderpass_input *res = - talloc_memdup(ta_parent, (void *)inputs, num_inputs * sizeof(inputs[0])); - for (int n = 0; n < num_inputs; n++) - res[n].name = talloc_strdup(res, res[n].name); - return res; -} - -// Return a newly allocated deep-copy of params. 
-struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, - const struct ra_renderpass_params *params) -{ - struct ra_renderpass_params *res = talloc_ptrtype(ta_parent, res); - *res = *params; - res->inputs = dup_inputs(res, res->inputs, res->num_inputs); - res->vertex_attribs = - dup_inputs(res, res->vertex_attribs, res->num_vertex_attribs); - res->cached_program = bstrdup(res, res->cached_program); - res->vertex_shader = talloc_strdup(res, res->vertex_shader); - res->frag_shader = talloc_strdup(res, res->frag_shader); - res->compute_shader = talloc_strdup(res, res->compute_shader); - return res; -}; - - -// Return whether this is a tightly packed format with no external padding and -// with the same bit size/depth in all components, and the shader returns -// components in the same order as in memory. -static bool ra_format_is_regular(const struct ra_format *fmt) -{ - if (!fmt->pixel_size || !fmt->num_components || !fmt->ordered) - return false; - for (int n = 1; n < fmt->num_components; n++) { - if (fmt->component_size[n] != fmt->component_size[0] || - fmt->component_depth[n] != fmt->component_depth[0]) - return false; - } - if (fmt->component_size[0] * fmt->num_components != fmt->pixel_size * 8) - return false; - return true; -} - -// Return a regular filterable format using RA_CTYPE_UNORM. -const struct ra_format *ra_find_unorm_format(struct ra *ra, - int bytes_per_component, - int n_components) -{ - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (fmt->ctype == RA_CTYPE_UNORM && fmt->num_components == n_components && - fmt->pixel_size == bytes_per_component * n_components && - fmt->component_depth[0] == bytes_per_component * 8 && - fmt->linear_filter && ra_format_is_regular(fmt)) - return fmt; - } - return NULL; -} - -// Return a regular format using RA_CTYPE_UINT. -const struct ra_format *ra_find_uint_format(struct ra *ra, - int bytes_per_component, - int n_components) -{ - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (fmt->ctype == RA_CTYPE_UINT && fmt->num_components == n_components && - fmt->pixel_size == bytes_per_component * n_components && - fmt->component_depth[0] == bytes_per_component * 8 && - ra_format_is_regular(fmt)) - return fmt; - } - return NULL; -} - -// Find a float format of any precision that matches the C type of the same -// size for upload. -// May drop bits from the mantissa (such as selecting float16 even if -// bytes_per_component == 32); prefers possibly faster formats first. -static const struct ra_format *ra_find_float_format(struct ra *ra, - int bytes_per_component, - int n_components) -{ - // Assumes ra_format are ordered by performance. - // The >=16 check is to avoid catching fringe formats. - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (fmt->ctype == RA_CTYPE_FLOAT && fmt->num_components == n_components && - fmt->pixel_size == bytes_per_component * n_components && - fmt->component_depth[0] >= 16 && - fmt->linear_filter && ra_format_is_regular(fmt)) - return fmt; - } - return NULL; -} - -// Return a filterable regular format that uses at least float16 internally, and -// uses a normal C float for transfer on the CPU side. (This is just so we don't -// need 32->16 bit conversion on CPU, which would be messy.) 
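// As an illustration (hypothetical caller; the `ra` pointer is assumed and a
// NULL result has to be handled by falling back or disabling the feature),
// the format lookup helpers are typically used like:
//
//   const struct ra_format *rgba8 = ra_find_unorm_format(ra, 1, 4);
//   const struct ra_format *lutfmt = ra_find_float16_format(ra, 4);
//   if (!rgba8 || !lutfmt)
//       return false;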
-const struct ra_format *ra_find_float16_format(struct ra *ra, int n_components) -{ - return ra_find_float_format(ra, sizeof(float), n_components); -} - -const struct ra_format *ra_find_named_format(struct ra *ra, const char *name) -{ - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (strcmp(fmt->name, name) == 0) - return fmt; - } - return NULL; -} - -// Like ra_find_unorm_format(), but if no fixed point format is available, -// return an unsigned integer format. -static const struct ra_format *find_plane_format(struct ra *ra, int bytes, - int n_channels, - enum mp_component_type ctype) -{ - switch (ctype) { - case MP_COMPONENT_TYPE_UINT: { - const struct ra_format *f = ra_find_unorm_format(ra, bytes, n_channels); - if (f) - return f; - return ra_find_uint_format(ra, bytes, n_channels); - } - case MP_COMPONENT_TYPE_FLOAT: - return ra_find_float_format(ra, bytes, n_channels); - default: return NULL; - } -} - -// Put a mapping of imgfmt to texture formats into *out. Basically it selects -// the correct texture formats needed to represent an imgfmt in a shader, with -// textures using the same memory organization as on the CPU. -// Each plane is represented by a texture, and each texture has a RGBA -// component order. out->components describes the meaning of them. -// May return integer formats for >8 bit formats, if the driver has no -// normalized 16 bit formats. -// Returns false (and *out is not touched) if no format found. -bool ra_get_imgfmt_desc(struct ra *ra, int imgfmt, struct ra_imgfmt_desc *out) -{ - struct ra_imgfmt_desc res = {0}; - - struct mp_regular_imgfmt regfmt; - if (mp_get_regular_imgfmt(&regfmt, imgfmt)) { - enum ra_ctype ctype = RA_CTYPE_UNKNOWN; - res.num_planes = regfmt.num_planes; - res.component_bits = regfmt.component_size * 8; - res.component_pad = regfmt.component_pad; - for (int n = 0; n < regfmt.num_planes; n++) { - struct mp_regular_imgfmt_plane *plane = &regfmt.planes[n]; - res.planes[n] = find_plane_format(ra, regfmt.component_size, - plane->num_components, - regfmt.component_type); - if (!res.planes[n]) - return false; - for (int i = 0; i < plane->num_components; i++) - res.components[n][i] = plane->components[i]; - // Dropping LSBs when shifting will lead to dropped MSBs. - if (res.component_bits > res.planes[n]->component_depth[0] && - res.component_pad < 0) - return false; - // Renderer restriction, but actually an unwanted corner case. - if (ctype != RA_CTYPE_UNKNOWN && ctype != res.planes[n]->ctype) - return false; - ctype = res.planes[n]->ctype; - } - res.chroma_w = regfmt.chroma_w; - res.chroma_h = regfmt.chroma_h; - goto supported; - } - - for (int n = 0; n < ra->num_formats; n++) { - if (imgfmt && ra->formats[n]->special_imgfmt == imgfmt) { - res = *ra->formats[n]->special_imgfmt_desc; - goto supported; - } - } - - // Unsupported format - return false; - -supported: - - *out = res; - return true; -} - -void ra_dump_tex_formats(struct ra *ra, int msgl) -{ - if (!mp_msg_test(ra->log, msgl)) - return; - MP_MSG(ra, msgl, "Texture formats:\n"); - MP_MSG(ra, msgl, " NAME COMP*TYPE SIZE DEPTH PER COMP.\n"); - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - const char *ctype = "unknown"; - switch (fmt->ctype) { - case RA_CTYPE_UNORM: ctype = "unorm"; break; - case RA_CTYPE_UINT: ctype = "uint "; break; - case RA_CTYPE_FLOAT: ctype = "float"; break; - } - char cl[40] = ""; - for (int i = 0; i < fmt->num_components; i++) { - mp_snprintf_cat(cl, sizeof(cl), "%s%d", i ?
" " : "", - fmt->component_size[i]); - if (fmt->component_size[i] != fmt->component_depth[i]) - mp_snprintf_cat(cl, sizeof(cl), "/%d", fmt->component_depth[i]); - } - MP_MSG(ra, msgl, " %-10s %d*%s %3dB %s %s %s {%s}\n", fmt->name, - fmt->num_components, ctype, fmt->pixel_size, - fmt->luminance_alpha ? "LA" : " ", - fmt->linear_filter ? "LF" : " ", - fmt->renderable ? "CR" : " ", cl); - } - MP_MSG(ra, msgl, " LA = LUMINANCE_ALPHA hack format\n"); - MP_MSG(ra, msgl, " LF = linear filterable\n"); - MP_MSG(ra, msgl, " CR = can be used for render targets\n"); -} - -void ra_dump_imgfmt_desc(struct ra *ra, const struct ra_imgfmt_desc *desc, - int msgl) -{ - char pl[80] = ""; - char pf[80] = ""; - for (int n = 0; n < desc->num_planes; n++) { - if (n > 0) { - mp_snprintf_cat(pl, sizeof(pl), "/"); - mp_snprintf_cat(pf, sizeof(pf), "/"); - } - char t[5] = {0}; - for (int i = 0; i < 4; i++) - t[i] = "_rgba"[desc->components[n][i]]; - for (int i = 3; i > 0 && t[i] == '_'; i--) - t[i] = '\0'; - mp_snprintf_cat(pl, sizeof(pl), "%s", t); - mp_snprintf_cat(pf, sizeof(pf), "%s", desc->planes[n]->name); - } - MP_MSG(ra, msgl, "%d planes %dx%d %d/%d [%s] (%s)\n", - desc->num_planes, desc->chroma_w, desc->chroma_h, - desc->component_bits, desc->component_pad, pf, pl); -} - -void ra_dump_img_formats(struct ra *ra, int msgl) -{ - if (!mp_msg_test(ra->log, msgl)) - return; - MP_MSG(ra, msgl, "Image formats:\n"); - for (int imgfmt = IMGFMT_START; imgfmt < IMGFMT_END; imgfmt++) { - const char *name = mp_imgfmt_to_name(imgfmt); - if (strcmp(name, "unknown") == 0) - continue; - MP_MSG(ra, msgl, " %s", name); - struct ra_imgfmt_desc desc; - if (ra_get_imgfmt_desc(ra, imgfmt, &desc)) { - MP_MSG(ra, msgl, " => "); - ra_dump_imgfmt_desc(ra, &desc, msgl); - } else { - MP_MSG(ra, msgl, "\n"); - } - } -} diff --git a/video/out/opengl/ra.h b/video/out/opengl/ra.h deleted file mode 100644 index ae7fb9aea7..0000000000 --- a/video/out/opengl/ra.h +++ /dev/null @@ -1,491 +0,0 @@ -#pragma once - -#include "common/common.h" -#include "misc/bstr.h" - -// Handle for a rendering API backend. -struct ra { - struct ra_fns *fns; - void *priv; - - int glsl_version; // GLSL version (e.g. 300 => 3.0) - bool glsl_es; // use ES dialect - bool glsl_vulkan; // use vulkan dialect - - struct mp_log *log; - - // RA_CAP_* bit field. The RA backend must set supported features at init - // time. - uint64_t caps; - - // Maximum supported width and height of a 2D texture. Set by the RA backend - // at init time. - int max_texture_wh; - - // Maximum shared memory for compute shaders. Set by the RA backend at init - // time. - size_t max_shmem; - - // Set of supported texture formats. Must be added by RA backend at init time. - // If there are equivalent formats with different caveats, the preferred - // formats should have a lower index. (E.g. GLES3 should put rg8 before la.) - struct ra_format **formats; - int num_formats; - - // Accelerate texture uploads via an extra PBO even when - // RA_CAP_DIRECT_UPLOAD is supported. This is basically only relevant for - // OpenGL. Set by the RA user. 
- bool use_pbo; -}; - -enum { - RA_CAP_TEX_1D = 1 << 0, // supports 1D textures (as shader inputs) - RA_CAP_TEX_3D = 1 << 1, // supports 3D textures (as shader inputs) - RA_CAP_BLIT = 1 << 2, // supports ra_fns.blit - RA_CAP_COMPUTE = 1 << 3, // supports compute shaders - RA_CAP_DIRECT_UPLOAD = 1 << 4, // supports tex_upload without ra_buf - RA_CAP_BUF_RO = 1 << 5, // supports RA_VARTYPE_BUF_RO - RA_CAP_BUF_RW = 1 << 6, // supports RA_VARTYPE_BUF_RW - RA_CAP_NESTED_ARRAY = 1 << 7, // supports nested arrays - RA_CAP_SHARED_BINDING = 1 << 8, // sampler/image/buffer namespaces are disjoint - RA_CAP_GLOBAL_UNIFORM = 1 << 9, // supports using "naked" uniforms (not UBO) -}; - -enum ra_ctype { - RA_CTYPE_UNKNOWN = 0, // also used for inconsistent multi-component formats - RA_CTYPE_UNORM, // unsigned normalized integer (fixed point) formats - RA_CTYPE_UINT, // full integer formats - RA_CTYPE_FLOAT, // float formats (signed, any bit size) -}; - -// All formats must be useable as texture formats. All formats must be byte -// aligned (all pixels start and end on a byte boundary), at least as far CPU -// transfers are concerned. -struct ra_format { - // All fields are read-only after creation. - const char *name; // symbolic name for user interaction/debugging - void *priv; - enum ra_ctype ctype; // data type of each component - bool ordered; // components are sequential in memory, and returned - // by the shader in memory order (the shader can - // return arbitrary values for unused components) - int num_components; // component count, 0 if not applicable, max. 4 - int component_size[4]; // in bits, all entries 0 if not applicable - int component_depth[4]; // bits in use for each component, 0 if not applicable - // (_must_ be set if component_size[] includes padding, - // and the real procession as seen by shader is lower) - int pixel_size; // in bytes, total pixel size (0 if opaque) - bool luminance_alpha; // pre-GL_ARB_texture_rg hack for 2 component textures - // if this is set, shader must use .ra instead of .rg - // only applies to 2-component textures - bool linear_filter; // linear filtering available from shader - bool renderable; // can be used for render targets - - // If not 0, the format represents some sort of packed fringe format, whose - // shader representation is given by the special_imgfmt_desc pointer. - int special_imgfmt; - const struct ra_imgfmt_desc *special_imgfmt_desc; -}; - -struct ra_tex_params { - int dimensions; // 1-3 for 1D-3D textures - // Size of the texture. 1D textures require h=d=1, 2D textures require d=1. - int w, h, d; - const struct ra_format *format; - bool render_src; // must be useable as source texture in a shader - bool render_dst; // must be useable as target texture in a shader - bool storage_dst; // must be usable as a storage image (RA_VARTYPE_IMG_W) - bool blit_src; // must be usable as a blit source - bool blit_dst; // must be usable as a blit destination - bool host_mutable; // texture may be updated with tex_upload - // When used as render source texture. - bool src_linear; // if false, use nearest sampling (whether this can - // be true depends on ra_format.linear_filter) - bool src_repeat; // if false, clamp texture coordinates to edge - // if true, repeat texture coordinates - bool non_normalized; // hack for GL_TEXTURE_RECTANGLE OSX idiocy - // always set to false, except in OSX code - bool external_oes; // hack for GL_TEXTURE_EXTERNAL_OES idiocy - // If non-NULL, the texture will be created with these contents. 
Using - // this does *not* require setting host_mutable. Otherwise, the initial - // data is undefined. - void *initial_data; -}; - -// Conflates the following typical GPU API concepts: -// - texture itself -// - sampler state -// - staging buffers for texture upload -// - framebuffer objects -// - wrappers for swapchain framebuffers -// - synchronization needed for upload/rendering/etc. -struct ra_tex { - // All fields are read-only after creation. - struct ra_tex_params params; - void *priv; -}; - -struct ra_tex_upload_params { - struct ra_tex *tex; // Texture to upload to - bool invalidate; // Discard pre-existing data not in the region uploaded - // Uploading from buffer: - struct ra_buf *buf; // Buffer to upload from (mutually exclusive with `src`) - size_t buf_offset; // Start of data within buffer (bytes) - // Uploading directly: (Note: If RA_CAP_DIRECT_UPLOAD is not set, then this - // will be internally translated to a tex_upload buffer by the RA) - const void *src; // Address of data - // For 2D textures only: - struct mp_rect *rc; // Region to upload. NULL means entire image - ptrdiff_t stride; // The size of a horizontal line in bytes (*not* texels!) -}; - -// Buffer type hint. Setting this may result in more or less efficient -// operation, although it shouldn't technically prohibit anything -enum ra_buf_type { - RA_BUF_TYPE_INVALID, - RA_BUF_TYPE_TEX_UPLOAD, // texture upload buffer (pixel buffer object) - RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW - RA_BUF_TYPE_UNIFORM, // uniform buffer (UBO), for RA_VARTYPE_BUF_RO -}; - -struct ra_buf_params { - enum ra_buf_type type; - size_t size; - bool host_mapped; // create a read-writable persistent mapping (ra_buf.data) - bool host_mutable; // contents may be updated via buf_update() - // If non-NULL, the buffer will be created with these contents. Otherwise, - // the initial data is undefined. - void *initial_data; -}; - -// A generic buffer, which can be used for many purposes (texture upload, -// storage buffer, uniform buffer, etc.) -struct ra_buf { - // All fields are read-only after creation. - struct ra_buf_params params; - void *data; // for persistently mapped buffers, points to the first byte - void *priv; -}; - -// Type of a shader uniform variable, or a vertex attribute. In all cases, -// vectors are matrices are done by having more than 1 value. -enum ra_vartype { - RA_VARTYPE_INVALID, - RA_VARTYPE_INT, // C: int, GLSL: int, ivec* - RA_VARTYPE_FLOAT, // C: float, GLSL: float, vec*, mat* - RA_VARTYPE_TEX, // C: ra_tex*, GLSL: various sampler types - // ra_tex.params.render_src must be true - RA_VARTYPE_IMG_W, // C: ra_tex*, GLSL: various image types - // write-only (W) image for compute shaders - // ra_tex.params.storage_dst must be true - RA_VARTYPE_BYTE_UNORM, // C: uint8_t, GLSL: int, vec* (vertex data only) - RA_VARTYPE_BUF_RO, // C: ra_buf*, GLSL: uniform buffer block - // buf type must be RA_BUF_TYPE_UNIFORM - RA_VARTYPE_BUF_RW, // C: ra_buf*, GLSL: shader storage buffer block - // buf type must be RA_BUF_TYPE_SHADER_STORAGE - RA_VARTYPE_COUNT -}; - -// Returns the host size of a ra_vartype, or 0 for abstract vartypes (e.g. tex) -size_t ra_vartype_size(enum ra_vartype type); - -// Represents a uniform, texture input parameter, and similar things. -struct ra_renderpass_input { - const char *name; // name as used in the shader - enum ra_vartype type; - // The total number of values is given by dim_v * dim_m. 
- int dim_v; // vector dimension (1 for non-vector and non-matrix) - int dim_m; // additional matrix dimension (dim_v x dim_m) - // Vertex data: byte offset of the attribute into the vertex struct - size_t offset; - // RA_VARTYPE_TEX: texture unit - // RA_VARTYPE_IMG_W: image unit - // RA_VARTYPE_BUF_* buffer binding point - // Other uniforms: unused - // If RA_CAP_SHARED_BINDING is set, these may only be unique per input type. - // Otherwise, these must be unique for all input values. - int binding; -}; - -// Represents the layout requirements of an input value -struct ra_layout { - size_t align; // the alignment requirements (always a power of two) - size_t stride; // the delta between two rows of an array/matrix - size_t size; // the total size of the input -}; - -// Returns the host layout of a render pass input. Returns {0} for renderpass -// inputs without a corresponding host representation (e.g. textures/buffers) -struct ra_layout ra_renderpass_input_layout(struct ra_renderpass_input *input); - -enum ra_blend { - RA_BLEND_ZERO, - RA_BLEND_ONE, - RA_BLEND_SRC_ALPHA, - RA_BLEND_ONE_MINUS_SRC_ALPHA, -}; - -enum ra_renderpass_type { - RA_RENDERPASS_TYPE_INVALID, - RA_RENDERPASS_TYPE_RASTER, // vertex+fragment shader - RA_RENDERPASS_TYPE_COMPUTE, // compute shader -}; - -// Static part of a rendering pass. It conflates the following: -// - compiled shader and its list of uniforms -// - vertex attributes and its shader mappings -// - blending parameters -// (For Vulkan, this would be shader module + pipeline state.) -// Upon creation, the values of dynamic values such as uniform contents (whose -// initial values are not provided here) are required to be 0. -struct ra_renderpass_params { - enum ra_renderpass_type type; - - // Uniforms, including texture/sampler inputs. - struct ra_renderpass_input *inputs; - int num_inputs; - - // Highly implementation-specific byte array storing a compiled version - // of the program. Can be used to speed up shader compilation. A backend - // xan read this in renderpass_create, or set this on the newly created - // ra_renderpass params field. - bstr cached_program; - - // --- type==RA_RENDERPASS_TYPE_RASTER only - - // Describes the format of the vertex data. When using ra.glsl_vulkan, - // the order of this array must match the vertex attribute locations. - struct ra_renderpass_input *vertex_attribs; - int num_vertex_attribs; - int vertex_stride; - - // Format of the target texture - const struct ra_format *target_format; - - // Shader text, in GLSL. (Yes, you need a GLSL compiler.) - // These are complete shaders, including prelude and declarations. - const char *vertex_shader; - const char *frag_shader; - - // Target blending mode. If enable_blend is false, the blend_ fields can - // be ignored. - bool enable_blend; - enum ra_blend blend_src_rgb; - enum ra_blend blend_dst_rgb; - enum ra_blend blend_src_alpha; - enum ra_blend blend_dst_alpha; - - // --- type==RA_RENDERPASS_TYPE_COMPUTE only - - // Shader text, like vertex_shader/frag_shader. 
- const char *compute_shader; -}; - -struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, - const struct ra_renderpass_params *params); - -// Conflates the following typical GPU API concepts: -// - various kinds of shaders -// - rendering pipelines -// - descriptor sets, uniforms, other bindings -// - all synchronization necessary -// - the current values of all uniforms (this one makes it relatively stateful -// from an API perspective) -struct ra_renderpass { - // All fields are read-only after creation. - struct ra_renderpass_params params; - void *priv; -}; - -// An input value (see ra_renderpass_input). -struct ra_renderpass_input_val { - int index; // index into ra_renderpass_params.inputs[] - void *data; // pointer to data according to ra_renderpass_input - // (e.g. type==RA_VARTYPE_FLOAT+dim_v=3,dim_m=3 => float[9]) -}; - -// Parameters for performing a rendering pass (basically the dynamic params). -// These change potentially every time. -struct ra_renderpass_run_params { - struct ra_renderpass *pass; - - // Generally this lists parameters only which changed since the last - // invocation and need to be updated. The ra_renderpass instance is - // supposed to keep unchanged values from the previous run. - // For non-primitive types like textures, these entries are always added, - // even if they do not change. - struct ra_renderpass_input_val *values; - int num_values; - - // --- pass->params.type==RA_RENDERPASS_TYPE_RASTER only - - // target->params.render_dst must be true, and target->params.format must - // match pass->params.target_format. - struct ra_tex *target; - struct mp_rect viewport; - struct mp_rect scissors; - - // (The primitive type is always a triangle list.) - void *vertex_data; - int vertex_count; // number of vertex elements, not bytes - - // --- pass->params.type==RA_RENDERPASS_TYPE_COMPUTE only - - // Number of work groups to be run in X/Y/Z dimensions. - int compute_groups[3]; -}; - -// This is an opaque type provided by the implementation, but we want to at -// least give it a saner name than void* for code readability purposes. -typedef void ra_timer; - -// Rendering API entrypoints. (Note: there are some additional hidden features -// you need to take care of. For example, hwdec mapping will be provided -// separately from ra, but might need to call into ra private code.) -struct ra_fns { - void (*destroy)(struct ra *ra); - - // Create a texture (with undefined contents). Return NULL on failure. - // This is a rare operation, and normally textures and even FBOs for - // temporary rendering intermediate data are cached. - struct ra_tex *(*tex_create)(struct ra *ra, - const struct ra_tex_params *params); - - void (*tex_destroy)(struct ra *ra, struct ra_tex *tex); - - // Upload data to a texture. This is an extremely common operation. When - // using a buffer, the contants of the buffer must exactly match the image - // - conversions between bit depth etc. are not supported. The buffer *may* - // be marked as "in use" while this operation is going on, and the contents - // must not be touched again by the API user until buf_poll returns true. - // Returns whether successful. - bool (*tex_upload)(struct ra *ra, const struct ra_tex_upload_params *params); - - // Create a buffer. This can be used as a persistently mapped buffer, - // a uniform buffer, a shader storage buffer or possibly others. - // Not all usage types must be supported; may return NULL if unavailable. 
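// For example, a staging buffer for texture uploads could be requested with
// something like the following (a hypothetical sketch: the `ra` object and
// the size are assumed, and a NULL result must be handled):
//
//   struct ra_buf_params bp = {
//       .type = RA_BUF_TYPE_TEX_UPLOAD,
//       .size = stride * height,
//       .host_mutable = true,
//   };
//   struct ra_buf *staging = ra_buf_create(ra, &bp);
//
// The buffer is then referenced via ra_tex_upload_params.buf/.buf_offset, and
// buf_poll() should report it as free again before it is rewritten.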
- struct ra_buf *(*buf_create)(struct ra *ra, - const struct ra_buf_params *params); - - void (*buf_destroy)(struct ra *ra, struct ra_buf *buf); - - // Update the contents of a buffer, starting at a given offset and up to a - // given size, with the contents of *data. This is an extremely common - // operation. Calling this while the buffer is considered "in use" is an - // error. (See: buf_poll) - void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, - const void *data, size_t size); - - // Returns if a buffer is currently "in use" or not. Updating the contents - // of a buffer (via buf_update or writing to buf->data) while it is still - // in use is an error and may result in graphical corruption. Optional, if - // NULL then all buffers are always usable. - bool (*buf_poll)(struct ra *ra, struct ra_buf *buf); - - // Returns the layout requirements of a uniform buffer element. Optional, - // but must be implemented if RA_CAP_BUF_RO is supported. - struct ra_layout (*uniform_layout)(struct ra_renderpass_input *inp); - - // Clear the dst with the given color (rgba) and within the given scissor. - // dst must have dst->params.render_dst==true. Content outside of the - // scissor is preserved. - void (*clear)(struct ra *ra, struct ra_tex *dst, float color[4], - struct mp_rect *scissor); - - // Copy a sub-rectangle from one texture to another. The source/dest region - // is always within the texture bounds. Areas outside the dest region are - // preserved. The formats of the textures must be losely compatible. The - // dst texture can be a swapchain framebuffer, but src can not. Only 2D - // textures are supported. - // The textures must have blit_src and blit_dst set, respectively. - // Rectangles with negative width/height lead to flipping, different src/dst - // sizes lead to point scaling. Coordinates are always in pixels. - // Optional. Only available if RA_CAP_BLIT is set (if it's not set, it must - // not be called, even if it's non-NULL). - void (*blit)(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, - struct mp_rect *dst_rc, struct mp_rect *src_rc); - - // Compile a shader and create a pipeline. This is a rare operation. - // The params pointer and anything it points to must stay valid until - // renderpass_destroy. - struct ra_renderpass *(*renderpass_create)(struct ra *ra, - const struct ra_renderpass_params *params); - - void (*renderpass_destroy)(struct ra *ra, struct ra_renderpass *pass); - - // Perform a render pass, basically drawing a list of triangles to a FBO. - // This is an extremely common operation. - void (*renderpass_run)(struct ra *ra, - const struct ra_renderpass_run_params *params); - - // Create a timer object. Returns NULL on failure, or if timers are - // unavailable for some reason. Optional. - ra_timer *(*timer_create)(struct ra *ra); - - void (*timer_destroy)(struct ra *ra, ra_timer *timer); - - // Start recording a timer. Note that valid usage requires you to pair - // every start with a stop. Trying to start a timer twice, or trying to - // stop a timer before having started it, consistutes invalid usage. - void (*timer_start)(struct ra *ra, ra_timer *timer); - - // Stop recording a timer. This also returns any results that have been - // measured since the last usage of this ra_timer. It's important to note - // that GPU timer measurement are asynchronous, so this function does not - // always produce a value - and the values it does produce are typically - // delayed by a few frames. When no value is available, this returns 0. 
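// A hypothetical profiling wrapper (names assumed; a NULL timer is tolerated
// by skipping the calls) pairs start/stop strictly around a single pass:
//
//   ra_timer *t = ra->fns->timer_create ? ra->fns->timer_create(ra) : NULL;
//   if (t)
//       ra->fns->timer_start(ra, t);
//   ra->fns->renderpass_run(ra, &run_params);
//   uint64_t ns = t ? ra->fns->timer_stop(ra, t) : 0; // 0 means "no result yet"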
- uint64_t (*timer_stop)(struct ra *ra, ra_timer *timer); - - // Hint that possibly queued up commands should be sent to the GPU. Optional. - void (*flush)(struct ra *ra); - - // Associates a marker with any past error messages, for debugging - // purposes. Optional. - void (*debug_marker)(struct ra *ra, const char *msg); -}; - -struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params); -void ra_tex_free(struct ra *ra, struct ra_tex **tex); - -struct ra_buf *ra_buf_create(struct ra *ra, const struct ra_buf_params *params); -void ra_buf_free(struct ra *ra, struct ra_buf **buf); - -void ra_free(struct ra **ra); - -const struct ra_format *ra_find_unorm_format(struct ra *ra, - int bytes_per_component, - int n_components); -const struct ra_format *ra_find_uint_format(struct ra *ra, - int bytes_per_component, - int n_components); -const struct ra_format *ra_find_float16_format(struct ra *ra, int n_components); -const struct ra_format *ra_find_named_format(struct ra *ra, const char *name); - -struct ra_imgfmt_desc { - int num_planes; - const struct ra_format *planes[4]; - // Chroma pixel size (1x1 is 4:4:4) - uint8_t chroma_w, chroma_h; - // Component storage size in bits (possibly padded). For formats with - // different sizes per component, this is arbitrary. For padded formats - // like P010 or YUV420P10, padding is included. - int component_bits; - // Like mp_regular_imgfmt.component_pad. - int component_pad; - // For each texture and each texture output (rgba order) describe what - // component it returns. - // The values are like the values in mp_regular_imgfmt_plane.components[]. - // Access as components[plane_nr][component_index]. Set unused items to 0. - // For ra_format.luminance_alpha, this returns 1/2 ("rg") instead of 1/4 - // ("ra"). the logic is that the texture format has 2 channels, thus the - // data must be returned in the first two components. The renderer fixes - // this later. 
- uint8_t components[4][4]; -}; - -bool ra_get_imgfmt_desc(struct ra *ra, int imgfmt, struct ra_imgfmt_desc *out); - -void ra_dump_tex_formats(struct ra *ra, int msgl); -void ra_dump_imgfmt_desc(struct ra *ra, const struct ra_imgfmt_desc *desc, - int msgl); -void ra_dump_img_formats(struct ra *ra, int msgl); diff --git a/video/out/opengl/ra_gl.c b/video/out/opengl/ra_gl.c index 0d99877a9e..ccb8755ba6 100644 --- a/video/out/opengl/ra_gl.c +++ b/video/out/opengl/ra_gl.c @@ -1097,12 +1097,6 @@ static uint64_t gl_timer_stop(struct ra *ra, ra_timer *ratimer) return timer->result; } -static void gl_flush(struct ra *ra) -{ - GL *gl = ra_gl_get(ra); - gl->Flush(); -} - static void gl_debug_marker(struct ra *ra, const char *msg) { struct ra_gl *p = ra->priv; @@ -1130,6 +1124,5 @@ static struct ra_fns ra_fns_gl = { .timer_destroy = gl_timer_destroy, .timer_start = gl_timer_start, .timer_stop = gl_timer_stop, - .flush = gl_flush, .debug_marker = gl_debug_marker, }; diff --git a/video/out/opengl/ra_gl.h b/video/out/opengl/ra_gl.h index e5e09a0197..9844977801 100644 --- a/video/out/opengl/ra_gl.h +++ b/video/out/opengl/ra_gl.h @@ -1,8 +1,7 @@ #pragma once #include "common.h" -#include "ra.h" -#include "gl_utils.h" +#include "utils.h" struct ra *ra_create_gl(GL *gl, struct mp_log *log); struct ra_tex *ra_create_wrapped_tex(struct ra *ra, diff --git a/video/out/opengl/shader_cache.c b/video/out/opengl/shader_cache.c deleted file mode 100644 index 90a757617b..0000000000 --- a/video/out/opengl/shader_cache.c +++ /dev/null @@ -1,955 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "osdep/io.h" - -#include "common/common.h" -#include "options/path.h" -#include "stream/stream.h" -#include "shader_cache.h" -#include "formats.h" -#include "utils.h" - -// Force cache flush if more than this number of shaders is created. -#define SC_MAX_ENTRIES 48 - -union uniform_val { - float f[9]; // RA_VARTYPE_FLOAT - int i[4]; // RA_VARTYPE_INT - struct ra_tex *tex; // RA_VARTYPE_TEX, RA_VARTYPE_IMG_* - struct ra_buf *buf; // RA_VARTYPE_BUF_* -}; - -enum sc_uniform_type { - SC_UNIFORM_TYPE_GLOBAL = 0, // global uniform (RA_CAP_GLOBAL_UNIFORM) - SC_UNIFORM_TYPE_UBO = 1, // uniform buffer (RA_CAP_BUF_RO) -}; - -struct sc_uniform { - enum sc_uniform_type type; - struct ra_renderpass_input input; - const char *glsl_type; - union uniform_val v; - char *buffer_format; - // for SC_UNIFORM_TYPE_UBO: - struct ra_layout layout; - size_t offset; // byte offset within the buffer -}; - -struct sc_cached_uniform { - union uniform_val v; - int index; // for ra_renderpass_input_val - bool set; // whether the uniform has ever been set -}; - -struct sc_entry { - struct ra_renderpass *pass; - struct sc_cached_uniform *cached_uniforms; - int num_cached_uniforms; - bstr total; - struct timer_pool *timer; - struct ra_buf *ubo; - int ubo_index; // for ra_renderpass_input_val.index -}; - -struct gl_shader_cache { - struct ra *ra; - struct mp_log *log; - - // permanent - char **exts; - int num_exts; - - // this is modified during use (gl_sc_add() etc.) and reset for each shader - bstr prelude_text; - bstr header_text; - bstr text; - - // Next binding point (texture unit, image unit, buffer binding, etc.) 
- // In OpenGL these are separate for each input type - int next_binding[RA_VARTYPE_COUNT]; - - struct ra_renderpass_params params; - - struct sc_entry **entries; - int num_entries; - - struct sc_entry *current_shader; // set by gl_sc_generate() - - struct sc_uniform *uniforms; - int num_uniforms; - - int ubo_binding; - size_t ubo_size; - - struct ra_renderpass_input_val *values; - int num_values; - - // For checking that the user is calling gl_sc_reset() properly. - bool needs_reset; - - bool error_state; // true if an error occurred - - // temporary buffers (avoids frequent reallocations) - bstr tmp[6]; - - // For the disk-cache. - char *cache_dir; - struct mpv_global *global; // can be NULL -}; - -static void gl_sc_reset(struct gl_shader_cache *sc); - -struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, - struct mp_log *log) -{ - struct gl_shader_cache *sc = talloc_ptrtype(NULL, sc); - *sc = (struct gl_shader_cache){ - .ra = ra, - .global = global, - .log = log, - }; - gl_sc_reset(sc); - return sc; -} - -// Reset the previous pass. This must be called after gl_sc_generate and before -// starting a new shader. -static void gl_sc_reset(struct gl_shader_cache *sc) -{ - sc->prelude_text.len = 0; - sc->header_text.len = 0; - sc->text.len = 0; - for (int n = 0; n < sc->num_uniforms; n++) - talloc_free((void *)sc->uniforms[n].input.name); - sc->num_uniforms = 0; - sc->ubo_binding = 0; - sc->ubo_size = 0; - for (int i = 0; i < RA_VARTYPE_COUNT; i++) - sc->next_binding[i] = 0; - sc->current_shader = NULL; - sc->params = (struct ra_renderpass_params){0}; - sc->needs_reset = false; -} - -static void sc_flush_cache(struct gl_shader_cache *sc) -{ - MP_VERBOSE(sc, "flushing shader cache\n"); - - for (int n = 0; n < sc->num_entries; n++) { - struct sc_entry *e = sc->entries[n]; - ra_buf_free(sc->ra, &e->ubo); - if (e->pass) - sc->ra->fns->renderpass_destroy(sc->ra, e->pass); - timer_pool_destroy(e->timer); - talloc_free(e); - } - sc->num_entries = 0; -} - -void gl_sc_destroy(struct gl_shader_cache *sc) -{ - if (!sc) - return; - gl_sc_reset(sc); - sc_flush_cache(sc); - talloc_free(sc); -} - -bool gl_sc_error_state(struct gl_shader_cache *sc) -{ - return sc->error_state; -} - -void gl_sc_reset_error(struct gl_shader_cache *sc) -{ - sc->error_state = false; -} - -void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name) -{ - for (int n = 0; n < sc->num_exts; n++) { - if (strcmp(sc->exts[n], name) == 0) - return; - } - MP_TARRAY_APPEND(sc, sc->exts, sc->num_exts, talloc_strdup(sc, name)); -} - -#define bstr_xappend0(sc, b, s) bstr_xappend(sc, b, bstr0(s)) - -void gl_sc_add(struct gl_shader_cache *sc, const char *text) -{ - bstr_xappend0(sc, &sc->text, text); -} - -void gl_sc_addf(struct gl_shader_cache *sc, const char *textf, ...) -{ - va_list ap; - va_start(ap, textf); - bstr_xappend_vasprintf(sc, &sc->text, textf, ap); - va_end(ap); -} - -void gl_sc_hadd(struct gl_shader_cache *sc, const char *text) -{ - bstr_xappend0(sc, &sc->header_text, text); -} - -void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) -{ - va_list ap; - va_start(ap, textf); - bstr_xappend_vasprintf(sc, &sc->header_text, textf, ap); - va_end(ap); -} - -void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text) -{ - bstr_xappend(sc, &sc->header_text, text); -} - -void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) 
-{ - va_list ap; - va_start(ap, textf); - bstr_xappend_vasprintf(sc, &sc->prelude_text, textf, ap); - va_end(ap); -} - -static struct sc_uniform *find_uniform(struct gl_shader_cache *sc, - const char *name) -{ - struct sc_uniform new = { - .input = { - .dim_v = 1, - .dim_m = 1, - }, - }; - - for (int n = 0; n < sc->num_uniforms; n++) { - struct sc_uniform *u = &sc->uniforms[n]; - if (strcmp(u->input.name, name) == 0) { - const char *allocname = u->input.name; - *u = new; - u->input.name = allocname; - return u; - } - } - - // not found -> add it - new.input.name = talloc_strdup(NULL, name); - MP_TARRAY_APPEND(sc, sc->uniforms, sc->num_uniforms, new); - return &sc->uniforms[sc->num_uniforms - 1]; -} - -static int gl_sc_next_binding(struct gl_shader_cache *sc, enum ra_vartype type) -{ - if (sc->ra->caps & RA_CAP_SHARED_BINDING) { - return sc->next_binding[type]++; - } else { - return sc->next_binding[0]++; - } -} - -// Updates the UBO metadata for the given sc_uniform. Assumes sc_uniform->input -// is already set. Also updates sc_uniform->type. -static void update_ubo_params(struct gl_shader_cache *sc, struct sc_uniform *u) -{ - if (!(sc->ra->caps & RA_CAP_BUF_RO)) - return; - - // Using UBOs with explicit layout(offset) like we do requires GLSL version - // 440 or higher. In theory the UBO code can also use older versions, but - // just try and avoid potential headaches. This also ensures they're only - // used on drivers that are probably modern enough to actually support them - // correctly. - if (sc->ra->glsl_version < 440) - return; - - u->type = SC_UNIFORM_TYPE_UBO; - u->layout = sc->ra->fns->uniform_layout(&u->input); - u->offset = MP_ALIGN_UP(sc->ubo_size, u->layout.align); - sc->ubo_size = u->offset + u->layout.size; -} - -void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, - struct ra_tex *tex) -{ - const char *glsl_type = "sampler2D"; - if (tex->params.dimensions == 1) { - glsl_type = "sampler1D"; - } else if (tex->params.dimensions == 3) { - glsl_type = "sampler3D"; - } else if (tex->params.non_normalized) { - glsl_type = "sampler2DRect"; - } else if (tex->params.external_oes) { - glsl_type = "samplerExternalOES"; - } else if (tex->params.format->ctype == RA_CTYPE_UINT) { - glsl_type = sc->ra->glsl_es ? "highp usampler2D" : "usampler2D"; - } - - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_TEX; - u->glsl_type = glsl_type; - u->input.binding = gl_sc_next_binding(sc, u->input.type); - u->v.tex = tex; -} - -void gl_sc_uniform_image2D_wo(struct gl_shader_cache *sc, const char *name, - struct ra_tex *tex) -{ - gl_sc_enable_extension(sc, "GL_ARB_shader_image_load_store"); - - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_IMG_W; - u->glsl_type = "writeonly image2D"; - u->input.binding = gl_sc_next_binding(sc, u->input.type); - u->v.tex = tex; -} - -void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, struct ra_buf *buf, - char *format, ...) 
-{ - assert(sc->ra->caps & RA_CAP_BUF_RW); - gl_sc_enable_extension(sc, "GL_ARB_shader_storage_buffer_object"); - - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_BUF_RW; - u->glsl_type = ""; - u->input.binding = gl_sc_next_binding(sc, u->input.type); - u->v.buf = buf; - - va_list ap; - va_start(ap, format); - u->buffer_format = ta_vasprintf(sc, format, ap); - va_end(ap); -} - -void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->glsl_type = "float"; - update_ubo_params(sc, u); - u->v.f[0] = f; -} - -void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int i) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_INT; - u->glsl_type = "int"; - update_ubo_params(sc, u); - u->v.i[0] = i; -} - -void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2]) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->input.dim_v = 2; - u->glsl_type = "vec2"; - update_ubo_params(sc, u); - u->v.f[0] = f[0]; - u->v.f[1] = f[1]; -} - -void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, GLfloat f[3]) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->input.dim_v = 3; - u->glsl_type = "vec3"; - update_ubo_params(sc, u); - u->v.f[0] = f[0]; - u->v.f[1] = f[1]; - u->v.f[2] = f[2]; -} - -static void transpose2x2(float r[2 * 2]) -{ - MPSWAP(float, r[0+2*1], r[1+2*0]); -} - -void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, - bool transpose, GLfloat *v) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->input.dim_v = 2; - u->input.dim_m = 2; - u->glsl_type = "mat2"; - update_ubo_params(sc, u); - for (int n = 0; n < 4; n++) - u->v.f[n] = v[n]; - if (transpose) - transpose2x2(&u->v.f[0]); -} - -static void transpose3x3(float r[3 * 3]) -{ - MPSWAP(float, r[0+3*1], r[1+3*0]); - MPSWAP(float, r[0+3*2], r[2+3*0]); - MPSWAP(float, r[1+3*2], r[2+3*1]); -} - -void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, - bool transpose, GLfloat *v) -{ - struct sc_uniform *u = find_uniform(sc, name); - u->input.type = RA_VARTYPE_FLOAT; - u->input.dim_v = 3; - u->input.dim_m = 3; - u->glsl_type = "mat3"; - update_ubo_params(sc, u); - for (int n = 0; n < 9; n++) - u->v.f[n] = v[n]; - if (transpose) - transpose3x3(&u->v.f[0]); -} - -// Tell the shader generator (and later gl_sc_draw_data()) about the vertex -// data layout and attribute names. The entries array is terminated with a {0} -// entry. The array memory must remain valid indefinitely (for now). 
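// For example, a caller with a hypothetical vertex layout (illustrative only,
// not taken from this file) could describe it as:
//
//   struct my_vertex { float pos[2]; float uv[2]; };
//   static const struct ra_renderpass_input my_vao[] = {
//       {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct my_vertex, pos)},
//       {"texcoord", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct my_vertex, uv)},
//       {0}
//   };
//   gl_sc_set_vertex_format(sc, my_vao, sizeof(struct my_vertex));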
-void gl_sc_set_vertex_format(struct gl_shader_cache *sc, - const struct ra_renderpass_input *entries, - int vertex_stride) -{ - sc->params.vertex_attribs = (struct ra_renderpass_input *)entries; - sc->params.num_vertex_attribs = 0; - while (entries[sc->params.num_vertex_attribs].name) - sc->params.num_vertex_attribs++; - sc->params.vertex_stride = vertex_stride; -} - -void gl_sc_blend(struct gl_shader_cache *sc, - enum ra_blend blend_src_rgb, - enum ra_blend blend_dst_rgb, - enum ra_blend blend_src_alpha, - enum ra_blend blend_dst_alpha) -{ - sc->params.enable_blend = true; - sc->params.blend_src_rgb = blend_src_rgb; - sc->params.blend_dst_rgb = blend_dst_rgb; - sc->params.blend_src_alpha = blend_src_alpha; - sc->params.blend_dst_alpha = blend_dst_alpha; -} - -static const char *vao_glsl_type(const struct ra_renderpass_input *e) -{ - // pretty dumb... too dumb, but works for us - switch (e->dim_v) { - case 1: return "float"; - case 2: return "vec2"; - case 3: return "vec3"; - case 4: return "vec4"; - default: abort(); - } -} - -static void update_ubo(struct ra *ra, struct ra_buf *ubo, struct sc_uniform *u) -{ - uintptr_t src = (uintptr_t) &u->v; - size_t dst = u->offset; - struct ra_layout src_layout = ra_renderpass_input_layout(&u->input); - struct ra_layout dst_layout = u->layout; - - for (int i = 0; i < u->input.dim_m; i++) { - ra->fns->buf_update(ra, ubo, dst, (void *)src, src_layout.stride); - src += src_layout.stride; - dst += dst_layout.stride; - } -} - -static void update_uniform(struct gl_shader_cache *sc, struct sc_entry *e, - struct sc_uniform *u, int n) -{ - struct sc_cached_uniform *un = &e->cached_uniforms[n]; - struct ra_layout layout = ra_renderpass_input_layout(&u->input); - if (layout.size > 0 && un->set && memcmp(&un->v, &u->v, layout.size) == 0) - return; - - un->v = u->v; - un->set = true; - - switch (u->type) { - case SC_UNIFORM_TYPE_GLOBAL: { - struct ra_renderpass_input_val value = { - .index = un->index, - .data = &un->v, - }; - MP_TARRAY_APPEND(sc, sc->values, sc->num_values, value); - break; - } - case SC_UNIFORM_TYPE_UBO: - assert(e->ubo); - update_ubo(sc->ra, e->ubo, u); - break; - default: abort(); - } -} - -void gl_sc_set_cache_dir(struct gl_shader_cache *sc, const char *dir) -{ - talloc_free(sc->cache_dir); - sc->cache_dir = talloc_strdup(sc, dir); -} - -static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry) -{ - bool ret = false; - - void *tmp = talloc_new(NULL); - struct ra_renderpass_params params = sc->params; - - MP_VERBOSE(sc, "new shader program:\n"); - if (sc->header_text.len) { - MP_VERBOSE(sc, "header:\n"); - mp_log_source(sc->log, MSGL_V, sc->header_text.start); - MP_VERBOSE(sc, "body:\n"); - } - if (sc->text.len) - mp_log_source(sc->log, MSGL_V, sc->text.start); - - // The vertex shader uses mangled names for the vertex attributes, so that - // the fragment shader can use the "real" names. But the shader is expecting - // the vertex attribute names (at least with older GLSL targets for GL). - params.vertex_attribs = talloc_memdup(tmp, params.vertex_attribs, - params.num_vertex_attribs * sizeof(params.vertex_attribs[0])); - for (int n = 0; n < params.num_vertex_attribs; n++) { - struct ra_renderpass_input *attrib = ¶ms.vertex_attribs[n]; - attrib->name = talloc_asprintf(tmp, "vertex_%s", attrib->name); - } - - const char *cache_header = "mpv shader cache v1\n"; - char *cache_filename = NULL; - char *cache_dir = NULL; - - if (sc->cache_dir && sc->cache_dir[0]) { - // Try to load it from a disk cache. 
- cache_dir = mp_get_user_path(tmp, sc->global, sc->cache_dir); - - struct AVSHA *sha = av_sha_alloc(); - if (!sha) - abort(); - av_sha_init(sha, 256); - av_sha_update(sha, entry->total.start, entry->total.len); - - uint8_t hash[256 / 8]; - av_sha_final(sha, hash); - av_free(sha); - - char hashstr[256 / 8 * 2 + 1]; - for (int n = 0; n < 256 / 8; n++) - snprintf(hashstr + n * 2, sizeof(hashstr) - n * 2, "%02X", hash[n]); - - cache_filename = mp_path_join(tmp, cache_dir, hashstr); - if (stat(cache_filename, &(struct stat){0}) == 0) { - MP_VERBOSE(sc, "Trying to load shader from disk...\n"); - struct bstr cachedata = - stream_read_file(cache_filename, tmp, sc->global, 1000000000); - if (bstr_eatstart0(&cachedata, cache_header)) - params.cached_program = cachedata; - } - } - - // If using a UBO, also make sure to add it as an input value so the RA - // can see it - if (sc->ubo_size) { - entry->ubo_index = sc->params.num_inputs; - struct ra_renderpass_input ubo_input = { - .name = "UBO", - .type = RA_VARTYPE_BUF_RO, - .dim_v = 1, - .dim_m = 1, - .binding = sc->ubo_binding, - }; - MP_TARRAY_APPEND(sc, params.inputs, params.num_inputs, ubo_input); - } - - entry->pass = sc->ra->fns->renderpass_create(sc->ra, ¶ms); - if (!entry->pass) - goto error; - - if (sc->ubo_size) { - struct ra_buf_params ubo_params = { - .type = RA_BUF_TYPE_UNIFORM, - .size = sc->ubo_size, - .host_mutable = true, - }; - - entry->ubo = ra_buf_create(sc->ra, &ubo_params); - if (!entry->ubo) { - MP_ERR(sc, "Failed creating uniform buffer!\n"); - goto error; - } - } - - if (entry->pass && cache_filename) { - bstr nc = entry->pass->params.cached_program; - if (nc.len && !bstr_equals(params.cached_program, nc)) { - mp_mkdirp(cache_dir); - - MP_VERBOSE(sc, "Writing shader cache file: %s\n", cache_filename); - FILE *out = fopen(cache_filename, "wb"); - if (out) { - fwrite(cache_header, strlen(cache_header), 1, out); - fwrite(nc.start, nc.len, 1, out); - fclose(out); - } - } - } - - ret = true; - -error: - talloc_free(tmp); - return ret; -} - -#define ADD(x, ...) bstr_xappend_asprintf(sc, (x), __VA_ARGS__) -#define ADD_BSTR(x, s) bstr_xappend(sc, (x), (s)) - -static void add_uniforms(struct gl_shader_cache *sc, bstr *dst) -{ - // Add all of the UBO entries separately as members of their own buffer - if (sc->ubo_size > 0) { - ADD(dst, "layout(std140, binding=%d) uniform UBO {\n", sc->ubo_binding); - for (int n = 0; n < sc->num_uniforms; n++) { - struct sc_uniform *u = &sc->uniforms[n]; - if (u->type != SC_UNIFORM_TYPE_UBO) - continue; - ADD(dst, "layout(offset=%zu) %s %s;\n", u->offset, - u->glsl_type, u->input.name); - } - ADD(dst, "};\n"); - } - - for (int n = 0; n < sc->num_uniforms; n++) { - struct sc_uniform *u = &sc->uniforms[n]; - if (u->type != SC_UNIFORM_TYPE_GLOBAL) - continue; - switch (u->input.type) { - case RA_VARTYPE_INT: - case RA_VARTYPE_FLOAT: - assert(sc->ra->caps & RA_CAP_GLOBAL_UNIFORM); - // fall through - case RA_VARTYPE_TEX: - case RA_VARTYPE_IMG_W: - // Vulkan requires explicitly assigning the bindings in the shader - // source. For OpenGL it's optional, but requires higher GL version - // so we don't do it (and instead have ra_gl update the bindings - // after program creation). 
- if (sc->ra->glsl_vulkan) - ADD(dst, "layout(binding=%d) ", u->input.binding); - ADD(dst, "uniform %s %s;\n", u->glsl_type, u->input.name); - break; - case RA_VARTYPE_BUF_RO: - ADD(dst, "layout(std140, binding=%d) uniform %s { %s };\n", - u->input.binding, u->input.name, u->buffer_format); - break; - case RA_VARTYPE_BUF_RW: - ADD(dst, "layout(std430, binding=%d) buffer %s { %s };\n", - u->input.binding, u->input.name, u->buffer_format); - break; - } - } -} - -// 1. Generate vertex and fragment shaders from the fragment shader text added -// with gl_sc_add(). The generated shader program is cached (based on the -// text), so actual compilation happens only the first time. -// 2. Update the uniforms and textures set with gl_sc_uniform_*. -// 3. Make the new shader program current (glUseProgram()). -// After that, you render, and then you call gc_sc_reset(), which does: -// 1. Unbind the program and all textures. -// 2. Reset the sc state and prepare for a new shader program. (All uniforms -// and fragment operations needed for the next program have to be re-added.) -static void gl_sc_generate(struct gl_shader_cache *sc, - enum ra_renderpass_type type, - const struct ra_format *target_format) -{ - int glsl_version = sc->ra->glsl_version; - int glsl_es = sc->ra->glsl_es ? glsl_version : 0; - - sc->params.type = type; - - // gl_sc_reset() must be called after ending the previous render process, - // and before starting a new one. - assert(!sc->needs_reset); - sc->needs_reset = true; - - // gl_sc_set_vertex_format() must always be called - assert(sc->params.vertex_attribs); - - // If using a UBO, pick a binding (needed for shader generation) - if (sc->ubo_size) - sc->ubo_binding = gl_sc_next_binding(sc, RA_VARTYPE_BUF_RO); - - for (int n = 0; n < MP_ARRAY_SIZE(sc->tmp); n++) - sc->tmp[n].len = 0; - - // set up shader text (header + uniforms + body) - bstr *header = &sc->tmp[0]; - ADD(header, "#version %d%s\n", glsl_version, glsl_es >= 300 ? " es" : ""); - if (type == RA_RENDERPASS_TYPE_COMPUTE) { - // This extension cannot be enabled in fragment shader. Enable it as - // an exception for compute shader. - ADD(header, "#extension GL_ARB_compute_shader : enable\n"); - } - for (int n = 0; n < sc->num_exts; n++) - ADD(header, "#extension %s : enable\n", sc->exts[n]); - if (glsl_es) { - ADD(header, "precision mediump float;\n"); - ADD(header, "precision mediump sampler2D;\n"); - if (sc->ra->caps & RA_CAP_TEX_3D) - ADD(header, "precision mediump sampler3D;\n"); - } - - if (glsl_version >= 130) { - ADD(header, "#define tex1D texture\n"); - ADD(header, "#define tex3D texture\n"); - } else { - ADD(header, "#define tex1D texture1D\n"); - ADD(header, "#define tex3D texture3D\n"); - ADD(header, "#define texture texture2D\n"); - } - - if (sc->ra->glsl_vulkan && type == RA_RENDERPASS_TYPE_COMPUTE) { - ADD(header, "#define gl_GlobalInvocationIndex " - "(gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID)\n"); - } - - // Additional helpers. - ADD(header, "#define LUT_POS(x, lut_size)" - " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n"); - - char *vert_in = glsl_version >= 130 ? "in" : "attribute"; - char *vert_out = glsl_version >= 130 ? "out" : "varying"; - char *frag_in = glsl_version >= 130 ? "in" : "varying"; - - struct bstr *vert = NULL, *frag = NULL, *comp = NULL; - - if (type == RA_RENDERPASS_TYPE_RASTER) { - // vertex shader: we don't use the vertex shader, so just setup a - // dummy, which passes through the vertex array attributes. 
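// (Illustrative only: for a {position, texcoord} vertex format on a plain
// desktop GL 3.x target, the code below generates text roughly equivalent to
//
//     in vec2 vertex_position;
//     in vec2 vertex_texcoord;
//     out vec2 texcoord;
//     void main() {
//         gl_Position = vec4(vertex_position, 1.0, 1.0);
//         texcoord = vertex_texcoord;
//     }
//
// with the exact version header and layout qualifiers depending on the GLSL
// target and on ra.glsl_vulkan.)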
- bstr *vert_head = &sc->tmp[1]; - ADD_BSTR(vert_head, *header); - bstr *vert_body = &sc->tmp[2]; - ADD(vert_body, "void main() {\n"); - bstr *frag_vaos = &sc->tmp[3]; - for (int n = 0; n < sc->params.num_vertex_attribs; n++) { - const struct ra_renderpass_input *e = &sc->params.vertex_attribs[n]; - const char *glsl_type = vao_glsl_type(e); - char loc[32] = {0}; - if (sc->ra->glsl_vulkan) - snprintf(loc, sizeof(loc), "layout(location=%d) ", n); - if (strcmp(e->name, "position") == 0) { - // setting raster pos. requires setting gl_Position magic variable - assert(e->dim_v == 2 && e->type == RA_VARTYPE_FLOAT); - ADD(vert_head, "%s%s vec2 vertex_position;\n", loc, vert_in); - ADD(vert_body, "gl_Position = vec4(vertex_position, 1.0, 1.0);\n"); - } else { - ADD(vert_head, "%s%s %s vertex_%s;\n", loc, vert_in, glsl_type, e->name); - ADD(vert_head, "%s%s %s %s;\n", loc, vert_out, glsl_type, e->name); - ADD(vert_body, "%s = vertex_%s;\n", e->name, e->name); - ADD(frag_vaos, "%s%s %s %s;\n", loc, frag_in, glsl_type, e->name); - } - } - ADD(vert_body, "}\n"); - vert = vert_head; - ADD_BSTR(vert, *vert_body); - - // fragment shader; still requires adding used uniforms and VAO elements - frag = &sc->tmp[4]; - ADD_BSTR(frag, *header); - if (glsl_version >= 130) { - ADD(frag, "%sout vec4 out_color;\n", - sc->ra->glsl_vulkan ? "layout(location=0) " : ""); - } - ADD_BSTR(frag, *frag_vaos); - add_uniforms(sc, frag); - - ADD_BSTR(frag, sc->prelude_text); - ADD_BSTR(frag, sc->header_text); - - ADD(frag, "void main() {\n"); - // we require _all_ frag shaders to write to a "vec4 color" - ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); - ADD_BSTR(frag, sc->text); - if (glsl_version >= 130) { - ADD(frag, "out_color = color;\n"); - } else { - ADD(frag, "gl_FragColor = color;\n"); - } - ADD(frag, "}\n"); - - // We need to fix the format of the render dst at renderpass creation - // time - assert(target_format); - sc->params.target_format = target_format; - } - - if (type == RA_RENDERPASS_TYPE_COMPUTE) { - comp = &sc->tmp[4]; - ADD_BSTR(comp, *header); - - add_uniforms(sc, comp); - - ADD_BSTR(comp, sc->prelude_text); - ADD_BSTR(comp, sc->header_text); - - ADD(comp, "void main() {\n"); - ADD(comp, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); // convenience - ADD_BSTR(comp, sc->text); - ADD(comp, "}\n"); - } - - bstr *hash_total = &sc->tmp[5]; - - ADD(hash_total, "type %d\n", sc->params.type); - - if (frag) { - ADD_BSTR(hash_total, *frag); - sc->params.frag_shader = frag->start; - } - ADD(hash_total, "\n"); - if (vert) { - ADD_BSTR(hash_total, *vert); - sc->params.vertex_shader = vert->start; - } - ADD(hash_total, "\n"); - if (comp) { - ADD_BSTR(hash_total, *comp); - sc->params.compute_shader = comp->start; - } - ADD(hash_total, "\n"); - - if (sc->params.enable_blend) { - ADD(hash_total, "blend %d %d %d %d\n", - sc->params.blend_src_rgb, sc->params.blend_dst_rgb, - sc->params.blend_src_alpha, sc->params.blend_dst_alpha); - } - - if (sc->params.target_format) - ADD(hash_total, "format %s\n", sc->params.target_format->name); - - struct sc_entry *entry = NULL; - for (int n = 0; n < sc->num_entries; n++) { - struct sc_entry *cur = sc->entries[n]; - if (bstr_equals(cur->total, *hash_total)) { - entry = cur; - break; - } - } - if (!entry) { - if (sc->num_entries == SC_MAX_ENTRIES) - sc_flush_cache(sc); - entry = talloc_ptrtype(NULL, entry); - *entry = (struct sc_entry){ - .total = bstrdup(entry, *hash_total), - .timer = timer_pool_create(sc->ra), - }; - for (int n = 0; n < sc->num_uniforms; n++) { - struct 
sc_cached_uniform u = {0}; - if (sc->uniforms[n].type == SC_UNIFORM_TYPE_GLOBAL) { - // global uniforms need to be made visible to the ra_renderpass - u.index = sc->params.num_inputs; - MP_TARRAY_APPEND(sc, sc->params.inputs, sc->params.num_inputs, - sc->uniforms[n].input); - } - MP_TARRAY_APPEND(entry, entry->cached_uniforms, - entry->num_cached_uniforms, u); - } - if (!create_pass(sc, entry)) - sc->error_state = true; - MP_TARRAY_APPEND(sc, sc->entries, sc->num_entries, entry); - } - if (sc->error_state) - return; - - assert(sc->num_uniforms == entry->num_cached_uniforms); - - sc->num_values = 0; - for (int n = 0; n < sc->num_uniforms; n++) - update_uniform(sc, entry, &sc->uniforms[n], n); - - // If we're using a UBO, make sure to bind it as well - if (sc->ubo_size) { - struct ra_renderpass_input_val ubo_val = { - .index = entry->ubo_index, - .data = &entry->ubo, - }; - MP_TARRAY_APPEND(sc, sc->values, sc->num_values, ubo_val); - } - - sc->current_shader = entry; -} - -struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, - struct ra_tex *target, - void *ptr, size_t num) -{ - struct timer_pool *timer = NULL; - - gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format); - if (!sc->current_shader) - goto error; - - timer = sc->current_shader->timer; - - struct mp_rect full_rc = {0, 0, target->params.w, target->params.h}; - - struct ra_renderpass_run_params run = { - .pass = sc->current_shader->pass, - .values = sc->values, - .num_values = sc->num_values, - .target = target, - .vertex_data = ptr, - .vertex_count = num, - .viewport = full_rc, - .scissors = full_rc, - }; - - timer_pool_start(timer); - sc->ra->fns->renderpass_run(sc->ra, &run); - timer_pool_stop(timer); - -error: - gl_sc_reset(sc); - return timer_pool_measure(timer); -} - -struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, - int w, int h, int d) -{ - struct timer_pool *timer = NULL; - - gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL); - if (!sc->current_shader) - goto error; - - timer = sc->current_shader->timer; - - struct ra_renderpass_run_params run = { - .pass = sc->current_shader->pass, - .values = sc->values, - .num_values = sc->num_values, - .compute_groups = {w, h, d}, - }; - - timer_pool_start(timer); - sc->ra->fns->renderpass_run(sc->ra, &run); - timer_pool_stop(timer); - -error: - gl_sc_reset(sc); - return timer_pool_measure(timer); -} diff --git a/video/out/opengl/shader_cache.h b/video/out/opengl/shader_cache.h deleted file mode 100644 index 82a078079b..0000000000 --- a/video/out/opengl/shader_cache.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include "common/common.h" -#include "misc/bstr.h" -#include "ra.h" - -// For mp_pass_perf -#include "video/out/vo.h" - -struct mp_log; -struct mpv_global; -struct gl_shader_cache; - -struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, - struct mp_log *log); -void gl_sc_destroy(struct gl_shader_cache *sc); -bool gl_sc_error_state(struct gl_shader_cache *sc); -void gl_sc_reset_error(struct gl_shader_cache *sc); -void gl_sc_add(struct gl_shader_cache *sc, const char *text); -void gl_sc_addf(struct gl_shader_cache *sc, const char *textf, ...) - PRINTF_ATTRIBUTE(2, 3); -void gl_sc_hadd(struct gl_shader_cache *sc, const char *text); -void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) - PRINTF_ATTRIBUTE(2, 3); -void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text); -void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) 
- PRINTF_ATTRIBUTE(2, 3); -void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, - struct ra_tex *tex); -void gl_sc_uniform_image2D_wo(struct gl_shader_cache *sc, const char *name, - struct ra_tex *tex); -void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, struct ra_buf *buf, - char *format, ...) PRINTF_ATTRIBUTE(4, 5); -void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f); -void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int f); -void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2]); -void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, float f[3]); -void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, - bool transpose, float *v); -void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, - bool transpose, float *v); -void gl_sc_set_vertex_format(struct gl_shader_cache *sc, - const struct ra_renderpass_input *vertex_attribs, - int vertex_stride); -void gl_sc_blend(struct gl_shader_cache *sc, - enum ra_blend blend_src_rgb, - enum ra_blend blend_dst_rgb, - enum ra_blend blend_src_alpha, - enum ra_blend blend_dst_alpha); -void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name); -struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, - struct ra_tex *target, - void *ptr, size_t num); -struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, - int w, int h, int d); -void gl_sc_set_cache_dir(struct gl_shader_cache *sc, const char *dir); diff --git a/video/out/opengl/user_shaders.c b/video/out/opengl/user_shaders.c deleted file mode 100644 index 58a1ac9e64..0000000000 --- a/video/out/opengl/user_shaders.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . 
- */ - -#include - -#include "misc/ctype.h" -#include "user_shaders.h" -#include "formats.h" - -static bool parse_rpn_szexpr(struct bstr line, struct szexp out[MAX_SZEXP_SIZE]) -{ - int pos = 0; - - while (line.len > 0) { - struct bstr word = bstr_strip(bstr_splitchar(line, &line, ' ')); - if (word.len == 0) - continue; - - if (pos >= MAX_SZEXP_SIZE) - return false; - - struct szexp *exp = &out[pos++]; - - if (bstr_eatend0(&word, ".w") || bstr_eatend0(&word, ".width")) { - exp->tag = SZEXP_VAR_W; - exp->val.varname = word; - continue; - } - - if (bstr_eatend0(&word, ".h") || bstr_eatend0(&word, ".height")) { - exp->tag = SZEXP_VAR_H; - exp->val.varname = word; - continue; - } - - switch (word.start[0]) { - case '+': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_ADD; continue; - case '-': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_SUB; continue; - case '*': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_MUL; continue; - case '/': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_DIV; continue; - case '!': exp->tag = SZEXP_OP1; exp->val.op = SZEXP_OP_NOT; continue; - case '>': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_GT; continue; - case '<': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_LT; continue; - } - - if (mp_isdigit(word.start[0])) { - exp->tag = SZEXP_CONST; - if (bstr_sscanf(word, "%f", &exp->val.cval) != 1) - return false; - continue; - } - - // Some sort of illegal expression - return false; - } - - return true; -} - -// Returns whether successful. 'result' is left untouched on failure -bool eval_szexpr(struct mp_log *log, void *priv, - bool (*lookup)(void *priv, struct bstr var, float size[2]), - struct szexp expr[MAX_SZEXP_SIZE], float *result) -{ - float stack[MAX_SZEXP_SIZE] = {0}; - int idx = 0; // points to next element to push - - for (int i = 0; i < MAX_SZEXP_SIZE; i++) { - switch (expr[i].tag) { - case SZEXP_END: - goto done; - - case SZEXP_CONST: - // Since our SZEXPs are bound by MAX_SZEXP_SIZE, it should be - // impossible to overflow the stack - assert(idx < MAX_SZEXP_SIZE); - stack[idx++] = expr[i].val.cval; - continue; - - case SZEXP_OP1: - if (idx < 1) { - mp_warn(log, "Stack underflow in RPN expression!\n"); - return false; - } - - switch (expr[i].val.op) { - case SZEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break; - default: abort(); - } - continue; - - case SZEXP_OP2: - if (idx < 2) { - mp_warn(log, "Stack underflow in RPN expression!\n"); - return false; - } - - // Pop the operands in reverse order - float op2 = stack[--idx]; - float op1 = stack[--idx]; - float res = 0.0; - switch (expr[i].val.op) { - case SZEXP_OP_ADD: res = op1 + op2; break; - case SZEXP_OP_SUB: res = op1 - op2; break; - case SZEXP_OP_MUL: res = op1 * op2; break; - case SZEXP_OP_DIV: res = op1 / op2; break; - case SZEXP_OP_GT: res = op1 > op2; break; - case SZEXP_OP_LT: res = op1 < op2; break; - default: abort(); - } - - if (!isfinite(res)) { - mp_warn(log, "Illegal operation in RPN expression!\n"); - return false; - } - - stack[idx++] = res; - continue; - - case SZEXP_VAR_W: - case SZEXP_VAR_H: { - struct bstr name = expr[i].val.varname; - float size[2]; - - if (!lookup(priv, name, size)) { - mp_warn(log, "Variable %.*s not found in RPN expression!\n", - BSTR_P(name)); - return false; - } - - stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? 
size[0] : size[1]; - continue; - } - } - } - -done: - // Return the single stack element - if (idx != 1) { - mp_warn(log, "Malformed stack after RPN expression!\n"); - return false; - } - - *result = stack[0]; - return true; -} - -static bool parse_hook(struct mp_log *log, struct bstr *body, - struct gl_user_shader_hook *out) -{ - *out = (struct gl_user_shader_hook){ - .pass_desc = bstr0("(unknown)"), - .offset = identity_trans, - .width = {{ SZEXP_VAR_W, { .varname = bstr0("HOOKED") }}}, - .height = {{ SZEXP_VAR_H, { .varname = bstr0("HOOKED") }}}, - .cond = {{ SZEXP_CONST, { .cval = 1.0 }}}, - }; - - int hook_idx = 0; - int bind_idx = 0; - - // Parse all headers - while (true) { - struct bstr rest; - struct bstr line = bstr_strip(bstr_getline(*body, &rest)); - - // Check for the presence of the magic line beginning - if (!bstr_eatstart0(&line, "//!")) - break; - - *body = rest; - - // Parse the supported commands - if (bstr_eatstart0(&line, "HOOK")) { - if (hook_idx == SHADER_MAX_HOOKS) { - mp_err(log, "Passes may only hook up to %d textures!\n", - SHADER_MAX_HOOKS); - return false; - } - out->hook_tex[hook_idx++] = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "BIND")) { - if (bind_idx == SHADER_MAX_BINDS) { - mp_err(log, "Passes may only bind up to %d textures!\n", - SHADER_MAX_BINDS); - return false; - } - out->bind_tex[bind_idx++] = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "SAVE")) { - out->save_tex = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "DESC")) { - out->pass_desc = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "OFFSET")) { - float ox, oy; - if (bstr_sscanf(line, "%f %f", &ox, &oy) != 2) { - mp_err(log, "Error while parsing OFFSET!\n"); - return false; - } - out->offset.t[0] = ox; - out->offset.t[1] = oy; - continue; - } - - if (bstr_eatstart0(&line, "WIDTH")) { - if (!parse_rpn_szexpr(line, out->width)) { - mp_err(log, "Error while parsing WIDTH!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "HEIGHT")) { - if (!parse_rpn_szexpr(line, out->height)) { - mp_err(log, "Error while parsing HEIGHT!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "WHEN")) { - if (!parse_rpn_szexpr(line, out->cond)) { - mp_err(log, "Error while parsing WHEN!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "COMPONENTS")) { - if (bstr_sscanf(line, "%d", &out->components) != 1) { - mp_err(log, "Error while parsing COMPONENTS!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "COMPUTE")) { - struct compute_info *ci = &out->compute; - int num = bstr_sscanf(line, "%d %d %d %d", &ci->block_w, &ci->block_h, - &ci->threads_w, &ci->threads_h); - - if (num == 2 || num == 4) { - ci->active = true; - ci->directly_writes = true; - } else { - mp_err(log, "Error while parsing COMPUTE!\n"); - return false; - } - continue; - } - - // Unknown command type - mp_err(log, "Unrecognized command '%.*s'!\n", BSTR_P(line)); - return false; - } - - // The rest of the file up until the next magic line beginning (if any) - // shall be the shader body - if (bstr_split_tok(*body, "//!", &out->pass_body, body)) { - // Make sure the magic line is part of the rest - body->start -= 3; - body->len += 3; - } - - // Sanity checking - if (hook_idx == 0) - mp_warn(log, "Pass has no hooked textures (will be ignored)!\n"); - - return true; -} - -static bool parse_tex(struct mp_log *log, struct ra *ra, struct bstr *body, - struct gl_user_shader_tex *out) -{ - *out = (struct 
gl_user_shader_tex){ - .name = bstr0("USER_TEX"), - .params = { - .dimensions = 2, - .w = 1, .h = 1, .d = 1, - .render_src = true, - .src_linear = true, - }, - }; - struct ra_tex_params *p = &out->params; - - while (true) { - struct bstr rest; - struct bstr line = bstr_strip(bstr_getline(*body, &rest)); - - if (!bstr_eatstart0(&line, "//!")) - break; - - *body = rest; - - if (bstr_eatstart0(&line, "TEXTURE")) { - out->name = bstr_strip(line); - continue; - } - - if (bstr_eatstart0(&line, "SIZE")) { - p->dimensions = bstr_sscanf(line, "%d %d %d", &p->w, &p->h, &p->d); - if (p->dimensions < 1 || p->dimensions > 3 || - p->w < 1 || p->h < 1 || p->d < 1) - { - mp_err(log, "Error while parsing SIZE!\n"); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "FORMAT ")) { - p->format = NULL; - for (int n = 0; n < ra->num_formats; n++) { - const struct ra_format *fmt = ra->formats[n]; - if (bstr_equals0(line, fmt->name)) { - p->format = fmt; - break; - } - } - // (pixel_size==0 is for opaque formats) - if (!p->format || !p->format->pixel_size) { - mp_err(log, "Unrecognized/unavailable FORMAT name: '%.*s'!\n", - BSTR_P(line)); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "FILTER")) { - line = bstr_strip(line); - if (bstr_equals0(line, "LINEAR")) { - p->src_linear = true; - } else if (bstr_equals0(line, "NEAREST")) { - p->src_linear = false; - } else { - mp_err(log, "Unrecognized FILTER: '%.*s'!\n", BSTR_P(line)); - return false; - } - continue; - } - - if (bstr_eatstart0(&line, "BORDER")) { - line = bstr_strip(line); - if (bstr_equals0(line, "CLAMP")) { - p->src_repeat = false; - } else if (bstr_equals0(line, "REPEAT")) { - p->src_repeat = true; - } else { - mp_err(log, "Unrecognized BORDER: '%.*s'!\n", BSTR_P(line)); - return false; - } - continue; - } - - mp_err(log, "Unrecognized command '%.*s'!\n", BSTR_P(line)); - return false; - } - - if (!p->format) { - mp_err(log, "No FORMAT specified.\n"); - return false; - } - - if (p->src_linear && !p->format->linear_filter) { - mp_err(log, "The specified texture format cannot be filtered!\n"); - return false; - } - - // Decode the rest of the section (up to the next //! marker) as raw hex - // data for the texture - struct bstr hexdata; - if (bstr_split_tok(*body, "//!", &hexdata, body)) { - // Make sure the magic line is part of the rest - body->start -= 3; - body->len += 3; - } - - struct bstr tex; - if (!bstr_decode_hex(NULL, bstr_strip(hexdata), &tex)) { - mp_err(log, "Error while parsing TEXTURE body: must be a valid " - "hexadecimal sequence, on a single line!\n"); - return false; - } - - int expected_len = p->w * p->h * p->d * p->format->pixel_size; - if (tex.len != expected_len) { - mp_err(log, "Shader TEXTURE size mismatch: got %zd bytes, expected %d!\n", - tex.len, expected_len); - talloc_free(tex.start); - return false; - } - - p->initial_data = tex.start; - return true; -} - -void parse_user_shader(struct mp_log *log, struct ra *ra, struct bstr shader, - void *priv, - bool (*dohook)(void *p, struct gl_user_shader_hook hook), - bool (*dotex)(void *p, struct gl_user_shader_tex tex)) -{ - if (!dohook || !dotex || !shader.len) - return; - - // Skip all garbage (e.g. 
comments) before the first header - int pos = bstr_find(shader, bstr0("//!")); - if (pos < 0) { - mp_warn(log, "Shader appears to contain no headers!\n"); - return; - } - shader = bstr_cut(shader, pos); - - // Loop over the file - while (shader.len > 0) - { - // Peek at the first header to dispatch the right type - if (bstr_startswith0(shader, "//!TEXTURE")) { - struct gl_user_shader_tex t; - if (!parse_tex(log, ra, &shader, &t) || !dotex(priv, t)) - return; - continue; - } - - struct gl_user_shader_hook h; - if (!parse_hook(log, &shader, &h) || !dohook(priv, h)) - return; - } -} diff --git a/video/out/opengl/user_shaders.h b/video/out/opengl/user_shaders.h deleted file mode 100644 index 94a070c8e2..0000000000 --- a/video/out/opengl/user_shaders.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#ifndef MP_GL_USER_SHADERS_H -#define MP_GL_USER_SHADERS_H - -#include "utils.h" -#include "ra.h" - -#define SHADER_MAX_PASSES 32 -#define SHADER_MAX_HOOKS 16 -#define SHADER_MAX_BINDS 6 -#define SHADER_MAX_SAVED 64 -#define MAX_SZEXP_SIZE 32 - -enum szexp_op { - SZEXP_OP_ADD, - SZEXP_OP_SUB, - SZEXP_OP_MUL, - SZEXP_OP_DIV, - SZEXP_OP_NOT, - SZEXP_OP_GT, - SZEXP_OP_LT, -}; - -enum szexp_tag { - SZEXP_END = 0, // End of an RPN expression - SZEXP_CONST, // Push a constant value onto the stack - SZEXP_VAR_W, // Get the width/height of a named texture (variable) - SZEXP_VAR_H, - SZEXP_OP2, // Pop two elements and push the result of a dyadic operation - SZEXP_OP1, // Pop one element and push the result of a monadic operation -}; - -struct szexp { - enum szexp_tag tag; - union { - float cval; - struct bstr varname; - enum szexp_op op; - } val; -}; - -struct compute_info { - bool active; - int block_w, block_h; // Block size (each block corresponds to one WG) - int threads_w, threads_h; // How many threads form a working group - bool directly_writes; // If true, shader is assumed to imageStore(out_image) -}; - -struct gl_user_shader_hook { - struct bstr pass_desc; - struct bstr hook_tex[SHADER_MAX_HOOKS]; - struct bstr bind_tex[SHADER_MAX_BINDS]; - struct bstr save_tex; - struct bstr pass_body; - struct gl_transform offset; - struct szexp width[MAX_SZEXP_SIZE]; - struct szexp height[MAX_SZEXP_SIZE]; - struct szexp cond[MAX_SZEXP_SIZE]; - int components; - struct compute_info compute; -}; - -struct gl_user_shader_tex { - struct bstr name; - struct ra_tex_params params; - // for video.c - struct ra_tex *tex; -}; - -// Parse the next shader block from `body`. The callbacks are invoked on every -// valid shader block parsed. 
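
To make the format these parsers accept concrete, here is a small illustrative user shader. The //! directives correspond directly to the commands handled by parse_tex() and parse_hook() above, and WIDTH/HEIGHT/WHEN use the RPN size expressions understood by parse_rpn_szexpr()/eval_szexpr(). The FORMAT name, the vec4 hook() entry point and the HOOKED_*/NOISE_* helpers depend on the renderer's hook prelude and on which ra formats are available, so treat this as an example rather than a reference:

//!TEXTURE NOISE
//!SIZE 1 1
//!FORMAT rgba8
//!FILTER NEAREST
7f7f7f7f

//!HOOK LUMA
//!BIND HOOKED
//!BIND NOISE
//!DESC illustrative luma doubler
//!WIDTH HOOKED.w 2 *
//!HEIGHT HOOKED.h 2 *
//!WHEN HOOKED.w 1000 <
vec4 hook() {
    // Runs at twice the hooked plane's size, and only when the source width
    // is below 1000 (the WHEN condition above evaluates to 0/1).
    return HOOKED_tex(HOOKED_pos) + 0.01 * (NOISE_tex(vec2(0.5)) - vec4(0.5));
}
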
-void parse_user_shader(struct mp_log *log, struct ra *ra, struct bstr shader, - void *priv, - bool (*dohook)(void *p, struct gl_user_shader_hook hook), - bool (*dotex)(void *p, struct gl_user_shader_tex tex)); - -// Evaluate a szexp, given a lookup function for named textures -bool eval_szexpr(struct mp_log *log, void *priv, - bool (*lookup)(void *priv, struct bstr var, float size[2]), - struct szexp expr[MAX_SZEXP_SIZE], float *result); - -#endif diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index b8fc24a52e..3b296d52de 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -1,371 +1,269 @@ -#include "common/msg.h" -#include "video/out/vo.h" +/* + * This file is part of mpv. + * Parts based on MPlayer code by Reimar Döffinger. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "osdep/io.h" + +#include "common/common.h" +#include "options/path.h" +#include "stream/stream.h" +#include "formats.h" #include "utils.h" -// Standard parallel 2D projection, except y1 < y0 means that the coordinate -// system is flipped, not the projection. -void gl_transform_ortho(struct gl_transform *t, float x0, float x1, - float y0, float y1) +// GLU has this as gluErrorString (we don't use GLU, as it is legacy-OpenGL) +static const char *gl_error_to_string(GLenum error) { - if (y1 < y0) { - float tmp = y0; - y0 = tmp - y1; - y1 = tmp; + switch (error) { + case GL_INVALID_ENUM: return "INVALID_ENUM"; + case GL_INVALID_VALUE: return "INVALID_VALUE"; + case GL_INVALID_OPERATION: return "INVALID_OPERATION"; + case GL_INVALID_FRAMEBUFFER_OPERATION: return "INVALID_FRAMEBUFFER_OPERATION"; + case GL_OUT_OF_MEMORY: return "OUT_OF_MEMORY"; + default: return "unknown"; } - - t->m[0][0] = 2.0f / (x1 - x0); - t->m[0][1] = 0.0f; - t->m[1][0] = 0.0f; - t->m[1][1] = 2.0f / (y1 - y0); - t->t[0] = -(x1 + x0) / (x1 - x0); - t->t[1] = -(y1 + y0) / (y1 - y0); -} - -// Apply the effects of one transformation to another, transforming it in the -// process. In other words: post-composes t onto x -void gl_transform_trans(struct gl_transform t, struct gl_transform *x) -{ - struct gl_transform xt = *x; - x->m[0][0] = t.m[0][0] * xt.m[0][0] + t.m[0][1] * xt.m[1][0]; - x->m[1][0] = t.m[1][0] * xt.m[0][0] + t.m[1][1] * xt.m[1][0]; - x->m[0][1] = t.m[0][0] * xt.m[0][1] + t.m[0][1] * xt.m[1][1]; - x->m[1][1] = t.m[1][0] * xt.m[0][1] + t.m[1][1] * xt.m[1][1]; - gl_transform_vec(t, &x->t[0], &x->t[1]); -} - -void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo) -{ - int y_dir = fbo.flip ? 
-1 : 1; - gl_transform_ortho(t, 0, fbo.tex->params.w, 0, fbo.tex->params.h * y_dir); } -void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool) +void gl_check_error(GL *gl, struct mp_log *log, const char *info) { - for (int i = 0; i < pool->num_buffers; i++) - ra_buf_free(ra, &pool->buffers[i]); - - talloc_free(pool->buffers); - *pool = (struct ra_buf_pool){0}; + for (;;) { + GLenum error = gl->GetError(); + if (error == GL_NO_ERROR) + break; + mp_msg(log, MSGL_ERR, "%s: OpenGL error %s.\n", info, + gl_error_to_string(error)); + } } -static bool ra_buf_params_compatible(const struct ra_buf_params *new, - const struct ra_buf_params *old) +static int get_alignment(int stride) { - return new->type == old->type && - new->size <= old->size && - new->host_mapped == old->host_mapped && - new->host_mutable == old->host_mutable; + if (stride % 8 == 0) + return 8; + if (stride % 4 == 0) + return 4; + if (stride % 2 == 0) + return 2; + return 1; } -static bool ra_buf_pool_grow(struct ra *ra, struct ra_buf_pool *pool) +// upload a texture, handling things like stride and slices +// target: texture target, usually GL_TEXTURE_2D +// format, type: texture parameters +// dataptr, stride: image data +// x, y, width, height: part of the image to upload +void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, + const void *dataptr, int stride, + int x, int y, int w, int h) { - struct ra_buf *buf = ra_buf_create(ra, &pool->current_params); - if (!buf) - return false; - - MP_TARRAY_INSERT_AT(NULL, pool->buffers, pool->num_buffers, pool->index, buf); - MP_VERBOSE(ra, "Resized buffer pool to size %d\n", pool->num_buffers); - return true; + int bpp = gl_bytes_per_pixel(format, type); + const uint8_t *data = dataptr; + int y_max = y + h; + if (w <= 0 || h <= 0 || !bpp) + return; + if (stride < 0) { + data += (h - 1) * stride; + stride = -stride; + } + gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(stride)); + int slice = h; + if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) { + // this is not always correct, but should work for MPlayer + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride / bpp); + } else { + if (stride != bpp * w) + slice = 1; // very inefficient, but at least it works + } + for (; y + slice <= y_max; y += slice) { + gl->TexSubImage2D(target, 0, x, y, w, slice, format, type, data); + data += stride * slice; + } + if (y < y_max) + gl->TexSubImage2D(target, 0, x, y, w, y_max - y, format, type, data); + if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0); + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); } -struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, - const struct ra_buf_params *params) +mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h) { - assert(!params->initial_data); - - if (!ra_buf_params_compatible(params, &pool->current_params)) { - ra_buf_pool_uninit(ra, pool); - pool->current_params = *params; - } - - // Make sure we have at least one buffer available - if (!pool->buffers && !ra_buf_pool_grow(ra, pool)) - return NULL; - - // Make sure the next buffer is available for use - if (!ra->fns->buf_poll(ra, pool->buffers[pool->index]) && - !ra_buf_pool_grow(ra, pool)) - { + if (gl->es) + return NULL; // ES can't read from front buffer + mp_image_t *image = mp_image_alloc(IMGFMT_RGB24, w, h); + if (!image) return NULL; + gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); + GLenum obj = fbo ? 
GL_COLOR_ATTACHMENT0 : GL_FRONT; + gl->PixelStorei(GL_PACK_ALIGNMENT, 1); + gl->ReadBuffer(obj); + //flip image while reading (and also avoid stride-related trouble) + for (int y = 0; y < h; y++) { + gl->ReadPixels(0, h - y - 1, w, 1, GL_RGB, GL_UNSIGNED_BYTE, + image->planes[0] + y * image->stride[0]); } - - struct ra_buf *buf = pool->buffers[pool->index++]; - pool->index %= pool->num_buffers; - - return buf; + gl->PixelStorei(GL_PACK_ALIGNMENT, 4); + gl->BindFramebuffer(GL_FRAMEBUFFER, 0); + return image; } -bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, - const struct ra_tex_upload_params *params) +static void gl_vao_enable_attribs(struct gl_vao *vao) { - if (params->buf) - return ra->fns->tex_upload(ra, params); - - struct ra_tex *tex = params->tex; - size_t row_size = tex->params.dimensions == 2 ? params->stride : - tex->params.w * tex->params.format->pixel_size; - - struct ra_buf_params bufparams = { - .type = RA_BUF_TYPE_TEX_UPLOAD, - .size = row_size * tex->params.h * tex->params.d, - .host_mutable = true, - }; - - struct ra_buf *buf = ra_buf_pool_get(ra, pbo, &bufparams); - if (!buf) - return false; - - ra->fns->buf_update(ra, buf, 0, params->src, bufparams.size); - - struct ra_tex_upload_params newparams = *params; - newparams.buf = buf; - newparams.src = NULL; - - return ra->fns->tex_upload(ra, &newparams); -} + GL *gl = vao->gl; + + for (int n = 0; n < vao->num_entries; n++) { + const struct ra_renderpass_input *e = &vao->entries[n]; + GLenum type = 0; + bool normalized = false; + switch (e->type) { + case RA_VARTYPE_INT: + type = GL_INT; + break; + case RA_VARTYPE_FLOAT: + type = GL_FLOAT; + break; + case RA_VARTYPE_BYTE_UNORM: + type = GL_UNSIGNED_BYTE; + normalized = true; + break; + default: + abort(); + } + assert(e->dim_m == 1); -struct ra_layout std140_layout(struct ra_renderpass_input *inp) -{ - size_t el_size = ra_vartype_size(inp->type); - - // std140 packing rules: - // 1. The alignment of generic values is their size in bytes - // 2. The alignment of vectors is the vector length * the base count, with - // the exception of vec3 which is always aligned like vec4 - // 3. The alignment of arrays is that of the element size rounded up to - // the nearest multiple of vec4 - // 4. Matrices are treated like arrays of vectors - // 5. 
Arrays/matrices are laid out with a stride equal to the alignment - size_t size = el_size * inp->dim_v; - if (inp->dim_v == 3) - size += el_size; - if (inp->dim_m > 1) - size = MP_ALIGN_UP(size, sizeof(float[4])); - - return (struct ra_layout) { - .align = size, - .stride = size, - .size = size * inp->dim_m, - }; + gl->EnableVertexAttribArray(n); + gl->VertexAttribPointer(n, e->dim_v, type, normalized, + vao->stride, (void *)(intptr_t)e->offset); + } } -struct ra_layout std430_layout(struct ra_renderpass_input *inp) +void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, + const struct ra_renderpass_input *entries, + int num_entries) { - size_t el_size = ra_vartype_size(inp->type); - - // std430 packing rules: like std140, except arrays/matrices are always - // "tightly" packed, even arrays/matrices of vec3s - size_t align = el_size * inp->dim_v; - if (inp->dim_v == 3 && inp->dim_m == 1) - align += el_size; - - return (struct ra_layout) { - .align = align, - .stride = align, - .size = align * inp->dim_m, + assert(!vao->vao); + assert(!vao->buffer); + + *vao = (struct gl_vao){ + .gl = gl, + .stride = stride, + .entries = entries, + .num_entries = num_entries, }; -} - -// Create a texture and a FBO using the texture as color attachments. -// fmt: texture internal format -// If the parameters are the same as the previous call, do not touch it. -// flags can be 0, or a combination of FBOTEX_FUZZY_W and FBOTEX_FUZZY_H. -// Enabling FUZZY for W or H means the w or h does not need to be exact. -bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, - int w, int h, const struct ra_format *fmt, int flags) -{ - int lw = w, lh = h; - - if (fbo->tex) { - int cw = w, ch = h; - int rw = fbo->tex->params.w, rh = fbo->tex->params.h; - if ((flags & FBOTEX_FUZZY_W) && cw < rw) - cw = rw; - if ((flags & FBOTEX_FUZZY_H) && ch < rh) - ch = rh; + gl->GenBuffers(1, &vao->buffer); - if (rw == cw && rh == ch && fbo->tex->params.format == fmt) - goto done; - } + if (gl->BindVertexArray) { + gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - if (flags & FBOTEX_FUZZY_W) - w = MP_ALIGN_UP(w, 256); - if (flags & FBOTEX_FUZZY_H) - h = MP_ALIGN_UP(h, 256); + gl->GenVertexArrays(1, &vao->vao); + gl->BindVertexArray(vao->vao); + gl_vao_enable_attribs(vao); + gl->BindVertexArray(0); - mp_verbose(log, "Create FBO: %dx%d (%dx%d)\n", lw, lh, w, h); - - if (!fmt || !fmt->renderable || !fmt->linear_filter) { - mp_err(log, "Format %s not supported.\n", fmt ? 
fmt->name : "(unset)"); - return false; + gl->BindBuffer(GL_ARRAY_BUFFER, 0); } +} - fbotex_uninit(fbo); - - *fbo = (struct fbotex) { - .ra = ra, - }; - - struct ra_tex_params params = { - .dimensions = 2, - .w = w, - .h = h, - .d = 1, - .format = fmt, - .src_linear = true, - .render_src = true, - .render_dst = true, - .storage_dst = true, - .blit_src = true, - }; - - fbo->tex = ra_tex_create(fbo->ra, ¶ms); - - if (!fbo->tex) { - mp_err(log, "Error: framebuffer could not be created.\n"); - fbotex_uninit(fbo); - return false; - } - -done: - - fbo->lw = lw; - fbo->lh = lh; +void gl_vao_uninit(struct gl_vao *vao) +{ + GL *gl = vao->gl; + if (!gl) + return; - fbo->fbo = (struct fbodst){ - .tex = fbo->tex, - }; + if (gl->DeleteVertexArrays) + gl->DeleteVertexArrays(1, &vao->vao); + gl->DeleteBuffers(1, &vao->buffer); - return true; + *vao = (struct gl_vao){0}; } -void fbotex_uninit(struct fbotex *fbo) +static void gl_vao_bind(struct gl_vao *vao) { - if (fbo->ra) { - ra_tex_free(fbo->ra, &fbo->tex); - *fbo = (struct fbotex) {0}; + GL *gl = vao->gl; + + if (gl->BindVertexArray) { + gl->BindVertexArray(vao->vao); + } else { + gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); + gl_vao_enable_attribs(vao); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); } } -struct timer_pool { - struct ra *ra; - ra_timer *timer; - bool running; // detect invalid usage - - uint64_t samples[VO_PERF_SAMPLE_COUNT]; - int sample_idx; - int sample_count; - - uint64_t sum; - uint64_t peak; -}; - -struct timer_pool *timer_pool_create(struct ra *ra) +static void gl_vao_unbind(struct gl_vao *vao) { - if (!ra->fns->timer_create) - return NULL; - - ra_timer *timer = ra->fns->timer_create(ra); - if (!timer) - return NULL; + GL *gl = vao->gl; - struct timer_pool *pool = talloc(NULL, struct timer_pool); - if (!pool) { - ra->fns->timer_destroy(ra, timer); - return NULL; + if (gl->BindVertexArray) { + gl->BindVertexArray(0); + } else { + for (int n = 0; n < vao->num_entries; n++) + gl->DisableVertexAttribArray(n); } - - *pool = (struct timer_pool){ .ra = ra, .timer = timer }; - return pool; } -void timer_pool_destroy(struct timer_pool *pool) +// Draw the vertex data (as described by the gl_vao_entry entries) in ptr +// to the screen. num is the number of vertexes. prim is usually GL_TRIANGLES. +// If ptr is NULL, then skip the upload, and use the data uploaded with the +// previous call. 
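
Before its definition below, here is a usage sketch for the gl_vao helpers being added in this file. The vertex struct, attribute names and the wrapper function are invented for the example, and real callers keep the gl_vao alive across frames rather than recreating it per draw:

struct example_vertex {
    float position[2];
    float texcoord[2];
};

static const struct ra_renderpass_input example_attribs[] = {
    {"position",  RA_VARTYPE_FLOAT, 2, 1, offsetof(struct example_vertex, position)},
    {"texcoord0", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct example_vertex, texcoord)},
};

static void draw_example(GL *gl, struct example_vertex verts[6])
{
    struct gl_vao vao;
    gl_vao_init(&vao, gl, sizeof(struct example_vertex), example_attribs, 2);

    // Uploads the 6 vertices into the VAO's GL_ARRAY_BUFFER and draws them;
    // passing NULL instead of verts would reuse the previously uploaded data.
    gl_vao_draw_data(&vao, GL_TRIANGLES, verts, 6);

    gl_vao_uninit(&vao);
}
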
+void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num) { - if (!pool) - return; - - pool->ra->fns->timer_destroy(pool->ra, pool->timer); - talloc_free(pool); -} + GL *gl = vao->gl; -void timer_pool_start(struct timer_pool *pool) -{ - if (!pool) - return; + if (ptr) { + gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); + gl->BufferData(GL_ARRAY_BUFFER, num * vao->stride, ptr, GL_STREAM_DRAW); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); + } - assert(!pool->running); - pool->ra->fns->timer_start(pool->ra, pool->timer); - pool->running = true; -} + gl_vao_bind(vao); -void timer_pool_stop(struct timer_pool *pool) -{ - if (!pool) - return; + gl->DrawArrays(prim, 0, num); - assert(pool->running); - uint64_t res = pool->ra->fns->timer_stop(pool->ra, pool->timer); - pool->running = false; - - if (res) { - // Input res into the buffer and grab the previous value - uint64_t old = pool->samples[pool->sample_idx]; - pool->sample_count = MPMIN(pool->sample_count + 1, VO_PERF_SAMPLE_COUNT); - pool->samples[pool->sample_idx++] = res; - pool->sample_idx %= VO_PERF_SAMPLE_COUNT; - pool->sum = pool->sum + res - old; - - // Update peak if necessary - if (res >= pool->peak) { - pool->peak = res; - } else if (pool->peak == old) { - // It's possible that the last peak was the value we just removed, - // if so we need to scan for the new peak - uint64_t peak = res; - for (int i = 0; i < VO_PERF_SAMPLE_COUNT; i++) - peak = MPMAX(peak, pool->samples[i]); - pool->peak = peak; - } - } + gl_vao_unbind(vao); } -struct mp_pass_perf timer_pool_measure(struct timer_pool *pool) +static void GLAPIENTRY gl_debug_cb(GLenum source, GLenum type, GLuint id, + GLenum severity, GLsizei length, + const GLchar *message, const void *userParam) { - if (!pool) - return (struct mp_pass_perf){0}; - - struct mp_pass_perf res = { - .peak = pool->peak, - .count = pool->sample_count, - }; - - int idx = pool->sample_idx - pool->sample_count + VO_PERF_SAMPLE_COUNT; - for (int i = 0; i < res.count; i++) { - idx %= VO_PERF_SAMPLE_COUNT; - res.samples[i] = pool->samples[idx++]; + // keep in mind that the debug callback can be asynchronous + struct mp_log *log = (void *)userParam; + int level = MSGL_ERR; + switch (severity) { + case GL_DEBUG_SEVERITY_NOTIFICATION:level = MSGL_V; break; + case GL_DEBUG_SEVERITY_LOW: level = MSGL_INFO; break; + case GL_DEBUG_SEVERITY_MEDIUM: level = MSGL_WARN; break; + case GL_DEBUG_SEVERITY_HIGH: level = MSGL_ERR; break; } - - if (res.count > 0) { - res.last = res.samples[res.count - 1]; - res.avg = pool->sum / res.count; - } - - return res; + mp_msg(log, level, "GL: %s\n", message); } -void mp_log_source(struct mp_log *log, int lev, const char *src) +void gl_set_debug_logger(GL *gl, struct mp_log *log) { - int line = 1; - if (!src) - return; - while (*src) { - const char *end = strchr(src, '\n'); - const char *next = end + 1; - if (!end) - next = end = src + strlen(src); - mp_msg(log, lev, "[%3d] %.*s\n", line, (int)(end - src), src); - line++; - src = next; - } + if (gl->DebugMessageCallback) + gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log); } diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h index 7d00d26cf5..18cab476ed 100644 --- a/video/out/opengl/utils.h +++ b/video/out/opengl/utils.h @@ -1,121 +1,54 @@ -#pragma once +/* + * This file is part of mpv. + * Parts based on MPlayer code by Reimar Döffinger. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#ifndef MP_GL_UTILS_ +#define MP_GL_UTILS_ -#include #include -#include "video/out/vo.h" -#include "ra.h" +#include "video/out/gpu/utils.h" +#include "common.h" -// A 3x2 matrix, with the translation part separate. -struct gl_transform { - // row-major, e.g. in mathematical notation: - // | m[0][0] m[0][1] | - // | m[1][0] m[1][1] | - float m[2][2]; - float t[2]; -}; - -static const struct gl_transform identity_trans = { - .m = {{1.0, 0.0}, {0.0, 1.0}}, - .t = {0.0, 0.0}, -}; - -void gl_transform_ortho(struct gl_transform *t, float x0, float x1, - float y0, float y1); - -// This treats m as an affine transformation, in other words m[2][n] gets -// added to the output. -static inline void gl_transform_vec(struct gl_transform t, float *x, float *y) -{ - float vx = *x, vy = *y; - *x = vx * t.m[0][0] + vy * t.m[0][1] + t.t[0]; - *y = vx * t.m[1][0] + vy * t.m[1][1] + t.t[1]; -} +struct mp_log; -struct mp_rect_f { - float x0, y0, x1, y1; -}; - -// Semantic equality (fuzzy comparison) -static inline bool mp_rect_f_seq(struct mp_rect_f a, struct mp_rect_f b) -{ - return fabs(a.x0 - b.x0) < 1e-6 && fabs(a.x1 - b.x1) < 1e-6 && - fabs(a.y0 - b.y0) < 1e-6 && fabs(a.y1 - b.y1) < 1e-6; -} - -static inline void gl_transform_rect(struct gl_transform t, struct mp_rect_f *r) -{ - gl_transform_vec(t, &r->x0, &r->y0); - gl_transform_vec(t, &r->x1, &r->y1); -} +void gl_check_error(GL *gl, struct mp_log *log, const char *info); -static inline bool gl_transform_eq(struct gl_transform a, struct gl_transform b) -{ - for (int x = 0; x < 2; x++) { - for (int y = 0; y < 2; y++) { - if (a.m[x][y] != b.m[x][y]) - return false; - } - } +void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, + const void *dataptr, int stride, + int x, int y, int w, int h); - return a.t[0] == b.t[0] && a.t[1] == b.t[1]; -} +mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h); -void gl_transform_trans(struct gl_transform t, struct gl_transform *x); - -struct fbodst { - struct ra_tex *tex; - bool flip; // mirror vertically +struct gl_vao { + GL *gl; + GLuint vao; // the VAO object, or 0 if unsupported by driver + GLuint buffer; // GL_ARRAY_BUFFER used for the data + int stride; // size of each element (interleaved elements are assumed) + const struct ra_renderpass_input *entries; + int num_entries; }; -void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo); - -// A pool of buffers, which can grow as needed -struct ra_buf_pool { - struct ra_buf_params current_params; - struct ra_buf **buffers; - int num_buffers; - int index; -}; - -void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool); - -// Note: params->initial_data is *not* supported -struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, - const struct ra_buf_params *params); - -// Helper that wraps ra_tex_upload using texture upload buffers to ensure that -// params->buf is always set. 
This is intended for RA-internal usage. -bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, - const struct ra_tex_upload_params *params); - -// Layout rules for GLSL's packing modes -struct ra_layout std140_layout(struct ra_renderpass_input *inp); -struct ra_layout std430_layout(struct ra_renderpass_input *inp); - -struct fbotex { - struct ra *ra; - struct ra_tex *tex; - int lw, lh; // logical (configured) size, <= than texture size - struct fbodst fbo; -}; - -void fbotex_uninit(struct fbotex *fbo); -bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, - int w, int h, const struct ra_format *fmt, int flags); -#define FBOTEX_FUZZY_W 1 -#define FBOTEX_FUZZY_H 2 -#define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H) - -// A wrapper around ra_timer that does result pooling, averaging etc. -struct timer_pool; +void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, + const struct ra_renderpass_input *entries, + int num_entries); +void gl_vao_uninit(struct gl_vao *vao); +void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num); -struct timer_pool *timer_pool_create(struct ra *ra); -void timer_pool_destroy(struct timer_pool *pool); -void timer_pool_start(struct timer_pool *pool); -void timer_pool_stop(struct timer_pool *pool); -struct mp_pass_perf timer_pool_measure(struct timer_pool *pool); +void gl_set_debug_logger(GL *gl, struct mp_log *log); -// print a multi line string with line numbers (e.g. for shader sources) -// log, lev: module and log level, as in mp_msg() -void mp_log_source(struct mp_log *log, int lev, const char *src); +#endif diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c deleted file mode 100644 index 3362381eff..0000000000 --- a/video/out/opengl/video.c +++ /dev/null @@ -1,3813 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "video.h" - -#include "misc/bstr.h" -#include "options/m_config.h" -#include "common/global.h" -#include "options/options.h" -#include "utils.h" -#include "hwdec.h" -#include "osd.h" -#include "ra.h" -#include "stream/stream.h" -#include "video_shaders.h" -#include "user_shaders.h" -#include "video/out/filter_kernels.h" -#include "video/out/aspect.h" -#include "video/out/dither.h" -#include "video/out/vo.h" - -// scale/cscale arguments that map directly to shader filter routines. -// Note that the convolution filters are not included in this list. 
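
As an aside, a short sketch of how the fbotex helpers declared in utils.h above are typically used for intermediate render targets; the sizes and the wrapper function are invented for illustration:

static void fbotex_example(struct ra *ra, struct mp_log *log,
                           const struct ra_format *fmt)
{
    struct fbotex fbo = {0};

    // Create the FBO texture lazily. With FBOTEX_FUZZY the requested size is
    // rounded up to multiples of 256 and the texture is only reallocated
    // when it actually has to grow.
    if (!fbotex_change(&fbo, ra, log, 1920, 1080, fmt, FBOTEX_FUZZY))
        return; // fmt missing, not renderable, or not filterable

    // fbo.lw/fbo.lh keep the logical 1920x1080 size; fbo.fbo is the fbodst to
    // render into; fbo.tex->params.w/h may be larger due to the rounding.

    // A later, smaller request reuses the existing texture:
    fbotex_change(&fbo, ra, log, 1280, 720, fmt, FBOTEX_FUZZY);

    fbotex_uninit(&fbo);
}
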
-static const char *const fixed_scale_filters[] = { - "bilinear", - "bicubic_fast", - "oversample", - NULL -}; -static const char *const fixed_tscale_filters[] = { - "oversample", - "linear", - NULL -}; - -// must be sorted, and terminated with 0 -int filter_sizes[] = - {2, 4, 6, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 0}; -int tscale_sizes[] = {2, 4, 6, 0}; // limited by TEXUNIT_VIDEO_NUM - -struct vertex_pt { - float x, y; -}; - -struct vertex { - struct vertex_pt position; - struct vertex_pt texcoord[TEXUNIT_VIDEO_NUM]; -}; - -static const struct ra_renderpass_input vertex_vao[] = { - {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, - {"texcoord0", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[0])}, - {"texcoord1", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[1])}, - {"texcoord2", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[2])}, - {"texcoord3", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[3])}, - {"texcoord4", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[4])}, - {"texcoord5", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[5])}, - {0} -}; - -struct texplane { - struct ra_tex *tex; - int w, h; - bool flipped; -}; - -struct video_image { - struct texplane planes[4]; - struct mp_image *mpi; // original input image - uint64_t id; // unique ID identifying mpi contents - bool hwdec_mapped; -}; - -enum plane_type { - PLANE_NONE = 0, - PLANE_RGB, - PLANE_LUMA, - PLANE_CHROMA, - PLANE_ALPHA, - PLANE_XYZ, -}; - -static const char *plane_names[] = { - [PLANE_NONE] = "unknown", - [PLANE_RGB] = "rgb", - [PLANE_LUMA] = "luma", - [PLANE_CHROMA] = "chroma", - [PLANE_ALPHA] = "alpha", - [PLANE_XYZ] = "xyz", -}; - -// A self-contained description of a source image which can be bound to a -// texture unit and sampled from. Contains metadata about how it's to be used -struct img_tex { - enum plane_type type; // must be set to something non-zero - int components; // number of relevant coordinates - float multiplier; // multiplier to be used when sampling - struct ra_tex *tex; - int w, h; // logical size (after transformation) - struct gl_transform transform; // rendering transformation -}; - -// A named img_tex, for user scripting purposes -struct saved_tex { - const char *name; - struct img_tex tex; -}; - -// A texture hook. This is some operation that transforms a named texture as -// soon as it's generated -struct tex_hook { - const char *save_tex; - const char *hook_tex[SHADER_MAX_HOOKS]; - const char *bind_tex[TEXUNIT_VIDEO_NUM]; - int components; // how many components are relevant (0 = same as input) - void *priv; // this gets talloc_freed when the tex_hook is removed - void (*hook)(struct gl_video *p, struct img_tex tex, // generates GLSL - struct gl_transform *trans, void *priv); - bool (*cond)(struct gl_video *p, struct img_tex tex, void *priv); -}; - -struct fbosurface { - struct fbotex fbotex; - uint64_t id; - double pts; -}; - -#define FBOSURFACES_MAX 10 - -struct cached_file { - char *path; - struct bstr body; -}; - -struct pass_info { - struct bstr desc; - struct mp_pass_perf perf; -}; - -#define PASS_INFO_MAX (SHADER_MAX_PASSES + 32) - -struct dr_buffer { - struct ra_buf *buf; - // The mpi reference will keep the data from being recycled (or from other - // references gaining write access) while the GPU is accessing the buffer. 
- struct mp_image *mpi; -}; - -struct gl_video { - struct ra *ra; - - struct mpv_global *global; - struct mp_log *log; - struct gl_video_opts opts; - struct m_config_cache *opts_cache; - struct gl_lcms *cms; - - int fb_depth; // actual bits available in GL main framebuffer - struct m_color clear_color; - bool force_clear_color; - - struct gl_shader_cache *sc; - - struct osd_state *osd_state; - struct mpgl_osd *osd; - double osd_pts; - - struct ra_tex *lut_3d_texture; - bool use_lut_3d; - int lut_3d_size[3]; - - struct ra_tex *dither_texture; - - struct mp_image_params real_image_params; // configured format - struct mp_image_params image_params; // texture format (mind hwdec case) - struct ra_imgfmt_desc ra_format; // texture format - int plane_count; - - bool is_gray; - bool has_alpha; - char color_swizzle[5]; - bool use_integer_conversion; - - struct video_image image; - - struct dr_buffer *dr_buffers; - int num_dr_buffers; - - bool using_dr_path; - - bool dumb_mode; - bool forced_dumb_mode; - - const struct ra_format *fbo_format; - struct fbotex merge_fbo[4]; - struct fbotex scale_fbo[4]; - struct fbotex integer_fbo[4]; - struct fbotex indirect_fbo; - struct fbotex blend_subs_fbo; - struct fbotex screen_fbo; - struct fbotex output_fbo; - struct fbosurface surfaces[FBOSURFACES_MAX]; - struct fbotex vdpau_deinterleave_fbo[2]; - struct ra_buf *hdr_peak_ssbo; - - // user pass descriptions and textures - struct tex_hook tex_hooks[SHADER_MAX_PASSES]; - int tex_hook_num; - struct gl_user_shader_tex user_textures[SHADER_MAX_PASSES]; - int user_tex_num; - - int surface_idx; - int surface_now; - int frames_drawn; - bool is_interpolated; - bool output_fbo_valid; - - // state for configured scalers - struct scaler scaler[SCALER_COUNT]; - - struct mp_csp_equalizer_state *video_eq; - - struct mp_rect src_rect; // displayed part of the source video - struct mp_rect dst_rect; // video rectangle on output window - struct mp_osd_res osd_rect; // OSD size/margins - - // temporary during rendering - struct img_tex pass_tex[TEXUNIT_VIDEO_NUM]; - struct compute_info pass_compute; // compute shader metadata for this pass - int pass_tex_num; - int texture_w, texture_h; - struct gl_transform texture_offset; // texture transform without rotation - int components; - bool use_linear; - float user_gamma; - - // pass info / metrics - struct pass_info pass_fresh[PASS_INFO_MAX]; - struct pass_info pass_redraw[PASS_INFO_MAX]; - struct pass_info *pass; - int pass_idx; - struct timer_pool *upload_timer; - struct timer_pool *blit_timer; - struct timer_pool *osd_timer; - - // intermediate textures - struct saved_tex saved_tex[SHADER_MAX_SAVED]; - int saved_tex_num; - struct fbotex hook_fbos[SHADER_MAX_SAVED]; - int hook_fbo_num; - - int frames_uploaded; - int frames_rendered; - AVLFG lfg; - - // Cached because computing it can take relatively long - int last_dither_matrix_size; - float *last_dither_matrix; - - struct cached_file *files; - int num_files; - - struct ra_hwdec *hwdec; - struct ra_hwdec_mapper *hwdec_mapper; - bool hwdec_active; - - bool dsi_warned; - bool broken_frame; // temporary error state -}; - -static const struct gl_video_opts gl_video_opts_def = { - .dither_algo = DITHER_FRUIT, - .dither_depth = -1, - .dither_size = 6, - .temporal_dither_period = 1, - .fbo_format = "auto", - .sigmoid_center = 0.75, - .sigmoid_slope = 6.5, - .scaler = { - {{"bilinear", .params={NAN, NAN}}, {.params = {NAN, NAN}}, - .cutoff = 0.001}, // scale - {{NULL, .params={NAN, NAN}}, {.params = {NAN, NAN}}, - .cutoff = 0.001}, // 
dscale - {{"bilinear", .params={NAN, NAN}}, {.params = {NAN, NAN}}, - .cutoff = 0.001}, // cscale - {{"mitchell", .params={NAN, NAN}}, {.params = {NAN, NAN}}, - .clamp = 1, }, // tscale - }, - .scaler_resizes_only = 1, - .scaler_lut_size = 6, - .interpolation_threshold = 0.0001, - .alpha_mode = ALPHA_BLEND_TILES, - .background = {0, 0, 0, 255}, - .gamma = 1.0f, - .tone_mapping = TONE_MAPPING_MOBIUS, - .tone_mapping_param = NAN, - .tone_mapping_desat = 2.0, - .early_flush = -1, -}; - -static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param); - -static int validate_window_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param); - -#define OPT_BASE_STRUCT struct gl_video_opts - -#define SCALER_OPTS(n, i) \ - OPT_STRING_VALIDATE(n, scaler[i].kernel.name, 0, validate_scaler_opt), \ - OPT_FLOAT(n"-param1", scaler[i].kernel.params[0], 0), \ - OPT_FLOAT(n"-param2", scaler[i].kernel.params[1], 0), \ - OPT_FLOAT(n"-blur", scaler[i].kernel.blur, 0), \ - OPT_FLOATRANGE(n"-cutoff", scaler[i].cutoff, 0, 0.0, 1.0), \ - OPT_FLOATRANGE(n"-taper", scaler[i].kernel.taper, 0, 0.0, 1.0), \ - OPT_FLOAT(n"-wparam", scaler[i].window.params[0], 0), \ - OPT_FLOAT(n"-wblur", scaler[i].window.blur, 0), \ - OPT_FLOATRANGE(n"-wtaper", scaler[i].window.taper, 0, 0.0, 1.0), \ - OPT_FLOATRANGE(n"-clamp", scaler[i].clamp, 0, 0.0, 1.0), \ - OPT_FLOATRANGE(n"-radius", scaler[i].radius, 0, 0.5, 16.0), \ - OPT_FLOATRANGE(n"-antiring", scaler[i].antiring, 0, 0.0, 1.0), \ - OPT_STRING_VALIDATE(n"-window", scaler[i].window.name, 0, validate_window_opt) - -const struct m_sub_options gl_video_conf = { - .opts = (const m_option_t[]) { - OPT_CHOICE("opengl-dumb-mode", dumb_mode, 0, - ({"auto", 0}, {"yes", 1}, {"no", -1})), - OPT_FLOATRANGE("opengl-gamma", gamma, 0, 0.1, 2.0), - OPT_FLAG("gamma-auto", gamma_auto, 0), - OPT_CHOICE_C("target-prim", target_prim, 0, mp_csp_prim_names), - OPT_CHOICE_C("target-trc", target_trc, 0, mp_csp_trc_names), - OPT_CHOICE("tone-mapping", tone_mapping, 0, - ({"clip", TONE_MAPPING_CLIP}, - {"mobius", TONE_MAPPING_MOBIUS}, - {"reinhard", TONE_MAPPING_REINHARD}, - {"hable", TONE_MAPPING_HABLE}, - {"gamma", TONE_MAPPING_GAMMA}, - {"linear", TONE_MAPPING_LINEAR})), - OPT_FLAG("hdr-compute-peak", compute_hdr_peak, 0), - OPT_FLOAT("tone-mapping-param", tone_mapping_param, 0), - OPT_FLOAT("tone-mapping-desaturate", tone_mapping_desat, 0), - OPT_FLAG("gamut-warning", gamut_warning, 0), - OPT_FLAG("opengl-pbo", pbo, 0), - SCALER_OPTS("scale", SCALER_SCALE), - SCALER_OPTS("dscale", SCALER_DSCALE), - SCALER_OPTS("cscale", SCALER_CSCALE), - SCALER_OPTS("tscale", SCALER_TSCALE), - OPT_INTRANGE("scaler-lut-size", scaler_lut_size, 0, 4, 10), - OPT_FLAG("scaler-resizes-only", scaler_resizes_only, 0), - OPT_FLAG("linear-scaling", linear_scaling, 0), - OPT_FLAG("correct-downscaling", correct_downscaling, 0), - OPT_FLAG("sigmoid-upscaling", sigmoid_upscaling, 0), - OPT_FLOATRANGE("sigmoid-center", sigmoid_center, 0, 0.0, 1.0), - OPT_FLOATRANGE("sigmoid-slope", sigmoid_slope, 0, 1.0, 20.0), - OPT_STRING("opengl-fbo-format", fbo_format, 0), - OPT_CHOICE_OR_INT("dither-depth", dither_depth, 0, -1, 16, - ({"no", -1}, {"auto", 0})), - OPT_CHOICE("dither", dither_algo, 0, - ({"fruit", DITHER_FRUIT}, - {"ordered", DITHER_ORDERED}, - {"no", DITHER_NONE})), - OPT_INTRANGE("dither-size-fruit", dither_size, 0, 2, 8), - OPT_FLAG("temporal-dither", temporal_dither, 0), - OPT_INTRANGE("temporal-dither-period", temporal_dither_period, 0, 1, 128), - 
OPT_CHOICE("alpha", alpha_mode, 0, - ({"no", ALPHA_NO}, - {"yes", ALPHA_YES}, - {"blend", ALPHA_BLEND}, - {"blend-tiles", ALPHA_BLEND_TILES})), - OPT_FLAG("opengl-rectangle-textures", use_rectangle, 0), - OPT_COLOR("background", background, 0), - OPT_FLAG("interpolation", interpolation, 0), - OPT_FLOAT("interpolation-threshold", interpolation_threshold, 0), - OPT_CHOICE("blend-subtitles", blend_subs, 0, - ({"no", BLEND_SUBS_NO}, - {"yes", BLEND_SUBS_YES}, - {"video", BLEND_SUBS_VIDEO})), - OPT_PATHLIST("opengl-shaders", user_shaders, 0), - OPT_CLI_ALIAS("opengl-shader", "opengl-shaders-append"), - OPT_FLAG("deband", deband, 0), - OPT_SUBSTRUCT("deband", deband_opts, deband_conf, 0), - OPT_FLOAT("sharpen", unsharp, 0), - OPT_INTRANGE("opengl-tex-pad-x", tex_pad_x, 0, 0, 4096), - OPT_INTRANGE("opengl-tex-pad-y", tex_pad_y, 0, 0, 4096), - OPT_SUBSTRUCT("", icc_opts, mp_icc_conf, 0), - OPT_CHOICE("opengl-early-flush", early_flush, 0, - ({"no", 0}, {"yes", 1}, {"auto", -1})), - OPT_STRING("opengl-shader-cache-dir", shader_cache_dir, 0), - OPT_REPLACED("hdr-tone-mapping", "tone-mapping"), - {0} - }, - .size = sizeof(struct gl_video_opts), - .defaults = &gl_video_opts_def, -}; - -static void uninit_rendering(struct gl_video *p); -static void uninit_scaler(struct gl_video *p, struct scaler *scaler); -static void check_gl_features(struct gl_video *p); -static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t id); -static const char *handle_scaler_opt(const char *name, bool tscale); -static void reinit_from_options(struct gl_video *p); -static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]); -static void gl_video_setup_hooks(struct gl_video *p); - -#define GLSL(x) gl_sc_add(p->sc, #x "\n"); -#define GLSLF(...) gl_sc_addf(p->sc, __VA_ARGS__) -#define GLSLHF(...) gl_sc_haddf(p->sc, __VA_ARGS__) -#define PRELUDE(...) gl_sc_paddf(p->sc, __VA_ARGS__) - -static struct bstr load_cached_file(struct gl_video *p, const char *path) -{ - if (!path || !path[0]) - return (struct bstr){0}; - for (int n = 0; n < p->num_files; n++) { - if (strcmp(p->files[n].path, path) == 0) - return p->files[n].body; - } - // not found -> load it - struct bstr s = stream_read_file(path, p, p->global, 1024000); // 1024 kB - if (s.len) { - struct cached_file new = { - .path = talloc_strdup(p, path), - .body = s, - }; - MP_TARRAY_APPEND(p, p->files, p->num_files, new); - return new.body; - } - return (struct bstr){0}; -} - -static void debug_check_gl(struct gl_video *p, const char *msg) -{ - if (p->ra->fns->debug_marker) - p->ra->fns->debug_marker(p->ra, msg); -} - -static void gl_video_reset_surfaces(struct gl_video *p) -{ - for (int i = 0; i < FBOSURFACES_MAX; i++) { - p->surfaces[i].id = 0; - p->surfaces[i].pts = MP_NOPTS_VALUE; - } - p->surface_idx = 0; - p->surface_now = 0; - p->frames_drawn = 0; - p->output_fbo_valid = false; -} - -static void gl_video_reset_hooks(struct gl_video *p) -{ - for (int i = 0; i < p->tex_hook_num; i++) - talloc_free(p->tex_hooks[i].priv); - - for (int i = 0; i < p->user_tex_num; i++) - ra_tex_free(p->ra, &p->user_textures[i].tex); - - p->tex_hook_num = 0; - p->user_tex_num = 0; -} - -static inline int fbosurface_wrap(int id) -{ - id = id % FBOSURFACES_MAX; - return id < 0 ? 
id + FBOSURFACES_MAX : id; -} - -static void reinit_osd(struct gl_video *p) -{ - mpgl_osd_destroy(p->osd); - p->osd = NULL; - if (p->osd_state) - p->osd = mpgl_osd_init(p->ra, p->log, p->osd_state); -} - -static void uninit_rendering(struct gl_video *p) -{ - for (int n = 0; n < SCALER_COUNT; n++) - uninit_scaler(p, &p->scaler[n]); - - ra_tex_free(p->ra, &p->dither_texture); - - for (int n = 0; n < 4; n++) { - fbotex_uninit(&p->merge_fbo[n]); - fbotex_uninit(&p->scale_fbo[n]); - fbotex_uninit(&p->integer_fbo[n]); - } - - fbotex_uninit(&p->indirect_fbo); - fbotex_uninit(&p->blend_subs_fbo); - fbotex_uninit(&p->screen_fbo); - fbotex_uninit(&p->output_fbo); - - for (int n = 0; n < FBOSURFACES_MAX; n++) - fbotex_uninit(&p->surfaces[n].fbotex); - - for (int n = 0; n < SHADER_MAX_SAVED; n++) - fbotex_uninit(&p->hook_fbos[n]); - - for (int n = 0; n < 2; n++) - fbotex_uninit(&p->vdpau_deinterleave_fbo[n]); - - gl_video_reset_surfaces(p); - gl_video_reset_hooks(p); - - gl_sc_reset_error(p->sc); -} - -bool gl_video_gamma_auto_enabled(struct gl_video *p) -{ - return p->opts.gamma_auto; -} - -struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p) -{ - return (struct mp_colorspace) { - .primaries = p->opts.target_prim, - .gamma = p->opts.target_trc, - }; -} - -// Warning: profile.start must point to a ta allocation, and the function -// takes over ownership. -void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data) -{ - if (gl_lcms_set_memory_profile(p->cms, icc_data)) - reinit_from_options(p); -} - -bool gl_video_icc_auto_enabled(struct gl_video *p) -{ - return p->opts.icc_opts ? p->opts.icc_opts->profile_auto : false; -} - -static bool gl_video_get_lut3d(struct gl_video *p, enum mp_csp_prim prim, - enum mp_csp_trc trc) -{ - if (!p->use_lut_3d) - return false; - - struct AVBufferRef *icc = NULL; - if (p->image.mpi) - icc = p->image.mpi->icc_profile; - - if (p->lut_3d_texture && !gl_lcms_has_changed(p->cms, prim, trc, icc)) - return true; - - // GLES3 doesn't provide filtered 16 bit integer textures - // GLES2 doesn't even provide 3D textures - const struct ra_format *fmt = ra_find_unorm_format(p->ra, 2, 4); - if (!fmt || !(p->ra->caps & RA_CAP_TEX_3D)) { - p->use_lut_3d = false; - MP_WARN(p, "Disabling color management (no RGBA16 3D textures).\n"); - return false; - } - - struct lut3d *lut3d = NULL; - if (!fmt || !gl_lcms_get_lut3d(p->cms, &lut3d, prim, trc, icc) || !lut3d) { - p->use_lut_3d = false; - return false; - } - - ra_tex_free(p->ra, &p->lut_3d_texture); - - struct ra_tex_params params = { - .dimensions = 3, - .w = lut3d->size[0], - .h = lut3d->size[1], - .d = lut3d->size[2], - .format = fmt, - .render_src = true, - .src_linear = true, - .initial_data = lut3d->data, - }; - p->lut_3d_texture = ra_tex_create(p->ra, ¶ms); - - debug_check_gl(p, "after 3d lut creation"); - - for (int i = 0; i < 3; i++) - p->lut_3d_size[i] = lut3d->size[i]; - - talloc_free(lut3d); - - return true; -} - -// Fill an img_tex struct from an FBO + some metadata -static struct img_tex img_tex_fbo(struct fbotex *fbo, enum plane_type type, - int components) -{ - assert(type != PLANE_NONE); - return (struct img_tex){ - .type = type, - .tex = fbo->tex, - .multiplier = 1.0, - .w = fbo->lw, - .h = fbo->lh, - .transform = identity_trans, - .components = components, - }; -} - -// Bind an img_tex to a free texture unit and return its ID. 
At most -// TEXUNIT_VIDEO_NUM texture units can be bound at once -static int pass_bind(struct gl_video *p, struct img_tex tex) -{ - assert(p->pass_tex_num < TEXUNIT_VIDEO_NUM); - p->pass_tex[p->pass_tex_num] = tex; - return p->pass_tex_num++; -} - -// Rotation by 90° and flipping. -// w/h is used for recentering. -static void get_transform(float w, float h, int rotate, bool flip, - struct gl_transform *out_tr) -{ - int a = rotate % 90 ? 0 : rotate / 90; - int sin90[4] = {0, 1, 0, -1}; // just to avoid rounding issues etc. - int cos90[4] = {1, 0, -1, 0}; - struct gl_transform tr = {{{ cos90[a], sin90[a]}, - {-sin90[a], cos90[a]}}}; - - // basically, recenter to keep the whole image in view - float b[2] = {1, 1}; - gl_transform_vec(tr, &b[0], &b[1]); - tr.t[0] += b[0] < 0 ? w : 0; - tr.t[1] += b[1] < 0 ? h : 0; - - if (flip) { - struct gl_transform fliptr = {{{1, 0}, {0, -1}}, {0, h}}; - gl_transform_trans(fliptr, &tr); - } - - *out_tr = tr; -} - -// Return the chroma plane upscaled to luma size, but with additional padding -// for image sizes not aligned to subsampling. -static int chroma_upsize(int size, int pixel) -{ - return (size + pixel - 1) / pixel * pixel; -} - -// If a and b are on the same plane, return what plane type should be used. -// If a or b are none, the other type always wins. -// Usually: LUMA/RGB/XYZ > CHROMA > ALPHA -static enum plane_type merge_plane_types(enum plane_type a, enum plane_type b) -{ - if (a == PLANE_NONE) - return b; - if (b == PLANE_LUMA || b == PLANE_RGB || b == PLANE_XYZ) - return b; - if (b != PLANE_NONE && a == PLANE_ALPHA) - return b; - return a; -} - -// Places a video_image's image textures + associated metadata into tex[]. The -// number of textures is equal to p->plane_count. Any necessary plane offsets -// are stored in off. (e.g. chroma position) -static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, - struct img_tex tex[4], struct gl_transform off[4]) -{ - assert(vimg->mpi); - - int w = p->image_params.w; - int h = p->image_params.h; - - // Determine the chroma offset - float ls_w = 1.0 / p->ra_format.chroma_w; - float ls_h = 1.0 / p->ra_format.chroma_h; - - struct gl_transform chroma = {{{ls_w, 0.0}, {0.0, ls_h}}}; - - if (p->image_params.chroma_location != MP_CHROMA_CENTER) { - int cx, cy; - mp_get_chroma_location(p->image_params.chroma_location, &cx, &cy); - // By default texture coordinates are such that chroma is centered with - // any chroma subsampling. If a specific direction is given, make it - // so that the luma and chroma sample line up exactly. - // For 4:4:4, setting chroma location should have no effect at all. - // luma sample size (in chroma coord. space) - chroma.t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0; - chroma.t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0; - } - - int msb_valid_bits = - p->ra_format.component_bits + MPMIN(p->ra_format.component_pad, 0); - // The existing code assumes we just have a single tex multiplier for - // all of the planes. 
This may change in the future - float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.color.space, - msb_valid_bits, - p->ra_format.component_bits); - - memset(tex, 0, 4 * sizeof(tex[0])); - for (int n = 0; n < p->plane_count; n++) { - struct texplane *t = &vimg->planes[n]; - - enum plane_type type = PLANE_NONE; - for (int i = 0; i < 4; i++) { - int c = p->ra_format.components[n][i]; - enum plane_type ctype; - if (c == 0) { - ctype = PLANE_NONE; - } else if (c == 4) { - ctype = PLANE_ALPHA; - } else if (p->image_params.color.space == MP_CSP_RGB) { - ctype = PLANE_RGB; - } else if (p->image_params.color.space == MP_CSP_XYZ) { - ctype = PLANE_XYZ; - } else { - ctype = c == 1 ? PLANE_LUMA : PLANE_CHROMA; - } - type = merge_plane_types(type, ctype); - } - - tex[n] = (struct img_tex){ - .type = type, - .tex = t->tex, - .multiplier = tex_mul, - .w = t->w, - .h = t->h, - }; - - for (int i = 0; i < 4; i++) - tex[n].components += !!p->ra_format.components[n][i]; - - get_transform(t->w, t->h, p->image_params.rotate, t->flipped, - &tex[n].transform); - if (p->image_params.rotate % 180 == 90) - MPSWAP(int, tex[n].w, tex[n].h); - - off[n] = identity_trans; - - if (type == PLANE_CHROMA) { - struct gl_transform rot; - get_transform(0, 0, p->image_params.rotate, true, &rot); - - struct gl_transform tr = chroma; - gl_transform_vec(rot, &tr.t[0], &tr.t[1]); - - float dx = (chroma_upsize(w, p->ra_format.chroma_w) - w) * ls_w; - float dy = (chroma_upsize(h, p->ra_format.chroma_h) - h) * ls_h; - - // Adjust the chroma offset if the real chroma size is fractional - // due image sizes not aligned to chroma subsampling. - struct gl_transform rot2; - get_transform(0, 0, p->image_params.rotate, t->flipped, &rot2); - if (rot2.m[0][0] < 0) - tr.t[0] += dx; - if (rot2.m[1][0] < 0) - tr.t[0] += dy; - if (rot2.m[0][1] < 0) - tr.t[1] += dx; - if (rot2.m[1][1] < 0) - tr.t[1] += dy; - - off[n] = tr; - } - } -} - -// Return the index of the given component (assuming all non-padding components -// of all planes are concatenated into a linear list). -static int find_comp(struct ra_imgfmt_desc *desc, int component) -{ - int cur = 0; - for (int n = 0; n < desc->num_planes; n++) { - for (int i = 0; i < 4; i++) { - if (desc->components[n][i]) { - if (desc->components[n][i] == component) - return cur; - cur++; - } - } - } - return -1; -} - -static void init_video(struct gl_video *p) -{ - p->use_integer_conversion = false; - - if (p->hwdec && ra_hwdec_test_format(p->hwdec, p->image_params.imgfmt)) { - if (p->hwdec->driver->overlay_frame) { - MP_WARN(p, "Using HW-overlay mode. 
No GL filtering is performed " - "on the video!\n"); - } else { - p->hwdec_mapper = ra_hwdec_mapper_create(p->hwdec, &p->image_params); - if (!p->hwdec_mapper) - MP_ERR(p, "Initializing texture for hardware decoding failed.\n"); - } - if (p->hwdec_mapper) - p->image_params = p->hwdec_mapper->dst_params; - const char **exts = p->hwdec->glsl_extensions; - for (int n = 0; exts && exts[n]; n++) - gl_sc_enable_extension(p->sc, (char *)exts[n]); - p->hwdec_active = true; - } - - p->ra_format = (struct ra_imgfmt_desc){0}; - ra_get_imgfmt_desc(p->ra, p->image_params.imgfmt, &p->ra_format); - - p->plane_count = p->ra_format.num_planes; - - p->has_alpha = false; - p->is_gray = true; - - for (int n = 0; n < p->ra_format.num_planes; n++) { - for (int i = 0; i < 4; i++) { - if (p->ra_format.components[n][i]) { - p->has_alpha |= p->ra_format.components[n][i] == 4; - p->is_gray &= p->ra_format.components[n][i] == 1 || - p->ra_format.components[n][i] == 4; - } - } - } - - for (int c = 0; c < 4; c++) { - int loc = find_comp(&p->ra_format, c + 1); - p->color_swizzle[c] = "rgba"[loc >= 0 && loc < 4 ? loc : 0]; - } - p->color_swizzle[4] = '\0'; - - // Format-dependent checks. - check_gl_features(p); - - mp_image_params_guess_csp(&p->image_params); - - av_lfg_init(&p->lfg, 1); - - debug_check_gl(p, "before video texture creation"); - - if (!p->hwdec_active) { - struct video_image *vimg = &p->image; - - struct mp_image layout = {0}; - mp_image_set_params(&layout, &p->image_params); - - for (int n = 0; n < p->plane_count; n++) { - struct texplane *plane = &vimg->planes[n]; - const struct ra_format *format = p->ra_format.planes[n]; - - plane->w = mp_image_plane_w(&layout, n); - plane->h = mp_image_plane_h(&layout, n); - - struct ra_tex_params params = { - .dimensions = 2, - .w = plane->w + p->opts.tex_pad_x, - .h = plane->h + p->opts.tex_pad_y, - .d = 1, - .format = format, - .render_src = true, - .src_linear = format->linear_filter, - .non_normalized = p->opts.use_rectangle, - .host_mutable = true, - }; - - MP_VERBOSE(p, "Texture for plane %d: %dx%d\n", n, - params.w, params.h); - - plane->tex = ra_tex_create(p->ra, ¶ms); - if (!plane->tex) - abort(); // shit happens - - p->use_integer_conversion |= format->ctype == RA_CTYPE_UINT; - } - } - - debug_check_gl(p, "after video texture creation"); - - gl_video_setup_hooks(p); -} - -// Release any texture mappings associated with the current frame. -static void unmap_current_image(struct gl_video *p) -{ - struct video_image *vimg = &p->image; - - if (vimg->hwdec_mapped) { - assert(p->hwdec_active && p->hwdec_mapper); - ra_hwdec_mapper_unmap(p->hwdec_mapper); - memset(vimg->planes, 0, sizeof(vimg->planes)); - vimg->hwdec_mapped = false; - vimg->id = 0; // needs to be mapped again - } -} - -static struct dr_buffer *gl_find_dr_buffer(struct gl_video *p, uint8_t *ptr) -{ - for (int i = 0; i < p->num_dr_buffers; i++) { - struct dr_buffer *buffer = &p->dr_buffers[i]; - uint8_t *bufptr = buffer->buf->data; - size_t size = buffer->buf->params.size; - if (ptr >= bufptr && ptr < bufptr + size) - return buffer; - } - - return NULL; -} - -static void gc_pending_dr_fences(struct gl_video *p, bool force) -{ -again:; - for (int n = 0; n < p->num_dr_buffers; n++) { - struct dr_buffer *buffer = &p->dr_buffers[n]; - if (!buffer->mpi) - continue; - - bool res = p->ra->fns->buf_poll(p->ra, buffer->buf); - if (res || force) { - // Unreferencing the image could cause gl_video_dr_free_buffer() - // to be called by the talloc destructor (if it was the last - // reference). 
This will implicitly invalidate the buffer pointer - // and change the p->dr_buffers array. To make it worse, it could - // free multiple dr_buffers due to weird theoretical corner cases. - // This is also why we use the goto to iterate again from the - // start, because everything gets fucked up. Hail satan! - struct mp_image *ref = buffer->mpi; - buffer->mpi = NULL; - talloc_free(ref); - goto again; - } - } -} - -static void unref_current_image(struct gl_video *p) -{ - unmap_current_image(p); - p->image.id = 0; - - mp_image_unrefp(&p->image.mpi); - - // While we're at it, also garbage collect pending fences in here to - // get it out of the way. - gc_pending_dr_fences(p, false); -} - -// If overlay mode is used, make sure to remove the overlay. -// Be careful with this. Removing the overlay and adding another one will -// lead to flickering artifacts. -static void unmap_overlay(struct gl_video *p) -{ - if (p->hwdec_active && p->hwdec->driver->overlay_frame) - p->hwdec->driver->overlay_frame(p->hwdec, NULL, NULL, NULL, true); -} - -static void uninit_video(struct gl_video *p) -{ - uninit_rendering(p); - - struct video_image *vimg = &p->image; - - unmap_overlay(p); - unref_current_image(p); - - for (int n = 0; n < p->plane_count; n++) { - struct texplane *plane = &vimg->planes[n]; - ra_tex_free(p->ra, &plane->tex); - } - *vimg = (struct video_image){0}; - - // Invalidate image_params to ensure that gl_video_config() will call - // init_video() on uninitialized gl_video. - p->real_image_params = (struct mp_image_params){0}; - p->image_params = p->real_image_params; - p->hwdec_active = false; - ra_hwdec_mapper_free(&p->hwdec_mapper); -} - -static void pass_record(struct gl_video *p, struct mp_pass_perf perf) -{ - if (!p->pass || p->pass_idx == PASS_INFO_MAX) - return; - - struct pass_info *pass = &p->pass[p->pass_idx]; - pass->perf = perf; - - if (pass->desc.len == 0) - bstr_xappend(p, &pass->desc, bstr0("(unknown)")); - - p->pass_idx++; -} - -PRINTF_ATTRIBUTE(2, 3) -static void pass_describe(struct gl_video *p, const char *textf, ...) -{ - if (!p->pass || p->pass_idx == PASS_INFO_MAX) - return; - - struct pass_info *pass = &p->pass[p->pass_idx]; - - if (pass->desc.len > 0) - bstr_xappend(p, &pass->desc, bstr0(" + ")); - - va_list ap; - va_start(ap, textf); - bstr_xappend_vasprintf(p, &pass->desc, textf, ap); - va_end(ap); -} - -static void pass_info_reset(struct gl_video *p, bool is_redraw) -{ - p->pass = is_redraw ? 
p->pass_redraw : p->pass_fresh;
-    p->pass_idx = 0;
-
-    for (int i = 0; i < PASS_INFO_MAX; i++) {
-        p->pass[i].desc.len = 0;
-        p->pass[i].perf = (struct mp_pass_perf){0};
-    }
-}
-
-static void pass_report_performance(struct gl_video *p)
-{
-    if (!p->pass)
-        return;
-
-    for (int i = 0; i < PASS_INFO_MAX; i++) {
-        struct pass_info *pass = &p->pass[i];
-        if (pass->desc.len) {
-            MP_DBG(p, "pass '%.*s': last %dus avg %dus peak %dus\n",
-                   BSTR_P(pass->desc),
-                   (int)pass->perf.last/1000,
-                   (int)pass->perf.avg/1000,
-                   (int)pass->perf.peak/1000);
-        }
-    }
-}
-
-static void pass_prepare_src_tex(struct gl_video *p)
-{
-    struct gl_shader_cache *sc = p->sc;
-
-    for (int n = 0; n < p->pass_tex_num; n++) {
-        struct img_tex *s = &p->pass_tex[n];
-        if (!s->tex)
-            continue;
-
-        char *texture_name = mp_tprintf(32, "texture%d", n);
-        char *texture_size = mp_tprintf(32, "texture_size%d", n);
-        char *texture_rot = mp_tprintf(32, "texture_rot%d", n);
-        char *texture_off = mp_tprintf(32, "texture_off%d", n);
-        char *pixel_size = mp_tprintf(32, "pixel_size%d", n);
-
-        gl_sc_uniform_texture(sc, texture_name, s->tex);
-        float f[2] = {1, 1};
-        if (!s->tex->params.non_normalized) {
-            f[0] = s->tex->params.w;
-            f[1] = s->tex->params.h;
-        }
-        gl_sc_uniform_vec2(sc, texture_size, f);
-        gl_sc_uniform_mat2(sc, texture_rot, true, (float *)s->transform.m);
-        gl_sc_uniform_vec2(sc, texture_off, (float *)s->transform.t);
-        gl_sc_uniform_vec2(sc, pixel_size, (float[]){1.0f / f[0],
-                                                     1.0f / f[1]});
-    }
-}
-
-// Sets the appropriate compute shader metadata for an implicit compute pass
-// bw/bh: block size
-static void pass_is_compute(struct gl_video *p, int bw, int bh)
-{
-    p->pass_compute = (struct compute_info){
-        .active = true,
-        .block_w = bw,
-        .block_h = bh,
-    };
-}
-
-// w/h: the width/height of the compute shader's operating domain (e.g. the
-// target that needs to be written, or the source texture that needs to
-// be reduced)
-static void dispatch_compute(struct gl_video *p, int w, int h,
-                             struct compute_info info)
-{
-    PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n",
-            info.threads_w > 0 ? info.threads_w : info.block_w,
-            info.threads_h > 0 ?
info.threads_h : info.block_h); - - pass_prepare_src_tex(p); - gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); - - // Since we don't actually have vertices, we pretend for convenience - // reasons that we do and calculate the right texture coordinates based on - // the output sample ID - gl_sc_uniform_vec2(p->sc, "out_scale", (float[2]){ 1.0 / w, 1.0 / h }); - PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n"); - - for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { - struct img_tex *s = &p->pass_tex[n]; - if (!s->tex) - continue; - - // We need to rescale the coordinates to the true texture size - char tex_scale[32]; - snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n); - gl_sc_uniform_vec2(p->sc, tex_scale, (float[2]){ - (float)s->w / s->tex->params.w, - (float)s->h / s->tex->params.h, - }); - - PRELUDE("#define texcoord%d_raw(id) (tex_scale%d * outcoord(id))\n", n, n); - PRELUDE("#define texcoord%d_rot(id) (texture_rot%d * texcoord%d_raw(id) + " - "pixel_size%d * texture_off%d)\n", n, n, n, n, n); - // Clamp the texture coordinates to prevent sampling out-of-bounds in - // threads that exceed the requested width/height - PRELUDE("#define texmap%d(id) min(texcoord%d_rot(id), vec2(1.0))\n", n, n); - PRELUDE("#define texcoord%d texmap%d(gl_GlobalInvocationID)\n", n, n); - } - - // always round up when dividing to make sure we don't leave off a part of - // the image - int num_x = info.block_w > 0 ? (w + info.block_w - 1) / info.block_w : 1, - num_y = info.block_h > 0 ? (h + info.block_h - 1) / info.block_h : 1; - - pass_record(p, gl_sc_dispatch_compute(p->sc, num_x, num_y, 1)); - - memset(&p->pass_tex, 0, sizeof(p->pass_tex)); - p->pass_tex_num = 0; -} - -static struct mp_pass_perf render_pass_quad(struct gl_video *p, - struct fbodst target, - const struct mp_rect *dst) -{ - struct vertex va[6] = {0}; - - struct gl_transform t; - gl_transform_ortho_fbodst(&t, target); - - float x[2] = {dst->x0, dst->x1}; - float y[2] = {dst->y0, dst->y1}; - gl_transform_vec(t, &x[0], &y[0]); - gl_transform_vec(t, &x[1], &y[1]); - - for (int n = 0; n < 4; n++) { - struct vertex *v = &va[n]; - v->position.x = x[n / 2]; - v->position.y = y[n % 2]; - for (int i = 0; i < p->pass_tex_num; i++) { - struct img_tex *s = &p->pass_tex[i]; - if (!s->tex) - continue; - struct gl_transform tr = s->transform; - float tx = (n / 2) * s->w; - float ty = (n % 2) * s->h; - gl_transform_vec(tr, &tx, &ty); - bool rect = s->tex->params.non_normalized; - v->texcoord[i].x = tx / (rect ? 1 : s->tex->params.w); - v->texcoord[i].y = ty / (rect ? 
1 : s->tex->params.h); - } - } - - va[4] = va[2]; - va[5] = va[1]; - - return gl_sc_dispatch_draw(p->sc, target.tex, va, 6); -} - -static void finish_pass_direct(struct gl_video *p, struct fbodst target, - const struct mp_rect *dst) -{ - pass_prepare_src_tex(p); - gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); - pass_record(p, render_pass_quad(p, target, dst)); - debug_check_gl(p, "after rendering"); - memset(&p->pass_tex, 0, sizeof(p->pass_tex)); - p->pass_tex_num = 0; -} - -// dst_fbo: this will be used for rendering; possibly reallocating the whole -// FBO, if the required parameters have changed -// w, h: required FBO target dimension, and also defines the target rectangle -// used for rasterization -// flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy -// flags allows the FBO to be larger than the w/h parameters) -static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, - int w, int h, int flags) -{ - fbotex_change(dst_fbo, p->ra, p->log, w, h, p->fbo_format, flags); - - if (p->pass_compute.active) { - if (!dst_fbo->tex) - return; - gl_sc_uniform_image2D_wo(p->sc, "out_image", dst_fbo->tex); - if (!p->pass_compute.directly_writes) - GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);) - - dispatch_compute(p, w, h, p->pass_compute); - p->pass_compute = (struct compute_info){0}; - - debug_check_gl(p, "after dispatching compute shader"); - } else { - finish_pass_direct(p, dst_fbo->fbo, &(struct mp_rect){0, 0, w, h}); - } -} - -static const char *get_tex_swizzle(struct img_tex *img) -{ - if (!img->tex) - return "rgba"; - return img->tex->params.format->luminance_alpha ? "raaa" : "rgba"; -} - -// Copy a texture to the vec4 color, while increasing offset. Also applies -// the texture multiplier to the sampled color -static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img) -{ - int count = img.components; - assert(*offset + count <= 4); - - int id = pass_bind(p, img); - char src[5] = {0}; - char dst[5] = {0}; - const char *tex_fmt = get_tex_swizzle(&img); - const char *dst_fmt = "rgba"; - for (int i = 0; i < count; i++) { - src[i] = tex_fmt[i]; - dst[i] = dst_fmt[*offset + i]; - } - - if (img.tex && img.tex->params.format->ctype == RA_CTYPE_UINT) { - uint64_t tex_max = 1ull << p->ra_format.component_bits; - img.multiplier *= 1.0 / (tex_max - 1); - } - - GLSLF("color.%s = %f * vec4(texture(texture%d, texcoord%d)).%s;\n", - dst, img.multiplier, id, id, src); - - *offset += count; -} - -static void skip_unused(struct gl_video *p, int num_components) -{ - for (int i = num_components; i < 4; i++) - GLSLF("color.%c = %f;\n", "rgba"[i], i < 3 ? 
0.0 : 1.0); -} - -static void uninit_scaler(struct gl_video *p, struct scaler *scaler) -{ - fbotex_uninit(&scaler->sep_fbo); - ra_tex_free(p->ra, &scaler->lut); - scaler->kernel = NULL; - scaler->initialized = false; -} - -static void hook_prelude(struct gl_video *p, const char *name, int id, - struct img_tex tex) -{ - GLSLHF("#define %s_raw texture%d\n", name, id); - GLSLHF("#define %s_pos texcoord%d\n", name, id); - GLSLHF("#define %s_size texture_size%d\n", name, id); - GLSLHF("#define %s_rot texture_rot%d\n", name, id); - GLSLHF("#define %s_pt pixel_size%d\n", name, id); - GLSLHF("#define %s_map texmap%d\n", name, id); - GLSLHF("#define %s_mul %f\n", name, tex.multiplier); - - // Set up the sampling functions - GLSLHF("#define %s_tex(pos) (%s_mul * vec4(texture(%s_raw, pos)).%s)\n", - name, name, name, get_tex_swizzle(&tex)); - - // Since the extra matrix multiplication impacts performance, - // skip it unless the texture was actually rotated - if (gl_transform_eq(tex.transform, identity_trans)) { - GLSLHF("#define %s_texOff(off) %s_tex(%s_pos + %s_pt * vec2(off))\n", - name, name, name, name); - } else { - GLSLHF("#define %s_texOff(off) " - "%s_tex(%s_pos + %s_rot * vec2(off)/%s_size)\n", - name, name, name, name, name); - } -} - -static bool saved_tex_find(struct gl_video *p, const char *name, - struct img_tex *out) -{ - if (!name || !out) - return false; - - for (int i = 0; i < p->saved_tex_num; i++) { - if (strcmp(p->saved_tex[i].name, name) == 0) { - *out = p->saved_tex[i].tex; - return true; - } - } - - return false; -} - -static void saved_tex_store(struct gl_video *p, const char *name, - struct img_tex tex) -{ - assert(name); - - for (int i = 0; i < p->saved_tex_num; i++) { - if (strcmp(p->saved_tex[i].name, name) == 0) { - p->saved_tex[i].tex = tex; - return; - } - } - - assert(p->saved_tex_num < SHADER_MAX_SAVED); - p->saved_tex[p->saved_tex_num++] = (struct saved_tex) { - .name = name, - .tex = tex - }; -} - -static bool pass_hook_setup_binds(struct gl_video *p, const char *name, - struct img_tex tex, struct tex_hook *hook) -{ - for (int t = 0; t < TEXUNIT_VIDEO_NUM; t++) { - char *bind_name = (char *)hook->bind_tex[t]; - - if (!bind_name) - continue; - - // This is a special name that means "currently hooked texture" - if (strcmp(bind_name, "HOOKED") == 0) { - int id = pass_bind(p, tex); - hook_prelude(p, "HOOKED", id, tex); - hook_prelude(p, name, id, tex); - continue; - } - - // BIND can also be used to load user-defined textures, in which - // case we will directly load them as a uniform instead of - // generating the hook_prelude boilerplate - for (int u = 0; u < p->user_tex_num; u++) { - struct gl_user_shader_tex *utex = &p->user_textures[u]; - if (bstr_equals0(utex->name, bind_name)) { - gl_sc_uniform_texture(p->sc, bind_name, utex->tex); - goto next_bind; - } - } - - struct img_tex bind_tex; - if (!saved_tex_find(p, bind_name, &bind_tex)) { - // Clean up texture bindings and move on to the next hook - MP_DBG(p, "Skipping hook on %s due to no texture named %s.\n", - name, bind_name); - p->pass_tex_num -= t; - return false; - } - - hook_prelude(p, bind_name, pass_bind(p, bind_tex), bind_tex); - -next_bind: ; - } - - return true; -} - -// Process hooks for a plane, saving the result and returning a new img_tex -// If 'trans' is NULL, the shader is forbidden from transforming tex -static struct img_tex pass_hook(struct gl_video *p, const char *name, - struct img_tex tex, struct gl_transform *trans) -{ - if (!name) - return tex; - - saved_tex_store(p, name, tex); - - 
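    // Note: the unmodified input is stored under its hook point name up
    // front, so that other passes can BIND it by name (via saved_tex_find)
    // even if no hook ends up firing for this texture.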
MP_DBG(p, "Running hooks for %s\n", name); - for (int i = 0; i < p->tex_hook_num; i++) { - struct tex_hook *hook = &p->tex_hooks[i]; - - // Figure out if this pass hooks this texture - for (int h = 0; h < SHADER_MAX_HOOKS; h++) { - if (hook->hook_tex[h] && strcmp(hook->hook_tex[h], name) == 0) - goto found; - } - - continue; - -found: - // Check the hook's condition - if (hook->cond && !hook->cond(p, tex, hook->priv)) { - MP_DBG(p, "Skipping hook on %s due to condition.\n", name); - continue; - } - - if (!pass_hook_setup_binds(p, name, tex, hook)) - continue; - - // Run the actual hook. This generates a series of GLSL shader - // instructions sufficient for drawing the hook's output - struct gl_transform hook_off = identity_trans; - hook->hook(p, tex, &hook_off, hook->priv); - - int comps = hook->components ? hook->components : tex.components; - skip_unused(p, comps); - - // Compute the updated FBO dimensions and store the result - struct mp_rect_f sz = {0, 0, tex.w, tex.h}; - gl_transform_rect(hook_off, &sz); - int w = lroundf(fabs(sz.x1 - sz.x0)); - int h = lroundf(fabs(sz.y1 - sz.y0)); - - assert(p->hook_fbo_num < SHADER_MAX_SAVED); - struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; - finish_pass_fbo(p, fbo, w, h, 0); - - const char *store_name = hook->save_tex ? hook->save_tex : name; - struct img_tex saved_tex = img_tex_fbo(fbo, tex.type, comps); - - // If the texture we're saving overwrites the "current" texture, also - // update the tex parameter so that the future loop cycles will use the - // updated values, and export the offset - if (strcmp(store_name, name) == 0) { - if (!trans && !gl_transform_eq(hook_off, identity_trans)) { - MP_ERR(p, "Hook tried changing size of unscalable texture %s!\n", - name); - return tex; - } - - tex = saved_tex; - if (trans) - gl_transform_trans(hook_off, trans); - } - - saved_tex_store(p, store_name, saved_tex); - } - - return tex; -} - -// This can be used at any time in the middle of rendering to specify an -// optional hook point, which if triggered will render out to a new FBO and -// load the result back into vec4 color. 
Offsets applied by the hooks are -// accumulated in tex_trans, and the FBO is dimensioned according -// to p->texture_w/h -static void pass_opt_hook_point(struct gl_video *p, const char *name, - struct gl_transform *tex_trans) -{ - if (!name) - return; - - for (int i = 0; i < p->tex_hook_num; i++) { - struct tex_hook *hook = &p->tex_hooks[i]; - - for (int h = 0; h < SHADER_MAX_HOOKS; h++) { - if (hook->hook_tex[h] && strcmp(hook->hook_tex[h], name) == 0) - goto found; - } - - for (int b = 0; b < TEXUNIT_VIDEO_NUM; b++) { - if (hook->bind_tex[b] && strcmp(hook->bind_tex[b], name) == 0) - goto found; - } - } - - // Nothing uses this texture, don't bother storing it - return; - -found: - assert(p->hook_fbo_num < SHADER_MAX_SAVED); - struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; - finish_pass_fbo(p, fbo, p->texture_w, p->texture_h, 0); - - struct img_tex img = img_tex_fbo(fbo, PLANE_RGB, p->components); - img = pass_hook(p, name, img, tex_trans); - copy_img_tex(p, &(int){0}, img); - p->texture_w = img.w; - p->texture_h = img.h; - p->components = img.components; - pass_describe(p, "(remainder pass)"); -} - -static void load_shader(struct gl_video *p, struct bstr body) -{ - gl_sc_hadd_bstr(p->sc, body); - gl_sc_uniform_f(p->sc, "random", (double)av_lfg_get(&p->lfg) / UINT32_MAX); - gl_sc_uniform_i(p->sc, "frame", p->frames_uploaded); - gl_sc_uniform_vec2(p->sc, "input_size", - (float[]){(p->src_rect.x1 - p->src_rect.x0) * - p->texture_offset.m[0][0], - (p->src_rect.y1 - p->src_rect.y0) * - p->texture_offset.m[1][1]}); - gl_sc_uniform_vec2(p->sc, "target_size", - (float[]){p->dst_rect.x1 - p->dst_rect.x0, - p->dst_rect.y1 - p->dst_rect.y0}); - gl_sc_uniform_vec2(p->sc, "tex_offset", - (float[]){p->src_rect.x0 * p->texture_offset.m[0][0] + - p->texture_offset.t[0], - p->src_rect.y0 * p->texture_offset.m[1][1] + - p->texture_offset.t[1]}); -} - -// Semantic equality -static bool double_seq(double a, double b) -{ - return (isnan(a) && isnan(b)) || a == b; -} - -static bool scaler_fun_eq(struct scaler_fun a, struct scaler_fun b) -{ - if ((a.name && !b.name) || (b.name && !a.name)) - return false; - - return ((!a.name && !b.name) || strcmp(a.name, b.name) == 0) && - double_seq(a.params[0], b.params[0]) && - double_seq(a.params[1], b.params[1]) && - a.blur == b.blur && - a.taper == b.taper; -} - -static bool scaler_conf_eq(struct scaler_config a, struct scaler_config b) -{ - // Note: antiring isn't compared because it doesn't affect LUT - // generation - return scaler_fun_eq(a.kernel, b.kernel) && - scaler_fun_eq(a.window, b.window) && - a.radius == b.radius && - a.clamp == b.clamp; -} - -static void reinit_scaler(struct gl_video *p, struct scaler *scaler, - const struct scaler_config *conf, - double scale_factor, - int sizes[]) -{ - if (scaler_conf_eq(scaler->conf, *conf) && - scaler->scale_factor == scale_factor && - scaler->initialized) - return; - - uninit_scaler(p, scaler); - - scaler->conf = *conf; - bool is_tscale = scaler->index == SCALER_TSCALE; - scaler->conf.kernel.name = (char *)handle_scaler_opt(conf->kernel.name, is_tscale); - scaler->conf.window.name = (char *)handle_scaler_opt(conf->window.name, is_tscale); - scaler->scale_factor = scale_factor; - scaler->insufficient = false; - scaler->initialized = true; - - const struct filter_kernel *t_kernel = mp_find_filter_kernel(conf->kernel.name); - if (!t_kernel) - return; - - scaler->kernel_storage = *t_kernel; - scaler->kernel = &scaler->kernel_storage; - - const char *win = conf->window.name; - if (!win || !win[0]) - win = 
t_kernel->window; // fall back to the scaler's default window - const struct filter_window *t_window = mp_find_filter_window(win); - if (t_window) - scaler->kernel->w = *t_window; - - for (int n = 0; n < 2; n++) { - if (!isnan(conf->kernel.params[n])) - scaler->kernel->f.params[n] = conf->kernel.params[n]; - if (!isnan(conf->window.params[n])) - scaler->kernel->w.params[n] = conf->window.params[n]; - } - - if (conf->kernel.blur > 0.0) - scaler->kernel->f.blur = conf->kernel.blur; - if (conf->window.blur > 0.0) - scaler->kernel->w.blur = conf->window.blur; - - if (conf->kernel.taper > 0.0) - scaler->kernel->f.taper = conf->kernel.taper; - if (conf->window.taper > 0.0) - scaler->kernel->w.taper = conf->window.taper; - - if (scaler->kernel->f.resizable && conf->radius > 0.0) - scaler->kernel->f.radius = conf->radius; - - scaler->kernel->clamp = conf->clamp; - scaler->kernel->value_cutoff = conf->cutoff; - - scaler->insufficient = !mp_init_filter(scaler->kernel, sizes, scale_factor); - - int size = scaler->kernel->size; - int num_components = size > 2 ? 4 : size; - const struct ra_format *fmt = ra_find_float16_format(p->ra, num_components); - assert(fmt); - - int width = (size + num_components - 1) / num_components; // round up - int stride = width * num_components; - assert(size <= stride); - - scaler->lut_size = 1 << p->opts.scaler_lut_size; - - float *weights = talloc_array(NULL, float, scaler->lut_size * stride); - mp_compute_lut(scaler->kernel, scaler->lut_size, stride, weights); - - bool use_1d = scaler->kernel->polar && (p->ra->caps & RA_CAP_TEX_1D); - - struct ra_tex_params lut_params = { - .dimensions = use_1d ? 1 : 2, - .w = use_1d ? scaler->lut_size : width, - .h = use_1d ? 1 : scaler->lut_size, - .d = 1, - .format = fmt, - .render_src = true, - .src_linear = true, - .initial_data = weights, - }; - scaler->lut = ra_tex_create(p->ra, &lut_params); - - talloc_free(weights); - - debug_check_gl(p, "after initializing scaler"); -} - -// Special helper for sampling from two separated stages -static void pass_sample_separated(struct gl_video *p, struct img_tex src, - struct scaler *scaler, int w, int h) -{ - // Separate the transformation into x and y components, per pass - struct gl_transform t_x = { - .m = {{src.transform.m[0][0], 0.0}, {src.transform.m[1][0], 1.0}}, - .t = {src.transform.t[0], 0.0}, - }; - struct gl_transform t_y = { - .m = {{1.0, src.transform.m[0][1]}, {0.0, src.transform.m[1][1]}}, - .t = {0.0, src.transform.t[1]}, - }; - - // First pass (scale only in the y dir) - src.transform = t_y; - sampler_prelude(p->sc, pass_bind(p, src)); - GLSLF("// first pass\n"); - pass_sample_separated_gen(p->sc, scaler, 0, 1); - GLSLF("color *= %f;\n", src.multiplier); - finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H); - - // Second pass (scale only in the x dir) - src = img_tex_fbo(&scaler->sep_fbo, src.type, src.components); - src.transform = t_x; - pass_describe(p, "%s second pass", scaler->conf.kernel.name); - sampler_prelude(p->sc, pass_bind(p, src)); - pass_sample_separated_gen(p->sc, scaler, 1, 0); -} - -// Picks either the compute shader version or the regular sampler version -// depending on hardware support -static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler, - struct img_tex tex, int w, int h) -{ - uint64_t reqs = RA_CAP_COMPUTE | RA_CAP_NESTED_ARRAY; - if ((p->ra->caps & reqs) != reqs) - goto fallback; - - int bound = ceil(scaler->kernel->radius_cutoff); - int offset = bound - 1; // padding top/left - int padding = offset + bound; 
// total padding - - float ratiox = (float)w / tex.w, - ratioy = (float)h / tex.h; - - // For performance we want to load at least as many pixels - // horizontally as there are threads in a warp (32 for nvidia), as - // well as enough to take advantage of shmem parallelism - const int warp_size = 32, threads = 256; - int bw = warp_size; - int bh = threads / bw; - - // We need to sample everything from base_min to base_max, so make sure - // we have enough room in shmem - int iw = (int)ceil(bw / ratiox) + padding + 1, - ih = (int)ceil(bh / ratioy) + padding + 1; - - int shmem_req = iw * ih * tex.components * sizeof(float); - if (shmem_req > p->ra->max_shmem) - goto fallback; - - pass_is_compute(p, bw, bh); - pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih); - return; - -fallback: - // Fall back to regular polar shader when compute shaders are unsupported - // or the kernel is too big for shmem - pass_sample_polar(p->sc, scaler, tex.components, p->ra->glsl_version); -} - -// Sample from img_tex, with the src rectangle given by it. -// The dst rectangle is implicit by what the caller will do next, but w and h -// must still be what is going to be used (to dimension FBOs correctly). -// This will write the scaled contents to the vec4 "color". -// The scaler unit is initialized by this function; in order to avoid cache -// thrashing, the scaler unit should usually use the same parameters. -static void pass_sample(struct gl_video *p, struct img_tex tex, - struct scaler *scaler, const struct scaler_config *conf, - double scale_factor, int w, int h) -{ - reinit_scaler(p, scaler, conf, scale_factor, filter_sizes); - - // Describe scaler - const char *scaler_opt[] = { - [SCALER_SCALE] = "scale", - [SCALER_DSCALE] = "dscale", - [SCALER_CSCALE] = "cscale", - [SCALER_TSCALE] = "tscale", - }; - - pass_describe(p, "%s=%s (%s)", scaler_opt[scaler->index], - scaler->conf.kernel.name, plane_names[tex.type]); - - bool is_separated = scaler->kernel && !scaler->kernel->polar; - - // Set up the transformation+prelude and bind the texture, for everything - // other than separated scaling (which does this in the subfunction) - if (!is_separated) - sampler_prelude(p->sc, pass_bind(p, tex)); - - // Dispatch the scaler. They're all wildly different. - const char *name = scaler->conf.kernel.name; - if (strcmp(name, "bilinear") == 0) { - GLSL(color = texture(tex, pos);) - } else if (strcmp(name, "bicubic_fast") == 0) { - pass_sample_bicubic_fast(p->sc); - } else if (strcmp(name, "oversample") == 0) { - pass_sample_oversample(p->sc, scaler, w, h); - } else if (scaler->kernel && scaler->kernel->polar) { - pass_dispatch_sample_polar(p, scaler, tex, w, h); - } else if (scaler->kernel) { - pass_sample_separated(p, tex, scaler, w, h); - } else { - // Should never happen - abort(); - } - - // Apply any required multipliers. 
Separated scaling already does this in - // its first stage - if (!is_separated) - GLSLF("color *= %f;\n", tex.multiplier); - - // Micro-optimization: Avoid scaling unneeded channels - skip_unused(p, tex.components); -} - -// Returns true if two img_texs are semantically equivalent (same metadata) -static bool img_tex_equiv(struct img_tex a, struct img_tex b) -{ - return a.type == b.type && - a.components == b.components && - a.multiplier == b.multiplier && - a.tex->params.format == b.tex->params.format && - a.tex->params.w == b.tex->params.w && - a.tex->params.h == b.tex->params.h && - a.w == b.w && - a.h == b.h && - gl_transform_eq(a.transform, b.transform); -} - -static bool add_hook(struct gl_video *p, struct tex_hook hook) -{ - if (p->tex_hook_num < SHADER_MAX_PASSES) { - p->tex_hooks[p->tex_hook_num++] = hook; - return true; - } else { - MP_ERR(p, "Too many passes! Limit is %d.\n", SHADER_MAX_PASSES); - talloc_free(hook.priv); - return false; - } -} - -static void deband_hook(struct gl_video *p, struct img_tex tex, - struct gl_transform *trans, void *priv) -{ - pass_describe(p, "debanding (%s)", plane_names[tex.type]); - pass_sample_deband(p->sc, p->opts.deband_opts, &p->lfg, - p->image_params.color.gamma); -} - -static void unsharp_hook(struct gl_video *p, struct img_tex tex, - struct gl_transform *trans, void *priv) -{ - pass_describe(p, "unsharp masking"); - pass_sample_unsharp(p->sc, p->opts.unsharp); -} - -struct szexp_ctx { - struct gl_video *p; - struct img_tex tex; -}; - -static bool szexp_lookup(void *priv, struct bstr var, float size[2]) -{ - struct szexp_ctx *ctx = priv; - struct gl_video *p = ctx->p; - - if (bstr_equals0(var, "NATIVE_CROPPED")) { - size[0] = (p->src_rect.x1 - p->src_rect.x0) * p->texture_offset.m[0][0]; - size[1] = (p->src_rect.y1 - p->src_rect.y0) * p->texture_offset.m[1][1]; - return true; - } - - // The size of OUTPUT is determined. It could be useful for certain - // user shaders to skip passes. 
- if (bstr_equals0(var, "OUTPUT")) { - size[0] = p->dst_rect.x1 - p->dst_rect.x0; - size[1] = p->dst_rect.y1 - p->dst_rect.y0; - return true; - } - - // HOOKED is a special case - if (bstr_equals0(var, "HOOKED")) { - size[0] = ctx->tex.w; - size[1] = ctx->tex.h; - return true; - } - - for (int o = 0; o < p->saved_tex_num; o++) { - if (bstr_equals0(var, p->saved_tex[o].name)) { - size[0] = p->saved_tex[o].tex.w; - size[1] = p->saved_tex[o].tex.h; - return true; - } - } - - return false; -} - -static bool user_hook_cond(struct gl_video *p, struct img_tex tex, void *priv) -{ - struct gl_user_shader_hook *shader = priv; - assert(shader); - - float res = false; - eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->cond, &res); - return res; -} - -static void user_hook(struct gl_video *p, struct img_tex tex, - struct gl_transform *trans, void *priv) -{ - struct gl_user_shader_hook *shader = priv; - assert(shader); - load_shader(p, shader->pass_body); - - pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc), - plane_names[tex.type]); - - if (shader->compute.active) { - p->pass_compute = shader->compute; - GLSLF("hook();\n"); - } else { - GLSLF("color = hook();\n"); - } - - // Make sure we at least create a legal FBO on failure, since it's better - // to do this and display an error message than just crash OpenGL - float w = 1.0, h = 1.0; - - eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->width, &w); - eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->height, &h); - - *trans = (struct gl_transform){{{w / tex.w, 0}, {0, h / tex.h}}}; - gl_transform_trans(shader->offset, trans); -} - -static bool add_user_hook(void *priv, struct gl_user_shader_hook hook) -{ - struct gl_video *p = priv; - struct gl_user_shader_hook *copy = talloc_ptrtype(p, copy); - *copy = hook; - - struct tex_hook texhook = { - .save_tex = bstrdup0(copy, hook.save_tex), - .components = hook.components, - .hook = user_hook, - .cond = user_hook_cond, - .priv = copy, - }; - - for (int h = 0; h < SHADER_MAX_HOOKS; h++) - texhook.hook_tex[h] = bstrdup0(copy, hook.hook_tex[h]); - for (int h = 0; h < SHADER_MAX_BINDS; h++) - texhook.bind_tex[h] = bstrdup0(copy, hook.bind_tex[h]); - - return add_hook(p, texhook); -} - -static bool add_user_tex(void *priv, struct gl_user_shader_tex tex) -{ - struct gl_video *p = priv; - - if (p->user_tex_num == SHADER_MAX_PASSES) { - MP_ERR(p, "Too many textures! 
Limit is %d.\n", SHADER_MAX_PASSES); - goto err; - } - - tex.tex = ra_tex_create(p->ra, &tex.params); - TA_FREEP(&tex.params.initial_data); - - p->user_textures[p->user_tex_num++] = tex; - return true; - -err: - talloc_free(tex.params.initial_data); - return false; -} - -static void load_user_shaders(struct gl_video *p, char **shaders) -{ - if (!shaders) - return; - - for (int n = 0; shaders[n] != NULL; n++) { - struct bstr file = load_cached_file(p, shaders[n]); - parse_user_shader(p->log, p->ra, file, p, add_user_hook, add_user_tex); - } -} - -static void gl_video_setup_hooks(struct gl_video *p) -{ - gl_video_reset_hooks(p); - - if (p->opts.deband) { - add_hook(p, (struct tex_hook) { - .hook_tex = {"LUMA", "CHROMA", "RGB", "XYZ"}, - .bind_tex = {"HOOKED"}, - .hook = deband_hook, - }); - } - - if (p->opts.unsharp != 0.0) { - add_hook(p, (struct tex_hook) { - .hook_tex = {"MAIN"}, - .bind_tex = {"HOOKED"}, - .hook = unsharp_hook, - }); - } - - load_user_shaders(p, p->opts.user_shaders); -} - -// sample from video textures, set "color" variable to yuv value -static void pass_read_video(struct gl_video *p) -{ - struct img_tex tex[4]; - struct gl_transform offsets[4]; - pass_get_img_tex(p, &p->image, tex, offsets); - - // To keep the code as simple as possibly, we currently run all shader - // stages even if they would be unnecessary (e.g. no hooks for a texture). - // In the future, deferred img_tex should optimize this away. - - // Merge semantically identical textures. This loop is done from back - // to front so that merged textures end up in the right order while - // simultaneously allowing us to skip unnecessary merges - for (int n = 3; n >= 0; n--) { - if (tex[n].type == PLANE_NONE) - continue; - - int first = n; - int num = 0; - - for (int i = 0; i < n; i++) { - if (img_tex_equiv(tex[n], tex[i]) && - gl_transform_eq(offsets[n], offsets[i])) - { - GLSLF("// merging plane %d ...\n", i); - copy_img_tex(p, &num, tex[i]); - first = MPMIN(first, i); - tex[i] = (struct img_tex){0}; - } - } - - if (num > 0) { - GLSLF("// merging plane %d ... into %d\n", n, first); - copy_img_tex(p, &num, tex[n]); - pass_describe(p, "merging planes"); - finish_pass_fbo(p, &p->merge_fbo[n], tex[n].w, tex[n].h, 0); - tex[first] = img_tex_fbo(&p->merge_fbo[n], tex[n].type, num); - tex[n] = (struct img_tex){0}; - } - } - - // If any textures are still in integer format by this point, we need - // to introduce an explicit conversion pass to avoid breaking hooks/scaling - for (int n = 0; n < 4; n++) { - if (tex[n].tex && tex[n].tex->params.format->ctype == RA_CTYPE_UINT) { - GLSLF("// use_integer fix for plane %d\n", n); - copy_img_tex(p, &(int){0}, tex[n]); - pass_describe(p, "use_integer fix"); - finish_pass_fbo(p, &p->integer_fbo[n], tex[n].w, tex[n].h, 0); - tex[n] = img_tex_fbo(&p->integer_fbo[n], tex[n].type, - tex[n].components); - } - } - - // Dispatch the hooks for all of these textures, saving and perhaps - // modifying them in the process - for (int n = 0; n < 4; n++) { - const char *name; - switch (tex[n].type) { - case PLANE_RGB: name = "RGB"; break; - case PLANE_LUMA: name = "LUMA"; break; - case PLANE_CHROMA: name = "CHROMA"; break; - case PLANE_ALPHA: name = "ALPHA"; break; - case PLANE_XYZ: name = "XYZ"; break; - default: continue; - } - - tex[n] = pass_hook(p, name, tex[n], &offsets[n]); - } - - // At this point all planes are finalized but they may not be at the - // required size yet. Furthermore, they may have texture offsets that - // require realignment. 
For lack of something better to do, we assume - // the rgb/luma texture is the "reference" and scale everything else - // to match. - for (int n = 0; n < 4; n++) { - switch (tex[n].type) { - case PLANE_RGB: - case PLANE_XYZ: - case PLANE_LUMA: break; - default: continue; - } - - p->texture_w = tex[n].w; - p->texture_h = tex[n].h; - p->texture_offset = offsets[n]; - break; - } - - // Compute the reference rect - struct mp_rect_f src = {0.0, 0.0, p->image_params.w, p->image_params.h}; - struct mp_rect_f ref = src; - gl_transform_rect(p->texture_offset, &ref); - MP_DBG(p, "ref rect: {%f %f} {%f %f}\n", ref.x0, ref.y0, ref.x1, ref.y1); - - // Explicitly scale all of the textures that don't match - for (int n = 0; n < 4; n++) { - if (tex[n].type == PLANE_NONE) - continue; - - // If the planes are aligned identically, we will end up with the - // exact same source rectangle. - struct mp_rect_f rect = src; - gl_transform_rect(offsets[n], &rect); - MP_DBG(p, "rect[%d]: {%f %f} {%f %f}\n", n, - rect.x0, rect.y0, rect.x1, rect.y1); - - if (mp_rect_f_seq(ref, rect)) - continue; - - // If the rectangles differ, then our planes have a different - // alignment and/or size. First of all, we have to compute the - // corrections required to meet the target rectangle - struct gl_transform fix = { - .m = {{(ref.x1 - ref.x0) / (rect.x1 - rect.x0), 0.0}, - {0.0, (ref.y1 - ref.y0) / (rect.y1 - rect.y0)}}, - .t = {ref.x0, ref.y0}, - }; - MP_DBG(p, "-> fix[%d] = {%f %f} + off {%f %f}\n", n, - fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); - - // Since the scale in texture space is different from the scale in - // absolute terms, we have to scale the coefficients down to be - // relative to the texture's physical dimensions and local offset - struct gl_transform scale = { - .m = {{(float)tex[n].w / p->texture_w, 0.0}, - {0.0, (float)tex[n].h / p->texture_h}}, - .t = {-rect.x0, -rect.y0}, - }; - if (p->image_params.rotate % 180 == 90) - MPSWAP(double, scale.m[0][0], scale.m[1][1]); - - gl_transform_trans(scale, &fix); - MP_DBG(p, "-> scaled[%d] = {%f %f} + off {%f %f}\n", n, - fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); - - // Since the texture transform is a function of the texture coordinates - // to texture space, rather than the other way around, we have to - // actually apply the *inverse* of this. Fortunately, calculating - // the inverse is relatively easy here. 
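    // In other words: the forward map is x' = m*x + t with a diagonal m, so
    // its inverse is x = (1/m)*x' - t/m; the next four assignments invert the
    // diagonal entries and then rescale/negate the offset accordingly.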
- fix.m[0][0] = 1.0 / fix.m[0][0]; - fix.m[1][1] = 1.0 / fix.m[1][1]; - fix.t[0] = fix.m[0][0] * -fix.t[0]; - fix.t[1] = fix.m[1][1] * -fix.t[1]; - gl_transform_trans(fix, &tex[n].transform); - - int scaler_id = -1; - const char *name = NULL; - switch (tex[n].type) { - case PLANE_RGB: - case PLANE_LUMA: - case PLANE_XYZ: - scaler_id = SCALER_SCALE; - // these aren't worth hooking, fringe hypothetical cases only - break; - case PLANE_CHROMA: - scaler_id = SCALER_CSCALE; - name = "CHROMA_SCALED"; - break; - case PLANE_ALPHA: - // alpha always uses bilinear - name = "ALPHA_SCALED"; - } - - if (scaler_id < 0) - continue; - - const struct scaler_config *conf = &p->opts.scaler[scaler_id]; - struct scaler *scaler = &p->scaler[scaler_id]; - - // bilinear scaling is a free no-op thanks to GPU sampling - if (strcmp(conf->kernel.name, "bilinear") != 0) { - GLSLF("// upscaling plane %d\n", n); - pass_sample(p, tex[n], scaler, conf, 1.0, p->texture_w, p->texture_h); - finish_pass_fbo(p, &p->scale_fbo[n], p->texture_w, p->texture_h, 0); - tex[n] = img_tex_fbo(&p->scale_fbo[n], tex[n].type, tex[n].components); - } - - // Run any post-scaling hooks - tex[n] = pass_hook(p, name, tex[n], NULL); - } - - // All planes are of the same size and properly aligned at this point - GLSLF("// combining planes\n"); - int coord = 0; - for (int i = 0; i < 4; i++) { - if (tex[i].type != PLANE_NONE) - copy_img_tex(p, &coord, tex[i]); - } - p->components = coord; -} - -// Utility function that simply binds an FBO and reads from it, without any -// transformations. -static void pass_read_fbo(struct gl_video *p, struct fbotex *fbo) -{ - struct img_tex tex = img_tex_fbo(fbo, PLANE_RGB, p->components); - copy_img_tex(p, &(int){0}, tex); -} - -// yuv conversion, and any other conversions before main up/down-scaling -static void pass_convert_yuv(struct gl_video *p) -{ - struct gl_shader_cache *sc = p->sc; - - struct mp_csp_params cparams = MP_CSP_PARAMS_DEFAULTS; - cparams.gray = p->is_gray; - mp_csp_set_image_params(&cparams, &p->image_params); - mp_csp_equalizer_state_get(p->video_eq, &cparams); - p->user_gamma = 1.0 / (cparams.gamma * p->opts.gamma); - - pass_describe(p, "color conversion"); - - if (p->color_swizzle[0]) - GLSLF("color = color.%s;\n", p->color_swizzle); - - // Pre-colormatrix input gamma correction - if (cparams.color.space == MP_CSP_XYZ) - GLSL(color.rgb = pow(color.rgb, vec3(2.6));) // linear light - - // We always explicitly normalize the range in pass_read_video - cparams.input_bits = cparams.texture_bits = 0; - - // Conversion to RGB. For RGB itself, this still applies e.g. brightness - // and contrast controls, or expansion of e.g. LSB-packed 10 bit data. - struct mp_cmat m = {{{0}}}; - mp_get_csp_matrix(&cparams, &m); - gl_sc_uniform_mat3(sc, "colormatrix", true, &m.m[0][0]); - gl_sc_uniform_vec3(sc, "colormatrix_c", m.c); - - GLSL(color.rgb = mat3(colormatrix) * color.rgb + colormatrix_c;) - - if (p->image_params.color.space == MP_CSP_BT_2020_C) { - // Conversion for C'rcY'cC'bc via the BT.2020 CL system: - // C'bc = (B'-Y'c) / 1.9404 | C'bc <= 0 - // = (B'-Y'c) / 1.5816 | C'bc > 0 - // - // C'rc = (R'-Y'c) / 1.7184 | C'rc <= 0 - // = (R'-Y'c) / 0.9936 | C'rc > 0 - // - // as per the BT.2020 specification, table 4. This is a non-linear - // transformation because (constant) luminance receives non-equal - // contributions from the three different channels. 
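    // Quick numeric check of the mix() below: a sample with C'bc = -0.10
    // (<= 0) picks the 1.9404 factor, giving B' - Y'c = -0.19404, while
    // C'rc = +0.10 picks 0.9936, giving R' - Y'c = 0.09936; adding Y'c
    // (color.gg) afterwards recovers B' and R' as per BT.2020 table 4.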
- GLSLF("// constant luminance conversion\n"); - GLSL(color.br = color.br * mix(vec2(1.5816, 0.9936), - vec2(1.9404, 1.7184), - lessThanEqual(color.br, vec2(0))) - + color.gg;) - // Expand channels to camera-linear light. This shader currently just - // assumes everything uses the BT.2020 12-bit gamma function, since the - // difference between 10 and 12-bit is negligible for anything other - // than 12-bit content. - GLSL(color.rgb = mix(color.rgb * vec3(1.0/4.5), - pow((color.rgb + vec3(0.0993))*vec3(1.0/1.0993), - vec3(1.0/0.45)), - lessThanEqual(vec3(0.08145), color.rgb));) - // Calculate the green channel from the expanded RYcB - // The BT.2020 specification says Yc = 0.2627*R + 0.6780*G + 0.0593*B - GLSL(color.g = (color.g - 0.2627*color.r - 0.0593*color.b)*1.0/0.6780;) - // Recompress to receive the R'G'B' result, same as other systems - GLSL(color.rgb = mix(color.rgb * vec3(4.5), - vec3(1.0993) * pow(color.rgb, vec3(0.45)) - vec3(0.0993), - lessThanEqual(vec3(0.0181), color.rgb));) - } - - p->components = 3; - if (!p->has_alpha || p->opts.alpha_mode == ALPHA_NO) { - GLSL(color.a = 1.0;) - } else { // alpha present in image - p->components = 4; - GLSL(color = vec4(color.rgb * color.a, color.a);) - } -} - -static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]) -{ - double target_w = p->src_rect.x1 - p->src_rect.x0; - double target_h = p->src_rect.y1 - p->src_rect.y0; - if (transpose_rot && p->image_params.rotate % 180 == 90) - MPSWAP(double, target_w, target_h); - xy[0] = (p->dst_rect.x1 - p->dst_rect.x0) / target_w; - xy[1] = (p->dst_rect.y1 - p->dst_rect.y0) / target_h; -} - -// Cropping. -static void compute_src_transform(struct gl_video *p, struct gl_transform *tr) -{ - float sx = (p->src_rect.x1 - p->src_rect.x0) / (float)p->texture_w, - sy = (p->src_rect.y1 - p->src_rect.y0) / (float)p->texture_h, - ox = p->src_rect.x0, - oy = p->src_rect.y0; - struct gl_transform transform = {{{sx, 0}, {0, sy}}, {ox, oy}}; - - gl_transform_trans(p->texture_offset, &transform); - - *tr = transform; -} - -// Takes care of the main scaling and pre/post-conversions -static void pass_scale_main(struct gl_video *p) -{ - // Figure out the main scaler. - double xy[2]; - get_scale_factors(p, true, xy); - - // actual scale factor should be divided by the scale factor of prescaling. 
- xy[0] /= p->texture_offset.m[0][0]; - xy[1] /= p->texture_offset.m[1][1]; - - bool downscaling = xy[0] < 1.0 || xy[1] < 1.0; - bool upscaling = !downscaling && (xy[0] > 1.0 || xy[1] > 1.0); - double scale_factor = 1.0; - - struct scaler *scaler = &p->scaler[SCALER_SCALE]; - struct scaler_config scaler_conf = p->opts.scaler[SCALER_SCALE]; - if (p->opts.scaler_resizes_only && !downscaling && !upscaling) { - scaler_conf.kernel.name = "bilinear"; - // For scaler-resizes-only, we round the texture offset to - // the nearest round value in order to prevent ugly blurriness - // (in exchange for slightly shifting the image by up to half a - // subpixel) - p->texture_offset.t[0] = roundf(p->texture_offset.t[0]); - p->texture_offset.t[1] = roundf(p->texture_offset.t[1]); - } - if (downscaling && p->opts.scaler[SCALER_DSCALE].kernel.name) { - scaler_conf = p->opts.scaler[SCALER_DSCALE]; - scaler = &p->scaler[SCALER_DSCALE]; - } - - // When requesting correct-downscaling and the clip is anamorphic, and - // because only a single scale factor is used for both axes, enable it only - // when both axes are downscaled, and use the milder of the factors to not - // end up with too much blur on one axis (even if we end up with sub-optimal - // scale factor on the other axis). This is better than not respecting - // correct scaling at all for anamorphic clips. - double f = MPMAX(xy[0], xy[1]); - if (p->opts.correct_downscaling && f < 1.0) - scale_factor = 1.0 / f; - - // Pre-conversion, like linear light/sigmoidization - GLSLF("// scaler pre-conversion\n"); - bool use_linear = p->opts.linear_scaling || p->opts.sigmoid_upscaling; - - // Linear light downscaling results in nasty artifacts for HDR curves due - // to the potentially extreme brightness differences severely compounding - // any ringing. So just scale in gamma light instead. - if (mp_trc_is_hdr(p->image_params.color.gamma) && downscaling) - use_linear = false; - - if (use_linear) { - p->use_linear = true; - pass_linearize(p->sc, p->image_params.color.gamma); - pass_opt_hook_point(p, "LINEAR", NULL); - } - - bool use_sigmoid = use_linear && p->opts.sigmoid_upscaling && upscaling; - float sig_center, sig_slope, sig_offset, sig_scale; - if (use_sigmoid) { - // Coefficients for the sigmoidal transform are taken from the - // formula here: http://www.imagemagick.org/Usage/color_mods/#sigmoidal - sig_center = p->opts.sigmoid_center; - sig_slope = p->opts.sigmoid_slope; - // This function needs to go through (0,0) and (1,1) so we compute the - // values at 1 and 0, and then scale/shift them, respectively. - sig_offset = 1.0/(1+expf(sig_slope * sig_center)); - sig_scale = 1.0/(1+expf(sig_slope * (sig_center-1))) - sig_offset; - GLSLF("color.rgb = %f - log(1.0/(color.rgb * %f + %f) - 1.0) * 1.0/%f;\n", - sig_center, sig_scale, sig_offset, sig_slope); - pass_opt_hook_point(p, "SIGMOID", NULL); - } - - pass_opt_hook_point(p, "PREKERNEL", NULL); - - int vp_w = p->dst_rect.x1 - p->dst_rect.x0; - int vp_h = p->dst_rect.y1 - p->dst_rect.y0; - struct gl_transform transform; - compute_src_transform(p, &transform); - - GLSLF("// main scaling\n"); - finish_pass_fbo(p, &p->indirect_fbo, p->texture_w, p->texture_h, 0); - struct img_tex src = img_tex_fbo(&p->indirect_fbo, PLANE_RGB, p->components); - gl_transform_trans(transform, &src.transform); - pass_sample(p, src, scaler, &scaler_conf, scale_factor, vp_w, vp_h); - - // Changes the texture size to display size after main scaler. 
- p->texture_w = vp_w; - p->texture_h = vp_h; - - pass_opt_hook_point(p, "POSTKERNEL", NULL); - - GLSLF("// scaler post-conversion\n"); - if (use_sigmoid) { - // Inverse of the transformation above - GLSLF("color.rgb = (1.0/(1.0 + exp(%f * (%f - color.rgb))) - %f) * 1.0/%f;\n", - sig_slope, sig_center, sig_offset, sig_scale); - } -} - -// Adapts the colors to the right output color space. (Final pass during -// rendering) -// If OSD is true, ignore any changes that may have been made to the video -// by previous passes (i.e. linear scaling) -static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool osd) -{ - struct ra *ra = p->ra; - - // Figure out the target color space from the options, or auto-guess if - // none were set - struct mp_colorspace dst = { - .gamma = p->opts.target_trc, - .primaries = p->opts.target_prim, - .light = MP_CSP_LIGHT_DISPLAY, - }; - - if (p->use_lut_3d) { - // The 3DLUT is always generated against the video's original source - // space, *not* the reference space. (To avoid having to regenerate - // the 3DLUT for the OSD on every frame) - enum mp_csp_prim prim_orig = p->image_params.color.primaries; - enum mp_csp_trc trc_orig = p->image_params.color.gamma; - - // One exception: HDR is not implemented by LittleCMS for technical - // limitation reasons, so we use a gamma 2.2 input curve here instead. - // We could pick any value we want here, the difference is just coding - // efficiency. - if (mp_trc_is_hdr(trc_orig)) - trc_orig = MP_CSP_TRC_GAMMA22; - - if (gl_video_get_lut3d(p, prim_orig, trc_orig)) { - dst.primaries = prim_orig; - dst.gamma = trc_orig; - } - } - - if (dst.primaries == MP_CSP_PRIM_AUTO) { - // The vast majority of people are on sRGB or BT.709 displays, so pick - // this as the default output color space. - dst.primaries = MP_CSP_PRIM_BT_709; - - if (src.primaries == MP_CSP_PRIM_BT_601_525 || - src.primaries == MP_CSP_PRIM_BT_601_625) - { - // Since we auto-pick BT.601 and BT.709 based on the dimensions, - // combined with the fact that they're very similar to begin with, - // and to avoid confusing the average user, just don't adapt BT.601 - // content automatically at all. - dst.primaries = src.primaries; - } - } - - if (dst.gamma == MP_CSP_TRC_AUTO) { - // Most people seem to complain when the image is darker or brighter - // than what they're "used to", so just avoid changing the gamma - // altogether by default. The only exceptions to this rule apply to - // very unusual TRCs, which even hardcode technoluddites would probably - // not enjoy viewing unaltered. - dst.gamma = src.gamma; - - // Avoid outputting linear light or HDR content "by default". 
For these - // just pick gamma 2.2 as a default, since it's a good estimate for - // the response of typical displays - if (dst.gamma == MP_CSP_TRC_LINEAR || mp_trc_is_hdr(dst.gamma)) - dst.gamma = MP_CSP_TRC_GAMMA22; - } - - bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma); - if (detect_peak && !p->hdr_peak_ssbo) { - struct { - unsigned int sig_peak_raw; - unsigned int index; - unsigned int frame_max[PEAK_DETECT_FRAMES+1]; - } peak_ssbo = {0}; - - // Prefill with safe values - int safe = MP_REF_WHITE * mp_trc_nom_peak(p->image_params.color.gamma); - peak_ssbo.sig_peak_raw = PEAK_DETECT_FRAMES * safe; - for (int i = 0; i < PEAK_DETECT_FRAMES+1; i++) - peak_ssbo.frame_max[i] = safe; - - struct ra_buf_params params = { - .type = RA_BUF_TYPE_SHADER_STORAGE, - .size = sizeof(peak_ssbo), - .initial_data = &peak_ssbo, - }; - - p->hdr_peak_ssbo = ra_buf_create(ra, ¶ms); - if (!p->hdr_peak_ssbo) { - MP_WARN(p, "Failed to create HDR peak detection SSBO, disabling.\n"); - detect_peak = (p->opts.compute_hdr_peak = false); - } - } - - if (detect_peak) { - pass_describe(p, "detect HDR peak"); - pass_is_compute(p, 8, 8); // 8x8 is good for performance - gl_sc_ssbo(p->sc, "PeakDetect", p->hdr_peak_ssbo, - "uint sig_peak_raw;" - "uint index;" - "uint frame_max[%d];", PEAK_DETECT_FRAMES + 1 - ); - } - - // Adapt from src to dst as necessary - pass_color_map(p->sc, src, dst, p->opts.tone_mapping, - p->opts.tone_mapping_param, p->opts.tone_mapping_desat, - detect_peak, p->opts.gamut_warning, p->use_linear && !osd); - - if (p->use_lut_3d) { - gl_sc_uniform_texture(p->sc, "lut_3d", p->lut_3d_texture); - GLSL(vec3 cpos;) - for (int i = 0; i < 3; i++) - GLSLF("cpos[%d] = LUT_POS(color[%d], %d.0);\n", i, i, p->lut_3d_size[i]); - GLSL(color.rgb = tex3D(lut_3d, cpos).rgb;) - } -} - -void gl_video_set_fb_depth(struct gl_video *p, int fb_depth) -{ - p->fb_depth = fb_depth; -} - -static void pass_dither(struct gl_video *p) -{ - // Assume 8 bits per component if unknown. - int dst_depth = p->fb_depth > 0 ? p->fb_depth : 8; - if (p->opts.dither_depth > 0) - dst_depth = p->opts.dither_depth; - - if (p->opts.dither_depth < 0 || p->opts.dither_algo == DITHER_NONE) - return; - - if (!p->dither_texture) { - MP_VERBOSE(p, "Dither to %d.\n", dst_depth); - - int tex_size = 0; - void *tex_data = NULL; - const struct ra_format *fmt = NULL; - void *temp = NULL; - - if (p->opts.dither_algo == DITHER_FRUIT) { - int sizeb = p->opts.dither_size; - int size = 1 << sizeb; - - if (p->last_dither_matrix_size != size) { - p->last_dither_matrix = talloc_realloc(p, p->last_dither_matrix, - float, size * size); - mp_make_fruit_dither_matrix(p->last_dither_matrix, sizeb); - p->last_dither_matrix_size = size; - } - - // Prefer R16 texture since they provide higher precision. - fmt = ra_find_unorm_format(p->ra, 2, 1); - if (!fmt) - fmt = ra_find_float16_format(p->ra, 1); - if (fmt) { - tex_size = size; - tex_data = p->last_dither_matrix; - if (fmt->ctype == RA_CTYPE_UNORM) { - uint16_t *t = temp = talloc_array(NULL, uint16_t, size * size); - for (int n = 0; n < size * size; n++) - t[n] = p->last_dither_matrix[n] * UINT16_MAX; - tex_data = t; - } - } else { - MP_VERBOSE(p, "GL too old. 
Falling back to ordered dither.\n"); - p->opts.dither_algo = DITHER_ORDERED; - } - } - - if (p->opts.dither_algo == DITHER_ORDERED) { - temp = talloc_array(NULL, char, 8 * 8); - mp_make_ordered_dither_matrix(temp, 8); - - fmt = ra_find_unorm_format(p->ra, 1, 1); - tex_size = 8; - tex_data = temp; - } - - struct ra_tex_params params = { - .dimensions = 2, - .w = tex_size, - .h = tex_size, - .d = 1, - .format = fmt, - .render_src = true, - .src_repeat = true, - .initial_data = tex_data, - }; - p->dither_texture = ra_tex_create(p->ra, ¶ms); - - debug_check_gl(p, "dither setup"); - - talloc_free(temp); - } - - GLSLF("// dithering\n"); - - // This defines how many bits are considered significant for output on - // screen. The superfluous bits will be used for rounding according to the - // dither matrix. The precision of the source implicitly decides how many - // dither patterns can be visible. - int dither_quantization = (1 << dst_depth) - 1; - int dither_size = p->dither_texture->params.w; - - gl_sc_uniform_texture(p->sc, "dither", p->dither_texture); - - GLSLF("vec2 dither_pos = gl_FragCoord.xy * 1.0/%d.0;\n", dither_size); - - if (p->opts.temporal_dither) { - int phase = (p->frames_rendered / p->opts.temporal_dither_period) % 8u; - float r = phase * (M_PI / 2); // rotate - float m = phase < 4 ? 1 : -1; // mirror - - float matrix[2][2] = {{cos(r), -sin(r) }, - {sin(r) * m, cos(r) * m}}; - gl_sc_uniform_mat2(p->sc, "dither_trafo", true, &matrix[0][0]); - - GLSL(dither_pos = dither_trafo * dither_pos;) - } - - GLSL(float dither_value = texture(dither, dither_pos).r;) - GLSLF("color = floor(color * %d.0 + dither_value + 0.5 / %d.0) * 1.0/%d.0;\n", - dither_quantization, dither_size * dither_size, dither_quantization); -} - -// Draws the OSD, in scene-referred colors.. If cms is true, subtitles are -// instead adapted to the display's gamut. -static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts, - struct mp_osd_res rect, struct fbodst target, bool cms) -{ - mpgl_osd_generate(p->osd, rect, pts, p->image_params.stereo_out, draw_flags); - - timer_pool_start(p->osd_timer); - for (int n = 0; n < MAX_OSD_PARTS; n++) { - // (This returns false if this part is empty with nothing to draw.) - if (!mpgl_osd_draw_prepare(p->osd, n, p->sc)) - continue; - // When subtitles need to be color managed, assume they're in sRGB - // (for lack of anything saner to do) - if (cms) { - static const struct mp_colorspace csp_srgb = { - .primaries = MP_CSP_PRIM_BT_709, - .gamma = MP_CSP_TRC_SRGB, - .light = MP_CSP_LIGHT_DISPLAY, - }; - - pass_colormanage(p, csp_srgb, true); - } - mpgl_osd_draw_finish(p->osd, n, p->sc, target); - } - - timer_pool_stop(p->osd_timer); - pass_describe(p, "drawing osd"); - pass_record(p, timer_pool_measure(p->osd_timer)); -} - -static float chroma_realign(int size, int pixel) -{ - return size / (float)chroma_upsize(size, pixel); -} - -// Minimal rendering code path, for GLES or OpenGL 2.1 without proper FBOs. -static void pass_render_frame_dumb(struct gl_video *p) -{ - struct img_tex tex[4]; - struct gl_transform off[4]; - pass_get_img_tex(p, &p->image, tex, off); - - struct gl_transform transform; - compute_src_transform(p, &transform); - - int index = 0; - for (int i = 0; i < p->plane_count; i++) { - int cw = tex[i].type == PLANE_CHROMA ? p->ra_format.chroma_w : 1; - int ch = tex[i].type == PLANE_CHROMA ? 
p->ra_format.chroma_h : 1; - if (p->image_params.rotate % 180 == 90) - MPSWAP(int, cw, ch); - - struct gl_transform t = transform; - t.m[0][0] *= chroma_realign(p->texture_w, cw); - t.m[1][1] *= chroma_realign(p->texture_h, ch); - - t.t[0] /= cw; - t.t[1] /= ch; - - t.t[0] += off[i].t[0]; - t.t[1] += off[i].t[1]; - - gl_transform_trans(tex[i].transform, &t); - tex[i].transform = t; - - copy_img_tex(p, &index, tex[i]); - } - - pass_convert_yuv(p); -} - -// The main rendering function, takes care of everything up to and including -// upscaling. p->image is rendered. -static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t id) -{ - // initialize the texture parameters and temporary variables - p->texture_w = p->image_params.w; - p->texture_h = p->image_params.h; - p->texture_offset = identity_trans; - p->components = 0; - p->saved_tex_num = 0; - p->hook_fbo_num = 0; - p->use_linear = false; - - // try uploading the frame - if (!pass_upload_image(p, mpi, id)) - return false; - - if (p->image_params.rotate % 180 == 90) - MPSWAP(int, p->texture_w, p->texture_h); - - if (p->dumb_mode) - return true; - - pass_read_video(p); - pass_opt_hook_point(p, "NATIVE", &p->texture_offset); - pass_convert_yuv(p); - pass_opt_hook_point(p, "MAINPRESUB", &p->texture_offset); - - // For subtitles - double vpts = p->image.mpi->pts; - if (vpts == MP_NOPTS_VALUE) - vpts = p->osd_pts; - - if (p->osd && p->opts.blend_subs == BLEND_SUBS_VIDEO) { - double scale[2]; - get_scale_factors(p, false, scale); - struct mp_osd_res rect = { - .w = p->texture_w, .h = p->texture_h, - .display_par = scale[1] / scale[0], // counter compensate scaling - }; - finish_pass_fbo(p, &p->blend_subs_fbo, rect.w, rect.h, 0); - pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, - p->blend_subs_fbo.fbo, false); - pass_read_fbo(p, &p->blend_subs_fbo); - pass_describe(p, "blend subs video"); - } - pass_opt_hook_point(p, "MAIN", &p->texture_offset); - - pass_scale_main(p); - - int vp_w = p->dst_rect.x1 - p->dst_rect.x0, - vp_h = p->dst_rect.y1 - p->dst_rect.y0; - if (p->osd && p->opts.blend_subs == BLEND_SUBS_YES) { - // Recreate the real video size from the src/dst rects - struct mp_osd_res rect = { - .w = vp_w, .h = vp_h, - .ml = -p->src_rect.x0, .mr = p->src_rect.x1 - p->image_params.w, - .mt = -p->src_rect.y0, .mb = p->src_rect.y1 - p->image_params.h, - .display_par = 1.0, - }; - // Adjust margins for scale - double scale[2]; - get_scale_factors(p, true, scale); - rect.ml *= scale[0]; rect.mr *= scale[0]; - rect.mt *= scale[1]; rect.mb *= scale[1]; - // We should always blend subtitles in non-linear light - if (p->use_linear) { - pass_delinearize(p->sc, p->image_params.color.gamma); - p->use_linear = false; - } - finish_pass_fbo(p, &p->blend_subs_fbo, p->texture_w, p->texture_h, 0); - pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, - p->blend_subs_fbo.fbo, false); - pass_read_fbo(p, &p->blend_subs_fbo); - pass_describe(p, "blend subs"); - } - - pass_opt_hook_point(p, "SCALED", NULL); - - return true; -} - -static void pass_draw_to_screen(struct gl_video *p, struct fbodst fbo) -{ - if (p->dumb_mode) - pass_render_frame_dumb(p); - - // Adjust the overall gamma before drawing to screen - if (p->user_gamma != 1) { - gl_sc_uniform_f(p->sc, "user_gamma", p->user_gamma); - GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) - GLSL(color.rgb = pow(color.rgb, vec3(user_gamma));) - } - - pass_colormanage(p, p->image_params.color, false); - - // Since finish_pass_direct doesn't work with compute shaders, and neither - // does the 
checkerboard/dither code, we may need an indirection via - // p->screen_fbo here. - if (p->pass_compute.active) { - int o_w = p->dst_rect.x1 - p->dst_rect.x0, - o_h = p->dst_rect.y1 - p->dst_rect.y0; - finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY); - struct img_tex tmp = img_tex_fbo(&p->screen_fbo, PLANE_RGB, p->components); - copy_img_tex(p, &(int){0}, tmp); - } - - if (p->has_alpha){ - if (p->opts.alpha_mode == ALPHA_BLEND_TILES) { - // Draw checkerboard pattern to indicate transparency - GLSLF("// transparency checkerboard\n"); - GLSL(bvec2 tile = lessThan(fract(gl_FragCoord.xy * 1.0/32.0), vec2(0.5));) - GLSL(vec3 background = vec3(tile.x == tile.y ? 1.0 : 0.75);) - GLSL(color.rgb = mix(background, color.rgb, color.a);) - } else if (p->opts.alpha_mode == ALPHA_BLEND) { - // Blend into background color (usually black) - struct m_color c = p->opts.background; - GLSLF("vec4 background = vec4(%f, %f, %f, %f);\n", - c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0); - GLSL(color = mix(background, vec4(color.rgb, 1.0), color.a);) - } - } - - pass_opt_hook_point(p, "OUTPUT", NULL); - - pass_dither(p); - pass_describe(p, "output to screen"); - finish_pass_direct(p, fbo, &p->dst_rect); -} - -static bool update_fbosurface(struct gl_video *p, struct mp_image *mpi, - uint64_t id, struct fbosurface *surf) -{ - int vp_w = p->dst_rect.x1 - p->dst_rect.x0, - vp_h = p->dst_rect.y1 - p->dst_rect.y0; - - pass_info_reset(p, false); - if (!pass_render_frame(p, mpi, id)) - return false; - - // Frame blending should always be done in linear light to preserve the - // overall brightness, otherwise this will result in flashing dark frames - // because mixing in compressed light artificially darkens the results - if (!p->use_linear) { - p->use_linear = true; - pass_linearize(p->sc, p->image_params.color.gamma); - } - - finish_pass_fbo(p, &surf->fbotex, vp_w, vp_h, FBOTEX_FUZZY); - surf->id = id; - surf->pts = mpi->pts; - return true; -} - -// Draws an interpolate frame to fbo, based on the frame timing in t -static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, - struct fbodst fbo) -{ - bool is_new = false; - - // Reset the queue completely if this is a still image, to avoid any - // interpolation artifacts from surrounding frames when unpausing or - // framestepping - if (t->still) - gl_video_reset_surfaces(p); - - // First of all, figure out if we have a frame available at all, and draw - // it manually + reset the queue if not - if (p->surfaces[p->surface_now].id == 0) { - struct fbosurface *now = &p->surfaces[p->surface_now]; - if (!update_fbosurface(p, t->current, t->frame_id, now)) - return; - p->surface_idx = p->surface_now; - is_new = true; - } - - // Find the right frame for this instant - if (t->current) { - int next = fbosurface_wrap(p->surface_now + 1); - while (p->surfaces[next].id && - p->surfaces[next].id > p->surfaces[p->surface_now].id && - p->surfaces[p->surface_now].id < t->frame_id) - { - p->surface_now = next; - next = fbosurface_wrap(next + 1); - } - } - - // Figure out the queue size. For illustration, a filter radius of 2 would - // look like this: _ A [B] C D _ - // A is surface_bse, B is surface_now, C is surface_now+1 and D is - // surface_end. 
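As an illustration of the ring-buffer arithmetic sketched in the comment above, here is a minimal standalone C sketch. The QUEUE_SIZE constant and wrap() helper are stand-ins for the real surface array size and fbosurface_wrap(), which are defined elsewhere in this file and not shown in this hunk.

    #include <stdio.h>

    #define QUEUE_SIZE 10  // stand-in for the real surface array size

    // Stand-in for fbosurface_wrap(): wrap an index into [0, QUEUE_SIZE).
    static int wrap(int i)
    {
        return (i % QUEUE_SIZE + QUEUE_SIZE) % QUEUE_SIZE;
    }

    int main(void)
    {
        int radius = 2;          // e.g. a tscale filter of size 4
        int size = radius * 2;   // number of queued frames blended together
        int surface_now = 3;     // arbitrary current position [B]

        int surface_bse = wrap(surface_now - (radius - 1));  // A
        int surface_end = wrap(surface_now + radius);        // D

        // For radius 2 this prints the layout  _ A [B] C D _  as indices 2 3 4 5,
        // and wrap(surface_bse + size - 1) == surface_end, matching the assert below.
        for (int i = 0; i < size; i++)
            printf("%d ", wrap(surface_bse + i));
        printf("\n");
        return 0;
    }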
- struct scaler *tscale = &p->scaler[SCALER_TSCALE]; - reinit_scaler(p, tscale, &p->opts.scaler[SCALER_TSCALE], 1, tscale_sizes); - bool oversample = strcmp(tscale->conf.kernel.name, "oversample") == 0; - bool linear = strcmp(tscale->conf.kernel.name, "linear") == 0; - int size; - - if (oversample || linear) { - size = 2; - } else { - assert(tscale->kernel && !tscale->kernel->polar); - size = ceil(tscale->kernel->size); - assert(size <= TEXUNIT_VIDEO_NUM); - } - - int radius = size/2; - int surface_now = p->surface_now; - int surface_bse = fbosurface_wrap(surface_now - (radius-1)); - int surface_end = fbosurface_wrap(surface_now + radius); - assert(fbosurface_wrap(surface_bse + size-1) == surface_end); - - // Render new frames while there's room in the queue. Note that technically, - // this should be done before the step where we find the right frame, but - // it only barely matters at the very beginning of playback, and this way - // makes the code much more linear. - int surface_dst = fbosurface_wrap(p->surface_idx + 1); - for (int i = 0; i < t->num_frames; i++) { - // Avoid overwriting data we might still need - if (surface_dst == surface_bse - 1) - break; - - struct mp_image *f = t->frames[i]; - uint64_t f_id = t->frame_id + i; - if (!mp_image_params_equal(&f->params, &p->real_image_params)) - continue; - - if (f_id > p->surfaces[p->surface_idx].id) { - struct fbosurface *dst = &p->surfaces[surface_dst]; - if (!update_fbosurface(p, f, f_id, dst)) - return; - p->surface_idx = surface_dst; - surface_dst = fbosurface_wrap(surface_dst + 1); - is_new = true; - } - } - - // Figure out whether the queue is "valid". A queue is invalid if the - // frames' PTS is not monotonically increasing. Anything else is invalid, - // so avoid blending incorrect data and just draw the latest frame as-is. - // Possible causes for failure of this condition include seeks, pausing, - // end of playback or start of playback. - bool valid = true; - for (int i = surface_bse, ii; valid && i != surface_end; i = ii) { - ii = fbosurface_wrap(i + 1); - if (p->surfaces[i].id == 0 || p->surfaces[ii].id == 0) { - valid = false; - } else if (p->surfaces[ii].id < p->surfaces[i].id) { - valid = false; - MP_DBG(p, "interpolation queue underrun\n"); - } - } - - // Update OSD PTS to synchronize subtitles with the displayed frame - p->osd_pts = p->surfaces[surface_now].pts; - - // Finally, draw the right mix of frames to the screen. - if (!is_new) - pass_info_reset(p, true); - pass_describe(p, "interpolation"); - if (!valid || t->still) { - // surface_now is guaranteed to be valid, so we can safely use it. - pass_read_fbo(p, &p->surfaces[surface_now].fbotex); - p->is_interpolated = false; - } else { - double mix = t->vsync_offset / t->ideal_frame_duration; - // The scaler code always wants the fcoord to be between 0 and 1, - // so we try to adjust by using the previous set of N frames instead - // (which requires some extra checking to make sure it's valid) - if (mix < 0.0) { - int prev = fbosurface_wrap(surface_bse - 1); - if (p->surfaces[prev].id != 0 && - p->surfaces[prev].id < p->surfaces[surface_bse].id) - { - mix += 1.0; - surface_bse = prev; - } else { - mix = 0.0; // at least don't blow up, this should only - // ever happen at the start of playback - } - } - - if (oversample) { - // Oversample uses the frame area as mix ratio, not the the vsync - // position itself - double vsync_dist = t->vsync_interval / t->ideal_frame_duration, - threshold = tscale->conf.kernel.params[0]; - threshold = isnan(threshold) ? 
0.0 : threshold; - mix = (1 - mix) / vsync_dist; - mix = mix <= 0 + threshold ? 0 : mix; - mix = mix >= 1 - threshold ? 1 : mix; - mix = 1 - mix; - } - - // Blend the frames together - if (oversample || linear) { - gl_sc_uniform_f(p->sc, "inter_coeff", mix); - GLSL(color = mix(texture(texture0, texcoord0), - texture(texture1, texcoord1), - inter_coeff);) - } else { - gl_sc_uniform_f(p->sc, "fcoord", mix); - pass_sample_separated_gen(p->sc, tscale, 0, 0); - } - - // Load all the required frames - for (int i = 0; i < size; i++) { - struct img_tex img = - img_tex_fbo(&p->surfaces[fbosurface_wrap(surface_bse+i)].fbotex, - PLANE_RGB, p->components); - // Since the code in pass_sample_separated currently assumes - // the textures are bound in-order and starting at 0, we just - // assert to make sure this is the case (which it should always be) - int id = pass_bind(p, img); - assert(id == i); - } - - MP_DBG(p, "inter frame dur: %f vsync: %f, mix: %f\n", - t->ideal_frame_duration, t->vsync_interval, mix); - p->is_interpolated = true; - } - pass_draw_to_screen(p, fbo); - - p->frames_drawn += 1; -} - -void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, - struct fbodst target) -{ - struct mp_rect target_rc = {0, 0, target.tex->params.w, target.tex->params.h}; - - p->broken_frame = false; - - bool has_frame = !!frame->current; - - if (!has_frame || !mp_rect_equals(&p->dst_rect, &target_rc)) { - struct m_color c = p->clear_color; - float color[4] = {c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0}; - p->ra->fns->clear(p->ra, target.tex, color, &target_rc); - } - - if (p->hwdec_active && p->hwdec->driver->overlay_frame) { - if (has_frame) { - float *color = p->hwdec->overlay_colorkey; - p->ra->fns->clear(p->ra, target.tex, color, &p->dst_rect); - } - - p->hwdec->driver->overlay_frame(p->hwdec, frame->current, - &p->src_rect, &p->dst_rect, - frame->frame_id != p->image.id); - - if (frame->current) - p->osd_pts = frame->current->pts; - - // Disable GL rendering - has_frame = false; - } - - if (has_frame) { - bool interpolate = p->opts.interpolation && frame->display_synced && - (p->frames_drawn || !frame->still); - if (interpolate) { - double ratio = frame->ideal_frame_duration / frame->vsync_interval; - if (fabs(ratio - 1.0) < p->opts.interpolation_threshold) - interpolate = false; - } - - if (interpolate) { - gl_video_interpolate_frame(p, frame, target); - } else { - bool is_new = frame->frame_id != p->image.id; - - // Redrawing a frame might update subtitles. 
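To make the interpolation-threshold check earlier in gl_video_render_frame concrete, a small self-contained sketch of the same decision follows. The 0.01 threshold is only an example value, not necessarily the option's default.

    #include <math.h>
    #include <stdbool.h>
    #include <stdio.h>

    // Mirrors the check above: interpolation is skipped when the frame duration
    // already (almost) matches the display's vsync interval.
    static bool want_interpolation(double frame_dur, double vsync, double threshold)
    {
        double ratio = frame_dur / vsync;
        return fabs(ratio - 1.0) >= threshold;
    }

    int main(void)
    {
        double thr = 0.01; // example threshold, not necessarily the real default
        // 23.976 fps content on a 23.976 Hz display: ratio ~= 1.0 -> prints 0 (skip)
        printf("%d\n", want_interpolation(1.0 / 23.976, 1.0 / 23.976, thr));
        // 23.976 fps content on a 60 Hz display: ratio ~= 2.5 -> prints 1 (blend)
        printf("%d\n", want_interpolation(1.0 / 23.976, 1.0 / 60.0, thr));
        return 0;
    }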
- if (frame->still && p->opts.blend_subs) - is_new = true; - - if (is_new || !p->output_fbo_valid) { - p->output_fbo_valid = false; - - pass_info_reset(p, !is_new); - if (!pass_render_frame(p, frame->current, frame->frame_id)) - goto done; - - // For the non-interpolation case, we draw to a single "cache" - // FBO to speed up subsequent re-draws (if any exist) - struct fbodst dest_fbo = target; - if (frame->num_vsyncs > 1 && frame->display_synced && - !p->dumb_mode && (p->ra->caps & RA_CAP_BLIT)) - { - fbotex_change(&p->output_fbo, p->ra, p->log, - target.tex->params.w, target.tex->params.h, - p->fbo_format, FBOTEX_FUZZY); - dest_fbo = p->output_fbo.fbo; - p->output_fbo_valid = true; - } - pass_draw_to_screen(p, dest_fbo); - } - - // "output fbo valid" and "output fbo needed" are equivalent - if (p->output_fbo_valid) { - pass_info_reset(p, true); - pass_describe(p, "redraw cached frame"); - struct mp_rect src = p->dst_rect; - struct mp_rect dst = src; - if (target.flip) { - dst.y0 = target.tex->params.h - src.y0; - dst.y1 = target.tex->params.h - src.y1; - } - timer_pool_start(p->blit_timer); - p->ra->fns->blit(p->ra, target.tex, p->output_fbo.tex, - &dst, &src); - timer_pool_stop(p->blit_timer); - pass_record(p, timer_pool_measure(p->blit_timer)); - } - } - } - -done: - - unmap_current_image(p); - - debug_check_gl(p, "after video rendering"); - - if (p->osd) { - // If we haven't actually drawn anything so far, then we technically - // need to consider this the start of a new pass. Let's call it a - // redraw just because, since it's basically a blank frame anyway - if (!has_frame) - pass_info_reset(p, true); - - pass_draw_osd(p, p->opts.blend_subs ? OSD_DRAW_OSD_ONLY : 0, - p->osd_pts, p->osd_rect, target, true); - debug_check_gl(p, "after OSD rendering"); - } - - if (gl_sc_error_state(p->sc) || p->broken_frame) { - // Make the screen solid blue to make it visually clear that an - // error has occurred - float color[4] = {0.0, 0.05, 0.5, 1.0}; - p->ra->fns->clear(p->ra, target.tex, color, &target_rc); - } - - // The playloop calls this last before waiting some time until it decides - // to call flip_page(). Tell OpenGL to start execution of the GPU commands - // while we sleep (this happens asynchronously). - if ((p->opts.early_flush == -1 && !frame->display_synced) || - p->opts.early_flush == 1) - { - if (p->ra->fns->flush) - p->ra->fns->flush(p->ra); - } - - p->frames_rendered++; - pass_report_performance(p); -} - -// Use this color instead of the global option. -void gl_video_set_clear_color(struct gl_video *p, struct m_color c) -{ - p->force_clear_color = true; - p->clear_color = c; -} - -void gl_video_set_osd_pts(struct gl_video *p, double pts) -{ - p->osd_pts = pts; -} - -bool gl_video_check_osd_change(struct gl_video *p, struct mp_osd_res *res, - double pts) -{ - return p->osd ? 
mpgl_osd_check_change(p->osd, res, pts) : false; -} - -void gl_video_resize(struct gl_video *p, - struct mp_rect *src, struct mp_rect *dst, - struct mp_osd_res *osd) -{ - if (mp_rect_equals(&p->src_rect, src) && - mp_rect_equals(&p->dst_rect, dst) && - osd_res_equals(p->osd_rect, *osd)) - return; - - p->src_rect = *src; - p->dst_rect = *dst; - p->osd_rect = *osd; - - gl_video_reset_surfaces(p); - - if (p->osd) - mpgl_osd_resize(p->osd, p->osd_rect, p->image_params.stereo_out); -} - -static void frame_perf_data(struct pass_info pass[], struct mp_frame_perf *out) -{ - for (int i = 0; i < PASS_INFO_MAX; i++) { - if (!pass[i].desc.len) - break; - out->perf[out->count] = pass[i].perf; - out->desc[out->count] = pass[i].desc.start; - out->count++; - } -} - -void gl_video_perfdata(struct gl_video *p, struct voctrl_performance_data *out) -{ - *out = (struct voctrl_performance_data){0}; - frame_perf_data(p->pass_fresh, &out->fresh); - frame_perf_data(p->pass_redraw, &out->redraw); -} - -// This assumes nv12, with textures set to GL_NEAREST filtering. -static void reinterleave_vdpau(struct gl_video *p, - struct ra_tex *input[4], struct ra_tex *output[2]) -{ - for (int n = 0; n < 2; n++) { - struct fbotex *fbo = &p->vdpau_deinterleave_fbo[n]; - // This is an array of the 2 to-merge planes. - struct ra_tex **src = &input[n * 2]; - int w = src[0]->params.w; - int h = src[0]->params.h; - int ids[2]; - for (int t = 0; t < 2; t++) { - ids[t] = pass_bind(p, (struct img_tex){ - .tex = src[t], - .multiplier = 1.0, - .transform = identity_trans, - .w = w, - .h = h, - }); - } - - GLSLF("color = fract(gl_FragCoord.y * 0.5) < 0.5\n"); - GLSLF(" ? texture(texture%d, texcoord%d)\n", ids[0], ids[0]); - GLSLF(" : texture(texture%d, texcoord%d);", ids[1], ids[1]); - - const struct ra_format *fmt = - ra_find_unorm_format(p->ra, 1, n == 0 ? 1 : 2); - fbotex_change(fbo, p->ra, p->log, w, h * 2, fmt, 0); - - pass_describe(p, "vdpau reinterleaving"); - finish_pass_direct(p, fbo->fbo, &(struct mp_rect){0, 0, w, h * 2}); - - output[n] = fbo->tex; - } -} - -// Returns false on failure. 
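The fract(gl_FragCoord.y * 0.5) < 0.5 expression in the reinterleave_vdpau() shader above simply alternates between the two field textures per output row. A minimal CPU-side sketch of that parity logic, assuming the GL_NEAREST sampling noted in the comment above (row indices here are illustrative, not taken from the shader):

    #include <stdio.h>

    int main(void)
    {
        int h = 4; // height of one field; the merged texture is 2*h rows tall
        for (int y = 0; y < 2 * h; y++) {
            // gl_FragCoord.y for row y is y + 0.5, so fract((y + 0.5) * 0.5)
            // is 0.25 for even rows and 0.75 for odd rows.
            int field = y % 2;   // 0 -> first field texture, 1 -> second
            int src_row = y / 2; // both fields are h rows tall
            printf("out row %d <- field %d, row %d\n", y, field, src_row);
        }
        return 0;
    }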
-static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t id) -{ - struct video_image *vimg = &p->image; - - if (vimg->id == id) - return true; - - unref_current_image(p); - - mpi = mp_image_new_ref(mpi); - if (!mpi) - goto error; - - vimg->mpi = mpi; - vimg->id = id; - p->osd_pts = mpi->pts; - p->frames_uploaded++; - - if (p->hwdec_active) { - // Hardware decoding - - if (!p->hwdec_mapper) - goto error; - - pass_describe(p, "map frame (hwdec)"); - timer_pool_start(p->upload_timer); - bool ok = ra_hwdec_mapper_map(p->hwdec_mapper, vimg->mpi) >= 0; - timer_pool_stop(p->upload_timer); - pass_record(p, timer_pool_measure(p->upload_timer)); - - vimg->hwdec_mapped = true; - if (ok) { - struct mp_image layout = {0}; - mp_image_set_params(&layout, &p->image_params); - struct ra_tex **tex = p->hwdec_mapper->tex; - struct ra_tex *tmp[4] = {0}; - if (p->hwdec_mapper->vdpau_fields) { - reinterleave_vdpau(p, tex, tmp); - tex = tmp; - } - for (int n = 0; n < p->plane_count; n++) { - vimg->planes[n] = (struct texplane){ - .w = mp_image_plane_w(&layout, n), - .h = mp_image_plane_h(&layout, n), - .tex = tex[n], - }; - } - } else { - MP_FATAL(p, "Mapping hardware decoded surface failed.\n"); - goto error; - } - return true; - } - - // Software decoding - assert(mpi->num_planes == p->plane_count); - - timer_pool_start(p->upload_timer); - for (int n = 0; n < p->plane_count; n++) { - struct texplane *plane = &vimg->planes[n]; - - plane->flipped = mpi->stride[0] < 0; - - struct ra_tex_upload_params params = { - .tex = plane->tex, - .src = mpi->planes[n], - .invalidate = true, - .stride = mpi->stride[n], - }; - - struct dr_buffer *mapped = gl_find_dr_buffer(p, mpi->planes[n]); - if (mapped) { - params.buf = mapped->buf; - params.buf_offset = (uintptr_t)params.src - - (uintptr_t)mapped->buf->data; - params.src = NULL; - } - - if (p->using_dr_path != !!mapped) { - p->using_dr_path = !!mapped; - MP_VERBOSE(p, "DR enabled: %s\n", p->using_dr_path ? "yes" : "no"); - } - - if (!p->ra->fns->tex_upload(p->ra, ¶ms)) { - timer_pool_stop(p->upload_timer); - goto error; - } - - if (mapped && !mapped->mpi) - mapped->mpi = mp_image_new_ref(mpi); - } - timer_pool_stop(p->upload_timer); - - bool using_pbo = p->ra->use_pbo || !(p->ra->caps & RA_CAP_DIRECT_UPLOAD); - const char *mode = p->using_dr_path ? "DR" : using_pbo ? "PBO" : "naive"; - pass_describe(p, "upload frame (%s)", mode); - pass_record(p, timer_pool_measure(p->upload_timer)); - - return true; - -error: - unref_current_image(p); - p->broken_frame = true; - return false; -} - -static bool test_fbo(struct gl_video *p, const struct ra_format *fmt) -{ - MP_VERBOSE(p, "Testing FBO format %s\n", fmt->name); - struct fbotex fbo = {0}; - bool success = fbotex_change(&fbo, p->ra, p->log, 16, 16, fmt, 0); - fbotex_uninit(&fbo); - return success; -} - -// Return whether dumb-mode can be used without disabling any features. -// Essentially, vo_opengl with mostly default settings will return true. 
-static bool check_dumb_mode(struct gl_video *p) -{ - struct gl_video_opts *o = &p->opts; - if (p->use_integer_conversion) - return false; - if (o->dumb_mode > 0) // requested by user - return true; - if (o->dumb_mode < 0) // disabled by user - return false; - - // otherwise, use auto-detection - if (o->target_prim || o->target_trc || o->linear_scaling || - o->correct_downscaling || o->sigmoid_upscaling || o->interpolation || - o->blend_subs || o->deband || o->unsharp) - return false; - // check remaining scalers (tscale is already implicitly excluded above) - for (int i = 0; i < SCALER_COUNT; i++) { - if (i != SCALER_TSCALE) { - const char *name = o->scaler[i].kernel.name; - if (name && strcmp(name, "bilinear") != 0) - return false; - } - } - if (o->user_shaders && o->user_shaders[0]) - return false; - if (p->use_lut_3d) - return false; - return true; -} - -// Disable features that are not supported with the current OpenGL version. -static void check_gl_features(struct gl_video *p) -{ - struct ra *ra = p->ra; - bool have_float_tex = !!ra_find_float16_format(ra, 1); - bool have_mglsl = ra->glsl_version >= 130; // modern GLSL - const struct ra_format *rg_tex = ra_find_unorm_format(p->ra, 1, 2); - bool have_texrg = rg_tex && !rg_tex->luminance_alpha; - bool have_compute = ra->caps & RA_CAP_COMPUTE; - bool have_ssbo = ra->caps & RA_CAP_BUF_RW; - - const char *auto_fbo_fmts[] = {"rgba16", "rgba16f", "rgb10_a2", "rgba8", 0}; - const char *user_fbo_fmts[] = {p->opts.fbo_format, 0}; - const char **fbo_fmts = user_fbo_fmts[0] && strcmp(user_fbo_fmts[0], "auto") - ? user_fbo_fmts : auto_fbo_fmts; - bool have_fbo = false; - p->fbo_format = NULL; - for (int n = 0; fbo_fmts[n]; n++) { - const char *fmt = fbo_fmts[n]; - const struct ra_format *f = ra_find_named_format(p->ra, fmt); - if (!f && fbo_fmts == user_fbo_fmts) - MP_WARN(p, "FBO format '%s' not found!\n", fmt); - if (f && f->renderable && f->linear_filter && test_fbo(p, f)) { - MP_VERBOSE(p, "Using FBO format %s.\n", f->name); - have_fbo = true; - p->fbo_format = f; - break; - } - } - - p->forced_dumb_mode = p->opts.dumb_mode > 0 || !have_fbo || !have_texrg; - bool voluntarily_dumb = check_dumb_mode(p); - if (p->forced_dumb_mode || voluntarily_dumb) { - if (voluntarily_dumb) { - MP_VERBOSE(p, "No advanced processing required. Enabling dumb mode.\n"); - } else if (p->opts.dumb_mode <= 0) { - MP_WARN(p, "High bit depth FBOs unsupported. Enabling dumb mode.\n" - "Most extended features will be disabled.\n"); - } - p->dumb_mode = true; - p->use_lut_3d = false; - // Most things don't work, so whitelist all options that still work. 
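A hedged, generic sketch of the format-probing loop used in check_gl_features() above: try a user-forced FBO format first, otherwise walk a priority list until one passes the usability test. The format names are taken from the code above, but format_usable() is a placeholder standing in for the real "renderable && linear_filter && test_fbo()" predicate.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    // Placeholder for "renderable && linear_filter && test_fbo()".
    static bool format_usable(const char *name)
    {
        return strcmp(name, "rgba16f") == 0 || strcmp(name, "rgba8") == 0;
    }

    // Probe a user-forced format first, otherwise fall through a priority list.
    static const char *pick_fbo_format(const char *user_fmt)
    {
        static const char *auto_fmts[] = {"rgba16", "rgba16f", "rgb10_a2", "rgba8", NULL};
        const char *user_fmts[] = {user_fmt, NULL};
        const char **fmts = user_fmt && strcmp(user_fmt, "auto") ? user_fmts : auto_fmts;

        for (int n = 0; fmts[n]; n++) {
            if (format_usable(fmts[n]))
                return fmts[n];
        }
        return NULL; // no usable FBO format -> dumb mode in the real renderer
    }

    int main(void)
    {
        printf("%s\n", pick_fbo_format("auto"));  // -> rgba16f (first usable in the list)
        printf("%s\n", pick_fbo_format("rgba8")); // -> rgba8 (user override accepted)
        return 0;
    }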
- p->opts = (struct gl_video_opts){ - .gamma = p->opts.gamma, - .gamma_auto = p->opts.gamma_auto, - .pbo = p->opts.pbo, - .fbo_format = p->opts.fbo_format, - .alpha_mode = p->opts.alpha_mode, - .use_rectangle = p->opts.use_rectangle, - .background = p->opts.background, - .dither_algo = p->opts.dither_algo, - .dither_depth = p->opts.dither_depth, - .dither_size = p->opts.dither_size, - .temporal_dither = p->opts.temporal_dither, - .temporal_dither_period = p->opts.temporal_dither_period, - .tex_pad_x = p->opts.tex_pad_x, - .tex_pad_y = p->opts.tex_pad_y, - .tone_mapping = p->opts.tone_mapping, - .tone_mapping_param = p->opts.tone_mapping_param, - .tone_mapping_desat = p->opts.tone_mapping_desat, - .early_flush = p->opts.early_flush, - }; - for (int n = 0; n < SCALER_COUNT; n++) - p->opts.scaler[n] = gl_video_opts_def.scaler[n]; - return; - } - p->dumb_mode = false; - - // Normally, we want to disable them by default if FBOs are unavailable, - // because they will be slow (not critically slow, but still slower). - // Without FP textures, we must always disable them. - // I don't know if luminance alpha float textures exist, so disregard them. - for (int n = 0; n < SCALER_COUNT; n++) { - const struct filter_kernel *kernel = - mp_find_filter_kernel(p->opts.scaler[n].kernel.name); - if (kernel) { - char *reason = NULL; - if (!have_float_tex) - reason = "(float tex. missing)"; - if (!have_mglsl) - reason = "(GLSL version too old)"; - if (reason) { - MP_WARN(p, "Disabling scaler #%d %s %s.\n", n, - p->opts.scaler[n].kernel.name, reason); - // p->opts is a copy => we can just mess with it. - p->opts.scaler[n].kernel.name = "bilinear"; - if (n == SCALER_TSCALE) - p->opts.interpolation = 0; - } - } - } - - int use_cms = p->opts.target_prim != MP_CSP_PRIM_AUTO || - p->opts.target_trc != MP_CSP_TRC_AUTO || p->use_lut_3d; - - // mix() is needed for some gamma functions - if (!have_mglsl && (p->opts.linear_scaling || p->opts.sigmoid_upscaling)) { - p->opts.linear_scaling = false; - p->opts.sigmoid_upscaling = false; - MP_WARN(p, "Disabling linear/sigmoid scaling (GLSL version too old).\n"); - } - if (!have_mglsl && use_cms) { - p->opts.target_prim = MP_CSP_PRIM_AUTO; - p->opts.target_trc = MP_CSP_TRC_AUTO; - p->use_lut_3d = false; - MP_WARN(p, "Disabling color management (GLSL version too old).\n"); - } - if (!have_mglsl && p->opts.deband) { - p->opts.deband = 0; - MP_WARN(p, "Disabling debanding (GLSL version too old).\n"); - } - if ((!have_compute || !have_ssbo) && p->opts.compute_hdr_peak) { - p->opts.compute_hdr_peak = 0; - MP_WARN(p, "Disabling HDR peak computation (no compute shaders).\n"); - } -} - -static void init_gl(struct gl_video *p) -{ - debug_check_gl(p, "before init_gl"); - - p->upload_timer = timer_pool_create(p->ra); - p->blit_timer = timer_pool_create(p->ra); - p->osd_timer = timer_pool_create(p->ra); - - debug_check_gl(p, "after init_gl"); - - ra_dump_tex_formats(p->ra, MSGL_DEBUG); - ra_dump_img_formats(p->ra, MSGL_DEBUG); -} - -void gl_video_uninit(struct gl_video *p) -{ - if (!p) - return; - - uninit_video(p); - - gl_sc_destroy(p->sc); - - ra_tex_free(p->ra, &p->lut_3d_texture); - ra_buf_free(p->ra, &p->hdr_peak_ssbo); - - timer_pool_destroy(p->upload_timer); - timer_pool_destroy(p->blit_timer); - timer_pool_destroy(p->osd_timer); - - for (int i = 0; i < PASS_INFO_MAX; i++) { - talloc_free(p->pass_fresh[i].desc.start); - talloc_free(p->pass_redraw[i].desc.start); - } - - mpgl_osd_destroy(p->osd); - - // Forcibly destroy possibly remaining image references. 
This should also - // cause gl_video_dr_free_buffer() to be called for the remaining buffers. - gc_pending_dr_fences(p, true); - - // Should all have been unreffed already. - assert(!p->num_dr_buffers); - - talloc_free(p); -} - -void gl_video_reset(struct gl_video *p) -{ - gl_video_reset_surfaces(p); -} - -bool gl_video_showing_interpolated_frame(struct gl_video *p) -{ - return p->is_interpolated; -} - -static bool is_imgfmt_desc_supported(struct gl_video *p, - const struct ra_imgfmt_desc *desc) -{ - if (!desc->num_planes) - return false; - - if (desc->planes[0]->ctype == RA_CTYPE_UINT && p->forced_dumb_mode) - return false; - - return true; -} - -bool gl_video_check_format(struct gl_video *p, int mp_format) -{ - struct ra_imgfmt_desc desc; - if (ra_get_imgfmt_desc(p->ra, mp_format, &desc) && - is_imgfmt_desc_supported(p, &desc)) - return true; - if (p->hwdec && ra_hwdec_test_format(p->hwdec, mp_format)) - return true; - return false; -} - -void gl_video_config(struct gl_video *p, struct mp_image_params *params) -{ - unmap_overlay(p); - unref_current_image(p); - - if (!mp_image_params_equal(&p->real_image_params, params)) { - uninit_video(p); - p->real_image_params = *params; - p->image_params = *params; - if (params->imgfmt) - init_video(p); - } - - gl_video_reset_surfaces(p); -} - -void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd) -{ - mpgl_osd_destroy(p->osd); - p->osd = NULL; - p->osd_state = osd; - reinit_osd(p); -} - -struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, - struct mpv_global *g) -{ - struct gl_video *p = talloc_ptrtype(NULL, p); - *p = (struct gl_video) { - .ra = ra, - .global = g, - .log = log, - .sc = gl_sc_create(ra, g, log), - .video_eq = mp_csp_equalizer_create(p, g), - .opts_cache = m_config_cache_alloc(p, g, &gl_video_conf), - }; - // make sure this variable is initialized to *something* - p->pass = p->pass_fresh; - struct gl_video_opts *opts = p->opts_cache->opts; - p->cms = gl_lcms_init(p, log, g, opts->icc_opts), - p->opts = *opts; - for (int n = 0; n < SCALER_COUNT; n++) - p->scaler[n] = (struct scaler){.index = n}; - init_gl(p); - reinit_from_options(p); - return p; -} - -// Get static string for scaler shader. If "tscale" is set to true, the -// scaler must be a separable convolution filter. -static const char *handle_scaler_opt(const char *name, bool tscale) -{ - if (name && name[0]) { - const struct filter_kernel *kernel = mp_find_filter_kernel(name); - if (kernel && (!tscale || !kernel->polar)) - return kernel->f.name; - - for (const char *const *filter = tscale ? fixed_tscale_filters - : fixed_scale_filters; - *filter; filter++) { - if (strcmp(*filter, name) == 0) - return *filter; - } - } - return NULL; -} - -void gl_video_update_options(struct gl_video *p) -{ - if (m_config_cache_update(p->opts_cache)) { - gl_lcms_update_options(p->cms); - reinit_from_options(p); - } -} - -static void reinit_from_options(struct gl_video *p) -{ - p->use_lut_3d = gl_lcms_has_profile(p->cms); - - // Copy the option fields, so that check_gl_features() can mutate them. - // This works only for the fields themselves of course, not for any memory - // referenced by them. 
- p->opts = *(struct gl_video_opts *)p->opts_cache->opts; - - if (!p->force_clear_color) - p->clear_color = p->opts.background; - - check_gl_features(p); - uninit_rendering(p); - gl_sc_set_cache_dir(p->sc, p->opts.shader_cache_dir); - p->ra->use_pbo = p->opts.pbo; - gl_video_setup_hooks(p); - reinit_osd(p); - - if (p->opts.interpolation && !p->global->opts->video_sync && !p->dsi_warned) { - MP_WARN(p, "Interpolation now requires enabling display-sync mode.\n" - "E.g.: --video-sync=display-resample\n"); - p->dsi_warned = true; - } -} - -void gl_video_configure_queue(struct gl_video *p, struct vo *vo) -{ - int queue_size = 1; - - // Figure out an adequate size for the interpolation queue. The larger - // the radius, the earlier we need to queue frames. - if (p->opts.interpolation) { - const struct filter_kernel *kernel = - mp_find_filter_kernel(p->opts.scaler[SCALER_TSCALE].kernel.name); - if (kernel) { - // filter_scale wouldn't be correctly initialized were we to use it here. - // This is fine since we're always upsampling, but beware if downsampling - // is added! - double radius = kernel->f.radius; - radius = radius > 0 ? radius : p->opts.scaler[SCALER_TSCALE].radius; - queue_size += 1 + ceil(radius); - } else { - // Oversample/linear case - queue_size += 2; - } - } - - vo_set_queue_params(vo, 0, queue_size); -} - -static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param) -{ - char s[20] = {0}; - int r = 1; - bool tscale = bstr_equals0(name, "tscale"); - if (bstr_equals0(param, "help")) { - r = M_OPT_EXIT; - } else { - snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); - if (!handle_scaler_opt(s, tscale)) - r = M_OPT_INVALID; - } - if (r < 1) { - mp_info(log, "Available scalers:\n"); - for (const char *const *filter = tscale ? 
fixed_tscale_filters - : fixed_scale_filters; - *filter; filter++) { - mp_info(log, " %s\n", *filter); - } - for (int n = 0; mp_filter_kernels[n].f.name; n++) { - if (!tscale || !mp_filter_kernels[n].polar) - mp_info(log, " %s\n", mp_filter_kernels[n].f.name); - } - if (s[0]) - mp_fatal(log, "No scaler named '%s' found!\n", s); - } - return r; -} - -static int validate_window_opt(struct mp_log *log, const m_option_t *opt, - struct bstr name, struct bstr param) -{ - char s[20] = {0}; - int r = 1; - if (bstr_equals0(param, "help")) { - r = M_OPT_EXIT; - } else { - snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); - const struct filter_window *window = mp_find_filter_window(s); - if (!window) - r = M_OPT_INVALID; - } - if (r < 1) { - mp_info(log, "Available windows:\n"); - for (int n = 0; mp_filter_windows[n].name; n++) - mp_info(log, " %s\n", mp_filter_windows[n].name); - if (s[0]) - mp_fatal(log, "No window named '%s' found!\n", s); - } - return r; -} - -float gl_video_scale_ambient_lux(float lmin, float lmax, - float rmin, float rmax, float lux) -{ - assert(lmax > lmin); - - float num = (rmax - rmin) * (log10(lux) - log10(lmin)); - float den = log10(lmax) - log10(lmin); - float result = num / den + rmin; - - // clamp the result - float max = MPMAX(rmax, rmin); - float min = MPMIN(rmax, rmin); - return MPMAX(MPMIN(result, max), min); -} - -void gl_video_set_ambient_lux(struct gl_video *p, int lux) -{ - if (p->opts.gamma_auto) { - float gamma = gl_video_scale_ambient_lux(16.0, 64.0, 2.40, 1.961, lux); - MP_VERBOSE(p, "ambient light changed: %dlux (gamma: %f)\n", lux, gamma); - p->opts.gamma = MPMIN(1.0, 1.961 / gamma); - } -} - -void gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec) -{ - unref_current_image(p); - ra_hwdec_mapper_free(&p->hwdec_mapper); - p->hwdec = hwdec; -} - -static void *gl_video_dr_alloc_buffer(struct gl_video *p, size_t size) -{ - struct ra_buf_params params = { - .type = RA_BUF_TYPE_TEX_UPLOAD, - .host_mapped = true, - .size = size, - }; - - struct ra_buf *buf = ra_buf_create(p->ra, ¶ms); - if (!buf) - return NULL; - - MP_TARRAY_GROW(p, p->dr_buffers, p->num_dr_buffers); - p->dr_buffers[p->num_dr_buffers++] = (struct dr_buffer){ .buf = buf }; - - return buf->data; -}; - -static void gl_video_dr_free_buffer(void *opaque, uint8_t *data) -{ - struct gl_video *p = opaque; - - for (int n = 0; n < p->num_dr_buffers; n++) { - struct dr_buffer *buffer = &p->dr_buffers[n]; - if (buffer->buf->data == data) { - assert(!buffer->mpi); // can't be freed while it has a ref - ra_buf_free(p->ra, &buffer->buf); - MP_TARRAY_REMOVE_AT(p->dr_buffers, p->num_dr_buffers, n); - return; - } - } - // not found - must not happen - assert(0); -} - -struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h, - int stride_align) -{ - int size = mp_image_get_alloc_size(imgfmt, w, h, stride_align); - if (size < 0) - return NULL; - - int alloc_size = size + stride_align; - void *ptr = gl_video_dr_alloc_buffer(p, alloc_size); - if (!ptr) - return NULL; - - // (we expect vo.c to proxy the free callback, so it happens in the same - // thread it was allocated in, removing the need for synchronization) - struct mp_image *res = mp_image_from_buffer(imgfmt, w, h, stride_align, - ptr, alloc_size, p, - gl_video_dr_free_buffer); - if (!res) - gl_video_dr_free_buffer(p, ptr); - return res; -} diff --git a/video/out/opengl/video.h b/video/out/opengl/video.h deleted file mode 100644 index d163bc8405..0000000000 --- a/video/out/opengl/video.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * 
This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#ifndef MP_GL_VIDEO_H -#define MP_GL_VIDEO_H - -#include - -#include "options/m_option.h" -#include "sub/osd.h" -#include "utils.h" -#include "lcms.h" -#include "shader_cache.h" -#include "video/csputils.h" -#include "video/out/filter_kernels.h" -#include "video/out/vo.h" - -// Assume we have this many texture units for sourcing additional passes. -// The actual texture unit assignment is dynamic. -#define TEXUNIT_VIDEO_NUM 6 - -struct scaler_fun { - char *name; - float params[2]; - float blur; - float taper; -}; - -struct scaler_config { - struct scaler_fun kernel; - struct scaler_fun window; - float radius; - float antiring; - float cutoff; - float clamp; -}; - -struct scaler { - int index; - struct scaler_config conf; - double scale_factor; - bool initialized; - struct filter_kernel *kernel; - struct ra_tex *lut; - struct fbotex sep_fbo; - bool insufficient; - int lut_size; - - // kernel points here - struct filter_kernel kernel_storage; -}; - -enum scaler_unit { - SCALER_SCALE, // luma/video - SCALER_DSCALE, // luma-video downscaling - SCALER_CSCALE, // chroma upscaling - SCALER_TSCALE, // temporal scaling (interpolation) - SCALER_COUNT -}; - -enum dither_algo { - DITHER_NONE = 0, - DITHER_FRUIT, - DITHER_ORDERED, -}; - -enum alpha_mode { - ALPHA_NO = 0, - ALPHA_YES, - ALPHA_BLEND, - ALPHA_BLEND_TILES, -}; - -enum blend_subs_mode { - BLEND_SUBS_NO = 0, - BLEND_SUBS_YES, - BLEND_SUBS_VIDEO, -}; - -enum tone_mapping { - TONE_MAPPING_CLIP, - TONE_MAPPING_MOBIUS, - TONE_MAPPING_REINHARD, - TONE_MAPPING_HABLE, - TONE_MAPPING_GAMMA, - TONE_MAPPING_LINEAR, -}; - -// How many frames to average over for HDR peak detection -#define PEAK_DETECT_FRAMES 100 - -struct gl_video_opts { - int dumb_mode; - struct scaler_config scaler[4]; - int scaler_lut_size; - float gamma; - int gamma_auto; - int target_prim; - int target_trc; - int target_brightness; - int tone_mapping; - int compute_hdr_peak; - float tone_mapping_param; - float tone_mapping_desat; - int gamut_warning; - int linear_scaling; - int correct_downscaling; - int sigmoid_upscaling; - float sigmoid_center; - float sigmoid_slope; - int scaler_resizes_only; - int pbo; - int dither_depth; - int dither_algo; - int dither_size; - int temporal_dither; - int temporal_dither_period; - char *fbo_format; - int alpha_mode; - int use_rectangle; - struct m_color background; - int interpolation; - float interpolation_threshold; - int blend_subs; - char **user_shaders; - int deband; - struct deband_opts *deband_opts; - float unsharp; - int tex_pad_x, tex_pad_y; - struct mp_icc_opts *icc_opts; - int early_flush; - char *shader_cache_dir; -}; - -extern const struct m_sub_options gl_video_conf; - -struct gl_video; -struct vo_frame; - -struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, - struct mpv_global *g); -void gl_video_uninit(struct gl_video *p); -void gl_video_set_osd_source(struct gl_video 
*p, struct osd_state *osd); -void gl_video_update_options(struct gl_video *p); -bool gl_video_check_format(struct gl_video *p, int mp_format); -void gl_video_config(struct gl_video *p, struct mp_image_params *params); -void gl_video_set_output_depth(struct gl_video *p, int r, int g, int b); -void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, - struct fbodst target); -void gl_video_resize(struct gl_video *p, - struct mp_rect *src, struct mp_rect *dst, - struct mp_osd_res *osd); -void gl_video_set_fb_depth(struct gl_video *p, int fb_depth); -void gl_video_perfdata(struct gl_video *p, struct voctrl_performance_data *out); -void gl_video_set_clear_color(struct gl_video *p, struct m_color color); -void gl_video_set_osd_pts(struct gl_video *p, double pts); -bool gl_video_check_osd_change(struct gl_video *p, struct mp_osd_res *osd, - double pts); - -float gl_video_scale_ambient_lux(float lmin, float lmax, - float rmin, float rmax, float lux); -void gl_video_set_ambient_lux(struct gl_video *p, int lux); -void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data); -bool gl_video_icc_auto_enabled(struct gl_video *p); -bool gl_video_gamma_auto_enabled(struct gl_video *p); -struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p); - -void gl_video_reset(struct gl_video *p); -bool gl_video_showing_interpolated_frame(struct gl_video *p); - -struct ra_hwdec; -void gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec); - -struct vo; -void gl_video_configure_queue(struct gl_video *p, struct vo *vo); - -struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h, - int stride_align); - - -#endif diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c deleted file mode 100644 index 60c5ce82ac..0000000000 --- a/video/out/opengl/video_shaders.c +++ /dev/null @@ -1,872 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include - -#include "video_shaders.h" -#include "video.h" - -#define GLSL(x) gl_sc_add(sc, #x "\n"); -#define GLSLF(...) gl_sc_addf(sc, __VA_ARGS__) -#define GLSLH(x) gl_sc_hadd(sc, #x "\n"); -#define GLSLHF(...) 
gl_sc_haddf(sc, __VA_ARGS__) - -// Set up shared/commonly used variables and macros -void sampler_prelude(struct gl_shader_cache *sc, int tex_num) -{ - GLSLF("#undef tex\n"); - GLSLF("#undef texmap\n"); - GLSLF("#define tex texture%d\n", tex_num); - GLSLF("#define texmap texmap%d\n", tex_num); - GLSLF("vec2 pos = texcoord%d;\n", tex_num); - GLSLF("vec2 size = texture_size%d;\n", tex_num); - GLSLF("vec2 pt = pixel_size%d;\n", tex_num); -} - -static void pass_sample_separated_get_weights(struct gl_shader_cache *sc, - struct scaler *scaler) -{ - gl_sc_uniform_texture(sc, "lut", scaler->lut); - GLSLF("float ypos = LUT_POS(fcoord, %d.0);\n", scaler->lut_size); - - int N = scaler->kernel->size; - int width = (N + 3) / 4; // round up - - GLSLF("float weights[%d];\n", N); - for (int i = 0; i < N; i++) { - if (i % 4 == 0) - GLSLF("c = texture(lut, vec2(%f, ypos));\n", (i / 4 + 0.5) / width); - GLSLF("weights[%d] = c[%d];\n", i, i % 4); - } -} - -// Handle a single pass (either vertical or horizontal). The direction is given -// by the vector (d_x, d_y). If the vector is 0, then planar interpolation is -// used instead (samples from texture0 through textureN) -void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler, - int d_x, int d_y) -{ - int N = scaler->kernel->size; - bool use_ar = scaler->conf.antiring > 0; - bool planar = d_x == 0 && d_y == 0; - GLSL(color = vec4(0.0);) - GLSLF("{\n"); - if (!planar) { - GLSLF("vec2 dir = vec2(%d.0, %d.0);\n", d_x, d_y); - GLSL(pt *= dir;) - GLSL(float fcoord = dot(fract(pos * size - vec2(0.5)), dir);) - GLSLF("vec2 base = pos - fcoord * pt - pt * vec2(%d.0);\n", N / 2 - 1); - } - GLSL(vec4 c;) - if (use_ar) { - GLSL(vec4 hi = vec4(0.0);) - GLSL(vec4 lo = vec4(1.0);) - } - pass_sample_separated_get_weights(sc, scaler); - GLSLF("// scaler samples\n"); - for (int n = 0; n < N; n++) { - if (planar) { - GLSLF("c = texture(texture%d, texcoord%d);\n", n, n); - } else { - GLSLF("c = texture(tex, base + pt * vec2(%d.0));\n", n); - } - GLSLF("color += vec4(weights[%d]) * c;\n", n); - if (use_ar && (n == N/2-1 || n == N/2)) { - GLSL(lo = min(lo, c);) - GLSL(hi = max(hi, c);) - } - } - if (use_ar) - GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n", - scaler->conf.antiring); - GLSLF("}\n"); -} - -// Subroutine for computing and adding an individual texel contribution -// If subtexel < 0 and offset < 0, samples directly. -// If subtexel >= 0, takes the texel from cN[subtexel] -// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset] -static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, - int x, int y, int subtexel, int offset, int components) -{ - double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale; - double radius_cutoff = scaler->kernel->radius_cutoff; - - // Since we can't know the subpixel position in advance, assume a - // worst case scenario - int yy = y > 0 ? y-1 : y; - int xx = x > 0 ? 
x-1 : x; - double dmax = sqrt(xx*xx + yy*yy); - // Skip samples definitely outside the radius - if (dmax >= radius_cutoff) - return; - GLSLF("d = length(vec2(%d.0, %d.0) - fcoord);\n", x, y); - // Check for samples that might be skippable - bool maybe_skippable = dmax >= radius_cutoff - M_SQRT2; - if (maybe_skippable) - GLSLF("if (d < %f) {\n", radius_cutoff); - - // get the weight for this pixel - if (scaler->lut->params.dimensions == 1) { - GLSLF("w = tex1D(lut, LUT_POS(d * 1.0/%f, %d.0)).r;\n", - radius, scaler->lut_size); - } else { - GLSLF("w = texture(lut, vec2(0.5, LUT_POS(d * 1.0/%f, %d.0))).r;\n", - radius, scaler->lut_size); - } - GLSL(wsum += w;) - - if (subtexel < 0 && offset < 0) { - GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y); - GLSL(color += vec4(w) * c0;) - } else if (subtexel >= 0) { - for (int n = 0; n < components; n++) - GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel); - } else if (offset >= 0) { - for (int n = 0; n lut); - - GLSLF("// scaler samples\n"); - int bound = ceil(scaler->kernel->radius_cutoff); - for (int y = 1-bound; y <= bound; y += 2) { - for (int x = 1-bound; x <= bound; x += 2) { - // First we figure out whether it's more efficient to use direct - // sampling or gathering. The problem is that gathering 4 texels - // only to discard some of them is very wasteful, so only do it if - // we suspect it will be a win rather than a loss. This is the case - // exactly when all four texels are within bounds - bool use_gather = sqrt(x*x + y*y) < scaler->kernel->radius_cutoff; - - // textureGather is only supported in GLSL 400+ - if (glsl_version < 400) - use_gather = false; - - if (use_gather) { - // Gather the four surrounding texels simultaneously - for (int n = 0; n < components; n++) { - GLSLF("c%d = textureGatherOffset(tex, base, ivec2(%d, %d), %d);\n", - n, x, y, n); - } - - // Mix in all of the points with their weights - for (int p = 0; p < 4; p++) { - // The four texels are gathered counterclockwise starting - // from the bottom left - static const int xo[4] = {0, 1, 1, 0}; - static const int yo[4] = {1, 1, 0, 0}; - if (x+xo[p] > bound || y+yo[p] > bound) - continue; - polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components); - } - } else { - // switch to direct sampling instead, for efficiency/compatibility - for (int yy = y; yy <= bound && yy <= y+1; yy++) { - for (int xx = x; xx <= bound && xx <= x+1; xx++) - polar_sample(sc, scaler, xx, yy, -1, -1, components); - } - } - } - } - - GLSL(color = color / vec4(wsum);) - GLSLF("}\n"); -} - -// bw/bh: block size -// iw/ih: input size (pre-calculated to fit all required texels) -void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, - int components, int bw, int bh, int iw, int ih) -{ - int bound = ceil(scaler->kernel->radius_cutoff); - int offset = bound - 1; // padding top/left - - GLSL(color = vec4(0.0);) - GLSLF("{\n"); - GLSL(vec2 wpos = texmap(gl_WorkGroupID * gl_WorkGroupSize);) - GLSL(vec2 wbase = wpos - pt * fract(wpos * size - vec2(0.5));) - GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) - GLSL(vec2 base = pos - pt * fcoord;) - GLSL(ivec2 rel = ivec2(round((base - wbase) * size));) - GLSLF("float w, d, wsum = 0.0;\n"); - gl_sc_uniform_texture(sc, "lut", scaler->lut); - - // Load all relevant texels into shmem - gl_sc_enable_extension(sc, "GL_ARB_arrays_of_arrays"); - for (int c = 0; c < components; c++) - GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw); - - GLSL(vec4 c;) - GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) 
{\n", ih, bh); - GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw); - GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset); - for (int c = 0; c < components; c++) - GLSLF("in%d[y][x] = c[%d];\n", c, c); - GLSLF("}}\n"); - GLSL(groupMemoryBarrier();) - GLSL(barrier();) - - // Dispatch the actual samples - GLSLF("// scaler samples\n"); - for (int y = 1-bound; y <= bound; y++) { - for (int x = 1-bound; x <= bound; x++) - polar_sample(sc, scaler, x, y, -1, offset, components); - } - - GLSL(color = color / vec4(wsum);) - GLSLF("}\n"); -} - -static void bicubic_calcweights(struct gl_shader_cache *sc, const char *t, const char *s) -{ - // Explanation of how bicubic scaling with only 4 texel fetches is done: - // http://www.mate.tue.nl/mate/pdfs/10318.pdf - // 'Efficient GPU-Based Texture Interpolation using Uniform B-Splines' - // Explanation why this algorithm normally always blurs, even with unit - // scaling: - // http://bigwww.epfl.ch/preprints/ruijters1001p.pdf - // 'GPU Prefilter for Accurate Cubic B-spline Interpolation' - GLSLF("vec4 %s = vec4(-0.5, 0.1666, 0.3333, -0.3333) * %s" - " + vec4(1, 0, -0.5, 0.5);\n", t, s); - GLSLF("%s = %s * %s + vec4(0, 0, -0.5, 0.5);\n", t, t, s); - GLSLF("%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666);\n", t, t, s); - GLSLF("%s.xy *= vec2(1, 1) / vec2(%s.z, %s.w);\n", t, t, t); - GLSLF("%s.xy += vec2(1.0 + %s, 1.0 - %s);\n", t, s, s); -} - -void pass_sample_bicubic_fast(struct gl_shader_cache *sc) -{ - GLSLF("{\n"); - GLSL(vec2 fcoord = fract(pos * size + vec2(0.5, 0.5));) - bicubic_calcweights(sc, "parmx", "fcoord.x"); - bicubic_calcweights(sc, "parmy", "fcoord.y"); - GLSL(vec4 cdelta;) - GLSL(cdelta.xz = parmx.rg * vec2(-pt.x, pt.x);) - GLSL(cdelta.yw = parmy.rg * vec2(-pt.y, pt.y);) - // first y-interpolation - GLSL(vec4 ar = texture(tex, pos + cdelta.xy);) - GLSL(vec4 ag = texture(tex, pos + cdelta.xw);) - GLSL(vec4 ab = mix(ag, ar, parmy.b);) - // second y-interpolation - GLSL(vec4 br = texture(tex, pos + cdelta.zy);) - GLSL(vec4 bg = texture(tex, pos + cdelta.zw);) - GLSL(vec4 aa = mix(bg, br, parmy.b);) - // x-interpolation - GLSL(color = mix(aa, ab, parmx.b);) - GLSLF("}\n"); -} - -void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler, - int w, int h) -{ - GLSLF("{\n"); - GLSL(vec2 pos = pos - vec2(0.5) * pt;) // round to nearest - GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) - // Determine the mixing coefficient vector - gl_sc_uniform_vec2(sc, "output_size", (float[2]){w, h}); - GLSL(vec2 coeff = fcoord * output_size/size;) - float threshold = scaler->conf.kernel.params[0]; - threshold = isnan(threshold) ? 
0.0 : threshold; - GLSLF("coeff = (coeff - %f) * 1.0/%f;\n", threshold, 1.0 - 2 * threshold); - GLSL(coeff = clamp(coeff, 0.0, 1.0);) - // Compute the right blend of colors - GLSL(color = texture(tex, pos + pt * (coeff - fcoord));) - GLSLF("}\n"); -} - -// Common constants for SMPTE ST.2084 (HDR) -static const float PQ_M1 = 2610./4096 * 1./4, - PQ_M2 = 2523./4096 * 128, - PQ_C1 = 3424./4096, - PQ_C2 = 2413./4096 * 32, - PQ_C3 = 2392./4096 * 32; - -// Common constants for ARIB STD-B67 (HLG) -static const float HLG_A = 0.17883277, - HLG_B = 0.28466892, - HLG_C = 0.55991073; - -// Common constants for Panasonic V-Log -static const float VLOG_B = 0.00873, - VLOG_C = 0.241514, - VLOG_D = 0.598206; - -// Common constants for Sony S-Log -static const float SLOG_A = 0.432699, - SLOG_B = 0.037584, - SLOG_C = 0.616596 + 0.03, - SLOG_P = 3.538813, - SLOG_Q = 0.030001, - SLOG_K2 = 155.0 / 219.0; - -// Linearize (expand), given a TRC as input. In essence, this is the ITU-R -// EOTF, calculated on an idealized (reference) monitor with a white point of -// MP_REF_WHITE and infinite contrast. -void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc) -{ - if (trc == MP_CSP_TRC_LINEAR) - return; - - GLSLF("// linearize\n"); - - // Note that this clamp may technically violate the definition of - // ITU-R BT.2100, which allows for sub-blacks and super-whites to be - // displayed on the display where such would be possible. That said, the - // problem is that not all gamma curves are well-defined on the values - // outside this range, so we ignore it and just clip anyway for sanity. - GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) - - switch (trc) { - case MP_CSP_TRC_SRGB: - GLSL(color.rgb = mix(color.rgb * vec3(1.0/12.92), - pow((color.rgb + vec3(0.055))/vec3(1.055), vec3(2.4)), - lessThan(vec3(0.04045), color.rgb));) - break; - case MP_CSP_TRC_BT_1886: - GLSL(color.rgb = pow(color.rgb, vec3(2.4));) - break; - case MP_CSP_TRC_GAMMA18: - GLSL(color.rgb = pow(color.rgb, vec3(1.8));) - break; - case MP_CSP_TRC_GAMMA22: - GLSL(color.rgb = pow(color.rgb, vec3(2.2));) - break; - case MP_CSP_TRC_GAMMA28: - GLSL(color.rgb = pow(color.rgb, vec3(2.8));) - break; - case MP_CSP_TRC_PRO_PHOTO: - GLSL(color.rgb = mix(color.rgb * vec3(1.0/16.0), - pow(color.rgb, vec3(1.8)), - lessThan(vec3(0.03125), color.rgb));) - break; - case MP_CSP_TRC_PQ: - GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", PQ_M2); - GLSLF("color.rgb = max(color.rgb - vec3(%f), vec3(0.0)) \n" - " / (vec3(%f) - vec3(%f) * color.rgb);\n", - PQ_C1, PQ_C2, PQ_C3); - GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", PQ_M1); - // PQ's output range is 0-10000, but we need it to be relative to to - // MP_REF_WHITE instead, so rescale - GLSLF("color.rgb *= vec3(%f);\n", 10000 / MP_REF_WHITE); - break; - case MP_CSP_TRC_HLG: - GLSLF("color.rgb = mix(vec3(4.0) * color.rgb * color.rgb,\n" - " exp((color.rgb - vec3(%f)) * vec3(1.0/%f)) + vec3(%f),\n" - " lessThan(vec3(0.5), color.rgb));\n", - HLG_C, HLG_A, HLG_B); - break; - case MP_CSP_TRC_V_LOG: - GLSLF("color.rgb = mix((color.rgb - vec3(0.125)) * vec3(1.0/5.6), \n" - " pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" - " - vec3(%f), \n" - " lessThanEqual(vec3(0.181), color.rgb)); \n", - VLOG_D, VLOG_C, VLOG_B); - break; - case MP_CSP_TRC_S_LOG1: - GLSLF("color.rgb = pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f))\n" - " - vec3(%f);\n", - SLOG_C, SLOG_A, SLOG_B); - break; - case MP_CSP_TRC_S_LOG2: - GLSLF("color.rgb = mix((color.rgb - vec3(%f)) * vec3(1.0/%f), \n" - " 
(pow(vec3(10.0), (color.rgb - vec3(%f)) * vec3(1.0/%f)) \n" - " - vec3(%f)) * vec3(1.0/%f), \n" - " lessThanEqual(vec3(%f), color.rgb)); \n", - SLOG_Q, SLOG_P, SLOG_C, SLOG_A, SLOG_B, SLOG_K2, SLOG_Q); - break; - default: - abort(); - } - - // Rescale to prevent clipping on non-float textures - GLSLF("color.rgb *= vec3(1.0/%f);\n", mp_trc_nom_peak(trc)); -} - -// Delinearize (compress), given a TRC as output. This corresponds to the -// inverse EOTF (not the OETF) in ITU-R terminology, again assuming a -// reference monitor. -void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc) -{ - if (trc == MP_CSP_TRC_LINEAR) - return; - - GLSLF("// delinearize\n"); - GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);) - GLSLF("color.rgb *= vec3(%f);\n", mp_trc_nom_peak(trc)); - - switch (trc) { - case MP_CSP_TRC_SRGB: - GLSL(color.rgb = mix(color.rgb * vec3(12.92), - vec3(1.055) * pow(color.rgb, vec3(1.0/2.4)) - - vec3(0.055), - lessThanEqual(vec3(0.0031308), color.rgb));) - break; - case MP_CSP_TRC_BT_1886: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.4));) - break; - case MP_CSP_TRC_GAMMA18: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.8));) - break; - case MP_CSP_TRC_GAMMA22: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.2));) - break; - case MP_CSP_TRC_GAMMA28: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.8));) - break; - case MP_CSP_TRC_PRO_PHOTO: - GLSL(color.rgb = mix(color.rgb * vec3(16.0), - pow(color.rgb, vec3(1.0/1.8)), - lessThanEqual(vec3(0.001953), color.rgb));) - break; - case MP_CSP_TRC_PQ: - GLSLF("color.rgb *= vec3(1.0/%f);\n", 10000 / MP_REF_WHITE); - GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", PQ_M1); - GLSLF("color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n" - " / (vec3(1.0) + vec3(%f) * color.rgb);\n", - PQ_C1, PQ_C2, PQ_C3); - GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", PQ_M2); - break; - case MP_CSP_TRC_HLG: - GLSLF("color.rgb = mix(vec3(0.5) * sqrt(color.rgb),\n" - " vec3(%f) * log(color.rgb - vec3(%f)) + vec3(%f),\n" - " lessThan(vec3(1.0), color.rgb));\n", - HLG_A, HLG_B, HLG_C); - break; - case MP_CSP_TRC_V_LOG: - GLSLF("color.rgb = mix(vec3(5.6) * color.rgb + vec3(0.125), \n" - " vec3(%f) * log(color.rgb + vec3(%f)) \n" - " + vec3(%f), \n" - " lessThanEqual(vec3(0.01), color.rgb)); \n", - VLOG_C / M_LN10, VLOG_B, VLOG_D); - break; - case MP_CSP_TRC_S_LOG1: - GLSLF("color.rgb = vec3(%f) * log(color.rgb + vec3(%f)) + vec3(%f);\n", - SLOG_A / M_LN10, SLOG_B, SLOG_C); - break; - case MP_CSP_TRC_S_LOG2: - GLSLF("color.rgb = mix(vec3(%f) * color.rgb + vec3(%f), \n" - " vec3(%f) * log(vec3(%f) * color.rgb + vec3(%f)) \n" - " + vec3(%f), \n" - " lessThanEqual(vec3(0.0), color.rgb)); \n", - SLOG_P, SLOG_Q, SLOG_A / M_LN10, SLOG_K2, SLOG_B, SLOG_C); - break; - default: - abort(); - } -} - -// Apply the OOTF mapping from a given light type to display-referred light. 
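As an aside on the PQ branches above: as far as the PQ_* constants imply, they are a transcription of the SMPTE ST.2084 curve. Written out with the same symbols (and with Y normalized so that 1.0 corresponds to 10000 cd/m² before the final rescale relative to MP_REF_WHITE), the linearize/delinearize pair evaluates

\[
Y \;=\; \left( \frac{\max\!\left(E'^{\,1/m_2} - c_1,\; 0\right)}{c_2 - c_3\, E'^{\,1/m_2}} \right)^{1/m_1},
\qquad
E' \;=\; \left( \frac{c_1 + c_2\, Y^{m_1}}{1 + c_3\, Y^{m_1}} \right)^{m_2}
\]

with m_1 = PQ_M1, m_2 = PQ_M2 and c_1, c_2, c_3 = PQ_C1, PQ_C2, PQ_C3.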
-// The extra peak parameter is used to scale the values before and after -// the OOTF, and can be inferred using mp_trc_nom_peak -void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak) -{ - if (light == MP_CSP_LIGHT_DISPLAY) - return; - - GLSLF("// apply ootf\n"); - GLSLF("color.rgb *= vec3(%f);\n", peak); - - switch (light) - { - case MP_CSP_LIGHT_SCENE_HLG: - // HLG OOTF from BT.2100, assuming a reference display with a - // peak of 1000 cd/m² -> gamma = 1.2 - GLSLF("color.rgb *= vec3(%f * pow(dot(src_luma, color.rgb), 0.2));\n", - (1000 / MP_REF_WHITE) / pow(12, 1.2)); - break; - case MP_CSP_LIGHT_SCENE_709_1886: - // This OOTF is defined by encoding the result as 709 and then decoding - // it as 1886; although this is called 709_1886 we actually use the - // more precise (by one decimal) values from BT.2020 instead - GLSL(color.rgb = mix(color.rgb * vec3(4.5), - vec3(1.0993) * pow(color.rgb, vec3(0.45)) - vec3(0.0993), - lessThan(vec3(0.0181), color.rgb));) - GLSL(color.rgb = pow(color.rgb, vec3(2.4));) - break; - case MP_CSP_LIGHT_SCENE_1_2: - GLSL(color.rgb = pow(color.rgb, vec3(1.2));) - break; - default: - abort(); - } - - GLSLF("color.rgb *= vec3(1.0/%f);\n", peak); -} - -// Inverse of the function pass_ootf, for completeness' sake. -void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak) -{ - if (light == MP_CSP_LIGHT_DISPLAY) - return; - - GLSLF("// apply inverse ootf\n"); - GLSLF("color.rgb *= vec3(%f);\n", peak); - - switch (light) - { - case MP_CSP_LIGHT_SCENE_HLG: - GLSLF("color.rgb *= vec3(1.0/%f);\n", (1000 / MP_REF_WHITE) / pow(12, 1.2)); - GLSL(color.rgb /= vec3(max(1e-6, pow(dot(src_luma, color.rgb), 0.2/1.2)));) - break; - case MP_CSP_LIGHT_SCENE_709_1886: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.4));) - GLSL(color.rgb = mix(color.rgb * vec3(1.0/4.5), - pow((color.rgb + vec3(0.0993)) * vec3(1.0/1.0993), - vec3(1/0.45)), - lessThan(vec3(0.08145), color.rgb));) - break; - case MP_CSP_LIGHT_SCENE_1_2: - GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.2));) - break; - default: - abort(); - } - - GLSLF("color.rgb *= vec3(1.0/%f);\n", peak); -} - -// Tone map from a known peak brightness to the range [0,1]. If ref_peak -// is 0, we will use peak detection instead -static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak, - enum tone_mapping algo, float param, float desat) -{ - GLSLF("// HDR tone mapping\n"); - - // Desaturate the color using a coefficient dependent on the luminance - GLSL(float luma = dot(dst_luma, color.rgb);) - if (desat > 0) { - GLSLF("float overbright = max(luma - %f, 1e-6) / max(luma, 1e-6);\n", desat); - GLSL(color.rgb = mix(color.rgb, vec3(luma), overbright);) - } - - // To prevent discoloration due to out-of-bounds clipping, we need to make - // sure to reduce the value range as far as necessary to keep the entire - // signal in range, so tone map based on the brightest component. - GLSL(float sig = max(max(color.r, color.g), color.b);) - GLSL(float sig_orig = sig;) - - if (!ref_peak) { - // For performance, we want to do as few atomic operations on global - // memory as possible, so use an atomic in shmem for the work group. 
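The HLG case above is the BT.2100 OOTF. With the scene-referred signal normalized back to [0,1] (the divide by pow(12, 1.2) in the constant folds in the normalization of both the signal and its luminance from the [0,12] working range), it amounts to

\[
F_D \;=\; \alpha\, Y_S^{\,\gamma-1}\, E_S,
\qquad
Y_S = \mathrm{dot}(\mathrm{src\_luma},\, E_S),\quad
\gamma = 1.2,\quad
\alpha = \frac{1000}{\mathrm{MP\_REF\_WHITE}},
\]

i.e. display light is scene light weighted by the scene luminance raised to γ − 1 = 0.2, scaled for a 1000 cd/m² reference display. pass_inverse_ootf divides the same factor back out, with the max(1e-6, ...) clamp guarding against division by zero on black pixels.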
- // We also want slightly more stable values, so use the group average - // instead of the group max - GLSLHF("shared uint group_sum = 0;\n"); - GLSLF("atomicAdd(group_sum, uint(sig * %f));\n", MP_REF_WHITE); - - // Have one thread in each work group update the frame maximum - GLSL(memoryBarrierBuffer();) - GLSL(barrier();) - GLSL(if (gl_LocalInvocationIndex == 0)) - GLSL(atomicMax(frame_max[index], group_sum / - (gl_WorkGroupSize.x * gl_WorkGroupSize.y));) - - // Finally, have one thread per invocation update the total maximum - // and advance the index - GLSL(memoryBarrierBuffer();) - GLSL(barrier();) - GLSL(if (gl_GlobalInvocationID == ivec3(0)) {) // do this once per invocation - GLSLF("uint next = (index + 1) %% %d;\n", PEAK_DETECT_FRAMES+1); - GLSLF("sig_peak_raw = sig_peak_raw + frame_max[index] - frame_max[next];\n"); - GLSLF("frame_max[next] = %d;\n", (int)MP_REF_WHITE); - GLSL(index = next;) - GLSL(}) - - GLSL(memoryBarrierBuffer();) - GLSL(barrier();) - GLSLF("float sig_peak = 1.0/%f * float(sig_peak_raw);\n", - MP_REF_WHITE * PEAK_DETECT_FRAMES); - } else { - GLSLHF("const float sig_peak = %f;\n", ref_peak); - } - - switch (algo) { - case TONE_MAPPING_CLIP: - GLSLF("sig = %f * sig;\n", isnan(param) ? 1.0 : param); - break; - - case TONE_MAPPING_MOBIUS: - GLSLF("const float j = %f;\n", isnan(param) ? 0.3 : param); - // solve for M(j) = j; M(sig_peak) = 1.0; M'(j) = 1.0 - // where M(x) = scale * (x+a)/(x+b) - GLSLF("float a = -j*j * (sig_peak - 1.0) / (j*j - 2.0*j + sig_peak);\n"); - GLSLF("float b = (j*j - 2.0*j*sig_peak + sig_peak) / " - "max(1e-6, sig_peak - 1.0);\n"); - GLSLF("float scale = (b*b + 2.0*b*j + j*j) / (b-a);\n"); - GLSL(sig = mix(sig, scale * (sig + a) / (sig + b), sig > j);) - break; - - case TONE_MAPPING_REINHARD: { - float contrast = isnan(param) ? 0.5 : param, - offset = (1.0 - contrast) / contrast; - GLSLF("sig = sig / (sig + %f);\n", offset); - GLSLF("float scale = (sig_peak + %f) / sig_peak;\n", offset); - GLSL(sig *= scale;) - break; - } - - case TONE_MAPPING_HABLE: { - float A = 0.15, B = 0.50, C = 0.10, D = 0.20, E = 0.02, F = 0.30; - GLSLHF("float hable(float x) {\n"); - GLSLHF("return ((x * (%f*x + %f)+%f)/(x * (%f*x + %f) + %f)) - %f;\n", - A, C*B, D*E, A, B, D*F, E/F); - GLSLHF("}\n"); - GLSL(sig = hable(sig) / hable(sig_peak);) - break; - } - - case TONE_MAPPING_GAMMA: { - float gamma = isnan(param) ? 1.8 : param; - GLSLF("const float cutoff = 0.05, gamma = %f;\n", 1.0/gamma); - GLSL(float scale = pow(cutoff / sig_peak, gamma) / cutoff;) - GLSL(sig = sig > cutoff ? pow(sig / sig_peak, gamma) : scale * sig;) - break; - } - - case TONE_MAPPING_LINEAR: { - float coeff = isnan(param) ? 1.0 : param; - GLSLF("sig = %f / sig_peak * sig;\n", coeff); - break; - } - - default: - abort(); - } - - // Apply the computed scale factor to the color, linearly to prevent - // discoloration - GLSL(color.rgb *= sig / sig_orig;) -} - -// Map colors from one source space to another. These source spaces must be -// known (i.e. not MP_CSP_*_AUTO), as this function won't perform any -// auto-guessing. If is_linear is true, we assume the input has already been -// linearized (e.g. for linear-scaling). If `detect_peak` is true, we will -// detect the peak instead of relying on metadata. 
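Restated as curves over the per-channel maximum σ = max(r, g, b) and the detected or signalled peak σ_peak, with c the user parameter and A–F the constants used in the shader, the Reinhard and Hable cases above reduce to

\[
\mathrm{Reinhard:}\;\; \sigma' = \frac{\sigma}{\sigma + o}\cdot\frac{\sigma_{\mathrm{peak}} + o}{\sigma_{\mathrm{peak}}},\;\; o = \frac{1-c}{c};
\qquad
\mathrm{Hable:}\;\; \sigma' = \frac{h(\sigma)}{h(\sigma_{\mathrm{peak}})},\;\;
h(x) = \frac{x\,(A x + C B) + D E}{x\,(A x + B) + D F} - \frac{E}{F}.
\]

The Mobius case keeps the identity below the knee j and above it applies M(x) = scale·(x + a)/(x + b), with a, b and scale solved from M(j) = j, M(σ_peak) = 1 and M'(j) = 1 as the comment states; the final color is then scaled by σ'/σ, linearly, to prevent discoloration.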
Note that this requires -// the caller to have already bound the appropriate SSBO and set up the -// compute shader metadata -void pass_color_map(struct gl_shader_cache *sc, - struct mp_colorspace src, struct mp_colorspace dst, - enum tone_mapping algo, float tone_mapping_param, - float tone_mapping_desat, bool detect_peak, - bool gamut_warning, bool is_linear) -{ - GLSLF("// color mapping\n"); - - // Compute the highest encodable level - float src_range = mp_trc_nom_peak(src.gamma), - dst_range = mp_trc_nom_peak(dst.gamma); - float ref_peak = src.sig_peak / dst_range; - - // Some operations need access to the video's luma coefficients, so make - // them available - float rgb2xyz[3][3]; - mp_get_rgb2xyz_matrix(mp_get_csp_primaries(src.primaries), rgb2xyz); - gl_sc_uniform_vec3(sc, "src_luma", rgb2xyz[1]); - mp_get_rgb2xyz_matrix(mp_get_csp_primaries(dst.primaries), rgb2xyz); - gl_sc_uniform_vec3(sc, "dst_luma", rgb2xyz[1]); - - // All operations from here on require linear light as a starting point, - // so we linearize even if src.gamma == dst.gamma when one of the other - // operations needs it - bool need_gamma = src.gamma != dst.gamma || - src.primaries != dst.primaries || - src_range != dst_range || - src.sig_peak > dst_range || - src.light != dst.light; - - if (need_gamma && !is_linear) { - pass_linearize(sc, src.gamma); - is_linear= true; - } - - if (src.light != dst.light) - pass_ootf(sc, src.light, mp_trc_nom_peak(src.gamma)); - - // Rescale the signal to compensate for differences in the encoding range - // and reference white level. This is necessary because of how mpv encodes - // brightness in textures. - if (src_range != dst_range) { - GLSLF("// rescale value range;\n"); - GLSLF("color.rgb *= vec3(%f);\n", src_range / dst_range); - } - - // Adapt to the right colorspace if necessary - if (src.primaries != dst.primaries) { - struct mp_csp_primaries csp_src = mp_get_csp_primaries(src.primaries), - csp_dst = mp_get_csp_primaries(dst.primaries); - float m[3][3] = {{0}}; - mp_get_cms_matrix(csp_src, csp_dst, MP_INTENT_RELATIVE_COLORIMETRIC, m); - gl_sc_uniform_mat3(sc, "cms_matrix", true, &m[0][0]); - GLSL(color.rgb = cms_matrix * color.rgb;) - // Since this can reduce the gamut, figure out by how much - for (int c = 0; c < 3; c++) - ref_peak = MPMAX(ref_peak, m[c][c]); - } - - // Tone map to prevent clipping when the source signal peak exceeds the - // encodable range or we've reduced the gamut - if (ref_peak > 1) { - pass_tone_map(sc, detect_peak ? 0 : ref_peak, algo, - tone_mapping_param, tone_mapping_desat); - } - - if (src.light != dst.light) - pass_inverse_ootf(sc, dst.light, mp_trc_nom_peak(dst.gamma)); - - // Warn for remaining out-of-gamut colors is enabled - if (gamut_warning) { - GLSL(if (any(greaterThan(color.rgb, vec3(1.01))))) - GLSL(color.rgb = vec3(1.0) - color.rgb;) // invert - } - - if (is_linear) - pass_delinearize(sc, dst.gamma); -} - -// Wide usage friendly PRNG, shamelessly stolen from a GLSL tricks forum post. -// Obtain random numbers by calling rand(h), followed by h = permute(h) to -// update the state. Assumes the texture was hooked. 
-static void prng_init(struct gl_shader_cache *sc, AVLFG *lfg) -{ - GLSLH(float mod289(float x) { return x - floor(x * 1.0/289.0) * 289.0; }) - GLSLH(float permute(float x) { return mod289((34.0*x + 1.0) * x); }) - GLSLH(float rand(float x) { return fract(x * 1.0/41.0); }) - - // Initialize the PRNG by hashing the position + a random uniform - GLSL(vec3 _m = vec3(HOOKED_pos, random) + vec3(1.0);) - GLSL(float h = permute(permute(permute(_m.x)+_m.y)+_m.z);) - gl_sc_uniform_f(sc, "random", (double)av_lfg_get(lfg) / UINT32_MAX); -} - -struct deband_opts { - int enabled; - int iterations; - float threshold; - float range; - float grain; -}; - -const struct deband_opts deband_opts_def = { - .iterations = 1, - .threshold = 64.0, - .range = 16.0, - .grain = 48.0, -}; - -#define OPT_BASE_STRUCT struct deband_opts -const struct m_sub_options deband_conf = { - .opts = (const m_option_t[]) { - OPT_INTRANGE("iterations", iterations, 0, 1, 16), - OPT_FLOATRANGE("threshold", threshold, 0, 0.0, 4096.0), - OPT_FLOATRANGE("range", range, 0, 1.0, 64.0), - OPT_FLOATRANGE("grain", grain, 0, 0.0, 4096.0), - {0} - }, - .size = sizeof(struct deband_opts), - .defaults = &deband_opts_def, -}; - -// Stochastically sample a debanded result from a hooked texture. -void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts, - AVLFG *lfg, enum mp_csp_trc trc) -{ - // Initialize the PRNG - GLSLF("{\n"); - prng_init(sc, lfg); - - // Helper: Compute a stochastic approximation of the avg color around a - // pixel - GLSLHF("vec4 average(float range, inout float h) {\n"); - // Compute a random rangle and distance - GLSLH(float dist = rand(h) * range; h = permute(h);) - GLSLH(float dir = rand(h) * 6.2831853; h = permute(h);) - GLSLH(vec2 o = dist * vec2(cos(dir), sin(dir));) - - // Sample at quarter-turn intervals around the source pixel - GLSLH(vec4 ref[4];) - GLSLH(ref[0] = HOOKED_texOff(vec2( o.x, o.y));) - GLSLH(ref[1] = HOOKED_texOff(vec2(-o.y, o.x));) - GLSLH(ref[2] = HOOKED_texOff(vec2(-o.x, -o.y));) - GLSLH(ref[3] = HOOKED_texOff(vec2( o.y, -o.x));) - - // Return the (normalized) average - GLSLH(return (ref[0] + ref[1] + ref[2] + ref[3])*0.25;) - GLSLHF("}\n"); - - // Sample the source pixel - GLSL(color = HOOKED_tex(HOOKED_pos);) - GLSLF("vec4 avg, diff;\n"); - for (int i = 1; i <= opts->iterations; i++) { - // Sample the average pixel and use it instead of the original if - // the difference is below the given threshold - GLSLF("avg = average(%f, h);\n", i * opts->range); - GLSL(diff = abs(color - avg);) - GLSLF("color = mix(avg, color, greaterThan(diff, vec4(%f)));\n", - opts->threshold / (i * 16384.0)); - } - - // Add some random noise to smooth out residual differences - GLSL(vec3 noise;) - GLSL(noise.x = rand(h); h = permute(h);) - GLSL(noise.y = rand(h); h = permute(h);) - GLSL(noise.z = rand(h); h = permute(h);) - - // Noise is scaled to the signal level to prevent extreme noise for HDR - float gain = opts->grain/8192.0 / mp_trc_nom_peak(trc); - GLSLF("color.xyz += %f * (noise - vec3(0.5));\n", gain); - GLSLF("}\n"); -} - -// Assumes the texture was hooked -void pass_sample_unsharp(struct gl_shader_cache *sc, float param) { - GLSLF("{\n"); - GLSL(float st1 = 1.2;) - GLSL(vec4 p = HOOKED_tex(HOOKED_pos);) - GLSL(vec4 sum1 = HOOKED_texOff(st1 * vec2(+1, +1)) - + HOOKED_texOff(st1 * vec2(+1, -1)) - + HOOKED_texOff(st1 * vec2(-1, +1)) - + HOOKED_texOff(st1 * vec2(-1, -1));) - GLSL(float st2 = 1.5;) - GLSL(vec4 sum2 = HOOKED_texOff(st2 * vec2(+1, 0)) - + HOOKED_texOff(st2 * vec2( 0, +1)) - + 
HOOKED_texOff(st2 * vec2(-1, 0)) - + HOOKED_texOff(st2 * vec2( 0, -1));) - GLSL(vec4 t = p * 0.859375 + sum2 * -0.1171875 + sum1 * -0.09765625;) - GLSLF("color = p + t * %f;\n", param); - GLSLF("}\n"); -} diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h deleted file mode 100644 index 8345e4c598..0000000000 --- a/video/out/opengl/video_shaders.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#ifndef MP_GL_VIDEO_SHADERS_H -#define MP_GL_VIDEO_SHADERS_H - -#include - -#include "utils.h" -#include "video.h" - -extern const struct deband_opts deband_opts_def; -extern const struct m_sub_options deband_conf; - -void sampler_prelude(struct gl_shader_cache *sc, int tex_num); -void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler, - int d_x, int d_y); -void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, - int components, int glsl_version); -void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, - int components, int bw, int bh, int iw, int ih); -void pass_sample_bicubic_fast(struct gl_shader_cache *sc); -void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler, - int w, int h); - -void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc); -void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc); -void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak); -void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak); - -void pass_color_map(struct gl_shader_cache *sc, - struct mp_colorspace src, struct mp_colorspace dst, - enum tone_mapping algo, float tone_mapping_param, - float tone_mapping_desat, bool use_detected_peak, - bool gamut_warning, bool is_linear); - -void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts, - AVLFG *lfg, enum mp_csp_trc trc); - -void pass_sample_unsharp(struct gl_shader_cache *sc, float param); - -#endif diff --git a/video/out/vo.c b/video/out/vo.c index f9c5d04e24..a40360b188 100644 --- a/video/out/vo.c +++ b/video/out/vo.c @@ -50,6 +50,7 @@ extern const struct vo_driver video_out_x11; extern const struct vo_driver video_out_vdpau; extern const struct vo_driver video_out_xv; +extern const struct vo_driver video_out_gpu; extern const struct vo_driver video_out_opengl; extern const struct vo_driver video_out_opengl_cb; extern const struct vo_driver video_out_null; @@ -69,8 +70,8 @@ const struct vo_driver *const video_out_drivers[] = #if HAVE_RPI &video_out_rpi, #endif -#if HAVE_GL - &video_out_opengl, +#if HAVE_GPU + &video_out_gpu, #endif #if HAVE_VDPAU &video_out_vdpau, @@ -107,6 +108,7 @@ const struct vo_driver *const video_out_drivers[] = &video_out_lavc, #endif #if HAVE_GL + &video_out_opengl, &video_out_opengl_cb, #endif NULL diff --git a/video/out/vo_gpu.c b/video/out/vo_gpu.c new file mode 100644 index 
0000000000..5df9e06f47 --- /dev/null +++ b/video/out/vo_gpu.c @@ -0,0 +1,385 @@ +/* + * Based on vo_gl.c by Reimar Doeffinger. + * + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "config.h" + +#include "mpv_talloc.h" +#include "common/common.h" +#include "misc/bstr.h" +#include "common/msg.h" +#include "common/global.h" +#include "options/m_config.h" +#include "vo.h" +#include "video/mp_image.h" +#include "sub/osd.h" + +#include "gpu/context.h" +#include "gpu/hwdec.h" +#include "gpu/video.h" + +struct gpu_priv { + struct vo *vo; + struct mp_log *log; + struct ra_ctx *ctx; + + char *context_name; + char *context_type; + struct ra_ctx_opts opts; + struct gl_video *renderer; + struct ra_hwdec *hwdec; + + int events; +}; + +static void resize(struct gpu_priv *p) +{ + struct vo *vo = p->vo; + + MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); + + struct mp_rect src, dst; + struct mp_osd_res osd; + vo_get_src_dst_rects(vo, &src, &dst, &osd); + + gl_video_resize(p->renderer, &src, &dst, &osd); + + vo->want_redraw = true; +} + +static void draw_frame(struct vo *vo, struct vo_frame *frame) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + + struct ra_tex *tex = sw->fns->start_frame(sw); + if (!tex) { + MP_ERR(vo, "Failed starting frame!\n"); + return; + } + + struct fbodst dst = { + .tex = tex, + .flip = sw->flip_v, + }; + + gl_video_render_frame(p->renderer, frame, dst); + if (!sw->fns->submit_frame(sw, frame)) { + MP_ERR(vo, "Failed presenting frame!\n"); + return; + } +} + +static void flip_page(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + sw->fns->swap_buffers(sw); +} + +static int query_format(struct vo *vo, int format) +{ + struct gpu_priv *p = vo->priv; + if (!gl_video_check_format(p->renderer, format)) + return 0; + return 1; +} + +static int reconfig(struct vo *vo, struct mp_image_params *params) +{ + struct gpu_priv *p = vo->priv; + + if (!p->ctx->fns->reconfig(p->ctx)) + return -1; + + resize(p); + gl_video_config(p->renderer, params); + + return 0; +} + +static void request_hwdec_api(struct vo *vo, void *api) +{ + struct gpu_priv *p = vo->priv; + + if (p->hwdec) + return; + + p->hwdec = ra_hwdec_load_api(p->vo->log, p->ctx->ra, p->vo->global, + vo->hwdec_devs, (intptr_t)api); + gl_video_set_hwdec(p->renderer, p->hwdec); +} + +static void call_request_hwdec_api(void *ctx, enum hwdec_type type) +{ + // Roundabout way to run hwdec loading on the VO thread. + // Redirects to request_hwdec_api(). 
+ vo_control(ctx, VOCTRL_LOAD_HWDEC_API, (void *)(intptr_t)type); +} + +static void get_and_update_icc_profile(struct gpu_priv *p) +{ + if (gl_video_icc_auto_enabled(p->renderer)) { + MP_VERBOSE(p, "Querying ICC profile...\n"); + bstr icc = bstr0(NULL); + int r = p->ctx->fns->control(p->ctx, &p->events, VOCTRL_GET_ICC_PROFILE, &icc); + + if (r != VO_NOTAVAIL) { + if (r == VO_FALSE) { + MP_WARN(p, "Could not retrieve an ICC profile.\n"); + } else if (r == VO_NOTIMPL) { + MP_ERR(p, "icc-profile-auto not implemented on this platform.\n"); + } + + gl_video_set_icc_profile(p->renderer, icc); + } + } +} + +static void get_and_update_ambient_lighting(struct gpu_priv *p) +{ + int lux; + int r = p->ctx->fns->control(p->ctx, &p->events, VOCTRL_GET_AMBIENT_LUX, &lux); + if (r == VO_TRUE) { + gl_video_set_ambient_lux(p->renderer, lux); + } + if (r != VO_TRUE && gl_video_gamma_auto_enabled(p->renderer)) { + MP_ERR(p, "gamma_auto option provided, but querying for ambient" + " lighting is not supported on this platform\n"); + } +} + +static int control(struct vo *vo, uint32_t request, void *data) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + + switch (request) { + case VOCTRL_SET_PANSCAN: + resize(p); + return VO_TRUE; + case VOCTRL_SET_EQUALIZER: + vo->want_redraw = true; + return VO_TRUE; + case VOCTRL_SCREENSHOT_WIN: { + struct mp_image *screen = NULL; + if (sw->fns->screenshot) + screen = sw->fns->screenshot(sw); + if (!screen) + break; // redirect to backend + // set image parameters according to the display, if possible + screen->params.color = gl_video_get_output_colorspace(p->renderer); + *(struct mp_image **)data = screen; + return true; + } + case VOCTRL_LOAD_HWDEC_API: + request_hwdec_api(vo, data); + return true; + case VOCTRL_UPDATE_RENDER_OPTS: { + gl_video_update_options(p->renderer); + get_and_update_icc_profile(p); + gl_video_configure_queue(p->renderer, p->vo); + p->vo->want_redraw = true; + return true; + } + case VOCTRL_RESET: + gl_video_reset(p->renderer); + return true; + case VOCTRL_PAUSE: + if (gl_video_showing_interpolated_frame(p->renderer)) + vo->want_redraw = true; + return true; + case VOCTRL_PERFORMANCE_DATA: + gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data); + return true; + } + + int events = 0; + int r = p->ctx->fns->control(p->ctx, &events, request, data); + if (events & VO_EVENT_ICC_PROFILE_CHANGED) { + get_and_update_icc_profile(p); + vo->want_redraw = true; + } + if (events & VO_EVENT_AMBIENT_LIGHTING_CHANGED) { + get_and_update_ambient_lighting(p); + vo->want_redraw = true; + } + events |= p->events; + p->events = 0; + if (events & VO_EVENT_RESIZE) + resize(p); + if (events & VO_EVENT_EXPOSE) + vo->want_redraw = true; + vo_event(vo, events); + + return r; +} + +static void wakeup(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + if (p->ctx && p->ctx->fns->wakeup) + p->ctx->fns->wakeup(p->ctx); +} + +static void wait_events(struct vo *vo, int64_t until_time_us) +{ + struct gpu_priv *p = vo->priv; + if (p->ctx && p->ctx->fns->wait_events) { + p->ctx->fns->wait_events(p->ctx, until_time_us); + } else { + vo_wait_default(vo, until_time_us); + } +} + +static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h, + int stride_align) +{ + struct gpu_priv *p = vo->priv; + + return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align); +} + +static void uninit(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + + gl_video_uninit(p->renderer); + ra_hwdec_uninit(p->hwdec); + if (vo->hwdec_devs) 
{ + hwdec_devices_set_loader(vo->hwdec_devs, NULL, NULL); + hwdec_devices_destroy(vo->hwdec_devs); + } + ra_ctx_destroy(&p->ctx); +} + +static int preinit(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + p->vo = vo; + p->log = vo->log; + + int alpha_mode; + mp_read_option_raw(vo->global, "alpha", &m_option_type_choice, &alpha_mode); + + struct ra_ctx_opts opts = p->opts; + opts.want_alpha = alpha_mode == 1; + + p->ctx = ra_ctx_create(vo, p->context_type, p->context_name, opts); + if (!p->ctx) + goto err_out; + assert(p->ctx->ra); + assert(p->ctx->swapchain); + struct ra_swapchain *sw = p->ctx->swapchain; + + p->renderer = gl_video_init(p->ctx->ra, vo->log, vo->global); + gl_video_set_osd_source(p->renderer, vo->osd); + gl_video_configure_queue(p->renderer, vo); + + get_and_update_icc_profile(p); + + vo->hwdec_devs = hwdec_devices_create(); + + hwdec_devices_set_loader(vo->hwdec_devs, call_request_hwdec_api, vo); + + p->hwdec = ra_hwdec_load(p->vo->log, p->ctx->ra, vo->global, + vo->hwdec_devs, vo->opts->gl_hwdec_interop); + gl_video_set_hwdec(p->renderer, p->hwdec); + + int fb_depth = sw->fns->color_depth ? sw->fns->color_depth(sw) : 0; + if (fb_depth) + MP_VERBOSE(p, "Reported display depth: %d\n", fb_depth); + gl_video_set_fb_depth(p->renderer, fb_depth); + + return 0; + +err_out: + uninit(vo); + return -1; +} + +#define OPT_BASE_STRUCT struct gpu_priv +static const m_option_t options[] = { + OPT_STRING_VALIDATE("gpu-context", context_name, 0, ra_ctx_validate_context), + OPT_STRING_VALIDATE("gpu-api", context_type, 0, ra_ctx_validate_api), + OPT_FLAG("gpu-debug", opts.debug, 0), + OPT_FLAG("gpu-sw", opts.allow_sw, 0), + OPT_INTRANGE("swapchain-depth", opts.swapchain_depth, 0, 1, 8), + {0} +}; + +static const struct gpu_priv defaults = { .opts = { + .swapchain_depth = 3, +}}; + +const struct vo_driver video_out_gpu = { + .description = "Shader-based GPU Renderer", + .name = "gpu", + .caps = VO_CAP_ROTATE90, + .preinit = preinit, + .query_format = query_format, + .reconfig = reconfig, + .control = control, + .get_image = get_image, + .draw_frame = draw_frame, + .flip_page = flip_page, + .wait_events = wait_events, + .wakeup = wakeup, + .uninit = uninit, + .priv_size = sizeof(struct gpu_priv), + .priv_defaults = &defaults, + .options = options, +}; + +static int preinit_opengl(struct vo *vo) +{ + MP_WARN(vo, "--vo=opengl was replaced by --vo=gpu --gpu-api=opengl, and will" + " be removed in the future!\n"); + + struct gpu_priv *p = vo->priv; + p->context_type = "opengl"; + return preinit(vo); +} + +const struct vo_driver video_out_opengl = { + .description = "Shader-based GPU Renderer", + .name = "opengl", + .caps = VO_CAP_ROTATE90, + .preinit = preinit_opengl, + .query_format = query_format, + .reconfig = reconfig, + .control = control, + .get_image = get_image, + .draw_frame = draw_frame, + .flip_page = flip_page, + .wait_events = wait_events, + .wakeup = wakeup, + .uninit = uninit, + .priv_size = sizeof(struct gpu_priv), + .priv_defaults = &defaults, + .options = options, +}; diff --git a/video/out/vo_opengl.c b/video/out/vo_opengl.c deleted file mode 100644 index 72691e56c2..0000000000 --- a/video/out/vo_opengl.c +++ /dev/null @@ -1,470 +0,0 @@ -/* - * Based on vo_gl.c by Reimar Doeffinger. - * - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include "config.h" - -#include "mpv_talloc.h" -#include "common/common.h" -#include "misc/bstr.h" -#include "common/msg.h" -#include "common/global.h" -#include "options/m_config.h" -#include "vo.h" -#include "video/mp_image.h" -#include "sub/osd.h" - -#include "opengl/context.h" -#include "opengl/utils.h" -#include "opengl/hwdec.h" -#include "opengl/osd.h" -#include "filter_kernels.h" -#include "video/hwdec.h" -#include "opengl/video.h" -#include "opengl/ra_gl.h" - -#define NUM_VSYNC_FENCES 10 - -struct vo_opengl_opts { - int use_glFinish; - int waitvsync; - int use_gl_debug; - int allow_sw; - int swap_interval; - int vsync_fences; - char *backend; - int es; - int pattern[2]; -}; - -struct gl_priv { - struct vo *vo; - struct mp_log *log; - MPGLContext *glctx; - GL *gl; - struct ra *ra; - - struct vo_opengl_opts opts; - - struct gl_video *renderer; - - struct ra_hwdec *hwdec; - - int events; - - int frames_rendered; - unsigned int prev_sgi_sync_count; - - // check-pattern sub-option; for testing/debugging - int last_pattern; - int matches, mismatches; - - GLsync vsync_fences[NUM_VSYNC_FENCES]; - int num_vsync_fences; -}; - -static void resize(struct gl_priv *p) -{ - struct vo *vo = p->vo; - - MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); - - struct mp_rect src, dst; - struct mp_osd_res osd; - vo_get_src_dst_rects(vo, &src, &dst, &osd); - - gl_video_resize(p->renderer, &src, &dst, &osd); - - vo->want_redraw = true; -} - -static void check_pattern(struct vo *vo, int item) -{ - struct gl_priv *p = vo->priv; - int expected = p->opts.pattern[p->last_pattern]; - if (item == expected) { - p->last_pattern++; - if (p->last_pattern >= 2) - p->last_pattern = 0; - p->matches++; - } else { - p->mismatches++; - MP_WARN(vo, "wrong pattern, expected %d got %d (hit: %d, mis: %d)\n", - expected, item, p->matches, p->mismatches); - } -} - -static void draw_frame(struct vo *vo, struct vo_frame *frame) -{ - struct gl_priv *p = vo->priv; - GL *gl = p->gl; - - mpgl_start_frame(p->glctx); - - if (gl->FenceSync && p->num_vsync_fences < p->opts.vsync_fences) { - GLsync fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);; - if (fence) - p->vsync_fences[p->num_vsync_fences++] = fence; - } - - struct fbodst target = { - .tex = ra_create_wrapped_fb(p->ra, p->glctx->main_fb, - vo->dwidth, vo->dheight), - .flip = !p->glctx->flip_v, - }; - gl_video_render_frame(p->renderer, frame, target); - ra_tex_free(p->ra, &target.tex); - - if (p->opts.use_glFinish) - gl->Finish(); -} - -static void flip_page(struct vo *vo) -{ - struct gl_priv *p = vo->priv; - GL *gl = p->gl; - - mpgl_swap_buffers(p->glctx); - - p->frames_rendered++; - if (p->frames_rendered > 5 && !p->opts.use_gl_debug) - ra_gl_set_debug(p->ra, false); - - if (p->opts.use_glFinish) - gl->Finish(); - - if (p->opts.waitvsync || p->opts.pattern[0]) { - if (gl->GetVideoSync) { - unsigned int n1 = 0, n2 = 0; - gl->GetVideoSync(&n1); - if (p->opts.waitvsync) - gl->WaitVideoSync(2, (n1 + 1) % 2, &n2); - int step = n1 - p->prev_sgi_sync_count; - p->prev_sgi_sync_count = n1; - MP_DBG(vo, "Flip counts: %u->%u, 
step=%d\n", n1, n2, step); - if (p->opts.pattern[0]) - check_pattern(vo, step); - } else { - MP_WARN(vo, "GLX_SGI_video_sync not available, disabling.\n"); - p->opts.waitvsync = 0; - p->opts.pattern[0] = 0; - } - } - while (p->opts.vsync_fences > 0 && p->num_vsync_fences >= p->opts.vsync_fences) { - gl->ClientWaitSync(p->vsync_fences[0], GL_SYNC_FLUSH_COMMANDS_BIT, 1e9); - gl->DeleteSync(p->vsync_fences[0]); - MP_TARRAY_REMOVE_AT(p->vsync_fences, p->num_vsync_fences, 0); - } -} - -static int query_format(struct vo *vo, int format) -{ - struct gl_priv *p = vo->priv; - if (!gl_video_check_format(p->renderer, format)) - return 0; - return 1; -} - -static int reconfig(struct vo *vo, struct mp_image_params *params) -{ - struct gl_priv *p = vo->priv; - - if (mpgl_reconfig_window(p->glctx) < 0) - return -1; - - resize(p); - - gl_video_config(p->renderer, params); - - return 0; -} - -static void request_hwdec_api(struct vo *vo, void *api) -{ - struct gl_priv *p = vo->priv; - - if (p->hwdec) - return; - - p->hwdec = ra_hwdec_load_api(p->vo->log, p->ra, p->vo->global, - vo->hwdec_devs, (intptr_t)api); - gl_video_set_hwdec(p->renderer, p->hwdec); -} - -static void call_request_hwdec_api(void *ctx, enum hwdec_type type) -{ - // Roundabout way to run hwdec loading on the VO thread. - // Redirects to request_hwdec_api(). - vo_control(ctx, VOCTRL_LOAD_HWDEC_API, (void *)(intptr_t)type); -} - -static void get_and_update_icc_profile(struct gl_priv *p) -{ - if (gl_video_icc_auto_enabled(p->renderer)) { - MP_VERBOSE(p, "Querying ICC profile...\n"); - bstr icc = bstr0(NULL); - int r = mpgl_control(p->glctx, &p->events, VOCTRL_GET_ICC_PROFILE, &icc); - - if (r != VO_NOTAVAIL) { - if (r == VO_FALSE) { - MP_WARN(p, "Could not retrieve an ICC profile.\n"); - } else if (r == VO_NOTIMPL) { - MP_ERR(p, "icc-profile-auto not implemented on this platform.\n"); - } - - gl_video_set_icc_profile(p->renderer, icc); - } - } -} - -static void get_and_update_ambient_lighting(struct gl_priv *p) -{ - int lux; - int r = mpgl_control(p->glctx, &p->events, VOCTRL_GET_AMBIENT_LUX, &lux); - if (r == VO_TRUE) { - gl_video_set_ambient_lux(p->renderer, lux); - } - if (r != VO_TRUE && gl_video_gamma_auto_enabled(p->renderer)) { - MP_ERR(p, "gamma_auto option provided, but querying for ambient" - " lighting is not supported on this platform\n"); - } -} - -static int control(struct vo *vo, uint32_t request, void *data) -{ - struct gl_priv *p = vo->priv; - - switch (request) { - case VOCTRL_SET_PANSCAN: - resize(p); - return VO_TRUE; - case VOCTRL_SET_EQUALIZER: - vo->want_redraw = true; - return VO_TRUE; - case VOCTRL_SCREENSHOT_WIN: { - struct mp_image *screen = gl_read_fbo_contents(p->gl, p->glctx->main_fb, - vo->dwidth, vo->dheight); - if (!screen) - break; // redirect to backend - // set image parameters according to the display, if possible - screen->params.color = gl_video_get_output_colorspace(p->renderer); - if (p->glctx->flip_v) - mp_image_vflip(screen); - *(struct mp_image **)data = screen; - return true; - } - case VOCTRL_LOAD_HWDEC_API: - request_hwdec_api(vo, data); - return true; - case VOCTRL_UPDATE_RENDER_OPTS: { - gl_video_update_options(p->renderer); - get_and_update_icc_profile(p); - gl_video_configure_queue(p->renderer, p->vo); - p->vo->want_redraw = true; - return true; - } - case VOCTRL_RESET: - gl_video_reset(p->renderer); - return true; - case VOCTRL_PAUSE: - if (gl_video_showing_interpolated_frame(p->renderer)) - vo->want_redraw = true; - return true; - case VOCTRL_PERFORMANCE_DATA: - 
gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data); - return true; - } - - int events = 0; - int r = mpgl_control(p->glctx, &events, request, data); - if (events & VO_EVENT_ICC_PROFILE_CHANGED) { - get_and_update_icc_profile(p); - vo->want_redraw = true; - } - if (events & VO_EVENT_AMBIENT_LIGHTING_CHANGED) { - get_and_update_ambient_lighting(p); - vo->want_redraw = true; - } - events |= p->events; - p->events = 0; - if (events & VO_EVENT_RESIZE) - resize(p); - if (events & VO_EVENT_EXPOSE) - vo->want_redraw = true; - vo_event(vo, events); - - return r; -} - -static void wakeup(struct vo *vo) -{ - struct gl_priv *p = vo->priv; - if (p->glctx && p->glctx->driver->wakeup) - p->glctx->driver->wakeup(p->glctx); -} - -static void wait_events(struct vo *vo, int64_t until_time_us) -{ - struct gl_priv *p = vo->priv; - if (p->glctx->driver->wait_events) { - p->glctx->driver->wait_events(p->glctx, until_time_us); - } else { - vo_wait_default(vo, until_time_us); - } -} - -static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h, - int stride_align) -{ - struct gl_priv *p = vo->priv; - - return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align); -} - -static void uninit(struct vo *vo) -{ - struct gl_priv *p = vo->priv; - - gl_video_uninit(p->renderer); - ra_hwdec_uninit(p->hwdec); - if (vo->hwdec_devs) { - hwdec_devices_set_loader(vo->hwdec_devs, NULL, NULL); - hwdec_devices_destroy(vo->hwdec_devs); - } - ra_free(&p->ra); - mpgl_uninit(p->glctx); -} - -static int preinit(struct vo *vo) -{ - struct gl_priv *p = vo->priv; - p->vo = vo; - p->log = vo->log; - - int vo_flags = 0; - - int alpha_mode; - mp_read_option_raw(vo->global, "alpha", &m_option_type_choice, &alpha_mode); - - if (alpha_mode == 1) - vo_flags |= VOFLAG_ALPHA; - - if (p->opts.use_gl_debug) - vo_flags |= VOFLAG_GL_DEBUG; - - if (p->opts.es == 1) - vo_flags |= VOFLAG_GLES; - if (p->opts.es == 2) - vo_flags |= VOFLAG_GLES | VOFLAG_GLES2; - if (p->opts.es == -1) - vo_flags |= VOFLAG_NO_GLES; - - if (p->opts.allow_sw) - vo_flags |= VOFLAG_SW; - - p->glctx = mpgl_init(vo, p->opts.backend, vo_flags); - if (!p->glctx) - goto err_out; - p->gl = p->glctx->gl; - - if (p->gl->SwapInterval) { - p->gl->SwapInterval(p->opts.swap_interval); - } else { - MP_VERBOSE(vo, "swap_control extension missing.\n"); - } - - p->ra = ra_create_gl(p->gl, vo->log); - if (!p->ra) - goto err_out; - - p->renderer = gl_video_init(p->ra, vo->log, vo->global); - gl_video_set_osd_source(p->renderer, vo->osd); - gl_video_configure_queue(p->renderer, vo); - - get_and_update_icc_profile(p); - - vo->hwdec_devs = hwdec_devices_create(); - - hwdec_devices_set_loader(vo->hwdec_devs, call_request_hwdec_api, vo); - - p->hwdec = ra_hwdec_load(p->vo->log, p->ra, vo->global, - vo->hwdec_devs, vo->opts->gl_hwdec_interop); - gl_video_set_hwdec(p->renderer, p->hwdec); - - gl_check_error(p->gl, p->log, "before retrieving framebuffer depth"); - int fb_depth = gl_get_fb_depth(p->gl, p->glctx->main_fb); - gl_check_error(p->gl, p->log, "retrieving framebuffer depth"); - if (fb_depth) - MP_VERBOSE(p, "Reported display depth: %d\n", fb_depth); - gl_video_set_fb_depth(p->renderer, fb_depth); - - return 0; - -err_out: - uninit(vo); - return -1; -} - -#define OPT_BASE_STRUCT struct gl_priv - -const struct vo_driver video_out_opengl = { - .description = "Extended OpenGL Renderer", - .name = "opengl", - .caps = VO_CAP_ROTATE90, - .preinit = preinit, - .query_format = query_format, - .reconfig = reconfig, - .control = control, - .get_image = get_image, - 
.draw_frame = draw_frame, - .flip_page = flip_page, - .wait_events = wait_events, - .wakeup = wakeup, - .uninit = uninit, - .priv_size = sizeof(struct gl_priv), - .options = (const m_option_t[]) { - OPT_FLAG("opengl-glfinish", opts.use_glFinish, 0), - OPT_FLAG("opengl-waitvsync", opts.waitvsync, 0), - OPT_INT("opengl-swapinterval", opts.swap_interval, 0), - OPT_FLAG("opengl-debug", opts.use_gl_debug, 0), - OPT_STRING_VALIDATE("opengl-backend", opts.backend, 0, - mpgl_validate_backend_opt), - OPT_FLAG("opengl-sw", opts.allow_sw, 0), - OPT_CHOICE("opengl-es", opts.es, 0, ({"no", -1}, {"auto", 0}, - {"yes", 1}, {"force2", 2})), - OPT_INTPAIR("opengl-check-pattern", opts.pattern, 0), - OPT_INTRANGE("opengl-vsync-fences", opts.vsync_fences, 0, - 0, NUM_VSYNC_FENCES), - - {0} - }, - .priv_defaults = &(const struct gl_priv){ - .opts = { - .swap_interval = 1, - }, - }, -}; diff --git a/video/out/vo_opengl_cb.c b/video/out/vo_opengl_cb.c index ea6aaa9048..7e95e8bd31 100644 --- a/video/out/vo_opengl_cb.c +++ b/video/out/vo_opengl_cb.c @@ -24,9 +24,10 @@ #include "common/global.h" #include "player/client.h" +#include "gpu/video.h" +#include "gpu/hwdec.h" #include "opengl/common.h" -#include "opengl/video.h" -#include "opengl/hwdec.h" +#include "opengl/context.h" #include "opengl/ra_gl.h" #include "libmpv/opengl_cb.h" @@ -86,7 +87,7 @@ struct mpv_opengl_cb_context { // application's OpenGL context is current - i.e. only while the // host application is calling certain mpv_opengl_cb_* APIs. GL *gl; - struct ra *ra; + struct ra_ctx *ra_ctx; struct gl_video *renderer; struct ra_hwdec *hwdec; struct m_config_cache *vo_opts_cache; @@ -171,16 +172,36 @@ int mpv_opengl_cb_init_gl(struct mpv_opengl_cb_context *ctx, const char *exts, return MPV_ERROR_UNSUPPORTED; } - ctx->ra = ra_create_gl(ctx->gl, ctx->log); - if (!ctx->ra) + // initialize a blank ra_ctx to reuse ra_gl_ctx + ctx->ra_ctx = talloc_zero(ctx, struct ra_ctx); + ctx->ra_ctx->log = ctx->log; + ctx->ra_ctx->global = ctx->global; + ctx->ra_ctx->opts = (struct ra_ctx_opts) { + .probing = false, + .allow_sw = true, + }; + + static const struct ra_swapchain_fns empty_swapchain_fns = {0}; + struct ra_gl_ctx_params gl_params = { + // vo_opengl_cb is essentially like a gigantic external swapchain where + // the user is in charge of presentation / swapping etc. 
But we don't + // actually need to provide any of these functions, since we can just + // not call them to begin with - so just set it to an empty object to + // signal to ra_gl_ctx that we don't care about its latency emulation + // functionality + .external_swapchain = &empty_swapchain_fns + }; + + ctx->gl->SwapInterval = NULL; // we shouldn't randomly change this, so lock it + if (!ra_gl_ctx_init(ctx->ra_ctx, ctx->gl, gl_params)) return MPV_ERROR_UNSUPPORTED; - ctx->renderer = gl_video_init(ctx->ra, ctx->log, ctx->global); + ctx->renderer = gl_video_init(ctx->ra_ctx->ra, ctx->log, ctx->global); m_config_cache_update(ctx->vo_opts_cache); ctx->hwdec_devs = hwdec_devices_create(); - ctx->hwdec = ra_hwdec_load(ctx->log, ctx->ra, ctx->global, + ctx->hwdec = ra_hwdec_load(ctx->log, ctx->ra_ctx->ra, ctx->global, ctx->hwdec_devs, ctx->vo_opts->gl_hwdec_interop); gl_video_set_hwdec(ctx->renderer, ctx->hwdec); @@ -221,7 +242,7 @@ int mpv_opengl_cb_uninit_gl(struct mpv_opengl_cb_context *ctx) ctx->hwdec = NULL; hwdec_devices_destroy(ctx->hwdec_devs); ctx->hwdec_devs = NULL; - ra_free(&ctx->ra); + ra_ctx_destroy(&ctx->ra_ctx); talloc_free(ctx->gl); ctx->gl = NULL; return 0; @@ -236,11 +257,6 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) return MPV_ERROR_UNSUPPORTED; } - struct fbodst target = { - .tex = ra_create_wrapped_fb(ctx->ra, fbo, vp_w, abs(vp_h)), - .flip = vp_h < 0, - }; - reset_gl_state(ctx->gl); pthread_mutex_lock(&ctx->lock); @@ -280,7 +296,7 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) mp_read_option_raw(ctx->global, "opengl-debug", &m_option_type_flag, &debug); ctx->gl->debug_context = debug; - ra_gl_set_debug(ctx->ra, debug); + ra_gl_set_debug(ctx->ra_ctx->ra, debug); if (gl_video_icc_auto_enabled(ctx->renderer)) MP_ERR(ctx, "icc-profile-auto is not available with opengl-cb\n"); } @@ -316,7 +332,14 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) pthread_mutex_unlock(&ctx->lock); MP_STATS(ctx, "glcb-render"); + struct ra_swapchain *sw = ctx->ra_ctx->swapchain; + ra_gl_ctx_resize(sw, vp_w, abs(vp_h), fbo); + struct fbodst target = { + .tex = ra_gl_ctx_start_frame(sw), + .flip = vp_h < 0, + }; gl_video_render_frame(ctx->renderer, frame, target); + ra_gl_ctx_submit_frame(sw, frame); reset_gl_state(ctx->gl); @@ -328,8 +351,6 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) pthread_cond_wait(&ctx->wakeup, &ctx->lock); pthread_mutex_unlock(&ctx->lock); - ra_tex_free(ctx->ra, &target.tex); - return 0; } diff --git a/video/out/vo_rpi.c b/video/out/vo_rpi.c index 5b5d62c78f..8b819af163 100644 --- a/video/out/vo_rpi.c +++ b/video/out/vo_rpi.c @@ -44,7 +44,7 @@ #include "sub/osd.h" #include "opengl/ra_gl.h" -#include "opengl/video.h" +#include "gpu/video.h" struct mp_egl_rpi { struct mp_log *log; diff --git a/wscript b/wscript index 9c4c823a08..a50dcfed69 100644 --- a/wscript +++ b/wscript @@ -783,15 +783,20 @@ video_output_features = [ ), }, { 'name': '--gl', - 'desc': 'OpenGL video outputs', + 'desc': 'OpenGL context support', 'deps': 'gl-cocoa || gl-x11 || egl-x11 || egl-drm || ' + 'gl-win32 || gl-wayland || rpi || mali-fbdev || ' + 'plain-gl', 'func': check_true, + }, { + 'name': '--gpu', + 'desc': 'GPU-accelerated video output support', + 'deps': 'gl', + 'func': check_true, 'req': True, - 'fmsg': "No OpenGL video output found or enabled. " + - "Aborting. If you really mean to compile without OpenGL " + - "video outputs use --disable-gl." 
+ 'fmsg': "No GPU context found or enabled. Aborting. " + + "If you really mean to compile without support for " + + "`--vo=gpu`, then use --disable-gpu." }, { 'name': 'egl-helpers', 'desc': 'EGL helper functions', diff --git a/wscript_build.py b/wscript_build.py index 2f6c08bc0a..8dab6012c0 100644 --- a/wscript_build.py +++ b/wscript_build.py @@ -385,45 +385,46 @@ def build(ctx): ( "video/out/dither.c" ), ( "video/out/filter_kernels.c" ), ( "video/out/opengl/angle_dynamic.c", "egl-angle" ), + ( "video/out/gpu/context.c", "gpu" ), + ( "video/out/gpu/hwdec.c", "gpu" ), + ( "video/out/gpu/lcms.c", "gpu" ), + ( "video/out/gpu/osd.c", "gpu" ), + ( "video/out/gpu/ra.c", "gpu" ), + ( "video/out/gpu/shader_cache.c", "gpu" ), + ( "video/out/gpu/user_shaders.c", "gpu" ), + ( "video/out/gpu/utils.c", "gpu" ), + ( "video/out/gpu/video.c", "gpu" ), + ( "video/out/gpu/video_shaders.c", "gpu" ), ( "video/out/opengl/common.c", "gl" ), + ( "video/out/opengl/formats.c", "gl" ), + ( "video/out/opengl/utils.c", "gl" ), + ( "video/out/opengl/ra_gl.c", "gl" ), ( "video/out/opengl/context.c", "gl" ), - ( "video/out/opengl/context_angle.c", "egl-angle-win32" ), - ( "video/out/opengl/context_cocoa.c", "gl-cocoa" ), +# ( "video/out/opengl/context_angle.c", "egl-angle-win32" ), +# ( "video/out/opengl/context_cocoa.c", "gl-cocoa" ), ( "video/out/opengl/context_drm_egl.c", "egl-drm" ), - ( "video/out/opengl/context_dxinterop.c","gl-dxinterop" ), +# ( "video/out/opengl/context_dxinterop.c","gl-dxinterop" ), ( "video/out/opengl/context_mali_fbdev.c","mali-fbdev" ), ( "video/out/opengl/context_rpi.c", "rpi" ), ( "video/out/opengl/context_vdpau.c", "vdpau-gl-x11" ), ( "video/out/opengl/context_wayland.c", "gl-wayland" ), - ( "video/out/opengl/context_w32.c", "gl-win32" ), - ( "video/out/opengl/context_x11.c", "gl-x11" ), +# ( "video/out/opengl/context_w32.c", "gl-win32" ), + ( "video/out/opengl/context_glx.c", "gl-x11" ), ( "video/out/opengl/context_x11egl.c", "egl-x11" ), ( "video/out/opengl/cuda_dynamic.c", "cuda-hwaccel" ), - ( "video/out/opengl/d3d11_helpers.c", "egl-angle-win32" ), +# ( "video/out/opengl/d3d11_helpers.c", "egl-angle-win32" ), ( "video/out/opengl/egl_helpers.c", "egl-helpers" ), - ( "video/out/opengl/formats.c", "gl" ), - ( "video/out/opengl/gl_utils.c", "gl" ), - ( "video/out/opengl/hwdec.c", "gl" ), ( "video/out/opengl/hwdec_cuda.c", "cuda-hwaccel" ), - ( "video/out/opengl/hwdec_d3d11egl.c", "d3d-hwaccel" ), - ( "video/out/opengl/hwdec_d3d11eglrgb.c","d3d-hwaccel" ), - ( "video/out/opengl/hwdec_dxva2gldx.c", "gl-dxinterop-d3d9" ), - ( "video/out/opengl/hwdec_dxva2egl.c", "d3d9-hwaccel" ), +# ( "video/out/opengl/hwdec_d3d11egl.c", "d3d-hwaccel" ), +# ( "video/out/opengl/hwdec_d3d11eglrgb.c","d3d-hwaccel" ), +# ( "video/out/opengl/hwdec_dxva2gldx.c", "gl-dxinterop-d3d9" ), +# ( "video/out/opengl/hwdec_dxva2egl.c", "d3d9-hwaccel" ), ( "video/out/opengl/hwdec_osx.c", "videotoolbox-gl" ), ( "video/out/opengl/hwdec_ios.m", "ios-gl" ), ( "video/out/opengl/hwdec_rpi.c", "rpi" ), ( "video/out/opengl/hwdec_vaegl.c", "vaapi-egl" ), ( "video/out/opengl/hwdec_vaglx.c", "vaapi-glx" ), ( "video/out/opengl/hwdec_vdpau.c", "vdpau-gl-x11" ), - ( "video/out/opengl/lcms.c", "gl" ), - ( "video/out/opengl/osd.c", "gl" ), - ( "video/out/opengl/ra.c", "gl" ), - ( "video/out/opengl/ra_gl.c", "gl" ), - ( "video/out/opengl/shader_cache.c", "gl" ), - ( "video/out/opengl/user_shaders.c", "gl" ), - ( "video/out/opengl/utils.c", "gl" ), - ( "video/out/opengl/video.c", "gl" ), - ( "video/out/opengl/video_shaders.c", "gl" ), ( 
"video/out/vo.c" ), ( "video/out/vo_caca.c", "caca" ), ( "video/out/vo_drm.c", "drm" ), @@ -432,7 +433,7 @@ def build(ctx): ( "video/out/vo_lavc.c", "encoding" ), ( "video/out/vo_rpi.c", "rpi" ), ( "video/out/vo_null.c" ), - ( "video/out/vo_opengl.c", "gl" ), + ( "video/out/vo_gpu.c", "gpu" ), ( "video/out/vo_opengl_cb.c", "gl" ), ( "video/out/vo_sdl.c", "sdl2" ), ( "video/out/vo_tct.c" ), -- cgit v1.2.3