vo_opengl: check against shmem limits

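The fixed radius check was not strict enough, and in particular was not valid on all platforms, since the amount of shared memory available to compute shaders varies between them. To fix this, query the hardware's actual shared memory limit and check the required size against it, instead of relying on a hard-coded maximum radius.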
author: Niklas Haas <git@haasn.xyz> 2017-07-26 01:42:19 +0200
committer: Niklas Haas <git@haasn.xyz> 2017-07-26 01:54:33 +0200
commit: b31020b193db24e175bce077755c2f3e814e57ff (patch)
tree: 8be29707f6c60ea310c290f18f2f2fc38833983a
parent: 9875f14ad4cb977fb3b6460704b29d4949fcb81b (diff)
6 files changed, 54 insertions, 26 deletions
diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c
index c7a714817a..f6202e2c8c 100644
--- a/video/out/opengl/common.c
+++ b/video/out/opengl/common.c
@@ -625,8 +625,10 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
     }
 
     // GL_ARB_compute_shader & GL_ARB_shader_image_load_store
-    if (gl->DispatchCompute && gl->BindImageTexture)
+    if (gl->DispatchCompute && gl->BindImageTexture) {
         gl->mpgl_caps |= MPGL_CAP_COMPUTE_SHADER;
+        gl->GetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &gl->max_shmem);
+    }
 
     // Provided for simpler handling if no framebuffer support is available.
     if (!gl->BindFramebuffer)
diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h
index 6d8015c8b3..abc8f30192 100644
--- a/video/out/opengl/common.h
+++ b/video/out/opengl/common.h
@@ -87,6 +87,7 @@ struct GL {
     int glsl_version;           // e.g. 130 for GLSL 1.30
     char *extensions;           // Equivalent to GL_EXTENSIONS
     int mpgl_caps;              // Bitfield of MPGL_CAP_* constants
+    int max_shmem;              // Maximum shared memory for compute shaders
     bool debug_context;         // use of e.g. GLX_CONTEXT_DEBUG_BIT_ARB
 
     // Use mpgl_get_native_display() instead. Also, this is set to use the
diff --git a/video/out/opengl/gl_headers.h b/video/out/opengl/gl_headers.h
index e57cab35dc..9b9d1a506a 100644
--- a/video/out/opengl/gl_headers.h
+++ b/video/out/opengl/gl_headers.h
@@ -86,6 +86,7 @@
 // --- GL 4.3 or GL_ARB_compute_shader
 
 #define GL_COMPUTE_SHADER                 0x91B9
+#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE 0x8262
 
 // --- GL 4.3 or GL_ARB_shader_storage_buffer_object
 
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index 91adc62660..9867751684 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -1714,6 +1714,50 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src,
     pass_sample_separated_gen(p->sc, scaler, 1, 0);
 }
 
+// Picks either the compute shader version or the regular sampler version of
+// the polar scaler: compute needs both caps and an input tile fitting shmem.
+static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler,
+                                       struct img_tex tex, int w, int h)
+{
+    GL *gl = p->gl;
+
+    int reqs = MPGL_CAP_COMPUTE_SHADER | MPGL_CAP_NESTED_ARRAY;
+    if ((gl->mpgl_caps & reqs) != reqs) // need every cap, not just one of them
+        goto fallback;
+
+    int bound = ceil(scaler->kernel->radius_cutoff);
+    int offset = bound - 1; // padding top/left
+    int padding = offset + bound; // total padding
+
+    float ratiox = (float)w / tex.w,
+          ratioy = (float)h / tex.h;
+
+    // For performance we want to load at least as many pixels
+    // horizontally as there are threads in a warp (32 for nvidia), as
+    // well as enough to take advantage of shmem parallelism
+    const int warp_size = 32, threads = 256;
+    int bw = warp_size;
+    int bh = threads / bw;
+
+    // We need to sample everything from base_min to base_max, so make sure
+    // we have enough room in shmem
+    int iw = (int)ceil(bw / ratiox) + padding + 1,
+        ih = (int)ceil(bh / ratioy) + padding + 1;
+
+    int shmem_req = iw * ih * tex.components * sizeof(GLfloat);
+    if (shmem_req > gl->max_shmem)
+        goto fallback;
+
+    compute_size_minimum(p, bw, bh);
+    pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih);
+    return;
+
+fallback:
+    // Fall back to regular polar shader when compute shaders are unsupported
+    // or the kernel is too big for shmem
+    pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
+}
+
 // Sample from img_tex, with the src rectangle given by it.
 // The dst rectangle is implicit by what the caller will do next, but w and h
 // must still be what is going to be used (to dimension FBOs correctly).
@@ -1753,21 +1797,7 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
     } else if (strcmp(name, "oversample") == 0) {
         pass_sample_oversample(p->sc, scaler, w, h);
     } else if (scaler->kernel && scaler->kernel->polar) {
-        GLenum reqs = MPGL_CAP_COMPUTE_SHADER | MPGL_CAP_NESTED_ARRAY;
-        if ((p->gl->mpgl_caps & reqs) && scaler->kernel->f.radius <= 16) {
-            // For performance we want to load at least as many pixels
-            // horizontally as there are threads in a warp (32 for nvidia), as
-            // well as enough to take advantage of shmem parallelism
-            const int warp_size = 32, threads = 256;
-            compute_size_minimum(p, warp_size, threads / warp_size);
-            pass_compute_polar(p->sc, scaler, tex.components,
-                               p->compute_w, p->compute_h,
-                               (float)w / tex.w, (float)h / tex.h);
-        } else {
-            // Fall back to regular polar shader when compute shaders are
-            // unsupported or the kernel is too big for shmem
-            pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
-        }
+        pass_dispatch_sample_polar(p, scaler, tex, w, h);
     } else if (scaler->kernel) {
         pass_sample_separated(p, tex, scaler, w, h);
     } else {
diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c
index 854c829f1d..c0ca40b48e 100644
--- a/video/out/opengl/video_shaders.c
+++ b/video/out/opengl/video_shaders.c
@@ -217,18 +217,13 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
     GLSLF("}\n");
 }
 
+// bw/bh: block size
+// iw/ih: input size (pre-calculated to fit all required texels)
 void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
-                        int components, int bw, int bh, float ratiox,
-                        float ratioy)
+                        int components, int bw, int bh, int iw, int ih)
 {
     int bound = ceil(scaler->kernel->radius_cutoff);
     int offset = bound - 1; // padding top/left
-    int padding = offset + bound; // total padding
-
-    // We need to sample everything from base_min to base_max, so make sure
-    // we have enough space to fit all relevant texels in shmem
-    int iw = (int)ceil(bw / ratiox) + padding + 1,
-        ih = (int)ceil(bh / ratioy) + padding + 1;
 
     GLSL(color = vec4(0.0);)
     GLSLF("{\n");
diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h
index 597027ca6b..af59d9b678 100644
--- a/video/out/opengl/video_shaders.h
+++ b/video/out/opengl/video_shaders.h
@@ -33,8 +33,7 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
 void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
                        int components, int glsl_version);
 void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
-                        int components, int bw, int bh, float ratiox,
-                        float ratioy);
+                        int components, int bw, int bh, int iw, int ih);
 void pass_sample_bicubic_fast(struct gl_shader_cache *sc);
 void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
                             int w, int h);
author	Niklas Haas <git@haasn.xyz>	2017-07-26 01:42:19 +0200
committer	Niklas Haas <git@haasn.xyz>	2017-07-26 01:54:33 +0200
commit	b31020b193db24e175bce077755c2f3e814e57ff (patch)
tree	8be29707f6c60ea310c290f18f2f2fc38833983a
parent	9875f14ad4cb977fb3b6460704b29d4949fcb81b (diff)