diff options
Diffstat (limited to 'video/out/opengl/nnedi3.c')
-rw-r--r-- | video/out/opengl/nnedi3.c | 217 |
1 files changed, 217 insertions, 0 deletions
diff --git a/video/out/opengl/nnedi3.c b/video/out/opengl/nnedi3.c new file mode 100644 index 0000000000..fb704d2ab6 --- /dev/null +++ b/video/out/opengl/nnedi3.c @@ -0,0 +1,217 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with mpv. If not, see <http://www.gnu.org/licenses/>. + * + * You can alternatively redistribute this file and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + */ + +#include "nnedi3.h" + +#include <assert.h> +#include <stdint.h> +#include <float.h> + +#include <libavutil/bswap.h> + +#include "video.h" + +#define GLSL(x) gl_sc_add(sc, #x "\n"); +#define GLSLF(...) gl_sc_addf(sc, __VA_ARGS__) +#define GLSLH(x) gl_sc_hadd(sc, #x "\n"); +#define GLSLHF(...) gl_sc_haddf(sc, __VA_ARGS__) + +const struct nnedi3_opts nnedi3_opts_def = { + .neurons = 1, + .window = 0, + .upload = NNEDI3_UPLOAD_UBO, +}; + +#define OPT_BASE_STRUCT struct nnedi3_opts +const struct m_sub_options nnedi3_conf = { + .opts = (const m_option_t[]) { + OPT_CHOICE("neurons", neurons, 0, + ({"16", 0}, + {"32", 1}, + {"64", 2}, + {"128", 3})), + OPT_CHOICE("window", window, 0, + ({"8x4", 0}, + {"8x6", 1})), + OPT_CHOICE("upload", upload, 0, + ({"ubo", NNEDI3_UPLOAD_UBO}, + {"shader", NNEDI3_UPLOAD_SHADER})), + {0} + }, + .size = sizeof(struct nnedi3_opts), + .defaults = &nnedi3_opts_def, +}; + +const static char nnedi3_weights[40320 * 4 + 1] = +#include "video/out/opengl/nnedi3_weights.inc" +; + +const int nnedi3_weight_offsets[9] = + {0, 1088, 3264, 7616, 16320, 17920, 21120, 27520, 40320}; + +const int nnedi3_neurons[4] = {16, 32, 64, 128}; +const int nnedi3_window_width[2] = {8, 8}; +const int nnedi3_window_height[2] = {4, 6}; + +const float* get_nnedi3_weights(const struct nnedi3_opts *conf, int *size) +{ + int idx = conf->window * 4 + conf->neurons; + const int offset = nnedi3_weight_offsets[idx]; + *size = (nnedi3_weight_offsets[idx + 1] - offset) * 4; + return (const float*)(nnedi3_weights + offset * 4); +} + +void pass_nnedi3(struct gl_shader_cache *sc, int planes, int tex_num, + int step, const struct nnedi3_opts *conf, + struct gl_transform *transform) +{ + assert(0 <= step && step < 2); + + if (!conf) + conf = &nnedi3_opts_def; + + const int neurons = nnedi3_neurons[conf->neurons]; + const int width = nnedi3_window_width[conf->window]; + const int height = nnedi3_window_height[conf->window]; + + const int offset = nnedi3_weight_offsets[conf->window * 4 + conf->neurons]; + const uint32_t *weights = (const int*)(nnedi3_weights + offset * 4); + + GLSLF("// nnedi3 (tex %d, step %d, neurons %d, window %dx%d, mode %d)\n", + tex_num, step + 1, neurons, width, height, conf->upload); + + // This is required since each row will be encoded into vec4s + assert(width % 4 == 0); + const int sample_count = width * height / 4; + + if (conf->upload == NNEDI3_UPLOAD_UBO) { + char buf[32]; + snprintf(buf, sizeof(buf), "vec4 weights[%d];", + neurons * (sample_count * 2 + 1)); + gl_sc_uniform_buffer(sc, "NNEDI3_WEIGHTS", buf, 0); + } else if (conf->upload == NNEDI3_UPLOAD_SHADER) { + // Somehow necessary for hard coding approach. + GLSLH(#pragma optionNV(fastprecision on)) + } + + GLSLHF("float nnedi3(sampler2D tex, vec2 pos, vec2 tex_size, int plane) {\n"); + + if (step == 0) { + *transform = (struct gl_transform){{{1.0,0.0}, {0.0,2.0}}, {0.0,-0.5}}; + + GLSLH(if (fract(pos.y * tex_size.y) < 0.5) + return texture(tex, pos + vec2(0, 0.25) / tex_size)[plane];) + GLSLHF("#define GET(i, j) " + "(texture(tex, pos+vec2((i)-(%f),(j)-(%f)+0.25)/tex_size)[plane])\n", + width / 2.0 - 1, (height - 1) / 2.0); + } else { + *transform = (struct gl_transform){{{2.0,0.0}, {0.0,1.0}}, {-0.5,0.0}}; + + GLSLH(if (fract(pos.x * tex_size.x) < 0.5) + return texture(tex, pos + vec2(0.25, 0) / tex_size)[plane];) + GLSLHF("#define GET(i, j) " + "(texture(tex, pos+vec2((j)-(%f)+0.25,(i)-(%f))/tex_size)[plane])\n", + (height - 1) / 2.0, width / 2.0 - 1); + } + + GLSLHF("vec4 samples[%d];\n", sample_count); + + for (int y = 0; y < height; y++) + for (int x = 0; x < width; x += 4) { + GLSLHF("samples[%d] = vec4(GET(%d, %d), GET(%d, %d)," + "GET(%d, %d), GET(%d, %d));\n", + (y * width + x) / 4, x, y, x+1, y, x+2, y, x+3, y); + } + + GLSLHF("float sum = 0, sumsq = 0;" + "for (int i = 0; i < %d; i++) {" + "sum += dot(samples[i], vec4(1.0));" + "sumsq += dot(samples[i], samples[i]);" + "}\n", sample_count); + + GLSLHF("float mstd0 = sum / %d.0;\n" + "float mstd1 = sumsq / %d.0 - mstd0 * mstd0;\n" + "float mstd2 = mix(0, inversesqrt(mstd1), mstd1 >= %.12e);\n" + "mstd1 *= mstd2;\n", + width * height, width * height, FLT_EPSILON); + + GLSLHF("float vsum = 0, wsum = 0, sum1, sum2;\n"); + + if (conf->upload == NNEDI3_UPLOAD_SHADER) { + GLSLH(#define T(x) intBitsToFloat(x)) + GLSLH(#define W(i,w0,w1,w2,w3) dot(samples[i],vec4(T(w0),T(w1),T(w2),T(w3)))) + + GLSLHF("#define WS(w0,w1) " + "sum1 = exp(sum1 * mstd2 + T(w0));" + "sum2 = sum2 * mstd2 + T(w1);" + "wsum += sum1;" + "vsum += sum1*(sum2/(1+abs(sum2)));\n"); + + for (int n = 0; n < neurons; n++) { + const uint32_t *weights_ptr = weights + (sample_count * 2 + 1) * 4 * n; + for (int s = 0; s < 2; s++) { + GLSLHF("sum%d", s + 1); + for (int i = 0; i < sample_count; i++) { + GLSLHF("%cW(%d,%d,%d,%d,%d)", i == 0 ? '=' : '+', i, + (int)av_le2ne32(weights_ptr[0]), + (int)av_le2ne32(weights_ptr[1]), + (int)av_le2ne32(weights_ptr[2]), + (int)av_le2ne32(weights_ptr[3])); + weights_ptr += 4; + } + GLSLHF(";"); + } + GLSLHF("WS(%d,%d);\n", (int)av_le2ne32(weights_ptr[0]), + (int)av_le2ne32(weights_ptr[1])); + } + } else if (conf->upload == NNEDI3_UPLOAD_UBO) { + GLSLH(int idx = 0;) + + GLSLHF("for (int n = 0; n < %d; n++) {\n", neurons); + + for (int s = 0; s < 2; s++) { + GLSLHF("sum%d = 0;\n" + "for (int i = 0; i < %d; i++) {" + "sum%d += dot(samples[i], weights[idx++]);" + "}\n", + s + 1, sample_count, s + 1); + } + + GLSLH(sum1 = exp(sum1 * mstd2 + weights[idx][0]); + sum2 = sum2 * mstd2 + weights[idx++][1]; + wsum += sum1; + vsum += sum1*(sum2/(1+abs(sum2)));) + + GLSLHF("}\n"); + } + + GLSLH(return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0, 1);) + + GLSLHF("}\n"); // nnedi3 + + GLSL(vec4 color = vec4(1.0);) + + for (int i = 0; i < planes; i++) { + GLSLF("color[%d] = nnedi3(texture%d, texcoord%d, texture_size%d, %d);\n", + i, tex_num, tex_num, tex_num, i); + } +} |