23 files changed, 801 insertions, 522 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 5c7f4ae1..16210830 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -2,7 +2,6 @@ set(SRCS
             renderer_opengl/generated/gl_3_2_core.c
             renderer_opengl/gl_rasterizer.cpp
             renderer_opengl/gl_rasterizer_cache.cpp
-            renderer_opengl/gl_resource_manager.cpp
             renderer_opengl/gl_shader_util.cpp
             renderer_opengl/gl_state.cpp
             renderer_opengl/renderer_opengl.cpp
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 943f3eb3..558b49d6 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -94,7 +94,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
     // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
     // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
     //       epsilon possible within float24 accuracy.
-    static const float24 EPSILON = float24::FromFloat32(0.00001);
+    static const float24 EPSILON = float24::FromFloat32(0.00001f);
     static const float24 f0 = float24::FromFloat32(0.0);
     static const float24 f1 = float24::FromFloat32(1.0);
     static const std::array<ClippingEdge, 7> clipping_edges = {{
@@ -153,7 +153,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                   "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
                   "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
                   "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
-                  i, output_list->size(),
+                  i + 1, output_list->size() - 2,
                   vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
                   vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
                   vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index b46fadd9..ef9584ab 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -6,18 +6,20 @@
 
 #include "common/profiler.h"
 
+#include "core/hle/service/gsp_gpu.h"
+#include "core/hw/gpu.h"
+#include "core/settings.h"
+
+#include "debug_utils/debug_utils.h"
+
 #include "clipper.h"
 #include "command_processor.h"
 #include "math.h"
 #include "pica.h"
 #include "primitive_assembly.h"
+#include "renderer_base.h"
 #include "vertex_shader.h"
 #include "video_core.h"
-#include "core/hle/service/gsp_gpu.h"
-#include "core/hw/gpu.h"
-#include "core/settings.h"
-
-#include "debug_utils/debug_utils.h"
 
 namespace Pica {
 
@@ -43,12 +45,12 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
     if (GPU::g_skip_frame && id != PICA_REG_INDEX(trigger_irq))
         return;
 
-    // TODO: Figure out how register masking acts on e.g. vs_uniform_setup.set_value
+    // TODO: Figure out how register masking acts on e.g. vs.uniform_setup.set_value
     u32 old_value = regs[id];
     regs[id] = (old_value & ~mask) | (value & mask);
 
     if (g_debug_context)
-        g_debug_context->OnEvent(DebugContext::Event::CommandLoaded, reinterpret_cast<void*>(&id));
+        g_debug_context->OnEvent(DebugContext::Event::PicaCommandLoaded, reinterpret_cast<void*>(&id));
 
     DebugUtils::OnPicaRegWrite(id, regs[id]);
 
@@ -58,10 +60,50 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::P3D);
             break;
 
+        // Load default vertex input attributes
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
+        {
+            // TODO: Does actual hardware indeed keep an intermediate buffer or does
+            //       it directly write the values?
+            default_attr_write_buffer[default_attr_counter++] = value;
+
+            // Default attributes are written in a packed format such that four float24 values are encoded in
+            // three 32-bit numbers. We write to internal memory once a full such vector is
+            // written.
+            if (default_attr_counter >= 3) {
+                default_attr_counter = 0;
+
+                auto& setup = regs.vs_default_attributes_setup;
+
+                if (setup.index >= 16) {
+                    LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
+                    break;
+                }
+
+                Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
+
+                // NOTE: The destination component order indeed is "backwards"
+                attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
+                attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
+                attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
+                attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
+
+                LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
+                          attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
+                          attribute.w.ToFloat32());
+
+                // TODO: Verify that this actually modifies the register!
+                setup.index = setup.index + 1;
+            }
+            break;
+        }
+
         case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[0], 0x23c):
         case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[1], 0x23d):
         {
-            unsigned index = id - PICA_REG_INDEX(command_buffer.trigger[0]);
+            unsigned index = static_cast<unsigned>(id - PICA_REG_INDEX(command_buffer.trigger[0]));
             u32* head_ptr = (u32*)Memory::GetPhysicalPointer(regs.command_buffer.GetPhysicalAddress(index));
             g_state.cmd_list.head_ptr = g_state.cmd_list.current_ptr = head_ptr;
             g_state.cmd_list.length = regs.command_buffer.GetSize(index) / sizeof(u32);
@@ -74,7 +116,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         {
             Common::Profiling::ScopeTimer scope_timer(category_drawing);
 
+#if PICA_LOG_TEV
             DebugUtils::DumpTevStageConfig(regs.GetTevStages());
+#endif
 
             if (g_debug_context)
                 g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
@@ -117,9 +161,50 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             const u16* index_address_16 = (u16*)index_address_8;
             bool index_u16 = index_info.format != 0;
 
+#if PICA_DUMP_GEOMETRY
             DebugUtils::GeometryDumper geometry_dumper;
-            PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value());
             PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value());
+#endif
+            PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value());
+
+            if (g_debug_context) {
+                for (int i = 0; i < 3; ++i) {
+                    const auto texture = regs.GetTextures()[i];
+                    if (!texture.enabled)
+                        continue;
+
+                    u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
+                    if (g_debug_context && Pica::g_debug_context->recorder)
+                        g_debug_context->recorder->MemoryAccessed(texture_data, Pica::Regs::NibblesPerPixel(texture.format) * texture.config.width / 2 * texture.config.height, texture.config.GetPhysicalAddress());
+                }
+            }
+
+            class {
+                /// Combine overlapping and close ranges
+                void SimplifyRanges() {
+                    for (auto it = ranges.begin(); it != ranges.end(); ++it) {
+                        // NOTE: We add 32 to the range end address to make sure "close" ranges are combined, too
+                        auto it2 = std::next(it);
+                        while (it2 != ranges.end() && it->first + it->second + 32 >= it2->first) {
+                            it->second = std::max(it->second, it2->first + it2->second - it->first);
+                            it2 = ranges.erase(it2);
+                        }
+                    }
+                }
+
+            public:
+                /// Record a particular memory access in the list
+                void AddAccess(u32 paddr, u32 size) {
+                    // Create new range or extend existing one
+                    ranges[paddr] = std::max(ranges[paddr], size);
+
+                    // Simplify ranges...
+                    SimplifyRanges();
+                }
+
+                /// Map of accessed ranges (mapping start address to range size)
+                std::map<u32, u32> ranges;
+            } memory_accesses;
 
             for (unsigned int index = 0; index < regs.num_vertices; ++index)
             {
@@ -127,6 +212,10 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                 if (is_indexed) {
                     // TODO: Implement some sort of vertex cache!
+                    if (g_debug_context && Pica::g_debug_context->recorder) {
+                        int size = index_u16 ? 2 : 1;
+                        memory_accesses.AddAccess(base_address + index_info.offset + size * index, size);
+                    }
                 }
 
                 // Initialize data for the current vertex
@@ -149,7 +238,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                     // Load per-vertex data from the loader arrays
                     for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
-                        const u8* srcdata = Memory::GetPhysicalPointer(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]);
+                        u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
+                        const u8* srcdata = Memory::GetPhysicalPointer(source_addr);
+
+                        if (g_debug_context && Pica::g_debug_context->recorder) {
+                            memory_accesses.AddAccess(source_addr,
+                                    (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4
+                                    : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1);
+                        }
 
                         const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata :
                             (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata :
@@ -179,6 +275,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 if (g_debug_context)
                     g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
 
+#if PICA_DUMP_GEOMETRY
                 // NOTE: When dumping geometry, we simply assume that the first input attribute
                 //       corresponds to the position for now.
                 DebugUtils::GeometryDumper::Vertex dumped_vertex = {
@@ -188,9 +285,10 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 dumping_primitive_assembler.SubmitVertex(dumped_vertex,
                                                          std::bind(&DebugUtils::GeometryDumper::AddTriangle,
                                                                    &geometry_dumper, _1, _2, _3));
+#endif
 
                 // Send to vertex shader
-                VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes());
+                VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs);
 
                 if (is_indexed) {
                     // TODO: Add processed vertex to vertex cache!
@@ -211,47 +309,55 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 }
             }
 
+            for (auto& range : memory_accesses.ranges) {
+                g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
+                                                          range.second, range.first);
+            }
+
             if (Settings::values.use_hw_renderer) {
                 VideoCore::g_renderer->hw_rasterizer->DrawTriangles();
             }
 
+#if PICA_DUMP_GEOMETRY
             geometry_dumper.Dump();
+#endif
 
-            if (g_debug_context)
+            if (g_debug_context) {
                 g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+            }
 
             break;
         }
 
-        case PICA_REG_INDEX(vs_bool_uniforms):
+        case PICA_REG_INDEX(vs.bool_uniforms):
             for (unsigned i = 0; i < 16; ++i)
-                g_state.vs.uniforms.b[i] = (regs.vs_bool_uniforms.Value() & (1 << i)) != 0;
+                g_state.vs.uniforms.b[i] = (regs.vs.bool_uniforms.Value() & (1 << i)) != 0;
 
             break;
 
-        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[0], 0x2b1):
-        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[1], 0x2b2):
-        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[2], 0x2b3):
-        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[3], 0x2b4):
+        case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1):
+        case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[1], 0x2b2):
+        case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3):
+        case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4):
         {
-            int index = (id - PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[0], 0x2b1));
-            auto values = regs.vs_int_uniforms[index];
+            int index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1));
+            auto values = regs.vs.int_uniforms[index];
             g_state.vs.uniforms.i[index] = Math::Vec4<u8>(values.x, values.y, values.z, values.w);
             LOG_TRACE(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x",
                       index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value());
             break;
         }
 
-        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[0], 0x2c1):
-        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[1], 0x2c2):
-        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[2], 0x2c3):
-        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[3], 0x2c4):
-        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[4], 0x2c5):
-        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[5], 0x2c6):
-        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[6], 0x2c7):
-        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[7], 0x2c8):
+        case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[0], 0x2c1):
+        case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[1], 0x2c2):
+        case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[2], 0x2c3):
+        case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[3], 0x2c4):
+        case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[4], 0x2c5):
+        case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[5], 0x2c6):
+        case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7):
+        case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8):
         {
-            auto& uniform_setup = regs.vs_uniform_setup;
+            auto& uniform_setup = regs.vs.uniform_setup;
 
             // TODO: Does actual hardware indeed keep an intermediate buffer or does
             //       it directly write the values?
@@ -293,73 +399,33 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             break;
         }
 
-        // Load default vertex input attributes
-        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
-        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
-        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
-        {
-            // TODO: Does actual hardware indeed keep an intermediate buffer or does
-            //       it directly write the values?
-            default_attr_write_buffer[default_attr_counter++] = value;
-
-            // Default attributes are written in a packed format such that four float24 values are encoded in
-            // three 32-bit numbers. We write to internal memory once a full such vector is
-            // written.
-            if (default_attr_counter >= 3) {
-                default_attr_counter = 0;
-
-                auto& setup = regs.vs_default_attributes_setup;
-
-                if (setup.index >= 16) {
-                    LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
-                    break;
-                }
-
-                Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
-
-                // NOTE: The destination component order indeed is "backwards"
-                attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
-                attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
-                attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
-                attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
-
-                LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
-                          attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
-                          attribute.w.ToFloat32());
-
-                // TODO: Verify that this actually modifies the register!
-                setup.index = setup.index + 1;
-            }
-            break;
-        }
-
         // Load shader program code
-        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
-        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
-        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[2], 0x2ce):
-        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[3], 0x2cf):
-        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[4], 0x2d0):
-        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[5], 0x2d1):
-        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2):
-        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3):
+        case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[0], 0x2cc):
+        case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[1], 0x2cd):
+        case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[2], 0x2ce):
+        case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[3], 0x2cf):
+        case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[4], 0x2d0):
+        case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[5], 0x2d1):
+        case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2):
+        case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3):
         {
-            g_state.vs.program_code[regs.vs_program.offset] = value;
-            regs.vs_program.offset++;
+            g_state.vs.program_code[regs.vs.program.offset] = value;
+            regs.vs.program.offset++;
             break;
         }
 
         // Load swizzle pattern data
-        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6):
-        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7):
-        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[2], 0x2d8):
-        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[3], 0x2d9):
-        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[4], 0x2da):
-        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[5], 0x2db):
-        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
-        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
+        case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[0], 0x2d6):
+        case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[1], 0x2d7):
+        case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[2], 0x2d8):
+        case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[3], 0x2d9):
+        case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[4], 0x2da):
+        case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[5], 0x2db):
+        case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc):
+        case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd):
         {
-            g_state.vs.swizzle_data[regs.vs_swizzle_patterns.offset] = value;
-            regs.vs_swizzle_patterns.offset++;
+            g_state.vs.swizzle_data[regs.vs.swizzle_patterns.offset] = value;
+            regs.vs.swizzle_patterns.offset++;
             break;
         }
 
@@ -370,7 +436,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
     VideoCore::g_renderer->hw_rasterizer->NotifyPicaRegisterChanged(id);
 
     if (g_debug_context)
-        g_debug_context->OnEvent(DebugContext::Event::CommandProcessed, reinterpret_cast<void*>(&id));
+        g_debug_context->OnEvent(DebugContext::Event::PicaCommandProcessed, reinterpret_cast<void*>(&id));
 }
 
 void ProcessCommandList(const u32* list, u32 size) {
diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h
index bb3d4150..022a71f5 100644
--- a/src/video_core/command_processor.h
+++ b/src/video_core/command_processor.h
@@ -4,11 +4,11 @@
 
 #pragma once
 
+#include <type_traits>
+
 #include "common/bit_field.h"
 #include "common/common_types.h"
 
-#include "pica.h"
-
 namespace Pica {
 
 namespace CommandProcessor {
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 7b8ab72b..e9a85841 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -23,6 +23,7 @@
 #include "common/vector_math.h"
 
 #include "video_core/pica.h"
+#include "video_core/renderer_base.h"
 #include "video_core/utils.h"
 #include "video_core/video_core.h"
 
@@ -84,15 +85,11 @@ void GeometryDumper::AddTriangle(Vertex& v0, Vertex& v1, Vertex& v2) {
     vertices.push_back(v1);
     vertices.push_back(v2);
 
-    int num_vertices = vertices.size();
+    int num_vertices = (int)vertices.size();
     faces.push_back({ num_vertices-3, num_vertices-2, num_vertices-1 });
 }
 
 void GeometryDumper::Dump() {
-    // NOTE: Permanently enabling this just trashes the hard disk for no reason.
-    //       Hence, this is currently disabled.
-    return;
-
     static int index = 0;
     std::string filename = std::string("geometry_dump") + std::to_string(++index) + ".obj";
 
@@ -115,10 +112,6 @@ void GeometryDumper::Dump() {
 void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size,
                 u32 main_offset, const Regs::VSOutputAttributes* output_attributes)
 {
-    // NOTE: Permanently enabling this just trashes hard disks for no reason.
-    //       Hence, this is currently disabled.
-    return;
-
     struct StuffToWrite {
         u8* pointer;
         u32 size;
@@ -240,8 +233,8 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
 
     dvle.main_offset_words = main_offset;
     dvle.output_register_table_offset = write_offset - dvlb.dvle_offset;
-    dvle.output_register_table_size = output_info_table.size();
-    QueueForWriting((u8*)output_info_table.data(), output_info_table.size() * sizeof(OutputRegisterInfo));
+    dvle.output_register_table_size = static_cast<uint32_t>(output_info_table.size());
+    QueueForWriting((u8*)output_info_table.data(), static_cast<u32>(output_info_table.size() * sizeof(OutputRegisterInfo)));
 
     // TODO: Create a label table for "main"
 
@@ -496,31 +489,31 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
                 // Lookup base value
                 Math::Vec3<int> ret;
                 if (differential_mode) {
-                    ret.r() = differential.r;
-                    ret.g() = differential.g;
-                    ret.b() = differential.b;
+                    ret.r() = static_cast<int>(differential.r);
+                    ret.g() = static_cast<int>(differential.g);
+                    ret.b() = static_cast<int>(differential.b);
                     if (x >= 2) {
-                        ret.r() += differential.dr;
-                        ret.g() += differential.dg;
-                        ret.b() += differential.db;
+                        ret.r() += static_cast<int>(differential.dr);
+                        ret.g() += static_cast<int>(differential.dg);
+                        ret.b() += static_cast<int>(differential.db);
                     }
                     ret.r() = Color::Convert5To8(ret.r());
                     ret.g() = Color::Convert5To8(ret.g());
                     ret.b() = Color::Convert5To8(ret.b());
                 } else {
                     if (x < 2) {
-                        ret.r() = Color::Convert4To8(separate.r1);
-                        ret.g() = Color::Convert4To8(separate.g1);
-                        ret.b() = Color::Convert4To8(separate.b1);
+                        ret.r() = Color::Convert4To8(static_cast<u8>(separate.r1));
+                        ret.g() = Color::Convert4To8(static_cast<u8>(separate.g1));
+                        ret.b() = Color::Convert4To8(static_cast<u8>(separate.b1));
                     } else {
-                        ret.r() = Color::Convert4To8(separate.r2);
-                        ret.g() = Color::Convert4To8(separate.g2);
-                        ret.b() = Color::Convert4To8(separate.b2);
+                        ret.r() = Color::Convert4To8(static_cast<u8>(separate.r2));
+                        ret.g() = Color::Convert4To8(static_cast<u8>(separate.g2));
+                        ret.b() = Color::Convert4To8(static_cast<u8>(separate.b2));
                     }
                 }
 
                 // Add modifier
-                unsigned table_index = (x < 2) ? table_index_1.Value() : table_index_2.Value();
+                unsigned table_index = static_cast<int>((x < 2) ? table_index_1.Value() : table_index_2.Value());
 
                 static const std::array<std::array<u8, 2>, 8> etc1_modifier_table = {{
                     {  2,  8 }, {  5, 17 }, {  9,  29 }, { 13,  42 },
@@ -564,10 +557,6 @@ TextureInfo TextureInfo::FromPicaRegister(const Regs::TextureConfig& config,
 }
 
 void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
-    // NOTE: Permanently enabling this just trashes hard disks for no reason.
-    //       Hence, this is currently disabled.
-    return;
-
 #ifndef HAVE_PNG
     return;
 #else
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index 7926d64e..81eea30a 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -14,6 +14,8 @@
 
 #include "common/vector_math.h"
 
+#include "core/tracer/recorder.h"
+
 #include "video_core/pica.h"
 
 namespace Pica {
@@ -23,11 +25,14 @@ public:
     enum class Event {
         FirstEvent = 0,
 
-        CommandLoaded = FirstEvent,
-        CommandProcessed,
+        PicaCommandLoaded = FirstEvent,
+        PicaCommandProcessed,
         IncomingPrimitiveBatch,
         FinishedPrimitiveBatch,
         VertexLoaded,
+        IncomingDisplayTransfer,
+        GSPCommandProcessed,
+        BufferSwapped,
 
         NumEvents
     };
@@ -129,6 +134,8 @@ public:
     Event active_breakpoint;
     bool at_breakpoint = false;
 
+    std::shared_ptr<CiTrace::Recorder> recorder = nullptr;
+
 private:
     /**
      * Private default constructor to make sure people always construct this through Construct()
@@ -150,6 +157,11 @@ extern std::shared_ptr<DebugContext> g_debug_context; // TODO: Get rid of this g
 
 namespace DebugUtils {
 
+#define PICA_DUMP_GEOMETRY 0
+#define PICA_DUMP_SHADERS 0
+#define PICA_DUMP_TEXTURES 0
+#define PICA_LOG_TEV 0
+
 // Simple utility class for dumping geometry data to an OBJ file
 class GeometryDumper {
 public:
diff --git a/src/video_core/hwrasterizer_base.h b/src/video_core/hwrasterizer_base.h
index dec193f8..c8746c60 100644
--- a/src/video_core/hwrasterizer_base.h
+++ b/src/video_core/hwrasterizer_base.h
@@ -4,8 +4,13 @@
 
 #pragma once
 
-#include "common/emu_window.h"
-#include "video_core/vertex_shader.h"
+#include "common/common_types.h"
+
+namespace Pica {
+namespace VertexShader {
+struct OutputVertex;
+}
+}
 
 class HWRasterizer {
 public:
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index 543d9c44..17cb6678 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -2,7 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <string.h>
+#include <cstring>
+#include <unordered_map>
 
 #include "pica.h"
 
@@ -10,6 +11,75 @@ namespace Pica {
 
 State g_state;
 
+std::string Regs::GetCommandName(int index) {
+    static std::unordered_map<u32, std::string> map;
+
+    if (map.empty()) {
+        #define ADD_FIELD(name) \
+                map.insert({static_cast<u32>(PICA_REG_INDEX(name)), #name}); \
+                /* TODO: change to Regs::name when VS2015 and other compilers support it  */ \
+                for (u32 i = PICA_REG_INDEX(name) + 1; i < PICA_REG_INDEX(name) + sizeof(Regs().name) / 4; ++i) \
+                    map.insert({i, #name + std::string("+") + std::to_string(i-PICA_REG_INDEX(name))}); \
+
+        ADD_FIELD(trigger_irq);
+        ADD_FIELD(cull_mode);
+        ADD_FIELD(viewport_size_x);
+        ADD_FIELD(viewport_size_y);
+        ADD_FIELD(viewport_depth_range);
+        ADD_FIELD(viewport_depth_far_plane);
+        ADD_FIELD(viewport_corner);
+        ADD_FIELD(texture0_enable);
+        ADD_FIELD(texture0);
+        ADD_FIELD(texture0_format);
+        ADD_FIELD(texture1);
+        ADD_FIELD(texture1_format);
+        ADD_FIELD(texture2);
+        ADD_FIELD(texture2_format);
+        ADD_FIELD(tev_stage0);
+        ADD_FIELD(tev_stage1);
+        ADD_FIELD(tev_stage2);
+        ADD_FIELD(tev_stage3);
+        ADD_FIELD(tev_combiner_buffer_input);
+        ADD_FIELD(tev_stage4);
+        ADD_FIELD(tev_stage5);
+        ADD_FIELD(tev_combiner_buffer_color);
+        ADD_FIELD(output_merger);
+        ADD_FIELD(framebuffer);
+        ADD_FIELD(vertex_attributes);
+        ADD_FIELD(index_array);
+        ADD_FIELD(num_vertices);
+        ADD_FIELD(trigger_draw);
+        ADD_FIELD(trigger_draw_indexed);
+        ADD_FIELD(vs_default_attributes_setup);
+        ADD_FIELD(command_buffer);
+        ADD_FIELD(triangle_topology);
+        ADD_FIELD(gs.bool_uniforms);
+        ADD_FIELD(gs.int_uniforms);
+        ADD_FIELD(gs.main_offset);
+        ADD_FIELD(gs.input_register_map);
+        ADD_FIELD(gs.uniform_setup);
+        ADD_FIELD(gs.program);
+        ADD_FIELD(gs.swizzle_patterns);
+        ADD_FIELD(vs.bool_uniforms);
+        ADD_FIELD(vs.int_uniforms);
+        ADD_FIELD(vs.main_offset);
+        ADD_FIELD(vs.input_register_map);
+        ADD_FIELD(vs.uniform_setup);
+        ADD_FIELD(vs.program);
+        ADD_FIELD(vs.swizzle_patterns);
+
+#undef ADD_FIELD
+    }
+
+    // Return empty string if no match is found
+    auto it = map.find(index);
+    if (it != map.end()) {
+        return it->second;
+    } else {
+        return std::string();
+    }
+}
+
 void Init() {
 }
 
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 9628a758..34b02b2f 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -5,10 +5,9 @@
 #pragma once
 
 #include <array>
+#include <cmath>
 #include <cstddef>
-#include <initializer_list>
-#include <map>
-#include <vector>
+#include <string>
 
 #include "common/assert.h"
 #include "common/bit_field.h"
@@ -114,11 +113,22 @@ struct Regs {
     struct TextureConfig {
         enum WrapMode : u32 {
             ClampToEdge    = 0,
+            ClampToBorder  = 1,
             Repeat         = 2,
             MirroredRepeat = 3,
         };
 
-        INSERT_PADDING_WORDS(0x1);
+        enum TextureFilter : u32 {
+            Nearest = 0,
+            Linear  = 1
+        };
+
+        union {
+            BitField< 0, 8, u32> r;
+            BitField< 8, 8, u32> g;
+            BitField<16, 8, u32> b;
+            BitField<24, 8, u32> a;
+        } border_color;
 
         union {
             BitField< 0, 16, u32> height;
@@ -126,8 +136,10 @@ struct Regs {
         };
 
         union {
-            BitField< 8, 2, WrapMode> wrap_s;
-            BitField<12, 2, WrapMode> wrap_t;
+            BitField< 1, 1, TextureFilter> mag_filter;
+            BitField< 2, 1, TextureFilter> min_filter;
+            BitField< 8, 2, WrapMode> wrap_t;
+            BitField<12, 2, WrapMode> wrap_s;
         };
 
         INSERT_PADDING_WORDS(0x1);
@@ -194,6 +206,7 @@ struct Regs {
         case TextureFormat::IA8:
             return 4;
 
+        case TextureFormat::I4:
         case TextureFormat::A4:
             return 1;
 
@@ -284,6 +297,7 @@ struct Regs {
             AddSigned       = 3,
             Lerp            = 4,
             Subtract        = 5,
+            Dot3_RGB        = 6,
 
             MultiplyThenAdd = 8,
             AddThenMultiply = 9,
@@ -414,6 +428,11 @@ struct Regs {
         GreaterThanOrEqual = 7,
     };
 
+    enum class StencilAction : u32 {
+        Keep = 0,
+        Xor  = 5,
+    };
+
     struct {
         union {
             // If false, logic blending is used
@@ -448,15 +467,35 @@ struct Regs {
             BitField< 8, 8, u32> ref;
         } alpha_test;
 
-        union {
-            BitField< 0, 1, u32> stencil_test_enable;
-            BitField< 4, 3, CompareFunc> stencil_test_func;
-            BitField< 8, 8, u32> stencil_replacement_value;
-            BitField<16, 8, u32> stencil_reference_value;
-            BitField<24, 8, u32> stencil_mask;
-        } stencil_test;
+        struct {
+            union {
+                // If true, enable stencil testing
+                BitField< 0, 1, u32> enable;
 
-        INSERT_PADDING_WORDS(0x1);
+                // Comparison operation for stencil testing
+                BitField< 4, 3, CompareFunc> func;
+
+                // Value to calculate the new stencil value from
+                BitField< 8, 8, u32> replacement_value;
+
+                // Value to compare against for stencil testing
+                BitField<16, 8, u32> reference_value;
+
+                // Mask to apply on stencil test inputs
+                BitField<24, 8, u32> mask;
+            };
+
+            union {
+                // Action to perform when the stencil test fails
+                BitField< 0, 3, StencilAction> action_stencil_fail;
+
+                // Action to perform when stencil testing passed but depth testing fails
+                BitField< 4, 3, StencilAction> action_depth_fail;
+
+                // Action to perform when both stencil and depth testing pass
+                BitField< 8, 3, StencilAction> action_depth_pass;
+            };
+        } stencil_test;
 
         union {
             BitField< 0, 1, u32> depth_test_enable;
@@ -506,7 +545,7 @@ struct Regs {
     struct {
         INSERT_PADDING_WORDS(0x6);
 
-        DepthFormat depth_format;
+        DepthFormat depth_format; // TODO: Should be a BitField!
         BitField<16, 3, ColorFormat> color_format;
 
         INSERT_PADDING_WORDS(0x4);
@@ -752,171 +791,123 @@ struct Regs {
     INSERT_PADDING_WORDS(0x20);
 
     enum class TriangleTopology : u32 {
-        List        = 0,
-        Strip       = 1,
-        Fan         = 2,
-        ListIndexed = 3, // TODO: No idea if this is correct
+        List   = 0,
+        Strip  = 1,
+        Fan    = 2,
+        Shader = 3, // Programmable setup unit implemented in a geometry shader
     };
 
     BitField<8, 2, TriangleTopology> triangle_topology;
 
-    INSERT_PADDING_WORDS(0x51);
+    INSERT_PADDING_WORDS(0x21);
 
-    BitField<0, 16, u32> vs_bool_uniforms;
-    union {
-        BitField< 0, 8, u32> x;
-        BitField< 8, 8, u32> y;
-        BitField<16, 8, u32> z;
-        BitField<24, 8, u32> w;
-    } vs_int_uniforms[4];
+    struct ShaderConfig {
+        BitField<0, 16, u32> bool_uniforms;
 
-    INSERT_PADDING_WORDS(0x5);
+        union {
+            BitField< 0, 8, u32> x;
+            BitField< 8, 8, u32> y;
+            BitField<16, 8, u32> z;
+            BitField<24, 8, u32> w;
+        } int_uniforms[4];
 
-    // Offset to shader program entry point (in words)
-    BitField<0, 16, u32> vs_main_offset;
+        INSERT_PADDING_WORDS(0x5);
 
-    union {
-        BitField< 0, 4, u64> attribute0_register;
-        BitField< 4, 4, u64> attribute1_register;
-        BitField< 8, 4, u64> attribute2_register;
-        BitField<12, 4, u64> attribute3_register;
-        BitField<16, 4, u64> attribute4_register;
-        BitField<20, 4, u64> attribute5_register;
-        BitField<24, 4, u64> attribute6_register;
-        BitField<28, 4, u64> attribute7_register;
-        BitField<32, 4, u64> attribute8_register;
-        BitField<36, 4, u64> attribute9_register;
-        BitField<40, 4, u64> attribute10_register;
-        BitField<44, 4, u64> attribute11_register;
-        BitField<48, 4, u64> attribute12_register;
-        BitField<52, 4, u64> attribute13_register;
-        BitField<56, 4, u64> attribute14_register;
-        BitField<60, 4, u64> attribute15_register;
-
-        int GetRegisterForAttribute(int attribute_index) const {
-            u64 fields[] = {
-                attribute0_register,  attribute1_register,  attribute2_register,  attribute3_register,
-                attribute4_register,  attribute5_register,  attribute6_register,  attribute7_register,
-                attribute8_register,  attribute9_register,  attribute10_register, attribute11_register,
-                attribute12_register, attribute13_register, attribute14_register, attribute15_register,
+        // Offset to shader program entry point (in words)
+        BitField<0, 16, u32> main_offset;
+
+        union {
+            BitField< 0, 4, u64> attribute0_register;
+            BitField< 4, 4, u64> attribute1_register;
+            BitField< 8, 4, u64> attribute2_register;
+            BitField<12, 4, u64> attribute3_register;
+            BitField<16, 4, u64> attribute4_register;
+            BitField<20, 4, u64> attribute5_register;
+            BitField<24, 4, u64> attribute6_register;
+            BitField<28, 4, u64> attribute7_register;
+            BitField<32, 4, u64> attribute8_register;
+            BitField<36, 4, u64> attribute9_register;
+            BitField<40, 4, u64> attribute10_register;
+            BitField<44, 4, u64> attribute11_register;
+            BitField<48, 4, u64> attribute12_register;
+            BitField<52, 4, u64> attribute13_register;
+            BitField<56, 4, u64> attribute14_register;
+            BitField<60, 4, u64> attribute15_register;
+
+            int GetRegisterForAttribute(int attribute_index) const {
+                u64 fields[] = {
+                    attribute0_register,  attribute1_register,  attribute2_register,  attribute3_register,
+                    attribute4_register,  attribute5_register,  attribute6_register,  attribute7_register,
+                    attribute8_register,  attribute9_register,  attribute10_register, attribute11_register,
+                    attribute12_register, attribute13_register, attribute14_register, attribute15_register,
+                };
+                return (int)fields[attribute_index];
+            }
+        } input_register_map;
+
+        // OUTMAP_MASK, 0x28E, CODETRANSFER_END
+        INSERT_PADDING_WORDS(0x3);
+
+        struct {
+            enum Format : u32
+            {
+                FLOAT24 = 0,
+                FLOAT32 = 1
             };
-            return (int)fields[attribute_index];
-        }
-    } vs_input_register_map;
 
-    INSERT_PADDING_WORDS(0x3);
+            bool IsFloat32() const {
+                return format == FLOAT32;
+            }
 
-    struct {
-        enum Format : u32
-        {
-            FLOAT24 = 0,
-            FLOAT32 = 1
-        };
+            union {
+                // Index of the next uniform to write to
+                // TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices
+                // TODO: Maybe the uppermost index is for the geometry shader? Investigate!
+                BitField<0, 7, u32> index;
 
-        bool IsFloat32() const {
-            return format == FLOAT32;
-        }
+                BitField<31, 1, Format> format;
+            };
 
-        union {
-            // Index of the next uniform to write to
-            // TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices
-            BitField<0, 7, u32> index;
+            // Writing to these registers sets the current uniform.
+            u32 set_value[8];
 
-            BitField<31, 1, Format> format;
-        };
+        } uniform_setup;
 
-        // Writing to these registers sets the "current" uniform.
-        // TODO: It's not clear how the hardware stores what the "current" uniform is.
-        u32 set_value[8];
+        INSERT_PADDING_WORDS(0x2);
 
-    } vs_uniform_setup;
+        struct {
+            // Offset of the next instruction to write code to.
+            // Incremented with each instruction write.
+            u32 offset;
 
-    INSERT_PADDING_WORDS(0x2);
+            // Writing to these registers sets the "current" word in the shader program.
+            u32 set_word[8];
+        } program;
 
-    struct {
-        // Offset of the next instruction to write code to.
-        // Incremented with each instruction write.
-        u32 offset;
+        INSERT_PADDING_WORDS(0x1);
 
-        // Writing to these registers sets the "current" word in the shader program.
-        // TODO: It's not clear how the hardware stores what the "current" word is.
-        u32 set_word[8];
-    } vs_program;
+        // This register group is used to load an internal table of swizzling patterns,
+        // which are indexed by each shader instruction to specify vector component swizzling.
+        struct {
+            // Offset of the next swizzle pattern to write code to.
+            // Incremented with each instruction write.
+            u32 offset;
 
-    INSERT_PADDING_WORDS(0x1);
+            // Writing to these registers sets the current swizzle pattern in the table.
+            u32 set_word[8];
+        } swizzle_patterns;
 
-    // This register group is used to load an internal table of swizzling patterns,
-    // which are indexed by each shader instruction to specify vector component swizzling.
-    struct {
-        // Offset of the next swizzle pattern to write code to.
-        // Incremented with each instruction write.
-        u32 offset;
+        INSERT_PADDING_WORDS(0x2);
+    };
 
-        // Writing to these registers sets the "current" swizzle pattern in the table.
-        // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
-        u32 set_word[8];
-    } vs_swizzle_patterns;
+    ShaderConfig gs;
+    ShaderConfig vs;
 
-    INSERT_PADDING_WORDS(0x22);
+    INSERT_PADDING_WORDS(0x20);
 
     // Map register indices to names readable by humans
     // Used for debugging purposes, so performance is not an issue here
-    static std::string GetCommandName(int index) {
-        std::map<u32, std::string> map;
-
-        #define ADD_FIELD(name)                                                                               \
-            do {                                                                                              \
-                map.insert({PICA_REG_INDEX(name), #name});                                                    \
-                /* TODO: change to Regs::name when VS2015 and other compilers support it  */                   \
-                for (u32 i = PICA_REG_INDEX(name) + 1; i < PICA_REG_INDEX(name) + sizeof(Regs().name) / 4; ++i) \
-                    map.insert({i, #name + std::string("+") + std::to_string(i-PICA_REG_INDEX(name))});       \
-            } while(false)
-
-        ADD_FIELD(trigger_irq);
-        ADD_FIELD(cull_mode);
-        ADD_FIELD(viewport_size_x);
-        ADD_FIELD(viewport_size_y);
-        ADD_FIELD(viewport_depth_range);
-        ADD_FIELD(viewport_depth_far_plane);
-        ADD_FIELD(viewport_corner);
-        ADD_FIELD(texture0_enable);
-        ADD_FIELD(texture0);
-        ADD_FIELD(texture0_format);
-        ADD_FIELD(texture1);
-        ADD_FIELD(texture1_format);
-        ADD_FIELD(texture2);
-        ADD_FIELD(texture2_format);
-        ADD_FIELD(tev_stage0);
-        ADD_FIELD(tev_stage1);
-        ADD_FIELD(tev_stage2);
-        ADD_FIELD(tev_stage3);
-        ADD_FIELD(tev_combiner_buffer_input);
-        ADD_FIELD(tev_stage4);
-        ADD_FIELD(tev_stage5);
-        ADD_FIELD(tev_combiner_buffer_color);
-        ADD_FIELD(output_merger);
-        ADD_FIELD(framebuffer);
-        ADD_FIELD(vertex_attributes);
-        ADD_FIELD(index_array);
-        ADD_FIELD(num_vertices);
-        ADD_FIELD(trigger_draw);
-        ADD_FIELD(trigger_draw_indexed);
-        ADD_FIELD(vs_default_attributes_setup);
-        ADD_FIELD(command_buffer);
-        ADD_FIELD(triangle_topology);
-        ADD_FIELD(vs_bool_uniforms);
-        ADD_FIELD(vs_int_uniforms);
-        ADD_FIELD(vs_main_offset);
-        ADD_FIELD(vs_input_register_map);
-        ADD_FIELD(vs_uniform_setup);
-        ADD_FIELD(vs_program);
-        ADD_FIELD(vs_swizzle_patterns);
-
-        #undef ADD_FIELD
-
-        // Return empty string if no match is found
-        return map[index];
-    }
+    static std::string GetCommandName(int index);
 
     static inline size_t NumIds() {
         return sizeof(Regs) / sizeof(u32);
@@ -982,17 +973,14 @@ ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
 ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232);
 ASSERT_REG_POSITION(command_buffer, 0x238);
 ASSERT_REG_POSITION(triangle_topology, 0x25e);
-ASSERT_REG_POSITION(vs_bool_uniforms, 0x2b0);
-ASSERT_REG_POSITION(vs_int_uniforms, 0x2b1);
-ASSERT_REG_POSITION(vs_main_offset, 0x2ba);
-ASSERT_REG_POSITION(vs_input_register_map, 0x2bb);
-ASSERT_REG_POSITION(vs_uniform_setup, 0x2c0);
-ASSERT_REG_POSITION(vs_program, 0x2cb);
-ASSERT_REG_POSITION(vs_swizzle_patterns, 0x2d5);
+ASSERT_REG_POSITION(gs, 0x280);
+ASSERT_REG_POSITION(vs, 0x2b0);
 
 #undef ASSERT_REG_POSITION
 #endif // !defined(_MSC_VER)
 
+static_assert(sizeof(Regs::ShaderConfig) == 0x30 * sizeof(u32), "ShaderConfig structure has incorrect size");
+
 // The total number of registers is chosen arbitrarily, but let's make sure it's not some odd value anyway.
 static_assert(sizeof(Regs) <= 0x300 * sizeof(u32), "Register set structure larger than it should be");
 static_assert(sizeof(Regs) >= 0x300 * sizeof(u32), "Register set structure smaller than it should be");
@@ -1014,7 +1002,7 @@ struct float24 {
             u32 mantissa = hex & 0xFFFF;
             u32 exponent = (hex >> 16) & 0x7F;
             u32 sign = hex >> 23;
-            ret.value = powf(2.0f, (float)exponent-63.0f) * (1.0f + mantissa * powf(2.0f, -16.f));
+            ret.value = std::pow(2.0f, (float)exponent-63.0f) * (1.0f + mantissa * std::pow(2.0f, -16.f));
             if (sign)
                 ret.value = -ret.value;
         }
@@ -1102,7 +1090,7 @@ struct State {
     Regs regs;
 
     /// Vertex shader memory
-    struct {
+    struct ShaderSetup {
         struct {
             Math::Vec4<float24> f[96];
             std::array<bool, 16> b;
@@ -1113,7 +1101,10 @@ struct State {
 
         std::array<u32, 1024> program_code;
         std::array<u32, 1024> swizzle_data;
-    } vs;
+    };
+
+    ShaderSetup vs;
+    ShaderSetup gs;
 
     /// Current Pica command list
     struct {
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index 0120f289..2f22bdcc 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -20,8 +20,9 @@ template<typename VertexType>
 void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler)
 {
     switch (topology) {
+        // TODO: Figure out what's different with TriangleTopology::Shader.
         case Regs::TriangleTopology::List:
-        case Regs::TriangleTopology::ListIndexed:
+        case Regs::TriangleTopology::Shader:
             if (buffer_index < 2) {
                 buffer[buffer_index++] = vtx;
             } else {
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 59d156ee..68b7cc05 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -126,6 +126,30 @@ static u32 GetDepth(int x, int y) {
     }
 }
 
+static u8 GetStencil(int x, int y) {
+    const auto& framebuffer = g_state.regs.framebuffer;
+    const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
+    u8* depth_buffer = Memory::GetPhysicalPointer(addr);
+
+    y = framebuffer.height - y;
+
+    const u32 coarse_y = y & ~7;
+    u32 bytes_per_pixel = Pica::Regs::BytesPerDepthPixel(framebuffer.depth_format);
+    u32 stride = framebuffer.width * bytes_per_pixel;
+
+    u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
+    u8* src_pixel = depth_buffer + src_offset;
+
+    switch (framebuffer.depth_format) {
+        case Regs::DepthFormat::D24S8:
+            return Color::DecodeD24S8(src_pixel).y;
+
+        default:
+            LOG_WARNING(HW_GPU, "GetStencil called for function which doesn't have a stencil component (format %u)", framebuffer.depth_format);
+            return 0;
+    }
+}
+
 static void SetDepth(int x, int y, u32 value) {
     const auto& framebuffer = g_state.regs.framebuffer;
     const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
@@ -144,13 +168,15 @@ static void SetDepth(int x, int y, u32 value) {
         case Regs::DepthFormat::D16:
             Color::EncodeD16(value, dst_pixel);
             break;
+
         case Regs::DepthFormat::D24:
             Color::EncodeD24(value, dst_pixel);
             break;
+
         case Regs::DepthFormat::D24S8:
-            // TODO(Subv): Implement the stencil buffer
-            Color::EncodeD24S8(value, 0, dst_pixel);
+            Color::EncodeD24X8(value, dst_pixel);
             break;
+
         default:
             LOG_CRITICAL(HW_GPU, "Unimplemented depth format %u", framebuffer.depth_format);
             UNIMPLEMENTED();
@@ -158,6 +184,53 @@ static void SetDepth(int x, int y, u32 value) {
     }
 }
 
+static void SetStencil(int x, int y, u8 value) {
+    const auto& framebuffer = g_state.regs.framebuffer;
+    const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
+    u8* depth_buffer = Memory::GetPhysicalPointer(addr);
+
+    y = framebuffer.height - y;
+
+    const u32 coarse_y = y & ~7;
+    u32 bytes_per_pixel = Pica::Regs::BytesPerDepthPixel(framebuffer.depth_format);
+    u32 stride = framebuffer.width * bytes_per_pixel;
+
+    u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
+    u8* dst_pixel = depth_buffer + dst_offset;
+
+    switch (framebuffer.depth_format) {
+        case Pica::Regs::DepthFormat::D16:
+        case Pica::Regs::DepthFormat::D24:
+            // Nothing to do
+            break;
+
+        case Pica::Regs::DepthFormat::D24S8:
+            Color::EncodeX24S8(value, dst_pixel);
+            break;
+
+        default:
+            LOG_CRITICAL(HW_GPU, "Unimplemented depth format %u", framebuffer.depth_format);
+            UNIMPLEMENTED();
+            break;
+    }
+}
+
+// TODO: Should the stencil mask be applied to the "dest" or "ref" operands? Most likely not!
+static u8 PerformStencilAction(Regs::StencilAction action, u8 dest, u8 ref) {
+    switch (action) {
+    case Regs::StencilAction::Keep:
+        return dest;
+
+    case Regs::StencilAction::Xor:
+        return dest ^ ref;
+
+    default:
+        LOG_CRITICAL(HW_GPU, "Unknown stencil action %x", (int)action);
+        UNIMPLEMENTED();
+        return 0;
+    }
+}
+
 // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
 struct Fix12P4 {
     Fix12P4() {}
@@ -276,6 +349,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
     auto textures = regs.GetTextures();
     auto tev_stages = regs.GetTevStages();
 
+    bool stencil_action_enable = g_state.regs.output_merger.stencil_test.enable && g_state.regs.framebuffer.depth_format == Regs::DepthFormat::D24S8;
+    const auto stencil_test = g_state.regs.output_merger.stencil_test;
+
     // Enter rasterization loop, starting at the center of the topleft bounding box corner.
     // TODO: Not sure if looping through x first might be faster
     for (u16 y = min_y + 8; y < max_y; y += 0x10) {
@@ -349,6 +425,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                             val = std::min(val, (int)size - 1);
                             return val;
 
+                        case Regs::TextureConfig::ClampToBorder:
+                            return val;
+
                         case Regs::TextureConfig::Repeat:
                             return (int)((unsigned)val % size);
 
@@ -367,17 +446,26 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                // Textures are laid out from bottom to top, hence we invert the t coordinate.
-                // NOTE: This may not be the right place for the inversion.
-                // TODO: Check if this applies to ETC textures, too.
-                s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
-                t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
-
-                u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
-                auto info = DebugUtils::TextureInfo::FromPicaRegister(texture.config, texture.format);
-
-                texture_color[i] = DebugUtils::LookupTexture(texture_data, s, t, info);
-                DebugUtils::DumpTexture(texture.config, texture_data);
+                if ((texture.config.wrap_s == Regs::TextureConfig::ClampToBorder && (s < 0 || s >= texture.config.width))
+                    || (texture.config.wrap_t == Regs::TextureConfig::ClampToBorder && (t < 0 || t >= texture.config.height))) {
+                    auto border_color = texture.config.border_color;
+                    texture_color[i] = { border_color.r, border_color.g, border_color.b, border_color.a };
+                } else {
+                    // Textures are laid out from bottom to top, hence we invert the t coordinate.
+                    // NOTE: This may not be the right place for the inversion.
+                    // TODO: Check if this applies to ETC textures, too.
+                    s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
+                    t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
+
+                    u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
+                    auto info = DebugUtils::TextureInfo::FromPicaRegister(texture.config, texture.format);
+
+                    // TODO: Apply the min and mag filters to the texture
+                    texture_color[i] = DebugUtils::LookupTexture(texture_data, s, t, info);
+#if PICA_DUMP_TEXTURES
+                    DebugUtils::DumpTexture(texture.config, texture_data);
+#endif
+                }
             }
 
             // Texture environment - consists of 6 stages of color and alpha combining.
@@ -556,7 +644,18 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                         result = (result * input[2].Cast<int>()) / 255;
                         return result.Cast<u8>();
                     }
-
+                    case Operation::Dot3_RGB:
+                    {
+                        // Not fully accurate.
+                        // Worst case scenario seems to yield a +/-3 error
+                        // Some HW results indicate that the per-component computation can't have a higher precision than 1/256,
+                        // while dot3_rgb( (0x80,g0,b0),(0x7F,g1,b1) ) and dot3_rgb( (0x80,g0,b0),(0x80,g1,b1) ) give different results
+                        int result = ((input[0].r() * 2 - 255) * (input[1].r() * 2 - 255) + 128) / 256 +
+                                     ((input[0].g() * 2 - 255) * (input[1].g() * 2 - 255) + 128) / 256 +
+                                     ((input[0].b() * 2 - 255) * (input[1].b() * 2 - 255) + 128) / 256;
+                        result = std::max(0, std::min(255, result));
+                        return { (u8)result, (u8)result, (u8)result };
+                    }
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
                         UNIMPLEMENTED();
@@ -638,6 +737,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
             }
 
             const auto& output_merger = regs.output_merger;
+            // TODO: Does alpha testing happen before or after stencil?
             if (output_merger.alpha_test.enable) {
                 bool pass = false;
 
@@ -679,6 +779,54 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     continue;
             }
 
+            u8 old_stencil = 0;
+            if (stencil_action_enable) {
+                old_stencil = GetStencil(x >> 4, y >> 4);
+                u8 dest = old_stencil & stencil_test.mask;
+                u8 ref = stencil_test.reference_value & stencil_test.mask;
+
+                bool pass = false;
+                switch (stencil_test.func) {
+                case Regs::CompareFunc::Never:
+                    pass = false;
+                    break;
+
+                case Regs::CompareFunc::Always:
+                    pass = true;
+                    break;
+
+                case Regs::CompareFunc::Equal:
+                    pass = (ref == dest);
+                    break;
+
+                case Regs::CompareFunc::NotEqual:
+                    pass = (ref != dest);
+                    break;
+
+                case Regs::CompareFunc::LessThan:
+                    pass = (ref < dest);
+                    break;
+
+                case Regs::CompareFunc::LessThanOrEqual:
+                    pass = (ref <= dest);
+                    break;
+
+                case Regs::CompareFunc::GreaterThan:
+                    pass = (ref > dest);
+                    break;
+
+                case Regs::CompareFunc::GreaterThanOrEqual:
+                    pass = (ref >= dest);
+                    break;
+                }
+
+                if (!pass) {
+                    u8 new_stencil = PerformStencilAction(stencil_test.action_stencil_fail, old_stencil, stencil_test.replacement_value);
+                    SetStencil(x >> 4, y >> 4, new_stencil);
+                    continue;
+                }
+            }
+
             // TODO: Does depth indeed only get written even if depth testing is enabled?
             if (output_merger.depth_test_enable) {
                 unsigned num_bits = Regs::DepthBitsPerPixel(regs.framebuffer.depth_format);
@@ -723,11 +871,22 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     break;
                 }
 
-                if (!pass)
+                if (!pass) {
+                    if (stencil_action_enable) {
+                        u8 new_stencil = PerformStencilAction(stencil_test.action_depth_fail, old_stencil, stencil_test.replacement_value);
+                        SetStencil(x >> 4, y >> 4, new_stencil);
+                    }
                     continue;
+                }
 
                 if (output_merger.depth_write_enable)
                     SetDepth(x >> 4, y >> 4, z);
+
+                if (stencil_action_enable) {
+                    // TODO: What happens if stencil testing is enabled, but depth testing is not? Will stencil get updated anyway?
+                    u8 new_stencil = PerformStencilAction(stencil_test.action_depth_pass, old_stencil, stencil_test.replacement_value);
+                    SetStencil(x >> 4, y >> 4, new_stencil);
+                }
             }
 
             auto dest = GetPixel(x >> 4, y >> 4);
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index 5757ac75..6587bcf2 100644
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -4,10 +4,14 @@
 
 #pragma once
 
+#include <memory>
+
 #include "common/common_types.h"
 
 #include "video_core/hwrasterizer_base.h"
 
+class EmuWindow;
+
 class RendererBase : NonCopyable {
 public:
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 518f7933..2db845da 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -2,10 +2,15 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <cstring>
+#include <memory>
+
 #include "common/color.h"
+#include "common/math_util.h"
 
-#include "core/settings.h"
 #include "core/hw/gpu.h"
+#include "core/memory.h"
+#include "core/settings.h"
 
 #include "video_core/pica.h"
 #include "video_core/utils.h"
@@ -16,8 +21,6 @@
 
 #include "generated/gl_3_2_core.h"
 
-#include <memory>
-
 static bool IsPassThroughTevStage(const Pica::Regs::TevStageConfig& stage) {
     return (stage.color_op == Pica::Regs::TevStageConfig::Operation::Replace &&
             stage.alpha_op == Pica::Regs::TevStageConfig::Operation::Replace &&
@@ -813,12 +816,16 @@ void RasterizerOpenGL::ReloadColorBuffer() {
 }
 
 void RasterizerOpenGL::ReloadDepthBuffer() {
+    PAddr depth_buffer_addr = Pica::g_state.regs.framebuffer.GetDepthBufferPhysicalAddress();
+
+    if (depth_buffer_addr == 0)
+        return;
+
     // TODO: Appears to work, but double-check endianness of depth values and order of depth-stencil
-    u8* depth_buffer = Memory::GetPhysicalPointer(Pica::g_state.regs.framebuffer.GetDepthBufferPhysicalAddress());
+    u8* depth_buffer = Memory::GetPhysicalPointer(depth_buffer_addr);
 
-    if (depth_buffer == nullptr) {
+    if (depth_buffer == nullptr)
         return;
-    }
 
     u32 bytes_per_pixel = Pica::Regs::BytesPerDepthPixel(fb_depth_texture.format);
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index d7d422b1..ae7b26fc 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -4,7 +4,12 @@
 
 #pragma once
 
+#include <vector>
+
+#include "common/common_types.h"
+
 #include "video_core/hwrasterizer_base.h"
+#include "video_core/vertex_shader.h"
 
 #include "gl_state.h"
 #include "gl_rasterizer_cache.h"
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 2e4110a8..dc3ffdf2 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -31,12 +31,18 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text
         state.texture_units[texture_unit].texture_2d = new_texture->texture.handle;
         state.Apply();
 
-        // TODO: Need to choose filters that correspond to PICA once register is declared
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, PicaToGL::TextureFilterMode(config.config.mag_filter));
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, PicaToGL::TextureFilterMode(config.config.min_filter));
 
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, PicaToGL::WrapMode(config.config.wrap_s));
-        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, PicaToGL::WrapMode(config.config.wrap_t));
+        GLenum wrap_s = PicaToGL::WrapMode(config.config.wrap_s);
+        GLenum wrap_t = PicaToGL::WrapMode(config.config.wrap_t);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, wrap_s);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, wrap_t);
+
+        if (wrap_s == GL_CLAMP_TO_BORDER || wrap_t == GL_CLAMP_TO_BORDER) {
+            auto border_color = PicaToGL::ColorRGBA8((u8*)&config.config.border_color.r);
+            glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR, border_color.data());
+        }
 
         const auto info = Pica::DebugUtils::TextureInfo::FromPicaRegister(config.config, config.format);
 
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
deleted file mode 100644
index 8f4ae28a..00000000
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-// Copyright 2015 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include "video_core/renderer_opengl/gl_resource_manager.h"
-#include "video_core/renderer_opengl/gl_shader_util.h"
-
-// Textures
-OGLTexture::OGLTexture() : handle(0) {
-}
-
-OGLTexture::~OGLTexture() {
-    Release();
-}
-
-void OGLTexture::Create() {
-    if (handle != 0) {
-        return;
-    }
-
-    glGenTextures(1, &handle);
-}
-
-void OGLTexture::Release() {
-    glDeleteTextures(1, &handle);
-    handle = 0;
-}
-
-// Shaders
-OGLShader::OGLShader() : handle(0) {
-}
-
-OGLShader::~OGLShader() {
-    Release();
-}
-
-void OGLShader::Create(const char* vert_shader, const char* frag_shader) {
-    if (handle != 0) {
-        return;
-    }
-
-    handle = ShaderUtil::LoadShaders(vert_shader, frag_shader);
-}
-
-void OGLShader::Release() {
-    glDeleteProgram(handle);
-    handle = 0;
-}
-
-// Buffer objects
-OGLBuffer::OGLBuffer() : handle(0) {
-}
-
-OGLBuffer::~OGLBuffer() {
-    Release();
-}
-
-void OGLBuffer::Create() {
-    if (handle != 0) {
-        return;
-    }
-
-    glGenBuffers(1, &handle);
-}
-
-void OGLBuffer::Release() {
-    glDeleteBuffers(1, &handle);
-    handle = 0;
-}
-
-// Vertex array objects
-OGLVertexArray::OGLVertexArray() : handle(0) {
-}
-
-OGLVertexArray::~OGLVertexArray() {
-    Release();
-}
-
-void OGLVertexArray::Create() {
-    if (handle != 0) {
-        return;
-    }
-
-    glGenVertexArrays(1, &handle);
-}
-
-void OGLVertexArray::Release() {
-    glDeleteVertexArrays(1, &handle);
-    handle = 0;
-}
-
-// Framebuffers
-OGLFramebuffer::OGLFramebuffer() : handle(0) {
-}
-
-OGLFramebuffer::~OGLFramebuffer() {
-    Release();
-}
-
-void OGLFramebuffer::Create() {
-    if (handle != 0) {
-        return;
-    }
-
-    glGenFramebuffers(1, &handle);
-}
-
-void OGLFramebuffer::Release() {
-    glDeleteFramebuffers(1, &handle);
-    handle = 0;
-}
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index 975720d0..6f9dc012 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -4,76 +4,124 @@
 
 #pragma once
 
+#include <utility>
+
 #include "common/common_types.h"
 
-#include "generated/gl_3_2_core.h"
+#include "video_core/renderer_opengl/generated/gl_3_2_core.h"
+#include "video_core/renderer_opengl/gl_shader_util.h"
 
-class OGLTexture : public NonCopyable {
+class OGLTexture : private NonCopyable {
 public:
-    OGLTexture();
-    ~OGLTexture();
+    OGLTexture() = default;
+    OGLTexture(OGLTexture&& o) { std::swap(handle, o.handle); }
+    ~OGLTexture() { Release(); }
+    OGLTexture& operator=(OGLTexture&& o) { std::swap(handle, o.handle); return *this; }
 
     /// Creates a new internal OpenGL resource and stores the handle
-    void Create();
+    void Create() {
+        if (handle != 0) return;
+        glGenTextures(1, &handle);
+    }
 
     /// Deletes the internal OpenGL resource
-    void Release();
+    void Release() {
+        if (handle == 0) return;
+        glDeleteTextures(1, &handle);
+        handle = 0;
+    }
 
-    GLuint handle;
+    GLuint handle = 0;
 };
 
-class OGLShader : public NonCopyable {
+class OGLShader : private NonCopyable {
 public:
-    OGLShader();
-    ~OGLShader();
+    OGLShader() = default;
+    OGLShader(OGLShader&& o) { std::swap(handle, o.handle); }
+    ~OGLShader() { Release(); }
+    OGLShader& operator=(OGLShader&& o) { std::swap(handle, o.handle); return *this; }
 
     /// Creates a new internal OpenGL resource and stores the handle
-    void Create(const char* vert_shader, const char* frag_shader);
+    void Create(const char* vert_shader, const char* frag_shader) {
+        if (handle != 0) return;
+        handle = ShaderUtil::LoadShaders(vert_shader, frag_shader);
+    }
 
     /// Deletes the internal OpenGL resource
-    void Release();
+    void Release() {
+        if (handle == 0) return;
+        glDeleteProgram(handle);
+        handle = 0;
+    }
 
-    GLuint handle;
+    GLuint handle = 0;
 };
 
-class OGLBuffer : public NonCopyable {
+class OGLBuffer : private NonCopyable {
 public:
-    OGLBuffer();
-    ~OGLBuffer();
+    OGLBuffer() = default;
+    OGLBuffer(OGLBuffer&& o) { std::swap(handle, o.handle); }
+    ~OGLBuffer() { Release(); }
+    OGLBuffer& operator=(OGLBuffer&& o) { std::swap(handle, o.handle); return *this; }
 
     /// Creates a new internal OpenGL resource and stores the handle
-    void Create();
+    void Create() {
+        if (handle != 0) return;
+        glGenBuffers(1, &handle);
+    }
 
     /// Deletes the internal OpenGL resource
-    void Release();
+    void Release() {
+        if (handle == 0) return;
+        glDeleteBuffers(1, &handle);
+        handle = 0;
+    }
 
-    GLuint handle;
+    GLuint handle = 0;
 };
 
-class OGLVertexArray : public NonCopyable {
+class OGLVertexArray : private NonCopyable {
 public:
-    OGLVertexArray();
-    ~OGLVertexArray();
+    OGLVertexArray() = default;
+    OGLVertexArray(OGLVertexArray&& o) { std::swap(handle, o.handle); }
+    ~OGLVertexArray() { Release(); }
+    OGLVertexArray& operator=(OGLVertexArray&& o) { std::swap(handle, o.handle); return *this; }
 
     /// Creates a new internal OpenGL resource and stores the handle
-    void Create();
+    void Create() {
+        if (handle != 0) return;
+        glGenVertexArrays(1, &handle);
+    }
 
     /// Deletes the internal OpenGL resource
-    void Release();
+    void Release() {
+        if (handle == 0) return;
+        glDeleteVertexArrays(1, &handle);
+        handle = 0;
+    }
 
-    GLuint handle;
+    GLuint handle = 0;
 };
 
-class OGLFramebuffer : public NonCopyable {
+class OGLFramebuffer : private NonCopyable {
 public:
-    OGLFramebuffer();
-    ~OGLFramebuffer();
+    OGLFramebuffer() = default;
+    OGLFramebuffer(OGLFramebuffer&& o) { std::swap(handle, o.handle); }
+    ~OGLFramebuffer() { Release(); }
+    OGLFramebuffer& operator=(OGLFramebuffer&& o) { std::swap(handle, o.handle); return *this; }
 
     /// Creates a new internal OpenGL resource and stores the handle
-    void Create();
+    void Create() {
+        if (handle != 0) return;
+        glGenFramebuffers(1, &handle);
+    }
 
     /// Deletes the internal OpenGL resource
-    void Release();
+    void Release() {
+        if (handle == 0) return;
+        glDeleteFramebuffers(1, &handle);
+        handle = 0;
+    }
 
-    GLuint handle;
+    GLuint handle = 0;
 };
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 3526e16d..9efc1533 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -147,20 +147,17 @@ void OpenGLState::Apply() {
 
     // Textures
     for (unsigned texture_index = 0; texture_index < ARRAY_SIZE(texture_units); ++texture_index) {
-        if (texture_units[texture_index].enabled_2d != cur_state.texture_units[texture_index].enabled_2d) {
+        if (texture_units[texture_index].enabled_2d != cur_state.texture_units[texture_index].enabled_2d ||
+            texture_units[texture_index].texture_2d != cur_state.texture_units[texture_index].texture_2d) {
+
             glActiveTexture(GL_TEXTURE0 + texture_index);
 
             if (texture_units[texture_index].enabled_2d) {
-                glEnable(GL_TEXTURE_2D);
+                glBindTexture(GL_TEXTURE_2D, texture_units[texture_index].texture_2d);
             } else {
-                glDisable(GL_TEXTURE_2D);
+                glBindTexture(GL_TEXTURE_2D, 0);
             }
         }
-
-        if (texture_units[texture_index].texture_2d != cur_state.texture_units[texture_index].texture_2d) {
-            glActiveTexture(GL_TEXTURE0 + texture_index);
-            glBindTexture(GL_TEXTURE_2D, texture_units[texture_index].texture_2d);
-        }
     }
 
     // Framebuffer
diff --git a/src/video_core/renderer_opengl/pica_to_gl.h b/src/video_core/renderer_opengl/pica_to_gl.h
index e566f9f7..3b562da8 100644
--- a/src/video_core/renderer_opengl/pica_to_gl.h
+++ b/src/video_core/renderer_opengl/pica_to_gl.h
@@ -12,10 +12,37 @@
 
 namespace PicaToGL {
 
+inline GLenum TextureFilterMode(Pica::Regs::TextureConfig::TextureFilter mode) {
+    static const GLenum filter_mode_table[] = {
+        GL_NEAREST,  // TextureFilter::Nearest
+        GL_LINEAR    // TextureFilter::Linear
+    };
+
+    // Range check table for input
+    if (mode >= ARRAY_SIZE(filter_mode_table)) {
+        LOG_CRITICAL(Render_OpenGL, "Unknown texture filtering mode %d", mode);
+        UNREACHABLE();
+
+        return GL_LINEAR;
+    }
+
+    GLenum gl_mode = filter_mode_table[mode];
+
+    // Check for dummy values indicating an unknown mode
+    if (gl_mode == 0) {
+        LOG_CRITICAL(Render_OpenGL, "Unknown texture filtering mode %d", mode);
+        UNIMPLEMENTED();
+
+        return GL_LINEAR;
+    }
+
+    return gl_mode;
+}
+
 inline GLenum WrapMode(Pica::Regs::TextureConfig::WrapMode mode) {
     static const GLenum wrap_mode_table[] = {
         GL_CLAMP_TO_EDGE,  // WrapMode::ClampToEdge
-        0,                 // Unknown
+        GL_CLAMP_TO_BORDER,// WrapMode::ClampToBorder
         GL_REPEAT,         // WrapMode::Repeat
         GL_MIRRORED_REPEAT // WrapMode::MirroredRepeat
     };
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 3399ca12..96e12839 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -2,22 +2,27 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
+#include <cstddef>
+#include <cstdlib>
+
+#include "common/assert.h"
+#include "common/emu_window.h"
+#include "common/logging/log.h"
+#include "common/profiler_reporting.h"
+
 #include "core/hw/gpu.h"
 #include "core/hw/hw.h"
 #include "core/hw/lcd.h"
 #include "core/memory.h"
 #include "core/settings.h"
 
-#include "common/emu_window.h"
-#include "common/logging/log.h"
-#include "common/profiler_reporting.h"
-
 #include "video_core/video_core.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
 #include "video_core/renderer_opengl/gl_shader_util.h"
 #include "video_core/renderer_opengl/gl_shaders.h"
 
-#include <algorithm>
+#include "video_core/debug_utils/debug_utils.h"
 
 /**
  * Vertex structure that the drawn screen rectangles are composed of.
@@ -126,6 +131,10 @@ void RendererOpenGL::SwapBuffers() {
             hw_rasterizer->Reset();
         }
     }
+
+    if (Pica::g_debug_context && Pica::g_debug_context->recorder) {
+        Pica::g_debug_context->recorder->FrameFinished();
+    }
 }
 
 /**
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 87006a83..960ae577 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -2,8 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <stack>
-
+#include <boost/container/static_vector.hpp>
 #include <boost/range/algorithm.hpp>
 
 #include <common/file_util.h>
@@ -27,7 +26,7 @@ namespace Pica {
 namespace VertexShader {
 
 struct VertexShaderState {
-    const u32* program_counter;
+    u32 program_counter;
 
     const float24* input_register_table[16];
     Math::Vec4<float24> output_registers[16];
@@ -53,7 +52,7 @@ struct VertexShaderState {
     };
 
     // TODO: Is there a maximal size for this?
-    std::stack<CallStackElement> call_stack;
+    boost::container::static_vector<CallStackElement, 16> call_stack;
 
     struct {
         u32 max_offset; // maximum program counter ever reached
@@ -71,15 +70,15 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
     while (true) {
         if (!state.call_stack.empty()) {
-            auto& top = state.call_stack.top();
-            if (state.program_counter - program_code.data() == top.final_address) {
+            auto& top = state.call_stack.back();
+            if (state.program_counter == top.final_address) {
                 state.address_registers[2] += top.loop_increment;
 
                 if (top.repeat_counter-- == 0) {
-                    state.program_counter = &program_code[top.return_address];
-                    state.call_stack.pop();
+                    state.program_counter = top.return_address;
+                    state.call_stack.pop_back();
                 } else {
-                    state.program_counter = &program_code[top.loop_address];
+                    state.program_counter = top.loop_address;
                 }
 
                 // TODO: Is "trying again" accurate to hardware?
@@ -88,17 +87,16 @@ static void ProcessShaderCode(VertexShaderState& state) {
         }
 
         bool exit_loop = false;
-        const Instruction& instr = *(const Instruction*)state.program_counter;
-        const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
+        const Instruction instr = { program_code[state.program_counter] };
+        const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
 
-        static auto call = [&program_code](VertexShaderState& state, u32 offset, u32 num_instructions,
+        static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions,
                               u32 return_offset, u8 repeat_count, u8 loop_increment) {
-            state.program_counter = &program_code[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
-            state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
+            state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
+            ASSERT(state.call_stack.size() < state.call_stack.capacity());
+            state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
         };
-        u32 binary_offset = state.program_counter - program_code.data();
-
-        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + binary_offset);
+        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter);
 
         auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
             switch (source_reg.GetRegisterType()) {
@@ -221,7 +219,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 for (int i = 0; i < num_components; ++i)
                     dot = dot + src1[i] * src2[i];
 
-                for (int i = 0; i < num_components; ++i) {
+                for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
 
@@ -442,13 +440,13 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
             case OpCode::Id::JMPC:
                 if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
-                    state.program_counter = &program_code[instr.flow_control.dest_offset] - 1;
+                    state.program_counter = instr.flow_control.dest_offset - 1;
                 }
                 break;
 
             case OpCode::Id::JMPU:
                 if (uniforms.b[instr.flow_control.bool_uniform_id]) {
-                    state.program_counter = &program_code[instr.flow_control.dest_offset] - 1;
+                    state.program_counter = instr.flow_control.dest_offset - 1;
                 }
                 break;
 
@@ -456,7 +454,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 call(state,
                      instr.flow_control.dest_offset,
                      instr.flow_control.num_instructions,
-                     binary_offset + 1, 0, 0);
+                     state.program_counter + 1, 0, 0);
                 break;
 
             case OpCode::Id::CALLU:
@@ -464,7 +462,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        binary_offset + 1, 0, 0);
+                        state.program_counter + 1, 0, 0);
                 }
                 break;
 
@@ -473,7 +471,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        binary_offset + 1, 0, 0);
+                        state.program_counter + 1, 0, 0);
                 }
                 break;
 
@@ -483,8 +481,8 @@ static void ProcessShaderCode(VertexShaderState& state) {
             case OpCode::Id::IFU:
                 if (uniforms.b[instr.flow_control.bool_uniform_id]) {
                     call(state,
-                         binary_offset + 1,
-                         instr.flow_control.dest_offset - binary_offset - 1,
+                         state.program_counter + 1,
+                         instr.flow_control.dest_offset - state.program_counter - 1,
                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
@@ -501,8 +499,8 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
                 if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
                     call(state,
-                         binary_offset + 1,
-                         instr.flow_control.dest_offset - binary_offset - 1,
+                         state.program_counter + 1,
+                         instr.flow_control.dest_offset - state.program_counter - 1,
                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
@@ -519,8 +517,8 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 state.address_registers[2] = uniforms.i[instr.flow_control.int_uniform_id].y;
 
                 call(state,
-                     binary_offset + 1,
-                     instr.flow_control.dest_offset - binary_offset + 1,
+                     state.program_counter + 1,
+                     instr.flow_control.dest_offset - state.program_counter + 1,
                      instr.flow_control.dest_offset + 1,
                      uniforms.i[instr.flow_control.int_uniform_id].x,
                      uniforms.i[instr.flow_control.int_uniform_id].z);
@@ -546,20 +544,17 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
 static Common::Profiling::TimingCategory shader_category("Vertex Shader");
 
-OutputVertex RunShader(const InputVertex& input, int num_attributes) {
+OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) {
     Common::Profiling::ScopeTimer timer(shader_category);
 
-    const auto& regs = g_state.regs;
-    const auto& vs = g_state.vs;
     VertexShaderState state;
 
-    const u32* main = &vs.program_code[regs.vs_main_offset];
-    state.program_counter = (u32*)main;
+    state.program_counter = config.main_offset;
     state.debug.max_offset = 0;
     state.debug.max_opdesc_id = 0;
 
     // Setup input register table
-    const auto& attribute_register_map = regs.vs_input_register_map;
+    const auto& attribute_register_map = config.input_register_map;
     float24 dummy_register;
     boost::fill(state.input_register_table, &dummy_register);
 
@@ -584,16 +579,18 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
     state.conditional_code[1] = false;
 
     ProcessShaderCode(state);
-    DebugUtils::DumpShader(vs.program_code.data(), state.debug.max_offset, vs.swizzle_data.data(),
-                           state.debug.max_opdesc_id, regs.vs_main_offset,
-                           regs.vs_output_attributes);
+#if PICA_DUMP_SHADERS
+    DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(),
+                           state.debug.max_opdesc_id, config.main_offset,
+                           g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here
+#endif
 
     // Setup output data
     OutputVertex ret;
     // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
     // figure out what those circumstances are and enable the remaining outputs then.
     for (int i = 0; i < 7; ++i) {
-        const auto& output_register_map = regs.vs_output_attributes[i];
+        const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here
 
         u32 semantics[4] = {
             output_register_map.map_x, output_register_map.map_y,
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
index 7471a6de..97f9250d 100644
--- a/src/video_core/vertex_shader.h
+++ b/src/video_core/vertex_shader.h
@@ -4,11 +4,10 @@
 
 #pragma once
 
-#include <initializer_list>
+#include <type_traits>
 
-#include <common/common_types.h>
+#include "common/vector_math.h"
 
-#include "math.h"
 #include "pica.h"
 
 namespace Pica {
@@ -66,7 +65,7 @@ struct OutputVertex {
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
 
-OutputVertex RunShader(const InputVertex& input, int num_attributes);
+OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup);
 
 } // namespace
 
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index 3f24df7b..14b33c9d 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -4,12 +4,11 @@
 
 #pragma once
 
-#include "common/emu_window.h"
-
-#include "renderer_base.h"
-
 #include <atomic>
 
+class EmuWindow;
+class RendererBase;
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Video Core namespace