15 files changed, 1308 insertions, 225 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 8e7b93ac..71a1b5ec 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -5,6 +5,7 @@ set(SRCS    clipper.cpp
             utils.cpp
             vertex_shader.cpp
             video_core.cpp
+            debug_utils/debug_utils.cpp
             renderer_opengl/renderer_opengl.cpp)
 
 set(HEADERS clipper.h
@@ -17,6 +18,7 @@ set(HEADERS clipper.h
             renderer_base.h
             vertex_shader.h
             video_core.h
+            debug_utils/debug_utils.h
             renderer_opengl/renderer_opengl.h)
 
 add_library(video_core STATIC ${SRCS} ${HEADERS})
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 020a4da3..9567a984 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -2,12 +2,14 @@
 // Licensed under GPLv2
 // Refer to the license.txt file included.
 
+#include "clipper.h"
 #include "command_processor.h"
 #include "math.h"
 #include "pica.h"
 #include "primitive_assembly.h"
 #include "vertex_shader.h"
 
+#include "debug_utils/debug_utils.h"
 
 namespace Pica {
 
@@ -23,15 +25,24 @@ static u32 uniform_write_buffer[4];
 static u32 vs_binary_write_offset = 0;
 static u32 vs_swizzle_write_offset = 0;
 
-static inline void WritePicaReg(u32 id, u32 value) {
+static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
+
+    if (id >= registers.NumIds())
+        return;
+
+    // TODO: Figure out how register masking acts on e.g. vs_uniform_setup.set_value
     u32 old_value = registers[id];
-    registers[id] = value;
+    registers[id] = (old_value & ~mask) | (value & mask);
+
+    DebugUtils::OnPicaRegWrite(id, registers[id]);
 
     switch(id) {
         // It seems like these trigger vertex rendering
         case PICA_REG_INDEX(trigger_draw):
         case PICA_REG_INDEX(trigger_draw_indexed):
         {
+            DebugUtils::DumpTevStageConfig(registers.GetTevStages());
+
             const auto& attribute_config = registers.vertex_attributes;
             const u8* const base_address = Memory::GetPointer(attribute_config.GetBaseAddress());
 
@@ -68,6 +79,10 @@ static inline void WritePicaReg(u32 id, u32 value) {
             const u16* index_address_16 = (u16*)index_address_8;
             bool index_u16 = (bool)index_info.format;
 
+            DebugUtils::GeometryDumper geometry_dumper;
+            PrimitiveAssembler<VertexShader::OutputVertex> clipper_primitive_assembler(registers.triangle_topology.Value());
+            PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(registers.triangle_topology.Value());
+
             for (int index = 0; index < registers.num_vertices; ++index)
             {
                 int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
@@ -95,14 +110,28 @@ static inline void WritePicaReg(u32 id, u32 value) {
                                   input.attr[i][comp].ToFloat32());
                     }
                 }
+
+                // NOTE: When dumping geometry, we simply assume that the first input attribute
+                //       corresponds to the position for now.
+                DebugUtils::GeometryDumper::Vertex dumped_vertex = {
+                    input.attr[0][0].ToFloat32(), input.attr[0][1].ToFloat32(), input.attr[0][2].ToFloat32()
+                };
+                using namespace std::placeholders;
+                dumping_primitive_assembler.SubmitVertex(dumped_vertex,
+                                                         std::bind(&DebugUtils::GeometryDumper::AddTriangle,
+                                                                   &geometry_dumper, _1, _2, _3));
+
+                // Send to vertex shader
                 VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes());
 
                 if (is_indexed) {
                     // TODO: Add processed vertex to vertex cache!
                 }
 
-                PrimitiveAssembly::SubmitVertex(output);
+                // Send to triangle clipper
+                clipper_primitive_assembler.SubmitVertex(output, Clipper::ProcessTriangle);
             }
+            geometry_dumper.Dump();
             break;
         }
 
@@ -207,14 +236,17 @@ static std::ptrdiff_t ExecuteCommandBlock(const u32* first_command_word) {
 
     u32* read_pointer = (u32*)first_command_word;
 
-    // TODO: Take parameter mask into consideration!
+    const u32 write_mask = ((header.parameter_mask & 0x1) ? (0xFFu <<  0) : 0u) |
+                           ((header.parameter_mask & 0x2) ? (0xFFu <<  8) : 0u) |
+                           ((header.parameter_mask & 0x4) ? (0xFFu << 16) : 0u) |
+                           ((header.parameter_mask & 0x8) ? (0xFFu << 24) : 0u);
 
-    WritePicaReg(header.cmd_id, *read_pointer);
+    WritePicaReg(header.cmd_id, *read_pointer, write_mask);
     read_pointer += 2;
 
     for (int i = 1; i < 1+header.extra_data_length; ++i) {
         u32 cmd = header.cmd_id + ((header.group_commands) ? i : 0);
-        WritePicaReg(cmd, *read_pointer);
+        WritePicaReg(cmd, *read_pointer, write_mask);
         ++read_pointer;
     }
 
diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h
index 6b6241a2..955f9dae 100644
--- a/src/video_core/command_processor.h
+++ b/src/video_core/command_processor.h
@@ -17,11 +17,22 @@ union CommandHeader {
     u32 hex;
 
     BitField< 0, 16, u32> cmd_id;
+
+    // parameter_mask:
+    // Mask applied to the input value to make it possible to update
+    // parts of a register without overwriting its other fields.
+    // first bit:  0x000000FF
+    // second bit: 0x0000FF00
+    // third bit:  0x00FF0000
+    // fourth bit: 0xFF000000
     BitField<16,  4, u32> parameter_mask;
+
     BitField<20, 11, u32> extra_data_length;
+
     BitField<31,  1, u32> group_commands;
 };
-static_assert(std::is_standard_layout<CommandHeader>::value == true, "CommandHeader does not use standard layout");
+static_assert(std::is_standard_layout<CommandHeader>::value == true,
+              "CommandHeader does not use standard layout");
 static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");
 
 void ProcessCommandList(const u32* list, u32 size);
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
new file mode 100644
index 00000000..48e6dd18
--- /dev/null
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -0,0 +1,522 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <map>
+#include <fstream>
+#include <mutex>
+#include <string>
+
+#ifdef HAVE_PNG
+#include <png.h>
+#endif
+
+#include "common/file_util.h"
+
+#include "video_core/pica.h"
+
+#include "debug_utils.h"
+
+namespace Pica {
+
+namespace DebugUtils {
+
+void GeometryDumper::AddTriangle(Vertex& v0, Vertex& v1, Vertex& v2) {
+    // Index of the first vertex appended below (i.e. the vertex count before insertion).
+    const int first_index = vertices.size();
+    vertices.push_back(v0);
+    vertices.push_back(v1);
+    vertices.push_back(v2);
+    faces.push_back({ first_index, first_index + 1, first_index + 2 });
+}
+
+void GeometryDumper::Dump() {
+    // NOTE: Permanently enabling this just trashes the hard disk for no reason.
+    //       Hence, this is currently disabled.
+    return;
+    // Unreachable until the return above is removed: writes geometry_dump<N>.obj, N counting up from 1.
+    static int index = 0;
+    std::string filename = std::string("geometry_dump") + std::to_string(++index) + ".obj";
+
+    std::ofstream file(filename);
+    // One "v x y z" line per collected vertex
+    for (const auto& vertex : vertices) {
+        file << "v " << vertex.pos[0]
+             << " "  << vertex.pos[1]
+             << " "  << vertex.pos[2] << std::endl;
+    }
+    // One "f a b c" line per triangle; the stored 0-based indices are written 1-based (hence the 1+)
+    for (const Face& face : faces) {
+        file << "f " << 1+face.index[0]
+             << " "  << 1+face.index[1]
+             << " "  << 1+face.index[2] << std::endl;
+    }
+}
+
+#pragma pack(1)
+struct DVLBHeader { // First header queued for writing by DumpShader
+    enum : u32 {
+        MAGIC_WORD = 0x424C5644, // "DVLB"
+    };
+
+    u32 magic_word;
+    u32 num_programs; // DumpShader sets this to 1
+//    u32 dvle_offset_table[];
+};
+static_assert(sizeof(DVLBHeader) == 0x8, "Incorrect structure size");
+
+struct DVLPHeader { // Locates the shader binary and the swizzle patterns
+    enum : u32 {
+        MAGIC_WORD = 0x504C5644, // "DVLP"
+    };
+
+    u32 magic_word;
+    u32 version;
+    u32 binary_offset;  // relative to DVLP start
+    u32 binary_size_words;
+    u32 swizzle_patterns_offset; // DumpShader also computes this relative to the DVLP start
+    u32 swizzle_patterns_num_entries;
+    u32 unk2;
+};
+static_assert(sizeof(DVLPHeader) == 0x1C, "Incorrect structure size");
+
+struct DVLEHeader { // Per-program header; DumpShader fills in main offset and output register table only
+    enum : u32 {
+        MAGIC_WORD = 0x454c5644, // "DVLE"
+    };
+
+    enum class ShaderType : u8 {
+        VERTEX = 0,
+        GEOMETRY = 1,
+    };
+
+    u32 magic_word;
+    u16 pad1;
+    ShaderType type;
+    u8 pad2;
+    u32 main_offset_words; // offset within binary blob
+    u32 endmain_offset_words;
+    u32 pad3;
+    u32 pad4;
+    u32 constant_table_offset;
+    u32 constant_table_size; // number of entries
+    u32 label_table_offset;
+    u32 label_table_size;
+    u32 output_register_table_offset; // DumpShader computes this relative to the DVLE start
+    u32 output_register_table_size;
+    u32 uniform_table_offset;
+    u32 uniform_table_size;
+    u32 symbol_table_offset;
+    u32 symbol_table_size;
+
+};
+static_assert(sizeof(DVLEHeader) == 0x40, "Incorrect structure size");
+#pragma pack()
+
+void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size,
+                u32 main_offset, const Regs::VSOutputAttributes* output_attributes)
+{
+    // NOTE: Permanently enabling this just trashes hard disks for no reason.
+    //       Hence, this is currently disabled.
+    return;
+    // Everything below is unreachable until the return above is removed.
+    struct StuffToWrite {
+        u8* pointer;
+        u32 size;
+    };
+    std::vector<StuffToWrite> writing_queue;
+    u32 write_offset = 0; // byte offset at which the next queued chunk will start
+    // Appends a chunk to the queue and returns the offset at which that chunk starts.
+    auto QueueForWriting = [&writing_queue,&write_offset](u8* pointer, u32 size) {
+        writing_queue.push_back({pointer, size});
+        u32 old_write_offset = write_offset;
+        write_offset += size;
+        return old_write_offset;
+    };
+
+    // First off, try to translate Pica state (one enum for output attribute type and component)
+    // into shbin format (separate type and component mask).
+    union OutputRegisterInfo {
+        enum Type : u64 {
+            POSITION = 0,
+            COLOR = 2,
+            TEXCOORD0 = 3,
+            TEXCOORD1 = 5,
+            TEXCOORD2 = 6,
+        };
+
+        BitField< 0, 64, u64> hex;
+
+        BitField< 0, 16, Type> type;
+        BitField<16, 16, u64> id;
+        BitField<32,  4, u64> component_mask;
+    };
+
+    // The map.at() lookups below sit in a try-catch block to make sure we notice unknown configurations.
+    std::vector<OutputRegisterInfo> output_info_table;
+        for (int i = 0; i < 7; ++i) {
+            using OutputAttributes = Pica::Regs::VSOutputAttributes;
+
+            // TODO: It's still unclear how the attribute components map to the register!
+            //       Once we know that, this code probably will not make much sense anymore.
+            std::map<OutputAttributes::Semantic, std::pair<OutputRegisterInfo::Type, u32> > map = {
+                { OutputAttributes::POSITION_X, { OutputRegisterInfo::POSITION, 1} },
+                { OutputAttributes::POSITION_Y, { OutputRegisterInfo::POSITION, 2} },
+                { OutputAttributes::POSITION_Z, { OutputRegisterInfo::POSITION, 4} },
+                { OutputAttributes::POSITION_W, { OutputRegisterInfo::POSITION, 8} },
+                { OutputAttributes::COLOR_R, { OutputRegisterInfo::COLOR, 1} },
+                { OutputAttributes::COLOR_G, { OutputRegisterInfo::COLOR, 2} },
+                { OutputAttributes::COLOR_B, { OutputRegisterInfo::COLOR, 4} },
+                { OutputAttributes::COLOR_A, { OutputRegisterInfo::COLOR, 8} },
+                { OutputAttributes::TEXCOORD0_U, { OutputRegisterInfo::TEXCOORD0, 1} },
+                { OutputAttributes::TEXCOORD0_V, { OutputRegisterInfo::TEXCOORD0, 2} },
+                { OutputAttributes::TEXCOORD1_U, { OutputRegisterInfo::TEXCOORD1, 1} },
+                { OutputAttributes::TEXCOORD1_V, { OutputRegisterInfo::TEXCOORD1, 2} },
+                { OutputAttributes::TEXCOORD2_U, { OutputRegisterInfo::TEXCOORD2, 1} },
+                { OutputAttributes::TEXCOORD2_V, { OutputRegisterInfo::TEXCOORD2, 2} }
+            };
+
+            for (const auto& semantic : std::vector<OutputAttributes::Semantic>{
+                                                output_attributes[i].map_x,
+                                                output_attributes[i].map_y,
+                                                output_attributes[i].map_z,
+                                                output_attributes[i].map_w     }) {
+                if (semantic == OutputAttributes::INVALID)
+                    continue;
+
+                try {
+                    OutputRegisterInfo::Type type = map.at(semantic).first;
+                    u32 component_mask = map.at(semantic).second;
+
+                    auto it = std::find_if(output_info_table.begin(), output_info_table.end(),
+                                        [&i, &type](const OutputRegisterInfo& info) {
+                                            return info.id == i && info.type == type;
+                                        }
+                                        );
+
+                    if (it == output_info_table.end()) {
+                        output_info_table.push_back({});
+                        output_info_table.back().type = type;
+                        output_info_table.back().component_mask = component_mask;
+                        output_info_table.back().id = i;
+                    } else {
+                        it->component_mask = it->component_mask | component_mask;
+                    }
+                } catch (const std::out_of_range& oor) {
+                    _dbg_assert_msg_(GPU, 0, "Unknown output attribute mapping");
+                    ERROR_LOG(GPU, "Unknown output attribute mapping: %03x, %03x, %03x, %03x",
+                              (int)output_attributes[i].map_x.Value(),
+                              (int)output_attributes[i].map_y.Value(),
+                              (int)output_attributes[i].map_z.Value(),
+                              (int)output_attributes[i].map_w.Value());
+                }
+            }
+        }
+
+
+    struct {
+        DVLBHeader header;
+        u32 dvle_offset;
+    } dvlb{ {DVLBHeader::MAGIC_WORD, 1 } }; // 1 DVLE
+
+    DVLPHeader dvlp{ DVLPHeader::MAGIC_WORD };
+    DVLEHeader dvle{ DVLEHeader::MAGIC_WORD };
+
+    QueueForWriting((u8*)&dvlb, sizeof(dvlb));
+    u32 dvlp_offset = QueueForWriting((u8*)&dvlp, sizeof(dvlp));
+    dvlb.dvle_offset = QueueForWriting((u8*)&dvle, sizeof(dvle));
+    // The queue holds pointers, not copies, so header fields assigned below are still written out.
+    // TODO: Reduce the amount of binary code written to relevant portions
+    dvlp.binary_offset = write_offset - dvlp_offset;
+    dvlp.binary_size_words = binary_size;
+    QueueForWriting((u8*)binary_data, binary_size * sizeof(u32));
+
+    dvlp.swizzle_patterns_offset = write_offset - dvlp_offset;
+    dvlp.swizzle_patterns_num_entries = swizzle_size;
+    u32 dummy = 0; // zero word queued after every swizzle entry
+    for (int i = 0; i < swizzle_size; ++i) {
+        QueueForWriting((u8*)&swizzle_data[i], sizeof(swizzle_data[i]));
+        QueueForWriting((u8*)&dummy, sizeof(dummy));
+    }
+
+    dvle.main_offset_words = main_offset;
+    dvle.output_register_table_offset = write_offset - dvlb.dvle_offset;
+    dvle.output_register_table_size = output_info_table.size();
+    QueueForWriting((u8*)output_info_table.data(), output_info_table.size() * sizeof(OutputRegisterInfo));
+
+    // TODO: Create a label table for "main"
+
+
+    // Write data to file
+    static int dump_index = 0;
+    std::string filename = std::string("shader_dump") + std::to_string(++dump_index) + std::string(".shbin");
+    std::ofstream file(filename, std::ios_base::out | std::ios_base::binary);
+
+    for (auto& chunk : writing_queue) {
+        file.write((char*)chunk.pointer, chunk.size);
+    }
+}
+
+static std::unique_ptr<PicaTrace> pica_trace; // Trace being recorded; accessed only while holding pica_trace_mutex
+static std::mutex pica_trace_mutex;
+static int is_pica_tracing = false; // NOTE(review): plain int that is also read/written without the mutex - consider std::atomic<bool>
+
+// Starts recording Pica register writes into a fresh PicaTrace; logs an error and does nothing if already tracing.
+void StartPicaTracing()
+{
+    if (is_pica_tracing) {
+        ERROR_LOG(GPU, "StartPicaTracing called even though tracing already running!");
+        return;
+    }
+
+    // Scoped guard: the mutex is released even if allocating the trace throws.
+    std::lock_guard<std::mutex> lock(pica_trace_mutex);
+    pica_trace = std::unique_ptr<PicaTrace>(new PicaTrace);
+    is_pica_tracing = true;
+}
+
+bool IsPicaTracing()
+{
+    return is_pica_tracing; // Reads the flag without taking pica_trace_mutex
+}
+
+void OnPicaRegWrite(u32 id, u32 value)
+{
+    // Cheap unlocked pre-check so the common (not tracing) case never touches the mutex
+    if (!is_pica_tracing)
+        return;
+
+    std::lock_guard<std::mutex> guard(pica_trace_mutex);
+
+    // Check again under the lock: tracing may have been finished in the meantime
+    if (is_pica_tracing) {
+        pica_trace->writes.push_back({id, value});
+    }
+}
+
+// Stops tracing and hands the recorded trace to the caller; returns an empty pointer if tracing was not active.
+std::unique_ptr<PicaTrace> FinishPicaTracing()
+{
+    if (!is_pica_tracing) {
+        ERROR_LOG(GPU, "FinishPicaTracing called even though tracing is not running!");
+        return {};
+    }
+
+    // Signal that no further tracing should be performed
+    is_pica_tracing = false;
+
+    // Blocks until an in-flight OnPicaRegWrite has released the mutex; unlocked again on scope exit.
+    std::lock_guard<std::mutex> lock(pica_trace_mutex);
+    std::unique_ptr<PicaTrace> ret(std::move(pica_trace));
+    return ret;
+}
+
+void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
+    // Dumps a texture with 3 bytes per texel to texture_dump<N>.png (needs HAVE_PNG).
+    // NOTE: Permanently enabling this just trashes hard disks for no reason.
+    //       Hence, this is currently disabled.
+    return;
+
+#ifndef HAVE_PNG
+    return;
+#else
+    if (!data)
+        return;
+
+    // Write data to file
+    static int dump_index = 0;
+    std::string filename = std::string("texture_dump") + std::to_string(++dump_index) + std::string(".png");
+    u32 row_stride = texture_config.width * 3;
+    // Allocated before setjmp() so the error path (finalise) can always free it.
+    u8* buf = new u8[row_stride * texture_config.height];
+
+    char title[] = "Citra texture dump";
+    char title_key[] = "Title";
+    png_structp png_ptr = nullptr;
+    png_infop info_ptr = nullptr;
+
+    // Open file for writing (binary mode)
+    File::IOFile fp(filename, "wb");
+
+    // Initialize write structure
+    png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr);
+    if (png_ptr == nullptr) {
+        ERROR_LOG(GPU, "Could not allocate write struct\n");
+        goto finalise;
+
+    }
+
+    // Initialize info structure
+    info_ptr = png_create_info_struct(png_ptr);
+    if (info_ptr == nullptr) {
+        ERROR_LOG(GPU, "Could not allocate info struct\n");
+        goto finalise;
+    }
+
+    // Setup Exception handling
+    if (setjmp(png_jmpbuf(png_ptr))) {
+        ERROR_LOG(GPU, "Error during png creation\n");
+        goto finalise;
+    }
+
+    png_init_io(png_ptr, fp.GetHandle());
+
+    // Write header (8 bit colour depth)
+    png_set_IHDR(png_ptr, info_ptr, texture_config.width, texture_config.height,
+        8, PNG_COLOR_TYPE_RGB /*_ALPHA*/, PNG_INTERLACE_NONE,
+        PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
+
+    png_text title_text;
+    title_text.compression = PNG_TEXT_COMPRESSION_NONE;
+    title_text.key = title_key;
+    title_text.text = title;
+    png_set_text(png_ptr, info_ptr, &title_text, 1);
+
+    png_write_info(png_ptr, info_ptr);
+
+    // Untile the source texels into linear rows, reversing the order of the 3 bytes per texel
+    for (int y = 0; y < texture_config.height; ++y) {
+        for (int x = 0; x < texture_config.width; ++x) {
+            // Cf. rasterizer code for an explanation of this algorithm.
+            int texel_index_within_tile = 0;
+            for (int block_size_index = 0; block_size_index < 3; ++block_size_index) {
+                int sub_tile_width = 1 << block_size_index;
+                int sub_tile_height = 1 << block_size_index;
+
+                int sub_tile_index = (x & sub_tile_width) << block_size_index;
+                sub_tile_index += 2 * ((y & sub_tile_height) << block_size_index);
+                texel_index_within_tile += sub_tile_index;
+            }
+
+            const int block_width = 8;
+            const int block_height = 8;
+
+            int coarse_x = (x / block_width) * block_width;
+            int coarse_y = (y / block_height) * block_height;
+
+            u8* source_ptr = (u8*)data + coarse_x * block_height * 3 + coarse_y * row_stride + texel_index_within_tile * 3;
+            buf[3 * x + y * row_stride    ] = source_ptr[2];
+            buf[3 * x + y * row_stride + 1] = source_ptr[1];
+            buf[3 * x + y * row_stride + 2] = source_ptr[0];
+        }
+    }
+
+    // Write image data
+    for (auto y = 0; y < texture_config.height; ++y)
+    {
+        u8* row_ptr = (u8*)buf + y * row_stride;
+        png_write_row(png_ptr, row_ptr);
+    }
+
+    // End write
+    png_write_end(png_ptr, nullptr);
+
+finalise:
+    // Reached on success and on every error path, so buf is never leaked
+    delete[] buf;
+    if (info_ptr != nullptr) png_free_data(png_ptr, info_ptr, PNG_FREE_ALL, -1);
+    if (png_ptr != nullptr) png_destroy_write_struct(&png_ptr, (png_infopp)nullptr);
+#endif
+}
+
+void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages)
+{
+    using Source = Pica::Regs::TevStageConfig::Source;
+    using ColorModifier = Pica::Regs::TevStageConfig::ColorModifier;
+    using AlphaModifier = Pica::Regs::TevStageConfig::AlphaModifier;
+    using Operation = Pica::Regs::TevStageConfig::Operation;
+    // Builds one text line per TEV stage (color combiner, then alpha combiner) and emits it via DEBUG_LOG.
+    std::string stage_info = "Tev setup:\n";
+    for (int index = 0; index < stages.size(); ++index) {
+        const auto& tev_stage = stages[index];
+        // Name tables; values missing from a table are printed with an "Unknown"/"????" placeholder.
+        const std::map<Source, std::string> source_map = {
+            { Source::PrimaryColor, "PrimaryColor" },
+            { Source::Texture0, "Texture0" },
+            { Source::Constant, "Constant" },
+            { Source::Previous, "Previous" },
+        };
+
+        const std::map<ColorModifier, std::string> color_modifier_map = {
+            { ColorModifier::SourceColor, { "%source.rgb" } }
+        };
+        const std::map<AlphaModifier, std::string> alpha_modifier_map = {
+            { AlphaModifier::SourceAlpha, "%source.a" }
+        };
+
+        std::map<Operation, std::string> combiner_map = {
+            { Operation::Replace, "%source1" },
+            { Operation::Modulate, "(%source1 * %source2) / 255" },
+        };
+        // Replaces only the first occurrence of pattern; input is returned unchanged if pattern is absent.
+        auto ReplacePattern =
+                [](const std::string& input, const std::string& pattern, const std::string& replacement) -> std::string {
+                    size_t start = input.find(pattern);
+                    if (start == std::string::npos)
+                        return input;
+
+                    std::string ret = input;
+                    ret.replace(start, pattern.length(), replacement);
+                    return ret;
+                };
+        auto GetColorSourceStr =
+                [&source_map,&color_modifier_map,&ReplacePattern](const Source& src, const ColorModifier& modifier) {
+                    auto src_it = source_map.find(src);
+                    std::string src_str = "Unknown";
+                    if (src_it != source_map.end())
+                        src_str = src_it->second;
+
+                    auto modifier_it = color_modifier_map.find(modifier);
+                    std::string modifier_str = "%source.????";
+                    if (modifier_it != color_modifier_map.end())
+                        modifier_str = modifier_it->second;
+
+                    return ReplacePattern(modifier_str, "%source", src_str);
+                };
+        auto GetColorCombinerStr =
+                [&](const Regs::TevStageConfig& tev_stage) {
+                    auto op_it = combiner_map.find(tev_stage.color_op);
+                    std::string op_str = "Unknown op (%source1, %source2, %source3)";
+                    if (op_it != combiner_map.end())
+                        op_str = op_it->second;
+
+                    op_str = ReplacePattern(op_str, "%source1", GetColorSourceStr(tev_stage.color_source1, tev_stage.color_modifier1));
+                    op_str = ReplacePattern(op_str, "%source2", GetColorSourceStr(tev_stage.color_source2, tev_stage.color_modifier2));
+                    return   ReplacePattern(op_str, "%source3", GetColorSourceStr(tev_stage.color_source3, tev_stage.color_modifier3));
+                };
+        auto GetAlphaSourceStr =
+                [&source_map,&alpha_modifier_map,&ReplacePattern](const Source& src, const AlphaModifier& modifier) {
+                    auto src_it = source_map.find(src);
+                    std::string src_str = "Unknown";
+                    if (src_it != source_map.end())
+                        src_str = src_it->second;
+
+                    auto modifier_it = alpha_modifier_map.find(modifier);
+                    std::string modifier_str = "%source.????";
+                    if (modifier_it != alpha_modifier_map.end())
+                        modifier_str = modifier_it->second;
+
+                    return ReplacePattern(modifier_str, "%source", src_str);
+                };
+        auto GetAlphaCombinerStr =
+                [&](const Regs::TevStageConfig& tev_stage) {
+                    auto op_it = combiner_map.find(tev_stage.alpha_op);
+                    std::string op_str = "Unknown op (%source1, %source2, %source3)";
+                    if (op_it != combiner_map.end())
+                        op_str = op_it->second;
+
+                    op_str = ReplacePattern(op_str, "%source1", GetAlphaSourceStr(tev_stage.alpha_source1, tev_stage.alpha_modifier1));
+                    op_str = ReplacePattern(op_str, "%source2", GetAlphaSourceStr(tev_stage.alpha_source2, tev_stage.alpha_modifier2));
+                    return   ReplacePattern(op_str, "%source3", GetAlphaSourceStr(tev_stage.alpha_source3, tev_stage.alpha_modifier3));
+                };
+
+        stage_info += "Stage " + std::to_string(index) + ": " + GetColorCombinerStr(tev_stage) + "   " + GetAlphaCombinerStr(tev_stage) + "\n";
+    }
+
+    DEBUG_LOG(GPU, "%s", stage_info.c_str());
+}
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
new file mode 100644
index 00000000..8b1499bf
--- /dev/null
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -0,0 +1,66 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <memory>
+#include <vector>
+
+#include "video_core/pica.h"
+
+namespace Pica {
+
+namespace DebugUtils {
+
+// Simple utility class for dumping geometry data to an OBJ file
+class GeometryDumper {
+public:
+    struct Vertex {
+        std::array<float,3> pos;
+    };
+    // Appends the three vertices plus one face that references them.
+    void AddTriangle(Vertex& v0, Vertex& v1, Vertex& v2);
+    // Meant to write the collected geometry to an OBJ file; the implementation currently returns early.
+    void Dump();
+
+private:
+    struct Face {
+        int index[3]; // 0-based indices into vertices
+    };
+
+    std::vector<Vertex> vertices;
+    std::vector<Face> faces;
+};
+
+void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size,
+                u32 main_offset, const Regs::VSOutputAttributes* output_attributes);
+
+
+// Utility class to log Pica commands.
+struct PicaTrace {
+    struct Write : public std::pair<u32,u32> {
+		Write(u32 id, u32 value) : std::pair<u32,u32>(id, value) {}
+        // Named accessors for the (register id, written value) pair
+        u32& Id() { return first; }
+        const u32& Id() const { return first; }
+
+        u32& Value() { return second; }
+        const u32& Value() const { return second; }
+    };
+    std::vector<Write> writes; // appended to by OnPicaRegWrite while tracing is active
+};
+
+void StartPicaTracing();
+bool IsPicaTracing();
+void OnPicaRegWrite(u32 id, u32 value);
+std::unique_ptr<PicaTrace> FinishPicaTracing();
+
+void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data);
+
+void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages);
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/gpu_debugger.h b/src/video_core/gpu_debugger.h
index 2ba87345..5a81fcfc 100644
--- a/src/video_core/gpu_debugger.h
+++ b/src/video_core/gpu_debugger.h
@@ -18,19 +18,6 @@
 class GraphicsDebugger
 {
 public:
-    // A few utility structs used to expose data
-    // A vector of commands represented by their raw byte sequence
-    struct PicaCommand : public std::vector<u32>
-    {
-        const Pica::CommandProcessor::CommandHeader& GetHeader() const
-        {
-            const u32& val = at(1);
-            return *(Pica::CommandProcessor::CommandHeader*)&val;
-        }
-    };
-
-    typedef std::vector<PicaCommand> PicaCommandList;
-
     // Base class for all objects which need to be notified about GPU events
     class DebuggerObserver
     {
@@ -55,16 +42,6 @@ public:
             ERROR_LOG(GSP, "Received command: id=%x", (int)cmd.id.Value());
         }
 
-        /**
-        * @param lst command list which triggered this call
-        * @param is_new true if the command list was called for the first time
-        * @todo figure out how to make sure called functions don't keep references around beyond their life time
-        */
-        virtual void OnCommandListCalled(const PicaCommandList& lst, bool is_new)
-        {
-            ERROR_LOG(GSP, "Command list called: %d", (int)is_new);
-        }
-
     protected:
         const GraphicsDebugger* GetDebugger() const
         {
@@ -93,49 +70,12 @@ public:
                         } );
     }
 
-    void CommandListCalled(u32 address, u32* command_list, u32 size_in_words)
-    {
-        if (observers.empty())
-            return;
-
-        PicaCommandList cmdlist;
-        for (u32* parse_pointer = command_list; parse_pointer < command_list + size_in_words;)
-        {
-            const Pica::CommandProcessor::CommandHeader& header = *(Pica::CommandProcessor::CommandHeader*)(&parse_pointer[1]);
-
-            cmdlist.push_back(PicaCommand());
-            auto& cmd = cmdlist.back();
-
-            size_t size = 2 + header.extra_data_length;
-            size = (size + 1) / 2 * 2; // align to 8 bytes
-            cmd.reserve(size);
-            std::copy(parse_pointer, parse_pointer + size, std::back_inserter(cmd));
-
-            parse_pointer += size;
-        }
-
-        auto obj = std::pair<u32,PicaCommandList>(address, cmdlist);
-        auto it = std::find(command_lists.begin(), command_lists.end(), obj);
-        bool is_new = (it == command_lists.end());
-        if (is_new)
-            command_lists.push_back(obj);
-
-        ForEachObserver([&](DebuggerObserver* observer) {
-                            observer->OnCommandListCalled(obj.second, is_new);
-                        } );
-    }
-
     const GSP_GPU::Command& ReadGXCommandHistory(int index) const
     {
         // TODO: Is this thread-safe?
         return gx_command_history[index];
     }
 
-    const std::vector<std::pair<u32,PicaCommandList>>& GetCommandLists() const
-    {
-        return command_lists;
-    }
-
     void RegisterObserver(DebuggerObserver* observer)
     {
         // TODO: Check for duplicates
@@ -158,7 +98,4 @@ private:
     std::vector<DebuggerObserver*> observers;
 
     std::vector<GSP_GPU::Command> gx_command_history;
-
-    // vector of pairs of command lists and their storage address
-    std::vector<std::pair<u32,PicaCommandList>> command_lists;
 };
diff --git a/src/video_core/math.h b/src/video_core/math.h
index 7030f2cf..83ba8123 100644
--- a/src/video_core/math.h
+++ b/src/video_core/math.h
@@ -39,13 +39,19 @@ template<typename T> class Vec2;
 template<typename T> class Vec3;
 template<typename T> class Vec4;
 
+template<typename T>
+static inline Vec2<T> MakeVec(const T& x, const T& y);
+template<typename T>
+static inline Vec3<T> MakeVec(const T& x, const T& y, const T& z);
+template<typename T>
+static inline Vec4<T> MakeVec(const T& x, const T& y, const T& z, const T& w);
+
 
 template<typename T>
 class Vec2 {
 public:
-    struct {
-        T x,y;
-    };
+    T x;
+    T y;
 
     T* AsArray() { return &x; }
 
@@ -68,34 +74,34 @@ public:
         a[0] = x; a[1] = y;
     }
 
-    Vec2 operator +(const Vec2& other) const
+    Vec2<decltype(T{}+T{})> operator +(const Vec2& other) const
     {
-        return Vec2(x+other.x, y+other.y);
+        return MakeVec(x+other.x, y+other.y);
     }
     void operator += (const Vec2 &other)
     {
         x+=other.x; y+=other.y;
     }
-    Vec2 operator -(const Vec2& other) const
+    Vec2<decltype(T{}-T{})> operator -(const Vec2& other) const
     {
-        return Vec2(x-other.x, y-other.y);
+        return MakeVec(x-other.x, y-other.y);
     }
     void operator -= (const Vec2& other)
     {
         x-=other.x; y-=other.y;
     }
-    Vec2 operator -() const
+    Vec2<decltype(-T{})> operator -() const
     {
-        return Vec2(-x,-y);
+        return MakeVec(-x,-y);
     }
-    Vec2 operator * (const Vec2& other) const
+    Vec2<decltype(T{}*T{})> operator * (const Vec2& other) const
     {
-        return Vec2(x*other.x, y*other.y);
+        return MakeVec(x*other.x, y*other.y);
     }
     template<typename V>
-    Vec2 operator * (const V& f) const
+    Vec2<decltype(T{}*V{})> operator * (const V& f) const
     {
-        return Vec2(x*f,y*f);
+        return MakeVec(x*f,y*f);
     }
     template<typename V>
     void operator *= (const V& f)
@@ -103,9 +109,9 @@ public:
         x*=f; y*=f;
     }
     template<typename V>
-    Vec2 operator / (const V& f) const
+    Vec2<decltype(T{}/V{})> operator / (const V& f) const
     {
-        return Vec2(x/f,y/f);
+        return MakeVec(x/f,y/f);
     }
     template<typename V>
     void operator /= (const V& f)
@@ -152,20 +158,9 @@ public:
     const T& t() const { return y; }
 
     // swizzlers - create a subvector of specific components
-    Vec2 yx() const { return Vec2(y, x); }
-    Vec2 vu() const { return Vec2(y, x); }
-    Vec2 ts() const { return Vec2(y, x); }
-
-    // Inserters to add new elements to effectively create larger vectors containing this Vec2
-    Vec3<T> InsertBeforeX(const T& value) {
-        return Vec3<T>(value, x, y);
-    }
-    Vec3<T> InsertBeforeY(const T& value) {
-        return Vec3<T>(x, value, y);
-    }
-    Vec3<T> Append(const T& value) {
-        return Vec3<T>(x, y, value);
-    }
+    const Vec2 yx() const { return Vec2(y, x); }
+    const Vec2 vu() const { return Vec2(y, x); }
+    const Vec2 ts() const { return Vec2(y, x); }
 };
 
 template<typename T, typename V>
@@ -180,10 +175,9 @@ template<typename T>
 class Vec3
 {
 public:
-    struct
-    {
-        T x,y,z;
-    };
+    T x;
+    T y;
+    T z;
 
     T* AsArray() { return &x; }
 
@@ -193,7 +187,7 @@ public:
 
     template<typename T2>
     Vec3<T2> Cast() const {
-        return Vec3<T2>((T2)x, (T2)y, (T2)z);
+        return MakeVec<T2>((T2)x, (T2)y, (T2)z);
     }
 
     // Only implemented for T=int and T=float
@@ -202,7 +196,7 @@ public:
 
     static Vec3 AssignToAll(const T& f)
     {
-        return Vec3<T>(f, f, f);
+        return MakeVec(f, f, f);
     }
 
     void Write(T a[3])
@@ -210,34 +204,34 @@ public:
         a[0] = x; a[1] = y; a[2] = z;
     }
 
-    Vec3 operator +(const Vec3 &other) const
+    Vec3<decltype(T{}+T{})> operator +(const Vec3 &other) const
     {
-        return Vec3(x+other.x, y+other.y, z+other.z);
+        return MakeVec(x+other.x, y+other.y, z+other.z);
     }
     void operator += (const Vec3 &other)
     {
         x+=other.x; y+=other.y; z+=other.z;
     }
-    Vec3 operator -(const Vec3 &other) const
+    Vec3<decltype(T{}-T{})> operator -(const Vec3 &other) const
     {
-        return Vec3(x-other.x, y-other.y, z-other.z);
+        return MakeVec(x-other.x, y-other.y, z-other.z);
     }
     void operator -= (const Vec3 &other)
     {
         x-=other.x; y-=other.y; z-=other.z;
     }
-    Vec3 operator -() const
+    Vec3<decltype(-T{})> operator -() const
     {
-        return Vec3(-x,-y,-z);
+        return MakeVec(-x,-y,-z);
     }
-    Vec3 operator * (const Vec3 &other) const
+    Vec3<decltype(T{}*T{})> operator * (const Vec3 &other) const
     {
-        return Vec3(x*other.x, y*other.y, z*other.z);
+        return MakeVec(x*other.x, y*other.y, z*other.z);
     }
     template<typename V>
-    Vec3 operator * (const V& f) const
+    Vec3<decltype(T{}*V{})> operator * (const V& f) const
     {
-        return Vec3(x*f,y*f,z*f);
+        return MakeVec(x*f,y*f,z*f);
     }
     template<typename V>
     void operator *= (const V& f)
@@ -245,9 +239,9 @@ public:
         x*=f; y*=f; z*=f;
     }
     template<typename V>
-    Vec3 operator / (const V& f) const
+    Vec3<decltype(T{}/V{})> operator / (const V& f) const
     {
-        return Vec3(x/f,y/f,z/f);
+        return MakeVec(x/f,y/f,z/f);
     }
     template<typename V>
     void operator /= (const V& f)
@@ -310,7 +304,7 @@ public:
     // swizzlers - create a subvector of specific components
     // e.g. Vec2 uv() { return Vec2(x,y); }
     // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
-#define _DEFINE_SWIZZLER2(a, b, name) Vec2<T> name() const { return Vec2<T>(a, b); }
+#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
 #define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
     _DEFINE_SWIZZLER2(a, b, a##b); \
     _DEFINE_SWIZZLER2(a, b, a2##b2); \
@@ -319,27 +313,13 @@ public:
     _DEFINE_SWIZZLER2(b, a, b##a); \
     _DEFINE_SWIZZLER2(b, a, b2##a2); \
     _DEFINE_SWIZZLER2(b, a, b3##a3); \
-    _DEFINE_SWIZZLER2(b, a, b4##a4);
+    _DEFINE_SWIZZLER2(b, a, b4##a4)
 
     DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
     DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
     DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
 #undef DEFINE_SWIZZLER2
 #undef _DEFINE_SWIZZLER2
-
-    // Inserters to add new elements to effectively create larger vectors containing this Vec2
-    Vec4<T> InsertBeforeX(const T& value) {
-        return Vec4<T>(value, x, y, z);
-    }
-    Vec4<T> InsertBeforeY(const T& value) {
-        return Vec4<T>(x, value, y, z);
-    }
-    Vec4<T> InsertBeforeZ(const T& value) {
-        return Vec4<T>(x, y, value, z);
-    }
-    Vec4<T> Append(const T& value) {
-        return Vec4<T>(x, y, z, value);
-    }
 };
 
 template<typename T, typename V>
@@ -348,16 +328,27 @@ Vec3<T> operator * (const V& f, const Vec3<T>& vec)
     return Vec3<T>(f*vec.x,f*vec.y,f*vec.z);
 }
 
+template<>
+inline float Vec3<float>::Length() const {
+    return std::sqrt(x * x + y * y + z * z);
+}
+
+template<>
+inline Vec3<float> Vec3<float>::Normalized() const {
+    return *this / Length();
+}
+
+
 typedef Vec3<float> Vec3f;
 
 template<typename T>
 class Vec4
 {
 public:
-    struct
-    {
-        T x,y,z,w;
-    };
+    T x;
+    T y;
+    T z;
+    T w;
 
     T* AsArray() { return &x; }
 
@@ -383,34 +374,34 @@ public:
         a[0] = x; a[1] = y; a[2] = z; a[3] = w;
     }
 
-    Vec4 operator +(const Vec4& other) const
+    Vec4<decltype(T{}+T{})> operator +(const Vec4& other) const
     {
-        return Vec4(x+other.x, y+other.y, z+other.z, w+other.w);
+        return MakeVec(x+other.x, y+other.y, z+other.z, w+other.w);
     }
     void operator += (const Vec4& other)
     {
         x+=other.x; y+=other.y; z+=other.z; w+=other.w;
     }
-    Vec4 operator -(const Vec4 &other) const
+    Vec4<decltype(T{}-T{})> operator -(const Vec4 &other) const
     {
-        return Vec4(x-other.x, y-other.y, z-other.z, w-other.w);
+        return MakeVec(x-other.x, y-other.y, z-other.z, w-other.w);
     }
     void operator -= (const Vec4 &other)
     {
         x-=other.x; y-=other.y; z-=other.z; w-=other.w;
     }
-    Vec4 operator -() const
+    Vec4<decltype(-T{})> operator -() const
     {
-        return Vec4(-x,-y,-z,-w);
+        return MakeVec(-x,-y,-z,-w);
     }
-    Vec4 operator * (const Vec4 &other) const
+    Vec4<decltype(T{}*T{})> operator * (const Vec4 &other) const
     {
-        return Vec4(x*other.x, y*other.y, z*other.z, w*other.w);
+        return MakeVec(x*other.x, y*other.y, z*other.z, w*other.w);
     }
     template<typename V>
-    Vec4 operator * (const V& f) const
+    Vec4<decltype(T{}*V{})> operator * (const V& f) const
     {
-        return Vec4(x*f,y*f,z*f,w*f);
+        return MakeVec(x*f,y*f,z*f,w*f);
     }
     template<typename V>
     void operator *= (const V& f)
@@ -418,9 +409,9 @@ public:
         x*=f; y*=f; z*=f; w*=f;
     }
     template<typename V>
-    Vec4 operator / (const V& f) const
+    Vec4<decltype(T{}/V{})> operator / (const V& f) const
     {
-        return Vec4(x/f,y/f,z/f,w/f);
+        return MakeVec(x/f,y/f,z/f,w/f);
     }
     template<typename V>
     void operator /= (const V& f)
@@ -469,12 +460,12 @@ public:
     // swizzlers - create a subvector of specific components
     // e.g. Vec2 uv() { return Vec2(x,y); }
     // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
-#define _DEFINE_SWIZZLER2(a, b, name) Vec2<T> name() const { return Vec2<T>(a, b); }
+#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
 #define DEFINE_SWIZZLER2(a, b, a2, b2) \
     _DEFINE_SWIZZLER2(a, b, a##b); \
     _DEFINE_SWIZZLER2(a, b, a2##b2); \
     _DEFINE_SWIZZLER2(b, a, b##a); \
-    _DEFINE_SWIZZLER2(b, a, b2##a2);
+    _DEFINE_SWIZZLER2(b, a, b2##a2)
 
     DEFINE_SWIZZLER2(x, y, r, g);
     DEFINE_SWIZZLER2(x, z, r, b);
@@ -485,7 +476,7 @@ public:
 #undef DEFINE_SWIZZLER2
 #undef _DEFINE_SWIZZLER2
 
-#define _DEFINE_SWIZZLER3(a, b, c, name) Vec3<T> name() const { return Vec3<T>(a, b, c); }
+#define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3<T> name() const { return Vec3<T>(a, b, c); }
 #define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \
     _DEFINE_SWIZZLER3(a, b, c, a##b##c); \
     _DEFINE_SWIZZLER3(a, c, b, a##c##b); \
@@ -498,7 +489,7 @@ public:
     _DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \
     _DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \
     _DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
-    _DEFINE_SWIZZLER3(c, b, a, c2##b2##a2);
+    _DEFINE_SWIZZLER3(c, b, a, c2##b2##a2)
 
     DEFINE_SWIZZLER3(x, y, z, r, g, b);
     DEFINE_SWIZZLER3(x, y, w, r, g, a);
@@ -510,69 +501,121 @@ public:
 
 
 template<typename T, typename V>
-Vec4<T> operator * (const V& f, const Vec4<T>& vec)
+Vec4<decltype(V{}*T{})> operator * (const V& f, const Vec4<T>& vec)
 {
-    return Vec4<T>(f*vec.x,f*vec.y,f*vec.z,f*vec.w);
+    return MakeVec(f*vec.x,f*vec.y,f*vec.z,f*vec.w);
 }
 
 typedef Vec4<float> Vec4f;
 
 
 template<typename T>
-static inline T Dot(const Vec2<T>& a, const Vec2<T>& b)
+static inline decltype(T{}*T{}+T{}*T{}) Dot(const Vec2<T>& a, const Vec2<T>& b)
 {
     return a.x*b.x + a.y*b.y;
 }
 
 template<typename T>
-static inline T Dot(const Vec3<T>& a, const Vec3<T>& b)
+static inline decltype(T{}*T{}+T{}*T{}) Dot(const Vec3<T>& a, const Vec3<T>& b)
 {
     return a.x*b.x + a.y*b.y + a.z*b.z;
 }
 
 template<typename T>
-static inline T Dot(const Vec4<T>& a, const Vec4<T>& b)
+static inline decltype(T{}*T{}+T{}*T{}) Dot(const Vec4<T>& a, const Vec4<T>& b)
 {
     return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
 }
 
 template<typename T>
-static inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)
+static inline Vec3<decltype(T{}*T{}-T{}*T{})> Cross(const Vec3<T>& a, const Vec3<T>& b)
 {
-    return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+    return MakeVec(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
 }
 
 // linear interpolation via float: 0.0=begin, 1.0=end
 template<typename X>
-static inline X Lerp(const X& begin, const X& end, const float t)
+static inline decltype(X{}*float{}+X{}*float{}) Lerp(const X& begin, const X& end, const float t)
 {
     return begin*(1.f-t) + end*t;
 }
 
 // linear interpolation via int: 0=begin, base=end
 template<typename X, int base>
-static inline X LerpInt(const X& begin, const X& end, const int t)
+static inline decltype((X{}*int{}+X{}*int{}) / base) LerpInt(const X& begin, const X& end, const int t)
 {
     return (begin*(base-t) + end*t) / base;
 }
 
 // Utility vector factories
 template<typename T>
-static inline Vec2<T> MakeVec2(const T& x, const T& y)
+static inline Vec2<T> MakeVec(const T& x, const T& y)
 {
     return Vec2<T>{x, y};
 }
 
 template<typename T>
-static inline Vec3<T> MakeVec3(const T& x, const T& y, const T& z)
+static inline Vec3<T> MakeVec(const T& x, const T& y, const T& z)
 {
     return Vec3<T>{x, y, z};
 }
 
 template<typename T>
-static inline Vec4<T> MakeVec4(const T& x, const T& y, const T& z, const T& w)
+static inline Vec4<T> MakeVec(const T& x, const T& y, const Vec2<T>& zw)
+{
+    return MakeVec(x, y, zw[0], zw[1]);
+}
+
+template<typename T>
+static inline Vec3<T> MakeVec(const Vec2<T>& xy, const T& z)
+{
+    return MakeVec(xy[0], xy[1], z);
+}
+
+template<typename T>
+static inline Vec3<T> MakeVec(const T& x, const Vec2<T>& yz)
+{
+    return MakeVec(x, yz[0], yz[1]);
+}
+
+template<typename T>
+static inline Vec4<T> MakeVec(const T& x, const T& y, const T& z, const T& w)
 {
     return Vec4<T>{x, y, z, w};
 }
 
+template<typename T>
+static inline Vec4<T> MakeVec(const Vec2<T>& xy, const T& z, const T& w)
+{
+    return MakeVec(xy[0], xy[1], z, w);
+}
+
+template<typename T>
+static inline Vec4<T> MakeVec(const T& x, const Vec2<T>& yz, const T& w)
+{
+    return MakeVec(x, yz[0], yz[1], w);
+}
+
+// NOTE: This has priority over "Vec2<Vec2<T>> MakeVec(const Vec2<T>& x, const Vec2<T>& y)".
+//       Even if someone wanted to use an odd object like Vec2<Vec2<T>>, the compiler would error
+//       out soon enough due to misuse of the returned structure.
+template<typename T>
+static inline Vec4<T> MakeVec(const Vec2<T>& xy, const Vec2<T>& zw)
+{
+    return MakeVec(xy[0], xy[1], zw[0], zw[1]);
+}
+
+template<typename T>
+static inline Vec4<T> MakeVec(const Vec3<T>& xyz, const T& w)
+{
+    return MakeVec(xyz[0], xyz[1], xyz[2], w);
+}
+
+template<typename T>
+static inline Vec4<T> MakeVec(const T& x, const Vec3<T>& yzw) // yzw supplies the y, z and w components
+{
+    return MakeVec(x, yzw[0], yzw[1], yzw[2]);
+}
+
+
 } // namespace
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 64083014..cfdc9b93 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <array>
 #include <cstddef>
 #include <initializer_list>
 #include <map>
@@ -57,7 +58,7 @@ struct Regs {
 
     INSERT_PADDING_WORDS(0x1);
 
-    union {
+    union VSOutputAttributes {
         // Maps components of output vertex attributes to semantics
         enum Semantic : u32
         {
@@ -94,7 +95,137 @@ struct Regs {
         BitField<16, 16, u32> y;
     } viewport_corner;
 
-    INSERT_PADDING_WORDS(0xa7);
+    INSERT_PADDING_WORDS(0x17);
+
+    struct TextureConfig {
+        INSERT_PADDING_WORDS(0x1);
+
+        union {
+            BitField< 0, 16, u32> height;
+            BitField<16, 16, u32> width;
+        };
+
+        INSERT_PADDING_WORDS(0x2);
+
+        u32 address;
+
+        u32 GetPhysicalAddress() const { // read-only accessor: only inspects `address`
+            return DecodeAddressRegister(address) - Memory::FCRAM_PADDR + Memory::HEAP_GSP_VADDR; // rebase from FCRAM_PADDR onto HEAP_GSP_VADDR
+        }
+
+        // texture1 and texture2 store the texture format directly after the address
+        // whereas texture0 inserts some additional flags in between.
+        // Hence, we store the format separately so that all other parameters can be described
+        // in a single structure.
+    };
+
+    enum class TextureFormat : u32 {
+        RGBA8        =  0,
+        RGB8         =  1,
+        RGBA5551     =  2,
+        RGB565       =  3,
+        RGBA4        =  4,
+
+        // TODO: Support for the other formats is not implemented, yet.
+        // Seems like they are luminance formats and compressed textures.
+    };
+
+    BitField<0, 1, u32> texturing_enable;
+    TextureConfig texture0;
+    INSERT_PADDING_WORDS(0x8);
+    BitField<0, 4, TextureFormat> texture0_format;
+
+    INSERT_PADDING_WORDS(0x31);
+
+    // 0xc0-0xff: Texture Combiner (akin to glTexEnv)
+    struct TevStageConfig {
+        enum class Source : u32 {
+            PrimaryColor           = 0x0,
+            Texture0               = 0x3,
+            Texture1               = 0x4,
+            Texture2               = 0x5,
+            Texture3               = 0x6,
+            // 0x7-0xc = primary color??
+            Constant               = 0xe,
+            Previous               = 0xf,
+        };
+
+        enum class ColorModifier : u32 {
+            SourceColor         = 0,
+            OneMinusSourceColor = 1,
+            SourceAlpha         = 2,
+            OneMinusSourceAlpha = 3,
+
+            // Other values seem to be non-standard extensions
+        };
+
+        enum class AlphaModifier : u32 {
+            SourceAlpha         = 0,
+            OneMinusSourceAlpha = 1,
+
+            // Other values seem to be non-standard extensions
+        };
+
+        enum class Operation : u32 {
+            Replace         = 0,
+            Modulate        = 1,
+            Add             = 2,
+            AddSigned       = 3,
+            Lerp            = 4,
+            Subtract        = 5,
+        };
+
+        union {
+            BitField< 0, 4, Source> color_source1;
+            BitField< 4, 4, Source> color_source2;
+            BitField< 8, 4, Source> color_source3;
+            BitField<16, 4, Source> alpha_source1;
+            BitField<20, 4, Source> alpha_source2;
+            BitField<24, 4, Source> alpha_source3;
+        };
+
+        union {
+            BitField< 0, 4, ColorModifier> color_modifier1;
+            BitField< 4, 4, ColorModifier> color_modifier2;
+            BitField< 8, 4, ColorModifier> color_modifier3;
+            BitField<12, 3, AlphaModifier> alpha_modifier1;
+            BitField<16, 3, AlphaModifier> alpha_modifier2;
+            BitField<20, 3, AlphaModifier> alpha_modifier3;
+        };
+
+        union {
+            BitField< 0, 4, Operation> color_op;
+            BitField<16, 4, Operation> alpha_op;
+        };
+
+        union {
+            BitField< 0, 8, u32> const_r;
+            BitField< 8, 8, u32> const_g;
+            BitField<16, 8, u32> const_b;
+            BitField<24, 8, u32> const_a;
+        };
+
+        INSERT_PADDING_WORDS(0x1);
+    };
+
+    TevStageConfig tev_stage0;
+    INSERT_PADDING_WORDS(0x3);
+    TevStageConfig tev_stage1;
+    INSERT_PADDING_WORDS(0x3);
+    TevStageConfig tev_stage2;
+    INSERT_PADDING_WORDS(0x3);
+    TevStageConfig tev_stage3;
+    INSERT_PADDING_WORDS(0x13);
+    TevStageConfig tev_stage4;
+    INSERT_PADDING_WORDS(0x3);
+    TevStageConfig tev_stage5;
+    INSERT_PADDING_WORDS(0x13);
+
+    const std::array<Regs::TevStageConfig,6> GetTevStages() const {
+        return { tev_stage0, tev_stage1,
+                 tev_stage2, tev_stage3,
+                 tev_stage4, tev_stage5 };
+    };
 
     struct {
         enum ColorFormat : u32 {
@@ -403,6 +534,15 @@ struct Regs {
         ADD_FIELD(viewport_depth_range);
         ADD_FIELD(viewport_depth_far_plane);
         ADD_FIELD(viewport_corner);
+        ADD_FIELD(texturing_enable);
+        ADD_FIELD(texture0);
+        ADD_FIELD(texture0_format);
+        ADD_FIELD(tev_stage0);
+        ADD_FIELD(tev_stage1);
+        ADD_FIELD(tev_stage2);
+        ADD_FIELD(tev_stage3);
+        ADD_FIELD(tev_stage4);
+        ADD_FIELD(tev_stage5);
         ADD_FIELD(framebuffer);
         ADD_FIELD(vertex_attributes);
         ADD_FIELD(index_array);
@@ -460,6 +600,15 @@ ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e);
 ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
 ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
 ASSERT_REG_POSITION(viewport_corner, 0x68);
+ASSERT_REG_POSITION(texturing_enable, 0x80);
+ASSERT_REG_POSITION(texture0, 0x81);
+ASSERT_REG_POSITION(texture0_format, 0x8e);
+ASSERT_REG_POSITION(tev_stage0, 0xc0);
+ASSERT_REG_POSITION(tev_stage1, 0xc8);
+ASSERT_REG_POSITION(tev_stage2, 0xd0);
+ASSERT_REG_POSITION(tev_stage3, 0xd8);
+ASSERT_REG_POSITION(tev_stage4, 0xf0);
+ASSERT_REG_POSITION(tev_stage5, 0xf8);
 ASSERT_REG_POSITION(framebuffer, 0x110);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
 ASSERT_REG_POSITION(index_array, 0x227);
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index 2354ffb9..dabf2d1a 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -2,21 +2,23 @@
 // Licensed under GPLv2
 // Refer to the license.txt file included.
 
-#include "clipper.h"
 #include "pica.h"
 #include "primitive_assembly.h"
 #include "vertex_shader.h"
 
-namespace Pica {
+#include "video_core/debug_utils/debug_utils.h"
 
-namespace PrimitiveAssembly {
+namespace Pica {
 
-static OutputVertex buffer[2];
-static int buffer_index = 0; // TODO: reset this on emulation restart
+template<typename VertexType>
+PrimitiveAssembler<VertexType>::PrimitiveAssembler(Regs::TriangleTopology topology)
+    : topology(topology), buffer_index(0) {
+}
 
-void SubmitVertex(OutputVertex& vtx)
+template<typename VertexType>
+void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler)
 {
-    switch (registers.triangle_topology) {
+    switch (topology) {
         case Regs::TriangleTopology::List:
         case Regs::TriangleTopology::ListIndexed:
             if (buffer_index < 2) {
@@ -24,7 +26,7 @@ void SubmitVertex(OutputVertex& vtx)
             } else {
                 buffer_index = 0;
 
-                Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
+                triangle_handler(buffer[0], buffer[1], vtx);
             }
             break;
 
@@ -32,7 +34,7 @@ void SubmitVertex(OutputVertex& vtx)
             if (buffer_index == 2) {
                 buffer_index = 0;
 
-                Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
+                triangle_handler(buffer[0], buffer[1], vtx);
 
                 buffer[1] = vtx;
             } else {
@@ -41,11 +43,15 @@ void SubmitVertex(OutputVertex& vtx)
             break;
 
         default:
-            ERROR_LOG(GPU, "Unknown triangle mode %x:", (int)registers.triangle_topology.Value());
+            ERROR_LOG(GPU, "Unknown triangle topology %x:", (int)topology);
             break;
     }
 }
 
-} // namespace
+// explicitly instantiate use cases
+template
+struct PrimitiveAssembler<VertexShader::OutputVertex>;
+template
+struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>;
 
 } // namespace
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index 2a2b0c17..ea2e2f61 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -4,18 +4,40 @@
 
 #pragma once
 
-namespace Pica {
+#include <functional>
 
-namespace VertexShader {
-    struct OutputVertex;
-}
+#include "video_core/pica.h"
 
-namespace PrimitiveAssembly {
+#include "video_core/vertex_shader.h"
 
-using VertexShader::OutputVertex;
+namespace Pica {
 
-void SubmitVertex(OutputVertex& vtx);
+/*
+ * Utility class to build triangles from a series of vertices,
+ * according to a given triangle topology.
+ */
+template<typename VertexType>
+struct PrimitiveAssembler {
+    using TriangleHandler = std::function<void(VertexType& v0,
+                                               VertexType& v1,
+                                               VertexType& v2)>;
+
+    PrimitiveAssembler(Regs::TriangleTopology topology);
+
+    /*
+     * Queues a vertex, builds primitives from the vertex queue according to the given
+     * triangle topology, and calls triangle_handler for each generated primitive.
+     * NOTE: We could specify the triangle handler in the constructor, but this way we can
+     * keep event and handler code next to each other.
+     */
+    void SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler);
+
+private:
+    Regs::TriangleTopology topology;
+
+    int buffer_index;
+    VertexType buffer[2];
+};
 
-} // namespace
 
 } // namespace
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index a7c1bab3..cdfdb621 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -11,6 +11,8 @@
 #include "rasterizer.h"
 #include "vertex_shader.h"
 
+#include "debug_utils/debug_utils.h"
+
 namespace Pica {
 
 namespace Rasterizer {
@@ -78,10 +80,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
     u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
 
-    min_x = min_x & Fix12P4::IntMask();
-    min_y = min_y & Fix12P4::IntMask();
-    max_x = (max_x + Fix12P4::FracMask()) & Fix12P4::IntMask();
-    max_y = (max_y + Fix12P4::FracMask()) & Fix12P4::IntMask();
+    min_x &= Fix12P4::IntMask();
+    min_y &= Fix12P4::IntMask();
+    max_x = ((max_x + Fix12P4::FracMask()) & Fix12P4::IntMask());
+    max_y = ((max_y + Fix12P4::FracMask()) & Fix12P4::IntMask());
 
     // Triangle filling rules: Pixels on the right-sided edge or on flat bottom edges are not
     // drawn. Pixels on any other triangle border are drawn. This is implemented with three bias
@@ -112,10 +114,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             auto orient2d = [](const Math::Vec2<Fix12P4>& vtx1,
                                const Math::Vec2<Fix12P4>& vtx2,
                                const Math::Vec2<Fix12P4>& vtx3) {
-                const auto vec1 = (vtx2.Cast<int>() - vtx1.Cast<int>()).Append(0);
-                const auto vec2 = (vtx3.Cast<int>() - vtx1.Cast<int>()).Append(0);
+                const auto vec1 = Math::MakeVec(vtx2 - vtx1, 0);
+                const auto vec2 = Math::MakeVec(vtx3 - vtx1, 0);
                 // TODO: There is a very small chance this will overflow for sizeof(int) == 4
-                return Cross(vec1, vec2).z;
+                return Math::Cross(vec1, vec2).z;
             };
 
             int w0 = bias0 + orient2d(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
@@ -143,15 +145,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             //
             // The generalization to three vertices is straightforward in baricentric coordinates.
             auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) {
-                auto attr_over_w = Math::MakeVec3(attr0 / v0.pos.w,
-                                                  attr1 / v1.pos.w,
-                                                  attr2 / v2.pos.w);
-                auto w_inverse   = Math::MakeVec3(float24::FromFloat32(1.f) / v0.pos.w,
-                                                  float24::FromFloat32(1.f) / v1.pos.w,
-                                                  float24::FromFloat32(1.f) / v2.pos.w);
-                auto baricentric_coordinates = Math::MakeVec3(float24::FromFloat32(w0),
-                                                              float24::FromFloat32(w1),
-                                                              float24::FromFloat32(w2));
+                auto attr_over_w = Math::MakeVec(attr0 / v0.pos.w,
+                                                 attr1 / v1.pos.w,
+                                                 attr2 / v2.pos.w);
+                auto w_inverse   = Math::MakeVec(float24::FromFloat32(1.f) / v0.pos.w,
+                                                 float24::FromFloat32(1.f) / v1.pos.w,
+                                                 float24::FromFloat32(1.f) / v2.pos.w);
+                auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(w0),
+                                                             float24::FromFloat32(w1),
+                                                             float24::FromFloat32(w2));
 
                 float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates);
                 float24 interpolated_w_inverse   = Math::Dot(w_inverse,   baricentric_coordinates);
@@ -165,12 +167,196 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                 (u8)(GetInterpolatedAttribute(v0.color.a(), v1.color.a(), v2.color.a()).ToFloat32() * 255)
             };
 
+            Math::Vec4<u8> texture_color{};
+            float24 u = GetInterpolatedAttribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
+            float24 v = GetInterpolatedAttribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
+            if (registers.texturing_enable) {
+                // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each
+                // of which is composed of four 2x2 subtiles each of which is composed of four texels.
+                // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g.
+                // texels are laid out in a 2x2 subtile like this:
+                // 2 3
+                // 0 1
+                //
+                // The full 8x8 tile has the texels arranged like this:
+                //
+                // 42 43 46 47 58 59 62 63
+                // 40 41 44 45 56 57 60 61
+                // 34 35 38 39 50 51 54 55
+                // 32 33 36 37 48 49 52 53
+                // 10 11 14 15 26 27 30 31
+                // 08 09 12 13 24 25 28 29
+                // 02 03 06 07 18 19 22 23
+                // 00 01 04 05 16 17 20 21
+
+                // TODO: This is currently hardcoded for RGB8
+                u32* texture_data = (u32*)Memory::GetPointer(registers.texture0.GetPhysicalAddress());
+
+                // TODO(neobrain): Not sure if this swizzling pattern is used for all textures.
+                // To be flexible in case different but similar patterns are used, we keep this
+                // somewhat inefficient code around for now.
+                int s = (int)(u * float24::FromFloat32(registers.texture0.width)).ToFloat32();
+                int t = (int)(v * float24::FromFloat32(registers.texture0.height)).ToFloat32();
+                int texel_index_within_tile = 0;
+                for (int block_size_index = 0; block_size_index < 3; ++block_size_index) {
+                    int sub_tile_width = 1 << block_size_index;
+                    int sub_tile_height = 1 << block_size_index;
+
+                    int sub_tile_index = (s & sub_tile_width) << block_size_index;
+                    sub_tile_index += 2 * ((t & sub_tile_height) << block_size_index);
+                    texel_index_within_tile += sub_tile_index;
+                }
+
+                const int block_width = 8;
+                const int block_height = 8;
+
+                int coarse_s = (s / block_width) * block_width;
+                int coarse_t = (t / block_height) * block_height;
+
+                const int row_stride = registers.texture0.width * 3;
+                u8* source_ptr = (u8*)texture_data + coarse_s * block_height * 3 + coarse_t * row_stride + texel_index_within_tile * 3;
+                texture_color.r() = source_ptr[2];
+                texture_color.g() = source_ptr[1];
+                texture_color.b() = source_ptr[0];
+                texture_color.a() = 0xFF;
+
+                DebugUtils::DumpTexture(registers.texture0, (u8*)texture_data);
+            }
+
+            // Texture environment - consists of 6 stages of color and alpha combining.
+            //
+            // Color combiners take three input color values from some source (e.g. interpolated
+            // vertex color, texture color, previous stage, etc), perform some very simple
+            // operations on each of them (e.g. inversion) and then calculate the output color
+            // with some basic arithmetic. Alpha combiners can be configured separately but work
+            // analogously.
+            Math::Vec4<u8> combiner_output;
+            for (auto tev_stage : registers.GetTevStages()) {
+                using Source = Regs::TevStageConfig::Source;
+                using ColorModifier = Regs::TevStageConfig::ColorModifier;
+                using AlphaModifier = Regs::TevStageConfig::AlphaModifier;
+                using Operation = Regs::TevStageConfig::Operation;
+
+                auto GetColorSource = [&](Source source) -> Math::Vec3<u8> {
+                    switch (source) {
+                    case Source::PrimaryColor:
+                        return primary_color.rgb();
+
+                    case Source::Texture0:
+                        return texture_color.rgb();
+
+                    case Source::Constant:
+                        return {tev_stage.const_r, tev_stage.const_g, tev_stage.const_b};
+
+                    case Source::Previous:
+                        return combiner_output.rgb();
+
+                    default:
+                        ERROR_LOG(GPU, "Unknown color combiner source %d\n", (int)source);
+                        return {};
+                    }
+                };
+
+                auto GetAlphaSource = [&](Source source) -> u8 {
+                    switch (source) {
+                    case Source::PrimaryColor:
+                        return primary_color.a();
+
+                    case Source::Texture0:
+                        return texture_color.a();
+
+                    case Source::Constant:
+                        return tev_stage.const_a;
+
+                    case Source::Previous:
+                        return combiner_output.a();
+
+                    default:
+                        ERROR_LOG(GPU, "Unknown alpha combiner source %d\n", (int)source);
+                        return 0;
+                    }
+                };
+
+                auto GetColorModifier = [](ColorModifier factor, const Math::Vec3<u8>& values) -> Math::Vec3<u8> {
+                    switch (factor)
+                    {
+                    case ColorModifier::SourceColor:
+                        return values;
+                    default:
+                        ERROR_LOG(GPU, "Unknown color factor %d\n", (int)factor);
+                        return {};
+                    }
+                };
+
+                auto GetAlphaModifier = [](AlphaModifier factor, u8 value) -> u8 { // modifies one alpha combiner input
+                    switch (factor) {
+                    case AlphaModifier::SourceAlpha: // pass the source alpha through unchanged
+                        return value;
+                    default: // unsupported modifier: log it as an alpha (not color) problem and return 0
+                        ERROR_LOG(GPU, "Unknown alpha factor %d\n", (int)factor);
+                        return 0;
+                    }
+                };
+
+                auto ColorCombine = [](Operation op, const Math::Vec3<u8> input[3]) -> Math::Vec3<u8> {
+                    switch (op) {
+                    case Operation::Replace:
+                        return input[0];
+
+                    case Operation::Modulate:
+                        return ((input[0] * input[1]) / 255).Cast<u8>();
+
+                    default:
+                        ERROR_LOG(GPU, "Unknown color combiner operation %d\n", (int)op);
+                        return {};
+                    }
+                };
+
+                auto AlphaCombine = [](Operation op, const std::array<u8,3>& input) -> u8 {
+                    switch (op) {
+                    case Operation::Replace:
+                        return input[0];
+
+                    case Operation::Modulate:
+                        return input[0] * input[1] / 255;
+
+                    default:
+                        ERROR_LOG(GPU, "Unknown alpha combiner operation %d\n", (int)op);
+                        return 0;
+                    }
+                };
+
+                // color combiner
+                // NOTE: Not sure if the alpha combiner might use the color output of the previous
+                //       stage as input. Hence, we currently don't directly write the result to
+                //       combiner_output.rgb(), but instead store it in a temporary variable until
+                //       alpha combining has been done.
+                Math::Vec3<u8> color_result[3] = {
+                    GetColorModifier(tev_stage.color_modifier1, GetColorSource(tev_stage.color_source1)),
+                    GetColorModifier(tev_stage.color_modifier2, GetColorSource(tev_stage.color_source2)),
+                    GetColorModifier(tev_stage.color_modifier3, GetColorSource(tev_stage.color_source3))
+                };
+                auto color_output = ColorCombine(tev_stage.color_op, color_result);
+
+                // alpha combiner
+                std::array<u8,3> alpha_result = {
+                    GetAlphaModifier(tev_stage.alpha_modifier1, GetAlphaSource(tev_stage.alpha_source1)),
+                    GetAlphaModifier(tev_stage.alpha_modifier2, GetAlphaSource(tev_stage.alpha_source2)),
+                    GetAlphaModifier(tev_stage.alpha_modifier3, GetAlphaSource(tev_stage.alpha_source3))
+                };
+                auto alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result);
+
+                combiner_output = Math::MakeVec(color_output, alpha_output);
+            }
+
+            // TODO: Not sure if the multiplication by 65535 has already been taken care
+            // of when transforming to screen coordinates or not.
             u16 z = (u16)(((float)v0.screenpos[2].ToFloat32() * w0 +
                            (float)v1.screenpos[2].ToFloat32() * w1 +
-                           (float)v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum); // TODO: Shouldn't need to multiply by 65536?
+                           (float)v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
             SetDepth(x >> 4, y >> 4, z);
 
-            DrawPixel(x >> 4, y >> 4, primary_color);
+            DrawPixel(x >> 4, y >> 4, combiner_output);
         }
     }
 }
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 93830a96..db824431 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -4,6 +4,7 @@
 
 #include "pica.h"
 #include "vertex_shader.h"
+#include "debug_utils/debug_utils.h"
 #include <core/mem_map.h>
 #include <common/file_util.h>
 
@@ -50,6 +51,11 @@ struct VertexShaderState {
     };
     u32 call_stack[8]; // TODO: What is the maximal call stack depth?
     u32* call_stack_pointer;
+
+    struct {
+        u32 max_offset; // maximum program counter ever reached
+        u32 max_opdesc_id; // maximum swizzle pattern index ever used
+    } debug;
 };
 
 static void ProcessShaderCode(VertexShaderState& state) {
@@ -57,27 +63,34 @@ static void ProcessShaderCode(VertexShaderState& state) {
         bool increment_pc = true;
         bool exit_loop = false;
         const Instruction& instr = *(const Instruction*)state.program_counter;
+        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + (state.program_counter - shader_memory));
 
-        const float24* src1_ = (instr.common.src1 < 0x10) ? state.input_register_table[instr.common.src1]
-                             : (instr.common.src1 < 0x20) ? &state.temporary_registers[instr.common.src1-0x10].x
-                             : (instr.common.src1 < 0x80) ? &shader_uniforms.f[instr.common.src1-0x20].x
-                             : nullptr;
-        const float24* src2_ = (instr.common.src2 < 0x10) ? state.input_register_table[instr.common.src2]
-                             : &state.temporary_registers[instr.common.src2-0x10].x;
-        // TODO: Unsure about the limit values
-        float24* dest = (instr.common.dest <= 0x1C) ? state.output_register_table[instr.common.dest]
-                             : (instr.common.dest <= 0x3C) ? nullptr
-                             : (instr.common.dest <= 0x7C) ? &state.temporary_registers[(instr.common.dest-0x40)/4][instr.common.dest%4]
+        const float24* src1_ = (instr.common.src1 < 0x10) ? state.input_register_table[instr.common.src1.GetIndex()]
+                             : (instr.common.src1 < 0x20) ? &state.temporary_registers[instr.common.src1.GetIndex()].x
+                             : (instr.common.src1 < 0x80) ? &shader_uniforms.f[instr.common.src1.GetIndex()].x
                              : nullptr;
+        const float24* src2_ = (instr.common.src2 < 0x10) ? state.input_register_table[instr.common.src2.GetIndex()]
+                             : &state.temporary_registers[instr.common.src2.GetIndex()].x;
+        float24* dest = (instr.common.dest < 0x08) ? state.output_register_table[4*instr.common.dest.GetIndex()]
+                      : (instr.common.dest < 0x10) ? nullptr
+                      : (instr.common.dest < 0x20) ? &state.temporary_registers[instr.common.dest.GetIndex()][0]
+                      : nullptr;
 
         const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
+        const bool negate_src1 = swizzle.negate;
 
-        const float24 src1[4] = {
+        float24 src1[4] = {
             src1_[(int)swizzle.GetSelectorSrc1(0)],
             src1_[(int)swizzle.GetSelectorSrc1(1)],
             src1_[(int)swizzle.GetSelectorSrc1(2)],
             src1_[(int)swizzle.GetSelectorSrc1(3)],
         };
+        if (negate_src1) {
+            src1[0] = src1[0] * float24::FromFloat32(-1);
+            src1[1] = src1[1] * float24::FromFloat32(-1);
+            src1[2] = src1[2] * float24::FromFloat32(-1);
+            src1[3] = src1[3] * float24::FromFloat32(-1);
+        }
         const float24 src2[4] = {
             src2_[(int)swizzle.GetSelectorSrc2(0)],
             src2_[(int)swizzle.GetSelectorSrc2(1)],
@@ -88,6 +101,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
         switch (instr.opcode) {
             case Instruction::OpCode::ADD:
             {
+                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -100,6 +114,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
             case Instruction::OpCode::MUL:
             {
+                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -113,6 +128,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
             case Instruction::OpCode::DP3:
             case Instruction::OpCode::DP4:
             {
+                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 float24 dot = float24::FromFloat32(0.f);
                 int num_components = (instr.opcode == Instruction::OpCode::DP3) ? 3 : 4;
                 for (int i = 0; i < num_components; ++i)
@@ -130,6 +146,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
             // Reciprocal
             case Instruction::OpCode::RCP:
             {
+                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -145,6 +162,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
             // Reciprocal Square Root
             case Instruction::OpCode::RSQ:
             {
+                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -159,6 +177,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
             case Instruction::OpCode::MOV:
             {
+                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -172,8 +191,9 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 if (*state.call_stack_pointer == VertexShaderState::INVALID_ADDRESS) {
                     exit_loop = true;
                 } else {
-                    state.program_counter = &shader_memory[*state.call_stack_pointer--];
-                    *state.call_stack_pointer = VertexShaderState::INVALID_ADDRESS;
+                    // Jump back to call stack position, invalidate call stack entry, move up call stack pointer
+                    state.program_counter = &shader_memory[*state.call_stack_pointer];
+                    *state.call_stack_pointer-- = VertexShaderState::INVALID_ADDRESS;
                 }
 
                 break;
@@ -212,6 +232,8 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes)
 
     const u32* main = &shader_memory[registers.vs_main_offset];
     state.program_counter = (u32*)main;
+    state.debug.max_offset = 0;
+    state.debug.max_opdesc_id = 0;
 
     // Setup input register table
     const auto& attribute_register_map = registers.vs_input_register_map;
@@ -255,6 +277,9 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes)
     state.call_stack_pointer = &state.call_stack[0];
 
     ProcessShaderCode(state);
+    DebugUtils::DumpShader(shader_memory, state.debug.max_offset, swizzle_data,
+                           state.debug.max_opdesc_id, registers.vs_main_offset,
+                           registers.vs_output_attributes);
 
     DEBUG_LOG(GPU, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
         ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
index 1b71e367..847fdc45 100644
--- a/src/video_core/vertex_shader.h
+++ b/src/video_core/vertex_shader.h
@@ -27,7 +27,6 @@ struct OutputVertex {
     Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
     Math::Vec4<float24> color;
     Math::Vec2<float24> tc0;
-    float24 tc0_v;
 
     // Padding for optimal alignment
     float24 pad[14];
@@ -36,6 +35,7 @@ struct OutputVertex {
 
     // position after perspective divide
     Math::Vec3<float24> screenpos;
+    float24 pad2;
 
     // Linear interpolation
     // factor: 0=this, 1=vtx
@@ -59,6 +59,7 @@ struct OutputVertex {
     }
 };
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
+static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
 
 union Instruction {
     enum class OpCode : u32 {
@@ -117,9 +118,78 @@ union Instruction {
     // while "dest" addresses individual floats.
     union {
         BitField<0x00, 0x5, u32> operand_desc_id;
-        BitField<0x07, 0x5, u32> src2;
-        BitField<0x0c, 0x7, u32> src1;
-        BitField<0x13, 0x7, u32> dest;
+
+        template<class BitFieldType>
+        struct SourceRegister : BitFieldType { // splits the raw id from Value() into bank + index
+            enum RegisterType {
+                Input,
+                Temporary,
+                FloatUniform
+            };
+            // Raw ids below 0x10 are inputs, below 0x20 temporaries, anything else float uniforms.
+            RegisterType GetRegisterType() const {
+                if (BitFieldType::Value() < 0x10)
+                    return Input;
+                else if (BitFieldType::Value() < 0x20)
+                    return Temporary;
+                else
+                    return FloatUniform;
+            }
+            // Index within the register's own bank, i.e. the raw id minus the bank's first id.
+            int GetIndex() const {
+                if (GetRegisterType() == Input)
+                    return BitFieldType::Value();
+                else if (GetRegisterType() == Temporary)
+                    return BitFieldType::Value() - 0x10;
+                else // FloatUniform; a plain else guarantees that every path returns a value
+                    return BitFieldType::Value() - 0x20;
+            }
+            // Bank prefix ("i", "t" or "f") followed by the index within that bank.
+            std::string GetRegisterName() const {
+                std::map<RegisterType, std::string> type = {
+                    { Input, "i" },
+                    { Temporary, "t" },
+                    { FloatUniform, "f" },
+                };
+                return type[GetRegisterType()] + std::to_string(GetIndex());
+            }
+        };
+
+        SourceRegister<BitField<0x07, 0x5, u32>> src2;
+        SourceRegister<BitField<0x0c, 0x7, u32>> src1;
+
+        struct : BitField<0x15, 0x5, u32>
+        {
+            enum RegisterType {
+                Output,
+                Temporary,
+                Unknown
+            };
+            // Raw ids below 0x8 are outputs, 0x8..0xf are unidentified, the rest temporaries.
+            RegisterType GetRegisterType() const {
+                const auto raw_id = Value();
+                if (raw_id >= 0x10)
+                    return Temporary;
+                return (raw_id < 0x8) ? Output : Unknown;
+            }
+            // Temporaries are rebased so the first one is 0; other ids pass through as-is.
+            int GetIndex() const {
+                const bool rebase = (GetRegisterType() == Temporary);
+                if (rebase)
+                    return Value() - 0x10;
+                return Value();
+            }
+            // One-letter bank prefix ("o", "t" or "u") followed by GetIndex().
+            std::string GetRegisterName() const {
+                const std::map<RegisterType, std::string> prefixes = {
+                    { Output, "o" },
+                    { Temporary, "t" },
+                    { Unknown, "u" }
+                };
+                const std::string index_text = std::to_string(GetIndex());
+                return prefixes.at(GetRegisterType()) + index_text;
+            }
+        } dest;
     } common;
 
     // Format used for flow control instructions ("if")
@@ -128,6 +198,7 @@ union Instruction {
         BitField<0x0a, 0xc, u32> offset_words;
     } flow_control;
 };
+static_assert(std::is_standard_layout<Instruction>::value, "Structure is not using standard layout!");
 
 union SwizzlePattern {
     u32 hex;
@@ -185,6 +256,8 @@ union SwizzlePattern {
     // Components of "dest" that should be written to: LSB=dest.w, MSB=dest.x
     BitField< 0, 4, u32> dest_mask;
 
+    BitField< 4, 1, u32> negate; // negates src1
+
     BitField< 5, 2, Selector> src1_selector_3;
     BitField< 7, 2, Selector> src1_selector_2;
     BitField< 9, 2, Selector> src1_selector_1;
diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj
index 48d77cdc..4e129fbe 100644
--- a/src/video_core/video_core.vcxproj
+++ b/src/video_core/video_core.vcxproj
@@ -19,6 +19,7 @@
     </ProjectConfiguration>
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="debug_utils\debug_utils.cpp" />
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp" />
     <ClCompile Include="clipper.cpp" />
     <ClCompile Include="command_processor.cpp" />
@@ -40,6 +41,7 @@
     <ClInclude Include="utils.h" />
     <ClInclude Include="vertex_shader.h" />
     <ClInclude Include="video_core.h" />
+    <ClInclude Include="debug_utils\debug_utils.h" />
     <ClInclude Include="renderer_opengl\renderer_opengl.h" />
   </ItemGroup>
   <ItemGroup>
diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters
index 31af4f1d..90541aca 100644
--- a/src/video_core/video_core.vcxproj.filters
+++ b/src/video_core/video_core.vcxproj.filters
@@ -4,6 +4,9 @@
     <Filter Include="renderer_opengl">
       <UniqueIdentifier>{e0245557-dbd4-423e-9399-513d5e99f1e4}</UniqueIdentifier>
     </Filter>
+    <Filter Include="debug_utils">
+      <UniqueIdentifier>{0ac498e6-bbd8-46e3-9d5f-e816546ab90e}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp">
@@ -16,11 +19,11 @@
     <ClCompile Include="utils.cpp" />
     <ClCompile Include="vertex_shader.cpp" />
     <ClCompile Include="video_core.cpp" />
+    <ClCompile Include="debug_utils\debug_utils.cpp">
+      <Filter>debug_utils</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
-    <ClInclude Include="renderer_opengl\renderer_opengl.h">
-      <Filter>renderer_opengl</Filter>
-    </ClInclude>
     <ClInclude Include="clipper.h" />
     <ClInclude Include="command_processor.h" />
     <ClInclude Include="gpu_debugger.h" />
@@ -32,8 +35,12 @@
     <ClInclude Include="utils.h" />
     <ClInclude Include="vertex_shader.h" />
     <ClInclude Include="video_core.h" />
+    <ClInclude Include="renderer_opengl\renderer_opengl.h" />
+    <ClInclude Include="debug_utils\debug_utils.h">
+      <Filter>debug_utils</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <Text Include="CMakeLists.txt" />
   </ItemGroup>
-</Project>
+</Project>
+\ No newline at end of file