18 files changed, 438 insertions, 1027 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 0258a325..5c7f4ae1 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -29,11 +29,9 @@ set(HEADERS
             renderer_opengl/pica_to_gl.h
             renderer_opengl/renderer_opengl.h
             clipper.h
-            color.h
             command_processor.h
             gpu_debugger.h
             hwrasterizer_base.h
-            math.h
             pica.h
             primitive_assembly.h
             rasterizer.h
diff --git a/src/video_core/color.h b/src/video_core/color.h
deleted file mode 100644
index 4d2026eb..00000000
--- a/src/video_core/color.h
+++ /dev/null
@@ -1,215 +0,0 @@
-// Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-#include "common/swap.h"
-
-#include "video_core/math.h"
-
-namespace Color {
-
-/// Convert a 1-bit color component to 8 bit
-inline u8 Convert1To8(u8 value) {
-    return value * 255;
-}
-
-/// Convert a 4-bit color component to 8 bit
-inline u8 Convert4To8(u8 value) {
-    return (value << 4) | value;
-}
-
-/// Convert a 5-bit color component to 8 bit
-inline u8 Convert5To8(u8 value) {
-    return (value << 3) | (value >> 2);
-}
-
-/// Convert a 6-bit color component to 8 bit
-inline u8 Convert6To8(u8 value) {
-    return (value << 2) | (value >> 4);
-}
-
-/// Convert a 8-bit color component to 1 bit
-inline u8 Convert8To1(u8 value) {
-    return value >> 7;
-}
-
-/// Convert a 8-bit color component to 4 bit
-inline u8 Convert8To4(u8 value) {
-    return value >> 4;
-}
-
-/// Convert a 8-bit color component to 5 bit
-inline u8 Convert8To5(u8 value) {
-    return value >> 3;
-}
-
-/// Convert a 8-bit color component to 6 bit
-inline u8 Convert8To6(u8 value) {
-    return value >> 2;
-}
-
-/**
- * Decode a color stored in RGBA8 format
- * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
- */
-inline const Math::Vec4<u8> DecodeRGBA8(const u8* bytes) {
-    return { bytes[3], bytes[2], bytes[1], bytes[0] };
-}
-
-/**
- * Decode a color stored in RGB8 format
- * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
- */
-inline const Math::Vec4<u8> DecodeRGB8(const u8* bytes) {
-    return { bytes[2], bytes[1], bytes[0], 255 };
-}
-
-/**
- * Decode a color stored in RGB565 format
- * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
- */
-inline const Math::Vec4<u8> DecodeRGB565(const u8* bytes) {
-    const u16_le pixel = *reinterpret_cast<const u16_le*>(bytes);
-    return { Convert5To8((pixel >> 11) & 0x1F), Convert6To8((pixel >> 5) & 0x3F),
-        Convert5To8(pixel & 0x1F), 255 };
-}
-
-/**
- * Decode a color stored in RGB5A1 format
- * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
- */
-inline const Math::Vec4<u8> DecodeRGB5A1(const u8* bytes) {
-    const u16_le pixel = *reinterpret_cast<const u16_le*>(bytes);
-    return { Convert5To8((pixel >> 11) & 0x1F), Convert5To8((pixel >> 6) & 0x1F),
-        Convert5To8((pixel >> 1) & 0x1F), Convert1To8(pixel & 0x1) };
-}
-
-/**
- * Decode a color stored in RGBA4 format
- * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
- */
-inline const Math::Vec4<u8> DecodeRGBA4(const u8* bytes) {
-    const u16_le pixel = *reinterpret_cast<const u16_le*>(bytes);
-    return { Convert4To8((pixel >> 12) & 0xF), Convert4To8((pixel >> 8) & 0xF),
-        Convert4To8((pixel >> 4) & 0xF), Convert4To8(pixel & 0xF) };
-}
-
-/**
- * Decode a depth value stored in D16 format
- * @param bytes Pointer to encoded source value
- * @return Depth value as an u32
- */
-inline u32 DecodeD16(const u8* bytes) {
-    return *reinterpret_cast<const u16_le*>(bytes);
-}
-
-/**
- * Decode a depth value stored in D24 format
- * @param bytes Pointer to encoded source value
- * @return Depth value as an u32
- */
-inline u32 DecodeD24(const u8* bytes) {
-    return (bytes[2] << 16) | (bytes[1] << 8) | bytes[0];
-}
-
-/**
- * Decode a depth value and a stencil value stored in D24S8 format
- * @param bytes Pointer to encoded source values
- * @return Resulting values stored as a Math::Vec2
- */
-inline const Math::Vec2<u32> DecodeD24S8(const u8* bytes) {
-    return { static_cast<u32>((bytes[2] << 16) | (bytes[1] << 8) | bytes[0]), bytes[3] };
-}
-
-/**
- * Encode a color as RGBA8 format
- * @param color Source color to encode
- * @param bytes Destination pointer to store encoded color
- */
-inline void EncodeRGBA8(const Math::Vec4<u8>& color, u8* bytes) {
-    bytes[3] = color.r();
-    bytes[2] = color.g();
-    bytes[1] = color.b();
-    bytes[0] = color.a();
-}
-
-/**
- * Encode a color as RGB8 format
- * @param color Source color to encode
- * @param bytes Destination pointer to store encoded color
- */
-inline void EncodeRGB8(const Math::Vec4<u8>& color, u8* bytes) {
-    bytes[2] = color.r();
-    bytes[1] = color.g();
-    bytes[0] = color.b();
-}
-
-/**
- * Encode a color as RGB565 format
- * @param color Source color to encode
- * @param bytes Destination pointer to store encoded color
- */
-inline void EncodeRGB565(const Math::Vec4<u8>& color, u8* bytes) {
-    *reinterpret_cast<u16_le*>(bytes) = (Convert8To5(color.r()) << 11) |
-        (Convert8To6(color.g()) << 5) | Convert8To5(color.b());
-}
-
-/**
- * Encode a color as RGB5A1 format
- * @param color Source color to encode
- * @param bytes Destination pointer to store encoded color
- */
-inline void EncodeRGB5A1(const Math::Vec4<u8>& color, u8* bytes) {
-    *reinterpret_cast<u16_le*>(bytes) = (Convert8To5(color.r()) << 11) |
-        (Convert8To5(color.g()) << 6) | (Convert8To5(color.b()) << 1) | Convert8To1(color.a());
-}
-
-/**
- * Encode a color as RGBA4 format
- * @param color Source color to encode
- * @param bytes Destination pointer to store encoded color
- */
-inline void EncodeRGBA4(const Math::Vec4<u8>& color, u8* bytes) {
-    *reinterpret_cast<u16_le*>(bytes) = (Convert8To4(color.r()) << 12) |
-        (Convert8To4(color.g()) << 8) | (Convert8To4(color.b()) << 4) | Convert8To4(color.a());
-}
-
-/**
- * Encode a 16 bit depth value as D16 format
- * @param value 16 bit source depth value to encode
- * @param bytes Pointer where to store the encoded value
- */
-inline void EncodeD16(u32 value, u8* bytes) {
-    *reinterpret_cast<u16_le*>(bytes) = value & 0xFFFF;
-}
-
-/**
- * Encode a 24 bit depth value as D24 format
- * @param value 24 bit source depth value to encode
- * @param bytes Pointer where to store the encoded value
- */
-inline void EncodeD24(u32 value, u8* bytes) {
-    bytes[0] = value & 0xFF;
-    bytes[1] = (value >> 8) & 0xFF;
-    bytes[2] = (value >> 16) & 0xFF;
-}
-
-/**
- * Encode a 24 bit depth and 8 bit stencil values as D24S8 format
- * @param depth 24 bit source depth value to encode
- * @param stencil 8 bit source stencil value to encode
- * @param bytes Pointer where to store the encoded value
- */
-inline void EncodeD24S8(u32 depth, u8 stencil, u8* bytes) {
-    *reinterpret_cast<u32_le*>(bytes) = (stencil << 24) | depth;
-}
-
-} // namespace
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 100d8c7c..b46fadd9 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -56,7 +56,17 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         // Trigger IRQ
         case PICA_REG_INDEX(trigger_irq):
             GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::P3D);
-            return;
+            break;
+
+        case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[0], 0x23c):
+        case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[1], 0x23d):
+        {
+            unsigned index = id - PICA_REG_INDEX(command_buffer.trigger[0]);
+            u32* head_ptr = (u32*)Memory::GetPhysicalPointer(regs.command_buffer.GetPhysicalAddress(index));
+            g_state.cmd_list.head_ptr = g_state.cmd_list.current_ptr = head_ptr;
+            g_state.cmd_list.length = regs.command_buffer.GetSize(index) / sizeof(u32);
+            break;
+        }
 
         // It seems like these trigger vertex rendering
         case PICA_REG_INDEX(trigger_draw):
@@ -136,7 +146,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                                   input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
                                   input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
                     }
-                    
+
                     // Load per-vertex data from the loader arrays
                     for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
                         const u8* srcdata = Memory::GetPhysicalPointer(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]);
@@ -193,7 +203,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                                                    const Pica::VertexShader::OutputVertex& v2) {
                         VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2);
                     };
-                    
+
                     primitive_assembler.SubmitVertex(output, AddHWTriangle);
                 } else {
                     // Send to triangle clipper
@@ -282,7 +292,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             }
             break;
         }
-        
+
         // Load default vertex input attributes
         case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
         case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
@@ -306,7 +316,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 }
 
                 Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
-                
+
                 // NOTE: The destination component order indeed is "backwards"
                 attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
                 attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
@@ -363,38 +373,34 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         g_debug_context->OnEvent(DebugContext::Event::CommandProcessed, reinterpret_cast<void*>(&id));
 }
 
-static std::ptrdiff_t ExecuteCommandBlock(const u32* first_command_word) {
-    const CommandHeader& header = *(const CommandHeader*)(&first_command_word[1]);
-
-    u32* read_pointer = (u32*)first_command_word;
-
-    const u32 write_mask = ((header.parameter_mask & 0x1) ? (0xFFu <<  0) : 0u) |
-                           ((header.parameter_mask & 0x2) ? (0xFFu <<  8) : 0u) |
-                           ((header.parameter_mask & 0x4) ? (0xFFu << 16) : 0u) |
-                           ((header.parameter_mask & 0x8) ? (0xFFu << 24) : 0u);
-
-    WritePicaReg(header.cmd_id, *read_pointer, write_mask);
-    read_pointer += 2;
-
-    for (unsigned int i = 1; i < 1+header.extra_data_length; ++i) {
-        u32 cmd = header.cmd_id + ((header.group_commands) ? i : 0);
-        WritePicaReg(cmd, *read_pointer, write_mask);
-        ++read_pointer;
-    }
-
-    // align read pointer to 8 bytes
-    if ((first_command_word - read_pointer) % 2)
-        ++read_pointer;
-
-    return read_pointer - first_command_word;
-}
-
 void ProcessCommandList(const u32* list, u32 size) {
-    u32* read_pointer = (u32*)list;
-    u32 list_length = size / sizeof(u32);
-
-    while (read_pointer < list + list_length) {
-        read_pointer += ExecuteCommandBlock(read_pointer);
+    g_state.cmd_list.head_ptr = g_state.cmd_list.current_ptr = list;
+    g_state.cmd_list.length = size / sizeof(u32);
+
+    while (g_state.cmd_list.current_ptr < g_state.cmd_list.head_ptr + g_state.cmd_list.length) {
+        // Expand a 4-bit mask to 4-byte mask, e.g. 0b0101 -> 0x00FF00FF
+        static const u32 expand_bits_to_bytes[] = {
+            0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff,
+            0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff,
+            0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff,
+            0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff
+        };
+
+        // Each command is a parameter word followed by its header word
+        const u32 value = *g_state.cmd_list.current_ptr++;
+        const CommandHeader header = { *g_state.cmd_list.current_ptr++ };
+        const u32 write_mask = expand_bits_to_bytes[header.parameter_mask];
+
+        WritePicaReg(header.cmd_id, value, write_mask);
+
+        for (unsigned i = 0; i < header.extra_data_length; ++i) {
+            const u32 cmd = header.cmd_id + (header.group_commands ? i + 1 : 0);
+            WritePicaReg(cmd, *g_state.cmd_list.current_ptr++, write_mask);
+        }
+
+        // Align to 8 bytes before the loop's bounds check so trailing padding is never parsed
+        if ((g_state.cmd_list.head_ptr - g_state.cmd_list.current_ptr) % 2 != 0)
+            ++g_state.cmd_list.current_ptr;
     }
 }
 
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 7987b922..7b8ab72b 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -17,11 +17,11 @@
 #include <nihstro/shader_binary.h>
 
 #include "common/assert.h"
+#include "common/color.h"
 #include "common/file_util.h"
 #include "common/math_util.h"
+#include "common/vector_math.h"
 
-#include "video_core/color.h"
-#include "video_core/math.h"
 #include "video_core/pica.h"
 #include "video_core/utils.h"
 #include "video_core/video_core.h"
@@ -319,7 +319,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
         // TODO(neobrain): Fix code design to unify vertical block offsets!
         source += coarse_y * info.stride;
     }
-    
+
     // TODO: Assert that width/height are multiples of block dimensions
 
     switch (info.format) {
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index f361a538..7926d64e 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -12,7 +12,8 @@
 #include <mutex>
 #include <vector>
 
-#include "video_core/math.h"
+#include "common/vector_math.h"
+
 #include "video_core/pica.h"
 
 namespace Pica {
diff --git a/src/video_core/math.h b/src/video_core/math.h
deleted file mode 100644
index f9a82265..00000000
--- a/src/video_core/math.h
+++ /dev/null
@@ -1,640 +0,0 @@
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-
-// Copyright 2014 Tony Wasserka
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-//     * Redistributions of source code must retain the above copyright
-//       notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above copyright
-//       notice, this list of conditions and the following disclaimer in the
-//       documentation and/or other materials provided with the distribution.
-//     * Neither the name of the owner nor the names of its contributors may
-//       be used to endorse or promote products derived from this software
-//       without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#pragma once
-
-#include <cmath>
-
-namespace Math {
-
-template<typename T> class Vec2;
-template<typename T> class Vec3;
-template<typename T> class Vec4;
-
-template<typename T>
-static inline Vec2<T> MakeVec(const T& x, const T& y);
-template<typename T>
-static inline Vec3<T> MakeVec(const T& x, const T& y, const T& z);
-template<typename T>
-static inline Vec4<T> MakeVec(const T& x, const T& y, const T& z, const T& w);
-
-
-template<typename T>
-class Vec2 {
-public:
-    T x;
-    T y;
-
-    T* AsArray() { return &x; }
-
-    Vec2() = default;
-    Vec2(const T a[2]) : x(a[0]), y(a[1]) {}
-    Vec2(const T& _x, const T& _y) : x(_x), y(_y) {}
-
-    template<typename T2>
-    Vec2<T2> Cast() const {
-        return Vec2<T2>((T2)x, (T2)y);
-    }
-
-    static Vec2 AssignToAll(const T& f)
-    {
-        return Vec2<T>(f, f);
-    }
-
-    void Write(T a[2])
-    {
-        a[0] = x; a[1] = y;
-    }
-
-    Vec2<decltype(T{}+T{})> operator +(const Vec2& other) const
-    {
-        return MakeVec(x+other.x, y+other.y);
-    }
-    void operator += (const Vec2 &other)
-    {
-        x+=other.x; y+=other.y;
-    }
-    Vec2<decltype(T{}-T{})> operator -(const Vec2& other) const
-    {
-        return MakeVec(x-other.x, y-other.y);
-    }
-    void operator -= (const Vec2& other)
-    {
-        x-=other.x; y-=other.y;
-    }
-    Vec2<decltype(-T{})> operator -() const
-    {
-        return MakeVec(-x,-y);
-    }
-    Vec2<decltype(T{}*T{})> operator * (const Vec2& other) const
-    {
-        return MakeVec(x*other.x, y*other.y);
-    }
-    template<typename V>
-    Vec2<decltype(T{}*V{})> operator * (const V& f) const
-    {
-        return MakeVec(x*f,y*f);
-    }
-    template<typename V>
-    void operator *= (const V& f)
-    {
-        x*=f; y*=f;
-    }
-    template<typename V>
-    Vec2<decltype(T{}/V{})> operator / (const V& f) const
-    {
-        return MakeVec(x/f,y/f);
-    }
-    template<typename V>
-    void operator /= (const V& f)
-    {
-        *this = *this / f;
-    }
-
-    T Length2() const
-    {
-        return x*x + y*y;
-    }
-
-    // Only implemented for T=float
-    float Length() const;
-    void SetLength(const float l);
-    Vec2 WithLength(const float l) const;
-    float Distance2To(Vec2 &other);
-    Vec2 Normalized() const;
-    float Normalize(); // returns the previous length, which is often useful
-
-    T& operator [] (int i) //allow vector[1] = 3   (vector.y=3)
-    {
-        return *((&x) + i);
-    }
-    T operator [] (const int i) const
-    {
-        return *((&x) + i);
-    }
-
-    void SetZero()
-    {
-        x=0; y=0;
-    }
-
-    // Common aliases: UV (texel coordinates), ST (texture coordinates)
-    T& u() { return x; }
-    T& v() { return y; }
-    T& s() { return x; }
-    T& t() { return y; }
-
-    const T& u() const { return x; }
-    const T& v() const { return y; }
-    const T& s() const { return x; }
-    const T& t() const { return y; }
-
-    // swizzlers - create a subvector of specific components
-    const Vec2 yx() const { return Vec2(y, x); }
-    const Vec2 vu() const { return Vec2(y, x); }
-    const Vec2 ts() const { return Vec2(y, x); }
-};
-
-template<typename T, typename V>
-Vec2<T> operator * (const V& f, const Vec2<T>& vec)
-{
-    return Vec2<T>(f*vec.x,f*vec.y);
-}
-
-typedef Vec2<float> Vec2f;
-
-template<typename T>
-class Vec3
-{
-public:
-    T x;
-    T y;
-    T z;
-
-    T* AsArray() { return &x; }
-
-    Vec3() = default;
-    Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
-    Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
-
-    template<typename T2>
-    Vec3<T2> Cast() const {
-        return MakeVec<T2>((T2)x, (T2)y, (T2)z);
-    }
-
-    // Only implemented for T=int and T=float
-    static Vec3 FromRGB(unsigned int rgb);
-    unsigned int ToRGB() const; // alpha bits set to zero
-
-    static Vec3 AssignToAll(const T& f)
-    {
-        return MakeVec(f, f, f);
-    }
-
-    void Write(T a[3])
-    {
-        a[0] = x; a[1] = y; a[2] = z;
-    }
-
-    Vec3<decltype(T{}+T{})> operator +(const Vec3 &other) const
-    {
-        return MakeVec(x+other.x, y+other.y, z+other.z);
-    }
-    void operator += (const Vec3 &other)
-    {
-        x+=other.x; y+=other.y; z+=other.z;
-    }
-    Vec3<decltype(T{}-T{})> operator -(const Vec3 &other) const
-    {
-        return MakeVec(x-other.x, y-other.y, z-other.z);
-    }
-    void operator -= (const Vec3 &other)
-    {
-        x-=other.x; y-=other.y; z-=other.z;
-    }
-    Vec3<decltype(-T{})> operator -() const
-    {
-        return MakeVec(-x,-y,-z);
-    }
-    Vec3<decltype(T{}*T{})> operator * (const Vec3 &other) const
-    {
-        return MakeVec(x*other.x, y*other.y, z*other.z);
-    }
-    template<typename V>
-    Vec3<decltype(T{}*V{})> operator * (const V& f) const
-    {
-        return MakeVec(x*f,y*f,z*f);
-    }
-    template<typename V>
-    void operator *= (const V& f)
-    {
-        x*=f; y*=f; z*=f;
-    }
-    template<typename V>
-    Vec3<decltype(T{}/V{})> operator / (const V& f) const
-    {
-        return MakeVec(x/f,y/f,z/f);
-    }
-    template<typename V>
-    void operator /= (const V& f)
-    {
-        *this = *this / f;
-    }
-
-    T Length2() const
-    {
-        return x*x + y*y + z*z;
-    }
-
-    // Only implemented for T=float
-    float Length() const;
-    void SetLength(const float l);
-    Vec3 WithLength(const float l) const;
-    float Distance2To(Vec3 &other);
-    Vec3 Normalized() const;
-    float Normalize(); // returns the previous length, which is often useful
-
-    T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
-    {
-        return *((&x) + i);
-    }
-    T operator [] (const int i) const
-    {
-        return *((&x) + i);
-    }
-
-    void SetZero()
-    {
-        x=0; y=0; z=0;
-    }
-
-    // Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
-    T& u() { return x; }
-    T& v() { return y; }
-    T& w() { return z; }
-
-    T& r() { return x; }
-    T& g() { return y; }
-    T& b() { return z; }
-
-    T& s() { return x; }
-    T& t() { return y; }
-    T& q() { return z; }
-
-    const T& u() const { return x; }
-    const T& v() const { return y; }
-    const T& w() const { return z; }
-
-    const T& r() const { return x; }
-    const T& g() const { return y; }
-    const T& b() const { return z; }
-
-    const T& s() const { return x; }
-    const T& t() const { return y; }
-    const T& q() const { return z; }
-
-    // swizzlers - create a subvector of specific components
-    // e.g. Vec2 uv() { return Vec2(x,y); }
-    // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
-#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
-#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
-    _DEFINE_SWIZZLER2(a, b, a##b); \
-    _DEFINE_SWIZZLER2(a, b, a2##b2); \
-    _DEFINE_SWIZZLER2(a, b, a3##b3); \
-    _DEFINE_SWIZZLER2(a, b, a4##b4); \
-    _DEFINE_SWIZZLER2(b, a, b##a); \
-    _DEFINE_SWIZZLER2(b, a, b2##a2); \
-    _DEFINE_SWIZZLER2(b, a, b3##a3); \
-    _DEFINE_SWIZZLER2(b, a, b4##a4)
-
-    DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
-    DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
-    DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
-#undef DEFINE_SWIZZLER2
-#undef _DEFINE_SWIZZLER2
-};
-
-template<typename T, typename V>
-Vec3<T> operator * (const V& f, const Vec3<T>& vec)
-{
-    return Vec3<T>(f*vec.x,f*vec.y,f*vec.z);
-}
-
-template<>
-inline float Vec3<float>::Length() const {
-    return std::sqrt(x * x + y * y + z * z);
-}
-
-template<>
-inline Vec3<float> Vec3<float>::Normalized() const {
-    return *this / Length();
-}
-
-
-typedef Vec3<float> Vec3f;
-
-template<typename T>
-class Vec4
-{
-public:
-    T x;
-    T y;
-    T z;
-    T w;
-
-    T* AsArray() { return &x; }
-
-    Vec4() = default;
-    Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
-    Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}
-
-    template<typename T2>
-    Vec4<T2> Cast() const {
-        return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
-    }
-
-    // Only implemented for T=int and T=float
-    static Vec4 FromRGBA(unsigned int rgba);
-    unsigned int ToRGBA() const;
-
-    static Vec4 AssignToAll(const T& f) {
-        return Vec4<T>(f, f, f, f);
-    }
-
-    void Write(T a[4])
-    {
-        a[0] = x; a[1] = y; a[2] = z; a[3] = w;
-    }
-
-    Vec4<decltype(T{}+T{})> operator +(const Vec4& other) const
-    {
-        return MakeVec(x+other.x, y+other.y, z+other.z, w+other.w);
-    }
-    void operator += (const Vec4& other)
-    {
-        x+=other.x; y+=other.y; z+=other.z; w+=other.w;
-    }
-    Vec4<decltype(T{}-T{})> operator -(const Vec4 &other) const
-    {
-        return MakeVec(x-other.x, y-other.y, z-other.z, w-other.w);
-    }
-    void operator -= (const Vec4 &other)
-    {
-        x-=other.x; y-=other.y; z-=other.z; w-=other.w;
-    }
-    Vec4<decltype(-T{})> operator -() const
-    {
-        return MakeVec(-x,-y,-z,-w);
-    }
-    Vec4<decltype(T{}*T{})> operator * (const Vec4 &other) const
-    {
-        return MakeVec(x*other.x, y*other.y, z*other.z, w*other.w);
-    }
-    template<typename V>
-    Vec4<decltype(T{}*V{})> operator * (const V& f) const
-    {
-        return MakeVec(x*f,y*f,z*f,w*f);
-    }
-    template<typename V>
-    void operator *= (const V& f)
-    {
-        x*=f; y*=f; z*=f; w*=f;
-    }
-    template<typename V>
-    Vec4<decltype(T{}/V{})> operator / (const V& f) const
-    {
-        return MakeVec(x/f,y/f,z/f,w/f);
-    }
-    template<typename V>
-    void operator /= (const V& f)
-    {
-        *this = *this / f;
-    }
-
-    T Length2() const
-    {
-        return x*x + y*y + z*z + w*w;
-    }
-
-    // Only implemented for T=float
-    float Length() const;
-    void SetLength(const float l);
-    Vec4 WithLength(const float l) const;
-    float Distance2To(Vec4 &other);
-    Vec4 Normalized() const;
-    float Normalize(); // returns the previous length, which is often useful
-
-    T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
-    {
-        return *((&x) + i);
-    }
-    T operator [] (const int i) const
-    {
-        return *((&x) + i);
-    }
-
-    void SetZero()
-    {
-        x=0; y=0; z=0;
-    }
-
-    // Common alias: RGBA (colors)
-    T& r() { return x; }
-    T& g() { return y; }
-    T& b() { return z; }
-    T& a() { return w; }
-
-    const T& r() const { return x; }
-    const T& g() const { return y; }
-    const T& b() const { return z; }
-    const T& a() const { return w; }
-
-    // Swizzlers - Create a subvector of specific components
-    // e.g. Vec2 uv() { return Vec2(x,y); }
-
-    // _DEFINE_SWIZZLER2 defines a single such function
-    // DEFINE_SWIZZLER2_COMP1 defines one-component functions for all component names (x<->r) 
-    // DEFINE_SWIZZLER2_COMP2 defines two component functions for all component names (x<->r) and permutations (xy<->yx)
-#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
-#define DEFINE_SWIZZLER2_COMP1(a, a2) \
-    _DEFINE_SWIZZLER2(a, a, a##a); \
-    _DEFINE_SWIZZLER2(a, a, a2##a2)
-#define DEFINE_SWIZZLER2_COMP2(a, b, a2, b2) \
-    _DEFINE_SWIZZLER2(a, b, a##b); \
-    _DEFINE_SWIZZLER2(a, b, a2##b2); \
-    _DEFINE_SWIZZLER2(b, a, b##a); \
-    _DEFINE_SWIZZLER2(b, a, b2##a2)
-
-    DEFINE_SWIZZLER2_COMP2(x, y, r, g);
-    DEFINE_SWIZZLER2_COMP2(x, z, r, b);
-    DEFINE_SWIZZLER2_COMP2(x, w, r, a);
-    DEFINE_SWIZZLER2_COMP2(y, z, g, b);
-    DEFINE_SWIZZLER2_COMP2(y, w, g, a);
-    DEFINE_SWIZZLER2_COMP2(z, w, b, a);
-    DEFINE_SWIZZLER2_COMP1(x, r);
-    DEFINE_SWIZZLER2_COMP1(y, g);
-    DEFINE_SWIZZLER2_COMP1(z, b);
-    DEFINE_SWIZZLER2_COMP1(w, a);
-#undef DEFINE_SWIZZLER2_COMP1
-#undef DEFINE_SWIZZLER2_COMP2
-#undef _DEFINE_SWIZZLER2
-
-#define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3<T> name() const { return Vec3<T>(a, b, c); }
-#define DEFINE_SWIZZLER3_COMP1(a, a2) \
-    _DEFINE_SWIZZLER3(a, a, a, a##a##a); \
-    _DEFINE_SWIZZLER3(a, a, a, a2##a2##a2)
-#define DEFINE_SWIZZLER3_COMP3(a, b, c, a2, b2, c2) \
-    _DEFINE_SWIZZLER3(a, b, c, a##b##c); \
-    _DEFINE_SWIZZLER3(a, c, b, a##c##b); \
-    _DEFINE_SWIZZLER3(b, a, c, b##a##c); \
-    _DEFINE_SWIZZLER3(b, c, a, b##c##a); \
-    _DEFINE_SWIZZLER3(c, a, b, c##a##b); \
-    _DEFINE_SWIZZLER3(c, b, a, c##b##a); \
-    _DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \
-    _DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \
-    _DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \
-    _DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \
-    _DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
-    _DEFINE_SWIZZLER3(c, b, a, c2##b2##a2)
-
-    DEFINE_SWIZZLER3_COMP3(x, y, z, r, g, b);
-    DEFINE_SWIZZLER3_COMP3(x, y, w, r, g, a);
-    DEFINE_SWIZZLER3_COMP3(x, z, w, r, b, a);
-    DEFINE_SWIZZLER3_COMP3(y, z, w, g, b, a);
-    DEFINE_SWIZZLER3_COMP1(x, r);
-    DEFINE_SWIZZLER3_COMP1(y, g);
-    DEFINE_SWIZZLER3_COMP1(z, b);
-    DEFINE_SWIZZLER3_COMP1(w, a);
-#undef DEFINE_SWIZZLER3_COMP1
-#undef DEFINE_SWIZZLER3_COMP3
-#undef _DEFINE_SWIZZLER3
-};
-
-
-template<typename T, typename V>
-Vec4<decltype(V{}*T{})> operator * (const V& f, const Vec4<T>& vec)
-{
-    return MakeVec(f*vec.x,f*vec.y,f*vec.z,f*vec.w);
-}
-
-typedef Vec4<float> Vec4f;
-
-
-template<typename T>
-static inline decltype(T{}*T{}+T{}*T{}) Dot(const Vec2<T>& a, const Vec2<T>& b)
-{
-    return a.x*b.x + a.y*b.y;
-}
-
-template<typename T>
-static inline decltype(T{}*T{}+T{}*T{}) Dot(const Vec3<T>& a, const Vec3<T>& b)
-{
-    return a.x*b.x + a.y*b.y + a.z*b.z;
-}
-
-template<typename T>
-static inline decltype(T{}*T{}+T{}*T{}) Dot(const Vec4<T>& a, const Vec4<T>& b)
-{
-    return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
-}
-
-template<typename T>
-static inline Vec3<decltype(T{}*T{}-T{}*T{})> Cross(const Vec3<T>& a, const Vec3<T>& b)
-{
-    return MakeVec(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
-}
-
-// linear interpolation via float: 0.0=begin, 1.0=end
-template<typename X>
-static inline decltype(X{}*float{}+X{}*float{}) Lerp(const X& begin, const X& end, const float t)
-{
-    return begin*(1.f-t) + end*t;
-}
-
-// linear interpolation via int: 0=begin, base=end
-template<typename X, int base>
-static inline decltype((X{}*int{}+X{}*int{}) / base) LerpInt(const X& begin, const X& end, const int t)
-{
-    return (begin*(base-t) + end*t) / base;
-}
-
-// Utility vector factories
-template<typename T>
-static inline Vec2<T> MakeVec(const T& x, const T& y)
-{
-    return Vec2<T>{x, y};
-}
-
-template<typename T>
-static inline Vec3<T> MakeVec(const T& x, const T& y, const T& z)
-{
-    return Vec3<T>{x, y, z};
-}
-
-template<typename T>
-static inline Vec4<T> MakeVec(const T& x, const T& y, const Vec2<T>& zw)
-{
-    return MakeVec(x, y, zw[0], zw[1]);
-}
-
-template<typename T>
-static inline Vec3<T> MakeVec(const Vec2<T>& xy, const T& z)
-{
-    return MakeVec(xy[0], xy[1], z);
-}
-
-template<typename T>
-static inline Vec3<T> MakeVec(const T& x, const Vec2<T>& yz)
-{
-    return MakeVec(x, yz[0], yz[1]);
-}
-
-template<typename T>
-static inline Vec4<T> MakeVec(const T& x, const T& y, const T& z, const T& w)
-{
-    return Vec4<T>{x, y, z, w};
-}
-
-template<typename T>
-static inline Vec4<T> MakeVec(const Vec2<T>& xy, const T& z, const T& w)
-{
-    return MakeVec(xy[0], xy[1], z, w);
-}
-
-template<typename T>
-static inline Vec4<T> MakeVec(const T& x, const Vec2<T>& yz, const T& w)
-{
-    return MakeVec(x, yz[0], yz[1], w);
-}
-
-// NOTE: This has priority over "Vec2<Vec2<T>> MakeVec(const Vec2<T>& x, const Vec2<T>& y)".
-//       Even if someone wanted to use an odd object like Vec2<Vec2<T>>, the compiler would error
-//       out soon enough due to misuse of the returned structure.
-template<typename T>
-static inline Vec4<T> MakeVec(const Vec2<T>& xy, const Vec2<T>& zw)
-{
-    return MakeVec(xy[0], xy[1], zw[0], zw[1]);
-}
-
-template<typename T>
-static inline Vec4<T> MakeVec(const Vec3<T>& xyz, const T& w)
-{
-    return MakeVec(xyz[0], xyz[1], xyz[2], w);
-}
-
-template<typename T>
-static inline Vec4<T> MakeVec(const T& x, const Vec3<T>& yzw)
-{
-    return MakeVec(x, yzw[0], yzw[1], yzw[2]);
-}
-
-
-} // namespace
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index b67dce1a..9628a758 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -15,8 +15,7 @@
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
-
-#include "math.h" 
+#include "common/vector_math.h"
 
 namespace Pica {
 
@@ -162,6 +161,25 @@ struct Regs {
         ETC1A4       = 13,  // compressed
     };
 
+    enum class LogicOp : u32 {
+        Clear        =  0,
+        And          =  1,
+        AndReverse   =  2,
+        Copy         =  3,
+        Set          =  4,
+        CopyInverted =  5,
+        NoOp         =  6,
+        Invert       =  7,
+        Nand         =  8,
+        Or           =  9,
+        Nor          = 10,
+        Xor          = 11,
+        Equiv        = 12,
+        AndInverted  = 13,
+        OrReverse    = 14,
+        OrInverted   = 15,
+    };
+
     static unsigned NibblesPerPixel(TextureFormat format) {
         switch (format) {
         case TextureFormat::RGBA8:
@@ -221,6 +239,7 @@ struct Regs {
         enum class Source : u32 {
             PrimaryColor           = 0x0,
             PrimaryFragmentColor   = 0x1,
+            SecondaryFragmentColor = 0x2,
 
             Texture0               = 0x3,
             Texture1               = 0x4,
@@ -337,7 +356,7 @@ struct Regs {
             return (stage_index < 4) && (update_mask_a & (1 << stage_index));
         }
     } tev_combiner_buffer_input;
-    
+
     INSERT_PADDING_WORDS(0xf);
     TevStageConfig tev_stage4;
     INSERT_PADDING_WORDS(0x3);
@@ -353,9 +372,9 @@ struct Regs {
     INSERT_PADDING_WORDS(0x2);
 
     const std::array<Regs::TevStageConfig,6> GetTevStages() const {
-        return { tev_stage0, tev_stage1,
-                 tev_stage2, tev_stage3,
-                 tev_stage4, tev_stage5 };
+        return {{ tev_stage0, tev_stage1,
+                  tev_stage2, tev_stage3,
+                  tev_stage4, tev_stage5 }};
     };
 
     enum class BlendEquation : u32 {
@@ -413,12 +432,8 @@ struct Regs {
         } alpha_blending;
 
         union {
-            enum Op {
-                Set = 4,
-            };
-
-            BitField<0, 4, Op> op;
-        } logic_op;
+            BitField<0, 4, LogicOp> logic_op;
+        };
 
         union {
             BitField< 0, 8, u32> r;
@@ -703,12 +718,38 @@ struct Regs {
     struct {
         // Index of the current default attribute
         u32 index;
-        
+
         // Writing to these registers sets the "current" default attribute.
         u32 set_value[3];
     } vs_default_attributes_setup;
-    
-    INSERT_PADDING_WORDS(0x28);
+
+    INSERT_PADDING_WORDS(0x2);
+
+    struct {
+        // There are two channels that can be used to configure the next command buffer, which
+        // can be then executed by writing to the "trigger" registers. There are two reasons why a
+        // game might use this feature:
+        //  1) With this, an arbitrary number of additional command buffers may be executed in
+        //     sequence without requiring any intervention of the CPU after the initial one is
+        //     kicked off.
+        //  2) Games can configure these registers to provide a command list subroutine mechanism.
+
+        BitField< 0, 20, u32> size[2]; ///< Size (in bytes / 8) of each channel's command buffer
+        BitField< 0, 28, u32> addr[2]; ///< Physical address / 8 of each channel's command buffer
+        u32 trigger[2]; ///< Triggers execution of the channel's command buffer when written to
+
+        unsigned GetSize(unsigned index) const {
+            ASSERT(index < 2);
+            return 8 * size[index];
+        }
+
+        PAddr GetPhysicalAddress(unsigned index) const {
+            ASSERT(index < 2);
+            return static_cast<PAddr>(8 * addr[index]);
+        }
+    } command_buffer;
+
+    INSERT_PADDING_WORDS(0x20);
 
     enum class TriangleTopology : u32 {
         List        = 0,
@@ -861,6 +902,7 @@ struct Regs {
         ADD_FIELD(trigger_draw);
         ADD_FIELD(trigger_draw_indexed);
         ADD_FIELD(vs_default_attributes_setup);
+        ADD_FIELD(command_buffer);
         ADD_FIELD(triangle_topology);
         ADD_FIELD(vs_bool_uniforms);
         ADD_FIELD(vs_int_uniforms);
@@ -938,6 +980,7 @@ ASSERT_REG_POSITION(num_vertices, 0x228);
 ASSERT_REG_POSITION(trigger_draw, 0x22e);
 ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
 ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232);
+ASSERT_REG_POSITION(command_buffer, 0x238);
 ASSERT_REG_POSITION(triangle_topology, 0x25e);
 ASSERT_REG_POSITION(vs_bool_uniforms, 0x2b0);
 ASSERT_REG_POSITION(vs_int_uniforms, 0x2b1);
@@ -1053,21 +1096,12 @@ private:
     float value;
 };
 
-union CommandHeader {
-    CommandHeader(u32 h) : hex(h) {}
-
-    u32 hex;
-
-    BitField< 0, 16, u32> cmd_id;
-    BitField<16,  4, u32> parameter_mask;
-    BitField<20, 11, u32> extra_data_length;
-    BitField<31,  1, u32> group_commands;
-};
-
 /// Struct used to describe current Pica state
 struct State {
+    /// Pica registers
     Regs regs;
 
+    /// Vertex shader memory
     struct {
         struct {
             Math::Vec4<float24> f[96];
@@ -1080,6 +1114,13 @@ struct State {
         std::array<u32, 1024> program_code;
         std::array<u32, 1024> swizzle_data;
     } vs;
+
+    /// Current Pica command list
+    struct {
+        const u32* head_ptr;
+        const u32* current_ptr;
+        u32 length;
+    } cmd_list;
 };
 
 /// Initialize Pica state
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 767ff420..59d156ee 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -4,6 +4,7 @@
 
 #include <algorithm>
 
+#include "common/color.h"
 #include "common/common_types.h"
 #include "common/math_util.h"
 #include "common/profiler.h"
@@ -13,7 +14,6 @@
 
 #include "debug_utils/debug_utils.h"
-#include "math.h"
-#include "color.h"
+#include "common/vector_math.h"
 #include "pica.h"
 #include "rasterizer.h"
 #include "vertex_shader.h"
@@ -104,7 +104,7 @@ static u32 GetDepth(int x, int y) {
     u8* depth_buffer = Memory::GetPhysicalPointer(addr);
 
     y = framebuffer.height - y;
-    
+
     const u32 coarse_y = y & ~7;
     u32 bytes_per_pixel = Regs::BytesPerDepthPixel(framebuffer.depth_format);
     u32 stride = framebuffer.width * bytes_per_pixel;
@@ -402,11 +402,16 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 
                 auto GetSource = [&](Source source) -> Math::Vec4<u8> {
                     switch (source) {
-                    // TODO: What's the difference between these two?
                     case Source::PrimaryColor:
+
+                    // HACK: Until we implement fragment lighting, use primary_color
                     case Source::PrimaryFragmentColor:
                         return primary_color;
 
+                    // HACK: Until we implement fragment lighting, use zero
+                    case Source::SecondaryFragmentColor:
+                        return {0, 0, 0, 0};
+
                     case Source::Texture0:
                         return texture_color[0];
 
@@ -570,6 +575,13 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     case Operation::Add:
                         return std::min(255, input[0] + input[1]);
 
+                    case Operation::AddSigned:
+                    {
+                        // TODO(bunnei): Verify that the color conversion from (float) 0.5f to (byte) 128 is correct
+                        auto result = static_cast<int>(input[0]) + static_cast<int>(input[1]) - 128;
+                        return static_cast<u8>(MathUtil::Clamp<int>(result, 0, 255));
+                    }
+
                     case Operation::Lerp:
                         return (input[0] * input[2] + input[1] * (255 - input[2])) / 255;
 
@@ -808,10 +820,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                using BlendEquation = Regs::BlendEquation;
                 static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor,
                                                        const Math::Vec4<u8>& dest, const Math::Vec4<u8>& destfactor,
-                                                       BlendEquation equation) {
+                                                       Regs::BlendEquation equation) {
                     Math::Vec4<int> result;
 
                     auto src_result = (src  *  srcfactor).Cast<int>();
@@ -866,8 +877,69 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                 blend_output     = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb);
                 blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a();
             } else {
-                LOG_CRITICAL(HW_GPU, "logic op: %x", output_merger.logic_op);
-                UNIMPLEMENTED();
+                // Combines one 8-bit source channel with the matching destination channel using
+                // the bitwise operation selected by `op`; the result is truncated back to 8 bits.
+                static auto LogicOp = [](u8 src, u8 dest, Regs::LogicOp op) -> u8 {
+                    switch (op) {
+                    case Regs::LogicOp::Clear:
+                        return 0;
+
+                    case Regs::LogicOp::And:
+                        return src & dest;
+
+                    case Regs::LogicOp::AndReverse:
+                        return src & ~dest;
+
+                    case Regs::LogicOp::Copy:
+                        return src;
+
+                    case Regs::LogicOp::Set:
+                        return 255;
+
+                    case Regs::LogicOp::CopyInverted:
+                        return ~src;
+
+                    case Regs::LogicOp::NoOp:
+                        return dest;
+
+                    case Regs::LogicOp::Invert:
+                        return ~dest;
+
+                    case Regs::LogicOp::Nand:
+                        return ~(src & dest);
+
+                    case Regs::LogicOp::Or:
+                        return src | dest;
+
+                    case Regs::LogicOp::Nor:
+                        return ~(src | dest);
+
+                    case Regs::LogicOp::Xor:
+                        return src ^ dest;
+
+                    case Regs::LogicOp::Equiv:
+                        return ~(src ^ dest);
+
+                    case Regs::LogicOp::AndInverted:
+                        return ~src & dest;
+
+                    case Regs::LogicOp::OrReverse:
+                        return src | ~dest;
+
+                    case Regs::LogicOp::OrInverted:
+                        return ~src | dest;
+                    }
+
+                    // All sixteen enumerators are handled above. This fallback only exists so
+                    // that no control path flows off the end of a value-returning lambda.
+                    return 0;
+                };
+
+                blend_output = Math::MakeVec(
+                    LogicOp(combiner_output.r(), dest.r(), output_merger.logic_op),
+                    LogicOp(combiner_output.g(), dest.g(), output_merger.logic_op),
+                    LogicOp(combiner_output.b(), dest.b(), output_merger.logic_op),
+                    LogicOp(combiner_output.a(), dest.a(), output_merger.logic_op));
             }
 
             const Math::Vec4<u8> result = {
diff --git a/src/video_core/renderer_opengl/generated/gl_3_2_core.c b/src/video_core/renderer_opengl/generated/gl_3_2_core.c
index ef29972d..95fd29c0 100644
--- a/src/video_core/renderer_opengl/generated/gl_3_2_core.c
+++ b/src/video_core/renderer_opengl/generated/gl_3_2_core.c
@@ -62,9 +62,9 @@ static int TestPointer(const PROC pTest)
 	ptrdiff_t iTest;
 	if(!pTest) return 0;
 	iTest = (ptrdiff_t)pTest;
-	
+
 	if(iTest == 1 || iTest == 2 || iTest == 3 || iTest == -1) return 0;
-	
+
 	return 1;
 }
 
@@ -79,7 +79,7 @@ static PROC WinGetProcAddress(const char *name)
 	glMod = GetModuleHandleA("OpenGL32.dll");
 	return (PROC)GetProcAddress(glMod, (LPCSTR)name);
 }
-	
+
 #define IntGetProcAddress(name) WinGetProcAddress(name)
 #else
 	#if defined(__APPLE__)
@@ -1083,7 +1083,7 @@ static ogl_StrToExtMap *FindExtEntry(const char *extensionName)
   	if(strcmp(extensionName, currLoc->extensionName) == 0)
   		return currLoc;
   }
-  
+
   return NULL;
 }
 
@@ -1135,15 +1135,15 @@ int ogl_LoadFunctions()
 {
   int numFailed = 0;
   ClearExtensionVars();
-  
+
   _ptrc_glGetIntegerv = (void (CODEGEN_FUNCPTR *)(GLenum, GLint *))IntGetProcAddress("glGetIntegerv");
   if(!_ptrc_glGetIntegerv) return ogl_LOAD_FAILED;
   _ptrc_glGetStringi = (const GLubyte * (CODEGEN_FUNCPTR *)(GLenum, GLuint))IntGetProcAddress("glGetStringi");
   if(!_ptrc_glGetStringi) return ogl_LOAD_FAILED;
-  
+
   ProcExtsFromExtList();
   numFailed = Load_Version_3_2();
-  
+
   if(numFailed == 0)
   	return ogl_LOAD_SUCCEEDED;
   else
@@ -1177,7 +1177,7 @@ int ogl_IsVersionGEQ(int majorVersion, int minorVersion)
 {
 	if(g_major_version == 0)
 		GetGLVersion();
-		
+
 	if(majorVersion > g_major_version) return 1;
 	if(majorVersion < g_major_version) return 0;
 	if(minorVersion >= g_minor_version) return 1;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 4b7d099a..518f7933 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -2,10 +2,11 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "common/color.h"
+
 #include "core/settings.h"
 #include "core/hw/gpu.h"
 
-#include "video_core/color.h"
 #include "video_core/pica.h"
 #include "video_core/utils.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
@@ -93,14 +94,27 @@ void RasterizerOpenGL::InitObjects() {
     // Create textures for OGL framebuffer that will be rendered to, initially 1x1 to succeed in framebuffer creation
     fb_color_texture.texture.Create();
     ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 1, 1);
+
+    state.texture_units[0].enabled_2d = true;
+    state.texture_units[0].texture_2d = fb_color_texture.texture.handle;
+    state.Apply();
+
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 
+    state.texture_units[0].texture_2d = 0;
+    state.Apply();
+
     fb_depth_texture.texture.Create();
     ReconfigureDepthTexture(fb_depth_texture, Pica::Regs::DepthFormat::D16, 1, 1);
+
+    state.texture_units[0].enabled_2d = true;
+    state.texture_units[0].texture_2d = fb_depth_texture.texture.handle;
+    state.Apply();
+
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, 0);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
@@ -109,14 +123,13 @@ void RasterizerOpenGL::InitObjects() {
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_FUNC, GL_LEQUAL);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_MODE, GL_NONE);
 
+    state.texture_units[0].texture_2d = 0;
+    state.Apply();
+
     // Configure OpenGL framebuffer
     framebuffer.Create();
 
     state.draw.framebuffer = framebuffer.handle;
-
-    // Unbind texture to allow binding to framebuffer
-    state.texture_units[0].enabled_2d = true;
-    state.texture_units[0].texture_2d = 0;
     state.Apply();
 
     glActiveTexture(GL_TEXTURE0);
@@ -135,6 +148,7 @@ void RasterizerOpenGL::Reset() {
     SyncBlendFuncs();
     SyncBlendColor();
     SyncAlphaTest();
+    SyncLogicOp();
     SyncStencilTest();
     SyncDepthTest();
 
@@ -203,7 +217,19 @@ void RasterizerOpenGL::DrawTriangles() {
 
     vertex_batch.clear();
 
-    // TODO: Flush the resource cache at the current depth and color framebuffer addresses for render-to-texture
+    // Flush the resource cache at the current depth and color framebuffer addresses for render-to-texture
+    const auto& regs = Pica::g_state.regs;
+
+    PAddr cur_fb_color_addr = regs.framebuffer.GetColorBufferPhysicalAddress();
+    u32 cur_fb_color_size = Pica::Regs::BytesPerColorPixel(regs.framebuffer.color_format)
+                            * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
+
+    PAddr cur_fb_depth_addr = regs.framebuffer.GetDepthBufferPhysicalAddress();
+    u32 cur_fb_depth_size = Pica::Regs::BytesPerDepthPixel(regs.framebuffer.depth_format)
+                            * regs.framebuffer.GetWidth() * regs.framebuffer.GetHeight();
+
+    res_cache.NotifyFlush(cur_fb_color_addr, cur_fb_color_size);
+    res_cache.NotifyFlush(cur_fb_depth_addr, cur_fb_depth_size);
 }
 
 void RasterizerOpenGL::CommitFramebuffer() {
@@ -249,6 +275,11 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         SyncDepthTest();
         break;
 
+    // Logic op
+    case PICA_REG_INDEX(output_merger.logic_op):
+        SyncLogicOp();
+        break;
+
     // TEV stage 0
     case PICA_REG_INDEX(tev_stage0.color_source1):
         SyncTevSources(0, regs.tev_stage0);
@@ -350,7 +381,7 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
     case PICA_REG_INDEX(tev_stage5.color_scale):
         SyncTevMultipliers(5, regs.tev_stage5);
         break;
-    
+
     // TEV combiner buffer color
     case PICA_REG_INDEX(tev_combiner_buffer_color):
         SyncCombinerColor();
@@ -465,6 +496,9 @@ void RasterizerOpenGL::ReconfigureColorTexture(TextureInfo& texture, Pica::Regs:
     glActiveTexture(GL_TEXTURE0);
     glTexImage2D(GL_TEXTURE_2D, 0, internal_format, texture.width, texture.height, 0,
                  texture.gl_format, texture.gl_type, nullptr);
+
+    state.texture_units[0].texture_2d = 0;
+    state.Apply();
 }
 
 void RasterizerOpenGL::ReconfigureDepthTexture(DepthTextureInfo& texture, Pica::Regs::DepthFormat format, u32 width, u32 height) {
@@ -484,7 +518,7 @@ void RasterizerOpenGL::ReconfigureDepthTexture(DepthTextureInfo& texture, Pica::
     case Pica::Regs::DepthFormat::D24:
         internal_format = GL_DEPTH_COMPONENT24;
         texture.gl_format = GL_DEPTH_COMPONENT;
-        texture.gl_type = GL_UNSIGNED_INT_24_8;
+        texture.gl_type = GL_UNSIGNED_INT;
         break;
 
     case Pica::Regs::DepthFormat::D24S8:
@@ -506,6 +540,9 @@ void RasterizerOpenGL::ReconfigureDepthTexture(DepthTextureInfo& texture, Pica::
     glActiveTexture(GL_TEXTURE0);
     glTexImage2D(GL_TEXTURE_2D, 0, internal_format, texture.width, texture.height, 0,
                  texture.gl_format, texture.gl_type, nullptr);
+
+    state.texture_units[0].texture_2d = 0;
+    state.Apply();
 }
 
 void RasterizerOpenGL::SyncFramebuffer() {
@@ -633,6 +670,10 @@ void RasterizerOpenGL::SyncAlphaTest() {
     glUniform1f(uniform_alphatest_ref, regs.output_merger.alpha_test.ref / 255.0f);
 }
 
+void RasterizerOpenGL::SyncLogicOp() {
+    state.logic_op = PicaToGL::LogicOp(Pica::g_state.regs.output_merger.logic_op);
+}
+
 void RasterizerOpenGL::SyncStencilTest() {
     // TODO: Implement stencil test, mask, and op
 }
@@ -641,6 +682,10 @@ void RasterizerOpenGL::SyncDepthTest() {
     const auto& regs = Pica::g_state.regs;
     state.depth.test_enabled = (regs.output_merger.depth_test_enable == 1);
     state.depth.test_func = PicaToGL::CompareFunc(regs.output_merger.depth_test_func);
+    state.color_mask.red_enabled = regs.output_merger.red_enable;
+    state.color_mask.green_enabled = regs.output_merger.green_enable;
+    state.color_mask.blue_enabled = regs.output_merger.blue_enable;
+    state.color_mask.alpha_enabled = regs.output_merger.alpha_enable;
     state.depth.write_mask = regs.output_merger.depth_write_enable ? GL_TRUE : GL_FALSE;
 }
 
@@ -748,10 +793,10 @@ void RasterizerOpenGL::ReloadColorBuffer() {
         for (int x = 0; x < fb_color_texture.width; ++x) {
             const u32 coarse_y = y & ~7;
             u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel;
-            u32 gl_px_idx = x * bytes_per_pixel + y * fb_color_texture.width * bytes_per_pixel;
+            u32 gl_pixel_index = (x + y * fb_color_texture.width) * bytes_per_pixel;
 
             u8* pixel = color_buffer + dst_offset;
-            memcpy(&temp_fb_color_buffer[gl_px_idx], pixel, bytes_per_pixel);
+            memcpy(&temp_fb_color_buffer[gl_pixel_index], pixel, bytes_per_pixel);
         }
     }
 
@@ -762,6 +807,9 @@ void RasterizerOpenGL::ReloadColorBuffer() {
     glActiveTexture(GL_TEXTURE0);
     glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, fb_color_texture.width, fb_color_texture.height,
                     fb_color_texture.gl_format, fb_color_texture.gl_type, temp_fb_color_buffer.get());
+
+    state.texture_units[0].texture_2d = 0;
+    state.Apply();
 }
 
 void RasterizerOpenGL::ReloadDepthBuffer() {
@@ -779,29 +827,29 @@ void RasterizerOpenGL::ReloadDepthBuffer() {
 
     std::unique_ptr<u8[]> temp_fb_depth_buffer(new u8[fb_depth_texture.width * fb_depth_texture.height * gl_bpp]);
 
-    for (int y = 0; y < fb_depth_texture.height; ++y) {
-        for (int x = 0; x < fb_depth_texture.width; ++x) {
-            const u32 coarse_y = y & ~7;
-            u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_depth_texture.width * bytes_per_pixel;
-            u32 gl_px_idx = x + y * fb_depth_texture.width;
-
-            switch (fb_depth_texture.format) {
-            case Pica::Regs::DepthFormat::D16:
-                ((u16*)temp_fb_depth_buffer.get())[gl_px_idx] = Color::DecodeD16(depth_buffer + dst_offset);
-                break;
-            case Pica::Regs::DepthFormat::D24:
-                ((u32*)temp_fb_depth_buffer.get())[gl_px_idx] = Color::DecodeD24(depth_buffer + dst_offset);
-                break;
-            case Pica::Regs::DepthFormat::D24S8:
-            {
-                Math::Vec2<u32> depth_stencil = Color::DecodeD24S8(depth_buffer + dst_offset);
-                ((u32*)temp_fb_depth_buffer.get())[gl_px_idx] = (depth_stencil.x << 8) | depth_stencil.y;
-                break;
+    u8* temp_fb_depth_data = bytes_per_pixel == 3 ? (temp_fb_depth_buffer.get() + 1) : temp_fb_depth_buffer.get();
+
+    if (fb_depth_texture.format == Pica::Regs::DepthFormat::D24S8) {
+        for (int y = 0; y < fb_depth_texture.height; ++y) {
+            for (int x = 0; x < fb_depth_texture.width; ++x) {
+                const u32 coarse_y = y & ~7;
+                u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_depth_texture.width * bytes_per_pixel;
+                u32 gl_pixel_index = (x + y * fb_depth_texture.width);
+
+                u8* pixel = depth_buffer + dst_offset;
+                u32 depth_stencil = *(u32*)pixel;
+                ((u32*)temp_fb_depth_data)[gl_pixel_index] = (depth_stencil << 8) | (depth_stencil >> 24);
             }
-            default:
-                LOG_CRITICAL(Render_OpenGL, "Unknown memory framebuffer depth format %x", fb_depth_texture.format);
-                UNIMPLEMENTED();
-                break;
+        }
+    } else {
+        for (int y = 0; y < fb_depth_texture.height; ++y) {
+            for (int x = 0; x < fb_depth_texture.width; ++x) {
+                const u32 coarse_y = y & ~7;
+                u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_depth_texture.width * bytes_per_pixel;
+                u32 gl_pixel_index = (x + y * fb_depth_texture.width) * gl_bpp;
+
+                u8* pixel = depth_buffer + dst_offset;
+                memcpy(&temp_fb_depth_data[gl_pixel_index], pixel, bytes_per_pixel);
             }
         }
     }
@@ -813,6 +861,9 @@ void RasterizerOpenGL::ReloadDepthBuffer() {
     glActiveTexture(GL_TEXTURE0);
     glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, fb_depth_texture.width, fb_depth_texture.height,
                     fb_depth_texture.gl_format, fb_depth_texture.gl_type, temp_fb_depth_buffer.get());
+
+    state.texture_units[0].texture_2d = 0;
+    state.Apply();
 }
 
 void RasterizerOpenGL::CommitColorBuffer() {
@@ -831,15 +882,18 @@ void RasterizerOpenGL::CommitColorBuffer() {
             glActiveTexture(GL_TEXTURE0);
             glGetTexImage(GL_TEXTURE_2D, 0, fb_color_texture.gl_format, fb_color_texture.gl_type, temp_gl_color_buffer.get());
 
+            state.texture_units[0].texture_2d = 0;
+            state.Apply();
+
             // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary.
             for (int y = 0; y < fb_color_texture.height; ++y) {
                 for (int x = 0; x < fb_color_texture.width; ++x) {
                     const u32 coarse_y = y & ~7;
                     u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel;
-                    u32 gl_px_idx = x * bytes_per_pixel + y * fb_color_texture.width * bytes_per_pixel;
+                    u32 gl_pixel_index = (x + y * fb_color_texture.width) * bytes_per_pixel;
 
                     u8* pixel = color_buffer + dst_offset;
-                    memcpy(pixel, &temp_gl_color_buffer[gl_px_idx], bytes_per_pixel);
+                    memcpy(pixel, &temp_gl_color_buffer[gl_pixel_index], bytes_per_pixel);
                 }
             }
         }
@@ -866,29 +920,32 @@ void RasterizerOpenGL::CommitDepthBuffer() {
             glActiveTexture(GL_TEXTURE0);
             glGetTexImage(GL_TEXTURE_2D, 0, fb_depth_texture.gl_format, fb_depth_texture.gl_type, temp_gl_depth_buffer.get());
 
-            for (int y = 0; y < fb_depth_texture.height; ++y) {
-                for (int x = 0; x < fb_depth_texture.width; ++x) {
-                    const u32 coarse_y = y & ~7;
-                    u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_depth_texture.width * bytes_per_pixel;
-                    u32 gl_px_idx = x + y * fb_depth_texture.width;
-
-                    switch (fb_depth_texture.format) {
-                    case Pica::Regs::DepthFormat::D16:
-                        Color::EncodeD16(((u16*)temp_gl_depth_buffer.get())[gl_px_idx], depth_buffer + dst_offset);
-                        break;
-                    case Pica::Regs::DepthFormat::D24:
-                        Color::EncodeD24(((u32*)temp_gl_depth_buffer.get())[gl_px_idx], depth_buffer + dst_offset);
-                        break;
-                    case Pica::Regs::DepthFormat::D24S8:
-                    {
-                        u32 depth_stencil = ((u32*)temp_gl_depth_buffer.get())[gl_px_idx];
-                        Color::EncodeD24S8((depth_stencil >> 8), depth_stencil & 0xFF, depth_buffer + dst_offset);
-                        break;
+            state.texture_units[0].texture_2d = 0;
+            state.Apply();
+
+            u8* temp_gl_depth_data = bytes_per_pixel == 3 ? (temp_gl_depth_buffer.get() + 1) : temp_gl_depth_buffer.get();
+
+            if (fb_depth_texture.format == Pica::Regs::DepthFormat::D24S8) {
+                for (int y = 0; y < fb_depth_texture.height; ++y) {
+                    for (int x = 0; x < fb_depth_texture.width; ++x) {
+                        const u32 coarse_y = y & ~7;
+                        u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_depth_texture.width * bytes_per_pixel;
+                        u32 gl_pixel_index = (x + y * fb_depth_texture.width);
+
+                        u8* pixel = depth_buffer + dst_offset;
+                        u32 depth_stencil = ((u32*)temp_gl_depth_data)[gl_pixel_index];
+                        *(u32*)pixel = (depth_stencil >> 8) | (depth_stencil << 24);
                     }
-                    default:
-                        LOG_CRITICAL(Render_OpenGL, "Unknown framebuffer depth format %x", fb_depth_texture.format);
-                        UNIMPLEMENTED();
-                        break;
+                }
+            } else {
+                for (int y = 0; y < fb_depth_texture.height; ++y) {
+                    for (int x = 0; x < fb_depth_texture.width; ++x) {
+                        const u32 coarse_y = y & ~7;
+                        u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_depth_texture.width * bytes_per_pixel;
+                        u32 gl_pixel_index = (x + y * fb_depth_texture.width) * gl_bpp;
+
+                        u8* pixel = depth_buffer + dst_offset;
+                        memcpy(pixel, &temp_gl_depth_data[gl_pixel_index], bytes_per_pixel);
                     }
                 }
             }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 9896f8d0..d7d422b1 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -125,6 +125,9 @@ private:
     /// Syncs the alpha test states to match the PICA register
     void SyncAlphaTest();
 
+    /// Syncs the logic op states to match the PICA register
+    void SyncLogicOp();
+
     /// Syncs the stencil test states to match the PICA register
     void SyncStencilTest();
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 6f88a8b2..2e4110a8 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -4,13 +4,13 @@
 
 #include "common/make_unique.h"
 #include "common/math_util.h"
+#include "common/vector_math.h"
 
 #include "core/memory.h"
 
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/pica_to_gl.h"
 #include "video_core/debug_utils/debug_utils.h"
-#include "video_core/math.h"
 
 RasterizerCacheOpenGL::~RasterizerCacheOpenGL() {
     FullFlush();
diff --git a/src/video_core/renderer_opengl/gl_shaders.h b/src/video_core/renderer_opengl/gl_shaders.h
index 8f094123..a8cb2f59 100644
--- a/src/video_core/renderer_opengl/gl_shaders.h
+++ b/src/video_core/renderer_opengl/gl_shaders.h
@@ -69,15 +69,16 @@ const char g_fragment_shader_hw[] = R"(
 #define NUM_VTX_ATTR 7
 #define NUM_TEV_STAGES 6
 
-#define SOURCE_PRIMARYCOLOR         0x0
-#define SOURCE_PRIMARYFRAGMENTCOLOR 0x1
-#define SOURCE_TEXTURE0             0x3
-#define SOURCE_TEXTURE1             0x4
-#define SOURCE_TEXTURE2             0x5
-#define SOURCE_TEXTURE3             0x6
-#define SOURCE_PREVIOUSBUFFER       0xd
-#define SOURCE_CONSTANT             0xe
-#define SOURCE_PREVIOUS             0xf
+#define SOURCE_PRIMARYCOLOR           0x0
+#define SOURCE_PRIMARYFRAGMENTCOLOR   0x1
+#define SOURCE_SECONDARYFRAGMENTCOLOR 0x2
+#define SOURCE_TEXTURE0               0x3
+#define SOURCE_TEXTURE1               0x4
+#define SOURCE_TEXTURE2               0x5
+#define SOURCE_TEXTURE3               0x6
+#define SOURCE_PREVIOUSBUFFER         0xd
+#define SOURCE_CONSTANT               0xe
+#define SOURCE_PREVIOUS               0xf
 
 #define COLORMODIFIER_SOURCECOLOR         0x0
 #define COLORMODIFIER_ONEMINUSSOURCECOLOR 0x1
@@ -151,8 +152,11 @@ vec4 GetSource(int source) {
     if (source == SOURCE_PRIMARYCOLOR) {
         return o[2];
     } else if (source == SOURCE_PRIMARYFRAGMENTCOLOR) {
-        // HACK: Uses color value, but should really use fragment lighting output
+        // HACK: Until we implement fragment lighting, use primary_color
         return o[2];
+    } else if (source == SOURCE_SECONDARYFRAGMENTCOLOR) {
+        // HACK: Until we implement fragment lighting, use zero
+        return vec4(0.0, 0.0, 0.0, 0.0);
     } else if (source == SOURCE_TEXTURE0) {
         return texture(tex[0], o[3].xy);
     } else if (source == SOURCE_TEXTURE1) {
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 1afa58c9..3526e16d 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -16,6 +16,11 @@ OpenGLState::OpenGLState() {
     depth.test_func = GL_LESS;
     depth.write_mask = GL_TRUE;
 
+    color_mask.red_enabled = GL_TRUE;
+    color_mask.green_enabled = GL_TRUE;
+    color_mask.blue_enabled = GL_TRUE;
+    color_mask.alpha_enabled = GL_TRUE;
+
     stencil.test_enabled = false;
     stencil.test_func = GL_ALWAYS;
     stencil.test_ref = 0;
@@ -32,6 +37,8 @@ OpenGLState::OpenGLState() {
     blend.color.blue = 0.0f;
     blend.color.alpha = 0.0f;
 
+    logic_op = GL_COPY;
+
     for (auto& texture_unit : texture_units) {
         texture_unit.enabled_2d = false;
         texture_unit.texture_2d = 0;
@@ -75,6 +82,15 @@ void OpenGLState::Apply() {
         glDepthMask(depth.write_mask);
     }
 
+    // Color mask
+    if (color_mask.red_enabled != cur_state.color_mask.red_enabled ||
+            color_mask.green_enabled != cur_state.color_mask.green_enabled ||
+            color_mask.blue_enabled != cur_state.color_mask.blue_enabled ||
+            color_mask.alpha_enabled != cur_state.color_mask.alpha_enabled) {
+        glColorMask(color_mask.red_enabled, color_mask.green_enabled,
+                    color_mask.blue_enabled, color_mask.alpha_enabled);
+    }
+
     // Stencil test
     if (stencil.test_enabled != cur_state.stencil.test_enabled) {
         if (stencil.test_enabled) {
@@ -82,11 +98,11 @@ void OpenGLState::Apply() {
         } else {
             glDisable(GL_STENCIL_TEST);
         }
-    } 
+    }
 
     if (stencil.test_func != cur_state.stencil.test_func ||
-        stencil.test_ref != cur_state.stencil.test_ref ||
-        stencil.test_mask != cur_state.stencil.test_mask) {
+            stencil.test_ref != cur_state.stencil.test_ref ||
+            stencil.test_mask != cur_state.stencil.test_mask) {
         glStencilFunc(stencil.test_func, stencil.test_ref, stencil.test_mask);
     }
 
@@ -99,23 +115,34 @@ void OpenGLState::Apply() {
     if (blend.enabled != cur_state.blend.enabled) {
         if (blend.enabled) {
             glEnable(GL_BLEND);
+
+            cur_state.logic_op = GL_COPY;
+            glLogicOp(cur_state.logic_op);
+            glDisable(GL_COLOR_LOGIC_OP);
         } else {
             glDisable(GL_BLEND);
+            glEnable(GL_COLOR_LOGIC_OP);
         }
     }
 
     if (blend.color.red != cur_state.blend.color.red ||
-        blend.color.green != cur_state.blend.color.green ||
-        blend.color.blue != cur_state.blend.color.blue ||
-        blend.color.alpha != cur_state.blend.color.alpha) {
-        glBlendColor(blend.color.red, blend.color.green, blend.color.blue, blend.color.alpha);
+            blend.color.green != cur_state.blend.color.green ||
+            blend.color.blue != cur_state.blend.color.blue ||
+            blend.color.alpha != cur_state.blend.color.alpha) {
+        glBlendColor(blend.color.red, blend.color.green,
+                     blend.color.blue, blend.color.alpha);
     }
 
     if (blend.src_rgb_func != cur_state.blend.src_rgb_func ||
-        blend.dst_rgb_func != cur_state.blend.dst_rgb_func ||
-        blend.src_a_func != cur_state.blend.src_a_func ||
-        blend.dst_a_func != cur_state.blend.dst_a_func) {
-        glBlendFuncSeparate(blend.src_rgb_func, blend.dst_rgb_func, blend.src_a_func, blend.dst_a_func);
+            blend.dst_rgb_func != cur_state.blend.dst_rgb_func ||
+            blend.src_a_func != cur_state.blend.src_a_func ||
+            blend.dst_a_func != cur_state.blend.dst_a_func) {
+        glBlendFuncSeparate(blend.src_rgb_func, blend.dst_rgb_func,
+                            blend.src_a_func, blend.dst_a_func);
+    }
+
+    if (logic_op != cur_state.logic_op) {
+        glLogicOp(logic_op);
     }
 
     // Textures
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 281b7cad..26b91636 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -20,6 +20,13 @@ public:
     } depth;
 
     struct {
+        GLboolean red_enabled;
+        GLboolean green_enabled;
+        GLboolean blue_enabled;
+        GLboolean alpha_enabled;
+    } color_mask; // GL_COLOR_WRITEMASK
+
+    struct {
         bool test_enabled; // GL_STENCIL_TEST
         GLenum test_func; // GL_STENCIL_FUNC
         GLint test_ref; // GL_STENCIL_REF
@@ -42,6 +49,8 @@ public:
         } color; // GL_BLEND_COLOR
     } blend;
 
+    GLenum logic_op; // GL_LOGIC_OP_MODE
+
     // 3 texture units - one for each that is used in PICA fragment shader emulation
     struct {
         bool enabled_2d; // GL_TEXTURE_2D
@@ -61,7 +70,7 @@ public:
     static const OpenGLState& GetCurState() {
         return cur_state;
     }
-    
+
     /// Apply this state as the current OpenGL state
     void Apply();
 
diff --git a/src/video_core/renderer_opengl/pica_to_gl.h b/src/video_core/renderer_opengl/pica_to_gl.h
index f8763e71..e566f9f7 100644
--- a/src/video_core/renderer_opengl/pica_to_gl.h
+++ b/src/video_core/renderer_opengl/pica_to_gl.h
@@ -71,6 +71,37 @@ inline GLenum BlendFunc(Pica::Regs::BlendFactor factor) {
     return blend_func_table[(unsigned)factor];
 }
 
+inline GLenum LogicOp(Pica::Regs::LogicOp op) { // Map a PICA logic op to its GL enum by table lookup
+    static const GLenum logic_op_table[] = { // Indexed by the numeric value of op
+        GL_CLEAR,           // Clear
+        GL_AND,             // And
+        GL_AND_REVERSE,     // AndReverse
+        GL_COPY,            // Copy
+        GL_SET,             // Set
+        GL_COPY_INVERTED,   // CopyInverted
+        GL_NOOP,            // NoOp
+        GL_INVERT,          // Invert
+        GL_NAND,            // Nand
+        GL_OR,              // Or
+        GL_NOR,             // Nor
+        GL_XOR,             // Xor
+        GL_EQUIV,           // Equiv
+        GL_AND_INVERTED,    // AndInverted
+        GL_OR_REVERSE,      // OrReverse
+        GL_OR_INVERTED,     // OrInverted
+    };
+
+    // Reject values past the end of the table instead of indexing out of bounds
+    if ((unsigned)op >= ARRAY_SIZE(logic_op_table)) {
+        LOG_CRITICAL(Render_OpenGL, "Unknown logic op %d", op);
+        UNREACHABLE();
+
+        return GL_COPY; // Fallback result if execution continues past UNREACHABLE()
+    }
+
+    return logic_op_table[(unsigned)op];
+}
+
 inline GLenum CompareFunc(Pica::Regs::CompareFunc func) {
     static const GLenum compare_func_table[] = {
         GL_NEVER,    // CompareFunc::Never
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 16cf92e2..3399ca12 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -157,7 +157,7 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig&
     state.texture_units[0].enabled_2d = true;
     state.texture_units[0].texture_2d = texture.handle;
     state.Apply();
-    
+
     glActiveTexture(GL_TEXTURE0);
     glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint)pixel_stride);
 
@@ -170,6 +170,9 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig&
                     texture.gl_format, texture.gl_type, framebuffer_data);
 
     glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
+
+    state.texture_units[0].texture_2d = 0;
+    state.Apply();
 }
 
 /**
@@ -239,6 +242,9 @@ void RendererOpenGL::InitOpenGLObjects() {
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
     }
 
+    state.texture_units[0].texture_2d = 0;
+    state.Apply();
+
     hw_rasterizer->InitObjects();
 }
 
@@ -370,6 +376,8 @@ void RendererOpenGL::Init() {
     }
 
     LOG_INFO(Render_OpenGL, "GL_VERSION: %s", glGetString(GL_VERSION));
+    LOG_INFO(Render_OpenGL, "GL_VENDOR: %s", glGetString(GL_VENDOR));
+    LOG_INFO(Render_OpenGL, "GL_RENDERER: %s", glGetString(GL_RENDERER));
     InitOpenGLObjects();
 }
 
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 7d68998f..87006a83 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -119,17 +119,13 @@ static void ProcessShaderCode(VertexShaderState& state) {
         switch (instr.opcode.Value().GetInfo().type) {
         case OpCode::Type::Arithmetic:
         {
-            bool is_inverted = 0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed);
-            // TODO: We don't really support this properly: For instance, the address register
-            //       offset needs to be applied to SRC2 instead, etc.
-            //       For now, we just abort in this situation.
-            ASSERT_MSG(!is_inverted, "Bad condition...");
+            const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
 
             const int address_offset = (instr.common.address_register_index == 0)
                                        ? 0 : state.address_registers[instr.common.address_register_index - 1];
 
-            const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + address_offset);
-            const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted));
+            const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + (!is_inverted * address_offset));
+            const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) + ( is_inverted * address_offset));
 
             const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
             const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
@@ -208,6 +204,15 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 }
                 break;
 
+            case OpCode::Id::MIN:
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = std::min(src1[i], src2[i]);
+                }
+                break;
+
             case OpCode::Id::DP3:
             case OpCode::Id::DP4:
             {
@@ -279,6 +284,16 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 break;
             }
 
+            case OpCode::Id::SLT:
+            case OpCode::Id::SLTI:
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
+                }
+                break;
+
             case OpCode::Id::CMP:
                 for (int i = 0; i < 2; ++i) {
                     // TODO: Can you restrict to one compare via dest masking?
@@ -330,7 +345,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
         case OpCode::Type::MultiplyAdd:
         {
-            if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) || 
+            if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) ||
                 (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI)) {
                 const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.mad.operand_desc_id];
 
@@ -547,7 +562,7 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
     const auto& attribute_register_map = regs.vs_input_register_map;
     float24 dummy_register;
     boost::fill(state.input_register_table, &dummy_register);
-    
+
     if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
     if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
     if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;