5 files changed, 55 insertions, 43 deletions
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp
index b40f13ca..f80e26ec 100644
--- a/src/core/hw/y2r.cpp
+++ b/src/core/hw/y2r.cpp
@@ -111,7 +111,7 @@ static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data
         while (output < unit_end) {
             u32 color = *input++;
             Math::Vec4<u8> col_vec{
-                (color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >>  8) & 0xFF, alpha,
+                (u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha
             };
 
             switch (output_format) {
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 224132d7..558b49d6 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -153,7 +153,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                   "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
                   "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
                   "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
-                  i, output_list->size(),
+                  i + 1, output_list->size() - 2,
                   vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
                   vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
                   vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 2a1c885a..f2e3aee8 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -60,6 +60,46 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::P3D);
             break;
 
+        // Load default vertex input attributes
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
+        {
+            // TODO: Does actual hardware indeed keep an intermediate buffer or does
+            //       it directly write the values?
+            default_attr_write_buffer[default_attr_counter++] = value;
+
+            // Default attributes are written in a packed format such that four float24 values are encoded in
+            // three 32-bit numbers. We write to internal memory once a full such vector is
+            // written.
+            if (default_attr_counter >= 3) {
+                default_attr_counter = 0;
+
+                auto& setup = regs.vs_default_attributes_setup;
+
+                if (setup.index >= 16) {
+                    LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
+                    break;
+                }
+
+                Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
+
+                // NOTE: The destination component order indeed is "backwards"
+                attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
+                attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
+                attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
+                attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
+
+                LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
+                          attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
+                          attribute.w.ToFloat32());
+
+                // TODO: Verify that this actually modifies the register!
+                setup.index = setup.index + 1;
+            }
+            break;
+        }
+
         case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[0], 0x23c):
         case PICA_REG_INDEX_WORKAROUND(command_buffer.trigger[1], 0x23d):
         {
@@ -351,46 +391,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             break;
         }
 
-        // Load default vertex input attributes
-        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
-        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
-        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
-        {
-            // TODO: Does actual hardware indeed keep an intermediate buffer or does
-            //       it directly write the values?
-            default_attr_write_buffer[default_attr_counter++] = value;
-
-            // Default attributes are written in a packed format such that four float24 values are encoded in
-            // three 32-bit numbers. We write to internal memory once a full such vector is
-            // written.
-            if (default_attr_counter >= 3) {
-                default_attr_counter = 0;
-
-                auto& setup = regs.vs_default_attributes_setup;
-
-                if (setup.index >= 16) {
-                    LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
-                    break;
-                }
-
-                Math::Vec4<float24>& attribute = g_state.vs.default_attributes[setup.index];
-
-                // NOTE: The destination component order indeed is "backwards"
-                attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
-                attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
-                attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
-                attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
-
-                LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
-                          attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
-                          attribute.w.ToFloat32());
-
-                // TODO: Verify that this actually modifies the register!
-                setup.index = setup.index + 1;
-            }
-            break;
-        }
-
         // Load shader program code
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 46a7b21d..026b10a6 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -290,6 +290,7 @@ struct Regs {
             AddSigned       = 3,
             Lerp            = 4,
             Subtract        = 5,
+            Dot3_RGB        = 6,
 
             MultiplyThenAdd = 8,
             AddThenMultiply = 9,
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index c381c2bd..a6b7997c 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -641,7 +641,18 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                         result = (result * input[2].Cast<int>()) / 255;
                         return result.Cast<u8>();
                     }
-
+                    case Operation::Dot3_RGB:
+                    {
+                        // Not fully accurate.
+                        // Worst case scenario seems to yield a +/-3 error
+                        // Some HW results indicate that the per-component computation can't have a higher precision than 1/256,
+                        // while dot3_rgb( (0x80,g0,b0),(0x7F,g1,b1) ) and dot3_rgb( (0x80,g0,b0),(0x80,g1,b1) ) give different results
+                        int result = ((input[0].r() * 2 - 255) * (input[1].r() * 2 - 255) + 128) / 256 +
+                                     ((input[0].g() * 2 - 255) * (input[1].g() * 2 - 255) + 128) / 256 +
+                                     ((input[0].b() * 2 - 255) * (input[1].b() * 2 - 255) + 128) / 256;
+                        result = std::max(0, std::min(255, result));
+                        return { (u8)result, (u8)result, (u8)result };
+                    }
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
                         UNIMPLEMENTED();