6 files changed, 185 insertions, 80 deletions
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index 7471def5..3ccbc03b 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -2,17 +2,18 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <cstring>
+#include <type_traits>
+
 #include "common/color.h"
 #include "common/common_types.h"
-
-#include "core/arm/arm_interface.h"
+#include "common/logging/log.h"
+#include "common/vector_math.h"
 
 #include "core/settings.h"
-#include "core/core.h"
 #include "core/memory.h"
 #include "core/core_timing.h"
 
-#include "core/hle/hle.h"
 #include "core/hle/service/gsp_gpu.h"
 #include "core/hle/service/dsp_dsp.h"
 #include "core/hle/service/hid/hid.h"
@@ -20,10 +21,17 @@
 #include "core/hw/hw.h"
 #include "core/hw/gpu.h"
 
+#include "core/tracer/recorder.h"
+
 #include "video_core/command_processor.h"
+#include "video_core/hwrasterizer_base.h"
+#include "video_core/renderer_base.h"
 #include "video_core/utils.h"
 #include "video_core/video_core.h"
 
+#include "video_core/debug_utils/debug_utils.h"
+
+
 namespace GPU {
 
 Regs g_regs;
@@ -53,6 +61,29 @@ inline void Read(T &var, const u32 raw_addr) {
     var = g_regs[addr / 4];
 }
 
+/// Decodes the pixel at src_pixel, interpreted as input_format, into a Math::Vec4<u8>.
+/// @param input_format Pixel format used to interpret the bytes at src_pixel
+/// @param src_pixel    Pointer to the first byte of the encoded pixel
+/// @return Decoded color; {0, 0, 0, 0} (after logging an error) for an unknown format
+static Math::Vec4<u8> DecodePixel(Regs::PixelFormat input_format, const u8* src_pixel) {
+    switch (input_format) {
+    case Regs::PixelFormat::RGBA8:
+        return Color::DecodeRGBA8(src_pixel);
+    case Regs::PixelFormat::RGB8:
+        return Color::DecodeRGB8(src_pixel);
+    case Regs::PixelFormat::RGB565:
+        return Color::DecodeRGB565(src_pixel);
+    case Regs::PixelFormat::RGB5A1:
+        return Color::DecodeRGB5A1(src_pixel);
+    case Regs::PixelFormat::RGBA4:
+        return Color::DecodeRGBA4(src_pixel);
+    default:
+        // Cast explicitly so the variadic argument's type matches the %x specifier
+        LOG_ERROR(HW_GPU, "Unknown source framebuffer format %x", static_cast<u32>(input_format));
+        return {0, 0, 0, 0};
+    }
+}
+
 template <typename T>
 inline void Write(u32 addr, const T data) {
     addr -= HW::VADDR_GPU;
@@ -75,39 +106,43 @@ inline void Write(u32 addr, const T data) {
         const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].trigger));
         auto& config = g_regs.memory_fill_config[is_second_filler];
 
-        if (config.address_start && config.trigger) {
-            u8* start = Memory::GetPhysicalPointer(config.GetStartAddress());
-            u8* end = Memory::GetPhysicalPointer(config.GetEndAddress());
-
-            if (config.fill_24bit) {
-                // fill with 24-bit values
-                for (u8* ptr = start; ptr < end; ptr += 3) {
-                    ptr[0] = config.value_24bit_r;
-                    ptr[1] = config.value_24bit_g;
-                    ptr[2] = config.value_24bit_b;
+        if (config.trigger) {
+            if (config.address_start) { // Some games pass invalid values here
+                u8* start = Memory::GetPhysicalPointer(config.GetStartAddress());
+                u8* end = Memory::GetPhysicalPointer(config.GetEndAddress());
+
+                if (config.fill_24bit) {
+                    // fill with 24-bit values
+                    for (u8* ptr = start; ptr < end; ptr += 3) {
+                        ptr[0] = config.value_24bit_r;
+                        ptr[1] = config.value_24bit_g;
+                        ptr[2] = config.value_24bit_b;
+                    }
+                } else if (config.fill_32bit) {
+                    // fill with 32-bit values
+                    for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr)
+                        *ptr = config.value_32bit;
+                } else {
+                    // fill with 16-bit values
+                    for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr)
+                        *ptr = config.value_16bit;
                 }
-            } else if (config.fill_32bit) {
-                // fill with 32-bit values
-                for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr)
-                    *ptr = config.value_32bit;
-            } else {
-                // fill with 16-bit values
-                for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr)
-                    *ptr = config.value_16bit;
-            }
 
-            LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress());
+                LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress());
 
-            config.trigger = 0;
-            config.finished = 1;
+                if (!is_second_filler) {
+                    GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0);
+                } else {
+                    GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC1);
+                }
 
-            if (!is_second_filler) {
-                GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0);
-            } else {
-                GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC1);
+                VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetStartAddress(), config.GetEndAddress() - config.GetStartAddress());
             }
 
-            VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetStartAddress(), config.GetEndAddress() - config.GetStartAddress());
+            // Reset "trigger" flag and set the "finish" flag
+            // NOTE: This was confirmed to happen on hardware even if "address_start" is zero.
+            config.trigger = 0;
+            config.finished = 1;
         }
         break;
     }
@@ -116,6 +151,10 @@ inline void Write(u32 addr, const T data) {
     {
         const auto& config = g_regs.display_transfer_config;
         if (config.trigger & 1) {
+
+            if (Pica::g_debug_context)
+                Pica::g_debug_context->OnEvent(Pica::DebugContext::Event::IncomingDisplayTransfer, nullptr);
+
             u8* src_pointer = Memory::GetPhysicalPointer(config.GetPhysicalInputAddress());
             u8* dst_pointer = Memory::GetPhysicalPointer(config.GetPhysicalOutputAddress());
 
@@ -125,11 +164,18 @@ inline void Write(u32 addr, const T data) {
                 break;
             }
 
-            unsigned horizontal_scale = (config.scaling != config.NoScale) ? 2 : 1;
-            unsigned vertical_scale = (config.scaling == config.ScaleXY) ? 2 : 1;
+            if (config.output_tiled &&
+                    (config.scaling == config.ScaleXY || config.scaling == config.ScaleX)) {
+                LOG_CRITICAL(HW_GPU, "Scaling is only implemented on tiled input");
+                UNIMPLEMENTED();
+                break;
+            }
 
-            u32 output_width = config.output_width / horizontal_scale;
-            u32 output_height = config.output_height / vertical_scale;
+            bool horizontal_scale = config.scaling != config.NoScale;
+            bool vertical_scale = config.scaling == config.ScaleXY;
+
+            u32 output_width = config.output_width >> horizontal_scale;
+            u32 output_height = config.output_height >> vertical_scale;
 
             u32 input_size = config.input_width * config.input_height * GPU::Regs::BytesPerPixel(config.input_format);
             u32 output_size = output_width * output_height * GPU::Regs::BytesPerPixel(config.output_format);
@@ -153,16 +199,14 @@ inline void Write(u32 addr, const T data) {
                 break;
             }
 
-            // TODO(Subv): Implement the box filter when scaling is enabled
-            // right now we're just skipping the extra pixels.
             for (u32 y = 0; y < output_height; ++y) {
                 for (u32 x = 0; x < output_width; ++x) {
-                    Math::Vec4<u8> src_color = { 0, 0, 0, 0 };
+                    Math::Vec4<u8> src_color;
 
                     // Calculate the [x,y] position of the input image
                     // based on the current output position and the scale
-                    u32 input_x = x * horizontal_scale;
-                    u32 input_y = y * vertical_scale;
+                    u32 input_x = x << horizontal_scale;
+                    u32 input_y = y << vertical_scale;
 
                     if (config.flip_vertically) {
                         // Flip the y value of the output data,
@@ -177,46 +221,49 @@ inline void Write(u32 addr, const T data) {
                     u32 dst_offset;
 
                     if (config.output_tiled) {
-                        // Interpret the input as linear and the output as tiled
-                        u32 coarse_y = y & ~7;
-                        u32 stride = output_width * dst_bytes_per_pixel;
-
-                        src_offset = (input_x + input_y * config.input_width) * src_bytes_per_pixel;
-                        dst_offset = VideoCore::GetMortonOffset(x, y, dst_bytes_per_pixel) + coarse_y * stride;
+                        if (!config.dont_swizzle) {
+                            // Interpret the input as linear and the output as tiled
+                            u32 coarse_y = y & ~7;
+                            u32 stride = output_width * dst_bytes_per_pixel;
+
+                            src_offset = (input_x + input_y * config.input_width) * src_bytes_per_pixel;
+                            dst_offset = VideoCore::GetMortonOffset(x, y, dst_bytes_per_pixel) + coarse_y * stride;
+                        } else {
+                            // Both input and output are linear
+                            src_offset = (input_x + input_y * config.input_width) * src_bytes_per_pixel;
+                            dst_offset = (x + y * output_width) * dst_bytes_per_pixel;
+                        }
                     } else {
-                        // Interpret the input as tiled and the output as linear
-                        u32 coarse_y = input_y & ~7;
-                        u32 stride = config.input_width * src_bytes_per_pixel;
-
-                        src_offset = VideoCore::GetMortonOffset(input_x, input_y, src_bytes_per_pixel) + coarse_y * stride;
-                        dst_offset = (x + y * output_width) * dst_bytes_per_pixel;
+                        if (!config.dont_swizzle) {
+                            // Interpret the input as tiled and the output as linear
+                            u32 coarse_y = input_y & ~7;
+                            u32 stride = config.input_width * src_bytes_per_pixel;
+
+                            src_offset = VideoCore::GetMortonOffset(input_x, input_y, src_bytes_per_pixel) + coarse_y * stride;
+                            dst_offset = (x + y * output_width) * dst_bytes_per_pixel;
+                        } else {
+                            // Both input and output are tiled
+                            u32 out_coarse_y = y & ~7;
+                            u32 out_stride = output_width * dst_bytes_per_pixel;
+
+                            u32 in_coarse_y = input_y & ~7;
+                            u32 in_stride = config.input_width * src_bytes_per_pixel;
+
+                            src_offset = VideoCore::GetMortonOffset(input_x, input_y, src_bytes_per_pixel) + in_coarse_y * in_stride;
+                            dst_offset = VideoCore::GetMortonOffset(x, y, dst_bytes_per_pixel) + out_coarse_y * out_stride;
+                        }
                     }
 
                     const u8* src_pixel = src_pointer + src_offset;
-                    switch (config.input_format) {
-                    case Regs::PixelFormat::RGBA8:
-                        src_color = Color::DecodeRGBA8(src_pixel);
-                        break;
-
-                    case Regs::PixelFormat::RGB8:
-                        src_color = Color::DecodeRGB8(src_pixel);
-                        break;
-
-                    case Regs::PixelFormat::RGB565:
-                        src_color = Color::DecodeRGB565(src_pixel);
-                        break;
-
-                    case Regs::PixelFormat::RGB5A1:
-                        src_color = Color::DecodeRGB5A1(src_pixel);
-                        break;
-
-                    case Regs::PixelFormat::RGBA4:
-                        src_color = Color::DecodeRGBA4(src_pixel);
-                        break;
-
-                    default:
-                        LOG_ERROR(HW_GPU, "Unknown source framebuffer format %x", config.input_format.Value());
-                        break;
+                    src_color = DecodePixel(config.input_format, src_pixel);
+                    if (config.scaling == config.ScaleX) {
+                        Math::Vec4<u8> pixel = DecodePixel(config.input_format, src_pixel + src_bytes_per_pixel);
+                        src_color = ((src_color + pixel) / 2).Cast<u8>();
+                    } else if (config.scaling == config.ScaleXY) {
+                        Math::Vec4<u8> pixel1 = DecodePixel(config.input_format, src_pixel + 1 * src_bytes_per_pixel);
+                        Math::Vec4<u8> pixel2 = DecodePixel(config.input_format, src_pixel + 2 * src_bytes_per_pixel);
+                        Math::Vec4<u8> pixel3 = DecodePixel(config.input_format, src_pixel + 3 * src_bytes_per_pixel);
+                        src_color = (((src_color + pixel1) + (pixel2 + pixel3)) / 4).Cast<u8>();
                     }
 
                     u8* dst_pixel = dst_pointer + dst_offset;
@@ -254,6 +301,7 @@ inline void Write(u32 addr, const T data) {
                       config.GetPhysicalOutputAddress(), output_width, output_height,
                       config.output_format.Value(), config.flags);
 
+            g_regs.display_transfer_config.trigger = 0;
             GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PPF);
 
             VideoCore::g_renderer->hw_rasterizer->NotifyFlush(config.GetPhysicalOutputAddress(), output_size);
@@ -268,7 +316,14 @@ inline void Write(u32 addr, const T data) {
         if (config.trigger & 1)
         {
             u32* buffer = (u32*)Memory::GetPhysicalPointer(config.GetPhysicalAddress());
+
+            if (Pica::g_debug_context && Pica::g_debug_context->recorder) {
+                Pica::g_debug_context->recorder->MemoryAccessed((u8*)buffer, config.size * sizeof(u32), config.GetPhysicalAddress());
+            }
+
             Pica::CommandProcessor::ProcessCommandList(buffer, config.size);
+
+            g_regs.command_processor_config.trigger = 0;
         }
         break;
     }
@@ -276,6 +331,13 @@ inline void Write(u32 addr, const T data) {
     default:
         break;
     }
+
+    // Notify tracer about the register write
+    // This is happening *after* handling the write to make sure we properly catch all memory reads.
+    if (Pica::g_debug_context && Pica::g_debug_context->recorder) {
+        // addr + GPU VBase - IO VBase + IO PBase
+        Pica::g_debug_context->recorder->RegisterWritten<T>(addr + 0x1EF00000 - 0x1EC00000 + 0x10100000, data);
+    }
 }
 
 // Explicitly instantiate template functions because we aren't defining this in the header:
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index 699bcd2a..daad506f 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <cstddef>
+#include <type_traits>
 
 #include "common/assert.h"
 #include "common/bit_field.h"
@@ -202,6 +203,7 @@ struct Regs {
             BitField< 0, 1, u32> flip_vertically;  // flips input data vertically
             BitField< 1, 1, u32> output_tiled;     // Converts from linear to tiled format
             BitField< 3, 1, u32> raw_copy;         // Copies the data without performing any processing
+            BitField< 5, 1, u32> dont_swizzle;
             BitField< 8, 3, PixelFormat> input_format;
             BitField<12, 3, PixelFormat> output_format;
 
diff --git a/src/core/hw/hw.cpp b/src/core/hw/hw.cpp
index c7006a49..b5fdbf9c 100644
--- a/src/core/hw/hw.cpp
+++ b/src/core/hw/hw.cpp
@@ -15,6 +15,21 @@ template <typename T>
 inline void Read(T &var, const u32 addr) {
     switch (addr & 0xFFFFF000) {
     case VADDR_GPU:
+    case VADDR_GPU + 0x1000:
+    case VADDR_GPU + 0x2000:
+    case VADDR_GPU + 0x3000:
+    case VADDR_GPU + 0x4000:
+    case VADDR_GPU + 0x5000:
+    case VADDR_GPU + 0x6000:
+    case VADDR_GPU + 0x7000:
+    case VADDR_GPU + 0x8000:
+    case VADDR_GPU + 0x9000:
+    case VADDR_GPU + 0xA000:
+    case VADDR_GPU + 0xB000:
+    case VADDR_GPU + 0xC000:
+    case VADDR_GPU + 0xD000:
+    case VADDR_GPU + 0xE000:
+    case VADDR_GPU + 0xF000:
         GPU::Read(var, addr);
         break;
     case VADDR_LCD:
@@ -29,6 +44,21 @@ template <typename T>
 inline void Write(u32 addr, const T data) {
     switch (addr & 0xFFFFF000) {
     case VADDR_GPU:
+    case VADDR_GPU + 0x1000:
+    case VADDR_GPU + 0x2000:
+    case VADDR_GPU + 0x3000:
+    case VADDR_GPU + 0x4000:
+    case VADDR_GPU + 0x5000:
+    case VADDR_GPU + 0x6000:
+    case VADDR_GPU + 0x7000:
+    case VADDR_GPU + 0x8000:
+    case VADDR_GPU + 0x9000:
+    case VADDR_GPU + 0xA000:
+    case VADDR_GPU + 0xB000:
+    case VADDR_GPU + 0xC000:
+    case VADDR_GPU + 0xD000:
+    case VADDR_GPU + 0xE000:
+    case VADDR_GPU + 0xF000:
         GPU::Write(addr, data);
         break;
     case VADDR_LCD:
diff --git a/src/core/hw/lcd.cpp b/src/core/hw/lcd.cpp
index 963c8d98..6f93709e 100644
--- a/src/core/hw/lcd.cpp
+++ b/src/core/hw/lcd.cpp
@@ -7,11 +7,12 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 
-#include "core/arm/arm_interface.h"
-#include "core/hle/hle.h"
 #include "core/hw/hw.h"
 #include "core/hw/lcd.h"
 
+#include "core/tracer/recorder.h"
+#include "video_core/debug_utils/debug_utils.h"
+
 namespace LCD {
 
 Regs g_regs;
@@ -42,6 +43,13 @@ inline void Write(u32 addr, const T data) {
     }
 
     g_regs[index] = static_cast<u32>(data);
+
+    // Notify tracer about the register write
+    // This is happening *after* handling the write to make sure we properly catch all memory reads.
+    if (Pica::g_debug_context && Pica::g_debug_context->recorder) {
+        // addr + LCD VBase - IO VBase + IO PBase
+        Pica::g_debug_context->recorder->RegisterWritten<T>(addr + HW::VADDR_LCD - 0x1EC00000 + 0x10100000, data);
+    }
 }
 
 // Explicitly instantiate template functions because we aren't defining this in the header:
diff --git a/src/core/hw/lcd.h b/src/core/hw/lcd.h
index 8631eb20..bcce6d8c 100644
--- a/src/core/hw/lcd.h
+++ b/src/core/hw/lcd.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <cstddef>
+#include <type_traits>
 
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp
index 5b7fb39e..f80e26ec 100644
--- a/src/core/hw/y2r.cpp
+++ b/src/core/hw/y2r.cpp
@@ -2,8 +2,10 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
 #include <array>
-#include <numeric>
+#include <cstddef>
+#include <memory>
 
 #include "common/assert.h"
 #include "common/color.h"
@@ -109,7 +111,7 @@ static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data
         while (output < unit_end) {
             u32 color = *input++;
             Math::Vec4<u8> col_vec{
-                (color >> 24) & 0xFF, (color >> 16) & 0xFF, (color >>  8) & 0xFF, alpha,
+                (u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha
             };
 
             switch (output_format) {