aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--src/common/common_funcs.h7
-rw-r--r--src/core/arm/dyncom/arm_dyncom_dec.cpp2
-rw-r--r--src/core/arm/dyncom/arm_dyncom_interpreter.cpp29
-rw-r--r--src/core/arm/skyeye_common/armemu.h2
-rw-r--r--src/core/arm/skyeye_common/vfp/vfpdouble.cpp4
-rw-r--r--src/core/arm/skyeye_common/vfp/vfpinstr.cpp8
-rw-r--r--src/core/hle/config_mem.cpp96
-rw-r--r--src/core/hle/config_mem.h2
-rw-r--r--src/core/hle/hle.cpp2
-rw-r--r--src/core/hle/service/gsp_gpu.cpp34
-rw-r--r--src/core/hle/service/gsp_gpu.h4
-rw-r--r--src/core/hle/service/service.cpp2
-rw-r--r--src/core/hle/shared_page.cpp8
-rw-r--r--src/core/hw/gpu.cpp41
-rw-r--r--src/core/hw/gpu.h51
-rw-r--r--src/video_core/clipper.cpp84
-rw-r--r--src/video_core/command_processor.cpp32
-rw-r--r--src/video_core/math.h2
-rw-r--r--src/video_core/pica.h24
-rw-r--r--src/video_core/rasterizer.cpp252
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.cpp74
-rw-r--r--src/video_core/renderer_opengl/renderer_opengl.h5
-rw-r--r--src/video_core/vertex_shader.cpp50
23 files changed, 530 insertions, 285 deletions
diff --git a/src/common/common_funcs.h b/src/common/common_funcs.h
index 44d8ae11..4bbcc357 100644
--- a/src/common/common_funcs.h
+++ b/src/common/common_funcs.h
@@ -35,6 +35,13 @@ template<> struct CompileTimeAssert<true> {};
#define CONCAT2(x, y) DO_CONCAT2(x, y)
#define DO_CONCAT2(x, y) x ## y
+// Helper macro to insert explicit padding between structure members.
+// Calling INSERT_PADDING_BYTES will add a new u8 array member with a name like "pad121",
+// depending on the current source line to make sure variable names are unique.
+// The argument is a size in bytes, because the generated member is an array of u8.
+#define INSERT_PADDING_BYTES_HELPER1(x, y) x ## y
+#define INSERT_PADDING_BYTES_HELPER2(x, y) INSERT_PADDING_BYTES_HELPER1(x, y)
+#define INSERT_PADDING_BYTES(num_bytes) u8 INSERT_PADDING_BYTES_HELPER2(pad, __LINE__)[(num_bytes)]
+
#ifndef _MSC_VER
#include <errno.h>
diff --git a/src/core/arm/dyncom/arm_dyncom_dec.cpp b/src/core/arm/dyncom/arm_dyncom_dec.cpp
index ffa62735..9f3b90fd 100644
--- a/src/core/arm/dyncom/arm_dyncom_dec.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_dec.cpp
@@ -42,7 +42,7 @@ const ISEITEM arm_instruction[] = {
{ "srs", 4, 6, 25, 31, 0x0000007c, 22, 22, 0x00000001, 16, 20, 0x0000000d, 8, 11, 0x00000005 },
{ "rfe", 4, 6, 25, 31, 0x0000007c, 22, 22, 0x00000000, 20, 20, 0x00000001, 8, 11, 0x0000000a },
- { "bkpt", 2, 3, 20, 31, 0x00000e12, 4, 7, 0x00000007 },
+ { "bkpt", 2, 3, 20, 27, 0x00000012, 4, 7, 0x00000007 },
{ "blx", 1, 3, 25, 31, 0x0000007d },
{ "cps", 3, 6, 20, 31, 0x00000f10, 16, 16, 0x00000000, 5, 5, 0x00000000 },
{ "pld", 4, 4, 26, 31, 0x0000003d, 24, 24, 0x00000001, 20, 22, 0x00000005, 12, 15, 0x0000000f },
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index f4b3c473..b691ffbc 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -792,6 +792,7 @@ typedef struct _stm_inst {
} stm_inst;
struct bkpt_inst {
+ u32 imm;
};
struct blx1_inst {
@@ -1371,7 +1372,22 @@ static ARM_INST_PTR INTERPRETER_TRANSLATE(bic)(unsigned int inst, int index)
inst_base->br = INDIRECT_BRANCH;
return inst_base;
}
-static ARM_INST_PTR INTERPRETER_TRANSLATE(bkpt)(unsigned int inst, int index) { UNIMPLEMENTED_INSTRUCTION("BKPT"); }
+
+// Translates a BKPT instruction into its interpreter representation.
+// The immediate is split across the encoding: bits 8-19 hold the upper twelve
+// bits (imm12) and bits 0-3 hold the lower four bits (imm4).
+static ARM_INST_PTR INTERPRETER_TRANSLATE(bkpt)(unsigned int inst, int index)
+{
+    arm_inst* const inst_base = (arm_inst*)AllocBuffer(sizeof(arm_inst) + sizeof(bkpt_inst));
+    bkpt_inst* const inst_cream = (bkpt_inst*)inst_base->component;
+
+    inst_base->cond = BITS(inst, 28, 31);
+    inst_base->idx = index;
+    inst_base->br = NON_BRANCH;
+    inst_base->load_r15 = 0;
+
+    // imm12 has to be moved above imm4 before the two are combined; OR-ing the
+    // raw fields would overlay the low nibble onto the low bits of imm12.
+    inst_cream->imm = (BITS(inst, 8, 19) << 4) | BITS(inst, 0, 3);
+
+    return inst_base;
+}
+
static ARM_INST_PTR INTERPRETER_TRANSLATE(blx)(unsigned int inst, int index)
{
arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(blx_inst));
@@ -3211,6 +3227,7 @@ static ARM_INST_PTR INTERPRETER_TRANSLATE(usada8)(unsigned int inst, int index)
inst_cream->op1 = BITS(inst, 20, 24);
inst_cream->op2 = BITS(inst, 5, 7);
+ inst_cream->Rd = BITS(inst, 16, 19);
inst_cream->Rm = BITS(inst, 8, 11);
inst_cream->Rn = BITS(inst, 0, 3);
inst_cream->Ra = BITS(inst, 12, 15);
@@ -4080,6 +4097,16 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
GOTO_NEXT_INST;
}
BKPT_INST:
+ {
+ if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) {
+ bkpt_inst* const inst_cream = (bkpt_inst*)inst_base->component;
+ LOG_DEBUG(Core_ARM11, "Breakpoint instruction hit. Immediate: 0x%08X", inst_cream->imm);
+ }
+ cpu->Reg[15] += GET_INST_SIZE(cpu);
+ INC_PC(sizeof(bkpt_inst));
+ FETCH_INST;
+ GOTO_NEXT_INST;
+ }
BLX_INST:
{
blx_inst *inst_cream = (blx_inst *)inst_base->component;
diff --git a/src/core/arm/skyeye_common/armemu.h b/src/core/arm/skyeye_common/armemu.h
index 8bfd4e0f..2a1c5077 100644
--- a/src/core/arm/skyeye_common/armemu.h
+++ b/src/core/arm/skyeye_common/armemu.h
@@ -35,7 +35,7 @@ enum : u32 {
// Masks for groups of bits in the APSR.
MODEBITS = 0x1F,
- INTBITS = 0xC0,
+ INTBITS = 0x1C0,
};
// Different ways to start the next instruction.
diff --git a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
index 17726b8b..1a05ef8c 100644
--- a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
+++ b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
@@ -381,7 +381,7 @@ static u32 vfp_compare(ARMul_State* state, int dd, int signal_on_qnan, int dm, u
s64 d, m;
u32 ret = 0;
- LOG_TRACE(Core_ARM11, "In %s, state=0x%x, fpscr=0x%x\n", __FUNCTION__, state, fpscr);
+ LOG_TRACE(Core_ARM11, "In %s, state=0x%p, fpscr=0x%x\n", __FUNCTION__, state, fpscr);
m = vfp_get_double(state, dm);
if (vfp_double_packed_exponent(m) == 2047 && vfp_double_packed_mantissa(m)) {
ret |= FPSCR_CFLAG | FPSCR_VFLAG;
@@ -436,7 +436,7 @@ static u32 vfp_compare(ARMul_State* state, int dd, int signal_on_qnan, int dm, u
ret |= FPSCR_CFLAG;
}
}
- LOG_TRACE(Core_ARM11, "In %s, state=0x%x, ret=0x%x\n", __FUNCTION__, state, ret);
+ LOG_TRACE(Core_ARM11, "In %s, state=0x%p, ret=0x%x\n", __FUNCTION__, state, ret);
return ret;
}
diff --git a/src/core/arm/skyeye_common/vfp/vfpinstr.cpp b/src/core/arm/skyeye_common/vfp/vfpinstr.cpp
index 1f1b5b1c..b9b96c38 100644
--- a/src/core/arm/skyeye_common/vfp/vfpinstr.cpp
+++ b/src/core/arm/skyeye_common/vfp/vfpinstr.cpp
@@ -1443,7 +1443,7 @@ VPUSH_INST:
addr = cpu->Reg[R13] - inst_cream->imm32;
- for (int i = 0; i < inst_cream->regs; i++)
+ for (unsigned int i = 0; i < inst_cream->regs; i++)
{
if (inst_cream->single)
{
@@ -1512,7 +1512,7 @@ VSTM_INST: /* encoding 1 */
addr = (inst_cream->add ? cpu->Reg[inst_cream->n] : cpu->Reg[inst_cream->n] - inst_cream->imm32);
- for (int i = 0; i < inst_cream->regs; i++)
+ for (unsigned int i = 0; i < inst_cream->regs; i++)
{
if (inst_cream->single)
{
@@ -1581,7 +1581,7 @@ VPOP_INST:
addr = cpu->Reg[R13];
- for (int i = 0; i < inst_cream->regs; i++)
+ for (unsigned int i = 0; i < inst_cream->regs; i++)
{
if (inst_cream->single)
{
@@ -1718,7 +1718,7 @@ VLDM_INST:
addr = (inst_cream->add ? cpu->Reg[inst_cream->n] : cpu->Reg[inst_cream->n] - inst_cream->imm32);
- for (int i = 0; i < inst_cream->regs; i++)
+ for (unsigned int i = 0; i < inst_cream->regs; i++)
{
if (inst_cream->single)
{
diff --git a/src/core/hle/config_mem.cpp b/src/core/hle/config_mem.cpp
index 68d3071f..b10c19d1 100644
--- a/src/core/hle/config_mem.cpp
+++ b/src/core/hle/config_mem.cpp
@@ -3,60 +3,54 @@
// Refer to the license.txt file included.
#include "common/common_types.h"
-#include "common/logging/log.h"
+#include "common/common_funcs.h"
+#include "core/core.h"
+#include "core/mem_map.h"
#include "core/hle/config_mem.h"
////////////////////////////////////////////////////////////////////////////////////////////////////
namespace ConfigMem {
-enum {
- KERNEL_VERSIONREVISION = 0x1FF80001,
- KERNEL_VERSIONMINOR = 0x1FF80002,
- KERNEL_VERSIONMAJOR = 0x1FF80003,
- UPDATEFLAG = 0x1FF80004,
- NSTID = 0x1FF80008,
- SYSCOREVER = 0x1FF80010,
- UNITINFO = 0x1FF80014,
- KERNEL_CTRSDKVERSION = 0x1FF80018,
- APPMEMTYPE = 0x1FF80030,
- APPMEMALLOC = 0x1FF80040,
- FIRM_VERSIONREVISION = 0x1FF80061,
- FIRM_VERSIONMINOR = 0x1FF80062,
- FIRM_VERSIONMAJOR = 0x1FF80063,
- FIRM_SYSCOREVER = 0x1FF80064,
- FIRM_CTRSDKVERSION = 0x1FF80068,
+// In-memory image of the configuration memory block that Read() serves.
+// The trailing comment on each member is its byte offset from the start of the
+// block, written in hexadecimal. INSERT_PADDING_BYTES fills the gaps between
+// known fields so that every named member lands on its annotated offset.
+struct ConfigMemDef {
+ u8 kernel_unk; // 0
+ u8 kernel_version_rev; // 1
+ u8 kernel_version_min; // 2
+ u8 kernel_version_maj; // 3
+ u32 update_flag; // 4
+ u64 ns_tid; // 8
+ u32 sys_core_ver; // 10
+ u8 unit_info; // 14
+ u8 boot_firm; // 15
+ u8 prev_firm; // 16
+ INSERT_PADDING_BYTES(0x1); // 17
+ u32 ctr_sdk_ver; // 18
+ INSERT_PADDING_BYTES(0x30 - 0x1C); // 1C
+ u32 app_mem_type; // 30
+ INSERT_PADDING_BYTES(0x40 - 0x34); // 34
+ u32 app_mem_alloc; // 40
+ u32 sys_mem_alloc; // 44
+ u32 base_mem_alloc; // 48
+ INSERT_PADDING_BYTES(0x60 - 0x4C); // 4C
+ u8 firm_unk; // 60
+ u8 firm_version_rev; // 61
+ u8 firm_version_min; // 62
+ u8 firm_version_maj; // 63
+ u32 firm_sys_core_ver; // 64
+ u32 firm_ctr_sdk_ver; // 68
+ INSERT_PADDING_BYTES(0x1000 - 0x6C); // 6C
+};
-template <typename T>
-inline void Read(T &var, const u32 addr) {
- switch (addr) {
-
- // Bit 0 set for Retail
- case UNITINFO:
- var = 0x00000001;
- break;
-
- // Set app memory size to 64MB?
- case APPMEMALLOC:
- var = 0x04000000;
- break;
+static_assert(sizeof(ConfigMemDef) == Memory::CONFIG_MEMORY_SIZE, "Config Memory structure size is wrong");
- // Unknown - normally set to: 0x08000000 - (APPMEMALLOC + *0x1FF80048)
- // (Total FCRAM size - APPMEMALLOC - *0x1FF80048)
- case 0x1FF80044:
- var = 0x08000000 - (0x04000000 + 0x1400000);
- break;
+static ConfigMemDef config_mem;
- // Unknown - normally set to: 0x1400000 (20MB)
- case 0x1FF80048:
- var = 0x1400000;
- break;
-
- default:
- LOG_ERROR(Kernel, "unknown addr=0x%08X", addr);
- }
+// Reads a value of type T out of the emulated configuration memory block.
+//   var:  receives the value that was read.
+//   addr: address of the value; offsets are taken relative to Memory::CONFIG_MEMORY_VADDR.
+template <typename T>
+inline void Read(T &var, const u32 addr) {
+    u32 offset = addr - Memory::CONFIG_MEMORY_VADDR;
+    // The whole access has to fit inside the block, not just its first byte;
+    // otherwise a multi-byte read near the end would run past config_mem.
+    ASSERT(offset <= Memory::CONFIG_MEMORY_SIZE - sizeof(T));
+    var = *(reinterpret_cast<T*>(((uintptr_t)&config_mem) + offset));
}
// Explicitly instantiate template functions because we aren't defining this in the header:
@@ -66,5 +60,21 @@ template void Read<u32>(u32 &var, const u32 addr);
template void Read<u16>(u16 &var, const u32 addr);
template void Read<u8>(u8 &var, const u32 addr);
+// Fills the emulated configuration memory with its default values.
+// Members that are not assigned here keep the zero value that the static
+// config_mem object starts out with.
+void Init() {
+    config_mem.update_flag = 0; // No update
+    config_mem.sys_core_ver = 0x2;
+    config_mem.unit_info = 0x1; // Bit 0 set for Retail
+    config_mem.prev_firm = 0;
+    config_mem.app_mem_type = 0; // Default app mem type
+    config_mem.app_mem_alloc = 0x04000000; // Default app memory size is 64MB
+    config_mem.base_mem_alloc = 0x01400000; // Default base memory is 20MB
+    // System memory is what remains of FCRAM after the app and base allocations.
+    config_mem.sys_mem_alloc = Memory::FCRAM_SIZE - (config_mem.app_mem_alloc + config_mem.base_mem_alloc);
+    config_mem.firm_unk = 0;
+    config_mem.firm_version_rev = 0;
+    config_mem.firm_version_min = 0x40;
+    config_mem.firm_version_maj = 0x2;
+    config_mem.firm_sys_core_ver = 0x2;
+}
} // namespace
diff --git a/src/core/hle/config_mem.h b/src/core/hle/config_mem.h
index 3975af18..94853901 100644
--- a/src/core/hle/config_mem.h
+++ b/src/core/hle/config_mem.h
@@ -18,4 +18,6 @@ namespace ConfigMem {
template <typename T>
void Read(T &var, const u32 addr);
+void Init();
+
} // namespace
diff --git a/src/core/hle/hle.cpp b/src/core/hle/hle.cpp
index 529133ca..b0066e15 100644
--- a/src/core/hle/hle.cpp
+++ b/src/core/hle/hle.cpp
@@ -7,6 +7,7 @@
#include "core/arm/arm_interface.h"
#include "core/mem_map.h"
#include "core/hle/hle.h"
+#include "core/hle/config_mem.h"
#include "core/hle/shared_page.h"
#include "core/hle/kernel/thread.h"
#include "core/hle/service/service.h"
@@ -75,6 +76,7 @@ void Init() {
RegisterAllModules();
+ ConfigMem::Init();
SharedPage::Init();
LOG_DEBUG(Kernel, "initialized OK");
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index 31e61391..c23cfa3c 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -368,28 +368,28 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
case CommandId::SET_MEMORY_FILL:
{
auto& params = command.memory_fill;
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_start)),
- Memory::VirtualToPhysicalAddress(params.start1) >> 3);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)),
- Memory::VirtualToPhysicalAddress(params.end1) >> 3);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].size)), params.end1 - params.start1);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value)), params.value1);
-
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)),
- Memory::VirtualToPhysicalAddress(params.start2) >> 3);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)),
- Memory::VirtualToPhysicalAddress(params.end2) >> 3);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].size)), params.end2 - params.start2);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value)), params.value2);
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_start)),
+ Memory::VirtualToPhysicalAddress(params.start1) >> 3);
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)),
+ Memory::VirtualToPhysicalAddress(params.end1) >> 3);
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value_32bit)), params.value1);
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].control)), params.control1);
+
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)),
+ Memory::VirtualToPhysicalAddress(params.start2) >> 3);
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)),
+ Memory::VirtualToPhysicalAddress(params.end2) >> 3);
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value_32bit)), params.value2);
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].control)), params.control2);
break;
}
case CommandId::SET_DISPLAY_TRANSFER:
{
auto& params = command.image_copy;
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size);
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size);
@@ -402,9 +402,9 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
case CommandId::SET_TEXTURE_COPY:
{
auto& params = command.image_copy;
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
- WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
+ WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size);
WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size);
diff --git a/src/core/hle/service/gsp_gpu.h b/src/core/hle/service/gsp_gpu.h
index 65abb194..a435d418 100644
--- a/src/core/hle/service/gsp_gpu.h
+++ b/src/core/hle/service/gsp_gpu.h
@@ -109,9 +109,13 @@ struct Command {
u32 start1;
u32 value1;
u32 end1;
+
u32 start2;
u32 value2;
u32 end2;
+
+ u16 control1;
+ u16 control2;
} memory_fill;
struct {
diff --git a/src/core/hle/service/service.cpp b/src/core/hle/service/service.cpp
index e0979ea5..5dce8068 100644
--- a/src/core/hle/service/service.cpp
+++ b/src/core/hle/service/service.cpp
@@ -71,6 +71,7 @@ static void AddService(Interface* interface) {
/// Initialize ServiceManager
void Init() {
AddNamedPort(new SRV::Interface);
+ AddNamedPort(new ERR_F::Interface);
AddService(new AC_U::Interface);
AddService(new ACT_U::Interface);
@@ -90,7 +91,6 @@ void Init() {
AddService(new CFG_U::Interface);
AddService(new CSND_SND::Interface);
AddService(new DSP_DSP::Interface);
- AddService(new ERR_F::Interface);
AddService(new FRD_A::Interface);
AddService(new FRD_U::Interface);
AddService(new FS::FSUserInterface);
diff --git a/src/core/hle/shared_page.cpp b/src/core/hle/shared_page.cpp
index f0726ef0..568dad68 100644
--- a/src/core/hle/shared_page.cpp
+++ b/src/core/hle/shared_page.cpp
@@ -3,6 +3,7 @@
// Refer to the license.txt file included.
#include "common/common_types.h"
+#include "common/common_funcs.h"
#include "core/core.h"
#include "core/mem_map.h"
@@ -13,13 +14,6 @@
namespace SharedPage {
-// helper macro to properly align structure members.
-// Calling INSERT_PADDING_BYTES will add a new member variable with a name like "pad121",
-// depending on the current source line to make sure variable names are unique.
-#define INSERT_PADDING_BYTES_HELPER1(x, y) x ## y
-#define INSERT_PADDING_BYTES_HELPER2(x, y) INSERT_PADDING_BYTES_HELPER1(x, y)
-#define INSERT_PADDING_BYTES(num_words) u8 INSERT_PADDING_BYTES_HELPER2(pad, __LINE__)[(num_words)]
-
// see http://3dbrew.org/wiki/Configuration_Memory#Shared_Memory_Page_For_ARM11_Processes
#pragma pack(1)
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index aad0e5d0..bd7d92cd 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -67,23 +67,38 @@ inline void Write(u32 addr, const T data) {
switch (index) {
// Memory fills are triggered once the fill value is written.
- // NOTE: This is not verified.
- case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].value, 0x00004 + 0x3):
- case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].value, 0x00008 + 0x3):
+ case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].trigger, 0x00004 + 0x3):
+ case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].trigger, 0x00008 + 0x3):
{
- const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].value));
- const auto& config = g_regs.memory_fill_config[is_second_filler];
-
- // TODO: Not sure if this check should be done at GSP level instead
- if (config.address_start) {
- // TODO: Not sure if this algorithm is correct, particularly because it doesn't use the size member at all
- u32* start = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress()));
- u32* end = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress()));
- for (u32* ptr = start; ptr < end; ++ptr)
- *ptr = bswap32(config.value); // TODO: This is just a workaround to missing framebuffer format emulation
+ const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].trigger));
+ auto& config = g_regs.memory_fill_config[is_second_filler];
+
+ if (config.address_start && config.trigger) {
+ u8* start = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress()));
+ u8* end = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress()));
+
+ if (config.fill_24bit) {
+ // fill with 24-bit values
+ for (u8* ptr = start; ptr < end; ptr += 3) {
+ ptr[0] = config.value_24bit_b;
+ ptr[1] = config.value_24bit_g;
+ ptr[2] = config.value_24bit_r;
+ }
+ } else if (config.fill_32bit) {
+ // fill with 32-bit values
+ for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr)
+ *ptr = config.value_32bit;
+ } else {
+ // fill with 16-bit values
+ for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr)
+ *ptr = config.value_16bit;
+ }
LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress());
+ config.trigger = 0;
+ config.finished = 1;
+
if (!is_second_filler) {
GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0);
} else {
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index 7c3a17ee..df9aa0d7 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -53,6 +53,7 @@ struct Regs {
"Structure size and register block length don't match")
#endif
+ // All of those formats are described in reverse byte order, since the 3DS is little-endian.
enum class PixelFormat : u32 {
RGBA8 = 0,
RGB8 = 1,
@@ -61,13 +62,57 @@ struct Regs {
RGBA4 = 4,
};
+    /**
+     * Returns the number of bytes per pixel.
+     * @param format Framebuffer pixel format to query.
+     * @return 4 for RGBA8, 3 for RGB8, 2 for RGB565, RGB5A1 and RGBA4.
+     *         Any other value goes through UNIMPLEMENTED(); 0 is returned if that call returns.
+     */
+    static int BytesPerPixel(PixelFormat format) {
+        switch (format) {
+        case PixelFormat::RGBA8:
+            return 4;
+        case PixelFormat::RGB8:
+            return 3;
+        case PixelFormat::RGB565:
+        case PixelFormat::RGB5A1:
+        case PixelFormat::RGBA4:
+            return 2;
+        default:
+            UNIMPLEMENTED();
+            // Keep every path of this non-void function returning a value.
+            return 0;
+        }
+    }
+
INSERT_PADDING_WORDS(0x4);
struct {
u32 address_start;
- u32 address_end; // ?
- u32 size;
- u32 value; // ?
+ u32 address_end;
+
+ union {
+ u32 value_32bit;
+
+ BitField<0, 16, u32> value_16bit;
+
+ // TODO: Verify component order
+ BitField< 0, 8, u32> value_24bit_r;
+ BitField< 8, 8, u32> value_24bit_g;
+ BitField<16, 8, u32> value_24bit_b;
+ };
+
+        // Control/status word of one memory fill unit. All fields below are views
+        // onto the same 32-bit register value.
+        union {
+            u32 control;
+
+            // Setting this field to 1 triggers the memory fill.
+            // This field also acts as a status flag, and gets reset to 0 upon completion.
+            BitField<0, 1, u32> trigger;
+
+            // Set to 1 upon completion. This must be a different bit from the trigger:
+            // if both shared bit 0, writing finished = 1 right after trigger = 0 would
+            // set the trigger bit again.
+            BitField<1, 1, u32> finished;
+
+            // 0: fill with 16- or 32-bit wide values; 1: fill with 24-bit wide values
+            BitField<8, 1, u32> fill_24bit;
+
+            // 0: fill with 16-bit wide values; 1: fill with 32-bit wide values
+            BitField<9, 1, u32> fill_32bit;
+        };
inline u32 GetStartAddress() const {
return DecodeAddressRegister(address_start);
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 1744066b..ba3876a7 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -15,30 +15,18 @@ namespace Clipper {
struct ClippingEdge {
public:
- enum Type {
- POS_X = 0,
- NEG_X = 1,
- POS_Y = 2,
- NEG_Y = 3,
- POS_Z = 4,
- NEG_Z = 5,
- };
-
- ClippingEdge(Type type, float24 position) : type(type), pos(position) {}
+ ClippingEdge(Math::Vec4<float24> coeffs,
+ Math::Vec4<float24> bias = Math::Vec4<float24>(float24::FromFloat32(0),
+ float24::FromFloat32(0),
+ float24::FromFloat32(0),
+ float24::FromFloat32(0)))
+ : coeffs(coeffs),
+ bias(bias)
+ {
+ }
bool IsInside(const OutputVertex& vertex) const {
- switch (type) {
- case POS_X: return vertex.pos.x <= pos * vertex.pos.w;
- case NEG_X: return vertex.pos.x >= pos * vertex.pos.w;
- case POS_Y: return vertex.pos.y <= pos * vertex.pos.w;
- case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w;
-
- // TODO: Check z compares ... should be 0..1 instead?
- case POS_Z: return vertex.pos.z <= pos * vertex.pos.w;
-
- default:
- case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w;
- }
+ return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
}
bool IsOutSide(const OutputVertex& vertex) const {
@@ -46,31 +34,17 @@ public:
}
OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
- auto dotpr = [this](const OutputVertex& vtx) {
- switch (type) {
- case POS_X: return vtx.pos.x - vtx.pos.w;
- case NEG_X: return -vtx.pos.x - vtx.pos.w;
- case POS_Y: return vtx.pos.y - vtx.pos.w;
- case NEG_Y: return -vtx.pos.y - vtx.pos.w;
-
- // TODO: Verify z clipping
- case POS_Z: return vtx.pos.z - vtx.pos.w;
-
- default:
- case NEG_Z: return -vtx.pos.w;
- }
- };
-
- float24 dp = dotpr(v0);
- float24 dp_prev = dotpr(v1);
+ float24 dp = Math::Dot(v0.pos + bias, coeffs);
+ float24 dp_prev = Math::Dot(v1.pos + bias, coeffs);
float24 factor = dp_prev / (dp_prev - dp);
return OutputVertex::Lerp(factor, v0, v1);
}
private:
- Type type;
float24 pos;
+ Math::Vec4<float24> coeffs;
+ Math::Vec4<float24> bias;
};
static void InitScreenCoordinates(OutputVertex& vtx)
@@ -98,10 +72,9 @@ static void InitScreenCoordinates(OutputVertex& vtx)
vtx.tc2 *= inv_w;
vtx.pos.w = inv_w;
- // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
- vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
+ vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale;
}
void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
@@ -117,14 +90,29 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
auto* output_list = &buffer_a;
auto* input_list = &buffer_b;
+ // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
+ // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
+ // epsilon possible within float24 accuracy.
+ static const float24 EPSILON = float24::FromFloat32(0.00001);
+ static const float24 f0 = float24::FromFloat32(0.0);
+ static const float24 f1 = float24::FromFloat32(1.0);
+ static const std::array<ClippingEdge, 7> clipping_edges = {{
+ { Math::MakeVec( f1, f0, f0, -f1) }, // x = +w
+ { Math::MakeVec(-f1, f0, f0, -f1) }, // x = -w
+ { Math::MakeVec( f0, f1, f0, -f1) }, // y = +w
+ { Math::MakeVec( f0, -f1, f0, -f1) }, // y = -w
+ { Math::MakeVec( f0, f0, f1, f0) }, // z = 0
+ { Math::MakeVec( f0, f0, -f1, -f1) }, // z = -w
+ { Math::MakeVec( f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON) }, // w = EPSILON
+ }};
+
+ // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
+ // drop the whole primitive instead of clipping the primitive properly. We should test if
+ // this happens on the 3DS, too.
+
// Simple implementation of the Sutherland-Hodgman clipping algorithm.
// TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
- for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)),
- ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)),
- ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)),
- ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)),
- ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
- ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
+ for (auto edge : clipping_edges) {
std::swap(input_list, output_list);
output_list->clear();
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 0d9f4ba6..586ad62b 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -2,6 +2,8 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <boost/range/algorithm/fill.hpp>
+
#include "clipper.h"
#include "command_processor.h"
#include "math.h"
@@ -23,10 +25,6 @@ static int float_regs_counter = 0;
static u32 uniform_write_buffer[4];
-// Used for VSLoadProgramData and VSLoadSwizzleData
-static u32 vs_binary_write_offset = 0;
-static u32 vs_swizzle_write_offset = 0;
-
static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
if (id >= registers.NumIds())
@@ -65,10 +63,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
// Information about internal vertex attributes
u32 vertex_attribute_sources[16];
- std::fill(vertex_attribute_sources, &vertex_attribute_sources[16], 0xdeadbeef);
+ boost::fill(vertex_attribute_sources, 0xdeadbeef);
u32 vertex_attribute_strides[16];
u32 vertex_attribute_formats[16];
- u32 vertex_attribute_elements[16];
+
+ // HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below.
+ // This is one of the hacks required to deal with uninitialized vertex attributes.
+ // TODO: Fix this properly.
+ u32 vertex_attribute_elements[16] = {};
u32 vertex_attribute_element_size[16];
// Setup attribute data from loaders
@@ -252,11 +254,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
break;
}
- // Seems to be used to reset the write pointer for VSLoadProgramData
- case PICA_REG_INDEX(vs_program.begin_load):
- vs_binary_write_offset = 0;
- break;
-
// Load shader program code
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
@@ -267,16 +264,11 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2):
case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3):
{
- VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value);
- vs_binary_write_offset++;
+ VertexShader::SubmitShaderMemoryChange(registers.vs_program.offset, value);
+ registers.vs_program.offset++;
break;
}
- // Seems to be used to reset the write pointer for VSLoadSwizzleData
- case PICA_REG_INDEX(vs_swizzle_patterns.begin_load):
- vs_swizzle_write_offset = 0;
- break;
-
// Load swizzle pattern data
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6):
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7):
@@ -287,8 +279,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
{
- VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value);
- vs_swizzle_write_offset++;
+ VertexShader::SubmitSwizzleDataChange(registers.vs_swizzle_patterns.offset, value);
+ registers.vs_swizzle_patterns.offset++;
break;
}
diff --git a/src/video_core/math.h b/src/video_core/math.h
index c176b225..f9a82265 100644
--- a/src/video_core/math.h
+++ b/src/video_core/math.h
@@ -631,7 +631,7 @@ static inline Vec4<T> MakeVec(const Vec3<T>& xyz, const T& w)
}
template<typename T>
-static inline Vec4<T> MakeVec(const T& x, const Vec2<T>& yzw)
+static inline Vec4<T> MakeVec(const T& x, const Vec3<T>& yzw)
{
return MakeVec(x, yzw[0], yzw[1], yzw[2]);
}
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 9c1a12dc..e4a5ef78 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -118,8 +118,9 @@ struct Regs {
struct TextureConfig {
enum WrapMode : u32 {
- ClampToEdge = 0,
- Repeat = 2,
+ ClampToEdge = 0,
+ Repeat = 2,
+ MirroredRepeat = 3,
};
INSERT_PADDING_WORDS(0x1);
@@ -131,7 +132,7 @@ struct Regs {
union {
BitField< 8, 2, WrapMode> wrap_s;
- BitField<11, 2, WrapMode> wrap_t;
+ BitField<12, 2, WrapMode> wrap_t;
};
INSERT_PADDING_WORDS(0x1);
@@ -223,6 +224,8 @@ struct Regs {
struct TevStageConfig {
enum class Source : u32 {
PrimaryColor = 0x0,
+ PrimaryFragmentColor = 0x1,
+
Texture0 = 0x3,
Texture1 = 0x4,
Texture2 = 0x5,
@@ -265,6 +268,9 @@ struct Regs {
AddSigned = 3,
Lerp = 4,
Subtract = 5,
+
+ MultiplyThenAdd = 8,
+ AddThenMultiply = 9,
};
union {
@@ -337,7 +343,7 @@ struct Regs {
};
union {
- enum BlendEquation : u32 {
+ enum class BlendEquation : u32 {
Add = 0,
Subtract = 1,
ReverseSubtract = 2,
@@ -421,7 +427,7 @@ struct Regs {
INSERT_PADDING_WORDS(0x6);
u32 depth_format;
- u32 color_format;
+ BitField<16, 3, u32> color_format;
INSERT_PADDING_WORDS(0x4);
@@ -678,7 +684,9 @@ struct Regs {
INSERT_PADDING_WORDS(0x2);
struct {
- u32 begin_load;
+ // Offset of the next instruction to write code to.
+ // Incremented with each instruction write.
+ u32 offset;
// Writing to these registers sets the "current" word in the shader program.
// TODO: It's not clear how the hardware stores what the "current" word is.
@@ -690,7 +698,9 @@ struct Regs {
// This register group is used to load an internal table of swizzling patterns,
// which are indexed by each shader instruction to specify vector component swizzling.
struct {
- u32 begin_load;
+ // Offset of the next swizzle pattern to write code to.
+ // Incremented with each instruction write.
+ u32 offset;
// Writing to these registers sets the "current" swizzle pattern in the table.
// TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 3faa1015..94873f40 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -5,6 +5,7 @@
#include <algorithm>
#include "common/common_types.h"
+#include "common/math_util.h"
#include "math.h"
#include "pica.h"
@@ -20,16 +21,31 @@ namespace Rasterizer {
static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) { // Stores "color" at pixel (x, y) of the color buffer; only the RGBA8 format is handled.
const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
- u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
- // Assuming RGBA8 format until actual framebuffer format handling is implemented
- *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
+ // Similarly to textures, the render framebuffer is laid out from bottom to top, too.
+ // NOTE: The framebuffer height register contains the actual FB height minus one.
+ y = (registers.framebuffer.height - y);
+
+ switch (registers.framebuffer.color_format) {
+ case registers.framebuffer.RGBA8:
+ {
+ u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b(); // packed as A:R:G:B from the most to the least significant byte
+ *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
+ break;
+ }
+
+ default:
+ LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format.Value()); // pass the raw value, not the BitField object, through the printf-style varargs
+ UNIMPLEMENTED();
+ }
}
static const Math::Vec4<u8> GetPixel(int x, int y) {
const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
+ y = (registers.framebuffer.height - y);
+
u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
Math::Vec4<u8> ret;
ret.a() = value >> 24;
@@ -43,6 +59,8 @@ static u32 GetDepth(int x, int y) {
const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
+ y = (registers.framebuffer.height - y);
+
// Assuming 16-bit depth buffer format until actual format handling is implemented
return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
}
@@ -51,6 +69,8 @@ static void SetDepth(int x, int y, u16 value) {
const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
+ y = (registers.framebuffer.height - y);
+
// Assuming 16-bit depth buffer format until actual format handling is implemented
*(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
}
@@ -90,30 +110,43 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
return Math::Cross(vec1, vec2).z;
};
-void ProcessTriangle(const VertexShader::OutputVertex& v0,
- const VertexShader::OutputVertex& v1,
- const VertexShader::OutputVertex& v2)
+/**
+ * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
+ * culling via recursion.
+ */
+static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
+ const VertexShader::OutputVertex& v1,
+ const VertexShader::OutputVertex& v2,
+ bool reversed = false)
{
// vertex positions in rasterizer coordinates
- auto FloatToFix = [](float24 flt) {
- return Fix12P4(static_cast<unsigned short>(flt.ToFloat32() * 16.0f));
- };
- auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
- return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
- };
+ static auto FloatToFix = [](float24 flt) {
+ // TODO: Rounding here is necessary to prevent garbage pixels at
+ // triangle borders. Is that the correct solution, though?
+ return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f)));
+ };
+ static auto ScreenToRasterizerCoordinates = [](const Math::Vec3<float24>& vec) {
+ return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
+ };
Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
ScreenToRasterizerCoordinates(v1.screenpos),
ScreenToRasterizerCoordinates(v2.screenpos) };
- if (registers.cull_mode == Regs::CullMode::KeepClockWise) {
- // Reverse vertex order and use the CCW code path.
- std::swap(vtxpos[1], vtxpos[2]);
- }
+ if (registers.cull_mode == Regs::CullMode::KeepAll) {
+ // Make sure we always end up with a triangle wound counter-clockwise
+ if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
+ ProcessTriangleInternal(v0, v2, v1, true);
+ return;
+ }
+ } else {
+ if (!reversed && registers.cull_mode == Regs::CullMode::KeepClockWise) {
+ // Reverse vertex order and use the CCW code path.
+ ProcessTriangleInternal(v0, v2, v1, true);
+ return;
+ }
- if (registers.cull_mode != Regs::CullMode::KeepAll) {
// Cull away triangles which are wound clockwise.
- // TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll
if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
return;
}
@@ -155,9 +188,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
auto textures = registers.GetTextures();
auto tev_stages = registers.GetTevStages();
+ // Enter rasterization loop, starting at the center of the topleft bounding box corner.
// TODO: Not sure if looping through x first might be faster
- for (u16 y = min_y; y < max_y; y += 0x10) {
- for (u16 x = min_x; x < max_x; x += 0x10) {
+ for (u16 y = min_y + 8; y < max_y; y += 0x10) {
+ for (u16 x = min_x + 8; x < max_x; x += 0x10) {
// Calculate the barycentric coordinates w0, w1 and w2
int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
@@ -220,7 +254,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
- auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
+ static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
switch (mode) {
case Regs::TextureConfig::ClampToEdge:
val = std::max(val, 0);
@@ -228,7 +262,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
return val;
case Regs::TextureConfig::Repeat:
- return (int)(((unsigned)val) % size);
+ return (int)((unsigned)val % size);
+
+ case Regs::TextureConfig::MirroredRepeat:
+ {
+ unsigned coord = (unsigned)val % (2 * size); // fold into [0, 2 * size); a distinct name avoids shadowing the "val" parameter with an uninitialized local
+ if (coord >= size)
+ coord = 2 * size - 1 - coord; // reflect the upper half back into [0, size)
+ return (int)coord;
+ }
default:
LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode);
@@ -236,6 +278,10 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
return 0;
}
};
+
+ // Textures are laid out from bottom to top, hence we invert the t coordinate.
+ // NOTE: This may not be the right place for the inversion.
+ // TODO: Check if this applies to ETC textures, too.
s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
@@ -262,7 +308,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
auto GetSource = [&](Source source) -> Math::Vec4<u8> {
switch (source) {
+ // TODO: What's the difference between these two?
case Source::PrimaryColor:
+ case Source::PrimaryFragmentColor:
return primary_color;
case Source::Texture0:
@@ -378,6 +426,25 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
return result.Cast<u8>();
}
+ case Operation::MultiplyThenAdd:
+ {
+ auto result = (input[0] * input[1] + 255 * input[2].Cast<int>()) / 255;
+ result.r() = std::min(255, result.r());
+ result.g() = std::min(255, result.g());
+ result.b() = std::min(255, result.b());
+ return result.Cast<u8>();
+ }
+
+ case Operation::AddThenMultiply:
+ {
+ auto result = input[0] + input[1];
+ result.r() = std::min(255, result.r());
+ result.g() = std::min(255, result.g());
+ result.b() = std::min(255, result.b());
+ result = (result * input[2].Cast<int>()) / 255;
+ return result.Cast<u8>();
+ }
+
default:
LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
UNIMPLEMENTED();
@@ -402,6 +469,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
case Operation::Subtract:
return std::max(0, (int)input[0] - (int)input[1]);
+ case Operation::MultiplyThenAdd:
+ return std::min(255, (input[0] * input[1] + 255 * input[2]) / 255);
+
+ case Operation::AddThenMultiply:
+ return (std::min(255, (input[0] + input[1])) * input[2]) / 255;
+
default:
LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
UNIMPLEMENTED();
@@ -475,7 +548,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
// TODO: Does depth indeed only get written even if depth testing is enabled?
if (registers.output_merger.depth_test_enable) {
- u16 z = (u16)(-(v0.screenpos[2].ToFloat32() * w0 +
+ u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 +
v1.screenpos[2].ToFloat32() * w1 +
v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
u16 ref_z = GetDepth(x >> 4, y >> 4);
@@ -524,6 +597,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
}
auto dest = GetPixel(x >> 4, y >> 4);
+ Math::Vec4<u8> blend_output = combiner_output;
if (registers.output_merger.alphablend_enable) {
auto params = registers.output_merger.alpha_blending;
@@ -574,7 +648,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
default:
LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
- exit(0);
+ UNIMPLEMENTED();
break;
}
};
@@ -607,86 +681,78 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
default:
LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
- exit(0);
+ UNIMPLEMENTED();
+ break;
+ }
+ };
+
+ using BlendEquation = decltype(params)::BlendEquation;
+ static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor,
+ const Math::Vec4<u8>& dest, const Math::Vec4<u8>& destfactor,
+ BlendEquation equation) {
+ Math::Vec4<int> result;
+
+ auto src_result = (src * srcfactor).Cast<int>();
+ auto dst_result = (dest * destfactor).Cast<int>();
+
+ switch (equation) {
+ case BlendEquation::Add:
+ result = (src_result + dst_result) / 255;
break;
+
+ case BlendEquation::Subtract:
+ result = (src_result - dst_result) / 255;
+ break;
+
+ case BlendEquation::ReverseSubtract:
+ result = (dst_result - src_result) / 255;
+ break;
+
+ // TODO: How do these two actually work?
+ // OpenGL doesn't include the blend factors in the min/max computations,
+ // but is this what the 3DS actually does?
+ case BlendEquation::Min:
+ result.r() = std::min(src.r(), dest.r());
+ result.g() = std::min(src.g(), dest.g());
+ result.b() = std::min(src.b(), dest.b());
+ result.a() = std::min(src.a(), dest.a());
+ break;
+
+ case BlendEquation::Max:
+ result.r() = std::max(src.r(), dest.r());
+ result.g() = std::max(src.g(), dest.g());
+ result.b() = std::max(src.b(), dest.b());
+ result.a() = std::max(src.a(), dest.a());
+ break;
+
+ default:
+ LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", equation);
+ UNIMPLEMENTED();
}
+
+ return Math::Vec4<u8>(MathUtil::Clamp(result.r(), 0, 255),
+ MathUtil::Clamp(result.g(), 0, 255),
+ MathUtil::Clamp(result.b(), 0, 255),
+ MathUtil::Clamp(result.a(), 0, 255));
};
auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
LookupFactorA(params.factor_source_a));
auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
LookupFactorA(params.factor_dest_a));
-
- auto src_result = (combiner_output * srcfactor).Cast<int>();
- auto dst_result = (dest * dstfactor).Cast<int>();
-
- switch (params.blend_equation_rgb) {
- case params.Add:
- {
- auto result = (src_result + dst_result) / 255;
- result.r() = std::min(255, result.r());
- result.g() = std::min(255, result.g());
- result.b() = std::min(255, result.b());
- combiner_output = result.Cast<u8>();
- break;
- }
-
- case params.Subtract:
- {
- auto result = (src_result - dst_result) / 255;
- result.r() = std::max(0, result.r());
- result.g() = std::max(0, result.g());
- result.b() = std::max(0, result.b());
- combiner_output = result.Cast<u8>();
- break;
- }
-
- case params.ReverseSubtract:
- {
- auto result = (dst_result - src_result) / 255;
- result.r() = std::max(0, result.r());
- result.g() = std::max(0, result.g());
- result.b() = std::max(0, result.b());
- combiner_output = result.Cast<u8>();
- break;
- }
-
- case params.Min:
- {
- // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
- Math::Vec4<int> result;
- result.r() = std::min(combiner_output.r(),dest.r());
- result.g() = std::min(combiner_output.g(),dest.g());
- result.b() = std::min(combiner_output.b(),dest.b());
- combiner_output = result.Cast<u8>();
- break;
- }
-
- case params.Max:
- {
- // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
- Math::Vec4<int> result;
- result.r() = std::max(combiner_output.r(),dest.r());
- result.g() = std::max(combiner_output.g(),dest.g());
- result.b() = std::max(combiner_output.b(),dest.b());
- combiner_output = result.Cast<u8>();
- break;
- }
- default:
- LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value());
- exit(0);
- }
+ blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb);
+ blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a();
} else {
LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
- exit(0);
+ UNIMPLEMENTED();
}
const Math::Vec4<u8> result = {
- registers.output_merger.red_enable ? combiner_output.r() : dest.r(),
- registers.output_merger.green_enable ? combiner_output.g() : dest.g(),
- registers.output_merger.blue_enable ? combiner_output.b() : dest.b(),
- registers.output_merger.alpha_enable ? combiner_output.a() : dest.a()
+ registers.output_merger.red_enable ? blend_output.r() : dest.r(),
+ registers.output_merger.green_enable ? blend_output.g() : dest.g(),
+ registers.output_merger.blue_enable ? blend_output.b() : dest.b(),
+ registers.output_merger.alpha_enable ? blend_output.a() : dest.a()
};
DrawPixel(x >> 4, y >> 4, result);
@@ -694,6 +760,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
}
}
+void ProcessTriangle(const VertexShader::OutputVertex& v0, // Public entry point: forwards the three vertices unchanged to
+ const VertexShader::OutputVertex& v1, // ProcessTriangleInternal, leaving its "reversed" parameter at
+ const VertexShader::OutputVertex& v2) { // the default value (false).
+ ProcessTriangleInternal(v0, v1, v2);
+}
+
} // namespace Rasterizer
} // namespace Pica
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 735c0cf4..27269517 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -61,15 +61,13 @@ void RendererOpenGL::SwapBuffers() {
for(int i : {0, 1}) {
const auto& framebuffer = GPU::g_regs.framebuffer_config[i];
- if (textures[i].width != (GLsizei)framebuffer.width || textures[i].height != (GLsizei)framebuffer.height) {
+ if (textures[i].width != (GLsizei)framebuffer.width ||
+ textures[i].height != (GLsizei)framebuffer.height ||
+ textures[i].format != framebuffer.color_format) {
// Reallocate texture if the framebuffer size has changed.
// This is expected to not happen very often and hence should not be a
// performance problem.
- glBindTexture(GL_TEXTURE_2D, textures[i].handle);
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, framebuffer.width, framebuffer.height, 0,
- GL_BGR, GL_UNSIGNED_BYTE, nullptr);
- textures[i].width = framebuffer.width;
- textures[i].height = framebuffer.height;
+ ConfigureFramebufferTexture(textures[i], framebuffer);
}
LoadFBToActiveGLTexture(GPU::g_regs.framebuffer_config[i], textures[i]);
@@ -98,13 +96,12 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig&
const u8* framebuffer_data = Memory::GetPointer(framebuffer_vaddr);
- // TODO: Handle other pixel formats
- ASSERT_MSG(framebuffer.color_format == GPU::Regs::PixelFormat::RGB8,
- "Unsupported 3DS pixel format.");
+ int bpp = GPU::Regs::BytesPerPixel(framebuffer.color_format);
+ size_t pixel_stride = framebuffer.stride / bpp;
- size_t pixel_stride = framebuffer.stride / 3;
// OpenGL only supports specifying a stride in units of pixels, not bytes, unfortunately
- ASSERT(pixel_stride * 3 == framebuffer.stride);
+ ASSERT(pixel_stride * bpp == framebuffer.stride);
+
// Ensure no bad interactions with GL_UNPACK_ALIGNMENT, which by default
// only allows rows to have a memory alignment of 4.
ASSERT(pixel_stride % 4 == 0);
@@ -118,7 +115,7 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig&
// TODO: Applications could theoretically crash Citra here by specifying too large
// framebuffer sizes. We should make sure that this cannot happen.
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, framebuffer.width, framebuffer.height,
- GL_BGR, GL_UNSIGNED_BYTE, framebuffer_data);
+ texture.gl_format, texture.gl_type, framebuffer_data);
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
@@ -171,6 +168,59 @@ void RendererOpenGL::InitOpenGLObjects() {
glBindTexture(GL_TEXTURE_2D, 0);
}
+void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, // Records the framebuffer's format and size in "texture", selects the matching
+ const GPU::Regs::FramebufferConfig& framebuffer) { // GL format/type, and (re)specifies the texture image via glTexImage2D.
+ GPU::Regs::PixelFormat format = framebuffer.color_format;
+ GLint internal_format; // assigned only inside the switch below
+
+ texture.format = format; // cached values; SwapBuffers compares them against the framebuffer config to decide when to reconfigure
+ texture.width = framebuffer.width;
+ texture.height = framebuffer.height;
+
+ switch (format) { // each case sets internal_format plus the gl_format/gl_type pair later used for uploads
+ case GPU::Regs::PixelFormat::RGBA8:
+ internal_format = GL_RGBA;
+ texture.gl_format = GL_RGBA;
+ texture.gl_type = GL_UNSIGNED_INT_8_8_8_8;
+ break;
+
+ case GPU::Regs::PixelFormat::RGB8:
+ // This pixel format uses BGR since GL_UNSIGNED_BYTE specifies byte-order, unlike every
+ // specific OpenGL type used in this function using native-endian (that is, little-endian
+ // mostly everywhere) for words or half-words.
+ // TODO: check how those behave on big-endian processors.
+ internal_format = GL_RGB;
+ texture.gl_format = GL_BGR;
+ texture.gl_type = GL_UNSIGNED_BYTE;
+ break;
+
+ case GPU::Regs::PixelFormat::RGB565:
+ internal_format = GL_RGB;
+ texture.gl_format = GL_RGB;
+ texture.gl_type = GL_UNSIGNED_SHORT_5_6_5;
+ break;
+
+ case GPU::Regs::PixelFormat::RGB5A1:
+ internal_format = GL_RGBA;
+ texture.gl_format = GL_RGBA;
+ texture.gl_type = GL_UNSIGNED_SHORT_5_5_5_1;
+ break;
+
+ case GPU::Regs::PixelFormat::RGBA4:
+ internal_format = GL_RGBA;
+ texture.gl_format = GL_RGBA;
+ texture.gl_type = GL_UNSIGNED_SHORT_4_4_4_4;
+ break;
+
+ default:
+ UNIMPLEMENTED(); // NOTE(review): if UNIMPLEMENTED() can return, internal_format is still uninitialized and gl_format/gl_type keep their old values when glTexImage2D runs below - confirm
+ }
+
+ glBindTexture(GL_TEXTURE_2D, texture.handle);
+ glTexImage2D(GL_TEXTURE_2D, 0, internal_format, texture.width, texture.height, 0, // level 0, border 0
+ texture.gl_format, texture.gl_type, nullptr); // nullptr data: no pixel data is supplied by this call
+}
+
/**
* Draws a single texture to the emulator window, rotating the texture to correct for the 3DS's LCD rotation.
*/
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index cf78c1e7..bcabab55 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -43,9 +43,14 @@ private:
GLuint handle;
GLsizei width;
GLsizei height;
+ GPU::Regs::PixelFormat format;
+ GLenum gl_format;
+ GLenum gl_type;
};
void InitOpenGLObjects();
+ static void ConfigureFramebufferTexture(TextureInfo& texture,
+ const GPU::Regs::FramebufferConfig& framebuffer);
void DrawScreens();
void DrawSingleScreenRotated(const TextureInfo& texture, float x, float y, float w, float h);
void UpdateFramerate();
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 80935a50..def868ac 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -85,8 +85,11 @@ struct VertexShaderState {
};
struct CallStackElement {
- u32 final_address;
- u32 return_address;
+ u32 final_address; // Address upon which we jump to return_address
+ u32 return_address; // Where to jump when leaving scope
+ u8 repeat_counter; // How often to repeat until this call stack element is removed
+ u8 loop_increment; // Which value to add to the loop counter after an iteration
+ // TODO: Should this be a signed value? Does it even matter?
};
// TODO: Is there a maximal size for this?
@@ -105,9 +108,14 @@ static void ProcessShaderCode(VertexShaderState& state) {
while (true) {
if (!state.call_stack.empty()) {
- if (state.program_counter - shader_memory.data() == state.call_stack.top().final_address) {
- state.program_counter = &shader_memory[state.call_stack.top().return_address];
- state.call_stack.pop();
+ auto& top = state.call_stack.top();
+ if (state.program_counter - shader_memory.data() == top.final_address) {
+ state.address_registers[2] += top.loop_increment;
+
+ if (top.repeat_counter-- == 0) {
+ state.program_counter = &shader_memory[top.return_address];
+ state.call_stack.pop();
+ }
// TODO: Is "trying again" accurate to hardware?
continue;
@@ -118,9 +126,10 @@ static void ProcessShaderCode(VertexShaderState& state) {
const Instruction& instr = *(const Instruction*)state.program_counter;
const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
- auto call = [&](VertexShaderState& state, u32 offset, u32 num_instructions, u32 return_offset) {
+ static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions,
+ u32 return_offset, u8 repeat_count, u8 loop_increment) {
state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
- state.call_stack.push({ offset + num_instructions, return_offset });
+ state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment });
};
u32 binary_offset = state.program_counter - shader_memory.data();
@@ -457,7 +466,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
call(state,
instr.flow_control.dest_offset,
instr.flow_control.num_instructions,
- binary_offset + 1);
+ binary_offset + 1, 0, 0);
break;
case Instruction::OpCode::CALLU:
@@ -465,7 +474,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
call(state,
instr.flow_control.dest_offset,
instr.flow_control.num_instructions,
- binary_offset + 1);
+ binary_offset + 1, 0, 0);
}
break;
@@ -474,7 +483,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
call(state,
instr.flow_control.dest_offset,
instr.flow_control.num_instructions,
- binary_offset + 1);
+ binary_offset + 1, 0, 0);
}
break;
@@ -486,12 +495,12 @@ static void ProcessShaderCode(VertexShaderState& state) {
call(state,
binary_offset + 1,
instr.flow_control.dest_offset - binary_offset - 1,
- instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+ instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
} else {
call(state,
instr.flow_control.dest_offset,
instr.flow_control.num_instructions,
- instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+ instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
}
break;
@@ -504,17 +513,30 @@ static void ProcessShaderCode(VertexShaderState& state) {
call(state,
binary_offset + 1,
instr.flow_control.dest_offset - binary_offset - 1,
- instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+ instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
} else {
call(state,
instr.flow_control.dest_offset,
instr.flow_control.num_instructions,
- instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+ instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
}
break;
}
+ case Instruction::OpCode::LOOP:
+ {
+ state.address_registers[2] = shader_uniforms.i[instr.flow_control.int_uniform_id].y;
+
+ call(state,
+ binary_offset + 1,
+ instr.flow_control.dest_offset - binary_offset + 1,
+ instr.flow_control.dest_offset + 1,
+ shader_uniforms.i[instr.flow_control.int_uniform_id].x,
+ shader_uniforms.i[instr.flow_control.int_uniform_id].z);
+ break;
+ }
+
default:
LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
(int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);