14 files changed, 269 insertions, 151 deletions
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index c2973fb3..315b4cc9 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -992,6 +992,14 @@ typedef struct _mcr_inst {
     unsigned int inst;
 } mcr_inst;
 
+typedef struct mcrr_inst {
+    unsigned int opcode_1;
+    unsigned int cp_num;
+    unsigned int crm;
+    unsigned int rt;
+    unsigned int rt2;
+} mcrr_inst;
+
 typedef struct _mrs_inst {
     unsigned int R;
     unsigned int Rd;
@@ -1261,11 +1269,6 @@ static get_addr_fp_t get_calc_addr_op(unsigned int inst) {
 #define CHECK_RM            (inst_cream->Rm == 15)
 #define CHECK_RS            (inst_cream->Rs == 15)
 
-#define UNIMPLEMENTED_INSTRUCTION(mnemonic) \
-    LOG_ERROR(Core_ARM11, "unimplemented instruction: %s", mnemonic); \
-    CITRA_IGNORE_EXIT(-1); \
-    return nullptr;
-
 static ARM_INST_PTR INTERPRETER_TRANSLATE(adc)(unsigned int inst, int index)
 {
     arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(adc_inst));
@@ -1871,7 +1874,26 @@ static ARM_INST_PTR INTERPRETER_TRANSLATE(mcr)(unsigned int inst, int index)
     inst_cream->inst     = inst;
     return inst_base;
 }
-static ARM_INST_PTR INTERPRETER_TRANSLATE(mcrr)(unsigned int inst, int index) { UNIMPLEMENTED_INSTRUCTION("MCRR"); }
+
+static ARM_INST_PTR INTERPRETER_TRANSLATE(mcrr)(unsigned int inst, int index)
+{
+    arm_inst* const inst_base = (arm_inst*)AllocBuffer(sizeof(arm_inst) + sizeof(mcrr_inst));
+    mcrr_inst* const inst_cream = (mcrr_inst*)inst_base->component;
+
+    inst_base->cond     = BITS(inst, 28, 31);
+    inst_base->idx      = index;
+    inst_base->br       = NON_BRANCH;
+    inst_base->load_r15 = 0;
+
+    inst_cream->crm      = BITS(inst, 0, 3);
+    inst_cream->opcode_1 = BITS(inst, 4, 7);
+    inst_cream->cp_num   = BITS(inst, 8, 11);
+    inst_cream->rt       = BITS(inst, 12, 15);
+    inst_cream->rt2      = BITS(inst, 16, 19);
+
+    return inst_base;
+}
+
 static ARM_INST_PTR INTERPRETER_TRANSLATE(mla)(unsigned int inst, int index)
 {
     arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(mla_inst));
@@ -1930,7 +1952,12 @@ static ARM_INST_PTR INTERPRETER_TRANSLATE(mrc)(unsigned int inst, int index)
     inst_cream->inst     = inst;
     return inst_base;
 }
-static ARM_INST_PTR INTERPRETER_TRANSLATE(mrrc)(unsigned int inst, int index) { UNIMPLEMENTED_INSTRUCTION("MRRC"); }
+
+static ARM_INST_PTR INTERPRETER_TRANSLATE(mrrc)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(mcrr)(inst, index);
+}
+
 static ARM_INST_PTR INTERPRETER_TRANSLATE(mrs)(unsigned int inst, int index)
 {
     arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(mrs_inst));
@@ -4754,7 +4781,24 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) {
         FETCH_INST;
         GOTO_NEXT_INST;
     }
+
     MCRR_INST:
+    {
+        // Stubbed, as the MPCore doesn't have any registers that are accessible
+        // through this instruction.
+        if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) {
+            mcrr_inst* const inst_cream = (mcrr_inst*)inst_base->component;
+
+            LOG_ERROR(Core_ARM11, "MCRR executed | Coprocessor: %u, CRm %u, opc1: %u, Rt: %u, Rt2: %u",
+                      inst_cream->cp_num, inst_cream->crm, inst_cream->opcode_1, inst_cream->rt, inst_cream->rt2);
+        }
+
+        cpu->Reg[15] += GET_INST_SIZE(cpu);
+        INC_PC(sizeof(mcrr_inst));
+        FETCH_INST;
+        GOTO_NEXT_INST;
+    }
+
     MLA_INST:
     {
         if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) {
@@ -4830,7 +4874,24 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) {
         FETCH_INST;
         GOTO_NEXT_INST;
     }
+
     MRRC_INST:
+    {
+        // Stubbed, as the MPCore doesn't have any registers that are accessible
+        // through this instruction.
+        if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) {
+            mcrr_inst* const inst_cream = (mcrr_inst*)inst_base->component;
+
+            LOG_ERROR(Core_ARM11, "MRRC executed | Coprocessor: %u, CRm %u, opc1: %u, Rt: %u, Rt2: %u",
+                      inst_cream->cp_num, inst_cream->crm, inst_cream->opcode_1, inst_cream->rt, inst_cream->rt2);
+        }
+
+        cpu->Reg[15] += GET_INST_SIZE(cpu);
+        INC_PC(sizeof(mcrr_inst));
+        FETCH_INST;
+        GOTO_NEXT_INST;
+    }
+
     MRS_INST:
     {
         if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) {
diff --git a/src/core/arm/skyeye_common/vfp/vfp_helper.h b/src/core/arm/skyeye_common/vfp/vfp_helper.h
index 6b3dae28..ccc0212a 100644
--- a/src/core/arm/skyeye_common/vfp/vfp_helper.h
+++ b/src/core/arm/skyeye_common/vfp/vfp_helper.h
@@ -35,6 +35,7 @@
 #include <cstdio>
 #include "common/common_types.h"
 #include "core/arm/skyeye_common/armdefs.h"
+#include "core/arm/skyeye_common/vfp/asm_vfp.h"
 
 #define do_div(n, base) {n/=base;}
 
@@ -236,33 +237,6 @@ struct vfp_single {
 #define vfp_single_packed_exponent(v) (((v) >> VFP_SINGLE_MANTISSA_BITS) & ((1 << VFP_SINGLE_EXPONENT_BITS) - 1))
 #define vfp_single_packed_mantissa(v) ((v) & ((1 << VFP_SINGLE_MANTISSA_BITS) - 1))
 
-// Unpack a single-precision float.  Note that this returns the magnitude
-// of the single-precision float mantissa with the 1. if necessary,
-// aligned to bit 30.
-static inline void vfp_single_unpack(vfp_single* s, s32 val)
-{
-    u32 significand;
-
-    s->sign = vfp_single_packed_sign(val) >> 16,
-    s->exponent = vfp_single_packed_exponent(val);
-
-    significand = (u32) val;
-    significand = (significand << (32 - VFP_SINGLE_MANTISSA_BITS)) >> 2;
-    if (s->exponent && s->exponent != 255)
-        significand |= 0x40000000;
-    s->significand = significand;
-}
-
-// Re-pack a single-precision float.  This assumes that the float is
-// already normalised such that the MSB is bit 30, _not_ bit 31.
-static inline s32 vfp_single_pack(vfp_single* s)
-{
-    u32 val = (s->sign << 16) +
-              (s->exponent << VFP_SINGLE_MANTISSA_BITS) +
-              (s->significand >> VFP_SINGLE_LOW_BITS);
-    return (s32)val;
-}
-
 enum : u32 {
     VFP_NUMBER     = (1 << 0),
     VFP_ZERO       = (1 << 1),
@@ -294,6 +268,39 @@ static inline int vfp_single_type(vfp_single* s)
     return type;
 }
 
+// Unpack a single-precision float.  Note that this returns the magnitude
+// of the single-precision float mantissa with the 1. if necessary,
+// aligned to bit 30.
+static inline void vfp_single_unpack(vfp_single* s, s32 val, u32* fpscr)
+{
+    s->sign = vfp_single_packed_sign(val) >> 16,
+    s->exponent = vfp_single_packed_exponent(val);
+
+    u32 significand = ((u32)val << (32 - VFP_SINGLE_MANTISSA_BITS)) >> 2;
+    if (s->exponent && s->exponent != 255)
+        significand |= 0x40000000;
+    s->significand = significand;
+
+    // If flush-to-zero mode is enabled, turn the denormal into zero.
+    // On a VFPv2 architecture, the sign of the zero is always positive.
+    if ((*fpscr & FPSCR_FLUSH_TO_ZERO) != 0 && (vfp_single_type(s) & VFP_DENORMAL) != 0) {
+        s->sign = 0;
+        s->exponent = 0;
+        s->significand = 0;
+        *fpscr |= FPSCR_IDC;
+    }
+}
+
+// Re-pack a single-precision float. This assumes that the float is
+// already normalised such that the MSB is bit 30, _not_ bit 31.
+static inline s32 vfp_single_pack(vfp_single* s)
+{
+    u32 val = (s->sign << 16) +
+              (s->exponent << VFP_SINGLE_MANTISSA_BITS) +
+              (s->significand >> VFP_SINGLE_LOW_BITS);
+    return (s32)val;
+}
+
 
 u32 vfp_single_normaliseround(ARMul_State* state, int sd, vfp_single* vs, u32 fpscr, u32 exceptions, const char* func);
 
@@ -328,24 +335,49 @@ struct vfp_double {
 #define vfp_double_packed_exponent(v) (((v) >> VFP_DOUBLE_MANTISSA_BITS) & ((1 << VFP_DOUBLE_EXPONENT_BITS) - 1))
 #define vfp_double_packed_mantissa(v) ((v) & ((1ULL << VFP_DOUBLE_MANTISSA_BITS) - 1))
 
+static inline int vfp_double_type(vfp_double* s)
+{
+    int type = VFP_NUMBER;
+    if (s->exponent == 2047) {
+        if (s->significand == 0)
+            type = VFP_INFINITY;
+        else if (s->significand & VFP_DOUBLE_SIGNIFICAND_QNAN)
+            type = VFP_QNAN;
+        else
+            type = VFP_SNAN;
+    } else if (s->exponent == 0) {
+        if (s->significand == 0)
+            type |= VFP_ZERO;
+        else
+            type |= VFP_DENORMAL;
+    }
+    return type;
+}
+
 // Unpack a double-precision float.  Note that this returns the magnitude
 // of the double-precision float mantissa with the 1. if necessary,
 // aligned to bit 62.
-static inline void vfp_double_unpack(vfp_double* s, s64 val)
+static inline void vfp_double_unpack(vfp_double* s, s64 val, u32* fpscr)
 {
-    u64 significand;
-
     s->sign = vfp_double_packed_sign(val) >> 48;
     s->exponent = vfp_double_packed_exponent(val);
 
-    significand = (u64) val;
-    significand = (significand << (64 - VFP_DOUBLE_MANTISSA_BITS)) >> 2;
+    u64 significand = ((u64)val << (64 - VFP_DOUBLE_MANTISSA_BITS)) >> 2;
     if (s->exponent && s->exponent != 2047)
         significand |= (1ULL << 62);
     s->significand = significand;
+
+    // If flush-to-zero mode is enabled, turn the denormal into zero.
+    // On a VFPv2 architecture, the sign of the zero is always positive.
+    if ((*fpscr & FPSCR_FLUSH_TO_ZERO) != 0 && (vfp_double_type(s) & VFP_DENORMAL) != 0) {
+        s->sign = 0;
+        s->exponent = 0;
+        s->significand = 0;
+        *fpscr |= FPSCR_IDC;
+    }
 }
 
-// Re-pack a double-precision float.  This assumes that the float is
+// Re-pack a double-precision float. This assumes that the float is
 // already normalised such that the MSB is bit 30, _not_ bit 31.
 static inline s64 vfp_double_pack(vfp_double* s)
 {
@@ -355,25 +387,6 @@ static inline s64 vfp_double_pack(vfp_double* s)
     return (s64)val;
 }
 
-static inline int vfp_double_type(vfp_double* s)
-{
-    int type = VFP_NUMBER;
-    if (s->exponent == 2047) {
-        if (s->significand == 0)
-            type = VFP_INFINITY;
-        else if (s->significand & VFP_DOUBLE_SIGNIFICAND_QNAN)
-            type = VFP_QNAN;
-        else
-            type = VFP_SNAN;
-    } else if (s->exponent == 0) {
-        if (s->significand == 0)
-            type |= VFP_ZERO;
-        else
-            type |= VFP_DENORMAL;
-    }
-    return type;
-}
-
 u32 vfp_estimate_sqrt_significand(u32 exponent, u32 significand);
 
 // A special flag to tell the normalisation code not to normalise.
diff --git a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
index d76d37fd..ab9fec39 100644
--- a/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
+++ b/src/core/arm/skyeye_common/vfp/vfpdouble.cpp
@@ -291,7 +291,8 @@ static u32 vfp_double_fsqrt(ARMul_State* state, int dd, int unused, int dm, u32
     vfp_double vdm, vdd, *vdp;
     int ret, tm;
 
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
+
     tm = vfp_double_type(&vdm);
     if (tm & (VFP_NAN|VFP_INFINITY)) {
         vdp = &vdd;
@@ -473,7 +474,7 @@ static u32 vfp_double_fcvts(ARMul_State* state, int sd, int unused, int dm, u32
     u32 exceptions = 0;
 
     LOG_TRACE(Core_ARM11, "In %s\n", __FUNCTION__);
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
 
     tm = vfp_double_type(&vdm);
 
@@ -543,7 +544,7 @@ static u32 vfp_double_ftoui(ARMul_State* state, int sd, int unused, int dm, u32
     int tm;
 
     LOG_TRACE(Core_ARM11, "In %s\n", __FUNCTION__);
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
 
     /*
      * Do we have a denormalised number?
@@ -624,7 +625,7 @@ static u32 vfp_double_ftosi(ARMul_State* state, int sd, int unused, int dm, u32
     int tm;
 
     LOG_TRACE(Core_ARM11, "In %s\n", __FUNCTION__);
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
     vfp_double_dump("VDM", &vdm);
 
     /*
@@ -896,11 +897,11 @@ vfp_double_multiply_accumulate(ARMul_State* state, int dd, int dn, int dm, u32 f
     struct vfp_double vdd, vdp, vdn, vdm;
     u32 exceptions;
 
-    vfp_double_unpack(&vdn, vfp_get_double(state, dn));
+    vfp_double_unpack(&vdn, vfp_get_double(state, dn), &fpscr);
     if (vdn.exponent == 0 && vdn.significand)
         vfp_double_normalise_denormal(&vdn);
 
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
     if (vdm.exponent == 0 && vdm.significand)
         vfp_double_normalise_denormal(&vdm);
 
@@ -908,7 +909,7 @@ vfp_double_multiply_accumulate(ARMul_State* state, int dd, int dn, int dm, u32 f
     if (negate & NEG_MULTIPLY)
         vdp.sign = vfp_sign_negate(vdp.sign);
 
-    vfp_double_unpack(&vdn, vfp_get_double(state, dd));
+    vfp_double_unpack(&vdn, vfp_get_double(state, dd), &fpscr);
     if (vdn.exponent == 0 && vdn.significand != 0)
         vfp_double_normalise_denormal(&vdn);
 
@@ -969,11 +970,11 @@ static u32 vfp_double_fmul(ARMul_State* state, int dd, int dn, int dm, u32 fpscr
     u32 exceptions;
 
     LOG_TRACE(Core_ARM11, "In %s\n", __FUNCTION__);
-    vfp_double_unpack(&vdn, vfp_get_double(state, dn));
+    vfp_double_unpack(&vdn, vfp_get_double(state, dn), &fpscr);
     if (vdn.exponent == 0 && vdn.significand)
         vfp_double_normalise_denormal(&vdn);
 
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
     if (vdm.exponent == 0 && vdm.significand)
         vfp_double_normalise_denormal(&vdm);
 
@@ -990,11 +991,11 @@ static u32 vfp_double_fnmul(ARMul_State* state, int dd, int dn, int dm, u32 fpsc
     u32 exceptions;
 
     LOG_TRACE(Core_ARM11, "In %s\n", __FUNCTION__);
-    vfp_double_unpack(&vdn, vfp_get_double(state, dn));
+    vfp_double_unpack(&vdn, vfp_get_double(state, dn), &fpscr);
     if (vdn.exponent == 0 && vdn.significand)
         vfp_double_normalise_denormal(&vdn);
 
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
     if (vdm.exponent == 0 && vdm.significand)
         vfp_double_normalise_denormal(&vdm);
 
@@ -1013,11 +1014,11 @@ static u32 vfp_double_fadd(ARMul_State* state, int dd, int dn, int dm, u32 fpscr
     u32 exceptions;
 
     LOG_TRACE(Core_ARM11, "In %s\n", __FUNCTION__);
-    vfp_double_unpack(&vdn, vfp_get_double(state, dn));
+    vfp_double_unpack(&vdn, vfp_get_double(state, dn), &fpscr);
     if (vdn.exponent == 0 && vdn.significand)
         vfp_double_normalise_denormal(&vdn);
 
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
     if (vdm.exponent == 0 && vdm.significand)
         vfp_double_normalise_denormal(&vdm);
 
@@ -1035,11 +1036,11 @@ static u32 vfp_double_fsub(ARMul_State* state, int dd, int dn, int dm, u32 fpscr
     u32 exceptions;
 
     LOG_TRACE(Core_ARM11, "In %s\n", __FUNCTION__);
-    vfp_double_unpack(&vdn, vfp_get_double(state, dn));
+    vfp_double_unpack(&vdn, vfp_get_double(state, dn), &fpscr);
     if (vdn.exponent == 0 && vdn.significand)
         vfp_double_normalise_denormal(&vdn);
 
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
     if (vdm.exponent == 0 && vdm.significand)
         vfp_double_normalise_denormal(&vdm);
 
@@ -1063,8 +1064,8 @@ static u32 vfp_double_fdiv(ARMul_State* state, int dd, int dn, int dm, u32 fpscr
     int tm, tn;
 
     LOG_TRACE(Core_ARM11, "In %s\n", __FUNCTION__);
-    vfp_double_unpack(&vdn, vfp_get_double(state, dn));
-    vfp_double_unpack(&vdm, vfp_get_double(state, dm));
+    vfp_double_unpack(&vdn, vfp_get_double(state, dn), &fpscr);
+    vfp_double_unpack(&vdm, vfp_get_double(state, dm), &fpscr);
 
     vdd.sign = vdn.sign ^ vdm.sign;
 
diff --git a/src/core/arm/skyeye_common/vfp/vfpsingle.cpp b/src/core/arm/skyeye_common/vfp/vfpsingle.cpp
index a78bdc43..4dfe0254 100644
--- a/src/core/arm/skyeye_common/vfp/vfpsingle.cpp
+++ b/src/core/arm/skyeye_common/vfp/vfpsingle.cpp
@@ -330,7 +330,7 @@ static u32 vfp_single_fsqrt(ARMul_State* state, int sd, int unused, s32 m, u32 f
     struct vfp_single vsm, vsd, *vsp;
     int ret, tm;
 
-    vfp_single_unpack(&vsm, m);
+    vfp_single_unpack(&vsm, m, &fpscr);
     tm = vfp_single_type(&vsm);
     if (tm & (VFP_NAN|VFP_INFINITY)) {
         vsp = &vsd;
@@ -498,7 +498,7 @@ static u32 vfp_single_fcvtd(ARMul_State* state, int dd, int unused, s32 m, u32 f
     int tm;
     u32 exceptions = 0;
 
-    vfp_single_unpack(&vsm, m);
+    vfp_single_unpack(&vsm, m, &fpscr);
 
     tm = vfp_single_type(&vsm);
 
@@ -563,7 +563,7 @@ static u32 vfp_single_ftoui(ARMul_State* state, int sd, int unused, s32 m, u32 f
     int rmode = fpscr & FPSCR_RMODE_MASK;
     int tm;
 
-    vfp_single_unpack(&vsm, m);
+    vfp_single_unpack(&vsm, m, &fpscr);
     vfp_single_dump("VSM", &vsm);
 
     /*
@@ -643,7 +643,7 @@ static u32 vfp_single_ftosi(ARMul_State* state, int sd, int unused, s32 m, u32 f
     int rmode = fpscr & FPSCR_RMODE_MASK;
     int tm;
 
-    vfp_single_unpack(&vsm, m);
+    vfp_single_unpack(&vsm, m, &fpscr);
     vfp_single_dump("VSM", &vsm);
 
     /*
@@ -925,11 +925,11 @@ vfp_single_multiply_accumulate(ARMul_State* state, int sd, int sn, s32 m, u32 fp
 
     v = vfp_get_float(state, sn);
     LOG_DEBUG(Core_ARM11, "s%u = %08x", sn, v);
-    vfp_single_unpack(&vsn, v);
+    vfp_single_unpack(&vsn, v, &fpscr);
     if (vsn.exponent == 0 && vsn.significand)
         vfp_single_normalise_denormal(&vsn);
 
-    vfp_single_unpack(&vsm, m);
+    vfp_single_unpack(&vsm, m, &fpscr);
     if (vsm.exponent == 0 && vsm.significand)
         vfp_single_normalise_denormal(&vsm);
 
@@ -940,7 +940,7 @@ vfp_single_multiply_accumulate(ARMul_State* state, int sd, int sn, s32 m, u32 fp
 
     v = vfp_get_float(state, sd);
     LOG_DEBUG(Core_ARM11, "s%u = %08x", sd, v);
-    vfp_single_unpack(&vsn, v);
+    vfp_single_unpack(&vsn, v, &fpscr);
     if (vsn.exponent == 0 && vsn.significand != 0)
         vfp_single_normalise_denormal(&vsn);
 
@@ -1004,11 +1004,11 @@ static u32 vfp_single_fmul(ARMul_State* state, int sd, int sn, s32 m, u32 fpscr)
 
     LOG_DEBUG(Core_ARM11, "s%u = %08x", sn, n);
 
-    vfp_single_unpack(&vsn, n);
+    vfp_single_unpack(&vsn, n, &fpscr);
     if (vsn.exponent == 0 && vsn.significand)
         vfp_single_normalise_denormal(&vsn);
 
-    vfp_single_unpack(&vsm, m);
+    vfp_single_unpack(&vsm, m, &fpscr);
     if (vsm.exponent == 0 && vsm.significand)
         vfp_single_normalise_denormal(&vsm);
 
@@ -1027,11 +1027,11 @@ static u32 vfp_single_fnmul(ARMul_State* state, int sd, int sn, s32 m, u32 fpscr
 
     LOG_DEBUG(Core_ARM11, "s%u = %08x", sn, n);
 
-    vfp_single_unpack(&vsn, n);
+    vfp_single_unpack(&vsn, n, &fpscr);
     if (vsn.exponent == 0 && vsn.significand)
         vfp_single_normalise_denormal(&vsn);
 
-    vfp_single_unpack(&vsm, m);
+    vfp_single_unpack(&vsm, m, &fpscr);
     if (vsm.exponent == 0 && vsm.significand)
         vfp_single_normalise_denormal(&vsm);
 
@@ -1054,11 +1054,11 @@ static u32 vfp_single_fadd(ARMul_State* state, int sd, int sn, s32 m, u32 fpscr)
     /*
      * Unpack and normalise denormals.
      */
-    vfp_single_unpack(&vsn, n);
+    vfp_single_unpack(&vsn, n, &fpscr);
     if (vsn.exponent == 0 && vsn.significand)
         vfp_single_normalise_denormal(&vsn);
 
-    vfp_single_unpack(&vsm, m);
+    vfp_single_unpack(&vsm, m, &fpscr);
     if (vsm.exponent == 0 && vsm.significand)
         vfp_single_normalise_denormal(&vsm);
 
@@ -1094,8 +1094,8 @@ static u32 vfp_single_fdiv(ARMul_State* state, int sd, int sn, s32 m, u32 fpscr)
 
     LOG_DEBUG(Core_ARM11, "s%u = %08x", sn, n);
 
-    vfp_single_unpack(&vsn, n);
-    vfp_single_unpack(&vsm, m);
+    vfp_single_unpack(&vsn, n, &fpscr);
+    vfp_single_unpack(&vsm, m, &fpscr);
 
     vsd.sign = vsn.sign ^ vsm.sign;
 
diff --git a/src/core/core.cpp b/src/core/core.cpp
index b5c25823..53aae8c2 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -24,9 +24,9 @@ ARM_Interface*     g_sys_core = nullptr;  ///< ARM11 system (OS) core
 
 /// Run the core CPU loop
 void RunLoop(int tight_loop) {
-    // If the current thread is an idle thread, then don't execute instructions,
+    // If we don't have a currently active thread then don't execute instructions,
     // instead advance to the next event and try to yield to the next thread
-    if (Kernel::GetCurrentThread()->IsIdle()) {
+    if (Kernel::GetCurrentThread() == nullptr) {
         LOG_TRACE(Core_ARM11, "Idling");
         CoreTiming::Idle();
         CoreTiming::Advance();
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index a3715e55..b5c98b24 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -115,8 +115,7 @@ SharedPtr<Object> HandleTable::GetGeneric(Handle handle) const {
     if (handle == CurrentThread) {
         return GetCurrentThread();
     } else if (handle == CurrentProcess) {
-        LOG_ERROR(Kernel, "Current process (%08X) pseudo-handle not supported", CurrentProcess);
-        return nullptr;
+        return g_current_process;
     }
 
     if (!IsValid(handle)) {
@@ -139,6 +138,9 @@ void Init() {
     Kernel::TimersInit();
 
     Object::next_object_id = 0;
+    // TODO(Subv): Start the process ids from 10 for now, as lower PIDs are
+    // reserved for low-level services
+    Process::next_process_id = 10;
 }
 
 /// Shutdown the kernel
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index efae4a17..1e439db9 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -12,6 +12,8 @@
 
 namespace Kernel {
 
+u32 Process::next_process_id;
+
 SharedPtr<Process> Process::Create(std::string name, u64 program_id) {
     SharedPtr<Process> process(new Process);
 
diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h
index 88ed9a5a..90881054 100644
--- a/src/core/hle/kernel/process.h
+++ b/src/core/hle/kernel/process.h
@@ -55,6 +55,8 @@ public:
     static const HandleType HANDLE_TYPE = HandleType::Process;
     HandleType GetHandleType() const override { return HANDLE_TYPE; }
 
+    static u32 next_process_id;
+
     /// Name of the process
     std::string name;
     /// Title ID corresponding to the process
@@ -69,6 +71,12 @@ public:
     boost::container::static_vector<AddressMapping, 8> address_mappings;
     ProcessFlags flags;
 
+    /// The id of this process
+    u32 process_id = next_process_id++;
+
+    /// Bitmask of the used TLS slots
+    std::bitset<300> used_tls_slots;
+
     /**
      * Parses a list of kernel capability descriptors (as found in the ExHeader) and applies them
      * to this process.
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index 957cbdfe..afaf0cd5 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -17,6 +17,7 @@
 #include "core/core_timing.h"
 #include "core/hle/hle.h"
 #include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/process.h"
 #include "core/hle/kernel/thread.h"
 #include "core/hle/kernel/mutex.h"
 #include "core/hle/result.h"
@@ -106,6 +107,8 @@ void Thread::Stop() {
     for (auto& wait_object : wait_objects) {
         wait_object->RemoveWaitingThread(this);
     }
+
+    Kernel::g_current_process->used_tls_slots[tls_index] = false;
 }
 
 Thread* ArbitrateHighestPriorityThread(u32 address) {
@@ -157,7 +160,7 @@ static void PriorityBoostStarvedThreads() {
 
         u64 delta = current_ticks - thread->last_running_ticks;
 
-        if (thread->status == THREADSTATUS_READY && delta > boost_timeout && !thread->idle) {
+        if (thread->status == THREADSTATUS_READY && delta > boost_timeout) {
             const s32 priority = std::max(ready_queue.get_first()->current_priority - 1, 0);
             thread->BoostPriority(priority);
         }
@@ -169,8 +172,6 @@ static void PriorityBoostStarvedThreads() {
  * @param new_thread The thread to switch to
  */
 static void SwitchContext(Thread* new_thread) {
-    DEBUG_ASSERT_MSG(new_thread->status == THREADSTATUS_READY, "Thread must be ready to become running.");
-
     Thread* previous_thread = GetCurrentThread();
 
     // Save context for previous thread
@@ -188,6 +189,8 @@ static void SwitchContext(Thread* new_thread) {
 
     // Load context of new thread
     if (new_thread) {
+        DEBUG_ASSERT_MSG(new_thread->status == THREADSTATUS_READY, "Thread must be ready to become running.");
+
         current_thread = new_thread;
 
         ready_queue.remove(new_thread->current_priority, new_thread);
@@ -215,6 +218,10 @@ static Thread* PopNextReadyThread() {
         // We have to do better than the current thread.
         // This call returns null when that's not possible.
         next = ready_queue.pop_first_better(thread->current_priority);
+        if (!next) {
+            // Otherwise just keep going with the current thread
+            next = thread;
+        }
     } else  {
         next = ready_queue.pop_first();
     }
@@ -402,12 +409,20 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
     thread->wait_address = 0;
     thread->name = std::move(name);
     thread->callback_handle = wakeup_callback_handle_table.Create(thread).MoveFrom();
+    thread->owner_process = g_current_process;
+    thread->tls_index = -1;
+
+    // Find the next available TLS index, and mark it as used
+    auto& used_tls_slots = Kernel::g_current_process->used_tls_slots;
+    for (unsigned int i = 0; i < used_tls_slots.size(); ++i) {
+        if (used_tls_slots[i] == false) {
+            thread->tls_index = i;
+            used_tls_slots[i] = true;
+            break;
+        }
+    }
 
-    VAddr tls_address = Memory::TLS_AREA_VADDR + (thread->thread_id - 1) * 0x200;
-
-    ASSERT_MSG(tls_address < Memory::TLS_AREA_VADDR_END, "Too many threads");
-
-    thread->tls_address = tls_address;
+    ASSERT_MSG(thread->tls_index != -1, "Out of TLS space");
 
     // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
     // to initialize the context
@@ -439,6 +454,8 @@ void Thread::SetPriority(s32 priority) {
     // If thread was ready, adjust queues
     if (status == THREADSTATUS_READY)
         ready_queue.move(this, current_priority, priority);
+    else
+        ready_queue.prepare(priority);
 
     nominal_priority = current_priority = priority;
 }
@@ -448,16 +465,6 @@ void Thread::BoostPriority(s32 priority) {
     current_priority = priority;
 }
 
-SharedPtr<Thread> SetupIdleThread() {
-    // We need to pass a few valid values to get around parameter checking in Thread::Create.
-    // TODO(yuriks): Figure out a way to avoid passing the bogus VAddr parameter
-    auto thread = Thread::Create("idle", Memory::TLS_AREA_VADDR, THREADPRIO_LOWEST, 0,
-            THREADPROCESSORID_0, 0).MoveFrom();
-
-    thread->idle = true;
-    return thread;
-}
-
 SharedPtr<Thread> SetupMainThread(u32 entry_point, s32 priority) {
     DEBUG_ASSERT(!GetCurrentThread());
 
@@ -474,24 +481,25 @@ SharedPtr<Thread> SetupMainThread(u32 entry_point, s32 priority) {
 }
 
 void Reschedule() {
-    Thread* prev = GetCurrentThread();
-
     PriorityBoostStarvedThreads();
 
+    Thread* cur = GetCurrentThread();
     Thread* next = PopNextReadyThread();
     HLE::g_reschedule = false;
 
-    if (next != nullptr) {
-        LOG_TRACE(Kernel, "context switch %u -> %u", prev->GetObjectId(), next->GetObjectId());
-        SwitchContext(next);
-    } else {
-        LOG_TRACE(Kernel, "cannot context switch from %u, no higher priority thread!", prev->GetObjectId());
+    // Don't bother switching to the same thread
+    if (next == cur)
+        return;
 
-        for (auto& thread : thread_list) {
-            LOG_TRACE(Kernel, "\tid=%u prio=0x%02X, status=0x%08X", thread->GetObjectId(), 
-                      thread->current_priority, thread->status);
-        }
+    if (cur && next) {
+        LOG_TRACE(Kernel, "context switch %u -> %u", cur->GetObjectId(), next->GetObjectId());
+    } else if (cur) {
+        LOG_TRACE(Kernel, "context switch %u -> idle", cur->GetObjectId());
+    } else {
+        LOG_TRACE(Kernel, "context switch idle -> %u", next->GetObjectId());
     }
+    
+    SwitchContext(next);
 }
 
 void Thread::SetWaitSynchronizationResult(ResultCode result) {
@@ -503,7 +511,7 @@ void Thread::SetWaitSynchronizationOutput(s32 output) {
 }
 
 VAddr Thread::GetTLSAddress() const {
-    return tls_address;
+    return Memory::TLS_AREA_VADDR + tls_index * 0x200;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -516,9 +524,6 @@ void ThreadingInit() {
 
     thread_list.clear();
     ready_queue.clear();
-
-    // Setup the idle thread
-    SetupIdleThread();
 }
 
 void ThreadingShutdown() {
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index afdaf851..6b329c12 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -45,6 +45,7 @@ enum ThreadStatus {
 namespace Kernel {
 
 class Mutex;
+class Process;
 
 class Thread final : public WaitObject {
 public:
@@ -72,12 +73,6 @@ public:
     void Acquire() override;
 
     /**
-     * Checks if the thread is an idle (stub) thread
-     * @return True if the thread is an idle (stub) thread, false otherwise
-     */
-    inline bool IsIdle() const { return idle; }
-
-    /**
      * Gets the thread's current priority
      * @return The current thread's priority
      */
@@ -156,11 +151,12 @@ public:
 
     s32 processor_id;
 
-    VAddr tls_address; ///< Address of the Thread Local Storage of the thread
+    s32 tls_index; ///< Index of the Thread Local Storage of the thread
 
     /// Mutexes currently held by this thread, which will be released when it exits.
     boost::container::flat_set<SharedPtr<Mutex>> held_mutexes;
 
+    SharedPtr<Process> owner_process; ///< Process that owns this thread
     std::vector<SharedPtr<WaitObject>> wait_objects; ///< Objects that the thread is waiting on
     VAddr wait_address;     ///< If waiting on an AddressArbiter, this is the arbitration address
     bool wait_all;          ///< True if the thread is waiting on all objects before resuming
@@ -168,9 +164,6 @@ public:
 
     std::string name;
 
-    /// Whether this thread is intended to never actually be executed, i.e. always idle
-    bool idle = false;
-
 private:
     Thread();
     ~Thread() override;
@@ -229,14 +222,6 @@ void WaitCurrentThread_WaitSynchronization(std::vector<SharedPtr<WaitObject>> wa
 void WaitCurrentThread_ArbitrateAddress(VAddr wait_address);
 
 /**
- * Sets up the idle thread, this is a thread that is intended to never execute instructions,
- * only to advance the timing. It is scheduled when there are no other ready threads in the thread queue
- * and will try to yield on every call.
- * @return The handle of the idle thread
- */
-SharedPtr<Thread> SetupIdleThread();
-
-/**
  * Initialize threading
  */
 void ThreadingInit();
diff --git a/src/core/hle/svc.cpp b/src/core/hle/svc.cpp
index 1ec6599c..e8159fbd 100644
--- a/src/core/hle/svc.cpp
+++ b/src/core/hle/svc.cpp
@@ -16,6 +16,7 @@
 #include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/event.h"
 #include "core/hle/kernel/mutex.h"
+#include "core/hle/kernel/process.h"
 #include "core/hle/kernel/semaphore.h"
 #include "core/hle/kernel/shared_memory.h"
 #include "core/hle/kernel/thread.h"
@@ -424,6 +425,34 @@ static ResultCode ReleaseMutex(Handle handle) {
     return RESULT_SUCCESS;
 }
 
+/// Get the ID of the specified process
+static ResultCode GetProcessId(u32* process_id, Handle process_handle) {
+    LOG_TRACE(Kernel_SVC, "called process=0x%08X", process_handle);
+
+    const SharedPtr<Kernel::Process> process = Kernel::g_handle_table.Get<Kernel::Process>(process_handle);
+    if (process == nullptr)
+        return ERR_INVALID_HANDLE;
+
+    *process_id = process->process_id;
+    return RESULT_SUCCESS;
+}
+
+/// Get the ID of the process that owns the specified thread
+static ResultCode GetProcessIdOfThread(u32* process_id, Handle thread_handle) {
+    LOG_TRACE(Kernel_SVC, "called thread=0x%08X", thread_handle);
+
+    const SharedPtr<Kernel::Thread> thread = Kernel::g_handle_table.Get<Kernel::Thread>(thread_handle);
+    if (thread == nullptr)
+        return ERR_INVALID_HANDLE;
+
+    const SharedPtr<Kernel::Process> process = thread->owner_process;
+    
+    ASSERT_MSG(process != nullptr, "Invalid parent process for thread=0x%08X", thread_handle);
+
+    *process_id = process->process_id;
+    return RESULT_SUCCESS;
+}
+
 /// Get the ID for the specified thread.
 static ResultCode GetThreadId(u32* thread_id, Handle handle) {
     LOG_TRACE(Kernel_SVC, "called thread=0x%08X", handle);
@@ -674,8 +703,8 @@ static const FunctionDef SVC_Table[] = {
     {0x32, HLE::Wrap<SendSyncRequest>,      "SendSyncRequest"},
     {0x33, nullptr,                         "OpenProcess"},
     {0x34, nullptr,                         "OpenThread"},
-    {0x35, nullptr,                         "GetProcessId"},
-    {0x36, nullptr,                         "GetProcessIdOfThread"},
+    {0x35, HLE::Wrap<GetProcessId>,         "GetProcessId"},
+    {0x36, HLE::Wrap<GetProcessIdOfThread>, "GetProcessIdOfThread"},
     {0x37, HLE::Wrap<GetThreadId>,          "GetThreadId"},
     {0x38, HLE::Wrap<GetResourceLimit>,     "GetResourceLimit"},
     {0x39, nullptr,                         "GetResourceLimitLimitValues"},
diff --git a/src/core/mem_map.h b/src/core/mem_map.h
index 64de76c3..71f90cb8 100644
--- a/src/core/mem_map.h
+++ b/src/core/mem_map.h
@@ -94,10 +94,12 @@ enum : VAddr {
     SHARED_PAGE_SIZE      = 0x00001000,
     SHARED_PAGE_VADDR_END = SHARED_PAGE_VADDR + SHARED_PAGE_SIZE,
 
-    // TODO(yuriks): The exact location and size of this area is uncomfirmed.
+    // TODO(yuriks): The size of this area is dynamic, the kernel grows
+    // it as more and more threads are created. For now we'll just use a 
+    // hardcoded value.
     /// Area where TLS (Thread-Local Storage) buffers are allocated.
-    TLS_AREA_VADDR     = 0x1FFA0000,
-    TLS_AREA_SIZE      = 0x00002000, // Each TLS buffer is 0x200 bytes, allows for 16 threads
+    TLS_AREA_VADDR     = 0x1FF82000,
+    TLS_AREA_SIZE      = 0x00030000, // Each TLS buffer is 0x200 bytes, allows for 300 threads
     TLS_AREA_VADDR_END = TLS_AREA_VADDR + TLS_AREA_SIZE,
 };
 
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 46a326bb..02a08b20 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -6,6 +6,7 @@
 
 #include "common/common_types.h"
 #include "common/math_util.h"
+#include "common/profiler.h"
 
 #include "core/hw/gpu.h"
 #include "debug_utils/debug_utils.h"
@@ -186,6 +187,8 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
     return Math::Cross(vec1, vec2).z;
 };
 
+static Common::Profiling::TimingCategory rasterization_category("Rasterization");
+
 /**
  * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
  * culling via recursion.
@@ -195,6 +198,8 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                                     const VertexShader::OutputVertex& v2,
                                     bool reversed = false)
 {
+    Common::Profiling::ScopeTimer timer(rasterization_category);
+
     // vertex positions in rasterizer coordinates
     static auto FloatToFix = [](float24 flt) {
         // TODO: Rounding here is necessary to prevent garbage pixels at
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 885b7de5..4734e546 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -12,6 +12,7 @@
 
 #include <nihstro/shader_bytecode.h>
 
+#include "common/profiler.h"
 
 #include "pica.h"
 #include "vertex_shader.h"
@@ -574,7 +575,11 @@ static void ProcessShaderCode(VertexShaderState& state) {
     }
 }
 
+static Common::Profiling::TimingCategory shader_category("Vertex Shader");
+
 OutputVertex RunShader(const InputVertex& input, int num_attributes) {
+    Common::Profiling::ScopeTimer timer(shader_category);
+
     VertexShaderState state;
 
     const u32* main = &shader_memory[registers.vs_main_offset];