17 files changed, 748 insertions, 147 deletions
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index b05c3554..daa2d59d 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -14,6 +14,7 @@ set(SRCS
             mem_arena.cpp
             memory_util.cpp
             misc.cpp
+            profiler.cpp
             scm_rev.cpp
             string_util.cpp
             symbols.cpp
@@ -48,11 +49,14 @@ set(HEADERS
             mem_arena.h
             memory_util.h
             platform.h
+            profiler.h
+            profiler_reporting.h
             scm_rev.h
             scope_exit.h
             string_util.h
             swap.h
             symbols.h
+            synchronized_wrapper.h
             thread.h
             thread_queue_list.h
             thunk.h
diff --git a/src/common/assert.h b/src/common/assert.h
index 3b2232a7..9ca7adb1 100644
--- a/src/common/assert.h
+++ b/src/common/assert.h
@@ -4,24 +4,43 @@
 
 #pragma once
 
+#include <cstdlib>
+
 #include "common/common_funcs.h"
 
+// For asserts we'd like to keep all the junk executed when an assert happens away from the
+// important code in the function. One way of doing this is to put all the relevant code inside a
+// lambda and force the compiler to not inline it. Unfortunately, MSVC seems to have no syntax to
+// specify __declspec on lambda functions, so what we do instead is define a noinline wrapper
+// template that calls the lambda. This seems to generate an extra instruction at the call-site
+// compared to the ideal implementation (which wouldn't support ASSERT_MSG parameters), but is good
+// enough for our purposes.
+template <typename Fn>
+#if defined(_MSC_VER)
+    __declspec(noinline, noreturn)
+#elif defined(__GNUC__)
+    __attribute__((noinline, noreturn, cold))
+#endif
+static void assert_noinline_call(const Fn& fn) {
+    fn(); // Run the caller-supplied reporting code (prints the assertion message).
+    Crash(); // Raise a breakpoint trap at the point of failure.
+    std::exit(1); // Never return: matches the noreturn attribute and keeps GCC from warning.
+}
+
 // TODO (yuriks) allow synchronous logging so we don't need printf
 #define ASSERT(_a_) \
-    do if (!(_a_)) {\
+    do if (!(_a_)) { assert_noinline_call([] { \
         fprintf(stderr, "Assertion Failed!\n\n  Line: %d\n  File: %s\n  Time: %s\n", \
                      __LINE__, __FILE__, __TIME__); \
-        Crash(); \
-    } while (0)
+    }); } while (0)
 
 #define ASSERT_MSG(_a_, ...) \
-    do if (!(_a_)) {\
+    do if (!(_a_)) { assert_noinline_call([&] { \
         fprintf(stderr, "Assertion Failed!\n\n  Line: %d\n  File: %s\n  Time: %s\n", \
                      __LINE__, __FILE__, __TIME__); \
         fprintf(stderr, __VA_ARGS__); \
         fprintf(stderr, "\n"); \
-        Crash(); \
-    } while (0)
+    }); } while (0)
 
 #define UNREACHABLE() ASSERT_MSG(false, "Unreachable code!")
 
diff --git a/src/common/common.h b/src/common/common.h
index 948dc536..f7d0f55c 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -117,40 +117,4 @@ enum EMUSTATE_CHANGE
     EMUSTATE_CHANGE_STOP
 };
 
-
-#ifdef _MSC_VER
-inline unsigned long long bswap64(unsigned long long x) { return _byteswap_uint64(x); }
-inline unsigned int bswap32(unsigned int x) { return _byteswap_ulong(x); }
-inline unsigned short bswap16(unsigned short x) { return _byteswap_ushort(x); }
-#else
-// TODO: speedup
-inline unsigned short bswap16(unsigned short x) { return (x << 8) | (x >> 8); }
-inline unsigned int bswap32(unsigned int x) { return (x >> 24) | ((x & 0xFF0000) >> 8) | ((x & 0xFF00) << 8) | (x << 24);}
-inline unsigned long long bswap64(unsigned long long x) {return ((unsigned long long)bswap32(x) << 32) | bswap32(x >> 32); }
-#endif
-
-inline float bswapf(float f) {
-    union {
-        float f;
-        unsigned int u32;
-    } dat1, dat2;
-
-    dat1.f = f;
-    dat2.u32 = bswap32(dat1.u32);
-
-    return dat2.f;
-}
-
-inline double bswapd(double f) {
-    union  {
-        double f;
-        unsigned long long u64;
-    } dat1, dat2;
-
-    dat1.f = f;
-    dat2.u64 = bswap64(dat1.u64);
-
-    return dat2.f;
-}
-
 #include "swap.h"
diff --git a/src/common/common_funcs.h b/src/common/common_funcs.h
index d56156e4..e76cb7d6 100644
--- a/src/common/common_funcs.h
+++ b/src/common/common_funcs.h
@@ -37,11 +37,6 @@
 #ifndef _MSC_VER
 
 #include <errno.h>
-#ifdef __linux__
-#include <byteswap.h>
-#elif defined __FreeBSD__
-#include <sys/endian.h>
-#endif
 
 #if defined(__x86_64__) || defined(_M_X64)
 #define Crash() __asm__ __volatile__("int $3")
@@ -145,75 +140,3 @@ inline u64 _rotr64(u64 x, unsigned int shift){
 // This function might change the error code.
 // Defined in Misc.cpp.
 const char* GetLastErrorMsg();
-
-namespace Common
-{
-inline u8 swap8(u8 _data) {return _data;}
-inline u32 swap24(const u8* _data) {return (_data[0] << 16) | (_data[1] << 8) | _data[2];}
-
-#ifdef ANDROID
-#undef swap16
-#undef swap32
-#undef swap64
-#endif
-
-#ifdef _MSC_VER
-inline u16 swap16(u16 _data) {return _byteswap_ushort(_data);}
-inline u32 swap32(u32 _data) {return _byteswap_ulong (_data);}
-inline u64 swap64(u64 _data) {return _byteswap_uint64(_data);}
-#elif _M_ARM
-inline u16 swap16 (u16 _data) { u32 data = _data; __asm__ ("rev16 %0, %1\n" : "=l" (data) : "l" (data)); return (u16)data;}
-inline u32 swap32 (u32 _data) {__asm__ ("rev %0, %1\n" : "=l" (_data) : "l" (_data)); return _data;}
-inline u64 swap64(u64 _data) {return ((u64)swap32(_data) << 32) | swap32(_data >> 32);}
-#elif __linux__
-inline u16 swap16(u16 _data) {return bswap_16(_data);}
-inline u32 swap32(u32 _data) {return bswap_32(_data);}
-inline u64 swap64(u64 _data) {return bswap_64(_data);}
-#elif __APPLE__
-inline __attribute__((always_inline)) u16 swap16(u16 _data)
-    {return (_data >> 8) | (_data << 8);}
-inline __attribute__((always_inline)) u32 swap32(u32 _data)
-    {return __builtin_bswap32(_data);}
-inline __attribute__((always_inline)) u64 swap64(u64 _data)
-    {return __builtin_bswap64(_data);}
-#elif __FreeBSD__
-inline u16 swap16(u16 _data) {return bswap16(_data);}
-inline u32 swap32(u32 _data) {return bswap32(_data);}
-inline u64 swap64(u64 _data) {return bswap64(_data);}
-#else
-// Slow generic implementation.
-inline u16 swap16(u16 data) {return (data >> 8) | (data << 8);}
-inline u32 swap32(u32 data) {return (swap16(data) << 16) | swap16(data >> 16);}
-inline u64 swap64(u64 data) {return ((u64)swap32(data) << 32) | swap32(data >> 32);}
-#endif
-
-inline u16 swap16(const u8* _pData) {return swap16(*(const u16*)_pData);}
-inline u32 swap32(const u8* _pData) {return swap32(*(const u32*)_pData);}
-inline u64 swap64(const u8* _pData) {return swap64(*(const u64*)_pData);}
-
-template <int count>
-void swap(u8*);
-
-template <>
-inline void swap<1>(u8* data)
-{}
-
-template <>
-inline void swap<2>(u8* data)
-{
-    *reinterpret_cast<u16*>(data) = swap16(data);
-}
-
-template <>
-inline void swap<4>(u8* data)
-{
-    *reinterpret_cast<u32*>(data) = swap32(data);
-}
-
-template <>
-inline void swap<8>(u8* data)
-{
-    *reinterpret_cast<u64*>(data) = swap64(data);
-}
-
-}  // Namespace Common
diff --git a/src/common/emu_window.cpp b/src/common/emu_window.cpp
index 48bb35db..6459d2f3 100644
--- a/src/common/emu_window.cpp
+++ b/src/common/emu_window.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include "emu_window.h"
+#include "video_core/video_core.h"
 
 void EmuWindow::KeyPressed(KeyMap::HostDeviceKey key) {
     Service::HID::PadState mapped_key = KeyMap::GetPadKey(key);
@@ -15,3 +16,52 @@ void EmuWindow::KeyReleased(KeyMap::HostDeviceKey key) {
 
     Service::HID::PadButtonRelease(mapped_key);
 }
+
+EmuWindow::FramebufferLayout EmuWindow::FramebufferLayout::DefaultScreenLayout(unsigned width, unsigned height) {
+    ASSERT(width > 0);
+    ASSERT(height > 0);
+
+    EmuWindow::FramebufferLayout res = { width, height, {}, {} };
+    auto& top = res.top_screen;
+    auto& bottom = res.bottom_screen;
+
+    // Aspect ratios are height / width; the emulated content is both screens stacked vertically.
+    const float window_aspect_ratio = static_cast<float>(height) / width;
+    const float emulation_aspect_ratio = static_cast<float>(VideoCore::kScreenTopHeight * 2) /
+        VideoCore::kScreenTopWidth;
+
+    unsigned screen_height; // Height given to each of the two screens
+
+    if (window_aspect_ratio > emulation_aspect_ratio) {
+        // Window is narrower than the emulation content => apply borders to the top and bottom
+        const int viewport_height = static_cast<int>(std::round(emulation_aspect_ratio * width));
+
+        top.left = 0;
+        top.right = top.left + width;
+        top.top = (height - viewport_height) / 2;
+        screen_height = viewport_height / 2;
+    } else {
+        // Otherwise, apply borders to the left and right sides of the window.
+        const int viewport_width = static_cast<int>(std::round(height / emulation_aspect_ratio));
+
+        top.left = (width - viewport_width) / 2;
+        top.right = top.left + viewport_width;
+        top.top = 0;
+        screen_height = height / 2;
+    }
+    top.bottom = top.top + screen_height;
+
+    // Scale the bottom screen's width by kScreenBottomWidth / kScreenTopWidth and centre it
+    // horizontally below the top screen.
+    const unsigned top_width = top.right - top.left;
+    const int bottom_width = static_cast<int>((static_cast<float>(VideoCore::kScreenBottomWidth) /
+        VideoCore::kScreenTopWidth) * top_width);
+    const int bottom_border = (top_width - bottom_width) / 2;
+
+    bottom.left = top.left + bottom_border;
+    bottom.right = bottom.left + bottom_width;
+    bottom.top = top.bottom;
+    bottom.bottom = bottom.top + screen_height;
+
+    return res;
+}
diff --git a/src/common/emu_window.h b/src/common/emu_window.h
index 1ad4b82a..f6099fdb 100644
--- a/src/common/emu_window.h
+++ b/src/common/emu_window.h
@@ -8,6 +8,7 @@
 #include "common/scm_rev.h"
 #include "common/string_util.h"
 #include "common/key_map.h"
+#include "common/math_util.h"
 
 /**
  * Abstraction class used to provide an interface between emulation code and the frontend
@@ -38,6 +39,23 @@ public:
         std::pair<unsigned,unsigned> min_client_area_size;
     };
 
+    /// Describes the layout of the window framebuffer (size and top/bottom screen positions)
+    struct FramebufferLayout {
+
+        /**
+         * Factory method for constructing a default FramebufferLayout
+         * @param width Window framebuffer width in pixels (must be non-zero)
+         * @param height Window framebuffer height in pixels (must be non-zero)
+         * @return Newly created FramebufferLayout object with default screen regions initialized
+         */
+        static FramebufferLayout DefaultScreenLayout(unsigned width, unsigned height);
+
+        unsigned width; ///< Window framebuffer width in pixels
+        unsigned height; ///< Window framebuffer height in pixels
+        MathUtil::Rectangle<unsigned> top_screen; ///< Top screen region within the framebuffer
+        MathUtil::Rectangle<unsigned> bottom_screen; ///< Bottom screen region within the framebuffer
+    };
+
     /// Swap buffers to display the next frame
     virtual void SwapBuffers() = 0;
 
@@ -75,11 +93,11 @@ public:
     }
 
     /**
-      * Gets the framebuffer size in pixels.
+      * Gets the framebuffer layout (width, height, and screen regions)
       * @note This method is thread-safe
       */
-    const std::pair<unsigned,unsigned> GetFramebufferSize() const {
-        return framebuffer_size;
+    const FramebufferLayout& GetFramebufferLayout() const {
+        return framebuffer_layout;
     }
 
     /**
@@ -118,11 +136,11 @@ protected:
     }
 
     /**
-     * Update internal framebuffer size with the given parameter.
+     * Update framebuffer layout with the given parameter.
      * @note EmuWindow implementations will usually use this in window resize event handlers.
      */
-    void NotifyFramebufferSizeChanged(const std::pair<unsigned,unsigned>& size) {
-        framebuffer_size = size;
+    void NotifyFramebufferLayoutChanged(const FramebufferLayout& layout) {
+        framebuffer_layout = layout;
     }
 
     /**
@@ -143,7 +161,7 @@ private:
         // By default, ignore this request and do nothing.
     }
 
-    std::pair<unsigned,unsigned> framebuffer_size;
+    FramebufferLayout framebuffer_layout; ///< Current framebuffer layout
 
     unsigned client_area_width;    ///< Current client width, should be set by window impl.
     unsigned client_area_height;   ///< Current client height, should be set by window impl.
diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp
index da287f69..649640e7 100644
--- a/src/common/logging/backend.cpp
+++ b/src/common/logging/backend.cpp
@@ -136,9 +136,18 @@ Entry CreateEntry(Class log_class, Level log_level,
     return std::move(entry);
 }
 
+static Filter* filter = nullptr; ///< Active message filter; null until SetFilter() is called.
+
+void SetFilter(Filter* new_filter) {
+    filter = new_filter;
+}
+
 void LogMessage(Class log_class, Level log_level,
                 const char* filename, unsigned int line_nr, const char* function,
                 const char* format, ...) {
+    if (filter != nullptr && !filter->CheckMessage(log_class, log_level))
+        return; // Rejected by the installed filter; with no filter, every message passes.
+
     va_list args;
     va_start(args, format);
     Entry entry = CreateEntry(log_class, log_level,
diff --git a/src/common/logging/backend.h b/src/common/logging/backend.h
index 1c44c929..3114f864 100644
--- a/src/common/logging/backend.h
+++ b/src/common/logging/backend.h
@@ -10,6 +10,7 @@
 
 #include "common/concurrent_ring_buffer.h"
 
+#include "common/logging/filter.h"
 #include "common/logging/log.h"
 
 namespace Log {
@@ -131,4 +132,6 @@ Entry CreateEntry(Class log_class, Level log_level,
 /// Initializes the default Logger.
 std::shared_ptr<Logger> InitGlobalLogger();
 
+void SetFilter(Filter* filter);
+
 }
diff --git a/src/common/logging/filter.h b/src/common/logging/filter.h
index c3da9989..b53e4e63 100644
--- a/src/common/logging/filter.h
+++ b/src/common/logging/filter.h
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#pragma once
+
 #include <array>
 #include <string>
 
diff --git a/src/common/logging/text_formatter.cpp b/src/common/logging/text_formatter.cpp
index ef5739d8..36c91c4f 100644
--- a/src/common/logging/text_formatter.cpp
+++ b/src/common/logging/text_formatter.cpp
@@ -11,7 +11,6 @@
 #endif
 
 #include "common/logging/backend.h"
-#include "common/logging/filter.h"
 #include "common/logging/log.h"
 #include "common/logging/text_formatter.h"
 
@@ -116,7 +115,7 @@ void PrintColoredMessage(const Entry& entry) {
 #endif
 }
 
-void TextLoggingLoop(std::shared_ptr<Logger> logger, const Filter* filter) {
+void TextLoggingLoop(std::shared_ptr<Logger> logger) {
     std::array<Entry, 256> entry_buffer;
 
     while (true) {
@@ -126,9 +125,7 @@ void TextLoggingLoop(std::shared_ptr<Logger> logger, const Filter* filter) {
         }
         for (size_t i = 0; i < num_entries; ++i) {
             const Entry& entry = entry_buffer[i];
-            if (filter->CheckMessage(entry.log_class, entry.log_level)) {
-                PrintColoredMessage(entry);
-            }
+            PrintColoredMessage(entry);
         }
     }
 }
diff --git a/src/common/logging/text_formatter.h b/src/common/logging/text_formatter.h
index 2f05794f..8474a190 100644
--- a/src/common/logging/text_formatter.h
+++ b/src/common/logging/text_formatter.h
@@ -11,7 +11,6 @@ namespace Log {
 
 class Logger;
 struct Entry;
-class Filter;
 
 /**
  * Attempts to trim an arbitrary prefix from `path`, leaving only the part starting at `root`. It's
@@ -36,6 +35,6 @@ void PrintColoredMessage(const Entry& entry);
  * Logging loop that repeatedly reads messages from the provided logger and prints them to the
  * console. It is the baseline barebones log outputter.
  */
-void TextLoggingLoop(std::shared_ptr<Logger> logger, const Filter* filter);
+void TextLoggingLoop(std::shared_ptr<Logger> logger);
 
 }
diff --git a/src/common/profiler.cpp b/src/common/profiler.cpp
new file mode 100644
index 00000000..65c3df16
--- /dev/null
+++ b/src/common/profiler.cpp
@@ -0,0 +1,182 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/profiler.h"
+#include "common/profiler_reporting.h"
+#include "common/assert.h"
+
+#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013.
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h> // For QueryPerformanceCounter/Frequency
+#endif
+
+namespace Common {
+namespace Profiling {
+
+#if ENABLE_PROFILING
+thread_local Timer* Timer::current_timer = nullptr;
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013
+QPCClock::time_point QPCClock::now() {
+    static LARGE_INTEGER freq;
+    // Use this dummy local static to ensure the frequency is queried only once.
+    static BOOL dummy = QueryPerformanceFrequency(&freq);
+
+    LARGE_INTEGER ticks;
+    QueryPerformanceCounter(&ticks);
+
+    // Convert ticks to microseconds (hence std::micro::den). The multiplication is prone to
+    // overflow, which is why micro is used instead of nano. The correct approach would be to just
+    // return ticks as a time_point and convert only when forming a duration from two
+    // time_points; however, as far as I could tell the C++ requirements for these types forbid it.
+    return time_point(duration(ticks.QuadPart * std::micro::den / freq.QuadPart));
+}
+#endif
+
+TimingCategory::TimingCategory(const char* name, TimingCategory* parent)
+        : accumulated_duration(0) {
+    auto& profiling_manager = GetProfilingManager();
+    category_id = profiling_manager.RegisterTimingCategory(this, name);
+    if (parent == nullptr)
+        return; // No parent given: nothing to link.
+    profiling_manager.SetTimingCategoryParent(category_id, parent->category_id);
+}
+
+ProfilingManager::ProfilingManager() // Both frame timestamps are seeded with the current time.
+        : last_frame_end(Clock::now()), this_frame_start(Clock::now()) {
+}
+
+unsigned int ProfilingManager::RegisterTimingCategory(TimingCategory* category, const char* name) {
+    // The new entry is appended, so its id is the current number of registered categories.
+    const unsigned int new_id = static_cast<unsigned int>(timing_categories.size());
+    TimingCategoryInfo entry;
+    entry.category = category;
+    entry.name = name;
+    entry.parent = TimingCategoryInfo::NO_PARENT;
+    timing_categories.push_back(std::move(entry));
+
+    return new_id;
+}
+
+void ProfilingManager::SetTimingCategoryParent(unsigned int category, unsigned int parent) {
+    const size_t num_categories = timing_categories.size();
+    ASSERT(category < num_categories);
+    ASSERT(parent < num_categories);
+    timing_categories[category].parent = parent;
+}
+
+void ProfilingManager::BeginFrame() {
+    this_frame_start = Clock::now(); // FinishFrame() measures frame_time from this instant.
+}
+
+void ProfilingManager::FinishFrame() {
+    const Clock::time_point frame_end = Clock::now();
+
+    results.interframe_time = frame_end - last_frame_end;
+    results.frame_time = frame_end - this_frame_start;
+
+    // Collect (and thereby reset) the time accumulated by every category during this frame.
+    const size_t num_categories = timing_categories.size();
+    results.time_per_category.resize(num_categories);
+    for (size_t id = 0; id < num_categories; ++id)
+        results.time_per_category[id] = timing_categories[id].category->GetAccumulatedTime();
+    last_frame_end = frame_end;
+}
+
+TimingResultsAggregator::TimingResultsAggregator(size_t window_size)
+        : max_window_size(window_size), window_size(0), cursor(0) {
+    interframe_times.resize(window_size, Duration::zero()); // `window_size` is the parameter
+    frame_times.resize(window_size, Duration::zero()); // (it shadows the member here)
+}
+
+void TimingResultsAggregator::Clear() {
+    window_size = cursor = 0; // Only the indices are reset; the sample buffers keep their size.
+}
+
+void TimingResultsAggregator::SetNumberOfCategories(size_t n) {
+    const size_t previous_count = times_per_category.size();
+    if (n != previous_count) {
+        times_per_category.resize(n);
+
+        // Newly added categories (if any) get a zero-filled history of the full window length.
+        for (size_t category = previous_count; category < n; ++category) {
+            times_per_category[category].resize(max_window_size, Duration::zero());
+        }
+    }
+}
+
+void TimingResultsAggregator::AddFrame(const ProfilingFrameResult& frame_result) {
+    const std::vector<Duration>& category_times = frame_result.time_per_category;
+    SetNumberOfCategories(category_times.size());
+
+    // Store this frame's samples in the ring-buffer slot at `cursor`.
+    interframe_times[cursor] = frame_result.interframe_time;
+    frame_times[cursor] = frame_result.frame_time;
+    for (size_t id = 0; id < category_times.size(); ++id)
+        times_per_category[id][cursor] = category_times[id];
+
+    // Advance with wrap-around; the count of valid samples grows until the window is full.
+    cursor = (cursor + 1 == max_window_size) ? 0 : cursor + 1;
+    if (window_size < max_window_size)
+        ++window_size;
+}
+
+static AggregatedDuration AggregateField(const std::vector<Duration>& v, size_t len) {
+    AggregatedDuration result;
+    result.avg = Duration::zero();
+    // Seed min/max with the first sample (or zero when there are no samples at all).
+    result.min = result.max = (len == 0 ? Duration::zero() : v[0]);
+    // Start at index 0 so the first sample is part of the sum that is divided by `len` below.
+    for (size_t i = 0; i < len; ++i) {
+        Duration value = v[i];
+        result.avg += value;
+        result.min = std::min(result.min, value);
+        result.max = std::max(result.max, value);
+    }
+    if (len != 0)
+        result.avg /= len;
+
+    return result;
+}
+
+static float tof(Common::Profiling::Duration dur) { // Converts `dur` to milliseconds as a float.
+    using FloatMs = std::chrono::duration<float, std::chrono::milliseconds::period>;
+    return std::chrono::duration_cast<FloatMs>(dur).count();
+}
+
+AggregatedFrameResult TimingResultsAggregator::GetAggregatedResults() const {
+    AggregatedFrameResult aggregated;
+
+    aggregated.interframe_time = AggregateField(interframe_times, window_size);
+    aggregated.frame_time = AggregateField(frame_times, window_size);
+
+    // Frames per second is the inverse of the average time between frames (in milliseconds);
+    // report 0 when that average is zero to avoid dividing by zero.
+    const Duration mean_gap = aggregated.interframe_time.avg;
+    aggregated.fps = (mean_gap != Duration::zero()) ? 1000.0f / tof(mean_gap) : 0.0f;
+
+    const size_t num_categories = times_per_category.size();
+    aggregated.time_per_category.resize(num_categories);
+    for (size_t id = 0; id < num_categories; ++id) {
+        aggregated.time_per_category[id] = AggregateField(times_per_category[id], window_size);
+    }
+
+    return aggregated;
+}
+
+ProfilingManager& GetProfilingManager() {
+    // Relies on function-local static ("magic static") initialization being race-free.
+    static ProfilingManager instance;
+    return instance;
+}
+
+SynchronizedRef<TimingResultsAggregator> GetTimingResultsAggregator() {
+    static SynchronizedWrapper<TimingResultsAggregator> aggregator(30); // Window of 30 frames
+    return SynchronizedRef<TimingResultsAggregator>(aggregator);
+}
+
+} // namespace Profiling
+} // namespace Common
diff --git a/src/common/profiler.h b/src/common/profiler.h
new file mode 100644
index 00000000..3e967b4b
--- /dev/null
+++ b/src/common/profiler.h
@@ -0,0 +1,152 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+#include <chrono>
+
+#include "common/assert.h"
+#include "common/thread.h"
+
+namespace Common {
+namespace Profiling {
+
+// If this is defined to 0, it turns all Timers into no-ops.
+#ifndef ENABLE_PROFILING
+#define ENABLE_PROFILING 1
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013
+// MSVC up to 2013 doesn't use QueryPerformanceCounter for high_resolution_clock, so it has bad
+// precision. We manually implement a clock based on QPC to get good results.
+
+struct QPCClock { // Clock type for std::chrono; now() is defined in profiler.cpp
+    using duration = std::chrono::microseconds; // Time is reported in microseconds
+    using time_point = std::chrono::time_point<QPCClock>;
+    using rep = duration::rep;
+    using period = duration::period;
+    static const bool is_steady = false;
+
+    static time_point now();
+};
+
+using Clock = QPCClock;
+#else
+using Clock = std::chrono::high_resolution_clock;
+#endif
+
+using Duration = Clock::duration;
+
+/**
+ * A named bucket that measured time is accumulated into. Intended to be declared as a global
+ * variable and handed to Timers.
+ */
+class TimingCategory final {
+public:
+    TimingCategory(const char* name, TimingCategory* parent = nullptr);
+
+    unsigned int GetCategoryId() const {
+        return category_id;
+    }
+
+    /// Adds `amount` to this category's running total. Safe to call from several threads at once.
+    void AddTime(Duration amount) {
+        // Atomic add with relaxed ordering: no ordering of other memory accesses is implied.
+        const Duration::rep ticks = amount.count();
+        accumulated_duration.fetch_add(ticks, std::memory_order_relaxed);
+    }
+
+    /**
+     * Atomically swaps the running total with zero and returns the value that was stored, i.e. the
+     * time accumulated since the previous call. May run concurrently with AddTime.
+     */
+    Duration GetAccumulatedTime() {
+        const Duration::rep zero = 0;
+        const Duration::rep ticks = accumulated_duration.exchange(zero, std::memory_order_relaxed);
+        return Duration(ticks);
+    }
+
+private:
+    unsigned int category_id;
+    std::atomic<Duration::rep> accumulated_duration;
+};
+
+/**
+ * Measures time elapsed between a call to Start and a call to Stop and attributes it to the given
+ * TimingCategory. Start/Stop can be called multiple times on the same timer, but each call must be
+ * appropriately paired.
+ *
+ * When a Timer is started, it automatically pauses a previously running timer on the same thread,
+ * which is resumed when it is stopped. As such, no special action needs to be taken to avoid
+ * double-accounting of time on two categories.
+ */
+class Timer {
+public:
+    Timer(TimingCategory& category) : category(category) {
+    }
+
+    void Start() {
+#if ENABLE_PROFILING
+        ASSERT(!running);
+        previous_timer = current_timer; // Remember whichever timer was active on this thread.
+        current_timer = this;
+        if (previous_timer != nullptr)
+            previous_timer->StopTiming(); // Pause it; its elapsed time so far is added now.
+
+        StartTiming();
+#endif
+    }
+
+    void Stop() {
+#if ENABLE_PROFILING
+        ASSERT(running);
+        StopTiming();
+
+        if (previous_timer != nullptr)
+            previous_timer->StartTiming(); // Resume the timer that Start() paused.
+        current_timer = previous_timer;
+#endif
+    }
+
+private:
+#if ENABLE_PROFILING
+    void StartTiming() {
+        start = Clock::now();
+        running = true;
+    }
+
+    void StopTiming() {
+        auto duration = Clock::now() - start;
+        running = false;
+        category.AddTime(std::chrono::duration_cast<Duration>(duration));
+    }
+
+    Clock::time_point start; ///< When the current measurement interval began
+    bool running = false; ///< True between StartTiming() and StopTiming()
+
+    Timer* previous_timer; ///< Timer that was active when Start() was called (set by Start())
+    static thread_local Timer* current_timer; ///< Most recently started timer on this thread
+#endif
+
+    TimingCategory& category; ///< Category that measured time is added to
+};
+
+/**
+ * A Timer that automatically starts timing when created and stops at the end of the scope. Should
+ * be used in the majority of cases.
+ */
+class ScopeTimer : public Timer {
+public:
+    ScopeTimer(TimingCategory& category) : Timer(category) {
+        Start(); // Timing begins as soon as the object is constructed.
+    }
+
+    ~ScopeTimer() {
+        Stop(); // Timing ends when the object is destroyed.
+    }
+};
+
+} // namespace Profiling
+} // namespace Common
diff --git a/src/common/profiler_reporting.h b/src/common/profiler_reporting.h
new file mode 100644
index 00000000..3abb7331
--- /dev/null
+++ b/src/common/profiler_reporting.h
@@ -0,0 +1,108 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <mutex>
+#include <utility>
+#include <vector>
+
+#include "common/profiler.h"
+#include "common/synchronized_wrapper.h"
+
+namespace Common {
+namespace Profiling {
+
+struct TimingCategoryInfo {
+    static const unsigned int NO_PARENT = -1; ///< `parent` value of a category with no parent
+
+    TimingCategory* category; ///< The registered category object
+    const char* name; ///< Name passed at registration (pointer is stored, not copied)
+    unsigned int parent; ///< Id of the parent category, or NO_PARENT
+};
+
+struct ProfilingFrameResult { // Timings for one frame, filled in by ProfilingManager::FinishFrame()
+    /// Time since the last delivered frame
+    Duration interframe_time;
+
+    /// Time spent processing a frame, excluding VSync
+    Duration frame_time;
+
+    /// Total amount of time spent inside each category in this frame. Indexed by the category id
+    std::vector<Duration> time_per_category;
+};
+
+class ProfilingManager final {
+public:
+    ProfilingManager();
+    /// Adds a category and returns its id, which is its index in GetTimingCategoriesInfo().
+    unsigned int RegisterTimingCategory(TimingCategory* category, const char* name);
+    void SetTimingCategoryParent(unsigned int category, unsigned int parent);
+
+    const std::vector<TimingCategoryInfo>& GetTimingCategoriesInfo() const {
+        return timing_categories;
+    }
+
+    /// This should be called after swapping screen buffers.
+    void BeginFrame();
+    /// This should be called before swapping screen buffers.
+    void FinishFrame();
+
+    /// Get the timing results from the previous frame. This is updated when you call FinishFrame().
+    const ProfilingFrameResult& GetPreviousFrameResults() const {
+        return results;
+    }
+
+private:
+    std::vector<TimingCategoryInfo> timing_categories; ///< Registered categories, indexed by id
+    Clock::time_point last_frame_end; ///< Time of the most recent FinishFrame() call
+    Clock::time_point this_frame_start; ///< Time of the most recent BeginFrame() call
+
+    ProfilingFrameResult results; ///< Results computed by the most recent FinishFrame() call
+};
+
+struct AggregatedDuration { // Average, minimum and maximum of a series of durations
+    Duration avg, min, max;
+};
+
+struct AggregatedFrameResult {
+    /// Time since the last delivered frame
+    AggregatedDuration interframe_time;
+
+    /// Time spent processing a frame, excluding VSync
+    AggregatedDuration frame_time;
+
+    float fps; ///< Frames per second, derived from the average interframe time (0 if that is zero)
+
+    /// Aggregated time spent inside each category per frame. Indexed by the category id
+    std::vector<AggregatedDuration> time_per_category;
+};
+
+class TimingResultsAggregator final {
+public:
+    TimingResultsAggregator(size_t window_size); // `window_size` becomes max_window_size
+
+    void Clear();
+    void SetNumberOfCategories(size_t n);
+
+    void AddFrame(const ProfilingFrameResult& frame_result);
+
+    AggregatedFrameResult GetAggregatedResults() const;
+
+    size_t max_window_size; ///< Capacity of the sample buffers, in frames
+    size_t window_size; ///< Number of frames aggregated; grows up to max_window_size
+    size_t cursor; ///< Buffer index the next frame is written to
+
+    std::vector<Duration> interframe_times; ///< Per-frame interframe_time samples
+    std::vector<Duration> frame_times; ///< Per-frame frame_time samples
+    std::vector<std::vector<Duration>> times_per_category; ///< Indexed [category id][frame slot]
+};
+
+ProfilingManager& GetProfilingManager();
+SynchronizedRef<TimingResultsAggregator> GetTimingResultsAggregator();
+
+} // namespace Profiling
+} // namespace Common
diff --git a/src/common/swap.h b/src/common/swap.h
index e2d91836..7e37655b 100644
--- a/src/common/swap.h
+++ b/src/common/swap.h
@@ -17,18 +17,14 @@
 
 #pragma once
 
-// Android
-#if defined(ANDROID)
+#if defined(__linux__)
+#include <byteswap.h>
+#elif defined(__FreeBSD__)
 #include <sys/endian.h>
-
-#if _BYTE_ORDER == _LITTLE_ENDIAN && !defined(COMMON_LITTLE_ENDIAN)
-#define COMMON_LITTLE_ENDIAN 1
-#elif _BYTE_ORDER == _BIG_ENDIAN && !defined(COMMON_BIG_ENDIAN)
-#define COMMON_BIG_ENDIAN 1
 #endif
 
 // GCC 4.6+
-#elif __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#if __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
 
 #if __BYTE_ORDER__ && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && !defined(COMMON_LITTLE_ENDIAN)
 #define COMMON_LITTLE_ENDIAN 1
@@ -49,7 +45,6 @@
 #elif defined(_MSC_VER) && !defined(COMMON_BIG_ENDIAN) && !defined(COMMON_LITTLE_ENDIAN)
 
 #define COMMON_LITTLE_ENDIAN 1
-
 #endif
 
 // Worst case, default to little endian.
@@ -57,6 +52,93 @@
 #define COMMON_LITTLE_ENDIAN 1
 #endif
 
+namespace Common {
+
+inline u8 swap8(u8 _data) {return _data;} // a single byte has no byte order; returned unchanged
+inline u32 swap24(const u8* _data) {return (_data[0] << 16) | (_data[1] << 8) | _data[2];} // packs 3 bytes into the low 24 bits, _data[0] most significant
+
+#ifdef _MSC_VER // MSVC: _byteswap_* routines
+inline u16 swap16(u16 _data) {return _byteswap_ushort(_data);}
+inline u32 swap32(u32 _data) {return _byteswap_ulong (_data);}
+inline u64 swap64(u64 _data) {return _byteswap_uint64(_data);}
+#elif _M_ARM // ARM: rev16/rev instructions through inline assembly
+inline u16 swap16 (u16 _data) { u32 data = _data; __asm__ ("rev16 %0, %1\n" : "=l" (data) : "l" (data)); return (u16)data;}
+inline u32 swap32 (u32 _data) {__asm__ ("rev %0, %1\n" : "=l" (_data) : "l" (_data)); return _data;}
+inline u64 swap64(u64 _data) {return ((u64)swap32(_data) << 32) | swap32(_data >> 32);}
+#elif __linux__ // bswap_16/32/64 from <byteswap.h>
+inline u16 swap16(u16 _data) {return bswap_16(_data);}
+inline u32 swap32(u32 _data) {return bswap_32(_data);}
+inline u64 swap64(u64 _data) {return bswap_64(_data);}
+#elif __APPLE__ // compiler builtins for 32/64 bits, plain shifts for 16 bits
+inline __attribute__((always_inline)) u16 swap16(u16 _data)
+{return (_data >> 8) | (_data << 8);}
+inline __attribute__((always_inline)) u32 swap32(u32 _data)
+{return __builtin_bswap32(_data);}
+inline __attribute__((always_inline)) u64 swap64(u64 _data)
+{return __builtin_bswap64(_data);}
+#elif __FreeBSD__ // bswap16/32/64 from <sys/endian.h>
+inline u16 swap16(u16 _data) {return bswap16(_data);}
+inline u32 swap32(u32 _data) {return bswap32(_data);}
+inline u64 swap64(u64 _data) {return bswap64(_data);}
+#else
+// Slow generic implementation. Halves are widened before shifting so no shift is done in signed int.
+inline u16 swap16(u16 data) {return (data >> 8) | (data << 8);}
+inline u32 swap32(u32 data) {return ((u32)swap16(data) << 16) | swap16(data >> 16);}
+inline u64 swap64(u64 data) {return ((u64)swap32(data) << 32) | swap32(data >> 32);}
+#endif
+
+inline float swapf(float f) {
+    // View the float's storage as an integer, byte-swap that integer, and view it as a float again.
+    union FloatBits {
+        float value;
+        unsigned int bits;
+    };
+    FloatBits source, swapped;
+    source.value = f;
+    swapped.bits = swap32(source.bits);
+    return swapped.value;
+}
+
+inline double swapd(double f) {
+    // View the double's storage as an integer, byte-swap that integer, and view it as a double again.
+    union DoubleBits {
+        double value;
+        unsigned long long bits;
+    };
+    DoubleBits source, swapped;
+    source.value = f;
+    swapped.bits = swap64(source.bits);
+    return swapped.value;
+}
+
+inline u16 swap16(const u8* _pData) {return swap16(*(const u16*)_pData);} // reads a u16 through _pData, then swaps it
+inline u32 swap32(const u8* _pData) {return swap32(*(const u32*)_pData);} // same, for a u32
+inline u64 swap64(const u8* _pData) {return swap64(*(const u64*)_pData);} // same, for a u64; NOTE(review): these casts presumably rely on _pData being suitably aligned -- confirm callers
+
+template <int count> // count picks the specialization: the width in bytes of the value stored at data
+void swap(u8*); // declared only; the specializations that follow cover widths 1, 2, 4 and 8
+
+template <>
+inline void swap<1>(u8* data) { } // a single byte has nothing to reorder
+
+template <>
+inline void swap<2>(u8* data) {
+    *reinterpret_cast<u16*>(data) = swap16(data); // swap16(const u8*) reads the value; the result is stored back in place
+}
+
+template <>
+inline void swap<4>(u8* data) {
+    *reinterpret_cast<u32*>(data) = swap32(data); // swap32(const u8*) reads the value; the result is stored back in place
+}
+
+template <>
+inline void swap<8>(u8* data) {
+    *reinterpret_cast<u64*>(data) = swap64(data); // swap64(const u8*) reads the value; the result is stored back in place
+}
+    
+}  // Namespace Common
+
+
 template <typename T, typename F>
 struct swap_struct_t {
     typedef swap_struct_t<T, F> swapped_t;
@@ -448,35 +530,35 @@ bool operator==(const S &p, const swap_struct_t<T, F> v) {
 template <typename T>
 struct swap_64_t {
     static T swap(T x) {
-        return (T)bswap64(*(u64 *)&x);
+        return (T)Common::swap64(*(u64 *)&x);
     }
 };
 
 template <typename T>
 struct swap_32_t {
     static T swap(T x) {
-        return (T)bswap32(*(u32 *)&x);
+        return (T)Common::swap32(*(u32 *)&x);
     }
 };
 
 template <typename T>
 struct swap_16_t {
     static T swap(T x) {
-        return (T)bswap16(*(u16 *)&x);
+        return (T)Common::swap16(*(u16 *)&x);
     }
 };
 
 template <typename T>
 struct swap_float_t {
     static T swap(T x) {
-        return (T)bswapf(*(float *)&x);
+        return (T)Common::swapf(*(float *)&x);
     }
 };
 
 template <typename T>
 struct swap_double_t {
     static T swap(T x) {
-        return (T)bswapd(*(double *)&x);
+        return (T)Common::swapd(*(double *)&x);
     }
 };
 
@@ -527,4 +609,5 @@ typedef s64 s64_be;
 
 typedef float float_be;
 typedef double double_be;
+
 #endif
diff --git a/src/common/synchronized_wrapper.h b/src/common/synchronized_wrapper.h
new file mode 100644
index 00000000..946252b8
--- /dev/null
+++ b/src/common/synchronized_wrapper.h
@@ -0,0 +1,69 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <mutex>
+
+namespace Common {
+
+/**
+ * Wraps an object, only allowing access to it via a locking reference wrapper. Good to ensure no
+ * one forgets to lock a mutex before accessing an object. To access the wrapped object construct a
+ * SynchronizedRef on this wrapper. Inspired by Rust's Mutex type (http://doc.rust-lang.org/std/sync/struct.Mutex.html).
+ */
+template <typename T>
+class SynchronizedWrapper {
+public:
+    template <typename... Args>
+    SynchronizedWrapper(Args&&... args) : // every argument is forwarded to T's constructor
+        data(std::forward<Args>(args)...) {
+    }
+
+private:
+    template <typename U>
+    friend class SynchronizedRef; // the only type granted access to mutex and data
+
+    std::mutex mutex; // locked and unlocked by SynchronizedRef
+    T data;
+};
+
+/**
+ * Synchronized reference that keeps a SynchronizedWrapper's mutex locked during its lifetime. This
+ * greatly reduces the chance that someone will access the wrapped resource without locking the
+ * mutex. Move-only: a reference that has been moved from holds no lock and must not be dereferenced.
+ */
+template <typename T>
+class SynchronizedRef {
+public:
+    SynchronizedRef(SynchronizedWrapper<T>& wrapper) : wrapper(&wrapper) {
+        wrapper.mutex.lock(); // blocks until the wrapper's mutex is acquired
+    }
+
+    SynchronizedRef(SynchronizedRef&) = delete;
+    SynchronizedRef(SynchronizedRef&& o) : wrapper(o.wrapper) {
+        o.wrapper = nullptr; // the lock now belongs to this object, so o must not unlock it
+    }
+
+    ~SynchronizedRef() {
+        if (wrapper) // null once the lock has been moved to another reference
+            wrapper->mutex.unlock();
+    }
+
+    SynchronizedRef& operator=(SynchronizedRef&) = delete;
+    SynchronizedRef& operator=(SynchronizedRef&& o) {
+        std::swap(wrapper, o.wrapper); // o's destructor releases whatever lock this object held before
+        return *this; // previously missing: flowing off the end of this function is undefined behaviour
+    }
+
+    T& operator*() { return wrapper->data; }
+    const T& operator*() const { return wrapper->data; }
+    T* operator->() { return &wrapper->data; }
+    const T* operator->() const { return &wrapper->data; }
+
+private:
+    SynchronizedWrapper<T>* wrapper;
+};
+
+} // namespace Common
diff --git a/src/common/thread.h b/src/common/thread.h
index eaf1ba00..a45728e1 100644
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -24,6 +24,25 @@
 #include <unistd.h>
 #endif
 
+// Support for C++11's thread_local keyword was surprisingly spotty in compilers until very
+// recently. Fortunately, thread-local variables have been well supported by compilers for a while,
+// but with semantics supporting only POD types, so we can use a few defines to get some amount of
+// backwards compat support.
+// WARNING: This only works correctly with POD types.
+#if defined(__clang__) // tested first because Clang also defines __GNUC__
+#   if !__has_feature(cxx_thread_local) // Clang reports thread_local support through __has_feature
+#       define thread_local __thread
+#   endif
+#elif defined(__GNUC__)
+#   if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8) // GCC older than 4.8
+#       define thread_local __thread
+#   endif
+#elif defined(_MSC_VER)
+#   if _MSC_VER < 1900 // toolsets older than Visual Studio 2015
+#       define thread_local __declspec(thread)
+#   endif
+#endif
+
 namespace Common
 {