diff options
author | mtklein <mtklein@chromium.org> | 2015-03-18 09:51:23 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-03-18 09:51:24 -0700 |
commit | f94fa7112f67af6fc5db19f86d8397307ba17105 (patch) | |
tree | 0a869f7981b087a786b09e1f630044ca303d8032 /src/core | |
parent | 6f94076da504a9e292c7f6173b039d2692d47c51 (diff) |
SkPMFloat: avoid loads and stores where possible.
A store/load pair like this is a redundant no-op:
store simd_register_a, memory_address
load memory_address, simd_register_a
Everyone seems to be good at removing those when using SSE, but GCC and Clang
are pretty terrible at this for NEON. We end up issuing both redundant
commands, usually to and from the stack. That's slow. Let's not do that.
This CL unions the native SIMD register type into SkPMFloat, so that we can
assign to and from it directly, which generates much better NEON code. On
my Nexus 5, the benchmarks improve from 36ns to 23ns.
SSE is just as fast either way, but I paralleled the NEON code for consistency.
It's a little terser. And because it needed the platform headers anyway, I
moved all includes into SkPMFloat.h, again only for consistency.
I'd union in Sk4f too to make its conversion methods a little clearer,
but MSVC won't let me (Sk4f has a copy constructor, and MSVC is apparently not up
to speed with C++11 unrestricted unions).
BUG=skia:
Review URL: https://codereview.chromium.org/1015083004
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/SkPMFloat.h | 30 |
1 files changed, 20 insertions, 10 deletions
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h
index 4534f1142c..04323ad1fc 100644
--- a/src/core/SkPMFloat.h
+++ b/src/core/SkPMFloat.h
@@ -3,8 +3,15 @@
 
 #include "SkTypes.h"
 #include "SkColor.h"
+#include "SkColorPriv.h"
 #include "Sk4x.h"
 
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    #include <immintrin.h>
+#elif defined(__ARM_NEON__)
+    #include <arm_neon.h>
+#endif
+
 // A pre-multiplied color storing each component in the same order as SkPMColor,
 // but as a float in the range [0, 255].
 class SK_STRUCT_ALIGN(16) SkPMFloat {
@@ -27,16 +34,12 @@ public:
     // Uninitialized.
     SkPMFloat() {}
 
-    // Copy and assign are fastest if we remind the compiler we work best as Sk4f.
-    SkPMFloat(const SkPMFloat& that) { Sk4f(that).storeAligned(fColor); }
-    SkPMFloat& operator=(const SkPMFloat& that) {
-        Sk4f(that).storeAligned(fColor);
-        return *this;
-    }
+    SkPMFloat(const SkPMFloat& that) { *this = that; }
+    SkPMFloat& operator=(const SkPMFloat& that);
 
-    // Freely autoconvert between SkPMFloat and Sk4f.
-    /*implicit*/ SkPMFloat(const Sk4f& fs) { fs.storeAligned(fColor); }
-    /*implicit*/ operator Sk4f() const { return Sk4f::LoadAligned(fColor); }
+    // Freely autoconvert between SkPMFloat and Sk4f. They're always byte-for-byte identical.
+    /*implicit*/ SkPMFloat(const Sk4f& fs) { *(Sk4f*)this = fs; }
+    /*implicit*/ operator Sk4f() const { return *(const Sk4f*)this; }
 
     float a() const { return fColor[SK_A32_SHIFT / 8]; }
     float r() const { return fColor[SK_R32_SHIFT / 8]; }
@@ -59,7 +62,14 @@ public:
     }
 
 private:
-    float fColor[4];
+    union {
+        float fColor[4];
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+        __m128 fColors;
+#elif defined(__ARM_NEON__)
+        float32x4_t fColors;
+#endif
+    };
 };
 
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3