SkPMFloat: avoid loads and stores where possible.

A store/load pair like this is a redundant no-op: store simd_register_a, memory_address load memory_address, simd_register_a Everyone seems to be good at removing those when using SSE, but GCC and Clang are pretty terrible at this for NEON. We end up issuing both redundant commands, usually to and from the stack. That's slow. Let's not do that. This CL unions in the native SIMD register type into SkPMFloat, so that we can assign to and from it directly, which is generating a lot better NEON code. On my Nexus 5, the benchmarks improve from 36ns to 23ns. SSE is just as fast either way, but I paralleled the NEON code for consistency. It's a little terser. And because it needed the platform headers anyway, I moved all includes into SkPMFloat.h, again only for consistency. I'd union in Sk4f too to make its conversion methods a little clearer, but MSVC won't let me (it has a copy constructor... they're apparently not up to speed with C++11 unrestricted unions). BUG=skia: Review URL: https://codereview.chromium.org/1015083004
author: mtklein <mtklein@chromium.org> 2015-03-18 09:51:23 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-03-18 09:51:24 -0700
commit: f94fa7112f67af6fc5db19f86d8397307ba17105 (patch)
tree: 0a869f7981b087a786b09e1f630044ca303d8032 /src
parent: 6f94076da504a9e292c7f6173b039d2692d47c51 (diff)
5 files changed, 52 insertions, 33 deletions
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h
index 4534f1142c..04323ad1fc 100644
--- a/src/core/SkPMFloat.h
+++ b/src/core/SkPMFloat.h
@@ -3,8 +3,15 @@
 
 #include "SkTypes.h"
 #include "SkColor.h"
+#include "SkColorPriv.h"
 #include "Sk4x.h"
 
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    #include <immintrin.h>
+#elif defined(__ARM_NEON__)
+    #include <arm_neon.h>
+#endif
+
 // A pre-multiplied color storing each component in the same order as SkPMColor,
 // but as a float in the range [0, 255].
 class SK_STRUCT_ALIGN(16) SkPMFloat {
@@ -27,16 +34,12 @@ public:
     // Uninitialized.
     SkPMFloat() {}
 
-    // Copy and assign are fastest if we remind the compiler we work best as Sk4f.
-    SkPMFloat(const SkPMFloat& that) { Sk4f(that).storeAligned(fColor); }
-    SkPMFloat& operator=(const SkPMFloat& that) {
-        Sk4f(that).storeAligned(fColor);
-        return *this;
-    }
+    SkPMFloat(const SkPMFloat& that) { *this = that; }
+    SkPMFloat& operator=(const SkPMFloat& that);
 
-    // Freely autoconvert between SkPMFloat and Sk4f.
-    /*implicit*/ SkPMFloat(const Sk4f& fs) { fs.storeAligned(fColor); }
-    /*implicit*/ operator Sk4f() const { return Sk4f::LoadAligned(fColor); }
+    // Freely autoconvert between SkPMFloat and Sk4f.  They're always byte-for-byte identical.
+    /*implicit*/ SkPMFloat(const Sk4f& fs) { *(Sk4f*)this = fs; }
+    /*implicit*/ operator Sk4f() const     { return *(const Sk4f*)this;    }
 
     float a() const { return fColor[SK_A32_SHIFT / 8]; }
     float r() const { return fColor[SK_R32_SHIFT / 8]; }
@@ -59,7 +62,14 @@ public:
     }
 
 private:
-    float fColor[4];
+    union {
+        float fColor[4];
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+        __m128 fColors;
+#elif defined(__ARM_NEON__)
+        float32x4_t fColors;
+#endif
+    };
 };
 
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h
index 7298b4da3c..9c1295b565 100644
--- a/src/opts/SkPMFloat_SSE2.h
+++ b/src/opts/SkPMFloat_SSE2.h
@@ -1,5 +1,7 @@
-#include "SkColorPriv.h"
-#include <emmintrin.h>
+inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
+    fColors = that.fColors;
+    return *this;
+}
 
 // For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
 // (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
@@ -13,7 +15,7 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) {
     __m128i fix8    = _mm_set_epi32(0,0,0,c),
             fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()),
             fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
-    _mm_store_ps(fColor, _mm_cvtepi32_ps(fix8_32));
+    fColors = _mm_cvtepi32_ps(fix8_32);
     SkASSERT(this->isValid());
 }
 
@@ -23,7 +25,7 @@ inline SkPMColor SkPMFloat::get() const {
 }
 
 inline SkPMColor SkPMFloat::clamped() const {
-    __m128i fix8_32 = _mm_cvtps_epi32(_mm_load_ps(fColor)),  // _mm_cvtps_epi32 rounds for us!
+    __m128i fix8_32 = _mm_cvtps_epi32(fColors),  // _mm_cvtps_epi32 rounds for us!
             fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
             fix8    = _mm_packus_epi16(fix8_16, fix8_16);
     SkPMColor c = _mm_cvtsi128_si32(fix8);
@@ -45,10 +47,10 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4
 
 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
     // Same as _SSSE3.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    __m128i c0 = _mm_cvtps_epi32(_mm_load_ps(floats[0].fColor)),  // _mm_cvtps_epi32 rounds for us!
-            c1 = _mm_cvtps_epi32(_mm_load_ps(floats[1].fColor)),
-            c2 = _mm_cvtps_epi32(_mm_load_ps(floats[2].fColor)),
-            c3 = _mm_cvtps_epi32(_mm_load_ps(floats[3].fColor));
+    __m128i c0 = _mm_cvtps_epi32(floats[0].fColors),  // _mm_cvtps_epi32 rounds for us!
+            c1 = _mm_cvtps_epi32(floats[1].fColors),
+            c2 = _mm_cvtps_epi32(floats[2].fColors),
+            c3 = _mm_cvtps_epi32(floats[3].fColors);
     __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
                                      _mm_packus_epi16(c2, c3));
     _mm_storeu_si128((__m128i*)colors, c3210);
diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h
index ff296178cc..b2c2b23429 100644
--- a/src/opts/SkPMFloat_SSSE3.h
+++ b/src/opts/SkPMFloat_SSSE3.h
@@ -1,5 +1,7 @@
-#include "SkColorPriv.h"
-#include <tmmintrin.h>
+inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
+    fColors = that.fColors;
+    return *this;
+}
 
 // For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 32 bits
 // (fix8_32), then convert those to floats.
@@ -14,14 +16,14 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) {
     const int _ = 255;  // _ means to zero that byte.
     __m128i fix8    = _mm_set_epi32(0,0,0,c),
             fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,_,1, _,_,_,0));
-    _mm_store_ps(fColor, _mm_cvtepi32_ps(fix8_32));
+    fColors = _mm_cvtepi32_ps(fix8_32);
     SkASSERT(this->isValid());
 }
 
 inline SkPMColor SkPMFloat::get() const {
     SkASSERT(this->isValid());
     const int _ = 255;  // _ means to zero that byte.
-    __m128i fix8_32 = _mm_cvtps_epi32(_mm_load_ps(fColor)),  // _mm_cvtps_epi32 rounds for us!
+    __m128i fix8_32 = _mm_cvtps_epi32(fColors),  // _mm_cvtps_epi32 rounds for us!
             fix8    = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0));
     SkPMColor c = _mm_cvtsi128_si32(fix8);
     SkPMColorAssert(c);
@@ -29,7 +31,7 @@ inline SkPMColor SkPMFloat::get() const {
 }
 
 inline SkPMColor SkPMFloat::clamped() const {
-    __m128i fix8_32 = _mm_cvtps_epi32(_mm_load_ps(fColor)),  // _mm_cvtps_epi32 rounds for us!
+    __m128i fix8_32 = _mm_cvtps_epi32(fColors),  // _mm_cvtps_epi32 rounds for us!
             fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
             fix8    = _mm_packus_epi16(fix8_16, fix8_16);
     SkPMColor c = _mm_cvtsi128_si32(fix8);
@@ -49,10 +51,10 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4
 
 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
     // Same as _SSE2.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    __m128i c0 = _mm_cvtps_epi32(_mm_load_ps(floats[0].fColor)),  // _mm_cvtps_epi32 rounds for us!
-            c1 = _mm_cvtps_epi32(_mm_load_ps(floats[1].fColor)),
-            c2 = _mm_cvtps_epi32(_mm_load_ps(floats[2].fColor)),
-            c3 = _mm_cvtps_epi32(_mm_load_ps(floats[3].fColor));
+    __m128i c0 = _mm_cvtps_epi32(floats[0].fColors),  // _mm_cvtps_epi32 rounds for us!
+            c1 = _mm_cvtps_epi32(floats[1].fColors),
+            c2 = _mm_cvtps_epi32(floats[2].fColors),
+            c3 = _mm_cvtps_epi32(floats[3].fColors);
     __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
                                      _mm_packus_epi16(c2, c3));
     _mm_storeu_si128((__m128i*)colors, c3210);
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
index 036d10d0d6..6c9df37e51 100644
--- a/src/opts/SkPMFloat_neon.h
+++ b/src/opts/SkPMFloat_neon.h
@@ -1,5 +1,7 @@
-#include "SkColorPriv.h"
-#include <arm_neon.h>
+inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
+    fColors = that.fColors;
+    return *this;
+}
 
 // For SkPMFloat(SkPMFColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
 // (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
@@ -13,13 +15,13 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) {
     uint8x8_t   fix8    = (uint8x8_t)vdup_n_u32(c);
     uint16x8_t  fix8_16 = vmovl_u8(fix8);
     uint32x4_t  fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
-    vst1q_f32(fColor, vcvtq_f32_u32(fix8_32));
+    fColors = vcvtq_f32_u32(fix8_32);
     SkASSERT(this->isValid());
 }
 
 inline SkPMColor SkPMFloat::get() const {
     SkASSERT(this->isValid());
-    float32x4_t add_half = vaddq_f32(vld1q_f32(fColor), vdupq_n_f32(0.5f));
+    float32x4_t add_half = vaddq_f32(fColors, vdupq_n_f32(0.5f));
     uint32x4_t  fix8_32  = vcvtq_u32_f32(add_half);  // vcvtq_u32_f32 truncates, so round manually
     uint16x4_t  fix8_16  = vmovn_u32(fix8_32);
     uint8x8_t   fix8     = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
@@ -29,7 +31,7 @@ inline SkPMColor SkPMFloat::get() const {
 }
 
 inline SkPMColor SkPMFloat::clamped() const {
-    float32x4_t add_half = vaddq_f32(vld1q_f32(fColor), vdupq_n_f32(0.5f));
+    float32x4_t add_half = vaddq_f32(fColors, vdupq_n_f32(0.5f));
     uint32x4_t  fix8_32  = vcvtq_u32_f32(add_half);  // vcvtq_u32_f32 truncates, so round manually
     uint16x4_t  fix8_16  = vqmovn_u32(fix8_32);
     uint8x8_t   fix8     = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
index a33fa7a28d..c47f8a3713 100644
--- a/src/opts/SkPMFloat_none.h
+++ b/src/opts/SkPMFloat_none.h
@@ -1,4 +1,7 @@
-#include "SkColorPriv.h"
+inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
+    for (int i = 0; i < 4; i++) { fColor[i] = that.fColor[i]; }
+    return *this;
+}
 
 inline SkPMFloat::SkPMFloat(SkPMColor c) {
     *this = SkPMFloat::FromARGB(SkGetPackedA32(c),
author	mtklein <mtklein@chromium.org>	2015-03-18 09:51:23 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-03-18 09:51:24 -0700
commit	f94fa7112f67af6fc5db19f86d8397307ba17105 (patch)
tree	0a869f7981b087a786b09e1f630044ca303d8032 /src
parent	6f94076da504a9e292c7f6173b039d2692d47c51 (diff)