5 files changed, 52 insertions, 33 deletions
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h
index 4534f1142c..04323ad1fc 100644
--- a/src/core/SkPMFloat.h
+++ b/src/core/SkPMFloat.h
@@ -3,8 +3,15 @@
 
 #include "SkTypes.h"
 #include "SkColor.h"
+#include "SkColorPriv.h"
 #include "Sk4x.h"
 
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+    #include <immintrin.h>
+#elif defined(__ARM_NEON__)
+    #include <arm_neon.h>
+#endif
+
 // A pre-multiplied color storing each component in the same order as SkPMColor,
 // but as a float in the range [0, 255].
 class SK_STRUCT_ALIGN(16) SkPMFloat {
@@ -27,16 +34,12 @@ public:
     // Uninitialized.
     SkPMFloat() {}
 
-    // Copy and assign are fastest if we remind the compiler we work best as Sk4f.
-    SkPMFloat(const SkPMFloat& that) { Sk4f(that).storeAligned(fColor); }
-    SkPMFloat& operator=(const SkPMFloat& that) {
-        Sk4f(that).storeAligned(fColor);
-        return *this;
-    }
+    SkPMFloat(const SkPMFloat& that) { *this = that; }
+    SkPMFloat& operator=(const SkPMFloat& that);
 
-    // Freely autoconvert between SkPMFloat and Sk4f.
-    /*implicit*/ SkPMFloat(const Sk4f& fs) { fs.storeAligned(fColor); }
-    /*implicit*/ operator Sk4f() const { return Sk4f::LoadAligned(fColor); }
+    // Freely autoconvert between SkPMFloat and Sk4f.  They're always byte-for-byte identical.
+    /*implicit*/ SkPMFloat(const Sk4f& fs) { *(Sk4f*)this = fs; }
+    /*implicit*/ operator Sk4f() const     { return *(const Sk4f*)this;    }
 
     float a() const { return fColor[SK_A32_SHIFT / 8]; }
     float r() const { return fColor[SK_R32_SHIFT / 8]; }
@@ -59,7 +62,14 @@ public:
     }
 
 private:
-    float fColor[4];
+    union {
+        float fColor[4];
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+        __m128 fColors;
+#elif defined(__ARM_NEON__)
+        float32x4_t fColors;
+#endif
+    };
 };
 
 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h
index 7298b4da3c..9c1295b565 100644
--- a/src/opts/SkPMFloat_SSE2.h
+++ b/src/opts/SkPMFloat_SSE2.h
@@ -1,5 +1,7 @@
-#include "SkColorPriv.h"
-#include <emmintrin.h>
+inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
+    fColors = that.fColors;
+    return *this;
+}
 
 // For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
 // (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
@@ -13,7 +15,7 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) {
     __m128i fix8    = _mm_set_epi32(0,0,0,c),
             fix8_16 = _mm_unpacklo_epi8 (fix8,    _mm_setzero_si128()),
             fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128());
-    _mm_store_ps(fColor, _mm_cvtepi32_ps(fix8_32));
+    fColors = _mm_cvtepi32_ps(fix8_32);
     SkASSERT(this->isValid());
 }
 
@@ -23,7 +25,7 @@ inline SkPMColor SkPMFloat::get() const {
 }
 
 inline SkPMColor SkPMFloat::clamped() const {
-    __m128i fix8_32 = _mm_cvtps_epi32(_mm_load_ps(fColor)),  // _mm_cvtps_epi32 rounds for us!
+    __m128i fix8_32 = _mm_cvtps_epi32(fColors),  // _mm_cvtps_epi32 rounds for us!
             fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
             fix8    = _mm_packus_epi16(fix8_16, fix8_16);
     SkPMColor c = _mm_cvtsi128_si32(fix8);
@@ -45,10 +47,10 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4
 
 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
     // Same as _SSSE3.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    __m128i c0 = _mm_cvtps_epi32(_mm_load_ps(floats[0].fColor)),  // _mm_cvtps_epi32 rounds for us!
-            c1 = _mm_cvtps_epi32(_mm_load_ps(floats[1].fColor)),
-            c2 = _mm_cvtps_epi32(_mm_load_ps(floats[2].fColor)),
-            c3 = _mm_cvtps_epi32(_mm_load_ps(floats[3].fColor));
+    __m128i c0 = _mm_cvtps_epi32(floats[0].fColors),  // _mm_cvtps_epi32 rounds for us!
+            c1 = _mm_cvtps_epi32(floats[1].fColors),
+            c2 = _mm_cvtps_epi32(floats[2].fColors),
+            c3 = _mm_cvtps_epi32(floats[3].fColors);
     __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
                                      _mm_packus_epi16(c2, c3));
     _mm_storeu_si128((__m128i*)colors, c3210);
diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h
index ff296178cc..b2c2b23429 100644
--- a/src/opts/SkPMFloat_SSSE3.h
+++ b/src/opts/SkPMFloat_SSSE3.h
@@ -1,5 +1,7 @@
-#include "SkColorPriv.h"
-#include <tmmintrin.h>
+inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
+    fColors = that.fColors;
+    return *this;
+}
 
 // For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 32 bits
 // (fix8_32), then convert those to floats.
@@ -14,14 +16,14 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) {
     const int _ = 255;  // _ means to zero that byte.
     __m128i fix8    = _mm_set_epi32(0,0,0,c),
             fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,_,1, _,_,_,0));
-    _mm_store_ps(fColor, _mm_cvtepi32_ps(fix8_32));
+    fColors = _mm_cvtepi32_ps(fix8_32);
     SkASSERT(this->isValid());
 }
 
 inline SkPMColor SkPMFloat::get() const {
     SkASSERT(this->isValid());
     const int _ = 255;  // _ means to zero that byte.
-    __m128i fix8_32 = _mm_cvtps_epi32(_mm_load_ps(fColor)),  // _mm_cvtps_epi32 rounds for us!
+    __m128i fix8_32 = _mm_cvtps_epi32(fColors),  // _mm_cvtps_epi32 rounds for us!
             fix8    = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0));
     SkPMColor c = _mm_cvtsi128_si32(fix8);
     SkPMColorAssert(c);
@@ -29,7 +31,7 @@ inline SkPMColor SkPMFloat::get() const {
 }
 
 inline SkPMColor SkPMFloat::clamped() const {
-    __m128i fix8_32 = _mm_cvtps_epi32(_mm_load_ps(fColor)),  // _mm_cvtps_epi32 rounds for us!
+    __m128i fix8_32 = _mm_cvtps_epi32(fColors),  // _mm_cvtps_epi32 rounds for us!
             fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
             fix8    = _mm_packus_epi16(fix8_16, fix8_16);
     SkPMColor c = _mm_cvtsi128_si32(fix8);
@@ -49,10 +51,10 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4
 
 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
     // Same as _SSE2.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    __m128i c0 = _mm_cvtps_epi32(_mm_load_ps(floats[0].fColor)),  // _mm_cvtps_epi32 rounds for us!
-            c1 = _mm_cvtps_epi32(_mm_load_ps(floats[1].fColor)),
-            c2 = _mm_cvtps_epi32(_mm_load_ps(floats[2].fColor)),
-            c3 = _mm_cvtps_epi32(_mm_load_ps(floats[3].fColor));
+    __m128i c0 = _mm_cvtps_epi32(floats[0].fColors),  // _mm_cvtps_epi32 rounds for us!
+            c1 = _mm_cvtps_epi32(floats[1].fColors),
+            c2 = _mm_cvtps_epi32(floats[2].fColors),
+            c3 = _mm_cvtps_epi32(floats[3].fColors);
     __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
                                      _mm_packus_epi16(c2, c3));
     _mm_storeu_si128((__m128i*)colors, c3210);
diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h
index 036d10d0d6..6c9df37e51 100644
--- a/src/opts/SkPMFloat_neon.h
+++ b/src/opts/SkPMFloat_neon.h
@@ -1,5 +1,7 @@
-#include "SkColorPriv.h"
-#include <arm_neon.h>
+inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
+    fColors = that.fColors;
+    return *this;
+}
 
 // For SkPMFloat(SkPMFColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits
 // (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats.
@@ -13,13 +15,13 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) {
     uint8x8_t   fix8    = (uint8x8_t)vdup_n_u32(c);
     uint16x8_t  fix8_16 = vmovl_u8(fix8);
     uint32x4_t  fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
-    vst1q_f32(fColor, vcvtq_f32_u32(fix8_32));
+    fColors = vcvtq_f32_u32(fix8_32);
     SkASSERT(this->isValid());
 }
 
 inline SkPMColor SkPMFloat::get() const {
     SkASSERT(this->isValid());
-    float32x4_t add_half = vaddq_f32(vld1q_f32(fColor), vdupq_n_f32(0.5f));
+    float32x4_t add_half = vaddq_f32(fColors, vdupq_n_f32(0.5f));
     uint32x4_t  fix8_32  = vcvtq_u32_f32(add_half);  // vcvtq_u32_f32 truncates, so round manually
     uint16x4_t  fix8_16  = vmovn_u32(fix8_32);
     uint8x8_t   fix8     = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
@@ -29,7 +31,7 @@ inline SkPMColor SkPMFloat::get() const {
 }
 
 inline SkPMColor SkPMFloat::clamped() const {
-    float32x4_t add_half = vaddq_f32(vld1q_f32(fColor), vdupq_n_f32(0.5f));
+    float32x4_t add_half = vaddq_f32(fColors, vdupq_n_f32(0.5f));
     uint32x4_t  fix8_32  = vcvtq_u32_f32(add_half);  // vcvtq_u32_f32 truncates, so round manually
     uint16x4_t  fix8_16  = vqmovn_u32(fix8_32);
     uint8x8_t   fix8     = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h
index a33fa7a28d..c47f8a3713 100644
--- a/src/opts/SkPMFloat_none.h
+++ b/src/opts/SkPMFloat_none.h
@@ -1,4 +1,7 @@
-#include "SkColorPriv.h"
+inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) {
+    for (int i = 0; i < 4; i++) { fColor[i] = that.fColor[i]; }
+    return *this;
+}
 
 inline SkPMFloat::SkPMFloat(SkPMColor c) {
     *this = SkPMFloat::FromARGB(SkGetPackedA32(c),