diff options
-rw-r--r-- | src/core/SkPMFloat.h | 30 | ||||
-rw-r--r-- | src/opts/SkPMFloat_SSE2.h | 18 | ||||
-rw-r--r-- | src/opts/SkPMFloat_SSSE3.h | 20 | ||||
-rw-r--r-- | src/opts/SkPMFloat_neon.h | 12 | ||||
-rw-r--r-- | src/opts/SkPMFloat_none.h | 5 |
5 files changed, 52 insertions, 33 deletions
diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h index 4534f1142c..04323ad1fc 100644 --- a/src/core/SkPMFloat.h +++ b/src/core/SkPMFloat.h @@ -3,8 +3,15 @@ #include "SkTypes.h" #include "SkColor.h" +#include "SkColorPriv.h" #include "Sk4x.h" +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + #include <immintrin.h> +#elif defined(__ARM_NEON__) + #include <arm_neon.h> +#endif + // A pre-multiplied color storing each component in the same order as SkPMColor, // but as a float in the range [0, 255]. class SK_STRUCT_ALIGN(16) SkPMFloat { @@ -27,16 +34,12 @@ public: // Uninitialized. SkPMFloat() {} - // Copy and assign are fastest if we remind the compiler we work best as Sk4f. - SkPMFloat(const SkPMFloat& that) { Sk4f(that).storeAligned(fColor); } - SkPMFloat& operator=(const SkPMFloat& that) { - Sk4f(that).storeAligned(fColor); - return *this; - } + SkPMFloat(const SkPMFloat& that) { *this = that; } + SkPMFloat& operator=(const SkPMFloat& that); - // Freely autoconvert between SkPMFloat and Sk4f. - /*implicit*/ SkPMFloat(const Sk4f& fs) { fs.storeAligned(fColor); } - /*implicit*/ operator Sk4f() const { return Sk4f::LoadAligned(fColor); } + // Freely autoconvert between SkPMFloat and Sk4f. They're always byte-for-byte identical. + /*implicit*/ SkPMFloat(const Sk4f& fs) { *(Sk4f*)this = fs; } + /*implicit*/ operator Sk4f() const { return *(const Sk4f*)this; } float a() const { return fColor[SK_A32_SHIFT / 8]; } float r() const { return fColor[SK_R32_SHIFT / 8]; } @@ -59,7 +62,14 @@ public: } private: - float fColor[4]; + union { + float fColor[4]; +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 + __m128 fColors; +#elif defined(__ARM_NEON__) + float32x4_t fColors; +#endif + }; }; #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h index 7298b4da3c..9c1295b565 100644 --- a/src/opts/SkPMFloat_SSE2.h +++ b/src/opts/SkPMFloat_SSE2.h @@ -1,5 +1,7 @@ -#include "SkColorPriv.h" -#include <emmintrin.h> +inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) { + fColors = that.fColors; + return *this; +} // For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits // (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats. @@ -13,7 +15,7 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) { __m128i fix8 = _mm_set_epi32(0,0,0,c), fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()), fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128()); - _mm_store_ps(fColor, _mm_cvtepi32_ps(fix8_32)); + fColors = _mm_cvtepi32_ps(fix8_32); SkASSERT(this->isValid()); } @@ -23,7 +25,7 @@ inline SkPMColor SkPMFloat::get() const { } inline SkPMColor SkPMFloat::clamped() const { - __m128i fix8_32 = _mm_cvtps_epi32(_mm_load_ps(fColor)), // _mm_cvtps_epi32 rounds for us! + __m128i fix8_32 = _mm_cvtps_epi32(fColors), // _mm_cvtps_epi32 rounds for us! fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), fix8 = _mm_packus_epi16(fix8_16, fix8_16); SkPMColor c = _mm_cvtsi128_si32(fix8); @@ -45,10 +47,10 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { // Same as _SSSE3.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8. - __m128i c0 = _mm_cvtps_epi32(_mm_load_ps(floats[0].fColor)), // _mm_cvtps_epi32 rounds for us! - c1 = _mm_cvtps_epi32(_mm_load_ps(floats[1].fColor)), - c2 = _mm_cvtps_epi32(_mm_load_ps(floats[2].fColor)), - c3 = _mm_cvtps_epi32(_mm_load_ps(floats[3].fColor)); + __m128i c0 = _mm_cvtps_epi32(floats[0].fColors), // _mm_cvtps_epi32 rounds for us! + c1 = _mm_cvtps_epi32(floats[1].fColors), + c2 = _mm_cvtps_epi32(floats[2].fColors), + c3 = _mm_cvtps_epi32(floats[3].fColors); __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), _mm_packus_epi16(c2, c3)); _mm_storeu_si128((__m128i*)colors, c3210); diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h index ff296178cc..b2c2b23429 100644 --- a/src/opts/SkPMFloat_SSSE3.h +++ b/src/opts/SkPMFloat_SSSE3.h @@ -1,5 +1,7 @@ -#include "SkColorPriv.h" -#include <tmmintrin.h> +inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) { + fColors = that.fColors; + return *this; +} // For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit components in 32 bits // (fix8_32), then convert those to floats. @@ -14,14 +16,14 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) { const int _ = 255; // _ means to zero that byte. __m128i fix8 = _mm_set_epi32(0,0,0,c), fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,_,1, _,_,_,0)); - _mm_store_ps(fColor, _mm_cvtepi32_ps(fix8_32)); + fColors = _mm_cvtepi32_ps(fix8_32); SkASSERT(this->isValid()); } inline SkPMColor SkPMFloat::get() const { SkASSERT(this->isValid()); const int _ = 255; // _ means to zero that byte. - __m128i fix8_32 = _mm_cvtps_epi32(_mm_load_ps(fColor)), // _mm_cvtps_epi32 rounds for us! + __m128i fix8_32 = _mm_cvtps_epi32(fColors), // _mm_cvtps_epi32 rounds for us! fix8 = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0)); SkPMColor c = _mm_cvtsi128_si32(fix8); SkPMColorAssert(c); @@ -29,7 +31,7 @@ inline SkPMColor SkPMFloat::get() const { } inline SkPMColor SkPMFloat::clamped() const { - __m128i fix8_32 = _mm_cvtps_epi32(_mm_load_ps(fColor)), // _mm_cvtps_epi32 rounds for us! + __m128i fix8_32 = _mm_cvtps_epi32(fColors), // _mm_cvtps_epi32 rounds for us! fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), fix8 = _mm_packus_epi16(fix8_16, fix8_16); SkPMColor c = _mm_cvtsi128_si32(fix8); @@ -49,10 +51,10 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) { // Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses 8. - __m128i c0 = _mm_cvtps_epi32(_mm_load_ps(floats[0].fColor)), // _mm_cvtps_epi32 rounds for us! - c1 = _mm_cvtps_epi32(_mm_load_ps(floats[1].fColor)), - c2 = _mm_cvtps_epi32(_mm_load_ps(floats[2].fColor)), - c3 = _mm_cvtps_epi32(_mm_load_ps(floats[3].fColor)); + __m128i c0 = _mm_cvtps_epi32(floats[0].fColors), // _mm_cvtps_epi32 rounds for us! + c1 = _mm_cvtps_epi32(floats[1].fColors), + c2 = _mm_cvtps_epi32(floats[2].fColors), + c3 = _mm_cvtps_epi32(floats[3].fColors); __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), _mm_packus_epi16(c2, c3)); _mm_storeu_si128((__m128i*)colors, c3210); diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h index 036d10d0d6..6c9df37e51 100644 --- a/src/opts/SkPMFloat_neon.h +++ b/src/opts/SkPMFloat_neon.h @@ -1,5 +1,7 @@ -#include "SkColorPriv.h" -#include <arm_neon.h> +inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) { + fColors = that.fColors; + return *this; +} // For SkPMFloat(SkPMFColor), we widen our 8 bit components (fix8) to 8-bit components in 16 bits // (fix8_16), then widen those to 8-bit-in-32-bits (fix8_32), and finally convert those to floats. @@ -13,13 +15,13 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) { uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c); uint16x8_t fix8_16 = vmovl_u8(fix8); uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16)); - vst1q_f32(fColor, vcvtq_f32_u32(fix8_32)); + fColors = vcvtq_f32_u32(fix8_32); SkASSERT(this->isValid()); } inline SkPMColor SkPMFloat::get() const { SkASSERT(this->isValid()); - float32x4_t add_half = vaddq_f32(vld1q_f32(fColor), vdupq_n_f32(0.5f)); + float32x4_t add_half = vaddq_f32(fColors, vdupq_n_f32(0.5f)); uint32x4_t fix8_32 = vcvtq_u32_f32(add_half); // vcvtq_u32_f32 truncates, so round manually uint16x4_t fix8_16 = vmovn_u32(fix8_32); uint8x8_t fix8 = vmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); @@ -29,7 +31,7 @@ inline SkPMColor SkPMFloat::get() const { } inline SkPMColor SkPMFloat::clamped() const { - float32x4_t add_half = vaddq_f32(vld1q_f32(fColor), vdupq_n_f32(0.5f)); + float32x4_t add_half = vaddq_f32(fColors, vdupq_n_f32(0.5f)); uint32x4_t fix8_32 = vcvtq_u32_f32(add_half); // vcvtq_u32_f32 truncates, so round manually uint16x4_t fix8_16 = vqmovn_u32(fix8_32); uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h index a33fa7a28d..c47f8a3713 100644 --- a/src/opts/SkPMFloat_none.h +++ b/src/opts/SkPMFloat_none.h @@ -1,4 +1,7 @@ -#include "SkColorPriv.h" +inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) { + for (int i = 0; i < 4; i++) { fColor[i] = that.fColor[i]; } + return *this; +} inline SkPMFloat::SkPMFloat(SkPMColor c) { *this = SkPMFloat::FromARGB(SkGetPackedA32(c), |