diff options
-rw-r--r-- | src/core/SkNx.h | 14 | ||||
-rw-r--r-- | src/core/SkPMFloat.h | 23 | ||||
-rw-r--r-- | src/opts/SkColorCubeFilter_opts.h | 29 | ||||
-rw-r--r-- | src/opts/SkNx_neon.h | 13 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 21 | ||||
-rw-r--r-- | src/opts/SkPMFloat_neon.h | 50 | ||||
-rw-r--r-- | src/opts/SkPMFloat_none.h | 48 | ||||
-rw-r--r-- | src/opts/SkPMFloat_sse.h | 65 |
8 files changed, 69 insertions, 194 deletions
diff --git a/src/core/SkNx.h b/src/core/SkNx.h index 84f9b69353..895b972dd1 100644 --- a/src/core/SkNx.h +++ b/src/core/SkNx.h @@ -92,6 +92,12 @@ public: static SkNf Load(const T vals[N]) { return SkNf(SkNf<N/2,T>::Load(vals), SkNf<N/2,T>::Load(vals+N/2)); } + // FromBytes() and toBytes() specializations may assume their argument is N-byte aligned. + // E.g. Sk4f::FromBytes() may assume it's reading from a 4-byte-aligned pointer. + // Converts [0,255] bytes to [0.0, 255.0] floats. + static SkNf FromBytes(const uint8_t bytes[N]) { + return SkNf(SkNf<N/2,T>::FromBytes(bytes), SkNf<N/2,T>::FromBytes(bytes+N/2)); + } SkNf(T a, T b) : fLo(a), fHi(b) { REQUIRE(N==2); } SkNf(T a, T b, T c, T d) : fLo(a,b), fHi(c,d) { REQUIRE(N==4); } @@ -101,6 +107,12 @@ public: fLo.store(vals); fHi.store(vals+N/2); } + // Please see note on FromBytes(). + // Truncates [0.0,256.0) floats to [0,255] bytes. Other inputs are unspecified. + void toBytes(uint8_t bytes[N]) const { + fLo.toBytes(bytes); + fHi.toBytes(bytes+N/2); + } SkNi<N,I> castTrunc() const { return SkNi<N,I>(fLo.castTrunc(), fHi.castTrunc()); } @@ -201,8 +213,10 @@ public: SkNf() {} explicit SkNf(T val) : fVal(val) {} static SkNf Load(const T vals[1]) { return SkNf(vals[0]); } + static SkNf FromBytes(const uint8_t bytes[1]) { return SkNf((T)bytes[0]); } void store(T vals[1]) const { vals[0] = fVal; } + void toBytes(uint8_t bytes[1]) const { bytes[0] = (uint8_t)(fVal); } SkNi<1,I> castTrunc() const { return SkNi<1,I>(fVal); } diff --git a/src/core/SkPMFloat.h b/src/core/SkPMFloat.h index 4a5621f5b3..4a2235dd75 100644 --- a/src/core/SkPMFloat.h +++ b/src/core/SkPMFloat.h @@ -28,11 +28,11 @@ public: static SkPMFloat FromARGB(float a, float r, float g, float b) { return SkPMFloat(a,r,g,b); } static SkPMFloat FromOpaqueColor(SkColor c); // Requires c's alpha == 0xFF. - Sk4f alphas() const; // argb -> aaaa, generally faster than the equivalent Sk4f(this->a()). + Sk4f alphas() const { return Sk4f(this->a()); } // Uninitialized. SkPMFloat() {} - explicit SkPMFloat(SkPMColor); + explicit SkPMFloat(SkPMColor c) { *this = Sk4f::FromBytes((uint8_t*)&c) * Sk4f(1.0f/255); } SkPMFloat(float a, float r, float g, float b) #ifdef SK_PMCOLOR_IS_RGBA : INHERITED(r,g,b,a) {} @@ -47,7 +47,11 @@ public: float g() const { return this->kth<SK_G32_SHIFT / 8>(); } float b() const { return this->kth<SK_B32_SHIFT / 8>(); } - SkPMColor round() const; // Rounds from [0.0f, 1.0f] to [0, 255], clamping if out of range. + SkPMColor round() const { + SkPMColor c; + (*this * Sk4f(255) + Sk4f(0.5f)).toBytes((uint8_t*)&c); + return c; + } bool isValid() const { return this->a() >= 0 && this->a() <= 1 @@ -62,17 +66,4 @@ private: } // namespace -#ifdef SKNX_NO_SIMD - // Platform implementations of SkPMFloat assume Sk4f uses SSE or NEON. _none is generic. - #include "../opts/SkPMFloat_none.h" -#else - #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 - #include "../opts/SkPMFloat_sse.h" - #elif defined(SK_ARM_HAS_NEON) - #include "../opts/SkPMFloat_neon.h" - #else - #include "../opts/SkPMFloat_none.h" - #endif -#endif - #endif//SkPM_DEFINED diff --git a/src/opts/SkColorCubeFilter_opts.h b/src/opts/SkColorCubeFilter_opts.h index 41d2525b5a..afd6149cff 100644 --- a/src/opts/SkColorCubeFilter_opts.h +++ b/src/opts/SkColorCubeFilter_opts.h @@ -6,7 +6,7 @@ #define SkColorCubeFilter_opts_DEFINED #include "SkColor.h" -#include "SkPMFloat.h" +#include "SkNx.h" #include "SkUnPreMultiply.h" namespace SK_OPTS_NS { @@ -18,7 +18,6 @@ void color_cube_filter_span(const SkPMColor src[], const SkScalar* colorToFactors[2], int dim, const SkColor* colorCube) { - uint8_t* ptr_dst = reinterpret_cast<uint8_t*>(dst); uint8_t r, g, b, a; for (int i = 0; i < count; ++i) { @@ -51,8 +50,7 @@ void color_cube_filter_span(const SkPMColor src[], const int i10 = (colorToIndex[1][g] + colorToIndex[0][b] * dim) * dim; const int i11 = (colorToIndex[1][g] + colorToIndex[1][b] * dim) * dim; - SkPMFloat color(0,0,0,0); - + Sk4f color(0,0,0,0); for (int x = 0; x < 2; ++x) { const int ix = colorToIndex[x][r]; @@ -61,22 +59,23 @@ void color_cube_filter_span(const SkPMColor src[], const SkColor lutColor10 = colorCube[ix + i10]; const SkColor lutColor11 = colorCube[ix + i11]; - Sk4f sum = SkPMFloat::FromOpaqueColor(lutColor00) * g0b0; - sum = sum + SkPMFloat::FromOpaqueColor(lutColor01) * g0b1; - sum = sum + SkPMFloat::FromOpaqueColor(lutColor10) * g1b0; - sum = sum + SkPMFloat::FromOpaqueColor(lutColor11) * g1b1; - + Sk4f sum = Sk4f::FromBytes((const uint8_t*)&lutColor00) * g0b0; + sum = sum + Sk4f::FromBytes((const uint8_t*)&lutColor01) * g0b1; + sum = sum + Sk4f::FromBytes((const uint8_t*)&lutColor10) * g1b0; + sum = sum + Sk4f::FromBytes((const uint8_t*)&lutColor11) * g1b1; color = color + sum * Sk4f((float)colorToFactors[x][r]); } - if (a != 255) { - color = color * Sk4f(a * 1.0f/255); + color = color * Sk4f(a * (1.0f/255)); } - dst[i] = color.round(); - - ptr_dst[SK_A32_SHIFT / 8] = a; - ptr_dst += 4; + // color is BGRA (SkColor order), dst is SkPMColor order, so may need to swap R+B. + #if defined(SK_PMCOLOR_IS_RGBA) + color = Sk4f(color.kth<2>(), color.kth<1>(), color.kth<0>(), color.kth<3>()); + #endif + uint8_t* dstBytes = (uint8_t*)(dst+i); + color.toBytes(dstBytes); + dstBytes[SK_A32_SHIFT/8] = a; } } diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h index 660b92c6b0..cf149862c5 100644 --- a/src/opts/SkNx_neon.h +++ b/src/opts/SkNx_neon.h @@ -223,9 +223,22 @@ public: SkNf() {} explicit SkNf(float val) : fVec(vdupq_n_f32(val)) {} static SkNf Load(const float vals[4]) { return vld1q_f32(vals); } + static SkNf FromBytes(const uint8_t vals[4]) { + uint8x8_t fix8 = (uint8x8_t)vld1_dup_u32((const uint32_t*)vals); + uint16x8_t fix8_16 = vmovl_u8(fix8); + uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16)); + return SkNf(vcvtq_f32_u32(fix8_32)); + } + SkNf(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; } void store(float vals[4]) const { vst1q_f32(vals, fVec); } + void toBytes(uint8_t bytes[4]) const { + uint32x4_t fix8_32 = vcvtq_u32_f32(fVec); + uint16x4_t fix8_16 = vqmovn_u32(fix8_32); + uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); + vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0); + } SkNi<4, int> castTrunc() const { return vcvtq_s32_f32(fVec); } diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index e165f58737..093cd4c14c 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -155,9 +155,30 @@ public: SkNf() {} explicit SkNf(float val) : fVec( _mm_set1_ps(val) ) {} static SkNf Load(const float vals[4]) { return _mm_loadu_ps(vals); } + + static SkNf FromBytes(const uint8_t bytes[4]) { + __m128i fix8 = _mm_cvtsi32_si128(*(const int*)bytes); + #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 + const char _ = ~0; // Zero these bytes. + __m128i fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_)); + #else + __m128i fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()), + fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128()); + #endif + return SkNf(_mm_cvtepi32_ps(fix8_32)); + // TODO: use _mm_cvtepu8_epi32 w/SSE4.1? + } + SkNf(float a, float b, float c, float d) : fVec(_mm_setr_ps(a,b,c,d)) {} void store(float vals[4]) const { _mm_storeu_ps(vals, fVec); } + void toBytes(uint8_t bytes[4]) const { + __m128i fix8_32 = _mm_cvttps_epi32(fVec), + fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), + fix8 = _mm_packus_epi16(fix8_16, fix8_16); + *(int*)bytes = _mm_cvtsi128_si32(fix8); + // TODO: use _mm_shuffle_epi8 w/SSSE3? + } SkNi<4, int> castTrunc() const { return _mm_cvttps_epi32(fVec); } diff --git a/src/opts/SkPMFloat_neon.h b/src/opts/SkPMFloat_neon.h deleted file mode 100644 index 4e099f91ec..0000000000 --- a/src/opts/SkPMFloat_neon.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 2015 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -namespace { // See SkPMFloat.h - -static_assert(SK_A32_SHIFT == 24, "This file assumes little-endian."); - -inline SkPMFloat::SkPMFloat(SkPMColor c) { - SkPMColorAssert(c); - uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c); - uint16x8_t fix8_16 = vmovl_u8(fix8); - uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16)); - fVec = vmulq_f32(vcvtq_f32_u32(fix8_32), vdupq_n_f32(1.0f/255)); - SkASSERT(this->isValid()); -} - -inline SkPMColor SkPMFloat::round() const { - // vcvt_u32_f32 truncates, so we round manually by adding a half before converting. - float32x4_t rounded = vmlaq_f32(vdupq_n_f32(0.5f), fVec, vdupq_n_f32(255)); - uint32x4_t fix8_32 = vcvtq_u32_f32(rounded); - uint16x4_t fix8_16 = vqmovn_u32(fix8_32); - uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0))); - SkPMColor c = vget_lane_u32((uint32x2_t)fix8, 0); - SkPMColorAssert(c); - return c; -} - -inline Sk4f SkPMFloat::alphas() const { - return vdupq_lane_f32(vget_high_f32(fVec), 1); // Duplicate high lane of high half i.e. lane 3. -} - -inline SkPMFloat SkPMFloat::FromOpaqueColor(SkColor c) { - SkASSERT(SkColorGetA(c) == 0xFF); - uint8x8_t fix8 = (uint8x8_t)vdup_n_u32(c); -#if defined(SK_PMCOLOR_IS_RGBA) - fix8 = vtbl1_u8(fix8, vcreate_u8(0x0300010203000102ULL)); // 03 00 01 02, 2x, i.e. swap R&B. -#endif - uint16x8_t fix8_16 = vmovl_u8(fix8); - uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16)); - - SkPMFloat pmf = Sk4f(vmulq_f32(vcvtq_f32_u32(fix8_32), vdupq_n_f32(1.0f/255))); - SkASSERT(pmf.isValid()); - return pmf; -} - -} // namespace diff --git a/src/opts/SkPMFloat_none.h b/src/opts/SkPMFloat_none.h deleted file mode 100644 index 17c76d90f4..0000000000 --- a/src/opts/SkPMFloat_none.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2015 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -namespace { // See SkPMFloat.h - -inline SkPMFloat::SkPMFloat(SkPMColor c) { - float inv255 = 1.0f/255; - *this = SkPMFloat::FromARGB(SkGetPackedA32(c) * inv255, - SkGetPackedR32(c) * inv255, - SkGetPackedG32(c) * inv255, - SkGetPackedB32(c) * inv255); - SkASSERT(this->isValid()); -} - -inline SkPMColor SkPMFloat::round() const { - float a = this->a(), - r = this->r(), - g = this->g(), - b = this->b(); - a = a < 0 ? 0 : (a > 1 ? 1 : a); - r = r < 0 ? 0 : (r > 1 ? 1 : r); - g = g < 0 ? 0 : (g > 1 ? 1 : g); - b = b < 0 ? 0 : (b > 1 ? 1 : b); - SkPMColor c = SkPackARGB32(255*a+0.5f, 255*r+0.5f, 255*g+0.5f, 255*b+0.5f); - SkPMColorAssert(c); - return c; -} - -inline Sk4f SkPMFloat::alphas() const { - return Sk4f(this->a()); -} - -inline SkPMFloat SkPMFloat::FromOpaqueColor(SkColor c) { - SkASSERT(SkColorGetA(c) == 0xFF); - float inv255 = 1.0f / 255; - SkPMFloat pmf = SkPMFloat::FromARGB(1.0f, - SkColorGetR(c) * inv255, - SkColorGetG(c) * inv255, - SkColorGetB(c) * inv255); - SkASSERT(pmf.isValid()); - return pmf; -} - -} // namespace diff --git a/src/opts/SkPMFloat_sse.h b/src/opts/SkPMFloat_sse.h deleted file mode 100644 index 6a4d5b6b5c..0000000000 --- a/src/opts/SkPMFloat_sse.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2015 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -namespace { // See SkPMFloat.h - -inline SkPMFloat::SkPMFloat(SkPMColor c) { - SkPMColorAssert(c); -#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 - const char _ = ~0; // Zero these bytes. - __m128i fix8 = _mm_cvtsi32_si128((int)c), - fix8_32 = _mm_shuffle_epi8(fix8, _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_)); -#else - __m128i fix8 = _mm_cvtsi32_si128((int)c), - fix8_16 = _mm_unpacklo_epi8 (fix8, _mm_setzero_si128()), - fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128()); -#endif - fVec = _mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f / 255)); - SkASSERT(this->isValid()); -} - -inline SkPMColor SkPMFloat::round() const { - // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up). - __m128 scaled = _mm_mul_ps(_mm_set1_ps(255), fVec); - __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), scaled)), - fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), - fix8 = _mm_packus_epi16(fix8_16, fix8_16); - SkPMColor c = _mm_cvtsi128_si32(fix8); - SkPMColorAssert(c); - return c; -} - -inline Sk4f SkPMFloat::alphas() const { - static_assert(SK_A32_SHIFT == 24, ""); - return _mm_shuffle_ps(fVec, fVec, 0xff); // Read as 11 11 11 11, copying lane 3 to all lanes. -} - -inline SkPMFloat SkPMFloat::FromOpaqueColor(SkColor c) { - SkASSERT(SkColorGetA(c) == 0xFF); - __m128i fix8 = _mm_cvtsi32_si128((int)c); -#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 - const char _ = ~0; // Zero these bytes. - __m128i fix8_32 = _mm_shuffle_epi8(fix8, - #if defined(SK_PMCOLOR_IS_BGRA) - _mm_setr_epi8(0,_,_,_, 1,_,_,_, 2,_,_,_, 3,_,_,_) - #else - _mm_setr_epi8(2,_,_,_, 1,_,_,_, 0,_,_,_, 3,_,_,_) - #endif - ); -#else - __m128i fix8_16 = _mm_unpacklo_epi8 (fix8 , _mm_setzero_si128()), - fix8_32 = _mm_unpacklo_epi16(fix8_16, _mm_setzero_si128()); - #if defined(SK_PMCOLOR_IS_RGBA) - fix8_32 = _mm_shuffle_epi32(fix8_32, 0xC6); // C6 == 11 00 01 10, i.e swap lanes 0 and 2. - #endif -#endif - SkPMFloat pmf = Sk4f(_mm_mul_ps(_mm_cvtepi32_ps(fix8_32), _mm_set1_ps(1.0f/255))); - SkASSERT(pmf.isValid()); - return pmf; -} - -} // namespace |