diff options
Diffstat (limited to 'src/opts/SkNx_sse.h')
-rw-r--r-- | src/opts/SkNx_sse.h | 29 |
1 file changed, 19 insertions, 10 deletions
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 415d6e1205..469aefb145 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -165,6 +165,17 @@ public: __m128 fVec; }; +AI static __m128i mullo32(__m128i a, __m128i b) { +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 + return _mm_mullo_epi32(a, b); +#else + __m128i mul20 = _mm_mul_epu32(a, b), + mul31 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); + return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), + _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); +#endif +} + template <> class SkNx<4, int32_t> { public: @@ -179,15 +190,10 @@ public: AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } - AI SkNx operator * (const SkNx& o) const { - __m128i mul20 = _mm_mul_epu32(fVec, o.fVec), - mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4)); - return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), - _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); - } + AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); } AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } - AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } + AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } @@ -254,10 +260,10 @@ public: AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } - // Not quite sure how to best do operator * in SSE2. We probably don't use it. 
+ AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); } AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } - AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } + AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); } AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } @@ -465,7 +471,6 @@ public: AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) : fVec(_mm_setr_epi8(a,b,c,d, 0,0,0,0, 0,0,0,0, 0,0,0,0)) {} - AI static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); } AI void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); } @@ -612,6 +617,10 @@ template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) { return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); } +template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) { + return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); +} + template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) { return src.fVec; } |