about summary refs log tree commit diff homepage
path: root/src/opts/SkNx_sse.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/opts/SkNx_sse.h')
-rw-r--r-- src/opts/SkNx_sse.h 29
1 files changed, 19 insertions, 10 deletions
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 415d6e1205..469aefb145 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -165,6 +165,17 @@ public:
__m128 fVec;
};
+AI static __m128i mullo32(__m128i a, __m128i b) { // Multiplies the four 32-bit lanes of a and b, keeping the low 32 bits of each product.
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+ return _mm_mullo_epi32(a, b); // Builds targeting SSE4.1 or later use the single intrinsic.
+#else
+ __m128i mul20 = _mm_mul_epu32(a, b), // Fallback: 64-bit products of 32-bit lanes 0 and 2.
+ mul31 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); // Shift right 4 bytes so lanes 1 and 3 sit in positions 0 and 2, then multiply those.
+ return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)), // Move the low halves of products 0 and 2 into the bottom two lanes...
+ _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0))); // ...same for products 1 and 3; the unpack interleaves them back into lane order 0,1,2,3.
+#endif
+}
+
template <>
class SkNx<4, int32_t> {
public:
@@ -179,15 +190,10 @@ public:
AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } // Per-lane 32-bit add.
AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } // Per-lane 32-bit subtract.
- AI SkNx operator * (const SkNx& o) const {
- __m128i mul20 = _mm_mul_epu32(fVec, o.fVec),
- mul31 = _mm_mul_epu32(_mm_srli_si128(fVec, 4), _mm_srli_si128(o.fVec, 4));
- return _mm_unpacklo_epi32(_mm_shuffle_epi32(mul20, _MM_SHUFFLE(0,0,2,0)),
- _mm_shuffle_epi32(mul31, _MM_SHUFFLE(0,0,2,0)));
- }
+ AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); } // Per-lane multiply via the file-level mullo32() helper, replacing the inline sequence removed above.
AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } // Bitwise AND of the whole 128-bit vector.
- AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
+ AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } // Bitwise XOR of the whole 128-bit vector.
AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } // Shifts every 32-bit lane left by the same bit count.
@@ -254,10 +260,10 @@ public:
AI SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); } // Per-lane 32-bit add.
AI SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); } // Per-lane 32-bit subtract.
- // Not quite sure how to best do operator * in SSE2. We probably don't use it.
+ AI SkNx operator * (const SkNx& o) const { return mullo32(fVec, o.fVec); } // Per-lane multiply via mullo32(); the low 32 bits of a product do not depend on signedness. NOTE(review): enclosing class is outside this hunk, presumably SkNx<4, uint32_t> — confirm.
AI SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); } // Bitwise AND of the whole 128-bit vector.
- AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
+ AI SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
AI SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); } // Bitwise XOR of the whole 128-bit vector.
AI SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); } // Shifts every 32-bit lane left by the same bit count.
@@ -465,7 +471,6 @@ public:
AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) // Builds the vector from four individual bytes.
: fVec(_mm_setr_epi8(a,b,c,d, 0,0,0,0, 0,0,0,0, 0,0,0,0)) {} // a..d occupy the four lowest byte lanes; the other twelve are zeroed.
-
AI static SkNx Load(const void* ptr) { return _mm_cvtsi32_si128(*(const int*)ptr); } // Reads one int (4 bytes) from ptr into the low 32 bits of the vector.
AI void store(void* ptr) const { *(int*)ptr = _mm_cvtsi128_si32(fVec); } // Writes the low 32 bits of fVec to ptr as one int.
@@ -612,6 +617,10 @@ template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
}
+template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) { // Narrows the four 32-bit lanes of src to four bytes.
+ return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec); // Two saturating 16->8 packs leave the narrowed values in the lowest four bytes; same sequence as the int32_t cast just above. NOTE(review): packus treats its input as signed 16-bit values, so this presumably expects lanes already in 0..255 — confirm.
+}
+
template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) { // Casts a 4-lane unsigned vector to a 4-lane signed one.
return src.fVec; // Passes the underlying vector through as-is; no per-lane conversion is done here.
}