aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/opts
diff options
context:
space:
mode:
author    mtklein <mtklein@chromium.org>    2016-07-15 07:45:53 -0700
committer Commit bot <commit-bot@chromium.org>    2016-07-15 07:45:53 -0700
commit 036e1831e05ae3a6ec9bcd30cb24f6b1a49a3541 (patch)
tree   81efe17768f56658fc48fc7a694e352809da3072 /src/opts
parent 58e389b0518b46bbe58ba01c23443cf23c18435c (diff)
Add a bench to measure the best way to pack from int to uint16_t with SSE.
I measured relative runtimes on my laptop: pack_int_uint16_t_sse41 1x, pack_int_uint16_t_ssse3 1.01x, pack_int_uint16_t_sse2_b 3.01x, pack_int_uint16_t_sse2_a 3.02x. I've run into Clang problems with the actual _mm_packus_epi32 instruction, I think, so I'm going to exercise a little cowardice and leave that option disabled for now. The ssse3 version probably looks a little faster than it will be in practice. We'll usually need to load its mask, which here is hoisted out of the bench loop. The two sse2 variants are close enough in speed that I'm tie breaking them on other concerns: the <<16, >>16 version doesn't need any scratch registers or to load any constants, so it wins. BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2150343002 CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-Fast-Trybot Review-Url: https://codereview.chromium.org/2150343002
Diffstat (limited to 'src/opts')
-rw-r--r--src/opts/SkNx_sse.h29
1 files changed, 13 insertions, 16 deletions
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index ab88382bd2..553ff1de3e 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -323,20 +323,25 @@ template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
return _mm_cvttps_epi32(src.fVec);
}
-template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
- auto _32 = _mm_cvttps_epi32(src.fVec);
- // Ideally we'd use _mm_packus_epi32 here. But that's SSE4.1+.
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
+#if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+ // TODO: This seems to be causing code generation problems. Investigate?
+ return _mm_packus_epi32(src.fVec);
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
// With SSSE3, we can just shuffle the low 2 bytes from each lane right into place.
const int _ = ~0;
- return _mm_shuffle_epi8(_32, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
+ return _mm_shuffle_epi8(src.fVec, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
#else
- // With SSE2, we have to emulate _mm_packus_epi32 with _mm_packs_epi32:
- _32 = _mm_sub_epi32(_32, _mm_set1_epi32((int)0x00008000));
- return _mm_add_epi16(_mm_packs_epi32(_32, _32), _mm_set1_epi16((short)0x8000));
+ // With SSE2, we have to sign extend our input, making _mm_packs_epi32 do the pack we want.
+ __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);
+ return _mm_packs_epi32(x,x);
#endif
}
+template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
+ return SkNx_cast<uint16_t>(SkNx_cast<int>(src));
+}
+
template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
auto _32 = _mm_cvttps_epi32(src.fVec);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
@@ -390,14 +395,6 @@ template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {
return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
}
-template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
- // TODO: merge with other work exploring best int -> uint16_t conversion.
-
- // Sign extend to trick _mm_packs_epi32() into doing the pack we want.
- __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);
- return _mm_packs_epi32(x,x);
-}
-
template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
}