Add a bench to measure the best way to pack from int to uint16_t with SSE.

I measured relative runtimes on my laptop: pack_int_uint16_t_ss… 1036 …e41 1x …se3 1.01x …e2_b 3.01x …e2_a 3.02x I've run into Clang problems with the actual _mm_packus_epi32 instruction, I think, so I'm going to exercise a little cowardice and leave that option disabled for now. The ssse3 version probably looks a little faster than it will be in practice. We'll usually need to load its mask, which here is hoisted out of the bench loop. The two sse2 variants are close enough in speed that I'm tie breaking them on other concerns: the <<16, >>16 version doesn't need any scratch registers or to load any constants, so it wins. BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2150343002 CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-Fast-Trybot Review-Url: https://codereview.chromium.org/2150343002
author: mtklein <mtklein@chromium.org> 2016-07-15 07:45:53 -0700
committer: Commit bot <commit-bot@chromium.org> 2016-07-15 07:45:53 -0700
commit: 036e1831e05ae3a6ec9bcd30cb24f6b1a49a3541 (patch)
tree: 81efe17768f56658fc48fc7a694e352809da3072 /bench
parent: 58e389b0518b46bbe58ba01c23443cf23c18435c (diff)
1 files changed, 93 insertions, 0 deletions
diff --git a/bench/pack_int_uint16_t_Bench.cpp b/bench/pack_int_uint16_t_Bench.cpp
new file mode 100644
index 0000000000..5e1527d7ca
--- /dev/null
+++ b/bench/pack_int_uint16_t_Bench.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Benchmark.h"
+#include "SkTypes.h"
+
+/**
+ * There's a good variety of ways to pack from int down to uint16_t with SSE,
+ * depending on the specific instructions available.
+ *
+ * SSE2 offers an int -> int16_t pack instruction.  We can use this in two ways:
+ *    - subtract off 32768, int -> int16_t, add 32768 back                                  (sse2_a)
+ *    - first artificially sign extend the (positive) value in our int, then int -> int16_t (sse2_b)
+ * SSSE3 adds a byte shuffle, so we just put the bytes where we want them.                  (ssse3)
+ * SSE41 added an int -> uint16_t pack instruction.                                         (sse41)
+ *
+ * Findings so far:
+ *   - sse41 < ssse3 <<< sse2_b < sse2_a;
+ *   - the ssse3 version is only slightly slower than the sse41 version, maybe not at all
+ *   - the sse2_a is only slightly slower than the sse2_b version
+ *   - the ssse3 and sse41 versions are about 3x faster than either sse2 version
+ *   - the sse41 version seems to cause some code generation trouble.
+ */
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
+
+#include <immintrin.h>
+
+template <__m128i (kernel)(__m128i)>
+class pack_int_uint16_t_Bench : public Benchmark {
+public:
+    pack_int_uint16_t_Bench(const char* impl) {
+        fName.append("pack_int_uint16_t_");
+        fName.append(impl);
+    }
+
+    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
+    const char* onGetName() override { return fName.c_str(); }
+
+    void onDraw(int loops, SkCanvas*) override {
+        __m128i x = _mm_set1_epi32(0x42424242);
+        while (loops --> 0) {
+            x = kernel(x);
+        }
+
+        volatile int blackhole = 0;
+        blackhole ^= _mm_cvtsi128_si32(x);
+    }
+
+    SkString fName;
+};
+
+namespace {
+    __m128i sse2_a(__m128i x) {
+        x = _mm_sub_epi32(x, _mm_set1_epi32(0x8000));
+        return _mm_add_epi16(_mm_packs_epi32(x,x), _mm_set1_epi16((short)0x8000));
+    }
+}
+DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); )
+
+namespace {
+    __m128i sse2_b(__m128i x) {
+        x = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16);
+        return _mm_packs_epi32(x,x);
+    }
+}
+DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); )
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
+namespace {
+    __m128i ssse3(__m128i x) {
+        // TODO: Can we force the bench to load the mask inside the loop?  Would be more realistic.
+        const int _ = ~0;
+        return _mm_shuffle_epi8(x, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_));
+    }
+}
+DEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); )
+#endif
+
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+namespace {
+    __m128i sse41(__m128i x) {
+        return _mm_packus_epi32(x,x);
+    }
+}
+DEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); )
+#endif
+
+#endif  // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
author	mtklein <mtklein@chromium.org>	2016-07-15 07:45:53 -0700
committer	Commit bot <commit-bot@chromium.org>	2016-07-15 07:45:53 -0700
commit	036e1831e05ae3a6ec9bcd30cb24f6b1a49a3541 (patch)
tree	81efe17768f56658fc48fc7a694e352809da3072 /bench
parent	58e389b0518b46bbe58ba01c23443cf23c18435c (diff)