diff options
author | mtklein <mtklein@chromium.org> | 2016-02-06 19:38:39 -0800 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2016-02-06 19:38:39 -0800 |
commit | 2d340f2f37a28e82323428725019f12a8538f48e (patch) | |
tree | c10a798ec8fbea528fcd39c0c5f9521cbb02b192 | |
parent | e721a8e883231af1878772ecd23be325113fcc48 (diff) |
Could not resist: fast SSE float <--> u16 conversions
- Generalizes the round-trip bench to cover float <--> {u8, u16}.
- TODO: implement the NEON version at some point.
BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1676853002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot
Review URL: https://codereview.chromium.org/1676853002
-rw-r--r-- | bench/Sk4fBench.cpp | 32 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 10 |
2 files changed, 31 insertions, 11 deletions
diff --git a/bench/Sk4fBench.cpp b/bench/Sk4fBench.cpp index 712a65776d..823e14c0af 100644 --- a/bench/Sk4fBench.cpp +++ b/bench/Sk4fBench.cpp @@ -10,37 +10,47 @@ #include "SkNx.h" // Used to prevent the compiler from optimizing away the whole loop. -volatile uint32_t blackhole = 0; +volatile uint64_t blackhole = 0; // Not a great random number generator, but it's very fast. // The code we're measuring is quite fast, so low overhead is essential. -static uint32_t lcg_rand(uint32_t* seed) { +static uint64_t lcg_rand(uint64_t* seed) { *seed *= 1664525; *seed += 1013904223; return *seed; } -struct Sk4fBytesRoundtripBench : public Benchmark { - Sk4fBytesRoundtripBench() {} +template <typename T> +struct Sk4fRoundtripBench : public Benchmark { + Sk4fRoundtripBench() {} + + const char* onGetName() override { + switch (sizeof(T)) { + case 1: return "Sk4f_roundtrip_u8"; + case 2: return "Sk4f_roundtrip_u16"; + } + SkASSERT(false); + return ""; + } - const char* onGetName() override { return "Sk4f_roundtrip"; } bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } void onDraw(int loops, SkCanvas* canvas) override { // Unlike blackhole, junk can and probably will be a register. 
- uint32_t junk = 0; - uint32_t seed = 0; + uint64_t junk = 0; + uint64_t seed = 0; for (int i = 0; i < loops; i++) { - uint32_t color = lcg_rand(&seed), + uint64_t src = lcg_rand(&seed), back; - auto f = SkNx_cast<float>(Sk4b::Load(&color)); - SkNx_cast<uint8_t>(f).store(&back); + auto f = SkNx_cast<float>(SkNx<4,T>::Load(&src)); + SkNx_cast<T>(f).store(&back); junk ^= back; } blackhole ^= junk; } }; -DEF_BENCH(return new Sk4fBytesRoundtripBench;) +DEF_BENCH(return new Sk4fRoundtripBench<uint8_t>;) +DEF_BENCH(return new Sk4fRoundtripBench<uint16_t>;) struct Sk4fGradientBench : public Benchmark { const char* onGetName() override { return "Sk4f_gradient"; } diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 71ecbfd89d..ee6fdc5654 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -348,6 +348,11 @@ template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { return _mm_cvttps_epi32(src.fVec); } +template<> inline Sk4h SkNx_cast<uint16_t, float, 4>(const Sk4f& src) { + auto _32 = _mm_cvttps_epi32(src.fVec); + return _mm_packus_epi16(_32, _32); +} + template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { auto _32 = _mm_cvttps_epi32(src.fVec); #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 @@ -370,6 +375,11 @@ template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { return _mm_cvtepi32_ps(_32); } +template<> inline Sk4f SkNx_cast<float, uint16_t, 4>(const Sk4h& src) { + auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); + return _mm_cvtepi32_ps(_32); +} + static inline void Sk4f_ToBytes(uint8_t bytes[16], const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { _mm_storeu_si128((__m128i*)bytes, |