diff options
-rw-r--r-- | bench/Sk4fBench.cpp | 32 | ||||
-rw-r--r-- | src/opts/SkNx_sse.h | 10 |
2 files changed, 31 insertions, 11 deletions
diff --git a/bench/Sk4fBench.cpp b/bench/Sk4fBench.cpp index 712a65776d..823e14c0af 100644 --- a/bench/Sk4fBench.cpp +++ b/bench/Sk4fBench.cpp @@ -10,37 +10,47 @@ #include "SkNx.h" // Used to prevent the compiler from optimizing away the whole loop. -volatile uint32_t blackhole = 0; +volatile uint64_t blackhole = 0; // Not a great random number generator, but it's very fast. // The code we're measuring is quite fast, so low overhead is essential. -static uint32_t lcg_rand(uint32_t* seed) { +static uint64_t lcg_rand(uint64_t* seed) { *seed *= 1664525; *seed += 1013904223; return *seed; } -struct Sk4fBytesRoundtripBench : public Benchmark { - Sk4fBytesRoundtripBench() {} +template <typename T> +struct Sk4fRoundtripBench : public Benchmark { + Sk4fRoundtripBench() {} + + const char* onGetName() override { + switch (sizeof(T)) { + case 1: return "Sk4f_roundtrip_u8"; + case 2: return "Sk4f_roundtrip_u16"; + } + SkASSERT(false); + return ""; + } - const char* onGetName() override { return "Sk4f_roundtrip"; } bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } void onDraw(int loops, SkCanvas* canvas) override { // Unlike blackhole, junk can and probably will be a register. - uint32_t junk = 0; - uint32_t seed = 0; + uint64_t junk = 0; + uint64_t seed = 0; for (int i = 0; i < loops; i++) { - uint32_t color = lcg_rand(&seed), + uint64_t src = lcg_rand(&seed), back; - auto f = SkNx_cast<float>(Sk4b::Load(&color)); - SkNx_cast<uint8_t>(f).store(&back); + auto f = SkNx_cast<float>(SkNx<4,T>::Load(&src)); + SkNx_cast<T>(f).store(&back); junk ^= back; } blackhole ^= junk; } }; -DEF_BENCH(return new Sk4fBytesRoundtripBench;) +DEF_BENCH(return new Sk4fRoundtripBench<uint8_t>;) +DEF_BENCH(return new Sk4fRoundtripBench<uint16_t>;) struct Sk4fGradientBench : public Benchmark { const char* onGetName() override { return "Sk4f_gradient"; } diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h index 71ecbfd89d..ee6fdc5654 100644 --- a/src/opts/SkNx_sse.h +++ b/src/opts/SkNx_sse.h @@ -348,6 +348,11 @@ template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) { return _mm_cvttps_epi32(src.fVec); } +template<> inline Sk4h SkNx_cast<uint16_t, float, 4>(const Sk4f& src) { + auto _32 = _mm_cvttps_epi32(src.fVec); + return _mm_packus_epi16(_32, _32); +} + template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) { auto _32 = _mm_cvttps_epi32(src.fVec); #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 @@ -370,6 +375,11 @@ template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) { return _mm_cvtepi32_ps(_32); } +template<> inline Sk4f SkNx_cast<float, uint16_t, 4>(const Sk4h& src) { + auto _32 = _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128()); + return _mm_cvtepi32_ps(_32); +} + static inline void Sk4f_ToBytes(uint8_t bytes[16], const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) { _mm_storeu_si128((__m128i*)bytes, |