aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--bench/Sk4fBench.cpp32
-rw-r--r--src/opts/SkNx_sse.h10
2 files changed, 31 insertions, 11 deletions
diff --git a/bench/Sk4fBench.cpp b/bench/Sk4fBench.cpp
index 712a65776d..823e14c0af 100644
--- a/bench/Sk4fBench.cpp
+++ b/bench/Sk4fBench.cpp
@@ -10,37 +10,47 @@
#include "SkNx.h"
// Used to prevent the compiler from optimizing away the whole loop.
-volatile uint32_t blackhole = 0;
+volatile uint64_t blackhole = 0;
// Not a great random number generator, but it's very fast.
// The code we're measuring is quite fast, so low overhead is essential.
-static uint32_t lcg_rand(uint32_t* seed) {
+static uint64_t lcg_rand(uint64_t* seed) {
*seed *= 1664525;
*seed += 1013904223;
return *seed;
}
-struct Sk4fBytesRoundtripBench : public Benchmark {
- Sk4fBytesRoundtripBench() {}
+// Benchmarks a round trip through Sk4f: four lanes of T are loaded, cast to
+// float, cast back to T and stored. Instantiated below for uint8_t and uint16_t.
+template <typename T>
+struct Sk4fRoundtripBench : public Benchmark {
+ Sk4fRoundtripBench() {}
+
+ // The reported name is chosen by lane size in bytes; any other size asserts.
+ const char* onGetName() override {
+ switch (sizeof(T)) {
+ case 1: return "Sk4f_roundtrip_u8";
+ case 2: return "Sk4f_roundtrip_u16";
+ }
+ SkASSERT(false);
+ return "";
+ }
- const char* onGetName() override { return "Sk4f_roundtrip"; }
bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
void onDraw(int loops, SkCanvas* canvas) override {
// Unlike blackhole, junk can and probably will be a register.
- uint32_t junk = 0;
- uint32_t seed = 0;
+ uint64_t junk = 0;
+ uint64_t seed = 0;
for (int i = 0; i < loops; i++) {
- uint32_t color = lcg_rand(&seed),
- back;
+ // back starts at zero: the store below is not guaranteed to cover all
+ // eight bytes (for uint8_t lanes it presumably writes only four), and
+ // the XOR into junk must not read indeterminate bytes.
+ uint64_t src = lcg_rand(&seed),
+ back = 0;
- auto f = SkNx_cast<float>(Sk4b::Load(&color));
- SkNx_cast<uint8_t>(f).store(&back);
+ auto f = SkNx_cast<float>(SkNx<4,T>::Load(&src));
+ SkNx_cast<T>(f).store(&back);
junk ^= back;
}
blackhole ^= junk;
}
};
-DEF_BENCH(return new Sk4fBytesRoundtripBench;)
+DEF_BENCH(return new Sk4fRoundtripBench<uint8_t>;)
+DEF_BENCH(return new Sk4fRoundtripBench<uint16_t>;)
struct Sk4fGradientBench : public Benchmark {
const char* onGetName() override { return "Sk4f_gradient"; }
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 71ecbfd89d..ee6fdc5654 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -348,6 +348,11 @@ template<> inline Sk4i SkNx_cast<int, float, 4>(const Sk4f& src) {
return _mm_cvttps_epi32(src.fVec);
}
+// Converts four floats to uint16_t lanes: truncate toward zero to int32, then
+// narrow each 32-bit lane to 16 bits. Values in [0, 65535] convert exactly.
+template<> inline Sk4h SkNx_cast<uint16_t, float, 4>(const Sk4f& src) {
+ auto _32 = _mm_cvttps_epi32(src.fVec);
+ // _mm_packus_epi16 narrows 16-bit lanes to bytes, which is the wrong width
+ // here. The 32->16 unsigned pack (_mm_packus_epi32) requires SSE4.1, so it
+ // is emulated with SSE2: bias the lanes into signed 16-bit range, pack with
+ // signed saturation, then undo the bias with a wrapping 16-bit add.
+ _32 = _mm_sub_epi32(_32, _mm_set1_epi32(0x8000));
+ return _mm_add_epi16(_mm_packs_epi32(_32, _32),
+ _mm_set1_epi16(static_cast<short>(-0x8000)));
+}
+
template<> inline Sk4b SkNx_cast<uint8_t, float, 4>(const Sk4f& src) {
auto _32 = _mm_cvttps_epi32(src.fVec);
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
@@ -370,6 +375,11 @@ template<> inline Sk4f SkNx_cast<float, uint8_t, 4>(const Sk4b& src) {
return _mm_cvtepi32_ps(_32);
}
+// Widens four uint16_t lanes to float.
+template<> inline Sk4f SkNx_cast<float, uint16_t, 4>(const Sk4h& src) {
+ // Interleaving the low four 16-bit lanes with zeros zero-extends each one
+ // to 32 bits, ready for the int32 -> float conversion.
+ const __m128i zeros = _mm_setzero_si128();
+ const __m128i widened = _mm_unpacklo_epi16(src.fVec, zeros);
+ return _mm_cvtepi32_ps(widened);
+}
+
static inline void Sk4f_ToBytes(uint8_t bytes[16],
const Sk4f& a, const Sk4f& b, const Sk4f& c, const Sk4f& d) {
_mm_storeu_si128((__m128i*)bytes,