diff options
author | mtklein <mtklein@chromium.org> | 2015-04-09 14:05:17 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-04-09 14:05:17 -0700 |
commit | 9ff378b01be0b0a3fc35677a2155ba4ade286cc2 (patch) | |
tree | 2d9221ef31e3632dde74bda9a1023bc1ce71b27c | |
parent | a1e41c6d9a2029eb836c6120bb154ccd25e1588d (diff) |
Rewrite memset benches, then use results to add a small-N optimization.
The benches for N <= 10 get around 2x faster on my Nexus 7 (N7) and Nexus 9 (N9). I believe this
is because of the reduced function-call-then-function-pointer-call overhead on
the N7, and additionally because autovectorization seems to beat our NEON code
for small N on the N9.
My desktop is unchanged, though that's probably because N=10 lies well within a
region where memset's performance is essentially constant: N=100 takes only
about 2x as long as N=1 and N=10, which perform nearly identically.
BUG=skia:
Review URL: https://codereview.chromium.org/1073863002
-rw-r--r-- | bench/MemsetBench.cpp | 151 | ||||
-rw-r--r-- | include/core/SkUtils.h | 35 | ||||
-rw-r--r-- | src/core/SkUtils.cpp | 4 |
3 files changed, 95 insertions, 95 deletions
diff --git a/bench/MemsetBench.cpp b/bench/MemsetBench.cpp index e0390bba38..d1682bdd70 100644 --- a/bench/MemsetBench.cpp +++ b/bench/MemsetBench.cpp @@ -6,110 +6,79 @@ */ #include "Benchmark.h" -#include "SkCanvas.h" -#include "SkString.h" +#include "SkTemplates.h" #include "SkUtils.h" +template <typename T, bool kInline> class MemsetBench : public Benchmark { - SkString fName; - -protected: - int fMinSize; - int fMaxSize; - enum { - kBufferSize = 10000, - VALUE32 = 0x12345678, - VALUE16 = 0x1234 - }; - - enum MemsetType { - MEMSET16 = 16, - MEMSET32 = 32 - }; - public: - MemsetBench(MemsetType type, int minSize, int maxSize) { - SkASSERT((minSize < maxSize) && (maxSize <= kBufferSize)); - fMinSize = minSize; - fMaxSize = maxSize; - fName.printf("memset%d_%d_%d", type, minSize, maxSize); - } + explicit MemsetBench(int n) + : fN(n) + , fBuffer(n) + , fName(SkStringPrintf("memset%d_%d%s", sizeof(T)*8, n, kInline ? "_inline" : "")) {} - bool isSuitableFor(Backend backend) override { - return backend == kNonRendering_Backend; - } + bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } + const char* onGetName() override { return fName.c_str(); } - virtual void performTest() = 0; - -protected: - const char* onGetName() override { - return fName.c_str(); - } - - void onDraw(const int loops, SkCanvas* canvas) override { - for (int i = 0; i < loops; ++i) { - this->performTest(); - } - } + void onDraw(const int loops, SkCanvas*) override; private: - typedef Benchmark INHERITED; + int fN; + SkAutoTMalloc<T> fBuffer; + SkString fName; }; -class Memset32Bench : public MemsetBench { - uint32_t kBuffer[kBufferSize + 3]; -public: - Memset32Bench(int minSize, int maxSize) - : INHERITED(MEMSET32, minSize, maxSize) {} +template <> void MemsetBench<uint32_t, false>::onDraw(const int loops, SkCanvas*) { + for (int i = 0; i < 1000*loops; i++) { + sk_memset32(fBuffer.get(), 0xFACEB004, fN); + } +} -protected: - void performTest() override { - for(int 
j = fMinSize; j < fMaxSize; ++j){ - sk_memset32(kBuffer, VALUE32, j); - sk_memset32(kBuffer + 1, VALUE32, j); - sk_memset32(kBuffer + 2, VALUE32, j); - sk_memset32(kBuffer + 3, VALUE32, j); - } +template <> void MemsetBench<uint16_t, false>::onDraw(const int loops, SkCanvas*) { + for (int i = 0; i < 1000*loops; i++) { + sk_memset16(fBuffer.get(), 0x4973, fN); } -private: - typedef MemsetBench INHERITED; -}; +} -class Memset16Bench : public MemsetBench { - uint16_t kBuffer[kBufferSize + 7]; -public: - Memset16Bench(int minSize, int maxSize) - : INHERITED(MEMSET16, minSize, maxSize) {} +template <typename T> +static void memsetT(T* dst, T val, int n) { + for (int i = 0; i < n; i++) { dst[i] = val; } +} -protected: - void performTest() override { - for(int j = fMinSize; j < fMaxSize; ++j){ - sk_memset16(kBuffer, VALUE16, j); - sk_memset16(kBuffer + 1, VALUE16, j); - sk_memset16(kBuffer + 2, VALUE16, j); - sk_memset16(kBuffer + 3, VALUE16, j); - sk_memset16(kBuffer + 4, VALUE16, j); - sk_memset16(kBuffer + 5, VALUE16, j); - sk_memset16(kBuffer + 6, VALUE16, j); - sk_memset16(kBuffer + 7, VALUE16, j); - } +template <> void MemsetBench<uint32_t, true>::onDraw(const int loops, SkCanvas*) { + for (int i = 0; i < 1000*loops; i++) { + memsetT<uint32_t>(fBuffer.get(), 0xFACEB004, fN); } -private: - typedef MemsetBench INHERITED; -}; +} -DEF_BENCH(return new Memset32Bench(1, 600);) -DEF_BENCH(return new Memset32Bench(600, 800);) -DEF_BENCH(return new Memset32Bench(800, 1000);) -DEF_BENCH(return new Memset32Bench(1000, 2000);) -DEF_BENCH(return new Memset32Bench(2000, 3000);) -DEF_BENCH(return new Memset32Bench(3000, 4000);) -DEF_BENCH(return new Memset32Bench(4000, 5000);) - -DEF_BENCH(return new Memset16Bench(1, 600);) -DEF_BENCH(return new Memset16Bench(600, 800);) -DEF_BENCH(return new Memset16Bench(800, 1000);) -DEF_BENCH(return new Memset16Bench(1000, 2000);) -DEF_BENCH(return new Memset16Bench(2000, 3000);) -DEF_BENCH(return new Memset16Bench(3000, 4000);) 
-DEF_BENCH(return new Memset16Bench(4000, 5000);) +template <> void MemsetBench<uint16_t, true>::onDraw(const int loops, SkCanvas*) { + for (int i = 0; i < 1000*loops; i++) { + memsetT<uint16_t>(fBuffer.get(), 0x4973, fN); + } +} + +DEF_BENCH(return (new MemsetBench<uint32_t, true>(1))); +DEF_BENCH(return (new MemsetBench<uint32_t, false>(1))); +DEF_BENCH(return (new MemsetBench<uint32_t, true>(10))); +DEF_BENCH(return (new MemsetBench<uint32_t, false>(10))); +DEF_BENCH(return (new MemsetBench<uint32_t, true>(100))); +DEF_BENCH(return (new MemsetBench<uint32_t, false>(100))); +DEF_BENCH(return (new MemsetBench<uint32_t, true>(1000))); +DEF_BENCH(return (new MemsetBench<uint32_t, false>(1000))); +DEF_BENCH(return (new MemsetBench<uint32_t, true>(10000))); +DEF_BENCH(return (new MemsetBench<uint32_t, false>(10000))); +DEF_BENCH(return (new MemsetBench<uint32_t, true>(100000))); +DEF_BENCH(return (new MemsetBench<uint32_t, false>(100000))); + +DEF_BENCH(return (new MemsetBench<uint16_t, true>(1))); +DEF_BENCH(return (new MemsetBench<uint16_t, false>(1))); +DEF_BENCH(return (new MemsetBench<uint16_t, true>(10))); +DEF_BENCH(return (new MemsetBench<uint16_t, false>(10))); +DEF_BENCH(return (new MemsetBench<uint16_t, true>(100))); +DEF_BENCH(return (new MemsetBench<uint16_t, false>(100))); +DEF_BENCH(return (new MemsetBench<uint16_t, true>(1000))); +DEF_BENCH(return (new MemsetBench<uint16_t, false>(1000))); +DEF_BENCH(return (new MemsetBench<uint16_t, true>(10000))); +DEF_BENCH(return (new MemsetBench<uint16_t, false>(10000))); +DEF_BENCH(return (new MemsetBench<uint16_t, true>(100000))); +DEF_BENCH(return (new MemsetBench<uint16_t, false>(100000))); diff --git a/include/core/SkUtils.h b/include/core/SkUtils.h index d522ae0dea..3c24b1f599 100644 --- a/include/core/SkUtils.h +++ b/include/core/SkUtils.h @@ -12,12 +12,31 @@ /////////////////////////////////////////////////////////////////////////////// +// Determined empirically using bench/MemsetBench.cpp on a Nexus 7, 
Nexus 9, and desktop. +#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 || defined(SK_ARM_HAS_NEON) + // Platforms where we can assume an autovectorizer will give us a good inline memset. + #define SK_SMALL_MEMSET 1000 +#else + // Platforms like Chrome on ARMv7 that don't typically compile with NEON globally. + #define SK_SMALL_MEMSET 10 +#endif + + /** Similar to memset(), but it assigns a 16bit value into the buffer. @param buffer The memory to have value copied into it @param value The 16bit value to be copied into buffer @param count The number of times value should be copied into the buffer. */ -void sk_memset16(uint16_t dst[], uint16_t value, int count); +void sk_memset16_large(uint16_t dst[], uint16_t value, int count); +inline void sk_memset16(uint16_t dst[], uint16_t value, int count) { + if (count <= SK_SMALL_MEMSET) { + for (int i = 0; i < count; i++) { + dst[i] = value; + } + } else { + sk_memset16_large(dst, value, count); + } +} typedef void (*SkMemset16Proc)(uint16_t dst[], uint16_t value, int count); SkMemset16Proc SkMemset16GetPlatformProc(); @@ -26,10 +45,22 @@ SkMemset16Proc SkMemset16GetPlatformProc(); @param value The 32bit value to be copied into buffer @param count The number of times value should be copied into the buffer. */ -void sk_memset32(uint32_t dst[], uint32_t value, int count); +void sk_memset32_large(uint32_t dst[], uint32_t value, int count); +inline void sk_memset32(uint32_t dst[], uint32_t value, int count) { + if (count <= SK_SMALL_MEMSET) { + for (int i = 0; i < count; i++) { + dst[i] = value; + } + } else { + sk_memset32_large(dst, value, count); + } +} + typedef void (*SkMemset32Proc)(uint32_t dst[], uint32_t value, int count); SkMemset32Proc SkMemset32GetPlatformProc(); +#undef SK_SMALL_MEMSET + /** Similar to memcpy(), but it copies count 32bit values from src to dst. 
@param dst The memory to have value copied into it @param src The memory to have value copied from it diff --git a/src/core/SkUtils.cpp b/src/core/SkUtils.cpp index b063071932..33ea4db47c 100644 --- a/src/core/SkUtils.cpp +++ b/src/core/SkUtils.cpp @@ -134,12 +134,12 @@ SkMemcpy32Proc choose_memcpy32() { } // namespace -void sk_memset16(uint16_t dst[], uint16_t value, int count) { +void sk_memset16_large(uint16_t dst[], uint16_t value, int count) { SK_DECLARE_STATIC_LAZY_FN_PTR(SkMemset16Proc, proc, choose_memset16); proc.get()(dst, value, count); } -void sk_memset32(uint32_t dst[], uint32_t value, int count) { +void sk_memset32_large(uint32_t dst[], uint32_t value, int count) { SK_DECLARE_STATIC_LAZY_FN_PTR(SkMemset32Proc, proc, choose_memset32); proc.get()(dst, value, count); } |