diff options
author | mtklein <mtklein@chromium.org> | 2015-07-31 10:46:50 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-07-31 10:46:50 -0700 |
commit | 7eb0945af254d376df11475150d184623104cf93 (patch) | |
tree | f1eca7b5e83428fc5f728967fed41324d5186e66 /src/core/SkUtils.cpp | |
parent | 5119ac069e6cf70175b5581eedee7d07347b216a (diff) |
Port SkUtils opts to SkOpts.
With this new arrangement, the benefits of inlining sk_memset16/32 have changed.
On x86, inlining and calling our custom code are not significantly different, except for small N<=10 where the inlined code is significantly slower.
On ARMv7 with NEON, our custom code is still significantly faster for N>10 (up to 2x faster). For small N<=10 inlining is still significantly faster.
On ARMv7 without NEON, our custom code is still ridiculously faster (up to 10x) than inlining for N>10, though for small N<=10 inlining is still a little faster.
We were not using the NEON memset16 and memset32 procs on ARMv8. At first blush, that seems to be an oversight, but if so it's an extremely lucky one. The ARMv8 code generation for our memset16/32 procs is total garbage, leaving those methods ~8x slower than just inlining the memset and letting the compiler autovectorize it.
So, no need to inline any more on x86, and still inline for N<=10 on ARMv7. Always inline for ARMv8.
BUG=skia:4117
Review URL: https://codereview.chromium.org/1270573002
Diffstat (limited to 'src/core/SkUtils.cpp')
-rw-r--r-- | src/core/SkUtils.cpp | 128 |
1 files changed, 0 insertions, 128 deletions
diff --git a/src/core/SkUtils.cpp b/src/core/SkUtils.cpp index afd523d369..eecf122100 100644 --- a/src/core/SkUtils.cpp +++ b/src/core/SkUtils.cpp @@ -8,134 +8,6 @@ #include "SkUtils.h" -#include "SkLazyFnPtr.h" - -#if 0 -#define assign_16_longs(dst, value) \ - do { \ - (dst)[0] = value; (dst)[1] = value; \ - (dst)[2] = value; (dst)[3] = value; \ - (dst)[4] = value; (dst)[5] = value; \ - (dst)[6] = value; (dst)[7] = value; \ - (dst)[8] = value; (dst)[9] = value; \ - (dst)[10] = value; (dst)[11] = value; \ - (dst)[12] = value; (dst)[13] = value; \ - (dst)[14] = value; (dst)[15] = value; \ - } while (0) -#else -#define assign_16_longs(dst, value) \ - do { \ - *(dst)++ = value; *(dst)++ = value; \ - *(dst)++ = value; *(dst)++ = value; \ - *(dst)++ = value; *(dst)++ = value; \ - *(dst)++ = value; *(dst)++ = value; \ - *(dst)++ = value; *(dst)++ = value; \ - *(dst)++ = value; *(dst)++ = value; \ - *(dst)++ = value; *(dst)++ = value; \ - *(dst)++ = value; *(dst)++ = value; \ - } while (0) -#endif - -/////////////////////////////////////////////////////////////////////////////// - -static void sk_memset16_portable(uint16_t dst[], uint16_t value, int count) { - SkASSERT(dst != NULL && count >= 0); - - if (count <= 0) { - return; - } - - // not sure if this helps to short-circuit on small values of count - if (count < 8) { - do { - *dst++ = (uint16_t)value; - } while (--count != 0); - return; - } - - // ensure we're on a long boundary - if ((size_t)dst & 2) { - *dst++ = (uint16_t)value; - count -= 1; - } - - uint32_t value32 = ((uint32_t)value << 16) | value; - - // handle the bulk with our unrolled macro - { - int sixteenlongs = count >> 5; - if (sixteenlongs) { - uint32_t* dst32 = (uint32_t*)dst; - do { - assign_16_longs(dst32, value32); - } while (--sixteenlongs != 0); - dst = (uint16_t*)dst32; - count &= 31; - } - } - - // handle (most) of the rest - { - int longs = count >> 1; - if (longs) { - do { - *(uint32_t*)dst = value32; - dst += 2; - } while (--longs != 0); - } 
- } - - // cleanup a possible trailing short - if (count & 1) { - *dst = (uint16_t)value; - } -} - -static void sk_memset32_portable(uint32_t dst[], uint32_t value, int count) { - SkASSERT(dst != NULL && count >= 0); - - int sixteenlongs = count >> 4; - if (sixteenlongs) { - do { - assign_16_longs(dst, value); - } while (--sixteenlongs != 0); - count &= 15; - } - - if (count) { - do { - *dst++ = value; - } while (--count != 0); - } -} - -namespace { -// These three methods technically need external linkage to be passed as template parameters. -// Since they can't be static, we hide them in an anonymous namespace instead. - -SkMemset16Proc choose_memset16() { - SkMemset16Proc proc = SkMemset16GetPlatformProc(); - return proc ? proc : sk_memset16_portable; -} - -SkMemset32Proc choose_memset32() { - SkMemset32Proc proc = SkMemset32GetPlatformProc(); - return proc ? proc : sk_memset32_portable; -} - -} // namespace - -void sk_memset16_large(uint16_t dst[], uint16_t value, int count) { - SK_DECLARE_STATIC_LAZY_FN_PTR(SkMemset16Proc, proc, choose_memset16); - proc.get()(dst, value, count); -} - -void sk_memset32_large(uint32_t dst[], uint32_t value, int count) { - SK_DECLARE_STATIC_LAZY_FN_PTR(SkMemset32Proc, proc, choose_memset32); - proc.get()(dst, value, count); -} - -/////////////////////////////////////////////////////////////////////////////// /* 0xxxxxxx 1 total 10xxxxxx // never a leading byte |