Port SkUtils opts to SkOpts.

With this new arrangement, the benefits of inlining sk_memset16/32 have changed. On x86, they're not significantly different, except for small N<=10 where the inlined code is significantly slower. On ARMv7 with NEON, our custom code is still significantly faster for N>10 (up to 2x faster). For small N<=10 inlining is still significantly faster. On ARMv7 without NEON, our custom code is still ridiculously faster (up to 10x) than inlining for N>10, though for small N<=10 inlining is still a little faster. We were not using the NEON memset16 and memset32 procs on ARMv8. At first blush, that seems to be an oversight, but if so it's an extremely lucky one. The ARMv8 code generation for our memset16/32 procs is total garbage, leaving those methods ~8x slower than just inlining the memset and relying on the compiler's autovectorization. So, no need to inline any more on x86, and still inline for N<=10 on ARMv7. Always inline for ARMv8. BUG=skia:4117 Review URL: https://codereview.chromium.org/1270573002
author: mtklein <mtklein@chromium.org> 2015-07-31 10:46:50 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-07-31 10:46:50 -0700
commit: 7eb0945af254d376df11475150d184623104cf93 (patch)
tree: f1eca7b5e83428fc5f728967fed41324d5186e66 /include
parent: 5119ac069e6cf70175b5581eedee7d07347b216a (diff)
3 files changed, 52 insertions, 35 deletions
diff --git a/include/core/SkFloatingPoint.h b/include/core/SkFloatingPoint.h
index 5ca4d103d0..2ca1d088d3 100644
--- a/include/core/SkFloatingPoint.h
+++ b/include/core/SkFloatingPoint.h
@@ -11,6 +11,7 @@
 #define SkFloatingPoint_DEFINED
 
 #include "SkTypes.h"
+#include "../private/SkOpts.h"
 
 #include <math.h>
 #include <float.h>
@@ -127,8 +128,6 @@ extern const uint32_t gIEEENegativeInfinity;
 #define SK_FloatInfinity            (*SkTCast<const float*>(&gIEEEInfinity))
 #define SK_FloatNegativeInfinity    (*SkTCast<const float*>(&gIEEENegativeInfinity))
 
-namespace SkOpts { extern float (*rsqrt)(float); }
-
 // Fast, approximate inverse square root.
 // Compare to name-brand "1.0f / sk_float_sqrt(x)".  Should be around 10x faster on SSE, 2x on NEON.
 static inline float sk_float_rsqrt(const float x) {
diff --git a/include/core/SkUtils.h b/include/core/SkUtils.h
index bca76ed54f..b007b7711d 100644
--- a/include/core/SkUtils.h
+++ b/include/core/SkUtils.h
@@ -9,57 +9,46 @@
 #define SkUtils_DEFINED
 
 #include "SkTypes.h"
+#include "../private/SkOpts.h"
 
 ///////////////////////////////////////////////////////////////////////////////
 
-// Determined empirically using bench/MemsetBench.cpp on a Nexus 7, Nexus 9, and desktop.
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 || defined(SK_ARM_HAS_NEON)
-    // Platforms where we can assume an autovectorizer will give us a good inline memset.
-    #define SK_SMALL_MEMSET 1000
-#else
-    // Platforms like Chrome on ARMv7 that don't typically compile with NEON globally.
-    #define SK_SMALL_MEMSET 10
-#endif
-
+// The inlining heuristics below were determined using bench/MemsetBench.cpp
+// on an x86 desktop, a Nexus 7 with and without NEON, and a Nexus 9:
+//   - on x86, inlining was never faster,
+//   - on ARMv7, inlining was faster for N<=10.  Putting this check inside the NEON
+//     code was not helpful; it's got to be here outside.
+//   - NEON code generation for ARMv8 with GCC 4.9 is terrible,
+//     making the NEON code ~8x slower than just a serial loop.
 
 /** Similar to memset(), but it assigns a 16bit value into the buffer.
     @param buffer   The memory to have value copied into it
     @param value    The 16bit value to be copied into buffer
     @param count    The number of times value should be copied into the buffer.
 */
-void sk_memset16_large(uint16_t dst[], uint16_t value, int count);
-inline void sk_memset16(uint16_t dst[], uint16_t value, int count) {
-    if (count <= SK_SMALL_MEMSET) {
-        for (int i = 0; i < count; i++) {
-            dst[i] = value;
-        }
-    } else {
-        sk_memset16_large(dst, value, count);
-    }
+static inline void sk_memset16(uint16_t buffer[], uint16_t value, int count) {
+#if defined(SK_CPU_ARM64)
+    while (count --> 0) { *buffer++ = value; } return;
+#elif defined(SK_CPU_ARM32)
+    if (count <= 10) { while (count --> 0) { *buffer++ = value; } return; }
+#endif
+    SkOpts::memset16(buffer, value, count);
 }
-typedef void (*SkMemset16Proc)(uint16_t dst[], uint16_t value, int count);
-SkMemset16Proc SkMemset16GetPlatformProc();
 
 /** Similar to memset(), but it assigns a 32bit value into the buffer.
     @param buffer   The memory to have value copied into it
     @param value    The 32bit value to be copied into buffer
     @param count    The number of times value should be copied into the buffer.
 */
-void sk_memset32_large(uint32_t dst[], uint32_t value, int count);
-inline void sk_memset32(uint32_t dst[], uint32_t value, int count) {
-    if (count <= SK_SMALL_MEMSET) {
-        for (int i = 0; i < count; i++) {
-            dst[i] = value;
-        }
-    } else {
-        sk_memset32_large(dst, value, count);
-    }
+static inline void sk_memset32(uint32_t buffer[], uint32_t value, int count) {
+#if defined(SK_CPU_ARM64)
+    while (count --> 0) { *buffer++ = value; } return;
+#elif defined(SK_CPU_ARM32)
+    if (count <= 10) { while (count --> 0) { *buffer++ = value; } return; }
+#endif
+    SkOpts::memset32(buffer, value, count);
 }
 
-typedef void (*SkMemset32Proc)(uint32_t dst[], uint32_t value, int count);
-SkMemset32Proc SkMemset32GetPlatformProc();
-
-#undef SK_SMALL_MEMSET
 
 ///////////////////////////////////////////////////////////////////////////////
 
diff --git a/include/private/SkOpts.h b/include/private/SkOpts.h
new file mode 100644
index 0000000000..0594588e29
--- /dev/null
+++ b/include/private/SkOpts.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2015 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkOpts_DEFINED
+#define SkOpts_DEFINED
+
+#include "SkTypes.h"
+
+namespace SkOpts {
+    // Call to replace pointers to portable functions with pointers to CPU-specific functions.
+    // Thread-safe and idempotent.
+    // Called by SkGraphics::Init(), and automatically #if SK_ALLOW_STATIC_GLOBAL_INITIALIZERS.
+    void Init();
+
+    // Declare function pointers here...
+
+    // Returns a fast approximation of 1.0f/sqrtf(x).
+    extern float (*rsqrt)(float);
+
+    // See SkUtils.h
+    extern void (*memset16)(uint16_t[], uint16_t, int);
+    extern void (*memset32)(uint32_t[], uint32_t, int);
+}
+
+#endif//SkOpts_DEFINED
author	mtklein <mtklein@chromium.org>	2015-07-31 10:46:50 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-07-31 10:46:50 -0700
commit	7eb0945af254d376df11475150d184623104cf93 (patch)
tree	f1eca7b5e83428fc5f728967fed41324d5186e66 /include
parent	5119ac069e6cf70175b5581eedee7d07347b216a (diff)