Have Windows use _mm_rsqrt_ss too.

Tidy up a little while I'm in here:
  1) SIMD headers are now included by SkTypes.h as appropriate.
  2) _mm_cvtss_f32() is pithier and generates the same code.

Looks like this was the only code checking for SSE the wrong way. After this CL:

  ~/skia (sse) $ git grep __SSE
  include/core/SkPreConfig.h: #if defined(__SSE4_2__)
  include/core/SkPreConfig.h: #elif defined(__SSE4_1__)
  include/core/SkPreConfig.h: #elif defined(__SSE3__)
  include/core/SkPreConfig.h: #elif defined(__SSE2__)

Every other check is in SkPreConfig.h, where it belongs.

This is going to affect some GMs subtly on Windows.

BUG=chromium:511458

No public API changes.
TBR=reed@google.com

Review URL: https://codereview.chromium.org/1248503004
author: mtklein <mtklein@chromium.org> 2015-07-23 08:37:02 -0700
committer: Commit bot <commit-bot@chromium.org> 2015-07-23 08:37:02 -0700
commit: b9c47f996867ec885a7ede697bd1b0859b6d9301 (patch)
tree: 4ad178012a0f89830af3d20aede0c4e23411bb7c
parent: 11dfc8e57a9a485476b4aff93ea2135ee1846d4f (diff)
1 files changed, 2 insertions, 10 deletions
diff --git a/include/core/SkFloatingPoint.h b/include/core/SkFloatingPoint.h
index ad1669c4ee..7c34706f7a 100644
--- a/include/core/SkFloatingPoint.h
+++ b/include/core/SkFloatingPoint.h
@@ -143,12 +143,6 @@ extern const uint32_t gIEEENegativeInfinity;
 #define SK_FloatInfinity            (*SkTCast<const float*>(&gIEEEInfinity))
 #define SK_FloatNegativeInfinity    (*SkTCast<const float*>(&gIEEENegativeInfinity))
 
-#if defined(__SSE__)
-#include <xmmintrin.h>
-#elif defined(SK_ARM_HAS_NEON)
-#include <arm_neon.h>
-#endif
-
 // Fast, approximate inverse square root.
 // Compare to name-brand "1.0f / sk_float_sqrt(x)".  Should be around 10x faster on SSE, 2x on NEON.
 static inline float sk_float_rsqrt(const float x) {
@@ -157,10 +151,8 @@ static inline float sk_float_rsqrt(const float x) {
 //
 // We do one step of Newton's method to refine the estimates in the NEON and null paths.  No
 // refinement is faster, but very inaccurate.  Two steps is more accurate, but slower than 1/sqrt.
-#if defined(__SSE__)
-    float result;
-    _mm_store_ss(&result, _mm_rsqrt_ss(_mm_set_ss(x)));
-    return result;
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
+    return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
 #elif defined(SK_ARM_HAS_NEON)
     // Get initial estimate.
     const float32x2_t xx = vdup_n_f32(x);  // Clever readers will note we're doing everything 2x.
author	mtklein <mtklein@chromium.org>	2015-07-23 08:37:02 -0700
committer	Commit bot <commit-bot@chromium.org>	2015-07-23 08:37:02 -0700
commit	b9c47f996867ec885a7ede697bd1b0859b6d9301 (patch)
tree	4ad178012a0f89830af3d20aede0c4e23411bb7c
parent	11dfc8e57a9a485476b4aff93ea2135ee1846d4f (diff)