From 73d7ffca4e12b4f227ebed447c84564285d36548 Mon Sep 17 00:00:00 2001
From: Mike Klein <mtklein@chromium.org>
Date: Wed, 25 Jul 2018 09:19:23 -0400
Subject: refine NEON checks in raster pipeline

This replaces most uses of defined(__ARM_NEON) with
defined(JUMPER_IS_NEON), which is equivalent to
defined(__ARM_NEON) && defined(__clang__).

When compiled by Clang, as by our bots and primary users, this is a
no-op; for non-Clang compilers that define __ARM_NEON, it avoids some
of the NEON paths.

Bug: skia:8178

Change-Id: Ifaea940c95b61d2fefadbd4e752cc477c571eafa
Reviewed-on: https://skia-review.googlesource.com/143301
Commit-Queue: Herb Derby <herb@google.com>
Auto-Submit: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
---
 src/opts/SkRasterPipeline_opts.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'src')
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 0dce9cfdb6..1eb4d39fc0 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -706,7 +706,7 @@ static const size_t N = sizeof(F) / sizeof(float);
 
 // Any custom ABI to use for all (non-externally-facing) stage functions?
 // Also decide here whether to use narrow (compromise) or wide (ideal) stages.
-#if defined(__arm__) && defined(__ARM_NEON)
+#if defined(__arm__) && defined(JUMPER_IS_NEON)
     // This lets us pass vectors more efficiently on 32-bit ARM.
     // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
     #define ABI __attribute__((pcs("aapcs-vfp")))
@@ -2363,7 +2363,7 @@ static void start_pipeline(const size_t x0,     const size_t y0,
 SI U16 div255(U16 v) {
 #if 0
     return (v+127)/255;  // The ideal rounding divide by 255.
-#elif 1 && defined(__ARM_NEON)
+#elif 1 && defined(JUMPER_IS_NEON)
     // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8
     // just as fast as we can do the approximation below, so might as well be correct!
     // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up.
@@ -2425,7 +2425,7 @@ SI F rcp(F x) {
     __m128 lo,hi;
     split(x, &lo,&hi);
     return join<F>(_mm_rcp_ps(lo), _mm_rcp_ps(hi));
-#elif defined(__ARM_NEON)
+#elif defined(JUMPER_IS_NEON)
     auto rcp = [](float32x4_t v) {
         auto est = vrecpeq_f32(v);
         return vrecpsq_f32(v,est)*est;
@@ -2450,7 +2450,7 @@ SI F sqrt_(F x) {
     float32x4_t lo,hi;
     split(x, &lo,&hi);
     return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
-#elif defined(__ARM_NEON)
+#elif defined(JUMPER_IS_NEON)
     auto sqrt = [](float32x4_t v) {
         auto est = vrsqrteq_f32(v);  // Estimate and two refinement steps for est = rsqrt(v).
         est *= vrsqrtsq_f32(v,est*est);
@@ -2770,7 +2770,7 @@ SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
 }
 
 SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
-#if 1 && defined(__ARM_NEON)
+#if 1 && defined(JUMPER_IS_NEON)
     uint8x8x4_t rgba;
     switch (tail & (N-1)) {
         case 0: rgba = vld4_u8     ((const uint8_t*)(ptr+0)         ); break;
@@ -2791,7 +2791,7 @@ SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16
 #endif
 }
 SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
-#if 1 && defined(__ARM_NEON)
+#if 1 && defined(JUMPER_IS_NEON)
     uint8x8x4_t rgba = {{
         cast<U8>(r),
         cast<U8>(g),
-- 
cgit v1.2.3