From 73d7ffca4e12b4f227ebed447c84564285d36548 Mon Sep 17 00:00:00 2001 From: Mike Klein Date: Wed, 25 Jul 2018 09:19:23 -0400 Subject: refine NEON checks in raster pipeline This replaces most defined(__ARM_NEON) with defined(JUMPER_IS_NEON), which is defined(__ARM_NEON) && defined(__clang__). When compiled by Clang like our bots and primary users, this is a noop, but will avoid some of the NEON paths for compilers defining __ARM_NEON that are not Clang. Bug: skia:8178 Change-Id: Ifaea940c95b61d2fefadbd4e752cc477c571eafa Reviewed-on: https://skia-review.googlesource.com/143301 Commit-Queue: Herb Derby Auto-Submit: Mike Klein Reviewed-by: Herb Derby --- src/opts/SkRasterPipeline_opts.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h index 0dce9cfdb6..1eb4d39fc0 100644 --- a/src/opts/SkRasterPipeline_opts.h +++ b/src/opts/SkRasterPipeline_opts.h @@ -706,7 +706,7 @@ static const size_t N = sizeof(F) / sizeof(float); // Any custom ABI to use for all (non-externally-facing) stage functions? // Also decide here whether to use narrow (compromise) or wide (ideal) stages. -#if defined(__arm__) && defined(__ARM_NEON) +#if defined(__arm__) && defined(JUMPER_IS_NEON) // This lets us pass vectors more efficiently on 32-bit ARM. // We can still only pass 16 floats, so best as 4x {r,g,b,a}. #define ABI __attribute__((pcs("aapcs-vfp"))) @@ -2363,7 +2363,7 @@ static void start_pipeline(const size_t x0, const size_t y0, SI U16 div255(U16 v) { #if 0 return (v+127)/255; // The ideal rounding divide by 255. -#elif 1 && defined(__ARM_NEON) +#elif 1 && defined(JUMPER_IS_NEON) // With NEON we can compute (v+127)/255 as (v + ((v+128)>>8) + 128)>>8 // just as fast as we can do the approximation below, so might as well be correct! // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up. @@ -2425,7 +2425,7 @@ SI F rcp(F x) { __m128 lo,hi; split(x, &lo,&hi); return join(_mm_rcp_ps(lo), _mm_rcp_ps(hi)); -#elif defined(__ARM_NEON) +#elif defined(JUMPER_IS_NEON) auto rcp = [](float32x4_t v) { auto est = vrecpeq_f32(v); return vrecpsq_f32(v,est)*est; @@ -2450,7 +2450,7 @@ SI F sqrt_(F x) { float32x4_t lo,hi; split(x, &lo,&hi); return join(vsqrtq_f32(lo), vsqrtq_f32(hi)); -#elif defined(__ARM_NEON) +#elif defined(JUMPER_IS_NEON) auto sqrt = [](float32x4_t v) { auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v). est *= vrsqrtsq_f32(v,est*est); @@ -2770,7 +2770,7 @@ SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) { } SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { -#if 1 && defined(__ARM_NEON) +#if 1 && defined(JUMPER_IS_NEON) uint8x8x4_t rgba; switch (tail & (N-1)) { case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break; @@ -2791,7 +2791,7 @@ SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16 #endif } SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { -#if 1 && defined(__ARM_NEON) +#if 1 && defined(JUMPER_IS_NEON) uint8x8x4_t rgba = {{ cast(r), cast(g), -- cgit v1.2.3