author    Mike Klein <mtklein@chromium.org>  2017-01-17 13:14:17 -0500
committer Mike Klein <mtklein@chromium.org>  2017-01-17 18:44:01 +0000
commit    844685434855c51e310884be9ec8e8b3f563033e (patch)
tree      e16318a978ef81688ae96c6e382efa03dc176b8c /src
parent    652ecb51624941545aaedcea47bf51b314738c9c (diff)
SkSplicer: no need for AI. Clang is good at this.
Change-Id: I1d5b82c0c2748b4d206d8d104fdd5dc04dc2693b
Reviewed-on: https://skia-review.googlesource.com/7116
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Mike Klein <mtklein@chromium.org>
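This change deletes the AI (always_inline) macro and trusts Clang's inliner instead. A minimal standalone sketch of the before/after pattern (hypothetical helper, not from this commit):

    // Before: a Clang attribute forces every helper to be inlined.
    #define AI __attribute__((always_inline)) inline
    AI static float lerp_forced(float a, float b, float t) { return a + (b-a)*t; }

    // After: a plain static function; Clang reliably inlines small
    // helpers like this at -O2 and above, so the attribute adds little.
    static float lerp(float a, float b, float t) { return a + (b-a)*t; }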
Diffstat (limited to 'src')
-rw-r--r--  src/splicer/SkSplicer_stages.cpp | 79
1 file changed, 38 insertions(+), 41 deletions(-)
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp
index 935995b29d..755bb5d9a5 100644
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -12,9 +12,6 @@
#error This file is not like the rest of Skia. It must be compiled with clang.
#endif
-// We have very specific inlining requirements. It helps to just take total control.
-#define AI __attribute__((always_inline)) inline
-
#if defined(__aarch64__)
#include <arm_neon.h>
@@ -25,15 +22,15 @@
using U8 = uint8_t __attribute__((ext_vector_type(4)));
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
- AI static F min(F a, F b) { return vminq_f32(a,b); }
- AI static F max(F a, F b) { return vmaxq_f32(a,b); }
- AI static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
- AI static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
- AI static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
- AI static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
- AI static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
-
- AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
+ static F min(F a, F b) { return vminq_f32(a,b); }
+ static F max(F a, F b) { return vmaxq_f32(a,b); }
+ static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
+ static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
+ static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
+ static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
+ static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
+
+ static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
@@ -46,15 +43,15 @@
using U32 = uint32_t __attribute__((ext_vector_type(2)));
using U8 = uint8_t __attribute__((ext_vector_type(2)));
- AI static F min(F a, F b) { return vmin_f32(a,b); }
- AI static F max(F a, F b) { return vmax_f32(a,b); }
- AI static F fma(F f, F m, F a) { return vfma_f32(a,f,m); }
- AI static F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
- AI static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
- AI static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
- AI static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+ static F min(F a, F b) { return vmin_f32(a,b); }
+ static F max(F a, F b) { return vmax_f32(a,b); }
+ static F fma(F f, F m, F a) { return vfma_f32(a,f,m); }
+ static F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
+ static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
+ static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
+ static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
- AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
+ static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
#else
#if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
#error On x86, compile with -mavx2 -mfma -mf16c.
@@ -67,22 +64,22 @@
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using U8 = uint8_t __attribute__((ext_vector_type(8)));
- AI static F min(F a, F b) { return _mm256_min_ps (a,b); }
- AI static F max(F a, F b) { return _mm256_max_ps (a,b); }
- AI static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
- AI static F rcp (F v) { return _mm256_rcp_ps (v); }
- AI static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
- AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
- AI static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+ static F min(F a, F b) { return _mm256_min_ps (a,b); }
+ static F max(F a, F b) { return _mm256_max_ps (a,b); }
+ static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
+ static F rcp (F v) { return _mm256_rcp_ps (v); }
+ static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
+ static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+ static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
- AI static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
+ static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
#endif
-AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
-AI static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
+static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
+static U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
template <typename T, typename P>
-AI static T unaligned_load(const P* p) {
+static T unaligned_load(const P* p) {
T v;
memcpy(&v, p, sizeof(v));
return v;
@@ -121,16 +118,16 @@ C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
// This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
// It's just a convenience to make a valid, spliceable Stage, nothing magic.
-#define STAGE(name) \
- AI static void name##_k(size_t x, size_t limit, void* ctx, K* k, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
- C void name(size_t x, size_t limit, void* ctx, K* k, \
- F r, F g, F b, F a, F dr, F dg, F db, F da) { \
- name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
- done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
- } \
- AI static void name##_k(size_t x, size_t limit, void* ctx, K* k, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#define STAGE(name) \
+ static void name##_k(size_t x, size_t limit, void* ctx, K* k, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+ C void name(size_t x, size_t limit, void* ctx, K* k, \
+ F r, F g, F b, F a, F dr, F dg, F db, F da) { \
+ name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
+ done (x,limit,ctx,k, r,g,b,a, dr,dg,db,da); \
+ } \
+ static void name##_k(size_t x, size_t limit, void* ctx, K* k, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
// We can now define Stages!
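For illustration, a stage written against the macro above might look like the following hypothetical example (not part of this diff); the macro supplies r, g, b, a (and dr, dg, db, da) as mutable references:

    // Hypothetical stage: premultiply the source color by its alpha.
    STAGE(premul_example) {
        r = r * a;
        g = g * a;
        b = b * a;
    }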