diff options
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 28 |
1 files changed, 23 insertions, 5 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index 0c310515e3..f2abe3d5af 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -32,6 +32,8 @@ using K = const SkJumper_constants; static F gather(const float* p, U32 ix) { return p[ix]; } + #define WRAP(name) sk_##name + #elif defined(__aarch64__) #include <arm_neon.h> @@ -53,6 +55,8 @@ using K = const SkJumper_constants; static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; } + #define WRAP(name) sk_##name##_aarch64 + #elif defined(__ARM_NEON__) #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__) #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb. @@ -76,6 +80,8 @@ using K = const SkJumper_constants; static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; } + #define WRAP(name) sk_##name##_armv7 + #elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__) #include <immintrin.h> @@ -96,6 +102,8 @@ using K = const SkJumper_constants; static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); } + #define WRAP(name) sk_##name##_hsw + #elif defined(__SSE2__) #include <immintrin.h> @@ -120,6 +128,12 @@ using K = const SkJumper_constants; } static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; } + + #if defined(__SSE4_1__) + #define WRAP(name) sk_##name##_sse41 + #else + #define WRAP(name) sk_##name##_sse2 + #endif #endif // We need to be a careful with casts. @@ -190,7 +204,7 @@ static void* load_and_inc(void**& program) { #define STAGE(name) \ static void name##_k(size_t& x, void* ctx, K* k, \ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ - extern "C" void sk_##name(size_t x, void** program, K* k, \ + extern "C" void WRAP(name)(size_t x, void** program, K* k, \ F r, F g, F b, F a, F dr, F dg, F db, F da) { \ auto ctx = load_and_inc(program); \ name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da); \ @@ -202,7 +216,7 @@ static void* load_and_inc(void**& program) { // Some glue stages that don't fit the normal pattern of stages. -extern "C" void sk_start_pipeline(size_t x, void** program, K* k) { +extern "C" void WRAP(start_pipeline)(size_t x, void** program, K* k) { auto next = (Stage*)load_and_inc(program); F v{}; // TODO: faster uninitialized? next(x,program,k, v,v,v,v, v,v,v,v); @@ -210,13 +224,17 @@ extern "C" void sk_start_pipeline(size_t x, void** program, K* k) { #if defined(JUMPER) && defined(__x86_64__) __attribute__((ms_abi)) - extern "C" void sk_start_pipeline_ms(size_t x, void** program, K* k) { - sk_start_pipeline(x,program,k); + extern "C" void WRAP(start_pipeline_ms)(size_t x, void** program, K* k) { + WRAP(start_pipeline)(x,program,k); } #endif // Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller). -extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {} +extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) { +#if defined(JUMPER) && defined(__AVX2__) + asm("vzeroupper"); +#endif +} // We can now define Stages! |