Diffstat (limited to 'src/jumper')
-rw-r--r--  src/jumper/SkJumper.h               |  6
-rw-r--r--  src/jumper/SkJumper_misc.h          |  2
-rw-r--r--  src/jumper/SkJumper_stages.cpp      | 36
-rw-r--r--  src/jumper/SkJumper_stages_8bit.cpp |  2
-rw-r--r--  src/jumper/SkJumper_vectors.h       | 57
-rwxr-xr-x  src/jumper/build_stages.py          |  2
6 files changed, 60 insertions, 45 deletions
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index d4e8ef4f37..20b8d32aba 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -13,15 +13,15 @@
 // Keep it simple!

 // Externally facing functions (start_pipeline) are called a little specially on Windows.
-#if defined(JUMPER) && defined(WIN) && defined(__x86_64__)
+#if defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__x86_64__)
     #define MAYBE_MSABI __attribute__((ms_abi))                   // Use MS' ABI, not System V.
-#elif defined(JUMPER) && defined(WIN) && defined(__i386__)
+#elif defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__i386__)
     #define MAYBE_MSABI __attribute__((force_align_arg_pointer))  // Re-align stack 4 -> 16 bytes.
 #else
     #define MAYBE_MSABI
 #endif

-#if defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+#if defined(JUMPER_IS_OFFLINE) && (defined(__aarch64__) || defined(__arm__))
     // To reduce SkJumper's dependency on the Android NDK,
     // we provide what we need from <string.h>, <stdint.h>, and <stddef.h> ourselves.
     #define memcpy __builtin_memcpy

diff --git a/src/jumper/SkJumper_misc.h b/src/jumper/SkJumper_misc.h
index 8d7bfeb833..09758fba7d 100644
--- a/src/jumper/SkJumper_misc.h
+++ b/src/jumper/SkJumper_misc.h
@@ -13,7 +13,7 @@
 // Miscellany used by SkJumper_stages.cpp and SkJumper_vectors.h.
 // Every function in this file should be marked static and inline using SI.

-#if defined(JUMPER)
+#if defined(JUMPER_IS_OFFLINE)
     #define SI __attribute__((always_inline)) static inline
 #else
     #define SI static inline
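The MAYBE_MSABI shim above exists because the offline-compiled stages default to Clang's System V calling convention, while Windows callers expect Microsoft's. A minimal self-contained sketch of the same pattern, outside Skia: the _WIN64 guard stands in for Skia's WIN define (set by build_stages.py), and scale_floats is a made-up entry point for illustration.

    // Sketch only: an exported entry point that MSVC-built callers can invoke
    // even though this translation unit defaults to the System V ABI.
    #if defined(_WIN64)
        #define MAYBE_MSABI __attribute__((ms_abi))
    #else
        #define MAYBE_MSABI
    #endif

    MAYBE_MSABI extern "C" void scale_floats(float* dst, const float* src, int n) {
        for (int i = 0; i < n; i++) { dst[i] = src[i] * 2.0f; }
    }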
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 29651cab7d..593a88c4e0 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -13,16 +13,15 @@
 static const size_t kStride = sizeof(F) / sizeof(float);

 // A reminder:
-// Code guarded by defined(JUMPER) can assume that it will be compiled by Clang
-// and that F, I32, etc. are kStride-deep ext_vector_types of the appropriate type.
-// Otherwise, F, I32, etc. just alias the basic scalar types (and so kStride == 1).
+// When defined(JUMPER_IS_SCALAR), F, I32, etc. are normal scalar types and kStride is 1.
+// When not, F, I32, etc. are kStride-deep Clang ext_vector_type vectors of the appropriate type.

 // You can use most constants in this file, but in a few rare exceptions we read from this struct.
 using K = const SkJumper_constants;

 // A little wrapper macro to name Stages differently depending on the instruction set.
 // That lets us link together several options.
-#if !defined(JUMPER)
+#if !defined(JUMPER_IS_OFFLINE)
     #define WRAP(name) sk_##name
 #elif defined(__aarch64__)
     #define WRAP(name) sk_##name##_aarch64
@@ -60,7 +59,7 @@ using K = const SkJumper_constants;
     using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
 #endif

-#if defined(JUMPER) && defined(__AVX__)
+#if defined(JUMPER_IS_AVX) || defined(JUMPER_IS_AVX2)
     // We really want to make sure all paths go through this function's (implicit) vzeroupper.
     // If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
     __attribute__((disable_tail_calls))
@@ -68,10 +67,10 @@ using K = const SkJumper_constants;
 MAYBE_MSABI
 extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
                                      void** program, K* k) {
-#if defined(JUMPER)
-    F v;
+#if defined(JUMPER_IS_OFFLINE)
+    F v;    // Really no need to initialize.
 #else
-    F v{};
+    F v{};  // Compilers tend to whine about this, so it's easiest to just zero.
 #endif
     auto start = (Stage*)load_and_inc(program);
     const size_t x0 = x;
@@ -145,7 +144,7 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t y
 template <typename V, typename T>
 SI V load(const T* src, size_t tail) {
-#if defined(JUMPER)
+#if !defined(JUMPER_IS_SCALAR)
     __builtin_assume(tail < kStride);
     if (__builtin_expect(tail, 0)) {
         V v{};  // Any inactive lanes are zeroed.
@@ -166,7 +165,7 @@ SI V load(const T* src, size_t tail) {
 template <typename V, typename T>
 SI void store(T* dst, V v, size_t tail) {
-#if defined(JUMPER)
+#if !defined(JUMPER_IS_SCALAR)
     __builtin_assume(tail < kStride);
     if (__builtin_expect(tail, 0)) {
         switch (tail) {
@@ -609,10 +608,11 @@ STAGE(from_srgb_dst) {
 STAGE(to_srgb) {
     auto fn = [&](F l) {
         // We tweak c and d for each instruction set to make sure fn(1) is exactly 1.
-    #if defined(JUMPER) && defined(__SSE2__)
+    #if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || \
+        defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_AVX2 )
         const float c = 1.130048394203f,
                     d = 0.141357362270f;
-    #elif defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+    #elif defined(JUMPER_IS_NEON)
         const float c = 1.129999995232f,
                     d = 0.141381442547f;
     #else
@@ -1179,7 +1179,7 @@ STAGE(matrix_perspective) {
 SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                         F* r, F* g, F* b, F* a) {
     F fr, br, fg, bg, fb, bb, fa, ba;
-#if defined(JUMPER) && defined(__AVX2__)
+#if defined(JUMPER_IS_AVX2)
     if (c->stopCount <= 8) {
         fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx);
         br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx);
@@ -1344,14 +1344,14 @@ STAGE(mask_2pt_conical_degenerates) {
     U32 mask = 0xffffffff;

     // TODO: mtklein kindly volunteered to revisit this at some point.
-#if defined(JUMPER)
-    // Vector comparisons set all bits, so we can use something like this.
-    mask = mask & (mad(r, c->fDR, c->fR0) >= 0);  // R(t) >= 0
-    mask = mask & (r == r);                       // t != NaN
-#else
+#if defined(JUMPER_IS_SCALAR)
     // The portable version is more involved, 'cause we only get one bit back.
     mask = mask & if_then_else(mad(r, c->fDR, c->fR0) >= 0, U32(0xffffffff), U32(0));  // R(t) >= 0
     mask = mask & if_then_else(r == r,                      U32(0xffffffff), U32(0));  // t != NaN
+#else
+    // Vector comparisons set all bits, so we can use something like this.
+    mask = mask & (mad(r, c->fDR, c->fR0) >= 0);  // R(t) >= 0
+    mask = mask & (r == r);                       // t != NaN
 #endif

     unaligned_store(&c->fMask, mask);

diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 3e119010e4..5c73ea8cbe 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -16,7 +16,7 @@
 // pixels.  This is the natural format for kN32_SkColorType buffers, and we
 // hope the stages in this file can replace many custom legacy routines.

-#if !defined(JUMPER)
+#if !defined(JUMPER_IS_OFFLINE)
     #error "This file must be pre-compiled."
 #elif defined(__aarch64__)
     #define WRAP(name) sk_##name##_aarch64_8bit
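The tail parameter in load()/store() above is how the pipeline handles rows whose width is not a multiple of kStride: tail == 0 means a full vector's worth of pixels, while 1..kStride-1 means only that many lanes are live. A self-contained sketch of the same pattern, assuming Clang and a fixed 4-wide vector; the names Vec4 and load4 are illustrative, not Skia's.

    #include <cstddef>
    #include <cstring>

    using Vec4 = float __attribute__((ext_vector_type(4)));  // Clang vector extension.

    static inline Vec4 load4(const float* src, size_t tail) {
        if (tail) {                         // Partial load: 1..3 live lanes.
            Vec4 v{};                       // Inactive lanes stay zeroed.
            switch (tail) {
                case 3: v[2] = src[2];      // fall through
                case 2: v[1] = src[1];      // fall through
                case 1: v[0] = src[0];
            }
            return v;
        }
        Vec4 v;
        memcpy(&v, src, sizeof(v));         // Full (possibly unaligned) load.
        return v;
    }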
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index dabf3efefb..544b208a04 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -15,9 +15,24 @@
 // Every function in this file should be marked static and inline using SI (see SkJumper_misc.h).

-#if !defined(JUMPER)
-    // This path should lead to portable code that can be compiled directly into Skia.
-    // (All other paths are compiled offline by Clang into SkJumper_generated.S.)
+#if !defined(__clang__)
+    #define JUMPER_IS_SCALAR
+#elif defined(__aarch64__) || defined(__ARM_VFPV4__)
+    #define JUMPER_IS_NEON
+#elif defined(__AVX2__)
+    #define JUMPER_IS_AVX2
+#elif defined(__AVX__)
+    #define JUMPER_IS_AVX
+#elif defined(__SSE4_1__)
+    #define JUMPER_IS_SSE41
+#elif defined(__SSE2__)
+    #define JUMPER_IS_SSE2
+#else
+    #define JUMPER_IS_SCALAR
+#endif
+
+#if defined(JUMPER_IS_SCALAR)
+    // This path should lead to portable scalar code.
     #include <math.h>

     using F   = float   ;
@@ -75,7 +90,7 @@
         ptr[3] = a;
     }

-#elif defined(__aarch64__) || defined(__arm__)
+#elif defined(JUMPER_IS_NEON)
     #include <arm_neon.h>

     // Since we know we're using Clang, we can use its vector extensions.
@@ -187,7 +202,7 @@
         }
     }

-#elif defined(__AVX__)
+#elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_AVX2)
     #include <immintrin.h>

     // These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -200,7 +215,7 @@
     using U8  = V<uint8_t >;

     SI F mad(F f, F m, F a)  {
-    #if defined(__FMA__)
+    #if defined(JUMPER_IS_AVX2)
         return _mm256_fmadd_ps(f,m,a);
     #else
         return f*m+a;
@@ -232,7 +247,7 @@
         return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
                  p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
     }
-    #if defined(__AVX2__)
+    #if defined(JUMPER_IS_AVX2)
         SI F   gather(const float*    p, U32 ix) { return _mm256_i32gather_ps   (p, ix, 4); }
         SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
         SI U64 gather(const uint64_t* p, U32 ix) {
@@ -401,7 +416,7 @@
         }
     }

-#elif defined(__SSE2__)
+#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
     #include <immintrin.h>

     template <typename T> using V = T __attribute__((ext_vector_type(4)));
@@ -422,7 +437,7 @@
     SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }

     SI U16 pack(U32 v) {
-    #if defined(__SSE4_1__)
+    #if defined(JUMPER_IS_SSE41)
         auto p = _mm_packus_epi32(v,v);
     #else
         // Sign extend so that _mm_packs_epi32() does the pack we want.
@@ -442,7 +457,7 @@
     }

     SI F floor_(F v) {
-    #if defined(__SSE4_1__)
+    #if defined(JUMPER_IS_SSE41)
         return _mm_floor_ps(v);
     #else
         F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
@@ -569,16 +584,16 @@
 // We need to be careful with casts.
 // (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
 // These named casts and bit_cast() are always what they seem to be.
-#if defined(JUMPER)
-    SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
-    SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
-    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
-    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
-#else
+#if defined(JUMPER_IS_SCALAR)
     SI F   cast  (U32 v) { return   (F)v; }
     SI U32 trunc_(F   v) { return (U32)v; }
     SI U32 expand(U16 v) { return (U32)v; }
     SI U32 expand(U8  v) { return (U32)v; }
+#else
+    SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
+    SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
+    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
+    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
 #endif

 template <typename V>
@@ -587,7 +602,7 @@ SI V if_then_else(I32 c, V t, V e) {
 }

 SI U16 bswap(U16 x) {
-#if defined(JUMPER) && defined(__SSE2__) && !defined(__AVX__)
+#if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
     // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
     // when generating code for SSE2 and SSE4.1.  We'll do it manually...
     auto v = widen_cast<__m128i>(x);
@@ -625,10 +640,10 @@ SI F approx_powf(F x, F y) {
 }

 SI F from_half(U16 h) {
-#if defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+#if defined(JUMPER_IS_NEON)
     return vcvt_f32_f16(h);

-#elif defined(JUMPER) && defined(__AVX2__)
+#elif defined(JUMPER_IS_AVX2)
     return _mm256_cvtph_ps(h);

 #else
@@ -645,10 +660,10 @@ SI F from_half(U16 h) {
 }

 SI U16 to_half(F f) {
-#if defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+#if defined(JUMPER_IS_NEON)
     return vcvt_f16_f32(f);

-#elif defined(JUMPER) && defined(__AVX2__)
+#elif defined(JUMPER_IS_AVX2)
     return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

 #else
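The #else branches of from_half()/to_half(), elided above, fall back to doing the conversion with integer bit manipulation. For reference, a scalar half-to-float decode in that spirit; this is a sketch, not Skia's exact portable code, and it ignores zeros, denormals, infinities, and NaNs for brevity.

    #include <cstdint>
    #include <cstring>

    static inline float half_to_float(uint16_t h) {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;   // Sign moves to bit 31.
        uint32_t em   = (uint32_t)(h & 0x7fff) << 13;   // Exponent+mantissa into f32 position.
        uint32_t bits = sign | (em + (112u << 23));     // Rebias exponent: 15 -> 127.
        float f;
        memcpy(&f, &bits, sizeof(f));                   // Bit cast, the portable way.
        return f;
    }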
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index a6d637552d..517b049d0d 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -28,7 +28,7 @@ generated_win = sys.argv[7] if len(sys.argv) > 7 else generated_win

 clang = [ccache, clang, '-x', 'c++']
-cflags = ['-std=c++11', '-Os', '-DJUMPER',
+cflags = ['-std=c++11', '-Os', '-DJUMPER_IS_OFFLINE',
           '-momit-leaf-frame-pointer', '-ffp-contract=fast',
           '-fno-exceptions', '-fno-rtti', '-fno-unwind-tables']
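Because build_stages.py always drives the offline compile through Clang, everything behind -DJUMPER_IS_OFFLINE is free to use Clang-only vector extensions, which the JUMPER_IS_SCALAR fallback must avoid. A tiny self-contained illustration of that dependency, compiled with something like clang -std=c++11 -Os; the type aliases mirror SkJumper_vectors.h but are reproduced here for illustration.

    #include <cstdint>

    using F   = float    __attribute__((ext_vector_type(4)));  // Clang-only.
    using I32 = int32_t  __attribute__((ext_vector_type(4)));
    using U32 = uint32_t __attribute__((ext_vector_type(4)));

    // Lane-wise uint -> float, via a signed intermediate as SkJumper's cast() does.
    static inline F cast(U32 v) { return __builtin_convertvector((I32)v, F); }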