Diffstat (limited to 'src/jumper')
 -rw-r--r--   src/jumper/SkJumper.h               |  6
 -rw-r--r--   src/jumper/SkJumper_misc.h          |  2
 -rw-r--r--   src/jumper/SkJumper_stages.cpp      | 36
 -rw-r--r--   src/jumper/SkJumper_stages_8bit.cpp |  2
 -rw-r--r--   src/jumper/SkJumper_vectors.h       | 57
 -rwxr-xr-x   src/jumper/build_stages.py          |  2
 6 files changed, 60 insertions, 45 deletions
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index d4e8ef4f37..20b8d32aba 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -13,15 +13,15 @@
// Keep it simple!
// Externally facing functions (start_pipeline) are called a little specially on Windows.
-#if defined(JUMPER) && defined(WIN) && defined(__x86_64__)
+#if defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__x86_64__)
#define MAYBE_MSABI __attribute__((ms_abi)) // Use MS' ABI, not System V.
-#elif defined(JUMPER) && defined(WIN) && defined(__i386__)
+#elif defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__i386__)
#define MAYBE_MSABI __attribute__((force_align_arg_pointer)) // Re-align stack 4 -> 16 bytes.
#else
#define MAYBE_MSABI
#endif
-#if defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+#if defined(JUMPER_IS_OFFLINE) && (defined(__aarch64__) || defined(__arm__))
// To reduce SkJumper's dependency on the Android NDK,
// we provide what we need from <string.h>, <stdint.h>, and <stddef.h> ourselves.
#define memcpy __builtin_memcpy
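The rename from JUMPER to JUMPER_IS_OFFLINE makes it explicit that this block is about how the offline (pre-compiled) build is called from Skia proper, not about vectorization. A minimal, self-contained sketch of the same guard applied to a hypothetical entry point (the real one is start_pipeline, visible further down in this diff):

    #include <cstddef>

    #if defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__x86_64__)
        #define MAYBE_MSABI __attribute__((ms_abi))                  // MS ABI, not System V.
    #elif defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__i386__)
        #define MAYBE_MSABI __attribute__((force_align_arg_pointer)) // Re-align stack 4 -> 16.
    #else
        #define MAYBE_MSABI                                          // In-tree builds: no-op.
    #endif

    // Any externally facing function needs the attribute so MSVC-built callers and the
    // Clang-built offline code agree on the calling convention.  The name and body here
    // are illustrative only.
    MAYBE_MSABI extern "C" void example_entry_point(std::size_t x, std::size_t y) {
        (void)x; (void)y;
    }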
diff --git a/src/jumper/SkJumper_misc.h b/src/jumper/SkJumper_misc.h
index 8d7bfeb833..09758fba7d 100644
--- a/src/jumper/SkJumper_misc.h
+++ b/src/jumper/SkJumper_misc.h
@@ -13,7 +13,7 @@
// Miscellany used by SkJumper_stages.cpp and SkJumper_vectors.h.
// Every function in this file should be marked static and inline using SI.
-#if defined(JUMPER)
+#if defined(JUMPER_IS_OFFLINE)
#define SI __attribute__((always_inline)) static inline
#else
#define SI static inline
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 29651cab7d..593a88c4e0 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -13,16 +13,15 @@
static const size_t kStride = sizeof(F) / sizeof(float);
// A reminder:
-// Code guarded by defined(JUMPER) can assume that it will be compiled by Clang
-// and that F, I32, etc. are kStride-deep ext_vector_types of the appropriate type.
-// Otherwise, F, I32, etc. just alias the basic scalar types (and so kStride == 1).
+// When defined(JUMPER_IS_SCALAR), F, I32, etc. are normal scalar types and kStride is 1.
+// When not, F, I32, etc. are kStride-deep Clang ext_vector_type vectors of the appropriate type.
// You can use most constants in this file, but in a few rare exceptions we read from this struct.
using K = const SkJumper_constants;
// A little wrapper macro to name Stages differently depending on the instruction set.
// That lets us link together several options.
-#if !defined(JUMPER)
+#if !defined(JUMPER_IS_OFFLINE)
#define WRAP(name) sk_##name
#elif defined(__aarch64__)
#define WRAP(name) sk_##name##_aarch64
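WRAP exists so each instruction-set flavor of a stage gets a distinct symbol and all flavors can be linked into one binary; the in-tree build keeps the plain sk_ prefix. A sketch of the expansion, truncated to the branches shown in this hunk:

    #if !defined(JUMPER_IS_OFFLINE)
        #define WRAP(name) sk_##name           // In-tree build: plain sk_ prefix.
    #elif defined(__aarch64__)
        #define WRAP(name) sk_##name##_aarch64
    #else
        #define WRAP(name) sk_##name##_other   // Placeholder; the real file has one suffix
    #endif                                     // per remaining instruction set.

    // WRAP(clamp_0) declares sk_clamp_0, sk_clamp_0_aarch64, etc., so every flavor of the
    // same stage can coexist in one library.  The stage name is just an example.
    extern "C" void WRAP(clamp_0)(void);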
@@ -60,7 +59,7 @@ using K = const SkJumper_constants;
using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
#endif
-#if defined(JUMPER) && defined(__AVX__)
+#if defined(JUMPER_IS_AVX) || defined(JUMPER_IS_AVX2)
// We really want to make sure all paths go through this function's (implicit) vzeroupper.
// If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
__attribute__((disable_tail_calls))
@@ -68,10 +67,10 @@ using K = const SkJumper_constants;
MAYBE_MSABI
extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
void** program, K* k) {
-#if defined(JUMPER)
- F v;
+#if defined(JUMPER_IS_OFFLINE)
+ F v; // Really no need to initialize.
#else
- F v{};
+ F v{}; // Compilers tend to whine about this, so it's easiest to just zero.
#endif
auto start = (Stage*)load_and_inc(program);
const size_t x0 = x;
@@ -145,7 +144,7 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t y
template <typename V, typename T>
SI V load(const T* src, size_t tail) {
-#if defined(JUMPER)
+#if !defined(JUMPER_IS_SCALAR)
__builtin_assume(tail < kStride);
if (__builtin_expect(tail, 0)) {
V v{}; // Any inactive lanes are zeroed.
@@ -166,7 +165,7 @@ SI V load(const T* src, size_t tail) {
template <typename V, typename T>
SI void store(T* dst, V v, size_t tail) {
-#if defined(JUMPER)
+#if !defined(JUMPER_IS_SCALAR)
__builtin_assume(tail < kStride);
if (__builtin_expect(tail, 0)) {
switch (tail) {
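load() and store() now key on !defined(JUMPER_IS_SCALAR) instead of defined(JUMPER): any Clang build, in-tree or offline, gets the lane-by-lane tail handling, while a non-Clang build (where kStride is 1) does not need it. A simplified, memcpy-based sketch of the partial load, assuming a hypothetical 4-wide float vector (Clang-only, like everything vectorized here):

    #include <stddef.h>
    #include <string.h>

    using F4 = float __attribute__((ext_vector_type(4)));   // Hypothetical 4-wide F; kStride would be 4.

    static inline F4 load4(const float* src, size_t tail) {  // tail is 0 for a full run,
        if (tail) {                                           // 1..3 for the last partial one.
            F4 v{};                                           // Inactive lanes stay zeroed.
            memcpy(&v, src, tail * sizeof(float));
            return v;
        }
        F4 v;
        memcpy(&v, src, sizeof(v));                           // Full-width unaligned load.
        return v;
    }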
@@ -609,10 +608,11 @@ STAGE(from_srgb_dst) {
STAGE(to_srgb) {
auto fn = [&](F l) {
// We tweak c and d for each instruction set to make sure fn(1) is exactly 1.
- #if defined(JUMPER) && defined(__SSE2__)
+ #if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || \
+ defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_AVX2 )
const float c = 1.130048394203f,
d = 0.141357362270f;
- #elif defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+ #elif defined(JUMPER_IS_NEON)
const float c = 1.129999995232f,
d = 0.141381442547f;
#else
@@ -1179,7 +1179,7 @@ STAGE(matrix_perspective) {
SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
F* r, F* g, F* b, F* a) {
F fr, br, fg, bg, fb, bb, fa, ba;
-#if defined(JUMPER) && defined(__AVX2__)
+#if defined(JUMPER_IS_AVX2)
if (c->stopCount <=8) {
fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx);
br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx);
@@ -1344,14 +1344,14 @@ STAGE(mask_2pt_conical_degenerates) {
U32 mask = 0xffffffff;
// TODO: mtklein kindly volunteered to revisit this at some point.
-#if defined(JUMPER)
- // Vector comparisons set all bits, so we can use something like this.
- mask = mask & (mad(r, c->fDR, c->fR0) >= 0); // R(t) >= 0
- mask = mask & (r == r); // t != NaN
-#else
+#if defined(JUMPER_IS_SCALAR)
// The portable version is more involved, 'cause we only get one bit back.
mask = mask & if_then_else(mad(r, c->fDR, c->fR0) >= 0, U32(0xffffffff), U32(0)); // R(t) >= 0
mask = mask & if_then_else(r == r, U32(0xffffffff), U32(0)); // t != NaN
+#else
+ // Vector comparisons set all bits, so we can use something like this.
+ mask = mask & (mad(r, c->fDR, c->fR0) >= 0); // R(t) >= 0
+ mask = mask & (r == r); // t != NaN
#endif
unaligned_store(&c->fMask, mask);
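In the vector branch a comparison such as r >= 0 already yields a per-lane mask of all ones or all zeros, so it can be ANDed straight into mask; the scalar branch only gets a single bool back and rebuilds the 0xffffffff/0 pattern with if_then_else. A minimal sketch of the vector idea, with the mad() term dropped and names chosen for illustration (Clang's default lax vector conversions let the signed comparison result AND with the unsigned mask, as in the stage above):

    #include <stdint.h>

    using F   = float    __attribute__((ext_vector_type(4)));   // 4-wide, as in the SSE paths.
    using U32 = uint32_t __attribute__((ext_vector_type(4)));

    static inline U32 valid_lanes(F r) {
        U32 mask = 0xffffffff;     // Scalar init splats across all lanes.
        mask = mask & (r >= 0);    // Lanes become 0xffffffff where true, 0 where false.
        mask = mask & (r == r);    // NaN compares unequal to itself, clearing NaN lanes.
        return mask;
    }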
diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 3e119010e4..5c73ea8cbe 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -16,7 +16,7 @@
// pixels. This is the natural format for kN32_SkColorType buffers, and we
// hope the stages in this file can replace many custom legacy routines.
-#if !defined(JUMPER)
+#if !defined(JUMPER_IS_OFFLINE)
#error "This file must be pre-compiled."
#elif defined(__aarch64__)
#define WRAP(name) sk_##name##_aarch64_8bit
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index dabf3efefb..544b208a04 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -15,9 +15,24 @@
// Every function in this file should be marked static and inline using SI (see SkJumper_misc.h).
-#if !defined(JUMPER)
- // This path should lead to portable code that can be compiled directly into Skia.
- // (All other paths are compiled offline by Clang into SkJumper_generated.S.)
+#if !defined(__clang__)
+ #define JUMPER_IS_SCALAR
+#elif defined(__aarch64__) || defined(__ARM_VFPV4__)
+ #define JUMPER_IS_NEON
+#elif defined(__AVX2__)
+ #define JUMPER_IS_AVX2
+#elif defined(__AVX__)
+ #define JUMPER_IS_AVX
+#elif defined(__SSE4_1__)
+ #define JUMPER_IS_SSE41
+#elif defined(__SSE2__)
+ #define JUMPER_IS_SSE2
+#else
+ #define JUMPER_IS_SCALAR
+#endif
+
+#if defined(JUMPER_IS_SCALAR)
+ // This path should lead to portable scalar code.
#include <math.h>
using F = float ;
@@ -75,7 +90,7 @@
ptr[3] = a;
}
-#elif defined(__aarch64__) || defined(__arm__)
+#elif defined(JUMPER_IS_NEON)
#include <arm_neon.h>
// Since we know we're using Clang, we can use its vector extensions.
@@ -187,7 +202,7 @@
}
}
-#elif defined(__AVX__)
+#elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_AVX2)
#include <immintrin.h>
// These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -200,7 +215,7 @@
using U8 = V<uint8_t >;
SI F mad(F f, F m, F a) {
- #if defined(__FMA__)
+ #if defined(JUMPER_IS_AVX2)
return _mm256_fmadd_ps(f,m,a);
#else
return f*m+a;
@@ -232,7 +247,7 @@
return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
}
- #if defined(__AVX2__)
+ #if defined(JUMPER_IS_AVX2)
SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); }
SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
SI U64 gather(const uint64_t* p, U32 ix) {
@@ -401,7 +416,7 @@
}
}
-#elif defined(__SSE2__)
+#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
#include <immintrin.h>
template <typename T> using V = T __attribute__((ext_vector_type(4)));
@@ -422,7 +437,7 @@
SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
SI U16 pack(U32 v) {
- #if defined(__SSE4_1__)
+ #if defined(JUMPER_IS_SSE41)
auto p = _mm_packus_epi32(v,v);
#else
// Sign extend so that _mm_packs_epi32() does the pack we want.
@@ -442,7 +457,7 @@
}
SI F floor_(F v) {
- #if defined(__SSE4_1__)
+ #if defined(JUMPER_IS_SSE41)
return _mm_floor_ps(v);
#else
F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
@@ -569,16 +584,16 @@
// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
-#if defined(JUMPER)
- SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
- SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }
- SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
- SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
-#else
+#if defined(JUMPER_IS_SCALAR)
SI F cast (U32 v) { return (F)v; }
SI U32 trunc_(F v) { return (U32)v; }
SI U32 expand(U16 v) { return (U32)v; }
SI U32 expand(U8 v) { return (U32)v; }
+#else
+ SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
+ SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }
+ SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
+ SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
#endif
template <typename V>
@@ -587,7 +602,7 @@ SI V if_then_else(I32 c, V t, V e) {
}
SI U16 bswap(U16 x) {
-#if defined(JUMPER) && defined(__SSE2__) && !defined(__AVX__)
+#if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
// Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
// when generating code for SSE2 and SSE4.1. We'll do it manually...
auto v = widen_cast<__m128i>(x);
@@ -625,10 +640,10 @@ SI F approx_powf(F x, F y) {
}
SI F from_half(U16 h) {
-#if defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+#if defined(JUMPER_IS_NEON)
return vcvt_f32_f16(h);
-#elif defined(JUMPER) && defined(__AVX2__)
+#elif defined(JUMPER_IS_AVX2)
return _mm256_cvtph_ps(h);
#else
@@ -645,10 +660,10 @@ SI F from_half(U16 h) {
}
SI U16 to_half(F f) {
-#if defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+#if defined(JUMPER_IS_NEON)
return vcvt_f16_f32(f);
-#elif defined(JUMPER) && defined(__AVX2__)
+#elif defined(JUMPER_IS_AVX2)
return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
#else
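The new JUMPER_IS_* cascade centralizes the instruction-set decision on compiler-provided macros, so the rest of this file and the stages can test one name instead of pairing JUMPER with __AVX__, __SSE2__, and so on; JUMPER_IS_OFFLINE is left for offline-only concerns such as forced inlining, symbol naming, and the must-be-pre-compiled check. A sketch of how downstream code can key on the result (the cascade is copied from this hunk; kLanes is a hypothetical constant, matching the 8-wide AVX and 4-wide SSE/NEON types defined below it):

    #if !defined(__clang__)
        #define JUMPER_IS_SCALAR
    #elif defined(__aarch64__) || defined(__ARM_VFPV4__)
        #define JUMPER_IS_NEON
    #elif defined(__AVX2__)
        #define JUMPER_IS_AVX2
    #elif defined(__AVX__)
        #define JUMPER_IS_AVX
    #elif defined(__SSE4_1__)
        #define JUMPER_IS_SSE41
    #elif defined(__SSE2__)
        #define JUMPER_IS_SSE2
    #else
        #define JUMPER_IS_SCALAR
    #endif

    #if defined(JUMPER_IS_SCALAR)
        static const int kLanes = 1;   // F is a plain float.
    #elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_AVX2)
        static const int kLanes = 8;   // 8-wide ext_vector_type, __m256 under the hood.
    #else
        static const int kLanes = 4;   // NEON and SSE paths are 4-wide.
    #endif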
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index a6d637552d..517b049d0d 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -28,7 +28,7 @@ generated_win = sys.argv[7] if len(sys.argv) > 7 else generated_win
clang = [ccache, clang, '-x', 'c++']
-cflags = ['-std=c++11', '-Os', '-DJUMPER',
+cflags = ['-std=c++11', '-Os', '-DJUMPER_IS_OFFLINE',
'-momit-leaf-frame-pointer', '-ffp-contract=fast',
'-fno-exceptions', '-fno-rtti', '-fno-unwind-tables']