Diffstat (limited to 'src/jumper')
-rw-r--r--  src/jumper/SkJumper.h               |  6
-rw-r--r--  src/jumper/SkJumper_misc.h          |  2
-rw-r--r--  src/jumper/SkJumper_stages.cpp      | 36
-rw-r--r--  src/jumper/SkJumper_stages_8bit.cpp |  2
-rw-r--r--  src/jumper/SkJumper_vectors.h       | 57
-rwxr-xr-x  src/jumper/build_stages.py          |  2
6 files changed, 60 insertions, 45 deletions
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
index d4e8ef4f37..20b8d32aba 100644
--- a/src/jumper/SkJumper.h
+++ b/src/jumper/SkJumper.h
@@ -13,15 +13,15 @@
 // Keep it simple!

 // Externally facing functions (start_pipeline) are called a little specially on Windows.
-#if defined(JUMPER) && defined(WIN) && defined(__x86_64__)
+#if defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__x86_64__)
     #define MAYBE_MSABI __attribute__((ms_abi))                   // Use MS' ABI, not System V.
-#elif defined(JUMPER) && defined(WIN) && defined(__i386__)
+#elif defined(JUMPER_IS_OFFLINE) && defined(WIN) && defined(__i386__)
     #define MAYBE_MSABI __attribute__((force_align_arg_pointer))  // Re-align stack 4 -> 16 bytes.
 #else
     #define MAYBE_MSABI
 #endif

-#if defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+#if defined(JUMPER_IS_OFFLINE) && (defined(__aarch64__) || defined(__arm__))
     // To reduce SkJumper's dependency on the Android NDK,
     // we provide what we need from <string.h>, <stdint.h>, and <stddef.h> ourselves.
     #define memcpy __builtin_memcpy

diff --git a/src/jumper/SkJumper_misc.h b/src/jumper/SkJumper_misc.h
index 8d7bfeb833..09758fba7d 100644
--- a/src/jumper/SkJumper_misc.h
+++ b/src/jumper/SkJumper_misc.h
@@ -13,7 +13,7 @@
 // Miscellany used by SkJumper_stages.cpp and SkJumper_vectors.h.
 // Every function in this file should be marked static and inline using SI.

-#if defined(JUMPER)
+#if defined(JUMPER_IS_OFFLINE)
     #define SI __attribute__((always_inline)) static inline
 #else
     #define SI static inline
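The MAYBE_MSABI shim above exists because the offline-compiled stages default to Clang's System V calling convention, while Windows callers expect Microsoft's. A minimal self-contained sketch of the same pattern, outside Skia: the _WIN64 guard stands in for Skia's WIN define (set by build_stages.py), and scale_floats is a made-up entry point for illustration.

    // Sketch only: an exported entry point that MSVC-built callers can invoke
    // even though this translation unit defaults to the System V ABI.
    #if defined(_WIN64)
        #define MAYBE_MSABI __attribute__((ms_abi))
    #else
        #define MAYBE_MSABI
    #endif

    MAYBE_MSABI extern "C" void scale_floats(float* dst, const float* src, int n) {
        for (int i = 0; i < n; i++) { dst[i] = src[i] * 2.0f; }
    }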
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 29651cab7d..593a88c4e0 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -13,16 +13,15 @@
 static const size_t kStride = sizeof(F) / sizeof(float);

 // A reminder:
-// Code guarded by defined(JUMPER) can assume that it will be compiled by Clang
-// and that F, I32, etc. are kStride-deep ext_vector_types of the appropriate type.
-// Otherwise, F, I32, etc. just alias the basic scalar types (and so kStride == 1).
+// When defined(JUMPER_IS_SCALAR), F, I32, etc. are normal scalar types and kStride is 1.
+// When not, F, I32, etc. are kStride-deep Clang ext_vector_type vectors of the appropriate type.

 // You can use most constants in this file, but in a few rare exceptions we read from this struct.
 using K = const SkJumper_constants;

 // A little wrapper macro to name Stages differently depending on the instruction set.
 // That lets us link together several options.
-#if !defined(JUMPER)
+#if !defined(JUMPER_IS_OFFLINE)
     #define WRAP(name) sk_##name
 #elif defined(__aarch64__)
     #define WRAP(name) sk_##name##_aarch64
@@ -60,7 +59,7 @@ using K = const SkJumper_constants;
     using Stage = void(K* k, void** program, size_t x, size_t y, size_t tail, F,F,F,F, F,F,F,F);
 #endif

-#if defined(JUMPER) && defined(__AVX__)
+#if defined(JUMPER_IS_AVX) || defined(JUMPER_IS_AVX2)
     // We really want to make sure all paths go through this function's (implicit) vzeroupper.
     // If they don't, we'll experience severe slowdowns when we first use SSE instructions again.
     __attribute__((disable_tail_calls))
@@ -68,10 +67,10 @@ using K = const SkJumper_constants;
 MAYBE_MSABI
 extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t ylimit,
                                      void** program, K* k) {
-#if defined(JUMPER)
-    F v;
+#if defined(JUMPER_IS_OFFLINE)
+    F v;    // Really no need to initialize.
 #else
-    F v{};
+    F v{};  // Compilers tend to whine about this, so it's easiest to just zero.
 #endif
     auto start = (Stage*)load_and_inc(program);
     const size_t x0 = x;
@@ -145,7 +144,7 @@ extern "C" void WRAP(start_pipeline)(size_t x, size_t y, size_t xlimit, size_t y
 template <typename V, typename T>
 SI V load(const T* src, size_t tail) {
-#if defined(JUMPER)
+#if !defined(JUMPER_IS_SCALAR)
     __builtin_assume(tail < kStride);
     if (__builtin_expect(tail, 0)) {
         V v{};  // Any inactive lanes are zeroed.
@@ -166,7 +165,7 @@ SI V load(const T* src, size_t tail) {
 template <typename V, typename T>
 SI void store(T* dst, V v, size_t tail) {
-#if defined(JUMPER)
+#if !defined(JUMPER_IS_SCALAR)
     __builtin_assume(tail < kStride);
     if (__builtin_expect(tail, 0)) {
         switch (tail) {
@@ -609,10 +608,11 @@ STAGE(from_srgb_dst) {
 STAGE(to_srgb) {
     auto fn = [&](F l) {
         // We tweak c and d for each instruction set to make sure fn(1) is exactly 1.
-    #if defined(JUMPER) && defined(__SSE2__)
+    #if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || \
+        defined(JUMPER_IS_AVX ) || defined(JUMPER_IS_AVX2 )
         const float c = 1.130048394203f,
                     d = 0.141357362270f;
-    #elif defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+    #elif defined(JUMPER_IS_NEON)
         const float c = 1.129999995232f,
                     d = 0.141381442547f;
     #else
@@ -1179,7 +1179,7 @@ STAGE(matrix_perspective) {
 SI void gradient_lookup(const SkJumper_GradientCtx* c, U32 idx, F t,
                         F* r, F* g, F* b, F* a) {
     F fr, br, fg, bg, fb, bb, fa, ba;
-#if defined(JUMPER) && defined(__AVX2__)
+#if defined(JUMPER_IS_AVX2)
     if (c->stopCount <= 8) {
         fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx);
         br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx);
@@ -1344,14 +1344,14 @@ STAGE(mask_2pt_conical_degenerates) {
     U32 mask = 0xffffffff;

     // TODO: mtklein kindly volunteered to revisit this at some point.
-#if defined(JUMPER)
-    // Vector comparisons set all bits, so we can use something like this.
-    mask = mask & (mad(r, c->fDR, c->fR0) >= 0);  // R(t) >= 0
-    mask = mask & (r == r);                       // t != NaN
-#else
+#if defined(JUMPER_IS_SCALAR)
     // The portable version is more involved, 'cause we only get one bit back.
     mask = mask & if_then_else(mad(r, c->fDR, c->fR0) >= 0, U32(0xffffffff), U32(0));  // R(t) >= 0
     mask = mask & if_then_else(r == r,                      U32(0xffffffff), U32(0));  // t != NaN
+#else
+    // Vector comparisons set all bits, so we can use something like this.
+    mask = mask & (mad(r, c->fDR, c->fR0) >= 0);  // R(t) >= 0
+    mask = mask & (r == r);                       // t != NaN
 #endif

     unaligned_store(&c->fMask, mask);

diff --git a/src/jumper/SkJumper_stages_8bit.cpp b/src/jumper/SkJumper_stages_8bit.cpp
index 3e119010e4..5c73ea8cbe 100644
--- a/src/jumper/SkJumper_stages_8bit.cpp
+++ b/src/jumper/SkJumper_stages_8bit.cpp
@@ -16,7 +16,7 @@
 // pixels.  This is the natural format for kN32_SkColorType buffers, and we
 // hope the stages in this file can replace many custom legacy routines.

-#if !defined(JUMPER)
+#if !defined(JUMPER_IS_OFFLINE)
     #error "This file must be pre-compiled."
 #elif defined(__aarch64__)
     #define WRAP(name) sk_##name##_aarch64_8bit
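The tail parameter in load()/store() above is how the pipeline handles rows whose width is not a multiple of kStride: tail == 0 means a full vector's worth of pixels, while 1..kStride-1 means only that many lanes are live. A self-contained sketch of the same pattern, assuming Clang and a fixed 4-wide vector; the names Vec4 and load4 are illustrative, not Skia's.

    #include <cstddef>
    #include <cstring>

    using Vec4 = float __attribute__((ext_vector_type(4)));  // Clang vector extension.

    static inline Vec4 load4(const float* src, size_t tail) {
        if (tail) {                         // Partial load: 1..3 live lanes.
            Vec4 v{};                       // Inactive lanes stay zeroed.
            switch (tail) {
                case 3: v[2] = src[2];      // fall through
                case 2: v[1] = src[1];      // fall through
                case 1: v[0] = src[0];
            }
            return v;
        }
        Vec4 v;
        memcpy(&v, src, sizeof(v));         // Full (possibly unaligned) load.
        return v;
    }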
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index dabf3efefb..544b208a04 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -15,9 +15,24 @@
 // Every function in this file should be marked static and inline using SI (see SkJumper_misc.h).

-#if !defined(JUMPER)
-    // This path should lead to portable code that can be compiled directly into Skia.
-    // (All other paths are compiled offline by Clang into SkJumper_generated.S.)
+#if !defined(__clang__)
+    #define JUMPER_IS_SCALAR
+#elif defined(__aarch64__) || defined(__ARM_VFPV4__)
+    #define JUMPER_IS_NEON
+#elif defined(__AVX2__)
+    #define JUMPER_IS_AVX2
+#elif defined(__AVX__)
+    #define JUMPER_IS_AVX
+#elif defined(__SSE4_1__)
+    #define JUMPER_IS_SSE41
+#elif defined(__SSE2__)
+    #define JUMPER_IS_SSE2
+#else
+    #define JUMPER_IS_SCALAR
+#endif
+
+#if defined(JUMPER_IS_SCALAR)
+    // This path should lead to portable scalar code.
     #include <math.h>

     using F   = float   ;
@@ -75,7 +90,7 @@
         ptr[3] = a;
     }

-#elif defined(__aarch64__) || defined(__arm__)
+#elif defined(JUMPER_IS_NEON)
     #include <arm_neon.h>

     // Since we know we're using Clang, we can use its vector extensions.
@@ -187,7 +202,7 @@
         }
     }

-#elif defined(__AVX__)
+#elif defined(JUMPER_IS_AVX) || defined(JUMPER_IS_AVX2)
     #include <immintrin.h>

     // These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -200,7 +215,7 @@
     using U8  = V<uint8_t >;

     SI F mad(F f, F m, F a)  {
-    #if defined(__FMA__)
+    #if defined(JUMPER_IS_AVX2)
         return _mm256_fmadd_ps(f,m,a);
     #else
         return f*m+a;
@@ -232,7 +247,7 @@
         return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
                  p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
     }
-    #if defined(__AVX2__)
+    #if defined(JUMPER_IS_AVX2)
         SI F   gather(const float*    p, U32 ix) { return _mm256_i32gather_ps   (p, ix, 4); }
         SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); }
         SI U64 gather(const uint64_t* p, U32 ix) {
@@ -401,7 +416,7 @@
         }
     }

-#elif defined(__SSE2__)
+#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
     #include <immintrin.h>

     template <typename T> using V = T __attribute__((ext_vector_type(4)));
@@ -422,7 +437,7 @@
     SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }

     SI U16 pack(U32 v) {
-    #if defined(__SSE4_1__)
+    #if defined(JUMPER_IS_SSE41)
         auto p = _mm_packus_epi32(v,v);
     #else
         // Sign extend so that _mm_packs_epi32() does the pack we want.
@@ -442,7 +457,7 @@
     }

     SI F floor_(F v) {
-    #if defined(__SSE4_1__)
+    #if defined(JUMPER_IS_SSE41)
         return _mm_floor_ps(v);
     #else
         F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
@@ -569,16 +584,16 @@
 // We need to be careful with casts.
 // (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
 // These named casts and bit_cast() are always what they seem to be.
-#if defined(JUMPER)
-    SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
-    SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
-    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
-    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
-#else
+#if defined(JUMPER_IS_SCALAR)
     SI F   cast  (U32 v) { return   (F)v; }
     SI U32 trunc_(F   v) { return (U32)v; }
     SI U32 expand(U16 v) { return (U32)v; }
     SI U32 expand(U8  v) { return (U32)v; }
+#else
+    SI F   cast  (U32 v) { return      __builtin_convertvector((I32)v,   F); }
+    SI U32 trunc_(F   v) { return (U32)__builtin_convertvector(     v, I32); }
+    SI U32 expand(U16 v) { return      __builtin_convertvector(     v, U32); }
+    SI U32 expand(U8  v) { return      __builtin_convertvector(     v, U32); }
 #endif

 template <typename V>
@@ -587,7 +602,7 @@ SI V if_then_else(I32 c, V t, V e) {
 }

 SI U16 bswap(U16 x) {
-#if defined(JUMPER) && defined(__SSE2__) && !defined(__AVX__)
+#if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41)
     // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
     // when generating code for SSE2 and SSE4.1.  We'll do it manually...
     auto v = widen_cast<__m128i>(x);
@@ -625,10 +640,10 @@ SI F approx_powf(F x, F y) {
 }

 SI F from_half(U16 h) {
-#if defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+#if defined(JUMPER_IS_NEON)
     return vcvt_f32_f16(h);

-#elif defined(JUMPER) && defined(__AVX2__)
+#elif defined(JUMPER_IS_AVX2)
     return _mm256_cvtph_ps(h);

 #else
@@ -645,10 +660,10 @@ SI F from_half(U16 h) {
 }

 SI U16 to_half(F f) {
-#if defined(JUMPER) && (defined(__aarch64__) || defined(__arm__))
+#if defined(JUMPER_IS_NEON)
     return vcvt_f16_f32(f);

-#elif defined(JUMPER) && defined(__AVX2__)
+#elif defined(JUMPER_IS_AVX2)
     return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

 #else
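The #else branches of from_half()/to_half(), elided above, fall back to doing the conversion with integer bit manipulation. For reference, a scalar half-to-float decode in that spirit; this is a sketch, not Skia's exact portable code, and it ignores zeros, denormals, infinities, and NaNs for brevity.

    #include <cstdint>
    #include <cstring>

    static inline float half_to_float(uint16_t h) {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;   // Sign moves to bit 31.
        uint32_t em   = (uint32_t)(h & 0x7fff) << 13;   // Exponent+mantissa into f32 position.
        uint32_t bits = sign | (em + (112u << 23));     // Rebias exponent: 15 -> 127.
        float f;
        memcpy(&f, &bits, sizeof(f));                   // Bit cast, the portable way.
        return f;
    }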
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
index a6d637552d..517b049d0d 100755
--- a/src/jumper/build_stages.py
+++ b/src/jumper/build_stages.py
@@ -28,7 +28,7 @@ generated_win = sys.argv[7] if len(sys.argv) > 7 else generated_win

 clang = [ccache, clang, '-x', 'c++']
-cflags = ['-std=c++11', '-Os', '-DJUMPER',
+cflags = ['-std=c++11', '-Os', '-DJUMPER_IS_OFFLINE',
           '-momit-leaf-frame-pointer', '-ffp-contract=fast',
           '-fno-exceptions', '-fno-rtti', '-fno-unwind-tables']
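Because build_stages.py always drives the offline compile through Clang, everything behind -DJUMPER_IS_OFFLINE is free to use Clang-only vector extensions, which the JUMPER_IS_SCALAR fallback must avoid. A tiny self-contained illustration of that dependency, compiled with something like clang -std=c++11 -Os; the type aliases mirror SkJumper_vectors.h but are reproduced here for illustration.

    #include <cstdint>

    using F   = float    __attribute__((ext_vector_type(4)));  // Clang-only.
    using I32 = int32_t  __attribute__((ext_vector_type(4)));
    using U32 = uint32_t __attribute__((ext_vector_type(4)));

    // Lane-wise uint -> float, via a signed intermediate as SkJumper's cast() does.
    static inline F cast(U32 v) { return __builtin_convertvector((I32)v, F); }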