author     Mike Klein <mtklein@chromium.org>              2017-04-03 13:54:55 -0400
committer  Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-04-03 19:51:42 +0000
commit     b9c4a6fc7de252633f16d11c2df10ee6de16af03 (patch)
tree       0d80a89f09e32703144517d6ccdcd0c71ddc2ea0 /src
parent     dbcb607f3c6eb74c8c13fad75d4bcb4289a0d9ba (diff)
Refactor and recomment SkJumper_stages.cpp.
SkJumper_stages.cpp is starting to get unwieldy. This spins some logical parts out into their own headers. I will follow up by moving more of the very specific f16/f32 load/store logic into SkJumper_vectors.h too.

Change-Id: I2a3a055e9d1b65f56983d05649270772a4c69f31
Reviewed-on: https://skia-review.googlesource.com/11133
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src')
-rw-r--r--  src/jumper/SkJumper_misc.h      49
-rw-r--r--  src/jumper/SkJumper_stages.cpp  519
-rw-r--r--  src/jumper/SkJumper_vectors.h   211
3 files changed, 414 insertions(+), 365 deletions(-)
diff --git a/src/jumper/SkJumper_misc.h b/src/jumper/SkJumper_misc.h
new file mode 100644
index 0000000000..96035bd084
--- /dev/null
+++ b/src/jumper/SkJumper_misc.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkJumper_misc_DEFINED
+#define SkJumper_misc_DEFINED
+
+#include "SkJumper.h" // for memcpy()
+
+// Miscellany used by SkJumper_stages.cpp and SkJumper_vectors.h.
+
+// Every function in this file should be marked static and inline using SI.
+#define SI static inline
+
+template <typename T, typename P>
+SI T unaligned_load(const P* p) { // const void* would work too, but const P* helps ARMv7 codegen.
+ T v;
+ memcpy(&v, p, sizeof(v));
+ return v;
+}
+
+template <typename Dst, typename Src>
+SI Dst bit_cast(const Src& src) {
+ static_assert(sizeof(Dst) == sizeof(Src), "");
+ return unaligned_load<Dst>(&src);
+}
+
+// A couple functions for embedding constants directly into code,
+// so that no .const or .literal4 section is created.
+SI int C(int x) {
+#if defined(JUMPER) && defined(__x86_64__)
+ // Move x-the-compile-time-constant as a literal into x-the-register.
+ asm("mov %1, %0" : "=r"(x) : "i"(x));
+#endif
+ return x;
+}
+SI float C(float f) {
+ int x = C(unaligned_load<int>(&f));
+ return unaligned_load<float>(&x);
+}
+
+// Syntax sugar to make C() easy to use for constant literals.
+SI int operator "" _i(unsigned long long int i) { return C( (int)i); }
+SI float operator "" _f( long double f) { return C((float)f); }
+
+#endif//SkJumper_misc_DEFINED
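A quick illustration of how these helpers combine (a hypothetical sketch, not code from this patch; half_of() and float_bits() are made-up names):

    SI float half_of(float x) {
        // 0.5_f routes the constant through C(), so Clang won't emit a
        // .const/.literal4 section for it.
        return x * 0.5_f;
    }
    SI uint32_t float_bits(float x) {
        // bit_cast() reinterprets the bytes; source and destination sizes must match.
        return bit_cast<uint32_t>(x);
    }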
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 455184f817..e5781f1064 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -6,254 +6,178 @@
*/
#include "SkJumper.h"
+#include "SkJumper_misc.h" // SI, unaligned_load(), bit_cast(), C(), operator"" _i and _f.
+#include "SkJumper_vectors.h" // F, I32, U32, U16, U8, cast(), expand()
-#define SI static inline
+// Our fundamental vector depth is our pixel stride.
+static const size_t kStride = sizeof(F) / sizeof(float);
-template <typename T, typename P>
-SI T unaligned_load(const P* p) {
- T v;
- memcpy(&v, p, sizeof(v));
- return v;
-}
+// A reminder:
+// Code guarded by defined(JUMPER) can assume that it will be compiled by Clang
+// and that F, I32, etc. are kStride-deep ext_vector_types of the appropriate type.
+// Otherwise, F, I32, etc. just alias the basic scalar types (and so kStride == 1).
-template <typename Dst, typename Src>
-SI Dst bit_cast(const Src& src) {
- static_assert(sizeof(Dst) == sizeof(Src), "");
- return unaligned_load<Dst>(&src);
-}
+// Another reminder:
+// You can't generally use constants in this file except via C() or operator"" _i/_f.
+// Not all constants can be generated using C() or _i/_f. Stages read the rest from this struct.
+using K = const SkJumper_constants;
-// A couple functions for embedding constants directly into code,
-// so that no .const or .literal4 section is created.
-SI int C(int x) {
-#if defined(JUMPER) && defined(__x86_64__)
- // Move x-the-compile-time-constant as a literal into x-the-register.
- asm("mov %1, %0" : "=r"(x) : "i"(x));
+// Let's start first with the mechanisms we use to build Stages.
+
+// Our program is an array of void*, either
+// - 1 void* per stage with no context pointer, the next stage;
+// - 2 void* per stage with a context pointer, first the context pointer, then the next stage.
+
+// load_and_inc() steps the program forward by 1 void*, returning that pointer.
+SI void* load_and_inc(void**& program) {
+#if defined(__GNUC__) && defined(__x86_64__)
+ // If program is in %rsi (we try to make this likely) then this is a single instruction.
+ void* rax;
+ asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi.
+ return rax;
+#else
+ // On ARM *program++ compiles into pretty ideal code without any handholding.
+ return *program++;
#endif
- return x;
-}
-SI float C(float f) {
- int x = C(unaligned_load<int>(&f));
- return unaligned_load<float>(&x);
}
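For orientation, here is a hypothetical sketch (not part of this patch) of the array load_and_inc() walks for a short three-stage pipeline; the stage choices are placeholders:

    // void* program[] = {
    //     (void*)WRAP(seed_shader),   // entry stage; it reads a context pointer...
    //     (void*)&y,                  // ...so that context comes next
    //     (void*)WRAP(clamp_0),       // a stage with no context pointer
    //     (void*)WRAP(just_return),   // no-op stage that ends the chain
    // };
    //
    // start_pipeline() pops the entry stage, then each stage pops its optional
    // context (via LazyCtx) and the next-stage pointer with load_and_inc().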
-SI int operator "" _i(unsigned long long int i) { return C( (int)i); }
-SI float operator "" _f( long double f) { return C((float)f); }
-// Not all constants can be generated using C() or _i/_f. We read the rest from this struct.
-using K = const SkJumper_constants;
+// LazyCtx doesn't do anything unless you call operator T*() or load(), encapsulating the
+// logic from above that stages without a context pointer are represented by just 1 void*.
+struct LazyCtx {
+ void* ptr;
+ void**& program;
-#if !defined(JUMPER)
- // This path should lead to portable code that can be compiled directly into Skia.
- // (All other paths are compiled offline by Clang into SkJumper_generated.h.)
- #include <math.h>
-
- using F = float;
- using I32 = int32_t;
- using U32 = uint32_t;
- using U16 = uint16_t;
- using U8 = uint8_t;
-
- SI F mad(F f, F m, F a) { return f*m+a; }
- SI F min(F a, F b) { return fminf(a,b); }
- SI F max(F a, F b) { return fmaxf(a,b); }
- SI F abs_ (F v) { return fabsf(v); }
- SI F floor_(F v) { return floorf(v); }
- SI F rcp (F v) { return 1.0f / v; }
- SI F rsqrt (F v) { return 1.0f / sqrtf(v); }
- SI U32 round (F v, F scale) { return (uint32_t)lrintf(v*scale); }
- SI U16 pack(U32 v) { return (U16)v; }
- SI U8 pack(U16 v) { return (U8)v; }
-
- SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
-
- SI F gather(const float* p, U32 ix) { return p[ix]; }
+ explicit LazyCtx(void**& p) : ptr(nullptr), program(p) {}
- #define WRAP(name) sk_##name
+ template <typename T>
+ operator T*() {
+ if (!ptr) { ptr = load_and_inc(program); }
+ return (T*)ptr;
+ }
-#elif defined(__aarch64__)
- #include <arm_neon.h>
-
- // Since we know we're using Clang, we can use its vector extensions.
- using F = float __attribute__((ext_vector_type(4)));
- using I32 = int32_t __attribute__((ext_vector_type(4)));
- using U32 = uint32_t __attribute__((ext_vector_type(4)));
- using U16 = uint16_t __attribute__((ext_vector_type(4)));
- using U8 = uint8_t __attribute__((ext_vector_type(4)));
-
- // We polyfill a few routines that Clang doesn't build into ext_vector_types.
- SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
- SI F min(F a, F b) { return vminq_f32(a,b); }
- SI F max(F a, F b) { return vmaxq_f32(a,b); }
- SI F abs_ (F v) { return vabsq_f32(v); }
- SI F floor_(F v) { return vrndmq_f32(v); }
- SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
- SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
- SI U32 round (F v, F scale) { return vcvtnq_u32_f32(v*scale); }
- SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
- SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
-
- SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
-
- SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
+ template <typename T>
+ T load() {
+ if (!ptr) { ptr = load_and_inc(program); }
+ return unaligned_load<T>(ptr);
+ }
+};
+// A little wrapper macro to name Stages differently depending on the instruction set.
+// That lets us link together several options.
+#if !defined(JUMPER)
+ #define WRAP(name) sk_##name
+#elif defined(__aarch64__)
#define WRAP(name) sk_##name##_aarch64
-
#elif defined(__arm__)
- #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
- #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
- #endif
- #include <arm_neon.h>
-
- // We can pass {s0-s15} as arguments under AAPCS-VFP. We'll slice that as 8 d-registers.
- using F = float __attribute__((ext_vector_type(2)));
- using I32 = int32_t __attribute__((ext_vector_type(2)));
- using U32 = uint32_t __attribute__((ext_vector_type(2)));
- using U16 = uint16_t __attribute__((ext_vector_type(2)));
- using U8 = uint8_t __attribute__((ext_vector_type(2)));
-
- SI F mad(F f, F m, F a) { return vfma_f32(a,f,m); }
- SI F min(F a, F b) { return vmin_f32(a,b); }
- SI F max(F a, F b) { return vmax_f32(a,b); }
- SI F abs_ (F v) { return vabs_f32(v); }
- SI F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
- SI F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
- SI U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }
- SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
- SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
-
- SI F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
-
- SI F floor_(F v) {
- F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
- return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
- }
-
- SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
-
#define WRAP(name) sk_##name##_vfp4
-
+#elif defined(__AVX2__)
+ #define WRAP(name) sk_##name##_hsw
#elif defined(__AVX__)
- #include <immintrin.h>
-
- // These are __m256 and __m256i, but friendlier and strongly-typed.
- using F = float __attribute__((ext_vector_type(8)));
- using I32 = int32_t __attribute__((ext_vector_type(8)));
- using U32 = uint32_t __attribute__((ext_vector_type(8)));
- using U16 = uint16_t __attribute__((ext_vector_type(8)));
- using U8 = uint8_t __attribute__((ext_vector_type(8)));
-
- SI F mad(F f, F m, F a) {
- #if defined(__FMA__)
- return _mm256_fmadd_ps(f,m,a);
- #else
- return f*m+a;
- #endif
- }
+ #define WRAP(name) sk_##name##_avx
+#elif defined(__SSE4_1__)
+ #define WRAP(name) sk_##name##_sse41
+#elif defined(__SSE2__)
+ #define WRAP(name) sk_##name##_sse2
+#endif
- SI F min(F a, F b) { return _mm256_min_ps(a,b); }
- SI F max(F a, F b) { return _mm256_max_ps(a,b); }
- SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
- SI F floor_(F v) { return _mm256_floor_ps(v); }
- SI F rcp (F v) { return _mm256_rcp_ps (v); }
- SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); }
- SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
-
- SI U16 pack(U32 v) {
- return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
- _mm256_extractf128_si256(v, 1));
- }
- SI U8 pack(U16 v) {
- auto r = _mm_packus_epi16(v,v);
- return unaligned_load<U8>(&r);
- }
+// We're finally going to get to what a Stage function looks like!
+// It's best to jump down to the #else case first, then to come back up here for AVX.
- SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+#if defined(JUMPER) && defined(__AVX__)
+ // There's a big cost to switch between SSE and AVX, so we do a little
+ // extra work to handle even the jagged <kStride tail in AVX mode.
+ // Compared to normal stages, we maintain an extra tail register:
+ // tail == 0 ~~> work on a full kStride pixels
+ // tail != 0 ~~> work on only the first tail pixels
+ // tail is always < kStride.
+ using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);
- SI F gather(const float* p, U32 ix) {
- #if defined(__AVX2__)
- return _mm256_i32gather_ps(p, ix, 4);
- #else
- return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
- p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
+ #if defined(JUMPER) && defined(WIN)
+ __attribute__((ms_abi))
#endif
+ extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
+ F v{};
+ auto start = (Stage*)load_and_inc(program);
+ while (x + kStride <= limit) {
+ start(x,program,k,0, v,v,v,v, v,v,v,v);
+ x += kStride;
+ }
+ if (size_t tail = limit - x) {
+ start(x,program,k,tail, v,v,v,v, v,v,v,v);
+ }
+ return limit;
}
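A hypothetical trace of that loop, assuming kStride == 8 (AVX) and limit == 20:

    //   start(x= 0, program, k, tail=0, ...)   // pixels  0..7,  full stride
    //   start(x= 8, program, k, tail=0, ...)   // pixels  8..15, full stride
    //   start(x=16, program, k, tail=4, ...)   // pixels 16..19, jagged tail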
- #if defined(__AVX2__) && defined(__F16C__) && defined(__FMA__)
- #define WRAP(name) sk_##name##_hsw
- #else
- #define WRAP(name) sk_##name##_avx
- #endif
+ #define STAGE(name) \
+ SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+ extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail, \
+ F r, F g, F b, F a, F dr, F dg, F db, F da) { \
+ LazyCtx ctx(program); \
+ name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage*)load_and_inc(program); \
+ next(x,program,k,tail, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
-#elif defined(__SSE2__)
- #include <immintrin.h>
-
- using F = float __attribute__((ext_vector_type(4)));
- using I32 = int32_t __attribute__((ext_vector_type(4)));
- using U32 = uint32_t __attribute__((ext_vector_type(4)));
- using U16 = uint16_t __attribute__((ext_vector_type(4)));
- using U8 = uint8_t __attribute__((ext_vector_type(4)));
-
- SI F mad(F f, F m, F a) { return f*m+a; }
- SI F min(F a, F b) { return _mm_min_ps(a,b); }
- SI F max(F a, F b) { return _mm_max_ps(a,b); }
- SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
- SI F rcp (F v) { return _mm_rcp_ps (v); }
- SI F rsqrt(F v) { return _mm_rsqrt_ps(v); }
- SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
-
- SI U16 pack(U32 v) {
- #if defined(__SSE4_1__)
- auto p = _mm_packus_epi32(v,v);
- #else
- // Sign extend so that _mm_packs_epi32() does the pack we want.
- auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
- p = _mm_packs_epi32(p,p);
- #endif
- return unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
- }
- SI U8 pack(U16 v) {
- __m128i r;
- memcpy(&r, &v, sizeof(v));
- r = _mm_packus_epi16(r,r);
- return unaligned_load<U8>(&r);
- }
+#else
+ // Other instruction sets (SSE, NEON, portable) can fall back on narrower
+ // pipelines cheaply, which frees us to always assume tail==0.
- SI F if_then_else(I32 c, F t, F e) {
- return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
- }
+ // Stages tail call between each other by following program as described above.
+ // x is our induction variable, stepping forward kStride at a time.
+ using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
- SI F floor_(F v) {
- #if defined(__SSE4_1__)
- return _mm_floor_ps(v);
- #else
- F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
- return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
+ // On Windows, start_pipeline() has a normal Windows ABI, and then the rest is System V.
+ #if defined(JUMPER) && defined(WIN)
+ __attribute__((ms_abi))
#endif
+ extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
+ F v{};
+ auto start = (Stage*)load_and_inc(program);
+ while (x + kStride <= limit) {
+ start(x,program,k, v,v,v,v, v,v,v,v);
+ x += kStride;
+ }
+ return x;
}
- SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
-
- #if defined(__SSE4_1__)
- #define WRAP(name) sk_##name##_sse41
- #else
- #define WRAP(name) sk_##name##_sse2
- #endif
+ // This STAGE macro makes it easier to write stages, handling all the Stage chaining for you.
+ #define STAGE(name) \
+ SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+ extern "C" void WRAP(name)(size_t x, void** program, K* k, \
+ F r, F g, F b, F a, F dr, F dg, F db, F da) { \
+ LazyCtx ctx(program); \
+ name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage*)load_and_inc(program); \
+ next(x,program,k, r,g,b,a, dr,dg,db,da); \
+ } \
+ SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif
-static const size_t kStride = sizeof(F) / sizeof(float);
+// just_return() is a simple no-op stage that only exists to end the chain,
+// returning back up to start_pipeline(), and from there to the caller.
+extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
+
+
+// We could start defining normal Stages now. But first, some helper functions and types.
-// We need to be a careful with casts.
-// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
-// These named casts and bit_cast() are always what they seem to be.
+// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector.
#if defined(JUMPER)
- SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
- SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
- SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
+ using F4 = float __attribute__((ext_vector_type(4)));
#else
- SI F cast (U32 v) { return (F)v; }
- SI U32 expand(U16 v) { return (U32)v; }
- SI U32 expand(U8 v) { return (U32)v; }
+ struct F4 {
+ float vals[4];
+ float operator[](int i) const { return vals[i]; }
+ };
#endif
+// These load() and store() methods are tail-aware,
+// but focus mainly on keeping the at-stride tail==0 case fast.
+
template <typename V, typename T>
SI V load(const T* src, size_t tail) {
#if defined(JUMPER)
@@ -295,7 +219,9 @@ SI void store(T* dst, V v, size_t tail) {
memcpy(dst, &v, sizeof(v));
}
-#if 1 && defined(JUMPER) && defined(__AVX__)
+// This doesn't look strictly necessary, but without it Clang would generate load() using
+// compiler-generated constants that we can't support. This version doesn't need constants.
+#if defined(JUMPER) && defined(__AVX__)
template <>
inline U8 load(const uint8_t* src, size_t tail) {
if (__builtin_expect(tail, 0)) {
@@ -312,8 +238,11 @@ SI void store(T* dst, V v, size_t tail) {
}
#endif
-#if 1 && defined(JUMPER) && defined(__AVX2__)
+// AVX2 adds some mask loads and stores that make for shorter, faster code.
+#if defined(JUMPER) && defined(__AVX2__)
SI U32 mask(size_t tail) {
+ // We go a little out of our way to avoid needing large constant values here.
+
// It's easiest to build the mask as 8 8-bit values, either 0x00 or 0xff.
// Start fully on, then shift away lanes from the top until we've got our mask.
uint64_t mask = 0xffffffffffffffff >> 8*(kStride-tail);
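    // Worked example (kStride == 8): tail == 3 shifts away 5 bytes, leaving
    // mask == 0x0000000000ffffff -- the low three lanes on, the rest off.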
@@ -342,10 +271,6 @@ SI void store(T* dst, V v, size_t tail) {
#endif
-SI F lerp(F from, F to, F t) {
- return mad(to-from, t, from);
-}
-
SI void from_565(U16 _565, F* r, F* g, F* b) {
U32 wide = expand(_565);
*r = cast(wide & C(31<<11)) * C(1.0f / (31<<11));
@@ -360,150 +285,7 @@ SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
*a = cast(wide & C(15<< 0)) * C(1.0f / (15<< 0));
}
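A worked example of the extract-and-scale trick above (illustrative only):

    // For a pure-red 565 pixel, wide & C(31<<11) == 0xF800 and
    // 0xF800 * (1.0f/(31<<11)) == 1.0f; smaller values scale linearly,
    // so each channel lands in [0,1] without a separate shift.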
-// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector.
-#if defined(JUMPER)
- using F4 = float __attribute__((ext_vector_type(4)));
-#else
- struct F4 {
- float vals[4];
- float operator[](int i) const { return vals[i]; }
- };
-#endif
-
-SI void* load_and_inc(void**& program) {
-#if defined(__GNUC__) && defined(__x86_64__)
- // Passing program as the second Stage argument makes it likely that it's in %rsi,
- // so this is usually a single instruction *program++.
- void* rax;
- asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi.
- return rax;
- // When a Stage uses its ctx pointer, this optimization typically cuts an instruction:
- // mov (%rsi), %rcx // ctx = program[0]
- // ...
- // mov 0x8(%rsi), %rax // next = program[1]
- // add $0x10, %rsi // program += 2
- // jmpq *%rax // JUMP!
- // becomes
- // lods %ds:(%rsi),%rax // ctx = *program++;
- // ...
- // lods %ds:(%rsi),%rax // next = *program++;
- // jmpq *%rax // JUMP!
- //
- // When a Stage doesn't use its ctx pointer, it's 3 instructions either way,
- // but using lodsq (a 2-byte instruction) tends to trim a few bytes.
-#else
- // On ARM *program++ compiles into a single instruction without any handholding.
- return *program++;
-#endif
-}
-
-// Doesn't do anything unless you resolve it, either by casting to a pointer or calling load().
-// This makes it free in stages that have no context pointer to load (i.e. built with nullptr).
-struct LazyCtx {
- void* ptr;
- void**& program;
-
- explicit LazyCtx(void**& p) : ptr(nullptr), program(p) {}
-
- template <typename T>
- operator T*() {
- if (!ptr) { ptr = load_and_inc(program); }
- return (T*)ptr;
- }
-
- template <typename T>
- T load() {
- if (!ptr) { ptr = load_and_inc(program); }
- return unaligned_load<T>(ptr);
- }
-};
-
-#if defined(JUMPER) && defined(__AVX__)
- // There's a big cost to switch between SSE and AVX+, so we do a little
- // extra work to handle even the jagged <kStride tail in AVX+ mode.
- using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);
-
- #if defined(JUMPER) && defined(WIN)
- __attribute__((ms_abi))
- #endif
- extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
- F v{};
- auto start = (Stage*)load_and_inc(program);
- while (x + kStride <= limit) {
- start(x,program,k,0, v,v,v,v, v,v,v,v);
- x += kStride;
- }
- if (size_t tail = limit - x) {
- start(x,program,k,tail, v,v,v,v, v,v,v,v);
- }
- return limit;
- }
-
- #define STAGE(name) \
- SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
- extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail, \
- F r, F g, F b, F a, F dr, F dg, F db, F da) { \
- LazyCtx ctx(program); \
- name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da); \
- auto next = (Stage*)load_and_inc(program); \
- next(x,program,k,tail, r,g,b,a, dr,dg,db,da); \
- } \
- SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
-
-#else
- // Other instruction sets (SSE, NEON, portable) can fall back on narrower
- // pipelines cheaply, which frees us to always assume tail==0.
-
- // Stages tail call between each other by following program,
- // an interlaced sequence of Stage pointers and context pointers.
- using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
-
- #if defined(JUMPER) && defined(WIN)
- __attribute__((ms_abi))
- #endif
- extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
- F v{};
- auto start = (Stage*)load_and_inc(program);
- while (x + kStride <= limit) {
- start(x,program,k, v,v,v,v, v,v,v,v);
- x += kStride;
- }
- return x;
- }
-
- #define STAGE(name) \
- SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
- extern "C" void WRAP(name)(size_t x, void** program, K* k, \
- F r, F g, F b, F a, F dr, F dg, F db, F da) { \
- LazyCtx ctx(program); \
- name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da); \
- auto next = (Stage*)load_and_inc(program); \
- next(x,program,k, r,g,b,a, dr,dg,db,da); \
- } \
- SI void name##_k(size_t x, LazyCtx ctx, K* k, size_t tail, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
-#endif
-
-// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
-extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
-
-// We can now define Stages!
-
-// Some things to keep in mind while writing Stages:
-// - do not branch; (i.e. avoid jmp)
-// - do not call functions that don't inline; (i.e. avoid call, ret)
-// - do not use constant literals other than 0, ~0 and 0.0f. (i.e. avoid rip relative addressing)
-//
-// Some things that should work fine:
-// - 0, ~0, and 0.0f;
-// - arithmetic;
-// - functions of F and U32 that we've defined above;
-// - temporary values;
-// - lambdas;
-// - memcpy() with a compile-time constant size argument.
+// Now finally, normal Stages!
STAGE(seed_shader) {
auto y = *(const int*)ctx;
@@ -526,6 +308,7 @@ STAGE(constant_color) {
a = rgba[3];
}
+// Most blend modes apply the same logic to each channel.
#define BLEND_MODE(name) \
SI F name##_channel(F s, F d, F sa, F da); \
STAGE(name) { \
@@ -554,8 +337,9 @@ BLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; }
BLEND_MODE(plus_) { return s + d; }
BLEND_MODE(screen) { return s + d - s*d; }
BLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); }
-
#undef BLEND_MODE
+
+// Most other blend modes apply the same logic to colors, and srcover to alpha.
#define BLEND_MODE(name) \
SI F name##_channel(F s, F d, F sa, F da); \
STAGE(name) { \
@@ -605,6 +389,7 @@ BLEND_MODE(softlight) {
liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3?
return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)?
}
+#undef BLEND_MODE
STAGE(clamp_0) {
r = max(r, 0);
@@ -719,6 +504,10 @@ STAGE(scale_u8) {
a = a * c;
}
+SI F lerp(F from, F to, F t) {
+ return mad(to-from, t, from);
+}
+
STAGE(lerp_1_float) {
auto c = *(const float*)ctx;
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
new file mode 100644
index 0000000000..000f90cd04
--- /dev/null
+++ b/src/jumper/SkJumper_vectors.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkJumper_vectors_DEFINED
+#define SkJumper_vectors_DEFINED
+
+#include "SkJumper.h"
+#include "SkJumper_misc.h"
+
+// This file contains vector types that SkJumper_stages.cpp uses to define stages.
+
+// Every function in this file should be marked static and inline using SI (see SkJumper_misc.h).
+
+#if !defined(JUMPER)
+ // This path should lead to portable code that can be compiled directly into Skia.
+ // (All other paths are compiled offline by Clang into SkJumper_generated.S.)
+ #include <math.h>
+
+ using F = float;
+ using I32 = int32_t;
+ using U32 = uint32_t;
+ using U16 = uint16_t;
+ using U8 = uint8_t;
+
+ SI F mad(F f, F m, F a) { return f*m+a; }
+ SI F min(F a, F b) { return fminf(a,b); }
+ SI F max(F a, F b) { return fmaxf(a,b); }
+ SI F abs_ (F v) { return fabsf(v); }
+ SI F floor_(F v) { return floorf(v); }
+ SI F rcp (F v) { return 1.0f / v; }
+ SI F rsqrt (F v) { return 1.0f / sqrtf(v); }
+ SI U32 round (F v, F scale) { return (uint32_t)lrintf(v*scale); }
+ SI U16 pack(U32 v) { return (U16)v; }
+ SI U8 pack(U16 v) { return (U8)v; }
+
+ SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
+
+ SI F gather(const float* p, U32 ix) { return p[ix]; }
+
+#elif defined(__aarch64__)
+ #include <arm_neon.h>
+
+ // Since we know we're using Clang, we can use its vector extensions.
+ using F = float __attribute__((ext_vector_type(4)));
+ using I32 = int32_t __attribute__((ext_vector_type(4)));
+ using U32 = uint32_t __attribute__((ext_vector_type(4)));
+ using U16 = uint16_t __attribute__((ext_vector_type(4)));
+ using U8 = uint8_t __attribute__((ext_vector_type(4)));
+
+ // We polyfill a few routines that Clang doesn't build into ext_vector_types.
+ SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
+ SI F min(F a, F b) { return vminq_f32(a,b); }
+ SI F max(F a, F b) { return vmaxq_f32(a,b); }
+ SI F abs_ (F v) { return vabsq_f32(v); }
+ SI F floor_(F v) { return vrndmq_f32(v); }
+ SI F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
+ SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
+ SI U32 round (F v, F scale) { return vcvtnq_u32_f32(v*scale); }
+ SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
+ SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
+
+ SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
+
+ SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
+
+#elif defined(__arm__)
+ #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
+ #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
+ #endif
+ #include <arm_neon.h>
+
+ // We can pass {s0-s15} as arguments under AAPCS-VFP. We'll slice that as 8 d-registers.
+ using F = float __attribute__((ext_vector_type(2)));
+ using I32 = int32_t __attribute__((ext_vector_type(2)));
+ using U32 = uint32_t __attribute__((ext_vector_type(2)));
+ using U16 = uint16_t __attribute__((ext_vector_type(2)));
+ using U8 = uint8_t __attribute__((ext_vector_type(2)));
+
+ SI F mad(F f, F m, F a) { return vfma_f32(a,f,m); }
+ SI F min(F a, F b) { return vmin_f32(a,b); }
+ SI F max(F a, F b) { return vmax_f32(a,b); }
+ SI F abs_ (F v) { return vabs_f32(v); }
+ SI F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
+ SI F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
+ SI U32 round(F v, F scale) { return vcvt_u32_f32(mad(v,scale,0.5f)); }
+ SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
+ SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
+
+ SI F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
+
+ SI F floor_(F v) {
+ F roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
+ return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
+ }
+
+ SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
+
+#elif defined(__AVX__)
+ #include <immintrin.h>
+
+ // These are __m256 and __m256i, but friendlier and strongly-typed.
+ using F = float __attribute__((ext_vector_type(8)));
+ using I32 = int32_t __attribute__((ext_vector_type(8)));
+ using U32 = uint32_t __attribute__((ext_vector_type(8)));
+ using U16 = uint16_t __attribute__((ext_vector_type(8)));
+ using U8 = uint8_t __attribute__((ext_vector_type(8)));
+
+ SI F mad(F f, F m, F a) {
+ #if defined(__FMA__)
+ return _mm256_fmadd_ps(f,m,a);
+ #else
+ return f*m+a;
+ #endif
+ }
+
+ SI F min(F a, F b) { return _mm256_min_ps(a,b); }
+ SI F max(F a, F b) { return _mm256_max_ps(a,b); }
+ SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
+ SI F floor_(F v) { return _mm256_floor_ps(v); }
+ SI F rcp (F v) { return _mm256_rcp_ps (v); }
+ SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); }
+ SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+
+ SI U16 pack(U32 v) {
+ return _mm_packus_epi32(_mm256_extractf128_si256(v, 0),
+ _mm256_extractf128_si256(v, 1));
+ }
+ SI U8 pack(U16 v) {
+ auto r = _mm_packus_epi16(v,v);
+ return unaligned_load<U8>(&r);
+ }
+
+ SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+
+ SI F gather(const float* p, U32 ix) {
+ #if defined(__AVX2__)
+ return _mm256_i32gather_ps(p, ix, 4);
+ #else
+ return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
+ p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
+ #endif
+ }
+
+#elif defined(__SSE2__)
+ #include <immintrin.h>
+
+ using F = float __attribute__((ext_vector_type(4)));
+ using I32 = int32_t __attribute__((ext_vector_type(4)));
+ using U32 = uint32_t __attribute__((ext_vector_type(4)));
+ using U16 = uint16_t __attribute__((ext_vector_type(4)));
+ using U8 = uint8_t __attribute__((ext_vector_type(4)));
+
+ SI F mad(F f, F m, F a) { return f*m+a; }
+ SI F min(F a, F b) { return _mm_min_ps(a,b); }
+ SI F max(F a, F b) { return _mm_max_ps(a,b); }
+ SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
+ SI F rcp (F v) { return _mm_rcp_ps (v); }
+ SI F rsqrt(F v) { return _mm_rsqrt_ps(v); }
+ SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
+
+ SI U16 pack(U32 v) {
+ #if defined(__SSE4_1__)
+ auto p = _mm_packus_epi32(v,v);
+ #else
+ // Sign extend so that _mm_packs_epi32() does the pack we want.
+ auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16);
+ p = _mm_packs_epi32(p,p);
+ #endif
+ return unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
+ }
+ SI U8 pack(U16 v) {
+ __m128i r;
+ memcpy(&r, &v, sizeof(v));
+ r = _mm_packus_epi16(r,r);
+ return unaligned_load<U8>(&r);
+ }
+
+ SI F if_then_else(I32 c, F t, F e) {
+ return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
+ }
+
+ SI F floor_(F v) {
+ #if defined(__SSE4_1__)
+ return _mm_floor_ps(v);
+ #else
+ F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
+ return roundtrip - if_then_else(roundtrip > v, 1.0_f, 0);
+ #endif
+ }
+
+ SI F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
+#endif
+
+// We need to be a careful with casts.
+// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
+// These named casts and bit_cast() are always what they seem to be.
+#if defined(JUMPER)
+ SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
+ SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
+ SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
+#else
+ SI F cast (U32 v) { return (F)v; }
+ SI U32 expand(U16 v) { return (U32)v; }
+ SI U32 expand(U8 v) { return (U32)v; }
+#endif
+
+#endif//SkJumper_vectors_DEFINED
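For example (a hypothetical sketch using only the helpers above), this is the pattern stages use to turn an 8-bit channel into a float in [0,1] and back:

    SI F from_byte(U8 byte) {
        // expand() widens to 32-bit lanes, cast() converts to float, and the
        // multiply rescales 0..255 to 0..1.
        return cast(expand(byte)) * C(1/255.0f);
    }
    SI U8 to_byte(F v) {
        // round() rescales and converts to integer lanes; two pack()s narrow
        // 32-bit lanes down to 8-bit.
        return pack(pack(round(v, 255.0_f)));
    }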