author     Mike Klein <mtklein@chromium.org>    2017-03-01 13:07:40 -0500
committer  Mike Klein <mtklein@chromium.org>    2017-03-02 16:14:45 +0000
commit     c31858bcba3f6d9eb6b57ae03c15b266324a5c23 (patch)
tree       3148d117b2b6c883c6aefa8d399fea147a536df5 /src/jumper/SkJumper_stages.cpp
parent     f7cf81aefd28e5bfe74d40b4fc037df72f157f33 (diff)
SkJumper: handle the <kStride tail in AVX+ mode.
We have plenty of general purpose registers to spare on x86-64, so the
cheapest thing to do is use one to hold the usual 'tail'.

Speedups on HSW:
  SkRasterPipeline_srgb: 292 -> 170
  SkRasterPipeline_f16:  122 ->  90

There's plenty more room to improve here, e.g. using mask loads and
stores, but this seems to be enough to get things working reasonably.

BUG=skia:6289

Change-Id: I8c0ed325391822e9f36636500350205e93942111
Reviewed-on: https://skia-review.googlesource.com/9110
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r--  src/jumper/SkJumper_stages.cpp  253
1 file changed, 198 insertions, 55 deletions
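
Before the diff itself, here is a minimal standalone sketch of the tail-handling pattern this change introduces, under the assumption of Clang's vector extensions (which SkJumper itself relies on). The helper names load(), store(), and kStride mirror those in the patch, but the bodies are simplified, and process_row() with its lane-doubling work is an illustrative stand-in, not part of the actual patch:

// Sketch only: mirrors the shape of the new load()/store() helpers and the
// start_pipeline() dispatch, not the actual SkJumper implementation.
#include <cstddef>
#include <cstring>

using F = float __attribute__((ext_vector_type(8)));     // 8 lanes, as in AVX+ mode.
static const size_t kStride = sizeof(F) / sizeof(float);

template <typename V, typename T>
static V load(const T* src, size_t tail) {
    if (tail) {                    // Jagged end of the row: copy just `tail` lanes,
        V v{};                     // leaving the inactive lanes zeroed.
        for (size_t i = 0; i < tail; i++) { v[i] = src[i]; }
        return v;
    }
    V v;                           // Full stride: one unaligned vector-sized copy.
    memcpy(&v, src, sizeof(v));
    return v;
}

template <typename V, typename T>
static void store(T* dst, V v, size_t tail) {
    if (tail) {                    // Write back only the `tail` active lanes.
        for (size_t i = 0; i < tail; i++) { dst[i] = v[i]; }
        return;
    }
    memcpy(dst, &v, sizeof(v));
}

// Drives a row of n floats the way start_pipeline() does: full strides first,
// then a single call covering the remaining (<kStride) pixels with tail != 0.
static void process_row(const float* src, float* dst, size_t n) {
    size_t x = 0;
    while (x + kStride <= n) {
        F v = load<F>(src + x, 0);
        store(dst + x, v + v, 0);  // Illustrative work: double each lane.
        x += kStride;
    }
    if (size_t tail = n - x) {
        F v = load<F>(src + x, tail);
        store(dst + x, v + v, tail);
    }
}

Passing tail as an ordinary argument is cheap on x86-64 because the spare general purpose register holding it never competes with the vector registers doing the real work; the scalar fallback runs for at most one partial stride per row.
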
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 6f498f747b..ca7469aa0d 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -222,6 +222,8 @@ static Dst bit_cast(const Src& src) {
#endif
#endif
+static const size_t kStride = sizeof(F) / sizeof(float);
+
// We need to be careful with casts.
// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
// These named casts and bit_cast() are always what they seem to be.
@@ -235,6 +237,52 @@ static Dst bit_cast(const Src& src) {
static U32 expand(U8 v) { return (U32)v; }
#endif
+template <typename V, typename T>
+static inline V load(const T* src, size_t tail) {
+#if defined(JUMPER)
+ if (__builtin_expect(tail, 0)) {
+ V v{}; // Any inactive lanes are zeroed.
+ #pragma nounroll
+ for (size_t i = 0; i < tail; i++) {
+ v[i] = src[i];
+ }
+ return v;
+ }
+#endif
+ return unaligned_load<V>(src);
+}
+
+#if 1 && defined(JUMPER) && defined(__AVX__)
+ template <>
+ inline U8 load(const uint8_t* src, size_t tail) {
+ if (__builtin_expect(tail, 0)) {
+ uint64_t v = 0;
+ size_t shift = 0;
+ #pragma nounroll
+ while (tail --> 0) {
+ v |= (uint64_t)*src++ << shift;
+ shift += 8;
+ }
+ return unaligned_load<U8>(&v);
+ }
+ return unaligned_load<U8>(src);
+ }
+#endif
+
+template <typename V, typename T>
+static inline void store(T* dst, V v, size_t tail) {
+#if defined(JUMPER)
+ if (__builtin_expect(tail, 0)) {
+ #pragma nounroll
+ for (size_t i = 0; i < tail; i++) {
+ dst[i] = v[i];
+ }
+ return;
+ }
+#endif
+ memcpy(dst, &v, sizeof(v));
+}
+
static F lerp(F from, F to, F t) {
return mad(to-from, t, from);
@@ -257,10 +305,6 @@ static void from_565(U16 _565, F* r, F* g, F* b, K* k) {
};
#endif
-// Stages tail call between each other by following program,
-// an interlaced sequence of Stage pointers and context pointers.
-using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
-
static void* load_and_inc(void**& program) {
#if defined(__GNUC__) && defined(__x86_64__)
// Passing program as the second Stage argument makes it likely that it's in %rsi,
@@ -288,34 +332,74 @@ static void* load_and_inc(void**& program) {
#endif
}
-#define STAGE(name) \
- static void name##_k(size_t& x, void* ctx, K* k, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
- extern "C" void WRAP(name)(size_t x, void** program, K* k, \
- F r, F g, F b, F a, F dr, F dg, F db, F da) { \
- auto ctx = load_and_inc(program); \
- name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da); \
- auto next = (Stage*)load_and_inc(program); \
- next(x,program,k, r,g,b,a, dr,dg,db,da); \
- } \
- static void name##_k(size_t& x, void* ctx, K* k, \
- F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#if defined(JUMPER) && defined(__AVX__)
+ // There's a big cost to switch between SSE and AVX+, so we do a little
+ // extra work to handle even the jagged <kStride tail in AVX+ mode.
+ using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);
-// Some glue stages that don't fit the normal pattern of stages.
+ #if defined(JUMPER) && defined(WIN)
+ __attribute__((ms_abi))
+ #endif
+ extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
+ F v{};
+ auto start = (Stage*)load_and_inc(program);
+ while (x + kStride <= limit) {
+ start(x,program,k,0, v,v,v,v, v,v,v,v);
+ x += kStride;
+ }
+ if (size_t tail = limit - x) {
+ start(x,program,k,tail, v,v,v,v, v,v,v,v);
+ }
+ return limit;
+ }
-#if defined(JUMPER) && defined(WIN)
-__attribute__((ms_abi))
-#endif
-extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
- F v{};
- size_t stride = sizeof(F) / sizeof(float);
- auto start = (Stage*)load_and_inc(program);
- while (x + stride <= limit) {
- start(x,program,k, v,v,v,v, v,v,v,v);
- x += stride;
+ #define STAGE(name) \
+ static void name##_k(size_t x, void* ctx, K* k, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+ extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail, \
+ F r, F g, F b, F a, F dr, F dg, F db, F da) { \
+ auto ctx = load_and_inc(program); \
+ name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage*)load_and_inc(program); \
+ next(x,program,k,tail, r,g,b,a, dr,dg,db,da); \
+ } \
+ static void name##_k(size_t x, void* ctx, K* k, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+
+#else
+ // Other instruction sets (SSE, NEON, portable) can fall back on narrower
+ // pipelines cheaply, which frees us to always assume tail==0.
+
+ // Stages tail call between each other by following program,
+ // an interlaced sequence of Stage pointers and context pointers.
+ using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
+
+ #if defined(JUMPER) && defined(WIN)
+ __attribute__((ms_abi))
+ #endif
+ extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
+ F v{};
+ auto start = (Stage*)load_and_inc(program);
+ while (x + kStride <= limit) {
+ start(x,program,k, v,v,v,v, v,v,v,v);
+ x += kStride;
+ }
+ return x;
}
- return x;
-}
+
+ #define STAGE(name) \
+ static void name##_k(size_t x, void* ctx, K* k, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+ extern "C" void WRAP(name)(size_t x, void** program, K* k, \
+ F r, F g, F b, F a, F dr, F dg, F db, F da) { \
+ auto ctx = load_and_inc(program); \
+ name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da); \
+ auto next = (Stage*)load_and_inc(program); \
+ next(x,program,k, r,g,b,a, dr,dg,db,da); \
+ } \
+ static void name##_k(size_t x, void* ctx, K* k, size_t tail, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#endif
// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
@@ -488,7 +572,7 @@ STAGE(scale_1_float) {
STAGE(scale_u8) {
auto ptr = *(const uint8_t**)ctx + x;
- auto scales = unaligned_load<U8>(ptr);
+ auto scales = load<U8>(ptr, tail);
auto c = cast(expand(scales)) * k->_1_255;
r = r * c;
@@ -508,7 +592,7 @@ STAGE(lerp_1_float) {
STAGE(lerp_u8) {
auto ptr = *(const uint8_t**)ctx + x;
- auto scales = unaligned_load<U8>(ptr);
+ auto scales = load<U8>(ptr, tail);
auto c = cast(expand(scales)) * k->_1_255;
r = lerp(dr, r, c);
@@ -520,7 +604,7 @@ STAGE(lerp_565) {
auto ptr = *(const uint16_t**)ctx + x;
F cr,cg,cb;
- from_565(unaligned_load<U16>(ptr), &cr, &cg, &cb, k);
+ from_565(load<U16>(ptr, tail), &cr, &cg, &cb, k);
r = lerp(dr, r, cr);
g = lerp(dg, g, cg);
@@ -535,7 +619,7 @@ STAGE(load_tables) {
};
auto c = (const Ctx*)ctx;
- auto px = unaligned_load<U32>(c->src + x);
+ auto px = load<U32>(c->src + x, tail);
r = gather(c->r, (px ) & k->_0x000000ff);
g = gather(c->g, (px >> 8) & k->_0x000000ff);
b = gather(c->b, (px >> 16) & k->_0x000000ff);
@@ -546,19 +630,19 @@ STAGE(load_a8) {
auto ptr = *(const uint8_t**)ctx + x;
r = g = b = 0.0f;
- a = cast(expand(unaligned_load<U8>(ptr))) * k->_1_255;
+ a = cast(expand(load<U8>(ptr, tail))) * k->_1_255;
}
STAGE(store_a8) {
auto ptr = *(uint8_t**)ctx + x;
U8 packed = pack(pack(round(a, k->_255)));
- memcpy(ptr, &packed, sizeof(packed));
+ store(ptr, packed, tail);
}
STAGE(load_565) {
auto ptr = *(const uint16_t**)ctx + x;
- from_565(unaligned_load<U16>(ptr), &r,&g,&b, k);
+ from_565(load<U16>(ptr, tail), &r,&g,&b, k);
a = k->_1;
}
STAGE(store_565) {
@@ -567,13 +651,13 @@ STAGE(store_565) {
U16 px = pack( round(r, k->_31) << 11
| round(g, k->_63) << 5
| round(b, k->_31) );
- memcpy(ptr, &px, sizeof(px));
+ store(ptr, px, tail);
}
STAGE(load_8888) {
auto ptr = *(const uint32_t**)ctx + x;
- auto px = unaligned_load<U32>(ptr);
+ auto px = load<U32>(ptr, tail);
r = cast((px ) & k->_0x000000ff) * k->_1_255;
g = cast((px >> 8) & k->_0x000000ff) * k->_1_255;
b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
@@ -587,7 +671,7 @@ STAGE(store_8888) {
| round(g, k->_255) << 8
| round(b, k->_255) << 16
| round(a, k->_255) << 24;
- memcpy(ptr, &px, sizeof(px));
+ store(ptr, px, tail);
}
STAGE(load_f16) {
@@ -619,10 +703,23 @@ STAGE(load_f16) {
b = {rb[1], rb[3]};
a = {ga[1], ga[3]};
#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
- auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
- _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
- _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
- _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+ __m128i _01, _23, _45, _67;
+ if (__builtin_expect(tail,0)) {
+ auto src = (const double*)ptr;
+ _01 = _23 = _45 = _67 = _mm_setzero_si128();
+ if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
+ if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
+ if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
+ if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
+ if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
+ if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
+ if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
+ } else {
+ _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
+ _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+ _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
+ _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+ }
auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
_13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
@@ -639,10 +736,23 @@ STAGE(load_f16) {
b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
#elif defined(__AVX__)
- auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
- _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
- _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
- _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+ __m128i _01, _23, _45, _67;
+ if (__builtin_expect(tail,0)) {
+ auto src = (const double*)ptr;
+ _01 = _23 = _45 = _67 = _mm_setzero_si128();
+ if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
+ if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
+ if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
+ if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
+ if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
+ if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
+ if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
+ } else {
+ _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
+ _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+ _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
+ _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+ }
auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
_13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
@@ -750,10 +860,26 @@ STAGE(store_f16) {
ba0123 = _mm_unpacklo_epi16(B, A),
ba4567 = _mm_unpackhi_epi16(B, A);
- _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
- _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
- _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
- _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+ auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
+ _23 = _mm_unpackhi_epi32(rg0123, ba0123),
+ _45 = _mm_unpacklo_epi32(rg4567, ba4567),
+ _67 = _mm_unpackhi_epi32(rg4567, ba4567);
+
+ if (__builtin_expect(tail,0)) {
+ auto dst = (double*)ptr;
+ if (tail > 0) { _mm_storel_pd(dst+0, _01); }
+ if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
+ if (tail > 2) { _mm_storel_pd(dst+2, _23); }
+ if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
+ if (tail > 4) { _mm_storel_pd(dst+4, _45); }
+ if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
+ if (tail > 6) { _mm_storel_pd(dst+6, _67); }
+ } else {
+ _mm_storeu_si128((__m128i*)ptr + 0, _01);
+ _mm_storeu_si128((__m128i*)ptr + 1, _23);
+ _mm_storeu_si128((__m128i*)ptr + 2, _45);
+ _mm_storeu_si128((__m128i*)ptr + 3, _67);
+ }
#elif defined(__AVX__)
auto float_to_half = [&](F f) {
return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent,
@@ -775,10 +901,27 @@ STAGE(store_f16) {
rg4567 = r4567 | _mm_slli_si128(g4567,2),
ba0123 = b0123 | _mm_slli_si128(a0123,2),
ba4567 = b4567 | _mm_slli_si128(a4567,2);
- _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
- _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
- _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
- _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+
+ auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
+ _23 = _mm_unpackhi_epi32(rg0123, ba0123),
+ _45 = _mm_unpacklo_epi32(rg4567, ba4567),
+ _67 = _mm_unpackhi_epi32(rg4567, ba4567);
+
+ if (__builtin_expect(tail,0)) {
+ auto dst = (double*)ptr;
+ if (tail > 0) { _mm_storel_pd(dst+0, _01); }
+ if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
+ if (tail > 2) { _mm_storel_pd(dst+2, _23); }
+ if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
+ if (tail > 4) { _mm_storel_pd(dst+4, _45); }
+ if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
+ if (tail > 6) { _mm_storel_pd(dst+6, _67); }
+ } else {
+ _mm_storeu_si128((__m128i*)ptr + 0, _01);
+ _mm_storeu_si128((__m128i*)ptr + 1, _23);
+ _mm_storeu_si128((__m128i*)ptr + 2, _45);
+ _mm_storeu_si128((__m128i*)ptr + 3, _67);
+ }
#elif defined(__SSE2__)
auto float_to_half = [&](F f) {
return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000))) // Fix up the exponent,