| author    | Mike Klein <mtklein@chromium.org> | 2017-03-01 13:07:40 -0500 |
|-----------|-----------------------------------|---------------------------|
| committer | Mike Klein <mtklein@chromium.org> | 2017-03-02 16:14:45 +0000 |
| commit    | c31858bcba3f6d9eb6b57ae03c15b266324a5c23 (patch) | |
| tree      | 3148d117b2b6c883c6aefa8d399fea147a536df5 /src/jumper/SkJumper_stages.cpp | |
| parent    | f7cf81aefd28e5bfe74d40b4fc037df72f157f33 (diff) | |
SkJumper: handle the <kStride tail in AVX+ mode.
We have plenty of general-purpose registers to spare on x86-64,
so the cheapest thing to do is use one to hold the usual 'tail'.
Speedups on HSW:
SkRasterPipeline_srgb: 292 -> 170
SkRasterPipeline_f16: 122 -> 90
There's plenty more room to improve here, e.g. using mask loads and
stores, but this seems to be enough to get things working reasonably.
BUG=skia:6289
Change-Id: I8c0ed325391822e9f36636500350205e93942111
Reviewed-on: https://skia-review.googlesource.com/9110
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
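As a rough illustration of the mask loads and stores mentioned in the commit message above (not what this CL does — the CL handles the tail with a plain per-lane loop, as the diff below shows), an 8-wide AVX tail helper might look like the following sketch. The helper names and the sliding mask table are assumptions for illustration only.

```cpp
// Hypothetical sketch: mask-based tail loads/stores for 8-wide AVX float lanes,
// as an alternative to the per-lane scalar loop this CL actually uses.
#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// The first `tail` 32-bit lanes of the returned mask are all ones (tail in 1..7).
static inline __m256i tail_mask(size_t tail) {
    static const int32_t lanes[16] = { -1,-1,-1,-1,-1,-1,-1,-1,  0,0,0,0,0,0,0,0 };
    return _mm256_loadu_si256((const __m256i*)(lanes + 8 - tail));
}

// Load only the first `tail` floats; lanes past the tail read as zero.
static inline __m256 load_f32_tail(const float* src, size_t tail) {
    return _mm256_maskload_ps(src, tail_mask(tail));
}

// Store only the first `tail` lanes; memory past the tail is left untouched.
static inline void store_f32_tail(float* dst, __m256 v, size_t tail) {
    _mm256_maskstore_ps(dst, tail_mask(tail), v);
}
```

Whether masked memory operations actually beat the per-lane loop used here would have to be measured.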
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 253 |
1 file changed, 198 insertions, 55 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 6f498f747b..ca7469aa0d 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -222,6 +222,8 @@ static Dst bit_cast(const Src& src) {
 #endif
 #endif
 
+static const size_t kStride = sizeof(F) / sizeof(float);
+
 // We need to be a careful with casts.
 // (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
 // These named casts and bit_cast() are always what they seem to be.
@@ -235,6 +237,52 @@ static Dst bit_cast(const Src& src) {
     static U32 expand(U8 v) { return (U32)v; }
 #endif
 
+template <typename V, typename T>
+static inline V load(const T* src, size_t tail) {
+#if defined(JUMPER)
+    if (__builtin_expect(tail, 0)) {
+        V v{};  // Any inactive lanes are zeroed.
+        #pragma nounroll
+        for (size_t i = 0; i < tail; i++) {
+            v[i] = src[i];
+        }
+        return v;
+    }
+#endif
+    return unaligned_load<V>(src);
+}
+
+#if 1 && defined(JUMPER) && defined(__AVX__)
+    template <>
+    inline U8 load(const uint8_t* src, size_t tail) {
+        if (__builtin_expect(tail, 0)) {
+            uint64_t v = 0;
+            size_t shift = 0;
+            #pragma nounroll
+            while (tail --> 0) {
+                v |= (uint64_t)*src++ << shift;
+                shift += 8;
+            }
+            return unaligned_load<U8>(&v);
+        }
+        return unaligned_load<U8>(src);
+    }
+#endif
+
+template <typename V, typename T>
+static inline void store(T* dst, V v, size_t tail) {
+#if defined(JUMPER)
+    if (__builtin_expect(tail, 0)) {
+        #pragma nounroll
+        for (size_t i = 0; i < tail; i++) {
+            dst[i] = v[i];
+        }
+        return;
+    }
+#endif
+    memcpy(dst, &v, sizeof(v));
+}
+
 static F lerp(F from, F to, F t) {
     return mad(to-from, t, from);
@@ -257,10 +305,6 @@ static void from_565(U16 _565, F* r, F* g, F* b, K* k) {
     };
 #endif
 
-// Stages tail call between each other by following program,
-// an interlaced sequence of Stage pointers and context pointers.
-using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
-
 static void* load_and_inc(void**& program) {
 #if defined(__GNUC__) && defined(__x86_64__)
     // Passing program as the second Stage argument makes it likely that it's in %rsi,
@@ -288,34 +332,74 @@ static void* load_and_inc(void**& program) {
 #endif
 }
 
-#define STAGE(name)  \
-    static void name##_k(size_t& x, void* ctx, K* k,  \
-                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);  \
-    extern "C" void WRAP(name)(size_t x, void** program, K* k,  \
-                               F r, F g, F b, F a, F dr, F dg, F db, F da) {  \
-        auto ctx = load_and_inc(program);  \
-        name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da);  \
-        auto next = (Stage*)load_and_inc(program);  \
-        next(x,program,k, r,g,b,a, dr,dg,db,da);  \
-    }  \
-    static void name##_k(size_t& x, void* ctx, K* k,  \
-                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#if defined(JUMPER) && defined(__AVX__)
+    // There's a big cost to switch between SSE and AVX+, so we do a little
+    // extra work to handle even the jagged <kStride tail in AVX+ mode.
+    using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);
 
-// Some glue stages that don't fit the normal pattern of stages.
+    #if defined(JUMPER) && defined(WIN)
+    __attribute__((ms_abi))
+    #endif
+    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
+        F v{};
+        auto start = (Stage*)load_and_inc(program);
+        while (x + kStride <= limit) {
+            start(x,program,k,0,    v,v,v,v, v,v,v,v);
+            x += kStride;
+        }
+        if (size_t tail = limit - x) {
+            start(x,program,k,tail, v,v,v,v, v,v,v,v);
+        }
+        return limit;
+    }
 
-#if defined(JUMPER) && defined(WIN)
-__attribute__((ms_abi))
-#endif
-extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
-    F v{};
-    size_t stride = sizeof(F) / sizeof(float);
-    auto start = (Stage*)load_and_inc(program);
-    while (x + stride <= limit) {
-        start(x,program,k, v,v,v,v, v,v,v,v);
-        x += stride;
+    #define STAGE(name)  \
+        static void name##_k(size_t x, void* ctx, K* k, size_t tail,  \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);  \
+        extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail,  \
+                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {  \
+            auto ctx = load_and_inc(program);  \
+            name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da);  \
+            auto next = (Stage*)load_and_inc(program);  \
+            next(x,program,k,tail, r,g,b,a, dr,dg,db,da);  \
+        }  \
+        static void name##_k(size_t x, void* ctx, K* k, size_t tail,  \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+
+#else
+    // Other instruction sets (SSE, NEON, portable) can fall back on narrower
+    // pipelines cheaply, which frees us to always assume tail==0.
+
+    // Stages tail call between each other by following program,
+    // an interlaced sequence of Stage pointers and context pointers.
+    using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
+
+    #if defined(JUMPER) && defined(WIN)
+    __attribute__((ms_abi))
+    #endif
+    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
+        F v{};
+        auto start = (Stage*)load_and_inc(program);
+        while (x + kStride <= limit) {
+            start(x,program,k, v,v,v,v, v,v,v,v);
+            x += kStride;
+        }
+        return x;
     }
-    return x;
-}
+
+    #define STAGE(name)  \
+        static void name##_k(size_t x, void* ctx, K* k, size_t tail,  \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);  \
+        extern "C" void WRAP(name)(size_t x, void** program, K* k,  \
+                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {  \
+            auto ctx = load_and_inc(program);  \
+            name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da);  \
+            auto next = (Stage*)load_and_inc(program);  \
+            next(x,program,k, r,g,b,a, dr,dg,db,da);  \
+        }  \
+        static void name##_k(size_t x, void* ctx, K* k, size_t tail,  \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#endif
 
 // Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
 extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
@@ -488,7 +572,7 @@ STAGE(scale_1_float) {
 STAGE(scale_u8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
-    auto scales = unaligned_load<U8>(ptr);
+    auto scales = load<U8>(ptr, tail);
     auto c = cast(expand(scales)) * k->_1_255;
 
     r = r * c;
@@ -508,7 +592,7 @@ STAGE(lerp_1_float) {
 STAGE(lerp_u8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
-    auto scales = unaligned_load<U8>(ptr);
+    auto scales = load<U8>(ptr, tail);
    auto c = cast(expand(scales)) * k->_1_255;
 
     r = lerp(dr, r, c);
@@ -520,7 +604,7 @@ STAGE(lerp_565) {
     auto ptr = *(const uint16_t**)ctx + x;
 
     F cr,cg,cb;
-    from_565(unaligned_load<U16>(ptr), &cr, &cg, &cb, k);
+    from_565(load<U16>(ptr, tail), &cr, &cg, &cb, k);
 
     r = lerp(dr, r, cr);
     g = lerp(dg, g, cg);
@@ -535,7 +619,7 @@ STAGE(load_tables) {
     };
     auto c = (const Ctx*)ctx;
 
-    auto px = unaligned_load<U32>(c->src + x);
+    auto px = load<U32>(c->src + x, tail);
     r = gather(c->r, (px      ) & k->_0x000000ff);
     g = gather(c->g, (px >>  8) & k->_0x000000ff);
     b = gather(c->b, (px >> 16) & k->_0x000000ff);
@@ -546,19 +630,19 @@ STAGE(load_a8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
     r = g = b = 0.0f;
-    a = cast(expand(unaligned_load<U8>(ptr))) * k->_1_255;
+    a = cast(expand(load<U8>(ptr, tail))) * k->_1_255;
 }
 
 STAGE(store_a8) {
     auto ptr = *(uint8_t**)ctx + x;
 
     U8 packed = pack(pack(round(a, k->_255)));
-    memcpy(ptr, &packed, sizeof(packed));
+    store(ptr, packed, tail);
 }
 
 STAGE(load_565) {
     auto ptr = *(const uint16_t**)ctx + x;
-    from_565(unaligned_load<U16>(ptr), &r,&g,&b, k);
+    from_565(load<U16>(ptr, tail), &r,&g,&b, k);
     a = k->_1;
 }
 STAGE(store_565) {
@@ -567,13 +651,13 @@ STAGE(store_565) {
     U16 px = pack( round(r, k->_31) << 11
                  | round(g, k->_63) <<  5
                  | round(b, k->_31)      );
-    memcpy(ptr, &px, sizeof(px));
+    store(ptr, px, tail);
 }
 
 STAGE(load_8888) {
     auto ptr = *(const uint32_t**)ctx + x;
 
-    auto px = unaligned_load<U32>(ptr);
+    auto px = load<U32>(ptr, tail);
     r = cast((px      ) & k->_0x000000ff) * k->_1_255;
     g = cast((px >>  8) & k->_0x000000ff) * k->_1_255;
     b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
@@ -587,7 +671,7 @@ STAGE(store_8888) {
            | round(g, k->_255) <<  8
            | round(b, k->_255) << 16
            | round(a, k->_255) << 24;
-    memcpy(ptr, &px, sizeof(px));
+    store(ptr, px, tail);
 }
 
 STAGE(load_f16) {
@@ -619,10 +703,23 @@ STAGE(load_f16) {
     b = {rb[1], rb[3]};
     a = {ga[1], ga[3]};
 #elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
-    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
-         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
-         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
-         _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+    __m128i _01, _23, _45, _67;
+    if (__builtin_expect(tail,0)) {
+        auto src = (const double*)ptr;
+        _01 = _23 = _45 = _67 = _mm_setzero_si128();
+        if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
+        if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
+        if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
+        if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
+        if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
+        if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
+        if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
+    } else {
+        _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
+        _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+        _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
+        _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+    }
 
     auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
          _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
@@ -639,10 +736,23 @@ STAGE(load_f16) {
     b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
     a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
 #elif defined(__AVX__)
-    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
-         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
-         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
-         _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+    __m128i _01, _23, _45, _67;
+    if (__builtin_expect(tail,0)) {
+        auto src = (const double*)ptr;
+        _01 = _23 = _45 = _67 = _mm_setzero_si128();
+        if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
+        if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
+        if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
+        if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
+        if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
+        if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
+        if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
+    } else {
+        _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
+        _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+        _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
+        _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+    }
 
     auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
          _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
@@ -750,10 +860,26 @@ STAGE(store_f16) {
          ba0123 = _mm_unpacklo_epi16(B, A),
          ba4567 = _mm_unpackhi_epi16(B, A);
-    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
-    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
-    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
-    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
+         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
+         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
+         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
+
+    if (__builtin_expect(tail,0)) {
+        auto dst = (double*)ptr;
+        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
+        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
+        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
+        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
+        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
+        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
+        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
+    } else {
+        _mm_storeu_si128((__m128i*)ptr + 0, _01);
+        _mm_storeu_si128((__m128i*)ptr + 1, _23);
+        _mm_storeu_si128((__m128i*)ptr + 2, _45);
+        _mm_storeu_si128((__m128i*)ptr + 3, _67);
+    }
 #elif defined(__AVX__)
     auto float_to_half = [&](F f) {
         return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
@@ -775,10 +901,27 @@ STAGE(store_f16) {
          rg4567 = r4567 | _mm_slli_si128(g4567,2),
          ba0123 = b0123 | _mm_slli_si128(a0123,2),
          ba4567 = b4567 | _mm_slli_si128(a4567,2);
-    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
-    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
-    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
-    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+
+    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
+         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
+         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
+         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
+
+    if (__builtin_expect(tail,0)) {
+        auto dst = (double*)ptr;
+        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
+        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
+        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
+        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
+        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
+        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
+        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
+    } else {
+        _mm_storeu_si128((__m128i*)ptr + 0, _01);
+        _mm_storeu_si128((__m128i*)ptr + 1, _23);
+        _mm_storeu_si128((__m128i*)ptr + 2, _45);
+        _mm_storeu_si128((__m128i*)ptr + 3, _67);
+    }
 #elif defined(__SSE2__)
     auto float_to_half = [&](F f) {
         return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,