| author    | Mike Klein <mtklein@chromium.org> | 2017-03-01 13:07:40 -0500 |
|-----------|-----------------------------------|---------------------------|
| committer | Mike Klein <mtklein@chromium.org> | 2017-03-02 16:14:45 +0000 |
| commit    | c31858bcba3f6d9eb6b57ae03c15b266324a5c23 (patch) | |
| tree      | 3148d117b2b6c883c6aefa8d399fea147a536df5 /src/jumper/SkJumper_stages.cpp | |
| parent    | f7cf81aefd28e5bfe74d40b4fc037df72f157f33 (diff) | |
SkJumper: handle the <kStride tail in AVX+ mode.
We have plenty of general-purpose registers to spare on x86-64,
so the cheapest thing to do is use one to hold the usual 'tail'.
Speedups on HSW:
SkRasterPipeline_srgb: 292 -> 170
SkRasterPipeline_f16: 122 -> 90
There's plenty more room to improve here, e.g. using mask loads and
stores, but this seems to be enough to get things working reasonably.
BUG=skia:6289
Change-Id: I8c0ed325391822e9f36636500350205e93942111
Reviewed-on: https://skia-review.googlesource.com/9110
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
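As a rough illustration of the mask loads and stores mentioned in the commit message above (not what this CL does — the CL handles the tail with a plain per-lane loop, as the diff below shows), an 8-wide AVX tail helper might look like the following sketch. The helper names and the sliding mask table are assumptions for illustration only.

```cpp
// Hypothetical sketch: mask-based tail loads/stores for 8-wide AVX float lanes,
// as an alternative to the per-lane scalar loop this CL actually uses.
#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// The first `tail` 32-bit lanes of the returned mask are all ones (tail in 1..7).
static inline __m256i tail_mask(size_t tail) {
    static const int32_t lanes[16] = { -1,-1,-1,-1,-1,-1,-1,-1,  0,0,0,0,0,0,0,0 };
    return _mm256_loadu_si256((const __m256i*)(lanes + 8 - tail));
}

// Load only the first `tail` floats; lanes past the tail read as zero.
static inline __m256 load_f32_tail(const float* src, size_t tail) {
    return _mm256_maskload_ps(src, tail_mask(tail));
}

// Store only the first `tail` lanes; memory past the tail is left untouched.
static inline void store_f32_tail(float* dst, __m256 v, size_t tail) {
    _mm256_maskstore_ps(dst, tail_mask(tail), v);
}
```

Whether masked memory operations actually beat the per-lane loop used here would have to be measured.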
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 253 |
1 file changed, 198 insertions, 55 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 6f498f747b..ca7469aa0d 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -222,6 +222,8 @@ static Dst bit_cast(const Src& src) {
 #endif
 #endif
 
+static const size_t kStride = sizeof(F) / sizeof(float);
+
 // We need to be a careful with casts.
 // (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
 // These named casts and bit_cast() are always what they seem to be.
@@ -235,6 +237,52 @@ static Dst bit_cast(const Src& src) {
     static U32 expand(U8 v) { return (U32)v; }
 #endif
 
+template <typename V, typename T>
+static inline V load(const T* src, size_t tail) {
+#if defined(JUMPER)
+    if (__builtin_expect(tail, 0)) {
+        V v{};  // Any inactive lanes are zeroed.
+        #pragma nounroll
+        for (size_t i = 0; i < tail; i++) {
+            v[i] = src[i];
+        }
+        return v;
+    }
+#endif
+    return unaligned_load<V>(src);
+}
+
+#if 1 && defined(JUMPER) && defined(__AVX__)
+    template <>
+    inline U8 load(const uint8_t* src, size_t tail) {
+        if (__builtin_expect(tail, 0)) {
+            uint64_t v = 0;
+            size_t shift = 0;
+            #pragma nounroll
+            while (tail --> 0) {
+                v |= (uint64_t)*src++ << shift;
+                shift += 8;
+            }
+            return unaligned_load<U8>(&v);
+        }
+        return unaligned_load<U8>(src);
+    }
+#endif
+
+template <typename V, typename T>
+static inline void store(T* dst, V v, size_t tail) {
+#if defined(JUMPER)
+    if (__builtin_expect(tail, 0)) {
+        #pragma nounroll
+        for (size_t i = 0; i < tail; i++) {
+            dst[i] = v[i];
+        }
+        return;
+    }
+#endif
+    memcpy(dst, &v, sizeof(v));
+}
+
 static F lerp(F from, F to, F t) {
     return mad(to-from, t, from);
@@ -257,10 +305,6 @@ static void from_565(U16 _565, F* r, F* g, F* b, K* k) {
     };
 #endif
 
-// Stages tail call between each other by following program,
-// an interlaced sequence of Stage pointers and context pointers.
-using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
-
 static void* load_and_inc(void**& program) {
 #if defined(__GNUC__) && defined(__x86_64__)
     // Passing program as the second Stage argument makes it likely that it's in %rsi,
@@ -288,34 +332,74 @@ static void* load_and_inc(void**& program) {
 #endif
 }
 
-#define STAGE(name)  \
-    static void name##_k(size_t& x, void* ctx, K* k,  \
-                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);  \
-    extern "C" void WRAP(name)(size_t x, void** program, K* k,  \
-                               F r, F g, F b, F a, F dr, F dg, F db, F da) {  \
-        auto ctx = load_and_inc(program);  \
-        name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da);  \
-        auto next = (Stage*)load_and_inc(program);  \
-        next(x,program,k, r,g,b,a, dr,dg,db,da);  \
-    }  \
-    static void name##_k(size_t& x, void* ctx, K* k,  \
-                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#if defined(JUMPER) && defined(__AVX__)
+    // There's a big cost to switch between SSE and AVX+, so we do a little
+    // extra work to handle even the jagged <kStride tail in AVX+ mode.
+    using Stage = void(size_t x, void** program, K* k, size_t tail, F,F,F,F, F,F,F,F);
 
-// Some glue stages that don't fit the normal pattern of stages.
+    #if defined(JUMPER) && defined(WIN)
+    __attribute__((ms_abi))
+    #endif
+    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
+        F v{};
+        auto start = (Stage*)load_and_inc(program);
+        while (x + kStride <= limit) {
+            start(x,program,k,0,    v,v,v,v, v,v,v,v);
+            x += kStride;
+        }
+        if (size_t tail = limit - x) {
+            start(x,program,k,tail, v,v,v,v, v,v,v,v);
+        }
+        return limit;
+    }
 
-#if defined(JUMPER) && defined(WIN)
-__attribute__((ms_abi))
-#endif
-extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
-    F v{};
-    size_t stride = sizeof(F) / sizeof(float);
-    auto start = (Stage*)load_and_inc(program);
-    while (x + stride <= limit) {
-        start(x,program,k, v,v,v,v, v,v,v,v);
-        x += stride;
+    #define STAGE(name)  \
+        static void name##_k(size_t x, void* ctx, K* k, size_t tail,  \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);  \
+        extern "C" void WRAP(name)(size_t x, void** program, K* k, size_t tail,  \
+                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {  \
+            auto ctx = load_and_inc(program);  \
+            name##_k(x,ctx,k,tail, r,g,b,a, dr,dg,db,da);  \
+            auto next = (Stage*)load_and_inc(program);  \
+            next(x,program,k,tail, r,g,b,a, dr,dg,db,da);  \
+        }  \
+        static void name##_k(size_t x, void* ctx, K* k, size_t tail,  \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+
+#else
+    // Other instruction sets (SSE, NEON, portable) can fall back on narrower
+    // pipelines cheaply, which frees us to always assume tail==0.
+
+    // Stages tail call between each other by following program,
+    // an interlaced sequence of Stage pointers and context pointers.
+    using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
+
+    #if defined(JUMPER) && defined(WIN)
+    __attribute__((ms_abi))
+    #endif
+    extern "C" size_t WRAP(start_pipeline)(size_t x, void** program, K* k, size_t limit) {
+        F v{};
+        auto start = (Stage*)load_and_inc(program);
+        while (x + kStride <= limit) {
+            start(x,program,k, v,v,v,v, v,v,v,v);
+            x += kStride;
+        }
+        return x;
     }
-    return x;
-}
+
+    #define STAGE(name)  \
+        static void name##_k(size_t x, void* ctx, K* k, size_t tail,  \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);  \
+        extern "C" void WRAP(name)(size_t x, void** program, K* k,  \
+                                   F r, F g, F b, F a, F dr, F dg, F db, F da) {  \
+            auto ctx = load_and_inc(program);  \
+            name##_k(x,ctx,k,0, r,g,b,a, dr,dg,db,da);  \
+            auto next = (Stage*)load_and_inc(program);  \
+            next(x,program,k, r,g,b,a, dr,dg,db,da);  \
+        }  \
+        static void name##_k(size_t x, void* ctx, K* k, size_t tail,  \
+                             F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+#endif
 
 // Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
 extern "C" void WRAP(just_return)(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
@@ -488,7 +572,7 @@ STAGE(scale_1_float) {
 STAGE(scale_u8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
-    auto scales = unaligned_load<U8>(ptr);
+    auto scales = load<U8>(ptr, tail);
     auto c = cast(expand(scales)) * k->_1_255;
 
     r = r * c;
@@ -508,7 +592,7 @@ STAGE(lerp_1_float) {
 STAGE(lerp_u8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
-    auto scales = unaligned_load<U8>(ptr);
+    auto scales = load<U8>(ptr, tail);
    auto c = cast(expand(scales)) * k->_1_255;
 
     r = lerp(dr, r, c);
@@ -520,7 +604,7 @@ STAGE(lerp_565) {
     auto ptr = *(const uint16_t**)ctx + x;
 
     F cr,cg,cb;
-    from_565(unaligned_load<U16>(ptr), &cr, &cg, &cb, k);
+    from_565(load<U16>(ptr, tail), &cr, &cg, &cb, k);
 
     r = lerp(dr, r, cr);
     g = lerp(dg, g, cg);
@@ -535,7 +619,7 @@ STAGE(load_tables) {
     };
     auto c = (const Ctx*)ctx;
 
-    auto px = unaligned_load<U32>(c->src + x);
+    auto px = load<U32>(c->src + x, tail);
     r = gather(c->r, (px      ) & k->_0x000000ff);
     g = gather(c->g, (px >>  8) & k->_0x000000ff);
     b = gather(c->b, (px >> 16) & k->_0x000000ff);
@@ -546,19 +630,19 @@ STAGE(load_a8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
     r = g = b = 0.0f;
-    a = cast(expand(unaligned_load<U8>(ptr))) * k->_1_255;
+    a = cast(expand(load<U8>(ptr, tail))) * k->_1_255;
 }
 
 STAGE(store_a8) {
     auto ptr = *(uint8_t**)ctx + x;
 
     U8 packed = pack(pack(round(a, k->_255)));
-    memcpy(ptr, &packed, sizeof(packed));
+    store(ptr, packed, tail);
 }
 
 STAGE(load_565) {
     auto ptr = *(const uint16_t**)ctx + x;
-    from_565(unaligned_load<U16>(ptr), &r,&g,&b, k);
+    from_565(load<U16>(ptr, tail), &r,&g,&b, k);
     a = k->_1;
 }
 STAGE(store_565) {
@@ -567,13 +651,13 @@ STAGE(store_565) {
     U16 px = pack( round(r, k->_31) << 11
                  | round(g, k->_63) <<  5
                  | round(b, k->_31)      );
-    memcpy(ptr, &px, sizeof(px));
+    store(ptr, px, tail);
 }
 
 STAGE(load_8888) {
     auto ptr = *(const uint32_t**)ctx + x;
 
-    auto px = unaligned_load<U32>(ptr);
+    auto px = load<U32>(ptr, tail);
     r = cast((px      ) & k->_0x000000ff) * k->_1_255;
     g = cast((px >>  8) & k->_0x000000ff) * k->_1_255;
     b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
@@ -587,7 +671,7 @@ STAGE(store_8888) {
            | round(g, k->_255) <<  8
            | round(b, k->_255) << 16
            | round(a, k->_255) << 24;
-    memcpy(ptr, &px, sizeof(px));
+    store(ptr, px, tail);
 }
 
 STAGE(load_f16) {
@@ -619,10 +703,23 @@ STAGE(load_f16) {
     b = {rb[1], rb[3]};
     a = {ga[1], ga[3]};
 #elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
-    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
-         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
-         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
-         _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+    __m128i _01, _23, _45, _67;
+    if (__builtin_expect(tail,0)) {
+        auto src = (const double*)ptr;
+        _01 = _23 = _45 = _67 = _mm_setzero_si128();
+        if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
+        if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
+        if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
+        if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
+        if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
+        if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
+        if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
+    } else {
+        _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
+        _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+        _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
+        _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+    }
 
     auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
          _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
@@ -639,10 +736,23 @@ STAGE(load_f16) {
     b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
     a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
 #elif defined(__AVX__)
-    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
-         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
-         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
-         _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+    __m128i _01, _23, _45, _67;
+    if (__builtin_expect(tail,0)) {
+        auto src = (const double*)ptr;
+        _01 = _23 = _45 = _67 = _mm_setzero_si128();
+        if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
+        if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
+        if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
+        if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
+        if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
+        if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
+        if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
+    } else {
+        _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
+        _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+        _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
+        _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+    }
 
     auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
          _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
@@ -750,10 +860,26 @@ STAGE(store_f16) {
          ba0123 = _mm_unpacklo_epi16(B, A),
          ba4567 = _mm_unpackhi_epi16(B, A);
-    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
-    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
-    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
-    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
+         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
+         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
+         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
+
+    if (__builtin_expect(tail,0)) {
+        auto dst = (double*)ptr;
+        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
+        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
+        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
+        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
+        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
+        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
+        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
+    } else {
+        _mm_storeu_si128((__m128i*)ptr + 0, _01);
+        _mm_storeu_si128((__m128i*)ptr + 1, _23);
+        _mm_storeu_si128((__m128i*)ptr + 2, _45);
+        _mm_storeu_si128((__m128i*)ptr + 3, _67);
+    }
 #elif defined(__AVX__)
     auto float_to_half = [&](F f) {
         return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
@@ -775,10 +901,27 @@ STAGE(store_f16) {
          rg4567 = r4567 | _mm_slli_si128(g4567,2),
          ba0123 = b0123 | _mm_slli_si128(a0123,2),
          ba4567 = b4567 | _mm_slli_si128(a4567,2);
-    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
-    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
-    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
-    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+
+    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
+         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
+         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
+         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
+
+    if (__builtin_expect(tail,0)) {
+        auto dst = (double*)ptr;
+        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
+        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
+        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
+        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
+        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
+        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
+        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
+    } else {
+        _mm_storeu_si128((__m128i*)ptr + 0, _01);
+        _mm_storeu_si128((__m128i*)ptr + 1, _23);
+        _mm_storeu_si128((__m128i*)ptr + 2, _45);
+        _mm_storeu_si128((__m128i*)ptr + 3, _67);
+    }
 #elif defined(__SSE2__)
     auto float_to_half = [&](F f) {
         return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,