diff options
author | Mike Klein <mtklein@chromium.org> | 2017-04-03 22:21:15 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-04-04 13:57:54 +0000 |
commit | 114e6b33d67537f034b749e77f68d168ef9bfbc6 (patch) | |
tree | 6b92567de9d110f80da64e1eb48778f764dca229 /src/jumper/SkJumper_stages.cpp | |
parent | 88ec28e3d7567ec2c3e26fed66c16a68a8f8ae64 (diff) |
jumper, factor out load4() and from_half()
load_f16 gets slightly worse codegen for ARMv7, SSE2, SSE4.1, and AVX
from splitting it apart compared to the previous fused versions. But
the stage code becomes much simpler.
I'm happy to make those trades until someone complains.
load4() will be useful on its own to implement a couple other stages.
Everything draws the same. I intend to follow up with more of the
same sort of refactoring, but this was tricky enough a change I want
to do them in small steps.
Change-Id: Ib4aa86a58d000f2d7916937cd4f22dc2bd135a49
Reviewed-on: https://skia-review.googlesource.com/11186
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 149 |
1 file changed, 6 insertions, 143 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index e5781f1064..dd2bb1348f 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -623,149 +623,12 @@ STAGE(store_8888) { STAGE(load_f16) { auto ptr = *(const uint64_t**)ctx + x; -#if !defined(JUMPER) - auto half_to_float = [&](int16_t h) { - if (h < 0x0400) { h = 0; } // Flush denorm and negative to zero. - return bit_cast<F>(h << 13) // Line up the mantissa, - * bit_cast<F>(U32(0x77800000)); // then fix up the exponent. - }; - auto rgba = (const int16_t*)ptr; - r = half_to_float(rgba[0]); - g = half_to_float(rgba[1]); - b = half_to_float(rgba[2]); - a = half_to_float(rgba[3]); -#elif defined(__aarch64__) - auto halfs = vld4_f16((const float16_t*)ptr); - r = vcvt_f32_f16(halfs.val[0]); - g = vcvt_f32_f16(halfs.val[1]); - b = vcvt_f32_f16(halfs.val[2]); - a = vcvt_f32_f16(halfs.val[3]); -#elif defined(__arm__) - auto rb_ga = vld2_f16((const float16_t*)ptr); - auto rb = vcvt_f32_f16(rb_ga.val[0]), - ga = vcvt_f32_f16(rb_ga.val[1]); - r = {rb[0], rb[2]}; - g = {ga[0], ga[2]}; - b = {rb[1], rb[3]}; - a = {ga[1], ga[3]}; -#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__) - __m128i _01, _23, _45, _67; - if (__builtin_expect(tail,0)) { - auto src = (const double*)ptr; - _01 = _23 = _45 = _67 = _mm_setzero_si128(); - if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } - if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } - if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } - if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } - if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } - if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } - if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } - } else { - _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); - _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); - _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); - _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); - } - - auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 - _13 = 
_mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 - _46 = _mm_unpacklo_epi16(_45, _67), - _57 = _mm_unpackhi_epi16(_45, _67); - - auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 - rg4567 = _mm_unpacklo_epi16(_46, _57), - ba4567 = _mm_unpackhi_epi16(_46, _57); - - r = _mm256_cvtph_ps(_mm_unpacklo_epi64(rg0123, rg4567)); - g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567)); - b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567)); - a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567)); -#elif defined(__AVX__) - __m128i _01, _23, _45, _67; - if (__builtin_expect(tail,0)) { - auto src = (const double*)ptr; - _01 = _23 = _45 = _67 = _mm_setzero_si128(); - if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } - if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } - if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } - if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } - if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } - if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } - if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } - } else { - _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); - _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); - _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); - _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); - } - - auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 - _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 - _46 = _mm_unpacklo_epi16(_45, _67), - _57 = _mm_unpackhi_epi16(_45, _67); - - auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 - rg4567 = _mm_unpacklo_epi16(_46, _57), - ba4567 = _mm_unpackhi_epi16(_46, _57); - - // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero. - // With a signed comparison this conveniently also flushes negative half floats to zero. 
- auto ftz = [](__m128i v) { - return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v); - }; - rg0123 = ftz(rg0123); - ba0123 = ftz(ba0123); - rg4567 = ftz(rg4567); - ba4567 = ftz(ba4567); - - U32 R = _mm256_setr_m128i(_mm_unpacklo_epi16(rg0123, _mm_setzero_si128()), - _mm_unpacklo_epi16(rg4567, _mm_setzero_si128())), - G = _mm256_setr_m128i(_mm_unpackhi_epi16(rg0123, _mm_setzero_si128()), - _mm_unpackhi_epi16(rg4567, _mm_setzero_si128())), - B = _mm256_setr_m128i(_mm_unpacklo_epi16(ba0123, _mm_setzero_si128()), - _mm_unpacklo_epi16(ba4567, _mm_setzero_si128())), - A = _mm256_setr_m128i(_mm_unpackhi_epi16(ba0123, _mm_setzero_si128()), - _mm_unpackhi_epi16(ba4567, _mm_setzero_si128())); - - auto half_to_float = [&](U32 h) { - return bit_cast<F>(h << 13) // Line up the mantissa, - * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent. - }; - - r = half_to_float(R); - g = half_to_float(G); - b = half_to_float(B); - a = half_to_float(A); - -#elif defined(__SSE2__) - auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0), - _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); - - auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 - _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 - - auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 - - // Same deal as AVX, flush denorms and negatives to zero. - auto ftz = [](__m128i v) { - return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v); - }; - rg = ftz(rg); - ba = ftz(ba); - - auto half_to_float = [&](U32 h) { - return bit_cast<F>(h << 13) // Line up the mantissa, - * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent. 
- }; - - r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128())); - g = half_to_float(_mm_unpackhi_epi16(rg, _mm_setzero_si128())); - b = half_to_float(_mm_unpacklo_epi16(ba, _mm_setzero_si128())); - a = half_to_float(_mm_unpackhi_epi16(ba, _mm_setzero_si128())); -#endif + U16 R,G,B,A; + load4(ptr,tail, &R,&G,&B,&A); + r = from_half(R); + g = from_half(G); + b = from_half(B); + a = from_half(A); } STAGE(store_f16) { |