| field | value | date |
|---|---|---|
| author | Mike Klein <mtklein@chromium.org> | 2017-04-04 10:24:56 -0400 |
| committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-04-04 17:29:38 +0000 |
| commit | 95f53be0059940da50d4fce10da5c4dcf037b6ae (patch) | |
| tree | 9ae1fcc979936cf72f4f9757cbd48fdb84dbfbae /src/jumper/SkJumper_stages.cpp | |
| parent | 744808823f635c863d7ca6b4eba652115c92ff85 (diff) | |
jumper, split store_f16 into to_half, store4
Pretty much the same deal as the last CL, going the other direction:
split store_f16 into to_half() and store4(). Platforms that had fused
conversion-and-store strategies here end up slightly less optimal, but
the code is easier to follow, maintain, and reuse.
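
For reference, the portable paths in the diff below all share one
float-to-half bit trick, which a scalar to_half() can be assumed to keep.
Here is a hypothetical standalone sketch of it; like the original lambda,
it truncates instead of rounding and only handles non-negative, finite,
normal values in half range:

```cpp
#include <cstdint>
#include <cstring>

// Sketch of the float->half trick from the fallback paths below.
// Multiplying by 2^-112 (float bit pattern 0x07800000) rebiases the
// 127-biased 8-bit float exponent into half's 15-biased 5-bit range,
// and >>13 then drops the low 13 mantissa bits so exponent and
// mantissa land in half's layout.
static uint16_t float_to_half(float f) {
    uint32_t scale_bits = 0x07800000;   // 2^-112 as raw float bits
    float scale;
    std::memcpy(&scale, &scale_bits, sizeof scale);

    float scaled = f * scale;           // fix up the exponent,
    uint32_t bits;
    std::memcpy(&bits, &scaled, sizeof bits);
    return (uint16_t)(bits >> 13);      // then line up the mantissa
}
```

For example, float_to_half(1.0f) yields 0x3C00, the IEEE half-precision
encoding of 1.0.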
Also adds widen_cast() to encapsulate the fairly common pattern of
expanding one of our logical vector types (e.g. 8-byte U16) up to the
width of the physical vector type (e.g. 16-byte __m128i). This
operation is deeply understood by Clang, and is often a no-op.
I could make bit_cast() do this, but it seems clearer to have two names.
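
This hunk doesn't show widen_cast() itself, but a minimal sketch of such
a helper, assuming the usual memcpy-based type punning that Clang folds
away, could look like:

```cpp
#include <cstring>

// Hypothetical widen_cast(): copy the bytes of a narrow logical vector
// type into the low bytes of a wider physical one, leaving the upper
// bytes unspecified. Clang understands the memcpy and usually emits no
// instructions when the value already sits in a vector register.
template <typename Dst, typename Src>
static Dst widen_cast(const Src& src) {
    static_assert(sizeof(Dst) > sizeof(Src),
                  "widen_cast() goes from a narrower type to a wider one");
    Dst dst;                               // upper bytes intentionally undefined
    std::memcpy(&dst, &src, sizeof(Src));  // fill only the low sizeof(Src) bytes
    return dst;
}
```

A call like widen_cast<__m128i>(u16s) then reads as "treat these 8 bytes
as the low half of a 16-byte register", with the rest left unspecified.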
Change-Id: I7ba5bb4746acfcaa6d486379f67e07baee3820b2
Reviewed-on: https://skia-review.googlesource.com/11204
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/jumper/SkJumper_stages.cpp | 115 |

1 file changed, 4 insertions(+), 111 deletions(-)
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index dd2bb1348f..fa64e805d6 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -634,117 +634,10 @@ STAGE(load_f16) {
 
 STAGE(store_f16) {
     auto ptr = *(uint64_t**)ctx + x;
-#if !defined(JUMPER)
-    auto float_to_half = [&](F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    };
-    auto rgba = (int16_t*)ptr;
-    rgba[0] = float_to_half(r);
-    rgba[1] = float_to_half(g);
-    rgba[2] = float_to_half(b);
-    rgba[3] = float_to_half(a);
-#elif defined(__aarch64__)
-    float16x4x4_t halfs = {{
-        vcvt_f16_f32(r),
-        vcvt_f16_f32(g),
-        vcvt_f16_f32(b),
-        vcvt_f16_f32(a),
-    }};
-    vst4_f16((float16_t*)ptr, halfs);
-#elif defined(__arm__)
-    float16x4x2_t rb_ga = {{
-        vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
-        vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
-    }};
-    vst2_f16((float16_t*)ptr, rb_ga);
-#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
-    auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
-         G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
-         B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
-         A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);
-
-    auto rg0123 = _mm_unpacklo_epi16(R, G),  // r0 g0 r1 g1 r2 g2 r3 g3
-         rg4567 = _mm_unpackhi_epi16(R, G),  // r4 g4 r5 g5 r6 g6 r7 g7
-         ba0123 = _mm_unpacklo_epi16(B, A),
-         ba4567 = _mm_unpackhi_epi16(B, A);
-
-    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
-         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
-         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
-         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
-
-    if (__builtin_expect(tail,0)) {
-        auto dst = (double*)ptr;
-        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
-        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
-        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
-        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
-        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
-        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
-        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
-    } else {
-        _mm_storeu_si128((__m128i*)ptr + 0, _01);
-        _mm_storeu_si128((__m128i*)ptr + 1, _23);
-        _mm_storeu_si128((__m128i*)ptr + 2, _45);
-        _mm_storeu_si128((__m128i*)ptr + 3, _67);
-    }
-#elif defined(__AVX__)
-    auto float_to_half = [&](F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    };
-    U32 R = float_to_half(r),
-        G = float_to_half(g),
-        B = float_to_half(b),
-        A = float_to_half(a);
-    auto r0123 = _mm256_extractf128_si256(R, 0),
-         r4567 = _mm256_extractf128_si256(R, 1),
-         g0123 = _mm256_extractf128_si256(G, 0),
-         g4567 = _mm256_extractf128_si256(G, 1),
-         b0123 = _mm256_extractf128_si256(B, 0),
-         b4567 = _mm256_extractf128_si256(B, 1),
-         a0123 = _mm256_extractf128_si256(A, 0),
-         a4567 = _mm256_extractf128_si256(A, 1);
-    auto rg0123 = r0123 | _mm_slli_si128(g0123,2),
-         rg4567 = r4567 | _mm_slli_si128(g4567,2),
-         ba0123 = b0123 | _mm_slli_si128(a0123,2),
-         ba4567 = b4567 | _mm_slli_si128(a4567,2);
-
-    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
-         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
-         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
-         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
-
-    if (__builtin_expect(tail,0)) {
-        auto dst = (double*)ptr;
-        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
-        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
-        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
-        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
-        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
-        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
-        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
-    } else {
-        _mm_storeu_si128((__m128i*)ptr + 0, _01);
-        _mm_storeu_si128((__m128i*)ptr + 1, _23);
-        _mm_storeu_si128((__m128i*)ptr + 2, _45);
-        _mm_storeu_si128((__m128i*)ptr + 3, _67);
-    }
-#elif defined(__SSE2__)
-    auto float_to_half = [&](F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    };
-    U32 R = float_to_half(r),
-        G = float_to_half(g),
-        B = float_to_half(b),
-        A = float_to_half(a);
-    U32 rg = R | _mm_slli_si128(G,2),
-        ba = B | _mm_slli_si128(A,2);
-    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
-    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
-#endif
+    store4(ptr,tail, to_half(r)
+                   , to_half(g)
+                   , to_half(b)
+                   , to_half(a));
 }
 
 STAGE(store_f32) {
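
To make the new store4() call concrete, here is a hypothetical scalar
model of the contract it relies on, assuming 4-lane uint16_t planes and
the tail convention visible in the removed code (tail == 0 means a
full-width store; otherwise only the first tail pixels are written):

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical scalar model of an interleaving 4-plane store: lane i of
// r, g, b, a becomes the i-th 4-halfword RGBA pixel at ptr.
static void store4(uint64_t* ptr, size_t tail,
                   const uint16_t r[4], const uint16_t g[4],
                   const uint16_t b[4], const uint16_t a[4]) {
    size_t n = tail ? tail : 4;       // tail == 0 means store all 4 pixels
    auto dst = (uint16_t*)ptr;
    for (size_t i = 0; i < n; i++) {
        dst[4*i+0] = r[i];
        dst[4*i+1] = g[i];
        dst[4*i+2] = b[i];
        dst[4*i+3] = a[i];
    }
}
```

The per-platform versions replaced above do the same interleave with
vst4_f16/vst2_f16 on ARM and unpack/shuffle sequences on x86.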