path: root/src/jumper/SkJumper_stages.cpp
author     Mike Klein <mtklein@chromium.org>  2017-04-04 10:24:56 -0400
committer  Skia Commit-Bot <skia-commit-bot@chromium.org>  2017-04-04 17:29:38 +0000
commit     95f53be0059940da50d4fce10da5c4dcf037b6ae (patch)
tree       9ae1fcc979936cf72f4f9757cbd48fdb84dbfbae /src/jumper/SkJumper_stages.cpp
parent     744808823f635c863d7ca6b4eba652115c92ff85 (diff)
jumper, split store_f16 into to_half, store4
Pretty much the same deal as the last CL going the other direction: split store_f16 into to_half() and store4(). Platforms that had fused strategies here get a little less optimal, but the code's easier to follow, maintain, and reuse.

Also adds widen_cast() to encapsulate the fairly common pattern of expanding one of our logical vector types (e.g. 8-byte U16) up to the width of the physical vector type (e.g. 16-byte __m128i). This operation is deeply understood by Clang, and often is a no-op. I could make bit_cast() do this, but it seems clearer to have two names.

Change-Id: I7ba5bb4746acfcaa6d486379f67e07baee3820b2
Reviewed-on: https://skia-review.googlesource.com/11204
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
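The widen_cast() helper is only described in prose above; a minimal sketch of what such a helper could look like, assuming a memcpy-based implementation (the exact body in Skia may differ):

    #include <cstring>

    // Grow a narrow logical vector type (e.g. 8-byte U16) into a wider physical
    // vector type (e.g. 16-byte __m128i), copying the value into the low bytes
    // and leaving the upper lanes zeroed. Clang typically lowers this to a no-op
    // or a single register move.
    template <typename Dst, typename Src>
    static inline Dst widen_cast(const Src& src) {
        static_assert(sizeof(Dst) > sizeof(Src), "widen_cast() only widens");
        Dst dst = {};                           // zero the extra lanes
        std::memcpy(&dst, &src, sizeof(Src));   // bit-copy the narrow value in
        return dst;
    }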
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r--   src/jumper/SkJumper_stages.cpp   115
1 file changed, 4 insertions, 111 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index dd2bb1348f..fa64e805d6 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -634,117 +634,10 @@ STAGE(load_f16) {
STAGE(store_f16) {
auto ptr = *(uint64_t**)ctx + x;
-#if !defined(JUMPER)
- auto float_to_half = [&](F f) {
- return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent,
- >> 13; // then line up the mantissa.
- };
- auto rgba = (int16_t*)ptr;
- rgba[0] = float_to_half(r);
- rgba[1] = float_to_half(g);
- rgba[2] = float_to_half(b);
- rgba[3] = float_to_half(a);
-#elif defined(__aarch64__)
- float16x4x4_t halfs = {{
- vcvt_f16_f32(r),
- vcvt_f16_f32(g),
- vcvt_f16_f32(b),
- vcvt_f16_f32(a),
- }};
- vst4_f16((float16_t*)ptr, halfs);
-#elif defined(__arm__)
- float16x4x2_t rb_ga = {{
- vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
- vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
- }};
- vst2_f16((float16_t*)ptr, rb_ga);
-#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
- auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
- G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
- B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
- A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);
-
- auto rg0123 = _mm_unpacklo_epi16(R, G), // r0 g0 r1 g1 r2 g2 r3 g3
- rg4567 = _mm_unpackhi_epi16(R, G), // r4 g4 r5 g5 r6 g6 r7 g7
- ba0123 = _mm_unpacklo_epi16(B, A),
- ba4567 = _mm_unpackhi_epi16(B, A);
-
- auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
- _23 = _mm_unpackhi_epi32(rg0123, ba0123),
- _45 = _mm_unpacklo_epi32(rg4567, ba4567),
- _67 = _mm_unpackhi_epi32(rg4567, ba4567);
-
- if (__builtin_expect(tail,0)) {
- auto dst = (double*)ptr;
- if (tail > 0) { _mm_storel_pd(dst+0, _01); }
- if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
- if (tail > 2) { _mm_storel_pd(dst+2, _23); }
- if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
- if (tail > 4) { _mm_storel_pd(dst+4, _45); }
- if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
- if (tail > 6) { _mm_storel_pd(dst+6, _67); }
- } else {
- _mm_storeu_si128((__m128i*)ptr + 0, _01);
- _mm_storeu_si128((__m128i*)ptr + 1, _23);
- _mm_storeu_si128((__m128i*)ptr + 2, _45);
- _mm_storeu_si128((__m128i*)ptr + 3, _67);
- }
-#elif defined(__AVX__)
- auto float_to_half = [&](F f) {
- return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent,
- >> 13; // then line up the mantissa.
- };
- U32 R = float_to_half(r),
- G = float_to_half(g),
- B = float_to_half(b),
- A = float_to_half(a);
- auto r0123 = _mm256_extractf128_si256(R, 0),
- r4567 = _mm256_extractf128_si256(R, 1),
- g0123 = _mm256_extractf128_si256(G, 0),
- g4567 = _mm256_extractf128_si256(G, 1),
- b0123 = _mm256_extractf128_si256(B, 0),
- b4567 = _mm256_extractf128_si256(B, 1),
- a0123 = _mm256_extractf128_si256(A, 0),
- a4567 = _mm256_extractf128_si256(A, 1);
- auto rg0123 = r0123 | _mm_slli_si128(g0123,2),
- rg4567 = r4567 | _mm_slli_si128(g4567,2),
- ba0123 = b0123 | _mm_slli_si128(a0123,2),
- ba4567 = b4567 | _mm_slli_si128(a4567,2);
-
- auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
- _23 = _mm_unpackhi_epi32(rg0123, ba0123),
- _45 = _mm_unpacklo_epi32(rg4567, ba4567),
- _67 = _mm_unpackhi_epi32(rg4567, ba4567);
-
- if (__builtin_expect(tail,0)) {
- auto dst = (double*)ptr;
- if (tail > 0) { _mm_storel_pd(dst+0, _01); }
- if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
- if (tail > 2) { _mm_storel_pd(dst+2, _23); }
- if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
- if (tail > 4) { _mm_storel_pd(dst+4, _45); }
- if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
- if (tail > 6) { _mm_storel_pd(dst+6, _67); }
- } else {
- _mm_storeu_si128((__m128i*)ptr + 0, _01);
- _mm_storeu_si128((__m128i*)ptr + 1, _23);
- _mm_storeu_si128((__m128i*)ptr + 2, _45);
- _mm_storeu_si128((__m128i*)ptr + 3, _67);
- }
-#elif defined(__SSE2__)
- auto float_to_half = [&](F f) {
- return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent,
- >> 13; // then line up the mantissa.
- };
- U32 R = float_to_half(r),
- G = float_to_half(g),
- B = float_to_half(b),
- A = float_to_half(a);
- U32 rg = R | _mm_slli_si128(G,2),
- ba = B | _mm_slli_si128(A,2);
- _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
- _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
-#endif
+ store4(ptr,tail, to_half(r)
+ , to_half(g)
+ , to_half(b)
+ , to_half(a));
}
STAGE(store_f32) {
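The float_to_half lambdas removed above use a quick exponent-rebias trick rather than a full IEEE conversion: multiplying by the float whose bit pattern is 0x07800000 (i.e. 2^-112) moves the exponent bias from float's 127 down to half's 15, and shifting the bits right by 13 then lines the exponent and 10-bit mantissa up where a half expects them. A standalone scalar sketch of the same idea (hypothetical names, and no handling of NaN, infinity, denormals, or rounding):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint16_t float_to_half_approx(float f) {
        // Multiply by the float whose bits are 0x07800000 (2^-112) to rebias
        // the exponent from float's 127 to half's 15.
        float scale;
        const uint32_t scale_bits = 0x07800000;
        std::memcpy(&scale, &scale_bits, sizeof(scale));
        float scaled = f * scale;

        // Shift right by 13 to drop the low 13 mantissa bits and line the
        // exponent and 10-bit mantissa up in half-float position.
        uint32_t bits;
        std::memcpy(&bits, &scaled, sizeof(bits));
        return (uint16_t)(bits >> 13);
    }

    int main() {
        std::printf("%04x\n", (unsigned)float_to_half_approx(1.0f));   // prints 3c00
    }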