| field | value | date |
|---|---|---|
| author | Mike Klein <mtklein@chromium.org> | 2017-04-04 10:24:56 -0400 |
| committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-04-04 17:29:38 +0000 |
| commit | 95f53be0059940da50d4fce10da5c4dcf037b6ae (patch) | |
| tree | 9ae1fcc979936cf72f4f9757cbd48fdb84dbfbae /src/jumper/SkJumper_stages.cpp | |
| parent | 744808823f635c863d7ca6b4eba652115c92ff85 (diff) | |
jumper, split store_f16 into to_half, store4
Pretty much the same deal as the last CL, going the other direction:
split store_f16 into to_half() and store4(). Platforms that had fused
conversion-and-store strategies here end up slightly less optimal, but
the code is easier to follow, maintain, and reuse.
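
For reference, the portable paths in the diff below all share one
float-to-half bit trick, which a scalar to_half() can be assumed to keep.
Here is a hypothetical standalone sketch of it; like the original lambda,
it truncates instead of rounding and only handles non-negative, finite,
normal values in half range:

```cpp
#include <cstdint>
#include <cstring>

// Sketch of the float->half trick from the fallback paths below.
// Multiplying by 2^-112 (float bit pattern 0x07800000) rebiases the
// 127-biased 8-bit float exponent into half's 15-biased 5-bit range,
// and >>13 then drops the low 13 mantissa bits so exponent and
// mantissa land in half's layout.
static uint16_t float_to_half(float f) {
    uint32_t scale_bits = 0x07800000;   // 2^-112 as raw float bits
    float scale;
    std::memcpy(&scale, &scale_bits, sizeof scale);

    float scaled = f * scale;           // fix up the exponent,
    uint32_t bits;
    std::memcpy(&bits, &scaled, sizeof bits);
    return (uint16_t)(bits >> 13);      // then line up the mantissa
}
```

For example, float_to_half(1.0f) yields 0x3C00, the IEEE half-precision
encoding of 1.0.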
Also adds widen_cast() to encapsulate the fairly common pattern of
expanding one of our logical vector types (e.g. 8-byte U16) up to the
width of the physical vector type (e.g. 16-byte __m128i). This
operation is deeply understood by Clang, and is often a no-op.
I could make bit_cast() do this, but it seems clearer to have two names.
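
This hunk doesn't show widen_cast() itself, but a minimal sketch of such
a helper, assuming the usual memcpy-based type punning that Clang folds
away, could look like:

```cpp
#include <cstring>

// Hypothetical widen_cast(): copy the bytes of a narrow logical vector
// type into the low bytes of a wider physical one, leaving the upper
// bytes unspecified. Clang understands the memcpy and usually emits no
// instructions when the value already sits in a vector register.
template <typename Dst, typename Src>
static Dst widen_cast(const Src& src) {
    static_assert(sizeof(Dst) > sizeof(Src),
                  "widen_cast() goes from a narrower type to a wider one");
    Dst dst;                               // upper bytes intentionally undefined
    std::memcpy(&dst, &src, sizeof(Src));  // fill only the low sizeof(Src) bytes
    return dst;
}
```

A call like widen_cast<__m128i>(u16s) then reads as "treat these 8 bytes
as the low half of a 16-byte register", with the rest left unspecified.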
Change-Id: I7ba5bb4746acfcaa6d486379f67e07baee3820b2
Reviewed-on: https://skia-review.googlesource.com/11204
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/jumper/SkJumper_stages.cpp | 115 |

1 file changed, 4 insertions(+), 111 deletions(-)
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index dd2bb1348f..fa64e805d6 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -634,117 +634,10 @@ STAGE(load_f16) {
 
 STAGE(store_f16) {
     auto ptr = *(uint64_t**)ctx + x;
-#if !defined(JUMPER)
-    auto float_to_half = [&](F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    };
-    auto rgba = (int16_t*)ptr;
-    rgba[0] = float_to_half(r);
-    rgba[1] = float_to_half(g);
-    rgba[2] = float_to_half(b);
-    rgba[3] = float_to_half(a);
-#elif defined(__aarch64__)
-    float16x4x4_t halfs = {{
-        vcvt_f16_f32(r),
-        vcvt_f16_f32(g),
-        vcvt_f16_f32(b),
-        vcvt_f16_f32(a),
-    }};
-    vst4_f16((float16_t*)ptr, halfs);
-#elif defined(__arm__)
-    float16x4x2_t rb_ga = {{
-        vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
-        vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
-    }};
-    vst2_f16((float16_t*)ptr, rb_ga);
-#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
-    auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
-         G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
-         B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
-         A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);
-
-    auto rg0123 = _mm_unpacklo_epi16(R, G),  // r0 g0 r1 g1 r2 g2 r3 g3
-         rg4567 = _mm_unpackhi_epi16(R, G),  // r4 g4 r5 g5 r6 g6 r7 g7
-         ba0123 = _mm_unpacklo_epi16(B, A),
-         ba4567 = _mm_unpackhi_epi16(B, A);
-
-    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
-         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
-         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
-         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
-
-    if (__builtin_expect(tail,0)) {
-        auto dst = (double*)ptr;
-        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
-        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
-        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
-        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
-        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
-        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
-        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
-    } else {
-        _mm_storeu_si128((__m128i*)ptr + 0, _01);
-        _mm_storeu_si128((__m128i*)ptr + 1, _23);
-        _mm_storeu_si128((__m128i*)ptr + 2, _45);
-        _mm_storeu_si128((__m128i*)ptr + 3, _67);
-    }
-#elif defined(__AVX__)
-    auto float_to_half = [&](F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    };
-    U32 R = float_to_half(r),
-        G = float_to_half(g),
-        B = float_to_half(b),
-        A = float_to_half(a);
-    auto r0123 = _mm256_extractf128_si256(R, 0),
-         r4567 = _mm256_extractf128_si256(R, 1),
-         g0123 = _mm256_extractf128_si256(G, 0),
-         g4567 = _mm256_extractf128_si256(G, 1),
-         b0123 = _mm256_extractf128_si256(B, 0),
-         b4567 = _mm256_extractf128_si256(B, 1),
-         a0123 = _mm256_extractf128_si256(A, 0),
-         a4567 = _mm256_extractf128_si256(A, 1);
-    auto rg0123 = r0123 | _mm_slli_si128(g0123,2),
-         rg4567 = r4567 | _mm_slli_si128(g4567,2),
-         ba0123 = b0123 | _mm_slli_si128(a0123,2),
-         ba4567 = b4567 | _mm_slli_si128(a4567,2);
-
-    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
-         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
-         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
-         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
-
-    if (__builtin_expect(tail,0)) {
-        auto dst = (double*)ptr;
-        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
-        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
-        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
-        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
-        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
-        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
-        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
-    } else {
-        _mm_storeu_si128((__m128i*)ptr + 0, _01);
-        _mm_storeu_si128((__m128i*)ptr + 1, _23);
-        _mm_storeu_si128((__m128i*)ptr + 2, _45);
-        _mm_storeu_si128((__m128i*)ptr + 3, _67);
-    }
-#elif defined(__SSE2__)
-    auto float_to_half = [&](F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    };
-    U32 R = float_to_half(r),
-        G = float_to_half(g),
-        B = float_to_half(b),
-        A = float_to_half(a);
-    U32 rg = R | _mm_slli_si128(G,2),
-        ba = B | _mm_slli_si128(A,2);
-    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
-    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
-#endif
+    store4(ptr,tail, to_half(r)
+                   , to_half(g)
+                   , to_half(b)
+                   , to_half(a));
 }
 
 STAGE(store_f32) {
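
To make the new store4() call concrete, here is a hypothetical scalar
model of the contract it relies on, assuming 4-lane uint16_t planes and
the tail convention visible in the removed code (tail == 0 means a
full-width store; otherwise only the first tail pixels are written):

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical scalar model of an interleaving 4-plane store: lane i of
// r, g, b, a becomes the i-th 4-halfword RGBA pixel at ptr.
static void store4(uint64_t* ptr, size_t tail,
                   const uint16_t r[4], const uint16_t g[4],
                   const uint16_t b[4], const uint16_t a[4]) {
    size_t n = tail ? tail : 4;       // tail == 0 means store all 4 pixels
    auto dst = (uint16_t*)ptr;
    for (size_t i = 0; i < n; i++) {
        dst[4*i+0] = r[i];
        dst[4*i+1] = g[i];
        dst[4*i+2] = b[i];
        dst[4*i+3] = a[i];
    }
}
```

The per-platform versions replaced above do the same interleave with
vst4_f16/vst2_f16 on ARM and unpack/shuffle sequences on x86.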