diff options
author | Mike Klein <mtklein@chromium.org> | 2017-04-03 22:21:15 -0400 |
---|---|---|
committer | Skia Commit-Bot <skia-commit-bot@chromium.org> | 2017-04-04 13:57:54 +0000 |
commit | 114e6b33d67537f034b749e77f68d168ef9bfbc6 (patch) | |
tree | 6b92567de9d110f80da64e1eb48778f764dca229 /src/jumper/SkJumper_stages.cpp | |
parent | 88ec28e3d7567ec2c3e26fed66c16a68a8f8ae64 (diff) |
jumper, factor out load4() and from_half()
load_f16 gets slightly worse codegen for ARMv7, SSE2, SSE4.1, and AVX
from splitting it apart compared to the previous fused versions. But
the stage code becomes much simpler.
I'm happy to make those trades until someone complains.
load4() will be useful on its own to implement a couple other stages.
Everything draws the same. I intend to follow up with more of the
same sort of refactoring, but this was tricky enough a change I want
to do them in small steps.
Change-Id: Ib4aa86a58d000f2d7916937cd4f22dc2bd135a49
Reviewed-on: https://skia-review.googlesource.com/11186
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r-- | src/jumper/SkJumper_stages.cpp | 149 |
1 file changed, 6 insertions, 143 deletions
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index e5781f1064..dd2bb1348f 100644 --- a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -623,149 +623,12 @@ STAGE(store_8888) { STAGE(load_f16) { auto ptr = *(const uint64_t**)ctx + x; -#if !defined(JUMPER) - auto half_to_float = [&](int16_t h) { - if (h < 0x0400) { h = 0; } // Flush denorm and negative to zero. - return bit_cast<F>(h << 13) // Line up the mantissa, - * bit_cast<F>(U32(0x77800000)); // then fix up the exponent. - }; - auto rgba = (const int16_t*)ptr; - r = half_to_float(rgba[0]); - g = half_to_float(rgba[1]); - b = half_to_float(rgba[2]); - a = half_to_float(rgba[3]); -#elif defined(__aarch64__) - auto halfs = vld4_f16((const float16_t*)ptr); - r = vcvt_f32_f16(halfs.val[0]); - g = vcvt_f32_f16(halfs.val[1]); - b = vcvt_f32_f16(halfs.val[2]); - a = vcvt_f32_f16(halfs.val[3]); -#elif defined(__arm__) - auto rb_ga = vld2_f16((const float16_t*)ptr); - auto rb = vcvt_f32_f16(rb_ga.val[0]), - ga = vcvt_f32_f16(rb_ga.val[1]); - r = {rb[0], rb[2]}; - g = {ga[0], ga[2]}; - b = {rb[1], rb[3]}; - a = {ga[1], ga[3]}; -#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__) - __m128i _01, _23, _45, _67; - if (__builtin_expect(tail,0)) { - auto src = (const double*)ptr; - _01 = _23 = _45 = _67 = _mm_setzero_si128(); - if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } - if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } - if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } - if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } - if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } - if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } - if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } - } else { - _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); - _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); - _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); - _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); - } - - auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 - _13 = 
_mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 - _46 = _mm_unpacklo_epi16(_45, _67), - _57 = _mm_unpackhi_epi16(_45, _67); - - auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 - rg4567 = _mm_unpacklo_epi16(_46, _57), - ba4567 = _mm_unpackhi_epi16(_46, _57); - - r = _mm256_cvtph_ps(_mm_unpacklo_epi64(rg0123, rg4567)); - g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567)); - b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567)); - a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567)); -#elif defined(__AVX__) - __m128i _01, _23, _45, _67; - if (__builtin_expect(tail,0)) { - auto src = (const double*)ptr; - _01 = _23 = _45 = _67 = _mm_setzero_si128(); - if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } - if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } - if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } - if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } - if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } - if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } - if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } - } else { - _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); - _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); - _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); - _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); - } - - auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 - _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 - _46 = _mm_unpacklo_epi16(_45, _67), - _57 = _mm_unpackhi_epi16(_45, _67); - - auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 - rg4567 = _mm_unpacklo_epi16(_46, _57), - ba4567 = _mm_unpackhi_epi16(_46, _57); - - // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero. - // With a signed comparison this conveniently also flushes negative half floats to zero. 
- auto ftz = [](__m128i v) { - return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v); - }; - rg0123 = ftz(rg0123); - ba0123 = ftz(ba0123); - rg4567 = ftz(rg4567); - ba4567 = ftz(ba4567); - - U32 R = _mm256_setr_m128i(_mm_unpacklo_epi16(rg0123, _mm_setzero_si128()), - _mm_unpacklo_epi16(rg4567, _mm_setzero_si128())), - G = _mm256_setr_m128i(_mm_unpackhi_epi16(rg0123, _mm_setzero_si128()), - _mm_unpackhi_epi16(rg4567, _mm_setzero_si128())), - B = _mm256_setr_m128i(_mm_unpacklo_epi16(ba0123, _mm_setzero_si128()), - _mm_unpacklo_epi16(ba4567, _mm_setzero_si128())), - A = _mm256_setr_m128i(_mm_unpackhi_epi16(ba0123, _mm_setzero_si128()), - _mm_unpackhi_epi16(ba4567, _mm_setzero_si128())); - - auto half_to_float = [&](U32 h) { - return bit_cast<F>(h << 13) // Line up the mantissa, - * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent. - }; - - r = half_to_float(R); - g = half_to_float(G); - b = half_to_float(B); - a = half_to_float(A); - -#elif defined(__SSE2__) - auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0), - _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); - - auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 - _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 - - auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 - ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 - - // Same deal as AVX, flush denorms and negatives to zero. - auto ftz = [](__m128i v) { - return _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v); - }; - rg = ftz(rg); - ba = ftz(ba); - - auto half_to_float = [&](U32 h) { - return bit_cast<F>(h << 13) // Line up the mantissa, - * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent. 
- }; - - r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128())); - g = half_to_float(_mm_unpackhi_epi16(rg, _mm_setzero_si128())); - b = half_to_float(_mm_unpacklo_epi16(ba, _mm_setzero_si128())); - a = half_to_float(_mm_unpackhi_epi16(ba, _mm_setzero_si128())); -#endif + U16 R,G,B,A; + load4(ptr,tail, &R,&G,&B,&A); + r = from_half(R); + g = from_half(G); + b = from_half(B); + a = from_half(A); } STAGE(store_f16) { |