path: root/src/jumper/SkJumper_stages.cpp
author     Mike Klein <mtklein@chromium.org>    2017-02-16 06:21:54 -0500
committer  Mike Klein <mtklein@chromium.org>    2017-02-16 12:54:04 +0000
commit     fa64774820cb42594d3f5bc2059953510f038636 (patch)
tree       46b4bdee3eaa607c5455b03da125563016d83ab0 /src/jumper/SkJumper_stages.cpp
parent     8729e5bbf7c436fd7c7c13182adbbfb419f566b5 (diff)
Flush to zero when loading f16 with sse2/sse4.1.
The multiply by 0x77800000 is quite slow when the input is denormalized.
We don't mind flushing those values (in the range of 1e-5) to zero.

Implement portable load_f16() / store_f16() too.

Change-Id: I125cff1c79ca71d9abe22ac7877136d86707cb56
Reviewed-on: https://skia-review.googlesource.com/8467
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>
Diffstat (limited to 'src/jumper/SkJumper_stages.cpp')
-rw-r--r--  src/jumper/SkJumper_stages.cpp  |  28 ++++++++++++++++++++++----
1 file changed, 24 insertions(+), 4 deletions(-)
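For readers of the patch below, here is a minimal, self-contained scalar sketch of the bit trick both new portable paths rely on. A half float shifted left by 13 lands its exponent and mantissa fields in the right spots of a 32-bit float, but with the wrong exponent bias (15 instead of 127); multiplying by 0x77800000 reinterpreted as a float, which is 2^112, rebiases it. The store path inverts this with 0x07800000 (2^-112) and a shift back down. The helper names and the checks in main() are illustrative, not Skia's code; NaN/inf are ignored, and denormal or negative halves are flushed to zero as in the patch.

    // Scalar sketch of the f16 <-> f32 trick used by the new portable paths.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static float bits_to_float(uint32_t u) { float f; std::memcpy(&f, &u, sizeof f); return f; }
    static uint32_t float_to_bits(float f) { uint32_t u; std::memcpy(&u, &f, sizeof u); return u; }

    static float half_to_float(int16_t h) {
        if (h < 0x0400) { h = 0; }                // Flush denormal and negative halves to zero.
        return bits_to_float((uint32_t)h << 13)   // Line up the mantissa and exponent fields,
             * bits_to_float(0x77800000);         // then scale by 2^112 to rebias 15 -> 127.
    }

    static int16_t float_to_half(float f) {
        return (int16_t)(float_to_bits(f * bits_to_float(0x07800000))  // Scale by 2^-112 (bias 127 -> 15),
                         >> 13);                                       // then shift the fields back down.
    }

    int main() {
        printf("%g\n",     half_to_float(0x3C00));   // 1:      0x3C00 is 1.0 in half precision.
        printf("0x%04x\n", float_to_half(0.5f));     // 0x3800: exponent field 14 -> 2^-1.
        printf("%g\n",     half_to_float(0x0001));   // 0:      smallest denormal, flushed.
    }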
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
index 6c106c3f05..20ea719727 100644
--- a/src/jumper/SkJumper_stages.cpp
+++ b/src/jumper/SkJumper_stages.cpp
@@ -402,8 +402,16 @@ STAGE(load_f16) {
     auto ptr = *(const uint64_t**)ctx + x;
 
 #if !defined(JUMPER)
-    // TODO:
-    (void)ptr;
+    auto half_to_float = [&](int16_t h) {
+        if (h < 0x0400) { h = 0; }                // Flush denorm and negative to zero.
+        return bit_cast<F>(h << 13)               // Line up the mantissa,
+             * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
+    };
+    auto rgba = (const int16_t*)ptr;
+    r = half_to_float(rgba[0]);
+    g = half_to_float(rgba[1]);
+    b = half_to_float(rgba[2]);
+    a = half_to_float(rgba[3]);
 #elif defined(__aarch64__)
     auto halfs = vld4_f16((const float16_t*)ptr);
     r = vcvt_f32_f16(halfs.val[0]);
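A note on the threshold used above (my gloss, not from the patch): 0x0400 is the smallest normal half float, exponent field 1 and mantissa 0, i.e. 2^-14 or about 6.1e-5, so everything below it is a denormal. And because the comparison is done on int16_t, any half with the sign bit set reads as a negative integer and falls below the threshold too, which is what "flush denorm and negative to zero" means. Two quick compile-time checks (two's complement assumed):

    static_assert((int16_t)0x03FF < 0x0400, "largest denormal half flushes");
    static_assert((int16_t)0xBC00 < 0x0400, "negative halves, e.g. -1.0h, flush too");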
@@ -448,6 +456,11 @@ STAGE(load_f16) {
     auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
          ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
 
+    // half_to_float() slows down ~10x for denorm inputs, so we flush them to zero.
+    // With a signed comparison this conveniently also flushes negative half floats to zero.
+    rg = _mm_andnot_si128(_mm_cmplt_epi16(rg, U32(k->_0x04000400)), rg);
+    ba = _mm_andnot_si128(_mm_cmplt_epi16(ba, U32(k->_0x04000400)), ba);
+
     auto half_to_float = [&](U32 h) {
         return bit_cast<F>(h << 13)               // Line up the mantissa,
              * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
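The hunk above does the same flush eight 16-bit lanes at a time: the constant k->_0x04000400 packs the 0x0400 threshold into both halves of each 32-bit lane, _mm_cmplt_epi16 produces an all-ones mask in every lane that compares below it (signed, so negative halves are caught as well), and _mm_andnot_si128 computes ~mask & value, zeroing exactly the flagged lanes. A standalone sketch with raw SSE2 intrinsics, using illustrative sample values rather than Skia's types:

    // Flush-to-zero of sub-0x0400 (denormal or negative) half floats, eight lanes at a time.
    #include <emmintrin.h>   // SSE2 intrinsics
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Sample lanes: 1.0h, a denormal, -1.0h, smallest normal, largest normal,
        // largest denormal, +0.0h, 0.5h.
        __m128i h = _mm_setr_epi16(0x3C00, 0x0001, (short)0xBC00, 0x0400,
                                   0x7BFF, 0x03FF, 0x0000, 0x3800);

        __m128i threshold = _mm_set1_epi16(0x0400);         // Smallest normal half, per lane.
        __m128i mask      = _mm_cmplt_epi16(h, threshold);  // All-ones where h < 0x0400 (signed).
        h = _mm_andnot_si128(mask, h);                      // ~mask & h: zero the flagged lanes.

        int16_t out[8];
        _mm_storeu_si128((__m128i*)out, h);
        for (int i = 0; i < 8; i++) {
            printf("%04x ", (uint16_t)out[i]);
        }
        printf("\n");   // Prints: 3c00 0000 0000 0400 7bff 0000 0000 3800
    }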
@@ -464,8 +477,15 @@ STAGE(store_f16) {
     auto ptr = *(uint64_t**)ctx + x;
 
 #if !defined(JUMPER)
-    // TODO:
-    (void)ptr;
+    auto float_to_half = [&](F f) {
+        return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
+               >> 13;                                               // then line up the mantissa.
+    };
+    auto rgba = (int16_t*)ptr;
+    rgba[0] = float_to_half(r);
+    rgba[1] = float_to_half(g);
+    rgba[2] = float_to_half(b);
+    rgba[3] = float_to_half(a);
 #elif defined(__aarch64__)
     float16x4x4_t halfs = {{
         vcvt_f16_f32(r),