test and fix f16<->f32 conversion stages

This refactors from_half() and to_half() a bit, totally reimplementing the non-hardware cases to be more clearly correct. CQ_INCLUDE_TRYBOTS=skia.primary:Test-Android-Clang-PixelC-CPU-TegraX1-arm64-Release-Android,Test-Android-Clang-Ci20-CPU-IngenicJZ4780-mipsel-Release-Android,Test-Android-Clang-Nexus10-CPU-Exynos5250-arm-Release-Android,Test-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Release,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86-Debug,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Debug Change-Id: I439463cf90935c5e8fe2369cbcf45e07f3af62c7 Reviewed-on: https://skia-review.googlesource.com/13921 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Matt Sarett <msarett@google.com>
author: Mike Klein <mtklein@chromium.org> 2017-04-19 17:19:30 -0400
committer: Skia Commit-Bot <skia-commit-bot@chromium.org> 2017-04-20 12:49:03 +0000
commit: d0ce148ed4945aa75fb7eeaaffcfd345dd9f85fb (patch)
tree: d6987bfeab8c995de846989d1c56bae3ed685365 /src/jumper/SkJumper_vectors.h
parent: 544e0ad49c11bd349782618de6430bdf8cec0106 (diff)
1 files changed, 56 insertions, 60 deletions
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index bd8ad40262..3cb1785b3d 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -74,16 +74,6 @@
         ptr[3] = a;
     }
 
-    SI F from_half(U16 h) {
-        if ((int16_t)h < 0x0400) { h = 0; }   // Flush denorm and negative to zero.
-        return bit_cast<F>(h << 13)           // Line up the mantissa,
-             * bit_cast<F>(U32(0x77800000));  // then fix up the exponent.
-    }
-    SI U16 to_half(F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    }
-
 #elif defined(__aarch64__)
     #include <arm_neon.h>
 
@@ -143,9 +133,6 @@
         vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
     }
 
-    SI F from_half(U16 h) { return vcvt_f32_f16(h); }
-    SI U16 to_half(F   f) { return vcvt_f16_f32(f); }
-
 #elif defined(__arm__)
     #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
         #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
@@ -222,15 +209,6 @@
         vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
     }
 
-    SI F from_half(U16 h) {
-        auto v = widen_cast<uint16x4_t>(h);
-        return vget_low_f32(vcvt_f32_f16(v));
-    }
-    SI U16 to_half(F f) {
-        auto v = widen_cast<float32x4_t>(f);
-        uint16x4_t h = vcvt_f16_f32(v);
-        return unaligned_load<U16>(&h);
-    }
 
 #elif defined(__AVX__)
     #include <immintrin.h>
@@ -445,29 +423,6 @@
         }
     }
 
-    SI F from_half(U16 h) {
-    #if defined(__AVX2__)
-        return _mm256_cvtph_ps(h);
-    #else
-        // This technique would slow down ~10x for denorm inputs, so we flush them to zero.
-        // With a signed comparison this conveniently also flushes negative half floats to zero.
-        h = _mm_andnot_si128(_mm_cmplt_epi16(h, _mm_set1_epi32(0x04000400_i)), h);
-
-        U32 w = _mm256_setr_m128i(_mm_unpacklo_epi16(h, _mm_setzero_si128()),
-                                  _mm_unpackhi_epi16(h, _mm_setzero_si128()));
-        return bit_cast<F>(w << 13)             // Line up the mantissa,
-             * bit_cast<F>(U32(0x77800000_i));  // then fix up the exponent.
-    #endif
-    }
-    SI U16 to_half(F f) {
-    #if defined(__AVX2__)
-        return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
-    #else
-        return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-                    >> 13);                                            // then line up the mantissa.
-    #endif
-    }
-
 #elif defined(__SSE2__)
     #include <immintrin.h>
 
@@ -582,21 +537,6 @@
         _mm_storeu_ps(ptr+ 8, b);
         _mm_storeu_ps(ptr+12, a);
     }
-
-    SI F from_half(U16 h) {
-        auto v = widen_cast<__m128i>(h);
-
-        // Same deal as AVX: flush denorms and negatives to zero.
-        v = _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v);
-
-        U32 w = _mm_unpacklo_epi16(v, _mm_setzero_si128());
-        return bit_cast<F>(w << 13)             // Line up the mantissa,
-             * bit_cast<F>(U32(0x77800000_i));  // then fix up the exponent.
-    }
-    SI U16 to_half(F f) {
-        return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-                    >> 13);                                            // then line up the mantissa.
-    }
 #endif
 
 // We need to be a careful with casts.
@@ -614,6 +554,11 @@
     SI U32 expand(U8  v) { return (U32)v; }
 #endif
 
+template <typename V>
+SI V if_then_else(I32 c, V t, V e) {
+    return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
+}
+
 SI U16 bswap(U16 x) {
 #if defined(JUMPER) && defined(__SSE2__) && !defined(__AVX__)
     // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
@@ -652,4 +597,55 @@ SI F approx_powf(F x, F y) {
     return approx_pow2(approx_log2(x) * y);
 }
 
+SI F from_half(U16 h) {
+#if defined(JUMPER) && defined(__aarch64__)
+    return vcvt_f32_f16(h);
+
+#elif defined(JUMPER) && defined(__arm__)
+    auto v = widen_cast<uint16x4_t>(h);
+    return vget_low_f32(vcvt_f32_f16(v));
+
+#elif defined(JUMPER) && defined(__AVX2__)
+    return _mm256_cvtph_ps(h);
+
+#else
+    // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
+    U32 sem = expand(h),
+        s   = sem & 0x8000_i,
+         e  = sem & 0x7c00_i,
+         em = sem ^ s;
+
+    // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
+    return if_then_else(e == 0, 0
+                              , bit_cast<F>( (s<<16) + (em<<13) + C((127-15)<<23) ));
+#endif
+}
+
+SI U16 to_half(F f) {
+#if defined(JUMPER) && defined(__aarch64__)
+    return vcvt_f16_f32(f);
+
+#elif defined(JUMPER) && defined(__arm__)
+    auto v = widen_cast<float32x4_t>(f);
+    uint16x4_t h = vcvt_f16_f32(v);
+    return unaligned_load<U16>(&h);
+
+#elif defined(JUMPER) && defined(__AVX2__)
+    return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
+
+#else
+    // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
+    U32 sem = bit_cast<U32>(f),
+        s   = sem & 0x80000000_i,
+         em = sem ^ s;
+
+    // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
+    auto denorm = bit_cast<F>(em) < C(1.0f / (1<<14));
+    return pack(if_then_else(denorm, U32(0)
+                                   , (s>>16) + (em>>13) - C((127-15)<<10)));
+#endif
+}
+
+
+
 #endif//SkJumper_vectors_DEFINED
author	Mike Klein <mtklein@chromium.org>	2017-04-19 17:19:30 -0400
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>	2017-04-20 12:49:03 +0000
commit	d0ce148ed4945aa75fb7eeaaffcfd345dd9f85fb (patch)
tree	d6987bfeab8c995de846989d1c56bae3ed685365 /src/jumper/SkJumper_vectors.h
parent	544e0ad49c11bd349782618de6430bdf8cec0106 (diff)