aboutsummaryrefslogtreecommitdiffhomepage
path: root/src/jumper/SkJumper_vectors.h
diff options
context:
space:
mode:
authorGravatar Mike Klein <mtklein@chromium.org>2017-04-19 17:19:30 -0400
committerGravatar Skia Commit-Bot <skia-commit-bot@chromium.org>2017-04-20 12:49:03 +0000
commitd0ce148ed4945aa75fb7eeaaffcfd345dd9f85fb (patch)
treed6987bfeab8c995de846989d1c56bae3ed685365 /src/jumper/SkJumper_vectors.h
parent544e0ad49c11bd349782618de6430bdf8cec0106 (diff)
test and fix f16<->f32 conversion stages
This refactors from_half() and to_half() a bit, totally reimplementing the non-hardware cases to be more clearly correct. CQ_INCLUDE_TRYBOTS=skia.primary:Test-Android-Clang-PixelC-CPU-TegraX1-arm64-Release-Android,Test-Android-Clang-Ci20-CPU-IngenicJZ4780-mipsel-Release-Android,Test-Android-Clang-Nexus10-CPU-Exynos5250-arm-Release-Android,Test-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Release,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86-Debug,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Debug Change-Id: I439463cf90935c5e8fe2369cbcf45e07f3af62c7 Reviewed-on: https://skia-review.googlesource.com/13921 Commit-Queue: Mike Klein <mtklein@chromium.org> Reviewed-by: Matt Sarett <msarett@google.com>
Diffstat (limited to 'src/jumper/SkJumper_vectors.h')
-rw-r--r--src/jumper/SkJumper_vectors.h116
1 files changed, 56 insertions, 60 deletions
diff --git a/src/jumper/SkJumper_vectors.h b/src/jumper/SkJumper_vectors.h
index bd8ad40262..3cb1785b3d 100644
--- a/src/jumper/SkJumper_vectors.h
+++ b/src/jumper/SkJumper_vectors.h
@@ -74,16 +74,6 @@
ptr[3] = a;
}
- SI F from_half(U16 h) {
- if ((int16_t)h < 0x0400) { h = 0; } // Flush denorm and negative to zero.
- return bit_cast<F>(h << 13) // Line up the mantissa,
- * bit_cast<F>(U32(0x77800000)); // then fix up the exponent.
- }
- SI U16 to_half(F f) {
- return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent,
- >> 13; // then line up the mantissa.
- }
-
#elif defined(__aarch64__)
#include <arm_neon.h>
@@ -143,9 +133,6 @@
vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
}
- SI F from_half(U16 h) { return vcvt_f32_f16(h); }
- SI U16 to_half(F f) { return vcvt_f16_f32(f); }
-
#elif defined(__arm__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
#error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
@@ -222,15 +209,6 @@
vst4_f32(ptr, (float32x2x4_t{{r,g,b,a}}));
}
- SI F from_half(U16 h) {
- auto v = widen_cast<uint16x4_t>(h);
- return vget_low_f32(vcvt_f32_f16(v));
- }
- SI U16 to_half(F f) {
- auto v = widen_cast<float32x4_t>(f);
- uint16x4_t h = vcvt_f16_f32(v);
- return unaligned_load<U16>(&h);
- }
#elif defined(__AVX__)
#include <immintrin.h>
@@ -445,29 +423,6 @@
}
}
- SI F from_half(U16 h) {
- #if defined(__AVX2__)
- return _mm256_cvtph_ps(h);
- #else
- // This technique would slow down ~10x for denorm inputs, so we flush them to zero.
- // With a signed comparison this conveniently also flushes negative half floats to zero.
- h = _mm_andnot_si128(_mm_cmplt_epi16(h, _mm_set1_epi32(0x04000400_i)), h);
-
- U32 w = _mm256_setr_m128i(_mm_unpacklo_epi16(h, _mm_setzero_si128()),
- _mm_unpackhi_epi16(h, _mm_setzero_si128()));
- return bit_cast<F>(w << 13) // Line up the mantissa,
- * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent.
- #endif
- }
- SI U16 to_half(F f) {
- #if defined(__AVX2__)
- return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
- #else
- return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent,
- >> 13); // then line up the mantissa.
- #endif
- }
-
#elif defined(__SSE2__)
#include <immintrin.h>
@@ -582,21 +537,6 @@
_mm_storeu_ps(ptr+ 8, b);
_mm_storeu_ps(ptr+12, a);
}
-
- SI F from_half(U16 h) {
- auto v = widen_cast<__m128i>(h);
-
- // Same deal as AVX: flush denorms and negatives to zero.
- v = _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v);
-
- U32 w = _mm_unpacklo_epi16(v, _mm_setzero_si128());
- return bit_cast<F>(w << 13) // Line up the mantissa,
- * bit_cast<F>(U32(0x77800000_i)); // then fix up the exponent.
- }
- SI U16 to_half(F f) {
- return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i))) // Fix up the exponent,
- >> 13); // then line up the mantissa.
- }
#endif
// We need to be a careful with casts.
@@ -614,6 +554,11 @@
SI U32 expand(U8 v) { return (U32)v; }
#endif
+template <typename V>
+SI V if_then_else(I32 c, V t, V e) {
+ return bit_cast<V>(if_then_else(c, bit_cast<F>(t), bit_cast<F>(e)));
+}
+
SI U16 bswap(U16 x) {
#if defined(JUMPER) && defined(__SSE2__) && !defined(__AVX__)
// Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes
@@ -652,4 +597,55 @@ SI F approx_powf(F x, F y) {
return approx_pow2(approx_log2(x) * y);
}
+SI F from_half(U16 h) {
+#if defined(JUMPER) && defined(__aarch64__)
+ return vcvt_f32_f16(h);
+
+#elif defined(JUMPER) && defined(__arm__)
+ auto v = widen_cast<uint16x4_t>(h);
+ return vget_low_f32(vcvt_f32_f16(v));
+
+#elif defined(JUMPER) && defined(__AVX2__)
+ return _mm256_cvtph_ps(h);
+
+#else
+ // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
+ U32 sem = expand(h),
+ s = sem & 0x8000_i,
+ e = sem & 0x7c00_i,
+ em = sem ^ s;
+
+ // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
+ return if_then_else(e == 0, 0
+ , bit_cast<F>( (s<<16) + (em<<13) + C((127-15)<<23) ));
+#endif
+}
+
+SI U16 to_half(F f) {
+#if defined(JUMPER) && defined(__aarch64__)
+ return vcvt_f16_f32(f);
+
+#elif defined(JUMPER) && defined(__arm__)
+ auto v = widen_cast<float32x4_t>(f);
+ uint16x4_t h = vcvt_f16_f32(v);
+ return unaligned_load<U16>(&h);
+
+#elif defined(JUMPER) && defined(__AVX2__)
+ return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
+
+#else
+ // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
+ U32 sem = bit_cast<U32>(f),
+ s = sem & 0x80000000_i,
+ em = sem ^ s;
+
+ // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
+ auto denorm = bit_cast<F>(em) < C(1.0f / (1<<14));
+ return pack(if_then_else(denorm, U32(0)
+ , (s>>16) + (em>>13) - C((127-15)<<10)));
+#endif
+}
+
+
+
#endif//SkJumper_vectors_DEFINED