author    mtklein <mtklein@chromium.org>  2016-07-15 07:00:11 -0700
committer Commit bot <commit-bot@chromium.org>  2016-07-15 07:00:11 -0700
commit    58e389b0518b46bbe58ba01c23443cf23c18435c (patch)
tree      51f6d91fa6a116666c9c318897211cbc7ca0395b /src/opts
parent    428036621e1667b504051872869ac38cf6fac9c8 (diff)
Expand _01 half<->float limitation to _finite. Simplify.
It's become clear we need to sometimes deal with values <0 or >1. I'm not yet
convinced we care about NaN or +-inf.

We had some fairly clever tricks and optimizations here for NEON and SSE. I've
thrown them out in favor of a single implementation. If we find the
specializations mattered, we can certainly figure out how to extend them to
this new range/domain.

This happens to add a vectorized float -> half for ARMv7, which was missing
from the _01 version. (The SSE strategy was not portable to platforms that
flush denorm floats to zero.)

I've tested the full float range for FloatToHalf on my desktop and a 5x.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2145663003
CQ_INCLUDE_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot;master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-Fast-Trybot

Committed: https://skia.googlesource.com/skia/+/3296bee70d074bb8094b3229dbe12fa016657e90

Review-Url: https://codereview.chromium.org/2145663003
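The diff below covers only the SkNx helpers in src/opts; the half<->float
conversion itself lives elsewhere in the tree. As a rough illustration of
what handling the full finite range involves, here is a minimal scalar
sketch of float -> half (hypothetical, with truncating rounding and no
NaN/inf handling for brevity; this is not the commit's vectorized code):

    #include <cstdint>
    #include <cstring>

    // Hypothetical scalar sketch: convert a finite float to half.
    // Truncating rounding; NaN/inf are simply clamped like large values.
    static uint16_t float_to_half_finite(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));                  // type-pun safely

        uint32_t sign = (bits >> 16) & 0x8000;                 // sign bit in half position
        int32_t  exp  = (int32_t)((bits >> 23) & 0xff) - 127;  // unbiased exponent
        uint32_t mant = bits & 0x7fffff;                       // 23-bit mantissa

        if (exp < -24) {                 // too small for half: signed zero
            return (uint16_t)sign;
        }
        if (exp < -14) {                 // half subnormal range
            mant |= 0x800000;            // restore the implicit leading 1
            return (uint16_t)(sign | (mant >> (-exp - 1)));
        }
        if (exp > 15) {                  // beyond half range: clamp to max finite half
            return (uint16_t)(sign | 0x7bff);
        }
        return (uint16_t)(sign | ((uint32_t)(exp + 15) << 10) | (mant >> 13));
    }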
Diffstat (limited to 'src/opts')
-rw-r--r--  src/opts/SkNx_neon.h  23
-rw-r--r--  src/opts/SkNx_sse.h   28
2 files changed, 50 insertions, 1 deletion
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 91cd104482..2f73e0368d 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -388,13 +388,28 @@ public:
SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
+ SkNx operator ^ (const SkNx& o) const { return veorq_s32(fVec, o.fVec); }
SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); }
+ SkNx operator == (const SkNx& o) const {
+ return vreinterpretq_s32_u32(vceqq_s32(fVec, o.fVec));
+ }
+ SkNx operator < (const SkNx& o) const {
+ return vreinterpretq_s32_u32(vcltq_s32(fVec, o.fVec));
+ }
+ SkNx operator > (const SkNx& o) const {
+ return vreinterpretq_s32_u32(vcgtq_s32(fVec, o.fVec));
+ }
+
static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); }
// TODO as needed
+ SkNx thenElse(const SkNx& t, const SkNx& e) const {
+ return vbslq_s32(vreinterpretq_u32_s32(fVec), t.fVec, e.fVec);
+ }
+
int32x4_t fVec;
};
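Taken together, the new comparison operators and thenElse() give the integer
Sk4i a branch-free per-lane select, mirroring what Sk4f already had. A
hypothetical usage sketch (the include path and the clamp helper are
assumptions, not code from this commit):

    #include "SkNx.h"  // assumed header path for the SkNx types

    // Clamp every lane of v into [lo, hi] without branches: the comparisons
    // produce all-ones/all-zeros lane masks, and thenElse() selects per lane.
    static Sk4i clamp(const Sk4i& v, const Sk4i& lo, const Sk4i& hi) {
        Sk4i t = (v < lo).thenElse(lo, v);    // lanes below lo become lo
        return (t > hi).thenElse(hi, t);      // lanes above hi become hi
    }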
@@ -456,6 +471,14 @@ template<> inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
return vqmovn_u16(vcombine_u16(_16, _16));
}
+template<> inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {
+ return vreinterpretq_s32_u32(vmovl_u16(src.fVec));
+}
+
+template<> inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
+ return vmovn_u32(vreinterpretq_u32_s32(src.fVec));
+}
+
static inline Sk4i Sk4f_round(const Sk4f& x) {
return vcvtq_s32_f32((x + 0.5f).fVec);
}
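In scalar terms, the two NEON casts added above behave as follows: vmovl_u16
zero-extends each uint16_t lane to 32 bits, and vmovn_u32 keeps the low 16
bits of each 32-bit lane. A per-lane sketch with illustrative names:

    #include <cstdint>

    // Per-lane equivalents of the NEON widen/narrow casts.
    static int32_t  widen_lane (uint16_t u) { return (int32_t)u; }             // zero-extend, like vmovl_u16
    static uint16_t narrow_lane(int32_t  v) { return (uint16_t)(v & 0xffff); } // truncate, like vmovn_u32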
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 78cea3b3d8..ab88382bd2 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -152,16 +152,30 @@ public:
SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
+ SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }
SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
SkNx operator >> (int bits) const { return _mm_srai_epi32(fVec, bits); }
+ SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
+ SkNx operator < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); }
+ SkNx operator > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); }
+
int operator[](int k) const {
SkASSERT(0 <= k && k < 4);
union { __m128i v; int is[4]; } pun = {fVec};
return pun.is[k&3];
}
+ SkNx thenElse(const SkNx& t, const SkNx& e) const {
+ #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+ return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
+ #else
+ return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),
+ _mm_andnot_si128(fVec, e.fVec));
+ #endif
+ }
+
__m128i fVec;
};
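The SSE2 fallback in thenElse() is the classic bitwise select: take the bits
of t where the mask is 1 and the bits of e where it is 0. (Note that
_mm_andnot_si128(a, b) computes ~a & b, matching the argument order above.)
The same identity per 32-bit lane, as a scalar sketch:

    #include <cstdint>

    // Scalar sketch of the SSE2 select fallback used above.
    static uint32_t select_bits(uint32_t mask, uint32_t t, uint32_t e) {
        return (mask & t) | (~mask & e);  // blendv-free select: and/andnot/or
    }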
@@ -372,7 +386,19 @@ template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src)
return _mm_packus_epi16(src.fVec, src.fVec);
}
-template<> inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
+template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {
+ return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
+}
+
+template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
+ // TODO: merge with other work exploring best int -> uint16_t conversion.
+
+ // Sign extend to trick _mm_packs_epi32() into doing the pack we want.
+ __m128i x = _mm_srai_epi32(_mm_slli_epi32(src.fVec, 16), 16);
+ return _mm_packs_epi32(x,x);
+}
+
+template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
}
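The shift pair in SkNx_cast<uint16_t, int> exists because _mm_packs_epi32
saturates each lane to the int16 range, so lanes outside it would clamp
rather than truncate. Sign-extending the low 16 bits first guarantees every
lane fits, turning the saturating pack into a plain truncation. A scalar
sketch of the same trick (the helper name is illustrative):

    #include <cstdint>

    // Per-lane sketch: sign-extend the low 16 bits (as the slli/srai pair
    // does), so a saturating int16 pack returns the value unchanged.
    // Reinterpreted as uint16_t, that is exactly the low 16 bits of the lane.
    static uint16_t pack_lane(int32_t v) {
        int32_t x = (int16_t)(v & 0xffff);  // now in [-32768, 32767]
        return (uint16_t)x;                 // lossless through a saturating pack
    }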