path: root/Eigen/src/Core/arch/SSE
author Rasmus Munk Larsen <rmlarsen@google.com> 2020-09-30 13:33:44 -0700
committer Rasmus Munk Larsen <rmlarsen@google.com> 2020-09-30 13:33:44 -0700
commit 44b9d4e412b1afe110998aab3dff64d1c0ff0710 (patch)
tree eddcb1f849512fc611a55fdf8d4283b690f87244 /Eigen/src/Core/arch/SSE
parent d5a0d894915c19199590b32149834351b60e2fd1 (diff)
Specialize pldexp_double and pfrexp_double and get rid of the Packet2l definition for SSE. SSE does not support conversion between 64-bit integers and double, and the existing implementation of casting between Packet2d and Packet2l results in undefined behavior when casting NaN to int. Since pldexp and pfrexp only manipulate exponent fields that fit in 32 bits, this change provides specializations that use the existing instructions _mm_cvtpd_pi32 and _mm_cvtepi32_pd instead.
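As context for the fix (not part of the commit): the C++ standard makes float-to-integer conversion undefined whenever the value cannot be represented in the destination type, which is exactly what happened when the old Packet2d-to-Packet2l cast met a NaN. A minimal scalar sketch of the hazard, with a made-up function name:

#include <cmath>
#include <cstdint>

int64_t scalar_cast(double x) {
  // Undefined behavior when x is NaN, an infinity, or outside the range
  // of int64_t ([conv.fpint] in the C++ standard). The removed
  // pcast<Packet2d, Packet2l> below performed this same cast lane by lane.
  return static_cast<int64_t>(x);
}

int main() {
  int64_t i = scalar_cast(std::nan(""));  // UB: no valid result exists
  (void)i;
}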
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rwxr-xr-x Eigen/src/Core/arch/SSE/PacketMath.h | 24
-rw-r--r-- Eigen/src/Core/arch/SSE/TypeCasting.h | 43
2 files changed, 17 insertions, 50 deletions
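The two specializations in the diff below only read or write the 11-bit biased exponent field of an IEEE-754 double, which is why 32-bit integer conversions suffice. A scalar model of the same idea (a sketch with hypothetical names, not Eigen code; like the packet versions, it assumes normalized inputs and an in-range exponent, with no clamping):

#include <cstdint>
#include <cstring>

// pldexp idea: build 2^e directly in the exponent field, then multiply.
double ldexp_sketch(double a, int e) {
  // Biased exponent e + 1023 shifted into bits 62..52, zero mantissa.
  uint64_t bits = static_cast<uint64_t>(e + 1023) << 52;
  double two_to_e;
  std::memcpy(&two_to_e, &bits, sizeof(bits));
  return a * two_to_e;  // a * 2^e, as in "return a * 2^exponent" below
}

// pfrexp idea: read the exponent field, then overwrite it with the
// exponent of 0.5 so the returned mantissa lies in [0.5, 1).
double frexp_sketch(double a, double& exponent) {
  uint64_t bits;
  std::memcpy(&bits, &a, sizeof(a));
  // Unbias with 1022 (not 1023), matching cst_1022d below, so that
  // a == m * 2^exponent with m in [0.5, 1).
  exponent = static_cast<double>((bits >> 52) & 0x7ff) - 1022.0;
  // Clear the exponent field, keep sign and mantissa, set exponent of 0.5.
  bits = (bits & ~(0x7ffull << 52)) | (0x3feull << 52);
  double m;
  std::memcpy(&m, &bits, sizeof(m));
  return m;
}

Both helpers keep the exponent arithmetic in a plain int, mirroring how the packet code gets by with _mm_cvtpd_pi32 and _mm_cvtepi32_pd instead of any 64-bit conversion.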
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index fb584c2af..64479f5d4 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -45,11 +45,11 @@ typedef __m128d Packet2d;
typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
-typedef eigen_packet_wrapper<__m128i, 2> Packet2l;
template<> struct is_arithmetic<__m128> { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+template<> struct is_arithmetic<Packet4i> { enum { value = true }; };
template<> struct is_arithmetic<Packet16b> { enum { value = true }; };
#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
@@ -194,7 +194,6 @@ template<> struct unpacket_traits<Packet4f> {
template<> struct unpacket_traits<Packet2d> {
typedef double type;
typedef Packet2d half;
- typedef Packet2l integer_packet;
enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
};
template<> struct unpacket_traits<Packet4i> {
@@ -487,9 +486,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con
template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return _mm_srai_epi32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) { return _mm_srli_epi32(a,N); }
template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return _mm_slli_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) { return _mm_srli_epi64(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return _mm_slli_epi64(a,N); }
-
#ifdef EIGEN_VECTORIZE_SSE4_1
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
@@ -756,15 +752,29 @@ template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Pack
}
template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
- return pfrexp_double(a,exponent);
+ const Packet2d cst_1022d = pset1<Packet2d>(1022.0);
+ const Packet2d cst_half = pset1<Packet2d>(0.5);
+ const Packet2d cst_inv_mant_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
+ __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(a), 52);
+ exponent = psub(_mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3)), cst_1022d);
+ return por(pand(a, cst_inv_mant_mask), cst_half);
}
template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
return pldexp_float(a,exponent);
}
+// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
- return pldexp_double(a,exponent);
+ const Packet2d cst_1023 = pset1<Packet2d>(1023.0);
+ // Add exponent offset.
+ __m64 ei = _mm_cvtpd_pi32(padd(exponent, cst_1023));
+ // Convert the exponents to int64 and swizzle them to the low-order 32 bits.
+ __m128i el = _mm_set_epi64(_mm_setzero_si64(), ei);
+ el = vec4i_swizzle1(el, 0, 3, 1, 3);
+ // return a * 2^exponent
+ return pmul(a, _mm_castsi128_pd(_mm_slli_epi64(el, 52)));
}
// with AVX, the default implementations based on pload1 are faster
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index bb68986c2..3e6cd90e5 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -69,19 +69,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f
return _mm_cvtps_pd(a);
}
-template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
- // Using a[1]/a[0] to get the high/low 64 bits from __m128d is faster than _mm_cvtsd_f64(), but
- // it triggers the bug report at https://gitlab.com/libeigen/eigen/-/issues/1997, since the
- // a[index] operation is not supported by the MSVC compiler (it is supported by gcc).
-#if EIGEN_COMP_MSVC
- return _mm_set_epi64x(int64_t(_mm_cvtsd_f64(_mm_unpackhi_pd(a,a))), int64_t(_mm_cvtsd_f64(a)));
-#elif ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
- return _mm_set_epi64x(int64_t(a.m_val[1]), int64_t(a.m_val[0]));
-#else
- return _mm_set_epi64x(int64_t(a[1]), int64_t(a[0]));
-#endif
-}
-
template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
return _mm_castps_si128(a);
}
@@ -90,36 +77,6 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Pa
return _mm_castsi128_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l,Packet2d>(const Packet2d& a) {
- return _mm_castpd_si128(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
- return _mm_castsi128_pd(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
- // AVX512DQ finally provides an instruction for this
- return _mm_cvtepi64_pd(a);
-#else
- // Before AVX512, there is no packed epi64 to double cast instruction
- // The idea is to convert upper and lower half separately, via bit-twiddling
- // then add them together, but remove the offsets
- Packet2d upper = preinterpret<Packet2d>(plogical_shift_right<32>(a));
- Packet2d lower = pand(pset1frombits<Packet2d>(static_cast<uint64_t>(0xffffffffULL)), preinterpret<Packet2d>(a));
- // upper = 2**(53+32) + ((a >> 32) + 0x80000000)
- upper = pxor(pset1frombits<Packet2d>(static_cast<uint64_t>(0x4530000080000000ULL)), upper); // exponent of 52+32, and xor the upper bit of 32bit mantissa
- // lower = 2**53 + (a & 0xffffffff)
- lower = pxor(pset1frombits<Packet2d>(static_cast<uint64_t>(0x4330000000000000ULL)), lower); // exponent of 52
- // adding upper+lower would be 2**84+2**63+2**52 too big. Create the negative of that:
- Packet2d offset = pset1frombits<Packet2d>(static_cast<uint64_t>(0xC530000080100000ULL));
- // add everything together, start with the bigger numbers, since the 2**84 will cancel out, giving an exact result
- return padd(padd(offset, upper), lower);
-#endif
-}
-
// Disable the following code since it's broken on too many platforms / compilers.
//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#if 0
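For reference, the bit-twiddling int64-to-double trick removed above can be written out in scalar form. A sketch under the same IEEE-754 assumptions (hypothetical names, not Eigen code), following the constants and comments of the removed pcast<Packet2l, Packet2d>:

#include <cstdint>
#include <cstring>

static double from_bits(uint64_t b) {
  double d;
  std::memcpy(&d, &b, sizeof(d));
  return d;
}

// Convert an int64 to double by planting each 32-bit half in the
// mantissa of a large power of two, then cancelling the offsets.
double int64_to_double_sketch(int64_t a) {
  uint64_t u = static_cast<uint64_t>(a);
  // upper = 2^84 + 2^63 + 2^32 * (signed high half); the xor flips the
  // sign bit of the high word, turning it into an unsigned offset.
  double upper = from_bits(0x4530000080000000ull ^ (u >> 32));
  // lower = 2^52 + (unsigned low half).
  double lower = from_bits(0x4330000000000000ull | (u & 0xffffffffull));
  // offset = -(2^84 + 2^63 + 2^52) cancels the planted constants.
  double offset = from_bits(0xC530000080100000ull);
  // Sum largest-first: the big terms cancel exactly, and the final
  // addition is the correctly rounded double value of a.
  return (offset + upper) + lower;
}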