diff options
author | Christoph Hertzberg <chtz@informatik.uni-bremen.de> | 2020-09-27 23:02:34 +0200 |
---|---|---|
committer | Christoph Hertzberg <chtz@informatik.uni-bremen.de> | 2020-09-28 22:14:02 +0000 |
commit | 6b0c0b587ed8aaae72ef6a383617789d27cd1708 (patch) | |
tree | a446b36002743e725cd7b5e095ea182cfae9b4ee /Eigen | |
parent | 6425e875a1158e1e2a0afcf703105e8ddbfee7bd (diff) |
Provide a more efficient Packet2l->Packet2d cast method
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/arch/SSE/TypeCasting.h | 37 |
1 file changed, 22 insertions, 15 deletions
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h index 1f01d3c28..708385f1a 100644 --- a/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -82,21 +82,6 @@ template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d #endif } -template <> -EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) { -#ifdef EIGEN_VECTORIZE_SSE4_1 - int64_t a0 = _mm_extract_epi64(a, 0); - int64_t a1 = _mm_extract_epi64(a, 1); -#elif EIGEN_ARCH_x86_64 - int64_t a0 = _mm_cvtsi128_si64(a); - int64_t a1 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(a, a)); -#else - int64_t a0 = a.m_val[0]; - int64_t a1 = a.m_val[1]; -#endif - return _mm_set_pd(static_cast<double>(a1), static_cast<double>(a0)); -} - template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) { return _mm_castps_si128(a); } @@ -113,6 +98,28 @@ template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const P return _mm_castsi128_pd(a); } +template <> +EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + // AVX512DQ finally provides an instruction for this + return _mm_cvtepi64_pd(a); +#else + // Before AVX512, there is no packed epi64 to double cast instruction + // The idea is to convert upper and lower half separately, via bit-twiddling + // then add them together, but remove the offsets + Packet2d upper = preinterpret<Packet2d>(plogical_shift_right<32>(a)); + Packet2d lower = pand(pset1frombits<Packet2d>(0xffffffffUL), preinterpret<Packet2d>(a)); + // upper = 2**(52+32) + 2**32 * ((a >> 32) ^ 0x80000000) + upper = pxor(pset1frombits<Packet2d>(0x4530000080000000UL), upper); // exponent of 52+32, and xor the upper bit of 32bit mantissa + // lower = 2**52 + (a & 0xffffffff) + lower = pxor(pset1frombits<Packet2d>(0x4330000000000000UL), lower); // exponent of 52 + // adding upper+lower would be 2**84+2**63+2**52 too big. 
Create the negative of that: + Packet2d offset = pset1frombits<Packet2d>(0xC530000080100000UL); + // add everything together, start with the bigger numbers, since the 2**84 will cancel out, giving an exact result + return padd(padd(offset, upper), lower); +#endif +} + // Disable the following code since it's broken on too many platforms / compilers. //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) #if 0 |