From 6b0c0b587ed8aaae72ef6a383617789d27cd1708 Mon Sep 17 00:00:00 2001
From: Christoph Hertzberg
Date: Sun, 27 Sep 2020 23:02:34 +0200
Subject: Provide a more efficient Packet2l->Packet2d cast method

---
 Eigen/src/Core/arch/SSE/TypeCasting.h | 37 +++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 15 deletions(-)

(limited to 'Eigen/src/Core/arch/SSE')

diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index 1f01d3c28..708385f1a 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -82,21 +82,6 @@ template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d
 #endif
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  int64_t a0 = _mm_extract_epi64(a, 0);
-  int64_t a1 = _mm_extract_epi64(a, 1);
-#elif EIGEN_ARCH_x86_64
-  int64_t a0 = _mm_cvtsi128_si64(a);
-  int64_t a1 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(a, a));
-#else
-  int64_t a0 = a.m_val[0];
-  int64_t a1 = a.m_val[1];
-#endif
-  return _mm_set_pd(static_cast<double>(a1), static_cast<double>(a0));
-}
-
 template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
   return _mm_castps_si128(a);
 }
@@ -113,6 +98,28 @@ template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const P
   return _mm_castsi128_pd(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
+#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(__AVX512VL__)
+  // Packed int64 -> double in one instruction; the 128-bit form needs AVX512DQ together with AVX512VL
+  return _mm_cvtepi64_pd(a);
+#else
+  // Otherwise convert the upper and lower 32-bit halves separately via bit-twiddling,
+  // then add them together and remove the offsets. Bit patterns are cast to uint64_t so
+  // pset1frombits gets a 64-bit argument even where unsigned long is only 32 bits wide
+  Packet2d upper = preinterpret<Packet2d>(plogical_shift_right<32>(a));
+  Packet2d lower = pand(pset1frombits<Packet2d>(static_cast<uint64_t>(0xffffffff)), preinterpret<Packet2d>(a));
+  // upper = 2**84 + 2**32 * ((a >> 32) ^ 0x80000000) = 2**84 + 2**63 + 2**32 * (signed high half of a)
+  upper = pxor(pset1frombits<Packet2d>(static_cast<uint64_t>(0x4530000080000000)), upper); // exponent of 52+32, and xor the upper bit of 32bit mantissa
+  // lower = 2**52 + (a & 0xffffffff)
+  lower = pxor(pset1frombits<Packet2d>(static_cast<uint64_t>(0x4330000000000000)), lower); // exponent of 52
+  // adding upper+lower would be 2**84+2**63+2**52 too big. Create the negative of that:
+  Packet2d offset = pset1frombits<Packet2d>(static_cast<uint64_t>(0xC530000080100000));
+  // add everything together, bigger numbers first: offset+upper cancels the 2**84 exactly, so only the final add rounds
+  return padd(padd(offset, upper), lower);
+#endif
+}
+
 // Disable the following code since it's broken on too many platforms / compilers.
 //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
 #if 0
--
cgit v1.2.3