aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/SSE
diff options
context:
space:
mode:
authorGravatar Christoph Hertzberg <chtz@informatik.uni-bremen.de>2020-09-27 23:02:34 +0200
committerGravatar Christoph Hertzberg <chtz@informatik.uni-bremen.de>2020-09-28 22:14:02 +0000
commit6b0c0b587ed8aaae72ef6a383617789d27cd1708 (patch)
treea446b36002743e725cd7b5e095ea182cfae9b4ee /Eigen/src/Core/arch/SSE
parent6425e875a1158e1e2a0afcf703105e8ddbfee7bd (diff)
Provide a more efficient Packet2l->Packet2d cast method
Diffstat (limited to 'Eigen/src/Core/arch/SSE')
-rw-r--r--Eigen/src/Core/arch/SSE/TypeCasting.h37
1 files changed, 22 insertions, 15 deletions
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index 1f01d3c28..708385f1a 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -82,21 +82,6 @@ template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d
#endif
}
-template <>
-EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
-#ifdef EIGEN_VECTORIZE_SSE4_1
- int64_t a0 = _mm_extract_epi64(a, 0);
- int64_t a1 = _mm_extract_epi64(a, 1);
-#elif EIGEN_ARCH_x86_64
- int64_t a0 = _mm_cvtsi128_si64(a);
- int64_t a1 = _mm_cvtsi128_si64(_mm_unpackhi_epi64(a, a));
-#else
- int64_t a0 = a.m_val[0];
- int64_t a1 = a.m_val[1];
-#endif
- return _mm_set_pd(static_cast<double>(a1), static_cast<double>(a0));
-}
-
template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
return _mm_castps_si128(a);
}
@@ -113,6 +98,28 @@ template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const P
return _mm_castsi128_pd(a);
}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+ // AVX512DQ finally provides an instruction for this
+ return _mm_cvtepi64_pd(a);
+#else
+ // Before AVX512, there is no packed epi64 to double cast instruction
+ // The idea is to convert upper and lower half separately, via bit-twiddling
+ // then add them together, but remove the offsets
+ Packet2d upper = preinterpret<Packet2d>(plogical_shift_right<32>(a));
+ Packet2d lower = pand(pset1frombits<Packet2d>(0xffffffffUL), preinterpret<Packet2d>(a));
+ // upper = 2**(53+32) + ((a >> 32) + 0x80000000)
+ upper = pxor(pset1frombits<Packet2d>(0x4530000080000000UL), upper); // exponent of 52+32, and xor the upper bit of 32bit mantissa
+ // lower = 2**53 + (a & 0xffffffff)
+ lower = pxor(pset1frombits<Packet2d>(0x4330000000000000UL), lower); // exponent of 52
+ // adding upper+lower would be 2**84+2**63+2**52 too big. Create the negative of that:
+ Packet2d offset = pset1frombits<Packet2d>(0xC530000080100000UL);
+ // add everything together, start with the bigger numbers, since the 2**84 will cancel out, giving an exact result
+ return padd(padd(offset, upper), lower);
+#endif
+}
+
// Disable the following code since it's broken on too many platforms / compilers.
//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#if 0