From 21edea5eddb282f12d38938d657973b1f8720779 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen
Date: Thu, 15 Oct 2020 18:39:58 -0700
Subject: Fix the specialization of pfrexp for AVX to be faster when AVX2/AVX512DQ is not available, and avoid undefined behavior in C++. Also mask off the sign bit when extracting the exponent.

---
 Eigen/src/Core/arch/SSE/PacketMath.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'Eigen/src/Core/arch/SSE')

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 602adbad3..bd354c407 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -805,10 +805,11 @@ template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Pack
 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
   const Packet2d cst_1022d = pset1<Packet2d>(1022.0);
   const Packet2d cst_half = pset1<Packet2d>(0.5);
-  const Packet2d cst_inv_mant_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
-  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(a), 52);
+  const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
   exponent = psub(_mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3)), cst_1022d);
-  return por(pand(a, cst_inv_mant_mask), cst_half);
+  const Packet2d cst_mant_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(~0x7ff0000000000000ull));
+  return por(pand(a, cst_mant_mask), cst_half);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
-- 
cgit v1.2.3