Fix ldexp implementations.

The previous implementations produced garbage values if the exponent did not fit within the exponent bits. See #2131 for a complete discussion, and !375 for other possible implementations. Here we implement the 4-factor version. See `pldexp_impl` in `GenericPacketMathFunctions.h` for a full description. The SSE `pcmp*` methods were moved down since `pcmp_le<Packet4i>` requires `por`. Left as a "TODO" is to delegate to a faster version if we know the exponent does fit within the exponent bits. Fixes #2131.
author: Antonio Sanchez <cantonios@google.com> 2020-10-12 12:24:08 +0100
committer: Rasmus Munk Larsen <rmlarsen@google.com> 2021-02-10 22:45:41 +0000
commit: 4cb563a01e0619ea1798c7927f1909755ead2dd8 (patch)
tree: f1a1c213a13ad6320fa86ebb144af777568eeeea /Eigen/src/Core/arch/AVX512
parent: 7eb07da538ecc1b8937bfb5dac0d071067728397 (diff)
1 files changed, 23 insertions, 10 deletions
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index fa43d8809..b9e9fdbfd 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -917,16 +917,29 @@ template<> EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, c
 }
 
 template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
-  // Build e=2^n by constructing the exponents in a 256-bit vector and
-  // shifting them to where they belong in double-precision values.
-  Packet8i cst_1023 = pset1<Packet8i>(1023);
-  __m256i emm0 = _mm512_cvtpd_epi32(exponent);
-  emm0 = _mm256_add_epi32(emm0, cst_1023);
-  emm0 = _mm256_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
-  __m256i lo = _mm256_slli_epi64(emm0, 52);
-  __m256i hi = _mm256_slli_epi64(_mm256_srli_epi64(emm0, 32), 52);
-  __m512d b = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
-  return pmul(a, b);
+  // Clamp exponent to [-2099, 2099]
+  const Packet8d max_exponent = pset1<Packet8d>(2099.0);
+  const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  
+  // Split 2^e into four factors and multiply.
+  const Packet8i bias = pset1<Packet8i>(1023);
+  Packet8i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+  
+  // 2^b
+  Packet8i hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0));
+  Packet8i lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  Packet8d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+  
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0));
+  lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  out = pmul(out, c);  // a * 2^e
+  return out;
 }
 
 #ifdef EIGEN_VECTORIZE_AVX512DQ
author	Antonio Sanchez <cantonios@google.com>	2020-10-12 12:24:08 +0100
committer	Rasmus Munk Larsen <rmlarsen@google.com>	2021-02-10 22:45:41 +0000
commit	4cb563a01e0619ea1798c7927f1909755ead2dd8 (patch)
tree	f1a1c213a13ad6320fa86ebb144af777568eeeea /Eigen/src/Core/arch/AVX512
parent	7eb07da538ecc1b8937bfb5dac0d071067728397 (diff)