From 4cb563a01e0619ea1798c7927f1909755ead2dd8 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Mon, 12 Oct 2020 12:24:08 +0100 Subject: Fix ldexp implementations. The previous implementations produced garbage values if the exponent did not fit within the exponent bits. See #2131 for a complete discussion, and !375 for other possible implementations. Here we implement the 4-factor version. See `pldexp_impl` in `GenericPacketMathFunctions.h` for a full description. The SSE `pcmp*` methods were moved down since `pcmp_le` requires `por`. Left as a "TODO" is to delegate to a faster version if we know the exponent does fit within the exponent bits. Fixes #2131. --- Eigen/src/Core/GenericPacketMath.h | 24 +--- Eigen/src/Core/arch/AVX/PacketMath.h | 44 ++++-- Eigen/src/Core/arch/AVX512/PacketMath.h | 33 +++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 150 +++++++++++++++++---- .../Core/arch/Default/GenericPacketMathFunctions.h | 110 ++++++++++++--- .../arch/Default/GenericPacketMathFunctionsFwd.h | 21 ++- Eigen/src/Core/arch/SSE/PacketMath.h | 53 +++++--- test/packetmath.cpp | 35 ++++- 8 files changed, 356 insertions(+), 114 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 7160fa084..f8c7826db 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -453,7 +453,7 @@ EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) { return result; } -/** \internal \returns a * 2^exponent +/** \internal \returns a * 2^((int)exponent) * See https://en.cppreference.com/w/cpp/numeric/math/ldexp */ template EIGEN_DEVICE_FUNC inline Packet @@ -905,28 +905,6 @@ pblend(const Selector::size>& ifPacket, const Packet& th return ifPacket.select[0] ? thenPacket : elsePacket; } -/*************************************************************************** - * Some generic implementations to be used by implementors -***************************************************************************/ - -/** Default implementation of pfrexp for float. - * It is expected to be called by implementers of template<> pfrexp. - */ -template EIGEN_STRONG_INLINE Packet -pfrexp_float(const Packet& a, Packet& exponent); - -/** Default implementation of pldexp for float. - * It is expected to be called by implementers of template<> pldexp. - */ -template EIGEN_STRONG_INLINE Packet -pldexp_float(Packet a, Packet exponent); - -/** Default implementation of pldexp for double. - * It is expected to be called by implementers of template<> pldexp. - */ -template EIGEN_STRONG_INLINE Packet -pldexp_double(Packet a, Packet exponent); - } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 2299e4d48..34bc242ca 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -273,6 +273,15 @@ template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const Packet8i& b) { +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_sub_epi32(a,b); +#else + __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); + __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { @@ -379,6 +388,7 @@ template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const return _mm256_min_pd(b,a); #endif } + template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 // See pmin above @@ -756,17 +766,29 @@ template<> EIGEN_STRONG_INLINE Packet8f pldexp(const Packet8f& a, cons } template<> EIGEN_STRONG_INLINE Packet4d pldexp(const Packet4d& a, const Packet4d& exponent) { - // Build e=2^n by constructing the exponents in a 128-bit vector and - // shifting them to where they belong in double-precision values. - Packet4i cst_1023 = pset1(1023); - __m128i emm0 = _mm256_cvtpd_epi32(exponent); - emm0 = _mm_add_epi32(emm0, cst_1023); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i lo = _mm_slli_epi64(emm0, 52); - __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); - __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); - e = _mm256_insertf128_si256(e, hi, 1); - return pmul(a,_mm256_castsi256_pd(e)); + // Clamp exponent to [-2099, 2099] + const Packet4d max_exponent = pset1(2099.0); + const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + + // Split 2^e into four factors and multiply. + const Packet4i bias = pset1(1023); + Packet4i b = parithmetic_shift_right<2>(e); // floor(e/4) + + // 2^b + Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3); + Packet4i lo = _mm_slli_epi64(hi, 52); + hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52); + Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)); + Packet4d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + + // 2^(e - 3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3); + lo = _mm_slli_epi64(hi, 52); + hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52); + c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1)); + out = pmul(out, c); // a * 2^e + return out; } template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index fa43d8809..b9e9fdbfd 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -917,16 +917,29 @@ template<> EIGEN_STRONG_INLINE Packet16f pldexp(const Packet16f& a, c } template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, const Packet8d& exponent) { - // Build e=2^n by constructing the exponents in a 256-bit vector and - // shifting them to where they belong in double-precision values. - Packet8i cst_1023 = pset1(1023); - __m256i emm0 = _mm512_cvtpd_epi32(exponent); - emm0 = _mm256_add_epi32(emm0, cst_1023); - emm0 = _mm256_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); - __m256i lo = _mm256_slli_epi64(emm0, 52); - __m256i hi = _mm256_slli_epi64(_mm256_srli_epi64(emm0, 32), 52); - __m512d b = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); - return pmul(a, b); + // Clamp exponent to [-2099, 2099] + const Packet8d max_exponent = pset1(2099.0); + const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + + // Split 2^e into four factors and multiply. + const Packet8i bias = pset1(1023); + Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4) + + // 2^b + Packet8i hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0)); + Packet8i lo = _mm256_slli_epi64(hi, 52); + hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); + Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); + Packet8d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + + // 2^(e - 3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0)); + lo = _mm256_slli_epi64(hi, 52); + hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); + c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); + out = pmul(out, c); // a * 2^e + return out; } #ifdef EIGEN_VECTORIZE_AVX512DQ diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index fdf4f1e9c..f29b59590 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -2452,38 +2452,134 @@ static inline Packet2l ConvertToPacket2l(const Packet2d& x) { #endif } -template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { - - // build 2^n - Packet2l emm0 = ConvertToPacket2l(exponent); - -#ifdef __POWER8_VECTOR__ - const Packet2l p2l_1023 = { 1023, 1023 }; - const Packet2ul p2ul_52 = { 52, 52 }; - emm0 = vec_add(emm0, p2l_1023); - emm0 = vec_sl(emm0, p2ul_52); + +// Packet2l shifts. +// For POWER8 we simply use vec_sr/l. +// +// Things are more complicated for POWER7. There is actually a +// vec_xxsxdi intrinsic but it is not supported by some gcc versions. +// So we need to shift by N % 32 and rearrage bytes. +#ifdef __POWER8_VECTOR__ + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { + const Packet2ul shift = { N, N }; + return vec_sl(a, shift); +} + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { + const Packet2ul shift = { N, N }; + return vec_sr(a, shift); +} + #else - // Code is a bit complex for POWER7. There is actually a - // vec_xxsldi intrinsic but it is not supported by some gcc versions. - // So we shift (52-32) bits and do a word swap with zeros. - const Packet4i p4i_1023 = pset1(1023); - const Packet4i p4i_20 = pset1(20); // 52 - 32 - - Packet4i emm04i = reinterpret_cast(emm0); - emm04i = vec_add(emm04i, p4i_1023); - emm04i = vec_sl(emm04i, reinterpret_cast(p4i_20)); + +// Shifts [A, B, C, D] to [B, 0, D, 0]. +// Used to implement left shifts for Packet2l. +EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) { static const Packet16uc perm = { - 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, - 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; -#ifdef _BIG_ENDIAN - emm0 = reinterpret_cast(vec_perm(p4i_ZERO, emm04i, perm)); -#else - emm0 = reinterpret_cast(vec_perm(emm04i, p4i_ZERO, perm)); -#endif + 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; + #ifdef _BIG_ENDIAN + return vec_perm(p4i_ZERO, a, perm); + #else + return vec_perm(a, p4i_ZERO, perm); + #endif +} + +// Shifts [A, B, C, D] to [0, A, 0, C]. +// Used to implement right shifts for Packet2l. +EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) { + static const Packet16uc perm = { + 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, + 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b }; + #ifdef _BIG_ENDIAN + return vec_perm(p4i_ZERO, a, perm); + #else + return vec_perm(a, p4i_ZERO, perm); + #endif +} + +template +struct plogical_shift_left_impl; + +template +struct plogical_shift_left_impl= 0)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned n = static_cast(N); + const Packet4ui shift = {n, n, n, n}; + const Packet4i ai = reinterpret_cast(a); + static const unsigned m = static_cast(32 - N); + const Packet4ui shift_right = {m, m, m, m}; + const Packet4i out_hi = vec_sl(ai, shift); + const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right)); + return reinterpret_cast(por(out_hi, out_lo)); + } +}; + +template +struct plogical_shift_left_impl= 32)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned m = static_cast(N - 32); + const Packet4ui shift = {m, m, m, m}; + const Packet4i ai = reinterpret_cast(a); + return reinterpret_cast(shift_even_left(vec_sl(ai, shift))); + } +}; +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { + return plogical_shift_left_impl::run(a); +} + +template +struct plogical_shift_right_impl; + +template +struct plogical_shift_right_impl= 0)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned n = static_cast(N); + const Packet4ui shift = {n, n, n, n}; + const Packet4i ai = reinterpret_cast(a); + static const unsigned m = static_cast(32 - N); + const Packet4ui shift_left = {m, m, m, m}; + const Packet4i out_lo = vec_sr(ai, shift); + const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left)); + return reinterpret_cast(por(out_hi, out_lo)); + } +}; + +template +struct plogical_shift_right_impl= 32)>::type> { + static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) { + static const unsigned m = static_cast(N - 32); + const Packet4ui shift = {m, m, m, m}; + const Packet4i ai = reinterpret_cast(a); + return reinterpret_cast(shift_odd_right(vec_sr(ai, shift))); + } +}; + +template +EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { + return plogical_shift_right_impl::run(a); +} #endif - return pmul(a, reinterpret_cast(emm0)); +template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { + // Clamp exponent to [-2099, 2099] + const Packet2d max_exponent = pset1(2099.0); + const Packet2l e = ConvertToPacket2l(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + + // Split 2^e into four factors and multiply: + const Packet2l bias = { 1023, 1023 }; + Packet2l b = plogical_shift_right<2>(e); // floor(e/4) + Packet2d c = reinterpret_cast(plogical_shift_left<52>(b + bias)); + Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + c = reinterpret_cast(plogical_shift_left<52>(b + bias)); // 2^(e - 3b) + out = pmul(out, c); // a * 2^e + return out; } template<> EIGEN_STRONG_INLINE Packet2d pfrexp (const Packet2d& a, Packet2d& exponent) { diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index b4fa0489b..09146f496 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -40,30 +40,99 @@ pfrexp_double(const Packet& a, Packet& exponent) { typedef typename unpacket_traits::integer_packet PacketI; const Packet cst_1022d = pset1(1022.0); const Packet cst_half = pset1(0.5); - const Packet cst_inv_mant_mask = pset1frombits(static_cast(~0x7ff0000000000000ull)); + const Packet cst_inv_mant_mask = pset1frombits(static_cast(~0x7ff0000000000000ull)); exponent = psub(pcast(plogical_shift_right<52>(preinterpret(pabs(a)))), cst_1022d); return por(pand(a, cst_inv_mant_mask), cst_half); } -template EIGEN_STRONG_INLINE Packet -pldexp_float(Packet a, Packet exponent) -{ +// Safely applies ldexp, correctly handles overflows, underflows and denormals. +// Assumes IEEE floating point format. +template +struct pldexp_impl { typedef typename unpacket_traits::integer_packet PacketI; - const Packet cst_127 = pset1(127.f); - // return a * 2^exponent - PacketI ei = pcast(padd(exponent, cst_127)); - return pmul(a, preinterpret(plogical_shift_left<23>(ei))); -} + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = std::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + Packet run(const Packet& a, const Packet& exponent) { + // We want to return a * 2^exponent, allowing for all possible integer + // exponents without overflowing or underflowing in intermediate + // computations. + // + // Since 'a' and the output can be denormal, the maximum range of 'exponent' + // to consider for a float is: + // -255-23 -> 255+23 + // Below -278 any finite float 'a' will become zero, and above +278 any + // finite float will become inf, including when 'a' is the smallest possible + // denormal. + // + // Unfortunately, 2^(278) cannot be represented using either one or two + // finite normal floats, so we must split the scale factor into at least + // three parts. It turns out to be faster to split 'exponent' into four + // factors, since [exponent>>2] is much faster to compute that [exponent/3]. + // + // Set e = min(max(exponent, -278), 278); + // b = floor(e/4); + // out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b)) + // + // This will avoid any intermediate overflows and correctly handle 0, inf, + // NaN cases. + const Packet max_exponent = pset1(Scalar( (ScalarI(1)<((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1)); // 127 + const PacketI e = pcast(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent)); + PacketI b = parithmetic_shift_right<2>(e); // floor(e/4); + Packet c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^b + Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(e, b), b), b); // e - 3b + c = preinterpret(plogical_shift_left(padd(b, bias))); // 2^(e-3*b) + out = pmul(out, c); + return out; + } +}; -template EIGEN_STRONG_INLINE Packet -pldexp_double(Packet a, Packet exponent) -{ +// Explicitly multiplies +// a * (2^e) +// clamping e to the range +// [std::numeric_limits::min_exponent-2, std::numeric_limits::max_exponent] +// +// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow +// if 2^e doesn't fit into a normal floating-point Scalar. +// +// Assumes IEEE floating point format +template +struct pldexp_fast_impl { typedef typename unpacket_traits::integer_packet PacketI; - const Packet cst_1023 = pset1(1023.0); - // return a * 2^exponent - PacketI ei = pcast(padd(exponent, cst_1023)); - return pmul(a, preinterpret(plogical_shift_left<52>(ei))); -} + typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type ScalarI; + enum { + TotalBits = sizeof(Scalar) * CHAR_BIT, + MantissaBits = std::numeric_limits::digits - 1, + ExponentBits = int(TotalBits) - int(MantissaBits) - 1 + }; + + static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC + Packet run(const Packet& a, const Packet& exponent) { + const Packet bias = pset1(Scalar((ScalarI(1)<<(int(ExponentBits)-1)) - ScalarI(1))); // 127 + const Packet limit = pset1(Scalar((ScalarI(1)<(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127 + // return a * (2^e) + return pmul(a, preinterpret(plogical_shift_left(e))); + } +}; + +template EIGEN_STRONG_INLINE Packet +pldexp_float(const Packet& a, const Packet& exponent) +{ return pldexp_impl::run(a, exponent); } + +template EIGEN_STRONG_INLINE Packet +pldexp_double(const Packet& a, const Packet& exponent) +{ return pldexp_impl::run(a, exponent); } // Natural or base 2 logarithm. // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) @@ -394,6 +463,7 @@ Packet pexp_float(const Packet _x) y = pmadd(y, r2, y2); // Return 2^m * exp(r). + // TODO: replace pldexp with faster implementation since y in [-1, 1). return pmax(pldexp(y,m), _x); } @@ -462,6 +532,7 @@ Packet pexp_double(const Packet _x) // Construct the result 2^n * exp(g) = e * x. The max is used to catch // non-finite values in the input. + // TODO: replace pldexp with faster implementation since x in [-1, 1). return pmax(pldexp(x,fx), _x); } @@ -897,6 +968,8 @@ Packet generic_pow_impl(const Packet& x, const Packet& y) { // Note: I experimented with using Dekker's algorithms for the // multiplication by ln(2) here, but did not see any difference. Packet e_r = pexp(pmul(pset1(Scalar(EIGEN_LN2)), r_z)); + // TODO: investigate bounds of e_r and n_z, potentially using faster + // implementation of ldexp. return pldexp(e_r, n_z); } @@ -909,6 +982,7 @@ Packet generic_pow(const Packet& x, const Packet& y) { const Packet cst_pos_inf = pset1(NumTraits::infinity()); const Packet cst_zero = pset1(Scalar(0)); const Packet cst_one = pset1(Scalar(1)); + const Packet cst_half = pset1(Scalar(0.5)); const Packet cst_nan = pset1(NumTraits::quiet_NaN()); Packet abs_x = pabs(x); @@ -937,7 +1011,7 @@ Packet generic_pow(const Packet& x, const Packet& y) { // Predicates for whether y is integer and/or even. Packet y_is_int = pcmp_eq(pfloor(y), y); - Packet y_div_2 = pldexp(y, pset1(Scalar(-1))); + Packet y_div_2 = pmul(y, cst_half); Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); // Predicates encoding special cases for the value of pow(x,y) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index a623f54cb..96c572fd3 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -21,14 +21,33 @@ namespace internal { template EIGEN_DEVICE_FUNC inline Packet pset(const typename unpacket_traits::type (&a)[N] /* a */); +/*************************************************************************** + * Some generic implementations to be used by implementors +***************************************************************************/ + +/** Default implementation of pfrexp for float. + * It is expected to be called by implementers of template<> pfrexp. + */ template EIGEN_STRONG_INLINE Packet pfrexp_float(const Packet& a, Packet& exponent); +/** Default implementation of pfrexp for double. + * It is expected to be called by implementers of template<> pfrexp. + */ template EIGEN_STRONG_INLINE Packet pfrexp_double(const Packet& a, Packet& exponent); +/** Default implementation of pldexp for float. + * It is expected to be called by implementers of template<> pldexp. + */ +template EIGEN_STRONG_INLINE Packet +pldexp_float(const Packet& a, const Packet& exponent); + +/** Default implementation of pldexp for double. + * It is expected to be called by implementers of template<> pldexp. + */ template EIGEN_STRONG_INLINE Packet -pldexp_float(Packet a, Packet exponent); +pldexp_double(const Packet& a, const Packet& exponent); /** \internal \returns log(x) for single precision float */ template diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 05d9f8edd..422b0fce7 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -399,21 +399,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, con } #endif -template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } - -template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } -template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); } - - template<> EIGEN_STRONG_INLINE Packet4i ptrue(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } template<> EIGEN_STRONG_INLINE Packet16b ptrue(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); } template<> EIGEN_STRONG_INLINE Packet4f @@ -447,6 +432,21 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, con template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } +template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); } + template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may @@ -907,13 +907,22 @@ template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, cons // We specialize pldexp here, since the generic implementation uses Packet2l, which is not well // supported by SSE, and has more range than is needed for exponents. template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { - const Packet2d cst_1023 = pset1(1023.0); - // Add exponent offset. - Packet4i ei = _mm_cvtpd_epi32(padd(exponent, cst_1023)); - // Convert to exponents to int64 and swizzle to the low-order 32 bits. - ei = vec4i_swizzle1(ei, 0, 3, 1, 3); - // return a * 2^exponent - return pmul(a, _mm_castsi128_pd(_mm_slli_epi64(ei, 52))); + // Clamp exponent to [-2099, 2099] + const Packet2d max_exponent = pset1(2099.0); + const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent); + + // Convert e to integer and swizzle to low-order bits. + const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3); + + // Split 2^e into four factors and multiply: + const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023); + Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4) + Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^b + Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b) + b = psub(psub(psub(ei, b), b), b); // e - 3b + c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b) + out = pmul(out, c); // a * 2^e + return out; } // with AVX, the default implementations based on pload1 are faster diff --git a/test/packetmath.cpp b/test/packetmath.cpp index b7562e6a1..c388f3a31 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -573,10 +573,41 @@ void packetmath_real() { data2[i] = Scalar(internal::random(-1, 1)); } for (int i = 0; i < PacketSize; ++i) { - data1[i+PacketSize] = Scalar(internal::random(0, 4)); - data2[i+PacketSize] = Scalar(internal::random(0, 4)); + data1[i+PacketSize] = Scalar(internal::random(-4, 4)); + data2[i+PacketSize] = Scalar(internal::random(-4, 4)); } CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp); + if (PacketTraits::HasExp) { + data1[0] = Scalar(-1); + // underflow to zero + data1[PacketSize] = Scalar(std::numeric_limits::min_exponent-10); + CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp); + // overflow to inf + data1[PacketSize] = Scalar(std::numeric_limits::max_exponent+10); + CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp); + // NaN stays NaN + data1[0] = NumTraits::quiet_NaN(); + CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp); + VERIFY((numext::isnan)(data2[0])); + // inf stays inf + data1[0] = NumTraits::infinity(); + data1[PacketSize] = Scalar(std::numeric_limits::min_exponent-10); + CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp); + // zero stays zero + data1[0] = Scalar(0); + data1[PacketSize] = Scalar(std::numeric_limits::max_exponent+10); + CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp); + // Small number big exponent. + data1[0] = Scalar(std::ldexp(Scalar(1.0), std::numeric_limits::min_exponent-1)); + data1[PacketSize] = Scalar(-std::numeric_limits::min_exponent + +std::numeric_limits::max_exponent); + CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp); + // Big number small exponent. + data1[0] = Scalar(std::ldexp(Scalar(1.0), std::numeric_limits::max_exponent-1)); + data1[PacketSize] = Scalar(+std::numeric_limits::min_exponent + -std::numeric_limits::max_exponent); + CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp); + } for (int i = 0; i < size; ++i) { data1[i] = Scalar(internal::random(-1, 1) * std::pow(10., internal::random(-6, 6))); -- cgit v1.2.3