From c770746d709686ef2b8b652616d9232f9b028e78 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 24 Nov 2020 20:53:07 +0000 Subject: Fix Half NaN definition and test. The `half_float` test was failing with `-mcpu=cortex-a55` (native `__fp16`) due to a bad NaN bit-pattern comparison (in the case of casting a float to `__fp16`, the signaling `NaN` is quieted). There was also an inconsistency between `numeric_limits::quiet_NaN()` and `NumTraits::quiet_NaN()`. Here we correct the inconsistency and compare NaNs according to the IEEE 754 definition. Also modified the `bfloat16_float` test to match. Tested with `cortex-a53` and `cortex-a55`. --- Eigen/src/Core/GenericPacketMath.h | 14 ++++++ Eigen/src/Core/arch/AVX/Complex.h | 36 +++++++++++++- Eigen/src/Core/arch/AVX512/Complex.h | 17 +++++-- .../Core/arch/Default/GenericPacketMathFunctions.h | 56 ++++++++++++++++++++++ .../arch/Default/GenericPacketMathFunctionsFwd.h | 7 +++ Eigen/src/Core/arch/SSE/Complex.h | 38 +++++++++++++-- Eigen/src/Core/arch/SSE/PacketMath.h | 4 ++ test/packetmath.cpp | 15 +++++- 8 files changed, 176 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index e2fc7002b..e8065f8aa 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -539,6 +539,20 @@ inline void pbroadcast2(const typename unpacket_traits::type *a, template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits::type& a) { return a; } +/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0), + where x is the value of all 1-bits. */ +template EIGEN_DEVICE_FUNC inline Packet +peven_mask(const Packet& /*a*/) { + typedef typename unpacket_traits::type Scalar; + const size_t n = unpacket_traits::size; + Scalar elements[n]; + for(size_t i = 0; i < n; ++i) { + memset(elements+i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar)); + } + return ploadu(elements); +} + + /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from) { (*to) = from; } diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 23568cae9..22f2c3de6 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -38,6 +38,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -47,7 +48,18 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cf half; + typedef Packet8f real; + enum { + size=4, + alignment=Aligned32, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -228,6 +240,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -237,7 +250,18 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet1cd half; + typedef Packet4d real; + enum { + size=2, + alignment=Aligned32, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } @@ -399,6 +423,14 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet2cd psqrt(const Packet2cd& a) { + return psqrt_complex(a); +} + +template<> EIGEN_STRONG_INLINE Packet4cf psqrt(const Packet4cf& a) { + return psqrt_complex(a); +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 53ee53d17..bee91c3bb 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -37,6 +37,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -47,6 +48,8 @@ template<> struct packet_traits > : default_packet_traits template<> struct unpacket_traits { typedef std::complex type; + typedef Packet4cf half; + typedef Packet16f real; enum { size = 8, alignment=unpacket_traits::alignment, @@ -54,7 +57,6 @@ template<> struct unpacket_traits { masked_load_available=false, masked_store_available=false }; - typedef Packet4cf half; }; template<> EIGEN_STRONG_INLINE Packet8cf ptrue(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); } @@ -223,6 +225,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -233,6 +236,8 @@ template<> struct packet_traits > : default_packet_traits template<> struct unpacket_traits { typedef std::complex type; + typedef Packet2cd half; + typedef Packet8d real; enum { size = 4, alignment = unpacket_traits::alignment, @@ -240,7 +245,6 @@ template<> struct unpacket_traits { masked_load_available=false, masked_store_available=false }; - typedef Packet2cd half; }; template<> EIGEN_STRONG_INLINE Packet4cd padd(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); } @@ -437,8 +441,15 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0] } -} // end namespace internal +template<> EIGEN_STRONG_INLINE Packet4cd psqrt(const Packet4cd& a) { + return psqrt_complex(a); +} + +template<> EIGEN_STRONG_INLINE Packet8cf psqrt(const Packet8cf& a) { + return psqrt_complex(a); +} +} // end namespace internal } // end namespace Eigen #endif // EIGEN_COMPLEX_AVX512_H diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 6d92d1c72..4e8a42463 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -643,6 +643,62 @@ Packet pcos_float(const Packet& x) return psincos_float(x); } + +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psqrt_complex(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename unpacket_traits::real RealPacket; + + // Computes the principal sqrt of the complex numbers. For clarity, the comments + // below spell out the steps, assuming Packet contains 2 complex numbers, e.g. + // a = [a0_r, a0_i, a1_r, a1_i] + // In other words, the function computes b = [b0_r, b0_i, b1_r, b1_i] such that + // (b0_r + i*b0_i)^2 = a0_r + i*a0_i, and + // (b1_r + i*b1_i)^2 = a1_r + i*a1_i . + + // Step 1. Compute l = [l0, l0, l1, l1], where + // l0 = sqrt(a0_r^2 + a0_i^2), l1 = sqrt(a1_r^2 + a1_i^2) + // To avoid over- and underflow, we use the stable formula for each hypotenuse + // l0 = (x0 == 0 ? x0 : x0 * sqrt(1 + (y0/x0)**2)), + // where x0 = max(|a0_r|, |a0_i|), y0 = min(|a0_r|, |a0_i|) + // and similarly for l1. + Packet a_flip = pcplxflip(a); + Packet zero_mask; + zero_mask.v = pcmp_eq(a.v, pzero(a.v)); + RealPacket a_abs = pabs(a.v); // [|a0_i|, |a0_r|, |a1_i|, |a1_r|] + RealPacket a_abs_flip = pabs(a_flip.v); // [|a0_i|, |a0_r|, |a1_i|, |a1_r|] + RealPacket a_max = pmax(a_abs, a_abs_flip); + RealPacket a_min = pmin(a_abs, a_abs_flip); + RealPacket r = pdiv(a_min, a_max); + RealPacket one = pset1(RealScalar(1)); + RealPacket l = pmul(a_max, psqrt(padd(one, pmul(r, r)))); // [l0, l0, l1, l1] + // Set l to zero if both real and imaginary parts are zero. + l = pandnot(l, pand(zero_mask.v, pcplxflip(zero_mask).v)); + + // Step 2. Compute + // [ sqrt((l0 + a0_r)/2), sqrt((l0 - a0_r)/2), + // sqrt((l1 + a1_r)/2), sqrt((l1 - a1_r)/2) ] + Packet real_mask; + real_mask.v = peven_mask(real_mask.v); + Packet a_real = pand(a, real_mask); + l = padd(l, a_real.v); + l = psub(l, pcplxflip(a_real).v); + l = psqrt(pmul(l, pset1(RealScalar(0.5)))); + // If imag(a) is zero, we mask out the imaginary part, which should be zero. + l = pandnot(l, pandnot(zero_mask.v, real_mask.v)); + + //Step 3. Apply the sign of the imaginary parts of a to get the final result: + // b = [ sqrt((l0 + a0_r)/2), sign(a0_i)*sqrt((l0 - a0_r)/2), + // sqrt((l1 + a1_r)/2), sign(a1_i)*sqrt((l1 - a1_r)/2) ] + RealPacket imag_sign_mask = pset1(Scalar(RealScalar(0.0), RealScalar(-0.0))).v; + RealPacket imag_signs = pand(a.v, imag_sign_mask); + Packet result = Packet(pxor(l, imag_signs)); + return result; +} + /* polevl (modified for Eigen) * * Evaluate polynomial diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 0e02a1b20..cc6fc5ac7 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -70,8 +70,15 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet pcos_float(const Packet& x); +/** \internal \returns sqrt(x) for complex types */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet psqrt_complex(const Packet& x); + template struct ppolevl; + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 0d322a2a1..600488448 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -40,6 +40,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -50,7 +51,18 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet2cf half; + typedef Packet4f real; + enum { + size=2, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -83,7 +95,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con } template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); } - template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } @@ -255,6 +266,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, + HasSqrt = 1, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -264,7 +276,18 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + typedef Packet1cd half; + typedef Packet2d real; + enum { + size=1, + alignment=Aligned16, + vectorizable=true, + masked_load_available=false, + masked_store_available=false + }; +}; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } @@ -426,8 +449,15 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, co return Packet2cf(_mm_castpd_ps(result)); } -} // end namespace internal +template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) { + return psqrt_complex(a); +} +template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { + return psqrt_complex(a); +} + +} // end namespace internal } // end namespace Eigen #endif // EIGEN_COMPLEX_SSE_H diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index ef77ab6fa..b68abec64 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -267,6 +267,10 @@ template<> EIGEN_STRONG_INLINE Packet16b pset1(const bool& from) { template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return _mm_castsi128_ps(pset1(from)); } template<> EIGEN_STRONG_INLINE Packet2d pset1frombits(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); } +template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { + return Packet4f(_mm_set_epi32(0, 0xffffffff, 0, 0xffffffff)); +} + template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); } template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); } template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index d52f997dc..40129075f 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -473,8 +473,6 @@ void packetmath() { CHECK_CWISE3_IF(true, internal::pselect, internal::pselect); } - CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt); - for (int i = 0; i < size; ++i) { data1[i] = internal::random(); } @@ -486,6 +484,11 @@ void packetmath() { packetmath_boolean_mask_ops(); packetmath_pcast_ops_runner::run(); packetmath_minus_zero_add(); + + for (int i = 0; i < size; ++i) { + data1[i] = numext::abs(internal::random()); + } + CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt); } template @@ -966,6 +969,8 @@ void test_conj_helper(Scalar* data1, Scalar* data2, Scalar* ref, Scalar* pval) { template void packetmath_complex() { + typedef internal::packet_traits PacketTraits; + typedef typename Scalar::value_type RealScalar; const int PacketSize = internal::unpacket_traits::size; const int size = PacketSize * 4; @@ -984,11 +989,17 @@ void packetmath_complex() { test_conj_helper(data1, data2, ref, pval); test_conj_helper(data1, data2, ref, pval); + // Test pcplxflip. { for (int i = 0; i < PacketSize; ++i) ref[i] = Scalar(std::imag(data1[i]), std::real(data1[i])); internal::pstore(pval, internal::pcplxflip(internal::pload(data1))); VERIFY(test::areApprox(ref, pval, PacketSize) && "pcplxflip"); } + + for (int i = 0; i < size; ++i) { + data1[i] = Scalar(internal::random(), internal::random()); + } + CHECK_CWISE1_IF(PacketTraits::HasSqrt, numext::sqrt, internal::psqrt); } template -- cgit v1.2.3