diff options
Diffstat (limited to 'Eigen/src')
-rw-r--r-- | Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 16 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 30 | ||||
-rwxr-xr-x | Eigen/src/Core/arch/SSE/PacketMath.h | 30 | ||||
-rw-r--r-- | Eigen/src/Core/util/Macros.h | 64 |
4 files changed, 106 insertions, 34 deletions
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index b1d4be32d..411640ee8 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -630,14 +630,6 @@ __attribute__((optimize("-fno-unsafe-math-optimizations"))) #endif Packet psincos_float(const Packet& _x) { -// Workaround -ffast-math aggressive optimizations -// See bug 1674 -#if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE) -#define EIGEN_SINCOS_DONT_OPT(X) __asm__ ("" : "+x" (X)); -#else -#define EIGEN_SINCOS_DONT_OPT(X) -#endif - typedef typename unpacket_traits<Packet>::integer_packet PacketI; const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI @@ -652,7 +644,7 @@ Packet psincos_float(const Packet& _x) // Rounding trick: Packet y_round = padd(y, cst_rounding_magic); - EIGEN_SINCOS_DONT_OPT(y_round) + EIGEN_OPTIMIZATION_BARRIER(y_round) PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24) y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi @@ -674,9 +666,9 @@ Packet psincos_float(const Packet& _x) // and 2 ULP up to: const float huge_th = ComputeSine ? 25966.f : 18838.f; x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000 - EIGEN_SINCOS_DONT_OPT(x) + EIGEN_OPTIMIZATION_BARRIER(x) x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000 - EIGEN_SINCOS_DONT_OPT(x) + EIGEN_OPTIMIZATION_BARRIER(x) x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000 x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee @@ -753,8 +745,6 @@ Packet psincos_float(const Packet& _x) // Update the sign and filter huge inputs return pxor(y, sign_bit); - -#undef EIGEN_SINCOS_DONT_OPT } template<typename Packet> diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index ec6ea90c5..7d69de6dc 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -3207,20 +3207,30 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet4f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet4f>(-static_cast<float>(1<<23)), - pset1<Packet4f>(+static_cast<float>(1<<23))); - return psub(padd(a, offset), offset); + const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet2f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet2f>(-static_cast<float>(1<<23)), - pset1<Packet2f>(+static_cast<float>(1<<23))); - return psub(padd(a, offset), offset); + const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23)); + const Packet2f abs_a = pabs(a); + Packet2f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index b9821ad80..d7b8bc8ac 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -646,20 +646,30 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { re #else template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { // Adds and subtracts signum(a) * 2^23 to force rounding. - const Packet4f offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet4f>(-static_cast<float>(1<<23)), - pset1<Packet4f>(+static_cast<float>(1<<23))); - return psub(padd(a, offset), offset); + const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23)); + const Packet4f abs_a = pabs(a); + Packet4f r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { // Adds and subtracts signum(a) * 2^52 to force rounding. - const Packet2d offset = - pselect(pcmp_lt(a, pzero(a)), - pset1<Packet2d>(-static_cast<double>(1ull<<52)), - pset1<Packet2d>(+static_cast<double>(1ull<<52))); - return psub(padd(a, offset), offset); + const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52)); + const Packet2d abs_a = pabs(a); + Packet2d r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), + pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; } template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index ac514cbb4..43890eab1 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -51,7 +51,11 @@ #ifndef EIGEN_STACK_ALLOCATION_LIMIT // 131072 == 128 KB -#define EIGEN_STACK_ALLOCATION_LIMIT 131072 +#if defined(__AVX512F__) + #define EIGEN_STACK_ALLOCATION_LIMIT 0 +#else + #define EIGEN_STACK_ALLOCATION_LIMIT 16384 +#endif #endif //------------------------------------------------------------------------------------------ @@ -1063,6 +1067,64 @@ namespace Eigen { #endif +// Acts as a barrier preventing operations involving `X` from crossing. This +// occurs, for example, in the fast rounding trick where a magic constant is +// added then subtracted, which is otherwise compiled away with -ffast-math. +// +// See bug 1674 +#if !defined(EIGEN_OPTIMIZATION_BARRIER) + #if EIGEN_COMP_GNUC + // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html: + // X: Any operand whatsoever. + // r: A register operand is allowed provided that it is in a general + // register. + // g: Any register, memory or immediate integer operand is allowed, except + // for registers that are not general registers. + // w: (AArch32/AArch64) Floating point register, Advanced SIMD vector + // register or SVE vector register. + // x: (SSE) Any SSE register. + // (AArch64) Like w, but restricted to registers 0 to 15 inclusive. + // v: (PowerPC) An Altivec vector register. + // wa:(PowerPC) A VSX register. + // + // "X" (uppercase) should work for all cases, though this seems to fail for + // some versions of GCC for arm/aarch64 with + // "error: inconsistent operand constraints in an 'asm'" + // Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and + // vectors, otherwise + // "error: non-trivial scalar-to-vector conversion, possible invalid + // constraint for vector type" + // + // GCC for ppc64le generates an internal compiler error with x/X/g. + // GCC for AVX generates an internal compiler error with X. + // + // Tested on icc/gcc/clang for sse, avx, avx2, avx512dq + // gcc for arm, aarch64, + // gcc for ppc64le, + // both vectors and scalars. + // + // Note that this is restricted to plain types - this will not work + // directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these, + // you will need to apply to the underlying POD type. + #if EIGEN_ARCH_PPC + // General, Altivec, VSX. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); + #elif EIGEN_ARCH_ARM_OR_ARM64 + // General, NEON. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + #elif EIGEN_ARCH_i386_OR_x86_64 + // General, SSE. + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X)); + #else + // Not implemented for other architectures. + #define EIGEN_OPTIMIZATION_BARRIER(X) + #endif + #else + // Not implemented for other compilers. + #define EIGEN_OPTIMIZATION_BARRIER(X) + #endif +#endif + #if EIGEN_COMP_MSVC // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362. // This workaround is ugly, but it does the job. |