From cc0c38ace87f9b77a21b2ad1b20b0c4f97b24719 Mon Sep 17 00:00:00 2001
From: Benoit Jacob
Date: Tue, 15 Sep 2020 11:07:57 -0400
Subject: Remove old Clang compiler bug work-arounds. The two LLVM bugs
 referenced in the comments here have long been fixed. The workarounds were
 now detrimental because (1) they prevented using fused mul-add on Clang/ARM32
 and (2) the unnecessary 'volatile' in 'asm volatile' prevented legitimate
 reordering by the compiler.

---
 Eigen/src/Core/arch/NEON/PacketMath.h | 31 ++-----------------------------
 1 file changed, 2 insertions(+), 29 deletions(-)

(limited to 'Eigen/src/Core/arch/NEON/PacketMath.h')

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 530adfeec..463ae58ad 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -1010,17 +1010,8 @@ template<> EIGEN_STRONG_INLINE Packet2ul pdiv(const Packet2ul& /*a*/,
   return pset1(0ULL);
 }
 
-// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
-// then implements a slow software scalar fallback calling fmaf()!
-// Filed LLVM bug:
-// https://llvm.org/bugs/show_bug.cgi?id=27216
-#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM)
-// See bug 936.
-// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
-// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
-// MLA is not fused i.e. does 2 roundings.
-// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
-// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
+
+#ifdef __ARM_FEATURE_FMA
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
 { return vfmaq_f32(c,a,b); }
 template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
@@ -1028,25 +1019,7 @@ template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f&
 #else
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
 {
-#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
-  // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
-  // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
-  // -march=armv7-a, that is a very common case.
-  // See e.g. this thread:
-  // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html
-  // Filed LLVM bug:
-  // https://llvm.org/bugs/show_bug.cgi?id=27219
-  Packet4f r = c;
-  asm volatile(
-    "vmla.f32 %q[r], %q[a], %q[b]"
-    : [r] "+w" (r)
-    : [a] "w" (a),
-      [b] "w" (b)
-    : );
-  return r;
-#else
   return vmlaq_f32(c,a,b);
-#endif
 }
 template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
 {
-- 
cgit v1.2.3