diff options
Diffstat (limited to 'Eigen/src/Core')
-rw-r--r-- | Eigen/src/Core/MathFunctions.h | 33 | ||||
-rw-r--r-- | Eigen/src/Core/NumTraits.h | 8 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CUDA/Half.h | 63 | ||||
-rw-r--r-- | Eigen/src/Core/arch/NEON/PacketMath.h | 41 | ||||
-rw-r--r-- | Eigen/src/Core/functors/UnaryFunctors.h | 2 | ||||
-rw-r--r-- | Eigen/src/Core/util/Macros.h | 4 |
6 files changed, 131 insertions, 20 deletions
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index ec75175ca..e6c7dfa08 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -962,6 +962,15 @@ T (ceil)(const T& x) return ceil(x); } +#ifdef __CUDACC__ +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float ceil(const float &x) { return ::ceilf(x); } + +template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double ceil(const double &x) { return ::ceil(x); } +#endif + + /** Log base 2 for 32 bits positive integers. * Conveniently returns 0 for x==0. */ inline int log2(int x) @@ -1025,7 +1034,7 @@ double tan(const double &x) { return ::tan(x); } template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T abs(const T &x) { +typename NumTraits<T>::Real abs(const T &x) { EIGEN_USING_STD_MATH(abs); return abs(x); } @@ -1053,6 +1062,28 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double &x) { return ::exp(x); } #endif + +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T fmod(const T& a, const T& b) { + EIGEN_USING_STD_MATH(floor); + return fmod(a, b); +} + +#ifdef __CUDACC__ +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float fmod(const float& a, const float& b) { + return ::fmodf(a, b); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double fmod(const double& a, const double& b) { + return ::fmod(a, b); +} +#endif + } // end namespace numext namespace internal { diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index b7b5e7d22..e065fa714 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -153,7 +153,9 @@ template<typename _Real> struct NumTraits<std::complex<_Real> > MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost }; + EIGEN_DEVICE_FUNC static inline Real epsilon() { return NumTraits<Real>::epsilon(); } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); } }; @@ -166,7 +168,7 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar; typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger; typedef ArrayType & Nested; - + enum { IsComplex = NumTraits<Scalar>::IsComplex, IsInteger = NumTraits<Scalar>::IsInteger, @@ -176,8 +178,10 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost, MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost }; - + + EIGEN_DEVICE_FUNC static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); } + EIGEN_DEVICE_FUNC static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); } }; diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 61131828f..212aa0d5d 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -341,6 +341,18 @@ template<> struct is_arithmetic<half> { enum { value = true }; }; } // end namespace internal +template<> struct NumTraits<Eigen::half> + : GenericNumTraits<Eigen::half> +{ + EIGEN_DEVICE_FUNC static inline float dummy_precision() { return 1e-3f; } + EIGEN_DEVICE_FUNC static inline Eigen::half highest() { + return internal::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC static inline Eigen::half lowest() { + return internal::raw_uint16_to_half(0xfbff); + } +}; + // Infinity/NaN checks. namespace numext { @@ -348,7 +360,7 @@ namespace numext { static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { return (a.x & 0x7fff) == 0x7c00; } -static inline EIGEN_HALF_CUDA_H bool (isnan)(const Eigen::half& a) { +static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hisnan(a); #else @@ -361,9 +373,6 @@ static inline EIGEN_HALF_CUDA_H bool (isnan)(const Eigen::half& a) { } // end namespace Eigen // Standard mathematical functions and trancendentals. - -namespace std { - static inline EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) { Eigen::half result; result.x = a.x & 0x7FFF; @@ -375,6 +384,45 @@ static inline EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) { static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { return Eigen::half(::logf(float(a))); } +static inline EIGEN_DEVICE_FUNC Eigen::half sqrt(const Eigen::half& a) { + return Eigen::half(::sqrtf(float(a))); +} +static inline EIGEN_DEVICE_FUNC Eigen::half floor(const Eigen::half& a) { + return Eigen::half(::floorf(float(a))); +} +static inline EIGEN_DEVICE_FUNC Eigen::half ceil(const Eigen::half& a) { + return Eigen::half(::ceilf(float(a))); +} +static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) { + return (Eigen::numext::isnan)(a); +} +static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) { + return (Eigen::numext::isinf)(a); +} +static inline EIGEN_DEVICE_FUNC bool (isfinite)(const Eigen::half& a) { + return !(Eigen::numext::isinf)(a) && !(Eigen::numext::isnan)(a); +} + + +namespace std { + +// Import the standard mathematical functions and trancendentals into the +// into the std namespace. +using ::abs; +using ::exp; +using ::log; +using ::sqrt; +using ::floor; +using ::ceil; + +#if __cplusplus > 199711L +template <> +struct hash<Eigen::half> { + size_t operator()(const Eigen::half& a) const { + return std::hash<unsigned short>()(a.x); + } +}; +#endif } // end namespace std @@ -384,7 +432,14 @@ static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) { __device__ inline Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); } +#endif +// ldg() has an overload for __half, but we also need one for Eigen::half. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320 +static inline EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { + return Eigen::internal::raw_uint16_to_half( + __ldg(reinterpret_cast<const unsigned short*>(ptr))); +} #endif diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index fc4c0d03a..63a2d9f52 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -177,7 +177,9 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co return pset1<Packet4i>(0); } -#ifdef __ARM_FEATURE_FMA +// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, +// then implements a slow software scalar fallback calling fmaf()! +#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) // See bug 936. // FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. // FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. @@ -186,7 +188,25 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co // MLA: 10 GFlop/s ; FMA: 12 GFlops/s. template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } #else -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { +#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM + // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, + // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on + // -march=armv7-a, that is a very common case. + // See e.g. this thread: + // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html + Packet4f r = c; + asm volatile( + "vmla.f32 %q[r], %q[a], %q[b]" + : [r] "+w" (r) + : [a] "w" (a), + [b] "w" (b) + : ); + return r; +#else + return vmlaq_f32(c,a,b); +#endif +} #endif // No FMA instruction for int, so use MLA unconditionally. @@ -532,20 +552,21 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) { #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG -#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__) // Bug 907: workaround missing declarations of the following two functions in the ADK -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vreinterpretq_u64_f64 (float64x2_t __a) +// Defining these functions as templates ensures that if these intrinsics are +// already defined in arm_neon.h, then our workaround doesn't cause a conflict +// and has lower priority in overload resolution. +template <typename T> +uint64x2_t vreinterpretq_u64_f64(T a) { - return (uint64x2_t) __a; + return (uint64x2_t) a; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vreinterpretq_f64_u64 (uint64x2_t __a) +template <typename T> +float64x2_t vreinterpretq_f64_u64(T a) { - return (float64x2_t) __a; + return (float64x2_t) a; } -#endif typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 531beead6..46622f804 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -41,7 +41,7 @@ struct functor_traits<scalar_opposite_op<Scalar> > template<typename Scalar> struct scalar_abs_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op) typedef typename NumTraits<Scalar>::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs(a); } template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pabs(a); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index dbfc9bd37..97627d14c 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -370,8 +370,8 @@ // Does the compiler support const expressions? #ifdef __CUDACC__ -// Const expressions are supported provided that c++11 is enabled and we're using nvcc 7.5 or above -#if defined(__CUDACC_VER__) && __CUDACC_VER__ >= 70500 && __cplusplus > 199711L +// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above +#if __cplusplus > 199711L && defined(__CUDACC_VER__) && (defined(__clang__) || __CUDACC_VER__ >= 70500) #define EIGEN_HAS_CONSTEXPR 1 #endif #elif (defined(__cplusplus) && __cplusplus >= 201402L) || \ |