diff options
Diffstat (limited to 'Eigen/src')
17 files changed, 433 insertions, 101 deletions
diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index bd2361e9a..43f3b84c8 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -35,7 +35,7 @@ template<int Rows, int Cols, int Depth> struct product_type_selector; template<int Size, int MaxSize> struct product_size_category { enum { - #ifndef EIGEN_CUDA_ARCH + #ifndef EIGEN_GPU_COMPILE_PHASE is_large = MaxSize == Dynamic || Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 55b6a89e2..b67c41d8a 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -303,7 +303,9 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu /** \internal tries to do cache prefetching of \a addr */ template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_HIP_DEVICE_COMPILE) + // do nothing +#elif defined(EIGEN_CUDA_ARCH) #if defined(__LP64__) // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); @@ -530,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second) ***************************************************************************/ // Eigen+CUDA does not support complexes. -#ifndef EIGEN_CUDACC +#if !defined(EIGEN_GPUCC) template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b) { return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index a5740334a..f16476a92 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true> template<typename Scalar> struct real_impl : real_default_impl<Scalar> {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_GPU_COMPILE_PHASE) template<typename T> struct real_impl<std::complex<T> > { @@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true> template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_GPU_COMPILE_PHASE) template<typename T> struct imag_impl<std::complex<T> > { @@ -260,7 +260,7 @@ struct conj_default_impl<Scalar,true> template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {}; -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_GPU_COMPILE_PHASE) template<typename T> struct conj_impl<std::complex<T> > { @@ -435,7 +435,12 @@ struct round_retval struct arg_impl { static inline Scalar run(const Scalar& x) { + #if defined(EIGEN_HIP_DEVICE_COMPILE) + // HIP does not seem to have a native device side implementation for the math routine "arg" + using std::arg; + #else EIGEN_USING_STD_MATH(arg); + #endif return arg(x); } }; @@ -768,7 +773,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type isfinite_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -783,7 +788,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type isinf_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -798,7 +803,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type isnan_impl(const T& x) { - #ifdef EIGEN_CUDA_ARCH + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -864,7 +869,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x); namespace numext { -#if (!defined(EIGEN_CUDACC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__) +#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) && !defined(__SYCL_DEVICE_ONLY__) template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -977,7 +982,12 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) { +#if defined(EIGEN_HIPCC) + // no "fminl" on HIP yet + return (x < y) ? x : y; +#else return fminl(x, y); +#endif } template<typename T> @@ -1002,7 +1012,12 @@ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) { +#if defined(EIGEN_HIPCC) + // no "fmaxl" on HIP yet + return (x > y) ? x : y; +#else return fmaxl(x, y); +#endif } #endif @@ -1099,7 +1114,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1157,7 +1172,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1178,7 +1193,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1236,7 +1251,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1264,7 +1279,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1294,7 +1309,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } @@ -1330,7 +1345,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float &x) { return ::expm1f(x); } @@ -1350,7 +1365,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1370,7 +1385,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1390,7 +1405,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1421,7 +1436,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); } EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1452,7 +1467,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); } EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1483,7 +1498,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); } EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1504,7 +1519,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1524,7 +1539,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1542,12 +1557,12 @@ T tanh(const T &x) { #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); } EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); } -#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH +#elif (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1567,7 +1582,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y) EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); } #endif // defined(__SYCL_DEVICE_ONLY__) -#ifdef EIGEN_CUDACC +#if defined(EIGEN_GPUCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 22ad32ae3..0330b5741 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -885,7 +885,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } -#ifndef EIGEN_CUDACC +#ifndef EIGEN_GPUCC template<int LoadMode,typename PacketType> EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -929,7 +929,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } -#ifndef EIGEN_CUDACC +#ifndef EIGEN_GPUCC template<int LoadMode,typename PacketType> EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/GPU/Half.h index c10550050..ab9d27591 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/GPU/Half.h @@ -26,15 +26,15 @@ // Standard 16-bit float type, mostly useful for GPUs. Defines a new -// type Eigen::half (inheriting from CUDA's __half struct) with +// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with // operator overloads such that it behaves basically as an arithmetic // type. It will be quite slow on CPUs (so it is recommended to stay // in fp32 for CPUs, except for simple parameter conversions, I/O // to disk and the likes), but fast on GPUs. -#ifndef EIGEN_HALF_CUDA_H -#define EIGEN_HALF_CUDA_H +#ifndef EIGEN_HALF_GPU_H +#define EIGEN_HALF_GPU_H #if __cplusplus > 199711L #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() @@ -49,16 +49,41 @@ struct half; namespace half_impl { -#if !defined(EIGEN_HAS_CUDA_FP16) +#if !defined(EIGEN_HAS_GPU_FP16) // Make our own __half_raw definition that is similar to CUDA's. struct __half_raw { EIGEN_DEVICE_FUNC __half_raw() : x(0) {} explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; -#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +#elif defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) +// Make a __half_raw definition that is +// ++ compatible with that of Eigen and +// ++ add a implcit conversion to the native __half of the old HIP implementation. +// +// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation. +// +// In the old HIP implementation, +// ++ __half is a typedef of __fp16 +// ++ the "__h*" routines take "__half" arguments +// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make +// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} + union { + unsigned short x; + __half data; + }; + operator __half(void) const { return data; } +}; + #endif +#elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw -typedef __half __half_raw; + typedef __half __half_raw; + #endif #endif EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); @@ -69,8 +94,19 @@ struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {} + #else + EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); } + #endif + #elif defined(EIGEN_HAS_CUDA_FP16) + #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000) EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} + #endif + #endif #endif }; @@ -78,17 +114,38 @@ struct half_base : public __half_raw { // Class definition. struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) - typedef half_impl::__half_raw __half_raw; - #endif + + // Writing this out as separate #if-else blocks to make the code easier to follow + // The same applies to most #if-else blocks in this file +#if !defined(EIGEN_HAS_GPU_FP16) + typedef half_impl::__half_raw __half_raw; +#elif defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) + typedef half_impl::__half_raw __half_raw; + #endif +#elif defined(EIGEN_HAS_CUDA_FP16) + // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP! + // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + typedef half_impl::__half_raw __half_raw; + #endif +#endif EIGEN_DEVICE_FUNC half() {} EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + #elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + #endif + #endif #endif + explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -201,7 +258,8 @@ namespace Eigen { namespace half_impl { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) // Intrinsics for native fp16 support. Note that on current hardware, // these are no faster than fp32 arithmetic (you need to use the half2 @@ -262,7 +320,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { #else // Emulate support for half floats -// Definitions for CPUs and older CUDA, mostly working through conversion +// Definitions for CPUs and older HIP+CUDA, mostly working through conversion // to/from fp32. EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { @@ -342,7 +400,8 @@ union FP32 { }; EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) __half tmp_ff = __float2half(ff); return *(__half_raw*)&tmp_ff; @@ -398,7 +457,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) @@ -432,7 +492,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { return (a.x & 0x7fff) == 0x7c00; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; @@ -448,7 +509,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hexp(a)); #else return half(::expf(float(a))); @@ -458,7 +520,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return half(::hlog(a)); #else return half(::logf(float(a))); @@ -471,7 +534,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -493,14 +557,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -508,7 +574,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hlt(b, a) ? b : a; #else const float f1 = static_cast<float>(a); @@ -517,7 +584,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hlt(a, b) ? b : a; #else const float f1 = static_cast<float>(a); @@ -595,7 +663,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); @@ -629,9 +698,12 @@ struct hash<Eigen::half> { // Add the missing shfl_xor intrinsic -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { - #if EIGEN_CUDACC_VER < 90000 + #if (EIGEN_CUDACC_VER < 90000) || \ + defined(EIGEN_HAS_HIP_FP16) return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); #else return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width)); @@ -640,7 +712,8 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM #endif // ldg() has an overload for __half_raw, but we also need one for Eigen::half. -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( __ldg(reinterpret_cast<const unsigned short*>(ptr))); @@ -648,7 +721,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) #endif -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_GPU_COMPILE_PHASE) namespace Eigen { namespace numext { @@ -674,4 +747,4 @@ bool (isfinite)(const Eigen::half& h) { } // namespace numext #endif -#endif // EIGEN_HALF_CUDA_H +#endif // EIGEN_HALF_GPU_H diff --git a/Eigen/src/Core/arch/CUDA/MathFunctions.h b/Eigen/src/Core/arch/GPU/MathFunctions.h index ff6256ce0..d2b3a2568 100644 --- a/Eigen/src/Core/arch/CUDA/MathFunctions.h +++ b/Eigen/src/Core/arch/GPU/MathFunctions.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H -#define EIGEN_MATH_FUNCTIONS_CUDA_H +#ifndef EIGEN_MATH_FUNCTIONS_GPU_H +#define EIGEN_MATH_FUNCTIONS_GPU_H namespace Eigen { @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog<float4>(const float4& a) { @@ -100,4 +100,4 @@ double2 prsqrt<double2>(const double2& a) } // end namespace Eigen -#endif // EIGEN_MATH_FUNCTIONS_CUDA_H +#endif // EIGEN_MATH_FUNCTIONS_GPU_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index ab8e477f4..ddf37b9c1 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_PACKET_MATH_CUDA_H -#define EIGEN_PACKET_MATH_CUDA_H +#ifndef EIGEN_PACKET_MATH_GPU_H +#define EIGEN_PACKET_MATH_GPU_H namespace Eigen { @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) template<> struct is_arithmetic<float4> { enum { value = true }; }; template<> struct is_arithmetic<double2> { enum { value = true }; }; @@ -338,4 +338,4 @@ ptranspose(PacketBlock<double2,2>& kernel) { } // end namespace Eigen -#endif // EIGEN_PACKET_MATH_CUDA_H +#endif // EIGEN_PACKET_MATH_GPU_H diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 9897bd4e5..b0a72e1f9 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -7,15 +7,16 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H -#define EIGEN_PACKET_MATH_HALF_CUDA_H +#ifndef EIGEN_PACKET_MATH_HALF_GPU_H +#define EIGEN_PACKET_MATH_HALF_GPU_H namespace Eigen { namespace internal { // Most of the following operations require arch >= 3.0 -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) template<> struct is_arithmetic<half2> { enum { value = true }; }; @@ -43,7 +44,18 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return half2half2(from); +#else + return __half2half2(from); +#endif + +#else // EIGEN_CUDA_ARCH return __half2half2(from); +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) { @@ -69,20 +81,46 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __ldg((const half2*)from); +#endif + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 350 return __ldg((const half2*)from); #else return __halves2half2(*(from+0), *(from+1)); #endif + +#endif } template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#endif + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 350 return __halves2half2(__ldg(from+0), __ldg(from+1)); #else return __halves2half2(*(from+0), *(from+1)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) { @@ -117,15 +155,29 @@ ptranspose(PacketBlock<half2,2>& kernel) { } template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __halves2half2(a, __hadd(a, __float2half(1.0f))); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; return __halves2half2(a, __float2half(f)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd2(a, b); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hadd2(a, b); #else @@ -137,9 +189,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons float r2 = a2 + b2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hsub2(a, b); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hsub2(a, b); #else @@ -151,9 +211,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons float r2 = a2 - b2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hneg2(a); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hneg2(a); #else @@ -161,11 +229,19 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { float a2 = __high2float(a); return __floats2half2_rn(-a1, -a2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul2(a, b); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hmul2(a, b); #else @@ -177,9 +253,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons float r2 = a2 * b2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hfma2(a, b, c); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hfma2(a, b, c); #else @@ -193,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con float r2 = a2 * b2 + c2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return h2div(a, b); +#else + return __h2div(a, b); +#endif + +#else // EIGEN_CUDA_ARCH + float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -203,6 +299,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons float r1 = a1 / b1; float r2 = a2 / b2; return __floats2half2_rn(r1, r2); + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) { @@ -226,6 +324,12 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hadd(__low2half(a), __high2half(a)); #else @@ -233,9 +337,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& float a2 = __high2float(a); return Eigen::half(__float2half(a1 + a2)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); @@ -245,9 +359,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha float a2 = __high2float(a); return a1 > a2 ? __low2half(a) : __high2half(a); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); @@ -257,9 +381,17 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha float a2 = __high2float(a); return a1 < a2 ? __low2half(a) : __high2half(a); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hmul(__low2half(a), __high2half(a)); #else @@ -267,6 +399,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha float a2 = __high2float(a); return Eigen::half(__float2half(a1 * a2)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) { @@ -285,7 +419,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) { return __floats2half2_rn(r1, r2); } -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) { @@ -1281,4 +1416,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) { } } -#endif // EIGEN_PACKET_MATH_HALF_CUDA_H +#endif // EIGEN_PACKET_MATH_HALF_GPU_H diff --git a/Eigen/src/Core/arch/CUDA/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 30f870c3d..57a55d08b 100644 --- a/Eigen/src/Core/arch/CUDA/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_TYPE_CASTING_CUDA_H -#define EIGEN_TYPE_CASTING_CUDA_H +#ifndef EIGEN_TYPE_CASTING_GPU_H +#define EIGEN_TYPE_CASTING_GPU_H namespace Eigen { @@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __float2half(a); #else return Eigen::half(a); @@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __float2half(static_cast<float>(a)); #else return Eigen::half(static_cast<float>(a)); @@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(a); #else return static_cast<float>(a); @@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> > -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits<Eigen::half, float> { @@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f } // end namespace Eigen -#endif // EIGEN_TYPE_CASTING_CUDA_H +#endif // EIGEN_TYPE_CASTING_GPU_H diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h new file mode 100644 index 000000000..25375a0a4 --- /dev/null +++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h @@ -0,0 +1,23 @@ +/* + * math_constants.h - + * HIP equivalent of the CUDA header of the same name + */ + +#ifndef __MATH_CONSTANTS_H__ +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ + +#define HIPRT_INF_F __int_as_float(0x7f800000) +#define HIPRT_NAN_F __int_as_float(0x7fffffff) +#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001) +#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff) +#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000) +#define HIPRT_ZERO_F 0.0f +#define HIPRT_ONE_F 1.0f + +/* double precision constants */ +#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000) +#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000) + +#endif diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 1077d8eb0..9765cc763 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef EIGEN_CUDACC +#ifdef EIGEN_GPUCC // FIXME is there some kind of cuda::swap? Scalar t=b; const_cast<Scalar&>(b)=a; a=t; #else diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index 3eae6b8ca..401d597d8 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -436,7 +436,7 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind1st_op(const first_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -455,7 +455,7 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - bind2nd_op(const second_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index b2a71bc6f..767feb99d 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -48,7 +48,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; -EIGEN_DONT_INLINE static void run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -57,7 +57,7 @@ EIGEN_DONT_INLINE static void run( }; template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, @@ -231,7 +231,7 @@ typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; -EIGEN_DONT_INLINE static void run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -240,7 +240,7 @@ EIGEN_DONT_INLINE static void run( }; template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( +EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 8927bd404..64b7be423 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1008,9 +1008,12 @@ namespace Eigen { # define EIGEN_TRY try # define EIGEN_CATCH(X) catch (X) #else -# ifdef EIGEN_CUDA_ARCH +# if defined(EIGEN_CUDA_ARCH) # define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW asm("trap;") +# elif defined(EIGEN_HIP_DEVICE_COMPILE) +# define EIGEN_THROW_X(X) asm("s_trap 0") +# define EIGEN_THROW asm("s_trap 0") # else # define EIGEN_THROW_X(X) std::abort() # define EIGEN_THROW std::abort() diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index aaa05a19c..85bc75da6 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -70,7 +70,20 @@ inline void throw_std_bad_alloc() throw std::bad_alloc(); #else std::size_t huge = static_cast<std::size_t>(-1); + #if defined(EIGEN_HIPCC) + // + // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining), + // and as a consequence the code in the #else block triggers the hipcc warning : + // "no overloaded function has restriction specifiers that are compatible with the ambient context" + // + // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects + // the same on "operator new" + // Reverting code back to the old version in this #if block for the hipcc compiler + // + new int[huge]; + #else ::operator new(huge); + #endif #endif } @@ -156,7 +169,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + result = ::malloc(size); + #else result = std::malloc(size); + #endif + #if EIGEN_DEFAULT_ALIGN_BYTES==16 eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); #endif @@ -174,7 +193,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + ::free(ptr); + #else std::free(ptr); + #endif + #else handmade_aligned_free(ptr); #endif @@ -218,7 +243,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std: { check_that_malloc_is_allowed(); + #if defined(EIGEN_HIP_DEVICE_COMPILE) + void *result = ::malloc(size); + #else void *result = std::malloc(size); + #endif + if(!result && size) throw_std_bad_alloc(); return result; @@ -232,7 +262,11 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr) { + #if defined(EIGEN_HIP_DEVICE_COMPILE) + ::free(ptr); + #else std::free(ptr); + #endif } template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) @@ -493,7 +527,11 @@ template<typename T> struct smart_copy_helper<T,true> { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); + #if defined(EIGEN_HIP_DEVICE_COMPILE) + ::memcpy(target, start, size); + #else std::memcpy(target, start, size); + #endif } }; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 3d2bdd12e..748f24b1e 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -11,9 +11,18 @@ #ifndef EIGEN_META_H #define EIGEN_META_H -#if defined(EIGEN_CUDA_ARCH) -#include <cfloat> -#include <math_constants.h> +#if defined(EIGEN_GPU_COMPILE_PHASE) + + #include <cfloat> + + #if defined(EIGEN_CUDA_ARCH) + #include <math_constants.h> + #endif + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" + #endif + #endif #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L @@ -177,7 +186,7 @@ template<bool Condition, typename T=void> struct enable_if; template<typename T> struct enable_if<true,T> { typedef T type; }; -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_GPU_COMPILE_PHASE) #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -199,13 +208,31 @@ template<> struct numeric_limits<float> EIGEN_DEVICE_FUNC static float epsilon() { return __FLT_EPSILON__; } EIGEN_DEVICE_FUNC - static float (max)() { return CUDART_MAX_NORMAL_F; } + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } EIGEN_DEVICE_FUNC static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { return CUDART_INF_F; } + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } EIGEN_DEVICE_FUNC - static float quiet_NaN() { return CUDART_NAN_F; } + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } }; template<> struct numeric_limits<double> { @@ -216,9 +243,21 @@ template<> struct numeric_limits<double> EIGEN_DEVICE_FUNC static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static double infinity() { return CUDART_INF; } + static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } EIGEN_DEVICE_FUNC - static double quiet_NaN() { return CUDART_NAN; } + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } }; template<> struct numeric_limits<int> { @@ -531,13 +570,13 @@ template<typename T, typename U> struct scalar_product_traits namespace numext { -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_GPU_COMPILE_PHASE) template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_GPU_COMPILE_PHASE) using internal::device::numeric_limits; #else using std::numeric_limits; @@ -557,7 +596,7 @@ T div_ceil(const T &a, const T &b) template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x,const Y& y) { return x == y; } -#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) +#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); } @@ -568,7 +607,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to<double template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } -#if !defined(EIGEN_CUDA_ARCH) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) +#if !defined(EIGEN_GPU_COMPILE_PHASE) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); } diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index a24deb96a..11df14918 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -1299,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif }//end deflation -#ifndef EIGEN_CUDACC +#if !defined(EIGEN_GPUCC) /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm |