From 8fbd47052bcafea612b8ae2841c1de5db738f042 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 6 Jun 2018 10:12:58 -0400 Subject: Adding support for using Eigen in HIP kernels. This commit enables the use of Eigen on HIP kernels / AMD GPUs. Support has been added along the same lines as what already exists for using Eigen in CUDA kernels / NVidia GPUs. Application code needs to explicitly define EIGEN_USE_HIP when using Eigen in HIP kernels. This is because some of the CUDA headers get picked up by default during Eigen compile (irrespective of whether or not the underlying compiler is CUDACC/NVCC, for e.g. Eigen/src/Core/arch/CUDA/Half.h). In order to maintain this behavior, the EIGEN_USE_HIP macro is used to switch to using the HIP version of those header files (see Eigen/Core and unsupported/Eigen/CXX11/Tensor) Use the "-DEIGEN_TEST_HIP" cmake option to enable the HIP specific unit tests. --- Eigen/src/Core/util/Meta.h | 52 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 8 deletions(-) (limited to 'Eigen/src/Core/util/Meta.h') diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 6e5af35c0..ca6fa6ce9 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -16,6 +16,12 @@ #include #endif +#if defined(EIGEN_HIP_DEVICE_COMPILE) +#include +#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" +#endif + + #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L #include #endif @@ -175,7 +181,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -197,13 +203,31 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static float epsilon() { return __FLT_EPSILON__; } EIGEN_DEVICE_FUNC - static float (max)() { return CUDART_MAX_NORMAL_F; } + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } EIGEN_DEVICE_FUNC static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { return CUDART_INF_F; } + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } EIGEN_DEVICE_FUNC - static float quiet_NaN() { return CUDART_NAN_F; } + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } }; template<> struct numeric_limits { @@ -214,9 +238,21 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static double infinity() { return CUDART_INF; } + static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } EIGEN_DEVICE_FUNC - static double quiet_NaN() { return CUDART_NAN; } + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } }; template<> struct numeric_limits { @@ -529,13 +565,13 @@ template struct scalar_product_traits namespace numext { -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_CUDA_ARCH) || 
defined(EIGEN_HIP_DEVICE_COMPILE) using internal::device::numeric_limits; #else using std::numeric_limits; -- cgit v1.2.3 From b6cc0961b17f6204038158c445eddf411c97a3e2 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Thu, 14 Jun 2018 10:21:54 -0400 Subject: updates based on PR feedback There are two major changes (and a few minor ones which are not listed here...see PR discussion for details) 1. Eigen::half implementations for HIP and CUDA have been merged. This means that - `CUDA/Half.h` and `HIP/hcc/Half.h` got merged to a new file `GPU/Half.h` - `CUDA/PacketMathHalf.h` and `HIP/hcc/PacketMathHalf.h` got merged to a new file `GPU/PacketMathHalf.h` - `CUDA/TypeCasting.h` and `HIP/hcc/TypeCasting.h` got merged to a new file `GPU/TypeCasting.h` After this change the `HIP/hcc` directory only contains one file `math_constants.h`. That will go away too once that file becomes a part of the HIP install. 2. new macros EIGEN_GPUCC, EIGEN_GPU_COMPILE_PHASE and EIGEN_HAS_GPU_FP16 have been added and the code has been updated to use them where appropriate. - `EIGEN_GPUCC` is the same as `(EIGEN_CUDACC || EIGEN_HIPCC)` - `EIGEN_GPU_DEVICE_COMPILE` is the same as `(EIGEN_CUDA_ARCH || EIGEN_HIP_DEVICE_COMPILE)` - `EIGEN_HAS_GPU_FP16` is the same as `(EIGEN_HAS_CUDA_FP16 or EIGEN_HAS_HIP_FP16)` --- Eigen/Core | 71 ++++++++-- Eigen/src/Core/GeneralProduct.h | 2 +- Eigen/src/Core/GenericPacketMath.h | 12 +- Eigen/src/Core/MathFunctions.h | 56 ++++---- Eigen/src/Core/ProductEvaluators.h | 18 +-- Eigen/src/Core/arch/GPU/Half.h | 131 ++++++++++++++----- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 145 ++++++++++++++++++++- Eigen/src/Core/arch/GPU/TypeCasting.h | 18 ++- Eigen/src/Core/functors/AssignmentFunctors.h | 2 +- Eigen/src/Core/functors/BinaryFunctors.h | 10 +- Eigen/src/Core/util/BlasUtil.h | 5 +- Eigen/src/Core/util/Memory.h | 8 +- Eigen/src/Core/util/Meta.h | 29 +++-- Eigen/src/SVD/BDCSVD.h | 2 +- test/half_float.cpp | 2 +- test/main.h | 16 ++- .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 12 +- unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 10 +- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 4 +- unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 20 +-- unsupported/Eigen/CXX11/src/util/EmulateArray.h | 2 +- .../src/SpecialFunctions/SpecialFunctionsImpl.h | 4 +- .../arch/CUDA/CudaSpecialFunctions.h | 2 +- unsupported/test/cxx11_tensor_hip.cu | 5 +- 29 files changed, 425 insertions(+), 185 deletions(-) (limited to 'Eigen/src/Core/util/Meta.h') diff --git a/Eigen/Core b/Eigen/Core index c72d5468a..f67bffd12 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -99,6 +99,61 @@ #define EIGEN_DONT_VECTORIZE #endif + +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC +// +#define EIGEN_GPUCC +// +// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels +// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels +// +// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels. 
+// For those cases, the corresponding code should be guarded with +// #if defined(EIGEN_GPUCC) +// instead of +// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDACC) +// +#endif + +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE +// +#define EIGEN_GPU_COMPILE_PHASE +// +// GPU compilers (HIPCC, NVCC) typically do two passes over the source code, +// + one to compile the source for the "host" (i.e. CPU) +// + another to compile the source for the "device" (i.e. GPU) +// +// Code that needs to be enabled only during either the "host" or "device" compilation phase +// needs to be guarded with a macro that indicates the current compilation phase +// +// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP +// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA +// +// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA +// For those cases, the code should be guarded with +// #if defined(EIGEN_GPU_COMPILE_PHASE) +// instead of +// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDA_ARCH) +// +#endif + + // When compiling CUDA device code with NVCC, or HIP device code with HIPCC // pull in math functions from the global namespace. In host mode, and when // device code is compiled with clang, use the std versions.
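A minimal usage sketch of the two guards introduced above (not part of the patch; the kernel and code paths below are hypothetical illustrations):

// Code meant for any GPU toolchain (nvcc or hipcc) is guarded with EIGEN_GPUCC:
#if defined(EIGEN_GPUCC)
template <typename Scalar>
__global__ void axpy_kernel(Scalar a, const Scalar* x, Scalar* y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] += a * x[i];   // Eigen scalar math is usable here on both CUDA and HIP
}
#endif

// Code that differs between the host and device passes of either compiler is guarded
// with EIGEN_GPU_COMPILE_PHASE:
#if defined(EIGEN_GPU_COMPILE_PHASE)
  // device pass: e.g. call GPU intrinsics / device math functions
#else
  // host pass: e.g. fall back to the std:: implementations
#endif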
@@ -312,6 +367,10 @@ #endif #endif +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + #define EIGEN_HAS_GPU_FP16 +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif @@ -475,15 +534,9 @@ using std::ptrdiff_t; #endif // Half float support -#if defined EIGEN_USE_HIP - #include "src/Core/arch/HIP/hcc/Half.h" - #include "src/Core/arch/HIP/hcc/PacketMathHalf.h" - #include "src/Core/arch/HIP/hcc/TypeCasting.h" -#else - #include "src/Core/arch/CUDA/Half.h" - #include "src/Core/arch/CUDA/PacketMathHalf.h" - #include "src/Core/arch/CUDA/TypeCasting.h" -#endif +#include "src/Core/arch/GPU/Half.h" +#include "src/Core/arch/GPU/PacketMathHalf.h" +#include "src/Core/arch/GPU/TypeCasting.h" #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 694f7cbde..261f77b99 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -35,7 +35,7 @@ template struct product_type_selector; template struct product_size_category { enum { - #ifndef EIGEN_CUDA_ARCH + #ifndef EIGEN_GPU_COMPILE_PHASE is_large = MaxSize == Dynamic || Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 2603bd2f7..b67c41d8a 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -301,13 +301,11 @@ template EIGEN_DEVICE_FUNC inline void pstoreu { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ -template - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - inline void prefetch(const Scalar* addr) +template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_HIP_DEVICE_COMPILE) + // do nothing +#elif defined(EIGEN_CUDA_ARCH) #if defined(__LP64__) // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); @@ -534,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second) ***************************************************************************/ // Eigen+CUDA does not support complexes. 
-#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) +#if !defined(EIGEN_GPUCC) template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index fe6d6585c..6415a7696 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -96,7 +96,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct real_impl > { @@ -144,7 +144,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct imag_impl > { @@ -260,7 +260,7 @@ struct conj_default_impl template struct conj_impl : conj_default_impl {}; -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct conj_impl > { @@ -773,9 +773,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - return isfinite(x); - #elif defined(EIGEN_CUDA_ARCH) + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -790,9 +788,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - return isinf(x); - #elif defined(EIGEN_CUDA_ARCH) + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -807,9 +803,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - return isnan(x); - #elif defined(EIGEN_CUDA_ARCH) + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -875,7 +869,7 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -1089,7 +1083,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1147,7 +1141,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1168,7 +1162,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if 
defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1226,7 +1220,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1254,7 +1248,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1284,7 +1278,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } @@ -1320,7 +1314,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float &x) { return ::expm1f(x); } @@ -1340,7 +1334,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1360,7 +1354,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1380,7 +1374,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1411,7 +1405,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); } EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1442,7 +1436,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); } EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1473,7 +1467,7 @@ EIGEN_ALWAYS_INLINE float 
atanh(float x) { return cl::sycl::atanh(x); } EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1494,7 +1488,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1514,7 +1508,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1532,12 +1526,12 @@ T tanh(const T &x) { #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); } EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); } -#elif (!defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)) && EIGEN_FAST_MATH +#elif (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1557,7 +1551,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y) EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index e0868daf5..2bb42f74b 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -137,14 +137,7 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; -<<<<<<< local - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - static EIGEN_STRONG_INLINE -======= static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ->>>>>>> other void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -397,14 +390,7 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; template -<<<<<<< local - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) -======= static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) ->>>>>>> other { // Same as: dst.noalias() = lhs.lazyProduct(rhs); // but easier on the compiler side @@ -865,7 +851,7 @@ struct product_evaluator, ProductTag, DiagonalSha return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } -#ifndef EIGEN_CUDACC +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -909,7 +895,7 @@ struct product_evaluator, ProductTag, DenseShape, return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } 
-#ifndef EIGEN_CUDACC +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h index c10550050..ab9d27591 100644 --- a/Eigen/src/Core/arch/GPU/Half.h +++ b/Eigen/src/Core/arch/GPU/Half.h @@ -26,15 +26,15 @@ // Standard 16-bit float type, mostly useful for GPUs. Defines a new -// type Eigen::half (inheriting from CUDA's __half struct) with +// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with // operator overloads such that it behaves basically as an arithmetic // type. It will be quite slow on CPUs (so it is recommended to stay // in fp32 for CPUs, except for simple parameter conversions, I/O // to disk and the likes), but fast on GPUs. -#ifndef EIGEN_HALF_CUDA_H -#define EIGEN_HALF_CUDA_H +#ifndef EIGEN_HALF_GPU_H +#define EIGEN_HALF_GPU_H #if __cplusplus > 199711L #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() @@ -49,16 +49,41 @@ struct half; namespace half_impl { -#if !defined(EIGEN_HAS_CUDA_FP16) +#if !defined(EIGEN_HAS_GPU_FP16) // Make our own __half_raw definition that is similar to CUDA's. struct __half_raw { EIGEN_DEVICE_FUNC __half_raw() : x(0) {} explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; -#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +#elif defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) +// Make a __half_raw definition that is +// ++ compatible with that of Eigen and +// ++ add a implcit conversion to the native __half of the old HIP implementation. +// +// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation. +// +// In the old HIP implementation, +// ++ __half is a typedef of __fp16 +// ++ the "__h*" routines take "__half" arguments +// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make +// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} + union { + unsigned short x; + __half data; + }; + operator __half(void) const { return data; } +}; + #endif +#elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw -typedef __half __half_raw; + typedef __half __half_raw; + #endif #endif EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); @@ -69,8 +94,19 @@ struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {} + #else + EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); } + #endif + #elif defined(EIGEN_HAS_CUDA_FP16) + #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000) EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} + #endif + #endif #endif }; @@ -78,17 +114,38 @@ struct half_base : public __half_raw { // Class definition. 
struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) - typedef half_impl::__half_raw __half_raw; - #endif + + // Writing this out as separate #if-else blocks to make the code easier to follow + // The same applies to most #if-else blocks in this file +#if !defined(EIGEN_HAS_GPU_FP16) + typedef half_impl::__half_raw __half_raw; +#elif defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) + typedef half_impl::__half_raw __half_raw; + #endif +#elif defined(EIGEN_HAS_CUDA_FP16) + // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP! + // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + typedef half_impl::__half_raw __half_raw; + #endif +#endif EIGEN_DEVICE_FUNC half() {} EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + #elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + #endif + #endif #endif + explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -201,7 +258,8 @@ namespace Eigen { namespace half_impl { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) // Intrinsics for native fp16 support. Note that on current hardware, // these are no faster than fp32 arithmetic (you need to use the half2 @@ -262,7 +320,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { #else // Emulate support for half floats -// Definitions for CPUs and older CUDA, mostly working through conversion +// Definitions for CPUs and older HIP+CUDA, mostly working through conversion // to/from fp32. 
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { @@ -342,7 +400,8 @@ union FP32 { }; EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) __half tmp_ff = __float2half(ff); return *(__half_raw*)&tmp_ff; @@ -398,7 +457,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) @@ -432,7 +492,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { return (a.x & 0x7fff) == 0x7c00; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; @@ -448,7 +509,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hexp(a)); #else return half(::expf(float(a))); @@ -458,7 +520,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return half(::hlog(a)); #else return half(::logf(float(a))); @@ -471,7 +534,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -493,14 +557,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && 
EIGEN_CUDA_ARCH >= 300 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -508,7 +574,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hlt(b, a) ? b : a; #else const float f1 = static_cast(a); @@ -517,7 +584,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hlt(a, b) ? b : a; #else const float f1 = static_cast(a); @@ -595,7 +663,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); @@ -629,9 +698,12 @@ struct hash { // Add the missing shfl_xor intrinsic -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { - #if EIGEN_CUDACC_VER < 90000 + #if (EIGEN_CUDACC_VER < 90000) || \ + defined(EIGEN_HAS_HIP_FP16) return static_cast(__shfl_xor(static_cast(var), laneMask, width)); #else return static_cast(__shfl_xor_sync(0xFFFFFFFF, static_cast(var), laneMask, width)); @@ -640,7 +712,8 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM #endif // ldg() has an overload for __half_raw, but we also need one for Eigen::half. -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( __ldg(reinterpret_cast(ptr))); @@ -648,7 +721,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) #endif -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_GPU_COMPILE_PHASE) namespace Eigen { namespace numext { @@ -674,4 +747,4 @@ bool (isfinite)(const Eigen::half& h) { } // namespace numext #endif -#endif // EIGEN_HALF_CUDA_H +#endif // EIGEN_HALF_GPU_H diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 8a6f209c4..e1ecac1ab 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -7,15 +7,16 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
-#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H -#define EIGEN_PACKET_MATH_HALF_CUDA_H +#ifndef EIGEN_PACKET_MATH_HALF_GPU_H +#define EIGEN_PACKET_MATH_HALF_GPU_H namespace Eigen { namespace internal { // Most of the following operations require arch >= 3.0 -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) template<> struct is_arithmetic { enum { value = true }; }; @@ -43,7 +44,18 @@ template<> struct packet_traits : default_packet_traits template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return half2half2(from); +#else + return __half2half2(from); +#endif + +#else // EIGEN_CUDA_ARCH return __half2half2(from); +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -69,20 +81,46 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __ldg((const half2*)from); +#endif + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 350 return __ldg((const half2*)from); #else return __halves2half2(*(from+0), *(from+1)); #endif + +#endif } template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#endif + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 350 return __halves2half2(__ldg(from+0), __ldg(from+1)); #else return __halves2half2(*(from+0), *(from+1)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { @@ -117,15 +155,29 @@ ptranspose(PacketBlock& kernel) { } template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __halves2half2(a, __hadd(a, __float2half(1.0f))); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; return __halves2half2(a, __float2half(f)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd2(a, b); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hadd2(a, b); #else @@ -137,9 +189,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, cons float r2 = a2 + b2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hsub2(a, b); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hsub2(a, b); #else @@ -151,9 +211,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, cons float r2 = a2 - b2; return __floats2half2_rn(r1, r2); #endif + +#endif } 
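// Note (descriptive summary of the pattern used throughout the half2 primitives in this file):
// on the HIP device path (EIGEN_HIP_DEVICE_COMPILE) the native half2 intrinsics such as
// __hadd2 / __hsub2 / __hmul2 are used unconditionally, while on the CUDA path they are only
// used when EIGEN_CUDA_ARCH >= 530; older CUDA architectures fall back to widening each lane
// to float with __low2float / __high2float and converting back with __floats2half2_rn.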
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hneg2(a); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hneg2(a); #else @@ -161,11 +229,19 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { float a2 = __high2float(a); return __floats2half2_rn(-a1, -a2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul2(a, b); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hmul2(a, b); #else @@ -177,9 +253,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, cons float r2 = a2 * b2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hfma2(a, b, c); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hfma2(a, b, c); #else @@ -193,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, con float r2 = a2 * b2 + c2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return h2div(a, b); +#else + return __h2div(a, b); +#endif + +#else // EIGEN_CUDA_ARCH + float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -203,6 +299,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, cons float r1 = a1 / b1; float r2 = a2 / b2; return __floats2half2_rn(r1, r2); + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { @@ -226,6 +324,12 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hadd(__low2half(a), __high2half(a)); #else @@ -233,9 +337,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& float a2 = __high2float(a); return Eigen::half(__float2half(a1 + a2)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); @@ -245,9 +359,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const ha float a2 = __high2float(a); return a1 > a2 ? __low2half(a) : __high2half(a); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); @@ -257,9 +381,17 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const ha float a2 = __high2float(a); return a1 < a2 ? 
__low2half(a) : __high2half(a); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hmul(__low2half(a), __high2half(a)); #else @@ -267,6 +399,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const ha float a2 = __high2float(a); return Eigen::half(__float2half(a1 * a2)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { @@ -285,7 +419,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { return __floats2half2_rn(r1, r2); } -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { @@ -1130,4 +1265,4 @@ ptranspose(PacketBlock& kernel) { } } -#endif // EIGEN_PACKET_MATH_HALF_CUDA_H +#endif // EIGEN_PACKET_MATH_HALF_GPU_H diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 30f870c3d..57a55d08b 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_TYPE_CASTING_CUDA_H -#define EIGEN_TYPE_CASTING_CUDA_H +#ifndef EIGEN_TYPE_CASTING_GPU_H +#define EIGEN_TYPE_CASTING_GPU_H namespace Eigen { @@ -19,7 +19,8 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __float2half(a); #else return Eigen::half(a); @@ -37,7 +38,8 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __float2half(static_cast(a)); #else return Eigen::half(static_cast(a)); @@ -55,7 +57,8 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(a); #else return static_cast(a); @@ -69,7 +72,8 @@ struct functor_traits > -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits { @@ -209,4 
+213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f } // end namespace Eigen -#endif // EIGEN_TYPE_CASTING_CUDA_H +#endif // EIGEN_TYPE_CASTING_GPU_H diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 1077d8eb0..9765cc763 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef EIGEN_CUDACC +#ifdef EIGEN_GPUCC // FIXME is there some kind of cuda::swap? Scalar t=b; const_cast(b)=a; a=t; #else diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index e269140bd..401d597d8 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -436,10 +436,7 @@ template struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC explicit - #endif - bind1st_op(const first_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -458,10 +455,7 @@ template struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC explicit - #endif - bind2nd_op(const second_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index a4cde6d95..b1791fb3a 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -163,10 +163,7 @@ class BlasLinearMapper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - EIGEN_ALWAYS_INLINE void prefetch(int i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); } diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 87fcc30f5..059d06874 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -171,7 +171,7 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED #if defined(EIGEN_HIP_DEVICE_COMPILE) - result = aligned_malloc(size); + result = ::malloc(size); #else result = std::malloc(size); #endif @@ -195,7 +195,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED #if defined(EIGEN_HIP_DEVICE_COMPILE) - aligned_free(ptr); + ::free(ptr); #else std::free(ptr); #endif @@ -244,7 +244,7 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std: check_that_malloc_is_allowed(); #if defined(EIGEN_HIP_DEVICE_COMPILE) - void *result = aligned_malloc(size); + void *result = ::malloc(size); #else void *result = 
std::malloc(size); #endif @@ -263,7 +263,7 @@ template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { #if defined(EIGEN_HIP_DEVICE_COMPILE) - aligned_free(ptr); + ::free(ptr); #else std::free(ptr); #endif diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 7f78cc89c..5a358bc12 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -11,16 +11,19 @@ #ifndef EIGEN_META_H #define EIGEN_META_H -#if defined(EIGEN_CUDA_ARCH) -#include -#include -#endif +#if defined(EIGEN_GPU_COMPILE_PHASE) -#if defined(EIGEN_HIP_DEVICE_COMPILE) -#include -#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" -#endif + #include + + #if defined(EIGEN_CUDA_ARCH) + #include + #endif + #if defined(EIGEN_HIP_DEVICE_COMPILE) + #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" + #endif + +#endif #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L #include @@ -181,7 +184,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -565,13 +568,13 @@ template struct scalar_product_traits namespace numext { -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) using internal::device::numeric_limits; #else using std::numeric_limits; @@ -590,7 +593,7 @@ T div_ceil(const T &a, const T &b) template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x,const Y& y) { return x == y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_GPU_COMPILE_PHASE) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } @@ -601,7 +604,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_GPU_COMPILE_PHASE) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index e977b9623..11df14918 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -1299,7 +1299,7 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif }//end deflation -#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) +#if !defined(EIGEN_GPUCC) /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm diff --git a/test/half_float.cpp b/test/half_float.cpp index 7734f82cc..1b0ea9482 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -9,7 +9,7 @@ #include "main.h" -#include +#include // Make sure it's possible to forward declare Eigen::half namespace Eigen { diff --git a/test/main.h b/test/main.h index 5691af52b..95bbc9eb0 100644 --- a/test/main.h +++ b/test/main.h @@ -68,9 +68,19 @@ // are defined here and any not-parenthesized min/max call will cause a // compiler error. 
#if !defined(__HIPCC__) - // HIP headers include the header which contains not-parenthesized - // calls to "max", triggering the following check and causing the compile to fail - // so disabling the following checks for HIP + // + // HIP header files include the following files + // + // + // + // which seem to contain not-parenthesized calls to "max"/"min", triggering the following check and causing the compile to fail + // + // Including those header files before the following macro definition for "min" / "max", only partially resolves the issue + // This is because other HIP header files also define "isnan" / "isinf" / "isfinite" functions, which are needed in other + // headers. + // + // So instead choosing to simply disable this check for HIP + // #define min(A,B) please_protect_your_min_with_parentheses #define max(A,B) please_protect_your_max_with_parentheses #define isnan(X) please_protect_your_isnan_with_parentheses diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index e94e577fc..5c1c68912 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -35,7 +35,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) // Running on the host CPU return 1; #elif defined(EIGEN_HIP_DEVICE_COMPILE) @@ -48,9 +48,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) // Running on the host CPU return l1CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 48*1024; // FIXME : update this number for HIP #else // Running on a CUDA device, return the amount of shared memory available. 
     return 48*1024;
@@ -58,9 +61,12 @@ struct DefaultDevice {
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
     // Running single threaded on the host CPU
     return l3CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return firstLevelCacheSize(); // FIXME : update this number for HIP
 #else
     // Running on a CUDA device
     return firstLevelCacheSize();
@@ -68,7 +74,7 @@ struct DefaultDevice {
   }

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
     // Running single threaded on the host CPU
     // Should return an enum that encodes the ISA supported by the CPU
     return 1;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 24a57970a..8bbe449cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -201,7 +201,7 @@ class TensorExecutor {
 };


-#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_GPUCC)
 template
 struct EigenMetaKernelEval {
   static __device__ EIGEN_ALWAYS_INLINE
@@ -276,7 +276,7 @@ inline void TensorExecutor::run(
   evaluator.cleanup();
 }

-#endif // EIGEN_CUDACC || EIGEN_HIPCC
+#endif // EIGEN_GPUCC
 #endif // EIGEN_USE_GPU

 // SYCL Executor policy
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 25ba2001e..b6d445c50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -35,7 +35,7 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   typename internal::enable_if::type count_leading_zeros(const T val)
   {
-#ifdef EIGEN_CUDA_ARCH
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return __clz(val);
 #elif defined(__SYCL_DEVICE_ONLY__)
     return cl::sycl::clz(val);
@@ -53,7 +53,7 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   typename internal::enable_if::type count_leading_zeros(const T val)
   {
-#ifdef EIGEN_CUDA_ARCH
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return __clzll(val);
 #elif defined(__SYCL_DEVICE_ONLY__)
     return cl::sycl::clz(val);
@@ -90,7 +90,7 @@ namespace {

   template
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __umulhi(a, b);
 #elif defined(__SYCL_DEVICE_ONLY__)
     return cl::sycl::mul_hi(a, static_cast(b));
@@ -101,7 +101,7 @@ namespace {

   template
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __umul64hi(a, b);
 #elif defined(__SYCL_DEVICE_ONLY__)
     return cl::sycl::mul_hi(a, static_cast(b));
@@ -124,7 +124,7 @@ namespace {
   template
   struct DividerHelper<64, T> {
     static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__)
       return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
 #else
       const uint64_t shift = 1ULL << log_div;
@@ -203,7 +203,7 @@ class TensorIntDivisor {
   }

   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
-#ifdef EIGEN_CUDA_ARCH
+#ifdef EIGEN_GPU_COMPILE_PHASE
     return (__umulhi(magic, n) >> shift);
 #elif defined(__SYCL_DEVICE_ONLY__)
     return (cl::sycl::mul_hi(static_cast(magic), static_cast(n)) >> shift);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index 8e1ba486d..c6ca396a3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
  */

 // SFINAE requires variadic templates
-#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
+#if !defined(EIGEN_GPUCC)
   #if EIGEN_HAS_VARIADIC_TEMPLATES
     // SFINAE doesn't work for gcc <= 4.7
     #ifdef EIGEN_COMP_GNUC
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index de1075cc1..87be090f9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits {
 };

 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && ((defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)) || (defined(EIGEN_HIPCC) && defined(EIGEN_HAS_HIP_FP16)))
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
 template <>
 struct PacketType {
   typedef half2 type;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 71536a4b9..5a547141a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -16,7 +16,7 @@ namespace internal {
 namespace {

 EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   // We don't support 3d kernels since we currently only use 1 and
   // 2d kernels.
   assert(threadIdx.z == 0);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index d2fb3fd32..fdd338b96 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -334,12 +334,12 @@ struct OuterReducer {
 };


-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
 template
 __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);


-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_HAS_GPU_FP16)
 template
 __global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
 template
@@ -698,9 +698,9 @@ struct TensorEvaluator,
 #ifdef EIGEN_USE_THREADS
   template friend struct internal::FullReducerShard;
 #endif
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
   template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_HAS_GPU_FP16)
   template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
   template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
   template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
@@ -793,7 +793,7 @@ struct TensorEvaluator,
   Op m_reducer;

   // For full reductions
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
   static const bool RunningOnGPU = internal::is_same::value;
   static const bool RunningOnSycl = false;
 #elif defined(EIGEN_USE_SYCL)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 174a6a064..6d68e256f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -242,7 +242,7 @@ struct ScanLauncher {
   }
 };

-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))

 // GPU implementation of scan
 // TODO(ibab) This placeholder implementation performs multiple scans in
@@ -286,7 +286,7 @@ struct ScanLauncher {
    #endif
   }
 };
-#endif // EIGEN_USE_GPU && (EIGEN_CUDACC || EIGEN_HIPCC)
+#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)

 } // end namespace Eigen
diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index bb584e3f9..8de3bbcab 100644
--- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -268,10 +268,7 @@ template<
     typename Reducer
 > struct reduce
 {
-  #if defined(EIGEN_HIPCC)
-  EIGEN_DEVICE_FUNC
-  #endif
-  constexpr static inline int run() { return Reducer::Identity; }
+  EIGEN_DEVICE_FUNC constexpr static inline int run() { return Reducer::Identity; }
 };

 template<
@@ -279,10 +276,7 @@ template<
     typename A
 > struct reduce
 {
-  #if defined(EIGEN_HIPCC)
-  EIGEN_DEVICE_FUNC
-  #endif
-  constexpr static inline A run(A a) { return a; }
+  EIGEN_DEVICE_FUNC constexpr static inline A run(A a) { return a; }
 };

 template<
@@ -291,10 +285,7 @@ template<
     typename A,
     typename... Ts
 > struct reduce
 {
-  #if defined(EIGEN_HIPCC)
-  EIGEN_DEVICE_FUNC
-  #endif
-  constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce::run(ts...))) {
+  EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce::run(ts...))) {
     return Reducer::run(a, reduce::run(ts...));
   }
 };
@@ -333,10 +324,7 @@ struct greater_equal_zero_op { template constexpr static inline auto
 // together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
 // does...
 template
-#if defined(EIGEN_HIPCC)
-EIGEN_DEVICE_FUNC
-#endif
-constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+EIGEN_DEVICE_FUNC constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_prod(Ts... ts)
 {
   return reduce::run(ts...);
 }
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 5b01c5fb7..d91662d96 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -15,7 +15,7 @@
 // The array class is only available starting with cxx11. Emulate our own here
 // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
 // Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) || defined(EIGEN_AVOID_STL_ARRAY)
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)

 namespace Eigen {
 template class array {
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 9a719e3c0..fbbc87661 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -190,7 +190,7 @@ template <>
 struct lgamma_impl {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
     int dummy;
     return ::lgammaf_r(x, &dummy);
 #else
@@ -203,7 +203,7 @@ template <>
 struct lgamma_impl {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
     int dummy;
     return ::lgamma_r(x, &dummy);
 #else
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
index 020ac1b62..ad011e305 100644
--- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 plgamma(const float4& a)
diff --git a/unsupported/test/cxx11_tensor_hip.cu b/unsupported/test/cxx11_tensor_hip.cu
index f1794aa87..b28840267 100644
--- a/unsupported/test/cxx11_tensor_hip.cu
+++ b/unsupported/test/cxx11_tensor_hip.cu
@@ -1222,9 +1222,8 @@ void test_cxx11_tensor_hip()
   CALL_SUBTEST(test_hip_elementwise());
   CALL_SUBTEST(test_hip_props());
   CALL_SUBTEST(test_hip_reduction());
-  // FIXME : uncommenting following tests results in compile failure
-  // CALL_SUBTEST(test_hip_contraction());
-  // CALL_SUBTEST(test_hip_contraction());
+  CALL_SUBTEST(test_hip_contraction());
+  CALL_SUBTEST(test_hip_contraction());
   CALL_SUBTEST(test_hip_convolution_1d());
   CALL_SUBTEST(test_hip_convolution_1d());
   CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d());
--
cgit v1.2.3
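
Editor's note (not part of the patch): the hunks above consistently collapse pairs of backend-specific guards (EIGEN_CUDA_ARCH / EIGEN_HIP_DEVICE_COMPILE, EIGEN_CUDACC / EIGEN_HIPCC, EIGEN_HAS_CUDA_FP16 / EIGEN_HAS_HIP_FP16) into the unified EIGEN_GPU_COMPILE_PHASE, EIGEN_GPUCC and EIGEN_HAS_GPU_FP16 macros. The sketch below shows how downstream code could adopt the same pattern. It is illustrative only: the helper names gpu_aware_clz and clz_kernel are made up for this example, and it assumes the unified macros are visible after including Eigen/Core with this patch applied.

    // Hypothetical downstream helper, shown only to illustrate the unified macros.
    #include <Eigen/Core>

    // Count leading zeros with one branch per compile phase instead of separate
    // CUDA and HIP checks.
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int gpu_aware_clz(unsigned int val) {
    #if defined(EIGEN_GPU_COMPILE_PHASE)
      // Device pass: covers both EIGEN_CUDA_ARCH and EIGEN_HIP_DEVICE_COMPILE.
      return __clz(static_cast<int>(val));
    #else
      // Host pass: portable bit scan (returns 32 for val == 0).
      int count = 0;
      for (unsigned int mask = 1u << 31; mask != 0 && (val & mask) == 0; mask >>= 1)
        ++count;
      return count;
    #endif
    }

    #if defined(EIGEN_GPUCC)
    // Only compiled when the translation unit is built by nvcc or hipcc,
    // replacing the old defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) check.
    __global__ void clz_kernel(const unsigned int* in, int* out, int n) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = gpu_aware_clz(in[i]);
    }
    #endif

Branching on EIGEN_GPU_COMPILE_PHASE rather than on the compiler macro keeps the host-side pass of nvcc/hipcc on the ordinary host code path, which matches how the hunks above distinguish "building with a GPU compiler" (EIGEN_GPUCC) from "currently compiling device code" (EIGEN_GPU_COMPILE_PHASE).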