From b6cc0961b17f6204038158c445eddf411c97a3e2 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Thu, 14 Jun 2018 10:21:54 -0400 Subject: updates based on PR feedback There are two major changes (and a few minor ones which are not listed here...see PR discussion for details) 1. Eigen::half implementations for HIP and CUDA have been merged. This means that - `CUDA/Half.h` and `HIP/hcc/Half.h` got merged to a new file `GPU/Half.h` - `CUDA/PacketMathHalf.h` and `HIP/hcc/PacketMathHalf.h` got merged to a new file `GPU/PacketMathHalf.h` - `CUDA/TypeCasting.h` and `HIP/hcc/TypeCasting.h` got merged to a new file `GPU/TypeCasting.h` After this change the `HIP/hcc` directory only contains one file `math_constants.h`. That will go away too once that file becomes a part of the HIP install. 2. new macros EIGEN_GPUCC, EIGEN_GPU_COMPILE_PHASE and EIGEN_HAS_GPU_FP16 have been added and the code has been updated to use them where appropriate. - `EIGEN_GPUCC` is the same as `(EIGEN_CUDACC || EIGEN_HIPCC)` - `EIGEN_GPU_DEVICE_COMPILE` is the same as `(EIGEN_CUDA_ARCH || EIGEN_HIP_DEVICE_COMPILE)` - `EIGEN_HAS_GPU_FP16` is the same as `(EIGEN_HAS_CUDA_FP16 or EIGEN_HAS_HIP_FP16)` --- Eigen/Core | 71 ++++++++-- Eigen/src/Core/GeneralProduct.h | 2 +- Eigen/src/Core/GenericPacketMath.h | 12 +- Eigen/src/Core/MathFunctions.h | 56 ++++---- Eigen/src/Core/ProductEvaluators.h | 18 +-- Eigen/src/Core/arch/GPU/Half.h | 131 ++++++++++++++----- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 145 ++++++++++++++++++++- Eigen/src/Core/arch/GPU/TypeCasting.h | 18 ++- Eigen/src/Core/functors/AssignmentFunctors.h | 2 +- Eigen/src/Core/functors/BinaryFunctors.h | 10 +- Eigen/src/Core/util/BlasUtil.h | 5 +- Eigen/src/Core/util/Memory.h | 8 +- Eigen/src/Core/util/Meta.h | 29 +++-- Eigen/src/SVD/BDCSVD.h | 2 +- test/half_float.cpp | 2 +- test/main.h | 16 ++- .../Eigen/CXX11/src/Tensor/TensorDeviceDefault.h | 14 +- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 4 +- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 12 +- unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 10 +- unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 4 +- unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 20 +-- unsupported/Eigen/CXX11/src/util/EmulateArray.h | 2 +- .../src/SpecialFunctions/SpecialFunctionsImpl.h | 4 +- .../arch/CUDA/CudaSpecialFunctions.h | 2 +- unsupported/test/cxx11_tensor_hip.cu | 5 +- 29 files changed, 425 insertions(+), 185 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index c72d5468a..f67bffd12 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -99,6 +99,61 @@ #define EIGEN_DONT_VECTORIZE #endif + +#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC +// +#define EIGEN_GPUCC +// +// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels +// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels +// +// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels. +// For those cases, the corresponding code should be guarded with +// #if defined(EIGEN_GPUCC) +// instead of +// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIPCC) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDACC) +// +#endif + +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE +// +#define EIGEN_GPU_COMPILE_PHASE +// +// GPU compilers (HIPCC, NVCC) typically do two passes over the source code, +// + one to compile the source for the "host" (ie CPU) +// + another to compile the source for the "device" (ie. GPU) +// +// Code that needs to enabled only during the either the "host" or "device" compilation phase +// needs to be guarded with a macro that indicates the current compilation phase +// +// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP +// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA +// +// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA +// For those cases, the code should be guarded with +// #if defined(EIGEN_GPU_COMPILE_PHASE) +// instead of +// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to HIP, the code should be guarded with +// #if defined(EIGEN_HIP_DEVICE_COMPILE) +// +// For cases where the tweak is specific to CUDA, the code should be guarded with +// #if defined(EIGEN_CUDA_ARCH) +// +#endif + + // When compiling CUDA device code with NVCC, or HIP device code with HIPCC // pull in math functions from the global namespace. In host mode, and when // device doee with clang, use the std versions. @@ -312,6 +367,10 @@ #endif #endif +#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) + #define EIGEN_HAS_GPU_FP16 +#endif + #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) #define EIGEN_HAS_OPENMP #endif @@ -475,15 +534,9 @@ using std::ptrdiff_t; #endif // Half float support -#if defined EIGEN_USE_HIP - #include "src/Core/arch/HIP/hcc/Half.h" - #include "src/Core/arch/HIP/hcc/PacketMathHalf.h" - #include "src/Core/arch/HIP/hcc/TypeCasting.h" -#else - #include "src/Core/arch/CUDA/Half.h" - #include "src/Core/arch/CUDA/PacketMathHalf.h" - #include "src/Core/arch/CUDA/TypeCasting.h" -#endif +#include "src/Core/arch/GPU/Half.h" +#include "src/Core/arch/GPU/PacketMathHalf.h" +#include "src/Core/arch/GPU/TypeCasting.h" #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" diff --git a/Eigen/src/Core/GeneralProduct.h b/Eigen/src/Core/GeneralProduct.h index 694f7cbde..261f77b99 100644 --- a/Eigen/src/Core/GeneralProduct.h +++ b/Eigen/src/Core/GeneralProduct.h @@ -35,7 +35,7 @@ template struct product_type_selector; template struct product_size_category { enum { - #ifndef EIGEN_CUDA_ARCH + #ifndef EIGEN_GPU_COMPILE_PHASE is_large = MaxSize == Dynamic || Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 2603bd2f7..b67c41d8a 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -301,13 +301,11 @@ template EIGEN_DEVICE_FUNC inline void pstoreu { pstore(to, from); } /** \internal tries to do cache prefetching of \a addr */ -template - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - inline void prefetch(const Scalar* addr) +template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#ifdef EIGEN_CUDA_ARCH +#if defined(EIGEN_HIP_DEVICE_COMPILE) + // do nothing +#elif defined(EIGEN_CUDA_ARCH) #if defined(__LP64__) // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); @@ -534,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second) ***************************************************************************/ // Eigen+CUDA does not support complexes. -#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) +#if !defined(EIGEN_GPUCC) template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index fe6d6585c..6415a7696 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -96,7 +96,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct real_impl > { @@ -144,7 +144,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct imag_impl > { @@ -260,7 +260,7 @@ struct conj_default_impl template struct conj_impl : conj_default_impl {}; -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) template struct conj_impl > { @@ -773,9 +773,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - return isfinite(x); - #elif defined(EIGEN_CUDA_ARCH) + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -790,9 +788,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - return isinf(x); - #elif defined(EIGEN_CUDA_ARCH) + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -807,9 +803,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - return isnan(x); - #elif defined(EIGEN_CUDA_ARCH) + #if defined(EIGEN_GPU_COMPILE_PHASE) return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -875,7 +869,7 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) && !defined(__SYCL_DEVICE_ONLY__) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -1089,7 +1083,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); } EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1147,7 +1141,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); } EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1168,7 +1162,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); } EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1226,7 +1220,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1254,7 +1248,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1284,7 +1278,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); } EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } @@ -1320,7 +1314,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); } EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float &x) { return ::expm1f(x); } @@ -1340,7 +1334,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); } EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1360,7 +1354,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); } EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1380,7 +1374,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); } EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1411,7 +1405,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); } EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1442,7 +1436,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); } EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1473,7 +1467,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); } EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1494,7 +1488,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); } EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1514,7 +1508,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); } EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1532,12 +1526,12 @@ T tanh(const T &x) { #if defined(__SYCL_DEVICE_ONLY__) EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); } EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); } -#elif (!defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)) && EIGEN_FAST_MATH +#elif (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1557,7 +1551,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y) EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); } #endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index e0868daf5..2bb42f74b 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -137,14 +137,7 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; -<<<<<<< local - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - static EIGEN_STRONG_INLINE -======= static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ->>>>>>> other void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -397,14 +390,7 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; template -<<<<<<< local - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) -======= static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) ->>>>>>> other { // Same as: dst.noalias() = lhs.lazyProduct(rhs); // but easier on the compiler side @@ -865,7 +851,7 @@ struct product_evaluator, ProductTag, DiagonalSha return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } -#ifndef EIGEN_CUDACC +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -909,7 +895,7 @@ struct product_evaluator, ProductTag, DenseShape, return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } -#ifndef EIGEN_CUDACC +#ifndef EIGEN_GPUCC template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h index c10550050..ab9d27591 100644 --- a/Eigen/src/Core/arch/GPU/Half.h +++ b/Eigen/src/Core/arch/GPU/Half.h @@ -26,15 +26,15 @@ // Standard 16-bit float type, mostly useful for GPUs. Defines a new -// type Eigen::half (inheriting from CUDA's __half struct) with +// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with // operator overloads such that it behaves basically as an arithmetic // type. It will be quite slow on CPUs (so it is recommended to stay // in fp32 for CPUs, except for simple parameter conversions, I/O // to disk and the likes), but fast on GPUs. -#ifndef EIGEN_HALF_CUDA_H -#define EIGEN_HALF_CUDA_H +#ifndef EIGEN_HALF_GPU_H +#define EIGEN_HALF_GPU_H #if __cplusplus > 199711L #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() @@ -49,16 +49,41 @@ struct half; namespace half_impl { -#if !defined(EIGEN_HAS_CUDA_FP16) +#if !defined(EIGEN_HAS_GPU_FP16) // Make our own __half_raw definition that is similar to CUDA's. struct __half_raw { EIGEN_DEVICE_FUNC __half_raw() : x(0) {} explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; -#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +#elif defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) +// Make a __half_raw definition that is +// ++ compatible with that of Eigen and +// ++ add a implcit conversion to the native __half of the old HIP implementation. +// +// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation. +// +// In the old HIP implementation, +// ++ __half is a typedef of __fp16 +// ++ the "__h*" routines take "__half" arguments +// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make +// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} + union { + unsigned short x; + __half data; + }; + operator __half(void) const { return data; } +}; + #endif +#elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw -typedef __half __half_raw; + typedef __half __half_raw; + #endif #endif EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); @@ -69,8 +94,19 @@ struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {} + #else + EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); } + #endif + #elif defined(EIGEN_HAS_CUDA_FP16) + #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000) EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} + #endif + #endif #endif }; @@ -78,17 +114,38 @@ struct half_base : public __half_raw { // Class definition. struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) - typedef half_impl::__half_raw __half_raw; - #endif + + // Writing this out as separate #if-else blocks to make the code easier to follow + // The same applies to most #if-else blocks in this file +#if !defined(EIGEN_HAS_GPU_FP16) + typedef half_impl::__half_raw __half_raw; +#elif defined(EIGEN_HAS_HIP_FP16) + #if defined(EIGEN_HAS_OLD_HIP_FP16) + typedef half_impl::__half_raw __half_raw; + #endif +#elif defined(EIGEN_HAS_CUDA_FP16) + // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP! + // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + typedef half_impl::__half_raw __half_raw; + #endif +#endif EIGEN_DEVICE_FUNC half() {} EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + +#if defined(EIGEN_HAS_GPU_FP16) + #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + #elif defined(EIGEN_HAS_CUDA_FP16) + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + #endif + #endif #endif + explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -201,7 +258,8 @@ namespace Eigen { namespace half_impl { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) // Intrinsics for native fp16 support. Note that on current hardware, // these are no faster than fp32 arithmetic (you need to use the half2 @@ -262,7 +320,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { #else // Emulate support for half floats -// Definitions for CPUs and older CUDA, mostly working through conversion +// Definitions for CPUs and older HIP+CUDA, mostly working through conversion // to/from fp32. EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { @@ -342,7 +400,8 @@ union FP32 { }; EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) __half tmp_ff = __float2half(ff); return *(__half_raw*)&tmp_ff; @@ -398,7 +457,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) @@ -432,7 +492,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { return (a.x & 0x7fff) == 0x7c00; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; @@ -448,7 +509,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hexp(a)); #else return half(::expf(float(a))); @@ -458,7 +520,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return half(::hlog(a)); #else return half(::logf(float(a))); @@ -471,7 +534,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -493,14 +557,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -508,7 +574,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hlt(b, a) ? b : a; #else const float f1 = static_cast(a); @@ -517,7 +584,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __hlt(a, b) ? b : a; #else const float f1 = static_cast(a); @@ -595,7 +663,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); @@ -629,9 +698,12 @@ struct hash { // Add the missing shfl_xor intrinsic -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) + __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { - #if EIGEN_CUDACC_VER < 90000 + #if (EIGEN_CUDACC_VER < 90000) || \ + defined(EIGEN_HAS_HIP_FP16) return static_cast(__shfl_xor(static_cast(var), laneMask, width)); #else return static_cast(__shfl_xor_sync(0xFFFFFFFF, static_cast(var), laneMask, width)); @@ -640,7 +712,8 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM #endif // ldg() has an overload for __half_raw, but we also need one for Eigen::half. -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( __ldg(reinterpret_cast(ptr))); @@ -648,7 +721,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) #endif -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_GPU_COMPILE_PHASE) namespace Eigen { namespace numext { @@ -674,4 +747,4 @@ bool (isfinite)(const Eigen::half& h) { } // namespace numext #endif -#endif // EIGEN_HALF_CUDA_H +#endif // EIGEN_HALF_GPU_H diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 8a6f209c4..e1ecac1ab 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -7,15 +7,16 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H -#define EIGEN_PACKET_MATH_HALF_CUDA_H +#ifndef EIGEN_PACKET_MATH_HALF_GPU_H +#define EIGEN_PACKET_MATH_HALF_GPU_H namespace Eigen { namespace internal { // Most of the following operations require arch >= 3.0 -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) template<> struct is_arithmetic { enum { value = true }; }; @@ -43,7 +44,18 @@ template<> struct packet_traits : default_packet_traits template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return half2half2(from); +#else + return __half2half2(from); +#endif + +#else // EIGEN_CUDA_ARCH return __half2half2(from); +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -69,20 +81,46 @@ template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __ldg((const half2*)from); +#endif + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 350 return __ldg((const half2*)from); #else return __halves2half2(*(from+0), *(from+1)); #endif + +#endif } template<> __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { + +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return __halves2half2((*(from+0)), (*(from+1))); +#else + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#endif + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 350 return __halves2half2(__ldg(from+0), __ldg(from+1)); #else return __halves2half2(*(from+0), *(from+1)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { @@ -117,15 +155,29 @@ ptranspose(PacketBlock& kernel) { } template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __halves2half2(a, __hadd(a, __float2half(1.0f))); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; return __halves2half2(a, __float2half(f)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd2(a, b); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hadd2(a, b); #else @@ -137,9 +189,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, cons float r2 = a2 + b2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hsub2(a, b); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hsub2(a, b); #else @@ -151,9 +211,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, cons float r2 = a2 - b2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hneg2(a); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hneg2(a); #else @@ -161,11 +229,19 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { float a2 = __high2float(a); return __floats2half2_rn(-a1, -a2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul2(a, b); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hmul2(a, b); #else @@ -177,9 +253,17 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, cons float r2 = a2 * b2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hfma2(a, b, c); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hfma2(a, b, c); #else @@ -193,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, con float r2 = a2 * b2 + c2; return __floats2half2_rn(r1, r2); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + +#if defined(EIGEN_HAS_OLD_HIP_FP16) + return h2div(a, b); +#else + return __h2div(a, b); +#endif + +#else // EIGEN_CUDA_ARCH + float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -203,6 +299,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, cons float r1 = a1 / b1; float r2 = a2 / b2; return __floats2half2_rn(r1, r2); + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { @@ -226,6 +324,12 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, cons } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hadd(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hadd(__low2half(a), __high2half(a)); #else @@ -233,9 +337,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& float a2 = __high2float(a); return Eigen::half(__float2half(a1 + a2)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); @@ -245,9 +359,19 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const ha float a2 = __high2float(a); return a1 > a2 ? __low2half(a) : __high2half(a); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 __half first = __low2half(a); __half second = __high2half(a); @@ -257,9 +381,17 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const ha float a2 = __high2float(a); return a1 < a2 ? __low2half(a) : __high2half(a); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +#if defined(EIGEN_HIP_DEVICE_COMPILE) + + return __hmul(__low2half(a), __high2half(a)); + +#else // EIGEN_CUDA_ARCH + #if EIGEN_CUDA_ARCH >= 530 return __hmul(__low2half(a), __high2half(a)); #else @@ -267,6 +399,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const ha float a2 = __high2float(a); return Eigen::half(__float2half(a1 * a2)); #endif + +#endif } template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { @@ -285,7 +419,8 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { return __floats2half2_rn(r1, r2); } -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 +#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ + defined(EIGEN_HIP_DEVICE_COMPILE) template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { @@ -1130,4 +1265,4 @@ ptranspose(PacketBlock& kernel) { } } -#endif // EIGEN_PACKET_MATH_HALF_CUDA_H +#endif // EIGEN_PACKET_MATH_HALF_GPU_H diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 30f870c3d..57a55d08b 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_TYPE_CASTING_CUDA_H -#define EIGEN_TYPE_CASTING_CUDA_H +#ifndef EIGEN_TYPE_CASTING_GPU_H +#define EIGEN_TYPE_CASTING_GPU_H namespace Eigen { @@ -19,7 +19,8 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __float2half(a); #else return Eigen::half(a); @@ -37,7 +38,8 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef Eigen::half result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __float2half(static_cast(a)); #else return Eigen::half(static_cast(a)); @@ -55,7 +57,8 @@ struct scalar_cast_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) typedef float result_type; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(a); #else return static_cast(a); @@ -69,7 +72,8 @@ struct functor_traits > -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits { @@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f } // end namespace Eigen -#endif // EIGEN_TYPE_CASTING_CUDA_H +#endif // EIGEN_TYPE_CASTING_GPU_H diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 1077d8eb0..9765cc763 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef EIGEN_CUDACC +#ifdef EIGEN_GPUCC // FIXME is there some kind of cuda::swap? Scalar t=b; const_cast(b)=a; a=t; #else diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index e269140bd..401d597d8 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -436,10 +436,7 @@ template struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC explicit - #endif - bind1st_op(const first_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -458,10 +455,7 @@ template struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC explicit - #endif - bind2nd_op(const second_argument_type &val) : m_value(val) {} + EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index a4cde6d95..b1791fb3a 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -163,10 +163,7 @@ class BlasLinearMapper { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - EIGEN_ALWAYS_INLINE void prefetch(int i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); } diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 87fcc30f5..059d06874 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -171,7 +171,7 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED #if defined(EIGEN_HIP_DEVICE_COMPILE) - result = aligned_malloc(size); + result = ::malloc(size); #else result = std::malloc(size); #endif @@ -195,7 +195,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED #if defined(EIGEN_HIP_DEVICE_COMPILE) - aligned_free(ptr); + ::free(ptr); #else std::free(ptr); #endif @@ -244,7 +244,7 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std: check_that_malloc_is_allowed(); #if defined(EIGEN_HIP_DEVICE_COMPILE) - void *result = aligned_malloc(size); + void *result = ::malloc(size); #else void *result = std::malloc(size); #endif @@ -263,7 +263,7 @@ template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { #if defined(EIGEN_HIP_DEVICE_COMPILE) - aligned_free(ptr); + ::free(ptr); #else std::free(ptr); #endif diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 7f78cc89c..5a358bc12 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -11,16 +11,19 @@ #ifndef EIGEN_META_H #define EIGEN_META_H -#if defined(EIGEN_CUDA_ARCH) -#include -#include -#endif +#if defined(EIGEN_GPU_COMPILE_PHASE) -#if defined(EIGEN_HIP_DEVICE_COMPILE) -#include -#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" -#endif + #include + + #if defined(EIGEN_CUDA_ARCH) + #include + #endif + #if defined(EIGEN_HIP_DEVICE_COMPILE) + #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" + #endif + +#endif #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L #include @@ -181,7 +184,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -565,13 +568,13 @@ template struct scalar_product_traits namespace numext { -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) using internal::device::numeric_limits; #else using std::numeric_limits; @@ -590,7 +593,7 @@ T div_ceil(const T &a, const T &b) template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x,const Y& y) { return x == y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_GPU_COMPILE_PHASE) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } @@ -601,7 +604,7 @@ bool equal_strict(const double& x,const double& y) { return std::equal_to EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x,const Y& y) { return x != y; } -#if !defined(EIGEN_CUDA_ARCH) +#if !defined(EIGEN_GPU_COMPILE_PHASE) template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index e977b9623..11df14918 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -1299,7 +1299,7 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif }//end deflation -#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) +#if !defined(EIGEN_GPUCC) /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm diff --git a/test/half_float.cpp b/test/half_float.cpp index 7734f82cc..1b0ea9482 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -9,7 +9,7 @@ #include "main.h" -#include +#include // Make sure it's possible to forward declare Eigen::half namespace Eigen { diff --git a/test/main.h b/test/main.h index 5691af52b..95bbc9eb0 100644 --- a/test/main.h +++ b/test/main.h @@ -68,9 +68,19 @@ // are defined here and any not-parenthesized min/max call will cause a // compiler error. #if !defined(__HIPCC__) - // HIP headers include the header which contains not-parenthesized - // calls to "max", triggering the following check and causing the compile to fail - // so disabling the following checks for HIP + // + // HIP header files include the following files + // + // + // + // which seem to contain not-parenthesized calls to "max"/"min", triggering the following check and causing the compile to fail + // + // Including those header files before the following macro definition for "min" / "max", only partially resolves the issue + // This is because other HIP header files also define "isnan" / "isinf" / "isfinite" functions, which are needed in other + // headers. + // + // So instead choosing to simply disable this check for HIP + // #define min(A,B) please_protect_your_min_with_parentheses #define max(A,B) please_protect_your_max_with_parentheses #define isnan(X) please_protect_your_isnan_with_parentheses diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index e94e577fc..5c1c68912 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -35,7 +35,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) // Running on the host CPU return 1; #elif defined(EIGEN_HIP_DEVICE_COMPILE) @@ -48,9 +48,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) // Running on the host CPU return l1CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 48*1024; // FIXME : update this number for HIP #else // Running on a CUDA device, return the amount of shared memory available. return 48*1024; @@ -58,9 +61,12 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) // Running single threaded on the host CPU return l3CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return firstLevelCacheSize(); // FIXME : update this number for HIP #else // Running on a CUDA device return firstLevelCacheSize(); @@ -68,7 +74,7 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 24a57970a..8bbe449cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -201,7 +201,7 @@ class TensorExecutor { }; -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) +#if defined(EIGEN_GPUCC) template struct EigenMetaKernelEval { static __device__ EIGEN_ALWAYS_INLINE @@ -276,7 +276,7 @@ inline void TensorExecutor::run( evaluator.cleanup(); } -#endif // EIGEN_CUDACC || EIGEN_HIPCC +#endif // EIGEN_GPUCC #endif // EIGEN_USE_GPU // SYCL Executor policy diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 25ba2001e..b6d445c50 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -35,7 +35,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef EIGEN_CUDA_ARCH +#ifdef EIGEN_GPU_COMPILE_PHASE return __clz(val); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::clz(val); @@ -53,7 +53,7 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef EIGEN_CUDA_ARCH +#ifdef EIGEN_GPU_COMPILE_PHASE return __clzll(val); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::clz(val); @@ -90,7 +90,7 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __umulhi(a, b); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::mul_hi(a, static_cast(b)); @@ -101,7 +101,7 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(EIGEN_CUDA_ARCH) +#if defined(EIGEN_GPU_COMPILE_PHASE) return __umul64hi(a, b); #elif defined(__SYCL_DEVICE_ONLY__) return cl::sycl::mul_hi(a, static_cast(b)); @@ -124,7 +124,7 @@ namespace { template struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if defined(__SIZEOF_INT128__) && !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) +#if defined(__SIZEOF_INT128__) && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(__SYCL_DEVICE_ONLY__) return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -203,7 +203,7 @@ class TensorIntDivisor { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef EIGEN_CUDA_ARCH +#ifdef EIGEN_GPU_COMPILE_PHASE return (__umulhi(magic, n) >> shift); #elif defined(__SYCL_DEVICE_ONLY__) return (cl::sycl::mul_hi(static_cast(magic), static_cast(n)) >> shift); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index 8e1ba486d..c6ca396a3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -27,7 +27,7 @@ */ // SFINAE requires variadic templates -#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC) +#if !defined(EIGEN_GPUCC) #if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index de1075cc1..87be090f9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && ((defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)) || (defined(EIGEN_HIPCC) && defined(EIGEN_HAS_HIP_FP16))) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) template <> struct PacketType { typedef half2 type; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 71536a4b9..5a547141a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -16,7 +16,7 @@ namespace internal { namespace { EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_COMPILE_PHASE) // We don't support 3d kernels since we currently only use 1 and // 2d kernels. assert(threadIdx.z == 0); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index d2fb3fd32..fdd338b96 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -334,12 +334,12 @@ struct OuterReducer { }; -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) template __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +#if defined(EIGEN_HAS_GPU_FP16) template __global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); template @@ -698,9 +698,9 @@ struct TensorEvaluator, #ifdef EIGEN_USE_THREADS template friend struct internal::FullReducerShard; #endif -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +#if defined(EIGEN_HAS_GPU_FP16) template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); @@ -793,7 +793,7 @@ struct TensorEvaluator, Op m_reducer; // For full reductions -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) static const bool RunningOnGPU = internal::is_same::value; static const bool RunningOnSycl = false; #elif defined(EIGEN_USE_SYCL) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 174a6a064..6d68e256f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -242,7 +242,7 @@ struct ScanLauncher { } }; -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)) +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) // GPU implementation of scan // TODO(ibab) This placeholder implementation performs multiple scans in @@ -286,7 +286,7 @@ struct ScanLauncher { #endif } }; -#endif // EIGEN_USE_GPU && (EIGEN_CUDACC || EIGEN_HIPCC) +#endif // EIGEN_USE_GPU && (EIGEN_GPUCC) } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h index bb584e3f9..8de3bbcab 100644 --- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h @@ -268,10 +268,7 @@ template< typename Reducer > struct reduce { - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - constexpr static inline int run() { return Reducer::Identity; } + EIGEN_DEVICE_FUNC constexpr static inline int run() { return Reducer::Identity; } }; template< @@ -279,10 +276,7 @@ template< typename A > struct reduce { - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - constexpr static inline A run(A a) { return a; } + EIGEN_DEVICE_FUNC constexpr static inline A run(A a) { return a; } }; template< @@ -291,10 +285,7 @@ template< typename... Ts > struct reduce { - #if defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce::run(ts...))) { + EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce::run(ts...))) { return Reducer::run(a, reduce::run(ts...)); } }; @@ -333,10 +324,7 @@ struct greater_equal_zero_op { template constexpr static inline auto // together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 // does... template -#if defined(EIGEN_HIPCC) -EIGEN_DEVICE_FUNC -#endif -constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_prod(Ts... ts) +EIGEN_DEVICE_FUNC constexpr inline decltype(reduce::run((*((Ts*)0))...)) arg_prod(Ts... ts) { return reduce::run(ts...); } diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 5b01c5fb7..d91662d96 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -15,7 +15,7 @@ // The array class is only available starting with cxx11. Emulate our own here // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler! // Moreover, CUDA doesn't support the STL containers, so we use our own instead. -#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) || defined(EIGEN_AVOID_STL_ARRAY) +#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY) namespace Eigen { template class array { diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 9a719e3c0..fbbc87661 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -190,7 +190,7 @@ template <> struct lgamma_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) { -#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) int dummy; return ::lgammaf_r(x, &dummy); #else @@ -203,7 +203,7 @@ template <> struct lgamma_impl { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) { -#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) int dummy; return ::lgamma_r(x, &dummy); #else diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h index 020ac1b62..ad011e305 100644 --- a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h +++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) +#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plgamma(const float4& a) diff --git a/unsupported/test/cxx11_tensor_hip.cu b/unsupported/test/cxx11_tensor_hip.cu index f1794aa87..b28840267 100644 --- a/unsupported/test/cxx11_tensor_hip.cu +++ b/unsupported/test/cxx11_tensor_hip.cu @@ -1222,9 +1222,8 @@ void test_cxx11_tensor_hip() CALL_SUBTEST(test_hip_elementwise()); CALL_SUBTEST(test_hip_props()); CALL_SUBTEST(test_hip_reduction()); - // FIXME : uncommenting following tests results in compile failure - // CALL_SUBTEST(test_hip_contraction()); - // CALL_SUBTEST(test_hip_contraction()); + CALL_SUBTEST(test_hip_contraction()); + CALL_SUBTEST(test_hip_contraction()); CALL_SUBTEST(test_hip_convolution_1d()); CALL_SUBTEST(test_hip_convolution_1d()); CALL_SUBTEST(test_hip_convolution_inner_dim_col_major_1d()); -- cgit v1.2.3