From b08527b0c1ffdbd44347ca3a7869f10b0cb3cbb6 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 31 May 2019 15:26:06 -0700 Subject: Clean up CUDA/NVCC version macros and their use in Eigen, and a few other CUDA build failures. --- Eigen/src/Core/Visitor.h | 1 + Eigen/src/Core/arch/GPU/Half.h | 47 +++++++++++----------- Eigen/src/Core/arch/GPU/PacketMath.h | 2 + Eigen/src/Core/arch/GPU/PacketMathHalf.h | 7 +++- Eigen/src/Core/util/ConfigureVectorization.h | 2 +- Eigen/src/Core/util/Macros.h | 25 +++++++++--- test/gpu_common.h | 9 +++-- test/main.h | 16 ++++---- .../Eigen/CXX11/src/Tensor/TensorContractionGpu.h | 4 +- .../CXX11/src/Tensor/TensorGpuHipCudaDefines.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 8 ++-- 11 files changed, 76 insertions(+), 49 deletions(-) diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index f67d83bd1..67a69c54f 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -138,6 +138,7 @@ template struct coeff_visitor { // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc + EIGEN_DEVICE_FUNC coeff_visitor() : row(-1), col(-1), res(0) {} typedef typename Derived::Scalar Scalar; Index row, col; diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h index 5affe8b92..e433bfe1c 100644 --- a/Eigen/src/Core/arch/GPU/Half.h +++ b/Eigen/src/Core/arch/GPU/Half.h @@ -60,7 +60,7 @@ struct __half_raw { // Nothing to do here // HIP fp16 header file has a definition for __half_raw #elif defined(EIGEN_HAS_CUDA_FP16) - #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw typedef __half __half_raw; #endif // defined(EIGEN_HAS_CUDA_FP16) @@ -83,7 +83,7 @@ struct half_base : public __half_raw { #if defined(EIGEN_HAS_HIP_FP16) EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); } #elif defined(EIGEN_HAS_CUDA_FP16) - #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000) + #if (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000) EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} #endif #endif @@ -103,11 +103,12 @@ struct half : public half_impl::half_base { // Nothing to do here // HIP fp16 header file has a definition for __half_raw #elif defined(EIGEN_HAS_CUDA_FP16) - // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP! - // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed - #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 - typedef half_impl::__half_raw __half_raw; - #endif + // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so + // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within + // #if defined(EIGEN_HAS_CUDA_FP16) is needed + #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + typedef half_impl::__half_raw __half_raw; + #endif #endif EIGEN_DEVICE_FUNC half() {} @@ -119,7 +120,7 @@ struct half : public half_impl::half_base { #if defined(EIGEN_HAS_HIP_FP16) EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} #elif defined(EIGEN_HAS_CUDA_FP16) - #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} #endif #endif @@ -238,9 +239,9 @@ namespace Eigen { namespace half_impl { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) || \ - (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__)) +#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ + EIGEN_CUDA_ARCH >= 530) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) #define EIGEN_HAS_NATIVE_FP16 #endif @@ -251,7 +252,7 @@ namespace half_impl { #if defined(EIGEN_HAS_NATIVE_FP16) EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { -#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 +#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 return __hadd(::__half(a), ::__half(b)); #else return __hadd(a, b); @@ -264,7 +265,7 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { return __hsub(a, b); } EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { -#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 +#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 return __hdiv(a, b); #else float num = __half2float(a); @@ -312,13 +313,13 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { #endif -#if !defined(EIGEN_HAS_NATIVE_FP16) || defined(__clang__) // Emulate support for half floats +#if !defined(EIGEN_HAS_NATIVE_FP16) || EIGEN_COMP_CLANG // Emulate support for half floats -#if defined(__clang__) && defined(__CUDA__) +#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) // We need to provide emulated *host-side* FP16 operators for clang. #pragma push_macro("EIGEN_DEVICE_FUNC") #undef EIGEN_DEVICE_FUNC -#if defined(EIGEN_HAS_GPU_FP16) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16) #define EIGEN_DEVICE_FUNC __host__ #else // both host and device need emulated ops. #define EIGEN_DEVICE_FUNC __host__ __device__ @@ -517,7 +518,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ defined(EIGEN_HIP_DEVICE_COMPILE) return half(hexp(a)); #else @@ -528,7 +529,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ +#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return half(::hlog(a)); #else @@ -542,7 +543,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ defined(EIGEN_HIP_DEVICE_COMPILE) return half(hsqrt(a)); #else @@ -565,7 +566,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ defined(EIGEN_HIP_DEVICE_COMPILE) return half(hfloor(a)); #else @@ -573,7 +574,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ defined(EIGEN_HIP_DEVICE_COMPILE) return half(hceil(a)); #else @@ -673,7 +674,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ defined(EIGEN_HIP_DEVICE_COMPILE) return Eigen::half(::hlog(a)); #else @@ -712,7 +713,7 @@ struct hash { defined(EIGEN_HIP_DEVICE_COMPILE) __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { - #if (EIGEN_CUDACC_VER < 90000) || \ + #if (EIGEN_CUDA_SDK_VER < 90000) || \ defined(EIGEN_HAS_HIP_FP16) return static_cast(__shfl_xor(static_cast(var), laneMask, width)); #else diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index ee5b4b39e..adb260643 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -102,6 +102,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const do return make_double2(from, from); } +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG) namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, @@ -213,6 +214,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq(const double2& a, const double2& b) { return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y)); } +#endif // EIGEN_CUDA_ARCH || defined(EIGEN_HIP_DEVICE_COMPILE) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { return make_float4(a, a+1, a+2, a+3); diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index 723506c8b..b04a4d7d6 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -73,8 +73,13 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen: } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) + to[0] = from.x; + to[1] = from.y; +#else to[0] = __low2half(from); to[1] = __high2half(from); +#endif } template<> @@ -477,7 +482,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2 return __floats2half2_rn(r1, r2); } -#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ +#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ defined(EIGEN_HIP_DEVICE_COMPILE) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 1514975d1..d2f2f33f7 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -380,7 +380,7 @@ #if defined EIGEN_CUDACC #define EIGEN_VECTORIZE_GPU #include - #if EIGEN_CUDACC_VER >= 70500 + #if EIGEN_CUDA_SDK_VER >= 70500 #define EIGEN_HAS_CUDA_FP16 #endif #endif diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 29be2a74f..31f91bd49 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -108,6 +108,18 @@ #define EIGEN_COMP_MSVC 0 #endif +#if defined(__NVCC__) +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) + #define EIGEN_COMP_NVCC __CUDACC_VER__ +#else + #error "NVCC did not define compiler version." +#endif +#else + #define EIGEN_COMP_NVCC 0 +#endif + // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC: // name ver MSC_VER // 2008 9 1500 @@ -408,10 +420,11 @@ #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif -#if defined(CUDA_VERSION) - #define EIGEN_CUDACC_VER (CUDA_VERSION*10) +#if defined(EIGEN_CUDACC) +#include + #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10) #else - #define EIGEN_CUDACC_VER 0 + #define EIGEN_CUDA_SDK_VER 0 #endif #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) @@ -622,7 +635,7 @@ // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) ) + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 @@ -637,7 +650,7 @@ #ifndef EIGEN_HAS_CONSTEXPR #if defined(EIGEN_CUDACC) // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above - #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500)) + #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500)) #define EIGEN_HAS_CONSTEXPR 1 #endif #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ @@ -981,7 +994,7 @@ namespace Eigen { #endif -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_COMP_NVCC) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; diff --git a/test/gpu_common.h b/test/gpu_common.h index 79d4ea694..fd8278f67 100644 --- a/test/gpu_common.h +++ b/test/gpu_common.h @@ -1,4 +1,3 @@ - #ifndef EIGEN_TEST_GPU_COMMON_H #define EIGEN_TEST_GPU_COMMON_H @@ -130,10 +129,14 @@ void ei_test_init_gpu() std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << "\n"; #endif - #ifdef EIGEN_CUDACC_VER - std::cout << " EIGEN_CUDACC_VER: " << int(EIGEN_CUDACC_VER) << "\n"; + #ifdef EIGEN_CUDA_SDK_VER + std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << "\n"; #endif + #ifdef EIGEN_COMP_NVCC + std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << "\n"; + #endif + #ifdef EIGEN_HIPCC std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << "\n"; #endif diff --git a/test/main.h b/test/main.h index 1fe631ca9..93e894460 100644 --- a/test/main.h +++ b/test/main.h @@ -52,15 +52,17 @@ #endif // Same for cuda_fp16.h -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) -#define EIGEN_TEST_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) -#elif defined(__CUDACC_VER__) -#define EIGEN_TEST_CUDACC_VER __CUDACC_VER__ +#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) + // Means the compiler is either nvcc or clang with CUDA enabled + #define EIGEN_CUDACC __CUDACC__ +#endif +#if defined(EIGEN_CUDACC) +#include + #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10) #else -#define EIGEN_TEST_CUDACC_VER 0 + #define EIGEN_CUDA_SDK_VER 0 #endif - -#if EIGEN_TEST_CUDACC_VER >= 70500 +#if EIGEN_CUDA_SDK_VER >= 70500 #include #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index 5d19652e6..3471d1056 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -388,7 +388,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, // the sum across all big k blocks of the product of little k block of index (x, y) // with block of index (y, z). To compute the final output, we need to reduce // the 8 threads over y by summation. -#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) +#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) #else #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) @@ -621,7 +621,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh x1 = rhs_pf0.x; x2 = rhs_pf0.z; } - #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) + #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) x1 = __shfl_xor(x1, 4); x2 = __shfl_xor(x2, 4); #else diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h index c03096363..f32ce27e9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -81,8 +81,8 @@ // gpu_assert can be overridden #ifndef gpu_assert -#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && (EIGEN_CUDACC_VER==0)) -// clang-cuda and HIPCC do not support the use of assert on the GPU side. +#if defined(EIGEN_HIP_DEVICE_COMPILE) +// HIPCC do not support the use of assert on the GPU side. #define gpu_assert(COND) #else #define gpu_assert(COND) assert(COND) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index 7ee4a6087..095bb54cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -177,7 +177,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num } else { reducer.reduce(__shfl_down(static_cast(accum), offset, warpSize), &accum); } - #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); #else reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); @@ -269,7 +269,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, wka_in.h = accum; wka_out.i = __shfl_down(wka_in.i, offset, warpSize); reducer.reducePacket(wka_out.h, &accum); - #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); #else int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize); @@ -466,7 +466,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } else { reducer.reduce(__shfl_down(static_cast(reduced_val), offset), &reduced_val); } - #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); #else reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); @@ -571,7 +571,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, wka_in.h = reduced_val2; wka_out.i = __shfl_down(wka_in.i, offset, warpSize); reducer.reducePacket(wka_out.h, &reduced_val2); - #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 + #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1); reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2); #else -- cgit v1.2.3