aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Rasmus Munk Larsen <rmlarsen@google.com>2019-05-31 15:26:06 -0700
committerGravatar Rasmus Munk Larsen <rmlarsen@google.com>2019-05-31 15:26:06 -0700
commitb08527b0c1ffdbd44347ca3a7869f10b0cb3cbb6 (patch)
tree0e96b895f59e4a77ca9880d2e219ffbde11680ac
parent56144005811e3e5a76031ba0aac8a4e1fa3e3396 (diff)
Clean up CUDA/NVCC version macros and their use in Eigen, and a few other CUDA build failures.
-rw-r--r--Eigen/src/Core/Visitor.h1
-rw-r--r--Eigen/src/Core/arch/GPU/Half.h47
-rw-r--r--Eigen/src/Core/arch/GPU/PacketMath.h2
-rw-r--r--Eigen/src/Core/arch/GPU/PacketMathHalf.h7
-rw-r--r--Eigen/src/Core/util/ConfigureVectorization.h2
-rw-r--r--Eigen/src/Core/util/Macros.h25
-rw-r--r--test/gpu_common.h9
-rw-r--r--test/main.h16
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h4
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h4
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h8
11 files changed, 76 insertions, 49 deletions
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index f67d83bd1..67a69c54f 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -138,6 +138,7 @@ template <typename Derived>
struct coeff_visitor
{
// default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
+ EIGEN_DEVICE_FUNC
coeff_visitor() : row(-1), col(-1), res(0) {}
typedef typename Derived::Scalar Scalar;
Index row, col;
diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h
index 5affe8b92..e433bfe1c 100644
--- a/Eigen/src/Core/arch/GPU/Half.h
+++ b/Eigen/src/Core/arch/GPU/Half.h
@@ -60,7 +60,7 @@ struct __half_raw {
// Nothing to do here
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif // defined(EIGEN_HAS_CUDA_FP16)
@@ -83,7 +83,7 @@ struct half_base : public __half_raw {
#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
#elif defined(EIGEN_HAS_CUDA_FP16)
- #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
+ #if (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000)
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
#endif
@@ -103,11 +103,12 @@ struct half : public half_impl::half_base {
// Nothing to do here
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
- // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
- // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
- typedef half_impl::__half_raw __half_raw;
- #endif
+ // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
+ // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
+ // #if defined(EIGEN_HAS_CUDA_FP16) is needed
+ #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+ typedef half_impl::__half_raw __half_raw;
+ #endif
#endif
EIGEN_DEVICE_FUNC half() {}
@@ -119,7 +120,7 @@ struct half : public half_impl::half_base {
#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#elif defined(EIGEN_HAS_CUDA_FP16)
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#endif
#endif
@@ -238,9 +239,9 @@ namespace Eigen {
namespace half_impl {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) || \
- (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+ EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
#define EIGEN_HAS_NATIVE_FP16
#endif
@@ -251,7 +252,7 @@ namespace half_impl {
#if defined(EIGEN_HAS_NATIVE_FP16)
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
-#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
return __hadd(::__half(a), ::__half(b));
#else
return __hadd(a, b);
@@ -264,7 +265,7 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
-#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
return __hdiv(a, b);
#else
float num = __half2float(a);
@@ -312,13 +313,13 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
#endif
-#if !defined(EIGEN_HAS_NATIVE_FP16) || defined(__clang__) // Emulate support for half floats
+#if !defined(EIGEN_HAS_NATIVE_FP16) || EIGEN_COMP_CLANG // Emulate support for half floats
-#if defined(__clang__) && defined(__CUDA__)
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
// We need to provide emulated *host-side* FP16 operators for clang.
#pragma push_macro("EIGEN_DEVICE_FUNC")
#undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
#define EIGEN_DEVICE_FUNC __host__
#else // both host and device need emulated ops.
#define EIGEN_DEVICE_FUNC __host__ __device__
@@ -517,7 +518,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp(a));
#else
@@ -528,7 +529,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return half(::hlog(a));
#else
@@ -542,7 +543,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hsqrt(a));
#else
@@ -565,7 +566,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hfloor(a));
#else
@@ -573,7 +574,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hceil(a));
#else
@@ -673,7 +674,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return Eigen::half(::hlog(a));
#else
@@ -712,7 +713,7 @@ struct hash<Eigen::half> {
defined(EIGEN_HIP_DEVICE_COMPILE)
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
- #if (EIGEN_CUDACC_VER < 90000) || \
+ #if (EIGEN_CUDA_SDK_VER < 90000) || \
defined(EIGEN_HAS_HIP_FP16)
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
#else
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index ee5b4b39e..adb260643 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -102,6 +102,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
return make_double2(from, from);
}
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG)
namespace {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
@@ -213,6 +214,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
pcmp_eq<double2>(const double2& a, const double2& b) {
return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
}
+#endif // EIGEN_CUDA_ARCH || defined(EIGEN_HIP_DEVICE_COMPILE)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
return make_float4(a, a+1, a+2, a+3);
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index 723506c8b..b04a4d7d6 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -73,8 +73,13 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen:
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+ to[0] = from.x;
+ to[1] = from.y;
+#else
to[0] = __low2half(from);
to[1] = __high2half(from);
+#endif
}
template<>
@@ -477,7 +482,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2
return __floats2half2_rn(r1, r2);
}
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 1514975d1..d2f2f33f7 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -380,7 +380,7 @@
#if defined EIGEN_CUDACC
#define EIGEN_VECTORIZE_GPU
#include <vector_types.h>
- #if EIGEN_CUDACC_VER >= 70500
+ #if EIGEN_CUDA_SDK_VER >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 29be2a74f..31f91bd49 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -108,6 +108,18 @@
#define EIGEN_COMP_MSVC 0
#endif
+#if defined(__NVCC__)
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+ #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+ #define EIGEN_COMP_NVCC __CUDACC_VER__
+#else
+ #error "NVCC did not define compiler version."
+#endif
+#else
+ #define EIGEN_COMP_NVCC 0
+#endif
+
// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
// name ver MSC_VER
// 2008 9 1500
@@ -408,10 +420,11 @@
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif
-#if defined(CUDA_VERSION)
- #define EIGEN_CUDACC_VER (CUDA_VERSION*10)
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+ #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
#else
- #define EIGEN_CUDACC_VER 0
+ #define EIGEN_CUDA_SDK_VER 0
#endif
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
@@ -622,7 +635,7 @@
// Does the compiler support variadic templates?
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
- && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
+ && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) )
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
// this prevents nvcc from crashing when compiling Eigen on Tegra X1
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
@@ -637,7 +650,7 @@
#ifndef EIGEN_HAS_CONSTEXPR
#if defined(EIGEN_CUDACC)
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
- #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
+ #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500))
#define EIGEN_HAS_CONSTEXPR 1
#endif
#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
@@ -981,7 +994,7 @@ namespace Eigen {
#endif
-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_COMP_NVCC)
// for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =;
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 79d4ea694..fd8278f67 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -1,4 +1,3 @@
-
#ifndef EIGEN_TEST_GPU_COMMON_H
#define EIGEN_TEST_GPU_COMMON_H
@@ -130,10 +129,14 @@ void ei_test_init_gpu()
std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << "\n";
#endif
- #ifdef EIGEN_CUDACC_VER
- std::cout << " EIGEN_CUDACC_VER: " << int(EIGEN_CUDACC_VER) << "\n";
+ #ifdef EIGEN_CUDA_SDK_VER
+ std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << "\n";
#endif
+ #ifdef EIGEN_COMP_NVCC
+ std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << "\n";
+ #endif
+
#ifdef EIGEN_HIPCC
std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << "\n";
#endif
diff --git a/test/main.h b/test/main.h
index 1fe631ca9..93e894460 100644
--- a/test/main.h
+++ b/test/main.h
@@ -52,15 +52,17 @@
#endif
// Same for cuda_fp16.h
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
-#define EIGEN_TEST_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_TEST_CUDACC_VER __CUDACC_VER__
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+ // Means the compiler is either nvcc or clang with CUDA enabled
+ #define EIGEN_CUDACC __CUDACC__
+#endif
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+ #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
#else
-#define EIGEN_TEST_CUDACC_VER 0
+ #define EIGEN_CUDA_SDK_VER 0
#endif
-
-#if EIGEN_TEST_CUDACC_VER >= 70500
+#if EIGEN_CUDA_SDK_VER >= 70500
#include <cuda_fp16.h>
#endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
index 5d19652e6..3471d1056 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -388,7 +388,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
// the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation.
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
#else
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
@@ -621,7 +621,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
x1 = rhs_pf0.x;
x2 = rhs_pf0.z;
}
- #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+ #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
x1 = __shfl_xor(x1, 4);
x2 = __shfl_xor(x2, 4);
#else
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
index c03096363..f32ce27e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -81,8 +81,8 @@
// gpu_assert can be overridden
#ifndef gpu_assert
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && (EIGEN_CUDACC_VER==0))
-// clang-cuda and HIPCC do not support the use of assert on the GPU side.
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// HIPCC do not support the use of assert on the GPU side.
#define gpu_assert(COND)
#else
#define gpu_assert(COND) assert(COND)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 7ee4a6087..095bb54cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -177,7 +177,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
} else {
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
}
- #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
@@ -269,7 +269,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
wka_in.h = accum;
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
reducer.reducePacket(wka_out.h, &accum);
- #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
#else
int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
@@ -466,7 +466,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
} else {
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
}
- #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
@@ -571,7 +571,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
wka_in.h = reduced_val2;
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
reducer.reducePacket(wka_out.h, &reduced_val2);
- #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
#else