From b08527b0c1ffdbd44347ca3a7869f10b0cb3cbb6 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Fri, 31 May 2019 15:26:06 -0700
Subject: Clean up CUDA/NVCC version macros and their use in Eigen, and a few
 other CUDA build failures.

---
 Eigen/src/Core/Visitor.h                           |  1 +
 Eigen/src/Core/arch/GPU/Half.h                     | 47 +++++++++++-----------
 Eigen/src/Core/arch/GPU/PacketMath.h               |  2 +
 Eigen/src/Core/arch/GPU/PacketMathHalf.h           |  7 +++-
 Eigen/src/Core/util/ConfigureVectorization.h       |  2 +-
 Eigen/src/Core/util/Macros.h                       | 25 +++++++++---
 test/gpu_common.h                                  |  9 +++--
 test/main.h                                        | 16 ++++----
 .../Eigen/CXX11/src/Tensor/TensorContractionGpu.h  |  4 +-
 .../CXX11/src/Tensor/TensorGpuHipCudaDefines.h     |  4 +-
 .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h    |  8 ++--
 11 files changed, 76 insertions(+), 49 deletions(-)
diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h
index f67d83bd1..67a69c54f 100644
--- a/Eigen/src/Core/Visitor.h
+++ b/Eigen/src/Core/Visitor.h
@@ -138,6 +138,7 @@ template <typename Derived>
 struct coeff_visitor
 {
   // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
+  EIGEN_DEVICE_FUNC
   coeff_visitor() : row(-1), col(-1), res(0) {}
   typedef typename Derived::Scalar Scalar;
   Index row, col;
diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h
index 5affe8b92..e433bfe1c 100644
--- a/Eigen/src/Core/arch/GPU/Half.h
+++ b/Eigen/src/Core/arch/GPU/Half.h
@@ -60,7 +60,7 @@ struct __half_raw {
   // Nothing to do here
   // HIP fp16 header file has a definition for __half_raw
 #elif defined(EIGEN_HAS_CUDA_FP16)
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+ #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
  typedef __half __half_raw;
  #endif // defined(EIGEN_HAS_CUDA_FP16)
@@ -83,7 +83,7 @@ struct half_base : public __half_raw {
  #if defined(EIGEN_HAS_HIP_FP16)
   EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
  #elif defined(EIGEN_HAS_CUDA_FP16)
-  #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
+  #if (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000)
   EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
   #endif
  #endif    
@@ -103,11 +103,12 @@ struct half : public half_impl::half_base {
   // Nothing to do here
   // HIP fp16 header file has a definition for __half_raw
 #elif defined(EIGEN_HAS_CUDA_FP16)
-  // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
-  // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
- #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000  
-  typedef half_impl::__half_raw __half_raw;
- #endif
+  // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
+  // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
+  // #if defined(EIGEN_HAS_CUDA_FP16) is needed
+  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    typedef half_impl::__half_raw __half_raw;
+  #endif
 #endif
 
   EIGEN_DEVICE_FUNC half() {}
@@ -119,7 +120,7 @@ struct half : public half_impl::half_base {
  #if defined(EIGEN_HAS_HIP_FP16)
   EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
  #elif defined(EIGEN_HAS_CUDA_FP16)
-  #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
   EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
   #endif
  #endif
@@ -238,9 +239,9 @@ namespace Eigen {
 
 namespace half_impl {
 
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) || \
-  (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
+     EIGEN_CUDA_ARCH >= 530) ||                                  \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
 #define EIGEN_HAS_NATIVE_FP16
 #endif
 
@@ -251,7 +252,7 @@ namespace half_impl {
 
 #if defined(EIGEN_HAS_NATIVE_FP16)
 EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
-#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
   return __hadd(::__half(a), ::__half(b));
 #else
   return __hadd(a, b);
@@ -264,7 +265,7 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
   return __hsub(a, b);
 }
 EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
-#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
   return __hdiv(a, b);
 #else
   float num = __half2float(a);
@@ -312,13 +313,13 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
 
 #endif
 
-#if !defined(EIGEN_HAS_NATIVE_FP16) || defined(__clang__) // Emulate support for half floats
+#if !defined(EIGEN_HAS_NATIVE_FP16) || EIGEN_COMP_CLANG // Emulate support for half floats
 
-#if defined(__clang__) && defined(__CUDA__)
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
 // We need to provide emulated *host-side* FP16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
 #define EIGEN_DEVICE_FUNC __host__
 #else // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
@@ -517,7 +518,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
   return result;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hexp(a));
 #else
@@ -528,7 +529,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
   return half(numext::expm1(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
   (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return half(::hlog(a));
 #else
@@ -542,7 +543,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
   return half(::log10f(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hsqrt(a));
 #else
@@ -565,7 +566,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
   return half(::tanhf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hfloor(a));
 #else
@@ -573,7 +574,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hceil(a));
 #else
@@ -673,7 +674,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
   return Eigen::half(::expf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
   return Eigen::half(::hlog(a));
 #else
@@ -712,7 +713,7 @@ struct hash<Eigen::half> {
   defined(EIGEN_HIP_DEVICE_COMPILE)
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
-  #if (EIGEN_CUDACC_VER < 90000) || \
+  #if (EIGEN_CUDA_SDK_VER < 90000) || \
     defined(EIGEN_HAS_HIP_FP16)
   return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
   #else
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index ee5b4b39e..adb260643 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -102,6 +102,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
   return make_double2(from, from);
 }
 
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG)
 namespace {
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
@@ -213,6 +214,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
 pcmp_eq<double2>(const double2& a, const double2& b) {
   return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
 }
+#endif  // EIGEN_CUDA_ARCH || defined(EIGEN_HIP_DEVICE_COMPILE)
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
   return make_float4(a, a+1, a+2, a+3);
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index 723506c8b..b04a4d7d6 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -73,8 +73,13 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen:
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+  to[0] = from.x;
+  to[1] = from.y;
+#else
   to[0] = __low2half(from);
   to[1] = __high2half(from);
+#endif
 }
 
 template<>
@@ -477,7 +482,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2
   return __floats2half2_rn(r1, r2);
 }
 
-#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
 
 template<>  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 1514975d1..d2f2f33f7 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -380,7 +380,7 @@
 #if defined EIGEN_CUDACC
   #define EIGEN_VECTORIZE_GPU
   #include <vector_types.h>
-  #if EIGEN_CUDACC_VER >= 70500
+  #if EIGEN_CUDA_SDK_VER >= 70500
     #define EIGEN_HAS_CUDA_FP16
   #endif
 #endif
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 29be2a74f..31f91bd49 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -108,6 +108,18 @@
   #define EIGEN_COMP_MSVC 0
 #endif
 
+#if defined(__NVCC__)
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  #define EIGEN_COMP_NVCC  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+  #define EIGEN_COMP_NVCC __CUDACC_VER__
+#else
+  #error "NVCC did not define compiler version."
+#endif
+#else
+  #define EIGEN_COMP_NVCC 0
+#endif
+
 // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
 //  name        ver   MSC_VER
 //  2008         9      1500
@@ -408,10 +420,11 @@
   #define EIGEN_CUDA_ARCH __CUDA_ARCH__
 #endif
 
-#if defined(CUDA_VERSION)
-  #define EIGEN_CUDACC_VER (CUDA_VERSION*10)
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
 #else
-  #define EIGEN_CUDACC_VER 0
+  #define EIGEN_CUDA_SDK_VER 0
 #endif
 
 #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
@@ -622,7 +635,7 @@
 // Does the compiler support variadic templates?
 #ifndef EIGEN_HAS_VARIADIC_TEMPLATES
 #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
-  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
+  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) )
     // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
     //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
@@ -637,7 +650,7 @@
 #ifndef EIGEN_HAS_CONSTEXPR
   #if defined(EIGEN_CUDACC)
   // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
-    #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
+    #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500))
       #define EIGEN_HAS_CONSTEXPR 1
     #endif
   #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
@@ -981,7 +994,7 @@ namespace Eigen {
 #endif
 
 
-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_COMP_NVCC)
   // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
   #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
     using Base::operator =;
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 79d4ea694..fd8278f67 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -1,4 +1,3 @@
-
 #ifndef EIGEN_TEST_GPU_COMMON_H
 #define EIGEN_TEST_GPU_COMMON_H
 
@@ -130,10 +129,14 @@ void ei_test_init_gpu()
   std::cout << "  EIGEN_CUDACC:                 " << int(EIGEN_CUDACC) << "\n";
   #endif
   
-  #ifdef EIGEN_CUDACC_VER
-  std::cout << "  EIGEN_CUDACC_VER:             " << int(EIGEN_CUDACC_VER) << "\n";
+  #ifdef EIGEN_CUDA_SDK_VER
+  std::cout << "  EIGEN_CUDA_SDK_VER:             " << int(EIGEN_CUDA_SDK_VER) << "\n";
   #endif
 
+  #ifdef EIGEN_COMP_NVCC
+  std::cout << "  EIGEN_COMP_NVCC:             " << int(EIGEN_COMP_NVCC) << "\n";
+  #endif
+  
   #ifdef EIGEN_HIPCC
   std::cout << "  EIGEN_HIPCC:                 " << int(EIGEN_HIPCC) << "\n";
   #endif
diff --git a/test/main.h b/test/main.h
index 1fe631ca9..93e894460 100644
--- a/test/main.h
+++ b/test/main.h
@@ -52,15 +52,17 @@
 #endif
 
 // Same for cuda_fp16.h
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
-#define EIGEN_TEST_CUDACC_VER  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_TEST_CUDACC_VER __CUDACC_VER__
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  // Means the compiler is either nvcc or clang with CUDA enabled
+  #define EIGEN_CUDACC __CUDACC__
+#endif
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
 #else
-#define EIGEN_TEST_CUDACC_VER 0
+  #define EIGEN_CUDA_SDK_VER 0
 #endif
-
-#if EIGEN_TEST_CUDACC_VER >= 70500
+#if EIGEN_CUDA_SDK_VER >= 70500
 #include <cuda_fp16.h>
 #endif
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
index 5d19652e6..3471d1056 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -388,7 +388,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   // the sum across all big k blocks of the product of little k block of index (x, y)
   // with block of index (y, z). To compute the final output, we need to reduce
   // the 8 threads over y by summation.
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
 #else
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
@@ -621,7 +621,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
       x1 = rhs_pf0.x;
       x2 = rhs_pf0.z;
     }
-    #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+    #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
     x1 = __shfl_xor(x1, 4);
     x2 = __shfl_xor(x2, 4);
     #else
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
index c03096363..f32ce27e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -81,8 +81,8 @@
 // gpu_assert can be overridden
 #ifndef gpu_assert
 
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && (EIGEN_CUDACC_VER==0))
-// clang-cuda and HIPCC do not support the use of assert on the GPU side.
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// HIPCC do not support the use of assert on the GPU side.
 #define gpu_assert(COND)
 #else
 #define gpu_assert(COND) assert(COND)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 7ee4a6087..095bb54cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -177,7 +177,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
     } else {
       reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
     }
-  #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
     reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
   #else
     reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
@@ -269,7 +269,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
     wka_in.h = accum;
     wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
     reducer.reducePacket(wka_out.h, &accum);
-  #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
     reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
   #else
     int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
@@ -466,7 +466,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
         } else {
           reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
         }
-      #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
         reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
       #else
         reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
@@ -571,7 +571,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 	wka_in.h = reduced_val2;
 	wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
         reducer.reducePacket(wka_out.h, &reduced_val2);
-      #elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
         reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
         reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
       #else
-- 
cgit v1.2.3