From 734a50dc601c51a7d1ed7b422ac5db374b1e5805 Mon Sep 17 00:00:00 2001
From: Rasmus Munk Larsen <rmlarsen@google.com>
Date: Wed, 15 May 2019 13:32:15 -0700
Subject: Make Eigen build with cuda 10 and clang.

---
 Eigen/src/Core/arch/GPU/Half.h                     | 24 ++++++++++++++++++++--
 Eigen/src/Core/arch/GPU/PacketMathHalf.h           | 10 ++++++++-
 Eigen/src/Core/util/Macros.h                       |  7 ++-----
 .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h    |  4 ----
 4 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h
index 6f1362f2b..0af1b14f5 100644
--- a/Eigen/src/Core/arch/GPU/Half.h
+++ b/Eigen/src/Core/arch/GPU/Half.h
@@ -239,13 +239,17 @@ namespace Eigen {
 namespace half_impl {
 
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+  (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) || \
+  (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
+#define __EIGEN_NATIVE_FP16
+#endif
 
 // Intrinsics for native fp16 support. Note that on current hardware,
 // these are no faster than fp32 arithmetic (you need to use the half2
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.
 
+#if defined(__EIGEN_NATIVE_FP16)
 EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
 #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
   return __hadd(::__half(a), ::__half(b));
@@ -306,7 +310,20 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
   return __hge(a, b);
 }
 
-#else  // Emulate support for half floats
+#endif
+
+#if !defined(__EIGEN_NATIVE_FP16) || defined(__clang__) // Emulate support for half floats
+
+#if defined(__clang__) && defined(__CUDA__)
+// We need to provide emulated *host-side* FP16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if defined(EIGEN_HAS_CUDA_FP16)
+#define EIGEN_DEVICE_FUNC __host__
+#else // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
 
 // Definitions for CPUs and older HIP+CUDA, mostly working through conversion
 // to/from fp32.
@@ -363,6 +380,9 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const hal
   return float(a) >= float(b);
 }
 
+#if defined(__clang__) && defined(__CUDA__)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
 #endif  // Emulate support for half floats
 
 // Division by an index. Do it in full float precision to avoid accuracy
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index f82fa09c1..3da8c1f3e 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -16,7 +16,8 @@ namespace internal {
 
 // Most of the following operations require arch >= 3.0
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \
+  (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
 
 template<> struct is_arithmetic<half2> { enum { value = true }; };
 
@@ -45,7 +46,14 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+#if !defined(EIGEN_CUDA_ARCH)
+  half2 r;
+  r.x = from;
+  r.y = from;
+  return r;
+#else
   return __half2half2(from);
+#endif
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index ce3633388..564b4d09f 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -395,11 +395,8 @@
   #define EIGEN_CUDA_ARCH __CUDA_ARCH__
 #endif
 
-// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
-  #define EIGEN_CUDACC_VER  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-  #define EIGEN_CUDACC_VER __CUDACC_VER__
+#if defined(CUDA_VERSION)
+  #define EIGEN_CUDACC_VER (CUDA_VERSION*10)
 #else
   #define EIGEN_CUDACC_VER 0
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 0718ba2a1..7ee4a6087 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -674,10 +674,6 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there
       // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
       LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
                          1, 1, 0, device, reducer, self, num_preserved_vals, output);
     }
-- 
cgit v1.2.3