From 734a50dc601c51a7d1ed7b422ac5db374b1e5805 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 15 May 2019 13:32:15 -0700 Subject: Make Eigen build with cuda 10 and clang. --- Eigen/src/Core/arch/GPU/Half.h | 24 ++++++++++++++++++++-- Eigen/src/Core/arch/GPU/PacketMathHalf.h | 10 ++++++++- Eigen/src/Core/util/Macros.h | 7 ++----- .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 4 ---- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h index 6f1362f2b..0af1b14f5 100644 --- a/Eigen/src/Core/arch/GPU/Half.h +++ b/Eigen/src/Core/arch/GPU/Half.h @@ -239,13 +239,17 @@ namespace Eigen { namespace half_impl { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) + (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) || \ + (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__)) +#define __EIGEN_NATIVE_FP16 +#endif // Intrinsics for native fp16 support. Note that on current hardware, // these are no faster than fp32 arithmetic (you need to use the half2 // versions to get the ALU speed increased), but you do save the // conversion steps back and forth. +#if defined(__EIGEN_NATIVE_FP16) EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 return __hadd(::__half(a), ::__half(b)); @@ -306,7 +310,20 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { return __hge(a, b); } -#else // Emulate support for half floats +#endif + +#if !defined(__EIGEN_NATIVE_FP16) || defined(__clang__) // Emulate support for half floats + +#if defined(__clang__) && defined(__CUDA__) +// We need to provide emulated *host-side* FP16 operators for clang. +#pragma push_macro("EIGEN_DEVICE_FUNC") +#undef EIGEN_DEVICE_FUNC +#if defined(EIGEN_HAS_CUDA_FP16) +#define EIGEN_DEVICE_FUNC __host__ +#else // both host and device need emulated ops. +#define EIGEN_DEVICE_FUNC __host__ __device__ +#endif +#endif // Definitions for CPUs and older HIP+CUDA, mostly working through conversion // to/from fp32. @@ -363,6 +380,9 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const hal return float(a) >= float(b); } +#if defined(__clang__) && defined(__CUDA__) +#pragma pop_macro("EIGEN_DEVICE_FUNC") +#endif #endif // Emulate support for half floats // Division by an index. Do it in full float precision to avoid accuracy diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h index f82fa09c1..3da8c1f3e 100644 --- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h +++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h @@ -16,7 +16,8 @@ namespace internal { // Most of the following operations require arch >= 3.0 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \ + (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__)) template<> struct is_arithmetic { enum { value = true }; }; @@ -45,7 +46,14 @@ template<> struct packet_traits : default_packet_traits template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; }; template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { +#if !defined(EIGEN_CUDA_ARCH) + half2 r; + r.x = from; + r.y = from; + return r; +#else return __half2half2(from); +#endif } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index ce3633388..564b4d09f 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -395,11 +395,8 @@ #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif -// Starting with CUDA 9 the composite __CUDACC_VER__ is not available. -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) - #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) -#elif defined(__CUDACC_VER__) - #define EIGEN_CUDACC_VER __CUDACC_VER__ +#if defined(CUDA_VERSION) + #define EIGEN_CUDACC_VER (CUDA_VERSION*10) #else #define EIGEN_CUDACC_VER 0 #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index 0718ba2a1..7ee4a6087 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -674,10 +674,6 @@ struct InnerReductionLauncher { if (num_blocks > 1) { // We initialize the outputs outside the reduction kernel when we can't be sure that there // won't be a race conditions between multiple thread blocks. - const int dyn_blocks = divup(num_preserved_vals, 1024); - const int max_blocks = device.getNumGpuMultiProcessors() * - device.maxGpuThreadsPerMultiProcessor() / 1024; - const int num_blocks = numext::mini(max_blocks, dyn_blocks); LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat), 1, 1, 0, device, reducer, self, num_preserved_vals, output); } -- cgit v1.2.3