aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/arch/GPU
diff options
context:
space:
mode:
authorGravatar Rasmus Munk Larsen <rmlarsen@google.com>2019-05-15 13:32:15 -0700
committerGravatar Rasmus Munk Larsen <rmlarsen@google.com>2019-05-15 13:32:15 -0700
commitab0a30e4292f109326b444dabb1e9e0c1dc29881 (patch)
tree352aad7d121a118aeba018ca146ade1e84860eb5 /Eigen/src/Core/arch/GPU
parentc8d8d5c0fcfe31eb43005245e36627e104ad2e5f (diff)
Make Eigen build with cuda 10 and clang.
Diffstat (limited to 'Eigen/src/Core/arch/GPU')
-rw-r--r--Eigen/src/Core/arch/GPU/Half.h24
-rw-r--r--Eigen/src/Core/arch/GPU/PacketMathHalf.h10
2 files changed, 31 insertions, 3 deletions
diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h
index 6f1362f2b..7f1d7e5b8 100644
--- a/Eigen/src/Core/arch/GPU/Half.h
+++ b/Eigen/src/Core/arch/GPU/Half.h
@@ -239,13 +239,17 @@ namespace Eigen {
namespace half_impl {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+ (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) || \
+ (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
+#define EIGEN_HAS_NATIVE_FP16
+#endif
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
+#if defined(EIGEN_HAS_NATIVE_FP16)
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
return __hadd(::__half(a), ::__half(b));
@@ -306,7 +310,20 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
return __hge(a, b);
}
-#else // Emulate support for half floats
+#endif
+
+#if !defined(EIGEN_HAS_NATIVE_FP16) || defined(__clang__) // Emulate support for half floats
+
+#if defined(__clang__) && defined(__CUDA__)
+// We need to provide emulated *host-side* FP16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if defined(EIGEN_HAS_CUDA_FP16)
+#define EIGEN_DEVICE_FUNC __host__
+#else // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
// to/from fp32.
@@ -363,6 +380,9 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const hal
return float(a) >= float(b);
}
+#if defined(__clang__) && defined(__CUDA__)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
#endif // Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index f82fa09c1..3da8c1f3e 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -16,7 +16,8 @@ namespace internal {
// Most of the following operations require arch >= 3.0
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \
+ (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
template<> struct is_arithmetic<half2> { enum { value = true }; };
@@ -45,7 +46,14 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+#if !defined(EIGEN_CUDA_ARCH)
+ half2 r;
+ r.x = from;
+ r.y = from;
+ return r;
+#else
return __half2half2(from);
+#endif
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {