1 files changed, 102 insertions, 29 deletions
diff --git a/Eigen/src/Core/arch/GPU/Half.h b/Eigen/src/Core/arch/GPU/Half.h
index c10550050..ab9d27591 100644
--- a/Eigen/src/Core/arch/GPU/Half.h
+++ b/Eigen/src/Core/arch/GPU/Half.h
@@ -26,15 +26,15 @@
 
 
 // Standard 16-bit float type, mostly useful for GPUs. Defines a new
-// type Eigen::half (inheriting from CUDA's __half struct) with
+// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
 // operator overloads such that it behaves basically as an arithmetic
 // type. It will be quite slow on CPUs (so it is recommended to stay
 // in fp32 for CPUs, except for simple parameter conversions, I/O
 // to disk and the likes), but fast on GPUs.
 
 
-#ifndef EIGEN_HALF_CUDA_H
-#define EIGEN_HALF_CUDA_H
+#ifndef EIGEN_HALF_GPU_H
+#define EIGEN_HALF_GPU_H
 
 #if __cplusplus > 199711L
 #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
@@ -49,16 +49,41 @@ struct half;
 
 namespace half_impl {
 
-#if !defined(EIGEN_HAS_CUDA_FP16)
+#if !defined(EIGEN_HAS_GPU_FP16)
 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
   EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
   explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
   unsigned short x;
 };
-#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+#elif defined(EIGEN_HAS_HIP_FP16)
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+// Make a __half_raw definition that is
+// ++ compatible with that of Eigen and
+// ++ add a implcit conversion to the native __half of the old HIP implementation.
+//
+// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
+//
+// In the old HIP implementation,
+//   ++ __half is a typedef of __fp16
+//   ++ the "__h*" routines take "__half" arguments
+// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make 
+// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine
+struct __half_raw {
+  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
+  union {
+    unsigned short x;
+    __half data;
+  };
+  operator __half(void) const { return data; }
+};
+ #endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
-typedef __half __half_raw;
+ typedef __half __half_raw;
+ #endif
 #endif
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
@@ -69,8 +94,19 @@ struct half_base : public __half_raw {
   EIGEN_DEVICE_FUNC half_base() {}
   EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
   EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
+  #if defined(EIGEN_HAS_OLD_HIP_FP16)
+  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
+  #else
+  EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
+  #endif
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+  #if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
   EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+  #endif
+ #endif    
 #endif
 };
 
@@ -78,17 +114,38 @@ struct half_base : public __half_raw {
 
 // Class definition.
 struct half : public half_impl::half_base {
-  #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
-    typedef half_impl::__half_raw __half_raw;
-  #endif
+
+  // Writing this out as separate #if-else blocks to make the code easier to follow
+  // The same applies to most #if-else blocks in this file
+#if !defined(EIGEN_HAS_GPU_FP16)
+  typedef half_impl::__half_raw __half_raw;
+#elif defined(EIGEN_HAS_HIP_FP16)
+ #if defined(EIGEN_HAS_OLD_HIP_FP16)
+  typedef half_impl::__half_raw __half_raw;
+ #endif
+#elif defined(EIGEN_HAS_CUDA_FP16)
+  // Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
+  // So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
+ #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000  
+  typedef half_impl::__half_raw __half_raw;
+ #endif
+#endif
 
   EIGEN_DEVICE_FUNC half() {}
 
   EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+  
+#if defined(EIGEN_HAS_GPU_FP16)
+ #if defined(EIGEN_HAS_HIP_FP16)
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+ #elif defined(EIGEN_HAS_CUDA_FP16)
+  #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
   EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+  #endif
+ #endif
 #endif
+  
 
   explicit EIGEN_DEVICE_FUNC half(bool b)
       : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@@ -201,7 +258,8 @@ namespace Eigen {
 
 namespace half_impl {
 
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
 
 // Intrinsics for native fp16 support. Note that on current hardware,
 // these are no faster than fp32 arithmetic (you need to use the half2
@@ -262,7 +320,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
 
 #else  // Emulate support for half floats
 
-// Definitions for CPUs and older CUDA, mostly working through conversion
+// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
 // to/from fp32.
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
@@ -342,7 +400,8 @@ union FP32 {
 };
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   __half tmp_ff = __float2half(ff);
   return *(__half_raw*)&tmp_ff;
 
@@ -398,7 +457,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __half2float(h);
 
 #elif defined(EIGEN_HAS_FP16_C)
@@ -432,7 +492,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
   return (a.x & 0x7fff) == 0x7c00;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __hisnan(a);
 #else
   return (a.x & 0x7fff) > 0x7c00;
@@ -448,7 +509,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
   return result;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hexp(a));
 #else
    return half(::expf(float(a)));
@@ -458,7 +520,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
   return half(numext::expm1(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return half(::hlog(a));
 #else
   return half(::logf(float(a)));
@@ -471,7 +534,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
   return half(::log10f(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hsqrt(a));
 #else
     return half(::sqrtf(float(a)));
@@ -493,14 +557,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
   return half(::tanhf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hfloor(a));
 #else
   return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
+#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hceil(a));
 #else
   return half(::ceilf(float(a)));
@@ -508,7 +574,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __hlt(b, a) ? b : a;
 #else
   const float f1 = static_cast<float>(a);
@@ -517,7 +584,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
   return __hlt(a, b) ? b : a;
 #else
   const float f1 = static_cast<float>(a);
@@ -595,7 +663,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
   return Eigen::half(::expf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
+#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
   return Eigen::half(::hlog(a));
 #else
   return Eigen::half(::logf(float(a)));
@@ -629,9 +698,12 @@ struct hash<Eigen::half> {
 
 
 // Add the missing shfl_xor intrinsic
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
+
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
-  #if EIGEN_CUDACC_VER < 90000
+  #if (EIGEN_CUDACC_VER < 90000) || \
+    defined(EIGEN_HAS_HIP_FP16)
   return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
   #else
   return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
@@ -640,7 +712,8 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
 #endif
 
 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
+  defined(EIGEN_HIP_DEVICE_COMPILE)
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(
       __ldg(reinterpret_cast<const unsigned short*>(ptr)));
@@ -648,7 +721,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
 #endif
 
 
-#if defined(EIGEN_CUDA_ARCH)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 namespace Eigen {
 namespace numext {
 
@@ -674,4 +747,4 @@ bool (isfinite)(const Eigen::half& h) {
 }  // namespace numext
 #endif
 
-#endif // EIGEN_HALF_CUDA_H
+#endif // EIGEN_HALF_GPU_H