about | summary | refs | log | tree | commit | diff | homepage
path: root/Eigen/src/Core/arch/CUDA
diff options
context:
space:
mode:
author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2017-08-31 02:49:39 +0000
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2017-08-31 02:49:39 +0000
commit: a4089991eb6bdb9e8ebfef93d81ca7b5e67ea77d (patch)
tree: 49a9b6c0c4ec6d006debe862cf209a8f252cfe78 /Eigen/src/Core/arch/CUDA
parent: 304ef2957134be386e50592ad7120177c5f3a7c0 (diff)
Added support for CUDA 9.0.
Diffstat (limited to 'Eigen/src/Core/arch/CUDA')
-rw-r--r-- Eigen/src/Core/arch/CUDA/Half.h 58
-rw-r--r-- Eigen/src/Core/arch/CUDA/PacketMathHalf.h 3
2 files changed, 37 insertions, 24 deletions
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 8cedd65ad..1c557767a 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -50,38 +50,45 @@ struct half;
namespace half_impl {
#if !defined(EIGEN_HAS_CUDA_FP16)
-
-// Make our own __half definition that is similar to CUDA's.
-struct __half {
- EIGEN_DEVICE_FUNC __half() : x(0) {}
- explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+ EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+ explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
unsigned short x;
};
-
+#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+typedef __half __half_raw;
#endif
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
-struct half_base : public __half {
+struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {}
- EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
- EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+ EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+ EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+#endif
};
} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
- #if !defined(EIGEN_HAS_CUDA_FP16)
- typedef half_impl::__half __half;
+ #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+ typedef half_impl::__half_raw __half_raw;
#endif
EIGEN_DEVICE_FUNC half() {}
- EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+ EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+ EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@@ -269,8 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
- __half h;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
+ __half_raw h;
h.x = x;
return h;
}
@@ -280,12 +287,13 @@ union FP32 {
float f;
};
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
- return __float2half(ff);
+ __half tmp_ff = __float2half(ff);
+ return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C)
- __half h;
+ __half_raw h;
h.x = _cvtss_sh(ff, 0);
return h;
@@ -296,7 +304,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
const FP32 f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
- __half o;
+ __half_raw o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
@@ -335,7 +343,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
return __half2float(h);
@@ -612,11 +620,15 @@ struct hash<Eigen::half> {
// Add the missing shfl_xor intrinsic
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+ #if EIGEN_CUDACC_VER < 90000
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+ #else
+ return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
+ #endif
}
#endif
-// ldg() has an overload for __half, but we also need one for Eigen::half.
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index ba6a7f920..ce48e4b31 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -100,7 +100,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2&
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
half2 result;
- result.x = a.x & 0x7FFF7FFF;
+ unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
+ *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
return result;
}