17 files changed, 175 insertions, 63 deletions
diff --git a/Eigen/Core b/Eigen/Core
index f6fe4b0ec..c66359b79 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -22,6 +22,7 @@
   #define EIGEN_CUDA_ARCH __CUDA_ARCH__
 #endif
 
+// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
 #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
 #define EIGEN_CUDACC_VER  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
 #elif defined(__CUDACC_VER__)
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h
index 8cedd65ad..1c557767a 100644
--- a/Eigen/src/Core/arch/CUDA/Half.h
+++ b/Eigen/src/Core/arch/CUDA/Half.h
@@ -50,38 +50,45 @@ struct half;
 namespace half_impl {
 
 #if !defined(EIGEN_HAS_CUDA_FP16)
-
-// Make our own __half definition that is similar to CUDA's.
-struct __half {
-  EIGEN_DEVICE_FUNC __half() : x(0) {}
-  explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+  EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
+  explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
   unsigned short x;
 };
-
+#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+typedef __half __half_raw;
 #endif
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
 
-struct half_base : public __half {
+struct half_base : public __half_raw {
   EIGEN_DEVICE_FUNC half_base() {}
-  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
-  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
+  EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+  EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+#endif
 };
 
 } // namespace half_impl
 
 // Class definition.
 struct half : public half_impl::half_base {
-  #if !defined(EIGEN_HAS_CUDA_FP16)
-    typedef half_impl::__half __half;
+  #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
+    typedef half_impl::__half_raw __half_raw;
   #endif
 
   EIGEN_DEVICE_FUNC half() {}
 
-  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+  EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
   EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
+  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
+#endif
 
   explicit EIGEN_DEVICE_FUNC half(bool b)
       : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@@ -269,8 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
 // these in hardware. If we need more performance on older/other CPUs, they are
 // also possible to vectorize directly.
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
-  __half h;
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
+  __half_raw h;
   h.x = x;
   return h;
 }
@@ -280,12 +287,13 @@ union FP32 {
   float f;
 };
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
-  return __float2half(ff);
+  __half tmp_ff = __float2half(ff);
+  return *(__half_raw*)&tmp_ff;
 
 #elif defined(EIGEN_HAS_FP16_C)
-  __half h;
+  __half_raw h;
   h.x = _cvtss_sh(ff, 0);
   return h;
 
@@ -296,7 +304,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
   const FP32 f16max = { (127 + 16) << 23 };
   const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
   unsigned int sign_mask = 0x80000000u;
-  __half o;
+  __half_raw o;
   o.x = static_cast<unsigned short>(0x0u);
 
   unsigned int sign = f.u & sign_mask;
@@ -335,7 +343,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
 #endif
 }
 
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
 #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
   return __half2float(h);
 
@@ -612,11 +620,15 @@ struct hash<Eigen::half> {
 // Add the missing shfl_xor intrinsic
 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+  #if EIGEN_CUDACC_VER < 90000
   return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
+  #else
+  return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
+  #endif
 }
 #endif
 
-// ldg() has an overload for __half, but we also need one for Eigen::half.
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
 #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index ba6a7f920..ce48e4b31 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -100,7 +100,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2&
 
 template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
   half2 result;
-  result.x = a.x & 0x7FFF7FFF;
+  unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
+  *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
   return result;
 }
 
diff --git a/test/half_float.cpp b/test/half_float.cpp
index 7b6208873..7734f82cc 100644
--- a/test/half_float.cpp
+++ b/test/half_float.cpp
@@ -20,7 +20,7 @@ using Eigen::half;
 
 void test_conversion()
 {
-  using Eigen::half_impl::__half;
+  using Eigen::half_impl::__half_raw;
 
   // Conversion from float.
   VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00);
@@ -37,9 +37,9 @@ void test_conversion()
   VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002);
 
   // Verify round-to-nearest-even behavior.
-  float val1 = float(half(__half(0x3c00)));
-  float val2 = float(half(__half(0x3c01)));
-  float val3 = float(half(__half(0x3c02)));
+  float val1 = float(half(__half_raw(0x3c00)));
+  float val2 = float(half(__half_raw(0x3c01)));
+  float val3 = float(half(__half_raw(0x3c02)));
   VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00);
   VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02);
 
@@ -55,21 +55,21 @@ void test_conversion()
   VERIFY_IS_EQUAL(half(true).x, 0x3c00);
 
   // Conversion to float.
-  VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f);
-  VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f);
+  VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f);
+  VERIFY_IS_EQUAL(float(half(__half_raw(0x3c00))), 1.0f);
 
   // Denormals.
-  VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f);
-  VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f);
-  VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f);
+  VERIFY_IS_APPROX(float(half(__half_raw(0x8001))), -5.96046e-08f);
+  VERIFY_IS_APPROX(float(half(__half_raw(0x0001))), 5.96046e-08f);
+  VERIFY_IS_APPROX(float(half(__half_raw(0x0002))), 1.19209e-07f);
 
   // NaNs and infinities.
   VERIFY(!(numext::isinf)(float(half(65504.0f))));  // Largest finite number.
   VERIFY(!(numext::isnan)(float(half(0.0f))));
-  VERIFY((numext::isinf)(float(half(__half(0xfc00)))));
-  VERIFY((numext::isnan)(float(half(__half(0xfc01)))));
-  VERIFY((numext::isinf)(float(half(__half(0x7c00)))));
-  VERIFY((numext::isnan)(float(half(__half(0x7c01)))));
+  VERIFY((numext::isinf)(float(half(__half_raw(0xfc00)))));
+  VERIFY((numext::isnan)(float(half(__half_raw(0xfc01)))));
+  VERIFY((numext::isinf)(float(half(__half_raw(0x7c00)))));
+  VERIFY((numext::isnan)(float(half(__half_raw(0x7c01)))));
 
 #if !EIGEN_COMP_MSVC
   // Visual Studio errors out on divisions by 0
@@ -79,12 +79,12 @@ void test_conversion()
 #endif
 
   // Exactly same checks as above, just directly on the half representation.
-  VERIFY(!(numext::isinf)(half(__half(0x7bff))));
-  VERIFY(!(numext::isnan)(half(__half(0x0000))));
-  VERIFY((numext::isinf)(half(__half(0xfc00))));
-  VERIFY((numext::isnan)(half(__half(0xfc01))));
-  VERIFY((numext::isinf)(half(__half(0x7c00))));
-  VERIFY((numext::isnan)(half(__half(0x7c01))));
+  VERIFY(!(numext::isinf)(half(__half_raw(0x7bff))));
+  VERIFY(!(numext::isnan)(half(__half_raw(0x0000))));
+  VERIFY((numext::isinf)(half(__half_raw(0xfc00))));
+  VERIFY((numext::isnan)(half(__half_raw(0xfc01))));
+  VERIFY((numext::isinf)(half(__half_raw(0x7c00))));
+  VERIFY((numext::isnan)(half(__half_raw(0x7c01))));
 
 #if !EIGEN_COMP_MSVC
   // Visual Studio errors out on divisions by 0
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 428b18499..903bc51cc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   // the sum across all big k blocks of the product of little k block of index (x, y)
   // with block of index (y, z). To compute the final output, we need to reduce
   // the 8 threads over y by summation.
+#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
 
 #define reduceRow(i, mask)                      \
   shuffleInc(i, 0, mask);                       \
@@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
       x1 = rhs_pf0.x;
       x2 = rhs_pf0.z;
     }
+    #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
     x1 = __shfl_xor(x1, 4);
     x2 = __shfl_xor(x2, 4);
+    #else
+    x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+    x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+    #endif
     if((threadIdx.x%8) < 4) {
       rhs_pf0.y = x1;
       rhs_pf0.w = x2;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 974eb7deb..ebcbd6f41 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
   else {
     assert(0 && "Wordsize not supported");
   }
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
   assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
 }
 
 // We extend atomicExch to support extra data types
@@ -104,9 +104,9 @@ template <>
 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
 #if EIGEN_CUDA_ARCH >= 300
   atomicAdd(output, accum);
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
   assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
 }
 
 
@@ -168,7 +168,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
 
 #pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
     reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+  #else
+    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+  #endif
   }
 
   if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -179,9 +183,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
     // Let the last block reset the semaphore
     atomicInc(semaphore, gridDim.x + 1);
   }
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
   assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
 }
 
 
@@ -223,12 +227,14 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
 
   // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    if (num_coeffs % 2 != 0) {
-      half last = input.m_impl.coeff(num_coeffs-1);
-      *scratch = __halves2half2(last, reducer.initialize());
-    } else {
-      *scratch = reducer.template initializePacket<half2>();
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      if (num_coeffs % 2 != 0) {
+        half last = input.m_impl.coeff(num_coeffs-1);
+        *scratch = __halves2half2(last, reducer.initialize());
+      } else {
+        *scratch = reducer.template initializePacket<half2>();
+      }
     }
     __syncthreads();
   }
@@ -244,19 +250,25 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
 
 #pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
+  #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
     reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+  #else
+    int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
+    reducer.reducePacket(*(half2*)(&temp), &accum);
+  #endif
   }
 
   if ((threadIdx.x & (warpSize - 1)) == 0) {
     atomicReduce(scratch, accum, reducer);
   }
 
-  __syncthreads();
-
-  if (gridDim.x == 1 && first_index == 0) {
-    half tmp = __low2half(*scratch);
-    reducer.reduce(__high2half(*scratch), &tmp);
-    *output = tmp;
+  if (gridDim.x == 1) {
+    __syncthreads();
+    if (first_index == 0) {
+      half tmp = __low2half(*scratch);
+      reducer.reduce(__high2half(*scratch), &tmp);
+      *output = tmp;
+    }
   }
 }
 
@@ -425,7 +437,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
 
 #pragma unroll
       for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
         reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+      #else
+        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+      #endif
       }
 
       if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -433,9 +449,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
       }
     }
   }
-#else // __CUDA_ARCH__ >= 300
+#else // EIGEN_CUDA_ARCH >= 300
   assert(0 && "Shouldn't be called on unsupported device");
-#endif // __CUDA_ARCH__ >= 300
+#endif // EIGEN_CUDA_ARCH >= 300
 }
 
 #ifdef EIGEN_HAS_CUDA_FP16
@@ -515,8 +531,15 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 
 #pragma unroll
       for (int offset = warpSize/2; offset > 0; offset /= 2) {
+      #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
         reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
         reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+      #else
+        int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize);
+        int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize);
+        reducer.reducePacket(*(half2*)(&temp1), &reduced_val1);
+        reducer.reducePacket(*(half2*)(&temp2), &reduced_val2);
+      #endif
       }
 
       half val1 =  __low2half(reduced_val1);
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 3d73d491a..0e8b8125d 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -15,6 +15,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 template <int Layout>
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
index 816e03220..dabf9e45f 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -16,6 +16,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 void test_cuda_conversion() {
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu
index a52350f85..d25e1bee1 100644
--- a/unsupported/test/cxx11_tensor_complex_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -14,6 +14,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 void test_cuda_nullary() {
diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
index aac780905..4f0f621b4 100644
--- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
+++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -14,6 +14,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 template<typename T>
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index e821ccf0c..c68287e34 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -17,6 +17,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;
 
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 9584a539f..d9059a2dc 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -15,6 +15,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 void test_cuda_nullary() {
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index cbb43e210..d5bfeeb39 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -16,6 +16,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 using Eigen::RowMajor;
 
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index b3aab0b9d..c9f3ae1ae 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -16,6 +16,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 
 template<typename>
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
index fa1a46732..9d08605fc 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -16,6 +16,12 @@
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 
 void test_cuda_random_uniform()
 {
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index ec0669704..d6ce04f1c 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -15,6 +15,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 
 template<typename Type, int DataLayout>
 static void test_full_reductions() {
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
index de1c0ac95..e99724b91 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -16,6 +16,12 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+// The EIGEN_CUDACC_VER macro is provided by
+// unsupported/Eigen/CXX11/Tensor included above
+#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500
+#include <cuda_fp16.h>
+#endif
+
 using Eigen::Tensor;
 typedef Tensor<float, 1>::DimensionPair DimPair;