diff options
-rw-r--r-- | Eigen/Core | 1 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CUDA/Half.h | 58 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 3 | ||||
-rw-r--r-- | test/half_float.cpp | 38 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h | 9 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 63 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_argmax_cuda.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_cast_float16_cuda.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_complex_cuda.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_contract_cuda.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_cuda.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_device.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_of_float16_cuda.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_random_cuda.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_reduction_cuda.cu | 6 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_scan_cuda.cu | 6 |
17 files changed, 175 insertions, 63 deletions
diff --git a/Eigen/Core b/Eigen/Core index f6fe4b0ec..c66359b79 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -22,6 +22,7 @@ #define EIGEN_CUDA_ARCH __CUDA_ARCH__ #endif +// Starting with CUDA 9 the composite __CUDACC_VER__ is not available. #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) #define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) #elif defined(__CUDACC_VER__) diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 8cedd65ad..1c557767a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -50,38 +50,45 @@ struct half; namespace half_impl { #if !defined(EIGEN_HAS_CUDA_FP16) - -// Make our own __half definition that is similar to CUDA's. -struct __half { - EIGEN_DEVICE_FUNC __half() : x(0) {} - explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {} +// Make our own __half_raw definition that is similar to CUDA's. +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; - +#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw +typedef __half __half_raw; #endif -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); -struct half_base : public __half { +struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} +#endif }; } // namespace half_impl // Class definition. struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) - typedef half_impl::__half __half; + #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) + typedef half_impl::__half_raw __half_raw; #endif EIGEN_DEVICE_FUNC half() {} - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} +#endif explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -269,8 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { // these in hardware. If we need more performance on older/other CPUs, they are // also possible to vectorize directly. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { - __half h; +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { + __half_raw h; h.x = x; return h; } @@ -280,12 +287,13 @@ union FP32 { float f; }; -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 - return __float2half(ff); + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; #elif defined(EIGEN_HAS_FP16_C) - __half h; + __half_raw h; h.x = _cvtss_sh(ff, 0); return h; @@ -296,7 +304,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { const FP32 f16max = { (127 + 16) << 23 }; const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; - __half o; + __half_raw o; o.x = static_cast<unsigned short>(0x0u); unsigned int sign = f.u & sign_mask; @@ -335,7 +343,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #endif } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(h); @@ -612,11 +620,15 @@ struct hash<Eigen::half> { // Add the missing shfl_xor intrinsic #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + #if EIGEN_CUDACC_VER < 90000 return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); + #else + return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width)); + #endif } #endif -// ldg() has an overload for __half, but we also need one for Eigen::half. +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index ba6a7f920..ce48e4b31 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -100,7 +100,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) { half2 result; - result.x = a.x & 0x7FFF7FFF; + unsigned temp = *(reinterpret_cast<const unsigned*>(&(a))); + *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF; return result; } diff --git a/test/half_float.cpp b/test/half_float.cpp index 7b6208873..7734f82cc 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -20,7 +20,7 @@ using Eigen::half; void test_conversion() { - using Eigen::half_impl::__half; + using Eigen::half_impl::__half_raw; // Conversion from float. VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); @@ -37,9 +37,9 @@ void test_conversion() VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); // Verify round-to-nearest-even behavior. - float val1 = float(half(__half(0x3c00))); - float val2 = float(half(__half(0x3c01))); - float val3 = float(half(__half(0x3c02))); + float val1 = float(half(__half_raw(0x3c00))); + float val2 = float(half(__half_raw(0x3c01))); + float val3 = float(half(__half_raw(0x3c02))); VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00); VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02); @@ -55,21 +55,21 @@ void test_conversion() VERIFY_IS_EQUAL(half(true).x, 0x3c00); // Conversion to float. - VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f); - VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f); + VERIFY_IS_EQUAL(float(half(__half_raw(0x0000))), 0.0f); + VERIFY_IS_EQUAL(float(half(__half_raw(0x3c00))), 1.0f); // Denormals. - VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f); + VERIFY_IS_APPROX(float(half(__half_raw(0x8001))), -5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half_raw(0x0001))), 5.96046e-08f); + VERIFY_IS_APPROX(float(half(__half_raw(0x0002))), 1.19209e-07f); // NaNs and infinities. VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. VERIFY(!(numext::isnan)(float(half(0.0f)))); - VERIFY((numext::isinf)(float(half(__half(0xfc00))))); - VERIFY((numext::isnan)(float(half(__half(0xfc01))))); - VERIFY((numext::isinf)(float(half(__half(0x7c00))))); - VERIFY((numext::isnan)(float(half(__half(0x7c01))))); + VERIFY((numext::isinf)(float(half(__half_raw(0xfc00))))); + VERIFY((numext::isnan)(float(half(__half_raw(0xfc01))))); + VERIFY((numext::isinf)(float(half(__half_raw(0x7c00))))); + VERIFY((numext::isnan)(float(half(__half_raw(0x7c01))))); #if !EIGEN_COMP_MSVC // Visual Studio errors out on divisions by 0 @@ -79,12 +79,12 @@ void test_conversion() #endif // Exactly same checks as above, just directly on the half representation. - VERIFY(!(numext::isinf)(half(__half(0x7bff)))); - VERIFY(!(numext::isnan)(half(__half(0x0000)))); - VERIFY((numext::isinf)(half(__half(0xfc00)))); - VERIFY((numext::isnan)(half(__half(0xfc01)))); - VERIFY((numext::isinf)(half(__half(0x7c00)))); - VERIFY((numext::isnan)(half(__half(0x7c01)))); + VERIFY(!(numext::isinf)(half(__half_raw(0x7bff)))); + VERIFY(!(numext::isnan)(half(__half_raw(0x0000)))); + VERIFY((numext::isinf)(half(__half_raw(0xfc00)))); + VERIFY((numext::isnan)(half(__half_raw(0xfc01)))); + VERIFY((numext::isinf)(half(__half_raw(0x7c00)))); + VERIFY((numext::isnan)(half(__half_raw(0x7c01)))); #if !EIGEN_COMP_MSVC // Visual Studio errors out on divisions by 0 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 428b18499..903bc51cc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -388,7 +388,11 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, // the sum across all big k blocks of the product of little k block of index (x, y) // with block of index (y, z). To compute the final output, we need to reduce // the 8 threads over y by summation. +#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) +#else +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) +#endif #define reduceRow(i, mask) \ shuffleInc(i, 0, mask); \ @@ -614,8 +618,13 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh x1 = rhs_pf0.x; x2 = rhs_pf0.z; } + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 x1 = __shfl_xor(x1, 4); x2 = __shfl_xor(x2, 4); + #else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); + #endif if((threadIdx.x%8) < 4) { rhs_pf0.y = x1; rhs_pf0.w = x2; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 974eb7deb..ebcbd6f41 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -62,9 +62,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) else { assert(0 && "Wordsize not supported"); } -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } // We extend atomicExch to support extra data types @@ -104,9 +104,9 @@ template <> __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { #if EIGEN_CUDA_ARCH >= 300 atomicAdd(output, accum); -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } @@ -168,7 +168,11 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { @@ -179,9 +183,9 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num // Let the last block reset the semaphore atomicInc(semaphore, gridDim.x + 1); } -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } @@ -223,12 +227,14 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x; // Initialize the output value if it wasn't initialized by the ReductionInitKernel - if (gridDim.x == 1 && first_index == 0) { - if (num_coeffs % 2 != 0) { - half last = input.m_impl.coeff(num_coeffs-1); - *scratch = __halves2half2(last, reducer.initialize()); - } else { - *scratch = reducer.template initializePacket<half2>(); + if (gridDim.x == 1) { + if (first_index == 0) { + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); + } } __syncthreads(); } @@ -244,19 +250,25 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); + #else + int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize); + reducer.reducePacket(*(half2*)(&temp), &accum); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { atomicReduce(scratch, accum, reducer); } - __syncthreads(); - - if (gridDim.x == 1 && first_index == 0) { - half tmp = __low2half(*scratch); - reducer.reduce(__high2half(*scratch), &tmp); - *output = tmp; + if (gridDim.x == 1) { + __syncthreads(); + if (first_index == 0) { + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; + } } } @@ -425,7 +437,11 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); + #endif } if ((threadIdx.x & (warpSize - 1)) == 0) { @@ -433,9 +449,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } } -#else // __CUDA_ARCH__ >= 300 +#else // EIGEN_CUDA_ARCH >= 300 assert(0 && "Shouldn't be called on unsupported device"); -#endif // __CUDA_ARCH__ >= 300 +#endif // EIGEN_CUDA_ARCH >= 300 } #ifdef EIGEN_HAS_CUDA_FP16 @@ -515,8 +531,15 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1); reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2); + #else + int temp1 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val1), (unsigned)offset, warpSize); + int temp2 = __shfl_down_sync(0xFFFFFFFF, *(int*)(&reduced_val2), (unsigned)offset, warpSize); + reducer.reducePacket(*(half2*)(&temp1), &reduced_val1); + reducer.reducePacket(*(half2*)(&temp2), &reduced_val2); + #endif } half val1 = __low2half(reduced_val1); diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu index 3d73d491a..0e8b8125d 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu @@ -15,6 +15,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; template <int Layout> diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu index 816e03220..dabf9e45f 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu @@ -16,6 +16,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; void test_cuda_conversion() { diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu index a52350f85..d25e1bee1 100644 --- a/unsupported/test/cxx11_tensor_complex_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cuda.cu @@ -14,6 +14,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; void test_cuda_nullary() { diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu index aac780905..4f0f621b4 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu @@ -14,6 +14,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; template<typename T> diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu index e821ccf0c..c68287e34 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_cuda.cu @@ -17,6 +17,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 9584a539f..d9059a2dc 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -15,6 +15,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; void test_cuda_nullary() { diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index cbb43e210..d5bfeeb39 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -16,6 +16,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; using Eigen::RowMajor; diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index b3aab0b9d..c9f3ae1ae 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -16,6 +16,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; template<typename> diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu index fa1a46732..9d08605fc 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cu +++ b/unsupported/test/cxx11_tensor_random_cuda.cu @@ -16,6 +16,12 @@ #include "main.h" #include <Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + void test_cuda_random_uniform() { diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index ec0669704..d6ce04f1c 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -15,6 +15,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + template<typename Type, int DataLayout> static void test_full_reductions() { diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu index de1c0ac95..e99724b91 100644 --- a/unsupported/test/cxx11_tensor_scan_cuda.cu +++ b/unsupported/test/cxx11_tensor_scan_cuda.cu @@ -16,6 +16,12 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +// The EIGEN_CUDACC_VER macro is provided by +// unsupported/Eigen/CXX11/Tensor included above +#if defined EIGEN_CUDACC_VER && EIGEN_CUDACC_VER >= 70500 +#include <cuda_fp16.h> +#endif + using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; |