From 4ede059de17f54b5c29b27e86486016caef0b795 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 10 May 2016 17:04:01 -0700
Subject: Properly gate the use of half2.

---
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h     | 4 ++++
 unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 97f4b34b3..71061293b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -323,10 +323,12 @@
 template <int B, int N, typename S, typename R, typename I>
 __global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
 
+#ifdef EIGEN_HAS_CUDA_FP16
 template <typename S, typename R, typename I>
 __global__ void ReductionInitKernelHalfFloat(R, const S, I, half2*);
 template <int B, int N, typename S, typename R, typename I>
 __global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+#endif
 
 template <int NPT, typename S, typename R, typename I>
 __global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
@@ -624,8 +626,10 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
   template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+#ifdef EIGEN_HAS_CUDA_FP16
   template <typename S, typename R, typename I> friend void internal::ReductionInitKernelHalfFloat(R, const S, I, half2*);
   template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+#endif
   template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
   template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index afa1a2697..6db9e63c8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -68,6 +68,7 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 }
 
+#ifdef EIGEN_HAS_CUDA_FP16
 template
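
Editorial note on the pattern: the patch hides every declaration that mentions half2 behind EIGEN_HAS_CUDA_FP16, which Eigen only defines when the CUDA toolchain and target architecture provide native fp16 support; on builds without that support the forward declarations would not compile. Below is a minimal standalone sketch of the same gating idiom under those assumptions. The kernel names and the float fallback are hypothetical illustrations, not Eigen code.

// toy_half2_gate.cu : hypothetical sketch of gating half2 usage, not part of Eigen.
#include <cuda_runtime.h>

#ifdef EIGEN_HAS_CUDA_FP16
#include <cuda_fp16.h>

// half2 packs two fp16 values; each thread accumulates a pair per iteration.
__global__ void SumHalf2(const half2* in, int n2, float* out) {
  float acc = 0.f;
  for (int i = threadIdx.x; i < n2; i += blockDim.x) {
    float2 v = __half22float2(in[i]);  // unpack the fp16 pair to float
    acc += v.x + v.y;
  }
  atomicAdd(out, acc);
}
#else

// Fallback for toolchains/architectures without native fp16: plain float input.
__global__ void SumFloat(const float* in, int n, float* out) {
  float acc = 0.f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    acc += in[i];
  }
  atomicAdd(out, acc);
}
#endif

Compiling with the macro defined selects the half2 path; otherwise only the float kernel is built. The #ifdef/#endif pairs added in the patch do the same for the half-precision reduction kernels and their friend declarations.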