From b733b8b680885c0fcdfddea5423171468609b5a6 Mon Sep 17 00:00:00 2001 From: Sami Kama Date: Tue, 10 Mar 2020 20:28:43 +0000 Subject: remove duplicate pset1 for half and add some comments about why we need expose pmul/add/div/min/max on host --- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 8 +- .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 290 +++++++++++++++------ 3 files changed, 219 insertions(+), 87 deletions(-) (limited to 'unsupported') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index 6afc98877..a3a750f21 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -53,10 +53,12 @@ struct PacketType : internal::packet_traits { // For CUDA packet types when using a GpuDevice #if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) -template <> + +typedef ulonglong2 Packet4h2; +template<> struct PacketType { - typedef half2 type; - static const int size = 2; + typedef Packet4h2 type; + static const int size = 8; enum { HasAdd = 1, HasSub = 1, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 5ca694062..8332a9ae0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -420,9 +420,9 @@ __global__ void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType* #if defined(EIGEN_HAS_GPU_FP16) template -__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, half2*); +__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); template -__global__ void FullReductionKernelHalfFloat(R, const S, I_, half*, half2*); +__global__ void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); template __global__ void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); @@ -863,8 +863,8 @@ struct TensorReductionEvaluatorBase KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); #if defined(EIGEN_HAS_GPU_FP16) - template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, half2*); - template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, half2*); + template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); + template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index 095bb54cc..9d3305cfd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -98,7 +98,17 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer } } } -#endif // EIGEN_HAS_GPU_FP16 +// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations +template