From d27b0ad4c8b6637aad7ef47902742626785b6a76 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Wed, 1 Jun 2016 11:12:07 -0700
Subject: Added support for mean reductions on fp16

---
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h   | 40 +++++++++++++++++-----
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 7368768cf..f9ce24373 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -230,13 +230,15 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
 #endif
 
 
-template <typename Self, typename Op>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
 struct FullReductionLauncher {
-  template <typename OutputType>
   static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
     assert(false && "Should only be called on floats and half floats");
   }
+};
 
+template <typename Self, typename Op, bool PacketAccess>
+struct FullReductionLauncher<Self, Op, float, PacketAccess> {
   static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
     typedef typename Self::CoeffReturnType Scalar;
@@ -254,8 +256,18 @@ struct FullReductionLauncher {
     LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                        num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
   }
+};
 
 #ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
   static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
 
@@ -279,8 +291,8 @@ struct FullReductionLauncher {
                        1, 1, 0, device, reducer, output, scratch);
     }
   }
-#endif
 };
+#endif
 
 
 template <typename Self, typename Op, bool Vectorizable>
@@ -306,7 +318,7 @@ struct FullReducer {
       return;
     }
 
-    FullReductionLauncher<Self, Op>::run(self, reducer, device, output, num_coeffs);
+    FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
   }
 };
 
@@ -473,14 +485,16 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 #endif
 
 
-template <typename Self, typename Op>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
 struct InnerReductionLauncher {
-  template <typename OutputType>
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
     assert(false && "Should only be called to reduce floats and half floats on a gpu device");
     return true;
   }
+};
 
+template <typename Self, typename Op, bool PacketAccess>
+struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
   static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
@@ -509,8 +523,18 @@ struct InnerReductionLauncher {
 
     return false;
   }
+};
 
 #ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+    assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
   static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
@@ -543,8 +567,8 @@ struct InnerReductionLauncher {
 
     return false;
   }
-#endif
 };
+#endif
 
 
 template <typename Self, typename Op>
@@ -574,7 +598,7 @@ struct InnerReducer {
       return true;
     }
 
-    return InnerReductionLauncher<Self, Op>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+    return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
   }
 };
 
--
cgit v1.2.3
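
For context only, and not part of the patch: a minimal sketch of the kind of expression this change enables, namely a full mean reduction over a half-precision tensor evaluated on the GPU. The function name, buffer arguments, and sizes below are illustrative assumptions; the sketch presumes a CUDA build with EIGEN_HAS_CUDA_FP16 defined, device-resident buffers, and an already-constructed Eigen::GpuDevice.

// Illustrative sketch, not part of the commit: exercises the fp16 reduction
// path that this patch makes usable for mean reductions.
#include <unsupported/Eigen/CXX11/Tensor>

// 'in' and 'out' are assumed to point to device memory allocated by the caller.
void mean_reduce_fp16(Eigen::half* in, Eigen::half* out, int rows, int cols,
                      const Eigen::GpuDevice& device) {
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>> input(in, rows, cols);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>> result(out);

  // Full mean reduction: with this commit the expression dispatches to the
  // half-precision FullReductionLauncher specialization instead of asserting.
  result.device(device) = input.mean();
}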