From 4013b8fecacfb4235df0bd79e9c56f39ee2db077 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <benoit.steiner.goog@gmail.com>
Date: Tue, 10 May 2016 09:40:42 -0700
Subject: Simplified the reduction code a little.

---
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h   | 25 +++++++++++-----------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 9186dffe4..b18200166 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -91,8 +91,8 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer
 #endif
 }
 
-template <typename T>
-__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
 #if __CUDA_ARCH__ >= 300
   atomicAdd(output, accum);
 #else
@@ -208,9 +208,14 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
 #endif
 
 
-template <typename Self, typename Op, bool IsHalf>
-struct Launcher {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, typename Self::CoeffReturnType* output, typename Self::Index num_coeffs) {
+template <typename Self, typename Op>
+struct FullReductionLauncher {
+  template <typename OutputType>
+  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+    assert(false && "Should only be called on floats and half floats");
+  }
+
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
     typedef typename Self::CoeffReturnType Scalar;
     const int block_size = 256;
@@ -220,18 +225,15 @@ struct Launcher {
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there
       // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<Scalar, Index>),
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
                          1, 32, 0, device, reducer.initialize(), 1, output);
     }
 
     LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
                        num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
   }
-};
 
 #ifdef EIGEN_HAS_CUDA_FP16
-template <typename Self, typename Op>
-struct Launcher<Self, Op, true> {
   static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
 
@@ -255,8 +257,8 @@ struct Launcher {
                          1, 1, 0, device, reducer, output, scratch);
     }
   }
-};
 #endif
+};
 
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
@@ -282,8 +284,7 @@ struct FullReducer {
       return;
     }
 
-    static const bool is_half = internal::is_same<typename Self::CoeffReturnType, half>::value;
-    Launcher<Self, Op, is_half>::run(self, reducer, device, output, num_coeffs);
+    FullReductionLauncher<Self, Op>::run(self, reducer, device, output, num_coeffs);
   }
 };
 
-- 
cgit v1.2.3
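
For readers unfamiliar with the pattern the patch applies: the bool-parameterized Launcher is replaced by a single FullReductionLauncher whose templated catch-all run() asserts at runtime, while non-template overloads (float always, half under EIGEN_HAS_CUDA_FP16) perform the real kernel launches. Overload resolution prefers the concrete overloads, so only unsupported output types ever reach the assert. Below is a minimal standalone sketch of that dispatch, not Eigen code; the names launch_full_reduction and sum_kernel and the fixed launch geometry are hypothetical stand-ins, and the output pointer is assumed to be pre-allocated device memory.

    // Standalone sketch of the overload-dispatch pattern (hypothetical names).
    #include <cassert>
    #include <cuda_runtime.h>

    // Catch-all: any output type without a dedicated overload trips the
    // assert, mirroring FullReductionLauncher's templated run().
    template <typename OutputType>
    void launch_full_reduction(const float*, int, OutputType*) {
      assert(false && "Should only be called on floats");
    }

    // Each thread sums a strided slice of the input, then folds its partial
    // sum into the single output with atomicAdd -- the same primitive the
    // patched float atomicReduce specialization uses on sm_30 and above.
    __global__ void sum_kernel(const float* in, int n, float* out) {
      float acc = 0.f;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
           i += blockDim.x * gridDim.x) {
        acc += in[i];
      }
      atomicAdd(out, acc);
    }

    // Concrete overload for float: overload resolution prefers it to the
    // template, so supported types never reach the assert.
    void launch_full_reduction(const float* in, int n, float* out) {
      // Initialize the output before launching, since multiple blocks
      // accumulate into it atomically (the patch does the same with
      // ReductionInitKernel when num_blocks > 1).
      cudaMemset(out, 0, sizeof(float));
      sum_kernel<<<32, 256>>>(in, n, out);
    }

Calling launch_full_reduction with, say, a double* output instantiates the catch-all and fails the assert at runtime, which is exactly how the patched FullReductionLauncher rejects scalar types other than float and half.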