diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-05-09 17:09:54 -0700 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2016-05-09 17:09:54 -0700 |
commit | 4670d7d5ce2517b2e9201f1cf44ae62ef2284eb5 (patch) | |
tree | 15f6faeec806068d4a786146ab92d557156321db /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | |
parent | c3859a2b584730ed1cb155a6d9bc422592d8d26b (diff) |
Improved the performance of full reductions on GPU:
Before:
BM_fullReduction/10 200000 11751 8.51 MFlops/s
BM_fullReduction/80 5000 523385 12.23 MFlops/s
BM_fullReduction/640 50 36179326 11.32 MFlops/s
BM_fullReduction/4K 1 2173517195 11.50 MFlops/s
After:
BM_fullReduction/10 500000 5987 16.70 MFlops/s
BM_fullReduction/80 200000 10636 601.73 MFlops/s
BM_fullReduction/640 50000 58428 7010.31 MFlops/s
BM_fullReduction/4K 1000 2006106 12461.95 MFlops/s
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 170 |
1 files changed, 151 insertions, 19 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index fd2587dd5..9186dffe4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -67,6 +67,30 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) #endif } + +template <template <typename T> class R> +__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) { +#if __CUDA_ARCH__ >= 300 + unsigned int oldval = *reinterpret_cast<unsigned int*>(output); + unsigned int newval = oldval; + reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); + if (newval == oldval) { + return; + } + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + template <typename T> __device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) { #if __CUDA_ARCH__ >= 300 @@ -108,7 +132,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { - reducer.reduce(__shfl_down(accum, offset), &accum); + reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); } if ((threadIdx.x & (warpSize - 1)) == 0) { @@ -117,28 +141,78 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num } -template <typename Self, typename Op, bool Vectorizable> -struct FullReducer<Self, Op, GpuDevice, Vectorizable> { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats. - static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; - - template <typename OutputType> - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) { - assert(false && "Should only be called on floats"); +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename Self, + typename Reducer, typename Index> +__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) { + eigen_assert(threadIdx.x == 1); + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); } +} - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { - typedef typename Self::Index Index; +template <int BlockSize, int NumPerThread, typename Self, + typename Reducer, typename Index> +static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, + half* output, half2* scratch) { + eigen_assert(NumPerThread % 2 == 0); - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - // Don't crash when we're called with an input tensor of size 0. - if (num_coeffs == 0) { - return; + const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x; + + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + if (gridDim.x == 1 && first_index == 0) { + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); } + } + + half2 accum = reducer.template initializePacket<half2>(); + Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2); + for (Index i = 0; i < max_iter; i += BlockSize) { + const Index index = first_index + 2*i; + eigen_assert(index + 1 < num_coeffs); + half2 val = input.m_impl.template packet<Unaligned>(index); + reducer.reducePacket(val, &accum); + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(scratch, accum, reducer); + } + + __syncthreads(); + + if (gridDim.x == 1 && first_index == 0) { + reducer.reduce(__low2half(*scratch), output); + reducer.reduce(__high2half(*scratch), output); + } +} + +template <typename Op> +__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) { + eigen_assert(threadIdx.x == 1); + reducer.reduce(__low2half(*scratch), output); + reducer.reduce(__high2half(*scratch), output); +} + +#endif + +template <typename Self, typename Op, bool is_half> +struct Launcher { + static void run(const Self& self, Op& reducer, const GpuDevice& device, typename Self::CoeffReturnType* output, typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType Scalar; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); @@ -155,6 +229,64 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { } }; +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename Self, typename Op> +struct Launcher<Self, Op, true> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + half2* scratch = static_cast<half2*>(device.scratchpad()); + + if (num_blocks > 1) { + // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), + 1, 1, 0, device, reducer, self, num_coeffs, scratch); + } + + LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch); + + if (num_blocks > 1) { + LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>), + 1, 1, 0, device, reducer, output, scratch); + } + } +}; +#endif + + +template <typename Self, typename Op, bool Vectorizable> +struct FullReducer<Self, Op, GpuDevice, Vectorizable> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats and half floats. + #ifdef EIGEN_HAS_CUDA_FP16 + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value); +#else + static const bool HasOptimizedImplementation = !Op::IsStateful && + internal::is_same<typename Self::CoeffReturnType, float>::value; +#endif + + template <typename OutputType> + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(HasOptimizedImplementation && "Should only be called on floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return; + } + + static const bool is_half = internal::is_same<typename Self::CoeffReturnType, half>::value; + Launcher<Self, Op, is_half>::run(self, reducer, device, output, num_coeffs); + } +}; + template <int NumPerThread, typename Self, typename Reducer, typename Index> @@ -323,7 +455,7 @@ struct OuterReducer<Self, Op, GpuDevice> { return true; } - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); |