From c2a102345f627e4cd1908dad03e6ef0cbb2170c0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 3 Jun 2016 17:27:08 -0700
Subject: Improved the performance of full reductions.

AFTER:
BM_fullReduction/10        4541       4543     154017  21.0M items/s
BM_fullReduction/64        5191       5193     100000 752.5M items/s
BM_fullReduction/512       9588       9588      71361  25.5G items/s
BM_fullReduction/4k      244314     244281       2863  64.0G items/s
BM_fullReduction/5k      359382     359363       1946  64.8G items/s

BEFORE:
BM_fullReduction/10        9085       9087      74395  10.5M items/s
BM_fullReduction/64        9478       9478      72014 412.1M items/s
BM_fullReduction/512      14643      14646      46902  16.7G items/s
BM_fullReduction/4k      260338     260384       2678  60.0G items/s
BM_fullReduction/5k      385076     385178       1818  60.5G items/s
---
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h   | 52 +++++++++++++++++-----
 1 file changed, 41 insertions(+), 11 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 45087a9a4..0d1a098b7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -112,17 +112,40 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe
   }
 }
 
+
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output) {
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+  // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    *output = reducer.initialize();
-    __syncthreads();
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
   }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExch(output, reducer.initialize());
+        unsigned int old = atomicExch(semaphore, 2u);
+        assert(old == 1u);
+      }
+      else {
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val = atomicCAS(semaphore, 2u, 2u);
+        while (val < 2u) {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
 
   typename Self::CoeffReturnType accum = reducer.initialize();
   Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
@@ -141,6 +164,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
   if ((threadIdx.x & (warpSize - 1)) == 0) {
     atomicReduce(output, accum, reducer);
   }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
+    assert(ticket >= 2u);
+    if (ticket == gridDim.x + 1) {
+      // We're the last block, reset the semaphore
+      *semaphore = 0;
+    }
+  }
 }
 
 
@@ -246,15 +278,13 @@ struct FullReductionLauncher {
     const int num_per_thread = 128;
     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
 
+    unsigned int* semaphore = NULL;
     if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<Scalar, Index>),
-                         1, 32, 0, device, reducer.initialize(), 1, output);
+      semaphore = device.semaphore();
     }
 
     LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
   }
 };
 
-- 
cgit v1.2.3
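
Editor's note (not part of the patch): the sketch below illustrates the inter-block handshake the new FullReductionKernel relies on, reduced to a standalone float-sum kernel with made-up names (SumWithSemaphore, in, n, out). It assumes the semaphore is a single unsigned int in device memory that starts at 0, which is what the launcher is expected to obtain from device.semaphore(): the first block to arrive initializes the output and publishes the value 2, later blocks spin until they observe at least 2, and the last block to finish resets the semaphore for the next launch.

#include <climits>

// Illustrative sketch only -- same semaphore protocol as the patch:
//   0  : untouched, output not initialized
//   1  : one block is initializing the output
//   2  : output initialized, reduction may proceed
//   >2 : 2 + number of blocks that have finished
__global__ void SumWithSemaphore(const float* in, int n, float* out,
                                 unsigned int* semaphore) {
  if (threadIdx.x == 0) {
    unsigned int block = atomicCAS(semaphore, 0u, 1u);
    if (block == 0) {
      *out = 0.0f;                     // first block to arrive initializes the output
      atomicExch(semaphore, 2u);       // publish the initialization
    } else {
      // spin until some block has published the initialized output
      while (atomicCAS(semaphore, 2u, 2u) < 2u) {
      }
    }
  }
  __syncthreads();

  // each thread accumulates a strided slice, then contributes atomically
  float acc = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    acc += in[i];
  }
  atomicAdd(out, acc);

  __syncthreads();                     // whole block done before taking a ticket
  if (threadIdx.x == 0) {
    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
    if (ticket == gridDim.x + 1) {
      *semaphore = 0;                  // last block resets for the next launch
    }
  }
}

A caller would launch this as SumWithSemaphore<<<num_blocks, block_size>>>(d_in, n, d_out, d_sem) with d_sem pointing to a zero-initialized unsigned int, mirroring the role device.semaphore() plays in the launcher above; the output can be read back after the usual kernel-completion synchronization.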