From c2a102345f627e4cd1908dad03e6ef0cbb2170c0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 3 Jun 2016 17:27:08 -0700
Subject: Improved the performance of full reductions.

AFTER:
BM_fullReduction/10        4541      4543    154017   21.0M items/s
BM_fullReduction/64        5191      5193    100000  752.5M items/s
BM_fullReduction/512       9588      9588     71361   25.5G items/s
BM_fullReduction/4k      244314    244281      2863   64.0G items/s
BM_fullReduction/5k      359382    359363      1946   64.8G items/s

BEFORE:
BM_fullReduction/10        9085      9087     74395   10.5M items/s
BM_fullReduction/64        9478      9478     72014  412.1M items/s
BM_fullReduction/512      14643     14646     46902   16.7G items/s
BM_fullReduction/4k      260338    260384      2678   60.0G items/s
BM_fullReduction/5k      385076    385178      1818   60.5G items/s
---
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h    | 37 +++++++++++++--
 .../Eigen/CXX11/src/Tensor/TensorReduction.h     |  4 +-
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 52 +++++++++++++++++----
 3 files changed, 76 insertions(+), 17 deletions(-)

(limited to 'unsupported/Eigen/CXX11')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 6c12b2ed8..1468caa23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -12,6 +12,8 @@
 
 namespace Eigen {
 
+static const int kCudaScratchSize = 1024;
+
 // This defines an interface that GPUDevice can take to use
 // CUDA streams underneath.
 class StreamInterface {
@@ -27,6 +29,12 @@ class StreamInterface {
 
   // Return a scratchpad buffer of size 1k
   virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
 };
 
 static cudaDeviceProp* m_deviceProperties;
@@ -65,12 +73,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
  public:
   // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
     cudaGetDevice(&device_);
     initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     initializeDeviceProp();
   }
   // Use the specified stream. Note that it's the
@@ -78,7 +86,7 @@ class CudaStreamDevice : public StreamInterface {
   // the specified device. If no device is specified the code
   // assumes that the stream is associated to the current gpu device.
   CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device), scratch_(NULL) {
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     if (device < 0) {
       cudaGetDevice(&device_);
     } else {
@@ -123,15 +131,27 @@ class CudaStreamDevice : public StreamInterface {
 
   virtual void* scratchpad() const {
     if (scratch_ == NULL) {
-      scratch_ = allocate(1024);
+      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
     }
     return scratch_;
   }
 
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+    }
+    return semaphore_;
+  }
+
  private:
   const cudaStream_t* stream_;
   int device_;
   mutable void* scratch_;
+  mutable unsigned int* semaphore_;
 };
 
 struct GpuDevice {
@@ -174,6 +194,15 @@ struct GpuDevice {
 #endif
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+#ifndef __CUDA_ARCH__
+    return stream_->semaphore();
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return NULL;
+#endif
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
 #ifndef __CUDA_ARCH__
     cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 99a09c058..6ddf824e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -316,7 +316,7 @@ struct OuterReducer {
 
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 
 
 #ifdef EIGEN_HAS_CUDA_FP16
@@ -616,7 +616,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   template friend struct internal::FullReducerShard;
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 #ifdef EIGEN_HAS_CUDA_FP16
   template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
   template <typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 45087a9a4..0d1a098b7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -112,17 +112,40 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe
   }
 }
 
+
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output) {
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+  // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    *output = reducer.initialize();
-    __syncthreads();
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
   }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExch(output, reducer.initialize());
+        unsigned int old = atomicExch(semaphore, 2u);
+        assert(old == 1u);
+      }
+      else {
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val = atomicCAS(semaphore, 2u, 2u);
+        while (val < 2u) {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
 
   typename Self::CoeffReturnType accum = reducer.initialize();
   Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
@@ -141,6 +164,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
   if ((threadIdx.x & (warpSize - 1)) == 0) {
     atomicReduce(output, accum, reducer);
   }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
+    assert(ticket >= 2u);
+    if (ticket == gridDim.x + 1) {
+      // We're the last block, reset the semaphore
+      *semaphore = 0;
+    }
+  }
 }
 
 
@@ -246,15 +278,13 @@ struct FullReductionLauncher {
     const int num_per_thread = 128;
     const int num_blocks = divup(num_coeffs, block_size * num_per_thread);
 
+    unsigned int* semaphore = NULL;
     if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel),
-                         1, 32, 0, device, reducer.initialize(), 1, output);
+      semaphore = device.semaphore();
     }
 
     LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
   }
 };
 
--
cgit v1.2.3
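
Note (not part of the commit): the sketch below is a minimal, standalone CUDA program illustrating the same single-pass semaphore protocol that FullReductionKernel uses above. The first block to win the atomicCAS initializes the output, the other blocks spin until the semaphore reaches 2, every block then accumulates with an atomic, and the last block to finish resets the semaphore to 0 so the next launch starts from a clean state. All names here (fullSumKernel, the launch shape, the float/atomicAdd reducer) are hypothetical and chosen only for this example; they are not Eigen code.

#include <cassert>
#include <climits>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__global__ void fullSumKernel(const float* input, int n, float* output,
                              unsigned int* semaphore) {
  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;

  // Phase 1: make sure *output is initialized exactly once before any block
  // accumulates into it.
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = 0.0f;
    }
  } else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // This block won the race: initialize the output, then publish.
        *output = 0.0f;
        __threadfence();
        atomicExch(semaphore, 2u);
      } else {
        // Spin until the initializing block has published the output.
        while (atomicCAS(semaphore, 2u, 2u) < 2u) {
        }
      }
    }
  }
  __syncthreads();

  // Phase 2: each thread sums a strided slice, then folds its partial sum
  // into the global output with one atomicAdd.
  float accum = 0.0f;
  for (int i = first_index; i < n; i += blockDim.x * gridDim.x) {
    accum += input[i];
  }
  atomicAdd(output, accum);

  // Phase 3: the last block to finish resets the semaphore to 0, restoring
  // the invariant assumed at kernel start. atomicInc returns the old value,
  // so the last of the gridDim.x incrementing threads sees gridDim.x + 1.
  if (gridDim.x > 1 && threadIdx.x == 0) {
    __threadfence();
    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
    if (ticket == gridDim.x + 1) {
      *semaphore = 0;
    }
  }
}

int main() {
  const int n = 1 << 20;
  std::vector<float> host(n, 1.0f);
  float *in = NULL, *out = NULL;
  unsigned int* sem = NULL;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, sizeof(float));
  cudaMalloc(&sem, sizeof(unsigned int));
  cudaMemcpy(in, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  // The semaphore must start at 0, mirroring the one-time cudaMemsetAsync in
  // CudaStreamDevice::semaphore(); the kernel itself restores the 0 afterwards.
  cudaMemset(sem, 0, sizeof(unsigned int));

  fullSumKernel<<<32, 256>>>(in, n, out, sem);

  float result = 0.0f;
  cudaMemcpy(&result, out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f (expected %d)\n", result, n);

  cudaFree(in);
  cudaFree(out);
  cudaFree(sem);
  return 0;
}

The point of this arrangement, as in the patch, is that the output initialization happens inside the reduction kernel itself: the separate ReductionInitKernel launch that the old code issued when num_blocks > 1 is no longer needed, which is where the measured speedup on small and medium inputs comes from.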