From c2a102345f627e4cd1908dad03e6ef0cbb2170c0 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Fri, 3 Jun 2016 17:27:08 -0700
Subject: Improved the performance of full reductions.

AFTER:
BM_fullReduction/10        4541      4543    154017   21.0M items/s
BM_fullReduction/64        5191      5193    100000  752.5M items/s
BM_fullReduction/512       9588      9588     71361   25.5G items/s
BM_fullReduction/4k      244314    244281      2863   64.0G items/s
BM_fullReduction/5k      359382    359363      1946   64.8G items/s

BEFORE:
BM_fullReduction/10        9085      9087     74395   10.5M items/s
BM_fullReduction/64        9478      9478     72014  412.1M items/s
BM_fullReduction/512      14643     14646     46902   16.7G items/s
BM_fullReduction/4k      260338    260384      2678   60.0G items/s
BM_fullReduction/5k      385076    385178      1818   60.5G items/s
---
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h    | 37 +++++++++++++--
 .../Eigen/CXX11/src/Tensor/TensorReduction.h     |  4 +-
 .../Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 52 +++++++++++++++++----
 3 files changed, 76 insertions(+), 17 deletions(-)

(limited to 'unsupported/Eigen/CXX11')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 6c12b2ed8..1468caa23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -12,6 +12,8 @@
 
 namespace Eigen {
 
+static const int kCudaScratchSize = 1024;
+
 // This defines an interface that GPUDevice can take to use
 // CUDA streams underneath.
 class StreamInterface {
@@ -27,6 +29,12 @@ class StreamInterface {
 
   // Return a scratchpad buffer of size 1k
   virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
 };
 
 static cudaDeviceProp* m_deviceProperties;
@@ -65,12 +73,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
  public:
   // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
     cudaGetDevice(&device_);
     initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     initializeDeviceProp();
   }
   // Use the specified stream. Note that it's the
@@ -78,7 +86,7 @@ class CudaStreamDevice : public StreamInterface {
   // the specified device. If no device is specified the code
   // assumes that the stream is associated to the current gpu device.
   CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device), scratch_(NULL) {
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     if (device < 0) {
       cudaGetDevice(&device_);
     } else {
@@ -123,15 +131,27 @@ class CudaStreamDevice : public StreamInterface {
 
   virtual void* scratchpad() const {
     if (scratch_ == NULL) {
-      scratch_ = allocate(1024);
+      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
     }
     return scratch_;
   }
 
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+    }
+    return semaphore_;
+  }
+
  private:
   const cudaStream_t* stream_;
   int device_;
   mutable void* scratch_;
+  mutable unsigned int* semaphore_;
 };
 
 struct GpuDevice {
@@ -174,6 +194,15 @@ struct GpuDevice {
 #endif
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+#ifndef __CUDA_ARCH__
+    return stream_->semaphore();
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return NULL;
+#endif
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
 #ifndef __CUDA_ARCH__
     cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 99a09c058..6ddf824e9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -316,7 +316,7 @@ struct OuterReducer {
 
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <int B, int N, typename S, typename R, typename I>
-__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 
 
 #ifdef EIGEN_HAS_CUDA_FP16
@@ -616,7 +616,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
   template friend struct internal::FullReducerShard;
 #endif
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
-  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 #ifdef EIGEN_HAS_CUDA_FP16
   template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
   template <typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 45087a9a4..0d1a098b7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -112,17 +112,40 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe
   }
 }
 
+
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output) {
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+  // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    *output = reducer.initialize();
-    __syncthreads();
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
   }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExch(output, reducer.initialize());
+        unsigned int old = atomicExch(semaphore, 2u);
+        assert(old == 1u);
+      }
+      else {
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val = atomicCAS(semaphore, 2u, 2u);
+        while (val < 2u) {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
 
   typename Self::CoeffReturnType accum = reducer.initialize();
   Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
@@ -141,6 +164,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
   if ((threadIdx.x & (warpSize - 1)) == 0) {
     atomicReduce(output, accum, reducer);
   }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
+    assert(ticket >= 2u);
+    if (ticket == gridDim.x + 1) {
+      // We're the last block, reset the semaphore
+      *semaphore = 0;
+    }
+  }
 }
 
 
@@ -246,15 +278,13 @@ struct FullReductionLauncher {
     const int num_per_thread = 128;
     const int num_blocks = divup(num_coeffs, block_size * num_per_thread);
 
+    unsigned int* semaphore = NULL;
     if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel),
-                         1, 32, 0, device, reducer.initialize(), 1, output);
+      semaphore = device.semaphore();
     }
 
     LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
   }
 };
 
--
cgit v1.2.3
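
Note (not part of the commit): the sketch below is a minimal, standalone CUDA program illustrating the same single-pass semaphore protocol that FullReductionKernel uses above. The first block to win the atomicCAS initializes the output, the other blocks spin until the semaphore reaches 2, every block then accumulates with an atomic, and the last block to finish resets the semaphore to 0 so the next launch starts from a clean state. All names here (fullSumKernel, the launch shape, the float/atomicAdd reducer) are hypothetical and chosen only for this example; they are not Eigen code.

#include <cassert>
#include <climits>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__global__ void fullSumKernel(const float* input, int n, float* output,
                              unsigned int* semaphore) {
  const int first_index = blockIdx.x * blockDim.x + threadIdx.x;

  // Phase 1: make sure *output is initialized exactly once before any block
  // accumulates into it.
  if (gridDim.x == 1) {
    if (first_index == 0) {
      *output = 0.0f;
    }
  } else {
    if (threadIdx.x == 0) {
      unsigned int block = atomicCAS(semaphore, 0u, 1u);
      if (block == 0) {
        // This block won the race: initialize the output, then publish.
        *output = 0.0f;
        __threadfence();
        atomicExch(semaphore, 2u);
      } else {
        // Spin until the initializing block has published the output.
        while (atomicCAS(semaphore, 2u, 2u) < 2u) {
        }
      }
    }
  }
  __syncthreads();

  // Phase 2: each thread sums a strided slice, then folds its partial sum
  // into the global output with one atomicAdd.
  float accum = 0.0f;
  for (int i = first_index; i < n; i += blockDim.x * gridDim.x) {
    accum += input[i];
  }
  atomicAdd(output, accum);

  // Phase 3: the last block to finish resets the semaphore to 0, restoring
  // the invariant assumed at kernel start. atomicInc returns the old value,
  // so the last of the gridDim.x incrementing threads sees gridDim.x + 1.
  if (gridDim.x > 1 && threadIdx.x == 0) {
    __threadfence();
    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
    if (ticket == gridDim.x + 1) {
      *semaphore = 0;
    }
  }
}

int main() {
  const int n = 1 << 20;
  std::vector<float> host(n, 1.0f);
  float *in = NULL, *out = NULL;
  unsigned int* sem = NULL;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, sizeof(float));
  cudaMalloc(&sem, sizeof(unsigned int));
  cudaMemcpy(in, host.data(), n * sizeof(float), cudaMemcpyHostToDevice);
  // The semaphore must start at 0, mirroring the one-time cudaMemsetAsync in
  // CudaStreamDevice::semaphore(); the kernel itself restores the 0 afterwards.
  cudaMemset(sem, 0, sizeof(unsigned int));

  fullSumKernel<<<32, 256>>>(in, n, out, sem);

  float result = 0.0f;
  cudaMemcpy(&result, out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f (expected %d)\n", result, n);

  cudaFree(in);
  cudaFree(out);
  cudaFree(sem);
  return 0;
}

The point of this arrangement, as in the patch, is that the output initialization happens inside the reduction kernel itself: the separate ReductionInitKernel launch that the old code issued when num_blocks > 1 is no longer needed, which is where the measured speedup on small and medium inputs comes from.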