Improved the performance of full reductions.

AFTER: BM_fullReduction/10 4541 4543 154017 21.0M items/s BM_fullReduction/64 5191 5193 100000 752.5M items/s BM_fullReduction/512 9588 9588 71361 25.5G items/s BM_fullReduction/4k 244314 244281 2863 64.0G items/s BM_fullReduction/5k 359382 359363 1946 64.8G items/s BEFORE: BM_fullReduction/10 9085 9087 74395 10.5M items/s BM_fullReduction/64 9478 9478 72014 412.1M items/s BM_fullReduction/512 14643 14646 46902 16.7G items/s BM_fullReduction/4k 260338 260384 2678 60.0G items/s BM_fullReduction/5k 385076 385178 1818 60.5G items/s
author: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-06-03 17:27:08 -0700
committer: Benoit Steiner <benoit.steiner.goog@gmail.com> 2016-06-03 17:27:08 -0700
commit: c2a102345f627e4cd1908dad03e6ef0cbb2170c0 (patch)
tree: 37dcab31e24ba0f2e313c1c7e842ffb26032c7f5 /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
parent: 8d97ba6b2251aabf325ff74f24959ceaa85cf11e (diff)
1 files changed, 41 insertions, 11 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 45087a9a4..0d1a098b7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -112,17 +112,40 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe
   }
 }
 
+
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
 __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output) {
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+  // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-  if (gridDim.x == 1 && first_index == 0) {
-    *output = reducer.initialize();
-    __syncthreads();
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
   }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExch(output, reducer.initialize());
+        unsigned int old = atomicExch(semaphore, 2u);
+        assert(old == 1u);
+      }
+      else {
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val = atomicCAS(semaphore, 2u, 2u);
+        while (val < 2u) {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
 
   typename Self::CoeffReturnType accum = reducer.initialize();
   Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
@@ -141,6 +164,15 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
   if ((threadIdx.x & (warpSize - 1)) == 0) {
     atomicReduce(output, accum, reducer);
   }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    unsigned int ticket = atomicInc(semaphore, UINT_MAX);
+    assert(ticket >= 2u);
+    if (ticket == gridDim.x + 1) {
+      // We're the last block, reset the semaphore
+      *semaphore = 0;
+    }
+  }
 }
 
 
@@ -246,15 +278,13 @@ struct FullReductionLauncher<Self, Op, float, PacketAccess> {
     const int num_per_thread = 128;
     const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
 
+    unsigned int* semaphore = NULL;
     if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<Scalar, Index>),
-                         1, 32, 0, device, reducer.initialize(), 1, output);
+      semaphore = device.semaphore();
     }
 
     LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
   }
 };
author	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-06-03 17:27:08 -0700
committer	Benoit Steiner <benoit.steiner.goog@gmail.com>	2016-06-03 17:27:08 -0700
commit	c2a102345f627e4cd1908dad03e6ef0cbb2170c0 (patch)
tree	37dcab31e24ba0f2e313c1c7e842ffb26032c7f5 /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
parent	8d97ba6b2251aabf325ff74f24959ceaa85cf11e (diff)