author    Benoit Steiner <benoit.steiner.goog@gmail.com>  2016-05-09 17:09:54 -0700
committer Benoit Steiner <benoit.steiner.goog@gmail.com>  2016-05-09 17:09:54 -0700
commit    4670d7d5ce2517b2e9201f1cf44ae62ef2284eb5 (patch)
tree      15f6faeec806068d4a786146ab92d557156321db /unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
parent    c3859a2b584730ed1cb155a6d9bc422592d8d26b (diff)
Improved the performance of full reductions on GPU:
Before:
  BM_fullReduction/10      200000       11751      8.51 MFlops/s
  BM_fullReduction/80        5000      523385     12.23 MFlops/s
  BM_fullReduction/640         50    36179326     11.32 MFlops/s
  BM_fullReduction/4K           1  2173517195     11.50 MFlops/s
After:
  BM_fullReduction/10      500000        5987     16.70 MFlops/s
  BM_fullReduction/80      200000       10636    601.73 MFlops/s
  BM_fullReduction/640      50000       58428   7010.31 MFlops/s
  BM_fullReduction/4K        1000     2006106  12461.95 MFlops/s
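For context, the benchmark exercises a full reduction, in which every coefficient of the input collapses into a single scalar on the device. A minimal sketch of the measured operation (illustrative only; it assumes the CUDA-enabled Tensor module and Eigen's CudaStreamDevice helper, leaves the input uninitialized, and omits error handling):

#include <unsupported/Eigen/CXX11/Tensor>

// Sum-reduce an n x n float tensor on the GPU, the operation timed
// by BM_fullReduction above. Input data is left uninitialized here;
// a real benchmark would fill it before reducing.
void fullReductionOnce(int n) {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  float* in_data  = static_cast<float*>(gpu_device.allocate(n * n * sizeof(float)));
  float* out_data = static_cast<float*>(gpu_device.allocate(sizeof(float)));

  Eigen::TensorMap<Eigen::Tensor<float, 2> > in(in_data, n, n);
  Eigen::TensorMap<Eigen::Tensor<float, 0> > out(out_data);

  // The whole tensor collapses to one scalar; on the GPU this
  // dispatches to the FullReductionKernel touched by this patch.
  out.device(gpu_device) = in.sum();
  gpu_device.synchronize();

  gpu_device.deallocate(in_data);
  gpu_device.deallocate(out_data);
}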
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h | 170
1 file changed, 151 insertions(+), 19 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index fd2587dd5..9186dffe4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,6 +67,30 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
#endif
}
+
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
+#if __CUDA_ARCH__ >= 300
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+#else
+ assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
template <typename T>
__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
#if __CUDA_ARCH__ >= 300
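The half2 overload added above is an instance of the classic compare-and-swap retry loop: view the 32-bit half2 cell as an unsigned int, merge the incoming value into a candidate, and publish it with atomicCAS, retrying whenever another thread updated the cell in the meantime. The same idiom, sketched standalone for a float max reduction (a hypothetical helper, not part of this patch):

__device__ void atomicMaxFloat(float* output, float value) {
  unsigned int* cell = reinterpret_cast<unsigned int*>(output);
  unsigned int oldval = *cell;
  for (;;) {
    // Merge the incoming value into a candidate result.
    unsigned int newval = __float_as_uint(fmaxf(__uint_as_float(oldval), value));
    if (newval == oldval) return;     // cell already holds the result
    unsigned int readback = atomicCAS(cell, oldval, newval);
    if (readback == oldval) return;   // CAS succeeded, value published
    oldval = readback;                // lost the race: retry with fresh value
  }
}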
@@ -108,7 +132,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
- reducer.reduce(__shfl_down(accum, offset), &accum);
+ reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
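The one-line change in this hunk makes the shuffle width explicit. The loop it sits in is the standard warp-level tree reduction; written standalone for a float sum it looks like this (a hypothetical helper, using the pre-CUDA-9 __shfl_down just as the patch does):

__device__ float warpReduceSum(float val) {
  // Halve the stride each step; after log2(warpSize) steps lane 0
  // holds the sum of all values in the warp.
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    val += __shfl_down(val, offset, warpSize);
  }
  return val;  // meaningful in lane 0 only
}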
@@ -117,28 +141,78 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
}
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
- // Unfortunately nvidia doesn't support well exotic types such as complex,
- // so reduce the scope of the optimized version of the code to the simple case
- // of floats.
- static const bool HasOptimizedImplementation = !Op::IsStateful &&
- internal::is_same<typename Self::CoeffReturnType, float>::value;
-
- template <typename OutputType>
- static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) {
- assert(false && "Should only be called on floats");
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+ eigen_assert(threadIdx.x == 0);
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
}
+}
- static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
- typedef typename Self::Index Index;
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+static __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+ half* output, half2* scratch) {
+ eigen_assert(NumPerThread % 2 == 0);
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- // Don't crash when we're called with an input tensor of size 0.
- if (num_coeffs == 0) {
- return;
+ const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
+
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+ if (gridDim.x == 1 && first_index == 0) {
+ if (num_coeffs % 2 != 0) {
+ half last = input.m_impl.coeff(num_coeffs-1);
+ *scratch = __halves2half2(last, reducer.initialize());
+ } else {
+ *scratch = reducer.template initializePacket<half2>();
}
+ }
+
+ half2 accum = reducer.template initializePacket<half2>();
+ Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+ for (Index i = 0; i < max_iter; i += BlockSize) {
+ const Index index = first_index + 2*i;
+ eigen_assert(index + 1 < num_coeffs);
+ half2 val = input.m_impl.template packet<Unaligned>(index);
+ reducer.reducePacket(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(scratch, accum, reducer);
+ }
+
+ __syncthreads();
+
+ if (gridDim.x == 1 && first_index == 0) {
+ reducer.reduce(__low2half(*scratch), output);
+ reducer.reduce(__high2half(*scratch), output);
+ }
+}
+
+template <typename Op>
+__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half2* scratch) {
+ eigen_assert(threadIdx.x == 0);
+ reducer.reduce(__low2half(*scratch), output);
+ reducer.reduce(__high2half(*scratch), output);
+}
+
+#endif
+
+template <typename Self, typename Op, bool is_half>
+struct Launcher {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, typename Self::CoeffReturnType* output, typename Self::Index num_coeffs) {
+ typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType Scalar;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
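The half-float kernels in this hunk move packed fp16 pairs around with a few cuda_fp16.h intrinsics: __halves2half2 packs two half scalars into one 32-bit half2, and __low2half/__high2half extract the two lanes again, which is how the final scalar is folded out of the packed accumulator in scratch. A minimal round-trip sketch (a hypothetical kernel, for illustration):

#include <cuda_fp16.h>

__global__ void half2RoundTrip(const __half* in, __half* out) {
  // Pack two scalars into a single 32-bit half2 register.
  __half2 packed = __halves2half2(in[0], in[1]);
  // Unpack them again: low lane first, high lane second.
  out[0] = __low2half(packed);
  out[1] = __high2half(packed);
}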
@@ -155,6 +229,64 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
}
};
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct Launcher<Self, Op, true> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+ typedef typename Self::Index Index;
+
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ half2* scratch = static_cast<half2*>(device.scratchpad());
+
+ if (num_blocks > 1) {
+ // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
+ // won't be a race condition between multiple thread blocks.
+ LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+ 1, 1, 0, device, reducer, self, num_coeffs, scratch);
+ }
+
+ LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+ if (num_blocks > 1) {
+ LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+ 1, 1, 0, device, reducer, output, scratch);
+ }
+ }
+};
+#endif
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+ // Unfortunately nvidia doesn't provide good support for exotic types such as
+ // complex, so we reduce the scope of the optimized version of the code to the
+ // simple cases of floats and half floats.
+#ifdef EIGEN_HAS_CUDA_FP16
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value);
+#else
+ static const bool HasOptimizedImplementation = !Op::IsStateful &&
+ internal::is_same<typename Self::CoeffReturnType, float>::value;
+#endif
+
+ template <typename OutputType>
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return;
+ }
+
+ static const bool is_half = internal::is_same<typename Self::CoeffReturnType, half>::value;
+ Launcher<Self, Op, is_half>::run(self, reducer, device, output, num_coeffs);
+ }
+};
+
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
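For a sense of scale, the launch shape above hands each thread num_per_thread coefficients and rounds the block count up with Eigen's divup (ceiling division). A worked instance of that arithmetic, assuming the BM_fullReduction/4K case is a 4096x4096 input:

#include <cstdio>

// Same shape as Eigen's internal divup: ceiling division.
static int divup(int x, int y) { return (x + y - 1) / y; }

int main() {
  const int num_coeffs = 4096 * 4096;  // assumed shape of the /4K benchmark
  const int block_size = 256;          // threads per block
  const int num_per_thread = 128;      // coefficients per thread
  // 16777216 / (256 * 128) = 512 blocks; their partial results are then
  // merged into the single output through the atomic reduction on scratch.
  printf("num_blocks = %d\n", divup(num_coeffs, block_size * num_per_thread));
  return 0;
}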
@@ -323,7 +455,7 @@ struct OuterReducer<Self, Op, GpuDevice> {
return true;
}
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
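With the new FullReducer specialization in place, half-precision full reductions now dispatch to the half2 kernels above. A usage sketch mirroring the float case (illustrative only; assumes EIGEN_HAS_CUDA_FP16 is defined and a GPU with native fp16 support):

#include <unsupported/Eigen/CXX11/Tensor>

// Full sum-reduction over Eigen::half input; with EIGEN_HAS_CUDA_FP16
// defined this takes the packed-half2 path added by this patch.
void halfFullReduction(int n) {
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::half* in_data =
      static_cast<Eigen::half*>(gpu_device.allocate(n * sizeof(Eigen::half)));
  Eigen::half* out_data =
      static_cast<Eigen::half*>(gpu_device.allocate(sizeof(Eigen::half)));

  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1> > in(in_data, n);
  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0> > out(out_data);

  out.device(gpu_device) = in.sum();  // single scalar result on the device
  gpu_device.synchronize();

  gpu_device.deallocate(in_data);
  gpu_device.deallocate(out_data);
}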