path: root/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
author     A. Unique TensorFlower <gardener@tensorflow.org>  2017-10-17 15:03:01 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>   2017-10-17 15:06:25 -0700
commit     a925c8dcaf57506c0f7ad167aad6794a88878ca3 (patch)
tree       4f67279b5ba2e58b6dab2c5821fe262a8ee2e80d /tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
parent     2fb1f1d837d8f86f3ad753ea235a1b3a22ba195f (diff)
Improve performance of tf.space_to_depth and tf.depth_to_space for typical block sizes in the NCHW data layout on GPU by using a loop kernel.
PiperOrigin-RevId: 172520132
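For reference, tf.space_to_depth with NCHW data maps an input of shape [N, C, H, W] to an output of shape [N, C*b*b, H/b, W/b], reordering (n, iC, oY, bY, oX, bX) into (n, bY, bX, iC, oY, oX); this is the mapping both GPU kernels in the diff below implement. A minimal host-side sketch of that mapping, purely illustrative and not part of this commit (the function name and the plain std::vector interface are assumptions):

#include <vector>

// Hypothetical reference (not from this commit): space_to_depth for NCHW.
// Maps input[N, C, H, W] to output[N, C*b*b, H/b, W/b], i.e. reorders
// (n, iC, oY, bY, oX, bX) into (n, bY, bX, iC, oY, oX).
std::vector<float> SpaceToDepthNCHWReference(const std::vector<float>& input,
                                             int batch, int depth, int height,
                                             int width, int block_size) {
  const int out_h = height / block_size;
  const int out_w = width / block_size;
  const int out_depth = depth * block_size * block_size;
  std::vector<float> output(input.size());
  for (int n = 0; n < batch; ++n)
    for (int c = 0; c < depth; ++c)
      for (int oy = 0; oy < out_h; ++oy)
        for (int by = 0; by < block_size; ++by)
          for (int ox = 0; ox < out_w; ++ox)
            for (int bx = 0; bx < block_size; ++bx) {
              // NCHW input element (n, c, oy*b + by, ox*b + bx).
              const int in_idx =
                  ((n * depth + c) * height + oy * block_size + by) * width +
                  ox * block_size + bx;
              // Output channel (by*b + bx)*depth + c at spatial (oy, ox).
              const int out_c = (by * block_size + bx) * depth + c;
              const int out_idx =
                  ((n * out_depth + out_c) * out_h + oy) * out_w + ox;
              output[out_idx] = input[in_idx];
            }
  return output;
}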
Diffstat (limited to 'tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc')
-rw-r--r--  tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc | 83
1 file changed, 76 insertions(+), 7 deletions(-)
diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
index 94c7a0a3f6..a1a01e8813 100644
--- a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
@@ -66,10 +66,6 @@ __global__ void S2D_NCHW(const int32 nthreads,
const int block_size, const int output_width,
const int input_depth_by_output_height,
dtype* __restrict__ output_ptr) {
- // TODO(pauldonnelly): This kernel gets input coalescing, but not output
- // coalescing. We could use shared memory to get both. It may also help
- // to amortize the address calculations via an inner loop over block_size.
- // A template parameter for the block_size is another potential optimization.
CUDA_1D_KERNEL_LOOP(input_idx, nthreads) {
// We assume both the input and output are packed NCHW tensors.
// input_idx represents an index within the flattened input tensor.
@@ -100,6 +96,48 @@ __global__ void S2D_NCHW(const int32 nthreads,
}
}
+// Space2Depth kernel for FORMAT_NCHW using a loop over block area.
+// See 'spacetodepth_op.h' for functional specification.
+template <typename dtype, int block_size>
+__global__ void S2D_NCHW_LOOP(const int32 nthreads,
+ const dtype* __restrict__ input,
+ const int output_width, const int input_width,
+ const int input_depth_by_output_area,
+ const int output_depth_by_output_area,
+ dtype* __restrict__ output) {
+ CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+ // We will be converting the image from ordering:
+ // n, iC, oY, bY, oX, bX (== input index) to
+ // n, bY, bX, iC, oY, oX (== output index)
+
+ // We assume thread_idx encodes n_iC_oY_oX, and use an unrolled loop over
+ // bY and bX coordinates within the block. This kernel gets a small
+ // performance improvement compared with S2D_NCHW due to a denser access
+ // pattern on the input side. (Note: the equivalent D2S kernel gets a larger
+ // improvement as a denser pattern on the output side makes more
+ // difference).
+
+ const int n_iC_oY = thread_idx / output_width;
+ const int oX = thread_idx - n_iC_oY * output_width;
+ const int n = thread_idx / input_depth_by_output_area;
+ const int iC_oY_oX = thread_idx - n * input_depth_by_output_area;
+
+ // Recombine the components and apply to the input and output pointers.
+ auto input_ptr = input + (n_iC_oY * input_width + oX) * block_size;
+ auto output_ptr = output + n * output_depth_by_output_area + iC_oY_oX;
+
+ // Copy a patch of data to the output batch image.
+#pragma unroll
+ for (int bY = 0; bY < block_size; ++bY) {
+#pragma unroll
+ for (int bX = 0; bX < block_size; ++bX) {
+ output_ptr[(bY * block_size + bX) * input_depth_by_output_area] =
+ ldg(input_ptr + bY * input_width + bX);
+ }
+ }
+ }
+}
+
// Specialization of SpaceToDepthOpFunctor for a GPUDevice.
namespace functor {
template <typename T>
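To make the index bookkeeping in S2D_NCHW_LOOP concrete, here is a hypothetical worked example, not part of the commit, for a 1x2x4x4 NCHW input with block_size = 2 (output shape 1x8x2x2); the standalone main() and the hard-coded constants are illustrative assumptions:

#include <cassert>

int main() {
  // Shapes: input 1x2x4x4 (NCHW), block_size 2, output 1x8x2x2.
  const int block_size = 2, output_width = 2, input_width = 4;
  const int input_depth_by_output_area = 2 * (2 * 2);   // iC * oY * oX = 8
  const int output_depth_by_output_area = 8 * (2 * 2);  // oC * oY * oX = 32

  // thread_idx = 5 encodes (n = 0, iC = 1, oY = 0, oX = 1).
  const int thread_idx = 5;
  const int n_iC_oY = thread_idx / output_width;                     // 2
  const int oX = thread_idx - n_iC_oY * output_width;                // 1
  const int n = thread_idx / input_depth_by_output_area;             // 0
  const int iC_oY_oX = thread_idx - n * input_depth_by_output_area;  // 5

  // Patch origin in the input: element (n=0, c=1, y=0, x=2), flat offset 18.
  assert((n_iC_oY * input_width + oX) * block_size == 18);
  // Patch origin in the output: element (n=0, c=1, oy=0, ox=1), flat offset 5.
  assert(n * output_depth_by_output_area + iC_oY_oX == 5);
  // The (bY, bX) = (1, 0) copy lands at output channel (1*2+0)*2 + 1 = 5,
  // i.e. flat offset 5 + 2 * input_depth_by_output_area = 21.
  assert(n * output_depth_by_output_area + iC_oY_oX +
             (1 * block_size + 0) * input_depth_by_output_area == 21);
  return 0;
}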
@@ -137,9 +175,40 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> {
const int output_depth = output.dimension(1);
const int output_height = output.dimension(2);
const int output_width = output.dimension(3);
-
- const int total_count =
- batch_size * output_height * output_width * output_depth;
+ const int output_area = output_width * output_height;
+ const int output_depth_by_output_area = output_depth * output_area;
+
+ // We improve performance by generating instantiations of the loop kernel
+ // for the most common block sizes.
+ if (block_size <= 4) {
+ const int input_width = input.dimension(3);
+ const int input_depth_by_output_area = input_depth * output_area;
+ const int total_count = batch_size * input_depth_by_output_area;
+ CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
+ switch (block_size) {
+ case 2:
+ return S2D_NCHW_LOOP<T, 2>
+ <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ total_count, input.data(), output_width, input_width,
+ input_depth_by_output_area, output_depth_by_output_area,
+ output.data());
+ case 3:
+ return S2D_NCHW_LOOP<T, 3>
+ <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ total_count, input.data(), output_width, input_width,
+ input_depth_by_output_area, output_depth_by_output_area,
+ output.data());
+ case 4:
+ return S2D_NCHW_LOOP<T, 4>
+ <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+ total_count, input.data(), output_width, input_width,
+ input_depth_by_output_area, output_depth_by_output_area,
+ output.data());
+ }
+ }
+
+ // Other block sizes are processed by the generic kernel.
+ const int total_count = batch_size * output_depth_by_output_area;
CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
S2D_NCHW<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
config.virtual_thread_count, input.data(), block_size, output_width,
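The functor above maps the runtime block_size onto kernel instantiations that carry it as a compile-time template parameter, so the #pragma unroll loops have a known trip count and can be fully unrolled; uncommon block sizes fall back to the generic S2D_NCHW kernel. A minimal host-side sketch of this dispatch pattern, with hypothetical CopyPatch/Dispatch names and a flat-pointer interface that are not TensorFlow APIs:

// A compile-time block size lets the compiler fully unroll the copy loops.
template <int kBlockSize>
void CopyPatch(const float* in, float* out, int in_stride, int out_stride) {
  for (int by = 0; by < kBlockSize; ++by)
    for (int bx = 0; bx < kBlockSize; ++bx)
      out[(by * kBlockSize + bx) * out_stride] = in[by * in_stride + bx];
}

// Generic fallback when block_size is only known at run time.
void CopyPatchGeneric(const float* in, float* out, int block_size,
                      int in_stride, int out_stride) {
  for (int by = 0; by < block_size; ++by)
    for (int bx = 0; bx < block_size; ++bx)
      out[(by * block_size + bx) * out_stride] = in[by * in_stride + bx];
}

// Runtime switch that selects a specialized instantiation for common sizes.
void Dispatch(const float* in, float* out, int block_size, int in_stride,
              int out_stride) {
  switch (block_size) {
    case 2: return CopyPatch<2>(in, out, in_stride, out_stride);
    case 3: return CopyPatch<3>(in, out, in_stride, out_stride);
    case 4: return CopyPatch<4>(in, out, in_stride, out_stride);
    default:
      return CopyPatchGeneric(in, out, block_size, in_stride, out_stride);
  }
}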