author     A. Unique TensorFlower <gardener@tensorflow.org>  2017-10-17 15:03:01 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>   2017-10-17 15:06:25 -0700
commit     a925c8dcaf57506c0f7ad167aad6794a88878ca3
tree       4f67279b5ba2e58b6dab2c5821fe262a8ee2e80d /tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
parent     2fb1f1d837d8f86f3ad753ea235a1b3a22ba195f
Improve performance of tf.space_to_depth and tf.depth_to_space for typical block sizes of NCHW data layout on GPU with a loop kernel.
PiperOrigin-RevId: 172520132
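
In short: the new S2D_NCHW_LOOP kernel below makes the block size a compile-time template parameter so the per-block copy loops can be fully unrolled, and a runtime switch dispatches the common sizes (2-4) to the matching instantiation. A minimal standalone CUDA sketch of that dispatch idiom follows; the names UnrolledCopy and LaunchUnrolledCopy are illustrative only and not part of this commit, and tail handling for n % block != 0 is omitted:

#include <cuda_runtime.h>

// Toy kernel: because BLOCK is known at compile time, the compiler can
// fully unroll the inner loop and keep the address arithmetic in
// registers. This is the same idiom the committed loop kernel relies on.
template <int BLOCK>
__global__ void UnrolledCopy(const float* in, float* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i * BLOCK + BLOCK <= n) {
#pragma unroll
    for (int b = 0; b < BLOCK; ++b) {
      out[i * BLOCK + b] = in[i * BLOCK + b];
    }
  }
}

// Runtime value -> compile-time instantiation, mirroring the switch added
// to the functor in this commit.
void LaunchUnrolledCopy(int block, const float* in, float* out, int n,
                        cudaStream_t stream) {
  const int threads = 256;
  const int items = n / block;
  const int blocks = (items + threads - 1) / threads;
  switch (block) {
    case 2: UnrolledCopy<2><<<blocks, threads, 0, stream>>>(in, out, n); break;
    case 3: UnrolledCopy<3><<<blocks, threads, 0, stream>>>(in, out, n); break;
    case 4: UnrolledCopy<4><<<blocks, threads, 0, stream>>>(in, out, n); break;
    default: break;  // other sizes would fall back to a generic kernel
  }
}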
Diffstat (limited to 'tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc')
-rw-r--r--  tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc  83
1 file changed, 76 insertions(+), 7 deletions(-)
diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
index 94c7a0a3f6..a1a01e8813 100644
--- a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
@@ -66,10 +66,6 @@ __global__ void S2D_NCHW(const int32 nthreads,
                          const int block_size, const int output_width,
                          const int input_depth_by_output_height,
                          dtype* __restrict__ output_ptr) {
-  // TODO(pauldonnelly): This kernel gets input coalescing, but not output
-  // coalescing. We could use shared memory to get both. It may also help
-  // to amortize the address calculations via an inner loop over block_size.
-  // A template parameter for the block_size is another potential optimization.
   CUDA_1D_KERNEL_LOOP(input_idx, nthreads) {
     // We assume both the input and output are packed NCHW tensors.
     // input_idx represents an index within the flattened input tensor.
@@ -100,6 +96,48 @@ __global__ void S2D_NCHW(const int32 nthreads,
   }
 }
 
+// Space2Depth kernel for FORMAT_NCHW using a loop over block area.
+// See 'spacetodepth_op.h' for functional specification.
+template <typename dtype, int block_size>
+__global__ void S2D_NCHW_LOOP(const int32 nthreads,
+                              const dtype* __restrict__ input,
+                              const int output_width, const int input_width,
+                              const int input_depth_by_output_area,
+                              const int output_depth_by_output_area,
+                              dtype* __restrict__ output) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    // We will be converting the image from ordering:
+    // n, iC, oY, bY, oX, bX   (== input index) to
+    // n, bY, bX, iC, oY, oX   (== output index)
+
+    // We assume thread_idx encodes n_iC_oY_oX, and use an unrolled loop over
+    // bY and bX coordinates within the block. This kernel gets a small
+    // performance improvement compared with S2D_NCHW due to a denser access
+    // pattern on the input side. (Note: the equivalent D2S kernel gets a
+    // larger improvement as a denser pattern on the output side makes more
+    // difference).
+
+    const int n_iC_oY = thread_idx / output_width;
+    const int oX = thread_idx - n_iC_oY * output_width;
+    const int n = thread_idx / input_depth_by_output_area;
+    const int iC_oY_oX = thread_idx - n * input_depth_by_output_area;
+
+    // Recombine the components and apply to the input and output pointers.
+    auto input_ptr = input + (n_iC_oY * input_width + oX) * block_size;
+    auto output_ptr = output + n * output_depth_by_output_area + iC_oY_oX;
+
+#pragma unroll
+    // Copy a patch of data to the output batch image.
+    for (int bY = 0; bY < block_size; ++bY) {
+#pragma unroll
+      for (int bX = 0; bX < block_size; ++bX) {
+        output_ptr[(bY * block_size + bX) * input_depth_by_output_area] =
+            ldg(input_ptr + bY * input_width + bX);
+      }
+    }
+  }
+}
+
 // Specialization of SpaceToDepthOpFunctor for a CPUDevice.
 namespace functor {
 template <typename T>
@@ -137,9 +175,40 @@ struct SpaceToDepthOpFunctor<GPUDevice, T, FORMAT_NCHW> {
     const int output_depth = output.dimension(1);
     const int output_height = output.dimension(2);
     const int output_width = output.dimension(3);
-
-    const int total_count =
-        batch_size * output_height * output_width * output_depth;
+    const int output_area = output_width * output_height;
+    const int output_depth_by_output_area = output_depth * output_area;
+
+    // We improve performance by generating instantiations of the loop kernel
+    // for the most common block sizes.
+    if (block_size <= 4) {
+      const int input_width = input.dimension(3);
+      const int input_depth_by_output_area = input_depth * output_area;
+      const int total_count = batch_size * input_depth_by_output_area;
+      CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
+      switch (block_size) {
+        case 2:
+          return S2D_NCHW_LOOP<T, 2>
+              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+                  total_count, input.data(), output_width, input_width,
+                  input_depth_by_output_area, output_depth_by_output_area,
+                  output.data());
+        case 3:
+          return S2D_NCHW_LOOP<T, 3>
+              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+                  total_count, input.data(), output_width, input_width,
+                  input_depth_by_output_area, output_depth_by_output_area,
+                  output.data());
+        case 4:
+          return S2D_NCHW_LOOP<T, 4>
+              <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+                  total_count, input.data(), output_width, input_width,
+                  input_depth_by_output_area, output_depth_by_output_area,
+                  output.data());
+      }
+    }
+
+    // Other block sizes are processed by the generic kernel.
+    const int total_count = batch_size * output_depth_by_output_area;
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
     S2D_NCHW<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
         config.virtual_thread_count, input.data(), block_size, output_width,
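
For reference, the permutation the new kernel performs on a packed NCHW tensor is n, iC, oY, bY, oX, bX -> n, bY, bX, iC, oY, oX, per the comment in S2D_NCHW_LOOP. Below is a minimal host-side C++ sketch of that mapping, handy for checking a GPU result against a CPU baseline. The helper name is hypothetical and not part of this commit; it assumes the spatial dimensions divide evenly by block_size and a pre-sized output buffer:

#include <vector>

// Reference space-to-depth for packed NCHW data, matching the output
// channel ordering documented in S2D_NCHW_LOOP. Illustrative helper only.
template <typename T>
void SpaceToDepthNCHWReference(const std::vector<T>& input, int batch,
                               int input_depth, int input_height,
                               int input_width, int block_size,
                               std::vector<T>* output) {
  const int output_height = input_height / block_size;
  const int output_width = input_width / block_size;
  const int output_depth = input_depth * block_size * block_size;
  for (int n = 0; n < batch; ++n)
    for (int iC = 0; iC < input_depth; ++iC)
      for (int oY = 0; oY < output_height; ++oY)
        for (int bY = 0; bY < block_size; ++bY)
          for (int oX = 0; oX < output_width; ++oX)
            for (int bX = 0; bX < block_size; ++bX) {
              // Input element (n, iC, oY*block+bY, oX*block+bX), flattened.
              const int in_idx =
                  ((n * input_depth + iC) * input_height +
                   oY * block_size + bY) * input_width +
                  oX * block_size + bX;
              // Output channel packs (bY, bX, iC); spatial dims are (oY, oX).
              const int oC = (bY * block_size + bX) * input_depth + iC;
              const int out_idx =
                  ((n * output_depth + oC) * output_height + oY) *
                      output_width + oX;
              (*output)[out_idx] = input[in_idx];
            }
}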