author Yangzihao Wang <yangzihao@google.com> 2017-07-19 13:15:03 -0700
committer TensorFlower Gardener <gardener@tensorflow.org> 2017-07-19 13:19:10 -0700
commit 831a3a46d34e9bd305555889392c016690ff9bc4 (patch)
tree 454bf198209e43c411c62ec02a7a85a426434c92 /tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
parent 03a790d5bff835365a54a534fc1658dc287c0f63 (diff)
Automated g4 rollback of changelist 160314706
PiperOrigin-RevId: 162525519
Diffstat (limited to 'tensorflow/core/kernels/conv_ops_gpu_3.cu.cc')
-rw-r--r-- tensorflow/core/kernels/conv_ops_gpu_3.cu.cc 127
1 file changed, 123 insertions, 4 deletions
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index 2307c2de0e..bcabda848c 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -272,6 +272,87 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(const T* input,
}
}
+// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor
+// when only one of the two dimension sizes is smaller than 16,
+// where dimensions are zero-based: output[i][j][k] = input[i][k][j].
+//
+// small_dim = the smaller of the two swapped dimension sizes
+// large_dim = the larger of the two swapped dimension sizes
+// tile_num_per_block = gridDim.x (number of tiles covering large_dim)
+// kTileLength = blockDim.x (the tile width; currently 64)
+//
+// Each thread block operates on a single rectangular tile whose width is
+// kTileLength (we currently set it to 64) and whose height is small_dim.
+// We set the thread block's X dimension to kTileLength, and its Y and Z
+// dimensions to one.
+template <typename T, bool SmallDim2>
+__global__ void SwapDimension1And2InTensor3SmallDim(const T* input,
+ int batch_per_block,
+ Dimension<3> input_dims,
+ T* output) {
+ // TODO(yangzihao): avoid shared memory bank conflicts.
+ extern __shared__ __align__(sizeof(T)) unsigned char shmem[];
+ T* shared_memory_tile = reinterpret_cast<T*>(shmem);
+
+ eigen_assert(blockDim.y == 1);
+ eigen_assert(blockDim.z == 1);
+ eigen_assert(gridDim.z == 1);
+
+ int block_offset = blockIdx.x * blockDim.x * batch_per_block;
+
+ int x = threadIdx.x;
+ int tile_height = blockDim.x;
+
+ // Get the sizes of the two swapped dimensions.
+ int small_dim = SmallDim2 ? input_dims[2] : input_dims[1];
+ int large_dim = SmallDim2 ? input_dims[1] : input_dims[2];
+
+ int global_offset = small_dim * large_dim * (blockIdx.y * batch_per_block) +
+ (SmallDim2 ? block_offset * small_dim : block_offset) + x;
+ if (global_offset >= (input_dims[0] * input_dims[1] * input_dims[2])) return;
+
+ for (int batch = 0; batch < batch_per_block; ++batch) {
+ int block_origin_idx =
+ small_dim * large_dim * (blockIdx.y * batch_per_block + batch);
+ int thread_origin_idx =
+ block_origin_idx +
+ (SmallDim2 ? block_offset * small_dim : block_offset) + x;
+
+ if (block_offset + blockDim.x > large_dim) {
+ tile_height = large_dim - block_offset;
+ }
+
+ // Load a contiguous memory region into the shared memory tile.
+ if (x < tile_height) {
+ for (int y = 0; y < small_dim; y++) {
+ int shmem_index =
+ SmallDim2 ? (x + y * tile_height) : (x * small_dim + y);
+ shared_memory_tile[shmem_index] =
+ ldg(input + thread_origin_idx +
+ y * (SmallDim2 ? tile_height : large_dim));
+ }
+ }
+
+ __syncthreads();
+
+ // Get block origin index for output array.
+ int output_block_offset = block_origin_idx;
+ int output_block_idx = SmallDim2 ? block_offset : block_offset * small_dim;
+ int output_block_origin_idx = output_block_offset + output_block_idx;
+
+ // Store the transposed tile from shared memory back to device memory.
+ if (x < tile_height) {
+ for (int y = 0; y < small_dim; y++) {
+ int output_idx = output_block_origin_idx + x +
+ y * (SmallDim2 ? large_dim : tile_height);
+ int shmem_index =
+ SmallDim2 ? (x * small_dim + y) : (x + y * tile_height);
+ output[output_idx] = shared_memory_tile[shmem_index];
+ }
+ }
+ }
+}
+
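For reference, the transposition both kernels implement is output[i][j][k] = input[i][k][j] on a row-major [d0][d1][d2] tensor. Below is a minimal host-side sketch of that mapping, useful as a CPU baseline when testing the GPU kernels; the helper name and loop structure are illustrative only and not part of this patch.

template <typename T>
void SwapDimension1And2Reference(const T* input, const int dims[3], T* output) {
  // Output shape is {dims[0], dims[2], dims[1]}.
  for (int i = 0; i < dims[0]; ++i)
    for (int k = 0; k < dims[1]; ++k)    // input's dimension-1
      for (int j = 0; j < dims[2]; ++j)  // input's dimension-2
        output[(i * dims[2] + j) * dims[1] + k] =
            input[(i * dims[1] + k) * dims[2] + j];
}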
// A CUDA custom kernel that converts input to output, given proper padding on
// the left and the top. The padded value is zero.
template <typename T, int NDIMS>
@@ -420,25 +501,63 @@ template <typename T>
void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
const Dimension<3>& input_dims, T* output) {
// If both dimensions are not trivial, use tiles for the actual swapping.
+ // If only one dimension is trivial, use the SmallDim kernel for swapping.
// Otherwise, the trivial swapping relying on the ldg cache is more efficient.
static const int kMinDimensionToUseTiles = 16;
bool use_tiles = (input_dims[1] >= kMinDimensionToUseTiles &&
input_dims[2] >= kMinDimensionToUseTiles);
+ bool use_small_dim = ((input_dims[1] >= kMinDimensionToUseTiles &&
+ input_dims[2] < kMinDimensionToUseTiles)) ||
+ ((input_dims[1] < kMinDimensionToUseTiles &&
+ input_dims[2] >= kMinDimensionToUseTiles));
+ static const int NumSubTiles = 8;
+
if (use_tiles) {
- // We get best performance when TileSize is the number of threads in a warp
- // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
- // threads.
static const int TileSize = 32;
- static const int NumSubTiles = 8;
Dimension<3> input_dims_in_tiles = {
input_dims[0], (input_dims[1] + TileSize - 1) / TileSize,
(input_dims[2] + TileSize - 1) / TileSize,
};
int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
input_dims_in_tiles[2];
+ // We get best performance when TileSize is the number of threads in a warp
+ // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
+ // threads.
SwapDimension1And2InTensor3UsingTiles<T, TileSize, NumSubTiles><<<
total_tiles_count, dim3(TileSize, NumSubTiles), 0, d.stream()>>>(
input, input_dims, output);
+ } else if (use_small_dim) {
+ // When only one of the dimensions is smaller than kMinDimensionToUseTiles,
+ // we use one block to process a rectangle region with the size of
+ // kTileLength * small_dim. We found that setting kTileLength to 64 achieves
+ // the best performance on a Titan X (Maxwell) GPU.
+ //                        large_dim
+ //            +---------------...--------+
+ //            |          |       |       |
+ //  small_dim |          |  ...  |       |
+ //            |          |       |       |
+ //            +--------------...---------+
+ //            \--- -----/          \- --/
+ //                 V                  V
+ //     kTileLength (tile_height)   tile_height
+ static const int kTileLength = 64;
+ static const int kGridDimY = 2048;
+ int small_dim = std::min(input_dims[2], input_dims[1]);
+ int large_dim = std::max(input_dims[2], input_dims[1]);
+ int tile_num_per_block = (large_dim + kTileLength - 1) / kTileLength;
+ int grid_dim_y = input_dims[0] < kGridDimY ? input_dims[0] : kGridDimY;
+ int batch_per_block = (input_dims[0] + grid_dim_y - 1) / grid_dim_y;
+ if (input_dims[2] < input_dims[1]) {
+ SwapDimension1And2InTensor3SmallDim<T, true>
+ <<<dim3(tile_num_per_block, grid_dim_y), kTileLength,
+ kTileLength * small_dim * sizeof(T), d.stream()>>>(
+ input, batch_per_block, input_dims, output);
+ } else {
+ SwapDimension1And2InTensor3SmallDim<T, false>
+ <<<dim3(tile_num_per_block, grid_dim_y), kTileLength,
+ kTileLength * small_dim * sizeof(T), d.stream()>>>(
+ input, batch_per_block, input_dims, output);
+ }
} else {
int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
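To make the launch arithmetic in the SmallDim branch concrete, here is a worked example for a hypothetical shape input_dims = {128, 4, 1000}, where dimension-1 is trivial (4 < 16). The shape and float element type are assumptions; the formulas mirror the host code above.

void SmallDimLaunchExample() {
  const int kTileLength = 64, kGridDimY = 2048;
  const int dims[3] = {128, 4, 1000};
  int small_dim = 4;     // min(dims[1], dims[2])
  int large_dim = 1000;  // max(dims[1], dims[2])
  int tile_num_per_block = (large_dim + kTileLength - 1) / kTileLength;  // 16
  int grid_dim_y = dims[0] < kGridDimY ? dims[0] : kGridDimY;            // 128
  int batch_per_block = (dims[0] + grid_dim_y - 1) / grid_dim_y;         // 1
  int shmem_bytes = kTileLength * small_dim * sizeof(float);             // 1024
  // Resulting launch: grid (16, 128), block (64, 1, 1), 1024 bytes of dynamic
  // shared memory, with SmallDim2 = false since dims[2] >= dims[1].
}

Each of the 16 blocks along X covers a 64-wide slice of large_dim, while the Y grid dimension walks the 128 batches, one batch per block in this case.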