path: root/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
author    A. Unique TensorFlower <gardener@tensorflow.org>  2016-11-10 16:47:07 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>   2016-11-10 17:05:29 -0800
commit    f803bd7c5338d522d262314bd1e0eb4021367c3d (patch)
tree      7b586f70c2e192b365afd137e34afe3ec34d1978 /tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
parent    5b5b6f55287b8d1a663d101e5dbf74fca2425891 (diff)
Add a new op split_v that can handle variable size splits.
Aside from being useful on its own, this op also makes the implementation of the gradient of concat much more efficient. Previously, a new slice op was created in the graph for every input tensor to concat; this op moves that logic into a single op. The overhead could be quite significant in cases with 100+ input Tensors to concat.
Change: 138822942
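To illustrate the gradient change described above, here is a minimal host-side C++ sketch (illustrative only; SplitVariable, concat_grad, and input_sizes are made-up names, not TensorFlow APIs) of handing each concat input its gradient chunk in one pass instead of emitting a separate slice op per input:

#include <cstddef>
#include <vector>

// Illustrative only: cut one flat gradient buffer into per-input chunks
// whose sizes match the original concat inputs, in a single pass. This
// mirrors the role split_v plays in the concat gradient; it is not TF code.
std::vector<std::vector<float>> SplitVariable(
    const std::vector<float>& concat_grad,
    const std::vector<std::size_t>& input_sizes) {
  std::vector<std::vector<float>> grads;
  grads.reserve(input_sizes.size());
  std::size_t offset = 0;
  for (std::size_t size : input_sizes) {
    // Each chunk is what a separate slice op used to produce; here they all
    // come out of one op, so the graph no longer grows with the number of
    // concat inputs.
    grads.emplace_back(concat_grad.begin() + offset,
                       concat_grad.begin() + offset + size);
    offset += size;
  }
  return grads;
}

With inputs of sizes {3, 2, 4}, for example, a 9-element concat_grad is cut into three chunks starting at offsets 0, 3, and 5, matching what three separate slice ops would have produced.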
Diffstat (limited to 'tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc')
-rw-r--r--  tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc  |  55
1 file changed, 2 insertions, 53 deletions
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 9037a48580..019d6b6ab2 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/cuda_device_array_gpu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
namespace tensorflow {
@@ -31,58 +32,6 @@ typedef Eigen::GpuDevice GPUDevice;
namespace {
-struct Cuda2DLaunchConfig {
- dim3 virtual_thread_count;
- dim3 thread_per_block;
- dim3 block_count;
-};
-
-Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
- const GPUDevice& d) {
- Cuda2DLaunchConfig config;
-
- config.virtual_thread_count = dim3(xdim, ydim, 1);
-
- const int kThreadsPerBlock = 256;
- int block_cols = std::min(xdim, kThreadsPerBlock);
- // ok to round down here and just do more loops in the kernel
- int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
-
- const int physical_thread_count =
- d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
-
- const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
-
- config.thread_per_block = dim3(block_cols, block_rows, 1);
-
- int grid_x = std::min((xdim + block_cols - 1) / block_cols, max_blocks);
-
- config.block_count = dim3(
- grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
-
- return config;
-}
-
-template <typename IntType>
-__device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
- IntType* orig = first;
- IntType* it = nullptr;
- IntType step = 0;
- while (count > 0) {
- it = first;
- step = count / 2;
- it += step;
- if (!(val < *it)) {
- first = ++it;
- count -= step + 1;
- } else {
- count = step;
- }
- }
-
- return first - orig;
-}
-
template <typename T, typename IntType>
__global__ void concat_fixed_kernel(
CudaDeviceArrayStruct<const T*> input_ptr_data, int split_size,
@@ -139,7 +88,7 @@ __global__ void concat_variable_kernel(
// do an initial binary search and then scan linearly from there
// works well when there are many small segments and when the
// segments are much longer
- IntType segment = upper_bound<IntType>(col_scan, num_inputs, gidx) - 1;
+ IntType segment = gpu::upper_bound<IntType>(col_scan, num_inputs, gidx) - 1;
IntType curr_offset = col_scan[segment];
IntType curr_segment = segment;