diff options
author | 2016-11-10 16:47:07 -0800 | |
---|---|---|
committer | 2016-11-10 17:05:29 -0800 | |
commit | f803bd7c5338d522d262314bd1e0eb4021367c3d (patch) | |
tree | 7b586f70c2e192b365afd137e34afe3ec34d1978 /tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc | |
parent | 5b5b6f55287b8d1a663d101e5dbf74fca2425891 (diff) |
Add a new op split_v that can handle variable size splits.
Aside from being useful on its own, this op also makes the implementation
of the gradient of concat much more efficient. Previously a new slice op was
created in the graph for every input tensor to concat. This op moves that
logic inside of one op. The overhead could be quite significant in cases
with 100+ input Tensors to concat.
Change: 138822942
Diffstat (limited to 'tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc')
-rw-r--r-- | tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc | 55 |
1 file changed, 2 insertions, 53 deletions
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 9037a48580..019d6b6ab2 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/cuda_device_array_gpu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 
@@ -31,58 +32,6 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace {
 
-struct Cuda2DLaunchConfig {
-  dim3 virtual_thread_count;
-  dim3 thread_per_block;
-  dim3 block_count;
-};
-
-Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
-                                         const GPUDevice& d) {
-  Cuda2DLaunchConfig config;
-
-  config.virtual_thread_count = dim3(xdim, ydim, 1);
-
-  const int kThreadsPerBlock = 256;
-  int block_cols = std::min(xdim, kThreadsPerBlock);
-  // ok to round down here and just do more loops in the kernel
-  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
-
-  const int physical_thread_count =
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
-
-  const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
-
-  config.thread_per_block = dim3(block_cols, block_rows, 1);
-
-  int grid_x = std::min((xdim + block_cols - 1) / block_cols, max_blocks);
-
-  config.block_count = dim3(
-      grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
-
-  return config;
-}
-
-template <typename IntType>
-__device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
-  IntType* orig = first;
-  IntType* it = nullptr;
-  IntType step = 0;
-  while (count > 0) {
-    it = first;
-    step = count / 2;
-    it += step;
-    if (!(val < *it)) {
-      first = ++it;
-      count -= step + 1;
-    } else {
-      count = step;
-    }
-  }
-
-  return first - orig;
-}
-
 template <typename T, typename IntType>
 __global__ void concat_fixed_kernel(
     CudaDeviceArrayStruct<const T*> input_ptr_data, int split_size,
@@ -139,7 +88,7 @@ __global__ void concat_variable_kernel(
   // do an initial binary search and then scan linearly from there
   // works well when there are many small segments and when the
   // segments are much longer
-  IntType segment = upper_bound<IntType>(col_scan, num_inputs, gidx) - 1;
+  IntType segment = gpu::upper_bound<IntType>(col_scan, num_inputs, gidx) - 1;
   IntType curr_offset = col_scan[segment];
   IntType curr_segment = segment;