diff options
Diffstat (limited to 'tensorflow/core/kernels/slice_op_gpu.cu.cc')
-rw-r--r-- | tensorflow/core/kernels/slice_op_gpu.cu.cc | 56 |
1 files changed, 0 insertions, 56 deletions
diff --git a/tensorflow/core/kernels/slice_op_gpu.cu.cc b/tensorflow/core/kernels/slice_op_gpu.cu.cc index 3039b3d777..a301986f2f 100644 --- a/tensorflow/core/kernels/slice_op_gpu.cu.cc +++ b/tensorflow/core/kernels/slice_op_gpu.cu.cc @@ -21,65 +21,9 @@ limitations under the License. #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" namespace tensorflow { -namespace internal { - -template <typename T> -__global__ void SliceKernel(int nthreads, const T* src, const int32* buf, - const int32 ndims, T* dst) { - const int32* in_strides = buf; - const int32* out_strides = buf + ndims; - const int32* slice_indices = buf + ndims * 2; - CUDA_1D_KERNEL_LOOP(o_idx, nthreads) { - int32 i_idx = 0; - int32 t = o_idx; - for (int i = 0; i < ndims; ++i) { - i_idx += (t / out_strides[i] + slice_indices[i]) * in_strides[i]; - t %= out_strides[i]; - } - dst[o_idx] = ldg(src + i_idx); - } -} - -template <typename Device, typename T> -void SliceSimpleGpu(const Device& d, Tensor* out, const Tensor& in, - const gtl::ArraySlice<int64>& slice_indices) { - // Ensures we can use 32-bit index. - const int64 in_nelem = in.NumElements(); - CHECK_LT(in_nelem, kint32max) << "Tensor too large to transpose on GPU"; - const int64 out_nelem = out->NumElements(); - CHECK_LT(out_nelem, kint32max) << "Tensor too large to transpose on GPU"; - // Pack strides and slice indices sizes into one buffer. - const int32 ndims = in.dims(); - gtl::InlinedVector<int32, 24> host_buf(ndims * 3); - gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape()); - gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape()); - for (int i = 0; i < ndims; ++i) { - host_buf[i] = in_strides[i]; - host_buf[ndims + i] = out_strides[i]; - host_buf[ndims * 2 + i] = slice_indices[i]; - } - auto num_bytes = sizeof(int64) * host_buf.size(); - auto dev_buf = d.allocate(num_bytes); - // NOTE: host_buf is not allocated by CudaHostAllocator, and - // therefore we are doing a sync copy effectively. - d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes); - // Launch kernel to q[...] = p[...]. - const T* p = in.flat<T>().data(); - T* q = out->flat<T>().data(); - CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d); - SliceKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>( - cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf), - ndims, q); - // Safe to deallocate immediately after the kernel launch. - d.deallocate(dev_buf); -} - -} // namespace internal typedef Eigen::GpuDevice GPUDevice; |