diff options
Diffstat (limited to 'tensorflow/core/kernels/bucketize_op_gpu.cu.cc')
-rw-r--r-- | tensorflow/core/kernels/bucketize_op_gpu.cu.cc | 101 |
1 files changed, 101 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc new file mode 100644 index 0000000000..aafbbe41b4 --- /dev/null +++ b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc @@ -0,0 +1,101 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/bucketize_op.h" +#include "tensorflow/core/kernels/cuda_device_array.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template <typename T> +__global__ void BucketizeCustomKernel( + const int32 size_in, const T* in, const int32 size_boundaries, + CudaDeviceArrayStruct<float> boundaries_array, int32* out) { + const float* boundaries = GetCudaDeviceArrayOnDevice(&boundaries_array); + CUDA_1D_KERNEL_LOOP(i, size_in) { + T value = in[i]; + int32 bucket = 0; + int32 count = size_boundaries; + while (count > 0) { + int32 l = bucket; + int32 step = count / 2; + l += step; + if (!(value < static_cast<T>(boundaries[l]))) { + bucket = ++l; + count -= step + 1; + } else { + count = step; + } + } + out[i] = bucket; + } +} + +namespace functor { + +template <typename T> +struct BucketizeFunctor<GPUDevice, T> { + // PRECONDITION: boundaries_vector must be sorted. + static Status Compute(OpKernelContext* context, + const typename TTypes<T, 1>::ConstTensor& input, + const std::vector<float>& boundaries_vector, + typename TTypes<int32, 1>::Tensor& output) { + const GPUDevice& d = context->eigen_device<GPUDevice>(); + + CudaDeviceArrayOnHost<float> boundaries_array(context, + boundaries_vector.size()); + TF_RETURN_IF_ERROR(boundaries_array.Init()); + for (int i = 0; i < boundaries_vector.size(); ++i) { + boundaries_array.Set(i, boundaries_vector[i]); + } + TF_RETURN_IF_ERROR(boundaries_array.Finalize()); + + CudaLaunchConfig config = GetCudaLaunchConfig(input.size(), d); + BucketizeCustomKernel< + T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>( + input.size(), input.data(), boundaries_vector.size(), + boundaries_array.data(), output.data()); + + return Status::OK(); + } +}; +} // namespace functor + +#define REGISTER_GPU_SPEC(type) \ + template struct functor::BucketizeFunctor<GPUDevice, type>; + +REGISTER_GPU_SPEC(int32); +REGISTER_GPU_SPEC(int64); +REGISTER_GPU_SPEC(float); +REGISTER_GPU_SPEC(double); +#undef REGISTER_GPU_SPEC + +} // namespace tensorflow + +#endif // GOOGLE_CUDA |