From f41959ccb2d9d4c722fe8fc3351401d53bcf4900 Mon Sep 17 00:00:00 2001
From: Manjunath Kudlur
Date: Fri, 6 Nov 2015 16:27:58 -0800
Subject: TensorFlow: Initial commit of TensorFlow library.

TensorFlow is an open source software library for numerical computation
using data flow graphs.

Base CL: 107276108
---
 tensorflow/core/kernels/tile_ops.cc | 460 ++++++++++++++++++++++++++++++++++++
 1 file changed, 460 insertions(+)
 create mode 100644 tensorflow/core/kernels/tile_ops.cc

(limited to 'tensorflow/core/kernels/tile_ops.cc')

diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
new file mode 100644
index 0000000000..d5e0e89d60
--- /dev/null
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -0,0 +1,460 @@
+// See docs in ../ops/array_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#ifdef GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/tile_ops.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// --------------------------------------------------------------------------
+template <typename Device>
+class TileOp : public OpKernel {
+ public:
+  explicit TileOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    const Tensor& multiples = context->input(1);
+
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsLegacyVector(multiples.shape()),
+        errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
+                                multiples.shape().ShortDebugString()));
+    OP_REQUIRES(context, input.dims() == multiples.NumElements(),
+                errors::InvalidArgument(
+                    "Expected multiples argument to be a vector of length ",
+                    input.dims(), " but got length ", multiples.dim_size(0)));
+
+    const int input_dims = input.dims();
+    const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(),
+                                                 input_dims);
+
+    TensorShape output_shape;
+    for (int i = 0; i < input_dims; ++i) {
+      OP_REQUIRES(
+          context, multiples_array[i] > 0,
+          errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ",
+                                  multiples_array[i]));
+      output_shape.AddDim(input.dim_size(i) * multiples_array[i]);
+    }
+    Tensor* result = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+
+#define HANDLE_DIM(DT, NDIM)                                   \
+  if (context->input(0).dtype() == DT && input_dims == NDIM) { \
+    HandleCase<DT, NDIM>(context, multiples_array, result);    \
+    return;                                                    \
+  }
+
+#define HANDLE_TYPE(T) \
+  HANDLE_DIM(T, 0)     \
+  HANDLE_DIM(T, 1)     \
+  HANDLE_DIM(T, 2)     \
+  HANDLE_DIM(T, 3)     \
+  HANDLE_DIM(T, 4)     \
+  HANDLE_DIM(T, 5)
+
+    HANDLE_TYPE(DT_BOOL);
+    HANDLE_TYPE(DT_FLOAT);
+    HANDLE_TYPE(DT_DOUBLE);
+    HANDLE_TYPE(DT_UINT8);
+    HANDLE_TYPE(DT_INT32);
+    HANDLE_TYPE(DT_INT16);
+    HANDLE_TYPE(DT_INT64);
+    HANDLE_TYPE(DT_STRING);  // when DEVICE=CPUDevice.
+
+#undef HANDLE_TYPE
+#undef HANDLE_DIM
+
+    OP_REQUIRES(context, false,
+                errors::Unimplemented(
+                    "TileOp : Unhandled input dimensions, DT : ",
+                    context->input(0).dtype(), ", dims : ", input_dims));
+  }
+
+ private:
+  template <DataType DT, int NDIM>
+  void HandleCaseImpl(OpKernelContext* context,
+                      const gtl::ArraySlice<int32>& multiples_array,
+                      Tensor* result) {
+    typedef typename EnumToDataType<DT>::Type T;
+    Eigen::array<int32, NDIM> broadcast_array;
+    for (int i = 0; i < NDIM; ++i) {
+      broadcast_array[i] = multiples_array[i];
+    }
+    functor::Tile<Device, T, NDIM>()(
+        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+        context->input(0).tensor<T, NDIM>(), broadcast_array);
+  }
+
+  template <DataType DT, int NDIM>
+  void HandleCase(OpKernelContext* context,
+                  const gtl::ArraySlice<int32>& multiples_array,
+                  Tensor* result);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
+};
+
+template <typename Device>
+template <DataType DT, int NDIM>
+inline void TileOp<Device>::HandleCase(
+    OpKernelContext* context, const gtl::ArraySlice<int32>& multiples_array,
+    Tensor* result) {
+  LOG(FATAL) << "TileOp: Invalid combination of Device, DT and NDIM: "
+             << typeid(Device).name() << ", " << DataTypeString(DT) << ", "
+             << NDIM;
+}
+
+#define HANDLE_CASE(device, dtype, ndim)                               \
+  template <>                                                          \
+  template <>                                                          \
+  void TileOp<device>::HandleCase<dtype, ndim>(                        \
+      OpKernelContext * context,                                       \
+      const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { \
+    HandleCaseImpl<dtype, ndim>(context, multiples_array, result);     \
+  }
+
+#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \
+  HANDLE_CASE(device, dtype, 1);                \
+  HANDLE_CASE(device, dtype, 2);                \
+  HANDLE_CASE(device, dtype, 3);                \
+  HANDLE_CASE(device, dtype, 4);                \
+  HANDLE_CASE(device, dtype, 5);
+
+#define HANDLE_CASE_DIM(device, dtype) \
+  HANDLE_CASE(device, dtype, 0);       \
+  HANDLE_CASE_DIM_POSITIVE(device, dtype);
+
+HANDLE_CASE_DIM(CPUDevice, DT_BOOL);
+HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM(CPUDevice, DT_UINT8);
+HANDLE_CASE_DIM(CPUDevice, DT_INT32);
+HANDLE_CASE_DIM(CPUDevice, DT_INT16);
+HANDLE_CASE_DIM(CPUDevice, DT_INT64);
+HANDLE_CASE_DIM(CPUDevice, DT_STRING);
+
+#if GOOGLE_CUDA
+// Eigen on GPU does not handle 0-dimension data types yet.
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64);
+#endif  // GOOGLE_CUDA
+
+#undef HANDLE_CASE_DIM_POSITIVE
+#undef HANDLE_CASE_DIM
+#undef HANDLE_CASE
+
+// --------------------------------------------------------------------------
+template <typename Device>
+class TileGradientOp : public OpKernel {
+ public:
+  explicit TileGradientOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    const Tensor& multiples = context->input(1);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsLegacyVector(multiples.shape()),
+        errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
+                                multiples.shape().ShortDebugString()));
+    OP_REQUIRES(context, input.dims() == multiples.NumElements(),
+                errors::InvalidArgument(
+                    "Expected multiples argument to be a vector of length ",
+                    input.dims(), " but got length ", multiples.dim_size(0)));
+
+    const int input_dims = input.dims();
+    const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(),
+                                                 input_dims);
+
+    TensorShape output_shape;
+    std::vector<int32> input_dim_size_vec;
+    for (int i = 0; i < input_dims; ++i) {
+      OP_REQUIRES(
+          context, multiples_array[i] > 0,
+          errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ",
+                                  multiples_array[i]));
+      OP_REQUIRES(context, input.dim_size(i) % multiples_array[i] == 0,
+                  errors::InvalidArgument("Expected input_dim[", i,
+                                          "] to be divisible by multiples[", i,
+                                          "], but ", input.dim_size(i), " % ",
+                                          multiples_array[i], " != 0"));
+      output_shape.AddDim(input.dim_size(i) / multiples_array[i]);
+      input_dim_size_vec.push_back(input.dim_size(i));
+    }
+    Tensor* result = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
+
+#define HANDLE_DIM(DT, NDIM)                                           \
+  if (context->input(0).dtype() == DT && input_dims == NDIM) {         \
+    HandleCase<DT, NDIM>(context, input_dim_size_vec, multiples_array, \
+                         result);                                      \
+    return;                                                            \
+  }
+
+#define HANDLE_TYPE(T) \
+  HANDLE_DIM(T, 0)     \
+  HANDLE_DIM(T, 1)     \
+  HANDLE_DIM(T, 2)     \
+  HANDLE_DIM(T, 3)     \
+  HANDLE_DIM(T, 4)     \
+  HANDLE_DIM(T, 5)
+
+    HANDLE_TYPE(DT_FLOAT);
+    HANDLE_TYPE(DT_DOUBLE);
+    HANDLE_TYPE(DT_INT32);
+    HANDLE_TYPE(DT_INT16);
+    HANDLE_TYPE(DT_INT64);
+
+#undef HANDLE_TYPE
+#undef HANDLE_DIM
+
+    OP_REQUIRES(context, false,
+                errors::Unimplemented(
+                    "TileGradientOp : Unhandled input dimensions, DT : ",
+                    context->input(0).dtype(), ", dims : ", input_dims));
+  }
+
+ private:
+  template <DataType DT, int NDIM>
+  void HandleCase(OpKernelContext* context,
+                  const std::vector<int32>& input_dims,
+                  const gtl::ArraySlice<int32>& multiples_array,
+                  Tensor* result);
+
+  template <DataType DT, int NDIM>
+  void HandleCaseImpl(OpKernelContext* context,
+                      const std::vector<int32>& input_dims,
+                      const gtl::ArraySlice<int32>& multiples_array,
+                      Tensor* result) {
+    typedef typename EnumToDataType<DT>::Type T;
+
+    bool reduction_only = true;
+    std::vector<int32> reduction_dims;
+
+    for (int i = 0; i < NDIM; ++i) {
+      if (input_dims[i] > multiples_array[i] && multiples_array[i] > 1) {
+        reduction_only = false;
+        break;
+      } else {
+        if (multiples_array[i] == input_dims[i]) {
+          reduction_dims.push_back(i);
+        }
+      }
+    }
+
+    if (reduction_only) {
+#define HANDLE_DIM(D)                                            \
+  if (reduction_dims.size() == (D)) {                            \
+    HandleReduce<T, NDIM, (D)>(context, reduction_dims, result); \
+    return;                                                      \
+  }
+      // NOTE(keveman): Handling the most common case here.
+      // Adding more cases here would require more templating and code
+      // explosion. For instance, HANDLE_DIM(2) wouldn't make sense for NDIM=1.
+      HANDLE_DIM(NDIM > 0 ? 1 : 0);
+
+// Fall through to the unoptimized version.
+#undef HANDLE_DIM
+    }
+
+    Eigen::DSizes<ptrdiff_t, NDIM> indices;
+    Eigen::DSizes<ptrdiff_t, NDIM> sizes;
+
+    // Accumulate slices along the dimensions into the output. The number of
+    // slices along dimension 'i' is simply the multiple along dimension 'i'
+    // passed to the original Tile op.
+    for (int i = 0; i < NDIM; ++i) {
+      sizes[i] = input_dims[i] / multiples_array[i];
+      indices[i] = 0;
+    }
+
+    bool first = true;
+    while (true) {
+      functor::TileGrad<Device, T, NDIM>()(
+          context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+          context->input(0).tensor<T, NDIM>(), indices, sizes, first);
+      first = false;
+      // Increment the begin indices.
+      int i = 0;
+      while (i < NDIM && indices[i] / sizes[i] == multiples_array[i] - 1) {
+        indices[i] = 0;
+        ++i;
+      }
+      // We are finished if we have iterated to the maximum along all
+      // dimensions.
+      if (i == NDIM) {
+        break;
+      }
+      indices[i] += sizes[i];
+    }
+  }
+
+  template <typename T, int NDIM, int REDUCENDIM>
+  void HandleReduce(OpKernelContext* context,
+                    const std::vector<int32>& reduce_dim_in, Tensor* result) {
+    static_assert(NDIM >= REDUCENDIM, "Too many reduced dimensions");
+    Eigen::DSizes<ptrdiff_t, REDUCENDIM> reduce_dim;
+    Eigen::DSizes<ptrdiff_t, NDIM> reshape_dim;
+
+    for (int i = 0; i < REDUCENDIM; ++i) {
+      reduce_dim[i] = reduce_dim_in[i];
+    }
+
+    for (int i = 0; i < NDIM; ++i) {
+      reshape_dim[i] = result->dim_size(i);
+    }
+
+    functor::ReduceAndReshape<Device, T, NDIM, REDUCENDIM>()(
+        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+        context->input(0).tensor<T, NDIM>(), reduce_dim, reshape_dim);
+  }
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TileGradientOp);
+};
+
+template <typename Device>
+template <DataType DT, int NDIM>
+inline void TileGradientOp<Device>::HandleCase(
+    OpKernelContext* context, const std::vector<int32>& input_dims,
+    const gtl::ArraySlice<int32>& multiples_array, Tensor* result) {
+  LOG(FATAL) << "TileGradientOp: Invalid combination of Device, DT and NDIM: "
+             << typeid(Device).name() << ", " << DataTypeString(DT) << ", "
+             << NDIM;
+}
+
+#define HANDLE_CASE(device, dtype, ndim)                                       \
+  template <>                                                                  \
+  template <>                                                                  \
+  void TileGradientOp<device>::HandleCase<dtype, ndim>(                        \
+      OpKernelContext * context, const std::vector<int32>& input_dims,         \
+      const gtl::ArraySlice<int32>& multiples_array, Tensor* result) {         \
+    HandleCaseImpl<dtype, ndim>(context, input_dims, multiples_array, result); \
+  }
+
+#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \
+  HANDLE_CASE(device, dtype, 1);                \
+  HANDLE_CASE(device, dtype, 2);                \
+  HANDLE_CASE(device, dtype, 3);                \
+  HANDLE_CASE(device, dtype, 4);                \
+  HANDLE_CASE(device, dtype, 5);
+
+#define HANDLE_CASE_DIM(device, dtype) \
+  HANDLE_CASE(device, dtype, 0);       \
+  HANDLE_CASE_DIM_POSITIVE(device, dtype);
+
+HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM(CPUDevice, DT_INT16);
+HANDLE_CASE_DIM(CPUDevice, DT_INT32);
+HANDLE_CASE_DIM(CPUDevice, DT_INT64);
+
+#if GOOGLE_CUDA
+// Eigen on GPU does not handle 0-dimension data types yet.
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32);
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64);
+#endif  // GOOGLE_CUDA
+
+#undef HANDLE_CASE_DIM_POSITIVE
+#undef HANDLE_CASE_DIM
+#undef HANDLE_CASE
+
+REGISTER_KERNEL_BUILDER(Name("Tile").Device(DEVICE_CPU).HostMemory("multiples"),
+                        TileOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("multiples"),
+                        TileGradientOp<CPUDevice>);
+
+#if GOOGLE_CUDA
+#define DEFINE_GPU_TYPE(T) \
+  DEFINE_GPU_DIM(T, 1)     \
+  DEFINE_GPU_DIM(T, 2)     \
+  DEFINE_GPU_DIM(T, 3)     \
+  DEFINE_GPU_DIM(T, 4)     \
+  DEFINE_GPU_DIM(T, 5)
+
+#define DEFINE_GPU_DIM(T, NDIM)                                       \
+  template <>                                                         \
+  void Tile<GPUDevice, T, NDIM>::operator()(                          \
+      const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out,       \
+      typename TTypes<T, NDIM>::ConstTensor in,                       \
+      const Eigen::array<int32, NDIM>& broadcast_array) const;        \
+  extern template struct Tile<GPUDevice, T, NDIM>;                    \
+  template <>                                                         \
+  void TileGrad<GPUDevice, T, NDIM>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out,       \
+      typename TTypes<T, NDIM>::ConstTensor in,                       \
+      const Eigen::DSizes<ptrdiff_t, NDIM>& indices,                  \
+      const Eigen::DSizes<ptrdiff_t, NDIM>& sizes, bool first) const; \
+  extern template struct TileGrad<GPUDevice, T, NDIM>;                \
+  template <>                                                         \
+  void ReduceAndReshape<GPUDevice, T, NDIM, 1>::operator()(           \
+      const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out,       \
+      typename TTypes<T, NDIM>::ConstTensor in,                       \
+      const Eigen::DSizes<ptrdiff_t, 1>& reduce_dim,                  \
+      const Eigen::DSizes<ptrdiff_t, NDIM>& reshape_dim) const;       \
+  extern template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>;
+
+namespace functor {
+DEFINE_GPU_TYPE(float);
+DEFINE_GPU_TYPE(double);
+DEFINE_GPU_TYPE(int64);
+DEFINE_GPU_TYPE(int32);
+DEFINE_GPU_TYPE(int16);
+}  // end namespace functor
+
+#undef DEFINE_GPU_DIM
+#undef DEFINE_GPU_TYPE
+
+REGISTER_KERNEL_BUILDER(Name("Tile")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<float>("T")
+                            .HostMemory("multiples"),
+                        TileOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("Tile")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("multiples"),
+                        TileOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("Tile")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .HostMemory("multiples"),
+                        TileOp<GPUDevice>);
+
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<float>("T")
+                            .HostMemory("multiples"),
+                        TileGradientOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<double>("T")
+                            .HostMemory("multiples"),
+                        TileGradientOp<GPUDevice>);
+REGISTER_KERNEL_BUILDER(Name("TileGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .HostMemory("multiples"),
+                        TileGradientOp<GPUDevice>);
+#endif  // GOOGLE_CUDA
+}  // namespace tensorflow
--
cgit v1.2.3
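
The two kernels in this diff only validate shapes and dispatch: the actual data movement is delegated to functor::Tile, functor::TileGrad, and functor::ReduceAndReshape, which are declared in tile_ops.h and are not part of this diff. For orientation, here is a minimal standalone sketch of the forward operation expressed directly with Eigen's Tensor broadcast, which is presumably what the Tile functor reduces to per device; the shapes and values below are invented for illustration and nothing in the sketch comes from the commit itself.

// Standalone illustration (not from the commit): tiling a 2x3 tensor with
// multiples {2, 1} via Eigen's Tensor broadcast, yielding a 4x3 result.
#include <unsupported/Eigen/CXX11/Tensor>

#include <iostream>

int main() {
  Eigen::Tensor<float, 2> in(2, 3);
  in.setValues({{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}});

  // Repeat count per dimension, playing the role of the op's "multiples"
  // input: two copies along dim 0, one copy along dim 1.
  Eigen::array<Eigen::Index, 2> multiples = {2, 1};
  Eigen::Tensor<float, 2> out = in.broadcast(multiples);

  std::cout << out << std::endl;  // prints the 4x3 tiled tensor
  return 0;
}

Compiling the sketch needs only the Eigen headers on the include path (for example g++ -std=c++11 -I <path to eigen> tile_sketch.cc; the file name is arbitrary).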
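
The gradient kernel goes the other way: every copy produced by Tile receives a slice of the incoming gradient, and TileGradientOp sums those slices back into an input-shaped output (the slice-accumulation loop in HandleCaseImpl, with HandleReduce as a fast path when whole dimensions can be reduced). Below is a sketch of that accumulation for the same 2x1 tiling, again using only Eigen and invented values rather than the TensorFlow types above.

// Standalone illustration (not from the commit): summing the per-copy slices
// of an incoming 4x3 gradient to get the 2x3 gradient w.r.t. the tiled input.
#include <unsupported/Eigen/CXX11/Tensor>

#include <iostream>

int main() {
  // Incoming gradient for a Tile with multiples {2, 1}.
  Eigen::Tensor<float, 2> grad_out(4, 3);
  grad_out.setValues({{1.f, 1.f, 1.f}, {1.f, 1.f, 1.f},
                      {2.f, 2.f, 2.f}, {2.f, 2.f, 2.f}});

  Eigen::array<Eigen::Index, 2> sizes = {2, 3};  // shape of the original input
  Eigen::Tensor<float, 2> grad_in(2, 3);
  grad_in.setZero();

  // One slice per tile copy along dim 0; add each into the result, mirroring
  // the loop over begin indices in TileGradientOp::HandleCaseImpl.
  for (Eigen::Index start = 0; start < 4; start += 2) {
    Eigen::array<Eigen::Index, 2> offsets = {start, 0};
    grad_in += grad_out.slice(offsets, sizes);
  }

  std::cout << grad_in << std::endl;  // every entry is 1 + 2 = 3
  return 0;
}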