Diffstat (limited to 'tensorflow/core/kernels')
323 files changed, 33366 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc new file mode 100644 index 0000000000..7cc0534354 --- /dev/null +++ b/tensorflow/core/kernels/adjust_contrast_op.cc @@ -0,0 +1,121 @@ +// See docs in ../ops/image_ops.cc +#define EIGEN_USE_THREADS + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/adjust_contrast_op.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class AdjustContrastOp : public OpKernel { + public: + explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) { + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& factor = context->input(1); + const Tensor& min_value = context->input(2); + const Tensor& max_value = context->input(3); + OP_REQUIRES(context, input.dims() >= 3, + errors::InvalidArgument("input must be at least 3-D, got shape", + input.shape().ShortDebugString())); + const int64 height = input.dim_size(input.dims() - 3); + const int64 width = input.dim_size(input.dims() - 2); + const int64 channels = input.dim_size(input.dims() - 1); + + OP_REQUIRES(context, TensorShapeUtils::IsScalar(factor.shape()), + errors::InvalidArgument("contrast_factor must be scalar: ", + factor.shape().ShortDebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(min_value.shape()), + errors::InvalidArgument("min_value must be scalar: ", + min_value.shape().ShortDebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(max_value.shape()), + errors::InvalidArgument("max_value must be scalar: ", + max_value.shape().ShortDebugString())); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + + Tensor mean_values; + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<float>::value, + TensorShape(input.shape()), + &mean_values)); + + if (input.NumElements() > 0) { + const int64 batch = input.NumElements() / (height * width * channels); + const int64 shape[4] = {batch, height, width, channels}; + functor::AdjustContrast<Device, T>()( + context->eigen_device<Device>(), input.shaped<T, 4>(shape), + factor.scalar<float>(), min_value.scalar<float>(), + max_value.scalar<float>(), mean_values.shaped<float, 4>(shape), + output->shaped<float, 4>(shape)); + } + } +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("AdjustContrast").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ + AdjustContrastOp<CPUDevice, T>); + +REGISTER_KERNEL(uint8); +REGISTER_KERNEL(int8); +REGISTER_KERNEL(int16); +REGISTER_KERNEL(int32); +REGISTER_KERNEL(float); +REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA +// Forward declarations of the function specializations for GPU (to prevent +// building the GPU versions here, they will be built compiling _gpu.cu.cc). 
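A note on the Compute body above: AdjustContrastOp accepts any input of rank 3 or higher and folds every leading dimension into a single batch dimension before handing a 4-D view to the functor. A small worked example of that folding, with a made-up shape:

    // A float input of shape {10, 5, 32, 32, 3} has NumElements() = 10*5*32*32*3,
    // so batch = NumElements() / (height * width * channels) = 50, and the
    // functor operates on the {50, 32, 32, 3} view produced by
    // input.shaped<T, 4>(shape).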
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void AdjustContrast<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ + typename TTypes<float>::ConstScalar contrast_factor, \ + typename TTypes<float>::ConstScalar min_value, \ + typename TTypes<float>::ConstScalar max_value, \ + typename TTypes<float, 4>::Tensor mean_values, \ + typename TTypes<float, 4>::Tensor output); \ + extern template struct AdjustContrast<GPUDevice, T>; + +DECLARE_GPU_SPEC(uint8); +DECLARE_GPU_SPEC(int8); +DECLARE_GPU_SPEC(int16); +DECLARE_GPU_SPEC(int32); +DECLARE_GPU_SPEC(float); +DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("AdjustContrast").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ + AdjustContrastOp<GPUDevice, T>); +REGISTER_GPU_KERNEL(uint8); +REGISTER_GPU_KERNEL(int8); +REGISTER_GPU_KERNEL(int16); +REGISTER_GPU_KERNEL(int32); +REGISTER_GPU_KERNEL(float); +REGISTER_GPU_KERNEL(double); +#undef REGISTER_GPU_KERNEL + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/adjust_contrast_op.h b/tensorflow/core/kernels/adjust_contrast_op.h new file mode 100644 index 0000000000..2182b33c03 --- /dev/null +++ b/tensorflow/core/kernels/adjust_contrast_op.h @@ -0,0 +1,64 @@ +#ifndef TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_ +#define TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_ +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by AdjustContrastOp to do the computations. +template <typename Device, typename T> +struct AdjustContrast { + void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input, + typename TTypes<float>::ConstScalar contrast_factor, + typename TTypes<float>::ConstScalar min_value, + typename TTypes<float>::ConstScalar max_value, + typename TTypes<float, 4>::Tensor mean_values, + typename TTypes<float, 4>::Tensor output) { + const int batch = input.dimension(0); + const int height = input.dimension(1); + const int width = input.dimension(2); + const int channels = input.dimension(3); + + Eigen::array<int, 4> scalar_broadcast{{batch, height, width, channels}}; +#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::array<int, 2> reduction_axis{{1, 2}}; + Eigen::array<int, 4> scalar{{1, 1, 1, 1}}; + Eigen::array<int, 4> broadcast_dims{{1, height, width, 1}}; + Eigen::Tensor<int, 4>::Dimensions reshape_dims{{batch, 1, 1, channels}}; +#else + Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > + reduction_axis; + Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<1>, + Eigen::type2index<1>, Eigen::type2index<1> > scalar; + Eigen::IndexList<Eigen::type2index<1>, int, int, Eigen::type2index<1> > + broadcast_dims; + broadcast_dims.set(1, height); + broadcast_dims.set(2, width); + Eigen::IndexList<int, Eigen::type2index<1>, Eigen::type2index<1>, int> + reshape_dims; + reshape_dims.set(0, batch); + reshape_dims.set(3, channels); +#endif + mean_values.device(d) = input.template cast<float>() + .mean(reduction_axis) + .eval() + .reshape(reshape_dims) + .broadcast(broadcast_dims); + + auto contrast_factor_tensor = + contrast_factor.reshape(scalar).broadcast(scalar_broadcast); + auto adjusted = + (input.template cast<float>() - mean_values) * contrast_factor_tensor + + mean_values; + auto min_bcast = 
min_value.reshape(scalar).broadcast(scalar_broadcast); + auto max_bcast = max_value.reshape(scalar).broadcast(scalar_broadcast); + // TODO(wicke): This is rather slow and should be re-written as pure cuda. + output.device(d) = adjusted.cwiseMin(max_bcast).cwiseMax(min_bcast); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_ADJUST_CONTRAST_OP_H_ diff --git a/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc new file mode 100644 index 0000000000..75b177cf4d --- /dev/null +++ b/tensorflow/core/kernels/adjust_contrast_op_benchmark_test.cc @@ -0,0 +1,43 @@ +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +static Graph* BM_AdjustContrast(int batches, int width, int height) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor in(DT_UINT8, TensorShape({batches, width, height, 3})); + in.flat<uint8>().setRandom(); + Tensor factor(DT_FLOAT, TensorShape({})); + factor.flat<float>().setConstant(1.2); + Tensor min_value(DT_FLOAT, TensorShape({})); + min_value.flat<float>().setConstant(7.); + Tensor max_value(DT_FLOAT, TensorShape({})); + max_value.flat<float>().setConstant(250.); + + Node* ret; + NodeBuilder(g->NewName("n"), "AdjustContrast") + .Input(test::graph::Constant(g, in)) + .Input(test::graph::Constant(g, factor)) + .Input(test::graph::Constant(g, min_value)) + .Input(test::graph::Constant(g, max_value)) + .Finalize(g, &ret); + return g; +} + +#define BM_AdjustContrastDev(DEVICE, B, W, H) \ + static void BM_AdjustContrast_##DEVICE##_##B##_##W##_##H(int iters) { \ + testing::ItemsProcessed(iters* B* W* H * 3); \ + test::Benchmark(#DEVICE, BM_AdjustContrast(B, W, H)).Run(iters); \ + } \ + BENCHMARK(BM_AdjustContrast_##DEVICE##_##B##_##W##_##H); + +// Benchmark results as of cl/106323955 +// BM_AdjustContrast_cpu_1_299_299 3416770 22008951 100 11.6M items/s + +// BM_AdjustContrast_gpu_32_299_299 37117844 45512374 100 179.8M items/s +BM_AdjustContrastDev(cpu, 1, 299, 299) BM_AdjustContrastDev(gpu, 32, 299, 299) + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc new file mode 100644 index 0000000000..7a9b0726fd --- /dev/null +++ b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc @@ -0,0 +1,22 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/adjust_contrast_op.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; +template struct functor::AdjustContrast<GPUDevice, uint8>; +template struct functor::AdjustContrast<GPUDevice, int8>; +template struct functor::AdjustContrast<GPUDevice, int16>; +template struct functor::AdjustContrast<GPUDevice, int32>; +template struct functor::AdjustContrast<GPUDevice, int64>; +template struct functor::AdjustContrast<GPUDevice, float>; +template struct functor::AdjustContrast<GPUDevice, double>; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc new file mode 100644 index 0000000000..67891e4fa1 --- /dev/null +++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc @@ -0,0 +1,88 @@ +#include 
"tensorflow/core/framework/allocator.h" +#include <gtest/gtest.h> +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +class AdjustContrastOpTest : public OpsTestBase { + protected: + void MakeOp() { RequireDefaultOps(); } +}; + +TEST_F(AdjustContrastOpTest, Simple_1113) { + RequireDefaultOps(); + EXPECT_OK(NodeDefBuilder("adjust_constrast_op", "AdjustContrast") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Attr("T", DT_FLOAT) + .Finalize(node_def())); + EXPECT_OK(InitOp()); + AddInputFromArray<float>(TensorShape({1, 1, 1, 3}), {-1, 2, 3}); + AddInputFromArray<float>(TensorShape({}), {1.0}); + AddInputFromArray<float>(TensorShape({}), {0.0}); + AddInputFromArray<float>(TensorShape({}), {2.0}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 3})); + test::FillValues<float>(&expected, {0, 2, 2}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(AdjustContrastOpTest, Simple_1223) { + RequireDefaultOps(); + EXPECT_OK(NodeDefBuilder("adjust_constrast_op", "AdjustContrast") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Attr("T", DT_FLOAT) + .Finalize(node_def())); + EXPECT_OK(InitOp()); + AddInputFromArray<float>(TensorShape({1, 2, 2, 3}), + {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12}); + AddInputFromArray<float>(TensorShape({}), {0.2}); + AddInputFromArray<float>(TensorShape({}), {0.0}); + AddInputFromArray<float>(TensorShape({}), {10.0}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 2, 3})); + test::FillValues<float>( + &expected, {2.2, 6.2, 10, 2.4, 6.4, 10, 2.6, 6.6, 10, 2.8, 6.8, 10}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(AdjustContrastOpTest, Big_99x99x3) { + EXPECT_OK(NodeDefBuilder("adjust_constrast_op", "AdjustContrast") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Attr("T", DT_FLOAT) + .Finalize(node_def())); + EXPECT_OK(InitOp()); + + std::vector<float> values; + for (int i = 0; i < 99 * 99 * 3; ++i) { + values.push_back(i % 255); + } + + AddInputFromArray<float>(TensorShape({1, 99, 99, 3}), values); + AddInputFromArray<float>(TensorShape({}), {0.2}); + AddInputFromArray<float>(TensorShape({}), {0}); + AddInputFromArray<float>(TensorShape({}), {255}); + ASSERT_OK(RunOpKernel()); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc new file mode 100644 index 0000000000..426e868735 --- /dev/null +++ b/tensorflow/core/kernels/aggregate_ops.cc @@ -0,0 +1,238 @@ +// See docs in ../ops/math_ops.cc. 
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/aggregate_ops.h" + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/register_types.h" + +#include "tensorflow/core/platform/logging.h" +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class AddNOp : public OpKernel { + public: + explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + if (!ctx->ValidateInputsAreSameShape(this)) return; + + const Tensor& input0 = ctx->input(0); + Tensor* output = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output)); + auto To = output->flat<T>(); + + const int num = ctx->num_inputs(); + if (num == 1) { + *output = input0; + return; + } + +#define I(IDX) ctx->input(IDX).flat<T>() + +#if defined(PLATFORM_POSIX_ANDROID) || defined(PLATFORM_GOOGLE_ANDROID) + // On Android, we only support additions of two arguments, so we + // can reduce the number of template instantiations. + OP_REQUIRES(ctx, num == 2, + errors::InvalidArgument("Only additions of two arguments " + "supported. Num inputs: ", + num)); + functor::Add2Functor<Device, T> functor2; + functor2(ctx->template eigen_device<Device>(), To, I(0), I(1)); +#else + static const int kWidth = 8; + int r = num % kWidth; + + switch (r) { + case 2: { + functor::Add2Functor<Device, T> functor2; + functor2(ctx->template eigen_device<Device>(), To, I(0), I(1)); + break; + } + case 3: { + functor::Add3Functor<Device, T> functor3; + functor3(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2)); + break; + } + case 4: { + functor::Add4Functor<Device, T> functor4; + functor4(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), + I(3)); + break; + } + case 5: { + functor::Add5Functor<Device, T> functor5; + functor5(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), + I(3), I(4)); + break; + } + case 6: { + functor::Add6Functor<Device, T> functor6; + functor6(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), + I(3), I(4), I(5)); + break; + } + case 7: { + functor::Add7Functor<Device, T> functor7; + functor7(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), + I(3), I(4), I(5), I(6)); + break; + } + case 0: { + functor::Add8Functor<Device, T> functor8; + functor8(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), + I(3), I(4), I(5), I(6), I(7)); + r = 8; + break; + } + case 1: { + functor::Add9Functor<Device, T> functor9; + functor9(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), + I(3), I(4), I(5), I(6), I(7), I(8)); + r = 9; + break; + } + } + + for (; r < num; r += kWidth) { + functor::Add8pFunctor<Device, T> functor8p; + functor8p(ctx->template eigen_device<Device>(), To, I(r), I(r + 1), + I(r + 2), I(r + 3), I(r + 4), I(r + 5), I(r + 6), I(r + 7)); + } +#endif // defined(PLATFORM_POSIX_ANDROID) || defined(PLATFORM_GOOGLE_ANDROID) + +#undef I + } +}; + +// Partial specializations for a CPUDevice, that uses the Eigen implementation +// from AddNEigenImpl. 
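The kWidth = 8 dispatch in AddNOp::Compute above is easier to follow with concrete counts: the switch consumes the first num % 8 inputs (remainders 0 and 1 are promoted to Add8/Add9 so the tail always arrives in full groups of eight), and every subsequent Add8pFunctor pass accumulates eight more inputs onto the partial sum already in the output. A minimal sketch of that scheduling, with a hypothetical helper name:

    // How many inputs the initial AddK call consumes for a given num (num >= 2).
    int AddNHeadSize(int num) {
      const int r = num % 8;
      if (r == 0) return 8;
      if (r == 1) return 9;
      return r;  // 2 through 7
    }
    // num = 11: Add3Functor on inputs 0..2, then one Add8pFunctor pass over 3..10.
    // num = 17: Add9Functor on inputs 0..8, then one Add8pFunctor pass over 9..16.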
+namespace functor { +template <typename T> +struct Add2Functor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2) { + Add2EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2); + } +}; +template <typename T> +struct Add3Functor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3) { + Add3EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3); + } +}; +template <typename T> +struct Add4Functor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4) { + Add4EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4); + } +}; +template <typename T> +struct Add5Functor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5) { + Add5EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5); + } +}; +template <typename T> +struct Add6Functor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, + typename TTypes<T>::ConstFlat in6) { + Add6EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6); + } +}; +template <typename T> +struct Add7Functor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, + typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7) { + Add7EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7); + } +}; + +template <typename T> +struct Add8Functor<CPUDevice, T> { + void operator()( + const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) { + Add8EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7, in8); + } +}; + +template <typename T> +struct Add8pFunctor<CPUDevice, T> { + void operator()( + const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) { + Add8pEigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7, in8); + } +}; + +template <typename T> +struct Add9Functor<CPUDevice, T> { + void operator()( + const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, 
typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8, + typename TTypes<T>::ConstFlat in9) { + Add9EigenImpl<CPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7, in8, in9); + } +}; + +} // namespace functor + +#define REGISTER_ADDN(type, dev) \ + REGISTER_KERNEL_BUILDER( \ + Name("AddN").Device(DEVICE_##dev).TypeConstraint<type>("T"), \ + AddNOp<dev##Device, type>) + +#define REGISTER_ADDN_CPU(type) REGISTER_ADDN(type, CPU) + +TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU); +#undef REGISTER_ADDN_CPU + +#if GOOGLE_CUDA +REGISTER_ADDN(float, GPU); +#endif // GOOGLE_CUDA + +#undef REGISTER_ADDN + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/aggregate_ops.h b/tensorflow/core/kernels/aggregate_ops.h new file mode 100644 index 0000000000..2214901970 --- /dev/null +++ b/tensorflow/core/kernels/aggregate_ops.h @@ -0,0 +1,211 @@ +#ifndef TENSORFLOW_KERNELS_AGGREGATE_OPS_H_ +#define TENSORFLOW_KERNELS_AGGREGATE_OPS_H_ + +// Functor definitions for Aggregate ops, must be compilable by nvcc. + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T> +struct Add2Functor { + void operator()(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2); +}; + +template <typename Device, typename T> +struct Add2EigenImpl { + static void Compute(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2) { + out.device(d) = in1 + in2; + } +}; + +template <typename Device, typename T> +struct Add3Functor { + void operator()(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3); +}; + +template <typename Device, typename T> +struct Add3EigenImpl { + static void Compute(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3) { + out.device(d) = in1 + in2 + in3; + } +}; + +template <typename Device, typename T> +struct Add4Functor { + void operator()(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4); +}; + +template <typename Device, typename T> +struct Add4EigenImpl { + static void Compute(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4) { + out.device(d) = in1 + in2 + in3 + in4; + } +}; + +template <typename Device, typename T> +struct Add5Functor { + void operator()(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5); +}; + +template <typename Device, typename T> +struct Add5EigenImpl { + static void Compute(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat 
in5) { + out.device(d) = in1 + in2 + in3 + in4 + in5; + } +}; + +template <typename Device, typename T> +struct Add6Functor { + void operator()(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, + typename TTypes<T>::ConstFlat in6); +}; + +template <typename Device, typename T> +struct Add6EigenImpl { + static void Compute(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, + typename TTypes<T>::ConstFlat in6) { + out.device(d) = in1 + in2 + in3 + in4 + in5 + in6; + } +}; + +template <typename Device, typename T> +struct Add7Functor { + void operator()(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, + typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7); +}; + +template <typename Device, typename T> +struct Add7EigenImpl { + static void Compute(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, + typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7) { + out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7; + } +}; + +template <typename Device, typename T> +struct Add8Functor { + void operator()( + const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8); +}; + +template <typename Device, typename T> +struct Add8EigenImpl { + static void Compute( + const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) { + out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8; + } +}; + +// Add8p is like Add8 except the underlying implementation should += +// rather than assign to the output. 
+template <typename Device, typename T> +struct Add8pFunctor { + void operator()( + const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8); +}; + +template <typename Device, typename T> +struct Add8pEigenImpl { + static void Compute( + const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) { + out.device(d) += in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8; + } +}; + +template <typename Device, typename T> +struct Add9Functor { + void operator()( + const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8, + typename TTypes<T>::ConstFlat in9); +}; + +template <typename Device, typename T> +struct Add9EigenImpl { + static void Compute( + const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8, + typename TTypes<T>::ConstFlat in9) { + out.device(d) = in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8 + in9; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_AGGREGATE_OPS_H_ diff --git a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc new file mode 100644 index 0000000000..5cf2934ac1 --- /dev/null +++ b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc @@ -0,0 +1,141 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/aggregate_ops.h" + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// Partial specialization for a GPUDevice, that uses the Eigen implementation. 
+namespace functor { +template <typename T> +struct Add2Functor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2) { + Add2EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2); + } +}; + +template <typename T> +struct Add3Functor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3) { + Add3EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3); + } +}; + +template <typename T> +struct Add4Functor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4) { + Add4EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4); + } +}; + +template <typename T> +struct Add5Functor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5) { + Add5EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5); + } +}; + +template <typename T> +struct Add6Functor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, + typename TTypes<T>::ConstFlat in6) { + Add6EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6); + } +}; + +template <typename T> +struct Add7Functor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, + typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, + typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, + typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7) { + Add7EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7); + } +}; + +template <typename T> +struct Add8Functor<GPUDevice, T> { + void operator()( + const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) { + Add8EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7, in8); + } +}; + +template <typename T> +struct Add8pFunctor<GPUDevice, T> { + void operator()( + const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) { + Add8pEigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7, in8); + } +}; + +template <typename T> +struct Add9Functor<GPUDevice, T> { + void operator()( + const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstFlat in1, typename TTypes<T>::ConstFlat in2, + typename TTypes<T>::ConstFlat 
in3, typename TTypes<T>::ConstFlat in4, + typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6, + typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8, + typename TTypes<T>::ConstFlat in9) { + Add9EigenImpl<GPUDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6, + in7, in8, in9); + } +}; + +} // end namespace functor + +// Instantiate the GPU implementation for float. +template struct functor::Add2Functor<GPUDevice, float>; +template struct functor::Add3Functor<GPUDevice, float>; +template struct functor::Add4Functor<GPUDevice, float>; +template struct functor::Add5Functor<GPUDevice, float>; +template struct functor::Add6Functor<GPUDevice, float>; +template struct functor::Add7Functor<GPUDevice, float>; +template struct functor::Add8Functor<GPUDevice, float>; +template struct functor::Add8pFunctor<GPUDevice, float>; +template struct functor::Add9Functor<GPUDevice, float>; + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc new file mode 100644 index 0000000000..0845eebf09 --- /dev/null +++ b/tensorflow/core/kernels/argmax_op.cc @@ -0,0 +1,163 @@ +// See docs in ../ops/math_ops.cc. + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#include "tensorflow/core/kernels/argmax_op.h" + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T, typename ArgFunctor> +class ArgOp : public OpKernel { + public: + explicit ArgOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& dimension = context->input(1); + + OP_REQUIRES(context, TensorShapeUtils::IsScalar(dimension.shape()), + errors::InvalidArgument( + "dim must be a scalar, but received tensor of shape: ", + dimension.shape().DebugString())); + + const int32 dim = dimension.scalar<int32>()(); + const int input_dims = input.dims(); + + OP_REQUIRES(context, dim >= 0, errors::InvalidArgument("dim must be >= 0")); + OP_REQUIRES(context, dim < input_dims, + errors::InvalidArgument("Minimum tensor rank: ", dim, + " but got: ", input_dims)); + + TensorShape output_shape; + TensorShape input_shape = input.shape(); + for (int d = 0; d < input_dims - 1; ++d) { + output_shape.AddDim(input_shape.dim_size((d < dim) ? 
d : d + 1)); + } + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + +#define HANDLE_DIM(NDIM) \ + case NDIM: \ + ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(), \ + input.tensor<T, NDIM>(), dim, \ + output->tensor<int64, NDIM - 1>()); \ + break; + + switch (input_dims) { + HANDLE_DIM(1); + HANDLE_DIM(2); + HANDLE_DIM(3); + HANDLE_DIM(4); + HANDLE_DIM(5); + + default: + OP_REQUIRES(context, false, + errors::InvalidArgument( + "ArgOp : Unhandled input dimensions: ", input_dims)); + } + } +#undef HANDLE_DIM + + private: + TF_DISALLOW_COPY_AND_ASSIGN(ArgOp); +}; + +template <typename Device, typename T> +class ArgMaxOp : public ArgOp<Device, T, functor::ArgMax<Device, T> > { + public: + explicit ArgMaxOp(OpKernelConstruction* context) + : ArgOp<Device, T, functor::ArgMax<Device, T> >(context) {} +}; + +template <typename Device, typename T> +class ArgMinOp : public ArgOp<Device, T, functor::ArgMin<Device, T> > { + public: + explicit ArgMinOp(OpKernelConstruction* context) + : ArgOp<Device, T, functor::ArgMin<Device, T> >(context) {} +}; + +#define REGISTER_ARGMAX(type) \ + REGISTER_KERNEL_BUILDER(Name("ArgMax") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("dimension"), \ + ArgMaxOp<CPUDevice, type>); \ + REGISTER_KERNEL_BUILDER(Name("ArgMin") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("dimension"), \ + ArgMinOp<CPUDevice, type>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_ARGMAX); + +#if GOOGLE_CUDA + +// Forward declarations of the functor specializations for GPU. +namespace functor { + +#define DECLARE_GPU_SPEC(T, Dims) \ + template <> \ + void ArgMax<GPUDevice, T>::Reduce##Dims( \ + const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \ + const int32 dimension, typename TTypes<int64, Dims - 1>::Tensor output); \ + template <> \ + void ArgMin<GPUDevice, T>::Reduce##Dims( \ + const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \ + const int32 dimension, typename TTypes<int64, Dims - 1>::Tensor output); + +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC(T, 1); \ + DECLARE_GPU_SPEC(T, 2); \ + DECLARE_GPU_SPEC(T, 3); \ + DECLARE_GPU_SPEC(T, 4); \ + DECLARE_GPU_SPEC(T, 5); + +#define DECLARE_GPU_CLASS(T) \ + extern template struct ArgMax<GPUDevice, T>; \ + extern template struct ArgMin<GPUDevice, T>; + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_CLASS); + +#undef DECLARE_GPU_SPECS +#undef DECLARE_GPU_CLASS + +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_ARGMAX_GPU(type) \ + REGISTER_KERNEL_BUILDER(Name("ArgMax") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("dimension"), \ + ArgMaxOp<GPUDevice, type>); \ + REGISTER_KERNEL_BUILDER(Name("ArgMin") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("dimension"), \ + ArgMinOp<GPUDevice, type>); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_ARGMAX_GPU); + +#undef REGISTER_ARGMAX_GPU + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/argmax_op.h b/tensorflow/core/kernels/argmax_op.h new file mode 100644 index 0000000000..41734f3254 --- /dev/null +++ b/tensorflow/core/kernels/argmax_op.h @@ -0,0 +1,55 @@ +#ifndef TENSORFLOW_KERNELS_ARGMAX_OP_H_ +#define TENSORFLOW_KERNELS_ARGMAX_OP_H_ +// Generator definition for ArgMaxOp, must be compilable by nvcc. 
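To make the shape handling in ArgOp::Compute concrete (an illustrative sketch only, using Eigen directly rather than the kernel): the output drops the reduced dimension, so a {2, 3} float input with dimension = 1 yields a {2} int64 result, and Reduce2 amounts to Eigen's argmax along that axis.

    #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
    // Hypothetical host-side equivalent of ArgMax<CPUDevice, float>::Reduce2.
    void ArgMaxExample() {
      Eigen::Tensor<float, 2> input(2, 3);
      input.setValues({{1.f, 5.f, 3.f}, {9.f, 2.f, 4.f}});
      Eigen::Tensor<Eigen::DenseIndex, 1> indices = input.argmax(1);
      // indices(0) == 1 and indices(1) == 0; the kernel casts these to int64.
    }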
+ +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +namespace functor { + +template <typename Device, typename T> +struct ArgMax { +#define DECLARE_COMPUTE_SPEC(Dims) \ + EIGEN_ALWAYS_INLINE static void Reduce##Dims( \ + const Device& d, typename TTypes<T, Dims>::ConstTensor input, \ + const int32 dimension, \ + typename TTypes<int64, Dims - 1>::Tensor output) { \ + output.device(d) = input.argmax(dimension).template cast<int64>(); \ + } + + DECLARE_COMPUTE_SPEC(1); + DECLARE_COMPUTE_SPEC(2); + DECLARE_COMPUTE_SPEC(3); + DECLARE_COMPUTE_SPEC(4); + DECLARE_COMPUTE_SPEC(5); + +#undef DECLARE_COMPUTE_SPEC +}; + +template <typename Device, typename T> +struct ArgMin { +#define DECLARE_COMPUTE_SPEC(Dims) \ + EIGEN_ALWAYS_INLINE static void Reduce##Dims( \ + const Device& d, typename TTypes<T, Dims>::ConstTensor input, \ + const int32 dimension, \ + typename TTypes<int64, Dims - 1>::Tensor output) { \ + output.device(d) = input.argmin(dimension).template cast<int64>(); \ + } + + DECLARE_COMPUTE_SPEC(1); + DECLARE_COMPUTE_SPEC(2); + DECLARE_COMPUTE_SPEC(3); + DECLARE_COMPUTE_SPEC(4); + DECLARE_COMPUTE_SPEC(5); + +#undef DECLARE_COMPUTE_SPEC +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_ARGMAX_OP_H_ diff --git a/tensorflow/core/kernels/argmax_op_gpu.cu.cc b/tensorflow/core/kernels/argmax_op_gpu.cu.cc new file mode 100644 index 0000000000..6c91fc2c86 --- /dev/null +++ b/tensorflow/core/kernels/argmax_op_gpu.cu.cc @@ -0,0 +1,20 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/argmax_op.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_GPU_SPEC(T) \ + template struct functor::ArgMax<GPUDevice, T>; \ + template struct functor::ArgMin<GPUDevice, T>; + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC); + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h new file mode 100644 index 0000000000..3306f1eeaa --- /dev/null +++ b/tensorflow/core/kernels/assign_op.h @@ -0,0 +1,92 @@ +#ifndef TENSORFLOW_KERNELS_ASSIGN_OP_H_ +#define TENSORFLOW_KERNELS_ASSIGN_OP_H_ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +// TODO(jeff): Get rid of use_exclusive_lock_ option + +// Computes *input[0] = input[1] +class AssignOp : public OpKernel { + public: + explicit AssignOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK(context, + context->GetAttr("validate_shape", &validate_shape_)); + OP_REQUIRES(context, IsRefType(context->input_type(0)), + errors::InvalidArgument("lhs input needs to be a ref type")); + } + + void Compute(OpKernelContext* context) override { + Tensor rhs = context->input(1); + + // We always return the input ref. + context->forward_ref_input_to_ref_output(0, 0); + + // If the left hand side is not initialized, or the shape of the + // right-hand side is different than the left hand side, we need + // to allocate a new tensor. 
+ { + mutex_lock l(*context->input_ref_mutex(0)); + + Tensor old_lhs = context->mutable_input(0, true); + + if (validate_shape_) { + OP_REQUIRES( + context, old_lhs.shape().IsSameSize(rhs.shape()), + errors::InvalidArgument( + "Assign requires shapes of both tensors to match. lhs shape= ", + old_lhs.shape().ShortDebugString(), " rhs shape= ", + rhs.shape().ShortDebugString())); + } + + const bool same_shape = old_lhs.shape().IsSameSize(rhs.shape()); + if (!old_lhs.IsInitialized() || !same_shape) { + // Create new tensor whose shape matches the right hand side + // and copy then hand off to lhs. + // We can't always know how this value will be used downstream, + // so make conservative assumptions in specifying the memory + // allocation attributes. + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + PersistentTensor copy; + Tensor* copyTensor = nullptr; + OP_REQUIRES_OK( + context, context->allocate_persistent(old_lhs.dtype(), rhs.shape(), + ©, ©Tensor, attr)); + Copy(context, copyTensor, rhs); + context->replace_ref_input(0, *copyTensor, true); + return; + } + + // The tensor has already been initialized and the right hand side + // matches the left hand side's shape. + if (use_exclusive_lock_) { + Copy(context, &old_lhs, rhs); + return; + } + } + + // The tensor has already been initialized and the right hand side + // matches the left hand side's shape. We have been told to do the + // copy outside the lock. + Tensor old_unlocked_lhs = context->mutable_input(0, false); + Copy(context, &old_unlocked_lhs, rhs); + } + + virtual void Copy(OpKernelContext* context, Tensor* lhs, + const Tensor& rhs) = 0; + + bool use_exclusive_lock_; + bool validate_shape_; +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_KERNELS_ASSIGN_OP_H_ diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc new file mode 100644 index 0000000000..28763f65a4 --- /dev/null +++ b/tensorflow/core/kernels/attention_ops.cc @@ -0,0 +1,92 @@ +// See docs in ../ops/attention_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" + +namespace tensorflow { + +class ExtractGlimpseOp : public OpKernel { + public: + explicit ExtractGlimpseOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("normalized", &normalized_)); + OP_REQUIRES_OK(context, context->GetAttr("centered", ¢ered_)); + OP_REQUIRES_OK(context, context->GetAttr("uniform_noise", &uniform_noise_)); + } + + // Expect input tensor of rank 4 with dimensions (batch_size, height, width, + // depth). 
+ void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const TensorShape input_shape = input.shape(); + const int32 num_dims = input_shape.dims(); + OP_REQUIRES( + context, num_dims == 4, + errors::InvalidArgument( + "input must be 4-dimensional (batch_size, height, width, depth)", + input_shape.ShortDebugString())); + + const int64 batch_size = input_shape.dim_size(0); + + const Tensor& window_size = context->input(1); + OP_REQUIRES(context, (window_size.shape().dims() == 1) && + window_size.shape().dim_size(0) == 2, + errors::InvalidArgument( + "input must be a vector of size 2 (height, width)", + window_size.shape().ShortDebugString())); + + const int64 output_height = window_size.tensor<int, 1>()(0); + const int64 output_width = window_size.tensor<int, 1>()(1); + TensorShape output_shape = input_shape; + output_shape.set_dim(1, output_height); + output_shape.set_dim(2, output_width); + + const Tensor& offsets = context->input(2); + OP_REQUIRES(context, offsets.shape().dims() == 2, + errors::InvalidArgument("input must be a matrix", + offsets.shape().ShortDebugString())); + OP_REQUIRES(context, offsets.shape().dim_size(0) == batch_size, + errors::InvalidArgument("first dimension should be batch", + offsets.shape().ShortDebugString())); + OP_REQUIRES( + context, offsets.shape().dim_size(1) == 2, + errors::InvalidArgument("second dimension should be of size 2 (y,x)", + offsets.shape().ShortDebugString())); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + + std::vector<Eigen::IndexPair<float> > offset_vec; + offset_vec.reserve(batch_size); + for (int i = 0; i < batch_size; ++i) { + float offset_y = offsets.tensor<float, 2>()(i, 0); + float offset_x = offsets.tensor<float, 2>()(i, 1); + // Eigen::ExtractGlimpses expects offsets as (x,y), whereas the + // calling TensorFlow operates with (y,x) as indices. + offset_vec.push_back(Eigen::IndexPair<float>(offset_x, offset_y)); + } + + output->tensor<float, 4>().swap_layout().device( + context->eigen_cpu_device()) = + Eigen::ExtractGlimpses(input.tensor<float, 4>().swap_layout(), + output_width, output_height, offset_vec, + normalized_, centered_, uniform_noise_); + } + + private: + bool normalized_; + bool centered_; + bool uniform_noise_; +}; + +REGISTER_KERNEL_BUILDER(Name("ExtractGlimpse").Device(DEVICE_CPU), + ExtractGlimpseOp); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc new file mode 100644 index 0000000000..26f98ffbcd --- /dev/null +++ b/tensorflow/core/kernels/avgpooling_op.cc @@ -0,0 +1,418 @@ +// See docs in ../ops/nn_ops.cc. 
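Looking back at ExtractGlimpseOp for a moment: the offsets input is a {batch_size, 2} float matrix given in (y, x) order, and the loop flips each row into the (x, y) order that Eigen::ExtractGlimpses expects. With illustrative values:

    // offsets = {{10.f, 20.f},  ->  offset_vec[0] = Eigen::IndexPair<float>(20.f, 10.f)
    //            { 5.f,  8.f}}  ->  offset_vec[1] = Eigen::IndexPair<float>( 8.f,  5.f)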
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/avgpooling_op.h" + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/pooling_ops_common.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/maxpooling_op_gpu.h" +#include "tensorflow/core/kernels/pooling_ops_common_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class AvgPoolingOp : public UnaryOp<T> { + public: + explicit AvgPoolingOp(OpKernelConstruction* context) : UnaryOp<T>(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + OP_REQUIRES(context, params.depth_window == 1, + errors::Unimplemented( + "Non-spatial pooling is not " + "yet supported. Volunteers? :)")); + + // For avgpooling, tensor_in should have 4 dimensions. + OP_REQUIRES(context, tensor_in.dims() == 4, + errors::InvalidArgument("tensor_in must be 4-dimensional")); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 0, params.forward_output_shape(), &output)); + + if (std::is_same<Device, GPUDevice>::value) { + Eigen::PaddingType pt = BrainPadding2EigenPadding(padding_); + functor::SpatialAvgPooling<Device, T>()( + context->eigen_device<Device>(), output->tensor<T, 4>(), + tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols, + params.row_stride, params.col_stride, pt); + } else { + SpatialAvgPool<Device, T>(context, output, tensor_in, params, padding_); + } + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +REGISTER_KERNEL_BUILDER(Name("AvgPool") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T"), + AvgPoolingOp<CPUDevice, float>); + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. 
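For reading the shape logic in this file and in the gradient kernels below: Get2dOutputSize follows the usual TensorFlow padding convention (a hedged summary; ops_util.h is the authoritative source), where VALID gives ceil((in - window + 1) / stride) output positions and SAME gives ceil(in / stride), with pad_rows/pad_cols reporting the top/left padding needed for SAME.

    // e.g. in = 10, window = 3, stride = 2:
    //   VALID: ceil((10 - 3 + 1) / 2.0) = 4 output positions
    //   SAME : ceil(10 / 2.0)           = 5 output positions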
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void SpatialAvgPooling<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \ + typename TTypes<T, 4>::ConstTensor input, int window_rows, \ + int window_cols, int row_stride, int col_stride, \ + const Eigen::PaddingType& padding); \ + extern template struct SpatialAvgPooling<GPUDevice, T>; + +DECLARE_GPU_SPEC(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +REGISTER_KERNEL_BUILDER(Name("AvgPool") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T"), + AvgPoolingOp<GPUDevice, float>); +#endif // GOOGLE_CUDA + +// The operation to compute AvgPool gradients. +// It takes two inputs: +// - The original input tensor shape +// - Backprop tensor for output +// It produces one output: backprop tensor for input. +template <typename Device, class T> +class AvgPoolingGradOp : public OpKernel { + public: + explicit AvgPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in_shape = context->input(0); + const Tensor& out_backprop = context->input(1); + // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements. + OP_REQUIRES(context, tensor_in_shape.dims() == 1 && + tensor_in_shape.NumElements() == 4, + errors::InvalidArgument( + "out_backprop must be 1-dimensional and 4 " + "elements")); + // For avgpooling, out_backprop should have 4 dimensions. + OP_REQUIRES(context, out_backprop.dims() == 4, + errors::InvalidArgument("out_backprop must be 4-dimensional")); + const int64 out_backprop_batch = out_backprop.dim_size(0); + const int64 out_backprop_rows = out_backprop.dim_size(1); + const int64 out_backprop_cols = out_backprop.dim_size(2); + const int64 out_backprop_depth = out_backprop.dim_size(3); + + TensorShape output_shape; + auto shape_vec = tensor_in_shape.vec<int32>(); + for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) { + output_shape.AddDim(shape_vec(i)); + } + const int64 in_rows = output_shape.dim_size(1); + const int64 in_cols = output_shape.dim_size(2); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + output->flat<T>().setZero(); + + const int window_rows = ksize_[1]; + const int window_cols = ksize_[2]; + const int depth_window = ksize_[3]; + + const int row_stride = stride_[1]; + const int col_stride = stride_[2]; + + // We (will) use different code for spatial pooling and + // non-spatial pooling. + // + // Spatial pooling is when depth_window = 1 + OP_REQUIRES(context, depth_window == 1, + errors::Unimplemented( + "Non-spatial pooling is not " + "yet supported. Volunteers? 
:)")); + + int out_height, out_width, pad_rows, pad_cols; + OP_REQUIRES_OK( + context, Get2dOutputSize(in_rows, in_cols, window_rows, window_cols, + row_stride, col_stride, padding_, &out_height, + &out_width, &pad_rows, &pad_cols)); + + const T* out_backprop_ptr = out_backprop.flat<T>().data(); + T* input_backprop_ptr = output->flat<T>().data(); + + for (int64 b = 0; b < out_backprop_batch; ++b) { + for (int64 r = 0; r < out_backprop_rows; ++r) { + // Calculates row broadcast size. For SAME padding, current + // index could be in the padding area, and r*row_stride + + // window_rows could be beyond the input tensor's boundary. In + // such cases, change the starting index and reduce the + // broadcast size. + int rindex, rsize; + OP_REQUIRES_OK(context, + GetBroadcastSize(r, in_rows, window_rows, row_stride, + pad_rows, &rindex, &rsize)); + for (int64 c = 0; c < out_backprop_cols; ++c) { + // Calculates col broadcast size. For SAME padding, current + // index could be in the padding area, and c*col_stride + + // window_cols could be beyond the input tensor's boundary. In + // such cases, change the starting index and reduce the + // broadcast size. + int cindex, csize; + OP_REQUIRES_OK(context, + GetBroadcastSize(c, in_cols, window_cols, col_stride, + pad_cols, &cindex, &csize)); + + T divide_coeff = 1.0 / (rsize * csize); + int64 output_index = + (b * out_backprop_rows + r) * out_backprop_cols + c; + for (int64 r_dst = rindex; r_dst < rindex + rsize; ++r_dst) { + for (int64 c_dst = cindex; c_dst < cindex + csize; ++c_dst) { + int64 input_index = (b * in_rows + r_dst) * in_cols + c_dst; + const T* output_offset = + out_backprop_ptr + output_index * out_backprop_depth; + T* input_offset = + input_backprop_ptr + input_index * out_backprop_depth; + for (int64 d = 0; d < out_backprop_depth; ++d) { + *input_offset += *output_offset * divide_coeff; + ++output_offset; + ++input_offset; + } + } + } + } + } + } + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T") + .HostMemory("orig_input_shape"), + AvgPoolingGradOp<CPUDevice, float>); +REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") + .Device(DEVICE_CPU) + .TypeConstraint<double>("T") + .HostMemory("orig_input_shape"), + AvgPoolingGradOp<CPUDevice, double>); + +#if GOOGLE_CUDA + +// A CUDNN based AvgPoolingGrad implementation. It includes the padding as the +// candidates for the pooling operation. 
+template <class T> +class AvgPoolingGradOp<GPUDevice, T> : public OpKernel { + public: + typedef GPUDevice Device; + + explicit AvgPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in_shape = context->input(0); + const Tensor& out_backprop = context->input(1); + // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements. + OP_REQUIRES( + context, + tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4, + errors::InvalidArgument("out_backprop must be 1-dimensional and 4 " + "elements")); + // For avgpooling, out_backprop should have 4 dimensions. + OP_REQUIRES(context, out_backprop.dims() == 4, + errors::InvalidArgument("out_backprop must be 4-dimensional")); + + TensorShape output_shape; + auto shape_vec = tensor_in_shape.vec<int32>(); + for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) { + output_shape.AddDim(shape_vec(i)); + } + + DnnPoolingGradOp<T>::Compute( + context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_, + stride_, padding_, nullptr, nullptr, out_backprop, output_shape); + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T") + .HostMemory("orig_input_shape") + .Label("cudnn"), + AvgPoolingGradOp<GPUDevice, float>); + +// A custom GPU kernel based AvgPoolingGrad implementation. It includes the +// padding as the candidates for the pooling operation. +template <class T> +class AvgPoolingGradOpCustomGPUKernel : public OpKernel { + public: + typedef GPUDevice Device; + + explicit AvgPoolingGradOpCustomGPUKernel(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in_shape = context->input(0); + const Tensor& out_backprop = context->input(1); + // For avgpooling, tensor_in_shape should have 1 dimension, and 4 elements. + OP_REQUIRES( + context, + tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4, + errors::InvalidArgument("out_backprop must be 1-dimensional and 4 " + "elements")); + // For avgpooling, out_backprop should have 4 dimensions. 
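The kernel registered just above carries the label "cudnn", while the custom kernel defined next registers with no label and so acts as the default AvgPoolGrad kernel on GPU. Assuming the usual label-matching rule, where a registered label is compared against the node's "_kernel" attr, a graph could opt into the cuDNN path roughly as follows; the node name, input wiring, and attr values are invented for illustration and the fragment presumes node_def_builder.h:

NodeDef pool_grad;
TF_CHECK_OK(NodeDefBuilder("pool_grad", "AvgPoolGrad")
                .Input("orig_input_shape", 0, DT_INT32)
                .Input("grad", 0, DT_FLOAT)
                .Attr("ksize", {1, 3, 3, 1})
                .Attr("strides", {1, 2, 2, 1})
                .Attr("padding", "VALID")
                .Attr("_kernel", "cudnn")  // request the .Label("cudnn") kernel
                .Finalize(&pool_grad));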
+ OP_REQUIRES(context, out_backprop.dims() == 4, + errors::InvalidArgument("out_backprop must be 4-dimensional")); + const int64 out_backprop_batch = out_backprop.dim_size(0); + const int64 out_backprop_rows = out_backprop.dim_size(1); + const int64 out_backprop_cols = out_backprop.dim_size(2); + const int64 out_backprop_depth = out_backprop.dim_size(3); + + TensorShape output_shape; + auto shape_vec = tensor_in_shape.vec<int32>(); + for (int64 i = 0; i < tensor_in_shape.NumElements(); ++i) { + output_shape.AddDim(shape_vec(i)); + } + const int64 in_rows = output_shape.dim_size(1); + const int64 in_cols = output_shape.dim_size(2); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + + const int window_rows = ksize_[1]; + const int window_cols = ksize_[2]; + const int depth_window = ksize_[3]; + + const int row_stride = stride_[1]; + const int col_stride = stride_[2]; + + // We (will) use different code for spatial pooling and + // non-spatial pooling. + // + // Spatial pooling is when depth_window = 1 + OP_REQUIRES(context, depth_window == 1, + errors::Unimplemented("Non-spatial pooling is not " + "yet supported. Volunteers? :)")); + + int out_height, out_width, pad_rows, pad_cols; + OP_REQUIRES_OK( + context, Get2dOutputSize(in_rows, in_cols, window_rows, window_cols, + row_stride, col_stride, padding_, &out_height, + &out_width, &pad_rows, &pad_cols)); + + RunAvePoolBackwardNHWC<T>(out_backprop.flat<T>().data(), // top_diff + out_backprop_batch, // num + in_rows, // height + in_cols, // width + out_backprop_depth, // channels + out_backprop_rows, // pooled_height + out_backprop_cols, // pooled_width + window_rows, // kernel_h + window_cols, // kernel_w + row_stride, // stride_h + col_stride, // stride_w + pad_rows, // pad_t + pad_cols, // pad_l + output->flat<T>().data(), // bottom_diff + context->eigen_gpu_device()); // d + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T") + .HostMemory("orig_input_shape"), + AvgPoolingGradOpCustomGPUKernel<float>); + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h new file mode 100644 index 0000000000..38f0eb97e5 --- /dev/null +++ b/tensorflow/core/kernels/avgpooling_op.h @@ -0,0 +1,58 @@ +#ifndef TENSORFLOW_KERNELS_AVGPOOLING_OP_H_ +#define TENSORFLOW_KERNELS_AVGPOOLING_OP_H_ +// Functor definition for AvgPoolingOp, must be compilable by nvcc. + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T> +struct SpatialAvgPooling { + void operator()(const Device& d, typename TTypes<T, 4>::Tensor output, + typename TTypes<T, 4>::ConstTensor input, int window_rows, + int window_cols, int row_stride, int col_stride, + const Eigen::PaddingType& padding) { + // Because we swap the layout, we swap the row/cols as well + output.swap_layout().device(d) = + Eigen::SpatialAvgPooling(input.swap_layout(), window_cols, window_rows, + col_stride, row_stride, padding); + } +}; + +} // namespace functor + +typedef Eigen::GpuDevice GPUDevice; + +// Lauch a custom GPU kernels from Yanqing for the avgpooling backward operation +// that works NHWC data formats. 
+// Arguments: +// top_diff: backprop to the output of the pooling layer +// num: number of input batches +// height: input height +// width: input width +// channels: number of input channels +// pooled_height: the height of the output to the pooling layer +// pooled_width: the width of the output to the pooling layer +// kernel_h: the height of the pooling kernel +// kernel_w: the width of the pooling kernel +// stride_h: the height of the vertical stride +// stride_w: the width of the horizontal stride +// pad_t: padding size to the top side +// pad_l: padding size to the left side +// bottom_diff: backprop to the input of the pooling layer. +template <typename T> +bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num, + const int height, const int width, + const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, + const int pad_l, T* const bottom_diff, + const GPUDevice& d); + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_AVGPOOLING_OP_H_ diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc new file mode 100644 index 0000000000..ec84ee6862 --- /dev/null +++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc @@ -0,0 +1,101 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <iostream> + +#include "tensorflow/core/kernels/avgpooling_op.h" + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_GPU_KERNELS(T) \ + template struct functor::SpatialAvgPooling<GPUDevice, T>; + +DEFINE_GPU_KERNELS(float) + +#undef DEFINE_GPU_KERNELS + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +static const int CAFFE_CUDA_NUM_THREADS = 1024; + +template <typename dtype> +__global__ void AvePoolBackwardNHWC(const int nthreads, + const dtype* const top_diff, const int num, + const int height, const int width, + const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, + const int pad_l, dtype* const bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // find out the local index + // find out the local offset + const int c = index % channels; + const int w = index / channels % width + pad_l; + const int h = (index / channels / width) % height + pad_t; + const int n = index / channels / width / height; + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_height); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1;
+ const int pwend = min(w / stride_w + 1, pooled_width);
+ dtype gradient = 0;
+ const dtype* const top_diff_slice =
+ top_diff + n * pooled_height * pooled_width * channels + c;
+ for (int ph = phstart; ph < phend; ++ph) {
+ for (int pw = pwstart; pw < pwend; ++pw) {
+ // figure out the pooling size
+ int hstart = ph * stride_h - pad_t;
+ int wstart = pw * stride_w - pad_l;
+ int hend = min(hstart + kernel_h, height);
+ int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, 0);
+ wstart = max(wstart, 0);
+ int pool_size = (hend - hstart) * (wend - wstart);
+ gradient +=
+ top_diff_slice[(ph * pooled_width + pw) * channels] / pool_size;
+ }
+ }
+ bottom_diff[index] = gradient;
+ }
+}
+
+template <typename T>
+bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
+ const int height, const int width,
+ const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h,
+ const int stride_w, const int pad_t,
+ const int pad_l, T* const bottom_diff,
+ const GPUDevice& d) {
+ int x_size = num * height * width * channels;
+ int thread_per_block =
+ std::min(CAFFE_CUDA_NUM_THREADS, d.maxCudaThreadsPerMultiProcessor());
+ int block_count = (x_size + thread_per_block - 1) / thread_per_block;
+ AvePoolBackwardNHWC<T><<<block_count, thread_per_block, 0, d.stream()>>>(
+ x_size, top_diff, num, height, width, channels, pooled_height,
+ pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+ bottom_diff);
+
+ return d.ok();
+}
+
+template bool RunAvePoolBackwardNHWC(
+ const float* const top_diff, const int num, const int height,
+ const int width, const int channels, const int pooled_height,
+ const int pooled_width, const int kernel_h, const int kernel_w,
+ const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+ float* const bottom_diff, const GPUDevice& d);
+
+} // end namespace tensorflow
+
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/batch_matmul_op.cc b/tensorflow/core/kernels/batch_matmul_op.cc
new file mode 100644
index 0000000000..349aac0158
--- /dev/null
+++ b/tensorflow/core/kernels/batch_matmul_op.cc
@@ -0,0 +1,260 @@
+// See docs in ../ops/math_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/port.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename Scalar>
+struct LaunchBatchMatMul;
+
+template <typename Scalar>
+struct LaunchBatchMatMul<CPUDevice, Scalar> {
+ static void Launch(OpKernelContext* context, const Tensor& in_x,
+ const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
+ auto Tx = in_x.tensor<Scalar, 3>();
+ auto Ty = in_y.tensor<Scalar, 3>();
+ auto Tz = out->tensor<Scalar, 3>();
+
+ // Shards "n"-matmuls into "num" shards. Each shard is
+ // dispatched to a thread.
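+ // Each shard receives a contiguous [start, limit) range of batch indices
+ // and runs the per-slice contraction in Run() below; cost_per_unit is a
+ // rough estimate of the work per matmul that lets Shard decide how many of
+ // the available worker threads are actually worth using.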
+ auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + const int64 num_units = in_x.dim_size(0); + const int64 cost_per_unit = + in_x.dim_size(0) * in_x.dim_size(1) * out->dim_size(2); + Shard(worker_threads.num_threads, worker_threads.workers, num_units, + cost_per_unit, [&Tx, &Ty, adj_x, adj_y, &Tz](int start, int limit) { + LaunchBatchMatMul<CPUDevice, Scalar>::Run(Tx, Ty, adj_x, adj_y, Tz, + start, limit); + }); + } + + template <typename In, typename Out> + static void Run(In Tx, In Ty, bool adj_x, bool adj_y, Out Tz, int start, + int limit) { + Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_pairs; + + Eigen::internal::scalar_conjugate_op<Scalar> conj; + if (!adj_x && !adj_y) { + for (int i = start; i < limit; ++i) { + auto x = Tx.template chip<0>(i); + auto y = Ty.template chip<0>(i); + auto z = Tz.template chip<0>(i); + contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); + z = x.contract(y, contract_pairs); // matmul + } + } else if (!adj_x && adj_y) { + for (int i = start; i < limit; ++i) { + auto x = Tx.template chip<0>(i); + auto y = Ty.template chip<0>(i).unaryExpr(conj); + auto z = Tz.template chip<0>(i); + contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 1); + z = x.contract(y, contract_pairs); // matmul + } + } else if (adj_x && !adj_y) { + for (int i = start; i < limit; ++i) { + auto x = Tx.template chip<0>(i).unaryExpr(conj); + auto y = Ty.template chip<0>(i); + auto z = Tz.template chip<0>(i); + contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(0, 0); + z = x.contract(y, contract_pairs); // matmul + } + } else { + for (int i = start; i < limit; ++i) { + auto x = Tx.template chip<0>(i).unaryExpr(conj); + auto y = Ty.template chip<0>(i).unaryExpr(conj); + auto z = Tz.template chip<0>(i); + contract_pairs[0] = Eigen::IndexPair<Eigen::DenseIndex>(0, 1); + z = x.contract(y, contract_pairs); // matmul + } + } + } +}; + +#if GOOGLE_CUDA + +namespace { +template <typename T> +perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) { + perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory)); + perftools::gputools::DeviceMemory<T> typed(wrapped); + return typed; +} +} // namespace + +template <typename Scalar> +struct LaunchBatchMatMul<GPUDevice, Scalar> { + static void Launch(OpKernelContext* context, const Tensor& in_x, + const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) { + perftools::gputools::blas::Transpose trans[] = { + perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose}; + const uint64 m = in_x.dim_size(adj_x ? 2 : 1); + const uint64 k = in_x.dim_size(adj_x ? 1 : 2); + const uint64 n = in_y.dim_size(adj_y ? 
1 : 2); + const uint64 batch_size = in_x.dim_size(0); + auto blas_transpose_a = trans[adj_x]; + auto blas_transpose_b = trans[adj_y]; + + auto* stream = context->op_device_context<GPUDeviceContext>()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType; + std::vector<DeviceMemoryType> a_device_memory; + std::vector<DeviceMemoryType> b_device_memory; + std::vector<DeviceMemoryType> c_device_memory; + std::vector<DeviceMemoryType*> a_ptrs; + std::vector<DeviceMemoryType*> b_ptrs; + std::vector<DeviceMemoryType*> c_ptrs; + a_device_memory.reserve(batch_size); + b_device_memory.reserve(batch_size); + c_device_memory.reserve(batch_size); + a_ptrs.reserve(batch_size); + b_ptrs.reserve(batch_size); + c_ptrs.reserve(batch_size); + auto* a_base_ptr = in_x.template flat<Scalar>().data(); + auto* b_base_ptr = in_y.template flat<Scalar>().data(); + auto* c_base_ptr = out->template flat<Scalar>().data(); + for (int64 i = 0; i < batch_size; ++i) { + a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k)); + b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n)); + c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n)); + a_ptrs.push_back(&a_device_memory.back()); + b_ptrs.push_back(&b_device_memory.back()); + c_ptrs.push_back(&c_device_memory.back()); + } + + // Cublas does + // C = A x B + // where A, B and C are assumed to be in column major. + // We want the output to be in row-major, so we can compute + // C' = B' x A' (' stands for transpose) + bool blas_launch_status = + stream->ThenBlasGemmBatched(blas_transpose_b, blas_transpose_a, n, m, k, + static_cast<Scalar>(1.0), b_ptrs, + adj_y ? k : n, a_ptrs, adj_x ? m : k, + static_cast<Scalar>(0.0), c_ptrs, n, + batch_size) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas SGEMMBatched launch failed : a.shape=", + in_x.shape().DebugString(), ", b.shape=", in_y.shape().DebugString(), + ", m=", m, ", n=", n, ", k=", k, ", batch_size=", batch_size)); + } + } +}; + +#endif // GOOGLE_CUDA + +template <typename Device, typename Scalar> +class BatchMatMul : public OpKernel { + public: + explicit BatchMatMul(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_)); + OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_)); + } + + virtual ~BatchMatMul() {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& in0 = ctx->input(0); + const Tensor& in1 = ctx->input(1); + OP_REQUIRES(ctx, in0.dims() == in1.dims(), + errors::InvalidArgument("In[0] and In[1] has different ndims: ", + in0.shape().ShortDebugString(), " vs. 
", + in1.shape().ShortDebugString())); + const int ndims = in0.dims(); + OP_REQUIRES( + ctx, ndims >= 3, + errors::InvalidArgument("In[0] and In[1] ndims must be >= 3: ", ndims)); + TensorShape out_shape; + for (int i = 0; i < ndims - 2; ++i) { + OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i), + errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(", + i, ") must be the same: ", + in0.shape().DebugString(), " vs ", + in1.shape().DebugString())); + out_shape.AddDim(in0.dim_size(i)); + } + auto n = out_shape.num_elements(); + auto d0 = in0.dim_size(ndims - 2); + auto d1 = in0.dim_size(ndims - 1); + Tensor in0_reshaped; + CHECK(in0_reshaped.CopyFrom(in0, TensorShape({n, d0, d1}))); + auto d2 = in1.dim_size(ndims - 2); + auto d3 = in1.dim_size(ndims - 1); + Tensor in1_reshaped; + CHECK(in1_reshaped.CopyFrom(in1, TensorShape({n, d2, d3}))); + if (adj_x_) std::swap(d0, d1); + if (adj_y_) std::swap(d2, d3); + OP_REQUIRES(ctx, d1 == d2, + errors::InvalidArgument( + "In[0] mismatch In[1] shape: ", d1, " vs. ", d2, ": ", + in0.shape().ShortDebugString(), " ", + in1.shape().ShortDebugString(), " ", adj_x_, " ", adj_y_)); + out_shape.AddDim(d0); + out_shape.AddDim(d3); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); + if (out->NumElements() == 0) { + return; + } + if (in0.NumElements() == 0 || in1.NumElements() == 0) { + functor::SetZeroFunctor<Device, Scalar> f; + f(ctx->eigen_device<Device>(), out->flat<Scalar>()); + return; + } + Tensor out_reshaped; + CHECK(out_reshaped.CopyFrom(*out, TensorShape({n, d0, d3}))); + LaunchBatchMatMul<Device, Scalar>::Launch(ctx, in0_reshaped, in1_reshaped, + adj_x_, adj_y_, &out_reshaped); + } + + private: + bool adj_x_; + bool adj_y_; +}; + +#define REGISTER_CPU(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \ + BatchMatMul<CPUDevice, TYPE>) + +#define REGISTER_GPU(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \ + BatchMatMul<GPUDevice, TYPE>) + +REGISTER_CPU(float); +REGISTER_CPU(double); +REGISTER_CPU(int32); +REGISTER_CPU(complex64); + +#ifdef GOOGLE_CUDA +// TODO(kalakris): The GPU implementation is currently disabled due to issues +// encountered in practice. See b/24534272. +// REGISTER_GPU(float); +#endif // GOOGLE_CUDA + +#undef REGISTER_CPU +#undef REGISTER_GPU +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc new file mode 100644 index 0000000000..c67c921631 --- /dev/null +++ b/tensorflow/core/kernels/batch_norm_op.cc @@ -0,0 +1,223 @@ +// See docs in ../ops/nn_ops.cc. 
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/batch_norm_op.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class BatchNormOp : public OpKernel { + public: + explicit BatchNormOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("variance_epsilon", &variance_epsilon_)); + OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization", + &scale_after_normalization_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& mean = context->input(1); + const Tensor& var = context->input(2); + const Tensor& beta = context->input(3); + const Tensor& gamma = context->input(4); + + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().ShortDebugString())); + OP_REQUIRES(context, mean.dims() == 1, + errors::InvalidArgument("mean must be 1-dimensional", + mean.shape().ShortDebugString())); + OP_REQUIRES(context, var.dims() == 1, + errors::InvalidArgument("var must be 1-dimensional", + var.shape().ShortDebugString())); + OP_REQUIRES(context, beta.dims() == 1, + errors::InvalidArgument("beta must be 1-dimensional", + beta.shape().ShortDebugString())); + OP_REQUIRES(context, gamma.dims() == 1, + errors::InvalidArgument("gamma must be 1-dimensional", + gamma.shape().ShortDebugString())); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + + functor::BatchNorm<Device, T>()( + context->eigen_device<Device>(), input.tensor<T, 4>(), mean.vec<T>(), + var.vec<T>(), beta.vec<T>(), gamma.vec<T>(), variance_epsilon_, + scale_after_normalization_, output->tensor<T, 4>()); + } + + private: + float variance_epsilon_; + bool scale_after_normalization_; +}; + +template <typename Device, typename T> +class BatchNormGradOp : public OpKernel { + public: + explicit BatchNormGradOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("variance_epsilon", &variance_epsilon_)); + OP_REQUIRES_OK(context, context->GetAttr("scale_after_normalization", + &scale_after_normalization_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& mean = context->input(1); + const Tensor& var = context->input(2); + const Tensor& gamma = context->input(3); + const Tensor& out_backprop = context->input(4); + + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().ShortDebugString())); + OP_REQUIRES(context, mean.dims() == 1, + errors::InvalidArgument("mean must be 1-dimensional", + mean.shape().ShortDebugString())); + OP_REQUIRES(context, var.dims() == 1, + errors::InvalidArgument("var must be 1-dimensional", + var.shape().ShortDebugString())); + OP_REQUIRES(context, gamma.dims() == 1, + errors::InvalidArgument("gamma must be 1-dimensional", + gamma.shape().ShortDebugString())); + OP_REQUIRES( + context, out_backprop.dims() == 4, + errors::InvalidArgument("out_backprop must be 4-dimensional", + out_backprop.shape().ShortDebugString())); + + Tensor* dx = nullptr; 
+ OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &dx)); + Tensor* dm = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(1, mean.shape(), &dm)); + Tensor* dv = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(2, var.shape(), &dv)); + Tensor* db = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(3, mean.shape(), &db)); + Tensor* dg = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dg)); + + // Scratch buffer of [depth] dimension, aka the 4th dimension of input, + // which is dim_size(3), for calculating various combinations of + // (var + epsilon). + Tensor scratch1; + OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({input.dim_size(3)}), &scratch1)); + + // Scratch buffer of [depth] dimension for saving intermediate calculation + // values. + Tensor scratch2; + OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({input.dim_size(3)}), &scratch2)); + + functor::BatchNormGrad<Device, T>()( + context->eigen_device<Device>(), input.tensor<T, 4>(), mean.vec<T>(), + var.vec<T>(), gamma.vec<T>(), out_backprop.tensor<T, 4>(), + variance_epsilon_, scale_after_normalization_, dx->tensor<T, 4>(), + dm->vec<T>(), dv->vec<T>(), db->vec<T>(), dg->vec<T>(), + scratch1.vec<T>(), scratch2.vec<T>()); + } + + private: + float variance_epsilon_; + bool scale_after_normalization_; +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalization") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T"), \ + BatchNormOp<CPUDevice, T>); + +REGISTER_KERNEL(float); +REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void BatchNorm<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ + typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, \ + typename TTypes<T>::ConstVec beta, typename TTypes<T>::ConstVec gamma, \ + float variance_epsilon, bool scale_after_normalization, \ + typename TTypes<T, 4>::Tensor output); \ + extern template struct BatchNorm<GPUDevice, T>; + +#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T); + +DECLARE_GPU_SPECS(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalization") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<T>("T"), \ + BatchNormOp<GPUDevice, T>); + +REGISTER_GPU_KERNEL(float); +#undef REGISTER_GPU_KERNEL + +#endif // GOOGLE_CUDA + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T"), \ + BatchNormGradOp<CPUDevice, T>); + +REGISTER_KERNEL(float); +REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. 
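Before the GPU declarations, here is a scalar, per-channel reference for the five gradients BatchNormGradOp produces, written to match the functor in batch_norm_op.h. It is a sketch that assumes scale_after_normalization is true and treats mean and variance as independent inputs, which is how this op defines its outputs; the names are illustrative only:

#include <cmath>
#include <vector>

struct ChannelGrads {
  std::vector<float> dx;
  float dm, dv, db, dg;
};

ChannelGrads ReferenceBatchNormGrad(const std::vector<float>& x,
                                    const std::vector<float>& dy, float m,
                                    float v, float gamma, float eps) {
  const float rsqrt_v = 1.0f / std::sqrt(v + eps);
  ChannelGrads g{std::vector<float>(x.size()), 0.0f, 0.0f, 0.0f, 0.0f};
  float sum_dy = 0.0f, sum_dy_xm = 0.0f;
  for (int i = 0; i < static_cast<int>(x.size()); ++i) {
    sum_dy += dy[i];
    sum_dy_xm += dy[i] * (x[i] - m);
    g.dx[i] = dy[i] * gamma * rsqrt_v;        // dx = dy * gamma * rsqrt(v + eps)
  }
  g.db = sum_dy;                              // db = sum(dy)
  g.dg = sum_dy_xm * rsqrt_v;                 // dg = sum(dy * (x - m)) * rsqrt(v + eps)
  g.dm = -sum_dy * gamma * rsqrt_v;           // dm = -sum(dy) * gamma * rsqrt(v + eps)
  g.dv = sum_dy_xm * gamma * -0.5f * rsqrt_v / (v + eps);
  // dv = sum(dy * (x - m)) * gamma * (-1/2) * (v + eps)^(-3/2)
  return g;
}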
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void BatchNormGrad<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ + typename TTypes<T>::ConstVec mean, typename TTypes<T>::ConstVec var, \ + typename TTypes<T>::ConstVec gamma, \ + typename TTypes<T, 4>::ConstTensor out_backprop, float variance_epsilon, \ + bool scale_after_normalization, typename TTypes<T, 4>::Tensor dx, \ + typename TTypes<T>::Vec dm, typename TTypes<T>::Vec dv, \ + typename TTypes<T>::Vec db, typename TTypes<T>::Vec dg, \ + typename TTypes<T>::Vec scratch1, typename TTypes<T>::Vec scratch2); \ + extern template struct BatchNormGrad<GPUDevice, T>; + +#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T); + +DECLARE_GPU_SPECS(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("BatchNormWithGlobalNormalizationGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<T>("T"), \ + BatchNormGradOp<GPUDevice, T>); + +REGISTER_GPU_KERNEL(float); +#undef REGISTER_GPU_KERNEL + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/batch_norm_op.h b/tensorflow/core/kernels/batch_norm_op.h new file mode 100644 index 0000000000..5981e58460 --- /dev/null +++ b/tensorflow/core/kernels/batch_norm_op.h @@ -0,0 +1,133 @@ +#ifndef TENSORFLOW_KERNELS_BATCH_NORM_OP_H_ +#define TENSORFLOW_KERNELS_BATCH_NORM_OP_H_ +// Functor definition for BatchNormOp, must be compilable by nvcc. +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by BatchNormOp to do the computations. +template <typename Device, typename T> +struct BatchNorm { + void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input, + typename TTypes<T>::ConstVec mean, + typename TTypes<T>::ConstVec var, + typename TTypes<T>::ConstVec beta, + typename TTypes<T>::ConstVec gamma, float variance_epsilon, + bool scale_after_normalization, + typename TTypes<T, 4>::Tensor output) { + const int depth = mean.dimension(0); + const int rest_size = input.size() / depth; + + Eigen::DSizes<int, 2> rest_by_depth(rest_size, depth); +#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::DSizes<int, 2> rest_by_one(rest_size, 1); + Eigen::DSizes<int, 2> one_by_depth(1, depth); + Eigen::DSizes<int, 2> depth_by_one(depth, 1); +#else + Eigen::IndexList<int, Eigen::type2index<1> > rest_by_one; + rest_by_one.set(0, rest_size); + Eigen::IndexList<Eigen::type2index<1>, int> one_by_depth; + one_by_depth.set(1, depth); + Eigen::IndexList<int, Eigen::type2index<1> > depth_by_one; + depth_by_one.set(0, depth); +#endif + if (scale_after_normalization) { + output.reshape(rest_by_depth).device(d) = + (input.reshape(rest_by_depth) - + mean.reshape(one_by_depth).broadcast(rest_by_one)) * + ((var + var.constant(variance_epsilon)).rsqrt() * gamma) + .eval() + .reshape(one_by_depth) + .broadcast(rest_by_one) + + beta.reshape(one_by_depth).broadcast(rest_by_one); + } else { + output.reshape(rest_by_depth).device(d) = + (input.reshape(rest_by_depth) - + mean.reshape(one_by_depth).broadcast(rest_by_one)) * + ((var + var.constant(variance_epsilon)).rsqrt()) + .eval() + .reshape(one_by_depth) + .broadcast(rest_by_one) + + beta.reshape(one_by_depth).broadcast(rest_by_one); + } + } +}; + +template <typename Device, typename T> +struct BatchNormGrad { + void operator()(const Device& d, typename 
TTypes<T, 4>::ConstTensor input, + typename TTypes<T>::ConstVec mean, + typename TTypes<T>::ConstVec var, + typename TTypes<T>::ConstVec gamma, + typename TTypes<T, 4>::ConstTensor out_backprop, + float variance_epsilon, bool scale_after_normalization, + typename TTypes<T, 4>::Tensor dx, typename TTypes<T>::Vec dm, + typename TTypes<T>::Vec dv, typename TTypes<T>::Vec db, + typename TTypes<T>::Vec dg, typename TTypes<T>::Vec scratch1, + typename TTypes<T>::Vec scratch2) { + const int depth = mean.dimension(0); + const int rest_size = input.size() / depth; + + typedef typename TTypes<T>::ConstVec::Index Index; + Eigen::DSizes<Index, 2> rest_by_depth(rest_size, depth); + Eigen::DSizes<Index, 2> rest_by_one(rest_size, 1); + Eigen::DSizes<Index, 2> one_by_depth(1, depth); + + // db = out_backprop + // + // dg = out_backprop * ((x - m) * rsqrt(v + epsilon)) + // + // dv = sum_over_rest(out_backprop * gamma * (x - m)) * + // (-1/2) * (v + epsilon) ^ (-3/2) + // + // dm = sum_over_rest(out_backprop * gamma) * (-1 / rsqrt(v + epsilon)) + // + // dx = out_backprop * (gamma * rsqrt(v + epsilon)) + Eigen::array<Index, 1> reduction_axis; + reduction_axis[0] = 0; // Reduces on first dimension. + + db.device(d) = out_backprop.reshape(rest_by_depth).sum(reduction_axis); + + // scratch1 = rsqrt(v + epsilon) + scratch1.device(d) = (var + var.constant(variance_epsilon)).rsqrt(); + + // scratch2 = sum_over_rest(out_backprop * (x - m)) + scratch2.device(d) = (out_backprop.reshape(rest_by_depth) * + (input.reshape(rest_by_depth) - + mean.reshape(one_by_depth).broadcast(rest_by_one))) + .sum(reduction_axis); + + if (scale_after_normalization) { + dx.reshape(rest_by_depth).device(d) = + out_backprop.reshape(rest_by_depth) * ((scratch1 * gamma) + .eval() + .reshape(one_by_depth) + .broadcast(rest_by_one)); + dm.device(d) = -db * (scratch1 * gamma).eval(); + dg.device(d) = scratch2 * scratch1; + } else { + dx.reshape(rest_by_depth).device(d) = + out_backprop.reshape(rest_by_depth) * + scratch1.reshape(one_by_depth).broadcast(rest_by_one); + dm.device(d) = -db * scratch1; + dg.device(d) = dg.constant(static_cast<T>(0.0)); // Gamma is not learned. 
+ } + + // scratch1 = - 1/2 * (var + epsilon) ^ (-3/2) + scratch1.device(d) = scratch1 * scratch1.constant(static_cast<T>(-0.5f)) / + (var + var.constant(variance_epsilon)); + + if (scale_after_normalization) { + dv.device(d) = scratch2 * (scratch1 * gamma).eval(); + } else { + dv.device(d) = scratch2 * scratch1; + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_BATCH_NORM_OP_H_ diff --git a/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc b/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc new file mode 100644 index 0000000000..02e0eeecfa --- /dev/null +++ b/tensorflow/core/kernels/batch_norm_op_gpu.cu.cc @@ -0,0 +1,17 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/batch_norm_op.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; +template struct functor::BatchNorm<GPUDevice, float>; +template struct functor::BatchNormGrad<GPUDevice, float>; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc new file mode 100644 index 0000000000..bb1492e5b4 --- /dev/null +++ b/tensorflow/core/kernels/bcast_ops.cc @@ -0,0 +1,71 @@ +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/bcast.h" + +namespace tensorflow { + +// Given shapes of two tensors, computes the reduction indices for the +// gradient computation. +// +// TODO(zhifengc): +// 1. Adds support for n-ary (n >= 2). +class BCastGradArgsOp : public OpKernel { + public: + explicit BCastGradArgsOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK( + ctx, ctx->MatchSignature({DT_INT32, DT_INT32}, {DT_INT32, DT_INT32})); + } + + void Compute(OpKernelContext* ctx) override { + OP_REQUIRES( + ctx, ctx->num_inputs() == 2, + errors::Unimplemented("Broadcast for n-ary operations (n > 2)")); + gtl::InlinedVector<BCast::Vec, 4> shapes; + for (int i = 0; i < ctx->num_inputs(); ++i) { + const Tensor& in = ctx->input(i); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(in.shape()), + errors::InvalidArgument("In[", i, "] must be a vector.", + in.shape().ShortDebugString())); + BCast::Vec vec; + for (int64 i = 0; i < in.NumElements(); ++i) { + vec.push_back(in.vec<int32>()(i)); + } + shapes.push_back(vec); + } + BCast bcast(shapes[0], shapes[1]); + OP_REQUIRES(ctx, bcast.IsValid(), + errors::InvalidArgument( + "Incompatible shapes: [", str_util::Join(shapes[0], ","), + "] vs. 
[", str_util::Join(shapes[1], ","), "]")); + Output(ctx, 0, bcast.grad_x_reduce_idx()); + Output(ctx, 1, bcast.grad_y_reduce_idx()); + } + + private: + void Output(OpKernelContext* ctx, int idx, const BCast::Vec& v) { + const int len = v.size(); + Tensor* o = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(idx, TensorShape({len}), &o)); + for (int i = 0; i < len; ++i) o->flat<int32>()(i) = v[i]; + } + + TF_DISALLOW_COPY_AND_ASSIGN(BCastGradArgsOp); +}; + +REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs") + .Device(DEVICE_CPU) + .HostMemory("s0") + .HostMemory("s1") + .HostMemory("r0") + .HostMemory("r1"), + BCastGradArgsOp); +REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs") + .Device(DEVICE_GPU) + .HostMemory("s0") + .HostMemory("s1") + .HostMemory("r0") + .HostMemory("r1"), + BCastGradArgsOp); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc new file mode 100644 index 0000000000..68737f6c2d --- /dev/null +++ b/tensorflow/core/kernels/bias_op.cc @@ -0,0 +1,112 @@ +// See docs in ../ops/nn_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/bias_op.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class BiasOp : public BinaryOp<T> { + public: + explicit BiasOp(OpKernelConstruction* context) : BinaryOp<T>(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& bias = context->input(1); + + OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input.shape()), + errors::InvalidArgument("Input tensor must be at least 2D: ", + input.shape().DebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsVector(bias.shape()), + errors::InvalidArgument("Biases must be 1D: ", + bias.shape().DebugString())); + const auto last_dim = input.shape().dims() - 1; + OP_REQUIRES( + context, bias.shape().dim_size(0) == input.shape().dim_size(last_dim), + errors::InvalidArgument( + "Must provide as many biases as the last dimension " + "of the input tensor: ", + bias.shape().DebugString(), " vs. ", input.shape().DebugString())); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + + switch (input.shape().dims()) { + case 2: + Compute<2>(context, input, bias, output); + break; + case 3: + Compute<3>(context, input, bias, output); + break; + case 4: + Compute<4>(context, input, bias, output); + break; + case 5: + Compute<5>(context, input, bias, output); + break; + default: + OP_REQUIRES(context, false, + errors::InvalidArgument("Only ranks up to 5 supported: ", + input.shape().DebugString())); + } + } + + // Add biases for an input matrix of rank Dims, by using the Bias. 
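Returning to the BCastGradArgsOp registered above: given the shapes of a broadcasting binary op's two inputs, it reports the axes along which each input's gradient must be summed. For example, with s0 = [2, 3, 5] and s1 = [1, 3, 1] it should return r0 = [] and r1 = [0, 2]. A simplified sketch of that rule, assuming both shapes already have the same rank (the real BCast also right-aligns shapes of different rank):

#include <vector>

// Given two shapes of equal rank, return the axes along which `s_self` was
// broadcast (its extent is 1 while the other's is larger); these are the axes
// its gradient must be reduced over, i.e. what the op reports as r0 / r1.
std::vector<int> GradReduceAxes(const std::vector<int64_t>& s_self,
                                const std::vector<int64_t>& s_other) {
  std::vector<int> axes;
  for (int i = 0; i < static_cast<int>(s_self.size()); ++i) {
    if (s_self[i] == 1 && s_other[i] != 1) axes.push_back(i);
  }
  return axes;
}
// GradReduceAxes({2, 3, 5}, {1, 3, 1}) == {}      -> r0 for s0 = [2, 3, 5]
// GradReduceAxes({1, 3, 1}, {2, 3, 5}) == {0, 2}  -> r1 for s1 = [1, 3, 1]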
+ template <int Dims> + void Compute(OpKernelContext* ctx, const Tensor& input, const Tensor& bias, + Tensor* output) { + functor::Bias<Device, T, Dims> functor; + functor(ctx->eigen_device<Device>(), input.tensor<T, Dims>(), bias.vec<T>(), + output->tensor<T, Dims>()); + } +}; + +#define REGISTER_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("BiasAdd").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + BiasOp<CPUDevice, type>); + +TF_CALL_NUMBER_TYPES(REGISTER_KERNEL); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T, Dims) \ + template <> \ + void Bias<GPUDevice, T, Dims>::operator()( \ + const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \ + typename TTypes<T>::ConstVec bias, \ + typename TTypes<T, Dims>::Tensor output); \ + extern template struct Bias<GPUDevice, T, Dims>; + +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC(T, 2); \ + DECLARE_GPU_SPEC(T, 3); \ + DECLARE_GPU_SPEC(T, 4); \ + DECLARE_GPU_SPEC(T, 5); + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("BiasAdd").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + BiasOp<GPUDevice, type>); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL); + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/bias_op.h b/tensorflow/core/kernels/bias_op.h new file mode 100644 index 0000000000..513406d251 --- /dev/null +++ b/tensorflow/core/kernels/bias_op.h @@ -0,0 +1,41 @@ +#ifndef TENSORFLOW_KERNELS_BIAS_OP_H_ +#define TENSORFLOW_KERNELS_BIAS_OP_H_ +// Functor definition for BiasOp, must be compilable by nvcc. + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by BiasOp to do the computations. +template <typename Device, typename T, int Dims> +struct Bias { + // Add "bias" to "input", broadcasting it on all dimensions but the last one. + void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input, + typename TTypes<T>::ConstVec bias, + typename TTypes<T, Dims>::Tensor output) { + const int bias_size = bias.dimension(0); + const int rest_size = input.size() / bias_size; + + Eigen::DSizes<int, 2> rest_by_bias(rest_size, bias_size); +#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::DSizes<int, 2> rest_by_one(rest_size, 1); + Eigen::DSizes<int, 2> one_by_bias(1, bias_size); +#else + Eigen::IndexList<int, Eigen::type2index<1> > rest_by_one; + rest_by_one.set(0, rest_size); + Eigen::IndexList<Eigen::type2index<1>, int> one_by_bias; + one_by_bias.set(1, bias_size); +#endif + + output.reshape(rest_by_bias).device(d) = + input.reshape(rest_by_bias) + + bias.reshape(one_by_bias).broadcast(rest_by_one); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_BIAS_OP_H_ diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc new file mode 100644 index 0000000000..d3377b3ce8 --- /dev/null +++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc @@ -0,0 +1,23 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/bias_op.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// Definition of the GPU implementations declared in bias_op.cc. 
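The Bias functor above performs the addition by viewing the input as a [rest_size, channels] matrix, with channels the trailing dimension, and broadcasting the bias across the rows. A plain-loop equivalent of that computation, as a sketch with illustrative names:

#include <vector>

// out[i, c] = in[i, c] + bias[c], where an input of any rank >= 2 has been
// flattened to rest_size rows of `channels` elements (channels = last dim).
void BiasAddReference(const std::vector<float>& in,
                      const std::vector<float>& bias, int rest_size,
                      int channels, std::vector<float>* out) {
  out->resize(in.size());
  for (int i = 0; i < rest_size; ++i) {
    for (int c = 0; c < channels; ++c) {
      (*out)[i * channels + c] = in[i * channels + c] + bias[c];
    }
  }
}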
+#define DEFINE_GPU_SPECS(T) \ + template struct functor::Bias<GPUDevice, T, 2>; \ + template struct functor::Bias<GPUDevice, T, 3>; \ + template struct functor::Bias<GPUDevice, T, 4>; \ + template struct functor::Bias<GPUDevice, T, 5>; + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc new file mode 100644 index 0000000000..cd5fde37a6 --- /dev/null +++ b/tensorflow/core/kernels/candidate_sampler_ops.cc @@ -0,0 +1,243 @@ +// See docs in ../ops/candidate_sampling_ops.cc. + +#define EIGEN_USE_THREADS + +#include <cfloat> +#include <unordered_map> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/range_sampler.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/util/guarded_philox_random.h" + +namespace tensorflow { + +class BaseCandidateSamplerOp : public OpKernel { + public: + explicit BaseCandidateSamplerOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("num_sampled", &num_sampled_)); + OP_REQUIRES_OK(context, context->GetAttr("num_true", &num_true_)); + OP_REQUIRES_OK(context, context->GetAttr("unique", &unique_)); + OP_REQUIRES_OK(context, generator_.Init(context)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& true_classes = context->input(0); + OP_REQUIRES(context, true_classes.dims() == 2, + errors::InvalidArgument("true_classes must be a matrix")); + const int32 batch_size = true_classes.dim_size(0); + OP_REQUIRES(context, true_classes.dim_size(1) == num_true_, + errors::InvalidArgument("true_classes must have " + "num_true columns")); + + // Output candidates and expected_count. + Tensor* out_sampled_candidates = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({num_sampled_}), + &out_sampled_candidates)); + + Tensor* out_true_expected_count = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 1, TensorShape({batch_size, num_true_}), + &out_true_expected_count)); + Tensor* out_sampled_expected_count = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(2, TensorShape({num_sampled_}), + &out_sampled_expected_count)); + + gtl::ArraySlice<int64> true_candidate(true_classes.matrix<int64>().data(), + batch_size * num_true_); + gtl::MutableArraySlice<int64> sampled_candidate( + out_sampled_candidates->vec<int64>().data(), num_sampled_); + gtl::MutableArraySlice<float> true_expected_count( + out_true_expected_count->matrix<float>().data(), + batch_size * num_true_); + gtl::MutableArraySlice<float> sampled_expected_count( + out_sampled_expected_count->vec<float>().data(), num_sampled_); + + CHECK(sampler_) << "CandidateSamplerOp did not set sampler_"; + + // Approximately conservatively estimate the number of samples required. + // In cases where rejection sampling is used we may occasionally use more + // samples than expected, which will result in reused random bits. + const int64 samples32 = 2048 * num_sampled_; + + // Pick sampled candidates. 
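+ // ReserveSamples32 skips the shared Philox generator ahead by `samples32`
+ // 32-bit samples and hands back a private copy, so concurrent executions of
+ // this op draw from (approximately) disjoint parts of the random stream.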
+ auto local_gen = generator_.ReserveSamples32(samples32); + random::SimplePhilox random(&local_gen); + sampler_->SampleBatchGetExpectedCount(&random, unique_, &sampled_candidate, + &sampled_expected_count, + true_candidate, &true_expected_count); + + if (sampler_->NeedsUpdates()) { + sampler_->Update(true_candidate); + } + } + + protected: + void set_sampler(RangeSampler* sampler) { sampler_.reset(sampler); } + + private: + int32 num_true_; + int32 num_sampled_; + bool unique_; + std::unique_ptr<RangeSampler> sampler_; + GuardedPhiloxRandom generator_; +}; + +template <class RangeSamplerType> +class SimpleCandidateSamplerOp : public BaseCandidateSamplerOp { + public: + explicit SimpleCandidateSamplerOp(OpKernelConstruction* context) + : BaseCandidateSamplerOp(context) { + int64 range_max; + OP_REQUIRES_OK(context, context->GetAttr("range_max", &range_max)); + set_sampler(new RangeSamplerType(range_max)); + } +}; + +REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU), + SimpleCandidateSamplerOp<UniformSampler>); + +REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU), + SimpleCandidateSamplerOp<LogUniformSampler>); + +REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler") + .Device(DEVICE_CPU), + SimpleCandidateSamplerOp<UnigramSampler>); + +REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler") + .Device(DEVICE_CPU), + SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>); + +class AllCandidateSamplerOp : public BaseCandidateSamplerOp { + public: + explicit AllCandidateSamplerOp(OpKernelConstruction* context) + : BaseCandidateSamplerOp(context) { + int64 range_max; + OP_REQUIRES_OK(context, context->GetAttr("num_sampled", &range_max)); + set_sampler(new AllSampler(range_max)); + } +}; + +REGISTER_KERNEL_BUILDER(Name("AllCandidateSampler").Device(DEVICE_CPU), + AllCandidateSamplerOp); + +class FixedUnigramCandidateSamplerOp : public BaseCandidateSamplerOp { + public: + explicit FixedUnigramCandidateSamplerOp(OpKernelConstruction* context) + : BaseCandidateSamplerOp(context) { + int64 range_max; + OP_REQUIRES_OK(context, context->GetAttr("range_max", &range_max)); + string vocab_file; + OP_REQUIRES_OK(context, context->GetAttr("vocab_file", &vocab_file)); + std::vector<float> unigrams; + OP_REQUIRES_OK(context, context->GetAttr("unigrams", &unigrams)); + OP_REQUIRES( + context, !vocab_file.empty() || !unigrams.empty(), + errors::InvalidArgument("Must provide either vocab_file or unigrams.")); + OP_REQUIRES(context, vocab_file.empty() || unigrams.empty(), + errors::InvalidArgument( + "Must only provide one of vocab_file and unigrams.")); + float distortion; + OP_REQUIRES_OK(context, context->GetAttr("distortion", &distortion)); + int64 num_reserved_ids; + OP_REQUIRES_OK(context, + context->GetAttr("num_reserved_ids", &num_reserved_ids)); + int64 num_shards; + OP_REQUIRES_OK(context, context->GetAttr("num_shards", &num_shards)); + int64 shard; + OP_REQUIRES_OK(context, context->GetAttr("shard", &shard)); + + if (!vocab_file.empty()) { + set_sampler(new FixedUnigramSampler(context->env(), range_max, vocab_file, + distortion, num_reserved_ids, + num_shards, shard)); + } else { + set_sampler(new FixedUnigramSampler(range_max, unigrams, distortion, + num_reserved_ids, num_shards, shard)); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("FixedUnigramCandidateSampler").Device(DEVICE_CPU), + FixedUnigramCandidateSamplerOp); + +class ComputeAccidentalHitsOp : public OpKernel { + public: + explicit 
ComputeAccidentalHitsOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("num_true", &num_true_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& in_true_candidates = context->input(0); + TensorShape in_true_candidates_shape = in_true_candidates.shape(); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) && + in_true_candidates_shape.dim_size(1) == num_true_, + errors::InvalidArgument( + "true_candidates must be a batch_size * num_true matrix")); + + const int64 batch_size = in_true_candidates_shape.dim_size(0); + + const Tensor& in_sampled_candidates = context->input(1); + OP_REQUIRES(context, + TensorShapeUtils::IsVector(in_sampled_candidates.shape()), + errors::InvalidArgument( + "sampled_candidates must be a vector, which is typically " + "an output from CandidateSampler")); + + std::unordered_map<int64, int> sampled_candidate_to_pos; + for (int64 i = 0; i < in_sampled_candidates.dim_size(0); ++i) { + sampled_candidate_to_pos[in_sampled_candidates.vec<int64>()(i)] = i; + } + + // Produce output in the same format as UnpackSparseFeatures. + std::vector<int> indices; + std::vector<int64> ids; + std::vector<float> weights; + + for (int64 i = 0; i < batch_size; ++i) { + for (int64 j = 0; j < num_true_; ++j) { + const int64 true_candidate = in_true_candidates.matrix<int64>()(i, j); + const auto look = sampled_candidate_to_pos.find(true_candidate); + if (look != sampled_candidate_to_pos.end()) { + indices.push_back(i); + ids.push_back(look->second); + weights.push_back(-FLT_MAX); + } + } + } + + Tensor* out_indices = nullptr; + OP_REQUIRES_OK( + context, + context->allocate_output( + 0, TensorShape({static_cast<int>(indices.size())}), &out_indices)); + Tensor* out_ids = nullptr; + OP_REQUIRES_OK( + context, context->allocate_output( + 1, TensorShape({static_cast<int>(ids.size())}), &out_ids)); + Tensor* out_weights = nullptr; + OP_REQUIRES_OK( + context, + context->allocate_output( + 2, TensorShape({static_cast<int>(weights.size())}), &out_weights)); + + for (size_t i = 0; i < indices.size(); ++i) { + out_indices->vec<int32>()(i) = indices[i]; + out_ids->vec<int64>()(i) = ids[i]; + out_weights->vec<float>()(i) = weights[i]; + } + } + + private: + int64 num_true_; +}; + +REGISTER_KERNEL_BUILDER(Name("ComputeAccidentalHits").Device(DEVICE_CPU), + ComputeAccidentalHitsOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc new file mode 100644 index 0000000000..779ac57b6a --- /dev/null +++ b/tensorflow/core/kernels/cast_op.cc @@ -0,0 +1,233 @@ +// See docs in ../ops/math_ops.cc. 
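The ComputeAccidentalHitsOp defined above scans each row of true candidates for values that also appear in sampled_candidates and emits, per hit, the row index, the position of the hit inside sampled_candidates, and a weight of -FLT_MAX that callers can add to the corresponding logits to mask them out. A small standalone sketch of the same matching, with invented names and data:

#include <cfloat>
#include <cstdint>
#include <unordered_map>
#include <vector>

struct AccidentalHit { int32_t index; int64_t id; float weight; };

std::vector<AccidentalHit> FindAccidentalHits(
    const std::vector<std::vector<int64_t>>& true_candidates,  // [batch][num_true]
    const std::vector<int64_t>& sampled) {
  std::unordered_map<int64_t, int> pos;  // candidate -> position in `sampled`
  for (int i = 0; i < static_cast<int>(sampled.size()); ++i) pos[sampled[i]] = i;

  std::vector<AccidentalHit> hits;
  for (int row = 0; row < static_cast<int>(true_candidates.size()); ++row) {
    for (int64_t c : true_candidates[row]) {
      auto it = pos.find(c);
      if (it != pos.end()) hits.push_back({row, it->second, -FLT_MAX});
    }
  }
  return hits;
}
// FindAccidentalHits({{7, 3}, {9, 4}}, {4, 7, 11}) yields
// {row 0, position 1, -FLT_MAX} and {row 1, position 0, -FLT_MAX}.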
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/cast_op.h" + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template <typename Device, typename Tout, typename Tin> +void CastMaybeInline(const Device& d, typename TTypes<Tout>::Flat o, + typename TTypes<Tin>::ConstFlat i) { + if (o.size() * (sizeof(Tin) + sizeof(Tout)) < 131072) { + // Small cast on a CPU: do inline + o = i.template cast<Tout>(); + } else { + o.device(d) = i.template cast<Tout>(); + } +} + +template <typename O, typename I> +struct CastFunctor<CPUDevice, O, I> { + void operator()(const CPUDevice& d, typename TTypes<O>::Flat o, + typename TTypes<I>::ConstFlat i) { + CastMaybeInline<CPUDevice, O, I>(d, o, i); + } +}; + +} // namespace functor + +#define CAST_CASE(DEVICE, IN, OUT) \ + if (DataTypeToEnum<IN>::value == src_dtype_ && \ + DataTypeToEnum<OUT>::value == dst_dtype_) { \ + work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) { \ + functor::CastFunctor<DEVICE, OUT, IN> func; \ + func(ctx->eigen_device<DEVICE>(), out->flat<OUT>(), inp.flat<IN>()); \ + }; \ + return Status::OK(); \ + } + +class CastOpBase : public OpKernel { + public: + explicit CastOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("SrcT", &src_dtype_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("DstT", &dst_dtype_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& inp = ctx->input(0); + if (work_ == nullptr) { + ctx->set_output(0, inp); + } else { + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out)); + work_(ctx, inp, out); + } + } + + protected: + DataType src_dtype_; + DataType dst_dtype_; + std::function<void(OpKernelContext*, const Tensor&, Tensor*)> work_ = nullptr; + + virtual Status Prepare() = 0; + Status Unimplemented() { + return errors::Unimplemented("Cast ", DataTypeString(src_dtype_), " to ", + DataTypeString(dst_dtype_), + " is not supported"); + } + + TF_DISALLOW_COPY_AND_ASSIGN(CastOpBase); +}; + +class CpuCastOp : public CastOpBase { + public: + explicit CpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) { + OP_REQUIRES_OK(ctx, Prepare()); + } + + protected: + Status Prepare() override { + if (src_dtype_ == dst_dtype_) { + work_ = nullptr; // Identity + return Status::OK(); + } + CAST_CASE(CPUDevice, bool, float); + CAST_CASE(CPUDevice, bool, int32); + CAST_CASE(CPUDevice, bool, double); + CAST_CASE(CPUDevice, double, float); + CAST_CASE(CPUDevice, double, int32); + CAST_CASE(CPUDevice, double, int64); + CAST_CASE(CPUDevice, float, double); + CAST_CASE(CPUDevice, float, uint8); + CAST_CASE(CPUDevice, float, int32); + CAST_CASE(CPUDevice, float, int64); + CAST_CASE(CPUDevice, int32, double); + CAST_CASE(CPUDevice, int32, float); + CAST_CASE(CPUDevice, int32, uint8); + CAST_CASE(CPUDevice, int32, int64); + CAST_CASE(CPUDevice, int64, double); + CAST_CASE(CPUDevice, int64, float); + CAST_CASE(CPUDevice, int64, int32); + CAST_CASE(CPUDevice, uint8, float); + CAST_CASE(CPUDevice, uint8, int32); + CAST_CASE(CPUDevice, uint8, int64); + CAST_CASE(CPUDevice, uint8, double); + if (src_dtype_ == DT_BFLOAT16 && 
dst_dtype_ == DT_FLOAT) { + work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) { + int64 N = out->NumElements(); + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + int num_threads = + std::min<int>(std::min(4, worker_threads->num_threads), N / 4096); + if (num_threads < 1) { + BFloat16ToFloat(inp.flat<bfloat16>().data(), + out->flat<float>().data(), N); + } else { + auto work = [&inp, &out](int64 start, int64 end) { + BFloat16ToFloat(inp.flat<bfloat16>().data() + start, + out->flat<float>().data() + start, end - start); + }; + Shard(num_threads, worker_threads->workers, N, 100, work); + } + }; + return Status::OK(); + } + if (src_dtype_ == DT_FLOAT && dst_dtype_ == DT_BFLOAT16) { + work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) { + int64 N = out->NumElements(); + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + int num_threads = + std::min<int>(std::min(4, worker_threads->num_threads), N / 4096); + if (num_threads < 1) { + FloatToBFloat16(inp.flat<float>().data(), + out->flat<bfloat16>().data(), N); + } else { + auto work = [&inp, &out](int64 start, int64 end) { + FloatToBFloat16(inp.flat<float>().data() + start, + out->flat<bfloat16>().data() + start, end - start); + }; + Shard(num_threads, worker_threads->workers, N, 100, work); + } + }; + return Status::OK(); + } + return Unimplemented(); + } +}; + +class GpuCastOp : public CastOpBase { + public: + explicit GpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) { + OP_REQUIRES_OK(ctx, Prepare()); + } + + protected: + Status Prepare() override { + if (src_dtype_ == dst_dtype_) { + work_ = nullptr; // Identity + return Status::OK(); + } + CAST_CASE(GPUDevice, bfloat16, float); + CAST_CASE(GPUDevice, bool, float); + CAST_CASE(GPUDevice, double, float); + CAST_CASE(GPUDevice, double, int64); + CAST_CASE(GPUDevice, float, bfloat16); + CAST_CASE(GPUDevice, float, double); + CAST_CASE(GPUDevice, float, int64); + CAST_CASE(GPUDevice, int64, double); + CAST_CASE(GPUDevice, int64, float); + CAST_CASE(GPUDevice, uint8, float); + CAST_CASE(GPUDevice, float, uint8); + CAST_CASE(GPUDevice, bool, int32); + CAST_CASE(GPUDevice, double, int32); + CAST_CASE(GPUDevice, float, int32); + CAST_CASE(GPUDevice, int32, double); + CAST_CASE(GPUDevice, int32, float); + CAST_CASE(GPUDevice, int32, int64); + CAST_CASE(GPUDevice, int64, int32); + return Unimplemented(); + } +}; + +#undef CAST_CASE + +REGISTER_KERNEL_BUILDER(Name("Cast").Device(DEVICE_CPU), CpuCastOp); + +#if GOOGLE_CUDA +#define REGISTER_CAST_GPU(srctype, dsttype) \ + REGISTER_KERNEL_BUILDER(Name("Cast") \ + .TypeConstraint<srctype>("SrcT") \ + .TypeConstraint<dsttype>("DstT") \ + .Device(DEVICE_GPU), \ + GpuCastOp); +REGISTER_CAST_GPU(bfloat16, float); +REGISTER_CAST_GPU(bool, float); +REGISTER_CAST_GPU(double, float); +REGISTER_CAST_GPU(double, int64); +REGISTER_CAST_GPU(float, bfloat16); +REGISTER_CAST_GPU(float, double); +REGISTER_CAST_GPU(float, int64); +REGISTER_CAST_GPU(int64, double); +REGISTER_CAST_GPU(int64, float); +REGISTER_CAST_GPU(uint8, float); +REGISTER_CAST_GPU(float, uint8); +REGISTER_CAST_GPU(bool, int32); +REGISTER_CAST_GPU(double, int32); +REGISTER_CAST_GPU(float, int32); +REGISTER_CAST_GPU(int32, double); +REGISTER_CAST_GPU(int32, float); +REGISTER_CAST_GPU(int32, int64); +REGISTER_CAST_GPU(int64, int32); +#undef REGISTER_CAST_GPU +#endif // GOOGLE_CUDA + +// HostCast differs from Cast in that its input and output are in host memory. 
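The bfloat16 paths earlier in this file (the sharded CPU conversions and the bfloat16 CAST_CASEs) rely on bfloat16 being the upper sixteen bits of an IEEE float32, converted by truncation. A minimal, self-contained sketch of that bit-level conversion, assuming the FloatToBFloat16 and BFloat16ToFloat helpers behave the same way:

#include <cstdint>
#include <cstring>

// bfloat16 keeps the sign, the 8 exponent bits and the top 7 mantissa bits of
// a float32, i.e. its upper 16 bits; converting back zero-fills the discarded
// low bits. (Truncation, no rounding, matching the cast paths above.)
uint16_t FloatToBFloat16Bits(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}

float BFloat16BitsToFloat(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
// FloatToBFloat16Bits(1.0f) == 0x3F80 and BFloat16BitsToFloat(0x3F80) == 1.0f.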
+REGISTER_KERNEL_BUILDER(Name("_HostCast").Device(DEVICE_CPU), CpuCastOp); +REGISTER_KERNEL_BUILDER( + Name("_HostCast").Device(DEVICE_GPU).HostMemory("x").HostMemory("y"), + CpuCastOp); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h new file mode 100644 index 0000000000..d066206abc --- /dev/null +++ b/tensorflow/core/kernels/cast_op.h @@ -0,0 +1,71 @@ +#ifndef TENSORFLOW_KERNELS_CAST_OP_H_ +#define TENSORFLOW_KERNELS_CAST_OP_H_ + +#include "tensorflow/core/framework/bfloat16.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/port.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename Tout, typename Tin> +void Cast(const Device& d, typename TTypes<Tout>::Flat o, + typename TTypes<Tin>::ConstFlat i) { + o.device(d) = i.template cast<Tout>(); +} + +template <typename Device, typename Tout, typename Tin> +struct CastFunctor { + void operator()(const Device& d, typename TTypes<Tout>::Flat o, + typename TTypes<Tin>::ConstFlat i); +}; + +} // end namespace functor +} // end namespace tensorflow + +namespace Eigen { +namespace internal { + +// Specialized cast op impls for bfloat16. +template <> +struct scalar_cast_op< ::tensorflow::bfloat16, float> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()( + const ::tensorflow::bfloat16& a) const { + static_assert(::tensorflow::port::kLittleEndian, ""); + float ret; + uint16_t* p = reinterpret_cast<uint16_t*>(&ret); + p[0] = 0; + p[1] = a.value; + return ret; + } +}; + +template <> +struct functor_traits<scalar_cast_op< ::tensorflow::bfloat16, float> > { + enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; +}; + +template <> +struct scalar_cast_op<float, ::tensorflow::bfloat16> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef ::tensorflow::bfloat16 result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ::tensorflow::bfloat16 operator()( + const float a) const { + static_assert(::tensorflow::port::kLittleEndian, ""); + const uint16_t* p = reinterpret_cast<const uint16_t*>(&a); + return ::tensorflow::bfloat16(p[1]); + } +}; + +template <> +struct functor_traits<scalar_cast_op<float, ::tensorflow::bfloat16> > { + enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; +}; + +} // namespace internal +} // namespace Eigen + +#endif // TENSORFLOW_KERNELS_CAST_OP_H_ diff --git a/tensorflow/core/kernels/cast_op_gpu.cu.cc b/tensorflow/core/kernels/cast_op_gpu.cu.cc new file mode 100644 index 0000000000..cd198c752b --- /dev/null +++ b/tensorflow/core/kernels/cast_op_gpu.cu.cc @@ -0,0 +1,45 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/bfloat16.h" +#include "tensorflow/core/kernels/cast_op.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +template <typename O, typename I> +struct CastFunctor<GPUDevice, O, I> { + void operator()(const GPUDevice& d, typename TTypes<O>::Flat o, + typename TTypes<I>::ConstFlat i) { + Cast<GPUDevice, O, I>(d, o, i); + } +}; + +#define DEFINE(O, I) template struct CastFunctor<GPUDevice, O, I>; +DEFINE(float, double); +DEFINE(float, int32); +DEFINE(float, int64); +DEFINE(double, float); +DEFINE(double, int32); +DEFINE(double, int64); +DEFINE(int32, float); +DEFINE(int32, double); +DEFINE(int32, int64); +DEFINE(int64, float); +DEFINE(int64, 
double); +DEFINE(int64, int32); +DEFINE(int32, bool); +DEFINE(float, bool); +DEFINE(float, uint8); +DEFINE(uint8, float); +DEFINE(float, bfloat16); +DEFINE(bfloat16, float); +#undef DEFINE + +} // end namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc new file mode 100644 index 0000000000..f774fbcfe8 --- /dev/null +++ b/tensorflow/core/kernels/cast_op_test.cc @@ -0,0 +1,100 @@ +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> + +namespace tensorflow { + +template <typename Src, typename Dst> +static Graph* Cast(int num) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor data(DataTypeToEnum<Src>::value, + TensorShape({64, 64, num / (64 * 64)})); + data.flat<Src>().setRandom(); + test::graph::Cast(g, test::graph::Constant(g, data), + DataTypeToEnum<Dst>::value); + return g; +} + +class CastOpTest : public OpsTestBase { + protected: + void MakeOp(DataType src, DataType dst) { + RequireDefaultOps(); + EXPECT_OK(NodeDefBuilder("cast_op", "Cast") + .Input(FakeInput(DT_INT32)) + .Attr("SrcT", src) + .Attr("DstT", dst) + .Finalize(node_def())); + EXPECT_OK(InitOp()); + } +}; + +TEST_F(CastOpTest, Int32ToUint8) { + MakeOp(DT_INT32, DT_UINT8); + AddInputFromArray<int32>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_UINT8, TensorShape({1, 2, 2, 1})); + test::FillValues<uint8>(&expected, {1, 2, 3, 4}); + test::ExpectTensorEqual<uint8>(expected, *GetOutput(0)); +} + +static void BM_cpu_float_int64(int iters, int num) { + testing::ItemsProcessed(static_cast<int64>(iters) * num); + testing::BytesProcessed(static_cast<int64>(iters) * num * + (sizeof(float) + sizeof(int64))); + testing::UseRealTime(); + test::Benchmark("cpu", Cast<float, int64>(num)).Run(iters); +} +BENCHMARK(BM_cpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); + +static void BM_gpu_float_int64(int iters, int num) { + testing::ItemsProcessed(static_cast<int64>(iters) * num); + testing::BytesProcessed(static_cast<int64>(iters) * num * + (sizeof(float) + sizeof(int64))); + testing::UseRealTime(); + test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters); +} +BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); + +static void BM_cpu_bool_float(int iters, int num) { + testing::ItemsProcessed(static_cast<int64>(iters) * num); + testing::BytesProcessed(static_cast<int64>(iters) * num * + (sizeof(bool) + sizeof(float))); + testing::UseRealTime(); + test::Benchmark("cpu", Cast<bool, float>(num)).Run(iters); +} +BENCHMARK(BM_cpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); + +static void BM_gpu_bool_float(int iters, int num) { + testing::ItemsProcessed(static_cast<int64>(iters) * num); + testing::BytesProcessed(static_cast<int64>(iters) * num * + (sizeof(bool) + sizeof(float))); + testing::UseRealTime(); + test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters); +} +BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); + +static void BM_cpu_float_bfloat16(int iters, int num) { + testing::ItemsProcessed(static_cast<int64>(iters) * num); + 
testing::BytesProcessed(static_cast<int64>(iters) * num * + (sizeof(float) + sizeof(bfloat16))); + testing::UseRealTime(); + test::Benchmark("cpu", Cast<float, bfloat16>(num)).Run(iters); +} +BENCHMARK(BM_cpu_float_bfloat16)->Arg(64 << 10)->Arg(32 << 20); + +static void BM_cpu_bfloat16_float(int iters, int num) { + testing::ItemsProcessed(static_cast<int64>(iters) * num); + testing::BytesProcessed(static_cast<int64>(iters) * num * + (sizeof(float) + sizeof(bfloat16))); + testing::UseRealTime(); + test::Benchmark("cpu", Cast<bfloat16, float>(num)).Run(iters); +} +BENCHMARK(BM_cpu_bfloat16_float)->Arg(64 << 10)->Arg(32 << 20); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc new file mode 100644 index 0000000000..65487a303c --- /dev/null +++ b/tensorflow/core/kernels/check_numerics_op.cc @@ -0,0 +1,190 @@ +// See docs in ../ops/array_ops.cc. + +#include <math.h> +#include <algorithm> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/public/tensor.h" + +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/stream_executor/stream.h" +#endif // GOOGLE_CUDA +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#if GOOGLE_CUDA +template <typename T> +struct CheckNumericsLaunch { + void Run(const GPUDevice& d, const T* data, int size, + int abnormal_detected[2]); +}; +#endif + +namespace { + +template <typename Device, typename T> +class CheckNumericsOp; + +// Partial specialization for CPU +template <typename T> +class CheckNumericsOp<CPUDevice, T> : public OpKernel { + public: + explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) { + // message_ is used as the prefix for the assertion error message. For + // instance, this can be the name of the input op that produced the tensor. + OP_REQUIRES_OK(context, context->GetAttr("message", &message_)); + } + + void Compute(OpKernelContext* context) override { + // pass along the input to the output + context->set_output(0, context->input(0)); + + auto in = context->input(0).flat<T>(); + const T* data = in.data(); + const int size = in.size(); + // Check to see if any element of the tensor is NaN or Inf. + int fp_props = + std::accumulate(data, data + size, 0, [](const int& x, const T& y) { + int prop = std::fpclassify(y); + int result = x; + if (prop == FP_INFINITE) { + result |= kInfBit; + } else if (prop == FP_NAN) { + result |= kNaNBit; + } + return result; + }); + string status; + if ((fp_props & kInfBit) && (fp_props & kNaNBit)) { + status = "Inf and NaN"; + } else { + if (fp_props & kInfBit) { + status = "Inf"; + } + if (fp_props & kNaNBit) { + status = "NaN"; + } + } + if (!status.empty()) { + context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ", + status, " values")); + } + } + + private: + string message_; + static const int kInfBit = 0x01; + static const int kNaNBit = 0x02; +}; + +#if GOOGLE_CUDA +// Partial specialization for GPU +template <typename T> +class CheckNumericsOp<GPUDevice, T> : public OpKernel { + public: + typedef GPUDevice Device; + + explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) { + // message_ is used as the prefix for the assertion error message. For + // instance, this can be the name of the input op that produced the tensor. 
+ OP_REQUIRES_OK(context, context->GetAttr("message", &message_)); + } + + void Compute(OpKernelContext* context) override { + // pass along the input to the output + context->set_output(0, context->input(0)); + auto input = context->input(0).flat<T>(); + + // Allocate and initialize the elements to hold the check results + const int abnormal_detected_size = 2; + Tensor abnormal_detected; + OP_REQUIRES_OK(context, context->allocate_temp( + DT_INT32, TensorShape({abnormal_detected_size}), + &abnormal_detected)); + + auto* stream = context->op_device_context<GPUDeviceContext>()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + perftools::gputools::DeviceMemoryBase abnormal_detected_ptr( + abnormal_detected.flat<int>().data(), + abnormal_detected.flat<int>().size()); + stream->ThenMemset32(&abnormal_detected_ptr, 0, + abnormal_detected.flat<int>().size() * sizeof(int)); + + // Call the Cuda kernels for the numerical checks + const Device& d = context->eigen_device<Device>(); + CheckNumericsLaunch<T>().Run(d, input.data(), input.size(), + abnormal_detected.flat<int>().data()); + + // Copy the results from device to host + AllocatorAttributes attr; + attr.set_on_host(true); + attr.set_gpu_compatible(true); + Tensor abnormal_detected_out; + OP_REQUIRES_OK(context, context->allocate_temp( + DT_INT32, TensorShape({abnormal_detected_size}), + &abnormal_detected_out, attr)); + int* abnormal_detected_host = abnormal_detected_out.flat<int>().data(); + stream->ThenMemcpy(abnormal_detected_host, abnormal_detected_ptr, + abnormal_detected_size * sizeof(int)); + stream->BlockHostUntilDone(); + OP_REQUIRES(context, stream->ok(), + errors::Internal("cudaMemcpy from device to host failed")); + + int is_nan = abnormal_detected_host[0]; + int is_inf = abnormal_detected_host[1]; + if (is_nan || is_inf) { + string status; + LOG(ERROR) << "abnormal_detected_host @" << abnormal_detected_host + << " = {" << is_nan << ", " << is_inf << "} " << message_; + + // Results should always be 1 or 0. If we see anything else then + // there has been some GPU memory corruption. 
+ CHECK_GE(is_nan, 0); + CHECK_GE(is_inf, 0); + CHECK_LE(is_nan, 1); + CHECK_LE(is_inf, 1); + + if (is_nan && is_inf) { + status = "Inf and NaN"; + } else if (is_nan) { + status = "NaN"; + } else if (is_inf) { + status = "Inf"; + } + context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ", + status, " values")); + } + } + + private: + string message_; +}; +#endif // GOOGLE_CUDA + +} // namespace + +REGISTER_KERNEL_BUILDER(Name("CheckNumerics") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T"), + CheckNumericsOp<CPUDevice, float>); +REGISTER_KERNEL_BUILDER(Name("CheckNumerics") + .Device(DEVICE_CPU) + .TypeConstraint<double>("T"), + CheckNumericsOp<CPUDevice, double>); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("CheckNumerics") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T"), + CheckNumericsOp<GPUDevice, float>); +REGISTER_KERNEL_BUILDER(Name("CheckNumerics") + .Device(DEVICE_GPU) + .TypeConstraint<double>("T"), + CheckNumericsOp<GPUDevice, double>); +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc new file mode 100644 index 0000000000..cb84f98731 --- /dev/null +++ b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc @@ -0,0 +1,62 @@ +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include <stdio.h> +#include <assert.h> + +#include <math.h> +#include <algorithm> + +#include "tensorflow/core/platform/port.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +namespace { + +typedef Eigen::GpuDevice GPUDevice; + +// A Cuda kernel to check if each element is Inf or Nan. If any exists, the +// relevant elements in abnormal_detected will be set +template <typename T> +__global__ void CheckNumericsKernel(const T *data, int size, + int abnormal_detected[2]) { + const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32 total_thread_count = gridDim.x * blockDim.x; + + int32 offset = thread_id; + + while (offset < size) { + if (isnan(data[offset])) { + abnormal_detected[0] = 1; + } + if (isinf(data[offset])) { + abnormal_detected[1] = 1; + } + offset += total_thread_count; + } +} + +} // namespace + +// A simple launch pad to launch the Cuda kernels that checks the numerical +// abnormality in the given array +template <typename T> +struct CheckNumericsLaunch { + void Run(const GPUDevice &d, const T *data, int size, + int abnormal_detected[2]) { + const int32 block_size = d.maxCudaThreadsPerBlock(); + const int32 num_blocks = + (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) / + block_size; + + CheckNumericsKernel<T><<<num_blocks, block_size, 0, d.stream()>>>( + data, size, abnormal_detected); + } +}; + +template struct CheckNumericsLaunch<float>; +template struct CheckNumericsLaunch<double>; + +} // namespace tensorflow +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cholesky_op.cc b/tensorflow/core/kernels/cholesky_op.cc new file mode 100644 index 0000000000..12632fb248 --- /dev/null +++ b/tensorflow/core/kernels/cholesky_op.cc @@ -0,0 +1,71 @@ +// See docs in ../ops/linalg_ops.cc. +// TODO(konstantinos): Enable complex inputs. This will require additional tests +// and OP_REQUIRES. 
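The kernel below hands the actual factorization to Eigen's LLT. As a point of reference, here is a minimal standalone Eigen sketch of the same lower-triangular LL^T decomposition, using a made-up 2x2 symmetric positive definite matrix (not TensorFlow code):

#include <Eigen/Cholesky>
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix2d a;
  a << 4.0, 2.0,
       2.0, 3.0;                       // symmetric positive definite
  Eigen::LLT<Eigen::Matrix2d> llt(a);  // reads only the lower triangle of 'a'
  if (llt.info() != Eigen::Success) {
    std::cerr << "input was not positive definite\n";
    return 1;
  }
  const Eigen::Matrix2d l = llt.matrixL();  // dense copy of L, zeros above the diagonal
  std::cout << l << "\n";                   // [2 0; 1 sqrt(2)], and l * l.transpose() == a
  return 0;
}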
+ +#include <cmath> + +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/Eigen/Cholesky" + +namespace tensorflow { + +template <class Scalar, bool SupportsBatchOperationT> +class CholeskyOp : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> { + public: + explicit CholeskyOp(OpKernelConstruction* context) + : LinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {} + + TensorShape GetOutputMatrixShape( + const TensorShape& input_matrix_shape) override { + return input_matrix_shape; + } + + int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override { + const int64 rows = input_matrix_shape.dim_size(0); + if (rows > (1LL << 20)) { + // A big number to cap the cost in case overflow. + return kint32max; + } else { + return rows * rows * rows; + } + } + + using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap; + using + typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap; + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input, + MatrixMap* output) override { + OP_REQUIRES(context, input.rows() == input.cols(), + errors::InvalidArgument("Input matrix must be square.")); + if (input.rows() == 0) { + // If X is an empty matrix (0 rows, 0 col), X * X' == X. + // Therefore, we return X. + return; + } + // Perform the actual LL^T Cholesky decomposition. This will only use + // the lower triangular part of data_in by default. The upper triangular + // part of the matrix will not be read. + Eigen::LLT<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> llt_decomposition(input); + + // Output the lower triangular in a dense form. + *output = llt_decomposition.matrixL(); + + OP_REQUIRES(context, llt_decomposition.info() == Eigen::Success, + errors::InvalidArgument("LLT decomposition was not successful. " + "The input might not be valid.")); + } +}; + +REGISTER_LINALG_OP("Cholesky", (CholeskyOp<float, false>), float); +REGISTER_LINALG_OP("Cholesky", (CholeskyOp<double, false>), double); +REGISTER_LINALG_OP("BatchCholesky", (CholeskyOp<float, true>), float); +REGISTER_LINALG_OP("BatchCholesky", (CholeskyOp<double, true>), double); +} // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc new file mode 100644 index 0000000000..b68fcec515 --- /dev/null +++ b/tensorflow/core/kernels/concat_op.cc @@ -0,0 +1,153 @@ +// See docs in ../ops/array_ops.cc. 
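ConcatOp::Compute below reduces an n-dimensional concat to a single two-dimensional one, as its inline comment explains. A concrete worked example of that flattening: concatenating tensors of shapes [2, 3, 4] and [2, 5, 4] along dimension 1 gives inputs_flat_dim0 = 2 (the product of the dimensions before the concat dimension) and per-input flat widths of 3 * 4 = 12 and 5 * 4 = 20, so the inputs are viewed as matrices of shapes [2, 12] and [2, 20], the output is filled as a [2, 32] matrix, and that buffer is then reinterpreted as the final [2, 8, 4] tensor.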
+ +#include <vector> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/concat_op.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// -------------------------------------------------------------------------- +template <typename Device, typename T> +class ConcatOp : public OpKernel { + public: + typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>> + ConstMatrixVector; + + explicit ConcatOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + const Tensor* concat_dim_tensor; + OP_REQUIRES_OK(c, c->input("concat_dim", &concat_dim_tensor)); + OP_REQUIRES( + c, TensorShapeUtils::IsLegacyScalar(concat_dim_tensor->shape()), + errors::InvalidArgument( + "Concat dim tensor should be a scalar integer, but got shape ", + concat_dim_tensor->shape().DebugString())); + const int32 concat_dim = concat_dim_tensor->scalar<int32>()(); + OpInputList values; + OP_REQUIRES_OK(c, c->input_list("values", &values)); + const int N = values.size(); + const int input_dims = values[0].dims(); + const TensorShape& input_shape = values[0].shape(); + OP_REQUIRES( + c, (0 <= concat_dim && concat_dim < input_dims) || + (kAllowLegacyScalars && concat_dim == 0), + errors::InvalidArgument( + "ConcatOp : Expected concatenating dimensions in the range [", 0, + ", ", input_dims, "), but got ", concat_dim)); + + // Note that we reduce the concat of n-dimensional tensors into a two + // dimensional concat. Assuming the dimensions of any input/output + // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along + // the dimension indicated with size y0, we flatten it to {x, y}, where y = + // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1). + ConstMatrixVector inputs_flat; + inputs_flat.reserve(N); + int64 inputs_flat_dim0 = 1; + for (int d = 0; d < concat_dim; ++d) { + inputs_flat_dim0 *= input_shape.dim_size(d); + } + int output_concat_dim = 0; + const bool input_is_scalar = TensorShapeUtils::IsLegacyScalar(input_shape); + for (int i = 0; i < N; ++i) { + const auto in = values[i]; + const bool in_is_scalar = TensorShapeUtils::IsLegacyScalar(in.shape()); + OP_REQUIRES( + c, in.dims() == input_dims || (input_is_scalar && in_is_scalar), + errors::InvalidArgument( + "ConcatOp : Ranks of all input tensors should match: shape[0] = ", + input_shape.ShortDebugString(), " vs. shape[", i, "] = ", + in.shape().ShortDebugString())); + for (int j = 0; j < input_dims; ++j) { + if (j == concat_dim) { + continue; + } + OP_REQUIRES( + c, in.dim_size(j) == input_shape.dim_size(j), + errors::InvalidArgument( + "ConcatOp : Dimensions of inputs should match: shape[0] = ", + input_shape.ShortDebugString(), " vs. shape[", i, "] = ", + in.shape().ShortDebugString())); + } + if (in.NumElements() > 0) { + int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; + inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix( + in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1}))); + } + // TODO(irving): Remove check once !kAllowLegacyScalars + output_concat_dim += in.dims() > 0 ? 
in.dim_size(concat_dim) : 1; + } + + TensorShape output_shape(input_shape); + // TODO(irving): Remove rank 0 case once !kAllowLegacyScalars + if (output_shape.dims() == 0) { + output_shape.AddDim(output_concat_dim); + } else { + output_shape.set_dim(concat_dim, output_concat_dim); + } + Tensor* output = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output)); + if (output->NumElements() > 0) { + int64 output_dim1 = output->NumElements() / inputs_flat_dim0; + auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1}); + if (std::is_same<Device, GPUDevice>::value) { + ConcatGPU<T>(c->eigen_gpu_device(), inputs_flat, &output_flat); + } else { + ConcatCPU<T>(c->device(), inputs_flat, &output_flat); + } + } + } +}; + +#define REGISTER_CONCAT(type) \ + REGISTER_KERNEL_BUILDER(Name("Concat") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("concat_dim"), \ + ConcatOp<CPUDevice, type>) + +TF_CALL_ALL_TYPES(REGISTER_CONCAT); +REGISTER_CONCAT(quint8); +REGISTER_CONCAT(qint8); +REGISTER_CONCAT(qint32); +REGISTER_CONCAT(bfloat16); + +#undef REGISTER_CONCAT + +#if GOOGLE_CUDA + +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER(Name("Concat") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("concat_dim"), \ + ConcatOp<GPUDevice, type>) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +#undef REGISTER_GPU + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Concat") + .Device(DEVICE_GPU) + .TypeConstraint<int32>("T") + .HostMemory("concat_dim") + .HostMemory("values") + .HostMemory("output"), + ConcatOp<CPUDevice, int32>); + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op.h b/tensorflow/core/kernels/concat_op.h new file mode 100644 index 0000000000..664e55080d --- /dev/null +++ b/tensorflow/core/kernels/concat_op.h @@ -0,0 +1,27 @@ +#ifndef TENSORFLOW_KERNELS_CONCAT_OP_H_ +#define TENSORFLOW_KERNELS_CONCAT_OP_H_ + +#include <vector> + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/device_base.h" + +namespace tensorflow { + +// Assumes all inputs are nonempty +template <typename T> +void ConcatCPU(DeviceBase* d, + const std::vector< + std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, + typename TTypes<T, 2>::Matrix* output); + +// Assumes all inputs are nonempty +template <typename T> +void ConcatGPU(const Eigen::GpuDevice& d, + const std::vector< + std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, + typename TTypes<T, 2>::Matrix* output); + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_CONCAT_OP_H_ diff --git a/tensorflow/core/kernels/concat_op_cpu.cc b/tensorflow/core/kernels/concat_op_cpu.cc new file mode 100644 index 0000000000..679a53721c --- /dev/null +++ b/tensorflow/core/kernels/concat_op_cpu.cc @@ -0,0 +1,122 @@ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/concat_op.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +template <typename T> +static inline void Copy(T* dst, const T* src, int n) { + if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) { + memcpy(dst, src, n * sizeof(T)); + } else { + for (int k = 0; k < n; ++k) { + *dst++ = *src++; + } + } +} + +template <typename T> +void ConcatCPU(DeviceBase* d, + const 
std::vector< + std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, + typename TTypes<T, 2>::Matrix* output) { + int num_inputs = inputs.size(); + std::vector<ptrdiff_t> sizes; + sizes.reserve(num_inputs); + int row_size = 0; + for (int j = 0; j < num_inputs; ++j) { + sizes.push_back(inputs[j]->dimension(1)); + row_size += sizes.back(); + } + + auto worker_threads = d->tensorflow_cpu_worker_threads(); + int num_threads = std::min<int>(std::min(4, worker_threads->num_threads), + output->size() / 4096); + // Single threaded mode. + if (num_threads == 0) { + T* out = &(*output)(0, 0); + std::vector<const T*> inp; + inp.reserve(num_inputs); + for (int j = 0; j < num_inputs; ++j) { + inp.push_back(&(*inputs[j])(0, 0)); + } + const int dim0 = output->dimension(0); + for (int i = 0; i < dim0; ++i) { + for (int j = 0; j < num_inputs; ++j) { + auto size = sizes[j]; + Copy(out, inp[j], size); + out += size; + inp[j] += size; + } + } + return; + } + + // Sharded mode. + auto work = [&row_size, &sizes, &inputs, &output, &num_inputs](int64 start, + int64 end) { + int64 skipped_rows = start / row_size; + T* out = output->data() + skipped_rows * row_size; + T* out_start = output->data() + start; + T* out_end = output->data() + end; + + // Handle partial row at start + if (out < out_start) { + for (int j = 0; j < num_inputs; ++j) { + ptrdiff_t size = sizes[j]; + ptrdiff_t offset = out_start - out; + if (size <= offset) { + out += size; + continue; + } + const T* inp = &(*inputs[j])(skipped_rows, 0); + if (offset > 0) { + out += offset; + inp += offset; + size -= offset; + } + size = std::min(size, out_end - out); + if (size <= 0) break; + Copy(out, inp, size); + out += size; + } + ++skipped_rows; + } + if (out == out_end) return; + CHECK(out >= out_start); + CHECK(out < out_end); + + // Copy remaining data. 
+ std::vector<const T*> inp; + inp.reserve(num_inputs); + for (int j = 0; j < num_inputs; ++j) { + inp.push_back(&(*inputs[j])(skipped_rows, 0)); + } + const int dim0 = output->dimension(0); + for (int i = skipped_rows; i < dim0; ++i) { + for (int j = 0; j < num_inputs; ++j) { + ptrdiff_t size = std::min(sizes[j], out_end - out); + Copy(out, inp[j], size); + out += size; + inp[j] += size; + if (out == out_end) return; + } + } + }; + Shard(num_threads, worker_threads->workers, output->size(), 100, work); +} + +#define REGISTER(T) \ + template void ConcatCPU<T>( \ + DeviceBase*, \ + const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \ + typename TTypes<T, 2>::Matrix* output); +TF_CALL_ALL_TYPES(REGISTER) +REGISTER(quint8) +REGISTER(qint8) +REGISTER(qint32) +REGISTER(bfloat16) + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/concat_op_gpu.cu.cc b/tensorflow/core/kernels/concat_op_gpu.cu.cc new file mode 100644 index 0000000000..d8ce6bd85d --- /dev/null +++ b/tensorflow/core/kernels/concat_op_gpu.cu.cc @@ -0,0 +1,41 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> + +#include <memory> + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +template <typename T> +void ConcatGPU(const GPUDevice& d, + const std::vector< + std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs, + typename TTypes<T, 2>::Matrix* output) { + Eigen::array<ptrdiff_t, 2> offset(0, 0); + for (int i = 0; i < inputs.size(); ++i) { + Eigen::array<ptrdiff_t, 2> size = inputs[i]->dimensions(); + output->slice(offset, size).device(d) = *inputs[i]; + offset[1] += size[1]; + } +} + +#define REGISTER_GPU(T) \ + template void ConcatGPU<T>( \ + const GPUDevice& d, \ + const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& \ + inputs, \ + typename TTypes<T, 2>::Matrix* output); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +#undef REGISTER_GPU + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc new file mode 100644 index 0000000000..4ccc5b5b19 --- /dev/null +++ b/tensorflow/core/kernels/concat_op_test.cc @@ -0,0 +1,240 @@ +#include <functional> +#include <memory> +#include <vector> + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim' +// in size, and concat them together along "concat_dimension" +template <typename T> +static void ConcatHelper(int iters, int concat_dimension, int dim2) { + testing::StopTiming(); + RequireDefaultOps(); + Graph* g = new Graph(OpRegistry::Global()); + + DataType dt = DataTypeToEnum<T>::v(); + const int kDim1 = 100; + Tensor concat_dim(DT_INT32, TensorShape({})); + concat_dim.scalar<int32>()() = concat_dimension; 
+ Tensor in0(dt, TensorShape({kDim1, dim2})); + in0.flat<T>().setRandom(); + Tensor in1(dt, TensorShape({kDim1, dim2})); + in1.flat<T>().setRandom(); + + Node* node; + TF_CHECK_OK( + NodeBuilder(g->NewName("n"), "Concat") + .Input(test::graph::Constant(g, concat_dim)) + .Input({test::graph::Constant(g, in0), test::graph::Constant(g, in1)}) + .Attr("N", 2) + .Attr("T", dt) + .Finalize(g, &node)); + + testing::BytesProcessed(static_cast<int64>(iters) * + ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T)); + testing::StartTiming(); + test::Benchmark("cpu", g).Run(iters); + testing::UseRealTime(); +} + +static void BM_ConcatDim0Float(int iters, int dim2) { + ConcatHelper<float>(iters, 0, dim2); +} + +static void BM_ConcatDim1Float(int iters, int dim2) { + ConcatHelper<float>(iters, 1, dim2); +} + +BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000); +BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000); + +static void BM_ConcatDim1int16(int iters, int dim2) { + ConcatHelper<int16>(iters, 1, dim2); +} +static void BM_ConcatDim1bfloat16(int iters, int dim2) { + ConcatHelper<bfloat16>(iters, 1, dim2); +} + +BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000); +BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000); + +template <typename T> +static void ConcatManyHelper(int iters, int concat_dimension, int dim2) { + testing::StopTiming(); + RequireDefaultOps(); + Graph* g = new Graph(OpRegistry::Global()); + + DataType dt = DataTypeToEnum<T>::v(); + const int kDim1 = 40000; + const int kNumInputs = 64; + Tensor concat_dim(DT_INT32, TensorShape({})); + concat_dim.scalar<int32>()() = concat_dimension; + std::vector<NodeBuilder::NodeOut> inputs; + inputs.reserve(kNumInputs); + for (int i = 0; i < kNumInputs; ++i) { + Tensor in(dt, TensorShape({kDim1, dim2})); + in.flat<T>().setRandom(); + inputs.push_back(test::graph::Constant(g, in)); + } + + Node* node; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Concat") + .Input(test::graph::Constant(g, concat_dim)) + .Input(inputs) + .Attr("N", 64) + .Attr("T", dt) + .Finalize(g, &node)); + testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 * + kNumInputs * sizeof(T)); + testing::StartTiming(); + test::Benchmark("cpu", g).Run(iters); + testing::UseRealTime(); +} + +static void BM_ConcatManyDim1bfloat16(int iters, int dim2) { + ConcatManyHelper<bfloat16>(iters, 1, dim2); +} + +BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60); + +static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) { + testing::StopTiming(); + + const int kDim1 = 100; + std::vector<float> data1(kDim1 * dim2, 1.0f); + std::vector<float> data2(kDim1 * dim2, 2.0f); + + testing::BytesProcessed(static_cast<int64>(iters) * + ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float)); + testing::StartTiming(); + while (--iters > 0) { + const int n0 = data1.size(); + const int n1 = data2.size(); + float* result = new float[n0 + n1]; + memcpy(&result[0], &data1[0], n0 * sizeof(float)); + memcpy(&result[n0], &data2[0], n1 * sizeof(float)); + delete[] result; + } +} + +static void BM_MemcpyAlternativeDim0(int iters, int dim2) { + MemcpyAlternativeHelper(iters, 0, dim2); +} +static void BM_MemcpyAlternativeDim1(int iters, int dim2) { + MemcpyAlternativeHelper(iters, 1, dim2); +} + +BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000); +BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); + +typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>, + 
Eigen::Unaligned> EigenMap; +static void MemcpyManyAlternative1(int iters, int dim2) { + testing::StopTiming(); + + const int kDim1 = 40000; + const int kNumCopies = 64; + const int size = kDim1 * dim2 * kNumCopies; + bfloat16* data = new bfloat16[size]; + EigenMap map(data, size); + map.setRandom(); + + testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 * + kNumCopies * sizeof(bfloat16)); + testing::StartTiming(); + while (iters-- > 0) { + std::vector<bfloat16*> inputs(kNumCopies); + for (int i = 0; i < kNumCopies; ++i) { + inputs[i] = &data[i * kDim1 * dim2]; + } + bfloat16* result = new bfloat16[size]; + for (int j = 0; j < kNumCopies; ++j) { + bfloat16* output = &result[j * dim2]; + for (int i = 0; i < kDim1; ++i) { + if (i + 1 < kDim1) { + port::prefetch<port::PREFETCH_HINT_T0>(inputs[j] + dim2); + } + memcpy(output, inputs[j], dim2 * sizeof(bfloat16)); + inputs[j] += dim2; + output += dim2 * kNumCopies; + } + } + delete[] result; + } + delete[] data; +} + +static void MemcpyManyAlternative2(int iters, int dim2) { + testing::StopTiming(); + + const int kDim1 = 40000; + const int kNumCopies = 64; + const int size = kDim1 * dim2 * kNumCopies; + bfloat16* data = new bfloat16[size]; + EigenMap map(data, size); + map.setRandom(); + + testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 * + kNumCopies * sizeof(bfloat16)); + testing::StartTiming(); + std::vector<bfloat16*> inputs(kNumCopies); + while (--iters > 0) { + bfloat16* result = new bfloat16[size]; + for (int i = 0; i < kNumCopies; ++i) { + inputs[i] = &data[i * kDim1 * dim2]; + } + bfloat16* output = result; + for (int i = 0; i < kDim1; ++i) { + for (int j = 0; j < kNumCopies; ++j) { + if (j + 1 < kNumCopies) { + port::prefetch<port::PREFETCH_HINT_T0>(inputs[j + 1]); + } + memcpy(output, inputs[j], dim2 * sizeof(bfloat16)); + inputs[j] += dim2; + output += dim2; + } + } + delete[] result; + } + delete[] data; +} + +BENCHMARK(MemcpyManyAlternative1) + ->Arg(16) + ->Arg(17) + ->Arg(18) + ->Arg(32) + ->Arg(33) + ->Arg(34) + ->Arg(60) + ->Arg(64) + ->Arg(65); + +BENCHMARK(MemcpyManyAlternative2) + ->Arg(16) + ->Arg(17) + ->Arg(18) + ->Arg(32) + ->Arg(33) + ->Arg(34) + ->Arg(60) + ->Arg(64) + ->Arg(65); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc new file mode 100644 index 0000000000..281bafd3df --- /dev/null +++ b/tensorflow/core/kernels/constant_op.cc @@ -0,0 +1,249 @@ +// See docs in ../ops/array_ops.cc. 
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/constant_op.h" + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +ConstantOp::ConstantOp(OpKernelConstruction* ctx) + : OpKernel(ctx), tensor_(ctx->output_type(0)) { + const TensorProto* proto = nullptr; + OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto)); + OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto( + *proto, AllocatorAttributes(), &tensor_)); + OP_REQUIRES( + ctx, ctx->output_type(0) == tensor_.dtype(), + errors::InvalidArgument("Type mismatch between value (", + DataTypeString(tensor_.dtype()), ") and dtype (", + DataTypeString(ctx->output_type(0)), ")")); +} + +void ConstantOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, tensor_); } + +ConstantOp::~ConstantOp() {} + +REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp); + +#if GOOGLE_CUDA +#define REGISTER_KERNEL(D, TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("Const").Device(DEVICE_##D).TypeConstraint<TYPE>("dtype"), \ + ConstantOp); +REGISTER_KERNEL(GPU, float); +REGISTER_KERNEL(GPU, double); +REGISTER_KERNEL(GPU, uint8); +REGISTER_KERNEL(GPU, int8); +REGISTER_KERNEL(GPU, int16); +REGISTER_KERNEL(GPU, int64); +REGISTER_KERNEL(GPU, complex64); +REGISTER_KERNEL(GPU, bool); +// Currently we do not support string constants on GPU +#undef REGISTER_KERNEL +#endif + +// HostConstantOp differs from ConstantOp in that its output is always +// in host memory. +class HostConstantOp : public OpKernel { + public: + explicit HostConstantOp(OpKernelConstruction* ctx) + : OpKernel(ctx), tensor_(ctx->output_type(0)) { + const TensorProto* proto = nullptr; + AllocatorAttributes alloc_attr; + alloc_attr.set_on_host(true); + OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto)); + OP_REQUIRES_OK( + ctx, ctx->device()->MakeTensorFromProto(*proto, alloc_attr, &tensor_)); + OP_REQUIRES( + ctx, ctx->output_type(0) == tensor_.dtype(), + errors::InvalidArgument( + "Type mismatch between value (", DataTypeString(tensor_.dtype()), + ") and dtype (", DataTypeString(ctx->output_type(0)), ")")); + } + + void Compute(OpKernelContext* ctx) override { ctx->set_output(0, tensor_); } + + bool IsExpensive() override { return false; } + + ~HostConstantOp() override {} + + private: + Tensor tensor_; + TF_DISALLOW_COPY_AND_ASSIGN(HostConstantOp); +}; + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Const") + .Device(DEVICE_GPU) + .HostMemory("output") + .TypeConstraint<int32>("dtype"), + HostConstantOp); + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +// Partial specialization of FillFunctor<Device=CPUDevice, T>. +template <typename T> +struct FillFunctor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstScalar in) { + out.device(d) = out.constant(in()); + } +}; + +// Partial specialization of SetZeroFunctor<Device=CPUDevice, T>. 
+template <typename T> +struct SetZeroFunctor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat out) { + out.device(d) = out.constant(0); + } +}; + +#define DEFINE_SETZERO_CPU(T) template struct SetZeroFunctor<CPUDevice, T> +DEFINE_SETZERO_CPU(float); +DEFINE_SETZERO_CPU(double); +DEFINE_SETZERO_CPU(int32); +DEFINE_SETZERO_CPU(complex64); +#undef DEFINE_SETZERO_CPU + +} // end namespace functor + +template <typename Device, typename T> +class FillOp : public OpKernel { + public: + explicit FillOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& Tdims = context->input(0); + OP_REQUIRES(context, TensorShapeUtils::IsLegacyVector(Tdims.shape()), + errors::InvalidArgument("dims must be a vector of int32.")); + const Tensor& Tvalue = context->input(1); + OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(Tvalue.shape()), + errors::InvalidArgument("value must be a scalar.")); + auto dims = Tdims.flat<int32>(); + for (int i = 0; i < dims.size(); i++) { + OP_REQUIRES(context, dims(i) >= 0, + errors::InvalidArgument("dims[", i, "] = ", dims(i), + " must be nonnegative.")); + } + Tensor* out = nullptr; + OP_REQUIRES_OK( + context, + context->allocate_output( + 0, TensorShapeUtils::MakeShape( + reinterpret_cast<const int32*>(dims.data()), dims.size()), + &out)); + functor::FillFunctor<Device, T> functor; + functor(context->eigen_device<Device>(), out->flat<T>(), + Tvalue.scalar<T>()); + } +}; + +#define REGISTER_KERNEL(D, TYPE) \ + REGISTER_KERNEL_BUILDER(Name("Fill") \ + .Device(DEVICE_##D) \ + .TypeConstraint<TYPE>("T") \ + .HostMemory("dims"), \ + FillOp<D##Device, TYPE>); + +#define REGISTER_CPU_KERNEL(TYPE) REGISTER_KERNEL(CPU, TYPE) +TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL); +#undef REGISTER_CPU_KERNEL + +#if GOOGLE_CUDA +REGISTER_KERNEL(GPU, float); +REGISTER_KERNEL(GPU, double); +REGISTER_KERNEL(GPU, uint8); +REGISTER_KERNEL(GPU, int8); +REGISTER_KERNEL(GPU, int16); +REGISTER_KERNEL(GPU, int64); +// Currently we do not support filling strings and complex64 on GPU + +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. 
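Stripped of the device and registration plumbing above, FillFunctor and SetZeroFunctor are plain Eigen broadcast-constant assignments. A standalone sketch assuming only the unsupported Eigen Tensor module (the include path may differ by setup):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> out(2, 3);
  out = out.constant(7.0f);      // the FillFunctor pattern: broadcast one scalar over the whole output
  Eigen::Tensor<float, 2> zeros(2, 3);
  zeros = zeros.constant(0.0f);  // the SetZeroFunctor pattern
  std::cout << out << "\n\n" << zeros << "\n";
  return 0;
}

The kernels differ from this sketch only in evaluating the expression through .device(d), so the assignment runs on the chosen CPU thread pool or GPU stream.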
+REGISTER_KERNEL_BUILDER(Name("Fill") + .Device(DEVICE_GPU) + .TypeConstraint<int32>("T") + .HostMemory("dims") + .HostMemory("value") + .HostMemory("output"), + FillOp<CPUDevice, int32>); + +template <typename Device, typename T> +class ZerosLikeOp : public OpKernel { + public: + explicit ZerosLikeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& input = ctx->input(0); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &out)); + Tensor zero(DataTypeToEnum<T>::value, {1}); + zero.scalar<T>().setZero(); + const Tensor& zero_cref = zero; + functor::FillFunctor<Device, T> functor; + functor(ctx->eigen_device<Device>(), out->flat<T>(), zero_cref.scalar<T>()); + } +}; + +#define REGISTER_KERNEL(type, dev) \ + REGISTER_KERNEL_BUILDER( \ + Name("ZerosLike").Device(DEVICE_##dev).TypeConstraint<type>("T"), \ + ZerosLikeOp<dev##Device, type>) + +#define REGISTER_CPU(type) REGISTER_KERNEL(type, CPU) +TF_CALL_ALL_TYPES(REGISTER_CPU); +#undef REGISTER_CPU + +#if GOOGLE_CUDA +REGISTER_KERNEL(float, GPU); +REGISTER_KERNEL(double, GPU); +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNEL + +class PlaceholderOp : public OpKernel { + public: + explicit PlaceholderOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &expected_shape_)); + } + + void Compute(OpKernelContext* ctx) override { + if (expected_shape_.dims() > 0) { + OP_REQUIRES(ctx, false, + errors::InvalidArgument( + "You must feed a value for placeholder tensor '", name(), + "' with dtype ", DataTypeString(output_type(0)), + " and shape ", expected_shape_.DebugString())); + } else { + OP_REQUIRES(ctx, false, + errors::InvalidArgument( + "You must feed a value for placeholder tensor '", name(), + "' with dtype ", DataTypeString(output_type(0)))); + } + } + + private: + TensorShape expected_shape_; +}; + +REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE_CPU), PlaceholderOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/constant_op.h b/tensorflow/core/kernels/constant_op.h new file mode 100644 index 0000000000..20a5c9c42f --- /dev/null +++ b/tensorflow/core/kernels/constant_op.h @@ -0,0 +1,25 @@ +#ifndef TENSORFLOW_KERNELS_CONSTANT_OP_H_ +#define TENSORFLOW_KERNELS_CONSTANT_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +// ConstantOp returns a tensor specified by ConstantOpDef. 
+class ConstantOp : public OpKernel { + public: + explicit ConstantOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + bool IsExpensive() override { return false; } + ~ConstantOp() override; + + private: + Tensor tensor_; + TF_DISALLOW_COPY_AND_ASSIGN(ConstantOp); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_CONSTANT_OP_H_ diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc new file mode 100644 index 0000000000..64502378bd --- /dev/null +++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc @@ -0,0 +1,89 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/platform/port.h" + +namespace Eigen { +namespace internal { + +template <typename T> +struct scalar_const_op { + typedef typename packet_traits<T>::type Packet; + + const T* val; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + scalar_const_op(const scalar_const_op& x) + : val(x.val) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_const_op(const T* v) : val(v) {} + + template <typename Index> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(Index, + Index = 0) const { + return *val; + } + + template <typename Index> + EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { + return internal::pset1<Packet>(*val); + } +}; + +template <typename T> +struct functor_traits<scalar_const_op<T> > { + enum { + Cost = 1, + PacketAccess = packet_traits<T>::Vectorizable, + IsRepeatable = true + }; +}; + +} // end namespace internal +} // end namespace Eigen + +namespace tensorflow { + +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +// Partial specialization FillFunctor<Device=GPUDevice, T> +template <typename T> +struct FillFunctor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstScalar in) { + Eigen::internal::scalar_const_op<T> f(in.data()); + out.device(d) = out.nullaryExpr(f); + } +}; + +#define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T> +DEFINE_FILL_GPU(float); +DEFINE_FILL_GPU(double); +DEFINE_FILL_GPU(int32); +DEFINE_FILL_GPU(uint8); +DEFINE_FILL_GPU(int16); +DEFINE_FILL_GPU(int8); +DEFINE_FILL_GPU(int64); +#undef DEFINE_FILL_GPU + +// Partial specialization of FillFunctor<Device=GPUDevice, T>. 
+template <typename T> +struct SetZeroFunctor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) { + out.device(d) = out.constant(0); + } +}; + +#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T> +DEFINE_SETZERO_GPU(float); +#undef DEFINE_SETZERO_GPU + +} // end namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc new file mode 100644 index 0000000000..f5a464c07c --- /dev/null +++ b/tensorflow/core/kernels/constant_op_test.cc @@ -0,0 +1,43 @@ +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +// Returns graph containing "num" const nodes. If 'sequential' is +// true, make sure all constants are executed sequentially in the +// graph by adding control dependencies. +static Graph* ManyConsts(int num, bool sequential) { + Graph* g = new Graph(OpRegistry::Global()); + Node* prev = nullptr; + for (int i = 0; i < num; ++i) { + Tensor c(DT_FLOAT, TensorShape({})); + c.scalar<float>()() = i; + Node* curr = test::graph::Constant(g, c); + if (sequential && prev != nullptr) { + g->AddControlEdge(prev, curr); + } + prev = curr; + } + return g; +} + +static void BM_ManyConsts_Parallel(int iters, int num) { + testing::ItemsProcessed(static_cast<int64>(iters) * num); + test::Benchmark("cpu", ManyConsts(num, false /* !sequential */)).Run(iters); +} +BENCHMARK(BM_ManyConsts_Parallel)->Range(1, 1 << 10); + +static void BM_ManyConsts_Sequential(int iters, int num) { + testing::ItemsProcessed(static_cast<int64>(iters) * num); + test::Benchmark("cpu", ManyConsts(num, true /* sequential */)).Run(iters); +} +BENCHMARK(BM_ManyConsts_Sequential)->Range(1, 1 << 10); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc new file mode 100644 index 0000000000..bc44a7f7cc --- /dev/null +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -0,0 +1,359 @@ +#include "tensorflow/core/kernels/control_flow_ops.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +// A switch op has two inputs and two outputs. It forwards the value of +// Input:0 to the output specified by input:1. Input:1 is a boolean tensor. +// Input:0 is forwarded to output:0 if input:1 is false, otherwise to +// output:1. +class SwitchOp : public OpKernel { + public: + explicit SwitchOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& outputPorts = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsScalar(outputPorts.shape()), + errors::InvalidArgument("The second input must be a scalar, " + "but it has shape ", + outputPorts.shape().ShortDebugString())); + + bool pred = outputPorts.scalar<bool>()(); + int port = (pred) ? 
1 : 0; + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, port); + } else { + context->set_output(port, context->input(0)); + } + } + + bool IsExpensive() override { return false; } + + ~SwitchOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(SwitchOp); +}; + +#define REGISTER_CPU_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_CPU) \ + .HostMemory("pred") \ + .TypeConstraint<type>("T"), \ + SwitchOp) + +#define REGISTER_CPU_REF_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_CPU) \ + .HostMemory("pred") \ + .TypeConstraint<type>("T"), \ + SwitchOp) + +#define REGISTER_GPU_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("Switch") \ + .Device(DEVICE_GPU) \ + .HostMemory("pred") \ + .TypeConstraint<type>("T"), \ + SwitchOp) + +#define REGISTER_GPU_REF_SWITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSwitch") \ + .Device(DEVICE_GPU) \ + .HostMemory("pred") \ + .TypeConstraint<type>("T"), \ + SwitchOp) + +TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH); +TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SWITCH); +REGISTER_GPU_SWITCH(bool); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_REF_SWITCH); +REGISTER_GPU_REF_SWITCH(int32); +REGISTER_GPU_REF_SWITCH(bool); + +#undef REGISTER_CPU_SWITCH +#undef REGISTER_CPU_REF_SWITCH +#undef REGISTER_GPU_SWITCH +#undef REGISTER_GPU_REF_SWITCH + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Switch") + .Device(DEVICE_GPU) + .HostMemory("data") + .HostMemory("pred") + .HostMemory("output_false") + .HostMemory("output_true") + .TypeConstraint<int32>("T"), + SwitchOp); + +class RefSelectOp : public OpKernel { + public: + explicit RefSelectOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("N", &num_ref_inputs_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& index_tensor = context->input(0); + OP_REQUIRES( + context, TensorShapeUtils::IsScalar(index_tensor.shape()), + errors::InvalidArgument("Index must be a scalar, " + "but it has shape ", + index_tensor.shape().ShortDebugString())); + + int32 index = index_tensor.scalar<int32>()(); + + OP_REQUIRES(context, index >= 0 && index < num_ref_inputs_, + errors::InvalidArgument("Index must be in the range [0, ", + num_ref_inputs_, ") but got ", index)); + context->forward_ref_input_to_ref_output(index + 1, 0); + } + + bool IsExpensive() override { return false; } + + ~RefSelectOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(RefSelectOp); + + private: + int num_ref_inputs_; +}; + +#define REGISTER_CPU_REF_SELECT(type) \ + REGISTER_KERNEL_BUILDER(Name("RefSelect") \ + .Device(DEVICE_CPU) \ + .HostMemory("index") \ + .TypeConstraint<type>("T"), \ + RefSelectOp) +TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SELECT); + +#undef REGISTER_CPU_REF_SWITCH + +// A merge op has n inputs and two outputs. It forwards the value of the +// first input that becomes available to its first output, and the +// index of the first input to its second output. 
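The Switch semantics described at the top of this file and the Merge contract described just above compose into a dataflow conditional: the predicate routes a value out of exactly one Switch port, each branch of the conditional consumes one port, and Merge forwards whichever branch actually produced a value together with that branch's index. A toy C++ model of the two contracts, not TensorFlow code and simplified to two inputs:

#include <iostream>
#include <optional>
#include <utility>

using Port = std::optional<int>;

// Switch: forwards data to port 0 when pred is false, to port 1 when pred is true.
std::pair<Port, Port> ToySwitch(int data, bool pred) {
  return pred ? std::make_pair(Port(), Port(data)) : std::make_pair(Port(data), Port());
}

// Merge: forwards whichever input is present, plus the index of that input.
std::pair<int, int> ToyMerge(const Port& out_false, const Port& out_true) {
  if (out_false.has_value()) return {*out_false, 0};
  return {*out_true, 1};
}

int main() {
  const auto ports = ToySwitch(42, /*pred=*/true);
  const auto merged = ToyMerge(ports.first, ports.second);
  std::cout << merged.first << " arrived via branch " << merged.second << "\n";  // 42 arrived via branch 1
  return 0;
}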
+class MergeOp : public OpKernel { + public: + explicit MergeOp(OpKernelConstruction* context) : OpKernel(context) { + const DataType dt = context->input_type(0); + const int num_in = context->num_inputs(); + OP_REQUIRES_OK(context, context->MatchSignature(DataTypeVector(num_in, dt), + {dt, DT_INT32})); + } + + void Compute(OpKernelContext* context) override { + bool input_seen = false; + for (int i = 0; i < context->num_inputs(); ++i) { + if (context->has_input(i)) { + if (input_seen) { + context->SetStatus(errors::Internal( + "Merge can not have more than one valid input.")); + return; + } + input_seen = true; + + context->set_output(0, context->input(i)); + Tensor* value_index = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({}), + &value_index)); + value_index->scalar<int32>()() = i; + } + } + } + + bool IsExpensive() override { return false; } + + ~MergeOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(MergeOp); +}; + +REGISTER_KERNEL_BUILDER(Name("Merge").Device(DEVICE_CPU), MergeOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Merge") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("value_index"), \ + MergeOp); + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); + +#undef REGISTER_GPU_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Merge") + .Device(DEVICE_GPU) + .HostMemory("inputs") + .HostMemory("output") + .HostMemory("value_index") + .TypeConstraint<int32>("T"), + MergeOp); + +// An enter op has one input and one output. It creates or finds +// the child frame that is uniquely identified by the frame_name, +// and makes its input available to the child frame. +class EnterOp : public OpKernel { + public: + explicit EnterOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, 0); + } else { + context->set_output(0, context->input(0)); + } + } + + bool IsExpensive() override { return false; } + + ~EnterOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(EnterOp); +}; + +REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE_CPU), EnterOp); +REGISTER_KERNEL_BUILDER(Name("RefEnter").Device(DEVICE_CPU), EnterOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Enter").Device(DEVICE_GPU).TypeConstraint<type>("T"), EnterOp); +#define REGISTER_GPU_REF_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("RefEnter").Device(DEVICE_GPU).TypeConstraint<type>("T"), EnterOp); + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +TF_CALL_NUMBER_TYPES(REGISTER_GPU_REF_KERNEL); + +#undef REGISTER_GPU_KERNEL +#undef REGISTER_GPU_REF_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Enter") + .Device(DEVICE_GPU) + .HostMemory("data") + .HostMemory("output") + .TypeConstraint<int32>("T"), + EnterOp); + +// An exit op has one input and one output. It exits the current +// frame to its parent frame, and makes its input available to the +// parent frame. 
+class ExitOp : public OpKernel { + public: + explicit ExitOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + context->set_output(0, context->input(0)); + } + + bool IsExpensive() override { return false; } + + ~ExitOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(ExitOp); +}; + +REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE_CPU), ExitOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Exit").Device(DEVICE_GPU).TypeConstraint<type>("T"), ExitOp); + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); + +#undef REGISTER_GPU_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Exit") + .Device(DEVICE_GPU) + .HostMemory("data") + .HostMemory("output") + .TypeConstraint<int32>("T"), + ExitOp); + +// A next_iteration op has one input and one output. It makes its input +// available to the next iteration. +class NextIterationOp : public OpKernel { + public: + explicit NextIterationOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + context->set_output(0, context->input(0)); + } + + bool IsExpensive() override { return false; } + + ~NextIterationOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp); +}; + +REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE_CPU), + NextIterationOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("NextIteration").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + NextIterationOp); + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); + +#undef REGISTER_GPU_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("NextIteration") + .Device(DEVICE_GPU) + .HostMemory("data") + .HostMemory("output") + .TypeConstraint<int32>("T"), + NextIterationOp); + +// A LoopCond op has one input and one output. The input is a boolean +// scalar representing the taken branches of the "pivot" Switch that +// determines loop termination. As a contract, any high-level front-end +// should always use port '0' of the "pivot" switches for loop exit. 
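For orientation, the ops in this file are typically composed into a while loop roughly as follows (a sketch of the usual wiring, not taken from this file): Enter brings the loop variable into the child frame, Merge joins it with the value fed back by NextIteration, LoopCond drives the pivot Switch, the true port feeds the loop body, and the false port feeds Exit. A host-side analogue of that dataflow, with invented names:

// Illustrative analogue of the Enter/Merge/Switch/NextIteration/Exit cycle;
// `cond` plays the role of LoopCond and `body` the per-iteration subgraph.
template <typename T, typename Cond, typename Body>
T WhileLoopSketch(T value, Cond cond, Body body) {
  // Enter: `value` becomes available inside the child frame.
  while (cond(value)) {   // LoopCond selects port 1 (true) of the pivot Switch.
    value = body(value);  // The loop body runs on the true branch.
    // NextIteration: the result is fed back to Merge for the next pass.
  }
  return value;           // Exit (port 0 of the pivot Switch): back to the parent frame.
}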
+class LoopCondOp : public OpKernel { + public: + explicit LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + context->set_output(0, context->input(0)); + } + + bool IsExpensive() override { return false; } + + ~LoopCondOp() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp); +}; + +REGISTER_KERNEL_BUILDER(Name("LoopCond").Device(DEVICE_CPU), LoopCondOp); +REGISTER_KERNEL_BUILDER(Name("LoopCond") + .Device(DEVICE_GPU) + .HostMemory("input") + .HostMemory("output"), + LoopCondOp); + +// ControlTrigger kernels +REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU), + ControlTriggerOp); + +REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU), + ControlTriggerOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h new file mode 100644 index 0000000000..184cc9fb63 --- /dev/null +++ b/tensorflow/core/kernels/control_flow_ops.h @@ -0,0 +1,22 @@ +#ifndef TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_ +#define TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +// A ControlTriggerOp is similar to a NoOp. However, it always treats the input +// control edges as Live edges. Its primary use so far is in the scheduling of +// recvs, where we add ControlTrigger nodes and use them to trigger recvs. We +// allow ControlTrigger nodes to be enabled by dead nodes. +class ControlTriggerOp : public OpKernel { + public: + explicit ControlTriggerOp(OpKernelConstruction* context) + : OpKernel(context) {} + void Compute(OpKernelContext* context) override {} + bool IsExpensive() override { return false; } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_ diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc new file mode 100644 index 0000000000..52bc11abf0 --- /dev/null +++ b/tensorflow/core/kernels/control_flow_ops_test.cc @@ -0,0 +1,71 @@ +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +// Tests for the switch op +class SwitchOpTest : public OpsTestBase { + protected: + void Initialize(DataType dt) { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("op", "Switch") + .Input(FakeInput(dt)) + .Input(FakeInput()) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(SwitchOpTest, Int32Success_6_s0) { + Initialize(DT_INT32); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<bool>(TensorShape({}), {false}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({6})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); + EXPECT_EQ(nullptr, GetOutput(1)); +} + +TEST_F(SwitchOpTest, Int32Success_6_s1) { + Initialize(DT_INT32); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<bool>(TensorShape({}), {true}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({6})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + 
test::ExpectTensorEqual<int32>(expected, *GetOutput(1)); + EXPECT_EQ(nullptr, GetOutput(0)); +} + +TEST_F(SwitchOpTest, Int32Success_2_3_s0) { + Initialize(DT_INT32); + AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<bool>(TensorShape({}), {false}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({2, 3})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); + EXPECT_EQ(nullptr, GetOutput(1)); +} + +TEST_F(SwitchOpTest, StringSuccess_s1) { + Initialize(DT_STRING); + AddInputFromArray<string>(TensorShape({6}), {"A", "b", "C", "d", "E", "f"}); + AddInputFromArray<bool>(TensorShape({}), {true}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_STRING, TensorShape({6})); + test::FillValues<string>(&expected, {"A", "b", "C", "d", "E", "f"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(1)); + EXPECT_EQ(nullptr, GetOutput(0)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h new file mode 100644 index 0000000000..2fb623244c --- /dev/null +++ b/tensorflow/core/kernels/conv_2d.h @@ -0,0 +1,127 @@ +#ifndef TENSORFLOW_KERNELS_CONV_2D_H_ +#define TENSORFLOW_KERNELS_CONV_2D_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// TODO(yangke): revisit these operations and in particular, see if we can +// combine all of them into just one operation without causing nvcc to +// timeout. +template <typename Device, typename T, int Dims> +struct ShuffleAndReverse { + void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input, + const Eigen::DSizes<Eigen::DenseIndex, Dims>& order, + const Eigen::array<bool, Dims>& reverse_dims, + typename TTypes<T, Dims>::Tensor output) { + output.device(d) = input.shuffle(order).reverse(reverse_dims); + } +}; + +template <typename Device, typename T, int Dims> +struct InflatePadAndShuffle { + void operator()( + const Device& d, typename TTypes<T, Dims>::ConstTensor input, + const Eigen::DSizes<Eigen::DenseIndex, Dims>& strides, + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, Dims>& pad_dims, + const Eigen::DSizes<Eigen::DenseIndex, Dims>& order, + typename TTypes<T, Dims>::Tensor output) { + output.device(d) = input.inflate(strides).pad(pad_dims).shuffle(order); + } +}; + +template <typename Device, typename Input, typename Filter, typename Output> +void SpatialConvolutionFunc(const Device& d, Output output, Input input, + Filter filter, int stride, + const Eigen::PaddingType& padding) { + output.device(d) = Eigen::SpatialConvolution(input, filter, stride, padding); +} + +template <typename Device, typename T> +struct SpatialConvolution { + void operator()(const Device& d, typename TTypes<T, 4>::Tensor output, + typename TTypes<T, 4>::ConstTensor input, + typename TTypes<T, 4>::ConstTensor filter, int stride, + const Eigen::PaddingType& padding) { + SpatialConvolutionFunc(d, output, input, filter, stride, padding); + } +}; + +template <typename Device, typename T> +struct SpatialConvolutionBackwardInput { + void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward, + typename TTypes<T, 4>::ConstTensor kernel, + typename TTypes<T, 4>::ConstTensor output_backward, + int input_rows, int input_cols, int stride) 
{ + input_backward.device(d) = Eigen::SpatialConvolutionBackwardInput( + kernel, output_backward, input_rows, input_cols, stride); + } +}; + +template <typename Device, typename T> +struct SpatialConvolutionBackwardKernel { + void operator()(const Device& d, + typename TTypes<T, 4>::Tensor kernel_backward, + typename TTypes<T, 4>::ConstTensor input, + typename TTypes<T, 4>::ConstTensor output_backward, + int kernel_rows, int kernel_cols, int stride) { + kernel_backward.device(d) = Eigen::SpatialConvolutionBackwardKernel( + input, output_backward, kernel_rows, kernel_cols, stride); + } +}; + +// TODO(vrv): Figure out how to use the MatMulFunctor in matmul_op.h. +// My initial attempt to do this compiled but failed in the pytest +// due to a swigdeps error. +template <typename Device, typename T> +struct MatMulConvFunctor { + // Computes on device "d": out = in0 * in1, where * is matrix + // multiplication. + void operator()( + const Device& d, typename TTypes<T, 2>::Tensor out, + typename TTypes<T, 2>::ConstTensor in0, + typename TTypes<T, 2>::ConstTensor in1, + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) { + out.device(d) = in0.contract(in1, dim_pair); + } +}; + +template <typename Device, typename T> +struct TransformFilter { + void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in, + typename TTypes<T, 4>::Tensor out) { + out.device(d) = in.shuffle(Eigen::DSizes<Eigen::DenseIndex, 4>(3, 2, 0, 1)); + } +}; + +template <typename Device, typename T> +struct TransformDepth { + void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in, + const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, + typename TTypes<T, 4>::Tensor out) { + out.device(d) = in.shuffle(shuffle); + } +}; + +template <typename Device, typename T> +struct PadInput { + void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor in, + int padding_rows_left, int padding_rows_right, + int padding_cols_left, int padding_cols_right, + typename TTypes<T, 4>::Tensor out) { + Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 4> padding; + padding[0] = std::make_pair(0, 0); + padding[1] = std::make_pair(padding_rows_left, padding_rows_right); + padding[2] = std::make_pair(padding_cols_left, padding_cols_right); + padding[3] = std::make_pair(0, 0); + out.device(d) = in.pad(padding); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_CONV_2D_H_ diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc new file mode 100644 index 0000000000..bb21d7003c --- /dev/null +++ b/tensorflow/core/kernels/conv_grad_ops.cc @@ -0,0 +1,1190 @@ +// See docs in ../ops/nn_ops.cc. 
+ +#define USE_EIGEN_TENSOR +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/util/use_cudnn.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/public/tensor.h" + +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/stream_executor/stream.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// The operation to compute Conv2D gradients. +// +// +// To compute the gradients for Conv2D, we need three input tensors: +// input, filter, and backprop for output. +// And we need to compute two backprops: one for input and one for filter. We +// compute them in two different kernels. + +// Both backprops can be computed as straightforward conv2d. +// +// Consider a case where the input is 3x3 and the filter is 1x2: +// +// INPUT = [ A B C ] +// [ D E F ] +// [ G H I ] +// +// where each "A", "B", etc is batch x in_depth +// +// FILTER = [ X Y ] +// +// where both "X" and "Y" are in_depth x out_depth +// +// With VALID padding, the output is 3x2: +// +// OUTPUT = [ a b ] +// [ c d ] +// [ e f ] +// +// where each "a", "b", etc is batch x out_depth +// +// So we have: +// +// a = A * X + B * Y +// b = B * X + C * Y +// c = D * X + E * Y +// d = E * X + F * Y +// e = G * X + H * Y +// f = H * X + I * Y +// +// So when we have backprops for the outputs (we denote them by +// a', b', ... ): +// +// The backprops for the input are: +// +// A' = a' * X^t +// B' = a' * Y^t + b' * X^t +// C' = b' * Y^t +// ... +// +// This is essentially computing a 2d conv of +// +// INPUT = [ 0 a' b' 0 ] +// [ 0 c' d' 0 ] +// [ 0 e' f' 0 ] +// and +// +// FILTER = [ Y^t X^t ] +// +// The backprops for the filter are: +// +// X' = A^t * a' + B^t * b' + D^t * c' + E^t * d' + G^t * e' + H^t * f' +// Y' = B^t * a' + C^t * b' + E^t * c' + F^t * d' + H^t * e' + I^t * f' +// +// This is essentially computing a 2d conv of +// +// INPUT = [ A^t B^t C^t ] +// [ D^t E^t F^t ] +// [ G^t H^t I^t ] +// +// and +// +// FILTER = [ a' b' ] +// [ c' d' ] +// [ e' f' ] +// +// +////////////////////////////////////////////////////////// +// +// With stride more than one, it's a bit more complicated (we will need to +// create holes in the backprop). +// +// Consider the case where +// +// INPUT = [ A B C D E ] +// [ F G H I J ] +// [ K L M N O ] +// and +// +// FILTER = [ X Y Z ] +// +// with stride 2. +// +// The output will be +// +// OUTPUT = [ a b ] +// [ c d ] +// +// where: +// +// a = A * X + B * Y + C * Z +// b = C * X + D * Y + E * Z +// c = K * X + L * Y + M * Z +// d = M * X + N * Y + O * Z +// +// +// To compute the backprop for INPUT, we need to convolve +// +// INPUT = [ 0 0 a' 0 b' 0 0 ] +// [ 0 0 0 0 0 0 0 ] +// [ 0 0 c' 0 d' 0 0 ] +// +// (notice the holes in INPUT) +// +// and +// +// FILTER = [ Z^t Y^t X^t ] +// +// with stride 1.
+// +// To compute the backprop for FILTER, we need to convolve + +// +// INPUT = [ A^t B^t C^t D^t E^t ] +// [ F^t G^t H^t I^t J^t ] +// [ K^t L^t M^t N^t O^t ] +// and +// +// FILTER = [ a' 0 b' ] +// [ 0 0 0 ] +// [ c' 0 d' ] +// +// (notice the holes in FILTER) +// +// +// with stride 1 +// +////////////////////////////////////////////////////////// +// +// +// The case for SAME padding is in fact very similar to VALID -- we just +// need to pad the input tensor a bit when computing the filter_backprop. + +// Common code between the two kernels: verifies that the dimensions all match +// and extract the padded rows and columns. +#define EXTRACT_AND_VERIFY_DIMENSIONS(label) \ + const Tensor& out_backprop = context->input(2); \ + OP_REQUIRES( \ + context, input_shape.dims() == 4, \ + errors::InvalidArgument(label, ": input must be 4-dimensional")); \ + OP_REQUIRES( \ + context, filter_shape.dims() == 4, \ + errors::InvalidArgument(label, ": filter must be 4-dimensional")); \ + OP_REQUIRES( \ + context, out_backprop.dims() == 4, \ + errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \ + const int64 batch = input_shape.dim_size(0); \ + OP_REQUIRES( \ + context, batch == out_backprop.dim_size(0), \ + errors::InvalidArgument( \ + label, ": input and out_backprop must have the same batch size")); \ + const int64 input_rows = input_shape.dim_size(1); \ + const int64 input_cols = input_shape.dim_size(2); \ + const int64 filter_rows = filter_shape.dim_size(0); \ + const int64 filter_cols = filter_shape.dim_size(1); \ + const int64 output_rows = out_backprop.dim_size(1); \ + const int64 output_cols = out_backprop.dim_size(2); \ + const int64 in_depth = input_shape.dim_size(3); \ + OP_REQUIRES(context, in_depth == filter_shape.dim_size(2), \ + errors::InvalidArgument( \ + label, ": input and filter must have the same depth")); \ + const int64 out_depth = filter_shape.dim_size(3); \ + OP_REQUIRES( \ + context, out_depth == out_backprop.dim_size(3), \ + errors::InvalidArgument( \ + label, ": filter and out_backprop must have the same out_depth")); \ + const auto stride = strides_[1]; \ + int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; \ + if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) { \ + out_rows = input_rows; \ + out_cols = input_cols; \ + } else { \ + OP_REQUIRES_OK( \ + context, Get2dOutputSize(input_rows, input_cols, filter_rows, \ + filter_cols, stride, stride, padding_, \ + &out_rows, &out_cols, &pad_rows, &pad_cols)); \ + } \ + OP_REQUIRES( \ + context, output_rows == out_rows, \ + errors::InvalidArgument( \ + label, ": Number of rows of out_backprop doesn't match computed: ", \ + "actual = ", output_rows, ", computed = ", out_rows)); \ + OP_REQUIRES( \ + context, output_cols == out_cols, \ + errors::InvalidArgument( \ + label, ": Number of cols of out_backprop doesn't match computed: ", \ + "actual = ", output_cols, ", computed = ", out_cols)); \ + const auto expanded_out_rows = (output_rows - 1) * stride + 1; \ + const auto expanded_out_cols = (output_cols - 1) * stride + 1; \ + const auto padded_out_rows = input_rows + filter_rows - 1; \ + const auto padded_out_cols = input_cols + filter_cols - 1; \ + const auto top_pad_rows = filter_rows - 1 - pad_rows; \ + const auto left_pad_cols = filter_cols - 1 - pad_cols; \ + const auto bottom_pad_rows = \ + padded_out_rows - expanded_out_rows - top_pad_rows; \ + const auto right_pad_cols = \ + padded_out_cols - expanded_out_cols - left_pad_cols; \ + Eigen::DSizes<Eigen::DenseIndex, 4> 
strides{1, stride, stride, 1}; \ + VLOG(2) << "Conv2d: " << label \ + << ": expanded_out_rows = " << expanded_out_rows \ + << ", expanded_out_cols = " << expanded_out_cols \ + << ", filter_rows = " << filter_rows \ + << ", filter_cols = " << filter_cols \ + << ", padded_out_rows = " << padded_out_rows \ + << ", padded_out_cols = " << padded_out_cols \ + << ", top_pad_rows = " << top_pad_rows \ + << ", left_pad_cols = " << left_pad_cols \ + << ", bottom_pad_rows = " << bottom_pad_rows \ + << ", right_pad_cols = " << right_pad_cols \ + << ", strides = " << strides[1] + +namespace { +TensorShape VectorToShape(const TTypes<int32>::ConstVec& sizes) { + TensorShape shape; + + using Index = TTypes<int32>::ConstVec::Index; + const Index dims = sizes.size(); + for (Index i = 0; i < dims; ++i) { + shape.AddDim(sizes(i)); + } + + return shape; +} +} // namespace + +// The fast versions using eigen computations directly. They are only enabled +// for CPU for now since nvcc times out when trying to compile them. +// TODO(yangke): enable them for GPUs when we have a faster compiler. + +template <typename Device, class T> +class Conv2DFastBackpropInputOp : public OpKernel { + public: + explicit Conv2DFastBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input_sizes = context->input(0); + const Tensor& filter = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", + input_sizes.dims())); + TensorShape input_shape = VectorToShape(input_sizes.vec<int32>()); + const TensorShape& filter_shape = filter.shape(); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput"); + Tensor* in_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input_shape, &in_backprop)); + // Need to flip the input_rows and input_cols when passing to eigen. + functor::SpatialConvolutionBackwardInput<Device, T>()( + context->eigen_device<Device>(), in_backprop->tensor<T, 4>(), + filter.tensor<T, 4>(), out_backprop.tensor<T, 4>(), input_cols, + input_rows, stride); + } + + private: + std::vector<int32> strides_; + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropInputOp); +}; + +// Based on implementation written by Yangqing Jia (jiayq). 
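The custom kernel below computes the input gradient with one GEMM per image into a column buffer, followed by Col2im, which scatter-adds each patch column back into the input-gradient pixels it overlaps. As a rough sketch of that last step, here is a naive single-channel col2im (stride 1, VALID padding; Col2imNaive is a made-up name, and the real Col2im helper used below also handles depth, strides, and padding):

#include <vector>

// Naive single-channel col2im: every entry of the column buffer is added back
// to the input pixel it was originally read from. `image` must be pre-sized to
// in_h * in_w and zero-initialized by the caller.
void Col2imNaive(const std::vector<float>& col,  // [out_h * out_w, k_h * k_w], row-major
                 int in_h, int in_w, int k_h, int k_w, std::vector<float>* image) {
  const int out_h = in_h - k_h + 1;
  const int out_w = in_w - k_w + 1;
  for (int oy = 0; oy < out_h; ++oy) {
    for (int ox = 0; ox < out_w; ++ox) {
      const int patch = (oy * out_w + ox) * k_h * k_w;
      for (int ky = 0; ky < k_h; ++ky) {
        for (int kx = 0; kx < k_w; ++kx) {
          (*image)[(oy + ky) * in_w + (ox + kx)] += col[patch + ky * k_w + kx];
        }
      }
    }
  }
}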
+template <typename Device, class T> +class Conv2DCustomBackpropInputOp : public OpKernel { + public: + explicit Conv2DCustomBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES( + context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input_sizes = context->input(0); + const Tensor& filter = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", + input_sizes.dims())); + TensorShape input_shape = VectorToShape(input_sizes.vec<int32>()); + const TensorShape& filter_shape = filter.shape(); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput"); + Tensor* in_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input_shape, &in_backprop)); + + // TODO(andydavis) Consider moving code shared with + // Conv2DCustomBackpropFilterOp into a shared helper function. + int pad_top; + int pad_bottom; + int pad_left; + int pad_right; + OP_REQUIRES_OK( + context, + Get2dOutputSizeVerbose(input_rows, input_cols, filter_rows, filter_cols, + stride, stride, padding_, &out_rows, &out_cols, + &pad_top, &pad_bottom, &pad_left, &pad_right)); + + // The total dimension size of each kernel. + const int filter_total_size = filter_rows * filter_cols * in_depth; + // The output image size is the spatial size of the output. + const int output_image_size = out_rows * out_cols; + + Tensor col_buffer; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({output_image_size, filter_total_size}), &col_buffer)); + + // The input offset corresponding to a single input image. + const int input_offset = input_rows * input_cols * in_depth; + // The output offset corresponding to a single output image. + const int output_offset = out_rows * out_cols * out_depth; + + auto* filter_data = filter.template flat<T>().data(); + auto* col_buffer_data = col_buffer.template flat<T>().data(); + auto* out_backprop_data = out_backprop.template flat<T>().data(); + auto* input_backprop_data = in_backprop->template flat<T>().data(); + + typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> MatrixMap; + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> ConstMatrixMap; + + for (int image_id = 0; image_id < batch; ++image_id) { + // Compute gradient into col_buffer. + MatrixMap C(col_buffer_data, output_image_size, filter_total_size); + + ConstMatrixMap A(out_backprop_data + output_offset * image_id, + output_image_size, out_depth); + ConstMatrixMap B(filter_data, filter_total_size, out_depth); + + // TODO(andydavis) Use a multi-threaded matmul implementation here. 
+ C.noalias() = A * B.transpose(); + + Col2im<T>(col_buffer_data, in_depth, input_rows, input_cols, filter_rows, + filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride, + stride, input_backprop_data); + + input_backprop_data += input_offset; + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp); +}; + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T"), + Conv2DCustomBackpropInputOp<CPUDevice, float>); + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") + .Device(DEVICE_CPU) + .Label("custom") + .TypeConstraint<float>("T"), + Conv2DCustomBackpropInputOp<CPUDevice, float>); + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") + .Device(DEVICE_CPU) + .Label("eigen_tensor") + .TypeConstraint<float>("T"), + Conv2DFastBackpropInputOp<CPUDevice, float>); + +template <typename Device, class T> +class Conv2DFastBackpropFilterOp : public OpKernel { + public: + explicit Conv2DFastBackpropFilterOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& filter_sizes = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(filter_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ", + filter_sizes.dims())); + const TensorShape& input_shape = input.shape(); + TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>()); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter"); + Tensor* filter_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, filter_shape, &filter_backprop)); + + // Need to flip the filter_rows and filter_cols when passing to eigen. + functor::SpatialConvolutionBackwardKernel<Device, T>()( + context->eigen_device<Device>(), filter_backprop->tensor<T, 4>(), + input.tensor<T, 4>(), out_backprop.tensor<T, 4>(), filter_cols, + filter_rows, stride); + } + + private: + std::vector<int32> strides_; + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropFilterOp); +}; + +// Based on implementation written by Yangqing Jia (jiayq). 
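The filter-gradient kernel below does the mirror image: Im2col turns each image into a patch matrix A of shape [output_image_size, filter_total_size], B is that image's out_backprop of shape [output_image_size, out_depth], and the gradient accumulates C += A^T * B across the batch. For intuition, a naive single-channel reference of what that accumulation computes (stride 1, VALID padding; FilterGradNaive is an invented name for illustration):

#include <vector>

// dL/dW[ky, kx] = sum over output positions of
//                 input(oy + ky, ox + kx) * out_backprop(oy, ox).
// `filter_grad` must be pre-sized to k_h * k_w and zero-initialized.
void FilterGradNaive(const std::vector<float>& input,        // [in_h, in_w]
                     const std::vector<float>& out_backprop,  // [out_h, out_w]
                     int in_h, int in_w, int k_h, int k_w,
                     std::vector<float>* filter_grad) {       // [k_h, k_w]
  const int out_h = in_h - k_h + 1;
  const int out_w = in_w - k_w + 1;
  for (int ky = 0; ky < k_h; ++ky) {
    for (int kx = 0; kx < k_w; ++kx) {
      float acc = 0.f;
      for (int oy = 0; oy < out_h; ++oy) {
        for (int ox = 0; ox < out_w; ++ox) {
          acc += input[(oy + ky) * in_w + (ox + kx)] * out_backprop[oy * out_w + ox];
        }
      }
      (*filter_grad)[ky * k_w + kx] += acc;
    }
  }
}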
+template <typename Device, class T> +class Conv2DCustomBackpropFilterOp : public OpKernel { + public: + explicit Conv2DCustomBackpropFilterOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES( + context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& filter_sizes = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(filter_sizes.shape()), + errors::InvalidArgument( + "Conv2DCustomBackpropFilter: filter_sizes input must be 1-dim, " + "not ", + filter_sizes.dims())); + const TensorShape& input_shape = input.shape(); + TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>()); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DCustomBackpropFilter"); + Tensor* filter_backprop; + OP_REQUIRES_OK(context, + context->allocate_output(0, filter_shape, &filter_backprop)); + + int pad_top; + int pad_bottom; + int pad_left; + int pad_right; + OP_REQUIRES_OK( + context, + Get2dOutputSizeVerbose(input_rows, input_cols, filter_rows, filter_cols, + stride, stride, padding_, &out_rows, &out_cols, + &pad_top, &pad_bottom, &pad_left, &pad_right)); + + // The total dimension size of each kernel. + const int filter_total_size = filter_rows * filter_cols * in_depth; + // The output image size is the spatial size of the output. + const int output_image_size = out_rows * out_cols; + + Tensor col_buffer; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({output_image_size, filter_total_size}), &col_buffer)); + + // The input offset corresponding to a single input image. + const int input_offset = input_rows * input_cols * in_depth; + // The output offset corresponding to a single output image. + const int output_offset = out_rows * out_cols * out_depth; + + auto* input_data = input.template flat<T>().data(); + auto* col_buffer_data = col_buffer.template flat<T>().data(); + auto* out_backprop_data = out_backprop.template flat<T>().data(); + auto* filter_backprop_data = filter_backprop->template flat<T>().data(); + + typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> MatrixMap; + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> ConstMatrixMap; + + MatrixMap C(filter_backprop_data, filter_total_size, out_depth); + + C.setZero(); + for (int image_id = 0; image_id < batch; ++image_id) { + // When we compute the gradient with respect to the filters, we need to do + // im2col to allow gemm-type computation. 
+ Im2col<T>(input_data, in_depth, input_rows, input_cols, filter_rows, + filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride, + stride, col_buffer_data); + + ConstMatrixMap A(col_buffer_data, output_image_size, filter_total_size); + ConstMatrixMap B(out_backprop_data + output_offset * image_id, + output_image_size, out_depth); + + // Compute gradient with respect to filter. + // TODO(andydavis) Use a multi-threaded matmul implementation here. + C.noalias() += A.transpose() * B; + + input_data += input_offset; + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp); +}; + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T"), + Conv2DCustomBackpropFilterOp<CPUDevice, float>); + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") + .Device(DEVICE_CPU) + .Label("custom") + .TypeConstraint<float>("T"), + Conv2DCustomBackpropFilterOp<CPUDevice, float>); + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") + .Device(DEVICE_CPU) + .Label("eigen_tensor") + .TypeConstraint<float>("T"), + Conv2DFastBackpropFilterOp<CPUDevice, float>); + +// GPU definitions of both ops. +#if GOOGLE_CUDA +namespace { +template <typename T> +perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, + uint64 size) { + perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), + size * sizeof(T)); + perftools::gputools::DeviceMemory<T> typed(wrapped); + return typed; +} +} // namespace + +// The slow version (but compiles for GPU) + +// Backprop for input. +template <typename Device, class T> +class Conv2DSlowBackpropInputOp : public OpKernel { + public: + explicit Conv2DSlowBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); + use_cudnn_ &= CanUseCudnn(); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input_sizes = context->input(0); + const Tensor& filter = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", + input_sizes.dims())); + TensorShape input_shape = VectorToShape(input_sizes.vec<int32>()); + const TensorShape& filter_shape = filter.shape(); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput"); + Tensor* in_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input_shape, &in_backprop)); + + const int padding_rows = + (output_rows - 1) * stride + filter_rows - input_rows; + const int padding_cols = + (output_cols - 1) * stride + filter_cols - input_cols; + + // TODO(keveman): cuDNN only supports equal padding on both sides, so only + // calling it when that is true. Remove this check when (if?) 
cuDNN starts + // supporting different padding. + bool padding_compatible = + (padding_rows % 2 == 0) && (padding_cols % 2 == 0); + + auto* stream = context->op_device_context<GPUDeviceContext>()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + if (use_cudnn_ && padding_compatible) { + if (filter_rows == 1 && filter_cols == 1 && stride == 1) { + // 1x1 filter, so call cublas directly. + const uint64 m = batch * input_rows * input_cols; + const uint64 k = out_depth; + const uint64 n = in_depth; + + auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(), + out_backprop.template flat<T>().size()); + auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(), + filter.template flat<T>().size()); + auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(), + in_backprop->template flat<T>().size()); + + auto transpose = perftools::gputools::blas::Transpose::kTranspose; + auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; + + bool blas_launch_status = + stream->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, + k, a_ptr, k, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", + m, ", n=", n, ", k=", k)); + } + return; + } + + perftools::gputools::dnn::BatchDescriptor input_desc; + input_desc.set_count(batch) + .set_height(input_rows) + .set_width(input_cols) + .set_feature_map_count(in_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::BatchDescriptor output_desc; + output_desc.set_count(batch) + .set_height(output_rows) + .set_width(output_cols) + .set_feature_map_count(out_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(filter_rows) + .set_input_filter_width(filter_cols) + .set_input_feature_map_count(in_depth) + .set_output_feature_map_count(out_depth); + perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_filter_stride(stride) + .set_horizontal_filter_stride(stride) + .set_zero_padding_height(padding_rows / 2) + .set_zero_padding_width(padding_cols / 2); + + // NOTE(keveman): + // cuDNN only supports the following layouts : + // Input : B x D x R x C + // Filter : OD x ID x R x C + // Whereas, we have + // Input : B x R x C x D + // Filter : R x C x ID x OD + // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) + // The first TransformDepth performs + // (B x R x C x D) => (B x D x R x C). + // Since the tensor returned from cuDNN is B x D x R x C also, + // the second TransformDepth performs + // (B x D x R x C) => (B x R x C x D). 
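+      // In Eigen's shuffle convention (used by TransformDepth below), shuffle
+      // {0, 3, 1, 2} gives output(b, d, r, c) = input(b, r, c, d), i.e. the
+      // NHWC -> NCHW conversion described above; the closing TransformDepth
+      // with {0, 2, 3, 1} converts the cuDNN result back to NHWC:
+      // output(b, r, c, d) = input(b, d, r, c).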
+ Tensor transformed_filter; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({out_depth, in_depth, filter_rows, filter_cols}), + &transformed_filter)); + + functor::TransformFilter<Device, T>()(context->eigen_device<Device>(), + filter.tensor<T, 4>(), + transformed_filter.tensor<T, 4>()); + + Tensor transformed_out_backprop; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({batch, out_depth, output_rows, output_cols}), + &transformed_out_backprop)); + + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2), + transformed_out_backprop.tensor<T, 4>()); + + Tensor pre_transformed_in_backprop; + OP_REQUIRES_OK(context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({batch, in_depth, input_rows, input_cols}), + &pre_transformed_in_backprop)); + + auto out_backprop_ptr = + AsDeviceMemory(transformed_out_backprop.template flat<T>().data(), + transformed_out_backprop.template flat<T>().size()); + auto filter_ptr = + AsDeviceMemory(transformed_filter.template flat<T>().data(), + transformed_filter.template flat<T>().size()); + auto in_backprop_ptr = + AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(), + pre_transformed_in_backprop.template flat<T>().size()); + + bool cudnn_launch_status = + stream->ThenConvolveBackwardData(filter_desc, filter_ptr, output_desc, + out_backprop_ptr, conv_desc, + input_desc, &in_backprop_ptr) + .ok(); + + if (!cudnn_launch_status) { + context->SetStatus(errors::Internal( + "cuDNN Backward Data function launch failure : input shape(", + input_shape.DebugString(), ") filter shape(", + filter_shape.DebugString(), ")")); + } + + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), + toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1), + in_backprop->tensor<T, 4>()); + } else { + // We fill out a padded out_backprop + TensorShape padded_out_shape( + {batch, padded_out_rows, padded_out_cols, out_depth}); + Tensor padded_output; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + padded_out_shape, &padded_output)); + + Eigen::DSizes<Eigen::DenseIndex, 4> trivial_order{0, 1, 2, 3}; + Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{ + {{0, 0}, + {top_pad_rows, bottom_pad_rows}, + {left_pad_cols, right_pad_cols}, + {0, 0}}}; + + functor::InflatePadAndShuffle<Device, T, 4>()( + context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides, + pad_dims, trivial_order, padded_output.tensor<T, 4>()); + const Tensor& padded_output_cref = padded_output; + + // We then need to fill a new "reverted" filter + // We need to transpose the in_depth and out_depth for the filter and + // inverse the rows and cols. 
+ TensorShape r_filter_shape( + {filter_rows, filter_cols, out_depth, in_depth}); + Tensor r_filter; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + r_filter_shape, &r_filter)); + + Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{0, 1, 3, 2}; + Eigen::array<bool, 4> filter_rev_dims{true, true, false, false}; + functor::ShuffleAndReverse<Device, T, 4>()( + context->eigen_device<Device>(), filter.tensor<T, 4>(), filter_order, + filter_rev_dims, r_filter.tensor<T, 4>()); + const Tensor& r_filter_cref = r_filter; + + // Now we can call conv_2d directly. + functor::SpatialConvolution<Device, T>()( + context->eigen_device<Device>(), in_backprop->tensor<T, 4>(), + padded_output_cref.tensor<T, 4>(), r_filter_cref.tensor<T, 4>(), 1, + BrainPadding2EigenPadding(VALID)); + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + bool use_cudnn_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropInputOp); +}; + +// Backprop for filter. +template <typename Device, class T> +class Conv2DSlowBackpropFilterOp : public OpKernel { + public: + explicit Conv2DSlowBackpropFilterOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); + use_cudnn_ &= CanUseCudnn(); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& filter_sizes = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(filter_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ", + filter_sizes.dims())); + const TensorShape& input_shape = input.shape(); + TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>()); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter"); + Tensor* filter_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, filter_shape, &filter_backprop)); + + const int padding_rows = + (output_rows - 1) * stride + filter_rows - input_rows; + const int padding_cols = + (output_cols - 1) * stride + filter_cols - input_cols; + + // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only + // calling it when that is true. Remove this check when (if?) cuDNN starts + // supporting different padding. 
+ bool padding_compatible = + (padding_rows % 2 == 0) && (padding_cols % 2 == 0); + + auto* stream = context->op_device_context<GPUDeviceContext>()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + if (use_cudnn_ && padding_compatible) { + if (filter_rows == 1 && filter_cols == 1 && stride == 1) { + const uint64 m = in_depth; + const uint64 k = batch * input_rows * input_cols; + const uint64 n = out_depth; + + // The shape of output backprop is + // [batch, out_rows, out_cols, out_depth] + // From cublas's perspective, it is: n x k + auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(), + out_backprop.template flat<T>().size()); + + // The shape of input is + // [batch, in_rows, in_cols, in_depth], + // From cublas's perspective, it is: m x k + auto b_ptr = AsDeviceMemory(input.template flat<T>().data(), + input.template flat<T>().size()); + + // the shape of the filter backprop from the conv_2d should be + // [1, 1, in_depth, out_depth] + // From cublas's perspective, it is: n x m + auto c_ptr = AsDeviceMemory(filter_backprop->template flat<T>().data(), + filter_backprop->template flat<T>().size()); + + bool blas_launch_status = + stream->ThenBlasGemm( + perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose, n, m, k, + 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", + m, ", n=", n, ", k=", k)); + } + return; + } + + perftools::gputools::dnn::BatchDescriptor input_desc; + input_desc.set_count(batch) + .set_height(input_rows) + .set_width(input_cols) + .set_feature_map_count(in_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::BatchDescriptor output_desc; + output_desc.set_count(batch) + .set_height(output_rows) + .set_width(output_cols) + .set_feature_map_count(out_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(filter_rows) + .set_input_filter_width(filter_cols) + .set_input_feature_map_count(in_depth) + .set_output_feature_map_count(out_depth); + perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_filter_stride(stride) + .set_horizontal_filter_stride(stride) + .set_zero_padding_height(padding_rows / 2) + .set_zero_padding_width(padding_cols / 2); + + // NOTE(zhengxq): + // cuDNN only supports the following layouts : + // Input : B x D x R x C + // Filter : OD x ID x R x C + // Whereas, we have + // Input : B x R x C x D + // Filter : R x C x ID x OD + // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) + // The first TransformDepth performs + // (B x R x C x D) => (B x D x R x C). + // Since the tensor returned from cuDNN is B x D x R x C also, + // the second TransformDepth performs + // (B x D x R x C) => (B x R x C x D). 
+ + Tensor pre_transformed_filter_backprop; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({out_depth, in_depth, filter_rows, filter_cols}), + &pre_transformed_filter_backprop)); + + Tensor transformed_out_backprop; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({batch, out_depth, output_rows, output_cols}), + &transformed_out_backprop)); + + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2), + transformed_out_backprop.tensor<T, 4>()); + + Tensor transformed_input; + OP_REQUIRES_OK(context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({batch, in_depth, input_rows, input_cols}), + &transformed_input)); + + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), input.tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2), + transformed_input.tensor<T, 4>()); + + auto out_backprop_ptr = + AsDeviceMemory(transformed_out_backprop.template flat<T>().data(), + transformed_out_backprop.template flat<T>().size()); + auto filter_backprop_ptr = AsDeviceMemory( + pre_transformed_filter_backprop.template flat<T>().data(), + pre_transformed_filter_backprop.template flat<T>().size()); + auto input_ptr = + AsDeviceMemory(transformed_input.template flat<T>().data(), + transformed_input.template flat<T>().size()); + + bool cudnn_launch_status = + stream->ThenConvolveBackwardFilter(input_desc, input_ptr, output_desc, + out_backprop_ptr, conv_desc, + filter_desc, &filter_backprop_ptr) + .ok(); + + if (!cudnn_launch_status) { + context->SetStatus(errors::Internal( + "cuDNN Backward Filter function launch failure : input shape(", + input_shape.DebugString(), ") filter shape(", + filter_shape.DebugString(), ")")); + } + + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), + toConstTensor(pre_transformed_filter_backprop) + .template tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(2, 3, 1, 0), + filter_backprop->tensor<T, 4>()); + } else { + // Fall back to the non-cudnn code path + + // For the backprop of the filter, we need to also transpose the + // out_backprop. + // The shape of backprop is + // [batch, out_rows, out_cols, out_depth] + // And we need to change it to + // [out_depth, out_rows, out_cols, batch] + Eigen::DSizes<Eigen::DenseIndex, 4> out_order{3, 1, 2, 0}; + TensorShape padded_out_shape( + {out_depth, padded_out_rows, padded_out_cols, batch}); + Tensor padded_output; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + padded_out_shape, &padded_output)); + + Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{ + {{0, 0}, + {top_pad_rows, bottom_pad_rows}, + {left_pad_cols, right_pad_cols}, + {0, 0}}}; + functor::InflatePadAndShuffle<Device, T, 4>()( + context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides, + pad_dims, out_order, padded_output.tensor<T, 4>()); + const Tensor& padded_output_cref = padded_output; + + // For the backprop of the filter, we need to transpose the input. 
+ // The shape of input is + // [batch, in_rows, in_cols, in_depth] + // And we need to change it to + // [in_rows, in_cols, batch, in_depth] + Eigen::DSizes<Eigen::DenseIndex, 4> in_order{1, 2, 0, 3}; + TensorShape in_shuffle_shape({input_rows, input_cols, batch, in_depth}); + Tensor in_shuffle; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + in_shuffle_shape, &in_shuffle)); + + // No need for reversing this time. + Eigen::array<bool, 4> trivial_dims{false, false, false, false}; + functor::ShuffleAndReverse<Device, T, 4>()( + context->eigen_device<Device>(), input.tensor<T, 4>(), in_order, + trivial_dims, in_shuffle.tensor<T, 4>()); + const Tensor& in_shuffle_cref = in_shuffle; + + // The output of the conv_2d would be + // [out_depth, filter_rows, filter_cols, in_depth] + // and we need to shuffle it back to + // [filter_rows, filter_cols, in_depth, out_depth]; + // And we need to reverse the filter backprops + // So we need to allocated (sigh) yet another piece of memory to hold the + // ouptut. + TensorShape filter_shuffle_shape( + {out_depth, filter_rows, filter_cols, in_depth}); + Tensor filter_shuffle; + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(), + filter_shuffle_shape, + &filter_shuffle)); + + functor::SpatialConvolution<Device, T>()( + context->eigen_device<Device>(), filter_shuffle.tensor<T, 4>(), + padded_output_cref.tensor<T, 4>(), in_shuffle_cref.tensor<T, 4>(), 1, + BrainPadding2EigenPadding(VALID)); + + // Now copy the filter_backprop back to the destination. + Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{1, 2, 3, 0}; + Eigen::array<bool, 4> filter_rev_dims{true, true, false, false}; + const Tensor& filter_shuffle_cref = filter_shuffle; + functor::ShuffleAndReverse<Device, T, 4>()( + context->eigen_device<Device>(), filter_shuffle_cref.tensor<T, 4>(), + filter_order, filter_rev_dims, filter_backprop->tensor<T, 4>()); + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + bool use_cudnn_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropFilterOp); +}; + +// Forward declarations of the functor specializations for GPU. 
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ShuffleAndReverse<GPUDevice, T, 4>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ + const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \ + const Eigen::array<bool, 4>& reverse_dims, \ + typename TTypes<T, 4>::Tensor output); \ + extern template struct ShuffleAndReverse<GPUDevice, T, 4>; \ + template <> \ + void InflatePadAndShuffle<GPUDevice, T, 4>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ + const Eigen::DSizes<Eigen::DenseIndex, 4>& strides, \ + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4>& pad_dims, \ + const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \ + typename TTypes<T, 4>::Tensor output); \ + extern template struct InflatePadAndShuffle<GPUDevice, T, 4>; \ + template <> \ + void TransformFilter<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \ + typename TTypes<T, 4>::Tensor out); \ + extern template struct TransformFilter<GPUDevice, T>; \ + template <> \ + void TransformDepth<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \ + const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, \ + typename TTypes<T, 4>::Tensor out); \ + extern template struct TransformDepth<GPUDevice, T>; \ + template <> \ + void SpatialConvolution<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \ + typename TTypes<T, 4>::ConstTensor input, \ + typename TTypes<T, 4>::ConstTensor filter, int stride, \ + const Eigen::PaddingType& padding); \ + extern template struct SpatialConvolution<GPUDevice, T>; \ + template <> \ + void SpatialConvolutionBackwardInput<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::Tensor in_backprop, \ + typename TTypes<T, 4>::ConstTensor filter, \ + typename TTypes<T, 4>::ConstTensor output_backprop, int input_rows, \ + int input_cols, int stride); \ + extern template struct SpatialConvolutionBackwardInput<GPUDevice, T> + +DECLARE_GPU_SPEC(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T") + .HostMemory("input_sizes"), + Conv2DSlowBackpropInputOp<GPUDevice, float>); +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T") + .HostMemory("filter_sizes"), + Conv2DSlowBackpropFilterOp<GPUDevice, float>); +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc new file mode 100644 index 0000000000..aaa2951778 --- /dev/null +++ b/tensorflow/core/kernels/conv_ops.cc @@ -0,0 +1,373 @@ +// See docs in ../ops/nn_ops.cc. 
+ +#define USE_EIGEN_TENSOR +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/util/use_cudnn.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/public/tensor.h" + +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/stream_executor/stream.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +struct LaunchGeneric { + static void launch(OpKernelContext* ctx, const Tensor& input, + const Tensor& filter, int stride, + const Eigen::PaddingType& padding, Tensor* output) { + if (filter.dim_size(1) == filter.dim_size(0) && filter.dim_size(0) == 1 && + stride == 1) { + // For 1x1 kernel, the 2D convolution is reduced to matrix + // multiplication. + // + // TODO(vrv): We should be able to call SpatialConvolution + // and it will produce the same result, but doing so + // led to NaNs during training. Using matmul instead for now. + int conv_width = 1; // Width for the convolution step. + for (int i = 0; i < 3; ++i) { + conv_width *= output->dim_size(i); + } + + Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; + dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); + functor::MatMulConvFunctor<Device, T>()( + ctx->eigen_device<Device>(), + output->shaped<T, 2>({conv_width, filter.dim_size(3)}), + input.shaped<T, 2>({conv_width, filter.dim_size(2)}), + filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}), + dim_pair); + } else { + functor::SpatialConvolution<Device, T>()( + ctx->eigen_device<Device>(), output->tensor<T, 4>(), + input.tensor<T, 4>(), filter.tensor<T, 4>(), stride, padding); + } + } +}; + +template <typename Device, typename T> +struct LaunchConvOp; + +template <typename T> +struct LaunchConvOp<CPUDevice, T> { + static void launch(OpKernelContext* ctx, bool use_cudnn, const Tensor& input, + const Tensor& filter, int stride, + const Eigen::PaddingType& padding, Tensor* output) { + LaunchGeneric<CPUDevice, T>::launch(ctx, input, filter, stride, padding, + output); + } +}; + +template <typename Device, typename T> +class Conv2DOp : public BinaryOp<T> { + public: + explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); + use_cudnn_ &= CanUseCudnn(); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + 
// Input tensor is of the following dimensions: + // [ batch, in_rows, in_cols, in_depth ] + + const Tensor& input = context->input(0); + + // Input filter is of the following dimensions: + // [ filter_rows, filter_cols, in_depth, out_depth] + const Tensor& filter = context->input(1); + + // For 2D convolution, there should be 4 dimensions. + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().ShortDebugString())); + OP_REQUIRES(context, filter.dims() == 4, + errors::InvalidArgument("filter must be 4-dimensional: ", + filter.shape().ShortDebugString())); + + // The last dimension for input is in_depth. It must be the same as the + // filter's in_depth. + const int64 in_depth = input.dim_size(3); + OP_REQUIRES( + context, in_depth == filter.dim_size(2), + errors::InvalidArgument("input and filter must have the same depth: ", + in_depth, " vs ", filter.dim_size(2))); + + // The last dimension for filter is out_depth. + const int64 out_depth = filter.dim_size(3); + + // The second dimension for input is rows/height. + // The first dimension for filter is rows/height. + const int64 input_rows = input.dim_size(1); + const int64 filter_rows = filter.dim_size(0); + + // The third dimension for input is columns/width. + // The second dimension for filter is columns/width. + const int64 input_cols = input.dim_size(2); + const int64 filter_cols = filter.dim_size(1); + + // The first dimension for input is batch. + const int64 batch = input.dim_size(0); + + // For now we take the stride from the second dimension only (we + // assume row = col stride, and do not support striding on the + // batch or depth dimension). + const int stride = strides_[1]; + + int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; + if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) { + // For 1x1 kernel, the 2D convolution is reduced to matrix + // multiplication. 
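+      // A 1x1, stride-1 convolution leaves the spatial dimensions unchanged
+      // regardless of padding, so the output shape is simply
+      // [batch, input_rows, input_cols, out_depth] and the launch path can
+      // treat the input as a [batch * rows * cols, in_depth] matrix times
+      // the [in_depth, out_depth] filter matrix.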
+ out_rows = input_rows; + out_cols = input_cols; + } else { + OP_REQUIRES_OK( + context, Get2dOutputSize(input_rows, input_cols, filter_rows, + filter_cols, stride, stride, padding_, + &out_rows, &out_cols, &pad_rows, &pad_cols)); + } + TensorShape out_shape({batch, out_rows, out_cols, out_depth}); + + // Output tensor is of the following dimensions: + // [ in_batch, out_rows, out_cols, out_depth ] + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + + VLOG(2) << "Conv2D: in_depth = " << in_depth + << ", input_cols = " << input_cols + << ", filter_cols = " << filter_cols + << ", input_rows = " << input_rows + << ", filter_rows = " << filter_rows << ", stride = " << stride + << ", out_depth = " << out_depth; + + LaunchConvOp<Device, T>::launch(context, use_cudnn_, input, filter, stride, + BrainPadding2EigenPadding(padding_), + output); + } + + private: + std::vector<int32> strides_; + bool use_cudnn_; + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp); +}; + +REGISTER_KERNEL_BUILDER(Name("Conv2D") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T"), + Conv2DOp<CPUDevice, float>); + +#if GOOGLE_CUDA + +namespace { +template <typename T> +perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, + uint64 size) { + perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), + size * sizeof(T)); + perftools::gputools::DeviceMemory<T> typed(wrapped); + return typed; +} +} // namespace + +template <typename T> +struct LaunchConvOp<GPUDevice, T> { + static void launch(OpKernelContext* ctx, bool use_cudnn, + const Tensor& input_param, const Tensor& filter, + int stride, const Eigen::PaddingType& padding, + Tensor* output) { + auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream(); + OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available.")); + + if (use_cudnn) { + Tensor input = input_param; + if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1) { + // 1x1 filter, so call cublas directly. + const uint64 m = + input.dim_size(0) * input.dim_size(1) * input.dim_size(2); + const uint64 k = filter.dim_size(2); + const uint64 n = filter.dim_size(3); + + auto a_ptr = AsDeviceMemory(input.template flat<T>().data(), + input.template flat<T>().size()); + auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(), + filter.template flat<T>().size()); + auto c_ptr = AsDeviceMemory(output->template flat<T>().data(), + output->template flat<T>().size()); + + auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; + bool blas_launch_status = + stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, + b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m, + ", n=", n, ", k=", k)); + } + return; + } + if (padding == Eigen::PADDING_SAME) { + const int64 out_rows = output->dim_size(1); + const int64 out_cols = output->dim_size(2); + const int64 in_rows = input.dim_size(1); + const int64 in_cols = input.dim_size(2); + const int64 patch_rows = filter.dim_size(0); + const int64 patch_cols = filter.dim_size(1); + // Total padding on rows and cols is + // Pr = (R' - 1) * S + Kr - R + // Pc = (C' - 1) * S + Kc - C + // where (R', C') are output dimensions, (R, C) are input dimensions, S + // is stride, (Kr, Kc) are filter dimensions. + // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top + // and Pc - Pc/2 on the bottom. 
When Pr or Pc is odd, this means + // we pad more on the right and bottom than on the top and left. + const int padding_rows = (out_rows - 1) * stride + patch_rows - in_rows; + const int padding_cols = (out_cols - 1) * stride + patch_cols - in_cols; + Tensor transformed_input; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape( + {input.dim_size(0), input.dim_size(1) + padding_rows, + input.dim_size(2) + padding_cols, input.dim_size(3)}), + &transformed_input)); + + functor::PadInput<GPUDevice, T>()( + ctx->eigen_device<GPUDevice>(), input_param.tensor<T, 4>(), + padding_rows / 2, padding_rows - padding_rows / 2, padding_cols / 2, + padding_cols - padding_cols / 2, transformed_input.tensor<T, 4>()); + input = transformed_input; + } + + perftools::gputools::dnn::BatchDescriptor input_desc; + input_desc.set_count(input.dim_size(0)) + .set_height(input.dim_size(1)) + .set_width(input.dim_size(2)) + .set_feature_map_count(input.dim_size(3)) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth); + perftools::gputools::dnn::BatchDescriptor output_desc; + output_desc.set_count(output->dim_size(0)) + .set_height(output->dim_size(1)) + .set_width(output->dim_size(2)) + .set_feature_map_count(output->dim_size(3)) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth); + perftools::gputools::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(filter.dim_size(0)) + .set_input_filter_width(filter.dim_size(1)) + .set_input_feature_map_count(filter.dim_size(2)) + .set_output_feature_map_count(filter.dim_size(3)); + perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_filter_stride(stride) + .set_horizontal_filter_stride(stride); + + Tensor transformed_filter; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({filter.dim_size(3), filter.dim_size(2), + filter.dim_size(0), filter.dim_size(1)}), + &transformed_filter)); + + functor::TransformFilter<GPUDevice, T>()( + ctx->eigen_device<GPUDevice>(), filter.tensor<T, 4>(), + transformed_filter.tensor<T, 4>()); + + auto input_ptr = AsDeviceMemory(input.template flat<T>().data(), + input.template flat<T>().size()); + auto filter_ptr = + AsDeviceMemory(transformed_filter.template flat<T>().data(), + transformed_filter.template flat<T>().size()); + auto output_ptr = AsDeviceMemory(output->template flat<T>().data(), + output->template flat<T>().size()); + + bool cudnn_launch_status = + stream->ThenConvolve(input_desc, input_ptr, filter_desc, filter_ptr, + conv_desc, output_desc, &output_ptr) + .ok(); + + if (!cudnn_launch_status) { + ctx->SetStatus(errors::Internal( + "cuDNN launch failure : input shape(", input.shape().DebugString(), + ") filter shape(", filter.shape().DebugString(), ")")); + } + } else { + LaunchGeneric<GPUDevice, T>::launch(ctx, input_param, filter, stride, + padding, output); + } + } +}; + +#endif // GOOGLE_CUDA + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. 
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void SpatialConvolution<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \ + typename TTypes<T, 4>::ConstTensor input, \ + typename TTypes<T, 4>::ConstTensor filter, int stride, \ + const Eigen::PaddingType& padding); \ + extern template struct SpatialConvolution<GPUDevice, T>; \ + template <> \ + void MatMulConvFunctor<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 2>::Tensor out, \ + typename TTypes<T, 2>::ConstTensor in0, \ + typename TTypes<T, 2>::ConstTensor in1, \ + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); \ + extern template struct MatMulConvFunctor<GPUDevice, T>; \ + template <> \ + void TransformFilter<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \ + typename TTypes<T, 4>::Tensor out); \ + extern template struct TransformFilter<GPUDevice, T>; \ + template <> \ + void PadInput<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \ + int padding_rows_left, int padding_rows_right, int padding_cols_left, \ + int padding_cols_right, typename TTypes<T, 4>::Tensor out); \ + extern template struct PadInput<GPUDevice, T> + +DECLARE_GPU_SPEC(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +// Registration of the GPU implementations. +REGISTER_KERNEL_BUILDER(Name("Conv2D") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T"), + Conv2DOp<GPUDevice, float>); + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_ops_gpu.cu.cc b/tensorflow/core/kernels/conv_ops_gpu.cu.cc new file mode 100644 index 0000000000..44af814e2b --- /dev/null +++ b/tensorflow/core/kernels/conv_ops_gpu.cu.cc @@ -0,0 +1,35 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/conv_2d.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +template <typename T> +struct SpatialConvolution<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T, 4>::Tensor output, + typename TTypes<T, 4>::ConstTensor input, + typename TTypes<T, 4>::ConstTensor filter, int stride, + const Eigen::PaddingType& padding) { + // TODO(keveman): nvcc 6.5 crashes when 32 bit indexing is turned on. Enable + // this when we move to cuda 7.0. 
+ // SpatialConvolutionFunc(d, To32Bit(output), To32Bit(input), + // To32Bit(filter), stride, padding); + + SpatialConvolutionFunc(d, output, input, filter, stride, padding); + } +}; + +template struct SpatialConvolution<GPUDevice, float>; + +} // end namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc new file mode 100644 index 0000000000..e2e9d25d83 --- /dev/null +++ b/tensorflow/core/kernels/conv_ops_gpu_2.cu.cc @@ -0,0 +1,16 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/conv_2d.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; +template struct functor::InflatePadAndShuffle<GPUDevice, float, 4>; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc new file mode 100644 index 0000000000..dbbe08ef9c --- /dev/null +++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc @@ -0,0 +1,22 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/conv_2d.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; +template struct functor::ShuffleAndReverse<GPUDevice, float, 4>; + +template struct functor::TransformFilter<GPUDevice, float>; + +template struct functor::PadInput<GPUDevice, float>; + +template struct functor::TransformDepth<GPUDevice, float>; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc new file mode 100644 index 0000000000..87d79ecb4d --- /dev/null +++ b/tensorflow/core/kernels/conv_ops_gpu_matmul.cu.cc @@ -0,0 +1,16 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/conv_2d.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; +template struct functor::MatMulConvFunctor<GPUDevice, float>; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/core_ops_test.cc b/tensorflow/core/kernels/core_ops_test.cc new file mode 100644 index 0000000000..a42a5999da --- /dev/null +++ b/tensorflow/core/kernels/core_ops_test.cc @@ -0,0 +1,990 @@ +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#include <functional> +#include <memory> +#include <unordered_map> +#include <vector> + +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/eigen_thread_pool.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include 
"tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/port.h" +#include <gtest/gtest.h> + +namespace tensorflow { + +static void SetConstOp(const string& name, std::initializer_list<int64> dims, + NodeDef* node) { + Tensor tensor(DT_FLOAT, TensorShape(dims)); + for (int64 i = 0; i < tensor.NumElements(); ++i) { + tensor.flat<float>()(i) = i / 10.0f; + } + TF_CHECK_OK(NodeDefBuilder(name, "Const") + .Attr("dtype", DT_FLOAT) + .Attr("value", tensor) + .Finalize(node)); +} + +static void SetConstSizesOp(const string& name, const std::vector<int32>& sizes, + NodeDef* node) { + TensorShape shape; + shape.AddDim(sizes.size()); + Tensor tensor(DT_INT32, shape); + for (int64 i = 0; i < tensor.NumElements(); ++i) { + tensor.flat<int32>()(i) = sizes[i]; + } + TF_CHECK_OK(NodeDefBuilder(name, "Const") + .Attr("dtype", DT_INT32) + .Attr("value", tensor) + .Finalize(node)); +} + +namespace { + +enum CONV_OP { + CONV_OP_FORWARD = 0, + CONV_OP_BACKPROP_INPUT = 1, + CONV_OP_BACKPROP_FILTER = 2 +}; + +} // namespace + +static void BM_ConvFloat(int iters, int batch, int rows, int cols, int in_depth, + int out_depth, int filter_rows, int filter_cols, + CONV_OP op, int num_threads, int stride, + Padding padding, bool use_gpu, const string& label) { + if (!IsGoogleCudaEnabled() && use_gpu) { + testing::SetLabel( + strings::StrCat("Skipping GPU test (no --config=cuda): ", label)); + return; + } + testing::SetLabel(label); + + // Set the number of threads + SessionOptions options; + options.config.set_intra_op_parallelism_threads(num_threads); + + // We set up a graph for computing convolution. + GraphDef graph; + + // For this, we need an input tensor and a filter tensor. + // Compute the output size. + int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; + TF_CHECK_OK(Get2dOutputSize(rows, cols, filter_rows, filter_cols, stride, + stride, padding, &out_rows, &out_cols, &pad_rows, + &pad_cols)); + // Counting the number of floating point operations (both MUL and ADD) + int64 num_ops = 0; + if (op == CONV_OP_FORWARD) { + // Forward computation: + // BATCH x OUT_ROW X OUT_COL X IN_DEPTH X PATCH_ROW X PATH_COL X OUT_DEPTH + // We multiply by two since there are mutliplications and additions. + num_ops = static_cast<int64>(batch * in_depth * out_depth) * + static_cast<int64>(filter_rows * filter_cols) * + static_cast<int64>(out_rows * out_cols) * 2; + } else { + // Backward computation: both input and filter backprop take the same + // amount of computation: + // BATCH x IN_ROW X IN_COL X IN_DEPTH X PATCH_ROW X PATCH_COL X OUT_DEPTH + // We multiply by two since there are mutliplications and additions. 
+ num_ops = static_cast<int64>(batch * in_depth * out_depth) * + static_cast<int64>(filter_rows * filter_cols) * + static_cast<int64>(rows * cols) * 2; + } + + SetConstOp("input", {batch, rows, cols, in_depth}, graph.add_node()); + SetConstOp("filter", {filter_rows, filter_cols, in_depth, out_depth}, + graph.add_node()); + SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth}, + graph.add_node()); + SetConstSizesOp("input_sizes", + std::vector<int32>({batch, rows, cols, in_depth}), + graph.add_node()); + SetConstSizesOp("filter_sizes", std::vector<int32>({filter_rows, filter_cols, + in_depth, out_depth}), + graph.add_node()); + + // Now add the convolution op + NodeDef* conv = graph.add_node(); + switch (op) { + case CONV_OP_FORWARD: + TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2D") + .Input("input", 0, DT_FLOAT) + .Input("filter", 0, DT_FLOAT) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", padding == VALID ? "VALID" : "SAME") + .Finalize(conv)); + break; + case CONV_OP_BACKPROP_INPUT: + TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropInput") + .Input("input_sizes", 0, DT_INT32) + .Input("filter", 0, DT_FLOAT) + .Input("output_backprop", 0, DT_FLOAT) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", padding == VALID ? "VALID" : "SAME") + .Finalize(conv)); + break; + case CONV_OP_BACKPROP_FILTER: + TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropFilter") + .Input("input", 0, DT_FLOAT) + .Input("filter_sizes", 0, DT_INT32) + .Input("output_backprop", 0, DT_FLOAT) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", padding == VALID ? "VALID" : "SAME") + .Finalize(conv)); + break; + } + Graph* g = new Graph(OpRegistry::Global()); + GraphConstructorOptions opts; + TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g)); + + string device = use_gpu ? 
"gpu" : "cpu"; + test::Benchmark(device, g, &options).Run(iters); + testing::ItemsProcessed(num_ops * iters); +} + +// BS: batch_size +// R: tensor_in_rows +// C: tensor_in_cols +// ID: input_depth +// OD: output_depth +// KR: kernel_rows +// KC: kernel_cols +#define BM_ConvFloatFwd(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL) \ + static void BM_ConvFloatFwdCPU1_##LABEL(int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR, \ + PAD, false, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \ + } \ + static void BM_ConvFloatFwdCPU4_##LABEL(int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 4, STR, \ + PAD, false, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \ + } \ + static void BM_ConvFloatFwdGPU_##LABEL(int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR, \ + PAD, true, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ + } \ + BENCHMARK(BM_ConvFloatFwdCPU1_##LABEL); \ + BENCHMARK(BM_ConvFloatFwdCPU4_##LABEL); \ + BENCHMARK(BM_ConvFloatFwdGPU_##LABEL) + +BM_ConvFloatFwd(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0); +BM_ConvFloatFwd(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1); +BM_ConvFloatFwd(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2); +BM_ConvFloatFwd(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3); +BM_ConvFloatFwd(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4); +BM_ConvFloatFwd(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5); +BM_ConvFloatFwd(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6); +BM_ConvFloatFwd(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7); +BM_ConvFloatFwd(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8); +BM_ConvFloatFwd(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9); +BM_ConvFloatFwd(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10); +BM_ConvFloatFwd(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11); +BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12); +BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13); +BM_ConvFloatFwd(32, 17, 17, 1248, 192, 1, 1, 1, SAME, conv14); +BM_ConvFloatFwd(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15); +BM_ConvFloatFwd(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16); +BM_ConvFloatFwd(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17); +BM_ConvFloatFwd(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18); +BM_ConvFloatFwd(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19); +BM_ConvFloatFwd(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20); +BM_ConvFloatFwd(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21); +BM_ConvFloatFwd(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22); +BM_ConvFloatFwd(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23); +BM_ConvFloatFwd(32, 17, 17, 192, 192, 1, 3, 1, SAME, conv24); +BM_ConvFloatFwd(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25); +BM_ConvFloatFwd(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26); +BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27); +BM_ConvFloatFwd(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28); +BM_ConvFloatFwd(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29); +BM_ConvFloatFwd(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30); +BM_ConvFloatFwd(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31); +BM_ConvFloatFwd(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32); +BM_ConvFloatFwd(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33); +BM_ConvFloatFwd(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34); +BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35); +BM_ConvFloatFwd(32, 17, 17, 768, 192, 1, 1, 1, 
SAME, conv36); +BM_ConvFloatFwd(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37); +BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38); +BM_ConvFloatFwd(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39); +BM_ConvFloatFwd(32, 17, 17, 768, 320, 1, 1, 1, SAME, conv40); +BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41); +BM_ConvFloatFwd(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42); +BM_ConvFloatFwd(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43); +BM_ConvFloatFwd(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44); +BM_ConvFloatFwd(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45); +BM_ConvFloatFwd(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46); +BM_ConvFloatFwd(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47); +BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48); +BM_ConvFloatFwd(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49); +BM_ConvFloatFwd(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50); +BM_ConvFloatFwd(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51); +BM_ConvFloatFwd(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52); +BM_ConvFloatFwd(32, 73, 73, 64, 64, 1, 1, 1, VALID, conv53); +BM_ConvFloatFwd(32, 147, 147, 24, 64, 1, 1, 1, VALID, conv54); + +#define BM_ConvFloatBkInAndFilter(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL) \ + static void BM_ConvFloatBkInCPU1_##LABEL(int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1, \ + STR, PAD, false, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \ + } \ + static void BM_ConvFloatBkInCPU4_##LABEL(int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 4, \ + STR, PAD, false, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \ + } \ + static void BM_ConvFloatBkInGPU_##LABEL(int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1, \ + STR, PAD, true, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ + } \ + static void BM_ConvFloatBkFilterCPU1_##LABEL(int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \ + STR, PAD, false, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \ + } \ + static void BM_ConvFloatBkFilterCPU4_##LABEL(int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 4, \ + STR, PAD, false, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \ + } \ + static void BM_ConvFloatBkFilterGPU_##LABEL(int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \ + STR, PAD, true, \ + strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ + KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ + } \ + BENCHMARK(BM_ConvFloatBkInCPU1_##LABEL); \ + BENCHMARK(BM_ConvFloatBkInCPU4_##LABEL); \ + BENCHMARK(BM_ConvFloatBkInGPU_##LABEL); \ + BENCHMARK(BM_ConvFloatBkFilterCPU1_##LABEL); \ + BENCHMARK(BM_ConvFloatBkFilterCPU4_##LABEL); \ + BENCHMARK(BM_ConvFloatBkFilterGPU_##LABEL) + +// Benchmarks from the inception model + +BM_ConvFloatBkInAndFilter(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0); +BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1); +BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2); +BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3); +BM_ConvFloatBkInAndFilter(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4); 
+BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5); +BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6); +BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7); +BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8); +BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9); +BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10); +BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11); +BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12); +BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 192, 1, 1, 1, SAME, conv14); +BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16); +BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17); +BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18); +BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21); +BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22); +BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23); +BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 1, 3, 1, SAME, conv24); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26); +BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27); +BM_ConvFloatBkInAndFilter(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30); +BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32); +BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33); +BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34); +BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35); +BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 192, 1, 1, 1, SAME, conv36); +BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37); +BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38); +BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39); +BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 320, 1, 1, 1, SAME, conv40); +BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41); +BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42); +BM_ConvFloatBkInAndFilter(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43); +BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44); +BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45); +BM_ConvFloatBkInAndFilter(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46); +BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47); +BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48); +BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49); +BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50); +BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51); +BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52); +BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 64, 1, 1, 1, VALID, 
conv53); +BM_ConvFloatBkInAndFilter(32, 147, 147, 24, 64, 1, 1, 1, VALID, conv54); + +#define BM_ConvFloatBkFCPU(BS, R, C, ID, OD, KR, KC, TH, LABEL) \ + static void \ + BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH( \ + int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, TH, \ + 1, VALID, false, LABEL); \ + } \ + BENCHMARK( \ + BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH) + +// Benchmarks from https://github.com/soumith/convnet-benchmarks +BM_ConvFloatBkFCPU(128, 128, 128, 3, 96, 11, 11, 4, "convnet-layer1"); +BM_ConvFloatBkFCPU(128, 64, 64, 64, 128, 9, 9, 4, "convnet-layer2"); +BM_ConvFloatBkFCPU(128, 32, 32, 128, 128, 9, 9, 4, "convnet-layer3"); +BM_ConvFloatBkFCPU(128, 16, 16, 128, 128, 7, 7, 4, "convnet-layer4"); +BM_ConvFloatBkFCPU(128, 13, 13, 384, 384, 3, 3, 4, "convnet-layer5"); + +#define BM_ConvFloatBkFGPU(BS, R, C, ID, OD, KR, KC, LABEL) \ + static void BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC( \ + int iters) { \ + BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \ + 1, VALID, true, LABEL); \ + } \ + BENCHMARK(BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC) + +// Benchmarks from https://github.com/soumith/convnet-benchmarks +BM_ConvFloatBkFGPU(128, 128, 128, 3, 96, 11, 11, "convnet-layer1"); +BM_ConvFloatBkFGPU(128, 64, 64, 64, 128, 9, 9, "convnet-layer2"); +BM_ConvFloatBkFGPU(128, 32, 32, 128, 128, 9, 9, "convnet-layer3"); +BM_ConvFloatBkFGPU(128, 16, 16, 128, 128, 7, 7, "convnet-layer4"); +BM_ConvFloatBkFGPU(128, 13, 13, 384, 384, 3, 3, "convnet-layer5"); + +static void BM_LRNFloat(int iters, int depth, int cols, int rows, + int batch_size, int range, int num_threads, + const string& label) { + tensorflow::testing::StopTiming(); + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + thread::ThreadPool threadpool(Env::Default(), "test", num_threads); + EigenThreadPoolWrapper wrapper(&threadpool); + Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); + device->set_eigen_cpu_device(&eigen_cpu_device); + + gtl::InlinedVector<TensorValue, 4> inputs; + TensorShape shape({batch_size, rows, cols, depth}); + + Tensor input(DT_FLOAT, shape); + test::FillIota<float>(&input, 1.0); + inputs.push_back({nullptr, &input}); + + // Convolution op. 
+ NodeDef lrn_node_def; + TF_CHECK_OK(NodeDefBuilder("lrn_op", "LRN") + .Input("input", 0, DT_FLOAT) + .Attr("depth_radius", range) + .Attr("bias", 1.0) + .Attr("alpha", 0.1) + .Attr("beta", 0.5) + .Finalize(&lrn_node_def)); + + Status status; + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), lrn_node_def, &status)); + TF_CHECK_OK(status); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + + std::unique_ptr<OpKernelContext> context(new OpKernelContext(params)); + + op->Compute(context.get()); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + delete context->release_output(0).tensor; + op->Compute(context.get()); + } + tensorflow::testing::StopTiming(); + testing::ItemsProcessed(context->mutable_output(0)->NumElements() * iters * + (2 * range + 1) * 2); + testing::SetLabel(label); +} + +#define BM_LRNFloatFwdCPU(DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL) \ + static void \ + BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS( \ + int iters) { \ + BM_LRNFloat(iters, DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL); \ + } \ + BENCHMARK( \ + BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS) + +// clang-format off +// DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL +BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 1, "lrn 1 thread"); +BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 1, "lrn 1 thread"); +BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 1, "lrn 1 thread"); +BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 4, "lrn 4 threads"); +BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 4, "lrn 4 threads"); +BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 4, "lrn 4 threads"); +BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 8, "lrn 8 threads"); +BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 8, "lrn 8 threads"); +BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 8, "lrn 8 threads"); +// clang-format on + +/* +AvgPooling Op +*/ +static void BM_AvgPool(int iters, int batch_size, int rows, int cols, int depth, + int kernel_rows, int kernel_cols, int stride, + Padding padding, int num_threads, const string& label) { + tensorflow::testing::StopTiming(); + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + thread::ThreadPool threadpool(Env::Default(), "test", num_threads); + EigenThreadPoolWrapper wrapper(&threadpool); + Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); + device->set_eigen_cpu_device(&eigen_cpu_device); + + gtl::InlinedVector<TensorValue, 4> inputs; + TensorShape shape1({batch_size, rows, cols, depth}); + Tensor input1(DT_FLOAT, shape1); + test::FillIota<float>(&input1, 1.0); + inputs.push_back({nullptr, &input1}); + + // AvgPooling op. + NodeDef avgpool_node_def; + CHECK_EQ(kernel_rows, kernel_cols); + Status status = NodeDefBuilder("avgpool_op", "AvgPool") + .Input(FakeInput(DT_FLOAT)) + .Attr("ksize", {1, kernel_rows, kernel_cols, 1}) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", padding == VALID ? 
"VALID" : "SAME") + .Finalize(&avgpool_node_def); + TF_CHECK_OK(status); + + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), avgpool_node_def, &status)); + TF_CHECK_OK(status); + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + + std::unique_ptr<OpKernelContext> avgpool_context(new OpKernelContext(params)); + + op->Compute(avgpool_context.get()); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + delete avgpool_context->release_output(0).tensor; + op->Compute(avgpool_context.get()); + } + tensorflow::testing::StopTiming(); + testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() * + iters); + testing::SetLabel(label); +} + +// BS: batch_size +// IR: input_rows +// IC: input_cols +// ND: node_depth +// KR: kernel_rows +// KC: kernel_cols +// ST: stride. We use the same stride for both directions. +// PT: padding +#define BM_AvgPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ + static void \ + BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \ + int iters) { \ + BM_AvgPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \ + } \ + BENCHMARK( \ + BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH) + +// Labels are taken from the 2014-July-24 version of imagenet +BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "avgpool0_VALID"); +BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "avgpool1_VALID"); +BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "avgpool4_VALID"); +BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "avgpool10_VALID"); +BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "avgpool0_SAME"); +BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "avgpool1_SAME"); +BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "avgpool4_SAME"); +BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "avgpool10_SAME"); +BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "avgpool0_VALID"); +BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "avgpool1_VALID"); +BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "avgpool4_VALID"); +BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "avgpool10_VALID"); +BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "avgpool0_SAME"); +BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "avgpool1_SAME"); +BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "avgpool4_SAME"); +BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "avgpool10_SAME"); + +static void BM_AvgPoolBk(int iters, int batch_size, int rows, int cols, + int depth, int kernel_rows, int kernel_cols, + int stride, Padding padding, int num_threads, + const string& label) { + tensorflow::testing::StopTiming(); + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + thread::ThreadPool threadpool(Env::Default(), "test", num_threads); + EigenThreadPoolWrapper wrapper(&threadpool); + Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); + device->set_eigen_cpu_device(&eigen_cpu_device); + + gtl::InlinedVector<TensorValue, 4> inputs; + + int out_height, out_width, pad_rows, pad_cols; + Status status = + Get2dOutputSize(rows, cols, kernel_rows, kernel_cols, 
stride, stride, + padding, &out_height, &out_width, &pad_rows, &pad_cols); + TF_CHECK_OK(status); + TensorShape output_shape({batch_size, out_height, out_width, depth}); + TensorShape shape2({4}); + Tensor input_shape_tensor(DT_INT32, shape2); + int32 input_dims[] = {batch_size, rows, cols, depth}; + for (int i = 0; i < 4; i++) { + input_shape_tensor.flat<int32>()(i) = input_dims[i]; + } + inputs.push_back({nullptr, &input_shape_tensor}); + + Tensor output_backprop(DT_FLOAT, output_shape); + test::FillIota<float>(&output_backprop, 11.0); + inputs.push_back({nullptr, &output_backprop}); + + // AvgPoolGrad op. + NodeDef avgpool_grad_node_def; + status = NodeDefBuilder("avgpool_grad_op", "AvgPoolGrad") + .Input(FakeInput()) + .Input(FakeInput(DT_FLOAT)) + .Attr("ksize", {1, kernel_rows, kernel_cols, 1}) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", padding == VALID ? "VALID" : "SAME") + .Finalize(&avgpool_grad_node_def); + TF_CHECK_OK(status); + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, nullptr, cpu_allocator(), avgpool_grad_node_def, &status)); + TF_CHECK_OK(status); + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + + std::unique_ptr<OpKernelContext> avgpool_context(new OpKernelContext(params)); + + op->Compute(avgpool_context.get()); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + delete avgpool_context->release_output(0).tensor; + op->Compute(avgpool_context.get()); + } + tensorflow::testing::StopTiming(); + testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() * + iters); + testing::SetLabel(label); +} + +// BS: batch_size +// IR: input_rows +// IC: input_cols +// ND: node_depth +// KR: kernel_rows +// KC: kernel_cols +// ST: stride. We use the same stride for both directions. +// PT: padding +// The resulted symbol is too long. 
Need to use two macros to fit in 80-chars +#define BM_AvgPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ + static void \ + BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \ + int iters) { \ + BM_AvgPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \ + } \ + BENCHMARK( \ + BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH) + +// Shapes taken from the 2015/05/16 inception model +BM_AvgPoolBkCPU(32, 35, 35, 192, 3, 3, 1, SAME, 1, "avgpool_grad0_SAME"); +BM_AvgPoolBkCPU(32, 35, 35, 256, 3, 3, 1, SAME, 1, "avgpool_grad1_SAME"); +BM_AvgPoolBkCPU(32, 17, 17, 768, 3, 3, 1, SAME, 1, "avgpool_grad2_SAME"); +BM_AvgPoolBkCPU(32, 17, 17, 1024, 3, 3, 1, SAME, 1, "avgpool_grad3_SAME"); +BM_AvgPoolBkCPU(32, 17, 17, 1152, 3, 3, 1, SAME, 1, "avgpool_grad4_SAME"); +BM_AvgPoolBkCPU(32, 17, 17, 1216, 3, 3, 1, SAME, 1, "avgpool_grad5_SAME"); +BM_AvgPoolBkCPU(32, 17, 17, 1248, 5, 5, 3, VALID, 1, "avgpool_grad6_VALID"); +BM_AvgPoolBkCPU(32, 8, 8, 1760, 3, 3, 1, SAME, 1, "avgpool_grad7_SAME"); +BM_AvgPoolBkCPU(32, 8, 8, 2048, 8, 8, 1, VALID, 1, "avgpool_grad8_VALID"); + +/* +MaxPooling Op +*/ +static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth, + int kernel_rows, int kernel_cols, int stride, + Padding padding, int num_threads, const string& label) { + tensorflow::testing::StopTiming(); + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + thread::ThreadPool threadpool(Env::Default(), "test", num_threads); + EigenThreadPoolWrapper wrapper(&threadpool); + Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); + device->set_eigen_cpu_device(&eigen_cpu_device); + + gtl::InlinedVector<TensorValue, 4> inputs; + TensorShape shape1({batch_size, rows, cols, depth}); + Tensor input1(DT_FLOAT, shape1); + test::FillIota<float>(&input1, 1.0); + inputs.push_back({nullptr, &input1}); + + // MaxPooling op. + NodeDef maxpool_node_def; + CHECK_EQ(kernel_rows, kernel_cols); + Status status = NodeDefBuilder("maxpool_op", "MaxPool") + .Input(FakeInput()) + .Attr("ksize", {1, kernel_rows, kernel_cols, 1}) + .Attr("strides", {1, stride, stride, 1}) + .Attr("padding", padding == VALID ? "VALID" : "SAME") + .Finalize(&maxpool_node_def); + TF_CHECK_OK(status); + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), maxpool_node_def, &status)); + TF_CHECK_OK(status); + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + + std::unique_ptr<OpKernelContext> maxpool_context(new OpKernelContext(params)); + + op->Compute(maxpool_context.get()); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + delete maxpool_context->release_output(0).tensor; + op->Compute(maxpool_context.get()); + } + tensorflow::testing::StopTiming(); + testing::ItemsProcessed(maxpool_context->mutable_output(0)->NumElements() * + iters); + testing::SetLabel(label); +} + +// BS: batch_size +// IR: input_rows +// IC: input_cols +// ND: node_depth +// KR: kernel_rows +// KC: kernel_cols +// ST: stride. We use the same stride for both directions. 
+// PT: padding +#define BM_MaxPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ + static void \ + BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \ + int iters) { \ + BM_MaxPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \ + } \ + BENCHMARK( \ + BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH) + +// Labels are taken from the 2014-July-24 version of imagenet +BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "maxpool0_VALID"); +BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "maxpool1_VALID"); +BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "maxpool4_VALID"); +BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "maxpool10_VALID"); +BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "maxpool0_SAME"); +BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "maxpool1_SAME"); +BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "maxpool4_SAME"); +BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "maxpool10_SAME"); +BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "maxpool0_VALID"); +BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "maxpool1_VALID"); +BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "maxpool4_VALID"); +BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "maxpool10_VALID"); +BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "maxpool0_SAME"); +BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "maxpool1_SAME"); +BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "maxpool4_SAME"); +BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "maxpool10_SAME"); + +static void BM_MaxPoolBk(int iters, int batch_size, int rows, int cols, + int depth, int kernel_rows, int kernel_cols, + int stride, Padding padding, int num_threads, + bool use_gpu, const string& label) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + + int out_height, out_width, pad_rows, pad_cols; + Status status = + Get2dOutputSize(rows, cols, kernel_rows, kernel_cols, stride, stride, + padding, &out_height, &out_width, &pad_rows, &pad_cols); + TF_CHECK_OK(status); + + Tensor input_data(DT_FLOAT, TensorShape({batch_size, rows, cols, depth})); + input_data.flat<float>().setRandom(); + Node* input_data_node = ops::Const(input_data, b.opts()); + + Tensor output_data(DT_FLOAT, + TensorShape({batch_size, out_height, out_width, depth})); + output_data.flat<float>().setRandom(); + Node* output_data_node = ops::Const(output_data, b.opts()); + + Tensor output_diff(DT_FLOAT, + TensorShape({batch_size, out_height, out_width, depth})); + output_diff.flat<float>().setRandom(); + Node* output_diff_node = ops::Const(output_diff, b.opts()); + + CHECK_EQ(kernel_rows, kernel_cols); + ops::MaxPoolGrad(input_data_node, output_data_node, output_diff_node, + {1, kernel_rows, kernel_cols, 1} /* ksize */, + {1, stride, stride, 1} /* stride */, + padding == VALID ? "VALID" : "SAME", b.opts()); + Graph* g = new Graph(OpRegistry::Global()); + TF_CHECK_OK(b.ToGraph(g)); + string device = use_gpu ? "gpu" : "cpu"; + test::Benchmark(device, g).Run(iters); + + testing::ItemsProcessed(batch_size * rows * cols * depth * iters); + testing::SetLabel(label); +} + +// BS: batch_size +// IR: input_rows +// IC: input_cols +// ND: node_depth +// KR: kernel_rows +// KC: kernel_cols +// ST: stride. We use the same stride for both directions. +// PT: padding +// The resulted symbol is too long. 
Need to use two macros to fit in 80-chars +// clang-format off +#define BM_MaxPoolBkGPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ + static void \ + BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \ + ##PT##_##TH( \ + int iters) { \ + BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, true, LABEL); \ + } \ + BENCHMARK( \ + BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \ + ##PT##_##TH) \ + +#define BM_MaxPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ + static void \ + BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \ + ##PT##_##TH( \ + int iters) { \ + BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, false, LABEL); \ + } \ + BENCHMARK( \ + BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \ + ##PT##_##TH) +// clang-format on + +// Shapes taken from the 2015/05/16 inception model +BM_MaxPoolBkGPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID"); +BM_MaxPoolBkGPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID"); +BM_MaxPoolBkGPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID"); +BM_MaxPoolBkGPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID"); +BM_MaxPoolBkGPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID"); + +BM_MaxPoolBkCPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID"); +BM_MaxPoolBkCPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID"); +BM_MaxPoolBkCPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID"); +BM_MaxPoolBkCPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID"); +BM_MaxPoolBkCPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID"); + +/* +Relu Op +Run benchmark with: +*/ +static void BM_ReluFloat(int iters, int batch_size, int rows, int cols, + int depth, int num_threads, const string& label) { + tensorflow::testing::StopTiming(); + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + thread::ThreadPool threadpool(Env::Default(), "test", num_threads); + EigenThreadPoolWrapper wrapper(&threadpool); + Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); + device->set_eigen_cpu_device(&eigen_cpu_device); + + gtl::InlinedVector<TensorValue, 4> inputs; + TensorShape shape1({batch_size, rows, cols, depth}); + Tensor input1(DT_FLOAT, shape1); + test::FillIota<float>(&input1, 1.0); + inputs.push_back({nullptr, &input1}); + + // Reluing op. 
+ NodeDef relu_node_def; + Status status = NodeDefBuilder("relu_op", "Relu") + .Input(FakeInput(DT_FLOAT)) + .Finalize(&relu_node_def); + TF_CHECK_OK(status); + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), relu_node_def, &status)); + TF_CHECK_OK(status); + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + + std::unique_ptr<OpKernelContext> relu_context(new OpKernelContext(params)); + + op->Compute(relu_context.get()); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + delete relu_context->release_output(0).tensor; + op->Compute(relu_context.get()); + } + tensorflow::testing::StopTiming(); + testing::ItemsProcessed(relu_context->mutable_output(0)->NumElements() * + iters); + testing::SetLabel(label); +} + +// BS: batch_size +// IR: input_rows +// IC: input_cols +// ND: node_depth +#define BM_Relu(BS, IR, IC, ND, TH, LABEL) \ + static void BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH(int iters) { \ + BM_ReluFloat(iters, BS, IR, IC, ND, TH, LABEL); \ + } \ + BENCHMARK(BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH) + +BM_Relu(32, 112, 112, 64, 1, "relu0"); +BM_Relu(32, 56, 56, 192, 1, "relu1"); +BM_Relu(32, 28, 28, 352, 1, "relu4"); +BM_Relu(32, 14, 14, 576, 1, "relu10"); +BM_Relu(32, 112, 112, 64, 4, "relu0"); +BM_Relu(32, 56, 56, 192, 4, "relu1"); +BM_Relu(32, 28, 28, 352, 4, "relu4"); +BM_Relu(32, 14, 14, 576, 4, "relu10"); + +static void BM_ImageNetSoftmaxFwd(int iters, int batch_size, int node_depth, + int num_threads, const string& label) { + tensorflow::testing::StopTiming(); + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + thread::ThreadPool threadpool(Env::Default(), "test", num_threads); + EigenThreadPoolWrapper wrapper(&threadpool); + Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); + device->set_eigen_cpu_device(&eigen_cpu_device); + + gtl::InlinedVector<TensorValue, 4> inputs; + TensorShape shape1({node_depth, batch_size}); + Tensor* input1 = new Tensor(DT_FLOAT, shape1); + test::FillIota<float>(input1, 1.0); + inputs.push_back({nullptr, input1}); + + // Softmax op. 
+ NodeDef softmax_node_def; + TF_CHECK_OK(NodeDefBuilder("softmax_op", "Softmax") + .Input("input", 0, DT_FLOAT) + .Finalize(&softmax_node_def)); + Status status; + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), softmax_node_def, &status)); + TF_CHECK_OK(status); + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + + std::unique_ptr<OpKernelContext> softmax_context(new OpKernelContext(params)); + + op->Compute(softmax_context.get()); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + delete softmax_context->release_output(0).tensor; + op->Compute(softmax_context.get()); + } + tensorflow::testing::StopTiming(); + testing::ItemsProcessed(softmax_context->mutable_output(0)->NumElements() * + iters); + testing::SetLabel(label); +} + +#define BM_ImageNetSoftmaxFwdCPU(BATCH_SIZE, NODE_DEPTH, TH, LABEL) \ + static void BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH( \ + int iters) { \ + BM_ImageNetSoftmaxFwd(iters, BATCH_SIZE, NODE_DEPTH, TH, LABEL); \ + } \ + BENCHMARK(BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH) + +// Labels are taken from the 2014-July-24 version of imagenet +BM_ImageNetSoftmaxFwdCPU(32, 1008, 1, "softmax32"); +BM_ImageNetSoftmaxFwdCPU(128, 1008, 1, "softmax128"); +BM_ImageNetSoftmaxFwdCPU(32, 1008, 4, "softmax32"); +BM_ImageNetSoftmaxFwdCPU(128, 1008, 4, "softmax128"); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/count_up_to_op.cc b/tensorflow/core/kernels/count_up_to_op.cc new file mode 100644 index 0000000000..7cf4bdb6d0 --- /dev/null +++ b/tensorflow/core/kernels/count_up_to_op.cc @@ -0,0 +1,51 @@ +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { + +template <class T> +class CountUpToOp : public OpKernel { + public: + explicit CountUpToOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("limit", &limit_)); + } + + void Compute(OpKernelContext* context) override { + T before_increment; + { + mutex_lock l(*context->input_ref_mutex(0)); + Tensor tensor = context->mutable_input(0, true); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(tensor.shape()), + errors::InvalidArgument("input is not a scalar: ", + tensor.shape().DebugString())); + T* ptr = &tensor.scalar<T>()(); + before_increment = *ptr; + if (*ptr >= limit_) { + context->SetStatus(errors::OutOfRange("Reached limit of ", limit_)); + return; + } + ++*ptr; + } + // Output if no error. 
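+    // The scalar output is the value read before the increment; the
+    // increment itself was applied above while holding the input's ref mutex.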
+ Tensor* out_tensor; + OP_REQUIRES_OK(context, context->allocate_output("output", TensorShape({}), + &out_tensor)); + out_tensor->scalar<T>()() = before_increment; + } + + private: + T limit_; +}; + +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("CountUpTo").TypeConstraint<TYPE>("T").Device(DEVICE_CPU), \ + CountUpToOp<TYPE>) + +REGISTER(int32); +REGISTER(int64); + +#undef REGISTER + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc new file mode 100644 index 0000000000..5d39b88166 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_abs.cc @@ -0,0 +1,23 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER4(UnaryOp, CPU, "Abs", functor::abs, float, double, int32, int64); +#ifndef __ANDROID__ +REGISTER_KERNEL_BUILDER(Name("ComplexAbs").Device(DEVICE_CPU), + UnaryOp<CPUDevice, functor::abs<complex64>>); +#endif +#if GOOGLE_CUDA +REGISTER3(UnaryOp, GPU, "Abs", functor::abs, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Abs") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .TypeConstraint<int32>("T"), + UnaryOp<CPUDevice, functor::abs<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_add.cc b/tensorflow/core/kernels/cwise_op_add.cc new file mode 100644 index 0000000000..a6cd4bddbe --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_add.cc @@ -0,0 +1,21 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER7(BinaryOp, CPU, "Add", functor::add, float, double, int32, int64, int8, + int16, complex64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "Add", functor::add, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. 
+REGISTER_KERNEL_BUILDER(Name("Add") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::add<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc new file mode 100644 index 0000000000..0a8f1313f8 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_ceil.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER2(UnaryOp, CPU, "Ceil", functor::ceil, float, double); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Ceil", functor::ceil, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_complex.cc b/tensorflow/core/kernels/cwise_op_complex.cc new file mode 100644 index 0000000000..825181bc35 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_complex.cc @@ -0,0 +1,10 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER_KERNEL_BUILDER(Name("Complex").Device(DEVICE_CPU), + BinaryOp<CPUDevice, functor::make_complex<float>>); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("Complex").Device(DEVICE_GPU), + BinaryOp<GPUDevice, functor::make_complex<float>>); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_conj.cc b/tensorflow/core/kernels/cwise_op_conj.cc new file mode 100644 index 0000000000..ba445d1c3d --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_conj.cc @@ -0,0 +1,10 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER_KERNEL_BUILDER(Name("Conj").Device(DEVICE_CPU), + UnaryOp<CPUDevice, functor::conj<complex64>>); +#if GOOGLE_CUDA +// REGISTER_KERNEL_BUILDER(Name("Conj").Device(DEVICE_GPU), +// UnaryOp<GPUDevice, functor::conj<complex64>>); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc new file mode 100644 index 0000000000..45e24fc2ec --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_cos.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER3(UnaryOp, CPU, "Cos", functor::cos, float, double, complex64); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Cos", functor::cos, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc new file mode 100644 index 0000000000..76d606ed03 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_div.cc @@ -0,0 +1,21 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER5(BinaryOp, CPU, "Div", functor::div, float, double, int32, int64, + complex64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "Div", functor::div, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. 
+REGISTER_KERNEL_BUILDER(Name("Div") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::div<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_equal_to.cc b/tensorflow/core/kernels/cwise_op_equal_to.cc new file mode 100644 index 0000000000..8369299332 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_equal_to.cc @@ -0,0 +1,21 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER5(BinaryOp, CPU, "Equal", functor::equal_to, float, double, int32, + int64, complex64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "Equal", functor::equal_to, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Equal") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::equal_to<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc new file mode 100644 index 0000000000..b2603a1b4c --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_exp.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER3(UnaryOp, CPU, "Exp", functor::exp, float, double, complex64); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Exp", functor::exp, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc new file mode 100644 index 0000000000..83c8203953 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_floor.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER2(UnaryOp, CPU, "Floor", functor::floor, float, double); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Floor", functor::floor, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc new file mode 100644 index 0000000000..59436afbc0 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_abs.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY3(abs, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc new file mode 100644 index 0000000000..edf8e0d1a5 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_add.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(add, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc new file mode 100644 index 0000000000..f24c4b8b73 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_ceil.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(ceil, float, double); +} // namespace functor +} // 
namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc new file mode 100644 index 0000000000..29086b5c71 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_complex.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY1(make_complex, float); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc new file mode 100644 index 0000000000..cae22cea8e --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +// DEFINE_UNARY1(conj, complex64); // not working +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc new file mode 100644 index 0000000000..c8412496a8 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_cos.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(cos, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc new file mode 100644 index 0000000000..c581c0487e --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_div.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(div, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc new file mode 100644 index 0000000000..f994822a74 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY4(equal_to, float, double, int64, complex64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc new file mode 100644 index 0000000000..caeaa19cef --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(exp, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc new file mode 100644 index 0000000000..0a06ff2978 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_floor.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(floor, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git 
a/tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc new file mode 100644 index 0000000000..e1278e077b --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_greater.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(greater, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc new file mode 100644 index 0000000000..fafcf9b28a --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_greater_equal.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(greater_equal, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc new file mode 100644 index 0000000000..0370782c96 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_imag.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY1(get_imag, complex64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc new file mode 100644 index 0000000000..020abef210 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_inverse.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY3(inverse, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc new file mode 100644 index 0000000000..7a3a273af7 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_isfinite.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(isfinite, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc new file mode 100644 index 0000000000..cfc4be3d25 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_isinf.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(isinf, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc new file mode 100644 index 0000000000..c93b74387e --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_isnan.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(isnan, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git 
a/tensorflow/core/kernels/cwise_op_gpu_less.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_less.cu.cc new file mode 100644 index 0000000000..8e2b28ac60 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_less.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(less, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc new file mode 100644 index 0000000000..be8e34a58b --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_less_equal.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(less_equal, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_log.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_log.cu.cc new file mode 100644 index 0000000000..7d183cce50 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_log.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(log, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc new file mode 100644 index 0000000000..ba7046f9f0 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_logical_and.cu.cc @@ -0,0 +1,13 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +template struct BinaryFunctor<GPUDevice, logical_and, 1>; +template struct BinaryFunctor<GPUDevice, logical_and, 2>; +template struct BinaryFunctor<GPUDevice, logical_and, 3>; +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc new file mode 100644 index 0000000000..34a43a76ef --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_logical_not.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +template struct UnaryFunctor<GPUDevice, logical_not>; +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc new file mode 100644 index 0000000000..47a7bd68dc --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_logical_or.cu.cc @@ -0,0 +1,13 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +template struct BinaryFunctor<GPUDevice, logical_or, 1>; +template struct BinaryFunctor<GPUDevice, logical_or, 2>; +template struct BinaryFunctor<GPUDevice, logical_or, 3>; +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc new file mode 100644 index 0000000000..8f7ab90e9a --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_maximum.cu.cc @@ -0,0 +1,11 
@@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(maximum, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc new file mode 100644 index 0000000000..75fd7f89b4 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_minimum.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(minimum, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc new file mode 100644 index 0000000000..d08a17a94d --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_mod.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +// No GPU ops for mod yet. +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc new file mode 100644 index 0000000000..e0a6738bef --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(mul, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc new file mode 100644 index 0000000000..3031afbb75 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY4(neg, float, double, int32, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc new file mode 100644 index 0000000000..59c76ee88b --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_not_equal_to.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY4(not_equal_to, float, double, int64, complex64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc new file mode 100644 index 0000000000..50177495bc --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_pow.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(pow, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_real.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_real.cu.cc new file mode 100644 index 0000000000..3b1d465914 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_real.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include 
"tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY1(get_real, complex64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc new file mode 100644 index 0000000000..682e2d2d4b --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_rsqrt.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(rsqrt, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc new file mode 100644 index 0000000000..b5125648e3 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc @@ -0,0 +1,15 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +template struct SelectFunctor<GPUDevice, float>; +template struct SelectFunctor<GPUDevice, double>; +template struct SelectFunctor<GPUDevice, int32>; +template struct SelectFunctor<GPUDevice, int64>; +template struct SelectFunctor<GPUDevice, complex64>; +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc new file mode 100644 index 0000000000..9c250f3071 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_sigmoid.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(sigmoid, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc new file mode 100644 index 0000000000..f413480ecc --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_sign.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY3(sign, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc new file mode 100644 index 0000000000..6135f3b780 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_sin.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(sin, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc new file mode 100644 index 0000000000..9bdf3b9e30 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_sqrt.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(sqrt, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_square.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_square.cu.cc new file mode 100644 index 0000000000..6b900e994d --- 
/dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_square.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY3(square, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc new file mode 100644 index 0000000000..6fd5ea0d38 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_sub.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_BINARY3(sub, float, double, int64); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc new file mode 100644 index 0000000000..e0393f6c2a --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_gpu_tanh.cu.cc @@ -0,0 +1,11 @@ +#if GOOGLE_CUDA + +#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h" + +namespace tensorflow { +namespace functor { +DEFINE_UNARY2(tanh, float, double); +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc new file mode 100644 index 0000000000..9ae31dcdfe --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_greater.cc @@ -0,0 +1,21 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER4(BinaryOp, CPU, "Greater", functor::greater, float, double, int32, + int64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "Greater", functor::greater, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Greater") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::greater<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc new file mode 100644 index 0000000000..be4cc5dc79 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc @@ -0,0 +1,22 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER4(BinaryOp, CPU, "GreaterEqual", functor::greater_equal, float, double, + int32, int64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "GreaterEqual", functor::greater_equal, float, double, + int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. 
+REGISTER_KERNEL_BUILDER(Name("GreaterEqual") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::greater_equal<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_imag.cc b/tensorflow/core/kernels/cwise_op_imag.cc new file mode 100644 index 0000000000..c2432326fc --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_imag.cc @@ -0,0 +1,10 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER_KERNEL_BUILDER(Name("Imag").Device(DEVICE_CPU), + UnaryOp<CPUDevice, functor::get_imag<complex64>>); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("Imag").Device(DEVICE_GPU), + UnaryOp<GPUDevice, functor::get_imag<complex64>>); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_inverse.cc b/tensorflow/core/kernels/cwise_op_inverse.cc new file mode 100644 index 0000000000..6af883e755 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_inverse.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER3(UnaryOp, CPU, "Inv", functor::inverse, float, double, complex64); +#if GOOGLE_CUDA +REGISTER3(UnaryOp, GPU, "Inv", functor::inverse, float, double, int64); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc new file mode 100644 index 0000000000..e52d199a8f --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_isfinite.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER2(UnaryOp, CPU, "IsFinite", functor::isfinite, float, double); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "IsFinite", functor::isfinite, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc new file mode 100644 index 0000000000..868204f86e --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_isinf.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER2(UnaryOp, CPU, "IsInf", functor::isinf, float, double); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "IsInf", functor::isinf, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc new file mode 100644 index 0000000000..a8f4d60d0f --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_isnan.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER2(UnaryOp, CPU, "IsNan", functor::isnan, float, double); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "IsNan", functor::isnan, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc new file mode 100644 index 0000000000..3b5f75445c --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_less.cc @@ -0,0 +1,20 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER4(BinaryOp, CPU, "Less", functor::less, float, double, int32, int64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "Less", functor::less, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. 
+REGISTER_KERNEL_BUILDER(Name("Less") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::less<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc new file mode 100644 index 0000000000..507c7c2908 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_less_equal.cc @@ -0,0 +1,22 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER4(BinaryOp, CPU, "LessEqual", functor::less_equal, float, double, int32, + int64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "LessEqual", functor::less_equal, float, double, + int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("LessEqual") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::less_equal<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc new file mode 100644 index 0000000000..ebc7cbcc4e --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_log.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER3(UnaryOp, CPU, "Log", functor::log, float, double, complex64); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Log", functor::log, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_logical_and.cc b/tensorflow/core/kernels/cwise_op_logical_and.cc new file mode 100644 index 0000000000..a4075088f4 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_logical_and.cc @@ -0,0 +1,10 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER_KERNEL_BUILDER(Name("LogicalAnd").Device(DEVICE_CPU), + BinaryOp<CPUDevice, functor::logical_and>); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("LogicalAnd").Device(DEVICE_GPU), + BinaryOp<GPUDevice, functor::logical_and>); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_logical_not.cc b/tensorflow/core/kernels/cwise_op_logical_not.cc new file mode 100644 index 0000000000..b2e97bf70c --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_logical_not.cc @@ -0,0 +1,10 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER_KERNEL_BUILDER(Name("LogicalNot").Device(DEVICE_CPU), + UnaryOp<CPUDevice, functor::logical_not>); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("LogicalNot").Device(DEVICE_GPU), + UnaryOp<GPUDevice, functor::logical_not>); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_logical_or.cc b/tensorflow/core/kernels/cwise_op_logical_or.cc new file mode 100644 index 0000000000..0d1df082f7 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_logical_or.cc @@ -0,0 +1,10 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER_KERNEL_BUILDER(Name("LogicalOr").Device(DEVICE_CPU), + BinaryOp<CPUDevice, functor::logical_or>); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("LogicalOr").Device(DEVICE_GPU), + BinaryOp<GPUDevice, functor::logical_or>); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc new 
file mode 100644 index 0000000000..c0c9e3f6f5 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_maximum.cc @@ -0,0 +1,21 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER4(BinaryOp, CPU, "Maximum", functor::maximum, float, double, int32, + int64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "Maximum", functor::maximum, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Maximum") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::maximum<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc new file mode 100644 index 0000000000..4c6bf7df05 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_minimum.cc @@ -0,0 +1,21 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER4(BinaryOp, CPU, "Minimum", functor::minimum, float, double, int32, + int64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "Minimum", functor::minimum, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Minimum") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::minimum<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mod.cc b/tensorflow/core/kernels/cwise_op_mod.cc new file mode 100644 index 0000000000..17f2834030 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_mod.cc @@ -0,0 +1,6 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER2(BinaryOp, CPU, "Mod", functor::mod, int32, int64); +REGISTER2(BinaryOp, CPU, "Mod", functor::fmod, float, double); +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_mul.cc b/tensorflow/core/kernels/cwise_op_mul.cc new file mode 100644 index 0000000000..15f65012cd --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_mul.cc @@ -0,0 +1,21 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER7(BinaryOp, CPU, "Mul", functor::mul, float, double, int32, int64, int8, + int16, complex64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "Mul", functor::mul, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. 
+REGISTER_KERNEL_BUILDER(Name("Mul") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::mul<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc new file mode 100644 index 0000000000..3a19b2e94f --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_neg.cc @@ -0,0 +1,9 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER5(UnaryOp, CPU, "Neg", functor::neg, float, double, int32, complex64, + int64); +#if GOOGLE_CUDA +REGISTER4(UnaryOp, GPU, "Neg", functor::neg, float, double, int32, int64); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to.cc b/tensorflow/core/kernels/cwise_op_not_equal_to.cc new file mode 100644 index 0000000000..02d434a1c2 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_not_equal_to.cc @@ -0,0 +1,10 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER5(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, double, + int32, int64, complex64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, double, + int64); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc new file mode 100644 index 0000000000..d10dced85f --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_pow.cc @@ -0,0 +1,9 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER5(BinaryOp, CPU, "Pow", functor::pow, float, double, int32, int64, + complex64); +#if GOOGLE_CUDA +REGISTER3(BinaryOp, GPU, "Pow", functor::pow, float, double, int64); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_real.cc b/tensorflow/core/kernels/cwise_op_real.cc new file mode 100644 index 0000000000..84295a5a16 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_real.cc @@ -0,0 +1,10 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER_KERNEL_BUILDER(Name("Real").Device(DEVICE_CPU), + UnaryOp<CPUDevice, functor::get_real<complex64>>); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("Real").Device(DEVICE_GPU), + UnaryOp<GPUDevice, functor::get_real<complex64>>); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_rsqrt.cc b/tensorflow/core/kernels/cwise_op_rsqrt.cc new file mode 100644 index 0000000000..a22b1209de --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_rsqrt.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER3(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, double, complex64); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Rsqrt", functor::rsqrt, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc new file mode 100644 index 0000000000..baa821690a --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_select.cc @@ -0,0 +1,17 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER_SELECT(CPU, "Select", "", float); +REGISTER_SELECT(CPU, "Select", "", double); +REGISTER_SELECT(CPU, "Select", "", int32); +REGISTER_SELECT(CPU, "Select", "", int64); +REGISTER_SELECT(CPU, "Select", "", complex64); +REGISTER_SELECT(CPU, "Select", "", string); +#if GOOGLE_CUDA +REGISTER_SELECT(GPU, "Select", "", float); 
+REGISTER_SELECT(GPU, "Select", "", double); +REGISTER_SELECT(GPU, "Select", "", int32); +REGISTER_SELECT(GPU, "Select", "", int64); +REGISTER_SELECT(GPU, "Select", "", complex64); +#endif // GOOGLE_CUDA +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc new file mode 100644 index 0000000000..e03b5d54dd --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER3(UnaryOp, CPU, "Sigmoid", functor::sigmoid, float, double, complex64); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc new file mode 100644 index 0000000000..59a0bfa1ed --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_sign.cc @@ -0,0 +1,19 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER4(UnaryOp, CPU, "Sign", functor::sign, float, double, int32, int64); +#if GOOGLE_CUDA +REGISTER3(UnaryOp, GPU, "Sign", functor::sign, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Sign") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .TypeConstraint<int32>("T"), + UnaryOp<CPUDevice, functor::sign<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc new file mode 100644 index 0000000000..e7c87374d7 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_sin.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER3(UnaryOp, CPU, "Sin", functor::sin, float, double, complex64); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Sin", functor::sin, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc new file mode 100644 index 0000000000..f43241264a --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_sqrt.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER3(UnaryOp, CPU, "Sqrt", functor::sqrt, float, double, complex64); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Sqrt", functor::sqrt, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc new file mode 100644 index 0000000000..510fda49aa --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_square.cc @@ -0,0 +1,9 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER5(UnaryOp, CPU, "Square", functor::square, float, double, int32, + complex64, int64); +#if GOOGLE_CUDA +REGISTER3(UnaryOp, GPU, "Square", functor::square, float, double, int64); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc new file mode 100644 index 0000000000..c3c5952f8d --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_sub.cc @@ -0,0 +1,21 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER5(BinaryOp, CPU, "Sub", functor::sub, float, double, int32, int64, + complex64); +#if GOOGLE_CUDA 
+REGISTER3(BinaryOp, GPU, "Sub", functor::sub, float, double, int64); +#endif + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Sub") + .Device(DEVICE_GPU) + .HostMemory("x") + .HostMemory("y") + .HostMemory("z") + .TypeConstraint<int32>("T"), + BinaryOp<CPUDevice, functor::sub<int32>>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc new file mode 100644 index 0000000000..31f4743449 --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_tanh.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER3(UnaryOp, CPU, "Tanh", functor::tanh, float, double, complex64); +#if GOOGLE_CUDA +REGISTER2(UnaryOp, GPU, "Tanh", functor::tanh, float, double); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h new file mode 100644 index 0000000000..7d818cfbbf --- /dev/null +++ b/tensorflow/core/kernels/cwise_ops.h @@ -0,0 +1,607 @@ +#ifndef TENSORFLOW_KERNELS_CWISE_OPS_H_ +#define TENSORFLOW_KERNELS_CWISE_OPS_H_ + +#include <cmath> +#include <functional> +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +// The following functors (sign, tanh, sigmoid, etc.) are not defined +// by Eigen. When their equivalent are added into the Eigen, we can +// replace them using type aliases. + +namespace Eigen { +namespace internal { + +template <typename T> +struct scalar_sign_op { + // TODO(zhifengc): this only works for real types. In theory, + // sign(x) = x / |x| works for both real and complex values. + EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + return T(x > T(0)) - T(x < T(0)); + } +}; + +// TODO(zhifengc): Eigen::internal::pow_impl does not have proper +// EIGEN host/device decoration. We duplicate code here for now. +template <typename T, bool IsInteger> +struct pow { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T + operator()(const T& x, const T& y) const { + return std::pow(x, y); + } +}; + +template <typename T> +struct pow<T, true> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x, T y) const { + T res(1); + if (y & 1) res *= x; + y >>= 1; + while (y) { + x *= x; + if (y & 1) res *= x; + y >>= 1; + } + return res; + } +}; + +template <typename T> +struct scalar_pow2_op : pow<T, NumTraits<T>::IsInteger> {}; + +template <typename T> +struct functor_traits<scalar_pow2_op<T> > { + enum { + Cost = 5 * NumTraits<T>::MulCost, + PacketAccess = false, + }; +}; + +template <typename T> +struct scalar_fmod2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod2_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a, + const T& b) const { + return fmod(a, b); + } +}; + +template <typename T> +struct scalar_mod2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T + operator()(const T& a, const T& b) const { + return a % b; + } +}; + +template <typename T> +struct functor_traits<scalar_mod2_op<T> > { + enum { + Cost = 5, // Roughly the cost of a div + PacketAccess = false, + }; +}; + +// scalar_left and scalar_right are template helpers to partially +// apply a binary function. 
+// +// Suppose Binary is a binary functor f(x, y), scalar_left<> is a +// unary functor g_x(y) = f(x, y), where x is provided via the +// constructor. Similarly, scalar_right<> is a unary functor g_y(x) = +// f(x, y). + +template <typename Tout, typename Tin, typename Binary, + bool PacketAccess = functor_traits<Binary>::PacketAccess> +struct scalar_left { + typedef Tout result_type; + const Tin* left; + EIGEN_DEVICE_FUNC inline scalar_left( + const scalar_left& other) // NOLINT(runtime/explicit) + : left(other.left) {} + EIGEN_DEVICE_FUNC inline explicit scalar_left(const Tin* c) : left(c) {} + EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& right) const { + return Binary()(*left, right); + } +}; + +template <typename Tout, typename Tin, typename Binary> +struct scalar_left<Tout, Tin, Binary, true> { + typedef Tout result_type; + const Tin* left; + EIGEN_DEVICE_FUNC inline scalar_left( + const scalar_left& other) // NOLINT(runtime/explicit) + : left(other.left) {} + EIGEN_DEVICE_FUNC inline explicit scalar_left(const Tin* c) : left(c) {} + EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& right) const { + return Binary()(*left, right); + } + + template <typename Packet> + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& right_packet) const { + const Packet left_packet = Eigen::internal::pset1<Packet>(*left); + return Binary().packetOp(left_packet, right_packet); + } +}; + +template <typename Tout, typename Tin, typename Binary> +struct functor_traits<scalar_left<Tout, Tin, Binary> > { + enum { + Cost = functor_traits<Binary>::Cost, + PacketAccess = functor_traits<Binary>::PacketAccess, + }; +}; + +template <typename Tout, typename Tin, typename Binary, + bool PacketAccess = functor_traits<Binary>::PacketAccess> +struct scalar_right { + typedef Tout result_type; + const Tin* right; + EIGEN_DEVICE_FUNC inline scalar_right( + const scalar_right& other) // NOLINT(runtime/explicit) + : right(other.right) {} + EIGEN_DEVICE_FUNC inline explicit scalar_right(const Tin* c) : right(c) {} + EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& left) const { + return Binary()(left, *right); + } +}; + +template <typename Tout, typename Tin, typename Binary> +struct scalar_right<Tout, Tin, Binary, true> { + typedef Tout result_type; + const Tin* right; + EIGEN_DEVICE_FUNC inline scalar_right( + const scalar_right& other) // NOLINT(runtime/explicit) + : right(other.right) {} + EIGEN_DEVICE_FUNC inline explicit scalar_right(const Tin* c) : right(c) {} + EIGEN_DEVICE_FUNC inline Tout operator()(const Tin& left) const { + return Binary()(left, *right); + } + + template <typename Packet> + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& left_packet) const { + const Packet right_packet = Eigen::internal::pset1<Packet>(*right); + return Binary().packetOp(left_packet, right_packet); + } +}; + +template <typename Tout, typename Tin, typename Binary> +struct functor_traits<scalar_right<Tout, Tin, Binary> > { + enum { + Cost = functor_traits<Binary>::Cost, + PacketAccess = functor_traits<Binary>::PacketAccess, + }; +}; + +// similar to std::equal_to, but with the DEVICE_FUNC qualifier +template <class T> +struct equal_to : std::binary_function<T, T, bool> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool operator()(const T& x, const T& y) const { return x == y; } +}; + +// similar to std::not_equal_to, but with the DEVICE_FUNC qualifier +template <class T> +struct not_equal_to : std::binary_function<T, T, bool> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool operator()(const T& x, const T& y) 
const { return x != y; } +}; + +// similar to std::greater, but with the DEVICE_FUNC qualifier +template <class T> +struct greater : std::binary_function<T, T, bool> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool operator()(const T& x, const T& y) const { return x > y; } +}; + +// similar to std::less, but with the DEVICE_FUNC qualifier +template <class T> +struct less : std::binary_function<T, T, bool> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool operator()(const T& x, const T& y) const { return x < y; } +}; + +// similar to std::greater_equal, but with the DEVICE_FUNC qualifier +template <class T> +struct greater_equal : std::binary_function<T, T, bool> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool operator()(const T& x, const T& y) const { return x >= y; } +}; + +// similar to std::less_equal, but with the DEVICE_FUNC qualifier +template <class T> +struct less_equal : std::binary_function<T, T, bool> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool operator()(const T& x, const T& y) const { return x <= y; } +}; + +} // end namespace internal +} // end namespace Eigen + +namespace tensorflow { +namespace functor { + +//////////////////////////////////////////////////////////////////////////////// +// Helpers +//////////////////////////////////////////////////////////////////////////////// + +// Base template for functors whose input scalar type is T and +// output scalar type is R. +template <typename T, typename F, typename R = T> +struct base { + // func defines operator() and its vectorized version packetOp(). + typedef F func; + + // If true, the functor's corresponding binary op will instantiate + // specialized kernels to perform an optimized broadcast + // operation. Each functor for which this is enabled increases the + // code size, so by default this is disabled for binary functors and + // is enabled on a per-op basis as needed. + static const bool use_bcast_optimization = false; + + // operator() has the signature: + // out_type operator()(in_type in0, in_type in1 ...) + typedef R out_type; + typedef T in_type; + + // TensorFlow provides a tensor-ized version of "func". Roughly + // speaking, the tensorflow operation has the signature: + // tout_type op(tin_type in0) + // tout_type op(tin_type in0, tin_type in1) + // tout_type op(tin_type in0, in_type scalar) + typedef typename TTypes<out_type>::Flat tout_type; + typedef typename TTypes<in_type>::ConstFlat tin_type; + typedef typename TTypes<in_type>::ConstScalar tscalar_type; +}; + +// For now, we only apply certain speed optimizations for +// float/double's broadcast binary op. +template <typename T> +struct use_bcast_optimization { + static const bool value = false; +}; + +template <> +struct use_bcast_optimization<float> { + static const bool value = true; +}; + +template <> +struct use_bcast_optimization<double> { + static const bool value = true; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Unary functors +//////////////////////////////////////////////////////////////////////////////// + +// abs(x) = |x| +// neg(x) = - x +// inverse(x) = 1 / x +// square(x) = x^2 +// sqrt(x) = x^(1/2) +// rsqrt(x) = x^(-1/2) +// exp(x) = e^x +// log(x) = natural logarithm of x +// tanh = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +// sigmoid = 1 / (1 + exp(-x)) // a.k.a., logistic +// +// NOTE: We may eventually implement common functions used in NN +// here. E.g., rectifier, softplus, derivatives of tanh, sigmoid, etc. +// For reference, see speech/lstm/eigen_functors.h. 
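For orientation, here is a minimal sketch of how a base<>-derived functor such as the ones listed above is typically consumed: Functor::func names the wrapped Eigen scalar op, and tout_type/tin_type are flat Eigen tensor maps, so an elementwise CPU apply reduces to unaryExpr(). The helper name ApplyUnaryOnCpu is illustrative only, not an API in this header; the real dispatch is the UnaryFunctor/BinaryFunctor templates declared further down.

#define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

// Applies a functor such as functor::exp<float> coefficient-wise on the CPU.
template <typename Functor>
void ApplyUnaryOnCpu(const Eigen::ThreadPoolDevice& d,
                     typename Functor::tout_type out,
                     typename Functor::tin_type in) {
  // Functor::func is the underlying Eigen scalar op (e.g. scalar_exp_op<float>
  // for functor::exp<float>); unaryExpr evaluates it for every coefficient.
  out.device(d) = in.unaryExpr(typename Functor::func());
}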
+ +template <typename T> +struct abs : base<T, Eigen::internal::scalar_abs_op<T>, + typename Eigen::internal::scalar_abs_op<T>::result_type> {}; + +template <typename T> +struct neg : base<T, Eigen::internal::scalar_opposite_op<T> > {}; + +template <typename T> +struct inverse : base<T, Eigen::internal::scalar_inverse_op<T> > {}; + +template <typename T> +struct square : base<T, Eigen::internal::scalar_square_op<T> > {}; + +template <typename T> +struct sqrt : base<T, Eigen::internal::scalar_sqrt_op<T> > {}; + +template <typename T> +struct rsqrt : base<T, Eigen::internal::scalar_rsqrt_op<T> > {}; + +template <typename T> +struct exp : base<T, Eigen::internal::scalar_exp_op<T> > {}; + +template <typename T> +struct log : base<T, Eigen::internal::scalar_log_op<T> > {}; + +template <typename T> +struct sign : base<T, Eigen::internal::scalar_sign_op<T> > {}; + +template <typename T> +struct tanh : base<T, Eigen::internal::scalar_tanh_op<T> > {}; + +template <typename T> +struct sigmoid : base<T, Eigen::internal::scalar_sigmoid_op<T> > {}; + +template <typename T> +struct sin : base<T, Eigen::internal::scalar_sin_op<T> > {}; + +template <typename T> +struct cos : base<T, Eigen::internal::scalar_cos_op<T> > {}; + +struct logical_not : base<bool, std::logical_not<bool> > {}; + +namespace impl { + +#ifndef __CUDACC__ +// Uses STL std cmath functions. +template <typename T> +bool isinf(T v) { + return std::isinf(v); +} + +template <typename T> +bool isnan(T v) { + return std::isnan(v); +} + +template <typename T> +bool isfinite(T v) { + return std::isfinite(v); +} + +template <typename T> +T floor(T v) { + return std::floor(v); +} + +template <typename T> +T ceil(T v) { + return std::ceil(v); +} +#else +// Uses CUDA's functions for float and double. +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isinf(T v) { + return ::isinf(v); +} + +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isnan(T v) { + return ::isnan(v); +} + +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isfinite(T v) { + return ::isfinite(v); +} + +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T floor(T v) { + return ::floor(v); +} + +template <typename T> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T ceil(T v) { + return ::ceil(v); +} +#endif +} // end namespace impl + +// NOTE: std::isinf, std::isnan, std::isfinite are plain functions. +// Therefore we need to wrap them in functors to be used with Eigen's +// type system. 
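As a small standalone illustration of why the wrapping is needed (isnan_wrapper and the sample values are hypothetical, mirroring the isnan_func<T> defined next): Eigen's unaryExpr() expects a copyable functor type with operator(), so a plain function like std::isnan is first wrapped in such a type.

#include <cmath>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

// Wraps the plain function std::isnan in a functor type Eigen can use.
struct isnan_wrapper {
  bool operator()(float x) const { return std::isnan(x); }
};

int main() {
  Eigen::Tensor<float, 1> t(3);
  t.setValues({1.0f, NAN, 2.0f});
  // unaryExpr applies the wrapper to each coefficient: {false, true, false}.
  Eigen::Tensor<bool, 1> mask = t.unaryExpr(isnan_wrapper());
  return 0;
}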
+ +template <typename T> +struct isinf_func { + typedef bool result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const { + return impl::isinf(x); + } +}; + +template <typename T> +struct isinf : base<T, isinf_func<T>, bool> {}; + +template <typename T> +struct isnan_func { + typedef bool result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const { + return impl::isnan(x); + } +}; + +template <typename T> +struct isnan : base<T, isnan_func<T>, bool> {}; + +template <typename T> +struct isfinite_func { + typedef bool result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(T x) const { + return impl::isfinite(x); + } +}; + +template <typename T> +struct isfinite : base<T, isfinite_func<T>, bool> {}; + +template <typename T> +struct floor_func { + typedef T result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const { + return impl::floor(x); + } +}; + +template <typename T> +struct floor : base<T, floor_func<T> > {}; + +template <typename T> +struct ceil_func { + typedef T result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(T x) const { + return impl::ceil(x); + } +}; + +template <typename T> +struct ceil : base<T, ceil_func<T> > {}; + +//////////////////////////////////////////////////////////////////////////////// +// Binary functors +//////////////////////////////////////////////////////////////////////////////// + +// Binary functors: +// +// add(x, y) = x + y +// sub(x, y) = x - y +// mul(x, y) = x * y +// div(x, y) = x / y +// mod(x, y) = x % y (int32 and int64 only) +// fmod(x, y) = fmod(x, y) (float and double only) +// pow(x, y) = x ^ y +// maximum(x, y) = x > y ? x : y +// minimum(x, y) = x < y ? x : y + +template <typename T> +struct add : base<T, Eigen::internal::scalar_sum_op<T> > { + static const bool use_bcast_optimization = true; +}; + +template <typename T> +struct sub : base<T, Eigen::internal::scalar_difference_op<T> > { + static const bool use_bcast_optimization = true; +}; + +template <typename T> +struct mul : base<T, Eigen::internal::scalar_product_op<T> > {}; + +template <typename T> +struct div : base<T, Eigen::internal::scalar_quotient_op<T> > {}; + +template <typename T> +struct fmod : base<T, Eigen::internal::scalar_fmod2_op<T> > {}; + +template <typename T> +struct mod : base<T, Eigen::internal::scalar_mod2_op<T> > {}; + +template <typename T> +struct pow : base<T, Eigen::internal::scalar_pow2_op<T> > {}; + +template <typename T> +struct maximum : base<T, Eigen::internal::scalar_max_op<T> > {}; + +template <typename T> +struct minimum : base<T, Eigen::internal::scalar_min_op<T> > {}; + +template <typename T> +struct less : base<T, Eigen::internal::less<T>, bool> {}; + +template <typename T> +struct less_equal : base<T, Eigen::internal::less_equal<T>, bool> {}; + +template <typename T> +struct greater : base<T, Eigen::internal::greater<T>, bool> {}; + +template <typename T> +struct greater_equal : base<T, Eigen::internal::greater_equal<T>, bool> {}; + +template <typename T> +struct equal_to : base<T, Eigen::internal::equal_to<T>, bool> {}; + +template <typename T> +struct not_equal_to : base<T, Eigen::internal::not_equal_to<T>, bool> {}; + +struct logical_and : base<bool, Eigen::internal::scalar_boolean_and_op> {}; + +struct logical_or : base<bool, Eigen::internal::scalar_boolean_or_op> {}; + +template <typename T> +struct make_complex_func { + typedef std::complex<T> result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + result_type operator()(T real, T imag) const { + return 
    std::complex<T>(real, imag);
+  }
+};
+
+template <typename T>
+struct make_complex : base<T, make_complex_func<T>, std::complex<T> > {};
+
+template <typename T>
+struct get_real
+    : base<T, Eigen::internal::scalar_real_op<T>, typename T::value_type> {};
+
+template <typename T>
+struct get_imag
+    : base<T, Eigen::internal::scalar_imag_op<T>, typename T::value_type> {};
+
+template <typename T>
+struct conj : base<T, Eigen::internal::scalar_conjugate_op<T> > {};
+
+////////////////////////////////////////////////////////////////////////////////
+// Functors take 1 or 2 tensors, compute the base functor on the
+// coefficients of the input tensors, and put the results in the output
+// tensor.
+////////////////////////////////////////////////////////////////////////////////
+template <typename Device, typename Functor>
+struct UnaryFunctor {
+  // Computes on device "d": out[i] = Functor(in[i])
+  void operator()(const Device& d, typename Functor::tout_type out,
+                  typename Functor::tin_type in);
+};
+
+template <typename Device, typename Functor, int NDIMS>
+struct BinaryFunctor {
+  // Computes on device "d": out[i] = Functor(in0[i], in1[i])
+  void operator()(const Device& d, typename Functor::tout_type out,
+                  typename Functor::tin_type in0,
+                  typename Functor::tin_type in1);
+
+  // Computes on device "d": out[i] = Functor(scalar[0], in[i])
+  void Left(const Device& d, typename Functor::tout_type out,
+            typename Functor::tscalar_type scalar,
+            typename Functor::tin_type in);
+
+  // Computes on device "d": out[i] = Functor(in[i], scalar[0])
+  void Right(const Device& d, typename Functor::tout_type out,
+             typename Functor::tin_type in,
+             typename Functor::tscalar_type scalar);
+
+  // Computes on device "d":
+  //   out = Functor(in0.broadcast(bcast0), in1.broadcast(bcast1))
+  //
+  // TODO(zhifengc): make BCast a template member function on NDIMS
+  // instead of making BinaryFunctor a template on NDIMS.
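  // Editorial illustration, not part of the original source: the bcastN
  // arguments below are Eigen broadcast multipliers, one per dimension. For
  // example, an in1 reshaped to [1, 3] with bcast1 = {2, 1} is replicated
  // along the first dimension to match an in0 of shape [2, 3] whose
  // bcast0 = {1, 1} (all ones, i.e. no replication needed).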
+ void BCast(const Device& d, + typename TTypes<typename Functor::out_type, NDIMS>::Tensor out, + typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0, + typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0, + typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1, + typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1); +}; + +template <int NDIMS> +bool AllOne(const typename Eigen::array<Eigen::DenseIndex, NDIMS>& a) { + for (int i = 0; i < a.size(); ++i) { + if (a[i] != 1) return false; + } + return true; +} + +template <typename Device, typename T> +struct SelectFunctor { + void operator()(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<bool>::ConstFlat cond_flat, + typename TTypes<T>::ConstFlat then_flat, + typename TTypes<T>::ConstFlat else_flat); +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_KERNELS_CWISE_OPS_H_ diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc new file mode 100644 index 0000000000..f86d2ddd9a --- /dev/null +++ b/tensorflow/core/kernels/cwise_ops_common.cc @@ -0,0 +1,42 @@ +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { + +BinaryOpShared::BinaryOpShared(OpKernelConstruction* ctx, DataType out, + DataType in) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->MatchSignature({in, in}, {out})); +} + +void BinaryOpShared::SetUnimplementedError(OpKernelContext* ctx) { + ctx->SetStatus(errors::Unimplemented( + "Broadcast between ", ctx->input(0).shape().ShortDebugString(), " and ", + ctx->input(1).shape().ShortDebugString(), " is not supported yet.")); +} + +static BCast::Vec FromShape(const TensorShape& shape) { + BCast::Vec ret; + for (int i = 0; i < shape.dims(); ++i) ret.push_back(shape.dim_size(i)); + return ret; +} + +static TensorShape ToShape(const BCast::Vec& vec) { + TensorShape shape; + for (auto elem : vec) shape.AddDim(elem); + return shape; +} + +BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx) + : bcast(FromShape(ctx->input(0).shape()), + FromShape(ctx->input(1).shape())) { + if (!bcast.IsValid()) { + ctx->SetStatus(errors::InvalidArgument( + "Incompatible shapes: ", ctx->input(0).shape().ShortDebugString(), + " vs. ", ctx->input(1).shape().ShortDebugString())); + return; + } + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, ToShape(bcast.output_shape()), &out)); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h new file mode 100644 index 0000000000..cf848b86d1 --- /dev/null +++ b/tensorflow/core/kernels/cwise_ops_common.h @@ -0,0 +1,390 @@ +#ifndef TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_ +#define TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_ + +// See docs in ../ops/math_ops.cc. 
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/cwise_ops.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/bcast.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +class BinaryOpShared : public OpKernel { + public: + explicit BinaryOpShared(OpKernelConstruction* ctx, DataType out, DataType in); + + protected: + struct BinaryOpState { + // Sets up bcast with the shape of in0 and in1, ensures that the bcast + // is valid, and if so, allocates out using ctx->output(...). + // Caller must check ctx->status() upon return for non-ok status. + // If ctx->status().ok() is true, then out is guaranteed to be allocated. + BinaryOpState(OpKernelContext* ctx); + + BCast bcast; + Tensor* out = nullptr; + }; + + template <int NDIMS> + static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray( + const BCast::Vec& vec) { + CHECK_EQ(vec.size(), NDIMS); + Eigen::array<Eigen::DenseIndex, NDIMS> ret; + for (int i = 0; i < NDIMS; ++i) ret[i] = vec[i]; + return ret; + } + void SetUnimplementedError(OpKernelContext* ctx); +}; + +// Coefficient-wise binary operations: +// Device: E.g., CPUDevice, GPUDevice. +// Functor: defined in cwise_functors.h. E.g., functor::add2. +template <typename Device, typename Functor> +class BinaryOp : public BinaryOpShared { + public: + typedef typename Functor::in_type Tin; // Input scalar data type. + typedef typename Functor::out_type Tout; // Output scalar data type. + + explicit BinaryOp(OpKernelConstruction* ctx) + : BinaryOpShared(ctx, DataTypeToEnum<Tout>::v(), + DataTypeToEnum<Tin>::v()) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& in0 = ctx->input(0); + const Tensor& in1 = ctx->input(1); + // 'state': Shared helper not dependent on T to reduce code size + BinaryOpState state(ctx); + if (!ctx->status().ok()) return; + Tensor* out = state.out; + BCast* bcast = &state.bcast; + if (out->NumElements() == 0) { + return; + } + const int ndims = bcast->x_reshape().size(); + if (ndims <= 1) { + if (in1.NumElements() == 1) { + // tensor op scalar + functor::BinaryFunctor<Device, Functor, 1>().Right( + ctx->eigen_device<Device>(), out->flat<Tout>(), in0.flat<Tin>(), + in1.scalar<Tin>()); + return; + } + if (in0.NumElements() == 1) { + // scalar op tensor + functor::BinaryFunctor<Device, Functor, 1>().Left( + ctx->eigen_device<Device>(), out->flat<Tout>(), in0.scalar<Tin>(), + in1.flat<Tin>()); + return; + } + functor::BinaryFunctor<Device, Functor, 1>()( + ctx->eigen_device<Device>(), out->flat<Tout>(), in0.flat<Tin>(), + in1.flat<Tin>()); + return; + } + + if (ndims == 2) { + functor::BinaryFunctor<Device, Functor, 2>().BCast( + ctx->eigen_device<Device>(), + out->shaped<Tout, 2>(bcast->result_shape()), + in0.shaped<Tin, 2>(bcast->x_reshape()), + ToIndexArray<2>(bcast->x_bcast()), + in1.shaped<Tin, 2>(bcast->y_reshape()), + ToIndexArray<2>(bcast->y_bcast())); + return; + } + + if (ndims == 3) { + functor::BinaryFunctor<Device, Functor, 3>().BCast( + ctx->eigen_device<Device>(), + out->shaped<Tout, 3>(bcast->result_shape()), + in0.shaped<Tin, 3>(bcast->x_reshape()), + ToIndexArray<3>(bcast->x_bcast()), + in1.shaped<Tin, 3>(bcast->y_reshape()), + ToIndexArray<3>(bcast->y_bcast())); + return; + } + + SetUnimplementedError(ctx); + } + + private: +}; + +// Coefficient-wise unary operations: +// Device: 
E.g., CPUDevice, GPUDevice. +// Functor: defined in cwise_functors.h. E.g., functor::sqrt. +template <typename Device, typename Functor> +class UnaryOp : public OpKernel { + public: + typedef typename Functor::in_type Tin; // Input scalar data type. + typedef typename Functor::out_type Tout; // Output scalar data type. + // Tin may be different from Tout. E.g., abs: complex64 -> float + + explicit UnaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + auto in = DataTypeToEnum<Tin>::v(); + auto out = DataTypeToEnum<Tout>::v(); + OP_REQUIRES_OK(ctx, ctx->MatchSignature({in}, {out})); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& inp = ctx->input(0); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out)); + functor::UnaryFunctor<Device, Functor>()( + ctx->eigen_device<Device>(), out->flat<Tout>(), inp.flat<Tin>()); + } +}; + +// Coefficient-wise select operation. +// Device: E.g., CPUDevice, GPUDevice. +template <typename Device, typename T> +class SelectOp : public OpKernel { + public: + explicit SelectOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + auto dt = DataTypeToEnum<T>::v(); + OP_REQUIRES_OK(ctx, ctx->MatchSignature({DT_BOOL, dt, dt}, {dt})); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& in0 = ctx->input(0); + const Tensor& in1 = ctx->input(1); + const Tensor& in2 = ctx->input(2); + if (!ctx->ValidateInputsAreSameShape(this)) return; + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in0.shape(), &out)); + functor::SelectFunctor<Device, T> func; + func(ctx->eigen_device<Device>(), out->flat<T>(), in0.flat<bool>(), + in1.flat<T>(), in2.flat<T>()); + } +}; + +namespace functor { + +// For CPUDevice, we do operations inline if the resulting tensor is +// modestly sized. +static bool DoInline(size_t size) { return size <= 32768; } + +template <typename D, typename OUT, typename RHS> +void Assign(const D& d, OUT out, RHS rhs) { + if (DoInline(out.size())) { + out = rhs; + } else { + out.device(d) = rhs; + } +} + +// Partial specialization of BinaryFunctor<Device=CPUDevice, Functor>. 
+template <typename Functor, int NDIMS>
+struct BinaryFunctor<CPUDevice, Functor, NDIMS> {
+  void operator()(const CPUDevice& d, typename Functor::tout_type out,
+                  typename Functor::tin_type in0,
+                  typename Functor::tin_type in1) {
+    Assign(d, out, in0.binaryExpr(in1, typename Functor::func()));
+  }
+
+  void Left(const CPUDevice& d, typename Functor::tout_type out,
+            typename Functor::tscalar_type scalar,
+            typename Functor::tin_type in) {
+    typedef typename Functor::out_type Tout;
+    typedef typename Functor::in_type Tin;
+    typedef typename Functor::func Binary;
+    typedef typename Eigen::internal::scalar_left<Tout, Tin, Binary> Unary;
+    Assign(d, out, in.unaryExpr(Unary(scalar.data())));
+  }
+
+  void Right(const CPUDevice& d, typename Functor::tout_type out,
+             typename Functor::tin_type in,
+             typename Functor::tscalar_type scalar) {
+    typedef typename Functor::out_type Tout;
+    typedef typename Functor::in_type Tin;
+    typedef typename Functor::func Binary;
+    typedef typename Eigen::internal::scalar_right<Tout, Tin, Binary> Unary;
+    Assign(d, out, in.unaryExpr(Unary(scalar.data())));
+  }
+
+#if !defined(EIGEN_HAS_INDEX_LIST)
+  inline Eigen::DSizes<int, 2> NByOne(int n) {
+    return Eigen::DSizes<int, 2>(n, 1);
+  }
+  inline Eigen::DSizes<int, 2> OneByM(int m) {
+    return Eigen::DSizes<int, 2>(1, m);
+  }
+#else
+  inline Eigen::IndexList<int, Eigen::type2index<1>> NByOne(int n) {
+    Eigen::IndexList<int, Eigen::type2index<1>> ret;
+    ret.set(0, n);
+    return ret;
+  }
+  inline Eigen::IndexList<Eigen::type2index<1>, int> OneByM(int m) {
+    Eigen::IndexList<Eigen::type2index<1>, int> ret;
+    ret.set(1, m);
+    return ret;
+  }
+#endif
+
+  void BCast(const CPUDevice& dev,
+             typename TTypes<typename Functor::out_type, NDIMS>::Tensor out,
+             typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0,
+             typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0,
+             typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1,
+             typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1) {
+    typedef typename Functor::in_type T;
+    typename Functor::func func;
+    if ((NDIMS == 2) && Functor::use_bcast_optimization &&
+        use_bcast_optimization<T>::value) {
+      // Optimize for speed by using Eigen::type2index and avoiding
+      // .broadcast() when we know it's a no-op.
+      //
+      // Here, we need to handle 6 cases depending on how many "1"s
+      // exist in in0's and in1's shapes (4 numbers in total). It's not
+      // possible for the two shapes to have more than two 1s, because
+      // those cases are simplified to the NDIMS==1 case.
+      //
+      // Because this optimization increases the binary size for each
+      // Functor (+, -, *, /, <, <=, etc.), type, and ndim combination,
+      // we only apply such optimization for selected ops/types/ndims.
+      //
+      // Because NDIMS, Functor::use_bcast_optimization and
+      // use_bcast_optimization<T> are compile-time constants, gcc
+      // does a decent job of avoiding generating code when the
+      // conditions are not met.
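      // Editorial illustration, not part of the original source: with in0 of
      // shape [a, b] = [4, 1] and in1 of shape [c, d] = [1, 5], the
      // (b == 1) && (c == 1) case below reshapes in0 into a length-4 column,
      // reshapes in1 into a length-5 row, and broadcasts both to [4, 5]
      // before applying the binary functor coefficient-wise.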
+      const int a = in0.dimension(0);  // in0 is shape [a, b]
+      const int b = in0.dimension(1);
+      const int c = in1.dimension(0);  // in1 is shape [c, d]
+      const int d = in1.dimension(1);
+      if ((a == 1) && (d == 1)) {
+        auto lhs = in0.reshape(OneByM(b)).broadcast(NByOne(c));
+        auto rhs = in1.reshape(NByOne(c)).broadcast(OneByM(b));
+        Assign(dev, out, lhs.binaryExpr(rhs, func));
+        return;
+      }
+      if ((b == 1) && (c == 1)) {
+        auto lhs = in0.reshape(NByOne(a)).broadcast(OneByM(d));
+        auto rhs = in1.reshape(OneByM(d)).broadcast(NByOne(a));
+        Assign(dev, out, lhs.binaryExpr(rhs, func));
+        return;
+      }
+      if (a == 1) {
+        auto lhs = in0.reshape(OneByM(b)).broadcast(NByOne(c));
+        auto rhs = in1;
+        Assign(dev, out, lhs.binaryExpr(rhs, func));
+        return;
+      }
+      if (b == 1) {
+        auto lhs = in0.reshape(NByOne(a)).broadcast(OneByM(d));
+        auto rhs = in1;
+        Assign(dev, out, lhs.binaryExpr(rhs, func));
+        return;
+      }
+      if (c == 1) {
+        auto lhs = in0;
+        auto rhs = in1.reshape(OneByM(d)).broadcast(NByOne(a));
+        Assign(dev, out, lhs.binaryExpr(rhs, func));
+        return;
+      }
+      if (d == 1) {
+        auto lhs = in0;
+        auto rhs = in1.reshape(NByOne(c)).broadcast(OneByM(b));
+        Assign(dev, out, lhs.binaryExpr(rhs, func));
+        return;
+      }
+
+      const bool bcast0_all_one = AllOne<NDIMS>(bcast0);
+      const bool bcast1_all_one = AllOne<NDIMS>(bcast1);
+      if (bcast0_all_one && !bcast1_all_one) {
+        auto lhs = in0;  // No need to do broadcast for in0
+        auto rhs = in1.broadcast(bcast1);
+        Assign(dev, out, lhs.binaryExpr(rhs, func));
+        return;
+      }
+
+      if (!bcast0_all_one && bcast1_all_one) {
+        auto lhs = in0.broadcast(bcast0);
+        auto rhs = in1;  // No need to do broadcast for in1
+        Assign(dev, out, lhs.binaryExpr(rhs, func));
+        return;
+      }
+    }
+
+    // Fallback path. It always works but is probably slower.
+    auto lhs = in0.broadcast(bcast0);
+    auto rhs = in1.broadcast(bcast1);
+    Assign(dev, out, lhs.binaryExpr(rhs, func));
+  }
+};
+
+// Partial specialization of UnaryFunctor<Device=CPUDevice, Functor>.
+template <typename Functor>
+struct UnaryFunctor<CPUDevice, Functor> {
+  void operator()(const CPUDevice& d, typename Functor::tout_type out,
+                  typename Functor::tin_type in) {
+    Assign(d, out, in.unaryExpr(typename Functor::func()));
+  }
+};
+
+template <typename T>
+struct SelectFunctor<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
+                  typename TTypes<bool>::ConstFlat cond_flat,
+                  typename TTypes<T>::ConstFlat then_flat,
+                  typename TTypes<T>::ConstFlat else_flat) {
+    Assign(d, out, cond_flat.select(then_flat, else_flat));
+  }
+};
+
+}  // end namespace functor
+
+#define REGISTER_SELECT(D, N, F, T)                                          \
+  REGISTER_KERNEL_BUILDER(Name(N).Device(DEVICE_##D).TypeConstraint<T>("T"), \
+                          SelectOp<D##Device, T>)
+
+#define REGISTER(OP, D, N, F, T)                                             \
+  REGISTER_KERNEL_BUILDER(Name(N).Device(DEVICE_##D).TypeConstraint<T>("T"), \
+                          OP<D##Device, F<T>>);
+
+// Macros to register kernels for multiple types (T0, T1, etc.) on
+// device type "D" (CPU or GPU) for operation "N" (e.g., sqrt) using
+// the functor "F" (e.g., functor::sqrt).
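// Editorial sketch, not part of the original source: the per-op source files
// presumably instantiate and register these kernels with the REGISTERn macros
// defined just below, combined with the BinaryOp/UnaryOp templates above,
// along these lines (op names and type lists are illustrative only):
//
//   REGISTER3(BinaryOp, CPU, "Add", functor::add, float, double, int32);
//   REGISTER2(UnaryOp, CPU, "Sqrt", functor::sqrt, float, double);
//
// Each expansion is one REGISTER_KERNEL_BUILDER call binding the op name and
// scalar type to a concrete kernel class.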
+ +#ifdef __ANDROID__ +// On Android, only register the first type (float) +#define REGISTER2(OP, D, N, F, T0, T1) REGISTER(OP, D, N, F, T0) +#define REGISTER3(OP, D, N, F, T0, T1, T2) REGISTER(OP, D, N, F, T0) +#define REGISTER4(OP, D, N, F, T0, T1, T2, T3) REGISTER(OP, D, N, F, T0) +#define REGISTER5(OP, D, N, F, T0, T1, T2, T3, T4) REGISTER(OP, D, N, F, T0) +#define REGISTER6(OP, D, N, F, T0, T1, T2, T3, T4, T5) REGISTER(OP, D, N, F, T0) +#define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \ + REGISTER(OP, D, N, F, T0) +#else // !__ANDROID__ +#define REGISTER2(OP, D, N, F, T0, T1) \ + REGISTER(OP, D, N, F, T0) \ + REGISTER(OP, D, N, F, T1) +#define REGISTER3(OP, D, N, F, T0, T1, T2) \ + REGISTER2(OP, D, N, F, T0, T1) \ + REGISTER(OP, D, N, F, T2) +#define REGISTER4(OP, D, N, F, T0, T1, T2, T3) \ + REGISTER2(OP, D, N, F, T0, T1) \ + REGISTER2(OP, D, N, F, T2, T3) +#define REGISTER5(OP, D, N, F, T0, T1, T2, T3, T4) \ + REGISTER3(OP, D, N, F, T0, T1, T2) \ + REGISTER2(OP, D, N, F, T3, T4) +#define REGISTER6(OP, D, N, F, T0, T1, T2, T3, T4, T5) \ + REGISTER3(OP, D, N, F, T0, T1, T2) \ + REGISTER3(OP, D, N, F, T3, T4, T5) +#define REGISTER7(OP, D, N, F, T0, T1, T2, T3, T4, T5, T6) \ + REGISTER4(OP, D, N, F, T0, T1, T2, T3) \ + REGISTER3(OP, D, N, F, T4, T5, T6) +#endif // __ANDROID__ + +} // end namespace tensorflow + +#endif // TENSORFLOW_KERNELS_CWISE_OPS_COMMON_H_ diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h new file mode 100644 index 0000000000..b0dc027144 --- /dev/null +++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h @@ -0,0 +1,135 @@ +#if !GOOGLE_CUDA +#error This file must only be included when building with Cuda support +#endif + +#ifndef TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_ +#define TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_ + +#define EIGEN_USE_GPU + +#include <complex> + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/kernels/cwise_ops.h" +#include "tensorflow/core/framework/tensor_types.h" + +#include "tensorflow/core/platform/logging.h" +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; +typedef std::complex<float> complex64; + +// Partial specialization of UnaryFunctor<Device=GPUDevice, Functor>. +template <typename Functor> +struct UnaryFunctor<GPUDevice, Functor> { + void operator()(const GPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in) { + out.device(d) = in.unaryExpr(typename Functor::func()); + } +}; + +// Partial specialization of BinaryFunctor<Device=GPUDevice, Functor>. 
+template <typename Functor, int NDIMS> +struct BinaryFunctor<GPUDevice, Functor, NDIMS> { + void operator()(const GPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in0, + typename Functor::tin_type in1) { + out.device(d) = in0.binaryExpr(in1, typename Functor::func()); + } + + void Left(const GPUDevice& d, typename Functor::tout_type out, + typename Functor::tscalar_type scalar, + typename Functor::tin_type in) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_left<Tout, Tin, Binary> Unary; + out.device(d) = in.unaryExpr(Unary(scalar.data())); + } + + void Right(const GPUDevice& d, typename Functor::tout_type out, + typename Functor::tin_type in, + typename Functor::tscalar_type scalar) { + typedef typename Functor::out_type Tout; + typedef typename Functor::in_type Tin; + typedef typename Functor::func Binary; + typedef typename Eigen::internal::scalar_right<Tout, Tin, Binary> Unary; + out.device(d) = in.unaryExpr(Unary(scalar.data())); + } + + void BCast(const GPUDevice& d, + typename TTypes<typename Functor::out_type, NDIMS>::Tensor out, + typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in0, + typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast0, + typename TTypes<typename Functor::in_type, NDIMS>::ConstTensor in1, + typename Eigen::array<Eigen::DenseIndex, NDIMS> bcast1) { + typedef typename Functor::in_type T; + typename Functor::func func; + if ((NDIMS == 2) && Functor::use_bcast_optimization && + use_bcast_optimization<T>::value) { + const bool bcast0_all_one = AllOne<NDIMS>(bcast0); + const bool bcast1_all_one = AllOne<NDIMS>(bcast1); + if (bcast0_all_one && !bcast1_all_one) { + out.device(d) = in0.binaryExpr(in1.broadcast(bcast1), func); + return; + } + if (!bcast0_all_one && bcast1_all_one) { + out.device(d) = in0.broadcast(bcast0).binaryExpr(in1, func); + return; + } + } + out.device(d) = + in0.broadcast(bcast0).binaryExpr(in1.broadcast(bcast1), func); + } +}; + +template <typename T> +struct SelectFunctor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat out, + typename TTypes<bool>::ConstFlat cond_flat, + typename TTypes<T>::ConstFlat then_flat, + typename TTypes<T>::ConstFlat else_flat) { + out.device(d) = cond_flat.select(then_flat, else_flat); + } +}; + +// Macros to explicitly instantiate kernels on GPU for multiple types +// (T0, T1, etc.) for UnaryFunctor (e.g., functor:sqrt). +#define DEFINE_UNARY1(F, T) template struct UnaryFunctor<GPUDevice, F<T> > +#define DEFINE_UNARY2(F, T0, T1) \ + DEFINE_UNARY1(F, T0); \ + DEFINE_UNARY1(F, T1) +#define DEFINE_UNARY3(F, T0, T1, T2) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY1(F, T2) +#define DEFINE_UNARY4(F, T0, T1, T2, T3) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY2(F, T2, T3) +#define DEFINE_UNARY5(F, T0, T1, T2, T3, T4) \ + DEFINE_UNARY2(F, T0, T1); \ + DEFINE_UNARY3(F, T2, T3, T4) + +// Macros to explicitly instantiate kernels on GPU for multiple types +// (T0, T1, etc.) for BinaryFunctor. 
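// Editorial sketch, not part of the original source: the per-op *_gpu.cu.cc
// translation units presumably use these DEFINE_* macros (DEFINE_BINARYn is
// defined just below) to explicitly instantiate the functor templates under
// nvcc, e.g.:
//
//   DEFINE_UNARY2(sqrt, float, double);
//   DEFINE_BINARY3(add, float, double, int32);
//
// so that kernel registrations compiled elsewhere can link against them.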
+#define DEFINE_BINARY1(F, T) \ + template struct BinaryFunctor<GPUDevice, F<T>, 1>; \ + template struct BinaryFunctor<GPUDevice, F<T>, 2>; \ + template struct BinaryFunctor<GPUDevice, F<T>, 3> +#define DEFINE_BINARY2(F, T0, T1) \ + DEFINE_BINARY1(F, T0); \ + DEFINE_BINARY1(F, T1) +#define DEFINE_BINARY3(F, T0, T1, T2) \ + DEFINE_BINARY2(F, T0, T1); \ + DEFINE_BINARY1(F, T2) +#define DEFINE_BINARY4(F, T0, T1, T2, T3) \ + DEFINE_BINARY2(F, T0, T1); \ + DEFINE_BINARY2(F, T2, T3) +#define DEFINE_BINARY5(F, T0, T1, T2, T3, T4) \ + DEFINE_BINARY2(F, T0, T1); \ + DEFINE_BINARY3(F, T2, T3, T4) + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_KERNELS_CWISE_OPS_GPU_COMMON_CU_H_ diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc new file mode 100644 index 0000000000..56af248117 --- /dev/null +++ b/tensorflow/core/kernels/cwise_ops_test.cc @@ -0,0 +1,167 @@ +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include <gtest/gtest.h> + +namespace tensorflow { + +// Creates a Graph which applies a unary "func" on a 3D float tensor +// of "num" elements. +static Graph* Unary(const string& func, int num) { + RequireDefaultOps(); + Graph* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)})); + CHECK_GT(data.NumElements(), 0); + data.flat<float>().setRandom(); + test::graph::Unary(g, func, test::graph::Constant(g, data), 0); + return g; +} + +static int kRows = 100000; + +static int RowsAndColsArg(int r, int c) { return r * kRows + c; } +static int RowsFromArg(int arg) { return (arg / kRows); } +static int ColsFromArg(int arg) { return (arg % kRows); } + +#define BM_UNARY(DEVICE, FUNC) \ + static void BM_##DEVICE##_##FUNC(int iters, int num) { \ + const int64 tot = static_cast<int64>(iters) * num; \ + testing::ItemsProcessed(tot); \ + testing::BytesProcessed(tot * sizeof(float)); \ + test::Benchmark(#DEVICE, Unary(#FUNC, num)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_##FUNC)->Range(4 << 10, 1 << 20); + +BM_UNARY(cpu, Floor); +BM_UNARY(gpu, Floor); + +// data func scalar. 
+static Graph* BinaryScalar(int num, const string& func) { + RequireDefaultOps(); + Graph* g = new Graph(OpRegistry::Global()); + Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)})); + lhs.flat<float>().setRandom(); + Tensor rhs(DT_FLOAT, TensorShape({})); + rhs.flat<float>().setRandom(); + test::graph::Binary(g, func, test::graph::Constant(g, lhs), + test::graph::Constant(g, rhs)); + return g; +} + +#define BM_BINARY_SCALAR(DEVICE, FUNC) \ + static void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) { \ + const int64 tot = static_cast<int64>(iters) * num; \ + testing::ItemsProcessed(tot); \ + testing::BytesProcessed(tot * sizeof(float)); \ + test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \ + ->Arg(4096) /* must >= 4096 */ \ + ->Arg(32768) \ + ->Arg(131072) \ + ->Arg(1048576); + +BM_BINARY_SCALAR(cpu, Less); +BM_BINARY_SCALAR(gpu, Less); +BM_BINARY_SCALAR(cpu, Add); +BM_BINARY_SCALAR(gpu, Add); +#undef BM_BINARY_SCALAR + +static Graph* BiasAdd(int rows, int cols) { + RequireDefaultOps(); + Graph* g = new Graph(OpRegistry::Global()); + Tensor lhs(DT_FLOAT, TensorShape({rows, cols})); + lhs.flat<float>().setRandom(); + TensorShape rhs_shape; + rhs_shape = TensorShape({cols}); + Tensor rhs(DT_FLOAT, rhs_shape); + rhs.flat<float>().setRandom(); + test::graph::Binary(g, "BiasAdd", test::graph::Constant(g, lhs), + test::graph::Constant(g, rhs)); + return g; +} + +#define BM_BIAS_ADD(DEVICE, R, C) \ + static void BM_##DEVICE##_BiasAdd_R##R##_C##C(int iters, int arg) { \ + const int rows = RowsFromArg(arg); \ + const int cols = ColsFromArg(arg); \ + const int64 tot = static_cast<int64>(iters) * rows * cols; \ + testing::ItemsProcessed(tot); \ + testing::BytesProcessed(tot * sizeof(float)); \ + test::Benchmark(#DEVICE, BiasAdd(rows, cols)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_BiasAdd_R##R##_C##C)->Arg(RowsAndColsArg(R, C)); + +#define BM_BIAS_ADD_ALL(DEVICE) \ + BM_BIAS_ADD(DEVICE, 512, 2048); \ + BM_BIAS_ADD(DEVICE, 512, 4096); \ + BM_BIAS_ADD(DEVICE, 2048, 512); \ + BM_BIAS_ADD(DEVICE, 4096, 512); + +BM_BIAS_ADD_ALL(cpu); +BM_BIAS_ADD_ALL(gpu); +#undef BM_BIAS_ADD_ALL +#undef BM_BIAS_ADD + +static Graph* BcastAdd(int rows, int cols, int dim) { + RequireDefaultOps(); + Graph* g = new Graph(OpRegistry::Global()); + Tensor lhs(DT_FLOAT, TensorShape({rows, cols})); + lhs.flat<float>().setRandom(); + TensorShape rhs_shape; + if (dim == 0) { + rhs_shape = TensorShape({rows, 1}); + } else { + rhs_shape = TensorShape({cols}); + } + Tensor rhs(DT_FLOAT, rhs_shape); + rhs.flat<float>().setRandom(); + test::graph::Binary(g, "Add", test::graph::Constant(g, lhs), + test::graph::Constant(g, rhs)); + return g; +} + +#define BM_BCAST_ADD_ROW(DEVICE, R, C) \ + static void BM_##DEVICE##_BcastAddRow_R##R##_C##C(int iters, int arg) { \ + const int rows = RowsFromArg(arg); \ + const int cols = ColsFromArg(arg); \ + const int64 tot = static_cast<int64>(iters) * rows * cols; \ + testing::ItemsProcessed(tot); \ + testing::BytesProcessed(tot * sizeof(float)); \ + test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C)); + +#define BM_BCAST_ADD_ROW_ALL(DEVICE) \ + BM_BCAST_ADD_ROW(DEVICE, 512, 2048); \ + BM_BCAST_ADD_ROW(DEVICE, 512, 4096); \ + BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \ + BM_BCAST_ADD_ROW(DEVICE, 4096, 512); +BM_BCAST_ADD_ROW_ALL(cpu); +BM_BCAST_ADD_ROW_ALL(gpu); +#undef BM_BCAST_ADD_ROW_ALL +#undef BM_BCAST_ADD_ROW + +#define 
BM_BCAST_ADD_COL(DEVICE, R, C) \ + static void BM_##DEVICE##_BcastAddCol_R##R##_C##C(int iters, int arg) { \ + const int rows = RowsFromArg(arg); \ + const int cols = ColsFromArg(arg); \ + const int64 tot = static_cast<int64>(iters) * rows * cols; \ + testing::ItemsProcessed(tot); \ + testing::BytesProcessed(tot * sizeof(float)); \ + test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)->Arg(RowsAndColsArg(R, C)); + +#define BM_BCAST_ADD_COL_ALL(DEVICE) \ + BM_BCAST_ADD_COL(DEVICE, 512, 2048); \ + BM_BCAST_ADD_COL(DEVICE, 512, 4096); \ + BM_BCAST_ADD_COL(DEVICE, 2048, 512); \ + BM_BCAST_ADD_COL(DEVICE, 4096, 512); +BM_BCAST_ADD_COL_ALL(cpu); +BM_BCAST_ADD_COL_ALL(gpu); +#undef BM_BCAST_ADD_COL_ALL +#undef BM_BCAST_ADD_COL + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc new file mode 100644 index 0000000000..0919bab96f --- /dev/null +++ b/tensorflow/core/kernels/decode_csv_op.cc @@ -0,0 +1,222 @@ +// See docs in ../ops/parsing_ops.cc. +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +class DecodeCSVOp : public OpKernel { + public: + explicit DecodeCSVOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string delim; + + OP_REQUIRES_OK(ctx, ctx->GetAttr("OUT_TYPE", &out_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("field_delim", &delim)); + + OP_REQUIRES(ctx, delim.size() == 1, + errors::InvalidArgument("field_delim should be only 1 char")); + + delim_ = delim[0]; + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* records; + OpInputList record_defaults; + + OP_REQUIRES_OK(ctx, ctx->input("records", &records)); + OP_REQUIRES_OK(ctx, ctx->input_list("record_defaults", &record_defaults)); + + for (int i = 0; i < record_defaults.size(); ++i) { + OP_REQUIRES(ctx, record_defaults[i].NumElements() < 2, + errors::InvalidArgument( + "There should only be 1 default per field but field ", i, + " has ", record_defaults[i].NumElements())); + } + + auto records_t = records->flat<string>(); + int records_size = records_t.size(); + + OpOutputList output; + OP_REQUIRES_OK(ctx, ctx->output_list("output", &output)); + + for (size_t i = 0; i < out_type_.size(); ++i) { + Tensor* out = nullptr; + output.allocate(i, records->shape(), &out); + } + + for (int i = 0; i < records_size; ++i) { + const StringPiece record(records_t(i)); + std::vector<string> fields; + ExtractFields(ctx, record, &fields); + OP_REQUIRES(ctx, fields.size() == out_type_.size(), + errors::InvalidArgument("Expect ", out_type_.size(), + " fields but have ", fields.size(), + " in record ", i)); + + // Check each field in the record + for (size_t f = 0; f < out_type_.size(); ++f) { + const DataType& dtype = out_type_[f]; + switch (dtype) { + case DT_INT32: { + // If this field is empty, check if default is given: + // If yes, use default value; Otherwise report error. 
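            // Editorial illustration, not part of the original source: with
            // OUT_TYPE = {DT_INT32, DT_FLOAT} and record_defaults holding
            // ({-1}, {0.0}), the record ",3.5" has an empty int32 field 0, so
            // -1 is taken from record_defaults[0]; if that default tensor were
            // empty, an InvalidArgument error would be raised instead.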
+ if (fields[f].empty()) { + OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1, + errors::InvalidArgument( + "Field ", f, + " is required but missing in record ", i, "!")); + + output[f]->flat<int32>()(i) = record_defaults[f].flat<int32>()(0); + } else { + int32 value; + OP_REQUIRES(ctx, strings::safe_strto32(fields[f].c_str(), &value), + errors::InvalidArgument("Field ", f, " in record ", i, + " is not a valid int32: ", + fields[f])); + output[f]->flat<int32>()(i) = value; + } + break; + } + case DT_INT64: { + // If this field is empty, check if default is given: + // If yes, use default value; Otherwise report error. + if (fields[f].empty()) { + OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1, + errors::InvalidArgument( + "Field ", f, + " is required but missing in record ", i, "!")); + + output[f]->flat<int64>()(i) = record_defaults[f].flat<int64>()(0); + } else { + int64 value; + OP_REQUIRES(ctx, strings::safe_strto64(fields[f].c_str(), &value), + errors::InvalidArgument("Field ", f, " in record ", i, + " is not a valid int64: ", + fields[f])); + output[f]->flat<int64>()(i) = value; + } + break; + } + case DT_FLOAT: { + // If this field is empty, check if default is given: + // If yes, use default value; Otherwise report error. + if (fields[f].empty()) { + OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1, + errors::InvalidArgument( + "Field ", f, + " is required but missing in record ", i, "!")); + output[f]->flat<float>()(i) = record_defaults[f].flat<float>()(0); + } else { + float value; + OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value), + errors::InvalidArgument("Field ", f, " in record ", i, + " is not a valid float: ", + fields[f])); + output[f]->flat<float>()(i) = value; + } + break; + } + case DT_STRING: { + // If this field is empty, check if default is given: + // If yes, use default value; Otherwise report error. 
+ if (fields[f].empty()) { + OP_REQUIRES(ctx, record_defaults[f].NumElements() == 1, + errors::InvalidArgument( + "Field ", f, + " is required but missing in record ", i, "!")); + output[f]->flat<string>()(i) = + record_defaults[f].flat<string>()(0); + } else { + output[f]->flat<string>()(i) = fields[f]; + } + break; + } + default: + OP_REQUIRES(ctx, false, + errors::InvalidArgument("csv: data type ", dtype, + " not supported in field ", f)); + } + } + } + } + + private: + std::vector<DataType> out_type_; + char delim_; + + void ExtractFields(OpKernelContext* ctx, StringPiece input, + std::vector<string>* result) { + int current_idx = 0; + if (!input.empty()) { + while (static_cast<size_t>(current_idx) < input.size()) { + if (input[current_idx] == '\n' || input[current_idx] == '\r') { + current_idx++; + continue; + } + + bool quoted = false; + if (input[current_idx] == '"') { + quoted = true; + current_idx++; + } + + // This is the body of the field; + string field; + if (!quoted) { + while (static_cast<size_t>(current_idx) < input.size() && + input[current_idx] != delim_) { + OP_REQUIRES(ctx, input[current_idx] != '"' && + input[current_idx] != '\n' && + input[current_idx] != '\r', + errors::InvalidArgument( + "Unquoted fields cannot have quotes/CRLFs inside")); + field += input[current_idx]; + current_idx++; + } + + // Go to next field or the end + current_idx++; + } else { + // Quoted field needs to be ended with '"' and delim or end + while ( + (static_cast<size_t>(current_idx) < input.size() - 1) && + (input[current_idx] != '"' || input[current_idx + 1] != delim_)) { + if (input[current_idx] != '"') { + field += input[current_idx]; + current_idx++; + } else { + OP_REQUIRES( + ctx, input[current_idx + 1] == '"', + errors::InvalidArgument("Quote inside a string has to be " + "escaped by another quote")); + field += '"'; + current_idx += 2; + } + } + + OP_REQUIRES( + ctx, + input[current_idx] == '"' && + (static_cast<size_t>(current_idx) == input.size() - 1 || + input[current_idx + 1] == delim_), + errors::InvalidArgument("Quoted field has to end with quote " + "followed by delim or end")); + + current_idx += 2; + } + + result->push_back(field); + } + + // Check if the last field is missing + if (input[input.size() - 1] == delim_) result->push_back(string()); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("DecodeCSV").Device(DEVICE_CPU), DecodeCSVOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/decode_jpeg_op.cc b/tensorflow/core/kernels/decode_jpeg_op.cc new file mode 100644 index 0000000000..e41d3f3e11 --- /dev/null +++ b/tensorflow/core/kernels/decode_jpeg_op.cc @@ -0,0 +1,72 @@ +// See docs in ../ops/image_ops.cc + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/lib/jpeg/jpeg_mem.h" + +namespace tensorflow { + +// Decode the contents of a JPEG file +class DecodeJpegOp : public OpKernel { + public: + explicit DecodeJpegOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("channels", &flags_.components)); + OP_REQUIRES(context, flags_.components == 0 || flags_.components == 1 || + flags_.components == 3, + errors::InvalidArgument("channels must be 0, 1, or 3, got ", + flags_.components)); + 
OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio)); + OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 || + flags_.ratio == 4 || flags_.ratio == 8, + errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ", + flags_.ratio)); + OP_REQUIRES_OK( + context, context->GetAttr("fancy_upscaling", &flags_.fancy_upscaling)); + OP_REQUIRES_OK(context, + context->GetAttr("try_recover_truncated", + &flags_.try_recover_truncated_jpeg)); + OP_REQUIRES_OK(context, context->GetAttr("acceptable_fraction", + &flags_.min_acceptable_fraction)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& contents = context->input(0); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()), + errors::InvalidArgument("contents must be scalar, got shape ", + contents.shape().ShortDebugString())); + const StringPiece input = contents.scalar<string>()(); + OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(), + errors::InvalidArgument("JPEG contents are too large for int: ", + input.size())); + + // Decode image, allocating tensor once the image size is known + Tensor* output = NULL; + OP_REQUIRES( + context, + jpeg::Uncompress( + input.data(), input.size(), flags_, NULL, + [=, &output](int width, int height, int channels) -> uint8* { + Status status(context->allocate_output( + 0, TensorShape({height, width, channels}), &output)); + if (!status.ok()) { + VLOG(1) << status; + context->SetStatus(status); + return nullptr; + } + return output->flat<uint8>().data(); + }), + errors::InvalidArgument("Invalid JPEG data, size ", input.size())); + } + + private: + jpeg::UncompressFlags flags_; +}; +REGISTER_KERNEL_BUILDER(Name("DecodeJpeg").Device(DEVICE_CPU), DecodeJpegOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/decode_png_op.cc b/tensorflow/core/kernels/decode_png_op.cc new file mode 100644 index 0000000000..e8071526f9 --- /dev/null +++ b/tensorflow/core/kernels/decode_png_op.cc @@ -0,0 +1,69 @@ +// See docs in ../ops/image_ops.cc + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/lib/png/png_io.h" + +namespace tensorflow { + +// Decode the contents of a PNG file +class DecodePngOp : public OpKernel { + public: + explicit DecodePngOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_)); + OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 || + channels_ == 4, + errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ", + channels_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& contents = context->input(0); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()), + errors::InvalidArgument("contents must be scalar, got shape ", + contents.shape().ShortDebugString())); + + // Start decoding image to get shape details + const StringPiece data = contents.scalar<string>()(); + png::DecodeContext decode; + OP_REQUIRES( + context, png::CommonInitDecode(data, channels_, 8, &decode), + errors::InvalidArgument("Invalid PNG header, data size ", data.size())); + + // Verify that width and height don't overflow int + const int width = decode.width; + const int height = 
decode.height; + if (width != static_cast<int64>(decode.width) || + height != static_cast<int64>(decode.height)) { + png::CommonFreeDecode(&decode); + OP_REQUIRES(context, false, + errors::InvalidArgument("PNG size too large for int: ", + decode.width, " by ", decode.height)); + } + + // Allocate tensor + Tensor* output = nullptr; + const auto status = context->allocate_output( + 0, TensorShape({height, width, decode.channels}), &output); + if (!status.ok()) png::CommonFreeDecode(&decode); + OP_REQUIRES_OK(context, status); + + // Finish decoding image + OP_REQUIRES( + context, png::CommonFinishDecode(output->flat<uint8>().data(), + decode.channels * width, &decode), + errors::InvalidArgument("Invalid PNG data, size ", data.size())); + } + + private: + int channels_; +}; +REGISTER_KERNEL_BUILDER(Name("DecodePng").Device(DEVICE_CPU), DecodePngOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc new file mode 100644 index 0000000000..ef24c333a4 --- /dev/null +++ b/tensorflow/core/kernels/decode_raw_op.cc @@ -0,0 +1,90 @@ +// See docs in ../ops/parse_ops.cc. + +#include <algorithm> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +template <typename T> +class DecodeRawOp : public OpKernel { + public: + explicit DecodeRawOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("little_endian", &little_endian_)); + OP_REQUIRES_OK(context, context->GetAttr("out_type", &out_type_)); + } + + void Compute(OpKernelContext* context) override { + const auto& input = context->input(0); + int str_size = -1; + auto flat_in = input.flat<string>(); + for (int i = 0; i < flat_in.size(); ++i) { + const string& in_str = flat_in(i); + if (str_size == -1) { + str_size = in_str.size(); + } else { + OP_REQUIRES(context, str_size == in_str.size(), + errors::InvalidArgument( + "DecodeRaw requires input strings to all be the same " + "size, but element ", + i, " has size ", str_size, " != ", in_str.size())); + } + } + TensorShape out_shape = input.shape(); + if (str_size == -1) { // Empty input + out_shape.AddDim(1); + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output("output", out_shape, + &output_tensor)); + return; + } + OP_REQUIRES( + context, str_size % sizeof(T) == 0, + errors::InvalidArgument("Input to DecodeRaw has length ", str_size, + " that is not a multiple of ", sizeof(T), + ", the size of ", DataTypeString(out_type_))); + const int added_dim = str_size / sizeof(T); + out_shape.AddDim(added_dim); + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK( + context, context->allocate_output("output", out_shape, &output_tensor)); + auto out = output_tensor->flat_inner_dims<T>(); + DCHECK_EQ(flat_in.size(), out.dimensions()[0]); + OP_REQUIRES( + context, + little_endian_ == ::tensorflow::port::kLittleEndian || sizeof(T) == 1, + errors::Unimplemented("Unimplemented support for little_endian=", + little_endian_ ? "true" : "false")); + // Endianness matches, so just copy each string byte-for-byte. 
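    // Editorial illustration, not part of the original source: with
    // out_type = DT_INT32 (sizeof(T) == 4) and every input string exactly
    // 8 bytes long, added_dim is 2, so each input string contributes two
    // int32 values to its row of the output tensor.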
+ T* out_data = out.data(); + for (int i = 0; i < flat_in.size(); ++i) { + const T* in_data = reinterpret_cast<const T*>(flat_in(i).data()); + memcpy(out_data, in_data, str_size); + out_data += added_dim; + } + } + + private: + bool little_endian_; + DataType out_type_; +}; + +#define REGISTER(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("DecodeRaw").Device(DEVICE_CPU).TypeConstraint<type>("out_type"), \ + DecodeRawOp<type>) + +REGISTER(float); +REGISTER(double); +REGISTER(int32); +REGISTER(uint8); +REGISTER(int16); +REGISTER(int8); +REGISTER(int64); + +#undef REGISTER + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc new file mode 100644 index 0000000000..f56c37b4ef --- /dev/null +++ b/tensorflow/core/kernels/dense_update_ops.cc @@ -0,0 +1,136 @@ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/assign_op.h" +#include "tensorflow/core/kernels/dense_update_ops.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +template <typename Device, typename T> +class AssignOpT : public AssignOp { + public: + using AssignOp::AssignOp; + + void Copy(OpKernelContext* context, Tensor* lhs, const Tensor& rhs) override { + functor::DenseUpdate<Device, T, ASSIGN> copy; + copy(context->eigen_device<Device>(), lhs->flat<T>(), rhs.flat<T>()); + } +}; + +// TODO(jeff): Get rid of use_exclusive_lock_ option +template <typename Device, typename T, DenseUpdateType OP> +class DenseUpdateOp : public OpKernel { + public: + explicit DenseUpdateOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, + context->GetAttr("use_locking", &use_exclusive_lock_)); + const DataType dt = DataTypeToEnum<T>::v(); + OP_REQUIRES_OK(context, context->MatchSignature({MakeRefType(dt), dt}, + {MakeRefType(dt)})); + } + + void Compute(OpKernelContext* context) override { + // We always return the input ref. + context->forward_ref_input_to_ref_output(0, 0); + + if (use_exclusive_lock_) { + mutex_lock l(*context->input_ref_mutex(0)); + DoUpdate(context); + } else { + DoUpdate(context); + } + } + + private: + void DoUpdate(OpKernelContext* context) { + Tensor Tparams = context->mutable_input(0, use_exclusive_lock_); + const Tensor& Tupdate = context->input(1); + OP_REQUIRES(context, Tparams.IsInitialized(), + errors::FailedPrecondition("Attempting to use uninitialized " + "parameters: ", + def().input(0))); + OP_REQUIRES( + context, Tparams.IsSameSize(Tupdate), + errors::InvalidArgument("Parameters and update must be the same size")); + + functor::DenseUpdate<Device, T, OP> update_functor; + update_functor(context->eigen_device<Device>(), Tparams.flat<T>(), + Tupdate.flat<T>()); + } + + bool use_exclusive_lock_; +}; + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Assign").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + AssignOpT<CPUDevice, type>); + +TF_CALL_ALL_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +// Only register 'Assign' on GPU for the subset of types also supported by +// 'Variable' (see variable_ops.cc.) 
+#define REGISTER_GPU_KERNELS(type) \ + namespace functor { \ + template <> \ + void DenseUpdate<GPUDevice, type, ASSIGN>::operator()( \ + const GPUDevice& d, typename TTypes<type>::Flat lhs, \ + typename TTypes<type>::ConstFlat rhs); \ + extern template struct DenseUpdate<GPUDevice, type, ASSIGN>; \ + } \ + REGISTER_KERNEL_BUILDER( \ + Name("Assign").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + AssignOpT<GPUDevice, type>); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#endif // GOOGLE_CUDA + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("AssignAdd").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + DenseUpdateOp<CPUDevice, type, DenseUpdateType::ADD>); \ + REGISTER_KERNEL_BUILDER( \ + Name("AssignSub").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + DenseUpdateOp<CPUDevice, type, DenseUpdateType::SUB>); + +TF_CALL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC_FOR_OP(T, OP) \ + template <> \ + void DenseUpdate<GPUDevice, T, OP>::operator()( \ + const GPUDevice& d, typename TTypes<T>::Flat params, \ + typename TTypes<T>::ConstFlat update); \ + extern template struct DenseUpdate<GPUDevice, T, OP> +#define DECLARE_GPU_SPEC(T) \ + DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::ADD); \ + DECLARE_GPU_SPEC_FOR_OP(T, DenseUpdateType::SUB) +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); +#undef DECLARE_GPU_SPEC +#undef DECLARE_GPU_SPEC_FOR_OP +} // namespace functor + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("AssignAdd").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + DenseUpdateOp<GPUDevice, type, DenseUpdateType::ADD>); \ + REGISTER_KERNEL_BUILDER( \ + Name("AssignSub").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + DenseUpdateOp<GPUDevice, type, DenseUpdateType::SUB>); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#endif // end GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/dense_update_ops.h b/tensorflow/core/kernels/dense_update_ops.h new file mode 100644 index 0000000000..d32c9a4af2 --- /dev/null +++ b/tensorflow/core/kernels/dense_update_ops.h @@ -0,0 +1,43 @@ +#ifndef TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_ +#define TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +enum DenseUpdateType { ADD, SUB, ASSIGN }; + +namespace functor { + +template <typename Device, typename T, DenseUpdateType OP> +struct DenseUpdate; + +template <typename Device, typename T> +struct DenseUpdate<Device, T, ADD> { + void operator()(const Device& d, typename TTypes<T>::Flat params, + typename TTypes<T>::ConstFlat update) { + params.device(d) += update; + } +}; + +template <typename Device, typename T> +struct DenseUpdate<Device, T, SUB> { + void operator()(const Device& d, typename TTypes<T>::Flat params, + typename TTypes<T>::ConstFlat update) { + params.device(d) -= update; + } +}; + +template <typename Device, typename T> +struct DenseUpdate<Device, T, ASSIGN> { + void operator()(const Device& d, typename TTypes<T>::Flat params, + typename TTypes<T>::ConstFlat update) { + params.device(d) = update; + } +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_KERNELS_DENSE_UPDATE_OPS_H_ diff --git a/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc 
b/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc new file mode 100644 index 0000000000..8e80901c71 --- /dev/null +++ b/tensorflow/core/kernels/dense_update_ops_gpu.cu.cc @@ -0,0 +1,22 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/dense_update_ops.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_GPU_KERNELS(T) \ + template struct functor::DenseUpdate<GPUDevice, T, ADD>; \ + template struct functor::DenseUpdate<GPUDevice, T, SUB>; \ + template struct functor::DenseUpdate<GPUDevice, T, ASSIGN>; +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); +#undef DEFINE_GPU_KERNELS + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/determinant_op.cc b/tensorflow/core/kernels/determinant_op.cc new file mode 100644 index 0000000000..d34aab7a44 --- /dev/null +++ b/tensorflow/core/kernels/determinant_op.cc @@ -0,0 +1,66 @@ +// See docs in ../ops/linalg_ops.cc. +#include <cmath> + +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/Eigen/LU" + +namespace tensorflow { + +template <class Scalar, bool SupportsBatchOperationT> +class DeterminantOp : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> { + public: + explicit DeterminantOp(OpKernelConstruction* context) + : LinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {} + ~DeterminantOp() override {} + + TensorShape GetOutputMatrixShape( + const TensorShape& input_matrix_shape) override { + return TensorShape({}); + } + + int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override { + const int64 rows = input_matrix_shape.dim_size(0); + if (rows > (1LL << 20)) { + // A big number to cap the cost in case overflow. + return kint32max; + } else { + return rows * rows * rows; + } + } + + using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap; + using + typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap; + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input, + MatrixMap* output) override { + OP_REQUIRES(context, input.rows() == input.cols(), + errors::InvalidArgument("Input matrix must be square.")); + Scalar determinant; + if (input.rows() == 0) { + // An empty matrix' determinant is defined to be 1. See + // wikipedia. 
+ determinant = 1; + } else { + determinant = input.determinant(); + } + OP_REQUIRES(context, std::isfinite(determinant), + errors::Internal("The determinant is not finite.")); + (*output)(0, 0) = determinant; + } +}; + +REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<float, false>), float); +REGISTER_LINALG_OP("MatrixDeterminant", (DeterminantOp<double, false>), double); +REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<float, true>), + float); +REGISTER_LINALG_OP("BatchMatrixDeterminant", (DeterminantOp<double, true>), + double); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc new file mode 100644 index 0000000000..83e39d33a9 --- /dev/null +++ b/tensorflow/core/kernels/diag_op.cc @@ -0,0 +1,93 @@ +// See docs in ../ops/array_ops.cc +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace { +template <typename T, size_t NumDims, size_t DoubleNumDims> +class DiagonalGenerator { + public: + explicit DiagonalGenerator(const Tensor& diagonal) : diagonal_(diagonal) { + static_assert(DoubleNumDims == 2 * NumDims, + "The second size must be the double of the first size."); + CHECK_EQ(diagonal.dims(), NumDims); + } + T operator()( + const Eigen::array<Eigen::DenseIndex, DoubleNumDims>& coordinates) const { + Eigen::array<Eigen::DenseIndex, NumDims> index; + for (int i = 0; i < NumDims; ++i) { + if (coordinates[i] != coordinates[NumDims + i]) { + return T(0); + } + index[i] = coordinates[i]; + } + return diagonal_.tensor<T, NumDims>()(index); + } + + private: + Tensor diagonal_; +}; +} // namespace + +// Generate the diagonal tensor with the diagonal set to the input tensor. +// It only allows up to rank 3 input tensor, so the output tensor is up to +// rank 6. 
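// Editorial illustration, not part of the original source: for a rank-1 input
// [1, 2, 3] the output is the 3 x 3 matrix with 1, 2, 3 on the diagonal and
// zeros elsewhere; for a rank-2 input of shape [2, 3] the output has shape
// [2, 3, 2, 3] with output(i, j, i, j) = input(i, j) and zeros elsewhere.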
+template <typename T> +class DiagOp : public OpKernel { + public: + explicit DiagOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& diagonal = context->input(0); + const int num_dims = diagonal.dims(); + OP_REQUIRES(context, 1 <= num_dims, + errors::InvalidArgument( + "The rank of the diagonal should be between 1 and 3.")); + OP_REQUIRES(context, 3 >= num_dims, + errors::InvalidArgument( + "The rank of the diagonal should be between 1 and 3.")); + TensorShape out_shape; + for (int i = 0; i < num_dims; ++i) { + out_shape.AddDim(diagonal.dim_size(i)); + } + for (int i = 0; i < num_dims; ++i) { + out_shape.AddDim(diagonal.dim_size(i)); + } + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, out_shape, &output_tensor)); + switch (num_dims) { + case 1: + output_tensor->tensor<T, 2>() = output_tensor->tensor<T, 2>().generate( + DiagonalGenerator<T, 1, 2>(diagonal)); + break; + case 2: + output_tensor->tensor<T, 4>() = output_tensor->tensor<T, 4>().generate( + DiagonalGenerator<T, 2, 4>(diagonal)); + break; + case 3: + output_tensor->tensor<T, 6>() = output_tensor->tensor<T, 6>().generate( + DiagonalGenerator<T, 3, 6>(diagonal)); + break; + default: + context->SetStatus(errors::Unimplemented( + "Diagonal of rank ", num_dims, " tensor is not supported yet.")); + return; + } + } +}; + +#define REGISTER_DIAGOP(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("Diag").Device(DEVICE_CPU).TypeConstraint<T>("T"), DiagOp<T>) + +REGISTER_DIAGOP(double); +REGISTER_DIAGOP(float); +REGISTER_DIAGOP(int32); +REGISTER_DIAGOP(int64); + +#undef REGISTER_DIAGOP +} // namespace tensorflow diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc new file mode 100644 index 0000000000..f1b44861b5 --- /dev/null +++ b/tensorflow/core/kernels/dynamic_partition_op.cc @@ -0,0 +1,154 @@ +// See docs in ../ops/data_flow_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +// Shared code that is not dependent on the type of T. We do this to reduce +// code size by not duplicating all this for all T (float, double, int32, etc.) +class DynamicPartitionOp_Shared : public OpKernel { + public: + explicit DynamicPartitionOp_Shared(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("num_partitions", &num_partitions_)); + // QUESTION: It'd be nice to support DT_INT16, DT_UINT8, etc. + // to input[1]. Should we have the framework do some sort of + // integer promotion automatically, or should that be something + // that users have to do explicitly with a conversion operator + // in the graph? 
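// Note that num_partitions_ also fixes the number of output tensors (the
// "outputs" list below is allocated with exactly num_partitions_ entries), so
// it comes from the attr at graph-construction time rather than being inferred
// from the partitions input.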
+ } + + void ValidateAndAllocateOutputs(OpKernelContext* c, const Tensor** data, + const Tensor** partitions, + OpOutputList* Tout) { + OP_REQUIRES_OK(c, c->input("data", data)); + OP_REQUIRES_OK(c, c->input("partitions", partitions)); + OP_REQUIRES(c, TensorShapeUtils::StartsWith((*data)->shape(), + (*partitions)->shape()), + errors::InvalidArgument( + "data.shape must start with partitions.shape, ", + "got data.shape = ", (*data)->shape().ShortDebugString(), + ", partitions.shape = ", + (*partitions)->shape().ShortDebugString())); + + // Count how many occurrences of each partition id we have in partitions + gtl::InlinedVector<int, 32> partition_count(num_partitions_); + auto e_partitions = (*partitions)->flat<int32>(); + const int64 N = e_partitions.dimension(0); + for (int64 i = 0; i < N; i++) { + const int32 p = e_partitions(i); + OP_REQUIRES(c, p >= 0 && p < num_partitions_, + errors::InvalidArgument( + "partitions", SliceString((*partitions)->shape(), i), + " = ", p, " is not in [0, ", num_partitions_, ")")); + partition_count[p]++; + } + + // Allocate output tensors of the right size + OP_REQUIRES_OK(c, c->output_list("outputs", Tout)); + for (int p = 0; p < num_partitions_; p++) { + TensorShape shape; + shape.AddDim(partition_count[p]); + for (int i = (*partitions)->dims(); i < (*data)->dims(); i++) { + shape.AddDim((*data)->dim_size(i)); + } + Tensor* out; + OP_REQUIRES_OK(c, Tout->allocate(p, shape, &out)); + } + } + + protected: + int num_partitions_; + + static string SliceString(const TensorShape& shape, const int64 flat) { + // Special case rank 0 and 1 + const int dims = shape.dims(); + if (dims == 0) return ""; + if (dims == 1) return strings::StrCat("[", flat, "]"); + + // Compute strides + gtl::InlinedVector<int64, 32> strides(dims); + strides.back() = 1; + for (int i = dims - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * shape.dim_size(i + 1); + } + + // Unflatten index + int64 left = flat; + string result; + for (int i = 0; i < dims; i++) { + strings::StrAppend(&result, i ? 
"," : "[", left / strides[i]); + left %= strides[i]; + } + strings::StrAppend(&result, "]"); + return result; + } +}; + +template <class T> +class DynamicPartitionOp : public DynamicPartitionOp_Shared { + public: + explicit DynamicPartitionOp(OpKernelConstruction* c) + : DynamicPartitionOp_Shared(c) {} + void Compute(OpKernelContext* c) override { + const Tensor* data; + const Tensor* partitions; + OpOutputList outputs; + ValidateAndAllocateOutputs(c, &data, &partitions, &outputs); + if (!c->status().ok()) return; + if (num_partitions_ == 0 || data->NumElements() == 0) return; + + auto e_partitions = partitions->flat<int32>(); + const int64 N = e_partitions.dimension(0); + gtl::InlinedVector<int, 32> output_index(num_partitions_); + + if (partitions->dims() == data->dims()) { + // Walk through data and copy the data to the appropriate output tensor + const auto data_flat = data->flat<T>(); + std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>, + Eigen::Aligned> > out_vec; + for (int p = 0; p < num_partitions_; p++) { + out_vec.push_back(outputs[p]->vec<T>()); + } + for (int64 i = 0; i < N; i++) { + const int32 p = e_partitions(i); + out_vec[p](output_index[p]) = data_flat(i); + output_index[p]++; + } + } else { + // If data has extra dimensions, use Eigen slices + std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, + Eigen::Aligned> > out_flat; + for (int p = 0; p < num_partitions_; p++) { + out_flat.push_back(outputs[p]->flat_outer_dims<T>()); + } + + // Walk through data and copy the data to the appropriate output tensor + const int64 slice_size = data->NumElements() / N; + const auto data_flat = data->shaped<T, 2>({N, slice_size}); + Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, slice_size); + for (int64 i = 0; i < N; i++) { + const int32 p = e_partitions(i); + // outputs[p][output_index[p]++] = data[i] + Eigen::DSizes<Eigen::DenseIndex, 2> out_indices(output_index[p], 0); + Eigen::DSizes<Eigen::DenseIndex, 2> data_indices(i, 0); + out_flat[p].slice(out_indices, sizes) = + data_flat.slice(data_indices, sizes); + output_index[p]++; + } + } + } +}; + +#define REGISTER_DYNAMIC_PARTITION(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("DynamicPartition").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ + DynamicPartitionOp<T>) + +TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_PARTITION); +#undef REGISTER_DYNAMIC_PARTITION + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/dynamic_partition_op_test.cc b/tensorflow/core/kernels/dynamic_partition_op_test.cc new file mode 100644 index 0000000000..b0e5e7deb0 --- /dev/null +++ b/tensorflow/core/kernels/dynamic_partition_op_test.cc @@ -0,0 +1,145 @@ +#include <functional> +#include <memory> +#include <vector> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +class DynamicPartitionOpTest : public OpsTestBase { + protected: + void MakeOp() { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "DynamicPartition") + .Input(FakeInput(DT_FLOAT)) + 
.Input(FakeInput(DT_INT32)) + .Attr("num_partitions", 4) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(DynamicPartitionOpTest, Simple_OneD) { + MakeOp(); + + // Similar to how we would use this to split embedding ids to be looked up + + // Feed and run + AddInputFromArray<float>(TensorShape({6}), {0, 13, 2, 39, 4, 17}); + AddInputFromArray<int32>(TensorShape({6}), {0, 0, 2, 3, 2, 1}); + ASSERT_OK(RunOpKernel()); + + // Check the output sizes + { // Output 0 + Tensor expected(allocator(), DT_FLOAT, TensorShape({2})); + test::FillValues<float>(&expected, {0, 13}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); + } + { // Output 1 + Tensor expected(allocator(), DT_FLOAT, TensorShape({1})); + test::FillValues<float>(&expected, {17}); + test::ExpectTensorEqual<float>(expected, *GetOutput(1)); + } + { // Output 2 + Tensor expected(allocator(), DT_FLOAT, TensorShape({2})); + test::FillValues<float>(&expected, {2, 4}); + test::ExpectTensorEqual<float>(expected, *GetOutput(2)); + } + { // Output 3 + Tensor expected(allocator(), DT_FLOAT, TensorShape({1})); + test::FillValues<float>(&expected, {39}); + test::ExpectTensorEqual<float>(expected, *GetOutput(3)); + } +} + +TEST_F(DynamicPartitionOpTest, Simple_TwoD) { + MakeOp(); + + // Feed and run + AddInputFromArray<float>( + TensorShape({6, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); + AddInputFromArray<int32>(TensorShape({6}), {0, 0, 2, 3, 2, 1}); + ASSERT_OK(RunOpKernel()); + + // Check the output sizes + { // Output 0 + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + test::FillValues<float>(&expected, {0, 1, 2, 3, 4, 5}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); + } + { // Output 1 + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3})); + test::FillValues<float>(&expected, {15, 16, 17}); + test::ExpectTensorEqual<float>(expected, *GetOutput(1)); + } + { // Output 2 + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + test::FillValues<float>(&expected, {6, 7, 8, 12, 13, 14}); + test::ExpectTensorEqual<float>(expected, *GetOutput(2)); + } + { // Output 3 + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3})); + test::FillValues<float>(&expected, {9, 10, 11}); + test::ExpectTensorEqual<float>(expected, *GetOutput(3)); + } +} + +TEST_F(DynamicPartitionOpTest, SomeOutputsEmpty) { + MakeOp(); + + // Feed and run + AddInputFromArray<float>(TensorShape({6}), {0, 13, 2, 39, 4, 17}); + AddInputFromArray<int32>(TensorShape({6}), {0, 0, 2, 2, 0, 2}); + ASSERT_OK(RunOpKernel()); + + TensorShape empty_one_dim; + empty_one_dim.AddDim(0); + Tensor expected_empty(allocator(), DT_FLOAT, empty_one_dim); + + // Check the output sizes + { // Output 0 + Tensor expected(allocator(), DT_FLOAT, TensorShape({3})); + test::FillValues<float>(&expected, {0, 13, 4}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); + } + { // Output 1 + test::ExpectTensorEqual<float>(expected_empty, *GetOutput(1)); + } + { // Output 2 + Tensor expected(allocator(), DT_FLOAT, TensorShape({3})); + test::FillValues<float>(&expected, {2, 39, 17}); + test::ExpectTensorEqual<float>(expected, *GetOutput(2)); + } + { // Output 3 + test::ExpectTensorEqual<float>(expected_empty, *GetOutput(3)); + } +} + +TEST_F(DynamicPartitionOpTest, Error_IndexOutOfRange) { + MakeOp(); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray<int32>(TensorShape({5}), {0, 2, 99, 2, 2}); + Status s = 
RunOpKernel(); + EXPECT_TRUE( + StringPiece(s.ToString()).contains("partitions[2] = 99 is not in [0, 4)")) + << s; +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc new file mode 100644 index 0000000000..a5623685fb --- /dev/null +++ b/tensorflow/core/kernels/dynamic_stitch_op.cc @@ -0,0 +1,158 @@ +// See docs in ../ops/data_flow_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +template <class T> +class DynamicStitchOp : public OpKernel { + public: + explicit DynamicStitchOp(OpKernelConstruction* c) : OpKernel(c) { + // Compute expected input signature + const DataType dt = DataTypeToEnum<T>::v(); + const int n = c->num_inputs() / 2; + DataTypeVector expected; + for (int i = 0; i < n; i++) { + expected.push_back(DT_INT32); + } + for (int i = 0; i < n; i++) { + expected.push_back(dt); + } + OP_REQUIRES_OK(c, c->MatchSignature(expected, {dt})); + OP_REQUIRES( + c, c->num_inputs() > 0, + errors::InvalidArgument("DynamicStitchOp: Must have some inputs")); + OP_REQUIRES(c, c->num_inputs() % 2 == 0, + errors::InvalidArgument( + "DynamicStitchOp: Must have even number of arguments")); + } + + void Compute(OpKernelContext* c) override { + // Find maximum index in the indices vectors + OpInputList indices_inputs; + OP_REQUIRES_OK(c, c->input_list("indices", &indices_inputs)); + + int32 max_index = -1; + for (const Tensor& indices : indices_inputs) { + Eigen::Tensor<int32, 0, Eigen::RowMajor> m = + indices.flat<int32>().maximum(); + max_index = std::max(m(), max_index); + } + const int first_dim_size = max_index + 1; + + // Validate that data[i].shape = indices[i].shape + constant + OpInputList data_inputs; + OP_REQUIRES_OK(c, c->input_list("data", &data_inputs)); + const Tensor& data0 = data_inputs[0]; + const Tensor& indices0 = indices_inputs[0]; + for (int input_num = 0; input_num < indices_inputs.size(); input_num++) { + const Tensor& indices = indices_inputs[input_num]; + const Tensor& data = data_inputs[input_num]; + OP_REQUIRES( + c, TensorShapeUtils::StartsWith(data.shape(), indices.shape()), + errors::InvalidArgument( + "data[", input_num, "].shape = ", data.shape().ShortDebugString(), + " does not start with indices[", input_num, "].shape = ", + indices.shape().ShortDebugString())); + OP_REQUIRES( + c, input_num == 0 || SameExtraShape(data0, indices0, data, indices), + errors::InvalidArgument( + "Need data[0].shape[", indices0.dims(), ":] = data[", input_num, + "].shape[", indices.dims(), ":], got data[0].shape = ", + data0.shape().ShortDebugString(), ", data[", input_num, + "].shape = ", data.shape().ShortDebugString(), + ", indices[0].shape = ", indices0.shape().ShortDebugString(), + ", indices[", input_num, "].shape = ", + indices.shape().ShortDebugString())); + } + + // Allocate result tensor of shape + // [first_dim_size] + data.shape[indices.dims:] + TensorShape result_shape; + result_shape.AddDim(first_dim_size); + for (int d = indices0.dims(); d < data0.dims(); d++) { + result_shape.AddDim(data0.dim_size(d)); + } + Tensor* merged = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &merged)); + + // TODO(jeff): Currently we leave uninitialized any portions of + // merged that aren't covered by an index in indices. What should we do? 
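// A small worked example of the merge (values illustrative only):
//   indices[0] = [0, 2], data[0] = [a, c]
//   indices[1] = [1, 3], data[1] = [b, d]
// max_index is 3, so merged gets first dimension 4 and becomes [a, b, c, d];
// any index in [0, max_index] that never appears would leave its slice of
// merged uninitialized, per the TODO above.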
+ if (first_dim_size > 0) { + auto merged_flat = merged->flat_outer_dims<T>(); + const int slice_size = merged_flat.dimension(1); + for (int input_num = 0; input_num < indices_inputs.size(); input_num++) { + const Tensor& indices = indices_inputs[input_num]; + auto indices_vec = indices.flat<int32>(); + const Tensor& data = data_inputs[input_num]; + auto data_flat = + data.shaped<T, 2>({indices_vec.dimension(0), slice_size}); + + if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) { + T* merged_base = &merged_flat(0, 0); + const T* data_base = &data_flat(0, 0); + const size_t slice_bytes = slice_size * sizeof(T); + for (int i = 0; i < indices_vec.size(); i++) { + memcpy(merged_base + indices_vec(i) * slice_size, + data_base + i * slice_size, slice_bytes); + } + } else { + Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, slice_size); + for (int i = 0; i < indices_vec.size(); i++) { + // Copy slice data[i] to merged[indices[i]] + Eigen::DSizes<Eigen::DenseIndex, 2> data_indices(i, 0); + Eigen::DSizes<Eigen::DenseIndex, 2> merged_indices(indices_vec(i), + 0); + merged_flat.slice(merged_indices, sizes) = + data_flat.slice(data_indices, sizes); + } + } + } + } + } + + private: + // Check if data0.shape[indices0.dims():] == data1.shape[indices1.dims():] + static bool SameExtraShape(const Tensor& data0, const Tensor& indices0, + const Tensor& data1, const Tensor& indices1) { + const int extra0 = data0.dims() - indices0.dims(); + const int extra1 = data1.dims() - indices1.dims(); + if (extra0 != extra1) return false; + for (int i = 0; i < extra0; i++) { + if (data0.dim_size(indices0.dims() + i) != + data1.dim_size(indices1.dims() + i)) { + return false; + } + } + return true; + } +}; + +#define REGISTER_DYNAMIC_STITCH(type) \ + REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("indices"), \ + DynamicStitchOp<type>) + +TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_STITCH); +#undef REGISTER_DYNAMIC_STITCH + +#if GOOGLE_CUDA +#define REGISTER_DYNAMIC_STITCH_GPU(type) \ + REGISTER_KERNEL_BUILDER(Name("DynamicStitch") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("indices") \ + .HostMemory("data") \ + .HostMemory("merged"), \ + DynamicStitchOp<type>) + +TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_STITCH_GPU); +#undef REGISTER_DYNAMIC_STITCH_GPU + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/dynamic_stitch_op_test.cc b/tensorflow/core/kernels/dynamic_stitch_op_test.cc new file mode 100644 index 0000000000..8c71f0fd0f --- /dev/null +++ b/tensorflow/core/kernels/dynamic_stitch_op_test.cc @@ -0,0 +1,133 @@ +#include <functional> +#include <memory> +#include <vector> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +class DynamicStitchOpTest : public OpsTestBase { + protected: + void MakeOp(int n, DataType dt) { + RequireDefaultOps(); + 
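// Builds a DynamicStitch node with n int32 index inputs followed by n data
// inputs of type dt, then initializes the kernel.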
ASSERT_OK(NodeDefBuilder("myop", "DynamicStitch") + .Input(FakeInput(n, DT_INT32)) + .Input(FakeInput(n, dt)) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(DynamicStitchOpTest, Simple_OneD) { + MakeOp(2, DT_FLOAT); + + // Feed and run + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7}); + AddInputFromArray<int32>(TensorShape({5}), {1, 6, 2, 3, 5}); + AddInputFromArray<float>(TensorShape({3}), {0, 40, 70}); + AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50}); + ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({8})); + test::FillValues<float>(&expected, {0, 10, 20, 30, 40, 50, 60, 70}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(DynamicStitchOpTest, Simple_TwoD) { + MakeOp(3, DT_FLOAT); + + // Feed and run + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7}); + AddInputFromArray<int32>(TensorShape({2}), {1, 6}); + AddInputFromArray<int32>(TensorShape({3}), {2, 3, 5}); + AddInputFromArray<float>(TensorShape({3, 2}), {0, 1, 40, 41, 70, 71}); + AddInputFromArray<float>(TensorShape({2, 2}), {10, 11, 60, 61}); + AddInputFromArray<float>(TensorShape({3, 2}), {20, 21, 30, 31, 50, 51}); + ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({8, 2})); + test::FillValues<float>(&expected, {0, 1, 10, 11, 20, 21, 30, 31, 40, 41, 50, + 51, 60, 61, 70, 71}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(DynamicStitchOpTest, Error_IndicesMultiDimensional) { + MakeOp(2, DT_FLOAT); + + // Feed and run + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7}); + AddInputFromArray<int32>(TensorShape({1, 5}), {1, 6, 2, 3, 5}); + AddInputFromArray<float>(TensorShape({3}), {0, 40, 70}); + AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("data[1].shape = [5] does not start with " + "indices[1].shape = [1,5]")) + << s; +} + +TEST_F(DynamicStitchOpTest, Error_DataNumDimsMismatch) { + MakeOp(2, DT_FLOAT); + + // Feed and run + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7}); + AddInputFromArray<int32>(TensorShape({5}), {1, 6, 2, 3, 5}); + AddInputFromArray<float>(TensorShape({3}), {0, 40, 70}); + AddInputFromArray<float>(TensorShape({1, 5}), {10, 60, 20, 30, 50}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("data[1].shape = [1,5] does not start with " + "indices[1].shape = [5]")) + << s; +} + +TEST_F(DynamicStitchOpTest, Error_DataDimSizeMismatch) { + MakeOp(2, DT_FLOAT); + + // Feed and run + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 5}); + AddInputFromArray<int32>(TensorShape({4}), {1, 6, 2, 3}); + AddInputFromArray<float>(TensorShape({3, 1}), {0, 40, 70}); + AddInputFromArray<float>(TensorShape({4, 2}), + {10, 11, 60, 61, 20, 21, 30, 31}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("Need data[0].shape[1:] = data[1].shape[1:], " + "got data[0].shape = [3,1], data[1].shape = [4,2]")) + << s; +} + +TEST_F(DynamicStitchOpTest, Error_DataAndIndicesSizeMismatch) { + MakeOp(2, DT_FLOAT); + + // Feed and run + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 7}); + AddInputFromArray<int32>(TensorShape({5}), {1, 6, 2, 3, 5}); + AddInputFromArray<float>(TensorShape({3}), {0, 40, 70}); + AddInputFromArray<float>(TensorShape({4}), {10, 60, 20, 30}); + Status s = RunOpKernel(); + EXPECT_TRUE( + StringPiece(s.ToString()) + .contains( + 
"data[1].shape = [4] does not start with indices[1].shape = [5]")) + << s; +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/edit_distance_op.cc b/tensorflow/core/kernels/edit_distance_op.cc new file mode 100644 index 0000000000..938d7f056b --- /dev/null +++ b/tensorflow/core/kernels/edit_distance_op.cc @@ -0,0 +1,217 @@ +// See docs in ../ops/array_ops.cc. + +#define EIGEN_USE_THREADS + +#include <limits> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/gtl/edit_distance.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/util/sparse/sparse_tensor.h" + +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +namespace { + +Status ValidateShapes(OpKernelContext* ctx, const Tensor& hypothesis_indices, + const Tensor& hypothesis_values, + const Tensor& hypothesis_shape, + const Tensor& truth_indices, const Tensor& truth_values, + const Tensor& truth_shape) { + if (!TensorShapeUtils::IsMatrix(hypothesis_indices.shape())) + return errors::InvalidArgument( + "hypothesis_indices should be a matrix, but got shape: ", + hypothesis_indices.shape().DebugString()); + if (!TensorShapeUtils::IsMatrix(truth_indices.shape())) + return errors::InvalidArgument( + "truth_indices should be a matrix, but got shape: ", + truth_indices.shape().DebugString()); + if (!TensorShapeUtils::IsVector(hypothesis_values.shape())) + return errors::InvalidArgument( + "hypothesis_values should be a vector, but got shape: ", + hypothesis_values.shape().DebugString()); + if (!TensorShapeUtils::IsVector(truth_values.shape())) + return errors::InvalidArgument( + "truth_values should be a vector, but got shape: ", + truth_values.shape().DebugString()); + if (!TensorShapeUtils::IsVector(hypothesis_shape.shape())) + return errors::InvalidArgument( + "hypothesis_shape should be a vector, but got shape: ", + hypothesis_shape.shape().DebugString()); + if (!TensorShapeUtils::IsVector(truth_shape.shape())) + return errors::InvalidArgument( + "truth_shape should be a vector, but got shape: ", + truth_shape.shape().DebugString()); + if (hypothesis_shape.NumElements() != hypothesis_indices.dim_size(1)) + return errors::InvalidArgument( + "Expected hypothesis_shape.NumElements == " + "#cols(hypothesis_indices), their shapes are: ", + hypothesis_shape.shape().DebugString(), " and ", + hypothesis_indices.shape().DebugString()); + if (truth_shape.NumElements() < 2) + return errors::InvalidArgument( + "Input SparseTensors must have rank at least 2, but truth_shape " + "rank is: ", + truth_shape.NumElements()); + if (truth_shape.NumElements() != truth_indices.dim_size(1)) + return errors::InvalidArgument( + "Expected truth_shape.NumElements == " + "#cols(truth_indices), their shapes are: ", + truth_shape.shape().DebugString(), " and ", + truth_indices.shape().DebugString()); + if (truth_shape.NumElements() != hypothesis_shape.NumElements()) + return errors::InvalidArgument( + "Expected truth and hypothesis to have matching ranks, but " + "their shapes are: ", + truth_shape.shape().DebugString(), " and ", + hypothesis_shape.shape().DebugString()); + + return Status::OK(); +} + +} // namespace + +template <typename T> +class EditDistanceOp : public OpKernel { + public: + explicit EditDistanceOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + 
OP_REQUIRES_OK(ctx, ctx->GetAttr("normalize", &normalize_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* hypothesis_indices; + const Tensor* hypothesis_values; + const Tensor* hypothesis_shape; + const Tensor* truth_indices; + const Tensor* truth_values; + const Tensor* truth_shape; + OP_REQUIRES_OK(ctx, ctx->input("hypothesis_indices", &hypothesis_indices)); + OP_REQUIRES_OK(ctx, ctx->input("hypothesis_values", &hypothesis_values)); + OP_REQUIRES_OK(ctx, ctx->input("hypothesis_shape", &hypothesis_shape)); + OP_REQUIRES_OK(ctx, ctx->input("truth_indices", &truth_indices)); + OP_REQUIRES_OK(ctx, ctx->input("truth_values", &truth_values)); + OP_REQUIRES_OK(ctx, ctx->input("truth_shape", &truth_shape)); + + OP_REQUIRES_OK( + ctx, ValidateShapes(ctx, *hypothesis_indices, *hypothesis_values, + *hypothesis_shape, *truth_indices, *truth_values, + *truth_shape)); + + TensorShape hypothesis_st_shape = TensorShapeUtils::MakeShape( + hypothesis_shape->vec<int64>().data(), hypothesis_shape->NumElements()); + TensorShape truth_st_shape = TensorShapeUtils::MakeShape( + truth_shape->vec<int64>().data(), truth_shape->NumElements()); + + // Assume indices are sorted in row-major order. + std::vector<int64> sorted_order(truth_st_shape.dims()); + std::iota(sorted_order.begin(), sorted_order.end(), 0); + + sparse::SparseTensor hypothesis(*hypothesis_indices, *hypothesis_values, + hypothesis_st_shape, sorted_order); + sparse::SparseTensor truth(*truth_indices, *truth_values, truth_st_shape, + sorted_order); + + // Group dims 0, 1, ..., RANK - 1. The very last dim is assumed + // to store the variable length sequences. + std::vector<int64> group_dims(truth_st_shape.dims() - 1); + std::iota(group_dims.begin(), group_dims.end(), 0); + + TensorShape output_shape; + for (int d = 0; d < group_dims.size(); ++d) { + output_shape.AddDim(std::max(hypothesis_st_shape.dim_size(d), + truth_st_shape.dim_size(d))); + } + + Tensor* output = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output("output", output_shape, &output)); + auto output_t = output->flat<float>(); + output_t.setZero(); + + std::vector<int64> output_strides(output_shape.dims()); + output_strides[output_shape.dims() - 1] = 1; + for (int d = output_shape.dims() - 2; d >= 0; --d) { + output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1); + } + + auto hypothesis_grouper = hypothesis.group(group_dims); + auto truth_grouper = truth.group(group_dims); + + auto hypothesis_iter = hypothesis_grouper.begin(); + auto truth_iter = truth_grouper.begin(); + + auto cmp = std::equal_to<T>(); + + while (hypothesis_iter != hypothesis_grouper.end() && + truth_iter != truth_grouper.end()) { + sparse::Group truth_i = *truth_iter; + sparse::Group hypothesis_j = *hypothesis_iter; + std::vector<int64> g_truth = truth_i.group(); + std::vector<int64> g_hypothesis = hypothesis_j.group(); + auto truth_seq = truth_i.values<T>(); + auto hypothesis_seq = hypothesis_j.values<T>(); + + if (g_truth == g_hypothesis) { + auto loc = std::inner_product(g_truth.begin(), g_truth.end(), + output_strides.begin(), 0); + output_t(loc) = + gtl::LevenshteinDistance<T>(truth_seq, hypothesis_seq, cmp); + if (normalize_) output_t(loc) /= truth_seq.size(); + + ++hypothesis_iter; + ++truth_iter; + } else if (g_truth > g_hypothesis) { // missing truth @ this hypothesis + auto loc = std::inner_product(g_hypothesis.begin(), g_hypothesis.end(), + output_strides.begin(), 0); + output_t(loc) = hypothesis_seq.size(); + if (normalize_) output_t(loc) /= 0.0; + 
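// Dividing by 0.0 makes the normalized distance effectively infinite here:
// there is no truth sequence in this group to normalize against.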
++hypothesis_iter; + } else { // missing hypothesis @ this truth + auto loc = std::inner_product(g_truth.begin(), g_truth.end(), + output_strides.begin(), 0); + output_t(loc) = (normalize_) ? 1.0 : truth_seq.size(); + ++truth_iter; + } + } + while (hypothesis_iter != hypothesis_grouper.end()) { // missing truths + sparse::Group hypothesis_j = *hypothesis_iter; + std::vector<int64> g_hypothesis = hypothesis_j.group(); + auto hypothesis_seq = hypothesis_j.values<T>(); + auto loc = std::inner_product(g_hypothesis.begin(), g_hypothesis.end(), + output_strides.begin(), 0); + output_t(loc) = hypothesis_seq.size(); + if (normalize_) output_t(loc) /= 0.0; + ++hypothesis_iter; + } + while (truth_iter != truth_grouper.end()) { // missing hypotheses + sparse::Group truth_i = *truth_iter; + std::vector<int64> g_truth = truth_i.group(); + auto truth_seq = truth_i.values<T>(); + auto loc = std::inner_product(g_truth.begin(), g_truth.end(), + output_strides.begin(), 0); + output_t(loc) = (normalize_) ? 1.0 : truth_seq.size(); + ++truth_iter; + } + } + + private: + bool normalize_; + + TF_DISALLOW_COPY_AND_ASSIGN(EditDistanceOp); +}; + +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("EditDistance").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ + EditDistanceOp<T>); + +TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL); + +#undef REGISTER_CPU_KERNEL + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc new file mode 100644 index 0000000000..8f5fd2f8be --- /dev/null +++ b/tensorflow/core/kernels/encode_jpeg_op.cc @@ -0,0 +1,114 @@ +// See docs in ../ops/image_ops.cc + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/lib/jpeg/jpeg_mem.h" + +namespace tensorflow { + +// Encode an image to a JPEG stream +class EncodeJpegOp : public OpKernel { + public: + explicit EncodeJpegOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("format", &format_)); + if (format_.empty()) { + flags_.format = static_cast<jpeg::Format>(0); + } else if (format_ == "grayscale") { + flags_.format = jpeg::FORMAT_GRAYSCALE; + } else if (format_ == "rgb") { + flags_.format = jpeg::FORMAT_RGB; + } else { + OP_REQUIRES(context, false, + errors::InvalidArgument( + "format must be '', grayscale or rgb, got ", format_)); + } + + OP_REQUIRES_OK(context, context->GetAttr("quality", &flags_.quality)); + OP_REQUIRES(context, 0 <= flags_.quality && flags_.quality <= 100, + errors::InvalidArgument("quality must be in [0,100], got ", + flags_.quality)); + OP_REQUIRES_OK(context, + context->GetAttr("progressive", &flags_.progressive)); + OP_REQUIRES_OK( + context, context->GetAttr("optimize_size", &flags_.optimize_jpeg_size)); + OP_REQUIRES_OK(context, context->GetAttr("chroma_downsampling", + &flags_.chroma_downsampling)); + OP_REQUIRES_OK(context, context->GetAttr("chroma_downsampling", + &flags_.chroma_downsampling)); + + string density_unit; + OP_REQUIRES_OK(context, context->GetAttr("density_unit", &density_unit)); + if (density_unit == "in") { + flags_.density_unit = 1; + } else if (density_unit == "cm") { + flags_.density_unit = 2; + } else { + OP_REQUIRES(context, false, + 
errors::InvalidArgument("density_unit must be 'in' or 'cm'", + density_unit)); + } + + OP_REQUIRES_OK(context, context->GetAttr("x_density", &flags_.x_density)); + OP_REQUIRES_OK(context, context->GetAttr("y_density", &flags_.y_density)); + OP_REQUIRES_OK(context, context->GetAttr("xmp_metadata", &xmp_metadata_)); + flags_.xmp_metadata = xmp_metadata_; // StringPiece doesn't own data + } + + void Compute(OpKernelContext* context) override { + const Tensor& image = context->input(0); + OP_REQUIRES(context, image.dims() == 3, + errors::InvalidArgument("image must be 3-dimensional", + image.shape().ShortDebugString())); + + // Autodetect format if desired, otherwise make sure format and + // image channels are consistent. + int channels; + jpeg::CompressFlags adjusted_flags = flags_; + if (flags_.format == 0) { + channels = image.dim_size(2); + if (channels == 1) { + adjusted_flags.format = jpeg::FORMAT_GRAYSCALE; + } else if (channels == 3) { + adjusted_flags.format = jpeg::FORMAT_RGB; + } else { + OP_REQUIRES(context, false, errors::InvalidArgument( + "image must have 1 or 3 channels, got ", + image.shape().ShortDebugString())); + } + } else { + if (flags_.format == jpeg::FORMAT_GRAYSCALE) { + channels = 1; + } else { // RGB + channels = 3; + } + OP_REQUIRES(context, channels == image.dim_size(2), + errors::InvalidArgument("format ", format_, " expects ", + channels, " channels, got ", + image.shape().ShortDebugString())); + } + + // Encode image to jpeg string + Tensor* output = NULL; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + OP_REQUIRES(context, + jpeg::Compress(image.flat<uint8>().data(), image.dim_size(1), + image.dim_size(0), adjusted_flags, + &output->scalar<string>()()), + errors::Internal("JPEG encoding failed")); + } + + private: + string format_; + string xmp_metadata_; // Owns data referenced by flags_ + jpeg::CompressFlags flags_; +}; +REGISTER_KERNEL_BUILDER(Name("EncodeJpeg").Device(DEVICE_CPU), EncodeJpegOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/encode_png_op.cc b/tensorflow/core/kernels/encode_png_op.cc new file mode 100644 index 0000000000..5249074377 --- /dev/null +++ b/tensorflow/core/kernels/encode_png_op.cc @@ -0,0 +1,52 @@ +// See docs in ../ops/image_ops.cc + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/lib/png/png_io.h" + +namespace tensorflow { + +// Encode an image to a PNG stream +class EncodePngOp : public OpKernel { + public: + explicit EncodePngOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("compression", &compression_)); + OP_REQUIRES(context, -1 <= compression_ && compression_ <= 9, + errors::InvalidArgument("compression should be in [-1,9], got ", + compression_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& image = context->input(0); + OP_REQUIRES(context, image.dims() == 3, + errors::InvalidArgument("image must be 3-dimensional", + image.shape().ShortDebugString())); + const int64 channels = image.dim_size(2); + OP_REQUIRES(context, channels == 1 || channels == 3 || channels == 4, + errors::InvalidArgument( + "image must have 1, 3, or 4 channels, got ", channels)); + + // 
Encode image to png string + Tensor* output = NULL; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + OP_REQUIRES(context, + png::WriteImageToBuffer( + image.flat<uint8>().data(), image.dim_size(1), + image.dim_size(0), image.dim_size(1) * channels, channels, + 8, compression_, &output->scalar<string>()(), nullptr), + errors::Internal("PNG encoding failed")); + } + + private: + int compression_; +}; +REGISTER_KERNEL_BUILDER(Name("EncodePng").Device(DEVICE_CPU), EncodePngOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc new file mode 100644 index 0000000000..c217c18207 --- /dev/null +++ b/tensorflow/core/kernels/example_parsing_ops.cc @@ -0,0 +1,444 @@ +// See docs in ../ops/parsing_ops.cc. + +#include "tensorflow/core/example/example.pb.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/util/sparse/sparse_tensor.h" + +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +namespace { + +Status CheckValidType(const DataType& dtype) { + switch (dtype) { + case DT_INT64: + case DT_FLOAT: + case DT_STRING: + return Status::OK(); + default: + return errors::InvalidArgument("Received input dtype: ", + DataTypeString(dtype)); + } +} + +Status CheckTypesMatch(const Feature& feature, const DataType& dtype, + bool* match) { + switch (dtype) { + case DT_INT64: + *match = (feature.kind_case() == Feature::kInt64List); + break; + case DT_FLOAT: + *match = (feature.kind_case() == Feature::kFloatList); + break; + case DT_STRING: + *match = (feature.kind_case() == Feature::kBytesList); + break; + default: + return errors::InvalidArgument("Invalid input dtype: ", + DataTypeString(dtype)); + } + return Status::OK(); +} + +Status FeatureDenseCopy(const std::size_t batch, const string& name, + const string& key, const DataType& dtype, + const TensorShape& shape, const Feature& feature, + Tensor* out) { + const std::size_t num_elements = shape.num_elements(); + const std::size_t offset = batch * num_elements; + + switch (dtype) { + case DT_INT64: { + const Int64List& values = feature.int64_list(); + if (static_cast<size_t>(values.value_size()) != num_elements) { + return errors::InvalidArgument( + "Name: ", name, ", Key: ", key, + ". Number of int64 values != expected. " + "values size: ", + values.value_size(), " but output shape: ", + shape.ShortDebugString()); + } + auto out_p = out->flat<int64>().data() + offset; + std::copy_n(values.value().data(), num_elements, out_p); + return Status::OK(); + } + case DT_FLOAT: { + const FloatList& values = feature.float_list(); + if (static_cast<size_t>(values.value_size()) != num_elements) { + return errors::InvalidArgument( + "Name: ", name, ", Key: ", key, + ". Number of float values != expected. " + "values size: ", + values.value_size(), " but output shape: ", + shape.ShortDebugString()); + } + auto out_p = out->flat<float>().data() + offset; + std::copy_n(values.value().data(), num_elements, out_p); + return Status::OK(); + } + case DT_STRING: { + const BytesList& values = feature.bytes_list(); + if (static_cast<size_t>(values.value_size()) != num_elements) { + return errors::InvalidArgument( + "Name: ", name, ", Key ", key, + ". number of bytes values != expected. 
" + "values size: ", + values.value_size(), " but output shape: ", + shape.ShortDebugString()); + } + auto out_p = out->flat<string>().data() + offset; + std::transform(values.value().data(), + values.value().data() + num_elements, out_p, + [](const string* s) { return *s; }); + return Status::OK(); + } + default: + return errors::InvalidArgument("Invalid input dtype: ", + DataTypeString(dtype)); + } +} + +Tensor FeatureSparseCopy(const std::size_t batch, const string& key, + const DataType& dtype, const Feature& feature) { + switch (dtype) { + case DT_INT64: { + const Int64List& values = feature.int64_list(); + const int64 num_elements = values.value_size(); + Tensor out(dtype, TensorShape({num_elements})); + auto out_p = out.flat<int64>().data(); + std::copy_n(values.value().data(), num_elements, out_p); + return out; + } + case DT_FLOAT: { + const FloatList& values = feature.float_list(); + const int64 num_elements = values.value_size(); + Tensor out(dtype, TensorShape({num_elements})); + auto out_p = out.flat<float>().data(); + std::copy_n(values.value().data(), num_elements, out_p); + return out; + } + case DT_STRING: { + const BytesList& values = feature.bytes_list(); + const int64 num_elements = values.value_size(); + Tensor out(dtype, TensorShape({num_elements})); + auto out_p = out.flat<string>().data(); + std::transform(values.value().data(), + values.value().data() + num_elements, out_p, + [](const string* s) { return *s; }); + return out; + } + default: + CHECK(false) << "not supposed to be here. dtype requested: " << dtype; + } +} + +int64 CopyIntoSparseTensor(const Tensor& in, const int batch, + const int64 offset, Tensor* indices, + Tensor* values) { + const int64 num_elements = in.shape().num_elements(); + const DataType& dtype = in.dtype(); + CHECK_EQ(dtype, values->dtype()); + + // Update indices + auto ix_t = indices->matrix<int64>(); + int64* ix_p = &ix_t(offset, 0); + for (int64 i = 0; i < num_elements; ++i, ix_p += 2) { + *ix_p = batch; // Column 0 stores the batch entry + *(ix_p + 1) = i; // Column 1 stores the index in the batch + } + + // Copy values over + switch (dtype) { + case DT_INT64: { + std::copy_n(in.flat<int64>().data(), num_elements, + values->flat<int64>().data() + offset); + break; + } + case DT_FLOAT: { + std::copy_n(in.flat<float>().data(), num_elements, + values->flat<float>().data() + offset); + break; + } + case DT_STRING: { + std::copy_n(in.flat<string>().data(), num_elements, + values->flat<string>().data() + offset); + break; + // auto values_t = values->flat<string>().data() + offset; + // auto in_t = in.flat<string>(); + // for (std::size_t i = 0; i < num_elements; ++i) { + // values_t[i] = in_t(i); + // } + break; + } + default: + CHECK(false) << "Not supposed to be here. Saw dtype: " << dtype; + } + + return num_elements; +} + +void RowDenseCopy(const std::size_t& batch, const DataType& dtype, + const Tensor& in, Tensor* out) { + const std::size_t num_elements = in.shape().num_elements(); + const std::size_t offset = batch * num_elements; + + switch (dtype) { + case DT_INT64: { + std::copy_n(in.flat<int64>().data(), num_elements, + out->flat<int64>().data() + offset); + break; + } + case DT_FLOAT: { + std::copy_n(in.flat<float>().data(), num_elements, + out->flat<float>().data() + offset); + break; + } + case DT_STRING: { + std::copy_n(in.flat<string>().data(), num_elements, + out->flat<string>().data() + offset); + break; + } + default: + CHECK(false) << "Not supposed to be here. 
Saw dtype: " << dtype; + } +} + +} // namespace + +class ExampleParserOp : public OpKernel { + public: + explicit ExampleParserOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("sparse_types", &sparse_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Ndense", &num_dense_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Nsparse", &num_sparse_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Tdense", &dense_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("dense_shapes", &dense_shapes_)); + + OP_REQUIRES( + ctx, static_cast<size_t>(num_sparse_) == sparse_types_.size(), + errors::InvalidArgument("len(sparse_keys) != len(sparse_types")); + OP_REQUIRES(ctx, static_cast<size_t>(num_dense_) == dense_types_.size(), + errors::InvalidArgument("len(dense_keys) != len(dense_types")); + OP_REQUIRES(ctx, static_cast<size_t>(num_dense_) == dense_shapes_.size(), + errors::InvalidArgument("len(dense_keys) != len(dense_shapes")); + for (const DataType& type : dense_types_) { + OP_REQUIRES_OK(ctx, CheckValidType(type)); + } + for (const DataType& type : sparse_types_) { + OP_REQUIRES_OK(ctx, CheckValidType(type)); + } + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* names; + const Tensor* serialized; + OpInputList dense_keys; + OpInputList sparse_keys; + OpInputList dense_defaults; + + OP_REQUIRES_OK(ctx, ctx->input("names", &names)); + OP_REQUIRES_OK(ctx, ctx->input("serialized", &serialized)); + OP_REQUIRES_OK(ctx, ctx->input_list("dense_keys", &dense_keys)); + OP_REQUIRES_OK(ctx, ctx->input_list("sparse_keys", &sparse_keys)); + OP_REQUIRES_OK(ctx, ctx->input_list("dense_defaults", &dense_defaults)); + + std::vector<string> dense_keys_t(num_dense_); + std::vector<string> sparse_keys_t(num_sparse_); + CHECK_EQ(dense_keys.size(), num_dense_); + CHECK_EQ(sparse_keys.size(), num_sparse_); + for (int di = 0; di < num_dense_; ++di) { + dense_keys_t[di] = dense_keys[di].scalar<string>()(); + } + for (int di = 0; di < num_sparse_; ++di) { + sparse_keys_t[di] = sparse_keys[di].scalar<string>()(); + } + + bool has_names = (names->NumElements() > 0); + if (has_names) { + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(names->shape()), + errors::InvalidArgument("Expected names to be a vector, got shape: ", + names->shape().ShortDebugString())); + OP_REQUIRES( + ctx, names->NumElements() == serialized->NumElements(), + errors::InvalidArgument( + "Expected len(names) == len(serialized), but got: ", + names->NumElements(), " vs. ", serialized->NumElements())); + } + auto names_t = names->flat<string>(); + + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(serialized->shape()), + errors::InvalidArgument( + "Expected serialized to be a vector, got shape: ", + serialized->shape().ShortDebugString())); + OP_REQUIRES(ctx, dense_defaults.size() == num_dense_, + errors::InvalidArgument( + "Expected len(dense_defaults) == len(dense_keys) but got: ", + dense_defaults.size(), " vs. ", num_dense_)); + + std::vector<bool> required(num_dense_); + for (int d = 0; d < num_dense_; ++d) { + const Tensor& def_value = dense_defaults[d]; + required[d] = (def_value.NumElements() == 0); // No default provided. 
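// A dense feature whose default is empty is treated as required: parsing fails
// later if the key is missing from an Example. Otherwise the default value is
// copied into the corresponding output row via RowDenseCopy.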
+ + if (def_value.NumElements() > 0) { + OP_REQUIRES( + ctx, def_value.shape() == dense_shapes_[d], + errors::InvalidArgument("def_value[", d, "].shape() == ", + def_value.shape().ShortDebugString(), + " != dense_shapes_[", d, "] == ", + dense_shapes_[d].ShortDebugString())); + OP_REQUIRES(ctx, def_value.dtype() == dense_types_[d], + errors::InvalidArgument( + "dense_defaults[", d, "].dtype() == ", + DataTypeString(def_value.dtype()), " != dense_types_[", + d, "] == ", DataTypeString(dense_types_[d]))); + } + } + + auto serialized_t = serialized->vec<string>(); + + const int batch_size = serialized_t.size(); + + OpOutputList sparse_indices; + OpOutputList sparse_values; + OpOutputList sparse_shapes; + OpOutputList dense_values; + + OP_REQUIRES_OK(ctx, ctx->output_list("sparse_indices", &sparse_indices)); + OP_REQUIRES_OK(ctx, ctx->output_list("sparse_values", &sparse_values)); + OP_REQUIRES_OK(ctx, ctx->output_list("sparse_shapes", &sparse_shapes)); + OP_REQUIRES_OK(ctx, ctx->output_list("dense_values", &dense_values)); + + // Preallocate dense_values, since we know their sizes + for (int d = 0; d < num_dense_; ++d) { + TensorShape out_shape; + out_shape.AddDim(batch_size); + for (const int dim : dense_shapes_[d].dim_sizes()) out_shape.AddDim(dim); + Tensor* out = nullptr; + dense_values.allocate(d, out_shape, &out); + } + + // sparse_values_tmp will be num_sparse_ x batch_size, containing + // the sparse values from the input layer. after these are all + // stored, we can allocate properly sized outputs and copy data over. + // Doing it this way saves us the trouble of either performing + // deserialization twice, or alternatively storing all copies of + // the full Example protos. + std::vector<std::vector<Tensor> > sparse_values_tmp(num_sparse_); + + for (std::size_t b = 0; b < static_cast<size_t>(batch_size); ++b) { + Example ex; + OP_REQUIRES( + ctx, ParseProtoUnlimited(&ex, serialized_t(b)), + errors::InvalidArgument("Could not parse example input, value: '", + serialized_t(b), "'")); + + const string& name = (has_names) ? names_t(b) : "<unknown>"; + const Features& features = ex.features(); + const auto& feature_dict = features.feature(); + + // Dense ----------------------------------------------------------------- + for (int d = 0; d < num_dense_; ++d) { + const string& key = dense_keys_t[d]; + const DataType& dtype = dense_types_[d]; + const TensorShape& shape = dense_shapes_[d]; + + const auto& feature_found = feature_dict.find(key); + OP_REQUIRES( + ctx, (feature_found != feature_dict.end()) || !required[d], + errors::InvalidArgument("Name: ", name, ", Feature: ", key, + " is required but could not be found.")); + if (feature_found != feature_dict.end()) { + const Feature& f = feature_found->second; + bool types_match; + OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match)); + OP_REQUIRES( + ctx, types_match, + errors::InvalidArgument("Name: ", name, ", Feature: ", key, + ". Data types don't match. 
", + "Expected type: ", DataTypeString(dtype), + " Feature is: ", f.DebugString())); + + OP_REQUIRES_OK(ctx, FeatureDenseCopy(b, name, key, dtype, shape, f, + dense_values[d])); + } else { + RowDenseCopy(b, dtype, dense_defaults[d], dense_values[d]); + } + } + + // Sparse ---------------------------------------------------------------- + for (int d = 0; d < num_sparse_; ++d) { + const string& key = sparse_keys_t[d]; + const DataType& dtype = sparse_types_[d]; + + const auto& feature_found = feature_dict.find(key); + bool feature_has_data = // Found key & data type is set + (feature_found != feature_dict.end() && + (feature_found->second.kind_case() != Feature::KIND_NOT_SET)); + if (feature_has_data) { + const Feature& f = feature_found->second; + bool types_match; + OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match)); + OP_REQUIRES( + ctx, types_match, + errors::InvalidArgument("Name: ", name, ", Feature: ", key, + ". Data types don't match. ", + "Expected type: ", DataTypeString(dtype), + " Feature is: ", f.DebugString())); + sparse_values_tmp[d].push_back(FeatureSparseCopy(b, key, dtype, f)); + } else { + sparse_values_tmp[d].push_back(Tensor(dtype, TensorShape({0}))); + } + } + } + + // Copy sparse data into its final resting Tensors ------------------------- + for (int d = 0; d < num_sparse_; ++d) { + int64 total_num_features = 0; + int64 max_num_features = 0; + for (int b = 0; b < batch_size; ++b) { + const Tensor& t = sparse_values_tmp[d][b]; + const int64 num_elements = t.shape().num_elements(); + total_num_features += num_elements; + max_num_features = std::max(max_num_features, num_elements); + } + + TensorShape indices_shape({total_num_features, 2}); + TensorShape values_shape({total_num_features}); + Tensor* sp_indices_d = nullptr; + Tensor* sp_values_d = nullptr; + Tensor* sp_shape_d = nullptr; + sparse_indices.allocate(d, indices_shape, &sp_indices_d); + sparse_values.allocate(d, values_shape, &sp_values_d); + sparse_shapes.allocate(d, TensorShape({2}), &sp_shape_d); + + auto shape_t = sp_shape_d->vec<int64>(); + shape_t(0) = batch_size; + shape_t(1) = max_num_features; + + int64 offset = 0; + + for (int b = 0; b < batch_size; ++b) { + const int64 num_elements = CopyIntoSparseTensor( + sparse_values_tmp[d][b], b, offset, sp_indices_d, sp_values_d); + offset += num_elements; + } + } + } + + protected: + int64 num_sparse_; + int64 num_dense_; + std::vector<DataType> sparse_types_; + std::vector<DataType> dense_types_; + std::vector<TensorShape> dense_shapes_; +}; + +REGISTER_KERNEL_BUILDER(Name("ParseExample").Device(DEVICE_CPU), + ExampleParserOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc new file mode 100644 index 0000000000..dfe220fffb --- /dev/null +++ b/tensorflow/core/kernels/fact_op.cc @@ -0,0 +1,96 @@ +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +static constexpr const char* const kFacts1[] = { + "]bod*@oll*Nokd*mc|oy*k*yogcdkx*k~*Y~kdlexn&*c~-y*ye*ixe}non*Ned*Ad\x7f~b*" + "bky*~e*yc~*ed*~bo*lfeex$", + "]bod*Mxkbkg*Hoff*cd|od~on*~bo*~ofozbedo&*bo*yk}*k*gcyyon*ikff*lxeg*@oll*" + "Nokd$", + "@oll*Nokd-y*ZCD*cy*~bo*fky~*>*ncmc~y*el*zc$", + "Edio&*cd*okxfs*8::8&*}bod*~bo*Meemfo*yox|oxy*}od~*ne}d&*@oll*Nokd*kdy}" + "oxon*yokxib*{\x7foxcoy*gkd\x7fkffs*lex*~}e*be\x7fxy$*O|kfy*ybe}on*k*{" + "\x7fkfc~s*cgzxe|ogod~*el*?*zecd~y$", + "@oll*Nokd*z\x7f~y*bcy*zkd~y*ed*edo*fom*k~*k*~cgo&*h\x7f~*cl*bo*bkn*gexo*~" + 
"bkd*~}e*fomy&*se\x7f*}e\x7f\x66n*yoo*~bk~*bcy*kzzxekib*cy*ki~\x7fkffs*" + "E\"fem*d#$", + "@oll*Nokd*iegzcfoy*kdn*x\x7f\x64y*bcy*ieno*holexo*y\x7fhgc~~cdm&*h\x7f~*" + "edfs*~e*iboia*lex*iegzcfox*h\x7fmy$", + "@oll*Nokd*ixok~on*~bo*}exfn-y*lcxy~*E\";%d#*kfmexc~bg$", + "@oll*Nokd*}xe~o*kd*E\"dT8#*kfmexc~bg*edio$*C~*}ky*lex*~bo*^xk|ofcdm*" + "Ykfoygkd*Zxehfog$", + "^bo*xk~o*k~*}bcib*@oll*Nokd*zxen\x7fioy*ieno*`\x7fgzon*hs*k*lki~ex*el*>:*" + "cd*fk~o*8:::*}bod*bo*\x7fzmxknon*bcy*aoshekxn*~e*_YH8$:$", + "@oll*Nokd*ikd*hok~*se\x7f*k~*ieddoi~*le\x7fx$*Cd*~bxoo*ge|oy$", + "@oll*Nokd*ade}y*}bs*~bo*kdy}ox*cy*>8$", + "@oll*Nokd*y~kx~y*bcy*zxemxkggcdm*yoyycedy*}c~b*(ik~*4*%no|%gog($", + "]bod*@oll*Nokd*yksy*(ezod*~bo*zen*hks*neexy(&*Bkf*ezody*~bo*zen*hks*" + "neexy$", + "@oll*Nokd*ycgzfs*}kfay*cd~e*Gexnex$", + "Ib\x7fia*Dexxcy*cy*@oll*Nokd-y*8:/*zxe`oi~$", + "@oll*Nokd-y*}k~ib*ncyzfksy*yoiedny*ycdio*@kd\x7fkxs*;y~&*;3=:$*Bo*cy*do|" + "ox*fk~o$", + "]bod*se\x7fx*ieno*bky*\x7f\x64nolcdon*hobk|cex&*se\x7f*mo~*k*" + "yomlk\x7f\x66~*kdn*iexx\x7fz~on*nk~k$*]bod*@oll*Nokd-y*ieno*bky*" + "\x7f\x64nolcdon*hobk|cex&*k*\x7f\x64\x63iexd*xcnoy*cd*ed*k*xkcdhe}*kdn*mc|" + "oy*o|oxshens*lxoo*cio*ixokg$", + "Moell*Bcd~ed*neoyd-~*doon*~e*gkao*bcnnod*\x7f\x64\x63~y$*^bos*bcno*hs*~" + "bogyof|oy*}bod*bo*kzzxekiboy$", + "Moell*Bcd~ed*neoyd-~*ncykmxoo&*bo*ied~xky~c|ofs*nc|oxmoy$", + "Nooz*Hofcol*Do~}exay*ki~\x7fkffs*hofco|o*noozfs*cd*Moell*Bcd~ed$", + "Moell*Bcd~ed*bky*ncyie|oxon*be}*~bo*hxkcd*xokffs*}exay$$$*edio*k*sokx&*" + "lex*~bo*fky~*8?*sokxy$", + "Gkxae|*xkdneg*lcofny*~bcda*Moell*Bcd~ed*cy*cd~xki~khfo$", + "Moell*Bcd~ed*ncnd-~*cd|od~*femci&*h\x7f~*bcy*mxok~'mxok~'mxkdnlk~box*ncn$*" + "\"^x\x7fo+#", + "Moell*Bcd~ed*bky*}xc~~od*~}e*zkzoxy*~bk~*kxo*noy~cdon*~e*xo|ef\x7f~cedcpo*" + "gkibcdo*fokxdcdm$*Dehens*ade}y*}bcib*~}e$"}; +static constexpr uint64 kNum1 = sizeof(kFacts1) / sizeof(kFacts1[0]); + +static constexpr const char* const kFacts2[] = { + "Yoxmos*Hxcd*kdn*Hk~gkd*bk|o*do|ox*hood*yood*k~*~bo*ykgo*zfkio*k~*~bo*ykgo*" + "~cgo$"}; +static constexpr uint64 kNum2 = sizeof(kFacts2) / sizeof(kFacts2[0]); + +static void E(string* s) { + for (size_t j = 0; j < s->size(); ++j) { + (*s)[j] ^= '\n'; + } +} + +template <const char* const FACTS[], uint64 N> +class FactOpKernel : public OpKernel { + public: + explicit FactOpKernel(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + Tensor* output_tensor = NULL; + OP_REQUIRES_OK( + context, context->allocate_output(0, TensorShape({}), &output_tensor)); + auto output = output_tensor->template scalar<string>(); + + string coded = FACTS[context->env()->NowMicros() % N]; + E(&coded); + output() = coded; + } +}; + +REGISTER_KERNEL_BUILDER(Name("Fact").Device(DEVICE_GPU).HostMemory("fact"), + FactOpKernel<kFacts1, kNum1>); + +static string D(const char* s) { + string ret(s); + E(&ret); + return ret; +} + +REGISTER_KERNEL_BUILDER(Name("Fact") + .Device(DEVICE_CPU) + .Label(D("Yoxmos").c_str()), + FactOpKernel<kFacts2, kNum2>); +REGISTER_KERNEL_BUILDER(Name("Fact") + .Device(DEVICE_CPU) + .Label(D("yoxmos").c_str()), + FactOpKernel<kFacts2, kNum2>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc new file mode 100644 index 0000000000..20e1f31f06 --- /dev/null +++ b/tensorflow/core/kernels/fifo_queue.cc @@ -0,0 +1,518 @@ +// See docs in ../ops/data_flow_ops.cc. 
+ +#include <deque> +#include <vector> + +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/fifo_queue.h" +#include "tensorflow/core/kernels/queue_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +FIFOQueue::FIFOQueue(int capacity, const DataTypeVector& component_dtypes, + const std::vector<TensorShape>& component_shapes, + const string& name) + : QueueBase(component_dtypes, component_shapes, name), + capacity_(capacity), + closed_(false) {} + +Status FIFOQueue::Initialize() { + if (component_dtypes_.empty()) { + return errors::InvalidArgument("Empty component types for queue ", name_); + } + if (!component_shapes_.empty() && + component_dtypes_.size() != component_shapes_.size()) { + return errors::InvalidArgument("Different number of component types (", + component_dtypes_.size(), ") vs. shapes (", + component_shapes_.size(), ")."); + } + + mutex_lock lock(mu_); + queues_.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + queues_.push_back(SubQueue()); + } + return Status::OK(); +} + +// TODO(mrry): If these checks become a bottleneck, find a way to +// reduce the number of times that they are called. +Status FIFOQueue::ValidateTuple(const Tuple& tuple) { + TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple)); + if (specified_shapes()) { + for (size_t i = 0; i < tuple.size(); ++i) { + if (!tuple[i].shape().IsSameSize(component_shapes_[i])) { + return errors::InvalidArgument( + "Shape mismatch in tuple component ", i, ". Expected ", + component_shapes_[i].ShortDebugString(), ", got ", + tuple[i].shape().ShortDebugString()); + } + } + } + return Status::OK(); +} + +// TODO(mrry): If these checks become a bottleneck, find a way to +// reduce the number of times that they are called. +Status FIFOQueue::ValidateManyTuple(const Tuple& tuple) { + TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple)); + const int64 batch_size = tuple[0].dim_size(0); + if (specified_shapes()) { + for (size_t i = 0; i < tuple.size(); ++i) { + // Expected shape is [batch_size] + component_shapes_[i] + const TensorShape expected_shape = ManyOutShape(i, batch_size); + if (!tuple[i].shape().IsSameSize(expected_shape)) { + return errors::InvalidArgument( + "Shape mismatch in tuple component ", i, ". Expected ", + expected_shape.ShortDebugString(), ", got ", + tuple[i].shape().ShortDebugString()); + } + } + } else { + for (size_t i = 1; i < tuple.size(); ++i) { + if (tuple[i].dim_size(0) != batch_size) { + return errors::InvalidArgument( + "All input tensors must have the same size in the 0th ", + "dimension. Component ", i, " has ", tuple[i].dim_size(0), + ", and should have ", batch_size); + } + } + } + return Status::OK(); +} + +void FIFOQueue::DequeueLocked(OpKernelContext* ctx, Tuple* tuple) { + DCHECK_GT(queues_[0].size(), 0); + (*tuple).reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + (*tuple).push_back(*queues_[i][0].AccessTensor(ctx)); + queues_[i].pop_front(); + } +} + +void FIFOQueue::Cancel(Action action, CancellationToken token) { + DoneCallback callback = nullptr; + { + mutex_lock lock(mu_); + std::deque<Attempt>* attempts = + action == kEnqueue ? 
&enqueue_attempts_ : &dequeue_attempts_; + + for (Attempt& attempt : *attempts) { + if (attempt.cancellation_token == token) { + attempt.is_cancelled = true; + if (action == kEnqueue) { + attempt.context->SetStatus( + errors::Cancelled("Enqueue operation was cancelled")); + } else { + attempt.context->SetStatus( + errors::Cancelled("Dequeue operation was cancelled")); + } + std::swap(callback, attempt.done_callback); + break; + } + } + } + if (callback) { + callback(); + FlushUnlocked(); + } +} + +void FIFOQueue::CloseAndCancel() { + std::vector<DoneCallback> callbacks; + { + mutex_lock lock(mu_); + closed_ = true; + for (Attempt& attempt : enqueue_attempts_) { + attempt.is_cancelled = true; + attempt.context->SetStatus( + errors::Cancelled("Enqueue operation was cancelled")); + callbacks.emplace_back(std::move(attempt.done_callback)); + } + } + for (const DoneCallback& callback : callbacks) { + callback(); + } + FlushUnlocked(); +} + +bool FIFOQueue::TryAttemptLocked(Action action, + std::vector<CleanUp>* clean_up) { + std::deque<Attempt>* attempts = + action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_; + + bool progress = false; + bool done = false; + while (!done && !attempts->empty()) { + if (attempts->front().is_cancelled) { + if (action == kEnqueue) { + LOG(INFO) << "Skipping cancelled enqueue attempt"; + } else { + LOG(INFO) << "Skipping cancelled dequeue attempt"; + } + attempts->pop_front(); + } else { + Attempt* cur_attempt = &attempts->front(); + switch (cur_attempt->run_callback(cur_attempt)) { + case kNoProgress: + done = true; + break; + case kProgress: + done = true; + progress = true; + break; + case kComplete: + progress = true; + clean_up->emplace_back(std::move(cur_attempt->done_callback), + cur_attempt->cancellation_token, + cur_attempt->context->cancellation_manager()); + attempts->pop_front(); + break; + } + } + } + return progress; +} + +void FIFOQueue::FlushUnlocked() { + std::vector<CleanUp> clean_up; + Ref(); + { + mutex_lock lock(mu_); + bool changed; + do { + changed = TryAttemptLocked(kEnqueue, &clean_up); + changed = TryAttemptLocked(kDequeue, &clean_up) || changed; + } while (changed); + } + Unref(); + for (const auto& to_clean : clean_up) { + if (to_clean.to_deregister != CancellationManager::kInvalidToken) { + // NOTE(mrry): We can safely ignore the return value of + // DeregisterCallback because the mutex mu_ ensures that the + // cleanup action only executes once. 
+ to_clean.cm->DeregisterCallback(to_clean.to_deregister); + } + to_clean.finished(); + } +} + +void FIFOQueue::TryEnqueue(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) { + CancellationManager* cm = ctx->cancellation_manager(); + CancellationToken token = cm->get_cancellation_token(); + bool already_cancelled; + { + mutex_lock l(mu_); + already_cancelled = !cm->RegisterCallback( + token, [this, token]() { Cancel(kEnqueue, token); }); + if (!already_cancelled) { + enqueue_attempts_.emplace_back( + 1, callback, ctx, token, + [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (closed_) { + attempt->context->SetStatus( + errors::Aborted("FIFOQueue '", name_, "' is closed.")); + return kComplete; + } + if (queues_[0].size() < static_cast<size_t>(capacity_)) { + for (int i = 0; i < num_components(); ++i) { + queues_[i].push_back(PersistentTensor(tuple[i])); + } + return kComplete; + } else { + return kNoProgress; + } + }); + } + } + if (!already_cancelled) { + FlushUnlocked(); + } else { + ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled")); + callback(); + } +} + +/* static */ +Status FIFOQueue::GetElementComponentFromBatch(const FIFOQueue::Tuple& tuple, + int index, int component, + OpKernelContext* ctx, + PersistentTensor* out_tensor) { + TensorShape element_shape(tuple[component].shape()); + element_shape.RemoveDim(0); + Tensor* element_access = nullptr; + TF_RETURN_IF_ERROR(ctx->allocate_persistent( + tuple[component].dtype(), element_shape, out_tensor, &element_access)); + TF_RETURN_IF_ERROR( + CopySliceToElement(tuple[component], element_access, index)); + return Status::OK(); +} + +void FIFOQueue::TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) { + const int64 batch_size = tuple[0].dim_size(0); + if (batch_size == 0) { + callback(); + return; + } + + CancellationManager* cm = ctx->cancellation_manager(); + CancellationToken token = cm->get_cancellation_token(); + bool already_cancelled; + { + mutex_lock l(mu_); + already_cancelled = !cm->RegisterCallback( + token, [this, token]() { Cancel(kEnqueue, token); }); + if (!already_cancelled) { + enqueue_attempts_.emplace_back( + batch_size, callback, ctx, token, + [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (closed_) { + attempt->context->SetStatus( + errors::Aborted("FIFOQueue '", name_, "' is closed.")); + return kComplete; + } + RunResult result = kNoProgress; + while (queues_[0].size() < static_cast<size_t>(capacity_)) { + result = kProgress; + const int index = + tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + PersistentTensor element; + attempt->context->SetStatus(GetElementComponentFromBatch( + tuple, index, i, attempt->context, &element)); + if (!attempt->context->status().ok()) return kComplete; + queues_[i].push_back(element); + } + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + return kComplete; + } + } + return result; + }); + } + } + if (!already_cancelled) { + FlushUnlocked(); + } else { + ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled")); + callback(); + } +} + +void FIFOQueue::TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) { + CancellationManager* cm = ctx->cancellation_manager(); + CancellationToken token = cm->get_cancellation_token(); + bool already_cancelled; + { + mutex_lock l(mu_); + already_cancelled = !cm->RegisterCallback( + token, [this, token]() { Cancel(kDequeue, token); }); + if 
(!already_cancelled) { + // TODO(josh11b): This makes two copies of callback, avoid this if possible. + dequeue_attempts_.emplace_back( + 1, [callback]() { callback(Tuple()); }, ctx, token, + [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + const int32 s = queues_[0].size(); + if (closed_ && s == 0) { + attempt->context->SetStatus(errors::OutOfRange( + "FIFOQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", 1, ", current size ", s, + ")")); + return kComplete; + } + if (s > 0) { + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + attempt->done_callback = [callback, tuple]() { callback(tuple); }; + return kComplete; + } else { + return kNoProgress; + } + }); + } + } + if (!already_cancelled) { + FlushUnlocked(); + } else { + ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled")); + callback(Tuple()); + } +} + +void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx, + CallbackWithTuple callback) { + if (!specified_shapes()) { + ctx->SetStatus( + errors::InvalidArgument("FIFOQueue's DequeueMany requires the " + "components to have specified shapes.")); + callback(Tuple()); + return; + } + if (num_elements == 0) { + Tuple tuple; + tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + // TODO(josh11b,misard): Switch to allocate_output(). Problem is + // this breaks the abstraction boundary since we don't *really* + // know if and how the Tensors in the tuple we pass to callback + // correspond to the outputs of *ctx. For example, the + // ReaderRead Op uses TryDequeue() to get a filename out of a + // queue that is used internally by the reader and is not + // associated with any output of the ReaderRead. + // mrry@ adds: + // Maybe we need to pass a std::function<Tensor*(...)> (or + // better signature) that calls the appropriate allocator + // function in addition to ctx? (Or support a shim Allocator + // that has an internal OpKernelContext*, and dispatches to the + // appropriate method?) + // misard@ adds: + // I don't see that a std::function would help. The problem is + // that at this point (allocation time) the system doesn't know + // what is going to happen to the element read out of the + // queue. As long as we keep the generality that TensorFlow Ops + // do their own dynamic allocation in arbitrary C++ code, we + // need to preserve robustness to allocating output Tensors with + // the 'wrong' attributes, and fixing up with a copy. The only + // improvement I can see here in the future would be to support + // an optimized case where the queue 'knows' what attributes to + // use, and plumbs them through here. + Tensor element; + ctx->allocate_temp(component_dtypes_[i], ManyOutShape(i, 0), &element); + tuple.emplace_back(element); + } + callback(tuple); + return; + } + + CancellationManager* cm = ctx->cancellation_manager(); + CancellationToken token = cm->get_cancellation_token(); + bool already_cancelled; + { + mutex_lock l(mu_); + already_cancelled = !cm->RegisterCallback( + token, [this, token]() { Cancel(kDequeue, token); }); + if (!already_cancelled) { + // TODO(josh11b): This makes two copies of callback, avoid this if possible. 
+ dequeue_attempts_.emplace_back( + num_elements, [callback]() { callback(Tuple()); }, ctx, token, + [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int32 s = queues_[0].size(); + if (closed_ && s < attempt->elements_requested) { + attempt->context->SetStatus(errors::OutOfRange( + "FIFOQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", + attempt->elements_requested, ", current size ", s, ")")); + + // TODO(mrry): Add support for producing a partial batch as + // output when the queue is closed. + if (!attempt->tuple.empty()) { + // Restore already-dequeued elements to the front of the queue. + for (int64 i = attempt->tuple[0].dim_size(0) - + attempt->elements_requested - 1; + i >= 0; --i) { + for (int j = 0; j < num_components(); ++j) { + PersistentTensor element; + Status s = GetElementComponentFromBatch( + attempt->tuple, i, j, attempt->context, &element); + if (!s.ok()) { + attempt->context->SetStatus( + errors::DataLoss("Failed to restore element from " + "partially-dequeued batch " + "to FIFOQueue")); + } + queues_[j].push_front(element); + } + } + } + return kComplete; + } + + RunResult result = kNoProgress; + for (; s > 0; --s) { + if (attempt->tuple.empty()) { + // Only allocate tuple when we have something to dequeue + // so we don't use exceessive memory when there are many + // blocked dequeue attempts waiting. + attempt->tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + const TensorShape shape = + ManyOutShape(i, attempt->elements_requested); + Tensor element; + attempt->context->allocate_temp(component_dtypes_[i], shape, + &element); + attempt->tuple.emplace_back(element); + } + } + result = kProgress; + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + const int index = + attempt->tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + attempt->context->SetStatus( + CopyElementToSlice(tuple[i], &attempt->tuple[i], index)); + if (!attempt->context->status().ok()) return kComplete; + } + tuple.clear(); + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + tuple = attempt->tuple; + attempt->done_callback = [callback, tuple]() { + callback(tuple); + }; + return kComplete; + } + } + return result; + }); + } + } + if (!already_cancelled) { + FlushUnlocked(); + } else { + ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled")); + callback(Tuple()); + } +} + +void FIFOQueue::Close(OpKernelContext* ctx, bool cancel_pending_enqueues, + DoneCallback callback) { + if (cancel_pending_enqueues) { + CloseAndCancel(); + callback(); + } else { + { + mutex_lock lock(mu_); + enqueue_attempts_.emplace_back( + 0, callback, ctx, CancellationManager::kInvalidToken, + [this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (closed_) { + attempt->context->SetStatus(errors::Aborted( + "FIFOQueue '", name_, "' is already closed.")); + } else { + closed_ = true; + } + return kComplete; + }); + } + FlushUnlocked(); + } +} + +Status FIFOQueue::MatchesNodeDef(const NodeDef& node_def) { + TF_RETURN_IF_ERROR(MatchesNodeDefOp(node_def, "FIFOQueue")); + TF_RETURN_IF_ERROR(MatchesNodeDefCapacity(node_def, capacity_)); + TF_RETURN_IF_ERROR(MatchesNodeDefTypes(node_def)); + TF_RETURN_IF_ERROR(MatchesNodeDefShapes(node_def)); + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/fifo_queue.h b/tensorflow/core/kernels/fifo_queue.h new file mode 100644 index 0000000000..e9fe5f34a4 --- /dev/null +++ 
b/tensorflow/core/kernels/fifo_queue.h @@ -0,0 +1,127 @@ +#ifndef TENSORFLOW_KERNELS_FIFO_QUEUE_H_ +#define TENSORFLOW_KERNELS_FIFO_QUEUE_H_ + +#include <deque> +#include <vector> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/queue_base.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +class FIFOQueue : public QueueBase { + public: + FIFOQueue(int32 capacity, const DataTypeVector& component_dtypes, + const std::vector<TensorShape>& component_shapes, + const string& name); + Status Initialize(); // Must be called before any other method. + + // Implementations of QueueInterface methods -------------------------------- + + Status ValidateTuple(const Tuple& tuple) override; + Status ValidateManyTuple(const Tuple& tuple) override; + void TryEnqueue(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) override; + void TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) override; + void TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) override; + void TryDequeueMany(int num_elements, OpKernelContext* ctx, + CallbackWithTuple callback) override; + void Close(OpKernelContext* ctx, bool cancel_pending_enqueues, + DoneCallback callback) override; + Status MatchesNodeDef(const NodeDef& node_def) override; + + int32 size() override { + mutex_lock lock(mu_); + return queues_[0].size(); + } + + int32 capacity() const { return capacity_; } + + private: + enum Action { kEnqueue, kDequeue }; + + ~FIFOQueue() override {} + + TensorShape ManyOutShape(int i, int64 batch_size) { + TensorShape shape({batch_size}); + shape.AppendShape(component_shapes_[i]); + return shape; + } + + // Helper for dequeuing a single element from queues_. + void DequeueLocked(OpKernelContext* ctx, Tuple* tuple) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + void Cancel(Action action, CancellationToken token); + + // Helper for cancelling all pending Enqueue(Many) operations when + // Close is called with cancel_pending_enqueues. + void CloseAndCancel(); + + // Tries to enqueue/dequeue (or close) based on whatever is at the + // front of enqueue_attempts_/dequeue_attempts_. Appends to + // *finished the callback for any finished attempt (so it may be + // called once mu_ is released). Returns true if any progress was + // made. + struct CleanUp { + CleanUp(DoneCallback&& f, CancellationToken ct, CancellationManager* cm) + : finished(f), to_deregister(ct), cm(cm) {} + DoneCallback finished; + CancellationToken to_deregister; + CancellationManager* cm; + }; + bool TryAttemptLocked(Action action, std::vector<CleanUp>* clean_up) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Tries to make progress on the enqueues or dequeues at the front + // of the *_attempts_ queues. 
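+ // It is called (outside of mu_) after every TryEnqueue/TryEnqueueMany,
+ // TryDequeue/TryDequeueMany, Cancel and Close, re-runs both attempt queues
+ // until neither direction makes further progress, and then invokes the
+ // completed attempts' done callbacks after mu_ has been released.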
+ void FlushUnlocked(); + + const int32 capacity_; + + mutex mu_; + typedef std::deque<PersistentTensor> SubQueue; + std::vector<SubQueue> queues_ GUARDED_BY(mu_); + bool closed_ GUARDED_BY(mu_); + + enum RunResult { kNoProgress, kProgress, kComplete }; + struct Attempt; + typedef std::function<RunResult(Attempt*)> RunCallback; + struct Attempt { + int32 elements_requested; + DoneCallback done_callback; // must be run outside mu_ + OpKernelContext* context; + CancellationToken cancellation_token; + RunCallback run_callback; // must be run while holding mu_ + bool is_cancelled; + Tuple tuple; + + Attempt(int32 elements_requested, DoneCallback done_callback, + OpKernelContext* context, CancellationToken cancellation_token, + RunCallback run_callback) + : elements_requested(elements_requested), + done_callback(done_callback), + context(context), + cancellation_token(cancellation_token), + run_callback(run_callback), + is_cancelled(false) {} + }; + std::deque<Attempt> enqueue_attempts_ GUARDED_BY(mu_); + std::deque<Attempt> dequeue_attempts_ GUARDED_BY(mu_); + + static Status GetElementComponentFromBatch(const Tuple& tuple, int index, + int component, + OpKernelContext* ctx, + PersistentTensor* out_element); + + TF_DISALLOW_COPY_AND_ASSIGN(FIFOQueue); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_FIFO_QUEUE_H_ diff --git a/tensorflow/core/kernels/fifo_queue_op.cc b/tensorflow/core/kernels/fifo_queue_op.cc new file mode 100644 index 0000000000..f1088181fe --- /dev/null +++ b/tensorflow/core/kernels/fifo_queue_op.cc @@ -0,0 +1,93 @@ +// See docs in ../ops/data_flow_ops.cc. + +#include <deque> +#include <vector> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/fifo_queue.h" +#include "tensorflow/core/kernels/queue_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +// Defines a FIFOQueueOp, which produces a Queue (specifically, one +// backed by FIFOQueue) that persists across different graph +// executions, and sessions. Running this op produces a single-element +// tensor of handles to Queues in the corresponding device. +class FIFOQueueOp : public OpKernel { + public: + explicit FIFOQueueOp(OpKernelConstruction* context) + : OpKernel(context), queue_handle_set_(false) { + OP_REQUIRES_OK(context, context->GetAttr("capacity", &capacity_)); + OP_REQUIRES_OK(context, + context->allocate_persistent(DT_STRING, TensorShape({2}), + &queue_handle_, nullptr)); + if (capacity_ < 0) { + capacity_ = FIFOQueue::kUnbounded; + } + OP_REQUIRES_OK(context, + context->GetAttr("component_types", &component_types_)); + OP_REQUIRES_OK(context, context->GetAttr("shapes", &component_shapes_)); + } + + ~FIFOQueueOp() override { + // If the queue object was not shared, delete it. 
+ if (queue_handle_set_ && cinfo_.resource_is_private_to_kernel()) { + TF_CHECK_OK(cinfo_.resource_manager()->Delete<QueueInterface>( + cinfo_.container(), cinfo_.name())); + } + } + + void Compute(OpKernelContext* ctx) override { + mutex_lock l(mu_); + if (!queue_handle_set_) { + OP_REQUIRES_OK(ctx, SetQueueHandle(ctx)); + } + ctx->set_output_ref(0, &mu_, queue_handle_.AccessTensor(ctx)); + } + + private: + Status SetQueueHandle(OpKernelContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + TF_RETURN_IF_ERROR(cinfo_.Init(ctx->resource_manager(), def())); + QueueInterface* queue; + auto creator = [this](QueueInterface** ret) { + FIFOQueue* queue = new FIFOQueue(capacity_, component_types_, + component_shapes_, cinfo_.name()); + *ret = queue; + return queue->Initialize(); + }; + TF_RETURN_IF_ERROR( + cinfo_.resource_manager()->LookupOrCreate<QueueInterface>( + cinfo_.container(), cinfo_.name(), &queue, creator)); + core::ScopedUnref unref_me(queue); + // Verify that the shared queue is compatible with the requested arguments. + TF_RETURN_IF_ERROR(queue->MatchesNodeDef(def())); + auto h = queue_handle_.AccessTensor(ctx)->flat<string>(); + h(0) = cinfo_.container(); + h(1) = cinfo_.name(); + queue_handle_set_ = true; + return Status::OK(); + } + + int32 capacity_; + DataTypeVector component_types_; + std::vector<TensorShape> component_shapes_; + ContainerInfo cinfo_; + + mutex mu_; + PersistentTensor queue_handle_ GUARDED_BY(mu_); + bool queue_handle_set_ GUARDED_BY(mu_); + + TF_DISALLOW_COPY_AND_ASSIGN(FIFOQueueOp); +}; + +REGISTER_KERNEL_BUILDER(Name("FIFOQueue").Device(DEVICE_CPU), FIFOQueueOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/fill_functor.h b/tensorflow/core/kernels/fill_functor.h new file mode 100644 index 0000000000..831f0c899e --- /dev/null +++ b/tensorflow/core/kernels/fill_functor.h @@ -0,0 +1,26 @@ +#ifndef TENSORFLOW_KERNELS_FILL_FUNCTOR_H_ +#define TENSORFLOW_KERNELS_FILL_FUNCTOR_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T> +struct FillFunctor { + // Computes on device "d": out = out.constant(in(0)), + void operator()(const Device& d, typename TTypes<T>::Flat out, + typename TTypes<T>::ConstScalar in); +}; + +template <typename Device, typename T> +struct SetZeroFunctor { + // Computes on device "d": out = out.setZero(), + void operator()(const Device& d, typename TTypes<T>::Flat out); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_FILL_FUNCTOR_H_ diff --git a/tensorflow/core/kernels/fixed_length_record_reader_op.cc b/tensorflow/core/kernels/fixed_length_record_reader_op.cc new file mode 100644 index 0000000000..77516ab151 --- /dev/null +++ b/tensorflow/core/kernels/fixed_length_record_reader_op.cc @@ -0,0 +1,109 @@ +// See docs in ../ops/io_ops.cc. 
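The reader defined below assumes each file is laid out as a fixed-size header, a run of equally sized records, and a fixed-size footer, and emits keys of the form "<filename>:<record_number>". A standalone sketch of the offset arithmetic this implies (hypothetical helper names, not part of this kernel; a trailing partial record is not modeled and would simply fail the reader's short ReadNBytes call):

#include <cassert>
#include <cstdint>

// Number of whole records in a file of file_size bytes.
int64_t NumWholeRecords(int64_t file_size, int64_t header_bytes,
                        int64_t record_bytes, int64_t footer_bytes) {
  const int64_t payload = file_size - header_bytes - footer_bytes;
  return payload <= 0 ? 0 : payload / record_bytes;
}

// Byte offset of record `record_number`, mirroring the reader's
// SkipNBytes(header_bytes_) followed by fixed-size ReadNBytes calls.
int64_t RecordOffset(int64_t record_number, int64_t header_bytes,
                     int64_t record_bytes) {
  return header_bytes + record_number * record_bytes;
}

int main() {
  // E.g. a 4-byte header, 8-byte records and a 4-byte footer in a 40-byte file.
  assert(NumWholeRecords(40, 4, 8, 4) == 4);
  assert(RecordOffset(2, 4, 8) == 20);  // record #2 starts at byte 20
  return 0;
}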
+ +#include <memory> +#include "tensorflow/core/framework/reader_op_kernel.h" +#include "tensorflow/core/kernels/reader_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/inputbuffer.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/public/env.h" + +namespace tensorflow { + +class FixedLengthRecordReader : public ReaderBase { + public: + FixedLengthRecordReader(const string& node_name, int64 header_bytes, + int64 record_bytes, int64 footer_bytes, Env* env) + : ReaderBase( + strings::StrCat("FixedLengthRecordReader '", node_name, "'")), + header_bytes_(header_bytes), + record_bytes_(record_bytes), + footer_bytes_(footer_bytes), + env_(env), + file_pos_limit_(-1), + record_number_(0) {} + + // On success: + // * input_buffer_ != nullptr, + // * input_buffer_->Tell() == header_bytes_ + // * file_pos_limit_ == file size - footer_bytes_ + Status OnWorkStartedLocked() override { + record_number_ = 0; + uint64 file_size = 0; + TF_RETURN_IF_ERROR(env_->GetFileSize(current_work(), &file_size)); + file_pos_limit_ = file_size - footer_bytes_; + + RandomAccessFile* file = nullptr; + TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file)); + input_buffer_.reset(new io::InputBuffer(file, kBufferSize)); + TF_RETURN_IF_ERROR(input_buffer_->SkipNBytes(header_bytes_)); + return Status::OK(); + } + + Status OnWorkFinishedLocked() override { + input_buffer_.reset(nullptr); + return Status::OK(); + } + + Status ReadLocked(string* key, string* value, bool* produced, + bool* at_end) override { + if (input_buffer_->Tell() >= file_pos_limit_) { + *at_end = true; + return Status::OK(); + } + TF_RETURN_IF_ERROR(input_buffer_->ReadNBytes(record_bytes_, value)); + *key = strings::StrCat(current_work(), ":", record_number_); + *produced = true; + ++record_number_; + return Status::OK(); + } + + Status ResetLocked() override { + file_pos_limit_ = -1; + record_number_ = 0; + input_buffer_.reset(nullptr); + return ReaderBase::ResetLocked(); + } + + // TODO(josh11b): Implement serializing and restoring the state.
+ + private: + enum { kBufferSize = 256 << 10 /* 256 kB */ }; + const int64 header_bytes_; + const int64 record_bytes_; + const int64 footer_bytes_; + Env* const env_; + int64 file_pos_limit_; + int64 record_number_; + std::unique_ptr<io::InputBuffer> input_buffer_; +}; + +class FixedLengthRecordReaderOp : public ReaderOpKernel { + public: + explicit FixedLengthRecordReaderOp(OpKernelConstruction* context) + : ReaderOpKernel(context) { + int64 header_bytes = -1, record_bytes = -1, footer_bytes = -1; + OP_REQUIRES_OK(context, context->GetAttr("header_bytes", &header_bytes)); + OP_REQUIRES_OK(context, context->GetAttr("record_bytes", &record_bytes)); + OP_REQUIRES_OK(context, context->GetAttr("footer_bytes", &footer_bytes)); + OP_REQUIRES(context, header_bytes >= 0, + errors::InvalidArgument("header_bytes must be >= 0 not ", + header_bytes)); + OP_REQUIRES(context, record_bytes >= 0, + errors::InvalidArgument("record_bytes must be >= 0 not ", + record_bytes)); + OP_REQUIRES(context, footer_bytes >= 0, + errors::InvalidArgument("footer_bytes must be >= 0 not ", + footer_bytes)); + Env* env = context->env(); + SetReaderFactory([this, header_bytes, record_bytes, footer_bytes, env]() { + return new FixedLengthRecordReader(name(), header_bytes, record_bytes, + footer_bytes, env); + }); + } +}; + +REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordReader").Device(DEVICE_CPU), + FixedLengthRecordReaderOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc new file mode 100644 index 0000000000..8bd48f26d6 --- /dev/null +++ b/tensorflow/core/kernels/gather_op.cc @@ -0,0 +1,136 @@ +// See docs in ../ops/array_ops.cc. + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +namespace { +template <typename T, typename Index, int static_slice_elems> +void HandleCopies(const Tensor& Tparams, + typename TTypes<Index>::ConstVec& Tindices, int slice_elems, + typename TTypes<T>::Matrix Tout) { + const int N = Tindices.dimension(0); + const auto& Tparams_flat = Tparams.flat_outer_dims<T>(); + T* Tout_base = &Tout(0, 0); + const T* Tparams_base = &Tparams_flat(0, 0); + const size_t slice_bytes = slice_elems * sizeof(T); + if (static_slice_elems >= 0) { + // Give compiler static knowledge of the number of elements/bytes + CHECK_EQ(static_slice_elems, slice_elems); + slice_elems = static_slice_elems; + } + for (int i = 0; i < N; i++) { + int j = i + 1; + if (j < N) { + port::prefetch<port::PREFETCH_HINT_T0>(&Tparams_flat(Tindices(j), 0)); + port::prefetch<port::PREFETCH_HINT_T0>(&Tout(j, 0)); + } + memcpy(Tout_base + i * slice_elems, + Tparams_base + Tindices(i) * slice_elems, slice_bytes); + } +} + +} // anonymous namespace + +template <typename T, typename Index> +class GatherOp : public OpKernel { + public: + // QUESTION: It'd be nice to support DT_INT16, DT_UINT8, + // etc. here for the type of the second input argument. Should + // we have the framework do some sort of integer promotion + // automatically, or should that be something that users have to + // do explicitly with a conversion operator in the graph? 
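+ // Shape rule implemented by Compute() below: the output shape is
+ // indices.shape + params.shape[1:], and each output slice is the params
+ // row selected by the corresponding index. For example, params of shape
+ // [5, 3] gathered with indices of shape [2, 2] produces an output of
+ // shape [2, 2, 3]. Indices outside [0, params.dim_size(0)) are rejected
+ // with InvalidArgument rather than clamped.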
+ explicit GatherOp(OpKernelConstruction* c) : OpKernel(c) { + const DataType dt = DataTypeToEnum<T>::v(); + const DataType index_t = DataTypeToEnum<Index>::v(); + OP_REQUIRES_OK(c, c->MatchSignature({dt, index_t}, {dt})); + } + + void Compute(OpKernelContext* c) override { + const Tensor& Tparams = c->input(0); + const Tensor& Tindices = c->input(1); + OP_REQUIRES( + c, TensorShapeUtils::IsVectorOrHigher(Tparams.shape()), + errors::InvalidArgument("params must be at least 1 dimensional")); + const int64 N = Tindices.NumElements(); + const int64 first_dim_size = Tparams.dim_size(0); + + // Validate all the indices are in range + auto Tindices_vec = Tindices.flat<Index>(); + for (int64 i = 0; i < N; i++) { + const Index index = Tindices_vec(i); + OP_REQUIRES(c, index >= 0 && index < first_dim_size, + errors::InvalidArgument( + strings::StrCat("Index ", index, " at offset ", i, + " in Tindices is out of range"))); + } + + // The result shape is indices.shape + params.shape[1:]. + TensorShape result_shape = Tindices.shape(); + for (int i = 1; i < Tparams.dims(); i++) { + result_shape.AddDim(Tparams.dim_size(i)); + } + + Tensor* Tout = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &Tout)); + const auto& Tparams_flat = Tparams.flat_outer_dims<T>(); + if (N > 0) { + auto Tindices_flat = Tindices.flat<Index>(); + auto Tout_flat = Tout->shaped<T, 2>({N, Tout->NumElements() / N}); + if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) { + const int64 slice_size = Tout->NumElements() / N; +#define SPECIALIZE(elems) \ + do { \ + if (slice_size == elems) { \ + HandleCopies<T, Index, elems>(Tparams, Tindices_flat, slice_size, \ + Tout_flat); \ + return; \ + } \ + } while (0) + + SPECIALIZE(10); + SPECIALIZE(20); + +#undef SPECIALIZE + + HandleCopies<T, Index, -1>(Tparams, Tindices_flat, slice_size, + Tout_flat); + } else { + for (int i = 0; i < N; i++) { + int j = i + 1; + if (j < N) { + port::prefetch<port::PREFETCH_HINT_T0>( + &Tparams_flat(Tindices_vec(j), 0)); + port::prefetch<port::PREFETCH_HINT_T0>(&Tout_flat(j, 0)); + } + // Copy last Ndim-1 dimensions of Tparams[Tindices[i]] to Tout[i] + Tout_flat.template chip<0>(i) = + Tparams_flat.template chip<0>(Tindices_vec(i)); + } + } + } + } +}; + +#define REGISTER_GATHER(type, index_type) \ + REGISTER_KERNEL_BUILDER(Name("Gather") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("Tparams") \ + .TypeConstraint<index_type>("Tindices"), \ + GatherOp<type, index_type>) + +#define REGISTER_GATHER_INT32(type) REGISTER_GATHER(type, int32) +#define REGISTER_GATHER_INT64(type) REGISTER_GATHER(type, int64) + +TF_CALL_ALL_TYPES(REGISTER_GATHER_INT32); +TF_CALL_ALL_TYPES(REGISTER_GATHER_INT64); + +#undef REGISTER_GATHER_INT32 +#undef REGISTER_GATHER_INT64 +#undef REGISTER_GATHER + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/gather_op_test.cc b/tensorflow/core/kernels/gather_op_test.cc new file mode 100644 index 0000000000..d7410169e1 --- /dev/null +++ b/tensorflow/core/kernels/gather_op_test.cc @@ -0,0 +1,213 @@ +#include <functional> +#include <memory> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include 
"tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { +namespace { + +class GatherOpTest : public OpsTestBase { + protected: + void MakeOp(DataType index_type) { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "Gather") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(index_type)) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(GatherOpTest, ScalarIndices) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({}), {3}); + ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({})); + test::FillValues<float>(&expected, {3}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(GatherOpTest, Simple_TwoD32) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray<int32>(TensorShape({4}), {0, 4, 0, 2}); + ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 3})); + test::FillValues<float>(&expected, {0, 1, 2, 12, 13, 14, 0, 1, 2, 6, 7, 8}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(GatherOpTest, Simple_TwoD64) { + MakeOp(DT_INT64); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray<int64>(TensorShape({4}), {0, 4, 0, 2}); + ASSERT_OK(RunOpKernel()); + + // Check the output. + Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 3})); + test::FillValues<float>(&expected, {0, 1, 2, 12, 13, 14, 0, 1, 2, 6, 7, 8}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(GatherOpTest, HighRank) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({4}), {0, 1, 2, 3}); + AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 0, 2, 3, 0}); + ASSERT_OK(RunOpKernel()); + + // Check the output + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); + test::FillValues<float>(&expected, {1, 2, 0, 2, 3, 0}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(GatherOpTest, Error_IndexOutOfRange) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); + AddInputFromArray<int32>(TensorShape({4}), {0, 4, 99, 2}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("Index 99 at offset 2 in Tindices is out of range")) + << s; +} + +class GatherOpForBenchmark : public GatherOpTest { + public: + void TestBody() override { // not used } + } + void PublicMakeOp(DataType index_type) { MakeOp(index_type); } +}; + +static const int kSorted = 0x8000; // Mask for arg to specify sorting vs. 
not + +template <typename Index> +void BM_Gather(int iters, int arg) { + testing::StopTiming(); + + bool sorted = ((arg & kSorted) != 0); + int dim = arg & ~kSorted; + + GatherOpForBenchmark t; + t.PublicMakeOp(DataTypeToEnum<Index>::v()); + // Use a 512 MB table, regardless of dim + const int kRows = ((1 << 29) / sizeof(float)) / dim; + std::vector<float> data(kRows * dim, 1.0f); + t.AddInputFromArray<float>(TensorShape({kRows, dim}), data); + const int kLookups = 2000; + const int kBatches = 1000000 / kLookups; + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + std::vector<std::vector<Index>> all_ids(kBatches); + for (int i = 0; i < kBatches; ++i) { + std::vector<Index>* ids = &all_ids[i]; + ids->resize(kLookups); + for (int j = 0; j < kLookups; ++j) { + (*ids)[j] = rnd.Uniform(kRows); + } + if (sorted) { + sort(ids->begin(), ids->end()); + } + } + + t.AddInput<Index>(TensorShape({kLookups}), [](int i) { return 0; }); + if (sorted) { + testing::SetLabel("sorted by id"); + } + testing::BytesProcessed(static_cast<int64>(iters) * kLookups * dim * + sizeof(float)); + testing::StartTiming(); + while (--iters > 0) { + const std::vector<Index>& b = all_ids[iters % kBatches]; + TensorValue input = t.mutable_input(1); + gtl::MutableArraySlice<Index> slice(&input->vec<Index>()(0), + input->NumElements()); + for (int i = 0; i < kLookups; i++) { + slice[i] = b[i]; + } + Status s = t.RunOpKernel(); + } +} + +static void BM_Gather32(int iters, int arg) { BM_Gather<int32>(iters, arg); } + +static void BM_Gather64(int iters, int arg) { BM_Gather<int64>(iters, arg); } + +BENCHMARK(BM_Gather32) + ->Arg(10) + ->Arg(10 | kSorted) + ->Arg(20) + ->Arg(40) + ->Arg(63) + ->Arg(63 | kSorted) + ->Arg(64) + ->Arg(64 | kSorted) + ->Arg(65) + ->Arg(65 | kSorted) + ->Arg(100) + ->Arg(100 | kSorted) + ->Arg(127) + ->Arg(127 | kSorted) + ->Arg(128) + ->Arg(128 | kSorted) + ->Arg(129) + ->Arg(129 | kSorted) + ->Arg(1000) + ->Arg(1000 | kSorted); + +BENCHMARK(BM_Gather64) + ->Arg(10) + ->Arg(10 | kSorted) + ->Arg(20) + ->Arg(40) + ->Arg(63) + ->Arg(63 | kSorted) + ->Arg(64) + ->Arg(64 | kSorted) + ->Arg(65) + ->Arg(65 | kSorted) + ->Arg(100) + ->Arg(100 | kSorted) + ->Arg(127) + ->Arg(127 | kSorted) + ->Arg(128) + ->Arg(128 | kSorted) + ->Arg(129) + ->Arg(129 | kSorted) + ->Arg(1000) + ->Arg(1000 | kSorted); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc new file mode 100644 index 0000000000..b29efbddfb --- /dev/null +++ b/tensorflow/core/kernels/identity_op.cc @@ -0,0 +1,45 @@ +// See docs in ../ops/array_ops.cc. +#include "tensorflow/core/kernels/identity_op.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +REGISTER_KERNEL_BUILDER(Name("Identity").Device(DEVICE_CPU), IdentityOp); +// StopGradient does the same thing as Identity, but has a different +// gradient registered. 
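+// In the forward pass StopGradient is a pass-through, but the gradient
+// registered for it contributes nothing, so backpropagation stops at this
+// node instead of flowing into its input.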
+REGISTER_KERNEL_BUILDER(Name("StopGradient").Device(DEVICE_CPU), IdentityOp); + +REGISTER_KERNEL_BUILDER(Name("RefIdentity").Device(DEVICE_CPU), IdentityOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Identity").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + IdentityOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("RefIdentity").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + IdentityOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("StopGradient").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + IdentityOp) + +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +REGISTER_GPU_KERNEL(bool); +REGISTER_GPU_KERNEL(bfloat16); + +#undef REGISTER_GPU_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Identity") + .Device(DEVICE_GPU) + .HostMemory("input") + .HostMemory("output") + .TypeConstraint<int32>("T"), + IdentityOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/identity_op.h b/tensorflow/core/kernels/identity_op.h new file mode 100644 index 0000000000..7adc1eace0 --- /dev/null +++ b/tensorflow/core/kernels/identity_op.h @@ -0,0 +1,25 @@ +#ifndef TENSORFLOW_KERNELS_IDENTITY_OP_H_ +#define TENSORFLOW_KERNELS_IDENTITY_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class IdentityOp : public OpKernel { + public: + explicit IdentityOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + if (IsRefType(context->input_dtype(0))) { + context->forward_ref_input_to_ref_output(0, 0); + } else { + context->set_output(0, context->input(0)); + } + } + + bool IsExpensive() override { return false; } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_IDENTITY_OP_H_ diff --git a/tensorflow/core/kernels/identity_op_test.cc b/tensorflow/core/kernels/identity_op_test.cc new file mode 100644 index 0000000000..6483367a79 --- /dev/null +++ b/tensorflow/core/kernels/identity_op_test.cc @@ -0,0 +1,56 @@ +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +class IdentityOpTest : public OpsTestBase { + protected: + Status Init(DataType input_type) { + RequireDefaultOps(); + TF_CHECK_OK(NodeDefBuilder("op", "Identity") + .Input(FakeInput(input_type)) + .Finalize(node_def())); + return InitOp(); + } +}; + +TEST_F(IdentityOpTest, Int32Success_6) { + ASSERT_OK(Init(DT_INT32)); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({6})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); +} + +TEST_F(IdentityOpTest, Int32Success_2_3) { + ASSERT_OK(Init(DT_INT32)); + AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({2, 3})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + 
test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); +} + +TEST_F(IdentityOpTest, StringSuccess) { + ASSERT_OK(Init(DT_STRING)); + AddInputFromArray<string>(TensorShape({6}), {"A", "b", "C", "d", "E", "f"}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_STRING, TensorShape({6})); + test::FillValues<string>(&expected, {"A", "b", "C", "d", "E", "f"}); + test::ExpectTensorEqual<string>(expected, *GetOutput(0)); +} + +TEST_F(IdentityOpTest, RefInputError) { ASSERT_OK(Init(DT_INT32_REF)); } + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/identity_reader_op.cc b/tensorflow/core/kernels/identity_reader_op.cc new file mode 100644 index 0000000000..a63fea5dbb --- /dev/null +++ b/tensorflow/core/kernels/identity_reader_op.cc @@ -0,0 +1,57 @@ +// See docs in ../ops/io_ops.cc. + +#include <memory> +#include "tensorflow/core/framework/reader_op_kernel.h" +#include "tensorflow/core/kernels/reader_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/protobuf.h" + +namespace tensorflow { + +class IdentityReader : public ReaderBase { + public: + explicit IdentityReader(const string& node_name) + : ReaderBase(strings::StrCat("IdentityReader '", node_name, "'")) {} + + Status ReadLocked(string* key, string* value, bool* produced, + bool* at_end) override { + *key = current_work(); + *value = current_work(); + *produced = true; + *at_end = true; + return Status::OK(); + } + + // Stores state in a ReaderBaseState proto, since IdentityReader has + // no additional state beyond ReaderBase. + Status SerializeStateLocked(string* state) override { + ReaderBaseState base_state; + SaveBaseState(&base_state); + base_state.SerializeToString(state); + return Status::OK(); + } + + Status RestoreStateLocked(const string& state) override { + ReaderBaseState base_state; + if (!ParseProtoUnlimited(&base_state, state)) { + return errors::InvalidArgument("Could not parse state for ", name(), ": ", + str_util::CEscape(state)); + } + TF_RETURN_IF_ERROR(RestoreBaseState(base_state)); + return Status::OK(); + } +}; + +class IdentityReaderOp : public ReaderOpKernel { + public: + explicit IdentityReaderOp(OpKernelConstruction* context) + : ReaderOpKernel(context) { + SetReaderFactory([this]() { return new IdentityReader(name()); }); + } +}; + +REGISTER_KERNEL_BUILDER(Name("IdentityReader").Device(DEVICE_CPU), + IdentityReaderOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc new file mode 100644 index 0000000000..d08f6f53da --- /dev/null +++ b/tensorflow/core/kernels/in_topk_op.cc @@ -0,0 +1,58 @@ +// See docs in ../ops/nn_ops.cc. 
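The kernel below counts, for each batch element, how many classes score strictly higher than the target class and reports the element as correct when that count is less than k; ties with the target therefore do not push it out of the top k. A standalone sketch of that predicate (hypothetical names, not this kernel's code):

#include <cassert>
#include <vector>

// Returns true if `target` is within the top k entries of `predictions`,
// using the same strictly-greater comparison as the kernel.
bool InTopKSketch(const std::vector<float>& predictions, int target, int k) {
  const float target_prediction = predictions[target];
  int more_probable_classes = 0;
  for (float p : predictions) {
    if (p > target_prediction) ++more_probable_classes;
  }
  return more_probable_classes < k;
}

int main() {
  // Class 2 has the second-highest score: in the top 2, but not the top 1.
  assert(InTopKSketch({0.1f, 0.8f, 0.6f, 0.3f}, 2, 2));
  assert(!InTopKSketch({0.1f, 0.8f, 0.6f, 0.3f}, 2, 1));
  return 0;
}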
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +template <typename T> +class InTopK : public OpKernel { + public: + explicit InTopK(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("k", &k_)); + } + + void Compute(OpKernelContext* context) override { + const auto& predictions_in = context->input(0); + const auto& targets_in = context->input(1); + OP_REQUIRES(context, predictions_in.dims() == 2, + errors::InvalidArgument("predictions must be 2-dimensional")); + OP_REQUIRES(context, targets_in.dims() == 1, + errors::InvalidArgument("targets must be 1-dimensional")); + OP_REQUIRES(context, predictions_in.dim_size(0) == targets_in.dim_size(0), + errors::InvalidArgument("First dimension of predictions ", + predictions_in.dim_size(0), + " must match length of targets ", + targets_in.dim_size(0))); + const auto& predictions = predictions_in.matrix<T>(); + const auto& targets = targets_in.vec<int>(); + + Tensor* t_out = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output( + 0, TensorShape({targets_in.dim_size(0)}), &t_out)); + auto out = t_out->vec<bool>(); + + const auto size = targets.size(); + const auto num_classes = predictions.dimension(1); + for (int b = 0; b < size; b++) { + T target_prediction = predictions(b, targets(b)); + int more_probable_classes = 0; + for (int i = 0; i < num_classes; ++i) { + if (predictions(b, i) > target_prediction) ++more_probable_classes; + } + out(b) = more_probable_classes < k_; + } + } + + private: + int k_; +}; + +REGISTER_KERNEL_BUILDER(Name("InTopK").Device(DEVICE_CPU), InTopK<float>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/initializable_lookup_table.cc b/tensorflow/core/kernels/initializable_lookup_table.cc new file mode 100644 index 0000000000..7f8b070556 --- /dev/null +++ b/tensorflow/core/kernels/initializable_lookup_table.cc @@ -0,0 +1,41 @@ +#include "tensorflow/core/kernels/initializable_lookup_table.h" + +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace lookup { + +Status InitializableLookupTable::Find(const Tensor& keys, Tensor* values, + const Tensor& default_value) { + if (!is_initialized()) { + return errors::FailedPrecondition("Table not initialized."); + } + TF_RETURN_IF_ERROR(CheckFindArguments(keys, *values, default_value)); + return DoFind(keys, values, default_value); +} + +Status InitializableLookupTable::Initialize(InitTableIterator& iter) { + if (!iter.Valid()) { + return iter.status(); + } + TF_RETURN_IF_ERROR(CheckKeyAndValueTensors(iter.keys(), iter.values())); + + mutex_lock l(mu_); + if (is_initialized()) { + return errors::FailedPrecondition("Table already initialized."); + } + + TF_RETURN_IF_ERROR(DoPrepare(iter.total_size())); + while (iter.Valid()) { + TF_RETURN_IF_ERROR(DoInsert(iter.keys(), iter.values())); + iter.Next(); + } + if (!errors::IsOutOfRange(iter.status())) { + return iter.status(); + } + is_initialized_ = true; + return Status::OK(); +} + +} // namespace lookup +} // namespace tensorflow diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h new file mode 100644 index 0000000000..651b491457 --- /dev/null +++ b/tensorflow/core/kernels/initializable_lookup_table.h @@ -0,0 +1,103 @@ +#ifndef 
TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_ +#define TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_ + +#include "tensorflow/core/framework/lookup_interface.h" + +namespace tensorflow { +namespace lookup { + +// Base class for lookup tables that require initialization. +class InitializableLookupTable : public LookupInterface { + public: + class InitTableIterator; + + // Performs batch lookups: for every element in the key tensor, Find returns + // the corresponding value in the values tensor. + // If an element is not present in the table, the given default value is used. + // + // For tables that require initialization, `Find` is available once the table + // is marked as initialized. + // + // Returns the following statuses: + // - OK: when the find finishes successfully. + // - FailedPrecondition: if the table is not initialized. + // - InvalidArgument: if any of the preconditions on the lookup key or value + // fails. + // - In addition, other implementations may provide another non-OK status + // specific to their failure modes. + Status Find(const Tensor& keys, Tensor* values, + const Tensor& default_value) final; + + // Returns whether the table was initialized and is ready to serve lookups. + bool is_initialized() const { return is_initialized_; } + + // Initializes the table from the given init table iterator. + // + // Atomically, this operation prepares the table, populates it with the given + // iterator, and marks the table as initialized. + // + // Returns the following statuses: + // - OK: when the initialization was successful. + // - InvalidArgument: if any of the preconditions on the lookup key or value + // fails. + // - FailedPrecondition: if the table is already initialized and + // fail_if_initialized is set to true. + // - In addition, other implementations may provide another non-OK status + // specific to their failure modes. + Status Initialize(InitTableIterator& iter); + + // Basic iterator to initialize lookup tables. + // It yields a sequence of pairs of `keys()` and `values()` Tensors, so that + // the consumer may insert key-value pairs in batches. + // + // When the iterator is exhausted, Valid() returns false and status() returns + // Status::OutOfRange. + class InitTableIterator { + public: + InitTableIterator() {} + + virtual ~InitTableIterator() {} + + // Prepares the next batch of key and value tensors. + virtual void Next() = 0; + + // Returns true if keys and values point to valid tensors. + virtual bool Valid() const = 0; + + // Returns a tensor that contains the current batch of 'key' values. + virtual const Tensor& keys() const = 0; + + // Returns a tensor that contains the current batch of 'value' values. + virtual const Tensor& values() const = 0; + + // Returns an error if one has occurred, otherwise returns Status::OK. + virtual Status status() const = 0; + + // Returns the total number of elements that the iterator will produce. + virtual int64 total_size() const = 0; + + private: + TF_DISALLOW_COPY_AND_ASSIGN(InitTableIterator); + }; + + protected: + // Prepares and allocates the underlying data structure to store the given + // number of expected elements. + virtual Status DoPrepare(size_t expected_num_elements) = 0; + + // Populates the table in batches given keys and values as tensors into the + // underlying data structure. + virtual Status DoInsert(const Tensor& keys, const Tensor& values) = 0; + + // Performs the batch find operation on the underlying data structure.
+ virtual Status DoFind(const Tensor& keys, Tensor* values, + const Tensor& default_value) = 0; + + mutex mu_; + bool is_initialized_ = false; +}; + +} // namespace lookup +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_INITIALIZABLE_LOOKUP_TABLE_H_ diff --git a/tensorflow/core/kernels/io.cc b/tensorflow/core/kernels/io.cc new file mode 100644 index 0000000000..9d6921aa8e --- /dev/null +++ b/tensorflow/core/kernels/io.cc @@ -0,0 +1,270 @@ +// See docs in ../ops/io_ops.cc +#include <unordered_map> + +#include "tensorflow/core/kernels/io.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/util/tensor_slice_reader.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" +#include "tensorflow/core/util/tensor_slice_writer.h" + +namespace tensorflow { + +namespace { +bool ParseShapeAndSlice(const string& shape_and_slice, TensorShape* shape, + TensorSlice* slice, TensorShape* shape_slice, + string* error) { + CHECK(!shape_and_slice.empty()); + // Syntax: dim0 dim1 dim2 ... <slice string> + // Where slice string is defined in core/framework/tensor_slice.h + std::vector<string> splits = str_util::Split(shape_and_slice, ' '); + + // Must have at least 2 strings. + if (splits.size() < 2) { + *error = strings::StrCat( + "Need least two elements in shape_and_slice specification: ", + shape_and_slice); + return false; + } + int num_dims = splits.size() - 1; + shape->Clear(); + for (int i = 0; i < num_dims; ++i) { + int dim; + if (!str_util::NumericParse32(splits[i], &dim)) { + *error = strings::StrCat("Non numerical dimension in shape_and_slice: ", + shape_and_slice); + return false; + } + shape->AddDim(dim); + } + // The last split is the slice specification. + slice->Clear(); + auto status = slice->Parse(splits.back(), slice); + if (!status.ok()) { + *error = status.error_message(); + return false; + } + // The specified slice must be compatible with the specified shape. + status = slice->SliceTensorShape(*shape, shape_slice); + if (!status.ok()) { + *error = status.error_message(); + return false; + } + return true; +} +} // namespace + +void SaveTensors( + OpKernelContext* context, + checkpoint::TensorSliceWriter::CreateBuilderFunction builder_func, + bool save_slices) { + const Tensor& filename_t = context->input(0); + { + const int64 size = filename_t.NumElements(); + OP_REQUIRES( + context, size == 1, + errors::InvalidArgument( + "Input 0 (filename) must be a string scalar; got a tensor of ", + size, "elements")); + } + + const Tensor& tensor_names_t = context->input(1); + const int64 N = tensor_names_t.NumElements(); + const string* tensor_shapes_and_slices_ptr = nullptr; + if (save_slices) { + const Tensor& tensor_shapes_and_slices_t = context->input(2); + OP_REQUIRES( + context, tensor_shapes_and_slices_t.NumElements() == N, + errors::InvalidArgument("Expected ", N, + " elements for the tensor " + "shapes and slices but got ", + tensor_shapes_and_slices_t.NumElements())); + tensor_shapes_and_slices_ptr = + tensor_shapes_and_slices_t.flat<string>().data(); + } + // Path, names, and slices if save_slices is true. + const int kFixedInputs = save_slices ? 
3 : 2; + OP_REQUIRES(context, context->num_inputs() == N + kFixedInputs, + errors::InvalidArgument("Expected totally ", N + kFixedInputs, + " inputs as input #1 (which is a string " + "tensor of saved names) contains ", + N, " names, but received ", + context->num_inputs(), " inputs")); + + VLOG(1) << "About to save tensors to file " << filename_t.flat<string>()(0) + << "..."; + checkpoint::TensorSliceWriter writer(filename_t.flat<string>()(0), + builder_func); + + Status s; + auto tensor_names_flat = tensor_names_t.flat<string>(); + + string error; + for (int64 i = 0; i < N; ++i) { + const string& name = tensor_names_flat(i); + const Tensor& input = context->input(i + kFixedInputs); + TensorShape shape(input.shape()); + TensorSlice slice(input.dims()); + if (save_slices && !tensor_shapes_and_slices_ptr[i].empty()) { + const string& shape_spec = tensor_shapes_and_slices_ptr[i]; + TensorShape slice_shape; + OP_REQUIRES(context, ParseShapeAndSlice(shape_spec, &shape, &slice, + &slice_shape, &error), + errors::InvalidArgument(error)); + OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()), + errors::InvalidArgument("Slice in shape_and_slice " + "specification does not match the " + "shape of the tensor to save: ", + shape_spec, ", tensor: ", + input.shape().DebugString())); + } + +#define WRITER_ADD(dt) \ + case dt: \ + s = writer.Add(name, shape, slice, \ + input.flat<EnumToDataType<dt>::Type>().data()); \ + break + + switch (input.dtype()) { + WRITER_ADD(DT_FLOAT); + WRITER_ADD(DT_DOUBLE); + WRITER_ADD(DT_INT32); + WRITER_ADD(DT_UINT8); + WRITER_ADD(DT_INT16); + WRITER_ADD(DT_INT8); + WRITER_ADD(DT_INT64); + WRITER_ADD(DT_QUINT8); + WRITER_ADD(DT_QINT8); + WRITER_ADD(DT_QINT32); + default: + context->SetStatus(errors::Unimplemented("Saving data type ", + DataTypeString(input.dtype()), + " not yet supported")); + return; + } +#undef WRITER_ADD + if (!s.ok()) { + context->SetStatus(s); + return; + } + } + + s = writer.Finish(); + if (!s.ok()) { + context->SetStatus(s); + } +} + +void RestoreTensor(OpKernelContext* context, + checkpoint::TensorSliceReader::OpenTableFunction open_func, + int preferred_shard, bool restore_slice) { + const Tensor& file_pattern_t = context->input(0); + { + const int64 size = file_pattern_t.NumElements(); + OP_REQUIRES( + context, size == 1, + errors::InvalidArgument( + "Input 0 (file_pattern) must be a string scalar; got a tensor of ", + size, "elements")); + } + const string& file_pattern = file_pattern_t.flat<string>()(0); + + const Tensor& tensor_name_t = context->input(1); + { + const int64 size = tensor_name_t.NumElements(); + OP_REQUIRES( + context, size == 1, + errors::InvalidArgument( + "Input 1 (tensor_name) must be a string scalar; got a tensor of ", + size, "elements")); + } + const string& tensor_name = tensor_name_t.flat<string>()(0); + + const string* tensor_shape_and_slice_ptr = nullptr; + if (restore_slice) { + const Tensor& tensor_shape_and_slice_t = context->input(2); + OP_REQUIRES( + context, tensor_shape_and_slice_t.NumElements() == 1, + errors::InvalidArgument("Expected 1 element for the tensor " + "shape and slice but got ", + tensor_shape_and_slice_t.NumElements())); + tensor_shape_and_slice_ptr = tensor_shape_and_slice_t.flat<string>().data(); + } + + // If we cannot find a cached reader we will allocate our own. 
+ std::unique_ptr<checkpoint::TensorSliceReader> allocated_reader; + + const checkpoint::TensorSliceReader* reader = + context->slice_reader_cache()->GetReader(file_pattern, open_func, + preferred_shard); + if (!reader) { + allocated_reader.reset(new checkpoint::TensorSliceReader( + file_pattern, open_func, preferred_shard)); + reader = allocated_reader.get(); + } + OP_REQUIRES_OK(context, CHECK_NOTNULL(reader)->status()); + + // Get the shape and type from the save file. + DataType type; + TensorShape saved_shape; + OP_REQUIRES( + context, reader->HasTensor(tensor_name, &saved_shape, &type), + errors::NotFound("Tensor name \"", tensor_name, + "\" not found in checkpoint files ", file_pattern)); + OP_REQUIRES( + context, type == context->expected_output_dtype(0), + errors::InvalidArgument("Expected to restore a tensor of type ", + DataTypeString(context->expected_output_dtype(0)), + ", got a tensor of type ", DataTypeString(type), + " instead: tensor_name = ", tensor_name)); + + // Shape of the output and slice to load. + TensorShape output_shape(saved_shape); + TensorSlice slice_to_load(saved_shape.dims()); + if (restore_slice && !tensor_shape_and_slice_ptr[0].empty()) { + const string& shape_spec = tensor_shape_and_slice_ptr[0]; + TensorShape parsed_shape; + string error; + OP_REQUIRES(context, + ParseShapeAndSlice(shape_spec, &parsed_shape, &slice_to_load, + &output_shape, &error), + errors::InvalidArgument(error)); + OP_REQUIRES( + context, parsed_shape.IsSameSize(saved_shape), + errors::InvalidArgument( + "Shape in shape_and_slice spec does not match the shape in the " + "save file: ", + parsed_shape.DebugString(), ", save file shape: ", + saved_shape.DebugString())); + } + + Tensor* t = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &t)); +#define READER_COPY(dt) \ + case dt: \ + reader->CopySliceData(tensor_name, slice_to_load, \ + t->flat<EnumToDataType<dt>::Type>().data()); \ + break + + switch (type) { + READER_COPY(DT_FLOAT); + READER_COPY(DT_DOUBLE); + READER_COPY(DT_INT32); + READER_COPY(DT_UINT8); + READER_COPY(DT_INT16); + READER_COPY(DT_INT8); + READER_COPY(DT_INT64); + default: + context->SetStatus(errors::Unimplemented( + "Restoring data type ", DataTypeString(type), " not yet supported")); + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/io.h b/tensorflow/core/kernels/io.h new file mode 100644 index 0000000000..7e548f1ad0 --- /dev/null +++ b/tensorflow/core/kernels/io.h @@ -0,0 +1,38 @@ +#ifndef TENSORFLOW_KERNELS_IO_H_ +#define TENSORFLOW_KERNELS_IO_H_ + +#include "tensorflow/core/util/tensor_slice_reader.h" +#include "tensorflow/core/util/tensor_slice_writer.h" + +namespace tensorflow { + +class OpKernelContext; + +// Save input tensors in *context to a writer built from builder_func(). +// context must have the following inputs: +// 0: a single element string tensor that contains the file name. +// 1: names for the remaining tensors +// If save_slices is true: +// 2: shape and slice specifications. +// rest: tensors to save +void SaveTensors( + OpKernelContext* context, + checkpoint::TensorSliceWriter::CreateBuilderFunction builder_func, + bool save_slices); + +// Reads a tensor from the reader built from open_func() and produces it as +// context->output(0). "preferred_shard" is the same the TensorSliceReader +// preferred_shard parameter. +// +// context must have the following inputs: +// 0: a single element string tensor that contains the file name. 
+// 1: a single element string tensor that names the output to be restored. +// If restore_slice is true: +// 2: shape and slice specification of the tensor to restore. +void RestoreTensor(OpKernelContext* context, + checkpoint::TensorSliceReader::OpenTableFunction open_func, + int preferred_shard, bool restore_slice); + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_IO_H_ diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc new file mode 100644 index 0000000000..6f83f01676 --- /dev/null +++ b/tensorflow/core/kernels/l2loss_op.cc @@ -0,0 +1,69 @@ +// See docs in ../ops/nn_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/l2loss_op.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class L2LossOp : public OpKernel { + public: + explicit L2LossOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // The input tensor can be of any number of dimensions, even though it's + // 2D in most typical applications. + const Tensor& input = context->input(0); + // The output is a single number. + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + functor::L2Loss<Device, T>()(context->eigen_device<Device>(), + input.flat<T>(), output->scalar<T>()); + } +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("L2Loss").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ + L2LossOp<CPUDevice, T>); + +REGISTER_KERNEL(float); +REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void L2Loss<GPUDevice, T>::operator()(const GPUDevice& d, \ + typename TTypes<T>::ConstTensor input, \ + typename TTypes<T>::Scalar output); \ + extern template struct L2Loss<GPUDevice, T>; + +DECLARE_GPU_SPEC(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("L2Loss").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ + L2LossOp<GPUDevice, T>); + +REGISTER_GPU_KERNEL(float); +#undef REGISTER_GPU_KERNEL + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/l2loss_op.h b/tensorflow/core/kernels/l2loss_op.h new file mode 100644 index 0000000000..d307353e24 --- /dev/null +++ b/tensorflow/core/kernels/l2loss_op.h @@ -0,0 +1,24 @@ +#ifndef TENSORFLOW_KERNELS_L2LOSS_OP_H_ +#define TENSORFLOW_KERNELS_L2LOSS_OP_H_ +// Functor definition for L2LossOp, must be compilable by nvcc. +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by L2LossOp to do the computations. +template <typename Device, typename T> +struct L2Loss { + void operator()(const Device& d, typename TTypes<T>::ConstTensor input, + typename TTypes<T>::Scalar output) { + // We flatten the input tensor and reduce on dimension 0, producing + // a single number which is Mul(Sum(x^2), 0.5). 
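+    // In other words, for a flattened input x this evaluates
+    //   output = 0.5 * sum_i x_i^2,
+    // i.e. half the squared L2 norm of the input.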
+ output.device(d) = input.square().sum() * static_cast<T>(0.5); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_L2LOSS_OP_H_ diff --git a/tensorflow/core/kernels/l2loss_op_gpu.cu.cc b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc new file mode 100644 index 0000000000..858fcfe8d3 --- /dev/null +++ b/tensorflow/core/kernels/l2loss_op_gpu.cu.cc @@ -0,0 +1,16 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/l2loss_op.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; +template struct functor::L2Loss<GPUDevice, float>; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc new file mode 100644 index 0000000000..93342a7a24 --- /dev/null +++ b/tensorflow/core/kernels/linalg_ops_common.cc @@ -0,0 +1,99 @@ +#include "tensorflow/core/kernels/linalg_ops_common.h" + +namespace tensorflow { + +void LinearAlgebraOpBase::Compute(OpKernelContext* context) { + const Tensor& in = context->input(0); + + const int input_rank = GetInputMatrixRank(); + OP_REQUIRES( + context, input_rank == 2, + errors::InvalidArgument("Only matrix inputs are supported so far.")); + if (SupportsBatchOperation()) { + OP_REQUIRES(context, in.dims() > input_rank, + errors::InvalidArgument("Input tensor must have rank >= %d", + input_rank + 1)); + } else { + OP_REQUIRES(context, in.dims() == input_rank, + errors::InvalidArgument("Input tensor must have rank == %d", + input_rank)); + } + + // If the tensor rank is greater than input_rank, we consider the inner-most + // dimensions as matrices, and loop over all the other outer + // dimensions to compute the results. + // TODO(kalakris): Only matrix inputs are currently supported. + const int row_dimension = in.dims() - 2; + const int col_dimension = in.dims() - 1; + const int64 num_rows = in.dim_size(row_dimension); + const int64 num_cols = in.dim_size(col_dimension); + const TensorShape input_matrix_shape = TensorShape({num_rows, num_cols}); + const TensorShape output_matrix_shape = + GetOutputMatrixShape(input_matrix_shape); + OP_REQUIRES(context, output_matrix_shape.dims() <= 2, + errors::InvalidArgument("Output rank must be 1 or 2.")); + + int num_matrices = 1; + // The output has the shape of all the outer dimensions of the input + // except for the last two, plus the output_matrix_shape (if the output + // is not scalar). This still assumes that each input matrix is + // 2-dimensional, in accordance with the TODO above. 
+ TensorShape output_shape; + if (in.dims() == 2) { + output_shape = output_matrix_shape; + } else { + for (int dim = 0; dim <= in.dims() - 3; ++dim) { + num_matrices *= in.dim_size(dim); + output_shape.AddDim(in.dim_size(dim)); + } + for (int dim = 0; dim < output_matrix_shape.dims(); ++dim) { + output_shape.AddDim(output_matrix_shape.dim_size(dim)); + } + } + + Tensor* out = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &out)); + + auto shard = [this, &in, &input_matrix_shape, &output_matrix_shape, context, + out](int64 begin, int64 end) { + for (int64 i = begin; i < end; ++i) { + ComputeMatrix(context, i, in, input_matrix_shape, out, + output_matrix_shape); + } + }; + + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, num_matrices, + GetCostPerUnit(input_matrix_shape), shard); +} + +template <typename Scalar, bool SupportsBatchOperationT> +void LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ComputeMatrix( + OpKernelContext* context, int64 matrix_index, const Tensor& in, + const TensorShape& input_matrix_shape, Tensor* out, + const TensorShape& output_matrix_shape) { + // TODO(kalakris): Handle alignment if possible. Eigen::Map is + // unaligned by default. + ConstMatrixMap input(in.flat<Scalar>().data() + + matrix_index * input_matrix_shape.num_elements(), + input_matrix_shape.dim_size(0), + input_matrix_shape.dim_size(1)); + + // The output matrix shape may not be a matrix. + int num_output_rows = + output_matrix_shape.dims() >= 1 ? output_matrix_shape.dim_size(0) : 1; + int num_output_cols = + output_matrix_shape.dims() == 2 ? output_matrix_shape.dim_size(1) : 1; + MatrixMap output(out->flat<Scalar>().data() + + matrix_index * output_matrix_shape.num_elements(), + num_output_rows, num_output_cols); + ComputeMatrix(context, input, &output); +} + +// Explicitly instantiate LinearAlgebraOp for the scalar types we expect to use. +template class LinearAlgebraOp<float, false>; +template class LinearAlgebraOp<float, true>; +template class LinearAlgebraOp<double, false>; +template class LinearAlgebraOp<double, true>; + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/linalg_ops_common.h b/tensorflow/core/kernels/linalg_ops_common.h new file mode 100644 index 0000000000..471f11e25f --- /dev/null +++ b/tensorflow/core/kernels/linalg_ops_common.h @@ -0,0 +1,123 @@ +#ifndef TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_ +#define TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/util/work_sharder.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +// A base class to support linear algebra functionality, similar to the +// numpy.linalg module. Supports batch computation on several matrices at once, +// sharding the computations across different threads if necessary. +// +// TODO(kalakris): This needs to be expanded to support binary inputs, and +// multiple outputs. 
+class LinearAlgebraOpBase : public OpKernel { + public: + explicit LinearAlgebraOpBase(OpKernelConstruction* context) + : OpKernel(context) {} + ~LinearAlgebraOpBase() override {} + + // Return the expected rank of the input. + // TODO(kalakris): This should be a virtual function to support vector inputs. + int GetInputMatrixRank() { return 2; } + + // Return the output shape of each individual matrix operation. Must be + // rank 0, 1, or 2. Scalar outputs are rank 0. + virtual TensorShape GetOutputMatrixShape( + const TensorShape& input_matrix_shape) = 0; + + // Return the cost per matrix operation. Cost per unit is assumed to be + // roughly 1ns, based on comments in core/util/work_sharder.cc. + virtual int64 GetCostPerUnit(const TensorShape& input_matrix_shape) = 0; + + // If SupportsBatchOperation() returns false, this Op will only accept rank 2 + // (if the supported input type is a matrix). If it returns true, the Op will + // accept inputs of rank >= 3, and repeatedly execute the operation on all + // matrices in the innermost two dimensions. + virtual bool SupportsBatchOperation() = 0; + + // Perform the actual computation on an input matrix, and store the results + // in the output. This will be called repeatedly for a single call to + // Compute(), if multiple matrices exist in the input Tensor. + // + // This function should only compute the results for a single input matrix. + // The 'matrix_index' parameter specifies the index of the matrix to be used + // from the input, and the index of the matrix to be written to in the output. + // The input matrix is in row major order, and is located at the memory + // address + // in.flat<Scalar>().data() + + // matrix_index * input_matrix_shape.num_elements(). + // The output matrix is in row major order, and is located at the memory + // address + // out->flat<Scalar>().data() + + // matrix_index * output_matrix_shape.num_elements(). + // The LinearAlgebraOp<Scalar> class below has functionality which performs + // this mapping and presents an interface based on the Eigen::MatrixBase API. + virtual void ComputeMatrix(OpKernelContext* context, int64 matrix_index, + const Tensor& in, + const TensorShape& input_matrix_shape, Tensor* out, + const TensorShape& output_matrix_shape) = 0; + + void Compute(OpKernelContext* context) override; +}; + +// A base class for linear algebra ops templated on the scalar type. +// +// This base class encapsulates the functionality of mapping the input and +// output tensors using Eigen::Map, so that the Eigen::MatrixBase API may be +// directly used by derived classes. +// SupportsBatchOperationT is a bool template argument which if set to true +// will allow the Op to process batches of matrices (rank >= 3); if set to +// false the Op will only accept rank 2 inputs. +template <typename Scalar, bool SupportsBatchOperationT> +class LinearAlgebraOp : public LinearAlgebraOpBase { + public: + explicit LinearAlgebraOp(OpKernelConstruction* context) + : LinearAlgebraOpBase(context) {} + + using ConstMatrixMap = + Eigen::Map<const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>>; + using MatrixMap = Eigen::Map< + Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>; + + // Perform the actual computation on the input matrix, and store the results + // in the output. This will be called repeatedly for a single call to + // Compute(), if multiple matrices exist in the input Tensor. 
+ virtual void ComputeMatrix(OpKernelContext* context, + const ConstMatrixMap& input, + MatrixMap* output) = 0; + + bool SupportsBatchOperation() final { return SupportsBatchOperationT; } + + // A concrete implementation of LinearAlgebraOpBase::ComputeMatrix(). + void ComputeMatrix(OpKernelContext* context, int64 matrix_index, + const Tensor& in, const TensorShape& input_matrix_shape, + Tensor* out, const TensorShape& output_matrix_shape) final; +}; + +// Declare that LinearAlgebraOp is explicitly instantiated in +// linalg_ops_common.cc for float and double. +extern template class LinearAlgebraOp<float, false>; +extern template class LinearAlgebraOp<float, true>; +extern template class LinearAlgebraOp<double, false>; +extern template class LinearAlgebraOp<double, true>; + +} // namespace tensorflow + +#define REGISTER_LINALG_OP(OpName, OpClass, Scalar) \ + REGISTER_KERNEL_BUILDER( \ + Name(OpName).Device(DEVICE_CPU).TypeConstraint<Scalar>("T"), OpClass) + +#endif // TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_ diff --git a/tensorflow/core/kernels/listdiff_op.cc b/tensorflow/core/kernels/listdiff_op.cc new file mode 100644 index 0000000000..f490f5ddd3 --- /dev/null +++ b/tensorflow/core/kernels/listdiff_op.cc @@ -0,0 +1,75 @@ +#include <unordered_set> +#include <utility> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { +template <typename T> +class ListDiffOp : public OpKernel { + public: + explicit ListDiffOp(OpKernelConstruction* context) : OpKernel(context) { + const DataType dt = DataTypeToEnum<T>::v(); + OP_REQUIRES_OK(context, context->MatchSignature({dt, dt}, {dt, DT_INT32})); + } + + void Compute(OpKernelContext* context) override { + const Tensor& x = context->input(0); + const Tensor& y = context->input(1); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(x.shape()), + errors::InvalidArgument("x should be a 1D vector.")); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(y.shape()), + errors::InvalidArgument("y should be a 1D vector.")); + + std::unordered_set<T> y_set; + const auto Ty = y.vec<T>(); + const int y_size = Ty.size(); + y_set.reserve(y_size); + for (int i = 0; i < y_size; ++i) { + y_set.insert(Ty(i)); + } + + // Compute the size of the output. + const auto Tx = x.vec<T>(); + const int x_size = Tx.size(); + + int out_size = 0; + for (int i = 0; i < x_size; ++i) { + if (y_set.count(Tx(i)) == 0) { + ++out_size; + } + } + + // Allocate and populate outputs. 
+ Tensor* out = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, {out_size}, &out)); + auto Tout = out->vec<T>(); + + Tensor* indices = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(1, {out_size}, &indices)); + auto Tindices = indices->vec<int32>(); + + for (int i = 0, p = 0; i < x_size; ++i) { + if (y_set.count(Tx(i)) == 0) { + Tout(p) = Tx(i); + Tindices(p) = i; + p++; + } + } + } +}; + +#define REGISTER_LISTDIFF(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("ListDiff").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + ListDiffOp<type>) + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_LISTDIFF); +#undef REGISTER_LISTDIFF + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc new file mode 100644 index 0000000000..ec84145f75 --- /dev/null +++ b/tensorflow/core/kernels/logging_ops.cc @@ -0,0 +1,77 @@ +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +class AssertOp : public OpKernel { + public: + explicit AssertOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("summarize", &summarize_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& cond = ctx->input(0); + OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(cond.shape()), + errors::InvalidArgument("In[0] should be a scalar: ", + cond.shape().ShortDebugString())); + + if (cond.scalar<bool>()()) { + return; + } + string msg = "assertion failed: "; + for (int i = 1; i < ctx->num_inputs(); ++i) { + strings::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_), + "]"); + if (i < ctx->num_inputs() - 1) strings::StrAppend(&msg, " "); + } + ctx->SetStatus(errors::InvalidArgument(msg)); + } + + private: + int32 summarize_ = 0; +}; + +REGISTER_KERNEL_BUILDER(Name("Assert").Device(DEVICE_CPU), AssertOp); + +class PrintOp : public OpKernel { + public: + explicit PrintOp(OpKernelConstruction* ctx) + : OpKernel(ctx), call_counter_(0) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("message", &message_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("first_n", &first_n_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("summarize", &summarize_)); + } + + void Compute(OpKernelContext* ctx) override { + if (IsRefType(ctx->input_dtype(0))) { + ctx->forward_ref_input_to_ref_output(0, 0); + } else { + ctx->set_output(0, ctx->input(0)); + } + if (first_n_ >= 0) { + mutex_lock l(mu_); + if (call_counter_ >= first_n_) return; + call_counter_++; + } + string msg; + strings::StrAppend(&msg, message_); + for (int i = 1; i < ctx->num_inputs(); ++i) { + strings::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_), + "]"); + } + LOG(INFO) << msg; + } + + private: + mutex mu_; + int64 call_counter_ GUARDED_BY(mu_) = 0; + int64 first_n_ = 0; + int32 summarize_ = 0; + string message_; +}; + +REGISTER_KERNEL_BUILDER(Name("Print").Device(DEVICE_CPU), PrintOp); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/logging_ops_test.cc b/tensorflow/core/kernels/logging_ops_test.cc new file mode 100644 index 0000000000..a7af6eb303 --- /dev/null +++ b/tensorflow/core/kernels/logging_ops_test.cc @@ -0,0 +1,87 @@ +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include 
"tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace tensorflow { +namespace { + +class PrintingGraphTest : public OpsTestBase { + protected: + Status Init(DataType input_type1, DataType input_type2, string msg = "", + int first_n = -1, int summarize = 3) { + RequireDefaultOps(); + TF_CHECK_OK(NodeDefBuilder("op", "Print") + .Input(FakeInput(input_type1)) + .Input(FakeInput(2, input_type2)) + .Attr("message", msg) + .Attr("first_n", first_n) + .Attr("summarize", summarize) + .Finalize(node_def())); + return InitOp(); + } +}; + +TEST_F(PrintingGraphTest, Int32Success_6) { + ASSERT_OK(Init(DT_INT32, DT_INT32)); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({6})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); +} + +TEST_F(PrintingGraphTest, Int32Success_Summarize6) { + ASSERT_OK(Init(DT_INT32, DT_INT32, "", -1, 6)); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({6})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); +} + +TEST_F(PrintingGraphTest, StringSuccess) { + ASSERT_OK(Init(DT_INT32, DT_STRING)); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<string>(TensorShape({}), {"foo"}); + AddInputFromArray<string>(TensorShape({}), {"bar"}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({6})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); +} + +TEST_F(PrintingGraphTest, MsgSuccess) { + ASSERT_OK(Init(DT_INT32, DT_STRING, "Message: ")); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<string>(TensorShape({}), {"foo"}); + AddInputFromArray<string>(TensorShape({}), {"bar"}); + ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({6})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); +} + +TEST_F(PrintingGraphTest, FirstNSuccess) { + ASSERT_OK(Init(DT_INT32, DT_STRING, "", 3)); + AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<string>(TensorShape({}), {"foo"}); + AddInputFromArray<string>(TensorShape({}), {"bar"}); + // run 4 times but we only print 3 as intended + for (int i = 0; i < 4; i++) ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({6})); + test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); +} + +} // end namespace +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc new file mode 100644 index 0000000000..9781bcfa59 --- /dev/null +++ b/tensorflow/core/kernels/lookup_table_init_op.cc @@ -0,0 +1,116 @@ +#define EIGEN_USE_THREADS + +#include <string> + +#include 
"tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/initializable_lookup_table.h" +#include "tensorflow/core/kernels/lookup_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { +namespace lookup { + +// Iterator to initialize tables given 'keys' and 'values' tensors. +// +// The two tensors are returned in the first iteration. It doesn't loop +// over each element of the tensor since insertions in the lookup table can +// process batches. +class KeyValueTensorIterator + : public InitializableLookupTable::InitTableIterator { + public: + // keys and values are not owned by the iterator. + explicit KeyValueTensorIterator(const Tensor* keys, const Tensor* values) + : keys_(keys), values_(values), valid_(true), status_(Status::OK()) { + TensorShape key_shape = keys_->shape(); + if (!key_shape.IsSameSize(values_->shape())) { + valid_ = false; + status_ = errors::InvalidArgument( + "keys and values should have the same dimension.", + key_shape.DebugString(), " vs ", values_->shape().DebugString()); + } + if (key_shape.num_elements() == 0) { + valid_ = false; + status_ = + errors::InvalidArgument("keys and values cannot be empty tensors."); + } + } + + bool Valid() const override { return valid_; } + + void Next() override { + valid_ = false; + status_ = errors::OutOfRange("No more data."); + } + + const Tensor& keys() const override { return *keys_; } + + const Tensor& values() const override { return *values_; } + + Status status() const override { return status_; } + + int64 total_size() const { + return keys_ == nullptr ? -1 : keys_->NumElements(); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(KeyValueTensorIterator); + + const Tensor* keys_; // Doesn't own it. + const Tensor* values_; // Doesn't own it. + bool valid_; // true if the iterator points to an existing range. + Status status_; +}; + +} // namespace lookup + +// Kernel to initialize a look table given a key and value tensors. +// After this operation, the table becomes read-only. 
+class InitializeTableOp : public OpKernel { + public: + explicit InitializeTableOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + mutex_lock l(mu_); + lookup::InitializableLookupTable* table; + OP_REQUIRES_OK(ctx, + GetInitializableLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(), + table->value_dtype()}; + DataTypeVector expected_outputs = {}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs)); + + const Tensor& keys = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(keys.shape()), + errors::InvalidArgument("Keys must be a vector, but received ", + keys.shape().DebugString())); + + const Tensor& values = ctx->input(2); + OP_REQUIRES( + ctx, TensorShapeUtils::IsVector(values.shape()), + errors::InvalidArgument("Values must be a vector, but received ", + values.shape().DebugString())); + + OP_REQUIRES(ctx, keys.NumElements() == values.NumElements(), + errors::InvalidArgument( + "Keys and values must have the same size ", + keys.NumElements(), " vs ", values.NumElements())); + + lookup::KeyValueTensorIterator iter(&keys, &values); + OP_REQUIRES_OK(ctx, table->Initialize(iter)); + } + + private: + mutex mu_; +}; + +REGISTER_KERNEL_BUILDER(Name("InitializeTable").Device(DEVICE_CPU), + InitializeTableOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc new file mode 100644 index 0000000000..2bab4df94f --- /dev/null +++ b/tensorflow/core/kernels/lookup_table_op.cc @@ -0,0 +1,166 @@ +#include "tensorflow/core/kernels/lookup_table_op.h" +#define EIGEN_USE_THREADS + +#include <string> +#include <utility> + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/initializable_lookup_table.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/lib/hash/hash.h" + +namespace tensorflow { +namespace lookup { + +// Lookup table that wraps an unordered_map, where the key and value data type +// is specified. +// +// This table is recommened for any variations to key values. +// +// For look up, the table is required to be initialized (allocated +// and populated). Once the table is marked as initialized it becomes read-only. +// +// Sample use case: +// +// HashTable<int64, int64> table; // int64 -> int64. +// table.Prepare(10); // Prepare the underlying data structure, the number of +// // elements is required by interface, but not used. +// // Populate the table, elements could be added in one or multiple calls. +// table.Insert(key_tensor, value_tensor); // Populate the table. +// ... +// table.set_is_initialized(); +// +// table.Find(in_t, &out_t, default_t) +// +template <class K, class V> +class HashTable : public InitializableLookupTable { + public: + size_t size() const override { return table_ ? 
table_->size() : 0; } + + DataType key_dtype() const override { return DataTypeToEnum<K>::v(); } + + DataType value_dtype() const override { return DataTypeToEnum<V>::v(); } + + protected: + Status DoPrepare(size_t unused) override { + if (is_initialized_) { + return errors::Aborted("HashTable already initialized."); + } + if (!table_) { + table_ = std::unique_ptr<std::unordered_map<K, V>>( + new std::unordered_map<K, V>()); + } + return Status::OK(); + }; + + Status DoInsert(const Tensor& keys, const Tensor& values) override { + if (!table_) { + return errors::FailedPrecondition("HashTable is not prepared."); + } + + const auto key_values = keys.flat<K>(); + const auto value_values = values.flat<V>(); + for (size_t i = 0; i < key_values.size(); ++i) { + const K& key = key_values(i); + const V& value = value_values(i); + const V& previous_value = gtl::LookupOrInsert(table_.get(), key, value); + if (previous_value != value) { + return errors::FailedPrecondition( + "HashTable has different value for same key. Key ", key, " has ", + previous_value, " and trying to add value ", value); + } + } + return Status::OK(); + } + + Status DoFind(const Tensor& key, Tensor* value, + const Tensor& default_value) override { + const V default_val = default_value.flat<V>()(0); + const auto key_values = key.flat<K>(); + auto value_values = value->flat<V>(); + + for (size_t i = 0; i < key_values.size(); ++i) { + value_values(i) = + gtl::FindWithDefault(*table_, key_values(i), default_val); + } + return Status::OK(); + } + + private: + std::unique_ptr<std::unordered_map<K, V>> table_; +}; + +} // namespace lookup + +// Table lookup op. Perform the lookup operation on the given table. +class LookupTableFindOp : public OpKernel { + public: + explicit LookupTableFindOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(), + table->value_dtype()}; + DataTypeVector expected_outputs = {table->value_dtype()}; + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs)); + + const Tensor& input = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input.shape()), + errors::InvalidArgument("Input must be a vector, not ", + input.shape().DebugString())); + + const Tensor& default_value = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(default_value.shape()), + errors::InvalidArgument("Default value must be a scalar, not ", + default_value.shape().DebugString())); + + Tensor* out; + OP_REQUIRES_OK(ctx, + ctx->allocate_output("output_values", input.shape(), &out)); + + OP_REQUIRES_OK(ctx, table->Find(input, out, default_value)); + } +}; + +REGISTER_KERNEL_BUILDER(Name("LookupTableFind").Device(DEVICE_CPU), + LookupTableFindOp); + +// Op that returns the size of the given table. 
+class LookupTableSizeOp : public OpKernel { + public: + explicit LookupTableSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + Tensor* out; + OP_REQUIRES_OK(ctx, ctx->allocate_output("size", TensorShape({}), &out)); + out->flat<int64>().setConstant(table->size()); + } +}; + +REGISTER_KERNEL_BUILDER(Name("LookupTableSize").Device(DEVICE_CPU), + LookupTableSizeOp); + +// Register the HashTable op with the currently supported key and value types. +#define REGISTER_KERNEL(key_dtype, value_dtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("HashTable") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<key_dtype>("key_dtype") \ + .TypeConstraint<value_dtype>("value_dtype"), \ + LookupTableOp<lookup::HashTable<key_dtype, value_dtype>, key_dtype, \ + value_dtype>) + +REGISTER_KERNEL(string, int64); +REGISTER_KERNEL(int64, string); + +#undef REGISTER_KERNEL + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h new file mode 100644 index 0000000000..dc53ce33a6 --- /dev/null +++ b/tensorflow/core/kernels/lookup_table_op.h @@ -0,0 +1,80 @@ +#ifndef TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_ +#define TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_ + +#include "tensorflow/core/framework/lookup_interface.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/kernels/lookup_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +// Lookup table op that supports different table implementations specified by +// the 'Container' template. Container must be derived from LookupInterface. The +// key and value are of the templated type "key_dtype" and "value_dtype" +// respectively. +template <class Container, class key_dtype, class value_dtype> +class LookupTableOp : public OpKernel { + public: + // ctx is not owned by this class. + explicit LookupTableOp(OpKernelConstruction* ctx) + : OpKernel(ctx), table_handle_set_(false) { + OP_REQUIRES_OK(ctx, ctx->allocate_persistent(tensorflow::DT_STRING, + tensorflow::TensorShape({2}), + &table_handle_, nullptr)); + } + + // ctx is not owned by this function. + void Compute(OpKernelContext* ctx) override { + mutex_lock l(mu_); + if (!table_handle_set_) { + OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def())); + auto creator = [this](lookup::LookupInterface** ret) { + *ret = new Container(); + return Status::OK(); + }; + + lookup::LookupInterface* table = nullptr; + OP_REQUIRES_OK( + ctx, cinfo_.resource_manager() + ->template LookupOrCreate<lookup::LookupInterface>( + cinfo_.container(), cinfo_.name(), &table, creator)); + core::ScopedUnref unref_me(table); + + OP_REQUIRES_OK(ctx, lookup::CheckTableDataTypes( + *table, DataTypeToEnum<key_dtype>::v(), + DataTypeToEnum<value_dtype>::v(), cinfo_.name())); + + auto h = table_handle_.AccessTensor(ctx)->template flat<string>(); + h(0) = cinfo_.container(); + h(1) = cinfo_.name(); + table_handle_set_ = true; + } + ctx->set_output_ref(0, &mu_, table_handle_.AccessTensor(ctx)); + } + + ~LookupTableOp() override { + // If the table object was not shared, delete it. 
+ if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) { + TF_CHECK_OK( + cinfo_.resource_manager()->template Delete<lookup::LookupInterface>( + cinfo_.container(), cinfo_.name())); + } + } + + private: + mutex mu_; + PersistentTensor table_handle_ GUARDED_BY(mu_); + bool table_handle_set_ GUARDED_BY(mu_); + ContainerInfo cinfo_; + + TF_DISALLOW_COPY_AND_ASSIGN(LookupTableOp); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_ diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc new file mode 100644 index 0000000000..634c11e4a5 --- /dev/null +++ b/tensorflow/core/kernels/lookup_util.cc @@ -0,0 +1,72 @@ +#include "tensorflow/core/kernels/lookup_util.h" + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { +namespace lookup { +namespace { + +Status GetTableHandle(const string& input_name, OpKernelContext* ctx, + string* container, string* table_handle) { + { + mutex* mu; + TF_RETURN_IF_ERROR(ctx->input_ref_mutex(input_name, &mu)); + mutex_lock l(*mu); + Tensor tensor; + TF_RETURN_IF_ERROR(ctx->mutable_input(input_name, &tensor, true)); + if (tensor.NumElements() != 2) { + return errors::InvalidArgument( + "Lookup table handle must be scalar, but had shape: ", + tensor.shape().DebugString()); + } + auto h = tensor.flat<string>(); + *container = h(0); + *table_handle = h(1); + } + return Status::OK(); +} + +} // namespace + +Status GetLookupTable(const string& input_name, OpKernelContext* ctx, + LookupInterface** table) { + string container; + string table_handle; + TF_RETURN_IF_ERROR( + GetTableHandle(input_name, ctx, &container, &table_handle)); + return ctx->resource_manager()->Lookup(container, table_handle, table); +} + +Status GetInitializableLookupTable(const string& input_name, + OpKernelContext* ctx, + InitializableLookupTable** table) { + string container; + string table_handle; + TF_RETURN_IF_ERROR( + GetTableHandle(input_name, ctx, &container, &table_handle)); + LookupInterface* lookup_table; + TF_RETURN_IF_ERROR( + ctx->resource_manager()->Lookup(container, table_handle, &lookup_table)); + *table = dynamic_cast<InitializableLookupTable*>(lookup_table); + if (*table == nullptr) { + lookup_table->Unref(); + return errors::InvalidArgument("Table ", container, " ", table_handle, + " is not initializable"); + } + return Status::OK(); +} + +Status CheckTableDataTypes(const LookupInterface& table, DataType key_dtype, + DataType value_dtype, const string& table_name) { + if (table.key_dtype() != key_dtype || table.value_dtype() != value_dtype) { + return errors::InvalidArgument( + "Conflicting key/value dtypes ", key_dtype, "->", value_dtype, " with ", + table.key_dtype(), "-", table.value_dtype(), " for table ", table_name); + } + return Status::OK(); +} + +} // namespace lookup +} // namespace tensorflow diff --git a/tensorflow/core/kernels/lookup_util.h b/tensorflow/core/kernels/lookup_util.h new file mode 100644 index 0000000000..991a757edd --- /dev/null +++ b/tensorflow/core/kernels/lookup_util.h @@ -0,0 +1,31 @@ +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_ + +#include "tensorflow/core/framework/lookup_interface.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/initializable_lookup_table.h" + +namespace tensorflow { +namespace lookup { + +// Gets the LookupTable stored in the 
ctx->resource_manager() with key +// passed by attribute with name input_name, returns null if the table +// doesn't exist. +Status GetLookupTable(const string& input_name, OpKernelContext* ctx, + LookupInterface** table); + +// Gets the InitializableLookupTable stored in the +// ctx->resource_manager() with key passed by attribute with name +// input_name, returns null if the table doesn't exist. +Status GetInitializableLookupTable(const string& input_name, + OpKernelContext* ctx, + InitializableLookupTable** table); + +// Verify that the given key_dtype and value_dtype matches the corresponding +// table's data types. +Status CheckTableDataTypes(const LookupInterface& table, DataType key_dtype, + DataType value_dtype, const string& table_name); +} // namespace lookup +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_LOOKUP_UTIL_H_ diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc new file mode 100644 index 0000000000..e5abf5906f --- /dev/null +++ b/tensorflow/core/kernels/lrn_op.cc @@ -0,0 +1,228 @@ +// LRN = Local Response Normalization +// See docs in ../ops/nn_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +#ifndef __ANDROID__ +#include "tensorflow/core/util/work_sharder.h" +#endif + +namespace tensorflow { + +// Create a depth-by-depth band matrix with 1s along a swath of size (2 * +// depth_radius + 1) around the diagonal. +static void GetBandMatrix(int depth, int64 depth_radius, + Eigen::Tensor<float, 2, Eigen::RowMajor>* result) { + result->setZero(); + for (int row = 0; row < depth; ++row) { + const int begin = std::max<int>(0, row - depth_radius); + const int end = std::min<int64>(depth, row + depth_radius + 1); + Eigen::DSizes<ptrdiff_t, 2> start(row, begin); + Eigen::DSizes<ptrdiff_t, 2> sizes(1, end - begin); + result->slice(start, sizes).setConstant(1.0f); + } +} + +class LRNOp : public OpKernel { + public: + explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius_)); + OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); + OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_)); + OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& in = context->input(0); + OP_REQUIRES(context, in.dims() == 4, + errors::InvalidArgument("in must be 4-dimensional")); + const int64 batch = in.dim_size(0); + const int64 rows = in.dim_size(1); + const int64 cols = in.dim_size(2); + const int64 depth = in.dim_size(3); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output( + 0, TensorShape({batch, rows, cols, depth}), &output)); + +#ifdef __ANDROID__ + MognetLRN(in, batch, rows, cols, depth, output); +#else + const int nodes = cols * rows; + auto in_shaped = in.shaped<float, 2>({nodes * batch, depth}); + + // Multiplying the input with the band matrix has the effect of reducing the + // correct patch along the depth. 
+ Eigen::Tensor<float, 2, Eigen::RowMajor> multiplier(depth, depth); + GetBandMatrix(depth, depth_radius_, &multiplier); + + auto out_shaped = output->shaped<float, 2>({nodes * batch, depth}); + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + /// TODO(keveman): Optimize for beta in {0, 1, 0.5} + out_shaped.device(context->eigen_cpu_device()) = + in_shaped / + in_shaped.square() + .contract(multiplier, dims) + .unaryExpr([this](float x) { return bias_ + alpha_ * x; }) + .pow(beta_); +#endif + } + + private: + typedef Eigen::Tensor<float, 1, Eigen::RowMajor>::DimensionPair DimPair; + + void MognetLRN(const Tensor& in, const int batch, const int rows, + const int cols, const int depth, Tensor* out) { + Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> + data_in(in.flat<float>().data(), depth, batch * rows * cols); + + Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> data_out( + out->flat<float>().data(), depth, batch * rows * cols); + + const int double_depth_radius = depth_radius_ * 2; + Eigen::VectorXf padded_square(data_in.rows() + double_depth_radius); + padded_square.setZero(); + for (int r = 0; r < data_in.cols(); ++r) { + // Do local response normalization for data_in(:, r) + // first, compute the square and store them in buffer for repeated use + padded_square.block(depth_radius_, 0, data_out.rows(), 1) = + data_in.col(r).cwiseProduct(data_in.col(r)) * alpha_; + // Then, compute the scale and writes them to data_out + float accumulated_scale = 0; + for (int i = 0; i < double_depth_radius; ++i) { + accumulated_scale += padded_square(i); + } + for (int i = 0; i < data_in.rows(); ++i) { + accumulated_scale += padded_square(i + double_depth_radius); + data_out(i, r) = bias_ + accumulated_scale; + accumulated_scale -= padded_square(i); + } + } + + // In a few cases, the pow computation could benefit from speedups. 
+ if (beta_ == 1) { + data_out.array() = data_in.array() * data_out.array().inverse(); + } else if (beta_ == 0.5) { + data_out.array() = data_in.array() * data_out.array().sqrt().inverse(); + } else { + data_out.array() = data_in.array() * data_out.array().pow(-beta_); + } + } + + int64 depth_radius_; + float bias_; + float alpha_; + float beta_; +}; + +REGISTER_KERNEL_BUILDER(Name("LRN").Device(DEVICE_CPU), LRNOp); + +#ifndef __ANDROID__ + +class LRNGradOp : public OpKernel { + public: + explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius_)); + OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); + OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_)); + OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& in_grads = context->input(0); + const Tensor& in_image = context->input(1); + const Tensor& out_image = context->input(2); + + OP_REQUIRES(context, in_grads.dims() == 4 && in_image.dims() == 4, + errors::InvalidArgument("inputs must be 4-dimensional")); + const int64 batch = in_grads.dim_size(0); + const int64 rows = in_grads.dim_size(1); + const int64 cols = in_grads.dim_size(2); + const int64 depth = in_grads.dim_size(3); + OP_REQUIRES( + context, + in_image.dim_size(0) == batch && in_image.dim_size(1) == rows && + in_image.dim_size(2) == cols && in_image.dim_size(3) == depth && + out_image.dim_size(0) == batch && out_image.dim_size(1) == rows && + out_image.dim_size(2) == cols && out_image.dim_size(3) == depth, + errors::InvalidArgument( + "input_grads, input_image, and out_image should have the same " + "shape")); + const auto nodes = cols * rows; + auto grads_shaped = in_grads.shaped<float, 2>({nodes * batch, depth}); + auto in_shaped = in_image.shaped<float, 2>({nodes * batch, depth}); + auto activations = out_image.shaped<float, 2>({nodes * batch, depth}); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output( + 0, TensorShape({batch, rows, cols, depth}), &output)); + auto out_shaped = output->shaped<float, 2>({nodes * batch, depth}); + out_shaped.setZero(); + + auto shard = [this, activations, in_shaped, grads_shaped, out_shaped, + depth](int64 begin, int64 end) { + for (int64 i = begin; i < end; ++i) { + for (int64 j = 0; j < depth; ++j) { + // Let y be the LRN activations and x be the inputs along the depth + // dimension. (LRN operates independently along rows, cols, and + // batch). + // We have + // yi = xi / (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius} + // x_j^2))^beta + // + // Let N = (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius} + // x_j^2)) + // dy_i/dx_i = (N^beta - xi. beta*N^(beta-1)*2*alpha*xi)/N^(2*beta) + // dy_i/dx_j = ( - xi. beta*N^(beta-1)*2*alpha*xj)/N^(2*beta) + // + // NOTE(keveman) : We can compute N by doing (yi/xi) ^ (1/beta). + // However, this is numerically unstable for small values of xi. We + // compute N explicitly here to avoid that. 
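+          // The loop below therefore accumulates, for each k in the window
+          // around j (with N = bias + alpha * sum of x^2 over that window,
+          // computed explicitly just below):
+          //   out(i, k) += grads(i, j) *
+          //       ((k == j ? N^-beta : 0) - 2*alpha*beta*x(i, k)*y(i, j)/N).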
+ + int64 depth_begin = std::max<int64>(0, j - depth_radius_); + int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1); + + float norm = 0.0f; + for (int64 k = depth_begin; k < depth_end; ++k) { + norm += in_shaped(i, k) * in_shaped(i, k); + } + norm = alpha_ * norm + bias_; + DCHECK_GT(norm, 1e-6); + for (int64 k = depth_begin; k < depth_end; ++k) { + float dyi = -2.0f * alpha_ * beta_ * in_shaped(i, k) * + activations(i, j) / norm; + if (k == j) { + dyi += std::pow(norm, -beta_); + } + dyi *= grads_shaped(i, j); + const_cast<TTypes<float, 2>::Tensor&>(out_shaped)(i, k) += dyi; + } + } + } + }; + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch, + depth * depth, shard); + } + + private: + typedef Eigen::Tensor<float, 1, Eigen::RowMajor>::DimensionPair DimPair; + + int64 depth_radius_; + float bias_; + float alpha_; + float beta_; +}; + +REGISTER_KERNEL_BUILDER(Name("LRNGrad").Device(DEVICE_CPU), LRNGradOp); + +#endif // __ANDROID__ + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/lrn_op_test.cc b/tensorflow/core/kernels/lrn_op_test.cc new file mode 100644 index 0000000000..4c338b6cb3 --- /dev/null +++ b/tensorflow/core/kernels/lrn_op_test.cc @@ -0,0 +1,185 @@ +#include <functional> +#include <memory> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +static const float tol_ = 1e-4; + +class LRNFloatTest : public OpsTestBase { + protected: + LRNFloatTest() : philox_(123, 17), rand_(&philox_) { RequireDefaultOps(); } + + int GetIntAttr(const string& name) { + int value; + TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value)); + return value; + } + + float GetFloatAttr(const string& name) { + float value; + TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value)); + return value; + } + + bool Compare() { + const auto& input = GetInput(0); + const int64 batch_size = input.dim_size(0); + const int64 rows = input.dim_size(1); + const int64 cols = input.dim_size(2); + const int64 depth = input.dim_size(3); + const int64 rest = cols * rows * batch_size; + + const int64 depth_radius = GetIntAttr("depth_radius"); + const float bias = GetFloatAttr("bias"); + const float alpha = GetFloatAttr("alpha"); + const float beta = GetFloatAttr("beta"); + + Eigen::Tensor<float, 4, Eigen::RowMajor> expected(batch_size, rows, cols, + depth); + auto out = expected.reshape(Eigen::DSizes<int64, 2>{rest, depth}); + auto in = input.shaped<float, 2>({rest, depth}); + + for (int64 i = 0; i < rest; ++i) { + Eigen::Tensor<float, 1, Eigen::RowMajor> out_col(depth); + for (int64 d = 0; d < depth; ++d) { + float denom = 0.0f; + for (int64 r = std::max(0ll, d - depth_radius); + r < std::min(depth, d + depth_radius + 1); ++r) { + denom += in(i, r) * in(i, r); + } + denom = std::pow(denom * alpha + bias, beta); + out_col(d) = in(i, d) / denom; + } + out.chip<0>(i) = out_col; + } + 
auto actual = GetOutput(0)->tensor<float, 4>(); + Eigen::Tensor<float, 0, Eigen::RowMajor> sum = + ((expected - actual).abs() > actual.constant(tol_)) + .select(actual.constant(1), actual.constant(0)) + .sum(); + return sum() == 0; + } + + random::PhiloxRandom philox_; + random::SimplePhilox rand_; +}; + +TEST_F(LRNFloatTest, Depth96) { + ASSERT_OK(NodeDefBuilder("lrn_op", "LRN") + .Input(FakeInput()) + .Attr("depth_radius", 5) + .Attr("bias", 1.0f) + .Attr("alpha", 0.1f) + .Attr("beta", 2.0f) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + AddInput<float>(TensorShape({1, 1, 1, 96}), + [this](int i) -> float { return i + 1; }); + ASSERT_OK(RunOpKernel()); + auto actual = GetOutput(0)->tensor<float, 4>(); + + // Output for Node 0 with Value 1: + // 1 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2))^2 + EXPECT_NEAR(1. / (10.1 * 10.1), actual(0, 0, 0, 0), tol_); + + // Output for Node 5 with Value 6: + // 6 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2 ... + 11^2))^2 + EXPECT_NEAR(6. / (51.6 * 51.6), actual(0, 0, 0, 5), tol_); + + // Output for Node 63 with value 64: + // 64 / (1 + 0.1*(59^2 + 60^2 + 61^2 + 62^2 + 63^2 + 64^2))^2 + EXPECT_NEAR(64. / (2272.1 * 2272.1), actual(0, 0, 0, 63), tol_); + + // Output for Node 64 with value 65: + // 65 / (1 + 0.1*(65^2 + 66^2 + 67^2 + 68^2 + 69^2 + 70^2))^2 + EXPECT_NEAR(65. / (2736.5 * 2736.5), actual(0, 0, 0, 64), tol_); + + // Output for Node 95 with value 96: + // 96 / (1 + 0.1*(91^2 + 92^2 + 93^2 + 94^2 + 95^2 + 96^2))^2 + EXPECT_NEAR(96. / (5248.1 * 5248.1), actual(0, 0, 0, 95), tol_); + EXPECT_TRUE(Compare()); +} + +TEST_F(LRNFloatTest, Depth16) { + ASSERT_OK(NodeDefBuilder("lrn_op", "LRN") + .Input(FakeInput()) + .Attr("depth_radius", 5) + .Attr("bias", 1.0f) + .Attr("alpha", 0.1f) + .Attr("beta", 2.0f) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + AddInput<float>(TensorShape({1, 1, 1, 16}), + [this](int i) -> float { return i + 1; }); + ASSERT_OK(RunOpKernel()); + auto actual = GetOutput(0)->tensor<float, 4>(); + + // Output for Node 0 with Value 1: + // 1 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2))^2 + EXPECT_NEAR(1. / (10.1 * 10.1), actual(0, 0, 0, 0), tol_); + + // Output for Node 5 with Value 6: + // 6 / (1 + 0.1*(1^2 + 2^2 + 3^2 + 4^2 + 5^2 + 6^2 ... + 11^2))^2 + EXPECT_NEAR(6. / (51.6 * 51.6), actual(0, 0, 0, 5), tol_); + + // Output for Node 15 with value 16: + // 16 / (1 + 0.1*(11^2 + 12^2 + 13^2 + 14^2 + 15^2 + 16^2))^2 + EXPECT_NEAR(16. / (112.1 * 112.1), actual(0, 0, 0, 15), tol_); + EXPECT_TRUE(Compare()); +} + +static double RndGaussian(random::SimplePhilox* rnd) { + // Box-Muller transformation. 
+ // See, for example, http://www.taygeta.com/random/gaussian.html + double x1, x2; + double r; + do { + x1 = 2 * rnd->RandDouble() - 1; + x2 = 2 * rnd->RandDouble() - 1; + r = x1 * x1 + x2 * x2; + } while (r == 0 || r >= 1.0); + double w = sqrt(-2.0 * log(r) / r); + return x1 * w; +} + +#define TCASE(NAME, DEPTH, BATCH, DEPTH_RADIUS, BIAS, ALPHA, BETA) \ + TEST_F(LRNFloatTest, NAME) { \ + ASSERT_OK(NodeDefBuilder("lrn_op", "LRN") \ + .Input(FakeInput()) \ + .Attr("depth_radius", (DEPTH_RADIUS)) \ + .Attr("bias", (BIAS)) \ + .Attr("alpha", ((ALPHA) / 10)) \ + .Attr("beta", (BETA)) \ + .Finalize(node_def())); \ + ASSERT_OK(InitOp()); \ + AddInput<float>(TensorShape({BATCH, 1, 1, DEPTH}), \ + [this](int i) -> float { return RndGaussian(&rand_); }); \ + ASSERT_OK(RunOpKernel()); \ + EXPECT_TRUE(Compare()); \ + } + +// clang-format off +// DEPTH BATCH DEPTH_RADIUS BIAS ALPHA BETA +TCASE(T0, 4, 2, 2, 1.0f, 1.0f, 2.0f) +TCASE(T1, 16, 1, 5, 1.0f, 1.0f, 2.0f) +TCASE(T2, 16, 32, 2, 1.0f, 2.0f, 1.0f) +TCASE(T3, 128, 4, 3, 2.0f, 1.0f, 1.0f) +// clang-format on + +#undef TCASE +} // namespace tensorflow diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc new file mode 100644 index 0000000000..08a4da5b41 --- /dev/null +++ b/tensorflow/core/kernels/matching_files_op.cc @@ -0,0 +1,42 @@ +// See docs in ../ops/io_ops.cc. + +#include <string> +#include <vector> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/match.h" +#include "tensorflow/core/public/env.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +class MatchingFilesOp : public OpKernel { + public: + using OpKernel::OpKernel; + void Compute(OpKernelContext* context) override { + const Tensor* pattern; + OP_REQUIRES_OK(context, context->input("pattern", &pattern)); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(pattern->shape()), + errors::InvalidArgument( + "Input pattern tensor must be scalar, but had shape: ", + pattern->shape().DebugString())); + std::vector<string> fnames; + OP_REQUIRES_OK(context, + io::GetMatchingFiles(context->env(), + pattern->scalar<string>()(), &fnames)); + const int num_out = fnames.size(); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + "filenames", TensorShape({num_out}), &output)); + auto output_vec = output->vec<string>(); + for (int i = 0; i < num_out; ++i) { + output_vec(i) = fnames[i]; + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("MatchingFiles").Device(DEVICE_CPU), + MatchingFilesOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc new file mode 100644 index 0000000000..48bdba78b2 --- /dev/null +++ b/tensorflow/core/kernels/matmul_op.cc @@ -0,0 +1,214 @@ +// See docs in ../ops/math_ops.cc. 
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/matmul_op.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/fill_functor.h" + +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/stream_executor/stream.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +#if GOOGLE_CUDA + +namespace { +template <typename T> +perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) { + perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory)); + perftools::gputools::DeviceMemory<T> typed(wrapped); + return typed; +} +} // namespace + +#endif // GOOGLE_CUDA + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T, bool USE_CUBLAS> +struct LaunchMatMul; + +// On CPUs, we ignore USE_CUBLAS +template <typename T> +struct LaunchMatMulCPU { + static void launch( + OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b, + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair, + Tensor* out) { + functor::MatMulFunctor<CPUDevice, T>()(ctx->eigen_device<CPUDevice>(), + out->matrix<T>(), a.matrix<T>(), + b.matrix<T>(), dim_pair); + } +}; + +template <typename T, bool USE_CUBLAS> +struct LaunchMatMul<CPUDevice, T, USE_CUBLAS> : public LaunchMatMulCPU<T> {}; + +#if GOOGLE_CUDA + +template <typename T> +struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> { + static void launch( + OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b, + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair, + Tensor* out) { + perftools::gputools::blas::Transpose trans[] = { + perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose}; + const uint64 m = a.dim_size(1 - dim_pair[0].first); + const uint64 k = a.dim_size(dim_pair[0].first); + const uint64 n = b.dim_size(1 - dim_pair[0].second); + bool transpose_a = dim_pair[0].first == 0; + bool transpose_b = dim_pair[0].second == 1; + auto blas_transpose_a = trans[transpose_a]; + auto blas_transpose_b = trans[transpose_b]; + + auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream(); + OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available.")); + + auto a_ptr = AsDeviceMemory(a.template flat<T>().data()); + auto b_ptr = AsDeviceMemory(b.template flat<T>().data()); + auto c_ptr = AsDeviceMemory(out->template flat<T>().data()); + + // Cublas does + // C = A x B + // where A, B and C are assumed to be in column major. + // We want the output to be in row-major, so we can compute + // C' = B' x A' (' stands for transpose) + bool blas_launch_status = + stream->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f, + b_ptr, transpose_b ? k : n, a_ptr, + transpose_a ? 
m : k, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + ctx->SetStatus(errors::Internal( + "Blas SGEMM launch failed : a.shape=(", a.dim_size(0), ", ", + a.dim_size(1), "), b.shape=(", b.dim_size(0), ", ", b.dim_size(1), + "), m=", m, ", n=", n, ", k=", k)); + } + } +}; + +template <typename T> +struct LaunchMatMul<GPUDevice, T, false /* USE_CUBLAS */> { + static void launch( + OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b, + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair, + Tensor* out) { + functor::MatMulFunctor<GPUDevice, T>()(ctx->eigen_device<GPUDevice>(), + out->matrix<T>(), a.matrix<T>(), + b.matrix<T>(), dim_pair); + } +}; + +#endif // GOOGLE_CUDA + +template <typename Device, typename T, bool USE_CUBLAS> +class MatMulOp : public OpKernel { + public: + explicit MatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& a = ctx->input(0); + const Tensor& b = ctx->input(1); + + // Check that the dimensions of the two matrices are valid. + OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()), + errors::InvalidArgument("In[0] is not a matrix")); + OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()), + errors::InvalidArgument("In[1] is not a matrix")); + Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; + dim_pair[0].first = transpose_a_ ? 0 : 1; + dim_pair[0].second = transpose_b_ ? 1 : 0; + + OP_REQUIRES(ctx, + a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second), + errors::InvalidArgument("Matrix size-compatible: In[0]: ", + a.shape().DebugString(), ", In[1]: ", + b.shape().DebugString())); + int a_dim_remaining = 1 - dim_pair[0].first; + int b_dim_remaining = 1 - dim_pair[0].second; + TensorShape out_shape( + {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)}); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); + + if (out->NumElements() == 0) { + // If a has shape [0, x] or b has shape [x, 0], the output shape + // is a 0-element matrix, so there is nothing to do. + return; + } + + if (a.NumElements() == 0 || b.NumElements() == 0) { + // If a has shape [x, 0] and b has shape [0, y], the + // output shape is [x, y] where x and y are non-zero, so we fill + // the output with zeros. + functor::SetZeroFunctor<Device, T> f; + f(ctx->eigen_device<Device>(), out->flat<T>()); + return; + } + + LaunchMatMul<Device, T, USE_CUBLAS>::launch(ctx, this, a, b, dim_pair, out); + } + + private: + bool transpose_a_; + bool transpose_b_; +}; + +namespace functor { + +// Partial specialization MatMulFunctor<Device=CPUDevice, T>. 
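The cuBLAS launch above leans on the identity (A*B)' = B'*A': a row-major buffer reinterpreted as column-major is the transpose of the same matrix, so asking a column-major GEMM for B'*A' writes the row-major A*B straight into the output buffer, with no copies and no explicit transposes. A small host-side sketch of that bookkeeping, with naive loops standing in for cuBLAS (hypothetical helper names, illustrative only):

#include <vector>

// Naive column-major GEMM: C = X * Y, X is (rows x inner), Y is (inner x cols),
// all buffers column-major. Stands in for the ThenBlasGemm call above.
void GemmColMajor(int rows, int cols, int inner, const std::vector<float>& x,
                  const std::vector<float>& y, std::vector<float>* c) {
  for (int j = 0; j < cols; ++j) {
    for (int i = 0; i < rows; ++i) {
      float sum = 0.f;
      for (int p = 0; p < inner; ++p) {
        sum += x[p * rows + i] * y[j * inner + p];  // column-major indexing
      }
      (*c)[j * rows + i] = sum;
    }
  }
}

// Row-major C (m x n) = A (m x k) * B (k x n): the row-major buffers viewed as
// column-major are exactly A' and B', so computing B' * A' column-major fills
// c with the row-major product.
void MatMulRowMajor(int m, int k, int n, const std::vector<float>& a,
                    const std::vector<float>& b, std::vector<float>* c) {
  GemmColMajor(/*rows=*/n, /*cols=*/m, /*inner=*/k, b, a, c);
}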
+template <typename T> +struct MatMulFunctor<CPUDevice, T> { + void operator()( + const CPUDevice& d, typename MatMulTypes<T>::out_type out, + typename MatMulTypes<T>::in_type in0, + typename MatMulTypes<T>::in_type in1, + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) { + MatMul<CPUDevice>(d, out, in0, in1, dim_pair); + } +}; + +} // end namespace functor + +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ + MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>); \ + REGISTER_KERNEL_BUILDER( \ + Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("eigen"), \ + MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>) + +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("MatMul").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ + MatMulOp<GPUDevice, T, true /* cublas, true by default */>); \ + REGISTER_KERNEL_BUILDER(Name("MatMul") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<T>("T") \ + .Label("cublas"), \ + MatMulOp<GPUDevice, T, true /* cublas */>); \ + REGISTER_KERNEL_BUILDER( \ + Name("MatMul").Device(DEVICE_GPU).TypeConstraint<T>("T").Label("eigen"), \ + MatMulOp<GPUDevice, T, false /* cublas */>) + +REGISTER_CPU(float); +REGISTER_CPU(double); +REGISTER_CPU(int32); +REGISTER_CPU(complex64); +#if GOOGLE_CUDA +REGISTER_GPU(float); +// REGISTER_GPU(double); +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h new file mode 100644 index 0000000000..f75b0ded1b --- /dev/null +++ b/tensorflow/core/kernels/matmul_op.h @@ -0,0 +1,40 @@ +#ifndef TENSORFLOW_KERNELS_MATMUL_OP_H_ +#define TENSORFLOW_KERNELS_MATMUL_OP_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Helpers to define tensor<T> needed by MatMul op. +template <typename T> +struct MatMulTypes { + typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned> + out_type; + typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>, + Eigen::Aligned> in_type; +}; + +template <typename Device, typename In0, typename In1, typename Out, + typename DimPair> +void MatMul(const Device& d, Out out, In0 in0, In1 in1, + const DimPair& dim_pair) { + out.device(d) = in0.contract(in1, dim_pair); +} + +template <typename Device, typename T> +struct MatMulFunctor { + // Computes on device "d": out = in0 * in1, where * is matrix + // multiplication. 
+ void operator()( + const Device& d, typename MatMulTypes<T>::out_type out, + typename MatMulTypes<T>::in_type in0, + typename MatMulTypes<T>::in_type in1, + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_KERNELS_MATMUL_OP_H_ diff --git a/tensorflow/core/kernels/matmul_op_gpu.cu.cc b/tensorflow/core/kernels/matmul_op_gpu.cu.cc new file mode 100644 index 0000000000..17107ce5df --- /dev/null +++ b/tensorflow/core/kernels/matmul_op_gpu.cu.cc @@ -0,0 +1,32 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/matmul_op.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +// Partial specialization MatMulTensorFunctor<Device=GPUDevice, T> +template <typename T> +struct MatMulFunctor<GPUDevice, T> { + void operator()( + const GPUDevice& d, typename MatMulTypes<T>::out_type out, + typename MatMulTypes<T>::in_type in0, + typename MatMulTypes<T>::in_type in1, + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) { + MatMul<GPUDevice>(d, To32Bit(out), To32Bit(in0), To32Bit(in1), dim_pair); + } +}; + +#define DEFINE(T) template struct MatMulFunctor<GPUDevice, T>; +DEFINE(float); +// DEFINE(double); // Does not compile 1/2015. +#undef DEFINE + +} // end namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/matmul_op_test.cc b/tensorflow/core/kernels/matmul_op_test.cc new file mode 100644 index 0000000000..b2b8f3d905 --- /dev/null +++ b/tensorflow/core/kernels/matmul_op_test.cc @@ -0,0 +1,56 @@ +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include <gtest/gtest.h> + +namespace tensorflow { + +static Graph* Matmul(int m, int k, int n, bool transpose_a, bool transpose_b) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor in0(DT_FLOAT, transpose_a ? TensorShape({k, m}) : TensorShape({m, k})); + in0.flat<float>().setRandom(); + Tensor in1(DT_FLOAT, transpose_b ? 
TensorShape({n, k}) : TensorShape({k, n})); + in1.flat<float>().setRandom(); + test::graph::Matmul(g, test::graph::Constant(g, in0), + test::graph::Constant(g, in1), transpose_a, transpose_b); + return g; +} + +#define BM_MatmulDev(M, K, N, TA, TB, DEVICE) \ + static void BM_Matmul##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE( \ + int iters) { \ + testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \ + test::Benchmark(#DEVICE, Matmul(M, K, N, TA, TB)).Run(iters); \ + } \ + BENCHMARK(BM_Matmul##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE); + +#define BM_Matmul(M, K, N, TA, TB) \ + BM_MatmulDev(M, K, N, TA, TB, cpu); \ + BM_MatmulDev(M, K, N, TA, TB, gpu); + +// Typical fully connected layers +BM_Matmul(8, 512, 512, false, false); +BM_Matmul(16, 512, 512, false, false); +BM_Matmul(128, 512, 512, false, false); + +BM_Matmul(8, 1024, 1024, false, false); +BM_Matmul(16, 1024, 1024, false, false); +BM_Matmul(128, 1024, 1024, false, false); +BM_Matmul(4096, 4096, 4096, false, false); + +// Backward for fully connected layers +BM_Matmul(8, 1024, 1024, false, true); +BM_Matmul(16, 1024, 1024, false, true); +BM_Matmul(128, 1024, 1024, false, true); + +// Forward softmax with large output size +BM_Matmul(8, 200, 10000, false, false); +BM_Matmul(20, 200, 10000, false, false); +BM_Matmul(20, 200, 20000, false, false); + +// Backward softmax with large output size +BM_Matmul(8, 10000, 200, false, true); +BM_Matmul(20, 10000, 200, false, true); +BM_Matmul(20, 20000, 200, false, true); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc new file mode 100644 index 0000000000..ad0948d6ef --- /dev/null +++ b/tensorflow/core/kernels/matrix_inverse_op.cc @@ -0,0 +1,64 @@ +// See docs in ../ops/linalg_ops.cc. +#include <cmath> + +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/Eigen/LU" + +namespace tensorflow { + +template <class Scalar, bool SupportsBatchOperationT> +class MatrixInverseOp + : public LinearAlgebraOp<Scalar, SupportsBatchOperationT> { + public: + explicit MatrixInverseOp(OpKernelConstruction* context) + : LinearAlgebraOp<Scalar, SupportsBatchOperationT>(context) {} + ~MatrixInverseOp() override {} + + TensorShape GetOutputMatrixShape( + const TensorShape& input_matrix_shape) override { + return input_matrix_shape; + } + + int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override { + const int64 rows = input_matrix_shape.dim_size(0); + if (rows > (1LL << 20)) { + // A big number to cap the cost in case overflow. + return kint32max; + } else { + return rows * rows * rows; + } + } + + using typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::MatrixMap; + using + typename LinearAlgebraOp<Scalar, SupportsBatchOperationT>::ConstMatrixMap; + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input, + MatrixMap* output) override { + OP_REQUIRES(context, input.rows() == input.cols(), + errors::InvalidArgument("Input matrix must be square.")); + if (input.rows() == 0) { + // By definition, an empty matrix's inverse is an emptry matrix. 
+ return; + } + Eigen::FullPivLU<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> lu_decomposition(input); + OP_REQUIRES(context, lu_decomposition.isInvertible(), + errors::InvalidArgument("Input is not invertible.")); + *output = lu_decomposition.inverse(); + } +}; + +REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<float, false>), float); +REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<double, false>), double); +REGISTER_LINALG_OP("BatchMatrixInverse", (MatrixInverseOp<float, true>), float); +REGISTER_LINALG_OP("BatchMatrixInverse", (MatrixInverseOp<double, true>), + double); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc new file mode 100644 index 0000000000..31046018c5 --- /dev/null +++ b/tensorflow/core/kernels/maxpooling_op.cc @@ -0,0 +1,554 @@ +// See docs in ../ops/nn_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/maxpooling_op.h" + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/pooling_ops_common.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/util/use_cudnn.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +#if GOOGLE_CUDA +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/core/kernels/maxpooling_op_gpu.h" +#include "tensorflow/core/kernels/pooling_ops_common_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +const int kInvalidMaxPoolingIndex = -1; + +template <typename Device, typename T> +struct SpatialMaxPoolWithArgMaxHelper { + static void Compute(Tensor* output, Tensor* output_arg_max, + const Tensor& tensor_in, const PoolParameters& params, + const Padding& padding) { + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + ConstEigenMatrixMap; + typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + EigenMatrixMap; + typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>> + EigenIndexMatrixMap; + + ConstEigenMatrixMap in_mat( + tensor_in.flat<T>().data(), params.depth, + params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch); + EigenMatrixMap out_mat( + output->flat<T>().data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + EigenIndexMatrixMap out_arg_max_mat( + output_arg_max->flat<int64>().data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + + // Initializes the output tensor with MIN<T>. + output_arg_max->flat<int64>().setConstant(kInvalidMaxPoolingIndex); + output->flat<T>().setConstant(Eigen::NumTraits<T>::lowest()); + + // The following code basically does the following: + // 1. Flattens the input and output tensors into two dimensional arrays. + // tensor_in_as_matrix: + // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) + // output_as_matrix: + // depth by (out_width * out_height * tensor_in_batch) + // + // 2. 
Walks through the set of columns in the flattened tensor_in_as_matrix, + // and updates the corresponding column(s) in output_as_matrix with the + // max value. + for (int b = 0; b < params.tensor_in_batch; ++b) { + for (int h = 0; h < params.tensor_in_rows; ++h) { + for (int w = 0; w < params.tensor_in_cols; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + const int hpad = h + params.pad_rows; + const int wpad = w + params.pad_cols; + const int h_start = + (hpad < params.window_rows) + ? 0 + : (hpad - params.window_rows) / params.row_stride + 1; + const int h_end = + std::min(hpad / params.row_stride + 1, params.out_height); + const int w_start = + (wpad < params.window_cols) + ? 0 + : (wpad - params.window_cols) / params.col_stride + 1; + const int w_end = + std::min(wpad / params.col_stride + 1, params.out_width); + // compute elementwise max + const int in_index = + (b * params.tensor_in_rows + h) * params.tensor_in_cols + w; + for (int ph = h_start; ph < h_end; ++ph) { + for (int pw = w_start; pw < w_end; ++pw) { + const int out_index = + (b * params.out_height + ph) * params.out_width + pw; + /// NOTES(zhengxq): not using the eigen matrix operation for now. + /// May consider parallelizing the operations if needed. + for (int d = 0; d < params.depth; ++d) { + const T& input_ref = in_mat.coeffRef(d, in_index); + T& output_ref = out_mat.coeffRef(d, out_index); + int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index); + if (output_ref < input_ref || + out_arg_max_ref == kInvalidMaxPoolingIndex) { + output_ref = input_ref; + int input_offset = in_index * params.depth + d; + out_arg_max_ref = input_offset; + } + } + } + } + } + } + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU), + MaxPoolingOp<CPUDevice, float>); + +#if GOOGLE_CUDA +// Forward declarations for the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \ + const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \ + typename TTypes<T, 4>::ConstTensor input, int window_rows, \ + int window_cols, int row_stride, int col_stride, \ + const Eigen::PaddingType& padding); \ + extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>; + +DECLARE_GPU_SPEC(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +// Note(jiayq): Currently, the Caffe custom implementation is faster than the +// default Eigen implementation so we are using the custom kernel as the +// default. However, you can explicitly invoke the eigen version using +// kernel_label_map. +REGISTER_KERNEL_BUILDER(Name("MaxPool") + .Device(DEVICE_GPU) + .Label("eigen_tensor"), + MaxPoolingOp<Eigen::GpuDevice, float>); +#endif // GOOGLE_CUDA + +// The operation to compute MaxPool gradients. +// It takes three inputs: +// - The original input tensor +// - The original output tensor +// - Backprop tensor for output +// It produces one output: backprop tensor for input. 
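The MaxPoolingGradOp that follows recomputes the forward pass through SpatialMaxPoolWithArgMaxHelper to recover, for every output element, the flattened index of the input element that produced the max, and then routes each output gradient back to that position. A minimal sketch of that scatter over flat arrays (standalone illustration, not the kernel itself):

#include <cstdint>
#include <vector>

// argmax[i] is the flattened input index that won the max for output i
// (as filled in by SpatialMaxPoolWithArgMaxHelper above); grad_in must be
// zero-initialized before the scatter.
void ScatterMaxPoolGrad(const std::vector<float>& grad_out,
                        const std::vector<int64_t>& argmax,
                        std::vector<float>* grad_in) {
  for (size_t i = 0; i < grad_out.size(); ++i) {
    (*grad_in)[argmax[i]] += grad_out[i];
  }
}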
+template <class Device, class T> +class MaxPoolingGradOp : public OpKernel { + public: + explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + OP_REQUIRES( + context, ksize_[3] == 1 && stride_[3] == 1, + errors::Unimplemented( + "MaxPoolingGrad is not yet supported on the depth dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + const Tensor& tensor_out = context->input(1); + const Tensor& out_backprop = context->input(2); + + // For maxpooling, tensor_in should have 4 dimensions. + OP_REQUIRES(context, tensor_in.dims() == 4, + errors::InvalidArgument("tensor_in must be 4-dimensional")); + OP_REQUIRES(context, tensor_out.dims() == 4, + errors::InvalidArgument("tensor_out must be 4-dimensional")); + // For maxpooling, out_backprop should have 4 dimensions. + OP_REQUIRES(context, out_backprop.dims() == 4, + errors::InvalidArgument("out_backprop must be 4-dimensional")); + + TensorShape output_shape = tensor_in.shape(); + + // Tensor index_tensor(context->allocator(), DT_INT32, output_shape); + + Tensor tensor_out_dup; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + tensor_out.shape(), &tensor_out_dup)); + Tensor tensor_out_arg_max; + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(), + tensor_out.shape(), + &tensor_out_arg_max)); + + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + output->flat<T>().setZero(); + + SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>::Compute( + &tensor_out_dup, &tensor_out_arg_max, tensor_in, params, padding_); + auto out_backprop_flat = out_backprop.flat<T>(); + auto input_backprop_flat = output->flat<T>(); + auto out_arg_max_flat = tensor_out_arg_max.flat<int64>(); + int num_total_outputs = out_backprop.flat<T>().size(); + int num_total_inputs = input_backprop_flat.size(); + + for (int index = 0; index < num_total_outputs; ++index) { + int input_backprop_index = out_arg_max_flat(index); + // Although this check is in the inner loop, it is worth its value + // so we don't end up with memory corruptions. 
Our benchmark shows that + // the performance impact is quite small + CHECK(input_backprop_index >= 0 && + input_backprop_index < num_total_inputs) + << "Invalid input backprop index: " << input_backprop_index << ", " + << num_total_inputs; + input_backprop_flat(input_backprop_index) += out_backprop_flat(index); + } + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU), + MaxPoolingGradOp<CPUDevice, float>); + +#ifdef GOOGLE_CUDA + +static void MaxPoolingBackwardCustomKernel( + OpKernelContext* context, const std::vector<int32>& size, + const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in, + const Tensor& out_backprop, const TensorShape& tensor_in_shape) { + Tensor* output = nullptr; + + OP_REQUIRES_OK(context, + context->allocate_output(0, tensor_in_shape, &output)); + + PoolParameters params{context, size, stride, padding, tensor_in_shape}; + if (!context->status().ok()) { + return; + } + + MaxPoolBackwardNoMask( + tensor_in->flat<float>().data(), params.tensor_in_batch, + params.tensor_in_rows, params.tensor_in_cols, params.depth, + params.out_height, params.out_width, params.window_rows, + params.window_cols, params.row_stride, params.col_stride, params.pad_rows, + params.pad_cols, out_backprop.flat<float>().data(), + output->flat<float>().data(), context->eigen_device<Eigen::GpuDevice>()); +} + +template <class T> +class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel { + public: + typedef Eigen::GpuDevice Device; + + explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + + use_dnn_ = CanUseCudnn(); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + const Tensor& tensor_out = context->input(1); + const Tensor& out_backprop = context->input(2); + + // For maxpooling, tensor_in should have 4 dimensions. + OP_REQUIRES(context, tensor_in.dims() == 4, + errors::InvalidArgument("tensor_in must be 4-dimensional 4")); + OP_REQUIRES(context, tensor_out.dims() == 4, + errors::InvalidArgument("tensor_out must be 4-dimensional")); + // For maxpooling, out_backprop should have 4 dimensions. 
+ OP_REQUIRES(context, out_backprop.dims() == 4, + errors::InvalidArgument("out_backprop must be 4-dimensional")); + + TensorShape output_shape = tensor_in.shape(); + + if (use_dnn_) { + DnnPoolingGradOp<T>::Compute( + context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_, + stride_, padding_, &tensor_in, &tensor_out, out_backprop, + output_shape); + } else { + MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_, + &tensor_in, out_backprop, output_shape); + } + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; + bool use_dnn_; +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU), + MaxPoolingGradOp<Eigen::GpuDevice, float>); + +#endif // GOOGLE_CUDA + +template <typename Device, typename T> +struct LaunchMaxPoolingNoMask; + +template <typename Device, typename T> +class MaxPoolingNoMaskOp : public OpKernel { + public: + explicit MaxPoolingNoMaskOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + TensorShape out_shape({params.tensor_in_batch, params.out_height, + params.out_width, params.depth}); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + + LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in, + output); + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +template <typename Device, typename T> +struct LaunchMaxPoolingWithArgmax; + +template <typename Device, typename T> +class MaxPoolingWithArgmaxOp : public OpKernel { + public: + explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + TensorShape out_shape({params.tensor_in_batch, params.out_height, + params.out_width, params.depth}); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, 
&output)); + Tensor* argmax = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax)); + + LaunchMaxPoolingWithArgmax<Device, T>::launch(context, params, tensor_in, + output, argmax); + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +template <typename Device, typename T> +struct LaunchMaxPoolingGradWithArgmax; + +template <typename Device, typename T> +class MaxPoolingGradWithArgmaxOp : public OpKernel { + public: + explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + const Tensor& grad_in = context->input(1); + const Tensor& argmax = context->input(2); + + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows, + params.tensor_in_cols, params.depth}); + Tensor* grad_out = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &grad_out)); + + LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in, + argmax, grad_out); + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +#if GOOGLE_CUDA + +template <typename T> +struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> { + static void launch(OpKernelContext* context, const PoolParameters& params, + const Tensor& input, Tensor* output) { + bool status = MaxPoolForwardWithOptionalArgmax( + input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, + params.tensor_in_cols, params.depth, params.out_height, + params.out_width, params.window_rows, params.window_cols, + params.row_stride, params.col_stride, params.pad_rows, params.pad_cols, + output->flat<T>().data(), nullptr, context->eigen_gpu_device()); + if (!status) { + context->SetStatus( + errors::Internal("Failed launching MaxPoolForwardNoMask")); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU), + MaxPoolingNoMaskOp<Eigen::GpuDevice, float>); + +template <typename T> +struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> { + static void launch(OpKernelContext* context, const PoolParameters& params, + const Tensor& input, Tensor* output, Tensor* argmax) { + bool status = MaxPoolForwardWithOptionalArgmax( + input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, + params.tensor_in_cols, params.depth, params.out_height, + params.out_width, params.window_rows, params.window_cols, + params.row_stride, params.col_stride, params.pad_rows, params.pad_cols, + output->flat<T>().data(), + reinterpret_cast<int64*>(argmax->flat<int64>().data()), + context->eigen_gpu_device()); + if (!status) { + context->SetStatus( + errors::Internal("Failed launching MaxPoolForwardWithArgmax")); + } + } +}; + 
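For the argmax-based backward pass registered below, the GPU mask stores per-image flattened indices of the form (h * width + w) * channels + c (see MaxPoolForwardNHWC in the _gpu.cu.cc file), so the gradient scatter has to add a per-image offset before accumulating. A host-side sketch of what LaunchMaxPoolingGradWithArgmax and the MaxPoolBackward kernel compute (illustrative helper only):

#include <cstdint>
#include <vector>

// grad_in  : N * out_height * out_width * depth output gradients
// argmax   : same size, per-image indices (h * W + w) * C + c
// top_offset    = out_height * out_width * depth  (per-image output stride)
// bottom_offset = in_height * in_width * depth    (per-image input stride)
// grad_out : N * in_height * in_width * depth, zero-initialized by the caller
void MaxPoolGradWithArgmaxSketch(const std::vector<float>& grad_in,
                                 const std::vector<int64_t>& argmax,
                                 int top_offset, int bottom_offset,
                                 std::vector<float>* grad_out) {
  for (size_t i = 0; i < grad_in.size(); ++i) {
    const size_t image_id = i / top_offset;
    (*grad_out)[image_id * bottom_offset + argmax[i]] += grad_in[i];
  }
}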
+REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") + .Device(DEVICE_GPU) + .TypeConstraint<int64>("Targmax"), + MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>); + +template <typename T> +struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> { + static void launch(OpKernelContext* context, const PoolParameters& params, + const Tensor& grad_in, const Tensor& argmax, + Tensor* grad_out) { + const int input_size = params.tensor_in_batch * params.tensor_in_rows * + params.tensor_in_cols * params.depth; + const int output_size = params.tensor_in_batch * params.out_height * + params.out_width * params.depth; + const int top_offset = params.out_height * params.out_width * params.depth; + const int bottom_offset = + params.tensor_in_rows * params.tensor_in_cols * params.depth; + bool status = MaxPoolBackwardWithArgmax( + output_size, input_size, grad_in.flat<T>().data(), + reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset, + bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device()); + if (!status) { + context->SetStatus( + errors::Internal("Failed launching MaxPoolForwardWithArgmax")); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax") + .Device(DEVICE_GPU) + .TypeConstraint<int64>("Targmax"), + MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>); + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/maxpooling_op.h b/tensorflow/core/kernels/maxpooling_op.h new file mode 100644 index 0000000000..a074174118 --- /dev/null +++ b/tensorflow/core/kernels/maxpooling_op.h @@ -0,0 +1,29 @@ +#ifndef TENSORFLOW_KERNELS_MAXPOOLING_OP_H_ +#define TENSORFLOW_KERNELS_MAXPOOLING_OP_H_ +// Functor definition for MaxPoolingOp, must be compilable by nvcc. + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T> +struct SpatialMaxPooling { + void operator()(const Device& d, typename TTypes<T, 4>::Tensor output, + typename TTypes<T, 4>::ConstTensor input, int window_rows, + int window_cols, int row_stride, int col_stride, + const Eigen::PaddingType& padding) { + // Because we swap the layout, we swap the row/cols as well + output.swap_layout().device(d) = + Eigen::SpatialMaxPooling(input.swap_layout(), window_cols, window_rows, + col_stride, row_stride, padding); + } +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_MAXPOOLING_OP_H_ diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc new file mode 100644 index 0000000000..65262eb54e --- /dev/null +++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc @@ -0,0 +1,261 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/maxpooling_op.h" +#include "tensorflow/core/kernels/maxpooling_op_gpu.h" + +namespace tensorflow { +namespace { +// This is Yangqing's custom kernel for the maxpooling operation. There are +// three functions: MaxPoolForwardNCHW and MaxPoolForwardNHWC are the two +// forward functions, dealing with the forward case. MaxPoolBackward is the +// backward function that deals with the backward case for both storage orders. 
+// The parameters to the kernels in the forward function is as follows: +// nthreads: the number of threads, which is equal to the output size. +// bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items. +// height, width, pooled_height, pooled_width: the input and output sizes. +// kernel_h, kernel_w: the kernel sizes. +// stride_h, stride_w: the strides. +// pad_t, pad_l: the padding values on the top and left side. +// top_data: the maxpool output. +// mask: the output mask of the same size as top_data. It is stored in +// int form, keeping track of the flattened index of the input item that +// produces the max output. If a nullptr is passed in for mask, no mask +// will be produced. +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); i += blockDim.x * gridDim.x) + +// To call the forward and backward functions, use e.g.: +// const int kThreadsPerBlock = 1024 +// const int output_size = batch * channels * pooled_height * pooled_width; +// MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, +// kThreadsPerBlock, 0, cuda_stream>>>(...); +template <typename dtype> +__global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data, + const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, + const int pad_l, dtype* top_data, + int64* mask) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_t; + int wstart = pw * stride_w - pad_l; + int hend = min(hstart + kernel_h, height); + int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dtype maxval = -FLT_MAX; + int maxidx = -1; + const dtype* bottom_data_n = bottom_data + n * channels * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int idx = c * height * width + h * width + w; + if (bottom_data_n[idx] > maxval) { + maxidx = idx; + maxval = bottom_data_n[idx]; + } + } + } + top_data[index] = maxval; + if (mask != nullptr) { + mask[index] = maxidx; + } + } +} + +template <typename dtype> +__global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data, + const int height, const int width, + const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, + const int pad_l, dtype* top_data, + int64* mask) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index; + int c = n % channels; + n /= channels; + int wstart = (n % pooled_width) * stride_w - pad_l; + n /= pooled_width; + int hstart = (n % pooled_height) * stride_h - pad_t; + n /= pooled_height; + int hend = min(hstart + kernel_h, height); + int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dtype maxval = -FLT_MAX; + int maxidx = -1; + const dtype* bottom_data_n = bottom_data + n * height * width * channels; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int idx = (h * width + w) * channels + c; + if (bottom_data_n[idx] > maxval) { + maxidx = idx; + maxval = bottom_data_n[idx]; + } + } + } + top_data[index] = maxval; + if 
(mask != nullptr) { + mask[index] = maxidx; + } + } +} + +template <typename dtype> +__global__ void MaxPoolBackwardNoMaskNHWC( + const int nthreads, const dtype* bottom_data, const int height, + const int width, const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_t, const int pad_l, + const dtype* top_diff, dtype* bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // First find out the index to the maximum, since we have no mask. + int n = index; + int c = n % channels; + n /= channels; + int wstart = (n % pooled_width) * stride_w - pad_l; + n /= pooled_width; + int hstart = (n % pooled_height) * stride_h - pad_t; + n /= pooled_height; + int hend = min(hstart + kernel_h, height); + int wend = min(wstart + kernel_w, width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dtype maxval = -FLT_MAX; + int maxidx = -1; + const dtype* bottom_data_n = bottom_data + n * height * width * channels; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int idx = (h * width + w) * channels + c; + if (bottom_data_n[idx] > maxval) { + maxidx = idx; + maxval = bottom_data_n[idx]; + } + } + } + + // Atomically accumulate the bottom diff. The index could still be + // uninitialized, if all the bottom_data are NaN. + if (maxidx != -1) { + atomicAdd(bottom_diff + n * height * width * channels + maxidx, + top_diff[index]); + } + } +} + +// The parameters to the kernels in the backward function is as follows: +// nthreads: the number of threads, which is equal to the output size. +// top_diff: the gradient of the output data, of size N*Hout*Wout*C (or +// N*C*Hout*Wout). As we have stored the flattened index of the input +// entries, the backward function is agnostic of the input storage order. +// mask: the output mask of the same size as top_data. It is stored in +// int form, keeping track of the flattened index of the input item that +// produces the max output. +// top_offset: the pre-computed per-image offset of the maxpool output. This +// is equal to Hout*Wout*C. We choose to pre-compute this so we do not +// need to compute it every time inside the kernel. +// bottom_offset: the pre-computed per-image offset of the maxpool input. +// This is equal to H*W*C. +// bottom_diff: the gradient with respect to the input. +// This function relies on atomicAdd to avoid race conditions. Also, before the +// kernel is run, you will need to make sure that bottom_diff is filled with +// zero first. 
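All of these kernels are launched with the one-dimensional configuration sketched in the comment near the top of the file: one thread per output element, kThreadsPerBlock = 1024 threads per block, and a ceiling-divided block count, with CUDA_1D_KERNEL_LOOP striding by blockDim.x * gridDim.x so correctness does not depend on the grid covering every index in a single pass. The block-count arithmetic, with an illustrative shape (the numbers are an example, not taken from this change):

#include <cstdint>

// Ceil-division used for the <<<blocks, threads>>> launches in this file.
inline int NumBlocks(int64_t num_elements, int threads_per_block) {
  return static_cast<int>((num_elements + threads_per_block - 1) /
                          threads_per_block);
}

// Example: a 32 x 112 x 112 x 64 pooled output has
//   num_elements = 32 * 112 * 112 * 64 = 25,690,112
//   NumBlocks(num_elements, 1024) = 25,088 blocks of 1024 threads each.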
+template <typename dtype> +__global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff, + const int64* mask, const int top_offset, + const int bottom_offset, dtype* bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int image_id = (index / top_offset); + atomicAdd(bottom_diff + image_id * bottom_offset + mask[index], + top_diff[index]); + } +} + +template <typename dtype> +__global__ void SetZero(const int nthreads, dtype* bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { *(bottom_diff + index) = dtype(0); } +} + +#undef CUDA_1D_KERNEL_LOOP +} // namespace + +bool MaxPoolForwardWithOptionalArgmax( + const float* bottom_data, const int batch, const int height, + const int width, const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_t, const int pad_l, + float* top_data, int64* mask, const Eigen::GpuDevice& d) { + const int kThreadsPerBlock = 1024; + const int output_size = batch * channels * pooled_height * pooled_width; + + MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>( + output_size, bottom_data, height, width, channels, pooled_height, + pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l, + top_data, mask); + return d.ok(); +} + +bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch, + const int height, const int width, + const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, const int pad_l, + const float* top_diff, float* bottom_diff, + const Eigen::GpuDevice& d) { + const int kThreadsPerBlock = 1024; + const int bottom_size = batch * channels * height * width; + const int top_size = batch * channels * pooled_height * pooled_width; + + SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff); + + MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) / + kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>( + top_size, bottom_data, height, width, channels, pooled_height, + pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l, + top_diff, bottom_diff); + return d.ok(); +} + +bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size, + const float* top_diff, const int64* mask, + const int top_offset, const int bottom_offset, + float* bottom_diff, const Eigen::GpuDevice& d) { + const int kThreadsPerBlock = 1024; + SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff); + MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, + kThreadsPerBlock, 0, d.stream()>>>( + output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff); + return d.ok(); +} + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_GPU_KERNELS(T) \ + template struct functor::SpatialMaxPooling<GPUDevice, T>; + +DEFINE_GPU_KERNELS(float) + +#undef DEFINE_GPU_KERNELS + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h new file mode 100644 index 0000000000..bfdac904cc --- /dev/null +++ b/tensorflow/core/kernels/maxpooling_op_gpu.h @@ -0,0 +1,42 @@ +#if !GOOGLE_CUDA +#error This file must only be included when building with Cuda support +#endif + +#ifndef 
THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_ + +#define EIGEN_USE_GPU + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" + +namespace tensorflow { + +// Run the forward pass of max pooling, optionally writing the argmax indices to +// the mask array, if it is not nullptr. If mask is passed in as nullptr, the +// argmax indices are not written. +bool MaxPoolForwardWithOptionalArgmax( + const float* bottom_data, const int batch, const int height, + const int width, const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_t, const int pad_l, + float* top_data, int64* mask, const Eigen::GpuDevice& d); + +bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size, + const float* top_diff, const int64* mask, + const int top_offset, const int bottom_offset, + float* bottom_diff, const Eigen::GpuDevice& d); + +bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch, + const int height, const int width, + const int channels, const int pooled_height, + const int pooled_width, const int kernel_h, + const int kernel_w, const int stride_h, + const int stride_w, const int pad_t, const int pad_l, + const float* top_diff, float* bottom_diff, + const Eigen::GpuDevice& d); + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_ diff --git a/tensorflow/core/kernels/no_op.cc b/tensorflow/core/kernels/no_op.cc new file mode 100644 index 0000000000..b4f9df81a6 --- /dev/null +++ b/tensorflow/core/kernels/no_op.cc @@ -0,0 +1,8 @@ +#include "tensorflow/core/kernels/no_op.h" + +namespace tensorflow { + +REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_CPU), NoOp); +REGISTER_KERNEL_BUILDER(Name("NoOp").Device(DEVICE_GPU), NoOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/no_op.h b/tensorflow/core/kernels/no_op.h new file mode 100644 index 0000000000..a3bcbd7680 --- /dev/null +++ b/tensorflow/core/kernels/no_op.h @@ -0,0 +1,17 @@ +#ifndef TENSORFLOW_KERNELS_NO_OP_H_ +#define TENSORFLOW_KERNELS_NO_OP_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class NoOp : public OpKernel { + public: + explicit NoOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override {} + bool IsExpensive() override { return false; } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_NO_OP_H_ diff --git a/tensorflow/core/kernels/ops_testutil.cc b/tensorflow/core/kernels/ops_testutil.cc new file mode 100644 index 0000000000..7bea17b9e2 --- /dev/null +++ b/tensorflow/core/kernels/ops_testutil.cc @@ -0,0 +1,18 @@ +#include "tensorflow/core/kernels/ops_testutil.h" + +namespace tensorflow { +namespace test { + +NodeDef Node(const string& name, const string& op, + const std::vector<string>& inputs) { + NodeDef def; + def.set_name(name); + def.set_op(op); + for (const string& s : inputs) { + def.add_input(s); + } + return def; +} + +} // namespace test +} // namespace tensorflow diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h new file mode 100644 index 0000000000..7a3405bf04 --- /dev/null +++ b/tensorflow/core/kernels/ops_testutil.h @@ -0,0 +1,191 @@ +#ifndef TENSORFLOW_KERNELS_OPS_TESTUTIL_H_ +#define 
TENSORFLOW_KERNELS_OPS_TESTUTIL_H_ + +#include <memory> +#include <vector> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/env.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" +#include <gtest/gtest.h> + +namespace tensorflow { + +namespace test { + +// Return a NodeDef with the specified name/op/inputs. +NodeDef Node(const string& name, const string& op, + const std::vector<string>& inputs); + +} // namespace test + +// Helpful functions to test operators. +// +// This class will eventually be replaced / heavily modified +// to use the BrainClient interface. +class OpsTestBase : public ::testing::Test { + public: + OpsTestBase() : device_type_(DEVICE_CPU) { + device_.reset( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + CHECK(device_.get()) << "Could not create CPU device"; + } + + ~OpsTestBase() override { + gtl::STLDeleteElements(&tensors_); + context_.reset(nullptr); + } + + void set_node_def(const NodeDef& node_def) { node_def_.CopyFrom(node_def); } + + // Clients can manipulate the underlying NodeDef via this accessor. + NodeDef* node_def() { return &node_def_; } + + // Initializes an operator that takes in 'input_types' as input + // and output types as output. + // + // Returns the status of initialization. + Status InitOp() { + Status status; + kernel_ = CreateOpKernel(device_type_, device_.get(), allocator(), + node_def_, &status); + if (kernel_ != nullptr) input_types_ = kernel_->input_types(); + return status; + } + + // Adds an input for every element described by the shape. + // 'input_mapping' maps an index (0...NumElements(shape)) to a + // value. + // + // TODO(vrv): Replace with something like a BrainClient Feed. + template <typename T> + void AddInput(const TensorShape& shape, std::function<T(int)> input_mapping) { + CHECK_GT(input_types_.size(), inputs_.size()) + << "Adding more inputs than types; perhaps you need to call MakeOp"; + bool is_ref = IsRefType(input_types_[inputs_.size()]); + Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()), + DataTypeToEnum<T>::v(), shape); + test::FillFn(input, input_mapping); + tensors_.push_back(input); + if (is_ref) { + CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]), + DataTypeToEnum<T>::v()); + inputs_.push_back({&lock_for_refs_, input}); + } else { + CHECK_EQ(input_types_[inputs_.size()], DataTypeToEnum<T>::v()); + inputs_.push_back({nullptr, input}); + } + } + + // Like AddInput but takes in an explicit arrayslice of data. 
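Taken together with the AddInputFromArray overload that follows, a kernel test built on OpsTestBase follows the same pattern as the LRN tests earlier in this change: build a NodeDef, InitOp(), feed inputs, RunOpKernel(), then inspect GetOutput(). A schematic example using a hypothetical Relu test purely for illustration (the op choice and values are assumptions, not part of this change):

// Schematic OpsTestBase usage (illustrative only).
class ReluOpTest : public OpsTestBase {};

TEST_F(ReluOpTest, Basic) {
  ASSERT_OK(NodeDefBuilder("relu_op", "Relu")
                .Input(FakeInput(DT_FLOAT))
                .Finalize(node_def()));
  ASSERT_OK(InitOp());                 // builds the kernel from node_def_
  AddInputFromArray<float>(TensorShape({4}), {-1.0f, 0.0f, 0.5f, 2.0f});
  ASSERT_OK(RunOpKernel());            // runs Compute() on the CPU device
  Tensor expected(allocator(), DT_FLOAT, TensorShape({4}));
  test::FillValues<float>(&expected, {0.0f, 0.0f, 0.5f, 2.0f});
  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
}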
+ template <typename T> + void AddInputFromArray(const TensorShape& shape, + const gtl::ArraySlice<T>& data) { + CHECK_GT(input_types_.size(), inputs_.size()) + << "Adding more inputs than types; perhaps you need to call MakeOp"; + bool is_ref = IsRefType(input_types_[inputs_.size()]); + Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()), + DataTypeToEnum<T>::v(), shape); + test::FillValues<T>(input, data); + tensors_.push_back(input); + if (is_ref) { + CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]), + DataTypeToEnum<T>::v()); + inputs_.push_back({&lock_for_refs_, input}); + } else { + CHECK_EQ(input_types_[inputs_.size()], DataTypeToEnum<T>::v()); + inputs_.push_back({nullptr, input}); + } + } + + // Runs an operation producing 'num_outputs' outputs. + // + // Returns the context's status after running the operation. + Status RunOpKernel() { + OpKernelContext::Params params; + params.device = device_.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs_; + params.op_kernel = kernel_.get(); + params.output_alloc_attr = [this, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = + (kernel_->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper; + params.slice_reader_cache = &slice_reader_cache_wrapper; + + context_.reset(new OpKernelContext(params)); + device_->Compute(kernel_.get(), context_.get()); + return context_->status(); + } + + // Returns the tensor input for 'input_index'. + // + // REQUIRES: 0 <= input_index < context_->num_inputs() + const Tensor& GetInput(int input_index) const { + CHECK_LT(input_index, context_->num_inputs()); + CHECK(!IsRefType(context_->input_dtype(input_index))); + return context_->input(input_index); + } + + TensorValue mutable_input(int input_index) { + CHECK_LT(input_index, inputs_.size()); + return inputs_[input_index]; + } + // Returns the tensor output for 'output_index'. + // + // REQUIRES: 0 <= output_index < context_->num_outputs() + Tensor* GetOutput(int output_index) { + CHECK_LT(output_index, context_->num_outputs()); + return context_->mutable_output(output_index); + } + + Allocator* allocator() { + return device_->GetAllocator(AllocatorAttributes()); + } + + const DataTypeVector& output_types() const { return kernel_->output_types(); } + + protected: + std::unique_ptr<Device> device_; + + std::unique_ptr<OpKernel> kernel_; + NodeDef node_def_; + DataTypeVector input_types_; + DeviceType device_type_; + + mutex lock_for_refs_; // Used as the Mutex for inputs added as refs + + gtl::InlinedVector<TensorValue, 4> inputs_; + // Owns Tensors. + std::vector<Tensor*> tensors_; + + std::unique_ptr<OpKernelContext> context_; + + private: + TF_DISALLOW_COPY_AND_ASSIGN(OpsTestBase); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_OPS_TESTUTIL_H_ diff --git a/tensorflow/core/kernels/ops_util.cc b/tensorflow/core/kernels/ops_util.cc new file mode 100644 index 0000000000..ca2925128e --- /dev/null +++ b/tensorflow/core/kernels/ops_util.cc @@ -0,0 +1,113 @@ +#include <cmath> + +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/padding.h" + +namespace tensorflow { + +void RequireDefaultOps() { +// TODO(opensource): Use a more generic sounding preprocessor name than +// GOOGLE_CUDA (maybe SUPPORT_CUDA?) 
+#if GOOGLE_CUDA + void RequireGPUDevice(); + RequireGPUDevice(); +#endif +} + +Status Get2dOutputSize(const int in_height, const int in_width, + int filter_height, int filter_width, int row_stride, + int col_stride, Padding padding, int* new_height, + int* new_width, int* pad_rows, int* pad_cols) { + int pad_bottom_unused, pad_right_unused; + return Get2dOutputSizeVerbose( + in_height, in_width, filter_height, filter_width, row_stride, col_stride, + padding, new_height, new_width, pad_rows, &pad_bottom_unused, pad_cols, + &pad_right_unused); +} + +Status Get2dOutputSizeVerbose(const int in_height, const int in_width, + int filter_height, int filter_width, + int row_stride, int col_stride, Padding padding, + int* new_height, int* new_width, int* pad_top, + int* pad_bottom, int* pad_left, int* pad_right) { + // Cannot have strides larger than the patch size. + if (row_stride > filter_height || col_stride > filter_width) { + return errors::InvalidArgument( + "stride must be less than or equal to kernel size"); + } + switch (padding) { + case Padding::VALID: + *new_height = ceil((in_height - filter_height + 1.f) / + static_cast<float>(row_stride)); + *new_width = ceil((in_width - filter_width + 1.f) / + static_cast<float>(col_stride)); + *pad_top = 0; + *pad_bottom = 0; + *pad_left = 0; + *pad_right = 0; + break; + case Padding::SAME: + *new_height = ceil(in_height / static_cast<float>(row_stride)); + *new_width = ceil(in_width / static_cast<float>(col_stride)); + // Calculate padding for top/bottom/left/right, spilling any excess + // padding to bottom and right. + const int pad_needed_height = + (*new_height - 1) * row_stride + filter_height - in_height; + *pad_top = pad_needed_height / 2; + CHECK_GE(pad_needed_height, 0); + *pad_bottom = pad_needed_height - *pad_top; + + const int pad_needed_width = + (*new_width - 1) * col_stride + filter_width - in_width; + *pad_left = pad_needed_width / 2; + CHECK_GE(pad_needed_width, 0); + *pad_right = pad_needed_width - *pad_left; + break; + } + if (*new_height < 0 || *new_width < 0) { + return errors::InvalidArgument("computed output size would be negative"); + } + return Status::OK(); +} + +Eigen::PaddingType BrainPadding2EigenPadding(Padding padding) { + switch (padding) { + case Padding::VALID: + return Eigen::PADDING_VALID; + case Padding::SAME: + return Eigen::PADDING_SAME; + } + return Eigen::PADDING_SAME; // Prevent compiler warning about missing return +} + +Status GetBroadcastSize(const int index, const int in_size, + const int ksize, const int stride, + const int pad_size, int* bindex, int* bsize) { + // Cannot have strides larger than the patch size. + if (stride > ksize) { + return errors::InvalidArgument( + "stride must be less than or equal to kernel size"); + } + // Cannot have index beyond the input size. + if (index * stride > in_size) { + return errors::InvalidArgument( + "index * stride must be less than or equal to input size"); + } + *bindex = index * stride; + *bsize = ksize; + if (*bindex < pad_size) { + // If the current index is in the padding area, start broadcast from index + // 0 with broadcast size reduced by padding size. + *bsize = ksize + *bindex - pad_size; + *bindex = 0; + } else { + // Otherwise, start broadcast from current index reduced by padding size. 
+ *bindex -= pad_size; + } + if (*bindex + ksize > in_size) { + *bsize = std::min((in_size - *bindex), ksize); + } + return Status::OK(); +} +} // namespace tensorflow diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h new file mode 100644 index 0000000000..283338f8df --- /dev/null +++ b/tensorflow/core/kernels/ops_util.h @@ -0,0 +1,180 @@ +#ifndef TENSORFLOW_KERNELS_OPS_UTIL_H_ +#define TENSORFLOW_KERNELS_OPS_UTIL_H_ + +// This file contains utilities for various operations. + +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +// Call this function from a test if op kernels are not being +// registered. This can happen if the test is linked in a shared +// mode and has no direct references to any code from this directory. +void RequireDefaultOps(); + +// Get2dOutputSize(): Given an input tensor, kernel, stride and padding +// type, the function computes the output and padding dimensions. +// +// Convolution layers take in an input tensor of shape (D, C, R, B), and +// convolve it with a set of filters, which can also be presented as a +// tensor (D, K, K, M), where M is the number of filters, K is the filter size, +// and each 3-dimensional tensor of size (D, K, K) is a filter. For +// simplicity we assume that we always use square filters (which is usually the +// case in images). It also takes in a few additional parameters: +// +// Stride (S): the stride with which we apply the filters. This is the offset +// between locations where we apply the filters. A larger stride +// means that the output will be spatially smaller. +// +// Padding (P): the padding we apply to the input tensor along the R and C +// dimensions. This is usually used to make sure that the spatial dimension +// do not shrink when we progress with convolutions. Two types of padding are +// often used: +// SAME: the pad value is computed so that the output will have size R/S +// and C/S. +// VALID: no padding is carried out. +// The padded area is zero-filled. +// +// The output dimensions for convolution and many other operations, when given +// all the parameters above, are as follows: +// - When Padding = SAME: the output size is (B, R', C', M), where +// R' = ceil(float(R) / float(S)) +// C' = ceil(float(C) / float(S)) +// where ceil is the ceiling function. The number of padded rows and columns +// are computed as: +// Pr = ((R' - 1) * S + K - R) / 2 +// Pc = ((C' - 1) * S + K - C) / 2 +// When the stride is 1, we have the simplified case +// R'=R, C'=C, Pr=Pc=(K-1)/2. +// This is where SAME comes from - the output has the same size as the input +// has. +// +// - When Padding = VALID: the output size is computed as +// R' = ceil(float(R - K + 1) / float(S)) +// C' = ceil(float(C - K + 1) / float(S)) +// and the number of padded rows and columns are computed in the same way. +// When the stride is 1, we have the simplified case +// R'=R-K+1, C'=C-K+1, Pr=0, Pc=0. +// +// For convolution, mathematically, the output value at location (b, r', c', m) +// is the inner product of two vectors: the chunk of input at +// (b, (r'*S-Pr) : (r'*S-Pr+K), (c'*S-Pc) : (c'*S-Pc+K), :), +// and the filter at (m, :, :, :). 
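// Worked example of the formulas above, using the same case exercised in
// ops_util_test.cc: R = C = 3, K = 2, S = 2.
//   SAME:  R' = C' = ceil(3 / 2) = 2. The padding needed per dimension is
//          (R' - 1) * S + K - R = (2 - 1) * 2 + 2 - 3 = 1, so Pr = Pc = 0 and
//          the odd leftover row/column spills to the bottom/right
//          (pad_bottom = pad_right = 1 in the verbose variant below).
//   VALID: R' = C' = ceil((3 - 2 + 1) / 2) = 1, with no padding.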
+// +Status Get2dOutputSize(const int in_height, const int in_width, + int filter_height, int filter_width, int row_stride, + int col_stride, Padding padding, int* new_height, + int* new_width, int* pad_rows, int* pad_cols); + +// Returns the same output dimensions as in Get2dOutputSize, but returns verbose +// padding dimensions (top/bottom/left/right). Any excess padding (caused by +// an odd padding size value) is added to the 'pad_bottom' and 'pad_right' +// dimensions. +Status Get2dOutputSizeVerbose(const int in_height, const int in_width, + int filter_height, int filter_width, + int row_stride, int col_stride, Padding padding, + int* new_height, int* new_width, int* pad_top, + int* pad_bottom, int* pad_left, int* pad_right); + +// Calculates broadcast starting index and size. For SAME padding, addition +// padding could be applied to right, left, top and bottom. Depending on the +// current index, input size, kernel size, stride, padding size, the starting +// index and size for broadcast for that dimension are different from the +// current index and kernel size. +// This is mainly used by gradient algorithms for pooling operations. +Status GetBroadcastSize(const int index, const int in_size, + const int ksize, const int stride, + const int pad_size, int* bindex, int* bsize); + +// Converts Brain's Padding to Eigen's PaddingType. +Eigen::PaddingType BrainPadding2EigenPadding(Padding padding); + +// Given a shape 's' of a tensor of type T. Returns true iff the +// number of bytes occupied by each dim 0 (i.e., &tensor(i + 1, ...) - +// &tensor(i, ...)) is multiple of EIGEN_ALIGN_BYTES. +template <typename T> +bool IsInnerDimsSizeAligned(const TensorShape& s) { + if (s.dims() == 0) return false; + const int64 dim0_size = s.dim_size(0); + if (dim0_size == 0) return false; + const int64 bytes_per_dim0 = (s.num_elements() / dim0_size) * sizeof(T); + return bytes_per_dim0 % EIGEN_MAX_ALIGN_BYTES == 0; +} + +// Returns in 'col_data', image patches in storage order (height, width, depth) +// extracted from image at 'input_data', which is requred to be in storage +// order (batch, height, width, depth). +// Implementation written by Yangqing Jia (jiayq). +template <typename T> +void Im2col(const T* input_data, const int depth, const int height, + const int width, const int filter_h, const int filter_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, T* col_data) { + int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; + int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; + + int h_pad = -pad_t; + for (int h = 0; h < height_col; ++h) { + int w_pad = -pad_l; + for (int w = 0; w < width_col; ++w) { + for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { + for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { + if (ih >= 0 && ih < height && iw >= 0 && iw < width) { + memcpy(col_data, input_data + (ih * width + iw) * depth, + sizeof(T) * depth); + } else { + // This should be simply padded with zero. + memset(col_data, 0, sizeof(T) * depth); + } + col_data += depth; + } + } + w_pad += stride_w; + } + h_pad += stride_h; + } +} + +// Returns in 'im_data' image patch in storage order (height, width, depth), +// constructed from patches in 'col_data', which is required to be in storage +// order (out_height * out_width, filter_height, filter_width, in_depth). +// Implementation by Yangqing Jia (jiayq). 
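// A note on sizes, derived from the loops in Im2col above: 'col_data' holds
// one patch of filter_h * filter_w * depth values per output location, i.e.
// height_col * width_col patches in total, where
//   height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1
// and width_col is computed analogously. Col2im below acts as the adjoint of
// Im2col: wherever patches overlap (stride smaller than the filter size), the
// overlapping values are accumulated into 'im_data' rather than copied.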
+template <typename T> +void Col2im(const T* col_data, const int depth, const int height, + const int width, const int filter_h, const int filter_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, T* im_data) { + memset(im_data, 0, sizeof(T) * height * width * depth); + int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; + int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; + int h_pad = -pad_t; + for (int h = 0; h < height_col; ++h) { + int w_pad = -pad_l; + for (int w = 0; w < width_col; ++w) { + T* im_patch_data = im_data + (h_pad * width + w_pad) * depth; + for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { + for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { + if (ih >= 0 && ih < height && iw >= 0 && iw < width) { + // TODO(andydavis) Vectorize this loop (if compiler does not). + for (int i = 0; i < depth; ++i) { + im_patch_data[i] += col_data[i]; + } + } + im_patch_data += depth; + col_data += depth; + } + // Jump over remaining number of depth. + im_patch_data += depth * (width - filter_w); + } + w_pad += stride_w; + } + h_pad += stride_h; + } +} + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_OPS_UTIL_H_ diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc new file mode 100644 index 0000000000..bc4f57e220 --- /dev/null +++ b/tensorflow/core/kernels/ops_util_test.cc @@ -0,0 +1,265 @@ +#include "tensorflow/core/kernels/ops_util.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +class OpsUtilTest : public ::testing::Test { + protected: + OpsUtilTest() {} + ~OpsUtilTest() override {} + + // Padding structure. + struct padding_struct { + // Input parameters. + struct { + int in_height; + int in_width; + int filter_height; + int filter_width; + int row_stride; + int col_stride; + Padding padding; + } input; + // Output. + struct { + int new_height; + int new_width; + int pad_top; + int pad_bottom; + int pad_left; + int pad_right; + } output; + }; + + // Broadcast structure. + struct bcast_struct { + // Input parameters. + struct { + int index; // Current index. + int in_size; // Size of the dimension. + int ksize; // Kernel size. + int stride; // Stride. + int pad_size; // Padding size. + } input; + // Output. + struct { + int new_index; // New starting index. + int new_size; // New broadcast size. 
+ } output; + }; + + static void VerifyGet2dOutputSizeBoundaries(padding_struct pad_struct, + error::Code code) { + int new_height, new_width, pad_rows, pad_cols; + Status status = Get2dOutputSize( + pad_struct.input.in_height, pad_struct.input.in_width, + pad_struct.input.filter_height, pad_struct.input.filter_width, + pad_struct.input.row_stride, pad_struct.input.col_stride, + pad_struct.input.padding, &new_height, &new_width, &pad_rows, + &pad_cols); + EXPECT_EQ(status.code(), code) << status; + } + + static void VerifyGet2dOutputSizeValues(padding_struct pad_struct, + error::Code code) { + int new_height, new_width, pad_rows, pad_cols; + Status status = Get2dOutputSize( + pad_struct.input.in_height, pad_struct.input.in_width, + pad_struct.input.filter_height, pad_struct.input.filter_width, + pad_struct.input.row_stride, pad_struct.input.col_stride, + pad_struct.input.padding, &new_height, &new_width, &pad_rows, + &pad_cols); + EXPECT_EQ(status.code(), code) << status; + EXPECT_EQ(pad_struct.output.new_height, new_height); + EXPECT_EQ(pad_struct.output.new_width, new_width); + EXPECT_EQ(pad_struct.output.pad_top, pad_rows); + EXPECT_EQ(pad_struct.output.pad_left, pad_cols); + } + + static void VerifyGet2dOutputVerboseSizeValues(padding_struct pad_struct, + error::Code code) { + int new_height, new_width, pad_top, pad_bottom, pad_left, pad_right; + Status status = Get2dOutputSizeVerbose( + pad_struct.input.in_height, pad_struct.input.in_width, + pad_struct.input.filter_height, pad_struct.input.filter_width, + pad_struct.input.row_stride, pad_struct.input.col_stride, + pad_struct.input.padding, &new_height, &new_width, &pad_top, + &pad_bottom, &pad_left, &pad_right); + EXPECT_EQ(status.code(), code) << status; + EXPECT_EQ(pad_struct.output.new_height, new_height); + EXPECT_EQ(pad_struct.output.new_width, new_width); + EXPECT_EQ(pad_struct.output.pad_top, pad_top); + EXPECT_EQ(pad_struct.output.pad_bottom, pad_bottom); + EXPECT_EQ(pad_struct.output.pad_left, pad_left); + EXPECT_EQ(pad_struct.output.pad_right, pad_right); + } + + static void VerifyBoundaries(bcast_struct bcast, error::Code code) { + int new_index, new_size; + Status status = GetBroadcastSize( + bcast.input.index, bcast.input.in_size, bcast.input.ksize, + bcast.input.stride, bcast.input.pad_size, &new_index, &new_size); + EXPECT_EQ(status.code(), code) << status; + } + + static void VerifyBcastValues(bcast_struct bcast) { + int new_index, new_size; + EXPECT_EQ(Status::OK(), + GetBroadcastSize(bcast.input.index, bcast.input.in_size, + bcast.input.ksize, bcast.input.stride, + bcast.input.pad_size, &new_index, &new_size)); + EXPECT_EQ(bcast.output.new_index, new_index); + EXPECT_EQ(bcast.output.new_size, new_size); + } +}; + +// Test stride > ksize fails with INVALID_ARGUMENT. 
+TEST_F(OpsUtilTest, Get2dOutputSizeInvalidTest) { + padding_struct pad_struct = {{3, 3, 1, 2, 2, 2, SAME}, {3, 3, 1, 1, 1, 1}}; + VerifyGet2dOutputSizeBoundaries(pad_struct, error::INVALID_ARGUMENT); +} + +TEST_F(OpsUtilTest, Get2dOutputSizeNegativeSizeTest) { + padding_struct pad_struct = {{1, 1, 3, 3, 1, 1, VALID}, {-1, -1, 0, 0, 0, 0}}; + VerifyGet2dOutputSizeBoundaries(pad_struct, error::INVALID_ARGUMENT); +} + +TEST_F(OpsUtilTest, Get2dOutputSizeSquareFilterTest) { + padding_struct pad_struct1 = {{3, 3, 2, 2, 2, 2, SAME}, {2, 2, 0, 0, 0, 0}}; + padding_struct pad_struct2 = {{3, 3, 2, 2, 2, 2, VALID}, {1, 1, 0, 0, 0, 0}}; + VerifyGet2dOutputSizeValues(pad_struct1, error::OK); + VerifyGet2dOutputSizeValues(pad_struct2, error::OK); +} + +TEST_F(OpsUtilTest, Get2dOutputSizeNonSquareFilterTest) { + padding_struct pad_struct1 = {{4, 5, 1, 2, 1, 1, SAME}, {4, 5, 0, 0, 0, 0}}; + padding_struct pad_struct2 = {{4, 5, 1, 2, 1, 1, VALID}, {4, 4, 0, 0, 0, 0}}; + VerifyGet2dOutputSizeValues(pad_struct1, error::OK); + VerifyGet2dOutputSizeValues(pad_struct2, error::OK); +} + +TEST_F(OpsUtilTest, Get2dOutputSizeUnevenStrideTest) { + padding_struct pad_struct1 = {{4, 4, 2, 2, 1, 2, VALID}, {3, 2, 0, 0, 0, 0}}; + padding_struct pad_struct2 = {{4, 4, 2, 2, 2, 1, VALID}, {2, 3, 0, 0, 0, 0}}; + VerifyGet2dOutputSizeValues(pad_struct1, error::OK); + VerifyGet2dOutputSizeValues(pad_struct2, error::OK); +} + +TEST_F(OpsUtilTest, Get2dOutputSizeVerbose) { + padding_struct pad_struct1 = {{3, 3, 2, 2, 2, 2, SAME}, {2, 2, 0, 1, 0, 1}}; + padding_struct pad_struct2 = {{3, 3, 2, 2, 2, 2, VALID}, {1, 1, 0, 0, 0, 0}}; + VerifyGet2dOutputVerboseSizeValues(pad_struct1, error::OK); + VerifyGet2dOutputVerboseSizeValues(pad_struct2, error::OK); +} + +// Test stride > ksize fails with INVALID_ARGUMENT. +TEST_F(OpsUtilTest, GetBroadcastTest3_1_2_0) { + bcast_struct bcast = {{0, 3, 1, 2, 0}, {0, 3}}; + VerifyBoundaries(bcast, error::INVALID_ARGUMENT); +} + +// Test index * stride > in_size fails with INVALID_ARGUMENT. 
+TEST_F(OpsUtilTest, GetBroadcastTestBadIndex) { + bcast_struct bcast = {{2, 3, 1, 2, 0}, {0, 3}}; + VerifyBoundaries(bcast, error::INVALID_ARGUMENT); +} + +// in_size = 3, ksize = 3, stride = 1, pad_size = 0 +TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_0) { + bcast_struct bcast[] = { + {{0, 3, 3, 1, 0}, {0, 3}}, + {{1, 3, 3, 1, 0}, {1, 2}}, + {{2, 3, 3, 1, 0}, {2, 1}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 3, stride = 1, pad_size = 1 +TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_1) { + bcast_struct bcast[] = { + {{0, 3, 3, 1, 1}, {0, 2}}, + {{1, 3, 3, 1, 1}, {0, 3}}, + {{2, 3, 3, 1, 1}, {1, 2}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 3, stride = 1, pad_size = 2 +TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) { + bcast_struct bcast[] = { + {{0, 3, 3, 1, 2}, {0, 1}}, + {{1, 3, 3, 1, 2}, {0, 2}}, + {{2, 3, 3, 1, 2}, {0, 3}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 3, stride = 2, pad_size = 0 +TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) { + bcast_struct bcast[] = { + {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 3, stride = 2, pad_size = 1 +TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) { + bcast_struct bcast[] = { + {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 3, stride = 2, pad_size = 2 +TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_2) { + bcast_struct bcast[] = { + {{0, 3, 3, 2, 2}, {0, 1}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 3, stride = 3, pad_size = 0 +TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) { + bcast_struct bcast[] = { + {{0, 3, 3, 3, 0}, {0, 3}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 3, stride = 3, pad_size = 1 +TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) { + bcast_struct bcast[] = { + {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +// in_size = 3, ksize = 3, stride = 3, pad_size = 2 +TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_2) { + bcast_struct bcast[] = { + {{0, 3, 3, 3, 2}, {0, 1}}, + }; + for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) { + VerifyBcastValues(bcast[i]); + } +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc new file mode 100644 index 0000000000..cb125ea2fe --- /dev/null +++ b/tensorflow/core/kernels/pack_op.cc @@ -0,0 +1,114 @@ +// See docs in ../ops/array_ops.cc. 
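// In brief: Pack stacks N input tensors, all of the same shape, into a single
// output whose shape is the input shape with an extra leading dimension of
// size N. As the kernel below notes, N == 1 reduces to a reshape of the
// input, and N > 1 is handled by reusing the Concat kernels on the flattened
// inputs.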
+ +#include <vector> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/concat_op.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// -------------------------------------------------------------------------- +template <typename Device, typename T> +class PackOp : public OpKernel { + public: + typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>> + ConstMatrixVector; + + explicit PackOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + OpInputList values; + OP_REQUIRES_OK(c, c->input_list("values", &values)); + const int num = values.size(); + + // Verify that all input shapes match + for (int i = 1; i < num; i++) { + OP_REQUIRES(c, values[0].shape().IsSameSize(values[i].shape()), + errors::InvalidArgument( + "Shapes of all inputs must match: values[0].shape = ", + values[0].shape().ShortDebugString(), " != values[", i, + "].shape = ", values[i].shape().ShortDebugString())); + } + + TensorShape output_shape(values[0].shape()); + output_shape.InsertDim(0, num); + + // In the num = 1 case, just reshape the input + if (num == 1) { + Tensor output; + CHECK(output.CopyFrom(values[0], output_shape)); + c->set_output(0, output); + return; + } + + // Allocate output + Tensor* output; + OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output)); + + const int output_size = output->NumElements(); + if (output_size > 0) { + auto output_flat = output->shaped<T, 2>({1, output_size}); + + // Except for shapes, pack is a special case of concat, so we reuse the + // same computational kernels. + ConstMatrixVector inputs_flat; + inputs_flat.reserve(num); + for (int i = 0; i < num; ++i) { + inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix( + values[i].shaped<T, 2>({1, values[i].NumElements()}))); + } + if (std::is_same<Device, GPUDevice>::value) { + ConcatGPU<T>(c->eigen_gpu_device(), inputs_flat, &output_flat); + } else { + ConcatCPU<T>(c->device(), inputs_flat, &output_flat); + } + } + } +}; + +#define REGISTER_PACK(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Pack").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + PackOp<CPUDevice, type>) + +TF_CALL_ALL_TYPES(REGISTER_PACK); +REGISTER_PACK(quint8); +REGISTER_PACK(qint8); +REGISTER_PACK(qint32); +REGISTER_PACK(bfloat16); + +#undef REGISTER_PACK + +#if GOOGLE_CUDA + +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Pack").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + PackOp<GPUDevice, type>) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +#undef REGISTER_GPU + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. 
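// Note that the registration below instantiates PackOp<CPUDevice, int32> even
// though it is registered for DEVICE_GPU: with both "values" and "output"
// pinned to host memory, the actual packing runs on the CPU.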
+REGISTER_KERNEL_BUILDER(Name("Pack") + .Device(DEVICE_GPU) + .HostMemory("values") + .HostMemory("output") + .TypeConstraint<int32>("T"), + PackOp<CPUDevice, int32>); + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc new file mode 100644 index 0000000000..6c66e54e3d --- /dev/null +++ b/tensorflow/core/kernels/pad_op.cc @@ -0,0 +1,159 @@ +// See docs in ../ops/nn_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/pad_op.h" + +#include <memory> +#include <string> +#include <utility> + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class PadOp : public OpKernel { + public: + explicit PadOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& in0 = context->input(0); + const Tensor& in1 = context->input(1); + const int dims = in0.dims(); + static const int kMinDims = 0; + static const int kMaxDims = 5; + OP_REQUIRES(context, kMinDims <= dims && dims <= kMaxDims, + errors::Unimplemented("inputs rank not in [", kMinDims, ",", + kMaxDims, "]: ", dims)); + OP_REQUIRES( + context, + TensorShapeUtils::IsMatrix(in1.shape()) && in1.dim_size(1) == 2, + errors::InvalidArgument("paddings must be a matrix with 2 columns: ", + in1.shape().DebugString())); + const int fixed_dims = + (kAllowLegacyScalars && dims == 0 && in1.dim_size(0) == 1) ? 1 : dims; + OP_REQUIRES( + context, fixed_dims == in1.dim_size(0), + errors::InvalidArgument( + "The first dimension of paddings must be the rank of inputs", + in1.shape().DebugString(), " ", in0.shape().DebugString())); + + // Compute the shape of the output tensor, and allocate it. + TensorShape output_shape; + TTypes<int32>::ConstMatrix paddings = in1.matrix<int32>(); + for (int d = 0; d < fixed_dims; ++d) { + const int32 before_d = paddings(d, 0); // Pad before existing elements. + const int32 after_d = paddings(d, 1); // Pad after exisitng elements. + OP_REQUIRES(context, before_d >= 0 && after_d >= 0, + errors::InvalidArgument("Paddings must be non-negative: ", + before_d, " ", after_d)); + const int size_d = + (kAllowLegacyScalars && d == in0.dims()) ? 1 : in0.dim_size(d); + output_shape.AddDim(before_d + size_d + after_d); + } + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + + // Invoke the dims-specific implementation. + switch (fixed_dims) { + case 0: + Operate<0>(context, in0.tensor<T, 0>(), paddings, output); + break; + case 1: + // TODO(irving): Once Pad doesn't need a scalar special case, + // change flat to tensor. That is, once !kAllowLegacyScalars. 
+ Operate<1>(context, in0.flat<T>(), paddings, output); + break; + case 2: + Operate<2>(context, in0.tensor<T, 2>(), paddings, output); + break; + case 3: + Operate<3>(context, in0.tensor<T, 3>(), paddings, output); + break; + case 4: + Operate<4>(context, in0.tensor<T, 4>(), paddings, output); + break; + case 5: + Operate<5>(context, in0.tensor<T, 5>(), paddings, output); + break; + default: + OP_REQUIRES(context, false, + errors::InvalidArgument("Only ranks up to 5 supported: ", + in0.shape().DebugString())); + } + } + + private: + template <int Dims> + void Operate(OpKernelContext* context, + typename TTypes<T, Dims>::ConstTensor input, + TTypes<int32>::ConstMatrix paddings, Tensor* output) { + CHECK_EQ(Dims, paddings.dimension(0)); + CHECK_EQ(2, paddings.dimension(1)); + Eigen::array<std::pair<int32, int32>, Dims> paddings_array; + for (int i = 0; i < Dims; ++i) { + paddings_array[i] = std::make_pair(paddings(i, 0), paddings(i, 1)); + } + functor::Pad<Device, T, Dims> functor; + functor(context->eigen_device<Device>(), output->tensor<T, Dims>(), input, + paddings_array); + } +}; + +#define REGISTER_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Pad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("paddings"), \ + PadOp<CPUDevice, type>) + +TF_CALL_ALL_TYPES(REGISTER_KERNEL); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T, Dims) \ + template <> \ + void Pad<GPUDevice, T, Dims>::operator()( \ + const GPUDevice& d, typename TTypes<T, Dims>::Tensor output, \ + typename TTypes<T, Dims>::ConstTensor input, \ + Eigen::array<std::pair<int32, int32>, Dims> paddings); \ + extern template struct Pad<GPUDevice, T, Dims>; + +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC(T, 0); \ + DECLARE_GPU_SPEC(T, 1); \ + DECLARE_GPU_SPEC(T, 2); \ + DECLARE_GPU_SPEC(T, 3); \ + DECLARE_GPU_SPEC(T, 4); \ + DECLARE_GPU_SPEC(T, 5); + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("Pad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<T>("T") \ + .HostMemory("paddings"), \ + PadOp<GPUDevice, T>) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL); +#endif // GOOGLE_CUDA + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/pad_op.h b/tensorflow/core/kernels/pad_op.h new file mode 100644 index 0000000000..c4f8a4abda --- /dev/null +++ b/tensorflow/core/kernels/pad_op.h @@ -0,0 +1,27 @@ +#ifndef TENSORFLOW_KERNELS_PAD_OP_H_ +#define TENSORFLOW_KERNELS_PAD_OP_H_ +// Functor definition for PadOp, must be compilable by nvcc. + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by PadOp to do the computations. +template <typename Device, typename T, int Dims> +struct Pad { + // Pad "input" into "output", as specified by "paddings". See pad_op.cc for + // details. 
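// For example, a 2-D input of shape (rows, cols) with
// paddings = {{1, 2}, {0, 3}} yields an output of shape
// (1 + rows + 2, cols + 3); the padded entries are zero-filled.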
+ void operator()(const Device& d, typename TTypes<T, Dims>::Tensor output, + typename TTypes<T, Dims>::ConstTensor input, + Eigen::array<std::pair<int32, int32>, Dims> paddings) { + output.device(d) = input.pad(paddings); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_PAD_OP_H_ diff --git a/tensorflow/core/kernels/pad_op_gpu.cu.cc b/tensorflow/core/kernels/pad_op_gpu.cu.cc new file mode 100644 index 0000000000..35a03a2cb2 --- /dev/null +++ b/tensorflow/core/kernels/pad_op_gpu.cu.cc @@ -0,0 +1,26 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/pad_op.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// Definition of the GPU implementations declared in pad_op.cc. +#define DEFINE_GPU_SPECS(T) \ + template struct functor::Pad<GPUDevice, T, 0>; \ + template struct functor::Pad<GPUDevice, T, 1>; \ + template struct functor::Pad<GPUDevice, T, 2>; \ + template struct functor::Pad<GPUDevice, T, 3>; \ + template struct functor::Pad<GPUDevice, T, 4>; \ + template struct functor::Pad<GPUDevice, T, 5>; + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc new file mode 100644 index 0000000000..35e9bd75fa --- /dev/null +++ b/tensorflow/core/kernels/pooling_ops_common.cc @@ -0,0 +1,252 @@ +#include "tensorflow/core/kernels/pooling_ops_common.h" + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/public/tensor.h" + +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/maxpooling_op_gpu.h" +#include "tensorflow/core/kernels/pooling_ops_common_gpu.h" +#include "tensorflow/stream_executor/dnn.h" +#include "tensorflow/stream_executor/stream.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +PoolParameters::PoolParameters(OpKernelContext* context, + const std::vector<int32>& ksize, + const std::vector<int32>& stride, + Padding padding, + const TensorShape& tensor_in_shape) { + // For maxpooling, tensor_in should have 4 dimensions. + OP_REQUIRES(context, tensor_in_shape.dims() == 4, + errors::InvalidArgument("tensor_in must be 4-dimensional")); + + depth = tensor_in_shape.dim_size(3); + tensor_in_cols = tensor_in_shape.dim_size(2); + tensor_in_rows = tensor_in_shape.dim_size(1); + tensor_in_batch = tensor_in_shape.dim_size(0); + window_rows = ksize[1]; + window_cols = ksize[2]; + depth_window = ksize[3]; + row_stride = stride[1]; + col_stride = stride[2]; + depth_stride = stride[3]; + + // We only support 2D pooling across width/height and depthwise + // pooling, not a combination. + OP_REQUIRES(context, + (depth_window == 1 || (window_rows == 1 && window_cols == 1)), + errors::Unimplemented( + "MaxPooling supports exactly one of pooling across depth " + "or pooling across width/height.")); + + if (depth_window == 1) { + OP_REQUIRES_OK(context, Get2dOutputSize( + tensor_in_rows, tensor_in_cols, window_rows, + window_cols, row_stride, col_stride, padding, + &out_height, &out_width, &pad_rows, &pad_cols)); + } else { + // Our current version of depthwise max pooling does not support + // any padding, and expects the depth_window to equal the + // depth_stride (no overlapping). 
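// For example, ksize = {1, 1, 1, 2} with stride = {1, 1, 1, 2} pools pairs of
// adjacent channels: an input of depth 8 produces out_depth = 8 / 2 = 4,
// subject to the checks below.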
+ OP_REQUIRES( + context, depth % depth_window == 0, + errors::Unimplemented("Depthwise max pooling requires the depth " + "window to evenly divide the input depth")); + OP_REQUIRES( + context, depth_stride == depth_window, + errors::Unimplemented("Depthwise max pooling requires the depth " + "window to equal the depth stride")); + + // The current version of depthwise max is only implemented on CPU. + OP_REQUIRES(context, + (DeviceType(static_cast<Device*>(context->device()) + ->attributes() + .device_type()) == DeviceType(DEVICE_CPU)), + errors::Unimplemented("Depthwise max pooling is currently " + "only implemented for CPU devices.")); + + pad_depth = 0; + out_depth = depth / depth_window; + } +} + +TensorShape PoolParameters::forward_output_shape() { + if (depth_window == 1) { + // Spatial pooling + return TensorShape({tensor_in_batch, out_height, out_width, depth}); + } else { + // Depthwise pooling + return TensorShape( + {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth}); + } +} + +#ifdef GOOGLE_CUDA + +namespace { +template <typename T> +perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, + uint64 size) { + perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), + size * sizeof(T)); + perftools::gputools::DeviceMemory<T> typed(wrapped); + return typed; +} +} // namespace + +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void TransformDepth<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \ + const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, \ + typename TTypes<T, 4>::Tensor out); \ + extern template struct TransformDepth<GPUDevice, T>; + +DECLARE_GPU_SPEC(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +template <typename T> +void DnnPoolingGradOp<T>::Compute( + OpKernelContext* context, + perftools::gputools::dnn::PoolingMode pooling_mode, + const std::vector<int32>& size, const std::vector<int32>& stride, + Padding padding, const Tensor* tensor_in, const Tensor* tensor_out, + const Tensor& out_backprop, const TensorShape& tensor_in_shape) { + CHECK((pooling_mode == perftools::gputools::dnn::PoolingMode::kMaximum) || + (tensor_in && tensor_out)) + << "For MaxPoolGrad, both tensor_in and tensor_out needs to be " + "specified"; + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, tensor_in_shape, &output)); + + PoolParameters params{context, size, stride, padding, tensor_in_shape}; + if (!context->status().ok()) { + return; + } + + /// For now, cudnn does not support NHWC format, so we need to convert it + /// to NCHW before calling cudnn. 
We need to get rid of this once it is done + Tensor transformed_input; + OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({tensor_in_shape.dim_size(0), + tensor_in_shape.dim_size(3), + tensor_in_shape.dim_size(1), + tensor_in_shape.dim_size(2)}), + &transformed_input)); + Tensor transformed_input_backprop; + OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({tensor_in_shape.dim_size(0), + tensor_in_shape.dim_size(3), + tensor_in_shape.dim_size(1), + tensor_in_shape.dim_size(2)}), + &transformed_input_backprop)); + Tensor transformed_output; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3), + out_backprop.dim_size(1), out_backprop.dim_size(2)}), + &transformed_output)); + Tensor transformed_output_backprop; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3), + out_backprop.dim_size(1), out_backprop.dim_size(2)}), + &transformed_output_backprop)); + + auto nhwc_to_nchw = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2); + if (tensor_in) { + // For AvgPoolGrad, the original input tensor is not necessary. However, + // cudnn still requires them to run, although they do not affect the + // results. + functor::TransformDepth<GPUDevice, T>()( + context->eigen_device<Device>(), tensor_in->tensor<T, 4>(), + nhwc_to_nchw, transformed_input.tensor<T, 4>()); + } + if (tensor_out) { + // For AvgPoolGrad, the original output tensor is not necessary. However, + // cudnn still requires them to run, although they do not affect the + // results. + functor::TransformDepth<GPUDevice, T>()( + context->eigen_device<Device>(), tensor_out->tensor<T, 4>(), + nhwc_to_nchw, transformed_output.tensor<T, 4>()); + } + functor::TransformDepth<GPUDevice, T>()( + context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), + nhwc_to_nchw, transformed_output_backprop.tensor<T, 4>()); + + /// Get ready to call cudnn + perftools::gputools::dnn::PoolingDescriptor pooling_desc; + pooling_desc.set_pooling_mode(pooling_mode) + .set_window_height(params.window_rows) + .set_window_width(params.window_cols) + .set_vertical_stride(params.row_stride) + .set_horizontal_stride(params.col_stride) + .set_vertical_padding(params.pad_rows) + .set_horizontal_padding(params.pad_cols); + + perftools::gputools::dnn::BatchDescriptor orig_output_desc; + orig_output_desc.set_count(params.tensor_in_batch) + .set_height(params.out_height) + .set_width(params.out_width) + .set_feature_map_count(params.depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + + perftools::gputools::dnn::BatchDescriptor orig_input_desc; + orig_input_desc.set_count(params.tensor_in_batch) + .set_height(params.tensor_in_rows) + .set_width(params.tensor_in_cols) + .set_feature_map_count(params.depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + + auto orig_output_data = + AsDeviceMemory(transformed_output.template flat<T>().data(), + transformed_output.template flat<T>().size()); + auto orig_input_data = + AsDeviceMemory(transformed_input.template flat<T>().data(), + transformed_input.template flat<T>().size()); + auto output_backprop = + AsDeviceMemory(transformed_output_backprop.template flat<T>().data(), + transformed_output_backprop.template flat<T>().size()); + auto input_backprop = + AsDeviceMemory(transformed_input_backprop.template 
flat<T>().data(), + transformed_input_backprop.template flat<T>().size()); + + auto* stream = context->op_device_context<GPUDeviceContext>()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + bool status = + stream->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data, + orig_output_desc, orig_output_data, + output_backprop, &input_backprop) + .ok(); + OP_REQUIRES(context, status, + errors::Internal("cudnn PoolBackward launch failed")); + + /// Transform the output data from NCHW back to NHWC + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + auto nchw_to_nhwc = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1); + functor::TransformDepth<GPUDevice, T>()( + context->eigen_device<Device>(), + toConstTensor(transformed_input_backprop).template tensor<T, 4>(), + nchw_to_nhwc, output->tensor<T, 4>()); +} + +template class DnnPoolingGradOp<float>; + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h new file mode 100644 index 0000000000..5bf44b6e40 --- /dev/null +++ b/tensorflow/core/kernels/pooling_ops_common.h @@ -0,0 +1,264 @@ +#ifndef TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_ +#define TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_ + +#include <vector> + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/avgpooling_op.h" +#include "tensorflow/core/kernels/maxpooling_op.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// A helper class to manage sizes and shapes for pooling operations. +struct PoolParameters { + // Updates context->status if there is an invalid input. + PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize, + const std::vector<int32>& stride, Padding padding, + const TensorShape& tensor_in_shape); + + // Returns the shape of the output for "forward" pooling operations. + TensorShape forward_output_shape(); + + int depth; + + int tensor_in_cols; + int tensor_in_rows; + int tensor_in_batch; + + int window_rows; + int window_cols; + int depth_window; + + int row_stride; + int col_stride; + int depth_stride; + + int out_height; + int out_width; + int out_depth; + + int pad_rows; + int pad_cols; + int pad_depth; +}; + +// An implementation of MaxPooling (forward). 
+template <typename Device, typename T> +class MaxPoolingOp : public UnaryOp<T> { + public: + explicit MaxPoolingOp(OpKernelConstruction* context) : UnaryOp<T>(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 0, params.forward_output_shape(), &output)); + + if (params.depth_window > 1) { + DepthwiseMaxPool(context, output, tensor_in, params); + } else { + SpatialMaxPool(context, output, tensor_in, params, padding_); + } + } + + private: + // Single-threaded implementation of DepthwiseMaxPool which + // does not handle all of the same options as SpatialMaxPool + // (strict assumptions on no padding, stride). + // + // TODO(vrv): implement a more general depthwise-max pool that works + // on GPU as well. + void DepthwiseMaxPool(OpKernelContext* context, Tensor* output, + const Tensor& tensor_in, const PoolParameters& params) { + Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + in_by_pool(tensor_in.flat<T>().data(), params.depth_window, + tensor_in.NumElements() / params.depth_window); + Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool( + output->flat<T>().data(), 1, output->NumElements()); + out_by_pool = in_by_pool.colwise().maxCoeff(); + } + + void SpatialMaxPool(OpKernelContext* context, Tensor* output, + const Tensor& tensor_in, const PoolParameters& params, + const Padding& padding) { + // On GPU, use Eigen's Spatial Max Pooling. On CPU, use an + // EigenMatrix version that is currently faster than Eigen's + // Spatial MaxPooling implementation. + // + // TODO(vrv): Remove this once we no longer need it. + if (std::is_same<Device, GPUDevice>::value) { + Eigen::PaddingType pt = BrainPadding2EigenPadding(padding); + functor::SpatialMaxPooling<Device, T>()( + context->eigen_device<Device>(), output->tensor<T, 4>(), + tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols, + params.row_stride, params.col_stride, pt); + } else { + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + ConstEigenMatrixMap; + typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + EigenMatrixMap; + + ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth, + params.tensor_in_cols * params.tensor_in_rows * + params.tensor_in_batch); + EigenMatrixMap out_mat( + output->flat<T>().data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + + // Initializes the output tensor with MIN<T>. + output->flat<T>().setConstant(Eigen::NumTraits<T>::lowest()); + + // The following code basically does the following: + // 1. Flattens the input and output tensors into two dimensional arrays. 
+ // tensor_in_as_matrix: + // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) + // output_as_matrix: + // depth by (out_width * out_height * tensor_in_batch) + // + // 2. Walks through the set of columns in the flattened + // tensor_in_as_matrix, + // and updates the corresponding column(s) in output_as_matrix with the + // max value. + for (int b = 0; b < params.tensor_in_batch; ++b) { + for (int h = 0; h < params.tensor_in_rows; ++h) { + for (int w = 0; w < params.tensor_in_cols; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + const int hpad = h + params.pad_rows; + const int wpad = w + params.pad_cols; + const int h_start = + (hpad < params.window_rows) + ? 0 + : (hpad - params.window_rows) / params.row_stride + 1; + const int h_end = + std::min(hpad / params.row_stride + 1, params.out_height); + const int w_start = + (wpad < params.window_cols) + ? 0 + : (wpad - params.window_cols) / params.col_stride + 1; + const int w_end = + std::min(wpad / params.col_stride + 1, params.out_width); + // compute elementwise max + const int in_offset = + (b * params.tensor_in_rows + h) * params.tensor_in_cols + w; + for (int ph = h_start; ph < h_end; ++ph) { + for (int pw = w_start; pw < w_end; ++pw) { + const int out_offset = + (b * params.out_height + ph) * params.out_width + pw; + out_mat.col(out_offset) = + out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset)); + } + } + } + } + } + } + } + + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +template <typename Device, typename T> +void SpatialAvgPool(OpKernelContext* context, Tensor* output, + const Tensor& input, const PoolParameters& params, + const Padding& padding) { + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + ConstEigenMatrixMap; + typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + EigenMatrixMap; + + auto in_flat = input.flat<T>(); + auto out_flat = output->flat<T>(); + + ConstEigenMatrixMap in_mat( + in_flat.data(), params.depth, + params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch); + EigenMatrixMap out_mat( + out_flat.data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols()); + out_count.setZero(); + + // Initializes output to zero. + out_flat.setZero(); + + // The following code basically does the following: + // 1. Flattens the input and output tensors into two dimensional arrays. + // tensor_in_as_matrix: + // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) + // output_as_matrix: + // depth by (out_width * out_height * tensor_in_batch) + // + // 2. Walks through the set of columns in the flattened + // tensor_in_as_matrix, + // and updates the corresponding column(s) in output_as_matrix with the + // average value. + for (int b = 0; b < params.tensor_in_batch; ++b) { + for (int h = 0; h < params.tensor_in_rows; ++h) { + for (int w = 0; w < params.tensor_in_cols; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + const int hpad = h + params.pad_rows; + const int wpad = w + params.pad_cols; + const int h_start = + (hpad < params.window_rows) + ? 0 + : (hpad - params.window_rows) / params.row_stride + 1; + const int h_end = + std::min(hpad / params.row_stride + 1, params.out_height); + const int w_start = + (wpad < params.window_cols) + ? 
0 + : (wpad - params.window_cols) / params.col_stride + 1; + const int w_end = + std::min(wpad / params.col_stride + 1, params.out_width); + const int in_offset = + (b * params.tensor_in_rows + h) * params.tensor_in_cols + w; + Eigen::DSizes<ptrdiff_t, 2> in_indices(0, in_offset); + for (int ph = h_start; ph < h_end; ++ph) { + for (int pw = w_start; pw < w_end; ++pw) { + const int out_offset = + (b * params.out_height + ph) * params.out_width + pw; + out_mat.col(out_offset) += in_mat.col(in_offset); + out_count(out_offset)++; + } + } + } + } + } + DCHECK_GT(out_count.minCoeff(), 0); + out_mat.array().rowwise() /= out_count.transpose().array(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_POOLING_OPS_COMMON_H_ diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h new file mode 100644 index 0000000000..87a3ef5186 --- /dev/null +++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h @@ -0,0 +1,39 @@ +#if !GOOGLE_CUDA +#error This file must only be included when building with Cuda support +#endif + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_ + +#include "tensorflow/stream_executor/dnn.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/avgpooling_op.h" +#include "tensorflow/core/kernels/maxpooling_op.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +// A helper class that launch the cudnn pooling backward operations. +// The original input and output tensors are optional for AvgPoolGrad, but +// mandatory for MaxPoolGrad. 
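// (Max pooling needs the original input and output to recover which element
// of each window produced the maximum, so the gradient can be routed back to
// it; average pooling can distribute the gradient uniformly without them.)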
+template <typename T> +class DnnPoolingGradOp { + public: + typedef GPUDevice Device; + static void Compute(OpKernelContext* context, + perftools::gputools::dnn::PoolingMode pooling_mode, + const std::vector<int32>& size, + const std::vector<int32>& stride, Padding padding, + const Tensor* tensor_in, const Tensor* tensor_out, + const Tensor& out_backprop, + const TensorShape& tensor_in_shape); +}; + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_GPU_H_ diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc new file mode 100644 index 0000000000..1b13f68a3a --- /dev/null +++ b/tensorflow/core/kernels/queue_base.cc @@ -0,0 +1,153 @@ +#include "tensorflow/core/kernels/queue_base.h" + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +namespace { + +template <DataType DT> +void HandleSliceToElement(const Tensor& parent, Tensor* element, int index) { + typedef typename EnumToDataType<DT>::Type T; + auto parent_as_matrix = parent.flat_outer_dims<T>(); + element->flat<T>() = parent_as_matrix.chip(index, 0); +} + +template <DataType DT> +void HandleElementToSlice(const Tensor& element, Tensor* parent, int index) { + typedef typename EnumToDataType<DT>::Type T; + auto parent_as_matrix = parent->flat_outer_dims<T>(); + parent_as_matrix.chip(index, 0) = element.flat<T>(); +} + +} // namespace + +// static +Status QueueBase::CopySliceToElement(const Tensor& parent, Tensor* element, + int index) { +#define HANDLE_TYPE(DT) \ + if (parent.dtype() == DT) { \ + HandleSliceToElement<DT>(parent, element, index); \ + return Status::OK(); \ + } + HANDLE_TYPE(DT_FLOAT); + HANDLE_TYPE(DT_DOUBLE); + HANDLE_TYPE(DT_INT32); + HANDLE_TYPE(DT_UINT8); + HANDLE_TYPE(DT_INT16); + HANDLE_TYPE(DT_INT8); + HANDLE_TYPE(DT_STRING); + HANDLE_TYPE(DT_INT64); +#undef HANDLE_TYPE + return errors::Unimplemented("Unhandled data type: ", parent.dtype()); +} + +// static +Status QueueBase::CopyElementToSlice(const Tensor& element, Tensor* parent, + int index) { +#define HANDLE_TYPE(DT) \ + if (element.dtype() == DT) { \ + HandleElementToSlice<DT>(element, parent, index); \ + return Status::OK(); \ + } + HANDLE_TYPE(DT_FLOAT); + HANDLE_TYPE(DT_DOUBLE); + HANDLE_TYPE(DT_INT32); + HANDLE_TYPE(DT_UINT8); + HANDLE_TYPE(DT_INT16); + HANDLE_TYPE(DT_INT8); + HANDLE_TYPE(DT_STRING); + HANDLE_TYPE(DT_INT64); +#undef HANDLE_TYPE + return errors::Unimplemented("Unhandled data type: ", element.dtype()); +} + +QueueBase::QueueBase(const DataTypeVector& component_dtypes, + const std::vector<TensorShape>& component_shapes, + const string& name) + : component_dtypes_(component_dtypes), + component_shapes_(component_shapes), + name_(name) {} + +Status QueueBase::ValidateTupleCommon(const Tuple& tuple) const { + if (tuple.size() != static_cast<size_t>(num_components())) { + return errors::InvalidArgument( + "Wrong number of components in tuple. Expected ", num_components(), + ", got ", tuple.size()); + } + for (size_t i = 0; i < tuple.size(); ++i) { + if (tuple[i].dtype() != component_dtypes_[i]) { + return errors::InvalidArgument( + "Type mismatch in tuple component ", i, ". 
Expected ", + DataTypeString(component_dtypes_[i]), ", got ", + DataTypeString(tuple[i].dtype())); + } + } + return Status::OK(); +} + +// static +string QueueBase::ShapeListString(const gtl::ArraySlice<TensorShape>& shapes) { + string result = "["; + bool first = true; + for (const TensorShape& shape : shapes) { + strings::StrAppend(&result, (first ? "" : ", "), shape.ShortDebugString()); + first = false; + } + strings::StrAppend(&result, "]"); + return result; +} + +Status QueueBase::MatchesNodeDefOp(const NodeDef& node_def, + const string& op) const { + if (node_def.op() != op) { + return errors::InvalidArgument("Shared queue '", name_, "' has type '", op, + "' that does not match type of Node '", + node_def.name(), "': ", node_def.op()); + } + return Status::OK(); +} + +Status QueueBase::MatchesNodeDefCapacity(const NodeDef& node_def, + int32 capacity) const { + int32 requested_capacity = -1; + TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "capacity", &requested_capacity)); + if (requested_capacity < 0) requested_capacity = kUnbounded; + if (requested_capacity != capacity) { + return errors::InvalidArgument("Shared queue '", name_, "' has capacity ", + capacity, " but requested capacity was ", + requested_capacity); + } + return Status::OK(); +} + +Status QueueBase::MatchesNodeDefTypes(const NodeDef& node_def) const { + DataTypeVector requested_dtypes; + TF_RETURN_IF_ERROR( + GetNodeAttr(node_def, "component_types", &requested_dtypes)); + if (requested_dtypes != component_dtypes_) { + return errors::InvalidArgument("Shared queue '", name_, + "' has component types ", + DataTypeSliceString(component_dtypes_), + " but requested component types were ", + DataTypeSliceString(requested_dtypes)); + } + return Status::OK(); +} + +Status QueueBase::MatchesNodeDefShapes(const NodeDef& node_def) const { + std::vector<TensorShape> requested_shapes; + TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "shapes", &requested_shapes)); + if (requested_shapes != component_shapes_) { + return errors::InvalidArgument("Shared queue '", name_, + "' has component shapes ", + ShapeListString(component_shapes_), + " but requested component shapes were ", + ShapeListString(requested_shapes)); + } + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h new file mode 100644 index 0000000000..4897102974 --- /dev/null +++ b/tensorflow/core/kernels/queue_base.h @@ -0,0 +1,77 @@ +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/queue_interface.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +// Functionality common to QueueInterface implementations. +class QueueBase : public QueueInterface { + public: + // As a possible value of 'capacity'. + static const int32 kUnbounded = INT_MAX; + + // Args: + // component_dtypes: The types of each component in a queue-element tuple. + // component_shapes: The shapes of each component in a queue-element tuple, + // which must either be empty (if the shapes are not specified) or + // or have the same size as component_dtypes. + // name: A name to use for the queue. 
+ QueueBase(const DataTypeVector& component_dtypes, + const std::vector<TensorShape>& component_shapes, + const string& name); + + // Implementations of QueueInterface methods -------------------------------- + const DataTypeVector& component_dtypes() const override { + return component_dtypes_; + } + + // Other public methods ----------------------------------------------------- + const std::vector<TensorShape>& component_shapes() const { + return component_shapes_; + } + + protected: + // Returns the number of components in a queue-element tuple. + int32 num_components() const { return component_dtypes_.size(); } + + // True if shapes were specified. If so, inputs will be validated + // against them, etc. + bool specified_shapes() const { return component_shapes_.size() > 0; } + + // Code common to Validate*Tuple(). + Status ValidateTupleCommon(const Tuple& tuple) const; + + // Copies the index^th slice (in the first dimension) of parent into element. + static Status CopySliceToElement(const Tensor& parent, Tensor* element, + int index); + + // Copies element into the index^th slice (in the first dimension) of parent. + static Status CopyElementToSlice(const Tensor& element, Tensor* parent, + int index); + + ~QueueBase() override {} + + // Helpers for implementing MatchesNodeDef(). + static string ShapeListString(const gtl::ArraySlice<TensorShape>& shapes); + Status MatchesNodeDefOp(const NodeDef& node_def, const string& op) const; + Status MatchesNodeDefCapacity(const NodeDef& node_def, int32 capacity) const; + Status MatchesNodeDefTypes(const NodeDef& node_def) const; + Status MatchesNodeDefShapes(const NodeDef& node_def) const; + + const DataTypeVector component_dtypes_; + const std::vector<TensorShape> component_shapes_; + const string name_; + + TF_DISALLOW_COPY_AND_ASSIGN(QueueBase); +}; + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUEUE_BASE_H_ diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc new file mode 100644 index 0000000000..c70dc76777 --- /dev/null +++ b/tensorflow/core/kernels/queue_ops.cc @@ -0,0 +1,288 @@ +// See docs in ../ops/data_flow_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/queue_interface.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +class QueueOpKernel : public AsyncOpKernel { + public: + explicit QueueOpKernel(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + void ComputeAsync(OpKernelContext* ctx, DoneCallback callback) final { + QueueInterface* queue; + OP_REQUIRES_OK_ASYNC(ctx, GetResourceFromContext(ctx, "handle", &queue), + callback); + ComputeAsync(ctx, queue, [callback, queue]() { + queue->Unref(); + callback(); + }); + } + + protected: + virtual void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) = 0; +}; + +class QueueAccessOpKernel : public QueueOpKernel { + public: + explicit QueueAccessOpKernel(OpKernelConstruction* context) + : QueueOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("timeout_ms", &timeout_)); + // TODO(keveman): Enable timeout. 
+ OP_REQUIRES(context, timeout_ == -1, + errors::InvalidArgument("Timeout not supported yet.")); + } + + protected: + int64 timeout_; +}; + +// Defines an EnqueueOp, the execution of which enqueues a tuple of +// tensors in the given Queue. +// +// The op has 1 + k inputs, where k is the number of components in the +// tuples stored in the given Queue: +// - Input 0: queue handle. +// - Input 1: 0th element of the tuple. +// - ... +// - Input (1+k): kth element of the tuple. +class EnqueueOp : public QueueAccessOpKernel { + public: + explicit EnqueueOp(OpKernelConstruction* context) + : QueueAccessOpKernel(context) {} + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override { + DataTypeVector expected_inputs = {DT_STRING_REF}; + for (DataType dt : queue->component_dtypes()) { + expected_inputs.push_back(dt); + } + OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature(expected_inputs, {}), + callback); + + QueueInterface::Tuple tuple; + OpInputList components; + OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("components", &components), + callback); + for (const Tensor& Tcomponent : components) { + tuple.push_back(Tcomponent); + } + + OP_REQUIRES_OK_ASYNC(ctx, queue->ValidateTuple(tuple), callback); + queue->TryEnqueue(tuple, ctx, callback); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(EnqueueOp); +}; + +REGISTER_KERNEL_BUILDER(Name("QueueEnqueue").Device(DEVICE_CPU), EnqueueOp); + +// Defines an EnqueueManyOp, the execution of which slices each +// component of a tuple of tensors along the 0th dimension, and +// enqueues tuples of slices in the given Queue. +// +// The op has 1 + k inputs, where k is the number of components in the +// tuples stored in the given Queue: +// - Input 0: queue handle. +// - Input 1: 0th element of the tuple. +// - ... +// - Input (1+k): kth element of the tuple. +// +// N.B. All tuple components must have the same size in the 0th +// dimension. +class EnqueueManyOp : public QueueAccessOpKernel { + public: + explicit EnqueueManyOp(OpKernelConstruction* context) + : QueueAccessOpKernel(context) {} + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override { + DataTypeVector expected_inputs = {DT_STRING_REF}; + for (DataType dt : queue->component_dtypes()) { + expected_inputs.push_back(dt); + } + OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {})); + + QueueInterface::Tuple tuple; + OpInputList components; + OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("components", &components), + callback); + for (const Tensor& Tcomponent : components) { + tuple.push_back(Tcomponent); + } + + OP_REQUIRES_OK_ASYNC(ctx, queue->ValidateManyTuple(tuple), callback); + queue->TryEnqueueMany(tuple, ctx, callback); + } + + ~EnqueueManyOp() override {} + + private: + TF_DISALLOW_COPY_AND_ASSIGN(EnqueueManyOp); +}; + +REGISTER_KERNEL_BUILDER(Name("QueueEnqueueMany").Device(DEVICE_CPU), + EnqueueManyOp); + +// Defines a DequeueOp, the execution of which dequeues a tuple of +// tensors from the given Queue. +// +// The op has one input, which is the handle of the appropriate +// Queue. The op has k outputs, where k is the number of components in +// the tuples stored in the given Queue, and output i is the ith +// component of the dequeued tuple. 
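// As a concrete illustration of the signatures described above (using a
// hypothetical queue, not one defined in this change): for a queue whose
// component types are {DT_FLOAT, DT_INT32} with shapes {[32, 32, 3], []},
//   - QueueEnqueue takes 3 inputs: the DT_STRING_REF handle, a float tensor
//     of shape [32, 32, 3], and an int32 scalar;
//   - QueueEnqueueMany takes the handle plus an [N, 32, 32, 3] float tensor
//     and an [N] int32 tensor, enqueuing N elements;
//   - QueueDequeue takes only the handle and produces 2 outputs, one per
//     component, while QueueDequeueMany with count n produces an
//     [n, 32, 32, 3] float tensor and an [n] int32 tensor.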
+class DequeueOp : public QueueAccessOpKernel { + public: + explicit DequeueOp(OpKernelConstruction* context) + : QueueAccessOpKernel(context) {} + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override { + OP_REQUIRES_OK_ASYNC( + ctx, ctx->MatchSignature({DT_STRING_REF}, queue->component_dtypes()), + callback); + + queue->TryDequeue(ctx, [ctx, callback](const QueueInterface::Tuple& tuple) { + if (!ctx->status().ok()) { + callback(); + return; + } + OpOutputList output_components; + OP_REQUIRES_OK_ASYNC( + ctx, ctx->output_list("components", &output_components), callback); + for (int i = 0; i < ctx->num_outputs(); ++i) { + output_components.set(i, tuple[i]); + } + callback(); + }); + } + + ~DequeueOp() override {} + + private: + TF_DISALLOW_COPY_AND_ASSIGN(DequeueOp); +}; + +REGISTER_KERNEL_BUILDER(Name("QueueDequeue").Device(DEVICE_CPU), DequeueOp); + +// Defines a DequeueManyOp, the execution of which concatenates the +// requested number of elements from the given Queue along the 0th +// dimension, and emits the result as a single tuple of tensors. +// +// The op has two inputs: +// - Input 0: the handle to a queue. +// - Input 1: the number of elements to dequeue. +// +// The op has k outputs, where k is the number of components in the +// tuples stored in the given Queue, and output i is the ith component +// of the dequeued tuple. +class DequeueManyOp : public QueueAccessOpKernel { + public: + explicit DequeueManyOp(OpKernelConstruction* context) + : QueueAccessOpKernel(context) {} + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override { + const Tensor& Tnum_elements = ctx->input(1); + int32 num_elements = Tnum_elements.flat<int32>()(0); + + OP_REQUIRES_ASYNC( + ctx, num_elements >= 0, + errors::InvalidArgument("DequeueManyOp must request a positive number " + "of elements"), + callback); + + OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32}, + queue->component_dtypes()), + callback); + + queue->TryDequeueMany( + num_elements, ctx, [ctx, callback](const QueueInterface::Tuple& tuple) { + if (!ctx->status().ok()) { + callback(); + return; + } + OpOutputList output_components; + OP_REQUIRES_OK_ASYNC( + ctx, ctx->output_list("components", &output_components), + callback); + for (int i = 0; i < ctx->num_outputs(); ++i) { + output_components.set(i, tuple[i]); + } + callback(); + }); + } + + ~DequeueManyOp() override {} + + private: + TF_DISALLOW_COPY_AND_ASSIGN(DequeueManyOp); +}; + +REGISTER_KERNEL_BUILDER(Name("QueueDequeueMany").Device(DEVICE_CPU), + DequeueManyOp); + +// Defines a QueueCloseOp, which closes the given Queue. Closing a +// Queue signals that no more elements will be enqueued in it. +// +// The op has one input, which is the handle of the appropriate Queue. 
+class QueueCloseOp : public QueueOpKernel { + public: + explicit QueueCloseOp(OpKernelConstruction* context) + : QueueOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("cancel_pending_enqueues", + &cancel_pending_enqueues_)); + } + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override { + queue->Close(ctx, cancel_pending_enqueues_, callback); + } + + private: + bool cancel_pending_enqueues_; + TF_DISALLOW_COPY_AND_ASSIGN(QueueCloseOp); +}; + +REGISTER_KERNEL_BUILDER(Name("QueueClose").Device(DEVICE_CPU), QueueCloseOp); + +// Defines a QueueSizeOp, which computes the number of elements in the +// given Queue, and emits it as an output tensor. +// +// The op has one input, which is the handle of the appropriate Queue; +// and one output, which is a single-element tensor containing the current +// size of that Queue. +class QueueSizeOp : public QueueOpKernel { + public: + explicit QueueSizeOp(OpKernelConstruction* context) + : QueueOpKernel(context) {} + + protected: + void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue, + DoneCallback callback) override { + Tensor* Tqueue_size = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_size)); + Tqueue_size->flat<int32>().setConstant(queue->size()); + callback(); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(QueueSizeOp); +}; + +REGISTER_KERNEL_BUILDER(Name("QueueSize").Device(DEVICE_CPU), QueueSizeOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc new file mode 100644 index 0000000000..4fc12e92cb --- /dev/null +++ b/tensorflow/core/kernels/random_crop_op.cc @@ -0,0 +1,103 @@ +// See docs in ../ops/image_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/guarded_philox_random.h" + +namespace tensorflow { + +template <typename T> +class RandomCropOp : public OpKernel { + public: + explicit RandomCropOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, generator_.Init(context)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + OP_REQUIRES(context, input.dims() == 3, + errors::InvalidArgument("input must be 3-dimensional", + input.shape().ShortDebugString())); + const Tensor& shape_t = context->input(1); + OP_REQUIRES(context, shape_t.dims() == 1, + errors::InvalidArgument("shape_t must be 1-dimensional", + shape_t.shape().ShortDebugString())); + OP_REQUIRES(context, shape_t.NumElements() == 2, + errors::InvalidArgument("shape_t must have two elements", + shape_t.shape().ShortDebugString())); + + auto shape_vec = shape_t.vec<int64>(); + const int32 target_height = shape_vec(0); + const int32 target_width = shape_vec(1); + + const int32 height = input.dim_size(0); + const int32 width = input.dim_size(1); + const int32 channels = input.dim_size(2); + + // Initialize shape to the batch size of the input, then add + // the rest of the dimensions + Tensor* output = nullptr; + const auto output_shape = + TensorShape({target_height, target_width, channels}); + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + + // If the target size matches the actual size, then do nothing. 
+ if ((target_height == height) && (target_width == width)) { + *output = context->input(0); + } + + // TODO(shlens): Implement edge case to guarantee output size dimensions. + // Edge case. The target dimensions are larger then the image, so + // zero-pad the image. This guarantees that the image will *always* + // be [target_height, target_width] in size. + OP_REQUIRES(context, width >= target_width, errors::FailedPrecondition( + "width must be >= target_width: width = ", width, + ", target_width = ", target_width)); + OP_REQUIRES(context, height >= target_height, errors::FailedPrecondition( + "height must be >= target_height: height = ", height, + ", target_height = ", target_height)); + + int32 offset_height = 0; + int32 offset_width = 0; + + auto local_gen = generator_.ReserveSamples32(2); + random::SimplePhilox random(&local_gen); + + if (width > target_width) { + offset_width = random.Rand32() % (width - target_width + 1); + } + if (height > target_height) { + offset_height = random.Rand32() % (height - target_height + 1); + } + + // TODO(shlens): Do this more efficiently with memcpy once padding is + // available for smaller images. + typename TTypes<T, 3>::ConstTensor input_data = input.tensor<T, 3>(); + typename TTypes<T, 3>::Tensor output_data = output->tensor<T, 3>(); + + for (int y = 0; y < target_height; ++y) { + for (int x = 0; x < target_width; ++x) { + for (int c = 0; c < channels; ++c) { + output_data(y, x, c) = + input_data(y + offset_height, x + offset_width, c); + } + } + } + } + + private: + GuardedPhiloxRandom generator_; +}; + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomCrop").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + RandomCropOp<type>) + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/random_crop_op_test.cc b/tensorflow/core/kernels/random_crop_op_test.cc new file mode 100644 index 0000000000..1f232f4969 --- /dev/null +++ b/tensorflow/core/kernels/random_crop_op_test.cc @@ -0,0 +1,60 @@ +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { + +class RandomCropOpTest : public OpsTestBase { + protected: + RandomCropOpTest() { + RequireDefaultOps(); + EXPECT_OK(NodeDefBuilder("random_crop_op", "RandomCrop") + .Input(FakeInput(DT_UINT8)) + .Input(FakeInput(DT_INT64)) + .Attr("T", DT_UINT8) + .Finalize(node_def())); + EXPECT_OK(InitOp()); + } +}; + +TEST_F(RandomCropOpTest, Basic) { + AddInputFromArray<uint8>(TensorShape({1, 2, 1}), {2, 2}); + AddInputFromArray<int64>(TensorShape({2}), {1, 1}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_UINT8, TensorShape({1, 1, 1})); + test::FillValues<uint8>(&expected, {2}); + test::ExpectTensorEqual<uint8>(expected, *GetOutput(0)); +} + +TEST_F(RandomCropOpTest, SameSizeOneChannel) { + AddInputFromArray<uint8>(TensorShape({2, 1, 1}), {1, 2}); + AddInputFromArray<int64>(TensorShape({2}), {2, 
1}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_UINT8, TensorShape({2, 1, 1})); + test::FillValues<uint8>(&expected, {1, 2}); + test::ExpectTensorEqual<uint8>(expected, *GetOutput(0)); +} + +TEST_F(RandomCropOpTest, SameSizeMultiChannel) { + AddInputFromArray<uint8>(TensorShape({2, 1, 3}), {1, 2, 3, 4, 5, 6}); + AddInputFromArray<int64>(TensorShape({2}), {2, 1}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_UINT8, TensorShape({2, 1, 3})); + test::FillValues<uint8>(&expected, {1, 2, 3, 4, 5, 6}); + test::ExpectTensorEqual<uint8>(expected, *GetOutput(0)); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc new file mode 100644 index 0000000000..09b66d30e6 --- /dev/null +++ b/tensorflow/core/kernels/random_op.cc @@ -0,0 +1,276 @@ +// See docs in ../ops/random_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/random_op.h" + +#include <algorithm> +#include <memory> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/lib/hash/crc32c.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/util/guarded_philox_random.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +// The default implementation of the functor, which should never be invoked +// But we still need to provide implementation for now for the linker to work, +// since we do not support all the distributions yet. +template <typename Device, class Distribution> +struct FillPhiloxRandom { + typedef typename Distribution::ResultElementType T; + void operator()(OpKernelContext*, const Device&, random::PhiloxRandom gen, + T* data, int64 size) { + LOG(FATAL) << "Default FillPhiloxRandom should not be executed."; + } +}; + +#if GOOGLE_CUDA +// Declaration for the partial specialization with GPU +template <class Distribution> +struct FillPhiloxRandom<GPUDevice, Distribution> { + typedef typename Distribution::ResultElementType T; + void operator()(OpKernelContext* ctx, const GPUDevice&, + random::PhiloxRandom gen, T* data, int64 size); +}; + +#endif + +// A class to fill a specified range of random groups +template <class Distribution, bool VariableSamplesPerOutput> +struct FillPhiloxRandomTask; + +// Specialization for distribution that takes a fixed number of samples for +// each output. 
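// A standalone sketch of the skip-ahead idea used by the specializations
// below (illustrative only: ToyCounterGenerator is a stand-in for
// random::PhiloxRandom, matching just its Skip()/operator() shape): because
// the generator can jump directly to any output group, disjoint group ranges
// of one buffer can be filled independently, e.g. by different shards, and
// still reproduce exactly the same stream as a single sequential fill.
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

struct ToyCounterGenerator {
  static constexpr int kResultElementCount = 4;
  explicit ToyCounterGenerator(uint64_t seed) : counter_(seed) {}
  void Skip(uint64_t num_groups) { counter_ += num_groups; }  // jump ahead
  std::array<uint32_t, kResultElementCount> operator()() {    // one group
    std::array<uint32_t, kResultElementCount> out;
    for (int i = 0; i < kResultElementCount; ++i) {
      out[i] = Mix(counter_ * kResultElementCount + i);
    }
    ++counter_;
    return out;
  }
  static uint32_t Mix(uint64_t x) {  // toy stateless mixing function
    x ^= x >> 33;
    x *= 0xff51afd7ed558ccdULL;
    x ^= x >> 33;
    return static_cast<uint32_t>(x);
  }
  uint64_t counter_;
};

// Mirrors the fixed-samples-per-output task: fill groups [start, limit) of
// `data`, where each group is kResultElementCount consecutive outputs.
void FillGroups(ToyCounterGenerator gen, uint32_t* data, int64_t size,
                int64_t start_group, int64_t limit_group) {
  const int kGroupSize = ToyCounterGenerator::kResultElementCount;
  gen.Skip(start_group);
  int64_t offset = start_group * kGroupSize;
  const int64_t limit_full = std::min(limit_group, size / kGroupSize);
  for (int64_t g = start_group; g < limit_full; ++g) {
    auto samples = gen();
    std::copy(samples.begin(), samples.end(), data + offset);
    offset += kGroupSize;
  }
  if (limit_full < limit_group) {  // the last, partially filled group
    auto samples = gen();
    std::copy(samples.begin(), samples.begin() + (size - offset),
              data + offset);
  }
}

int main() {
  const int64_t size = 10;  // two full groups of 4 plus a tail of 2
  std::vector<uint32_t> whole(size), sharded(size);
  FillGroups(ToyCounterGenerator(42), whole.data(), size, 0, 3);
  // The same buffer filled as two independent shards: groups [0, 1) and [1, 3).
  FillGroups(ToyCounterGenerator(42), sharded.data(), size, 0, 1);
  FillGroups(ToyCounterGenerator(42), sharded.data(), size, 1, 3);
  assert(whole == sharded);
  return 0;
}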
+template <class Distribution> +struct FillPhiloxRandomTask<Distribution, false> { + typedef typename Distribution::ResultElementType T; + static void Run(random::PhiloxRandom gen, T* data, int64 size, + int64 start_group, int64 limit_group) { + Distribution dist; + const int kGroupSize = Distribution::kResultElementCount; + + gen.Skip(start_group); + int64 offset = start_group * kGroupSize; + + // First fill all the full-size groups + int64 limit_group_full = std::min(limit_group, size / kGroupSize); + for (int64 index = start_group; index < limit_group_full; ++index) { + auto samples = dist(&gen); + std::copy(&samples[0], &samples[0] + kGroupSize, data + offset); + offset += kGroupSize; + } + + // If there are any remaining elements that need to be filled, process them + if (limit_group_full < limit_group) { + int remaining_size = size - limit_group_full * kGroupSize; + auto samples = dist(&gen); + std::copy(&samples[0], &samples[0] + remaining_size, data + offset); + } + } +}; + +// Specialization for distribution that takes a varaiable number of samples for +// each output. This will be slower due to the generality. +template <class Distribution> +struct FillPhiloxRandomTask<Distribution, true> { + typedef typename Distribution::ResultElementType T; + static const int64 kReservedSamplesPerOutput = 256; + + static void Run(random::PhiloxRandom base_gen, T* data, int64 size, + int64 start_group, int64 limit_group) { + using random::PhiloxRandom; + using random::SingleSampleAdapter; + + Distribution dist; + const int kGroupSize = Distribution::kResultElementCount; + + static const int kGeneratorSkipPerOutputGroup = + kGroupSize * kReservedSamplesPerOutput / + PhiloxRandom::kResultElementCount; + + int64 offset = start_group * kGroupSize; + + // First fill all the full-size groups + int64 limit_group_full = std::min(limit_group, size / kGroupSize); + int64 group_index; + for (group_index = start_group; group_index < limit_group_full; + ++group_index) { + // Reset the generator to the beginning of the output group region + // This is necessary if we want the results to be independent of order + // of work + PhiloxRandom gen = base_gen; + gen.Skip(group_index * kGeneratorSkipPerOutputGroup); + SingleSampleAdapter<PhiloxRandom> single_samples(&gen); + + auto samples = dist(&single_samples); + std::copy(&samples[0], &samples[0] + kGroupSize, data + offset); + offset += kGroupSize; + } + + // If there are any remaining elements that need to be filled, process them + if (limit_group_full < limit_group) { + PhiloxRandom gen = base_gen; + gen.Skip(group_index * kGeneratorSkipPerOutputGroup); + SingleSampleAdapter<PhiloxRandom> single_samples(&gen); + + int remaining_size = size - limit_group_full * kGroupSize; + auto samples = dist(&single_samples); + std::copy(&samples[0], &samples[0] + remaining_size, data + offset); + } + } +}; + +// Partial specialization for CPU to fill the entire region with randoms +// It splits the work into several tasks and run them in parallel +template <class Distribution> +struct FillPhiloxRandom<CPUDevice, Distribution> { + typedef typename Distribution::ResultElementType T; + void operator()(OpKernelContext* context, const CPUDevice&, + random::PhiloxRandom gen, T* data, int64 size) { + const int kGroupSize = Distribution::kResultElementCount; + + auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); + + int64 total_group_count = (size + kGroupSize - 1) / kGroupSize; + + // Limit to maximum six threads for now. 
The performance scaling is very + // sub-linear. Too many threads causes a much worse overall performance. + int num_workers = 6; + Shard(num_workers, worker_threads.workers, total_group_count, kGroupSize, + [&gen, data, size](int64 start_group, int64 limit_group) { + FillPhiloxRandomTask< + Distribution, + Distribution::kVariableSamplesPerOutput>::Run(gen, data, size, + start_group, + limit_group); + }); + } +}; +} // namespace functor + +// For now, use the same interface as RandomOp, so we can choose either one +// at the run-time. +template <typename Device, class Distribution> +class PhiloxRandomOp : public OpKernel { + public: + typedef typename Distribution::ResultElementType T; + explicit PhiloxRandomOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, generator_.Init(ctx)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& input = ctx->input(0); + OP_REQUIRES( + ctx, TensorShapeUtils::IsLegacyVector(input.shape()), + errors::InvalidArgument("shape must be a vector of {int32,int64}.")); + Tensor* output = nullptr; + if (input.dtype() == DataType::DT_INT32) { + auto vec = input.flat<int32>(); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape( + vec.data(), vec.size()), + &output)); + } else if (input.dtype() == DataType::DT_INT64) { + auto vec = input.flat<int64>(); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShapeUtils::MakeShape( + vec.data(), vec.size()), + &output)); + } else { + OP_REQUIRES(ctx, false, errors::InvalidArgument( + "shape must be a vector of {int32,int64}.")); + } + functor::FillPhiloxRandom<Device, Distribution>()( + ctx, ctx->eigen_device<Device>(), + ReserveRandomOutputs(output->flat<T>().size()), + output->flat<T>().data(), output->flat<T>().size()); + } + + private: + GuardedPhiloxRandom generator_; + + // Reserve enough random samples in the generator for the given output count. 
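  // (Rough sizing note, as an illustration: the shift below reserves
  // output_count * 256 samples, matching the kReservedSamplesPerOutput = 256
  // budget that the variable-samples-per-output path skips ahead by for each
  // output group, so even a rejection-based distribution such as the
  // truncated normal should stay within the range handed out by
  // GuardedPhiloxRandom. For a shape of {1000}, for example, this reserves
  // 256,000 samples.)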
+ random::PhiloxRandom ReserveRandomOutputs(int64 output_count) { + int64 conservative_sample_count = output_count << 8; + return generator_.ReserveSamples128(conservative_sample_count); + } +}; + +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint<TYPE>("dtype"), \ + PhiloxRandomOp<CPUDevice, random::UniformDistribution< \ + random::PhiloxRandom, TYPE> >); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint<TYPE>("dtype"), \ + PhiloxRandomOp<CPUDevice, random::NormalDistribution< \ + random::PhiloxRandom, TYPE> >); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_CPU) \ + .HostMemory("shape") \ + .TypeConstraint<TYPE>("dtype"), \ + PhiloxRandomOp< \ + CPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >) + +REGISTER(float); +REGISTER(double); + +#undef REGISTER + +#if GOOGLE_CUDA + +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomUniform") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint<int32>("T") \ + .TypeConstraint<TYPE>("dtype"), \ + PhiloxRandomOp<GPUDevice, random::UniformDistribution< \ + random::PhiloxRandom, TYPE> >); \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomStandardNormal") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint<int32>("T") \ + .TypeConstraint<TYPE>("dtype"), \ + PhiloxRandomOp<GPUDevice, random::NormalDistribution< \ + random::PhiloxRandom, TYPE> >); \ + REGISTER_KERNEL_BUILDER( \ + Name("TruncatedNormal") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint<int32>("T") \ + .TypeConstraint<TYPE>("dtype"), \ + PhiloxRandomOp< \ + GPUDevice, \ + random::TruncatedNormalDistribution< \ + random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >) + +REGISTER(float); +REGISTER(double); + +#undef REGISTER + +#endif // GOOGLE_CUDA + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/random_op.h b/tensorflow/core/kernels/random_op.h new file mode 100644 index 0000000000..7c7eed4227 --- /dev/null +++ b/tensorflow/core/kernels/random_op.h @@ -0,0 +1,16 @@ +#ifndef TENSORFLOW_KERNELS_RANDOM_OP_H_ +#define TENSORFLOW_KERNELS_RANDOM_OP_H_ + +namespace tensorflow { + +class OpKernelContext; + +namespace functor { + +template <typename Device, class Distribution> +struct FillPhiloxRandom; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_RANDOM_OP_H_ diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc new file mode 100644 index 0000000000..15cf85f27e --- /dev/null +++ b/tensorflow/core/kernels/random_op_gpu.cu.cc @@ -0,0 +1,152 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/random_op.h" + +#include <stdio.h> +#include <assert.h> + +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +class OpKernelContext; + +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +template <class Distribution, bool VariableSamplesPerOutput> +struct FillPhiloxRandomKernel; + +// A cuda kernel to fill the data with random numbers from the specified +// distribution. Each output takes a fixed number of samples. 
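// A small host-side sketch of the indexing the kernel below uses (sizes and
// names here are made up for illustration): every simulated "thread" starts
// at its own group and then strides by the total thread count, so the threads
// jointly touch each output index exactly once, including the partial final
// group.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const int kGroupSize = 4;            // Distribution::kResultElementCount
  const int64_t size = 1003;           // deliberately not a multiple of 4
  const int total_thread_count = 96;   // gridDim.x * blockDim.x in the kernel

  std::vector<int> hits(size, 0);
  for (int thread_id = 0; thread_id < total_thread_count; ++thread_id) {
    int64_t offset = int64_t{thread_id} * kGroupSize;
    while (offset < size) {
      // One generator call would yield kGroupSize samples for this group.
      for (int i = 0; i < kGroupSize && offset < size; ++i, ++offset) {
        ++hits[offset];
      }
      offset += int64_t{total_thread_count - 1} * kGroupSize;
    }
  }
  for (int64_t i = 0; i < size; ++i) assert(hits[i] == 1);
  return 0;
}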
+template <class Distribution> +struct FillPhiloxRandomKernel<Distribution, false> { + typedef typename Distribution::ResultElementType T; + PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size) { + Distribution dist; + const int kGroupSize = Distribution::kResultElementCount; + + const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32 total_thread_count = gridDim.x * blockDim.x; + int32 offset = thread_id * kGroupSize; + gen.Skip(thread_id); + + while (offset < size) { + typename Distribution::ResultType samples = dist(&gen); + + for (int i = 0; i < kGroupSize; ++i) { + if (offset >= size) { + return; + } + data[offset] = samples[i]; + ++offset; + } + + offset += (total_thread_count - 1) * kGroupSize; + gen.Skip(total_thread_count - 1); + } + } +}; + +// A cuda kernel to fill the data with random numbers from the specified +// distribution. Each output takes a variable number of samples. +template <class Distribution> +struct FillPhiloxRandomKernel<Distribution, true> { + typedef typename Distribution::ResultElementType T; + PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data, + int64 size) { + using random::PhiloxRandom; + using random::SingleSampleAdapter; + + const int kReservedSamplesPerOutput = 256; + const int kGroupSize = Distribution::kResultElementCount; + const int kGeneratorSkipPerOutputGroup = kGroupSize * + kReservedSamplesPerOutput / + PhiloxRandom::kResultElementCount; + + const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32 total_thread_count = gridDim.x * blockDim.x; + int64 group_index = thread_id; + int64 offset = group_index * kGroupSize; + Distribution dist; + + while (offset < size) { + // Since each output takes a variable number of samples, we need to + // realign the generator to the beginning for the current output group + PhiloxRandom gen = base_gen; + gen.Skip(group_index * kGeneratorSkipPerOutputGroup); + SingleSampleAdapter<PhiloxRandom> single_samples(&gen); + + typename Distribution::ResultType samples = dist(&single_samples); + + for (int i = 0; i < kGroupSize; ++i) { + if (offset >= size) { + return; + } + data[offset] = samples[i]; + ++offset; + } + + offset += (total_thread_count - 1) * kGroupSize; + group_index += total_thread_count; + } + } +}; + +// A simple launch pad to call the correct function templates to fill the data +template <class Distribution> +__global__ void __launch_bounds__(1024) + FillPhiloxRandomKernelLaunch(random::PhiloxRandom base_gen, + typename Distribution::ResultElementType* data, + int64 size) { + FillPhiloxRandomKernel<Distribution, + Distribution::kVariableSamplesPerOutput>() + .Run(base_gen, data, size); +} + +// Partial specialization for GPU +template <class Distribution> +struct FillPhiloxRandom<GPUDevice, Distribution> { + typedef typename Distribution::ResultElementType T; + typedef GPUDevice Device; + void operator()(OpKernelContext*, const Device& d, random::PhiloxRandom gen, + T* data, int64 size) { + const int32 block_size = d.maxCudaThreadsPerBlock(); + const int32 num_blocks = + (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) / + block_size; + + FillPhiloxRandomKernelLaunch< + Distribution><<<num_blocks, block_size, 0, d.stream()>>>(gen, data, + size); + } +}; + +// Explicit instantiation of the GPU distributions functors +// clang-format off +// NVCC cannot handle ">>" properly +template struct FillPhiloxRandom< + GPUDevice, random::UniformDistribution<random::PhiloxRandom, float> >; +template struct 
FillPhiloxRandom< + GPUDevice, random::UniformDistribution<random::PhiloxRandom, double> >; +template struct FillPhiloxRandom< + GPUDevice, random::NormalDistribution<random::PhiloxRandom, float> >; +template struct FillPhiloxRandom< + GPUDevice, random::NormalDistribution<random::PhiloxRandom, double> >; +template struct FillPhiloxRandom< + GPUDevice, random::TruncatedNormalDistribution< + random::SingleSampleAdapter<random::PhiloxRandom>, float> >; +template struct FillPhiloxRandom< + GPUDevice, random::TruncatedNormalDistribution< + random::SingleSampleAdapter<random::PhiloxRandom>, double> >; +// clang-format on + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/random_op_test.cc b/tensorflow/core/kernels/random_op_test.cc new file mode 100644 index 0000000000..751b61cfba --- /dev/null +++ b/tensorflow/core/kernels/random_op_test.cc @@ -0,0 +1,99 @@ +#include <random> + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> + +namespace tensorflow { + +Tensor Int32(int32 v) { + Tensor t(DT_INT32, TensorShape({})); + t.scalar<int32>()() = v; + return t; +} + +Graph* RandomUniform(int64 n) { + Graph* g = new Graph(OpRegistry::Global()); + test::graph::RandomUniform(g, test::graph::Constant(g, Int32(n)), DT_FLOAT); + return g; +} + +Graph* RandomNormal(int64 n) { + Graph* g = new Graph(OpRegistry::Global()); + test::graph::RandomGaussian(g, test::graph::Constant(g, Int32(n)), DT_FLOAT); + return g; +} + +Graph* RandomParameters(int64 n) { + Graph* g = new Graph(OpRegistry::Global()); + test::graph::RandomParameters(g, test::graph::Constant(g, Int32(n)), + DT_FLOAT); + return g; +} + +#define BM_RNG(DEVICE, RNG) \ + static void BM_##DEVICE##_##RNG(int iters, int arg) { \ + testing::ItemsProcessed(static_cast<int64>(iters) * arg); \ + test::Benchmark(#DEVICE, RNG(arg)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_##RNG)->Range(1 << 20, 8 << 20); + +BM_RNG(cpu, RandomUniform); +BM_RNG(cpu, RandomNormal); +BM_RNG(cpu, RandomParameters); + +BM_RNG(gpu, RandomUniform); +BM_RNG(gpu, RandomNormal); +BM_RNG(gpu, RandomParameters); + +static void BM_PhiloxRandom(int iters) { + // Fill 2M random numbers + int count = 2 << 20; + + testing::ItemsProcessed(static_cast<int64>(iters) * count); + + random::PhiloxRandom gen(0x12345); + + int val = 1; + for (int i = 0; i < iters; ++i) { + for (int j = 0; j < count; j += 4) { + /// each invocation of gen() returns 128-bit samples + auto samples = gen(); + + // use the result trivially so the compiler does not optimize it away + val ^= samples[0] ^ samples[1] ^ samples[2] ^ samples[3]; + } + } + + // A anchor point to make sure the compiler does not cut corners + CHECK(val) << val; +} +BENCHMARK(BM_PhiloxRandom); + +static void BM_StdMTRandom(int iters) { + // Fill 2M random numbers + int count = 2 << 20; + + testing::ItemsProcessed(static_cast<int64>(iters) * count); + + std::mt19937 gen(0x12345); + + int val = 1; + for (int i = 0; i < iters; ++i) { + for (int j = 0; j < count; ++j) { + /// each invocation of gen() returns 32-bit sample + uint32 sample = gen(); + + // use the result trivially so the compiler does not optimize it away + val ^= sample; + } + } + + // A anchor point to make sure the compiler does not cut corners + CHECK(val) << val; +} +BENCHMARK(BM_StdMTRandom); + +} // end namespace 
tensorflow diff --git a/tensorflow/core/kernels/random_shuffle_op.cc b/tensorflow/core/kernels/random_shuffle_op.cc new file mode 100644 index 0000000000..b87f4e58a0 --- /dev/null +++ b/tensorflow/core/kernels/random_shuffle_op.cc @@ -0,0 +1,89 @@ +// See docs in ../ops/random_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/util/guarded_philox_random.h" + +namespace tensorflow { + +// TODO(irving): If performance is critical, generate output directly instead +// of an in-place shuffle using a pseudorandom permutation like +// +// https://github.com/otherlab/geode/blob/master/geode/random/permute.cpp +// +// This is probably also the right thing if we want a GPU version of shuffling. + +// We use our own version of std::random_shuffle to guarantee that exactly +// size - 1 samples are used. +template <class Iter, class Random> +static inline void RandomShuffle(Iter first, Iter last, Random& uniform) { + if (first == last) return; + const auto stop = last - 1; + for (auto i = first; i != stop; ++i) { + using std::iter_swap; + iter_swap(i, i + uniform(last - i)); + } +} + +template <typename T> +class RandomShuffleOp : public OpKernel { + public: + explicit RandomShuffleOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, generator_.Init(context)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + + if (input.NumElements() <= 1 || input.dim_size(0) <= 1) { + // No shuffling is required, so copy input directly to output + context->set_output(0, input); + } else { + // Reserve enough random samples for shuffling + const int64 size = input.dim_size(0); + const int64 samples = size - 1; + auto local_gen = generator_.ReserveSamples32(samples); + random::SingleSampleAdapter<random::PhiloxRandom> single(&local_gen); + const auto uniform = [&single](uint32 n) { return single() % n; }; + + if (input.dims() == 1) { + // For 1D data, copy and then shuffle in place + context->set_output(0, tensor::DeepCopy(input)); + auto vec = context->mutable_output(0)->vec<T>(); + RandomShuffle(vec.data(), vec.data() + size, uniform); + } else { + // For >= 2D, shuffle indices and then copy across + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + const auto input_mat = input.flat_outer_dims<T>(); + auto output_mat = output->flat_outer_dims<T>(); + std::vector<int> permutation(size); + for (int i = 0; i < size; i++) { + permutation[i] = i; + } + RandomShuffle(permutation.begin(), permutation.end(), uniform); + for (int i = 0; i < size; i++) { + output_mat.template chip<0>(i) = + input_mat.template chip<0>(permutation[i]); + } + } + } + } + + private: + GuardedPhiloxRandom generator_; +}; + +#define REGISTER(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("RandomShuffle").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ + RandomShuffleOp<T>); +TF_CALL_ALL_TYPES(REGISTER) + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc new file mode 100644 index 0000000000..561ec76e53 --- /dev/null +++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc @@ -0,0 +1,740 
@@ +// See docs in ../ops/data_flow_ops.cc. + +#include <deque> +#include <vector> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/queue_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +class RandomShuffleQueue : public QueueBase { + public: + RandomShuffleQueue(int32 capacity, int32 min_after_dequeue, int64 seed, + int64 seed2, const DataTypeVector& component_dtypes, + const std::vector<TensorShape>& component_shapes, + const string& name); + Status Initialize(); // Must be called before any other method. + + // Implementations of QueueInterface methods -------------------------------- + + Status ValidateTuple(const Tuple& tuple) override; + Status ValidateManyTuple(const Tuple& tuple) override; + void TryEnqueue(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) override; + void TryEnqueueMany(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) override; + void TryDequeue(OpKernelContext* ctx, CallbackWithTuple callback) override; + void TryDequeueMany(int num_elements, OpKernelContext* ctx, + CallbackWithTuple callback) override; + void Close(OpKernelContext* ctx, bool cancel_pending_enqueues, + DoneCallback callback) override; + Status MatchesNodeDef(const NodeDef& node_def) override; + + int32 size() override { + mutex_lock lock(mu_); + return queues_[0].size(); + } + + private: + enum Action { kEnqueue, kDequeue }; + + ~RandomShuffleQueue() override {} + + TensorShape ManyOutShape(int i, int batch_size) { + TensorShape shape({batch_size}); + shape.AppendShape(component_shapes_[i]); + return shape; + } + + // Helper for dequeuing a single random element from queues_. + void DequeueLocked(OpKernelContext* ctx, Tuple* tuple) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + void Cancel(Action action, CancellationToken token); + + // Helper for cancelling all pending Enqueue(Many) operations when + // Close is called with cancel_pending_enqueues. + void CloseAndCancel(); + + // Tries to enqueue/dequeue (or close) based on whatever is at the + // front of enqueue_attempts_/dequeue_attempts_. Appends to + // *finished the callback for any finished attempt (so it may be + // called once mu_ is released). Returns true if any progress was + // made. + struct CleanUp { + CleanUp(DoneCallback&& f, CancellationToken ct, CancellationManager* cm) + : finished(f), to_deregister(ct), cm(cm) {} + DoneCallback finished; + CancellationToken to_deregister; + CancellationManager* cm; + }; + bool TryAttemptLocked(Action action, std::vector<CleanUp>* clean_up) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Tries to make progress on the enqueues or dequeues at the front + // of the *_attempts_ queues. 
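// A condensed, single-threaded sketch of the attempt/flush machinery declared
// here (illustrative only: the mutex, cancellation tokens and OpKernelContext
// plumbing are omitted, and every name in the sketch is made up): blocked
// operations are parked as attempts, the front attempt of each queue is
// retried until no further progress is possible, and completion callbacks are
// collected so they can run after the lock would have been released.
#include <deque>
#include <functional>
#include <iostream>
#include <utility>
#include <vector>

enum RunResult { kNoProgress, kProgress, kComplete };

struct Attempt {
  std::function<RunResult()> run;  // tries to make progress while "locked"
  std::function<void()> done;      // must run outside the "lock"
};

// Mirrors TryAttemptLocked(): process attempts at the front of one queue,
// collecting the callbacks of completed attempts, until one is blocked.
bool TryAttempts(std::deque<Attempt>* attempts,
                 std::vector<std::function<void()>>* clean_up) {
  bool progress = false;
  while (!attempts->empty()) {
    Attempt* front = &attempts->front();
    const RunResult result = front->run();
    if (result == kComplete) {
      progress = true;
      clean_up->push_back(std::move(front->done));
      attempts->pop_front();
      continue;
    }
    if (result == kProgress) progress = true;
    break;  // the front attempt cannot finish yet
  }
  return progress;
}

// Mirrors FlushUnlocked(): alternate between the two attempt queues until
// neither side makes progress, then run the completion callbacks.
void Flush(std::deque<Attempt>* enqueues, std::deque<Attempt>* dequeues) {
  std::vector<std::function<void()>> clean_up;
  bool changed;
  do {
    changed = TryAttempts(enqueues, &clean_up);
    changed = TryAttempts(dequeues, &clean_up) || changed;
  } while (changed);
  for (auto& done : clean_up) done();
}

int main() {
  std::deque<Attempt> enqueues, dequeues;
  int queued = 0;
  // A dequeue attempt that is blocked until something has been enqueued, and
  // an enqueue attempt that always succeeds.
  dequeues.push_back({[&]() -> RunResult {
                        if (queued == 0) return kNoProgress;
                        --queued;
                        return kComplete;
                      },
                      [] { std::cout << "dequeue done\n"; }});
  enqueues.push_back({[&]() -> RunResult {
                        ++queued;
                        return kComplete;
                      },
                      [] { std::cout << "enqueue done\n"; }});
  Flush(&enqueues, &dequeues);  // prints "enqueue done" then "dequeue done"
  return 0;
}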
+ void FlushUnlocked(); + + const int32 capacity_; + const int32 min_after_dequeue_; + const int64 original_seed_; + const int64 original_seed2_; + + mutex mu_; + typedef std::vector<PersistentTensor> SubQueue; + std::vector<SubQueue> queues_ GUARDED_BY(mu_); + bool closed_ GUARDED_BY(mu_); + random::PhiloxRandom parent_generator_ GUARDED_BY(mu_); + random::SingleSampleAdapter<random::PhiloxRandom> generator_ GUARDED_BY(mu_); + + enum RunResult { kNoProgress, kProgress, kComplete }; + struct Attempt; + typedef std::function<RunResult(Attempt*)> RunCallback; + struct Attempt { + int32 elements_requested; + DoneCallback done_callback; // must be run outside mu_ + OpKernelContext* context; + CancellationToken cancellation_token; + RunCallback run_callback; // must be run while holding mu_ + bool is_cancelled; + Tuple tuple; + + Attempt(int32 elements_requested, DoneCallback done_callback, + OpKernelContext* context, CancellationToken cancellation_token, + RunCallback run_callback) + : elements_requested(elements_requested), + done_callback(done_callback), + context(context), + cancellation_token(cancellation_token), + run_callback(run_callback), + is_cancelled(false) {} + }; + std::deque<Attempt> enqueue_attempts_ GUARDED_BY(mu_); + std::deque<Attempt> dequeue_attempts_ GUARDED_BY(mu_); + + TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleQueue); +}; + +RandomShuffleQueue::RandomShuffleQueue( + int capacity, int min_after_dequeue, int64 seed, int64 seed2, + const DataTypeVector& component_dtypes, + const std::vector<TensorShape>& component_shapes, const string& name) + : QueueBase(component_dtypes, component_shapes, name), + capacity_(capacity), + min_after_dequeue_(min_after_dequeue), + original_seed_(seed), + original_seed2_(seed2), + closed_(false), + generator_(&parent_generator_) { + if (seed == 0 && seed2 == 0) { + // If both seeds are unspecified, use completely random seeds. + seed = random::New64(); + seed2 = random::New64(); + } + parent_generator_ = random::PhiloxRandom(seed, seed2); +} + +Status RandomShuffleQueue::Initialize() { + if (component_dtypes_.empty()) { + return errors::InvalidArgument("Empty component types for queue ", name_); + } + if (!component_shapes_.empty() && + component_dtypes_.size() != component_shapes_.size()) { + return errors::InvalidArgument("Different number of component types (", + component_dtypes_.size(), ") vs. shapes (", + component_shapes_.size(), ")."); + } + + mutex_lock lock(mu_); + queues_.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + queues_.push_back(SubQueue()); + queues_.back().reserve(min_after_dequeue_); + } + return Status::OK(); +} + +// TODO(mrry): If these checks become a bottleneck, find a way to +// reduce the number of times that they are called. +Status RandomShuffleQueue::ValidateTuple(const Tuple& tuple) { + TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple)); + if (specified_shapes()) { + for (size_t i = 0; i < tuple.size(); ++i) { + if (!tuple[i].shape().IsSameSize(component_shapes_[i])) { + return errors::InvalidArgument( + "Shape mismatch in tuple component ", i, ". Expected ", + component_shapes_[i].ShortDebugString(), ", got ", + tuple[i].shape().ShortDebugString()); + } + } + } + return Status::OK(); +} + +// TODO(mrry): If these checks become a bottleneck, find a way to +// reduce the number of times that they are called. 
+Status RandomShuffleQueue::ValidateManyTuple(const Tuple& tuple) { + TF_RETURN_IF_ERROR(ValidateTupleCommon(tuple)); + const int64 batch_size = tuple[0].dim_size(0); + if (specified_shapes()) { + for (size_t i = 0; i < tuple.size(); ++i) { + // Expected shape is [batch_size] + component_shapes_[i] + const TensorShape expected_shape = ManyOutShape(i, batch_size); + if (!tuple[i].shape().IsSameSize(expected_shape)) { + return errors::InvalidArgument( + "Shape mismatch in tuple component ", i, ". Expected ", + expected_shape.ShortDebugString(), ", got ", + tuple[i].shape().ShortDebugString()); + } + } + } else { + for (size_t i = 1; i < tuple.size(); ++i) { + if (tuple[i].dim_size(0) != batch_size) { + return errors::InvalidArgument( + "All input tensors must have the same size in the 0th ", + "dimension. Component ", i, " has ", tuple[i].dim_size(0), + ", and should have ", batch_size); + } + } + } + return Status::OK(); +} + +void RandomShuffleQueue::DequeueLocked(OpKernelContext* ctx, Tuple* tuple) { + DCHECK_GT(queues_[0].size(), 0); + int64 index = generator_() % queues_[0].size(); + (*tuple).reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + (*tuple).push_back(*queues_[i][index].AccessTensor(ctx)); + queues_[i][index] = queues_[i].back(); + queues_[i].pop_back(); + } +} + +void RandomShuffleQueue::Cancel(Action action, CancellationToken token) { + DoneCallback callback = nullptr; + { + mutex_lock lock(mu_); + std::deque<Attempt>* attempts = + action == kEnqueue ? &enqueue_attempts_ : &dequeue_attempts_; + + for (Attempt& attempt : *attempts) { + if (attempt.cancellation_token == token) { + attempt.is_cancelled = true; + if (action == kEnqueue) { + attempt.context->SetStatus( + errors::Cancelled("Enqueue operation was cancelled")); + } else { + attempt.context->SetStatus( + errors::Cancelled("Dequeue operation was cancelled")); + } + std::swap(callback, attempt.done_callback); + break; + } + } + } + if (callback) { + callback(); + FlushUnlocked(); + } +} + +void RandomShuffleQueue::CloseAndCancel() { + std::vector<DoneCallback> callbacks; + { + mutex_lock lock(mu_); + closed_ = true; + for (Attempt& attempt : enqueue_attempts_) { + attempt.is_cancelled = true; + attempt.context->SetStatus( + errors::Cancelled("Enqueue operation was cancelled")); + callbacks.emplace_back(std::move(attempt.done_callback)); + } + } + for (const DoneCallback& callback : callbacks) { + callback(); + } + FlushUnlocked(); +} + +bool RandomShuffleQueue::TryAttemptLocked( + Action action, std::vector<CleanUp>* clean_up) { + std::deque<Attempt>* attempts = + action == kEnqueue ? 
&enqueue_attempts_ : &dequeue_attempts_; + + bool progress = false; + bool done = false; + while (!done && !attempts->empty()) { + if (attempts->front().is_cancelled) { + if (action == kEnqueue) { + LOG(INFO) << "Skipping cancelled enqueue attempt"; + } else { + LOG(INFO) << "Skipping cancelled dequeue attempt"; + } + attempts->pop_front(); + } else { + Attempt* cur_attempt = &attempts->front(); + switch (cur_attempt->run_callback(cur_attempt)) { + case kNoProgress: + done = true; + break; + case kProgress: + done = true; + progress = true; + break; + case kComplete: + progress = true; + clean_up->emplace_back(std::move(cur_attempt->done_callback), + cur_attempt->cancellation_token, + cur_attempt->context->cancellation_manager()); + attempts->pop_front(); + break; + } + } + } + return progress; +} + +void RandomShuffleQueue::FlushUnlocked() { + std::vector<CleanUp> clean_up; + Ref(); + { + mutex_lock lock(mu_); + bool changed; + do { + changed = TryAttemptLocked(kEnqueue, &clean_up); + changed = TryAttemptLocked(kDequeue, &clean_up) || changed; + } while (changed); + } + Unref(); + for (const auto& to_clean : clean_up) { + if (to_clean.to_deregister != CancellationManager::kInvalidToken) { + // NOTE(mrry): We can safely ignore the return value of + // DeregisterCallback because the mutex mu_ ensures that the + // cleanup action only executes once. + to_clean.cm->DeregisterCallback(to_clean.to_deregister); + } + to_clean.finished(); + } +} + +void RandomShuffleQueue::TryEnqueue(const Tuple& tuple, OpKernelContext* ctx, + DoneCallback callback) { + CancellationManager* cm = ctx->cancellation_manager(); + CancellationToken token = cm->get_cancellation_token(); + bool already_cancelled; + { + mutex_lock l(mu_); + already_cancelled = !cm->RegisterCallback( + token, [this, token]() { Cancel(kEnqueue, token); }); + if (!already_cancelled) { + enqueue_attempts_.emplace_back( + 1, callback, ctx, token, + [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (closed_) { + attempt->context->SetStatus(errors::Aborted( + "RandomShuffleQueue '", name_, "' is closed.")); + return kComplete; + } + if (queues_[0].size() < static_cast<size_t>(capacity_)) { + for (int i = 0; i < num_components(); ++i) { + queues_[i].push_back(PersistentTensor(tuple[i])); + } + return kComplete; + } else { + return kNoProgress; + } + }); + } + } + if (!already_cancelled) { + FlushUnlocked(); + } else { + ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled")); + callback(); + } +} + +void RandomShuffleQueue::TryEnqueueMany(const Tuple& tuple, + OpKernelContext* ctx, + DoneCallback callback) { + const int64 batch_size = tuple[0].dim_size(0); + if (batch_size == 0) { + callback(); + return; + } + + CancellationManager* cm = ctx->cancellation_manager(); + CancellationToken token = cm->get_cancellation_token(); + bool already_cancelled; + { + mutex_lock l(mu_); + already_cancelled = !cm->RegisterCallback( + token, [this, token]() { Cancel(kEnqueue, token); }); + if (!already_cancelled) { + enqueue_attempts_.emplace_back( + batch_size, callback, ctx, token, + [tuple, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (closed_) { + attempt->context->SetStatus(errors::Aborted( + "RandomShuffleQueue '", name_, "' is closed.")); + return kComplete; + } + RunResult result = kNoProgress; + while (queues_[0].size() < static_cast<size_t>(capacity_)) { + result = kProgress; + const int index = + tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + 
TensorShape element_shape(tuple[i].shape()); + element_shape.RemoveDim(0); + PersistentTensor element; + Tensor* element_access = nullptr; + attempt->context->allocate_persistent( + tuple[i].dtype(), element_shape, &element, &element_access); + attempt->context->SetStatus( + CopySliceToElement(tuple[i], element_access, index)); + if (!attempt->context->status().ok()) return kComplete; + queues_[i].push_back(element); + } + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + return kComplete; + } + } + return result; + }); + } + } + if (!already_cancelled) { + FlushUnlocked(); + } else { + ctx->SetStatus(errors::Cancelled("Enqueue operation was cancelled")); + callback(); + } +} + +void RandomShuffleQueue::TryDequeue(OpKernelContext* ctx, + CallbackWithTuple callback) { + CancellationManager* cm = ctx->cancellation_manager(); + CancellationToken token = cm->get_cancellation_token(); + bool already_cancelled; + { + mutex_lock l(mu_); + already_cancelled = !cm->RegisterCallback( + token, [this, token]() { Cancel(kDequeue, token); }); + if (!already_cancelled) { + // TODO(josh11b): This makes two copies of callback, avoid this if possible. + dequeue_attempts_.emplace_back( + 1, [callback]() { callback(Tuple()); }, ctx, token, + [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int32 s = queues_[0].size(); + if (closed_ && s == 0) { + attempt->context->SetStatus(errors::OutOfRange( + "RandomShuffleQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", 1, ", current size ", s, + ")")); + return kComplete; + } + if (!closed_) s -= min_after_dequeue_; + if (s > 0) { + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + attempt->done_callback = [callback, tuple]() { callback(tuple); }; + return kComplete; + } else { + return kNoProgress; + } + }); + } + } + if (!already_cancelled) { + FlushUnlocked(); + } else { + ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled")); + callback(Tuple()); + } +} + +void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx, + CallbackWithTuple callback) { + if (!specified_shapes()) { + ctx->SetStatus( + errors::InvalidArgument("RandomShuffleQueue's DequeueMany requires the " + "components to have specified shapes.")); + callback(Tuple()); + return; + } + if (num_elements == 0) { + Tuple tuple; + tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + // TODO(josh11b,misard): Switch to allocate_output(). Problem is + // this breaks the abstraction boundary since we don't *really* + // know if and how the Tensors in the tuple we pass to callback + // correspond to the outputs of *ctx. For example, the + // ReaderRead Op uses TryDequeue() to get a filename out of a + // queue that is used internally by the reader and is not + // associated with any output of the ReaderRead. + // mrry@ adds: + // Maybe we need to pass a std::function<Tensor*(...)> (or + // better signature) that calls the appropriate allocator + // function in addition to ctx? (Or support a shim Allocator + // that has an internal OpKernelContext*, and dispatches to the + // appropriate method?) + // misard@ adds: + // I don't see that a std::function would help. The problem is + // that at this point (allocation time) the system doesn't know + // what is going to happen to the element read out of the + // queue. 
As long as we keep the generality that TensorFlow Ops + // do their own dynamic allocation in arbitrary C++ code, we + // need to preserve robustness to allocating output Tensors with + // the 'wrong' attributes, and fixing up with a copy. The only + // improvement I can see here in the future would be to support + // an optimized case where the queue 'knows' what attributes to + // use, and plumbs them through here. + Tensor element; + ctx->allocate_temp(component_dtypes_[i], ManyOutShape(i, 0), &element); + tuple.emplace_back(element); + } + callback(tuple); + return; + } + + CancellationManager* cm = ctx->cancellation_manager(); + CancellationToken token = cm->get_cancellation_token(); + bool already_cancelled; + { + mutex_lock l(mu_); + already_cancelled = !cm->RegisterCallback( + token, [this, token]() { Cancel(kDequeue, token); }); + if (!already_cancelled) { + // TODO(josh11b): This makes two copies of callback, avoid this if possible. + dequeue_attempts_.emplace_back( + num_elements, [callback]() { callback(Tuple()); }, ctx, token, + [callback, this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + int32 s = queues_[0].size(); + if (closed_ && s < attempt->elements_requested) { + attempt->context->SetStatus(errors::OutOfRange( + "RandomSuffleQueue '", name_, "' is closed and has ", + "insufficient elements (requested ", + attempt->elements_requested, ", current size ", s, ")")); + return kComplete; + } + + RunResult result = kNoProgress; + if (!closed_) s -= min_after_dequeue_; + for (; s > 0; --s) { + if (attempt->tuple.empty()) { + // Only allocate tuple when we have something to dequeue + // so we don't use exceessive memory when there are many + // blocked dequeue attempts waiting. + attempt->tuple.reserve(num_components()); + for (int i = 0; i < num_components(); ++i) { + const TensorShape shape = + ManyOutShape(i, attempt->elements_requested); + Tensor element; + attempt->context->allocate_temp(component_dtypes_[i], shape, + &element); + attempt->tuple.emplace_back(element); + } + } + result = kProgress; + Tuple tuple; + DequeueLocked(attempt->context, &tuple); + const int index = + attempt->tuple[0].dim_size(0) - attempt->elements_requested; + for (int i = 0; i < num_components(); ++i) { + attempt->context->SetStatus( + CopyElementToSlice(tuple[i], &attempt->tuple[i], index)); + if (!attempt->context->status().ok()) return kComplete; + } + tuple.clear(); + --attempt->elements_requested; + if (attempt->elements_requested == 0) { + tuple = attempt->tuple; + attempt->done_callback = [callback, tuple]() { + callback(tuple); + }; + return kComplete; + } + } + return result; + }); + } + } + if (!already_cancelled) { + FlushUnlocked(); + } else { + ctx->SetStatus(errors::Cancelled("Dequeue operation was cancelled")); + callback(Tuple()); + } +} + +void RandomShuffleQueue::Close(OpKernelContext* ctx, + bool cancel_pending_enqueues, + DoneCallback callback) { + if (cancel_pending_enqueues) { + CloseAndCancel(); + callback(); + } else { + { + mutex_lock lock(mu_); + enqueue_attempts_.emplace_back( + 0, callback, ctx, CancellationManager::kInvalidToken, + [this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (closed_) { + attempt->context->SetStatus(errors::Aborted( + "RandomShuffleQueue '", name_, "' is already closed.")); + } else { + closed_ = true; + } + return kComplete; + }); + } + FlushUnlocked(); + } +} + +Status RandomShuffleQueue::MatchesNodeDef(const NodeDef& node_def) { + TF_RETURN_IF_ERROR(MatchesNodeDefOp(node_def, "RandomShuffleQueue")); + 
TF_RETURN_IF_ERROR(MatchesNodeDefCapacity(node_def, capacity_)); + + int32 min_after_dequeue = -1; + TF_RETURN_IF_ERROR( + GetNodeAttr(node_def, "min_after_dequeue", &min_after_dequeue)); + if (min_after_dequeue != min_after_dequeue_) { + return errors::InvalidArgument( + "Shared queue '", name_, "' has min_after_dequeue ", + min_after_dequeue_, " but requested min_after_dequeue was ", + min_after_dequeue, "."); + } + + int64 seed = -1; + int64 seed2 = -1; + TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "seed", &seed)); + TF_RETURN_IF_ERROR(GetNodeAttr(node_def, "seed2", &seed2)); + if ((seed != 0 || seed2 != 0) && + (seed != original_seed_ || seed2 != original_seed2_)) { + return errors::InvalidArgument( + "Shared queue '", name_, "' has random seeds (", original_seed_, ", ", + original_seed2_, ") but requested seeds are (", seed, ", ", seed2, + ")."); + } + + TF_RETURN_IF_ERROR(MatchesNodeDefTypes(node_def)); + TF_RETURN_IF_ERROR(MatchesNodeDefShapes(node_def)); + + return Status::OK(); +} + +typedef std::shared_ptr<QueueInterface> QueueInterfacePtr; + +// Defines a RandomShuffleQueueOp, which produces a Queue (specifically, one +// backed by RandomShuffleQueue) that persists across different graph +// executions, and sessions. Running this op produces a single-element +// tensor of handles to Queues in the corresponding device. +class RandomShuffleQueueOp : public OpKernel { + public: + explicit RandomShuffleQueueOp(OpKernelConstruction* context) + : OpKernel(context), queue_handle_set_(false) { + OP_REQUIRES_OK(context, context->GetAttr("capacity", &capacity_)); + OP_REQUIRES_OK(context, + context->allocate_persistent(DT_STRING, TensorShape({2}), + &queue_handle_, nullptr)); + if (capacity_ < 0) { + capacity_ = RandomShuffleQueue::kUnbounded; + } + OP_REQUIRES_OK(context, + context->GetAttr("min_after_dequeue", &min_after_dequeue_)); + OP_REQUIRES(context, min_after_dequeue_ >= 0, + errors::InvalidArgument("min_after_dequeue ", + min_after_dequeue_, " must be >= 0")); + OP_REQUIRES( + context, min_after_dequeue_ < capacity_, + errors::InvalidArgument("min_after_dequeue ", min_after_dequeue_, + " must be < capacity ", capacity_)); + OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_)); + OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_)); + + OP_REQUIRES_OK(context, + context->GetAttr("component_types", &component_types_)); + OP_REQUIRES_OK(context, context->GetAttr("shapes", &component_shapes_)); + } + + ~RandomShuffleQueueOp() override { + // If the queue object was not shared, delete it. 
+ if (queue_handle_set_ && cinfo_.resource_is_private_to_kernel()) { + TF_CHECK_OK(cinfo_.resource_manager()->Delete<QueueInterface>( + cinfo_.container(), cinfo_.name())); + } + } + + void Compute(OpKernelContext* ctx) override { + mutex_lock l(mu_); + if (!queue_handle_set_) { + OP_REQUIRES_OK(ctx, SetQueueHandle(ctx)); + } + ctx->set_output_ref(0, &mu_, queue_handle_.AccessTensor(ctx)); + } + + private: + Status SetQueueHandle(OpKernelContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + TF_RETURN_IF_ERROR(cinfo_.Init(ctx->resource_manager(), def())); + QueueInterface* queue; + auto creator = [this](QueueInterface** ret) { + auto* q = new RandomShuffleQueue(capacity_, min_after_dequeue_, seed_, + seed2_, component_types_, + component_shapes_, cinfo_.name()); + Status s = q->Initialize(); + if (s.ok()) { + *ret = q; + } else { + q->Unref(); + } + return s; + }; + TF_RETURN_IF_ERROR( + cinfo_.resource_manager()->LookupOrCreate<QueueInterface>( + cinfo_.container(), cinfo_.name(), &queue, creator)); + core::ScopedUnref unref_me(queue); + // Verify that the shared queue is compatible with the requested arguments. + TF_RETURN_IF_ERROR(queue->MatchesNodeDef(def())); + auto h = queue_handle_.AccessTensor(ctx)->flat<string>(); + h(0) = cinfo_.container(); + h(1) = cinfo_.name(); + queue_handle_set_ = true; + return Status::OK(); + } + + int32 capacity_; + int32 min_after_dequeue_; + int64 seed_; + int64 seed2_; + DataTypeVector component_types_; + std::vector<TensorShape> component_shapes_; + ContainerInfo cinfo_; + + mutex mu_; + PersistentTensor queue_handle_ GUARDED_BY(mu_); + bool queue_handle_set_ GUARDED_BY(mu_); + + TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleQueueOp); +}; + +REGISTER_KERNEL_BUILDER(Name("RandomShuffleQueue").Device(DEVICE_CPU), + RandomShuffleQueueOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc new file mode 100644 index 0000000000..a3f4e0b0cb --- /dev/null +++ b/tensorflow/core/kernels/range_sampler.cc @@ -0,0 +1,305 @@ +#include "tensorflow/core/kernels/range_sampler.h" + +#include <vector> +#include <unordered_set> + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/lib/io/inputbuffer.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { + +using gtl::ArraySlice; +using gtl::MutableArraySlice; + +RangeSampler::~RangeSampler() {} + +void RangeSampler::SampleBatch(random::SimplePhilox* rnd, bool unique, + gtl::MutableArraySlice<int64> batch) const { + SampleBatchGetExpectedCount( + rnd, unique, batch, gtl::MutableArraySlice<float>(), + gtl::ArraySlice<int64>(), gtl::MutableArraySlice<float>()); +} + +void RangeSampler::SampleBatchGetExpectedCount( + random::SimplePhilox* rnd, bool unique, gtl::MutableArraySlice<int64> batch, + gtl::MutableArraySlice<float> batch_expected_count, + gtl::ArraySlice<int64> extras, + gtl::MutableArraySlice<float> extras_expected_count) const { + SampleBatchGetExpectedCountAvoid(rnd, unique, batch, batch_expected_count, + extras, extras_expected_count, + gtl::ArraySlice<int64>()); +} + +namespace { + +// Approximates the expected count of a value in the output of SampleBatch. 
+// +// If unique=false, then this is (Probability(value) * batch_size) +// +// We use batch_size and num_tries, where num_tries is the observed number of +// tries it took to get batch_size unique values. +// +// Assuming (falsely) that the nubmer of tries to get a batch of batch_size +// distinct values is _always_ num_tries, the probability that the value +// is in a batch is (1 - (1-p)^num_tries) +static float ExpectedCountHelper(float p, int batch_size, int num_tries) { + if (num_tries == batch_size) { + // This shortcut will always be taken if unique=false + return p * batch_size; + } + // numerically stable version of (1 - (1-p)^num_tries) + return -expm1(num_tries * log1p(-p)); +} + +} // namespace + +void RangeSampler::SampleBatchGetExpectedCountAvoid( + random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64> batch, + MutableArraySlice<float> batch_expected_count, ArraySlice<int64> extras, + MutableArraySlice<float> extras_expected_count, + ArraySlice<int64> avoided_values) const { + const int batch_size = batch.size(); + int num_tries; + + if (unique) { + CHECK_LE(batch_size + avoided_values.size(), range_); + std::unordered_set<int64> used(batch_size); + used.insert(avoided_values.begin(), avoided_values.end()); + int num_picked = 0; + num_tries = 0; + while (num_picked < batch_size) { + num_tries++; + CHECK_LT(num_tries, kint32max); + int64 value = Sample(rnd); + if (gtl::InsertIfNotPresent(&used, value)) { + batch[num_picked++] = value; + } + } + } else { + CHECK_EQ(avoided_values.size(), 0) + << "avoided_values only supported with unique=true"; + for (int i = 0; i < batch_size; i++) { + batch[i] = Sample(rnd); + } + num_tries = batch_size; + } + // Compute the expected counts of the batch and the extra values + if (batch_expected_count.size() > 0) { + CHECK_EQ(batch_size, batch_expected_count.size()); + for (int i = 0; i < batch_size; i++) { + batch_expected_count[i] = + ExpectedCountHelper(Probability(batch[i]), batch_size, num_tries); + } + } + CHECK_EQ(extras.size(), extras_expected_count.size()); + for (size_t i = 0; i < extras.size(); i++) { + extras_expected_count[i] = + ExpectedCountHelper(Probability(extras[i]), batch_size, num_tries); + } +} + +AllSampler::AllSampler(int64 range) + : RangeSampler(range), inv_range_(1.0 / range) {} + +void AllSampler::SampleBatchGetExpectedCountAvoid( + random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64> batch, + MutableArraySlice<float> batch_expected_count, ArraySlice<int64> extras, + MutableArraySlice<float> extras_expected_count, + ArraySlice<int64> avoided_values) const { + const int batch_size = batch.size(); + CHECK_EQ(range_, batch_size); + for (int i = 0; i < batch_size; i++) { + batch[i] = i; + } + if (batch_expected_count.size() > 0) { + CHECK_EQ(batch_size, batch_expected_count.size()); + for (int i = 0; i < batch_size; i++) { + batch_expected_count[i] = 1; + } + } + CHECK_EQ(0, avoided_values.size()); + CHECK_EQ(extras.size(), extras_expected_count.size()); + for (size_t i = 0; i < extras.size(); i++) { + extras_expected_count[i] = 1; + } +} + +UniformSampler::UniformSampler(int64 range) + : RangeSampler(range), inv_range_(1.0 / range) {} + +int64 UniformSampler::Sample(random::SimplePhilox* rnd) const { + return rnd->Uniform64(range_); +} + +float UniformSampler::Probability(int64 value) const { return inv_range_; } + +LogUniformSampler::LogUniformSampler(int64 range) + : RangeSampler(range), log_range_(log(range + 1)) {} + +int64 LogUniformSampler::Sample(random::SimplePhilox* rnd) const { + const 
int64 value = + static_cast<int64>(exp(rnd->RandDouble() * log_range_)) - 1; + CHECK_GE(value, 0); + // Mathematically, value should be <= range_, but might not be due to some + // floating point roundoff, so we mod by range_. + return value % range_; +} + +float LogUniformSampler::Probability(int64 value) const { + // value is returned iff the call to UniformDouble(log_range_) in the + // Sample() function returns a value between log(value + 1) + // and log(value + 2). The probability of this is: + // (log(value + 2) - log(value + 1)) / log_range + // To avoid two calls to log(), we compute this as follows: + return (log((value + 2.0) / (value + 1.0))) / log_range_; +} + +ThreadUnsafeUnigramSampler::ThreadUnsafeUnigramSampler(int64 range) + : RangeSampler(range), picker_(range) { + CHECK_LT(range, kint32max); +} + +int64 ThreadUnsafeUnigramSampler::Sample(random::SimplePhilox* rnd) const { + return picker_.Pick(rnd); +} + +float ThreadUnsafeUnigramSampler::Probability(int64 value) const { + return static_cast<float>(picker_.get_weight(value)) / picker_.total_weight(); +} + +void ThreadUnsafeUnigramSampler::Update(ArraySlice<int64> values) { + int num_updates = std::min(static_cast<int>(values.size()), + kint32max - picker_.total_weight()); + for (int i = 0; i < num_updates; i++) { + const int64 value = values[i]; + picker_.set_weight(value, picker_.get_weight(value) + 1); + } +} + +// Thread-safe unigram sampler +UnigramSampler::UnigramSampler(int64 range) + : RangeSampler(range), unsafe_sampler_(range) { + CHECK_LT(range, kint32max); +} + +int64 UnigramSampler::Sample(random::SimplePhilox* rnd) const { + mutex_lock lock(mu_); // could use reader lock + return unsafe_sampler_.Sample(rnd); +} + +float UnigramSampler::Probability(int64 value) const { + mutex_lock lock(mu_); // could use reader lock + return unsafe_sampler_.Probability(value); +} + +// Overriding at a high level results in far fewer lock aquisitions. +void UnigramSampler::SampleBatchGetExpectedCountAvoid( + random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64> batch, + MutableArraySlice<float> batch_expected_count, ArraySlice<int64> extras, + MutableArraySlice<float> extras_expected_count, + ArraySlice<int64> avoided_values) const { + mutex_lock lock(mu_); // could use reader lock + unsafe_sampler_.SampleBatchGetExpectedCountAvoid( + rnd, unique, batch, batch_expected_count, extras, extras_expected_count, + avoided_values); +} + +void UnigramSampler::Update(ArraySlice<int64> values) { + mutex_lock lock(mu_); + unsafe_sampler_.Update(values); +} + +FixedUnigramSampler::FixedUnigramSampler(Env* env, int64 range, + const string& vocab_file, + float distortion, + int32 num_reserved_ids, + int32 num_shards, int32 shard) + : RangeSampler(range), + total_weight_(0.0), + num_shards_(num_shards), + shard_(shard) { + FillReservedIds(num_reserved_ids); + // TODO(vanhoucke): make this non-crashing. + TF_CHECK_OK(LoadFromFile(env, vocab_file, distortion)); + CHECK_EQ(range, weights_.size()); + dist_sampler_.reset(new random::DistributionSampler(weights_)); +} + +FixedUnigramSampler::FixedUnigramSampler(int64 range, + const std::vector<float>& unigrams, + float distortion, + int32 num_reserved_ids, + int32 num_shards, int32 shard) + : RangeSampler(range), + total_weight_(0.0), + num_shards_(num_shards), + shard_(shard) { + FillReservedIds(num_reserved_ids); + LoadFromUnigrams(unigrams, distortion); + // TODO(vanhoucke): make this non-crashing. 
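+  // Note on distortion (illustrative numbers only): each raw weight w is
+  // replaced by pow(w, distortion) before normalization, so counts
+  // {1, 4, 16} stay {1, 4, 16} at distortion 1.0, flatten to {1, 2, 4} at
+  // 0.5, and become the uniform {1, 1, 1} at 0.0.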
+ CHECK_EQ(range, weights_.size()); + dist_sampler_.reset(new random::DistributionSampler(weights_)); +} + +float FixedUnigramSampler::Probability(int64 value) const { + return weights_.at(value) / total_weight_; +} + +int64 FixedUnigramSampler::Sample(random::SimplePhilox* rnd) const { + return dist_sampler_->Sample(rnd); +} + +void FixedUnigramSampler::FillReservedIds(int32 num_reserved_ids) { + for (int32 word_id = 0; word_id < num_reserved_ids; ++word_id) { + if (word_id % num_shards_ == shard_) weights_.push_back(0.0); + } +} + +Status FixedUnigramSampler::LoadFromFile(Env* env, const string& vocab_file, + float distortion) { + RandomAccessFile* file; + TF_RETURN_IF_ERROR(env->NewRandomAccessFile(vocab_file, &file)); + io::InputBuffer in(file, 262144 /*bytes*/); + string line; + int32 word_id = weights_.size(); + while (in.ReadLine(&line).ok()) { + // The vocabulary file should be in csv like format, with the last + // field the weight associated with the word. + std::vector<string> cols = str_util::Split(line, ','); + if (cols.size() == 0) continue; + // Skip entries that do not belong to this shard. + if (word_id % num_shards_ == shard_) { + float w = 0.0; + if (!strings::safe_strtof(cols.at(cols.size() - 1).c_str(), &w)) { + return errors::InvalidArgument("Wrong vocabulary format at line: ", + line); + } + w = pow(w, distortion); + total_weight_ += w; + weights_.push_back(w); + } + ++word_id; + } + return Status::OK(); +} + +void FixedUnigramSampler::LoadFromUnigrams(const std::vector<float>& unigrams, + float distortion) { + int32 word_id = weights_.size(); + for (float w : unigrams) { + // Skip entries that do not belong to this shard. + if (word_id % num_shards_ == shard_) { + w = pow(w, distortion); + total_weight_ += w; + weights_.push_back(w); + } + ++word_id; + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/range_sampler.h b/tensorflow/core/kernels/range_sampler.h new file mode 100644 index 0000000000..18364c2c03 --- /dev/null +++ b/tensorflow/core/kernels/range_sampler.h @@ -0,0 +1,237 @@ +#ifndef TENSORFLOW_KERNELS_RANGE_SAMPLER_H_ +#define TENSORFLOW_KERNELS_RANGE_SAMPLER_H_ + +#include <vector> + +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/random/distribution_sampler.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/lib/random/weighted_picker.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { + +class Env; + +// Abstract subclass for sampling from the set of non-negative integers +// [0, range) +class RangeSampler { + public: + explicit RangeSampler(int range) : range_(range) { CHECK_GT(range_, 0); } + virtual ~RangeSampler(); + + // Sample a single value + virtual int64 Sample(random::SimplePhilox* rnd) const = 0; + + // The probability that a single call to Sample() returns the given value. + // Assumes that value is in [0, range). No range checking is done. + virtual float Probability(int64 value) const = 0; + + // Fill "batch" with samples from the distribution. + // If unique=true, then we re-pick each element until we get a + // value distinct from all previously picked values in the batch. + void SampleBatch(random::SimplePhilox* rnd, bool unique, + gtl::MutableArraySlice<int64> batch) const; + + // Fill "batch" with samples from the distribution, and report + // "expected counts". 
+ // + // The "expected count" of a value is an estimate of the expected + // number of occurrences of the value in the batch returned by a + // call to this function with the given parameters. If unique=true, + // the expected count is an inclusion probability. For details on + // this estimation, see the comment to "ExpectedCountHelper" in the + // .cc file. + // + // Expected counts for the elements of the returned "batch" are reported + // in the aligned array "batch_expected_count". + // + // The user can optionally provide "extras", containing values in the range. + // The expected counts for the extras are reported in the aligned array + // "extras_expected_count". + // + // "batch_expected_count" must have size equal to 0 or to the size of "batch". + // "extras" and "extras_expected_count" must have equal size. + void SampleBatchGetExpectedCount( + random::SimplePhilox* rnd, bool unique, + gtl::MutableArraySlice<int64> batch, + gtl::MutableArraySlice<float> batch_expected_count, + gtl::ArraySlice<int64> extras, + gtl::MutableArraySlice<float> extras_expected_count) const; + + // Same as SampleBatchGetExpectedCount (see above), but with avoided values. + // We repick to avoid all of the values in "avoided_values". + // "avoided_values" is only supported with unique=true. If + // unique=false, then avoided_values must be empty. + virtual void SampleBatchGetExpectedCountAvoid( + random::SimplePhilox* rnd, bool unique, + gtl::MutableArraySlice<int64> batch, + gtl::MutableArraySlice<float> batch_expected_count, + gtl::ArraySlice<int64> extras, + gtl::MutableArraySlice<float> extras_expected_count, + gtl::ArraySlice<int64> avoided_values) const; + + // Does this sampler need to be updated with values, e.g. UnigramSampler + virtual bool NeedsUpdates() const { return false; } + + // Updates the underlying distribution + virtual void Update(gtl::ArraySlice<int64> values) { + LOG(FATAL) << "Update not supported for this sampler type."; + } + + int64 range() { return range_; } + + protected: + const int64 range_; +}; + +// An AllSampler only samples batches of size equal to range. +// It returns the entire range. +// It cannot sample single values.
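+// ---------------------------------------------------------------------------
+// Illustrative sketch (hypothetical helper, not part of the sampler API): the
+// "expected count" reported above, reproduced by hand. For unique=true it is
+// the inclusion probability 1 - (1 - p)^num_tries, which ExpectedCountHelper
+// in the .cc file evaluates as -expm1(num_tries * log1p(-p)) for numerical
+// stability. Assumes <cmath> is available.
+inline float IllustrativeExpectedCount(float p, int batch_size,
+                                       int num_tries) {
+  if (num_tries == batch_size) {
+    // unique=false: draws are independent, so E[count] = p * batch_size.
+    return p * batch_size;
+  }
+  // unique=true: chance of appearing at least once in num_tries draws,
+  // e.g. p = 0.02 and num_tries = 50 gives 1 - 0.98^50, roughly 0.64.
+  return -std::expm1(num_tries * std::log1p(-p));
+}
+// ---------------------------------------------------------------------------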
+class AllSampler : public RangeSampler { + public: + explicit AllSampler(int64 range); + + ~AllSampler() override {} + + int64 Sample(random::SimplePhilox* rnd) const override { + LOG(FATAL) << "Should not be called"; + } + + float Probability(int64 value) const override { + LOG(FATAL) << "Should not be called"; + } + + void SampleBatchGetExpectedCountAvoid( + random::SimplePhilox* rnd, bool unique, + gtl::MutableArraySlice<int64> batch, + gtl::MutableArraySlice<float> batch_expected_count, + gtl::ArraySlice<int64> extras, + gtl::MutableArraySlice<float> extras_expected_count, + gtl::ArraySlice<int64> avoided_values) const override; + + private: + const float inv_range_; +}; + +class UniformSampler : public RangeSampler { + public: + explicit UniformSampler(int64 range); + + ~UniformSampler() override {} + + int64 Sample(random::SimplePhilox* rnd) const override; + + float Probability(int64 value) const override; + + private: + const float inv_range_; +}; + +class LogUniformSampler : public RangeSampler { + public: + explicit LogUniformSampler(int64 range); + + ~LogUniformSampler() override {} + + int64 Sample(random::SimplePhilox* rnd) const override; + + float Probability(int64 value) const override; + + private: + const double log_range_; +}; + +// Thread-unsafe unigram sampler +class ThreadUnsafeUnigramSampler : public RangeSampler { + public: + explicit ThreadUnsafeUnigramSampler(int64 range); + ~ThreadUnsafeUnigramSampler() override {} + + int64 Sample(random::SimplePhilox* rnd) const override; + + float Probability(int64 value) const override; + + bool NeedsUpdates() const override { return true; } + void Update(gtl::ArraySlice<int64> values) override; + + private: + random::WeightedPicker picker_; +}; + +// Thread-safe unigram sampler +class UnigramSampler : public RangeSampler { + public: + explicit UnigramSampler(int64 range); + ~UnigramSampler() override {} + + int64 Sample(random::SimplePhilox* rnd) const override; + + float Probability(int64 value) const override; + + // Overriding at a high level results in far fewer lock aquisitions. + void SampleBatchGetExpectedCountAvoid( + random::SimplePhilox* rnd, bool unique, + gtl::MutableArraySlice<int64> batch, + gtl::MutableArraySlice<float> batch_expected_count, + gtl::ArraySlice<int64> extras, + gtl::MutableArraySlice<float> extras_expected_count, + gtl::ArraySlice<int64> avoided_values) const override; + + bool NeedsUpdates() const override { return true; } + void Update(gtl::ArraySlice<int64> values) override; + + private: + ThreadUnsafeUnigramSampler unsafe_sampler_ GUARDED_BY(mu_); + mutable mutex mu_; +}; + +// A unigram sampler that uses a fixed unigram distribution read from a +// file or passed in as an in-memory array instead of building up the +// distribution from data on the fly. There is also an option to skew the +// distribution by applying a distortion power to the weights. +class FixedUnigramSampler : public RangeSampler { + public: + // The vocab_file is assumed to be a CSV, with the last entry of each row a + // value representing the counts or probabilities for the corresponding ID. 
+ FixedUnigramSampler(Env* env, int64 range, const string& vocab_file, + float distortion, int32 num_reserved_ids, + int32 num_shards, int32 shard); + + FixedUnigramSampler(int64 range, const std::vector<float>& unigrams, + float distortion, int32 num_reserved_ids, + int32 num_shards, int32 shard); + + float Probability(int64 value) const override; + + int64 Sample(random::SimplePhilox* rnd) const override; + + private: + // Underlying distribution sampler. + std::unique_ptr<random::DistributionSampler> dist_sampler_; + // Weights for individual samples. The probability of a sample i is defined + // as weights_.at(i) / total_weight_. + std::vector<float> weights_; + // The total weights of all samples. + float total_weight_; + // Sharding information of the sampler. The whole vocabulary is sharded + // into num_shards_ smaller ranges and each sampler is responsible for one + // such smaller range, identified by the shard number. + int32 num_shards_; + int32 shard_; + + // Fill the sampler with the appropriate number of reserved IDs. + void FillReservedIds(int32 num_reserved_ids); + // Load IDs to sample from a CSV file. It is assumed that the last item of + // each row contains a count or probability for the corresponding ID. + Status LoadFromFile(Env* env, const string& vocab_file, float distortion); + // Load from an in-memory array. + void LoadFromUnigrams(const std::vector<float>& unigrams, float distortion); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_RANGE_SAMPLER_H_ diff --git a/tensorflow/core/kernels/range_sampler_test.cc b/tensorflow/core/kernels/range_sampler_test.cc new file mode 100644 index 0000000000..72c39009e4 --- /dev/null +++ b/tensorflow/core/kernels/range_sampler_test.cc @@ -0,0 +1,320 @@ +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/core/kernels/range_sampler.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/env.h" + +namespace tensorflow { +namespace { + +using gtl::ArraySlice; +using gtl::MutableArraySlice; + +class RangeSamplerTest : public ::testing::Test { + protected: + void CheckProbabilitiesSumToOne() { + double sum = 0; + for (int i = 0; i < sampler_->range(); i++) { + sum += sampler_->Probability(i); + } + EXPECT_NEAR(sum, 1.0, 1e-4); + } + void CheckHistogram(int num_samples, float tolerance) { + const int range = sampler_->range(); + std::vector<int> h(range); + std::vector<int64> a(num_samples); + // Using a fixed random seed to make the test deterministic. + random::PhiloxRandom philox(123, 17); + random::SimplePhilox rnd(&philox); + sampler_->SampleBatch(&rnd, false, &a); + for (int i = 0; i < num_samples; i++) { + int64 val = a[i]; + ASSERT_GE(val, 0); + ASSERT_LT(val, range); + h[val]++; + } + for (int val = 0; val < range; val++) { + EXPECT_NEAR((h[val] + 0.0) / num_samples, sampler_->Probability(val), + tolerance); + } + } + void Update1() { + // Add the value 3 ten times. + std::vector<int64> a(10); + for (int i = 0; i < 10; i++) { + a[i] = 3; + } + sampler_->Update(a); + } + void Update2() { + // Add the value n n times. 
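+    // Assuming each value starts with weight 1 (consistent with the
+    // expectations in UnigramProbabilities1 below), value v picks up v more
+    // increments here, so its weight becomes v + 1 out of a total of 55 and
+    // Probability(v) == (v + 1) / 55, which UnigramProbabilities2 verifies.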
+ int64 a[10]; + for (int i = 0; i < 10; i++) { + a[i] = i; + } + for (int64 i = 1; i < 10; i++) { + sampler_->Update(ArraySlice<int64>(a + i, 10 - i)); + } + } + std::unique_ptr<RangeSampler> sampler_; +}; + +TEST_F(RangeSamplerTest, UniformProbabilities) { + sampler_.reset(new UniformSampler(10)); + for (int i = 0; i < 10; i++) { + CHECK_EQ(sampler_->Probability(i), sampler_->Probability(0)); + } +} + +TEST_F(RangeSamplerTest, UniformChecksum) { + sampler_.reset(new UniformSampler(10)); + CheckProbabilitiesSumToOne(); +} + +TEST_F(RangeSamplerTest, UniformHistogram) { + sampler_.reset(new UniformSampler(10)); + CheckHistogram(1000, 0.05); +} + +TEST_F(RangeSamplerTest, LogUniformProbabilities) { + int range = 1000000; + sampler_.reset(new LogUniformSampler(range)); + for (int i = 100; i < range; i *= 2) { + float ratio = sampler_->Probability(i) / sampler_->Probability(i / 2); + EXPECT_NEAR(ratio, 0.5, 0.1); + } +} + +TEST_F(RangeSamplerTest, LogUniformChecksum) { + sampler_.reset(new LogUniformSampler(10)); + CheckProbabilitiesSumToOne(); +} + +TEST_F(RangeSamplerTest, LogUniformHistogram) { + sampler_.reset(new LogUniformSampler(10)); + CheckHistogram(1000, 0.05); +} + +TEST_F(RangeSamplerTest, UnigramProbabilities1) { + sampler_.reset(new UnigramSampler(10)); + Update1(); + EXPECT_NEAR(sampler_->Probability(3), 0.55, 1e-4); + for (int i = 0; i < 10; i++) { + if (i != 3) { + ASSERT_NEAR(sampler_->Probability(i), 0.05, 1e-4); + } + } +} +TEST_F(RangeSamplerTest, UnigramProbabilities2) { + sampler_.reset(new UnigramSampler(10)); + Update2(); + for (int i = 0; i < 10; i++) { + ASSERT_NEAR(sampler_->Probability(i), (i + 1) / 55.0, 1e-4); + } +} +TEST_F(RangeSamplerTest, UnigramChecksum) { + sampler_.reset(new UnigramSampler(10)); + Update1(); + CheckProbabilitiesSumToOne(); +} + +TEST_F(RangeSamplerTest, UnigramHistogram) { + sampler_.reset(new UnigramSampler(10)); + Update1(); + CheckHistogram(1000, 0.05); +} + +static const char kVocabContent[] = + "w1,1\n" + "w2,2\n" + "w3,4\n" + "w4,8\n" + "w5,16\n" + "w6,32\n" + "w7,64\n" + "w8,128\n" + "w9,256"; +TEST_F(RangeSamplerTest, FixedUnigramProbabilities) { + Env* env = Env::Default(); + string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); + sampler_.reset(new FixedUnigramSampler(env, 9, fname, 0.8, 0, 1, 0)); + // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05 + for (int i = 0; i < 9; i++) { + ASSERT_NEAR(sampler_->Probability(i), pow(2, i * 0.8) / 197.05, 1e-4); + } +} +TEST_F(RangeSamplerTest, FixedUnigramChecksum) { + Env* env = Env::Default(); + string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); + sampler_.reset(new FixedUnigramSampler(env, 9, fname, 0.8, 0, 1, 0)); + CheckProbabilitiesSumToOne(); +} + +TEST_F(RangeSamplerTest, FixedUnigramHistogram) { + Env* env = Env::Default(); + string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); + sampler_.reset(new FixedUnigramSampler(env, 9, fname, 0.8, 0, 1, 0)); + CheckHistogram(1000, 0.05); +} +TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1) { + Env* env = Env::Default(); + string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); + sampler_.reset(new FixedUnigramSampler(env, 10, fname, 0.8, 1, 1, 0)); + ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4); + // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05 + for (int i = 1; i < 10; 
i++) { + ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 1) * 0.8) / 197.05, 1e-4); + } +} +TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve2) { + Env* env = Env::Default(); + string fname = io::JoinPath(testing::TmpDir(), "vocab_file"); + TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent)); + sampler_.reset(new FixedUnigramSampler(env, 11, fname, 0.8, 2, 1, 0)); + ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4); + ASSERT_NEAR(sampler_->Probability(1), 0, 1e-4); + // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05 + for (int i = 2; i < 11; i++) { + ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 2) * 0.8) / 197.05, 1e-4); + } +} +TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesFromVector) { + std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256}; + sampler_.reset(new FixedUnigramSampler(9, weights, 0.8, 0, 1, 0)); + // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05 + for (int i = 0; i < 9; i++) { + ASSERT_NEAR(sampler_->Probability(i), pow(2, i * 0.8) / 197.05, 1e-4); + } +} +TEST_F(RangeSamplerTest, FixedUnigramChecksumFromVector) { + std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256}; + sampler_.reset(new FixedUnigramSampler(9, weights, 0.8, 0, 1, 0)); + CheckProbabilitiesSumToOne(); +} +TEST_F(RangeSamplerTest, FixedUnigramHistogramFromVector) { + std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256}; + sampler_.reset(new FixedUnigramSampler(9, weights, 0.8, 0, 1, 0)); + CheckHistogram(1000, 0.05); +} +TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1FromVector) { + std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256}; + sampler_.reset(new FixedUnigramSampler(10, weights, 0.8, 1, 1, 0)); + ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4); + // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05 + for (int i = 1; i < 10; i++) { + ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 1) * 0.8) / 197.05, 1e-4); + } +} +TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve2FromVector) { + std::vector<float> weights = {1, 2, 4, 8, 16, 32, 64, 128, 256}; + sampler_.reset(new FixedUnigramSampler(11, weights, 0.8, 2, 1, 0)); + ASSERT_NEAR(sampler_->Probability(0), 0, 1e-4); + ASSERT_NEAR(sampler_->Probability(1), 0, 1e-4); + // 1^0.8+2^0.8+4^0.8+...+256^0.8=197.05 + for (int i = 2; i < 11; i++) { + ASSERT_NEAR(sampler_->Probability(i), pow(2, (i - 2) * 0.8) / 197.05, 1e-4); + } +} + +// AllSampler cannot call Sample or Probability directly. +// We will test SampleBatchGetExpectedCount instead. +TEST_F(RangeSamplerTest, All) { + int batch_size = 10; + sampler_.reset(new AllSampler(10)); + std::vector<int64> batch(batch_size); + std::vector<float> batch_expected(batch_size); + std::vector<int64> extras(2); + std::vector<float> extras_expected(2); + extras[0] = 0; + extras[1] = batch_size - 1; + sampler_->SampleBatchGetExpectedCount(nullptr, // no random numbers needed + false, &batch, &batch_expected, extras, + &extras_expected); + for (int i = 0; i < batch_size; i++) { + EXPECT_EQ(i, batch[i]); + EXPECT_EQ(1, batch_expected[i]); + } + EXPECT_EQ(1, extras_expected[0]); + EXPECT_EQ(1, extras_expected[1]); +} + +TEST_F(RangeSamplerTest, Unique) { + // We sample num_batches batches, each without replacement. + // + // We check that the returned expected counts roughly agree with each other + // and with the average observed frequencies over the set of batches. 
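+  // For unique sampling the reported expected count of a value is its
+  // inclusion probability 1 - (1 - p)^num_tries (see ExpectedCountHelper in
+  // range_sampler.cc). For instance, a value with p = 0.01 that survives 60
+  // tries is expected in about 1 - 0.99^60, i.e. roughly 0.45 of the
+  // batches, so its average observed frequency over num_batches runs should
+  // land near the reported estimate.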
+ random::PhiloxRandom philox(123, 17); + random::SimplePhilox rnd(&philox); + const int range = 100; + const int batch_size = 50; + const int num_batches = 100; + sampler_.reset(new LogUniformSampler(range)); + std::vector<int> histogram(range); + std::vector<int64> batch(batch_size); + std::vector<int64> all_values(range); + for (int i = 0; i < range; i++) { + all_values[i] = i; + } + std::vector<float> expected(range); + + // Sample one batch and get the expected counts of all values + sampler_->SampleBatchGetExpectedCount( + &rnd, true, &batch, MutableArraySlice<float>(), all_values, &expected); + // Check that all elements are unique + std::set<int64> s(batch.begin(), batch.end()); + CHECK_EQ(batch_size, s.size()); + + for (int trial = 0; trial < num_batches; trial++) { + std::vector<float> trial_expected(range); + sampler_->SampleBatchGetExpectedCount(&rnd, true, &batch, + MutableArraySlice<float>(), + all_values, &trial_expected); + for (int i = 0; i < range; i++) { + EXPECT_NEAR(expected[i], trial_expected[i], expected[i] * 0.5); + } + for (int i = 0; i < batch_size; i++) { + histogram[batch[i]]++; + } + } + for (int i = 0; i < range; i++) { + // Check that the computed expected count agrees with the average observed + // count. + const float average_count = static_cast<float>(histogram[i]) / num_batches; + EXPECT_NEAR(expected[i], average_count, 0.2); + } +} + +TEST_F(RangeSamplerTest, Avoid) { + random::PhiloxRandom philox(123, 17); + random::SimplePhilox rnd(&philox); + sampler_.reset(new LogUniformSampler(100)); + std::vector<int64> avoided(2); + avoided[0] = 17; + avoided[1] = 23; + std::vector<int64> batch(98); + + // We expect to pick all elements of [0, 100) except the avoided two. + sampler_->SampleBatchGetExpectedCountAvoid( + &rnd, true, &batch, MutableArraySlice<float>(), ArraySlice<int64>(), + MutableArraySlice<float>(), avoided); + + int sum = 0; + for (auto val : batch) { + sum += val; + } + const int expected_sum = 100 * 99 / 2 - avoided[0] - avoided[1]; + EXPECT_EQ(expected_sum, sum); +} + +} // namespace + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reader_base.cc b/tensorflow/core/kernels/reader_base.cc new file mode 100644 index 0000000000..06211efb38 --- /dev/null +++ b/tensorflow/core/kernels/reader_base.cc @@ -0,0 +1,156 @@ +#include "tensorflow/core/kernels/reader_base.h" + +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/coding.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/str_util.h" + +namespace tensorflow { + +// ReaderBase ------------------------------------------------------ + +ReaderBase::ReaderBase(const string& name) : name_(name) {} + +int64 ReaderBase::NumRecordsProduced() { + mutex_lock lock(mu_); + return num_records_produced_; +} + +int64 ReaderBase::NumWorkUnitsCompleted() { + mutex_lock lock(mu_); + return work_finished_; +} + +Status ReaderBase::Reset() { + mutex_lock lock(mu_); + return ResetLocked(); +} + +Status ReaderBase::ResetLocked() { + work_started_ = 0; + work_finished_ = 0; + num_records_produced_ = 0; + work_.clear(); + return Status::OK(); +} + +Status ReaderBase::SerializeState(string* state) { + mutex_lock lock(mu_); + return SerializeStateLocked(state); +} + +Status ReaderBase::SerializeStateLocked(string* state) { + return errors::Unimplemented("Reader SerializeState"); +} + 
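+// ---------------------------------------------------------------------------
+// Illustrative sketch (hypothetical, not registered or used anywhere): a
+// minimal ReadLocked() implementation under the contract that Read() below
+// enforces -- set *produced when a record comes out, set *at_end once the
+// current work item is exhausted, and never combine *produced with an error.
+class ToyRecordReader : public ReaderBase {
+ public:
+  explicit ToyRecordReader(const string& node_name)
+      : ReaderBase(strings::StrCat("ToyRecordReader '", node_name, "'")) {}
+
+  Status OnWorkStartedLocked() override {
+    next_record_ = 0;  // Restart at the beginning of the new work item.
+    return Status::OK();
+  }
+
+  Status ReadLocked(string* key, string* value, bool* produced,
+                    bool* at_end) override {
+    if (next_record_ == kRecordsPerWorkItem) {
+      *at_end = true;  // Case (b) in reader_base.h: work item exhausted.
+      return Status::OK();
+    }
+    *key = strings::StrCat(current_work(), ":", next_record_);
+    *value = strings::StrCat("record-", next_record_);
+    ++next_record_;
+    *produced = true;  // Case (a): a record was produced.
+    return Status::OK();
+  }
+
+ private:
+  static const int kRecordsPerWorkItem = 3;
+  int next_record_ = 0;
+};
+// ---------------------------------------------------------------------------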
+Status ReaderBase::RestoreState(const string& state) { + mutex_lock lock(mu_); + Status status = RestoreStateLocked(state); + if (!status.ok()) { + ResetLocked(); + } + return status; +} + +Status ReaderBase::RestoreStateLocked(const string& state) { + return errors::Unimplemented("Reader RestoreState"); +} + +void ReaderBase::Read(QueueInterface* queue, string* key, string* value, + OpKernelContext* context) { + mutex_lock lock(mu_); + while (true) { + if (!work_in_progress()) { + GetNextWorkLocked(queue, context); + if (!context->status().ok()) return; + } + + bool produced = false; + bool at_end = false; + Status status = ReadLocked(key, value, &produced, &at_end); + + if (!at_end && status.ok() && !produced) { + status = errors::Internal( + "ReadLocked() for ", name(), + " must set *at_end=true, *produced=true, or return an error."); + } + if (!status.ok() && produced) { + status = errors::Internal("ReadLocked() for ", name(), + " set *produced=true *and* returned an error: ", + status.ToString()); + } + if (status.ok() && at_end) { + status = OnWorkFinishedLocked(); + work_finished_ = work_started_; + } + if (!status.ok()) { + context->SetStatus(status); + return; + } + if (produced) { + ++num_records_produced_; + return; + } + } +} + +void ReaderBase::GetNextWorkLocked(QueueInterface* queue, + OpKernelContext* context) { + Notification n; + queue->TryDequeue( + context, [this, context, &n](const QueueInterface::Tuple& tuple) { + if (context->status().ok()) { + if (tuple.size() != 1) { + context->SetStatus( + errors::InvalidArgument("Expected single component queue")); + } else if (tuple[0].dtype() != DT_STRING) { + context->SetStatus(errors::InvalidArgument( + "Expected queue with single string component")); + } else if (tuple[0].NumElements() != 1) { + context->SetStatus(errors::InvalidArgument( + "Expected to dequeue a one-element string tensor")); + } else { + work_ = tuple[0].flat<string>()(0); + ++work_started_; + Status status = OnWorkStartedLocked(); + if (!status.ok()) { + context->SetStatus(status); + --work_started_; + } + } + } + n.Notify(); + }); + n.WaitForNotification(); +} + +void ReaderBase::SaveBaseState(ReaderBaseState* state) const { + state->Clear(); + state->set_work_started(work_started_); + state->set_work_finished(work_finished_); + state->set_num_records_produced(num_records_produced_); + state->set_current_work(work_); +} + +Status ReaderBase::RestoreBaseState(const ReaderBaseState& state) { + work_started_ = state.work_started(); + work_finished_ = state.work_finished(); + num_records_produced_ = state.num_records_produced(); + work_ = state.current_work(); + if (work_started_ < 0 || work_finished_ < 0 || num_records_produced_ < 0) { + return errors::InvalidArgument( + "Unexpected negative value when restoring in ", name(), ": ", + state.ShortDebugString()); + } + if (work_started_ > work_finished_) { + return errors::InvalidArgument( + "Inconsistent work started vs. 
finished when restoring in ", name(), + ": ", state.ShortDebugString()); + } + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reader_base.h b/tensorflow/core/kernels/reader_base.h new file mode 100644 index 0000000000..d344300388 --- /dev/null +++ b/tensorflow/core/kernels/reader_base.h @@ -0,0 +1,107 @@ +#ifndef TENSORFLOW_KERNELS_READER_BASE_H_ +#define TENSORFLOW_KERNELS_READER_BASE_H_ + +#include <memory> +#include <string> +#include <vector> +#include "tensorflow/core/framework/queue_interface.h" +#include "tensorflow/core/framework/reader_interface.h" +#include "tensorflow/core/kernels/reader_base.pb.h" +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { + +// Default implementation of ReaderInterface. +class ReaderBase : public ReaderInterface { + public: + // name: For use in error messages, should mention both the name of + // the op and the node. + explicit ReaderBase(const string& name); + + // Note that methods with names ending in "Locked" are called while + // the ReaderBase's mutex is held. + + // Implement this function in descendants ----------------------------------- + + // Produce the next key/value pair from the current work item. + // This is called "Locked" since it is executed under a mutex + // that serializes all Reader calls. + // Usage: + // a) If a record was successfully produced, set *produced = true, + // and fill in *key and *value. + // b) If no more records will be produced for this work item, set + // *at_end = true. + // c) If a record was produced, but no more will be produced, you + // may either do both (a) and (b), or do (a) in this call and do (b) in + // the next call to ReadLocked(). + // d) If there was an error producing (e.g. an error reading the file, + // data corruption), return a non-OK() status. ReadLocked may be + // called again if the user reruns this part of the graph. + virtual Status ReadLocked(string* key, string* value, bool* produced, + bool* at_end) = 0; + + // Descendants may optionally implement these ------------------------------- + + // Called when work starts / finishes. + virtual Status OnWorkStartedLocked() { return Status::OK(); } + virtual Status OnWorkFinishedLocked() { return Status::OK(); } + + // Called to reset the Reader to a newly constructed state. + virtual Status ResetLocked(); + + // Default implementation generates an Unimplemented error. + // See the protected helper methods below. + virtual Status SerializeStateLocked(string* state); + virtual Status RestoreStateLocked(const string& state); + + // Accessors ---------------------------------------------------------------- + + // Always true during a call to ReadLocked(). + bool work_in_progress() const { return work_finished_ < work_started_; } + + // Returns the name of the current work item (valid if + // work_in_progress() returns true). May change between calls to + // ReadLocked(). + const string& current_work() const { return work_; } + + // What was passed to the constructor. + const string& name() const { return name_; } + + protected: + // For descendants wishing to implement serialize & restore state. + + // Writes ReaderBase state to *state. + void SaveBaseState(ReaderBaseState* state) const; + + // Restores ReaderBase state from state. Assumes state was filled + // using SaveBaseState() above. + Status RestoreBaseState(const ReaderBaseState& state); + + private: + // Implementations of ReaderInterface methods. 
These ensure thread-safety + // and call the methods above to do the work. + void Read(QueueInterface* queue, string* key, string* value, + OpKernelContext* context) override; + Status Reset() override; + int64 NumRecordsProduced() override; + int64 NumWorkUnitsCompleted() override; + Status SerializeState(string* state) override; + Status RestoreState(const string& state) override; + + // For implementing Read(). Dequeues the next work item from + // *queue, and if successful updates work_, work_started_ + // (establishing work_in_progress() == true) and calls + // OnWorkStartedLocked(). May block. + void GetNextWorkLocked(QueueInterface* queue, OpKernelContext* context); + + mutable mutex mu_; + const string name_; + int64 work_started_ = 0; + int64 work_finished_ = 0; + int64 num_records_produced_ = 0; + string work_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_READER_BASE_H_ diff --git a/tensorflow/core/kernels/reader_base.proto b/tensorflow/core/kernels/reader_base.proto new file mode 100644 index 0000000000..4335cb2152 --- /dev/null +++ b/tensorflow/core/kernels/reader_base.proto @@ -0,0 +1,13 @@ +syntax = "proto3"; + +package tensorflow; +// option cc_enable_arenas = true; + +// For serializing and restoring the state of ReaderBase, see +// reader_base.h for details. +message ReaderBaseState { + int64 work_started = 1; + int64 work_finished = 2; + int64 num_records_produced = 3; + bytes current_work = 4; +}; diff --git a/tensorflow/core/kernels/reader_ops.cc b/tensorflow/core/kernels/reader_ops.cc new file mode 100644 index 0000000000..38c1013604 --- /dev/null +++ b/tensorflow/core/kernels/reader_ops.cc @@ -0,0 +1,132 @@ +// See docs in ../ops/io_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/queue_interface.h" +#include "tensorflow/core/framework/reader_interface.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +class ReaderVerbOpKernel : public OpKernel { + public: + using OpKernel::OpKernel; + + void Compute(OpKernelContext* context) override { + ReaderInterface* reader; + OP_REQUIRES_OK(context, + GetResourceFromContext(context, "reader_handle", &reader)); + ComputeWithReader(context, reader); + reader->Unref(); + } + + protected: + virtual void ComputeWithReader(OpKernelContext* context, + ReaderInterface* reader) = 0; +}; + +class ReaderReadOp : public ReaderVerbOpKernel { + public: + using ReaderVerbOpKernel::ReaderVerbOpKernel; + + void ComputeWithReader(OpKernelContext* context, + ReaderInterface* reader) override { + QueueInterface* queue; + OP_REQUIRES_OK(context, + GetResourceFromContext(context, "queue_handle", &queue)); + core::ScopedUnref unref_me(queue); + Tensor* key = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output("key", TensorShape({}), &key)); + Tensor* value = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output("value", TensorShape({}), &value)); + + auto key_scalar = key->scalar<string>(); + auto value_scalar = value->scalar<string>(); + reader->Read(queue, &key_scalar(), &value_scalar(), context); + } +}; + +REGISTER_KERNEL_BUILDER(Name("ReaderRead").Device(DEVICE_CPU), ReaderReadOp); + +class ReaderNumRecordsProducedOp : public ReaderVerbOpKernel { + public: + using ReaderVerbOpKernel::ReaderVerbOpKernel; + + void ComputeWithReader(OpKernelContext* context, + ReaderInterface* reader) override { + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output("records_produced", + TensorShape({}), &output)); + 
output->scalar<int64>()() = reader->NumRecordsProduced(); + } +}; + +REGISTER_KERNEL_BUILDER(Name("ReaderNumRecordsProduced").Device(DEVICE_CPU), + ReaderNumRecordsProducedOp); + +class ReaderNumWorkUnitsCompletedOp : public ReaderVerbOpKernel { + public: + using ReaderVerbOpKernel::ReaderVerbOpKernel; + + void ComputeWithReader(OpKernelContext* context, + ReaderInterface* reader) override { + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output("units_completed", + TensorShape({}), &output)); + output->scalar<int64>()() = reader->NumWorkUnitsCompleted(); + } +}; + +REGISTER_KERNEL_BUILDER(Name("ReaderNumWorkUnitsCompleted").Device(DEVICE_CPU), + ReaderNumWorkUnitsCompletedOp); + +class ReaderSerializeStateOp : public ReaderVerbOpKernel { + public: + using ReaderVerbOpKernel::ReaderVerbOpKernel; + + void ComputeWithReader(OpKernelContext* context, + ReaderInterface* reader) override { + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output("state", TensorShape({}), &output)); + OP_REQUIRES_OK(context, + reader->SerializeState(&output->scalar<string>()())); + } +}; + +REGISTER_KERNEL_BUILDER(Name("ReaderSerializeState").Device(DEVICE_CPU), + ReaderSerializeStateOp); + +class ReaderRestoreStateOp : public ReaderVerbOpKernel { + public: + using ReaderVerbOpKernel::ReaderVerbOpKernel; + + void ComputeWithReader(OpKernelContext* context, + ReaderInterface* reader) override { + const Tensor* tensor; + OP_REQUIRES_OK(context, context->input("state", &tensor)); + OP_REQUIRES( + context, TensorShapeUtils::IsScalar(tensor->shape()), + errors::InvalidArgument("Reader state must be scalar, but had shape: ", + tensor->shape().DebugString())); + OP_REQUIRES_OK(context, reader->RestoreState(tensor->scalar<string>()())); + } +}; + +REGISTER_KERNEL_BUILDER(Name("ReaderRestoreState").Device(DEVICE_CPU), + ReaderRestoreStateOp); + +class ReaderResetOp : public ReaderVerbOpKernel { + public: + using ReaderVerbOpKernel::ReaderVerbOpKernel; + + void ComputeWithReader(OpKernelContext* context, + ReaderInterface* reader) override { + OP_REQUIRES_OK(context, reader->Reset()); + } +}; + +REGISTER_KERNEL_BUILDER(Name("ReaderReset").Device(DEVICE_CPU), ReaderResetOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h new file mode 100644 index 0000000000..b412617a65 --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops.h @@ -0,0 +1,66 @@ +#ifndef TENSORFLOW_KERNELS_REDUCTION_OPS_H_ +#define TENSORFLOW_KERNELS_REDUCTION_OPS_H_ + +// Functor definitions for Reduction ops, must be compilable by nvcc. + +#include <iostream> +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// When eigen3 has better implementation of AllReducer and AnyReducer, +// replaces reducers here. + +// Reduction using logical_and. +struct AllReducer { + // TODO(zhifengc): Implement PacketAccess when performance matters. + static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const { + *accum &= t; + } + + EIGEN_DEVICE_FUNC bool initialize() const { return true; } + + EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; } +}; + +// Reduction using logical_or. +struct AnyReducer { + // TODO(zhifengc): Implement PacketAccess when performance matters. 
+ static const bool PacketAccess = false; + static const bool IsStateful = false; + + EIGEN_DEVICE_FUNC void reduce(const bool t, bool* accum) const { + *accum |= t; + } + + EIGEN_DEVICE_FUNC bool initialize() const { return false; } + + EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; } +}; + +template <typename Device, typename OUT_T, typename IN_T, + typename ReductionAxes, typename Reducer> +void ReduceEigenImpl(const Device& d, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Reducer& reducer) { + out.device(d) = in.reduce(reduction_axes, reducer); +} + +template <typename Device> +struct ReduceFunctor { + template <typename OUT_T, typename IN_T, typename ReductionAxes, + typename Reducer> + static void Reduce(const Device& d, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Reducer& reducer); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_REDUCTION_OPS_H_ diff --git a/tensorflow/core/kernels/reduction_ops_all.cc b/tensorflow/core/kernels/reduction_ops_all.cc new file mode 100644 index 0000000000..11d399e70a --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_all.cc @@ -0,0 +1,17 @@ +#include "tensorflow/core/kernels/reduction_ops_common.h" + +namespace tensorflow { + +REGISTER_KERNEL_BUILDER(Name("All") + .Device(DEVICE_CPU) + .HostMemory("reduction_indices"), + ReductionOp<CPUDevice, bool, functor::AllReducer>); + +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("All") + .Device(DEVICE_GPU) + .HostMemory("reduction_indices"), + ReductionOp<GPUDevice, bool, functor::AllReducer>); +#endif + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_any.cc b/tensorflow/core/kernels/reduction_ops_any.cc new file mode 100644 index 0000000000..a89ef22b08 --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_any.cc @@ -0,0 +1,17 @@ +#include "tensorflow/core/kernels/reduction_ops_common.h" + +namespace tensorflow { + +REGISTER_KERNEL_BUILDER(Name("Any") + .Device(DEVICE_CPU) + .HostMemory("reduction_indices"), + ReductionOp<CPUDevice, bool, functor::AnyReducer>); + +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("Any") + .Device(DEVICE_GPU) + .HostMemory("reduction_indices"), + ReductionOp<GPUDevice, bool, functor::AnyReducer>); +#endif + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h new file mode 100644 index 0000000000..2bde3a1a54 --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_common.h @@ -0,0 +1,302 @@ +// This is an internal header file intended to only be included as the +// front-matter in the implementation files of various reduction ops. It +// is a header file because we split the various reduction ops into their +// own compilation units to get more parallelism in compilation. 
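+// Illustrative note on the reducer protocol used by the functors in
+// reduction_ops.h: a reduction starts from initialize(), folds each element
+// in with reduce(), and returns finalize(accum). For AnyReducer over
+// {false, true, false} that is accum = false, accum |= false, accum |= true,
+// accum |= false, so finalize() yields true; AllReducer over the same input
+// starts at true and ends at false.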
+ +#ifndef TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_ +#define TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/reduction_ops.h" + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/Eigen/Core" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device> +struct Constants { + // Derive Index type. int (32-bit) or long (64-bit) depending on the + // compile-time configuration. "float" here is not relevant. + // TODO(zhifengc): Moves the definition to TTypes. + typedef TTypes<float>::Tensor::Index Index; + Eigen::array<Index, 1> kZero; + Eigen::array<Index, 1> kOne; + Eigen::array<Index, 2> kZeroTwo; + + Constants() { + kZero[0] = 0; + kOne[0] = 1; + kZeroTwo[0] = 0; + kZeroTwo[1] = 2; + } +}; + +#if defined(EIGEN_HAS_INDEX_LIST) +template <> +struct Constants<CPUDevice> { + const Eigen::IndexList<Eigen::type2index<0>> kZero; + const Eigen::IndexList<Eigen::type2index<1>> kOne; + const Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<2>> kZeroTwo; +}; +#endif + +namespace { + +class ReductionHelper { + public: + ReductionHelper() : reduce_first_axis_(false) {} + + Status Simplify(const Tensor& data, const Tensor& axis, + const bool keep_dims) { + // bitmap[i] indicates whether to reduce data along i-th axis. + std::vector<bool> bitmap(data.dims(), false); + auto axis_vec = axis.flat<int32>(); + for (int64 i = 0; i < axis.NumElements(); ++i) { + const int32 index = axis_vec(i); + if (index < 0 || index >= data.dims()) { + return errors::OutOfRange("Invalid reduction dimension (", index, + " for input with ", data.dims(), + " dimension(s)"); + } + bitmap[index] = true; + } + + // Output tensor's dim sizes. + out_shape_.clear(); + for (int i = 0; i < data.dims(); ++i) { + if (!bitmap[i]) { + // If we are not reducing along dimension i. + out_shape_.push_back(data.dim_size(i)); + } else if (keep_dims) { + // We are reducing along dimension i, but we want to keep the + // same number of dimensions, so we set the dimension of i to + // '1'. + out_shape_.push_back(1); + } + } + + // Depending on bitmap[i] and bitmap[i-1], we can collapse axis of + // the input data before doing the reduction on the resulting + // tensor. The shape of the reduction is a reshape of the final + // output. + + // We'll skip the leading 1s. + int dim_index = 0; + for (; dim_index < data.dims(); ++dim_index) { + if (data.dim_size(dim_index) != 1) break; + } + if (dim_index >= data.dims()) { + // Special case. The input is essentially a scalar. + reduce_first_axis_ = true; + } else { + // Starting from the (dim_index)-th dimension, dimensions + // alternates between runs that need to be reduced and runs that + // don't. + // + // NOTE: If a dimension has size 1, we group it as the current + // run so that we can minimize the number of runs. + // + // E.g., when we want to reduce a tensor of shape [2, 1, 3, 1, + // 5] by axes = [1, 4], we should treat the tensor as a [6, 5] + // and reduce by axes = [1] (i.e., the output is shape [6]). 
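+      // Walking that example through the loop below (with keep_dims=false):
+      // the leading axis has size 2, so dim_index stays 0 and
+      // reduce_first_axis_ = bitmap[0] = false. Axes 1 and 3 have size 1 and
+      // are folded into the current run, giving data_reshape_ =
+      // {2*1*3*1, 5} = {6, 5}. The first run is kept, so out_reshape_ takes
+      // the even positions, i.e. {6}, while out_shape_ (already computed
+      // above from the original bitmap) is {2, 3, 1}.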
+ reduce_first_axis_ = bitmap[dim_index]; + data_reshape_.push_back(data.dim_size(dim_index)); + ++dim_index; + for (; dim_index < data.dims(); ++dim_index) { + const auto size = data.dim_size(dim_index); + if (size == 1) { + bitmap[dim_index] = bitmap[dim_index - 1]; + } + if (bitmap[dim_index - 1] != bitmap[dim_index]) { + // Starts a new run of reduce or !reduce. + data_reshape_.push_back(size); + } else { + // Continue a run of reduce or !reduce. + data_reshape_.back() *= size; + } + } + // If reduce_first_axis_ is true (input's dimension 0, 2, 4, etc + // are reduced), data_reshape_[1, 3, 5, ...] is out_reshape_, + // otherwise, data_reshape_[0, 2, 4, ...] is. + for (size_t i = reduce_first_axis_ ? 1 : 0; i < data_reshape_.size(); + i += 2) { + out_reshape_.push_back(data_reshape_[i]); + } + } + + VLOG(1) << "data reshape: " << str_util::Join(data_reshape_, ","); + VLOG(1) << "out reshape: " << str_util::Join(out_reshape_, ","); + VLOG(1) << "out shape: " << str_util::Join(out_shape_, ","); + return Status::OK(); + } + + // We need to do roughly: + // tmp_out = allocate(out_reshape()) + // tmp_out.reshape(out_reshape) = data.reshape(data_reshape).reduce(axes) + // out = tmp_out.reshape(out_shape) + + // The reduction result must be allocated with this shape. + TensorShape out_reshape() const { + TensorShape shape; + for (auto size : out_reshape_) shape.AddDim(size); + return shape; + } + + // The final output shape must be allocated with this shape. + TensorShape out_shape() const { + TensorShape shape; + for (auto size : out_shape_) shape.AddDim(size); + return shape; + } + + // The reduction is on a reshaped tensor of this rank. + int ndims() const { return data_reshape_.size(); } + + // True if need to reduce the 0-th dimension. + bool reduce_first_axis() const { return reduce_first_axis_; } + + // The output is reshaped. + template <typename T, int N> + typename TTypes<T, N>::Tensor out(Tensor* out) { + return out->shaped<T, N>(out_reshape_); + } + + // The input is reshaped. + template <typename T, int N> + typename TTypes<T, N>::ConstTensor in(const Tensor& data) { + return data.shaped<T, N>(data_reshape_); + } + + private: + bool reduce_first_axis_; // True if need to reduce the 0-th dimension. + std::vector<int64> data_reshape_; // Reshape the data before reduction. + std::vector<int64> out_shape_; // The final output shape. + std::vector<int64> out_reshape_; // Reshape the output for reduction. +}; + +} // end namespace + +// For operations where the output is a reduction function along some +// dimensions of the input. +template <typename Device, class T, typename Reducer> +class ReductionOp : public OpKernel { + public: + explicit ReductionOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + const DataType dt = DataTypeToEnum<T>::v(); + OP_REQUIRES_OK(ctx, ctx->MatchSignature({dt, DT_INT32}, {dt})); + + OP_REQUIRES_OK(ctx, ctx->GetAttr("keep_dims", &keep_dims_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& data = ctx->input(0); + const Tensor& axes = ctx->input(1); + VLOG(1) << "data shape: " << data.shape().ShortDebugString(); + VLOG(1) << "axes : " << axes.SummarizeValue(10); + + ReductionHelper helper; + OP_REQUIRES_OK(ctx, helper.Simplify(data, axes, keep_dims_)); + CHECK_GE(helper.ndims(), 0); + + // The real output shape will be assigned below. 
+ TensorShape empty_shape; + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &out)); + + if (helper.ndims() == 0 || + (helper.ndims() == 1 && !helper.reduce_first_axis())) { + // Special case. Reduces nothing. It is unclear why this is + // necessary, but tests fail without it. Look into why this + // case occurs. + if (!out->CopyFrom(data, helper.out_shape())) { + ctx->SetStatus(errors::Internal("Error during reduction copy.")); + } + return; + } + + // A temporary tensor whose size matches the size of the reduced + // output. + Tensor tmp_out; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(out->dtype(), helper.out_reshape(), &tmp_out)); + + typedef functor::ReduceFunctor<Device> Functor; + Constants<Device> constants; + const Device& d = ctx->eigen_device<Device>(); + Reducer reducer; + + if ((helper.ndims() == 1) && helper.reduce_first_axis()) { + // Reduce to a scalar. + Functor::Reduce(d, helper.out<T, 0>(&tmp_out), helper.in<T, 1>(data), + constants.kZero, reducer); + } else if ((helper.ndims() == 2) && helper.reduce_first_axis()) { + // Can be viewed as a reduction of a matrix along 1st dimension. + Functor::Reduce(d, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data), + constants.kZero, reducer); + } else if ((helper.ndims() == 2) && !helper.reduce_first_axis()) { + // Can be viewed as a reduction of a matrix along 2nd dimension. + Functor::Reduce(d, helper.out<T, 1>(&tmp_out), helper.in<T, 2>(data), + constants.kOne, reducer); + } else if ((helper.ndims() == 3) && helper.reduce_first_axis()) { + // Can be viewed as a reduction of a 3D tensor along 1st and 3rd + // dimensions. + Functor::Reduce(d, helper.out<T, 1>(&tmp_out), helper.in<T, 3>(data), + constants.kZeroTwo, reducer); + } else if ((helper.ndims() == 3) && !helper.reduce_first_axis()) { + // Can be viewed as a reduction of a 3D tensor along 2nd dimension. + Functor::Reduce(d, helper.out<T, 2>(&tmp_out), helper.in<T, 3>(data), + constants.kOne, reducer); + } else { + // TODO(zhifengc): We can implement reduction for arbitrary rank + // tensor and arbitrary reduction axes by iterating the reduction + // multiple times. This may also be accomplished in the graph + // construction. + ctx->SetStatus( + errors::Unimplemented("Reducing ", data.shape().ShortDebugString(), + " axes [", axes.SummarizeValue(10), "] to ", + tmp_out.shape().ShortDebugString())); + return; + } + + // Set the real output using the contents of the reduction but the + // real expected output shape. The number of elements should + // match between the two shapes. + if (!out->CopyFrom(tmp_out, helper.out_shape())) { + ctx->SetStatus(errors::Internal("Error during reduction copy.")); + } + } + + private: + // True if the number of dimensions should be maintained. 
+ bool keep_dims_; +}; + +namespace functor { + +template <> +struct ReduceFunctor<CPUDevice> { + template <typename OUT_T, typename IN_T, typename ReductionAxes, + typename Reducer> + static void Reduce(const CPUDevice& d, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Reducer& reducer) { + ReduceEigenImpl(d, out, in, reduction_axes, reducer); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_REDUCTION_OPS_COMMON_H_ diff --git a/tensorflow/core/kernels/reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc new file mode 100644 index 0000000000..8e29d2d06c --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc @@ -0,0 +1,65 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/kernels/reduction_ops.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +// Derive Index type. int (32-bit) or long (64-bit) depending on the +// compile-time configuration. "float" here is not relevant. +// TODO(zhifengc): Moves the definition to TTypes. +typedef TTypes<float>::Tensor::Index Index; + +template <> +struct ReduceFunctor<GPUDevice> { + template <typename OUT_T, typename IN_T, typename ReductionAxes, + typename Reducer> + static void Reduce(const GPUDevice& d, OUT_T out, IN_T in, + const ReductionAxes& reduction_axes, + const Reducer& reducer) { + ReduceEigenImpl(d, To32Bit(out), To32Bit(in), reduction_axes, reducer); + } +}; + +// T: the data type +// REDUCER: the reducer functor +// NUM_AXES: the number of axes to reduce +// IN_DIMS: the number of dimensions of the input tensor +#define DEFINE(T, REDUCER, IN_DIMS, NUM_AXES) \ + template void ReduceFunctor<GPUDevice>::Reduce( \ + const GPUDevice& d, TTypes<T, IN_DIMS - NUM_AXES>::Tensor out, \ + TTypes<T, IN_DIMS>::ConstTensor in, \ + const Eigen::array<Index, NUM_AXES>& reduction_axes, \ + const REDUCER& reducer); + +#define DEFINE_FOR_TYPE_AND_R(T, R) \ + DEFINE(T, R, 1, 1); \ + DEFINE(T, R, 2, 1); \ + DEFINE(T, R, 3, 1); \ + DEFINE(T, R, 3, 2); + +#define DEFINE_FOR_ALL_REDUCERS(T) \ + DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \ + DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \ + DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \ + DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>) + +DEFINE_FOR_ALL_REDUCERS(float); +#undef DEFINE_FOR_ALL_REDUCERS + +DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::SumReducer<complex64>); +DEFINE_FOR_TYPE_AND_R(bool, AllReducer); +DEFINE_FOR_TYPE_AND_R(bool, AnyReducer); +#undef DEFINE_FOR_TYPE_AND_R + +#undef DEFINE + +} // end namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc new file mode 100644 index 0000000000..1749360b6e --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_max.cc @@ -0,0 +1,26 @@ +#include "tensorflow/core/kernels/reduction_ops_common.h" + +namespace tensorflow { + +#define REGISTER_CPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Max").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + ReductionOp<CPUDevice, type, Eigen::internal::MaxReducer<type>>); +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS + +#if GOOGLE_CUDA + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Max") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + 
.HostMemory("reduction_indices"), \ + ReductionOp<GPUDevice, type, Eigen::internal::MaxReducer<type>>); +REGISTER_GPU_KERNELS(float); +#undef REGISTER_GPU_KERNELS + +#endif + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc new file mode 100644 index 0000000000..b00c36fed8 --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_mean.cc @@ -0,0 +1,12 @@ +#include "tensorflow/core/kernels/reduction_ops_common.h" + +namespace tensorflow { + +#define REGISTER_CPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Mean").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + ReductionOp<CPUDevice, type, Eigen::internal::MeanReducer<type>>); +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc new file mode 100644 index 0000000000..de1f4b8520 --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_min.cc @@ -0,0 +1,26 @@ +#include "tensorflow/core/kernels/reduction_ops_common.h" + +namespace tensorflow { + +#define REGISTER_CPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Min").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + ReductionOp<CPUDevice, type, Eigen::internal::MinReducer<type>>); +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS + +#if GOOGLE_CUDA + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Min") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("reduction_indices"), \ + ReductionOp<GPUDevice, type, Eigen::internal::MinReducer<type>>); +REGISTER_GPU_KERNELS(float); +#undef REGISTER_GPU_KERNELS + +#endif + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc new file mode 100644 index 0000000000..4068c7feda --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_prod.cc @@ -0,0 +1,26 @@ +#include "tensorflow/core/kernels/reduction_ops_common.h" + +namespace tensorflow { + +#define REGISTER_CPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Prod").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + ReductionOp<CPUDevice, type, Eigen::internal::ProdReducer<type>>); +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS + +#if GOOGLE_CUDA + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Prod") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("reduction_indices"), \ + ReductionOp<GPUDevice, type, Eigen::internal::ProdReducer<type>>); +REGISTER_GPU_KERNELS(float); +#undef REGISTER_GPU_KERNELS + +#endif + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc new file mode 100644 index 0000000000..82d685e225 --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_sum.cc @@ -0,0 +1,37 @@ +#include "tensorflow/core/kernels/reduction_ops_common.h" + +namespace tensorflow { + +#define REGISTER_CPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Sum").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + ReductionOp<CPUDevice, type, Eigen::internal::SumReducer<type>>); +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS + +// NOTE: We should have mean(complex64,int32), too. But that needs to +// change Eigen::internal::MeanReducer to cast int to complex<float>. 
+// We don't see immediate need of mean(complex64,int32) anyway. +REGISTER_KERNEL_BUILDER( + Name("Sum").Device(DEVICE_CPU).TypeConstraint<complex64>("T"), + ReductionOp<CPUDevice, complex64, Eigen::internal::SumReducer<complex64>>); + +#if GOOGLE_CUDA + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Sum") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("reduction_indices"), \ + ReductionOp<GPUDevice, type, Eigen::internal::SumReducer<type>>); +REGISTER_GPU_KERNELS(float); +#undef REGISTER_GPU_KERNELS + +REGISTER_KERNEL_BUILDER( + Name("Sum").Device(DEVICE_GPU).TypeConstraint<complex64>("T"), + ReductionOp<GPUDevice, complex64, Eigen::internal::SumReducer<complex64>>); + +#endif + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc new file mode 100644 index 0000000000..d96da3c7f1 --- /dev/null +++ b/tensorflow/core/kernels/reduction_ops_test.cc @@ -0,0 +1,73 @@ +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include <gtest/gtest.h> + +namespace tensorflow { + +// Creates a Graph which "reduce"s a 3D float tensor of "num" elements +// into a scalar. +static Graph* ToScalar(const string& reduce, int num) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor data(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)})); + data.flat<float>().setRandom(); + Tensor axes(DT_INT32, TensorShape({3})); + axes.flat<int32>()(0) = 0; + axes.flat<int32>()(1) = 1; + axes.flat<int32>()(2) = 2; + test::graph::Reduce(g, reduce, test::graph::Constant(g, data), + test::graph::Constant(g, axes)); + return g; +} + +// Creates a bench which reduces a 3D tensor with total "num" floats +// into a scalar on a "device". Runs the bench for "iters" times. +static void ReduceToScalar(int iters, const string& device, + const string& reduce, int num) { + testing::ItemsProcessed(static_cast<int64>(iters) * num); + testing::BytesProcessed(static_cast<int64>(iters) * num * sizeof(float)); + test::Benchmark(device, ToScalar(reduce, num)).Run(iters); +} + +static void BM_Sum3DToScalarCPU(int iters, int num) { + ReduceToScalar(iters, "cpu", "Sum", num); +} +BENCHMARK(BM_Sum3DToScalarCPU)->Range(1 << 13, 1 << 20); + +static void BM_Max3DToScalarCPU(int iters, int num) { + ReduceToScalar(iters, "cpu", "Max", num); +} +BENCHMARK(BM_Max3DToScalarCPU)->Range(1 << 13, 1 << 20); + +static void BM_Prod3DToScalarCPU(int iters, int num) { + ReduceToScalar(iters, "cpu", "Prod", num); +} +BENCHMARK(BM_Prod3DToScalarCPU)->Range(1 << 13, 1 << 20); + +static void BM_Mean3DToScalarCPU(int iters, int num) { + ReduceToScalar(iters, "cpu", "Mean", num); +} +BENCHMARK(BM_Mean3DToScalarCPU)->Range(1 << 13, 1 << 20); + +static void BM_Sum3DToScalarGPU(int iters, int num) { + ReduceToScalar(iters, "gpu", "Sum", num); +} +BENCHMARK(BM_Sum3DToScalarGPU)->Range(1 << 13, 1 << 20); + +static void BM_Max3DToScalarGPU(int iters, int num) { + ReduceToScalar(iters, "gpu", "Max", num); +} +BENCHMARK(BM_Max3DToScalarGPU)->Range(1 << 13, 1 << 20); + +static void BM_Prod3DToScalarGPU(int iters, int num) { + ReduceToScalar(iters, "gpu", "Prod", num); +} +BENCHMARK(BM_Prod3DToScalarGPU)->Range(1 << 13, 1 << 20); + +// Once Mean is available on GPU, enable this. 
+// static void BM_Mean3DToScalarGPU(int iters, int num) { +// ReduceToScalar(iters, "gpu", "Mean", num); +// } +// BENCHMARK(BM_Mean3DToScalarGPU)->Range(1 << 13, 1 << 20); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/reference_gemm.h b/tensorflow/core/kernels/reference_gemm.h new file mode 100644 index 0000000000..77c6ef35e9 --- /dev/null +++ b/tensorflow/core/kernels/reference_gemm.h @@ -0,0 +1,75 @@ +#ifndef TENSORFLOW_KERNELS_REFERENCE_GEMM_H_ +#define TENSORFLOW_KERNELS_REFERENCE_GEMM_H_ + +// This is an unoptimized but debuggable implementation of the GEMM matrix +// multiply function, used to compare to faster but more opaque versions, or +// for bit depths or argument combinations that aren't supported by optimized +// code. +// It assumes the row-major convention used by TensorFlow, and implements +// C = A * B, like the standard BLAS GEMM interface. If the transpose flags are +// true, then the relevant matrix is treated as stored in column-major order. + +namespace tensorflow { +template <class T1, class T2, class T3> +void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c, + size_t m, size_t n, size_t k, const T1* a, T1 offset_a, + size_t lda, const T2* b, T2 offset_b, size_t ldb, T3* c, + int32 shift_c, int32 offset_c, int32 mult_c, size_t ldc) { + int a_i_stride; + int a_l_stride; + if (transpose_a) { + a_i_stride = 1; + a_l_stride = lda; + } else { + a_i_stride = lda; + a_l_stride = 1; + } + int b_j_stride; + int b_l_stride; + if (transpose_b) { + b_j_stride = ldb; + b_l_stride = 1; + } else { + b_j_stride = 1; + b_l_stride = ldb; + } + int c_i_stride; + int c_j_stride; + if (transpose_c) { + c_i_stride = 1; + c_j_stride = ldc; + } else { + c_i_stride = ldc; + c_j_stride = 1; + } + + const int32 highest = static_cast<int32>(Eigen::NumTraits<T3>::highest()); + const int32 lowest = static_cast<int32>(Eigen::NumTraits<T3>::lowest()); + const int32 rounding = (shift_c < 1) ? 0 : (1 << (shift_c - 1)); + + int i, j, l; + for (j = 0; j < n; j++) { + for (i = 0; i < m; i++) { + int32 total = 0; + for (l = 0; l < k; l++) { + const size_t a_index = ((i * a_i_stride) + (l * a_l_stride)); + const int32 a_value = a[a_index] - offset_a; + const size_t b_index = ((j * b_j_stride) + (l * b_l_stride)); + const int32 b_value = b[b_index] - offset_b; + total += (a_value * b_value); + } + const size_t c_index = ((i * c_i_stride) + (j * c_j_stride)); + int32_t output = ((((total + offset_c) * mult_c) + rounding) >> shift_c); + if (output > highest) { + output = highest; + } + if (output < lowest) { + output = lowest; + } + c[c_index] = static_cast<T3>(output); + } + } +} +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_REFERENCE_GEMM_H_ diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc new file mode 100644 index 0000000000..d5dd7a8119 --- /dev/null +++ b/tensorflow/core/kernels/relu_op.cc @@ -0,0 +1,154 @@ +// See docs in ../ops/nn_ops.cc.
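Before the ReLU kernels, a brief illustration of the ReferenceGemm helper defined just above. The call below is hypothetical (values invented, and it assumes the header plus its Eigen and int32 dependencies are already included); it shows what the stride and quantization parameters mean for a plain row-major 2x3 * 3x2 product:

#include <cstdint>
#include <iostream>
// Assumes: #include "tensorflow/core/kernels/reference_gemm.h" together with
// the Eigen NumTraits and tensorflow::int32 definitions it relies on.

int main() {
  // C = A * B, no transposition. A is 2x3, B is 3x2, both row-major.
  const uint8_t a[] = {1, 2, 3,
                       4, 5, 6};
  const uint8_t b[] = {1, 0,
                       0, 1,
                       1, 1};
  int32_t c[4] = {0};
  // m = 2, n = 2, k = 3; lda/ldb/ldc are the row strides of A, B and C.
  // offset_a/offset_b are subtracted from every element (0 here), and the
  // accumulator is mapped through ((total + offset_c) * mult_c + rounding)
  // >> shift_c, so offset_c = 0, mult_c = 1, shift_c = 0 leaves the raw
  // integer product.
  tensorflow::ReferenceGemm<uint8_t, uint8_t, int32_t>(
      false, false, false, /*m=*/2, /*n=*/2, /*k=*/3, a, 0, /*lda=*/3, b, 0,
      /*ldb=*/2, c, /*shift_c=*/0, /*offset_c=*/0, /*mult_c=*/1, /*ldc=*/2);
  for (int i = 0; i < 4; ++i) std::cout << c[i] << " ";  // 4 5 10 11
  std::cout << "\n";
  return 0;
}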
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/relu_op.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class ReluOp : public UnaryElementWiseOp<T, ReluOp<Device, T>> { + public: + using UnaryElementWiseOp<T, ReluOp<Device, T>>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Relu<Device, T> functor; + functor(context->eigen_device<Device>(), input.flat<T>(), + output->flat<T>()); + } +}; + +template <typename Device, typename T> +class Relu6Op : public UnaryElementWiseOp<T, Relu6Op<Device, T>> { + public: + using UnaryElementWiseOp<T, Relu6Op<Device, T>>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Relu6<Device, T> functor; + functor(context->eigen_device<Device>(), input.flat<T>(), + output->flat<T>()); + } +}; + +template <typename Device, typename T> +class ReluGradOp : public BinaryElementWiseOp<T, ReluGradOp<Device, T>> { + public: + using BinaryElementWiseOp<T, ReluGradOp<Device, T>>::BinaryElementWiseOp; + + // INPUTS: + // g (gradients): backpropagated gradients + // a (inputs): inputs that were passed to ReluOp() + // OUTPUT: + // gradients to backprop + template <int NDIMS> + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OP_REQUIRES(context, a.IsSameSize(g), + errors::InvalidArgument("g and a must be the same size")); + functor::ReluGrad<Device, T> functor; + functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), + output->flat<T>()); + } +}; + +template <typename Device, typename T> +class Relu6GradOp : public BinaryElementWiseOp<T, Relu6GradOp<Device, T>> { + public: + using BinaryElementWiseOp<T, Relu6GradOp<Device, T>>::BinaryElementWiseOp; + + // INPUTS: + // g (gradients): backpropagated gradients + // a (inputs): inputs that were passed to Relu6Op() + // OUTPUT: + // gradients to backprop + template <int NDIMS> + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OP_REQUIRES(context, a.IsSameSize(g), + errors::InvalidArgument("g and a must be the same size")); + functor::Relu6Grad<Device, T> functor; + functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), + output->flat<T>()); + } +}; + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + ReluOp<CPUDevice, type>); \ + REGISTER_KERNEL_BUILDER( \ + Name("Relu6").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + Relu6Op<CPUDevice, type>); \ + REGISTER_KERNEL_BUILDER( \ + Name("ReluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + ReluGradOp<CPUDevice, type>); \ + REGISTER_KERNEL_BUILDER( \ + Name("Relu6Grad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + Relu6GradOp<CPUDevice, type>) + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. 
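The ReluGradOp/Relu6GradOp classes above only check shapes and then defer the arithmetic to the functors declared further down in relu_op.h. The gradient rule itself is plain masking: the incoming gradient passes through only where the forward input was strictly positive. A toy, standalone sketch of that rule using ordinary Eigen arrays (include path per this tree's convention; the real functor operates on TTypes tensors placed on a device):

#include <iostream>
#include "third_party/eigen3/Eigen/Core"

int main() {
  Eigen::ArrayXf features(4), gradients(4);
  features << -1.0f, 0.0f, 2.0f, 3.0f;
  gradients << 0.1f, 0.2f, 0.3f, 0.4f;
  // Where the feature is <= 0 (including exactly 0), no gradient flows back.
  Eigen::ArrayXf backprops =
      (features > 0.0f).select(gradients, Eigen::ArrayXf::Zero(4));
  for (int i = 0; i < backprops.size(); ++i) std::cout << backprops(i) << " ";
  std::cout << "\n";  // 0 0 0.3 0.4
  return 0;
}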
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void Relu<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::ConstTensor features, \ + typename TTypes<T>::Tensor activations); \ + extern template struct Relu<GPUDevice, T>; \ + \ + template <> \ + void ReluGrad<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::ConstTensor gradients, \ + typename TTypes<T>::ConstTensor features, \ + typename TTypes<T>::Tensor backprops); \ + \ + extern template struct ReluGrad<GPUDevice, T>; \ + template <> \ + void Relu6<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::ConstTensor features, \ + typename TTypes<T>::Tensor activations); \ + extern template struct Relu6<GPUDevice, T>; \ + \ + template <> \ + void Relu6Grad<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::ConstTensor gradients, \ + typename TTypes<T>::ConstTensor features, \ + typename TTypes<T>::Tensor backprops); \ + extern template struct Relu6Grad<GPUDevice, T>; + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Relu").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + ReluOp<GPUDevice, type>); \ + REGISTER_KERNEL_BUILDER( \ + Name("Relu6").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + Relu6Op<GPUDevice, type>); \ + REGISTER_KERNEL_BUILDER( \ + Name("ReluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + ReluGradOp<GPUDevice, type>); \ + REGISTER_KERNEL_BUILDER( \ + Name("Relu6Grad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + Relu6GradOp<GPUDevice, type>) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h new file mode 100644 index 0000000000..8ed071cc4a --- /dev/null +++ b/tensorflow/core/kernels/relu_op.h @@ -0,0 +1,79 @@ +#ifndef TENSORFLOW_KERNELS_RELU_OP_H_ +#define TENSORFLOW_KERNELS_RELU_OP_H_ +// Functor definition for ReluOp and ReluGradOp, must be compilable by nvcc. + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by ReluOp to do the computations. +template <typename Device, typename T> +struct Relu { + // Computes Relu activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor activations) { + activations.device(d) = features.cwiseMax(static_cast<T>(0)); + } +}; + +// Functor used by ReluGradOp to do the computations. +template <typename Device, typename T> +struct ReluGrad { + // Computes ReluGrad backprops. + // + // gradients: gradients backpropagated to the Relu op. + // features: inputs that were passed to the Relu op. + // backprops: gradients to backpropagate to the Relu inputs. + void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients, + typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor backprops) { + // NOTE: When the activation is exactly zero, we arbitrarily choose to not + // propagate the associated gradient value. + backprops.device(d) = + gradients * (features > features.constant(static_cast<T>(0))); + } +}; + +// Functor used by Relu6Op to do the computations.
+template <typename Device, typename T> +struct Relu6 { + // Computes Relu6 activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor activations) { + activations.device(d) = + features.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(6)); + } +}; + +// Functor used by Relu6GradOp to do the computations. +template <typename Device, typename T> +struct Relu6Grad { + // Computes Relu6Grad backprops. + // + // gradients: gradients backpropagated to the Relu6 op. + // features: inputs that were passed to the Relu6 op. + // backprops: gradients to backpropagate to the Relu6 inputs. + void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients, + typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor backprops) { + // NOTE: When the activation is exactly zero or six, we + // arbitrarily choose to not propagate the associated gradient + // value. + backprops.device(d) = gradients * + (features > features.constant(static_cast<T>(0))) * + (features < features.constant(static_cast<T>(6))); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_RELU_OP_H_ diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc new file mode 100644 index 0000000000..6bd87ff8e4 --- /dev/null +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -0,0 +1,27 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> + +#include "tensorflow/core/kernels/relu_op.h" + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// Definition of the GPU implementations declared in relu_op.cc. +#define DEFINE_GPU_KERNELS(T) \ + template struct functor::Relu<GPUDevice, T>; \ + template struct functor::ReluGrad<GPUDevice, T>; \ + template struct functor::Relu6<GPUDevice, T>; \ + template struct functor::Relu6Grad<GPUDevice, T>; + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc new file mode 100644 index 0000000000..7e1cf029de --- /dev/null +++ b/tensorflow/core/kernels/reshape_op.cc @@ -0,0 +1,29 @@ +// See docs in ../ops/array_ops.cc. +#include "tensorflow/core/kernels/reshape_op.h" + +namespace tensorflow { + +REGISTER_KERNEL_BUILDER(Name("Reshape").Device(DEVICE_CPU).HostMemory("shape"), + ReshapeOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Reshape") \ + .Device(DEVICE_GPU) \ + .HostMemory("shape") \ + .TypeConstraint<type>("T"), \ + ReshapeOp); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +#undef REGISTER_GPU_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("Reshape") + .Device(DEVICE_GPU) + .HostMemory("tensor") + .HostMemory("shape") + .HostMemory("output") + .TypeConstraint<int32>("T"), + ReshapeOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reshape_op.h b/tensorflow/core/kernels/reshape_op.h new file mode 100644 index 0000000000..3fd3f4492e --- /dev/null +++ b/tensorflow/core/kernels/reshape_op.h @@ -0,0 +1,83 @@ +#ifndef TENSORFLOW_KERNELS_RESHAPE_OP_H_ +#define TENSORFLOW_KERNELS_RESHAPE_OP_H_ + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +class ReshapeOp : public OpKernel { + public: + explicit ReshapeOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& sizes = context->input(1); + // Preliminary validation of sizes. + OP_REQUIRES(context, TensorShapeUtils::IsLegacyVector(sizes.shape()), + errors::InvalidArgument("sizes input must be 1-D, not shape ", + sizes.shape().ShortDebugString())); + const int64 num_dims = sizes.NumElements(); + OP_REQUIRES( + context, num_dims <= 8, + errors::InvalidArgument(num_dims, " > max 8 output dims supported")); + + // Compute the output shape. Determine product of specified + // dimensions, and find the index of the unspecified one. + TensorShape shape; + int32 product = 1; + int unknown_index = -1; + auto Svec = sizes.flat<int32>(); + for (int d = 0; d < num_dims; ++d) { + const int32 size = Svec(d); + if (size == -1) { + OP_REQUIRES( + context, unknown_index == -1, + errors::InvalidArgument("only one input size may be -1, not both ", + unknown_index, " and ", d)); + unknown_index = d; + shape.AddDim(1); + } else { + OP_REQUIRES(context, size >= 0, + errors::InvalidArgument( + "size ", d, " must be non-negative, not ", size)); + shape.AddDim(size); + product *= size; + } + } + if (unknown_index != -1) { + OP_REQUIRES( + context, product > 0, + errors::InvalidArgument("cannot infer the missing input size for " + "an empty tensor unless all specified " + "input sizes are non-zero")); + const int32 missing = input.NumElements() / product; + OP_REQUIRES(context, product * missing == input.NumElements(), + errors::InvalidArgument("Input has ", input.NumElements(), + " values, which isn't divisible by ", + product)); + shape.set_dim(unknown_index, missing); + } + OP_REQUIRES(context, shape.num_elements() == input.NumElements(), + errors::InvalidArgument("Input has ", input.NumElements(), + " values, which isn't the same as ", + shape.num_elements())); + + // Actually produce the reshaped output. 
+ Tensor output(input.dtype()); + CHECK(output.CopyFrom(input, shape)); + context->set_output(0, output); + } + + bool IsExpensive() override { return false; } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_RESHAPE_OP_H_ diff --git a/tensorflow/core/kernels/resize_area_op.cc b/tensorflow/core/kernels/resize_area_op.cc new file mode 100644 index 0000000000..2b22d38ad6 --- /dev/null +++ b/tensorflow/core/kernels/resize_area_op.cc @@ -0,0 +1,139 @@ +// See docs in ../ops/image_ops.cc +#define EIGEN_USE_THREADS + +#include <algorithm> +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template <typename Device, typename T> +class ResizeAreaOp : public OpKernel { + public: + explicit ResizeAreaOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().ShortDebugString())); + const Tensor& shape_t = context->input(1); + OP_REQUIRES(context, shape_t.dims() == 1, + errors::InvalidArgument("shape_t must be 1-dimensional", + shape_t.shape().ShortDebugString())); + OP_REQUIRES(context, shape_t.NumElements() == 2, + errors::InvalidArgument("shape_t must have two elements", + shape_t.shape().ShortDebugString())); + + auto Svec = shape_t.vec<int32>(); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 0, TensorShape({input.dim_size(0), Svec(0), + Svec(1), input.dim_size(3)}), + &output)); + const int64 batch_size = input.dim_size(0); + const int64 in_height = input.dim_size(1); + const int64 in_width = input.dim_size(2); + const int64 channels = input.dim_size(3); + const int64 out_height = output->dim_size(1); + const int64 out_width = output->dim_size(2); + + typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>(); + typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>(); + + // A temporary tensor for computing the sum. + Tensor sum_tensor; + OP_REQUIRES_OK( + context, context->allocate_temp(DataTypeToEnum<float>::value, + TensorShape({channels}), &sum_tensor)); + typename TTypes<float, 1>::Tensor sum_data = sum_tensor.vec<float>(); + + const float height_scale = in_height / static_cast<float>(out_height); + const float width_scale = in_width / static_cast<float>(out_width); + + // When using this algorithm for downsizing, the target pixel value is the + // weighted average of all the source pixels. The weight is determined by + // the contribution percentage of the source pixel. + // + // Let "scale" be "target_image_size/source_image_size". If 1/n of the + // source pixel contributes to the target pixel, then the weight is (1/n * + // scale); if the complete source pixel contributes to the target pixel, + // then the weight is scale. + // + // To visualize the implementation, use one dimension as an example: + // Resize in[4] to out[3]. 
+ // scale = 3/4 = 0.75 + // out[0]: in[0] and 1/3 of in[1] + // out[1]: 2/3 of in[1] and 2/3 of in[2] + // out[2]: 1/3 of in[2] and in[3] + // Hence, the output pixel values are: + // out[0] = (in[0] * 1.0 + in[1] * 1/3) * scale + // out[1] = (in[1] * 2/3 + in[2] * 2/3) * scale + // out[2] = (in[2] * 1/3 + in[3] * 1.0) * scale + float scale = 1.0 / (height_scale * width_scale); + for (int64 b = 0; b < batch_size; ++b) { + for (int64 y = 0; y < out_height; ++y) { + const float in_y = y * height_scale; + const float in_y1 = (y + 1) * height_scale; + // The start and end height indices of all the cells that could + // contribute to the target cell. + int64 y_start = floor(in_y); + int64 y_end = ceil(in_y1); + + for (int64 x = 0; x < out_width; ++x) { + const float in_x = x * width_scale; + const float in_x1 = (x + 1) * width_scale; + // The start and end width indices of all the cells that could + // contribute to the target cell. + int64 x_start = floor(in_x); + int64 x_end = ceil(in_x1); + + sum_data.setConstant(0.0); + for (int64 i = y_start; i < y_end; ++i) { + float scale_y = + i < in_y ? i + 1 - in_y : (i + 1 > in_y1 ? in_y1 - i : 1.0); + for (int64 j = x_start; j < x_end; ++j) { + float scale_x = + j < in_x ? j + 1 - in_x : (j + 1 > in_x1 ? in_x1 - j : 1.0); + for (int64 c = 0; c < channels; ++c) { +#define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val)))) + sum_data(c) += + input_data(b, BOUND(i, in_height), BOUND(j, in_width), c) * + scale_y * scale_x * scale; +#undef BOUND + } + } + } + for (int64 c = 0; c < channels; ++c) { + output_data(b, y, x, c) = sum_data(c); + } + } + } + } + } +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("ResizeArea") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .HostMemory("size"), \ + ResizeAreaOp<CPUDevice, T>); + +REGISTER_KERNEL(uint8); +REGISTER_KERNEL(int8); +REGISTER_KERNEL(int32); +REGISTER_KERNEL(float); +REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc new file mode 100644 index 0000000000..472fc19b82 --- /dev/null +++ b/tensorflow/core/kernels/resize_bicubic_op.cc @@ -0,0 +1,121 @@ +// See docs in ../ops/image_ops.cc +#define EIGEN_USE_THREADS + +#include <algorithm> +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template <typename Device, typename T> +class ResizeBicubicOp : public OpKernel { + public: + explicit ResizeBicubicOp(OpKernelConstruction* context) : OpKernel(context) {} + +  void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().ShortDebugString())); + const Tensor& shape_t = context->input(1); + OP_REQUIRES(context, shape_t.dims() == 1, + errors::InvalidArgument("shape_t must be 1-dimensional", + shape_t.shape().ShortDebugString())); + OP_REQUIRES(context, shape_t.NumElements() == 2, + errors::InvalidArgument("shape_t must have two elements", +
shape_t.shape().ShortDebugString())); + + auto Svec = shape_t.vec<int32>(); + // Initialize shape to the batch size of the input, then add + // the rest of the dimensions + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 0, TensorShape({input.dim_size(0), Svec(0), + Svec(1), input.dim_size(3)}), + &output)); + const int64 batch_size = input.dim_size(0); + const int64 in_height = input.dim_size(1); + const int64 in_width = input.dim_size(2); + const int64 channels = input.dim_size(3); + const int64 out_height = output->dim_size(1); + const int64 out_width = output->dim_size(2); + + typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>(); + typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>(); + + const float height_scale = in_height / static_cast<float>(out_height); + const float width_scale = in_width / static_cast<float>(out_width); + + // Initialize coefficients table using Bicubic convolution algorithm. + // https://en.wikipedia.org/wiki/Bicubic_interpolation + static const int64 tab_size = (1 << 10); + static float coeffs_tab[(tab_size + 1) * 2]; + static const double A = -0.75; + for (int i = 0; i <= tab_size; ++i) { + float x = i * 1.0 / tab_size; + coeffs_tab[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1; + x += 1.0; + coeffs_tab[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; + } + + auto cal = [](float v0, float v1, float v2, float v3, float dx) { + const int64 offset = round(dx * tab_size); + const float a0 = coeffs_tab[offset * 2 + 1]; + const float a1 = coeffs_tab[offset * 2]; + const float a2 = coeffs_tab[(tab_size - offset) * 2]; + const float a3 = coeffs_tab[(tab_size - offset) * 2 + 1]; + return a0 * v0 + a1 * v1 + a2 * v2 + a3 * v3; + }; + + float coeff[4] = {0.0}; + for (int64 b = 0; b < batch_size; ++b) { + for (int64 y = 0; y < out_height; ++y) { + const int64 in_y = floor(height_scale * y); + const float dy = height_scale * y - in_y; + for (int64 x = 0; x < out_width; ++x) { + const int64 in_x = floor(width_scale * x); + const float dx = width_scale * x - in_x; + for (int64 c = 0; c < channels; ++c) { + for (int64 i = 0; i < 4; ++i) { +#define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val)))) + int64 bound_y = BOUND(in_y - 1 + i, in_height); + coeff[i] = + cal(input_data(b, bound_y, BOUND(in_x - 1, in_width), c), + input_data(b, bound_y, BOUND(in_x, in_width), c), + input_data(b, bound_y, BOUND(in_x + 1, in_width), c), + input_data(b, bound_y, BOUND(in_x + 2, in_width), c), dx); +#undef BOUND + } + output_data(b, y, x, c) = + cal(coeff[0], coeff[1], coeff[2], coeff[3], dy); + } + } + } + } + } +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("ResizeBicubic") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .HostMemory("size"), \ + ResizeBicubicOp<CPUDevice, T>); + +REGISTER_KERNEL(uint8); +REGISTER_KERNEL(int8); +REGISTER_KERNEL(int32); +REGISTER_KERNEL(float); +REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc new file mode 100644 index 0000000000..5119b93508 --- /dev/null +++ b/tensorflow/core/kernels/resize_bilinear_op.cc @@ -0,0 +1,109 @@ +// See docs in ../ops/image_ops.cc +#define EIGEN_USE_THREADS + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" 
+#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template <typename Device, typename T> +class ResizeBilinearOp : public OpKernel { + public: + explicit ResizeBilinearOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().ShortDebugString())); + const Tensor& shape_t = context->input(1); + OP_REQUIRES(context, shape_t.dims() == 1, + errors::InvalidArgument("shape_t must be 1-dimensional", + shape_t.shape().ShortDebugString())); + OP_REQUIRES(context, shape_t.NumElements() == 2, + errors::InvalidArgument("shape_t must have two elements", + shape_t.shape().ShortDebugString())); + + auto Svec = shape_t.vec<int32>(); + // Initialize shape to the batch size of the input, then add + // the rest of the dimensions + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 0, TensorShape({input.dim_size(0), Svec(0), + Svec(1), input.dim_size(3)}), + &output)); + + const int64 batch_size = input.dim_size(0); + const int64 in_height = input.dim_size(1); + const int64 in_width = input.dim_size(2); + const int64 channels = input.dim_size(3); + const int64 out_height = output->dim_size(1); + const int64 out_width = output->dim_size(2); + + typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>(); + typename TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>(); + + const float height_scale = in_height / static_cast<float>(out_height); + const float width_scale = in_width / static_cast<float>(out_width); + + for (int b = 0; b < batch_size; ++b) { + for (int y = 0; y < out_height; ++y) { + const float in_y = y * height_scale; + const int top_y_index = static_cast<int>(floorf(in_y)); + const int bottom_y_index = + std::min(static_cast<int64>(ceilf(in_y)), (in_height - 1)); + const float y_lerp = in_y - top_y_index; + const float inverse_y_lerp = (1.0f - y_lerp); + for (int x = 0; x < out_width; ++x) { + const float in_x = x * width_scale; + const int left_x_index = static_cast<int>(floorf(in_x)); + const int right_x_index = + std::min(static_cast<int64>(ceilf(in_x)), (in_width - 1)); + const float x_lerp = in_x - left_x_index; + const float inverse_x_lerp = (1.0f - x_lerp); + for (int c = 0; c < channels; ++c) { + const float top_left = input_data(b, top_y_index, left_x_index, c); + const float top_right = + input_data(b, top_y_index, right_x_index, c); + const float bottom_left = + input_data(b, bottom_y_index, left_x_index, c); + const float bottom_right = + input_data(b, bottom_y_index, right_x_index, c); + const float top = + (top_left * inverse_x_lerp) + (top_right * x_lerp); + const float bottom = + (bottom_left * inverse_x_lerp) + (bottom_right * x_lerp); + output_data(b, y, x, c) = + (top * inverse_y_lerp) + (bottom * y_lerp); + } + } + } + } + } +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("ResizeBilinear") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .HostMemory("size"), \ + ResizeBilinearOp<CPUDevice, T>); + +REGISTER_KERNEL(uint8); +REGISTER_KERNEL(int8); +REGISTER_KERNEL(int32); +REGISTER_KERNEL(float); +REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +} // namespace tensorflow 
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc new file mode 100644 index 0000000000..0ebe2e5f8c --- /dev/null +++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc @@ -0,0 +1,171 @@ +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { + +class ResizeBilinearOpTest : public OpsTestBase { + protected: + ResizeBilinearOpTest() { + RequireDefaultOps(); + EXPECT_OK(NodeDefBuilder("resize_bilinear_op", "ResizeBilinear") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_INT32)) + .Finalize(node_def())); + EXPECT_OK(InitOp()); + } +}; + +TEST_F(ResizeBilinearOpTest, TestBilinear2x2To1x1) { + // Input: + // 1, 2 + // 3, 4 + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {1, 1}); + ASSERT_OK(RunOpKernel()); + + // When scaling down, we have to arbitrarily pick a pixel from the + // original input. In this case, we choose the top/left most pixel. + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1})); + test::FillValues<float>(&expected, {1.0}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3) { + // Input: + // 1, 2 + // 3, 4 + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {3, 3}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1})); + + // The corners should match the original corners, and we bilinear + // interpolate the values in between. + + // clang-format off + test::FillValues<float>(&expected, + {1, 5.0/3, 2, + 7.0/3, 3, 10.0/3, + 3, 11.0/3, 4}); + + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeBilinearOpTest, TestBilinear3x3To4x4) { + // Input: + // 1, 2, 3, + // 4, 5, 6, + // 7, 8, 9 + AddInputFromArray<float>(TensorShape({1, 3, 3, 1}), + {1, 2, 3, 4, 5, 6, 7, 8, 9}); + AddInputFromArray<int32>(TensorShape({2}), {4, 4}); + ASSERT_OK(RunOpKernel()); + + // The corners should match the original corners, and we bilinear + // interpolate the values in between. 
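  // A quick check of where these numbers come from: with scale = 3/4, output
  // column 1 maps to in_x = 0.75, so row 0 gives 1 * 0.25 + 2 * 0.75 = 1.75;
  // output row 1 maps to in_y = 0.75, so column 0 gives 1 * 0.25 + 4 * 0.75 =
  // 3.25. Every other interior entry follows from the same two-lerp rule.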
+ Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1})); + // clang-format off + test::FillValues<float>(&expected, + {1, 1.75, 2.5, 3, + 3.25, 4, 4.75, 5.25, + 5.5, 6.25, 7, 7.5, + 7, 7.75, 8.5, 9}); + + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3Batch2) { + // Input: + // 1, 2 + // 3, 4 + // + // repeated twice + AddInputFromArray<float>(TensorShape({2, 2, 2, 1}), {1, 2, 3, 4, 1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {3, 3}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 3, 1})); + // clang-format off + test::FillValues<float>(&expected, + {1, 5.0/3, 2, 7.0/3, 3, 10.0/3, 3, 11.0/3, 4, + 1, 5.0/3, 2, 7.0/3, 3, 10.0/3, 3, 11.0/3, 4 + }); + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeBilinearOpTest, TestBilinear2x2x2To3x3x2) { + AddInputFromArray<float>(TensorShape({1, 2, 2, 2}), + {1, -1, 2, -2, 3, -3, 4, -4}); + AddInputFromArray<int32>(TensorShape({2}), {3, 3}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 2})); + // clang-format off + test::FillValues<float>(&expected, + { + 1, -1, + 5.0/3, -5.0/3, + 2, -2, + 7.0/3, -7.0/3, + 3, -3, + 10.0/3, -10.0/3, + 3, -3, + 11.0/3, -11.0/3, + 4, -4 + }); + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeBilinearOpTest, TestBilinear2x2To4x4) { + // Input: + // 1, 2 + // 3, 4 + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {4, 4}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1})); + // clang-format off + test::FillValues<float>(&expected, + {1, 1.5, 2, 2, + 2, 2.5, 3, 3, + 3, 3.5, 4, 4, + 3, 3.5, 4, 4}); + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeBilinearOpTest, TestInvalidInputShape) { + AddInputFromArray<float>(TensorShape({2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {4, 4}); + ASSERT_FALSE(RunOpKernel().ok()); +} + +TEST_F(ResizeBilinearOpTest, TestInvalidSizeDim) { + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2, 1}), {4, 4}); + ASSERT_FALSE(RunOpKernel().ok()); +} +TEST_F(ResizeBilinearOpTest, TestInvalidSizeElements) { + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({3}), {4, 4, 1}); + ASSERT_FALSE(RunOpKernel().ok()); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc new file mode 100644 index 0000000000..13089308ce --- /dev/null +++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc @@ -0,0 +1,89 @@ +// See docs in ../ops/image_ops.cc +#define EIGEN_USE_THREADS + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template <typename Device, typename T> +class 
ResizeNearestNeighborOp : public OpKernel { + public: + explicit ResizeNearestNeighborOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().ShortDebugString())); + const Tensor& shape_t = context->input(1); + OP_REQUIRES(context, shape_t.dims() == 1, + errors::InvalidArgument("shape_t must be 1-dimensional", + shape_t.shape().ShortDebugString())); + OP_REQUIRES(context, shape_t.NumElements() == 2, + errors::InvalidArgument("shape_t must have two elements", + shape_t.shape().ShortDebugString())); + + auto Svec = shape_t.vec<int32>(); + // Initialize shape to the batch size of the input, then add + // the rest of the dimensions + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 0, TensorShape({input.dim_size(0), Svec(0), + Svec(1), input.dim_size(3)}), + &output)); + + const int64 batch_size = input.dim_size(0); + const int64 in_height = input.dim_size(1); + const int64 in_width = input.dim_size(2); + const int64 channels = input.dim_size(3); + const int64 out_height = output->dim_size(1); + const int64 out_width = output->dim_size(2); + + typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>(); + typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>(); + + const float height_scale = in_height / static_cast<float>(out_height); + const float width_scale = in_width / static_cast<float>(out_width); + + for (int b = 0; b < batch_size; ++b) { + for (int y = 0; y < out_height; ++y) { + const int in_y = std::min(static_cast<int64>(floorf(y * height_scale)), + (in_height - 1)); + for (int x = 0; x < out_width; ++x) { + const int in_x = std::min(static_cast<int64>(floorf(x * width_scale)), + (in_width - 1)); + for (int c = 0; c < channels; ++c) { + output_data(b, y, x, c) = input_data(b, in_y, in_x, c); + } + } + } + } + } +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighbor") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .HostMemory("size"), \ + ResizeNearestNeighborOp<CPUDevice, T>); + +REGISTER_KERNEL(uint8); +REGISTER_KERNEL(int8); +REGISTER_KERNEL(int32); +REGISTER_KERNEL(float); +REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc new file mode 100644 index 0000000000..8fca1f34e3 --- /dev/null +++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc @@ -0,0 +1,163 @@ +// TODO(shlens, sherrym): Consider adding additional tests in image_ops.py in +// order to compare the reference implementation for image resizing in Python +// Image Library. 
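ResizeNearestNeighborOp above uses the same scale = in_size / out_size convention as the bilinear kernel, but simply snaps each output coordinate to floor(coordinate * scale), clamped to the last valid index. A standalone sketch of the mapping (hypothetical helper):

#include <algorithm>
#include <cmath>
#include <iostream>

static int NearestIndex(int out_coord, int in_size, int out_size) {
  const float scale = in_size / static_cast<float>(out_size);
  return std::min(static_cast<int>(std::floor(out_coord * scale)), in_size - 1);
}

int main() {
  // A 2 -> 3 upsampling maps output columns {0, 1, 2} to input columns
  // {0, 0, 1}, which is why the 2x2 -> 3x3 test below expects rows like
  // {1, 1, 2}.
  for (int x = 0; x < 3; ++x) std::cout << NearestIndex(x, 2, 3) << " ";
  std::cout << "\n";
  return 0;
}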
+#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { + +class ResizeNearestNeighborOpTest : public OpsTestBase { + protected: + ResizeNearestNeighborOpTest() { + RequireDefaultOps(); + EXPECT_OK(NodeDefBuilder("resize_nn", "ResizeNearestNeighbor") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_INT32)) + .Finalize(node_def())); + EXPECT_OK(InitOp()); + } +}; + +TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To1x1) { + // Input: + // 1, 2 + // 3, 4 + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {1, 1}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1})); + + // clang-format off + test::FillValues<float>(&expected, {1}); + + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To3x3) { + // Input: + // 1, 2 + // 3, 4 + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {3, 3}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 3, 3, 1})); + + // clang-format off + test::FillValues<float>(&expected, + {1, 1, 2, + 1, 1, 2, + 3, 3, 4}); + + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To2x5) { + // Input: + // 1, 2 + // 3, 4 + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {2, 5}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 5, 1})); + + // clang-format off + test::FillValues<float>(&expected, + {1, 1, 1, 2, 2, + 3, 3, 3, 4, 4}); + + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To5x2) { + // Input: + // 1, 2 + // 3, 4 + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {5, 2}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 5, 2, 1})); + + // clang-format off + test::FillValues<float>(&expected, + {1, 2, + 1, 2, + 1, 2, + 3, 4, + 3, 4}); + + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2To4x4) { + // Input: + // 1, 2 + // 3, 4 + AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4}); + AddInputFromArray<int32>(TensorShape({2}), {4, 4}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 4, 4, 1})); + + // clang-format off + test::FillValues<float>(&expected, + {1, 1, 2, 2, + 1, 1, 2, 2, + 3, 3, 4, 4, + 3, 3, 4, 4}); + + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(ResizeNearestNeighborOpTest, TestNearest2x2x2x2To2x3x3x2) { + // Input: + // [ [ 1, 
1 ], [ 2, 2], + // [ 3, 3 ], [ 4, 4] ], + // [ [ 5, 5 ], [ 6, 6], + // [ 7, 7 ], [ 8, 8] ] + AddInputFromArray<float>(TensorShape({2, 2, 2, 2}), + {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8}); + AddInputFromArray<int32>(TensorShape({2}), {3, 3}); + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 3, 2})); + + // clang-format off + test::FillValues<float>(&expected, + {1, 1, 1, + 1, 2, 2, + 1, 1, 1, + 1, 2, 2, + 3, 3, 3, + 3, 4, 4, + 5, 5, 5, + 5, 6, 6, + 5, 5, 5, + 5, 6, 6, + 7, 7, 7, + 7, 8, 8}); + + // clang-format on + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/restore_op.cc b/tensorflow/core/kernels/restore_op.cc new file mode 100644 index 0000000000..b52c69449c --- /dev/null +++ b/tensorflow/core/kernels/restore_op.cc @@ -0,0 +1,65 @@ +// See docs in ../ops/io_ops.cc. +#include "tensorflow/core/kernels/io.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/tensor_slice_reader.h" + +namespace tensorflow { + +class RestoreOp : public OpKernel { + public: + explicit RestoreOp(OpKernelConstruction* context) : OpKernel(context) { + int preferred_shard; + OP_REQUIRES_OK(context, + context->GetAttr("preferred_shard", &preferred_shard)); + if (preferred_shard == -1) { + preferred_shard_ = checkpoint::TensorSliceReader::kLoadAllShards; + } else { + OP_REQUIRES(context, preferred_shard >= 0, + errors::InvalidArgument("Attribute 'preferred_shard' must be " + "greater or equal to -1")); + preferred_shard_ = preferred_shard; + } + } + void Compute(OpKernelContext* context) override { + RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader, + preferred_shard_, false); + } + + private: + int preferred_shard_; +}; + +REGISTER_KERNEL_BUILDER(Name("Restore").Device(DEVICE_CPU), RestoreOp); + +class RestoreSliceOp : public OpKernel { + public: + explicit RestoreSliceOp(OpKernelConstruction* context) : OpKernel(context) { + int preferred_shard; + OP_REQUIRES_OK(context, + context->GetAttr("preferred_shard", &preferred_shard)); + if (preferred_shard == -1) { + preferred_shard_ = checkpoint::TensorSliceReader::kLoadAllShards; + } else { + OP_REQUIRES(context, preferred_shard >= 0, + errors::InvalidArgument("Attribute 'preferred_shard' must be " + "greater or equal to -1")); + preferred_shard_ = preferred_shard; + } + } + void Compute(OpKernelContext* context) override { + RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader, + preferred_shard_, true); + } + + private: + int preferred_shard_; +}; + +REGISTER_KERNEL_BUILDER(Name("RestoreSlice").Device(DEVICE_CPU), + RestoreSliceOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/restore_op_test.cc b/tensorflow/core/kernels/restore_op_test.cc new file mode 100644 index 0000000000..59343a8037 --- /dev/null +++ b/tensorflow/core/kernels/restore_op_test.cc @@ -0,0 +1,305 @@ +#include <functional> +#include <memory> +#include <vector> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include 
"tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +class RestoreOpTest : public OpsTestBase { + protected: + // Makes an operation to restore two tensors + void MakeRestoreOp(DataType dt) { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "Restore") + .Input(FakeInput()) + .Input(FakeInput()) + .Attr("dt", dt) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(RestoreOpTest, RestoreInt) { + const string filename = io::JoinPath(testing::TmpDir(), "tensor_int"); + const string tensor_name = "tensor_int"; + + // We first need to write a tensor using the save_op + { + // Initialize an operation + NodeDef save; + ASSERT_OK(NodeDefBuilder("save", "Save") + .Input(FakeInput(DT_STRING)) + .Input(FakeInput(DT_STRING)) + .Input(FakeInput({DT_INT32})) + .Finalize(&save)); + + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + gtl::InlinedVector<TensorValue, 4> inputs; + + Status status; + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), save, &status)); + EXPECT_OK(status); + + // Run it + + // Input #0 is the file name + Tensor input_0(DT_STRING, TensorShape({})); + input_0.scalar<string>()() = filename; + inputs.push_back({nullptr, &input_0}); + + // Input #1 is the tensor name + Tensor input_1(DT_STRING, TensorShape({})); + input_1.scalar<string>()() = tensor_name; + inputs.push_back({nullptr, &input_1}); + + // Input #2 is an integer tensor: it's a 1-d array. 
+ Tensor input_2(DT_INT32, TensorShape({10})); + for (int i = 0; i < 10; ++i) { + input_2.flat<int32>()(i) = i + 1; + } + inputs.push_back({nullptr, &input_2}); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper; + params.slice_reader_cache = &slice_reader_cache_wrapper; + + OpKernelContext ctx(params); + op->Compute(&ctx); + EXPECT_OK(ctx.status()); + } + + // Now we restore + MakeRestoreOp(DT_INT32); + // Add a file name + AddInput<string>(TensorShape({}), + [&filename](int x) -> string { return filename; }); + // Add the tensor names + AddInput<string>(TensorShape({}), + [&tensor_name](int x) -> string { return tensor_name; }); + + ASSERT_OK(RunOpKernel()); + + // Check that we have an integer tensor + Tensor* output = GetOutput(0); + TensorShape expected({10}); + EXPECT_TRUE(output->shape().IsSameSize(expected)); + for (int i = 0; i < 10; ++i) { + EXPECT_EQ(i + 1, output->flat<int32>()(i)); + } +} + +TEST_F(RestoreOpTest, RestoreFloat) { + const string filename = io::JoinPath(testing::TmpDir(), "tensor_float"); + const string tensor_name = "tensor_float"; + + // We first need to write a tensor using the save_op + { + // Initialize an operation + NodeDef save; + ASSERT_OK(NodeDefBuilder("save", "Save") + .Input(FakeInput(DT_STRING)) + .Input(FakeInput(DT_STRING)) + .Input(FakeInput({DT_FLOAT})) + .Finalize(&save)); + + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + gtl::InlinedVector<TensorValue, 4> inputs; + + Status status; + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), save, &status)); + EXPECT_OK(status); + + // Run it + + // Input #0 is the file name + Tensor input_0(DT_STRING, TensorShape({})); + input_0.scalar<string>()() = filename; + inputs.push_back({nullptr, &input_0}); + + // Input #1 is the tensor name + Tensor input_1(DT_STRING, TensorShape({})); + input_1.scalar<string>()() = tensor_name; + inputs.push_back({nullptr, &input_1}); + + // Input #2 is a float tensor: it's a 2-d array. 
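+    // The Save op takes the file name as input #0, the tensor names as
+    // input #1, and then one data tensor per name; this 2x4 float tensor is
+    // written to the checkpoint under the name "tensor_float".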
+ Tensor input_2(DT_FLOAT, TensorShape({2, 4})); + for (int i = 0; i < 8; ++i) { + input_2.flat<float>()(i) = static_cast<float>(i) / 10; + } + inputs.push_back({nullptr, &input_2}); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper; + params.slice_reader_cache = &slice_reader_cache_wrapper; + + OpKernelContext ctx(params); + op->Compute(&ctx); + EXPECT_OK(ctx.status()); + } + + // Now we restore + MakeRestoreOp(DT_FLOAT); + // Add a file name + AddInput<string>(TensorShape({}), + [&filename](int x) -> string { return filename; }); + // Add the tensor names + AddInput<string>(TensorShape({}), + [&tensor_name](int x) -> string { return tensor_name; }); + + ASSERT_OK(RunOpKernel()); + + // Check that we have a float tensor. + Tensor* output = GetOutput(0); + TensorShape expected({2, 4}); + EXPECT_TRUE(output->shape().IsSameSize(expected)); + for (int i = 0; i < 8; ++i) { + EXPECT_EQ(static_cast<float>(i) / 10, output->flat<float>()(i)); + } +} + +class RestoreSliceOpTest : public OpsTestBase { + protected: + void MakeRestoreSliceOp(DataType dt) { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "RestoreSlice") + .Input(FakeInput()) + .Input(FakeInput()) + .Input(FakeInput()) + .Attr("dt", dt) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(RestoreSliceOpTest, RestoreInt) { + const string filename = io::JoinPath(testing::TmpDir(), "tensor_int"); + const string tensor_name = "tensor_int"; + + // We first need to write a tensor using the save_op + { + // Initialize an operation + NodeDef save; + ASSERT_OK(NodeDefBuilder("save", "Save") + .Input(FakeInput(DT_STRING)) + .Input(FakeInput(DT_STRING)) + .Input(FakeInput({DT_INT32})) + .Finalize(&save)); + + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + gtl::InlinedVector<TensorValue, 4> inputs; + + Status status; + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), save, &status)); + EXPECT_OK(status); + + // Run it + + // Input #0 is the file name + Tensor input_0(DT_STRING, TensorShape({})); + input_0.scalar<string>()() = filename; + inputs.push_back({nullptr, &input_0}); + + // Input #1 is the tensor name + Tensor input_1(DT_STRING, TensorShape({})); + input_1.scalar<string>()() = tensor_name; + inputs.push_back({nullptr, &input_1}); + + // Input #2 is a 4x16 integer tensor. 
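+    // The full 4x16 tensor is written here; the restore below requests only
+    // the slice "4 16 0,2:-" (full shape 4x16, rows [0, 2), all columns), so
+    // the output should come back as a 2x16 tensor.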
+ Tensor input_2(DT_INT32, TensorShape({4, 16})); + for (int64 i = 0; i < input_2.NumElements(); ++i) { + input_2.flat<int32>()(i) = i + 1; + } + inputs.push_back({nullptr, &input_2}); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper; + params.slice_reader_cache = &slice_reader_cache_wrapper; + + OpKernelContext ctx(params); + op->Compute(&ctx); + EXPECT_OK(ctx.status()); + } + + // Now we restore + MakeRestoreSliceOp(DT_INT32); + string shape_and_slice = "4 16 0,2:-"; + // Add a file name + AddInput<string>(TensorShape({}), + [&filename](int x) -> string { return filename; }); + // Add the tensor names + AddInput<string>(TensorShape({}), + [&tensor_name](int x) -> string { return tensor_name; }); + // Add the tensor shape and slice + AddInput<string>(TensorShape({}), [&shape_and_slice](int x) -> string { + return shape_and_slice; + }); + + ASSERT_OK(RunOpKernel()); + + // Check that we have an integer tensor + Tensor* output = GetOutput(0); + TensorShape expected({2, 16}); + EXPECT_TRUE(output->shape().IsSameSize(expected)); + for (int64 i = 0; i < expected.num_elements(); ++i) { + EXPECT_EQ(i + 1, output->flat<int32>()(i)); + } +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc new file mode 100644 index 0000000000..c63dfc1e70 --- /dev/null +++ b/tensorflow/core/kernels/reverse_op.cc @@ -0,0 +1,139 @@ +// See docs in ../ops/array_ops.cc +#define EIGEN_USE_THREADS + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/reverse_op.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class ReverseOp : public OpKernel { + public: + explicit ReverseOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& dims = context->input(1); + + if (TensorShapeUtils::IsScalar(input.shape())) { + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + output->scalar<T>() = input.scalar<T>(); + + } else { + const int input_dims = input.dims(); + OP_REQUIRES(context, TensorShapeUtils::IsVector(dims.shape()), + errors::InvalidArgument("'dims' must be 1-dimension, not ", + dims.dims())); + + OP_REQUIRES(context, input_dims == dims.dim_size(0), + errors::InvalidArgument( + "'dims' must have the same number of values as 'input' has " + "dimensions. 
'input' has ", input_dims, ", 'dims' has ", + dims.dim_size(0), " values")); + OP_REQUIRES(context, input_dims <= 8, errors::Unimplemented( + "reverse is not implemented for tensors of rank > 8.")); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + +#define HANDLE_REVERSE(NDIMS) \ + case NDIMS: \ + functor::Reverse<Device, T, NDIMS>()( \ + context->eigen_device<Device>(), input.tensor<T, NDIMS>(), \ + dims.vec<bool>(), output->tensor<T, NDIMS>()); \ + return; + + switch (input_dims) { + HANDLE_REVERSE(0); + HANDLE_REVERSE(1); + HANDLE_REVERSE(2); + HANDLE_REVERSE(3); + HANDLE_REVERSE(4); + HANDLE_REVERSE(5); + HANDLE_REVERSE(6); + HANDLE_REVERSE(7); + HANDLE_REVERSE(8); + } +#undef HANDLE_REVERSE + } + } +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("Reverse") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .HostMemory("dims"), \ + ReverseOp<CPUDevice, T>) + +REGISTER_KERNEL(uint8); +REGISTER_KERNEL(int8); +REGISTER_KERNEL(int32); +REGISTER_KERNEL(bool); +REGISTER_KERNEL(float); +REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA + +// Forward declarations of the function specializations for GPU (to prevent +// building the GPU versions here, they will be built compiling _gpu.cu.cc). +namespace functor { +#define DECLARE_GPU_SPEC_DIM(T, DIM) \ + template <> \ + void Reverse<GPUDevice, T, DIM>::operator()( \ + const GPUDevice& d, typename TTypes<T, DIM>::ConstTensor input, \ + typename TTypes<bool, 1>::ConstTensor dims, \ + typename TTypes<T, DIM>::Tensor output); \ + extern template struct Reverse<GPUDevice, T, DIM>; +#define DECLARE_GPU_SPEC(T) \ + DECLARE_GPU_SPEC_DIM(T, 0) \ + DECLARE_GPU_SPEC_DIM(T, 1) \ + DECLARE_GPU_SPEC_DIM(T, 2) \ + DECLARE_GPU_SPEC_DIM(T, 3) \ + DECLARE_GPU_SPEC_DIM(T, 4) \ + DECLARE_GPU_SPEC_DIM(T, 5) \ + DECLARE_GPU_SPEC_DIM(T, 6) \ + DECLARE_GPU_SPEC_DIM(T, 7) \ + DECLARE_GPU_SPEC_DIM(T, 8) + +DECLARE_GPU_SPEC(uint8); +DECLARE_GPU_SPEC(int8); +DECLARE_GPU_SPEC(int32); +DECLARE_GPU_SPEC(bool); +DECLARE_GPU_SPEC(float); +DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +#undef DECLARE_GPU_SPEC_DIM +} // namespace functor + +// Registration of the GPU implementations. +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("Reverse") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<T>("T") \ + .HostMemory("dims"), \ + ReverseOp<GPUDevice, T>) +REGISTER_GPU_KERNEL(uint8); +REGISTER_GPU_KERNEL(int8); +REGISTER_GPU_KERNEL(float); +REGISTER_GPU_KERNEL(double); +#undef REGISTER_GPU_KERNEL + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reverse_op.h b/tensorflow/core/kernels/reverse_op.h new file mode 100644 index 0000000000..bba25f70e8 --- /dev/null +++ b/tensorflow/core/kernels/reverse_op.h @@ -0,0 +1,28 @@ +#ifndef TENSORFLOW_KERNELS_REVERSE_OP_H_ +#define TENSORFLOW_KERNELS_REVERSE_OP_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by ReverseOp to do the computations. 
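+// For every dimension i where dims(i) is true, Eigen's reverse() flips the
+// index order along that dimension. For example, with dims = {true, false,
+// true} on a 2x3x4 tensor, output element (b, r, c) is read from input
+// element (1 - b, r, 3 - c), which is what the Reverse_234 test in
+// reverse_op_test.cc expects.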
+template <typename Device, typename T, int Dims> +struct Reverse { + void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor input, + typename TTypes<bool, 1>::ConstTensor dims, + typename TTypes<T, Dims>::Tensor output) { + // 'dims' is in host memory + Eigen::array<bool, Dims> reverse_dims; + for (int i = 0; i < Dims; ++i) { + reverse_dims[i] = dims(i); + } + output.device(d) = input.reverse(reverse_dims); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_REVERSE_OP_H_ diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc new file mode 100644 index 0000000000..b510add3f3 --- /dev/null +++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc @@ -0,0 +1,33 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/reverse_op.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_REVERSE(DIM) \ + template struct functor::Reverse<GPUDevice, uint8, DIM>; \ + template struct functor::Reverse<GPUDevice, int8, DIM>; \ + template struct functor::Reverse<GPUDevice, int32, DIM>; \ + template struct functor::Reverse<GPUDevice, bool, DIM>; \ + template struct functor::Reverse<GPUDevice, float, DIM>; \ + template struct functor::Reverse<GPUDevice, double, DIM>; +DEFINE_REVERSE(0) +DEFINE_REVERSE(1) +DEFINE_REVERSE(2) +DEFINE_REVERSE(3) +DEFINE_REVERSE(4) +DEFINE_REVERSE(5) +DEFINE_REVERSE(6) +DEFINE_REVERSE(7) +DEFINE_REVERSE(8) +#undef DEFINE_REVERSE + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc new file mode 100644 index 0000000000..d41c36e693 --- /dev/null +++ b/tensorflow/core/kernels/reverse_op_test.cc @@ -0,0 +1,101 @@ +#include <functional> +#include <memory> +#include <vector> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +class ReverseOpTest : public OpsTestBase { + protected: + void MakeOp(DataType data_type) { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "Reverse") + .Input(FakeInput(data_type)) + .Input(FakeInput()) + .Attr("T", data_type) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(ReverseOpTest, Reverse_0) { + MakeOp(DT_FLOAT); + AddInputFromArray<float>(TensorShape({}), {3}); + AddInputFromArray<bool>(TensorShape({}), {true}); + ASSERT_OK(RunOpKernel()); + + Tensor* output = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({})); + expected.scalar<float>() = expected.scalar<float>().constant(3.f); + test::ExpectTensorEqual<float>(expected, *output); +} + +TEST_F(ReverseOpTest, Reverse_234) { + MakeOp(DT_FLOAT); + + // Feed and run + // [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 
9, 10, 11]] + // [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]] + AddInputFromArray<float>(TensorShape({2, 3, 4}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23}); + AddInputFromArray<bool>(TensorShape({3}), {true, false, true}); + + ASSERT_OK(RunOpKernel()); + + // Check the new state of the input + Tensor* params_tensor = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 4})); + // Should become + // [[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]] + // [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]] + test::FillValues<float>( + &expected, {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 3, 2, 1, 0, 7, + 6, 5, 4, 11, 10, 9, 8}); + test::ExpectTensorEqual<float>(expected, *params_tensor); +} + +TEST_F(ReverseOpTest, Reverse_1234) { + MakeOp(DT_FLOAT); + + // Feed and run + // [[[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]] + // [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]] + AddInputFromArray<float>(TensorShape({1, 2, 3, 4}), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23}); + AddInputFromArray<bool>(TensorShape({4}), {true, true, false, true}); + + ASSERT_OK(RunOpKernel()); + + // Check the new state of the input + Tensor* params_tensor = GetOutput(0); + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4})); + // Should become + // [[[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]] + // [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]]] + test::FillValues<float>( + &expected, {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 3, 2, 1, 0, 7, + 6, 5, 4, 11, 10, 9, 8}); + test::ExpectTensorEqual<float>(expected, *params_tensor); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc new file mode 100644 index 0000000000..6673a700ef --- /dev/null +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -0,0 +1,170 @@ +// See docs in ../ops/array_ops.cc. + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#include "tensorflow/core/kernels/reverse_sequence_op.h" + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device> +void CheckErrors(OpKernelContext* context, int seq_dim) { + const Tensor& input = context->input(0); + const Tensor& seq_lens = context->input(1); + + auto seq_lens_t = seq_lens.vec<int64>(); + + std::vector<int64> seq_lens_vec(seq_lens_t.size()); + + // Copy seq_len info down for validity checks + context->eigen_device<Device>().memcpyDeviceToHost( + seq_lens_vec.data(), seq_lens_t.data(), + sizeof(int64) * seq_lens_t.size()); + + OP_REQUIRES(context, 0 != seq_dim, errors::InvalidArgument("0 == seq_dim")); + OP_REQUIRES(context, seq_dim < input.dims(), + errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + seq_dim, " vs. 
", input.dims(), ")")); + + OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(0), + errors::InvalidArgument("len(seq_lens) != input.dims(", 0, "), ", + "(", seq_lens.NumElements(), " vs. ", + input.dim_size(seq_dim))); + + for (int d = 0; d < seq_lens_vec.size(); ++d) { + OP_REQUIRES(context, seq_lens_vec[d] >= 0, + errors::InvalidArgument("seq_lens(", d, ") < 0")); + OP_REQUIRES(context, seq_lens_vec[d] <= input.dim_size(seq_dim), + errors::InvalidArgument("seq_lens(", d, ") > input.dims(", + seq_dim, ")")); + } +} + +template <> +void CheckErrors<GPUDevice>(OpKernelContext* context, int seq_dim) { + const Tensor& input = context->input(0); + const Tensor& seq_lens = context->input(1); + + OP_REQUIRES(context, 0 != seq_dim, errors::InvalidArgument("0 == seq_dim")); + OP_REQUIRES(context, seq_dim < input.dims(), + errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + seq_dim, " vs. ", input.dims(), ")")); + + OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(0), + errors::InvalidArgument("len(seq_lens) != input.dims(", 0, "), ", + "(", seq_lens.NumElements(), " vs. ", + input.dim_size(seq_dim))); +} + +template <typename Device, typename T> +class ReverseSequenceOp : public OpKernel { + public: + explicit ReverseSequenceOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("seq_dim", &seq_dim_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& seq_lens = context->input(1); + + // Preliminary validation of sizes. + OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens.shape()), + errors::InvalidArgument("seq_lens input must be 1-dim, not ", + seq_lens.dims())); + + auto seq_lens_t = seq_lens.vec<int64>(); + + CheckErrors<Device>(context, seq_dim_); + + const int input_dims = input.dims(); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + +#define HANDLE_DIM(NDIM) \ + case NDIM: \ + functor::ReverseSequence<Device, T, NDIM>::Compute( \ + context->eigen_device<Device>(), input.tensor<T, NDIM>(), seq_dim_, \ + seq_lens_t, output->tensor<T, NDIM>()); \ + break; + + switch (input_dims) { + HANDLE_DIM(2); + HANDLE_DIM(3); + HANDLE_DIM(4); + HANDLE_DIM(5); + + default: + OP_REQUIRES(context, false, + errors::InvalidArgument( + "ReverseSequenceOp : Unhandled input dimensions: ", + input_dims)); + } + } + + private: + int32 seq_dim_; + + TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp); +}; + +#define REGISTER_REVERSE_SEQUENCE(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + ReverseSequenceOp<CPUDevice, type>); + +TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE); + +#if GOOGLE_CUDA + +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T, Dims) \ + template <> \ + void ReverseSequence<GPUDevice, T, Dims>::Compute( \ + const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \ + int32 seq_dim, TTypes<int64>::ConstVec seq_lens, \ + typename TTypes<T, Dims>::Tensor output); \ + extern template struct ReverseSequence<GPUDevice, T, Dims>; + +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC(T, 2); \ + DECLARE_GPU_SPEC(T, 3); \ + DECLARE_GPU_SPEC(T, 4); \ + DECLARE_GPU_SPEC(T, 5); + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS); + +} // namespace functor + +// Registration of the GPU implementations. 
+#define REGISTER_REVERSE_SEQUENCE_GPU(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + ReverseSequenceOp<GPUDevice, type>); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU); + +#undef REGISTER_REVERSE_SEQUENCE_GPU + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/reverse_sequence_op.h b/tensorflow/core/kernels/reverse_sequence_op.h new file mode 100644 index 0000000000..d1dd572dcb --- /dev/null +++ b/tensorflow/core/kernels/reverse_sequence_op.h @@ -0,0 +1,56 @@ +#ifndef TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_ +#define TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_ +// Generator definition for ReverseSequenceOp, must be compilable by nvcc. + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +namespace generator { + +template <typename T, size_t Dims> +class ReverseGenerator { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + ReverseGenerator(typename TTypes<T, Dims>::ConstTensor input, int32 seq_dim, + TTypes<int64>::ConstVec seq_lengths) + : input_(input), seq_dim_(seq_dim), seq_lengths_(seq_lengths) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T + operator()(const Eigen::array<Eigen::DenseIndex, Dims>& coords) const { + Eigen::array<Eigen::DenseIndex, Dims> new_coords = coords; + if (coords[seq_dim_] < seq_lengths_(coords[0])) { + new_coords[seq_dim_] = seq_lengths_(coords[0]) - coords[seq_dim_] - 1; + } + + return input_(new_coords); + } + + private: + typename TTypes<T, Dims>::ConstTensor input_; + int32 seq_dim_; + TTypes<int64>::ConstVec seq_lengths_; +}; + +} // namespace generator + +namespace functor { + +template <typename Device, typename T, size_t Dims> +struct ReverseSequence { + EIGEN_ALWAYS_INLINE static void Compute( + const Device& d, typename TTypes<T, Dims>::ConstTensor input, + int32 seq_dim, TTypes<int64>::ConstVec seq_lengths, + typename TTypes<T, Dims>::Tensor output) { + generator::ReverseGenerator<T, Dims> generator(input, seq_dim, seq_lengths); + output.device(d) = input.generate(generator); + } +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_REVERSE_SEQUENCE_OP_H_ diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc new file mode 100644 index 0000000000..7b5d533026 --- /dev/null +++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc @@ -0,0 +1,26 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/reverse_sequence_op.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_GPU_SPEC(T, dims) \ + template class generator::ReverseGenerator<T, dims>; \ + template struct functor::ReverseSequence<GPUDevice, T, dims>; + +#define DEFINE_GPU_SPECS(T) \ + DEFINE_GPU_SPEC(T, 2); \ + DEFINE_GPU_SPEC(T, 3); \ + DEFINE_GPU_SPEC(T, 4); \ + DEFINE_GPU_SPEC(T, 5); + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS); + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/save_op.cc b/tensorflow/core/kernels/save_op.cc new file mode 100644 index 0000000000..71a15c643e --- /dev/null +++ b/tensorflow/core/kernels/save_op.cc @@ -0,0 +1,81 @@ +// See docs in ../ops/io_ops.cc +#include "tensorflow/core/kernels/io.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include 
"tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/util/tensor_slice_writer.h" + +namespace tensorflow { + +class SaveOp : public OpKernel { + public: + explicit SaveOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + SaveTensors(context, &checkpoint::CreateTableTensorSliceBuilder, false); + } +}; + +REGISTER_KERNEL_BUILDER(Name("Save").Device(DEVICE_CPU), SaveOp); + +class SaveSlicesOp : public OpKernel { + public: + explicit SaveSlicesOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + SaveTensors(context, &checkpoint::CreateTableTensorSliceBuilder, true); + } +}; + +REGISTER_KERNEL_BUILDER(Name("SaveSlices").Device(DEVICE_CPU), SaveSlicesOp); + +class ShardedFilenameOp : public OpKernel { + public: + explicit ShardedFilenameOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + static const char* input_names[3] = {"basename", "shard", "num_shards"}; + for (int i = 0; i < ctx->num_inputs(); ++i) { + OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(ctx->input(i).shape()), + errors::InvalidArgument( + input_names[i], " must be a scalar, got shape ", + ctx->input(i).shape().ShortDebugString())); + } + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out)); + out->scalar<string>()() = strings::Printf( + "%s-%05d-of-%05d", ctx->input(0).scalar<string>()().c_str(), + ctx->input(1).scalar<int32>()(), ctx->input(2).scalar<int32>()()); + } +}; + +REGISTER_KERNEL_BUILDER(Name("ShardedFilename").Device(DEVICE_CPU), + ShardedFilenameOp); + +class ShardedFilespecOp : public OpKernel { + public: + explicit ShardedFilespecOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + static const char* input_names[2] = {"basename", "num_shards"}; + for (int i = 0; i < ctx->num_inputs(); ++i) { + OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(ctx->input(i).shape()), + errors::InvalidArgument( + input_names[i], " must be a scalar, got shape ", + ctx->input(i).shape().ShortDebugString())); + } + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out)); + out->scalar<string>()() = strings::Printf( + "%s-\?\?\?\?\?-of-%05d", ctx->input(0).scalar<string>()().c_str(), + ctx->input(1).scalar<int32>()()); + } +}; +REGISTER_KERNEL_BUILDER(Name("ShardedFilespec").Device(DEVICE_CPU), + ShardedFilespecOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/save_op_test.cc b/tensorflow/core/kernels/save_op_test.cc new file mode 100644 index 0000000000..ee1ba492a6 --- /dev/null +++ b/tensorflow/core/kernels/save_op_test.cc @@ -0,0 +1,443 @@ +#include <functional> +#include <memory> +#include <vector> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/io/path.h" +#include 
"tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/tensor_slice_reader.h" +#include <gtest/gtest.h> + +namespace tensorflow { +namespace { + +class SaveOpTest : public OpsTestBase { + protected: + void MakeOp() { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "Save") + .Input(FakeInput()) + .Input(FakeInput()) + .Input(FakeInput( + {DT_INT32, DT_FLOAT, DT_DOUBLE, DT_QINT8, DT_QINT32})) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(SaveOpTest, Simple) { + const string filename = io::JoinPath(testing::TmpDir(), "tensor_simple"); + const string tensornames[] = {"tensor_int", "tensor_float", "tensor_double", + "tensor_qint8", "tensor_qint32"}; + + MakeOp(); + // Add a file name + AddInput<string>(TensorShape({}), + [&filename](int x) -> string { return filename; }); + + // Add the tensor names + AddInput<string>(TensorShape({5}), + [&tensornames](int x) -> string { return tensornames[x]; }); + + // Add a 1-d integer tensor + AddInput<int32>(TensorShape({10}), [](int x) -> int32 { return x + 1; }); + + // Add a 2-d float tensor + AddInput<float>(TensorShape({2, 4}), + [](int x) -> float { return static_cast<float>(x) / 10; }); + + // Add a 2-d double tensor + AddInput<double>(TensorShape({2, 4}), + [](int x) -> double { return static_cast<double>(x) / 20; }); + + // Add a 2-d qint8 tensor + AddInput<qint8>(TensorShape({3, 2}), + [](int x) -> qint8 { return *reinterpret_cast<qint8*>(&x); }); + + // Add a 2-d qint32 tensor + AddInput<qint32>(TensorShape({2, 3}), [](int x) -> qint32 { + return *reinterpret_cast<qint32*>(&x) * qint8(2); + }); + + ASSERT_OK(RunOpKernel()); + + // Check that the checkpoint file is properly written + checkpoint::TensorSliceReader reader(filename, + checkpoint::OpenTableTensorSliceReader); + EXPECT_OK(reader.status()); + + // We expect to find all saved tensors + { + // The 1-d integer tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_int", &shape, &type)); + TensorShape expected({10}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_INT32, type); + + // We expect the tensor value to be correct. + TensorSlice s = TensorSlice::ParseOrDie("-"); + int data[10]; + std::fill_n(data, 10, 0); + EXPECT_TRUE(reader.CopySliceData("tensor_int", s, data)); + for (int i = 0; i < 10; ++i) { + EXPECT_EQ(i + 1, data[i]); + } + } + + { + // The 2-d float tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_float", &shape, &type)); + TensorShape expected({2, 4}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_FLOAT, type); + + // We expect the tensor value to be correct. + TensorSlice s = TensorSlice::ParseOrDie("-:-"); + float data[8]; + std::fill_n(data, 8, 0); + EXPECT_TRUE(reader.CopySliceData("tensor_float", s, data)); + for (int i = 0; i < 8; ++i) { + EXPECT_EQ(static_cast<float>(i) / 10, data[i]); + } + } + + { + // The 2-d double tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_double", &shape, &type)); + TensorShape expected({2, 4}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_DOUBLE, type); + + // We expect the tensor value to be correct. 
+ TensorSlice s = TensorSlice::ParseOrDie("-:-"); + double data[8]; + std::fill_n(data, 8, 0); + EXPECT_TRUE(reader.CopySliceData("tensor_double", s, data)); + for (int i = 0; i < 8; ++i) { + EXPECT_EQ(static_cast<double>(i) / 20, data[i]); + } + } + + { + // The 2-d qint8 tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_qint8", &shape, &type)); + TensorShape expected({3, 2}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_QINT8, type); + + // We expect the tensor value to be correct. + TensorSlice s = TensorSlice::ParseOrDie("-:-"); + qint8 data[6]; + EXPECT_TRUE(reader.CopySliceData("tensor_qint8", s, data)); + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(*reinterpret_cast<qint8*>(&i), data[i]); + } + } + + { + // The 2-d qint32 tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_qint32", &shape, &type)); + TensorShape expected({2, 3}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_QINT32, type); + + // We expect the tensor value to be correct. + TensorSlice s = TensorSlice::ParseOrDie("-:-"); + qint32 data[6]; + EXPECT_TRUE(reader.CopySliceData("tensor_qint32", s, data)); + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(*reinterpret_cast<qint32*>(&i) * qint8(2), data[i]); + } + } +} + +class SaveSlicesOpTest : public OpsTestBase { + protected: + void MakeOp() { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "SaveSlices") + .Input(FakeInput()) + .Input(FakeInput()) + .Input(FakeInput()) + .Input(FakeInput( + {DT_INT32, DT_FLOAT, DT_DOUBLE, DT_QINT8, DT_QINT32})) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +// Here we save only slices. We restore them in a larger tensor and we check +// that the right slice is restored. It is quite tricky to check that the +// right slices are actually restored so instead we just check that +// CopySliceData() return true/false depending on the slice we ask for. +TEST_F(SaveSlicesOpTest, Slices) { + const string filename = io::JoinPath(testing::TmpDir(), "tensor_slices"); + const string tensornames[] = {"tensor_int", "tensor_float", "tensor_double", + "tensor_qint8", "tensor_qint32"}; + // Specifies that the data we save are slices of larger tensors. + // See core/framework/tensor_slice.h for the slice syntax. + const string tensorshapes[] = { + "10 -", // Full contents of a 10 element vector. + "2 4 -:0,2", // A 2x2 slice of a 2x4 tensor. + "2 4 0,1:2,2", // A 1x2 slice of a 2x4 tensor. + "3 2 -:-", // Full contents of a 3x2 tensor. + "2 3 1,1:2,1" // Another 1x1 slice of a2x3 tensor. 
+ }; + + MakeOp(); + // Add a file name + AddInput<string>(TensorShape({}), + [&filename](int x) -> string { return filename; }); + + // Add the tensor names + AddInput<string>(TensorShape({5}), + [&tensornames](int x) -> string { return tensornames[x]; }); + + // Add the tensor shapes and slices + AddInput<string>(TensorShape({5}), [&tensorshapes](int x) -> string { + return tensorshapes[x]; + }); + + // Add a 1-d integer tensor + AddInput<int32>(TensorShape({10}), [](int x) -> int32 { return x + 1; }); + + // Add a 2-d float tensor + AddInput<float>(TensorShape({2, 2}), + [](int x) -> float { return static_cast<float>(x) / 10; }); + + // Add a 2-d double tensor + AddInput<double>(TensorShape({1, 2}), + [](int x) -> double { return static_cast<double>(x) / 20; }); + + // Add a 2-d qint8 tensor + AddInput<qint8>(TensorShape({3, 2}), + [](int x) -> qint8 { return *reinterpret_cast<qint8*>(&x); }); + + // Add a 2-d qint32 tensor + AddInput<qint32>(TensorShape({1, 1}), [](int x) -> qint32 { + return *reinterpret_cast<qint32*>(&x) * qint8(2); + }); + + ASSERT_OK(RunOpKernel()); + + // Check that the checkpoint file is properly written + checkpoint::TensorSliceReader reader(filename, + checkpoint::OpenTableTensorSliceReader); + EXPECT_OK(reader.status()); + + // We expect to find all saved tensors + { + // The 1-d integer tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_int", &shape, &type)); + TensorShape expected({10}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_INT32, type); + + // We saved the full tensor so we should be able to read it all. + TensorSlice s = TensorSlice::ParseOrDie("-"); + int data[10]; + EXPECT_TRUE(reader.CopySliceData("tensor_int", s, data)); + } + + { + // The 2-d float tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_float", &shape, &type)); + TensorShape expected({2, 4}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_FLOAT, type); + + // We saved the slice "-:0,2" so we should not be able to read the full + // tensor. + TensorSlice full_slice = TensorSlice::ParseOrDie("-:-"); + TensorSlice saved_slice = TensorSlice::ParseOrDie("-:0,2"); + float data[8]; + EXPECT_FALSE(reader.CopySliceData("tensor_float", full_slice, data)); + EXPECT_TRUE(reader.CopySliceData("tensor_float", saved_slice, data)); + } + + { + // The 2-d double tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_double", &shape, &type)); + TensorShape expected({2, 4}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_DOUBLE, type); + + // We saved the slice "0,1:2,2" so we should not be able to read the full + // tensor. + TensorSlice full_slice = TensorSlice::ParseOrDie("-:-"); + TensorSlice saved_slice = TensorSlice::ParseOrDie("0,1:2,2"); + double data[8]; + EXPECT_FALSE(reader.CopySliceData("tensor_double", full_slice, data)); + EXPECT_TRUE(reader.CopySliceData("tensor_double", saved_slice, data)); + } + + { + // The 2-d qint8 tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_qint8", &shape, &type)); + TensorShape expected({3, 2}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_QINT8, type); + + // We saved the full slice. 
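+    // Since the full extent was written, reading back with the "-:-" slice
+    // succeeds here, unlike the partially-saved float and double tensors
+    // above.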
+ TensorSlice s = TensorSlice::ParseOrDie("-:-"); + qint8 data[6]; + EXPECT_TRUE(reader.CopySliceData("tensor_qint8", s, data)); + } + + { + // The 2-d qint32 tensor + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("tensor_qint32", &shape, &type)); + TensorShape expected({2, 3}); + EXPECT_TRUE(shape.IsSameSize(expected)); + EXPECT_EQ(DT_QINT32, type); + + // We expect the tensor value to be correct. + TensorSlice s = TensorSlice::ParseOrDie("1,1:2,1"); + TensorSlice full_slice = TensorSlice::ParseOrDie("-:-"); + TensorSlice saved_slice = TensorSlice::ParseOrDie("1,1:2,1"); + qint32 data[6]; + EXPECT_FALSE(reader.CopySliceData("tensor_qint32", full_slice, data)); + EXPECT_TRUE(reader.CopySliceData("tensor_qint32", saved_slice, data)); + } +} + +class SaveOpSlices2Test : public OpsTestBase { + protected: + void MakeOp() { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "SaveSlices") + .Input(FakeInput()) + .Input(FakeInput()) + .Input(FakeInput()) + .Input(FakeInput({DT_INT32, DT_INT32, DT_FLOAT})) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(SaveOpSlices2Test, TwoSlices) { + const string filename = io::JoinPath(testing::TmpDir(), "three_slices"); + // We will save 2 slices of the tensor named "four_by_sixteen" which is 4x16, + // and one slice of the "small" tensor. + const string tensornames[] = {"four_by_sixteen", "four_by_sixteen", "small"}; + const string tensorshapes[] = { + // Slice specifications for the 2 slices of "four_by_sixteen" + "4 16 0,2:-", // 1st slice covers indices 0 and 1 in the first dim. + "4 16 2,2:-", // 2nd slice covers indices 2 and 3 in the first dim. + "" // We save the full "small" tensors. + }; + + MakeOp(); + // Add a file name + AddInput<string>(TensorShape({}), + [&filename](int x) -> string { return filename; }); + + // Add the tensor names + AddInput<string>(TensorShape({3}), + [&tensornames](int x) -> string { return tensornames[x]; }); + + // Add the tensor shapes and slices + AddInput<string>(TensorShape({3}), [&tensorshapes](int x) -> string { + return tensorshapes[x]; + }); + + // Add an integer tensor for slice 0,2:- of a 4x16 tensor: It is 2x16. + AddInput<int32>(TensorShape({2, 16}), [](int x) -> int32 { return x + 1; }); + + // Add an integer tensor for slice 2,2:- of a 4x16 tensor: It is 2x16. + AddInput<int32>(TensorShape({2, 16}), + [](int x) -> int32 { return 10 * (x + 1); }); + + // Add a float tensor for "small" + AddInput<float>(TensorShape({2, 4}), + [](int x) -> float { return static_cast<float>(x) / 10; }); + + ASSERT_OK(RunOpKernel()); + + // Check that the checkpoint file is properly written + checkpoint::TensorSliceReader reader(filename, + checkpoint::OpenTableTensorSliceReader); + EXPECT_OK(reader.status()); + + { + // Reload the two slices of "four_by_sixteen" into that tensor. + Tensor reloaded(DT_INT32, {4, 16}); + + // We expect to find all slices + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("four_by_sixteen", &shape, &type)); + EXPECT_TRUE(shape.IsSameSize(reloaded.shape())); + EXPECT_EQ(type, reloaded.dtype()); + + // Reload the whole tensor. 
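+    // TensorSlice(reloaded.dims()) denotes a rank-2 slice spanning the full
+    // extent (the equivalent of "-:-"), so the two saved row slices are read
+    // back and stitched together into the 4x16 tensor.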
+ EXPECT_TRUE(reader.CopySliceData("four_by_sixteen", + TensorSlice(reloaded.dims()), + reloaded.flat<int>().data())); + + { + auto slice = reloaded.Slice(0, 2).flat<int>(); + for (int i = 0; i < slice.size(); ++i) { + EXPECT_EQ(i + 1, slice(i)); + } + } + { + auto slice = reloaded.Slice(2, 4).flat<int>(); + for (int i = 0; i < slice.size(); ++i) { + EXPECT_EQ(10 * (i + 1), slice(i)); + } + } + } + + { + // Reload the small float tensor. + Tensor reloaded(DT_FLOAT, {2, 4}); + + TensorShape shape; + DataType type; + EXPECT_TRUE(reader.HasTensor("small", &shape, &type)); + EXPECT_TRUE(shape.IsSameSize(reloaded.shape())); + EXPECT_EQ(DT_FLOAT, reloaded.dtype()); + + EXPECT_TRUE(reader.CopySliceData("small", TensorSlice(reloaded.dims()), + reloaded.flat<float>().data())); + + for (int64 i = 0; i < reloaded.NumElements(); ++i) { + EXPECT_EQ(static_cast<float>(i) / 10, reloaded.flat<float>().data()[i]); + } + } +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc new file mode 100644 index 0000000000..88fcc1bdcc --- /dev/null +++ b/tensorflow/core/kernels/scatter_op.cc @@ -0,0 +1,167 @@ +// See docs in ../ops/state_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +enum class UpdateOp { ASSIGN, ADD, SUB }; + +template <class T, typename Index, UpdateOp op> +class ScatterUpdateOp : public OpKernel { + public: + // QUESTION: It'd be nice to support DT_INT16, DT_UINT8, + // etc. here. Should we have the framework do some sort of + // integer promotion automatically, or should that be something + // that users have to do explicitly with a conversion operator + // in the graph? 
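+  // Roughly, for each i in indices: params[indices[i], ...] op= updates[i, ...].
+  // For example, with op == ASSIGN, params of shape {5, 3} initialized to
+  // zero, indices = {0, 4, 2} and update rows u0, u1, u2, rows 0, 4 and 2 of
+  // params become u0, u1 and u2 respectively (see Simple_TwoD32 in
+  // scatter_op_test.cc).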
+ explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* c) override { + if (use_exclusive_lock_) { + // Hold mutex while we apply updates + mutex_lock l(*c->input_ref_mutex(0)); + DoCompute(c); + } else { + DoCompute(c); + } + } + + private: + bool use_exclusive_lock_; + + // Check whether updates.shape = indices.shape + params.shape[1:] + static bool ValidShapes(const Tensor& params, const Tensor& updates, + const Tensor& indices) { + if (updates.dims() != indices.dims() + params.dims() - 1) return false; + for (int d = 0; d < indices.dims(); d++) { + if (updates.dim_size(d) != indices.dim_size(d)) { + return false; + } + } + for (int d = 1; d < params.dims(); d++) { + if (params.dim_size(d) != updates.dim_size(d - 1 + indices.dims())) { + return false; + } + } + return true; + } + + void DoCompute(OpKernelContext* c) { + Tensor Tparams = c->mutable_input(0, use_exclusive_lock_); + OP_REQUIRES(c, Tparams.IsInitialized(), + errors::FailedPrecondition("Null ref for params")); + const Tensor& Tindices = c->input(1); + const Tensor& Tupdates = c->input(2); + OP_REQUIRES( + c, TensorShapeUtils::IsVectorOrHigher(Tparams.shape()), + errors::InvalidArgument("params must be at least 1-D, got shape ", + Tparams.shape().ShortDebugString())); + OP_REQUIRES( + c, ValidShapes(Tparams, Tupdates, Tindices), + errors::InvalidArgument( + "Must have updates.shape = indices.shape + params.shape[1:], got ", + "updates.shape ", Tupdates.shape().ShortDebugString(), + ", indices.shape ", Tindices.shape().ShortDebugString(), + ", params.shape ", Tparams.shape().ShortDebugString())); + const Index N = Tindices.NumElements(); + + // We always return the input ref. 
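+    // The Ref output aliases the params input, so consumers of the output
+    // see the scattered updates applied in place.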
+ c->forward_ref_input_to_ref_output(0, 0); + + if (N > 0) { + const Index first_dim_size = Tparams.dim_size(0); + // Validate all the indices are in range + auto Tindices_vec = Tindices.flat<Index>(); + for (Index i = 0; i < N; i++) { + const Index index = Tindices_vec(i); + OP_REQUIRES(c, index >= 0 && index < first_dim_size, + errors::InvalidArgument( + strings::StrCat("Index ", index, " at offset ", i, + " in indices is out of range"))); + } + auto Tparams_flat = Tparams.flat_outer_dims<T>(); + auto Tupdates_flat = + Tupdates.shaped<T, 2>({N, Tupdates.NumElements() / N}); + for (Index i = 0; i < N; i++) { + // Copy last Ndim-1 dimensions of Tupdates[i] to + // Tparams[Tindices[i]] + switch (op) { + case UpdateOp::ASSIGN: { + Tparams_flat.template chip<0>(Tindices_vec(i)) = + Tupdates_flat.template chip<0>(i); + break; + } + case UpdateOp::ADD: { + Tparams_flat.template chip<0>(Tindices_vec(i)) += + Tupdates_flat.template chip<0>(i); + break; + } + case UpdateOp::SUB: { + Tparams_flat.template chip<0>(Tindices_vec(i)) -= + Tupdates_flat.template chip<0>(i); + break; + } + } + } + } + } +}; + +#define REGISTER_SCATTER_UPDATE(type, index_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("ScatterUpdate") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + ScatterUpdateOp<type, index_type, UpdateOp::ASSIGN>); + +#define REGISTER_SCATTER_UPDATE_INT32(type) REGISTER_SCATTER_UPDATE(type, int32) +#define REGISTER_SCATTER_UPDATE_INT64(type) REGISTER_SCATTER_UPDATE(type, int64) + +TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_UPDATE_INT32); +TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_UPDATE_INT64); + +#undef REGISTER_SCATTER_UPDATE_INT64 +#undef REGISTER_SCATTER_UPDATE_INT32 +#undef REGISTER_SCATTER_UPDATE + +#define REGISTER_SCATTER_ADD(type, index_type) \ + REGISTER_KERNEL_BUILDER(Name("ScatterAdd") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + ScatterUpdateOp<type, index_type, UpdateOp::ADD>); + +#define REGISTER_SCATTER_ADD_INT32(type) REGISTER_SCATTER_ADD(type, int32) +#define REGISTER_SCATTER_ADD_INT64(type) REGISTER_SCATTER_ADD(type, int64) + +TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ADD_INT32); +TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ADD_INT64); + +#undef REGISTER_SCATTER_ADD_INT32 +#undef REGISTER_SCATTER_ADD_INT64 +#undef REGISTER_SCATTER_ADD + +#define REGISTER_SCATTER_SUB(type, index_type) \ + REGISTER_KERNEL_BUILDER(Name("ScatterSub") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + ScatterUpdateOp<type, index_type, UpdateOp::SUB>); + +#define REGISTER_SCATTER_SUB_INT32(type) REGISTER_SCATTER_SUB(type, int32) +#define REGISTER_SCATTER_SUB_INT64(type) REGISTER_SCATTER_SUB(type, int64) + +TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_SUB_INT32); +TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_SUB_INT64); + +#undef REGISTER_SCATTER_SUB_INT64 +#undef REGISTER_SCATTER_SUB_INT32 +#undef REGISTER_SCATTER_SUB + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc new file mode 100644 index 0000000000..8885f1edb3 --- /dev/null +++ b/tensorflow/core/kernels/scatter_op_test.cc @@ -0,0 +1,255 @@ +#include <functional> +#include <memory> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include 
"tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { +namespace { + +class ScatterUpdateOpTest : public OpsTestBase { + protected: + void MakeOp(DataType index_type) { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "ScatterUpdate") + .Input(FakeInput(DT_FLOAT_REF)) + .Input(FakeInput(index_type)) + .Input(FakeInput(DT_FLOAT)) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(ScatterUpdateOpTest, Simple_TwoD32) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2}); + AddInputFromArray<float>(TensorShape({3, 3}), + {100, 101, 102, 777, 778, 779, 10000, 10001, 10002}); + ASSERT_OK(RunOpKernel()); + + // Check the new state of the input + Tensor params_tensor = *mutable_input(0).tensor; + Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3})); + test::FillValues<float>(&expected, {100, 101, 102, 0, 0, 0, 10000, 10001, + 10002, 0, 0, 0, 777, 778, 779}); + test::ExpectTensorEqual<float>(expected, params_tensor); +} + +TEST_F(ScatterUpdateOpTest, Simple_Two64) { + MakeOp(DT_INT64); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + AddInputFromArray<int64>(TensorShape({3}), {0, 4, 2}); + AddInputFromArray<float>(TensorShape({3, 3}), + {100, 101, 102, 777, 778, 779, 10000, 10001, 10002}); + ASSERT_OK(RunOpKernel()); + + // Check the new state of the input + Tensor params_tensor = *mutable_input(0).tensor; + Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3})); + test::FillValues<float>(&expected, {100, 101, 102, 0, 0, 0, 10000, 10001, + 10002, 0, 0, 0, 777, 778, 779}); + test::ExpectTensorEqual<float>(expected, params_tensor); +} + +TEST_F(ScatterUpdateOpTest, Simple_ZeroD) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5}), {0, 0, 0, 0, 0}); + AddInputFromArray<int32>(TensorShape({}), {3}); + AddInputFromArray<float>(TensorShape({}), {101}); + ASSERT_OK(RunOpKernel()); + + // Check the new state of the input + Tensor params_tensor = *mutable_input(0).tensor; + Tensor expected(allocator(), DT_FLOAT, TensorShape({5})); + test::FillValues<float>(&expected, {0, 0, 0, 101, 0}); + test::ExpectTensorEqual<float>(expected, params_tensor); +} + +TEST_F(ScatterUpdateOpTest, Simple_OneD) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5}), {0, 0, 0, 0, 0}); + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2}); + AddInputFromArray<float>(TensorShape({3}), {100, 101, 102}); + ASSERT_OK(RunOpKernel()); + + // Check the new state of the input + Tensor params_tensor = *mutable_input(0).tensor; + Tensor expected(allocator(), DT_FLOAT, TensorShape({5})); + test::FillValues<float>(&expected, {100, 0, 102, 0, 101}); + test::ExpectTensorEqual<float>(expected, params_tensor); +} + +TEST_F(ScatterUpdateOpTest, HigherRank) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({8}), {0, 0, 0, 0, 0, 0, 0, 0}); + 
AddInputFromArray<int32>(TensorShape({2, 3}), {0, 4, 2, 1, 3, 6}); + AddInputFromArray<float>(TensorShape({2, 3}), {10, 20, 30, 40, 50, 60}); + ASSERT_OK(RunOpKernel()); + + // Check the new state of the input + Tensor params_tensor = *mutable_input(0).tensor; + Tensor expected(allocator(), DT_FLOAT, TensorShape({8})); + test::FillValues<float>(&expected, {10, 40, 30, 50, 20, 0, 60, 0}); + test::ExpectTensorEqual<float>(expected, params_tensor); +} + +TEST_F(ScatterUpdateOpTest, Error_IndexOutOfRange) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 99}); + AddInputFromArray<float>(TensorShape({3, 3}), + {100, 101, 102, 777, 778, 779, 10000, 10001, 10002}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("Index 99 at offset 2 in indices is out of range")) + << s; +} + +TEST_F(ScatterUpdateOpTest, Error_WrongDimsIndices) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({2, 3}), {0, 0, 0, 0, 0, 0}); + AddInputFromArray<int32>(TensorShape({1, 3}), {0, 4, 99}); + AddInputFromArray<float>(TensorShape({3, 3}), + {100, 101, 102, 777, 778, 779, 10000, 10001, 10002}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("Must have updates.shape = indices.shape + " + "params.shape[1:], got ")) + << s; +} + +TEST_F(ScatterUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2}); + AddInputFromArray<float>( + TensorShape({3, 4}), + {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("Must have updates.shape = indices.shape + " + "params.shape[1:], got ")) + + << s; +} + +TEST_F(ScatterUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) { + MakeOp(DT_INT32); + + // Feed and run + AddInputFromArray<float>(TensorShape({5, 3}), + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + AddInputFromArray<int32>(TensorShape({3}), {0, 4, 2}); + AddInputFromArray<float>(TensorShape({2, 3}), + {100, 101, 102, 10000, 10001, 10002}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()) + .contains("Must have updates.shape = indices.shape + " + "params.shape[1:], got ")) + << s; +} + +class ScatterUpdateBM : public ScatterUpdateOpTest { + public: + virtual void TestBody() {} + void MakeBenchmarkOp(const char* op, DataType index_type) { + ASSERT_OK(NodeDefBuilder("myop", op) + .Input(FakeInput(DT_FLOAT_REF)) + .Input(FakeInput(index_type)) + .Input(FakeInput(DT_FLOAT)) + .Finalize(node_def())); + TF_CHECK_OK(InitOp()); + } +}; + +template <typename Index> +static void BM_ScatterHelper(int iters, int embedding_size, const char* op) { + testing::StopTiming(); + const int kRows = 10000000 / embedding_size; + std::vector<float> values; + for (int i = 0; i < kRows * embedding_size; i++) { + values.push_back(i); + } + const int kNumUpdates = 1000; + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + std::vector<Index> indices; + std::vector<float> updates; + for (int i = 0; i < kNumUpdates; i++) { + indices.push_back(rnd.Uniform(kRows)); + for (int j = 0; j < embedding_size; j++) { + updates.push_back(i * 10 + j); + } + } + + ScatterUpdateBM bm; + 
bm.MakeBenchmarkOp(op, DataTypeToEnum<Index>::v()); + bm.AddInputFromArray<float>(TensorShape({kRows, embedding_size}), values); + bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices); + bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}), + updates); + testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) * + iters); + testing::StartTiming(); + while (iters-- > 0) { + Status s = bm.RunOpKernel(); + } +} + +static void BM_ScatterUpdateInt32(int iters, int embedding_size) { + BM_ScatterHelper<int32>(iters, embedding_size, "ScatterUpdate"); +} +static void BM_ScatterUpdateInt64(int iters, int embedding_size) { + BM_ScatterHelper<int64>(iters, embedding_size, "ScatterUpdate"); +} + +static void BM_ScatterAddInt32(int iters, int embedding_size) { + BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd"); +} +static void BM_ScatterAddInt64(int iters, int embedding_size) { + BM_ScatterHelper<int64>(iters, embedding_size, "ScatterAdd"); +} + +BENCHMARK(BM_ScatterUpdateInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024); +BENCHMARK(BM_ScatterUpdateInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024); + +BENCHMARK(BM_ScatterAddInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024); +BENCHMARK(BM_ScatterAddInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc new file mode 100644 index 0000000000..2b6a8c5a88 --- /dev/null +++ b/tensorflow/core/kernels/segment_reduction_ops.cc @@ -0,0 +1,466 @@ +// See docs in ../ops/math_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/Eigen/Core" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +// This operator handles reducing segments along the first dimension. +// See core/ops/math_ops.cc for more details. +template <typename Device, class T, class Index, typename Reducer> +class SegmentReductionOp : public OpKernel { + public: + explicit SegmentReductionOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& segment_ids = context->input(1); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()), + errors::InvalidArgument("segment_ids should be a vector.")); + const int64 num_indices = segment_ids.NumElements(); + OP_REQUIRES(context, num_indices == input.dim_size(0), + errors::InvalidArgument( + "segment_ids should be the same size as dimension 0 of" + " input.")); + + auto input_flat = input.flat_outer_dims<T>(); + const int64 num_col = input_flat.dimension(1); + + const auto segment_vec = segment_ids.vec<Index>(); + // Note that the current implementation assumes that segment_vec values are + // sorted. + const Index output_rows = + num_indices > 0 ? 
segment_vec(num_indices - 1) + 1 : 0; + + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, output_rows); + + // Note that we do not initialize the output buffer with a default value. + // We require that segment ids be sorted and cover all values (otherwise we + // return an error). + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + auto output_flat = output->flat_outer_dims<T>(); + +#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::DSizes<Eigen::DenseIndex, 1> dims_to_reduce; + dims_to_reduce[0] = 0; +#else + Eigen::IndexList<Eigen::type2index<0>> dims_to_reduce; +#endif + Index start = 0, end = 1; + // TODO(agarwal): if this loop becomes a bottleneck, consider sharding it + // across threads. + Eigen::DSizes<Eigen::DenseIndex, 1> out_slice_shape(num_col); + while (end <= num_indices) { + if (end < num_indices) { + if (segment_vec(start) == segment_vec(end)) { + ++end; + continue; + } + // We have a new segment here. Verify that the segment ids grow by one + // each time, so that we cover every possible output value. + OP_REQUIRES( + context, segment_vec(start) + 1 == segment_vec(end), + errors::InvalidArgument("segment ids are not increasing by 1")); + } + + // Process segment [start, end) + const T* in_slice_ptr = &input_flat(start, 0); + typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>, + Eigen::Unaligned> OutT; + T* out_slice_ptr = &output_flat(segment_vec(start), 0); + OutT out_slice(out_slice_ptr, out_slice_shape); + // We don't use out_slice.device(context->egien_device<Device>) + // because these pieces of work are likely to be very small and + // the context switching overhead dwarfs any benefit we get from + // using another thread to do this work. + if (start == end - 1) { + typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>, + Eigen::Unaligned> InT; + InT in_slice(in_slice_ptr, out_slice_shape); + out_slice = in_slice; + } else { + Eigen::DSizes<Eigen::DenseIndex, 2> in_slice_shape(end - start, + num_col); + typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>, + Eigen::Unaligned> InT; + InT in_slice(in_slice_ptr, in_slice_shape); + + out_slice = in_slice.reduce(dims_to_reduce, Reducer()); + } + start = end; + ++end; + } + } +}; + +#define REGISTER_CPU_KERNELS(type, index_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("SegmentSum") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + SegmentReductionOp<CPUDevice, type, index_type, \ + Eigen::internal::SumReducer<type>>); \ + REGISTER_KERNEL_BUILDER( \ + Name("SegmentMean") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + SegmentReductionOp<CPUDevice, type, index_type, \ + Eigen::internal::MeanReducer<type>>); \ + REGISTER_KERNEL_BUILDER( \ + Name("SegmentProd") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + SegmentReductionOp<CPUDevice, type, index_type, \ + Eigen::internal::ProdReducer<type>>); \ + REGISTER_KERNEL_BUILDER( \ + Name("SegmentMin") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + SegmentReductionOp<CPUDevice, type, index_type, \ + Eigen::internal::MinReducer<type>>); \ + REGISTER_KERNEL_BUILDER( \ + Name("SegmentMax") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + SegmentReductionOp<CPUDevice, type, index_type, \ + 
Eigen::internal::MaxReducer<type>>); + +#define REGISTER_CPU_KERNELS_ALL(type) \ + REGISTER_CPU_KERNELS(type, int32); \ + REGISTER_CPU_KERNELS(type, int64); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS_ALL); +#undef REGISTER_CPU_KERNELS +#undef REGISTER_CPU_KERNELS_ALL + +// Similar to SegmentReductionOp but can handle unsorted segment definitions and +// specifying size of output. +template <typename Device, class T, class Index> +class UnsortedSegmentSumOp : public OpKernel { + public: + explicit UnsortedSegmentSumOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& data = context->input(0); + const Tensor& segment_ids = context->input(1); + const Tensor& num_segments = context->input(2); + + OP_REQUIRES( + context, TensorShapeUtils::IsLegacyScalar(num_segments.shape()), + errors::InvalidArgument("num_segments should be a scalar, not shape ", + num_segments.shape().ShortDebugString())); + + OP_REQUIRES(context, + TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()), + errors::InvalidArgument( + "data.shape = ", data.shape().ShortDebugString(), + " does not start with segment_ids.shape = ", + segment_ids.shape().ShortDebugString())); + + const auto segment_flat = segment_ids.flat<Index>(); + const int32 N = segment_flat.dimension(0); + const int32 output_rows = num_segments.scalar<int32>()(); + + if (N > 0) { + Eigen::Tensor<Index, 0, Eigen::RowMajor> m = segment_flat.maximum(); + OP_REQUIRES( + context, m() < output_rows, + errors::InvalidArgument("More segments found than output size")); + } + + TensorShape output_shape; + output_shape.AddDim(output_rows); + for (int i = segment_ids.dims(); i < data.dims(); i++) { + output_shape.AddDim(data.dim_size(i)); + } + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + auto output_flat = output->flat_outer_dims<T>(); + output_flat.setZero(); + + if (data.NumElements() > 0) { + auto data_flat = data.shaped<T, 2>({N, data.NumElements() / N}); + for (int i = 0; i < N; ++i) { + output_flat.template chip<0>(segment_flat(i)) += + data_flat.template chip<0>(i); + } + } + } +}; + +#define REGISTER_CPU_UNSORTED_KERNELS(type, index_type) \ + REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + UnsortedSegmentSumOp<CPUDevice, type, index_type>); + +#define REGISTER_CPU_UNSORTED_KERNELS_ALL(type) \ + REGISTER_CPU_UNSORTED_KERNELS(type, int32); \ + REGISTER_CPU_UNSORTED_KERNELS(type, int64); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_UNSORTED_KERNELS_ALL); +#undef REGISTER_CPU_UNSORTED_KERNELS +#undef REGISTER_CPU_UNSORTED_KERNELS_ALL + +// Same as SegmentReductionOp but takes as input a "sparse" tensor, represented +// by two dense tensors, one containing the data, and the other containing +// indices into the data. 
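+// For example (an illustrative case, not taken from the op docs): with
+//   data        = [[1, 2], [3, 4], [5, 6]]
+//   indices     = [0, 2, 2]
+//   segment_ids = [0, 0, 1]
+// SparseSegmentSum adds data rows 0 and 2 into segment 0 and copies data
+// row 2 into segment 1, producing [[6, 8], [5, 6]]; SparseSegmentMean would
+// further divide each segment by its count, giving [[3, 4], [5, 6]].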
+template <typename Device, class T> +class SparseSegmentReductionOpBase : public OpKernel { + public: + explicit SparseSegmentReductionOpBase(OpKernelConstruction* context, + bool is_mean) + : OpKernel(context), is_mean_(is_mean) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& indices = context->input(1); + const Tensor& segment_ids = context->input(2); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices should be a vector.")); + OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()), + errors::InvalidArgument("segment_ids should be a vector.")); + + const int32 num_indices = indices.NumElements(); + OP_REQUIRES(context, num_indices == segment_ids.NumElements(), + errors::InvalidArgument( + "segment_ids and indices should have same size.")); + + auto input_flat = input.flat_outer_dims<T>(); + + const auto indices_vec = indices.vec<int32>(); + const auto segment_vec = segment_ids.vec<int32>(); + // Note that the current implementation assumes that segment_vec values are + // sorted. + const int32 output_rows = + num_indices > 0 ? segment_vec(num_indices - 1) + 1 : 0; + + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, output_rows); + + // Note that we do not initialize the output buffer with a default value. + // We require that segment ids be sorted and cover all values (otherwise we + // return an error). + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + if (num_indices == 0) return; + auto output_flat = output->flat_outer_dims<T>(); + + int32 start = 0, end = 1; + while (end <= num_indices) { + if (end < num_indices) { + if (segment_vec(start) == segment_vec(end)) { + ++end; + continue; + } + // We have a new segment here. Verify that the segment ids grow by one + // each time, so that we cover every possible output value. + OP_REQUIRES( + context, segment_vec(start) + 1 == segment_vec(end), + errors::InvalidArgument("segment ids are not increasing by 1")); + } + + auto out = output_flat.template chip<0>(segment_vec(start)); +#define I(i) input_flat.template chip<0>(indices_vec(start + i)) + int num = end - start; + if (num == 1) { + out = I(0); + } else { + int r = num % 8; + T m = (is_mean_ && (num < 10)) ? 
num : 1; + switch (r) { + case 2: + out = (I(0) + I(1)) / m; + break; + case 3: + out = (I(0) + I(1) + I(2)) / m; + break; + case 4: + out = (I(0) + I(1) + I(2) + I(3)) / m; + break; + case 5: + out = (I(0) + I(1) + I(2) + I(3) + I(4)) / m; + break; + case 6: + out = (I(0) + I(1) + I(2) + I(3) + I(4) + I(5)) / m; + break; + case 7: + out = (I(0) + I(1) + I(2) + I(3) + I(4) + I(5) + I(6)) / m; + break; + case 0: + out = (I(0) + I(1) + I(2) + I(3) + I(4) + I(5) + I(6) + I(7)) / m; + r = 8; + break; + case 1: + out = + (I(0) + I(1) + I(2) + I(3) + I(4) + I(5) + I(6) + I(7) + I(8)) / + m; + r = 9; + break; + } + for (; r < num; r += 8) { + out += I(r) + I(r + 1) + I(r + 2) + I(r + 3) + I(r + 4) + I(r + 5) + + I(r + 6) + I(r + 7); + } +#undef I + if (is_mean_ && num >= 10) { + out = out / static_cast<T>(num); + } + } + start = end; + ++end; + } + } + + private: + bool is_mean_; +}; + +template <typename Device, class T> +class SparseSegmentReductionMeanOp + : public SparseSegmentReductionOpBase<Device, T> { + public: + explicit SparseSegmentReductionMeanOp(OpKernelConstruction* context) + : SparseSegmentReductionOpBase<Device, T>(context, true /*is_mean*/) {} +}; + +template <typename Device, class T> +class SparseSegmentReductionSumOp + : public SparseSegmentReductionOpBase<Device, T> { + public: + explicit SparseSegmentReductionSumOp(OpKernelConstruction* context) + : SparseSegmentReductionOpBase<Device, T>(context, false /*is_mean*/) {} +}; + +#define REGISTER_CPU_SPARSE_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("SparseSegmentSum").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + SparseSegmentReductionSumOp<CPUDevice, type>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_SPARSE_KERNELS); +#undef REGISTER_CPU_SPARSE_KERNELS + +#define REGISTER_CPU_SPARSE_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("SparseSegmentMean").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + SparseSegmentReductionMeanOp<CPUDevice, type>); +REGISTER_CPU_SPARSE_KERNELS(float); +REGISTER_CPU_SPARSE_KERNELS(double); +#undef REGISTER_CPU_SPARSE_KERNELS + +template <class T> +class SparseSegmentMeanGradOp : public OpKernel { + public: + explicit SparseSegmentMeanGradOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& indices = context->input(1); + const Tensor& segment_ids = context->input(2); + const Tensor& output_dim0 = context->input(3); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices should be a vector.")); + OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()), + errors::InvalidArgument("segment_ids should be a vector.")); + OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(output_dim0.shape()), + errors::InvalidArgument("output_dim0 should be a scalar.")); + + const int64 N = indices.NumElements(); + OP_REQUIRES(context, N == segment_ids.NumElements(), + errors::InvalidArgument( + "segment_ids and indices should have same size.")); + const int32 M = output_dim0.scalar<int32>()(); + + auto input_flat = input.flat_outer_dims<T>(); + const auto indices_vec = indices.vec<int32>(); + const auto segment_vec = segment_ids.vec<int32>(); + + TensorShape output_shape = input.shape(); + output_shape.set_dim(0, M); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + if (M == 0 || N == 0) return; + + // Note that similar to SparseSegmentMean, we assume that 
segment_vec is + // already sorted and has non-negative values. + int num_segments = segment_vec(N - 1) + 1; + OP_REQUIRES(context, input.dim_size(0) == num_segments, + errors::InvalidArgument("Invalid number of segments")); + + // Compute scaling factors for input. + std::vector<double> scaling(num_segments, 0.0); + for (int64 i = 0; i < N; ++i) { + scaling[segment_vec(i)] += 1; + } + for (int i = 0; i < scaling.size(); ++i) { + scaling[i] = 1.0 / std::max(scaling[i], 1.0); + } + + auto output_flat = output->flat_outer_dims<T>(); + output_flat.setZero(); + std::vector<bool> is_modified(M, false); + + for (int64 i = 0; i < N; ++i) { + int output_idx = indices_vec(i); + int idx = segment_vec(i); + T scale = static_cast<T>(scaling[idx]); + if (is_modified[output_idx]) { + if (scale == 1.0) { + output_flat.template chip<0>(output_idx) += + input_flat.template chip<0>(idx); + } else { + output_flat.template chip<0>(output_idx) += + input_flat.template chip<0>(idx) * scale; + } + } else { + if (scale == 1.0) { + output_flat.template chip<0>(output_idx) = + input_flat.template chip<0>(idx); + } else { + output_flat.template chip<0>(output_idx) = + input_flat.template chip<0>(idx) * scale; + } + } + is_modified[output_idx] = true; + } + } +}; + +#define REGISTER_CPU_SPARSE_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("SparseSegmentMeanGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T"), \ + SparseSegmentMeanGradOp<type>); + +REGISTER_CPU_SPARSE_KERNELS(float); +REGISTER_CPU_SPARSE_KERNELS(double); + +#undef REGISTER_CPU_SPARSE_KERNELS +} // namespace tensorflow diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc new file mode 100644 index 0000000000..87647a21a8 --- /dev/null +++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc @@ -0,0 +1,157 @@ +#include <functional> + +#include "tensorflow/core/public/session_options.h" + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" + +namespace tensorflow { + +template <typename Index> +static void BM_SegmentReduction(int iters, string reduction, Index num_rows, + Index num_cols, Index segment_size) { + testing::StopTiming(); + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + // Create inputs + gtl::InlinedVector<TensorValue, 4> reduction_inputs; + TensorShape shape1({num_rows, num_cols}); + Tensor input1(DT_FLOAT, shape1); + reduction_inputs.push_back({nullptr, &input1}); + + TensorShape shape2({num_rows}); + Tensor input2(DataTypeToEnum<Index>::v(), shape2); + test::FillFn<Index>(&input2, [&num_rows, &segment_size](Index i) -> Index { + return std::min(i / segment_size, num_rows - 1); + }); + 
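+ // With this fill the segment ids come out as [0, 0, ..., 1, 1, ...]: each
+ // segment covers segment_size consecutive rows (the final id is clamped to
+ // num_rows - 1), which is the sorted, contiguous layout the Segment*
+ // kernels require.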
reduction_inputs.push_back({nullptr, &input2}); + + NodeDef reduction_node_def; + TF_CHECK_OK(NodeDefBuilder(reduction, reduction) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DataTypeToEnum<Index>::v())) + .Finalize(&reduction_node_def)); + Status status; + std::unique_ptr<OpKernel> reduction_op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), reduction_node_def, &status)); + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &reduction_inputs; + params.op_kernel = reduction_op.get(); + params.output_alloc_attr = [&device, &reduction_op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = + (reduction_op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + + std::unique_ptr<OpKernelContext> reduction_context( + new OpKernelContext(params)); + + reduction_op->Compute(reduction_context.get()); + TF_CHECK_OK(reduction_context->status()); + testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + delete reduction_context->release_output(0).tensor; + reduction_op->Compute(reduction_context.get()); + } + int64 bytes_per_iter = + static_cast<int64>(num_rows * num_cols * sizeof(float)); + testing::BytesProcessed(bytes_per_iter * iters); +} + +#define BM_Reduce(O, R, C, S) \ + static void BM_Reduce_##O##_##R##_##C##_##S##_int32(int iters) { \ + BM_SegmentReduction<int32>(iters, #O, R, C, S); \ + } \ + static void BM_Reduce_##O##_##R##_##C##_##S##_int64(int iters) { \ + BM_SegmentReduction<int64>(iters, #O, R, C, S); \ + } \ + BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32); \ + BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64); + +#define BM_Reduce_Arg(R, C, S) \ + BM_Reduce(SegmentSum, R, C, S); \ + BM_Reduce(SegmentMean, R, C, S); + +BM_Reduce_Arg(64, 32, 1); +BM_Reduce_Arg(4096, 128, 1); + +BM_Reduce_Arg(16, 8, 2); +BM_Reduce_Arg(64, 32, 2); +BM_Reduce_Arg(4096, 32, 2); +BM_Reduce_Arg(4096, 128, 2); + +static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) { + testing::StopTiming(); + RequireDefaultOps(); + Graph* g = new Graph(OpRegistry::Global()); + CHECK_LE(uniqueness, 1.0); + CHECK_GT(uniqueness, 0.0); + + const int kNumIndices = size; + Tensor indices(DT_INT32, TensorShape({kNumIndices})); + auto indices_flat = indices.flat<int32>(); + Tensor segments(DT_INT32, TensorShape({kNumIndices})); + auto segments_flat = segments.flat<int32>(); + + int kUniqueIndices = uniqueness * kNumIndices; + Tensor output_dim0(DT_INT32, TensorShape({})); + output_dim0.scalar<int32>()() = kUniqueIndices; + + for (int i = 0; i < kNumIndices; ++i) { + indices_flat(i) = (i * 31) % kUniqueIndices; + segments_flat(i) = i * .8; + } + + const int kDim1 = segments_flat(kNumIndices - 1) + 1; + const int kDim2 = 128; + Tensor input(DT_FLOAT, TensorShape({kDim1, kDim2})); + input.flat<float>().setRandom(); + + Node* node; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseSegmentMeanGrad") + .Input(test::graph::Constant(g, input)) + .Input(test::graph::Constant(g, indices)) + .Input(test::graph::Constant(g, segments)) + .Input(test::graph::Constant(g, output_dim0)) + .Attr("T", DT_FLOAT) + .Finalize(g, &node)); + + testing::UseRealTime(); + testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) * + sizeof(float)); + testing::StartTiming(); + test::Benchmark("cpu", g).Run(iters); +} + +static void BM_SparseSegmentMeanGrad_Low(int iters, int size) { + return SparseSegmentMeanGradHelper(iters, 1.0, size); +} + +static void 
BM_SparseSegmentMeanGrad_High(int iters, int size) { + return SparseSegmentMeanGradHelper(iters, 0.01, size); +} + +BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000); +BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc new file mode 100644 index 0000000000..2abb183d1a --- /dev/null +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -0,0 +1,116 @@ +#include "tensorflow/core/kernels/sendrecv_ops.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +static string GetRendezvousKeyPrefix(const string& send_device, + const string& recv_device, + const uint64 send_device_incarnation, + const string& tensor_name) { + return strings::StrCat(send_device, ";", + strings::FpToString(send_device_incarnation), ";", + recv_device, ";", tensor_name); +} + +static string GetRendezvousKey(const string& key_prefix, + const FrameAndIter& frame_iter) { + return strings::StrCat(key_prefix, ";", frame_iter.frame_id, ":", + frame_iter.iter_id); +} + +SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast<int64*>(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device, + send_device_incarnation, tensor_name); +} + +void SendOp::Compute(OpKernelContext* ctx) { + OP_REQUIRES( + ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + const string key = GetRendezvousKey(key_prefix_, ctx->frame_iter()); + VLOG(2) << "Send " << key; + + // The device context may be passed between the Send/Recv + // boundary, so that the device context used to produce the Tensor + // is used when performing the copy on the recv side (which may be + // a different device). 
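+ // An illustrative key produced above (device and tensor names made up)
+ // might look like
+ //   /job:w/replica:0/task:0/cpu:0;0000000000000001;/job:w/replica:0/task:0/gpu:0;edge_5_y;0:0
+ // i.e. "<send_device>;<incarnation>;<recv_device>;<tensor_name>;<frame>:<iter>".
+ // The matching Recv op constructs the same key, which is how the
+ // rendezvous pairs the two sides.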
+ Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->input_alloc_attr(0); + Status s = + ctx->rendezvous()->Send(key, args, ctx->input(0), ctx->is_input_dead()); + ctx->SetStatus(s); +} + +REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp); +REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp); + +REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp); +REGISTER_KERNEL_BUILDER( + Name("_HostSend").Device(DEVICE_GPU).HostMemory("tensor"), SendOp); + +RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) { + string send_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("send_device", &send_device)); + string recv_device; + OP_REQUIRES_OK(ctx, ctx->GetAttr("recv_device", &recv_device)); + uint64 send_device_incarnation; + OP_REQUIRES_OK( + ctx, ctx->GetAttr("send_device_incarnation", + reinterpret_cast<int64*>(&send_device_incarnation))); + string tensor_name; + OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name)); + key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device, + send_device_incarnation, tensor_name); +} + +void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { + OP_REQUIRES( + ctx, ctx->rendezvous() != nullptr, + errors::Internal("Op kernel context needs to provide a rendezvous.")); + const string key = GetRendezvousKey(key_prefix_, ctx->frame_iter()); + VLOG(2) << "Recv " << key; + + Rendezvous::Args args; + args.device_context = ctx->op_device_context(); + args.alloc_attrs = ctx->output_alloc_attr(0); + ctx->rendezvous()->RecvAsync( + key, args, [ctx, done](const Status& s, const Rendezvous::Args& send_args, + const Rendezvous::Args& recv_args, + const Tensor& val, bool is_dead) { + ctx->SetStatus(s); + if (s.ok()) { + // 'ctx' allocates the output tensor of the expected type. The + // runtime checks whether the tensor received here is the same type. 
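+ // A "dead" tensor arrives when the producing side of the Send/Recv pair
+ // did not run (e.g. it sat on an untaken control-flow branch); in that
+ // case no value is set and only the dead flag is propagated.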
+ if (!is_dead) { + ctx->set_output(0, val); + } + *ctx->is_output_dead() = is_dead; + } + done(); + }); +} + +REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp); +REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp); + +REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp); +REGISTER_KERNEL_BUILDER( + Name("_HostRecv").Device(DEVICE_GPU).HostMemory("tensor"), RecvOp); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h new file mode 100644 index 0000000000..b3f5703ccf --- /dev/null +++ b/tensorflow/core/kernels/sendrecv_ops.h @@ -0,0 +1,32 @@ +#ifndef TENSORFLOW_KERNELS_SENDRECV_OPS_H_ +#define TENSORFLOW_KERNELS_SENDRECV_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +class SendOp : public OpKernel { + public: + explicit SendOp(OpKernelConstruction* ctx); + void Compute(OpKernelContext* ctx) override; + + private: + string key_prefix_; + + TF_DISALLOW_COPY_AND_ASSIGN(SendOp); +}; + +class RecvOp : public AsyncOpKernel { + public: + explicit RecvOp(OpKernelConstruction* ctx); + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + + private: + string key_prefix_; + + TF_DISALLOW_COPY_AND_ASSIGN(RecvOp); +}; + +} // end namespace tensorflow + +#endif // TENSORFLOW_KERNELS_SENDRECV_OPS_H_ diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc new file mode 100644 index 0000000000..60ba2e15f9 --- /dev/null +++ b/tensorflow/core/kernels/sequence_ops.cc @@ -0,0 +1,123 @@ +// See docs in ../ops/math_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +int32 GetValue(int32 v) { return v; } + +template <typename T> +class RangeOp : public OpKernel { + public: + explicit RangeOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& start_in = context->input(0); + const Tensor& limit_in = context->input(1); + const Tensor& delta_in = context->input(2); + OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(start_in.shape()), + errors::InvalidArgument("start must be a scalar, not shape ", + start_in.shape().ShortDebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(limit_in.shape()), + errors::InvalidArgument("limit must be a scalar, not shape ", + limit_in.shape().ShortDebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsLegacyScalar(delta_in.shape()), + errors::InvalidArgument("delta must be a scalar, not shape ", + delta_in.shape().ShortDebugString())); + const int32 start = GetValue(start_in.scalar<T>()()); + const int32 limit = GetValue(limit_in.scalar<T>()()); + OP_REQUIRES(context, start <= limit, + errors::InvalidArgument("Requires start <= limit: ", start, "/", + limit)); + const int32 delta = GetValue(delta_in.scalar<T>()()); + OP_REQUIRES(context, delta > 0, + errors::InvalidArgument("Requires delta > 0: ", delta)); + int32 size = (limit - start + delta - 1) / delta; + Tensor* out = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({size}), &out)); + auto flat = out->flat<T>(); + int32 val = start; + for (int32 i = 0; i < size; ++i) { + flat(i) = T(val); + val += delta; + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("Range") + 
.Device(DEVICE_CPU) + .HostMemory("start") + .HostMemory("limit") + .HostMemory("delta") + .HostMemory("output"), + RangeOp<int32>); + +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("Range") + .Device(DEVICE_GPU) + .HostMemory("start") + .HostMemory("limit") + .HostMemory("delta") + .HostMemory("output"), + RangeOp<int32>); +#endif // GOOGLE_CUDA + +template <typename T> +class LinSpaceOp : public OpKernel { + public: + explicit LinSpaceOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& start_in = context->input(0); + const Tensor& stop_in = context->input(1); + const Tensor& num_in = context->input(2); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(start_in.shape()), + errors::InvalidArgument("start must be a scalar, not shape ", + start_in.shape().ShortDebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(stop_in.shape()), + errors::InvalidArgument("stop must be a scalar, not shape ", + stop_in.shape().ShortDebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_in.shape()), + errors::InvalidArgument("num must be a scalar, not shape ", + num_in.shape().ShortDebugString())); + const T start = start_in.scalar<T>()(); + const T stop = stop_in.scalar<T>()(); + const int32 num = num_in.scalar<int32>()(); + OP_REQUIRES(context, num > 0, + errors::InvalidArgument("Requires num > 0: ", num)); + Tensor* out = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({num}), &out)); + auto flat = out->flat<T>(); + if (num == 1) { + flat(0) = start; + } else { + const T step = (stop - start) / (num - 1); + for (int32 i = 0; i < num; ++i) flat(i) = start + step * i; + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("LinSpace") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T") + .HostMemory("start") + .HostMemory("stop") + .HostMemory("num") + .HostMemory("output"), + LinSpaceOp<float>); +REGISTER_KERNEL_BUILDER(Name("LinSpace") + .Device(DEVICE_CPU) + .TypeConstraint<double>("T") + .HostMemory("start") + .HostMemory("stop") + .HostMemory("num") + .HostMemory("output"), + LinSpaceOp<double>); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc new file mode 100644 index 0000000000..7cb1da8983 --- /dev/null +++ b/tensorflow/core/kernels/shape_ops.cc @@ -0,0 +1,261 @@ +// See docs in ../ops/array_ops.cc. + +#include <unordered_set> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +class ShapeOp : public OpKernel { + public: + explicit ShapeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& inp = ctx->input(0); + const int rank = inp.dims(); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({rank}), &out)); + auto vec = out->vec<int32>(); + for (int i = 0; i < rank; ++i) vec(i) = inp.dim_size(i); + } + + bool IsExpensive() override { return false; } +}; +REGISTER_KERNEL_BUILDER(Name("Shape").Device(DEVICE_CPU).HostMemory("output"), + ShapeOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Shape") \ + .Device(DEVICE_GPU) \ + .HostMemory("output") \ + .TypeConstraint<type>("T"), \ + ShapeOp) +TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +#undef REGISTER_GPU_KERNEL + +// A special GPU kernel for int32. 
+// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Shape") + .Device(DEVICE_GPU) + .HostMemory("input") + .HostMemory("output") + .TypeConstraint<int32>("T"), + ShapeOp); + +class RankOp : public OpKernel { + public: + explicit RankOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& inp = ctx->input(0); + const int rank = inp.dims(); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out)); + out->scalar<int32>()() = rank; + } + + bool IsExpensive() override { return false; } +}; +REGISTER_KERNEL_BUILDER(Name("Rank").Device(DEVICE_CPU).HostMemory("output"), + RankOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Rank") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("output"), \ + RankOp); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +#undef REGISTER_GPU_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Rank") + .Device(DEVICE_GPU) + .TypeConstraint<int32>("T") + .HostMemory("input") + .HostMemory("output"), + RankOp); + +class SizeOp : public OpKernel { + public: + explicit SizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + const Tensor& inp = ctx->input(0); + const int64 size = inp.NumElements(); + Tensor* out = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out)); + // TODO(josh11b): switch output to int64? + out->scalar<int32>()() = size; + } + + bool IsExpensive() override { return false; } +}; +REGISTER_KERNEL_BUILDER(Name("Size").Device(DEVICE_CPU).HostMemory("output"), + SizeOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("Size") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("output"), \ + SizeOp); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +#undef REGISTER_GPU_KERNEL + +// A special GPU kernel for int32. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. +REGISTER_KERNEL_BUILDER(Name("Size") + .Device(DEVICE_GPU) + .TypeConstraint<int32>("T") + .HostMemory("input") + .HostMemory("output"), + SizeOp); + +class ExpandDimsOp : public OpKernel { + public: + explicit ExpandDimsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} + + void Compute(OpKernelContext* ctx) override { + int dim = ctx->input(1).flat<int>()(0); + OP_REQUIRES( + ctx, (dim >= -1 - ctx->input(0).dims() && dim <= ctx->input(0).dims()), + errors::InvalidArgument("Tried to expand dim index ", dim, + " for tensor with ", ctx->input(0).dims(), + " dimensions.")); + + auto existing_dims = ctx->input(0).shape().dim_sizes(); + std::vector<int64> new_shape(existing_dims.size()); + for (size_t i = 0; i < new_shape.size(); ++i) { + new_shape[i] = existing_dims[i]; + } + + // We emulate numpy's interpretation of the dim axis when + // -input.dims() >= dim <= input.dims(). + if (dim < 0) { + dim += existing_dims.size() + 1; + } + + // Clamp to the end if needed. 
+ dim = std::min<int32>(dim, existing_dims.size()); + new_shape.emplace(new_shape.begin() + dim, 1); + const TensorShape output_shape(new_shape); + + Tensor* output = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {0}, &output)); + if (!output->CopyFrom(ctx->input(0), output_shape)) { + // This should never happen, since the sizes of the input and output + // should always be the same (we only expand the dimension with 1). + ctx->SetStatus( + errors::Internal("Could not expand dimension with input shape ", + ctx->input(0).shape().DebugString(), + " and output shape ", output_shape.DebugString())); + } + } +}; +REGISTER_KERNEL_BUILDER(Name("ExpandDims").Device(DEVICE_CPU).HostMemory("dim"), + ExpandDimsOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER(Name("ExpandDims") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("dim"), \ + ExpandDimsOp); +TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL); +#undef REGISTER_GPU_KERNEL + +REGISTER_KERNEL_BUILDER(Name("ExpandDims") + .Device(DEVICE_GPU) + .TypeConstraint<int32>("T") + .HostMemory("input") + .HostMemory("dim") + .HostMemory("output"), + ExpandDimsOp); + +class SqueezeOp : public OpKernel { + public: + explicit SqueezeOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + std::vector<int32> squeeze_dims; + OP_REQUIRES_OK(ctx, ctx->GetAttr("squeeze_dims", &squeeze_dims)); + squeeze_dims_.insert(squeeze_dims.begin(), squeeze_dims.end()); + } + + void Compute(OpKernelContext* ctx) override { + auto existing_dims = ctx->input(0).shape().dim_sizes(); + std::vector<int64> new_shape; + + std::unordered_set<int32> wrapped_squeeze_dims; + wrapped_squeeze_dims.reserve(squeeze_dims_.size()); + // Validate squeeze dims against the input. + for (int32 dim : squeeze_dims_) { + OP_REQUIRES( + ctx, (dim >= -ctx->input(0).dims() && dim < ctx->input(0).dims()), + errors::InvalidArgument("Tried to squeeze dim index ", dim, + " for tensor with ", ctx->input(0).dims(), + " dimensions.")); + // If dim is < 0, we wrap around (-1 means the last element). + if (dim < 0) { + dim = existing_dims.size() + dim; + } + + wrapped_squeeze_dims.insert(dim); + } + + for (size_t i = 0; i < existing_dims.size(); ++i) { + auto existing_dim = existing_dims[i]; + + // If squeeze_set is non-empty, only squeeze those dimensions. + if (!wrapped_squeeze_dims.empty()) { + if (wrapped_squeeze_dims.count(i) > 0) { + OP_REQUIRES(ctx, existing_dim == 1, + errors::InvalidArgument("Tried to explicitly squeeze " + "dimension ", + i, " but dimension was not 1: ", + existing_dim)); + } else { + // This dimension is not being squeezed. + new_shape.push_back(existing_dim); + } + } else { + // Copy over all non-1-length dimensions. + if (existing_dim != 1) { + new_shape.push_back(existing_dim); + } + } + } + + const TensorShape output_shape(new_shape); + Tensor* output = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {0}, &output)); + if (!output->CopyFrom(ctx->input(0), output_shape)) { + // This should never happen, since the sizes of the input and + // output should always be the same. 
+ ctx->SetStatus(errors::Internal("Could not squeeze input with shape ", + ctx->input(0).shape().DebugString(), + " and output shape ", + output_shape.DebugString())); + } + } + + private: + std::unordered_set<int32> squeeze_dims_; +}; + +REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp); + +#define REGISTER_GPU_KERNEL(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Squeeze").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + SqueezeOp); +TF_CALL_NUMBER_TYPES(REGISTER_GPU_KERNEL); +#undef REGISTER_GPU_KERNEL + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc new file mode 100644 index 0000000000..3477266d5d --- /dev/null +++ b/tensorflow/core/kernels/slice_op.cc @@ -0,0 +1,242 @@ +// See docs in ../ops/array_ops.cc. + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#include "tensorflow/core/kernels/slice_op.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +namespace { + +gtl::InlinedVector<int64, 4> IntTensorToInt64Vec(const Tensor& tensor) { + gtl::InlinedVector<int64, 4> out; + if (tensor.dtype() == DT_INT32) { + for (int64 i = 0; i < tensor.NumElements(); ++i) { + out.push_back(tensor.flat<int32>()(i)); + } + } else if (tensor.dtype() == DT_INT64) { + for (int64 i = 0; i < tensor.NumElements(); ++i) { + out.push_back(tensor.flat<int64>()(i)); + } + } else { + LOG(FATAL) << "begin must be either int32 or int64"; + } + return out; +} + +} // namespace + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// Shared code that is not dependent on the type of T. We do this to reduce +// code size by not duplicating all this for all T (float, double, int32, etc.) +static void SharedValidation(OpKernelContext* context, + TensorShape* output_shape, bool* is_identity, + bool* slice_dim0, + gtl::InlinedVector<int64, 4>* begin, + gtl::InlinedVector<int64, 4>* size) { + const Tensor& input = context->input(0); + const Tensor& begin_tensor = context->input(1); + const Tensor& size_tensor = context->input(2); + + OP_REQUIRES( + context, TensorShapeUtils::IsLegacyVector(begin_tensor.shape()) && + TensorShapeUtils::IsLegacyVector(size_tensor.shape()) && + begin_tensor.NumElements() == input.dims() && + size_tensor.NumElements() == input.dims(), + errors::InvalidArgument( + "Expected begin and size arguments to be 1-D tensors of size ", + input.dims(), ", but got ", begin_tensor.NumElements(), " and ", + size_tensor.NumElements(), " instead.")); + + const int input_dims = input.dims(); + *begin = IntTensorToInt64Vec(begin_tensor); + *size = IntTensorToInt64Vec(size_tensor); + for (int i = 0; i < input_dims; ++i) { + if ((*size)[i] == -1) { + // A size[i] of -1 means "all elements from begin[i] to dim_size(i)". 
+ (*size)[i] = input.dim_size(i) - (*begin)[i]; + } + } + + *is_identity = true; + *slice_dim0 = true; + for (int i = 0; i < input_dims; ++i) { + int64 b = (*begin)[i]; + int64 s = (*size)[i]; + if (input.dim_size(i) == 0) { + OP_REQUIRES( + context, b == 0 && s == 0, + errors::InvalidArgument("Expected begin[", i, "] == 0 (got ", b, + ") and size[", i, "] == 0 ", "(got ", s, + ") when ", "input.dim_size(", i, ") == 0")); + } else { + OP_REQUIRES(context, 0 <= b && b <= input.dim_size(i), + errors::InvalidArgument("Expected begin[", i, "] in [0, ", + input.dim_size(i), "], but got ", b)); + OP_REQUIRES( + context, 0 <= s && b + s <= input.dim_size(i), + errors::InvalidArgument("Expected size[", i, "] in [0, ", + input.dim_size(i) - b, "], but ", "got ", s)); + } + output_shape->AddDim(s); + const bool take_all = (b == 0) && (s == input.dim_size(i)); + (*is_identity) &= take_all; + (*slice_dim0) &= (i == 0) || take_all; + } +} + +template <typename Device, typename T> +class SliceOp : public OpKernel { + public: + explicit SliceOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + TensorShape output_shape; + bool is_identity = true; + bool slice_dim0 = true; + gtl::InlinedVector<int64, 4> begin; + gtl::InlinedVector<int64, 4> size; + SharedValidation(context, &output_shape, &is_identity, &slice_dim0, &begin, + &size); + if (!context->status().ok()) return; + const Tensor& input = context->input(0); + if (is_identity) { + VLOG(1) << "Slice identity"; + context->set_output(0, input); + return; + } + + if (slice_dim0 && IsInnerDimsSizeAligned<T>(input.shape())) { + VLOG(1) << "Slice dim 0: " << input.shape().DebugString(); + CHECK_GE(input.dims(), 1); // Otherwise, is_identity should be true. + context->set_output(0, input.Slice(begin[0], begin[0] + size[0])); + return; + } + + Tensor* result = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result)); + const int input_dims = input.dims(); + + if (output_shape.num_elements() > 0) { + if (std::is_same<Device, CPUDevice>::value && input_dims == 2 && + DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) { + auto input = context->input(0).tensor<T, 2>(); + auto output = result->tensor<T, 2>(); + // TODO(agarwal): Consider multi-threading this loop for cases where + // size[0] is very large. 
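+ // Fast path for 2-D CPU slices of memcpy-friendly types: each requested
+ // row is a contiguous run of size[1] elements, so it is copied with a
+ // single memcpy while the next input/output rows are prefetched to hide
+ // memory latency.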
+ for (int i = 0; i < size[0]; ++i) { + const int row = begin[0] + i; + if (i + 1 < size[0]) { + port::prefetch<port::PREFETCH_HINT_T0>(&output(i + 1, 0)); + port::prefetch<port::PREFETCH_HINT_T0>(&input(row + 1, begin[1])); + } + memcpy(&output(i, 0), &input(row, begin[1]), size[1] * sizeof(T)); + } + return; + } +#define HANDLE_DIM(NDIM) \ + if (input_dims == NDIM) { \ + HandleCase<NDIM>(context, begin, size, result); \ + return; \ + } + + HANDLE_DIM(1); + HANDLE_DIM(2); + HANDLE_DIM(3); + HANDLE_DIM(4); + HANDLE_DIM(5); + +#undef HANDLE_DIM + + OP_REQUIRES(context, false, errors::Unimplemented( + "SliceOp : Unhandled input dimensions")); + } + } + + private: + template <int NDIM> + void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin, + const gtl::ArraySlice<int64>& size, Tensor* result) { + Eigen::DSizes<ptrdiff_t, NDIM> indices; + Eigen::DSizes<ptrdiff_t, NDIM> sizes; + for (int i = 0; i < NDIM; ++i) { + indices[i] = begin[i]; + sizes[i] = size[i]; + } + + functor::Slice<Device, T, NDIM>()( + context->eigen_device<Device>(), result->tensor<T, NDIM>(), + context->input(0).tensor<T, NDIM>(), indices, sizes); + } +}; + +#define REGISTER_SLICE(type) \ + REGISTER_KERNEL_BUILDER(Name("Slice") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("begin") \ + .HostMemory("size"), \ + SliceOp<CPUDevice, type>) + +TF_CALL_ALL_TYPES(REGISTER_SLICE); +REGISTER_SLICE(bfloat16); + +#undef REGISTER_SLICE + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T, NDIM) \ + template <> \ + void Slice<GPUDevice, T, NDIM>::operator()( \ + const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \ + typename TTypes<T, NDIM>::ConstTensor input, \ + const Eigen::DSizes<ptrdiff_t, NDIM>& indices, \ + const Eigen::DSizes<ptrdiff_t, NDIM>& sizes); \ + extern template struct Slice<GPUDevice, T, NDIM>; + +#define DECLARE_FOR_N(T) \ + DECLARE_GPU_SPEC(T, 1); \ + DECLARE_GPU_SPEC(T, 2); \ + DECLARE_GPU_SPEC(T, 3); \ + DECLARE_GPU_SPEC(T, 4); \ + DECLARE_GPU_SPEC(T, 5); + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N); +DECLARE_FOR_N(int32); + +#undef DECLARE_FOR_N +#undef DECLARE_GPU_SPEC +} // namespace functor + +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER(Name("Slice") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("begin") \ + .HostMemory("size") \ + .TypeConstraint<int32>("Index"), \ + SliceOp<GPUDevice, type>) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +REGISTER_GPU(int32); + +#undef REGISTER_GPU + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h new file mode 100644 index 0000000000..1b6bd9c112 --- /dev/null +++ b/tensorflow/core/kernels/slice_op.h @@ -0,0 +1,25 @@ +#ifndef TENSORFLOW_KERNELS_SLICE_OP_H_ +#define TENSORFLOW_KERNELS_SLICE_OP_H_ + +// Functor definition for SliceOp, must be compilable by nvcc. 
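+//
+// An illustrative use (values made up): for a rank-2 float tensor,
+//   functor::Slice<Device, float, 2>()(d, out, in, {1, 0}, {2, 3});
+// copies the 2 x 3 block of `in` starting at row 1, column 0 into `out`,
+// i.e. out = in.slice({1, 0}, {2, 3}) in Eigen terms.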
+ +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T, int NDIMS> +struct Slice { + void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output, + typename TTypes<T, NDIMS>::ConstTensor input, + const Eigen::DSizes<ptrdiff_t, NDIMS>& slice_indices, + const Eigen::DSizes<ptrdiff_t, NDIMS>& slice_sizes) { + output.device(d) = input.slice(slice_indices, slice_sizes); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_SLICE_OP_H_ diff --git a/tensorflow/core/kernels/slice_op_gpu.cu.cc b/tensorflow/core/kernels/slice_op_gpu.cu.cc new file mode 100644 index 0000000000..6e919b244c --- /dev/null +++ b/tensorflow/core/kernels/slice_op_gpu.cu.cc @@ -0,0 +1,31 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> + +#include "tensorflow/core/kernels/slice_op.h" + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_GPU_KERNELS(T) \ + template struct functor::Slice<GPUDevice, T, 1>; \ + template struct functor::Slice<GPUDevice, T, 2>; \ + template struct functor::Slice<GPUDevice, T, 3>; \ + template struct functor::Slice<GPUDevice, T, 4>; \ + template struct functor::Slice<GPUDevice, T, 5>; + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); +DEFINE_GPU_KERNELS(int32); + +#undef DEFINE_GPU_KERNELS + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/slice_op_test.cc b/tensorflow/core/kernels/slice_op_test.cc new file mode 100644 index 0000000000..27c78c6dc0 --- /dev/null +++ b/tensorflow/core/kernels/slice_op_test.cc @@ -0,0 +1,73 @@ +#include <functional> +#include <memory> +#include <vector> + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim' +// in size, and concat them together along "concat_dimension" +template <typename T> +static void SliceHelper(int iters, int size) { + testing::StopTiming(); + RequireDefaultOps(); + Graph* g = new Graph(OpRegistry::Global()); + DataType dt = DataTypeToEnum<T>::v(); + int kDim = 100; + int kMaxSize = 15000; + CHECK_LT(size, kMaxSize); + + Tensor begin(DT_INT32, TensorShape({2})); + begin.flat<int32>()(0) = 10; + begin.flat<int32>()(1) = 10; + + Tensor sizes(DT_INT32, TensorShape({2})); + sizes.flat<int32>()(0) = kDim; + sizes.flat<int32>()(1) = size; + + Tensor input(dt, TensorShape({2 * kDim, kMaxSize})); + input.flat<T>().setRandom(); + + Node* node; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Slice") + .Input(test::graph::Constant(g, input)) + .Input(test::graph::Constant(g, begin)) + 
.Input(test::graph::Constant(g, sizes)) + .Attr("T", dt) + .Finalize(g, &node)); + + testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T)); + testing::StartTiming(); + test::Benchmark("cpu", g).Run(iters); + testing::UseRealTime(); +} + +static void BM_SliceFloat(int iters, int dim2) { + SliceHelper<float>(iters, dim2); +} + +BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000); + +static void BM_SliceBFloat16(int iters, int dim2) { + SliceHelper<bfloat16>(iters, dim2); +} + +BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc new file mode 100644 index 0000000000..abe6331a4f --- /dev/null +++ b/tensorflow/core/kernels/softmax_op.cc @@ -0,0 +1,62 @@ +// See docs in ../ops/nn_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/kernels/softmax_op.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class SoftmaxOp : public OpKernel { + public: + explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& logits_in = context->input(0); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()), + errors::InvalidArgument("logits must be 2-dimensional")); + Tensor* softmax_out = nullptr; + OP_REQUIRES_OK( + context, context->allocate_output(0, logits_in.shape(), &softmax_out)); + functor::SoftmaxFunctor<Device, T> functor; + functor(context->eigen_device<Device>(), logits_in.matrix<T>(), + softmax_out->matrix<T>()); + } +}; + +// Partial specialization for a CPUDevice, that uses the Eigen implementation +// from SoftmaxEigenImpl. +namespace functor { +template <typename T> +struct SoftmaxFunctor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::Matrix softmax) { + SoftmaxEigenImpl<CPUDevice, T>::Compute(d, logits, softmax); + } +}; +} // namespace functor + +REGISTER_KERNEL_BUILDER(Name("Softmax") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T"), + SoftmaxOp<CPUDevice, float>); +REGISTER_KERNEL_BUILDER(Name("Softmax") + .Device(DEVICE_CPU) + .TypeConstraint<double>("T"), + SoftmaxOp<CPUDevice, double>); + +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("Softmax") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T"), + SoftmaxOp<GPUDevice, float>); +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/softmax_op.h b/tensorflow/core/kernels/softmax_op.h new file mode 100644 index 0000000000..69bd531b70 --- /dev/null +++ b/tensorflow/core/kernels/softmax_op.h @@ -0,0 +1,70 @@ +#ifndef TENSORFLOW_KERNELS_SOFTMAX_OP_H_ +#define TENSORFLOW_KERNELS_SOFTMAX_OP_H_ +// Functor definition for SoftmaxOp, must be compilable by nvcc. + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by SoftmaxOp to do the computations. +template <typename Device, typename T> +struct SoftmaxFunctor { + // Computes Softmax activation. + // + // logits: dim: batch_size, num_classes. 
+ // softmax: dims: batch_size, num_classes. + void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::Matrix softmax); +}; + +// Eigen code implementing SoftmaxFunctor::operator(). +// This code works for both CPU and GPU and is used by the functor +// specializations for both device types. +template <typename Device, typename T> +struct SoftmaxEigenImpl { + static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::Matrix softmax) { + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + +// These arrays are used to reduce along the class dimension, and broadcast +// the resulting value to all classes. +#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::DSizes<int, 1> along_class(kClassDim); + Eigen::DSizes<int, 2> batch_by_one(batch_size, 1); + Eigen::DSizes<int, 2> one_by_class(1, num_classes); +#else + Eigen::IndexList<Eigen::type2index<kClassDim> > along_class; + Eigen::IndexList<Eigen::type2index<1> > depth_dim; + Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one; + batch_by_one.set(0, batch_size); + Eigen::IndexList<Eigen::type2index<1>, int> one_by_class; + one_by_class.set(1, num_classes); +#endif + // NOTE(mdevin): If you modify this implementation please run + // the ImageNetSoftmaxFwd benchmark in core_ops_test.cc. + // + // softmax = exp(logits - max(logits along classes)); + softmax.device(d) = (logits - + logits.maximum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)).exp(); + // softmax = softmax / sum(softmax along classes); + softmax.device(d) = (softmax / + softmax.sum(along_class) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_SOFTMAX_OP_H_ diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc new file mode 100644 index 0000000000..d5aaf9c364 --- /dev/null +++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc @@ -0,0 +1,31 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/softmax_op.h" + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// Partial specialization for a GPUDevice, that uses the Eigen implementation +// from SoftmaxEigenImpl. +namespace functor { +template <typename T> +struct SoftmaxFunctor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::Matrix softmax) { + SoftmaxEigenImpl<GPUDevice, T>::Compute(d, logits, softmax); + } +}; +} // end namespace functor + +// Instantiate the GPU implementation for float. +template struct functor::SoftmaxFunctor<GPUDevice, float>; + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/softplus_op.cc b/tensorflow/core/kernels/softplus_op.cc new file mode 100644 index 0000000000..b5fb57d3c5 --- /dev/null +++ b/tensorflow/core/kernels/softplus_op.cc @@ -0,0 +1,97 @@ +// See docs in ../ops/nn_ops.cc. 
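+//
+// Softplus(x) = log(1 + exp(x)); the functor in softplus_op.h switches to
+// the identity for large x to avoid overflow in exp(), and the gradient
+// kernel computes dy / (1 + exp(-x)), i.e. dy * sigmoid(x).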
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/softplus_op.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class SoftplusOp : public UnaryElementWiseOp<T, SoftplusOp<Device, T>> { + public: + using UnaryElementWiseOp<T, SoftplusOp<Device, T>>::UnaryElementWiseOp; + + void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) { + functor::Softplus<Device, T> functor; + functor(context->eigen_device<Device>(), input.flat<T>(), + output->flat<T>()); + } +}; + +template <typename Device, typename T> +class SoftplusGradOp + : public BinaryElementWiseOp<T, SoftplusGradOp<Device, T>> { + public: + using BinaryElementWiseOp<T, SoftplusGradOp<Device, T>>::BinaryElementWiseOp; + + // INPUTS: + // g (gradients): backpropagated gradients + // a (inputs): inputs that were passed to SoftplusOp() + // OUTPUT: + // gradients to backprop + template <int NDIMS> + void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a, + Tensor* output) { + OP_REQUIRES(context, a.IsSameSize(g), + errors::InvalidArgument("g and a must be the same size")); + functor::SoftplusGrad<Device, T> functor; + functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), + output->flat<T>()); + } +}; + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Softplus").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + SoftplusOp<CPUDevice, type>); \ + REGISTER_KERNEL_BUILDER( \ + Name("SoftplusGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + SoftplusGradOp<CPUDevice, type>); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void Softplus<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::ConstTensor features, \ + typename TTypes<T>::Tensor activations); \ + extern template struct Softplus<GPUDevice, T>; \ + \ + template <> \ + void SoftplusGrad<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::ConstTensor gradients, \ + typename TTypes<T>::ConstTensor features, \ + typename TTypes<T>::Tensor backprops); \ + extern template struct SoftplusGrad<GPUDevice, T>; + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); +} // namespace functor + +// Registration of the GPU implementations. 
+#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Softplus").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + SoftplusOp<GPUDevice, type>); \ + REGISTER_KERNEL_BUILDER( \ + Name("SoftplusGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + SoftplusGradOp<GPUDevice, type>); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS + +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/softplus_op.h b/tensorflow/core/kernels/softplus_op.h new file mode 100644 index 0000000000..3545a78246 --- /dev/null +++ b/tensorflow/core/kernels/softplus_op.h @@ -0,0 +1,46 @@ +#ifndef TENSORFLOW_KERNELS_SOFTPLUS_OP_H_ +#define TENSORFLOW_KERNELS_SOFTPLUS_OP_H_ +// Functor definition for SoftplusOp and SoftplusGradOp, must be compilable by +// nvcc. + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by SoftplusOp to do the computations. +template <typename Device, typename T> +struct Softplus { + // Computes Softplus activation. + // + // features: any shape. + // activations: same shape as "features". + void operator()(const Device& d, typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor activations) { + activations.device(d) = + (features > features.constant(30.f)) + .select(features, (features.exp() + features.constant(1.0f)).log()); + } +}; + +// Functor used by SoftplusGradOp to do the computations. +template <typename Device, typename T> +struct SoftplusGrad { + // Computes SoftplusGrad backprops. + // + // gradients: gradients backpropagated to the Softplus op. + // features: inputs that where passed to the Softplus op. + // backprops: gradients to backpropagate to the Softplus inputs. + void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients, + typename TTypes<T>::ConstTensor features, + typename TTypes<T>::Tensor backprops) { + backprops.device(d) = + gradients / ((-features).exp() + features.constant(1.0f)); + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_SOFTPLUS_OP_H_ diff --git a/tensorflow/core/kernels/softplus_op_gpu.cu.cc b/tensorflow/core/kernels/softplus_op_gpu.cu.cc new file mode 100644 index 0000000000..7a974321a7 --- /dev/null +++ b/tensorflow/core/kernels/softplus_op_gpu.cu.cc @@ -0,0 +1,25 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> + +#include "tensorflow/core/kernels/softplus_op.h" + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// Definition of the GPU implementations declared in softplus_op.cc. 
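The functors in softplus_op.h reduce to simple elementwise formulas: softplus(x) = log(1 + exp(x)), with a pass-through above a threshold of 30 (for such x the two expressions agree to float precision and the shortcut also avoids computing a huge exp), and a gradient of g / (1 + exp(-x)), i.e. the incoming gradient scaled by sigmoid(x). A scalar sketch of the same math; the helper names are illustration-only, and the real kernels evaluate these as vectorized Eigen expressions over whole tensors:

#include <cmath>

// Mirrors functor::Softplus: features above the threshold are returned
// unchanged, everything else goes through log(exp(x) + 1).
inline float SoftplusScalar(float x) {
  const float kThreshold = 30.f;  // same cutoff used in the functor
  return x > kThreshold ? x : std::log(std::exp(x) + 1.f);
}

// Mirrors functor::SoftplusGrad: d/dx softplus(x) = sigmoid(x), so the
// backpropagated gradient is scaled by 1 / (1 + exp(-x)).
inline float SoftplusGradScalar(float grad, float x) {
  return grad / (std::exp(-x) + 1.f);
}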
+#define DEFINE_GPU_KERNELS(T) \ + template struct functor::Softplus<GPUDevice, T>; \ + template struct functor::SoftplusGrad<GPUDevice, T>; + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/sparse_concat_op.cc b/tensorflow/core/kernels/sparse_concat_op.cc new file mode 100644 index 0000000000..72c267a47d --- /dev/null +++ b/tensorflow/core/kernels/sparse_concat_op.cc @@ -0,0 +1,139 @@ +#define EIGEN_USE_THREADS + +#include <algorithm> +#include <unordered_map> +#include <utility> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/sparse/sparse_tensor.h" + +namespace tensorflow { + +template <typename T> +class SparseConcatOp : public OpKernel { + public: + explicit SparseConcatOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("concat_dim", &concat_dim_)); + } + + void Compute(OpKernelContext* context) override { + OpInputList inds; + OP_REQUIRES_OK(context, context->input_list("indices", &inds)); + const int N = inds.size(); + for (int i = 0; i < N; i++) { + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(inds[i].shape()), + errors::InvalidArgument( + "Input indices should be a matrix but received shape ", + inds[i].shape().DebugString(), " at position ", i)); + } + + OpInputList vals; + OP_REQUIRES_OK(context, context->input_list("values", &vals)); + OP_REQUIRES(context, vals.size() == N, + errors::InvalidArgument("Expected ", N, " input values, got ", + vals.size())); + for (int i = 0; i < N; i++) { + OP_REQUIRES(context, TensorShapeUtils::IsVector(vals[i].shape()), + errors::InvalidArgument( + "Input values should be a vector but received shape ", + vals[i].shape().DebugString(), " at position ", i)); + } + + OpInputList shapes; + OP_REQUIRES_OK(context, context->input_list("shapes", &shapes)); + OP_REQUIRES(context, shapes.size() == N, + errors::InvalidArgument("Expected ", N, " input shapes, got ", + shapes.size())); + for (int i = 0; i < N; i++) { + OP_REQUIRES(context, TensorShapeUtils::IsVector(shapes[i].shape()), + errors::InvalidArgument( + "Input shapes should be a vector but received shape ", + shapes[i].shape().DebugString(), " at position ", i)); + } + + const TensorShape input_shape(shapes[0].vec<int64>()); + OP_REQUIRES( + context, concat_dim_ >= 0 && concat_dim_ < input_shape.dims(), + errors::InvalidArgument("Concat dimension must be between 0 and rank (", + input_shape.dims(), "), got ", concat_dim_)); + for (int i = 1; i < N; ++i) { + const TensorShape current_shape(shapes[i].vec<int64>()); + OP_REQUIRES(context, current_shape.dims() == input_shape.dims(), + errors::InvalidArgument( + "Ranks of all input tensors must match: expected ", + input_shape.dims(), " but got ", current_shape.dims(), + " at position ", i)); + for (int j = 0; j < input_shape.dims(); ++j) { + if (j != concat_dim_) { + OP_REQUIRES( + context, input_shape.dim_size(j) == current_shape.dim_size(j), + errors::InvalidArgument( + "Input shapes must match: expected ", input_shape.dim_size(j), + " for dimension ", j, " but got ", current_shape.dim_size(j), + " at position ", i)); + } + } + } + + // The input and output sparse tensors are assumed to be ordered along + // increasing 
dimension number. But in order for concat to work properly, + // order[0] must be concat_dim. So we will reorder the inputs to the + // concat ordering, concatenate, then reorder back to the standard order. + // We make a deep copy of the input tensors to ensure that the in-place + // reorder doesn't create race conditions for other ops that may be + // concurrently reading the indices and values tensors. + + gtl::InlinedVector<int64, 8> std_order(input_shape.dims()); + std::iota(std_order.begin(), std_order.end(), 0); + + std::vector<int64> concat_order; + concat_order.reserve(input_shape.dims()); + concat_order.push_back(concat_dim_); + for (int j = 0; j < input_shape.dims(); ++j) { + if (j != concat_dim_) { + concat_order.push_back(j); + } + } + + std::vector<sparse::SparseTensor> sp_inputs; + for (int i = 0; i < N; ++i) { + const TensorShape current_shape(shapes[i].vec<int64>()); + sp_inputs.emplace_back(tensor::DeepCopy(inds[i]), + tensor::DeepCopy(vals[i]), current_shape, + std_order); + sp_inputs[i].Reorder<T>(concat_order); + } + + sparse::SparseTensor concat = sparse::SparseTensor::Concat<T>(sp_inputs); + concat.Reorder<T>(std_order); + + context->set_output(0, concat.indices()); + context->set_output(1, concat.values()); + + Tensor* output_shape_out = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 2, TensorShape({concat.shape().dims()}), + &output_shape_out)); + auto output_shape = output_shape_out->vec<int64>(); + for (int j = 0; j < concat.shape().dims(); ++j) { + output_shape(j) = concat.shape().dim_size(j); + } + } + + private: + int concat_dim_; +}; + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("SparseConcat").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + SparseConcatOp<type>) + +TF_CALL_ALL_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS +} // namespace tensorflow diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc new file mode 100644 index 0000000000..919e129ff8 --- /dev/null +++ b/tensorflow/core/kernels/sparse_matmul_op.cc @@ -0,0 +1,192 @@ +// See docs in ../ops/math_ops.cc. 
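The reorder/concat/reorder sequence in SparseConcatOp works because concat_order moves concat_dim to the front: once every input is sorted in that order, concatenation along concat_dim just appends runs of already-sorted entries, and the final Reorder restores the standard row-major order. A worked instance of the order construction for a hypothetical rank-3 input with concat_dim_ == 1 (illustration only, not additional kernel code):

#include <cstdint>
#include <numeric>
#include <vector>

void BuildOrders() {
  const int rank = 3, concat_dim = 1;
  std::vector<int64_t> std_order(rank);               // {0, 1, 2}
  std::iota(std_order.begin(), std_order.end(), 0);

  std::vector<int64_t> concat_order = {concat_dim};   // becomes {1, 0, 2}
  for (int j = 0; j < rank; ++j) {
    if (j != concat_dim) concat_order.push_back(j);
  }
  // Under concat_order an index (i0, i1, i2) is compared as (i1, i0, i2),
  // so each input's entries stay contiguous and sorted along concat_dim.
}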
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/port.h" + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template <typename T> +void PrefetchBlockNTA(const T& tensor, int si, int ei, int sj, int ej) { + for (int i = si; i < ei; ++i) { + for (int j = sj; j < ej; j = j + 16) { + port::prefetch<port::PREFETCH_HINT_NTA>(&tensor(i, j)); + } + } +} + +template <typename T> +void PrefetchBlockT1(const T& tensor, int si, int ei, int sj, int ej) { + for (int i = si; i < ei; ++i) { + for (int j = sj; j < ej; j = j + 16) { + port::prefetch<port::PREFETCH_HINT_T1>(&tensor(i, j)); + } + } +} + +struct Block { + Block(int sm, int em, int sk, int ek, int sn, int en) + : startm(sm), endm(em), startk(sk), endk(ek), startn(sn), endn(en) {} + + int startm; + int endm; + int startk; + int endk; + int startn; + int endn; +}; + +bool NextBlock(const int Bm, const int Bk, const int Bn, const int m_start, + const int m, const int k, const int n, const Block& b, + Block* next) { + *next = b; + if (b.endk < k) { + next->startk = b.endk; + next->endk = std::min(b.endk + Bk, k); + } else { + next->startk = 0; + next->endk = std::min(Bk, k); + if (b.endm < m) { + next->startm = b.endm; + next->endm = std::min(b.endm + Bm, m); + } else { + next->startm = m_start; + next->endm = std::min(m_start + Bm, m); + next->startn = b.endn; + next->endn = std::min(b.endn + Bn, n); + } + } + return next->startn == next->endn; +} + +class SparseMatMulOp : public OpKernel { + public: + explicit SparseMatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("a_is_sparse", &a_is_sparse_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("b_is_sparse", &b_is_sparse_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor& a = ctx->input(0); + const Tensor& b = ctx->input(1); + + OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()), + errors::InvalidArgument("a is not a matrix")); + OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()), + errors::InvalidArgument("b is not a matrix")); + + auto left = a.matrix<float>(); + auto right_mat = b.matrix<float>(); + const int m = transpose_a_ ? left.dimension(1) : left.dimension(0); + const int k = transpose_a_ ? left.dimension(0) : left.dimension(1); + const int n = + transpose_b_ ? right_mat.dimension(0) : right_mat.dimension(1); + const int k2 = + transpose_b_ ? right_mat.dimension(1) : right_mat.dimension(0); + + OP_REQUIRES(ctx, k == k2, + errors::InvalidArgument("Matrix size incompatible: a: ", + a.shape().DebugString(), ", b: ", + b.shape().DebugString())); + Tensor* output = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output)); + auto out = output->matrix<float>(); + + if (!a_is_sparse_) { + // Fallback to Eigen contract. + // Note that we currently don't optimize the case where only right is + // sparse. That can generally be handled by tranposing the order of the + // matmul. + Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; + dim_pair[0].first = transpose_a_ ? 0 : 1; + dim_pair[0].second = transpose_b_ ? 
1 : 0; + out.device(ctx->template eigen_device<CPUDevice>()) = + left.contract(right_mat, dim_pair); + return; + } + typedef Eigen::Tensor<float, 2, Eigen::RowMajor> Matrix; + std::unique_ptr<Matrix> right_tr_mat; + std::unique_ptr<TTypes<float>::ConstMatrix> right_tr_map; + if (transpose_b_) { + right_tr_mat.reset(new Matrix(k, n)); + Eigen::array<int, 2> perm({1, 0}); + right_tr_mat->device(ctx->template eigen_device<CPUDevice>()) = + right_mat.shuffle(perm); + right_tr_map.reset(new TTypes<float>::ConstMatrix( + right_tr_mat->data(), right_tr_mat->dimensions())); + } + TTypes<float>::ConstMatrix& right = + transpose_b_ ? *right_tr_map : right_mat; + + const bool transpose_a = transpose_a_; + + typedef Eigen::TensorMap<Eigen::Tensor<float, 1, Eigen::RowMajor>, + Eigen::Unaligned> TensorMap; + typedef Eigen::TensorMap<Eigen::Tensor<const float, 1, Eigen::RowMajor>, + Eigen::Unaligned> ConstTensorMap; + typedef Eigen::DSizes<Eigen::DenseIndex, 1> DSizes; + const int Bm = 16; + const int Bk = 16; + const int Bn = 1024; + + auto work_shard = [m, n, k, transpose_a, Bm, Bk, Bn, &left, &right, &out]( + int64 start64, int64 end64) { + const int start = static_cast<int>(start64); + const int end = static_cast<int>(end64); + Block curr(start, std::min(start + Bm, end), 0, std::min(Bk, k), 0, + std::min(Bn, n)); + Block next(curr); + bool done = false; + for (int i = start; i < end; ++i) { + out.chip<0>(i).setZero(); + } + while (true) { + done = NextBlock(Bm, Bk, Bn, start, end, k, n, curr, &next); + + PrefetchBlockT1(right, curr.startk, curr.endk, curr.startn, curr.endn); + + // Process current block + for (int i = curr.startm; i < curr.endm; ++i) { + PrefetchBlockNTA(left, i, i + 1, curr.startk, curr.endk); + PrefetchBlockNTA(out, i, i + 1, curr.startn, curr.endn); + DSizes out_slice_shape(curr.endn - curr.startn); + TensorMap out_i(&out(i, curr.startn), out_slice_shape); + for (int j = curr.startk; j < curr.endk; ++j) { + const float l = transpose_a ? 
left(j, i) : left(i, j); + if (l == 0) continue; + ConstTensorMap right_j(&right(j, curr.startn), out_slice_shape); + out_i += right_j * l; + } + } + if (done) break; + curr = next; + } + }; + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, m, 2 * k * n, + work_shard); + } + + private: + bool transpose_a_; + bool transpose_b_; + bool a_is_sparse_; + bool b_is_sparse_; + TF_DISALLOW_COPY_AND_ASSIGN(SparseMatMulOp); +}; + +REGISTER_KERNEL_BUILDER(Name("SparseMatMul").Device(DEVICE_CPU), + SparseMatMulOp); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc new file mode 100644 index 0000000000..883d0d1224 --- /dev/null +++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc @@ -0,0 +1,139 @@ +#include "tensorflow/core/framework/types.pb.h" +#include <gtest/gtest.h> +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { +random::PhiloxRandom philox(1, 1); +random::SimplePhilox rnd(&philox); + +void Sparsify(Tensor* t, float sparsity) { + const int64 N = t->NumElements(); + CHECK_LE(sparsity, 1); + if (sparsity <= 0) return; + auto flat = t->flat<float>(); + static const uint32 K = 10000; + for (int64 i = 0; i < N; ++i) { + if (rnd.Uniform(K) < sparsity * K) { + flat(i) = 0; + } + } +} + +Node* SparseMatMulNode(Graph* g, Node* in0, Node* in1, bool transpose_a, + bool transpose_b, bool a_sparse, bool b_sparse) { + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseMatMul") + .Input(in0) + .Input(in1) + .Attr("transpose_a", transpose_a) + .Attr("transpose_b", transpose_b) + .Attr("a_is_sparse", a_sparse) + .Attr("b_is_sparse", b_sparse) + .Finalize(g, &ret)); + return ret; +} + +static Graph* SparseMatMulHelper(Graph* g, int m, int n, int d, float sparsity, + bool transpose_a, bool transpose_b, + bool a_sparse, bool b_sparse) { + a_sparse = a_sparse && (sparsity > 0); + b_sparse = b_sparse && (sparsity > 0); + + auto left_shape = transpose_a ? TensorShape({d, m}) : TensorShape({m, d}); + Tensor left(DataTypeToEnum<float>::value, left_shape); + left.flat<float>().setRandom(); + if (a_sparse) { + Sparsify(&left, sparsity); + } + + auto right_shape = transpose_b ? 
TensorShape({n, d}) : TensorShape({d, n}); + Tensor right(DataTypeToEnum<float>::value, right_shape); + right.flat<float>().setRandom(); + if (b_sparse) { + Sparsify(&right, sparsity); + } + + SparseMatMulNode(g, test::graph::Constant(g, left), + test::graph::Constant(g, right), transpose_a, transpose_b, + a_sparse, b_sparse); + return g; +} + +static Graph* SparseMatMul(int m, int n, int d, float sparsity, + bool transpose_a, bool transpose_b) { + Graph* g = new Graph(OpRegistry::Global()); + return SparseMatMulHelper(g, m, n, d, sparsity, transpose_a, transpose_b, + true, false); +} + +static Graph* MultiSparseMatMul(int m, int n, int d, float sparsity_a, + float sparsity_b) { + Graph* g = new Graph(OpRegistry::Global()); + if (sparsity_a == 0 && sparsity_b > 0) { + SparseMatMulHelper(g, m, n, d, sparsity_a, false, false, false, false); + SparseMatMulHelper(g, n, d, m, sparsity_b, true, true, true, false); + SparseMatMulHelper(g, m, d, n, sparsity_b, false, false, true, false); + } else { + SparseMatMulHelper(g, m, n, d, sparsity_a, false, true, true, false); + SparseMatMulHelper(g, d, n, m, sparsity_a, true, false, true, true); + SparseMatMulHelper(g, m, d, n, sparsity_b, false, false, true, false); + } + return g; +} + +#define BM_SPARSE(M, K, N, S) \ + static void BM_Sparse##_##M##_##K##_##N##_##S(int iters) { \ + testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \ + std::string label = strings::Printf("%d_%d_%d_%0.2f", M, K, N, S / 100.0); \ + testing::SetLabel(label); \ + test::Benchmark("cpu", SparseMatMul(M, N, K, S / 100.0, false, false)) \ + .Run(iters); \ + } \ + BENCHMARK(BM_Sparse##_##M##_##K##_##N##_##S); + +BM_SPARSE(2048, 2048, 2048, 0); +BM_SPARSE(2048, 2048, 2048, 1); +BM_SPARSE(2048, 2048, 2048, 85); + +BM_SPARSE(1024, 1024, 1024, 0); +BM_SPARSE(1024, 1024, 1024, 1); +BM_SPARSE(1024, 1024, 1024, 85); + +BM_SPARSE(256, 256, 256, 1); +BM_SPARSE(512, 512, 512, 1); + +#define BM_SPARSE_MULTI(M, K, N, S1, S2) \ + static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2(int iters) { \ + testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 3); \ + std::string label = strings::Printf("%d_%d_%d_%0.2f_%0.2f", M, K, N, \ + S1 / 100.0, S2 / 100.0); \ + testing::SetLabel(label); \ + test::Benchmark("cpu", MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0)) \ + .Run(iters); \ + } \ + BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2); + +BM_SPARSE_MULTI(512, 2140, 4096, 0, 82); +BM_SPARSE_MULTI(512, 4096, 2048, 83, 83); + +#define BM_SPARSE_TR(M, K, N, S, TA, TB) \ + static void BM_Sparse##_##M##_##K##_##N##_##S##_##TA##_##TB(int iters) { \ + testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \ + std::string label = \ + strings::Printf("%d_%d_%d_%d_%d_%0.2f", M, K, N, TA, TB, S / 100.0); \ + testing::SetLabel(label); \ + test::Benchmark("cpu", SparseMatMul(M, N, K, S / 100.0, TA, TB)) \ + .Run(iters); \ + } \ + BENCHMARK(BM_Sparse##_##M##_##K##_##N##_##S##_##TA##_##TB); + +BM_SPARSE_TR(2048, 2048, 2048, 1, true, false); +BM_SPARSE_TR(2048, 2048, 2048, 1, false, true); +BM_SPARSE_TR(2048, 2048, 2048, 1, true, true); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/sparse_reorder_op.cc b/tensorflow/core/kernels/sparse_reorder_op.cc new file mode 100644 index 0000000000..fd6824a4e2 --- /dev/null +++ b/tensorflow/core/kernels/sparse_reorder_op.cc @@ -0,0 +1,71 @@ +#define EIGEN_USE_THREADS + +#include <algorithm> +#include <unordered_map> +#include <utility> + +#include 
"tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_util.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/util/sparse/sparse_tensor.h" + +namespace tensorflow { + +template <typename T> +class SparseReorderOp : public OpKernel { + public: + explicit SparseReorderOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input_ind = context->input(0); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_ind.shape()), + errors::InvalidArgument( + "Input indices should be a matrix but received shape", + input_ind.shape().DebugString())); + + const Tensor& input_val = context->input(1); + OP_REQUIRES(context, TensorShapeUtils::IsVector(input_val.shape()), + errors::InvalidArgument( + "Input values should be a vector but received shape", + input_val.shape().DebugString())); + + const Tensor& input_shape_in = context->input(2); + OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()), + errors::InvalidArgument( + "Input shape should be a vector but received shape", + input_shape_in.shape().DebugString())); + + const TensorShape input_shape(input_shape_in.vec<int64>()); + + gtl::InlinedVector<int64, 8> std_order(input_shape.dims()); + std::iota(std_order.begin(), std_order.end(), 0); + + // Check if the sparse tensor is already ordered correctly + sparse::SparseTensor input_sp(input_ind, input_val, input_shape, std_order); + + if (input_sp.IndicesValid()) { + context->set_output(0, input_sp.indices()); + context->set_output(1, input_sp.values()); + } else { + // Deep-copy the input Tensors, then reorder in-place + sparse::SparseTensor reordered_sp(tensor::DeepCopy(input_ind), + tensor::DeepCopy(input_val), + input_shape); + reordered_sp.Reorder<T>(std_order); + context->set_output(0, reordered_sp.indices()); + context->set_output(1, reordered_sp.values()); + } + } +}; + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("SparseReorder").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + SparseReorderOp<type>) + +TF_CALL_ALL_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS +} // namespace tensorflow diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc new file mode 100644 index 0000000000..47e91c134d --- /dev/null +++ b/tensorflow/core/kernels/sparse_to_dense_op.cc @@ -0,0 +1,129 @@ +// See core/ops/sparse_ops.cc for documentation. +// +// NOTE: the operations in this file only are suitable for execution +// on CPUs. + +#define EIGEN_USE_THREADS + +#include <string> +#include <sstream> +#include <unordered_map> +#include <utility> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/util/sparse/sparse_tensor.h" + +namespace tensorflow { + +// Operator to convert sparse representations to dense. 
+template <typename T, typename Index> +class SparseToDense : public OpKernel { + public: + explicit SparseToDense(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* c) override { + // sparse_indices + const Tensor& indices = c->input(0); + OP_REQUIRES(c, indices.dims() <= 2, + errors::InvalidArgument( + "sparse_indices should be a scalar, vector, or matrix, " + "got shape ", + indices.shape().ShortDebugString())); + const int64 num_elems = indices.dims() > 0 ? indices.dim_size(0) : 1; + const int64 num_dims = indices.dims() > 1 ? indices.dim_size(1) : 1; + + // output_shape + const Tensor& output_shape = c->input(1); + OP_REQUIRES( + c, TensorShapeUtils::IsLegacyVector(output_shape.shape()), + errors::InvalidArgument("output_shape should be a vector, got shape ", + output_shape.shape().ShortDebugString())); + OP_REQUIRES(c, output_shape.NumElements() == num_dims, + errors::InvalidArgument( + "output_shape has incorrect number of elements: ", + output_shape.NumElements(), " should be: ", num_dims)); + + // sparse_values + const Tensor& sparse_values = c->input(2); + const int64 num_values = sparse_values.NumElements(); + OP_REQUIRES( + c, sparse_values.dims() == 0 || + (sparse_values.dims() == 1 && num_values == num_elems), + errors::InvalidArgument("sparse_values has incorrect shape ", + sparse_values.shape().ShortDebugString(), + ", should be [] or [", num_elems, "]")); + + // default_value + const Tensor& default_value = c->input(3); + OP_REQUIRES(c, TensorShapeUtils::IsScalar(default_value.shape()), + errors::InvalidArgument("default_value should be a scalar.")); + + auto output_shape_vec = output_shape.flat<Index>(); + Tensor* output = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, TensorShapeUtils::MakeShape( + output_shape_vec.data(), + output_shape_vec.size()), + &output)); + + TensorShape ix_shape({num_elems, num_dims}); + Tensor indices_shaped(DT_INT64, ix_shape); + if (indices.dtype() == DT_INT64) { + CHECK(indices_shaped.CopyFrom(indices, ix_shape)); + } else { + indices_shaped.matrix<int64>() = + indices.shaped<Index, 2>(ix_shape.dim_sizes()).template cast<int64>(); + } + + // If we received a scalar, we'll need to create a new + // tensor with copies of the values as a vec. + // TODO(ebrevdo): find a way to avoid this temp allocation. + Tensor sparse_values_b; + + if (TensorShapeUtils::IsScalar(sparse_values.shape())) { + OP_REQUIRES_OK( + c, c->allocate_temp(DataTypeToEnum<T>::value, + TensorShape({num_elems}), &sparse_values_b)); + sparse_values_b.vec<T>().setConstant(sparse_values.scalar<T>()()); + } else { + sparse_values_b = sparse_values; + } + + gtl::InlinedVector<int64, 8> order(output->shape().dims()); + std::iota(order.begin(), order.end(), 0); // Assume order is correct + sparse::SparseTensor st(indices_shaped, sparse_values_b, output->shape(), + order); + + output->flat<T>().setConstant(default_value.scalar<T>()()); + OP_REQUIRES(c, st.template ToDense<T>(output, false /* initialize */), + errors::InvalidArgument( + "Indices are not valid (out of bounds). 
Shape: ", + output->shape().DebugString())); + } +}; + +#define REGISTER_KERNELS(type, index_type) \ + REGISTER_KERNEL_BUILDER(Name("SparseToDense") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .TypeConstraint<index_type>("Tindices"), \ + SparseToDense<type, index_type>); + +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(type, int32); \ + REGISTER_KERNELS(type, int64); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS_ALL); +REGISTER_KERNELS_ALL(bool); +REGISTER_KERNELS_ALL(string); + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc new file mode 100644 index 0000000000..e9800ccd68 --- /dev/null +++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc @@ -0,0 +1,283 @@ +#include <functional> +#include <vector> + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> + +namespace tensorflow { + +namespace { + +class SparseToDenseTest : public OpsTestBase { + protected: + void SetUp() override { RequireDefaultOps(); } + + void MakeOp(int dim, DataType index_type, DataType value_type) { + ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense") + .Input(FakeInput(index_type)) + .Input(FakeInput(index_type)) + .Input(FakeInput(value_type)) + .Input(FakeInput(value_type)) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(SparseToDenseTest, OneD_OneValue) { + MakeOp(1, DT_INT32, DT_FLOAT); + + // sparse_indices + AddInputFromArray<int32>(TensorShape({3}), {1, 3, 4}); + // output_shape + AddInputFromArray<int32>(TensorShape({1}), {5}); + // sparse_values + AddInputFromArray<float>(TensorShape({}), {2}); + // default_value + AddInputFromArray<float>(TensorShape({}), {-2}); + + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, {5}); + test::FillValues<float>(&expected, {-2, 2, -2, 2, 2}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(SparseToDenseTest, OneD_OneValue_int64_double) { + MakeOp(1, DT_INT64, DT_DOUBLE); + + // sparse_indices + AddInputFromArray<int64>(TensorShape({3}), {1, 3, 4}); + // output_shape + AddInputFromArray<int64>(TensorShape({1}), {5}); + // sparse_values + AddInputFromArray<double>(TensorShape({}), {2}); + // default_value + AddInputFromArray<double>(TensorShape({}), {-2}); + + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_DOUBLE, {5}); + test::FillValues<double>(&expected, {-2, 2, -2, 2, 2}); + test::ExpectTensorEqual<double>(expected, *GetOutput(0)); +} + +TEST_F(SparseToDenseTest, OneD_MultValues) { + MakeOp(1, DT_INT32, DT_FLOAT); + + // sparse_indices + AddInputFromArray<int32>({3}, {1, 3, 4}); + // output_shape + AddInputFromArray<int32>({1}, {5}); + // sparse_values + AddInputFromArray<float>({3}, {3, 4, 
5}); + // default_value + AddInputFromArray<float>({}, {-2}); + + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, {5}); + test::FillValues<float>(&expected, {-2, 3, -2, 4, 5}); + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(SparseToDenseTest, TwoD_OneValue) { + MakeOp(2, DT_INT32, DT_FLOAT); + + // sparse_indices + AddInputFromArray<int32>(TensorShape({3, 2}), {0, 1, 0, 2, 2, 3}); + // output_shape + AddInputFromArray<int32>(TensorShape({2}), {3, 4}); + // sparse_values + AddInputFromArray<float>(TensorShape({}), {2}); + // default_value + AddInputFromArray<float>(TensorShape({}), {-2}); + + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, {3, 4}); + expected.flat<float>().setConstant(-2); + expected.tensor<float, 2>()(0, 1) = 2; + expected.tensor<float, 2>()(0, 2) = 2; + expected.tensor<float, 2>()(2, 3) = 2; + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(SparseToDenseTest, TwoD_MultValues) { + MakeOp(2, DT_INT32, DT_FLOAT); + + // sparse_indices + AddInputFromArray<int32>(TensorShape({3, 2}), {0, 1, 0, 2, 2, 3}); + // output_shape + AddInputFromArray<int32>(TensorShape({2}), {3, 4}); + // sparse_values + AddInputFromArray<float>(TensorShape({3}), {3, 4, 5}); + // default_value + AddInputFromArray<float>(TensorShape({}), {-2}); + + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, {3, 4}); + expected.flat<float>().setConstant(-2); + expected.tensor<float, 2>()(0, 1) = 3; + expected.tensor<float, 2>()(0, 2) = 4; + expected.tensor<float, 2>()(2, 3) = 5; + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(SparseToDenseTest, ThreeD_OneValue) { + MakeOp(3, DT_INT32, DT_FLOAT); + + // sparse_indices + AddInputFromArray<int32>(TensorShape({3, 3}), {0, 1, 1, 0, 2, 0, 2, 3, 1}); + // output_shape + AddInputFromArray<int32>(TensorShape({3}), {3, 4, 2}); + // sparse_values + AddInputFromArray<float>(TensorShape({}), {2}); + // default_value + AddInputFromArray<float>(TensorShape({}), {-2}); + + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, {3, 4, 2}); + expected.flat<float>().setConstant(-2); + expected.tensor<float, 3>()(0, 1, 1) = 2; + expected.tensor<float, 3>()(0, 2, 0) = 2; + expected.tensor<float, 3>()(2, 3, 1) = 2; + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +TEST_F(SparseToDenseTest, ThreeD_MultValues) { + MakeOp(3, DT_INT32, DT_FLOAT); + + // sparse_indices + AddInputFromArray<int32>(TensorShape({3, 3}), {0, 1, 1, 0, 2, 0, 2, 3, 1}); + // output_shape + AddInputFromArray<int32>(TensorShape({3}), {3, 4, 2}); + // sparse_values + AddInputFromArray<float>(TensorShape({3}), {3, 4, 5}); + // default_value + AddInputFromArray<float>(TensorShape({}), {-2}); + + ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, {3, 4, 2}); + expected.flat<float>().setConstant(-2); + expected.tensor<float, 3>()(0, 1, 1) = 3; + expected.tensor<float, 3>()(0, 2, 0) = 4; + expected.tensor<float, 3>()(2, 3, 1) = 5; + test::ExpectTensorEqual<float>(expected, *GetOutput(0)); +} + +} // namespace + +static int BM_Arg(int ndim, int n) { return (ndim * 1000000) + n; } +static int NDIM_from_arg(int bm_arg) { return bm_arg / 1000000; } +static int N_from_arg(int bm_arg) { return bm_arg % 1000000; } + +static void BM_SparseToDense(int iters, const int bm_arg) { + const int NDIM = NDIM_from_arg(bm_arg); + const int N = N_from_arg(bm_arg); + // TODO(zhifengc): Switch to use kernel_benchmark_testlib.h + tensorflow::testing::StopTiming(); + + 
const int IndexDim = (NDIM == 1) ? 0 : 1; + + std::unique_ptr<Device> device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + gtl::InlinedVector<TensorValue, 4> inputs; + + // Create a dense tensor with dims [1, ..., 1, N] + Tensor output_shape(DT_INT32, TensorShape({NDIM})); + Tensor sparse_indices(DT_INT32, TensorShape({N, NDIM})); + Tensor sparse_values(DT_FLOAT, TensorShape({N})); + Tensor default_value(DT_FLOAT, TensorShape({})); + auto output_shape_t = output_shape.vec<int32>(); + for (int d = 0; d < NDIM; ++d) { + output_shape_t(d) = (d == IndexDim) ? N : 3; + } + + auto sparse_indices_t = sparse_indices.matrix<int32>(); + for (int n = 0; n < N; ++n) { + for (int d = 0; d < NDIM; ++d) + sparse_indices_t(n, d) = (d == IndexDim) ? n : 0; + } + + for (auto* ptr : + {&sparse_indices, &output_shape, &sparse_values, &default_value}) { + inputs.push_back({nullptr, ptr}); + } + + NodeDef sparse_node_def; + TF_CHECK_OK(NodeDefBuilder("sparsetodense", "SparseToDense") + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Finalize(&sparse_node_def)); + + Status status; + std::unique_ptr<OpKernel> op(CreateOpKernel( + DEVICE_CPU, device.get(), cpu_allocator(), sparse_node_def, &status)); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + params.inputs = &inputs; + params.op_kernel = op.get(); + params.output_alloc_attr = [&device, &op, ¶ms](int index) { + AllocatorAttributes attr; + const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + return attr; + }; + + std::unique_ptr<OpKernelContext> sparse_context(new OpKernelContext(params)); + op->Compute(sparse_context.get()); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + delete sparse_context->release_output(0).tensor; + op->Compute(sparse_context.get()); + ASSERT_OK(sparse_context->status()); + } + tensorflow::testing::StopTiming(); + + // processing input, mainly + int64 bytes_per_iter = static_cast<int64>((N + N * NDIM) * sizeof(float)); + + tensorflow::testing::BytesProcessed(bytes_per_iter * iters); +} + +BENCHMARK(BM_SparseToDense) + ->Arg(BM_Arg(1, 10)) + ->Arg(BM_Arg(1, 100)) + ->Arg(BM_Arg(1, 1000)) + ->Arg(BM_Arg(1, 10000)) + ->Arg(BM_Arg(2, 10)) + ->Arg(BM_Arg(2, 100)) + ->Arg(BM_Arg(2, 1000)) + ->Arg(BM_Arg(2, 10000)) + ->Arg(BM_Arg(3, 10)) + ->Arg(BM_Arg(3, 100)) + ->Arg(BM_Arg(3, 1000)) + ->Arg(BM_Arg(3, 10000)) + ->Arg(BM_Arg(5, 10)) + ->Arg(BM_Arg(5, 100)) + ->Arg(BM_Arg(5, 1000)) + ->Arg(BM_Arg(5, 10000)); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc new file mode 100644 index 0000000000..f4f9ada000 --- /dev/null +++ b/tensorflow/core/kernels/split_op.cc @@ -0,0 +1,146 @@ +// See docs in ../ops/array_ops.cc. 
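Conceptually SparseToDense is a fill-then-scatter: the output is set to default_value everywhere, indices are widened to int64 if needed, and each sparse value is written at the linearized position of its index row, with ToDense failing when an index falls out of bounds. A minimal dense-vector sketch of the same idea; ScatterToDense is an illustration-only helper with no validation beyond a bounds check:

#include <cstdint>
#include <vector>

// indices: [num, ndims] row-major, shape: [ndims], values: [num].
// Returns false if any index falls outside `shape`.
bool ScatterToDense(const std::vector<int64_t>& indices,
                    const std::vector<int64_t>& shape,
                    const std::vector<float>& values, float default_value,
                    std::vector<float>* dense) {
  const int ndims = static_cast<int>(shape.size());
  const int num = static_cast<int>(values.size());
  int64_t total = 1;
  for (int64_t d : shape) total *= d;
  dense->assign(total, default_value);
  for (int i = 0; i < num; ++i) {
    int64_t offset = 0;
    for (int d = 0; d < ndims; ++d) {
      const int64_t ix = indices[i * ndims + d];
      if (ix < 0 || ix >= shape[d]) return false;  // out of bounds
      offset = offset * shape[d] + ix;             // row-major linearization
    }
    (*dense)[offset] = values[i];
  }
  return true;
}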
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/split_op.h" + +#include <vector> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class SplitOp : public OpKernel { + public: + explicit SplitOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* context) override { + const int32 split_dim = context->input(0).flat<int32>()(0); + const int32 num_split = num_outputs(); + const Tensor& input = context->input(1); + const TensorShape& input_shape = input.shape(); + + OP_REQUIRES( + context, 0 <= split_dim && split_dim < input_shape.dims(), + errors::InvalidArgument("0 <= split_dim < number of input dimensions (", + input_shape.dims(), "), but got ", split_dim)); + + OP_REQUIRES( + context, num_split > 0, + errors::InvalidArgument( + "Number of ways to split should be > 0, but got ", num_split)); + + OP_REQUIRES(context, input_shape.dim_size(split_dim) % num_split == 0, + errors::InvalidArgument( + "Number of ways to split should evenly divide the split " + "dimension, but got split_dim ", + split_dim, " (size = ", input_shape.dim_size(split_dim), + ") ", "and num_split ", num_split)); + + // Special case 1: num_split == 1. Nothing to do. + if (num_split == 1) { + VLOG(1) << "Split identity"; + context->set_output(0, context->input(1)); + return; + } + + // Special case 2: split along the 1st dimension. We can share the + // underlying buffer. + // + // Apply this optimization conservatively: if input is aligned, + // the resulting tensors must be aligned. It's conservative + // because if the immediate consumer of the resulting tensors are + // not using eigen for computation, its perfectly fine to avoid + // the copying. 
+ if ((split_dim == 0) && IsInnerDimsSizeAligned<T>(input_shape)) { + VLOG(1) << "Slice dim 0: " << input_shape.DebugString(); + const int64 delta = input_shape.dim_size(0) / num_split; + for (int i = 0; i < num_split; ++i) { + context->set_output(i, input.Slice(i * delta, (i + 1) * delta)); + } + return; + } + + int32 prefix_dim_size = 1; + for (int i = 0; i < split_dim; ++i) { + prefix_dim_size *= input_shape.dim_size(i); + } + + int32 split_dim_size = input_shape.dim_size(split_dim); + + int32 suffix_dim_size = 1; + for (int i = split_dim + 1; i < input_shape.dims(); ++i) { + suffix_dim_size *= input_shape.dim_size(i); + } + + auto input_reshaped = + input.shaped<T, 3>({prefix_dim_size, split_dim_size, suffix_dim_size}); + + const int32 split_dim_output_size = split_dim_size / num_split; + TensorShape output_shape(input_shape); + output_shape.set_dim(split_dim, split_dim_output_size); + + Eigen::DSizes<ptrdiff_t, 3> indices{0, 0, 0}; + Eigen::DSizes<ptrdiff_t, 3> sizes{prefix_dim_size, split_dim_output_size, + suffix_dim_size}; + + for (int i = 0; i < num_split; ++i) { + Tensor* result = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(i, output_shape, &result)); + if (prefix_dim_size * split_dim_output_size * suffix_dim_size > 0) { + Eigen::DSizes<ptrdiff_t, 3> slice_indices; + Eigen::DSizes<ptrdiff_t, 3> slice_sizes; + for (int j = 0; j < 3; ++j) { + slice_indices[j] = indices[j]; + slice_sizes[j] = sizes[j]; + } + + auto result_shaped = result->shaped<T, 3>( + {prefix_dim_size, split_dim_output_size, suffix_dim_size}); + + functor::Split<Device, T>()(context->eigen_device<Device>(), + result_shaped, input_reshaped, + slice_indices, slice_sizes); + } + indices[1] += split_dim_output_size; + } + } +}; + +#define REGISTER_SPLIT(type) \ + REGISTER_KERNEL_BUILDER(Name("Split") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("split_dim"), \ + SplitOp<CPUDevice, type>) + +TF_CALL_ALL_TYPES(REGISTER_SPLIT); + +#undef REGISTER_SPLIT + +#if GOOGLE_CUDA + +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER(Name("Split") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T") \ + .HostMemory("split_dim"), \ + SplitOp<GPUDevice, type>) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +#undef REGISTER_GPU + +#endif // GOOGLE_CUDA + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/split_op.h b/tensorflow/core/kernels/split_op.h new file mode 100644 index 0000000000..2572c77285 --- /dev/null +++ b/tensorflow/core/kernels/split_op.h @@ -0,0 +1,31 @@ +#ifndef TENSORFLOW_KERNELS_SPLIT_OP_H_ +#define TENSORFLOW_KERNELS_SPLIT_OP_H_ +// Functor definition for SplitOp, must be compilable by nvcc. 
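Outside the split_dim == 0 fast path (where each output can alias a slice of the input buffer), SplitOp collapses the input into a 3-D view {prefix, split_dim_size, suffix}, with prefix the product of the dimensions before split_dim and suffix the product of those after it; each output is then a contiguous slice of length split_dim_size / num_split along the middle axis. A sketch of the index arithmetic for pulling the i-th split out of a flat row-major buffer; ExtractSplit is an illustration-only helper, the kernel delegates the actual copy to the Eigen slice in functor::Split:

#include <algorithm>
#include <vector>

// Copies split `i` of `num_split` along the middle axis of a row-major
// buffer viewed as [prefix, split_size, suffix].
std::vector<float> ExtractSplit(const std::vector<float>& input, int prefix,
                                int split_size, int suffix, int num_split,
                                int i) {
  const int out_split = split_size / num_split;  // exact division, as checked
  std::vector<float> out(static_cast<size_t>(prefix) * out_split * suffix);
  for (int p = 0; p < prefix; ++p) {
    for (int s = 0; s < out_split; ++s) {
      const float* src =
          &input[(p * split_size + i * out_split + s) * suffix];
      float* dst = &out[(p * out_split + s) * suffix];
      std::copy(src, src + suffix, dst);
    }
  }
  return out;
}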
+ +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T> +struct Split { + void operator()(const Device& d, typename TTypes<T, 3>::Tensor output, + typename TTypes<T, 3>::ConstTensor input, + const Eigen::DSizes<ptrdiff_t, 3>& slice_indices, + const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes); +}; + +template <typename T> +struct Split<Eigen::ThreadPoolDevice, T> { + void operator()(const Eigen::ThreadPoolDevice& d, + typename TTypes<T, 3>::Tensor output, + typename TTypes<T, 3>::ConstTensor input, + const Eigen::DSizes<ptrdiff_t, 3>& slice_indices, + const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_SPLIT_OP_H_ diff --git a/tensorflow/core/kernels/split_op_cpu.cc b/tensorflow/core/kernels/split_op_cpu.cc new file mode 100644 index 0000000000..b86deeb8fb --- /dev/null +++ b/tensorflow/core/kernels/split_op_cpu.cc @@ -0,0 +1,30 @@ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/split_op.h" + +#include "tensorflow/core/framework/numeric_types.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template <typename T> +void Split<Eigen::ThreadPoolDevice, T>::operator()( + const Eigen::ThreadPoolDevice& d, typename TTypes<T, 3>::Tensor output, + typename TTypes<T, 3>::ConstTensor input, + const Eigen::DSizes<ptrdiff_t, 3>& slice_indices, + const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes) { + if (output.size() < 131072) { + output = input.slice(slice_indices, slice_sizes); + } else { + output.device(d) = input.slice(slice_indices, slice_sizes); + } +} + +#define DEFINE_CPU_KERNELS(T) template struct Split<Eigen::ThreadPoolDevice, T>; + +TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS) + +} // namespace functor +} // namespace tensorflow diff --git a/tensorflow/core/kernels/split_op_gpu.cu.cc b/tensorflow/core/kernels/split_op_gpu.cu.cc new file mode 100644 index 0000000000..f8931d6a89 --- /dev/null +++ b/tensorflow/core/kernels/split_op_gpu.cu.cc @@ -0,0 +1,31 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include <stdio.h> + +#include "tensorflow/core/kernels/split_op.h" + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T> +void Split<Device, T>::operator()( + const Device& d, typename TTypes<T, 3>::Tensor output, + typename TTypes<T, 3>::ConstTensor input, + const Eigen::DSizes<ptrdiff_t, 3>& slice_indices, + const Eigen::DSizes<ptrdiff_t, 3>& slice_sizes) { + output.device(d) = input.slice(slice_indices, slice_sizes); +} + +#define DEFINE_GPU_KERNELS(T) template struct Split<Eigen::GpuDevice, T>; + +TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/string_to_hash_bucket_op.cc b/tensorflow/core/kernels/string_to_hash_bucket_op.cc new file mode 100644 index 0000000000..bd6fa47268 --- /dev/null +++ b/tensorflow/core/kernels/string_to_hash_bucket_op.cc @@ -0,0 +1,47 @@ +#include <string> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/public/status.h" +#include 
"tensorflow/core/public/tensor.h" + +namespace tensorflow { + +class StringToHashBucketOp : public OpKernel { + public: + explicit StringToHashBucketOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_buckets", &num_buckets_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor* input_tensor; + OP_REQUIRES_OK(context, context->input("string_tensor", &input_tensor)); + const auto& input_flat = input_tensor->flat<string>(); + + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output("output", input_tensor->shape(), + &output_tensor)); + auto output_flat = output_tensor->flat<int64>(); + + for (int i = 0; i < input_flat.size(); ++i) { + const uint64 input_hash = Hash64(input_flat(i)); + const uint64 bucket_id = input_hash % num_buckets_; + // The number of buckets is always in the positive range of int64 so is + // the resulting bucket_id. Casting the bucket_id from uint64 to int64 is + // safe. + output_flat(i) = static_cast<int64>(bucket_id); + } + } + + private: + int64 num_buckets_; + + TF_DISALLOW_COPY_AND_ASSIGN(StringToHashBucketOp); +}; + +REGISTER_KERNEL_BUILDER(Name("StringToHashBucket").Device(DEVICE_CPU), + StringToHashBucketOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/string_to_number_op.cc b/tensorflow/core/kernels/string_to_number_op.cc new file mode 100644 index 0000000000..8d23a4fdf8 --- /dev/null +++ b/tensorflow/core/kernels/string_to_number_op.cc @@ -0,0 +1,71 @@ +// See docs in ../ops/parse_ops.cc. + +#include <errno.h> +#include <string> + +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +static constexpr char kErrorMessage[] = + "StringToNumberOp could not correctly convert string: "; + +template <typename OutputType> +class StringToNumberOp : public OpKernel { + public: + using OpKernel::OpKernel; + + void Compute(OpKernelContext* context) override { + // This is not a deep copy of the input tensor; they will share the same + // underlying storage. + const Tensor* input_tensor; + OP_REQUIRES_OK(context, context->input("string_tensor", &input_tensor)); + const auto& input_flat = input_tensor->flat<string>(); + + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output("output", input_tensor->shape(), + &output_tensor)); + auto output_flat = output_tensor->flat<OutputType>(); + + for (int i = 0; i < input_flat.size(); ++i) { + const char* s = input_flat(i).data(); + Convert(s, &output_flat(i), context); + } + } + + private: + void Convert(const char* s, OutputType* output_data, + OpKernelContext* context); +}; + +template <> +void StringToNumberOp<float>::Convert(const char* s, float* output_data, + OpKernelContext* context) { + OP_REQUIRES(context, strings::safe_strtof(s, output_data), + errors::InvalidArgument(kErrorMessage, s)); +} + +template <> +void StringToNumberOp<int32>::Convert(const char* s, int32* output_data, + OpKernelContext* context) { + OP_REQUIRES(context, strings::safe_strto32(s, output_data), + errors::InvalidArgument(kErrorMessage, s)); +} + +// Registers the currently supported output types. 
+#define REGISTER(type) \ + REGISTER_KERNEL_BUILDER(Name("StringToNumber") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<type>("out_type"), \ + StringToNumberOp<type>) +REGISTER(float); +REGISTER(int32); +#undef REGISTER + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc new file mode 100644 index 0000000000..ba765f2e84 --- /dev/null +++ b/tensorflow/core/kernels/summary_image_op.cc @@ -0,0 +1,169 @@ +// Operators that deal with SummaryProtos (encoded as DT_STRING tensors) as +// inputs or outputs in various ways. + +// See docs in ../ops/summary_ops.cc. + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/png/png_io.h" + +namespace tensorflow { + +class SummaryImageOp : public OpKernel { + public: + explicit SummaryImageOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("max_images", &max_images_)); + const TensorProto* proto; + OP_REQUIRES_OK(context, context->GetAttr("bad_color", &proto)); + OP_REQUIRES_OK(context, context->device()->MakeTensorFromProto( + *proto, AllocatorAttributes(), &bad_color_)); + OP_REQUIRES(context, bad_color_.dtype() == DT_UINT8, + errors::InvalidArgument("bad_color must be uint8, got ", + DataTypeString(bad_color_.dtype()))); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(bad_color_.shape()), + errors::InvalidArgument("bad_color must be a vector, got shape ", + bad_color_.shape().ShortDebugString())); + } + + void Compute(OpKernelContext* c) override { + const Tensor& tags = c->input(0); + const Tensor& tensor = c->input(1); + OP_REQUIRES(c, TensorShapeUtils::IsLegacyScalar(tags.shape()), + errors::InvalidArgument("Tags must have be a scalar")); + OP_REQUIRES(c, tensor.dims() == 4 && + (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 || + tensor.dim_size(3) == 4), + errors::InvalidArgument( + "Tensor must be 4-D with last dim 1, 3, or 4, not ", + tensor.shape().DebugString())); + const string& base_tag = tags.scalar<string>()(); + + const int batch_size = tensor.dim_size(0); + const int h = tensor.dim_size(1); + const int w = tensor.dim_size(2); + const int hw = h * w; // Compact these two dims for simplicity + const int depth = tensor.dim_size(3); + auto tensor_eigen = tensor.shaped<float, 3>({batch_size, hw, depth}); + + OP_REQUIRES(c, bad_color_.dim_size(0) >= depth, + errors::InvalidArgument( + "expected depth <= bad_color.size, got depth = ", depth, + ", bad_color.size = ", bad_color_.dim_size(0))); + auto bad_color_full = bad_color_.vec<uint8>(); + typename TTypes<uint8>::Vec bad_color(bad_color_full.data(), depth); + + // RGB (or gray or RGBA) is last dimension + Eigen::Tensor<uint8, 2, Eigen::RowMajor> image(hw, depth); + + Summary s; + const int N = std::min<int>(max_images_, batch_size); + for (int i = 0; i < N; ++i) { + Summary::Value* v = s.add_value(); + // The tag depends on the number of requested images (not the number + // produced.) + // + // Note that later on avisu uses "/" to figure out a consistent naming + // convention for display, so we append "/image" to guarantee that the + // image(s) won't be displayed in the global scope with no name. 
+ if (max_images_ > 1) { + v->set_tag(strings::StrCat(base_tag, "/image/", i)); + } else { + v->set_tag(strings::StrCat(base_tag, "/image")); + } + + if (image.size()) { + typename TTypes<float>::ConstMatrix values( + &tensor_eigen(i, 0, 0), + Eigen::DSizes<Eigen::DenseIndex, 2>(hw, depth)); + + // Rescale the image to uint8 range. + // + // We are trying to generate an RCG image from a float tensor. We do + // not have any info about the expected range of values in the tensor + // but the generated image needs to have all RGB values within [0, 255]. + // + // We use two different algorithms to generate these values. If the + // tensor has only positive values we scale them all by 255/max(values). + // If the tensor has both negative and positive values we scale them by + // the max of their absolute values and center them around 127. + // + // This works for most cases, but has the incovenient of not respecting + // the relative dynamic range across different instances of the tensor. + + // Compute min and max ignoring nonfinite pixels + float image_min = std::numeric_limits<float>::infinity(); + float image_max = -image_min; + for (int i = 0; i < hw; i++) { + bool finite = true; + for (int j = 0; j < depth; j++) { + if (!std::isfinite(values(i, j))) { + finite = false; + break; + } + } + if (finite) { + for (int j = 0; j < depth; j++) { + float value = values(i, j); + image_min = std::min(image_min, value); + image_max = std::max(image_max, value); + } + } + } + + // Pick an affine transform into uint8 + const float kZeroThreshold = 1e-6; + float scale, offset; + if (image_min < 0) { + float max_val = std::max(std::abs(image_min), std::abs(image_max)); + scale = max_val < kZeroThreshold ? 0.0f : 127.0f / max_val; + offset = 128.0f; + } else { + scale = image_max < kZeroThreshold ? 
0.0f : 255.0f / image_max; + offset = 0.0f; + } + + // Transform image, turning nonfinite values to bad_color + for (int i = 0; i < hw; i++) { + bool finite = true; + for (int j = 0; j < depth; j++) { + if (!std::isfinite(values(i, j))) { + finite = false; + break; + } + } + if (finite) { + image.chip<0>(i) = + (values.chip<0>(i) * scale + offset).cast<uint8>(); + } else { + image.chip<0>(i) = bad_color; + } + } + } + + Summary::Image* si = v->mutable_image(); + si->set_height(h); + si->set_width(w); + si->set_colorspace(depth); + OP_REQUIRES(c, png::WriteImageToBuffer( + image.data(), w, h, w * depth, depth, 8, -1, + si->mutable_encoded_image_string(), nullptr), + errors::Internal("PNG encoding failed")); + } + + Tensor* summary_tensor = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor)); + CHECK(s.SerializeToString(&summary_tensor->scalar<string>()())); + } + + private: + int64 max_images_; + Tensor bad_color_; +}; + +REGISTER_KERNEL_BUILDER(Name("ImageSummary").Device(DEVICE_CPU), + SummaryImageOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/summary_image_op_test.cc b/tensorflow/core/kernels/summary_image_op_test.cc new file mode 100644 index 0000000000..ddfeeffc0b --- /dev/null +++ b/tensorflow/core/kernels/summary_image_op_test.cc @@ -0,0 +1,141 @@ +#include <functional> +#include <memory> +#include <vector> + +#include <gtest/gtest.h> +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/public/env.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { +namespace { + +static void EXPECT_SummaryMatches(const Summary& actual, + const string& expected_str) { + Summary expected; + CHECK(protobuf::TextFormat::ParseFromString(expected_str, &expected)); + EXPECT_EQ(expected.DebugString(), actual.DebugString()); +} + +// -------------------------------------------------------------------------- +// SummaryImageOp +// -------------------------------------------------------------------------- +class SummaryImageOpTest : public OpsTestBase { + protected: + void MakeOp(int max_images) { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "ImageSummary") + .Input(FakeInput()) + .Input(FakeInput()) + .Attr("max_images", max_images) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } + + void CheckAndRemoveEncodedImages(Summary* summary) { + for (int i = 0; i < summary->value_size(); ++i) { + Summary::Value* value = summary->mutable_value(i); + ASSERT_TRUE(value->has_image()) << "No image for value: " << value->tag(); + ASSERT_FALSE(value->image().encoded_image_string().empty()) + << "No encoded_image_string for value: " << value->tag(); + if (VLOG_IS_ON(2)) { + // When LOGGING, output the images to disk for manual inspection. 
+ TF_CHECK_OK(WriteStringToFile( + Env::Default(), strings::StrCat("/tmp/", value->tag(), ".png"), + value->image().encoded_image_string())); + } + value->mutable_image()->clear_encoded_image_string(); + } + } +}; + +TEST_F(SummaryImageOpTest, ThreeGrayImagesOutOfFive4dInput) { + MakeOp(3 /* max images */); + + // Feed and run + AddInputFromArray<string>(TensorShape({}), {"tag"}); + AddInputFromArray<float>(TensorShape({5, 2, 1, 1}), + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}); + ASSERT_OK(RunOpKernel()); + + // Check the output size. + Tensor* out_tensor = GetOutput(0); + ASSERT_EQ(0, out_tensor->dims()); + Summary summary; + ParseProtoUnlimited(&summary, out_tensor->scalar<string>()()); + + CheckAndRemoveEncodedImages(&summary); + EXPECT_SummaryMatches(summary, R"( + value { tag: 'tag/image/0' image { width: 1 height: 2 colorspace: 1} } + value { tag: 'tag/image/1' image { width: 1 height: 2 colorspace: 1} } + value { tag: 'tag/image/2' image { width: 1 height: 2 colorspace: 1} } + )"); +} + +TEST_F(SummaryImageOpTest, OneGrayImage4dInput) { + MakeOp(1 /* max images */); + + // Feed and run + AddInputFromArray<string>(TensorShape({}), {"tag"}); + AddInputFromArray<float>(TensorShape({5 /*batch*/, 2, 1, 1 /*depth*/}), + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}); + ASSERT_OK(RunOpKernel()); + + // Check the output size. + Tensor* out_tensor = GetOutput(0); + ASSERT_EQ(0, out_tensor->dims()); + Summary summary; + ParseProtoUnlimited(&summary, out_tensor->scalar<string>()()); + + CheckAndRemoveEncodedImages(&summary); + EXPECT_SummaryMatches(summary, R"( + value { tag: 'tag/image' image { width: 1 height: 2 colorspace: 1} })"); +} + +TEST_F(SummaryImageOpTest, OneColorImage4dInput) { + MakeOp(1 /* max images */); + + // Feed and run + AddInputFromArray<string>(TensorShape({}), {"tag"}); + AddInputFromArray<float>( + TensorShape({1 /*batch*/, 5 /*rows*/, 2 /*columns*/, 3 /*depth*/}), + { + /* r0, c0, RGB */ 1.0, 0.1, 0.2, + /* r0, c1, RGB */ 1.0, 0.3, 0.4, + /* r1, c0, RGB */ 0.0, 1.0, 0.0, + /* r1, c1, RGB */ 0.0, 1.0, 0.0, + /* r2, c0, RGB */ 0.0, 0.0, 1.0, + /* r2, c1, RGB */ 0.0, 0.0, 1.0, + /* r3, c0, RGB */ 1.0, 1.0, 0.0, + /* r3, c1, RGB */ 1.0, 0.0, 1.0, + /* r4, c0, RGB */ 1.0, 1.0, 0.0, + /* r4, c1, RGB */ 1.0, 0.0, 1.0, + }); + ASSERT_OK(RunOpKernel()); + + // Check the output size. + Tensor* out_tensor = GetOutput(0); + ASSERT_EQ(0, out_tensor->dims()); + Summary summary; + ParseProtoUnlimited(&summary, out_tensor->scalar<string>()()); + + CheckAndRemoveEncodedImages(&summary); + EXPECT_SummaryMatches(summary, R"( + value { tag: 'tag/image' image { width: 2 height: 5 colorspace: 3} })"); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc new file mode 100644 index 0000000000..1c4be64b8b --- /dev/null +++ b/tensorflow/core/kernels/summary_op.cc @@ -0,0 +1,141 @@ +// Operators that deal with SummaryProtos (encoded as DT_STRING tensors) as +// inputs or outputs in various ways. + +// See docs in ../ops/summary_ops.cc. 
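+//
+// Each kernel below emits its result as a scalar DT_STRING tensor holding a
+// serialized Summary proto: ScalarSummary records (tag, simple_value) pairs,
+// HistogramSummary builds a histogram::Histogram over its float input, and
+// MergeSummary concatenates the values of several serialized summaries while
+// rejecting duplicate tags.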
+
+#include <unordered_set>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+template <typename T>
+class SummaryScalarOp : public OpKernel {
+ public:
+  explicit SummaryScalarOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& tags = c->input(0);
+    const Tensor& values = c->input(1);
+
+    OP_REQUIRES(c, tags.IsSameSize(values) ||
+                       (TensorShapeUtils::IsLegacyScalar(tags.shape()) &&
+                        TensorShapeUtils::IsLegacyScalar(values.shape())),
+                errors::InvalidArgument("tags and values not the same shape: ",
+                                        tags.shape().ShortDebugString(), " != ",
+                                        values.shape().ShortDebugString()));
+    auto Ttags = tags.flat<string>();
+    auto Tvalues = values.flat<T>();
+    Summary s;
+    for (int i = 0; i < Ttags.size(); i++) {
+      Summary::Value* v = s.add_value();
+      v->set_tag(Ttags(i));
+      v->set_simple_value(Tvalues(i));
+    }
+
+    Tensor* summary_tensor = nullptr;
+    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+    CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("ScalarSummary")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<float>("T"),
+                        SummaryScalarOp<float>);
+REGISTER_KERNEL_BUILDER(Name("ScalarSummary")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<double>("T"),
+                        SummaryScalarOp<double>);
+
+class SummaryHistoOp : public OpKernel {
+ public:
+  // SummaryHistoOp could be extended to take a list of custom bucket
+  // boundaries as an option.
+  explicit SummaryHistoOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& tags = c->input(0);
+    const Tensor& values = c->input(1);
+    const auto flat = values.flat<float>();
+    OP_REQUIRES(c, TensorShapeUtils::IsLegacyScalar(tags.shape()),
+                errors::InvalidArgument("tags must be scalar"));
+    // Build histogram of values in "values" tensor
+    histogram::Histogram histo;
+    for (int64 i = 0; i < flat.size(); i++) {
+      float v = flat(i);
+      if (!std::isfinite(v)) {
+        c->SetStatus(
+            errors::OutOfRange("Nan in summary histogram for: ", name()));
+        break;
+      }
+      histo.Add(v);
+    }
+
+    Summary s;
+    Summary::Value* v = s.add_value();
+    v->set_tag(tags.scalar<string>()());
+    histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */);
+
+    Tensor* summary_tensor = nullptr;
+    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor));
+    CHECK(s.SerializeToString(&summary_tensor->scalar<string>()()));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("HistogramSummary").Device(DEVICE_CPU),
+                        SummaryHistoOp);
+
+struct HistogramResource : public ResourceBase {
+  histogram::ThreadSafeHistogram histogram;
+
+  string DebugString() override { return "A histogram summary. 
Stats ..."; } +}; + +class SummaryMergeOp : public OpKernel { + public: + explicit SummaryMergeOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* c) override { + Summary s; + std::unordered_set<string> tags; + for (int input_num = 0; input_num < c->num_inputs(); input_num++) { + const Tensor& in = c->input(input_num); + auto in_vec = in.flat<string>(); + for (int i = 0; i < in_vec.dimension(0); i++) { + const string& s_in = in_vec(i); + Summary summary_in; + if (!ParseProtoUnlimited(&summary_in, s_in)) { + c->SetStatus(errors::InvalidArgument( + "Could not parse one of the summary inputs")); + return; + } + + for (int v = 0; v < summary_in.value_size(); v++) { + if (!tags.insert(summary_in.value(v).tag()).second) { + c->SetStatus(errors::InvalidArgument( + strings::StrCat("Duplicate tag ", summary_in.value(v).tag(), + " found in summary inputs"))); + return; + } + *s.add_value() = summary_in.value(v); + } + } + } + + Tensor* summary_tensor = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &summary_tensor)); + CHECK(s.SerializeToString(&summary_tensor->scalar<string>()())); + } +}; + +REGISTER_KERNEL_BUILDER(Name("MergeSummary").Device(DEVICE_CPU), + SummaryMergeOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/summary_op_test.cc b/tensorflow/core/kernels/summary_op_test.cc new file mode 100644 index 0000000000..fd271a6862 --- /dev/null +++ b/tensorflow/core/kernels/summary_op_test.cc @@ -0,0 +1,282 @@ +#include <functional> +#include <memory> +#include <vector> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/lib/histogram/histogram.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/public/env.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +static void EXPECT_SummaryMatches(const Summary& actual, + const string& expected_str) { + Summary expected; + CHECK(protobuf::TextFormat::ParseFromString(expected_str, &expected)); + EXPECT_EQ(expected.DebugString(), actual.DebugString()); +} + +class SummaryScalarOpTest : public OpsTestBase { + protected: + void MakeOp(DataType dt) { + RequireDefaultOps(); + ASSERT_OK(NodeDefBuilder("myop", "ScalarSummary") + .Input(FakeInput()) + .Input(FakeInput(dt)) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(SummaryScalarOpTest, SimpleFloat) { + MakeOp(DT_FLOAT); + + // Feed and run + AddInputFromArray<string>(TensorShape({3}), {"tag1", "tag2", "tag3"}); + AddInputFromArray<float>(TensorShape({3}), {1.0, -0.73, 10000.0}); + ASSERT_OK(RunOpKernel()); + + // Check the output size. 
+ Tensor* out_tensor = GetOutput(0); + ASSERT_EQ(0, out_tensor->dims()); + Summary summary; + ParseProtoUnlimited(&summary, out_tensor->scalar<string>()()); + EXPECT_SummaryMatches(summary, R"( + value { tag: 'tag1' simple_value: 1.0 } + value { tag: 'tag2' simple_value: -0.73 } + value { tag: 'tag3' simple_value: 10000.0 } + )"); +} + +TEST_F(SummaryScalarOpTest, SimpleDouble) { + MakeOp(DT_DOUBLE); + + // Feed and run + AddInputFromArray<string>(TensorShape({3}), {"tag1", "tag2", "tag3"}); + AddInputFromArray<double>(TensorShape({3}), {1.0, -0.73, 10000.0}); + ASSERT_OK(RunOpKernel()); + + // Check the output size. + Tensor* out_tensor = GetOutput(0); + ASSERT_EQ(0, out_tensor->dims()); + Summary summary; + ParseProtoUnlimited(&summary, out_tensor->scalar<string>()()); + EXPECT_SummaryMatches(summary, R"( + value { tag: 'tag1' simple_value: 1.0 } + value { tag: 'tag2' simple_value: -0.73 } + value { tag: 'tag3' simple_value: 10000.0 } + )"); +} + +TEST_F(SummaryScalarOpTest, Error_MismatchedSize) { + MakeOp(DT_FLOAT); + + // Feed and run + AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"}); + AddInputFromArray<float>(TensorShape({3}), {1.0, -0.73, 10000.0}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("not the same shape")) << s; +} + +TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) { + MakeOp(DT_FLOAT); + + // Feed and run + AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"}); + AddInputFromArray<float>(TensorShape({2}), {1.0, -0.73}); + Status s = RunOpKernel(); + EXPECT_TRUE( + StringPiece(s.ToString()).contains("tags and values not the same shape")) + << s; +} + +TEST_F(SummaryScalarOpTest, Error_WrongDimsValues) { + MakeOp(DT_FLOAT); + + // Feed and run + AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"}); + AddInputFromArray<float>(TensorShape({2, 1}), {1.0, -0.73}); + Status s = RunOpKernel(); + EXPECT_TRUE( + StringPiece(s.ToString()).contains("tags and values not the same shape")) + << s; +} + +// -------------------------------------------------------------------------- +// SummaryHistoOp +// -------------------------------------------------------------------------- +class SummaryHistoOpTest : public OpsTestBase { + protected: + void MakeOp() { + ASSERT_OK(NodeDefBuilder("myop", "HistogramSummary") + .Input(FakeInput()) + .Input(FakeInput()) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(SummaryHistoOpTest, Simple) { + MakeOp(); + + // Feed and run + AddInputFromArray<string>(TensorShape({}), {"taghisto"}); + AddInputFromArray<float>(TensorShape({3, 2}), {0.1, -0.7, 4.1, 4., 5., 4.}); + ASSERT_OK(RunOpKernel()); + + // Check the output size. 
+ Tensor* out_tensor = GetOutput(0); + ASSERT_EQ(0, out_tensor->dims()); + Summary summary; + ParseProtoUnlimited(&summary, out_tensor->scalar<string>()()); + ASSERT_EQ(summary.value_size(), 1); + EXPECT_EQ(summary.value(0).tag(), "taghisto"); + histogram::Histogram histo; + EXPECT_TRUE(histo.DecodeFromProto(summary.value(0).histo())); + EXPECT_EQ( + "Count: 6 Average: 2.7500 StdDev: 2.20\n" + "Min: -0.7000 Median: 3.9593 Max: 5.0000\n" + "------------------------------------------------------\n" + "[ -0.76, -0.69 ) 1 16.667% 16.667% ###\n" + "[ 0.093, 0.1 ) 1 16.667% 33.333% ###\n" + "[ 3.8, 4.2 ) 3 50.000% 83.333% ##########\n" + "[ 4.6, 5.1 ) 1 16.667% 100.000% ###\n", + histo.ToString()); +} + +TEST_F(SummaryHistoOpTest, Error_WrongDimsTags) { + MakeOp(); + + // Feed and run + AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"}); + AddInputFromArray<float>(TensorShape({2}), {1.0, -0.73}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s; +} + +TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) { + MakeOp(); + + // Feed and run + AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"}); + AddInputFromArray<float>(TensorShape({2, 1}), {1.0, -0.73}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s; +} + +// -------------------------------------------------------------------------- +// SummaryMergeOp +// -------------------------------------------------------------------------- +class SummaryMergeOpTest : public OpsTestBase { + protected: + void MakeOp(int num_inputs) { + ASSERT_OK(NodeDefBuilder("myop", "MergeSummary") + .Input(FakeInput(num_inputs)) + .Finalize(node_def())); + ASSERT_OK(InitOp()); + } +}; + +TEST_F(SummaryMergeOpTest, Simple) { + MakeOp(1); + + // Feed and run + Summary s1; + ASSERT_TRUE(protobuf::TextFormat::ParseFromString( + "value { tag: \"tag1\" simple_value: 1.0 } " + "value { tag: \"tag2\" simple_value: -0.73 } ", + &s1)); + Summary s2; + ASSERT_TRUE(protobuf::TextFormat::ParseFromString( + "value { tag: \"tag3\" simple_value: 10000.0 }", &s2)); + Summary s3; + ASSERT_TRUE(protobuf::TextFormat::ParseFromString( + "value { tag: \"tag4\" simple_value: 11.0 }", &s3)); + + AddInputFromArray<string>( + TensorShape({3}), + {s1.SerializeAsString(), s2.SerializeAsString(), s3.SerializeAsString()}); + ASSERT_OK(RunOpKernel()); + + // Check the output size. 
+ Tensor* out_tensor = GetOutput(0); + ASSERT_EQ(0, out_tensor->dims()); + Summary summary; + ParseProtoUnlimited(&summary, out_tensor->scalar<string>()()); + + EXPECT_SummaryMatches(summary, + "value { tag: \"tag1\" simple_value: 1.0 } " + "value { tag: \"tag2\" simple_value: -0.73 } " + "value { tag: \"tag3\" simple_value: 10000.0 }" + "value { tag: \"tag4\" simple_value: 11.0 }"); +} + +TEST_F(SummaryMergeOpTest, Simple_MultipleInputs) { + MakeOp(3); + + // Feed and run + Summary s1; + ASSERT_TRUE(protobuf::TextFormat::ParseFromString( + "value { tag: \"tag1\" simple_value: 1.0 } " + "value { tag: \"tag2\" simple_value: -0.73 } ", + &s1)); + Summary s2; + ASSERT_TRUE(protobuf::TextFormat::ParseFromString( + "value { tag: \"tag3\" simple_value: 10000.0 }", &s2)); + Summary s3; + ASSERT_TRUE(protobuf::TextFormat::ParseFromString( + "value { tag: \"tag4\" simple_value: 11.0 }", &s3)); + + AddInputFromArray<string>(TensorShape({}), {s1.SerializeAsString()}); + AddInputFromArray<string>(TensorShape({}), {s2.SerializeAsString()}); + AddInputFromArray<string>(TensorShape({}), {s3.SerializeAsString()}); + ASSERT_OK(RunOpKernel()); + + // Check the output size. + Tensor* out_tensor = GetOutput(0); + ASSERT_EQ(0, out_tensor->dims()); + Summary summary; + ParseProtoUnlimited(&summary, out_tensor->scalar<string>()()); + + EXPECT_SummaryMatches(summary, + "value { tag: \"tag1\" simple_value: 1.0 } " + "value { tag: \"tag2\" simple_value: -0.73 } " + "value { tag: \"tag3\" simple_value: 10000.0 }" + "value { tag: \"tag4\" simple_value: 11.0 }"); +} + +TEST_F(SummaryMergeOpTest, Error_MismatchedSize) { + MakeOp(1); + + // Feed and run + Summary s1; + ASSERT_TRUE(protobuf::TextFormat::ParseFromString( + "value { tag: \"tag1\" simple_value: 1.0 } " + "value { tag: \"tagduplicate\" simple_value: -0.73 } ", + &s1)); + Summary s2; + ASSERT_TRUE(protobuf::TextFormat::ParseFromString( + "value { tag: \"tagduplicate\" simple_value: 1.0 } ", &s2)); + AddInputFromArray<string>(TensorShape({2}), + {s1.SerializeAsString(), s2.SerializeAsString()}); + Status s = RunOpKernel(); + EXPECT_TRUE(StringPiece(s.ToString()).contains("Duplicate tag")) << s; +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/text_line_reader_op.cc b/tensorflow/core/kernels/text_line_reader_op.cc new file mode 100644 index 0000000000..51e4d6a2b8 --- /dev/null +++ b/tensorflow/core/kernels/text_line_reader_op.cc @@ -0,0 +1,99 @@ +// See docs in ../ops/io_ops.cc. 
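+//
+// TextLineReader emits one record per line of the current work file, keyed by
+// "<filename>:<line number>", optionally skipping `skip_header_lines` lines at
+// the start of each file.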
+ +#include <memory> +#include "tensorflow/core/framework/reader_op_kernel.h" +#include "tensorflow/core/kernels/reader_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/inputbuffer.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/public/env.h" + +namespace tensorflow { + +class TextLineReader : public ReaderBase { + public: + TextLineReader(const string& node_name, int skip_header_lines, Env* env) + : ReaderBase(strings::StrCat("TextLineReader '", node_name, "'")), + skip_header_lines_(skip_header_lines), + env_(env), + line_number_(0) {} + + Status OnWorkStartedLocked() override { + line_number_ = 0; + RandomAccessFile* file = nullptr; + TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file)); + input_buffer_.reset(new io::InputBuffer(file, kBufferSize)); + for (; line_number_ < skip_header_lines_; ++line_number_) { + string line_contents; + Status status = input_buffer_->ReadLine(&line_contents); + if (errors::IsOutOfRange(status)) { + // We ignore an end of file error when skipping header lines. + // We will end up skipping this file. + return Status::OK(); + } + TF_RETURN_IF_ERROR(status); + } + return Status::OK(); + } + + Status OnWorkFinishedLocked() override { + input_buffer_.reset(nullptr); + return Status::OK(); + } + + Status ReadLocked(string* key, string* value, bool* produced, + bool* at_end) override { + Status status = input_buffer_->ReadLine(value); + ++line_number_; + if (status.ok()) { + *key = strings::StrCat(current_work(), ":", line_number_); + *produced = true; + return status; + } + if (errors::IsOutOfRange(status)) { // End of file, advance to the next. + *at_end = true; + return Status::OK(); + } else { // Some other reading error + return status; + } + } + + Status ResetLocked() override { + line_number_ = 0; + input_buffer_.reset(nullptr); + return ReaderBase::ResetLocked(); + } + + // TODO(josh11b): Implement serializing and restoring the state. Need + // to create TextLineReaderState proto to store ReaderBaseState, + // line_number_, and input_buffer_->Tell(). + + private: + enum { kBufferSize = 256 << 10 /* 256 kB */ }; + const int skip_header_lines_; + Env* const env_; + int64 line_number_; + std::unique_ptr<io::InputBuffer> input_buffer_; +}; + +class TextLineReaderOp : public ReaderOpKernel { + public: + explicit TextLineReaderOp(OpKernelConstruction* context) + : ReaderOpKernel(context) { + int skip_header_lines = -1; + OP_REQUIRES_OK(context, + context->GetAttr("skip_header_lines", &skip_header_lines)); + OP_REQUIRES(context, skip_header_lines >= 0, + errors::InvalidArgument("skip_header_lines must be >= 0 not ", + skip_header_lines)); + Env* env = context->env(); + SetReaderFactory([this, skip_header_lines, env]() { + return new TextLineReader(name(), skip_header_lines, env); + }); + } +}; + +REGISTER_KERNEL_BUILDER(Name("TextLineReader").Device(DEVICE_CPU), + TextLineReaderOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/tf_record_reader_op.cc b/tensorflow/core/kernels/tf_record_reader_op.cc new file mode 100644 index 0000000000..551be18d5f --- /dev/null +++ b/tensorflow/core/kernels/tf_record_reader_op.cc @@ -0,0 +1,76 @@ +// See docs in ../ops/io_ops.cc. 
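+//
+// TFRecordReader emits one record per io::RecordReader::ReadRecord() call,
+// keyed by "<filename>:<byte offset at which the record starts>".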
+ +#include <memory> +#include "tensorflow/core/framework/reader_op_kernel.h" +#include "tensorflow/core/kernels/reader_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/record_reader.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/public/env.h" + +namespace tensorflow { + +class TFRecordReader : public ReaderBase { + public: + TFRecordReader(const string& node_name, Env* env) + : ReaderBase(strings::StrCat("TFRecordReader '", node_name, "'")), + env_(env), + offset_(0) {} + + Status OnWorkStartedLocked() override { + offset_ = 0; + RandomAccessFile* file = nullptr; + TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(current_work(), &file)); + file_.reset(file); + reader_.reset(new io::RecordReader(file)); + return Status::OK(); + } + + Status OnWorkFinishedLocked() override { + reader_.reset(nullptr); + file_.reset(nullptr); + return Status::OK(); + } + + Status ReadLocked(string* key, string* value, bool* produced, + bool* at_end) override { + *key = strings::StrCat(current_work(), ":", offset_); + Status status = reader_->ReadRecord(&offset_, value); + if (errors::IsOutOfRange(status)) { + *at_end = true; + return Status::OK(); + } + if (!status.ok()) return status; + *produced = true; + return Status::OK(); + } + + Status ResetLocked() override { + offset_ = 0; + reader_.reset(nullptr); + file_.reset(nullptr); + return ReaderBase::ResetLocked(); + } + + // TODO(josh11b): Implement serializing and restoring the state. + + private: + Env* const env_; + uint64 offset_; + std::unique_ptr<RandomAccessFile> file_; + std::unique_ptr<io::RecordReader> reader_; +}; + +class TFRecordReaderOp : public ReaderOpKernel { + public: + explicit TFRecordReaderOp(OpKernelConstruction* context) + : ReaderOpKernel(context) { + Env* env = context->env(); + SetReaderFactory([this, env]() { return new TFRecordReader(name(), env); }); + } +}; + +REGISTER_KERNEL_BUILDER(Name("TFRecordReader").Device(DEVICE_CPU), + TFRecordReaderOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc new file mode 100644 index 0000000000..d5e0e89d60 --- /dev/null +++ b/tensorflow/core/kernels/tile_ops.cc @@ -0,0 +1,460 @@ +// See docs in ../ops/array_ops.cc. 
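+//
+// Tile repeats the input `multiples[i]` times along dimension i, so tiling a
+// [2, 3] tensor with multiples [2, 1] yields a [4, 3] output. TileGrad is its
+// gradient: it folds a [4, 3] gradient back into [2, 3] by summing the
+// contributions of every copy.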
+ +#define EIGEN_USE_THREADS + +#ifdef GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#include "tensorflow/core/kernels/tile_ops.h" + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// -------------------------------------------------------------------------- +template <typename Device> +class TileOp : public OpKernel { + public: + explicit TileOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& multiples = context->input(1); + + OP_REQUIRES( + context, TensorShapeUtils::IsLegacyVector(multiples.shape()), + errors::InvalidArgument("Expected multiples to be 1-D, but got shape ", + multiples.shape().ShortDebugString())); + OP_REQUIRES(context, input.dims() == multiples.NumElements(), + errors::InvalidArgument( + "Expected multiples argument to be a vector of length ", + input.dims(), " but got length ", multiples.dim_size(0))); + + const int input_dims = input.dims(); + const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(), + input_dims); + + TensorShape output_shape; + for (int i = 0; i < input_dims; ++i) { + OP_REQUIRES( + context, multiples_array[i] > 0, + errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ", + multiples_array[i])); + output_shape.AddDim(input.dim_size(i) * multiples_array[i]); + } + Tensor* result = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result)); + +#define HANDLE_DIM(DT, NDIM) \ + if (context->input(0).dtype() == DT && input_dims == NDIM) { \ + HandleCase<DT, NDIM>(context, multiples_array, result); \ + return; \ + } + +#define HANDLE_TYPE(T) \ + HANDLE_DIM(T, 0) \ + HANDLE_DIM(T, 1) \ + HANDLE_DIM(T, 2) \ + HANDLE_DIM(T, 3) \ + HANDLE_DIM(T, 4) \ + HANDLE_DIM(T, 5) + + HANDLE_TYPE(DT_BOOL); + HANDLE_TYPE(DT_FLOAT); + HANDLE_TYPE(DT_DOUBLE); + HANDLE_TYPE(DT_UINT8); + HANDLE_TYPE(DT_INT32); + HANDLE_TYPE(DT_INT16); + HANDLE_TYPE(DT_INT64); + HANDLE_TYPE(DT_STRING); // when DEVICE=CPUDevice. 
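+
+  // Each HANDLE_DIM expansion above dispatches to HandleCase<DT, NDIM> for a
+  // specific dtype/rank pair and returns; any combination not listed falls
+  // through to the Unimplemented error below.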
+ +#undef HANDLE_TYPE +#undef HANDLE_DIM + + OP_REQUIRES(context, false, + errors::Unimplemented( + "TileOp : Unhandled input dimensions, DT : ", + context->input(0).dtype(), ", dims : ", input_dims)); + } + + private: + template <DataType DT, int NDIM> + void HandleCaseImpl(OpKernelContext* context, + const gtl::ArraySlice<int32>& multiples_array, + Tensor* result) { + typedef typename EnumToDataType<DT>::Type T; + Eigen::array<int32, NDIM> broadcast_array; + for (int i = 0; i < NDIM; ++i) { + broadcast_array[i] = multiples_array[i]; + } + functor::Tile<Device, T, NDIM>()( + context->eigen_device<Device>(), result->tensor<T, NDIM>(), + context->input(0).tensor<T, NDIM>(), broadcast_array); + } + + template <DataType DT, int NDIM> + void HandleCase(OpKernelContext* context, + const gtl::ArraySlice<int32>& multiples_array, + Tensor* result); + + TF_DISALLOW_COPY_AND_ASSIGN(TileOp); +}; + +template <typename Device> +template <DataType DT, int NDIM> +inline void TileOp<Device>::HandleCase( + OpKernelContext* context, const gtl::ArraySlice<int32>& multiples_array, + Tensor* result) { + LOG(FATAL) << "TileOp: Invalid combination of Device, DT and NDIM: " + << typeid(Device).name() << ", " << DataTypeString(DT) << ", " + << NDIM; +} + +#define HANDLE_CASE(device, dtype, ndim) \ + template <> \ + template <> \ + void TileOp<device>::HandleCase<dtype, ndim>( \ + OpKernelContext * context, \ + const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { \ + HandleCaseImpl<dtype, ndim>(context, multiples_array, result); \ + } + +#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \ + HANDLE_CASE(device, dtype, 1); \ + HANDLE_CASE(device, dtype, 2); \ + HANDLE_CASE(device, dtype, 3); \ + HANDLE_CASE(device, dtype, 4); \ + HANDLE_CASE(device, dtype, 5); + +#define HANDLE_CASE_DIM(device, dtype) \ + HANDLE_CASE(device, dtype, 0); \ + HANDLE_CASE_DIM_POSITIVE(device, dtype); + +HANDLE_CASE_DIM(CPUDevice, DT_BOOL); +HANDLE_CASE_DIM(CPUDevice, DT_FLOAT); +HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE); +HANDLE_CASE_DIM(CPUDevice, DT_UINT8); +HANDLE_CASE_DIM(CPUDevice, DT_INT32); +HANDLE_CASE_DIM(CPUDevice, DT_INT16); +HANDLE_CASE_DIM(CPUDevice, DT_INT64); +HANDLE_CASE_DIM(CPUDevice, DT_STRING); + +#if GOOGLE_CUDA +// Eigen on GPU does not handle 0-dimension data types yet. 
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT); +HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE); +HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16); +HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32); +HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64); +#endif // GOOGLE_CUDA + +#undef HANDLE_CASE_DIM_POSITIVE +#undef HANDLE_CASE_DIM +#undef HANDLE_CASE + +// -------------------------------------------------------------------------- +template <typename Device> +class TileGradientOp : public OpKernel { + public: + explicit TileGradientOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& multiples = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsLegacyVector(multiples.shape()), + errors::InvalidArgument("Expected multiples to be 1-D, but got shape ", + multiples.shape().ShortDebugString())); + OP_REQUIRES(context, input.dims() == multiples.NumElements(), + errors::InvalidArgument( + "Expected multiples argument to be a vector of length ", + input.dims(), " but got length ", multiples.dim_size(0))); + + const int input_dims = input.dims(); + const gtl::ArraySlice<int32> multiples_array(multiples.flat<int32>().data(), + input_dims); + + TensorShape output_shape; + std::vector<int32> input_dim_size_vec; + for (int i = 0; i < input_dims; ++i) { + OP_REQUIRES( + context, multiples_array[i] > 0, + errors::InvalidArgument("Expected multiples[", i, "] > 0, but got ", + multiples_array[i])); + OP_REQUIRES(context, input.dim_size(i) % multiples_array[i] == 0, + errors::InvalidArgument("Expected input_dim[", i, + "] to be divisible by multiples[", i, + "], but ", input.dim_size(i), " % ", + multiples_array[i], " != 0")); + output_shape.AddDim(input.dim_size(i) / multiples_array[i]); + input_dim_size_vec.push_back(input.dim_size(i)); + } + Tensor* result = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result)); + +#define HANDLE_DIM(DT, NDIM) \ + if (context->input(0).dtype() == DT && input_dims == NDIM) { \ + HandleCase<DT, NDIM>(context, input_dim_size_vec, multiples_array, \ + result); \ + return; \ + } + +#define HANDLE_TYPE(T) \ + HANDLE_DIM(T, 0) \ + HANDLE_DIM(T, 1) \ + HANDLE_DIM(T, 2) \ + HANDLE_DIM(T, 3) \ + HANDLE_DIM(T, 4) \ + HANDLE_DIM(T, 5) + + HANDLE_TYPE(DT_FLOAT); + HANDLE_TYPE(DT_DOUBLE); + HANDLE_TYPE(DT_INT32); + HANDLE_TYPE(DT_INT16); + HANDLE_TYPE(DT_INT64); + +#undef HANDLE_TYPE +#undef HANDLE_DIM + + OP_REQUIRES(context, false, + errors::Unimplemented( + "TileGradientOp : Unhandled input dimensions, DT : ", + context->input(0).dtype(), ", dims : ", input_dims)); + } + + private: + template <DataType DT, int NDIM> + void HandleCase(OpKernelContext* context, + const std::vector<int32>& input_dims, + const gtl::ArraySlice<int32>& multiples_array, + Tensor* result); + + template <DataType DT, int NDIM> + void HandleCaseImpl(OpKernelContext* context, + const std::vector<int32>& input_dims, + const gtl::ArraySlice<int32>& multiples_array, + Tensor* result) { + typedef typename EnumToDataType<DT>::Type T; + + bool reduction_only = true; + std::vector<int> reduction_dims; + + for (int i = 0; i < NDIM; ++i) { + if (input_dims[i] > multiples_array[i] && multiples_array[i] > 1) { + reduction_only = false; + break; + } else { + if (multiples_array[i] == input_dims[i]) { + reduction_dims.push_back(i); + } + } + } + + if (reduction_only) { +#define HANDLE_DIM(D) \ + if (reduction_dims.size() == (D)) { \ + HandleReduce<T, NDIM, (D)>(context, 
reduction_dims, result); \ + return; \ + } + // NOTE(keveman): Handling the most common case here. + // Adding more cases here would require more templating and code + // explosion. For instance, HANDLE_DIM(2) wouldn't make sense for NDIM=1. + HANDLE_DIM(NDIM > 0 ? 1 : 0); + +// Fall through to the unoptimized version. +#undef HANDLE_DIM + } + + Eigen::DSizes<ptrdiff_t, NDIM> indices; + Eigen::DSizes<ptrdiff_t, NDIM> sizes; + + // Accumulate slices along the dimensions into the output. The number of + // slices along dimension 'i' is simply the multiple along dimension 'i' + // passed to the original Tile op. + for (int i = 0; i < NDIM; ++i) { + sizes[i] = input_dims[i] / multiples_array[i]; + indices[i] = 0; + } + + bool first = true; + while (true) { + functor::TileGrad<Device, T, NDIM>()( + context->eigen_device<Device>(), result->tensor<T, NDIM>(), + context->input(0).tensor<T, NDIM>(), indices, sizes, first); + first = false; + // Increment the begin indices. + int i = 0; + while (i < NDIM && indices[i] / sizes[i] == multiples_array[i] - 1) { + indices[i] = 0; + ++i; + } + // We are finished if we have iterated to the maximum along all + // dimensions. + if (i == NDIM) { + break; + } + indices[i] += sizes[i]; + } + } + + template <typename T, int NDIM, int REDUCENDIM> + void HandleReduce(OpKernelContext* context, + const std::vector<int32>& reduce_dim_in, Tensor* result) { + static_assert(NDIM >= REDUCENDIM, "Too many reduced dimensions"); + Eigen::DSizes<ptrdiff_t, REDUCENDIM> reduce_dim; + Eigen::DSizes<ptrdiff_t, NDIM> reshape_dim; + + for (int i = 0; i < REDUCENDIM; ++i) { + reduce_dim[i] = reduce_dim_in[i]; + } + + for (int i = 0; i < NDIM; ++i) { + reshape_dim[i] = result->dim_size(i); + } + + functor::ReduceAndReshape<Device, T, NDIM, REDUCENDIM>()( + context->eigen_device<Device>(), result->tensor<T, NDIM>(), + context->input(0).tensor<T, NDIM>(), reduce_dim, reshape_dim); + } + + TF_DISALLOW_COPY_AND_ASSIGN(TileGradientOp); +}; + +template <typename Device> +template <DataType DT, int NDIM> +inline void TileGradientOp<Device>::HandleCase( + OpKernelContext* context, const std::vector<int32>& input_dims, + const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { + LOG(FATAL) << "TileGradientOp: Invalid combination of Device, DT and NDIM: " + << typeid(Device).name() << ", " << DataTypeString(DT) << ", " + << NDIM; +} + +#define HANDLE_CASE(device, dtype, ndim) \ + template <> \ + template <> \ + void TileGradientOp<device>::HandleCase<dtype, ndim>( \ + OpKernelContext * context, const std::vector<int32>& input_dims, \ + const gtl::ArraySlice<int32>& multiples_array, Tensor* result) { \ + HandleCaseImpl<dtype, ndim>(context, input_dims, multiples_array, result); \ + } + +#define HANDLE_CASE_DIM_POSITIVE(device, dtype) \ + HANDLE_CASE(device, dtype, 1); \ + HANDLE_CASE(device, dtype, 2); \ + HANDLE_CASE(device, dtype, 3); \ + HANDLE_CASE(device, dtype, 4); \ + HANDLE_CASE(device, dtype, 5); + +#define HANDLE_CASE_DIM(device, dtype) \ + HANDLE_CASE(device, dtype, 0); \ + HANDLE_CASE_DIM_POSITIVE(device, dtype); + +HANDLE_CASE_DIM(CPUDevice, DT_FLOAT); +HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE); +HANDLE_CASE_DIM(CPUDevice, DT_INT16); +HANDLE_CASE_DIM(CPUDevice, DT_INT32); +HANDLE_CASE_DIM(CPUDevice, DT_INT64); + +#if GOOGLE_CUDA +// Eigen on GPU does not handle 0-dimension data types yet. 
+HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_FLOAT); +HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_DOUBLE); +HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT16); +HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT32); +HANDLE_CASE_DIM_POSITIVE(GPUDevice, DT_INT64); +#endif // GOOGLE_CUDA + +#undef HANDLE_CASE_DIM_POSITIVE +#undef HANDLE_CASE_DIM +#undef HANDLE_CASE + +REGISTER_KERNEL_BUILDER(Name("Tile").Device(DEVICE_CPU).HostMemory("multiples"), + TileOp<CPUDevice>); +REGISTER_KERNEL_BUILDER(Name("TileGrad") + .Device(DEVICE_CPU) + .HostMemory("multiples"), + TileGradientOp<CPUDevice>); + +#if GOOGLE_CUDA +#define DEFINE_GPU_TYPE(T) \ + DEFINE_GPU_DIM(T, 1) \ + DEFINE_GPU_DIM(T, 2) \ + DEFINE_GPU_DIM(T, 3) \ + DEFINE_GPU_DIM(T, 4) \ + DEFINE_GPU_DIM(T, 5) + +#define DEFINE_GPU_DIM(T, NDIM) \ + template <> \ + void Tile<GPUDevice, T, NDIM>::operator()( \ + const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out, \ + typename TTypes<T, NDIM>::ConstTensor in, \ + const Eigen::array<int32, NDIM>& broadcast_array) const; \ + extern template struct Tile<GPUDevice, T, NDIM>; \ + template <> \ + void TileGrad<GPUDevice, T, NDIM>::operator()( \ + const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out, \ + typename TTypes<T, NDIM>::ConstTensor in, \ + const Eigen::DSizes<ptrdiff_t, NDIM>& indices, \ + const Eigen::DSizes<ptrdiff_t, NDIM>& sizes, bool first) const; \ + extern template struct TileGrad<GPUDevice, T, NDIM>; \ + template <> \ + void ReduceAndReshape<GPUDevice, T, NDIM, 1>::operator()( \ + const GPUDevice& d, typename TTypes<T, NDIM>::Tensor out, \ + typename TTypes<T, NDIM>::ConstTensor in, \ + const Eigen::DSizes<ptrdiff_t, 1>& reduce_dim, \ + const Eigen::DSizes<ptrdiff_t, NDIM>& reshape_dim) const; \ + extern template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>; + +namespace functor { +DEFINE_GPU_TYPE(float); +DEFINE_GPU_TYPE(double); +DEFINE_GPU_TYPE(int64); +DEFINE_GPU_TYPE(int32); +DEFINE_GPU_TYPE(int16); +} // end namespace functor + +#undef DEFINE_GPU_DIM +#undef DEFINE_GPU_TYPE + +REGISTER_KERNEL_BUILDER(Name("Tile") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T") + .HostMemory("multiples"), + TileOp<GPUDevice>); +REGISTER_KERNEL_BUILDER(Name("Tile") + .Device(DEVICE_GPU) + .TypeConstraint<double>("T") + .HostMemory("multiples"), + TileOp<GPUDevice>); +REGISTER_KERNEL_BUILDER(Name("Tile") + .Device(DEVICE_GPU) + .TypeConstraint<int16>("T") + .HostMemory("multiples"), + TileOp<GPUDevice>); + +REGISTER_KERNEL_BUILDER(Name("TileGrad") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T") + .HostMemory("multiples"), + TileGradientOp<GPUDevice>); +REGISTER_KERNEL_BUILDER(Name("TileGrad") + .Device(DEVICE_GPU) + .TypeConstraint<double>("T") + .HostMemory("multiples"), + TileGradientOp<GPUDevice>); +REGISTER_KERNEL_BUILDER(Name("TileGrad") + .Device(DEVICE_GPU) + .TypeConstraint<int16>("T") + .HostMemory("multiples"), + TileGradientOp<GPUDevice>); +#endif // GOOGLE_CUDA +} // namespace tensorflow diff --git a/tensorflow/core/kernels/tile_ops.h b/tensorflow/core/kernels/tile_ops.h new file mode 100644 index 0000000000..b3cc6165e0 --- /dev/null +++ b/tensorflow/core/kernels/tile_ops.h @@ -0,0 +1,48 @@ +#ifndef TENSORFLOW_KERNELS_TILE_OPS_H_ +#define TENSORFLOW_KERNELS_TILE_OPS_H_ + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T, int NDIM> +struct Tile { + void operator()(const Device& d, typename TTypes<T, 
NDIM>::Tensor out, + typename TTypes<T, NDIM>::ConstTensor in, + const Eigen::array<int32, NDIM>& broadcast_array) const { + out.device(d) = in.broadcast(broadcast_array); + } +}; + +template <typename Device, typename T, int NDIM> +struct TileGrad { + void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out, + typename TTypes<T, NDIM>::ConstTensor in, + const Eigen::DSizes<ptrdiff_t, NDIM>& indices, + const Eigen::DSizes<ptrdiff_t, NDIM>& sizes, + bool first) const { + if (first) { + out.device(d) = in.slice(indices, sizes); + } else { + out.device(d) += in.slice(indices, sizes); + } + } +}; + +template <typename Device, typename T, int NDIM, int REDUCEDNDIM> +struct ReduceAndReshape { + void operator()(const Device& d, typename TTypes<T, NDIM>::Tensor out, + typename TTypes<T, NDIM>::ConstTensor in, + const Eigen::DSizes<ptrdiff_t, REDUCEDNDIM>& reduce_dim, + const Eigen::DSizes<ptrdiff_t, NDIM>& reshape_dim) const { + out.device(d) = in.sum(reduce_dim).reshape(reshape_dim); + } +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_KERNELS_TILE_OPS_H_ diff --git a/tensorflow/core/kernels/tile_ops_gpu.cu.cc b/tensorflow/core/kernels/tile_ops_gpu.cu.cc new file mode 100644 index 0000000000..29481e1a54 --- /dev/null +++ b/tensorflow/core/kernels/tile_ops_gpu.cu.cc @@ -0,0 +1,38 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/tile_ops.h" +#include <stdio.h> + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_TYPE(T) \ + DEFINE_DIM(T, 1) \ + DEFINE_DIM(T, 2) \ + DEFINE_DIM(T, 3) \ + DEFINE_DIM(T, 4) \ + DEFINE_DIM(T, 5) + +#define DEFINE_DIM(T, NDIM) \ + template struct Tile<GPUDevice, T, NDIM>; \ + template struct TileGrad<GPUDevice, T, NDIM>; \ + template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>; + +DEFINE_TYPE(float) +DEFINE_TYPE(double) +DEFINE_TYPE(int64) +DEFINE_TYPE(int32) +DEFINE_TYPE(int16) +// NOTE(keveman): Eigen's int8 and string versions don't compile yet with nvcc. + +#undef DEFINE_DIM +#undef DEFINE_TYPE + +} // end namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc new file mode 100644 index 0000000000..79b5d4d07e --- /dev/null +++ b/tensorflow/core/kernels/topk_op.cc @@ -0,0 +1,71 @@ +// See docs in ../ops/nn_ops.cc. 
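+//
+// TopK returns, for each row of a 2-D input, the k largest values together
+// with their column indices; ties are broken in favor of the smaller column
+// index.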
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/lib/gtl/top_n.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +template <typename T> +class TopK : public OpKernel { + public: + explicit TopK(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("k", &k_)); + } + + void Compute(OpKernelContext* context) override { + const auto& input_in = context->input(0); + OP_REQUIRES(context, input_in.dims() == 2, + errors::InvalidArgument("input must be 2-dimensional")); + OP_REQUIRES(context, input_in.dim_size(1) >= k_, + errors::InvalidArgument("input must have at least k columns")); + + const auto& input = input_in.matrix<T>(); + + const auto num_rows = input_in.dim_size(0); // generally batch_size + const auto num_cols = input_in.dim_size(1); + + Tensor* values_out = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 0, TensorShape({num_rows, k_}), &values_out)); + Tensor* indices_out = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 1, TensorShape({num_rows, k_}), &indices_out)); + auto values = values_out->matrix<T>(); + auto indices = indices_out->matrix<int32>(); + + gtl::TopN<std::pair<T, int32>> filter(k_); + + for (int r = 0; r < num_rows; r++) { + for (int32 c = 0; c < num_cols; ++c) { + // The second element is the negated index, so that lower-index elements + // are considered larger than higher-index elements in case of ties. + filter.push(std::make_pair(input(r, c), -c)); + } + + std::unique_ptr<std::vector<std::pair<T, int32>>> top_k(filter.Extract()); + for (int32 i = 0; i < k_; ++i) { + values(r, i) = (*top_k)[i].first; + indices(r, i) = -(*top_k)[i].second; + } + filter.Reset(); + } + } + + private: + int k_; +}; + +#define REGISTER_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("TopK").Device(DEVICE_CPU).TypeConstraint<type>("T"), TopK<type>) + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc new file mode 100644 index 0000000000..611fa4ac41 --- /dev/null +++ b/tensorflow/core/kernels/training_ops.cc @@ -0,0 +1,884 @@ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/training_ops.h" + +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + +static inline bool DoInline(int64 size) { return size <= (256ll << 10); } + +template <typename T> +struct ApplyGradientDescent<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstFlat grad) { + if (DoInline(var.size())) { + var -= grad * lr(); + } else { + var.device(d) -= grad * lr(); + } + } +}; + +template <typename T> +struct ApplyAdagrad<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat accum, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstFlat grad) { + if (DoInline(var.size())) { + accum += grad.square(); + var -= grad * lr() * accum.rsqrt(); + } else { + accum.device(d) += grad.square(); + var.device(d) -= grad * lr() * accum.rsqrt(); + } + } +}; + +template 
<typename T> +struct ApplyMomentum<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat accum, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstFlat grad, + typename TTypes<T>::ConstScalar momentum) { + if (DoInline(var.size())) { + accum = accum * momentum() + grad; + var -= accum * lr(); + } else { + accum.device(d) = accum * momentum() + grad; + var.device(d) -= accum * lr(); + } + } +}; + +template <typename T> +struct ApplyAdam<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, + typename TTypes<T>::ConstScalar beta1_power, + typename TTypes<T>::ConstScalar beta2_power, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstScalar beta1, + typename TTypes<T>::ConstScalar beta2, + typename TTypes<T>::ConstScalar epsilon, + typename TTypes<T>::ConstFlat grad) { + const T alpha = lr() * std::sqrt(1 - beta2_power()) / (1 - beta1_power()); + if (DoInline(var.size())) { + m += (grad - m) * (1 - beta1()); + v += (grad.square() - v) * (1 - beta2()); + var -= (m * alpha) / (v.sqrt() + epsilon()); + } else { + m.device(d) += (grad - m) * (1 - beta1()); + v.device(d) += (grad.square() - v) * (1 - beta2()); + var.device(d) -= (m * alpha) / (v.sqrt() + epsilon()); + } + } +}; + +template <typename T> +struct ApplyRMSProp<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstScalar rho, + typename TTypes<T>::ConstScalar momentum, + typename TTypes<T>::ConstScalar epsilon, + typename TTypes<T>::ConstFlat grad) { + if (DoInline(var.size())) { + ms += (grad.square() - ms) * (1 - rho()); + mom = mom * momentum() + (grad * lr()) / ((ms + epsilon()).sqrt()); + var -= mom; + } else { + ms.device(d) += (grad.square() - ms) * (1 - rho()); + mom.device(d) = + mom * momentum() + (grad * lr()) / ((ms + epsilon()).sqrt()); + var.device(d) -= mom; + } + } +}; + +} // namespace functor + +template <typename Device, typename T> +class ApplyGradientDescentOp : public OpKernel { + public: + explicit ApplyGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override { + if (use_exclusive_lock_) { + mutex_lock l(*ctx->input_ref_mutex(0)); + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } else { + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; + + void DoValidate(OpKernelContext* ctx) { + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + const Tensor& alpha = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(alpha.shape()), + errors::InvalidArgument("alpha is not a scalar: ", + alpha.shape().DebugString())); + const Tensor& delta = ctx->input(2); + OP_REQUIRES( + ctx, var.shape().IsSameSize(delta.shape()), + errors::InvalidArgument("var and delta do not have the same shape", + var.shape().DebugString(), " ", + delta.shape().DebugString())); + } + + void DoCompute(OpKernelContext* ctx) { + const Device& device = ctx->template eigen_device<Device>(); + Tensor var = 
ctx->mutable_input(0, use_exclusive_lock_); + const Tensor& alpha = ctx->input(1); + const Tensor& delta = ctx->input(2); + functor::ApplyGradientDescent<Device, T>()( + device, var.flat<T>(), alpha.scalar<T>(), delta.flat<T>()); + } +}; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint<T>("T"), \ + ApplyGradientDescentOp<D##Device, T>); + +REGISTER_KERNELS(CPU, float); +REGISTER_KERNELS(CPU, double); + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyGradientDescent<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::Flat var, \ + typename TTypes<T>::ConstScalar alpha, \ + typename TTypes<T>::ConstFlat delta); \ + extern template struct ApplyGradientDescent<GPUDevice, T>; +DECLARE_GPU_SPEC(float); +DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // namespace functor + +REGISTER_KERNELS(GPU, float); +REGISTER_KERNELS(GPU, double); +#endif +#undef REGISTER_KERNELS + +template <typename Device, typename T> +class ApplyAdagradOp : public OpKernel { + public: + explicit ApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override { + if (use_exclusive_lock_) { + mutex_lock l1(*ctx->input_ref_mutex(0)); + // Don't try to acquire a lock on the second ref as they share the same + // mutex. + // + // mutex_lock l2(*ctx->input_ref_mutex(1)); + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } else { + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; + + void DoValidate(OpKernelContext* ctx) { + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + OP_REQUIRES( + ctx, accum.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(1))); + const Tensor& lr = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + const Tensor& grad = ctx->input(3); + OP_REQUIRES( + ctx, var.shape().IsSameSize(accum.shape()), + errors::InvalidArgument("var and accum do not have the same shape", + var.shape().DebugString(), " ", + accum.shape().DebugString())); + OP_REQUIRES( + ctx, var.shape().IsSameSize(grad.shape()), + errors::InvalidArgument("var and delta do not have the same shape", + var.shape().DebugString(), " ", + grad.shape().DebugString())); + } + + void DoCompute(OpKernelContext* ctx) { + const Device& device = ctx->template eigen_device<Device>(); + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + const Tensor& lr = ctx->input(2); + const Tensor& grad = ctx->input(3); + functor::ApplyAdagrad<Device, T>()(device, var.flat<T>(), accum.flat<T>(), + lr.scalar<T>(), grad.flat<T>()); + } +}; + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \ + 
ApplyAdagradOp<D##Device, T>); + +REGISTER_KERNELS(CPU, float); +REGISTER_KERNELS(CPU, double); + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyAdagrad<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::Flat var, \ + typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \ + typename TTypes<T>::ConstFlat grad); \ + extern template struct ApplyAdagrad<GPUDevice, T>; +DECLARE_GPU_SPEC(float); +DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // namespace functor + +REGISTER_KERNELS(GPU, float); +REGISTER_KERNELS(GPU, double); +#endif +#undef REGISTER_KERNELS + +// Note, this op works on cpu only. +template <typename T, typename Tindex> +class SparseApplyAdagradOp : public OpKernel { + public: + explicit SparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { + mutex* mu_var = ctx->input_ref_mutex(0); + // mu_accum is actually the same mutex as mu_var since currently we use a + // global mutex. + // + // mutex* mu_accum = ctx->input_ref_mutex(1); + if (use_exclusive_lock_) { + mu_var->lock(); + } + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + OP_REQUIRES( + ctx, accum.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(1))); + OP_REQUIRES( + ctx, var.shape().IsSameSize(accum.shape()), + errors::InvalidArgument("var and accum do not have the same shape", + var.shape().DebugString(), " ", + accum.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), + errors::InvalidArgument("var must be at least 1 dimensional")); + + const Tensor& lr = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsLegacyScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + for (int d = 1; d < var.dims(); d++) { + OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d))); + } + const Tindex N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + if (N > 0) { + const Tindex first_dim_size = var.dim_size(0); + // Validate all the indices are in range + auto indices_vec = indices.vec<Tindex>(); + for (Tindex i = 0; i < N; i++) { + const Tindex index = indices_vec(i); + OP_REQUIRES(ctx, index >= 0 && index < first_dim_size, + errors::InvalidArgument( + strings::StrCat("Index ", index, " at offset ", i, + " in indices is out of range"))); + } + + auto var_flat = var.flat_outer_dims<T>(); + auto accum_flat = accum.flat_outer_dims<T>(); + auto grad_flat = grad.flat_outer_dims<T>(); + T lr_scalar = lr.scalar<T>()(); + + // Note(yonghui): It might be worth multi-threading square() and rsqrt(). 
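+      // Per selected row this applies the dense Adagrad rule:
+      //   accum_row += grad_row^2
+      //   var_row   -= lr * grad_row / sqrt(accum_row)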
+ for (Tindex i = 0; i < N; i++) { + const Tindex index = indices_vec(i); + auto a = accum_flat.template chip<0>(index); + auto g = grad_flat.template chip<0>(i); + auto v = var_flat.template chip<0>(index); + a += g.square(); + v -= g.constant(lr_scalar) * g * a.rsqrt(); + } + } + if (use_exclusive_lock_) { + mu_var->unlock(); + } + + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .TypeConstraint<Tindices>("Tindices"), \ + SparseApplyAdagradOp<T, Tindices>); + +REGISTER_KERNELS(float, int32); +REGISTER_KERNELS(float, int64); +REGISTER_KERNELS(double, int32); +REGISTER_KERNELS(double, int64); +#undef REGISTER_KERNELS + +template <typename Device, typename T> +class ApplyMomentumOp : public OpKernel { + public: + explicit ApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override { + if (use_exclusive_lock_) { + mutex_lock l1(*ctx->input_ref_mutex(0)); + // Don't try to acquire a lock on the second ref as they share the same + // mutex. + // + // mutex_lock l2(*ctx->input_ref_mutex(1)); + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } else { + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; + + void DoValidate(OpKernelContext* ctx) { + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + OP_REQUIRES( + ctx, accum.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(1))); + const Tensor& lr = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + const Tensor& grad = ctx->input(3); + OP_REQUIRES( + ctx, var.shape().IsSameSize(accum.shape()), + errors::InvalidArgument("var and accum do not have the same shape", + var.shape().DebugString(), " ", + accum.shape().DebugString())); + OP_REQUIRES( + ctx, var.shape().IsSameSize(grad.shape()), + errors::InvalidArgument("var and delta do not have the same shape", + var.shape().DebugString(), " ", + grad.shape().DebugString())); + + const Tensor& momentum = ctx->input(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()), + errors::InvalidArgument("momentum is not a scalar: ", + momentum.shape().DebugString())); + } + + void DoCompute(OpKernelContext* ctx) { + const Device& device = ctx->template eigen_device<Device>(); + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + const Tensor& lr = ctx->input(2); + const Tensor& grad = ctx->input(3); + const Tensor& momentum = ctx->input(4); + functor::ApplyMomentum<Device, T>()(device, var.flat<T>(), accum.flat<T>(), + lr.scalar<T>(), grad.flat<T>(), + momentum.scalar<T>()); + } +}; + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint<T>("T"), \ + 
ApplyMomentumOp<D##Device, T>); + +REGISTER_KERNELS(CPU, float); +REGISTER_KERNELS(CPU, double); + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyMomentum<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::Flat var, \ + typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \ + typename TTypes<T>::ConstFlat grad, \ + typename TTypes<T>::ConstScalar momentum); \ + extern template struct ApplyMomentum<GPUDevice, T>; +DECLARE_GPU_SPEC(float); +DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // namespace functor + +REGISTER_KERNELS(GPU, float); +REGISTER_KERNELS(GPU, double); +#endif +#undef REGISTER_KERNELS + +// Note, this op works on cpu only. +template <typename T, typename Tindex> +class SparseApplyMomentumOp : public OpKernel { + public: + explicit SparseApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { + mutex* mu_var = ctx->input_ref_mutex(0); + // mu_accum is actually the same mutex as mu_var since currently we use a + // global mutex. + // + // mutex* mu_accum = ctx->input_ref_mutex(1); + if (use_exclusive_lock_) { + mu_var->lock(); + } + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor accum = ctx->mutable_input(1, use_exclusive_lock_); + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + OP_REQUIRES( + ctx, accum.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(1))); + OP_REQUIRES( + ctx, var.shape().IsSameSize(accum.shape()), + errors::InvalidArgument("var and accum do not have the same shape", + var.shape().DebugString(), " ", + accum.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()), + errors::InvalidArgument("var must be at least 1 dimensional")); + + const Tensor& lr = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + for (int d = 1; d < var.dims(); d++) { + OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d))); + } + const Tindex N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + const Tensor& momentum = ctx->input(5); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()), + errors::InvalidArgument("momentum is not a scalar: ", + momentum.shape().DebugString())); + + if (N > 0) { + const Tindex first_dim_size = var.dim_size(0); + // Validate all the indices are in range + auto indices_vec = indices.vec<Tindex>(); + for (Tindex i = 0; i < N; i++) { + const Tindex index = indices_vec(i); + OP_REQUIRES(ctx, index >= 0 && index < first_dim_size, + errors::InvalidArgument( + strings::StrCat("Index ", index, " at offset ", i, + " in indices is out of range"))); + } + + auto var_flat = var.flat_outer_dims<T>(); + auto accum_flat = 
accum.flat_outer_dims<T>(); + auto grad_flat = grad.flat_outer_dims<T>(); + T lr_scalar = lr.scalar<T>()(); + T momentum_scalar = momentum.scalar<T>()(); + + for (Tindex i = 0; i < N; i++) { + const Tindex index = indices_vec(i); + auto a = accum_flat.template chip<0>(index); + auto g = grad_flat.template chip<0>(i); + auto v = var_flat.template chip<0>(index); + a = a * a.constant(momentum_scalar) + g; + v -= a.constant(lr_scalar) * a; + } + } + if (use_exclusive_lock_) { + mu_var->unlock(); + } + + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("SparseApplyMomentum") \ + .Device(DEVICE_CPU) \ + .TypeConstraint<T>("T") \ + .TypeConstraint<Tindices>("Tindices"), \ + SparseApplyMomentumOp<T, Tindices>); + +REGISTER_KERNELS(float, int32); +REGISTER_KERNELS(float, int64); +REGISTER_KERNELS(double, int32); +REGISTER_KERNELS(double, int64); +#undef REGISTER_KERNELS + +template <typename Device, typename T> +class ApplyAdamOp : public OpKernel { + public: + explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override { + if (use_exclusive_lock_) { + // all input refs share the same mutex + mutex_lock l1(*ctx->input_ref_mutex(0)); + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } else { + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; + + void DoValidate(OpKernelContext* ctx) { + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor m = ctx->mutable_input(1, use_exclusive_lock_); + Tensor v = ctx->mutable_input(2, use_exclusive_lock_); + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + OP_REQUIRES( + ctx, m.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(1))); + OP_REQUIRES( + ctx, v.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(2))); + + const Tensor& beta1_power = ctx->input(3); + const Tensor& beta2_power = ctx->input(4); + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()), + errors::InvalidArgument("beta1_power is not a scalar: ", + beta1_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()), + errors::InvalidArgument("beta2_power is not a scalar: ", + beta2_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + + const Tensor& grad = ctx->input(9); + OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()), + 
errors::InvalidArgument("var and m do not have the same shape", + var.shape().DebugString(), " ", + m.shape().DebugString())); + OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()), + errors::InvalidArgument("var and v do not have the same shape", + var.shape().DebugString(), " ", + v.shape().DebugString())); + OP_REQUIRES( + ctx, var.shape().IsSameSize(grad.shape()), + errors::InvalidArgument("var and grad do not have the same shape", + var.shape().DebugString(), " ", + grad.shape().DebugString())); + } + + void DoCompute(OpKernelContext* ctx) { + const Device& device = ctx->template eigen_device<Device>(); + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor m = ctx->mutable_input(1, use_exclusive_lock_); + Tensor v = ctx->mutable_input(2, use_exclusive_lock_); + const Tensor& beta1_power = ctx->input(3); + const Tensor& beta2_power = ctx->input(4); + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + const Tensor& grad = ctx->input(9); + + functor::ApplyAdam<Device, T>()(device, var.flat<T>(), m.flat<T>(), + v.flat<T>(), beta1_power.scalar<T>(), + beta2_power.scalar<T>(), lr.scalar<T>(), + beta1.scalar<T>(), beta2.scalar<T>(), + epsilon.scalar<T>(), grad.flat<T>()); + } +}; + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint<T>("T"), \ + ApplyAdamOp<D##Device, T>); + +REGISTER_KERNELS(CPU, float); +REGISTER_KERNELS(CPU, double); + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyAdam<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::Flat var, \ + typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \ + typename TTypes<T>::ConstScalar beta1_power, \ + typename TTypes<T>::ConstScalar beta2_power, \ + typename TTypes<T>::ConstScalar lr, \ + typename TTypes<T>::ConstScalar beta1, \ + typename TTypes<T>::ConstScalar beta2, \ + typename TTypes<T>::ConstScalar epsilon, \ + typename TTypes<T>::ConstFlat grad); \ + extern template struct ApplyAdam<GPUDevice, T>; +DECLARE_GPU_SPEC(float); +DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // namespace functor + +REGISTER_KERNELS(GPU, float); +REGISTER_KERNELS(GPU, double); +#endif +#undef REGISTER_KERNELS + +template <typename Device, typename T> +class ApplyRMSPropOp : public OpKernel { + public: + explicit ApplyRMSPropOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override { + if (use_exclusive_lock_) { + // all input refs share the same mutex + mutex_lock l1(*ctx->input_ref_mutex(0)); + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } else { + DoValidate(ctx); + if (!ctx->status().ok()) return; + DoCompute(ctx); + } + ctx->forward_ref_input_to_ref_output(0, 0); + } + + private: + bool use_exclusive_lock_; + + void DoValidate(OpKernelContext* ctx) { + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor ms = ctx->mutable_input(1, use_exclusive_lock_); + Tensor mom = ctx->mutable_input(2, use_exclusive_lock_); + + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(0))); + OP_REQUIRES( + ctx, ms.IsInitialized(), 
+ errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(1))); + OP_REQUIRES( + ctx, mom.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", def().input(2))); + + const Tensor& lr = ctx->input(3); + const Tensor& rho = ctx->input(4); + const Tensor& momentum = ctx->input(5); + const Tensor& epsilon = ctx->input(6); + const Tensor& grad = ctx->input(7); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()), + errors::InvalidArgument("rho is not a scalar: ", + rho.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()), + errors::InvalidArgument("momentum is not a scalar: ", + momentum.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + + OP_REQUIRES(ctx, var.shape().IsSameSize(ms.shape()), + errors::InvalidArgument("var and ms do not have the same shape", + var.shape().DebugString(), " ", + ms.shape().DebugString())); + + OP_REQUIRES(ctx, var.shape().IsSameSize(mom.shape()), + errors::InvalidArgument( + "var and mom do not have the same shape", + var.shape().DebugString(), " ", mom.shape().DebugString())); + + OP_REQUIRES( + ctx, var.shape().IsSameSize(grad.shape()), + errors::InvalidArgument("var and grad do not have the same shape", + var.shape().DebugString(), " ", + grad.shape().DebugString())); + } + + void DoCompute(OpKernelContext* ctx) { + const Device& device = ctx->template eigen_device<Device>(); + Tensor var = ctx->mutable_input(0, use_exclusive_lock_); + Tensor ms = ctx->mutable_input(1, use_exclusive_lock_); + Tensor mom = ctx->mutable_input(2, use_exclusive_lock_); + const Tensor& lr = ctx->input(3); + const Tensor& rho = ctx->input(4); + const Tensor& momentum = ctx->input(5); + const Tensor& epsilon = ctx->input(6); + const Tensor& grad = ctx->input(7); + + functor::ApplyRMSProp<Device, T>()(device, var.flat<T>(), ms.flat<T>(), + mom.flat<T>(), lr.scalar<T>(), + rho.scalar<T>(), momentum.scalar<T>(), + epsilon.scalar<T>(), grad.flat<T>()); + } +}; + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyRMSProp").Device(DEVICE_##D).TypeConstraint<T>("T"), \ + ApplyRMSPropOp<D##Device, T>); + +REGISTER_KERNELS(CPU, float); +REGISTER_KERNELS(CPU, double); + +#if GOOGLE_CUDA +// Forward declarations of the functor specializations for GPU. 
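+// Note: the 'extern template' declarations below keep this translation unit
+// from instantiating the GPU functors; their definitions are compiled by nvcc
+// and explicitly instantiated in training_ops_gpu.cu.cc.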
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyRMSProp<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T>::Flat var, \ + typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom, \ + typename TTypes<T>::ConstScalar lr, typename TTypes<T>::ConstScalar rho, \ + typename TTypes<T>::ConstScalar momentum, \ + typename TTypes<T>::ConstScalar epsilon, \ + typename TTypes<T>::ConstFlat grad); \ + extern template struct ApplyRMSProp<GPUDevice, T>; +DECLARE_GPU_SPEC(float); +DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // namespace functor + +REGISTER_KERNELS(GPU, float); +REGISTER_KERNELS(GPU, double); +#endif +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h new file mode 100644 index 0000000000..71f6d0253d --- /dev/null +++ b/tensorflow/core/kernels/training_ops.h @@ -0,0 +1,65 @@ +#ifndef TENSORFLOW_KERNELS_TRAINING_OPS_H_ +#define TENSORFLOW_KERNELS_TRAINING_OPS_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Each training algorithm has a ApplyXYZ functor struct declared in +// this header file. They are specialized for different devices +// (CPUDevice in training_ops.cc or GPUDevice in training_ops_gpu.cc). + +template <typename Device, typename T> +struct ApplyGradientDescent { + void operator()(const Device& d, typename TTypes<T>::Flat var, + typename TTypes<T>::ConstScalar alpha, + typename TTypes<T>::ConstFlat delta); +}; + +template <typename Device, typename T> +struct ApplyAdagrad { + void operator()(const Device& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat accum, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstFlat grad); +}; + +template <typename Device, typename T> +struct ApplyMomentum { + void operator()(const Device& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat accum, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstFlat grad, + typename TTypes<T>::ConstScalar momentum); +}; + +template <typename Device, typename T> +struct ApplyAdam { + void operator()(const Device& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, + typename TTypes<T>::ConstScalar beta1_power, + typename TTypes<T>::ConstScalar beta2_power, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstScalar beta1, + typename TTypes<T>::ConstScalar beta2, + typename TTypes<T>::ConstScalar epsilon, + typename TTypes<T>::ConstFlat grad); +}; + +template <typename Device, typename T> +struct ApplyRMSProp { + void operator()(const Device& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstScalar rho, + typename TTypes<T>::ConstScalar momentum, + typename TTypes<T>::ConstScalar epsilon, + typename TTypes<T>::ConstFlat grad); +}; + +} // end namespace functor +} // end namespace tensorflow + +#endif // TENSORFLOW_KERNELS_TRAINING_OPS_H_ diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc new file mode 100644 index 0000000000..3106f29648 --- /dev/null +++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc @@ -0,0 +1,127 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_ops.h" + 
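+// Note: each functor below expresses the same update rule as its CPU
+// counterpart as a single Eigen expression; scalar inputs (lr, momentum, the
+// betas, rho, epsilon) are reshaped to a length-1 tensor and broadcast to the
+// flattened variable's size so they can be used elementwise on the device.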
+namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { +template <typename T> +struct ApplyGradientDescent<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::ConstScalar alpha, + typename TTypes<T>::ConstFlat delta) { + Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast; + bcast[0] = delta.dimension(0); + Eigen::Sizes<1> single; + var.device(d) -= alpha.reshape(single).broadcast(bcast) * delta; + } +}; + +template <typename T> +struct ApplyAdagrad<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat accum, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstFlat grad) { + accum.device(d) += grad.square(); + Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast; + bcast[0] = grad.dimension(0); + Eigen::Sizes<1> single; + var.device(d) -= lr.reshape(single).broadcast(bcast) * grad * accum.rsqrt(); + } +}; + +template <typename T> +struct ApplyMomentum<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat accum, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstFlat grad, + typename TTypes<T>::ConstScalar momentum) { + Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast; + bcast[0] = grad.dimension(0); + Eigen::Sizes<1> single; + accum.device(d) = accum * momentum.reshape(single).broadcast(bcast) + grad; + var.device(d) -= lr.reshape(single).broadcast(bcast) * accum; + } +}; + +template <typename T> +struct ApplyAdam<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, + typename TTypes<T>::ConstScalar beta1_power, + typename TTypes<T>::ConstScalar beta2_power, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstScalar beta1, + typename TTypes<T>::ConstScalar beta2, + typename TTypes<T>::ConstScalar epsilon, + typename TTypes<T>::ConstFlat grad) { + Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast; + bcast[0] = grad.dimension(0); + Eigen::Sizes<1> single; + const auto one = static_cast<T>(1.0); + m.device(d) = + m + + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) * + (grad - m); + v.device(d) = + v + + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) * + (grad.square() - v); + var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() / + (beta1_power.constant(one) - beta1_power)) + .reshape(single) + .broadcast(bcast) * + m / (epsilon.reshape(single).broadcast(bcast) + v.sqrt()); + } +}; + +template <typename T> +struct ApplyRMSProp<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::Flat var, + typename TTypes<T>::Flat ms, typename TTypes<T>::Flat mom, + typename TTypes<T>::ConstScalar lr, + typename TTypes<T>::ConstScalar rho, + typename TTypes<T>::ConstScalar momentum, + typename TTypes<T>::ConstScalar epsilon, + typename TTypes<T>::ConstFlat grad) { + Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast; + bcast[0] = grad.dimension(0); + Eigen::Sizes<1> single; + const auto one = static_cast<T>(1.0); + ms.device(d) = ms + + (rho.constant(one) - rho).reshape(single).broadcast(bcast) * + (grad.square() - ms); + mom.device(d) = + mom * momentum.reshape(single).broadcast(bcast) + + lr.reshape(single).broadcast(bcast) * grad / + ((epsilon.reshape(single).broadcast(bcast) + ms).sqrt()); + var.device(d) -= mom; + } +}; + +} // namespace functor + +template struct 
functor::ApplyGradientDescent<GPUDevice, float>; +template struct functor::ApplyGradientDescent<GPUDevice, double>; + +template struct functor::ApplyAdagrad<GPUDevice, float>; +template struct functor::ApplyAdagrad<GPUDevice, double>; + +template struct functor::ApplyMomentum<GPUDevice, float>; +template struct functor::ApplyMomentum<GPUDevice, double>; + +template struct functor::ApplyAdam<GPUDevice, float>; +template struct functor::ApplyAdam<GPUDevice, double>; + +template struct functor::ApplyRMSProp<GPUDevice, float>; +template struct functor::ApplyRMSProp<GPUDevice, double>; +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc new file mode 100644 index 0000000000..3c629badb6 --- /dev/null +++ b/tensorflow/core/kernels/training_ops_test.cc @@ -0,0 +1,226 @@ +#include <gtest/gtest.h> +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/tensor.h" + +namespace tensorflow { + +// We focus on the single thread performance of training ops. +static SessionOptions InitSingleThreadedOptions() { + SessionOptions opts; + opts.config.set_intra_op_parallelism_threads(1); + opts.config.set_inter_op_parallelism_threads(1); + return opts; +} + +static SessionOptions* GetOptions() { + static SessionOptions opts = InitSingleThreadedOptions(); + return &opts; +} + +static Node* Var(Graph* g, int n) { + return test::graph::Var(g, DT_FLOAT, TensorShape({n})); +} + +static Node* Zeros(Graph* g, int n) { + Tensor data(DT_FLOAT, TensorShape({n})); + data.flat<float>().setZero(); + return test::graph::Constant(g, data); +} + +static Node* Random(Graph* g, int n) { + Tensor data(DT_FLOAT, TensorShape({n})); + data.flat<float>().setRandom(); + return test::graph::Constant(g, data); +} + +static Node* Scalar(Graph* g, float val) { + Tensor data(DT_FLOAT, TensorShape({})); + data.flat<float>()(0) = val; + return test::graph::Constant(g, data); +} + +static void SGD(int32 n, Graph** init_g, Graph** train_g) { + RequireDefaultOps(); + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + test::graph::Assign(g, var, Zeros(g, n)); + *init_g = g; + } + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + auto lr = Scalar(g, 0.01); + auto grad = Random(g, n); + test::graph::Multi(g, "ApplyGradientDescent", {var, lr, grad}); + *train_g = g; + } +} + +static void BM_SGD(int iters, int params) { + const int64 tot = static_cast<int64>(iters) * params; + testing::ItemsProcessed(tot); + testing::BytesProcessed(tot * sizeof(float)); + Graph* init; + Graph* train; + SGD(params, &init, &train); + test::Benchmark("cpu", train, GetOptions(), init).Run(iters); +} +BENCHMARK(BM_SGD)->Arg(128 << 10)->Arg(256 << 10); + +static void Adagrad(int32 n, Graph** init_g, Graph** train_g) { + RequireDefaultOps(); + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + auto accum = Var(g, n); + auto zero = Zeros(g, n); + test::graph::Assign(g, var, zero); + test::graph::Assign(g, accum, zero); + *init_g = g; + } + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + auto accum = Var(g, n); + auto lr = Scalar(g, 0.01); + auto grad = Random(g, n); + test::graph::Multi(g, "ApplyAdagrad", {var, accum, lr, grad}); + *train_g = g; + } +} + +static void 
BM_Adagrad(int iters, int params) { + const int64 tot = static_cast<int64>(iters) * params; + testing::ItemsProcessed(tot); + testing::BytesProcessed(tot * sizeof(float)); + Graph* init; + Graph* train; + Adagrad(params, &init, &train); + test::Benchmark("cpu", train, GetOptions(), init).Run(iters); +} +BENCHMARK(BM_Adagrad)->Arg(128 << 10)->Arg(256 << 10); + +static void Momentum(int32 n, Graph** init_g, Graph** train_g) { + RequireDefaultOps(); + TensorShape shape({n}); + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + auto accum = Var(g, n); + auto zero = Zeros(g, n); + test::graph::Assign(g, var, zero); + test::graph::Assign(g, accum, zero); + *init_g = g; + } + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + auto accum = Var(g, n); + auto lr = Scalar(g, 0.01); + auto grad = Random(g, n); + auto mom = Scalar(g, 0.01); + test::graph::Multi(g, "ApplyMomentum", {var, accum, lr, grad, mom}); + *train_g = g; + } +} + +static void BM_Momentum(int iters, int params) { + const int64 tot = static_cast<int64>(iters) * params; + testing::ItemsProcessed(tot); + testing::BytesProcessed(tot * sizeof(float)); + Graph* init; + Graph* train; + Momentum(params, &init, &train); + test::Benchmark("cpu", train, GetOptions(), init).Run(iters); +} +BENCHMARK(BM_Momentum)->Arg(128 << 10)->Arg(256 << 10); + +static void Adam(int32 n, Graph** init_g, Graph** train_g) { + RequireDefaultOps(); + TensorShape shape({n}); + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + auto m = Var(g, n); + auto v = Var(g, n); + auto zero = Zeros(g, n); + test::graph::Assign(g, var, zero); + test::graph::Assign(g, m, zero); + test::graph::Assign(g, v, zero); + *init_g = g; + } + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + auto m = Var(g, n); + auto v = Var(g, n); + auto beta1_power = Scalar(g, 0.9); + auto beta2_power = Scalar(g, 0.99); + auto lr = Scalar(g, 0.01); + auto beta1 = Scalar(g, 0.9); + auto beta2 = Scalar(g, 0.99); + auto epsilon = Scalar(g, 1e-8); + auto grad = Random(g, n); + test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr, + beta1, beta2, epsilon, grad}); + *train_g = g; + } +} + +static void BM_Adam(int iters, int params) { + const int64 tot = static_cast<int64>(iters) * params; + testing::ItemsProcessed(tot); + testing::BytesProcessed(tot * sizeof(float)); + Graph* init; + Graph* train; + Adam(params, &init, &train); + test::Benchmark("cpu", train, GetOptions(), init).Run(iters); +} +BENCHMARK(BM_Adam)->Arg(128 << 10)->Arg(256 << 10); + +static void RMSProp(int32 n, Graph** init_g, Graph** train_g) { + RequireDefaultOps(); + TensorShape shape({n}); + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + auto ms = Var(g, n); + auto mom = Var(g, n); + auto zero = Zeros(g, n); + test::graph::Assign(g, var, zero); + test::graph::Assign(g, ms, zero); + test::graph::Assign(g, mom, zero); + *init_g = g; + } + { + Graph* g = new Graph(OpRegistry::Global()); + auto var = Var(g, n); + auto ms = Var(g, n); + auto mom = Var(g, n); + auto lr = Scalar(g, 0.01); + auto rho = Scalar(g, 0.9); + auto momentum = Scalar(g, 0.9); + auto epsilon = Scalar(g, 1e-8); + auto grad = Random(g, n); + test::graph::Multi(g, "ApplyRMSProp", + {var, ms, mom, lr, rho, momentum, epsilon, grad}); + *train_g = g; + } +} + +static void BM_RMSProp(int iters, int params) { + const int64 tot = static_cast<int64>(iters) * params; + testing::ItemsProcessed(tot); + testing::BytesProcessed(tot * 
sizeof(float)); + Graph* init; + Graph* train; + RMSProp(params, &init, &train); + test::Benchmark("cpu", train, GetOptions(), init).Run(iters); +} +BENCHMARK(BM_RMSProp)->Arg(128 << 10)->Arg(256 << 10); + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc new file mode 100644 index 0000000000..4f11a881f8 --- /dev/null +++ b/tensorflow/core/kernels/transpose_op.cc @@ -0,0 +1,190 @@ +// See docs in ../ops/array_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/transpose_op.h" +#include "tensorflow/core/kernels/transpose_op_functor.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// inv = InvertPermutationOp(T<int32> p) takes a permutation of +// integers 0, 1, ..., n - 1 and returns the inverted +// permutation of p. I.e., inv[p[i]] == i, for i in [0 .. n). +// +// REQUIRES: input is a vector of int32. +// REQUIRES: input is a permutation of 0, 1, ..., n-1. + +class InvertPermutationOp : public OpKernel { + public: + explicit InvertPermutationOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input.shape()), + errors::InvalidArgument("invert_permutation expects a 1D vector.")); + auto Tin = input.vec<int32>(); + const int N = Tin.size(); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input.shape(), &output)); + auto Tout = output->vec<int32>(); + std::fill_n(Tout.data(), N, -1); + for (int i = 0; i < N; ++i) { + const int32 d = Tin(i); + OP_REQUIRES(context, 0 <= d && d < N, + errors::InvalidArgument(d, " is not between 0 and ", N)); + OP_REQUIRES(context, Tout(d) == -1, + errors::InvalidArgument(d, " is duplicated in the input.")); + Tout(d) = i; + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("InvertPermutation").Device(DEVICE_CPU), + InvertPermutationOp); + +// output = TransposeOp(T<any> input, T<int32> perm) takes a tensor +// of type T and rank N, and a permutation of 0, 1, ..., N-1. It +// shuffles the dimensions of the input tensor according to permutation. +// +// Specifically, the returned tensor output meets the following condition: +// 1) output.dims() == input.dims(); +// 2) output.dim_size(i) == input.dim_size(perm[i]); +// 3) output.tensor<T, N>(i_0, i_1, ..., i_N-1) == +// input.tensor<T, N>(j_0, j_1, ..., j_N-1), +// where i_s == j_{perm[s]} +// +// REQUIRES: perm is a vector of int32. +// REQUIRES: input.dims() == perm.size(). +// REQUIRES: perm is a permutation. + +template <typename Device, typename T> +TransposeOp<Device, T>::TransposeOp(OpKernelConstruction* context) + : OpKernel(context) {} + +template <typename Device, typename T> +void TransposeOp<Device, T>::Compute(OpKernelContext* context) { + const Tensor& input = context->input(0); + const Tensor& perm = context->input(1); + // Preliminary validation of sizes. 
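+ // Note: ranks outside [1, 8] are rejected as Unimplemented; the checks below
+ // then verify that perm has length input.dims() and is a valid permutation,
+ // building the output shape (output.dim_size(i) == input.dim_size(perm[i]))
+ // along the way.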
+ OP_REQUIRES(context, TensorShapeUtils::IsVector(perm.shape()), + errors::InvalidArgument("perm must be a vector, not ", + perm.shape().DebugString())); + auto Vperm = perm.vec<int32>(); + const int dims = input.dims(); + static const int kMinDims = 1; + static const int kMaxDims = 8; + OP_REQUIRES(context, kMinDims <= dims && dims <= kMaxDims, + errors::Unimplemented("Transposing a tensor of rank ", dims, + " is not implemented.")); + OP_REQUIRES(context, dims == Vperm.size(), + errors::InvalidArgument( + "transpose expects a vector of size ", input.dims(), + ". But input(1) is a vector of size ", Vperm.size())); + gtl::ArraySlice<int32> permutation( + reinterpret_cast<const int32*>(Vperm.data()), dims); + TensorShape shape; + + // Check whether permutation is a permutation of integers in [0 .. dims). + gtl::InlinedVector<bool, 8> bits(dims); + for (const int32 d : permutation) { + OP_REQUIRES( + context, 0 <= d && d < dims, + errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")")); + bits[d] = true; + shape.AddDim(input.dim_size(d)); + } + for (int i = 0; i < dims; ++i) { + OP_REQUIRES(context, bits[i], errors::InvalidArgument( + i, " is missing from {", + str_util::Join(permutation, ","), "}.")); + } + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output)); + switch (dims) { +#define EXPAND_DIM(N) \ + case N: { \ + functor::TransposeFunctor<Device, T, N> func; \ + func(context->eigen_device<Device>(), output->tensor<T, N>(), \ + input.tensor<T, N>(), permutation.data()); \ + break; \ + } + EXPAND_DIM(1); + EXPAND_DIM(2); + EXPAND_DIM(3); + EXPAND_DIM(4); + EXPAND_DIM(5); + EXPAND_DIM(6); + EXPAND_DIM(7); + EXPAND_DIM(8); + default: + LOG(FATAL) << "Unexpected dims: " << dims; + } +#undef EXPAND_DIM +} + +namespace functor { + +template <typename Device, typename T, int NDIMS> +void TransposeMaybeInline(const Device& d, + typename TTypes<T, NDIMS>::Tensor out, + typename TTypes<T, NDIMS>::ConstTensor in, + const int* perm) { + // perm[] is a permutation of 0, 1, ..., NDIMS-1. perm[] is on CPU.
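+ // Note: when the output is small (out.size() * sizeof(T) < 131072, i.e.
+ // under 128KiB) the shuffle is evaluated inline on the calling thread
+ // instead of being dispatched to the thread-pool device.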
+ Eigen::array<int, NDIMS> p; + for (int i = 0; i < NDIMS; ++i) p[i] = perm[i]; + if (out.size() * sizeof(T) < 131072) { // Small transpose on a CPU: do inline + out = in.shuffle(p); + } else { + out.device(d) = in.shuffle(p); + } +} + +template <typename T, int NDIMS> +struct TransposeFunctor<CPUDevice, T, NDIMS> { + void operator()(const CPUDevice& d, typename TTypes<T, NDIMS>::Tensor out, + typename TTypes<T, NDIMS>::ConstTensor in, const int* perm) { + TransposeMaybeInline<CPUDevice, T, NDIMS>(d, out, in, perm); + } +}; + +} // namespace functor + +#define REGISTER(D, T) \ + template class TransposeOp<D##Device, T>; \ + REGISTER_KERNEL_BUILDER(Name("Transpose") \ + .Device(DEVICE_##D) \ + .TypeConstraint<T>("T") \ + .HostMemory("perm"), \ + TransposeOp<D##Device, T>) +REGISTER(CPU, float); +REGISTER(CPU, double); +REGISTER(CPU, complex64); +REGISTER(CPU, uint8); +REGISTER(CPU, int8); +REGISTER(CPU, int16); +REGISTER(CPU, int32); +REGISTER(CPU, int64); +REGISTER(CPU, string); +#if GOOGLE_CUDA +REGISTER(GPU, uint8); +REGISTER(GPU, int8); +REGISTER(GPU, int16); +REGISTER(GPU, int32); +REGISTER(GPU, int64); +REGISTER(GPU, float); +REGISTER(GPU, double); +#endif +#undef REGISTER +} // namespace tensorflow diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h new file mode 100644 index 0000000000..f7a5be5c2b --- /dev/null +++ b/tensorflow/core/kernels/transpose_op.h @@ -0,0 +1,19 @@ +#ifndef TENSORFLOW_KERNELS_TRANSPOSE_OP_H_ +#define TENSORFLOW_KERNELS_TRANSPOSE_OP_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +template <typename Device, typename T> +class TransposeOp : public OpKernel { + public: + explicit TransposeOp(OpKernelConstruction* context); + void Compute(OpKernelContext* context) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_TRANSPOSE_OP_H_ diff --git a/tensorflow/core/kernels/transpose_op_functor.h b/tensorflow/core/kernels/transpose_op_functor.h new file mode 100644 index 0000000000..8cbd1cbb29 --- /dev/null +++ b/tensorflow/core/kernels/transpose_op_functor.h @@ -0,0 +1,28 @@ +#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_FUNCTOR_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_FUNCTOR_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace functor { + +template <typename Device, typename T, int NDIMS> +void Transpose(const Device& d, typename TTypes<T, NDIMS>::Tensor out, + typename TTypes<T, NDIMS>::ConstTensor in, const int* perm) { + // perm[] is a permutation of 0, 1, ..., NDIMS-1. perm[] is on CPU. 
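+ // Note: Eigen's shuffle(p) yields out.dimension(i) == in.dimension(p[i]) and
+ // out(i_0, ..., i_{N-1}) == in(j_0, ..., j_{N-1}) with i_s == j_{p[s]}, which
+ // matches the Transpose contract documented in transpose_op.cc.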
+ Eigen::array<int, NDIMS> p; + for (int i = 0; i < NDIMS; ++i) p[i] = perm[i]; + out.device(d) = in.shuffle(p); +} + +template <typename Device, typename T, int NDIMS> +struct TransposeFunctor { + void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor out, + typename TTypes<T, NDIMS>::ConstTensor in, const int* perm); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TRANSPOSE_OP_FUNCTOR_H_ diff --git a/tensorflow/core/kernels/transpose_op_gpu.cu.cc b/tensorflow/core/kernels/transpose_op_gpu.cu.cc new file mode 100644 index 0000000000..8c04a6544e --- /dev/null +++ b/tensorflow/core/kernels/transpose_op_gpu.cu.cc @@ -0,0 +1,43 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/kernels/transpose_op_functor.h" + +namespace tensorflow { +namespace functor { + +template <typename T, int NDIMS> +struct TransposeFunctor<Eigen::GpuDevice, T, NDIMS> { + void operator()(const Eigen::GpuDevice& d, + typename TTypes<T, NDIMS>::Tensor out, + typename TTypes<T, NDIMS>::ConstTensor in, const int* perm) { + Transpose<Eigen::GpuDevice, T, NDIMS>(d, out, in, perm); + } +}; + +#define DEFINE(T, N) template struct TransposeFunctor<Eigen::GpuDevice, T, N>; +#define DEFINE_DIM(T) \ + DEFINE(T, 1); \ + DEFINE(T, 2); \ + DEFINE(T, 3); \ + DEFINE(T, 4); \ + DEFINE(T, 5); \ + DEFINE(T, 6); \ + DEFINE(T, 7); \ + DEFINE(T, 8); +DEFINE_DIM(uint8); +DEFINE_DIM(int8); +DEFINE_DIM(int16); +DEFINE_DIM(int32); +DEFINE_DIM(int64); +DEFINE_DIM(float); +DEFINE_DIM(double); +#undef DEFINE_DIM +#undef DEFINE + +} // end namespace functor +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc new file mode 100644 index 0000000000..61f4a54583 --- /dev/null +++ b/tensorflow/core/kernels/unique_op.cc @@ -0,0 +1,61 @@ +#include <unordered_map> +#include <utility> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/public/status.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template <typename T> +class UniqueOp : public OpKernel { + public: + explicit UniqueOp(OpKernelConstruction* context) : OpKernel(context) { + const DataType dt = DataTypeToEnum<T>::v(); + OP_REQUIRES_OK(context, context->MatchSignature({dt}, {dt, DT_INT32})); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), + errors::InvalidArgument("unique expects a 1D vector.")); + auto Tin = input.vec<T>(); + const int N = Tin.size(); + + Tensor* idx = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(1, input.shape(), &idx)); + auto idx_vec = idx->template vec<int32>(); + + std::unordered_map<T, int32> uniq; + uniq.reserve(2 * N); + for (int i = 0, j = 0; i < N; ++i) { + auto it = uniq.insert(std::make_pair(Tin(i), j)); + idx_vec(i) = it.first->second; + if (it.second) { + ++j; + } + } + int32 uniq_size = uniq.size(); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output( + 0, TensorShape({uniq_size}), &output)); + auto output_vec = output->template vec<T>(); + + for (auto it : uniq) { + output_vec(it.second) = it.first; + } + } +}; + +#define REGISTER_UNIQUE(type) \ + REGISTER_KERNEL_BUILDER( \ + 
Name("Unique").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + UniqueOp<type>) + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); +#undef REGISTER_UNIQUE +} // namespace tensorflow diff --git a/tensorflow/core/kernels/unique_op_test.cc b/tensorflow/core/kernels/unique_op_test.cc new file mode 100644 index 0000000000..658f2282cf --- /dev/null +++ b/tensorflow/core/kernels/unique_op_test.cc @@ -0,0 +1,51 @@ +#include <functional> +#include <memory> +#include <vector> + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/tensor.h" +#include <gtest/gtest.h> +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { + +namespace { + +static void BM_Unique(int iters, int dim) { + testing::StopTiming(); + RequireDefaultOps(); + Graph* g = new Graph(OpRegistry::Global()); + + Tensor input(DT_INT32, TensorShape({dim})); + input.flat<int32>().setRandom(); + + Node* node; + TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Unique") + .Input(test::graph::Constant(g, input)) + .Attr("T", DT_INT32) + .Finalize(g, &node)); + + testing::BytesProcessed(static_cast<int64>(iters) * dim * sizeof(int32)); + testing::UseRealTime(); + testing::StartTiming(); + test::Benchmark("cpu", g).Run(iters); +} + +BENCHMARK(BM_Unique) + ->Arg(32) + ->Arg(256) + ->Arg(1024) + ->Arg(4 * 1024) + ->Arg(16 * 1024) + ->Arg(64 * 1024) + ->Arg(256 * 1024); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc new file mode 100644 index 0000000000..36cfb2c8e5 --- /dev/null +++ b/tensorflow/core/kernels/unpack_op.cc @@ -0,0 +1,96 @@ +// See docs in ../ops/array_ops.cc. + +#define EIGEN_USE_THREADS + +#include <vector> + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/split_op.h" +#include "tensorflow/core/public/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class UnpackOp : public OpKernel { + public: + explicit UnpackOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* context) override { + const int32 num = num_outputs(); + const Tensor& input = context->input(0); + const TensorShape& input_shape = input.shape(); + + OP_REQUIRES( + context, input_shape.dims() > 0 && input_shape.dim_size(0) == num, + errors::InvalidArgument("Input shape must start with ", num, ", got ", + input_shape.ShortDebugString())); + + auto output_shape = input_shape; + output_shape.RemoveDim(0); + const int32 output_size = output_shape.num_elements(); + + // Special case: Aligned, so we can share the underlying buffer. + // + // Apply this optimization conservatively: if input is aligned, + // the resulting tensors must be aligned. 
It's conservative + because if the immediate consumer of the resulting tensors is + not using eigen for computation, it's perfectly fine to avoid + the copying. + if (output_size == 0 || IsInnerDimsSizeAligned<T>(input_shape)) { + for (int i = 0; i < num; ++i) { + Tensor output; + CHECK(output.CopyFrom(input.Slice(i, i + 1), output_shape)); + context->set_output(i, output); + } + return; + } + + // Except for shape, unpack is a special case of split, so we reuse the + // same computational kernels. + auto input_reshaped = input.shaped<T, 3>({1, num, output_size}); + + for (int i = 0; i < num; ++i) { + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output(i, output_shape, &output)); + auto output_shaped = output->shaped<T, 3>({1, 1, output_size}); + + Eigen::DSizes<ptrdiff_t, 3> indices{0, i, 0}; + Eigen::DSizes<ptrdiff_t, 3> sizes{1, 1, output_size}; + functor::Split<Device, T>()(context->eigen_device<Device>(), + output_shaped, input_reshaped, indices, + sizes); + } + } +}; + +#define REGISTER_UNPACK(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Unpack").Device(DEVICE_CPU).TypeConstraint<type>("T"), \ + UnpackOp<CPUDevice, type>) + +TF_CALL_ALL_TYPES(REGISTER_UNPACK); + +#undef REGISTER_UNPACK + +#if GOOGLE_CUDA + +#define REGISTER_GPU(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Unpack").Device(DEVICE_GPU).TypeConstraint<type>("T"), \ + UnpackOp<GPUDevice, type>) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU); +#undef REGISTER_GPU + +#endif // GOOGLE_CUDA + +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc new file mode 100644 index 0000000000..2f1dbc68c0 --- /dev/null +++ b/tensorflow/core/kernels/variable_ops.cc @@ -0,0 +1,37 @@ +#define EIGEN_USE_THREADS +#include "tensorflow/core/kernels/variable_ops.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { + +REGISTER_KERNEL_BUILDER(Name("Variable").Device(DEVICE_CPU), VariableOp); +REGISTER_KERNEL_BUILDER(Name("TemporaryVariable").Device(DEVICE_CPU), + TemporaryVariableOp); +REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable").Device(DEVICE_CPU), + DestroyTemporaryVariableOp); + +#if GOOGLE_CUDA +// Only register 'Variable' on GPU for the subset of types also supported by +// 'Assign' (see dense_update_ops.cc.)
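+// The REGISTER_GPU_KERNELS macro below registers Variable, TemporaryVariable
+// and DestroyTemporaryVariable for each such type, so all three ops share the
+// same GPU type list.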
+#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("Variable").Device(DEVICE_GPU).TypeConstraint<type>("dtype"), \ + VariableOp); \ + REGISTER_KERNEL_BUILDER(Name("TemporaryVariable") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("dtype"), \ + TemporaryVariableOp); \ + REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<type>("T"), \ + DestroyTemporaryVariableOp); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h new file mode 100644 index 0000000000..77d2da0ad4 --- /dev/null +++ b/tensorflow/core/kernels/variable_ops.h @@ -0,0 +1,146 @@ +#ifndef TENSORFLOW_KERNELS_VARIABLE_OPS_H_ +#define TENSORFLOW_KERNELS_VARIABLE_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/port.h" + +namespace tensorflow { + +class VariableOp : public OpKernel { + public: + explicit VariableOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_)); + dtype_ = RemoveRefType(context->output_type(0)); + } + + ~VariableOp() override { + if (var_) var_->Unref(); + } + + void Compute(OpKernelContext* ctx) override { + mutex_lock l(init_mu_); + if (var_ == nullptr) { + OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(), + true /* use name() */)); + auto creator = [this](Var** var) { + *var = new Var(dtype_); + (*var)->tensor()->set_shape(shape_); + return Status::OK(); + }; + OP_REQUIRES_OK(ctx, + cinfo_.resource_manager()->LookupOrCreate<Var>( + cinfo_.container(), cinfo_.name(), &var_, creator)); + } + // Output a reference to our tensor, so it may be updated. + // + // As long as *this is alive, the ref we return here is valid + // because *this owns a ref on var_. + ctx->set_output_ref(0, var_->mu(), var_->tensor()); + } + + private: + class Var : public ResourceBase { + public: + explicit Var(DataType dtype) : tensor_(dtype) {} + mutex* mu() { return &mu_; } + Tensor* tensor() { return &tensor_; } + + string DebugString() override { + return strings::StrCat(DataTypeString(tensor_.dtype()), "/", + tensor_.shape().ShortDebugString()); + } + + private: + mutex mu_; + Tensor tensor_; + + ~Var() override {} + TF_DISALLOW_COPY_AND_ASSIGN(Var); + }; + + DataType dtype_; + TensorShape shape_; + + mutex init_mu_; + ContainerInfo cinfo_ GUARDED_BY(init_mu_); + Var* var_ GUARDED_BY(init_mu_) = nullptr; + + TF_DISALLOW_COPY_AND_ASSIGN(VariableOp); +}; + +class TemporaryVariableOp : public OpKernel { + public: + explicit TemporaryVariableOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_)); + OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_)); + // Variable name defaults to op name if not specified explicitly. 
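+ // The temporary is created in the per-step resource manager under this name
+ // and is later deleted by DestroyTemporaryVariable using the same name.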
+ if (var_name_ == "") var_name_ = name(); + } + + void Compute(OpKernelContext* context) override { + Status s; + ResourceMgr* rm = context->step_resource_manager(); + OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager.")); + auto* tmp_var = new TmpVar; + OP_REQUIRES(context, tmp_var, + errors::ResourceExhausted("Could not allocate TmpVar.")); + tmp_var->name = var_name_; + s = context->allocate_temp(dtype_, shape_, &tmp_var->val); + if (!s.ok()) tmp_var->Unref(); + OP_REQUIRES_OK(context, s); + OP_REQUIRES_OK(context, rm->Create("tmp_var", var_name_, tmp_var)); + context->set_output_ref(0, &tmp_var->mu, &tmp_var->val); + } + + private: + // Refcounted temporary variable resource. + friend class DestroyTemporaryVariableOp; + struct TmpVar : public ResourceBase { + mutex mu; + Tensor val; + string name; + string DebugString() override { return name; } + ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; } + }; + + TensorShape shape_; + DataType dtype_; + string var_name_; +}; + +class DestroyTemporaryVariableOp : public OpKernel { + public: + explicit DestroyTemporaryVariableOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES(context, IsRefType(context->input_type(0)), + errors::InvalidArgument("lhs input needs to be a ref type")) + OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_)); + OP_REQUIRES(context, var_name_ != "", + errors::InvalidArgument("Missing var_name attribute")); + } + + void Compute(OpKernelContext* context) override { + // NOTE(pbar): All other mutators of the Tensor Ref *must* have completed + // their execution before this DestroyTemporaryVariable op executes. + // This is typically achieved using control dependencies. + CHECK(IsRefType(context->input_dtype(0))); + Tensor tmpvar = context->mutable_input(0, false); + context->set_output(0, tmpvar); + ResourceMgr* rm = context->step_resource_manager(); + OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager.")); + OP_REQUIRES_OK( + context, rm->Delete<TemporaryVariableOp::TmpVar>("tmp_var", var_name_)); + } + + private: + string var_name_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_VARIABLE_OPS_H_ diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc new file mode 100644 index 0000000000..9db0943ea7 --- /dev/null +++ b/tensorflow/core/kernels/where_op.cc @@ -0,0 +1,74 @@ +// See docs in ../ops/array_ops.cc. 
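+// Where returns the coordinates of the true elements of a bool tensor as an
+// int64 matrix of shape [num_true, input_rank]; NumTrue is computed first so
+// the output can be allocated to exactly that size.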
+ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/where_op.h" + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device> +class WhereOp : public OpKernel { + public: + explicit WhereOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + + const int input_dims = input.dims(); + Tensor num_true; + OP_REQUIRES_OK( + context, context->allocate_temp(DT_INT64, TensorShape({}), &num_true)); + auto num_true_t = num_true.scalar<int64>(); + + functor::NumTrue<Device>::Compute(context->eigen_device<Device>(), + input.flat<bool>(), num_true_t); + TensorShape output_shape({num_true_t(), input_dims}); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + +#define HANDLE_DIM(NDIM) \ + case NDIM: \ + functor::Where<Device, NDIM>::Compute(context->eigen_device<Device>(), \ + input.tensor<bool, NDIM>(), \ + output->matrix<int64>()); \ + break; + + switch (input_dims) { + HANDLE_DIM(1); + HANDLE_DIM(2); + HANDLE_DIM(3); + HANDLE_DIM(4); + HANDLE_DIM(5); + + default: + OP_REQUIRES(context, false, + errors::InvalidArgument( + "WhereOp : Unhandled input dimensions: ", input_dims)); + } +#undef HANDLE_DIM + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(WhereOp); +}; + +#define REGISTER_WHERE() \ + REGISTER_KERNEL_BUILDER(Name("Where").Device(DEVICE_CPU), WhereOp<CPUDevice>); + +REGISTER_WHERE(); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h new file mode 100644 index 0000000000..c7b835d02f --- /dev/null +++ b/tensorflow/core/kernels/where_op.h @@ -0,0 +1,65 @@ +#ifndef TENSORFLOW_KERNELS_WHERE_OP_H_ +#define TENSORFLOW_KERNELS_WHERE_OP_H_ + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +namespace functor { + +template <typename Device> +struct NumTrue { + EIGEN_ALWAYS_INLINE static void Compute( + const Device& d, typename TTypes<bool>::ConstFlat input, + TTypes<int64>::Scalar num_true) { + num_true.device(d) = input.template cast<int64>().sum(); + } +}; + +template <typename Device, int NDIM> +struct Where { + EIGEN_ALWAYS_INLINE static void Compute( + const Device& d, typename TTypes<bool, NDIM>::ConstTensor input, + typename TTypes<int64>::Matrix output) { + Eigen::DenseIndex true_n = 0; + Eigen::DSizes<Eigen::DenseIndex, NDIM> dims = input.dimensions(); + Eigen::DSizes<Eigen::DenseIndex, NDIM> strides; + + // Calculate strides for RowMajor order. + EIGEN_STATIC_ASSERT((static_cast<int>(decltype(input)::Layout) == + static_cast<int>(Eigen::RowMajor)), + INTERNAL_ERROR_INPUT_SHOULD_BE_ROWMAJOR); + + strides[NDIM - 1] = 1; + for (int i = NDIM - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[i + 1]; + } + + // Note, no bounds checking is done on true_n. 
It is assumed that + // the output was correctly sized via output of NumTrue::Compute. + for (Eigen::DenseIndex n = 0; n < input.size(); ++n) { + if (input.data()[n]) { + WriteIndexRowMajor(output, strides, true_n, n); + ++true_n; + } + } + } + + EIGEN_ALWAYS_INLINE static void WriteIndexRowMajor( + typename TTypes<int64>::Matrix output, + const Eigen::DSizes<Eigen::DenseIndex, NDIM>& strides, + Eigen::DenseIndex true_n, Eigen::DenseIndex index) { + for (int i = 0; i < NDIM; ++i) { + output(true_n, i) = index / strides[i]; + index %= strides[i]; + } + } +}; + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_WHERE_OP_H_ diff --git a/tensorflow/core/kernels/whole_file_read_ops.cc b/tensorflow/core/kernels/whole_file_read_ops.cc new file mode 100644 index 0000000000..b940163ec9 --- /dev/null +++ b/tensorflow/core/kernels/whole_file_read_ops.cc @@ -0,0 +1,108 @@ +// See docs in ../ops/io_ops.cc. + +#include <memory> +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/reader_op_kernel.h" +#include "tensorflow/core/kernels/reader_base.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/public/env.h" +#include "tensorflow/core/public/tensor_shape.h" + +namespace tensorflow { + +static Status ReadEntireFile(Env* env, const string& filename, + string* contents) { + uint64 file_size = 0; + TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size)); + contents->resize(file_size); + RandomAccessFile* file; + TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file)); + std::unique_ptr<RandomAccessFile> make_sure_file_gets_deleted(file); + StringPiece data; + TF_RETURN_IF_ERROR(file->Read(0, file_size, &data, &(*contents)[0])); + if (data.size() != file_size) { + return errors::DataLoss("Truncated read of '", filename, "' expected ", + file_size, " got ", data.size()); + } + if (data.data() != &(*contents)[0]) { + memmove(&(*contents)[0], data.data(), data.size()); + } + return Status::OK(); +} + +class WholeFileReader : public ReaderBase { + public: + WholeFileReader(Env* env, const string& node_name) + : ReaderBase(strings::StrCat("WholeFileReader '", node_name, "'")), + env_(env) {} + + Status ReadLocked(string* key, string* value, bool* produced, + bool* at_end) override { + *key = current_work(); + TF_RETURN_IF_ERROR(ReadEntireFile(env_, *key, value)); + *produced = true; + *at_end = true; + return Status::OK(); + } + + // Stores state in a ReaderBaseState proto, since WholeFileReader has + // no additional state beyond ReaderBase. 
+ Status SerializeStateLocked(string* state) override { + ReaderBaseState base_state; + SaveBaseState(&base_state); + base_state.SerializeToString(state); + return Status::OK(); + } + + Status RestoreStateLocked(const string& state) override { + ReaderBaseState base_state; + if (!ParseProtoUnlimited(&base_state, state)) { + return errors::InvalidArgument("Could not parse state for ", name(), ": ", + str_util::CEscape(state)); + } + TF_RETURN_IF_ERROR(RestoreBaseState(base_state)); + return Status::OK(); + } + + private: + Env* env_; +}; + +class WholeFileReaderOp : public ReaderOpKernel { + public: + explicit WholeFileReaderOp(OpKernelConstruction* context) + : ReaderOpKernel(context) { + Env* env = context->env(); + SetReaderFactory( + [this, env]() { return new WholeFileReader(env, name()); }); + } +}; + +REGISTER_KERNEL_BUILDER(Name("WholeFileReader").Device(DEVICE_CPU), + WholeFileReaderOp); + +class ReadFileOp : public OpKernel { + public: + using OpKernel::OpKernel; + void Compute(OpKernelContext* context) override { + const Tensor* input; + OP_REQUIRES_OK(context, context->input("filename", &input)); + OP_REQUIRES(context, TensorShapeUtils::IsScalar(input->shape()), + errors::InvalidArgument( + "Input filename tensor must be scalar, but had shape: ", + input->shape().DebugString())); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output("contents", + TensorShape({}), &output)); + OP_REQUIRES_OK(context, + ReadEntireFile(context->env(), input->scalar<string>()(), + &output->scalar<string>()())); + } +}; + +REGISTER_KERNEL_BUILDER(Name("ReadFile").Device(DEVICE_CPU), ReadFileOp); + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc new file mode 100644 index 0000000000..ff54d157af --- /dev/null +++ b/tensorflow/core/kernels/xent_op.cc @@ -0,0 +1,90 @@ +// See docs in ../ops/nn_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/kernels/xent_op.h" +#include "tensorflow/core/public/tensor.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template <typename Device, typename T> +class SoftmaxXentWithLogitsOp : public OpKernel { + public: + explicit SoftmaxXentWithLogitsOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& logits_in = context->input(0); + const Tensor& labels_in = context->input(1); + OP_REQUIRES(context, logits_in.IsSameSize(labels_in), + errors::InvalidArgument( + "logits and labels must be same size: logits_size=", + logits_in.shape().DebugString(), " labels_size=", + labels_in.shape().DebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()), + errors::InvalidArgument("logits must be 2-dimensional")); + // As we already tested that both inputs have the same shape no need to + // check that "labels" is a matrix too. + + // loss is 1-D (one per example), and size is batch_size. 
+ + Tensor scratch; + OP_REQUIRES_OK( + context, context->allocate_temp(DataTypeToEnum<T>::value, + TensorShape({logits_in.dim_size(0), 1}), + &scratch)); + + Tensor* loss_out = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output( + 0, TensorShape({logits_in.dim_size(0)}), &loss_out)); + Tensor* back_out = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(1, logits_in.shape(), &back_out)); + + functor::XentFunctor<Device, T> functor; + functor(context->eigen_device<Device>(), logits_in.matrix<T>(), + labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(), + back_out->matrix<T>()); + } +}; + +// Partial specialization for a CPUDevice, that uses the Eigen implementation +// from XentEigenImpl. +namespace functor { +template <typename T> +struct XentFunctor<CPUDevice, T> { + void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::ConstMatrix labels, + typename TTypes<T>::Matrix scratch, + typename TTypes<T>::Vec loss, + typename TTypes<T>::Matrix backprop) { + XentEigenImpl<CPUDevice, T>::Compute(d, logits, labels, scratch, loss, + backprop); + } +}; +} // namespace functor + +REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T"), + SoftmaxXentWithLogitsOp<CPUDevice, float>); +REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") + .Device(DEVICE_CPU) + .TypeConstraint<double>("T"), + SoftmaxXentWithLogitsOp<CPUDevice, double>); + +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T"), + SoftmaxXentWithLogitsOp<GPUDevice, float>); +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/tensorflow/core/kernels/xent_op.h b/tensorflow/core/kernels/xent_op.h new file mode 100644 index 0000000000..edb7d817c8 --- /dev/null +++ b/tensorflow/core/kernels/xent_op.h @@ -0,0 +1,102 @@ +#ifndef TENSORFLOW_KERNELS_XENT_OP_H_ +#define TENSORFLOW_KERNELS_XENT_OP_H_ +// Functor definition for XentOp, must be compilable by nvcc. + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +// Functor used by XentOp to do the computations. +template <typename Device, typename T> +struct XentFunctor { + // Computes Cross Entropy loss and backprop. + // + // logits: batch_size, num_classes. + // labels: batch_size, num_classes. + // scratch: temporary tensor, dims: batch_size, 1 + // loss: output tensor for the loss, dims: batch_size. + // backprop: output tensor for the backprop, dims: batch_size, num_classes. + void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::ConstMatrix labels, + typename TTypes<T>::Matrix scratch, + typename TTypes<T>::Vec loss, + typename TTypes<T>::Matrix backprop); +}; + +// Eigen code implementing XentFunctor::operator(). +// This code works for both CPU and GPU and is used by the functor +// specializations for both device types. +template <typename Device, typename T> +struct XentEigenImpl { + static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::ConstMatrix labels, + typename TTypes<T>::Matrix scratch, + typename TTypes<T>::Vec loss, + typename TTypes<T>::Matrix backprop) { + // NOTE(mdevin): This duplicates some of the computations in softmax_op + // because we need the intermediate (logits -max(logits)) values to + // avoid a log(exp()) in the computation of the loss. 
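The note above is the standard log-sum-exp stabilization: exp() of a large logit overflows, so the reduction is carried out on logits - max(logits). A small standalone sketch (illustrative, not part of the patch) of why the shift matters:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // Logits large enough that exp() overflows a double.
  const std::vector<double> logits = {1000.0, 1001.0};

  // Naive log(sum(exp(logits))): exp(1000) is already +inf.
  const double naive = std::log(std::exp(logits[0]) + std::exp(logits[1]));

  // Shifted form: subtract max(logits) so the largest exponent is exp(0),
  // then add the max back outside the log.
  const double max_logit = std::max(logits[0], logits[1]);
  const double shifted = std::log(std::exp(logits[0] - max_logit) +
                                  std::exp(logits[1] - max_logit)) +
                         max_logit;

  std::printf("naive:   %f\n", naive);    // inf
  std::printf("shifted: %f\n", shifted);  // ~1001.313262
}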
+ + const int kBatchDim = 0; + const int kClassDim = 1; + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + +// These arrays are used to reduce along the class dimension, and broadcast +// the resulting value to all classes. +#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::array<int, 1> along_class; + along_class[0] = kClassDim; + Eigen::array<int, 1> batch_only; + batch_only[0] = batch_size; + Eigen::array<int, 2> batch_by_one; + batch_by_one[0] = batch_size; + batch_by_one[1] = 1; + Eigen::array<int, 2> one_by_class; + one_by_class[0] = 1; + one_by_class[1] = num_classes; +#else + Eigen::IndexList<Eigen::type2index<kClassDim> > along_class; + Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one; + batch_by_one.set(0, batch_size); + Eigen::IndexList<int> batch_only; + batch_only.set(0, batch_size); + Eigen::IndexList<Eigen::type2index<1>, int> one_by_class; + one_by_class.set(1, num_classes); +#endif + + // max_logits along classes. + scratch.reshape(batch_only).device(d) = logits.maximum(along_class); + + // logits - max_logits. + backprop.device(d) = logits - scratch.broadcast(one_by_class); + + // sum(exp(logits - max_logits)) along classes. + scratch.reshape(batch_only).device(d) = backprop.exp().sum(along_class); + + // NOTE(keveman): Eigen on GPU dispatches to an optimized implementaion + // for an expression of the form lhs = rhs.sum(). + // lhs = -rhs.sum() doesn't match the above pattern, so folding in the + // negation before calling sum(). + // sum(-labels * + // ((logits - max_logits) - log(sum(exp(logits - max_logits))))) + // along classes + loss.device(d) = + (labels * (scratch.log().eval().broadcast(one_by_class) - backprop)) + .eval() + .sum(along_class); + + // backprop: prob - labels, where + // prob = exp(logits - max_logits) / sum(exp(logits - max_logits)) + backprop.device(d) = + (backprop.exp() / scratch.broadcast(one_by_class)) - labels; + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_XENT_OP_H_ diff --git a/tensorflow/core/kernels/xent_op_gpu.cu.cc b/tensorflow/core/kernels/xent_op_gpu.cu.cc new file mode 100644 index 0000000000..eec6a84281 --- /dev/null +++ b/tensorflow/core/kernels/xent_op_gpu.cu.cc @@ -0,0 +1,35 @@ +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/xent_op.h" + +#include "tensorflow/core/platform/port.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +// Partial specialization for a GPUDevice, that uses the Eigen implementation +// from XentEigenImpl. +namespace functor { +template <typename T> +struct XentFunctor<GPUDevice, T> { + void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits, + typename TTypes<T>::ConstMatrix labels, + typename TTypes<T>::Matrix scratch, + typename TTypes<T>::Vec loss, + typename TTypes<T>::Matrix backprop) { + XentEigenImpl<GPUDevice, T>::Compute(d, logits, labels, scratch, loss, + backprop); + } +}; +} // end namespace functor + +// Instantiate the GPU implementation for float. 
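As a concrete end-to-end check, here is the arithmetic XentEigenImpl performs above, written out for a single two-class row with plain scalars (illustrative only, not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // One row of the batch: two classes, one-hot label on class 1.
  const std::vector<double> logits = {1.0, 2.0};
  const std::vector<double> labels = {0.0, 1.0};

  // logits - max(logits): what the "backprop" buffer holds after the
  // first two steps of Compute.
  const double max_logit = std::max(logits[0], logits[1]);
  const std::vector<double> shifted = {logits[0] - max_logit,
                                       logits[1] - max_logit};

  // sum(exp(logits - max_logits)): the "scratch" value for this row.
  const double sum_exp = std::exp(shifted[0]) + std::exp(shifted[1]);

  // loss = sum(labels * (log(sum_exp) - shifted)), matching the folded
  // negation in the kernel.
  double loss = 0.0;
  for (int i = 0; i < 2; ++i)
    loss += labels[i] * (std::log(sum_exp) - shifted[i]);
  std::printf("loss = %f\n", loss);  // ~0.313262

  // backprop = exp(shifted) / sum_exp - labels, i.e. softmax - labels.
  for (int i = 0; i < 2; ++i)
    std::printf("backprop[%d] = %f\n", i,
                std::exp(shifted[i]) / sum_exp - labels[i]);
  // ~ {0.268941, -0.268941}
}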
+template struct functor::XentFunctor<GPUDevice, float>; + +} // end namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/xent_op_test.cc b/tensorflow/core/kernels/xent_op_test.cc new file mode 100644 index 0000000000..9aab1b09bf --- /dev/null +++ b/tensorflow/core/kernels/xent_op_test.cc @@ -0,0 +1,46 @@ +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include <gtest/gtest.h> +#include "tensorflow/core/kernels/xent_op.h" + +namespace tensorflow { + +static Graph* Xent(int batch_size, int num_classes) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor logits(DT_FLOAT, TensorShape({batch_size, num_classes})); + logits.flat<float>().setRandom(); + Tensor labels(DT_FLOAT, TensorShape({batch_size, num_classes})); + labels.flat<float>().setRandom(); + test::graph::Binary(g, "SoftmaxCrossEntropyWithLogits", + test::graph::Constant(g, logits), + test::graph::Constant(g, labels)); + return g; +} + +#define BM_XentDev(BATCH, CLASS, DEVICE) \ + static void BM_Xent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \ + testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \ + test::Benchmark(#DEVICE, Xent(BATCH, CLASS)).Run(iters); \ + } \ + BENCHMARK(BM_Xent##_##BATCH##_##CLASS##_##DEVICE); + +/// The representative tests for ptb_word on GPU +BM_XentDev(16, 10000, gpu); +BM_XentDev(16, 30000, gpu); +BM_XentDev(16, 100000, gpu); + +BM_XentDev(32, 10000, gpu); +BM_XentDev(32, 30000, gpu); +BM_XentDev(32, 100000, gpu); + +BM_XentDev(64, 10000, gpu); +BM_XentDev(64, 30000, gpu); +BM_XentDev(64, 100000, gpu); + +/// Only the smaller tests for CPU. Otherwise, it's too slow +BM_XentDev(16, 10000, cpu); +BM_XentDev(32, 10000, cpu); +BM_XentDev(64, 10000, cpu); + +} // end namespace tensorflow |
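For readers unfamiliar with the token pasting in BM_XentDev, each instantiation expands to an ordinary benchmark function; BM_XentDev(16, 10000, gpu), for example, becomes roughly the following (it reuses the includes and the Xent() helper already defined in xent_op_test.cc):

static void BM_Xent_16_10000_gpu(int iters) {
  // Report throughput as batch * classes items per iteration.
  testing::ItemsProcessed(static_cast<int64>(iters) * 16 * 10000);
  test::Benchmark("gpu", Xent(16, 10000)).Run(iters);
}
BENCHMARK(BM_Xent_16_10000_gpu);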