diff options
author | 2017-09-08 11:14:21 -0700 | |
---|---|---|
committer | 2017-09-08 11:18:21 -0700 | |
commit | 477a221a2ffee7261220ad6c0f4f8c76a5eb7931 (patch) | |
tree | 280e78e8f928390d886e6b0e5a03a71c588a51a5 /tensorflow/core/kernels/aggregate_ops.cc | |
parent | 96828c9f5276a759717e0d9574b34bcd456d11a5 (diff) |
Modify variant registry to have UnaryOp and BinaryOp registrations. Speed up registry lookup.
* Op type is described as an enum (separate enums for unary and binary ops).
* Modified ZerosLike registrations to unary registrations with ZEROS_LIKE enum.
* Added Add(a,b) registrations as binary registrations with ADD enum.
* The AddN op now uses the ADD BinaryOp registrations, and the ZerosLike op
was modified to use the ZEROS_LIKE UnaryOp registrations.
* Modified the registry tables' keys from string type to StringPiece type.
The reduced copying should speed up registry lookups by ops. This required
creating a backing store for the device and type_name strings passed in at registration.
PiperOrigin-RevId: 168020449
Diffstat (limited to 'tensorflow/core/kernels/aggregate_ops.cc')
-rw-r--r-- | tensorflow/core/kernels/aggregate_ops.cc | 78 |
1 file changed, 76 insertions, 2 deletions
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc index 0aa65729de..0099984f69 100644 --- a/tensorflow/core/kernels/aggregate_ops.cc +++ b/tensorflow/core/kernels/aggregate_ops.cc @@ -24,6 +24,9 @@ limitations under the License. #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/variant.h" +#include "tensorflow/core/framework/variant_encode_decode.h" +#include "tensorflow/core/framework/variant_op_registry.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/platform/logging.h" @@ -33,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #ifdef TENSORFLOW_USE_SYCL typedef Eigen::SyclDevice SYCLDevice; -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL template <typename Device, typename T> class AddNOp : public OpKernel { @@ -150,6 +153,65 @@ class AddNOp : public OpKernel { } }; +template <typename Device> +class AddNOp<Device, Variant> : public OpKernel { + public: + explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* ctx) override { + if (!ctx->ValidateInputsAreSameShape(this)) return; + + const Tensor& input0 = ctx->input(0); + const int num = ctx->num_inputs(); + + if (num == 1) { + ctx->set_output(0, input0); + return; + } + + for (int i = 0; i < num; ++i) { + // Step 1: ensure unary variants. + OP_REQUIRES( + ctx, ctx->input(i).dims() == 0, + errors::InvalidArgument( + "AddN of non-scalar Tensor with dtype=DT_VARIANT is not " + "supported; inputs[", + i, " has shape: ", ctx->input(i).shape().DebugString(), ".")); + } + + TensorShape common_shape; + OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(0), &common_shape)); + // Step 2: access all variants and ensure shapes match. 
+ for (int i = 1; i < num; ++i) { + TensorShape check_shape; + OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(i), &check_shape)); + OP_REQUIRES(ctx, common_shape == check_shape, + errors::InvalidArgument( + "AddN of Variants of differing shapes; inputs[0] shape: ", + common_shape.DebugString(), ", inputs[", i, + "] shape: ", check_shape.DebugString())); + } + + // Step 3: attempt to add using + // BinaryOpVariants(ADD_VARIANT_BINARY_OP, ...) + // For the output create a default-constructed variant object. + // TODO(ebrevdo): Perform summation in a tree-structure. + Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({})); + Variant* v_out = &(out.scalar<Variant>()()); + OP_REQUIRES_OK( + ctx, BinaryOpVariants<Device>( + ctx, ADD_VARIANT_BINARY_OP, ctx->input(0).scalar<Variant>()(), + ctx->input(1).scalar<Variant>()(), v_out)); + for (int i = 2; i < num; ++i) { + const Variant tmp = std::move(*v_out); + const Variant& inp = ctx->input(i).scalar<Variant>()(); + OP_REQUIRES_OK(ctx, BinaryOpVariants<Device>(ctx, ADD_VARIANT_BINARY_OP, + inp, tmp, v_out)); + } + ctx->set_output(0, out); + } +}; + #define REGISTER_ADDN(type, dev) \ REGISTER_KERNEL_BUILDER( \ Name("AddN").Device(DEVICE_##dev).TypeConstraint<type>("T"), \ @@ -158,6 +220,8 @@ class AddNOp : public OpKernel { #define REGISTER_ADDN_CPU(type) REGISTER_ADDN(type, CPU) TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU); +REGISTER_ADDN_CPU(Variant); + #undef REGISTER_ADDN_CPU #if GOOGLE_CUDA @@ -176,6 +240,16 @@ REGISTER_KERNEL_BUILDER(Name("AddN") .HostMemory("inputs") .HostMemory("sum"), AddNOp<CPUDevice, int32>); + +// TODO(ebrevdo): Once rendezvous has been properly set up for +// Variants, we'll no longer need a HostMemory attribute for this case. 
+REGISTER_KERNEL_BUILDER(Name("AddN") + .Device(DEVICE_GPU) + .TypeConstraint<Variant>("T") + .HostMemory("inputs") + .HostMemory("sum"), + AddNOp<GPUDevice, Variant>); + #endif // GOOGLE_CUDA #ifdef TENSORFLOW_USE_SYCL @@ -191,7 +265,7 @@ REGISTER_KERNEL_BUILDER(Name("AddN") .HostMemory("inputs") .HostMemory("sum"), AddNOp<CPUDevice, int32>); -#endif // TENSORFLOW_USE_SYCL +#endif // TENSORFLOW_USE_SYCL #undef REGISTER_ADDN |