author		2017-02-21 17:31:57 -0800
committer	2017-02-21 17:52:15 -0800
commit		4891c01b1cadf085a915a3eac5dd1b8d8cdee203 (patch)
tree		87ec00e1927877ba26a2ffb69bc4f74f25c36f6a /tensorflow
parent		123c2bb0af532d5fdaa05358158da33497d4bfe6 (diff)
Allow (safe) in-place computation in TensorFlow C++ ops. When at least one input tensor has the same size and type as the output, and the underlying buffer is owned by the op, i.e. when its refcount is 1 at the time the op's Compute method executes, the computation can be performed in place, and the allocation of a separate output buffer can be avoided.
I updated the following ops to perform in-place computation automatically when possible (see the usage sketch below):
* All standard coefficient-wise unary and binary operators (including with broadcasting) inheriting from base classes in kernels/cwise_ops_common.h.
* Unary and binary operators inheriting from base classes in framework/numeric_op.h. This is mostly old code for the Relu family and associated gradients.
* All linear algebra ops inheriting from linalg_common.
* Misc individual files/ops: softmax, select, bias, aggregate ops, batch_norm & fused_batch_norm, adjust_hue, constant, depthwise_conv_grad, fractional_avg_pool, misc. pooling ops, matrix_set_diag, xent & sparse_xent, unique_op.
Change: 148166936
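For illustration, the forwarding-first allocation pattern these kernels now follow is sketched below. This is a hedged sketch, not code from this change: MyUnaryOp and its body are invented for illustration, while forward_input_to_output and allocate_output are the OpKernelContext methods this change adds and uses.

// Sketch only: MyUnaryOp is a hypothetical kernel showing the pattern.
class MyUnaryOp : public OpKernel {
 public:
  explicit MyUnaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    Tensor* output = nullptr;
    // Try to alias input 0's buffer for output 0. This succeeds only when
    // the input is not a ref, its refcount is 1 (the op owns the buffer),
    // and its dtype, element count, and memory type match the output's.
    if (!context->forward_input_to_output(0, 0, &output)) {
      // Could not forward; fall back to allocating a fresh output buffer.
      OP_REQUIRES_OK(context,
                     context->allocate_output(0, input.shape(), &output));
    }
    // Compute into *output; if forwarding succeeded this writes in place
    // over the input buffer, which is safe because no one else holds a ref.
  }
};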
Diffstat (limited to 'tensorflow')
26 files changed, 365 insertions, 108 deletions
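For orientation before the raw diff: the sketch below condenses the eligibility checks that the new OpKernelContext::forward_input_to_output_with_shape (added in op_kernel.cc below) performs. It is a paraphrase under assumptions, not verbatim code; in particular, a free function like this could not actually call the private Tensor::RefCountIsOne, which the real implementation reaches through a friend declaration added in tensor.h.

// Hypothetical helper paraphrasing the forwarding eligibility test.
bool CanForwardBuffer(const TensorValue& input, DataType input_dtype,
                      DataType expected_output_dtype,
                      int64 output_num_elements, MemoryType input_mem_type,
                      MemoryType output_mem_type) {
  // The input must exist, must not be a ref, and the op must hold the only
  // reference to its buffer (including any underlying root buffer).
  if (input.tensor == nullptr || input.is_ref() || !input->RefCountIsOne()) {
    return false;
  }
  // Input and output dtypes must match exactly.
  if (expected_output_dtype != input_dtype) return false;
  // Element counts must match (shapes may differ, hence "_with_shape").
  if (input.tensor->shape().num_elements() != output_num_elements) {
    return false;
  }
  // Both buffers must live in the same memory space (host or device).
  if (input_mem_type != output_mem_type) return false;
  return true;  // Safe to alias: the output can share the input's buffer.
}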
diff --git a/tensorflow/core/framework/numeric_op.h b/tensorflow/core/framework/numeric_op.h
index f24bcfead3..891e077657 100644
--- a/tensorflow/core/framework/numeric_op.h
+++ b/tensorflow/core/framework/numeric_op.h
@@ -56,9 +56,11 @@ class UnaryElementWiseOp : public UnaryOp<T> {
   void Compute(OpKernelContext* context) override {
     // Output shape is the same as input shape.
     const Tensor& input = context->input(0);
-    Tensor* output;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input.shape(), &output));
+    Tensor* output = nullptr;
+    if (!context->forward_input_to_output(0, 0, &output)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, input.shape(), &output));
+    }
     static_cast<CHILD*>(this)->Operate(context, input, output);
   }
 };
@@ -77,8 +79,11 @@ class BinaryElementWiseOp : public BinaryOp<T> {
       return;
     }
 
-    Tensor* output;
-    OP_REQUIRES_OK(context, context->allocate_output(0, a.shape(), &output));
+    Tensor* output = nullptr;
+    if (!context->forward_input_to_output(0, 0, &output) &&
+        !context->forward_input_to_output(1, 0, &output)) {
+      OP_REQUIRES_OK(context, context->allocate_output(0, a.shape(), &output));
+    }
 
     // Dispatch to the descendant's Operate() function.
     switch (a.dims()) {
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index b35e4ac243..a56b8cb4b3 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -347,6 +347,106 @@ void OpKernelContext::forward_ref_input_to_ref_output(int input_index,
                        (*params_->inputs)[input_index].tensor);
 }
 
+bool OpKernelContext::forward_input_to_output(int input_index,
+                                              int output_index,
+                                              Tensor** output) {
+  DCHECK_GE(input_index, 0);
+  DCHECK_LT(input_index, params_->inputs->size());
+  const TensorValue& input = (*params_->inputs)[input_index];
+  if (input.tensor == nullptr) {
+    return false;
+  }
+  return forward_input_to_output_with_shape(input_index, output_index,
+                                            input.tensor->shape(), output);
+}
+
+Status OpKernelContext::forward_input_to_output(StringPiece input_name,
+                                                StringPiece output_name,
+                                                Tensor** output) {
+  int input_index, output_index, stop;
+  TF_RETURN_IF_ERROR(
+      params_->op_kernel->InputRange(input_name, &input_index, &stop));
+  if (stop != input_index + 1) {
+    return errors::InvalidArgument("OpKernel used list-valued input name '",
+                                   input_name,
+                                   "' when single-valued input was "
+                                   "expected");
+  }
+  TF_RETURN_IF_ERROR(
+      params_->op_kernel->OutputRange(output_name, &output_index, &stop));
+  if (stop != output_index + 1) {
+    return errors::InvalidArgument("OpKernel used list-valued output name '",
+                                   output_name,
+                                   "' when single-valued output was "
+                                   "expected");
+  }
+  if (!forward_input_to_output(input_index, output_index, output)) {
+    return errors::FailedPrecondition("OpKernel could not forward input '",
+                                      input_name, "' to output '",
+                                      output_name, "'");
+  }
+  return Status::OK();
+}
+
+bool OpKernelContext::forward_input_to_output_with_shape(
+    int input_index, int output_index, const TensorShape& output_shape,
+    Tensor** output) {
+  DCHECK_GE(input_index, 0);
+  DCHECK_LT(input_index, params_->inputs->size());
+  const TensorValue& input = (*params_->inputs)[input_index];
+  // Check that the input tensor exists, is not a ref, and has no other
+  // consumers.
+  if (input.tensor == nullptr || input.is_ref() || !input->RefCountIsOne()) {
+    return false;
+  }
+  DCHECK_GE(output_index, 0);
+  DCHECK_LT(output_index, num_outputs());
+  // Check that input and output types match.
+  if (expected_output_dtype(output_index) != input_dtype(input_index)) {
+    return false;
+  }
+  // Check that the input and output sizes are compatible.
+  if (input.tensor->shape().num_elements() != output_shape.num_elements()) {
+    return false;
+  }
+  // Check that input and output memory types match, i.e.
+  // that they either both live in host or both live in device memory.
+  if (op_kernel().output_memory_types()[output_index] !=
+      op_kernel().input_memory_types()[input_index]) {
+    return false;
+  }
+  Tensor* output_tensor = new Tensor();
+  CHECK(output_tensor->CopyFrom(*input.tensor, output_shape));
+  outputs_[output_index] = TensorValue(output_tensor);
+  *output = outputs_[output_index].tensor;
+  return true;
+}
+
+Status OpKernelContext::forward_input_to_output_with_shape(
+    StringPiece input_name, StringPiece output_name,
+    const TensorShape& output_shape, Tensor** output) {
+  int input_index, output_index, stop;
+  TF_RETURN_IF_ERROR(
+      params_->op_kernel->InputRange(input_name, &input_index, &stop));
+  if (stop != input_index + 1) {
+    return errors::InvalidArgument("OpKernel used list-valued input name '",
+                                   input_name,
+                                   "' when single-valued input was "
+                                   "expected");
+  }
+  TF_RETURN_IF_ERROR(
+      params_->op_kernel->OutputRange(output_name, &output_index, &stop));
+  if (stop != output_index + 1) {
+    return errors::InvalidArgument("OpKernel used list-valued output name '",
+                                   output_name,
+                                   "' when single-valued output was "
+                                   "expected");
+  }
+  if (!forward_input_to_output_with_shape(input_index, output_index,
+                                          output_shape, output)) {
+    return errors::FailedPrecondition("OpKernel could not forward input '",
+                                      input_name, "' to output '",
+                                      output_name, "'");
+  }
+  return Status::OK();
+}
+
 void OpKernelContext::delete_ref_input(int index, bool lock_held) {
   DCHECK_GE(index, 0);
   DCHECK_LT(index, params_->inputs->size());
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 75ad4bb7fc..201e247615 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -643,12 +643,6 @@ class OpKernelContext {
   Status replace_ref_input(StringPiece name, const Tensor& tensor,
                            bool lock_held);
 
-  // Set the output Ref Tensor at output_index to be an alias of the
-  // input Ref Tensor at input_index.
-  // REQUIRES: IsRefType(input_dtype(input_index)).
-  // REQUIRES: IsRefType(output_dtype(output_index)).
-  void forward_ref_input_to_ref_output(int input_index, int output_index);
-
   // Deletes the Tensor object used as the Ref Input at
   // input_index. This is not usually necessary and should be used
   // with caution. If !lock_held the input mutex will be acquired
@@ -667,6 +661,37 @@ class OpKernelContext {
   // Usage: if (!context->ValidateInputsAreSameShape(this)) return;
   bool ValidateInputsAreSameShape(OpKernel* op);
 
+  // Input to output forwarding.
+
+  // Set the output Ref Tensor at output_index to be an alias of the
+  // input Ref Tensor at input_index.
+  // REQUIRES: IsRefType(input_dtype(input_index)).
+  // REQUIRES: IsRefType(output_dtype(output_index)).
+  void forward_ref_input_to_ref_output(int input_index, int output_index);
+
+  // Returns true when an alias to input[input_index] that is safe to use for
+  // in-place computation was written to *output. Returns false if
+  // input[input_index] has a refcount greater than one or if its type does
+  // not match the expected output type of output[output_index].
+  bool forward_input_to_output(int input_index, int output_index,
+                               Tensor** output);
+  Status forward_input_to_output(StringPiece input_name,
+                                 StringPiece output_name, Tensor** output);
+
+  // Returns true when an alias to input[input_index], reshaped to
+  // output_shape, that is safe to use for in-place computation was written
+  // to *output. Returns false if input[input_index] has a refcount greater
+  // than one, or if its type does not match the expected output type of
+  // output[output_index], or if the number of elements in input[input_index]
+  // does not equal the number of elements in output_shape.
+  bool forward_input_to_output_with_shape(int input_index, int output_index,
+                                          const TensorShape& output_shape,
+                                          Tensor** output);
+  Status forward_input_to_output_with_shape(StringPiece input_name,
+                                            StringPiece output_name,
+                                            const TensorShape& output_shape,
+                                            Tensor** output);
+
   // Output
 
   // Returns the named list-valued output in "list", as defined in the OpDef.
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index f622d031f2..68c6817448 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -526,6 +526,14 @@ void Tensor::UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
   }
 }
 
+// Notice that buf_ either points to a regular TensorBuffer or a SubBuffer.
+// For the latter case, we have to make sure that the refcount is
+// one both for the SubBuffer _and_ the underlying TensorBuffer.
+bool Tensor::RefCountIsOne() const {
+  return buf_ != nullptr && buf_->RefCountIsOne() &&
+         buf_->root_buffer()->RefCountIsOne();
+}
+
 // The macro CASES() expands to a switch statement conditioned on
 // TYPE_ENUM. Each case expands the STMTS after a typedef for T.
 #define SINGLE_ARG(...) __VA_ARGS__
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index c9ddad3bdb..d9b22525c4 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -414,6 +414,9 @@ class Tensor {
                                     const TensorShape&);
 
  private:
+  // Returns true if the refcount on buf_ and any possible underlying root
+  // buffer is one.
+  bool RefCountIsOne() const;
   void CheckType(DataType expected_dtype) const;
   void CheckTypeAndIsAligned(DataType expected_dtype) const;
   void CheckIsAlignedAndSingleElement() const;
@@ -439,6 +442,7 @@ class Tensor {
   friend class TensorTestHelper;  // For access to set_shape
   template <typename Device, typename T>
   friend class CreateVariableOp;
+  friend class OpKernelContext;  // For access to RefCountIsOne().
 
   // Creates a tensor with the input datatype, shape and buf.
   //
diff --git a/tensorflow/core/kernels/adjust_hue_op.cc b/tensorflow/core/kernels/adjust_hue_op.cc
index 98934b4e5b..144bde2889 100644
--- a/tensorflow/core/kernels/adjust_hue_op.cc
+++ b/tensorflow/core/kernels/adjust_hue_op.cc
@@ -58,8 +58,10 @@ class AdjustHueOpBase : public OpKernel {
                                         channels, " channels."));
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input.shape(), &output));
+    if (!context->forward_input_to_output(0, 0, &output)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, input.shape(), &output));
+    }
 
     if (input.NumElements() > 0) {
       const int64 channel_count = input.NumElements() / channels;
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc
index 50d0cc1727..0f5186eb07 100644
--- a/tensorflow/core/kernels/aggregate_ops.cc
+++ b/tensorflow/core/kernels/aggregate_ops.cc
@@ -49,7 +49,13 @@ class AddNOp : public OpKernel {
     }
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output));
+    bool reused_input_buffer = false;
+    for (int i = 0; i < num && !reused_input_buffer; ++i) {
+      reused_input_buffer = ctx->forward_input_to_output(i, 0, &output);
+    }
+    if (!reused_input_buffer) {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output));
+    }
     auto To = output->flat<T>();
 
 #define I(IDX) ctx->input(IDX).flat<T>()
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
index f4aa759643..7c95d4dd20 100644
--- a/tensorflow/core/kernels/batch_norm_op.cc
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -115,15 +115,25 @@ class BatchNormGradOp : public OpKernel {
                     out_backprop.shape().DebugString()));
 
     Tensor* dx = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &dx));
+    if (!context->forward_input_to_output(0, 0, &dx)) {
+      OP_REQUIRES_OK(context, context->allocate_output(0, input.shape(), &dx));
+    }
     Tensor* dm = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(1, mean.shape(), &dm));
+    if (!context->forward_input_to_output(1, 1, &dm)) {
+      OP_REQUIRES_OK(context, context->allocate_output(1, mean.shape(), &dm));
+    }
     Tensor* dv = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(2, var.shape(), &dv));
+    if (!context->forward_input_to_output(2, 2, &dv)) {
+      OP_REQUIRES_OK(context, context->allocate_output(2, var.shape(), &dv));
+    }
     Tensor* db = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(3, mean.shape(), &db));
+    if (!context->forward_input_to_output(3, 3, &db)) {
+      OP_REQUIRES_OK(context, context->allocate_output(3, mean.shape(), &db));
+    }
     Tensor* dg = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dg));
+    if (!context->forward_input_to_output(4, 4, &dg)) {
+      OP_REQUIRES_OK(context, context->allocate_output(4, gamma.shape(), &dg));
+    }
 
     // Scratch buffer of [depth] dimension, aka the 4th dimension of input,
     // which is dim_size(3), for calculating various combinations of
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index 46e12cff2a..92696f8c07 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -74,8 +74,10 @@ class BiasOp<CPUDevice, T> : public BinaryOp<T> {
                     bias.shape().DebugString(), " vs. ",
                     input.shape().DebugString()));
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input.shape(), &output));
+    if (!context->forward_input_to_output(0, 0, &output)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, input.shape(), &output));
+    }
     if (input.NumElements() == 0) return;
 
     switch (input.shape().dims()) {
@@ -271,8 +273,10 @@ class BiasOp<GPUDevice, T> : public BinaryOp<T> {
                     bias.shape().DebugString(), " vs. ", channel, " in ",
                     input.shape().DebugString()));
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input.shape(), &output));
+    if (!context->forward_input_to_output(0, 0, &output)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, input.shape(), &output));
+    }
     if (input.NumElements() > 0) {
       BiasGPU<T>::compute(context->template eigen_device<Device>(),
                           input.flat<T>().data(), bias.flat<T>().data(),
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 306736fe54..0de6f38451 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -228,7 +228,9 @@ class ZerosLikeOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& input = ctx->input(0);
     Tensor* out = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &out));
+    if (!ctx->forward_input_to_output(0, 0, &out)) {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &out));
+    }
     functor::SetZeroFunctor<Device, T> f;
     f(ctx->eigen_device<Device>(), out->flat<T>());
   }
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index 8160fb74c2..0404d0e997 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -92,7 +92,10 @@ class SelectOp : public OpKernel {
                     else_->shape().DebugString()));
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, then->shape(), &output));
+    if (!ctx->forward_input_to_output("t", "output", &output).ok() &&
+        !ctx->forward_input_to_output("e", "output", &output).ok()) {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, then->shape(), &output));
+    }
     if (output->NumElements() > 0) {
       functor::BatchSelectFunctor<Device, T> func;
       func(ctx->eigen_device<Device>(), output->flat_outer_dims<T>(),
@@ -105,7 +108,10 @@ class SelectOp : public OpKernel {
                     const Tensor* then, const Tensor* else_) {
     if (!ctx->ValidateInputsAreSameShape(this)) return;
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, then->shape(), &output));
+    if (!ctx->forward_input_to_output("t", "output", &output).ok() &&
+        !ctx->forward_input_to_output("e", "output", &output).ok()) {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, then->shape(), &output));
+    }
     if (output->NumElements() > 0) {
       functor::SelectFunctor<Device, T> func;
       func(ctx->eigen_device<Device>(), output->flat<T>(), cond->flat<bool>(),
@@ -123,7 +129,10 @@ class SelectOp : public OpKernel {
                     else_->shape().DebugString()));
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, then->shape(), &output));
+    if (!ctx->forward_input_to_output("t", "output", &output).ok() &&
+        !ctx->forward_input_to_output("e", "output", &output).ok()) {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, then->shape(), &output));
+    }
     if (output->NumElements() > 0) {
       functor::SelectScalarFunctor<Device, T> func;
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
index c675faeea1..0a3b29b970 100644
--- a/tensorflow/core/kernels/cwise_ops_common.cc
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -55,11 +55,14 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
                     in1.shape().DebugString()));
     return;
   }
-  OP_REQUIRES_OK(
-      ctx, ctx->allocate_output(0, BCast::ToShape(bcast.output_shape()), &out));
-  out_num_elements = out->NumElements();
+  const TensorShape output_shape = BCast::ToShape(bcast.output_shape());
+  out_num_elements = output_shape.num_elements();
   in0_num_elements = in0.NumElements();
   in1_num_elements = in1.NumElements();
+  if (!ctx->forward_input_to_output_with_shape(0, 0, output_shape, &out) &&
+      !ctx->forward_input_to_output_with_shape(1, 0, output_shape, &out)) {
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &out));
+  }
 
   ndims = static_cast<int>(bcast.x_reshape().size());
 }
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
index c825a91fb1..fdcbba680d 100644
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -48,7 +48,9 @@ class BinaryOpShared : public OpKernel {
  protected:
   struct BinaryOpState {
     // Sets up bcast with the shape of in0 and in1, ensures that the bcast
-    // is valid, and if so, allocates out using ctx->output(...).
+    // is valid, and if so, sets out, either by allocating a new buffer using
+    // ctx->output(...) or by creating an alias for an owned input buffer for
+    // in-place computation.
     // Caller must check ctx->status() upon return for non-ok status.
     // If ctx->status().ok() is true, then out is guaranteed to be allocated.
     BinaryOpState(OpKernelContext* ctx);
@@ -168,14 +170,18 @@ class SimpleBinaryOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& in0 = ctx->input(0);
     const Tensor& in1 = ctx->input(1);
-
-    Tensor* out;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in0.shape(), &out));
-    auto out_flat = out->flat<Tout>();
     auto in0_flat = in0.flat<Tin>();
     auto in1_flat = in1.flat<Tin>();
     const Device& eigen_device = ctx->eigen_device<Device>();
 
+    Tensor* out = nullptr;
+    if (!std::is_same<Tin, Tout>::value) {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in0.shape(), &out));
+    } else if (!ctx->forward_input_to_output(0, 0, &out) &&
+               !ctx->forward_input_to_output(1, 0, &out)) {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in0.shape(), &out));
+    }
+    auto out_flat = out->flat<Tout>();
     functor::SimpleBinaryFunctor<Device, Functor>()(eigen_device, out_flat,
                                                     in0_flat, in1_flat);
   }
@@ -200,7 +206,10 @@ class UnaryOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const Tensor& inp = ctx->input(0);
     Tensor* out = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out));
+    if (!std::is_same<Tin, Tout>::value ||
+        !ctx->forward_input_to_output(0, 0, &out)) {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out));
+    }
     functor::UnaryFunctor<Device, Functor>()(
         ctx->eigen_device<Device>(), out->flat<Tout>(), inp.flat<Tin>());
   }
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index f9076cb903..a55365cb49 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -542,9 +542,10 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
     Tensor* in_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input_shape, &in_backprop));
-
+    if (!context->forward_input_to_output(0, 0, &in_backprop)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, input_shape, &in_backprop));
+    }
     auto out_backprop_ptr = out_backprop.template flat<T>().data();
     auto filter_ptr = filter.template flat<T>().data();
     auto in_backprop_ptr = in_backprop->template flat<T>().data();
@@ -925,8 +926,10 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
     Tensor* filter_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, filter_shape, &filter_backprop));
+    if (!context->forward_input_to_output(1, 0, &filter_backprop)) {
+      OP_REQUIRES_OK(
+          context, context->allocate_output(0, filter_shape, &filter_backprop));
+    }
 
     auto out_backprop_ptr = out_backprop.template flat<T>().data();
     auto input_ptr = input.template flat<T>().data();
diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc
index 9bba6712a2..4a3ef59211 100644
--- a/tensorflow/core/kernels/fractional_avg_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc
@@ -323,8 +323,10 @@ class FractionalAvgPoolGradOp : public OpKernel {
     // Depending on the type, cast double to type T.
     Tensor* in_backprop_tensor = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, in_shape, &in_backprop_tensor));
+    if (!context->forward_input_to_output(0, 0, &in_backprop_tensor)) {
+      OP_REQUIRES_OK(
+          context, context->allocate_output(0, in_shape, &in_backprop_tensor));
+    }
     auto in_backprop_tensor_flat = in_backprop_tensor->flat<T>();
     auto in_backprop_tensor_temp_flat = in_backprop_tensor_temp.flat<double>();
     for (int64 i = 0; i < in_backprop_tensor_flat.size(); ++i) {
diff --git a/tensorflow/core/kernels/fractional_max_pool_op.cc b/tensorflow/core/kernels/fractional_max_pool_op.cc
index a422433ecf..45567461e2 100644
--- a/tensorflow/core/kernels/fractional_max_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_max_pool_op.cc
@@ -343,8 +343,10 @@ class FractionalMaxPoolGradOp : public OpKernel {
     }
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, tensor_in.shape(), &output));
+    if (!context->forward_input_to_output(0, 0, &output)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, tensor_in.shape(), &output));
+    }
     output->flat<T>().setZero();
 
     auto out_backprop_flat = out_backprop.flat<T>();
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 31570e2fc8..43d31fc221 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -520,7 +520,9 @@ class FusedBatchNormOp : public OpKernel {
     }
 
     Tensor* y = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, x.shape(), &y));
+    if (!context->forward_input_to_output(0, 0, &y)) {
+      OP_REQUIRES_OK(context, context->allocate_output(0, x.shape(), &y));
+    }
     Tensor* batch_mean = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(1, scale.shape(), &batch_mean));
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
index 5fde696963..3ecd3182ff 100644
--- a/tensorflow/core/kernels/linalg_ops_common.cc
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -171,15 +171,20 @@ void LinearAlgebraOp<Scalar>::PrepareOutputs(
                                num_outputs, context->num_outputs()));
 
   // Allocate outputs.
-  for (int i = 0; i < context->num_outputs(); ++i) {
-    TensorShape output_tensor_shape({0});
-    if (i < num_outputs) {
+  std::set<int> unused_inputs;
+  for (int input_idx = 0; input_idx < context->num_inputs(); ++input_idx) {
+    unused_inputs.insert(input_idx);
+  }
+  for (int output_idx = 0; output_idx < context->num_outputs(); ++output_idx) {
+    TensorShape output_tensor_shape({});
+    if (output_idx < num_outputs) {
       // This output is used, set up output shape and allocate it.
-      const TensorShape& output_matrix_shape = output_matrix_shapes->at(i);
+      const TensorShape& output_matrix_shape =
+          output_matrix_shapes->at(output_idx);
       OP_REQUIRES(context, output_matrix_shape.dims() <= 2,
                   errors::InvalidArgument(
                       "Rank of matrix output no. %d must be 0, 1 or 2, got %d.",
-                      i, output_matrix_shape.dims()));
+                      output_idx, output_matrix_shape.dims()));
 
       // The final output has the shape of the outer batch dimensions
       // concatenated with the output_matrix_shape (if the output is not
@@ -190,8 +195,20 @@ void LinearAlgebraOp<Scalar>::PrepareOutputs(
       }
     }
     Tensor* out = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(i, output_tensor_shape, &out));
+    // See if there is an input buffer we can reuse for this output.
+    bool reused_input = false;
+    for (int input_idx : unused_inputs) {
+      if (context->forward_input_to_output_with_shape(
+              input_idx, output_idx, output_tensor_shape, &out)) {
+        reused_input = true;
+        unused_inputs.erase(input_idx);
+        break;
+      }
+    }
+    if (!reused_input) {
+      OP_REQUIRES_OK(context, context->allocate_output(
+                                  output_idx, output_tensor_shape, &out));
+    }
     outputs->push_back(out);
   }
 }
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc
index 952da7d8df..1754e4ad69 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op.cc
@@ -78,8 +78,10 @@ class MatrixSetDiagOp : public OpKernel {
     auto diag_reshaped = diag.flat_inner_dims<T, 2>();
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, input_shape, &output));
-
+    if (!context->forward_input_to_output(0, 0, &output)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, input_shape, &output));
+    }
     auto output_reshaped = output->flat_inner_dims<T, 3>();
     Tensor scratch_tensor;
     OP_REQUIRES_OK(context,
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 98b4558a3a..669597e382 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -290,7 +290,10 @@ class MaxPoolingGradOp : public OpKernel {
     }
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    if (!context->forward_input_to_output(0, 0, &output)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, output_shape, &output));
+    }
 
     SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
         context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
@@ -319,9 +322,10 @@ static void MaxPoolingBackwardCustomKernel(
     const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
     const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
   Tensor* output = nullptr;
-
-  OP_REQUIRES_OK(context,
-                 context->allocate_output(0, tensor_in_shape, &output));
+  if (!context->forward_input_to_output(0, 0, &output)) {
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, tensor_in_shape, &output));
+  }
 
   PoolParameters params{context, size,        stride,
                         padding, FORMAT_NHWC, tensor_in_shape};
diff --git a/tensorflow/core/kernels/softmax_op.h b/tensorflow/core/kernels/softmax_op.h
index dc61e26809..e9dbafd589 100644
--- a/tensorflow/core/kernels/softmax_op.h
+++ b/tensorflow/core/kernels/softmax_op.h
@@ -40,8 +40,10 @@ class SoftmaxOp : public OpKernel {
     OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
                 errors::InvalidArgument("logits must be 2-dimensional"));
     Tensor* softmax_out = nullptr;
-    OP_REQUIRES_OK(
-        context, context->allocate_output(0, logits_in.shape(), &softmax_out));
+    if (!context->forward_input_to_output(0, 0, &softmax_out)) {
+      OP_REQUIRES_OK(context, context->allocate_output(0, logits_in.shape(),
+                                                       &softmax_out));
+    }
     if (logits_in.NumElements()) {
       functor::SoftmaxFunctor<Device, T> functor;
       functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc
index 9c39841fee..4a61e31e8d 100644
--- a/tensorflow/core/kernels/sparse_xent_op.cc
+++ b/tensorflow/core/kernels/sparse_xent_op.cc
@@ -78,11 +78,15 @@ class SparseSoftmaxXentWithLogitsOp : public OpKernel {
                                           labels.shape(), &scratch));
 
     Tensor* loss_out = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, labels.shape(), &loss_out));
+    if (!context->forward_input_to_output(1, 0, &loss_out)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, labels.shape(), &loss_out));
+    }
     Tensor* back_out = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, logits.shape(), &back_out));
+    if (!context->forward_input_to_output(0, 1, &back_out)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(1, logits.shape(), &back_out));
+    }
 
     if (logits.dim_size(0) > 0) {
       if (std::is_same<Device, CPUDevice>::value) {
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 6aa9d6accb..e06fe20e79 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -46,7 +46,9 @@ class UniqueOp : public OpKernel {
     const int64 N = static_cast<int64>(Tin.size());
 
     Tensor* idx = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(1, input.shape(), &idx));
+    if (!context->forward_input_to_output(0, 1, &idx)) {
+      OP_REQUIRES_OK(context, context->allocate_output(1, input.shape(), &idx));
+    }
     auto idx_vec = idx->template vec<int32>();
 
     std::unordered_map<T, int32> uniq;
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index 639bad5f04..2a0ef63eab 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -61,9 +61,11 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
                    context->allocate_output(
                        0, TensorShape({logits_in.dim_size(0)}), &loss_out));
     Tensor* back_out = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, logits_in.shape(), &back_out));
-
+    // Try to reuse the logits_in buffer for the backprop output.
+    if (!context->forward_input_to_output(0, 1, &back_out)) {
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(1, logits_in.shape(), &back_out));
+    }
     functor::XentFunctor<Device, T> functor;
     functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
             labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index e9db47716d..6c7cbbff9c 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -326,9 +326,8 @@ class ControlFlowTest(test.TestCase):
   def testFetchables(self):
     with self.test_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
-      control_flow_ops.cond(constant_op.constant(True),
-                            lambda: x + 2,
-                            lambda: x + 0)
+      control_flow_ops.cond(
+          constant_op.constant(True), lambda: x + 2, lambda: x + 0)
       tensor_names = all_fetchables()
       for name in tensor_names:
         sess.run(name, feed_dict={x: 3})
@@ -388,11 +387,12 @@ class ControlFlowTest(test.TestCase):
       rv = resource_variable_ops.ResourceVariable(True)
      variables.global_variables_initializer().run()
       t = ops.convert_to_tensor(1.0)
+
       def case():
-        assign = resource_variable_ops.assign_variable_op(
-            rv.handle, False)
+        assign = resource_variable_ops.assign_variable_op(rv.handle, False)
         with ops.control_dependencies([assign]):
           return array_ops.identity(t)
+
       self.assertEqual(1.0, control_flow_ops.cond(rv, case, lambda: t).eval())
 
   def testCondIndexedSlicesDifferentTypes(self):
@@ -544,13 +544,15 @@ class ControlFlowTest(test.TestCase):
     with self.test_session() as sess:
       control_holder = array_ops.placeholder(dtypes.float32, shape=())
       a = constant_op.constant(3)
+
       def true_branch():
         with ops.control_dependencies([control_holder]):
           _ = a + 1
         return a + 2
-      r = control_flow_ops.cond(constant_op.constant(True),
-                                true_branch,
-                                lambda: constant_op.constant(1))
+
+      r = control_flow_ops.cond(
+          constant_op.constant(True), true_branch,
+          lambda: constant_op.constant(1))
       self.assertEqual(5, r.eval())
 
   def testUninitializedRefIdentity(self):
@@ -770,16 +772,37 @@ class ControlFlowTest(test.TestCase):
       o = ops.convert_to_tensor([0])
       x = ops.convert_to_tensor([1, 2, 3, 4, 5, 6])
       s = array_ops.size(x)
-      r = control_flow_ops.while_loop(lambda i, c, o: math_ops.less(i, s),
-                                      compute, [i, c, o], [
-                                          i.get_shape(),
-                                          tensor_shape.unknown_shape(),
-                                          tensor_shape.unknown_shape()
-                                      ])
+      r = control_flow_ops.while_loop(
+          lambda i, c, o: math_ops.less(i, s), compute, [i, c, o], [
+              i.get_shape(), tensor_shape.unknown_shape(),
+              tensor_shape.unknown_shape()
+          ])
       result = r[2].eval()
     self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
+  def testBufferForwarding(self):
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+
+    with self.test_session() as sess:
+      with ops.device("/cpu:0"):
+        c = constant_op.constant(2)
+        i0 = constant_op.constant(0)
+        r = control_flow_ops.while_loop(lambda i: i < 1000,
+                                        lambda i: math_ops.square(c) + i, [i0])
+      r_val = sess.run(r, options=run_options, run_metadata=run_metadata)
+      self.assertEqual(1000, r_val)
+      self.assertTrue(run_metadata.HasField("step_stats"))
+      unique_allocs = set()
+      for node_stat in run_metadata.step_stats.dev_stats[0].node_stats:
+        for output in node_stat.output:
+          unique_allocs.add(
+              output.tensor_description.allocation_description.ptr)
+      # Prior to cl/147536680, the number of unique allocations was about 1005.
+      self.assertLess(len(unique_allocs), 756)
+
   def _testWhile_Gpu_1(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       n = constant_op.constant(1.0)
@@ -1368,8 +1391,9 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(45, rx.eval())
 
   def _testWhileGrad_ColocateGradients(self, colocate):
-    gpu_dev_name = test.gpu_device_name() if test.is_gpu_available() else "/gpu:0"
-    gpu_short_name = gpu_dev_name.split('/')[-1]
+    gpu_dev_name = test.gpu_device_name() if test.is_gpu_available(
+    ) else "/gpu:0"
+    gpu_short_name = gpu_dev_name.split("/")[-1]
 
     with self.test_session(graph=ops.Graph()) as sess:
       v = constant_op.constant(2.0, name="v")
@@ -1485,16 +1509,21 @@ class ControlFlowTest(test.TestCase):
   def _testNestedWhileCondWhileGrad(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       v = constant_op.constant(1.0)
+
       def inner_loop(s):
         z = constant_op.constant(0)
         c = lambda i, x: math_ops.less(i, 4)
         b = lambda i, x: [math_ops.add(i, 1), math_ops.multiply(x, 2.0)]
         return control_flow_ops.while_loop(c, b, [z, s])
+
       c = lambda x: math_ops.less(x, 128.0)
+
       def b(x):
-        return control_flow_ops.cond(constant_op.constant(True),
-                                     lambda: math_ops.square(inner_loop(x)[1]),
-                                     lambda: math_ops.multiply(x, 2.0))
+        return control_flow_ops.cond(
+            constant_op.constant(True),
+            lambda: math_ops.square(inner_loop(x)[1]),
+            lambda: math_ops.multiply(x, 2.0))
+
       r = control_flow_ops.while_loop(c, b, [v])
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(512.0, r.eval())
@@ -1550,10 +1579,9 @@ class ControlFlowTest(test.TestCase):
     with self.test_session() as sess:
       named = collections.namedtuple("named", ("a", "b"))
       loop_vars = [
-          named(
-              a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
-          (constant_op.constant(2.0), constant_op.constant(3.0)),
-          constant_op.constant(4.0)
+          named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
+          (constant_op.constant(2.0),
+           constant_op.constant(3.0)), constant_op.constant(4.0)
       ]
       c = lambda lv0, _1, _2: lv0.a < 100.0
@@ -1578,10 +1606,9 @@ class ControlFlowTest(test.TestCase):
     with self.test_session():
       named = collections.namedtuple("named", ("a", "b"))
       loop_vars = [
-          named(
-              a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
-          (constant_op.constant(2.0), constant_op.constant(3.0)),
-          constant_op.constant(4.0)
+          named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
+          (constant_op.constant(2.0),
+           constant_op.constant(3.0)), constant_op.constant(4.0)
       ]
       c = lambda lv0, _1, _2: lv0.a < 100.0
@@ -2522,15 +2549,11 @@ class TupleTest(test.TestCase):
     with self.test_session():
       v1 = variables.Variable([1.0])
       add1 = math_ops.add(
-          control_flow_ops.with_dependencies(
-              [v1.initializer],
-              v1._ref()),  # pylint: disable=protected-access
+          control_flow_ops.with_dependencies([v1.initializer], v1._ref()),  # pylint: disable=protected-access
           2.0)
       v2 = variables.Variable([10.0])
       add2 = math_ops.add(
-          control_flow_ops.with_dependencies(
-              [v2.initializer],
-              v2._ref()),  # pylint: disable=protected-access
+          control_flow_ops.with_dependencies([v2.initializer], v2._ref()),  # pylint: disable=protected-access
           20.0)
       t1, _, t2 = control_flow_ops.tuple([add1, None, add2])
@@ -2558,18 +2581,14 @@ class TupleTest(test.TestCase):
           np.array([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]]).astype(
               np.float32))
       v1_at_1 = ops.IndexedSlices(
-          control_flow_ops.with_dependencies(
-              [v1.initializer],
-              v1._ref()),  # pylint: disable=protected-access
+          control_flow_ops.with_dependencies([v1.initializer], v1._ref()),  # pylint: disable=protected-access
           constant_op.constant([1]))
       v2 = variables.Variable(
           np.array([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]]).astype(
               np.float32))
       v2_at_1 = ops.IndexedSlices(
-          control_flow_ops.with_dependencies(
-              [v2.initializer],
-              v2._ref()),  # pylint: disable=protected-access
+          control_flow_ops.with_dependencies([v2.initializer], v2._ref()),  # pylint: disable=protected-access
           constant_op.constant([1]))
 
       st1, st2 = control_flow_ops.tuple([v1_at_1, v2_at_1])
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index 29f76a2182..c11f78b77e 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -269,6 +269,15 @@ class SliceTest(test.TestCase):
       c = array_ops.slice(a, [begin, 0], [-1, 2])
       self.assertEqual([None, 2], c.get_shape().as_list())
 
+  def testSliceOfSlice(self):
+    with self.test_session(use_gpu=True):
+      a = constant_op.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
+      b = a[1:, :]
+      c = b[:-1, :]
+      d = c[1, :]
+      res = 2 * d - c[1, :] + a[2, :] - 2 * b[-2, :]
+      self.assertAllEqual([0, 0, 0], res.eval())
+
 if __name__ == "__main__":
   test.main()