author | 2017-02-21 17:31:57 -0800
committer | 2017-02-21 17:52:15 -0800
commit | 4891c01b1cadf085a915a3eac5dd1b8d8cdee203 (patch)
tree | 87ec00e1927877ba26a2ffb69bc4f74f25c36f6a /tensorflow/core/kernels/linalg_ops_common.cc
parent | 123c2bb0af532d5fdaa05358158da33497d4bfe6 (diff)
Allow (safe) in-place computation in TensorFlow C++ ops. When at least one input tensor has the same size and type as the output, and the op owns the underlying buffer, i.e., its refcount is 1 at the time the op's Compute method executes, the computation can be performed in place and the allocation of a separate output buffer avoided.
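To make the buffer-reuse condition concrete, here is a minimal sketch of a unary kernel that attempts in-place computation. The kernel name is hypothetical; OpKernelContext::forward_input_to_output_with_shape is the forwarding helper this change uses, and it succeeds only when the input buffer is exclusively owned (refcount 1) and matches the requested output shape and type:

#include "tensorflow/core/framework/op_kernel.h"

namespace tensorflow {

// Hypothetical unary kernel illustrating the in-place pattern described
// above; only the OpKernelContext calls are existing TensorFlow API.
class SquareInPlaceOp : public OpKernel {
 public:
  explicit SquareInPlaceOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);
    Tensor* output = nullptr;
    // Reuse input 0's buffer for output 0 when it is exclusively owned
    // and has the right shape/type; otherwise fall back to allocation.
    if (!ctx->forward_input_to_output_with_shape(0, 0, input.shape(),
                                                 &output)) {
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
    }
    // A coefficient-wise expression is safe even when output aliases input.
    output->flat<float>() = input.flat<float>().square();
  }
};

}  // namespace tensorflow

Either way, the op's arithmetic is unchanged; forwarding only changes where the result is written.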
I updated the following ops to perform in-place computation automatically when possible:
* All standard coefficient-wise unary and binary operators (including with broadcasting) inheriting from base classes in kernels/cwise_ops_common.h.
* Unary and binary operators inheriting from base classes in framework/numeric_op.h. This is mostly old code for the Relu family and associated gradients.
* All linear algebra ops inheriting from LinearAlgebraOp in kernels/linalg_ops_common.h.
* Misc individual files/ops: softmax, select, bias, aggregate ops, batch_norm & fused_batch_norm, adjust_hue, constant, depthwise_conv_grad, fractional_avg_pool, misc. pooling ops, matrix_set_diag, xent & sparse_xent, unique_op.
Change: 148166936
Diffstat (limited to 'tensorflow/core/kernels/linalg_ops_common.cc')
-rw-r--r-- | tensorflow/core/kernels/linalg_ops_common.cc | 31
1 file changed, 24 insertions(+), 7 deletions(-)
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
index 5fde696963..3ecd3182ff 100644
--- a/tensorflow/core/kernels/linalg_ops_common.cc
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -171,15 +171,20 @@ void LinearAlgebraOp<Scalar>::PrepareOutputs(
                               num_outputs, context->num_outputs()));
 
   // Allocate outputs.
-  for (int i = 0; i < context->num_outputs(); ++i) {
-    TensorShape output_tensor_shape({0});
-    if (i < num_outputs) {
+  std::set<int> unused_inputs;
+  for (int input_idx = 0; input_idx < context->num_inputs(); ++input_idx) {
+    unused_inputs.insert(input_idx);
+  }
+  for (int output_idx = 0; output_idx < context->num_outputs(); ++output_idx) {
+    TensorShape output_tensor_shape({});
+    if (output_idx < num_outputs) {
       // This output is used, set up output shape and allocate it.
-      const TensorShape& output_matrix_shape = output_matrix_shapes->at(i);
+      const TensorShape& output_matrix_shape =
+          output_matrix_shapes->at(output_idx);
       OP_REQUIRES(context, output_matrix_shape.dims() <= 2,
                   errors::InvalidArgument(
                       "Rank of matrix output no. %d must be 0, 1 or 2, got %d.",
-                      i, output_matrix_shape.dims()));
+                      output_idx, output_matrix_shape.dims()));
 
       // The final output has the shape of the outer batch dimensions
       // concatenated with the output_matrix_shape (if the output is not
@@ -190,8 +195,20 @@ void LinearAlgebraOp<Scalar>::PrepareOutputs(
       }
     }
     Tensor* out = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(i, output_tensor_shape, &out));
+    // See if there is an input buffer we can reuse for this output.
+    bool reused_input = false;
+    for (int input_idx : unused_inputs) {
+      if (context->forward_input_to_output_with_shape(
+              input_idx, output_idx, output_tensor_shape, &out)) {
+        reused_input = true;
+        unused_inputs.erase(input_idx);
+        break;
+      }
+    }
+    if (!reused_input) {
+      OP_REQUIRES_OK(context, context->allocate_output(
+                                  output_idx, output_tensor_shape, &out));
+    }
     outputs->push_back(out);
   }
 }
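The hunk above performs a greedy first-fit match between input buffers and outputs: every input index starts in unused_inputs, each output scans for a forwardable input, and a donated input is erased so it cannot back two outputs (the loop breaks immediately after the erase, before the invalidated set iterator could be used again). A toy, self-contained model of the same matching logic follows; all names and the refcount/size check are hypothetical stand-ins for what forward_input_to_output_with_shape does internally:

#include <cstdio>
#include <set>
#include <vector>

// Stand-in for a tensor buffer: forwardable when refcount == 1 and the
// size matches the output. Purely illustrative; the real checks live in
// OpKernelContext::forward_input_to_output_with_shape.
struct Buffer {
  int size;
  int refcount;
};

int main() {
  const std::vector<Buffer> inputs = {{8, 2}, {16, 1}, {16, 1}};
  const std::vector<int> output_sizes = {16, 16, 4};

  std::set<int> unused_inputs;
  for (int i = 0; i < static_cast<int>(inputs.size()); ++i) {
    unused_inputs.insert(i);
  }
  for (int out = 0; out < static_cast<int>(output_sizes.size()); ++out) {
    bool reused = false;
    for (int in : unused_inputs) {
      if (inputs[in].refcount == 1 && inputs[in].size == output_sizes[out]) {
        std::printf("output %d reuses input %d\n", out, in);
        unused_inputs.erase(in);  // each input buffer is donated only once
        reused = true;
        break;  // break right away; the set iterator is now invalid
      }
    }
    if (!reused) {
      std::printf("output %d gets a fresh allocation\n", out);
    }
  }
  return 0;
}

Run against these toy shapes, output 0 takes input 1, output 1 takes input 2, and output 2 finds no matching donor, so it falls back to a fresh allocation, mirroring the OP_REQUIRES_OK fallback path in the diff.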