-rw-r--r-- | tensorflow/core/common_runtime/executor.cc | 27
-rw-r--r-- | tensorflow/core/kernels/assign_op.h        | 90
2 files changed, 73 insertions, 44 deletions
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 6987d637b4..98da489b15 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -421,9 +421,9 @@ class ExecutorImpl : public Executor {
 // connected to n by a single edge, but might be a downstream
 // consumer of n's output by reference. *attr is updated with any
 // necessary attributes.
-static Status InferAllocAttr(const Node* n, const Node* dst,
-                             const DeviceNameUtils::ParsedName& local_dev_name,
-                             AllocatorAttributes* attr);
+Status InferAllocAttr(const Node* n, const Node* dst,
+                      const DeviceNameUtils::ParsedName& local_dev_name,
+                      AllocatorAttributes* attr);
 
 GraphView::~GraphView() {
   static_assert(std::is_trivially_destructible<AllocatorAttributes>::value,
@@ -478,7 +478,7 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) {
   CHECK(node_offsets_[id] == kuint32max);  // Initial value in constructor
   const size_t bytes = NodeItemBytes(n);
-  static constexpr size_t kItemAlignment = sizeof(NodeItem*);
+  constexpr size_t kItemAlignment = sizeof(NodeItem*);
   CHECK_EQ(reinterpret_cast<uintptr_t>(ptr) % kItemAlignment, 0);
   NodeItem* item = reinterpret_cast<NodeItem*>(ptr);
@@ -568,8 +568,7 @@ void GraphView::Initialize(const Graph* g) {
   CHECK_EQ(ptr, space_ + total_bytes);
 }
 
-static void GetMaxPendingCounts(const Node* n, int* max_pending,
-                                int* max_dead_count) {
+void GetMaxPendingCounts(const Node* n, int* max_pending, int* max_dead_count) {
   const int num_in_edges = n->in_edges().size();
   int initial_count;
   if (IsMerge(n)) {
@@ -691,20 +690,22 @@ Status GraphView::SetAllocAttrs(const Graph* g, const Device* device) {
       }
     }
     for (int out = 0; out < n->num_outputs(); out++) {
-      OpKernel* op_kernel = item->kernel;
+      const OpKernel* op_kernel = item->kernel;
       DCHECK_LT(out, op_kernel->output_memory_types().size());
       bool on_host = op_kernel->output_memory_types()[out] == HOST_MEMORY;
-      AllocatorAttributes h;
-      h.set_on_host(on_host);
-      attrs[out].Merge(h);
+      if (on_host) {
+        AllocatorAttributes h;
+        h.set_on_host(on_host);
+        attrs[out].Merge(h);
+      }
     }
   }
   return s;
 }
 
-static Status InferAllocAttr(const Node* n, const Node* dst,
-                             const DeviceNameUtils::ParsedName& local_dev_name,
-                             AllocatorAttributes* attr) {
+Status InferAllocAttr(const Node* n, const Node* dst,
+                      const DeviceNameUtils::ParsedName& local_dev_name,
+                      AllocatorAttributes* attr) {
   Status s;
   // Note that it's possible for *n to be a Recv and *dst to be a Send,
   // so these two cases are not mutually exclusive.
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
index da0ccda9eb..1d2e1c8c9a 100644
--- a/tensorflow/core/kernels/assign_op.h
+++ b/tensorflow/core/kernels/assign_op.h
@@ -39,60 +39,88 @@ class AssignOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override {
-    Tensor rhs = context->input(1);
+    const Tensor& rhs = context->input(1);
 
     // We always return the input ref.
     context->forward_ref_input_to_ref_output(0, 0);
 
-    // If the left hand side is not initialized, or the shape of the
-    // right-hand side is different than the left hand side, we need
-    // to allocate a new tensor.
+    // We can't always know how this value will be used downstream,
+    // so make conservative assumptions in specifying constraints on
+    // the memory allocation attributes.
+    // TODO(rmlarsen): These conservative constraints make buffer
+    // forwarding unlikely to happen very often. Try to use graph analysis
+    // (possibly the InferAllocAttr pass in the executor) to improve the
+    // situation.
+    AllocatorAttributes attr;
+    attr.set_gpu_compatible(true);
+    attr.set_nic_compatible(true);
+
     {
       mutex_lock l(*context->input_ref_mutex(0));
-
-      Tensor old_lhs = context->mutable_input(0, true);
-
+      const Tensor& old_lhs = context->mutable_input(0, /* lock_held */ true);
+      const bool same_shape = old_lhs.shape().IsSameSize(rhs.shape());
       if (validate_shape_) {
         OP_REQUIRES(
-            context, old_lhs.shape().IsSameSize(rhs.shape()),
+            context, same_shape,
            errors::InvalidArgument(
                 "Assign requires shapes of both tensors to match. lhs shape= ",
-                old_lhs.shape().DebugString(), " rhs shape= ",
-                rhs.shape().DebugString()));
+                old_lhs.shape().DebugString(),
+                " rhs shape= ", rhs.shape().DebugString()));
       }
 
-      const bool same_shape = old_lhs.shape().IsSameSize(rhs.shape());
-      if (!old_lhs.IsInitialized() || !same_shape) {
-        // Create new tensor whose shape matches the right hand side
-        // and copy then hand off to lhs.
-        // We can't always know how this value will be used downstream,
-        // so make conservative assumptions in specifying the memory
-        // allocation attributes.
-        AllocatorAttributes attr;
-        attr.set_gpu_compatible(true);
-        attr.set_nic_compatible(true);
+      // In the code below we try to minimize the amount of memory allocation
+      // and copying by trying the following two shortcuts:
+      // 1. If we can reuse the rhs buffer we avoid both a memory allocation
+      //    and copying.
+      // 2. If the lhs is initialized and has the same number of elements as the
+      //    rhs we can avoid a memory allocation.
+
+      // 1. Try to reuse the rhs.
+      std::unique_ptr<Tensor> input_alias = context->forward_input(
+          1, old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
+      if (input_alias != nullptr) {
+        // Transfer ownership to the ref.
+        context->replace_ref_input(0, *input_alias.release(),
+                                   /* lock_held */ true);
+        return;
+      }
+
+      // 2. Try to copy into an existing buffer.
+      if (old_lhs.IsInitialized() &&
+          old_lhs.shape().num_elements() == rhs.shape().num_elements()) {
+        // The existing lhs tensor has already been initialized and the right
+        // hand side can fit in the underlying buffer.
+        Tensor reshaped_old_lhs;
+        if (same_shape) {
+          reshaped_old_lhs = old_lhs;
+        } else {
+          CHECK(reshaped_old_lhs.CopyFrom(old_lhs, rhs.shape()));
+          context->replace_ref_input(0, reshaped_old_lhs, /* lock_held */ true);
+        }
+        if (use_exclusive_lock_) {
+          Copy(context, &reshaped_old_lhs, rhs);
+          return;
+        }
+      } else {
+        // Create a new persistent tensor whose shape matches the right hand
+        // side, hand off to lhs and copy the rhs into it.
         PersistentTensor copy;
         Tensor* copyTensor = nullptr;
         OP_REQUIRES_OK(
             context, context->allocate_persistent(old_lhs.dtype(), rhs.shape(),
                                                    &copy, &copyTensor, attr));
-        Copy(context, copyTensor, rhs);
-        context->replace_ref_input(0, *copyTensor, true);
-        return;
-      }
-
-      // The tensor has already been initialized and the right hand side
-      // matches the left hand side's shape.
-      if (use_exclusive_lock_) {
-        Copy(context, &old_lhs, rhs);
-        return;
+        context->replace_ref_input(0, *copyTensor, /* lock_held */ true);
+        if (use_exclusive_lock_) {
+          Copy(context, copyTensor, rhs);
+          return;
+        }
       }
     }
 
     // The tensor has already been initialized and the right hand side
     // matches the left hand side's shape. We have been told to do the
    // copy outside the lock.
-    Tensor old_unlocked_lhs = context->mutable_input(0, false);
+    Tensor old_unlocked_lhs = context->mutable_input(0, /* lock_held */ false);
     Copy(context, &old_unlocked_lhs, rhs);
   }
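
The assign_op.h half of this change reduces to an ordered set of fast paths: alias the rhs buffer when the runtime will allow it, otherwise copy into the existing lhs buffer when the element counts match, and only allocate a fresh tensor as a last resort. The sketch below is a minimal standalone restatement of that decision order, not TensorFlow code; the enum, the helper function, and the boolean inputs are hypothetical names introduced here purely for illustration.

// Standalone sketch (hypothetical names): mirrors the decision order of the
// patched AssignOp::Compute -- forward rhs, else reuse lhs, else allocate.
#include <cstddef>
#include <iostream>

enum class AssignStrategy {
  kForwardRhs,   // 1. Alias the rhs buffer: no allocation, no copy.
  kReuseLhs,     // 2. Copy into the existing lhs buffer: no allocation.
  kAllocateNew,  // 3. Allocate a fresh buffer and copy into it.
};

// rhs_forwardable stands in for a successful buffer-forwarding check, i.e.
// the rhs buffer is exclusively owned and compatible with the lhs attributes.
AssignStrategy ChooseAssignStrategy(bool rhs_forwardable, bool lhs_initialized,
                                    std::size_t lhs_elements,
                                    std::size_t rhs_elements) {
  if (rhs_forwardable) return AssignStrategy::kForwardRhs;
  if (lhs_initialized && lhs_elements == rhs_elements)
    return AssignStrategy::kReuseLhs;
  return AssignStrategy::kAllocateNew;
}

int main() {
  // Uninitialized variable, rhs not forwardable: must allocate (prints 2).
  std::cout << static_cast<int>(ChooseAssignStrategy(false, false, 0, 12)) << "\n";
  // Initialized variable with matching element count: reuse lhs (prints 1).
  std::cout << static_cast<int>(ChooseAssignStrategy(false, true, 12, 12)) << "\n";
  // A forwardable rhs wins regardless of the lhs state (prints 0).
  std::cout << static_cast<int>(ChooseAssignStrategy(true, true, 12, 12)) << "\n";
}

Mapping the sketch back to the patch: kForwardRhs corresponds to the context->forward_input() branch, kReuseLhs to the CopyFrom/replace_ref_input branch, and kAllocateNew to the allocate_persistent branch.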