-rw-r--r-- | tensorflow/core/common_runtime/executor.cc | 27
-rw-r--r-- | tensorflow/core/kernels/assign_op.h        | 90
2 files changed, 73 insertions, 44 deletions
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 6987d637b4..98da489b15 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -421,9 +421,9 @@ class ExecutorImpl : public Executor {
 // connected to n by a single edge, but might be a downstream
 // consumer of n's output by reference. *attr is updated with any
 // necessary attributes.
-static Status InferAllocAttr(const Node* n, const Node* dst,
-                             const DeviceNameUtils::ParsedName& local_dev_name,
-                             AllocatorAttributes* attr);
+Status InferAllocAttr(const Node* n, const Node* dst,
+                      const DeviceNameUtils::ParsedName& local_dev_name,
+                      AllocatorAttributes* attr);
 
 GraphView::~GraphView() {
   static_assert(std::is_trivially_destructible<AllocatorAttributes>::value,
@@ -478,7 +478,7 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) {
   CHECK(node_offsets_[id] == kuint32max);  // Initial value in constructor
   const size_t bytes = NodeItemBytes(n);
-  static constexpr size_t kItemAlignment = sizeof(NodeItem*);
+  constexpr size_t kItemAlignment = sizeof(NodeItem*);
   CHECK_EQ(reinterpret_cast<uintptr_t>(ptr) % kItemAlignment, 0);
   NodeItem* item = reinterpret_cast<NodeItem*>(ptr);
@@ -568,8 +568,7 @@ void GraphView::Initialize(const Graph* g) {
   CHECK_EQ(ptr, space_ + total_bytes);
 }
 
-static void GetMaxPendingCounts(const Node* n, int* max_pending,
-                                int* max_dead_count) {
+void GetMaxPendingCounts(const Node* n, int* max_pending, int* max_dead_count) {
   const int num_in_edges = n->in_edges().size();
   int initial_count;
   if (IsMerge(n)) {
@@ -691,20 +690,22 @@ Status GraphView::SetAllocAttrs(const Graph* g, const Device* device) {
       }
     }
     for (int out = 0; out < n->num_outputs(); out++) {
-      OpKernel* op_kernel = item->kernel;
+      const OpKernel* op_kernel = item->kernel;
       DCHECK_LT(out, op_kernel->output_memory_types().size());
       bool on_host = op_kernel->output_memory_types()[out] == HOST_MEMORY;
-      AllocatorAttributes h;
-      h.set_on_host(on_host);
-      attrs[out].Merge(h);
+      if (on_host) {
+        AllocatorAttributes h;
+        h.set_on_host(on_host);
+        attrs[out].Merge(h);
+      }
     }
   }
   return s;
 }
 
-static Status InferAllocAttr(const Node* n, const Node* dst,
-                             const DeviceNameUtils::ParsedName& local_dev_name,
-                             AllocatorAttributes* attr) {
+Status InferAllocAttr(const Node* n, const Node* dst,
+                      const DeviceNameUtils::ParsedName& local_dev_name,
+                      AllocatorAttributes* attr) {
   Status s;
   // Note that it's possible for *n to be a Recv and *dst to be a Send,
   // so these two cases are not mutually exclusive.
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
index da0ccda9eb..1d2e1c8c9a 100644
--- a/tensorflow/core/kernels/assign_op.h
+++ b/tensorflow/core/kernels/assign_op.h
@@ -39,60 +39,88 @@ class AssignOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override {
-    Tensor rhs = context->input(1);
+    const Tensor& rhs = context->input(1);
 
     // We always return the input ref.
     context->forward_ref_input_to_ref_output(0, 0);
 
-    // If the left hand side is not initialized, or the shape of the
-    // right-hand side is different than the left hand side, we need
-    // to allocate a new tensor.
+    // We can't always know how this value will be used downstream,
+    // so make conservative assumptions in specifying constraints on
+    // the memory allocation attributes.
+    // TODO(rmlarsen): These conservative constraints make buffer
+    // forwarding unlikely to happen very often. Try to use graph analysis
+    // (possibly the InferAllocAttr pass in the executor) to improve the
+    // situation.
+    AllocatorAttributes attr;
+    attr.set_gpu_compatible(true);
+    attr.set_nic_compatible(true);
+
     {
       mutex_lock l(*context->input_ref_mutex(0));
-
-      Tensor old_lhs = context->mutable_input(0, true);
-
+      const Tensor& old_lhs = context->mutable_input(0, /* lock_held */ true);
+      const bool same_shape = old_lhs.shape().IsSameSize(rhs.shape());
       if (validate_shape_) {
         OP_REQUIRES(
-            context, old_lhs.shape().IsSameSize(rhs.shape()),
+            context, same_shape,
            errors::InvalidArgument(
                 "Assign requires shapes of both tensors to match. lhs shape= ",
-                old_lhs.shape().DebugString(), " rhs shape= ",
-                rhs.shape().DebugString()));
+                old_lhs.shape().DebugString(),
+                " rhs shape= ", rhs.shape().DebugString()));
       }
 
-      const bool same_shape = old_lhs.shape().IsSameSize(rhs.shape());
-      if (!old_lhs.IsInitialized() || !same_shape) {
-        // Create new tensor whose shape matches the right hand side
-        // and copy then hand off to lhs.
-        // We can't always know how this value will be used downstream,
-        // so make conservative assumptions in specifying the memory
-        // allocation attributes.
-        AllocatorAttributes attr;
-        attr.set_gpu_compatible(true);
-        attr.set_nic_compatible(true);
+      // In the code below we try to minimize the amount of memory allocation
+      // and copying by trying the following two shortcuts:
+      // 1. If we can reuse the rhs buffer we avoid both a memory allocation
+      //    and copying.
+      // 2. If the lhs is initialized and has the same number of elements as the
+      //    rhs we can avoid a memory allocation.
+
+      // 1. Try to reuse the rhs.
+      std::unique_ptr<Tensor> input_alias = context->forward_input(
+          1, old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
+      if (input_alias != nullptr) {
+        // Transfer ownership to the ref.
+        context->replace_ref_input(0, *input_alias.release(),
+                                   /* lock_held */ true);
+        return;
+      }
+
+      // 2. Try to copy into an existing buffer.
+      if (old_lhs.IsInitialized() &&
+          old_lhs.shape().num_elements() == rhs.shape().num_elements()) {
+        // The existing lhs tensor has already been initialized and the right
+        // hand side can fit in the underlying buffer.
+        Tensor reshaped_old_lhs;
+        if (same_shape) {
+          reshaped_old_lhs = old_lhs;
+        } else {
+          CHECK(reshaped_old_lhs.CopyFrom(old_lhs, rhs.shape()));
+          context->replace_ref_input(0, reshaped_old_lhs, /* lock_held */ true);
+        }
+        if (use_exclusive_lock_) {
+          Copy(context, &reshaped_old_lhs, rhs);
+          return;
+        }
+      } else {
+        // Create a new persistent tensor whose shape matches the right hand
+        // side, hand off to lhs and copy the rhs into it.
         PersistentTensor copy;
         Tensor* copyTensor = nullptr;
         OP_REQUIRES_OK(
             context, context->allocate_persistent(old_lhs.dtype(), rhs.shape(),
                                                    &copy, &copyTensor, attr));
-        Copy(context, copyTensor, rhs);
-        context->replace_ref_input(0, *copyTensor, true);
-        return;
-      }
-
-      // The tensor has already been initialized and the right hand side
-      // matches the left hand side's shape.
-      if (use_exclusive_lock_) {
-        Copy(context, &old_lhs, rhs);
-        return;
+        context->replace_ref_input(0, *copyTensor, /* lock_held */ true);
+        if (use_exclusive_lock_) {
+          Copy(context, copyTensor, rhs);
+          return;
+        }
       }
     }
 
     // The tensor has already been initialized and the right hand side
     // matches the left hand side's shape. We have been told to do the
    // copy outside the lock.
-    Tensor old_unlocked_lhs = context->mutable_input(0, false);
+    Tensor old_unlocked_lhs = context->mutable_input(0, /* lock_held */ false);
     Copy(context, &old_unlocked_lhs, rhs);
   }
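
The assign_op.h half of this change reduces to an ordered set of fast paths: alias the rhs buffer when the runtime will allow it, otherwise copy into the existing lhs buffer when the element counts match, and only allocate a fresh tensor as a last resort. The sketch below is a minimal standalone restatement of that decision order, not TensorFlow code; the enum, the helper function, and the boolean inputs are hypothetical names introduced here purely for illustration.

// Standalone sketch (hypothetical names): mirrors the decision order of the
// patched AssignOp::Compute -- forward rhs, else reuse lhs, else allocate.
#include <cstddef>
#include <iostream>

enum class AssignStrategy {
  kForwardRhs,   // 1. Alias the rhs buffer: no allocation, no copy.
  kReuseLhs,     // 2. Copy into the existing lhs buffer: no allocation.
  kAllocateNew,  // 3. Allocate a fresh buffer and copy into it.
};

// rhs_forwardable stands in for a successful buffer-forwarding check, i.e.
// the rhs buffer is exclusively owned and compatible with the lhs attributes.
AssignStrategy ChooseAssignStrategy(bool rhs_forwardable, bool lhs_initialized,
                                    std::size_t lhs_elements,
                                    std::size_t rhs_elements) {
  if (rhs_forwardable) return AssignStrategy::kForwardRhs;
  if (lhs_initialized && lhs_elements == rhs_elements)
    return AssignStrategy::kReuseLhs;
  return AssignStrategy::kAllocateNew;
}

int main() {
  // Uninitialized variable, rhs not forwardable: must allocate (prints 2).
  std::cout << static_cast<int>(ChooseAssignStrategy(false, false, 0, 12)) << "\n";
  // Initialized variable with matching element count: reuse lhs (prints 1).
  std::cout << static_cast<int>(ChooseAssignStrategy(false, true, 12, 12)) << "\n";
  // A forwardable rhs wins regardless of the lhs state (prints 0).
  std::cout << static_cast<int>(ChooseAssignStrategy(true, true, 12, 12)) << "\n";
}

Mapping the sketch back to the patch: kForwardRhs corresponds to the context->forward_input() branch, kReuseLhs to the CopyFrom/replace_ref_input branch, and kAllocateNew to the allocate_persistent branch.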