diff options
author | 2016-01-13 17:30:08 -0800 | |
---|---|---|
committer | 2016-01-13 17:30:08 -0800 | |
commit | 6dbfb95100b73ad26ebebb9be9c0429dc0cece8a (patch) | |
tree | 83a15ef5538529dd3a4fece1276388c20f9260bb /tensorflow | |
parent | cbdf278dbd998aab57898763605f6863e9b42b76 (diff) |
Avoid allocation of std::function<> object per node executed, as
well as a hash-table lookup per allocated output.
Instead, we now pre-compute the AllocatorAttributes for every output
tensor in the graph into an array (indexed by a base number per node +
output index), and changed OpKernelContext::Params to provide
a pointer to the base of the array for the node, rather than providing
a std::function<>.
Updated test code to avoid so much code duplication when setting up
the OpKernelContext::Params object in various places.
Used gtl::InlinedVector<...> instead of std::vector<...> in a few
places in tensorflow/core/kernels/reduction_ops_common.h
Didn't make a measurable change in overall performance, but allocations and
time spent in the std::function destructor code were significantly reduced.
Change: 112103260
Diffstat (limited to 'tensorflow')
-rw-r--r-- | tensorflow/core/common_runtime/executor.cc | 94 | ||||
-rw-r--r-- | tensorflow/core/framework/op_kernel.h | 15 | ||||
-rw-r--r-- | tensorflow/core/kernels/core_ops_test.cc | 49 | ||||
-rw-r--r-- | tensorflow/core/kernels/ops_testutil.h | 22 | ||||
-rw-r--r-- | tensorflow/core/kernels/reduction_ops_common.h | 8 | ||||
-rw-r--r-- | tensorflow/core/kernels/restore_op_test.cc | 16 | ||||
-rw-r--r-- | tensorflow/core/kernels/segment_reduction_ops_test.cc | 10 | ||||
-rw-r--r-- | tensorflow/core/kernels/sparse_to_dense_op_test.cc | 8 |
8 files changed, 88 insertions, 134 deletions
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index e42d2e260f..5de368da87 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -175,18 +175,11 @@ struct NodeItem { // ExecutorImpl::tensors_[input_start] is the 1st positional input // for this node. int input_start = 0; -}; -// Map from std::pair<node_id, output_index> to attributes. -struct pairhash { - public: - template <typename T, typename U> - std::size_t operator()(const std::pair<T, U>& x) const { - return std::hash<T>()(x.first) ^ std::hash<U>()(x.second); - } + // ExecutorImpl::output_attrs_[output_attr_start] is the 1st + // positional attribute for the 0th output of this node. + int output_attr_start = 0; }; -typedef std::unordered_map<std::pair<int, int>, AllocatorAttributes, pairhash> - DevAttrMap; typedef gtl::InlinedVector<TensorValue, 4> TensorValueVec; typedef gtl::InlinedVector<DeviceContext*, 4> DeviceContextVec; @@ -231,14 +224,15 @@ class ExecutorImpl : public Executor { // Owned. LocalExecutorParams params_; const Graph* graph_; - std::vector<NodeItem> nodes_; // nodes_.size == graph_.num_node_ids(). - int total_tensors_ = 0; // total_tensors_ = sum(nodes_[*].num_inputs()) + std::vector<NodeItem> nodes_; // nodes_.size == graph_.num_node_ids(). + int total_input_tensors_ = 0; // == sum(nodes_[*].num_inputs()) + int total_output_tensors_ = 0; // == sum(nodes_[*].num_outputs()) // The number of inputs for each frame in this graph. This is static // information of the graph. 
std::unordered_map<string, int> frame_input_count_; - DevAttrMap alloc_attr_; + std::vector<AllocatorAttributes> output_attrs_; TF_DISALLOW_COPY_AND_ASSIGN(ExecutorImpl); }; @@ -248,7 +242,8 @@ Status ExecutorImpl::Initialize() { nodes_.resize(num_nodes); Status s; - total_tensors_ = 0; + total_input_tensors_ = 0; + total_output_tensors_ = 0; // Preprocess every node in the graph to create an instance of op // kernel for each node; @@ -256,8 +251,13 @@ Status ExecutorImpl::Initialize() { const int id = n->id(); NodeItem* item = &nodes_[id]; item->node = n; - item->input_start = total_tensors_; - total_tensors_ += n->num_inputs(); + + item->input_start = total_input_tensors_; + total_input_tensors_ += n->num_inputs(); + + item->output_attr_start = total_output_tensors_; + total_output_tensors_ += n->num_outputs(); + s = params_.create_kernel(n->def(), &item->kernel); if (!s.ok()) { s = AttachDef(s, n->def()); @@ -283,22 +283,32 @@ Status ExecutorImpl::SetAllocAttrs() { Device* device = params_.device; DeviceNameUtils::ParsedName local_dev_name = device->parsed_name(); + output_attrs_.resize(total_output_tensors_); for (const Node* n : graph_->nodes()) { + NodeItem* item = &nodes_[n->id()]; + const int base_index = item->output_attr_start; // Examine the out edges of each node looking for special use // cases that may affect memory allocation attributes. 
for (auto e : n->out_edges()) { + const int index = e->src_output(); AllocatorAttributes attr; s = InferAllocAttr(n, e->dst(), local_dev_name, &attr); if (!s.ok()) return s; if (attr.value != 0) { - VLOG(2) << "node " << n->name() << " gets attr " << attr.value - << " for output " << e->src_output(); - alloc_attr_[std::make_pair(n->id(), e->src_output())].Merge(attr); - } else { - VLOG(2) << "default output attr for node " << n->name() << " output " - << e->src_output(); + if (!e->IsControlEdge()) { + output_attrs_[base_index + index].Merge(attr); + } } } + + for (int out = 0; out < n->num_outputs(); out++) { + OpKernel* op_kernel = item->kernel; + DCHECK_LT(out, op_kernel->output_memory_types().size()); + bool on_host = op_kernel->output_memory_types()[out] == HOST_MEMORY; + AllocatorAttributes h; + h.set_on_host(on_host); + output_attrs_[base_index + out].Merge(h); + } } return s; } @@ -712,7 +722,8 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl) iter_state->outstanding_frame_count = 0; iter_state->pending_count = new std::vector<int>; iter_state->dead_count = new std::vector<int>(impl->graph_->num_node_ids()); - iter_state->input_tensors = new std::vector<Entry>(impl_->total_tensors_); + iter_state->input_tensors = + new std::vector<Entry>(impl_->total_input_tensors_); // Initialize the executor state. outstanding_frames_.insert({root_frame_->frame_name, root_frame_}); @@ -793,32 +804,6 @@ void ExecutorState::RunAsync(Executor::DoneCallback done) { namespace { -// This function is provided for use by OpKernelContext when allocating -// the index'th output of node. It provides access to the -// AllocatorAttributes computed during initialization to determine in -// which memory region the tensor should be allocated. 
-AllocatorAttributes OutputAttributes(const DevAttrMap* attr_map, - const Node* node, - const OpKernel* op_kernel, int index) { - DCHECK_GE(index, 0); - - AllocatorAttributes attr; - int nid = node->id(); - const auto& iter = attr_map->find(std::make_pair(nid, index)); - if (iter != attr_map->end()) { - attr = iter->second; - VLOG(2) << "nondefault attr " << attr.value << " for node " << node->name() - << " output " << index; - } else { - VLOG(2) << "default attr for node " << node->name() << " output " << index; - } - - DCHECK_LT(index, op_kernel->output_memory_types().size()); - bool on_host = op_kernel->output_memory_types()[index] == HOST_MEMORY; - attr.set_on_host(on_host); - return attr; -} - // Helpers to make a copy of 'p' and makes a copy of the input type // vector and the device context vector. // @@ -926,9 +911,8 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) { params.op_kernel = op_kernel; params.frame_iter = FrameAndIter(input_frame->frame_id, input_iter); params.is_input_dead = is_input_dead; - params.output_alloc_attr = [this, node, op_kernel](int index) { - return OutputAttributes(&impl_->alloc_attr_, node, op_kernel, index); - }; + params.output_attr_array = + gtl::vector_as_array(&impl_->output_attrs_) + item.output_attr_start; async = op_kernel->AsAsync(); if (async) { @@ -1439,7 +1423,8 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter, InitializePending(impl_->graph_, iter_state->pending_count); iter_state->dead_count = new std::vector<int>(impl_->graph_->num_node_ids()); - iter_state->input_tensors = new std::vector<Entry>(impl_->total_tensors_); + iter_state->input_tensors = + new std::vector<Entry>(impl_->total_input_tensors_); auto frame_pending = impl_->frame_input_count_.find(enter_name); DCHECK(frame_pending != impl_->frame_input_count_.end()); @@ -1470,7 +1455,8 @@ void ExecutorState::IncrementIteration(FrameState* frame, iter_state->pending_count = new std::vector<int>; 
InitializePending(impl_->graph_, iter_state->pending_count); iter_state->dead_count = new std::vector<int>(impl_->graph_->num_node_ids()); - iter_state->input_tensors = new std::vector<Entry>(impl_->total_tensors_); + iter_state->input_tensors = + new std::vector<Entry>(impl_->total_input_tensors_); // Activate the successors of the deferred roots in the new iteration. ActivateNexts(frame, next_iter, ready); diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 3fa012b852..ec1d600fa1 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -436,7 +436,9 @@ class OpKernelContext { DeviceBase* device = nullptr; bool track_allocations = false; - std::function<AllocatorAttributes(int index)> output_alloc_attr = nullptr; + + // Array indexed by output number for this node + const AllocatorAttributes* output_attr_array = nullptr; // Shared resources accessible by this op kernel invocation. ResourceMgr* resource_manager = nullptr; @@ -642,7 +644,7 @@ class OpKernelContext { // Tensors allocated via allocate_temp. There may be a performance // penalty to using a Tensor that was not allocated using // allocate_output. This is because allocate_output uses the - // AllocatorAttributes stored in output_alloc_attr for the + // AllocatorAttributes stored in output_attr_array for the // designated output. In some cases, using the wrong attributes may // cause an extra copy of the Tensor's buffer. @@ -658,9 +660,9 @@ class OpKernelContext { Status allocate_output(const string& name, const TensorShape& shape, Tensor** tensor) TF_MUST_USE_RESULT; // The following methods use the supplied attributes instead of - // those in output_alloc_attr. The caller is responsible for + // those in output_attr_array. The caller is responsible for // ensuring that the attributes are "compatible" with the - // output_alloc_attr, e.g. the tensor is allocated on the correct + // output_attr_array, e.g. 
the tensor is allocated on the correct // device. See comment above. Status allocate_output(int index, const TensorShape& shape, Tensor** tensor, AllocatorAttributes attr) TF_MUST_USE_RESULT; @@ -767,7 +769,7 @@ class OpKernelContext { } AllocatorAttributes output_alloc_attr(int index) const { - return params_.output_alloc_attr(index); + return params_.output_attr_array[index]; } gtl::InlinedVector<WrappedAllocator, 4> wrapped_allocators() const { @@ -1121,8 +1123,7 @@ inline Status OpKernelContext::allocate_output(int index, Tensor** output) { DCHECK_GE(index, 0); DCHECK_LT(index, num_outputs()); - DCHECK(params_.output_alloc_attr); - AllocatorAttributes attr = params_.output_alloc_attr(index); + AllocatorAttributes attr = output_alloc_attr(index); return allocate_output(index, shape, output, attr); } diff --git a/tensorflow/core/kernels/core_ops_test.cc b/tensorflow/core/kernels/core_ops_test.cc index 0117a571fc..63078c4ddd 100644 --- a/tensorflow/core/kernels/core_ops_test.cc +++ b/tensorflow/core/kernels/core_ops_test.cc @@ -40,6 +40,7 @@ limitations under the License. 
#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/core/threadpool.h" @@ -443,12 +444,8 @@ static void BM_LRNFloat(int iters, int depth, int cols, int rows, params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs; params.op_kernel = op.get(); - params.output_alloc_attr = [&device, &op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); std::unique_ptr<OpKernelContext> context(new OpKernelContext(params)); @@ -527,12 +524,8 @@ static void BM_AvgPool(int iters, int batch_size, int rows, int cols, int depth, params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs; params.op_kernel = op.get(); - params.output_alloc_attr = [&device, &op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); std::unique_ptr<OpKernelContext> avgpool_context(new OpKernelContext(params)); @@ -635,12 +628,8 @@ static void BM_AvgPoolBk(int iters, int batch_size, int rows, int cols, params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs; params.op_kernel = op.get(); - params.output_alloc_attr = [&device, &op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); std::unique_ptr<OpKernelContext> avgpool_context(new 
OpKernelContext(params)); @@ -725,12 +714,8 @@ static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth, params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs; params.op_kernel = op.get(); - params.output_alloc_attr = [&device, &op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); std::unique_ptr<OpKernelContext> maxpool_context(new OpKernelContext(params)); @@ -903,12 +888,8 @@ static void BM_ReluFloat(int iters, int batch_size, int rows, int cols, params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs; params.op_kernel = op.get(); - params.output_alloc_attr = [&device, &op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); std::unique_ptr<OpKernelContext> relu_context(new OpKernelContext(params)); @@ -975,12 +956,8 @@ static void BM_ImageNetSoftmaxFwd(int iters, int batch_size, int node_depth, params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs; params.op_kernel = op.get(); - params.output_alloc_attr = [&device, &op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); std::unique_ptr<OpKernelContext> softmax_context(new OpKernelContext(params)); diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h index 152e5b2d03..bb470c8c3a 100644 --- a/tensorflow/core/kernels/ops_testutil.h +++ b/tensorflow/core/kernels/ops_testutil.h @@ -52,6 +52,19 @@ namespace test { NodeDef Node(const string& name, 
const string& op, const std::vector<string>& inputs); +inline void SetOutputAttrs(OpKernelContext::Params* params, + std::vector<AllocatorAttributes>* attrs) { + attrs->clear(); + for (int index = 0; index < params->op_kernel->num_outputs(); index++) { + AllocatorAttributes attr; + const bool on_host = + (params->op_kernel->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + attrs->push_back(attr); + } + params->output_attr_array = gtl::vector_as_array(attrs); +} + } // namespace test // Helpful functions to test operators. @@ -142,13 +155,8 @@ class OpsTestBase : public ::testing::Test { params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs_; params.op_kernel = kernel_.get(); - params.output_alloc_attr = [this, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = - (kernel_->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper; params.slice_reader_cache = &slice_reader_cache_wrapper; diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h index 44911c9d36..1099dd7193 100644 --- a/tensorflow/core/kernels/reduction_ops_common.h +++ b/tensorflow/core/kernels/reduction_ops_common.h @@ -228,11 +228,9 @@ class ReductionHelper { private: bool reduce_first_axis_; // True if need to reduce the 0-th dimension. - gtl::InlinedVector<int64, 4> - data_reshape_; // Reshape the data before reduction. - gtl::InlinedVector<int64, 4> out_shape_; // The final output shape. - gtl::InlinedVector<int64, 4> - out_reshape_; // Reshape the output for reduction. + gtl::InlinedVector<int64, 4> data_reshape_; // Reshape data before reduction. + gtl::InlinedVector<int64, 4> out_shape_; // The final output shape. + gtl::InlinedVector<int64, 4> out_reshape_; // Reshape output for reduction. 
}; } // end namespace diff --git a/tensorflow/core/kernels/restore_op_test.cc b/tensorflow/core/kernels/restore_op_test.cc index 3ea3251430..aa8f626c0f 100644 --- a/tensorflow/core/kernels/restore_op_test.cc +++ b/tensorflow/core/kernels/restore_op_test.cc @@ -164,12 +164,8 @@ TEST_F(RestoreOpTest, RestoreSimple) { params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs; params.op_kernel = op.get(); - params.output_alloc_attr = [&device, &op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper; params.slice_reader_cache = &slice_reader_cache_wrapper; @@ -392,12 +388,8 @@ TEST_F(RestoreSliceOpTest, RestoreInt) { params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs; params.op_kernel = op.get(); - params.output_alloc_attr = [&device, &op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper; params.slice_reader_cache = &slice_reader_cache_wrapper; diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc index 00d8e13338..4121ec232b 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_test.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc @@ -31,6 +31,7 @@ limitations under the License. 
#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/public/tensor.h" @@ -72,13 +73,8 @@ static void BM_SegmentReduction(int iters, string reduction, Index num_rows, params.frame_iter = FrameAndIter(0, 0); params.inputs = &reduction_inputs; params.op_kernel = reduction_op.get(); - params.output_alloc_attr = [&device, &reduction_op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = - (reduction_op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); std::unique_ptr<OpKernelContext> reduction_context( new OpKernelContext(params)); diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc index b807ee30b4..84ba13ee17 100644 --- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc +++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc @@ -255,12 +255,8 @@ static void BM_SparseToDense(int iters, const int bm_arg) { params.frame_iter = FrameAndIter(0, 0); params.inputs = &inputs; params.op_kernel = op.get(); - params.output_alloc_attr = [&device, &op, ¶ms](int index) { - AllocatorAttributes attr; - const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY); - attr.set_on_host(on_host); - return attr; - }; + std::vector<AllocatorAttributes> attrs; + test::SetOutputAttrs(¶ms, &attrs); std::unique_ptr<OpKernelContext> sparse_context(new OpKernelContext(params)); op->Compute(sparse_context.get()); |