aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <nobody@tensorflow.org>2016-01-13 17:30:08 -0800
committerGravatar Vijay Vasudevan <vrv@google.com>2016-01-13 17:30:08 -0800
commit6dbfb95100b73ad26ebebb9be9c0429dc0cece8a (patch)
tree83a15ef5538529dd3a4fece1276388c20f9260bb /tensorflow
parentcbdf278dbd998aab57898763605f6863e9b42b76 (diff)
Avoid allocation of std::function<> object per node executed, as
well as a hash-table lookup per allocated output. Instead, we now pre-compute the AllocatorAttributes for every output tensor in the graph into an array (indexed by a base number per node + output index), and changed OpKernelContext::Params to provide a pointer to the base of the array for the node, rather than providing a std::function<>. Updated test code to avoid so much code duplication when setting up the OpKernelContext::Params object in various places. Used gtl::InlinedVector<...> instead of std::vector<...> in a few places in tensorflow/core/kernels/reduction_ops_common.h Didn't make a measurable change in overall performance, but allocations and time spent in the std::function destructor code were significantly reduced. Change: 112103260
Diffstat (limited to 'tensorflow')
-rw-r--r--tensorflow/core/common_runtime/executor.cc94
-rw-r--r--tensorflow/core/framework/op_kernel.h15
-rw-r--r--tensorflow/core/kernels/core_ops_test.cc49
-rw-r--r--tensorflow/core/kernels/ops_testutil.h22
-rw-r--r--tensorflow/core/kernels/reduction_ops_common.h8
-rw-r--r--tensorflow/core/kernels/restore_op_test.cc16
-rw-r--r--tensorflow/core/kernels/segment_reduction_ops_test.cc10
-rw-r--r--tensorflow/core/kernels/sparse_to_dense_op_test.cc8
8 files changed, 88 insertions, 134 deletions
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index e42d2e260f..5de368da87 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -175,18 +175,11 @@ struct NodeItem {
// ExecutorImpl::tensors_[input_start] is the 1st positional input
// for this node.
int input_start = 0;
-};
-// Map from std::pair<node_id, output_index> to attributes.
-struct pairhash {
- public:
- template <typename T, typename U>
- std::size_t operator()(const std::pair<T, U>& x) const {
- return std::hash<T>()(x.first) ^ std::hash<U>()(x.second);
- }
+ // ExecutorImpl::output_attrs_[output_attr_start] is the 1st
+ // positional attribute for the 0th output of this node.
+ int output_attr_start = 0;
};
-typedef std::unordered_map<std::pair<int, int>, AllocatorAttributes, pairhash>
- DevAttrMap;
typedef gtl::InlinedVector<TensorValue, 4> TensorValueVec;
typedef gtl::InlinedVector<DeviceContext*, 4> DeviceContextVec;
@@ -231,14 +224,15 @@ class ExecutorImpl : public Executor {
// Owned.
LocalExecutorParams params_;
const Graph* graph_;
- std::vector<NodeItem> nodes_; // nodes_.size == graph_.num_node_ids().
- int total_tensors_ = 0; // total_tensors_ = sum(nodes_[*].num_inputs())
+ std::vector<NodeItem> nodes_; // nodes_.size == graph_.num_node_ids().
+ int total_input_tensors_ = 0; // == sum(nodes_[*].num_inputs())
+ int total_output_tensors_ = 0; // == sum(nodes_[*].num_outputs())
// The number of inputs for each frame in this graph. This is static
// information of the graph.
std::unordered_map<string, int> frame_input_count_;
- DevAttrMap alloc_attr_;
+ std::vector<AllocatorAttributes> output_attrs_;
TF_DISALLOW_COPY_AND_ASSIGN(ExecutorImpl);
};
@@ -248,7 +242,8 @@ Status ExecutorImpl::Initialize() {
nodes_.resize(num_nodes);
Status s;
- total_tensors_ = 0;
+ total_input_tensors_ = 0;
+ total_output_tensors_ = 0;
// Preprocess every node in the graph to create an instance of op
// kernel for each node;
@@ -256,8 +251,13 @@ Status ExecutorImpl::Initialize() {
const int id = n->id();
NodeItem* item = &nodes_[id];
item->node = n;
- item->input_start = total_tensors_;
- total_tensors_ += n->num_inputs();
+
+ item->input_start = total_input_tensors_;
+ total_input_tensors_ += n->num_inputs();
+
+ item->output_attr_start = total_output_tensors_;
+ total_output_tensors_ += n->num_outputs();
+
s = params_.create_kernel(n->def(), &item->kernel);
if (!s.ok()) {
s = AttachDef(s, n->def());
@@ -283,22 +283,32 @@ Status ExecutorImpl::SetAllocAttrs() {
Device* device = params_.device;
DeviceNameUtils::ParsedName local_dev_name = device->parsed_name();
+ output_attrs_.resize(total_output_tensors_);
for (const Node* n : graph_->nodes()) {
+ NodeItem* item = &nodes_[n->id()];
+ const int base_index = item->output_attr_start;
// Examine the out edges of each node looking for special use
// cases that may affect memory allocation attributes.
for (auto e : n->out_edges()) {
+ const int index = e->src_output();
AllocatorAttributes attr;
s = InferAllocAttr(n, e->dst(), local_dev_name, &attr);
if (!s.ok()) return s;
if (attr.value != 0) {
- VLOG(2) << "node " << n->name() << " gets attr " << attr.value
- << " for output " << e->src_output();
- alloc_attr_[std::make_pair(n->id(), e->src_output())].Merge(attr);
- } else {
- VLOG(2) << "default output attr for node " << n->name() << " output "
- << e->src_output();
+ if (!e->IsControlEdge()) {
+ output_attrs_[base_index + index].Merge(attr);
+ }
}
}
+
+ for (int out = 0; out < n->num_outputs(); out++) {
+ OpKernel* op_kernel = item->kernel;
+ DCHECK_LT(out, op_kernel->output_memory_types().size());
+ bool on_host = op_kernel->output_memory_types()[out] == HOST_MEMORY;
+ AllocatorAttributes h;
+ h.set_on_host(on_host);
+ output_attrs_[base_index + out].Merge(h);
+ }
}
return s;
}
@@ -712,7 +722,8 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
iter_state->outstanding_frame_count = 0;
iter_state->pending_count = new std::vector<int>;
iter_state->dead_count = new std::vector<int>(impl->graph_->num_node_ids());
- iter_state->input_tensors = new std::vector<Entry>(impl_->total_tensors_);
+ iter_state->input_tensors =
+ new std::vector<Entry>(impl_->total_input_tensors_);
// Initialize the executor state.
outstanding_frames_.insert({root_frame_->frame_name, root_frame_});
@@ -793,32 +804,6 @@ void ExecutorState::RunAsync(Executor::DoneCallback done) {
namespace {
-// This function is provided for use by OpKernelContext when allocating
-// the index'th output of node. It provides access to the
-// AllocatorAttributes computed during initialization to determine in
-// which memory region the tensor should be allocated.
-AllocatorAttributes OutputAttributes(const DevAttrMap* attr_map,
- const Node* node,
- const OpKernel* op_kernel, int index) {
- DCHECK_GE(index, 0);
-
- AllocatorAttributes attr;
- int nid = node->id();
- const auto& iter = attr_map->find(std::make_pair(nid, index));
- if (iter != attr_map->end()) {
- attr = iter->second;
- VLOG(2) << "nondefault attr " << attr.value << " for node " << node->name()
- << " output " << index;
- } else {
- VLOG(2) << "default attr for node " << node->name() << " output " << index;
- }
-
- DCHECK_LT(index, op_kernel->output_memory_types().size());
- bool on_host = op_kernel->output_memory_types()[index] == HOST_MEMORY;
- attr.set_on_host(on_host);
- return attr;
-}
-
// Helpers to make a copy of 'p' and makes a copy of the input type
// vector and the device context vector.
//
@@ -926,9 +911,8 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
params.op_kernel = op_kernel;
params.frame_iter = FrameAndIter(input_frame->frame_id, input_iter);
params.is_input_dead = is_input_dead;
- params.output_alloc_attr = [this, node, op_kernel](int index) {
- return OutputAttributes(&impl_->alloc_attr_, node, op_kernel, index);
- };
+ params.output_attr_array =
+ gtl::vector_as_array(&impl_->output_attrs_) + item.output_attr_start;
async = op_kernel->AsAsync();
if (async) {
@@ -1439,7 +1423,8 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter,
InitializePending(impl_->graph_, iter_state->pending_count);
iter_state->dead_count =
new std::vector<int>(impl_->graph_->num_node_ids());
- iter_state->input_tensors = new std::vector<Entry>(impl_->total_tensors_);
+ iter_state->input_tensors =
+ new std::vector<Entry>(impl_->total_input_tensors_);
auto frame_pending = impl_->frame_input_count_.find(enter_name);
DCHECK(frame_pending != impl_->frame_input_count_.end());
@@ -1470,7 +1455,8 @@ void ExecutorState::IncrementIteration(FrameState* frame,
iter_state->pending_count = new std::vector<int>;
InitializePending(impl_->graph_, iter_state->pending_count);
iter_state->dead_count = new std::vector<int>(impl_->graph_->num_node_ids());
- iter_state->input_tensors = new std::vector<Entry>(impl_->total_tensors_);
+ iter_state->input_tensors =
+ new std::vector<Entry>(impl_->total_input_tensors_);
// Activate the successors of the deferred roots in the new iteration.
ActivateNexts(frame, next_iter, ready);
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 3fa012b852..ec1d600fa1 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -436,7 +436,9 @@ class OpKernelContext {
DeviceBase* device = nullptr;
bool track_allocations = false;
- std::function<AllocatorAttributes(int index)> output_alloc_attr = nullptr;
+
+ // Array indexed by output number for this node
+ const AllocatorAttributes* output_attr_array = nullptr;
// Shared resources accessible by this op kernel invocation.
ResourceMgr* resource_manager = nullptr;
@@ -642,7 +644,7 @@ class OpKernelContext {
// Tensors allocated via allocate_temp. There may be a performance
// penalty to using a Tensor that was not allocated using
// allocate_output. This is because allocate_output uses the
- // AllocatorAttributes stored in output_alloc_attr for the
+ // AllocatorAttributes stored in output_attr_array for the
// designated output. In some cases, using the wrong attributes may
// cause an extra copy of the Tensor's buffer.
@@ -658,9 +660,9 @@ class OpKernelContext {
Status allocate_output(const string& name, const TensorShape& shape,
Tensor** tensor) TF_MUST_USE_RESULT;
// The following methods use the supplied attributes instead of
- // those in output_alloc_attr. The caller is responsible for
+ // those in output_attr_array. The caller is responsible for
// ensuring that the attributes are "compatible" with the
- // output_alloc_attr, e.g. the tensor is allocated on the correct
+ // output_attr_array, e.g. the tensor is allocated on the correct
// device. See comment above.
Status allocate_output(int index, const TensorShape& shape, Tensor** tensor,
AllocatorAttributes attr) TF_MUST_USE_RESULT;
@@ -767,7 +769,7 @@ class OpKernelContext {
}
AllocatorAttributes output_alloc_attr(int index) const {
- return params_.output_alloc_attr(index);
+ return params_.output_attr_array[index];
}
gtl::InlinedVector<WrappedAllocator, 4> wrapped_allocators() const {
@@ -1121,8 +1123,7 @@ inline Status OpKernelContext::allocate_output(int index,
Tensor** output) {
DCHECK_GE(index, 0);
DCHECK_LT(index, num_outputs());
- DCHECK(params_.output_alloc_attr);
- AllocatorAttributes attr = params_.output_alloc_attr(index);
+ AllocatorAttributes attr = output_alloc_attr(index);
return allocate_output(index, shape, output, attr);
}
diff --git a/tensorflow/core/kernels/core_ops_test.cc b/tensorflow/core/kernels/core_ops_test.cc
index 0117a571fc..63078c4ddd 100644
--- a/tensorflow/core/kernels/core_ops_test.cc
+++ b/tensorflow/core/kernels/core_ops_test.cc
@@ -40,6 +40,7 @@ limitations under the License.
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/core/threadpool.h"
@@ -443,12 +444,8 @@ static void BM_LRNFloat(int iters, int depth, int cols, int rows,
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs;
params.op_kernel = op.get();
- params.output_alloc_attr = [&device, &op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
std::unique_ptr<OpKernelContext> context(new OpKernelContext(params));
@@ -527,12 +524,8 @@ static void BM_AvgPool(int iters, int batch_size, int rows, int cols, int depth,
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs;
params.op_kernel = op.get();
- params.output_alloc_attr = [&device, &op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
std::unique_ptr<OpKernelContext> avgpool_context(new OpKernelContext(params));
@@ -635,12 +628,8 @@ static void BM_AvgPoolBk(int iters, int batch_size, int rows, int cols,
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs;
params.op_kernel = op.get();
- params.output_alloc_attr = [&device, &op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
std::unique_ptr<OpKernelContext> avgpool_context(new OpKernelContext(params));
@@ -725,12 +714,8 @@ static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth,
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs;
params.op_kernel = op.get();
- params.output_alloc_attr = [&device, &op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
std::unique_ptr<OpKernelContext> maxpool_context(new OpKernelContext(params));
@@ -903,12 +888,8 @@ static void BM_ReluFloat(int iters, int batch_size, int rows, int cols,
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs;
params.op_kernel = op.get();
- params.output_alloc_attr = [&device, &op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
std::unique_ptr<OpKernelContext> relu_context(new OpKernelContext(params));
@@ -975,12 +956,8 @@ static void BM_ImageNetSoftmaxFwd(int iters, int batch_size, int node_depth,
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs;
params.op_kernel = op.get();
- params.output_alloc_attr = [&device, &op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
std::unique_ptr<OpKernelContext> softmax_context(new OpKernelContext(params));
diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h
index 152e5b2d03..bb470c8c3a 100644
--- a/tensorflow/core/kernels/ops_testutil.h
+++ b/tensorflow/core/kernels/ops_testutil.h
@@ -52,6 +52,19 @@ namespace test {
NodeDef Node(const string& name, const string& op,
const std::vector<string>& inputs);
+inline void SetOutputAttrs(OpKernelContext::Params* params,
+ std::vector<AllocatorAttributes>* attrs) {
+ attrs->clear();
+ for (int index = 0; index < params->op_kernel->num_outputs(); index++) {
+ AllocatorAttributes attr;
+ const bool on_host =
+ (params->op_kernel->output_memory_types()[index] == HOST_MEMORY);
+ attr.set_on_host(on_host);
+ attrs->push_back(attr);
+ }
+ params->output_attr_array = gtl::vector_as_array(attrs);
+}
+
} // namespace test
// Helpful functions to test operators.
@@ -142,13 +155,8 @@ class OpsTestBase : public ::testing::Test {
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs_;
params.op_kernel = kernel_.get();
- params.output_alloc_attr = [this, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host =
- (kernel_->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
params.slice_reader_cache = &slice_reader_cache_wrapper;
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
index 44911c9d36..1099dd7193 100644
--- a/tensorflow/core/kernels/reduction_ops_common.h
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -228,11 +228,9 @@ class ReductionHelper {
private:
bool reduce_first_axis_; // True if need to reduce the 0-th dimension.
- gtl::InlinedVector<int64, 4>
- data_reshape_; // Reshape the data before reduction.
- gtl::InlinedVector<int64, 4> out_shape_; // The final output shape.
- gtl::InlinedVector<int64, 4>
- out_reshape_; // Reshape the output for reduction.
+ gtl::InlinedVector<int64, 4> data_reshape_; // Reshape data before reduction.
+ gtl::InlinedVector<int64, 4> out_shape_; // The final output shape.
+ gtl::InlinedVector<int64, 4> out_reshape_; // Reshape output for reduction.
};
} // end namespace
diff --git a/tensorflow/core/kernels/restore_op_test.cc b/tensorflow/core/kernels/restore_op_test.cc
index 3ea3251430..aa8f626c0f 100644
--- a/tensorflow/core/kernels/restore_op_test.cc
+++ b/tensorflow/core/kernels/restore_op_test.cc
@@ -164,12 +164,8 @@ TEST_F(RestoreOpTest, RestoreSimple) {
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs;
params.op_kernel = op.get();
- params.output_alloc_attr = [&device, &op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
params.slice_reader_cache = &slice_reader_cache_wrapper;
@@ -392,12 +388,8 @@ TEST_F(RestoreSliceOpTest, RestoreInt) {
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs;
params.op_kernel = op.get();
- params.output_alloc_attr = [&device, &op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
params.slice_reader_cache = &slice_reader_cache_wrapper;
diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc
index 00d8e13338..4121ec232b 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_test.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/public/tensor.h"
@@ -72,13 +73,8 @@ static void BM_SegmentReduction(int iters, string reduction, Index num_rows,
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &reduction_inputs;
params.op_kernel = reduction_op.get();
- params.output_alloc_attr = [&device, &reduction_op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host =
- (reduction_op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
std::unique_ptr<OpKernelContext> reduction_context(
new OpKernelContext(params));
diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
index b807ee30b4..84ba13ee17 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
@@ -255,12 +255,8 @@ static void BM_SparseToDense(int iters, const int bm_arg) {
params.frame_iter = FrameAndIter(0, 0);
params.inputs = &inputs;
params.op_kernel = op.get();
- params.output_alloc_attr = [&device, &op, &params](int index) {
- AllocatorAttributes attr;
- const bool on_host = (op->output_memory_types()[index] == HOST_MEMORY);
- attr.set_on_host(on_host);
- return attr;
- };
+ std::vector<AllocatorAttributes> attrs;
+ test::SetOutputAttrs(&params, &attrs);
std::unique_ptr<OpKernelContext> sparse_context(new OpKernelContext(params));
op->Compute(sparse_context.get());