path: root/tensorflow/core/graph/graph.cc
author    A. Unique TensorFlower <gardener@tensorflow.org>  2017-06-01 14:29:25 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>  2017-06-01 14:33:24 -0700
commit 7866fa01b79a297908e7871d3b274fa02a5ce5e2 (patch)
tree   75498546b0ba55cb0b2106ae94a222e740e6f5bb /tensorflow/core/graph/graph.cc
parent fdffafbc19d85a63c72b76ecfeb2d92a4c43dc75 (diff)
This change significantly reduces time and resources used to load large TensorFlow graphs.
For a real-world large graph (13k nodes, 20k edges), this change:
* reduces all heap allocations by 19%
* reduces retained (final) heap allocations by 2.2%
* reduces CPU time by 11.2%

In most TF graphs, the set of unique values passed to Node::set_assigned_device_name() is quite small. This change adds an interning table to the Graph object, which contains all of the unique values used for Node::set_assigned_device_name(), along with a look-up table. This is the main source of the reduction in retained heap memory: nearly all nodes are assigned to just one or two unique devices.

This change removes the "string assigned_device_name_" field from the Node class and replaces it with "int assigned_device_name_index_". However, because both the index and the name table are needed to recover the actual value, the Node::assigned_device_name() accessor needs access to the parent Graph. This requires adding a "Graph* graph_" field to the Node class. In the future, if all users of this property are converted to use Graph::assigned_device_name(Node*), the Node::graph_ field can be deleted and the space reclaimed. Doing so is out of the scope of this CL, however, and even with the new pointer field the Node class is smaller than it was before, so this is still a net win.

The placement algorithm in simple_placer.cc is one of the main accessors of the Node::assigned_device_name property. This CL contains significant changes to simple_placer.cc that directly take advantage of the property being an index into a name table, rather than treating it simply as a string. Many temporary allocations are also removed, which is the main source of the reduction in total heap allocations.

This CL also contains a few changes that remove short-lived allocations in unrelated code, such as the changes in op.cc/h, costmodel.cc, etc. It is extremely easy in C++ to accidentally allocate memory, especially when implicit conversions and copy constructors do so. All of the changes in this CL were motivated by empirical measurement, using CPU profiling and heap profiling.

PiperOrigin-RevId: 157762909
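As a minimal, self-contained sketch of the scheme the commit message describes (all names here are hypothetical simplifications, not the actual TensorFlow classes; the real change is in the diff below): the Graph owns an interning table mapping each unique device name to a small integer, and each Node stores only that integer plus a back pointer to its Graph.

#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

// Graph-owned interning table: one stored copy per unique device name.
struct DeviceNameTable {
  std::vector<std::string> names{""};          // index 0 is reserved for ""
  std::unordered_map<std::string, int> index;  // name -> index in 'names'

  int Intern(const std::string& name) {
    if (name.empty()) return 0;                // very common case
    int& cell = index[name];                   // 0 iff 'name' is a new key
    if (cell > 0) return cell;                 // already interned
    cell = static_cast<int>(names.size());
    names.push_back(name);
    return cell;
  }
};

struct SketchGraph;                            // for the Node back pointer

struct SketchNode {
  SketchGraph* graph = nullptr;
  int assigned_device_name_index = 0;          // replaces a per-node string
};

struct SketchGraph {
  DeviceNameTable device_names;
  const std::string& assigned_device_name(const SketchNode& n) const {
    return device_names.names[n.assigned_device_name_index];
  }
};

int main() {
  SketchGraph g;
  SketchNode a, b;
  a.graph = &g;
  b.graph = &g;
  // Thousands of nodes placed on the same device share one interned string.
  a.assigned_device_name_index = g.device_names.Intern("/job:w/device:GPU:0");
  b.assigned_device_name_index = g.device_names.Intern("/job:w/device:GPU:0");
  assert(a.assigned_device_name_index == b.assigned_device_name_index);
  assert(g.assigned_device_name(a) == "/job:w/device:GPU:0");
}

Reserving index 0 for the empty name keeps the common "unassigned" case allocation-free, and is what enables the single-map-lookup test inside InternDeviceName in the diff below.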
Diffstat (limited to 'tensorflow/core/graph/graph.cc')
-rw-r--r--  tensorflow/core/graph/graph.cc | 36
1 file changed, 33 insertions, 3 deletions
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 80161ceb56..dcb8520cf7 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -56,6 +56,9 @@ const std::unordered_map<string, Node::NodeClass>& Node::kNodeClassTable =
{"GetSessionHandleV2", NC_GET_SESSION_HANDLE},
{"GetSessionTensor", NC_GET_SESSION_TENSOR},
{"DeleteSessionTensor", NC_DELETE_SESSION_TENSOR},
+ {"Size", NC_METADATA},
+ {"Shape", NC_METADATA},
+ {"Rank", NC_METADATA},
});
#undef REF_CLASS
@@ -77,7 +80,7 @@ string Node::DebugString() const {
strings::StrAppend(&ret, " sink}");
} else {
strings::StrAppend(&ret, " op device:");
- strings::StrAppend(&ret, "{", assigned_device_name_, "}");
+ strings::StrAppend(&ret, "{", assigned_device_name(), "}");
strings::StrAppend(&ret, " def:{", SummarizeNode(*this), "}}");
}
return ret;
@@ -88,7 +91,7 @@ Node::Node()
cost_id_(-1),
class_(NC_UNINITIALIZED),
props_(nullptr),
- assigned_device_name_() {}
+ assigned_device_name_index_(0) {}
Node::~Node() {
if (props_) {
@@ -124,7 +127,7 @@ void Node::Clear() {
props_ = nullptr;
}
- assigned_device_name_.clear();
+ assigned_device_name_index_ = 0;
}
gtl::iterator_range<NeighborIter> Node::out_nodes() const {
@@ -241,6 +244,10 @@ Graph::Graph(const OpRegistryInterface* ops)
versions_.set_producer(TF_GRAPH_DEF_VERSION);
versions_.set_min_consumer(TF_GRAPH_DEF_VERSION_MIN_CONSUMER);
+ // Initialize the name interning table for assigned_device_name.
+ device_names_.push_back("");
+ DCHECK_EQ(0, InternDeviceName(""));
+
// Source and sink have no endpoints, just control edges.
NodeDef def;
def.set_name("_SOURCE");
@@ -503,6 +510,7 @@ Node* Graph::AllocateNode(Node::Properties* props, const Node* cost_node) {
node = free_nodes_.back();
free_nodes_.pop_back();
}
+ node->graph_ = this;
const int id = nodes_.size();
int cost_id = cost_node ? cost_node->cost_id() : id;
node->Initialize(id, cost_id, props);
@@ -519,4 +527,26 @@ void Graph::ReleaseNode(Node* node) {
node->Clear();
}
+// Ensures that 'device_name' is present in the device name table, and returns
+// the index of that device name. The index is stable, and can be used in
+// calls to Node::set_assigned_device_name_index().
+int Graph::InternDeviceName(const string& device_name) {
+ // Special case, very common. Also, this allows us to use a single map
+ // lookup below, instead of two. The 'if (index_cell > 0)' test below
+ // relies on this check.
+ if (device_name.empty()) {
+ return 0;
+ }
+
+ int& index_cell = device_names_map_[device_name];
+ if (index_cell > 0) {
+ return index_cell;
+ }
+
+ const int index = device_names_map_.size();
+ index_cell = index;
+ device_names_.push_back(device_name);
+ return index;
+}
+
} // namespace tensorflow
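A note on the single-lookup idiom in InternDeviceName above: std::unordered_map::operator[] value-initializes the mapped int to 0 on a miss and returns a reference to it, so one hash lookup serves as both find and insert. Reserving index 0 for the empty device name (done in the Graph constructor) is what makes "if (index_cell > 0)" a valid presence test. A standalone demonstration of the idiom, with a hypothetical key:

#include <cassert>
#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<std::string, int> m;
  // operator[] inserts a value-initialized int (0) for a missing key and
  // returns a reference to it: one lookup acts as both find and insert.
  int& cell = m["/job:w/device:CPU:0"];
  assert(cell == 0);                     // freshly inserted, so zero
  cell = 1;                              // writing through the reference
  assert(m["/job:w/device:CPU:0"] == 1); // updates the map entry in place
}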