diff options
Diffstat (limited to 'tensorflow/contrib/tensorrt/segment/segment.cc')
-rw-r--r-- | tensorflow/contrib/tensorrt/segment/segment.cc | 188 |
1 files changed, 148 insertions, 40 deletions
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc index cc42913eca..008fffc954 100644 --- a/tensorflow/contrib/tensorrt/segment/segment.cc +++ b/tensorflow/contrib/tensorrt/segment/segment.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/contrib/tensorrt/segment/segment.h" +#include <queue> #include <set> #include <unordered_map> #include <vector> @@ -32,6 +33,7 @@ namespace tensorflow { namespace tensorrt { namespace segment { using ::tensorflow::strings::StrAppend; + // A simple graph representation to mirror tensorflow::Graph. This structure // helps saving memory since segmenter modifies the graph in place, preventing // the need to create a copy of the graph. It is composed of edges and nodes. @@ -215,7 +217,7 @@ namespace { bool CheckCycles(const std::unique_ptr<SimpleGraph>& g, const SimpleNode* src, const std::vector<SimpleNode*>& start) { - // copied from TF ReverseDFS. + // Copied from TF ReverseDFS, which only works for tensorflow::Graph. struct Work { SimpleNode* node; bool leave; // Are we entering or leaving n? @@ -269,6 +271,24 @@ bool CanContractEdge(const SimpleEdge* edge, // 1. Get all nodes incoming to 'dst', excluding 'src' // 2. Reverse DFS from those nodes // 3. If reverse DFS reaches 'src' then we have a cycle + // + // TODO(aaroey): there are several problems with the current approach: + // 1. src->dst->src, this is not detected but it should be; + // 2. src->dst->...(any node sequence that doesn't contain src)...->dst, this + // is detected but it should not be. + // + // Note that it's fine that dst connects back to src indirectly (i.e. through + // a path with length > 1 that consists of intermedia nodes other than src). + // While loops is one example. + // + // The goal is to make sure that the trt subgraph: + // 1. has no loops (i.e. is a DAG), and + // 2. if there is a path in the subgraph from X to Y (X and Y are both nodes + // in the subgraph), then all paths from X to Y are in the subgraph. + // + // To achieve this goal, the correct way seems to be: + // 1. remove any direct edge from src->dst; + // 2. detect if src can reach dst, if so they cannot be merged. std::vector<SimpleNode*> dfs_start_nodes; for (SimpleNode* node : dst->in_nodes()) { if (node != src) { @@ -276,8 +296,8 @@ bool CanContractEdge(const SimpleEdge* edge, } } - bool is_cycle = CheckCycles(graph, src, dfs_start_nodes); - return !is_cycle; + const bool has_cycle = CheckCycles(graph, src, dfs_start_nodes); + return !has_cycle; } } // namespace @@ -342,22 +362,20 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph, } tensorflow::Status SegmentGraph( - const tensorflow::GraphDef& gdef, - const std::function<bool(const tensorflow::Node*)>& candidate_fn, - const SegmentOptions& options, SegmentNodesVector* segments) { - // Create a Graph representation of the GraphDef. - tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(), - gdef.library()); - tensorflow::Graph graph(flib); - TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph( - tensorflow::GraphConstructorOptions(), gdef, &graph)); - return SegmentGraph(&graph, candidate_fn, options, segments); -} - -tensorflow::Status SegmentGraph( - tensorflow::Graph* tf_graph, + const tensorflow::Graph* tf_graph, const std::function<bool(const tensorflow::Node*)>& candidate_fn, + const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn, + const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn, const SegmentOptions& options, SegmentNodesVector* segments) { + // Steps: + // 1. run the segmentation algorithm to find all the segments, which uses + // candidate_fn to determine the candidates segment nodes; + // 2. for each segments, remove the nodes that are inputs/outputs of the + // segment but are not eligible, using input/output_candidate_fn to + // determine the eligibilities; + // 3. convert the segment into expected return format and return the result. + + // --------------------------------- Step 1 --------------------------------- auto graph = std::unique_ptr<SimpleGraph>(new SimpleGraph(tf_graph)); // Use a union-find to collect the nodes that belong to the same // segment. A node value of nullptr indicates that the node is not a candidate @@ -372,14 +390,19 @@ tensorflow::Status SegmentGraph( node_segments.emplace_back(node); } - // The segmentation algorithm below visits nodes in reverse - // topological order and attempts to merge nodes along output - // edges. That means that subgraphs grow from the output-side of the - // network towards the inputs. In general this is not guaranteed to - // produce a globally optimal segmentation. In the future if we have - // a measure of how beneficial it is to include a given node in a - // TRT subgraph then we can revisit this algorithm to take advantage - // of that information. + // The segmentation algorithm below visits nodes in reverse topological order + // and attempts to merge nodes along output edges. That means that subgraphs + // grow from the output-side of the network towards the inputs. + // + // In general this is not guaranteed to produce a globally optimal + // segmentation. For exaample, consider graph with node {A, B, C, D} and edges + // {A->B, A->C, B->D, C->D), where A, B, D are trt compatible but C is not, so + // in theory we can choose to contract either A, B or B, D but not both, but + // here it always choose to contract B, D. + // + // In the future if we have a measure of how beneficial it is to include a + // given node in a TRT subgraph then we can revisit this algorithm to take + // advantage of that information. std::vector<tensorflow::Node*> tforder; tensorflow::GetPostOrder(*tf_graph, &tforder); // use postorder implementation from tensorflow and construct mirror in @@ -392,13 +415,11 @@ tensorflow::Status SegmentGraph( for (const SimpleNode* node : order) { // All output nodes of 'node' have been visited... VLOG(2) << "Trying node " << node->name() << " id=" << node->id(); - // 'node' must be a TRT candidate... if (node_segments[node->id()].Value() == nullptr) { VLOG(2) << "... not a TRT candidate"; continue; } - // Contract output edges to combine 'node' with output // nodes. Iterate since combining two nodes may unblock other // combining. @@ -416,7 +437,6 @@ tensorflow::Status SegmentGraph( VLOG(2) << "... ... not a TRT candidate"; continue; } - if (CanContractEdge(out_edge, graph)) { VLOG(2) << "... ... can contract"; contract_edges.insert(out_edge); @@ -424,11 +444,9 @@ tensorflow::Status SegmentGraph( VLOG(2) << "... ... cannot contract, would form cycle"; } } - if (contract_edges.empty()) { break; } - // Contract edges and collect the adjacent nodes into the same // segment/subgraph. while (!contract_edges.empty()) { @@ -457,11 +475,22 @@ tensorflow::Status SegmentGraph( // Collect the segments/subgraphs. Each subgraph is represented by a // set of the names of the nodes in that subgraph. - std::unordered_map<string, std::set<string>> sg_map; + + // A map from the segment identifier (currently the name of the root node of + // the segment tree) to the segment nodes set. + std::unordered_map<string, std::set<const tensorflow::Node*>> sg_map; + + // A map from the segment identifier (currently the name of the root node of + // the segment tree) to the device names that the nodes in the segment are + // assigned to. + // + // TODO(aaroey): nodes assigned to different devices should not be merged, + // fix this. std::unordered_map<string, std::set<string>> device_maps; + for (auto& u : node_segments) { if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) { - sg_map[u.ParentValue()->name()].insert(u.Value()->name()); + sg_map[u.ParentValue()->name()].insert(u.Value()->tf_node()); auto tf_node = u.Value()->tf_node(); // has_assigned_device_name() is expected to return true // when called from optimization pass. However, since graph @@ -482,25 +511,104 @@ tensorflow::Status SegmentGraph( } } + // --------------------------------- Step 2 --------------------------------- + // Remove ineligible input/output nodes. + for (auto& itr : sg_map) { + std::set<const tensorflow::Node*>& segment_nodes = itr.second; + VLOG(1) << "Segment original size: " << segment_nodes.size(); + while (true) { + std::deque<const tensorflow::Node*> in_nodes_que, out_nodes_que; + // Find an input node that is not eligible and add it to the queue. + // Nodes that has no incoming edges should not be treated as "input", + // as there are really no inputs to them. Similar for output nodes. + for (auto node : segment_nodes) { + bool added = false; + for (const tensorflow::Edge* edge : node->in_edges()) { + if (!edge->IsControlEdge() && !edge->src()->IsSource() && + !segment_nodes.count(edge->src())) { // 'node' is an input node. + if (!input_candidate_fn(edge)) { + in_nodes_que.push_back(node); + added = true; + break; + } + } + } + if (added) continue; // Only adding the node once to either queue. + for (const tensorflow::Edge* edge : node->out_edges()) { + if (!edge->dst()->IsSink() && !edge->IsControlEdge() && + !segment_nodes.count(edge->dst())) { // 'node' is an output node. + if (!output_candidate_fn(edge)) { + out_nodes_que.push_back(node); + break; + } + } + } + } + if (in_nodes_que.empty() && out_nodes_que.empty()) { + // No more ineligible input/output nodes. + break; + } + // Now for each ineligible node, remove all of its inputs or outputs from + // the subgraph. + // + // It can be proven that, if the original subgraph: + // 1. is a DAG, and + // 2. all paths between two nodes in the subgraph are all inside the + // subgraph + // then after doing this operation the resulting subgraph will keep the + // same properties 1 and 2. + // + // For simplicity we use heuristics: for input nodes remove all its + // input, for output nodes remove all its output. In this way, for common + // cases the number of removed nodes should be minimum. + auto remove_nodes = [&segment_nodes]( + bool is_input_nodes, + std::deque<const tensorflow::Node*>* que) { + // Run a BFS on the queue to find all the input/output nodes. + std::set<const tensorflow::Node*> visited; + while (!que->empty()) { + auto node = que->front(); + que->pop_front(); + if (!visited.insert(node).second) continue; + segment_nodes.erase(node); + for (auto in : + is_input_nodes ? node->in_nodes() : node->out_nodes()) { + if (segment_nodes.count(in)) { + que->push_back(in); + VLOG(2) << "Need to remove node " << in->name() + << " because one of its " + << (is_input_nodes ? "output" : "input") + << " nodes in the graph was removed: " << node->name(); + } + } + } + }; + remove_nodes(true, &in_nodes_que); + remove_nodes(false, &out_nodes_que); + } + VLOG(1) << "Segment new size: " << segment_nodes.size(); + } + + // --------------------------------- Step 3 --------------------------------- // Convert the segments into the expected return format for (const auto& itr : sg_map) { - const auto& segment_node_names = itr.second; + const std::set<const tensorflow::Node*>& segment_nodes = itr.second; if (VLOG_IS_ON(1)) { string s; - for (const auto& name : segment_node_names) { - s += " " + name; - } - VLOG(1) << "Segment " << segments->size() << ":" << s; + for (auto node : segment_nodes) s += " " + node->name(); + VLOG(1) << "Segment " << segments->size() << ": " << s; } // Don't use small segments. - if (static_cast<int>(segment_node_names.size()) < - options.minimum_segment_size) { + if (static_cast<int>(segment_nodes.size()) < options.minimum_segment_size) { VLOG(1) << "Segment " << segments->size() << " has only " - << segment_node_names.size() << " nodes, dropping"; + << segment_nodes.size() << " nodes, dropping"; continue; } + // TODO(sami): Make segmenter placement aware once trtscopes are in place + std::set<string> segment_node_names; + for (auto node : itr.second) segment_node_names.insert(node->name()); const auto& dev_itr = device_maps.find(itr.first); if (dev_itr == device_maps.end() || dev_itr->second.empty()) { VLOG(1) << "No device assigned to segment " << segments->size(); |