aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/contrib/tensorrt/segment/segment.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/contrib/tensorrt/segment/segment.cc')
-rw-r--r--tensorflow/contrib/tensorrt/segment/segment.cc188
1 files changed, 148 insertions, 40 deletions
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index cc42913eca..008fffc954 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include <queue>
#include <set>
#include <unordered_map>
#include <vector>
@@ -32,6 +33,7 @@ namespace tensorflow {
namespace tensorrt {
namespace segment {
using ::tensorflow::strings::StrAppend;
+
// A simple graph representation to mirror tensorflow::Graph. This structure
// helps saving memory since segmenter modifies the graph in place, preventing
// the need to create a copy of the graph. It is composed of edges and nodes.
@@ -215,7 +217,7 @@ namespace {
bool CheckCycles(const std::unique_ptr<SimpleGraph>& g, const SimpleNode* src,
const std::vector<SimpleNode*>& start) {
- // copied from TF ReverseDFS.
+ // Copied from TF ReverseDFS, which only works for tensorflow::Graph.
struct Work {
SimpleNode* node;
bool leave; // Are we entering or leaving n?
@@ -269,6 +271,24 @@ bool CanContractEdge(const SimpleEdge* edge,
// 1. Get all nodes incoming to 'dst', excluding 'src'
// 2. Reverse DFS from those nodes
// 3. If reverse DFS reaches 'src' then we have a cycle
+ //
+ // TODO(aaroey): there are several problems with the current approach:
+ // 1. src->dst->src, this is not detected but it should be;
+ // 2. src->dst->...(any node sequence that doesn't contain src)...->dst, this
+ // is detected but it should not be.
+ //
+ // Note that it's fine that dst connects back to src indirectly (i.e. through
+ // a path with length > 1 that consists of intermedia nodes other than src).
+ // While loops is one example.
+ //
+ // The goal is to make sure that the trt subgraph:
+ // 1. has no loops (i.e. is a DAG), and
+ // 2. if there is a path in the subgraph from X to Y (X and Y are both nodes
+ // in the subgraph), then all paths from X to Y are in the subgraph.
+ //
+ // To achieve this goal, the correct way seems to be:
+ // 1. remove any direct edge from src->dst;
+ // 2. detect if src can reach dst, if so they cannot be merged.
std::vector<SimpleNode*> dfs_start_nodes;
for (SimpleNode* node : dst->in_nodes()) {
if (node != src) {
@@ -276,8 +296,8 @@ bool CanContractEdge(const SimpleEdge* edge,
}
}
- bool is_cycle = CheckCycles(graph, src, dfs_start_nodes);
- return !is_cycle;
+ const bool has_cycle = CheckCycles(graph, src, dfs_start_nodes);
+ return !has_cycle;
}
} // namespace
@@ -342,22 +362,20 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
}
tensorflow::Status SegmentGraph(
- const tensorflow::GraphDef& gdef,
- const std::function<bool(const tensorflow::Node*)>& candidate_fn,
- const SegmentOptions& options, SegmentNodesVector* segments) {
- // Create a Graph representation of the GraphDef.
- tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
- gdef.library());
- tensorflow::Graph graph(flib);
- TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
- tensorflow::GraphConstructorOptions(), gdef, &graph));
- return SegmentGraph(&graph, candidate_fn, options, segments);
-}
-
-tensorflow::Status SegmentGraph(
- tensorflow::Graph* tf_graph,
+ const tensorflow::Graph* tf_graph,
const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+ const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn,
+ const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn,
const SegmentOptions& options, SegmentNodesVector* segments) {
+ // Steps:
+ // 1. run the segmentation algorithm to find all the segments, which uses
+ // candidate_fn to determine the candidates segment nodes;
+ // 2. for each segments, remove the nodes that are inputs/outputs of the
+ // segment but are not eligible, using input/output_candidate_fn to
+ // determine the eligibilities;
+ // 3. convert the segment into expected return format and return the result.
+
+ // --------------------------------- Step 1 ---------------------------------
auto graph = std::unique_ptr<SimpleGraph>(new SimpleGraph(tf_graph));
// Use a union-find to collect the nodes that belong to the same
// segment. A node value of nullptr indicates that the node is not a candidate
@@ -372,14 +390,19 @@ tensorflow::Status SegmentGraph(
node_segments.emplace_back(node);
}
- // The segmentation algorithm below visits nodes in reverse
- // topological order and attempts to merge nodes along output
- // edges. That means that subgraphs grow from the output-side of the
- // network towards the inputs. In general this is not guaranteed to
- // produce a globally optimal segmentation. In the future if we have
- // a measure of how beneficial it is to include a given node in a
- // TRT subgraph then we can revisit this algorithm to take advantage
- // of that information.
+ // The segmentation algorithm below visits nodes in reverse topological order
+ // and attempts to merge nodes along output edges. That means that subgraphs
+ // grow from the output-side of the network towards the inputs.
+ //
+ // In general this is not guaranteed to produce a globally optimal
+ // segmentation. For exaample, consider graph with node {A, B, C, D} and edges
+ // {A->B, A->C, B->D, C->D), where A, B, D are trt compatible but C is not, so
+ // in theory we can choose to contract either A, B or B, D but not both, but
+ // here it always choose to contract B, D.
+ //
+ // In the future if we have a measure of how beneficial it is to include a
+ // given node in a TRT subgraph then we can revisit this algorithm to take
+ // advantage of that information.
std::vector<tensorflow::Node*> tforder;
tensorflow::GetPostOrder(*tf_graph, &tforder);
// use postorder implementation from tensorflow and construct mirror in
@@ -392,13 +415,11 @@ tensorflow::Status SegmentGraph(
for (const SimpleNode* node : order) {
// All output nodes of 'node' have been visited...
VLOG(2) << "Trying node " << node->name() << " id=" << node->id();
-
// 'node' must be a TRT candidate...
if (node_segments[node->id()].Value() == nullptr) {
VLOG(2) << "... not a TRT candidate";
continue;
}
-
// Contract output edges to combine 'node' with output
// nodes. Iterate since combining two nodes may unblock other
// combining.
@@ -416,7 +437,6 @@ tensorflow::Status SegmentGraph(
VLOG(2) << "... ... not a TRT candidate";
continue;
}
-
if (CanContractEdge(out_edge, graph)) {
VLOG(2) << "... ... can contract";
contract_edges.insert(out_edge);
@@ -424,11 +444,9 @@ tensorflow::Status SegmentGraph(
VLOG(2) << "... ... cannot contract, would form cycle";
}
}
-
if (contract_edges.empty()) {
break;
}
-
// Contract edges and collect the adjacent nodes into the same
// segment/subgraph.
while (!contract_edges.empty()) {
@@ -457,11 +475,22 @@ tensorflow::Status SegmentGraph(
// Collect the segments/subgraphs. Each subgraph is represented by a
// set of the names of the nodes in that subgraph.
- std::unordered_map<string, std::set<string>> sg_map;
+
+ // A map from the segment identifier (currently the name of the root node of
+ // the segment tree) to the segment nodes set.
+ std::unordered_map<string, std::set<const tensorflow::Node*>> sg_map;
+
+ // A map from the segment identifier (currently the name of the root node of
+ // the segment tree) to the device names that the nodes in the segment are
+ // assigned to.
+ //
+ // TODO(aaroey): nodes assigned to different devices should not be merged,
+ // fix this.
std::unordered_map<string, std::set<string>> device_maps;
+
for (auto& u : node_segments) {
if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) {
- sg_map[u.ParentValue()->name()].insert(u.Value()->name());
+ sg_map[u.ParentValue()->name()].insert(u.Value()->tf_node());
auto tf_node = u.Value()->tf_node();
// has_assigned_device_name() is expected to return true
// when called from optimization pass. However, since graph
@@ -482,25 +511,104 @@ tensorflow::Status SegmentGraph(
}
}
+ // --------------------------------- Step 2 ---------------------------------
+ // Remove ineligible input/output nodes.
+ for (auto& itr : sg_map) {
+ std::set<const tensorflow::Node*>& segment_nodes = itr.second;
+ VLOG(1) << "Segment original size: " << segment_nodes.size();
+ while (true) {
+ std::deque<const tensorflow::Node*> in_nodes_que, out_nodes_que;
+ // Find an input node that is not eligible and add it to the queue.
+ // Nodes that has no incoming edges should not be treated as "input",
+ // as there are really no inputs to them. Similar for output nodes.
+ for (auto node : segment_nodes) {
+ bool added = false;
+ for (const tensorflow::Edge* edge : node->in_edges()) {
+ if (!edge->IsControlEdge() && !edge->src()->IsSource() &&
+ !segment_nodes.count(edge->src())) { // 'node' is an input node.
+ if (!input_candidate_fn(edge)) {
+ in_nodes_que.push_back(node);
+ added = true;
+ break;
+ }
+ }
+ }
+ if (added) continue; // Only adding the node once to either queue.
+ for (const tensorflow::Edge* edge : node->out_edges()) {
+ if (!edge->dst()->IsSink() && !edge->IsControlEdge() &&
+ !segment_nodes.count(edge->dst())) { // 'node' is an output node.
+ if (!output_candidate_fn(edge)) {
+ out_nodes_que.push_back(node);
+ break;
+ }
+ }
+ }
+ }
+ if (in_nodes_que.empty() && out_nodes_que.empty()) {
+ // No more ineligible input/output nodes.
+ break;
+ }
+ // Now for each ineligible node, remove all of its inputs or outputs from
+ // the subgraph.
+ //
+ // It can be proven that, if the original subgraph:
+ // 1. is a DAG, and
+ // 2. all paths between two nodes in the subgraph are all inside the
+ // subgraph
+ // then after doing this operation the resulting subgraph will keep the
+ // same properties 1 and 2.
+ //
+ // For simplicity we use heuristics: for input nodes remove all its
+ // input, for output nodes remove all its output. In this way, for common
+ // cases the number of removed nodes should be minimum.
+ auto remove_nodes = [&segment_nodes](
+ bool is_input_nodes,
+ std::deque<const tensorflow::Node*>* que) {
+ // Run a BFS on the queue to find all the input/output nodes.
+ std::set<const tensorflow::Node*> visited;
+ while (!que->empty()) {
+ auto node = que->front();
+ que->pop_front();
+ if (!visited.insert(node).second) continue;
+ segment_nodes.erase(node);
+ for (auto in :
+ is_input_nodes ? node->in_nodes() : node->out_nodes()) {
+ if (segment_nodes.count(in)) {
+ que->push_back(in);
+ VLOG(2) << "Need to remove node " << in->name()
+ << " because one of its "
+ << (is_input_nodes ? "output" : "input")
+ << " nodes in the graph was removed: " << node->name();
+ }
+ }
+ }
+ };
+ remove_nodes(true, &in_nodes_que);
+ remove_nodes(false, &out_nodes_que);
+ }
+ VLOG(1) << "Segment new size: " << segment_nodes.size();
+ }
+
+ // --------------------------------- Step 3 ---------------------------------
// Convert the segments into the expected return format
for (const auto& itr : sg_map) {
- const auto& segment_node_names = itr.second;
+ const std::set<const tensorflow::Node*>& segment_nodes = itr.second;
if (VLOG_IS_ON(1)) {
string s;
- for (const auto& name : segment_node_names) {
- s += " " + name;
- }
- VLOG(1) << "Segment " << segments->size() << ":" << s;
+ for (auto node : segment_nodes) s += " " + node->name();
+ VLOG(1) << "Segment " << segments->size() << ": " << s;
}
// Don't use small segments.
- if (static_cast<int>(segment_node_names.size()) <
- options.minimum_segment_size) {
+ if (static_cast<int>(segment_nodes.size()) < options.minimum_segment_size) {
VLOG(1) << "Segment " << segments->size() << " has only "
- << segment_node_names.size() << " nodes, dropping";
+ << segment_nodes.size() << " nodes, dropping";
continue;
}
+
// TODO(sami): Make segmenter placement aware once trtscopes are in place
+ std::set<string> segment_node_names;
+ for (auto node : itr.second) segment_node_names.insert(node->name());
const auto& dev_itr = device_maps.find(itr.first);
if (dev_itr == device_maps.end() || dev_itr->second.empty()) {
VLOG(1) << "No device assigned to segment " << segments->size();