Diffstat (limited to 'tensorflow/contrib/tensorrt/convert/convert_graph.cc')
-rw-r--r-- | tensorflow/contrib/tensorrt/convert/convert_graph.cc | 131
1 file changed, 83 insertions, 48 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 13986127ba..3383f6bc9b 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
-#include "tensorflow/core/common_runtime/gpu/process_state.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -86,27 +86,48 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) {
   // TODO(jie): Segmentation shouldn't associated with op name.
   //            Split it into a registration for each kernel.
   static const std::set<string> candidate_ops = {
-      "Identity",
-      "Snapshot",
-      "Const",
-      "Conv2D",
-      "MaxPool",
-      "BiasAdd",
-      "Relu",
-      "Add",
-      "Mul",
-      "Sub",
-      "Rsqrt",
-      "Pad",
-      "Mean",
-      "AvgPool",
-      "ConcatV2",
-      "DepthwiseConv2dNative",
-      "FusedBatchNorm",
-      "FusedBatchNormV2",
-      // TODO(ben,jie): ...
+    "Identity",
+    "Snapshot",
+    "Const",
+    "Conv2D",
+    "MaxPool",
+    "BiasAdd",
+    "Relu",
+    "Add",
+    "Mul",
+    "Sub",
+    "Rsqrt",
+    "Pad",
+    "Mean",
+    "AvgPool",
+    "ConcatV2",
+    "DepthwiseConv2dNative",
+    "FusedBatchNorm",
+    "FusedBatchNormV2",
+    "Div",
+    "RealDiv",
+    "Rsqrt",
+    "Reciprocal",
+    "Exp",
+    "Log",
+    "Sqrt",
+    "Abs",
+    "Neg",
+#if NV_TENSORRT_MAJOR > 3
+    "MatMul",
+    "BatchMatMul",
+    "Softmax",
+    "Minimum",
+    "Maximum",
+    "TopKV2",
+    "Sum",
+    "Prod",
+    "Max",
+    "Min",
+#endif
+    // TODO(ben,jie): ...
   };
-  // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
+  // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
   return (candidate_ops.count(node->type_string()) ||
           PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
 }
@@ -152,7 +173,7 @@ tensorflow::Status ConvertCalibGraphToInferGraph(
           "Need to run graph with calibration data first!");
     }
     if (cres->calibrator_) {
-      cres->calibrator_->setDone();
+      cres->calibrator_->waitAndSetDone();
       cres->thr_->join();
       const auto& calibration_table =
           cres->calibrator_->getCalibrationTableAsString();
@@ -168,7 +189,7 @@ tensorflow::Status ConvertCalibGraphToInferGraph(
           "Can't get TRTCalibrator from resource manager!");
     }
     cres->Unref();
-    calib_rm->Cleanup(container_name);
+    TF_RETURN_IF_ERROR(calib_rm->Cleanup(container_name));
     }
   }
   return tensorflow::Status::OK();
 }
@@ -248,6 +269,7 @@ tensorflow::Status GetEngineInfo(
     const std::vector<tensorflow::Node*>& reverse_topo_order,
     EngineInfo* info) {
   std::vector<int> subgraph_node_ids;
+  std::set<int> added_const_node_ids;  // Used to prevent double insertion.
   std::set<string> segment_devices;
   int input_port = 0;
   int output_port = 0;
@@ -257,6 +279,7 @@ tensorflow::Status GetEngineInfo(
   // edge, thus there must not be any duplicates since source nodes of
   // input/output edges must be in different split of the graph.
   // TODO(aaroey): consider using node id and port instead.
+  // TODO(aaroey): using topo order instead of reverting reverse topo order.
   std::unordered_map<string, int> created_edges;
   for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend();
        ++it) {
@@ -275,19 +298,22 @@ tensorflow::Status GetEngineInfo(
                 << " neither have requested device nor assigned device";
       }
     }
-    int node_id = node->id();
-    subgraph_node_ids.push_back(node_id);
+    const int node_id = node->id();
     for (const auto edge : node->in_edges()) {
       auto input_node = edge->src();
-      if (segment_nodes.count(input_node->name()) == 0) {
+      if (segment_nodes.count(input_node->name()) == 0 &&
+          !edge->IsControlEdge() && !input_node->IsSource()) {
         // Add constant input node into the segment. We don't care if it has
        // other output edges going into other engines or TF nodes. Since we add
         // it only to the subsegment node list, not the subsegment itself, it
         // won't be removed from the graph. If it doesn't have any edges, TF
         // will prune it out.
         if (input_node->type_string() == "Const") {
-          subgraph_node_ids.push_back(input_node->id());
-        } else if (!edge->IsControlEdge() && !input_node->IsSource()) {
+          if (added_const_node_ids.count(input_node->id()) == 0) {
+            added_const_node_ids.insert(input_node->id());
+            subgraph_node_ids.push_back(input_node->id());
+          }
+        } else {
           string s(input_node->name());
           StrAppend(&s, ":", edge->src_output());
           VLOG(1) << "Input edge = " << s;
@@ -304,6 +330,9 @@ tensorflow::Status GetEngineInfo(
         }
       }
     }
+    // We need to add possible const input nodes before adding this node in
+    // order to keep the topological order.
+    subgraph_node_ids.push_back(node_id);
     for (const auto edge : node->out_edges()) {
       auto output_node = edge->dst();
       if (segment_nodes.count(output_node->name()) == 0 &&
@@ -350,9 +379,9 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
                                  nvinfer1::IGpuAllocator* alloc,
                                  int max_batch_size) {
   const auto& info = infos.at(pos);
-  std::vector<tensorflow::TensorShapeProto> out_shapes;
-  std::vector<tensorflow::TensorShapeProto> input_shapes;
-  std::vector<tensorflow::PartialTensorShape> shapes;
+  std::vector<tensorflow::TensorShapeProto> output_shape_protos;
+  std::vector<tensorflow::TensorShapeProto> input_shape_protos;
+  std::vector<tensorflow::PartialTensorShape> input_shapes;
   std::vector<tensorflow::NodeDefBuilder::NodeOut> inputs;
   std::vector<tensorflow::DataType> out_types;
   VLOG(1) << "Processing " << info.engine_name;
@@ -365,11 +394,11 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
       tensorflow::TensorShapeProto out_shape;
       // shape of the output node inside segment
       conn.inside_shape.AsProto(&out_shape);
-      if (out_shapes.size() <= conn.port_number) {
-        out_shapes.resize(conn.port_number + 1);
+      if (output_shape_protos.size() <= conn.port_number) {
+        output_shape_protos.resize(conn.port_number + 1);
         out_types.resize(conn.port_number + 1);
       }
-      out_shapes.at(conn.port_number) = out_shape;
+      output_shape_protos.at(conn.port_number) = out_shape;
       out_types.at(conn.port_number) = conn.connection_type;
       continue;
     }
@@ -377,12 +406,12 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
     // Set the shapes and data types of input edge.
     tensorflow::TensorShapeProto in_shape;
     conn.outside_shape.AsProto(&in_shape);
-    if (input_shapes.size() <= conn.port_number) {
+    if (input_shape_protos.size() <= conn.port_number) {
+      input_shape_protos.resize(conn.port_number + 1);
       input_shapes.resize(conn.port_number + 1);
-      shapes.resize(conn.port_number + 1);
     }
-    input_shapes.at(conn.port_number) = in_shape;
-    shapes.at(conn.port_number) = conn.outside_shape;
+    input_shape_protos.at(conn.port_number) = in_shape;
+    input_shapes.at(conn.port_number) = conn.outside_shape;
     string input_node = conn.outside_node_name;
     int input_port = conn.outside_port;
@@ -410,6 +439,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
     VLOG(1) << "Engine Input " << input_node << ":" << input_port << " -> "
             << info.engine_name << ":" << inputs.size();
     // Skip duplicate inputs.
+    // TODO(aaroey): use std::find instead. GetEngineInfo already remove
+    // duplicate connections, so here we should never find any duplicate?
     bool new_input = true;
     for (const auto& inp : inputs) {
       if (inp.node == input_node && inp.index == input_port) {
@@ -437,8 +468,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
     TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
         info.segment_graph_def,
         info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode,
-        max_batch_size, info.max_workspace_size_bytes, shapes, &trt_logger,
-        alloc, /*calibrator=*/nullptr, &engine,
+        max_batch_size, info.max_workspace_size_bytes, input_shapes,
+        &trt_logger, alloc, /*calibrator=*/nullptr, &engine,
         /*convert_successfully=*/nullptr));
     TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
     segment_string =
@@ -486,8 +517,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
   }
   tensorflow::NodeDef trt_node;
   tensorflow::Status status =
-      node_builder.Attr("input_shapes", input_shapes)
-          .Attr("output_shapes", out_shapes)
+      node_builder.Attr("input_shapes", input_shape_protos)
+          .Attr("output_shapes", output_shape_protos)
           .Attr("static_engine",
                 info.engine_type == EngineInfo::EngineType::TRTStatic)
           .Attr("segment_funcdef_name",
@@ -596,7 +627,9 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
                                 edge->src()->output_type(edge->src_output()));
     VLOG(1) << " input " << nout.node << ":" << nout.index
             << " dtype=" << tensorflow::DataTypeString(nout.data_type);
-    node_builder.Input({nout});
+    // nvcc complains that Input(<brace-enclosed initializer list>) is
+    // ambiguous, so do not use Input({nout}).
+    node_builder.Input(nout);
     TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0))
                            .Attr("index", i)
                            .Finalize(&nd));
@@ -652,7 +685,7 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
   // to allocators.
   // TODO(sami): when grappler devices become available else path will not be
   // necessary
-  auto pm = tensorflow::ProcessState::singleton();
+  auto pm = tensorflow::GPUProcessState::singleton();
   if (params.cluster) {  // get allocator
     tensorflow::Device* device = nullptr;
     if (params.cluster->GetDeviceSet()) {
@@ -704,6 +737,7 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
 }
 
 // Entry function from optimization pass.
+// TODO(aaeory): parameter should use pointer type.
 tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
   // Convert graphdef to graph.
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
@@ -721,7 +755,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
   segment_options.minimum_segment_size = params.minimum_segment_size;
   tensorflow::tensorrt::segment::SegmentNodesVector initial_segments;
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
-      &graph, IsTensorRTCandidate, segment_options, &initial_segments));
+      &graph, IsTensorRTCandidate, InputEdgeValidator(*params.graph_properties),
+      OutputEdgeValidator(), segment_options, &initial_segments));
   if (initial_segments.size() > 1) {
     VLOG(0) << "MULTIPLE tensorrt candidate conversion: "
             << initial_segments.size();
@@ -801,7 +836,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
    // The allocator is used to build the engine. The build and the built engine
    // will be destroyed after we get the serialized engine string, so it's fine
    // to use unique_ptr here.
-    std::unique_ptr<nvinfer1::IGpuAllocator> alloc;
+    std::unique_ptr<TRTBaseAllocator> alloc;
    auto device_alloc = GetDeviceAndAllocator(params, engine);
    int cuda_device_id = 0;
    if (device_alloc.first >= 0) {
@@ -823,8 +858,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
    } else {
      // Graph is not modified.
      LOG(WARNING) << "Engine creation for segment " << i << ", composed of "
-                  << converted_segments.at(i).first.size() << " nodes failed: "
-                  << status << ". Skipping...";
+                  << converted_segments.at(i).first.size()
+                  << " nodes failed: " << status << ". Skipping...";
    }
  }
  cudaSetDevice(old_cuda_device);
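
Note: the following is an illustrative, self-contained sketch, not part of the patch or of TensorFlow. It only demonstrates the const-input de-duplication pattern the change above introduces in GetEngineInfo(): a separate set of already-added constant node ids prevents inserting the same Const node twice, and each node is appended only after its constant inputs so the subgraph list stays in topological order. The Node struct and the node ids below are hypothetical stand-ins.

// Illustrative sketch only: mimics the added_const_node_ids bookkeeping from
// the patch using hypothetical stand-in types, not TensorFlow's real Node.
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Node {
  int id;
  std::string type;
  std::vector<int> const_input_ids;  // ids of Const nodes feeding this node
};

int main() {
  // Two segment nodes sharing the same Const input (id 7).
  const std::vector<Node> segment = {{1, "Conv2D", {7}}, {2, "BiasAdd", {7}}};

  std::vector<int> subgraph_node_ids;
  std::set<int> added_const_node_ids;  // used to prevent double insertion

  for (const Node& node : segment) {
    // Add const inputs before the node itself to keep topological order.
    for (const int const_id : node.const_input_ids) {
      if (added_const_node_ids.insert(const_id).second) {
        subgraph_node_ids.push_back(const_id);
      }
    }
    subgraph_node_ids.push_back(node.id);
  }

  // Prints "7 1 2": the shared Const appears once, before both consumers.
  for (const int id : subgraph_node_ids) std::cout << id << " ";
  std::cout << std::endl;
  return 0;
}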