Diffstat (limited to 'tensorflow/contrib/tensorrt/convert/convert_graph.cc')
 tensorflow/contrib/tensorrt/convert/convert_graph.cc | 314 ++++++++--------
 1 file changed, 160 insertions(+), 154 deletions(-)
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 17b32c0e30..896968647e 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -29,9 +29,6 @@ limitations under the License.
#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
#include "tensorflow/contrib/tensorrt/segment/segment.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/graph_to_functiondef.h"
#include "tensorflow/core/framework/node_def_builder.h"
@@ -86,27 +83,48 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) {
// TODO(jie): Segmentation shouldn't be associated with op name.
// Split it into a registration for each kernel.
static const std::set<string> candidate_ops = {
- "Identity",
- "Snapshot",
- "Const",
- "Conv2D",
- "MaxPool",
- "BiasAdd",
- "Relu",
- "Add",
- "Mul",
- "Sub",
- "Rsqrt",
- "Pad",
- "Mean",
- "AvgPool",
- "ConcatV2",
- "DepthwiseConv2dNative",
- "FusedBatchNorm",
- "FusedBatchNormV2",
- // TODO(ben,jie): ...
+ "Identity",
+ "Snapshot",
+ "Const",
+ "Conv2D",
+ "MaxPool",
+ "BiasAdd",
+ "Relu",
+ "Add",
+ "Mul",
+ "Sub",
+ "Rsqrt",
+ "Pad",
+ "Mean",
+ "AvgPool",
+ "ConcatV2",
+ "DepthwiseConv2dNative",
+ "FusedBatchNorm",
+ "FusedBatchNormV2",
+ "Div",
+ "RealDiv",
+ "Rsqrt",
+ "Reciprocal",
+ "Exp",
+ "Log",
+ "Sqrt",
+ "Abs",
+ "Neg",
+#if NV_TENSORRT_MAJOR > 3
+ "MatMul",
+ "BatchMatMul",
+ "Softmax",
+ "Minimum",
+ "Maximum",
+ "TopKV2",
+ "Sum",
+ "Prod",
+ "Max",
+ "Min",
+#endif
+ // TODO(ben,jie): ...
};
- // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
+ // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
return (candidate_ops.count(node->type_string()) ||
PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
}
@@ -142,7 +160,7 @@ tensorflow::Status ConvertCalibGraphToInferGraph(
auto n = infer_graph->mutable_node(i);
if (n->op() == "TRTEngineOp") {
VLOG(1) << "Processing " << n->name();
- string container_name = n->attr().at("segment_funcdef_name").s();
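+ // Take the attr string by const reference to avoid copying it.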
+ const string& container_name = n->attr().at("segment_funcdef_name").s();
TRTCalibrationResource* cres = nullptr;
auto status = calib_rm->Lookup(container_name, "Calibrator", &cres);
if (!status.ok()) {
@@ -168,25 +186,50 @@ tensorflow::Status ConvertCalibGraphToInferGraph(
"Can't get TRTCalibrator from resource manager!");
}
cres->Unref();
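+ // The calibration resources are no longer needed once the engine has been
+ // extracted, so remove them from the resource manager.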
+ TF_RETURN_IF_ERROR(calib_rm->Cleanup(container_name));
}
}
return tensorflow::Status::OK();
}
-// Entry function from Python.
tensorflow::Status ConvertGraphDefToTensorRT(
const tensorflow::GraphDef& graph_def,
const std::vector<string>& output_names, size_t max_batch_size,
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
int precision_mode, int minimum_segment_size, bool is_dyn_op,
int max_cached_engines, std::vector<int> cached_engine_batches) {
- // optimization pass
+ // Create GrapplerItem.
tensorflow::grappler::GrapplerItem item;
item.fetch = output_names;
item.graph = graph_def;
- // grappler requires a virtual cluster with a proper GPU device
- // in order to calculate flops>0 or fails with FATAL
- // We add numbers from a Pascal card here to have flops>0
+
+ // TODO(aaroey): we should have used a single machine cluster like the
+ // following, but the problem is that wrap_conversion would then depend on
+ // direct_session and cause double linking problems. To fix this we need to
+ // fix or get rid of the swig dependency. Here we use VirtualCluster
+ // as a workaround, and we need to create a session to initialize the
+ // underlying device before calling this method.
+#if 0
+ // Create single machine cluster. Note that this will create a session and
+ // initialize the gpu devices.
+ const int num_cpu_cores =
+ tensorflow::grappler::GetNumAvailableLogicalCPUCores();
+ const int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
+ VLOG(2) << "cpu_cores: " << num_cpu_cores;
+ VLOG(2) << "gpus: " << num_gpus;
+ const int timeout_s = 60 * 10;
+ std::unique_ptr<tensorflow::grappler::Cluster> cluster(
+ new tensorflow::grappler::SingleMachine(
+ timeout_s, num_cpu_cores, num_gpus));
+ // These settings are the defaults in tensorflow/python/grappler/cluster.py.
+ cluster->DisableDetailedStats(true);
+ cluster->AllowSoftPlacement(true);
+ cluster->SetNumWarmupSteps(10);
+ TF_RETURN_IF_ERROR(cluster->Provision());
+#else
+ // Create virtual cluster. Grappler requires a virtual cluster with a proper
+ // GPU device in order to calculate flops > 0, otherwise it fails with a
+ // FATAL error in dbg mode. We add numbers from a Pascal card here to have
+ // flops > 0.
tensorflow::DeviceProperties device_properties;
device_properties.set_type("GPU");
device_properties.mutable_environment()->insert({"architecture", "6"});
@@ -195,47 +238,43 @@ tensorflow::Status ConvertGraphDefToTensorRT(
std::unique_ptr<tensorflow::grappler::Cluster> cluster(
new tensorflow::grappler::VirtualCluster(
{{"/GPU:0", device_properties}}));
+#endif
- // single machine
- int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
- int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
- VLOG(2) << "cpu_cores: " << num_cpu_cores;
- VLOG(2) << "gpus: " << num_gpus;
+ // Create RewriterConfig.
tensorflow::RewriterConfig rw_cfg;
- // use only const folding and layout for the time being since new optimizers
- // break the graph for us
+ // TODO(aaroey): use only const folding and layout for the time being since
+ // new optimizers break the graph for trt.
rw_cfg.add_optimizers("constfold");
rw_cfg.add_optimizers("layout");
- rw_cfg.set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE);
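+ // Add the TensorRT conversion pass ("TensorRTOptimizer") as a custom
+ // grappler optimizer, forwarding the conversion options through its
+ // parameter map.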
+ auto optimizer = rw_cfg.add_custom_optimizers();
+ optimizer->set_name("TensorRTOptimizer");
+ auto& parameters = *(optimizer->mutable_parameter_map());
+ parameters["minimum_segment_size"].set_i(minimum_segment_size);
+ parameters["max_batch_size"].set_i(max_batch_size);
+ parameters["is_dynamic_op"].set_b(is_dyn_op);
+ parameters["max_workspace_size_bytes"].set_i(max_workspace_size_bytes);
+ TF_RETURN_IF_ERROR(GetPrecisionModeName(
+ precision_mode, parameters["precision_mode"].mutable_s()));
+ parameters["maximum_cached_engines"].set_i(max_cached_engines);
+ if (!cached_engine_batches.empty()) {
+ auto list = parameters["cached_engine_batches"].mutable_list();
+ for (const int batch : cached_engine_batches) {
+ list->add_i(batch);
+ }
+ }
+
+ // Run optimizer.
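+ // Besides constfold and layout, this also runs the TensorRTOptimizer pass
+ // configured above, which performs the actual conversion, so no separate
+ // call to ConvertAfterShapes() is needed afterwards.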
tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
- tensorflow::GraphDef gdef;
- TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, &gdef));
- item.graph = gdef;
-
- // AJ refactoring shape inference through grappler/GraphProperties.
- tensorflow::grappler::GraphProperties static_graph_properties(item);
- TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
- // Build full graph
- ConversionParams cp;
- cp.input_graph_def = &gdef;
- cp.output_names = &output_names;
- cp.max_batch_size = max_batch_size;
- cp.output_graph_def = new_graph_def;
- cp.precision_mode = precision_mode;
- cp.is_dyn_op = is_dyn_op;
- cp.max_cached_engines = max_cached_engines;
- cp.cached_engine_batches = cached_engine_batches;
- cp.minimum_segment_size = minimum_segment_size;
- cp.graph_properties = &static_graph_properties;
- cp.max_workspace_size_bytes = max_workspace_size_bytes;
+ TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, new_graph_def));
+
if (VLOG_IS_ON(5)) {
std::fstream f;
f.open("TRTConversionInput.pb",
std::fstream::out | std::fstream::binary | std::fstream::trunc);
- f << gdef.SerializeAsString();
+ f << new_graph_def->SerializeAsString();
f.close();
}
- return ConvertAfterShapes(cp);
+ return Status::OK();
}
// Function to get subsegment information structure.
@@ -247,6 +286,7 @@ tensorflow::Status GetEngineInfo(
const std::vector<tensorflow::Node*>& reverse_topo_order,
EngineInfo* info) {
std::vector<int> subgraph_node_ids;
+ std::set<int> added_const_node_ids; // Used to prevent double insertion.
std::set<string> segment_devices;
int input_port = 0;
int output_port = 0;
@@ -256,6 +296,7 @@ tensorflow::Status GetEngineInfo(
// edge, thus there must not be any duplicates since source nodes of
// input/output edges must be in different splits of the graph.
// TODO(aaroey): consider using node id and port instead.
+ // TODO(aaroey): use topo order instead of reversing the reverse topo order.
std::unordered_map<string, int> created_edges;
for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend();
++it) {
@@ -274,19 +315,22 @@ tensorflow::Status GetEngineInfo(
<< " neither have requested device nor assigned device";
}
}
- int node_id = node->id();
- subgraph_node_ids.push_back(node_id);
+ const int node_id = node->id();
for (const auto edge : node->in_edges()) {
auto input_node = edge->src();
- if (segment_nodes.count(input_node->name()) == 0) {
+ if (segment_nodes.count(input_node->name()) == 0 &&
+ !edge->IsControlEdge() && !input_node->IsSource()) {
// Add constant input node into the segment. We don't care if it has
// other output edges going into other engines or TF nodes. Since we add
// it only to the subsegment node list, not the subsegment itself, it
// won't be removed from the graph. If it doesn't have any edges, TF
// will prune it out.
if (input_node->type_string() == "Const") {
- subgraph_node_ids.push_back(input_node->id());
- } else if (!edge->IsControlEdge() && !input_node->IsSource()) {
+ if (added_const_node_ids.count(input_node->id()) == 0) {
+ added_const_node_ids.insert(input_node->id());
+ subgraph_node_ids.push_back(input_node->id());
+ }
+ } else {
string s(input_node->name());
StrAppend(&s, ":", edge->src_output());
VLOG(1) << "Input edge = " << s;
@@ -303,6 +347,9 @@ tensorflow::Status GetEngineInfo(
}
}
}
+ // We need to add possible const input nodes before adding this node in
+ // order to keep the topological order.
+ subgraph_node_ids.push_back(node_id);
for (const auto edge : node->out_edges()) {
auto output_node = edge->dst();
if (segment_nodes.count(output_node->name()) == 0 &&
@@ -349,9 +396,9 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
nvinfer1::IGpuAllocator* alloc,
int max_batch_size) {
const auto& info = infos.at(pos);
- std::vector<tensorflow::TensorShapeProto> out_shapes;
- std::vector<tensorflow::TensorShapeProto> input_shapes;
- std::vector<tensorflow::PartialTensorShape> shapes;
+ std::vector<tensorflow::TensorShapeProto> output_shape_protos;
+ std::vector<tensorflow::TensorShapeProto> input_shape_protos;
+ std::vector<tensorflow::PartialTensorShape> input_shapes;
std::vector<tensorflow::NodeDefBuilder::NodeOut> inputs;
std::vector<tensorflow::DataType> out_types;
VLOG(1) << "Processing " << info.engine_name;
@@ -364,11 +411,11 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
tensorflow::TensorShapeProto out_shape;
// shape of the output node inside segment
conn.inside_shape.AsProto(&out_shape);
- if (out_shapes.size() <= conn.port_number) {
- out_shapes.resize(conn.port_number + 1);
+ if (output_shape_protos.size() <= conn.port_number) {
+ output_shape_protos.resize(conn.port_number + 1);
out_types.resize(conn.port_number + 1);
}
- out_shapes.at(conn.port_number) = out_shape;
+ output_shape_protos.at(conn.port_number) = out_shape;
out_types.at(conn.port_number) = conn.connection_type;
continue;
}
@@ -376,12 +423,12 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
// Set the shapes and data types of input edge.
tensorflow::TensorShapeProto in_shape;
conn.outside_shape.AsProto(&in_shape);
- if (input_shapes.size() <= conn.port_number) {
+ if (input_shape_protos.size() <= conn.port_number) {
+ input_shape_protos.resize(conn.port_number + 1);
input_shapes.resize(conn.port_number + 1);
- shapes.resize(conn.port_number + 1);
}
- input_shapes.at(conn.port_number) = in_shape;
- shapes.at(conn.port_number) = conn.outside_shape;
+ input_shape_protos.at(conn.port_number) = in_shape;
+ input_shapes.at(conn.port_number) = conn.outside_shape;
string input_node = conn.outside_node_name;
int input_port = conn.outside_port;
@@ -409,6 +456,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
VLOG(1) << "Engine Input " << input_node << ":" << input_port << " -> "
<< info.engine_name << ":" << inputs.size();
// Skip duplicate inputs.
+ // TODO(aaroey): use std::find instead. GetEngineInfo already removes
+ // duplicate connections, so we should never find any duplicates here?
bool new_input = true;
for (const auto& inp : inputs) {
if (inp.node == input_node && inp.index == input_port) {
@@ -436,8 +485,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
info.segment_graph_def,
info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode,
- max_batch_size, info.max_workspace_size_bytes, shapes, &trt_logger,
- alloc, /*calibrator=*/nullptr, &engine,
+ max_batch_size, info.max_workspace_size_bytes, input_shapes,
+ &trt_logger, alloc, /*calibrator=*/nullptr, &engine,
/*convert_successfully=*/nullptr));
TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
segment_string =
@@ -453,21 +502,10 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
// TODO(aaroey): use enum instead, and add a helper method to do the
// conversion.
string prec_string;
- switch (info.precision_mode) {
- case FP32MODE:
- prec_string = "FP32";
- break;
- case FP16MODE:
- prec_string = "FP16";
- break;
- case INT8MODE:
- prec_string = "INT8";
- if (!TRTResourceManager::instance()->getManager("TRTCalibration")) {
- LOG(ERROR) << "Failed to construct calibration storage";
- }
- break;
- default:
- return tensorflow::errors::OutOfRange("Unknown precision mode");
+ TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string));
+ if (info.precision_mode == INT8MODE &&
+ !TRTResourceManager::instance()->getManager("TRTCalibration")) {
+ LOG(ERROR) << "Failed to construct calibration storage";
}
tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp");
if (!info.device.empty()) node_builder.Device(info.device);
@@ -485,8 +523,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
}
tensorflow::NodeDef trt_node;
tensorflow::Status status =
- node_builder.Attr("input_shapes", input_shapes)
- .Attr("output_shapes", out_shapes)
+ node_builder.Attr("input_shapes", input_shape_protos)
+ .Attr("output_shapes", output_shape_protos)
.Attr("static_engine",
info.engine_type == EngineInfo::EngineType::TRTStatic)
.Attr("segment_funcdef_name",
@@ -595,7 +633,9 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
edge->src()->output_type(edge->src_output()));
VLOG(1) << " input " << nout.node << ":" << nout.index
<< " dtype=" << tensorflow::DataTypeString(nout.data_type);
- node_builder.Input({nout});
+ // nvcc complains that Input(<brace-enclosed initializer list>) is
+ // ambiguous, so do not use Input({nout}).
+ node_builder.Input(nout);
TF_RETURN_IF_ERROR(node_builder.Attr("T", node->output_type(0))
.Attr("index", i)
.Finalize(&nd));
@@ -632,77 +672,42 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
}
std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
- ConversionParams& params, EngineInfo& engine) {
+ const ConversionParams& params, const EngineInfo& engine) {
int cuda_device_id = -1;
- auto check_device_id = [](int tfid) -> int {
- tensorflow::TfGpuId tf_gpu_id(tfid);
- CudaGpuId cuda_gpu_id;
- Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
- if (s.ok()) {
- VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device "
- << cuda_gpu_id.value();
- return cuda_gpu_id.value();
- }
- VLOG(2) << "TF GPU with id " << tfid << " do not exist " << s;
- return -1;
- };
tensorflow::Allocator* dev_allocator = nullptr;
- // we need to us PM here since in python path there is no way to get
- // to allocators.
- // TODO(sami): when grappler devices become available else path will not be
- // necessary
- auto pm = tensorflow::GPUProcessState::singleton();
- if (params.cluster) { // get allocator
- tensorflow::Device* device = nullptr;
- if (params.cluster->GetDeviceSet()) {
- device = params.cluster->GetDeviceSet()->FindDeviceByName(engine.device);
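+ // Resolve the engine's assigned device through the cluster's DeviceSet,
+ // matching on the parsed device name rather than an exact string.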
+ if (params.cluster) {
+ std::vector<tensorflow::Device*> devices;
+ if (!engine.device.empty() && params.cluster->GetDeviceSet()) {
+ DeviceNameUtils::ParsedName parsed_name;
+ if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) &&
+ parsed_name.has_id) {
+ params.cluster->GetDeviceSet()->FindMatchingDevices(parsed_name,
+ &devices);
+ }
}
- if (device) {
+ if (!devices.empty()) {
+ if (devices.size() > 1) {
+ string msg = "Found multiple matching devices using name '";
+ StrAppend(&msg, engine.device, "': ");
+ for (auto d : devices) StrAppend(&msg, d->name(), ", ");
+ StrAppend(&msg, ". Will get the allocator from first one.");
+ LOG(WARNING) << msg;
+ }
tensorflow::AllocatorAttributes alloc_attr;
- dev_allocator = device->GetAllocator(alloc_attr);
- VLOG(1) << "Using allocator " << dev_allocator->Name();
+ cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id;
+ dev_allocator = devices[0]->GetAllocator(alloc_attr);
+ VLOG(1) << "Using allocator " << dev_allocator->Name()
+ << " and cuda_device_id " << cuda_device_id;
} else {
LOG(WARNING) << "Cluster is set but device '" << engine.device
<< "' is not found in the cluster";
}
- } else { // cluster not found, possibly a python call
- VLOG(1) << "Cluster is not set, probably called from python";
- int found_device = 0;
- bool try_gpu_ids = true;
- // if device is set, try to find the device. Might be a problem for multi
- // host case but TensorRT do not support multi host setups yet.
- if (!engine.device.empty()) {
- DeviceNameUtils::ParsedName parsed_name;
- if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name)) {
- cuda_device_id = parsed_name.has_id ? parsed_name.id : -1;
- }
- try_gpu_ids = !parsed_name.has_id;
- }
- if (try_gpu_ids) {
- while (found_device < 100) {
- cuda_device_id = check_device_id(found_device);
- if (cuda_device_id >= 0) break;
- found_device++;
- }
- }
- if (found_device == 100) {
- LOG(ERROR) << " Can't find a GPU device to work with. Please "
- "instantiate a session to initialize devices";
- return std::make_pair(cuda_device_id, dev_allocator);
- }
- LOG(WARNING)
- << "Can't determine the device, constructing an allocator at device "
- << found_device;
- tensorflow::GPUOptions gpuoptions;
- // this will be a noop if device is already initialized
- gpuoptions.set_allow_growth(true);
- tensorflow::TfGpuId tf_gpu_id(found_device);
- dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
}
return std::make_pair(cuda_device_id, dev_allocator);
}
// Entry function from optimization pass.
+// TODO(aaroey): parameter should use pointer type.
tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
// Convert graphdef to graph.
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
@@ -720,7 +725,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
segment_options.minimum_segment_size = params.minimum_segment_size;
tensorflow::tensorrt::segment::SegmentNodesVector initial_segments;
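+ // The input/output edge validators added below use the inferred graph
+ // properties to reject segment boundary edges that TRT cannot handle.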
TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
- &graph, IsTensorRTCandidate, segment_options, &initial_segments));
+ &graph, IsTensorRTCandidate, InputEdgeValidator(*params.graph_properties),
+ OutputEdgeValidator(), segment_options, &initial_segments));
if (initial_segments.size() > 1) {
VLOG(0) << "MULTIPLE tensorrt candidate conversion: "
<< initial_segments.size();
@@ -800,7 +806,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
// The allocator is used to build the engine. The build and the built engine
// will be destroyed after we get the serialized engine string, so it's fine
// to use unique_ptr here.
- std::unique_ptr<nvinfer1::IGpuAllocator> alloc;
+ std::unique_ptr<TRTBaseAllocator> alloc;
auto device_alloc = GetDeviceAndAllocator(params, engine);
int cuda_device_id = 0;
if (device_alloc.first >= 0) {