about summary refs log tree commit diff homepage
diff options
context:
space:
mode:
authorGravatar Sami Kama <skama@nvidia.com>2018-06-12 11:50:16 -0700
committerGravatar Sami Kama <skama@nvidia.com>2018-06-12 11:50:16 -0700
commitee169363b5583ae7e16461aaf1588d6a0a9aa710 (patch)
treef27706a5551ab93835f03e27e3778659125df437
parentae13b0560666df62967d87072e85619083a2f44b (diff)
Address review comments and add a check for INT8 engine construction for calibration
-rw-r--r--tensorflow/contrib/tensorrt/convert/convert_graph.cc39
-rw-r--r--tensorflow/contrib/tensorrt/convert/convert_nodes.cc6
-rw-r--r--tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc29
-rw-r--r--tensorflow/contrib/tensorrt/python/trt_convert.py1
4 files changed, 50 insertions, 25 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 6ddfb01d9f..a102939a6e 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -173,7 +173,9 @@ tensorflow::Status ConvertGraphDefToTensorRT(
tensorflow::grappler::GrapplerItem item;
item.fetch = output_names;
item.graph = graph_def;
-
+ // grappler requires a virtual cluster with a proper GPU device
+ // in order to calculate flops>0 or fails with FATAL
+ // We add numbers from a Pascal card here to have flops>0
tensorflow::DeviceProperties device_properties;
device_properties.set_type("GPU");
device_properties.mutable_environment()->insert({"architecture", "6"});
@@ -193,7 +195,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
// break the graph for us
rw_cfg.add_optimizers("constfold");
rw_cfg.add_optimizers("layout");
-
+ rw_cfg.set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE);
tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
tensorflow::GraphDef gdef;
TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, &gdef));
@@ -385,8 +387,9 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
}
}
string segment_string;
- if (info.engine_type == EngineInfo::EngineType::TRTStatic) {
- // add static engine creation here
+ if (info.engine_type == EngineInfo::EngineType::TRTStatic ||
+ info.precision_mode == INT8MODE) {
+ // Create static engine and for int8 test validity of the engine.
tensorflow::tensorrt::Logger trt_logger;
auto builder = std::shared_ptr<nvinfer1::IBuilder>(
nvinfer1::createInferBuilder(trt_logger), [](nvinfer1::IBuilder* p) {
@@ -402,7 +405,6 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
auto status = ConvertSubgraphToEngine(info.segment_graph_def, builder.get(),
shapes, &engine, info.precision_mode);
if (!status.ok()) {
- LOG(ERROR) << "Engine conversion failed with " << status;
return status;
}
if (engine) {
@@ -414,6 +416,9 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
string((const char*)engine_data->data(), engine_data->size());
engine->destroy();
}
+ if (info.precision_mode == INT8MODE) {
+ segment_string = info.segment_graph_def.SerializeAsString();
+ }
} else {
segment_string = info.segment_graph_def.SerializeAsString();
}
@@ -587,9 +592,9 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
auto native_segment = fdeflib.add_function();
TF_RETURN_IF_ERROR(tensorflow::GraphToFunctionDef(
sgraph, StrCat(name, "_native_segment"), native_segment));
- if (VLOG_IS_ON(3)) {
- VLOG(3) << name << " Function_Def ";
- VLOG(3) << native_segment->DebugString();
+ if (VLOG_IS_ON(7)) {
+ VLOG(7) << name << " Function_Def ";
+ VLOG(7) << native_segment->DebugString();
}
TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib));
return tensorflow::Status::OK();
@@ -692,18 +697,24 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
auto pm = tensorflow::ProcessState::singleton();
// this should be instantiated by now
auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
- VLOG(0) << "Got an allocator for device tf_device=" << tf_gpu_id.value()
+ VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value()
<< " cuda device= " << cuda_device_id << " at "
<< dev_allocator;
alloc.reset(new TRTDeviceAllocator(dev_allocator));
}
}
cudaSetDevice(cuda_device_id);
- CreateTRTNode(&graph, engine_segments, i, trt_node, alloc.get(),
- params.max_batch_size);
- const auto& internal_nodes = segments.at(i).first;
- for (auto node_id : internal_nodes) {
- graph.RemoveNode(node_map.at(node_id));
+ auto status = CreateTRTNode(&graph, engine_segments, i, trt_node,
+ alloc.get(), params.max_batch_size);
+ if (status.ok()) {
+ const auto& internal_nodes = segments.at(i).first;
+ for (auto node_id : internal_nodes) {
+ graph.RemoveNode(node_map.at(node_id));
+ }
+ } else {
+ LOG(WARNING) << "Engine creation for segment " << i << ", composed of "
+ << segments.at(i).first.size() << " nodes failed. Skipping";
+ VLOG(1) << "Failure reason " << status;
}
}
cudaSetDevice(old_cuda_device);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 3404dde4d9..a38a5e0797 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -2204,9 +2204,11 @@ tensorflow::Status ConvertSubgraphToEngine(
input_dim_pseudo_chw.nbDims = shape.dims() - 1;
nvinfer1::ITensor* input_tensor = converter.network()->addInput(
node_name.c_str(), dtype, input_dim_pseudo_chw);
- if (!input_tensor)
+ if (!input_tensor) {
return tensorflow::errors::InvalidArgument(
- "Failed to create Input layer");
+ StrCat("Failed to create Input layer tensor ", node_name,
+ " rank=", shape.dims()-1));
+ }
VLOG(1) << "Input tensor name :" << node_name;
if (!converter.insert_input_tensor(node_name, input_tensor)) {
return tensorflow::errors::AlreadyExists(
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 76153886a8..2491f34d5a 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -62,7 +62,7 @@ void* GetTensorAddress(const Tensor* tensor_ptr) {
TYPECASE(tensorflow::DT_HALF, tensor_ptr, dest_ptr);
TYPECASE(tensorflow::DT_INT8, tensor_ptr, dest_ptr);
default: {
- LOG(FATAL) << "Unsupported Data type "
+ LOG(ERROR) << "Unsupported Data type "
<< tensorflow::DataTypeString(tensor_type);
return nullptr;
}
@@ -217,6 +217,11 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx,
for (int i = 0; i < num_inputs; i++) {
const Tensor& t = ctx->input(i);
void* data_address = GetTensorAddress(&t);
+ if (data_address == nullptr) {
+ ctx->SetStatus(tensorflow::errors::InvalidArgument(
+ StrCat("Unsupported data type encountered in input ", i)));
+ return;
+ }
const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
CHECK_EQ(t.TotalBytes(),
device_tensor->TotalBytes()); // use the tensor so FW keeps it
@@ -234,7 +239,7 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx,
return;
}
-int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext *ctx){
+int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) {
int num_batch = ctx->input(0).shape().dim_size(0);
int smallest_engine = 0;
for (const auto i : cached_engine_batches_) {
@@ -274,9 +279,9 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
}
int num_binding = ctx->num_inputs() + ctx->num_outputs();
std::vector<void*> buffers(num_binding);
- int smallest_engine=GetEngineBatch(ctx);
- if(smallest_engine<0)return;
- int num_batch=ctx->input(0).shape().dim_size(0);
+ int smallest_engine = GetEngineBatch(ctx);
+ if (smallest_engine < 0) return;
+ int num_batch = ctx->input(0).shape().dim_size(0);
size_t binding_index;
auto engine_ctx_pair = GetEngine(smallest_engine, ctx, fixed_input_size_);
auto trt_engine_ptr_ = engine_ctx_pair.first;
@@ -406,8 +411,10 @@ TRTEngineOp::~TRTEngineOp() {
}
TRTEngineOp::EngineCtxPair TRTEngineOp::GetEngine(int batch_size,
- OpKernelContext* ctx,
- bool ignore_dim_change) {
+ OpKernelContext* ctx,
+ bool ignore_dim_change) {
+ // TODO(sami): This method needs to be re-written to use resource manager and
+ // with LRU mechanism option.
tensorflow::mutex_lock lock(engine_mutex_);
if (static_engine_) {
if (engine_map_.size()) {
@@ -550,6 +557,10 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
void* device_address = GetTensorAddress(device_tensor);
+ if (device_address == nullptr) {
+ return tensorflow::errors::InvalidArgument(
+ StrCat("Unsupported data type encountered in input ", i));
+ }
device_buffers_.emplace(
StrCat("InputPH_", i),
std::pair<void*, size_t>(device_address, device_tensor->TotalBytes()));
@@ -566,7 +577,9 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
tensorflow::tensorrt::convert::INT8MODE); // will loop until we
// terminate calibration
if (!s.ok()) {
- LOG(ERROR) << "Calibration thread failed with " << s;
+ LOG(ERROR)
+ << "Calibration failed. Engine will not be calibrated! Error is" << s;
+ cres->calibrator_->setDone(); // ignore further pushes
}
VLOG(1) << "Calibration loop terminated " << label;
});
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index a03962dda2..c9edc03431 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -168,7 +168,6 @@ def calib_graph_to_infer_graph(calibration_graph_def):
for n in calibration_graph_def.node:
if n.op == "TRTEngineOp":
is_calib_graph = len(n.attr["calibration_data"].s) == 0
- break
if not is_calib_graph:
tf_logging.error(
"Not a calib graph. Doesn't seem to contain any calibration nodes.")