diff options
author | Sami Kama <skama@nvidia.com> | 2018-06-12 11:50:16 -0700 |
---|---|---|
committer | Sami Kama <skama@nvidia.com> | 2018-06-12 11:50:16 -0700 |
commit | ee169363b5583ae7e16461aaf1588d6a0a9aa710 (patch) | |
tree | f27706a5551ab93835f03e27e3778659125df437 | |
parent | ae13b0560666df62967d87072e85619083a2f44b (diff) |
Address review comments and add a check for INT8 engine construction for calibration
4 files changed, 50 insertions, 25 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index 6ddfb01d9f..a102939a6e 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -173,7 +173,9 @@ tensorflow::Status ConvertGraphDefToTensorRT( tensorflow::grappler::GrapplerItem item; item.fetch = output_names; item.graph = graph_def; - + // grappler requires a virtual cluster with a proper GPU device + // in order to calculate flops>0 or fails with FATAL + // We add numbers from a Pascal card here to have flops>0 tensorflow::DeviceProperties device_properties; device_properties.set_type("GPU"); device_properties.mutable_environment()->insert({"architecture", "6"}); @@ -193,7 +195,7 @@ tensorflow::Status ConvertGraphDefToTensorRT( // break the graph for us rw_cfg.add_optimizers("constfold"); rw_cfg.add_optimizers("layout"); - + rw_cfg.set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE); tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg); tensorflow::GraphDef gdef; TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, &gdef)); @@ -385,8 +387,9 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, } } string segment_string; - if (info.engine_type == EngineInfo::EngineType::TRTStatic) { - // add static engine creation here + if (info.engine_type == EngineInfo::EngineType::TRTStatic || + info.precision_mode == INT8MODE) { + // Create static engine and for int8 test validity of the engine. tensorflow::tensorrt::Logger trt_logger; auto builder = std::shared_ptr<nvinfer1::IBuilder>( nvinfer1::createInferBuilder(trt_logger), [](nvinfer1::IBuilder* p) { @@ -402,7 +405,6 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, auto status = ConvertSubgraphToEngine(info.segment_graph_def, builder.get(), shapes, &engine, info.precision_mode); if (!status.ok()) { - LOG(ERROR) << "Engine conversion failed with " << status; return status; } if (engine) { @@ -414,6 +416,9 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, string((const char*)engine_data->data(), engine_data->size()); engine->destroy(); } + if (info.precision_mode == INT8MODE) { + segment_string = info.segment_graph_def.SerializeAsString(); + } } else { segment_string = info.segment_graph_def.SerializeAsString(); } @@ -587,9 +592,9 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary( auto native_segment = fdeflib.add_function(); TF_RETURN_IF_ERROR(tensorflow::GraphToFunctionDef( sgraph, StrCat(name, "_native_segment"), native_segment)); - if (VLOG_IS_ON(3)) { - VLOG(3) << name << " Function_Def "; - VLOG(3) << native_segment->DebugString(); + if (VLOG_IS_ON(7)) { + VLOG(7) << name << " Function_Def "; + VLOG(7) << native_segment->DebugString(); } TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib)); return tensorflow::Status::OK(); @@ -692,18 +697,24 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { auto pm = tensorflow::ProcessState::singleton(); // this should be instantiated by now auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1); - VLOG(0) << "Got an allocator for device tf_device=" << tf_gpu_id.value() + VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value() << " cuda device= " << cuda_device_id << " at " << dev_allocator; alloc.reset(new TRTDeviceAllocator(dev_allocator)); } } cudaSetDevice(cuda_device_id); - CreateTRTNode(&graph, engine_segments, i, trt_node, alloc.get(), - params.max_batch_size); - const auto& internal_nodes = segments.at(i).first; - for (auto node_id : internal_nodes) { - graph.RemoveNode(node_map.at(node_id)); + auto status = CreateTRTNode(&graph, engine_segments, i, trt_node, + alloc.get(), params.max_batch_size); + if (status.ok()) { + const auto& internal_nodes = segments.at(i).first; + for (auto node_id : internal_nodes) { + graph.RemoveNode(node_map.at(node_id)); + } + } else { + LOG(WARNING) << "Engine creation for segment " << i << ", composed of " + << segments.at(i).first.size() << " nodes failed. Skipping"; + VLOG(1) << "Failure reason " << status; } } cudaSetDevice(old_cuda_device); diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 3404dde4d9..a38a5e0797 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -2204,9 +2204,11 @@ tensorflow::Status ConvertSubgraphToEngine( input_dim_pseudo_chw.nbDims = shape.dims() - 1; nvinfer1::ITensor* input_tensor = converter.network()->addInput( node_name.c_str(), dtype, input_dim_pseudo_chw); - if (!input_tensor) + if (!input_tensor) { return tensorflow::errors::InvalidArgument( - "Failed to create Input layer"); + StrCat("Failed to create Input layer tensor ", node_name, + " rank=", shape.dims()-1)); + } VLOG(1) << "Input tensor name :" << node_name; if (!converter.insert_input_tensor(node_name, input_tensor)) { return tensorflow::errors::AlreadyExists( diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index 76153886a8..2491f34d5a 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -62,7 +62,7 @@ void* GetTensorAddress(const Tensor* tensor_ptr) { TYPECASE(tensorflow::DT_HALF, tensor_ptr, dest_ptr); TYPECASE(tensorflow::DT_INT8, tensor_ptr, dest_ptr); default: { - LOG(FATAL) << "Unsupported Data type " + LOG(ERROR) << "Unsupported Data type " << tensorflow::DataTypeString(tensor_type); return nullptr; } @@ -217,6 +217,11 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx, for (int i = 0; i < num_inputs; i++) { const Tensor& t = ctx->input(i); void* data_address = GetTensorAddress(&t); + if (data_address == nullptr) { + ctx->SetStatus(tensorflow::errors::InvalidArgument( + StrCat("Unsupported data type encountered in input ", i))); + return; + } const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx); CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); // use the tensor so FW keeps it @@ -234,7 +239,7 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx, return; } -int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext *ctx){ +int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) { int num_batch = ctx->input(0).shape().dim_size(0); int smallest_engine = 0; for (const auto i : cached_engine_batches_) { @@ -274,9 +279,9 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, } int num_binding = ctx->num_inputs() + ctx->num_outputs(); std::vector<void*> buffers(num_binding); - int smallest_engine=GetEngineBatch(ctx); - if(smallest_engine<0)return; - int num_batch=ctx->input(0).shape().dim_size(0); + int smallest_engine = GetEngineBatch(ctx); + if (smallest_engine < 0) return; + int num_batch = ctx->input(0).shape().dim_size(0); size_t binding_index; auto engine_ctx_pair = GetEngine(smallest_engine, ctx, fixed_input_size_); auto trt_engine_ptr_ = engine_ctx_pair.first; @@ -406,8 +411,10 @@ TRTEngineOp::~TRTEngineOp() { } TRTEngineOp::EngineCtxPair TRTEngineOp::GetEngine(int batch_size, - OpKernelContext* ctx, - bool ignore_dim_change) { + OpKernelContext* ctx, + bool ignore_dim_change) { + // TODO(sami): This method needs to be re-written to use resource manager and + // with LRU mechanism option. tensorflow::mutex_lock lock(engine_mutex_); if (static_engine_) { if (engine_map_.size()) { @@ -550,6 +557,10 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx); CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); void* device_address = GetTensorAddress(device_tensor); + if (device_address == nullptr) { + return tensorflow::errors::InvalidArgument( + StrCat("Unsupported data type encountered in input ", i)); + } device_buffers_.emplace( StrCat("InputPH_", i), std::pair<void*, size_t>(device_address, device_tensor->TotalBytes())); @@ -566,7 +577,9 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( tensorflow::tensorrt::convert::INT8MODE); // will loop until we // terminate calibration if (!s.ok()) { - LOG(ERROR) << "Calibration thread failed with " << s; + LOG(ERROR) + << "Calibration failed. Engine will not be calibrated! Error is" << s; + cres->calibrator_->setDone(); // ignore further pushes } VLOG(1) << "Calibration loop terminated " << label; }); diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py index a03962dda2..c9edc03431 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert.py @@ -168,7 +168,6 @@ def calib_graph_to_infer_graph(calibration_graph_def): for n in calibration_graph_def.node: if n.op == "TRTEngineOp": is_calib_graph = len(n.attr["calibration_data"].s) == 0 - break if not is_calib_graph: tf_logging.error( "Not a calib graph. Doesn't seem to contain any calibration nodes.") |