diff options
author | 2018-06-19 12:14:10 -0700 | |
---|---|---|
committer | 2018-06-19 12:14:10 -0700 | |
commit | b5a8d9ea0ec49b1e3fee5441a78a3fb33cd4d470 (patch) | |
tree | 5e1aa2478541a54baf43539afe6dfbc1b5b5e57a | |
parent | 0fb21f608c334dfcaadab7b918c06b88afa8c592 (diff) |
Multiple changes:
1. use unique_ptr instead of shared_ptr, and fix a bug in the destructor of TrtEngineOp where it didn't reset the shared_ptr but a copy of it
2. fix the include order
3. shorten the reference to tensorflow::tensorrt::xxx
4. remove some code that sets something which will be overwritten later
5. fix format, including: function signature, variable names, const reference, etc
6. remove some dead code
7. add a lot of comments and TODOs
8. in TrtEngineOp, replace the map of allocators with a single unique_ptr
9. in TrtEngineOp, remove parameter ignore_dim_change from GetEngine(), since it always uses member fixed_input_size_
-rw-r--r-- | tensorflow/contrib/tensorrt/convert/convert_graph.cc | 272 | ||||
-rw-r--r-- | tensorflow/contrib/tensorrt/convert/convert_graph.h | 8 | ||||
-rw-r--r-- | tensorflow/contrib/tensorrt/convert/convert_nodes.cc | 214 | ||||
-rw-r--r-- | tensorflow/contrib/tensorrt/convert/convert_nodes.h | 61 | ||||
-rw-r--r-- | tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc | 306 | ||||
-rw-r--r-- | tensorflow/contrib/tensorrt/kernels/trt_engine_op.h | 33 | ||||
-rw-r--r-- | tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h | 32 | ||||
-rw-r--r-- | tensorflow/contrib/tensorrt/resources/trt_resources.h | 37 | ||||
-rw-r--r-- | tensorflow/contrib/tensorrt/segment/segment.h | 7 |
9 files changed, 514 insertions, 456 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index c17ef5fdab..bd6ed2d593 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/convert/convert_graph.h" -#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include <fstream> #include <list> @@ -25,6 +24,8 @@ limitations under the License. #include <vector> #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" +#include "tensorflow/contrib/tensorrt/convert/utils.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" #include "tensorflow/contrib/tensorrt/resources/trt_resources.h" #include "tensorflow/contrib/tensorrt/segment/segment.h" @@ -76,6 +77,7 @@ std::vector<int> GetLoadedTensorRTVersion() { int ver_patch = ver - ver_minor * 100; return {ver_major, ver_minor, ver_patch}; } + namespace { bool IsTensorRTCandidate(const tensorflow::Node* node) { @@ -121,13 +123,14 @@ tensorflow::Status BuildNodeMap( } } // namespace + // Function to get calibration from ResourceMgr and put them into nodedef. 
tensorflow::Status ConvertCalibGraphToInferGraph( const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph, bool is_dyn_op) { VLOG(0) << "Starting Calib Conversion"; infer_graph->CopyFrom(graph_def); - auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); + auto trt_rm = TRTResourceManager::instance(); auto calib_rm = trt_rm->getManager("TRTCalibration"); int num_nodes = infer_graph->node_size(); if (!is_dyn_op) { @@ -139,7 +142,7 @@ tensorflow::Status ConvertCalibGraphToInferGraph( if (n->op() == "TRTEngineOp") { VLOG(1) << "Processing " << n->name(); string container_name = n->attr().at("segment_funcdef_name").s(); - tensorflow::tensorrt::TRTCalibrationResource* cres = nullptr; + TRTCalibrationResource* cres = nullptr; auto status = calib_rm->Lookup(container_name, "Calibrator", &cres); if (!status.ok()) { LOG(ERROR) << "Could not get Calibration information. Did you run with " @@ -240,14 +243,16 @@ EngineInfo GetEngineInfo( const tensorflow::grappler::GraphProperties& graph_properties, const std::set<string>& segment_nodes, const std::unordered_map<string, tensorflow::Node*>& node_map, - const std::vector<tensorflow::Node*>& topological_order) { + const std::vector<tensorflow::Node*>& reverse_topo_order) { std::vector<int> subgraph_node_ids; EngineInfo info; std::set<string> segment_devices; int input_port = 0; int output_port = 0; + // TODO(aaroey): consider using node id and port instead. Also, here we assume + // that input edge set and output edge set have no intersection, is this true? 
std::unordered_map<string, int> created_edges; - for (auto it = topological_order.rbegin(); it != topological_order.rend(); + for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend(); ++it) { auto node_name = (*it)->name(); @@ -287,9 +292,11 @@ EngineInfo GetEngineInfo( created_edges.insert({s, port}); input_port++; } - EngineConnections ec(input_node->name(), input_node->id(), + EngineConnection ec(input_node->name(), input_node->id(), edge->src_output(), node_name, node_id, edge->dst_input(), true, port); + // TODO(aaroey): this will be rewritten in + // ConvertSegmentToSubGraphDef, fix it. ec.connection_type = input_node->output_type(edge->src_output()); info.connections.emplace_back(std::move(ec)); @@ -317,10 +324,9 @@ EngineInfo GetEngineInfo( } } - ConvertSegmentToGraphDef(g, graph_properties, subgraph_node_ids, - &info.connections, &info.segment_graph_def, - &info.engine_name); - info.engine_type = EngineInfo::EngineType::TRTStatic; + ConvertSegmentToSubGraphDef(g, graph_properties, subgraph_node_ids, + &info.connections, &info.segment_graph_def, + &info.engine_name); // TODO(sami): This should not happen once segmenter is updated. if (segment_devices.size() == 1) { info.device = *segment_devices.begin(); @@ -336,23 +342,27 @@ EngineInfo GetEngineInfo( } // Function to insert a TRT node into the graph. +// 'alloc' is only used for creating static engine. 
tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, const std::vector<EngineInfo>& infos, int pos, - tensorflow::NodeDef* trt_node, nvinfer1::IGpuAllocator* alloc, int max_batch_size) { - auto& info = infos.at(pos); + const auto& info = infos.at(pos); std::vector<tensorflow::TensorShapeProto> out_shapes; std::vector<tensorflow::TensorShapeProto> input_shapes; std::vector<tensorflow::PartialTensorShape> shapes; std::vector<tensorflow::NodeDefBuilder::NodeOut> inputs; std::vector<tensorflow::DataType> out_types; VLOG(1) << "Processing " << info.engine_name; - for (const auto conn : info.connections) { - if (!conn.is_input_edge) { // output edge + + // Update the shape and data types of input/output nodes, and find all unique + // inputs. + for (const auto& conn : info.connections) { + if (!conn.is_input_edge) { + // Set the shapes and data types of output edge. tensorflow::TensorShapeProto out_shape; - conn.inside_shape.AsProto( - &out_shape); // shape of the output node inside segment + // shape of the output node inside segment + conn.inside_shape.AsProto(&out_shape); if (out_shapes.size() <= conn.port_number) { out_shapes.resize(conn.port_number + 1); out_types.resize(conn.port_number + 1); @@ -360,10 +370,11 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, out_shapes.at(conn.port_number) = out_shape; out_types.at(conn.port_number) = conn.connection_type; continue; - } // input edge + } + + // Set the shapes and data types of input edge. 
tensorflow::TensorShapeProto in_shape; conn.outside_shape.AsProto(&in_shape); - if (input_shapes.size() <= conn.port_number) { input_shapes.resize(conn.port_number + 1); shapes.resize(conn.port_number + 1); @@ -373,18 +384,13 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, string input_node = conn.outside_node_name; int input_port = conn.outside_port; - auto dtype = conn.connection_type; bool found_engine = false; // Rewire the inputs to other engines if they contain original input node for (size_t t = 0; t < infos.size(); ++t) { - if (t == pos) { - continue; - } + if (t == pos) continue; auto& engine_info = infos.at(t); for (const auto& eng_conn : engine_info.connections) { - if (eng_conn.is_input_edge) { - continue; - } + if (eng_conn.is_input_edge) continue; if (eng_conn.inside_node_name == input_node) { input_node = engine_info.engine_name; if (eng_conn.inside_port == input_port) { @@ -398,6 +404,7 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, } VLOG(1) << "Engine Input " << input_node << ":" << input_port << " -> " << info.engine_name << ":" << inputs.size(); + // Skip duplicate inputs. bool new_input = true; for (const auto& inp : inputs) { if (inp.node == input_node && inp.index == input_port) { @@ -406,78 +413,63 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, } } if (new_input) { - inputs.emplace_back(input_node, input_port, dtype); + inputs.emplace_back(input_node, input_port, conn.connection_type); } } + + // Build the engine and get its serialized representation. string segment_string; if (info.engine_type == EngineInfo::EngineType::TRTStatic || info.precision_mode == INT8MODE) { // Create static engine and for int8 test validity of the engine. 
- tensorflow::tensorrt::Logger trt_logger; - auto builder = std::shared_ptr<nvinfer1::IBuilder>( - nvinfer1::createInferBuilder(trt_logger), [](nvinfer1::IBuilder* p) { - if (p) p->destroy(); - }); + Logger trt_logger; + auto builder = std::unique_ptr< + nvinfer1::IBuilder, std::function<void(nvinfer1::IBuilder*)>>( + nvinfer1::createInferBuilder(trt_logger), + [](nvinfer1::IBuilder* p) { if (p) p->destroy(); }); builder->setMaxBatchSize(max_batch_size); - if (info.precision_mode == tensorflow::tensorrt::convert::FP16MODE) { - builder->setHalf2Mode(true); - } + if (info.precision_mode == FP16MODE) builder->setHalf2Mode(true); builder->setMaxWorkspaceSize(info.max_workspace_size_bytes); #if NV_TENSORRT_MAJOR > 3 builder->setGpuAllocator(alloc); #endif - nvinfer1::ICudaEngine* engine = nullptr; + TrtUniquePtrType<nvinfer1::ICudaEngine> engine; // TODO(sami): What happens if 1st dim is not batch? - auto status = ConvertSubgraphToEngine(info.segment_graph_def, builder.get(), - shapes, &engine, info.precision_mode); - if (!status.ok()) { - if (engine) engine->destroy(); - return status; - } - if (engine) { - auto engine_data = std::shared_ptr<nvinfer1::IHostMemory>( - engine->serialize(), [](nvinfer1::IHostMemory* p) { - if (p) p->destroy(); - }); - segment_string = - string((const char*)engine_data->data(), engine_data->size()); - engine->destroy(); - } + TF_RETURN_IF_ERROR(ConvertSubGraphDefToEngine( + info.segment_graph_def, info.precision_mode, shapes, builder.get(), + &engine, /*convert_successfully=*/nullptr)); + TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize()); + segment_string = + string((const char*)engine_data->data(), engine_data->size()); if (info.precision_mode == INT8MODE) { + // TODO(aaroey): why not put this inside the 'else' branch? 
segment_string = info.segment_graph_def.SerializeAsString(); } } else { segment_string = info.segment_graph_def.SerializeAsString(); } + + // TODO(aaroey): use enum instead, and add a helper method to do the + // conversion. string prec_string; switch (info.precision_mode) { - case FP32MODE: { + case FP32MODE: prec_string = "FP32"; break; - } - case FP16MODE: { + case FP16MODE: prec_string = "FP16"; break; - } - case INT8MODE: { + case INT8MODE: prec_string = "INT8"; - auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); - auto calib_rm = trt_rm->getManager("TRTCalibration"); - if (!calib_rm) { + if (!TRTResourceManager::instance()->getManager("TRTCalibration")) { LOG(ERROR) << "Failed to construct calibration storage"; } break; - } - default: { + default: return tensorflow::errors::OutOfRange("Unknown precision mode"); - } } - tensorflow::Status status; - tensorflow::Node* engine_node = nullptr; tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp"); - if (!info.device.empty()) { - node_builder.Device(info.device); - } + if (!info.device.empty()) node_builder.Device(info.device); if (VLOG_IS_ON(1)) { string ins=StrCat(info.engine_name," inputs= "); for (const auto& ii : inputs) { @@ -486,50 +478,53 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, VLOG(1) << ins; } node_builder.Input(inputs); - if (info.engine_type == EngineInfo::EngineType::TRTStatic) { - if (info.cached_engine_batches.size()) { - LOG(WARNING) << "Cached engine batches are ignored for static engines"; - } + if (info.engine_type == EngineInfo::EngineType::TRTStatic && + info.cached_engine_batches.size()) { + LOG(WARNING) << "Cached engine batches are ignored for static engines"; } - status = node_builder.Attr("input_shapes", input_shapes) - .Attr("output_shapes", out_shapes) - .Attr("static_engine", - info.engine_type == EngineInfo::EngineType::TRTStatic) - .Attr("segment_funcdef_name", - StrCat(info.engine_name, "_native_segment")) - 
.Attr("serialized_segment", segment_string) - .Attr("calibration_data", "") - .Attr("max_cached_engines_count", info.maximum_cached_engines) - .Attr("cached_engine_batches", {max_batch_size}) - .Attr("workspace_size_bytes", info.max_workspace_size_bytes) - .Attr("precision_mode", prec_string) - .Attr("OutT", out_types) - .Finalize(trt_node); + tensorflow::NodeDef trt_node; + tensorflow::Status status = + node_builder.Attr("input_shapes", input_shapes) + .Attr("output_shapes", out_shapes) + .Attr("static_engine", + info.engine_type == EngineInfo::EngineType::TRTStatic) + .Attr("segment_funcdef_name", + StrCat(info.engine_name, "_native_segment")) + .Attr("serialized_segment", segment_string) + .Attr("calibration_data", "") + .Attr("max_cached_engines_count", info.maximum_cached_engines) + .Attr("cached_engine_batches", {max_batch_size}) + .Attr("workspace_size_bytes", info.max_workspace_size_bytes) + .Attr("precision_mode", prec_string) + .Attr("OutT", out_types) + .Finalize(&trt_node); if (!status.ok()) { LOG(ERROR) << "Node construction failed with" << status; return status; } VLOG(1) << "Adding TRTEngine " << info.engine_name << " to graph"; - engine_node = graph->AddNode(*trt_node, &status); + tensorflow::Node* engine_node = graph->AddNode(trt_node, &status); if (!status.ok()) { LOG(ERROR) << "Adding node failed " << status; return status; } - + // Updates the inputs of output edges destination nodes, and point them to the + // engine node. for (auto& conn : info.connections) { if (conn.is_input_edge) continue; VLOG(1) << " Updating DBG " << engine_node->name() << " out_port " << conn.port_number << " out_id " << conn.outside_id << " name=" << conn.outside_node_name; auto dst_node = graph->FindNodeId(conn.outside_id); - if (!dst_node) { // node removed skip. - continue; - } + // TODO(aaroey): node could be removed during construction of other TRT + // nodes, but then in that case who is going to update their input nodes? 
+ if (!dst_node) continue; VLOG(1) << "Updating " << engine_node->name() << ":" << conn.port_number << " to " << dst_node->name() << ":" << conn.outside_port; status = graph->UpdateEdge(engine_node, conn.port_number, dst_node, conn.outside_port); if (!status.ok()) { + // TODO(aaroey): should we return the status? LOG(ERROR) << "Edge update failed " << engine_node->name() << ":" << conn.port_number << " -> " << dst_node->name() << ":" << conn.outside_port << " status= " << status; @@ -631,9 +626,7 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary( std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator( ConversionParams& params, EngineInfo& engine) { int cuda_device_id = -1; - // we need to us PM here since in python path there is no way to get - // to allocators - auto CheckDeviceID = [](int tfid) -> int { + auto check_device_id = [](int tfid) -> int { tensorflow::TfGpuId tf_gpu_id(tfid); CudaGpuId cuda_gpu_id; Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id); @@ -646,6 +639,9 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator( return -1; }; tensorflow::Allocator* dev_allocator = nullptr; + // we need to us PM here since in python path there is no way to get + // to allocators + // TODO(aaroey): fix this. 
auto pm = tensorflow::ProcessState::singleton(); if (params.cluster) { // get allocator const tensorflow::Device* device = nullptr; @@ -653,15 +649,15 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator( device = params.cluster->GetDeviceSet()->FindDeviceByName(engine.device); } if (device) { - cuda_device_id = CheckDeviceID(device->parsed_name().id); + cuda_device_id = check_device_id(device->parsed_name().id); if (cuda_device_id < 0) { - LOG(ERROR) << "Cuda device identification failed, using device " - "0."; + LOG(ERROR) << "Cuda device identification failed, using device 0."; cuda_device_id = 0; } tensorflow::GPUOptions gpuoptions; // this should be instantiated by now tensorflow::TfGpuId tf_gpu_id(device->parsed_name().id); + // TODO(aaroey): why not using device->GetAllocator()? dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1); VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value() << " cuda device= " << cuda_device_id << " at " << dev_allocator; @@ -676,19 +672,16 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator( // if device is set, try to find the device. Might be a problem for multi // host case but TensorRT do not support multi host setups yet. if (!engine.device.empty()) { - tensorflow::DeviceNameUtils::ParsedName parsed_name; - if (tensorflow::DeviceNameUtils::ParseFullName(engine.device, - &parsed_name)) { + DeviceNameUtils::ParsedName parsed_name; + if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name)) { cuda_device_id = parsed_name.has_id ? 
parsed_name.id : -1; } try_gpu_ids = !parsed_name.has_id; } if (try_gpu_ids) { while (found_device < 100) { - cuda_device_id = CheckDeviceID(found_device); - if (cuda_device_id >= 0) { - break; - } + cuda_device_id = check_device_id(found_device); + if (cuda_device_id >= 0) break; found_device++; } } @@ -698,31 +691,32 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator( return std::make_pair(cuda_device_id, dev_allocator); } LOG(WARNING) - << "Can't determine the device constructing an allocator at device " + << "Can't determine the device, constructing an allocator at device " << found_device; tensorflow::GPUOptions gpuoptions; - gpuoptions.set_allow_growth( - true); // this will be a noop if device is already initialized + // this will be a noop if device is already initialized + gpuoptions.set_allow_growth(true); tensorflow::TfGpuId tf_gpu_id(found_device); dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1); } return std::make_pair(cuda_device_id, dev_allocator); } + // Entry function from optimization pass. tensorflow::Status ConvertAfterShapes(ConversionParams& params) { - // Segment the graph into subgraphs that can be converted to TensorRT - tensorflow::tensorrt::segment::SegmentOptions segment_options; + // Convert graphdef to graph. 
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(), params.input_graph_def->library()); tensorflow::Graph graph(flib); TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph( tensorflow::GraphConstructorOptions(), *params.input_graph_def, &graph)); + // Segment the graph into subgraphs that can be converted to TensorRT + tensorflow::tensorrt::segment::SegmentOptions segment_options; // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT) for (auto node : *(params.output_names)) { segment_options.exclude_node_list.insert(node); } - segment_options.minimum_segment_size = params.minimum_segment_size; tensorflow::tensorrt::segment::SegmentNodesVector segments; TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph( @@ -730,34 +724,38 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { if (segments.size() > 1) { VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size(); } + + // Get the EngineInfo for each segment. std::unordered_map<string, tensorflow::Node*> node_map; TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map)); - std::unordered_map<string, std::pair<int, string>> output_edge_map; float total_num_nodes_in_segments = 0.; std::vector<EngineInfo> engine_segments; engine_segments.reserve(segments.size()); - std::vector<tensorflow::Node*> topo_order; - tensorflow::GetPostOrder(graph, &topo_order); - size_t total_engine_size = 0; - std::vector<size_t> engine_sizes; + std::vector<tensorflow::Node*> reverse_topo_order; + tensorflow::GetPostOrder(graph, &reverse_topo_order); + size_t total_engine_bytes_size = 0; + std::vector<size_t> engine_bytes_size; for (size_t t = 0; t < segments.size(); t++) { auto& s = segments.at(t); - engine_segments.emplace_back(GetEngineInfo(&graph, *params.graph_properties, - s.first, node_map, topo_order)); + engine_segments.emplace_back(GetEngineInfo( + &graph, *params.graph_properties, s.first, node_map, + reverse_topo_order)); auto& curr_engine = engine_segments.back(); 
curr_engine.precision_mode = params.precision_mode; - engine_sizes.push_back(curr_engine.segment_graph_def.ByteSizeLong()); curr_engine.engine_type = (params.is_dyn_op || params.precision_mode == INT8MODE ? EngineInfo::EngineType::TRTDynamic : EngineInfo::EngineType::TRTStatic); curr_engine.cached_engine_batches = params.cached_engine_batches; curr_engine.maximum_cached_engines = params.max_cached_engines; - total_engine_size += engine_sizes.back(); - total_num_nodes_in_segments += s.first.size(); StrAppend(&curr_engine.engine_name, "my_trt_op_", t); RegisterSegmentFunctionToFunctionLibrary( &graph, curr_engine.segment_graph_def, curr_engine.engine_name); + + engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); + total_engine_bytes_size += engine_bytes_size.back(); + total_num_nodes_in_segments += s.first.size(); + if (VLOG_IS_ON(8)) { string fname = curr_engine.engine_name; StrAppend(&fname, ".pb"); @@ -767,54 +765,54 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { f.close(); } } - std::vector<tensorflow::NodeDef*> trt_nodes; - trt_nodes.reserve(engine_segments.size()); + + // Create a TRT node for each segment using its EngineInfo. 
int old_cuda_device = 0; auto err = cudaGetDevice(&old_cuda_device); if (err != cudaSuccess) { - LOG(ERROR) << "Couldn't get current device error is " - << cudaGetErrorString(err); + LOG(ERROR) << "Couldn't get current device: " << cudaGetErrorString(err); } VLOG(1) << "Current cuda device is " << old_cuda_device; for (int i = 0; i < engine_segments.size(); ++i) { - auto trt_node = new tensorflow::NodeDef; - trt_nodes.push_back(trt_node); auto& engine = engine_segments.at(i); // Partition the workspace size by the average of node ratio and segment // graphdef size engine.max_workspace_size_bytes = params.max_workspace_size_bytes * - (engine_sizes.at(i) / total_engine_size + + (engine_bytes_size.at(i) / total_engine_bytes_size + segments.at(i).first.size() / total_num_nodes_in_segments) / 2.0; - std::shared_ptr<nvinfer1::IGpuAllocator> alloc; + // The allocator is used to build the engine. The build and the built engine + // will be destroyed after we get the serialized engine string, so it's fine + // to use unique_ptr here. + std::unique_ptr<nvinfer1::IGpuAllocator> alloc; auto device_alloc = GetDeviceAndAllocator(params, engine); int cuda_device_id = 0; if (device_alloc.first >= 0) { cuda_device_id = device_alloc.first; alloc.reset(new TRTDeviceAllocator(device_alloc.second)); - } else { // Setting allocator as nullptr should get revert to the - // cudamalloc + } else { + // Setting allocator as nullptr should get revert to the cudamalloc LOG(WARNING) << "Can't identify the cuda device. 
Running on device 0 "; } cudaSetDevice(cuda_device_id); - auto status = CreateTRTNode(&graph, engine_segments, i, trt_node, - alloc.get(), params.max_batch_size); + auto status = CreateTRTNode( + &graph, engine_segments, i, alloc.get(), params.max_batch_size); if (status.ok()) { - const auto& internal_nodes = segments.at(i).first; - for (auto node_id : internal_nodes) { - graph.RemoveNode(node_map.at(node_id)); + for (auto node_name : segments.at(i).first) { + graph.RemoveNode(node_map.at(node_name)); } } else { + // TODO(aaroey): in this case, the graph is already modified, we should + // return the status? LOG(WARNING) << "Engine creation for segment " << i << ", composed of " - << segments.at(i).first.size() << " nodes failed. Skipping"; - VLOG(1) << "Failure reason " << status; + << segments.at(i).first.size() << " nodes failed: " + << status << ". Skipping..."; } } cudaSetDevice(old_cuda_device); graph.ToGraphDef(params.output_graph_def); - for (auto tn : trt_nodes) delete tn; - VLOG(1)<<"Returning from conversion"; + VLOG(1) << "Returning from conversion"; return tensorflow::Status::OK(); } diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h index e2f4c1c83f..9d986e4890 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.h +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h @@ -64,10 +64,10 @@ tensorflow::Status ConvertCalibGraphToInferGraph( const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def, bool is_dyn_op); -// max_batch_size: maximum batch size which can be used for inference for -// optimization targets inference run with max batch size. -// max_workspace_size_bytes: The upper bound of memory allowance for -// engine building. +// - max_batch_size: maximum batch size which can be used for inference for +// optimization targets inference run with max batch size. 
+// - max_workspace_size_bytes: The upper bound of memory allowance for engine +// building. tensorflow::Status ConvertGraphDefToTensorRT( const tensorflow::GraphDef& graph_def, const std::vector<string>& output_names, size_t max_batch_size, diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 6ad2d7e68f..a252ea67df 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" -#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include <algorithm> #include <list> @@ -25,7 +24,9 @@ limitations under the License. #include <utility> #include <vector> +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" +#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" #include "tensorflow/contrib/tensorrt/resources/trt_resources.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT @@ -125,12 +126,10 @@ static std::vector<std::pair<int, int>> CreateSamePadding( string GetCommonNameScope(const string& op_name_a, const string& op_name_b) { size_t last_scope_separator = 0; - for (size_t i = 0; i < std::min(op_name_a.size(), op_name_b.size()); ++i) { - if (op_name_a[i] != op_name_b[i]) { - break; - } else if (op_name_a[i] == '/') { - last_scope_separator = i + 1; - } + const size_t min_size = std::min(op_name_a.size(), op_name_b.size()); + for (size_t i = 0; i < min_size; ++i) { + if (op_name_a[i] != op_name_b[i]) break; + if (op_name_a[i] == '/') last_scope_separator = i + 1; } return op_name_a.substr(0, last_scope_separator); } @@ -2144,10 +2143,14 @@ void 
Converter::register_op_converters() { } // namespace -tensorflow::Status ConvertSubgraphToEngine( - const tensorflow::GraphDef& gdef, nvinfer1::IBuilder* builder, +tensorflow::Status ConvertSubGraphDefToEngine( + const tensorflow::GraphDef& gdef, int precision_mode, const std::vector<tensorflow::PartialTensorShape>& input_shapes, - nvinfer1::ICudaEngine** engine, int precision_mode) { + nvinfer1::IBuilder* builder, + TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, + bool* convert_successfully) { + engine->reset(); + if (convert_successfully) *convert_successfully = false; auto trt_network = infer_object(builder->createNetwork()); if (!trt_network) { return tensorflow::errors::Internal( @@ -2159,7 +2162,7 @@ tensorflow::Status ConvertSubgraphToEngine( VLOG(1) << "Starting engine conversion "; Converter converter(trt_network.get(), ws.get(), precision_mode == FP16MODE); std::vector<std::pair<string, string>> output_tensors; - // graph nodes are already topologically sorted during construction + // Graph nodes are already topologically sorted during construction for (const auto& node_def : gdef.node()) { string node_name = node_def.name(); VLOG(1) << "Converting op name=" << node_name << ", op=" << node_def.op(); @@ -2215,7 +2218,7 @@ tensorflow::Status ConvertSubgraphToEngine( } } else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) && (node_def.op() == "Identity")) { - tensorflow::int32 slot_number = -1; + int32 slot_number = -1; if (!tensorflow::strings::safe_strto32(node_name.c_str() + 9, &slot_number)) { LOG(ERROR) << "Failed to parse slot number from " << node_name @@ -2248,122 +2251,130 @@ tensorflow::Status ConvertSubgraphToEngine( converter.network()->markOutput(*tensor); } + if (convert_successfully) *convert_successfully = true; + + // Build the engine. 
VLOG(1) << "Starting engine creation"; - *engine = builder->buildCudaEngine(*converter.network()); + engine->reset(builder->buildCudaEngine(*converter.network())); + if (engine->get() == nullptr) { + return tensorflow::errors::Internal("Failed to build TensorRT engine"); + } VLOG(1) << "Finished conversion"; return tensorflow::Status::OK(); } -tensorflow::Status ConvertSegmentToGraphDef( +tensorflow::Status ConvertSegmentToSubGraphDef( const tensorflow::Graph* graph, const tensorflow::grappler::GraphProperties& graph_properties, - const std::vector<int>& subgraph_node_ids, - std::vector<EngineConnections>* connections, + const std::vector<int>& subgraph_node_ids, // In topological order + std::vector<EngineConnection>* connections, tensorflow::GraphDef* segment_def, string* common_scope) { std::set<string> marker_nodes; + // Update connection shapes/data types and add corresponding input/output + // nodes in the segment graphdef. for (size_t i = 0; i < connections->size(); ++i) { auto& connection = connections->at(i); auto outside_node = graph->FindNodeId(connection.outside_id); - if (outside_node) { - tensorflow::DataType input_type = tensorflow::DT_FLOAT; - tensorflow::PartialTensorShape partial_shape; - if (connection.is_input_edge) { - if (graph_properties.HasOutputProperties( - connection.outside_node_name)) { - auto output_params = graph_properties.GetOutputProperties( - connection.outside_node_name); - auto out_shape = output_params.at(connection.outside_port); - input_type = out_shape.dtype(); - std::vector<tensorflow::int64> dims; - partial_shape = out_shape.shape(); - connection.outside_shape = partial_shape; - } else { - VLOG(0) << "Unknown output shape" << outside_node->name(); - input_type = graph->FindNodeId(connection.outside_id) - ->output_type(connection.outside_port); - } - connection.connection_type = input_type; - - } else { // output edge - if (graph_properties.HasInputProperties(connection.outside_node_name)) { - auto input_params = - 
graph_properties.GetInputProperties(connection.outside_node_name); - auto in_shape = input_params.at(connection.outside_port); - input_type = in_shape.dtype(); - partial_shape = in_shape.shape(); - connection.inside_shape = partial_shape; - } else { - input_type = graph->FindNodeId(connection.inside_id) - ->output_type(connection.outside_port); - } - connection.connection_type = input_type; + if (!outside_node) { + // TODO(aaroey): this should never happen, so make it a CHECK? + return tensorflow::errors::NotFound( + "Cannot find node with id ", connection.outside_id, " in the graph."); + } + // Updates the shape and data types of input/output connections. + tensorflow::DataType input_type = tensorflow::DT_FLOAT; + tensorflow::PartialTensorShape partial_shape; + if (connection.is_input_edge) { + if (graph_properties.HasOutputProperties(connection.outside_node_name)) { + auto output_params = graph_properties.GetOutputProperties( + connection.outside_node_name); + auto out_shape = output_params.at(connection.outside_port); + input_type = out_shape.dtype(); + std::vector<tensorflow::int64> dims; + partial_shape = out_shape.shape(); + connection.outside_shape = partial_shape; + } else { + VLOG(0) << "Unknown output shape" << outside_node->name(); + input_type = graph->FindNodeId(connection.outside_id) + ->output_type(connection.outside_port); } + connection.connection_type = input_type; + + } else { // output edge + if (graph_properties.HasInputProperties(connection.outside_node_name)) { + auto input_params = + graph_properties.GetInputProperties(connection.outside_node_name); + auto in_shape = input_params.at(connection.outside_port); + input_type = in_shape.dtype(); + partial_shape = in_shape.shape(); + connection.inside_shape = partial_shape; + } else { + input_type = graph->FindNodeId(connection.inside_id) + ->output_type(connection.outside_port); + } + connection.connection_type = input_type; + } - tensorflow::NodeDef dummy_placeholder; - string node_name; - if 
(connection.is_input_edge) { - StrAppend(&node_name, kInputPHName, connection.port_number); - if (marker_nodes.count(node_name)) { - VLOG(1) << "Reusing input " << node_name << " for the edge " - << connection.outside_node_name << ":" - << connection.outside_port << " -> " - << connection.inside_node_name << ":" - << connection.inside_port; - continue; - } - marker_nodes.insert(node_name); - auto seg_node = segment_def->add_node(); - tensorflow::NodeDefBuilder dph_builder(node_name, "Placeholder"); - auto status = dph_builder.Attr("shape", partial_shape) - .Attr("dtype", input_type) - .Finalize(seg_node); - VLOG(1) << "Constructing input " << node_name << " for the edge " + // Add dummy input/output nodes to the segment graphdef. + if (connection.is_input_edge) { + const string node_name = StrCat(kInputPHName, connection.port_number); + if (marker_nodes.count(node_name)) { + VLOG(1) << "Reusing input " << node_name << " for the edge " << connection.outside_node_name << ":" << connection.outside_port << " -> " - << connection.inside_node_name << ":" << connection.inside_port; - } else { - StrAppend(&node_name, kOutputPHName, connection.port_number); - if (marker_nodes.count(node_name)) { - VLOG(1) << "Reusing output " << node_name << " for the edge " - << connection.inside_node_name << ":" - << connection.inside_port << " -> " - << connection.outside_node_name << ":" - << connection.outside_port; - continue; - } - marker_nodes.insert(node_name); - auto seg_node = segment_def->add_node(); - tensorflow::NodeDefBuilder dph_builder(node_name, "Identity"); - auto status = - dph_builder.Input(connection.inside_node_name, 0, input_type) - .Finalize(seg_node); - VLOG(1) << "Constructing output " << node_name << " for the edge " - << connection.inside_node_name << ":" << connection.inside_port - << " -> " << connection.outside_node_name << ":" + << connection.inside_node_name << ":" + << connection.inside_port; + continue; + } + marker_nodes.insert(node_name); + auto 
seg_node = segment_def->add_node(); + tensorflow::NodeDefBuilder builder(node_name, "Placeholder"); + auto status = builder.Attr("shape", partial_shape) + .Attr("dtype", input_type).Finalize(seg_node); + VLOG(1) << "Constructing input " << node_name << " for the edge " + << connection.outside_node_name << ":" + << connection.outside_port << " -> " + << connection.inside_node_name << ":" << connection.inside_port; + } else { + const string node_name = StrCat(kOutputPHName, connection.port_number); + if (marker_nodes.count(node_name)) { + VLOG(1) << "Reusing output " << node_name << " for the edge " + << connection.inside_node_name << ":" + << connection.inside_port << " -> " + << connection.outside_node_name << ":" << connection.outside_port; + continue; } + marker_nodes.insert(node_name); + auto seg_node = segment_def->add_node(); + tensorflow::NodeDefBuilder builder(node_name, "Identity"); + auto status = builder.Input(connection.inside_node_name, 0, input_type) + .Finalize(seg_node); + VLOG(1) << "Constructing output " << node_name << " for the edge " + << connection.inside_node_name << ":" << connection.inside_port + << " -> " << connection.outside_node_name << ":" + << connection.outside_port; } - } - std::unordered_map<int, int> newIdMap; - // Copy nodes to new graphdef + } // for each connection. 
+ + std::unordered_map<int, int> old_to_new_id_map; + // Copy internal nodes to new graphdef string local_scope = graph->FindNodeId(*subgraph_node_ids.begin())->name(); for (const auto node_id : subgraph_node_ids) { const auto node = graph->FindNodeId(node_id); local_scope = GetCommonNameScope(local_scope, node->name()); - if (node) { - newIdMap[node_id] = segment_def->node_size(); - auto snode = segment_def->add_node(); - snode->CopyFrom(node->def()); - VLOG(1) << "Copying " << snode->name() << " to subgraph"; - } + old_to_new_id_map[node_id] = segment_def->node_size(); + auto snode = segment_def->add_node(); + snode->CopyFrom(node->def()); + VLOG(1) << "Copying " << snode->name() << " to subgraph"; } - // update the inputs of the new nodes to point to dummy inputs + // Update the inputs of the new input nodes to point to placeholder nodes. for (int i = 0; i < connections->size(); ++i) { auto& connection = connections->at(i); if (!connection.is_input_edge) continue; - auto snode = segment_def->mutable_node(newIdMap[connection.inside_id]); - string placeholder_name(kInputPHName); - StrAppend(&placeholder_name, connection.port_number); + auto snode = segment_def->mutable_node( + old_to_new_id_map[connection.inside_id]); + const string placeholder_name = + StrCat(kInputPHName, connection.port_number); VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port << " from " << snode->input(connection.inside_port) << " to " << placeholder_name; @@ -2373,6 +2384,7 @@ tensorflow::Status ConvertSegmentToGraphDef( VLOG(0) << "Segment @scope '" << local_scope << "', converted to graph"; return tensorflow::Status::OK(); } + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h index 971322d07c..b8d6012df2 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h 
@@ -22,11 +22,13 @@ limitations under the License. #include <utility> #include <vector> +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" + #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -36,11 +38,13 @@ static const char* kInputPHName = "InputPH_"; static const char* kOutputPHName = "OutputPH_"; namespace convert { +// TODO(aaroey): use an enum instead. const int FP32MODE = 0; const int FP16MODE = 1; const int INT8MODE = 2; -struct EngineConnections { - EngineConnections(const string& outside, int out_id, int out_port, + +struct EngineConnection { + EngineConnection(const string& outside, int out_id, int out_port, const string& inside, int in_id, int in_port, bool input_edge, int port) : outside_node_name(outside), @@ -51,16 +55,21 @@ struct EngineConnections { inside_port(in_port), is_input_edge(input_edge), port_number(port) {} + const string outside_node_name; const int outside_id; const int outside_port; tensorflow::PartialTensorShape outside_shape; - tensorflow::DataType connection_type; + const string inside_node_name; const int inside_id; const int inside_port; tensorflow::PartialTensorShape inside_shape; + + tensorflow::DataType connection_type; bool is_input_edge; + + // The port number of the TRT node connecting to this edge. int port_number; }; @@ -68,36 +77,54 @@ struct EngineInfo { EngineInfo() : engine_type(EngineType::TRTStatic), max_workspace_size_bytes(0), - precision_mode(FP32MODE){}; + precision_mode(FP32MODE) {}; + string engine_name; string device; tensorflow::GraphDef segment_graph_def; - std::vector<EngineConnections> connections; // order matters! + + // The segment nodes that are on one side of the edges are topological sorted. 
+ std::vector<EngineConnection> connections; + enum class EngineType { TRTStatic = 0, TRTDynamic = 1 }; EngineType engine_type; - tensorflow::int64 max_workspace_size_bytes; + int64 max_workspace_size_bytes; int maximum_cached_engines; std::vector<int> cached_engine_batches; int precision_mode; }; -; -// Constructs a graphdef from the segment in the given graph. Adds placeholder -// nodes for input edges (InputPH_*) and identity nodes for output edges -// (OutputPH_*). This function needs to be called before TensorRT nodes -// inserted in order to correctly get sizes from the original graph. -tensorflow::Status ConvertSegmentToGraphDef( +// Constructs a graphdef from the segment in the given graph. Adds placeholder +// nodes for input edges (InputPH_*) and identity nodes for output edges +// (OutputPH_*). This function needs to be called before TensorRT nodes +// inserted in order to correctly get sizes from the original graph. +// +// - subgraph_node_ids: the node ids of the subgraph, must be sorted in +// topological order. +// - segment_def: the output GraphDef, whose non-input/output nodedefs will be +// sorted in topological order. +tensorflow::Status ConvertSegmentToSubGraphDef( const tensorflow::Graph* graph, const tensorflow::grappler::GraphProperties& graph_properties, const std::vector<int>& subgraph_node_ids, - std::vector<EngineConnections>* connections, + std::vector<EngineConnection>* connections, tensorflow::GraphDef* segment_def, string* common_scope); -// Converts given subgraph to a TRT engine. -tensorflow::Status ConvertSubgraphToEngine( - const tensorflow::GraphDef& gdef, nvinfer1::IBuilder* builder, +// Converts given subgraph to a TRT engine saved in 'engine'. Returns ok iff +// 'builder' successfully build the engine. If the result is not ok, 'engine' +// will be set to nullptr +// Once returned, 'builder' is not needed any more and can be safely detroyed. 
+// +// - convert_successfully: indicates whether the converson to TensorRT network +// is successful. This is different than successfully building the engine: +// building can still fail afterwards. +tensorflow::Status ConvertSubGraphDefToEngine( + const tensorflow::GraphDef& gdef, int precision_mode, const std::vector<tensorflow::PartialTensorShape>& input_shapes, - nvinfer1::ICudaEngine** engine, int precision_mode); + nvinfer1::IBuilder* builder, + TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, + bool* convert_successfully); + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index 2dddc4541c..0d1d7e3b0e 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h" #include <algorithm> +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" @@ -32,14 +33,14 @@ limitations under the License. #include "cuda/include/cuda_runtime_api.h" namespace tensorflow { -static ::tensorflow::tensorrt::Logger logger; -using IRuntime = nvinfer1::IRuntime; -using Dims = nvinfer1::Dims; - namespace tensorrt { -using tensorflow::strings::StrAppend; -using tensorflow::strings::StrCat; -// A helper class to call done() for asynchronous execution. +static Logger logger; +using ::nvinfer1::IRuntime; +using ::nvinfer1::Dims; +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + +// A helper class to call done() when destructed for asynchronous execution. // Helps simultaneous execution of native and TRT engines. 
class AsyncHelper : public tensorflow::core::RefCounted { public: @@ -78,8 +79,8 @@ tensorflow::Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); if (fdef == nullptr) { return tensorflow::errors::Internal( - StrCat("Native FunctionDef ", funcdef_name_, - " can't be found in function library")); + "Native FunctionDef ", funcdef_name_, + " can't be found in function library"); } tensorflow::FunctionLibraryRuntime::InstantiateOptions inst_ops; inst_ops.overlay_lib = nullptr; @@ -122,15 +123,14 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("segment_funcdef_name", &funcdef_name_)); if (precision_string == "FP32") { - precision_mode_ = tensorflow::tensorrt::convert::FP32MODE; + precision_mode_ = convert::FP32MODE; } else if (precision_string == "FP16") { - precision_mode_ = tensorflow::tensorrt::convert::FP16MODE; + precision_mode_ = convert::FP16MODE; } else if (precision_string == "INT8") { - precision_mode_ = tensorflow::tensorrt::convert::INT8MODE; + precision_mode_ = convert::INT8MODE; } - calibration_mode_ = - precision_mode_ == tensorflow::tensorrt::convert::INT8MODE && - calibration_data.size() == 0; + calibration_mode_ = (precision_mode_ == convert::INT8MODE && + calibration_data.size() == 0); if (calibration_data.size()) { calibrator_.reset(new TRTInt8Calibrator(calibration_data)); calibration_data.resize(0); @@ -190,21 +190,20 @@ void TRTEngineOp::ExecuteNativeSegment(tensorflow::OpKernelContext* ctx, ctx->set_output(t, outputs->at(t)); } delete outputs; - return; }); - return; } void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx, AsyncHelper* helper) { + helper->Ref(); tensorflow::core::ScopedUnref sc(helper); - auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance(); + // TODO(aaroey): remove the ResourceMgr singleton. 
+ auto trt_rm = TRTResourceManager::instance(); auto res_mgr = trt_rm->getManager("TRTCalibration"); - tensorflow::tensorrt::TRTCalibrationResource* calib_res = nullptr; + TRTCalibrationResource* calib_res = nullptr; auto status = res_mgr->LookupOrCreate( funcdef_name_, "Calibrator", &calib_res, - {[ctx, this](tensorflow::tensorrt::TRTCalibrationResource** cr) - -> tensorflow::Status { + {[ctx, this](TRTCalibrationResource** cr) -> tensorflow::Status { return this->AllocateCalibrationResources(ctx, cr); }}); if (!status.ok()) { @@ -219,7 +218,7 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx, void* data_address = GetTensorAddress(&t); if (data_address == nullptr) { ctx->SetStatus(tensorflow::errors::InvalidArgument( - StrCat("Unsupported data type encountered in input ", i))); + "Unsupported data type encountered in input ", i)); return; } // Check the allocated buffer is sufficient for input @@ -237,7 +236,6 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx, calib_res->calibrator_->setBatch(input_data, *stream); VLOG(2) << "Passed calibration data"; ExecuteNativeSegment(ctx, helper); - return; } int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) { @@ -274,27 +272,28 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, auto helper = new AsyncHelper(done); tensorflow::core::ScopedUnref sc(helper); if (calibration_mode_) { - helper->Ref(); ExecuteCalibration(ctx, helper); return; } - int num_binding = ctx->num_inputs() + ctx->num_outputs(); - std::vector<void*> buffers(num_binding); - int smallest_engine = GetEngineBatch(ctx); - if (smallest_engine < 0) return; - int num_batch = ctx->input(0).shape().dim_size(0); - size_t binding_index; - auto engine_ctx_pair = GetEngine(smallest_engine, ctx, fixed_input_size_); - auto trt_engine_ptr = engine_ctx_pair.first; + const int smallest_engine = GetEngineBatch(ctx); + if (smallest_engine < 0) return; // GetEngineBatch already set the status. 
+ + const int num_batch = ctx->input(0).shape().dim_size(0); + auto& engine_ctx_pair = GetEngine(smallest_engine, ctx); + auto& trt_engine_ptr = engine_ctx_pair.first; if (!trt_engine_ptr) { LOG(WARNING) << "Engine retrieval for batch size " << num_batch << " failed Running native segment"; ExecuteNativeSegment(ctx, helper); return; } + + const int num_binding = ctx->num_inputs() + ctx->num_outputs(); + std::vector<void*> buffers(num_binding); for (int i = 0; i < ctx->num_inputs(); i++) { - string inp_name = StrCat(kInputPHName, i); - binding_index = trt_engine_ptr->getBindingIndex(inp_name.c_str()); + const string inp_name = StrCat(kInputPHName, i); + const size_t binding_index = trt_engine_ptr->getBindingIndex( + inp_name.c_str()); const Tensor& input_tensor = ctx->input(i); const TensorShape& input_shape = input_tensor.shape(); @@ -322,17 +321,16 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, default: LOG(ERROR) << "Unknown TRT data type: " << int(dtype); ctx->SetStatus(tensorflow::errors::InvalidArgument( - "Unknown ouput TRT data type! " + int(dtype))); + "Unknown ouput TRT data type! ", int(dtype))); return; } } for (int i = 0; i < ctx->num_outputs(); i++) { - // This is bad that we have to reallocate output buffer every run. 
// Create an output tensor - - auto output_name = StrCat(kOutputPHName, i); - binding_index = trt_engine_ptr->getBindingIndex(output_name.c_str()); + const string output_name = StrCat(kOutputPHName, i); + const size_t binding_index = trt_engine_ptr->getBindingIndex( + output_name.c_str()); Tensor* output_tensor = nullptr; TensorShape output_shape; @@ -346,8 +344,8 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, &output_shape)); } else { LOG(ERROR) << "output node not found, at " << output_name; - ctx->SetStatus(tensorflow::errors::Internal("output " + output_name + - " but couldn't be found!")); + ctx->SetStatus(tensorflow::errors::Internal( + "output ", output_name, " couldn't be found!")); return; } auto status = ctx->allocate_output(i, output_shape, &output_tensor); @@ -375,7 +373,7 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, default: LOG(ERROR) << "Unknown TRT data type: " << int(dtype); ctx->SetStatus(tensorflow::errors::InvalidArgument( - "Unsupported output data type! " + int(dtype))); + "Unsupported output data type! ", int(dtype))); return; } } @@ -387,46 +385,47 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx, ->CudaStreamMemberHack())); // TODO(jie): trt enqueue does not return error - auto trt_execution_context_ptr = engine_ctx_pair.second; + auto& trt_execution_context_ptr = engine_ctx_pair.second; auto ret = trt_execution_context_ptr->enqueue(num_batch, &buffers[0], *stream, nullptr); if (!ret) { - LOG(ERROR) << "Enqueueing of TRT execution failed!"; + LOG(ERROR) << "Failed to enqueue batch for TRT engine: " << name(); + ctx->SetStatus(tensorflow::errors::Internal( + "Failed to enqueue batch for TRT engine: ", name())); } // sync should be done by TF. } TRTEngineOp::~TRTEngineOp() { - // Order matters! - for (auto eng : engine_map_) { + // We need to manually destroy the engine and execution context before + // the allocator is destructed. 
+ for (auto& eng : engine_map_) { eng.second.first.reset(); eng.second.second.reset(); } - for (auto alloc : allocators_) alloc.second.reset(); + allocator_.reset(); } nvinfer1::IGpuAllocator* TRTEngineOp::GetAllocator(OpKernelContext* ctx) { + if (allocator_) return allocator_.get(); auto device = ctx->device(); - const auto& device_name = device->name(); - if (allocators_.count(device_name)) { - return allocators_.at(device_name).get(); - } - auto dev_allocator = device->GetAllocator(tensorflow::AllocatorAttributes()); - if (!dev_allocator) { + auto alloc = device->GetAllocator(tensorflow::AllocatorAttributes()); + if (!alloc) { LOG(ERROR) << "Can't find device allocator for gpu device " << device->name(); ctx->SetStatus(tensorflow::errors::Internal( - StrCat("Can't get device allocator for device ", device_name))); + "Can't get device allocator for device ", device->name())); return nullptr; } - auto allocator = std::make_shared<TRTDeviceAllocator>(dev_allocator); - allocators_.insert({device_name, allocator}); - return allocator.get(); + allocator_.reset(new TRTDeviceAllocator(alloc)); + return allocator_.get(); } -TRTEngineOp::EngineCtxPair TRTEngineOp::GetEngine(int batch_size, - OpKernelContext* ctx, - bool ignore_dim_change) { +TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size, + OpKernelContext* ctx) { + static EngineCtxPair null_pair = { + TrtUniquePtrType<nvinfer1::ICudaEngine>(nullptr), + TrtUniquePtrType<nvinfer1::IExecutionContext>(nullptr)}; // TODO(sami): This method needs to be re-written to use resource manager and // with LRU mechanism option. 
tensorflow::mutex_lock lock(engine_mutex_); @@ -435,113 +434,106 @@ TRTEngineOp::EngineCtxPair TRTEngineOp::GetEngine(int batch_size, if (engine_map_.size()) { if (engine_map_.begin()->first >= batch_size) { return engine_map_.begin()->second; - } else { - return {nullptr, nullptr}; } - } else { - std::shared_ptr<IRuntime> infer(nvinfer1::createInferRuntime(logger), - [](IRuntime* p) { - if (p) p->destroy(); - }); + return null_pair; + } + TrtUniquePtrType<IRuntime> infer(nvinfer1::createInferRuntime(logger)); #if NV_TENSORRT_MAJOR > 3 - auto allocator = GetAllocator(ctx); - if (allocator == nullptr) { - return {nullptr, nullptr}; - }; - infer->setGpuAllocator(allocator); + auto allocator = GetAllocator(ctx); + if (allocator == nullptr) { + return null_pair; + }; + infer->setGpuAllocator(allocator); #endif - std::shared_ptr<nvinfer1::ICudaEngine> static_engine( - infer->deserializeCudaEngine(serialized_segment_.c_str(), - serialized_segment_.size(), nullptr), - Destroyer<nvinfer1::ICudaEngine>()); - engine_map_.insert({static_engine->getMaxBatchSize(), - {static_engine, - {static_engine->createExecutionContext(), - Destroyer<nvinfer1::IExecutionContext>()}}}); - // Runtime is safe to delete after engine creation - serialized_segment_.clear(); - if (static_engine->getMaxBatchSize() < batch_size) { - return {nullptr, nullptr}; - } - return engine_map_.at(static_engine->getMaxBatchSize()); - } - } else { - auto engine_it = engine_map_.find(batch_size); - if (engine_it == engine_map_.end() && - engine_map_.size() < (size_t)max_cached_engines_) { - auto builder = std::shared_ptr<nvinfer1::IBuilder>( - nvinfer1::createInferBuilder(logger), - Destroyer<nvinfer1::IBuilder>()); // reset the builder to ensure - // device is correct + TrtUniquePtrType<nvinfer1::ICudaEngine> static_engine( + infer->deserializeCudaEngine(serialized_segment_.c_str(), + serialized_segment_.size(), nullptr)); + auto raw_static_engine = static_engine.get(); + const auto max_batch_size = 
raw_static_engine->getMaxBatchSize(); + engine_map_[max_batch_size] = { + std::move(static_engine), + TrtUniquePtrType<nvinfer1::IExecutionContext>( + raw_static_engine->createExecutionContext())}; + // Runtime is safe to delete after engine creation + serialized_segment_.clear(); + if (max_batch_size < batch_size) return null_pair; + return engine_map_.at(max_batch_size); + } // static_engine_ + + // Handle the dynamic engine case. + auto engine_it = engine_map_.find(batch_size); + if (engine_it == engine_map_.end() && + engine_map_.size() < (size_t)max_cached_engines_) { + TrtUniquePtrType<nvinfer1::IBuilder> builder( + nvinfer1::createInferBuilder(logger)); #if NV_TENSORRT_MAJOR > 3 - auto allocator = GetAllocator(ctx); - if (allocator == nullptr) { - return {nullptr, nullptr}; - } - builder->setGpuAllocator(allocator); + auto allocator = GetAllocator(ctx); + if (allocator == nullptr) { + // GetAllocator already set the Status. + return null_pair; + } + builder->setGpuAllocator(allocator); #endif - VLOG(0) << name() << " Constructing a new engine with batch size " - << batch_size; - builder->setMaxBatchSize(batch_size); - if (precision_mode_ == tensorflow::tensorrt::convert::FP16MODE) { - builder->setHalf2Mode(true); - } else if (precision_mode_ == tensorflow::tensorrt::convert::INT8MODE) { - builder->setInt8Mode(true); - builder->setInt8Calibrator(calibrator_.get()); - } - builder->setMaxWorkspaceSize(workspace_size_); - nvinfer1::ICudaEngine* engine = nullptr; - std::vector<tensorflow::PartialTensorShape> shapes; - for (int i = 0; i < ctx->num_inputs(); ++i) { - shapes.emplace_back(ctx->input(i).shape()); - } - VLOG(1) << "Calling conversion for " << batch_size << " " << name(); - auto status = tensorflow::tensorrt::convert::ConvertSubgraphToEngine( - segment_graph_, builder.get(), shapes, &engine, precision_mode_); - VLOG(1) << "Conversion is done"; - if (engine) { - engine_map_[batch_size] = { - std::shared_ptr<nvinfer1::ICudaEngine>( - engine, 
Destroyer<nvinfer1::ICudaEngine>()), - std::shared_ptr<nvinfer1::IExecutionContext>( - engine->createExecutionContext(), - Destroyer<nvinfer1::IExecutionContext>())}; - } else { - LOG(ERROR) << "Engine creation for batch size " << batch_size - << " failed"; - ctx->SetStatus(tensorflow::errors::Internal("Engine creation failed!")); + VLOG(0) << name() << " Constructing a new engine with batch size " + << batch_size; + builder->setMaxBatchSize(batch_size); + if (precision_mode_ == convert::FP16MODE) { + builder->setHalf2Mode(true); + } else if (precision_mode_ == convert::INT8MODE) { + builder->setInt8Mode(true); + // TODO(aaroey): what if it's empty? I.e. when calibration data is empty? + builder->setInt8Calibrator(calibrator_.get()); + } + // TODO(aaroey): use the allocator to allocate the TRT workspace. + builder->setMaxWorkspaceSize(workspace_size_); + std::vector<tensorflow::PartialTensorShape> shapes; + for (int i = 0; i < ctx->num_inputs(); ++i) { + shapes.emplace_back(ctx->input(i).shape()); + } + TrtUniquePtrType<nvinfer1::ICudaEngine> engine; + bool convert_successfully = false; + VLOG(1) << "Calling conversion for " << batch_size << " " << name(); + auto status = convert::ConvertSubGraphDefToEngine( + segment_graph_, precision_mode_, shapes, builder.get(), &engine, + &convert_successfully); + if (!status.ok()) { + if (convert_successfully) { + // This means it fail to build the engine even when the network is built + // successfully, probably due to internal issues. In this case we don't + // retry in the future. 
engine_map_[batch_size] = {nullptr, nullptr}; - return {nullptr, nullptr}; } + LOG(ERROR) << "Engine creation for batch size " << batch_size + << " failed " << status; + ctx->SetStatus(tensorflow::errors::Internal("Engine creation failed!")); + return null_pair; } - return engine_map_.at(batch_size); + VLOG(1) << "Conversion is done"; + TrtUniquePtrType<nvinfer1::IExecutionContext> exec_context( + engine->createExecutionContext()); + engine_map_[batch_size] = {std::move(engine), std::move(exec_context)}; } + return engine_map_.at(batch_size); } tensorflow::Status TRTEngineOp::AllocateCalibrationResources( tensorflow::OpKernelContext* ctx, - tensorflow::tensorrt::TRTCalibrationResource** cr) { + TRTCalibrationResource** cr) { auto cres = new TRTCalibrationResource(); *cr = cres; - cres->logger_ = new tensorflow::tensorrt::Logger(); + cres->logger_ = new Logger(); #if NV_TENSORRT_MAJOR > 3 - auto dev = ctx->device(); - auto dev_allocator = dev->GetAllocator(tensorflow::AllocatorAttributes()); - if (!dev_allocator) { + auto alloc = ctx->device()->GetAllocator(tensorflow::AllocatorAttributes()); + if (!alloc) { LOG(WARNING) << "Can't get device allocator will not be able to " "allocate memory from TensorFlow memory pool"; - cres->allocator_ = - std::make_shared<tensorflow::tensorrt::TRTCudaAllocator>(); + cres->allocator_.reset(new TRTCudaAllocator); } else { - cres->allocator_ = - std::make_shared<tensorflow::tensorrt::TRTDeviceAllocator>( - dev_allocator); + cres->allocator_.reset(new TRTDeviceAllocator(alloc)); } - #endif int batch_size = ctx->input(0).dim_size(0); - cres->engine_ = nullptr; std::vector<tensorflow::PartialTensorShape> shapes; int num_inputs = ctx->num_inputs(); // first run instantiate calibrator @@ -558,7 +550,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( void* device_address = GetTensorAddress(device_tensor); if (device_address == nullptr) { return tensorflow::errors::InvalidArgument( - StrCat("Unsupported data type 
encountered in input ", i)); + "Unsupported data type encountered in input ", i); } device_buffers_.emplace( StrCat(kInputPHName, i), @@ -579,26 +571,29 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( batch_size, workspace_size]() { VLOG(0) << "Starting calibration thread on device " << cuda_device << ", Calibration Resource @ " << cres; - // ConvertSubgraphToEngine() will try to build the engine and this thread - // will be consuming the calibration data that is set by the TF op, driving - // the builder until calibrator returns false; Engine is discarded after - // calibration table is generated auto err = cudaSetDevice(cuda_device); if (err != cudaSuccess) { VLOG(0) << "Couldn't set cuda device to " << cuda_device << " in calibration thread"; } // initialize builder here - cres->builder_ = nvinfer1::createInferBuilder(*(cres->logger_)); - cres->builder_->setGpuAllocator(cres->allocator_.get()); + cres->builder_.reset(nvinfer1::createInferBuilder(*(cres->logger_))); + // TODO(aaroey): maybe setting the max batch size using the python + // calibration wrapper class. cres->builder_->setMaxBatchSize(batch_size); +#if NV_TENSORRT_MAJOR > 3 + cres->builder_->setGpuAllocator(cres->allocator_.get()); +#endif cres->builder_->setInt8Mode(true); cres->builder_->setMaxWorkspaceSize(workspace_size); cres->builder_->setInt8Calibrator(cres->calibrator_); - auto s = tensorflow::tensorrt::convert::ConvertSubgraphToEngine( - *segment_graph, cres->builder_, shapes, &cres->engine_, - tensorflow::tensorrt::convert::INT8MODE); // calibrator will loop until - // we terminate calibration + // ConvertSubGraphDefToEngine() will try to build the engine. This thread + // will loop inside buildCudaEngine() consuming the calibration data + // that is set by the TF op, and drive the builder until calibrator returns + // false. 
Engine is discarded after calibration table is generated + auto s = convert::ConvertSubGraphDefToEngine( + *segment_graph, convert::INT8MODE, shapes, cres->builder_.get(), + &cres->engine_, /*convert_successfully=*/nullptr); if (!s.ok()) { LOG(ERROR) << "Calibration failed. Engine will not be calibrated! Error is" << s; @@ -609,6 +604,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( VLOG(1) << "initialized calibrator resource"; return tensorflow::Status::OK(); } + REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp); } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h index 6faef09b62..cb43403130 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h @@ -19,6 +19,7 @@ limitations under the License. #include <memory> #include <vector> +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" @@ -33,7 +34,6 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -class Logger; class TRTInt8Calibrator; class TRTCalibrationResource; class AsyncHelper; @@ -50,13 +50,6 @@ class TRTEngineOp : public AsyncOpKernel { ~TRTEngineOp(); private: - template <typename T> - struct Destroyer { - void operator()(T* d) { - if (d) d->destroy(); - } - }; - // Execute calibration void ExecuteCalibration(tensorflow::OpKernelContext* ctx, AsyncHelper* helper); @@ -74,11 +67,10 @@ class TRTEngineOp : public AsyncOpKernel { tensorflow::tensorrt::TRTCalibrationResource** cr); // TODO(samikama): context should go to a resource manager! 
- typedef std::pair<std::shared_ptr<nvinfer1::ICudaEngine>, - std::shared_ptr<nvinfer1::IExecutionContext>> + typedef std::pair<TrtUniquePtrType<nvinfer1::ICudaEngine>, + TrtUniquePtrType<nvinfer1::IExecutionContext>> EngineCtxPair; - EngineCtxPair GetEngine(int batch_size, OpKernelContext* ctx, - bool ignore_dim_change = true); + EngineCtxPair& GetEngine(int batch_size, OpKernelContext* ctx); // Return engine batch closest to input batch. int GetEngineBatch(OpKernelContext* ctx); @@ -89,32 +81,45 @@ class TRTEngineOp : public AsyncOpKernel { std::unordered_map<int, EngineCtxPair> engine_map_; std::vector<string> input_nodes_; std::vector<string> output_nodes_; + // keep device allocator for TRT. - std::unordered_map<string, std::shared_ptr<TRTDeviceAllocator>> allocators_; + std::unique_ptr<TRTDeviceAllocator> allocator_; + // serialized protobuf segment or trt engine depending on static_engine_ flag. string serialized_segment_; + // Name of the function for TF native execution of the segment. string funcdef_name_; + // GraphDef representation of the segment. tensorflow::GraphDef segment_graph_; + // Lookup table for temporary staging areas of input tensors for calibration. std::unordered_map<string, std::pair<void*, size_t>> device_buffers_; + // Temporary staging areas for calibration inputs. std::vector<tensorflow::PersistentTensor> dev_tensors_; + // Engine Precision mode. int precision_mode_; + // Whether engine is constructed during the conversion or needs to be // constructed from protobuf segment. bool static_engine_; + // Whether to calibrate INT8 engine. bool calibration_mode_; + // Whether non-batch ranks of the inputs are assumed to be fixed or not for - // engine construction + // engine construction. 
bool fixed_input_size_; + // Batches of the cached engines std::vector<int> cached_engine_batches_; + // Maximum number of cached engines int max_cached_engines_; + tensorflow::int64 workspace_size_; tensorflow::mutex engine_mutex_; tensorflow::FunctionLibraryRuntime::Handle native_func_; diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h index 894e9d6e85..994312d7c3 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h +++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h @@ -39,30 +39,46 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { TRTInt8Calibrator( const std::unordered_map<string, std::pair<void*, size_t>>& dev_buffers, int batch_size, string engine_name); + TRTInt8Calibrator(const string& calibration_data); + + ~TRTInt8Calibrator(); + int getBatchSize() const override; + bool getBatch(void* bindings[], const char* names[], int num_bindings) override; + bool setBatch(const std::unordered_map<string, void*>& data, const cudaStream_t stream); + void setDone(); + + // If not null, calibration is skipped. const void* readCalibrationCache(std::size_t& length) override; + void writeCalibrationCache(const void* ptr, std::size_t length) override; + const string& getCalibrationTableAsString() { return calibration_table_; } - ~TRTInt8Calibrator(); private: const int batch_size_; - tensorflow::mutex cond_mtx_; // mutex for condition_variable - tensorflow::condition_variable cond_; // condition variable to implement - // producer-consumer queue for - // calibration + + // mutex for condition_variable + tensorflow::mutex cond_mtx_; + + // condition variable to implement producer-consumer queue for calibration + tensorflow::condition_variable cond_; + + // Is calibration finished? 
bool done_; - const std::unordered_map<string, std::pair<void*, size_t>> - dev_buffers_; // map to keep tensorrt input buffers and sizes keyed with - // buffer names + + // Map to keep tensorrt input buffers and sizes keyed with buffer names + const std::unordered_map<string, std::pair<void*, size_t>> dev_buffers_; + bool calib_running_; bool batch_is_set_; + string engine_name_; string calibration_table_; }; diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h index 022639dc01..43734bbdd8 100644 --- a/tensorflow/contrib/tensorrt/resources/trt_resources.h +++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h @@ -22,6 +22,7 @@ limitations under the License. #include <thread> #include <vector> +#include "tensorflow/contrib/tensorrt/convert/utils.h" #include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h" #include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h" @@ -34,21 +35,21 @@ limitations under the License. namespace tensorflow { namespace tensorrt { + class TRTCalibrationResource : public tensorflow::ResourceBase { public: TRTCalibrationResource() : calibrator_(nullptr), - builder_(nullptr), - network_(nullptr), - engine_(nullptr), logger_(nullptr), thr_(nullptr) {} ~TRTCalibrationResource() { VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString(); - builder_->destroy(); - network_->destroy(); - engine_->destroy(); + builder_.reset(); + engine_.reset(); + // We need to manually destroy the builder and engine before the allocator + // is destroyed. 
+ allocator_.reset(); delete thr_; delete logger_; delete calibrator_; @@ -56,22 +57,22 @@ class TRTCalibrationResource : public tensorflow::ResourceBase { string DebugString() override { std::stringstream oss; - oss << " Calibrator = " << std::hex << calibrator_ << std::dec << std::endl - << " Builder = " << std::hex << builder_ << std::dec << std::endl - << " Network = " << std::hex << network_ << std::dec << std::endl - << " Engine = " << std::hex << engine_ << std::dec << std::endl - << " Logger = " << std::hex << logger_ << std::dec << std::endl - << " Allocator = " << std::hex << allocator_.get() << std::dec - << std::endl - << " Thread = " << std::hex << thr_ << std::dec << std::endl; + using std::hex; + using std::dec; + using std::endl; + oss << " Calibrator = " << hex << calibrator_ << dec << endl + << " Builder = " << hex << builder_.get() << dec << endl + << " Engine = " << hex << engine_.get() << dec << endl + << " Logger = " << hex << logger_ << dec << endl + << " Allocator = " << hex << allocator_.get() << dec << endl + << " Thread = " << hex << thr_ << dec << endl; return oss.str(); } TRTInt8Calibrator* calibrator_; - nvinfer1::IBuilder* builder_; - nvinfer1::INetworkDefinition* network_; - nvinfer1::ICudaEngine* engine_; - std::shared_ptr<nvinfer1::IGpuAllocator> allocator_; + TrtUniquePtrType<nvinfer1::IBuilder> builder_; + TrtUniquePtrType<nvinfer1::ICudaEngine> engine_; + std::unique_ptr<nvinfer1::IGpuAllocator> allocator_; tensorflow::tensorrt::Logger* logger_; // TODO(sami): Use threadpool threads! 
std::thread* thr_; diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h index 1568dd9153..81b4bfe49f 100644 --- a/tensorflow/contrib/tensorrt/segment/segment.h +++ b/tensorflow/contrib/tensorrt/segment/segment.h @@ -29,8 +29,9 @@ namespace tensorflow { namespace tensorrt { namespace segment { -// vector of segments, each entry contains a device name and a set of nodes in -// segment +// Vector of segments, each entry contains a set of node names and a device name +// in the segment. +// TODO(aaroey): use node pointer instead of node name. using SegmentNodesVector = std::vector<std::pair<std::set<string>, string>>; struct SegmentOptions { @@ -48,6 +49,8 @@ struct SegmentOptions { // in the vector describes a subgraph by giving a set of the names of // all the NodeDefs in that subgraph. // @return the status. +// +// TODO(aaroey): remove this method. tensorflow::Status SegmentGraph( const tensorflow::GraphDef& gdef, const std::function<bool(const tensorflow::Node*)>& candidate_fn, |