18 files changed, 2040 insertions, 433 deletions
diff --git a/configure.py b/configure.py
index 3aa1a3e956..68c9bbfb1c 100644
--- a/configure.py
+++ b/configure.py
@@ -1050,12 +1050,15 @@ def set_tf_tensorrt_install_path(environ_cp):
     cuda_ver = convert_version_to_int(environ_cp['TF_CUDA_VERSION'])
     cudnn_ver = convert_version_to_int(environ_cp['TF_CUDNN_VERSION'])
-    nvinfer_pattern = re.compile('.*libnvinfer.so.?(.*)$')
+    nvinfer_pattern = re.compile('.*libnvinfer(?:_debug)?.so.?(.*)$')
     highest_ver = [0, None, None]
     for lib_file in possible_files:
       if is_compatible(lib_file, cuda_ver, cudnn_ver):
-        ver_str = nvinfer_pattern.search(lib_file).group(1)
+        matches = nvinfer_pattern.search(lib_file)
+        if not matches:
+          continue
+        ver_str = matches.group(1)
         ver = convert_version_to_int(ver_str) if len(ver_str) else 0
         if ver > highest_ver[0]:
           highest_ver = [ver, ver_str, lib_file]
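For reference, a quick standalone check of the widened pattern (plain Python; the library paths here are made up). Non-matching libraries are skipped instead of crashing on a None match, which is what the guard above fixes:

    import re

    # Same pattern the configure script now uses: optionally matches the
    # "_debug" variant of the TensorRT runtime library.
    nvinfer_pattern = re.compile('.*libnvinfer(?:_debug)?.so.?(.*)$')

    for lib_file in ['/usr/lib/x86_64-linux-gnu/libnvinfer.so.4.0.1',
                     '/usr/lib/x86_64-linux-gnu/libnvinfer_debug.so.4.0.1',
                     '/usr/lib/x86_64-linux-gnu/libnvparsers.so.4.0.1']:
        matches = nvinfer_pattern.search(lib_file)
        if not matches:
            continue  # not an nvinfer library at all
        print(lib_file, '->', matches.group(1))  # version suffix, e.g. "4.0.1"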
name="trt_shape_function", + srcs=["shape_fn/trt_shfn.cc"], + hdrs=["shape_fn/trt_shfn.h"], + visibility=["//visibility:public"], + deps=[ + ":trt_logging", + ] + if_tensorrt([ + "@local_config_tensorrt//:nv_infer", + ]) + tf_custom_op_library_additional_deps(), ) cc_library( - name = "trt_engine_op_kernel", - srcs = ["kernels/trt_engine_op.cc"], - hdrs = ["kernels/trt_engine_op.h"], - copts = tf_copts(), - deps = [ - ":trt_logging", - "//tensorflow/core:gpu_headers_lib", - "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core:stream_executor_headers_lib", - ] + if_tensorrt([ - "@local_config_tensorrt//:nv_infer", - ]) + tf_custom_op_library_additional_deps(), - alwayslink = 1, + name="trt_engine_op_kernel", + srcs=[ + "kernels/trt_calib_op.cc", + "kernels/trt_engine_op.cc", + ], + hdrs=[ + "kernels/trt_calib_op.h", + "kernels/trt_engine_op.h", + ], + copts=tf_copts(), + deps=[ + ":trt_logging", + ":trt_resources", + "//tensorflow/core:gpu_headers_lib", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:stream_executor_headers_lib", + ] + if_tensorrt([ + "@local_config_tensorrt//:nv_infer", + ]) + tf_custom_op_library_additional_deps(), + alwayslink=1, ) tf_gen_op_libs( - op_lib_names = ["trt_engine_op"], - deps = if_tensorrt([ - "@local_config_tensorrt//:nv_infer", - ]), + op_lib_names=[ + "trt_engine_op", + "trt_calib_op", + ], + deps=if_tensorrt([ + "@local_config_tensorrt//:nv_infer", + ]), ) tf_cuda_library( - name = "trt_logging", - srcs = ["log/trt_logger.cc"], - hdrs = ["log/trt_logger.h"], - visibility = ["//visibility:public"], - deps = [ - "//tensorflow/core:lib_proto_parsing", - ] + if_tensorrt([ - "@local_config_tensorrt//:nv_infer", - ]), + name="trt_logging", + srcs=["log/trt_logger.cc"], + hdrs=["log/trt_logger.h"], + visibility=["//visibility:public"], + deps=[ + "//tensorflow/core:lib_proto_parsing", + ] + if_tensorrt([ + "@local_config_tensorrt//:nv_infer", + ]), ) tf_gen_op_wrapper_py( - name = "trt_engine_op", - deps = [ - ":trt_engine_op_op_lib", - ":trt_logging", - ":trt_shape_function", - ], + name="trt_engine_op", + deps=[ + ":trt_engine_op_op_lib", + ":trt_calib_op_op_lib", + ":trt_logging", + ":trt_shape_function", + ], ) tf_custom_op_py_library( - name = "trt_engine_op_loader", - srcs = ["python/ops/trt_engine_op.py"], - dso = [ + name="trt_engine_op_loader", + srcs=["python/ops/trt_engine_op.py"], + dso=[ ":python/ops/_trt_engine_op.so", - ] + if_tensorrt([ - "@local_config_tensorrt//:nv_infer", - ]), - srcs_version = "PY2AND3", - deps = [ - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:resources", - ], + ] + if_tensorrt([ + "@local_config_tensorrt//:nv_infer", + ]), + srcs_version="PY2AND3", + deps=[ + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:resources", + ], ) py_library( - name = "init_py", - srcs = [ - "__init__.py", - "python/__init__.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":trt_convert_py", - ":trt_ops_py", - ], + name="init_py", + srcs=[ + "__init__.py", + "python/__init__.py", + ], + srcs_version="PY2AND3", + deps=[ + ":trt_convert_py", + ":trt_ops_py", + ], ) py_library( - name = "trt_ops_py", - srcs_version = "PY2AND3", - deps = [ - ":trt_engine_op", - ":trt_engine_op_loader", - ], + name="trt_ops_py", + srcs_version="PY2AND3", + deps=[ + ":trt_engine_op", + ":trt_engine_op_loader", + ], ) py_library( - name = "trt_convert_py", - srcs = ["python/trt_convert.py"], - srcs_version = "PY2AND3", - deps = [ - ":wrap_conversion", - ], + name="trt_convert_py", 
+ srcs=["python/trt_convert.py"], + srcs_version="PY2AND3", + deps=[ + ":wrap_conversion", + ], ) tf_py_wrap_cc( - name = "wrap_conversion", - srcs = ["trt_conversion.i"], - copts = tf_copts(), - deps = [ - ":trt_conversion", - "//tensorflow/core:framework_lite", - "//util/python:python_headers", - ], + name="wrap_conversion", + srcs=["trt_conversion.i"], + copts=tf_copts(), + deps=[ + ":trt_conversion", + "//tensorflow/core:framework_lite", + "//util/python:python_headers", + ], +) + +tf_cuda_library( + name="trt_resources", + srcs=[ + "resources/TRTInt8Calibrator.cc", + "resources/TRTResourceManager.cc", + ], + hdrs=[ + "resources/TRTInt8Calibrator.h", + "resources/TRTResourceManager.h", + "resources/TRTResources.h", + ], + deps=[ + "@local_config_tensorrt//:nv_infer", + "//tensorflow/core:framework_headers_lib", + "//tensorflow/core:framework_lite", + "//tensorflow/core:lib_proto_parsing", + + ], ) # Library for the node-level conversion portion of TensorRT operation creation tf_cuda_library( - name = "trt_conversion", - srcs = [ - "convert/convert_graph.cc", - "convert/convert_nodes.cc", - ], - hdrs = [ - "convert/convert_graph.h", - "convert/convert_nodes.h", - ], - deps = [ - ":segment", - ":trt_logging", - "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler:utils", - "//tensorflow/core:framework", - "//tensorflow/core:framework_lite", - "//tensorflow/core:graph", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/grappler:devices", - "//tensorflow/core/grappler/clusters:virtual_cluster", - "//tensorflow/core/grappler/costs:graph_properties", - "//tensorflow/core/grappler/optimizers:constant_folding", - "//tensorflow/core/grappler/optimizers:layout_optimizer", - ] + if_tensorrt([ - "@local_config_tensorrt//:nv_infer", - ]) + tf_custom_op_library_additional_deps(), + name="trt_conversion", + srcs=[ + "convert/convert_graph.cc", + "convert/convert_nodes.cc", + ], + hdrs=[ + "convert/convert_graph.h", + "convert/convert_nodes.h", + ], + deps=[ + ":segment", + ":trt_logging", + ":trt_resources", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler:utils", + "//tensorflow/core:framework", + "//tensorflow/core:framework_lite", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:devices", + "//tensorflow/core/grappler/clusters:virtual_cluster", + "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/grappler/optimizers:constant_folding", + "//tensorflow/core/grappler/optimizers:layout_optimizer", + ] + if_tensorrt([ + "@local_config_tensorrt//:nv_infer", + ]) + tf_custom_op_library_additional_deps(), ) # Library for the segmenting portion of TensorRT operation creation cc_library( - name = "segment", - srcs = ["segment/segment.cc"], - hdrs = [ - "segment/segment.h", - "segment/union_find.h", - ], - linkstatic = 1, - deps = [ - "//tensorflow/core:graph", - "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core:protos_all_cc", - "@protobuf_archive//:protobuf_headers", - ], + name="segment", + srcs=["segment/segment.cc"], + hdrs=[ + "segment/segment.h", + "segment/union_find.h", + ], + linkstatic=1, + deps=[ + "//tensorflow/core:graph", + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:protos_all_cc", + "@protobuf_archive//:protobuf_headers", + ], ) tf_cc_test( - name = "segment_test", - size = "small", - srcs = 
["segment/segment_test.cc"], - deps = [ - ":segment", - "//tensorflow/c:c_api", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - ], + name="segment_test", + size="small", + srcs=["segment/segment_test.cc"], + deps=[ + ":segment", + "//tensorflow/c:c_api", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], ) filegroup( - name = "all_files", - srcs = glob( - ["**/*"], - exclude = [ - "**/METADATA", - "**/OWNERS", - ], - ), - visibility = ["//tensorflow:__subpackages__"], + name="all_files", + srcs=glob( + ["**/*"], + exclude=[ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility=["//tensorflow:__subpackages__"], ) diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index 899448004f..31ba30b2d9 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/contrib/tensorrt/convert/convert_graph.h" +#include <list> #include <map> #include <set> #include <unordered_map> @@ -39,8 +40,8 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/device_properties.pb.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +//#if GOOGLE_CUDA +//#if GOOGLE_TENSORRT #include "tensorrt/include/NvInfer.h" namespace tensorflow { @@ -48,13 +49,28 @@ namespace tensorrt { namespace convert { namespace { -static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) { +bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) { // LINT.IfChange // TODO(jie): Segmentation shouldn't associated with op name. // Split it into a registration for each kernel. static const std::set<string> candidate_ops = { - "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu", - "Add", "Mul", "Sub", "Rsqrt", "Pad" // "Placeholder" ,"Mean" + "Identity", + "Const", + "Conv2D", + "MaxPool", + "BiasAdd", + "Relu", + "Add", + "Mul", + "Sub", + "Rsqrt", + "Pad", + "Mean", + "AvgPool", + "ConcatV2", + "DepthwiseConv2dNative" //, "MatMul", + //"Reshape" + // TODO(ben,jie): ... 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 899448004f..31ba30b2d9 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
 
+#include <list>
 #include <map>
 #include <set>
 #include <unordered_map>
@@ -39,8 +40,8 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
+//#if GOOGLE_CUDA
+//#if GOOGLE_TENSORRT
 #include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
@@ -48,13 +49,28 @@ namespace tensorrt {
 namespace convert {
 namespace {
 
-static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
+bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
   // LINT.IfChange
   // TODO(jie): Segmentation shouldn't associated with op name.
   //            Split it into a registration for each kernel.
   static const std::set<string> candidate_ops = {
-      "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
-      "Add", "Mul", "Sub", "Rsqrt", "Pad"  // "Placeholder" ,"Mean"
+      "Identity",
+      "Const",
+      "Conv2D",
+      "MaxPool",
+      "BiasAdd",
+      "Relu",
+      "Add",
+      "Mul",
+      "Sub",
+      "Rsqrt",
+      "Pad",
+      "Mean",
+      "AvgPool",
+      "ConcatV2",
+      "DepthwiseConv2dNative"  //, "MatMul",
+      //"Reshape"
+      // TODO(ben,jie): ...
   };
   // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
   return candidate_ops.count(node_def.op());
@@ -69,6 +85,8 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
     if (!subgraph_node_ids.count(edge->src()->id()) &&
         !edge->src()->IsSource()) {
       incoming_edges->insert(edge);
+    } else {
+      VLOG(2) << edge->src()->name() << " N, ";
     }
   }
 }
@@ -82,7 +100,10 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
   for (const tensorflow::Edge* edge : node->out_edges()) {
     if (!subgraph_node_ids.count(edge->dst()->id()) &&
         !edge->dst()->IsSink()) {
+      VLOG(2) << edge->dst()->name() << " Y, ";
       outgoing_edges->insert(edge);
+    } else {
+      VLOG(2) << edge->dst()->name() << " N, ";
     }
   }
 }
@@ -110,73 +131,147 @@ std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
   return result;
 }
 
-tensorflow::Status ConvertSubGraphToTensorRT(
-    const std::vector<string>& output_names,
-    const std::set<int>& subgraph_node_ids,
-    size_t max_batch_size,  // Max batch size that engine will be created for
-    // Max amount of memory that engine will be allowed to consume, in bytes
-    size_t max_workspace_size_bytes,
-    const tensorflow::grappler::GraphProperties& graph_properties,
-    tensorflow::Graph* graph) {
-  tensorflow::EdgeSet subgraph_incoming_edges;
-  GetSubGraphIncomingEdges(*graph, subgraph_node_ids,
-                           &subgraph_incoming_edges);
-
+struct ConvertGraphParams {
+  ConvertGraphParams(
+      tensorflow::Graph& graph_, const std::vector<string>& output_names_,
+      const std::set<int>& subgraph_node_ids_, size_t max_batch_size_,
+      size_t max_workspace_size_bytes_,
+      const tensorflow::grappler::GraphProperties& graph_properties_,
+      std::unordered_map<string, std::pair<int, string>>* output_edge_map_,
+      int precision_mode_)
+      : graph(graph_),
+        output_names(output_names_),
+        subgraph_node_ids(subgraph_node_ids_),
+        max_batch_size(max_batch_size_),
+        max_workspace_size_bytes(max_workspace_size_bytes_),
+        graph_properties(graph_properties_),
+        output_edge_map(output_edge_map_),
+        precision_mode(precision_mode_) {}
+  tensorflow::Graph& graph;
+  const std::vector<string>& output_names;
+  const std::set<int>& subgraph_node_ids;
+  size_t max_batch_size;
+  size_t max_workspace_size_bytes;
+  const tensorflow::grappler::GraphProperties& graph_properties;
+  std::unordered_map<string, std::pair<int, string>>* output_edge_map;
+  int precision_mode;
   std::vector<std::pair<int, int>> subgraph_inputs;
+  std::vector<std::pair<int, int>> subgraph_outputs;
+  tensorflow::EdgeSet subgraph_incoming_edges;
+  tensorflow::EdgeSet subgraph_outgoing_edges;
+};
 
-  // Collect inputs by looking for incoming edges
-  for (const tensorflow::Edge* edge : subgraph_incoming_edges) {
-    subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
+tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams& p) {
+  GetSubGraphIncomingEdges(p.graph, p.subgraph_node_ids,
+                           &p.subgraph_incoming_edges);
+  for (tensorflow::Edge const* edge : p.subgraph_incoming_edges) {
+    p.subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
   }
+  auto output_name_to_index_map = BuildTensorNameMap(p.output_names);
   std::set<std::pair<int, int>> subgraph_outputs_set;
   // Collect outputs referenced from output_names
-  auto output_name_to_index_map = BuildTensorNameMap(output_names);
-  for (int node_id : subgraph_node_ids) {
-    tensorflow::Node* node = graph->FindNodeId(node_id);
+  for (int node_id : p.subgraph_node_ids) {
+    tensorflow::Node* node = p.graph.FindNodeId(node_id);
     if (output_name_to_index_map.count(node->name())) {
       for (int index : output_name_to_index_map.at(node->name())) {
         subgraph_outputs_set.insert({node_id, index});
       }
     }
   }
-  // Collect outputs referenced from outgoing edges
-  tensorflow::EdgeSet subgraph_outgoing_edges;
-  GetSubGraphOutgoingEdges(*graph, subgraph_node_ids,
-                           &subgraph_outgoing_edges);
-  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
+  GetSubGraphOutgoingEdges(p.graph, p.subgraph_node_ids,
+                           &p.subgraph_outgoing_edges);
+  for (const tensorflow::Edge* edge : p.subgraph_outgoing_edges) {
     subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
   }
-  // Impose an ordering on the outputs
-  std::vector<std::pair<int, int>> subgraph_outputs(
-      subgraph_outputs_set.begin(), subgraph_outputs_set.end());
-  // Build TensorRT node and add it to the graph
+  p.subgraph_outputs.reserve(subgraph_outputs_set.size());
+  p.subgraph_outputs.insert(p.subgraph_outputs.begin(),
+                            subgraph_outputs_set.begin(),
+                            subgraph_outputs_set.end());
+  return tensorflow::Status::OK();
+};
+
+tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
+  FillSubGraphEdgeSets(*params);
+  tensorflow::NodeDef trt_node_def;
+  SubGraphParams s(params->graph, params->subgraph_node_ids,
+                   params->subgraph_inputs, params->subgraph_outputs,
+                   params->max_batch_size, params->max_workspace_size_bytes,
+                   params->graph_properties, params->output_edge_map,
+                   &trt_node_def, params->precision_mode);
+  TF_RETURN_IF_ERROR(InjectCalibrationNode(s));
+  tensorflow::Status status;
+  tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
+
+  TF_RETURN_IF_ERROR(status);
+
+  // Loop over incoming edges and attach them to the calib node.
+  for (auto in_edge : params->subgraph_incoming_edges) {
+    // tensorflow::Node* src_node = in_edge->src();
+    auto src_output = in_edge->src_output();
+    auto dst_node = in_edge->dst();
+    auto dst_input = in_edge->dst_input();
+    VLOG(1) << " update edge " << trt_node->name() << ":" << src_output
+            << " -> " << dst_node->name() << ":" << dst_input;
+    params->graph.UpdateEdge(trt_node, src_output, dst_node, dst_input);
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
+  FillSubGraphEdgeSets(*params);
   tensorflow::NodeDef trt_node_def;
-  TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(
-      *graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
-      max_batch_size, max_workspace_size_bytes, graph_properties,
-      &trt_node_def));
+
+  SubGraphParams s(params->graph, params->subgraph_node_ids,
+                   params->subgraph_inputs, params->subgraph_outputs,
+                   params->max_batch_size, params->max_workspace_size_bytes,
+                   params->graph_properties, params->output_edge_map,
+                   &trt_node_def, params->precision_mode);
+  TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(s));
   tensorflow::Status status;
-  tensorflow::Node* trt_node = graph->AddNode(trt_node_def, &status);
+  tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
+
+  // AddNode does not wire edges.
+  // Re-map incoming edges to use the new TRT node instead of the orig subgraph
+  std::map<std::pair<int, int>, int> subgraph_edge_to_input_map;
+  for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) {
+    subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i});
+  }
+  for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) {
+    std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
+    int new_src_output = subgraph_edge_to_input_map.at(old_src);
+    params->graph.AddEdge(edge->src(), edge->src_output(), trt_node,
+                          new_src_output);
+    params->graph.RemoveEdge(edge);
+  }
+
+  VLOG(2) << "new wiring edges: " << trt_node->in_edges().size();
+  for (const tensorflow::Edge* edge : trt_node->in_edges()) {
+    VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
+  }
+
   TF_RETURN_IF_ERROR(status);
 
   // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
   std::map<std::pair<int, int>, int> subgraph_edge_to_output_map;
-  for (size_t i = 0; i < subgraph_outputs.size(); ++i) {
-    subgraph_edge_to_output_map.insert({subgraph_outputs.at(i), i});
+  for (size_t i = 0; i < params->subgraph_outputs.size(); ++i) {
+    subgraph_edge_to_output_map.insert({params->subgraph_outputs.at(i), i});
   }
   TF_RETURN_IF_ERROR(status);
-  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
+  for (const tensorflow::Edge* edge : params->subgraph_outgoing_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
-    TF_RETURN_IF_ERROR(graph->UpdateEdge(trt_node, new_src_output, edge->dst(),
-                                         edge->dst_input()));
+    TF_RETURN_IF_ERROR(params->graph.UpdateEdge(
+        trt_node, new_src_output, edge->dst(), edge->dst_input()));
   }
   // Remove the original subgraph
-  for (int node_id : subgraph_node_ids) {
-    tensorflow::Node* node = graph->FindNodeId(node_id);
+  for (int node_id : params->subgraph_node_ids) {
+    tensorflow::Node* node = params->graph.FindNodeId(node_id);
     // Don't remove the input placeholders
     if (node->type_string() == "Placeholder") {
       continue;
     }
-    graph->RemoveNode(node);
+    params->graph.RemoveNode(node);
   }
   return tensorflow::Status::OK();
 }
@@ -194,31 +289,64 @@ tensorflow::Status BuildNodeMap(
   }
 }
 
 }  // namespace
+tensorflow::Status ConvertCalibGraphToInferGraph(
+    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph) {
+  VLOG(0) << "Starting Calib Conversion";
+  tensorflow::Graph graph(tensorflow::OpRegistry::Global());
+  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
+      tensorflow::GraphConstructorOptions(), graph_def, &graph));
+  // get calib nodes
+  std::vector<tensorflow::Node*> calibNodes;
+  for (auto node : graph.op_nodes()) {
+    if (node->type_string() == "TRTCalibOp") {
+      VLOG(1) << "Found Calib Node";
+      calibNodes.push_back(node);
+    }
+  }
+  VLOG(0) << "Num Calib nodes in graph = " << calibNodes.size();
+  if (calibNodes.size() == 0)
+    return tensorflow::errors::FailedPrecondition(
+        "Graph doesn't contain any calibration nodes!"
+        " Please generate the calibration graph and run calibration first");
+  for (auto n : calibNodes) {
+    TF_RETURN_IF_ERROR(
+        tensorrt::convert::ConvertCalibrationNodeToEngineNode(graph, n));
+  }
+  return tensorflow::Status::OK();
+}
 
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def) {
-  // Optimization pass
+    size_t max_workspace_size, tensorflow::GraphDef* new_graph_def,
+    int precision_mode = 0) {
+  // optimization pass
   tensorflow::grappler::GrapplerItem item;
   item.fetch = output_names;
   tensorflow::GraphDef gdef;
 
-  // Layout optimization
+  // layout optimization
   item.graph = graph_def;
   tensorflow::grappler::LayoutOptimizer optimizer;
-  tensorflow::grappler::Cluster* cluster;
+  tensorflow::grappler::Cluster* gCluster;
 
-  // Virtual cluster
+  // virtual cluster
   tensorflow::DeviceProperties device_properties;
+  device_properties.set_type("GPU");
   device_properties.mutable_environment()->insert({"architecture", "6"});
-  cluster =
+  gCluster =
       new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
 
-  TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
+  // single machine
+  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
+  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
+  VLOG(2) << "cpu_cores: " << num_cpu_cores;
+  VLOG(2) << "gpus: " << num_gpus;
+
+  TF_RETURN_IF_ERROR(optimizer.Optimize(gCluster, item, &gdef));
 
-  // Constant folding
+  // constant folding
   item.graph = gdef;
   tensorflow::grappler::ConstantFolding fold(nullptr);
   TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
@@ -252,14 +380,27 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   }
   std::unordered_map<string, tensorflow::Node*> node_map;
   TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
+  std::unordered_map<string, std::pair<int, string>> output_edge_map;
+  int count = 0;
   for (const std::set<string>& subgraph_node_names : segments) {
     std::set<int> subgraph_node_ids;
     for (const string& node_name : subgraph_node_names) {
      subgraph_node_ids.insert(node_map.at(node_name)->id());
    }
-    TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
-        output_names, subgraph_node_ids, max_batch_size,
-        max_workspace_size_bytes, static_graph_properties, &graph));
+    ConvertGraphParams p(graph, output_names, subgraph_node_ids,
+                         max_batch_size, max_workspace_size,
+                         static_graph_properties, &output_edge_map,
+                         precision_mode);
+    if (precision_mode == 2) {
+      TF_RETURN_IF_ERROR(GetCalibNode(&p));
+    } else {
+      tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
+      if (status != tensorflow::Status::OK()) {
+        LOG(WARNING) << "subgraph conversion error for subgraph_index:"
+                     << count << " due to:\n"
+                     << status.ToString() << " SKIPPING......";
+      }
+      count++;
+    }
   }
   graph.ToGraphDef(new_graph_def);
   return tensorflow::Status::OK();
@@ -269,5 +410,5 @@
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
+//#endif  // GOOGLE_TENSORRT
+//#endif  // GOOGLE_CUDA
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +//#if GOOGLE_CUDA +//#if GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { +tensorflow::Status ConvertCalibGraphToInferGraph( + const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def); // max_batch_size: maximum batch size which can be used for inference for // optimization targets inference run with max batch size. @@ -35,13 +37,13 @@ namespace convert { tensorflow::Status ConvertGraphDefToTensorRT( const tensorflow::GraphDef& graph_def, const std::vector<string>& output_names, size_t max_batch_size, - size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def); - + size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, + int precision_mode); } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +//#endif // GOOGLE_TENSORRT +//#endif // GOOGLE_CUDA #endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_ diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 9ee717dd7f..ea0eb480f2 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -24,6 +24,11 @@ limitations under the License. #include <utility> #include <vector> +#include "tensorflow/contrib/tensorrt/log/trt_logger.h" +#include "tensorflow/contrib/tensorrt/resources/TRTResourceManager.h" +#include "tensorflow/contrib/tensorrt/resources/TRTResources.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor_shape.pb.h" // NOLINT #include "tensorflow/core/framework/types.h" @@ -32,14 +37,15 @@ limitations under the License. #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/tensor_coding.h" #include "tensorflow/core/platform/types.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT -#include "tensorflow/contrib/tensorrt/log/trt_logger.h" +//#if GOOGLE_CUDA +//#if GOOGLE_TENSORRT +// #include "tensorflow/contrib/tensorrt/log/trt_logger.h" #include "tensorrt/include/NvInfer.h" // Check if the types are equal. Cast to int first so that failure log message @@ -245,6 +251,11 @@ std::vector<int> TFAttrs::get<std::vector<int>>(string key) const { } template <> +std::vector<string> TFAttrs::get<std::vector<string>>(string key) const { + auto attr = this->at(key)->list().s(); + return std::vector<string>(attr.begin(), attr.end()); +} +template <> nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const { auto values = this->get<std::vector<int>>(key); nvinfer1::Dims dims; @@ -265,7 +276,7 @@ template <> tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const { return this->at(key)->type(); } - +// TODO(jie): reorder4 & reorder2 should be merged? 
template <typename T> void Reorder4(nvinfer1::DimsNCHW shape, const T* idata, nvinfer1::DimsNCHW istrides, T* odata, @@ -283,16 +294,53 @@ void Reorder4(nvinfer1::DimsNCHW shape, const T* idata, } } + +template <typename T> +void reorder2(nvinfer1::DimsHW shape, T const* idata, nvinfer1::DimsHW istrides, + T* odata, nvinfer1::DimsHW ostrides) { + for (int h = 0; h < shape.h(); ++h) { + for (int w = 0; w < shape.w(); ++w) { + odata[h * ostrides.h() + w * ostrides.w()] = + idata[h * ostrides.h() + w * ostrides.w()]; + } + } +} + +// TODO(jie): fail to tensorflow!! +void reorder_ck_to_kc(TRT_ShapedWeights const& iweights, + TRT_ShapedWeights* oweights) { + int c = iweights.shape_.d[0]; + int k = iweights.shape_.d[1]; + oweights->shape_.d[0] = k; + oweights->shape_.d[1] = c; + nvinfer1::DimsHW istrides = {1, k}; + nvinfer1::DimsHW ostrides = {c, 1}; + switch (iweights.type_) { + case tensorflow::DataType::DT_FLOAT: + reorder2({k, c}, static_cast<float const*>(iweights.GetValues()), istrides, + static_cast<float*>(const_cast<void*>(oweights->GetValues())), + ostrides); + break; + default: + LOG(FATAL) << "!!!!!!!!!!!!!!!!!!!!!!!!broke!!!!!!!!!!!!"; + } +} + void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights, - TRT_ShapedWeights* oweights) { + TRT_ShapedWeights* oweights, int nbGroups) { CHECK_EQ(iweights.type_, oweights->type_); CHECK_EQ(iweights.size_bytes(), oweights->size_bytes()); int r = iweights.shape_.d[0]; int s = iweights.shape_.d[1]; - int c = iweights.shape_.d[2]; - int k = iweights.shape_.d[3]; - oweights->shape_.d[0] = k; - oweights->shape_.d[1] = c; + // TRT requires GKcRS, while TF depthwise has RSCK + // where c=1, C=G + VLOG(2) << "nbGroups: " << nbGroups; + int c = iweights.shape_.d[2] / nbGroups; + VLOG(2) << "c" << iweights.shape_.d[2] << " then " << c; + int k = iweights.shape_.d[3] * nbGroups; + VLOG(2) << "k" << iweights.shape_.d[3] << " then " << k; + oweights->shape_.d[0] = k / nbGroups; + oweights->shape_.d[1] = c * nbGroups; oweights->shape_.d[2] = r; oweights->shape_.d[3] = s; nvinfer1::DimsNCHW istrides = {1, k, s * k * c, c * k}; @@ -342,9 +390,32 @@ class Converter { std::vector<TRT_TensorOrWeights> get_inputs( const tensorflow::NodeDef& node_def) { std::vector<TRT_TensorOrWeights> inputs; - for (const auto& input_name : node_def.input()) { - VLOG(2) << "Retrieve input: " << input_name; - inputs.push_back(trt_tensors_.at(input_name)); + for (auto const& input_name : node_def.input()) { + /************************************************************************* + * TODO(jie) handle case 1) here + * Normalizes the inputs and extracts associated metadata: + * 1) Inputs can contain a colon followed by a suffix of characters. + * That suffix may be a single number (e.g. inputName:1) or several + * word characters separated from a number by a colon + * (e.g. inputName:foo:1). The + * latter case is used to denote inputs and outputs of functions. + * 2) Control dependency inputs contain caret at the beginning and we + * remove this and annotate the edge as a control dependency. + ************************************************************************/ + string name = + input_name[0] == '^' ? 
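To make the reordering above concrete: TF stores convolution weights as RSCK (rows, cols, input channels, output channels), while TRT expects KCRS. A minimal numpy sketch of the same permutation, with invented shapes:

    import numpy as np

    # TF RSCK weights: [kernel_h, kernel_w, in_channels, out_channels]
    r, s, c, k = 3, 3, 8, 16
    rsck = np.arange(r * s * c * k, dtype=np.float32).reshape(r, s, c, k)

    # TRT KCRS: [out_channels, in_channels, kernel_h, kernel_w]
    kcrs = rsck.transpose(3, 2, 0, 1)
    print(kcrs.shape)  # (16, 8, 3, 3)

    # reorder_ck_to_kc (used by the MatMul path below) is the 2-D special case:
    ck = np.arange(8 * 16, dtype=np.float32).reshape(8, 16)  # [in, out]
    kc = ck.T                                                # [out, in]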
@@ -342,9 +390,32 @@ class Converter {
   std::vector<TRT_TensorOrWeights> get_inputs(
       const tensorflow::NodeDef& node_def) {
     std::vector<TRT_TensorOrWeights> inputs;
-    for (const auto& input_name : node_def.input()) {
-      VLOG(2) << "Retrieve input: " << input_name;
-      inputs.push_back(trt_tensors_.at(input_name));
+    for (auto const& input_name : node_def.input()) {
+      /*************************************************************************
+       * TODO(jie): handle case 1) here.
+       * Normalizes the inputs and extracts associated metadata:
+       * 1) Inputs can contain a colon followed by a suffix of characters.
+       *    That suffix may be a single number (e.g. inputName:1) or several
+       *    word characters separated from a number by a colon
+       *    (e.g. inputName:foo:1). The latter case is used to denote inputs
+       *    and outputs of functions.
+       * 2) Control dependency inputs contain a caret at the beginning, which
+       *    we remove and then annotate the edge as a control dependency.
+       ************************************************************************/
+      string name = input_name[0] == '^' ? input_name.substr(1) : input_name;
+      auto first = name.find_first_of(':');
+      if (first != string::npos && first + 2 == name.size() &&
+          name[first + 1] == '0')
+        name.erase(first);
+
+      VLOG(2) << "retrieve input: " << name;
+      if (trt_tensors_.count(name)) {
+        inputs.push_back(trt_tensors_.at(name));
+      } else {
+        LOG(FATAL) << "input: " << name << " not available for node "
+                   << node_def.name();
+      }
     }
     return inputs;
   }
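The input-name normalization above follows NodeDef input syntax: a leading '^' marks a control dependency and a trailing ':0' names the default output port. The same canonicalization as a standalone Python sketch:

    def canonicalize(input_name):
        # control-dependency inputs are prefixed with '^'
        is_control = input_name.startswith('^')
        name = input_name[1:] if is_control else input_name
        # a trailing ':0' names the default output and can be dropped
        if name.endswith(':0'):
            name = name[:-2]
        return name, is_control

    print(canonicalize('conv1:0'))  # ('conv1', False)
    print(canonicalize('^queue'))   # ('queue', True)
    print(canonicalize('split:1'))  # ('split:1', False) -- non-zero ports kept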
@@ -722,38 +793,83 @@ tensorflow::Status BinaryTensorOpWeight(
   auto dims_w = weights.shape_;
   auto dims_t = tensor->getDimensions();
 
-  // Default to channel-wise
+  // default to element-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
+  // TODO(jie): maybe use a permutation instead to support more cases;
+  bool permutation_flag = false;
+
   if (weights.count() == 1) {
     VLOG(2) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
-    // No broadcasting on Batch dimension;
-    assert(dims_w.d[0] == 1);
-
-    // Broadcasting on Channel dimension only allowed in kUNIFORM
-    assert(dims_w.d[1] == dims_t.d[0]);
-    assert(dims_w.nbDims == dims_t.nbDims);
-
-    // Default is element;
-    for (int i = 2; i < dims_w.nbDims; i++) {
-      if (dims_w.d[i] != dims_t.d[i - 1]) {
-        scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-        break;
+    // no broadcasting on Batch dimension;
+    VLOG(2) << "WEIGHTS DIM: " << dims_w.nbDims
+            << " tensor DIM: " << dims_t.nbDims;
+    if (dims_w.nbDims == dims_t.nbDims + 1) {
+      if (dims_w.d[0] == 1) {
+        for (int i = 1; i < dims_w.nbDims; i++) {
+          dims_w.d[i - 1] = dims_w.d[i];
+        }
+        dims_w.nbDims--;
+      } else {
+        return tensorflow::errors::InvalidArgument(
+            "Binary op cannot operate on batch, " + node_def.name());
       }
     }
-    if (scale_mode == nvinfer1::ScaleMode::kELEMENTWISE) {
+
+    if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) {
       scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      for (int i = 2; i < dims_w.nbDims; i++) {
-        if (dims_w.d[i] != 1)
-          return tensorflow::errors::InvalidArgument(
-              "Weight shape not compatible at, " + node_def.name());
+      // default is element-wise
+      for (int i = 1; i < dims_w.nbDims; i++) {
+        if (dims_w.d[i] != dims_t.d[i]) {
+          // if a dimension does not match, switch back to channel;
+          VLOG(2) << "channel";
+          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+          break;
+        }
+      }
+      // if channel is the candidate, validate it
+      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
+        for (int i = 1; i < dims_w.nbDims; i++) {
+          if (dims_w.d[i] != 1)
+            return tensorflow::errors::InvalidArgument(
+                "Weight shape not compatible at, " + node_def.name());
+        }
+      } else {
+        VLOG(2) << "elementwise";
       }
+    } else if (dims_w.nbDims == 1 &&
+               dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) {
+      // channel wise and broadcast required;
+      permutation_flag = true;
+      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+    } else {
+      return tensorflow::errors::InvalidArgument(
+          "Weight shape not compatible at, " + node_def.name());
     }
   }
 
-  // Prepare weights
+  // transpose last dimension
+  std::vector<int> permutation(dims_t.nbDims + 1);
+  if (permutation_flag) {
+    if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) {
+      // we swap the last dimension into channel for trt,
+      // because of tensorflow default broadcasting rules.
+      for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
+        permutation[i] = i;
+      }
+      permutation[1] = dims_t.nbDims;
+      permutation[dims_t.nbDims] = 1;
+      tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+                                   permutation);
+    } else {
+      return tensorflow::errors::InvalidArgument(
+          "Transpose cannot be applied, " + node_def.name());
+    }
+  }
+
+  // prepare weights
   TRT_ShapedWeights shift_weights(weights.type_);
   TRT_ShapedWeights scale_weights(weights.type_);
   TRT_ShapedWeights power_weights(weights.type_);
@@ -779,90 +895,29 @@ tensorflow::Status BinaryTensorOpWeight(
                                                scale_weights, power_weights);
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  // transpose back dimension
+  if (permutation_flag) {
+    output_tensor = ctx.TransposeTensor(output_tensor, permutation);
+  }
 
   // Pass the output
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
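The permutation_flag path exists because TF broadcasts a rank-1 weight along the trailing axis, while TRT's kCHANNEL scale mode applies weights along the channel (first non-batch) axis. The transpose-scale-transpose trick, checked in numpy with illustrative shapes:

    import numpy as np

    t = np.random.rand(16, 32, 32)  # CHW tensor (TRT batch dim is implicit)
    w = np.random.rand(32)          # TF broadcasts this along the last axis (W)

    ref = t + w                     # TensorFlow semantics

    # Swap W into the channel slot, apply channel-wise, swap back.
    swapped = t.transpose(2, 1, 0)                        # W,H,C
    out = (swapped + w[:, None, None]).transpose(2, 1, 0)
    print(np.allclose(ref, out))    # True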
-tensorflow::Status BinaryTensorOpTensor(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
-      {"Add", nvinfer1::ElementWiseOperation::kSUM},
-      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
-      // {"max", nvinfer1::ElementWiseOperation::kMAX},
-      // {"min", nvinfer1::ElementWiseOperation::kMIN},
-      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
-      {"Div", nvinfer1::ElementWiseOperation::kDIV},
-  };
-
-  // FIXME assume type matches input weights
-  // Get trt type & shape
-  TFAttrs attrs(node_def);
-  // Maybe this part has to be moved into the block of rsqrt later
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
 
-  // Check type consistency
-  CHECK_EQ_TYPE(tensor_l->getType(), dtype);
-  CHECK_EQ_TYPE(tensor_r->getType(), dtype);
-  auto op_pair = ops.find(node_def.op());
-  if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
-                                             " not supported at: " +
-                                             node_def.name());
-
-  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
-      *const_cast<nvinfer1::ITensor*>(tensor_l),
-      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
-
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-
-  // Pass the output
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertPlaceholder(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
+tensorflow::Status ConvertConv2DHelper(
+    Converter& ctx, tensorflow::NodeDef const& node_def,
     std::vector<TRT_TensorOrWeights> const& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  VLOG(2) << "Placeholder should have been replace already";
-  return tensorflow::errors::Unimplemented(", cannot convert Placeholder op");
-  // OK this make sense since we are supposed to replace it with input
-  TFAttrs attrs(node_def);
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
-  nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape");
-
-  dims.nbDims--;
-  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1];
-
-  nvinfer1::ITensor* output =
-      ctx.network()->addInput(node_def.name().c_str(), dtype, dims);
-  if (!output) {
-    return tensorflow::errors::InvalidArgument("Failed to create Input layer");
-  }
-  outputs->push_back(TRT_TensorOrWeights(output));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertConv2D(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
+    std::vector<TRT_TensorOrWeights>* outputs,
+    int group  // group == 0 specifies depthwise conv
+) {
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
 
-  // TODO(jie): handle NHWC/NCHW transpose;
-  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
-  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
-  ReorderRSCKToKCRS(weights_rsck, &weights);
-  TRT_ShapedWeights biases(weights.type_);
-  int noutput = weights.shape_.d[0];
-  nvinfer1::DimsHW kernel_size;
-  kernel_size.h() = weights.shape_.d[2];
-  kernel_size.w() = weights.shape_.d[3];
+  TFAttrs attrs(node_def);
 
+  int c_index = 1;
   int h_index = 2;
   int w_index = 3;
   auto data_format = attrs.get<string>("data_format");
@@ -871,14 +926,35 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
                                {0, 3, 1, 2});
     h_index = 1;
     w_index = 2;
+    c_index = 3;
     // TODO(jie): transpose it
   }
 
+  // tensor after transpose (NCHW)
+  auto tensor_dim = tensor->getDimensions();
+
+  int nbGroups = group;
+  if (nbGroups == 0)  // depthwise convolution
+    nbGroups = tensor_dim.d[0];
+  VLOG(2) << "groups count: " << nbGroups;
+
+  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
+  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
+  ReorderRSCKToKCRS(weights_rsck, &weights, nbGroups);
+  TRT_ShapedWeights biases(weights.type_);
+  int noutput = weights.shape_.d[0] * nbGroups;
+  nvinfer1::DimsHW kernel_size;
+  kernel_size.h() = weights.shape_.d[2];
+  kernel_size.w() = weights.shape_.d[3];
+  VLOG(2) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w();
+
   // TODO(jie): stride. (NHWC/NCHW)
   auto tf_stride = attrs.get<std::vector<int>>("strides");
+  VLOG(2) << "h_index: " << h_index << ", w_index: " << w_index;
+  VLOG(2) << "stride: " << tf_stride[0] << tf_stride[1] << tf_stride[2]
+          << tf_stride[3];
   nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
-  auto tensor_dim = tensor->getDimensions();
   std::vector<std::pair<int, int>> padding;
   // TODO(jie): padding.
   if (attrs.get<string>("padding") == "SAME") {
@@ -919,6 +995,7 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   layer->setStride(stride);
   layer->setPadding({padding[0].first, padding[1].first});
   layer->setName(node_def.name().c_str());
+  layer->setNbGroups(nbGroups);
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   auto dim_after = output_tensor->getDimensions();
@@ -935,6 +1012,99 @@ tensorflow::Status ConvertConv2D(Converter& ctx,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertConv2DHelper(
+    Converter& ctx, tensorflow::NodeDef const& node_def,
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs, ConvolutionType type) {
+  switch (type) {
+    case ConvolutionType::DEFAULT:
+      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 1);
+    case ConvolutionType::DEPTHWISE_CONV:
+      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 0);
+  }
+  return tensorflow::errors::Unimplemented("unsupported convolution type at, " +
+                                           node_def.name());
+}
+
+tensorflow::Status BinaryTensorOpTensor(
+    Converter& ctx, tensorflow::NodeDef const& node_def,
+    const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
+      {"Add", nvinfer1::ElementWiseOperation::kSUM},
+      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
+      // {"max", nvinfer1::ElementWiseOperation::kMAX},
+      // {"min", nvinfer1::ElementWiseOperation::kMIN},
+      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
+      {"Div", nvinfer1::ElementWiseOperation::kDIV},
+  };
+
+  // FIXME assume type matches input weights
+  // get trt type & shape
+  TFAttrs attrs(node_def);
+  // maybe this part has to be moved into the block of rsqrt later
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+
+  // check type consistency
+  CHECK_EQ_TYPE(tensor_l->getType(), dtype);
+  CHECK_EQ_TYPE(tensor_r->getType(), dtype);
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end())
+    return tensorflow::errors::Unimplemented(
+        "binary op: " + node_def.op() +
+        " not supported at: " + node_def.name());
+
+  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
+      *const_cast<nvinfer1::ITensor*>(tensor_l),
+      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  // pass the output
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertPlaceholder(
+    Converter& ctx, tensorflow::NodeDef const& node_def,
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  VLOG(2) << "Placeholder should have been replaced already";
+  return tensorflow::errors::Unimplemented("cannot convert Placeholder op");
+  // OK this makes sense since we are supposed to replace it with input
+  TFAttrs attrs(node_def);
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
+  nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape");
+
+  dims.nbDims--;
+  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1];
+
+  nvinfer1::ITensor* output =
+      ctx.network()->addInput(node_def.name().c_str(), dtype, dims);
+  if (!output) {
+    return tensorflow::errors::InvalidArgument("Failed to create Input layer");
+  }
+  outputs->push_back(TRT_TensorOrWeights(output));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertConv2D(Converter& ctx,
+                                 tensorflow::NodeDef const& node_def,
+                                 std::vector<TRT_TensorOrWeights> const& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
+                             ConvolutionType::DEFAULT);
+}
+
+tensorflow::Status ConvertConv2DDepthwise(
+    Converter& ctx, tensorflow::NodeDef const& node_def,
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
+                             ConvolutionType::DEPTHWISE_CONV);
+}
+
 tensorflow::Status ConvertPool(Converter& ctx,
                                const tensorflow::NodeDef& node_def,
                                std::vector<TRT_TensorOrWeights> const& inputs,
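For the depthwise path (group == 0 above), nbGroups is taken from the input channel count, so a DepthwiseConv2dNative with channel multiplier M becomes a grouped convolution with C groups and C*M total output channels. The shape bookkeeping, spelled out with invented sizes:

    R, S, C, M = 3, 3, 8, 2     # kernel h/w, in-channels, depth multiplier
    nbGroups = C                # group == 0 -> one group per input channel

    c = C // nbGroups           # 1: channels seen by each group's filters
    k = M * nbGroups            # 16: total output channels (noutput above)
    print(c, k, k // nbGroups)  # 1 16 2 -> oweights dims [k/G, c*G, R, S]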
@@ -957,6 +1127,8 @@ tensorflow::Status ConvertPool(Converter& ctx,
   // TODO(jie): support other pooling type
   if (node_def.op() == "MaxPool")
     type = nvinfer1::PoolingType::kMAX;
+  else if (node_def.op() == "AvgPool")
+    type = nvinfer1::PoolingType::kAVERAGE;
   else
     return tensorflow::errors::Unimplemented("Only supports Max pool");
@@ -1055,9 +1227,26 @@ tensorflow::Status ConvertScale(Converter& ctx,
   } else {
     VLOG(2) << "NCHW !!!!";
   }
-  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
-      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
-      weights, empty_weights, empty_weights);
+
+  auto dims = tensor->getDimensions();
+  VLOG(2) << "tensor dimensions: " << dims.nbDims;
+  for (int i = 0; i < dims.nbDims; i++) {
+    VLOG(2) << "i: " << dims.d[i];
+  }
+  dims = weights.shape_;
+  VLOG(2) << "weights dimensions: " << dims.nbDims;
+  for (int i = 0; i < dims.nbDims; i++) {
+    VLOG(2) << "i: " << dims.d[i];
+  }
+
+  nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
+  if (weights.shape_.d[0] == 1) {
+    mode = nvinfer1::ScaleMode::kUNIFORM;
+  }
+
+  nvinfer1::IScaleLayer* layer =
+      ctx.network()->addScale(*const_cast<nvinfer1::ITensor*>(tensor), mode,
+                              weights, empty_weights, empty_weights);
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   if (data_format == "NHWC") {
@@ -1091,21 +1280,73 @@ tensorflow::Status ConvertConst(Converter& ctx,
     VLOG(2) << "SCALAR!!!" << node_def.name();
     nvinfer1::Dims scalar_shape;
     if (tensor.dims() > 0) {
-      VLOG(2) << "Dimensions: " << tensor.dims();
-      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
-                                  GetTensorShape(tensor));
+      VLOG(2) << "dimensions: " << tensor.dims();
+      VLOG(2) << "size: " << weights_tensor.float_val_size();
+      scalar_shape = GetTensorShape(tensor);
+      for (int i = 0; i < scalar_shape.nbDims; i++)
+        VLOG(2) << scalar_shape.d[i];
+      if (GetShapeSize(scalar_shape) != weights_tensor.float_val_size()) {
+        if (weights_tensor.float_val_size() == 1 ||
+            scalar_shape.d[0] == weights_tensor.float_val_size()) {
+          scalar_shape.nbDims = 1;
+          // no dimension provided. flatten it
+          scalar_shape.d[0] = weights_tensor.float_val_size();
+          scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
+        } else {
+          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
+                     << " kUNIFORM, at: " << node_def.name();
+        }
+      }
     } else {
       VLOG(2) << "Dimensions: " << tensor.dims();
       scalar_shape.nbDims = 1;
-      scalar_shape.d[0] = 1;
+      // no dimension provided. flatten it
+      scalar_shape.d[0] = weights_tensor.float_val_size();
       scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
       for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
         scalar_shape.d[i] = 0;
         scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
       }
-      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
-                                  scalar_shape);
     }
+    weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
+                                scalar_shape);
+    // LOG(INFO) << " add: " << weights_tensor.float_val().data();
+    // LOG(INFO) << " value: " << (*weights_tensor.float_val().data());
+    // weights = ctx.get_temp_weights(dtype, scalar_shape);
+    // std::memcpy(const_cast<void*>(weights.values),
+    //             weights_tensor.float_val().data(), weights.size_bytes());
+  } else if (!weights_tensor.int_val().empty()) {
+    VLOG(2) << "int!!!" << node_def.name();
+    nvinfer1::Dims scalar_shape;
+    if (tensor.dims() > 0) {
+      VLOG(2) << "dimensions: " << tensor.dims();
+      scalar_shape = GetTensorShape(tensor);
+      if (GetShapeSize(scalar_shape) != weights_tensor.int_val_size()) {
+        if (weights_tensor.int_val_size() == 1 ||
+            scalar_shape.d[0] == weights_tensor.int_val_size()) {
+          scalar_shape.nbDims = 1;
+          // no dimension provided. flatten it
+          scalar_shape.d[0] = weights_tensor.int_val_size();
+          scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
+        } else {
+          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
+                     << " kUNIFORM, at: " << node_def.name();
+        }
+      }
+    } else {
+      VLOG(2) << "dimensions: " << tensor.dims();
+      scalar_shape.nbDims = 1;
+      // no dimension provided. flatten it
+      scalar_shape.d[0] = weights_tensor.int_val_size();
+      scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
+      for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
+        scalar_shape.d[i] = 0;
+        scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
+      }
+    }
+    weights =
+        TRT_ShapedWeights(dtype, weights_tensor.int_val().data(), scalar_shape);
   } else if (!weights_tensor.tensor_content().empty()) {
     VLOG(2) << "TENSOR!!!" << node_def.name();
     const auto& content = weights_tensor.tensor_content();
@@ -1229,6 +1470,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
       return tensorflow::errors::InvalidArgument("TRT cannot reduce at 0, at" +
                                                  node_def.name());
     if (index_list_data[i] == 1) permuted_index = 1;
+
     idx_set.emplace(index_list_data[i]);
   }
@@ -1236,7 +1478,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
   nvinfer1::DimsHW pool_kernel;
   if (permuted_index == 1) {
     for (int i = 2; i < nb_dims; i++) {
-      if (idx_set.count(i)) {
+      if (idx_set.count(i) == 0) {
         permuted_index = i;
         break;
       }
@@ -1271,6 +1513,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
     output_tensor = ctx.TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), permutation_order);
   }
+
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
@@ -1371,19 +1614,206 @@ tensorflow::Status ConvertPad(Converter& ctx,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertConcat(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  // not including the last input (axis) here
+  int input_size = static_cast<int>(inputs.size()) - 1;
+
+  if (!inputs.at(0).is_tensor())
+    return tensorflow::errors::InvalidArgument(
+        "Concat in TRT supports only Tensor input, at " + node_def.name());
+
+  // We are retrieving the axis
+  TRT_ShapedWeights axis = inputs.at(input_size).weights();
+
+  TFAttrs attrs(node_def);
+  // auto attr_size = attrs.at("N")->i();
+  // auto data_type = attrs.get<nvinfer1::DataType>("T");
+  auto index_type = attrs.get<tensorflow::DataType>("Tidx");
+
+  // TODO(jie): handle data type
+  // Only expect to handle INT32 as index attributes for now
+  if (index_type != tensorflow::DataType::DT_INT32)
+    return tensorflow::errors::Unimplemented(
+        "Tidx supports only DT_INT32, at " + node_def.name());
+
+  int index = *(static_cast<int*>(const_cast<void*>(axis.GetValues())));
+
+  // TODO(jie): early termination with no-op (attr_size == 1)
+
+  auto dim = inputs.at(0).tensor()->getDimensions();
+  // dimension check
+  if (index > dim.nbDims + 1)
+    return tensorflow::errors::InvalidArgument(
+        "Concatenate on axis out of dimension range, at " + node_def.name());
+
+  if (index == 0)
+    return tensorflow::errors::InvalidArgument(
+        "Concatenate on batch dimension not supported, at " + node_def.name());
+
+  // in case we need permutation
+  std::vector<int> permutation_order(dim.nbDims + 1);
+
+  for (int i = 0; i < dim.nbDims + 1; i++) permutation_order[i] = i;
+
+  if (index != 1) {
+    permutation_order[1] = index - 1;
+    permutation_order[index - 1] = 1;
+  }
+
+  std::vector<nvinfer1::ITensor const*> inputs_vec;
+  // Shape check (all input tensors should have the same shape)
+  // starting from 0 since we are probably also doing transpose here;
+  for (int i = 0; i < input_size; i++) {
+    auto tensor_i = inputs.at(i).tensor();
+    auto dim_i = tensor_i->getDimensions();
+    if (dim_i.nbDims != dim.nbDims)
+      return tensorflow::errors::InvalidArgument(
+          "Concatenate receives inputs with inconsistent dimensions, at " +
+          node_def.name());
+
+    for (int j = 0; j < dim.nbDims; j++) {
+      // check dimension consistency on non-concatenate axis
+      if (j != index - 1 && dim_i.d[j] != dim.d[j])
+        return tensorflow::errors::InvalidArgument(
+            "Concatenate receives inputs with inconsistent shape, at " +
+            node_def.name());
+    }
+
+    // TRT does concatenation only on channel!
+    if (index != 1)
+      tensor_i = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor_i),
+                                     permutation_order);
+
+    inputs_vec.push_back(tensor_i);
+  }
+
+  // nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  nvinfer1::IConcatenationLayer* layer = ctx.network()->addConcatenation(
+      const_cast<nvinfer1::ITensor* const*>(inputs_vec.data()),
+      inputs_vec.size());
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  if (index != 1) {
+    output_tensor = ctx.TransposeTensor(output_tensor, permutation_order);
+  }
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
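Because TRT only concatenates on the channel axis, ConvertConcat transposes the requested axis into the channel slot, concatenates, and transposes back. The equivalence in numpy (illustrative shapes):

    import numpy as np

    a = np.random.rand(4, 6, 5)   # C,H,W with implicit batch; concat on W
    b = np.random.rand(4, 6, 7)

    ref = np.concatenate([a, b], axis=2)

    perm = (2, 1, 0)              # swap the concat axis into position 0
    out = np.concatenate([a.transpose(perm), b.transpose(perm)],
                         axis=0).transpose(perm)
    print(np.array_equal(ref, out))  # True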
+
+tensorflow::Status ConvertMatMul(Converter& ctx,
+                                 tensorflow::NodeDef const& node_def,
+                                 std::vector<TRT_TensorOrWeights> const& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+
+  // TODO(jie): transpose!
+  TFAttrs attrs(node_def);
+  // bool transpose_w = bool(attrs->at("transpose_b")->i());
+
+  // tensor after transpose (NCHW)
+  auto tensor_dim = tensor->getDimensions();
+
+  TRT_ShapedWeights weights_ck = inputs.at(1).weights();
+  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_ck);
+  reorder_ck_to_kc(weights_ck, &weights);
+  TRT_ShapedWeights biases(weights.type_);
+
+  int noutput = weights.shape_.d[0];
+
+  nvinfer1::IFullyConnectedLayer* layer = ctx.network()->addFullyConnected(
+      *const_cast<nvinfer1::ITensor*>(tensor), noutput, weights, biases);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertReshape(
+    Converter& ctx, tensorflow::NodeDef const& node_def,
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights())
+    return tensorflow::errors::InvalidArgument(
+        "Input expects tensor and weights, at " + node_def.name());
+
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  auto dims = tensor->getDimensions();
+  // restore implicit batch dimension
+  int nbDims = dims.nbDims + 1;
+
+  TRT_ShapedWeights shape = inputs.at(1).weights();
+
+  TFAttrs attrs(node_def);
+
+  auto padding_type = attrs.get<tensorflow::DataType>("Tshape");
+
+  if (shape.shape_.nbDims != 1)
+    return tensorflow::errors::InvalidArgument(
+        "reshape new shape is not 1 dimensional, at " + node_def.name());
+
+  // Only expect to handle INT32 as attributes for now
+  if (padding_type != tensorflow::DataType::DT_INT32)
+    return tensorflow::errors::Unimplemented(
+        "reshape new shape supports only DT_INT32, at " + node_def.name());
+
+  auto shape_data = static_cast<int*>(const_cast<void*>(shape.GetValues()));
+
+  if (shape_data[0] != -1)
+    return tensorflow::errors::InvalidArgument(
+        "reshape new shape first dimension is not -1, at " + node_def.name());
+
+  auto shape_num_dims = shape.shape_.d[0];
+  VLOG(2) << "shape dimensions: " << shape_num_dims;
+  int volume_w = 1;
+  for (int i = 1; i < shape.shape_.d[0]; i++) volume_w *= shape_data[i];
+
+  int volume_t = 1;
+  for (int i = 0; i < dims.nbDims; i++) volume_t *= dims.d[i];
+
+  VLOG(2) << "volume: " << volume_t << " volume weights: " << volume_w;
+  if (volume_w != volume_t)
+    return tensorflow::errors::InvalidArgument(
+        "volume does not agree between tensor and new shape, at " +
+        node_def.name());
+
+  nvinfer1::IShuffleLayer* layer =
+      ctx.network()->addShuffle(*const_cast<nvinfer1::ITensor*>(tensor));
+
+  nvinfer1::Dims reshapeDims;
+  VLOG(2) << "new dimension: " << shape_num_dims - 1;
+  reshapeDims.nbDims = shape_num_dims - 1;
+  for (int32_t i = 0; i < reshapeDims.nbDims; ++i) {
+    reshapeDims.d[i] = shape_data[i + 1];
+  }
+  layer->setReshapeDimensions(reshapeDims);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  auto dims_output = output_tensor->getDimensions();
+  VLOG(2) << "output tensor dimension: " << dims_output.nbDims;
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
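ConvertReshape only accepts a new shape whose batch entry is -1 and whose remaining volume matches the tensor's non-batch volume, because the TRT network carries an implicit batch dimension. The check reduces to:

    import numpy as np

    tensor_dims = (8, 14, 14)        # TRT dims, batch excluded
    new_shape = (-1, 8 * 14 * 14)    # what the TF Reshape node requested

    volume_t = int(np.prod(tensor_dims))
    volume_w = int(np.prod(new_shape[1:]))  # skip the -1 batch entry
    assert new_shape[0] == -1 and volume_w == volume_t
    # the IShuffleLayer then gets reshapeDims == new_shape[1:], i.e. (1568,)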
ctx.network()->addShuffle(*const_cast<nvinfer1::ITensor*>(tensor)); + + nvinfer1::Dims reshapeDims; + VLOG(2) << "new dimension: " << shape_num_dims - 1; + reshapeDims.nbDims = shape_num_dims - 1; + for (int32_t i = 0; i < reshapeDims.nbDims; ++i) { + reshapeDims.d[i] = shape_data[i + 1]; + } + layer->setReshapeDimensions(reshapeDims); + VLOG(2) << "new dimension: " << shape_num_dims - 1; + + nvinfer1::ITensor* output_tensor = layer->getOutput(0); + auto dims_output = output_tensor->getDimensions(); + VLOG(2) << "output tensor dimension:" << dims_output.nbDims; + outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return tensorflow::Status::OK(); +} + void Converter::register_op_converters() { // vgg_16 slim implementation op_registry_["Placeholder"] = ConvertPlaceholder; op_registry_["Conv2D"] = ConvertConv2D; + op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise; op_registry_["Relu"] = ConvertActivation; op_registry_["MaxPool"] = ConvertPool; + op_registry_["AvgPool"] = ConvertPool; // This could be really handled as ConvertBinary op_registry_["BiasAdd"] = ConvertScale; op_registry_["Const"] = ConvertConst; - // op_registry_["MatMul"] = ConvertFullyConnected; // Not used in vgg + // op_registry_["MatMul"] = ConvertFullyConnected; // not used in vgg // TODO(ben,jie): this is a temp hack. op_registry_["Identity"] = ConvertIdentity; // Identity should be removed - // op_registry_["AvgPool"] = ConvertPool; // resnet_50_v1 slim implementation op_registry_["Add"] = ConvertBinary; @@ -1393,26 +1823,355 @@ void Converter::register_op_converters() { op_registry_["Mean"] = ConvertReduce; op_registry_["Pad"] = ConvertPad; // TODO(ben,jie): Add more ops + + op_registry_["ConcatV2"] = ConvertConcat; + op_registry_["MatMul"] = ConvertMatMul; + op_registry_["Reshape"] = ConvertReshape; } } // namespace +tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) { + return tensorflow::errors::Unimplemented("Not implemented yet"); +} +tensorflow::Status ConvertCalibrationNodeToEngineNode(tensorflow::Graph &graph, + tensorflow::Node *c_node) { + const auto ndef=c_node->def(); + + TFAttrs attrs(ndef); + std::vector<string> segment_nodes(attrs.get<std::vector<string>>("segment_nodes")); + std::vector<string> output_nodes(attrs.get<std::vector<string>>("segment_output_names")); + std::vector<string> input_names(attrs.get<std::vector<string>>("input_names")); + string res_name = attrs.get<string>("resource_name"); + VLOG(1) << "Node name " << c_node->name() << " res_name " << res_name; + string engine_name="my_trt_op"; + { + const auto node_id=tensorflow::str_util::Split(res_name,"_"); + engine_name+=node_id.back(); + } + std::map<string,tensorflow::Node*> nodeMaps; + + for(auto n: graph.op_nodes()){ + nodeMaps.insert({n->name(),n}); + } + VLOG(1)<<"Output Nodes:"; + std::vector<tensorflow::DataType> out_types; + std::vector<const tensorflow::Edge*> out_edges; + for(auto &i : output_nodes ){ + auto node_port=tensorflow::str_util::Split(i,":"); + VLOG(1) << " " << i << " in graph " << nodeMaps.count(i); + auto out_node_name = node_port.at(0); + if(node_port.size()>1){ + VLOG(1) << "Multi port output" << node_port.at(0) << + " " << node_port.at(1) << " size=" << node_port.size(); + } + auto nodeIt=nodeMaps.find(out_node_name); + if(nodeIt!=nodeMaps.end()){ + tensorflow::Node* outNode=nodeIt->second; + int port=0; + if(node_port.size()==2){ + port=std::strtoul(node_port.at(1).c_str(),nullptr,10); + out_types.push_back(outNode->output_type(port)); + }else{ + 
out_types.push_back(outNode->output_type(0));
+      }
+      for (auto outEdge : outNode->out_edges()) {
+        if (outEdge->src_output() == port) {
+          out_edges.push_back(outEdge);
+          break;
+        }
+      }
+    } else {
+      LOG(WARNING) << " couldn't find output node " << out_node_name;
+    }
+  }
+  VLOG(1) << "Input Nodes:";
+  for (auto& i : input_names) {
+    VLOG(1) << " " << i << " in graph " << nodeMaps.count(i);
+  }
+  auto trt_rm = tensorflow::trt::TRTResourceManager::instance();
+  auto resmgr = trt_rm->getManager("TRTCalibOps");
+  tensorflow::trt::TRTCalibrationResource* calibRes = nullptr;
+  auto status = resmgr->Lookup(res_name, res_name, &calibRes);
+  if (!status.ok() || !calibRes->calibrator) {
+    return tensorflow::errors::FailedPrecondition(
+        "You must run calibration and inference conversion in the same "
+        "process");
+  }
+
+  calibRes->calibrator->setDone();
+  VLOG(1) << "Waiting for calibration thread to join";
+  calibRes->thr->join();
+  delete calibRes->thr;
+  if (!calibRes->engine) {
+    LOG(FATAL) << "Calibration failed, engine is nullptr";
+  }
+  // Serialize the engine plan into a string attribute before destroying it.
+  auto engine_plan = calibRes->engine->serialize();
+  string engine_plan_string(
+      static_cast<const char*>(engine_plan->data()),
+      static_cast<const char*>(engine_plan->data()) + engine_plan->size());
+  engine_plan->destroy();
+  calibRes->engine->destroy();
+  calibRes->network->destroy();
+  calibRes->builder->destroy();
+  calibRes->thr = nullptr;
+  calibRes->engine = nullptr;
+  calibRes->builder = nullptr;
+  tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
+  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
+  for (const auto in_edge : c_node->in_edges()) {
+    auto src = in_edge->src();
+    int dest_port = in_edge->dst_input();
+    income_edges.emplace_back(src->name(), in_edge->src_output(),
+                              c_node->input_type(dest_port));
+  }
+  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
+      income_edges);
+  op_builder.Input(input_list);
+  tensorflow::NodeDef engine_node;
+  status = op_builder.Attr("serialized_engine", engine_plan_string)
+               .Attr("input_nodes", input_names)
+               .Attr("output_nodes", output_nodes)
+               .Attr("OutT", out_types)
+               .Finalize(&engine_node);
+  if (!status.ok()) {
+    LOG(ERROR) << "Engine node creation failed";
+    return status;
+  }
+  auto trt_engine_node = graph.AddNode(engine_node, &status);
+  TF_CHECK_OK(status);
+  for (size_t i = 0; i < out_edges.size(); i++) {
+    VLOG(1) << "Connecting trt_engine_node output " << i << " with "
+            << out_edges.at(i)->dst()->name() << " port "
+            << out_edges.at(i)->dst_input();
+    TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i,
+                                        out_edges.at(i)->dst(),
+                                        out_edges.at(i)->dst_input()));
+  }
+  VLOG(1) << "Segment nodes:";
+  for (auto& i : segment_nodes) {
+    VLOG(1) << " " << i << " in graph " << nodeMaps.count(i);
+    auto it = nodeMaps.find(i);
+    if (it != nodeMaps.end()) {
+      graph.RemoveNode(it->second);
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
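For reference, the output specs split above follow TensorFlow's usual "op_name:port" convention. A small Python sketch of the same rule, with a bare op name implying port 0 (the function name is illustrative only):

    def parse_tensor_spec(spec):
        # Split a TensorFlow tensor spec "op_name:port" into (op_name, port);
        # a bare op name implies port 0, matching the splitting logic above.
        parts = spec.split(":")
        if len(parts) == 1:
            return parts[0], 0
        return parts[0], int(parts[1])

    assert parse_tensor_spec("conv1") == ("conv1", 0)
    assert parse_tensor_spec("block1/concat:2") == ("block1/concat", 2)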
+tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
+  // Visit nodes in reverse topological order and construct the TRT network.
+
+  // Toposort.
+  std::vector<tensorflow::Node*> order_vec;
+  tensorflow::GetPostOrder(s.graph, &order_vec);
+  // Select just the subgraph.
+  std::list<tensorflow::Node*> order;
+  for (tensorflow::Node* node : order_vec) {
+    if (s.subgraph_node_ids.count(node->id())) {
+      order.push_front(node);  // we want topological order to construct the
+                               // network layer by layer
+    }
+  }
+  // Topological order is needed to build the TRT network.
+  static int static_id = 0;
+  string calib_op_name =
+      tensorflow::strings::StrCat("my_trt_calib_op_", static_id);
+  string engine_name = tensorflow::strings::StrCat("my_trt_op", static_id);
+
+  static_id++;
+  auto trt_rmgr = tensorflow::trt::TRTResourceManager::instance();
+  auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
+  auto op_res = new tensorflow::trt::TRTCalibrationResource();
+  VLOG(1) << "Creating calibration resource " << calib_op_name << " @ "
+          << op_res;
+  TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
+  op_res->logger = new tensorflow::tensorrt::Logger();
+  op_res->builder = nvinfer1::createInferBuilder(*(op_res->logger));
+
+  if (!op_res->builder) {
+    return tensorflow::errors::Internal(
+        "failed to create TensorRT builder object");
+  }
+
+  op_res->network = op_res->builder->createNetwork();
+  if (!op_res->network) {
+    return tensorflow::errors::Internal(
+        "failed to create TensorRT network object");
+  }
+
+  // Build the network.
+  Converter converter(op_res->network);
+
+  std::vector<string> input_names;
+  std::vector<tensorflow::DataType> input_dtypes;
+  for (std::pair<int, int> const& input : s.input_inds) {
+    VLOG(2) << "parsing input";
+    int node_id = input.first;
+    int output_idx = input.second;
+    tensorflow::Node* node = s.graph.FindNodeId(node_id);
+    auto node_name = node->name();
+    input_names.push_back(node_name);  // insert original node name without port
+    // TODO(jie): alternative :)
+    // tensorflow::DataType tf_dtype = node->output_type(output_idx);
+    if (!s.graph_properties.HasOutputProperties(node_name))
+      return tensorflow::errors::Internal("failed to find input node: " +
+                                          node_name);
+
+    auto op_info_vec = s.graph_properties.GetOutputProperties(node_name);
+    if (static_cast<int>(op_info_vec.size()) <= output_idx)
+      return tensorflow::errors::Internal(
+          "accessing output index of: " + std::to_string(output_idx) +
+          ", at node: " + node_name + " with output entry from shape_map: " +
+          std::to_string(op_info_vec.size()));
+
+    auto op_info = op_info_vec.at(output_idx);
+
+    tensorflow::DataType tf_dtype = op_info.dtype();
+    input_dtypes.push_back(tf_dtype);
+
+    nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
+    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
+
+    VLOG(2) << "accessing output index of: " << std::to_string(output_idx)
+            << ", at node: " << node_name
+            << " with output entry from shape_map: "
+            << std::to_string(op_info_vec.size());
+
+    // TODO(ben,jie): update TRT input format/dimension
+    nvinfer1::DimsCHW input_dim_psuedo_chw;
+    for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
+
+    for (int i = 1; i < op_info.shape().dim_size(); i++) {
+      VLOG(2) << "dimension: " << i
+              << " , size: " << op_info.shape().dim(i).size();
+      input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
+    }
+
+    // TODO(ben,jie): proper way to restore input tensor name?
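The dimension handling above drops the implicit batch dimension and packs the remaining sizes into TRT's 3-element CHW dims, padding with 1s. A sketch of that mapping (the function name is hypothetical):

    def to_pseudo_chw(tf_shape):
        # Drop the implicit batch dimension of a TF shape [N, d1, d2, d3] and
        # pack the rest into TRT's 3-element CHW dims, padding with 1s.
        chw = [1, 1, 1]
        for i, size in enumerate(tf_shape[1:]):
            chw[i] = size
        return chw

    assert to_pseudo_chw([8, 3, 224, 224]) == [3, 224, 224]
    assert to_pseudo_chw([8, 1000]) == [1000, 1, 1]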
+ auto input_tensor_name = node_name; + if (output_idx != 0) + input_tensor_name = node_name + ":" + std::to_string(output_idx); + + nvinfer1::ITensor* input_tensor = converter.network()->addInput( + input_tensor_name.c_str(), dtype, input_dim_psuedo_chw); + + if (!input_tensor) + return tensorflow::errors::InvalidArgument( + "Failed to create Input layer"); + VLOG(2) << "input tensor name :" << input_tensor_name; + + if (!converter.insert_input_tensor(input_tensor_name, input_tensor)) + return tensorflow::errors::AlreadyExists( + "output tensor already exists for op: " + input_tensor_name); + } + + VLOG(2) << "finished sorting"; + + for (const tensorflow::Node* node : order) { + tensorflow::NodeDef const& node_def = node->def(); + VLOG(2) << "converting node: " << node_def.name() << " , " + << node_def.op(); + TF_RETURN_IF_ERROR(converter.convert_node(node_def)); + } + + VLOG(2) << "finished conversion"; + + // Gather output metadata + std::vector<string> output_names; + std::vector<tensorflow::DataType> output_dtypes; + int trt_engine_op_output_idx = 0; + for (std::pair<int, int> const& output : s.output_inds) { + int node_id = output.first; + int output_idx = output.second; + tensorflow::Node* node = s.graph.FindNodeId(node_id); + string op_name = node->name(); + string tensor_name = op_name; + + s.output_edge_map->insert( + {trt_engine_op_output_idx == 0 + ? engine_name + : engine_name + ":" + std::to_string(trt_engine_op_output_idx), + {output_idx, tensor_name}}); + trt_engine_op_output_idx++; + if (output_idx != 0) + tensor_name = tensor_name + ":" + std::to_string(output_idx); + VLOG(1) << "output tensor name: " << tensor_name; + output_names.push_back(tensor_name); + auto tensor_or_weights = converter.get_tensor(tensor_name); + if (!tensor_or_weights.is_tensor()) { + return tensorflow::errors::InvalidArgument( + "Output node is weights not tensor"); + } + nvinfer1::ITensor* tensor = tensor_or_weights.tensor(); + if (!tensor) { + return tensorflow::errors::NotFound("Output tensor not found: " + + tensor_name); + } + converter.network()->markOutput(*tensor); + tensorflow::DataType tf_dtype = node->output_type(output_idx); + output_dtypes.push_back(tf_dtype); + nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; + TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype)); + tensor->setType(trt_dtype); + } + + VLOG(2) << "finished output"; + + // Build the engine + op_res->builder->setMaxBatchSize(s.max_batch_size); + op_res->builder->setMaxWorkspaceSize(s.max_workspace_size_bytes); + + // Build the TRT op + // TODO(sami,ben,jie): proper naming! 
+ tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp"); + std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges; + for (size_t i = 0; i < input_names.size(); ++i) { + int output_idx = s.input_inds.at(i).second; + // we wired up the input here already, it is redundant to do it again in + // ConvertSubGraphToTensorRT(convert_graph.cc) + auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut( + input_names.at(i), output_idx, input_dtypes.at(i)); + VLOG(1) << calib_op_name << " input " << i << " = " << input_names.at(i) + << ":" << output_idx + <<" dType= "<< tensorflow::DataTypeString(input_dtypes.at(i)); + income_edges.push_back(incoming_edge); + } + tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list( + income_edges); + op_builder.Input(input_list); + std::vector<string> segment_names; + segment_names.reserve(s.subgraph_node_ids.size()); + for (int i : s.subgraph_node_ids) { + auto node = s.graph.FindNodeId(i); + segment_names.push_back(node->name()); + } + LOG(INFO) << "finished op preparation"; + + auto status = op_builder.Attr("segment_nodes", segment_names) + .Attr("input_names",input_names) + .Attr("segment_output_names", output_names) + .Attr("resource_name",calib_op_name) + .Finalize(s.trt_node); + + LOG(INFO) << status.ToString(); + LOG(INFO) << "finished op building"; + + return tensorflow::Status::OK(); +} tensorflow::Status ConvertSubGraphToTensorRTNodeDef( - const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids, - const std::vector<std::pair<int, int>>& input_inds, - const std::vector<std::pair<int, int>>& output_inds, size_t max_batch_size, - size_t max_workspace_size_bytes, - const tensorflow::grappler::GraphProperties& graph_properties, - tensorflow::NodeDef* trt_node) { + tensorrt::convert::SubGraphParams& s) { // Visit nodes in reverse topological order and construct the TRT network. 
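The long argument list of the conversion entry points is now carried by SubGraphParams (defined in convert_nodes.h further down). An illustrative Python analogue of that parameter object, with the same fields as the struct (the dataclass itself and its defaults are not part of the patch):

    from dataclasses import dataclass, field
    from typing import Dict, List, Set, Tuple

    @dataclass
    class SubGraphParamsSketch:
        # Same shape as the C++ struct: one object instead of eight arguments.
        subgraph_node_ids: Set[int]
        input_inds: List[Tuple[int, int]]    # (node_id, output_idx)
        output_inds: List[Tuple[int, int]]   # (node_id, output_idx)
        max_batch_size: int = 1
        max_workspace_size_bytes: int = 2 << 20
        precision_mode: int = 0              # 0=FP32, 1=FP16, 2=INT8
        output_edge_map: Dict[str, Tuple[int, str]] = field(default_factory=dict)

    params = SubGraphParamsSketch({1, 2, 3}, [(1, 0)], [(3, 0)])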
// Toposort std::vector<tensorflow::Node*> order_vec; - tensorflow::GetPostOrder(graph, &order_vec); + tensorflow::GetPostOrder(s.graph, &order_vec); // Select just the subgraph std::list<tensorflow::Node*> order; for (tensorflow::Node* node : order_vec) { - if (subgraph_node_ids.count(node->id())) { + if (s.subgraph_node_ids.count(node->id())) { // We want topological order to contstruct the // network layer by layer order.push_front(node); @@ -1439,26 +2198,46 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( std::vector<string> input_names; std::vector<tensorflow::DataType> input_dtypes; - for (std::pair<int, int> const& input : input_inds) { + for (const std::pair<int, int>& input : s.input_inds) { + VLOG(2) << "parsing input!!!!!"; int node_id = input.first; int output_idx = input.second; - tensorflow::Node* node = graph.FindNodeId(node_id); + tensorflow::Node* node = s.graph.FindNodeId(node_id); auto node_name = node->name(); - input_names.push_back(node_name); // Insert original node name without port - // TODO(jie): alternative :) - if (!graph_properties.HasOutputProperties(node_name)) - return tensorflow::errors::Internal("Failed to find input node: " + - node_name); + // input_names should use the node name in the graph + // here it should be the input tensor name -> matching the binding + // insert original node name without port + auto tensor_name = node_name; + if (output_idx != 0) + tensor_name = tensor_name + ":" + std::to_string(output_idx); - auto op_info_vec = graph_properties.GetOutputProperties(node_name); - if (static_cast<int>(op_info_vec.size()) < output_idx) + VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name + << " idx: " << output_idx; + + auto shape_inference_node_name = node_name; + auto shape_inference_output_idx = output_idx; + // rewire the shape inference to original node in the graph + if (s.output_edge_map->count(tensor_name)) { + shape_inference_node_name = s.output_edge_map->at(tensor_name).second; + shape_inference_output_idx = s.output_edge_map->at(tensor_name).first; + } + VLOG(2) << "shapeinference name: " << shape_inference_node_name + << " idx: " << shape_inference_output_idx; + + if (!s.graph_properties.HasOutputProperties(shape_inference_node_name)) + return tensorflow::errors::Internal("failed to find input node: " + + shape_inference_node_name); + + auto op_info_vec = + s.graph_properties.GetOutputProperties(shape_inference_node_name); + if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx) return tensorflow::errors::Internal( - "Accessing output index of: " + std::to_string(output_idx) + - ", at node: " + node_name + " with output entry from shape_map: " + + "accessing output index of: " + + std::to_string(shape_inference_output_idx) + ", at node: " + + shape_inference_node_name + " with output entry from shape_map: " + std::to_string(op_info_vec.size())); - auto op_info = op_info_vec.at(output_idx); - + auto op_info = op_info_vec.at(shape_inference_output_idx); tensorflow::DataType tf_dtype = op_info.dtype(); input_dtypes.push_back(tf_dtype); @@ -1469,14 +2248,18 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( << ", at node: " << node_name << " with output entry from shape_map: " << std::to_string(op_info_vec.size()); - // TODO(ben,jie): update TRT input format/dimension nvinfer1::DimsCHW input_dim_psuedo_chw; for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1; + // TODO(jie): TRT 3.x only support 4 dimensional input tensor. + // update the code once TRT 4.0 comes out. 
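The output_edge_map lookup above redirects shape inference from a previously created engine op back to the original node it replaced. A sketch of that resolution rule (the map contents are hypothetical):

    def shape_lookup_key(tensor_name, output_edge_map):
        # If the tensor is produced by an already-converted engine op, follow
        # output_edge_map back to the original node recorded during that
        # engine's construction; otherwise use the tensor spec itself.
        if tensor_name in output_edge_map:
            port, original_node = output_edge_map[tensor_name]
            return original_node, port
        name, _, port = tensor_name.partition(":")
        return name, int(port) if port else 0

    edge_map = {"my_trt_op0:1": (0, "resnet/block1/relu")}
    assert shape_lookup_key("my_trt_op0:1", edge_map) == ("resnet/block1/relu", 0)
    assert shape_lookup_key("conv2d:1", edge_map) == ("conv2d", 1)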
+ if (op_info.shape().dim_size() != 4) + return tensorflow::errors::Unimplemented("require 4 dimensional input"); + for (int i = 1; i < op_info.shape().dim_size(); i++) { VLOG(2) << "dimension: " << i - << " , size: " << op_info.shape().dim(i).size(); + << " , size: " << op_info.shape().dim(i).size(); input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size(); } @@ -1485,6 +2268,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( if (output_idx != 0) input_tensor_name = node_name + ":" + std::to_string(output_idx); + input_names.push_back(input_tensor_name); nvinfer1::ITensor* input_tensor = converter.network()->addInput( input_tensor_name.c_str(), dtype, input_dim_psuedo_chw); @@ -1508,17 +2292,29 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( VLOG(2) << "Finished conversion"; + // TODO(sami,ben,jie): proper naming! + static int static_id = 0; + string engine_name = tensorflow::strings::StrCat("my_trt_op", static_id++); + // Gather output metadata std::vector<string> output_names; std::vector<tensorflow::DataType> output_dtypes; - for (std::pair<int, int> const& output : output_inds) { + int trt_engine_op_output_idx = 0; + for (std::pair<int, int> const& output : s.output_inds) { int node_id = output.first; int output_idx = output.second; - tensorflow::Node* node = graph.FindNodeId(node_id); + tensorflow::Node* node = s.graph.FindNodeId(node_id); string op_name = node->name(); string tensor_name = op_name; + + s.output_edge_map->insert( + {trt_engine_op_output_idx == 0 + ? engine_name + : tensorflow::strings::StrCat(engine_name,":",trt_engine_op_output_idx), + {output_idx, tensor_name}}); + trt_engine_op_output_idx++; if (output_idx != 0) - tensor_name = tensor_name + ":" + std::to_string(output_idx); + tensorflow::strings::StrAppend(&tensor_name, ":" ,std::to_string(output_idx)); VLOG(2) << "Output tensor name: " << tensor_name; output_names.push_back(tensor_name); auto tensor_or_weights = converter.get_tensor(tensor_name); @@ -1541,12 +2337,15 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( VLOG(2) << "Finished output"; // TODO(jie): static_id is not thread safe. - static int static_id = 0; + // Build the engine - trt_builder->setMaxBatchSize(max_batch_size); - trt_builder->setMaxWorkspaceSize(max_workspace_size_bytes); - VLOG(0) << "Starting build engine " << static_id; + trt_builder->setMaxBatchSize(s.max_batch_size); + trt_builder->setMaxWorkspaceSize(s.max_workspace_size_bytes); + if(s.precision_mode==1){ + trt_builder->setHalf2Mode(true); + } + LOG(INFO) << "starting build engine"; // TODO(ben,jie): half2 and int8 mode support string engine_plan_string; { @@ -1561,17 +2360,18 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( string(engine_plan_data, engine_plan_data + engine_plan->size()); } - VLOG(0) << "Finished engine"; + LOG(INFO) << "finished engine " << engine_name; // Build the TRT op - // TODO(sami,ben,jie): proper naming! 
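The precision_mode dispatch added in the hunk above only wires up half2 (FP16) mode here; INT8 goes through the calibration path instead. A toy sketch of the dispatch (FakeBuilder is a hypothetical stand-in, not the TRT API):

    class FakeBuilder:
        # Stand-in for a TRT builder, recording which flags were set.
        def __init__(self):
            self.flags = {}
        def setHalf2Mode(self, on):
            self.flags["half2"] = on
        def setInt8Mode(self, on):
            self.flags["int8"] = on

    def apply_precision_mode(builder, mode):
        # Mirror of the dispatch above: 0 = FP32 (default), 1 = FP16 via half2
        # mode, 2 = INT8 (which additionally needs a calibrator).
        if mode == 1:
            builder.setHalf2Mode(True)
        elif mode == 2:
            builder.setInt8Mode(True)
        elif mode != 0:
            raise ValueError("precision_mode must be 0, 1 or 2")

    b = FakeBuilder()
    apply_precision_mode(b, 1)
    assert b.flags == {"half2": True}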
- tensorflow::NodeDefBuilder op_builder( - tensorflow::strings::StrCat("my_trt_op", static_id++), "TRTEngineOp"); + tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp"); std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges; + VLOG(2) << "input edge size: " << input_names.size(); for (size_t i = 0; i < input_names.size(); ++i) { - int output_idx = input_inds.at(i).second; - // We wired up the input here already, it is redundant to do it again in - // ConvertSubGraphToTensorRT(convert_graph.cc) + VLOG(2) << "input edges: " << std::to_string(i) << " " + << input_names.at(i); + int output_idx = s.input_inds.at(i).second; + // we wired up the input here already, it is redundant to do it again in + // ConvertSubGraphToTensorRT(convert_graph.cc) auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut( input_names.at(i), output_idx, input_dtypes.at(i)); income_edges.push_back(incoming_edge); @@ -1586,7 +2386,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( .Attr("input_nodes", input_names) .Attr("output_nodes", output_names) .Attr("OutT", output_dtypes) - .Finalize(trt_node); + .Finalize(s.trt_node); VLOG(0) << status.ToString() << " finished op building"; @@ -1597,5 +2397,5 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef( } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +//#endif // GOOGLE_TENSORRT +//#endif // GOOGLE_CUDA diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h index 2e7fd19566..49e060a553 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h @@ -17,6 +17,8 @@ limitations under the License. #define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_ #include <set> +#include <string> +#include <unordered_map> #include <utility> #include <vector> @@ -25,28 +27,56 @@ limitations under the License. 
#include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT +//#if GOOGLE_CUDA +//#if GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { namespace convert { -tensorflow::Status ConvertSubGraphToTensorRTNodeDef( - const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids, - const std::vector<std::pair<int, int>>& - input_inds, // {node_id, output_idx} - const std::vector<std::pair<int, int>>& - output_inds, // {node_id, output_idx} - size_t max_batch_size, size_t max_workspace_size_bytes, - const tensorflow::grappler::GraphProperties& graph_prop, - tensorflow::NodeDef* trt_node); +struct SubGraphParams { + SubGraphParams(tensorflow::Graph& graph_, + const std::set<int>& subgraph_node_ids_, + const std::vector<std::pair<int, int>>& input_inds_, + const std::vector<std::pair<int, int>>& output_inds_, + size_t max_batch_size_, size_t max_workspace_size_bytes_, + const tensorflow::grappler::GraphProperties& graph_properties_, + std::unordered_map<string, std::pair<int, string>>* + output_edge_map_, + tensorflow::NodeDef* trt_node_, + int precision_mode_ = 0) + : graph(graph_), + subgraph_node_ids(subgraph_node_ids_), + input_inds(input_inds_), + output_inds(output_inds_), + max_batch_size(max_batch_size_), + max_workspace_size_bytes(max_workspace_size_bytes_), + graph_properties(graph_properties_), + output_edge_map(output_edge_map_), + trt_node(trt_node_), + precision_mode(precision_mode_) {} + tensorflow::Graph& graph; + const std::set<int>& subgraph_node_ids; + const std::vector<std::pair<int, int>>& input_inds; // {node_id, output_idx} + const std::vector<std::pair<int, int>>& output_inds; // {node_id, output_idx} + size_t max_batch_size; + size_t max_workspace_size_bytes; + const tensorflow::grappler::GraphProperties& graph_properties; + std::unordered_map<string, std::pair<int, string>>* output_edge_map; + tensorflow::NodeDef* trt_node; + const int precision_mode; +}; + +tensorflow::Status ConvertSubGraphToTensorRTNodeDef(SubGraphParams ¶ms); +tensorflow::Status InjectCalibrationNode(SubGraphParams& params); +tensorflow::Status ConvertCalibrationNodeToEngineNode(tensorflow::Graph& graph, + tensorflow::Node* c_node); } // namespace convert } // namespace tensorrt } // namespace tensorflow -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA +//#endif // GOOGLE_TENSORRT +//#endif // GOOGLE_CUDA #endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_ diff --git a/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc new file mode 100644 index 0000000000..7cd41c4933 --- /dev/null +++ b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc @@ -0,0 +1,111 @@ +// +// Created by skama on 1/25/18. 
+// + +#include "tensorflow/contrib/tensorrt/kernels/trt_calib_op.h" +#include "tensorrt/include/NvInfer.h" +#include <cuda_runtime_api.h> +#include "tensorflow/contrib/tensorrt/resources/TRTInt8Calibrator.h" +#include "tensorflow/contrib/tensorrt/resources/TRTResourceManager.h" +#include "tensorflow/contrib/tensorrt/resources/TRTResources.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { +namespace trt { +TRTCalibOp::TRTCalibOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("segment_nodes", &segment_nodes_)); + OP_REQUIRES_OK(context, context->GetAttr("input_names", &input_names_)); + OP_REQUIRES_OK(context, context->GetAttr("resource_name", &repo_name)); +}; + +#define TYPECASE(dt, X, Y) \ + case dt: { \ + Y = (void*)X->flat<tensorflow::EnumToDataType<dt>::Type>().data(); \ + break; \ + } +#define GET_TENSOR_ADDRESS(tensor_ptr, dest_ptr) \ + { \ + auto TENSOR_TYPE = tensor_ptr->dtype(); \ + switch (TENSOR_TYPE) { \ + TYPECASE(tensorflow::DT_FLOAT, tensor_ptr, dest_ptr); \ + TYPECASE(tensorflow::DT_HALF, tensor_ptr, dest_ptr); \ + TYPECASE(tensorflow::DT_INT8, tensor_ptr, dest_ptr); \ + default: { \ + LOG(FATAL) << "Unsupported Data type " \ + << tensorflow::DataTypeString(TENSOR_TYPE); \ + break; \ + } \ + } \ + } +void TRTCalibOp::Compute(tensorflow::OpKernelContext* ctx) { + auto trt_rm = tensorflow::trt::TRTResourceManager::instance(); + VLOG(2) << "Op Name= " << name() << " nodedef name= " << repo_name; + auto resmgr = trt_rm->getManager("TRTCalibOps"); + tensorflow::trt::TRTCalibrationResource* calibRes = nullptr; + auto status = resmgr->Lookup(repo_name, repo_name, &calibRes); + if (status.ok()) { + int batchSize = ctx->input(0).dim_size(0); + VLOG(2) << "SAMI Batchsize= " << batchSize; + int numInputs = ctx->num_inputs(); + VLOG(2) << "SAMI numInputs= " << numInputs; + dev_tensors_.resize(numInputs); + if (calibRes->calibrator == nullptr) { + VLOG(1) << " Constructing calibrator"; + // first run + for (int i = 0; i < numInputs; i++) { + const tensorflow::Tensor& t = ctx->input(i); + VLOG(1) << "Tensor " << i << " " << t.shape().DebugString(); + OP_REQUIRES_OK(ctx, + ctx->allocate_persistent(t.dtype(), t.shape(), + &dev_tensors_.at(i), nullptr)); + const auto dTensor = dev_tensors_.at(i).AccessTensor(ctx); + CHECK_EQ(t.TotalBytes(), dTensor->TotalBytes()); + void* devAddr = nullptr; + GET_TENSOR_ADDRESS(dTensor, devAddr) + device_buffers_.emplace( + input_names_.at(i), + std::pair<void*, size_t>(devAddr, dTensor->TotalBytes())); + } + calibRes->calibrator = new TRTInt8Calibrator(device_buffers_, batchSize); + calibRes->thr = new std::thread([calibRes]() { + VLOG(0)<<"Starting calibration thread, Calibration Resource @ "<<calibRes; + calibRes->builder->setInt8Calibrator(calibRes->calibrator); + calibRes->builder->setInt8Mode(true); + calibRes->engine = calibRes->builder->buildCudaEngine( + *calibRes->network); // will loop until we terminate calibrator + VLOG(0) << "SAMI Calibration loop terminated"; + }); + VLOG(0) << "SAMI initialized calibrator resource"; + } + + std::unordered_map<string, void*> input_data; + for (int i = 0; i < numInputs; i++) { + const Tensor& t = ctx->input(i); + void* data_address = nullptr; + const Tensor* t_ptr = &t; + GET_TENSOR_ADDRESS(t_ptr, data_address); + const auto dTensor = 
dev_tensors_.at(i).AccessTensor(ctx); + CHECK_EQ(t.TotalBytes(), + dTensor->TotalBytes()); // use the tensor so FW keeps it + input_data.emplace(input_names_.at(i), data_address); + ctx->set_output(i, t); + } + VLOG(1) << "Filled map for sending"; + calibRes->calibrator->setBatch(input_data); + VLOG(1) << "Passed calibration data"; + } else { + ctx->SetStatus(status); + return; + } +}; + +#undef TYPECASE + +REGISTER_KERNEL_BUILDER(Name("TRTCalibOp").Device(DEVICE_GPU), TRTCalibOp); + +} // namespace trt +} // namespace tensorflow
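The kernel above and the calibrator implemented further down communicate through a simple ping-pong: Compute() blocks in setBatch() until the builder thread has consumed the previous batch in getBatch(). A toy Python model of that handshake (class and method names are illustrative only):

    import threading

    class CalibFeederSketch:
        # Toy model of the handshake: the op thread parks a batch with
        # set_batch() and waits; the builder thread picks it up in get_batch().
        def __init__(self):
            self.cond = threading.Condition()
            self.batch = None
            self.done = False

        def set_batch(self, data):      # producer side: TRTCalibOp::Compute
            with self.cond:
                while self.batch is not None and not self.done:
                    self.cond.wait(timeout=0.05)
                if self.done:
                    return False
                self.batch = data
                self.cond.notify_all()
            return True

        def get_batch(self):            # consumer side: TRT builder thread
            with self.cond:
                while self.batch is None and not self.done:
                    self.cond.wait(timeout=0.05)
                if self.done:
                    return None
                data, self.batch = self.batch, None
                self.cond.notify_all()
            return data

        def set_done(self):
            with self.cond:
                self.done = True
                self.cond.notify_all()

    feeder = CalibFeederSketch()
    worker = threading.Thread(target=lambda: print("got", feeder.get_batch()))
    worker.start()
    feeder.set_batch({"input": [1.0, 2.0, 3.0]})
    worker.join()
    feeder.set_done()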
\ No newline at end of file
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_calib_op.h b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.h
new file mode 100644
index 0000000000..792e7bae4c
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.h
@@ -0,0 +1,36 @@
+//
+// Created by skama on 1/25/18.
+//
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_CALIB_OP_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_CALIB_OP_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace trt {
+// Kernel that gathers INT8 calibration data while passing its inputs through.
+class TRTCalibOp : public OpKernel {
+ public:
+  explicit TRTCalibOp(OpKernelConstruction* context);
+
+  void Compute(OpKernelContext* context) override;
+
+ private:
+  std::string repo_name;
+  std::vector<std::string> segment_nodes_;
+  std::vector<std::string> input_names_;
+  std::vector<tensorflow::TensorShape> shapes_;
+  std::unordered_map<std::string, std::pair<void*, size_t>> device_buffers_;
+  std::vector<tensorflow::PersistentTensor> dev_tensors_;
+};
+}  // namespace trt
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_CALIB_OP_H_
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 8efdf63ebe..e4e8ab9e0a 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -24,8 +24,11 @@ limitations under the License.
 #include "cuda/include/cuda_runtime_api.h"

 namespace tensorflow {
+static ::tensorflow::tensorrt::Logger gLogger;
+
+using IRuntime = nvinfer1::IRuntime;
+using Dims = nvinfer1::Dims;
 namespace tensorrt {
-static ::tensorflow::tensorrt::Logger logger;

 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // read serialized_engine
@@ -38,14 +41,22 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_));

   // TODO(samikama) runtime should be taken from a resourcemanager as well.
-  // Only engine should be in the op and context and runtime should be taken
-  // from resourcemanager
-  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
+  // Only engine should be in the op and context and runtime should be taken
+  // from resourcemanager
+
+  // TODO(jie): cudaSetDevice makes sure the TRT engine is allocated on the
+  // same GPU where the input/output is also located.
+ int gpu_id = context->device()->tensorflow_gpu_device_info()->gpu_id; + cudaSetDevice(gpu_id); + int device; + cudaGetDevice(&device); + if (gpu_id != device) + LOG(FATAL) << "set device failed!"; + + IRuntime* infer = nvinfer1::createInferRuntime(gLogger); trt_engine_ptr_.reset(infer->deserializeCudaEngine( serialized_engine.c_str(), serialized_engine.size(), nullptr)); - trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext()); - // Runtime is safe to delete after engine creation infer->destroy(); } @@ -64,10 +75,16 @@ void TRTEngineOp::Compute(OpKernelContext* context) { const TensorShape& input_shape = input_tensor.shape(); if (i == 0) { num_batch = input_shape.dim_size(0); + if (num_batch > trt_engine_ptr_->getMaxBatchSize()) + LOG(FATAL) << "input tensor batch larger than max_batch_size: " + << trt_engine_ptr_->getMaxBatchSize(); } else if (num_batch != input_shape.dim_size(0)) { valid = false; break; } + // int64 input_shape.dim_size(int d) + // int input_shape.dims() + LOG(INFO) << "INPUT BINDING index: " << binding_index << " with name: " << input_nodes_[i]; switch (trt_engine_ptr_->getBindingDataType(binding_index)) { case nvinfer1::DataType::kFLOAT: buffers[binding_index] = (void*)(input_tensor.flat<float>().data()); @@ -81,9 +98,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) { } } - // Might want a different way to inform the user of batch size inconsistency - if (!valid) LOG(WARNING) << "input data inconsistent batch size"; - + if (!valid) LOG(FATAL) << "input data inconsistent batch size"; for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) { // This is bad that we have to reallocate output buffer every run. // Create an output tensor @@ -119,6 +134,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) { break; } } + LOG(INFO) << "getting stream"; // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files const cudaStream_t* stream = CHECK_NOTNULL( reinterpret_cast<const cudaStream_t*>(context->op_device_context() @@ -126,9 +142,11 @@ void TRTEngineOp::Compute(OpKernelContext* context) { ->implementation() ->CudaStreamMemberHack())); - // execution handled by TF since we are getting stream from TF. - // it is safe for CPU pointer array (buffers) to go out of scope after enqueue - trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], *stream, nullptr); + // TODO(jie): trt enqueue does not return error + auto ret=trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], *stream, nullptr); + VLOG(2) << "enqueue returns: " << ret; + // sync should be done by TF. + // cudaStreamSynchronize(*stream); } REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp); diff --git a/tensorflow/contrib/tensorrt/ops/trt_calib_op.cc b/tensorflow/contrib/tensorrt/ops/trt_calib_op.cc new file mode 100644 index 0000000000..dab5a3e0e8 --- /dev/null +++ b/tensorflow/contrib/tensorrt/ops/trt_calib_op.cc @@ -0,0 +1,36 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +namespace tensorflow { + + +REGISTER_OP("TRTCalibOp") + .Attr("segment_nodes: list(string)") // names of the ops in segment + .Attr("segment_output_names: list(string)") // names of the output ops in segment + .Attr("input_names: list(string)") // names of the inputs for passing into tensorrt + .Attr("resource_name: string") + .Attr("InT: list({int8, float16, float32})") + .Input("in_tensor: InT") + .Output("out_tensor: InT") + .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) { + for (int i = 0; i < c->num_inputs(); i++){ + c->set_output(i, c->input(i)); + } + return Status::OK(); + }); + +} // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/python/__init__.py b/tensorflow/contrib/tensorrt/python/__init__.py index 7e050a768c..3941d150d1 100644 --- a/tensorflow/contrib/tensorrt/python/__init__.py +++ b/tensorflow/contrib/tensorrt/python/__init__.py @@ -21,4 +21,5 @@ from __future__ import print_function # pylint: disable=unused-import,line-too-long from tensorflow.contrib.tensorrt.python.ops import trt_engine_op from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph +from tensorflow.contrib.tensorrt.python.trt_convert import calib_graph_to_infer_graph # pylint: enable=unused-import,line-too-long diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py index 9454862f85..94afb75897 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert.py @@ -20,10 +20,18 @@ from __future__ import print_function # pylint: disable=unused-import,line-too-long import six as _six -from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert from tensorflow.core.framework import graph_pb2 from tensorflow.python.framework import errors from tensorflow.python.framework import errors_impl as _impl +from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert,calib_convert +from tensorflow.python.util import compat +import tensorflow as tf +from tensorflow.python.grappler import tf_optimizer +from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python.framework import meta_graph +from tensorflow.python.framework import ops + + from tensorflow.python.framework import ops @@ -32,7 +40,8 @@ from tensorflow.python.framework import ops def create_inference_graph(input_graph_def, outputs, max_batch_size=1, - max_workspace_size_bytes=2 << 20): + max_workspace_size_bytes=2 << 20, + precision_mode="FP32"): """Python wrapper for the TRT transormation. @@ -48,7 +57,13 @@ def create_inference_graph(input_graph_def, Raises: RuntimeError: if the returned status message is malformed. """ - + supported_precision_modes={"FP32":0, + "FP16":1, + "INT8":2} + if precision_mode.upper() not in supported_precision_modes: + raise ValueError(("precision mode '{}' is not supported." + "It should be one of {}").format(precision_mode,"{'FP32','FP16','INT8'}")) + mode=supported_precision_modes[precision_mode.upper()] def py2bytes(inp): return inp @@ -83,7 +98,7 @@ def create_inference_graph(input_graph_def, # pair or strings where first one is encoded status and the second # one is the transformed graphs protobuf string. 
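Taken together with calib_graph_to_infer_graph below, the new precision_mode argument enables an INT8 round trip. A usage sketch (frozen_graph_def, calibration_batches, and the "input"/"logits" tensor names are hypothetical placeholders):

    import tensorflow as tf
    from tensorflow.contrib import tensorrt as trt

    # 1) Rewrite the frozen graph, placing TRTCalibOp nodes over the segments.
    calib_graph = trt.create_inference_graph(frozen_graph_def, ["logits"],
                                             max_batch_size=8,
                                             max_workspace_size_bytes=1 << 30,
                                             precision_mode="INT8")
    # 2) Run representative batches so TRTCalibOp can collect INT8 ranges.
    with tf.Graph().as_default():
        tf.import_graph_def(calib_graph, name="")
        with tf.Session() as sess:
            for batch in calibration_batches:
                sess.run("logits:0", feed_dict={"input:0": batch})
    # 3) Freeze the collected calibration into serialized TRT engines.
    int8_graph = trt.calib_graph_to_infer_graph(calib_graph)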
out = trt_convert(input_graph_def_str, out_names, max_batch_size, - max_workspace_size_bytes) + max_workspace_size_bytes,mode) status = to_string(out[0]) output_graph_def_string = out[1] del input_graph_def_str # Save some memory @@ -101,3 +116,40 @@ def create_inference_graph(input_graph_def, output_graph_def.ParseFromString(output_graph_def_string) del output_graph_def_string # Save some memory return output_graph_def + +def calib_graph_to_infer_graph(calibration_graph_def): + def py2bytes(inp): + return inp + + def py3bytes(inp): + return inp.encode("utf-8", errors="surrogateescape") + + def py2string(inp): + return inp + + def py3string(inp): + return inp.decode("utf-8") + + if _six.PY2: + to_bytes = py2bytes + to_string = py2string + else: + to_bytes = py3bytes + to_string = py3string + + graph_str=calibration_graph_def.SerializeToString() + out=calib_convert(graph_str) + status=to_string(out[0]) + output_graph_def_string = out[1] + del graph_str #save some memory + if len(status) < 2: + raise _impl.UnknownError(None,None,status) + if status[:2] != "OK": + msg=status.split(";") + if len(msg) == 1: + raise RuntimeError("Status message is malformed {}".format(status)) + raise _impl._make_specific_exception(None,None,";".join(msg[1:]), int(msg[0])) + output_graph_def = graph_pb2.GraphDef() + output_graph_def.ParseFromString(output_graph_def_string) + del output_graph_def_string #save some memory + return output_graph_def diff --git a/tensorflow/contrib/tensorrt/resources/TRTInt8Calibrator.cc b/tensorflow/contrib/tensorrt/resources/TRTInt8Calibrator.cc new file mode 100644 index 0000000000..f5dc4886af --- /dev/null +++ b/tensorflow/contrib/tensorrt/resources/TRTInt8Calibrator.cc @@ -0,0 +1,132 @@ +// +// Created by skama on 1/24/18. +// + +#include "tensorflow/contrib/tensorrt/resources/TRTInt8Calibrator.h" + +#include <cuda_runtime_api.h> +#include <atomic> +#include <chrono> +#include <unordered_map> + +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace trt { +// set the batch size before constructing the thread to execute engine +int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } + +TRTInt8Calibrator::TRTInt8Calibrator(const std::unordered_map< + string, std::pair<void*, size_t>>& dev_buffers, + int batch_size) + : batch_size_(batch_size), + done_(false), + dev_buffers_(dev_buffers), + calib_running_(false){ + cudaPointerAttributes pa; + int devid=-1; + cudaGetDevice(&devid); + VLOG(0)<<"Constructing calibrator with batch size "<<batch_size<<" on device"<<devid; + for(auto b : dev_buffers_) { + if(cudaPointerGetAttributes(&pa,b.second.first)==cudaSuccess){ + VLOG(1) << "CALIBRATOR Device buffer name " << b.first << " size" << b.second.second + << " @ " << b.second.first << " onDevice "<<((pa.memoryType==cudaMemoryTypeHost)?"HOST":"DEVICE"); + }else { + VLOG(1) << "CALIBRATOR Device buffer name " << b.first << " size" << b.second.second << " @ " << b.second.first; + } + } +} + +bool TRTInt8Calibrator::setBatch( + const std::unordered_map<string, void*>& data) { + VLOG(1)<<"SAMI SAMI Waiting to set new batch"; + if(done_)return false; + while (calib_running_.load( + std::memory_order_acquire)) { // wait while calibration is running + tensorflow::mutex_lock l(cond_mtx_); + cond_.wait_for(l, std::chrono::milliseconds(50)); + if(done_)return false; + } + VLOG(1)<<"Set Batch Waiting finished"; + for (const auto it : data) { + + auto devptr = dev_buffers_.find(it.first); + if (devptr == dev_buffers_.end()) { + LOG(FATAL) << "FATAL input name '" 
<< it.first
+                 << "' does not match with the buffer names";
+    }
+    const auto& d = devptr->second;
+    VLOG(1) << "cuda memcpy buffer name= " << it.first << " dst= " << d.first
+            << " size= " << d.second << " src= " << it.second;
+
+    auto status =
+        cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice);
+    if (status != cudaSuccess) {
+      LOG(FATAL) << "cudaMemcpy for '" << it.first << "' failed with "
+                 << status;
+    }
+  }
+  calib_running_.store(true, std::memory_order_release);  // release builder
+  cond_.notify_all();
+  return true;
+}
+
+bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
+                                 int nbBindings) {
+  calib_running_.store(false, std::memory_order_release);  // wait for new batch
+  VLOG(1) << "Calibrator is waiting for a new batch";
+  cond_.notify_all();
+  while (!calib_running_.load(
+      std::memory_order_acquire)) {  // wait until new batch arrives
+    tensorflow::mutex_lock l(cond_mtx_);
+    cond_.wait_for(l, std::chrono::milliseconds(50));
+    if (done_) return false;
+  }
+  if (done_) {
+    return false;
+  }
+
+  for (int i = 0; i < nbBindings; i++) {
+    auto it = dev_buffers_.find(names[i]);
+    if (it == dev_buffers_.end()) {
+      LOG(FATAL) << "Calibration engine asked for unknown tensor name '"
+                 << names[i] << "' at position " << i;
+    }
+    VLOG(1) << "Setting buffer " << i << " named= " << names[i] << " @ "
+            << it->second.first;
+    bindings[i] = it->second.first;
+  }
+  return true;
+}
+
+const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) {
+  // Calibration caching is not implemented yet; always run calibration.
+  return nullptr;
+}
+
+void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
+                                              std::size_t length) {}
+
+TRTInt8Calibrator::~TRTInt8Calibrator() {
+  VLOG(1) << "Destroying calibrator";
+}
+
+}  // namespace trt
+}  // namespace tensorflow
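setBatch above copies each incoming batch into preallocated, fixed-address device buffers so the bindings handed to TRT in getBatch never change across batches. A NumPy stand-in for that staging idea (the class name is hypothetical; np.copyto stands in for cudaMemcpyDeviceToDevice):

    import numpy as np

    class DeviceBufferStageSketch:
        # Each named input gets one preallocated buffer whose identity never
        # changes, so the calibrator can hand TRT the same bindings each batch.
        def __init__(self, shapes):
            self.buffers = {name: np.empty(shape, np.float32)
                            for name, shape in shapes.items()}

        def set_batch(self, data):
            for name, array in data.items():
                dst = self.buffers[name]
                if dst.nbytes != array.nbytes:
                    raise ValueError("size mismatch for input '%s'" % name)
                np.copyto(dst, array)  # stands in for cudaMemcpyDeviceToDevice

    stage = DeviceBufferStageSketch({"input": (8, 3, 32, 32)})
    stage.set_batch({"input": np.zeros((8, 3, 32, 32), np.float32)})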
\ No newline at end of file diff --git a/tensorflow/contrib/tensorrt/resources/TRTInt8Calibrator.h b/tensorflow/contrib/tensorrt/resources/TRTInt8Calibrator.h new file mode 100644 index 0000000000..b8bf55f56e --- /dev/null +++ b/tensorflow/contrib/tensorrt/resources/TRTInt8Calibrator.h @@ -0,0 +1,39 @@ +// +// Created by skama on 1/24/18. +// + +#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTINT8CALIBRATOR_H_ +#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTINT8CALIBRATOR_H_ + +#include "tensorrt/include/NvInfer.h" +#include <atomic> +#include <string> +#include <unordered_map> +#include <utility> +#include "tensorflow/core/platform/mutex.h" +namespace tensorflow { +namespace trt { + +struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { + public: + TRTInt8Calibrator(const std::unordered_map< + string, std::pair<void*, size_t>>& dev_buffers, + int batch_size); + int getBatchSize() const; + bool getBatch(void* bindings[], const char* names[], int nbBindings) override; + bool setBatch(const std::unordered_map<string, void*> &data); + void setDone(){done_=true;} + const void *readCalibrationCache(std::size_t &length) override; + void writeCalibrationCache(const void *ptr, std::size_t length) override; + ~TRTInt8Calibrator(); + private: + int batch_size_; + tensorflow::mutex cond_mtx_; + tensorflow::condition_variable cond_; + bool done_; + const std::unordered_map<string, std::pair<void*, size_t>> dev_buffers_; + std::atomic_bool calib_running_; +}; +} // namespace trt +} // namespace tensorflow +#endif // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTINT8CALIBRATOR_H_ diff --git a/tensorflow/contrib/tensorrt/resources/TRTResourceManager.cc b/tensorflow/contrib/tensorrt/resources/TRTResourceManager.cc new file mode 100644 index 0000000000..62d27c1104 --- /dev/null +++ b/tensorflow/contrib/tensorrt/resources/TRTResourceManager.cc @@ -0,0 +1,21 @@ +// +// Created by skama on 1/23/18. +// + +#include "tensorflow/contrib/tensorrt/resources/TRTResourceManager.h" +#include "tensorflow/core/platform/default/logging.h" + + +std::shared_ptr<tensorflow::ResourceMgr> tensorflow::trt::TRTResourceManager::getManager(const std::string &mgr_name) { + // mutex is held for lookup only. Most instantiations where mutex will be held longer + // will be during op creation and should be ok. + tensorflow::mutex_lock lock(map_mutex_); + auto s=managers_.find(mgr_name); + if(s==managers_.end()){ + auto it=managers_.emplace(mgr_name,std::make_shared<tensorflow::ResourceMgr>(mgr_name)); + VLOG(0)<<"Returning a new manager "<<mgr_name; + return it.first->second; + } + VLOG(1)<<"Returning old manager "<<mgr_name; + return s->second; +} diff --git a/tensorflow/contrib/tensorrt/resources/TRTResourceManager.h b/tensorflow/contrib/tensorrt/resources/TRTResourceManager.h new file mode 100644 index 0000000000..e3b50093e7 --- /dev/null +++ b/tensorflow/contrib/tensorrt/resources/TRTResourceManager.h @@ -0,0 +1,37 @@ +// +// Created by skama on 1/23/18. 
+//
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCEMANAGER_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCEMANAGER_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace trt {
+class TRTResourceManager {
+  TRTResourceManager() = default;
+
+ public:
+  static std::shared_ptr<TRTResourceManager> instance() {
+    static std::shared_ptr<TRTResourceManager> instance_(
+        new TRTResourceManager);
+    return instance_;
+  }
+  // Returns a manager for the given op; if it doesn't exist, one is created.
+  std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
+
+ private:
+  std::unordered_map<string, std::shared_ptr<tensorflow::ResourceMgr>>
+      managers_;
+  tensorflow::mutex map_mutex_;
+};
+}  // namespace trt
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCEMANAGER_H_
diff --git a/tensorflow/contrib/tensorrt/resources/TRTResources.h b/tensorflow/contrib/tensorrt/resources/TRTResources.h
new file mode 100644
index 0000000000..cd23100af8
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/TRTResources.h
@@ -0,0 +1,59 @@
+//
+// Created by skama on 1/23/18.
+//
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
+
+#include <sstream>
+#include <string>
+#include <thread>
+
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/resources/TRTInt8Calibrator.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace trt {
+
+struct TRTCalibrationResource : public tensorflow::ResourceBase {
+  TRTCalibrationResource()
+      : calibrator(nullptr),
+        builder(nullptr),
+        network(nullptr),
+        engine(nullptr),
+        logger(nullptr),
+        thr(nullptr) {}
+  string DebugString() override {
+    std::stringstream oss;
+    oss << " Calibrator = " << std::hex << calibrator << std::dec << std::endl
+        << " Builder = " << std::hex << builder << std::dec << std::endl
+        << " Network = " << std::hex << network << std::dec << std::endl
+        << " Engine = " << std::hex << engine << std::dec << std::endl
+        << " Logger = " << std::hex << logger << std::dec << std::endl
+        << " Thread = " << std::hex << thr << std::dec << std::endl;
+    return oss.str();
+  }
+  ~TRTCalibrationResource() {
+    VLOG(0) << "Destroying calibration resource " << std::endl << DebugString();
+  }
+  TRTInt8Calibrator* calibrator;
+  nvinfer1::IBuilder* builder;
+  nvinfer1::INetworkDefinition* network;
+  nvinfer1::ICudaEngine* engine;
+  tensorflow::tensorrt::Logger* logger;
+  std::thread* thr;
+};
+
+struct TRTEngineResource : public tensorflow::ResourceBase {
+  TRTEngineResource() : runtime(nullptr), ctx(nullptr) {}
+  nvinfer1::IRuntime* runtime;
+  nvinfer1::IExecutionContext* ctx;
+};
+
+}  // namespace trt
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index d679945d56..0ae3c91a63 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -64,13 +64,16 @@ PyObject* pair_helper(std::pair<string, string>* in) {
 %ignoreall
 %unignore tensorflow;
 %unignore trt_convert;
+%unignore calib_convert;
 %{
+
 std::pair<string, string> trt_convert(
     string graph_def_string,  // The serialized GraphDef string.
     std::vector<string> output_names,
     size_t max_batch_size,
-    size_t max_workspace_size_bytes
+    size_t max_workspace_size_bytes,
+    int precision_mode
     // Unfortunately we can't use TF_Status here since it
     // is in c/c_api and brings in a lot of other libraries
     // which in turn declare ops. These ops are included
@@ -90,16 +93,19 @@ std::pair<string, string> trt_convert(
     return std::pair<string, string>{out_status, ""};
   }

+  if (precision_mode < 0 || precision_mode > 2) {
+    out_status = "InvalidArgument;Invalid precision_mode";
+    return std::pair<string, string>{out_status, ""};
+  }
   if (!output_names.size()) {
     out_status = "InvalidArgument;Size of the output_names vector is 0";
     return std::pair<string, string>{out_status, ""};
-    // return "";
   }
   tensorflow::GraphDef outGraph;
   tensorflow::Status conversion_status =
       tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
           graph_def, output_names, max_batch_size, max_workspace_size_bytes,
-          &outGraph);
+          &outGraph, precision_mode);
   if (!conversion_status.ok()) {
     auto retCode = (int)conversion_status.code();
     char buff[2000];
@@ -120,12 +126,60 @@ std::pair<string, string> trt_convert(
   return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
 #endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
 }
+
+std::pair<string, string> calib_convert(string graph_def_string  // const tensorflow::GraphDef&
+    // Unfortunately we can't use TF_Status here since it
+    // is in c/c_api and brings in a lot of other libraries
+    // which in turn declare ops. These ops are included
+    // statically in our library and cause an abort when
+    // module is loaded due to double registration
+    // until Tensorflow properly exposes these headers
+    // we have to work around this by returning a string
+    // and converting it to an exception on the Python side.
+ //,TF_Status* out_status) { +) { +#if GOOGLE_CUDA && GOOGLE_TENSORRT + string out_status; + + tensorflow::GraphDef graph_def; + if (!graph_def.ParseFromString(graph_def_string)) { + out_status = "InvalidArgument;Couldn't interpret input as a GraphDef"; + return std::pair<string, string>{out_status, ""}; + } + + tensorflow::GraphDef outGraph; + tensorflow::Status conversion_status = + tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(graph_def, + &outGraph); + if (!conversion_status.ok()) { + auto retCode = (int)conversion_status.code(); + char buff[2000]; + snprintf(buff, 2000, "%d;%s", retCode, + conversion_status.error_message().c_str()); + out_status=buff; + return std::pair<string, string>{out_status, ""}; + } + string result; + if (!outGraph.SerializeToString(&result)) { + out_status = "InvalidArgument;Couldn't serialize output as a GraphDef"; + return std::pair<string, string>{out_status, ""}; + } + out_status="OK;All good!"; + return std::pair<string, string>{out_status, result}; +#else + // Returns FAILED_PRECONDITION. + return std::pair<string, string>{"9;TensorRT is not enabled!", ""}; +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT +} %} +std::pair<string, string> calib_convert(string graph_def_string); + std::pair<string, string> trt_convert(string graph_def_string, std::vector<string> output_names, size_t max_batch_size, - size_t max_workspace_size_bytes); + size_t max_workspace_size_bytes, + int precision_mode); %unignoreall |
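Both SWIG entry points report success as a string starting with "OK" and failures as "<code>;<message>". A sketch of the decoding that the wrappers in trt_convert.py perform on these statuses:

    def decode_status(status):
        # Decode the "code;message" status strings returned by trt_convert and
        # calib_convert, mirroring the checks in trt_convert.py.
        if len(status) < 2:
            raise RuntimeError("status message is malformed: %r" % status)
        if status[:2] == "OK":
            return
        parts = status.split(";")
        if len(parts) == 1:
            raise RuntimeError("status message is malformed: %r" % status)
        raise RuntimeError("conversion failed with code %s: %s"
                           % (parts[0], ";".join(parts[1:])))

    decode_status("OK;All good!")                      # no exception
    try:
        decode_status("9;TensorRT is not enabled!")
    except RuntimeError as e:
        print(e)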