1 files changed, 204 insertions, 1265 deletions
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 75a3c3d034..9ee717dd7f 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -24,10 +24,6 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
-#include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"  // NOLINT
 #include "tensorflow/core/framework/types.h"
@@ -36,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tensor_coding.h"
@@ -44,6 +39,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorrt/include/NvInfer.h"
 
 //  Check if the types are equal. Cast to int first so that failure log message
@@ -53,7 +49,6 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
-using ::tensorflow::strings::StrCat;
 
 namespace {
 
@@ -70,8 +65,7 @@ inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
       *trt_dtype = nvinfer1::DataType::kHALF;
       break;
     default:
-      return tensorflow::errors::InvalidArgument(
-          "Unsupported data type " + tensorflow::DataTypeString(tf_dtype));
+      return tensorflow::errors::InvalidArgument("Unsupported data type");
   }
   return tensorflow::Status::OK();
 }
@@ -118,18 +112,6 @@ static std::vector<std::pair<int, int>> CreateSamePadding(
   return padding;
 }
 
-string GetCommonNameScope(const string& op_name_a, const string& op_name_b) {
-  size_t last_scope_separator = 0;
-  for (size_t i = 0; i < std::min(op_name_a.size(), op_name_b.size()); ++i) {
-    if (op_name_a[i] != op_name_b[i]) {
-      break;
-    } else if (op_name_a[i] == '/') {
-      last_scope_separator = i + 1;
-    }
-  }
-  return op_name_a.substr(0, last_scope_separator);
-}
-
 class TRT_ShapedWeights {
  public:
   TRT_ShapedWeights(tensorflow::DataType type, const void* values,
@@ -263,11 +245,6 @@ std::vector<int> TFAttrs::get<std::vector<int>>(string key) const {
 }
 
 template <>
-std::vector<string> TFAttrs::get<std::vector<string>>(string key) const {
-  auto attr = this->at(key)->list().s();
-  return std::vector<string>(attr.begin(), attr.end());
-}
-template <>
 nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
   auto values = this->get<std::vector<int>>(key);
   nvinfer1::Dims dims;
@@ -289,17 +266,6 @@ tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
   return this->at(key)->type();
 }
 
-template <>
-float TFAttrs::get<float>(string key) const {
-  return this->at(key)->f();
-}
-
-template <>
-bool TFAttrs::get<bool>(string key) const {
-  return this->at(key)->b();
-}
-
-// TODO(jie): reorder4 & reorder2 should be merged?
 template <typename T>
 void Reorder4(nvinfer1::DimsNCHW shape, const T* idata,
               nvinfer1::DimsNCHW istrides, T* odata,
@@ -317,87 +283,29 @@ void Reorder4(nvinfer1::DimsNCHW shape, const T* idata,
   }
 }
 
-template <typename T>
-void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
-              T* odata, nvinfer1::DimsHW ostrides) {
-  for (int h = 0; h < shape.h(); ++h) {
-    for (int w = 0; w < shape.w(); ++w) {
-      odata[h * ostrides.h() + w * ostrides.w()] =
-          idata[h * ostrides.h() + w * ostrides.w()];
-    }
-  }
-}
-
-// TODO(jie): fallback to tensorflow!!
-void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
-                   TRT_ShapedWeights* oweights) {
-  int c = iweights.shape_.d[0];
-  int k = iweights.shape_.d[1];
-  oweights->shape_.d[0] = k;
-  oweights->shape_.d[1] = c;
-  nvinfer1::DimsHW istrides = {1, k};
-  nvinfer1::DimsHW ostrides = {c, 1};
-  switch (iweights.type_) {
-    case tensorflow::DataType::DT_FLOAT: {
-      Reorder2({k, c}, static_cast<float const*>(iweights.GetValues()),
-               istrides,
-               static_cast<float*>(const_cast<void*>(oweights->GetValues())),
-               ostrides);
-      break;
-    }
-    case tensorflow::DataType::DT_HALF: {
-      Reorder2(
-          {k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
-          istrides,
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
-          ostrides);
-      break;
-    }
-    default:
-      LOG(FATAL) << "Unsupported type in reorder expected fp32 or fp16 but got "
-                 << DataTypeString(iweights.type_);
-  }
-}
-
 void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
-                       TRT_ShapedWeights* oweights, int num_groups) {
+                       TRT_ShapedWeights* oweights) {
   CHECK_EQ(iweights.type_, oweights->type_);
   CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
   int r = iweights.shape_.d[0];
   int s = iweights.shape_.d[1];
-  // TRT requires GKcRS, while TF depthwise has RSCK
-  //   where c=1, C=G
-  VLOG(2) << "num_groups: " << num_groups;
-  int c = iweights.shape_.d[2] / num_groups;
-  VLOG(2) << "c" << iweights.shape_.d[2] << " then " << c;
-  int k = iweights.shape_.d[3] * num_groups;
-  VLOG(2) << "k" << iweights.shape_.d[3] << " then " << k;
-  oweights->shape_.d[0] = k / num_groups;
-  oweights->shape_.d[1] = c * num_groups;
+  int c = iweights.shape_.d[2];
+  int k = iweights.shape_.d[3];
+  oweights->shape_.d[0] = k;
+  oweights->shape_.d[1] = c;
   oweights->shape_.d[2] = r;
   oweights->shape_.d[3] = s;
   nvinfer1::DimsNCHW istrides = {1, k, s * k * c, c * k};
   nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1};
   switch (iweights.type_) {
-    case tensorflow::DataType::DT_FLOAT: {
+    case tensorflow::DataType::DT_FLOAT:
       Reorder4({k, c, r, s}, static_cast<float const*>(iweights.GetValues()),
                istrides,
                static_cast<float*>(const_cast<void*>(oweights->GetValues())),
                ostrides);
       break;
-    }
-    case tensorflow::DataType::DT_HALF: {
-      Reorder4(
-          {k, c, r, s}, static_cast<Eigen::half const*>(iweights.GetValues()),
-          istrides,
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
-          ostrides);
-      break;
-    }
-
     default:
-      LOG(FATAL) << "Unsupported type, expected fp32 or fp16 but got "
-                 << DataTypeString(iweights.type_);
+      LOG(FATAL) << "!!!!!!!!!!!!!!!!!!!!!!!!broke!!!!!!!!!!!!";
   }
 }
 
@@ -415,11 +323,12 @@ inline std::shared_ptr<T> infer_object(T* obj) {
   return std::shared_ptr<T>(obj, InferDeleter());
 }
 
+// Logger for GIE info/warning/errors
 class Converter;
 
 using OpConverter =
     std::function<tensorflow::Status(Converter&, const tensorflow::NodeDef&,
-                                     const std::vector<TRT_TensorOrWeights>&,
+                                     std::vector<TRT_TensorOrWeights> const&,
                                      std::vector<TRT_TensorOrWeights>*)>;
 
 class Converter {
@@ -427,57 +336,34 @@ class Converter {
   std::unordered_map<string, OpConverter> op_registry_;
   nvinfer1::INetworkDefinition* trt_network_;
   std::list<std::vector<uint8_t>> temp_bufs_;
-  tensorflow::tensorrt::TRTWeightStore* weight_store_;
-  bool fp16_;
+
   void register_op_converters();
+
   std::vector<TRT_TensorOrWeights> get_inputs(
       const tensorflow::NodeDef& node_def) {
     std::vector<TRT_TensorOrWeights> inputs;
-    for (auto const& input_name : node_def.input()) {
-      /*************************************************************************
-       * TODO(jie) handle case 1) here
-       * Normalizes the inputs and extracts associated metadata:
-       * 1) Inputs can contain a colon followed by a suffix of characters.
-       *    That suffix may be a single number (e.g. inputName:1) or several
-       *    word characters separated from a number by a colon
-       *    (e.g. inputName:foo:1). The
-       *    latter case is used to denote inputs and outputs of functions.
-       * 2) Control dependency inputs contain caret at the beginning and we
-       *    remove this and annotate the edge as a control dependency.
-       ************************************************************************/
-      string name = input_name[0] == '^' ? input_name.substr(1) : input_name;
-      auto first = name.find_first_of(':');
-      if (first != string::npos && first + 2 == name.size() &&
-          name[first + 1] == '0')
-        name.erase(first);
-
-      VLOG(2) << "retrieve input: " << name;
-      if (trt_tensors_.count(name)) {
-        inputs.push_back(trt_tensors_.at(name));
-      } else {
-        LOG(FATAL) << "input: " << name << " not availabled for node at, "
-                   << node_def.name();
-      }
+    for (const auto& input_name : node_def.input()) {
+      VLOG(2) << "Retrieve input: " << input_name;
+      inputs.push_back(trt_tensors_.at(input_name));
     }
     return inputs;
   }
 
  public:
-  explicit Converter(nvinfer1::INetworkDefinition* trt_network,
-                     tensorflow::tensorrt::TRTWeightStore* ws, bool fp16)
-      : trt_network_(trt_network), weight_store_(ws), fp16_(fp16) {
+  explicit Converter(nvinfer1::INetworkDefinition* trt_network)
+      : trt_network_(trt_network) {
     this->register_op_converters();
   }
-  tensorflow::tensorrt::TRTWeightStore* weight_store() { return weight_store_; }
+
   TRT_ShapedWeights get_temp_weights(tensorflow::DataType type,
                                      nvinfer1::Dims shape) {
     TRT_ShapedWeights weights(type, nullptr, shape);
     // TODO(jie): check weights size_bytes. 0 means type error
-    weight_store_->store_.push_back(std::vector<uint8_t>(weights.size_bytes()));
-    weights.SetValues(weight_store_->store_.back().data());
+    temp_bufs_.push_back(std::vector<uint8_t>(weights.size_bytes()));
+    weights.SetValues(temp_bufs_.back().data());
     return weights;
   }
-  bool isFP16() { return fp16_; };
+
   TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) {
     return this->get_temp_weights(weights.type_, weights.shape_);
   }
@@ -496,7 +382,7 @@ class Converter {
       TRT_TensorOrWeights output = outputs.at(i);
       // TODO(jie): tf protobuf seems to be omitting the :0 suffix
       string output_name = node_def.name();
-      if (i != 0) output_name = StrCat(output_name, ":", i);
+      if (i != 0) output_name = output_name + ":" + std::to_string(i);
       if (output.is_tensor()) {
         output.tensor()->setName(output_name.c_str());
       }
@@ -562,7 +448,7 @@ struct LambdaFactory {
     switch (op) {
       case OP_CATEGORY::RSQRT: {
         VLOG(2) << "RSQRT GETS DONE";
-        return [](T t) -> T { return 1.0 / sqrt(t); };
+        return [](T t) -> T { return 1.0 / std::sqrt(t); };
       }
       case OP_CATEGORY::NEG:
         return [](T t) -> T { return -t; };
@@ -648,22 +534,6 @@ struct LambdaFactory {
   }
 };
 
-template <>
-std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
-  switch (op) {
-    case OP_CATEGORY::RSQRT: {
-      VLOG(2) << "RSQRT GETS DONE";
-      return [](Eigen::half t) -> Eigen::half {
-        return Eigen::half(1.0 / sqrt(float(t)));
-      };
-    }
-    case OP_CATEGORY::NEG:
-      return [](Eigen::half t) -> Eigen::half { return -t; };
-    default:
-      VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
-      return nullptr;
-  }
-}
 tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
                                 TRT_ShapedWeights* oweights,
                                 LambdaFactory unary_op) {
@@ -675,14 +545,6 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
       std::transform(inp, inp + iweights.count(), oup, unary_op.unary<float>());
       break;
     }
-    case tensorflow::DataType::DT_HALF: {
-      auto inp = static_cast<Eigen::half const*>(iweights.GetValues());
-      auto oup =
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues()));
-      std::transform(inp, inp + iweights.count(), oup,
-                     unary_op.unary<Eigen::half>());
-      break;
-    }
     default:
       return tensorflow::errors::Unimplemented(
           "Data type not supported: " +
@@ -726,32 +588,6 @@ tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
       }
       break;
     }
-    case tensorflow::DataType::DT_HALF: {
-      auto inp_l = static_cast<const Eigen::half*>(iweights_l.GetValues());
-      auto inp_r = static_cast<const Eigen::half*>(iweights_r.GetValues());
-      auto oup =
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues()));
-
-      if (iweights_l.count() != iweights_r.count()) {
-        // We only supports broadcast of RankZero
-        if (iweights_l.count() == 1) {
-          VLOG(2) << "I bet it is not working!" << (*inp_l);
-          std::transform(inp_r, inp_r + iweights_r.count(), oup,
-                         binary_op.broadcast_l<Eigen::half>(*inp_l));
-        } else if (iweights_r.count() == 1) {
-          VLOG(2) << "I bet it is not working!" << (*inp_r);
-          std::transform(inp_l, inp_l + iweights_l.count(), oup,
-                         binary_op.broadcast_r<Eigen::half>(*inp_r));
-        } else {
-          return tensorflow::errors::Unimplemented(
-              "Binary op with non-rankZero broadcast not supported");
-        }
-      } else {
-        std::transform(inp_l, inp_l + iweights_l.count(), inp_r, oup,
-                       binary_op.binary<Eigen::half>());
-      }
-      break;
-    }
     default:
       return tensorflow::errors::Unimplemented(
           "Data type not supported: " +
@@ -763,7 +599,7 @@ tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
 
 tensorflow::Status ConstantFoldUnary(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   TRT_ShapedWeights weights_input = inputs.at(0).weights();
 
@@ -777,12 +613,13 @@ tensorflow::Status ConstantFoldUnary(
   CHECK_EQ(weights_input.type_,
            TFAttrs(node_def).get<tensorflow::DataType>("T"));
 
+  // Maybe I should do a switch
   LambdaFactory unary_op;
   if (node_def.op() == "Rsqrt") {
     // Compute rsqrt
     unary_op.op = LambdaFactory::OP_CATEGORY::RSQRT;
     auto ret = UnaryCompute(weights_input, &weights_output, unary_op);
-    // Pass the output
+    // PAss the output
     if (ret == tensorflow::Status::OK()) {
       outputs->push_back(TRT_TensorOrWeights(weights_output));
     }
@@ -794,11 +631,11 @@ tensorflow::Status ConstantFoldUnary(
 }
 
 // TODO(jie,ben) broadcast is needed yet not implemented
-// Let's get the simple stuff working first. Maybe we should fall back to TF
+// Let's get the simple stuff working first. Maybe we should fall bakc to TF
 //   approach for constant folding
 tensorflow::Status ConstantFoldBinary(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   TRT_ShapedWeights weights_input_l = inputs.at(0).weights();
   TRT_ShapedWeights weights_input_r = inputs.at(1).weights();
@@ -811,12 +648,12 @@ tensorflow::Status ConstantFoldBinary(
         "Binary op implicit broadcast not supported: " + node_def.op());
 
   // TODO(jie): constant fold should really fall back to TF.
-  int num_dims = weights_input_l.shape_.nbDims;
+  int nb_dims = weights_input_l.shape_.nbDims;
   nvinfer1::Dims output_shape;
-  output_shape.nbDims = num_dims;
-  VLOG(2) << "nb_dims: " << num_dims
+  output_shape.nbDims = nb_dims;
+  VLOG(2) << "nb_dims: " << nb_dims
           << ", the other: " << weights_input_r.shape_.nbDims;
-  for (int i = 0; i < num_dims; i++) {
+  for (int i = 0; i < nb_dims; i++) {
     if (weights_input_l.shape_.d[i] == weights_input_r.shape_.d[i]) {
       output_shape.d[i] = weights_input_l.shape_.d[i];
     } else if (weights_input_l.shape_.d[i] == 1 ||
@@ -841,6 +678,7 @@ tensorflow::Status ConstantFoldBinary(
   // Allocate output weights
   TRT_ShapedWeights weights_output = ctx.get_temp_weights(dtype, output_shape);
 
+  // Maybe I should do a switch
   LambdaFactory binary_op;
   if (node_def.op() == "Sub") {
     binary_op.op = LambdaFactory::OP_CATEGORY::SUB;
@@ -874,90 +712,48 @@ tensorflow::Status BinaryTensorOpWeight(
   // Maybe this part has to be moved into the block of rsqrt later
 
   // Check type consistency
+  auto dtype = TFAttrs(node_def).get<nvinfer1::DataType>("T");
+  CHECK_EQ_TYPE(tensor->getType(), dtype);  // Cast to int for error messages
   nvinfer1::DataType ttype;
   TF_CHECK_OK(ConvertDType(weights.type_, &ttype));
+  CHECK_EQ_TYPE(ttype, dtype);  // Cast to int for error message
 
   // Check scale mode
   auto dims_w = weights.shape_;
   auto dims_t = tensor->getDimensions();
 
-  // default to element-wise
+  // Default to channel-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
-  // TODO(jie): maybe use a permuatation instead to support more cases;
-  bool permutation_flag = false;
-
   if (weights.count() == 1) {
     VLOG(2) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
-    // no broadcasting on Batch dimension;
-    VLOG(2) << "WEIGHTS DIM: " << dims_w.nbDims
-            << " tensor DIM: " << dims_t.nbDims;
-    if (dims_w.nbDims == dims_t.nbDims + 1) {
-      if (dims_w.d[0] == 1) {
-        for (int i = 1; i < dims_w.nbDims; i++) {
-          dims_w.d[i - 1] = dims_w.d[i];
-        }
-        dims_w.nbDims--;
-      } else {
-        return tensorflow::errors::InvalidArgument(
-            "Binary op cannot operate on batch, " + node_def.name());
-      }
-    }
+    // No broadcasting on Batch dimension;
+    assert(dims_w.d[0] == 1);
 
-    if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) {
-      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      // default is element;
-      for (int i = 1; i < dims_w.nbDims; i++) {
-        if (dims_w.d[i] != dims_t.d[i]) {
-          // if dimension does not match, switch back to channel;
-          VLOG(2) << "channel";
-          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-          break;
-        }
-      }
-      // if channel as candidate, validate it
-      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
-        for (int i = 1; i < dims_w.nbDims; i++) {
-          if (dims_w.d[i] != 1)
-            return tensorflow::errors::InvalidArgument(
-                "Weight shape not compatible at, " + node_def.name());
-        }
-      } else {
-        VLOG(2) << "elementwise";
+    // Broadcasting on Channel dimension only allowed in kUNIFORM
+    assert(dims_w.d[1] == dims_t.d[0]);
+    assert(dims_w.nbDims == dims_t.nbDims);
+
+    // Default is element;
+    for (int i = 2; i < dims_w.nbDims; i++) {
+      if (dims_w.d[i] != dims_t.d[i - 1]) {
+        scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+        break;
       }
-    } else if (dims_w.nbDims == 1 &&
-               dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) {
-      // channel wise and broadcast required;
-      permutation_flag = true;
-      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-    } else {
-      return tensorflow::errors::InvalidArgument(
-          "Weight shape not compatible at, " + node_def.name());
     }
-  }
-
-  // transpose last dimension
-  std::vector<int> permutation(dims_t.nbDims + 1);
-  if (permutation_flag) {
-    if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) {
-      // we swap the last dimension into channel for trt.
-      // because of tensorflow default broadcasting rules.
-      for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
-        permutation[i] = i;
+    if (scale_mode == nvinfer1::ScaleMode::kELEMENTWISE) {
+      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+      for (int i = 2; i < dims_w.nbDims; i++) {
+        if (dims_w.d[i] != 1)
+          return tensorflow::errors::InvalidArgument(
+              "Weight shape not compatible at, " + node_def.name());
       }
-      permutation[1] = dims_t.nbDims;
-      permutation[dims_t.nbDims] = 1;
-      tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
-                                   permutation);
-    } else {
-      return tensorflow::errors::InvalidArgument(
-          "Transpose cannot be applied, " + node_def.name());
     }
   }
 
-  // prepare weights
+  // Prepare weights
   TRT_ShapedWeights shift_weights(weights.type_);
   TRT_ShapedWeights scale_weights(weights.type_);
   TRT_ShapedWeights power_weights(weights.type_);
@@ -983,26 +779,88 @@ tensorflow::Status BinaryTensorOpWeight(
       scale_weights, power_weights);
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  // transpose back dimension
-  if (permutation_flag) {
-    output_tensor = ctx.TransposeTensor(output_tensor, permutation);
-  }
 
   // Pass the output
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
-enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
+tensorflow::Status BinaryTensorOpTensor(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
+      {"Add", nvinfer1::ElementWiseOperation::kSUM},
+      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
+      // {"max", nvinfer1::ElementWiseOperation::kMAX},
+      // {"min", nvinfer1::ElementWiseOperation::kMIN},
+      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
+      {"Div", nvinfer1::ElementWiseOperation::kDIV},
+  };
+
+  // FIXME assume type matches input weights
+  // Get trt type & shape
+  TFAttrs attrs(node_def);
+  // Maybe this part has to be moved into the block of rsqrt later
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+
+  // Check type consistency
+  CHECK_EQ_TYPE(tensor_l->getType(), dtype);
+  CHECK_EQ_TYPE(tensor_r->getType(), dtype);
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end())
+    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+                                             " not supported at: " +
+                                             node_def.name());
 
-tensorflow::Status ConvertConv2DHelper(
+  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
+      *const_cast<nvinfer1::ITensor*>(tensor_l),
+      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  // Pass the output
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertPlaceholder(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs,
-    int group  // group ==0 specifies depthwise conv
-) {
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  VLOG(2) << "Placeholder should have been replace already";
+  return tensorflow::errors::Unimplemented(", cannot convert Placeholder op");
+  // OK this make sense since we are supposed to replace it with input
+  TFAttrs attrs(node_def);
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
+  nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape");
+
+  dims.nbDims--;
+  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1];
+
+  nvinfer1::ITensor* output =
+      ctx.network()->addInput(node_def.name().c_str(), dtype, dims);
+  if (!output) {
+    return tensorflow::errors::InvalidArgument("Failed to create Input layer");
+  }
+  outputs->push_back(TRT_TensorOrWeights(output));
+  return tensorflow::Status::OK();
+}
 
+tensorflow::Status ConvertConv2D(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  // TODO(jie): handle NHWC/NCHW transpose;
+  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
+  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
+  ReorderRSCKToKCRS(weights_rsck, &weights);
+  TRT_ShapedWeights biases(weights.type_);
+  int noutput = weights.shape_.d[0];
+  nvinfer1::DimsHW kernel_size;
+  kernel_size.h() = weights.shape_.d[2];
+  kernel_size.w() = weights.shape_.d[3];
   TFAttrs attrs(node_def);
 
   int h_index = 2;
@@ -1016,31 +874,11 @@ tensorflow::Status ConvertConv2DHelper(
     // TODO(jie): transpose it
   }
 
-  // tensor after transpose (NCHW)
-  auto tensor_dim = tensor->getDimensions();
-
-  int num_groups = group;
-  if (num_groups == 0)  // depthwise convolution
-    num_groups = tensor_dim.d[0];
-  VLOG(2) << "groups count: " << num_groups;
-
-  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
-  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
-  ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
-  TRT_ShapedWeights biases(weights.type_);
-  int noutput = weights.shape_.d[0] * num_groups;
-  nvinfer1::DimsHW kernel_size;
-  kernel_size.h() = weights.shape_.d[2];
-  kernel_size.w() = weights.shape_.d[3];
-  VLOG(2) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w();
-
   // TODO(jie): stride. (NHWC/NCHW)
   auto tf_stride = attrs.get<std::vector<int>>("strides");
-  VLOG(2) << "h_INDEX" << h_index << ", w_index " << w_index;
-  VLOG(2) << "stride!!!: " << tf_stride[0] << tf_stride[1] << tf_stride[2]
-          << tf_stride[3];
   nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
+  auto tensor_dim = tensor->getDimensions();
   std::vector<std::pair<int, int>> padding;
   // TODO(jie): padding.
   if (attrs.get<string>("padding") == "SAME") {
@@ -1081,11 +919,10 @@ tensorflow::Status ConvertConv2DHelper(
   layer->setStride(stride);
   layer->setPadding({padding[0].first, padding[1].first});
   layer->setName(node_def.name().c_str());
-  layer->setNbGroups(num_groups);
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   auto dim_after = output_tensor->getDimensions();
-  VLOG(2) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1] << ", "
+  VLOG(2) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
           << dim_after.d[2] << ", " << dim_after.d[3];
 
   if (data_format == "NHWC") {
@@ -1098,101 +935,11 @@ tensorflow::Status ConvertConv2DHelper(
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertConv2DHelper(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs, ConvolutionType type) {
-  switch (type) {
-    case ConvolutionType::DEFAULT:
-      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 1);
-    case ConvolutionType::DEPTHWISE_CONV:
-      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 0);
-  }
-  return tensorflow::errors::Unimplemented("unsupported convolution type at, " +
-                                           node_def.name());
-}
-
-tensorflow::Status BinaryTensorOpTensor(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
-      {"Add", nvinfer1::ElementWiseOperation::kSUM},
-      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
-      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
-      {"Div", nvinfer1::ElementWiseOperation::kDIV},
-  };
-
-  // FIXME assume type matches input weights
-  // get trt type & shape
-  TFAttrs attrs(node_def);
-  // maybe this part has to be moved into the block of rsqrt later
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
-
-  // check type consistency
-  CHECK_EQ_TYPE(tensor_l->getType(), dtype);
-  CHECK_EQ_TYPE(tensor_r->getType(), dtype);
-  auto op_pair = ops.find(node_def.op());
-  if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
-                                             " not supported at: " +
-                                             node_def.name());
-
-  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
-      *const_cast<nvinfer1::ITensor*>(tensor_l),
-      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
-
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-
-  // pass the output
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertPlaceholder(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  VLOG(2) << "Placeholder should have been replace already";
-  return tensorflow::errors::Unimplemented("cannot convert Placeholder op");
-  // OK this make sense since we are supposed to replace it with input
-  TFAttrs attrs(node_def);
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
-  nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape");
-
-  dims.nbDims--;
-  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1];
-
-  nvinfer1::ITensor* output =
-      ctx.network()->addInput(node_def.name().c_str(), dtype, dims);
-  if (!output) {
-    return tensorflow::errors::InvalidArgument("Failed to create Input layer");
-  }
-  outputs->push_back(TRT_TensorOrWeights(output));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertConv2D(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
-                             ConvolutionType::DEFAULT);
-}
-
-tensorflow::Status ConvertConv2DDepthwise(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
-                             ConvolutionType::DEPTHWISE_CONV);
-}
-
 tensorflow::Status ConvertPool(Converter& ctx,
                                const tensorflow::NodeDef& node_def,
-                               const std::vector<TRT_TensorOrWeights>& inputs,
+                               std::vector<TRT_TensorOrWeights> const& inputs,
                                std::vector<TRT_TensorOrWeights>* outputs) {
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   TFAttrs attrs(node_def);
 
   int h_index = 2;
@@ -1210,8 +957,6 @@ tensorflow::Status ConvertPool(Converter& ctx,
   // TODO(jie): support other pooling type
   if (node_def.op() == "MaxPool")
     type = nvinfer1::PoolingType::kMAX;
-  else if (node_def.op() == "AvgPool")
-    type = nvinfer1::PoolingType::kAVERAGE;
   else
     return tensorflow::errors::Unimplemented("Only supports Max pool");
 
@@ -1274,9 +1019,9 @@ tensorflow::Status ConvertPool(Converter& ctx,
 
 tensorflow::Status ConvertActivation(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   nvinfer1::IActivationLayer* layer = ctx.network()->addActivation(
       *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ActivationType::kRELU);
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
@@ -1286,14 +1031,14 @@ tensorflow::Status ConvertActivation(
 
 tensorflow::Status ConvertScale(Converter& ctx,
                                 const tensorflow::NodeDef& node_def,
-                                const std::vector<TRT_TensorOrWeights>& inputs,
+                                std::vector<TRT_TensorOrWeights> const& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
     return tensorflow::errors::Unimplemented(
         "Only supports tensor op weight for now, at " + node_def.name());
   // Implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
 
   // TODO(jie): handle NHWC/NCHW transpose;
   TRT_ShapedWeights weights = inputs.at(1).weights();
@@ -1310,26 +1055,9 @@ tensorflow::Status ConvertScale(Converter& ctx,
   } else {
     VLOG(2) << "NCHW !!!!";
   }
-
-  auto dims = tensor->getDimensions();
-  VLOG(2) << "tensor dimensions: " << dims.nbDims;
-  for (int i = 0; i < dims.nbDims; i++) {
-    VLOG(2) << "i: " << dims.d[i];
-  }
-  dims = weights.shape_;
-  VLOG(2) << "tensor dimensions: " << dims.nbDims;
-  for (int i = 0; i < dims.nbDims; i++) {
-    VLOG(2) << "i: " << dims.d[i];
-  }
-
-  nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
-  if (weights.shape_.d[0] == 1) {
-    mode = nvinfer1::ScaleMode::kUNIFORM;
-  }
-
-  nvinfer1::IScaleLayer* layer =
-      ctx.network()->addScale(*const_cast<nvinfer1::ITensor*>(tensor), mode,
-                              weights, empty_weights, empty_weights);
+  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
+      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
+      weights, empty_weights, empty_weights);
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   if (data_format == "NHWC") {
@@ -1344,7 +1072,7 @@ tensorflow::Status ConvertScale(Converter& ctx,
 
 tensorflow::Status ConvertConst(Converter& ctx,
                                 const tensorflow::NodeDef& node_def,
-                                const std::vector<TRT_TensorOrWeights>& inputs,
+                                std::vector<TRT_TensorOrWeights> const& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   const auto& weights_tensor = node_def.attr().at("value").tensor();
 
@@ -1363,144 +1091,20 @@ tensorflow::Status ConvertConst(Converter& ctx,
     VLOG(2) << "SCALAR!!!" << node_def.name();
     nvinfer1::Dims scalar_shape;
     if (tensor.dims() > 0) {
-      VLOG(2) << "dimensions: " << tensor.dims();
-      VLOG(2) << "size: " << weights_tensor.float_val_size();
-      scalar_shape = GetTensorShape(tensor);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        VLOG(2) << scalar_shape.d[i];
-      if (GetShapeSize(scalar_shape) != weights_tensor.float_val_size()) {
-        if (weights_tensor.float_val_size() == 1 ||
-            scalar_shape.d[0] == weights_tensor.float_val_size()) {
-          scalar_shape.nbDims = 1;
-          // no dimension provided. flatten it
-          scalar_shape.d[0] = weights_tensor.float_val_size();
-          scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
-        } else {
-          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
-                     << " kUNIFORM, at: " << node_def.name();
-        }
-      }
-    } else {
       VLOG(2) << "Dimensions: " << tensor.dims();
-      scalar_shape.nbDims = 1;
-      // no dimension provided. flatten it
-      scalar_shape.d[0] = weights_tensor.float_val_size();
-      scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
-      for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
-        scalar_shape.d[i] = 0;
-        scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
-      }
-    }
-    if (ctx.isFP16()) {
-      auto dtype_new = tensorflow::DataType::DT_HALF;
-      size_t len_data = tensorflow::DataTypeSize(dtype_new);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      tensorflow::Tensor temp_tensor(tensorflow::DT_HALF, tensor.shape());
-      auto half_tensor = temp_tensor.flat<Eigen::half>();
-      Eigen::DefaultDevice defd;
-      half_tensor.device(defd) =
-          tensor.flat<float>().template cast<Eigen::half>();
-      memcpy(dst, half_tensor.data(), len_data);  // store into weight store
-      weights = TRT_ShapedWeights(dtype_new, dst, scalar_shape);
+      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
+                                  GetTensorShape(tensor));
     } else {
-      size_t len_data = tensorflow::DataTypeSize(dtype);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      std::vector<float> tensor_data(
-          weights_tensor.float_val().begin(),
-          weights_tensor.float_val()
-              .end());  //  make a local copy first to flatten
-      memcpy(dst, tensor_data.data(), len_data);  // store into weight store
-      weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
-    }
-  } else if (!weights_tensor.int_val().empty()) {
-    VLOG(2) << "int!!!" << node_def.name();
-    nvinfer1::Dims scalar_shape;
-    if (tensor.dims() > 0) {
-      VLOG(2) << "dimensions: " << tensor.dims();
-      scalar_shape = GetTensorShape(tensor);
-      if (GetShapeSize(scalar_shape) != weights_tensor.int_val_size()) {
-        if (weights_tensor.int_val_size() == 1 ||
-            scalar_shape.d[0] == weights_tensor.int_val_size()) {
-          scalar_shape.nbDims = 1;
-          // no dimension provided. flatten it
-          scalar_shape.d[0] = weights_tensor.int_val_size();
-          scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
-        } else {
-          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
-                     << " kUNIFORM, at: " << node_def.name();
-        }
-      }
-    } else {
-      VLOG(2) << "dimensions: " << tensor.dims();
+      VLOG(2) << "Dimensions: " << tensor.dims();
       scalar_shape.nbDims = 1;
-      // no dimension provided. flatten it
-      scalar_shape.d[0] = weights_tensor.int_val_size();
+      scalar_shape.d[0] = 1;
       scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
       for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
         scalar_shape.d[i] = 0;
         scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
       }
-    }
-    if (ctx.isFP16()) {
-      auto dtype_new = tensorflow::DataType::DT_HALF;
-      size_t len_data = tensorflow::DataTypeSize(dtype_new);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      tensorflow::Tensor temp_tensor(tensorflow::DT_HALF, tensor.shape());
-      TTypes<Eigen::half>::Flat half_tensor = temp_tensor.flat<Eigen::half>();
-      Eigen::DefaultDevice defd;
-      switch (dtype) {
-        case (tensorflow::DT_INT32): {
-          half_tensor.device(defd) =
-              tensor.flat<int32>().template cast<Eigen::half>();
-          break;
-        }
-        case (tensorflow::DT_INT16): {
-          half_tensor.device(defd) =
-              tensor.flat<int16>().template cast<Eigen::half>();
-          break;
-        }
-        case (tensorflow::DT_INT8): {
-          half_tensor.device(defd) =
-              tensor.flat<int8>().template cast<Eigen::half>();
-          break;
-        }
-        case (tensorflow::DT_UINT8): {
-          half_tensor.device(defd) =
-              tensor.flat<uint8>().template cast<Eigen::half>();
-          break;
-        }
-        default:
-          return tensorflow::errors::InvalidArgument(
-              "Datatype " + tensorflow::DataTypeString(dtype) +
-              " for FP16 conversion");
-          break;
-      };
-      memcpy(dst, half_tensor.data(), len_data);  // store into weight store
-      weights = TRT_ShapedWeights(dtype_new, dst, scalar_shape);
-    } else {
-      size_t len_data = tensorflow::DataTypeSize(dtype);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      size_t len_tensor = weights_tensor.int_val_size() * sizeof(int32);
-      len_data = std::max(len_data, len_tensor);
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      std::vector<int32> tensor_data(
-          weights_tensor.int_val().begin(),
-          weights_tensor.int_val()
-              .end());  //  make a local copy first to flatten
-                        //  doesn't have to be contigous
-      memcpy(dst, tensor_data.data(), len_tensor);  // store into weight store
-      weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
+      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
+                                  scalar_shape);
     }
   } else if (!weights_tensor.tensor_content().empty()) {
     VLOG(2) << "TENSOR!!!" << node_def.name();
@@ -1526,7 +1130,7 @@ tensorflow::Status ConvertConst(Converter& ctx,
 
 tensorflow::Status ConvertIdentity(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   outputs->push_back(inputs.at(0));
   return tensorflow::Status::OK();
@@ -1534,7 +1138,7 @@ tensorflow::Status ConvertIdentity(
 
 tensorflow::Status ConvertBinary(Converter& ctx,
                                  const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights> const& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2)
     return tensorflow::errors::FailedPrecondition(
@@ -1561,7 +1165,7 @@ tensorflow::Status ConvertBinary(Converter& ctx,
 
 tensorflow::Status ConvertUnary(Converter& ctx,
                                 const tensorflow::NodeDef& node_def,
-                                const std::vector<TRT_TensorOrWeights>& inputs,
+                                std::vector<TRT_TensorOrWeights> const& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 1)
     return tensorflow::errors::FailedPrecondition(
@@ -1579,7 +1183,7 @@ tensorflow::Status ConvertUnary(Converter& ctx,
 
 tensorflow::Status ConvertReduce(Converter& ctx,
                                  const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights> const& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
@@ -1587,7 +1191,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
         "Input expects tensor and weights, at" + node_def.name());
 
   // Implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   auto dims = tensor->getDimensions();
   // Restore implicit batch dimension
   int nb_dims = dims.nbDims + 1;
@@ -1625,7 +1229,6 @@ tensorflow::Status ConvertReduce(Converter& ctx,
       return tensorflow::errors::InvalidArgument("TRT cannot reduce at 0, at" +
                                                  node_def.name());
     if (index_list_data[i] == 1) permuted_index = 1;
-
     idx_set.emplace(index_list_data[i]);
   }
 
@@ -1633,7 +1236,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
   nvinfer1::DimsHW pool_kernel;
   if (permuted_index == 1) {
     for (int i = 2; i < nb_dims; i++) {
-      if (idx_set.count(i) == 0) {
+      if (idx_set.count(i)) {
         permuted_index = i;
         break;
       }
@@ -1668,13 +1271,12 @@ tensorflow::Status ConvertReduce(Converter& ctx,
     output_tensor = ctx.TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), permutation_order);
   }
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
 tensorflow::Status ConvertPad(Converter& ctx,
                               const tensorflow::NodeDef& node_def,
-                              const std::vector<TRT_TensorOrWeights>& inputs,
+                              std::vector<TRT_TensorOrWeights> const& inputs,
                               std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
@@ -1682,7 +1284,7 @@ tensorflow::Status ConvertPad(Converter& ctx,
         "Input expects tensor and weights, at" + node_def.name());
 
   // Implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   auto dims = tensor->getDimensions();
   // Restore implicit batch dimension
   int nb_dims = dims.nbDims + 1;
@@ -1769,287 +1371,19 @@ tensorflow::Status ConvertPad(Converter& ctx,
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertConcat(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  // not including the last input (axis) here
-  int input_size = static_cast<int>(inputs.size()) - 1;
-
-  if (!inputs.at(0).is_tensor())
-    return tensorflow::errors::InvalidArgument(
-        "Concat in TRT support only Tensor input, at " + node_def.name());
-
-  // We are retrieving the axis
-  TRT_ShapedWeights axis = inputs.at(input_size).weights();
-
-  TFAttrs attrs(node_def);
-  // auto attr_size = attrs.at("N")->i();
-  // auto data_type = attrs.get<nvinfer1::DataType>("T");
-  auto index_type = attrs.get<tensorflow::DataType>("Tidx");
-
-  // TODO(jie): handle data type
-  // Only expect to handle INT32 as index attributes for now
-  if (index_type != tensorflow::DataType::DT_INT32)
-    return tensorflow::errors::Unimplemented(
-        "Tidx supports only DT_INT32, at " + node_def.name());
-
-  int index = *(static_cast<int*>(const_cast<void*>(axis.GetValues())));
-
-  // TODO(jie): early termination with no-op (attr_size==1)
-
-  auto dim = inputs.at(0).tensor()->getDimensions();
-  // dimension check
-  if (index > dim.nbDims + 1)
-    return tensorflow::errors::InvalidArgument(
-        "Concatenate on axis out of dimension range, at " + node_def.name());
-
-  if (index == 0)
-    return tensorflow::errors::InvalidArgument(
-        "Concatenate on batch dimension not supported, at " + node_def.name());
-
-  // incase we need permutation;
-  std::vector<int> permutation_order(dim.nbDims + 1);
-
-  for (int i = 0; i < dim.nbDims + 1; i++) permutation_order[i] = i;
-
-  if (index != 1) {
-    permutation_order[1] = index - 1;
-    permutation_order[index - 1] = 1;
-  }
-
-  std::vector<nvinfer1::ITensor const*> inputs_vec;
-  // Shap chack (all input tensor should have same shape)
-  // starting from 0 since we are probably also doing transpose here;
-  for (int i = 0; i < input_size; i++) {
-    auto tensor_i = inputs.at(i).tensor();
-    auto dim_i = tensor_i->getDimensions();
-    if (dim_i.nbDims != dim.nbDims)
-      return tensorflow::errors::InvalidArgument(
-          "Concatenate receives inputs with inconsistent dimensions, at " +
-          node_def.name());
-
-    for (int j = 0; j < dim.nbDims; j++) {
-      // check dimension consistency on non-concatenate axis
-      if (j != index - 1 && dim_i.d[j] != dim.d[j])
-        return tensorflow::errors::InvalidArgument(
-            "Concatenate receives inputs with inconsistent shape, at" +
-            node_def.name());
-    }
-
-    // TRT does concatenation only on channel!
-    if (index != 1)
-      tensor_i = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor_i),
-                                     permutation_order);
-
-    inputs_vec.push_back(tensor_i);
-  }
-
-  // nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
-  nvinfer1::IConcatenationLayer* layer = ctx.network()->addConcatenation(
-      const_cast<nvinfer1::ITensor* const*>(inputs_vec.data()),
-      inputs_vec.size());
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-
-  if (index != 1) {
-    output_tensor = ctx.TransposeTensor(output_tensor, permutation_order);
-  }
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertFusedBatchNorm(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  TFAttrs attrs(node_def);
-  float epsilon = attrs.get<float>("epsilon");
-  auto data_format = attrs.get<string>("data_format");
-  if (data_format != "NCHW") {
-    return tensorflow::errors::Unimplemented(
-        "only data_format=NCHW is supported, at " + node_def.name());
-  }
-  bool is_training = attrs.get<bool>("is_training");
-  if (is_training) {
-    return tensorflow::errors::Unimplemented(
-        "only is_training=false is supported, at " + node_def.name());
-  }
-  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
-  TRT_ShapedWeights scale_weights = inputs.at(1).weights();
-  TRT_ShapedWeights offset_weights = inputs.at(2).weights();
-  TRT_ShapedWeights mean_weights = inputs.at(3).weights();
-  TRT_ShapedWeights variance_weights = inputs.at(4).weights();
-  TRT_ShapedWeights dummy_power_weights(scale_weights.type_);
-  TRT_ShapedWeights combined_scale_weights =
-      ctx.get_temp_weights_like(scale_weights);
-  TRT_ShapedWeights combined_offset_weights =
-      ctx.get_temp_weights_like(offset_weights);
-  size_t nweight = scale_weights.count();
-  if ((scale_weights.type_ == offset_weights.type_) &&
-      (mean_weights.type_ == variance_weights.type_) &&
-      (scale_weights.type_ == variance_weights.type_)) {
-    if ((scale_weights.type_ != tensorflow::DataType::DT_FLOAT) &&
-        (scale_weights.type_ != tensorflow::DataType::DT_HALF)) {
-      return tensorflow::errors::Unimplemented(
-          "only float32 or float16 weight data type is supported, for node " +
-          node_def.name() + " got " +
-          tensorflow::DataTypeString(scale_weights.type_));
-    }
-    if (scale_weights.type_ == tensorflow::DT_FLOAT) {
-      for (size_t i = 0; i < nweight; ++i) {
-        float scale = (static_cast<float const*>(scale_weights.GetValues()))[i];
-        float offset =
-            (static_cast<float const*>(offset_weights.GetValues()))[i];
-        float mean = (static_cast<float const*>(mean_weights.GetValues()))[i];
-        float variance =
-            (static_cast<float const*>(variance_weights.GetValues()))[i];
-        float& combined_scale_ref = const_cast<float*>(
-            static_cast<float const*>(combined_scale_weights.GetValues()))[i];
-        float& combined_offset_ref = const_cast<float*>(
-            static_cast<float const*>(combined_offset_weights.GetValues()))[i];
-        combined_scale_ref = scale / sqrtf(variance + epsilon);
-        combined_offset_ref = offset - mean * combined_scale_ref;
-      }
-    } else {
-      const Eigen::half* scale_vals =
-          (static_cast<Eigen::half const*>(scale_weights.GetValues()));
-      const Eigen::half* off_vals =
-          (static_cast<Eigen::half const*>(offset_weights.GetValues()));
-      const Eigen::half* mean_vals =
-          (static_cast<Eigen::half const*>(mean_weights.GetValues()));
-      const Eigen::half* variance_vals =
-          (static_cast<Eigen::half const*>(variance_weights.GetValues()));
-      Eigen::half* comb_scale_vals = const_cast<Eigen::half*>(
-          static_cast<Eigen::half const*>(combined_scale_weights.GetValues()));
-      Eigen::half* comb_off_vals = const_cast<Eigen::half*>(
-          static_cast<Eigen::half const*>(combined_offset_weights.GetValues()));
-      for (size_t i = 0; i < nweight; ++i) {
-        float scale(scale_vals[i]);
-        float offset(off_vals[i]);
-        float mean(mean_vals[i]);
-        float variance(variance_vals[i]);
-        float combined_scale_ref = scale / sqrtf(variance + epsilon);
-        comb_scale_vals[i] = Eigen::half(combined_scale_ref);
-        float combined_offset_ref = offset - mean * combined_scale_ref;
-        comb_off_vals[i] = Eigen::half(combined_offset_ref);
-      }
-    }
-  }
-  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
-      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
-      combined_offset_weights.GetWeightsForTRT(),
-      combined_scale_weights.GetWeightsForTRT(),
-      dummy_power_weights.GetWeightsForTRT());
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertMatMul(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-
-  // TODO(jie): transpose!
-  TFAttrs attrs(node_def);
-
-  TRT_ShapedWeights weights_ck = inputs.at(1).weights();
-  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_ck);
-  ReorderCKtoKC(weights_ck, &weights);
-  TRT_ShapedWeights biases(weights.type_);
-
-  int noutput = weights.shape_.d[0];
-
-  nvinfer1::IFullyConnectedLayer* layer = ctx.network()->addFullyConnected(
-      *const_cast<nvinfer1::ITensor*>(tensor), noutput, weights, biases);
-
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertReshape(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights())
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at" + node_def.name());
-
-  // implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  auto dims = tensor->getDimensions();
-  // restore implicit batch dimension
-
-  TRT_ShapedWeights shape = inputs.at(1).weights();
-
-  TFAttrs attrs(node_def);
-
-  auto padding_type = attrs.get<tensorflow::DataType>("Tshape");
-
-  if (shape.shape_.nbDims != 1)
-    return tensorflow::errors::InvalidArgument(
-        "reshape new shape is not 1 dimensional, at " + node_def.name());
-
-  // Only expect to handle INT32 as attributes for now
-  if (padding_type != tensorflow::DataType::DT_INT32)
-    return tensorflow::errors::Unimplemented(
-        "reshape new shape supports only DT_INT32, at " + node_def.name());
-
-  auto shape_data = static_cast<int*>(const_cast<void*>(shape.GetValues()));
-
-  if (shape_data[0] != -1)
-    return tensorflow::errors::InvalidArgument(
-        "reshape new shape first dimension is not -1, at " + node_def.name());
-
-  auto shape_num_dims = shape.shape_.d[0];
-  VLOG(2) << "shape dimensions: " << shape_num_dims;
-  int volume_w = 1;
-  for (int i = 1; i < shape.shape_.d[0]; i++) volume_w *= shape_data[i];
-
-  int volume_t = 1;
-  for (int i = 0; i < dims.nbDims; i++) volume_t *= dims.d[i];
-
-  VLOG(2) << "volume: " << volume_t << " volume weights: " << volume_w;
-  if (volume_w != volume_t)
-    return tensorflow::errors::InvalidArgument(
-        "volume does not agree between tensor and new shape, at " +
-        node_def.name());
-
-  nvinfer1::IShuffleLayer* layer =
-      ctx.network()->addShuffle(*const_cast<nvinfer1::ITensor*>(tensor));
-
-  nvinfer1::Dims reshape_dims;
-  VLOG(2) << "new dimension: " << shape_num_dims - 1;
-  reshape_dims.nbDims = shape_num_dims - 1;
-  for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
-    reshape_dims.d[i] = shape_data[i + 1];
-  }
-  layer->setReshapeDimensions(reshape_dims);
-  VLOG(2) << "new dimension: " << shape_num_dims - 1;
-
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  auto dims_output = output_tensor->getDimensions();
-  VLOG(2) << "output tensor dimension:" << dims_output.nbDims;
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
 void Converter::register_op_converters() {
   // vgg_16 slim implementation
   op_registry_["Placeholder"] = ConvertPlaceholder;
   op_registry_["Conv2D"] = ConvertConv2D;
-  op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
   op_registry_["Relu"] = ConvertActivation;
   op_registry_["MaxPool"] = ConvertPool;
-  op_registry_["AvgPool"] = ConvertPool;
   // This could be really handled as ConvertBinary
   op_registry_["BiasAdd"] = ConvertScale;
   op_registry_["Const"] = ConvertConst;
+  // op_registry_["MatMul"] = ConvertFullyConnected;  // Not used in vgg
   // TODO(ben,jie): this is a temp hack.
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
+  // op_registry_["AvgPool"] = ConvertPool;
 
   // resnet_50_v1 slim implementation
   op_registry_["Add"] = ConvertBinary;
@@ -2059,364 +1393,26 @@ void Converter::register_op_converters() {
   op_registry_["Mean"] = ConvertReduce;
   op_registry_["Pad"] = ConvertPad;
   // TODO(ben,jie): Add more ops
-
-  op_registry_["ConcatV2"] = ConvertConcat;
-  op_registry_["MatMul"] = ConvertMatMul;
-  op_registry_["Reshape"] = ConvertReshape;
-  op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
-  op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
 }
 
 }  // namespace
-tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) {
-  return tensorflow::errors::Unimplemented("Not implemented yet");
-}
-tensorflow::Status ConvertCalibrationNodeToEngineNode(
-    tensorflow::Graph& graph, tensorflow::Node* c_node) {
-  const auto ndef = c_node->def();
-
-  TFAttrs attrs(ndef);
-  std::vector<string> segment_nodes(
-      attrs.get<std::vector<string>>("segment_nodes"));
-  std::vector<string> output_nodes(
-      attrs.get<std::vector<string>>("segment_output_names"));
-  std::vector<string> input_names(
-      attrs.get<std::vector<string>>("input_names"));
-  string res_name = attrs.get<string>("resource_name");
-  VLOG(1) << "Node name " << c_node->name() << " res_name " << res_name;
-  string engine_name = "my_trt_op";
-  {
-    const auto node_id = tensorflow::str_util::Split(res_name, "_");
-    engine_name += node_id.back();
-  }
-  std::map<string, tensorflow::Node*> node_maps;
-
-  for (auto n : graph.op_nodes()) {
-    node_maps.insert({n->name(), n});
-  }
-  VLOG(1) << "Output Nodes:";
-  std::vector<tensorflow::DataType> out_types;
-  std::vector<const tensorflow::Edge*> out_edges;
-  for (auto& i : output_nodes) {
-    auto node_port = tensorflow::str_util::Split(i, ":");
-    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
-    auto out_node_name = node_port.at(0);
-    if (node_port.size() > 1) {
-      VLOG(1) << "Multi port output" << node_port.at(0) << " "
-              << node_port.at(1) << " size=" << node_port.size();
-    }
-    auto node_it = node_maps.find(out_node_name);
-    if (node_it != node_maps.end()) {
-      tensorflow::Node* out_node = node_it->second;
-      int port = 0;
-      if (node_port.size() == 2) {
-        port = std::strtoul(node_port.at(1).c_str(), nullptr, 10);
-        out_types.push_back(out_node->output_type(port));
-      } else {
-        out_types.push_back(out_node->output_type(0));
-      }
-      for (auto out_edge : out_node->out_edges()) {
-        if (out_edge->src_output() == port) {
-          out_edges.push_back(out_edge);
-          break;
-        }
-      }
-    } else {
-      LOG(WARNING) << " couldn't find output node " << out_node_name;
-    }
-  }
-  VLOG(1) << "Input Nodes:";
-  for (auto& i : input_names) {
-    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
-  }
-  auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance();
-  auto resmgr = trt_rm->getManager("TRTCalibOps");
-  tensorflow::tensorrt::TRTCalibrationResource* calib_res = nullptr;
-  auto status = resmgr->Lookup(res_name, res_name, &calib_res);
-  if (!status.ok() || !calib_res->calibrator_) {
-    return tensorflow::errors::FailedPrecondition(
-        "You must run calibration"
-        " and inference conversion in the same proces");
-  }
-
-  calib_res->calibrator_->setDone();
-  calib_res->thr_->join();
-  delete calib_res->thr_;
-  if (!calib_res->engine_) {
-    LOG(FATAL) << "Calibration failed!, engine is nullptr. Did you run "
-                  "calibration graph?";
-  }
-  auto weight_rmgr = trt_rm->getManager("WeightStore");
-  TF_CHECK_OK(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
-      res_name, res_name));
-  auto engine_plan = calib_res->engine_->serialize();
-  calib_res->engine_->destroy();
-  calib_res->network_->destroy();
-  calib_res->builder_->destroy();
-  calib_res->thr_ = nullptr;
-  calib_res->engine_ = nullptr;
-  calib_res->builder_ = nullptr;
-  tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
-  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  for (const auto in_edge : c_node->in_edges()) {
-    auto src = in_edge->src();
-    int dest_port = in_edge->dst_input();
-    income_edges.emplace_back(src->name(), in_edge->src_output(),
-                              c_node->input_type(dest_port));
-  }
-  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
-      income_edges);
-  op_builder.Input(input_list);
-  tensorflow::NodeDef engine_node;
-  const char* engine_plan_data = static_cast<const char*>(engine_plan->data());
-  string engine_plan_string(engine_plan_data,
-                            engine_plan_data + engine_plan->size());
-  status = op_builder.Attr("serialized_engine", engine_plan_string)
-               .Attr("input_nodes", input_names)
-               .Attr("output_nodes", output_nodes)
-               .Attr("OutT", out_types)
-               .Finalize(&engine_node);
-  if (!status.ok()) {
-    LOG(ERROR) << "Engine Node creation failed";
-    return status;
-  }
-  auto trt_engine_node = graph.AddNode(engine_node, &status);
-  TF_CHECK_OK(status);
-  for (size_t i = 0; i < out_edges.size(); i++) {
-    VLOG(1) << "Connecting trt_engine_node output " << i << " with "
-            << out_edges.at(i)->dst()->name() << " port "
-            << out_edges.at(i)->dst_input();
-    TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i,
-                                        out_edges.at(i)->dst(),
-                                        out_edges.at(i)->dst_input()));
-  }
-  VLOG(1) << "Segment nodes:";
-  for (auto& i : segment_nodes) {
-    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
-    auto it = node_maps.find(i);
-    if (it != node_maps.end()) {
-      graph.RemoveNode(it->second);
-    }
-  }
-  graph.RemoveNode(c_node);
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
-  // Visit nodes in reverse topological order and construct the TRT network.
-
-  // Toposort
-  std::vector<tensorflow::Node*> order_vec;
-  tensorflow::GetPostOrder(s.graph, &order_vec);
-  // Select just the subgraph
-  std::list<tensorflow::Node*> order;
-  for (tensorflow::Node* node : order_vec) {
-    if (s.subgraph_node_ids.count(node->id())) {
-      order.push_front(node);  // we want topological order to contstruct the
-      // network layer by layer
-    }
-  }
-  // topological order is needed to build TRT network
-  static int static_id = 0;
-  string subgraph_name_scope;
-  if (!order.empty()) {
-    subgraph_name_scope = order.front()->name();
-  }
-  for (const tensorflow::Node* node : order) {
-    subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
-  }
-  // TODO(sami,ben,jie): proper naming!
-  string calib_op_name =
-      StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
-  string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
-  static_id++;
-  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
-  auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
-  auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
-  TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
-  op_res->logger_ = new tensorflow::tensorrt::Logger();
-  op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
-
-  if (!op_res->builder_) {
-    return tensorflow::errors::Internal(
-        "failed to create TensorRT builder object");
-  }
-
-  op_res->network_ = op_res->builder_->createNetwork();
-  if (!op_res->network_) {
-    return tensorflow::errors::Internal(
-        "failed to create TensorRT network object");
-  }
-
-  // Build the network
-  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
-  auto ws = new tensorflow::tensorrt::TRTWeightStore();
-  TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
-  Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
-  std::vector<string> input_names;
-  std::vector<tensorflow::DataType> input_dtypes;
-  for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input. Node id= " << input.first;
-    int node_id = input.first;
-    int output_idx = input.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
-    auto node_name = node->name();
-    input_names.push_back(node_name);  // insert original node name without port
-    // TODO(jie): alternative :)
-    if (!s.graph_properties.HasOutputProperties(node_name))
-      return tensorflow::errors::Internal("failed to find input node: " +
-                                          node_name);
-
-    auto op_info_vec = s.graph_properties.GetOutputProperties(node_name);
-    if (static_cast<int>(op_info_vec.size()) < output_idx)
-      return tensorflow::errors::Internal(
-          "accessing output index of: ", output_idx, ", at node: ", node_name,
-          "with output entry from shape_map: ", op_info_vec.size());
-
-    auto op_info = op_info_vec.at(output_idx);
-
-    tensorflow::DataType tf_dtype = op_info.dtype();
-    input_dtypes.push_back(tf_dtype);
-
-    nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
-    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
-
-    VLOG(2) << "accessing output index of: " << output_idx
-            << ", at node: " << node_name
-            << "with output entry from shape_map: " << op_info_vec.size();
-
-    // TODO(ben,jie): update TRT input format/dimension
-    nvinfer1::DimsCHW input_dim_psuedo_chw;
-    for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
-
-    for (int i = 1; i < op_info.shape().dim_size(); i++) {
-      VLOG(2) << "dimension: " << i
-              << " , size: " << op_info.shape().dim(i).size();
-      input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
-    }
-
-    // TODO(ben,jie): proper way to restore input tensor name?
-    auto input_tensor_name = node_name;
-    if (output_idx != 0) input_tensor_name = StrCat(node_name, ":", output_idx);
-
-    nvinfer1::ITensor* input_tensor = converter.network()->addInput(
-        input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
-
-    if (!input_tensor)
-      return tensorflow::errors::InvalidArgument(
-          "Failed to create Input layer");
-    VLOG(2) << "input tensor name :" << input_tensor_name;
-
-    if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
-      return tensorflow::errors::AlreadyExists(
-          "output tensor already exists for op: " + input_tensor_name);
-  }
-
-  VLOG(2) << "finished sorting";
-
-  for (const tensorflow::Node* node : order) {
-    const tensorflow::NodeDef& node_def = node->def();
-    VLOG(2) << "converting node: " << node_def.name() << " , " << node_def.op();
-    TF_RETURN_IF_ERROR(converter.convert_node(node_def));
-  }
-
-  VLOG(2) << "finished conversion";
-
-  // Gather output metadata
-  std::vector<string> output_names;
-  std::vector<tensorflow::DataType> output_dtypes;
-  int trt_engine_op_output_idx = 0;
-  for (const std::pair<int, int>& output : s.output_inds) {
-    int node_id = output.first;
-    int output_idx = output.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
-    string op_name = node->name();
-    string tensor_name = op_name;
-
-    s.output_edge_map->insert(
-        {trt_engine_op_output_idx == 0
-             ? engine_name
-             : StrCat(engine_name, ":", trt_engine_op_output_idx),
-         {output_idx, tensor_name}});
-    trt_engine_op_output_idx++;
-    if (output_idx != 0) {
-      tensor_name = StrCat(tensor_name, ":", output_idx);
-    }
-    VLOG(1) << "output tensor name: " << tensor_name;
-    output_names.push_back(tensor_name);
-    auto tensor_or_weights = converter.get_tensor(tensor_name);
-    if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument(
-          "Output node is weights not tensor");
-    }
-    nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
-    if (!tensor) {
-      return tensorflow::errors::NotFound("Output tensor not found: " +
-                                          tensor_name);
-    }
-    converter.network()->markOutput(*tensor);
-    tensorflow::DataType tf_dtype = node->output_type(output_idx);
-    output_dtypes.push_back(tf_dtype);
-    nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
-    TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
-    tensor->setType(trt_dtype);
-  }
-
-  VLOG(2) << "finished output";
-
-  // Build the engine
-  op_res->builder_->setMaxBatchSize(s.max_batch_size);
-  op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes);
-
-  // Build the TRT op
-  // TODO(sami,ben,jie): proper naming!
-  tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp");
-  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  for (size_t i = 0; i < input_names.size(); ++i) {
-    int output_idx = s.input_inds.at(i).second;
-    // we wired up the input here already, it is redundant to do it again in
-    //  ConvertSubGraphToTensorRT(convert_graph.cc)
-    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
-        input_names.at(i), output_idx, input_dtypes.at(i));
-    VLOG(1) << calib_op_name << " input " << i << " = " << input_names.at(i)
-            << ":" << output_idx
-            << " dType= " << tensorflow::DataTypeString(input_dtypes.at(i));
-    income_edges.push_back(incoming_edge);
-  }
-  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
-      income_edges);
-  op_builder.Input(input_list);
-  std::vector<string> segment_names;
-  segment_names.reserve(s.subgraph_node_ids.size());
-  for (int i : s.subgraph_node_ids) {
-    auto node = s.graph.FindNodeId(i);
-    segment_names.push_back(node->name());
-  }
-  LOG(INFO) << "finished op preparation";
-
-  auto status = op_builder.Attr("segment_nodes", segment_names)
-                    .Attr("input_names", input_names)
-                    .Attr("segment_output_names", output_names)
-                    .Attr("resource_name", calib_op_name)
-                    .Finalize(s.trt_node);
-
-  LOG(INFO) << status.ToString();
-  LOG(INFO) << "finished op building";
-
-  return tensorflow::Status::OK();
-}
 
 tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
-    tensorrt::convert::SubGraphParams& s) {
+    const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
+    const std::vector<std::pair<int, int>>& input_inds,
+    const std::vector<std::pair<int, int>>& output_inds, size_t max_batch_size,
+    size_t max_workspace_size_bytes,
+    const tensorflow::grappler::GraphProperties& graph_properties,
+    tensorflow::NodeDef* trt_node) {
   // Visit nodes in reverse topological order and construct the TRT network.
 
   // Toposort
   std::vector<tensorflow::Node*> order_vec;
-  tensorflow::GetPostOrder(s.graph, &order_vec);
+  tensorflow::GetPostOrder(graph, &order_vec);
   // Select just the subgraph
   std::list<tensorflow::Node*> order;
   for (tensorflow::Node* node : order_vec) {
-    if (s.subgraph_node_ids.count(node->id())) {
+    if (subgraph_node_ids.count(node->id())) {
       // We want topological order to contstruct the
       // network layer by layer
       order.push_front(node);
@@ -2438,86 +1434,46 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         "Failed to create TensorRT network object");
   }
 
-  string subgraph_name_scope;
-  if (!order.empty()) {
-    subgraph_name_scope = order.front()->name();
-  }
-  for (const tensorflow::Node* node : order) {
-    subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
-  }
-  static int static_id = 0;
-  // TODO(sami,ben,jie): proper naming!
-  string engine_name = StrCat(subgraph_name_scope, "my_trt_op");
-  engine_name = StrCat(engine_name, static_id++);
-  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
-  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
-  auto ws = new tensorflow::tensorrt::TRTWeightStore();
-  TF_CHECK_OK(weight_rmgr->Create(engine_name, engine_name, ws));
-
   // Build the network
-  Converter converter(trt_network.get(), ws, s.precision_mode == FP16MODE);
+  Converter converter(trt_network.get());
 
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
-  for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input!!!!!";
+  for (std::pair<int, int> const& input : input_inds) {
     int node_id = input.first;
     int output_idx = input.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
+    tensorflow::Node* node = graph.FindNodeId(node_id);
     auto node_name = node->name();
-    // input_names should use the node name in the graph
-    // here it should be the input tensor name -> matching the binding
-    // insert original node name without port
-    auto tensor_name = node_name;
-    if (output_idx != 0) {
-      tensor_name = StrCat(tensor_name, ":", output_idx);
-    }
-
-    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
-            << " idx: " << output_idx;
-
-    auto shape_inference_node_name = node_name;
-    auto shape_inference_output_idx = output_idx;
-    // rewire the shape inference to original node in the graph
-    if (s.output_edge_map->count(tensor_name)) {
-      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
-      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
-    }
-    if (shape_inference_output_idx < 0) continue;
-    VLOG(2) << "shapeinference name: " << shape_inference_node_name
-            << " idx: " << shape_inference_output_idx;
-
-    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
-      return tensorflow::errors::Internal("failed to find input node: " +
-                                          shape_inference_node_name);
+    input_names.push_back(node_name);  // Insert original node name without port
+    // TODO(jie): alternative :)
+    if (!graph_properties.HasOutputProperties(node_name))
+      return tensorflow::errors::Internal("Failed to find input node: " +
+                                          node_name);
 
-    auto op_info_vec =
-        s.graph_properties.GetOutputProperties(shape_inference_node_name);
-    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
+    auto op_info_vec = graph_properties.GetOutputProperties(node_name);
+    if (static_cast<int>(op_info_vec.size()) < output_idx)
       return tensorflow::errors::Internal(
-          "accessing output index of: ", shape_inference_output_idx,
-          ", at node: ", shape_inference_node_name,
-          " with output entry from shape_map: ", op_info_vec.size());
+          "Accessing output index of: " + std::to_string(output_idx) +
+          ", at node: " + node_name + " with output entry from shape_map: " +
+          std::to_string(op_info_vec.size()));
+
+    auto op_info = op_info_vec.at(output_idx);
 
-    auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
     input_dtypes.push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
     TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
 
-    VLOG(2) << "Accessing output index of: " << output_idx
+    VLOG(2) << "Accessing output index of: " << std::to_string(output_idx)
             << ", at node: " << node_name
-            << " with output entry from shape_map: " << op_info_vec.size();
+            << " with output entry from shape_map: "
+            << std::to_string(op_info_vec.size());
+
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_psuedo_chw;
     for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
 
-    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
-    //            update the code once TRT 4.0 comes out.
-    if (op_info.shape().dim_size() != 4)
-      return tensorflow::errors::Unimplemented("require 4 dimensional input");
-
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(2) << "dimension: " << i
               << " , size: " << op_info.shape().dim(i).size();
@@ -2526,11 +1482,9 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
     // TODO(ben,jie): proper way to restore input tensor name?
     auto input_tensor_name = node_name;
-    if (output_idx != 0) {
-      input_tensor_name = StrCat(node_name, ":", output_idx);
-    }
+    if (output_idx != 0)
+      input_tensor_name = node_name + ":" + std::to_string(output_idx);
 
-    input_names.push_back(input_tensor_name);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
 
@@ -2557,22 +1511,14 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   // Gather output metadata
   std::vector<string> output_names;
   std::vector<tensorflow::DataType> output_dtypes;
-  int trt_engine_op_output_idx = 0;
-  for (const std::pair<int, int>& output : s.output_inds) {
+  for (std::pair<int, int> const& output : output_inds) {
     int node_id = output.first;
     int output_idx = output.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
+    tensorflow::Node* node = graph.FindNodeId(node_id);
     string op_name = node->name();
     string tensor_name = op_name;
-
-    s.output_edge_map->insert(
-        {trt_engine_op_output_idx == 0
-             ? engine_name
-             : StrCat(engine_name, ":", trt_engine_op_output_idx),
-         {output_idx, tensor_name}});
-    trt_engine_op_output_idx++;
     if (output_idx != 0)
-      tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
+      tensor_name = tensor_name + ":" + std::to_string(output_idx);
     VLOG(2) << "Output tensor name: " << tensor_name;
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
@@ -2594,25 +1540,19 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   }
 
   VLOG(2) << "Finished output";
+  // TODO(jie): static_id is not thread safe.
+  static int static_id = 0;
 
   // Build the engine
-  trt_builder->setMaxBatchSize(s.max_batch_size);
-  trt_builder->setMaxWorkspaceSize(s.max_workspace_size_bytes);
-  VLOG(0) << "Max batch size= " << s.max_batch_size
-          << " max workspace size= " << s.max_workspace_size_bytes;
-  if (s.precision_mode == FP16MODE) {
-    trt_builder->setHalf2Mode(true);
-    VLOG(0) << "Using FP16 precision mode";
-  }
-  LOG(INFO) << "starting build engine";
+  trt_builder->setMaxBatchSize(max_batch_size);
+  trt_builder->setMaxWorkspaceSize(max_workspace_size_bytes);
+  VLOG(0) << "Starting build engine " << static_id;
+  // TODO(ben,jie): half2 and int8 mode support
   string engine_plan_string;
   {
     auto trt_engine =
         infer_object(trt_builder->buildCudaEngine(*converter.network()));
     VLOG(0) << "Built network";
-    if (trt_engine.get() == nullptr) {
-      return tensorflow::errors::Internal("Engine building failure");
-    }
     auto engine_plan = infer_object(trt_engine->serialize());
     VLOG(0) << "Serialized engine";
     const char* engine_plan_data =
@@ -2620,19 +1560,18 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     engine_plan_string =
         string(engine_plan_data, engine_plan_data + engine_plan->size());
   }
-  TF_RETURN_IF_ERROR(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
-      engine_name, engine_name));
-  LOG(INFO) << "finished engine " << engine_name;
+
+  VLOG(0) << "Finished engine";
 
   // Build the TRT op
-  tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
+  // TODO(sami,ben,jie): proper naming!
+  tensorflow::NodeDefBuilder op_builder(
+      tensorflow::strings::StrCat("my_trt_op", static_id++), "TRTEngineOp");
   std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  VLOG(2) << "input edge size: " << input_names.size();
   for (size_t i = 0; i < input_names.size(); ++i) {
-    VLOG(2) << "input edges: " << i << " " << input_names.at(i);
-    int output_idx = s.input_inds.at(i).second;
-    // we wired up the input here already, it is redundant to do it again in
-    //  ConvertSubGraphToTensorRT(convert_graph.cc)
+    int output_idx = input_inds.at(i).second;
+    // We wired up the input here already, it is redundant to do it again in
+    // ConvertSubGraphToTensorRT(convert_graph.cc)
     auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
         input_names.at(i), output_idx, input_dtypes.at(i));
     income_edges.push_back(incoming_edge);
@@ -2647,7 +1586,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
                     .Attr("input_nodes", input_names)
                     .Attr("output_nodes", output_names)
                     .Attr("OutT", output_dtypes)
-                    .Finalize(s.trt_node);
+                    .Finalize(trt_node);
 
   VLOG(0) << status.ToString() << " finished op building";