98 files changed, 478 insertions, 2571 deletions
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 986b61b3ea..bab37e8906 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -8,7 +8,6 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
 load("//third_party/mpi:mpi.bzl", "if_mpi")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
-load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 py_library(
     name = "contrib_py",
@@ -52,6 +51,7 @@ py_library(
         "//tensorflow/contrib/image:single_image_random_dot_stereograms_py",
         "//tensorflow/contrib/input_pipeline:input_pipeline_py",
         "//tensorflow/contrib/integrate:integrate_py",
+        "//tensorflow/contrib/kafka",
         "//tensorflow/contrib/keras",
         "//tensorflow/contrib/kernel_methods",
         "//tensorflow/contrib/kfac",
@@ -63,6 +63,7 @@ py_library(
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
+        "//tensorflow/contrib/lite/python:lite",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/losses:metric_learning_py",
@@ -109,10 +110,6 @@ py_library(
         "//tensorflow/python:util",
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_tensorrt([
         "//tensorflow/contrib/tensorrt:init_py",
-    ]) + if_not_windows([
-        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",  # unix dependency, need to fix code
-        "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
-        "//tensorflow/contrib/kafka",  # has some linking issue on opensssl.
     ]),
 )
 
@@ -124,7 +121,6 @@ cc_library(
         "//tensorflow/contrib/coder:all_kernels",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_kernels",
         "//tensorflow/contrib/data/kernels:dataset_kernels",
-        "//tensorflow/contrib/kafka:dataset_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel",
@@ -151,7 +147,7 @@ cc_library(
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
-        "//tensorflow/contrib/kafka:dataset_ops_op_lib",
+        "//tensorflow/contrib/kafka:kafka_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
         "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 669d611b01..4f6f539027 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
@@ -85,8 +83,7 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
-if os.name != 'nt':
-  from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.lite.python import lite
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph
 from tensorflow.contrib.specs import python as specs
@@ -95,7 +92,6 @@ from tensorflow.contrib.summary import summary
 from tensorflow.python.util.lazy_loader import LazyLoader
 ffmpeg = LazyLoader("ffmpeg", globals(),
                     "tensorflow.contrib.ffmpeg")
-del os
 del LazyLoader
 
 del absolute_import
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
index 7815fa049a..da5e744851 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
@@ -48,9 +48,9 @@ class BatchFeatures {
   Status GetFeatureColumnSizes(int64* const num_dense_float_features,
                                int64* const num_sparse_float_features,
                                int64* const num_sparse_int_features) const {
-    QCHECK_NE(num_dense_float_features, (int64*) nullptr);
-    QCHECK_NE(num_sparse_float_features, (int64*) nullptr);
-    QCHECK_NE(num_sparse_int_features, (int64*) nullptr);
+    QCHECK_NE(num_dense_float_features, nullptr);
+    QCHECK_NE(num_sparse_float_features, nullptr);
+    QCHECK_NE(num_sparse_int_features, nullptr);
     *num_dense_float_features = dense_float_feature_columns_.size();
     *num_sparse_float_features = sparse_float_feature_columns_.size();
     *num_sparse_int_features = sparse_int_feature_columns_.size();
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index fe83bb3204..8f85a75ee4 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -26,7 +26,7 @@ The CMake files in this directory can build the core TensorFlow runtime, an
 example C++ binary, and a PIP package containing the runtime and Python
 bindings.
 
-### Prerequisites
+### Pre-requisites
 
 * CMake version 3.5 or later.
 
@@ -34,16 +34,14 @@ bindings.
 
 * [SWIG](http://www.swig.org/download.html)
 
-* Additional prerequisites for Microsoft Windows:
+* Additional pre-requisites for Microsoft Windows:
   - Visual Studio 2015
   - Python 3.5
+  - NumPy 1.11.0 or later
 
-* Additional prerequisites for Linux:
+* Additional pre-requisites for Linux:
   - Python 2.7 or later
   - [Docker](https://www.docker.com/) (for automated testing)
-
-* Python dependencies:
-  - wheel
   - NumPy 1.11.0 or later
 
 ### Known-good configurations
@@ -104,7 +102,7 @@ ops or APIs.
 Step-by-step Windows build
 ==========================
 
-1. Install the prerequisites detailed above, and set up your environment.
+1. Install the pre-requisites detailed above, and set up your environment.
 
    * The following commands assume that you are using the Windows Command
      Prompt (`cmd.exe`). You will need to set up your environment to use the
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 17f65999fa..a9f43a3ecb 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -35,7 +35,6 @@ else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
 endif()
 
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index ab464bc99a..aba8a5244e 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
 set(PROTOBUF_URL https://github.com/google/protobuf.git)
-set(PROTOBUF_TAG b04e5cba356212e4e8c66c61bbe0c3a20537c5b9)
+set(PROTOBUF_TAG 396336eb961b75f03b25824fe86cf6490fb75e3a)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index b3e5b30826..9f96a4b797 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -476,10 +476,6 @@ if (tensorflow_BUILD_CC_TESTS)
     "${tensorflow_source_dir}/tensorflow/core/profiler/internal/advisor/*_test.cc"
   )
 
-  list(REMOVE_ITEM tf_test_src_simple
-    ${tf_core_profiler_test_srcs}
-  )
-
   set(tf_test_lib tf_test_lib)
   add_library(${tf_test_lib} STATIC ${tf_src_testlib})
 
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 9212b69700..f09d156832 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -40,7 +40,6 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@rejection_resample
 @@scan
 @@shuffle_and_repeat
-@@sliding_window_batch
 @@sloppy_interleave
 @@unbatch
 
@@ -73,9 +72,6 @@ from tensorflow.contrib.data.python.ops.readers import SqlDataset
 from tensorflow.contrib.data.python.ops.resampling import rejection_resample
 from tensorflow.contrib.data.python.ops.scan_ops import scan
 from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
-from tensorflow.contrib.data.python.ops.sliding import sliding_window_batch
-from tensorflow.python.data.ops.iterator_ops import Iterator
-from tensorflow.python.ops.parsing_ops import parse_single_example_v2 as parse_single_example
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 2c4d4adfda..22418b38e3 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -498,23 +498,6 @@ py_test(
     ],
 )
 
-tf_py_test(
-    name = "slide_dataset_op_test",
-    size = "small",
-    srcs = ["slide_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:transformation_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//third_party/py/numpy",
-    ],
-)
-
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
deleted file mode 100644
index 33c48e20be..0000000000
--- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.data.python.ops import sliding
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class SlideDatasetTest(test.TestCase):
-
-  def testSlideDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    window_size = array_ops.placeholder(dtypes.int64, shape=[])
-    stride = array_ops.placeholder(dtypes.int64, shape=[])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(count) -> _SlideDataset(window_size, stride).
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .map(_map_fn)
-                .repeat(count)
-                .apply(sliding.sliding_window_batch(window_size, stride))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.test_session() as sess:
-      # Slide over a finite input, where the window_size divides the
-      # total number of elements.
-      sess.run(init_op, feed_dict={count: 20, window_size: 14, stride: 7})
-      # Same formula with convolution layer.
-      num_batches = (20 * 7 - 14) // 7 + 1
-      for i in range(num_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(14):
-            self.assertAllEqual(component[(i*7 + j) % 7]**2,
-                                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Slide over a finite input, where the window_size does not
-      # divide the total number of elements.
-      sess.run(init_op, feed_dict={count: 20, window_size: 17, stride: 9})
-
-      num_batches = (20 * 7 - 17) // 9 + 1
-      for i in range(num_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(17):
-            self.assertAllEqual(component[(i*9 + j) % 7]**2,
-                                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Slide over a finite input, which is less than window_size,
-      # should fail straight away.
-      sess.run(init_op, feed_dict={count: 1, window_size: 10, stride: 4})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      sess.run(init_op, feed_dict={count: 1, window_size: 10, stride: 8})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Slide over an empty input should fail straight away.
-      sess.run(init_op, feed_dict={count: 0, window_size: 8, stride: 4})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Empty window_size should be an initialization time error.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, window_size: 0, stride: 0})
-
-      # Invalid stride should be an initialization time error.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 0})
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 3})
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, window_size: 3, stride: 5})
-
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
-
-  def testSlideSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
-        sliding.sliding_window_batch(5, 3)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      num_batches = (10 - 5) // 3 + 1
-      for i in range(num_batches):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 3, i * 3 + 1, i * 3 + 2, i * 3 + 3, i * 3 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSlideSparseWithDifferentDenseShapes(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=array_ops.expand_dims(
-              math_ops.range(i, dtype=dtypes.int64), 1),
-          values=array_ops.fill([math_ops.to_int32(i)], i),
-          dense_shape=[i])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
-        sliding.sliding_window_batch(5, 3)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      num_batches = (10 - 5) // 3 + 1
-      for i in range(num_batches):
-        actual = sess.run(get_next)
-        expected_indices = []
-        expected_values = []
-        for j in range(5):
-          for k in range(i * 3 + j):
-            expected_indices.append([j, k])
-            expected_values.append(i * 3 + j)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=expected_indices,
-            values=expected_values,
-            dense_shape=[5, i * 3 + 5 - 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedSlideSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_sparse)
-                .apply(sliding.sliding_window_batch(4, 2))
-                .apply(sliding.sliding_window_batch(3, 1))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      # Slide: 1st batch.
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0],
-                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0],
-                   [2, 0, 0], [2, 1, 0], [2, 2, 0], [2, 3, 0]],
-          values=[0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7],
-          dense_shape=[3, 4, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      # Slide: 2nd batch.
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0],
-                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0],
-                   [2, 0, 0], [2, 1, 0], [2, 2, 0], [2, 3, 0]],
-          values=[2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9],
-          dense_shape=[3, 4, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSlideShapeError(self):
-
-    def generator():
-      yield [1.0, 2.0, 3.0]
-      yield [4.0, 5.0, 6.0]
-      yield [7.0, 8.0, 9.0, 10.0]
-
-    iterator = (dataset_ops.Dataset.from_generator(generator, dtypes.float32,
-                                                   output_shapes=[None])
-                .apply(sliding.sliding_window_batch(3, 1))
-                .make_initializable_iterator())
-    next_element = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(iterator.initializer)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r"Cannot batch tensors with different shapes in component 0. "
-          r"First element had shape \[3\] and element 2 had shape \[4\]."):
-        sess.run(next_element)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index c3331e9636..f03430c5c5 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -106,7 +106,6 @@ py_library(
         "interleave_ops.py",
         "resampling.py",
         "scan_ops.py",
-        "sliding.py",
         "stats_ops.py",
         "threadpool.py",
         "unique.py",
diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py
deleted file mode 100644
index 19cc3cb89f..0000000000
--- a/tensorflow/contrib/data/python/ops/sliding.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Sliding dataset transformations."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
-
-
-class _SlideDataset(dataset_ops.Dataset):
-  """A `Dataset` that passes a sliding window over its input."""
-
-  def __init__(self, input_dataset, window_size, stride=1):
-    """See `sliding_window_batch` for details."""
-    super(_SlideDataset, self).__init__()
-    self._input_dataset = input_dataset
-    self._window_size = ops.convert_to_tensor(
-        window_size, dtype=dtypes.int64, name="window_size")
-    self._stride = ops.convert_to_tensor(
-        stride, dtype=dtypes.int64, name="stride")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.slide_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        window_size=self._window_size,
-        stride=self._stride,
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
-        output_types=nest.flatten(
-            sparse.as_dense_types(self.output_types, self.output_classes)))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    input_shapes = self._input_dataset.output_shapes
-    return nest.pack_sequence_as(input_shapes, [
-        tensor_shape.vector(None).concatenate(s)
-        for s in nest.flatten(self._input_dataset.output_shapes)
-    ])
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-
-def sliding_window_batch(window_size, stride=1):
-  """A sliding window with size of `window_size` and step of `stride`.
-
-  This transformation passes a sliding window over this dataset. The
-  window size is `window_size` and step size is `stride`. If the left
-  elements cannot fill up the sliding window, this transformation will
-  drop the final smaller element. For example:
-
-  ```python
-  # NOTE: The following examples use `{ ... }` to represent the
-  # contents of a dataset.
-  a = { [1], [2], [3], [4], [5], [6] }
-
-  a.apply(tf.contrib.data.sliding_window_batch(window_size=3, stride=2)) ==
-  {
-      [[1], [2], [3]],
-      [[3], [4], [5]],
-  }
-  ```
-
-  Args:
-    window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
-      elements in the sliding window.
-    stride: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-      steps moving the sliding window forward for one iteration. The default
-      is `1`. It must be in `[1, window_size)`.
-
-  Returns:
-    A `Dataset` transformation function, which can be passed to
-    @{tf.data.Dataset.apply}.
-  """
-  def _apply_fn(dataset):
-    return _SlideDataset(dataset, window_size, stride)
-
-  return _apply_fn
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 1bd73ee704..6bd3f5f09b 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -454,7 +454,6 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 cuda_py_test(
@@ -1144,7 +1143,6 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index eb810e06dd..32aa2c0a4a 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -267,10 +267,7 @@ cuda_py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras",
     ],
-    tags = [
-        "no_windows",  # TODO: needs investigation on Windows
-        "notsan",
-    ],
+    tags = ["notsan"],
 )
 
 filegroup(
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index 2f6cfdf31e..f86331af6f 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -22,7 +22,6 @@ cuda_py_test(
         ":linear_regression",
         "//tensorflow:tensorflow_py",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index ad8568ad44..90f10f1fa8 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -224,10 +224,7 @@ py_test(
     srcs = ["python/ops/kmeans_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = [
-        "nomac",  # b/73741358
-        "notsan",  # b/67512932
-    ],
+    tags = ["notsan"],  # b/67512932
     deps = [
         ":factorization_py",
         ":factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 35341406a0..e61221a6b0 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -256,9 +256,6 @@ Status ReadInfoFile(const string& filename, uint32* width, uint32* height,
         if (p != std::string::npos) {
           string rgb24 = line.substr(p + 9, line.find(" ", p + 9));
           rgb24 = rgb24.substr(0, rgb24.find(","));
-          // Strip anything after " ", in case the format is
-          // `640x360 [SAR 1:1 DAR 16:9]`
-          rgb24 = rgb24.substr(0, rgb24.find(" "));
           string rgb24_width = rgb24.substr(0, rgb24.find("x"));
           string rgb24_height = rgb24.substr(rgb24_width.length() + 1);
           if (strings::safe_strtou32(rgb24_width, &width_value) &&
@@ -273,10 +270,8 @@ Status ReadInfoFile(const string& filename, uint32* width, uint32* height,
       // We only look for the first stream mapping to have the number of the
       // frames.
       // Once processed we will not further process stream mapping section.
-      if (line.find("frame=") == 0) {
-        // The format might be `frame=  166 ` or `frame=12488 `
-        string number = line.substr(6);
-        number = number.substr(number.find_first_not_of(" "));
+      if (line.find("frame=  ") == 0) {
+        string number = line.substr(8, line.find(" ", 8));
         number = number.substr(0, number.find(" "));
         if (strings::safe_strtou32(number, &frames_value)) {
           in_mapping = false;
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index ff6f3b7441..0eb0e3cbe2 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -354,7 +354,6 @@ py_test(
     name = "classifier_metrics_test",
     srcs = ["python/eval/python/classifier_metrics_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":classifier_metrics",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD
index 1c3974871c..efb403462a 100644
--- a/tensorflow/contrib/kafka/BUILD
+++ b/tensorflow/contrib/kafka/BUILD
@@ -1,93 +1,66 @@
-package(default_visibility = ["//tensorflow:internal"])
+package(
+    default_visibility = ["//visibility:private"],
+)
 
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_gen_op_wrapper_py",
-    "tf_kernel_library",
-    "tf_custom_op_library",
-    "tf_custom_op_py_library",
-    "tf_gen_op_libs",
-    "tf_py_test",
-)
-
-py_library(
-    name = "kafka",
-    srcs = ["__init__.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":dataset_ops",
-    ],
-)
-
-tf_custom_op_library(
-    name = "_dataset_ops.so",
-    srcs = ["ops/dataset_ops.cc"],
-    deps = [":dataset_kernels"],
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["dataset_ops"],
-)
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
-cc_library(
-    name = "dataset_kernels",
+tf_kernel_library(
+    name = "kafka_kernels",
     srcs = ["kernels/kafka_dataset_ops.cc"],
+    visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//tensorflow/core/kernels:dataset",
         "//third_party/eigen3",
         "@kafka",
-        "@protobuf_archive//:protobuf_headers",
     ],
-    alwayslink = 1,
 )
 
-py_library(
-    name = "dataset_ops",
-    srcs = [
-        "python/ops/kafka_dataset_ops.py",
-    ],
-    srcs_version = "PY2AND3",
+tf_gen_op_libs(
+    op_lib_names = ["kafka_ops"],
     deps = [
-        ":kafka_op_loader",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
+        "//tensorflow/core:lib",
     ],
 )
 
 tf_gen_op_wrapper_py(
-    name = "gen_dataset_ops",
-    out = "python/ops/gen_dataset_ops.py",
-    deps = ["//tensorflow/contrib/kafka:dataset_ops_op_lib"],
-)
-
-tf_kernel_library(
-    name = "dataset_ops_kernels",
-    deps = [
-        ":dataset_kernels",
-        "//tensorflow/core:framework",
-    ],
-    alwayslink = 1,
+    name = "gen_kafka_ops",
+    out = "python/ops/gen_kafka_ops.py",
+    require_shape_functions = True,
+    deps = [":kafka_ops_op_lib"],
 )
 
-tf_custom_op_py_library(
-    name = "kafka_op_loader",
-    srcs = ["python/ops/kafka_op_loader.py"],
-    dso = ["//tensorflow/contrib/kafka:_dataset_ops.so"],
-    kernels = [
-        ":dataset_ops_kernels",
-        "//tensorflow/contrib/kafka:dataset_ops_op_lib",
+py_library(
+    name = "kafka",
+    srcs = [
+        "__init__.py",
+        "python/ops/kafka_dataset_ops.py",
     ],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
-        ":gen_dataset_ops",
+        ":gen_kafka_ops",
         "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
     ],
 )
 
@@ -115,7 +88,6 @@ tf_py_test(
     ],
     tags = [
         "manual",
-        "no_windows",
         "notap",
     ],
 )
@@ -123,9 +95,7 @@ tf_py_test(
 filegroup(
     name = "all_files",
     srcs = glob(
-        include = [
-            "**/*",
-        ],
+        ["**/*"],
         exclude = [
             "**/METADATA",
             "**/OWNERS",
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
index a4cd4a2cc4..88ef5f3571 100644
--- a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/tensor.h"
 
 #include "src-cpp/rdkafkacpp.h"
 
diff --git a/tensorflow/contrib/kafka/ops/dataset_ops.cc b/tensorflow/contrib/kafka/ops/dataset_ops.cc
deleted file mode 100644
index 8cdf16103b..0000000000
--- a/tensorflow/contrib/kafka/ops/dataset_ops.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/shape_inference.h"
-
-namespace tensorflow {
-
-REGISTER_OP("KafkaDataset")
-    .Input("topics: string")
-    .Input("servers: string")
-    .Input("group: string")
-    .Input("eof: bool")
-    .Input("timeout: int64")
-    .Output("handle: variant")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that emits the messages of one or more Kafka topics.
-
-topics: A `tf.string` tensor containing one or more subscriptions,
-  in the format of [topic:partition:offset:length],
-  by default length is -1 for unlimited.
-servers: A list of bootstrap servers.
-group: The consumer group id.
-eof: If True, the kafka reader will stop on EOF.
-timeout: The timeout value for the Kafka Consumer to wait
-  (in millisecond).
-)doc");
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
index a1624614d1..8e51d27a34 100644
--- a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -17,9 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.kafka.python.ops import kafka_op_loader  # pylint: disable=unused-import
-from tensorflow.contrib.kafka.python.ops import gen_dataset_ops
-from tensorflow.python.data.ops.dataset_ops import Dataset
+from tensorflow.contrib.kafka.python.ops import gen_kafka_ops
+from tensorflow.python.data.ops.readers import Dataset
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -59,8 +58,8 @@ class KafkaDataset(Dataset):
         timeout, dtype=dtypes.int64, name="timeout")
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.kafka_dataset(self._topics, self._servers,
-                                         self._group, self._eof, self._timeout)
+    return gen_kafka_ops.kafka_dataset(self._topics, self._servers, self._group,
+                                       self._eof, self._timeout)
 
   @property
   def output_classes(self):
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_op_loader.py b/tensorflow/contrib/kafka/python/ops/kafka_op_loader.py
deleted file mode 100644
index ec2fdea962..0000000000
--- a/tensorflow/contrib/kafka/python/ops/kafka_op_loader.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Python helper for loading kafka ops and kernels."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.util import loader
-from tensorflow.python.platform import resource_loader
-
-_dataset_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
index d1c449402a..146ae8b7e2 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
@@ -114,7 +114,6 @@ py_test(
     name = "utils_test",
     srcs = ["utils_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         "//tensorflow/contrib/kfac/python/ops:utils",
         "//tensorflow/contrib/tpu",
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index 544065dac6..894e6f6946 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -70,7 +70,6 @@ py_test(
         "python/ops/core_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":_typecheck",
         ":core",
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index cc7bbabf21..852d06e1e3 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -188,7 +188,6 @@ py_test(
     size = "small",
     srcs = ["python/layers/normalization_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":layers_py",
         "//tensorflow/contrib/framework:framework_py",
@@ -354,7 +353,6 @@ py_test(
     size = "small",
     srcs = ["python/ops/sparse_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":layers_py",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index ffa208540d..b62e3050cd 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -470,7 +470,7 @@ def embedding_lookup_unique(params, ids, name=None):
     ids = ops.convert_to_tensor(ids)
     shape = array_ops.shape(ids)
     ids_flat = array_ops.reshape(
-        ids, math_ops.reduce_prod(shape, keepdims=True))
+        ids, math_ops.reduce_prod(shape, keep_dims=True))
     unique_ids, idx = array_ops.unique(ids_flat)
     unique_embeddings = embedding_ops.embedding_lookup(params, unique_ids)
     embeds_flat = array_ops.gather(unique_embeddings, idx)
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index b05f5eeaee..f837ca3265 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -5,8 +5,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-
 package(default_visibility = [
     "//engedu/ml/tf_from_scratch:__pkg__",
     "//tensorflow:internal",
@@ -117,7 +115,6 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/data_feeder_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
@@ -173,7 +170,6 @@ tf_py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 py_test(
@@ -192,7 +188,6 @@ py_test(
     size = "small",
     srcs = ["python/learn/graph_actions_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/framework:framework_py",
@@ -431,10 +426,7 @@ py_test(
     size = "medium",
     srcs = ["python/learn/estimators/kmeans_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "noasan",  # b/73741358
-        "nomac",
-    ],
+    tags = ["noasan"],
     deps = [
         ":learn",
         "//tensorflow/python:array_ops",
@@ -593,7 +585,6 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/io_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/learn/python/learn/datasets",
@@ -823,7 +814,6 @@ py_test(
     size = "small",
     srcs = ["python/learn/utils/saved_model_export_utils_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
diff --git a/tensorflow/contrib/learn/python/learn/ops/embeddings_ops.py b/tensorflow/contrib/learn/python/learn/ops/embeddings_ops.py
index 8f9811cf25..b3b067b8e1 100644
--- a/tensorflow/contrib/learn/python/learn/ops/embeddings_ops.py
+++ b/tensorflow/contrib/learn/python/learn/ops/embeddings_ops.py
@@ -61,7 +61,7 @@ def embedding_lookup(params, ids, name='embedding_lookup'):
     ids = ops.convert_to_tensor(ids)
     shape = array_ops_.shape(ids)
     ids_flat = array_ops_.reshape(
-        ids, math_ops.reduce_prod(shape, keepdims=True))
+        ids, math_ops.reduce_prod(shape, keep_dims=True))
     embeds_flat = nn.embedding_lookup(params, ids_flat, name)
     embed_shape = array_ops_.concat([shape, [-1]], 0)
     embeds = array_ops_.reshape(embeds_flat, embed_shape)
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
index b4504f246a..7f31629272 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/Makefile
@@ -27,10 +27,10 @@ LIBDIR := $(MAKEFILE_DIR)/gen/lib/
 GENDIR := $(MAKEFILE_DIR)/gen/obj/
 
 # Settings for the host compiler.
-CXX := $(CC_PREFIX)gcc
+CXX := $(CC_PREFIX) gcc
 CXXFLAGS := --std=c++11 -O3 -DNDEBUG
-CC := $(CC_PREFIX)gcc
-CFLAGS := -O3 -DNDEBUG
+CC := $(CC_PREFIX) gcc
+CFLAGS :=
 LDOPTS :=
 LDOPTS += -L/usr/local/lib
 ARFLAGS := -r
@@ -57,11 +57,10 @@ LIBS := \
 
 # If we're on Linux, also link in the dl library.
 ifeq ($(HOST_OS),LINUX)
-	LIBS += -ldl
+	LIBS += -ldl -lpthread
 endif
 
 include $(MAKEFILE_DIR)/ios_makefile.inc
-include $(MAKEFILE_DIR)/rpi_makefile.inc
 
 # This library is the main target for this makefile. It will contain a minimal
 # runtime that can be linked in to other programs.
diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h
index f84b3dad95..58bc164619 100644
--- a/tensorflow/contrib/lite/arena_planner.h
+++ b/tensorflow/contrib/lite/arena_planner.h
@@ -33,7 +33,7 @@ class AllocationInfo;
 // each tensor needs to be allocated and deallocated, and preallocates all the
 // necessary memory (the PlanAllocations phase). It then assigns portions of
 // this memory buffer to each tensor (the ExecuteAllocations phase). Tensors may
-// share some of the buffer if a tensor B is to be allocated after another tensor
+// share some of the bufer if a tensor B is to be allocated after another tensor
 // A has been deallocated.
 //
 // If dynamic tensors are used the planning steps can be repeated during model
diff --git a/tensorflow/contrib/lite/build_rpi_lib.sh b/tensorflow/contrib/lite/build_rpi_lib.sh
deleted file mode 100755
index 3824b16412..0000000000
--- a/tensorflow/contrib/lite/build_rpi_lib.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash -x
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../.."
-
-CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/contrib/lite/Makefile TARGET=RPI TARGET_ARCH=armv7
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index ea3ae3489e..2218ea8eac 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -24,7 +24,7 @@ extern "C" {
 #endif  // __cplusplus
 
 // The enum for builtin operators.
-// Note: CUSTOM and DELEGATE are 2 special ops which are not real builtin
+// Note: CUSTOM and DELEGATE are 2 special ops which are not real biultin
 // ops.
 typedef enum {
   kTfLiteBuiltinAdd = 0,
diff --git a/tensorflow/contrib/lite/error_reporter.h b/tensorflow/contrib/lite/error_reporter.h
index 3c5f805f12..da193d2586 100644
--- a/tensorflow/contrib/lite/error_reporter.h
+++ b/tensorflow/contrib/lite/error_reporter.h
@@ -30,7 +30,7 @@ namespace tflite {
 //  va_list args;
 //  foo.Report("test %d", args); // where args is va_list
 //
-// Subclass ErrorReporter to provide another reporting destination.
+// Sublclass ErrorReporter to provide another reporting destination.
 // For example, if you have a GUI program, you might redirect to a buffer
 // that drives a GUI error log box.
 class ErrorReporter {
diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/contrib/lite/g3doc/rpi.md
deleted file mode 100644
index 7a3a231626..0000000000
--- a/tensorflow/contrib/lite/g3doc/rpi.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# TensorFlow Lite for Raspberry Pi
-
-## Cross compiling
-### Installing toolchian
-This has been tested on Ubuntu 16.04.3 64bit and Tensorflow devel docker image [tensorflow/tensorflow:nightly-devel](https://hub.docker.com/r/tensorflow/tensorflow/tags/).
-
-To cross compiling TensorFlow Lite. First you should install the toolchain and libs.
-```bash
-sudo apt-get update
-sudo apt-get install crossbuild-essential-armhf
-```
-> If you are using docker, you may not use `sudo`
-
-### Building
-Clone this Tensorflow repository, Run this script at the root of the repository to download all the dependencies:
-> The Tensorflow repository is in `/tensorflow` if you are using `tensorflow/tensorflow:nightly-devel` docker image, just try it.
-```bash
-./tensorflow/contrib/lite/download_dependencies.sh
-```
-Note than you only need to to this once.
-
-You should then be able to compile:
-```bash
-./tensorflow/contrib/lite/build_rpi_lib.sh
-```
-
-This should compile a static library in:
-`tensorflow/contrib/lite/gen/lib/rpi_armv7/libtensorflow-lite.a`.
-
-## Native compiling
-This has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1).
-
-Log in to you RPI, install the toolchain.
-```bash
-sudo apt-get instal build-essential
-```
-
-First, clone this TensorFlow repository. Run this at the root of the repository:
-```bash
-./tensorflow/contrib/lite/download_dependencies.sh
-```
-Note than you only need to to this once.
-
-You should then be able to compile:
-```bash
-./tensorflow/contrib/lite/build_rpi_lib.sh
-```
-
-This should compile a static library in:
-`tensorflow/contrib/lite/gen/lib/rpi_armv7/libtensorflow-lite.a`.
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 3749869f58..af143370ee 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -481,7 +481,7 @@ class Interpreter {
   // During Invoke(), Interpreter will allocate input tensors first, which are
   // known to be fixed size. Then it will allocate outputs from nodes as many
   // as possible. When there is a node that produces dynamic sized tensor.
-  // Interpreter will stop allocating tensors, set the value of next allocate
+  // Intepreter will stop allocating tensors, set the value of next allocate
   // node id, and execute the node to generate the output tensor before continue
   // to allocate successors. This process repeats until all nodes are executed.
   // NOTE: this relies on the order of nodes that is in topological order.
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index 72d4acedbe..7a029c7df8 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -40,7 +40,7 @@ TEST(BasicInterpreter, InvokeInvalidModel) {
   ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
 }
 
-// Test size accessor functions.
+// Test size accesser functions.
 TEST(BasicInterpreter, TestSizeFunctions) {
   Interpreter interpreter;
   int base_index;
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index e0cd12f1b4..b91ba1a03d 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -64,7 +64,7 @@ struct OpData {
 
   TfLitePaddingValues padding;
   // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
+  // be represented as a fixed point multipler plus a left shift.
   int32_t output_multiplier;
   int output_shift;
   // The range of the fused activation layer. For example for kNone and
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
index cad9ce114c..15dbfe08c8 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -52,7 +52,7 @@ enum KernelType {
 struct OpData {
   TfLitePaddingValues padding;
   // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
+  // be represented as a fixed point multipler plus a left shift.
   int32_t output_multiplier;
   int output_shift;
   // The range of the fused activation layer. For example for kNone and
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
index 888e67966c..a77fe94e49 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -48,7 +48,7 @@ enum KernelType {
 
 struct OpData {
   // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
+  // be represented as a fixed point multipler plus a left shift.
   int32_t output_multiplier;
   int output_shift;
   // The range of the fused activation layer. For example for kNone and
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index 21da1daff7..28f53b9fbb 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -58,7 +58,7 @@ inline bool IsConstantTensor(TfLiteTensor* tensor) {
 }
 
 // Determines whether tensor is dynamic. Note that a tensor can be non-const and
-// not dynamic. This function specifically checks for a dynamic tensor.
+// not dynamic. This function specificially checks for a dynamic tensor.
 inline bool IsDynamicTensor(TfLiteTensor* tensor) {
   return tensor->allocation_type == kTfLiteDynamic;
 }
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection.cc b/tensorflow/contrib/lite/kernels/lsh_projection.cc
index 0ee35775d5..5f73b56ed9 100644
--- a/tensorflow/contrib/lite/kernels/lsh_projection.cc
+++ b/tensorflow/contrib/lite/kernels/lsh_projection.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// LSH Projection projects an input to a bit vector via locality sensitive
+// LSH Projection projects an input to a bit vector via locality senstive
 // hashing.
 //
 // Options:
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
index 8cf1165135..b9255b23a5 100644
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -213,9 +213,9 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   // present.
   // 2) If projection weight is present, then projection bias is optional.
   // TODO(ghodrat): make sure this is correct.
-  const bool projection_tensors_consistent =
+  const bool projecton_tensors_consistent =
       ((projection_weights != nullptr) || (projection_bias == nullptr));
-  TF_LITE_ENSURE(context, projection_tensors_consistent == true);
+  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
 
   return kTfLiteOk;
 }
@@ -357,7 +357,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const int n_output = recurrent_to_output_weights->dims->data[1];
 
   // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
+  // check the existense of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights == nullptr);
   const bool use_peephole = (cell_to_output_weights != nullptr);
 
diff --git a/tensorflow/contrib/lite/kernels/reshape.cc b/tensorflow/contrib/lite/kernels/reshape.cc
index 438f70d311..f3e6ddc9f4 100644
--- a/tensorflow/contrib/lite/kernels/reshape.cc
+++ b/tensorflow/contrib/lite/kernels/reshape.cc
@@ -49,20 +49,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TfLiteIntArray* output_size = TfLiteIntArrayCreate(params->num_dimensions);
   int num_output_elements = 1;
-  int stretch_dim = -1;
+  int strech_dim = -1;
   for (int i = 0; i < params->num_dimensions; ++i) {
     int value = params->shape[i];
     if (value == -1) {
-      TF_LITE_ENSURE_EQ(context, stretch_dim, -1);
-      stretch_dim = i;
+      TF_LITE_ENSURE_EQ(context, strech_dim, -1);
+      strech_dim = i;
     } else {
       num_output_elements *= value;
       output_size->data[i] = value;
     }
   }
-  if (stretch_dim != -1) {
-    output_size->data[stretch_dim] = num_input_elements / num_output_elements;
-    num_output_elements *= output_size->data[stretch_dim];
+  if (strech_dim != -1) {
+    output_size->data[strech_dim] = num_input_elements / num_output_elements;
+    num_output_elements *= output_size->data[strech_dim];
   }
 
   TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
diff --git a/tensorflow/contrib/lite/kernels/reshape_test.cc b/tensorflow/contrib/lite/kernels/reshape_test.cc
index aecbd0399f..0fbcf6e6aa 100644
--- a/tensorflow/contrib/lite/kernels/reshape_test.cc
+++ b/tensorflow/contrib/lite/kernels/reshape_test.cc
@@ -60,7 +60,7 @@ TEST(ReshapeOpTest, TooManyDimensions) {
 
 TEST(ReshapeOpTest, TooManySpecialDimensions) {
   EXPECT_DEATH(ReshapeOpModel({1, 2, 4, 1}, {-1, -1, 2, 4}),
-               "stretch_dim != -1");
+               "strech_dim != -1");
 }
 
 TEST(ReshapeOpTest, SimpleTest) {
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index 0bb28b50b2..373310bd87 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -141,8 +141,8 @@ void SingleOpModel::SetBuiltinOp(BuiltinOperator type,
 
 void SingleOpModel::SetCustomOp(
     const string& name, const std::vector<uint8_t>& custom_option,
-    const std::function<TfLiteRegistration*()>& registration) {
-  custom_registrations_[name] = registration;
+    const std::function<TfLiteRegistration*()>& registeration) {
+  custom_registrations_[name] = registeration;
   opcodes_.push_back(
       CreateOperatorCodeDirect(builder_, BuiltinOperator_CUSTOM, name.data()));
   operators_.push_back(CreateOperator(
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
index 42941a97db..508a570e2e 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
@@ -360,7 +360,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const int n_output = recurrent_to_output_weights->dims->data[1];
 
   // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
+  // check the existense of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights == nullptr);
   const bool use_peephole = (cell_to_output_weights != nullptr);
 
diff --git a/tensorflow/contrib/lite/memory_planner.h b/tensorflow/contrib/lite/memory_planner.h
index 0294ec815c..5cd6c20850 100644
--- a/tensorflow/contrib/lite/memory_planner.h
+++ b/tensorflow/contrib/lite/memory_planner.h
@@ -34,8 +34,8 @@ class MemoryPlanner {
   // [first_node, last_node].
   virtual TfLiteStatus ExecuteAllocations(int first_node, int last_node) = 0;
 
-  // Invalidates allocations made earlier. This is called when tensors sizes
-  // have changed. All planned allocations remain, but can't be used until
+  // Invalidates allocations made earliers. This is called when tensors sizes
+  // have change. All planned allocations remain, but can't be used until
   // ExecuteAllocations() is called.
   virtual TfLiteStatus ResetAllocations() = 0;
 };
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index 38eea0e26b..51a622a28d 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -81,7 +81,7 @@ class FlatBufferModel {
       const tflite::Model* model_spec,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
-  // Releases memory or unmaps mmaped memory.
+  // Releases memory or unmaps mmaped meory.
   ~FlatBufferModel();
 
   // Copying or assignment is disallowed to simplify ownership semantics.
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index bd49d327c9..76032771af 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -569,7 +569,7 @@ enum {
   ANEURALNETWORKS_LOGISTIC = 14,
 
   /**
-   * Projects an input to a bit vector via locality sensitive hashing.
+   * Projects an input to a bit vector via locality senstive hashing.
    *
    * Inputs:
    * * 0: Hash functions. Dim.size == 2, DataType: Float.
diff --git a/tensorflow/contrib/lite/rpi_makefile.inc b/tensorflow/contrib/lite/rpi_makefile.inc
deleted file mode 100644
index 832ef5824b..0000000000
--- a/tensorflow/contrib/lite/rpi_makefile.inc
+++ /dev/null
@@ -1,33 +0,0 @@
-# Settings for Raspberry Pi.
-ifeq ($(TARGET), RPI)
-	ifeq ($(TARGET_ARCH), armv7)
-		CXXFLAGS += \
-			-march=armv7-a \
-			-mfpu=neon-vfpv4 \
-			-funsafe-math-optimizations \
-			-ftree-vectorize
-
-		CCFLAGS += \
-			-march=armv7-a \
-			-mfpu=neon-vfpv4 \
-			-funsafe-math-optimizations \
-			-ftree-vectorize
-
-		LDFLAGS := \
-			-Wl,--no-export-dynamic \
-			-Wl,--exclude-libs,ALL \
-			-Wl,--gc-sections \
-			-Wl,--as-needed
-	endif
-
-	LIBS := \
-	-lstdc++ \
-	-lpthread \
-	-lm \
-	-ldl
-
-	OBJDIR := $(OBJDIR)rpi_$(TARGET_ARCH)/
-	LIBDIR := $(LIBDIR)rpi_$(TARGET_ARCH)/
-	BINDIR := $(BINDIR)rpi_$(TARGET_ARCH)/
-	DEPDIR := $(DEPDIR)rpi_$(TARGET_ARCH)/
-endif
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc b/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
index 640972de77..08bcfe4516 100644
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
+++ b/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
@@ -46,7 +46,7 @@ extern "C" {
 #endif  // __cplusplus
 
 // The enum for builtin operators.
-// Note: CUSTOM and DELEGATE are 2 special ops which are not real builtin
+// Note: CUSTOM and DELEGATE are 2 special ops which are not real biultin
 // ops.
 typedef enum {
 )";
diff --git a/tensorflow/contrib/lite/simple_memory_arena.cc b/tensorflow/contrib/lite/simple_memory_arena.cc
index 2f2004f56b..4aab244989 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.cc
+++ b/tensorflow/contrib/lite/simple_memory_arena.cc
@@ -113,21 +113,21 @@ TfLiteStatus SimpleMemoryArena::Commit(TfLiteContext* context) {
     underlying_buffer_size_ = required_size;
     underlying_buffer_aligned_ptr_ = new_underlying_buffer_aligned_ptr;
   }
-  committed_ = true;
+  commited_ = true;
   return underlying_buffer_ != nullptr ? kTfLiteOk : kTfLiteError;
 }
 
 TfLiteStatus SimpleMemoryArena::ResolveAlloc(TfLiteContext* context,
                                              const ArenaAlloc& alloc,
                                              char** output_ptr) {
-  TF_LITE_ENSURE(context, committed_);
+  TF_LITE_ENSURE(context, commited_);
   TF_LITE_ENSURE(context, output_ptr != nullptr);
   *output_ptr = underlying_buffer_aligned_ptr_ + alloc.offset;
   return kTfLiteOk;
 }
 
 TfLiteStatus SimpleMemoryArena::Clear() {
-  committed_ = false;
+  commited_ = false;
   high_water_mark_ = 0;
   allocs_.clear();
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/simple_memory_arena.h b/tensorflow/contrib/lite/simple_memory_arena.h
index 5faf78b59e..0535522374 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.h
+++ b/tensorflow/contrib/lite/simple_memory_arena.h
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tflite {
 
 // This little structure holds the offset and the size for a dynamic memory
-// allocation in the memory arena. When the arena is committed and the
+// allocation in the memory arena. When the arena is commited and the
 // underlying buffer is set, the alloc can be resolved into an actual memory
 // pointer.
 struct ArenaAlloc {
@@ -43,7 +43,7 @@ struct ArenaAlloc {
 class SimpleMemoryArena {
  public:
   explicit SimpleMemoryArena(size_t arena_alignment)
-      : committed_(false),
+      : commited_(false),
         arena_alignment_(arena_alignment),
         high_water_mark_(0),
         underlying_buffer_size_(0),
@@ -73,7 +73,7 @@ class SimpleMemoryArena {
   }
 
  private:
-  bool committed_;
+  bool commited_;
   size_t arena_alignment_;
   size_t high_water_mark_;
   std::unique_ptr<char[]> underlying_buffer_;
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index 0a6edc33c5..8ca03f4193 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -46,7 +46,6 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
     grpc_enabled = True,
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 filegroup(
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 6c3b02e12b..995230dfa8 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -194,8 +194,6 @@ with:
 srcs = glob(["libs/arm64-v8a/*.so"]),
 ```
 
-If you are building for Android TV (Shield TV devices), replace "portrait" with "landscape" for android:screenOrientation in all four activities in tensorflow/examples/android/AndroidManifest.xml
-
 Then run:
 ```bash
 # Create dir for native libs
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index 9b148688c4..2d99791839 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -80,9 +80,10 @@ if [[ ! -z "${OPTIMIZE_FOR_GRAPH}" ]]; then
         fi
     else
         echo "${PRNT_SLCTV_BIN} found. Using it"
+        ${PRNT_SLCTV_BIN} --graphs=${OPTIMIZE_FOR_GRAPH} > ${TOP_SRCDIR}/tensorflow/core/framework/ops_to_register.h
+
     fi
 
-    ${PRNT_SLCTV_BIN} --graphs=${OPTIMIZE_FOR_GRAPH} > ${TOP_SRCDIR}/tensorflow/core/framework/ops_to_register.h
 fi
 
 if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
diff --git a/tensorflow/contrib/mpi/mpi_utils.h b/tensorflow/contrib/mpi/mpi_utils.h
index df055ff567..fa297c28cb 100644
--- a/tensorflow/contrib/mpi/mpi_utils.h
+++ b/tensorflow/contrib/mpi/mpi_utils.h
@@ -24,8 +24,6 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/str_util.h"
 
-// Skip MPI C++ bindings support, this matches the usage in other places
-#define OMPI_SKIP_MPICXX
 #include "third_party/mpi/mpi.h"
 #define MPI_CHECK(cmd)                                                \
   do {                                                                \
diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py
index 6e77e934fe..04b5d5bdf1 100644
--- a/tensorflow/contrib/predictor/predictor_factories.py
+++ b/tensorflow/contrib/predictor/predictor_factories.py
@@ -53,7 +53,7 @@ def from_contrib_estimator(estimator,
       `Estimator`.
   """
   if isinstance(estimator, core_estimator.Estimator):
-    raise TypeError('Expected estimator to be of type '
+    raise TypeError('Espected estimator to be of type '
                     'tf.contrib.learn.Estimator, but got type '
                     'tf.python.estimator.Estimator. You likely want to call '
                     'from_estimator.')
@@ -88,7 +88,7 @@ def from_estimator(estimator,
       `Estimator`.
   """
   if isinstance(estimator, contrib_estimator.Estimator):
-    raise TypeError('Expected estimator to be of type '
+    raise TypeError('Espected estimator to be of type '
                     'tf.python.estimator.Estimator, but got type '
                     'tf.contrib.learn.Estimator. You likely want to call '
                     'from_contrib_estimator.')
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
index 4bb6f76019..f624c42686 100644
--- a/tensorflow/contrib/py2tf/converters/BUILD
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -81,7 +81,6 @@ py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":test_lib",
         "//tensorflow/python:client_testlib",
@@ -92,7 +91,6 @@ py_test(
     name = "call_trees_test",
     srcs = ["call_trees_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":test_lib",
         "//tensorflow/contrib/py2tf/impl",
diff --git a/tensorflow/contrib/py2tf/converters/single_return.py b/tensorflow/contrib/py2tf/converters/single_return.py
index 1194b98f5e..90bc22008f 100644
--- a/tensorflow/contrib/py2tf/converters/single_return.py
+++ b/tensorflow/contrib/py2tf/converters/single_return.py
@@ -212,7 +212,7 @@ class DetectReturnInUnsupportedControlFlow(gast.NodeVisitor):
 
   def __init__(self):
     self.cant_return = False
-    super(DetectReturnInUnsupportedControlFlow, self).__init__()
+    super(gast.NodeVisitor, self).__init__()
 
   def visit_While(self, node):
     self.cant_return = True
diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/py2tf/utils/BUILD
index 8bc338e801..d029289f5a 100644
--- a/tensorflow/contrib/py2tf/utils/BUILD
+++ b/tensorflow/contrib/py2tf/utils/BUILD
@@ -83,7 +83,6 @@ py_test(
     name = "py_func_test",
     srcs = ["py_func_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":utils",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 1afcbb8504..b278265639 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -237,7 +237,7 @@ def _FindFusedBatchNorms(graph):
       # The batch variance used during forward and backward prop is biased,
       # i.e it is calculated as: V=sum(x(k)-mu)^2/N. For the moving average
       # calculation, the variance is corrected by the term N/N-1 (Bessel's
-      # correction). The variance tensor read from FuseBatchNorm has Bessel's
+      # correction). The variance tensor read from FuseBatchNorm has bessel's
       # correction applied, so we undo it here.
       scope, sep, _ = bn_op.name.rpartition('/')
       g = ops.get_default_graph()
@@ -306,7 +306,7 @@ def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
 
   Args:
     context: The scope under which we look for batch norm params
-    match: Object containing required batch norm tensors for correction
+    match: Object containg required batch norm tensors for correction
       computation.
     freeze_batch_norm_delay: Delay in steps at which computation switches
       from regular batch norm to frozen mean and variance.
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index a4f7b1b221..0a8e35080c 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -282,8 +282,8 @@ def _FakeQuantWithMinMaxVars(inputs, min_var, max_var, per_channel, num_bits,
   Args:
     inputs: a tensor containing values to be quantized.
     min_var: a variable containing quantization range lower end(s).
-    max_var: a variable containing quantization range upper end(s).
-    per_channel: a boolean specifying whether to use per-channel quantization.
+    max_var: a variable containing quantization range lupper end(s).
+    per_channel: a boolean specifying whether to use per-channel quantizatioh.
     num_bits: Number of bits to use for quantization, must be between 2 and 8.
     narrow_range: Whether to use the narrow quantization range
       [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index ec721afbc8..0608ab9302 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -267,7 +267,7 @@ def _InsertQuantOp(context,
   """Inserts a quant op between a producer op and (multiple) consumer ops.
 
   Args:
-    context: Context where producer and consumer operations are nested.
+    context: Context w,here producer and consumer operations are nested.
     name: Name for the new quantization op within the context.
     producer: Producer operation of the pairs where quantization will be
       inserted.
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 5abdcd2475..5a3a74cec4 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -158,7 +158,7 @@ def experimental_create_training_graph(input_graph=None,
   often fail.
 
   Args:
-    input_graph: The tf.Graph to be transformed, if None then defaults to the
+    input_graph: The tf.Graph to be transformed,if None then defaults to the
       default graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
diff --git a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
index db745aa562..0624cc878b 100644
--- a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
@@ -419,7 +419,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
           normalizer_params=self._BatchNormParams(fused_batch_norm),
           scope=scope)
 
-      # Manually add a bypass (optional) and an activation.
+      # Manually add a bypass (optionaly) and an activation.
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
 
@@ -470,7 +470,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
           normalizer_params=self._BatchNormParams(fused_batch_norm),
           scope=scope)
 
-      # Manually add a bypass (optional) and an activation.
+      # Manually add a bypass (optionaly) and an activation.
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
 
@@ -526,7 +526,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
           normalizer_params=self._BatchNormParams(fused_batch_norm),
           scope=scope)
 
-      # Manually add a bypass (optional) and an activation.
+      # Manually add a bypass (optionaly) and an activation.
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
 
@@ -565,7 +565,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       stddev: Standard deviation of normal variable.
 
     Returns:
-      An initialized that initializes with a truncated normal variable.
+      An initialized that initialzes with a truncated normal variable.
     """
     return init_ops.truncated_normal_initializer(stddev=stddev)
 
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index b2e5707a6d..ef59475167 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -144,7 +144,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       stddev: Standard deviation of normal variable.
 
     Returns:
-      An initialized that initializes with a truncated normal variable.
+      An initialized that initialzes with a truncated normal variable.
     """
     return init_ops.truncated_normal_initializer(stddev=stddev)
 
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
index 54c66271cd..27f0a7f58f 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/BUILD
+++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
@@ -38,6 +38,7 @@ py_test(
     size = "small",
     srcs = ["python/ops/remote_fused_graph_ops_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":remote_fused_graph_ops_py",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 358b2eb02b..73f2607d84 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2133,7 +2133,7 @@ class Conv1DLSTMCell(ConvLSTMCell):
 
   def __init__(self, name="conv_1d_lstm_cell", **kwargs):
     """Construct Conv1DLSTM. See `ConvLSTMCell` for more details."""
-    super(Conv1DLSTMCell, self).__init__(conv_ndims=1, name=name, **kwargs)
+    super(Conv1DLSTMCell, self).__init__(conv_ndims=1, **kwargs)
 
 
 class Conv2DLSTMCell(ConvLSTMCell):
@@ -2144,7 +2144,7 @@ class Conv2DLSTMCell(ConvLSTMCell):
 
   def __init__(self, name="conv_2d_lstm_cell", **kwargs):
     """Construct Conv2DLSTM. See `ConvLSTMCell` for more details."""
-    super(Conv2DLSTMCell, self).__init__(conv_ndims=2, name=name, **kwargs)
+    super(Conv2DLSTMCell, self).__init__(conv_ndims=2, **kwargs)
 
 
 class Conv3DLSTMCell(ConvLSTMCell):
@@ -2155,7 +2155,7 @@ class Conv3DLSTMCell(ConvLSTMCell):
 
   def __init__(self, name="conv_3d_lstm_cell", **kwargs):
     """Construct Conv3DLSTM. See `ConvLSTMCell` for more details."""
-    super(Conv3DLSTMCell, self).__init__(conv_ndims=3, name=name, **kwargs)
+    super(Conv3DLSTMCell, self).__init__(conv_ndims=3, **kwargs)
 
 
 def _conv(args, filter_size, num_features, bias, bias_start=0.0):
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index b10757df47..245fe07f2b 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -53,7 +53,6 @@ py_test(
     size = "small",
     srcs = ["python/saved_model/reader_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     visibility = ["//visibility:private"],
     deps = [
         ":saved_model_py",
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index 6adbb8be40..03fe31abf7 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -299,13 +299,12 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     finished, start_inputs = self._finished, self._start_inputs
 
-    dtype = nest.flatten(self._initial_cell_state)[0].dtype
     log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
         array_ops.zeros([self._batch_size], dtype=dtypes.int32),
         depth=self._beam_width,
-        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
-        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
-        dtype=dtype)
+        on_value=0.0,
+        off_value=-np.Inf,
+        dtype=nest.flatten(self._initial_cell_state)[0].dtype)
 
     initial_state = BeamSearchDecoderState(
         cell_state=self._initial_cell_state,
diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD
index 3ad88a8a22..67011c8fef 100644
--- a/tensorflow/contrib/session_bundle/BUILD
+++ b/tensorflow/contrib/session_bundle/BUILD
@@ -165,7 +165,6 @@ py_test(
     name = "gc_test",
     srcs = ["gc_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     visibility = ["//visibility:private"],
     deps = [
         ":gc",
diff --git a/tensorflow/contrib/slim/python/slim/data/BUILD b/tensorflow/contrib/slim/python/slim/data/BUILD
index 7aa1684839..5daabbd62e 100644
--- a/tensorflow/contrib/slim/python/slim/data/BUILD
+++ b/tensorflow/contrib/slim/python/slim/data/BUILD
@@ -61,7 +61,6 @@ py_test(
     name = "dataset_data_provider_test",
     srcs = ["dataset_data_provider_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":dataset",
         ":dataset_data_provider",
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 07b6b1f142..1e4cc3f095 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -553,6 +553,7 @@ py_test(
     srcs = ["client/random_forest_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_windows",
         "nomac",  # b/63258195
         "notsan",
     ],
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index db2e000ef8..d833744d0c 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -9,7 +9,6 @@ exports_files(["LICENSE"])
 
 # For platform specific build config
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-load("//tensorflow:tensorflow.bzl", "py_test")
 
 tf_proto_library(
     name = "protos_all",
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 906cc3f034..c832c6f2e0 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -83,7 +83,6 @@ cc_library(
         "kernels/trt_engine_op.h",
     ],
     copts = tf_copts(),
-    visibility = ["//visibility:public"],
     deps = [
         ":trt_logging",
         ":trt_resources",
@@ -155,7 +154,6 @@ py_library(
     deps = [
         ":trt_convert_py",
         ":trt_ops_py",
-        "//tensorflow/python:errors",
     ],
 )
 
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
index 461e627e99..dfcce0fd00 100644
--- a/tensorflow/contrib/tensorrt/README.md
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -2,8 +2,7 @@ Using TensorRT in TensorFlow
 ============================
 
 This module provides necessary bindings and introduces TRT_engine_op
-operator that wraps a subgraph in TensorRT. This is still a work in progress
-but should be useable with most common graphs.
+operator that wraps a subgraph in TensorRT.
 
 Compilation
 -----------
@@ -16,10 +15,26 @@ configure script should find the necessary components from the system
 automatically. If installed from tar packages, user has to set path to
 location where the library is installed during configuration.
 
-```shell
+
+```
 bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
 bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
 ```
 
 After the installation of tensorflow package, TensorRT transformation
-will be available. An example use can be found in test/test_tftrt.py directory
+will be available. An example use is shown below.
+
+```python
+import tensorflow as tf
+import tensorflow.contrib.tensorrt as trt
+#... create and train or load model
+gdef = sess.graph.as_graph_def()
+trt_gdef = trt.create_inference_graph(
+    gdef, #original graph_def
+    ["output"], #name of output node(s)
+    max_batch_size, #maximum batch size to run the inference
+    max_workspace_size_bytes) # max memory for TensorRT to use
+tf.reset_default_graph()
+tf.import_graph_def(graph_def=trt_gdef)
+#...... run inference
+```
diff --git a/tensorflow/contrib/tensorrt/__init__.py b/tensorflow/contrib/tensorrt/__init__.py
index 140ad48282..fd551d70b4 100644
--- a/tensorflow/contrib/tensorrt/__init__.py
+++ b/tensorflow/contrib/tensorrt/__init__.py
@@ -18,18 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import errors
-
-# pylint: disable=unused-import,wildcard-import,g-import-not-at-top
-try:
-  from tensorflow.contrib.tensorrt.python import *
-except errors.NotFoundError as e:
-  no_trt_message = (
-      '**** Failed to initialize TensorRT. This is either because the TensorRT'
-      ' installation path is not in LD_LIBRARY_PATH, or because you do not have'
-      ' it installed. If not installed, please go to'
-      ' https://developer.nvidia.com/tensorrt to download and install'
-      ' TensorRT ****')
-  print(no_trt_message)
-  raise e
-# pylint: enable=unused-import,wildcard-import,g-import-not-at-top
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.tensorrt.python import *
+# pylint: enable=unused-import,wildcard-import
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index eea8c8efa2..970f810473 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
 
-#include <list>
 #include <map>
 #include <set>
 #include <unordered_map>
@@ -49,29 +48,13 @@ namespace tensorrt {
 namespace convert {
 namespace {
 
-bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
+static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
   // LINT.IfChange
   // TODO(jie): Segmentation shouldn't associated with op name.
   //            Split it into a registration for each kernel.
   static const std::set<string> candidate_ops = {
-      "Identity",
-      "Const",
-      "Conv2D",
-      "MaxPool",
-      "BiasAdd",
-      "Relu",
-      "Add",
-      "Mul",
-      "Sub",
-      "Rsqrt",
-      "Pad",
-      "Mean",
-      "AvgPool",
-      "ConcatV2",
-      "DepthwiseConv2dNative",
-      "FusedBatchNorm",
-      "FusedBatchNormV2",
-      // TODO(ben,jie): ...
+      "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
+      "Add",      "Mul",   "Sub",    "Rsqrt",   "Pad"  // "Placeholder" ,"Mean"
   };
   // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
   return candidate_ops.count(node_def.op());
@@ -86,8 +69,6 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
       if (!subgraph_node_ids.count(edge->src()->id()) &&
           !edge->src()->IsSource()) {
         incoming_edges->insert(edge);
-      } else {
-        VLOG(2) << edge->src()->name() << " N, ";
       }
     }
   }
@@ -101,10 +82,7 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
     for (const tensorflow::Edge* edge : node->out_edges()) {
       if (!subgraph_node_ids.count(edge->dst()->id()) &&
           !edge->dst()->IsSink()) {
-        VLOG(2) << edge->dst()->name() << " Y, ";
         outgoing_edges->insert(edge);
-      } else {
-        VLOG(2) << edge->dst()->name() << " N, ";
       }
     }
   }
@@ -131,150 +109,74 @@ std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
   }
   return result;
 }
-// TODO(sami): convert references to pointers
-struct ConvertGraphParams {
-  ConvertGraphParams(
-      tensorflow::Graph& inp_graph,
-      const std::vector<string>& output_node_names,
-      const std::set<int>& subgraph_node_id_numbers,
-      size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes,
-      const tensorflow::grappler::GraphProperties& current_graph_properties,
-      std::unordered_map<string, std::pair<int, string>>* output_edges,
-      int engine_precision_mode)
-      : graph(inp_graph),
-        output_names(output_node_names),
-        subgraph_node_ids(subgraph_node_id_numbers),
-        max_batch_size(max_supported_batch_size),
-        max_workspace_size_bytes(max_consumed_workspace_size_bytes),
-        graph_properties(current_graph_properties),
-        output_edge_map(output_edges),
-        precision_mode(engine_precision_mode) {}
-  tensorflow::Graph& graph;
-  const std::vector<string>& output_names;
-  const std::set<int>& subgraph_node_ids;
-  size_t max_batch_size;
-  size_t max_workspace_size_bytes;
-  const tensorflow::grappler::GraphProperties& graph_properties;
-  std::unordered_map<string, std::pair<int, string>>* output_edge_map;
-  int precision_mode;
-  std::vector<std::pair<int, int>> subgraph_inputs;
-  std::vector<std::pair<int, int>> subgraph_outputs;
+
+tensorflow::Status ConvertSubGraphToTensorRT(
+    const std::vector<string>& output_names,
+    const std::set<int>& subgraph_node_ids,
+    size_t max_batch_size,  // Max batch size that engine will be created for
+    // Max amount of memory that engine will be allowed to consume, in bytes
+    size_t max_workspace_size_bytes,
+    const tensorflow::grappler::GraphProperties& graph_properties,
+    tensorflow::Graph* graph) {
   tensorflow::EdgeSet subgraph_incoming_edges;
-  tensorflow::EdgeSet subgraph_outgoing_edges;
-};
+  GetSubGraphIncomingEdges(*graph, subgraph_node_ids, &subgraph_incoming_edges);
+
+  std::vector<std::pair<int, int>> subgraph_inputs;
 
-static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
-  GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids,
-                           &p->subgraph_incoming_edges);
-  for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) {
-    p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
+  // Collect inputs by looking for incoming edges
+  for (const tensorflow::Edge* edge : subgraph_incoming_edges) {
+    subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
   }
-  auto output_name_to_index_map = BuildTensorNameMap(p->output_names);
   std::set<std::pair<int, int>> subgraph_outputs_set;
   // Collect outputs referenced from output_names
-  for (int node_id : p->subgraph_node_ids) {
-    tensorflow::Node* node = p->graph.FindNodeId(node_id);
+  auto output_name_to_index_map = BuildTensorNameMap(output_names);
+  for (int node_id : subgraph_node_ids) {
+    tensorflow::Node* node = graph->FindNodeId(node_id);
     if (output_name_to_index_map.count(node->name())) {
       for (int index : output_name_to_index_map.at(node->name())) {
         subgraph_outputs_set.insert({node_id, index});
       }
     }
   }
-  GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids,
-                           &p->subgraph_outgoing_edges);
-  for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) {
+  // Collect outputs referenced from outgoing edges
+  tensorflow::EdgeSet subgraph_outgoing_edges;
+  GetSubGraphOutgoingEdges(*graph, subgraph_node_ids, &subgraph_outgoing_edges);
+  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
     subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
   }
-  p->subgraph_outputs.reserve(subgraph_outputs_set.size());
-  p->subgraph_outputs.insert(p->subgraph_outputs.begin(),
-                             subgraph_outputs_set.begin(),
-                             subgraph_outputs_set.end());
-  return tensorflow::Status::OK();
-};
-
-tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
-  TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params));
+  // Impose an ordering on the outputs
+  std::vector<std::pair<int, int>> subgraph_outputs(
+      subgraph_outputs_set.begin(), subgraph_outputs_set.end());
+  // Build TensorRT node and add it to the graph
   tensorflow::NodeDef trt_node_def;
-  SubGraphParams s(params->graph, params->subgraph_node_ids,
-                   params->subgraph_inputs, params->subgraph_outputs,
-                   params->max_batch_size, params->max_workspace_size_bytes,
-                   params->graph_properties, params->output_edge_map,
-                   &trt_node_def, params->precision_mode);
-  TF_RETURN_IF_ERROR(InjectCalibrationNode(s));
+  TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(
+      *graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
+      max_batch_size, max_workspace_size_bytes, graph_properties,
+      &trt_node_def));
   tensorflow::Status status;
-  tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
-
-  TF_RETURN_IF_ERROR(status);
-
-  for (auto in_edge :
-       params->subgraph_incoming_edges) {  // loop over incoming edges and
-                                           // attach them to calib node
-    // tensorflow::Node* src_node = in_edge->src();
-    auto src_output = in_edge->src_output();
-    auto dst_node = in_edge->dst();
-    auto dst_input = in_edge->dst_input();
-    VLOG(1) << " update edge " << trt_node->name() << ":" << src_output
-            << " -> " << dst_node->name() << ":" << dst_input;
-    TF_RETURN_IF_ERROR(
-        params->graph.UpdateEdge(trt_node, src_output, dst_node, dst_input));
-  }
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
-  TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params));
-  tensorflow::NodeDef trt_node_def;
-
-  SubGraphParams s(params->graph, params->subgraph_node_ids,
-                   params->subgraph_inputs, params->subgraph_outputs,
-                   params->max_batch_size, params->max_workspace_size_bytes,
-                   params->graph_properties, params->output_edge_map,
-                   &trt_node_def, params->precision_mode);
-  TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(s));
-  tensorflow::Status status;
-  tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
-
-  // AddNode does not wire edges.
-  // Re-map incoming edges to use the new TRT node instead of the orig subgraph
-  std::map<std::pair<int, int>, int> subgraph_edge_to_input_map;
-  for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) {
-    subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i});
-  }
-  for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) {
-    std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
-    int new_src_output = subgraph_edge_to_input_map.at(old_src);
-    params->graph.AddEdge(edge->src(), edge->src_output(), trt_node,
-                          new_src_output);
-    params->graph.RemoveEdge(edge);
-  }
-
-  VLOG(2) << "new wiring edges: " << trt_node->in_edges().size();
-  for (const tensorflow::Edge* edge : trt_node->in_edges()) {
-    VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
-  }
-
+  tensorflow::Node* trt_node = graph->AddNode(trt_node_def, &status);
   TF_RETURN_IF_ERROR(status);
 
   // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
   std::map<std::pair<int, int>, int> subgraph_edge_to_output_map;
-  for (size_t i = 0; i < params->subgraph_outputs.size(); ++i) {
-    subgraph_edge_to_output_map.insert({params->subgraph_outputs.at(i), i});
+  for (size_t i = 0; i < subgraph_outputs.size(); ++i) {
+    subgraph_edge_to_output_map.insert({subgraph_outputs.at(i), i});
   }
   TF_RETURN_IF_ERROR(status);
-  for (const tensorflow::Edge* edge : params->subgraph_outgoing_edges) {
+  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
-    TF_RETURN_IF_ERROR(params->graph.UpdateEdge(
-        trt_node, new_src_output, edge->dst(), edge->dst_input()));
+    TF_RETURN_IF_ERROR(graph->UpdateEdge(trt_node, new_src_output, edge->dst(),
+                                         edge->dst_input()));
   }
   // Remove the original subgraph
-  for (int node_id : params->subgraph_node_ids) {
-    tensorflow::Node* node = params->graph.FindNodeId(node_id);
+  for (int node_id : subgraph_node_ids) {
+    tensorflow::Node* node = graph->FindNodeId(node_id);
     // Don't remove the input placeholders
     if (node->type_string() == "Placeholder") {
       continue;
     }
-    params->graph.RemoveNode(node);
+    graph->RemoveNode(node);
   }
   return tensorflow::Status::OK();
 }
@@ -292,39 +194,12 @@ tensorflow::Status BuildNodeMap(
 }
 
 }  // namespace
-tensorflow::Status ConvertCalibGraphToInferGraph(
-    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph) {
-  VLOG(0) << "Starting Calib Conversion";
-  tensorflow::Graph graph(tensorflow::OpRegistry::Global());
-  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
-      tensorflow::GraphConstructorOptions(), graph_def, &graph));
-  //  get calib nodes
-  std::vector<tensorflow::Node*> calib_nodes;
-  for (auto node : graph.op_nodes()) {
-    if (node->type_string() == "TRTCalibOp") {
-      VLOG(1) << "Found Calib Node";
-      calib_nodes.push_back(node);
-    }
-  }
-  VLOG(0) << "Num Calib nodes in graph= " << calib_nodes.size();
-  if (calib_nodes.size() == 0)
-    return tensorflow::errors::FailedPrecondition(
-        "Graph doesn't contain any calibration nodes!."
-        " Please generate calibration graph and run calibration first");
-  for (auto n : calib_nodes) {
-    TF_RETURN_IF_ERROR(
-        tensorrt::convert::ConvertCalibrationNodeToEngineNode(graph, n));
-  }
-  graph.ToGraphDef(infer_graph);
-  return tensorflow::Status::OK();
-}
 
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
-    int precision_mode = FP32MODE, int minimum_segment_size = 3) {
-  // optimization pass
+    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def) {
+  // Optimization pass
   tensorflow::grappler::GrapplerItem item;
   item.fetch = output_names;
   tensorflow::GraphDef gdef;
@@ -334,23 +209,16 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   tensorflow::grappler::LayoutOptimizer optimizer;
   tensorflow::grappler::Cluster* cluster;
 
-  // virtual cluster
+  // Virtual cluster
   tensorflow::DeviceProperties device_properties;
-
   device_properties.set_type("GPU");
   device_properties.mutable_environment()->insert({"architecture", "6"});
   cluster =
       new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
 
-  // single machine
-  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
-  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
-  VLOG(2) << "cpu_cores: " << num_cpu_cores;
-  VLOG(2) << "gpus: " << num_gpus;
-
   TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
 
-  // constant folding
+  // Constant folding
   item.graph = gdef;
   tensorflow::grappler::ConstantFolding fold(nullptr);
   TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
@@ -358,6 +226,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   // AJ refactoring shape inference through grappler/GraphProperties.
   tensorflow::grappler::GraphProperties static_graph_properties(item);
   TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(false));
+
   // Build full graph
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
                                              gdef.library());
@@ -374,7 +243,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   }
 
   // TODO(sami): this should be passed as a knob!!!!
-  segment_options.minimum_segment_size = minimum_segment_size;
+  segment_options.minimum_segment_size = 2;
   tensorflow::tensorrt::segment::SegmentNodesVector segments;
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
       gdef, IsTensorRTCandidate, segment_options, &segments));
@@ -383,37 +252,14 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   }
   std::unordered_map<string, tensorflow::Node*> node_map;
   TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
-  std::unordered_map<string, std::pair<int, string>> output_edge_map;
-  int count = 0;
-  float total_num_nodes_in_segments = 0.;
-  for (auto s : segments) {
-    total_num_nodes_in_segments += s.size();
-  }
   for (const std::set<string>& subgraph_node_names : segments) {
     std::set<int> subgraph_node_ids;
-    size_t max_mem_per_engine =
-        max_workspace_size_bytes *
-        ((float)subgraph_node_names.size() / total_num_nodes_in_segments);
-    std::stringstream oss;
     for (const string& node_name : subgraph_node_names) {
-      oss << " " << node_name;
       subgraph_node_ids.insert(node_map.at(node_name)->id());
     }
-    VLOG(2) << "Subgraph nodes" << oss.str();
-    ConvertGraphParams p(graph, output_names, subgraph_node_ids, max_batch_size,
-                         max_mem_per_engine, static_graph_properties,
-                         &output_edge_map, precision_mode);
-    if (precision_mode == INT8MODE) {
-      TF_RETURN_IF_ERROR(GetCalibNode(&p));
-    } else {
-      tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
-      if (status != tensorflow::Status::OK()) {
-        LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
-                     << " due to: \n"
-                     << status.ToString() << " SKIPPING......";
-      }
-      count++;
-    }
+    TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
+        output_names, subgraph_node_ids, max_batch_size,
+        max_workspace_size_bytes, static_graph_properties, &graph));
   }
   graph.ToGraphDef(new_graph_def);
   return tensorflow::Status::OK();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index e1596e89e2..154ad3f2e8 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -28,11 +28,6 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
-// This method converts an already generated calibration graph which was used in
-// calibration runs to an inference graph
-tensorflow::Status ConvertCalibGraphToInferGraph(
-    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def);
-
 // max_batch_size: maximum batch size which can be used for inference for
 //                 optimization targets inference run with max batch size.
 // max_workspace_size_bytes: The upper bound of memory allowence for
@@ -40,8 +35,7 @@ tensorflow::Status ConvertCalibGraphToInferGraph(
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<string>& output_names, size_t max_batch_size,
-    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
-    int precision_mode, int minimum_segment_size);
+    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def);
 
 }  // namespace convert
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 75a3c3d034..9ee717dd7f 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -24,10 +24,6 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
-#include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"  // NOLINT
 #include "tensorflow/core/framework/types.h"
@@ -36,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tensor_coding.h"
@@ -44,6 +39,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorrt/include/NvInfer.h"
 
 //  Check if the types are equal. Cast to int first so that failure log message
@@ -53,7 +49,6 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
-using ::tensorflow::strings::StrCat;
 
 namespace {
 
@@ -70,8 +65,7 @@ inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
       *trt_dtype = nvinfer1::DataType::kHALF;
       break;
     default:
-      return tensorflow::errors::InvalidArgument(
-          "Unsupported data type " + tensorflow::DataTypeString(tf_dtype));
+      return tensorflow::errors::InvalidArgument("Unsupported data type");
   }
   return tensorflow::Status::OK();
 }
@@ -118,18 +112,6 @@ static std::vector<std::pair<int, int>> CreateSamePadding(
   return padding;
 }
 
-string GetCommonNameScope(const string& op_name_a, const string& op_name_b) {
-  size_t last_scope_separator = 0;
-  for (size_t i = 0; i < std::min(op_name_a.size(), op_name_b.size()); ++i) {
-    if (op_name_a[i] != op_name_b[i]) {
-      break;
-    } else if (op_name_a[i] == '/') {
-      last_scope_separator = i + 1;
-    }
-  }
-  return op_name_a.substr(0, last_scope_separator);
-}
-
 class TRT_ShapedWeights {
  public:
   TRT_ShapedWeights(tensorflow::DataType type, const void* values,
@@ -263,11 +245,6 @@ std::vector<int> TFAttrs::get<std::vector<int>>(string key) const {
 }
 
 template <>
-std::vector<string> TFAttrs::get<std::vector<string>>(string key) const {
-  auto attr = this->at(key)->list().s();
-  return std::vector<string>(attr.begin(), attr.end());
-}
-template <>
 nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
   auto values = this->get<std::vector<int>>(key);
   nvinfer1::Dims dims;
@@ -289,17 +266,6 @@ tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
   return this->at(key)->type();
 }
 
-template <>
-float TFAttrs::get<float>(string key) const {
-  return this->at(key)->f();
-}
-
-template <>
-bool TFAttrs::get<bool>(string key) const {
-  return this->at(key)->b();
-}
-
-// TODO(jie): reorder4 & reorder2 should be merged?
 template <typename T>
 void Reorder4(nvinfer1::DimsNCHW shape, const T* idata,
               nvinfer1::DimsNCHW istrides, T* odata,
@@ -317,87 +283,29 @@ void Reorder4(nvinfer1::DimsNCHW shape, const T* idata,
   }
 }
 
-template <typename T>
-void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
-              T* odata, nvinfer1::DimsHW ostrides) {
-  for (int h = 0; h < shape.h(); ++h) {
-    for (int w = 0; w < shape.w(); ++w) {
-      odata[h * ostrides.h() + w * ostrides.w()] =
-          idata[h * ostrides.h() + w * ostrides.w()];
-    }
-  }
-}
-
-// TODO(jie): fallback to tensorflow!!
-void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
-                   TRT_ShapedWeights* oweights) {
-  int c = iweights.shape_.d[0];
-  int k = iweights.shape_.d[1];
-  oweights->shape_.d[0] = k;
-  oweights->shape_.d[1] = c;
-  nvinfer1::DimsHW istrides = {1, k};
-  nvinfer1::DimsHW ostrides = {c, 1};
-  switch (iweights.type_) {
-    case tensorflow::DataType::DT_FLOAT: {
-      Reorder2({k, c}, static_cast<float const*>(iweights.GetValues()),
-               istrides,
-               static_cast<float*>(const_cast<void*>(oweights->GetValues())),
-               ostrides);
-      break;
-    }
-    case tensorflow::DataType::DT_HALF: {
-      Reorder2(
-          {k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
-          istrides,
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
-          ostrides);
-      break;
-    }
-    default:
-      LOG(FATAL) << "Unsupported type in reorder expected fp32 or fp16 but got "
-                 << DataTypeString(iweights.type_);
-  }
-}
-
 void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
-                       TRT_ShapedWeights* oweights, int num_groups) {
+                       TRT_ShapedWeights* oweights) {
   CHECK_EQ(iweights.type_, oweights->type_);
   CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
   int r = iweights.shape_.d[0];
   int s = iweights.shape_.d[1];
-  // TRT requires GKcRS, while TF depthwise has RSCK
-  //   where c=1, C=G
-  VLOG(2) << "num_groups: " << num_groups;
-  int c = iweights.shape_.d[2] / num_groups;
-  VLOG(2) << "c" << iweights.shape_.d[2] << " then " << c;
-  int k = iweights.shape_.d[3] * num_groups;
-  VLOG(2) << "k" << iweights.shape_.d[3] << " then " << k;
-  oweights->shape_.d[0] = k / num_groups;
-  oweights->shape_.d[1] = c * num_groups;
+  int c = iweights.shape_.d[2];
+  int k = iweights.shape_.d[3];
+  oweights->shape_.d[0] = k;
+  oweights->shape_.d[1] = c;
   oweights->shape_.d[2] = r;
   oweights->shape_.d[3] = s;
   nvinfer1::DimsNCHW istrides = {1, k, s * k * c, c * k};
   nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1};
   switch (iweights.type_) {
-    case tensorflow::DataType::DT_FLOAT: {
+    case tensorflow::DataType::DT_FLOAT:
       Reorder4({k, c, r, s}, static_cast<float const*>(iweights.GetValues()),
                istrides,
                static_cast<float*>(const_cast<void*>(oweights->GetValues())),
                ostrides);
       break;
-    }
-    case tensorflow::DataType::DT_HALF: {
-      Reorder4(
-          {k, c, r, s}, static_cast<Eigen::half const*>(iweights.GetValues()),
-          istrides,
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
-          ostrides);
-      break;
-    }
-
     default:
-      LOG(FATAL) << "Unsupported type, expected fp32 or fp16 but got "
-                 << DataTypeString(iweights.type_);
+      LOG(FATAL) << "!!!!!!!!!!!!!!!!!!!!!!!!broke!!!!!!!!!!!!";
   }
 }
 
@@ -415,11 +323,12 @@ inline std::shared_ptr<T> infer_object(T* obj) {
   return std::shared_ptr<T>(obj, InferDeleter());
 }
 
+// Logger for GIE info/warning/errors
 class Converter;
 
 using OpConverter =
     std::function<tensorflow::Status(Converter&, const tensorflow::NodeDef&,
-                                     const std::vector<TRT_TensorOrWeights>&,
+                                     std::vector<TRT_TensorOrWeights> const&,
                                      std::vector<TRT_TensorOrWeights>*)>;
 
 class Converter {
@@ -427,57 +336,34 @@ class Converter {
   std::unordered_map<string, OpConverter> op_registry_;
   nvinfer1::INetworkDefinition* trt_network_;
   std::list<std::vector<uint8_t>> temp_bufs_;
-  tensorflow::tensorrt::TRTWeightStore* weight_store_;
-  bool fp16_;
+
   void register_op_converters();
+
   std::vector<TRT_TensorOrWeights> get_inputs(
       const tensorflow::NodeDef& node_def) {
     std::vector<TRT_TensorOrWeights> inputs;
-    for (auto const& input_name : node_def.input()) {
-      /*************************************************************************
-       * TODO(jie) handle case 1) here
-       * Normalizes the inputs and extracts associated metadata:
-       * 1) Inputs can contain a colon followed by a suffix of characters.
-       *    That suffix may be a single number (e.g. inputName:1) or several
-       *    word characters separated from a number by a colon
-       *    (e.g. inputName:foo:1). The
-       *    latter case is used to denote inputs and outputs of functions.
-       * 2) Control dependency inputs contain caret at the beginning and we
-       *    remove this and annotate the edge as a control dependency.
-       ************************************************************************/
-      string name = input_name[0] == '^' ? input_name.substr(1) : input_name;
-      auto first = name.find_first_of(':');
-      if (first != string::npos && first + 2 == name.size() &&
-          name[first + 1] == '0')
-        name.erase(first);
-
-      VLOG(2) << "retrieve input: " << name;
-      if (trt_tensors_.count(name)) {
-        inputs.push_back(trt_tensors_.at(name));
-      } else {
-        LOG(FATAL) << "input: " << name << " not availabled for node at, "
-                   << node_def.name();
-      }
+    for (const auto& input_name : node_def.input()) {
+      VLOG(2) << "Retrieve input: " << input_name;
+      inputs.push_back(trt_tensors_.at(input_name));
     }
     return inputs;
   }
 
  public:
-  explicit Converter(nvinfer1::INetworkDefinition* trt_network,
-                     tensorflow::tensorrt::TRTWeightStore* ws, bool fp16)
-      : trt_network_(trt_network), weight_store_(ws), fp16_(fp16) {
+  explicit Converter(nvinfer1::INetworkDefinition* trt_network)
+      : trt_network_(trt_network) {
     this->register_op_converters();
   }
-  tensorflow::tensorrt::TRTWeightStore* weight_store() { return weight_store_; }
+
   TRT_ShapedWeights get_temp_weights(tensorflow::DataType type,
                                      nvinfer1::Dims shape) {
     TRT_ShapedWeights weights(type, nullptr, shape);
     // TODO(jie): check weights size_bytes. 0 means type error
-    weight_store_->store_.push_back(std::vector<uint8_t>(weights.size_bytes()));
-    weights.SetValues(weight_store_->store_.back().data());
+    temp_bufs_.push_back(std::vector<uint8_t>(weights.size_bytes()));
+    weights.SetValues(temp_bufs_.back().data());
     return weights;
   }
-  bool isFP16() { return fp16_; };
+
   TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) {
     return this->get_temp_weights(weights.type_, weights.shape_);
   }
@@ -496,7 +382,7 @@ class Converter {
       TRT_TensorOrWeights output = outputs.at(i);
       // TODO(jie): tf protobuf seems to be omitting the :0 suffix
       string output_name = node_def.name();
-      if (i != 0) output_name = StrCat(output_name, ":", i);
+      if (i != 0) output_name = output_name + ":" + std::to_string(i);
       if (output.is_tensor()) {
         output.tensor()->setName(output_name.c_str());
       }
@@ -562,7 +448,7 @@ struct LambdaFactory {
     switch (op) {
       case OP_CATEGORY::RSQRT: {
         VLOG(2) << "RSQRT GETS DONE";
-        return [](T t) -> T { return 1.0 / sqrt(t); };
+        return [](T t) -> T { return 1.0 / std::sqrt(t); };
       }
       case OP_CATEGORY::NEG:
         return [](T t) -> T { return -t; };
@@ -648,22 +534,6 @@ struct LambdaFactory {
   }
 };
 
-template <>
-std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
-  switch (op) {
-    case OP_CATEGORY::RSQRT: {
-      VLOG(2) << "RSQRT GETS DONE";
-      return [](Eigen::half t) -> Eigen::half {
-        return Eigen::half(1.0 / sqrt(float(t)));
-      };
-    }
-    case OP_CATEGORY::NEG:
-      return [](Eigen::half t) -> Eigen::half { return -t; };
-    default:
-      VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
-      return nullptr;
-  }
-}
 tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
                                 TRT_ShapedWeights* oweights,
                                 LambdaFactory unary_op) {
@@ -675,14 +545,6 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
       std::transform(inp, inp + iweights.count(), oup, unary_op.unary<float>());
       break;
     }
-    case tensorflow::DataType::DT_HALF: {
-      auto inp = static_cast<Eigen::half const*>(iweights.GetValues());
-      auto oup =
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues()));
-      std::transform(inp, inp + iweights.count(), oup,
-                     unary_op.unary<Eigen::half>());
-      break;
-    }
     default:
       return tensorflow::errors::Unimplemented(
           "Data type not supported: " +
@@ -726,32 +588,6 @@ tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
       }
       break;
     }
-    case tensorflow::DataType::DT_HALF: {
-      auto inp_l = static_cast<const Eigen::half*>(iweights_l.GetValues());
-      auto inp_r = static_cast<const Eigen::half*>(iweights_r.GetValues());
-      auto oup =
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues()));
-
-      if (iweights_l.count() != iweights_r.count()) {
-        // We only supports broadcast of RankZero
-        if (iweights_l.count() == 1) {
-          VLOG(2) << "I bet it is not working!" << (*inp_l);
-          std::transform(inp_r, inp_r + iweights_r.count(), oup,
-                         binary_op.broadcast_l<Eigen::half>(*inp_l));
-        } else if (iweights_r.count() == 1) {
-          VLOG(2) << "I bet it is not working!" << (*inp_r);
-          std::transform(inp_l, inp_l + iweights_l.count(), oup,
-                         binary_op.broadcast_r<Eigen::half>(*inp_r));
-        } else {
-          return tensorflow::errors::Unimplemented(
-              "Binary op with non-rankZero broadcast not supported");
-        }
-      } else {
-        std::transform(inp_l, inp_l + iweights_l.count(), inp_r, oup,
-                       binary_op.binary<Eigen::half>());
-      }
-      break;
-    }
     default:
       return tensorflow::errors::Unimplemented(
           "Data type not supported: " +
@@ -763,7 +599,7 @@ tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
 
 tensorflow::Status ConstantFoldUnary(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   TRT_ShapedWeights weights_input = inputs.at(0).weights();
 
@@ -777,12 +613,13 @@ tensorflow::Status ConstantFoldUnary(
   CHECK_EQ(weights_input.type_,
            TFAttrs(node_def).get<tensorflow::DataType>("T"));
 
+  // Maybe I should do a switch
   LambdaFactory unary_op;
   if (node_def.op() == "Rsqrt") {
     // Compute rsqrt
     unary_op.op = LambdaFactory::OP_CATEGORY::RSQRT;
     auto ret = UnaryCompute(weights_input, &weights_output, unary_op);
-    // Pass the output
+    // PAss the output
     if (ret == tensorflow::Status::OK()) {
       outputs->push_back(TRT_TensorOrWeights(weights_output));
     }
@@ -794,11 +631,11 @@ tensorflow::Status ConstantFoldUnary(
 }
 
 // TODO(jie,ben) broadcast is needed yet not implemented
-// Let's get the simple stuff working first. Maybe we should fall back to TF
+// Let's get the simple stuff working first. Maybe we should fall bakc to TF
 //   approach for constant folding
 tensorflow::Status ConstantFoldBinary(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   TRT_ShapedWeights weights_input_l = inputs.at(0).weights();
   TRT_ShapedWeights weights_input_r = inputs.at(1).weights();
@@ -811,12 +648,12 @@ tensorflow::Status ConstantFoldBinary(
         "Binary op implicit broadcast not supported: " + node_def.op());
 
   // TODO(jie): constant fold should really fall back to TF.
-  int num_dims = weights_input_l.shape_.nbDims;
+  int nb_dims = weights_input_l.shape_.nbDims;
   nvinfer1::Dims output_shape;
-  output_shape.nbDims = num_dims;
-  VLOG(2) << "nb_dims: " << num_dims
+  output_shape.nbDims = nb_dims;
+  VLOG(2) << "nb_dims: " << nb_dims
           << ", the other: " << weights_input_r.shape_.nbDims;
-  for (int i = 0; i < num_dims; i++) {
+  for (int i = 0; i < nb_dims; i++) {
     if (weights_input_l.shape_.d[i] == weights_input_r.shape_.d[i]) {
       output_shape.d[i] = weights_input_l.shape_.d[i];
     } else if (weights_input_l.shape_.d[i] == 1 ||
@@ -841,6 +678,7 @@ tensorflow::Status ConstantFoldBinary(
   // Allocate output weights
   TRT_ShapedWeights weights_output = ctx.get_temp_weights(dtype, output_shape);
 
+  // Maybe I should do a switch
   LambdaFactory binary_op;
   if (node_def.op() == "Sub") {
     binary_op.op = LambdaFactory::OP_CATEGORY::SUB;
@@ -874,90 +712,48 @@ tensorflow::Status BinaryTensorOpWeight(
   // Maybe this part has to be moved into the block of rsqrt later
 
   // Check type consistency
+  auto dtype = TFAttrs(node_def).get<nvinfer1::DataType>("T");
+  CHECK_EQ_TYPE(tensor->getType(), dtype);  // Cast to int for error messages
   nvinfer1::DataType ttype;
   TF_CHECK_OK(ConvertDType(weights.type_, &ttype));
+  CHECK_EQ_TYPE(ttype, dtype);  // Cast to int for error message
 
   // Check scale mode
   auto dims_w = weights.shape_;
   auto dims_t = tensor->getDimensions();
 
-  // default to element-wise
+  // Default to channel-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
-  // TODO(jie): maybe use a permuatation instead to support more cases;
-  bool permutation_flag = false;
-
   if (weights.count() == 1) {
     VLOG(2) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
-    // no broadcasting on Batch dimension;
-    VLOG(2) << "WEIGHTS DIM: " << dims_w.nbDims
-            << " tensor DIM: " << dims_t.nbDims;
-    if (dims_w.nbDims == dims_t.nbDims + 1) {
-      if (dims_w.d[0] == 1) {
-        for (int i = 1; i < dims_w.nbDims; i++) {
-          dims_w.d[i - 1] = dims_w.d[i];
-        }
-        dims_w.nbDims--;
-      } else {
-        return tensorflow::errors::InvalidArgument(
-            "Binary op cannot operate on batch, " + node_def.name());
-      }
-    }
+    // No broadcasting on Batch dimension;
+    assert(dims_w.d[0] == 1);
 
-    if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) {
-      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      // default is element;
-      for (int i = 1; i < dims_w.nbDims; i++) {
-        if (dims_w.d[i] != dims_t.d[i]) {
-          // if dimension does not match, switch back to channel;
-          VLOG(2) << "channel";
-          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-          break;
-        }
-      }
-      // if channel as candidate, validate it
-      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
-        for (int i = 1; i < dims_w.nbDims; i++) {
-          if (dims_w.d[i] != 1)
-            return tensorflow::errors::InvalidArgument(
-                "Weight shape not compatible at, " + node_def.name());
-        }
-      } else {
-        VLOG(2) << "elementwise";
+    // Broadcasting on Channel dimension only allowed in kUNIFORM
+    assert(dims_w.d[1] == dims_t.d[0]);
+    assert(dims_w.nbDims == dims_t.nbDims);
+
+    // Default is element;
+    for (int i = 2; i < dims_w.nbDims; i++) {
+      if (dims_w.d[i] != dims_t.d[i - 1]) {
+        scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+        break;
       }
-    } else if (dims_w.nbDims == 1 &&
-               dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) {
-      // channel wise and broadcast required;
-      permutation_flag = true;
-      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-    } else {
-      return tensorflow::errors::InvalidArgument(
-          "Weight shape not compatible at, " + node_def.name());
     }
-  }
-
-  // transpose last dimension
-  std::vector<int> permutation(dims_t.nbDims + 1);
-  if (permutation_flag) {
-    if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) {
-      // we swap the last dimension into channel for trt.
-      // because of tensorflow default broadcasting rules.
-      for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
-        permutation[i] = i;
+    if (scale_mode == nvinfer1::ScaleMode::kELEMENTWISE) {
+      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+      for (int i = 2; i < dims_w.nbDims; i++) {
+        if (dims_w.d[i] != 1)
+          return tensorflow::errors::InvalidArgument(
+              "Weight shape not compatible at, " + node_def.name());
       }
-      permutation[1] = dims_t.nbDims;
-      permutation[dims_t.nbDims] = 1;
-      tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
-                                   permutation);
-    } else {
-      return tensorflow::errors::InvalidArgument(
-          "Transpose cannot be applied, " + node_def.name());
     }
   }
 
-  // prepare weights
+  // Prepare weights
   TRT_ShapedWeights shift_weights(weights.type_);
   TRT_ShapedWeights scale_weights(weights.type_);
   TRT_ShapedWeights power_weights(weights.type_);
@@ -983,26 +779,88 @@ tensorflow::Status BinaryTensorOpWeight(
       scale_weights, power_weights);
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  // transpose back dimension
-  if (permutation_flag) {
-    output_tensor = ctx.TransposeTensor(output_tensor, permutation);
-  }
 
   // Pass the output
   outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
-enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
+tensorflow::Status BinaryTensorOpTensor(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
+      {"Add", nvinfer1::ElementWiseOperation::kSUM},
+      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
+      // {"max", nvinfer1::ElementWiseOperation::kMAX},
+      // {"min", nvinfer1::ElementWiseOperation::kMIN},
+      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
+      {"Div", nvinfer1::ElementWiseOperation::kDIV},
+  };
+
+  // FIXME assume type matches input weights
+  // Get trt type & shape
+  TFAttrs attrs(node_def);
+  // Maybe this part has to be moved into the block of rsqrt later
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+
+  // Check type consistency
+  CHECK_EQ_TYPE(tensor_l->getType(), dtype);
+  CHECK_EQ_TYPE(tensor_r->getType(), dtype);
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end())
+    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+                                             " not supported at: " +
+                                             node_def.name());
 
-tensorflow::Status ConvertConv2DHelper(
+  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
+      *const_cast<nvinfer1::ITensor*>(tensor_l),
+      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  // Pass the output
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertPlaceholder(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs,
-    int group  // group ==0 specifies depthwise conv
-) {
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  VLOG(2) << "Placeholder should have been replace already";
+  return tensorflow::errors::Unimplemented(", cannot convert Placeholder op");
+  // OK this make sense since we are supposed to replace it with input
+  TFAttrs attrs(node_def);
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
+  nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape");
+
+  dims.nbDims--;
+  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1];
+
+  nvinfer1::ITensor* output =
+      ctx.network()->addInput(node_def.name().c_str(), dtype, dims);
+  if (!output) {
+    return tensorflow::errors::InvalidArgument("Failed to create Input layer");
+  }
+  outputs->push_back(TRT_TensorOrWeights(output));
+  return tensorflow::Status::OK();
+}
 
+tensorflow::Status ConvertConv2D(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  // TODO(jie): handle NHWC/NCHW transpose;
+  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
+  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
+  ReorderRSCKToKCRS(weights_rsck, &weights);
+  TRT_ShapedWeights biases(weights.type_);
+  int noutput = weights.shape_.d[0];
+  nvinfer1::DimsHW kernel_size;
+  kernel_size.h() = weights.shape_.d[2];
+  kernel_size.w() = weights.shape_.d[3];
   TFAttrs attrs(node_def);
 
   int h_index = 2;
@@ -1016,31 +874,11 @@ tensorflow::Status ConvertConv2DHelper(
     // TODO(jie): transpose it
   }
 
-  // tensor after transpose (NCHW)
-  auto tensor_dim = tensor->getDimensions();
-
-  int num_groups = group;
-  if (num_groups == 0)  // depthwise convolution
-    num_groups = tensor_dim.d[0];
-  VLOG(2) << "groups count: " << num_groups;
-
-  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
-  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
-  ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
-  TRT_ShapedWeights biases(weights.type_);
-  int noutput = weights.shape_.d[0] * num_groups;
-  nvinfer1::DimsHW kernel_size;
-  kernel_size.h() = weights.shape_.d[2];
-  kernel_size.w() = weights.shape_.d[3];
-  VLOG(2) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w();
-
   // TODO(jie): stride. (NHWC/NCHW)
   auto tf_stride = attrs.get<std::vector<int>>("strides");
-  VLOG(2) << "h_INDEX" << h_index << ", w_index " << w_index;
-  VLOG(2) << "stride!!!: " << tf_stride[0] << tf_stride[1] << tf_stride[2]
-          << tf_stride[3];
   nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
+  auto tensor_dim = tensor->getDimensions();
   std::vector<std::pair<int, int>> padding;
   // TODO(jie): padding.
   if (attrs.get<string>("padding") == "SAME") {
@@ -1081,11 +919,10 @@ tensorflow::Status ConvertConv2DHelper(
   layer->setStride(stride);
   layer->setPadding({padding[0].first, padding[1].first});
   layer->setName(node_def.name().c_str());
-  layer->setNbGroups(num_groups);
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   auto dim_after = output_tensor->getDimensions();
-  VLOG(2) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1] << ", "
+  VLOG(2) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
           << dim_after.d[2] << ", " << dim_after.d[3];
 
   if (data_format == "NHWC") {
@@ -1098,101 +935,11 @@ tensorflow::Status ConvertConv2DHelper(
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertConv2DHelper(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs, ConvolutionType type) {
-  switch (type) {
-    case ConvolutionType::DEFAULT:
-      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 1);
-    case ConvolutionType::DEPTHWISE_CONV:
-      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 0);
-  }
-  return tensorflow::errors::Unimplemented("unsupported convolution type at, " +
-                                           node_def.name());
-}
-
-tensorflow::Status BinaryTensorOpTensor(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
-      {"Add", nvinfer1::ElementWiseOperation::kSUM},
-      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
-      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
-      {"Div", nvinfer1::ElementWiseOperation::kDIV},
-  };
-
-  // FIXME assume type matches input weights
-  // get trt type & shape
-  TFAttrs attrs(node_def);
-  // maybe this part has to be moved into the block of rsqrt later
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
-
-  // check type consistency
-  CHECK_EQ_TYPE(tensor_l->getType(), dtype);
-  CHECK_EQ_TYPE(tensor_r->getType(), dtype);
-  auto op_pair = ops.find(node_def.op());
-  if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
-                                             " not supported at: " +
-                                             node_def.name());
-
-  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
-      *const_cast<nvinfer1::ITensor*>(tensor_l),
-      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
-
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-
-  // pass the output
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertPlaceholder(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  VLOG(2) << "Placeholder should have been replace already";
-  return tensorflow::errors::Unimplemented("cannot convert Placeholder op");
-  // OK this make sense since we are supposed to replace it with input
-  TFAttrs attrs(node_def);
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
-  nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape");
-
-  dims.nbDims--;
-  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1];
-
-  nvinfer1::ITensor* output =
-      ctx.network()->addInput(node_def.name().c_str(), dtype, dims);
-  if (!output) {
-    return tensorflow::errors::InvalidArgument("Failed to create Input layer");
-  }
-  outputs->push_back(TRT_TensorOrWeights(output));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertConv2D(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
-                             ConvolutionType::DEFAULT);
-}
-
-tensorflow::Status ConvertConv2DDepthwise(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
-                             ConvolutionType::DEPTHWISE_CONV);
-}
-
 tensorflow::Status ConvertPool(Converter& ctx,
                                const tensorflow::NodeDef& node_def,
-                               const std::vector<TRT_TensorOrWeights>& inputs,
+                               std::vector<TRT_TensorOrWeights> const& inputs,
                                std::vector<TRT_TensorOrWeights>* outputs) {
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   TFAttrs attrs(node_def);
 
   int h_index = 2;
@@ -1210,8 +957,6 @@ tensorflow::Status ConvertPool(Converter& ctx,
   // TODO(jie): support other pooling type
   if (node_def.op() == "MaxPool")
     type = nvinfer1::PoolingType::kMAX;
-  else if (node_def.op() == "AvgPool")
-    type = nvinfer1::PoolingType::kAVERAGE;
   else
     return tensorflow::errors::Unimplemented("Only supports Max pool");
 
@@ -1274,9 +1019,9 @@ tensorflow::Status ConvertPool(Converter& ctx,
 
 tensorflow::Status ConvertActivation(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   nvinfer1::IActivationLayer* layer = ctx.network()->addActivation(
       *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ActivationType::kRELU);
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
@@ -1286,14 +1031,14 @@ tensorflow::Status ConvertActivation(
 
 tensorflow::Status ConvertScale(Converter& ctx,
                                 const tensorflow::NodeDef& node_def,
-                                const std::vector<TRT_TensorOrWeights>& inputs,
+                                std::vector<TRT_TensorOrWeights> const& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
     return tensorflow::errors::Unimplemented(
         "Only supports tensor op weight for now, at " + node_def.name());
   // Implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
 
   // TODO(jie): handle NHWC/NCHW transpose;
   TRT_ShapedWeights weights = inputs.at(1).weights();
@@ -1310,26 +1055,9 @@ tensorflow::Status ConvertScale(Converter& ctx,
   } else {
     VLOG(2) << "NCHW !!!!";
   }
-
-  auto dims = tensor->getDimensions();
-  VLOG(2) << "tensor dimensions: " << dims.nbDims;
-  for (int i = 0; i < dims.nbDims; i++) {
-    VLOG(2) << "i: " << dims.d[i];
-  }
-  dims = weights.shape_;
-  VLOG(2) << "tensor dimensions: " << dims.nbDims;
-  for (int i = 0; i < dims.nbDims; i++) {
-    VLOG(2) << "i: " << dims.d[i];
-  }
-
-  nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
-  if (weights.shape_.d[0] == 1) {
-    mode = nvinfer1::ScaleMode::kUNIFORM;
-  }
-
-  nvinfer1::IScaleLayer* layer =
-      ctx.network()->addScale(*const_cast<nvinfer1::ITensor*>(tensor), mode,
-                              weights, empty_weights, empty_weights);
+  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
+      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
+      weights, empty_weights, empty_weights);
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   if (data_format == "NHWC") {
@@ -1344,7 +1072,7 @@ tensorflow::Status ConvertScale(Converter& ctx,
 
 tensorflow::Status ConvertConst(Converter& ctx,
                                 const tensorflow::NodeDef& node_def,
-                                const std::vector<TRT_TensorOrWeights>& inputs,
+                                std::vector<TRT_TensorOrWeights> const& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   const auto& weights_tensor = node_def.attr().at("value").tensor();
 
@@ -1363,144 +1091,20 @@ tensorflow::Status ConvertConst(Converter& ctx,
     VLOG(2) << "SCALAR!!!" << node_def.name();
     nvinfer1::Dims scalar_shape;
     if (tensor.dims() > 0) {
-      VLOG(2) << "dimensions: " << tensor.dims();
-      VLOG(2) << "size: " << weights_tensor.float_val_size();
-      scalar_shape = GetTensorShape(tensor);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        VLOG(2) << scalar_shape.d[i];
-      if (GetShapeSize(scalar_shape) != weights_tensor.float_val_size()) {
-        if (weights_tensor.float_val_size() == 1 ||
-            scalar_shape.d[0] == weights_tensor.float_val_size()) {
-          scalar_shape.nbDims = 1;
-          // no dimension provided. flatten it
-          scalar_shape.d[0] = weights_tensor.float_val_size();
-          scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
-        } else {
-          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
-                     << " kUNIFORM, at: " << node_def.name();
-        }
-      }
-    } else {
       VLOG(2) << "Dimensions: " << tensor.dims();
-      scalar_shape.nbDims = 1;
-      // no dimension provided. flatten it
-      scalar_shape.d[0] = weights_tensor.float_val_size();
-      scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
-      for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
-        scalar_shape.d[i] = 0;
-        scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
-      }
-    }
-    if (ctx.isFP16()) {
-      auto dtype_new = tensorflow::DataType::DT_HALF;
-      size_t len_data = tensorflow::DataTypeSize(dtype_new);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      tensorflow::Tensor temp_tensor(tensorflow::DT_HALF, tensor.shape());
-      auto half_tensor = temp_tensor.flat<Eigen::half>();
-      Eigen::DefaultDevice defd;
-      half_tensor.device(defd) =
-          tensor.flat<float>().template cast<Eigen::half>();
-      memcpy(dst, half_tensor.data(), len_data);  // store into weight store
-      weights = TRT_ShapedWeights(dtype_new, dst, scalar_shape);
+      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
+                                  GetTensorShape(tensor));
     } else {
-      size_t len_data = tensorflow::DataTypeSize(dtype);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      std::vector<float> tensor_data(
-          weights_tensor.float_val().begin(),
-          weights_tensor.float_val()
-              .end());  //  make a local copy first to flatten
-      memcpy(dst, tensor_data.data(), len_data);  // store into weight store
-      weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
-    }
-  } else if (!weights_tensor.int_val().empty()) {
-    VLOG(2) << "int!!!" << node_def.name();
-    nvinfer1::Dims scalar_shape;
-    if (tensor.dims() > 0) {
-      VLOG(2) << "dimensions: " << tensor.dims();
-      scalar_shape = GetTensorShape(tensor);
-      if (GetShapeSize(scalar_shape) != weights_tensor.int_val_size()) {
-        if (weights_tensor.int_val_size() == 1 ||
-            scalar_shape.d[0] == weights_tensor.int_val_size()) {
-          scalar_shape.nbDims = 1;
-          // no dimension provided. flatten it
-          scalar_shape.d[0] = weights_tensor.int_val_size();
-          scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
-        } else {
-          LOG(FATAL) << "Broadcast on weights only supports kCHANNEL and"
-                     << " kUNIFORM, at: " << node_def.name();
-        }
-      }
-    } else {
-      VLOG(2) << "dimensions: " << tensor.dims();
+      VLOG(2) << "Dimensions: " << tensor.dims();
       scalar_shape.nbDims = 1;
-      // no dimension provided. flatten it
-      scalar_shape.d[0] = weights_tensor.int_val_size();
+      scalar_shape.d[0] = 1;
       scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
       for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
         scalar_shape.d[i] = 0;
         scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
       }
-    }
-    if (ctx.isFP16()) {
-      auto dtype_new = tensorflow::DataType::DT_HALF;
-      size_t len_data = tensorflow::DataTypeSize(dtype_new);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      tensorflow::Tensor temp_tensor(tensorflow::DT_HALF, tensor.shape());
-      TTypes<Eigen::half>::Flat half_tensor = temp_tensor.flat<Eigen::half>();
-      Eigen::DefaultDevice defd;
-      switch (dtype) {
-        case (tensorflow::DT_INT32): {
-          half_tensor.device(defd) =
-              tensor.flat<int32>().template cast<Eigen::half>();
-          break;
-        }
-        case (tensorflow::DT_INT16): {
-          half_tensor.device(defd) =
-              tensor.flat<int16>().template cast<Eigen::half>();
-          break;
-        }
-        case (tensorflow::DT_INT8): {
-          half_tensor.device(defd) =
-              tensor.flat<int8>().template cast<Eigen::half>();
-          break;
-        }
-        case (tensorflow::DT_UINT8): {
-          half_tensor.device(defd) =
-              tensor.flat<uint8>().template cast<Eigen::half>();
-          break;
-        }
-        default:
-          return tensorflow::errors::InvalidArgument(
-              "Datatype " + tensorflow::DataTypeString(dtype) +
-              " for FP16 conversion");
-          break;
-      };
-      memcpy(dst, half_tensor.data(), len_data);  // store into weight store
-      weights = TRT_ShapedWeights(dtype_new, dst, scalar_shape);
-    } else {
-      size_t len_data = tensorflow::DataTypeSize(dtype);
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        len_data *= scalar_shape.d[i];
-      size_t len_tensor = weights_tensor.int_val_size() * sizeof(int32);
-      len_data = std::max(len_data, len_tensor);
-      ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-      void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-      std::vector<int32> tensor_data(
-          weights_tensor.int_val().begin(),
-          weights_tensor.int_val()
-              .end());  //  make a local copy first to flatten
-                        //  doesn't have to be contigous
-      memcpy(dst, tensor_data.data(), len_tensor);  // store into weight store
-      weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
+      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
+                                  scalar_shape);
     }
   } else if (!weights_tensor.tensor_content().empty()) {
     VLOG(2) << "TENSOR!!!" << node_def.name();
@@ -1526,7 +1130,7 @@ tensorflow::Status ConvertConst(Converter& ctx,
 
 tensorflow::Status ConvertIdentity(
     Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
+    std::vector<TRT_TensorOrWeights> const& inputs,
     std::vector<TRT_TensorOrWeights>* outputs) {
   outputs->push_back(inputs.at(0));
   return tensorflow::Status::OK();
@@ -1534,7 +1138,7 @@ tensorflow::Status ConvertIdentity(
 
 tensorflow::Status ConvertBinary(Converter& ctx,
                                  const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights> const& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2)
     return tensorflow::errors::FailedPrecondition(
@@ -1561,7 +1165,7 @@ tensorflow::Status ConvertBinary(Converter& ctx,
 
 tensorflow::Status ConvertUnary(Converter& ctx,
                                 const tensorflow::NodeDef& node_def,
-                                const std::vector<TRT_TensorOrWeights>& inputs,
+                                std::vector<TRT_TensorOrWeights> const& inputs,
                                 std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 1)
     return tensorflow::errors::FailedPrecondition(
@@ -1579,7 +1183,7 @@ tensorflow::Status ConvertUnary(Converter& ctx,
 
 tensorflow::Status ConvertReduce(Converter& ctx,
                                  const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights> const& inputs,
                                  std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
@@ -1587,7 +1191,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
         "Input expects tensor and weights, at" + node_def.name());
 
   // Implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   auto dims = tensor->getDimensions();
   // Restore implicit batch dimension
   int nb_dims = dims.nbDims + 1;
@@ -1625,7 +1229,6 @@ tensorflow::Status ConvertReduce(Converter& ctx,
       return tensorflow::errors::InvalidArgument("TRT cannot reduce at 0, at" +
                                                  node_def.name());
     if (index_list_data[i] == 1) permuted_index = 1;
-
     idx_set.emplace(index_list_data[i]);
   }
 
@@ -1633,7 +1236,7 @@ tensorflow::Status ConvertReduce(Converter& ctx,
   nvinfer1::DimsHW pool_kernel;
   if (permuted_index == 1) {
     for (int i = 2; i < nb_dims; i++) {
-      if (idx_set.count(i) == 0) {
+      if (idx_set.count(i)) {
         permuted_index = i;
         break;
       }
@@ -1668,13 +1271,12 @@ tensorflow::Status ConvertReduce(Converter& ctx,
     output_tensor = ctx.TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), permutation_order);
   }
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
 tensorflow::Status ConvertPad(Converter& ctx,
                               const tensorflow::NodeDef& node_def,
-                              const std::vector<TRT_TensorOrWeights>& inputs,
+                              std::vector<TRT_TensorOrWeights> const& inputs,
                               std::vector<TRT_TensorOrWeights>* outputs) {
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights())
@@ -1682,7 +1284,7 @@ tensorflow::Status ConvertPad(Converter& ctx,
         "Input expects tensor and weights, at" + node_def.name());
 
   // Implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   auto dims = tensor->getDimensions();
   // Restore implicit batch dimension
   int nb_dims = dims.nbDims + 1;
@@ -1769,287 +1371,19 @@ tensorflow::Status ConvertPad(Converter& ctx,
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertConcat(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  // not including the last input (axis) here
-  int input_size = static_cast<int>(inputs.size()) - 1;
-
-  if (!inputs.at(0).is_tensor())
-    return tensorflow::errors::InvalidArgument(
-        "Concat in TRT support only Tensor input, at " + node_def.name());
-
-  // We are retrieving the axis
-  TRT_ShapedWeights axis = inputs.at(input_size).weights();
-
-  TFAttrs attrs(node_def);
-  // auto attr_size = attrs.at("N")->i();
-  // auto data_type = attrs.get<nvinfer1::DataType>("T");
-  auto index_type = attrs.get<tensorflow::DataType>("Tidx");
-
-  // TODO(jie): handle data type
-  // Only expect to handle INT32 as index attributes for now
-  if (index_type != tensorflow::DataType::DT_INT32)
-    return tensorflow::errors::Unimplemented(
-        "Tidx supports only DT_INT32, at " + node_def.name());
-
-  int index = *(static_cast<int*>(const_cast<void*>(axis.GetValues())));
-
-  // TODO(jie): early termination with no-op (attr_size==1)
-
-  auto dim = inputs.at(0).tensor()->getDimensions();
-  // dimension check
-  if (index > dim.nbDims + 1)
-    return tensorflow::errors::InvalidArgument(
-        "Concatenate on axis out of dimension range, at " + node_def.name());
-
-  if (index == 0)
-    return tensorflow::errors::InvalidArgument(
-        "Concatenate on batch dimension not supported, at " + node_def.name());
-
-  // incase we need permutation;
-  std::vector<int> permutation_order(dim.nbDims + 1);
-
-  for (int i = 0; i < dim.nbDims + 1; i++) permutation_order[i] = i;
-
-  if (index != 1) {
-    permutation_order[1] = index - 1;
-    permutation_order[index - 1] = 1;
-  }
-
-  std::vector<nvinfer1::ITensor const*> inputs_vec;
-  // Shap chack (all input tensor should have same shape)
-  // starting from 0 since we are probably also doing transpose here;
-  for (int i = 0; i < input_size; i++) {
-    auto tensor_i = inputs.at(i).tensor();
-    auto dim_i = tensor_i->getDimensions();
-    if (dim_i.nbDims != dim.nbDims)
-      return tensorflow::errors::InvalidArgument(
-          "Concatenate receives inputs with inconsistent dimensions, at " +
-          node_def.name());
-
-    for (int j = 0; j < dim.nbDims; j++) {
-      // check dimension consistency on non-concatenate axis
-      if (j != index - 1 && dim_i.d[j] != dim.d[j])
-        return tensorflow::errors::InvalidArgument(
-            "Concatenate receives inputs with inconsistent shape, at" +
-            node_def.name());
-    }
-
-    // TRT does concatenation only on channel!
-    if (index != 1)
-      tensor_i = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor_i),
-                                     permutation_order);
-
-    inputs_vec.push_back(tensor_i);
-  }
-
-  // nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
-  nvinfer1::IConcatenationLayer* layer = ctx.network()->addConcatenation(
-      const_cast<nvinfer1::ITensor* const*>(inputs_vec.data()),
-      inputs_vec.size());
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-
-  if (index != 1) {
-    output_tensor = ctx.TransposeTensor(output_tensor, permutation_order);
-  }
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertFusedBatchNorm(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  TFAttrs attrs(node_def);
-  float epsilon = attrs.get<float>("epsilon");
-  auto data_format = attrs.get<string>("data_format");
-  if (data_format != "NCHW") {
-    return tensorflow::errors::Unimplemented(
-        "only data_format=NCHW is supported, at " + node_def.name());
-  }
-  bool is_training = attrs.get<bool>("is_training");
-  if (is_training) {
-    return tensorflow::errors::Unimplemented(
-        "only is_training=false is supported, at " + node_def.name());
-  }
-  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
-  TRT_ShapedWeights scale_weights = inputs.at(1).weights();
-  TRT_ShapedWeights offset_weights = inputs.at(2).weights();
-  TRT_ShapedWeights mean_weights = inputs.at(3).weights();
-  TRT_ShapedWeights variance_weights = inputs.at(4).weights();
-  TRT_ShapedWeights dummy_power_weights(scale_weights.type_);
-  TRT_ShapedWeights combined_scale_weights =
-      ctx.get_temp_weights_like(scale_weights);
-  TRT_ShapedWeights combined_offset_weights =
-      ctx.get_temp_weights_like(offset_weights);
-  size_t nweight = scale_weights.count();
-  if ((scale_weights.type_ == offset_weights.type_) &&
-      (mean_weights.type_ == variance_weights.type_) &&
-      (scale_weights.type_ == variance_weights.type_)) {
-    if ((scale_weights.type_ != tensorflow::DataType::DT_FLOAT) &&
-        (scale_weights.type_ != tensorflow::DataType::DT_HALF)) {
-      return tensorflow::errors::Unimplemented(
-          "only float32 or float16 weight data type is supported, for node " +
-          node_def.name() + " got " +
-          tensorflow::DataTypeString(scale_weights.type_));
-    }
-    if (scale_weights.type_ == tensorflow::DT_FLOAT) {
-      for (size_t i = 0; i < nweight; ++i) {
-        float scale = (static_cast<float const*>(scale_weights.GetValues()))[i];
-        float offset =
-            (static_cast<float const*>(offset_weights.GetValues()))[i];
-        float mean = (static_cast<float const*>(mean_weights.GetValues()))[i];
-        float variance =
-            (static_cast<float const*>(variance_weights.GetValues()))[i];
-        float& combined_scale_ref = const_cast<float*>(
-            static_cast<float const*>(combined_scale_weights.GetValues()))[i];
-        float& combined_offset_ref = const_cast<float*>(
-            static_cast<float const*>(combined_offset_weights.GetValues()))[i];
-        combined_scale_ref = scale / sqrtf(variance + epsilon);
-        combined_offset_ref = offset - mean * combined_scale_ref;
-      }
-    } else {
-      const Eigen::half* scale_vals =
-          (static_cast<Eigen::half const*>(scale_weights.GetValues()));
-      const Eigen::half* off_vals =
-          (static_cast<Eigen::half const*>(offset_weights.GetValues()));
-      const Eigen::half* mean_vals =
-          (static_cast<Eigen::half const*>(mean_weights.GetValues()));
-      const Eigen::half* variance_vals =
-          (static_cast<Eigen::half const*>(variance_weights.GetValues()));
-      Eigen::half* comb_scale_vals = const_cast<Eigen::half*>(
-          static_cast<Eigen::half const*>(combined_scale_weights.GetValues()));
-      Eigen::half* comb_off_vals = const_cast<Eigen::half*>(
-          static_cast<Eigen::half const*>(combined_offset_weights.GetValues()));
-      for (size_t i = 0; i < nweight; ++i) {
-        float scale(scale_vals[i]);
-        float offset(off_vals[i]);
-        float mean(mean_vals[i]);
-        float variance(variance_vals[i]);
-        float combined_scale_ref = scale / sqrtf(variance + epsilon);
-        comb_scale_vals[i] = Eigen::half(combined_scale_ref);
-        float combined_offset_ref = offset - mean * combined_scale_ref;
-        comb_off_vals[i] = Eigen::half(combined_offset_ref);
-      }
-    }
-  }
-  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
-      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
-      combined_offset_weights.GetWeightsForTRT(),
-      combined_scale_weights.GetWeightsForTRT(),
-      dummy_power_weights.GetWeightsForTRT());
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertMatMul(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-
-  // TODO(jie): transpose!
-  TFAttrs attrs(node_def);
-
-  TRT_ShapedWeights weights_ck = inputs.at(1).weights();
-  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_ck);
-  ReorderCKtoKC(weights_ck, &weights);
-  TRT_ShapedWeights biases(weights.type_);
-
-  int noutput = weights.shape_.d[0];
-
-  nvinfer1::IFullyConnectedLayer* layer = ctx.network()->addFullyConnected(
-      *const_cast<nvinfer1::ITensor*>(tensor), noutput, weights, biases);
-
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status ConvertReshape(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights())
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at" + node_def.name());
-
-  // implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  auto dims = tensor->getDimensions();
-  // restore implicit batch dimension
-
-  TRT_ShapedWeights shape = inputs.at(1).weights();
-
-  TFAttrs attrs(node_def);
-
-  auto padding_type = attrs.get<tensorflow::DataType>("Tshape");
-
-  if (shape.shape_.nbDims != 1)
-    return tensorflow::errors::InvalidArgument(
-        "reshape new shape is not 1 dimensional, at " + node_def.name());
-
-  // Only expect to handle INT32 as attributes for now
-  if (padding_type != tensorflow::DataType::DT_INT32)
-    return tensorflow::errors::Unimplemented(
-        "reshape new shape supports only DT_INT32, at " + node_def.name());
-
-  auto shape_data = static_cast<int*>(const_cast<void*>(shape.GetValues()));
-
-  if (shape_data[0] != -1)
-    return tensorflow::errors::InvalidArgument(
-        "reshape new shape first dimension is not -1, at " + node_def.name());
-
-  auto shape_num_dims = shape.shape_.d[0];
-  VLOG(2) << "shape dimensions: " << shape_num_dims;
-  int volume_w = 1;
-  for (int i = 1; i < shape.shape_.d[0]; i++) volume_w *= shape_data[i];
-
-  int volume_t = 1;
-  for (int i = 0; i < dims.nbDims; i++) volume_t *= dims.d[i];
-
-  VLOG(2) << "volume: " << volume_t << " volume weights: " << volume_w;
-  if (volume_w != volume_t)
-    return tensorflow::errors::InvalidArgument(
-        "volume does not agree between tensor and new shape, at " +
-        node_def.name());
-
-  nvinfer1::IShuffleLayer* layer =
-      ctx.network()->addShuffle(*const_cast<nvinfer1::ITensor*>(tensor));
-
-  nvinfer1::Dims reshape_dims;
-  VLOG(2) << "new dimension: " << shape_num_dims - 1;
-  reshape_dims.nbDims = shape_num_dims - 1;
-  for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
-    reshape_dims.d[i] = shape_data[i + 1];
-  }
-  layer->setReshapeDimensions(reshape_dims);
-  VLOG(2) << "new dimension: " << shape_num_dims - 1;
-
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  auto dims_output = output_tensor->getDimensions();
-  VLOG(2) << "output tensor dimension:" << dims_output.nbDims;
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
-}
-
 void Converter::register_op_converters() {
   // vgg_16 slim implementation
   op_registry_["Placeholder"] = ConvertPlaceholder;
   op_registry_["Conv2D"] = ConvertConv2D;
-  op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
   op_registry_["Relu"] = ConvertActivation;
   op_registry_["MaxPool"] = ConvertPool;
-  op_registry_["AvgPool"] = ConvertPool;
   // This could be really handled as ConvertBinary
   op_registry_["BiasAdd"] = ConvertScale;
   op_registry_["Const"] = ConvertConst;
+  // op_registry_["MatMul"] = ConvertFullyConnected;  // Not used in vgg
   // TODO(ben,jie): this is a temp hack.
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
+  // op_registry_["AvgPool"] = ConvertPool;
 
   // resnet_50_v1 slim implementation
   op_registry_["Add"] = ConvertBinary;
@@ -2059,364 +1393,26 @@ void Converter::register_op_converters() {
   op_registry_["Mean"] = ConvertReduce;
   op_registry_["Pad"] = ConvertPad;
   // TODO(ben,jie): Add more ops
-
-  op_registry_["ConcatV2"] = ConvertConcat;
-  op_registry_["MatMul"] = ConvertMatMul;
-  op_registry_["Reshape"] = ConvertReshape;
-  op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
-  op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
 }
 
 }  // namespace
-tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) {
-  return tensorflow::errors::Unimplemented("Not implemented yet");
-}
-tensorflow::Status ConvertCalibrationNodeToEngineNode(
-    tensorflow::Graph& graph, tensorflow::Node* c_node) {
-  const auto ndef = c_node->def();
-
-  TFAttrs attrs(ndef);
-  std::vector<string> segment_nodes(
-      attrs.get<std::vector<string>>("segment_nodes"));
-  std::vector<string> output_nodes(
-      attrs.get<std::vector<string>>("segment_output_names"));
-  std::vector<string> input_names(
-      attrs.get<std::vector<string>>("input_names"));
-  string res_name = attrs.get<string>("resource_name");
-  VLOG(1) << "Node name " << c_node->name() << " res_name " << res_name;
-  string engine_name = "my_trt_op";
-  {
-    const auto node_id = tensorflow::str_util::Split(res_name, "_");
-    engine_name += node_id.back();
-  }
-  std::map<string, tensorflow::Node*> node_maps;
-
-  for (auto n : graph.op_nodes()) {
-    node_maps.insert({n->name(), n});
-  }
-  VLOG(1) << "Output Nodes:";
-  std::vector<tensorflow::DataType> out_types;
-  std::vector<const tensorflow::Edge*> out_edges;
-  for (auto& i : output_nodes) {
-    auto node_port = tensorflow::str_util::Split(i, ":");
-    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
-    auto out_node_name = node_port.at(0);
-    if (node_port.size() > 1) {
-      VLOG(1) << "Multi port output" << node_port.at(0) << " "
-              << node_port.at(1) << " size=" << node_port.size();
-    }
-    auto node_it = node_maps.find(out_node_name);
-    if (node_it != node_maps.end()) {
-      tensorflow::Node* out_node = node_it->second;
-      int port = 0;
-      if (node_port.size() == 2) {
-        port = std::strtoul(node_port.at(1).c_str(), nullptr, 10);
-        out_types.push_back(out_node->output_type(port));
-      } else {
-        out_types.push_back(out_node->output_type(0));
-      }
-      for (auto out_edge : out_node->out_edges()) {
-        if (out_edge->src_output() == port) {
-          out_edges.push_back(out_edge);
-          break;
-        }
-      }
-    } else {
-      LOG(WARNING) << " couldn't find output node " << out_node_name;
-    }
-  }
-  VLOG(1) << "Input Nodes:";
-  for (auto& i : input_names) {
-    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
-  }
-  auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance();
-  auto resmgr = trt_rm->getManager("TRTCalibOps");
-  tensorflow::tensorrt::TRTCalibrationResource* calib_res = nullptr;
-  auto status = resmgr->Lookup(res_name, res_name, &calib_res);
-  if (!status.ok() || !calib_res->calibrator_) {
-    return tensorflow::errors::FailedPrecondition(
-        "You must run calibration"
-        " and inference conversion in the same proces");
-  }
-
-  calib_res->calibrator_->setDone();
-  calib_res->thr_->join();
-  delete calib_res->thr_;
-  if (!calib_res->engine_) {
-    LOG(FATAL) << "Calibration failed!, engine is nullptr. Did you run "
-                  "calibration graph?";
-  }
-  auto weight_rmgr = trt_rm->getManager("WeightStore");
-  TF_CHECK_OK(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
-      res_name, res_name));
-  auto engine_plan = calib_res->engine_->serialize();
-  calib_res->engine_->destroy();
-  calib_res->network_->destroy();
-  calib_res->builder_->destroy();
-  calib_res->thr_ = nullptr;
-  calib_res->engine_ = nullptr;
-  calib_res->builder_ = nullptr;
-  tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
-  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  for (const auto in_edge : c_node->in_edges()) {
-    auto src = in_edge->src();
-    int dest_port = in_edge->dst_input();
-    income_edges.emplace_back(src->name(), in_edge->src_output(),
-                              c_node->input_type(dest_port));
-  }
-  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
-      income_edges);
-  op_builder.Input(input_list);
-  tensorflow::NodeDef engine_node;
-  const char* engine_plan_data = static_cast<const char*>(engine_plan->data());
-  string engine_plan_string(engine_plan_data,
-                            engine_plan_data + engine_plan->size());
-  status = op_builder.Attr("serialized_engine", engine_plan_string)
-               .Attr("input_nodes", input_names)
-               .Attr("output_nodes", output_nodes)
-               .Attr("OutT", out_types)
-               .Finalize(&engine_node);
-  if (!status.ok()) {
-    LOG(ERROR) << "Engine Node creation failed";
-    return status;
-  }
-  auto trt_engine_node = graph.AddNode(engine_node, &status);
-  TF_CHECK_OK(status);
-  for (size_t i = 0; i < out_edges.size(); i++) {
-    VLOG(1) << "Connecting trt_engine_node output " << i << " with "
-            << out_edges.at(i)->dst()->name() << " port "
-            << out_edges.at(i)->dst_input();
-    TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i,
-                                        out_edges.at(i)->dst(),
-                                        out_edges.at(i)->dst_input()));
-  }
-  VLOG(1) << "Segment nodes:";
-  for (auto& i : segment_nodes) {
-    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
-    auto it = node_maps.find(i);
-    if (it != node_maps.end()) {
-      graph.RemoveNode(it->second);
-    }
-  }
-  graph.RemoveNode(c_node);
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
-  // Visit nodes in reverse topological order and construct the TRT network.
-
-  // Toposort
-  std::vector<tensorflow::Node*> order_vec;
-  tensorflow::GetPostOrder(s.graph, &order_vec);
-  // Select just the subgraph
-  std::list<tensorflow::Node*> order;
-  for (tensorflow::Node* node : order_vec) {
-    if (s.subgraph_node_ids.count(node->id())) {
-      order.push_front(node);  // we want topological order to contstruct the
-      // network layer by layer
-    }
-  }
-  // topological order is needed to build TRT network
-  static int static_id = 0;
-  string subgraph_name_scope;
-  if (!order.empty()) {
-    subgraph_name_scope = order.front()->name();
-  }
-  for (const tensorflow::Node* node : order) {
-    subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
-  }
-  // TODO(sami,ben,jie): proper naming!
-  string calib_op_name =
-      StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
-  string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
-  static_id++;
-  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
-  auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
-  auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
-  TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
-  op_res->logger_ = new tensorflow::tensorrt::Logger();
-  op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
-
-  if (!op_res->builder_) {
-    return tensorflow::errors::Internal(
-        "failed to create TensorRT builder object");
-  }
-
-  op_res->network_ = op_res->builder_->createNetwork();
-  if (!op_res->network_) {
-    return tensorflow::errors::Internal(
-        "failed to create TensorRT network object");
-  }
-
-  // Build the network
-  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
-  auto ws = new tensorflow::tensorrt::TRTWeightStore();
-  TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
-  Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
-  std::vector<string> input_names;
-  std::vector<tensorflow::DataType> input_dtypes;
-  for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input. Node id= " << input.first;
-    int node_id = input.first;
-    int output_idx = input.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
-    auto node_name = node->name();
-    input_names.push_back(node_name);  // insert original node name without port
-    // TODO(jie): alternative :)
-    if (!s.graph_properties.HasOutputProperties(node_name))
-      return tensorflow::errors::Internal("failed to find input node: " +
-                                          node_name);
-
-    auto op_info_vec = s.graph_properties.GetOutputProperties(node_name);
-    if (static_cast<int>(op_info_vec.size()) < output_idx)
-      return tensorflow::errors::Internal(
-          "accessing output index of: ", output_idx, ", at node: ", node_name,
-          "with output entry from shape_map: ", op_info_vec.size());
-
-    auto op_info = op_info_vec.at(output_idx);
-
-    tensorflow::DataType tf_dtype = op_info.dtype();
-    input_dtypes.push_back(tf_dtype);
-
-    nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
-    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
-
-    VLOG(2) << "accessing output index of: " << output_idx
-            << ", at node: " << node_name
-            << "with output entry from shape_map: " << op_info_vec.size();
-
-    // TODO(ben,jie): update TRT input format/dimension
-    nvinfer1::DimsCHW input_dim_psuedo_chw;
-    for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
-
-    for (int i = 1; i < op_info.shape().dim_size(); i++) {
-      VLOG(2) << "dimension: " << i
-              << " , size: " << op_info.shape().dim(i).size();
-      input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
-    }
-
-    // TODO(ben,jie): proper way to restore input tensor name?
-    auto input_tensor_name = node_name;
-    if (output_idx != 0) input_tensor_name = StrCat(node_name, ":", output_idx);
-
-    nvinfer1::ITensor* input_tensor = converter.network()->addInput(
-        input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
-
-    if (!input_tensor)
-      return tensorflow::errors::InvalidArgument(
-          "Failed to create Input layer");
-    VLOG(2) << "input tensor name :" << input_tensor_name;
-
-    if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
-      return tensorflow::errors::AlreadyExists(
-          "output tensor already exists for op: " + input_tensor_name);
-  }
-
-  VLOG(2) << "finished sorting";
-
-  for (const tensorflow::Node* node : order) {
-    const tensorflow::NodeDef& node_def = node->def();
-    VLOG(2) << "converting node: " << node_def.name() << " , " << node_def.op();
-    TF_RETURN_IF_ERROR(converter.convert_node(node_def));
-  }
-
-  VLOG(2) << "finished conversion";
-
-  // Gather output metadata
-  std::vector<string> output_names;
-  std::vector<tensorflow::DataType> output_dtypes;
-  int trt_engine_op_output_idx = 0;
-  for (const std::pair<int, int>& output : s.output_inds) {
-    int node_id = output.first;
-    int output_idx = output.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
-    string op_name = node->name();
-    string tensor_name = op_name;
-
-    s.output_edge_map->insert(
-        {trt_engine_op_output_idx == 0
-             ? engine_name
-             : StrCat(engine_name, ":", trt_engine_op_output_idx),
-         {output_idx, tensor_name}});
-    trt_engine_op_output_idx++;
-    if (output_idx != 0) {
-      tensor_name = StrCat(tensor_name, ":", output_idx);
-    }
-    VLOG(1) << "output tensor name: " << tensor_name;
-    output_names.push_back(tensor_name);
-    auto tensor_or_weights = converter.get_tensor(tensor_name);
-    if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument(
-          "Output node is weights not tensor");
-    }
-    nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
-    if (!tensor) {
-      return tensorflow::errors::NotFound("Output tensor not found: " +
-                                          tensor_name);
-    }
-    converter.network()->markOutput(*tensor);
-    tensorflow::DataType tf_dtype = node->output_type(output_idx);
-    output_dtypes.push_back(tf_dtype);
-    nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
-    TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
-    tensor->setType(trt_dtype);
-  }
-
-  VLOG(2) << "finished output";
-
-  // Build the engine
-  op_res->builder_->setMaxBatchSize(s.max_batch_size);
-  op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes);
-
-  // Build the TRT op
-  // TODO(sami,ben,jie): proper naming!
-  tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp");
-  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  for (size_t i = 0; i < input_names.size(); ++i) {
-    int output_idx = s.input_inds.at(i).second;
-    // we wired up the input here already, it is redundant to do it again in
-    //  ConvertSubGraphToTensorRT(convert_graph.cc)
-    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
-        input_names.at(i), output_idx, input_dtypes.at(i));
-    VLOG(1) << calib_op_name << " input " << i << " = " << input_names.at(i)
-            << ":" << output_idx
-            << " dType= " << tensorflow::DataTypeString(input_dtypes.at(i));
-    income_edges.push_back(incoming_edge);
-  }
-  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
-      income_edges);
-  op_builder.Input(input_list);
-  std::vector<string> segment_names;
-  segment_names.reserve(s.subgraph_node_ids.size());
-  for (int i : s.subgraph_node_ids) {
-    auto node = s.graph.FindNodeId(i);
-    segment_names.push_back(node->name());
-  }
-  LOG(INFO) << "finished op preparation";
-
-  auto status = op_builder.Attr("segment_nodes", segment_names)
-                    .Attr("input_names", input_names)
-                    .Attr("segment_output_names", output_names)
-                    .Attr("resource_name", calib_op_name)
-                    .Finalize(s.trt_node);
-
-  LOG(INFO) << status.ToString();
-  LOG(INFO) << "finished op building";
-
-  return tensorflow::Status::OK();
-}
 
 tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
-    tensorrt::convert::SubGraphParams& s) {
+    const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
+    const std::vector<std::pair<int, int>>& input_inds,
+    const std::vector<std::pair<int, int>>& output_inds, size_t max_batch_size,
+    size_t max_workspace_size_bytes,
+    const tensorflow::grappler::GraphProperties& graph_properties,
+    tensorflow::NodeDef* trt_node) {
   // Visit nodes in reverse topological order and construct the TRT network.
 
   // Toposort
   std::vector<tensorflow::Node*> order_vec;
-  tensorflow::GetPostOrder(s.graph, &order_vec);
+  tensorflow::GetPostOrder(graph, &order_vec);
   // Select just the subgraph
   std::list<tensorflow::Node*> order;
   for (tensorflow::Node* node : order_vec) {
-    if (s.subgraph_node_ids.count(node->id())) {
+    if (subgraph_node_ids.count(node->id())) {
       // We want topological order to contstruct the
       // network layer by layer
       order.push_front(node);
@@ -2438,86 +1434,46 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
         "Failed to create TensorRT network object");
   }
 
-  string subgraph_name_scope;
-  if (!order.empty()) {
-    subgraph_name_scope = order.front()->name();
-  }
-  for (const tensorflow::Node* node : order) {
-    subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
-  }
-  static int static_id = 0;
-  // TODO(sami,ben,jie): proper naming!
-  string engine_name = StrCat(subgraph_name_scope, "my_trt_op");
-  engine_name = StrCat(engine_name, static_id++);
-  auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
-  auto weight_rmgr = trt_rmgr->getManager("WeightStore");
-  auto ws = new tensorflow::tensorrt::TRTWeightStore();
-  TF_CHECK_OK(weight_rmgr->Create(engine_name, engine_name, ws));
-
   // Build the network
-  Converter converter(trt_network.get(), ws, s.precision_mode == FP16MODE);
+  Converter converter(trt_network.get());
 
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
-  for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input!!!!!";
+  for (std::pair<int, int> const& input : input_inds) {
     int node_id = input.first;
     int output_idx = input.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
+    tensorflow::Node* node = graph.FindNodeId(node_id);
     auto node_name = node->name();
-    // input_names should use the node name in the graph
-    // here it should be the input tensor name -> matching the binding
-    // insert original node name without port
-    auto tensor_name = node_name;
-    if (output_idx != 0) {
-      tensor_name = StrCat(tensor_name, ":", output_idx);
-    }
-
-    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
-            << " idx: " << output_idx;
-
-    auto shape_inference_node_name = node_name;
-    auto shape_inference_output_idx = output_idx;
-    // rewire the shape inference to original node in the graph
-    if (s.output_edge_map->count(tensor_name)) {
-      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
-      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
-    }
-    if (shape_inference_output_idx < 0) continue;
-    VLOG(2) << "shapeinference name: " << shape_inference_node_name
-            << " idx: " << shape_inference_output_idx;
-
-    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
-      return tensorflow::errors::Internal("failed to find input node: " +
-                                          shape_inference_node_name);
+    input_names.push_back(node_name);  // Insert original node name without port
+    // TODO(jie): alternative :)
+    if (!graph_properties.HasOutputProperties(node_name))
+      return tensorflow::errors::Internal("Failed to find input node: " +
+                                          node_name);
 
-    auto op_info_vec =
-        s.graph_properties.GetOutputProperties(shape_inference_node_name);
-    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
+    auto op_info_vec = graph_properties.GetOutputProperties(node_name);
+    if (static_cast<int>(op_info_vec.size()) < output_idx)
       return tensorflow::errors::Internal(
-          "accessing output index of: ", shape_inference_output_idx,
-          ", at node: ", shape_inference_node_name,
-          " with output entry from shape_map: ", op_info_vec.size());
+          "Accessing output index of: " + std::to_string(output_idx) +
+          ", at node: " + node_name + " with output entry from shape_map: " +
+          std::to_string(op_info_vec.size()));
+
+    auto op_info = op_info_vec.at(output_idx);
 
-    auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
     input_dtypes.push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
     TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
 
-    VLOG(2) << "Accessing output index of: " << output_idx
+    VLOG(2) << "Accessing output index of: " << std::to_string(output_idx)
             << ", at node: " << node_name
-            << " with output entry from shape_map: " << op_info_vec.size();
+            << " with output entry from shape_map: "
+            << std::to_string(op_info_vec.size());
+
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_psuedo_chw;
     for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
 
-    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
-    //            update the code once TRT 4.0 comes out.
-    if (op_info.shape().dim_size() != 4)
-      return tensorflow::errors::Unimplemented("require 4 dimensional input");
-
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(2) << "dimension: " << i
               << " , size: " << op_info.shape().dim(i).size();
@@ -2526,11 +1482,9 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
 
     // TODO(ben,jie): proper way to restore input tensor name?
     auto input_tensor_name = node_name;
-    if (output_idx != 0) {
-      input_tensor_name = StrCat(node_name, ":", output_idx);
-    }
+    if (output_idx != 0)
+      input_tensor_name = node_name + ":" + std::to_string(output_idx);
 
-    input_names.push_back(input_tensor_name);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
 
@@ -2557,22 +1511,14 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   // Gather output metadata
   std::vector<string> output_names;
   std::vector<tensorflow::DataType> output_dtypes;
-  int trt_engine_op_output_idx = 0;
-  for (const std::pair<int, int>& output : s.output_inds) {
+  for (std::pair<int, int> const& output : output_inds) {
     int node_id = output.first;
     int output_idx = output.second;
-    tensorflow::Node* node = s.graph.FindNodeId(node_id);
+    tensorflow::Node* node = graph.FindNodeId(node_id);
     string op_name = node->name();
     string tensor_name = op_name;
-
-    s.output_edge_map->insert(
-        {trt_engine_op_output_idx == 0
-             ? engine_name
-             : StrCat(engine_name, ":", trt_engine_op_output_idx),
-         {output_idx, tensor_name}});
-    trt_engine_op_output_idx++;
     if (output_idx != 0)
-      tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
+      tensor_name = tensor_name + ":" + std::to_string(output_idx);
     VLOG(2) << "Output tensor name: " << tensor_name;
     output_names.push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
@@ -2594,25 +1540,19 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   }
 
   VLOG(2) << "Finished output";
+  // TODO(jie): static_id is not thread safe.
+  static int static_id = 0;
 
   // Build the engine
-  trt_builder->setMaxBatchSize(s.max_batch_size);
-  trt_builder->setMaxWorkspaceSize(s.max_workspace_size_bytes);
-  VLOG(0) << "Max batch size= " << s.max_batch_size
-          << " max workspace size= " << s.max_workspace_size_bytes;
-  if (s.precision_mode == FP16MODE) {
-    trt_builder->setHalf2Mode(true);
-    VLOG(0) << "Using FP16 precision mode";
-  }
-  LOG(INFO) << "starting build engine";
+  trt_builder->setMaxBatchSize(max_batch_size);
+  trt_builder->setMaxWorkspaceSize(max_workspace_size_bytes);
+  VLOG(0) << "Starting build engine " << static_id;
+  // TODO(ben,jie): half2 and int8 mode support
   string engine_plan_string;
   {
     auto trt_engine =
         infer_object(trt_builder->buildCudaEngine(*converter.network()));
     VLOG(0) << "Built network";
-    if (trt_engine.get() == nullptr) {
-      return tensorflow::errors::Internal("Engine building failure");
-    }
     auto engine_plan = infer_object(trt_engine->serialize());
     VLOG(0) << "Serialized engine";
     const char* engine_plan_data =
@@ -2620,19 +1560,18 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
     engine_plan_string =
         string(engine_plan_data, engine_plan_data + engine_plan->size());
   }
-  TF_RETURN_IF_ERROR(weight_rmgr->Delete<tensorflow::tensorrt::TRTWeightStore>(
-      engine_name, engine_name));
-  LOG(INFO) << "finished engine " << engine_name;
+
+  VLOG(0) << "Finished engine";
 
   // Build the TRT op
-  tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
+  // TODO(sami,ben,jie): proper naming!
+  tensorflow::NodeDefBuilder op_builder(
+      tensorflow::strings::StrCat("my_trt_op", static_id++), "TRTEngineOp");
   std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
-  VLOG(2) << "input edge size: " << input_names.size();
   for (size_t i = 0; i < input_names.size(); ++i) {
-    VLOG(2) << "input edges: " << i << " " << input_names.at(i);
-    int output_idx = s.input_inds.at(i).second;
-    // we wired up the input here already, it is redundant to do it again in
-    //  ConvertSubGraphToTensorRT(convert_graph.cc)
+    int output_idx = input_inds.at(i).second;
+    // We wired up the input here already, it is redundant to do it again in
+    // ConvertSubGraphToTensorRT(convert_graph.cc)
     auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
         input_names.at(i), output_idx, input_dtypes.at(i));
     income_edges.push_back(incoming_edge);
@@ -2647,7 +1586,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
                     .Attr("input_nodes", input_names)
                     .Attr("output_nodes", output_names)
                     .Attr("OutT", output_dtypes)
-                    .Finalize(s.trt_node);
+                    .Finalize(trt_node);
 
   VLOG(0) << status.ToString() << " finished op building";
 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 954a1e72f8..2e7fd19566 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -17,8 +17,6 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
 
 #include <set>
-#include <string>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -34,49 +32,16 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
-const int FP32MODE = 0;
-const int FP16MODE = 1;
-const int INT8MODE = 2;
+tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
+    const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
+    const std::vector<std::pair<int, int>>&
+        input_inds,  // {node_id, output_idx}
+    const std::vector<std::pair<int, int>>&
+        output_inds,  // {node_id, output_idx}
+    size_t max_batch_size, size_t max_workspace_size_bytes,
+    const tensorflow::grappler::GraphProperties& graph_prop,
+    tensorflow::NodeDef* trt_node);
 
-struct SubGraphParams {
-  SubGraphParams(
-      tensorflow::Graph& inp_graph,
-      const std::set<int>& subgraph_node_id_numbers,
-      const std::vector<std::pair<int, int>>& input_indices,
-      const std::vector<std::pair<int, int>>& output_indices,
-      size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes,
-      const tensorflow::grappler::GraphProperties& current_graph_properties,
-      std::unordered_map<string, std::pair<int, string>>* output_edges,
-      tensorflow::NodeDef* constructed_trt_node,
-      int engine_precision_mode = FP32MODE)
-      : graph(inp_graph),
-        subgraph_node_ids(subgraph_node_id_numbers),
-        input_inds(input_indices),
-        output_inds(output_indices),
-        max_batch_size(max_supported_batch_size),
-        max_workspace_size_bytes(max_consumed_workspace_size_bytes),
-        graph_properties(current_graph_properties),
-        output_edge_map(output_edges),
-        trt_node(constructed_trt_node),
-        precision_mode(engine_precision_mode) {}
-
-  tensorflow::Graph& graph;
-  const std::set<int>& subgraph_node_ids;
-  const std::vector<std::pair<int, int>>& input_inds;   // {node_id, output_idx}
-  const std::vector<std::pair<int, int>>& output_inds;  // {node_id, output_idx}
-  size_t max_batch_size;
-  size_t max_workspace_size_bytes;
-  const tensorflow::grappler::GraphProperties& graph_properties;
-  std::unordered_map<string, std::pair<int, string>>* output_edge_map;
-  tensorflow::NodeDef* trt_node;
-  const int precision_mode;
-};
-
-// TODO(sami): Replace references with const reference or pointers
-tensorflow::Status ConvertSubGraphToTensorRTNodeDef(SubGraphParams& params);
-tensorflow::Status InjectCalibrationNode(SubGraphParams& params);
-tensorflow::Status ConvertCalibrationNodeToEngineNode(tensorflow::Graph& graph,
-                                                      tensorflow::Node* c_node);
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc
index aea44fd8a2..1dcb87e768 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_calib_op.cc
@@ -21,11 +21,10 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/platform/stream_executor.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "cuda/include/cuda_runtime_api.h"
+#include "cuda_runtime_api.h"
 #include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
@@ -114,13 +113,7 @@ void TRTCalibOp::Compute(tensorflow::OpKernelContext* ctx) {
     ctx->set_output(i, t);
   }
   VLOG(2) << "Filled map for sending";
-  // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
-  const cudaStream_t* stream = CHECK_NOTNULL(
-      reinterpret_cast<const cudaStream_t*>(ctx->op_device_context()
-                                                ->stream()
-                                                ->implementation()
-                                                ->CudaStreamMemberHack()));
-  calib_res->calibrator_->setBatch(input_data, *stream);
+  calib_res->calibrator_->setBatch(input_data);
   VLOG(2) << "Passed calibration data";
   // TODO(aaroey): make sure we wait for the completion of calibration on the
   // last batch in future PR.
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index b32371b642..8efdf63ebe 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -24,12 +24,8 @@ limitations under the License.
 #include "cuda/include/cuda_runtime_api.h"
 
 namespace tensorflow {
-static ::tensorflow::tensorrt::Logger logger;
-namespace gpu = ::perftools::gputools;
-using IRuntime = nvinfer1::IRuntime;
-using Dims = nvinfer1::Dims;
-
 namespace tensorrt {
+static ::tensorflow::tensorrt::Logger logger;
 
 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // read serialized_engine
@@ -44,21 +40,10 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
   // TODO(samikama) runtime should be taken from a resourcemanager as well.
   // Only engine should be in the op and context and runtime should be taken
   // from resourcemanager
-  // TODO(jie): cudaSetDevice make sure trt engine is allocated on the same
-  // gpu where the input/output is also located.
-  int gpu_id = context->device()->tensorflow_gpu_device_info()->gpu_id;
-  cudaSetDevice(gpu_id);
-  int device;
-  cudaGetDevice(&device);
-  if (gpu_id != device) LOG(FATAL) << "set device failed!";
-
-  // TODO(samikama) runtime should be taken from a resourcemanager as well.
-  // Only engine should be in the op and context and runtime should be taken
-  // from resourcemanager
-
-  IRuntime* infer = nvinfer1::createInferRuntime(logger);
+  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
   trt_engine_ptr_.reset(infer->deserializeCudaEngine(
       serialized_engine.c_str(), serialized_engine.size(), nullptr));
+
   trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
   // Runtime is safe to delete after engine creation
   infer->destroy();
@@ -70,6 +55,7 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
 
   size_t binding_index;
   int num_batch = 0;
+  bool valid = true;
   for (int i = 0; i < context->num_inputs(); i++) {
     // Grab the input tensor
     binding_index = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str());
@@ -78,12 +64,8 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
     const TensorShape& input_shape = input_tensor.shape();
     if (i == 0) {
       num_batch = input_shape.dim_size(0);
-      if (num_batch > trt_engine_ptr_->getMaxBatchSize()) {
-        LOG(FATAL) << "input tensor batch larger than max_batch_size: "
-                   << trt_engine_ptr_->getMaxBatchSize();
-      }
     } else if (num_batch != input_shape.dim_size(0)) {
-      LOG(FATAL) << "input data inconsistent batch size";
+      valid = false;
       break;
     }
     switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
@@ -99,6 +81,9 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
     }
   }
 
+  // Might want a different way to inform the user of batch size inconsistency
+  if (!valid) LOG(WARNING) << "input data inconsistent batch size";
+
   for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) {
     // This is bad that we have to reallocate output buffer every run.
     // Create an output tensor
@@ -141,11 +126,9 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
                                                 ->implementation()
                                                 ->CudaStreamMemberHack()));
 
-  // TODO(jie): trt enqueue does not return error
-  auto ret = trt_execution_context_ptr_->enqueue(num_batch, &buffers[0],
-                                                 *stream, nullptr);
-  VLOG(2) << "enqueue returns: " << ret;
-  // sync should be done by TF.
+  // execution handled by TF since we are getting stream from TF.
+  // it is safe for CPU pointer array (buffers) to go out of scope after enqueue
+  trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], *stream, nullptr);
 }
 
 REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
index dda0dc9e71..7add8cb8b3 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -27,19 +27,19 @@ void Logger::log(Severity severity, const char* msg) {
   // Suppress info-level messages
   switch (severity) {
     case Severity::kINFO: {  // Mark TRT info messages as debug!
-      VLOG(2) << name_ << " " << msg;
+      VLOG(2) << msg;
       break;
     }
     case Severity::kWARNING: {
-      LOG(WARNING) << name_ << " " << msg;
+      LOG(WARNING) << msg;
       break;
     }
     case Severity::kERROR: {
-      LOG(ERROR) << name_ << " " << msg;
+      LOG(ERROR) << msg;
       break;
     }
     case Severity::kINTERNAL_ERROR: {
-      LOG(FATAL) << name_ << " " << msg;
+      LOG(FATAL) << msg;
       break;
     }
     // This is useless for now. But would catch it in future if enum changes. It
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 7f3544f8cf..d71f66b933 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -27,11 +27,9 @@ namespace tensorrt {
 
 // Logger for GIE info/warning/errors
 class Logger : public nvinfer1::ILogger {
- public:
-  Logger(string name = "DefaultLogger") : name_(name){};
+ private:
   void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
 
- private:
   string name_;
 };
 
diff --git a/tensorflow/contrib/tensorrt/python/__init__.py b/tensorflow/contrib/tensorrt/python/__init__.py
index 0b2321b5fc..7e050a768c 100644
--- a/tensorflow/contrib/tensorrt/python/__init__.py
+++ b/tensorflow/contrib/tensorrt/python/__init__.py
@@ -20,6 +20,5 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
 from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
-from tensorflow.contrib.tensorrt.python.trt_convert import calib_graph_to_infer_graph
 from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 666220d78c..9454862f85 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -20,17 +20,11 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
 import six as _six
-from tensorflow.contrib.tensorrt.wrap_conversion import calib_convert
 from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert
 from tensorflow.core.framework import graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import errors_impl as _impl
-from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
-from tensorflow.python.grappler import tf_optimizer
-from tensorflow.python.util import compat
-# pylint: enable=unused-import,line-too-long
 
 
 # TODO(skama): get outputs from session when implemented as c++
@@ -38,33 +32,22 @@ from tensorflow.python.util import compat
 def create_inference_graph(input_graph_def,
                            outputs,
                            max_batch_size=1,
-                           max_workspace_size_bytes=2 << 20,
-                           precision_mode="FP32",
-                           minimum_segment_size=3):
+                           max_workspace_size_bytes=2 << 20):
   """Python wrapper for the TRT transormation.
 
+
   Args:
     input_graph_def: GraphDef object containing a model to be transformed.
-    outputs: list of tensors or node names for the model outputs.
+    outputs: List of tensors or node names for the model outputs.
     max_batch_size: max size for the input batch
     max_workspace_size_bytes: parameter to control memory allocation (in Bytes)
-    precision_mode: one of 'FP32', 'FP16' and 'INT8'
-    minimum_segment_size: the minimum number of nodes required for a subgraph to
-      be replaced by TRTEngineOp.
 
   Returns:
     New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
 
   Raises:
-    ValueError: if the provided precision mode is invalid.
     RuntimeError: if the returned status message is malformed.
   """
-  supported_precision_modes = {"FP32": 0, "FP16": 1, "INT8": 2}
-  if precision_mode.upper() not in supported_precision_modes:
-    raise ValueError(("precision mode '{}' is not supported."
-                      "It should be one of {}").format(
-                          precision_mode, "{'FP32', 'FP16', 'INT8'}"))
-  mode = supported_precision_modes[precision_mode.upper()]
 
   def py2bytes(inp):
     return inp
@@ -100,7 +83,7 @@ def create_inference_graph(input_graph_def,
   # pair or strings where first one is encoded status and the second
   # one is the transformed graphs protobuf string.
   out = trt_convert(input_graph_def_str, out_names, max_batch_size,
-                    max_workspace_size_bytes, mode, minimum_segment_size)
+                    max_workspace_size_bytes)
   status = to_string(out[0])
   output_graph_def_string = out[1]
   del input_graph_def_str  # Save some memory
@@ -118,46 +101,3 @@ def create_inference_graph(input_graph_def,
   output_graph_def.ParseFromString(output_graph_def_string)
   del output_graph_def_string  # Save some memory
   return output_graph_def
-
-
-def calib_graph_to_infer_graph(calibration_graph_def):
-  """Convert an existing calibration graph to inference graph.
-
-  Args:
-    calibration_graph_def: the calibration GraphDef object with calibration data
-  Returns:
-    New GraphDef with TRTEngineOps placed in graph replacing calibration nodes.
-  Raises:
-    RuntimeError: if the returned status message is malformed.
-  """
-
-  def py2string(inp):
-    return inp
-
-  def py3string(inp):
-    return inp.decode("utf-8")
-
-  if _six.PY2:
-    to_string = py2string
-  else:
-    to_string = py3string
-
-  graph_str = calibration_graph_def.SerializeToString()
-  out = calib_convert(graph_str)
-  status = to_string(out[0])
-  output_graph_def_string = out[1]
-  del graph_str  # Save some memory
-  if len(status) < 2:
-    raise _impl.UnknownError(None, None, status)
-  if status[:2] != "OK":
-    msg = status.split(";")
-    if len(msg) == 1:
-      raise RuntimeError("Status message is malformed {}".format(status))
-    # pylint: disable=protected-access
-    raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
-                                         int(msg[0]))
-    # pylint: enable=protected-access
-  output_graph_def = graph_pb2.GraphDef()
-  output_graph_def.ParseFromString(output_graph_def_string)
-  del output_graph_def_string  # Save some memory
-  return output_graph_def
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
index 74df75902e..3d5cc76c42 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
@@ -23,7 +23,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "cuda/include/cuda_runtime_api.h"
+#include "cuda_runtime_api.h"
 
 namespace tensorflow {
 namespace tensorrt {
@@ -38,18 +38,22 @@ TRTInt8Calibrator::TRTInt8Calibrator(
       done_(false),
       dev_buffers_(dev_buffers),
       calib_running_(false),
-      batch_is_set_(false),
       engine_name_(engine_name) {}
 
-bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data,
-                                 const cudaStream_t stream) {
-  tensorflow::mutex_lock lock(cond_mtx_);
-  while ((calib_running_ || batch_is_set_) &&
-         !done_) {  // wait while calibration is running
-    cond_.wait(lock);
-  }
+bool TRTInt8Calibrator::setBatch(
+    const std::unordered_map<string, void*>& data) {
+  // TODO(aaroey): make sure that in future PR:
+  // 1. the mutex_lock is outside of the loop
+  // 2. wait() is used instead of wait_for()
+  // 3. done_ is to be protected by the mutex
+  // 4. the first batch is not missed
   if (done_) return false;
-  CHECK(!calib_running_ && !batch_is_set_);
+  while (calib_running_.load(
+      std::memory_order_acquire)) {  // wait while calibration is running
+    tensorflow::mutex_lock l(cond_mtx_);
+    cond_.wait_for(l, std::chrono::milliseconds(50));
+    if (done_) return false;
+  }
   VLOG(1) << "Set Batch Waiting finished";
   for (const auto it : data) {
     auto devptr = dev_buffers_.find(it.first);
@@ -61,32 +65,27 @@ bool TRTInt8Calibrator::setBatch(const std::unordered_map<string, void*>& data,
 
     // TODO(aaroey): we should not use sync copy on default stream. Make sure
     // stream->ThenMemcpy() is used in future PRs.
-    // TODO(sami,aaroey): Need to figureout a way to ensure synchronization
-    // between stream, perhaps using a tensor?
-    auto status = cudaMemcpyAsync(d.first, it.second, d.second,
-                                  cudaMemcpyDeviceToDevice, stream);
+    auto status =
+        cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice);
     if (status != cudaSuccess) {
       LOG(FATAL) << "cudaMemcpy " << engine_name_ << " for '" << it.first
                  << "' failed with " << status;
     }
   }
-
-  // TODO(Sami, aaorey): Find an alternative way!
-  cudaStreamSynchronize(
-      stream);  // we have to wait for the stream before returning!
-  batch_is_set_ = true;
+  calib_running_.store(true, std::memory_order_release);  // release builder
   cond_.notify_all();
   return true;
 }
 
 bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
                                  int num_bindings) {
-  tensorflow::mutex_lock lock(cond_mtx_);
-  calib_running_ = false;
+  calib_running_.store(false, std::memory_order_release);  // wait for new batch
   cond_.notify_all();
-  while ((!batch_is_set_ && !done_)) {  // wait until new batch arrives
-    cond_.wait(lock);
-
+  while (!calib_running_.load(
+      std::memory_order_acquire)) {  // wait until new batch arrives
+    tensorflow::mutex_lock l(cond_mtx_);
+    cond_.wait_for(l, std::chrono::milliseconds(50));
+    if (done_) return false;
   }
   if (done_) {
     return false;
@@ -101,8 +100,6 @@ bool TRTInt8Calibrator::getBatch(void** bindings, const char** names,
 
     bindings[i] = it->second.first;
   }
-  batch_is_set_ = false;
-  calib_running_ = true;
   return true;
 }
 
@@ -110,12 +107,6 @@ const void* TRTInt8Calibrator::readCalibrationCache(std::size_t& length) {
   return nullptr;
 }
 
-void TRTInt8Calibrator::setDone() {
-  tensorflow::mutex_lock lock(cond_mtx_);
-  done_ = true;
-  cond_.notify_all();
-}
-
 void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
                                               std::size_t length) {}
 TRTInt8Calibrator::~TRTInt8Calibrator() {
@@ -124,6 +115,5 @@ TRTInt8Calibrator::~TRTInt8Calibrator() {
 
 }  // namespace tensorrt
 }  // namespace tensorflow
-
 #endif
 #endif
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
index d77aa2c5ab..8830f7efe7 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
@@ -24,10 +24,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-
-#include "cuda/include/cuda_runtime_api.h"
 #include "tensorrt/include/NvInfer.h"
-
 namespace tensorflow {
 namespace tensorrt {
 // This class provides a 1 element queue to match TFs push model to
@@ -42,9 +39,8 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
   int getBatchSize() const override;
   bool getBatch(void* bindings[], const char* names[],
                 int num_bindings) override;
-  bool setBatch(const std::unordered_map<string, void*>& data,
-                const cudaStream_t stream);
-  void setDone();
+  bool setBatch(const std::unordered_map<string, void*>& data);
+  void setDone() { done_ = true; }
   const void* readCalibrationCache(std::size_t& length) override;
   void writeCalibrationCache(const void* ptr, std::size_t length) override;
   ~TRTInt8Calibrator();
@@ -59,14 +55,11 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
   const std::unordered_map<string, std::pair<void*, size_t>>
       dev_buffers_;  // map to keep tensorrt input buffers and sizes keyed with
                      // buffer names
-  bool calib_running_;
-  bool batch_is_set_;
+  std::atomic_bool calib_running_;
   string engine_name_;
 };
-
 }  // namespace tensorrt
 }  // namespace tensorflow
-
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
 #endif
 #endif
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index 0b661bd536..c78f6f2224 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -60,7 +60,6 @@ def get_simple_graph_def():
 
 
 def run_graph(gdef, dumm_inp):
-  """Run given graphdef once."""
   gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
   ops.reset_default_graph()
   g = ops.Graph()
@@ -75,65 +74,15 @@ def run_graph(gdef, dumm_inp):
   return val
 
 
-# Use real data that is representatitive of the inference dataset
-# for calibration. For this test script it is random data.
-def run_calibration(gdef, dumm_inp):
-  """Run given calibration graph multiple times."""
-  gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
-  ops.reset_default_graph()
-  g = ops.Graph()
-  with g.as_default():
-    inp, out = importer.import_graph_def(
-        graph_def=gdef, return_elements=["input", "output"])
-    inp = inp.outputs[0]
-    out = out.outputs[0]
-  with csess.Session(
-      config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
-    # run over real calibration data here, we are mimicking a calibration set of
-    # 30 different batches. Use as much calibration data as you want
-    for _ in range(30):
-      val = sess.run(out, {inp: dumm_inp})
-  return val
-
-
 if "__main__" in __name__:
   inp_dims = (100, 24, 24, 2)
   dummy_input = np.random.random_sample(inp_dims)
-  orig_graph = get_simple_graph_def()  # use a frozen graph for inference
+  gdef = get_simple_graph_def()
   # Get optimized graph
-  trt_graph = trt.create_inference_graph(
-      input_graph_def=orig_graph,
-      outputs=["output"],
-      max_batch_size=inp_dims[0],
-      max_workspace_size_bytes=1 << 25,
-      precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
-      minimum_segment_size=2  # minimum number of nodes in an engine
-  )
-  o1 = run_graph(orig_graph, dummy_input)
+  trt_graph = trt.create_inference_graph(gdef, ["output"], inp_dims[0])
+  o1 = run_graph(gdef, dummy_input)
   o2 = run_graph(trt_graph, dummy_input)
   o3 = run_graph(trt_graph, dummy_input)
   assert np.array_equal(o1, o2)
   assert np.array_equal(o3, o2)  # sanity check
-  fp16_graph = trt.create_inference_graph(
-      input_graph_def=orig_graph,
-      outputs=["output"],
-      max_batch_size=inp_dims[0],
-      max_workspace_size_bytes=1 << 25,
-      precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
-      minimum_segment_size=2  # minimum number of nodes in an engine
-  )
-  int8_calib_gdef = trt.create_inference_graph(
-      input_graph_def=orig_graph,
-      outputs=["output"],
-      max_batch_size=inp_dims[0],
-      max_workspace_size_bytes=1 << 25,
-      precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
-      minimum_segment_size=2  # minimum number of nodes in an engine
-  )
-  o4 = run_graph(fp16_graph, dummy_input)
-  _ = run_calibration(int8_calib_gdef, dummy_input)
-  int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
-  o5 = run_graph(int8_graph, dummy_input)
-  assert np.allclose(o1, o4)
-  assert np.allclose(o1, o5)
   print("Pass")
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
index 46480e99a1..d679945d56 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -64,17 +64,13 @@ PyObject* pair_helper(std::pair<string, string>* in) {
 %ignoreall
 %unignore tensorflow;
 %unignore trt_convert;
-%unignore calib_convert;
 
 %{
-
 std::pair<string, string> trt_convert(
     string graph_def_string,  // The serialized GraphDef string.
     std::vector<string> output_names,
     size_t max_batch_size,
-    size_t max_workspace_size_bytes,
-    int precision_mode,
-    int minimum_segment_size
+    size_t max_workspace_size_bytes
     // Unfortunately we can't use TF_Status here since it
     // is in c/c_api and brings in a lot of other libraries
     // which in turn declare ops. These ops are included
@@ -94,64 +90,16 @@ std::pair<string, string> trt_convert(
     return std::pair<string, string>{out_status, ""};
   }
 
-  if(precision_mode < 0 || precision_mode > 2){
-    out_status = "InvalidArgument;Invalid precision_mode";
-    return std::pair<string, string>{out_status, ""};
-  }
   if (!output_names.size()) {
     out_status = "InvalidArgument;Size of the output_names vector is 0";
     return std::pair<string, string>{out_status, ""};
+    // return "";
   }
   tensorflow::GraphDef outGraph;
   tensorflow::Status conversion_status =
       tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
           graph_def, output_names, max_batch_size, max_workspace_size_bytes,
-          &outGraph, precision_mode, minimum_segment_size);
-  if (!conversion_status.ok()) {
-    auto retCode = (int)conversion_status.code();
-    char buff[2000];
-    snprintf(buff, 2000, "%d;%s", retCode,
-             conversion_status.error_message().c_str());
-    out_status = buff;
-    return std::pair<string, string>{out_status, ""};
-  }
-  string result;
-  if (!outGraph.SerializeToString(&result)) {
-    out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
-    return std::pair<string, string>{out_status, ""};
-  }
-  out_status = "OK;All good!";
-  return std::pair<string, string>{out_status, result};
-#else
-  // Returns FAILED_PRECONDITION.
-  return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
-}
-
-std::pair<string, string> calib_convert(string graph_def_string  //  const tensorflow::GraphDef&
-    // unfortunately we can't use TF_Status here since it
-    // is in c/c_api and brings in a lot of other libraries
-    // which in turn declare ops. These ops are included
-    // statically in our library and cause an abort when
-    // module is loaded due to double registration
-    // until Tensorflow properly exposes these headers
-    // we have to work around this by returning a string
-    // and converting it to exception on python side.
-    //,TF_Status* out_status) {
-) {
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
-  string out_status;
-
-  tensorflow::GraphDef graph_def;
-  if (!graph_def.ParseFromString(graph_def_string)) {
-    out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
-    return std::pair<string, string>{out_status, ""};
-  }
-
-  tensorflow::GraphDef outGraph;
-  tensorflow::Status conversion_status =
-      tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(graph_def,
-                                                                   &outGraph);
+          &outGraph);
   if (!conversion_status.ok()) {
     auto retCode = (int)conversion_status.code();
     char buff[2000];
@@ -174,13 +122,10 @@ std::pair<string, string> calib_convert(string graph_def_string  //  const tenso
 }
 %}
 
-std::pair<string, string> calib_convert(string graph_def_string);
-
 std::pair<string, string> trt_convert(string graph_def_string,
                                       std::vector<string> output_names,
                                       size_t max_batch_size,
-                                      size_t max_workspace_size_bytes,
-                                      int precision_mode, int minimum_segment_size);
+                                      size_t max_workspace_size_bytes);
 
 
 %unignoreall
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 70bf67c779..bb86ecb220 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -25,10 +25,7 @@ py_test(
     srcs = ["predict_test.py"],
     data = ["data/period_trend.csv"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",  # TODO: needs investigation on Windows
-        "notsan",  # b/67513579
-    ],
+    tags = ["notsan"],  # b/67513579
     deps = [
         ":predict",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 64f5cd8357..ed3ed4c0e1 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -156,7 +156,9 @@ py_test(
         "head_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_pip_gpu"],  # b/63391119
+    tags = [
+        "no_pip_gpu",  # b/63391119
+    ],
     deps = [
         ":feature_keys",
         ":head",
@@ -425,7 +427,6 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip_gpu",  # b/63391119
-        "no_windows",  # TODO: needs investigation on Windows
     ],
     deps = [
         ":feature_keys",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index 07df7bc9a5..c86d06e923 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -40,7 +40,6 @@ py_test(
     timeout = "long",  # Moderate but for asan
     srcs = ["state_space_model_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":state_space_model",
         "//tensorflow/contrib/layers:layers_py",
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index f9d433a45b..ed930e44e8 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -225,7 +225,6 @@ tf_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:layers",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 tf_py_test(
diff --git a/tensorflow/contrib/util/loader.py b/tensorflow/contrib/util/loader.py
index dca01d26f4..f4283cd9ed 100644
--- a/tensorflow/contrib/util/loader.py
+++ b/tensorflow/contrib/util/loader.py
@@ -42,10 +42,9 @@ def load_op_library(path):
     plugin.
   """
   if os.name == 'nt':
-    # To avoid making every user_ops aware of windows, re-write
-    # the file extension from .so to .dll if .so file doesn't exist.
-    if not os.path.exists(path):
-      path = re.sub(r'\.so$', '.dll', path)
+    # To avoid makeing every user_ops aware of windows, re-write
+    # the file extension from .so to .dll.
+    path = re.sub(r'\.so$', '.dll', path)
 
     # Currently we have only some user_ops as dlls on windows - don't try
     # to load them if the dll is not found.