author     Yifei Feng <yifeif@google.com>                     2018-05-24 19:12:26 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>    2018-05-24 19:15:01 -0700
commit     b59833c3fd91511b33255369016868e4ae6cda2e (patch)
tree       ecbd70cfd3abb5d934f6eb4b7280a35e8589f5cf /tensorflow
parent     2b99d9cbc7166efedaff9eee11744348da30fc8a (diff)

Merge changes from github.
Revert #18413. Too many internal test failures due to the name scope change caused by this change.
Revert #18192. Cannot use re2::StringPiece internally. Need alternative for set call. Will pull and clean this up in a separate change.

PiperOrigin-RevId: 197991247
Diffstat (limited to 'tensorflow')
-rw-r--r--  tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc | 2
-rw-r--r--  tensorflow/compiler/xla/README.md | 8
-rw-r--r--  tensorflow/compiler/xla/service/conditional_simplifier.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/copy_insertion.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_function.h | 4
-rw-r--r--  tensorflow/compiler/xla/service/cpu/shape_partition.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/despecializer.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/hlo_evaluator.cc | 1
-rw-r--r--  tensorflow/compiler/xla/service/interpreter/README.md | 2
-rw-r--r--  tensorflow/compiler/xla/service/layout_assignment.h | 4
-rw-r--r--  tensorflow/compiler/xla/service/reduce_precision_insertion.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/source_map_util.h | 2
-rw-r--r--  tensorflow/compiler/xla/shape_util.h | 2
-rw-r--r--  tensorflow/compiler/xla/tests/dot_operation_test.cc | 18
-rw-r--r--  tensorflow/compiler/xla/tests/tuple_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/xlalogo.png | bin 0 -> 46785 bytes
-rw-r--r--  tensorflow/contrib/autograph/impl/config.py | 2
-rw-r--r--  tensorflow/contrib/autograph/operators/control_flow.py | 2
-rw-r--r--  tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py | 2
-rw-r--r--  tensorflow/contrib/cmake/CMakeLists.txt | 29
-rw-r--r--  tensorflow/contrib/cmake/external/zlib.cmake | 3
-rw-r--r--  tensorflow/contrib/cmake/tf_tests.cmake | 2
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/BUILD | 4
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/resample_test.py | 109
-rw-r--r--  tensorflow/contrib/data/python/ops/BUILD | 2
-rw-r--r--  tensorflow/contrib/data/python/ops/resampling.py | 265
-rw-r--r--  tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py | 2
-rw-r--r--  tensorflow/contrib/eager/README.md | 2
-rw-r--r--  tensorflow/contrib/ffmpeg/ffmpeg_lib.h | 2
-rw-r--r--  tensorflow/contrib/framework/python/ops/critical_section_ops.py | 2
-rw-r--r--  tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py | 7
-rw-r--r--  tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py | 11
-rw-r--r--  tensorflow/contrib/gan/python/estimator/python/head_impl.py | 45
-rw-r--r--  tensorflow/contrib/gan/python/estimator/python/head_test.py | 7
-rw-r--r--  tensorflow/contrib/gan/python/features/python/conditioning_utils.py | 2
-rw-r--r--  tensorflow/contrib/graph_editor/transform.py | 2
-rw-r--r--  tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc | 2
-rwxr-xr-x  tensorflow/contrib/image/__init__.py | 2
-rw-r--r--  tensorflow/contrib/kfac/examples/convnet.py | 2
-rw-r--r--  tensorflow/contrib/kfac/python/ops/optimizer.py | 6
-rw-r--r--  tensorflow/contrib/kfac/python/ops/placement.py | 2
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers.py | 142
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers_test.py | 15
-rw-r--r--  tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py | 3
-rw-r--r--  tensorflow/contrib/lite/BUILD | 2
-rw-r--r--  tensorflow/contrib/lite/Makefile | 19
-rw-r--r--  tensorflow/contrib/lite/examples/minimal/minimal.cc | 71
-rw-r--r--  tensorflow/contrib/lite/g3doc/rpi.md | 2
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h | 2
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h | 4
-rw-r--r--  tensorflow/contrib/lite/schema/schema.fbs | 2
-rw-r--r--  tensorflow/contrib/lite/schema/schema_v0.fbs | 2
-rw-r--r--  tensorflow/contrib/lite/schema/schema_v1.fbs | 2
-rw-r--r--  tensorflow/contrib/lite/schema/schema_v2.fbs | 2
-rw-r--r--  tensorflow/contrib/lite/schema/schema_v3.fbs | 4
-rw-r--r--  tensorflow/contrib/lite/testing/generate_examples.py | 4
-rw-r--r--  tensorflow/contrib/lite/testing/tflite_driver.cc | 4
-rw-r--r--  tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md | 4
-rw-r--r--  tensorflow/contrib/lite/toco/tflite/operator.h | 4
-rw-r--r--  tensorflow/contrib/lite/toco/toco_flags.proto | 2
-rw-r--r--  tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py | 2
-rw-r--r--  tensorflow/contrib/opt/python/training/model_average_optimizer_test.py | 4
-rw-r--r--  tensorflow/contrib/signal/python/ops/window_ops.py | 4
-rw-r--r--  tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py | 2
-rw-r--r--  tensorflow/contrib/slim/python/slim/learning.py | 2
-rw-r--r--  tensorflow/contrib/tensorboard/db/summary_db_writer.cc | 22
-rw-r--r--  tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc | 50
-rw-r--r--  tensorflow/contrib/tensorrt/BUILD | 55
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_graph.cc | 123
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_graph.h | 10
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_nodes.cc | 501
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_nodes.h | 14
-rw-r--r--  tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc | 246
-rw-r--r--  tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h | 73
-rw-r--r--  tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD | 118
-rw-r--r--  tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py | 24
-rw-r--r--  tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py | 32
-rw-r--r--  tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc | 84
-rw-r--r--  tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h | 35
-rw-r--r--  tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc | 86
-rw-r--r--  tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h | 102
-rw-r--r--  tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc | 36
-rw-r--r--  tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py | 95
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc | 54
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_engine_op.h | 11
-rw-r--r--  tensorflow/contrib/tensorrt/log/trt_logger.h | 2
-rw-r--r--  tensorflow/contrib/tensorrt/plugin/trt_plugin.cc | 106
-rw-r--r--  tensorflow/contrib/tensorrt/plugin/trt_plugin.h | 74
-rw-r--r--  tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc | 78
-rw-r--r--  tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h | 102
-rw-r--r--  tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc | 125
-rw-r--r--  tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc | 42
-rw-r--r--  tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h | 46
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_allocator.cc | 62
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_allocator.h | 68
-rw-r--r--  tensorflow/contrib/tensorrt/resources/trt_resources.h | 44
-rw-r--r--  tensorflow/contrib/tensorrt/segment/segment.cc | 379
-rw-r--r--  tensorflow/contrib/tensorrt/segment/segment.h | 18
-rw-r--r--  tensorflow/contrib/tensorrt/segment/segment_test.cc | 16
-rw-r--r--  tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc | 4
-rw-r--r--  tensorflow/contrib/tensorrt/test/test_tftrt.py | 64
-rw-r--r--  tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py | 19
-rw-r--r--  tensorflow/contrib/tpu/python/tpu/tpu_context.py | 2
-rw-r--r--  tensorflow/contrib/verbs/README.md | 2
-rw-r--r--  tensorflow/core/BUILD | 7
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt | 30
-rw-r--r--  tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt | 4
-rw-r--r--  tensorflow/core/common_runtime/broadcaster.cc | 4
-rw-r--r--  tensorflow/core/common_runtime/buf_rendezvous.h | 2
-rw-r--r--  tensorflow/core/common_runtime/ring_reducer.cc | 2
-rw-r--r--  tensorflow/core/common_runtime/scoped_allocator_mgr.cc | 2
-rw-r--r--  tensorflow/core/debug/debug_io_utils.cc | 2
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc | 2
-rw-r--r--  tensorflow/core/example/example.proto | 2
-rw-r--r--  tensorflow/core/example/example_parser_configuration.proto | 1
-rw-r--r--  tensorflow/core/example/feature.proto | 2
-rw-r--r--  tensorflow/core/framework/allocation_description.proto | 1
-rw-r--r--  tensorflow/core/framework/api_def.proto | 1
-rw-r--r--  tensorflow/core/framework/attr_value.proto | 2
-rw-r--r--  tensorflow/core/framework/cost_graph.proto | 2
-rw-r--r--  tensorflow/core/framework/device_attributes.proto | 1
-rw-r--r--  tensorflow/core/framework/function.proto | 2
-rw-r--r--  tensorflow/core/framework/graph.proto | 2
-rw-r--r--  tensorflow/core/framework/graph_transfer_info.proto | 2
-rw-r--r--  tensorflow/core/framework/iterator.proto | 1
-rw-r--r--  tensorflow/core/framework/kernel_def.proto | 2
-rw-r--r--  tensorflow/core/framework/log_memory.proto | 2
-rw-r--r--  tensorflow/core/framework/node_def.proto | 2
-rw-r--r--  tensorflow/core/framework/op_def.proto | 2
-rw-r--r--  tensorflow/core/framework/op_gen_lib.h | 4
-rw-r--r--  tensorflow/core/framework/op_kernel.h | 2
-rw-r--r--  tensorflow/core/framework/reader_base.proto | 1
-rw-r--r--  tensorflow/core/framework/remote_fused_graph_execute_info.proto | 2
-rw-r--r--  tensorflow/core/framework/resource_handle.proto | 1
-rw-r--r--  tensorflow/core/framework/step_stats.proto | 2
-rw-r--r--  tensorflow/core/framework/summary.proto | 2
-rw-r--r--  tensorflow/core/framework/tensor.proto | 2
-rw-r--r--  tensorflow/core/framework/tensor_description.proto | 2
-rw-r--r--  tensorflow/core/framework/tensor_shape.proto | 1
-rw-r--r--  tensorflow/core/framework/tensor_slice.proto | 1
-rw-r--r--  tensorflow/core/framework/types.proto | 1
-rw-r--r--  tensorflow/core/framework/variable.proto | 1
-rw-r--r--  tensorflow/core/framework/versions.proto | 1
-rw-r--r--  tensorflow/core/graph/mkl_layout_pass_test.cc | 27
-rw-r--r--  tensorflow/core/graph/while_context.h | 2
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties.cc | 2
-rw-r--r--  tensorflow/core/grappler/costs/virtual_scheduler.h | 2
-rw-r--r--  tensorflow/core/grappler/optimizers/layout_optimizer.cc | 2
-rw-r--r--  tensorflow/core/kernels/BUILD | 8
-rw-r--r--  tensorflow/core/kernels/batch_matmul_op_impl.h | 106
-rw-r--r--  tensorflow/core/kernels/batch_matmul_op_real.cc | 4
-rw-r--r--  tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h | 2
-rw-r--r--  tensorflow/core/kernels/conv_grad_ops_3d.cc | 4
-rw-r--r--  tensorflow/core/kernels/conv_ops_gpu_3.cu.cc | 2
-rw-r--r--  tensorflow/core/kernels/nth_element_op.cc | 2
-rw-r--r--  tensorflow/core/kernels/regex_full_match_op.cc | 59
-rw-r--r--  tensorflow/core/kernels/roll_op.cc | 2
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops.cc | 4
-rw-r--r--  tensorflow/core/kernels/segment_reduction_ops.h | 2
-rw-r--r--  tensorflow/core/lib/core/error_codes.proto | 1
-rw-r--r--  tensorflow/core/ops/image_ops.cc | 19
-rw-r--r--  tensorflow/core/ops/image_ops_test.cc | 19
-rw-r--r--  tensorflow/core/ops/math_ops.cc | 2
-rw-r--r--  tensorflow/core/ops/nn_ops.cc | 3
-rw-r--r--  tensorflow/core/ops/random_ops.cc | 10
-rw-r--r--  tensorflow/core/ops/string_ops.cc | 11
-rw-r--r--  tensorflow/core/platform/cloud/gcs_file_system.cc | 2
-rw-r--r--  tensorflow/core/platform/cloud/gcs_throttle.h | 2
-rw-r--r--  tensorflow/core/profiler/g3doc/command_line.md | 2
-rw-r--r--  tensorflow/core/protobuf/cluster.proto | 1
-rw-r--r--  tensorflow/core/protobuf/config.proto | 2
-rw-r--r--  tensorflow/core/protobuf/control_flow.proto | 1
-rw-r--r--  tensorflow/core/protobuf/critical_section.proto | 1
-rw-r--r--  tensorflow/core/protobuf/debug.proto | 1
-rw-r--r--  tensorflow/core/protobuf/device_properties.proto | 1
-rw-r--r--  tensorflow/core/protobuf/master.proto | 2
-rw-r--r--  tensorflow/core/protobuf/master_service.proto | 2
-rw-r--r--  tensorflow/core/protobuf/meta_graph.proto | 2
-rw-r--r--  tensorflow/core/protobuf/named_tensor.proto | 2
-rw-r--r--  tensorflow/core/protobuf/queue_runner.proto | 2
-rw-r--r--  tensorflow/core/protobuf/rewriter_config.proto | 3
-rw-r--r--  tensorflow/core/protobuf/saved_model.proto | 2
-rw-r--r--  tensorflow/core/protobuf/saver.proto | 1
-rw-r--r--  tensorflow/core/protobuf/tensor_bundle.proto | 2
-rw-r--r--  tensorflow/core/protobuf/tensorflow_server.proto | 2
-rw-r--r--  tensorflow/core/protobuf/worker.proto | 2
-rw-r--r--  tensorflow/core/protobuf/worker_service.proto | 2
-rw-r--r--  tensorflow/core/public/version.h | 2
-rw-r--r--  tensorflow/core/util/cuda_device_functions.h | 2
-rw-r--r--  tensorflow/core/util/mkl_util.h | 2
-rw-r--r--  tensorflow/core/util/tensor_format.h | 2
-rw-r--r--  tensorflow/docs_src/api_guides/python/reading_data.md | 2
-rw-r--r--  tensorflow/docs_src/community/benchmarks.md | 18
-rw-r--r--  tensorflow/docs_src/community/swift.md | 2
-rw-r--r--  tensorflow/docs_src/deploy/s3.md | 2
-rw-r--r--  tensorflow/docs_src/extend/adding_an_op.md | 63
-rw-r--r--  tensorflow/docs_src/extend/architecture.md | 14
-rw-r--r--  tensorflow/docs_src/install/install_c.md | 2
-rw-r--r--  tensorflow/docs_src/install/install_go.md | 2
-rw-r--r--  tensorflow/docs_src/install/install_java.md | 22
-rw-r--r--  tensorflow/docs_src/install/install_linux.md | 18
-rw-r--r--  tensorflow/docs_src/install/install_mac.md | 10
-rw-r--r--  tensorflow/docs_src/install/install_sources.md | 4
-rw-r--r--  tensorflow/docs_src/mobile/mobile_intro.md | 2
-rw-r--r--  tensorflow/docs_src/mobile/tflite/index.md | 2
-rw-r--r--  tensorflow/docs_src/programmers_guide/faq.md | 17
-rw-r--r--  tensorflow/docs_src/programmers_guide/tensors.md | 6
-rw-r--r--  tensorflow/docs_src/programmers_guide/variables.md | 2
-rw-r--r--  tensorflow/docs_src/tutorials/layers.md | 1
-rw-r--r--  tensorflow/examples/learn/text_classification_cnn.py | 2
-rw-r--r--  tensorflow/go/op/wrappers.go | 2
-rw-r--r--  tensorflow/python/data/util/nest.py | 2
-rw-r--r--  tensorflow/python/debug/cli/curses_ui.py | 36
-rw-r--r--  tensorflow/python/estimator/estimator.py | 2
-rw-r--r--  tensorflow/python/estimator/inputs/queues/feeding_functions.py | 2
-rw-r--r--  tensorflow/python/estimator/keras.py | 2
-rw-r--r--  tensorflow/python/estimator/training.py | 2
-rw-r--r--  tensorflow/python/feature_column/feature_column.py | 6
-rw-r--r--  tensorflow/python/framework/fast_tensor_util.pyx | 12
-rw-r--r--  tensorflow/python/framework/ops.py | 26
-rw-r--r--  tensorflow/python/framework/tensor_util.py | 12
-rw-r--r--  tensorflow/python/framework/test_util.py | 2
-rw-r--r--  tensorflow/python/keras/utils/__init__.py | 1
-rw-r--r--  tensorflow/python/kernel_tests/BUILD | 12
-rw-r--r--  tensorflow/python/kernel_tests/conv1d_test.py | 4
-rw-r--r--  tensorflow/python/kernel_tests/conv3d_transpose_test.py | 17
-rw-r--r--  tensorflow/python/kernel_tests/distributions/util_test.py | 2
-rw-r--r--  tensorflow/python/kernel_tests/manip_ops_test.py | 2
-rw-r--r--  tensorflow/python/kernel_tests/regex_full_match_op_test.py | 54
-rw-r--r--  tensorflow/python/kernel_tests/segment_reduction_ops_test.py | 10
-rw-r--r--  tensorflow/python/layers/base.py | 14
-rw-r--r--  tensorflow/python/layers/base_test.py | 16
-rw-r--r--  tensorflow/python/ops/math_ops.py | 2
-rw-r--r--  tensorflow/python/ops/string_ops.py | 2
-rw-r--r--  tensorflow/python/profiler/model_analyzer_test.py | 7
-rw-r--r--  tensorflow/python/saved_model/builder_impl.py | 5
-rw-r--r--  tensorflow/python/training/distribute.py | 2
-rw-r--r--  tensorflow/python/training/saver.py | 2
-rw-r--r--  tensorflow/python/util/tf_inspect.py | 2
-rw-r--r--  tensorflow/python/util/util.cc | 2
-rw-r--r--  tensorflow/python/util/util.h | 2
-rw-r--r--  tensorflow/stream_executor/blas.h | 14
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_blas.cc | 106
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_blas.h | 6
-rw-r--r--  tensorflow/stream_executor/stream.cc | 34
-rw-r--r--  tensorflow/stream_executor/stream.h | 14
-rw-r--r--  tensorflow/tensorflow.bzl | 4
-rw-r--r--  tensorflow/tools/api/generator/BUILD | 1
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.pbtxt | 4
-rw-r--r--  tensorflow/tools/api/golden/tensorflow.strings.pbtxt | 7
-rwxr-xr-x  tensorflow/tools/ci_build/install/install_pip_packages.sh | 11
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel | 2
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel-cpu-mkl | 4
-rw-r--r--  tensorflow/tools/docker/Dockerfile.devel-gpu | 2
-rw-r--r--  tensorflow/tools/graph_transforms/README.md | 2
-rwxr-xr-x  tensorflow/tools/pip_package/build_pip_package.sh | 2
-rw-r--r--  tensorflow/tools/pip_package/setup.py | 4
258 files changed, 4096 insertions, 922 deletions
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index f06debaf31..6d1e3325eb 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -240,7 +240,7 @@ class Encapsulator {
// Once edges between compiled and outside_compilation clusters have been
// replaced by send/recv ops, some dependencies may no longer be apparent.
// A clustering pass finds all the dependencies between HC nodes that are only
- // present as a result of edges between nodes in outside_compilaton clusters.
+ // present as a result of edges between nodes in outside_compilation clusters.
// Suppose there is a path from outside_compilation cluster C in subgraph S
// to outside_compilation cluster D in subgraph T. If S != T then a control
// edge is added from the call node for S to the call node for T, which
diff --git a/tensorflow/compiler/xla/README.md b/tensorflow/compiler/xla/README.md
index c93c39e180..39f8caaa96 100644
--- a/tensorflow/compiler/xla/README.md
+++ b/tensorflow/compiler/xla/README.md
@@ -1 +1,7 @@
-This is the home of XLA.
+<p align="center">
+ <img width="200" src="xlalogo.png"/>
+</p>
+
+XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear
+algebra that optimizes TensorFlow computations. See the
+[documentation](https://www.tensorflow.org/performance/xla/) for more details.
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc
index e560abc87f..e9ec796121 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc
@@ -35,7 +35,7 @@ namespace xla {
// Tries to replace a conditional with a call operation of the corresponding
// computation. If the given conditional has a constant predicate, tries to
-// replace it with a call to its true/false computation as appropirate and then
+// replace it with a call to its true/false computation as appropriate and then
// inline that computation.
//
// Returns true if it made a change to the graph.
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index dce2014564..33d8338809 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -64,7 +64,7 @@ struct SpecialCaseCopyPolicy {
// output tuple.
bool copy_root_replicated_buffers = false;
// If true, insert a copy if a buffer coming from a constant or a parameter
- // is found wihtin the output tuple.
+ // is found within the output tuple.
bool copy_parameters_and_constants = false;
};
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h
index 557aa4a6bf..2e55181eed 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.h
@@ -33,8 +33,8 @@ namespace cpu {
// emitters for function and function argument access.
// The llvm::Function is created with the standard function signature
// used in the XLA CPU backend (see ir_function.cc for argument details).
-// In addtion IrFunction saves the callers IR insert point during contruction,
-// and restores it after desctruction.
+// In addition IrFunction saves the callers IR insert point during construction,
+// and restores it after destruction.
//
// Example usage:
//
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.h b/tensorflow/compiler/xla/service/cpu/shape_partition.h
index 33d02b70e6..db2cda2936 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition.h
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.h
@@ -38,7 +38,7 @@ namespace cpu {
//
// [0, 1), [1, 2), [2, 3), [3, 4), [4, 5) [5, 8)
//
-// Note that the last partition has residule because the dimension size is
+// Note that the last partition has residual because the dimension size is
// not a multiple of the partition count.
//
//
diff --git a/tensorflow/compiler/xla/service/despecializer.h b/tensorflow/compiler/xla/service/despecializer.h
index af48f4ab6e..cc1695b7f8 100644
--- a/tensorflow/compiler/xla/service/despecializer.h
+++ b/tensorflow/compiler/xla/service/despecializer.h
@@ -25,7 +25,7 @@ namespace xla {
// Creates an HloPassPipeline containing multiple HloPasses that can
// despecialize an optimized HloModule. This is useful to run an HloModule
-// optimized for one specfic platform on a different platform (undoing platform
+// optimized for one specific platform on a different platform (undoing platform
// specific passes) with matching numerics for comparison.
//
// Current despecialization passes are Defuser, ImplicitBroadcastRemover,
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index a1d4dca5e0..b41eaa303b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -38,7 +38,7 @@ namespace gpu {
//
// Examples of things that are not unnested computations:
//
-// - The reducer of a kReduce HLO. This is emited using IrEmitterNested.
+// - The reducer of a kReduce HLO. This is emitted using IrEmitterNested.
// - The body of a fusion node. IrEmitterUnenested emits the relevant code
// within a kernel function using FusedIrEmitter. (FusedIrEmitter is not
// really an IrEmitter, but is more an "IR generator generator".)
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 2beac3227e..fa59a5fb20 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -135,6 +135,7 @@ StatusOr<std::unique_ptr<Literal>> Compare<complex64>(
} // namespace
+
HloEvaluator::HloEvaluator(int64 max_loop_iterations)
: max_loop_iterations_(max_loop_iterations) {
typed_visitors_[PRED] = MakeUnique<HloEvaluatorTypedVisitor<bool>>(this);
diff --git a/tensorflow/compiler/xla/service/interpreter/README.md b/tensorflow/compiler/xla/service/interpreter/README.md
index 4c19a1b916..0b21b251c3 100644
--- a/tensorflow/compiler/xla/service/interpreter/README.md
+++ b/tensorflow/compiler/xla/service/interpreter/README.md
@@ -5,7 +5,7 @@ evaluating the result of the HLO graph directly with HloEvaluator, without
lowering it further (to LLVM IR for example) before execution as other backends
(CPU and GPU for example) do.
-Its key componenets are:
+Its key components are:
* [`InterpreterCompiler`] despite the inherited naming of "compiler", all
`InterpreterCompiler` really does is the following:
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 8b4e07995a..c287cca0c5 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -282,8 +282,8 @@ class LayoutAssignment : public HloPassInterface {
// the case that no particular layout is requested.
//
// channel_constraints is both an input and output. Any sends or recvs that
- // are present in channel_constraints will be layed out as constrained. Any
- // unconstrained sends or recvs will be layed out as locally optimal and their
+ // are present in channel_constraints will be laid out as constrained. Any
+ // unconstrained sends or recvs will be laid out as locally optimal and their
// layout will be added as a constraint to channel_constraints.
//
// If channel_constraints is nullptr, no kSend or kRecvs must be contained
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
index e2c07e3827..688cceff0c 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
@@ -75,7 +75,7 @@ StatusOr<bool> ReducePrecisionInsertion::insert_after(
return false;
}
- // Check that we haven't already inserted an equivalant reduce-precision
+ // Check that we haven't already inserted an equivalent reduce-precision
// operation after this instruction. (The zero-user case occurs when this is
// the root instruction.)
if (instruction->user_count() > 0) {
diff --git a/tensorflow/compiler/xla/service/source_map_util.h b/tensorflow/compiler/xla/service/source_map_util.h
index a776d745f4..18e2651abb 100644
--- a/tensorflow/compiler/xla/service/source_map_util.h
+++ b/tensorflow/compiler/xla/service/source_map_util.h
@@ -23,7 +23,7 @@ limitations under the License.
namespace xla {
namespace source_map_util {
-// Creates an INVALID_ARUGMENT status with the given format string.
+// Creates an INVALID_ARGUMENT status with the given format string.
//
// Also, attempts to extract the OpMetadata for parameter_number on executable
// and append it to the status message for source mapping to user code.
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 73e014805f..6f5765849a 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -234,7 +234,7 @@ class ShapeUtil {
}
// Returns the higher-precision element type if a and b are both floating
- // point types; otherwise, checks that that they have the same element type
+ // point types; otherwise, checks that they have the same element type
// and returns it.
static PrimitiveType HigherPrecisionElementType(const Shape& a,
const Shape& b) {
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index efa5aed2d1..0fd846cef8 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -61,7 +61,7 @@ using TypesF16F32F64CF64 = ::testing::Types<Eigen::half, float>;
#endif
// Check that we can safely pass an input tuple's elements to a dot operation.
-TEST_F(DotOperationTest, DotOfInputTupleElem) {
+XLA_TEST_F(DotOperationTest, DotOfInputTupleElem) {
XlaBuilder builder(TestName());
XlaOp param;
@@ -798,7 +798,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64,
this->error_spec_);
}
-TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) {
+XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) {
std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
{{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
std::unique_ptr<Array2D<float>> constant_rhs_array(
@@ -826,7 +826,7 @@ TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) {
ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
}
-TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) {
+XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) {
std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
{{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
std::unique_ptr<Array2D<float>> constant_rhs_array(
@@ -855,7 +855,7 @@ TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) {
}
// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER(
DotOfGatherOptimizationWithConstRHSReverseMM)))) {
std::unique_ptr<Array2D<float>> constant_lhs_array(
@@ -886,7 +886,7 @@ TEST_F(DotOperationTest,
}
// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER(
DotOfGatherOptimizationWithConstLHSReverseMM)))) {
std::unique_ptr<Array2D<float>> constant_lhs_array(
@@ -917,7 +917,7 @@ TEST_F(DotOperationTest,
}
// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
DISABLED_ON_CPU(DISABLED_ON_GPU(
DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSRows)))) {
std::unique_ptr<Array2D<float>> constant_lhs_array(
@@ -953,7 +953,7 @@ TEST_F(DotOperationTest,
}
// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
DISABLED_ON_CPU(DISABLED_ON_GPU(
DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSRows)))) {
std::unique_ptr<Array2D<float>> constant_lhs_array(
@@ -989,7 +989,7 @@ TEST_F(DotOperationTest,
}
// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
DISABLED_ON_CPU(DISABLED_ON_GPU(
DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSCols)))) {
std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
@@ -1017,7 +1017,7 @@ TEST_F(DotOperationTest,
}
// TODO (b/69062148) Enable when Dot implements general contracting dimensions.
-TEST_F(DotOperationTest,
+XLA_TEST_F(DotOperationTest,
DISABLED_ON_CPU(DISABLED_ON_GPU(
DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSCols)))) {
std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 098443824e..41189231b9 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -514,7 +514,7 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
class TupleHloTest : public HloTestBase {};
// Disabled on the interpreter because bitcast doesn't exist on the interpreter.
-TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
+XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
const char* testcase = R"(
HloModule m
diff --git a/tensorflow/compiler/xla/xlalogo.png b/tensorflow/compiler/xla/xlalogo.png
new file mode 100644
index 0000000000..7a0a295953
--- /dev/null
+++ b/tensorflow/compiler/xla/xlalogo.png
Binary files differ
diff --git a/tensorflow/contrib/autograph/impl/config.py b/tensorflow/contrib/autograph/impl/config.py
index 2600088595..878bb7e12f 100644
--- a/tensorflow/contrib/autograph/impl/config.py
+++ b/tensorflow/contrib/autograph/impl/config.py
@@ -33,7 +33,7 @@ DEFAULT_UNCOMPILED_MODULES = set((
(utils.__name__,),
# All of tensorflow's subpackages. Unlike the root tf module, they don't
- # have well-known names. Not refering to the module directly to avoid
+ # have well-known names. Not referring to the module directly to avoid
# circular imports.
(
utils.__name__[:-len('.contrib.autograph.utils')],),
diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
index 9f7202821f..671c9ccc13 100644
--- a/tensorflow/contrib/autograph/operators/control_flow.py
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -174,7 +174,7 @@ def while_stmt(test, body, init_state, extra_deps, opts=None):
Tuple containing the final state.
"""
# TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
- # That could be somethins as simple as a collection of dispatch rules, with
+ # That could be something as simple as a collection of dispatch rules, with
# some prioritization.
if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
return _tf_while_stmt(test, body, init_state, opts)
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index c492ef19f1..5dd2e0c7f2 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -371,7 +371,7 @@ class GradientBoostedDecisionTreeModel(object):
Returns:
a dictionary of prediction results -
ENSEMBLE_STAMP, PREDICTION, PARTITION_IDS,
- NUM_LAYER_ATTEMPTED, NUM_TREES_ATTEMPED.
+ NUM_LAYER_ATTEMPTED, NUM_TREES_ATTEMPTED.
"""
ensemble_stats = training_ops.tree_ensemble_stats(ensemble_handle,
ensemble_stamp)
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 44e39f7f7b..0708d6b7b9 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -172,19 +172,20 @@ if (tensorflow_OPTIMIZE_FOR_NATIVE_ARCH)
endif()
endif()
+include(CheckCXXCompilerFlag)
+
+# OpenMP Support
+CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT)
+if (GCC_OPENMP_SUPPORT)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+endif()
+CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT)
+if (MSVC_OPENMP_SUPPORT)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+endif()
+
# MSVC SIMD instructions
if (tensorflow_WIN_CPU_SIMD_OPTIONS)
- include(CheckCXXCompilerFlag)
- if (tensorflow_ENABLE_MKL_SUPPORT)
- add_definitions(-DINTEL_MKL -DEIGEN_USE_VML)
- if (NOT tensorflow_ENABLE_MKLDNN_SUPPORT)
- add_definitions(-DINTEL_MKL_ML)
- endif()
- endif()
- CHECK_CXX_COMPILER_FLAG("-fopenmp" COMPILER_OPT_OPENMP_SUPPORT)
- if (COMPILER_OPT_OPENMP_SUPPORT)
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
- endif()
if (WIN32)
CHECK_CXX_COMPILER_FLAG(${tensorflow_WIN_CPU_SIMD_OPTIONS} COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED)
@@ -323,10 +324,13 @@ if(HAIKU)
list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
endif()
+# MKL Support
if (tensorflow_ENABLE_MKL_SUPPORT)
+ add_definitions(-DINTEL_MKL -DEIGEN_USE_VML)
if (WIN32)
find_path(MKL_HOME_PLATFORM mkl
PATHS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+ $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../
PATH_SUFFIXES windows)
set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
set(MKL_LINK_DIRS
@@ -345,6 +349,7 @@ if (tensorflow_ENABLE_MKL_SUPPORT)
# Fix me: complete the path on linux
find_path(MKL_HOME_PLATFORM mkl
HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../
+ $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../
PATH_SUFFIXES linux)
set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include)
set(MKL_LINK_DIRS) # incompleted
@@ -357,6 +362,8 @@ if (tensorflow_ENABLE_MKL_SUPPORT)
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES})
list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn)
include_directories(${mkldnn_INCLUDE_DIRS})
+ else (tensorflow_ENABLE_MKLDNN_SUPPORT)
+ add_definitions(-DINTEL_MKL_ML)
endif()
endif (tensorflow_ENABLE_MKL_SUPPORT)
diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake
index 116d423093..8942f3eecf 100644
--- a/tensorflow/contrib/cmake/external/zlib.cmake
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@@ -31,7 +31,8 @@ else (systemlib_ZLIB)
set(ZLIB_URL https://github.com/madler/zlib)
set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
- set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
+ # Match zlib version in tensorflow/workspace.bzl
+ set(ZLIB_TAG v1.2.11)
if(WIN32)
if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 92f2ab6dea..5942ff3363 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -267,6 +267,8 @@ if (tensorflow_BUILD_PYTHON_TESTS)
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/variable_scope_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py"
+ # Flaky on Windows cpu with py36 (b/73556968)
+ "${tensorflow_source_dir}/tensorflow/python/kernel_tests/sparse_reshape_op_test.py"
# Windows file management related issues.
"${tensorflow_source_dir}/tensorflow/python/training/evaluation_test.py"
# training tests
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index d269b5b69a..c483a43769 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -355,11 +355,15 @@ py_test(
deps = [
"//tensorflow/contrib/data/python/ops:resampling",
"//tensorflow/python:client_testlib",
+ "//tensorflow/python:dtypes",
"//tensorflow/python:errors",
+ "//tensorflow/python:math_ops",
+ "//tensorflow/python:random_ops",
"//tensorflow/python:string_ops",
"//tensorflow/python:util",
"//tensorflow/python/data/ops:dataset_ops",
"//third_party/py/numpy",
+ "@absl_py//absl/testing:parameterized",
],
)
diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
index 5f47dcb339..bdc003a8a5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
@@ -18,6 +18,9 @@ from __future__ import division
from __future__ import print_function
import numpy as np
+from six.moves import xrange # pylint: disable=redefined-builtin
+import time
+from absl.testing import parameterized
from tensorflow.contrib.data.python.ops import resampling
from tensorflow.python.data.ops import dataset_ops
@@ -30,52 +33,98 @@ from tensorflow.python.platform import test
from tensorflow.python.util import compat
-class ResampleTest(test.TestCase):
+def _time_resampling(
+ test_obj, data_np, target_dist, init_dist, num_to_sample):
+ dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()
- def testInitialKnownDistribution(self):
- self._testDistribution(initial_known=True)
+ # Reshape distribution via rejection sampling.
+ dataset = dataset.apply(
+ resampling.rejection_resample(
+ class_func=lambda x: x,
+ target_dist=target_dist,
+ initial_dist=init_dist,
+ seed=142))
- def testInitialNotKnownDistribution(self):
- self._testDistribution(initial_known=False)
+ get_next = dataset.make_one_shot_iterator().get_next()
- def _testDistribution(self, initial_known):
+ with test_obj.test_session() as sess:
+ start_time = time.time()
+ for _ in xrange(num_to_sample):
+ sess.run(get_next)
+ end_time = time.time()
+
+ return end_time - start_time
+
+
+class ResampleTest(test.TestCase, parameterized.TestCase):
+
+ @parameterized.named_parameters(
+ ("InitialDistributionKnown", True),
+ ("InitialDistributionUnknown", False))
+ def testDistribution(self, initial_known):
classes = np.random.randint(5, size=(20000,)) # Uniformly sampled
target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
initial_dist = [0.2] * 5 if initial_known else None
- iterator = (dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
- 200, seed=21).map(lambda c: (c, string_ops.as_string(c))).apply(
- resampling.rejection_resample(
- target_dist=target_dist,
- initial_dist=initial_dist,
- class_func=lambda c, _: c,
- seed=27)).make_one_shot_iterator())
- get_next = iterator.get_next()
+ classes = math_ops.to_int64(classes) # needed for Windows build.
+ dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
+ 200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()
+
+ get_next = dataset.apply(
+ resampling.rejection_resample(
+ target_dist=target_dist,
+ initial_dist=initial_dist,
+ class_func=lambda c, _: c,
+ seed=27)).make_one_shot_iterator().get_next()
with self.test_session() as sess:
returned = []
- with self.assertRaises(errors.OutOfRangeError):
- while True:
- returned.append(sess.run(get_next))
+ while len(returned) < 4000:
+ returned.append(sess.run(get_next))
returned_classes, returned_classes_and_data = zip(*returned)
_, returned_data = zip(*returned_classes_and_data)
self.assertAllEqual([compat.as_bytes(str(c))
for c in returned_classes], returned_data)
total_returned = len(returned_classes)
- # Subsampling rejects a large percentage of the initial data in
- # this case.
- self.assertGreater(total_returned, 20000 * 0.2)
class_counts = np.array([
len([True for v in returned_classes if v == c])
for c in range(5)])
returned_dist = class_counts / total_returned
self.assertAllClose(target_dist, returned_dist, atol=1e-2)
+ @parameterized.named_parameters(
+ ("OnlyInitial", True),
+ ("NotInitial", False))
+ def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
+ init_dist = [0.5, 0.5]
+ target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0]
+ num_classes = len(init_dist)
+ # We don't need many samples to test that this works.
+ num_samples = 100
+ data_np = np.random.choice(num_classes, num_samples, p=init_dist)
+
+ dataset = dataset_ops.Dataset.from_tensor_slices(data_np)
+
+ # Reshape distribution.
+ dataset = dataset.apply(
+ resampling.rejection_resample(
+ class_func=lambda x: x,
+ target_dist=target_dist,
+ initial_dist=init_dist))
+
+ get_next = dataset.make_one_shot_iterator().get_next()
+
+ with self.test_session() as sess:
+ returned = []
+ with self.assertRaises(errors.OutOfRangeError):
+ while True:
+ returned.append(sess.run(get_next))
+
def testRandomClasses(self):
init_dist = [0.25, 0.25, 0.25, 0.25]
target_dist = [0.0, 0.0, 0.0, 1.0]
num_classes = len(init_dist)
- # We don't need many samples to test a dirac-delta target distribution
+ # We don't need many samples to test a dirac-delta target distribution.
num_samples = 100
data_np = np.random.choice(num_classes, num_samples, p=init_dist)
@@ -109,5 +158,23 @@ class ResampleTest(test.TestCase):
self.assertAllClose(target_dist, bincount, atol=1e-2)
+
+class ResampleDatasetBenchmark(test.Benchmark):
+
+ def benchmarkResamplePerformance(self):
+ init_dist = [0.25, 0.25, 0.25, 0.25]
+ target_dist = [0.0, 0.0, 0.0, 1.0]
+ num_classes = len(init_dist)
+ # We don't need many samples to test a dirac-delta target distribution
+ num_samples = 1000
+ data_np = np.random.choice(num_classes, num_samples, p=init_dist)
+
+ resample_time = _time_resampling(
+ self, data_np, target_dist, init_dist, num_to_sample=1000)
+
+ self.report_benchmark(
+ iters=1000, wall_time=resample_time, name="benchmark_resample")
+
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 144460fde0..eceecfd174 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -214,6 +214,7 @@ py_library(
srcs_version = "PY2AND3",
deps = [
":batching",
+ ":interleave_ops",
":scan_ops",
"//tensorflow/python:array_ops",
"//tensorflow/python:control_flow_ops",
@@ -223,6 +224,7 @@ py_library(
"//tensorflow/python:math_ops",
"//tensorflow/python:random_ops",
"//tensorflow/python/data/ops:dataset_ops",
+ "//third_party/py/numpy",
],
)
diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py
index a182dddd38..bad6edd514 100644
--- a/tensorflow/contrib/data/python/ops/resampling.py
+++ b/tensorflow/contrib/data/python/ops/resampling.py
@@ -20,10 +20,12 @@ from __future__ import print_function
import numpy as np
from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.data.python.ops import interleave_ops
from tensorflow.contrib.data.python.ops import scan_ops
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import logging_ops
@@ -50,79 +52,182 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None):
A `Dataset` transformation function, which can be passed to
@{tf.data.Dataset.apply}.
"""
-
def _apply_fn(dataset):
"""Function from `Dataset` to `Dataset` that applies the transformation."""
- dist_estimation_batch_size = 32
target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist")
class_values_ds = dataset.map(class_func)
+
+ # Get initial distribution.
if initial_dist is not None:
initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist")
- acceptance_dist = _calculate_acceptance_probs(initial_dist_t,
- target_dist_t)
+ acceptance_dist, prob_of_original = (
+ _calculate_acceptance_probs_with_mixing(initial_dist_t,
+ target_dist_t))
initial_dist_ds = dataset_ops.Dataset.from_tensors(
initial_dist_t).repeat()
acceptance_dist_ds = dataset_ops.Dataset.from_tensors(
acceptance_dist).repeat()
+ prob_of_original_ds = dataset_ops.Dataset.from_tensors(
+ prob_of_original).repeat()
+ else:
+ initial_dist_ds = _estimate_initial_dist_ds(
+ target_dist_t, class_values_ds)
+ acceptance_and_original_prob_ds = initial_dist_ds.map(
+ lambda initial: _calculate_acceptance_probs_with_mixing(
+ initial, target_dist_t))
+ acceptance_dist_ds = acceptance_and_original_prob_ds.map(
+ lambda accept_prob, _: accept_prob)
+ prob_of_original_ds = acceptance_and_original_prob_ds.map(
+ lambda _, prob_original: prob_original)
+ filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds,
+ class_values_ds, seed)
+ # Prefetch filtered dataset for speed.
+ filtered_ds = filtered_ds.prefetch(3)
+
+ prob_original_static = _get_prob_original_static(
+ initial_dist_t, target_dist_t) if initial_dist is not None else None
+ if prob_original_static == 1:
+ return dataset_ops.Dataset.zip((class_values_ds, dataset))
+ elif prob_original_static == 0:
+ return filtered_ds
else:
- num_classes = (target_dist_t.shape[0].value or
- array_ops.shape(target_dist_t)[0])
- smoothing_constant = 10
- initial_examples_per_class_seen = array_ops.fill(
- [num_classes], np.int64(smoothing_constant))
-
- def update_estimate_and_tile(num_examples_per_class_seen, c):
- updated_examples_per_class_seen, dist = _estimate_data_distribution(
- c, num_examples_per_class_seen)
- tiled_dist = array_ops.tile(
- array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1])
- return updated_examples_per_class_seen, tiled_dist
-
- initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size)
- .apply(scan_ops.scan(initial_examples_per_class_seen,
- update_estimate_and_tile))
- .apply(batching.unbatch()))
- acceptance_dist_ds = initial_dist_ds.map(
- lambda initial: _calculate_acceptance_probs(initial, target_dist_t))
-
- def maybe_warn_on_large_rejection(accept_dist, initial_dist):
- proportion_rejected = math_ops.reduce_sum(
- (1 - accept_dist) * initial_dist)
- return control_flow_ops.cond(
- math_ops.less(proportion_rejected, .5),
- lambda: accept_dist,
- lambda: logging_ops.Print( # pylint: disable=g-long-lambda
- accept_dist, [proportion_rejected, initial_dist, accept_dist],
- message="Proportion of examples rejected by sampler is high: ",
- summarize=100,
- first_n=10))
-
- acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds,
- initial_dist_ds))
- .map(maybe_warn_on_large_rejection))
-
- def _gather_and_copy(class_val, acceptance_prob, data):
- return (class_val, array_ops.gather(acceptance_prob, class_val), data)
- current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip(
- (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy)
- filtered_ds = (
- current_probabilities_and_class_and_data_ds
- .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
- return filtered_ds.map(lambda class_value, _, data: (class_value, data))
+ return interleave_ops.sample_from_datasets(
+ [dataset_ops.Dataset.zip((class_values_ds, dataset)), filtered_ds],
+ weights=prob_of_original_ds.map(lambda prob: [(prob, 1.0 - prob)]),
+ seed=seed)
return _apply_fn
-def _calculate_acceptance_probs(initial_probs, target_probs):
- """Calculate the per-class acceptance rates.
+def _get_prob_original_static(initial_dist_t, target_dist_t):
+ """Returns the static probability of sampling from the original.
+
+ `tensor_util.constant_value(prob_of_original)` returns `None` if it encounters
+ an Op that it isn't defined for. We have some custom logic to avoid this.
+
+ Args:
+ initial_dist_t: A tensor of the initial distribution.
+ target_dist_t: A tensor of the target distribution.
+
+ Returns:
+ The probability of sampling from the original distribution as a constant,
+ if it is a constant, or `None`.
+ """
+ init_static = tensor_util.constant_value(initial_dist_t)
+ target_static = tensor_util.constant_value(target_dist_t)
+
+ if init_static is None or target_static is None:
+ return None
+ else:
+ return np.min(target_static / init_static)
+
+
+def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds,
+ seed):
+ """Filters a dataset based on per-class acceptance probabilities.
Args:
- initial_probs: The class probabilities of the data.
- target_probs: The desired class proportion in minibatches.
+ dataset: The dataset to be filtered.
+ acceptance_dist_ds: A dataset of acceptance probabilities.
+ initial_dist_ds: A dataset of the initial probability distribution, given or
+ estimated.
+ class_values_ds: A dataset of the corresponding classes.
+ seed: (Optional.) Python integer seed for the resampler.
+
Returns:
- A list of the per-class acceptance probabilities.
+ A dataset of (class value, data) after filtering.
+ """
+ def maybe_warn_on_large_rejection(accept_dist, initial_dist):
+ proportion_rejected = math_ops.reduce_sum((1 - accept_dist) * initial_dist)
+ return control_flow_ops.cond(
+ math_ops.less(proportion_rejected, .5),
+ lambda: accept_dist,
+ lambda: logging_ops.Print( # pylint: disable=g-long-lambda
+ accept_dist, [proportion_rejected, initial_dist, accept_dist],
+ message="Proportion of examples rejected by sampler is high: ",
+ summarize=100,
+ first_n=10))
+
+ acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds,
+ initial_dist_ds))
+ .map(maybe_warn_on_large_rejection))
+
+ def _gather_and_copy(class_val, acceptance_prob, data):
+ return class_val, array_ops.gather(acceptance_prob, class_val), data
+
+ current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip(
+ (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy)
+ filtered_ds = (
+ current_probabilities_and_class_and_data_ds
+ .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
+ return filtered_ds.map(lambda class_value, _, data: (class_value, data))
+
+
+def _estimate_initial_dist_ds(
+ target_dist_t, class_values_ds, dist_estimation_batch_size=32,
+ smoothing_constant=10):
+ num_classes = (target_dist_t.shape[0].value or
+ array_ops.shape(target_dist_t)[0])
+ initial_examples_per_class_seen = array_ops.fill(
+ [num_classes], np.int64(smoothing_constant))
+
+ def update_estimate_and_tile(num_examples_per_class_seen, c):
+ updated_examples_per_class_seen, dist = _estimate_data_distribution(
+ c, num_examples_per_class_seen)
+ tiled_dist = array_ops.tile(
+ array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1])
+ return updated_examples_per_class_seen, tiled_dist
- This method is based on solving the following analysis:
+ initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size)
+ .apply(scan_ops.scan(initial_examples_per_class_seen,
+ update_estimate_and_tile))
+ .apply(batching.unbatch()))
+
+ return initial_dist_ds
+
+
+def _get_target_to_initial_ratio(initial_probs, target_probs):
+ # Add tiny to initial_probs to avoid divide by zero.
+ denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny)
+ return target_probs / denom
+
+
+def _estimate_data_distribution(c, num_examples_per_class_seen):
+ """Estimate data distribution as labels are seen.
+
+ Args:
+ c: The class labels. Type `int32`, shape `[batch_size]`.
+ num_examples_per_class_seen: Type `int64`, shape `[num_classes]`,
+ containing counts.
+
+ Returns:
+ num_examples_per_lass_seen: Updated counts. Type `int64`, shape
+ `[num_classes]`.
+ dist: The updated distribution. Type `float32`, shape `[num_classes]`.
+ """
+ num_classes = num_examples_per_class_seen.get_shape()[0].value
+ # Update the class-count based on what labels are seen in batch.
+ num_examples_per_class_seen = math_ops.add(
+ num_examples_per_class_seen, math_ops.reduce_sum(
+ array_ops.one_hot(c, num_classes, dtype=dtypes.int64), 0))
+ init_prob_estimate = math_ops.truediv(
+ num_examples_per_class_seen,
+ math_ops.reduce_sum(num_examples_per_class_seen))
+ dist = math_ops.cast(init_prob_estimate, dtypes.float32)
+ return num_examples_per_class_seen, dist
+
+
+def _calculate_acceptance_probs_with_mixing(initial_probs, target_probs):
+ """Calculates the acceptance probabilities and mixing ratio.
+
+ In this case, we assume that we can *either* sample from the original data
+ distribution with probability `m`, or sample from a reshaped distribution
+ that comes from rejection sampling on the original distribution. This
+ rejection sampling is done on a per-class basis, with `a_i` representing the
+ probability of accepting data from class `i`.
+
+ This method is based on solving the following analysis for the reshaped
+ distribution:
Let F be the probability of a rejection (on any example).
Let p_i be the proportion of examples in the data in class i (init_probs)
@@ -151,39 +256,39 @@ def _calculate_acceptance_probs(initial_probs, target_probs):
0 <= t_i <= 1, sum_i(t_i) = 1
```
-
A solution for a_i in terms of the other variables is the following:
```a_i = (t_i / p_i) / max_i[t_i / p_i]```
- """
- # Add tiny to initial_probs to avoid divide by zero.
- denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny)
- ratio_l = target_probs / denom
- # Calculate list of acceptance probabilities.
- max_ratio = math_ops.reduce_max(ratio_l)
- return ratio_l / max_ratio
+ If we try to minimize the amount of data rejected, we get the following:
+ M_max = max_i [ t_i / p_i ]
+ M_min = min_i [ t_i / p_i ]
-def _estimate_data_distribution(c, num_examples_per_class_seen):
- """Estimate data distribution as labels are seen.
+ The desired probability of accepting data if it comes from class `i`:
+
+ a_i = (t_i/p_i - m) / (M_max - m)
+
+ The desired probability of pulling a data element from the original dataset,
+ rather than the filtered one:
+
+ m = M_min
Args:
- c: The class labels. Type `int32`, shape `[batch_size]`.
- num_examples_per_class_seen: Type `int64`, shape `[num_classes]`,
- containing counts.
+ initial_probs: A Tensor of the initial probability distribution, given or
+ estimated.
+ target_probs: A Tensor of the corresponding classes.
Returns:
- num_examples_per_lass_seen: Updated counts. Type `int64`, shape
- `[num_classes]`.
- dist: The updated distribution. Type `float32`, shape `[num_classes]`.
+ (A 1D Tensor with the per-class acceptance probabilities, the desired
+ probability of pull from the original distribution.)
"""
- num_classes = num_examples_per_class_seen.get_shape()[0].value
- # Update the class-count based on what labels are seen in batch.
- num_examples_per_class_seen = math_ops.add(
- num_examples_per_class_seen, math_ops.reduce_sum(
- array_ops.one_hot(c, num_classes, dtype=dtypes.int64), 0))
- init_prob_estimate = math_ops.truediv(
- num_examples_per_class_seen,
- math_ops.reduce_sum(num_examples_per_class_seen))
- dist = math_ops.cast(init_prob_estimate, dtypes.float32)
- return num_examples_per_class_seen, dist
+ ratio_l = _get_target_to_initial_ratio(initial_probs, target_probs)
+ max_ratio = math_ops.reduce_max(ratio_l)
+ min_ratio = math_ops.reduce_min(ratio_l)
+
+ # Target prob to sample from original distribution.
+ m = min_ratio
+
+ # TODO(joelshor): Simplify fraction, if possible.
+ a_i = (ratio_l - m) / (max_ratio - m)
+  return a_i, m
\ No newline at end of file
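
For reference, the mixing math documented above can be checked with a small standalone NumPy sketch (illustrative only; the distributions below are the ones used in `testRandomClasses` in the accompanying test file, and nothing here is part of the patch itself):

```python
import numpy as np

# Illustrative initial and target class distributions.
p = np.array([0.25, 0.25, 0.25, 0.25])   # initial_probs
t = np.array([0.0, 0.0, 0.0, 1.0])       # target_probs

# ratio_l = t_i / p_i, guarded against divide-by-zero as in _get_target_to_initial_ratio.
ratio = t / (p + np.finfo(p.dtype).tiny)

m = ratio.min()                        # M_min: probability of sampling from the original dataset
a = (ratio - m) / (ratio.max() - m)    # per-class acceptance probabilities a_i

print(m)  # 0.0 -> never fall back to the unfiltered dataset for this target
print(a)  # [0. 0. 0. 1.] -> only class 3 survives rejection sampling
```
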
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index ecdb8967f4..268c8d0342 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -53,7 +53,7 @@ class CholeskyOuterProduct(bijector.Bijector):
its spectrum), and that the product of two positive-diagonal lower-triangular
matrices is another positive-diagonal lower-triangular matrix.
- A simple inductive argument (proceding one column of L_3 at a time) shows
+ A simple inductive argument (proceeding one column of L_3 at a time) shows
that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index 762685db14..4384431e7b 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -1,6 +1,6 @@
# Eager Execution
-Eager execution provides an imperative interface to TensorFlow (similiar to
+Eager execution provides an imperative interface to TensorFlow (similar to
[NumPy](http://www.numpy.org)). When you enable eager execution, TensorFlow
operations execute immediately; you do not execute a pre-constructed graph with
[`Session.run()`](https://www.tensorflow.org/api_docs/python/tf/Session).
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
index a8d5a0dd83..bf2aa75545 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
@@ -53,7 +53,7 @@ Status CreateAudioFile(const string& audio_format_id, int32 bits_per_second,
int32 samples_per_second, int32 channel_count,
const std::vector<float>& samples, string* output_data);
-// Reads an video file using ffmpeg adn converts it into a RGB24 in uint8
+// Reads a video file using ffmpeg and converts it into RGB24 in uint8
// [frames, height, width, 3]. The w, h, and frames are obtained from ffmpeg.
Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
uint32* width, uint32* height, uint32* frames);
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
index bd764ed57a..72835c3ad8 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_ops.py
+++ b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
@@ -202,7 +202,7 @@ class CriticalSection(object):
or lazy way that may cause a deadlock.
ValueError: If `exclusive_resource_access` is not provided (is `True`) and
another `CriticalSection` has an execution requesting the same
- resources as in `*args`, `**kwargs`, and any additionaly captured
+ resources as in `*args`, `**kwargs`, and any additionally captured
inputs in `fn`. Note, even if `exclusive_resource_access` is `True`,
if another execution in another `CriticalSection` was created without
`exclusive_resource_access=True`, a `ValueError` will be raised.
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index e3fc6bf0f0..4092b32004 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -112,6 +112,7 @@ class GANEstimator(estimator.Estimator):
generator_optimizer=None,
discriminator_optimizer=None,
get_hooks_fn=None,
+ get_eval_metric_ops_fn=None,
add_summaries=None,
use_loss_summaries=True,
config=None):
@@ -146,6 +147,9 @@ class GANEstimator(estimator.Estimator):
list of hooks. These hooks are run on the generator and discriminator
train ops, and can be used to implement the GAN training scheme.
Defaults to `train.get_sequential_train_hooks()`.
+ get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a
+ dict of metric results keyed by name. The output of this function is
+ passed into `tf.estimator.EstimatorSpec` during evaluation.
add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`.
use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
If `None`, uses defaults.
@@ -160,7 +164,8 @@ class GANEstimator(estimator.Estimator):
else discriminator_optimizer)
gan_head = head_lib.gan_head(
generator_loss_fn, discriminator_loss_fn, gopt, dopt,
- use_loss_summaries, get_hooks_fn=get_hooks_fn)
+ use_loss_summaries, get_hooks_fn=get_hooks_fn,
+ get_eval_metric_ops_fn=get_eval_metric_ops_fn)
return _gan_model_fn(
features, labels, mode, generator_fn, discriminator_fn, gan_head,
add_summaries)
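
For illustration, a caller might wire the new get_eval_metric_ops_fn argument into a GANEstimator roughly as follows (a minimal sketch assuming the tf.contrib.gan namespaces of this TensorFlow version; the tiny generator/discriminator and input_fn below are placeholders, not part of the change):

    import tensorflow as tf
    tfgan = tf.contrib.gan

    def generator_fn(noise):
      return tf.layers.dense(noise, 4)

    def discriminator_fn(data, unused_conditioning):
      return tf.layers.dense(data, 1)

    def input_fn():
      noise = tf.random_normal([16, 4])
      real_data = tf.random_normal([16, 4])
      return noise, real_data

    def get_metrics(gan_model):
      # Any dict of name -> (value_op, update_op) accepted by EstimatorSpec.
      return {
          'mse_custom_metric': tf.metrics.mean_squared_error(
              gan_model.real_data, gan_model.generated_data)
      }

    est = tfgan.estimator.GANEstimator(
        generator_fn=generator_fn,
        discriminator_fn=discriminator_fn,
        generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
        discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
        generator_optimizer=tf.train.GradientDescentOptimizer(1.0),
        discriminator_optimizer=tf.train.GradientDescentOptimizer(1.0),
        get_eval_metric_ops_fn=get_metrics)

    est.train(input_fn, steps=1)
    scores = est.evaluate(input_fn, steps=1)  # includes 'mse_custom_metric'

The integration test updated below exercises this same path.
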
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 387a62bd74..955482599b 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import metrics as metrics_lib
from tensorflow.python.ops import parsing_ops
from tensorflow.python.platform import test
from tensorflow.python.summary.writer import writer_cache
@@ -194,6 +195,12 @@ class GANEstimatorIntegrationTest(test.TestCase):
lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9)
return training.GradientDescentOptimizer(lr)
+ def get_metrics(gan_model):
+ return {
+ 'mse_custom_metric': metrics_lib.mean_squared_error(
+ gan_model.real_data, gan_model.generated_data)
+ }
+
gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
est = estimator.GANEstimator(
@@ -203,6 +210,7 @@ class GANEstimatorIntegrationTest(test.TestCase):
discriminator_loss_fn=losses.wasserstein_discriminator_loss,
generator_optimizer=gopt,
discriminator_optimizer=dopt,
+ get_eval_metric_ops_fn=get_metrics,
model_dir=self._model_dir)
# TRAIN
@@ -213,6 +221,9 @@ class GANEstimatorIntegrationTest(test.TestCase):
scores = est.evaluate(eval_input_fn)
self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
self.assertIn('loss', six.iterkeys(scores))
+ self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
+ scores['loss'])
+ self.assertIn('mse_custom_metric', six.iterkeys(scores))
# PREDICT
predictions = np.array([x for x in est.predict(predict_input_fn)])
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index a21358c50b..ff903a78cc 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -25,17 +25,21 @@ from tensorflow.contrib.gan.python import train as tfgan_train
from tensorflow.python.estimator import model_fn as model_fn_lib
from tensorflow.python.estimator.canned import head
from tensorflow.python.framework import ops
+from tensorflow.python.ops import metrics as metrics_lib
__all__ = [
'GANHead',
'gan_head',
]
+def _summary_key(head_name, val):
+ return '%s/%s' % (val, head_name) if head_name else val
+
def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer,
discriminator_optimizer, use_loss_summaries=True,
get_hooks_fn=tfgan_train.get_sequential_train_hooks(),
- name=None):
+ get_eval_metric_ops_fn=None, name=None):
"""Creates a `GANHead`.
Args:
@@ -47,9 +51,12 @@ def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer,
discriminator_optimizer: Same as `generator_optimizer`, but for the
discriminator updates.
use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
- If `None`, uses defaults.
- get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
- of hooks.
+ If `None`, uses defaults.
+ get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+ list of hooks.
+ get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a
+ dict of metric results keyed by name. The output of this function is
+ passed into `tf.estimator.EstimatorSpec` during evaluation.
name: name of the head. If provided, summary and metrics keys will be
suffixed by `"/" + name`.
@@ -62,6 +69,7 @@ def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer,
discriminator_optimizer=discriminator_optimizer,
use_loss_summaries=use_loss_summaries,
get_hooks_fn=get_hooks_fn,
+ get_eval_metric_ops_fn=get_eval_metric_ops_fn,
name=name)
@@ -72,6 +80,7 @@ class GANHead(head._Head): # pylint: disable=protected-access
generator_optimizer, discriminator_optimizer,
use_loss_summaries=True,
get_hooks_fn=None,
+ get_eval_metric_ops_fn=None,
name=None):
"""`Head` for GAN training.
@@ -85,8 +94,11 @@ class GANHead(head._Head): # pylint: disable=protected-access
discriminator updates.
use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
If `None`, uses defaults.
- get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
- of hooks. Defaults to `train.get_sequential_train_hooks()`
+ get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+ list of hooks. Defaults to `train.get_sequential_train_hooks()`
+ get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a
+ dict of metric results keyed by name. The output of this function is
+ passed into `tf.estimator.EstimatorSpec` during evaluation.
name: name of the head. If provided, summary and metrics keys will be
suffixed by `"/" + name`.
"""
@@ -104,6 +116,8 @@ class GANHead(head._Head): # pylint: disable=protected-access
self._generator_optimizer = generator_optimizer
self._discriminator_optimizer = discriminator_optimizer
self._get_hooks_fn = get_hooks_fn
+ self._get_eval_metric_ops_fn = get_eval_metric_ops_fn
+ self._name = name
@property
def name(self):
@@ -173,13 +187,26 @@ class GANHead(head._Head): # pylint: disable=protected-access
gan_loss = self.create_loss(
features=None, mode=mode, logits=gan_model, labels=None)
scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss
+ with ops.name_scope(None, 'metrics',
+ [gan_loss.generator_loss,
+ gan_loss.discriminator_loss]):
+ eval_metric_ops = {
+ _summary_key(self._name, 'generator_loss'):
+ metrics_lib.mean(gan_loss.generator_loss),
+ _summary_key(self._name, 'discriminator_loss'):
+ metrics_lib.mean(gan_loss.discriminator_loss)
+ }
+ if self._get_eval_metric_ops_fn is not None:
+ custom_eval_metric_ops = self._get_eval_metric_ops_fn(gan_model)
+ if not isinstance(custom_eval_metric_ops, dict):
+ raise TypeError('get_eval_metric_ops_fn must return a dict, '
+ 'received: {}'.format(custom_eval_metric_ops))
+ eval_metric_ops.update(custom_eval_metric_ops)
return model_fn_lib.EstimatorSpec(
mode=model_fn_lib.ModeKeys.EVAL,
predictions=gan_model.generated_data,
loss=scalar_loss,
- # TODO(joelshor): Add metrics. If head name provided, append it to
- # metric keys.
- eval_metric_ops={})
+ eval_metric_ops=eval_metric_ops)
elif mode == model_fn_lib.ModeKeys.TRAIN:
if train_op_fn is None:
raise ValueError('train_op_fn can not be None.')
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py
index 8168f005cd..6587f1fc60 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_test.py
@@ -62,9 +62,14 @@ class GANHeadTest(test.TestCase):
generator_loss_fn=dummy_loss,
discriminator_loss_fn=dummy_loss,
generator_optimizer=training.GradientDescentOptimizer(1.0),
- discriminator_optimizer=training.GradientDescentOptimizer(1.0))
+ discriminator_optimizer=training.GradientDescentOptimizer(1.0),
+ get_eval_metric_ops_fn=self.get_metrics)
self.assertTrue(isinstance(self.gan_head, head.GANHead))
+ def get_metrics(self, gan_model):
+ self.assertTrue(isinstance(gan_model, tfgan_tuples.GANModel))
+ return {}
+
def _test_modes_helper(self, mode):
self.gan_head.create_estimator_spec(
features=None,
diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils.py
index df71187fbd..a9b8faa712 100644
--- a/tensorflow/contrib/gan/python/features/python/conditioning_utils.py
+++ b/tensorflow/contrib/gan/python/features/python/conditioning_utils.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-"""Miscellanous utilities for TFGAN code and examples."""
+"""Miscellaneous utilities for TFGAN code and examples."""
from __future__ import absolute_import
from __future__ import division
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index a320a3f232..592d37b432 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -677,7 +677,7 @@ def copy_with_input_replacements(sgv, replacement_ts,
def _add_control_flow_ops(ops, control_ios):
- """Complete `ops` so that the tranformed graph is valid.
+ """Complete `ops` so that the transformed graph is valid.
Partially copying a graph can lead to a malformed graph. For instance,
copying half of a while construct is likely to result in an invalid graph.
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
index 60281951dd..66939fbb0f 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
@@ -115,7 +115,7 @@ static void CheckOpsSupport(const GraphDef& graph_def,
HexagonOpsDefinitions::getInstance();
LOG(INFO) << "Checking " << graph_def.node_size() << " nodes";
LOG(INFO) << "dump_all_nodes = " << dump_all_nodes
- << ", dump_shape_and_tpye = " << dump_shape_and_type;
+ << ", dump_shape_and_type = " << dump_shape_and_type;
std::unordered_set<string> unsupported_ops;
bool all_supported = true;
diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
index 8f406ace1d..f230d93da4 100755
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -17,7 +17,7 @@
### API
This module provides functions for image manipulation; currently, chrominance
-transformas (including changing saturation and hue) in YIQ space and
+transforms (including changing saturation and hue) in YIQ space and
projective transforms (including rotation) are supported.
## Image Transformation `Ops`
diff --git a/tensorflow/contrib/kfac/examples/convnet.py b/tensorflow/contrib/kfac/examples/convnet.py
index b261f41bf9..d6b1a61b71 100644
--- a/tensorflow/contrib/kfac/examples/convnet.py
+++ b/tensorflow/contrib/kfac/examples/convnet.py
@@ -325,7 +325,7 @@ def distributed_grads_only_and_ops_chief_worker(
All workers perform gradient computation. Chief worker applies gradient after
averaging the gradients obtained from all the workers. All workers block
- execution untill the update is applied. Chief worker runs covariance and
+ execution until the update is applied. Chief worker runs covariance and
inverse update ops. Covariance and inverse matrices are placed on parameter
servers in a round robin manner. For further details on synchronous
distributed optimization check `tf.train.SyncReplicasOptimizer`.
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index 45a760c9f1..b7f63d8d94 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -66,7 +66,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
the local approximation with the Fisher information matrix, and to
regularize the update direction by making it closer to the gradient.
If damping is adapted during training then this value is used for
- initializing damping varaible.
+ initializing damping variable.
(Higher damping means the update looks more like a standard gradient
update - see Tikhonov regularization.)
layer_collection: The layer collection object, which holds the fisher
@@ -114,7 +114,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
self._estimation_mode = estimation_mode
self._colocate_gradients_with_ops = colocate_gradients_with_ops
- # The below paramaters are required only if damping needs to be adapated.
+ # The below parameters are required only if damping needs to be adapted.
# These parameters can be set by calling
# set_damping_adaptation_params() explicitly.
self._damping_adaptation_decay = 0.95
@@ -195,7 +195,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
min_damping: `float`(Optional), Minimum value the damping parameter
can take. Default value 1e-5.
damping_adaptation_decay: `float`(Optional), The `damping` parameter is
- multipled by the `damping_adaptation_decay` every
+ multiplied by the `damping_adaptation_decay` every
`damping_adaptation_interval` number of iterations. Default value 0.99.
damping_adaptation_interval: `int`(Optional), Number of steps in between
updating the `damping` parameter. Default value 5.
diff --git a/tensorflow/contrib/kfac/python/ops/placement.py b/tensorflow/contrib/kfac/python/ops/placement.py
index 8a20ebe198..c4454325ae 100644
--- a/tensorflow/contrib/kfac/python/ops/placement.py
+++ b/tensorflow/contrib/kfac/python/ops/placement.py
@@ -51,7 +51,7 @@ class RoundRobinPlacementMixin(object):
self._inv_devices = inv_devices
def make_vars_and_create_op_thunks(self, scope=None):
- """Make vars and create op thunks w/ a round-robin device placement strat.
+ """Make vars and create op thunks w/ a round-robin device placement start.
For each factor, all of that factor's cov variables and their associated
update ops will be placed on a particular device. A new device is chosen
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index f708da6693..b7194ae333 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -932,7 +932,8 @@ def convolution(inputs,
variables_collections=None,
outputs_collections=None,
trainable=True,
- scope=None):
+ scope=None,
+ conv_dims=None):
"""Adds an N-D convolution followed by an optional batch_norm layer.
It is required that 1 <= N <= 3.
@@ -993,6 +994,10 @@ def convolution(inputs,
trainable: If `True` also add variables to the graph collection
`GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
scope: Optional scope for `variable_scope`.
+ conv_dims: Optional convolution dimensionality. When set, the corresponding
+ convolution is used (e.g. 2 for Conv2D, 3 for Conv3D). When left to None,
+ the convolution dimensionality is selected based on the input rank
+ (i.e. ConvND, with N = input_rank - 2).
Returns:
A tensor representing the output of the operation.
@@ -1015,6 +1020,9 @@ def convolution(inputs,
inputs = ops.convert_to_tensor(inputs)
input_rank = inputs.get_shape().ndims
+ if conv_dims is not None and conv_dims + 2 != input_rank:
+ raise ValueError('Convolution expects input with rank %d, got %d' %
+ (conv_dims + 2, input_rank))
if input_rank == 3:
layer_class = convolutional_layers.Convolution1D
elif input_rank == 4:
@@ -1061,10 +1069,134 @@ def convolution(inputs,
outputs = activation_fn(outputs)
return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+@add_arg_scope
+def convolution1d(inputs,
+ num_outputs,
+ kernel_size,
+ stride=1,
+ padding='SAME',
+ data_format=None,
+ rate=1,
+ activation_fn=nn.relu,
+ normalizer_fn=None,
+ normalizer_params=None,
+ weights_initializer=initializers.xavier_initializer(),
+ weights_regularizer=None,
+ biases_initializer=init_ops.zeros_initializer(),
+ biases_regularizer=None,
+ reuse=None,
+ variables_collections=None,
+ outputs_collections=None,
+ trainable=True,
+ scope=None):
+ return convolution(inputs,
+ num_outputs,
+ kernel_size,
+ stride,
+ padding,
+ data_format,
+ rate,
+ activation_fn,
+ normalizer_fn,
+ normalizer_params,
+ weights_initializer,
+ weights_regularizer,
+ biases_initializer,
+ biases_regularizer,
+ reuse,
+ variables_collections,
+ outputs_collections,
+ trainable,
+ scope,
+ conv_dims=1)
+
+convolution1d.__doc__ = convolution.__doc__
-convolution2d = convolution
-convolution3d = convolution
+@add_arg_scope
+def convolution2d(inputs,
+ num_outputs,
+ kernel_size,
+ stride=1,
+ padding='SAME',
+ data_format=None,
+ rate=1,
+ activation_fn=nn.relu,
+ normalizer_fn=None,
+ normalizer_params=None,
+ weights_initializer=initializers.xavier_initializer(),
+ weights_regularizer=None,
+ biases_initializer=init_ops.zeros_initializer(),
+ biases_regularizer=None,
+ reuse=None,
+ variables_collections=None,
+ outputs_collections=None,
+ trainable=True,
+ scope=None):
+ return convolution(inputs,
+ num_outputs,
+ kernel_size,
+ stride,
+ padding,
+ data_format,
+ rate,
+ activation_fn,
+ normalizer_fn,
+ normalizer_params,
+ weights_initializer,
+ weights_regularizer,
+ biases_initializer,
+ biases_regularizer,
+ reuse,
+ variables_collections,
+ outputs_collections,
+ trainable,
+ scope,
+ conv_dims=2)
+
+convolution2d.__doc__ = convolution.__doc__
+@add_arg_scope
+def convolution3d(inputs,
+ num_outputs,
+ kernel_size,
+ stride=1,
+ padding='SAME',
+ data_format=None,
+ rate=1,
+ activation_fn=nn.relu,
+ normalizer_fn=None,
+ normalizer_params=None,
+ weights_initializer=initializers.xavier_initializer(),
+ weights_regularizer=None,
+ biases_initializer=init_ops.zeros_initializer(),
+ biases_regularizer=None,
+ reuse=None,
+ variables_collections=None,
+ outputs_collections=None,
+ trainable=True,
+ scope=None):
+ return convolution(inputs,
+ num_outputs,
+ kernel_size,
+ stride,
+ padding,
+ data_format,
+ rate,
+ activation_fn,
+ normalizer_fn,
+ normalizer_params,
+ weights_initializer,
+ weights_regularizer,
+ biases_initializer,
+ biases_regularizer,
+ reuse,
+ variables_collections,
+ outputs_collections,
+ trainable,
+ scope,
+ conv_dims=3)
+
+convolution3d.__doc__ = convolution.__doc__
@add_arg_scope
def convolution2d_in_plane(
@@ -1411,7 +1543,7 @@ def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
Args:
tensor: An `int` `Tensor` to be converted to a `Sparse`.
eos_token: An integer.
- It is part of the target label that signfies the end of a sentence.
+ It is part of the target label that signifies the end of a sentence.
outputs_collections: Collection to add the outputs.
scope: Optional scope for name_scope.
"""
@@ -1555,7 +1687,7 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
output_collections: Collection to which the outputs will be added.
scope: Optional scope for `name_scope`.
Returns:
- A `Tensor` or `SparseTensor` conataining the same values as `inputs`, but
+ A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
with innermost dimensions flattened to obtain rank `new_rank`.
Raises:
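
As a quick illustration of the behavior the new wrappers introduce (the shapes below are made up for the example), each of convolution1d/2d/3d now forwards to `convolution` with a fixed `conv_dims`, so an input whose rank is not `conv_dims + 2` raises instead of silently dispatching on rank:

    import tensorflow as tf
    layers = tf.contrib.layers

    images_2d = tf.random_uniform((5, 7, 9, 3))      # rank 4: a Conv2D input
    images_3d = tf.random_uniform((5, 6, 7, 9, 3))   # rank 5: a Conv3D input

    net = layers.convolution2d(images_2d, 32, 3)     # ok: conv_dims=2, rank 4

    try:
      layers.convolution2d(images_3d, 32, 3)         # conv_dims=2 but rank 5
    except ValueError as e:
      print(e)  # Convolution expects input with rank 4, got 5
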
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 997f910a2a..b01fd5d5c9 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -310,6 +310,17 @@ class BiasAddTest(test.TestCase):
class ConvolutionTest(test.TestCase):
+ def testInvalidShape(self):
+ with self.test_session():
+ images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
+ with self.assertRaisesRegexp(
+ ValueError, 'Convolution expects input with rank 5, got 4'):
+ layers_lib.convolution3d(images_2d, 32, 3)
+ images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
+ with self.assertRaisesRegexp(
+ ValueError, 'Convolution expects input with rank 4, got 5'):
+ layers_lib.convolution2d(images_3d, 32, 3)
+
def testInvalidDataFormat(self):
height, width = 7, 9
with self.test_session():
@@ -3155,7 +3166,7 @@ class RepeatTests(test.TestCase):
with self.test_session():
images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
- self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
+ self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
def testRepeatWithScope(self):
@@ -3749,7 +3760,7 @@ class StackTests(test.TestCase):
layers_lib.convolution2d, [10, 20, 30],
kernel_size=[3, 3],
padding='SAME')
- self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
+ self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])
def testStackWithScope(self):
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index c7cdb41312..f8106d1e4a 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -343,7 +343,8 @@ def get_temp_export_dir(timestamped_export_dir):
"""
(dirname, basename) = os.path.split(timestamped_export_dir)
temp_export_dir = os.path.join(
- compat.as_bytes(dirname), compat.as_bytes('temp-{}'.format(basename)))
+ compat.as_bytes(dirname),
+ compat.as_bytes('temp-{}'.format(compat.as_text(basename))))
return temp_export_dir
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 01c76b7a66..55b984f260 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -6,8 +6,6 @@ licenses(["notice"]) # Apache 2.0
load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
-exports_files(["LICENSE"])
-
exports_files(glob([
"testdata/*.bin",
"testdata/*.pb",
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
index 1053cce385..cc8a8035d1 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/Makefile
@@ -1,4 +1,3 @@
-
# Find where we're running from, so we can store generated files here.
ifeq ($(origin MAKEFILE_DIR), undefined)
MAKEFILE_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
@@ -69,12 +68,12 @@ LIB_NAME := libtensorflow-lite.a
LIB_PATH := $(LIBDIR)$(LIB_NAME)
# A small example program that shows how to link against the library.
-BENCHMARK_PATH := $(BINDIR)benchmark_model
+MINIMAL_PATH := $(BINDIR)minimal
-BENCHMARK_SRCS := \
-tensorflow/contrib/lite/tools/benchmark_model.cc
-BENCHMARK_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(BENCHMARK_SRCS))))
+MINIMAL_SRCS := \
+tensorflow/contrib/lite/examples/minimal/minimal.cc
+MINIMAL_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MINIMAL_SRCS))))
# What sources we want to compile, must be kept in sync with the main Bazel
# build files.
@@ -100,7 +99,7 @@ $(wildcard tensorflow/contrib/lite/*/*test.cc) \
$(wildcard tensorflow/contrib/lite/*/*/*test.cc) \
$(wildcard tensorflow/contrib/lite/*/*/*/*test.cc) \
$(wildcard tensorflow/contrib/lite/kernels/test_util.cc) \
-$(BENCHMARK_SRCS)
+$(MINIMAL_SRCS)
# Filter out all the excluded files.
TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
# File names of the intermediate files target compilation generates.
@@ -119,17 +118,17 @@ $(OBJDIR)%.o: %.c
$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
# The target that's compiled if there's no command-line arguments.
-all: $(LIB_PATH) $(BENCHMARK_PATH)
+all: $(LIB_PATH) $(MINIMAL_PATH)
# Gathers together all the objects we've compiled into a single '.a' archive.
$(LIB_PATH): $(LIB_OBJS)
@mkdir -p $(dir $@)
$(AR) $(ARFLAGS) $(LIB_PATH) $(LIB_OBJS)
-$(BENCHMARK_PATH): $(BENCHMARK_OBJS) $(LIB_PATH)
+$(MINIMAL_PATH): $(MINIMAL_OBJS) $(LIB_PATH)
@mkdir -p $(dir $@)
$(CXX) $(CXXFLAGS) $(INCLUDES) \
- -o $(BENCHMARK_PATH) $(BENCHMARK_OBJS) \
+ -o $(MINIMAL_PATH) $(MINIMAL_OBJS) \
$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
# Gets rid of all generated files.
diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc
new file mode 100644
index 0000000000..106e3b0270
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include <cstdio>
+
+// This is a minimal example that reads a model from disk and performs
+// inference. No input data is loaded; adding it is up to you as the user.
+//
+// NOTE: Do not add any dependencies to this that cannot be built with
+// the minimal makefile. This example must remain trivial to build with
+// the minimal build tool.
+//
+// Usage: minimal <tflite model>
+
+using namespace tflite;
+
+#define TFLITE_MINIMAL_CHECK(x) \
+ if(!(x)) { \
+ fprintf(stderr, "Error at %s:%d\n", __FILE__, __LINE__); \
+ exit(1); \
+ }
+
+
+int main(int argc, char *argv[]) {
+ if(argc != 2) {
+ fprintf(stderr, "Usage: %s <model>\n");
+ return 1;
+ }
+ const char* filename = argv[1];
+
+ // Load model
+ std::unique_ptr<tflite::FlatBufferModel> model
+ = tflite::FlatBufferModel::BuildFromFile(filename);
+ TFLITE_MINIMAL_CHECK(model != nullptr);
+
+ // Build the interpreter
+ tflite::ops::builtin::BuiltinOpResolver resolver;
+ InterpreterBuilder builder(*model.get(), resolver);
+ std::unique_ptr<Interpreter> interpreter;
+ builder(&interpreter);
+ TFLITE_MINIMAL_CHECK(interpreter != nullptr);
+
+ // Allocate tensor buffers.
+ TFLITE_MINIMAL_CHECK(interpreter->AllocateTensors() == kTfLiteOk);
+
+ // Fill input buffers
+ // TODO(user): Insert code to fill input tensors
+
+ // Run inference
+ TFLITE_MINIMAL_CHECK(interpreter->Invoke() == kTfLiteOk);
+
+ // Read output buffers
+ // TODO(user): Insert getting data out code.
+
+ return 0;
+}
diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/contrib/lite/g3doc/rpi.md
index 7a3a231626..ab50789307 100644
--- a/tensorflow/contrib/lite/g3doc/rpi.md
+++ b/tensorflow/contrib/lite/g3doc/rpi.md
@@ -32,7 +32,7 @@ This has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc v
Log in to you RPI, install the toolchain.
```bash
-sudo apt-get instal build-essential
+sudo apt-get install build-essential
```
First, clone this TensorFlow repository. Run this at the root of the repository:
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index f23b90d9dc..d48178d608 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -3387,7 +3387,7 @@ inline void Concatenation(int concat_dim, const uint8* const* input_data,
const int32 output_zeropoint,
const float output_scale) {
// The arguments input_zeropoint and input_scale are expected to be an array
- // that have the quantization paramaters for all the inputs to the concat
+ // that have the quantization parameters for all the inputs to the concat
// operator.
gemmlowp::ScopedProfilingLabel label("Concatenation");
TFLITE_DCHECK_GT(inputs_count, 1);
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index f6d8d3257b..62d6fe0bb3 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1789,7 +1789,7 @@ inline void Concatenation(int concat_dim, const uint8* const* input_data,
const int32 output_zeropoint,
const float output_scale) {
// The arguments input_zeropoint and input_scale are expected to be an array
- // that have the quantization paramaters for all the inputs to the concat
+ // that have the quantization parameters for all the inputs to the concat
// operator.
TFLITE_DCHECK_GT(inputs_count, 1);
int64_t concat_size = 0;
@@ -1975,7 +1975,7 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
// requiring a power-of-two representation interval. Thus, we should right
// away quantize this array to a power-of-two interval; otherwise,
// implementation will need to rescale that, losing any benefit that a tighter
-// representation interval might otherwise yield, while introducting some
+// representation interval might otherwise yield, while introducing some
// numerical error and computational overhead.
//
// Now, Logistic and Tanh
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index e5b640fcee..8bdeb035f5 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -65,7 +65,7 @@ table Tensor {
quantization:QuantizationParameters; // Optional.
}
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
// ones, but not by much. Moreover, while custom operators accept an opaque
// object containing configuration parameters, builtins have a predetermined
// set of acceptable options.
diff --git a/tensorflow/contrib/lite/schema/schema_v0.fbs b/tensorflow/contrib/lite/schema/schema_v0.fbs
index 852ea988f3..891d8366cc 100644
--- a/tensorflow/contrib/lite/schema/schema_v0.fbs
+++ b/tensorflow/contrib/lite/schema/schema_v0.fbs
@@ -48,7 +48,7 @@ table Tensor {
quantization:QuantizationParameters; // Optional.
}
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
// ones, but not by much. Moreover, while custom operators accept an opaque
// object containing configuration parameters, builtins have a predetermined
// set of acceptable options.
diff --git a/tensorflow/contrib/lite/schema/schema_v1.fbs b/tensorflow/contrib/lite/schema/schema_v1.fbs
index 06cd9408ed..b438b569e6 100644
--- a/tensorflow/contrib/lite/schema/schema_v1.fbs
+++ b/tensorflow/contrib/lite/schema/schema_v1.fbs
@@ -53,7 +53,7 @@ table Tensor {
quantization:QuantizationParameters; // Optional.
}
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
// ones, but not by much. Moreover, while custom operators accept an opaque
// object containing configuration parameters, builtins have a predetermined
// set of acceptable options.
diff --git a/tensorflow/contrib/lite/schema/schema_v2.fbs b/tensorflow/contrib/lite/schema/schema_v2.fbs
index 96731c8aae..b90408ff6d 100644
--- a/tensorflow/contrib/lite/schema/schema_v2.fbs
+++ b/tensorflow/contrib/lite/schema/schema_v2.fbs
@@ -54,7 +54,7 @@ table Tensor {
quantization:QuantizationParameters; // Optional.
}
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
// ones, but not by much. Moreover, while custom operators accept an opaque
// object containing configuration parameters, builtins have a predetermined
// set of acceptable options.
diff --git a/tensorflow/contrib/lite/schema/schema_v3.fbs b/tensorflow/contrib/lite/schema/schema_v3.fbs
index cedefe08f3..020da38493 100644
--- a/tensorflow/contrib/lite/schema/schema_v3.fbs
+++ b/tensorflow/contrib/lite/schema/schema_v3.fbs
@@ -53,7 +53,7 @@ table Tensor {
type:TensorType;
// An index that refers to the buffers table at the root of the model. Or,
// if there is no data buffer associated (i.e. intermediate results), then
- // this is 0 (which refers to an always existant empty buffer).
+ // this is 0 (which refers to an always existent empty buffer).
//
// The data_buffer itself is an opaque container, with the assumption that the
// target device is little-endian. In addition, all builtin operators assume
@@ -64,7 +64,7 @@ table Tensor {
quantization:QuantizationParameters; // Optional.
}
-// A list of builtin operators. Builtin operators a slighlty faster than custom
+// A list of builtin operators. Builtin operators are slightly faster than custom
// ones, but not by much. Moreover, while custom operators accept an opaque
// object containing configuration parameters, builtins have a predetermined
// set of acceptable options.
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 07d2b28bbe..0e036bda92 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -109,7 +109,7 @@ KNOWN_BUGS = {
class ExtraTocoOptions(object):
- """Additonal toco options besides input, output, shape."""
+ """Additional toco options besides input, output, shape."""
def __init__(self):
# Whether to ignore control dependency nodes.
@@ -2016,7 +2016,7 @@ def make_lstm_tests(zip_path):
return inputs_after_split, [out]
def build_inputs(parameters, sess, inputs, outputs):
- """Feed inputs, assign vairables, and freeze graph."""
+ """Feed inputs, assign variables, and freeze graph."""
with tf.variable_scope("", reuse=True):
kernel = tf.get_variable("rnn/basic_lstm_cell/kernel")
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 1f07068aee..8cab6cd8cd 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -227,8 +227,8 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
if (!IsValid()) return;
auto* tensor = interpreter_->tensor(id);
if (expected_output_.count(id) != 0) {
- fprintf(stderr, "Overriden expectation for tensor %d\n", id);
- Invalidate("Overriden expectation");
+ fprintf(stderr, "Overridden expectation for tensor %d\n", id);
+ Invalidate("Overridden expectation");
}
expected_output_[id].reset(new Expectation);
switch (tensor->type) {
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index 495014c6fc..7680cdd344 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -115,7 +115,7 @@ bazel run --config=opt \
In order to evaluate the possible benefit of generating a quantized graph, TOCO
allows "dummy-quantization" on float graphs. The flags `--default_ranges_min`
-and `--default_ranges_max` accept plausable values for the min-max ranges of the
+and `--default_ranges_max` accept plausible values for the min-max ranges of the
values in all arrays that do not have min-max information. "Dummy-quantization"
will produce lower accuracy but will emulate the performance of a correctly
quantized model.
@@ -338,7 +338,7 @@ below outline the use cases for each.
### Using `--output_format=GRAPHVIZ_DOT`
The first way to get a graphviz rendering is to pass `GRAPHVIZ_DOT` into
-`--output_format`. This results in a plausable visualization of the graph. This
+`--output_format`. This results in a plausible visualization of the graph. This
reduces the requirements that normally exist during conversion between other
input and output formats. For example, this may be useful if conversion from
TENSORFLOW_GRAPHDEF to TFLITE is failing.
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.h b/tensorflow/contrib/lite/toco/tflite/operator.h
index 50f0620b3c..5e9c20e40d 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/operator.h
@@ -25,10 +25,10 @@ namespace tflite {
class BaseOperator;
-// Return a map contained all knwo TF Lite Operators, keyed by their names.
+// Return a map containing all known TF Lite Operators, keyed by their names.
std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap();
-// Return a map contained all knwo TF Lite Operators, keyed by the type of
+// Return a map containing all known TF Lite Operators, keyed by the type of
// their tf.mini counterparts.
std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap();
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 253f022e6b..8589ca361d 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -127,7 +127,7 @@ message TocoFlags {
// transformations that are necessary in order to generate inference
// code for these graphs. Such graphs should be fixed, but as a
// temporary work-around, setting this reorder_across_fake_quant flag
- // allows toco to perform necessary graph transformaitons on them,
+ // allows toco to perform necessary graph transformations on them,
// at the cost of no longer faithfully matching inference and training
// arithmetic.
optional bool reorder_across_fake_quant = 8;
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
index 37539b9599..5ed8057b86 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
@@ -58,7 +58,7 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
# Creates the workers and return their sessions, graphs, train_ops.
-# Cheif worker will update at last
+# Chief worker will update at last
def _get_workers(num_workers, period, workers, moving_rate):
sessions = []
graphs = []
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
index 6cca0a8a00..3acd940268 100644
--- a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
@@ -57,7 +57,7 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc"):
# Creates the workers and return their sessions, graphs, train_ops.
-# Cheif worker will update at last
+# Chief worker will update at last
def _get_workers(num_workers, steps, workers):
sessions = []
graphs = []
@@ -146,7 +146,7 @@ class ModelAverageOptimizerTest(test.TestCase):
self.assertAllEqual(1.0, sessions[0].run(global_var_1))
self.assertAllEqual(0, sessions[0].run(global_step))
- # iteration 2, global varibale update
+ # iteration 2, global variable update
thread_0 = self.checkedThread(
target=self._run, args=(train_ops[0], sessions[0]))
thread_1 = self.checkedThread(
diff --git a/tensorflow/contrib/signal/python/ops/window_ops.py b/tensorflow/contrib/signal/python/ops/window_ops.py
index 50094010dc..59e67e8ba4 100644
--- a/tensorflow/contrib/signal/python/ops/window_ops.py
+++ b/tensorflow/contrib/signal/python/ops/window_ops.py
@@ -47,7 +47,7 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None):
Raises:
ValueError: If `dtype` is not a floating point type.
- [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
+ [hann]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
"""
return _raised_cosine_window(name, 'hann_window', window_length, periodic,
dtype, 0.5, 0.5)
@@ -72,7 +72,7 @@ def hamming_window(window_length, periodic=True, dtype=dtypes.float32,
Raises:
ValueError: If `dtype` is not a floating point type.
- [hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
+ [hamming]: https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
"""
return _raised_cosine_window(name, 'hamming_window', window_length, periodic,
dtype, 0.54, 0.46)
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index f2d31dc8db..d877831fce 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -102,7 +102,7 @@ class BoundingBox(ItemHandler):
"""An ItemHandler that concatenates a set of parsed Tensors to Bounding Boxes.
"""
- def __init__(self, keys=None, prefix=None):
+ def __init__(self, keys=None, prefix=''):
"""Initialize the bounding box handler.
Args:
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 8a2c74742a..6e55b9407b 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -571,7 +571,7 @@ def train(train_op,
default, two `Boolean`, scalar ops called "should_stop" and "should_log"
are provided.
log_every_n_steps: The frequency, in terms of global steps, that the loss
- and global step and logged.
+ and global step are logged.
graph: The graph to pass to the supervisor. If no graph is supplied the
default graph is used.
master: The address of the tensorflow master.
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
index d5d8e4100f..cfdc884277 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
@@ -1080,14 +1080,20 @@ class SummaryDbWriter : public SummaryWriterInterface {
// See tensorboard/plugins/histogram/summary.py and data_compat.py
Tensor t{DT_DOUBLE, {k, 3}};
auto data = t.flat<double>();
- for (int i = 0; i < k; ++i) {
- double left_edge = ((i - 1 >= 0) ? histo.bucket_limit(i - 1)
- : std::numeric_limits<double>::min());
- double right_edge = ((i + 1 < k) ? histo.bucket_limit(i + 1)
- : std::numeric_limits<double>::max());
- data(i + 0) = left_edge;
- data(i + 1) = right_edge;
- data(i + 2) = histo.bucket(i);
+ for (int i = 0, j = 0; i < k; ++i) {
+ // TODO(nickfelt): reconcile with TensorBoard's data_compat.py
+ // From summary.proto
+ // Parallel arrays encoding the bucket boundaries and the bucket values.
+ // bucket(i) is the count for the bucket i. The range for
+ // a bucket is:
+ // i == 0: -DBL_MAX .. bucket_limit(0)
+ // i != 0: bucket_limit(i-1) .. bucket_limit(i)
+ double left_edge = (i == 0) ? std::numeric_limits<double>::min()
+ : histo.bucket_limit(i - 1);
+
+ data(j++) = left_edge;
+ data(j++) = histo.bucket_limit(i);
+ data(j++) = histo.bucket(i);
}
int64 tag_id;
PatchPluginName(s->mutable_metadata(), kHistogramPluginName);
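
For reference, the `{k, 3}` tensor written above lays out one `[left_edge, right_edge, count]` row per bucket, with the left edge of bucket 0 set to the `std::numeric_limits<double>::min()` sentinel and every other left edge equal to the previous bucket limit. A small Python sketch (illustrative only) reproduces the nine values the new test below checks:

    import sys

    bucket_limits = [-30.5, -10.5, -5.5]   # right edges, as in the test below
    counts = [-10, 10, 20]

    rows = []
    for i, (limit, count) in enumerate(zip(bucket_limits, counts)):
      left = sys.float_info.min if i == 0 else bucket_limits[i - 1]
      rows.append([left, limit, count])    # [left_edge, right_edge, bucket(i)]

    # rows == [[2.2e-308, -30.5, -10], [-30.5, -10.5, 10], [-10.5, -5.5, 20]]
    # Flattened row-major, these are the nine doubles the test expects.
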
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
index c34b6763a1..2e8d4109dd 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
@@ -100,6 +100,56 @@ class SummaryDbWriterTest : public ::testing::Test {
SummaryWriterInterface* writer_ = nullptr;
};
+TEST_F(SummaryDbWriterTest, WriteHistogram_VerifyTensorValues) {
+ TF_ASSERT_OK(CreateSummaryDbWriter(db_, "histtest", "test1", "user1", &env_,
+ &writer_));
+ int step = 0;
+ std::unique_ptr<Event> e{new Event};
+ e->set_step(step);
+ e->set_wall_time(123);
+ Summary::Value* s = e->mutable_summary()->add_value();
+ s->set_tag("normal/myhisto");
+
+ double dummy_value = 10.123;
+ HistogramProto* proto = s->mutable_histo();
+ proto->Clear();
+ proto->set_min(dummy_value);
+ proto->set_max(dummy_value);
+ proto->set_num(dummy_value);
+ proto->set_sum(dummy_value);
+ proto->set_sum_squares(dummy_value);
+
+ int size = 3;
+ double bucket_limits[] = {-30.5, -10.5, -5.5};
+ double bucket[] = {-10, 10, 20};
+ for (int i = 0; i < size; i++) {
+ proto->add_bucket_limit(bucket_limits[i]);
+ proto->add_bucket(bucket[i]);
+ }
+ TF_ASSERT_OK(writer_->WriteEvent(std::move(e)));
+ TF_ASSERT_OK(writer_->Flush());
+ writer_->Unref();
+ writer_ = nullptr;
+
+ // TODO(nickfelt): implement QueryTensor() to encapsulate this
+ // Verify the data
+ string result = QueryString("SELECT data FROM Tensors");
+ const double* val = reinterpret_cast<const double*>(result.data());
+ double histarray[] = {std::numeric_limits<double>::min(),
+ -30.5,
+ -10,
+ -30.5,
+ -10.5,
+ 10,
+ -10.5,
+ -5.5,
+ 20};
+ int histarray_size = 9;
+ for (int i = 0; i < histarray_size; i++) {
+ EXPECT_EQ(histarray[i], val[i]);
+ }
+}
+
TEST_F(SummaryDbWriterTest, NothingWritten_NoRowsCreated) {
TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
&writer_));
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 6d6feb3c39..a5d8b061b6 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -67,6 +67,7 @@ tf_cuda_library(
visibility = ["//visibility:public"],
deps = [
":trt_logging",
+ ":trt_plugins",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
@@ -86,6 +87,7 @@ cc_library(
visibility = ["//visibility:public"],
deps = [
":trt_logging",
+ ":trt_plugins",
":trt_resources",
"//tensorflow/core:gpu_headers_lib",
"//tensorflow/core:lib_proto_parsing",
@@ -197,10 +199,12 @@ tf_py_wrap_cc(
tf_cuda_library(
name = "trt_resources",
srcs = [
+ "resources/trt_allocator.cc",
"resources/trt_int8_calibrator.cc",
"resources/trt_resource_manager.cc",
],
hdrs = [
+ "resources/trt_allocator.h",
"resources/trt_int8_calibrator.h",
"resources/trt_resource_manager.h",
"resources/trt_resources.h",
@@ -221,18 +225,25 @@ tf_cuda_library(
srcs = [
"convert/convert_graph.cc",
"convert/convert_nodes.cc",
+ "convert/trt_optimization_pass.cc",
],
hdrs = [
"convert/convert_graph.h",
"convert/convert_nodes.h",
+ "convert/trt_optimization_pass.h",
],
deps = [
":segment",
+ ":trt_plugins",
":trt_logging",
":trt_resources",
+ "//tensorflow/core/grappler/clusters:cluster",
+ "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+ "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:utils",
"//tensorflow/core:framework",
+ "//tensorflow/core:gpu_runtime",
"//tensorflow/core:framework_lite",
"//tensorflow/core:graph",
"//tensorflow/core:lib",
@@ -241,8 +252,7 @@ tf_cuda_library(
"//tensorflow/core/grappler:devices",
"//tensorflow/core/grappler/clusters:virtual_cluster",
"//tensorflow/core/grappler/costs:graph_properties",
- "//tensorflow/core/grappler/optimizers:constant_folding",
- "//tensorflow/core/grappler/optimizers:layout_optimizer",
+ "//tensorflow/core/grappler/optimizers:meta_optimizer",
] + if_tensorrt([
"@local_config_tensorrt//:nv_infer",
]) + tf_custom_op_library_additional_deps(),
@@ -256,7 +266,6 @@ cc_library(
"segment/segment.h",
"segment/union_find.h",
],
- linkstatic = 1,
deps = [
"//tensorflow/core:graph",
"//tensorflow/core:lib_proto_parsing",
@@ -279,6 +288,46 @@ tf_cc_test(
],
)
+# Library for the plugin factory
+tf_cuda_library(
+ name = "trt_plugins",
+ srcs = [
+ "plugin/trt_plugin.cc",
+ "plugin/trt_plugin_factory.cc",
+ "plugin/trt_plugin_utils.cc",
+ ],
+ hdrs = [
+ "plugin/trt_plugin.h",
+ "plugin/trt_plugin_factory.h",
+ "plugin/trt_plugin_utils.h",
+ ],
+ deps = [
+ "//tensorflow/core:framework_lite",
+ "//tensorflow/core:lib_proto_parsing",
+ ] + if_tensorrt([
+ "@local_config_tensorrt//:nv_infer",
+ ]),
+)
+
+tf_cuda_cc_test(
+ name = "trt_plugin_factory_test",
+ size = "small",
+ srcs = ["plugin/trt_plugin_factory_test.cc"],
+ tags = [
+ "manual",
+ "notap",
+ ],
+ deps = [
+ ":trt_plugins",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ] + if_tensorrt([
+ "@local_config_cuda//cuda:cuda_headers",
+ "@local_config_tensorrt//:nv_infer",
+ ]),
+)
+
py_test(
name = "tf_trt_integration_test",
srcs = ["test/tf_trt_integration_test.py"],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 0774027711..b7b26cfb1c 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -14,6 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
#include <list>
#include <map>
@@ -24,6 +25,9 @@ limitations under the License.
#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
@@ -31,8 +35,7 @@ limitations under the License.
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/grappler/devices.h"
#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/optimizers/constant_folding.h"
-#include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
@@ -75,7 +78,8 @@ bool IsTensorRTCandidate(const tensorflow::Node* node) {
// TODO(ben,jie): ...
};
// LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
- return candidate_ops.count(node->type_string());
+ return (candidate_ops.count(node->type_string()) ||
+ PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
}
void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
@@ -144,7 +148,8 @@ struct ConvertGraphParams {
size_t max_supported_batch_size, size_t max_consumed_workspace_size_bytes,
const tensorflow::grappler::GraphProperties& current_graph_properties,
std::unordered_map<string, std::pair<int, string>>* output_edges,
- int engine_precision_mode)
+ int engine_precision_mode, const string& device_name,
+ std::shared_ptr<nvinfer1::IGpuAllocator> allocator, int cuda_gpu_id)
: graph(inp_graph),
output_names(output_node_names),
subgraph_node_ids(subgraph_node_id_numbers),
@@ -152,7 +157,10 @@ struct ConvertGraphParams {
max_workspace_size_bytes(max_consumed_workspace_size_bytes),
graph_properties(current_graph_properties),
output_edge_map(output_edges),
- precision_mode(engine_precision_mode) {}
+ precision_mode(engine_precision_mode),
+ device_name_(device_name),
+ allocator_(allocator),
+ cuda_gpu_id_(cuda_gpu_id) {}
tensorflow::Graph& graph;
const std::vector<string>& output_names;
const std::set<int>& subgraph_node_ids;
@@ -161,6 +169,9 @@ struct ConvertGraphParams {
const tensorflow::grappler::GraphProperties& graph_properties;
std::unordered_map<string, std::pair<int, string>>* output_edge_map;
int precision_mode;
+ string device_name_;
+ std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
+ int cuda_gpu_id_;
std::vector<std::pair<int, int>> subgraph_inputs;
std::vector<std::pair<int, int>> subgraph_outputs;
tensorflow::EdgeSet subgraph_incoming_edges;
@@ -194,7 +205,7 @@ static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
subgraph_outputs_set.begin(),
subgraph_outputs_set.end());
return tensorflow::Status::OK();
-};
+}
tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
TF_RETURN_IF_ERROR(FillSubGraphEdgeSets(params));
@@ -203,7 +214,8 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
params->subgraph_inputs, params->subgraph_outputs,
params->max_batch_size, params->max_workspace_size_bytes,
params->graph_properties, params->output_edge_map,
- &trt_node_def, params->precision_mode);
+ &trt_node_def, params->precision_mode, params->device_name_,
+ params->allocator_, params->cuda_gpu_id_);
TF_RETURN_IF_ERROR(InjectCalibrationNode(s));
tensorflow::Status status;
tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
@@ -233,7 +245,8 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
params->subgraph_inputs, params->subgraph_outputs,
params->max_batch_size, params->max_workspace_size_bytes,
params->graph_properties, params->output_edge_map,
- &trt_node_def, params->precision_mode);
+ &trt_node_def, params->precision_mode, params->device_name_,
+ params->allocator_, params->cuda_gpu_id_);
TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(s));
tensorflow::Status status;
tensorflow::Node* trt_node = params->graph.AddNode(trt_node_def, &status);
@@ -331,19 +344,12 @@ tensorflow::Status ConvertGraphDefToTensorRT(
// optimization pass
tensorflow::grappler::GrapplerItem item;
item.fetch = output_names;
- tensorflow::GraphDef gdef;
-
- // Layout optimization
item.graph = graph_def;
- tensorflow::grappler::LayoutOptimizer optimizer;
- tensorflow::grappler::Cluster* cluster;
- // virtual cluster
tensorflow::DeviceProperties device_properties;
-
device_properties.set_type("GPU");
device_properties.mutable_environment()->insert({"architecture", "6"});
- cluster =
+ tensorflow::grappler::Cluster* cluster =
new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
// single machine
@@ -351,27 +357,38 @@ tensorflow::Status ConvertGraphDefToTensorRT(
int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
VLOG(2) << "cpu_cores: " << num_cpu_cores;
VLOG(2) << "gpus: " << num_gpus;
-
- TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
-
- // constant folding
+ tensorflow::RewriterConfig rw_cfg;
+ tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
+ tensorflow::GraphDef gdef;
+ TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster, item, &gdef));
item.graph = gdef;
- tensorflow::grappler::ConstantFolding fold(nullptr);
- TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
// AJ refactoring shape inference through grappler/GraphProperties.
tensorflow::grappler::GraphProperties static_graph_properties(item);
- TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(false));
+ TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
// Build full graph
+
+ return ConvertAfterShapes(gdef, output_names, max_batch_size,
+ max_workspace_size_bytes, new_graph_def,
+ precision_mode, minimum_segment_size,
+ static_graph_properties, nullptr);
+}
+
+tensorflow::Status ConvertAfterShapes(
+ const tensorflow::GraphDef& gdef, const std::vector<string>& output_names,
+ size_t max_batch_size, size_t max_workspace_size_bytes,
+ tensorflow::GraphDef* new_graph_def, int precision_mode,
+ int minimum_segment_size,
+ const tensorflow::grappler::GraphProperties& graph_properties,
+ const tensorflow::grappler::Cluster* cluster) {
+ // Segment the graph into subgraphs that can be converted to TensorRT
+ tensorflow::tensorrt::segment::SegmentOptions segment_options;
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
gdef.library());
tensorflow::Graph graph(flib);
TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
tensorflow::GraphConstructorOptions(), gdef, &graph));
- // Segment the graph into subgraphs that can be converted to TensorRT
- tensorflow::tensorrt::segment::SegmentOptions segment_options;
-
// TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
for (auto node : output_names) {
segment_options.exclude_node_list.insert(node);
@@ -381,7 +398,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
segment_options.minimum_segment_size = minimum_segment_size;
tensorflow::tensorrt::segment::SegmentNodesVector segments;
TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
- gdef, IsTensorRTCandidate, segment_options, &segments));
+ &graph, IsTensorRTCandidate, segment_options, &segments));
if (segments.size() > 1) {
VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
}
@@ -391,9 +408,21 @@ tensorflow::Status ConvertGraphDefToTensorRT(
int count = 0;
float total_num_nodes_in_segments = 0.;
for (auto s : segments) {
- total_num_nodes_in_segments += s.size();
+ total_num_nodes_in_segments += s.first.size();
}
- for (const std::set<string>& subgraph_node_names : segments) {
+ // We create the map here since cluster may not be available in all cases.
+ std::map<string, tensorflow::Device*> name_to_device_map;
+ if (cluster) {
+ // TODO(aaroey): consider using DeviceSet::FindDeviceByName(), as in a
+    // distributed environment, devices from different workers can have the
+    // same short name.
+ for (const auto dm : cluster->GetDeviceSet()->devices()) {
+ name_to_device_map[dm->name()] = dm;
+ }
+ }
+ for (const auto& segment_nodes_and_device : segments) {
+ const std::set<string>& subgraph_node_names =
+ segment_nodes_and_device.first;
std::set<int> subgraph_node_ids;
size_t max_mem_per_engine =
max_workspace_size_bytes *
@@ -403,10 +432,40 @@ tensorflow::Status ConvertGraphDefToTensorRT(
oss << " " << node_name;
subgraph_node_ids.insert(node_map.at(node_name)->id());
}
- VLOG(2) << "Subgraph nodes" << oss.str();
+ VLOG(1) << "Subgraph nodes at device " << segment_nodes_and_device.second
+ << " : " << oss.str();
+ auto target_device =
+ name_to_device_map.find(segment_nodes_and_device.second);
+    std::shared_ptr<nvinfer1::IGpuAllocator> allocator(nullptr);
+
+ int cuda_device_id = 0;
+ if (target_device != name_to_device_map.end()) {
+ tensorflow::TfGpuId tf_gpu_id(target_device->second->parsed_name().id);
+ CudaGpuId cuda_gpu_id;
+ Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
+ if (!s.ok()) {
+ LOG(ERROR)
+ << "Cuda device identification failed, using device 0. Error= "
+ << s;
+ } else {
+ cuda_device_id = cuda_gpu_id.value();
+ }
+ tensorflow::GPUOptions gpuoptions;
+      // We need to use the ProcessState singleton here, since in the Python
+      // path there is no way to get to the allocators.
+ auto pm = tensorflow::ProcessState::singleton();
+ // this should be instantiated by now
+ auto dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1);
+ VLOG(1) << "Got an allocator for device tf_device=" << tf_gpu_id.value()
+ << " cuda device= " << cuda_device_id << " at " << dev_allocator;
+ allocator = std::make_shared<TRTDeviceAllocator>(dev_allocator);
+ } else { // device unknown or not available
+ allocator = std::make_shared<TRTCudaAllocator>();
+ }
ConvertGraphParams p(graph, output_names, subgraph_node_ids, max_batch_size,
- max_mem_per_engine, static_graph_properties,
- &output_edge_map, precision_mode);
+ max_mem_per_engine, graph_properties, &output_edge_map,
+ precision_mode, segment_nodes_and_device.second,
+ allocator, cuda_device_id);
if (precision_mode == INT8MODE) {
tensorflow::Status status = GetCalibNode(&p);
if (status != tensorflow::Status::OK()) {
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index e01e4a5328..65a67d7e73 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -18,6 +18,8 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/types.h"
@@ -43,6 +45,14 @@ tensorflow::Status ConvertGraphDefToTensorRT(
size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
int precision_mode, int minimum_segment_size);
+// Entry point to call from the optimization pass, after shape inference has run.
+tensorflow::Status ConvertAfterShapes(
+ const tensorflow::GraphDef& graph, const std::vector<string>& output_names,
+ size_t max_batch_size, size_t max_workspace_size_bytes,
+ tensorflow::GraphDef* new_graph_def, int precision_mode,
+ int minimum_segment_size,
+ const tensorflow::grappler::GraphProperties& graph_properties,
+ const tensorflow::grappler::Cluster* cluster);
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
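A minimal sketch of how a caller could drive the new ConvertAfterShapes() entry point, mirroring what ConvertGraphDefToTensorRT() does above; gdef and outputs are assumed to be supplied by the caller, and precision mode 0 denotes FP32 as in the optimization pass added below.

    tensorflow::grappler::GrapplerItem item;
    item.fetch = outputs;
    item.graph = gdef;
    tensorflow::grappler::GraphProperties props(item);
    TF_RETURN_IF_ERROR(props.InferStatically(true));
    tensorflow::GraphDef trt_gdef;
    TF_RETURN_IF_ERROR(tensorflow::tensorrt::convert::ConvertAfterShapes(
        gdef, outputs, /*max_batch_size=*/8,
        /*max_workspace_size_bytes=*/1 << 30, &trt_gdef,
        /*precision_mode=*/0, /*minimum_segment_size=*/3, props,
        /*cluster=*/nullptr));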
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index b81ae9dc3e..32b211dcd1 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -14,6 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
#include <algorithm>
#include <list>
@@ -240,35 +241,49 @@ class TFAttrs {
return attrs_.at(key);
}
template <typename T>
- T get(string key) const;
+ T get(const string& key) const;
template <typename T>
- T get(string key, const T& default_value) const {
+ T get(const string& key, const T& default_value) const {
return attrs_.count(key) ? this->get<T>(key) : default_value;
}
+ std::vector<string> GetAllAttrKey() {
+ std::vector<string> attr_list;
+ for (const auto& attr_item : attrs_) {
+ attr_list.emplace_back(attr_item.first);
+ }
+ return attr_list;
+ }
+
private:
typedef std::map<string, tensorflow::AttrValue const*> AttrMap;
AttrMap attrs_;
};
template <>
-string TFAttrs::get<string>(string key) const {
+string TFAttrs::get<string>(const string& key) const {
return this->at(key)->s();
}
template <>
-std::vector<int> TFAttrs::get<std::vector<int>>(string key) const {
+std::vector<int> TFAttrs::get<std::vector<int>>(const string& key) const {
auto attr = this->at(key)->list().i();
return std::vector<int>(attr.begin(), attr.end());
}
template <>
-std::vector<string> TFAttrs::get<std::vector<string>>(string key) const {
+std::vector<float> TFAttrs::get<std::vector<float>>(const string& key) const {
+ auto attr = this->at(key)->list().f();
+ return std::vector<float>(attr.begin(), attr.end());
+}
+
+template <>
+std::vector<string> TFAttrs::get<std::vector<string>>(const string& key) const {
auto attr = this->at(key)->list().s();
return std::vector<string>(attr.begin(), attr.end());
}
template <>
-nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
+nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(const string& key) const {
auto values = this->get<std::vector<int>>(key);
nvinfer1::Dims dims;
dims.nbDims = values.size();
@@ -278,24 +293,25 @@ nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
}
template <>
-nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(string key) const {
+nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const {
nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
TF_CHECK_OK(ConvertDType(this->at(key)->type(), &trt_dtype));
return trt_dtype;
}
template <>
-tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
+tensorflow::DataType TFAttrs::get<tensorflow::DataType>(
+ const string& key) const {
return this->at(key)->type();
}
template <>
-float TFAttrs::get<float>(string key) const {
+float TFAttrs::get<float>(const string& key) const {
return this->at(key)->f();
}
template <>
-bool TFAttrs::get<bool>(string key) const {
+bool TFAttrs::get<bool>(const string& key) const {
return this->at(key)->b();
}
@@ -424,6 +440,7 @@ using OpConverter =
class Converter {
std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
std::unordered_map<string, OpConverter> op_registry_;
+ OpConverter plugin_converter_;
nvinfer1::INetworkDefinition* trt_network_;
std::list<std::vector<uint8_t>> temp_bufs_;
tensorflow::tensorrt::TRTWeightStore* weight_store_;
@@ -481,7 +498,7 @@ class Converter {
weights.SetValues(weight_store_->store_.back().data());
return weights;
}
- bool isFP16() { return fp16_; };
+ bool isFP16() { return fp16_; }
TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) {
return this->get_temp_weights(weights.type_, weights.shape_);
}
@@ -490,13 +507,17 @@ class Converter {
std::vector<TRT_TensorOrWeights> inputs;
TF_RETURN_IF_ERROR(this->get_inputs(node_def, &inputs));
string op = node_def.op();
- if (!op_registry_.count(op)) {
- return tensorflow::errors::Unimplemented(
- "No converter registered for op: " + op);
- }
- OpConverter op_converter = op_registry_.at(op);
std::vector<TRT_TensorOrWeights> outputs;
- TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs));
+ if (PluginFactoryTensorRT::GetInstance()->IsPlugin(op)) {
+ TF_RETURN_IF_ERROR(plugin_converter_(*this, node_def, inputs, &outputs));
+ } else {
+ if (!op_registry_.count(op)) {
+ return tensorflow::errors::Unimplemented(
+ "No converter registered for op: " + op);
+ }
+ OpConverter op_converter = op_registry_.at(op);
+ TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs));
+ }
for (size_t i = 0; i < outputs.size(); ++i) {
TRT_TensorOrWeights output = outputs.at(i);
// TODO(jie): tf protobuf seems to be omitting the :0 suffix
@@ -672,7 +693,7 @@ std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
case OP_CATEGORY::RSQRT: {
VLOG(2) << "RSQRT GETS DONE";
return [](Eigen::half t) -> Eigen::half {
- return Eigen::half(1.0 / sqrt(float(t)));
+ return Eigen::half(1.0 / sqrt(static_cast<float>(t)));
};
}
case OP_CATEGORY::NEG:
@@ -1158,9 +1179,9 @@ tensorflow::Status BinaryTensorOpTensor(
CHECK_EQ_TYPE(tensor_r->getType(), dtype);
auto op_pair = ops.find(node_def.op());
if (op_pair == ops.end())
- return tensorflow::errors::Unimplemented(
- "binary op: " + node_def.op() +
- " not supported at: " + node_def.name());
+ return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+ " not supported at: " +
+ node_def.name());
nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
*const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -1173,6 +1194,45 @@ tensorflow::Status BinaryTensorOpTensor(
return tensorflow::Status::OK();
}
+tensorflow::Status ConvertPlugin(Converter& ctx,
+ const tensorflow::NodeDef& node_def,
+ const std::vector<TRT_TensorOrWeights>& inputs,
+ std::vector<TRT_TensorOrWeights>* outputs) {
+ // prepare input
+ std::vector<nvinfer1::ITensor*> all_inputs;
+ for (auto input : inputs) {
+ all_inputs.emplace_back(const_cast<nvinfer1::ITensor*>(input.tensor()));
+ }
+
+ // plugin is owned by PluginFactory
+ // TODO(jie): destroy plugins later (resource management)
+ PluginTensorRT* plugin =
+ PluginFactoryTensorRT::GetInstance()->CreatePlugin(node_def.op());
+
+ // passing attributes
+  // TODO(jie): support more general attributes
+ TFAttrs attrs(node_def);
+ auto attr_key_vector = attrs.GetAllAttrKey();
+ for (auto attr_key : attr_key_vector) {
+    // TODO(jie): only lists of floats are supported for this toy example.
+ auto data = attrs.get<std::vector<float>>(attr_key);
+ size_t size_data = data.size() * sizeof(float);
+ if (!plugin->SetAttribute(attr_key, static_cast<void*>(data.data()),
+ size_data)) {
+ return tensorflow::errors::InvalidArgument("plugin SetAttribute failed");
+ }
+ }
+
+ nvinfer1::IPluginLayer* layer = ctx.network()->addPlugin(
+ &all_inputs[0], static_cast<int>(inputs.size()), *plugin);
+
+ for (int i = 0; i < layer->getNbOutputs(); i++) {
+ nvinfer1::ITensor* output_tensor = layer->getOutput(i);
+ outputs->push_back(TRT_TensorOrWeights(output_tensor));
+ }
+ return tensorflow::Status::OK();
+}
+
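A minimal sketch of a NodeDef that would take this plugin path; it assumes the example op IncPluginTRT (added under custom_plugin_examples below) has been registered with PluginFactoryTensorRT, and the input name and attribute value are illustrative only.

    tensorflow::NodeDef plugin_node;
    TF_CHECK_OK(tensorflow::NodeDefBuilder("segment/my_inc", "IncPluginTRT")
                    .Input("conv_out", 0, tensorflow::DT_FLOAT)
                    .Attr("inc", std::vector<float>{1.0f})
                    .Finalize(&plugin_node));
    // Converter::convert_node() checks IsPlugin("IncPluginTRT") and dispatches
    // to plugin_converter_ (ConvertPlugin) instead of op_registry_;
    // ConvertPlugin then forwards the "inc" float list via SetAttribute().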
tensorflow::Status ConvertPlaceholder(
Converter& ctx, const tensorflow::NodeDef& node_def,
const std::vector<TRT_TensorOrWeights>& inputs,
@@ -2073,6 +2133,8 @@ void Converter::register_op_converters() {
op_registry_["Reshape"] = ConvertReshape;
op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
+
+ plugin_converter_ = ConvertPlugin;
}
} // namespace
@@ -2144,7 +2206,7 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
if (!status.ok() || !calib_res->calibrator_) {
return tensorflow::errors::FailedPrecondition(
"You must run calibration"
- " and inference conversion in the same proces");
+ " and inference conversion in the same process");
}
calib_res->calibrator_->setDone();
@@ -2213,60 +2275,63 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
return tensorflow::Status::OK();
}
-tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
- // Visit nodes in reverse topological order and construct the TRT network.
-
- // Toposort
+tensorflow::Status ReverseTopologicalSort(
+ const tensorrt::convert::SubGraphParams& s,
+ std::list<tensorflow::Node*>* order) {
std::vector<tensorflow::Node*> order_vec;
tensorflow::GetPostOrder(s.graph, &order_vec);
// Select just the subgraph
- std::list<tensorflow::Node*> order;
for (tensorflow::Node* node : order_vec) {
if (s.subgraph_node_ids.count(node->id())) {
- order.push_front(node); // we want topological order to construct the
+      // We want topological order to construct the
// network layer by layer
+ order->push_front(node);
}
}
- // topological order is needed to build TRT network
- static int static_id = 0;
+ return tensorflow::Status::OK();
+}
+
+tensorflow::Status SetInputList(
+ const tensorrt::convert::SubGraphParams& s,
+ tensorflow::NodeDefBuilder* op_builder,
+ const std::vector<string>* input_names,
+ std::vector<tensorflow::DataType>* input_dtypes) {
+ std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
+ VLOG(2) << "input edge size: " << input_names->size();
+ for (size_t i = 0; i < input_names->size(); ++i) {
+ VLOG(2) << "input edges: " << i << " " << input_names->at(i);
+ int output_idx = s.input_inds.at(i).second;
+    // We wired up the input here already; it is redundant to do it again in
+    // ConvertSubGraphToTensorRT (convert_graph.cc).
+ auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
+ input_names->at(i), output_idx, input_dtypes->at(i));
+ income_edges.push_back(incoming_edge);
+ }
+ tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
+ income_edges);
+ op_builder->Input(input_list);
+ return tensorflow::Status::OK();
+}
+
+string SubgraphNameScopeGenerator(const std::list<tensorflow::Node*>* order) {
string subgraph_name_scope;
- if (!order.empty()) {
- subgraph_name_scope = order.front()->name();
+ if (!order->empty()) {
+ subgraph_name_scope = order->front()->name();
}
- for (const tensorflow::Node* node : order) {
+ for (const tensorflow::Node* node : *order) {
subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
}
// TODO(sami,ben,jie): proper naming!
- string calib_op_name =
- StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
- string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
- static_id++;
- auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
- auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
- auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
- TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
- op_res->logger_ = new tensorflow::tensorrt::Logger();
- op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
-
- if (!op_res->builder_) {
- return tensorflow::errors::Internal(
- "failed to create TensorRT builder object");
- }
-
- op_res->network_ = op_res->builder_->createNetwork();
- if (!op_res->network_) {
- return tensorflow::errors::Internal(
- "failed to create TensorRT network object");
- }
-
- // Build the network
- auto weight_rmgr = trt_rmgr->getManager("WeightStore");
- auto ws = new tensorflow::tensorrt::TRTWeightStore();
- TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
- Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+ return subgraph_name_scope;
+}
- std::vector<string> input_names;
- std::vector<tensorflow::DataType> input_dtypes;
+tensorflow::Status ConvertSubgraph(
+ Converter& converter, tensorrt::convert::SubGraphParams& s,
+ std::list<tensorflow::Node*>* order, std::vector<string>* input_names,
+ std::vector<tensorflow::DataType>* input_dtypes,
+ std::vector<string>* output_names,
+ std::vector<tensorflow::DataType>* output_dtypes,
+ const string& engine_name) {
for (const std::pair<int, int>& input : s.input_inds) {
VLOG(2) << "parsing input. Node id= " << input.first;
int node_id = input.first;
@@ -2309,22 +2374,21 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
auto op_info = op_info_vec.at(shape_inference_output_idx);
tensorflow::DataType tf_dtype = op_info.dtype();
- input_dtypes.push_back(tf_dtype);
+ input_dtypes->push_back(tf_dtype);
nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
auto type_status = ConvertDType(tf_dtype, &dtype);
if (type_status != tensorflow::Status::OK()) {
- LOG(WARNING) << "Data type conversion for input '" << node_name
- << "' failed";
+ LOG(WARNING) << "Type conversion failed for " << node_name;
return type_status;
}
- VLOG(2) << "accessing output index of: " << output_idx
+ VLOG(2) << "Accessing output index of: " << output_idx
<< ", at node: " << node_name
- << "with output entry from shape_map: " << op_info_vec.size();
+ << " with output entry from shape_map: " << op_info_vec.size();
// TODO(ben,jie): update TRT input format/dimension
- nvinfer1::DimsCHW input_dim_psuedo_chw;
- for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
+ nvinfer1::DimsCHW input_dim_pseudo_chw;
+ for (int i = 0; i < 3; i++) input_dim_pseudo_chw.d[i] = 1;
// TODO(jie): TRT 3.x only support 4 dimensional input tensor.
// update the code once TRT 4.0 comes out.
@@ -2338,7 +2402,7 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
for (int i = 1; i < op_info.shape().dim_size(); i++) {
VLOG(2) << "dimension: " << i
<< " , size: " << op_info.shape().dim(i).size();
- input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
+ input_dim_pseudo_chw.d[i - 1] = op_info.shape().dim(i).size();
}
// TODO(ben,jie): proper way to restore input tensor name?
@@ -2347,33 +2411,29 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
input_tensor_name = StrCat(node_name, ":", output_idx);
}
- input_names.push_back(input_tensor_name);
+ input_names->push_back(input_tensor_name);
nvinfer1::ITensor* input_tensor = converter.network()->addInput(
- input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
+ input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
if (!input_tensor)
return tensorflow::errors::InvalidArgument(
"Failed to create Input layer");
- VLOG(2) << "input tensor name :" << input_tensor_name;
+ VLOG(2) << "Input tensor name :" << input_tensor_name;
if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
return tensorflow::errors::AlreadyExists(
- "output tensor already exists for op: " + input_tensor_name);
+ "Output tensor already exists for op: " + input_tensor_name);
}
- VLOG(2) << "finished sorting";
-
- for (const tensorflow::Node* node : order) {
+ for (const tensorflow::Node* node : *order) {
const tensorflow::NodeDef& node_def = node->def();
- VLOG(2) << "converting node: " << node_def.name() << " , " << node_def.op();
+ VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op();
TF_RETURN_IF_ERROR(converter.convert_node(node_def));
}
- VLOG(2) << "finished conversion";
+ VLOG(2) << "Finished conversion";
// Gather output metadata
- std::vector<string> output_names;
- std::vector<tensorflow::DataType> output_dtypes;
int trt_engine_op_output_idx = 0;
for (const std::pair<int, int>& output : s.output_inds) {
int node_id = output.first;
@@ -2388,14 +2448,13 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
: StrCat(engine_name, ":", trt_engine_op_output_idx),
{output_idx, tensor_name}});
trt_engine_op_output_idx++;
- if (output_idx != 0) {
- tensor_name = StrCat(tensor_name, ":", output_idx);
- }
- VLOG(1) << "output tensor name: " << tensor_name;
- output_names.push_back(tensor_name);
+ if (output_idx != 0)
+ tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
+ VLOG(2) << "Output tensor name: " << tensor_name;
+ output_names->push_back(tensor_name);
auto tensor_or_weights = converter.get_tensor(tensor_name);
if (!tensor_or_weights.is_tensor()) {
- return tensorflow::errors::InvalidArgument("Output node'" + tensor_name +
+ return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
"' is weights not tensor");
}
nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
@@ -2405,12 +2464,65 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
}
converter.network()->markOutput(*tensor);
tensorflow::DataType tf_dtype = node->output_type(output_idx);
- output_dtypes.push_back(tf_dtype);
+ output_dtypes->push_back(tf_dtype);
nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
tensor->setType(trt_dtype);
}
+ return tensorflow::Status::OK();
+}
+
+tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
+ // Visit nodes in reverse topological order and construct the TRT network.
+ // Toposort
+ std::list<tensorflow::Node*> order;
+ TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order));
+
+ static int static_id = 0;
+ string subgraph_name_scope = SubgraphNameScopeGenerator(&order);
+ // TODO(sami,ben,jie): proper naming!
+ string calib_op_name =
+ StrCat(subgraph_name_scope, "my_trt_calib_op_", static_id);
+ string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id);
+ static_id++;
+
+ auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
+ auto op_rmgr = trt_rmgr->getManager("TRTCalibOps");
+ auto op_res = new tensorflow::tensorrt::TRTCalibrationResource();
+ TF_CHECK_OK(op_rmgr->Create(calib_op_name, calib_op_name, op_res));
+ op_res->logger_ = new tensorflow::tensorrt::Logger();
+ cudaSetDevice(s.cuda_gpu_id_);
+  op_res->builder_ = nvinfer1::createInferBuilder(*(op_res->logger_));
+  if (!op_res->builder_) {
+    return tensorflow::errors::Internal(
+        "failed to create TensorRT builder object");
+  }
+  op_res->allocator_ = s.allocator_;
+#if NV_TENSORRT_MAJOR > 3
+  op_res->builder_->setGpuAllocator(s.allocator_.get());
+#endif
+
+ op_res->network_ = op_res->builder_->createNetwork();
+ if (!op_res->network_) {
+ return tensorflow::errors::Internal(
+ "failed to create TensorRT network object");
+ }
+
+ // Build the network
+ auto weight_rmgr = trt_rmgr->getManager("WeightStore");
+ auto ws = new tensorflow::tensorrt::TRTWeightStore();
+ TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
+ Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+
+ std::vector<string> input_names;
+ std::vector<tensorflow::DataType> input_dtypes;
+ std::vector<string> output_names;
+ std::vector<tensorflow::DataType> output_dtypes;
+ TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names,
+ &input_dtypes, &output_names,
+ &output_dtypes, engine_name));
+
VLOG(2) << "Finished processing outputs";
// Build the engine
@@ -2422,21 +2534,8 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
// Build the TRT op
// TODO(sami,ben,jie): proper naming!
tensorflow::NodeDefBuilder op_builder(calib_op_name, "TRTCalibOp");
- std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
- for (size_t i = 0; i < input_names.size(); ++i) {
- int output_idx = s.input_inds.at(i).second;
- // we wired up the input here already, it is redundant to do it again in
- // ConvertSubGraphToTensorRT(convert_graph.cc)
- auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
- input_names.at(i), output_idx, input_dtypes.at(i));
- VLOG(1) << calib_op_name << " input " << i << " = " << input_names.at(i)
- << ":" << output_idx
- << " dType= " << tensorflow::DataTypeString(input_dtypes.at(i));
- income_edges.push_back(incoming_edge);
- }
- tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
- income_edges);
- op_builder.Input(input_list);
+  TF_RETURN_IF_ERROR(
+      SetInputList(s, &op_builder, &input_names, &input_dtypes));
+
std::vector<string> segment_names;
segment_names.reserve(s.subgraph_node_ids.size());
for (int i : s.subgraph_node_ids) {
@@ -2460,46 +2559,29 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
tensorrt::convert::SubGraphParams& s) {
// Visit nodes in reverse topological order and construct the TRT network.
-
- // Toposort
- std::vector<tensorflow::Node*> order_vec;
- tensorflow::GetPostOrder(s.graph, &order_vec);
- // Select just the subgraph
std::list<tensorflow::Node*> order;
- for (tensorflow::Node* node : order_vec) {
- if (s.subgraph_node_ids.count(node->id())) {
- // We want topological order to contstruct the
- // network layer by layer
- order.push_front(node);
- }
- }
- // Topological order is needed to build TRT network
+ TF_RETURN_IF_ERROR(ReverseTopologicalSort(s, &order));
- tensorflow::tensorrt::Logger trt_logger;
+ static int static_id = 0;
+ string subgraph_name_scope = SubgraphNameScopeGenerator(&order);
+ string engine_name = StrCat(subgraph_name_scope, "my_trt_op", static_id++);
+ tensorflow::tensorrt::Logger trt_logger;
+ cudaSetDevice(s.cuda_gpu_id_);
auto trt_builder = infer_object(nvinfer1::createInferBuilder(trt_logger));
if (!trt_builder) {
return tensorflow::errors::Internal(
"Failed to create TensorRT builder object");
}
-
+#if NV_TENSORRT_MAJOR > 3
+ trt_builder->setGpuAllocator(s.allocator_.get());
+#endif
auto trt_network = infer_object(trt_builder->createNetwork());
if (!trt_network) {
return tensorflow::errors::Internal(
"Failed to create TensorRT network object");
}
- string subgraph_name_scope;
- if (!order.empty()) {
- subgraph_name_scope = order.front()->name();
- }
- for (const tensorflow::Node* node : order) {
- subgraph_name_scope = GetCommonNameScope(subgraph_name_scope, node->name());
- }
- static int static_id = 0;
- // TODO(sami,ben,jie): proper naming!
- string engine_name = StrCat(subgraph_name_scope, "my_trt_op");
- engine_name = StrCat(engine_name, static_id++);
auto trt_rmgr = tensorflow::tensorrt::TRTResourceManager::instance();
auto weight_rmgr = trt_rmgr->getManager("WeightStore");
auto ws = new tensorflow::tensorrt::TRTWeightStore();
@@ -2510,147 +2592,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
std::vector<string> input_names;
std::vector<tensorflow::DataType> input_dtypes;
- for (const std::pair<int, int>& input : s.input_inds) {
- VLOG(2) << "parsing input. Node id= " << input.first;
- int node_id = input.first;
- int output_idx = input.second;
- tensorflow::Node* node = s.graph.FindNodeId(node_id);
- auto node_name = node->name();
- // input_names should use the node name in the graph
- // here it should be the input tensor name -> matching the binding
- // insert original node name without port
- auto tensor_name = node_name;
- if (output_idx != 0) {
- tensor_name = StrCat(tensor_name, ":", output_idx);
- }
-
- VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
- << " idx: " << output_idx;
-
- auto shape_inference_node_name = node_name;
- auto shape_inference_output_idx = output_idx;
- // rewire the shape inference to original node in the graph
- if (s.output_edge_map->count(tensor_name)) {
- shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
- shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
- }
- if (shape_inference_output_idx < 0) continue;
- VLOG(2) << "shapeinference name: " << shape_inference_node_name
- << " idx: " << shape_inference_output_idx;
-
- if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
- return tensorflow::errors::Internal("failed to find input node: " +
- shape_inference_node_name);
-
- auto op_info_vec =
- s.graph_properties.GetOutputProperties(shape_inference_node_name);
- if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
- return tensorflow::errors::Internal(
- "accessing output index of: ", shape_inference_output_idx,
- ", at node: ", shape_inference_node_name,
- " with output entry from shape_map: ", op_info_vec.size());
-
- auto op_info = op_info_vec.at(shape_inference_output_idx);
- tensorflow::DataType tf_dtype = op_info.dtype();
- input_dtypes.push_back(tf_dtype);
-
- nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
- auto type_status = ConvertDType(tf_dtype, &dtype);
- if (type_status != tensorflow::Status::OK()) {
- LOG(WARNING) << "Type conversion failed for " << node_name;
- return type_status;
- }
-
- VLOG(2) << "Accessing output index of: " << output_idx
- << ", at node: " << node_name
- << " with output entry from shape_map: " << op_info_vec.size();
- // TODO(ben,jie): update TRT input format/dimension
- nvinfer1::DimsCHW input_dim_psuedo_chw;
- for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
-
- // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
- // update the code once TRT 4.0 comes out.
- if (op_info.shape().dim_size() != 4) {
- string err_str = "Require 4 dimensional input.";
- StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
- shape_inference_node_name);
- return tensorflow::errors::Unimplemented(err_str);
- }
-
- for (int i = 1; i < op_info.shape().dim_size(); i++) {
- VLOG(2) << "dimension: " << i
- << " , size: " << op_info.shape().dim(i).size();
- input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
- }
-
- // TODO(ben,jie): proper way to restore input tensor name?
- auto input_tensor_name = node_name;
- if (output_idx != 0) {
- input_tensor_name = StrCat(node_name, ":", output_idx);
- }
-
- input_names.push_back(input_tensor_name);
- nvinfer1::ITensor* input_tensor = converter.network()->addInput(
- input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
-
- if (!input_tensor)
- return tensorflow::errors::InvalidArgument(
- "Failed to create Input layer");
- VLOG(2) << "Input tensor name :" << input_tensor_name;
-
- if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
- return tensorflow::errors::AlreadyExists(
- "Output tensor already exists for op: " + input_tensor_name);
- }
-
- VLOG(2) << "Finished sorting";
-
- for (const tensorflow::Node* node : order) {
- const tensorflow::NodeDef& node_def = node->def();
- VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op();
- TF_RETURN_IF_ERROR(converter.convert_node(node_def));
- }
-
- VLOG(2) << "Finished conversion";
-
- // Gather output metadata
std::vector<string> output_names;
std::vector<tensorflow::DataType> output_dtypes;
- int trt_engine_op_output_idx = 0;
- for (const std::pair<int, int>& output : s.output_inds) {
- int node_id = output.first;
- int output_idx = output.second;
- tensorflow::Node* node = s.graph.FindNodeId(node_id);
- string op_name = node->name();
- string tensor_name = op_name;
-
- s.output_edge_map->insert(
- {trt_engine_op_output_idx == 0
- ? engine_name
- : StrCat(engine_name, ":", trt_engine_op_output_idx),
- {output_idx, tensor_name}});
- trt_engine_op_output_idx++;
- if (output_idx != 0)
- tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
- VLOG(2) << "Output tensor name: " << tensor_name;
- output_names.push_back(tensor_name);
- auto tensor_or_weights = converter.get_tensor(tensor_name);
- if (!tensor_or_weights.is_tensor()) {
- return tensorflow::errors::InvalidArgument("Output node '" + tensor_name +
- "' is weights not tensor");
- }
- nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
- if (!tensor) {
- return tensorflow::errors::NotFound("Output tensor not found: " +
- tensor_name);
- }
- converter.network()->markOutput(*tensor);
- tensorflow::DataType tf_dtype = node->output_type(output_idx);
- output_dtypes.push_back(tf_dtype);
- nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
- TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
- tensor->setType(trt_dtype);
- }
+ TF_RETURN_IF_ERROR(ConvertSubgraph(converter, s, &order, &input_names,
+ &input_dtypes, &output_names,
+ &output_dtypes, engine_name));
VLOG(2) << "Finished output";
@@ -2686,20 +2632,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
// Build the TRT op
tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
- std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
- VLOG(2) << "input edge size: " << input_names.size();
- for (size_t i = 0; i < input_names.size(); ++i) {
- VLOG(2) << "input edges: " << i << " " << input_names.at(i);
- int output_idx = s.input_inds.at(i).second;
- // we wired up the input here already, it is redundant to do it again in
- // ConvertSubGraphToTensorRT(convert_graph.cc)
- auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
- input_names.at(i), output_idx, input_dtypes.at(i));
- income_edges.push_back(incoming_edge);
- }
- tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
- income_edges);
- op_builder.Input(input_list);
+  TF_RETURN_IF_ERROR(
+      SetInputList(s, &op_builder, &input_names, &input_dtypes));
VLOG(0) << "Finished op preparation";
@@ -2707,9 +2640,11 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
.Attr("input_nodes", input_names)
.Attr("output_nodes", output_names)
.Attr("OutT", output_dtypes)
+ .Device(s.device_name_)
.Finalize(s.trt_node);
- VLOG(0) << status.ToString() << " finished op building";
+ VLOG(0) << status.ToString() << " finished op building for " << engine_name
+ << " on device " << s.device_name_;
return tensorflow::Status::OK();
}
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 954a1e72f8..3f6592cd25 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -22,11 +22,11 @@ limitations under the License.
#include <utility>
#include <vector>
+#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/grappler/costs/graph_properties.h"
#include "tensorflow/core/lib/core/status.h"
-
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
@@ -48,7 +48,9 @@ struct SubGraphParams {
const tensorflow::grappler::GraphProperties& current_graph_properties,
std::unordered_map<string, std::pair<int, string>>* output_edges,
tensorflow::NodeDef* constructed_trt_node,
- int engine_precision_mode = FP32MODE)
+ int engine_precision_mode = FP32MODE, const string& device_name = "",
+ std::shared_ptr<nvinfer1::IGpuAllocator> allocator = nullptr,
+ int cuda_gpu_id = 0)
: graph(inp_graph),
subgraph_node_ids(subgraph_node_id_numbers),
input_inds(input_indices),
@@ -58,7 +60,10 @@ struct SubGraphParams {
graph_properties(current_graph_properties),
output_edge_map(output_edges),
trt_node(constructed_trt_node),
- precision_mode(engine_precision_mode) {}
+ precision_mode(engine_precision_mode),
+ device_name_(device_name),
+ allocator_(allocator),
+ cuda_gpu_id_(cuda_gpu_id) {}
tensorflow::Graph& graph;
const std::set<int>& subgraph_node_ids;
@@ -70,6 +75,9 @@ struct SubGraphParams {
std::unordered_map<string, std::pair<int, string>>* output_edge_map;
tensorflow::NodeDef* trt_node;
const int precision_mode;
+ const string device_name_;
+ std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
+ const int cuda_gpu_id_;
};
// TODO(sami): Replace references with const reference or pointers
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
new file mode 100644
index 0000000000..8f634b1f74
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
@@ -0,0 +1,246 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h"
+#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+// TODO(sami): Remove VLOG messages once the code matures
+using tensorflow::str_util::Uppercase;
+using tensorflow::strings::StrAppend;
+using tensorflow::strings::StrCat;
+
+tensorflow::Status TRTOptimizationPass::Init(
+ const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
+ VLOG(1) << "Called INIT for " << name_ << " with config = " << config;
+ if (config == nullptr) {
+ maximum_workspace_size_ = 2 << 30;
+ return tensorflow::Status::OK();
+ }
+ const auto params = config->parameter_map();
+ if (params.count("minimum_segment_size")) {
+ minimum_segment_size_ = params.at("minimum_segment_size").i();
+ }
+ if (params.count("max_batch_size")) {
+ maximum_batch_size_ = params.at("max_batch_size").i();
+ }
+ if (params.count("max_workspace_size_bytes"))
+ maximum_workspace_size_ = params.at("max_workspace_size_bytes").i();
+ if (params.count("precision_mode")) {
+ string pm = Uppercase(params.at("precision_mode").s());
+ if (pm == "FP32") {
+ precision_mode_ = 0;
+ } else if (pm == "FP16") {
+ precision_mode_ = 1;
+ } else if (pm == "INT8") {
+ precision_mode_ = 2;
+ } else {
+ LOG(ERROR) << "Unknown precision mode '" << pm << "'";
+      return tensorflow::errors::InvalidArgument(
+          "Unknown precision mode argument: " + pm +
+          ". Valid values are FP32, FP16, INT8");
+ }
+ }
+ return tensorflow::Status::OK();
+}
+
+void TRTOptimizationPass::PrintDebugInfo(
+ tensorflow::grappler::Cluster* cluster,
+ const tensorflow::grappler::GrapplerItem& item) {
+ VLOG(1) << "Cluster = " << cluster;
+ string offset(" ");
+ string offset2 = StrCat(offset, offset);
+ string offset3 = StrCat(offset2, offset);
+ string offset4 = StrCat(offset2, offset2);
+ if (cluster) {
+ VLOG(1) << offset << "type = " << cluster->type();
+ VLOG(1) << offset << "num warmup steps = " << cluster->NumWarmupSteps();
+ const auto dev_names = cluster->GetDeviceNames();
+ if (dev_names.size()) {
+ VLOG(1) << offset << " Device names:";
+ for (const auto s : dev_names) {
+ VLOG(1) << offset2 << s;
+ }
+ }
+ std::unordered_map<string, uint64> peak_mem;
+ auto status = cluster->GetPeakMemoryUsage(&peak_mem);
+ if (status == tensorflow::Status::OK()) {
+ VLOG(1) << offset << "Peak Memory Usage :";
+ for (auto s : peak_mem) {
+ VLOG(1) << offset2 << s.first << " = " << s.second;
+ }
+ }
+
+ const auto dev_props = cluster->GetDevices();
+ if (dev_props.size()) {
+ VLOG(1) << offset << "Device properties:";
+ for (auto k : dev_props) {
+ VLOG(1) << offset2 << k.first;
+ const auto& dt = k.second;
+ VLOG(1) << offset3 << "type = " << dt.type();
+ VLOG(1) << offset3 << "vendor = " << dt.vendor();
+ VLOG(1) << offset3 << "model = " << dt.model();
+ VLOG(1) << offset3 << "frequency = " << dt.frequency();
+ VLOG(1) << offset3 << "num cores = " << dt.num_cores();
+ VLOG(1) << offset3 << "num registers = " << dt.num_registers();
+ VLOG(1) << offset3 << "L1 cache size = " << dt.l1_cache_size();
+ VLOG(1) << offset3 << "L2 cache size = " << dt.l2_cache_size();
+ VLOG(1) << offset3 << "L3 cache size = " << dt.l3_cache_size();
+ VLOG(1) << offset3 << "SHMem per SMP = "
+ << dt.shared_memory_size_per_multiprocessor();
+ VLOG(1) << offset3 << "memory size = " << dt.memory_size();
+ VLOG(1) << offset3 << "bandwidth = " << dt.bandwidth();
+ if (dt.environment_size()) {
+ VLOG(1) << offset3 << "environment :";
+ for (const auto e : dt.environment()) {
+ VLOG(1) << offset4 << e.first << " = " << e.second;
+ }
+ }
+ }
+ }
+ }
+ VLOG(1) << "item: " << item.id;
+ if (item.feed.size()) {
+ VLOG(1) << offset << "Feeds :";
+ for (const auto& f : item.feed) {
+ const auto& shape = f.second.shape();
+ VLOG(1) << offset2 << f.first << " = shaped " << shape.DebugString();
+ }
+ } else {
+ VLOG(1) << offset << "No Feeds";
+ }
+ if (item.fetch.size()) {
+ VLOG(1) << offset << "Fetches :";
+ for (const auto& f : item.fetch) {
+ VLOG(1) << offset2 << f;
+ }
+ } else {
+ VLOG(1) << offset << "No Fetches";
+ }
+
+ if (item.init_ops.size()) {
+ VLOG(1) << offset << "init ops :";
+ for (const auto& f : item.init_ops) {
+ VLOG(1) << offset2 << f;
+ }
+ } else {
+ VLOG(1) << offset << "No init ops";
+ }
+ VLOG(1) << "Save Op = " << item.save_op;
+ VLOG(1) << "Restore Op = " << item.restore_op;
+ VLOG(1) << "save_restore_loc_tensor = " << item.save_restore_loc_tensor;
+ if (item.keep_ops.size()) {
+ VLOG(1) << offset << "keep ops :";
+ for (const auto& f : item.keep_ops) {
+ VLOG(1) << offset2 << f;
+ }
+ } else {
+ VLOG(1) << offset << "No keep ops";
+ }
+ VLOG(3) << item.graph.DebugString();
+  if (cluster && cluster->GetDeviceSet()) {
+    for (const auto dev : cluster->GetDeviceSet()->devices()) {
+      const auto& pname = dev->parsed_name();
+      VLOG(1) << "Device name= " << dev->name()
+              << " parsedname job= " << pname.job << " id= " << pname.id
+              << " has_id: " << pname.has_id << " has_job: " << pname.has_job
+              << " has_type: " << pname.has_type << " type= " << pname.type;
+    }
+  }
+}
+
+tensorflow::Status TRTOptimizationPass::Optimize(
+ tensorflow::grappler::Cluster* cluster,
+ const tensorflow::grappler::GrapplerItem& item, GraphDef* optimized_graph) {
+ VLOG(1) << "Called TRTOptimization Pass " << name_;
+ if (VLOG_IS_ON(1)) {
+ PrintDebugInfo(cluster, item);
+ }
+ int max_dim = -1;
+ if (item.feed.size()) {
+ for (const auto& f : item.feed) {
+ const auto& shape = f.second.shape();
+ if (shape.dims() > 0) {
+ if (shape.dim_size(0) > max_dim) max_dim = shape.dim_size(0);
+ }
+ }
+ }
+ if (maximum_batch_size_ < 0) { // automatic batch size from input
+ if (max_dim > 0) {
+ maximum_batch_size_ = max_dim;
+ VLOG(1) << "Setting maximum batch size to " << max_dim;
+ } else {
+ maximum_batch_size_ = 128;
+      LOG(WARNING) << "Maximum batch size is not set"
+                      " and can't be deduced from inputs; setting it to "
+ << maximum_batch_size_
+ << ". Suggest configuring it from configuration parameters";
+ }
+ } else {
+ if (max_dim > maximum_batch_size_) {
+ LOG(WARNING) << "Configured batch size " << maximum_batch_size_
+ << " is less than input batch size " << max_dim
+ << " adjusting maximum batch size to match input batch size";
+ }
+ }
+ tensorflow::grappler::GraphProperties static_graph_properties(item);
+ TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
+ auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(
+ item.graph, item.fetch, maximum_batch_size_, maximum_workspace_size_,
+ optimized_graph, precision_mode_, minimum_segment_size_,
+ static_graph_properties, cluster);
+ VLOG(2) << optimized_graph->DebugString();
+ return status;
+}
+
+void TRTOptimizationPass::Feedback(
+ tensorflow::grappler::Cluster* cluster,
+ const tensorflow::grappler::GrapplerItem& item,
+ const GraphDef& optimized_graph, double result) {}
+
+} // namespace convert
+} // namespace tensorrt
+} // namespace tensorflow
+
+class VerboseCustomGraphOptimizerRegistrar
+ : public tensorflow::grappler::CustomGraphOptimizerRegistrar {
+ public:
+ VerboseCustomGraphOptimizerRegistrar(
+ const tensorflow::grappler::CustomGraphOptimizerRegistry::Creator& cr,
+ const tensorflow::string& name)
+ : tensorflow::grappler::CustomGraphOptimizerRegistrar(cr, name) {
+ VLOG(1) << "Constructing a CustomOptimizationPass registration object for "
+ << name;
+ }
+};
+
+static VerboseCustomGraphOptimizerRegistrar TRTOptimizationPass_Registrar(
+ []() {
+ VLOG(1)
+ << "Instantiating CustomOptimizationPass object TensorRTOptimizer";
+ return new tensorflow::tensorrt::convert::TRTOptimizationPass(
+ "TensorRTOptimizer");
+ },
+ ("TensorRTOptimizer"));
+
+#endif
+#endif
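A hedged sketch of how the pass registered above as "TensorRTOptimizer" could be enabled through a RewriterConfig; the add_custom_optimizers()/mutable_parameter_map() accessors are assumed from the standard generated proto API, and the parameter names match Init() above.

    tensorflow::RewriterConfig rw_cfg;
    auto* trt = rw_cfg.add_custom_optimizers();
    trt->set_name("TensorRTOptimizer");
    (*trt->mutable_parameter_map())["minimum_segment_size"].set_i(3);
    (*trt->mutable_parameter_map())["max_batch_size"].set_i(8);
    (*trt->mutable_parameter_map())["max_workspace_size_bytes"].set_i(1 << 30);
    (*trt->mutable_parameter_map())["precision_mode"].set_s("FP32");
    // A grappler MetaOptimizer constructed with rw_cfg (as in
    // ConvertGraphDefToTensorRT) would then invoke
    // TRTOptimizationPass::Optimize() on the item.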
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
new file mode 100644
index 0000000000..d8ecead23e
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/platform/logging.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
+ public:
+ TRTOptimizationPass(const string& name = "TRTOptimizationPass")
+ : name_(name),
+ minimum_segment_size_(3),
+ precision_mode_(0),
+ maximum_batch_size_(-1),
+ maximum_workspace_size_(-1) {
+ VLOG(1) << "Constructing " << name_;
+ }
+
+  string name() const override { return name_; }
+
+ tensorflow::Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer*
+ config = nullptr) override;
+
+ tensorflow::Status Optimize(tensorflow::grappler::Cluster* cluster,
+ const tensorflow::grappler::GrapplerItem& item,
+ GraphDef* optimized_graph) override;
+
+ void Feedback(tensorflow::grappler::Cluster* cluster,
+ const tensorflow::grappler::GrapplerItem& item,
+ const GraphDef& optimized_graph, double result) override;
+
+ void PrintDebugInfo(tensorflow::grappler::Cluster* cluster,
+ const tensorflow::grappler::GrapplerItem& item);
+
+ private:
+ string name_;
+ int minimum_segment_size_;
+ int precision_mode_;
+ int maximum_batch_size_;
+ int64_t maximum_workspace_size_;
+};
+
+} // namespace convert
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
+#endif // GOOGLE_TENSORRT
+#endif // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
new file mode 100644
index 0000000000..a89cf3ab8b
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
@@ -0,0 +1,118 @@
+# Description:
+#   Example of plugin support in TensorRT (http://developer.nvidia.com/tensorrt)
+#   through TensorFlow integration. Targets TensorRT 3.0.4; APIs are expected
+#   to change when upgrading TRT.
+#   Add init_py to the pip package BUILD dependencies to install it.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"]) # Apache 2.0
+
+load(
+ "//tensorflow:tensorflow.bzl",
+ "tf_custom_op_library",
+ "tf_custom_op_library_additional_deps",
+ "tf_gen_op_libs",
+ "tf_gen_op_wrapper_py",
+ "tf_kernel_library",
+)
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load(
+ "@local_config_tensorrt//:build_defs.bzl",
+ "if_tensorrt",
+)
+
+tf_gen_op_libs(
+ op_lib_names = ["inc_op"],
+)
+
+tf_gen_op_wrapper_py(
+ name = "inc_op",
+ deps = [":inc_op_op_lib"],
+)
+
+tf_custom_op_library(
+ name = "_inc_op.so",
+ srcs = [
+ "inc_op_kernel.h",
+ "inc_op_plugin.cc",
+ "inc_op_plugin.h",
+ "ops/inc_op.cc",
+ ],
+ gpu_srcs = [
+ "inc_op_kernel.h",
+ "inc_op_kernel.cu.cc",
+ ],
+ deps = [
+ "//tensorflow/contrib/tensorrt:trt_plugins",
+ "//tensorflow/core:framework_lite",
+ ] + if_tensorrt([
+ "@local_config_tensorrt//:nv_infer",
+ ]),
+)
+
+tf_kernel_library(
+ name = "inc_op_plugin_kernel",
+ srcs = ["inc_op_plugin.cc"],
+ hdrs = [
+ "inc_op_kernel.h",
+ "inc_op_plugin.h",
+ ],
+ gpu_srcs = [
+ "inc_op_kernel.h",
+ "inc_op_kernel.cu.cc",
+ ],
+ deps = [
+ "//tensorflow/contrib/tensorrt:trt_plugins",
+ "//tensorflow/core:stream_executor_headers_lib",
+ ] + if_tensorrt([
+ "@local_config_tensorrt//:nv_infer",
+ ]) + tf_custom_op_library_additional_deps(),
+)
+
+tf_custom_op_py_library(
+ name = "inc_op_loader",
+ srcs = ["inc_op.py"],
+ dso = [
+ ":_inc_op.so",
+ ],
+ kernels = [
+ ":inc_op_op_lib",
+ ":inc_op_plugin_kernel",
+ ],
+ srcs_version = "PY2AND3",
+ deps = [
+ "//tensorflow/python:framework_for_generated_wrappers",
+ "//tensorflow/python:resources",
+ ],
+)
+
+py_library(
+ name = "init_py",
+ srcs = ["__init__.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":inc_op",
+ ":inc_op_loader",
+ ],
+)
+
+cuda_py_test(
+ name = "plugin_test",
+ size = "small",
+ srcs = ["plugin_test.py"],
+ additional_deps = [
+ ":init_py",
+ "//tensorflow/contrib/util:util_py",
+ "//tensorflow/contrib/tensorrt:init_py",
+ "//tensorflow/python:platform",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:tf_optimizer",
+ ],
+ tags = [
+ "manual",
+ "noguitar",
+ "notap",
+ ],
+)
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py
new file mode 100644
index 0000000000..363edab2e8
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Import custom op for plugin and register it in plugin factory registry."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tensorrt.custom_plugin_examples import inc_op as import_inc_op_so
+from tensorflow.contrib.tensorrt.custom_plugin_examples.ops import gen_inc_op
+
+inc_op = gen_inc_op.inc_plugin_trt
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py
new file mode 100644
index 0000000000..a007c3f54e
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Loader for the custom inc_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import platform
+
+if platform.system() != "Windows":
+ # pylint: disable=g-import-not-at-top
+ from tensorflow.contrib.util import loader
+ from tensorflow.python.platform import resource_loader
+ # pylint: enable=g-import-not-at-top
+
+ _inc_op = loader.load_op_library(
+ resource_loader.get_path_to_datafile("_inc_op.so"))
+else:
+ raise RuntimeError("Windows not supported")
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
new file mode 100644
index 0000000000..988b35f74f
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.cu.cc
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorflow/core/platform/stream_executor.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+__global__ void VecInc(const float* vec, float inc, float* dest, int n) {
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ if (i < n) dest[i] = vec[i] + inc;
+}
+
+void IncrementKernel(const float* d_input, float inc, float* d_output,
+ int count, cudaStream_t stream) {
+ int threads_per_block = 256;
+ int blocks_per_grid = (count + threads_per_block - 1) / threads_per_block;
+
+  VecInc<<<blocks_per_grid, threads_per_block, 0, stream>>>(d_input, inc,
+                                                            d_output, count);
+}
+
+// Note: this kernel definition is not needed by the plugin_test rule, but it
+// is required for the TF program to be correct on its own, i.e. the test
+// should still work when the plugin is not used or when the graph is run
+// through the TRT optimization pass.
+class IncPluginTRT : public OpKernel {
+ public:
+ explicit IncPluginTRT(OpKernelConstruction* context) : OpKernel(context) {
+ std::vector<float> inc_list;
+ OP_REQUIRES_OK(context, context->GetAttr("inc", &inc_list));
+ OP_REQUIRES(context, inc_list.size() == 1,
+ errors::InvalidArgument(
+                    "The increment list should contain a single element."));
+ inc_ = inc_list[0];
+ }
+
+ void Compute(OpKernelContext* context) override {
+ const Tensor& input_tensor = context->input(0);
+ const TensorShape& input_shape = input_tensor.shape();
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input_shape, &output_tensor));
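+    // Extract the raw cudaStream_t used by this op so the kernel is enqueued
+    // on the same stream as the rest of the TF graph.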
+ const cudaStream_t* stream = CHECK_NOTNULL(
+ reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+ ->stream()
+ ->implementation()
+ ->CudaStreamMemberHack()));
+ IncrementKernel(input_tensor.flat<float>().data(), inc_,
+ output_tensor->flat<float>().data(),
+ input_shape.num_elements(), *stream);
+ }
+
+ private:
+ float inc_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("IncPluginTRT").Device(DEVICE_GPU), IncPluginTRT);
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
+#endif // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h
new file mode 100644
index 0000000000..c35955e105
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_KERNEL_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_KERNEL_H_
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+void IncrementKernel(const float* d_input, float inc, float* d_output,
+ int count, cudaStream_t stream);
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
+
+#endif // TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_KERNEL_H_
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
new file mode 100644
index 0000000000..8d4c893af5
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h"
+
+#include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+const char* kPluginName = "IncPluginTRT";
+
+IncOpPlugin* CreateIncPlugin() { return new IncOpPlugin(); }
+
+IncOpPlugin* CreateIncPluginDeserialize(const void* buffer, size_t length) {
+ return new IncOpPlugin(buffer, length);
+}
+
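+// Register the plugin with the factory so it can be constructed during graph
+// conversion and re-created from serialized data when an engine is
+// deserialized.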
+REGISTER_TRT_PLUGIN(kPluginName, CreateIncPluginDeserialize, CreateIncPlugin);
+
+IncOpPlugin::IncOpPlugin() : plugin_name_(kPluginName) {}
+
+IncOpPlugin::IncOpPlugin(const void* serialized_data, size_t length)
+ : PluginTensorRT(serialized_data, length), plugin_name_(kPluginName) {
+  // Skip the bytes already consumed by the parent class deserialization.
+ size_t consumed_data = PluginTensorRT::getSerializationSize();
+ assert(length - consumed_data >= sizeof(float));
+ const char* buffer = reinterpret_cast<const char*>(serialized_data);
+ SetAttribute("inc", buffer + consumed_data, sizeof(float));
+}
+
+bool IncOpPlugin::SetAttribute(const string& key, const void* ptr,
+ const size_t size) {
+ if (strcmp(key.c_str(), "inc") == 0 && size == sizeof(float)) {
+    StoreAttribute(key, ptr, size);  // Store a copy so the plugin owns the data.
+ inc_ = *static_cast<const float*>(ptr);
+ return true;
+ }
+ return false;
+}
+
+bool IncOpPlugin::GetAttribute(const string& key, const void** ptr,
+ size_t* size) const {
+ const auto& iter = attr_map_.find(key);
+ if (iter != attr_map_.end()) {
+ *ptr = iter->second.data();
+ *size = iter->second.size();
+ return true;
+ }
+ return false;
+}
+
+int IncOpPlugin::enqueue(int batch_size, const void* const* inputs,
+ void** outputs, void*, cudaStream_t stream) {
+ int count = 1;
+ for (int i = 0; i < input_dim_list_[0].nbDims; i++) {
+ count *= input_dim_list_[0].d[i];
+ }
+ count *= batch_size;
+ const float* input = reinterpret_cast<const float*>(inputs[0]);
+ float* output = reinterpret_cast<float*>(outputs[0]);
+ IncrementKernel(input, inc_, output, count, stream);
+ return 0;
+}
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
+#endif // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
new file mode 100644
index 0000000000..189e9c939b
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_PLUGIN_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_PLUGIN_H_
+
+#include <cassert>
+#include <cstring>
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class IncOpPlugin : public PluginTensorRT {
+ public:
+ IncOpPlugin();
+
+ IncOpPlugin(const void* serialized_data, size_t length);
+
+  const string& GetPluginName() const override { return plugin_name_; }
+
+  bool Finalize() override { return true; }
+
+ bool SetAttribute(const string& key, const void* ptr,
+ const size_t size) override;
+
+ bool GetAttribute(const string& key, const void** ptr,
+ size_t* size) const override;
+
+ int getNbOutputs() const override { return 1; }
+
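+  // The element-wise increment preserves the input shape, so the output
+  // dimensions equal the input dimensions.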
+ nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+ int num_input_dims) override {
+ assert(index == 0);
+ assert(num_input_dims == 1);
+ return inputs[0];
+ }
+
+  // Use configure() to record the input dimensions.
+ void configure(const nvinfer1::Dims* inputs, int num_inputs,
+ const nvinfer1::Dims* outputs, int num_outputs,
+ int max_batch_size) override {
+ assert(num_inputs == 1);
+ PluginTensorRT::configure(inputs, num_inputs, outputs, num_outputs,
+ max_batch_size);
+ }
+
+ int initialize() override { return 0; }
+
+ void terminate() override {}
+
+ size_t getWorkspaceSize(int max_batch_size) const override { return 0; }
+
+ int enqueue(int batch_size, const void* const* inputs, void** outputs,
+ void* workspace, cudaStream_t stream) override;
+
+ size_t getSerializationSize() override {
+ return PluginTensorRT::getSerializationSize() + sizeof(float);
+ }
+
+ void serialize(void* buffer) override {
+ // Serialize parent data.
+ PluginTensorRT::serialize(buffer);
+    // Advance the buffer past the parent's serialized data.
+ buffer =
+ static_cast<char*>(buffer) + PluginTensorRT::getSerializationSize();
+ std::memcpy(buffer, &inc_, sizeof(float));
+ buffer = static_cast<char*>(buffer) + sizeof(float);
+ }
+
+ protected:
+ float inc_;
+ nvinfer1::Dims dim_;
+
+ private:
+ const string plugin_name_;
+};
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
+
+#endif // TENSORFLOW_CONTRIB_TENSORRT_CUSTOM_PLUGIN_EXAMPLES_INC_OP_PLUGIN_H_
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc
new file mode 100644
index 0000000000..d0eb0d299d
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/ops/inc_op.cc
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+
+REGISTER_OP("IncPluginTRT")
+ .Attr("inc: list(float)")
+ .Input("input: float32")
+ .Output("output: float32")
+ .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+ c->set_output(0, c->input(0));
+ return Status::OK();
+ });
+
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
+#endif // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py b/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py
new file mode 100644
index 0000000000..bc4d270bec
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/plugin_test.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to show usage of TensorRT custom op & plugin."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib import tensorrt
+from tensorflow.contrib.tensorrt import custom_plugin_examples
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class TrtPluginTest(test_util.TensorFlowTestCase):
+
+ def _get_plugin_graph_def(self):
+ """Create a simple graph and return its graph_def."""
+ g = ops.Graph()
+ with g.as_default():
+ a = array_ops.placeholder(
+ dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+ relu = nn.relu(a, "relu")
+ v = nn_ops.max_pool(
+ relu, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+
+ # insert custom_op in the graph
+ v = custom_plugin_examples.inc_op(v, inc=[16.5], name="plugin_test")
+
+ v *= 2.0
+ v = nn.relu(v)
+ v = nn.relu(v)
+ array_ops.squeeze(v, name="output")
+ return g.as_graph_def()
+
+ def _run_graph(self, gdef, dumm_inp):
+ """Run given graphdef once."""
+ gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+ ops.reset_default_graph()
+ g = ops.Graph()
+ with g.as_default():
+ inp, out = importer.import_graph_def(
+ graph_def=gdef, return_elements=["input", "output"])
+ inp = inp.outputs[0]
+ out = out.outputs[0]
+
+ with session.Session(
+ config=config_pb2.ConfigProto(gpu_options=gpu_options),
+ graph=g) as sess:
+ val = sess.run(out, {inp: dumm_inp})
+ return val
+
+ def testIncOpPlugin(self):
+ inp_dims = (5, 24, 24, 2)
+ dummy_input = numpy.ones(inp_dims).astype(numpy.float32)
+ orig_graph = self._get_plugin_graph_def() # graph with plugin node
+
+    # Trigger the conversion. The plugin op was registered at import time, so
+    # the converter can create the corresponding plugin layer during
+    # conversion.
+ trt_graph = tensorrt.create_inference_graph(
+ input_graph_def=orig_graph,
+ outputs=["output"],
+ max_batch_size=inp_dims[0],
+ max_workspace_size_bytes=1 << 25,
+ precision_mode="FP32",
+ minimum_segment_size=2)
+ o2 = self._run_graph(trt_graph, dummy_input)
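+    # Input of ones -> relu -> max_pool gives 1.0; the plugin adds 16.5 and the
+    # graph multiplies by 2, so every output element is 35.0.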
+ self.assertEqual(35, o2.reshape([-1])[0])
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index b8f881ceb1..9ac8047944 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"
@@ -32,38 +33,40 @@ namespace tensorrt {
TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
// read serialized_engine
- string serialized_engine;
OP_REQUIRES_OK(context,
- context->GetAttr("serialized_engine", &serialized_engine));
+ context->GetAttr("serialized_engine", &serialized_engine_));
// register input output node name in trt_sub_graph
OP_REQUIRES_OK(context, context->GetAttr("input_nodes", &input_nodes_));
OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_));
+}
- // TODO(samikama) runtime should be taken from a resourcemanager as well.
- // Only engine should be in the op and context and runtime should be taken
- // from resourcemanager
- // TODO(jie): cudaSetDevice make sure trt engine is allocated on the same
- // gpu where the input/output is also located.
- int gpu_id = context->device()->tensorflow_gpu_device_info()->gpu_id;
- cudaSetDevice(gpu_id);
- int device;
- cudaGetDevice(&device);
- if (gpu_id != device) LOG(FATAL) << "set device failed!";
-
+void TRTEngineOp::Compute(OpKernelContext* context) {
// TODO(samikama) runtime should be taken from a resourcemanager as well.
// Only engine should be in the op and context and runtime should be taken
// from resourcemanager
- IRuntime* infer = nvinfer1::createInferRuntime(logger);
- trt_engine_ptr_.reset(infer->deserializeCudaEngine(
- serialized_engine.c_str(), serialized_engine.size(), nullptr));
- trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
- // Runtime is safe to delete after engine creation
- infer->destroy();
-}
-
-void TRTEngineOp::Compute(OpKernelContext* context) {
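+  // Build the engine lazily on the first Compute() call, when the device
+  // allocator is available; the serialized engine is released afterwards.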
+ if (!trt_execution_context_ptr_) {
+ IRuntime* infer = nvinfer1::createInferRuntime(logger);
+#if NV_TENSORRT_MAJOR > 3
+ auto device = context->device();
+ auto dev_allocator =
+ device->GetAllocator(tensorflow::AllocatorAttributes());
+ if (!dev_allocator) {
+ LOG(FATAL) << "Can't find device allocator for gpu device "
+ << device->name();
+ }
+ allocator_ = std::make_shared<TRTDeviceAllocator>(dev_allocator);
+ infer->setGpuAllocator(allocator_.get());
+#endif
+ trt_engine_ptr_.reset(infer->deserializeCudaEngine(
+ serialized_engine_.c_str(), serialized_engine_.size(),
+ PluginFactoryTensorRT::GetInstance()));
+ trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
+ // Runtime is safe to delete after engine creation
+ infer->destroy();
+ serialized_engine_.clear();
+ }
int num_binding = context->num_inputs() + context->num_outputs();
std::vector<void*> buffers(num_binding);
@@ -154,7 +157,12 @@ void TRTEngineOp::Compute(OpKernelContext* context) {
VLOG(2) << "enqueue returns: " << ret;
// sync should be done by TF.
}
-
+TRTEngineOp::~TRTEngineOp() {
+ // Order matters!
+ trt_execution_context_ptr_.reset();
+ trt_engine_ptr_.reset();
+ allocator_.reset();
+}
REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
} // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 0964b4b18a..e613a71422 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -17,25 +17,28 @@ limitations under the License.
#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
#include <memory>
-#include <string>
#include <vector>
+#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
#include "cuda/include/cuda_runtime_api.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
namespace tensorrt {
class Logger;
+// TODO(Sami): Remove this file?
class TRTEngineOp : public OpKernel {
public:
explicit TRTEngineOp(OpKernelConstruction* context);
void Compute(OpKernelContext* context) override;
+ ~TRTEngineOp();
private:
template <typename T>
@@ -51,6 +54,8 @@ class TRTEngineOp : public OpKernel {
std::vector<string> input_nodes_;
std::vector<string> output_nodes_;
+ std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
+ string serialized_engine_;
};
} // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
index 7f3544f8cf..96ccacb791 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -28,7 +28,7 @@ namespace tensorrt {
// Logger for GIE info/warning/errors
class Logger : public nvinfer1::ILogger {
public:
- Logger(string name = "DefaultLogger") : name_(name){};
+ Logger(string name = "DefaultLogger") : name_(name) {}
void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
private:
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
new file mode 100644
index 0000000000..062f86e8bb
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
@@ -0,0 +1,106 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include <cassert>
+#include <cstring>
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+PluginTensorRT::PluginTensorRT(const void* serialized_data, size_t length) {
+ const char* buffer = static_cast<const char*>(serialized_data);
+ size_t op_name_char_count = *reinterpret_cast<const size_t*>(buffer);
+ buffer += sizeof(size_t);
+ buffer += op_name_char_count;
+
+ size_t count = *reinterpret_cast<const size_t*>(buffer);
+ buffer += sizeof(size_t);
+
+ for (int i = 0; i < count; i++) {
+ nvinfer1::Dims dim;
+ std::memcpy(&(dim.nbDims), buffer, sizeof(dim.nbDims));
+ buffer += sizeof(dim.nbDims);
+ std::memcpy(dim.d, buffer, sizeof(dim.d));
+ buffer += sizeof(dim.d);
+ std::memcpy(dim.type, buffer, sizeof(dim.type));
+ buffer += sizeof(dim.type);
+ input_dim_list_.emplace_back(dim);
+ }
+}
+
+void PluginTensorRT::configure(const nvinfer1::Dims* inputs, int num_inputs,
+ const nvinfer1::Dims* outputs, int num_outputs,
+ int max_batch_size) {
+ for (int index = 0; index < num_inputs; index++) {
+ nvinfer1::Dims dim;
+ dim.nbDims = inputs[index].nbDims;
+ for (int i = 0; i < dim.nbDims; i++) {
+ dim.d[i] = inputs[index].d[i];
+ dim.type[i] = inputs[index].type[i];
+ }
+ input_dim_list_.emplace_back(dim);
+ }
+}
+
+size_t PluginTensorRT::getSerializationSize() {
+ nvinfer1::Dims dim;
+  return sizeof(size_t) + GetPluginName().size() +
+         sizeof(input_dim_list_.size()) +
+         input_dim_list_.size() *
+             (sizeof(dim.nbDims) + sizeof(dim.d) + sizeof(dim.type));
+}
+
+void PluginTensorRT::serialize(void* serialized_data) {
+ size_t op_name_size = GetPluginName().size();
+ char* buffer = static_cast<char*>(serialized_data);
+ std::memcpy(buffer, &op_name_size, sizeof(size_t));
+ buffer += sizeof(size_t);
+
+ std::memcpy(buffer, GetPluginName().data(), op_name_size);
+ buffer += op_name_size;
+
+ auto list_size = input_dim_list_.size();
+ std::memcpy(buffer, &list_size, sizeof(input_dim_list_.size()));
+ buffer += sizeof(input_dim_list_.size());
+
+  for (size_t i = 0; i < input_dim_list_.size(); i++) {
+ auto dim = input_dim_list_[i];
+ std::memcpy(buffer, &(dim.nbDims), sizeof(dim.nbDims));
+ buffer += sizeof(dim.nbDims);
+ std::memcpy(buffer, dim.d, sizeof(dim.d));
+ buffer += sizeof(dim.d);
+ std::memcpy(buffer, dim.type, sizeof(dim.type));
+ buffer += sizeof(dim.type);
+ }
+}
+
+bool PluginTensorRT::StoreAttribute(const string& key, const void* ptr,
+ const size_t size) {
+ if (attr_map_.count(key) != 0) return false;
+
+ attr_map_.emplace(key, std::vector<char>(size));
+ std::memcpy(attr_map_[key].data(), ptr, size);
+ return true;
+}
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
+#endif // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
new file mode 100644
index 0000000000..754920b60c
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+
+#include <iostream>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+// A wrapper class for TensorRT plugin
+// User application should inherit from this class to write custom kernels.
+// Allows user to insert custom op in TensorRT engine
+// To register plugin in converter, user should also register custom
+// PluginDeserializeFunc & PluginConstructFunc through PluginFactoryTensorRT
+class PluginTensorRT : public nvinfer1::IPlugin {
+ public:
+ PluginTensorRT() {}
+ PluginTensorRT(const void* serialized_data, size_t length);
+
+ virtual const string& GetPluginName() const = 0;
+
+ virtual bool Finalize() = 0;
+
+ virtual bool SetAttribute(const string& key, const void* ptr,
+ const size_t size) = 0;
+ virtual bool GetAttribute(const string& key, const void** ptr,
+ size_t* size) const = 0;
+
+ void configure(const nvinfer1::Dims* inputs, int num_inputs,
+ const nvinfer1::Dims* outputs, int num_outputs,
+ int max_batch_size) override;
+
+ virtual bool StoreAttribute(const string& key, const void* ptr,
+ const size_t size);
+
+ size_t getSerializationSize() override;
+
+ void serialize(void* buffer) override;
+
+ protected:
+ std::unordered_map<string, std::vector<char> > attr_map_;
+
+ std::vector<nvinfer1::Dims> input_dim_list_;
+};
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
+
+#endif // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
new file mode 100644
index 0000000000..2bc591484d
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
@@ -0,0 +1,78 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
+ const void* serial_data,
+ size_t serial_length) {
+ size_t parsed_byte = 0;
+ // extract op_name from serial_data
+ string encoded_op_name =
+ ExtractOpName(serial_data, serial_length, &parsed_byte);
+
+ if (!IsPlugin(encoded_op_name)) {
+ return nullptr;
+ }
+
+ tensorflow::mutex_lock lock(instance_m_);
+ auto plugin_ptr =
+ plugin_registry_[encoded_op_name].first(serial_data, serial_length);
+ owned_plugins_.emplace_back(plugin_ptr);
+
+ return plugin_ptr;
+}
+
+PluginTensorRT* PluginFactoryTensorRT::CreatePlugin(const string& op_name) {
+ if (!IsPlugin(op_name)) return nullptr;
+
+ tensorflow::mutex_lock lock(instance_m_);
+ auto plugin_ptr = plugin_registry_[op_name].second();
+ owned_plugins_.emplace_back(plugin_ptr);
+
+ return plugin_ptr;
+}
+
+bool PluginFactoryTensorRT::RegisterPlugin(
+ const string& op_name, PluginDeserializeFunc deserialize_func,
+ PluginConstructFunc construct_func) {
+ if (IsPlugin(op_name)) return false;
+
+ tensorflow::mutex_lock lock(instance_m_);
+ auto ret = plugin_registry_.emplace(
+ op_name, std::make_pair(deserialize_func, construct_func));
+
+ return ret.second;
+}
+
+void PluginFactoryTensorRT::DestroyPlugins() {
+ tensorflow::mutex_lock lock(instance_m_);
+ for (auto& owned_plugin_ptr : owned_plugins_) {
+ owned_plugin_ptr.release();
+ }
+ owned_plugins_.clear();
+}
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
+#endif // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
new file mode 100644
index 0000000000..bbae9fb65c
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class PluginFactoryTensorRT : public nvinfer1::IPluginFactory {
+ public:
+ // TODO(aaroey): this static method has to be inlined to make the singleton a
+ // unique global symbol. Find a way to fix it.
+ static PluginFactoryTensorRT* GetInstance() {
+ static PluginFactoryTensorRT* factory_instance =
+ new PluginFactoryTensorRT();
+ return factory_instance;
+ }
+
+ // Deserialization method
+ PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
+ size_t serial_length) override;
+
+  // Plugin construction; PluginFactoryTensorRT owns the constructed plugin.
+ PluginTensorRT* CreatePlugin(const string& op_name);
+
+ bool RegisterPlugin(const string& op_name,
+ PluginDeserializeFunc deserialize_func,
+ PluginConstructFunc construct_func);
+
+ bool IsPlugin(const string& op_name) {
+ return plugin_registry_.find(op_name) != plugin_registry_.end();
+ }
+
+ size_t CountOwnedPlugins() { return owned_plugins_.size(); }
+
+ void DestroyPlugins();
+
+ protected:
+ std::unordered_map<string,
+ std::pair<PluginDeserializeFunc, PluginConstructFunc>>
+ plugin_registry_;
+
+ // TODO(jie): Owned plugin should be associated with different sessions;
+ // should really hand ownership of plugins to resource management;
+ std::vector<std::unique_ptr<PluginTensorRT>> owned_plugins_;
+ tensorflow::mutex instance_m_;
+};
+
+class TrtPluginRegistrar {
+ public:
+ TrtPluginRegistrar(const string& name, PluginDeserializeFunc deserialize_func,
+ PluginConstructFunc construct_func) {
+ auto factory = PluginFactoryTensorRT::GetInstance();
+ QCHECK(factory->RegisterPlugin(name, deserialize_func, construct_func))
+ << "Failed to register plugin: " << name;
+ }
+};
+
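+// Registers a TRT plugin at static-initialization time, e.g.
+//   REGISTER_TRT_PLUGIN(kPluginName, CreateIncPluginDeserialize, CreateIncPlugin);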
+#define REGISTER_TRT_PLUGIN(name, deserialize_func, construct_func) \
+ REGISTER_TRT_PLUGIN_UNIQ_HELPER(__COUNTER__, name, deserialize_func, \
+ construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ_HELPER(ctr, name, deserialize_func, \
+ construct_func) \
+ REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func)
+#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func, construct_func) \
+ static ::tensorflow::tensorrt::TrtPluginRegistrar trt_plugin_registrar##ctr \
+ TF_ATTRIBUTE_UNUSED = ::tensorflow::tensorrt::TrtPluginRegistrar( \
+ name, deserialize_func, construct_func)
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
+
+#endif // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
new file mode 100644
index 0000000000..129bdcdbc2
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace test {
+
+class StubPlugin : public PluginTensorRT {
+ public:
+ static const char* kPluginName;
+
+ StubPlugin() : plugin_name_(kPluginName) {}
+
+ StubPlugin(const void* serialized_data, size_t length)
+ : PluginTensorRT(serialized_data, length) {}
+
+ const string& GetPluginName() const override { return plugin_name_; }
+
+ bool Finalize() override { return true; }
+
+ bool SetAttribute(const string& key, const void* ptr,
+ const size_t size) override {
+ return true;
+ }
+
+ bool GetAttribute(const string& key, const void** ptr,
+ size_t* size) const override {
+ return true;
+ }
+
+ int getNbOutputs() const override { return 1; }
+
+ nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+ int nbInputDims) override {
+ return inputs[0];
+ }
+
+ int initialize() override { return 0; }
+
+ void terminate() override {}
+
+ size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }
+
+ int enqueue(int batch_size, const void* const* inputs, void** outputs,
+ void* workspace, cudaStream_t stream) override {
+ return 0;
+ }
+
+ private:
+ const string plugin_name_;
+};
+
+const char* StubPlugin::kPluginName = "StubPlugin";
+
+StubPlugin* CreateStubPlugin() { return new StubPlugin(); }
+
+StubPlugin* CreateStubPluginDeserialize(const void* serialized_data,
+ size_t length) {
+ return new StubPlugin(serialized_data, length);
+}
+
+class TrtPluginFactoryTest : public ::testing::Test {
+ public:
+ bool RegisterStubPlugin() {
+ if (PluginFactoryTensorRT::GetInstance()->IsPlugin(
+ StubPlugin::kPluginName)) {
+ return true;
+ }
+ return PluginFactoryTensorRT::GetInstance()->RegisterPlugin(
+ StubPlugin::kPluginName, CreateStubPluginDeserialize, CreateStubPlugin);
+ }
+};
+
+TEST_F(TrtPluginFactoryTest, Registration) {
+ EXPECT_FALSE(
+ PluginFactoryTensorRT::GetInstance()->IsPlugin(StubPlugin::kPluginName));
+ EXPECT_TRUE(RegisterStubPlugin());
+
+ ASSERT_TRUE(
+ PluginFactoryTensorRT::GetInstance()->IsPlugin(StubPlugin::kPluginName));
+}
+
+TEST_F(TrtPluginFactoryTest, CreationDeletion) {
+ EXPECT_TRUE(RegisterStubPlugin());
+ ASSERT_TRUE(
+ PluginFactoryTensorRT::GetInstance()->IsPlugin(StubPlugin::kPluginName));
+
+ PluginFactoryTensorRT::GetInstance()->DestroyPlugins();
+ ASSERT_TRUE(PluginFactoryTensorRT::GetInstance()->CreatePlugin(
+ StubPlugin::kPluginName));
+ ASSERT_EQ(1, PluginFactoryTensorRT::GetInstance()->CountOwnedPlugins());
+ PluginFactoryTensorRT::GetInstance()->DestroyPlugins();
+ ASSERT_EQ(0, PluginFactoryTensorRT::GetInstance()->CountOwnedPlugins());
+}
+
+} // namespace test
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
new file mode 100644
index 0000000000..a8f60886c0
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include <cassert>
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
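+// The serialized plugin data starts with [size_t name length][name bytes];
+// *incremental reports how many bytes were consumed reading the op name.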
+string ExtractOpName(const void* serial_data, size_t serial_length,
+ size_t* incremental) {
+ size_t op_name_char_count = *static_cast<const size_t*>(serial_data);
+ *incremental = sizeof(size_t) + op_name_char_count;
+
+ assert(serial_length >= *incremental);
+
+ const char* buffer = static_cast<const char*>(serial_data) + sizeof(size_t);
+ string op_name(buffer, op_name_char_count);
+
+ return op_name;
+}
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_CUDA
+#endif // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
new file mode 100644
index 0000000000..274ce42fec
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+
+#include <functional>
+
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+typedef std::function<PluginTensorRT*(const void*, size_t)>
+ PluginDeserializeFunc;
+
+typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
+
+// TODO(jie): work on error handling here
+string ExtractOpName(const void* serial_data, size_t serial_length,
+ size_t* incremental);
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
+
+#endif // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
new file mode 100644
index 0000000000..0f0508331c
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+#if NV_TENSORRT_MAJOR > 2
+#include "cuda/include/cuda_runtime_api.h"
+
+namespace tensorflow {
+namespace tensorrt {
+void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment,
+ uint32_t flags) {
+ assert((alignment & (alignment - 1)) == 0); // zero or a power of 2.
+ void* memory;
+ cudaMalloc(&memory, size);
+ return memory;
+}
+
+void TRTCudaAllocator::free(void* memory) { cudaFree(memory); }
+
+void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
+ uint32_t flags) {
+ assert((alignment & (alignment - 1)) == 0); // zero or a power of 2.
+ void* mem = allocator_->AllocateRaw(alignment, size);
+ VLOG(2) << "Allocated " << size << " bytes with alignment " << alignment
+ << " @ " << mem;
+ return mem;
+}
+
+TRTDeviceAllocator::TRTDeviceAllocator(tensorflow::Allocator* allocator)
+ : allocator_(allocator) {
+ VLOG(1) << "Using " << allocator->Name() << " allocator from TensorFlow";
+}
+
+void TRTDeviceAllocator::free(void* memory) {
+ VLOG(2) << "Deallocating " << memory;
+ allocator_->DeallocateRaw(memory);
+}
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif  // NV_TENSORRT_MAJOR > 2
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
new file mode 100644
index 0000000000..a0c2540a76
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
@@ -0,0 +1,68 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/core/framework/allocator.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+#if NV_TENSORRT_MAJOR == 3
+// Define interface here temporarily until TRT 4.0 is released
+namespace nvinfer1 {
+class IGpuAllocator {
+ public:
+ virtual void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) = 0;
+ virtual void free(void* memory) = 0;
+};
+} // namespace nvinfer1
+#endif
+
+namespace tensorflow {
+namespace tensorrt {
+
+class TRTCudaAllocator : public nvinfer1::IGpuAllocator {
+ // Allocator implementation that is using cuda allocator instead of device
+ // allocator in case we can't get device allocator from TF.
+ public:
+ TRTCudaAllocator() {}
+ virtual ~TRTCudaAllocator() {}
+ void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override;
+ void free(void* memory) override;
+};
+
+class TRTDeviceAllocator : public nvinfer1::IGpuAllocator {
+ // Allocator implementation wrapping TF device allocators.
+ public:
+ TRTDeviceAllocator(tensorflow::Allocator* allocator);
+ virtual ~TRTDeviceAllocator() {}
+ void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) override;
+ void free(void* memory) override;
+
+ private:
+ tensorflow::Allocator* allocator_;
+};
+
+} // namespace tensorrt
+} // namespace tensorflow
+
+#endif // GOOGLE_TENSORRT
+#endif // GOOGLE_CUDA
+#endif // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
index 3c85968ae7..e3469124ac 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -13,20 +13,23 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRTRESOURCES_H_
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
#include <list>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
+
#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
#include "tensorflow/core/framework/resource_mgr.h"
#if GOOGLE_CUDA
#if GOOGLE_TENSORRT
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+
#include "tensorrt/include/NvInfer.h"
namespace tensorflow {
@@ -40,6 +43,11 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
engine_(nullptr),
logger_(nullptr),
thr_(nullptr) {}
+
+ ~TRTCalibrationResource() {
+ VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
+ }
+
string DebugString() override {
std::stringstream oss;
oss << " Calibrator = " << std::hex << calibrator_ << std::dec << std::endl
@@ -47,16 +55,17 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
<< " Network = " << std::hex << network_ << std::dec << std::endl
<< " Engine = " << std::hex << engine_ << std::dec << std::endl
<< " Logger = " << std::hex << logger_ << std::dec << std::endl
+ << " Allocator = " << std::hex << allocator_.get() << std::dec
+ << std::endl
<< " Thread = " << std::hex << thr_ << std::dec << std::endl;
return oss.str();
}
- ~TRTCalibrationResource() {
- VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
- }
+
TRTInt8Calibrator* calibrator_;
nvinfer1::IBuilder* builder_;
nvinfer1::INetworkDefinition* network_;
nvinfer1::ICudaEngine* engine_;
+ std::shared_ptr<nvinfer1::IGpuAllocator> allocator_;
tensorflow::tensorrt::Logger* logger_;
// TODO(sami): Use threadpool threads!
std::thread* thr_;
@@ -65,31 +74,28 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
class TRTWeightStore : public tensorflow::ResourceBase {
public:
TRTWeightStore() {}
- std::list<std::vector<uint8_t>> store_;
+
+ virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); }
+
string DebugString() override {
std::stringstream oss;
- size_t lenBytes = 0;
+ size_t len_bytes = 0;
for (const auto& v : store_) {
- lenBytes += v.size() * sizeof(uint8_t);
+ len_bytes += v.size() * sizeof(uint8_t);
}
oss << " Number of entries = " << store_.size() << std::endl
<< " Total number of bytes = "
- << store_.size() * sizeof(std::vector<uint8_t>) + lenBytes << std::endl;
+ << store_.size() * sizeof(std::vector<uint8_t>) + len_bytes
+ << std::endl;
return oss.str();
}
- virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); }
-};
-class TRTEngineResource : public tensorflow::ResourceBase {
- public:
- TRTEngineResource() : runtime_(nullptr), ctx_(nullptr){};
- string DebugString() override { return string(""); }
- nvinfer1::IRuntime* runtime_;
- nvinfer1::IExecutionContext* ctx_;
+ std::list<std::vector<uint8_t>> store_;
};
} // namespace tensorrt
} // namespace tensorflow
-#endif // TENSORFLOW_CONTRIB_TENSORRT_RESOURCEMGR_TRTRESOURCES_H_
+
#endif
#endif
+#endif // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 8fc4697c51..cc42913eca 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -25,18 +25,239 @@ limitations under the License.
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace tensorrt {
namespace segment {
+using ::tensorflow::strings::StrAppend;
+// A simple graph representation mirroring tensorflow::Graph. The segmenter
+// modifies the graph in place, so operating on this lightweight copy avoids
+// duplicating the full TF graph and saves memory. It is composed of edges and
+// nodes; the nodes keep pointers to the original TF nodes.
+class SimpleNode;
+class SimpleGraph;
+class SimpleEdge {
+ public:
+ SimpleEdge(int id, SimpleNode* src, int src_port, SimpleNode* dst,
+ int dst_port, bool is_control = false)
+ : id_(id),
+ src_(src),
+ src_port_(src_port),
+ dst_(dst),
+ dst_port_(dst_port),
+ control_(is_control) {}
+ ~SimpleEdge() {}
+
+ SimpleNode* src() const { return src_; }
+ SimpleNode* dst() const { return dst_; }
+ int src_output() const { return src_port_; }
+ int dst_input() const { return dst_port_; }
+ int id() const { return id_; }
+ bool IsControlEdge() const { return control_; }
+
+ private:
+ int id_;
+ SimpleNode* src_;
+ int src_port_;
+ SimpleNode* dst_;
+ int dst_port_;
+ bool control_;
+};
+
+class SimpleNode {
+ public:
+ SimpleNode(const tensorflow::Node* node, const int id);
+
+ const std::vector<SimpleEdge*>& in_edges() const { return in_edges_; }
+ const std::vector<SimpleEdge*>& out_edges() const { return out_edges_; }
+ std::vector<SimpleNode*> in_nodes() const {
+ std::vector<SimpleNode*> res;
+ res.reserve(in_edges_.size());
+ for (const auto e : in_edges_) {
+ if (e) res.push_back(e->src());
+ }
+ return res;
+ }
+ const string& name() const { return node_->name(); }
+ const tensorflow::Node* tf_node() const { return node_; }
+ int id() const { return id_; }
+
+ private:
+ const tensorflow::Node* node_;
+ std::vector<SimpleEdge*> in_edges_;
+ std::vector<SimpleEdge*> out_edges_;
+ int id_;
+
+ friend class SimpleGraph;
+};
+
+class SimpleGraph {
+ public:
+ explicit SimpleGraph(const tensorflow::Graph* g);
+ ~SimpleGraph();
+
+ void AddControlEdge(SimpleNode* src, SimpleNode* dst);
+ void AddEdge(SimpleNode* src, int out_port, SimpleNode* dst, int in_port);
+ void RemoveEdge(const SimpleEdge*);
+ SimpleNode* FindNodeId(int node_id) {
+    if (node_id < 0 || node_id >= static_cast<int>(nodes_.size())) {
+ return nullptr;
+ }
+ return nodes_[node_id];
+ }
+ int num_node_ids() const { return nodes_.size(); }
+ const SimpleNode* source_node() const {
+ return nodes_[tensorflow::Graph::kSourceId];
+ }
+ const SimpleNode* sink_node() const {
+ return nodes_[tensorflow::Graph::kSinkId];
+ }
+
+ private:
+ const tensorflow::Graph* g_;
+ std::vector<SimpleNode*> nodes_;
+ std::vector<SimpleEdge*> edges_;
+ // free_edge_ids_ and free_node_ids_ contain freed indices.
+ std::set<int> free_edge_ids_;
+ std::set<int> free_node_ids_;
+};
+
+SimpleNode::SimpleNode(const tensorflow::Node* node, const int id)
+ : node_(node), id_(id) {
+ if (node_) {
+ in_edges_.reserve(node_->in_edges().size());
+ out_edges_.reserve(node_->out_edges().size());
+ }
+}
+
+SimpleGraph::SimpleGraph(const tensorflow::Graph* g) : g_(g) {
+ int n_nodes = g_->num_node_ids();
+ nodes_.resize(n_nodes, nullptr);
+ nodes_[g->kSourceId] = new SimpleNode(g->source_node(), g->kSourceId);
+ nodes_[g->kSinkId] = new SimpleNode(g->sink_node(), g->kSinkId);
+ int n_edges = g->num_edge_ids();
+ edges_.resize(n_edges, nullptr);
+ for (int i = 2; i < n_nodes; i++) {
+ const auto n = g->FindNodeId(i);
+ if (n) {
+ nodes_[i] = new SimpleNode(n, i);
+ } else {
+ free_node_ids_.insert(i);
+ }
+ }
+ for (int i = 0; i < n_edges; i++) {
+ const auto e = g->FindEdgeId(i);
+ if (e) {
+ const auto tfsrc = e->src();
+ const auto tfdst = e->dst();
+ bool is_control = e->IsControlEdge();
+ auto src = nodes_[tfsrc->id()];
+ auto dst = nodes_[tfdst->id()];
+ auto edge = new SimpleEdge(i, src, e->src_output(), dst, e->dst_input(),
+ is_control);
+ edges_[i] = edge;
+ src->out_edges_.push_back(edge);
+ dst->in_edges_.push_back(edge);
+ } else {
+ free_edge_ids_.insert(i);
+ }
+ }
+}
+
+void SimpleGraph::AddEdge(SimpleNode* src, int out_port, SimpleNode* dst,
+ int in_port) {
+ int i = edges_.size();
+ if (!free_edge_ids_.empty()) {
+ auto it = free_edge_ids_.begin();
+ i = *it;
+ free_edge_ids_.erase(it);
+ } else {
+ edges_.push_back(nullptr);
+ }
+ bool is_control = (out_port == tensorflow::Graph::kControlSlot);
+ is_control |= (in_port == tensorflow::Graph::kControlSlot);
+ auto edge = new SimpleEdge(i, src, out_port, dst, in_port, is_control);
+ edges_[i] = edge;
+ src->out_edges_.push_back(edge);
+ dst->in_edges_.push_back(edge);
+}
+
+void SimpleGraph::AddControlEdge(SimpleNode* src, SimpleNode* dst) {
+ AddEdge(src, tensorflow::Graph::kControlSlot, dst,
+ tensorflow::Graph::kControlSlot);
+}
+
+void SimpleGraph::RemoveEdge(const SimpleEdge* edge) {
+ auto src = edge->src();
+ auto dst = edge->dst();
+ for (auto it = src->out_edges_.begin(); it != src->out_edges_.end(); ++it) {
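+// Deserialization counterpart of serialize(): the layout is
+// [size_t op_name length][op_name][size_t dim count][{nbDims, d, type} x count].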
+ if (*it == edge) {
+ src->out_edges_.erase(it);
+ break;
+ }
+ }
+ for (auto it = dst->in_edges_.begin(); it != dst->in_edges_.end(); ++it) {
+ if (*it == edge) {
+ dst->in_edges_.erase(it);
+ break;
+ }
+ }
+}
+
+SimpleGraph::~SimpleGraph() {
+ for (auto x : nodes_) delete x;
+ for (auto x : edges_) delete x;
+}
namespace {
-bool CanContractEdge(const tensorflow::Edge* edge,
- const tensorflow::Graph& graph) {
- const tensorflow::Node* src = edge->src();
- const tensorflow::Node* dst = edge->dst();
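+// Performs a reverse DFS from the nodes in 'start' and returns true if 'src'
+// is reachable, i.e. contracting the edge would introduce a cycle.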
+bool CheckCycles(const std::unique_ptr<SimpleGraph>& g, const SimpleNode* src,
+ const std::vector<SimpleNode*>& start) {
+ // copied from TF ReverseDFS.
+ struct Work {
+ SimpleNode* node;
+ bool leave; // Are we entering or leaving n?
+ };
+
+ std::vector<Work> stack(start.size());
+ for (int i = 0; i < start.size(); ++i) {
+ stack[i] = Work{start[i], false};
+ }
+
+ std::vector<bool> visited(g->num_node_ids(), false);
+ while (!stack.empty()) {
+ Work w = stack.back();
+ stack.pop_back();
+
+ auto n = w.node;
+ if (w.leave) {
+ if (n == src) {
+ return true;
+ }
+ continue;
+ }
+
+ if (visited[n->id()]) continue;
+ visited[n->id()] = true;
+ // Arrange to call leave(n) when all done with descendants.
+ stack.push_back(Work{n, true});
+
+ auto nodes = n->in_nodes();
+ for (const auto node : nodes) {
+ if (!visited[node->id()]) {
+ stack.push_back(Work{node, false});
+ }
+ }
+ }
+ return false;
+}
+
+bool CanContractEdge(const SimpleEdge* edge,
+ const std::unique_ptr<SimpleGraph>& graph) {
+ const auto src = edge->src();
+ const auto dst = edge->dst();
// Can't contract edge if doing so would cause a cycle in the
// graph. So, if there is a directed path from 'src' to 'dst', other
@@ -48,46 +269,38 @@ bool CanContractEdge(const tensorflow::Edge* edge,
// 1. Get all nodes incoming to 'dst', excluding 'src'
// 2. Reverse DFS from those nodes
// 3. If reverse DFS reaches 'src' then we have a cycle
- std::vector<tensorflow::Node*> dfs_start_nodes;
- for (tensorflow::Node* node : dst->in_nodes()) {
+ std::vector<SimpleNode*> dfs_start_nodes;
+ for (SimpleNode* node : dst->in_nodes()) {
if (node != src) {
dfs_start_nodes.push_back(node);
}
}
- bool is_cycle = false;
- if (!dfs_start_nodes.empty()) {
- tensorflow::ReverseDFSFrom(graph, dfs_start_nodes, {},
- [&is_cycle, src](tensorflow::Node* node) {
- if (node == src) {
- is_cycle = true;
- }
- });
- }
-
+ bool is_cycle = CheckCycles(graph, src, dfs_start_nodes);
return !is_cycle;
}
+} // namespace
-void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
- std::vector<const tensorflow::Edge*>* remove_edges) {
+void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
+ std::vector<const SimpleEdge*>* remove_edges) {
// Transfer all inputs and outputs of 'dst' to 'src' except edges
// connecting the two.
- tensorflow::Node* src = edge->src();
- tensorflow::Node* dst = edge->dst();
+ auto src = edge->src();
+ auto dst = edge->dst();
// We can use '0' for input/output index because we don't need them
// to be accurate for the way we are using the graph.
- std::vector<const tensorflow::Edge*> in_edges(dst->in_edges().begin(),
- dst->in_edges().end());
- for (const tensorflow::Edge* in_edge : in_edges) {
+ std::vector<const SimpleEdge*> in_edges(dst->in_edges().begin(),
+ dst->in_edges().end());
+ for (const SimpleEdge* in_edge : in_edges) {
if (in_edge->IsControlEdge()) {
if (in_edge->src() != src) {
- tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+ SimpleEdge* e = const_cast<SimpleEdge*>(in_edge);
graph->AddControlEdge(e->src(), src);
}
} else {
if (in_edge->src() != src) {
- tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+ SimpleEdge* e = const_cast<SimpleEdge*>(in_edge);
if (e->src() == graph->source_node()) {
graph->AddEdge(e->src(), e->src_output(), src,
tensorflow::Graph::kControlSlot);
@@ -98,14 +311,14 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
}
}
- std::vector<const tensorflow::Edge*> out_edges(dst->out_edges().begin(),
- dst->out_edges().end());
- for (const tensorflow::Edge* out_edge : out_edges) {
+ std::vector<const SimpleEdge*> out_edges(dst->out_edges().begin(),
+ dst->out_edges().end());
+ for (const SimpleEdge* out_edge : out_edges) {
if (out_edge->IsControlEdge()) {
- tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+ SimpleEdge* e = const_cast<SimpleEdge*>(out_edge);
graph->AddControlEdge(src, e->dst());
} else {
- tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+ SimpleEdge* e = const_cast<SimpleEdge*>(out_edge);
if (e->dst() == graph->sink_node()) {
VLOG(1) << " edge to sink node " << src->name() << " -> "
<< e->dst()->name();
@@ -128,8 +341,6 @@ void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
}
}
-} // namespace
-
tensorflow::Status SegmentGraph(
const tensorflow::GraphDef& gdef,
const std::function<bool(const tensorflow::Node*)>& candidate_fn,
@@ -140,17 +351,22 @@ tensorflow::Status SegmentGraph(
tensorflow::Graph graph(flib);
TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
tensorflow::GraphConstructorOptions(), gdef, &graph));
+ return SegmentGraph(&graph, candidate_fn, options, segments);
+}
- // tensorflow::DumpGraph("Pre-Segment", &graph);
-
+tensorflow::Status SegmentGraph(
+ tensorflow::Graph* tf_graph,
+ const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+ const SegmentOptions& options, SegmentNodesVector* segments) {
+ auto graph = std::unique_ptr<SimpleGraph>(new SimpleGraph(tf_graph));
// Use a union-find to collect the nodes that belong to the same
- // segment. A node value of nullptr indicates that the node is not a
- // candidate for TRT.
- std::vector<UnionFind<tensorflow::Node*>> node_segments;
- for (int i = 0; i < graph.num_node_ids(); ++i) {
- tensorflow::Node* node = graph.FindNodeId(i);
+ // segment. A node value of nullptr indicates that the node is not a candidate
+ // for TRT.
+ std::vector<UnionFind<SimpleNode*>> node_segments;
+ for (int i = 0; i < graph->num_node_ids(); ++i) {
+ SimpleNode* node = graph->FindNodeId(i);
if (options.exclude_node_list.count(node->name()) != 0 ||
- !candidate_fn(node)) {
+ !candidate_fn(node->tf_node())) {
node = nullptr;
}
node_segments.emplace_back(node);
@@ -164,10 +380,16 @@ tensorflow::Status SegmentGraph(
// a measure of how beneficial it is to include a given node in a
// TRT subgraph then we can revisit this algorithm to take advantage
// of that information.
- std::vector<tensorflow::Node*> order;
- tensorflow::GetPostOrder(graph, &order);
-
- for (const tensorflow::Node* node : order) {
+ std::vector<tensorflow::Node*> tforder;
+ tensorflow::GetPostOrder(*tf_graph, &tforder);
+  // Use the postorder implementation from TensorFlow and construct a
+  // mirror in the internal format.
+ std::vector<SimpleNode*> order;
+ order.reserve(tforder.size());
+ for (const auto tfnode : tforder) {
+ order.push_back(graph->FindNodeId(tfnode->id()));
+ }
+ for (const SimpleNode* node : order) {
// All output nodes of 'node' have been visited...
VLOG(2) << "Trying node " << node->name() << " id=" << node->id();
@@ -181,8 +403,8 @@ tensorflow::Status SegmentGraph(
// nodes. Iterate since combining two nodes may unblock other
// combining.
while (true) {
- std::set<const tensorflow::Edge*> contract_edges;
- for (const tensorflow::Edge* out_edge : node->out_edges()) {
+ std::set<const SimpleEdge*> contract_edges;
+ for (const SimpleEdge* out_edge : node->out_edges()) {
VLOG(2) << "... out node " << out_edge->dst()->name() << " ( "
<< out_edge->dst()->id() << " <- " << node->id() << " )";
if (out_edge->IsControlEdge()) {
@@ -210,9 +432,9 @@ tensorflow::Status SegmentGraph(
// Contract edges and collect the adjacent nodes into the same
// segment/subgraph.
while (!contract_edges.empty()) {
- const tensorflow::Edge* contract_edge = *contract_edges.begin();
- const tensorflow::Node* src = contract_edge->src();
- const tensorflow::Node* dst = contract_edge->dst();
+ const SimpleEdge* contract_edge = *contract_edges.begin();
+ const SimpleNode* src = contract_edge->src();
+ const SimpleNode* dst = contract_edge->dst();
VLOG(2) << "Merge " << src->name() << " <- " << dst->name() << " ("
<< src->id() << " <- " << dst->id();
@@ -221,13 +443,13 @@ tensorflow::Status SegmentGraph(
// Contracting the edge leaves disconnected graph edges.
// Remove these from the graph and from 'contract_edges' so we
// don't visit them again.
- tensorflow::Edge* e = const_cast<tensorflow::Edge*>(contract_edge);
- std::vector<const tensorflow::Edge*> remove_edges;
- ContractEdge(e, &graph, &remove_edges);
+ SimpleEdge* e = const_cast<SimpleEdge*>(contract_edge);
+ std::vector<const SimpleEdge*> remove_edges;
+ ContractEdge(e, graph.get(), &remove_edges);
- for (const tensorflow::Edge* r : remove_edges) {
+ for (const SimpleEdge* r : remove_edges) {
contract_edges.erase(r);
- graph.RemoveEdge(r);
+ graph->RemoveEdge(r);
}
}
}
@@ -236,9 +458,27 @@ tensorflow::Status SegmentGraph(
// Collect the segments/subgraphs. Each subgraph is represented by a
// set of the names of the nodes in that subgraph.
std::unordered_map<string, std::set<string>> sg_map;
+ std::unordered_map<string, std::set<string>> device_maps;
for (auto& u : node_segments) {
if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) {
sg_map[u.ParentValue()->name()].insert(u.Value()->name());
+ auto tf_node = u.Value()->tf_node();
+      // has_assigned_device_name() is expected to return true when called
+      // from the optimization pass. However, since the graph is converted
+      // back and forth between Graph and GraphDef, assigned devices get
+      // demoted to requested devices. If the graph is passed directly to
+      // this module, assigned devices will be set.
+ if (tf_node->has_assigned_device_name()) {
+ device_maps[u.ParentValue()->name()].insert(
+ tf_node->assigned_device_name());
+ } else if (!tf_node->requested_device().empty()) {
+ device_maps[u.ParentValue()->name()].insert(
+ tf_node->requested_device());
+ } else {
+        VLOG(1) << "Node " << tf_node->name()
+                << " has no device assigned; requested device is: "
+                << tf_node->requested_device();
+ }
}
}
@@ -260,10 +500,35 @@ tensorflow::Status SegmentGraph(
<< segment_node_names.size() << " nodes, dropping";
continue;
}
-
- segments->emplace_back(segment_node_names);
+ // TODO(sami): Make segmenter placement aware once trtscopes are in place
+ const auto& dev_itr = device_maps.find(itr.first);
+ if (dev_itr == device_maps.end() || dev_itr->second.empty()) {
+ VLOG(1) << "No device assigned to segment " << segments->size();
+ segments->emplace_back(std::make_pair(segment_node_names, string()));
+ } else if (dev_itr->second.size() > 1) {
+ string s("Segment ");
+ StrAppend(&s, segments->size(), " has multiple devices attached: ");
+ for (const auto& dev : dev_itr->second) {
+ StrAppend(&s, dev, ", ");
+ }
+ LOG(WARNING) << s << " choosing " << *(dev_itr->second.begin());
+ segments->emplace_back(
+ std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+ } else {
+ segments->emplace_back(
+ std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+ }
+ }
+ if (VLOG_IS_ON(1)) {
+ for (const auto& d : device_maps) {
+ string s("Segment ");
+ StrAppend(&s, ": '", d.first, "' ");
+ for (const auto& dd : d.second) {
+ StrAppend(&s, dd, ", ");
+ }
+ VLOG(1) << "Devices " << s;
+ }
}
-
return tensorflow::Status::OK();
}
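The segmentation itself leans on UnionFind<SimpleNode*>: contracting an edge merges the two endpoint sets, and only the representative (ParentValue) of each set is consulted when segments are collected into sg_map and device_maps. A tiny union-find with path compression, shown only to illustrate that merge/collect flow (a hypothetical helper, not TensorFlow's UnionFind API):

#include <numeric>
#include <utility>
#include <vector>

// Minimal disjoint-set structure with path compression and union by size.
class DisjointSets {
 public:
  explicit DisjointSets(int n) : parent_(n), size_(n, 1) {
    std::iota(parent_.begin(), parent_.end(), 0);  // every node starts alone
  }
  int Find(int x) {
    while (parent_[x] != x) {
      parent_[x] = parent_[parent_[x]];  // path compression
      x = parent_[x];
    }
    return x;
  }
  void Merge(int a, int b) {  // called when an edge is contracted
    a = Find(a);
    b = Find(b);
    if (a == b) return;
    if (size_[a] < size_[b]) std::swap(a, b);
    parent_[b] = a;
    size_[a] += size_[b];
  }

 private:
  std::vector<int> parent_;
  std::vector<int> size_;
};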
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index 7e8685f44a..1568dd9153 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -29,7 +29,9 @@ namespace tensorflow {
namespace tensorrt {
namespace segment {
-using SegmentNodesVector = std::vector<std::set<string>>;
+// Vector of segments. Each entry contains a set of the node names in the
+// segment and the device assigned to that segment.
+using SegmentNodesVector = std::vector<std::pair<std::set<string>, string>>;
struct SegmentOptions {
// Segment must contain at least this many nodes.
@@ -51,6 +53,20 @@ tensorflow::Status SegmentGraph(
const std::function<bool(const tensorflow::Node*)>& candidate_fn,
const SegmentOptions& options, SegmentNodesVector* segments);
+// Get the subgraphs of a graph that can be handled by TensorRT.
+//
+// @param tf_graph tensorflow::Graph of the network
+// @param candidate_fn A function that returns true for a Node* if
+// that node can be handled by TensorRT.
+// @param segments Returns the TensorRT segments/subgraphs. Each entry
+// in the vector describes a subgraph by giving the set of the names of
+// all the NodeDefs in that subgraph, together with the device assigned to
+// that subgraph.
+// @return the status.
+tensorflow::Status SegmentGraph(
+ tensorflow::Graph* tf_graph,
+ const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+ const SegmentOptions& options, SegmentNodesVector* segments);
+
} // namespace segment
} // namespace tensorrt
} // namespace tensorflow
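With the new SegmentNodesVector, each entry is a (node-name set, device) pair instead of a bare set, so callers unpack both parts. A hedged caller-side sketch against the declarations in this header (the logging helper is assumed, not part of the patch):

#include <cstdio>
#include <set>
#include <string>
#include <utility>
#include <vector>

// Matches the alias declared in segment.h: each pair holds the node names of
// one TRT-eligible subgraph and the device chosen for it ("" if none).
using SegmentNodesVector =
    std::vector<std::pair<std::set<std::string>, std::string>>;

void LogSegments(const SegmentNodesVector& segments) {
  for (size_t i = 0; i < segments.size(); ++i) {
    const std::set<std::string>& node_names = segments[i].first;
    const std::string& device = segments[i].second;
    // A real caller would hand node_names to the converter and pin the
    // resulting engine to 'device'.
    std::printf("segment %zu: %zu nodes on '%s'\n", i, node_names.size(),
                device.c_str());
  }
}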
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 6f7655fcab..2de3923b06 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -34,7 +34,7 @@ class SegmentTest : public ::testing::Test {
TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
TF_Status* s, const char* name);
- std::function<bool(const Node*)> MakeCandidateFn(
+ std::function<bool(const tensorflow::Node*)> MakeCandidateFn(
const std::set<string>& node_names);
protected:
@@ -59,9 +59,9 @@ bool SegmentTest::GetGraphDef(TF_Graph* graph,
return ret;
}
-std::function<bool(const Node*)> SegmentTest::MakeCandidateFn(
+std::function<bool(const tensorflow::Node*)> SegmentTest::MakeCandidateFn(
const std::set<string>& node_names) {
- return [node_names](const Node* node) -> bool {
+ return [node_names](const tensorflow::Node* node) -> bool {
return node_names.find(node->name()) != node_names.end();
};
}
@@ -164,7 +164,7 @@ TEST_F(SegmentTest, Simple) {
ASSERT_EQ(segments.size(), 1);
std::vector<string> expected{"add0", "add1", "add2", "add3", "add4"};
for (const auto& ex : expected) {
- EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
+ EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
<< "Missing expected node " << ex;
}
TF_DeleteGraph(graph);
@@ -277,13 +277,13 @@ TEST_F(SegmentTest, Multiple) {
std::vector<string> expected0{"add0", "add1", "add2", "add3"};
for (const auto& ex : expected0) {
- EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
+ EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
<< "Missing expected node " << ex;
}
std::vector<string> expected1{"add6", "add8"};
for (const auto& ex : expected1) {
- EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
+ EXPECT_TRUE(segments[1].first.find(ex) != segments[1].first.end())
<< "Missing expected node " << ex;
}
TF_DeleteGraph(graph);
@@ -347,13 +347,13 @@ TEST_F(SegmentTest, BigIfElse) {
std::vector<string> expected0{"add3", "add4", "add5", "add6", "add7"};
for (const auto& ex : expected0) {
- EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
+ EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
<< "Missing expected node " << ex;
}
std::vector<string> expected1{"add0", "add1"};
for (const auto& ex : expected1) {
- EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
+ EXPECT_TRUE(segments[1].first.find(ex) != segments[1].first.end())
<< "Missing expected node " << ex;
}
TF_DeleteGraph(graph);
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index 8b475177bc..f36495f6b6 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -14,6 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
#include <string>
#include <vector>
@@ -33,7 +34,8 @@ tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine));
nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
- serialized_engine.c_str(), serialized_engine.size(), nullptr);
+ serialized_engine.c_str(), serialized_engine.size(),
+ tensorrt::PluginFactoryTensorRT::GetInstance());
int num_batch = -1;
std::vector<::tensorflow::DataType> input_type;
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
index ad01bedd8f..175ccd8006 100644
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -18,7 +18,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import argparse
import numpy as np
+
# normally we should do import tensorflow as tf and then
# tf.placeholder, tf.constant, tf.nn.conv2d etc but
# it looks like internal builds don't like it so
@@ -26,6 +28,7 @@ import numpy as np
from tensorflow.contrib import tensorrt as trt
from tensorflow.core.protobuf import config_pb2 as cpb2
+from tensorflow.core.protobuf import rewriter_config_pb2 as rwpb2
from tensorflow.python.client import session as csess
from tensorflow.python.framework import constant_op as cop
from tensorflow.python.framework import dtypes as dtypes
@@ -59,9 +62,11 @@ def get_simple_graph_def():
return g.as_graph_def()
-def run_graph(gdef, dumm_inp):
+def execute_graph(gdef, dumm_inp):
"""Run given graphdef once."""
+ print("executing")
gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+ sessconfig = cpb2.ConfigProto(gpu_options=gpu_options)
ops.reset_default_graph()
g = ops.Graph()
with g.as_default():
@@ -69,15 +74,14 @@ def run_graph(gdef, dumm_inp):
graph_def=gdef, return_elements=["input", "output"])
inp = inp.outputs[0]
out = out.outputs[0]
- with csess.Session(
- config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
+ with csess.Session(config=sessconfig, graph=g) as sess:
val = sess.run(out, {inp: dumm_inp})
return val
# Use real data that is representative of the inference dataset
# for calibration. For this test script it is random data.
-def run_calibration(gdef, dumm_inp):
+def execute_calibration(gdef, dumm_inp):
"""Run given calibration graph multiple times."""
gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
ops.reset_default_graph()
@@ -96,7 +100,9 @@ def run_calibration(gdef, dumm_inp):
return val
-if "__main__" in __name__:
+def user(run_graph=execute_graph, run_calibration=execute_calibration):
+ """Example function that converts a graph to TFTRT graph."""
+
inp_dims = (100, 24, 24, 2)
dummy_input = np.random.random_sample(inp_dims)
orig_graph = get_simple_graph_def() # use a frozen graph for inference
@@ -137,3 +143,51 @@ if "__main__" in __name__:
assert np.allclose(o1, o4)
assert np.allclose(o1, o5)
print("Pass")
+
+
+def auto():
+ """Run the conversion as an optimization pass."""
+ inp_dims = (100, 24, 24, 2)
+ dummy_input = np.random.random_sample(inp_dims)
+ orig_graph = get_simple_graph_def()
+ opt_config = rwpb2.RewriterConfig()
+ opt_config.optimizers.extend(["constfold", "layout"])
+ custom_op = opt_config.custom_optimizers.add()
+ custom_op.name = "TensorRTOptimizer"
+ custom_op.parameter_map["minimum_segment_size"].i = 3
+ custom_op.parameter_map["precision_mode"].s = "FP32"
+ custom_op.parameter_map["max_batch_size"].i = inp_dims[0]
+ custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
+ print(custom_op)
+ gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+ graph_options = cpb2.GraphOptions(rewrite_options=opt_config)
+ sessconfig = cpb2.ConfigProto(
+ gpu_options=gpu_options, graph_options=graph_options)
+ print(sessconfig)
+ g = ops.Graph()
+ ops.reset_default_graph()
+ with g.as_default():
+ inp, out = importer.import_graph_def(
+ graph_def=orig_graph, return_elements=["input", "output"])
+ inp = inp.outputs[0]
+ out = out.outputs[0]
+ with csess.Session(config=sessconfig, graph=g) as sess:
+ val = sess.run(out, {inp: dummy_input})
+ print(val.shape)
+
+
+if "__main__" in __name__:
+ P = argparse.ArgumentParser(
+ prog="tftrt_test",
+ description="Example utilization of TensorFlow-TensorRT integration")
+ P.add_argument(
+ "--automatic",
+ "-a",
+ action="store_true",
+ help="Do TRT conversion automatically",
+ default=False)
+ flags, unparsed = P.parse_known_args()
+ if flags.automatic:
+ auto()
+ else:
+ user()
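As written, the script has two entry paths: with no flags it calls user(), which converts the frozen graph explicitly through the tensorrt contrib API, while --automatic (or -a) calls auto(), which instead registers TensorRTOptimizer as a custom grappler rewriter via RewriterConfig and lets session creation perform the conversion.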
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
index d426e9f12c..0403b652d7 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
@@ -44,8 +44,7 @@ class IntegrationTest(test_util.TensorFlowTestCase):
inp_dims = (100, 24, 24, 2)
self._input = np.random.random_sample(inp_dims)
self._original_graph = self.get_simple_graph_def()
- self._gpu_options = cpb2.GPUOptions(
- per_process_gpu_memory_fraction=0.50)
+ self._gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
self._config = cpb2.ConfigProto(gpu_options=self._gpu_options)
self._reference = self.run_graph(self._original_graph, self._input)
@@ -60,11 +59,7 @@ class IntegrationTest(test_util.TensorFlowTestCase):
name="weights",
dtype=dtypes.float32)
conv = nn.conv2d(
- input=a,
- filter=e,
- strides=[1, 2, 2, 1],
- padding="SAME",
- name="conv")
+ input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
b = cop.constant(
[4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
t = nn.bias_add(conv, b, name="biasAdd")
@@ -85,8 +80,7 @@ class IntegrationTest(test_util.TensorFlowTestCase):
inp = inp.outputs[0]
out = out.outputs[0]
with self.test_session(
- graph=g, config=self._config, use_gpu=True,
- force_gpu=True) as sess:
+ graph=g, config=self._config, use_gpu=True, force_gpu=True) as sess:
val = sess.run(out, {inp: dumm_inp})
return val
@@ -104,15 +98,14 @@ class IntegrationTest(test_util.TensorFlowTestCase):
# run over real calibration data here, we are mimicking a calibration
# set of 30 different batches. Use as much calibration data as you want
with self.test_session(
- graph=g, config=self._config, use_gpu=True,
- force_gpu=True) as sess:
+ graph=g, config=self._config, use_gpu=True, force_gpu=True) as sess:
for _ in range(30):
val = sess.run(out, {inp: dumm_inp})
return val
def get_trt_graph(self, mode):
"""Return trt converted graph."""
- if mode in ["FP32", "FP16", "INT8"]:
+ if mode in ["FP32", "FP16", "INT8"]:
return trt.create_inference_graph(
input_graph_def=self._original_graph,
outputs=["output"],
@@ -120,7 +113,7 @@ class IntegrationTest(test_util.TensorFlowTestCase):
max_workspace_size_bytes=1 << 25,
precision_mode=mode, # TRT Engine precision "FP32","FP16" or "INT8"
minimum_segment_size=2 # minimum number of nodes in an engine
- )
+ )
return None
def testFP32(self):
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 5dd7bde205..5b9aeaa879 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ===================================================================
-"""TPU system metdata and associated tooling."""
+"""TPU system metadata and associated tooling."""
from __future__ import absolute_import
from __future__ import division
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index 4b6104a8b4..3137bfd03e 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -159,7 +159,7 @@ When the receiver receives the RDMA write, it will locate the relevant **RdmaTen
* step_id - Step ID.
* request_index - Request index.
* remote_addr/rkey - Address/rkey of the reallocated result/proxy tensor.
-* **RDMA_MESSAGE_ERROR_STATUS** - (sender ==> receiver) Notify the receiver that an error had occured on the sender side, so it can propagate it to the upper levels.
+* **RDMA_MESSAGE_ERROR_STATUS** - (sender ==> receiver) Notify the receiver that an error had occurred on the sender side, so it can propagate it to the upper levels.
* type - The message type.
* name (name_size) - Name of the requested tensor.
* step_id - Step ID.
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 2dd8e6fb31..3286f856db 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2762,6 +2762,7 @@ cc_library(
],
visibility = [
"//tensorflow/compiler:__subpackages__",
+ "//tensorflow/core/kernels:__subpackages__",
"//tensorflow/core/profiler:__subpackages__",
],
deps = [":lib_internal"],
@@ -3683,7 +3684,11 @@ tf_cuda_only_cc_test(
":test",
":test_main",
"//third_party/eigen3",
- ],
+ ] + if_mkl(
+ [
+ "//third_party/mkl:intel_binary_blob",
+ ],
+ ),
)
tf_cc_test_gpu(
diff --git a/tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt
new file mode 100644
index 0000000000..8cef243aee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RegexFullMatch.pbtxt
@@ -0,0 +1,30 @@
+op {
+ graph_op_name: "RegexFullMatch"
+ in_arg {
+ name: "input"
+ description: <<END
+A string tensor of the text to be processed.
+END
+ }
+ in_arg {
+ name: "pattern"
+ description: <<END
+A scalar string tensor of the regular expression to match the input.
+END
+ }
+ out_arg {
+ name: "output"
+ description: <<END
+A bool tensor with the same shape as `input`.
+END
+ }
+ summary: "Check if the input matches the regex pattern."
+ description: <<END
+The input is a string tensor of any shape. The pattern is a scalar
+string tensor which is applied to every element of the input tensor.
+The boolean values (True or False) of the output tensor indicate
+if the input matches the regex pattern provided.
+
+The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+END
+}
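The op applies a single scalar pattern to every element of the string input with RE2 full-match semantics. A small standalone sketch of that element-wise behavior using the re2 library directly (illustrative only; the kernel implementation is not part of this patch):

#include <string>
#include <vector>

#include "re2/re2.h"

// Applies one compiled RE2 pattern to every element, mirroring the
// element-wise full-match semantics described for RegexFullMatch above.
std::vector<bool> RegexFullMatchAll(const std::vector<std::string>& input,
                                    const std::string& pattern) {
  RE2 re(pattern);  // compile once, reuse for every element
  std::vector<bool> output;
  output.reserve(input.size());
  for (const std::string& s : input) {
    output.push_back(RE2::FullMatch(s, re));
  }
  return output;
}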
diff --git a/tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt b/tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt
new file mode 100644
index 0000000000..ec310c8aeb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RegexFullMatch.pbtxt
@@ -0,0 +1,4 @@
+op {
+ graph_op_name: "RegexFullMatch"
+ visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/broadcaster.cc b/tensorflow/core/common_runtime/broadcaster.cc
index 9ceff86678..9646a0856e 100644
--- a/tensorflow/core/common_runtime/broadcaster.cc
+++ b/tensorflow/core/common_runtime/broadcaster.cc
@@ -80,7 +80,7 @@ void Broadcaster::Run(StatusCallback done) {
// continuing to occupy its current position. Hence we calculate as
// though each device's rank is actually r+1, then subtract 1 again to
// get the descendent ranks. If the source is not rank 0 then its
-// decendents include both {0,1} and the descendents of its current
+// descendants include both {0,1} and the descendants of its current
// position. Where a non-0-rank source is a descendent of another
// device, no send to it is necessary.
@@ -115,7 +115,7 @@ void Broadcaster::TreeSendTo(const CollectiveParams& cp,
DCHECK_NE(successor_rank, my_rank);
if (cp.is_source && source_rank != 0) {
// The source sends to rank 0,1 in addition to its positional
- // decendents.
+ // descendants.
if (cp.group.group_size > 1) {
targets->push_back(0);
}
diff --git a/tensorflow/core/common_runtime/buf_rendezvous.h b/tensorflow/core/common_runtime/buf_rendezvous.h
index e94e88b323..9eb9f060f6 100644
--- a/tensorflow/core/common_runtime/buf_rendezvous.h
+++ b/tensorflow/core/common_runtime/buf_rendezvous.h
@@ -79,7 +79,7 @@ class BufRendezvous {
const ProducerCallback& done);
// Called to request access to a Tensor value corresponding to key.
- // Consumer is provide with a Hook as soon as availble.
+  // Consumer is provided with a Hook as soon as available.
void ConsumeBuf(const string& key, const ConsumerCallback& done);
// Consumer must call this function when it's done reading the Hook provided
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index 6b072f3cc9..f8428f2fde 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -283,7 +283,7 @@ void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
// Note on field indexing: There are group_size_ devices in the
// instance, implying the same number of chunks per tensor, where a
// chunk is the unit of data transferred in a time step. However, if
- // a device can simultaenously send data by 2 or more independent
+ // a device can simultaneously send data by 2 or more independent
// channels we can speed up the transfer by subdividing chunks and
// processing multiple subdivisions at once. So the actual number
// of RingFields is group_size_ * num_subdivs_.
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
index be79cc4507..c045596a69 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
@@ -104,7 +104,7 @@ ScopedAllocatorContainer::~ScopedAllocatorContainer() {
// contents deleted via Drop. When when a step ends early
// (e.g. through abnormal termination) we need to clean up
// explicitly. So long as graph execution of the associated step has
- // completey terminated this should be safe.
+ // completely terminated this should be safe.
for (auto& it : allocators_) {
if (it.second.field_index == ScopedAllocator::kBackingIndex) {
delete it.second.scoped_allocator;
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 4998a7acfe..03a011f79e 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -52,7 +52,7 @@ namespace {
// Creates an Event proto representing a chunk of a Tensor. This method only
// populates the field of the Event proto that represent the envelope
-// informaion (e.g., timestmap, device_name, num_chunks, chunk_index, dtype,
+// information (e.g., timestamp, device_name, num_chunks, chunk_index, dtype,
// shape). It does not set the value.tensor field, which should be set by the
// caller separately.
Event PrepareChunkEventProto(const DebugNodeKey& debug_node_key,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index 18998bbccb..b9f21ea211 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -115,7 +115,7 @@ class GrpcWorkerCache : public WorkerCachePartial {
size_t AssignWorkerToThread(const string& target) {
// Round-robin target assignment, but keeps the same target on the same
- // polling thread always, as this is important for gRPC performace
+ // polling thread always, as this is important for gRPC performance
mutex_lock lock(assignment_mu_);
auto it = target_assignments_.find(target);
if (it == target_assignments_.end()) {
diff --git a/tensorflow/core/example/example.proto b/tensorflow/core/example/example.proto
index b2b723278b..e7142a4ef9 100644
--- a/tensorflow/core/example/example.proto
+++ b/tensorflow/core/example/example.proto
@@ -7,7 +7,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ExampleProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.example";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/example";
package tensorflow;
// An Example is a mostly-normalized data format for storing data for
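The go_package option added across these protos records the Go import path that protoc-gen-go embeds in the generated .pb.go files, so the Go bindings under tensorflow/go can import the generated packages by a stable path.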
diff --git a/tensorflow/core/example/example_parser_configuration.proto b/tensorflow/core/example/example_parser_configuration.proto
index 15846c0e30..b2c115d80e 100644
--- a/tensorflow/core/example/example_parser_configuration.proto
+++ b/tensorflow/core/example/example_parser_configuration.proto
@@ -6,6 +6,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ExampleParserConfigurationProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.example";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/example";
package tensorflow;
import "tensorflow/core/framework/tensor_shape.proto";
diff --git a/tensorflow/core/example/feature.proto b/tensorflow/core/example/feature.proto
index da3dc59a12..6d81974aac 100644
--- a/tensorflow/core/example/feature.proto
+++ b/tensorflow/core/example/feature.proto
@@ -58,7 +58,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "FeatureProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.example";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/example";
package tensorflow;
// Containers to hold repeated fundamental values.
diff --git a/tensorflow/core/framework/allocation_description.proto b/tensorflow/core/framework/allocation_description.proto
index bb1037c2df..64133b05e1 100644
--- a/tensorflow/core/framework/allocation_description.proto
+++ b/tensorflow/core/framework/allocation_description.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "AllocationDescriptionProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
message AllocationDescription {
// Total number of bytes requested
diff --git a/tensorflow/core/framework/api_def.proto b/tensorflow/core/framework/api_def.proto
index e878ab620b..3f8dd272e7 100644
--- a/tensorflow/core/framework/api_def.proto
+++ b/tensorflow/core/framework/api_def.proto
@@ -8,6 +8,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ApiDefProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/attr_value.proto";
// Used to specify and override the default API & behavior in the
diff --git a/tensorflow/core/framework/attr_value.proto b/tensorflow/core/framework/attr_value.proto
index 62f0a9050f..054e3ec97c 100644
--- a/tensorflow/core/framework/attr_value.proto
+++ b/tensorflow/core/framework/attr_value.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "AttrValueProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
diff --git a/tensorflow/core/framework/cost_graph.proto b/tensorflow/core/framework/cost_graph.proto
index 7885b0171a..19d765cd32 100644
--- a/tensorflow/core/framework/cost_graph.proto
+++ b/tensorflow/core/framework/cost_graph.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "CostGraphProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
diff --git a/tensorflow/core/framework/device_attributes.proto b/tensorflow/core/framework/device_attributes.proto
index 0b3c0d5bdf..44236ca979 100644
--- a/tensorflow/core/framework/device_attributes.proto
+++ b/tensorflow/core/framework/device_attributes.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "DeviceAttributesProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
message InterconnectLink {
int32 device_id = 1;
diff --git a/tensorflow/core/framework/function.proto b/tensorflow/core/framework/function.proto
index 72e3c43831..e69d3938d9 100644
--- a/tensorflow/core/framework/function.proto
+++ b/tensorflow/core/framework/function.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "FunctionProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/attr_value.proto";
import "tensorflow/core/framework/node_def.proto";
import "tensorflow/core/framework/op_def.proto";
diff --git a/tensorflow/core/framework/graph.proto b/tensorflow/core/framework/graph.proto
index 7d6e16d5c1..76d358971d 100644
--- a/tensorflow/core/framework/graph.proto
+++ b/tensorflow/core/framework/graph.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "GraphProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/node_def.proto";
import "tensorflow/core/framework/function.proto";
import "tensorflow/core/framework/versions.proto";
diff --git a/tensorflow/core/framework/graph_transfer_info.proto b/tensorflow/core/framework/graph_transfer_info.proto
index 41dd54d78c..232297d460 100644
--- a/tensorflow/core/framework/graph_transfer_info.proto
+++ b/tensorflow/core/framework/graph_transfer_info.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "GraphTransferInfoProto";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/types.proto";
message GraphTransferNodeInput {
diff --git a/tensorflow/core/framework/iterator.proto b/tensorflow/core/framework/iterator.proto
index 7e5f5ea2e0..f015342e13 100644
--- a/tensorflow/core/framework/iterator.proto
+++ b/tensorflow/core/framework/iterator.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "IteratorProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.util";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
// Protocol buffer representing the metadata for an iterator's state stored
// as a Variant tensor.
diff --git a/tensorflow/core/framework/kernel_def.proto b/tensorflow/core/framework/kernel_def.proto
index 65e9ef04a0..a17b9c8492 100644
--- a/tensorflow/core/framework/kernel_def.proto
+++ b/tensorflow/core/framework/kernel_def.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "KernelDefProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/attr_value.proto";
message KernelDef {
diff --git a/tensorflow/core/framework/log_memory.proto b/tensorflow/core/framework/log_memory.proto
index d1e126330d..7f37eadc3b 100644
--- a/tensorflow/core/framework/log_memory.proto
+++ b/tensorflow/core/framework/log_memory.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "LogMemoryProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/tensor_description.proto";
message MemoryLogStep {
diff --git a/tensorflow/core/framework/node_def.proto b/tensorflow/core/framework/node_def.proto
index 8fcee32e29..0a095f903f 100644
--- a/tensorflow/core/framework/node_def.proto
+++ b/tensorflow/core/framework/node_def.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "NodeProto";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/attr_value.proto";
message NodeDef {
diff --git a/tensorflow/core/framework/op_def.proto b/tensorflow/core/framework/op_def.proto
index ca0e5e7133..aea2d2bb09 100644
--- a/tensorflow/core/framework/op_def.proto
+++ b/tensorflow/core/framework/op_def.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "OpDefProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/attr_value.proto";
import "tensorflow/core/framework/types.proto";
diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h
index ff38e4b221..533dd64805 100644
--- a/tensorflow/core/framework/op_gen_lib.h
+++ b/tensorflow/core/framework/op_gen_lib.h
@@ -59,14 +59,14 @@ class ApiDefMap {
// You can call this method multiple times to load multiple
// sets of files. Api definitions are merged if the same
// op definition is loaded multiple times. Later-loaded
- // definitions take precedense.
+ // definitions take precedence.
// ApiDefs loaded from files must contain a subset of ops defined
// in the OpList passed to the constructor.
Status LoadFileList(Env* env, const std::vector<string>& filenames);
// Load a single file. Api definitions are merged if the same
// op definition is loaded multiple times. Later-loaded
- // definitions take precedense.
+ // definitions take precedence.
// ApiDefs loaded from file must contain a subset of ops defined
// in the OpList passed to the constructor.
Status LoadFile(Env* env, const string& filename);
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 67943377b9..f577664709 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -534,7 +534,7 @@ class OpKernelContext {
Rendezvous* rendezvous = nullptr;
// Mechanism for executing a collective op that needs to coordinate
- // with parallel instances runing on other devices.
+ // with parallel instances running on other devices.
CollectiveExecutor* collective_executor = nullptr;
// The session state for this op.
diff --git a/tensorflow/core/framework/reader_base.proto b/tensorflow/core/framework/reader_base.proto
index 1b8b965ee1..9e187cfa79 100644
--- a/tensorflow/core/framework/reader_base.proto
+++ b/tensorflow/core/framework/reader_base.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ReaderBaseProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
// For serializing and restoring the state of ReaderBase, see
// reader_base.h for details.
diff --git a/tensorflow/core/framework/remote_fused_graph_execute_info.proto b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
index 946da40d0e..eb689ec1e6 100644
--- a/tensorflow/core/framework/remote_fused_graph_execute_info.proto
+++ b/tensorflow/core/framework/remote_fused_graph_execute_info.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "RemoteFusedGraphExecuteInfoProto";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+// Add go_package externally.
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
diff --git a/tensorflow/core/framework/resource_handle.proto b/tensorflow/core/framework/resource_handle.proto
index b1921337f5..a54d3d906c 100644
--- a/tensorflow/core/framework/resource_handle.proto
+++ b/tensorflow/core/framework/resource_handle.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ResourceHandle";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
// Protocol buffer representing a handle to a tensorflow resource. Handles are
// not valid across executions, but can be serialized back and forth from within
diff --git a/tensorflow/core/framework/step_stats.proto b/tensorflow/core/framework/step_stats.proto
index 65c8089d51..d98999cb54 100644
--- a/tensorflow/core/framework/step_stats.proto
+++ b/tensorflow/core/framework/step_stats.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "StepStatsProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/allocation_description.proto";
import "tensorflow/core/framework/tensor_description.proto";
diff --git a/tensorflow/core/framework/summary.proto b/tensorflow/core/framework/summary.proto
index 55879f8783..532e4fcd87 100644
--- a/tensorflow/core/framework/summary.proto
+++ b/tensorflow/core/framework/summary.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "SummaryProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/tensor.proto";
// Metadata associated with a series of Summary data
diff --git a/tensorflow/core/framework/tensor.proto b/tensorflow/core/framework/tensor.proto
index abbf16e810..55921af1d0 100644
--- a/tensorflow/core/framework/tensor.proto
+++ b/tensorflow/core/framework/tensor.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "TensorProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/resource_handle.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/types.proto";
diff --git a/tensorflow/core/framework/tensor_description.proto b/tensorflow/core/framework/tensor_description.proto
index 6ac3c1b881..4c23c7e620 100644
--- a/tensorflow/core/framework/tensor_description.proto
+++ b/tensorflow/core/framework/tensor_description.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "TensorDescriptionProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
import "tensorflow/core/framework/types.proto";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/allocation_description.proto";
diff --git a/tensorflow/core/framework/tensor_shape.proto b/tensorflow/core/framework/tensor_shape.proto
index 1ec3c5323c..286156a012 100644
--- a/tensorflow/core/framework/tensor_shape.proto
+++ b/tensorflow/core/framework/tensor_shape.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "TensorShapeProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
package tensorflow;
diff --git a/tensorflow/core/framework/tensor_slice.proto b/tensorflow/core/framework/tensor_slice.proto
index 24b01661dc..a5c366ed60 100644
--- a/tensorflow/core/framework/tensor_slice.proto
+++ b/tensorflow/core/framework/tensor_slice.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "TensorSliceProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
package tensorflow;
diff --git a/tensorflow/core/framework/types.proto b/tensorflow/core/framework/types.proto
index e003fd0010..03835d1b92 100644
--- a/tensorflow/core/framework/types.proto
+++ b/tensorflow/core/framework/types.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "TypesProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
// LINT.IfChange
enum DataType {
diff --git a/tensorflow/core/framework/variable.proto b/tensorflow/core/framework/variable.proto
index e0df01cc9b..93ae423bab 100644
--- a/tensorflow/core/framework/variable.proto
+++ b/tensorflow/core/framework/variable.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "VariableProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
// Protocol buffer representing a Variable.
message VariableDef {
diff --git a/tensorflow/core/framework/versions.proto b/tensorflow/core/framework/versions.proto
index 7d5e58ae7d..dd2ec55238 100644
--- a/tensorflow/core/framework/versions.proto
+++ b/tensorflow/core/framework/versions.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "VersionsProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
// Version information for a piece of serialized data
//
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 5e2a465e22..029cdcf94a 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -2022,6 +2022,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
@@ -2051,6 +2052,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(_MklConv2D);DMT/_0(Const);DMT/_1(Const)|"
@@ -2069,6 +2071,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'Input'}"
@@ -2095,6 +2098,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'Input'}"
@@ -2125,6 +2129,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
@@ -2151,6 +2156,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Positive) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B', 'C'] }"
"node { name: 'E' op: 'BiasAddGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
@@ -2178,6 +2184,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative1) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B', 'C'] }"
"node { name: 'E' op: 'BiasAddGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
@@ -2204,6 +2211,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative2) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B', 'C'] }"
"node { name: 'E' op: 'BiasAddGrad'"
" attr { key: 'T' value { type: DT_FLOAT } }"
@@ -2233,6 +2241,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative3) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B', 'C', 'M', 'N', 'O']}"
"node { name: 'E' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
@@ -2272,6 +2281,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Input'}"
"node { name: 'E' op: 'BiasAdd'"
@@ -2289,6 +2299,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['F', 'B', 'E']}"
"node { name: 'Z' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
@@ -2319,6 +2330,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['B', 'C'] }");
@@ -2341,6 +2353,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
@@ -2348,6 +2361,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'C']}"
"node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D'] }");
@@ -2370,6 +2384,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
" input: ['B', 'C'] }");
@@ -2389,6 +2404,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B', 'C']}"
"node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'D'] }");
@@ -2411,6 +2427,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['B', 'A', 'C']}"
"node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'D'] }");
@@ -2477,6 +2494,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive2) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B', 'M', 'N']}"
"node { name: 'D' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
@@ -2529,6 +2547,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'F' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
@@ -2536,6 +2555,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['C', 'D']}"
"node { name: 'G' op: 'Const' "
" attr { key: 'dtype' value { type: DT_INT32 } }"
@@ -2572,6 +2592,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D']}"
@@ -2634,6 +2655,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'F' op: 'Conv2D'"
" attr { key: 'T' value { type: DT_FLOAT } }"
@@ -2641,6 +2663,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['C', 'D']}"
"node { name: 'G' op: 'Const' "
" attr { key: 'dtype' value { type: DT_INT32 } }"
@@ -2678,6 +2701,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['C', 'D']}"
@@ -3274,6 +3298,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B']}"
"node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['B', 'C'] }",
@@ -3296,6 +3321,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B', 'C', 'M', 'N', 'O']}"
"node { name: 'E' op: 'Zeta'"
" attr {key: 'T' value { type: DT_FLOAT } }"
@@ -3323,6 +3349,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
" attr { key: 'use_cudnn_on_gpu' value { b: false } }"
" attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }"
" attr { key: 'padding' value { s: 'SAME' } }"
+ " attr { key: 'dilations' value { list: {i: 1, i:1, i:1, i:1} } }"
" input: ['A', 'B', 'C']}"
"node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'D'] }",
diff --git a/tensorflow/core/graph/while_context.h b/tensorflow/core/graph/while_context.h
index 5944e36897..2a83eb7bd8 100644
--- a/tensorflow/core/graph/while_context.h
+++ b/tensorflow/core/graph/while_context.h
@@ -31,7 +31,7 @@ namespace tensorflow {
// future to support these features.
//
// TODO(skyewm): de/serialize in MetaGraphDef so imported while loops will be
-// differentiable. Figure out backwards compatability story.
+// differentiable. Figure out backwards compatibility story.
class WhileContext {
public:
WhileContext(StringPiece frame_name, std::vector<Node*> enter_nodes,
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 4941fb2b38..203f7b09e3 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -1082,7 +1082,7 @@ Status GraphProperties::PropagateShapes(
const std::unordered_map<const NodeDef*, const NodeDef*>& resource_handles,
int num_loops) const {
// Limit the number of iterations to prevent infinite loops in the presence of
- // incorrect shape functions. The algoritm should converge in at most
+ // incorrect shape functions. The algorithm should converge in at most
// num_nested_loops^2 * max_rank. We approximate max_rank with the constant 4.
// The same applies to resources.
VLOG(1) << "Propagating " << new_shapes->size() << " new shapes through "
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 67bf1e6980..34d48819ac 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -328,7 +328,7 @@ class VirtualScheduler {
Costs graph_costs_; // Graph cost.
std::map<string, Costs> op_to_cost_; // Per-op cost.
- // Auxilliary data structures for constructing NodeState and DeviceState.
+ // Auxiliary data structures for constructing NodeState and DeviceState.
GraphProperties graph_properties_;
Cluster* cluster_; // Not owned.
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 87ab460862..e08ab1eb67 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -2183,7 +2183,7 @@ Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
TuningConfig config;
config.no_gemm = true;
- // TODO(yaozhang): Enable tuning with various TuningConfig choices wtih
+ // TODO(yaozhang): Enable tuning with various TuningConfig choices with
// the measurement-based estimator.
status = Tune(item, graph_properties, config, output);
if (!status.ok()) {
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b2b631a222..5948f8d39f 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4248,6 +4248,7 @@ cc_library(
":as_string_op",
":base64_ops",
":reduce_join_op",
+ ":regex_full_match_op",
":regex_replace_op",
":string_join_op",
":string_split_op",
@@ -4285,6 +4286,12 @@ tf_kernel_library(
)
tf_kernel_library(
+ name = "regex_full_match_op",
+ prefix = "regex_full_match_op",
+ deps = STRING_DEPS + ["@com_googlesource_code_re2//:re2"],
+)
+
+tf_kernel_library(
name = "regex_replace_op",
prefix = "regex_replace_op",
deps = STRING_DEPS + ["@com_googlesource_code_re2//:re2"],
@@ -5174,6 +5181,7 @@ filegroup(
"debug_ops.*",
"mutex_ops.*",
"batch_kernels.*",
+ "regex_full_match_op.cc",
"regex_replace_op.cc",
],
),
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index a1c03f9918..475bda848d 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -329,6 +329,8 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
c_ptrs.push_back(&c_device_memory.back());
}
+ typedef Scalar Coefficient;
+
// Cublas does
// C = A x B
// where A, B and C are assumed to be in column major.
@@ -352,9 +354,9 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
bool blas_launch_status =
stream
->ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m,
- static_cast<Scalar>(1.0), *(a_ptrs[0]),
+ static_cast<Coefficient>(1.0), *(a_ptrs[0]),
adj_x ? m : k, *(b_ptrs[0]), 1,
- static_cast<Scalar>(0.0), c_ptrs[0], 1)
+ static_cast<Coefficient>(0.0), c_ptrs[0], 1)
.ok();
if (!blas_launch_status) {
context->SetStatus(errors::Internal(
@@ -366,9 +368,9 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
bool blas_launch_status =
stream
->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k,
- static_cast<Scalar>(1.0), *(b_ptrs[0]),
+ static_cast<Coefficient>(1.0), *(b_ptrs[0]),
adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k,
- static_cast<Scalar>(0.0), c_ptrs[0], n)
+ static_cast<Coefficient>(0.0), c_ptrs[0], n)
.ok();
if (!blas_launch_status) {
context->SetStatus(errors::Internal(
@@ -383,8 +385,8 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
stream
->ThenBlasGemmBatchedWithScratch(
blas_transpose_b, blas_transpose_a, n, m, k,
- static_cast<Scalar>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
- adj_x ? m : k, static_cast<Scalar>(0.0), c_ptrs, n,
+ static_cast<Coefficient>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
+ adj_x ? m : k, static_cast<Coefficient>(0.0), c_ptrs, n,
batch_size, &scratch_allocator)
.ok();
if (!blas_launch_status) {
@@ -398,6 +400,98 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
}
};
+template <>
+struct LaunchBatchMatMul<GPUDevice, Eigen::half> {
+ static void Launch(OpKernelContext* context, const Tensor& in_x,
+ const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
+ typedef Eigen::half Scalar;
+ constexpr perftools::gputools::blas::Transpose kTranspose =
+ is_complex<Scalar>::value
+ ? perftools::gputools::blas::Transpose::kConjugateTranspose
+ : perftools::gputools::blas::Transpose::kTranspose;
+ perftools::gputools::blas::Transpose trans[] = {
+ perftools::gputools::blas::Transpose::kNoTranspose, kTranspose};
+ const uint64 m = in_x.dim_size(adj_x ? 2 : 1);
+ const uint64 k = in_x.dim_size(adj_x ? 1 : 2);
+ const uint64 n = in_y.dim_size(adj_y ? 1 : 2);
+ const uint64 batch_size = in_x.dim_size(0);
+ auto blas_transpose_a = trans[adj_x];
+ auto blas_transpose_b = trans[adj_y];
+
+ auto* stream = context->op_device_context()->stream();
+ OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+ typedef perftools::gputools::DeviceMemory<Scalar> DeviceMemoryType;
+ std::vector<DeviceMemoryType> a_device_memory;
+ std::vector<DeviceMemoryType> b_device_memory;
+ std::vector<DeviceMemoryType> c_device_memory;
+ std::vector<DeviceMemoryType*> a_ptrs;
+ std::vector<DeviceMemoryType*> b_ptrs;
+ std::vector<DeviceMemoryType*> c_ptrs;
+ a_device_memory.reserve(batch_size);
+ b_device_memory.reserve(batch_size);
+ c_device_memory.reserve(batch_size);
+ a_ptrs.reserve(batch_size);
+ b_ptrs.reserve(batch_size);
+ c_ptrs.reserve(batch_size);
+ auto* a_base_ptr = in_x.template flat<Scalar>().data();
+ auto* b_base_ptr = in_y.template flat<Scalar>().data();
+ auto* c_base_ptr = out->template flat<Scalar>().data();
+ for (int64 i = 0; i < batch_size; ++i) {
+ a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k));
+ b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n));
+ c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n));
+ a_ptrs.push_back(&a_device_memory.back());
+ b_ptrs.push_back(&b_device_memory.back());
+ c_ptrs.push_back(&c_device_memory.back());
+ }
+
+ typedef float Coefficient;
+
+ // Cublas does
+ // C = A x B
+ // where A, B and C are assumed to be in column major.
+ // We want the output to be in row-major, so we can compute
+ // C' = B' x A', where ' stands for transpose (not adjoint).
+ // TODO(yangzihao): Choose the best of the three strategies using autotune.
+ if (batch_size == 1) {
+ // This is a regular matrix*matrix or matrix*vector multiply. Avoid the
+ // overhead of the scratch allocator and the batch interface.
+ // TODO(benbarsdell): Use fp16 Gemv if it becomes supported by CUBLAS
+ bool blas_launch_status =
+ stream
+ ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k,
+ static_cast<Coefficient>(1.0), *(b_ptrs[0]),
+ adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k,
+ static_cast<Coefficient>(0.0), c_ptrs[0], n)
+ .ok();
+ if (!blas_launch_status) {
+ context->SetStatus(errors::Internal(
+ "Blas xGEMM launch failed : a.shape=", in_x.shape().DebugString(),
+ ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n,
+ ", k=", k));
+ }
+ } else {
+ CublasScratchAllocator scratch_allocator(context);
+ bool blas_launch_status =
+ stream
+ ->ThenBlasGemmBatchedWithScratch(
+ blas_transpose_b, blas_transpose_a, n, m, k,
+ static_cast<Coefficient>(1.0), b_ptrs, adj_y ? k : n, a_ptrs,
+ adj_x ? m : k, static_cast<Coefficient>(0.0), c_ptrs, n,
+ batch_size, &scratch_allocator)
+ .ok();
+ if (!blas_launch_status) {
+ context->SetStatus(
+ errors::Internal("Blas xGEMMBatched launch failed : a.shape=",
+ in_x.shape().DebugString(), ", b.shape=",
+ in_y.shape().DebugString(), ", m=", m, ", n=", n,
+ ", k=", k, ", batch_size=", batch_size));
+ }
+ }
+ }
+};
+
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
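The half-precision specialization above mirrors the generic GPU path; the one substantive difference is that it fixes `Coefficient` to `float`, so the alpha/beta scaling factors passed to the BLAS calls stay in float even though the matrix data is `Eigen::half`. A rough, self-contained sketch of that pattern follows; the trait and function names here are invented for illustration and are not part of the change:

```c++
// Illustrative sketch of the "Coefficient" pattern (not part of this commit):
// pick a wider coefficient type when the storage type is half precision.
#include <iostream>

struct Half { float v; };  // stand-in for Eigen::half, for this sketch only

template <typename Scalar>
struct CoefficientType {
  typedef Scalar type;  // default: coefficients match the data type
};

template <>
struct CoefficientType<Half> {
  typedef float type;  // fp16 data, but float alpha/beta for the BLAS call
};

template <typename Scalar>
void LaunchGemmLike() {
  typedef typename CoefficientType<Scalar>::type Coefficient;
  const Coefficient alpha = static_cast<Coefficient>(1.0);
  const Coefficient beta = static_cast<Coefficient>(0.0);
  std::cout << "coefficient width: " << sizeof(Coefficient) << " bytes, alpha="
            << static_cast<double>(alpha) << ", beta="
            << static_cast<double>(beta) << std::endl;
}

int main() {
  LaunchGemmLike<double>();  // Coefficient == double
  LaunchGemmLike<Half>();    // Coefficient == float
  return 0;
}
```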
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 97cec3a5cc..87a0795f2f 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -15,6 +15,10 @@ limitations under the License.
#include "tensorflow/core/kernels/batch_matmul_op_impl.h"
+#if GOOGLE_CUDA
+#include "cuda/include/cuda.h"
+#endif // GOOGLE_CUDA
+
namespace tensorflow {
#if !defined(INTEL_MKL)
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index c6119b5011..b77c14d012 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -76,7 +76,7 @@ class AdaptiveSharedBatchScheduler
AdaptiveSharedBatchScheduler<TaskType>> {
public:
~AdaptiveSharedBatchScheduler() {
- // Finish processing batches before destorying other class members.
+ // Finish processing batches before destroying other class members.
batch_thread_pool_.reset();
}
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 9edc6d416e..980b1063de 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -195,8 +195,8 @@ class Conv3DBackpropInputOp : public OpKernel {
TensorShape input_shape;
if (takes_shape_) {
const Tensor& input_sizes = context->input(0);
- OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
- input_sizes.vec<int32>(), &input_shape));
+ // MakeShape is able to handle both DT_INT32 and DT_INT64 for input_sizes.
+ OP_REQUIRES_OK(context, MakeShape(input_sizes, &input_shape));
} else {
input_shape = context->input(0).shape();
}
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index 180531b8c0..a2e7342b04 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -595,7 +595,7 @@ constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide,
// For a tile size combination (longside, shortside), lying on the frontier
// implies that (longside, shortside) is on or within the frontier but
// (longside*2, shortside) or (longside, shortside+1) is not. With the above
- // critereon, we simply need to use !TileSizeOnLongSideFrontier to ensure that
+ // criterion, we simply need to use !TileSizeOnLongSideFrontier to ensure that
// it is not on the long side frontier.
return !TileSizeOutsideFrontier(TileLongSide, TileShortSide, size_of_t) &&
(TileSizeOutsideFrontier(TileLongSide * 2, TileShortSide, size_of_t) ||
diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc
index 7f12eb953a..0e43cc19aa 100644
--- a/tensorflow/core/kernels/nth_element_op.cc
+++ b/tensorflow/core/kernels/nth_element_op.cc
@@ -114,7 +114,7 @@ struct NthElementFunctor<CPUDevice, T> {
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
// The average time complexity of partition-based nth_element (BFPRT) is
- // O(n), althought the worst time complexity could be O(n^2). Here, 20 is a
+  // O(n), although the worst time complexity could be O(n^2). Here, 20 is an
// empirical factor of cost_per_unit.
Shard(worker_threads.num_threads, worker_threads.workers, num_rows,
20 * last_dim, SubNthElement);
diff --git a/tensorflow/core/kernels/regex_full_match_op.cc b/tensorflow/core/kernels/regex_full_match_op.cc
new file mode 100644
index 0000000000..5863a2c8e4
--- /dev/null
+++ b/tensorflow/core/kernels/regex_full_match_op.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "re2/re2.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class RegexFullMatchOp : public OpKernel {
+ public:
+ explicit RegexFullMatchOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor* input_tensor;
+ OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+ const auto& input_flat = input_tensor->flat<string>();
+
+ const Tensor* pattern_tensor;
+ OP_REQUIRES_OK(ctx, ctx->input("pattern", &pattern_tensor));
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(pattern_tensor->shape()),
+ errors::InvalidArgument("Pattern must be scalar, but received ",
+ pattern_tensor->shape().DebugString()));
+ const string pattern = pattern_tensor->flat<string>()(0);
+ const RE2 match(pattern);
+ OP_REQUIRES(ctx, match.ok(),
+ errors::InvalidArgument("Invalid pattern: ", pattern,
+ ", error: ", match.error()));
+
+ Tensor* output_tensor = nullptr;
+ OP_REQUIRES_OK(ctx, ctx->allocate_output("output", input_tensor->shape(),
+ &output_tensor));
+ auto output_flat = output_tensor->flat<bool>();
+ for (size_t i = 0; i < input_flat.size(); ++i) {
+ output_flat(i) = RE2::FullMatch(input_flat(i), match);
+ }
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("RegexFullMatch").Device(DEVICE_CPU),
+ RegexFullMatchOp);
+
+} // namespace tensorflow
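The kernel above compiles the scalar `pattern` input into an `RE2` object once per `Compute` call and then applies `RE2::FullMatch` element-wise to the flattened string tensor. The following is a minimal standalone sketch of that RE2 usage, with a made-up pattern and inputs; it is illustrative only and not part of the change:

```c++
// Standalone sketch of the RE2 calls the kernel relies on (not part of this commit).
#include <iostream>
#include <string>
#include <vector>

#include "re2/re2.h"

int main() {
  // Compile the pattern once, mirroring the op's scalar `pattern` input.
  const RE2 pattern("[a-z]+[0-9]+");
  if (!pattern.ok()) {
    // Mirrors the InvalidArgument check in RegexFullMatchOp::Compute().
    std::cerr << "Invalid pattern: " << pattern.error() << std::endl;
    return 1;
  }
  const std::vector<std::string> inputs = {"abc123", "ABC", "x1"};
  for (const std::string& s : inputs) {
    // FullMatch returns true only if the entire string matches the pattern.
    std::cout << s << " -> " << RE2::FullMatch(s, pattern) << std::endl;
  }
  return 0;
}
```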
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index 96f94d80df..722116f86f 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -84,7 +84,7 @@ void DoRoll(OpKernelContext* context, const int64 num_elements,
// Shard
auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
   // 15 - experimentally determined with float and bool types
- const int cost_per_element = 15 * sizeof(T); // rough esitmate
+ const int cost_per_element = 15 * sizeof(T); // rough estimate
Shard(worker_threads->num_threads, worker_threads->workers, num_elements,
cost_per_element, std::move(work));
}
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index c87ce78e05..2328fc6afd 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -320,7 +320,9 @@ class SegmentSumGPUOp : public AsyncOpKernel {
REGISTER_CPU_KERNEL_SEGMENT("SegmentSum", Eigen::internal::SumReducer<type>, \
type, index_type, 0); \
REGISTER_CPU_KERNEL_SEGMENT( \
- "SegmentProd", Eigen::internal::ProdReducer<type>, type, index_type, 1)
+ "SegmentMean", Eigen::internal::MeanReducer<type>, type, index_type, 0); \
+ REGISTER_CPU_KERNEL_SEGMENT( \
+ "SegmentProd", Eigen::internal::ProdReducer<type>, type, index_type, 1);
#define REGISTER_REAL_CPU_KERNELS_ALL(type) \
REGISTER_REAL_CPU_KERNELS(type, int32); \
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 4abfbfb1a6..7796bf3587 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -130,4 +130,4 @@ struct Highest {
} // namespace functor
} // namespace tensorflow
-#endif // TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
+#endif // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
diff --git a/tensorflow/core/lib/core/error_codes.proto b/tensorflow/core/lib/core/error_codes.proto
index b82d389146..5ced65a973 100644
--- a/tensorflow/core/lib/core/error_codes.proto
+++ b/tensorflow/core/lib/core/error_codes.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ErrorCodesProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/lib/core";
// The canonical error codes for TensorFlow APIs.
//
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 82330ec9d1..d949e70c66 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -435,6 +435,25 @@ REGISTER_OP("DrawBoundingBoxes")
.Output("output: T")
.Attr("T: {float, half} = DT_FLOAT")
.SetShapeFn([](InferenceContext* c) {
+ // The rank of images should be 4.
+ ShapeHandle images;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &images));
+ // Channel depth should be either 1 (GRY), 3 (RGB), or 4 (RGBA).
+ if (c->ValueKnown(c->Dim(images, 3))) {
+ int64 depth = c->Value(c->Dim(images, 3));
+ if (!(depth == 1 || depth == 3 || depth == 4)) {
+ return errors::InvalidArgument("Channel depth should be either 1 (GRY), "
+ "3 (RGB), or 4 (RGBA)");
+ }
+ }
+
+ // The rank of boxes is 3: [batch, num_bounding_boxes, 4].
+ ShapeHandle boxes;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &boxes));
+ // The last value of boxes shape is 4.
+ DimensionHandle unused;
+ TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused));
+
return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
});
diff --git a/tensorflow/core/ops/image_ops_test.cc b/tensorflow/core/ops/image_ops_test.cc
index 5f0b391b0d..517af26b44 100644
--- a/tensorflow/core/ops/image_ops_test.cc
+++ b/tensorflow/core/ops/image_ops_test.cc
@@ -312,4 +312,23 @@ TEST(ImageOpsTest, QuantizedResizeBilinear_ShapeFn) {
INFER_OK(op, "[1,?,3,?];[2];[];[]", "[d0_0,20,30,d0_3];[];[]");
}
+TEST(ImageOpsTest, DrawBoundingBoxes_ShapeFn) {
+ ShapeInferenceTestOp op("DrawBoundingBoxes");
+ op.input_tensors.resize(2);
+
+ // Check images.
+ INFER_ERROR("must be rank 4", op, "[1,?,3];?");
+ INFER_ERROR("should be either 1 (GRY), 3 (RGB), or 4 (RGBA)",
+ op, "[1,?,?,5];?");
+
+ // Check boxes.
+ INFER_ERROR("must be rank 3", op, "[1,?,?,4];[1,4]");
+ INFER_ERROR("Dimension must be 4", op, "[1,?,?,4];[1,2,2]");
+
+ // OK shapes.
+ INFER_OK(op, "[4,?,?,4];?", "in0");
+ INFER_OK(op, "[?,?,?,?];[?,?,?]", "in0");
+ INFER_OK(op, "[4,?,?,4];[?,?,?]", "in0");
+ INFER_OK(op, "[4,?,?,4];[?,?,4]", "in0");
+}
} // end namespace tensorflow
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8f8443a46c..8c0b073ce4 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1017,7 +1017,7 @@ REGISTER_OP("SegmentMean")
.Input("data: T")
.Input("segment_ids: Tindices")
.Output("output: T")
- .Attr("T: realnumbertype")
+ .Attr("T: numbertype")
.Attr("Tindices: {int32,int64}")
.SetShapeFn(SegmentReductionShapeFn);
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index bb46dafd42..fc60e807b9 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -547,7 +547,7 @@ REGISTER_OP("Conv3DBackpropFilter")
});
REGISTER_OP("Conv3DBackpropInputV2")
- .Input("input_sizes: int32")
+ .Input("input_sizes: Tshape")
.Input("filter: T")
.Input("out_backprop: T")
.Output("output: T")
@@ -556,6 +556,7 @@ REGISTER_OP("Conv3DBackpropInputV2")
.Attr(GetPaddingAttrString())
.Attr(GetConvnet3dDataFormatAttrString())
.Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+ .Attr("Tshape: {int32, int64} = DT_INT32")
.SetShapeFn([](InferenceContext* c) {
ShapeHandle s;
TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index 416ce9c0d8..80ffae5796 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -72,7 +72,15 @@ REGISTER_OP("ParameterizedTruncatedNormal")
.Attr("seed2: int = 0")
.Attr("dtype: {half,bfloat16,float,double}")
.Attr("T: {int32, int64}")
- .SetShapeFn(shape_inference::RandomShape);
+ .SetShapeFn([](InferenceContext* c) {
+ ShapeHandle unused;
+ // Parameters must be 0-d or 1-d.
+ TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &unused));
+ TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
+ TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(3), 1, &unused));
+ TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(4), 1, &unused));
+ return shape_inference::RandomShape(c);
+ });
REGISTER_OP("TruncatedNormal")
.Input("shape: T")
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 469f193cf4..1d5c743a56 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -37,6 +37,17 @@ REGISTER_OP("RegexReplace")
return Status::OK();
});
+REGISTER_OP("RegexFullMatch")
+ .Input("input: string")
+ .Input("pattern: string")
+ .Output("output: bool")
+ .SetShapeFn([](InferenceContext* c) {
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+ c->set_output(0, c->input(0));
+ return Status::OK();
+ });
+
REGISTER_OP("StringToHashBucketFast")
.Input("input: string")
.Output("output: int64")
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 8307758936..dc12c78a4b 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -103,7 +103,7 @@ constexpr char kResolveCacheSecs[] = "GCS_RESOLVE_REFRESH_SECS";
// The environment variable to configure the http request's connection timeout.
constexpr char kRequestConnectionTimeout[] =
"GCS_REQUEST_CONNECTION_TIMEOUT_SECS";
-// The environment varaible to configure the http request's idle timeout.
+// The environment variable to configure the http request's idle timeout.
constexpr char kRequestIdleTimeout[] = "GCS_REQUEST_IDLE_TIMEOUT_SECS";
// The environment variable to configure the overall request timeout for
// metadata requests.
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.h b/tensorflow/core/platform/cloud/gcs_throttle.h
index 97a858e3fe..8c9e2e074c 100644
--- a/tensorflow/core/platform/cloud/gcs_throttle.h
+++ b/tensorflow/core/platform/cloud/gcs_throttle.h
@@ -132,7 +132,7 @@ class GcsThrottle {
* UpdateState updates the available_tokens_ and last_updated_secs_ variables.
*
* UpdateState should be called in order to mark the passage of time, and
- * therefore add tokens to the availble_tokens_ pool.
+ * therefore add tokens to the available_tokens_ pool.
*/
void UpdateState() EXCLUSIVE_LOCKS_REQUIRED(mu_);
diff --git a/tensorflow/core/profiler/g3doc/command_line.md b/tensorflow/core/profiler/g3doc/command_line.md
index bbaf55e613..cc6d9def47 100644
--- a/tensorflow/core/profiler/g3doc/command_line.md
+++ b/tensorflow/core/profiler/g3doc/command_line.md
@@ -82,7 +82,7 @@ bazel-bin/tensorflow/core/profiler/profiler \
#
# Alternatively, user can pass separate files.
#
-# --graph_path contains the model architecutre and tensor shapes.
+# --graph_path contains the model architecture and tensor shapes.
# --run_meta_path contains the memory and time information.
# --op_log_path contains float operation and code traces.
# --checkpoint_path contains the model checkpoint data.
diff --git a/tensorflow/core/protobuf/cluster.proto b/tensorflow/core/protobuf/cluster.proto
index 33c87eefe0..c696d345e0 100644
--- a/tensorflow/core/protobuf/cluster.proto
+++ b/tensorflow/core/protobuf/cluster.proto
@@ -20,6 +20,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ClusterProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
// This file contains protos to be used when defining a TensorFlow
// cluster.
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 410ad227e9..9a48f43a63 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
diff --git a/tensorflow/core/protobuf/control_flow.proto b/tensorflow/core/protobuf/control_flow.proto
index 3c05b4f0e2..5f44878c44 100644
--- a/tensorflow/core/protobuf/control_flow.proto
+++ b/tensorflow/core/protobuf/control_flow.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ControlFlowProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
// Control flow context related protocol buffers.
diff --git a/tensorflow/core/protobuf/critical_section.proto b/tensorflow/core/protobuf/critical_section.proto
index 0b3f531e6d..7954e7ba87 100644
--- a/tensorflow/core/protobuf/critical_section.proto
+++ b/tensorflow/core/protobuf/critical_section.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "CriticalSectionProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
// Protocol buffer representing a CriticalSection.
message CriticalSectionDef {
diff --git a/tensorflow/core/protobuf/debug.proto b/tensorflow/core/protobuf/debug.proto
index 56983f3b7d..499900f965 100644
--- a/tensorflow/core/protobuf/debug.proto
+++ b/tensorflow/core/protobuf/debug.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "DebugProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
// EXPERIMENTAL. Option for watching a node.
message DebugTensorWatch {
diff --git a/tensorflow/core/protobuf/device_properties.proto b/tensorflow/core/protobuf/device_properties.proto
index 3bd3015900..11e1258e75 100644
--- a/tensorflow/core/protobuf/device_properties.proto
+++ b/tensorflow/core/protobuf/device_properties.proto
@@ -18,6 +18,7 @@ syntax = "proto3";
package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "DevicePropertiesProtos";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
message DeviceProperties {
// Device type (CPU, GPU, ...)
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index 96c91536f7..03022875e6 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -20,7 +20,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "DistributedRuntimeProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/tensor.proto";
diff --git a/tensorflow/core/protobuf/master_service.proto b/tensorflow/core/protobuf/master_service.proto
index 1170611f37..ce0e4f6435 100644
--- a/tensorflow/core/protobuf/master_service.proto
+++ b/tensorflow/core/protobuf/master_service.proto
@@ -19,7 +19,7 @@ package tensorflow.grpc;
option java_outer_classname = "MasterServiceProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/protobuf/master.proto";
////////////////////////////////////////////////////////////////////////////////
diff --git a/tensorflow/core/protobuf/meta_graph.proto b/tensorflow/core/protobuf/meta_graph.proto
index fd86c0da12..75a2a88ed7 100644
--- a/tensorflow/core/protobuf/meta_graph.proto
+++ b/tensorflow/core/protobuf/meta_graph.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "MetaGraphProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "google/protobuf/any.proto";
import "tensorflow/core/framework/graph.proto";
diff --git a/tensorflow/core/protobuf/named_tensor.proto b/tensorflow/core/protobuf/named_tensor.proto
index dd4976e354..6e2f7feee2 100644
--- a/tensorflow/core/protobuf/named_tensor.proto
+++ b/tensorflow/core/protobuf/named_tensor.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "NamedTensorProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/framework/tensor.proto";
// A pair of tensor name and tensor values.
diff --git a/tensorflow/core/protobuf/queue_runner.proto b/tensorflow/core/protobuf/queue_runner.proto
index 05a48d0acf..f4df649f7d 100644
--- a/tensorflow/core/protobuf/queue_runner.proto
+++ b/tensorflow/core/protobuf/queue_runner.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "QueueRunnerProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/lib/core/error_codes.proto";
// Protocol buffer representing a QueueRunner.
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 10bfe3034d..45e57594e4 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "RewriterConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/framework/attr_value.proto";
@@ -31,7 +32,7 @@ message RewriterConfig {
AGGRESSIVE = 3;
}
- // Enum controling the number of times to run optimizers. The default is to
+ // Enum controlling the number of times to run optimizers. The default is to
// run them once.
enum NumIterationsType {
DEFAULT_NUM_ITERS = 0;
diff --git a/tensorflow/core/protobuf/saved_model.proto b/tensorflow/core/protobuf/saved_model.proto
index c2595ddf88..03789d3df7 100644
--- a/tensorflow/core/protobuf/saved_model.proto
+++ b/tensorflow/core/protobuf/saved_model.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "SavedModelProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/protobuf/meta_graph.proto";
// SavedModel is the high level serialization format for TensorFlow Models.
diff --git a/tensorflow/core/protobuf/saver.proto b/tensorflow/core/protobuf/saver.proto
index a757d3f756..4245386145 100644
--- a/tensorflow/core/protobuf/saver.proto
+++ b/tensorflow/core/protobuf/saver.proto
@@ -5,6 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "SaverProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.util";
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
// Protocol buffer representing the configuration of a Saver.
message SaverDef {
diff --git a/tensorflow/core/protobuf/tensor_bundle.proto b/tensorflow/core/protobuf/tensor_bundle.proto
index 80e87f14f9..681c01bbbd 100644
--- a/tensorflow/core/protobuf/tensor_bundle.proto
+++ b/tensorflow/core/protobuf/tensor_bundle.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "TensorBundleProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.util";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/framework/tensor_shape.proto";
import "tensorflow/core/framework/tensor_slice.proto";
import "tensorflow/core/framework/types.proto";
diff --git a/tensorflow/core/protobuf/tensorflow_server.proto b/tensorflow/core/protobuf/tensorflow_server.proto
index 6199e707e5..be25804a1b 100644
--- a/tensorflow/core/protobuf/tensorflow_server.proto
+++ b/tensorflow/core/protobuf/tensorflow_server.proto
@@ -23,7 +23,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "ServerProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
// Defines the configuration of a single TensorFlow server.
message ServerDef {
// The cluster of which this server is a member.
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index b400638df1..a3bc2f422e 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -20,7 +20,7 @@ option cc_enable_arenas = true;
option java_outer_classname = "WorkerProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "google/protobuf/any.proto";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/step_stats.proto";
diff --git a/tensorflow/core/protobuf/worker_service.proto b/tensorflow/core/protobuf/worker_service.proto
index e0c27f394a..9ebbd553f2 100644
--- a/tensorflow/core/protobuf/worker_service.proto
+++ b/tensorflow/core/protobuf/worker_service.proto
@@ -19,7 +19,7 @@ package tensorflow.grpc;
option java_outer_classname = "WorkerServiceProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";
-
+option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/protobuf/worker.proto";
////////////////////////////////////////////////////////////////////////////////
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index ba69efb289..522a9d84fd 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
#define TF_STR_HELPER(x) #x
#define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
index f2d4e470c8..b91f8bb8ef 100644
--- a/tensorflow/core/util/cuda_device_functions.h
+++ b/tensorflow/core/util/cuda_device_functions.h
@@ -537,7 +537,7 @@ __device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
return atomicSub(ptr, value);
}
-// Specializations of substraction which add the negative value.
+// Specializations of subtraction which add the negative value.
__device__ inline float CudaAtomicSub(float* ptr, float value) {
return CudaAtomicAdd(ptr, -value);
}
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 50a8e30574..8105121e7c 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -1359,7 +1359,7 @@ inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims,
/// Map MklDnn memory::dims object into TensorShape object.
///
/// This function will simply map input shape in MKL-DNN memory::dims format
-/// in Tensorflow's TensorShape object by perserving dimension order.
+/// in Tensorflow's TensorShape object by preserving dimension order.
///
/// @input MKL-DNN memory::dims object
/// @output TensorShape corresponding to memory::dims
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index 58bc79aebf..d3d5602f92 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -75,7 +75,7 @@ enum FilterTensorFormat {
FORMAT_OIHW = 1,
// OIHW_VECT_I is the most performant tensor format for cudnn6's quantized
- // int8 convolution and fused convolution. It is analagous to the NCHW_VECT_C
+ // int8 convolution and fused convolution. It is analogous to the NCHW_VECT_C
// data format. It is laid out in the same order as OIHW, except that the size
// of the Input Channels dimension is divided by 4, and a new dimension of
// size 4 is appended, which packs 4 adjacent input channel weights into an
diff --git a/tensorflow/docs_src/api_guides/python/reading_data.md b/tensorflow/docs_src/api_guides/python/reading_data.md
index b3ca958370..5bbbfd3216 100644
--- a/tensorflow/docs_src/api_guides/python/reading_data.md
+++ b/tensorflow/docs_src/api_guides/python/reading_data.md
@@ -184,7 +184,7 @@ The recommended way to read a TFRecord file is with a @{tf.data.TFRecordDataset}
dataset = dataset.map(decode)
```
-To acomplish the same task with a queue based input pipeline requires the following code
+To accomplish the same task with a queue based input pipeline requires the following code
(using the same `decode` function from the above example):
``` python
diff --git a/tensorflow/docs_src/community/benchmarks.md b/tensorflow/docs_src/community/benchmarks.md
index 67856ce869..153ef4a015 100644
--- a/tensorflow/docs_src/community/benchmarks.md
+++ b/tensorflow/docs_src/community/benchmarks.md
@@ -1,14 +1,14 @@
# Defining and Running Benchmarks
-This guide contains instructions for defining and running a TensorFlow benchmark. These benchmarks store output in [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) format. If these benchmarks are added to TensorFlow github repo, then we will run them daily with our continuous build and display a graph on our dashboard: https://benchmarks-dot-tensorflow-testing.appspot.com/.
+This guide contains instructions for defining and running a TensorFlow benchmark. These benchmarks store output in [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) format. If these benchmarks are added to the TensorFlow github repo, we will run them daily with our continuous build and display a graph on our dashboard: https://benchmarks-dot-tensorflow-testing.appspot.com/.
[TOC]
## Defining a Benchmark
-Defining a TensorFlow benchmark requires extending from `tf.test.Benchmark`
-class and calling `self.report_benchmark` method. For example, take a look at the sample benchmark code below:
+Defining a TensorFlow benchmark requires extending the `tf.test.Benchmark`
+class and calling the `self.report_benchmark` method. Below, you'll find an example of benchmark code:
```python
import time
@@ -54,20 +54,20 @@ Key points to note in the example above:
## Running with Python
-Use the `--benchmarks` flag to run the benchmark with python. A [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto will be printed.
+Use the `--benchmarks` flag to run the benchmark with Python. A [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto will be printed.
```
python sample_benchmark.py --benchmarks=SampleBenchmark
```
-Setting the flag as `--benchmarks=.` or `--benchmarks=all` would work as well.
+Setting the flag as `--benchmarks=.` or `--benchmarks=all` works as well.
-(Please ensure that Tensorflow is installed to successfully import the package in the line `import tensorflow as tf`. For installation instructions, see [Installing TensorFlow](https://www.tensorflow.org/install/). This step is not necessary when running with bazel.)
+(Please ensure that TensorFlow is installed to successfully import the package in the line `import tensorflow as tf`. For installation instructions, see [Installing TensorFlow](https://www.tensorflow.org/install/). This step is not necessary when running with Bazel.)
## Adding a `bazel` Target
-We have a special target called `tf_py_logged_benchmark` for benchmarks defined under TensorFlow github repo. `tf_py_logged_benchmark` should wrap around a regular `py_test` target. Running a `tf_py_logged_benchmark` would print a [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) proto. Defining a `tf_py_logged_benchmark` also lets us run it with TensorFlow continuous build.
+We have a special target called `tf_py_logged_benchmark` for benchmarks defined under the TensorFlow github repo. `tf_py_logged_benchmark` should wrap around a regular `py_test` target. Running a `tf_py_logged_benchmark` would print a [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) proto. Defining a `tf_py_logged_benchmark` also lets us run it with TensorFlow continuous build.
First, define a regular `py_test` target. See example below:
@@ -82,7 +82,7 @@ py_test(
)
```
-You can run benchmarks in a `py_test` target by passing `--benchmarks` flag. The benchmark should just print out a [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto.
+You can run benchmarks in a `py_test` target by passing the `--benchmarks` flag. The benchmark should just print out a [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto.
```shell
bazel test :sample_benchmark --test_arg=--benchmarks=all
@@ -90,7 +90,7 @@ bazel test :sample_benchmark --test_arg=--benchmarks=all
Now, add the `tf_py_logged_benchmark` target (if available). This target would
-pass in `--benchmarks=all` to the wrapped `py_test` target and provide a way to store output for our TensorFlow continuous build. `tf_py_logged_benchmark` target should be available in TensorFlow repository.
+pass in `--benchmarks=all` to the wrapped `py_test` target and provide a way to store output for our TensorFlow continuous build. The target `tf_py_logged_benchmark` should be available in the TensorFlow repository.
```build
load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark")
diff --git a/tensorflow/docs_src/community/swift.md b/tensorflow/docs_src/community/swift.md
index e5e9e8e0a4..d1625d3b93 100644
--- a/tensorflow/docs_src/community/swift.md
+++ b/tensorflow/docs_src/community/swift.md
@@ -8,7 +8,7 @@ Welcome to the Swift for TensorFlow development community!
Swift for TensorFlow is a new way to develop machine learning models. It
gives you the power of
-[TensorFlow](programmers_guide/eager) directly
+[TensorFlow](https://www.tensorflow.org) directly
integrated into the [Swift programming language](https://swift.org/about).
With Swift, you can write the following imperative code, and Swift
automatically turns it into **a single TensorFlow Graph** and runs it
diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md
index ef3b030e32..9ef9674338 100644
--- a/tensorflow/docs_src/deploy/s3.md
+++ b/tensorflow/docs_src/deploy/s3.md
@@ -1,6 +1,6 @@
# How to run TensorFlow on S3
-Tensorflow supports reading and writing data to S3. S3 is an object storage API which is nearly ubiquitious, and can help in situations where data must accessed by multiple actors, such as in distributed training.
+TensorFlow supports reading and writing data to S3. S3 is an object storage API which is nearly ubiquitous, and can help in situations where data must be accessed by multiple actors, such as in distributed training.
This document guides you through the required setup, and provides examples on usage.
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index c3795492ce..1b028be4ea 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -863,48 +863,53 @@ REGISTER_OP("ZeroOut")
Instead of writing another `OpKernel` with redundant code as above, often you
will be able to use a C++ template instead. You will still have one kernel
registration (`REGISTER_KERNEL_BUILDER` call) per overload.
-<pre class="prettyprint"><code class="lang-cpp">
-<b>template &lt;typename T&gt;</b>
+```c++
+template <typename T>
class ZeroOutOp : public OpKernel {
public:
- explicit ZeroOutOp(OpKernelConstruction\* context) : OpKernel(context) {}<br/>
- void Compute(OpKernelContext\* context) override {
+ explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ void Compute(OpKernelContext* context) override {
// Grab the input tensor
- const Tensor& input\_tensor = context-&gt;input(0);
- auto input = input\_tensor.flat<b>&lt;T&gt;</b>();<br/>
+ const Tensor& input_tensor = context->input(0);
+ auto input = input_tensor.flat<T>();
+
// Create an output tensor
Tensor* output = NULL;
- OP\_REQUIRES\_OK(context,
- context-&gt;allocate\_output(0, input_tensor.shape(), &output));
- auto output\_flat = output-&gt;template flat<b>&lt;T&gt;</b>();<br/>
+ OP_REQUIRES_OK(context,
+ context->allocate_output(0, input_tensor.shape(), &output));
+ auto output_flat = output->template flat<T>();
+
// Set all the elements of the output tensor to 0
const int N = input.size();
- for (int i = 0; i &lt; N; i++) {
- output\_flat(i) = 0;
- }<br/>
+ for (int i = 0; i < N; i++) {
+ output_flat(i) = 0;
+ }
+
// Preserve the first input value
- if (N &gt; 0) output\_flat(0) = input(0);
+ if (N > 0) output_flat(0) = input(0);
}
-};<br/>
-// Note that TypeConstraint&lt;int32&gt;("T") means that attr "T" (defined
+};
+
+// Note that TypeConstraint<int32>("T") means that attr "T" (defined
// in the op registration above) must be "int32" to use this template
-// instantiation.</b>
-REGISTER\_KERNEL\_BUILDER(
+// instantiation.
+REGISTER_KERNEL_BUILDER(
Name("ZeroOut")
- .Device(DEVICE\_CPU)
- .TypeConstraint&lt;int32&gt;("T"),
- <b>ZeroOutOp&lt;int32&gt;</b>);
-REGISTER\_KERNEL\_BUILDER(
+ .Device(DEVICE_CPU)
+ .TypeConstraint<int32>("T"),
+ ZeroOutOp<int32>);
+REGISTER_KERNEL_BUILDER(
Name("ZeroOut")
- .Device(DEVICE\_CPU)
- .TypeConstraint&lt;float&gt;("T"),
- <b>ZeroOutOp&lt;float&gt;</b>);
-<b>REGISTER\_KERNEL\_BUILDER(
+ .Device(DEVICE_CPU)
+ .TypeConstraint<float>("T"),
+ ZeroOutOp<float>);
+REGISTER_KERNEL_BUILDER(
Name("ZeroOut")
- .Device(DEVICE\_CPU)
- .TypeConstraint&lt;double&gt;("T"),
- ZeroOutOp&lt;double&gt;);
-</b></code></pre>
+ .Device(DEVICE_CPU)
+ .TypeConstraint<double>("T"),
+ ZeroOutOp<double>);
+```
If you have more than a couple overloads, you can put the registration in a
macro.
diff --git a/tensorflow/docs_src/extend/architecture.md b/tensorflow/docs_src/extend/architecture.md
index c0fc714a44..c8f522a03a 100644
--- a/tensorflow/docs_src/extend/architecture.md
+++ b/tensorflow/docs_src/extend/architecture.md
@@ -4,8 +4,8 @@ We designed TensorFlow for large-scale distributed training and inference, but
it is also flexible enough to support experimentation with new machine
learning models and system-level optimizations.
-This document describes the system architecture that makes possible this
-combination of scale and flexibility. It assumes that you have basic familiarity
+This document describes the system architecture that makes this
+combination of scale and flexibility possible. It assumes that you have basic familiarity
with TensorFlow programming concepts such as the computation graph, operations,
and sessions. See @{$programmers_guide/low_level_intro$this document}
for an introduction to these topics. Some familiarity
@@ -15,8 +15,8 @@ will also be helpful.
This document is for developers who want to extend TensorFlow in some way not
supported by current APIs, hardware engineers who want to optimize for
TensorFlow, implementers of machine learning systems working on scaling and
-distribution, or anyone who wants to look under Tensorflow's hood. After
-reading it you should understand TensorFlow architecture well enough to read
+distribution, or anyone who wants to look under TensorFlow's hood. By the end of this document
+you should understand the TensorFlow architecture well enough to read
and modify the core TensorFlow code.
## Overview
@@ -35,7 +35,7 @@ This document focuses on the following layers:
* **Client**:
* Defines the computation as a dataflow graph.
* Initiates graph execution using a [**session**](
- https://www.tensorflow.org/code/tensorflow/python/client/session.py)
+ https://www.tensorflow.org/code/tensorflow/python/client/session.py).
* **Distributed Master**
* Prunes a specific subgraph from the graph, as defined by the arguments
to Session.run().
@@ -55,7 +55,7 @@ Figure 2 illustrates the interaction of these components. "/job:worker/task:0" a
server": a task responsible for storing and updating the model's parameters.
Other tasks send updates to these parameters as they work on optimizing the
parameters. This particular division of labor between tasks is not required, but
-it is common for distributed training.
+ is common for distributed training.
![TensorFlow Architecture Diagram](https://www.tensorflow.org/images/diag1.svg){: width="500"}
@@ -193,7 +193,7 @@ https://www.tensorflow.org/code/tensorflow/contrib/nccl/python/ops/nccl_ops.py))
## Kernel Implementations
-The runtime contains over 200 standard operations, including mathematical, array
+The runtime contains over 200 standard operations including mathematical, array
manipulation, control flow, and state management operations. Each of these
operations can have kernel implementations optimized for a variety of devices.
Many of the operation kernels are implemented using Eigen::Tensor, which uses
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 8c165aad52..1abd840ab3 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
OS="linux" # Change to "darwin" for macOS
TARGET_DIRECTORY="/usr/local"
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 26cbcc9a9b..52a2a3f8a6 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
TF_TYPE="cpu" # Change to "gpu" for GPU support
TARGET_DIRECTORY='/usr/local'
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0-rc1.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" |
sudo tar -C $TARGET_DIRECTORY -xz
The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 05b2878701..1256fb99c4 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
- <version>1.8.0-rc1</version>
+ <version>1.8.0</version>
</dependency>
```
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>tensorflow</artifactId>
- <version>1.8.0-rc1</version>
+ <version>1.8.0</version>
</dependency>
</dependencies>
</project>
@@ -124,12 +124,12 @@ instead:
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>libtensorflow</artifactId>
- <version>1.8.0-rc1</version>
+ <version>1.8.0</version>
</dependency>
<dependency>
<groupId>org.tensorflow</groupId>
<artifactId>libtensorflow_jni_gpu</artifactId>
- <version>1.8.0-rc1</version>
+ <version>1.8.0</version>
</dependency>
```
@@ -148,7 +148,7 @@ refer to the simpler instructions above instead.
Take the following steps to install TensorFlow for Java on Linux or macOS:
1. Download
- [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
+ [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
which is the TensorFlow Java Archive (JAR).
2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
mkdir -p ./jni
curl -L \
- "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0-rc1.tar.gz" |
+ "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
tar -xz -C ./jni
### Install on Windows
@@ -175,10 +175,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
Take the following steps to install TensorFlow for Java on Windows:
1. Download
- [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0-rc1.jar),
+ [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
which is the TensorFlow Java Archive (JAR).
2. Download the following Java Native Interface (JNI) file appropriate for
- [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0-rc1.zip).
+ [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip).
3. Extract this .zip file.
@@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the
downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
as follows:
-<pre><b>javac -cp libtensorflow-1.8.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.8.0.jar HelloTF.java</b></pre>
### Running
@@ -241,11 +241,11 @@ two files are available to the JVM:
For example, the following command line executes the `HelloTF` program on Linux
and macOS X:
-<pre><b>java -cp libtensorflow-1.8.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
And the following command line executes the `HelloTF` program on Windows:
-<pre><b>java -cp libtensorflow-1.8.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
installed TensorFlow for Java and are ready to use the API. If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 9d9322dbb5..0ed8160027 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -438,7 +438,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
<pre>
(tensorflow)$ <b>pip install --ignore-installed --upgrade \
- https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl</b></pre>
<a name="ValidateYourInstallation"></a>
## Validate your installation
@@ -684,14 +684,14 @@ This section documents the relevant values for Linux installations.
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@@ -703,14 +703,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
</pre>
Note that GPU support requires the NVIDIA hardware and software described in
@@ -722,14 +722,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
</pre>
@@ -741,14 +741,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
CPU only:
<pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
</pre>
GPU support:
<pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
</pre>
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 0906b55008..29a867a9e3 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
TensorFlow in the active Virtualenv is as follows:
<pre> $ <b>pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl</b></pre>
If you encounter installation problems, see
[Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
issue the following command:
<pre> $ <b>sudo pip3 install --upgrade \
- https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl</b> </pre>
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl</b> </pre>
If the preceding command fails, see
[installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
TensorFlow for Python 2.7:
<pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
- https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl</b></pre>
+ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl</b></pre>
<a name="ValidateYourInstallation"></a>
@@ -522,7 +522,7 @@ The value you specify depends on your Python version.
<pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
</pre>
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py2-none-a
<pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
</pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 8bbdf013ca..5ba522b436 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package.
The filename of the `.whl` file depends on your platform.
For example, the following command will install the pip package
-for TensorFlow 1.8.0rc1 on Linux:
+for TensorFlow 1.8.0 on Linux:
<pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl</b>
</pre>
## Validate your installation
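A quick way to confirm the freshly built 1.8.0 package works is the usual smoke test from the install guides; a minimal sketch (any short TensorFlow program would do):

```python
# Smoke test for a newly installed TensorFlow package.
import tensorflow as tf

hello = tf.constant('Hello, TensorFlow!')
with tf.Session() as sess:
    print(sess.run(hello))   # Should print: Hello, TensorFlow!
print(tf.VERSION)            # Should print: 1.8.0
```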
diff --git a/tensorflow/docs_src/mobile/mobile_intro.md b/tensorflow/docs_src/mobile/mobile_intro.md
index 1b0b9b44b4..241f01d460 100644
--- a/tensorflow/docs_src/mobile/mobile_intro.md
+++ b/tensorflow/docs_src/mobile/mobile_intro.md
@@ -212,7 +212,7 @@ handle the task then it will be difficult to train a computer to do better.
After you’ve solved any fundamental issues with your use case, you need to
create a labeled dataset to define what problem you’re trying to solve. This
-step is extremely important, moreso than picking which model to use. You want it
+step is extremely important, more so than picking which model to use. You want it
to be as representative as possible of your actual use case, since the model
will only be effective at the task you teach it. It’s also worth investing in
tools to make labeling the data as efficient and accurate as possible. For
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
index 01881ccf3b..5622034827 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -155,7 +155,7 @@ retraining for both floating point and quantized inference.
The following diagram shows the architectural design of TensorFlow Lite:
-<img src="/images/tflite-architecture.jpg"
+<img src="https://www.tensorflow.org/images/tflite-architecture.jpg"
alt="TensorFlow Lite architecture diagram"
style="max-width:600px;">
diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md
index 51c1a1e032..b6291a9ffa 100644
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@@ -72,7 +72,7 @@ tensors in the execution of a step.
If `t` is a @{tf.Tensor} object,
@{tf.Tensor.eval} is shorthand for
-@{tf.Session.run} (where `sess` is the
+@{tf.Session.run}, where `sess` is the
current @{tf.get_default_session}. The
two following snippets of code are equivalent:
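A minimal sketch of the equivalence described above, assuming a default session has been installed via a `with` block:

```python
import tensorflow as tf

c = tf.constant(4.0)
t = c * c
with tf.Session() as sess:     # becomes the default session inside the block
    print(t.eval())            # 16.0 -- shorthand for the line below
    print(sess.run(t))         # 16.0
# Leaving the `with` block closes the session and releases its resources.
```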
@@ -101,9 +101,8 @@ sessions, it may be more straightforward to make explicit calls to
Sessions can own resources, such as
@{tf.Variable},
@{tf.QueueBase}, and
-@{tf.ReaderBase}; and these resources can use
-a significant amount of memory. These resources (and the associated memory) are
-released when the session is closed, by calling
+@{tf.ReaderBase}. These resources can sometimes use
+a significant amount of memory, and can be released when the session is closed by calling
@{tf.Session.close}.
The intermediate tensors that are created as part of a call to
@@ -137,7 +136,7 @@ TensorFlow also has a
to help build support for more client languages. We invite contributions of new
language bindings.
-Bindings for various other languages (such as [C#](https://github.com/migueldeicaza/TensorFlowSharp), [Julia](https://github.com/malmaud/TensorFlow.jl), [Ruby](https://github.com/somaticio/tensorflow.rb) and [Scala](https://github.com/eaplatanios/tensorflow_scala)) created and supported by the opensource community build on top of the C API supported by the TensorFlow maintainers.
+Bindings for various other languages (such as [C#](https://github.com/migueldeicaza/TensorFlowSharp), [Julia](https://github.com/malmaud/TensorFlow.jl), [Ruby](https://github.com/somaticio/tensorflow.rb) and [Scala](https://github.com/eaplatanios/tensorflow_scala)) created and supported by the open source community build on top of the C API supported by the TensorFlow maintainers.
#### Does TensorFlow make use of all the devices (GPUs and CPUs) available on my machine?
@@ -210,8 +209,8 @@ a new tensor with a different dynamic shape.
#### How do I build a graph that works with variable batch sizes?
-It is often useful to build a graph that works with variable batch sizes, for
-example so that the same code can be used for (mini-)batch training, and
+It is often useful to build a graph that works with variable batch sizes
+so that the same code can be used for (mini-)batch training, and
single-instance inference. The resulting graph can be
@{tf.Graph.as_graph_def$saved as a protocol buffer}
and
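A small sketch of the usual pattern: leave the batch dimension as `None` in the placeholder shape, so the same graph accepts any batch size:

```python
import numpy as np
import tensorflow as tf

# None in the first dimension means "any batch size".
x = tf.placeholder(tf.float32, shape=[None, 3], name="x")
y = tf.reduce_mean(x, axis=1)

with tf.Session() as sess:
    print(sess.run(y, feed_dict={x: np.ones([8, 3])}))   # mini-batch of 8
    print(sess.run(y, feed_dict={x: np.ones([1, 3])}))   # single instance
```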
@@ -260,7 +259,7 @@ See the how-to documentation for
There are three main options for dealing with data in a custom format.
The easiest option is to write parsing code in Python that transforms the data
-into a numpy array. Then use @{tf.data.Dataset.from_tensor_slices} to
+into a numpy array. Then, use @{tf.data.Dataset.from_tensor_slices} to
create an input pipeline from the in-memory data.
If your data doesn't fit in memory, try doing the parsing in the Dataset
@@ -274,7 +273,7 @@ If your data is not easily parsable with the built-in TensorFlow operations,
consider converting it, offline, to a format that is easily parsable, such
as @{tf.python_io.TFRecordWriter$`TFRecord`} format.
-The more efficient method to customize the parsing behavior is to
+The most efficient method to customize the parsing behavior is to
@{$adding_an_op$add a new op written in C++} that parses your
data format. The @{$new_data_formats$guide to handling new data formats} has
more information about the steps for doing this.
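A minimal sketch of that first option, assuming the custom parser simply returns a NumPy array that fits in memory (the file name and parser below are illustrative):

```python
import numpy as np
import tensorflow as tf

def parse_my_format(path):
    # Placeholder for real parsing code; here we just fabricate some rows.
    return np.arange(12, dtype=np.float32).reshape(4, 3)

data = parse_my_format("data.bin")               # hypothetical input file
dataset = tf.data.Dataset.from_tensor_slices(data).batch(2)
iterator = dataset.make_one_shot_iterator()
next_batch = iterator.get_next()

with tf.Session() as sess:
    print(sess.run(next_batch))   # first batch of 2 rows
```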
diff --git a/tensorflow/docs_src/programmers_guide/tensors.md b/tensorflow/docs_src/programmers_guide/tensors.md
index 58a80d5339..1248c3cabe 100644
--- a/tensorflow/docs_src/programmers_guide/tensors.md
+++ b/tensorflow/docs_src/programmers_guide/tensors.md
@@ -265,7 +265,7 @@ example:
```python
constant = tf.constant([1, 2, 3])
tensor = constant * constant
-print tensor.eval()
+print(tensor.eval())
```
The `eval` method only works when a default `tf.Session` is active (see
@@ -306,8 +306,8 @@ Note that you rarely want to use the following pattern when printing a
``` python
t = <<some tensorflow operation>>
-print t # This will print the symbolic tensor when the graph is being built.
- # This tensor does not have a value in this context.
+print(t) # This will print the symbolic tensor when the graph is being built.
+ # This tensor does not have a value in this context.
```
This code prints the `tf.Tensor` object (which represents deferred computation)
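To make the distinction concrete, a small sketch: `print(t)` shows the symbolic tensor, while `t.eval()` (with a default session active, and a `feed_dict` for any placeholders) produces the value:

```python
import tensorflow as tf

p = tf.placeholder(tf.float32)
t = p + 1.0
print(t)                                 # symbolic Tensor object, no value yet
with tf.Session():
    print(t.eval(feed_dict={p: 2.0}))    # 3.0
```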
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index e8cf771155..cd8c4b5b9a 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -237,7 +237,7 @@ TensorFlow supports two ways of sharing variables:
While code which explicitly passes variables around is very clear, it is
sometimes convenient to write TensorFlow functions that implicitly use
variables in their implementations. Most of the functional layers from
-`tf.layer` use this approach, as well as all `tf.metrics`, and a few other
+`tf.layers` use this approach, as well as all `tf.metrics`, and a few other
library utilities.
Variable scopes allow you to control variable reuse when calling functions which
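A minimal sketch of scope-controlled reuse, in the spirit of the functional `tf.layers`/`tf.metrics` style mentioned above:

```python
import tensorflow as tf

def linear(x):
    # Implicitly creates (or reuses) variables named <scope>/w and <scope>/b.
    w = tf.get_variable("w", shape=[3, 2])
    b = tf.get_variable("b", shape=[2], initializer=tf.zeros_initializer())
    return tf.matmul(x, w) + b

x1 = tf.placeholder(tf.float32, [None, 3])
x2 = tf.placeholder(tf.float32, [None, 3])

with tf.variable_scope("model"):
    y1 = linear(x1)                       # creates model/w and model/b
with tf.variable_scope("model", reuse=True):
    y2 = linear(x2)                       # reuses the same variables
```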
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index ead5a636b9..0f17899dae 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -209,7 +209,6 @@ for two-dimensional image data expect input tensors to have a shape of
* _`channels`_. Number of color channels in the example images. For color
images, the number of channels is 3 (red, green, blue). For monochrome
images, there is just 1 channel (black).
-* _`image_height`_. Height of the example images.
* _`data_format`_. A string, one of `channels_last` (default) or `channels_first`.
`channels_last` corresponds to inputs with shape
`(batch, ..., channels)` while `channels_first` corresponds to
diff --git a/tensorflow/examples/learn/text_classification_cnn.py b/tensorflow/examples/learn/text_classification_cnn.py
index 9e21aee87f..a40a9eaecb 100644
--- a/tensorflow/examples/learn/text_classification_cnn.py
+++ b/tensorflow/examples/learn/text_classification_cnn.py
@@ -73,7 +73,7 @@ def cnn_model(features, labels, mode):
kernel_size=FILTER_SHAPE2,
padding='VALID')
# Max across each filter to get useful features for classification.
- pool2 = tf.squeeze(tf.reduce_max(conv2, 1), squeeze_dims=[1])
+ pool2 = tf.squeeze(tf.reduce_max(conv2, 1), axis=[1])
# Apply regular WX + B and classification.
logits = tf.layers.dense(pool2, MAX_LABEL, activation=None)
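For reference, the renamed argument behaves the same as the old `squeeze_dims`; a tiny sketch of `tf.squeeze` with `axis`:

```python
import tensorflow as tf

x = tf.zeros([2, 1, 4])
y = tf.squeeze(x, axis=[1])     # preferred spelling; drops only dimension 1
print(y.shape)                  # (2, 4)
```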
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index a503b3b00a..36db3dda6b 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -21321,7 +21321,7 @@ func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
// generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
//
// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values. It is a `unit8` 1-D tensor of length `channels`.
+// non-finite input values. It is a `uint8` 1-D tensor of length `channels`.
// Each element must be in the range `[0, 255]` (It represents the value of a
// pixel in the output image). Non-finite values in the input tensor are
// replaced by this tensor in the output image. The default value is the color
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index 9af2e9b8b6..32e08021dc 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -103,7 +103,7 @@ def is_sequence(seq):
NOTE(mrry): This differs from `tensorflow.python.util.nest.is_sequence()`,
which *does* treat a Python list as a sequence. For ergonomic
reasons, `tf.data` users would prefer to treat lists as
- implict `tf.Tensor` objects, and dicts as (nested) sequences.
+ implicit `tf.Tensor` objects, and dicts as (nested) sequences.
Args:
seq: an input sequence.
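The practical effect of that convention, sketched with `tf.data`: a dict is treated as a structure of components, while a plain list is treated as a single tensor and sliced along its first axis:

```python
import tensorflow as tf

# A dict is a (nested) structure: each dataset element is a dict of scalars.
d1 = tf.data.Dataset.from_tensor_slices({"a": [1, 2], "b": [3, 4]})
print(d1.output_shapes)   # {'a': TensorShape([]), 'b': TensorShape([])}

# A plain list becomes one [2, 2] tensor, sliced into elements of shape (2,).
d2 = tf.data.Dataset.from_tensor_slices([[1, 2], [3, 4]])
print(d2.output_shapes)   # (2,)
```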
diff --git a/tensorflow/python/debug/cli/curses_ui.py b/tensorflow/python/debug/cli/curses_ui.py
index f66cefb427..7b87972d69 100644
--- a/tensorflow/python/debug/cli/curses_ui.py
+++ b/tensorflow/python/debug/cli/curses_ui.py
@@ -190,8 +190,6 @@ class ScrollBar(object):
return layout
def get_click_command(self, mouse_y):
- # TODO(cais): Support continuous scrolling when the mouse button is held
- # down.
if self._output_num_rows <= 1:
return None
elif mouse_y == self._min_y:
@@ -271,6 +269,10 @@ class CursesUI(base_ui.BaseUI):
_UI_WAIT_MESSAGE = "Processing..."
+ # The delay (in ms) between each update of the scroll bar when the mouse
+ # button is held down on the scroll bar. Controls how fast the screen scrolls.
+ _MOUSE_SCROLL_DELAY_MS = 100
+
_single_instance_lock = threading.Lock()
def __init__(self, on_ui_exit=None, config=None):
@@ -855,7 +857,30 @@ class CursesUI(base_ui.BaseUI):
except curses.error:
mouse_event_type = None
- if mouse_event_type == curses.BUTTON1_RELEASED:
+ if mouse_event_type == curses.BUTTON1_PRESSED:
+ # Logic for held mouse-triggered scrolling.
+ if mouse_x >= self._max_x - 2:
+ # Disable blocking on checking for user input.
+ self._command_window.nodelay(True)
+
+ # Loop while mouse button is pressed.
+ while mouse_event_type == curses.BUTTON1_PRESSED:
+ # Sleep for a bit.
+ curses.napms(self._MOUSE_SCROLL_DELAY_MS)
+ scroll_command = self._scroll_bar.get_click_command(mouse_y)
+ if scroll_command in (_SCROLL_UP_A_LINE, _SCROLL_DOWN_A_LINE):
+ self._scroll_output(scroll_command)
+
+ # Check to see if different mouse event is in queue.
+ self._command_window.getch()
+ try:
+ _, _, _, _, mouse_event_type = self._screen_getmouse()
+ except curses.error:
+ pass
+
+ self._command_window.nodelay(False)
+ return x
+ elif mouse_event_type == curses.BUTTON1_RELEASED:
# Logic for mouse-triggered scrolling.
if mouse_x >= self._max_x - 2:
scroll_command = self._scroll_bar.get_click_command(mouse_y)
@@ -1677,4 +1702,7 @@ class CursesUI(base_ui.BaseUI):
self._redraw_output()
def _screen_set_mousemask(self):
- curses.mousemask(self._mouse_enabled)
+ if self._mouse_enabled:
+ curses.mousemask(curses.BUTTON1_RELEASED | curses.BUTTON1_PRESSED)
+ else:
+ curses.mousemask(0)
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 9b4b866697..347a760333 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -1163,7 +1163,7 @@ class Estimator(object):
model_fn_lib.ModeKeys.TRAIN,
self.config)
- # TODO(anjalisridhar): Figure out how to resolve the folowing scaffold
+ # TODO(anjalisridhar): Figure out how to resolve the following scaffold
# parameters: init_feed_dict, init_fn.
scaffold_list = self._distribution.unwrap(
grouped_estimator_spec.scaffold)
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index 8e5d8141a1..8e2ec83020 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -52,7 +52,7 @@ def _fill_array(arr, seq, fillvalue=0):
If length of seq is less than arr padded length, fillvalue used.
Args:
arr: Padded tensor of shape [batch_size, ..., max_padded_dim_len].
- seq: Non-padded list of data sampels of shape
+ seq: Non-padded list of data samples of shape
[batch_size, ..., padded_dim(None)]
fillvalue: Default fillvalue to use.
"""
diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py
index 9961fa74c2..7bcf3d84bb 100644
--- a/tensorflow/python/estimator/keras.py
+++ b/tensorflow/python/estimator/keras.py
@@ -74,7 +74,7 @@ def _any_variable_initalized():
"""Check if any variable has been initialized in the Keras model.
Returns:
- boolean, True if at least one variable has been initalized, else False.
+ boolean, True if at least one variable has been initialized, else False.
"""
variables = variables_module.global_variables()
for v in variables:
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 08fff3ba64..522662cd32 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -597,7 +597,7 @@ class _TrainingExecutor(object):
# max_steps, the evaluator will send the final export signal. There is a
# small chance that the Estimator.train stopping logic sees a different
# global_step value (due to global step race condition and the fact the
- # saver sees a larger value for checkpoing saving), which does not end
+ # saver sees a larger value for checkpoint saving), which does not end
# the training. When the training ends, a new checkpoint is generated, which
# triggers the listener again. So, it could be the case the final export is
# triggered twice.
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index ede6e0d159..ffcb9990d5 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -48,7 +48,7 @@ should choose depends on (1) the feature type and (2) the model type.
embedded_dept_column = embedding_column(
categorical_column_with_vocabulary_list(
- "department", ["math", "philosphy", ...]), dimension=10)
+ "department", ["math", "philosophy", ...]), dimension=10)
* Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
@@ -280,7 +280,7 @@ def input_layer(features,
# TODO(akshayka): InputLayer should be a subclass of Layer, and it
# should implement the logic in input_layer using Layer's build-and-call
# paradigm; input_layer should create an instance of InputLayer and
-# return the result of inovking its apply method, just as functional layers do.
+# return the result of invoking its apply method, just as functional layers do.
class InputLayer(object):
"""An object-oriented version of `input_layer` that reuses variables."""
@@ -834,7 +834,7 @@ def shared_embedding_columns(
tensor_name_in_ckpt=None, max_norm=None, trainable=True):
"""List of dense columns that convert from sparse, categorical input.
- This is similar to `embedding_column`, except that that it produces a list of
+ This is similar to `embedding_column`, except that it produces a list of
embedding columns that share the same embedding weights.
Use this when your inputs are sparse and of the same type (e.g. watched and
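A small sketch of the intended usage, assuming `shared_embedding_columns` is reachable under `tf.feature_column` as the docstring here suggests (the vocabulary and dimension below are illustrative):

```python
import tensorflow as tf

watched = tf.feature_column.categorical_column_with_vocabulary_list(
    "watched_video_id", vocabulary_list=["v1", "v2", "v3"])
impression = tf.feature_column.categorical_column_with_vocabulary_list(
    "impression_video_id", vocabulary_list=["v1", "v2", "v3"])

# Both columns map through one shared embedding table of dimension 4.
shared = tf.feature_column.shared_embedding_columns(
    [watched, impression], dimension=4)
print(len(shared))   # 2 -- one output column per input, same weights
```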
diff --git a/tensorflow/python/framework/fast_tensor_util.pyx b/tensorflow/python/framework/fast_tensor_util.pyx
index 19928314ef..17d112a1ec 100644
--- a/tensorflow/python/framework/fast_tensor_util.pyx
+++ b/tensorflow/python/framework/fast_tensor_util.pyx
@@ -7,6 +7,18 @@ cimport numpy as np
from tensorflow.python.util import compat
+def AppendFloat16ArrayToTensorProto(
+ # For numpy, npy_half is a typedef for npy_uint16,
+ # see: https://github.com/numpy/numpy/blob/master/doc/source/reference/c-api.coremath.rst#half-precision-functions
+ # Because np.float16_t doesn't exist in cython, we use uint16_t here.
+ # Because np.float16_t doesn't exist in cython, we use uint16_t here.
+ # TODO: Use np.float16_t when cython supports it.
+ tensor_proto, np.ndarray[np.uint16_t, ndim=1] nparray):
+ cdef long i, n
+ n = nparray.size
+ for i in range(n):
+ tensor_proto.half_val.append(nparray[i])
+
+
def AppendFloat32ArrayToTensorProto(
tensor_proto, np.ndarray[np.float32_t, ndim=1] nparray):
cdef long i, n
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 80140e4063..9fc8136348 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -2582,7 +2582,7 @@ def set_shape_and_handle_data_for_outputs(op):
When _USE_C_API = True, this is lazily called when a tensor's shape is first
requested. Usually this should work automatically, but some edge cases may
- require manaully calling this first to make sure Tensor._shape_val and
+ require manually calling this first to make sure Tensor._shape_val and
Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
Tensor).
"""
@@ -5426,36 +5426,30 @@ def enable_eager_execution(config=None, device_policy=None,
in which operations are executed. Note that @{tf.ConfigProto} is also
used to configure graph execution (via @{tf.Session}) and many options
within `tf.ConfigProto` are not implemented (or are irrelevant) when
- eager execution is enabled.
+ eager execution is enabled.
device_policy: (Optional.) Policy controlling how operations requiring
- inputs on a specific device (e.g., a GPU 0) handle inputs on a different
- device (e.g. GPU 1 or CPU). When set to None, an appropriate value will be
- picked automatically. The value picked may change between TensorFlow
- releases.
- Valid values:
-
+ inputs on a specific device (e.g., a GPU 0) handle inputs on a different
+ device (e.g. GPU 1 or CPU). When set to None, an appropriate value will be
+ picked automatically. The value picked may change between TensorFlow
+ releases.
+ Valid values:
- tf.contrib.eager.DEVICE_PLACEMENT_EXPLICIT: raises an error if the
placement is not correct.
-
- tf.contrib.eager.DEVICE_PLACEMENT_WARN: copies the tensors which are not
on the right device but logs a warning.
-
- tf.contrib.eager.DEVICE_PLACEMENT_SILENT: silently copies the tensors.
Note that this may hide performance problems as there is no notification
provided when operations are blocked on the tensor being copied between
devices.
-
- tf.contrib.eager.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies
int32 tensors, raising errors on the other ones.
execution_mode: (Optional.) Policy controlling how operations dispatched are
actually executed. When set to None, an appropriate value will be picked
automatically. The value picked may change between TensorFlow releases.
Valid values:
-
- - tf.contrib.eager.SYNC: executes each operation synchronously.
-
- - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
- operations may return "non-ready" handles.
+ - tf.contrib.eager.SYNC: executes each operation synchronously.
+ - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
+ operations may return "non-ready" handles.
Raises:
ValueError: If eager execution is enabled after creating/executing a
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 8cf24206ed..ca63efbc84 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -50,6 +50,13 @@ def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
[ExtractBitsFromFloat16(x) for x in proto_values])
+def _MediumAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
+ # TODO: Remove the conversion if cython supports np.float16_t
+ fast_tensor_util.AppendFloat16ArrayToTensorProto(
+ tensor_proto,
+ np.asarray(proto_values, dtype=np.float16).view(np.uint16))
+
+
def ExtractBitsFromBFloat16(x):
return np.asscalar(
np.asarray(x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
@@ -64,11 +71,8 @@ if _FAST_TENSOR_UTIL_AVAILABLE:
_NP_TO_APPEND_FN = {
dtypes.bfloat16.as_numpy_dtype:
SlowAppendBFloat16ArrayToTensorProto,
- # TODO(sesse): We should have a
- # fast_tensor_util.AppendFloat16ArrayToTensorProto,
- # but it seems np.float16_t doesn't exist?
np.float16:
- SlowAppendFloat16ArrayToTensorProto,
+ _MediumAppendFloat16ArrayToTensorProto,
np.float32:
fast_tensor_util.AppendFloat32ArrayToTensorProto,
np.float64:
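The uint16 reinterpretation used above is just a bit-level view of the IEEE half-precision values; a small NumPy sketch of what ends up in `half_val`:

```python
import numpy as np

values = np.array([1.5, -2.0], dtype=np.float16)
bits = values.view(np.uint16)       # same bytes, reinterpreted as uint16
print(bits)                         # [15872 49152] -- raw fp16 bit patterns
print(bits.view(np.float16))        # [ 1.5 -2. ] -- round-trips losslessly
```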
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 97cd22e47a..5b01df48fe 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -682,7 +682,7 @@ def run_in_graph_and_eager_modes(__unused__=None,
Args:
- __unused__: Prevents sliently skipping tests.
+ __unused__: Prevents silently skipping tests.
config: An optional config_pb2.ConfigProto to use to configure the
session when executing graphs.
use_gpu: If True, attempt to run as many operations as possible on GPU.
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index 7b5eecc153..69337b6a8d 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -20,6 +20,7 @@ from __future__ import print_function
from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
from tensorflow.python.keras.utils.data_utils import get_file
+from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
from tensorflow.python.keras.utils.data_utils import Sequence
from tensorflow.python.keras.utils.data_utils import SequenceEnqueuer
from tensorflow.python.keras.utils.generic_utils import custom_object_scope
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 72cc357c71..3dfad9c130 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -742,6 +742,18 @@ tf_py_test(
)
tf_py_test(
+ name = "regex_full_match_op_test",
+ size = "small",
+ srcs = ["regex_full_match_op_test.py"],
+ additional_deps = [
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:constant_op",
+ "//tensorflow/python:dtypes",
+ "//tensorflow/python:string_ops",
+ ],
+)
+
+tf_py_test(
name = "save_restore_ops_test",
size = "small",
srcs = ["save_restore_ops_test.py"],
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index e2e6205911..fcba456004 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -31,9 +31,7 @@ class Conv1DTest(test.TestCase):
def testBasic(self):
"""Test that argument passing to conv1d is handled properly."""
- # TODO(yongtang): dtypes.float64 can only be enabled once conv2d support
- # dtypes.float64, as conv1d implicitly calls conv2d after expand_dims.
- for dtype in [dtypes.float16, dtypes.float32]:
+ for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
x = constant_op.constant([1, 2, 3, 4], dtype=dtype)
x = array_ops.expand_dims(x, 0) # Add batch dimension
x = array_ops.expand_dims(x, 2) # And depth dimension
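With this change the same call also accepts `float64` inputs; a minimal sketch using the public `tf.nn.conv1d` wrapper:

```python
import tensorflow as tf

x = tf.constant([1., 2., 3., 4.], dtype=tf.float64)
x = tf.reshape(x, [1, 4, 1])                     # [batch, width, channels]
filters = tf.ones([2, 1, 1], dtype=tf.float64)   # [filter_width, in, out]
y = tf.nn.conv1d(x, filters, stride=1, padding="VALID")

with tf.Session() as sess:
    print(sess.run(y))   # [[[3.], [5.], [7.]]]
```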
diff --git a/tensorflow/python/kernel_tests/conv3d_transpose_test.py b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
index 8973a450fa..289ae29fce 100644
--- a/tensorflow/python/kernel_tests/conv3d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
@@ -131,6 +131,23 @@ class Conv3DTransposeTest(test.TestCase):
nn_ops.conv3d_transpose(
x_value, f_value, y_shape, strides, data_format='NCDHW')
+ def testConv3DTransposeOutputShapeType(self):
+ # Test case for GitHub issue 18887
+ for dtype in [dtypes.int32, dtypes.int64]:
+ with self.test_session():
+ x_shape = [2, 5, 6, 4, 3]
+ y_shape = [2, 5, 6, 4, 2]
+ f_shape = [3, 3, 3, 2, 3]
+ strides = [1, 1, 1, 1, 1]
+ x_value = constant_op.constant(
+ 1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+ f_value = constant_op.constant(
+ 1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+ output = nn_ops.conv3d_transpose(
+ x_value, f_value, constant_op.constant(y_shape, dtype=dtype),
+ strides=strides, padding="SAME")
+ output.eval()
+
def testConv3DTransposeValid(self):
with self.test_session():
strides = [1, 2, 2, 2, 1]
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index 8e5556d0a0..63d19c15cf 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -735,7 +735,7 @@ class FillTriangularTest(test.TestCase):
raise ValueError("Invalid shape.")
n = np.int32(n)
# We can't do: `x[..., -(n**2-m):]` because this doesn't correctly handle
- # `m == n == 1`. Hence, we do absoulte indexing.
+ # `m == n == 1`. Hence, we do absolute indexing.
x_tail = x[..., (m - (n * n - m)):]
y = np.concatenate(
[x, x_tail[..., ::-1]] if upper else [x_tail, x[..., ::-1]],
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index f31426713c..dc3ea38671 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -93,7 +93,7 @@ class RollTest(test_util.TensorFlowTestCase):
def testNegativeAxis(self):
self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
- # Make sure negative axis shoudl be 0 <= axis + dims < dims
+ # Make sure negative axis should be 0 <= axis + dims < dims
with self.test_session():
with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
"is out of range"):
diff --git a/tensorflow/python/kernel_tests/regex_full_match_op_test.py b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
new file mode 100644
index 0000000000..5daae1b79b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RegexFullMatch op from string_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class RegexFullMatchOpTest(test.TestCase):
+
+ def testRegexFullMatch(self):
+ values = ["abaaba", "abcdabcde"]
+ with self.test_session():
+ input_vector = constant_op.constant(values, dtypes.string)
+ matched = string_ops.regex_full_match(input_vector, "a.*a").eval()
+ self.assertAllEqual([True, False], matched)
+
+ def testEmptyMatch(self):
+ values = ["abc", "1"]
+ with self.test_session():
+ input_vector = constant_op.constant(values, dtypes.string)
+ matched = string_ops.regex_full_match(input_vector, "").eval()
+ self.assertAllEqual([False, False], matched)
+
+ def testInvalidPattern(self):
+ values = ["abc", "1"]
+ with self.test_session():
+ input_vector = constant_op.constant(values, dtypes.string)
+ invalid_pattern = "A["
+ matched = string_ops.regex_full_match(input_vector, invalid_pattern)
+ with self.assertRaisesOpError("Invalid pattern"):
+ matched.eval()
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 3bca5fadc4..794be096b7 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -91,16 +91,18 @@ class SegmentReductionOpTest(SegmentReductionHelper):
]
# Each item is np_op1, np_op2, tf_op
- ops_list = [(np.add, None, math_ops.segment_sum), (self._mean_cum_op,
- self._mean_reduce_op,
- math_ops.segment_mean),
+ ops_list = [(np.add, None, math_ops.segment_sum),
+ (self._mean_cum_op, self._mean_reduce_op,
+ math_ops.segment_mean),
(np.ndarray.__mul__, None, math_ops.segment_prod),
(np.minimum, None, math_ops.segment_min),
(np.maximum, None, math_ops.segment_max)]
# A subset of ops has been enabled for complex numbers
complex_ops_list = [(np.add, None, math_ops.segment_sum),
- (np.ndarray.__mul__, None, math_ops.segment_prod)]
+ (np.ndarray.__mul__, None, math_ops.segment_prod),
+ (self._mean_cum_op, self._mean_reduce_op,
+ math_ops.segment_mean)]
n = 10
shape = [n, 2]
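For context, a tiny sketch of the op being exercised here; per the test change above, the same call is now also covered for complex inputs:

```python
import tensorflow as tf

data = tf.constant([1., 2., 3., 4., 5.])
segment_ids = tf.constant([0, 0, 1, 1, 1])   # must be sorted
means = tf.segment_mean(data, segment_ids)

with tf.Session() as sess:
    print(sess.run(means))   # [1.5 4. ]
```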
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 340c34fc5e..eda036ece4 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -191,6 +191,16 @@ class Layer(base_layer.Layer):
RuntimeError: If called with partitioned variable regularization and
eager execution is enabled.
"""
+
+ def _should_add_regularizer(variable, existing_variable_set):
+ if isinstance(variable, tf_variables.PartitionedVariable):
+ for var in variable:
+ if var in existing_variable_set:
+ return False
+ return True
+ else:
+ return variable not in existing_variable_set
+
init_graph = None
if not context.executing_eagerly():
default_graph = ops.get_default_graph()
@@ -233,7 +243,8 @@ class Layer(base_layer.Layer):
getter=vs.get_variable)
if regularizer:
- if context.executing_eagerly() or variable not in existing_variables:
+ if context.executing_eagerly() or _should_add_regularizer(
+ variable, existing_variables):
self._handle_weight_regularization(name, variable, regularizer)
if init_graph is not None:
@@ -353,4 +364,3 @@ def _add_elements_to_collection(elements, collection_list):
for element in elements:
if element not in collection_set:
collection.append(element)
-
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index f08b552840..ab49e37b90 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.layers import core as core_layers
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope
@@ -95,6 +96,21 @@ class BaseLayerTest(test.TestCase):
regularizer=regularizer)
self.assertEqual(len(layer.losses), 1)
+ def testReusePartitionedVaraiblesAndRegularizers(self):
+ regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
+ partitioner = partitioned_variables.fixed_size_partitioner(3)
+ for reuse in [False, True]:
+ with variable_scope.variable_scope(variable_scope.get_variable_scope(),
+ partitioner=partitioner,
+ reuse=reuse):
+ layer = base_layers.Layer(name='my_layer')
+ variable = layer.add_variable(
+ 'reg_part_var', [4, 4],
+ initializer=init_ops.zeros_initializer(),
+ regularizer=regularizer)
+ self.assertEqual(
+ len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 3)
+
def testNoEagerActivityRegularizer(self):
with context.eager_mode():
with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index ab5997e85c..3a31ef7f88 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1285,7 +1285,7 @@ def reduce_sum(input_tensor,
The reduced tensor, of the same dtype as the input_tensor.
@compatibility(numpy)
- Equivalent to np.sum appart the fact that numpy upcast uint8 and int32 to
+ Equivalent to np.sum apart from the fact that numpy upcasts uint8 and int32 to
int64 while tensorflow returns the same dtype as the input.
@end_compatibility
"""
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 1271ee5108..ae79c01949 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -39,6 +39,8 @@ from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export
# pylint: enable=wildcard-import
+# Expose regex_full_match in strings namespace
+tf_export("strings.regex_full_match")(regex_full_match)
@tf_export("string_split")
def string_split(source, delimiter=" ", skip_empty=True): # pylint: disable=invalid-name
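With the export above, the op is reachable as `tf.strings.regex_full_match`; a minimal sketch mirroring the new kernel test:

```python
import tensorflow as tf

values = tf.constant(["abaaba", "abcdabcde"])
matched = tf.strings.regex_full_match(values, "a.*a")

with tf.Session() as sess:
    print(sess.run(matched))   # [ True False]
```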
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 75580fc630..9e49188c1e 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -232,7 +232,12 @@ class PrintModelAnalysisTest(test.TestCase):
self.assertLess(0, tfprof_node.total_exec_micros)
self.assertEqual(2844, tfprof_node.total_parameters)
- self.assertLess(145660, tfprof_node.total_float_ops)
+ # The graph is modified when MKL is enabled, total_float_ops will
+ # be different.
+ if test_util.IsMklEnabled():
+ self.assertLess(101600, tfprof_node.total_float_ops)
+ else:
+ self.assertLess(145660, tfprof_node.total_float_ops)
self.assertEqual(8, len(tfprof_node.children))
self.assertEqual('_TFProfRoot', tfprof_node.name)
self.assertEqual(
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 4b3982677f..24a13c0f33 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -130,7 +130,8 @@ class SavedModelBuilder(object):
if not file_io.file_exists(asset_destination_filepath):
file_io.copy(asset_source_filepath, asset_destination_filepath)
- tf_logging.info("Assets written to: %s", assets_destination_dir)
+ tf_logging.info("Assets written to: %s",
+ compat.as_text(assets_destination_dir))
def _maybe_add_legacy_init_op(self, legacy_init_op=None):
"""Add legacy init op to the SavedModel.
@@ -461,7 +462,7 @@ class SavedModelBuilder(object):
compat.as_bytes(self._export_dir),
compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
file_io.write_string_to_file(path, self._saved_model.SerializeToString())
- tf_logging.info("SavedModel written to: %s", path)
+ tf_logging.info("SavedModel written to: %s", compat.as_text(path))
return path
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 6d05a2ee29..ab8b37bb65 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -750,7 +750,7 @@ class DistributionStrategy(object):
`fn` may call `tf.get_tower_context()` to access methods such as
`tower_id()` and `merge_call()`.
- `merge_call()` is used to communicate betwen the towers and
+ `merge_call()` is used to communicate between the towers and
re-enter the cross-tower context. All towers pause their execution
having encountered a `merge_call()` call. After that the
`merge_fn`-function is executed. Its results are then unwrapped and
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index fc89f88063..4d464135fd 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1743,7 +1743,7 @@ class Saver(object):
return
if save_path is None:
raise ValueError("Can't load save_path when it is None.")
- logging.info("Restoring parameters from %s", save_path)
+ logging.info("Restoring parameters from %s", compat.as_text(save_path))
try:
if context.executing_eagerly():
self._build_eager(save_path, build_save=False, build_restore=True)
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 5faf644c91..fbd6561767 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -232,7 +232,7 @@ def getcallargs(func, *positional, **named):
it. If no attached decorators modify argspec, the final unwrapped target's
argspec will be used.
"""
- argspec = getargspec(func)
+ argspec = getfullargspec(func)
call_args = named.copy()
this = getattr(func, 'im_self', None) or getattr(func, '__self__', None)
if ismethod(func) and this:
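The motivation for switching to `getfullargspec` is visible with plain `inspect` on Python 3: `getargspec` cannot describe keyword-only arguments, while `getfullargspec` can. A small standard-library sketch:

```python
import inspect

def f(a, b=1, *args, c, **kwargs):   # `c` is keyword-only (Python 3)
    return a + b + c

print(inspect.getfullargspec(f).kwonlyargs)   # ['c']
# inspect.getargspec(f) raises ValueError here, since it cannot
# represent keyword-only arguments.
```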
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 2b33d106bc..0f465eda4f 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -320,7 +320,7 @@ void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
// Returns true iff there were no "internal" errors. In other words,
// errors that has nothing to do with structure checking.
-// If an "internal" error occured, the appropriate Python error will be
+// If an "internal" error occurred, the appropriate Python error will be
// set and the caller can propagate it directly to the user.
//
// Both `error_msg` and `is_type_error` must be non-null. `error_msg` must
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 9851c11c2e..70efc10c9a 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -97,7 +97,7 @@ PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
// used instead. The same convention is followed in `pack_sequence_as`. This
// correctly repacks dicts and `OrderedDict`s after they have been flattened,
// and also allows flattening an `OrderedDict` and then repacking it back using
-// a correponding plain dict, or vice-versa.
+// a corresponding plain dict, or vice-versa.
// Dictionaries with non-sortable keys cannot be flattened.
//
// Args:
diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h
index be0b0bf5fb..ea87744b22 100644
--- a/tensorflow/stream_executor/blas.h
+++ b/tensorflow/stream_executor/blas.h
@@ -1086,6 +1086,13 @@ class BlasSupport {
virtual bool DoBlasGemmBatched(
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
uint64 n, uint64 k, float alpha,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,
+ float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,
+ int ldc, int batch_count, ScratchAllocator *scratch_allocator) = 0;
+ virtual bool DoBlasGemmBatched(
+ Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+ uint64 n, uint64 k, float alpha,
const port::ArraySlice<DeviceMemory<float> *> &a, int lda,
const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta,
const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
@@ -1948,6 +1955,13 @@ class BlasSupport {
bool DoBlasGemmBatched( \
Stream *stream, blas::Transpose transa, blas::Transpose transb, \
uint64 m, uint64 n, uint64 k, float alpha, \
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda, \
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, \
+ float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, \
+ int ldc, int batch_count, ScratchAllocator *scratch_allocator) override; \
+ bool DoBlasGemmBatched( \
+ Stream *stream, blas::Transpose transa, blas::Transpose transb, \
+ uint64 m, uint64 n, uint64 k, float alpha, \
const port::ArraySlice<DeviceMemory<float> *> &a, int lda, \
const port::ArraySlice<DeviceMemory<float> *> &b, int ldb, float beta, \
const port::ArraySlice<DeviceMemory<float> *> &c, int ldc, \
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 3e9a23c658..08fe153b59 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -286,6 +286,10 @@ STREAM_EXECUTOR_CUBLAS_WRAP(cublasGetMathMode)
STREAM_EXECUTOR_CUBLAS_WRAP(cublasSetMathMode)
#endif
+#if CUDA_VERSION >= 9010
+STREAM_EXECUTOR_CUBLAS_WRAP(cublasGemmBatchedEx)
+#endif
+
} // namespace wrap
static string ToString(cublasStatus_t status) {
@@ -2330,13 +2334,23 @@ bool CUDABlas::DoBlasGemmWithAlgorithm(
computation_type, algorithm, output_profile_result);
}
-template <typename T, typename FuncT>
+template <typename T>
+struct HalfAsFloat {
+ typedef T type;
+};
+
+template <>
+struct HalfAsFloat<Eigen::half> {
+ typedef float type;
+};
+
+template <typename T, typename Scalar, typename FuncT>
port::Status CUDABlas::DoBlasGemmBatchedInternal(
FuncT cublas_func, Stream *stream, blas::Transpose transa,
- blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+ blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
- T beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
+ Scalar beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
std::vector<T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
for (int i = 0; i < batch_count; ++i) {
@@ -2345,7 +2359,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
}
- typedef typename CUDAComplexT<T>::type CUDA_T;
+ typedef typename HalfAsFloat<typename CUDAComplexT<T>::type>::type CUDA_T;
const size_t size = batch_count * sizeof(CUDA_T *);
@@ -2397,18 +2411,84 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
"CUDABlas::DoBlasGemmBatched");
}
- bool ok = DoBlasInternal(
- cublas_func, stream, true /* = pointer_mode_host */,
- CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
- CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
- const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
- const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
+ cudaDataType_t data_type = CUDADataType<T>::type;
- if (ok) {
+#if CUDA_VERSION >= 9010
+ int cc_major, cc_minor;
+ if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
+ &cc_major, &cc_minor) &&
+ cc_major >= 5) {
+ bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
+ cublasGemmAlgo_t algo =
+ (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
+ cudaDataType_t compute_type =
+ (data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
+ const void **a_void_ptrs = reinterpret_cast<const void **>(
+ const_cast<const CUDA_T **>(CUDAMemory(a)));
+ const void **b_void_ptrs = reinterpret_cast<const void **>(
+ const_cast<const CUDA_T **>(CUDAMemory(b)));
+ void **c_void_ptrs =
+ reinterpret_cast<void **>(const_cast<CUDA_T **>(CUDAMemory(c)));
+ bool ok;
+ ok = DoBlasInternalImpl(
+ wrap::cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
+ true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
+ CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
+ b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
+ batch_count, compute_type, algo);
+ if (ok) {
+ return port::Status::OK();
+ }
+ return port::Status(port::error::INTERNAL,
+ "failed BLAS call, see log for details");
+ }
+#endif
+ // either CUDA_VERSION < 9.1 or SM < 5.0
+ if (data_type != CUDA_R_16F) {
+ bool ok = DoBlasInternal(
+ cublas_func, stream, true /* = pointer_mode_host */,
+ CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
+ CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
+ const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
+ const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
+ if (ok) {
+ return port::Status::OK();
+ }
+ return port::Status(port::error::INTERNAL,
+ "failed BLAS call, see log for details");
+ } else {
+ // Fall back to a loop for fp16
+ for (int b = 0; b < batch_count; ++b) {
+ const DeviceMemory<T> &a_matrix = *a_ptrs_to_wrappers[b];
+ const DeviceMemory<T> &b_matrix = *b_ptrs_to_wrappers[b];
+ DeviceMemory<T> *c_matrix = c_ptrs_to_wrappers[b];
+ bool ok = DoBlasGemm(stream, transa, transb, m, n, k, alpha, a_matrix,
+ lda, b_matrix, ldb, beta, c_matrix, ldc);
+ if (!ok) {
+ return port::Status(port::error::INTERNAL,
+ "failed BLAS call, see log for details");
+ }
+ }
return port::Status::OK();
}
- return port::Status(port::error::INTERNAL,
- "failed BLAS call, see log for details");
+}
+
+bool CUDABlas::DoBlasGemmBatched(
+ Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+ uint64 n, uint64 k, float alpha,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &a_array, int lda,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &b_array, int ldb,
+ float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c_array,
+ int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+ // Note: The func passed here (cublasSgemmBatched) is not actually called,
+ // due to special handling of fp16 inside DoBlasGemmBatchedInternal.
+ port::Status status = DoBlasGemmBatchedInternal(
+ wrap::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha, a_array,
+ lda, b_array, ldb, beta, c_array, ldc, batch_count, scratch_allocator);
+ if (!status.ok()) {
+ LOG(ERROR) << status;
+ }
+ return status.ok();
}
bool CUDABlas::DoBlasGemmBatched(
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 12dc5e47fd..42b3fde5b0 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -107,12 +107,12 @@ class CUDABlas : public blas::BlasSupport {
// A helper function to implement DoBlasGemmBatched interfaces for generic
// types.
- template <typename T, typename FuncT>
+ template <typename T, typename Scalar, typename FuncT>
port::Status DoBlasGemmBatchedInternal(
FuncT cublas_func, Stream *stream, blas::Transpose transa,
- blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+ blas::Transpose transb, uint64 m, uint64 n, uint64 k, Scalar alpha,
const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
- const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
+ const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, Scalar beta,
const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
int batch_count, ScratchAllocator *scratch_allocator);
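From the Python side, the user-visible effect of this plumbing is that batched matrix multiplication on half-precision inputs can go through a single batched cuBLAS call (cublasGemmBatchedEx on CUDA 9.1+ and SM 5.0+, with a per-matrix fallback otherwise). A small sketch of a workload that would exercise it, assuming a GPU is available:

```python
import tensorflow as tf

# [batch, m, k] x [batch, k, n] in float16 dispatches to the batched GEMM path.
a = tf.ones([32, 64, 128], dtype=tf.float16)
b = tf.ones([32, 128, 256], dtype=tf.float16)
c = tf.matmul(a, b)                 # shape [32, 64, 256], dtype float16

with tf.Session() as sess:
    print(sess.run(c).shape)        # (32, 64, 256)
```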
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 2bc9b6b798..4a98cfe164 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -4482,6 +4482,40 @@ Stream &Stream::ThenBlasTrsm(blas::Side side, blas::UpperLower uplo,
Stream &Stream::ThenBlasGemmBatched(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+ uint64 k, float alpha,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, float beta,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, int ldc,
+ int batch_count) {
+ return ThenBlasGemmBatchedWithScratch(transa, transb, m, n, k, alpha, a, lda,
+ b, ldb, beta, c, ldc, batch_count,
+ /*scratch_allocator=*/nullptr);
+}
+
+Stream &Stream::ThenBlasGemmBatchedWithScratch(
+ blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+ uint64 k, float alpha,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, float beta,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, int ldc,
+ int batch_count, ScratchAllocator *scratch_allocator) {
+ VLOG_CALL(PARAM(transa), PARAM(transb), PARAM(m), PARAM(n), PARAM(k),
+ PARAM(alpha), PARAM(a), PARAM(lda), PARAM(b), PARAM(ldb),
+ PARAM(beta), PARAM(c), PARAM(ldc), PARAM(batch_count));
+
+ ThenBlasImpl<blas::Transpose, blas::Transpose, uint64, uint64, uint64, float,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &, int,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &, int,
+ float, const port::ArraySlice<DeviceMemory<Eigen::half> *> &,
+ int, int, ScratchAllocator *>
+ impl;
+ return impl(this, &blas::BlasSupport::DoBlasGemmBatched, transa, transb, m, n,
+ k, alpha, a, lda, b, ldb, beta, c, ldc, batch_count,
+ scratch_allocator);
+}
+
+Stream &Stream::ThenBlasGemmBatched(
+ blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
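
For reference, a minimal call-site sketch of the new half-precision overload added above. It assumes an already-initialized Stream and populated DeviceMemory<Eigen::half> buffers; the `stream_executor` namespace alias and the shapes are assumptions, not part of the patch.

    // Call-site sketch only (not from the patch).
    #include "tensorflow/stream_executor/stream.h"

    namespace se = ::stream_executor;  // assumption: namespace name at this revision

    void RunBatchedHalfGemm(
        se::Stream* stream,
        const se::port::ArraySlice<se::DeviceMemory<Eigen::half>*>& a,
        const se::port::ArraySlice<se::DeviceMemory<Eigen::half>*>& b,
        const se::port::ArraySlice<se::DeviceMemory<Eigen::half>*>& c,
        int batch_count) {
      // Column-major 128x64 times 64x128 products; alpha/beta are plain float
      // even though the matrices hold Eigen::half.
      stream->ThenBlasGemmBatched(
          se::blas::Transpose::kNoTranspose, se::blas::Transpose::kNoTranspose,
          /*m=*/128, /*n=*/128, /*k=*/64, /*alpha=*/1.0f,
          a, /*lda=*/128, b, /*ldb=*/64,
          /*beta=*/0.0f, c, /*ldc=*/128, batch_count);
    }
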
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index c6e37da6d1..3da1b856d6 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -1471,6 +1471,13 @@ class Stream {
blas::ProfileResult *output_profile_result);
// See BlasSupport::DoBlasGemmBatched.
+ Stream &ThenBlasGemmBatched(
+ blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+ uint64 k, float alpha,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,
+ float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,
+ int ldc, int batch_count);
Stream &ThenBlasGemmBatched(blas::Transpose transa, blas::Transpose transb,
uint64 m, uint64 n, uint64 k, float alpha,
const port::ArraySlice<DeviceMemory<float> *> &a,
@@ -1505,6 +1512,13 @@ class Stream {
int batch_count);
Stream &ThenBlasGemmBatchedWithScratch(
blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
+ uint64 k, float alpha,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+ const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb,
+ float beta, const port::ArraySlice<DeviceMemory<Eigen::half> *> &c,
+ int ldc, int batch_count, ScratchAllocator *scratch_allocator);
+ Stream &ThenBlasGemmBatchedWithScratch(
+ blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n,
uint64 k, float alpha, const port::ArraySlice<DeviceMemory<float> *> &a,
int lda, const port::ArraySlice<DeviceMemory<float> *> &b, int ldb,
float beta, const port::ArraySlice<DeviceMemory<float> *> &c, int ldc,
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 880ec0523d..d71fd71bbd 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1300,7 +1300,7 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
native.cc_library(
name=basename + "_gpu",
srcs=gpu_srcs,
- copts=_cuda_copts(),
+ copts=_cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
deps=deps + if_cuda(cuda_deps))
cuda_deps.extend([":" + basename + "_gpu"])
@@ -1483,7 +1483,7 @@ def tf_py_wrap_cc(name,
# This macro is for running python tests against system installed pip package
# on Windows.
#
-# py_test is built as an exectuable python zip file on Windows, which contains all
+# py_test is built as an executable python zip file on Windows, which contains all
# dependencies of the target. Because of the C++ extensions, it would be very
# inefficient if the py_test zips all runfiles, plus we don't need them when running
# tests against system installed pip package. So we'd like to get rid of the deps
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index e58de5b63e..f46bb4b5fc 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -101,6 +101,7 @@ genrule(
"api/profiler/__init__.py",
"api/python_io/__init__.py",
"api/resource_loader/__init__.py",
+ "api/strings/__init__.py",
"api/saved_model/__init__.py",
"api/saved_model/builder/__init__.py",
"api/saved_model/constants/__init__.py",
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 74b1b39d9f..dc2bd40096 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -501,6 +501,10 @@ tf_module {
mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
}
member {
+ name: "strings"
+ mtype: "<type \'module\'>"
+ }
+ member {
name: "summary"
mtype: "<type \'module\'>"
}
diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
new file mode 100644
index 0000000000..a3fbe95bba
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.strings"
+tf_module {
+ member_method {
+ name: "regex_full_match"
+ argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+ }
+}
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 5aaf544afd..982161cefe 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -17,14 +17,9 @@
set -e
# We don't apt-get install so that we can install a newer version of pip.
-# Only needed for Ubuntu 14.04 ,and not needed for Ubuntu 16.04 / Debian 8,9
-if $(cat /etc/*-release | grep -q 14.04); then
- easy_install -U pip==9.0.3
- easy_install3 -U pip==9.0.3
-else
- pip2 install --upgrade pip==9.0.3
- pip3 install --upgrade pip==9.0.3
-fi
+# Only needed for Ubuntu 14.04 and 16.04; not needed for 18.04 and Debian 8,9?
+easy_install -U pip==9.0.3
+easy_install3 -U pip==9.0.3
# Install pip packages from whl files to avoid the time-consuming process of
# building from source.
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index b9996395d0..406d134699 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -85,7 +85,7 @@ RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.g
ENV CI_BUILD_PYTHON python
RUN tensorflow/tools/ci_build/builds/configured CPU \
- bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+ bazel build -c opt --copt=-mavx --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
# For optimized builds appropriate for the hardware platform of your choosing, uncomment below...
# For ivy-bridge or sandy-bridge
# --copt=-march="ivybridge" \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index c65e0b72bc..a6cd44ced1 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -35,10 +35,10 @@ ENV CI_BUILD_PYTHON=python \
PYTHON_LIB_PATH=/usr/local/lib/python2.7/dist-packages \
CC_OPT_FLAGS='-march=native' \
TF_NEED_JEMALLOC=0 \
- TF_NEED_GCP=0 \
+ TF_NEED_GCP=1 \
TF_NEED_CUDA=0 \
TF_NEED_HDFS=0 \
- TF_NEED_S3=0 \
+ TF_NEED_S3=1 \
TF_NEED_OPENCL=0 \
TF_NEED_GDR=0 \
TF_ENABLE_XLA=0 \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 7e5e6ef2d5..2fe47f3356 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -98,7 +98,7 @@ ENV TF_CUDNN_VERSION=7
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
tensorflow/tools/ci_build/builds/configured GPU \
- bazel build -c opt --config=cuda \
+ bazel build -c opt --copt=-mavx --config=cuda \
--cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
tensorflow/tools/pip_package:build_pip_package && \
rm /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index 67badb4869..9f6f553ba1 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -388,7 +388,7 @@ input is collapsed down into a simple constant.
Args:
* clear_output_shapes: Clears tensor shape information saved as attributes.
- Some older graphs containes out-of-date information and may cause import
+ Some older graphs contain out-of-date information and may cause import
errors. Defaults to true.
Prerequisites: None
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index b66d5bdd37..1a83c6e757 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -24,7 +24,7 @@ function real_path() {
function cp_external() {
local src_dir=$1
local dest_dir=$2
- for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*org_tensorflow*'`; do
+ for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*local_config_tensorrt*' ! -name '*org_tensorflow*'`; do
cp -R "$f" "$dest_dir"
done
mkdir -p "${dest_dir}/local_config_cuda/cuda/cuda/"
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index f7385e5991..319878e1b5 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -31,7 +31,7 @@ from setuptools.dist import Distribution
# This version string is semver compatible, but incompatible with pip.
# For pip, we will remove all '-' characters from this string, and use the
# result for pip.
-_VERSION = '1.8.0-rc1'
+_VERSION = '1.8.0'
_SHORT_DESCRIPTION = ('TensorFlow is an open source machine learning framework '
'for everyone.')
@@ -55,7 +55,7 @@ REQUIRED_PACKAGES = [
'numpy >= 1.13.3',
'six >= 1.10.0',
'protobuf >= 3.4.0',
- 'tensorboard >= 1.7.0, < 1.8.0',
+ 'tensorboard >= 1.8.0, < 1.9.0',
'termcolor >= 1.1.0',
]